Upload 15 files
Browse files- .gitattributes +4 -0
- chat_template.json +1 -0
- config.json +221 -0
- configuration.json +1 -0
- generation_config.json +7 -0
- librkllmrt.so +3 -0
- long_test.wav +3 -0
- merges.txt +0 -0
- model.safetensors.index.json +715 -0
- preprocessor_config.json +14 -0
- rkllm_binding.py +1324 -0
- rknn/audio_encoder.rknn +3 -0
- rknn/language_model.rkllm +3 -0
- run_qwen3_asr_e2e.py +333 -0
- tokenizer_config.json +549 -0
- vocab.json +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
librkllmrt.so filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
long_test.wav filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
rknn/audio_encoder.rknn filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
rknn/language_model.rkllm filter=lfs diff=lfs merge=lfs -text
|
chat_template.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chat_template": "{%- set ns = namespace(system_text=\"\") -%}\n{%- for m in messages -%}\n {%- if m.role == 'system' -%}\n {%- if m.content is string -%}\n {%- set ns.system_text = ns.system_text + m.content -%}\n {%- else -%}\n {%- for c in m.content -%}\n {%- if c.type == 'text' and (c.text is defined) -%}\n {%- set ns.system_text = ns.system_text + c.text -%}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n\n{%- set ns2 = namespace(audio_tokens=\"\") -%}\n{%- for m in messages -%}\n {%- if m.content is not string -%}\n {%- for c in m.content -%}\n {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) -%}\n {%- set ns2.audio_tokens = ns2.audio_tokens + \"<|audio_start|><|audio_pad|><|audio_end|>\" -%}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n{%- endfor -%}\n\n{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n{%- if add_generation_prompt -%}\n{{- '<|im_start|>assistant\\n' -}}\n{%- endif -%}"}
|
config.json
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen3ASRForConditionalGeneration"
|
| 4 |
+
],
|
| 5 |
+
"model_type": "qwen3_asr",
|
| 6 |
+
"support_languages": [
|
| 7 |
+
"Chinese",
|
| 8 |
+
"English",
|
| 9 |
+
"Cantonese",
|
| 10 |
+
"Arabic",
|
| 11 |
+
"German",
|
| 12 |
+
"French",
|
| 13 |
+
"Spanish",
|
| 14 |
+
"Portuguese",
|
| 15 |
+
"Indonesian",
|
| 16 |
+
"Italian",
|
| 17 |
+
"Korean",
|
| 18 |
+
"Russian",
|
| 19 |
+
"Thai",
|
| 20 |
+
"Vietnamese",
|
| 21 |
+
"Japanese",
|
| 22 |
+
"Turkish",
|
| 23 |
+
"Hindi",
|
| 24 |
+
"Malay",
|
| 25 |
+
"Dutch",
|
| 26 |
+
"Swedish",
|
| 27 |
+
"Danish",
|
| 28 |
+
"Finnish",
|
| 29 |
+
"Polish",
|
| 30 |
+
"Czech",
|
| 31 |
+
"Filipino",
|
| 32 |
+
"Persian",
|
| 33 |
+
"Greek",
|
| 34 |
+
"Romanian",
|
| 35 |
+
"Hungarian",
|
| 36 |
+
"Macedonian"
|
| 37 |
+
],
|
| 38 |
+
"thinker_config": {
|
| 39 |
+
"model_type": "qwen3_asr",
|
| 40 |
+
"architectures": [
|
| 41 |
+
"Qwen3ASRForConditionalGeneration"
|
| 42 |
+
],
|
| 43 |
+
"audio_config": {
|
| 44 |
+
"_name_or_path": "",
|
| 45 |
+
"activation_dropout": 0,
|
| 46 |
+
"activation_function": "gelu",
|
| 47 |
+
"add_cross_attention": false,
|
| 48 |
+
"architectures": null,
|
| 49 |
+
"attention_dropout": 0,
|
| 50 |
+
"bad_words_ids": null,
|
| 51 |
+
"begin_suppress_tokens": null,
|
| 52 |
+
"bos_token_id": null,
|
| 53 |
+
"chunk_size_feed_forward": 0,
|
| 54 |
+
"conv_chunksize": 500,
|
| 55 |
+
"cross_attention_hidden_size": null,
|
| 56 |
+
"d_model": 1024,
|
| 57 |
+
"decoder_start_token_id": null,
|
| 58 |
+
"diversity_penalty": 0.0,
|
| 59 |
+
"do_sample": false,
|
| 60 |
+
"downsample_hidden_size": 480,
|
| 61 |
+
"dropout": 0,
|
| 62 |
+
"dtype": null,
|
| 63 |
+
"early_stopping": false,
|
| 64 |
+
"encoder_attention_heads": 16,
|
| 65 |
+
"encoder_ffn_dim": 4096,
|
| 66 |
+
"encoder_layers": 24,
|
| 67 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 68 |
+
"eos_token_id": null,
|
| 69 |
+
"exponential_decay_length_penalty": null,
|
| 70 |
+
"finetuning_task": null,
|
| 71 |
+
"forced_bos_token_id": null,
|
| 72 |
+
"forced_eos_token_id": null,
|
| 73 |
+
"id2label": {
|
| 74 |
+
"0": "LABEL_0",
|
| 75 |
+
"1": "LABEL_1"
|
| 76 |
+
},
|
| 77 |
+
"initializer_range": 0.02,
|
| 78 |
+
"is_decoder": false,
|
| 79 |
+
"is_encoder_decoder": false,
|
| 80 |
+
"label2id": {
|
| 81 |
+
"LABEL_0": 0,
|
| 82 |
+
"LABEL_1": 1
|
| 83 |
+
},
|
| 84 |
+
"length_penalty": 1.0,
|
| 85 |
+
"max_length": 20,
|
| 86 |
+
"max_source_positions": 1500,
|
| 87 |
+
"min_length": 0,
|
| 88 |
+
"model_type": "qwen3_asr_audio_encoder",
|
| 89 |
+
"n_window": 50,
|
| 90 |
+
"n_window_infer": 800,
|
| 91 |
+
"no_repeat_ngram_size": 0,
|
| 92 |
+
"num_beam_groups": 1,
|
| 93 |
+
"num_beams": 1,
|
| 94 |
+
"num_hidden_layers": 24,
|
| 95 |
+
"num_mel_bins": 128,
|
| 96 |
+
"num_return_sequences": 1,
|
| 97 |
+
"output_attentions": false,
|
| 98 |
+
"output_dim": 2048,
|
| 99 |
+
"output_hidden_states": false,
|
| 100 |
+
"output_scores": false,
|
| 101 |
+
"pad_token_id": null,
|
| 102 |
+
"prefix": null,
|
| 103 |
+
"problem_type": null,
|
| 104 |
+
"pruned_heads": {},
|
| 105 |
+
"remove_invalid_values": false,
|
| 106 |
+
"repetition_penalty": 1.0,
|
| 107 |
+
"return_dict": true,
|
| 108 |
+
"return_dict_in_generate": false,
|
| 109 |
+
"scale_embedding": false,
|
| 110 |
+
"sep_token_id": null,
|
| 111 |
+
"suppress_tokens": null,
|
| 112 |
+
"task_specific_params": null,
|
| 113 |
+
"temperature": 1.0,
|
| 114 |
+
"tf_legacy_loss": false,
|
| 115 |
+
"tie_encoder_decoder": false,
|
| 116 |
+
"tie_word_embeddings": true,
|
| 117 |
+
"tokenizer_class": null,
|
| 118 |
+
"top_k": 50,
|
| 119 |
+
"top_p": 1.0,
|
| 120 |
+
"torchscript": false,
|
| 121 |
+
"typical_p": 1.0,
|
| 122 |
+
"use_bfloat16": false
|
| 123 |
+
},
|
| 124 |
+
"audio_end_token_id": 151670,
|
| 125 |
+
"audio_start_token_id": 151669,
|
| 126 |
+
"audio_token_id": 151676,
|
| 127 |
+
"dtype": "bfloat16",
|
| 128 |
+
"initializer_range": 0.02,
|
| 129 |
+
"text_config": {
|
| 130 |
+
"_name_or_path": "",
|
| 131 |
+
"add_cross_attention": false,
|
| 132 |
+
"architectures": null,
|
| 133 |
+
"attention_bias": false,
|
| 134 |
+
"attention_dropout": 0.0,
|
| 135 |
+
"bad_words_ids": null,
|
| 136 |
+
"begin_suppress_tokens": null,
|
| 137 |
+
"bos_token_id": null,
|
| 138 |
+
"chunk_size_feed_forward": 0,
|
| 139 |
+
"cross_attention_hidden_size": null,
|
| 140 |
+
"decoder_start_token_id": null,
|
| 141 |
+
"diversity_penalty": 0.0,
|
| 142 |
+
"do_sample": false,
|
| 143 |
+
"dtype": null,
|
| 144 |
+
"early_stopping": false,
|
| 145 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 146 |
+
"eos_token_id": null,
|
| 147 |
+
"exponential_decay_length_penalty": null,
|
| 148 |
+
"finetuning_task": null,
|
| 149 |
+
"forced_bos_token_id": null,
|
| 150 |
+
"forced_eos_token_id": null,
|
| 151 |
+
"head_dim": 128,
|
| 152 |
+
"hidden_act": "silu",
|
| 153 |
+
"hidden_size": 2048,
|
| 154 |
+
"id2label": {
|
| 155 |
+
"0": "LABEL_0",
|
| 156 |
+
"1": "LABEL_1"
|
| 157 |
+
},
|
| 158 |
+
"initializer_range": 0.02,
|
| 159 |
+
"intermediate_size": 6144,
|
| 160 |
+
"is_decoder": false,
|
| 161 |
+
"is_encoder_decoder": false,
|
| 162 |
+
"label2id": {
|
| 163 |
+
"LABEL_0": 0,
|
| 164 |
+
"LABEL_1": 1
|
| 165 |
+
},
|
| 166 |
+
"length_penalty": 1.0,
|
| 167 |
+
"max_length": 20,
|
| 168 |
+
"max_position_embeddings": 65536,
|
| 169 |
+
"min_length": 0,
|
| 170 |
+
"model_type": "qwen3",
|
| 171 |
+
"no_repeat_ngram_size": 0,
|
| 172 |
+
"num_attention_heads": 16,
|
| 173 |
+
"num_beam_groups": 1,
|
| 174 |
+
"num_beams": 1,
|
| 175 |
+
"num_hidden_layers": 28,
|
| 176 |
+
"num_key_value_heads": 8,
|
| 177 |
+
"num_return_sequences": 1,
|
| 178 |
+
"output_attentions": false,
|
| 179 |
+
"output_hidden_states": false,
|
| 180 |
+
"output_scores": false,
|
| 181 |
+
"pad_token_id": null,
|
| 182 |
+
"prefix": null,
|
| 183 |
+
"problem_type": null,
|
| 184 |
+
"pruned_heads": {},
|
| 185 |
+
"remove_invalid_values": false,
|
| 186 |
+
"repetition_penalty": 1.0,
|
| 187 |
+
"return_dict": true,
|
| 188 |
+
"return_dict_in_generate": false,
|
| 189 |
+
"rms_norm_eps": 1e-06,
|
| 190 |
+
"rope_scaling": {
|
| 191 |
+
"interleaved": true,
|
| 192 |
+
"mrope_interleaved": true,
|
| 193 |
+
"mrope_section": [
|
| 194 |
+
24,
|
| 195 |
+
20,
|
| 196 |
+
20
|
| 197 |
+
],
|
| 198 |
+
"rope_type": "default",
|
| 199 |
+
"type": "default"
|
| 200 |
+
},
|
| 201 |
+
"rope_theta": 1000000,
|
| 202 |
+
"sep_token_id": null,
|
| 203 |
+
"suppress_tokens": null,
|
| 204 |
+
"task_specific_params": null,
|
| 205 |
+
"temperature": 1.0,
|
| 206 |
+
"tf_legacy_loss": false,
|
| 207 |
+
"tie_encoder_decoder": false,
|
| 208 |
+
"tie_word_embeddings": true,
|
| 209 |
+
"tokenizer_class": null,
|
| 210 |
+
"top_k": 50,
|
| 211 |
+
"top_p": 1.0,
|
| 212 |
+
"torchscript": false,
|
| 213 |
+
"typical_p": 1.0,
|
| 214 |
+
"use_bfloat16": false,
|
| 215 |
+
"use_cache": true,
|
| 216 |
+
"vocab_size": 151936
|
| 217 |
+
}
|
| 218 |
+
},
|
| 219 |
+
"transformers_version": "4.57.6"
|
| 220 |
+
}
|
| 221 |
+
|
configuration.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"framework":"Pytorch","task":"auto-speech-recognition"}
|
generation_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"eos_token_id": [151643,151645],
|
| 4 |
+
"pad_token_id": 151643,
|
| 5 |
+
"do_sample": false,
|
| 6 |
+
"temperature": 0.000001
|
| 7 |
+
}
|
librkllmrt.so
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bbcf28a8666b9fbf7361d6aad892b957920f6ea92400c074899b48f4c5b2c96f
|
| 3 |
+
size 7543744
|
long_test.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4da15e0244edc3ca23fbcb4fe93669e8ce4d59002a72177906123a1a91f17c17
|
| 3 |
+
size 7110734
|
merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
model.safetensors.index.json
ADDED
|
@@ -0,0 +1,715 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"format": "pt"
|
| 4 |
+
},
|
| 5 |
+
"weight_map": {
|
| 6 |
+
"thinker.audio_tower.conv2d1.bias": "model-00001-of-00002.safetensors",
|
| 7 |
+
"thinker.audio_tower.conv2d1.weight": "model-00001-of-00002.safetensors",
|
| 8 |
+
"thinker.audio_tower.conv2d2.bias": "model-00001-of-00002.safetensors",
|
| 9 |
+
"thinker.audio_tower.conv2d2.weight": "model-00001-of-00002.safetensors",
|
| 10 |
+
"thinker.audio_tower.conv2d3.bias": "model-00001-of-00002.safetensors",
|
| 11 |
+
"thinker.audio_tower.conv2d3.weight": "model-00001-of-00002.safetensors",
|
| 12 |
+
"thinker.audio_tower.conv_out.weight": "model-00001-of-00002.safetensors",
|
| 13 |
+
"thinker.audio_tower.layers.0.fc1.bias": "model-00001-of-00002.safetensors",
|
| 14 |
+
"thinker.audio_tower.layers.0.fc1.weight": "model-00001-of-00002.safetensors",
|
| 15 |
+
"thinker.audio_tower.layers.0.fc2.bias": "model-00001-of-00002.safetensors",
|
| 16 |
+
"thinker.audio_tower.layers.0.fc2.weight": "model-00001-of-00002.safetensors",
|
| 17 |
+
"thinker.audio_tower.layers.0.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 18 |
+
"thinker.audio_tower.layers.0.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 19 |
+
"thinker.audio_tower.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 20 |
+
"thinker.audio_tower.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 21 |
+
"thinker.audio_tower.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 22 |
+
"thinker.audio_tower.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 23 |
+
"thinker.audio_tower.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 24 |
+
"thinker.audio_tower.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 25 |
+
"thinker.audio_tower.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 26 |
+
"thinker.audio_tower.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 27 |
+
"thinker.audio_tower.layers.0.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 28 |
+
"thinker.audio_tower.layers.0.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 29 |
+
"thinker.audio_tower.layers.1.fc1.bias": "model-00001-of-00002.safetensors",
|
| 30 |
+
"thinker.audio_tower.layers.1.fc1.weight": "model-00001-of-00002.safetensors",
|
| 31 |
+
"thinker.audio_tower.layers.1.fc2.bias": "model-00001-of-00002.safetensors",
|
| 32 |
+
"thinker.audio_tower.layers.1.fc2.weight": "model-00001-of-00002.safetensors",
|
| 33 |
+
"thinker.audio_tower.layers.1.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 34 |
+
"thinker.audio_tower.layers.1.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 35 |
+
"thinker.audio_tower.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 36 |
+
"thinker.audio_tower.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 37 |
+
"thinker.audio_tower.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 38 |
+
"thinker.audio_tower.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 39 |
+
"thinker.audio_tower.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 40 |
+
"thinker.audio_tower.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 41 |
+
"thinker.audio_tower.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 42 |
+
"thinker.audio_tower.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 43 |
+
"thinker.audio_tower.layers.1.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 44 |
+
"thinker.audio_tower.layers.1.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 45 |
+
"thinker.audio_tower.layers.10.fc1.bias": "model-00001-of-00002.safetensors",
|
| 46 |
+
"thinker.audio_tower.layers.10.fc1.weight": "model-00001-of-00002.safetensors",
|
| 47 |
+
"thinker.audio_tower.layers.10.fc2.bias": "model-00001-of-00002.safetensors",
|
| 48 |
+
"thinker.audio_tower.layers.10.fc2.weight": "model-00001-of-00002.safetensors",
|
| 49 |
+
"thinker.audio_tower.layers.10.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 50 |
+
"thinker.audio_tower.layers.10.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 51 |
+
"thinker.audio_tower.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 52 |
+
"thinker.audio_tower.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 53 |
+
"thinker.audio_tower.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 54 |
+
"thinker.audio_tower.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 55 |
+
"thinker.audio_tower.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 56 |
+
"thinker.audio_tower.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 57 |
+
"thinker.audio_tower.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 58 |
+
"thinker.audio_tower.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 59 |
+
"thinker.audio_tower.layers.10.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 60 |
+
"thinker.audio_tower.layers.10.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 61 |
+
"thinker.audio_tower.layers.11.fc1.bias": "model-00001-of-00002.safetensors",
|
| 62 |
+
"thinker.audio_tower.layers.11.fc1.weight": "model-00001-of-00002.safetensors",
|
| 63 |
+
"thinker.audio_tower.layers.11.fc2.bias": "model-00001-of-00002.safetensors",
|
| 64 |
+
"thinker.audio_tower.layers.11.fc2.weight": "model-00001-of-00002.safetensors",
|
| 65 |
+
"thinker.audio_tower.layers.11.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 66 |
+
"thinker.audio_tower.layers.11.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 67 |
+
"thinker.audio_tower.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 68 |
+
"thinker.audio_tower.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 69 |
+
"thinker.audio_tower.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 70 |
+
"thinker.audio_tower.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 71 |
+
"thinker.audio_tower.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 72 |
+
"thinker.audio_tower.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 73 |
+
"thinker.audio_tower.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 74 |
+
"thinker.audio_tower.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 75 |
+
"thinker.audio_tower.layers.11.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 76 |
+
"thinker.audio_tower.layers.11.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 77 |
+
"thinker.audio_tower.layers.12.fc1.bias": "model-00001-of-00002.safetensors",
|
| 78 |
+
"thinker.audio_tower.layers.12.fc1.weight": "model-00001-of-00002.safetensors",
|
| 79 |
+
"thinker.audio_tower.layers.12.fc2.bias": "model-00001-of-00002.safetensors",
|
| 80 |
+
"thinker.audio_tower.layers.12.fc2.weight": "model-00001-of-00002.safetensors",
|
| 81 |
+
"thinker.audio_tower.layers.12.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 82 |
+
"thinker.audio_tower.layers.12.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 83 |
+
"thinker.audio_tower.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 84 |
+
"thinker.audio_tower.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 85 |
+
"thinker.audio_tower.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 86 |
+
"thinker.audio_tower.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 87 |
+
"thinker.audio_tower.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 88 |
+
"thinker.audio_tower.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 89 |
+
"thinker.audio_tower.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 90 |
+
"thinker.audio_tower.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 91 |
+
"thinker.audio_tower.layers.12.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 92 |
+
"thinker.audio_tower.layers.12.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 93 |
+
"thinker.audio_tower.layers.13.fc1.bias": "model-00001-of-00002.safetensors",
|
| 94 |
+
"thinker.audio_tower.layers.13.fc1.weight": "model-00001-of-00002.safetensors",
|
| 95 |
+
"thinker.audio_tower.layers.13.fc2.bias": "model-00001-of-00002.safetensors",
|
| 96 |
+
"thinker.audio_tower.layers.13.fc2.weight": "model-00001-of-00002.safetensors",
|
| 97 |
+
"thinker.audio_tower.layers.13.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 98 |
+
"thinker.audio_tower.layers.13.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 99 |
+
"thinker.audio_tower.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 100 |
+
"thinker.audio_tower.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 101 |
+
"thinker.audio_tower.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 102 |
+
"thinker.audio_tower.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 103 |
+
"thinker.audio_tower.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 104 |
+
"thinker.audio_tower.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 105 |
+
"thinker.audio_tower.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 106 |
+
"thinker.audio_tower.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 107 |
+
"thinker.audio_tower.layers.13.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 108 |
+
"thinker.audio_tower.layers.13.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 109 |
+
"thinker.audio_tower.layers.14.fc1.bias": "model-00001-of-00002.safetensors",
|
| 110 |
+
"thinker.audio_tower.layers.14.fc1.weight": "model-00001-of-00002.safetensors",
|
| 111 |
+
"thinker.audio_tower.layers.14.fc2.bias": "model-00001-of-00002.safetensors",
|
| 112 |
+
"thinker.audio_tower.layers.14.fc2.weight": "model-00001-of-00002.safetensors",
|
| 113 |
+
"thinker.audio_tower.layers.14.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 114 |
+
"thinker.audio_tower.layers.14.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 115 |
+
"thinker.audio_tower.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 116 |
+
"thinker.audio_tower.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 117 |
+
"thinker.audio_tower.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 118 |
+
"thinker.audio_tower.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 119 |
+
"thinker.audio_tower.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 120 |
+
"thinker.audio_tower.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 121 |
+
"thinker.audio_tower.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 122 |
+
"thinker.audio_tower.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 123 |
+
"thinker.audio_tower.layers.14.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 124 |
+
"thinker.audio_tower.layers.14.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 125 |
+
"thinker.audio_tower.layers.15.fc1.bias": "model-00001-of-00002.safetensors",
|
| 126 |
+
"thinker.audio_tower.layers.15.fc1.weight": "model-00001-of-00002.safetensors",
|
| 127 |
+
"thinker.audio_tower.layers.15.fc2.bias": "model-00001-of-00002.safetensors",
|
| 128 |
+
"thinker.audio_tower.layers.15.fc2.weight": "model-00001-of-00002.safetensors",
|
| 129 |
+
"thinker.audio_tower.layers.15.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 130 |
+
"thinker.audio_tower.layers.15.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 131 |
+
"thinker.audio_tower.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 132 |
+
"thinker.audio_tower.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 133 |
+
"thinker.audio_tower.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 134 |
+
"thinker.audio_tower.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 135 |
+
"thinker.audio_tower.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 136 |
+
"thinker.audio_tower.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 137 |
+
"thinker.audio_tower.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 138 |
+
"thinker.audio_tower.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 139 |
+
"thinker.audio_tower.layers.15.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 140 |
+
"thinker.audio_tower.layers.15.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 141 |
+
"thinker.audio_tower.layers.16.fc1.bias": "model-00001-of-00002.safetensors",
|
| 142 |
+
"thinker.audio_tower.layers.16.fc1.weight": "model-00001-of-00002.safetensors",
|
| 143 |
+
"thinker.audio_tower.layers.16.fc2.bias": "model-00001-of-00002.safetensors",
|
| 144 |
+
"thinker.audio_tower.layers.16.fc2.weight": "model-00001-of-00002.safetensors",
|
| 145 |
+
"thinker.audio_tower.layers.16.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 146 |
+
"thinker.audio_tower.layers.16.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 147 |
+
"thinker.audio_tower.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 148 |
+
"thinker.audio_tower.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 149 |
+
"thinker.audio_tower.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 150 |
+
"thinker.audio_tower.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 151 |
+
"thinker.audio_tower.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 152 |
+
"thinker.audio_tower.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 153 |
+
"thinker.audio_tower.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 154 |
+
"thinker.audio_tower.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 155 |
+
"thinker.audio_tower.layers.16.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 156 |
+
"thinker.audio_tower.layers.16.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 157 |
+
"thinker.audio_tower.layers.17.fc1.bias": "model-00001-of-00002.safetensors",
|
| 158 |
+
"thinker.audio_tower.layers.17.fc1.weight": "model-00001-of-00002.safetensors",
|
| 159 |
+
"thinker.audio_tower.layers.17.fc2.bias": "model-00001-of-00002.safetensors",
|
| 160 |
+
"thinker.audio_tower.layers.17.fc2.weight": "model-00001-of-00002.safetensors",
|
| 161 |
+
"thinker.audio_tower.layers.17.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 162 |
+
"thinker.audio_tower.layers.17.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 163 |
+
"thinker.audio_tower.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 164 |
+
"thinker.audio_tower.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 165 |
+
"thinker.audio_tower.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 166 |
+
"thinker.audio_tower.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 167 |
+
"thinker.audio_tower.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 168 |
+
"thinker.audio_tower.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 169 |
+
"thinker.audio_tower.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 170 |
+
"thinker.audio_tower.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 171 |
+
"thinker.audio_tower.layers.17.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 172 |
+
"thinker.audio_tower.layers.17.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 173 |
+
"thinker.audio_tower.layers.18.fc1.bias": "model-00001-of-00002.safetensors",
|
| 174 |
+
"thinker.audio_tower.layers.18.fc1.weight": "model-00001-of-00002.safetensors",
|
| 175 |
+
"thinker.audio_tower.layers.18.fc2.bias": "model-00001-of-00002.safetensors",
|
| 176 |
+
"thinker.audio_tower.layers.18.fc2.weight": "model-00001-of-00002.safetensors",
|
| 177 |
+
"thinker.audio_tower.layers.18.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 178 |
+
"thinker.audio_tower.layers.18.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 179 |
+
"thinker.audio_tower.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 180 |
+
"thinker.audio_tower.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 181 |
+
"thinker.audio_tower.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 182 |
+
"thinker.audio_tower.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 183 |
+
"thinker.audio_tower.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 184 |
+
"thinker.audio_tower.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 185 |
+
"thinker.audio_tower.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 186 |
+
"thinker.audio_tower.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 187 |
+
"thinker.audio_tower.layers.18.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 188 |
+
"thinker.audio_tower.layers.18.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 189 |
+
"thinker.audio_tower.layers.19.fc1.bias": "model-00001-of-00002.safetensors",
|
| 190 |
+
"thinker.audio_tower.layers.19.fc1.weight": "model-00001-of-00002.safetensors",
|
| 191 |
+
"thinker.audio_tower.layers.19.fc2.bias": "model-00001-of-00002.safetensors",
|
| 192 |
+
"thinker.audio_tower.layers.19.fc2.weight": "model-00001-of-00002.safetensors",
|
| 193 |
+
"thinker.audio_tower.layers.19.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 194 |
+
"thinker.audio_tower.layers.19.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 195 |
+
"thinker.audio_tower.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 196 |
+
"thinker.audio_tower.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 197 |
+
"thinker.audio_tower.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 198 |
+
"thinker.audio_tower.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 199 |
+
"thinker.audio_tower.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 200 |
+
"thinker.audio_tower.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 201 |
+
"thinker.audio_tower.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 202 |
+
"thinker.audio_tower.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 203 |
+
"thinker.audio_tower.layers.19.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 204 |
+
"thinker.audio_tower.layers.19.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 205 |
+
"thinker.audio_tower.layers.2.fc1.bias": "model-00001-of-00002.safetensors",
|
| 206 |
+
"thinker.audio_tower.layers.2.fc1.weight": "model-00001-of-00002.safetensors",
|
| 207 |
+
"thinker.audio_tower.layers.2.fc2.bias": "model-00001-of-00002.safetensors",
|
| 208 |
+
"thinker.audio_tower.layers.2.fc2.weight": "model-00001-of-00002.safetensors",
|
| 209 |
+
"thinker.audio_tower.layers.2.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 210 |
+
"thinker.audio_tower.layers.2.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 211 |
+
"thinker.audio_tower.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 212 |
+
"thinker.audio_tower.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 213 |
+
"thinker.audio_tower.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 214 |
+
"thinker.audio_tower.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 215 |
+
"thinker.audio_tower.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 216 |
+
"thinker.audio_tower.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 217 |
+
"thinker.audio_tower.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 218 |
+
"thinker.audio_tower.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 219 |
+
"thinker.audio_tower.layers.2.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 220 |
+
"thinker.audio_tower.layers.2.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 221 |
+
"thinker.audio_tower.layers.20.fc1.bias": "model-00001-of-00002.safetensors",
|
| 222 |
+
"thinker.audio_tower.layers.20.fc1.weight": "model-00001-of-00002.safetensors",
|
| 223 |
+
"thinker.audio_tower.layers.20.fc2.bias": "model-00001-of-00002.safetensors",
|
| 224 |
+
"thinker.audio_tower.layers.20.fc2.weight": "model-00001-of-00002.safetensors",
|
| 225 |
+
"thinker.audio_tower.layers.20.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 226 |
+
"thinker.audio_tower.layers.20.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 227 |
+
"thinker.audio_tower.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 228 |
+
"thinker.audio_tower.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 229 |
+
"thinker.audio_tower.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 230 |
+
"thinker.audio_tower.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 231 |
+
"thinker.audio_tower.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 232 |
+
"thinker.audio_tower.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 233 |
+
"thinker.audio_tower.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 234 |
+
"thinker.audio_tower.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 235 |
+
"thinker.audio_tower.layers.20.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 236 |
+
"thinker.audio_tower.layers.20.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 237 |
+
"thinker.audio_tower.layers.21.fc1.bias": "model-00001-of-00002.safetensors",
|
| 238 |
+
"thinker.audio_tower.layers.21.fc1.weight": "model-00001-of-00002.safetensors",
|
| 239 |
+
"thinker.audio_tower.layers.21.fc2.bias": "model-00001-of-00002.safetensors",
|
| 240 |
+
"thinker.audio_tower.layers.21.fc2.weight": "model-00001-of-00002.safetensors",
|
| 241 |
+
"thinker.audio_tower.layers.21.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 242 |
+
"thinker.audio_tower.layers.21.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 243 |
+
"thinker.audio_tower.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 244 |
+
"thinker.audio_tower.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 245 |
+
"thinker.audio_tower.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 246 |
+
"thinker.audio_tower.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 247 |
+
"thinker.audio_tower.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 248 |
+
"thinker.audio_tower.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 249 |
+
"thinker.audio_tower.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 250 |
+
"thinker.audio_tower.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 251 |
+
"thinker.audio_tower.layers.21.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 252 |
+
"thinker.audio_tower.layers.21.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 253 |
+
"thinker.audio_tower.layers.22.fc1.bias": "model-00001-of-00002.safetensors",
|
| 254 |
+
"thinker.audio_tower.layers.22.fc1.weight": "model-00001-of-00002.safetensors",
|
| 255 |
+
"thinker.audio_tower.layers.22.fc2.bias": "model-00001-of-00002.safetensors",
|
| 256 |
+
"thinker.audio_tower.layers.22.fc2.weight": "model-00001-of-00002.safetensors",
|
| 257 |
+
"thinker.audio_tower.layers.22.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 258 |
+
"thinker.audio_tower.layers.22.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 259 |
+
"thinker.audio_tower.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 260 |
+
"thinker.audio_tower.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 261 |
+
"thinker.audio_tower.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 262 |
+
"thinker.audio_tower.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 263 |
+
"thinker.audio_tower.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 264 |
+
"thinker.audio_tower.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 265 |
+
"thinker.audio_tower.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 266 |
+
"thinker.audio_tower.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 267 |
+
"thinker.audio_tower.layers.22.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 268 |
+
"thinker.audio_tower.layers.22.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 269 |
+
"thinker.audio_tower.layers.23.fc1.bias": "model-00001-of-00002.safetensors",
|
| 270 |
+
"thinker.audio_tower.layers.23.fc1.weight": "model-00001-of-00002.safetensors",
|
| 271 |
+
"thinker.audio_tower.layers.23.fc2.bias": "model-00001-of-00002.safetensors",
|
| 272 |
+
"thinker.audio_tower.layers.23.fc2.weight": "model-00001-of-00002.safetensors",
|
| 273 |
+
"thinker.audio_tower.layers.23.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 274 |
+
"thinker.audio_tower.layers.23.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 275 |
+
"thinker.audio_tower.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 276 |
+
"thinker.audio_tower.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 277 |
+
"thinker.audio_tower.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 278 |
+
"thinker.audio_tower.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 279 |
+
"thinker.audio_tower.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 280 |
+
"thinker.audio_tower.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 281 |
+
"thinker.audio_tower.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 282 |
+
"thinker.audio_tower.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 283 |
+
"thinker.audio_tower.layers.23.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 284 |
+
"thinker.audio_tower.layers.23.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 285 |
+
"thinker.audio_tower.layers.3.fc1.bias": "model-00001-of-00002.safetensors",
|
| 286 |
+
"thinker.audio_tower.layers.3.fc1.weight": "model-00001-of-00002.safetensors",
|
| 287 |
+
"thinker.audio_tower.layers.3.fc2.bias": "model-00001-of-00002.safetensors",
|
| 288 |
+
"thinker.audio_tower.layers.3.fc2.weight": "model-00001-of-00002.safetensors",
|
| 289 |
+
"thinker.audio_tower.layers.3.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 290 |
+
"thinker.audio_tower.layers.3.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 291 |
+
"thinker.audio_tower.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 292 |
+
"thinker.audio_tower.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 293 |
+
"thinker.audio_tower.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 294 |
+
"thinker.audio_tower.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 295 |
+
"thinker.audio_tower.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 296 |
+
"thinker.audio_tower.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 297 |
+
"thinker.audio_tower.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 298 |
+
"thinker.audio_tower.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 299 |
+
"thinker.audio_tower.layers.3.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 300 |
+
"thinker.audio_tower.layers.3.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 301 |
+
"thinker.audio_tower.layers.4.fc1.bias": "model-00001-of-00002.safetensors",
|
| 302 |
+
"thinker.audio_tower.layers.4.fc1.weight": "model-00001-of-00002.safetensors",
|
| 303 |
+
"thinker.audio_tower.layers.4.fc2.bias": "model-00001-of-00002.safetensors",
|
| 304 |
+
"thinker.audio_tower.layers.4.fc2.weight": "model-00001-of-00002.safetensors",
|
| 305 |
+
"thinker.audio_tower.layers.4.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 306 |
+
"thinker.audio_tower.layers.4.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 307 |
+
"thinker.audio_tower.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 308 |
+
"thinker.audio_tower.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 309 |
+
"thinker.audio_tower.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 310 |
+
"thinker.audio_tower.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 311 |
+
"thinker.audio_tower.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 312 |
+
"thinker.audio_tower.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 313 |
+
"thinker.audio_tower.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 314 |
+
"thinker.audio_tower.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 315 |
+
"thinker.audio_tower.layers.4.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 316 |
+
"thinker.audio_tower.layers.4.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 317 |
+
"thinker.audio_tower.layers.5.fc1.bias": "model-00001-of-00002.safetensors",
|
| 318 |
+
"thinker.audio_tower.layers.5.fc1.weight": "model-00001-of-00002.safetensors",
|
| 319 |
+
"thinker.audio_tower.layers.5.fc2.bias": "model-00001-of-00002.safetensors",
|
| 320 |
+
"thinker.audio_tower.layers.5.fc2.weight": "model-00001-of-00002.safetensors",
|
| 321 |
+
"thinker.audio_tower.layers.5.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 322 |
+
"thinker.audio_tower.layers.5.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 323 |
+
"thinker.audio_tower.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 324 |
+
"thinker.audio_tower.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 325 |
+
"thinker.audio_tower.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 326 |
+
"thinker.audio_tower.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 327 |
+
"thinker.audio_tower.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 328 |
+
"thinker.audio_tower.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 329 |
+
"thinker.audio_tower.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 330 |
+
"thinker.audio_tower.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 331 |
+
"thinker.audio_tower.layers.5.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 332 |
+
"thinker.audio_tower.layers.5.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 333 |
+
"thinker.audio_tower.layers.6.fc1.bias": "model-00001-of-00002.safetensors",
|
| 334 |
+
"thinker.audio_tower.layers.6.fc1.weight": "model-00001-of-00002.safetensors",
|
| 335 |
+
"thinker.audio_tower.layers.6.fc2.bias": "model-00001-of-00002.safetensors",
|
| 336 |
+
"thinker.audio_tower.layers.6.fc2.weight": "model-00001-of-00002.safetensors",
|
| 337 |
+
"thinker.audio_tower.layers.6.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 338 |
+
"thinker.audio_tower.layers.6.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 339 |
+
"thinker.audio_tower.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 340 |
+
"thinker.audio_tower.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 341 |
+
"thinker.audio_tower.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 342 |
+
"thinker.audio_tower.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 343 |
+
"thinker.audio_tower.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 344 |
+
"thinker.audio_tower.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 345 |
+
"thinker.audio_tower.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 346 |
+
"thinker.audio_tower.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 347 |
+
"thinker.audio_tower.layers.6.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 348 |
+
"thinker.audio_tower.layers.6.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 349 |
+
"thinker.audio_tower.layers.7.fc1.bias": "model-00001-of-00002.safetensors",
|
| 350 |
+
"thinker.audio_tower.layers.7.fc1.weight": "model-00001-of-00002.safetensors",
|
| 351 |
+
"thinker.audio_tower.layers.7.fc2.bias": "model-00001-of-00002.safetensors",
|
| 352 |
+
"thinker.audio_tower.layers.7.fc2.weight": "model-00001-of-00002.safetensors",
|
| 353 |
+
"thinker.audio_tower.layers.7.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 354 |
+
"thinker.audio_tower.layers.7.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 355 |
+
"thinker.audio_tower.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 356 |
+
"thinker.audio_tower.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 357 |
+
"thinker.audio_tower.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 358 |
+
"thinker.audio_tower.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 359 |
+
"thinker.audio_tower.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 360 |
+
"thinker.audio_tower.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 361 |
+
"thinker.audio_tower.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 362 |
+
"thinker.audio_tower.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 363 |
+
"thinker.audio_tower.layers.7.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 364 |
+
"thinker.audio_tower.layers.7.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 365 |
+
"thinker.audio_tower.layers.8.fc1.bias": "model-00001-of-00002.safetensors",
|
| 366 |
+
"thinker.audio_tower.layers.8.fc1.weight": "model-00001-of-00002.safetensors",
|
| 367 |
+
"thinker.audio_tower.layers.8.fc2.bias": "model-00001-of-00002.safetensors",
|
| 368 |
+
"thinker.audio_tower.layers.8.fc2.weight": "model-00001-of-00002.safetensors",
|
| 369 |
+
"thinker.audio_tower.layers.8.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 370 |
+
"thinker.audio_tower.layers.8.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 371 |
+
"thinker.audio_tower.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 372 |
+
"thinker.audio_tower.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 373 |
+
"thinker.audio_tower.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 374 |
+
"thinker.audio_tower.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 375 |
+
"thinker.audio_tower.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 376 |
+
"thinker.audio_tower.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 377 |
+
"thinker.audio_tower.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 378 |
+
"thinker.audio_tower.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 379 |
+
"thinker.audio_tower.layers.8.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 380 |
+
"thinker.audio_tower.layers.8.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 381 |
+
"thinker.audio_tower.layers.9.fc1.bias": "model-00001-of-00002.safetensors",
|
| 382 |
+
"thinker.audio_tower.layers.9.fc1.weight": "model-00001-of-00002.safetensors",
|
| 383 |
+
"thinker.audio_tower.layers.9.fc2.bias": "model-00001-of-00002.safetensors",
|
| 384 |
+
"thinker.audio_tower.layers.9.fc2.weight": "model-00001-of-00002.safetensors",
|
| 385 |
+
"thinker.audio_tower.layers.9.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 386 |
+
"thinker.audio_tower.layers.9.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 387 |
+
"thinker.audio_tower.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 388 |
+
"thinker.audio_tower.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 389 |
+
"thinker.audio_tower.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 390 |
+
"thinker.audio_tower.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 391 |
+
"thinker.audio_tower.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 392 |
+
"thinker.audio_tower.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 393 |
+
"thinker.audio_tower.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 394 |
+
"thinker.audio_tower.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 395 |
+
"thinker.audio_tower.layers.9.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 396 |
+
"thinker.audio_tower.layers.9.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 397 |
+
"thinker.audio_tower.ln_post.bias": "model-00001-of-00002.safetensors",
|
| 398 |
+
"thinker.audio_tower.ln_post.weight": "model-00001-of-00002.safetensors",
|
| 399 |
+
"thinker.audio_tower.proj1.bias": "model-00001-of-00002.safetensors",
|
| 400 |
+
"thinker.audio_tower.proj1.weight": "model-00001-of-00002.safetensors",
|
| 401 |
+
"thinker.audio_tower.proj2.bias": "model-00001-of-00002.safetensors",
|
| 402 |
+
"thinker.audio_tower.proj2.weight": "model-00001-of-00002.safetensors",
|
| 403 |
+
"thinker.lm_head.weight": "model-00001-of-00002.safetensors",
|
| 404 |
+
"thinker.model.embed_tokens.weight": "model-00001-of-00002.safetensors",
|
| 405 |
+
"thinker.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 406 |
+
"thinker.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 407 |
+
"thinker.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 408 |
+
"thinker.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 409 |
+
"thinker.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 410 |
+
"thinker.model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 411 |
+
"thinker.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 412 |
+
"thinker.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 413 |
+
"thinker.model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 414 |
+
"thinker.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 415 |
+
"thinker.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 416 |
+
"thinker.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 417 |
+
"thinker.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 418 |
+
"thinker.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 419 |
+
"thinker.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 420 |
+
"thinker.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 421 |
+
"thinker.model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 422 |
+
"thinker.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 423 |
+
"thinker.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 424 |
+
"thinker.model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 425 |
+
"thinker.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 426 |
+
"thinker.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 427 |
+
"thinker.model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 428 |
+
"thinker.model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 429 |
+
"thinker.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 430 |
+
"thinker.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 431 |
+
"thinker.model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 432 |
+
"thinker.model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 433 |
+
"thinker.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 434 |
+
"thinker.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 435 |
+
"thinker.model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 436 |
+
"thinker.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 437 |
+
"thinker.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 438 |
+
"thinker.model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 439 |
+
"thinker.model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 440 |
+
"thinker.model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 441 |
+
"thinker.model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 442 |
+
"thinker.model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 443 |
+
"thinker.model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 444 |
+
"thinker.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 445 |
+
"thinker.model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 446 |
+
"thinker.model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 447 |
+
"thinker.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 448 |
+
"thinker.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 449 |
+
"thinker.model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 450 |
+
"thinker.model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 451 |
+
"thinker.model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 452 |
+
"thinker.model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 453 |
+
"thinker.model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 454 |
+
"thinker.model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 455 |
+
"thinker.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 456 |
+
"thinker.model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 457 |
+
"thinker.model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 458 |
+
"thinker.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 459 |
+
"thinker.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 460 |
+
"thinker.model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 461 |
+
"thinker.model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 462 |
+
"thinker.model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 463 |
+
"thinker.model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 464 |
+
"thinker.model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 465 |
+
"thinker.model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 466 |
+
"thinker.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 467 |
+
"thinker.model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 468 |
+
"thinker.model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 469 |
+
"thinker.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 470 |
+
"thinker.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 471 |
+
"thinker.model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 472 |
+
"thinker.model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 473 |
+
"thinker.model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 474 |
+
"thinker.model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 475 |
+
"thinker.model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 476 |
+
"thinker.model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 477 |
+
"thinker.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 478 |
+
"thinker.model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 479 |
+
"thinker.model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 480 |
+
"thinker.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 481 |
+
"thinker.model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 482 |
+
"thinker.model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 483 |
+
"thinker.model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 484 |
+
"thinker.model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 485 |
+
"thinker.model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 486 |
+
"thinker.model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 487 |
+
"thinker.model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 488 |
+
"thinker.model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 489 |
+
"thinker.model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 490 |
+
"thinker.model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 491 |
+
"thinker.model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 492 |
+
"thinker.model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 493 |
+
"thinker.model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 494 |
+
"thinker.model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 495 |
+
"thinker.model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 496 |
+
"thinker.model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 497 |
+
"thinker.model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 498 |
+
"thinker.model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 499 |
+
"thinker.model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 500 |
+
"thinker.model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 501 |
+
"thinker.model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 502 |
+
"thinker.model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 503 |
+
"thinker.model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 504 |
+
"thinker.model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 505 |
+
"thinker.model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 506 |
+
"thinker.model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 507 |
+
"thinker.model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 508 |
+
"thinker.model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 509 |
+
"thinker.model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 510 |
+
"thinker.model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 511 |
+
"thinker.model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 512 |
+
"thinker.model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 513 |
+
"thinker.model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 514 |
+
"thinker.model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 515 |
+
"thinker.model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 516 |
+
"thinker.model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 517 |
+
"thinker.model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 518 |
+
"thinker.model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 519 |
+
"thinker.model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 520 |
+
"thinker.model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 521 |
+
"thinker.model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 522 |
+
"thinker.model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 523 |
+
"thinker.model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 524 |
+
"thinker.model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 525 |
+
"thinker.model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 526 |
+
"thinker.model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 527 |
+
"thinker.model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 528 |
+
"thinker.model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 529 |
+
"thinker.model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 530 |
+
"thinker.model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 531 |
+
"thinker.model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 532 |
+
"thinker.model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 533 |
+
"thinker.model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 534 |
+
"thinker.model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 535 |
+
"thinker.model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 536 |
+
"thinker.model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 537 |
+
"thinker.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 538 |
+
"thinker.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 539 |
+
"thinker.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 540 |
+
"thinker.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 541 |
+
"thinker.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 542 |
+
"thinker.model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 543 |
+
"thinker.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 544 |
+
"thinker.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 545 |
+
"thinker.model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 546 |
+
"thinker.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 547 |
+
"thinker.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 548 |
+
"thinker.model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 549 |
+
"thinker.model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 550 |
+
"thinker.model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 551 |
+
"thinker.model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 552 |
+
"thinker.model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 553 |
+
"thinker.model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 554 |
+
"thinker.model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 555 |
+
"thinker.model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 556 |
+
"thinker.model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 557 |
+
"thinker.model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 558 |
+
"thinker.model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 559 |
+
"thinker.model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 560 |
+
"thinker.model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 561 |
+
"thinker.model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 562 |
+
"thinker.model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 563 |
+
"thinker.model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 564 |
+
"thinker.model.layers.21.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 565 |
+
"thinker.model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 566 |
+
"thinker.model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 567 |
+
"thinker.model.layers.21.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 568 |
+
"thinker.model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 569 |
+
"thinker.model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 570 |
+
"thinker.model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 571 |
+
"thinker.model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 572 |
+
"thinker.model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 573 |
+
"thinker.model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 574 |
+
"thinker.model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 575 |
+
"thinker.model.layers.22.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 576 |
+
"thinker.model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 577 |
+
"thinker.model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 578 |
+
"thinker.model.layers.22.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 579 |
+
"thinker.model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 580 |
+
"thinker.model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 581 |
+
"thinker.model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 582 |
+
"thinker.model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 583 |
+
"thinker.model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 584 |
+
"thinker.model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 585 |
+
"thinker.model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 586 |
+
"thinker.model.layers.23.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 587 |
+
"thinker.model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 588 |
+
"thinker.model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 589 |
+
"thinker.model.layers.23.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 590 |
+
"thinker.model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 591 |
+
"thinker.model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 592 |
+
"thinker.model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 593 |
+
"thinker.model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 594 |
+
"thinker.model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 595 |
+
"thinker.model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 596 |
+
"thinker.model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 597 |
+
"thinker.model.layers.24.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 598 |
+
"thinker.model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 599 |
+
"thinker.model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 600 |
+
"thinker.model.layers.24.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 601 |
+
"thinker.model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 602 |
+
"thinker.model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 603 |
+
"thinker.model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 604 |
+
"thinker.model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 605 |
+
"thinker.model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 606 |
+
"thinker.model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 607 |
+
"thinker.model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 608 |
+
"thinker.model.layers.25.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 609 |
+
"thinker.model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 610 |
+
"thinker.model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 611 |
+
"thinker.model.layers.25.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 612 |
+
"thinker.model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 613 |
+
"thinker.model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 614 |
+
"thinker.model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 615 |
+
"thinker.model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 616 |
+
"thinker.model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 617 |
+
"thinker.model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 618 |
+
"thinker.model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 619 |
+
"thinker.model.layers.26.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 620 |
+
"thinker.model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 621 |
+
"thinker.model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 622 |
+
"thinker.model.layers.26.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 623 |
+
"thinker.model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 624 |
+
"thinker.model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 625 |
+
"thinker.model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 626 |
+
"thinker.model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 627 |
+
"thinker.model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 628 |
+
"thinker.model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 629 |
+
"thinker.model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 630 |
+
"thinker.model.layers.27.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 631 |
+
"thinker.model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 632 |
+
"thinker.model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 633 |
+
"thinker.model.layers.27.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 634 |
+
"thinker.model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 635 |
+
"thinker.model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 636 |
+
"thinker.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 637 |
+
"thinker.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 638 |
+
"thinker.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 639 |
+
"thinker.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 640 |
+
"thinker.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 641 |
+
"thinker.model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 642 |
+
"thinker.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 643 |
+
"thinker.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 644 |
+
"thinker.model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 645 |
+
"thinker.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 646 |
+
"thinker.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 647 |
+
"thinker.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 648 |
+
"thinker.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 649 |
+
"thinker.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 650 |
+
"thinker.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 651 |
+
"thinker.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 652 |
+
"thinker.model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 653 |
+
"thinker.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 654 |
+
"thinker.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 655 |
+
"thinker.model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 656 |
+
"thinker.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 657 |
+
"thinker.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 658 |
+
"thinker.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 659 |
+
"thinker.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 660 |
+
"thinker.model.layers.5.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 661 |
+
"thinker.model.layers.5.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 662 |
+
"thinker.model.layers.5.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 663 |
+
"thinker.model.layers.5.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 664 |
+
"thinker.model.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 665 |
+
"thinker.model.layers.5.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 666 |
+
"thinker.model.layers.5.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 667 |
+
"thinker.model.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 668 |
+
"thinker.model.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 669 |
+
"thinker.model.layers.6.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 670 |
+
"thinker.model.layers.6.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 671 |
+
"thinker.model.layers.6.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 672 |
+
"thinker.model.layers.6.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 673 |
+
"thinker.model.layers.6.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 674 |
+
"thinker.model.layers.6.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 675 |
+
"thinker.model.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 676 |
+
"thinker.model.layers.6.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 677 |
+
"thinker.model.layers.6.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 678 |
+
"thinker.model.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 679 |
+
"thinker.model.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 680 |
+
"thinker.model.layers.7.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 681 |
+
"thinker.model.layers.7.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 682 |
+
"thinker.model.layers.7.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 683 |
+
"thinker.model.layers.7.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 684 |
+
"thinker.model.layers.7.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 685 |
+
"thinker.model.layers.7.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 686 |
+
"thinker.model.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 687 |
+
"thinker.model.layers.7.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 688 |
+
"thinker.model.layers.7.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 689 |
+
"thinker.model.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 690 |
+
"thinker.model.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 691 |
+
"thinker.model.layers.8.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 692 |
+
"thinker.model.layers.8.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 693 |
+
"thinker.model.layers.8.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 694 |
+
"thinker.model.layers.8.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 695 |
+
"thinker.model.layers.8.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 696 |
+
"thinker.model.layers.8.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 697 |
+
"thinker.model.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 698 |
+
"thinker.model.layers.8.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 699 |
+
"thinker.model.layers.8.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 700 |
+
"thinker.model.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 701 |
+
"thinker.model.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 702 |
+
"thinker.model.layers.9.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 703 |
+
"thinker.model.layers.9.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 704 |
+
"thinker.model.layers.9.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 705 |
+
"thinker.model.layers.9.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 706 |
+
"thinker.model.layers.9.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 707 |
+
"thinker.model.layers.9.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 708 |
+
"thinker.model.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 709 |
+
"thinker.model.layers.9.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 710 |
+
"thinker.model.layers.9.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 711 |
+
"thinker.model.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 712 |
+
"thinker.model.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 713 |
+
"thinker.model.norm.weight": "model-00002-of-00002.safetensors"
|
| 714 |
+
}
|
| 715 |
+
}
|
preprocessor_config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"chunk_length": 30,
|
| 3 |
+
"dither": 0.0,
|
| 4 |
+
"feature_extractor_type": "WhisperFeatureExtractor",
|
| 5 |
+
"feature_size": 128,
|
| 6 |
+
"hop_length": 160,
|
| 7 |
+
"n_fft": 400,
|
| 8 |
+
"n_samples": 480000,
|
| 9 |
+
"nb_max_frames": 3000,
|
| 10 |
+
"padding_side": "right",
|
| 11 |
+
"padding_value": 0.0,
|
| 12 |
+
"processor_class": "Qwen3ASRProcessor",
|
| 13 |
+
"return_attention_mask": true
|
| 14 |
+
}
|
rkllm_binding.py
ADDED
|
@@ -0,0 +1,1324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import ctypes
|
| 3 |
+
import enum
|
| 4 |
+
import os
|
| 5 |
+
import threading
|
| 6 |
+
from typing import Optional, Sequence, Tuple
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
# Define constants from the header
|
| 11 |
+
CPU0 = (1 << 0) # 0x01
|
| 12 |
+
CPU1 = (1 << 1) # 0x02
|
| 13 |
+
CPU2 = (1 << 2) # 0x04
|
| 14 |
+
CPU3 = (1 << 3) # 0x08
|
| 15 |
+
CPU4 = (1 << 4) # 0x10
|
| 16 |
+
CPU5 = (1 << 5) # 0x20
|
| 17 |
+
CPU6 = (1 << 6) # 0x40
|
| 18 |
+
CPU7 = (1 << 7) # 0x80
|
| 19 |
+
|
| 20 |
+
# --- Enums ---
|
| 21 |
+
class LLMCallState(enum.IntEnum):
|
| 22 |
+
RKLLM_RUN_NORMAL = 0
|
| 23 |
+
RKLLM_RUN_WAITING = 1
|
| 24 |
+
RKLLM_RUN_FINISH = 2
|
| 25 |
+
RKLLM_RUN_ERROR = 3
|
| 26 |
+
|
| 27 |
+
class RKLLMInputType(enum.IntEnum):
|
| 28 |
+
RKLLM_INPUT_PROMPT = 0
|
| 29 |
+
RKLLM_INPUT_TOKEN = 1
|
| 30 |
+
RKLLM_INPUT_EMBED = 2
|
| 31 |
+
RKLLM_INPUT_MULTIMODAL = 3
|
| 32 |
+
|
| 33 |
+
class RKLLMInferMode(enum.IntEnum):
    """What the inference call should produce."""

    RKLLM_INFER_GENERATE = 0               # autoregressive text generation
    RKLLM_INFER_GET_LAST_HIDDEN_LAYER = 1  # return last hidden layer instead
    RKLLM_INFER_GET_LOGITS = 2             # return raw logits instead
|
| 37 |
+
|
| 38 |
+
# --- Structures ---
class RKLLMExtendParam(ctypes.Structure):
    """Extended runtime options (mirrors the C struct RKLLMExtendParam)."""
    base_domain_id: ctypes.c_int32
    embed_flash: ctypes.c_int8
    enabled_cpus_num: ctypes.c_int8
    enabled_cpus_mask: ctypes.c_uint32
    n_batch: ctypes.c_uint8
    use_cross_attn: ctypes.c_int8
    reserved: ctypes.c_uint8 * 104

    _fields_ = [
        ("base_domain_id", ctypes.c_int32),     # base memory-domain id
        ("embed_flash", ctypes.c_int8),         # 1: read token embeddings from flash, 0: disabled
        ("enabled_cpus_num", ctypes.c_int8),    # number of CPU cores enabled for inference
        ("enabled_cpus_mask", ctypes.c_uint32), # bit mask (CPU0..CPU7) of enabled cores
        ("n_batch", ctypes.c_uint8),            # samples per forward pass; >1 enables batching (default 1)
        ("use_cross_attn", ctypes.c_int8),      # non-zero enables cross attention
        ("reserved", ctypes.c_uint8 * 104),     # reserved for future use
    ]
|
| 57 |
+
|
| 58 |
+
class RKLLMParam(ctypes.Structure):
    """Model and sampling configuration (mirrors the C struct RKLLMParam)."""
    model_path: ctypes.c_char_p
    max_context_len: ctypes.c_int32
    max_new_tokens: ctypes.c_int32
    top_k: ctypes.c_int32
    n_keep: ctypes.c_int32
    top_p: ctypes.c_float
    temperature: ctypes.c_float
    repeat_penalty: ctypes.c_float
    frequency_penalty: ctypes.c_float
    presence_penalty: ctypes.c_float
    mirostat: ctypes.c_int32
    mirostat_tau: ctypes.c_float
    mirostat_eta: ctypes.c_float
    skip_special_token: ctypes.c_bool
    is_async: ctypes.c_bool
    img_start: ctypes.c_char_p
    img_end: ctypes.c_char_p
    img_content: ctypes.c_char_p
    extend_param: RKLLMExtendParam

    _fields_ = [
        ("model_path", ctypes.c_char_p),        # path to the model file
        ("max_context_len", ctypes.c_int32),    # max tokens in the context window
        ("max_new_tokens", ctypes.c_int32),     # max tokens to generate
        ("top_k", ctypes.c_int32),              # top-k sampling parameter
        ("n_keep", ctypes.c_int32),             # KV entries kept when the context window slides
        ("top_p", ctypes.c_float),              # nucleus (top-p) sampling parameter
        ("temperature", ctypes.c_float),        # sampling temperature (randomness of token choice)
        ("repeat_penalty", ctypes.c_float),     # penalty for repeated tokens
        ("frequency_penalty", ctypes.c_float),  # penalty for frequent tokens
        ("presence_penalty", ctypes.c_float),   # penalty for tokens already present in the input
        ("mirostat", ctypes.c_int32),           # mirostat sampling flag (0 = disabled)
        ("mirostat_tau", ctypes.c_float),       # mirostat tau
        ("mirostat_eta", ctypes.c_float),       # mirostat eta
        ("skip_special_token", ctypes.c_bool),  # drop special tokens from output
        ("is_async", ctypes.c_bool),            # run inference asynchronously
        ("img_start", ctypes.c_char_p),         # image start marker in multimodal input
        ("img_end", ctypes.c_char_p),           # image end marker in multimodal input
        ("img_content", ctypes.c_char_p),       # image content pointer
        ("extend_param", RKLLMExtendParam),     # extended options
    ]
|
| 100 |
+
|
| 101 |
+
class RKLLMLoraAdapter(ctypes.Structure):
    """Description of one LoRA adapter to load (path, name, scale)."""
    lora_adapter_path: ctypes.c_char_p
    lora_adapter_name: ctypes.c_char_p
    scale: ctypes.c_float

    _fields_ = [
        ("lora_adapter_path", ctypes.c_char_p),  # adapter file on disk
        ("lora_adapter_name", ctypes.c_char_p),  # name used to select it at inference time
        ("scale", ctypes.c_float),               # blend weight of the adapter
    ]
|
| 111 |
+
|
| 112 |
+
class RKLLMEmbedInput(ctypes.Structure):
    """Embedding input: a flat float buffer holding n_tokens embeddings."""
    embed: ctypes.POINTER(ctypes.c_float)
    n_tokens: ctypes.c_size_t

    _fields_ = [
        ("embed", ctypes.POINTER(ctypes.c_float)),  # contiguous float32 embedding data
        ("n_tokens", ctypes.c_size_t),              # number of token embeddings in the buffer
    ]
|
| 120 |
+
|
| 121 |
+
class RKLLMTokenInput(ctypes.Structure):
    """Token-id input: an int32 buffer of n_tokens ids."""
    input_ids: ctypes.POINTER(ctypes.c_int32)
    n_tokens: ctypes.c_size_t

    _fields_ = [
        ("input_ids", ctypes.POINTER(ctypes.c_int32)),  # token id array
        ("n_tokens", ctypes.c_size_t),                  # number of ids in the array
    ]
|
| 129 |
+
|
| 130 |
+
class RKLLMMultiModelInput(ctypes.Structure):
    """Multimodal input: a text prompt combined with image embeddings."""
    prompt: ctypes.c_char_p
    image_embed: ctypes.POINTER(ctypes.c_float)
    n_image_tokens: ctypes.c_size_t
    n_image: ctypes.c_size_t
    image_width: ctypes.c_size_t
    image_height: ctypes.c_size_t

    _fields_ = [
        ("prompt", ctypes.c_char_p),                   # text part of the input
        ("image_embed", ctypes.POINTER(ctypes.c_float)),  # image embedding data
        ("n_image_tokens", ctypes.c_size_t),           # embedding tokens per image
        ("n_image", ctypes.c_size_t),                  # number of images
        ("image_width", ctypes.c_size_t),              # image width in pixels
        ("image_height", ctypes.c_size_t),             # image height in pixels
    ]
|
| 146 |
+
|
| 147 |
+
class RKLLMCrossAttnParam(ctypes.Structure):
    """
    Cross-attention parameters for the decoder.

    Supplies encoder outputs (key/value caches), position indices, and an
    attention mask used when the decoder performs cross attention.

    Memory layout requirements (contiguous):
    - encoder_k_cache: [num_layers][num_tokens][num_kv_heads][head_dim]
    - encoder_v_cache: [num_layers][num_kv_heads][head_dim][num_tokens]
    """
    encoder_k_cache: ctypes.POINTER(ctypes.c_float)
    encoder_v_cache: ctypes.POINTER(ctypes.c_float)
    encoder_mask: ctypes.POINTER(ctypes.c_float)
    encoder_pos: ctypes.POINTER(ctypes.c_int32)
    num_tokens: ctypes.c_int

    _fields_ = [
        ("encoder_k_cache", ctypes.POINTER(ctypes.c_float)),  # size: num_layers * num_tokens * num_kv_heads * head_dim
        ("encoder_v_cache", ctypes.POINTER(ctypes.c_float)),  # size: num_layers * num_kv_heads * head_dim * num_tokens
        ("encoder_mask", ctypes.POINTER(ctypes.c_float)),     # attention mask, length num_tokens
        ("encoder_pos", ctypes.POINTER(ctypes.c_int32)),      # token positions, length num_tokens
        ("num_tokens", ctypes.c_int),                         # tokens in the encoder sequence
    ]
|
| 172 |
+
|
| 173 |
+
class RKLLMPerfStat(ctypes.Structure):
    """Performance counters for the prefill and generate phases."""
    prefill_time_ms: ctypes.c_float
    prefill_tokens: ctypes.c_int
    generate_time_ms: ctypes.c_float
    generate_tokens: ctypes.c_int
    memory_usage_mb: ctypes.c_float

    _fields_ = [
        ("prefill_time_ms", ctypes.c_float),   # total prefill time (ms)
        ("prefill_tokens", ctypes.c_int),      # tokens processed during prefill
        ("generate_time_ms", ctypes.c_float),  # total generation time (ms)
        ("generate_tokens", ctypes.c_int),     # tokens produced during generation
        ("memory_usage_mb", ctypes.c_float),   # VmHWM resident memory during inference (MB)
    ]
|
| 192 |
+
|
| 193 |
+
class _RKLLMInputUnion(ctypes.Union):
    """C union of the four possible input payloads; one member is active at a time."""
    prompt_input: ctypes.c_char_p
    embed_input: RKLLMEmbedInput
    token_input: RKLLMTokenInput
    multimodal_input: RKLLMMultiModelInput

    _fields_ = [
        ("prompt_input", ctypes.c_char_p),            # RKLLM_INPUT_PROMPT
        ("embed_input", RKLLMEmbedInput),             # RKLLM_INPUT_EMBED
        ("token_input", RKLLMTokenInput),             # RKLLM_INPUT_TOKEN
        ("multimodal_input", RKLLMMultiModelInput),   # RKLLM_INPUT_MULTIMODAL
    ]
|
| 205 |
+
|
| 206 |
+
class RKLLMInput(ctypes.Structure):
    """
    Tagged LLM input.

    `input_type` selects which member of the underlying C union is active.
    The typed properties below guard access so the wrong union member is
    never read or written by mistake.
    """
    role: ctypes.c_char_p
    enable_thinking: ctypes.c_bool
    input_type: ctypes.c_int
    _union_data: _RKLLMInputUnion

    _fields_ = [
        ("role", ctypes.c_char_p),          # message role: "user" (input) or "tool" (function result)
        ("enable_thinking", ctypes.c_bool), # toggles Qwen3 "thinking mode"
        ("input_type", ctypes.c_int),       # RKLLMInputType discriminator
        ("_union_data", _RKLLMInputUnion),  # active payload, selected by input_type
    ]

    def _require_type(self, expected, message):
        # Guard: raise unless the union currently holds the expected variant.
        if self.input_type != expected:
            raise AttributeError(message)

    @property
    def prompt_input(self) -> bytes:
        self._require_type(RKLLMInputType.RKLLM_INPUT_PROMPT, "Not a prompt input")
        return self._union_data.prompt_input

    @prompt_input.setter
    def prompt_input(self, value: bytes):
        self._require_type(RKLLMInputType.RKLLM_INPUT_PROMPT, "Not a prompt input")
        self._union_data.prompt_input = value

    @property
    def embed_input(self) -> RKLLMEmbedInput:
        self._require_type(RKLLMInputType.RKLLM_INPUT_EMBED, "Not an embed input")
        return self._union_data.embed_input

    @embed_input.setter
    def embed_input(self, value: RKLLMEmbedInput):
        self._require_type(RKLLMInputType.RKLLM_INPUT_EMBED, "Not an embed input")
        self._union_data.embed_input = value

    @property
    def token_input(self) -> RKLLMTokenInput:
        self._require_type(RKLLMInputType.RKLLM_INPUT_TOKEN, "Not a token input")
        return self._union_data.token_input

    @token_input.setter
    def token_input(self, value: RKLLMTokenInput):
        self._require_type(RKLLMInputType.RKLLM_INPUT_TOKEN, "Not a token input")
        self._union_data.token_input = value

    @property
    def multimodal_input(self) -> RKLLMMultiModelInput:
        self._require_type(RKLLMInputType.RKLLM_INPUT_MULTIMODAL, "Not a multimodal input")
        return self._union_data.multimodal_input

    @multimodal_input.setter
    def multimodal_input(self, value: RKLLMMultiModelInput):
        self._require_type(RKLLMInputType.RKLLM_INPUT_MULTIMODAL, "Not a multimodal input")
        self._union_data.multimodal_input = value
|
| 270 |
+
|
| 271 |
+
class RKLLMLoraParam(ctypes.Structure):
    """Selects a previously loaded LoRA adapter for an inference call."""
    lora_adapter_name: ctypes.c_char_p

    _fields_ = [
        ("lora_adapter_name", ctypes.c_char_p),  # name given at load time
    ]
|
| 277 |
+
|
| 278 |
+
class RKLLMPromptCacheParam(ctypes.Structure):
    """Prompt-cache options for an inference call."""
    save_prompt_cache: ctypes.c_int  # bool-like: non-zero saves the cache
    prompt_cache_path: ctypes.c_char_p

    _fields_ = [
        ("save_prompt_cache", ctypes.c_int),    # bool-like flag
        ("prompt_cache_path", ctypes.c_char_p), # where to save/load the cache
    ]
|
| 286 |
+
|
| 287 |
+
class RKLLMInferParam(ctypes.Structure):
    """Per-call inference options: mode, LoRA selection, prompt cache, history."""
    mode: ctypes.c_int
    lora_params: ctypes.POINTER(RKLLMLoraParam)
    prompt_cache_params: ctypes.POINTER(RKLLMPromptCacheParam)
    keep_history: ctypes.c_int  # bool-like

    _fields_ = [
        ("mode", ctypes.c_int),  # RKLLMInferMode value, passed as a plain int
        ("lora_params", ctypes.POINTER(RKLLMLoraParam)),
        ("prompt_cache_params", ctypes.POINTER(RKLLMPromptCacheParam)),
        ("keep_history", ctypes.c_int),  # bool-like: keep KV history across calls
    ]
|
| 299 |
+
|
| 300 |
+
class RKLLMResultLastHiddenLayer(ctypes.Structure):
    """Last-hidden-layer output: num_tokens vectors of embd_size floats."""
    hidden_states: ctypes.POINTER(ctypes.c_float)
    embd_size: ctypes.c_int
    num_tokens: ctypes.c_int

    _fields_ = [
        ("hidden_states", ctypes.POINTER(ctypes.c_float)),
        ("embd_size", ctypes.c_int),
        ("num_tokens", ctypes.c_int),
    ]
|
| 310 |
+
|
| 311 |
+
class RKLLMResultLogits(ctypes.Structure):
    """Logits output: num_tokens rows of vocab_size floats."""
    logits: ctypes.POINTER(ctypes.c_float)
    vocab_size: ctypes.c_int
    num_tokens: ctypes.c_int

    _fields_ = [
        ("logits", ctypes.POINTER(ctypes.c_float)),
        ("vocab_size", ctypes.c_int),
        ("num_tokens", ctypes.c_int),
    ]
|
| 321 |
+
|
| 322 |
+
class RKLLMResult(ctypes.Structure):
    """
    Result of one inference step.

    Holds the generated text and token id, plus (when requested) the last
    hidden layer, logits, and performance counters.
    """
    text: ctypes.c_char_p
    token_id: ctypes.c_int32
    last_hidden_layer: RKLLMResultLastHiddenLayer
    logits: RKLLMResultLogits
    perf: RKLLMPerfStat

    _fields_ = [
        ("text", ctypes.c_char_p),                         # generated text
        ("token_id", ctypes.c_int32),                      # generated token id
        ("last_hidden_layer", RKLLMResultLastHiddenLayer), # filled when that mode was requested
        ("logits", RKLLMResultLogits),                     # model output logits
        ("perf", RKLLMPerfStat),                           # prefill/generate statistics
    ]
|
| 341 |
+
|
| 342 |
+
# --- Typedefs ---
LLMHandle = ctypes.c_void_p  # opaque handle produced by rkllm_init

# --- Callback Function Type ---
# C signature: int (*)(RKLLMResult* result, void* userdata, int state)
#
# Invoked for each (partial) result with an LLMCallState value.
# Return 0 to continue inference normally; return 1 to pause it — e.g. to
# edit the output or inject a new prompt — then call rkllm_run again with
# the updated content to resume.
LLMResultCallback = ctypes.CFUNCTYPE(
    ctypes.c_int,                 # return: 0 = continue, 1 = pause
    ctypes.POINTER(RKLLMResult),  # pointer to the current result
    ctypes.c_void_p,              # userdata supplied to rkllm_run
    ctypes.c_int,                 # LLMCallState value
)
|
| 367 |
+
|
| 368 |
+
def _iter_library_candidates(library_path: str) -> Tuple[Sequence[str], Sequence[str]]:
|
| 369 |
+
default_name = "librkllmrt.so"
|
| 370 |
+
user_path = library_path or default_name
|
| 371 |
+
|
| 372 |
+
lib_name = default_name
|
| 373 |
+
user_dir = None
|
| 374 |
+
if os.path.isdir(user_path):
|
| 375 |
+
user_dir = user_path
|
| 376 |
+
else:
|
| 377 |
+
lib_name = os.path.basename(user_path) or default_name
|
| 378 |
+
user_dir = os.path.dirname(user_path) or None
|
| 379 |
+
if os.path.isfile(user_path):
|
| 380 |
+
return [user_path], [f"user file: {user_path}"]
|
| 381 |
+
|
| 382 |
+
module_dir = os.path.dirname(os.path.abspath(__file__))
|
| 383 |
+
search_dirs = []
|
| 384 |
+
if user_dir:
|
| 385 |
+
search_dirs.append(user_dir)
|
| 386 |
+
search_dirs.extend([module_dir, os.getcwd()])
|
| 387 |
+
|
| 388 |
+
seen = set()
|
| 389 |
+
candidates = []
|
| 390 |
+
labels = []
|
| 391 |
+
for base_dir in search_dirs:
|
| 392 |
+
norm = os.path.abspath(base_dir)
|
| 393 |
+
if norm in seen:
|
| 394 |
+
continue
|
| 395 |
+
seen.add(norm)
|
| 396 |
+
candidate = os.path.join(base_dir, lib_name)
|
| 397 |
+
candidates.append(candidate)
|
| 398 |
+
labels.append(f"dir: {base_dir}")
|
| 399 |
+
|
| 400 |
+
# System path lookup comes last via the loader's default search.
|
| 401 |
+
candidates.append(lib_name)
|
| 402 |
+
labels.append("system path")
|
| 403 |
+
return candidates, labels
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
class RKLLMRuntime:
|
| 407 |
+
def __init__(self, library_path="./librkllmrt.so"):
|
| 408 |
+
candidates, labels = _iter_library_candidates(library_path)
|
| 409 |
+
self.lib = None
|
| 410 |
+
errors = []
|
| 411 |
+
for candidate, label in zip(candidates, labels):
|
| 412 |
+
try:
|
| 413 |
+
self.lib = ctypes.CDLL(candidate)
|
| 414 |
+
break
|
| 415 |
+
except OSError as e:
|
| 416 |
+
errors.append((candidate, label, str(e)))
|
| 417 |
+
if self.lib is None:
|
| 418 |
+
lines = ["Failed to load RKLLM library. Tried:"]
|
| 419 |
+
for candidate, label, err in errors:
|
| 420 |
+
lines.append(f"- {candidate} ({label}): {err}")
|
| 421 |
+
raise OSError("\n".join(lines))
|
| 422 |
+
self._setup_functions()
|
| 423 |
+
self.llm_handle = LLMHandle()
|
| 424 |
+
self._c_callback = None # To keep the callback object alive
|
| 425 |
+
self._user_callback = None
|
| 426 |
+
|
| 427 |
+
def _setup_functions(self):
|
| 428 |
+
# RKLLMParam rkllm_createDefaultParam();
|
| 429 |
+
self.lib.rkllm_createDefaultParam.restype = RKLLMParam
|
| 430 |
+
self.lib.rkllm_createDefaultParam.argtypes = []
|
| 431 |
+
|
| 432 |
+
# int rkllm_init(LLMHandle* handle, RKLLMParam* param, LLMResultCallback callback);
|
| 433 |
+
self.lib.rkllm_init.restype = ctypes.c_int
|
| 434 |
+
self.lib.rkllm_init.argtypes = [
|
| 435 |
+
ctypes.POINTER(LLMHandle),
|
| 436 |
+
ctypes.POINTER(RKLLMParam),
|
| 437 |
+
LLMResultCallback
|
| 438 |
+
]
|
| 439 |
+
|
| 440 |
+
# int rkllm_load_lora(LLMHandle handle, RKLLMLoraAdapter* lora_adapter);
|
| 441 |
+
self.lib.rkllm_load_lora.restype = ctypes.c_int
|
| 442 |
+
self.lib.rkllm_load_lora.argtypes = [LLMHandle, ctypes.POINTER(RKLLMLoraAdapter)]
|
| 443 |
+
|
| 444 |
+
# int rkllm_load_prompt_cache(LLMHandle handle, const char* prompt_cache_path);
|
| 445 |
+
self.lib.rkllm_load_prompt_cache.restype = ctypes.c_int
|
| 446 |
+
self.lib.rkllm_load_prompt_cache.argtypes = [LLMHandle, ctypes.c_char_p]
|
| 447 |
+
|
| 448 |
+
# int rkllm_release_prompt_cache(LLMHandle handle);
|
| 449 |
+
self.lib.rkllm_release_prompt_cache.restype = ctypes.c_int
|
| 450 |
+
self.lib.rkllm_release_prompt_cache.argtypes = [LLMHandle]
|
| 451 |
+
|
| 452 |
+
# int rkllm_destroy(LLMHandle handle);
|
| 453 |
+
self.lib.rkllm_destroy.restype = ctypes.c_int
|
| 454 |
+
self.lib.rkllm_destroy.argtypes = [LLMHandle]
|
| 455 |
+
|
| 456 |
+
# int rkllm_run(LLMHandle handle, RKLLMInput* rkllm_input, RKLLMInferParam* rkllm_infer_params, void* userdata);
|
| 457 |
+
self.lib.rkllm_run.restype = ctypes.c_int
|
| 458 |
+
self.lib.rkllm_run.argtypes = [
|
| 459 |
+
LLMHandle,
|
| 460 |
+
ctypes.POINTER(RKLLMInput),
|
| 461 |
+
ctypes.POINTER(RKLLMInferParam),
|
| 462 |
+
ctypes.c_void_p # userdata
|
| 463 |
+
]
|
| 464 |
+
|
| 465 |
+
# int rkllm_run_async(LLMHandle handle, RKLLMInput* rkllm_input, RKLLMInferParam* rkllm_infer_params, void* userdata);
|
| 466 |
+
# Assuming async also takes userdata for the callback context
|
| 467 |
+
self.lib.rkllm_run_async.restype = ctypes.c_int
|
| 468 |
+
self.lib.rkllm_run_async.argtypes = [
|
| 469 |
+
LLMHandle,
|
| 470 |
+
ctypes.POINTER(RKLLMInput),
|
| 471 |
+
ctypes.POINTER(RKLLMInferParam),
|
| 472 |
+
ctypes.c_void_p # userdata
|
| 473 |
+
]
|
| 474 |
+
|
| 475 |
+
# int rkllm_abort(LLMHandle handle);
|
| 476 |
+
self.lib.rkllm_abort.restype = ctypes.c_int
|
| 477 |
+
self.lib.rkllm_abort.argtypes = [LLMHandle]
|
| 478 |
+
|
| 479 |
+
# int rkllm_is_running(LLMHandle handle);
|
| 480 |
+
self.lib.rkllm_is_running.restype = ctypes.c_int # 0 if running, non-zero otherwise
|
| 481 |
+
self.lib.rkllm_is_running.argtypes = [LLMHandle]
|
| 482 |
+
|
| 483 |
+
# int rkllm_clear_kv_cache(LLMHandle handle, int keep_system_prompt, int* start_pos, int* end_pos);
|
| 484 |
+
self.lib.rkllm_clear_kv_cache.restype = ctypes.c_int
|
| 485 |
+
self.lib.rkllm_clear_kv_cache.argtypes = [
|
| 486 |
+
LLMHandle,
|
| 487 |
+
ctypes.c_int,
|
| 488 |
+
ctypes.POINTER(ctypes.c_int), # start_pos
|
| 489 |
+
ctypes.POINTER(ctypes.c_int) # end_pos
|
| 490 |
+
]
|
| 491 |
+
|
| 492 |
+
# int rkllm_get_kv_cache_size(LLMHandle handle, int* cache_sizes);
|
| 493 |
+
self.lib.rkllm_get_kv_cache_size.restype = ctypes.c_int
|
| 494 |
+
self.lib.rkllm_get_kv_cache_size.argtypes = [LLMHandle, ctypes.POINTER(ctypes.c_int)]
|
| 495 |
+
|
| 496 |
+
# int rkllm_set_chat_template(LLMHandle handle, const char* system_prompt, const char* prompt_prefix, const char* prompt_postfix);
|
| 497 |
+
self.lib.rkllm_set_chat_template.restype = ctypes.c_int
|
| 498 |
+
self.lib.rkllm_set_chat_template.argtypes = [
|
| 499 |
+
LLMHandle,
|
| 500 |
+
ctypes.c_char_p,
|
| 501 |
+
ctypes.c_char_p,
|
| 502 |
+
ctypes.c_char_p
|
| 503 |
+
]
|
| 504 |
+
|
| 505 |
+
# int rkllm_set_function_tools(LLMHandle handle, const char* system_prompt, const char* tools, const char* tool_response_str);
|
| 506 |
+
self.lib.rkllm_set_function_tools.restype = ctypes.c_int
|
| 507 |
+
self.lib.rkllm_set_function_tools.argtypes = [
|
| 508 |
+
LLMHandle,
|
| 509 |
+
ctypes.c_char_p, # system_prompt
|
| 510 |
+
ctypes.c_char_p, # tools
|
| 511 |
+
ctypes.c_char_p # tool_response_str
|
| 512 |
+
]
|
| 513 |
+
|
| 514 |
+
# int rkllm_set_cross_attn_params(LLMHandle handle, RKLLMCrossAttnParam* cross_attn_params);
|
| 515 |
+
self.lib.rkllm_set_cross_attn_params.restype = ctypes.c_int
|
| 516 |
+
self.lib.rkllm_set_cross_attn_params.argtypes = [LLMHandle, ctypes.POINTER(RKLLMCrossAttnParam)]
|
| 517 |
+
|
| 518 |
+
def create_default_param(self) -> RKLLMParam:
|
| 519 |
+
"""Creates a default RKLLMParam structure."""
|
| 520 |
+
return self.lib.rkllm_createDefaultParam()
|
| 521 |
+
|
| 522 |
+
def init(self, param: RKLLMParam, callback_func) -> int:
|
| 523 |
+
"""
|
| 524 |
+
Initializes the LLM.
|
| 525 |
+
:param param: RKLLMParam structure.
|
| 526 |
+
:param callback_func: A Python function that matches the signature:
|
| 527 |
+
def my_callback(result_ptr, userdata_ptr, state_enum):
|
| 528 |
+
result = result_ptr.contents # RKLLMResult
|
| 529 |
+
# Process result
|
| 530 |
+
# userdata can be retrieved if passed during run, or ignored
|
| 531 |
+
# state = LLMCallState(state_enum)
|
| 532 |
+
:return: 0 for success, non-zero for failure.
|
| 533 |
+
"""
|
| 534 |
+
if not callable(callback_func):
|
| 535 |
+
raise ValueError("callback_func must be a callable Python function.")
|
| 536 |
+
|
| 537 |
+
self._user_callback = callback_func
|
| 538 |
+
|
| 539 |
+
# Keep a reference to the ctypes callback object to prevent it from being garbage collected.
|
| 540 |
+
# Always register a trampoline so we can swap the Python-level handler when needed.
|
| 541 |
+
self._c_callback = LLMResultCallback(self._callback_trampoline)
|
| 542 |
+
|
| 543 |
+
ret = self.lib.rkllm_init(ctypes.byref(self.llm_handle), ctypes.byref(param), self._c_callback)
|
| 544 |
+
if ret != 0:
|
| 545 |
+
raise RuntimeError(f"rkllm_init failed with error code {ret}")
|
| 546 |
+
return ret
|
| 547 |
+
|
| 548 |
+
def load_lora(self, lora_adapter: RKLLMLoraAdapter) -> int:
|
| 549 |
+
"""Loads a Lora adapter."""
|
| 550 |
+
ret = self.lib.rkllm_load_lora(self.llm_handle, ctypes.byref(lora_adapter))
|
| 551 |
+
if ret != 0:
|
| 552 |
+
raise RuntimeError(f"rkllm_load_lora failed with error code {ret}")
|
| 553 |
+
return ret
|
| 554 |
+
|
| 555 |
+
def load_prompt_cache(self, prompt_cache_path: str) -> int:
|
| 556 |
+
"""Loads a prompt cache from a file."""
|
| 557 |
+
c_path = prompt_cache_path.encode('utf-8')
|
| 558 |
+
ret = self.lib.rkllm_load_prompt_cache(self.llm_handle, c_path)
|
| 559 |
+
if ret != 0:
|
| 560 |
+
raise RuntimeError(f"rkllm_load_prompt_cache failed for {prompt_cache_path} with error code {ret}")
|
| 561 |
+
return ret
|
| 562 |
+
|
| 563 |
+
def release_prompt_cache(self) -> int:
|
| 564 |
+
"""Releases the prompt cache from memory."""
|
| 565 |
+
ret = self.lib.rkllm_release_prompt_cache(self.llm_handle)
|
| 566 |
+
if ret != 0:
|
| 567 |
+
raise RuntimeError(f"rkllm_release_prompt_cache failed with error code {ret}")
|
| 568 |
+
return ret
|
| 569 |
+
|
| 570 |
+
def destroy(self) -> int:
|
| 571 |
+
"""Destroys the LLM instance and releases resources."""
|
| 572 |
+
if self.llm_handle and self.llm_handle.value: # Check if handle is not NULL
|
| 573 |
+
ret = self.lib.rkllm_destroy(self.llm_handle)
|
| 574 |
+
self.llm_handle = LLMHandle() # Reset handle
|
| 575 |
+
if ret != 0:
|
| 576 |
+
# Don't raise here as it might be called in __del__
|
| 577 |
+
print(f"Warning: rkllm_destroy failed with error code {ret}")
|
| 578 |
+
return ret
|
| 579 |
+
return 0 # Already destroyed or not initialized
|
| 580 |
+
|
| 581 |
+
def run(self, rkllm_input: RKLLMInput, rkllm_infer_params: RKLLMInferParam, userdata=None) -> int:
|
| 582 |
+
"""Runs an LLM inference task synchronously."""
|
| 583 |
+
# userdata can be a ctypes.py_object if you want to pass Python objects,
|
| 584 |
+
# then cast to c_void_p. Or simply None.
|
| 585 |
+
if userdata is not None:
|
| 586 |
+
# Store the userdata object to keep it alive during the call
|
| 587 |
+
self._userdata_ref = userdata
|
| 588 |
+
c_userdata = ctypes.cast(ctypes.pointer(ctypes.py_object(userdata)), ctypes.c_void_p)
|
| 589 |
+
else:
|
| 590 |
+
c_userdata = None
|
| 591 |
+
ret = self.lib.rkllm_run(self.llm_handle, ctypes.byref(rkllm_input), ctypes.byref(rkllm_infer_params), c_userdata)
|
| 592 |
+
if ret != 0:
|
| 593 |
+
raise RuntimeError(f"rkllm_run failed with error code {ret}")
|
| 594 |
+
return ret
|
| 595 |
+
|
| 596 |
+
def run_async(self, rkllm_input: RKLLMInput, rkllm_infer_params: RKLLMInferParam, userdata=None) -> int:
|
| 597 |
+
"""Runs an LLM inference task asynchronously."""
|
| 598 |
+
if userdata is not None:
|
| 599 |
+
# Store the userdata object to keep it alive during the call
|
| 600 |
+
self._userdata_ref = userdata
|
| 601 |
+
c_userdata = ctypes.cast(ctypes.pointer(ctypes.py_object(userdata)), ctypes.c_void_p)
|
| 602 |
+
else:
|
| 603 |
+
c_userdata = None
|
| 604 |
+
ret = self.lib.rkllm_run_async(self.llm_handle, ctypes.byref(rkllm_input), ctypes.byref(rkllm_infer_params), c_userdata)
|
| 605 |
+
if ret != 0:
|
| 606 |
+
raise RuntimeError(f"rkllm_run_async failed with error code {ret}")
|
| 607 |
+
return ret
|
| 608 |
+
|
| 609 |
+
def abort(self) -> int:
|
| 610 |
+
"""Aborts an ongoing LLM task."""
|
| 611 |
+
ret = self.lib.rkllm_abort(self.llm_handle)
|
| 612 |
+
if ret != 0:
|
| 613 |
+
raise RuntimeError(f"rkllm_abort failed with error code {ret}")
|
| 614 |
+
return ret
|
| 615 |
+
|
| 616 |
+
def is_running(self) -> bool:
|
| 617 |
+
"""Checks if an LLM task is currently running. Returns True if running."""
|
| 618 |
+
# The C API returns 0 if running, non-zero otherwise.
|
| 619 |
+
# This is a bit counter-intuitive for a boolean "is_running".
|
| 620 |
+
return self.lib.rkllm_is_running(self.llm_handle) == 0
|
| 621 |
+
|
| 622 |
+
def clear_kv_cache(self, keep_system_prompt: bool, start_pos: list = None, end_pos: list = None) -> int:
|
| 623 |
+
"""
|
| 624 |
+
清除键值缓存
|
| 625 |
+
|
| 626 |
+
此函数用于清除部分或全部KV缓存。
|
| 627 |
+
|
| 628 |
+
参数:
|
| 629 |
+
- keep_system_prompt: 是否在缓存中保留系统提示(True保留,False清除)
|
| 630 |
+
如果提供了特定范围[start_pos, end_pos),此标志将被忽略
|
| 631 |
+
- start_pos: 要清除的KV缓存范围的起始位置数组(包含),每个批次一个
|
| 632 |
+
- end_pos: 要清除的KV缓存范围的结束位置数组(不包含),每个批次一个
|
| 633 |
+
如果start_pos和end_pos都设置为None,将清除整个缓存,keep_system_prompt将生效
|
| 634 |
+
如果start_pos[i] < end_pos[i],只有指定的范围会被清除,keep_system_prompt将被忽略
|
| 635 |
+
|
| 636 |
+
注意:start_pos或end_pos只有在keep_history == 0且生成已通过在回调中返回1暂停时才有效
|
| 637 |
+
|
| 638 |
+
返回:0表示缓存清除成功,非零表示失败
|
| 639 |
+
"""
|
| 640 |
+
# 准备C数组参数
|
| 641 |
+
c_start_pos = None
|
| 642 |
+
c_end_pos = None
|
| 643 |
+
|
| 644 |
+
if start_pos is not None and end_pos is not None:
|
| 645 |
+
if len(start_pos) != len(end_pos):
|
| 646 |
+
raise ValueError("start_pos和end_pos数组长度必须相同")
|
| 647 |
+
|
| 648 |
+
# 创建C数组
|
| 649 |
+
c_start_pos = (ctypes.c_int * len(start_pos))(*start_pos)
|
| 650 |
+
c_end_pos = (ctypes.c_int * len(end_pos))(*end_pos)
|
| 651 |
+
|
| 652 |
+
ret = self.lib.rkllm_clear_kv_cache(
|
| 653 |
+
self.llm_handle,
|
| 654 |
+
ctypes.c_int(1 if keep_system_prompt else 0),
|
| 655 |
+
c_start_pos,
|
| 656 |
+
c_end_pos
|
| 657 |
+
)
|
| 658 |
+
if ret != 0:
|
| 659 |
+
raise RuntimeError(f"rkllm_clear_kv_cache失败,错误代码:{ret}")
|
| 660 |
+
return ret
|
| 661 |
+
|
| 662 |
+
def set_chat_template(self, system_prompt: str, prompt_prefix: str, prompt_postfix: str) -> int:
|
| 663 |
+
"""Sets the chat template for the LLM."""
|
| 664 |
+
c_system = system_prompt.encode('utf-8') if system_prompt else b""
|
| 665 |
+
c_prefix = prompt_prefix.encode('utf-8') if prompt_prefix else b""
|
| 666 |
+
c_postfix = prompt_postfix.encode('utf-8') if prompt_postfix else b""
|
| 667 |
+
|
| 668 |
+
ret = self.lib.rkllm_set_chat_template(self.llm_handle, c_system, c_prefix, c_postfix)
|
| 669 |
+
if ret != 0:
|
| 670 |
+
raise RuntimeError(f"rkllm_set_chat_template failed with error code {ret}")
|
| 671 |
+
return ret
|
| 672 |
+
|
| 673 |
+
def get_kv_cache_size(self, n_batch: int) -> list:
|
| 674 |
+
"""
|
| 675 |
+
获取给定LLM句柄的键值缓存当前大小
|
| 676 |
+
|
| 677 |
+
此函数返回当前存储在模型KV缓存中的位置总数。
|
| 678 |
+
|
| 679 |
+
参数:
|
| 680 |
+
- n_batch: 批次数量,用于确定返回数组的大小
|
| 681 |
+
|
| 682 |
+
返回:
|
| 683 |
+
- list: 每个批次的缓存大小列表
|
| 684 |
+
"""
|
| 685 |
+
# 预分配数组以存储每个批次的缓存大小
|
| 686 |
+
cache_sizes = (ctypes.c_int * n_batch)()
|
| 687 |
+
|
| 688 |
+
ret = self.lib.rkllm_get_kv_cache_size(self.llm_handle, cache_sizes)
|
| 689 |
+
if ret != 0:
|
| 690 |
+
raise RuntimeError(f"rkllm_get_kv_cache_size失败,错误代码:{ret}")
|
| 691 |
+
|
| 692 |
+
# 转换为Python列表
|
| 693 |
+
return [cache_sizes[i] for i in range(n_batch)]
|
| 694 |
+
|
| 695 |
+
def set_function_tools(self, system_prompt: str, tools: str, tool_response_str: str) -> int:
|
| 696 |
+
"""
|
| 697 |
+
为LLM设置函数调用配置,包括系统提示、工具定义和工具响应token
|
| 698 |
+
|
| 699 |
+
参数:
|
| 700 |
+
- system_prompt: 定义语言模型上下文或行为的系统提示
|
| 701 |
+
- tools: JSON格式的字符串,定义可用的函数,包括它们的名称、描述和参数
|
| 702 |
+
- tool_response_str: 用于识别对话中函数调用结果的唯一标签。它作为标记标签,
|
| 703 |
+
允许分词器将工具输出与正常对话轮次分开识别
|
| 704 |
+
|
| 705 |
+
返回:0表示配置设置成功,非零表示错误
|
| 706 |
+
"""
|
| 707 |
+
c_system = system_prompt.encode('utf-8') if system_prompt else b""
|
| 708 |
+
c_tools = tools.encode('utf-8') if tools else b""
|
| 709 |
+
c_tool_response = tool_response_str.encode('utf-8') if tool_response_str else b""
|
| 710 |
+
|
| 711 |
+
ret = self.lib.rkllm_set_function_tools(self.llm_handle, c_system, c_tools, c_tool_response)
|
| 712 |
+
if ret != 0:
|
| 713 |
+
raise RuntimeError(f"rkllm_set_function_tools失败,错误代码:{ret}")
|
| 714 |
+
return ret
|
| 715 |
+
|
| 716 |
+
def set_cross_attn_params(self, cross_attn_params: RKLLMCrossAttnParam) -> int:
|
| 717 |
+
"""
|
| 718 |
+
为LLM解码器设置交叉注意力参数
|
| 719 |
+
|
| 720 |
+
参数:
|
| 721 |
+
- cross_attn_params: 包含用于交叉注意力的编码器相关输入数据的结构体
|
| 722 |
+
(详见RKLLMCrossAttnParam说明)
|
| 723 |
+
|
| 724 |
+
返回:0表示参数设置成功,非零表示错误
|
| 725 |
+
"""
|
| 726 |
+
ret = self.lib.rkllm_set_cross_attn_params(self.llm_handle, ctypes.byref(cross_attn_params))
|
| 727 |
+
if ret != 0:
|
| 728 |
+
raise RuntimeError(f"rkllm_set_cross_attn_params失败,错误代码:{ret}")
|
| 729 |
+
return ret
|
| 730 |
+
|
| 731 |
+
def __enter__(self):
|
| 732 |
+
return self
|
| 733 |
+
|
| 734 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
| 735 |
+
self.destroy()
|
| 736 |
+
|
| 737 |
+
def __del__(self):
|
| 738 |
+
self.destroy() # Ensure resources are freed if object is garbage collected
|
| 739 |
+
|
| 740 |
+
def _callback_trampoline(self, result_ptr, userdata_ptr, state_enum):
|
| 741 |
+
"""
|
| 742 |
+
Bridge callback that forwards to the currently active Python handler.
|
| 743 |
+
This keeps the C callback pointer stable while allowing per-call overrides.
|
| 744 |
+
"""
|
| 745 |
+
handler = self._user_callback
|
| 746 |
+
if handler is None:
|
| 747 |
+
return 0
|
| 748 |
+
try:
|
| 749 |
+
return handler(result_ptr, userdata_ptr, state_enum)
|
| 750 |
+
except Exception as exc:
|
| 751 |
+
# Avoid propagating exceptions through the C callback boundary.
|
| 752 |
+
print(f"[rkllm_binding] Callback raised an exception: {exc}")
|
| 753 |
+
return 0
|
| 754 |
+
|
| 755 |
+
def forward_embed(
    self,
    embeds: np.ndarray,
    *,
    keep_history: bool = False,
    timeout: Optional[float] = None,
    return_last_only: bool = False,
) -> np.ndarray:
    """
    Run a single forward pass with embedding input and return the last hidden layer.

    Args:
        embeds: Float32 embeddings shaped (T, H) or (1, T, H). Batch>1 is not supported.
        keep_history: When False, KV cache will be cleared after the call. When True,
                      cache is kept; call clear_kv_cache() manually if needed.
        timeout: Optional timeout (seconds) for waiting on the callback.
        return_last_only: If True, return the last token vector shape (H,).

    Returns:
        np.ndarray containing hidden states (T, H) or the last token (H,).

    Raises:
        ValueError: If embeds is None or has an unsupported shape/batch size.
        TimeoutError: If the callback does not deliver hidden states within `timeout`.
        RuntimeError: If the runtime reports an error or no hidden states arrive.
    """
    if embeds is None:
        raise ValueError("embeds must not be None.")

    np_embeds = np.asarray(embeds, dtype=np.float32)
    if np_embeds.ndim == 3:
        if np_embeds.shape[0] != 1:
            raise ValueError("Only batch size 1 is supported for forward_embed.")
        num_tokens = np_embeds.shape[1]
        flat = np_embeds.reshape(-1)
    elif np_embeds.ndim == 2:
        num_tokens = np_embeds.shape[0]
        flat = np_embeds.reshape(-1)
    else:
        raise ValueError("embeds must have shape (T, H) or (1, T, H).")

    # Contiguous float32 copy into a ctypes buffer the C side can read.
    flat = np.ascontiguousarray(flat, dtype=np.float32)
    embed_buffer = (ctypes.c_float * flat.size)(*flat)

    rk_input = RKLLMInput()
    rk_input.input_type = RKLLMInputType.RKLLM_INPUT_EMBED
    embed_input = RKLLMEmbedInput()
    embed_input.embed = embed_buffer
    embed_input.n_tokens = num_tokens
    rk_input._union_data.embed_input = embed_input

    infer_params = RKLLMInferParam()
    infer_params.mode = RKLLMInferMode.RKLLM_INFER_GET_LAST_HIDDEN_LAYER
    infer_params.keep_history = 1 if keep_history else 0
    infer_params.lora_params = None
    infer_params.prompt_cache_params = None

    done = threading.Event()
    result_holder = {"hidden": None, "error": None}

    def _capture_hidden(result_ptr, userdata_ptr, state_enum):
        state = LLMCallState(state_enum)
        if state == LLMCallState.RKLLM_RUN_ERROR:
            result_holder["error"] = "RKLLM reported an error state."
            done.set()
            return 0

        if not result_ptr:
            result_holder["error"] = "Empty result pointer received."
            done.set()
            return 0

        result = result_ptr.contents
        if result.last_hidden_layer.hidden_states and result.last_hidden_layer.embd_size > 0:
            # Copy out of the C-owned buffer before returning from the callback.
            hidden = np.ctypeslib.as_array(
                result.last_hidden_layer.hidden_states,
                shape=(result.last_hidden_layer.num_tokens, result.last_hidden_layer.embd_size),
            ).copy()
            result_holder["hidden"] = hidden[-1].copy() if return_last_only else hidden
            done.set()
            return 1  # Pause further work; we already have the hidden states.

        if state == LLMCallState.RKLLM_RUN_FINISH:
            done.set()

        # BUG FIX: previously this function could fall off the end (implicit
        # None) for NORMAL states carrying no hidden data; the ctypes c_int
        # callback signature requires an integer return value.
        return 0

    previous_callback = self._user_callback
    self._user_callback = _capture_hidden
    try:
        self.run(rk_input, infer_params)
        if not done.wait(timeout):
            raise TimeoutError("forward_embed timed out waiting for hidden states.")
    finally:
        self._user_callback = previous_callback

    if result_holder["error"]:
        raise RuntimeError(result_holder["error"])
    if result_holder["hidden"] is None:
        raise RuntimeError("forward_embed did not receive hidden states.")

    try:
        if not keep_history:
            self.clear_kv_cache(True)
    except Exception:
        # Cache clearing best-effort; keep the forward result usable even if clearing fails.
        pass

    return result_holder["hidden"]
|
| 858 |
+
|
| 859 |
+
# --- Demo CLI ---
|
| 860 |
+
def _cli_parse_arguments() -> argparse.Namespace:
|
| 861 |
+
parser = argparse.ArgumentParser(
|
| 862 |
+
description="Demo application showcasing rkllm_binding usage."
|
| 863 |
+
)
|
| 864 |
+
parser.add_argument(
|
| 865 |
+
"model",
|
| 866 |
+
help="Path to the .rkllm model file used for inference."
|
| 867 |
+
)
|
| 868 |
+
parser.add_argument(
|
| 869 |
+
"--lib",
|
| 870 |
+
default="./librkllmrt.so",
|
| 871 |
+
help="Path to librkllmrt.so. Defaults to ./librkllmrt.so."
|
| 872 |
+
)
|
| 873 |
+
|
| 874 |
+
# Core generation parameters
|
| 875 |
+
parser.add_argument("--max-context-len", type=int, default=512, help="Maximum context length.")
|
| 876 |
+
parser.add_argument("--max-new-tokens", type=int, default=128, help="Maximum number of new tokens to generate.")
|
| 877 |
+
parser.add_argument("--top-k", type=int, default=1, help="Top-K sampling parameter.")
|
| 878 |
+
parser.add_argument("--top-p", type=float, default=0.0, help="Top-P (nucleus) sampling parameter.")
|
| 879 |
+
parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature.")
|
| 880 |
+
parser.add_argument("--repeat-penalty", type=float, default=1.1, help="Penalty applied to repeated tokens.")
|
| 881 |
+
parser.add_argument("--n-keep", type=int, default=0, help="Number of tokens to keep when context slides.")
|
| 882 |
+
parser.add_argument("--mirostat", type=int, default=0, help="Enable Mirostat sampling (0 disables).")
|
| 883 |
+
parser.add_argument("--mirostat-tau", type=float, default=5.0, help="Mirostat tau parameter.")
|
| 884 |
+
parser.add_argument("--mirostat-eta", type=float, default=0.1, help="Mirostat eta parameter.")
|
| 885 |
+
parser.add_argument(
|
| 886 |
+
"--skip-special-token",
|
| 887 |
+
action="store_true",
|
| 888 |
+
help="Skip special tokens when generating output."
|
| 889 |
+
)
|
| 890 |
+
|
| 891 |
+
# Input management
|
| 892 |
+
parser.add_argument(
|
| 893 |
+
"--input-type",
|
| 894 |
+
choices=("prompt", "token", "multimodal"),
|
| 895 |
+
default="prompt",
|
| 896 |
+
help="Select prompt, raw token, or multimodal (image + prompt) input."
|
| 897 |
+
)
|
| 898 |
+
parser.add_argument("--prompt", help="Prompt text to send to the model.")
|
| 899 |
+
parser.add_argument("--prompt-file", help="Path to a UTF-8 text file containing the prompt.")
|
| 900 |
+
parser.add_argument(
|
| 901 |
+
"--token-ids",
|
| 902 |
+
type=int,
|
| 903 |
+
nargs="+",
|
| 904 |
+
help="Raw token IDs (space separated). Only valid when --input-type token."
|
| 905 |
+
)
|
| 906 |
+
parser.add_argument("--role", default="user", help="Role metadata for the input message (e.g., user/system).")
|
| 907 |
+
parser.add_argument(
|
| 908 |
+
"--enable-thinking",
|
| 909 |
+
action="store_true",
|
| 910 |
+
help="Enable thinking mode for supported models."
|
| 911 |
+
)
|
| 912 |
+
parser.add_argument("--image", help="Path to an image file used when --input-type multimodal.")
|
| 913 |
+
parser.add_argument("--vision-encoder", help="Path to the ONNX vision encoder model.")
|
| 914 |
+
parser.add_argument(
|
| 915 |
+
"--encoder-provider",
|
| 916 |
+
help="Comma separated ONNX Runtime providers (e.g., 'CPUExecutionProvider')."
|
| 917 |
+
)
|
| 918 |
+
parser.add_argument(
|
| 919 |
+
"--encoder-threads",
|
| 920 |
+
type=int,
|
| 921 |
+
help="Thread count hint for ONNX Runtime session."
|
| 922 |
+
)
|
| 923 |
+
parser.add_argument(
|
| 924 |
+
"--encoder-input-shape",
|
| 925 |
+
help="Override encoder input spatial size as HxW or H,W (e.g., 392x392)."
|
| 926 |
+
)
|
| 927 |
+
parser.add_argument(
|
| 928 |
+
"--norm",
|
| 929 |
+
choices=("imagenet", "divide_255", "divide_128_sub_1"),
|
| 930 |
+
default="imagenet",
|
| 931 |
+
help="Image normalization preset."
|
| 932 |
+
)
|
| 933 |
+
parser.add_argument(
|
| 934 |
+
"--norm-mean",
|
| 935 |
+
type=float,
|
| 936 |
+
nargs=3,
|
| 937 |
+
metavar=("R", "G", "B"),
|
| 938 |
+
help="Override normalization mean (RGB order)."
|
| 939 |
+
)
|
| 940 |
+
parser.add_argument(
|
| 941 |
+
"--norm-std",
|
| 942 |
+
type=float,
|
| 943 |
+
nargs=3,
|
| 944 |
+
metavar=("R", "G", "B"),
|
| 945 |
+
help="Override normalization std (RGB order)."
|
| 946 |
+
)
|
| 947 |
+
parser.add_argument(
|
| 948 |
+
"--image-background",
|
| 949 |
+
type=int,
|
| 950 |
+
nargs=3,
|
| 951 |
+
metavar=("R", "G", "B"),
|
| 952 |
+
default=(128, 128, 128),
|
| 953 |
+
help="Background color used when padding image to target size."
|
| 954 |
+
)
|
| 955 |
+
parser.add_argument("--img-start-token", help="Override image start token string passed to the model.")
|
| 956 |
+
parser.add_argument("--img-end-token", help="Override image end token string passed to the model.")
|
| 957 |
+
parser.add_argument("--img-content-token", help="Override image content token string passed to the model.")
|
| 958 |
+
|
| 959 |
+
# Inference options
|
| 960 |
+
parser.add_argument(
|
| 961 |
+
"--mode",
|
| 962 |
+
choices=("generate", "hidden", "logits"),
|
| 963 |
+
default="generate",
|
| 964 |
+
help="Inference mode: generate tokens, return last hidden layer, or logits."
|
| 965 |
+
)
|
| 966 |
+
parser.add_argument(
|
| 967 |
+
"--no-keep-history",
|
| 968 |
+
action="store_true",
|
| 969 |
+
help="Do not keep dialogue history on the device."
|
| 970 |
+
)
|
| 971 |
+
|
| 972 |
+
# Output options
|
| 973 |
+
parser.add_argument(
|
| 974 |
+
"--stream",
|
| 975 |
+
action="store_true",
|
| 976 |
+
default=True,
|
| 977 |
+
help="Stream tokens to stdout as they arrive from the callback."
|
| 978 |
+
)
|
| 979 |
+
parser.add_argument(
|
| 980 |
+
"--hide-stats",
|
| 981 |
+
action="store_true",
|
| 982 |
+
help="Suppress performance statistics after inference."
|
| 983 |
+
)
|
| 984 |
+
|
| 985 |
+
args = parser.parse_args()
|
| 986 |
+
|
| 987 |
+
if args.prompt and args.prompt_file:
|
| 988 |
+
parser.error("Arguments --prompt and --prompt-file cannot be used together.")
|
| 989 |
+
|
| 990 |
+
if args.input_type == "prompt":
|
| 991 |
+
if not args.prompt and not args.prompt_file:
|
| 992 |
+
parser.error("Provide --prompt or --prompt-file when --input-type is prompt.")
|
| 993 |
+
if args.token_ids:
|
| 994 |
+
parser.error("--token-ids is only valid when --input-type token.")
|
| 995 |
+
elif args.input_type == "token":
|
| 996 |
+
if not args.token_ids:
|
| 997 |
+
parser.error("--token-ids is required when --input-type token.")
|
| 998 |
+
if args.prompt or args.prompt_file:
|
| 999 |
+
parser.error("--prompt/--prompt-file cannot be combined with --input-type token.")
|
| 1000 |
+
else: # multimodal
|
| 1001 |
+
if args.token_ids:
|
| 1002 |
+
parser.error("--token-ids cannot be used with --input-type multimodal.")
|
| 1003 |
+
if not args.prompt and not args.prompt_file:
|
| 1004 |
+
parser.error("Provide --prompt or --prompt-file when --input-type is multimodal.")
|
| 1005 |
+
if not args.image:
|
| 1006 |
+
parser.error("--image is required when --input-type multimodal.")
|
| 1007 |
+
if not args.vision_encoder:
|
| 1008 |
+
parser.error("--vision-encoder is required when --input-type multimodal.")
|
| 1009 |
+
|
| 1010 |
+
if args.image_background:
|
| 1011 |
+
for component in args.image_background:
|
| 1012 |
+
if component < 0 or component > 255:
|
| 1013 |
+
parser.error("--image-background values must be in the range [0, 255].")
|
| 1014 |
+
|
| 1015 |
+
return args
|
| 1016 |
+
|
| 1017 |
+
|
| 1018 |
+
def _load_prompt_from_args(args: argparse.Namespace) -> str:
|
| 1019 |
+
if args.prompt:
|
| 1020 |
+
return args.prompt
|
| 1021 |
+
if args.prompt_file:
|
| 1022 |
+
try:
|
| 1023 |
+
with open(args.prompt_file, "r", encoding="utf-8") as fp:
|
| 1024 |
+
return fp.read()
|
| 1025 |
+
except OSError as exc:
|
| 1026 |
+
raise RuntimeError(f"Failed to read prompt file '{args.prompt_file}': {exc}") from exc
|
| 1027 |
+
raise RuntimeError("Prompt text is required but not provided.")
|
| 1028 |
+
|
| 1029 |
+
|
| 1030 |
+
def _mode_to_enum(mode: str) -> int:
    """Translate a CLI mode keyword into the RKLLM inference-mode enum.

    Raises KeyError for unrecognized mode strings (argparse choices should
    prevent that from happening in practice).
    """
    table = {
        "generate": RKLLMInferMode.RKLLM_INFER_GENERATE,
        "hidden": RKLLMInferMode.RKLLM_INFER_GET_LAST_HIDDEN_LAYER,
        "logits": RKLLMInferMode.RKLLM_INFER_GET_LOGITS,
    }
    return table[mode]
|
| 1037 |
+
|
| 1038 |
+
|
| 1039 |
+
def _parse_hw_string(value: str) -> Tuple[int, int]:
|
| 1040 |
+
separators = ("x", "X", ",", " ")
|
| 1041 |
+
token = value.strip()
|
| 1042 |
+
for sep in separators:
|
| 1043 |
+
if sep in token:
|
| 1044 |
+
parts = [p for p in token.split(sep) if p]
|
| 1045 |
+
break
|
| 1046 |
+
else:
|
| 1047 |
+
parts = [token]
|
| 1048 |
+
if len(parts) != 2:
|
| 1049 |
+
raise ValueError(f"Unable to parse height/width from '{value}'. Expected format like 392x392.")
|
| 1050 |
+
try:
|
| 1051 |
+
height = int(parts[0])
|
| 1052 |
+
width = int(parts[1])
|
| 1053 |
+
except ValueError as exc:
|
| 1054 |
+
raise ValueError(f"Height/width must be integers, got '{value}'.") from exc
|
| 1055 |
+
if height <= 0 or width <= 0:
|
| 1056 |
+
raise ValueError("Height and width must be positive integers.")
|
| 1057 |
+
return height, width
|
| 1058 |
+
|
| 1059 |
+
|
| 1060 |
+
def _infer_hw_from_onnx_shape(shape: Sequence) -> Tuple[Optional[int], Optional[int]]:
|
| 1061 |
+
if shape is None or len(shape) < 4:
|
| 1062 |
+
return None, None
|
| 1063 |
+
height = shape[-2]
|
| 1064 |
+
width = shape[-1]
|
| 1065 |
+
if isinstance(height, str) or height is None:
|
| 1066 |
+
height = None
|
| 1067 |
+
if isinstance(width, str) or width is None:
|
| 1068 |
+
width = None
|
| 1069 |
+
return height, width
|
| 1070 |
+
|
| 1071 |
+
|
| 1072 |
+
def _parse_providers(provider_str: Optional[str]) -> Optional[list]:
|
| 1073 |
+
if not provider_str:
|
| 1074 |
+
return None
|
| 1075 |
+
providers = [item.strip() for item in provider_str.split(",") if item.strip()]
|
| 1076 |
+
return providers or None
|
| 1077 |
+
|
| 1078 |
+
|
| 1079 |
+
def _load_vision_encoder_session(encoder_path: str, providers: Optional[list], threads: Optional[int]):
    """Create an ONNX Runtime session for the vision encoder.

    Raises:
        RuntimeError: If onnxruntime is missing or the model fails to load.
    """
    try:
        import onnxruntime as ort
    except ImportError as exc:
        raise RuntimeError("onnxruntime is required for multimodal inference. Please install onnxruntime.") from exc

    options = ort.SessionOptions()
    if threads and threads > 0:
        options.intra_op_num_threads = threads

    kwargs = {"sess_options": options}
    if providers:
        kwargs["providers"] = providers
    try:
        return ort.InferenceSession(encoder_path, **kwargs)
    except Exception as exc:
        raise RuntimeError(f"Failed to load vision encoder '{encoder_path}': {exc}") from exc
|
| 1096 |
+
|
| 1097 |
+
|
| 1098 |
+
def _letterbox_resize(image, target_hw: Tuple[int, int], background_color: Sequence[int]):
    """Resize an RGB image preserving aspect ratio, centered on a solid canvas.

    Returns:
        (canvas, resized_h, resized_w) — canvas is target-sized with the
        scaled image pasted at the center over background_color padding.

    Raises:
        RuntimeError: If cv2/numpy are unavailable or the image is invalid.
    """
    try:
        import cv2
        import numpy as np
    except ImportError as exc:
        raise RuntimeError("OpenCV (cv2) and numpy are required for multimodal preprocessing.") from exc

    target_h, target_w = target_hw
    if image.ndim != 3 or image.shape[2] != 3:
        raise RuntimeError("Expected RGB image with 3 channels.")

    src_h, src_w = image.shape[:2]
    if src_h == 0 or src_w == 0:
        raise RuntimeError("Loaded image has invalid dimensions.")

    # Uniform scale so the whole image fits inside the target box.
    ratio = min(target_w / src_w, target_h / src_h)
    new_w = max(1, int(round(src_w * ratio)))
    new_h = max(1, int(round(src_h * ratio)))
    scaled = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

    board = np.full((target_h, target_w, 3), background_color, dtype=scaled.dtype)
    y0 = (target_h - new_h) // 2
    x0 = (target_w - new_w) // 2
    board[y0:y0 + new_h, x0:x0 + new_w] = scaled
    return board, new_h, new_w
|
| 1123 |
+
|
| 1124 |
+
|
| 1125 |
+
def _normalize_image(image, method: str, mean: Optional[Sequence[float]], std: Optional[Sequence[float]]):
|
| 1126 |
+
import numpy as np
|
| 1127 |
+
|
| 1128 |
+
img = image.astype(np.float32)
|
| 1129 |
+
mean_arr = np.array(mean, dtype=np.float32) if mean else None
|
| 1130 |
+
std_arr = np.array(std, dtype=np.float32) if std else None
|
| 1131 |
+
|
| 1132 |
+
if method == "imagenet":
|
| 1133 |
+
img = img / 255.0
|
| 1134 |
+
if mean_arr is None:
|
| 1135 |
+
mean_arr = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
|
| 1136 |
+
if std_arr is None:
|
| 1137 |
+
std_arr = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)
|
| 1138 |
+
img = (img - mean_arr) / std_arr
|
| 1139 |
+
elif method == "divide_255":
|
| 1140 |
+
img = img / 255.0
|
| 1141 |
+
if mean_arr is not None:
|
| 1142 |
+
img = img - mean_arr
|
| 1143 |
+
if std_arr is not None:
|
| 1144 |
+
img = img / std_arr
|
| 1145 |
+
elif method == "divide_128_sub_1":
|
| 1146 |
+
img = img / 128.0 - 1.0
|
| 1147 |
+
if mean_arr is not None:
|
| 1148 |
+
img = img - mean_arr
|
| 1149 |
+
if std_arr is not None:
|
| 1150 |
+
img = img / std_arr
|
| 1151 |
+
else:
|
| 1152 |
+
raise RuntimeError(f"Unsupported normalization method '{method}'.")
|
| 1153 |
+
|
| 1154 |
+
return img
|
| 1155 |
+
|
| 1156 |
+
|
| 1157 |
+
def _encode_image_to_embedding(
    session,
    image_path: str,
    input_name: str,
    output_name: str,
    target_hw: Tuple[int, int],
    background_color: Sequence[int],
    norm_method: str,
    norm_mean: Optional[Sequence[float]],
    norm_std: Optional[Sequence[float]]
):
    """Preprocess one image and run it through the ONNX vision encoder.

    Returns:
        (flat_embedding, n_tokens, target_hw) — flat_embedding is a
        contiguous float32 vector ready for the RKLLM embed input.

    Raises:
        RuntimeError: On missing dependencies, unreadable images, encoder
            failures, or unexpected encoder output shapes.
    """
    try:
        import cv2
        import numpy as np
    except ImportError as exc:
        raise RuntimeError("OpenCV (cv2) and numpy are required for multimodal preprocessing.") from exc

    raw = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if raw is None:
        raise RuntimeError(f"Failed to read image from '{image_path}'.")

    rgb = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB)
    padded, resized_h, resized_w = _letterbox_resize(rgb, target_hw, background_color)

    normalized = _normalize_image(padded, norm_method, norm_mean, norm_std)
    blob = np.transpose(normalized, (2, 0, 1))  # HWC -> CHW
    blob = np.expand_dims(blob, axis=0)  # prepend batch axis
    blob = np.ascontiguousarray(blob, dtype=np.float32)

    try:
        outputs = session.run([output_name], {input_name: blob})
    except Exception as exc:
        raise RuntimeError(f"Vision encoder inference failed: {exc}") from exc

    if not outputs:
        raise RuntimeError("Vision encoder returned no outputs.")

    embedding = outputs[0]
    if embedding.ndim == 3:
        if embedding.shape[0] != 1:
            raise RuntimeError("Vision encoder output batch dimension must be 1 for a single image.")
        n_tokens = embedding.shape[1]
    elif embedding.ndim == 2:
        n_tokens = embedding.shape[0]
    else:
        raise RuntimeError(f"Unsupported vision encoder output shape {embedding.shape}.")

    flat_embedding = np.ascontiguousarray(embedding.reshape(-1).astype(np.float32, copy=False))
    return flat_embedding, n_tokens, target_hw
|
| 1208 |
+
|
| 1209 |
+
if __name__ == "__main__":
    import os
    os.environ["RKLLM_LOG_LEVEL"] = "1"
    args = _cli_parse_arguments()

    # Resolve the prompt up front so input errors surface before loading the model.
    prompt_text = None
    if args.input_type == "prompt":
        prompt_text = _load_prompt_from_args(args)

    # Keep references to ctypes-owned buffers so they stay alive while C runs.
    token_id_array = None
    token_input_struct = None

    generated_chunks = []
    perf_snapshot = {
        "prefill_tokens": 0,
        "prefill_time_ms": 0.0,
        "generate_tokens": 0,
        "generate_time_ms": 0.0,
        "memory_usage_mb": 0.0,
    }

    def demo_callback(result_ptr, userdata_ptr, state_enum):
        """Stream generated text and record the latest performance counters."""
        state = LLMCallState(state_enum)
        if not result_ptr:
            # Defensive: nothing to read from a NULL result pointer.
            return 0
        result = result_ptr.contents

        current_text = ""
        if result.text:
            current_text = result.text.decode("utf-8", errors="ignore")
            generated_chunks.append(current_text)
            if args.stream and current_text:
                print(current_text, end="", flush=True)

        perf_snapshot.update(
            prefill_tokens=result.perf.prefill_tokens,
            prefill_time_ms=result.perf.prefill_time_ms,
            generate_tokens=result.perf.generate_tokens,
            generate_time_ms=result.perf.generate_time_ms,
            memory_usage_mb=result.perf.memory_usage_mb,
        )

        if state == LLMCallState.RKLLM_RUN_ERROR:
            print("\n[Callback] 推理过程中出现错误。")

        return 0

    try:
        with RKLLMRuntime(library_path=args.lib) as rk_llm:
            params = rk_llm.create_default_param()
            params.model_path = os.path.abspath(args.model).encode("utf-8")
            params.max_context_len = args.max_context_len
            params.max_new_tokens = args.max_new_tokens
            params.top_k = args.top_k
            params.top_p = float(args.top_p)
            params.temperature = float(args.temperature)
            params.repeat_penalty = float(args.repeat_penalty)
            params.n_keep = args.n_keep
            params.mirostat = args.mirostat
            params.mirostat_tau = float(args.mirostat_tau)
            params.mirostat_eta = float(args.mirostat_eta)
            params.skip_special_token = bool(args.skip_special_token)
            params.is_async = False

            rk_llm.init(params, demo_callback)

            rk_input = RKLLMInput()
            rk_input.role = args.role.encode("utf-8")
            rk_input.enable_thinking = bool(args.enable_thinking)

            if args.input_type == "prompt":
                rk_input.input_type = RKLLMInputType.RKLLM_INPUT_PROMPT
                rk_input._union_data.prompt_input = prompt_text.encode("utf-8")
            elif args.input_type == "token":
                rk_input.input_type = RKLLMInputType.RKLLM_INPUT_TOKEN
                token_id_array = (ctypes.c_int32 * len(args.token_ids))(*args.token_ids)
                token_input_struct = RKLLMTokenInput()
                token_input_struct.input_ids = token_id_array
                token_input_struct.n_tokens = len(args.token_ids)
                rk_input._union_data.token_input = token_input_struct
            else:
                # BUG FIX: "multimodal" is accepted by the argument parser but
                # previously fell into the token branch and crashed with a
                # TypeError on len(None). Fail with an explicit message until
                # the multimodal path is wired up.
                raise RuntimeError("multimodal input is not implemented by this demo CLI.")

            infer_params = RKLLMInferParam()
            infer_params.mode = _mode_to_enum(args.mode)
            infer_params.keep_history = 0 if args.no_keep_history else 1
            infer_params.lora_params = None
            infer_params.prompt_cache_params = None

            if args.stream:
                print("=== Streaming Output ===")

            rk_llm.run(rk_input, infer_params)

    except OSError as exc:
        print(f"无法加载 RKLLM 运行时库:{exc}")
    except RuntimeError as exc:
        print(f"推理失败:{exc}")
    except Exception as exc:
        print(f"发生未预期的错误:{exc}")
    else:
        if args.stream:
            print()  # Ensure newline after streaming output

        final_text = "".join(generated_chunks)
        if final_text:
            print("=== 生成结果 ===")
            print(final_text)
        else:
            print("未收到生成文本。")

        if not args.hide_stats:
            print("=== 性能统计 ===")
            print(
                f"预填充: {perf_snapshot['prefill_tokens']} tokens / {perf_snapshot['prefill_time_ms']:.2f} ms"
            )
            print(
                f"生成: {perf_snapshot['generate_tokens']} tokens / {perf_snapshot['generate_time_ms']:.2f} ms"
            )
            print(f"最大常驻内存: {perf_snapshot['memory_usage_mb']:.2f} MB")
|
rknn/audio_encoder.rknn
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bdf3c0ae8f3921061115747cc0bff82011c6444083d8e48fcea5b219a3af62a0
|
| 3 |
+
size 643043907
|
rknn/language_model.rkllm
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ce12ddf59600513439d8ec597c5f83631a872ba457c3ca967c9e36992532a8d
|
| 3 |
+
size 4092852588
|
run_qwen3_asr_e2e.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import ctypes
|
| 3 |
+
import math
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
import time
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
import faulthandler
|
| 10 |
+
import numpy as np
|
| 11 |
+
import soundfile as sf
|
| 12 |
+
from scipy.signal import resample_poly
|
| 13 |
+
from transformers import WhisperFeatureExtractor
|
| 14 |
+
|
| 15 |
+
faulthandler.enable()
|
| 16 |
+
os.environ.setdefault("RKLLM_LOG_LEVEL", "1")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
REPO_ROOT = Path(__file__).resolve().parents[1]
|
| 20 |
+
if str(REPO_ROOT) not in sys.path:
|
| 21 |
+
sys.path.insert(0, str(REPO_ROOT))
|
| 22 |
+
|
| 23 |
+
from rkllm_binding import ( # noqa: E402
|
| 24 |
+
LLMCallState,
|
| 25 |
+
RKLLMInferMode,
|
| 26 |
+
RKLLMInferParam,
|
| 27 |
+
RKLLMInput,
|
| 28 |
+
RKLLMInputType,
|
| 29 |
+
RKLLMResult,
|
| 30 |
+
RKLLMRuntime,
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
import ztu_somemodelruntime_ez_rknn_async as ort
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
DEFAULT_ENCODER_PATH = "rknn/audio_encoder.rknn"
|
| 39 |
+
DEFAULT_LLM_PATH = "rknn/language_model.rkllm"
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def now() -> float:
    """Return a monotonic timestamp (seconds) for measuring elapsed intervals."""
    return time.perf_counter()
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class StreamingTextCollector:
    """Callable RKLLM callback that accumulates streamed decoder text.

    Optionally echoes each chunk to stdout as it arrives; sets `error`
    when the runtime reports a failure state.
    """

    def __init__(self, stream_output: bool = True):
        self.stream_output = stream_output  # echo chunks to stdout when True
        self.parts: list[str] = []          # accumulated text fragments
        self.error = False                  # set when the runtime signals an error

    def __call__(self, result_ptr, userdata_ptr, state_enum):
        state = LLMCallState(state_enum)
        result: RKLLMResult = result_ptr.contents

        if state == LLMCallState.RKLLM_RUN_ERROR:
            self.error = True
            if self.stream_output:
                print("\nrun error", flush=True)
        elif state == LLMCallState.RKLLM_RUN_FINISH:
            if self.stream_output:
                print("(finish)", flush=True)
        elif state == LLMCallState.RKLLM_RUN_NORMAL and result.text:
            piece = result.text.decode("utf-8", errors="ignore")
            self.parts.append(piece)
            if self.stream_output:
                print(piece, end="", flush=True)
        return 0

    @property
    def text(self) -> str:
        """Full text collected so far, in arrival order."""
        return "".join(self.parts)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def load_waveform(audio_path: str, target_sr: int = 16000) -> np.ndarray:
    """Load an audio file as mono float32, resampled to target_sr if needed.

    Multi-channel audio is downmixed by averaging channels; resampling uses
    polyphase filtering with an up/down ratio reduced by the gcd.
    """
    data, source_sr = sf.read(audio_path, dtype="float32", always_2d=False)
    data = np.asarray(data, dtype=np.float32)
    if data.ndim == 2:
        data = data.mean(axis=-1)
    if source_sr != target_sr:
        common = math.gcd(int(source_sr), int(target_sr))
        data = resample_poly(
            data,
            up=int(target_sr // common),
            down=int(source_sr // common),
        ).astype(np.float32)
    return data
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def configure_feature_extractor_for_audio(feature_extractor: WhisperFeatureExtractor, waveform: np.ndarray) -> None:
    """Grow the extractor's chunk window so the whole waveform fits in one pass.

    Mutates chunk_length / n_samples / nb_max_frames in place when the audio
    exceeds the current window; shorter audio leaves the defaults untouched.
    """
    seconds_needed = max(1, math.ceil(waveform.shape[0] / float(feature_extractor.sampling_rate)))
    if seconds_needed <= feature_extractor.chunk_length:
        return
    feature_extractor.chunk_length = seconds_needed
    feature_extractor.n_samples = int(seconds_needed * feature_extractor.sampling_rate)
    feature_extractor.nb_max_frames = feature_extractor.n_samples // feature_extractor.hop_length
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def extract_mel_features(model_path: str, audio_path: str) -> tuple[np.ndarray, int]:
    """Compute Whisper log-mel features and the valid frame count for one file.

    Returns:
        (input_features, feature_len) — features are float32 (n_mels, frames);
        feature_len counts the frames covered by the attention mask.
    """
    extractor = WhisperFeatureExtractor.from_pretrained(model_path)
    audio = load_waveform(audio_path)
    configure_feature_extractor_for_audio(extractor, audio)
    batch = extractor(
        audio,
        sampling_rate=16000,
        return_attention_mask=True,
        return_tensors="np",
    )
    features = batch["input_features"][0].astype(np.float32)
    valid_frames = int(batch["attention_mask"][0].sum())
    return features, valid_frames
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def split_mel_features(input_features: np.ndarray, feature_len: int, chunk_frames: int) -> list[tuple[np.ndarray, int]]:
    """Slice a mel matrix into fixed-width, zero-padded chunks.

    Only the first ``feature_len`` frames are consumed.  Each returned chunk
    has exactly ``chunk_frames`` columns (the final one zero-padded) and is
    paired with the count of valid frames it holds.
    """
    pieces: list[tuple[np.ndarray, int]] = []
    offset = 0
    while offset < feature_len:
        valid = min(chunk_frames, feature_len - offset)
        padded = np.zeros((input_features.shape[0], chunk_frames), dtype=np.float32)
        padded[:, :valid] = input_features[:, offset : offset + valid]
        pieces.append((padded, valid))
        offset += valid
    return pieces
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def get_chunk_output_length_value(length: int) -> int:
    """Return the encoder output length for an input of ``length`` frames.

    Applies ``ceil(x / 2)`` three times, mirroring three stride-2 stages
    (an overall ceil-division by 8).
    """
    result = int(length)
    for _ in range(3):
        result = (result + 1) // 2
    return result
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def parse_args():
    """Parse command-line options for the end-to-end ASR pipeline.

    All defaults mirror the shipped model layout; only ``--audio-path`` is
    mandatory.
    """
    arg_parser = argparse.ArgumentParser(description="Run end-to-end Qwen3-ASR with RKNN audio encoder and RKLLM decoder.")
    # Model locations.
    arg_parser.add_argument("--model-path", type=str, default=".", help="Path to the original Qwen3-ASR model directory.")
    arg_parser.add_argument("--audio-path", type=str, required=True, help="Path to the input audio file.")
    arg_parser.add_argument("--encoder-model-path", type=str, default=DEFAULT_ENCODER_PATH, help="Path to the audio encoder model (.rknn).")
    arg_parser.add_argument("--llm-model-path", type=str, default=DEFAULT_LLM_PATH, help="Path to the exported .rkllm text model.")
    # Encoder / decoder knobs.
    arg_parser.add_argument("--chunk-frames", type=int, default=100, help="Fixed mel chunk length.")
    arg_parser.add_argument("--max-new-tokens", type=int, default=1024, help="Maximum number of new tokens to generate.")
    arg_parser.add_argument("--max-context-len", type=int, default=4096, help="Maximum context length for RKLLM.")
    arg_parser.add_argument("--top-k", type=int, default=5, help="Top-k used by RKLLM decoding.")
    # Prompting.
    arg_parser.add_argument("--system-prompt", type=str, default="", help="Optional system prompt.")
    arg_parser.add_argument("--force-language", type=str, default=None, help="Optional language suffix, for example 'Chinese'. Appends 'language X<asr_text>' after the assistant prompt.")
    # Outputs.
    arg_parser.add_argument("--save-audio-features", type=str, default=None, help="Optional path to save concatenated audio features.")
    arg_parser.add_argument("--save-text", type=str, default=None, help="Optional path to save the final decoded text.")
    arg_parser.add_argument("--no-stream", action="store_true", help="Disable streaming stdout from the RKLLM callback.")
    return arg_parser.parse_args()
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def build_chat_template(system_prompt: str, force_language) -> tuple[str, str, str]:
    """Build the (system, prompt_prefix, prompt_postfix) chat-template strings.

    When ``force_language`` is truthy, the assistant turn is seeded with
    ``language <X><asr_text>`` so decoding starts in the requested language.
    """
    assistant_seed = f"language {force_language}<asr_text>" if force_language else ""
    system_text = f"<|im_start|>system\n{system_prompt or ''}<|im_end|>\n"
    user_prefix = "<|im_start|>user\n"
    user_postfix = f"<|im_end|>\n<|im_start|>assistant\n{assistant_seed}"
    return (system_text, user_prefix, user_postfix)
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def run_audio_encoder(
    session,
    input_features: np.ndarray,
    feature_len: int,
    chunk_frames: int,
) -> np.ndarray:
    """Run the audio encoder chunk-by-chunk and concatenate valid outputs.

    ``session`` must expose ``run(output_names, feed_dict)`` in the
    onnxruntime style.  Each mel chunk is fed with a batch dimension; the
    valid output length per chunk comes from the model's second output when
    present, otherwise from the same three ceil-halvings the encoder applies
    (see ``get_chunk_output_length_value``).
    """
    encoded_chunks = []
    for mel_chunk, valid_frames in split_mel_features(input_features, feature_len, chunk_frames):
        results = session.run(
            None,
            {
                "input_features": np.ascontiguousarray(mel_chunk[None, ...], dtype=np.float32),
                # "feature_len": np.asarray([valid_frames], dtype=np.int32),
            },
        )
        embeddings = np.asarray(results[0], dtype=np.float32)
        if len(results) >= 2:
            # The model reports its own valid output length.
            kept = int(np.asarray(results[1]).reshape(-1)[0])
        else:
            # Fall back to the analytic downsampled length.
            kept = get_chunk_output_length_value(valid_frames)
        encoded_chunks.append(embeddings[0, :kept])
    if not encoded_chunks:
        # NOTE(review): 2048 is assumed to be the embedding width — confirm
        # against the encoder model's output shape.
        return np.zeros((0, 2048), dtype=np.float32)
    return np.concatenate(encoded_chunks, axis=0)
|
| 197 |
+
|
| 198 |
+
def load_rkllm(
    llm_model_path: str,
    max_new_tokens: int,
    max_context_len: int,
    top_k: int,
    system_prompt: str,
    force_language,
    stream_output: bool,
):
    """Initialize the RKLLM runtime and install the ASR chat template.

    Returns ``(runtime, collector)`` where ``collector`` accumulates (and
    optionally streams) the generated text via the RKLLM callback.
    """
    text_collector = StreamingTextCollector(stream_output=stream_output)
    runtime = RKLLMRuntime()
    runtime_param = runtime.create_default_param()

    # Basic decoding limits.
    runtime_param.model_path = llm_model_path.encode("utf-8")
    runtime_param.top_k = top_k
    runtime_param.max_new_tokens = max_new_tokens
    runtime_param.max_context_len = max_context_len
    runtime_param.skip_special_token = True
    # RKLLM's multimodal slots are image-named; the audio tokens are mapped
    # onto them.
    runtime_param.img_start = b"<|audio_start|>"
    runtime_param.img_end = b"<|audio_end|>"
    runtime_param.img_content = b"<|audio_pad|>"
    runtime_param.extend_param.base_domain_id = 1  # 4GB is not enough

    runtime.init(runtime_param, text_collector)

    # Install the chat template after init, as the runtime requires.
    system_text, prompt_prefix, prompt_postfix = build_chat_template(
        system_prompt=system_prompt,
        force_language=force_language,
    )
    runtime.set_chat_template(
        system_prompt=system_text,
        prompt_prefix=prompt_prefix,
        prompt_postfix=prompt_postfix,
    )
    return runtime, text_collector
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def run_rkllm(
    rk_llm: RKLLMRuntime,
    audio_features: np.ndarray,
) -> None:
    """Feed the encoder embeddings to RKLLM as multimodal input and generate.

    ``audio_features`` is a ``(tokens, dim)`` float32 matrix; it is flattened
    and handed to the runtime as a raw float pointer.
    """
    llm_input = RKLLMInput()
    llm_input.role = b"user"
    llm_input.input_type = RKLLMInputType.RKLLM_INPUT_MULTIMODAL

    # RKLLM multimodal prompt must contain the literal "<image>" placeholder.
    # Keep `embed` referenced until run() returns so the pointer stays valid.
    embed = np.ascontiguousarray(audio_features.reshape(-1), dtype=np.float32)
    llm_input.multimodal_input.prompt = b"<image>"
    llm_input.multimodal_input.image_embed = embed.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    llm_input.multimodal_input.n_image_tokens = audio_features.shape[0]
    llm_input.multimodal_input.n_image = 1
    llm_input.multimodal_input.image_height = 1
    llm_input.multimodal_input.image_width = max(audio_features.shape[0], 1)

    gen_param = RKLLMInferParam()
    gen_param.mode = RKLLMInferMode.RKLLM_INFER_GENERATE
    gen_param.keep_history = 0

    rk_llm.run(llm_input, gen_param)
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def main():
    """End-to-end pipeline: mel extraction -> RKNN audio encoder -> RKLLM decode.

    Prints per-stage timings and, optionally, saves the concatenated audio
    features and the final transcript.
    """
    args = parse_args()
    total_t0 = now()
    encoder_session = None
    rk_llm = None
    collector = None

    # --- Load phase: feature extraction + model initialization. ---
    load_t0 = now()
    mel_t0 = now()
    input_features, feature_len = extract_mel_features(args.model_path, args.audio_path)
    mel_elapsed = now() - mel_t0

    encoder_session = ort.InferenceSession(args.encoder_model_path)
    rkllm_init_t0 = now()
    rk_llm, collector = load_rkllm(
        llm_model_path=args.llm_model_path,
        max_new_tokens=args.max_new_tokens,
        max_context_len=args.max_context_len,
        top_k=args.top_k,
        system_prompt=args.system_prompt,
        force_language=args.force_language,
        stream_output=not args.no_stream,
    )
    rkllm_init_elapsed = now() - rkllm_init_t0
    load_elapsed = now() - load_t0

    # --- Inference phase: chunked audio encoding. ---
    infer_t0 = now()
    encoder_t0 = now()
    audio_features = run_audio_encoder(
        session=encoder_session,
        input_features=input_features,
        feature_len=feature_len,
        chunk_frames=args.chunk_frames,
    )
    encoder_elapsed = now() - encoder_t0

    print(f"input_feature_len: {feature_len}")
    print(f"audio_features: {audio_features.shape}")
    print(f"time_mel_sec: {mel_elapsed:.3f}")
    print(f"time_rkllm_init_sec: {rkllm_init_elapsed:.3f}")
    print(f"time_load_total_sec: {load_elapsed:.3f}")
    print(f"time_audio_encoder_sec: {encoder_elapsed:.3f}")

    # Optionally persist the concatenated encoder output for debugging.
    if args.save_audio_features:
        savepath = Path(args.save_audio_features)
        savepath.parent.mkdir(parents=True, exist_ok=True)
        np.save(savepath, audio_features)
        print(f"saved_audio_features: {savepath}")

    # --- Generation: text is accumulated by the collector callback. ---
    generate_t0 = now()
    run_rkllm(rk_llm=rk_llm, audio_features=audio_features)
    generate_elapsed = now() - generate_t0
    infer_elapsed = now() - infer_t0
    total_elapsed = now() - total_t0

    if collector and collector.error:
        raise RuntimeError("RKLLM generation failed.")
    text = collector.text if collector else ""

    print(f"time_generate_sec: {generate_elapsed:.3f}")
    print(f"time_infer_total_sec: {infer_elapsed:.3f}")
    print(f"time_total_sec: {total_elapsed:.3f}")

    if args.save_text:
        savepath = Path(args.save_text)
        savepath.parent.mkdir(parents=True, exist_ok=True)
        savepath.write_text(text, encoding="utf-8")
        print(f"saved_text: {savepath}")

    # When streaming was disabled, the text was never echoed; print it now.
    if args.no_stream:
        print(text)
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
# Script entry point: run the full ASR pipeline with CLI arguments.
if __name__ == "__main__":
    main()
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,549 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
},
|
| 181 |
+
"151665": {
|
| 182 |
+
"content": "<tool_response>",
|
| 183 |
+
"lstrip": false,
|
| 184 |
+
"normalized": false,
|
| 185 |
+
"rstrip": false,
|
| 186 |
+
"single_word": false,
|
| 187 |
+
"special": false
|
| 188 |
+
},
|
| 189 |
+
"151666": {
|
| 190 |
+
"content": "</tool_response>",
|
| 191 |
+
"lstrip": false,
|
| 192 |
+
"normalized": false,
|
| 193 |
+
"rstrip": false,
|
| 194 |
+
"single_word": false,
|
| 195 |
+
"special": false
|
| 196 |
+
},
|
| 197 |
+
"151667": {
|
| 198 |
+
"content": "<think>",
|
| 199 |
+
"lstrip": false,
|
| 200 |
+
"normalized": false,
|
| 201 |
+
"rstrip": false,
|
| 202 |
+
"single_word": false,
|
| 203 |
+
"special": false
|
| 204 |
+
},
|
| 205 |
+
"151668": {
|
| 206 |
+
"content": "</think>",
|
| 207 |
+
"lstrip": false,
|
| 208 |
+
"normalized": false,
|
| 209 |
+
"rstrip": false,
|
| 210 |
+
"single_word": false,
|
| 211 |
+
"special": false
|
| 212 |
+
},
|
| 213 |
+
"151669": {
|
| 214 |
+
"content": "<|audio_start|>",
|
| 215 |
+
"lstrip": false,
|
| 216 |
+
"normalized": false,
|
| 217 |
+
"rstrip": false,
|
| 218 |
+
"single_word": false,
|
| 219 |
+
"special": true
|
| 220 |
+
},
|
| 221 |
+
"151670": {
|
| 222 |
+
"content": "<|audio_end|>",
|
| 223 |
+
"lstrip": false,
|
| 224 |
+
"normalized": false,
|
| 225 |
+
"rstrip": false,
|
| 226 |
+
"single_word": false,
|
| 227 |
+
"special": true
|
| 228 |
+
},
|
| 229 |
+
"151671": {
|
| 230 |
+
"content": "<tts_pad>",
|
| 231 |
+
"lstrip": false,
|
| 232 |
+
"normalized": false,
|
| 233 |
+
"rstrip": false,
|
| 234 |
+
"single_word": false,
|
| 235 |
+
"special": true
|
| 236 |
+
},
|
| 237 |
+
"151672": {
|
| 238 |
+
"content": "<tts_text_bos>",
|
| 239 |
+
"lstrip": false,
|
| 240 |
+
"normalized": false,
|
| 241 |
+
"rstrip": false,
|
| 242 |
+
"single_word": false,
|
| 243 |
+
"special": true
|
| 244 |
+
},
|
| 245 |
+
"151673": {
|
| 246 |
+
"content": "<tts_text_eod>",
|
| 247 |
+
"lstrip": false,
|
| 248 |
+
"normalized": false,
|
| 249 |
+
"rstrip": false,
|
| 250 |
+
"single_word": false,
|
| 251 |
+
"special": true
|
| 252 |
+
},
|
| 253 |
+
"151674": {
|
| 254 |
+
"content": "<tts_text_bos_single>",
|
| 255 |
+
"lstrip": false,
|
| 256 |
+
"normalized": false,
|
| 257 |
+
"rstrip": false,
|
| 258 |
+
"single_word": false,
|
| 259 |
+
"special": true
|
| 260 |
+
},
|
| 261 |
+
"151675": {
|
| 262 |
+
"content": "<non_speech>",
|
| 263 |
+
"lstrip": false,
|
| 264 |
+
"normalized": false,
|
| 265 |
+
"rstrip": false,
|
| 266 |
+
"single_word": false,
|
| 267 |
+
"special": false
|
| 268 |
+
},
|
| 269 |
+
"151676": {
|
| 270 |
+
"content": "<|audio_pad|>",
|
| 271 |
+
"lstrip": false,
|
| 272 |
+
"normalized": false,
|
| 273 |
+
"rstrip": false,
|
| 274 |
+
"single_word": false,
|
| 275 |
+
"special": true
|
| 276 |
+
},
|
| 277 |
+
"151677": {
|
| 278 |
+
"content": "<blank1>",
|
| 279 |
+
"lstrip": false,
|
| 280 |
+
"normalized": false,
|
| 281 |
+
"rstrip": false,
|
| 282 |
+
"single_word": false,
|
| 283 |
+
"special": true
|
| 284 |
+
},
|
| 285 |
+
"151678": {
|
| 286 |
+
"content": "<blank2>",
|
| 287 |
+
"lstrip": false,
|
| 288 |
+
"normalized": false,
|
| 289 |
+
"rstrip": false,
|
| 290 |
+
"single_word": false,
|
| 291 |
+
"special": true
|
| 292 |
+
},
|
| 293 |
+
"151679": {
|
| 294 |
+
"content": "<blank3>",
|
| 295 |
+
"lstrip": false,
|
| 296 |
+
"normalized": false,
|
| 297 |
+
"rstrip": false,
|
| 298 |
+
"single_word": false,
|
| 299 |
+
"special": true
|
| 300 |
+
},
|
| 301 |
+
"151680": {
|
| 302 |
+
"content": "<blank4>",
|
| 303 |
+
"lstrip": false,
|
| 304 |
+
"normalized": false,
|
| 305 |
+
"rstrip": false,
|
| 306 |
+
"single_word": false,
|
| 307 |
+
"special": true
|
| 308 |
+
},
|
| 309 |
+
"151681": {
|
| 310 |
+
"content": "<blank5>",
|
| 311 |
+
"lstrip": false,
|
| 312 |
+
"normalized": false,
|
| 313 |
+
"rstrip": false,
|
| 314 |
+
"single_word": false,
|
| 315 |
+
"special": true
|
| 316 |
+
},
|
| 317 |
+
"151682": {
|
| 318 |
+
"content": "<blank6>",
|
| 319 |
+
"lstrip": false,
|
| 320 |
+
"normalized": false,
|
| 321 |
+
"rstrip": false,
|
| 322 |
+
"single_word": false,
|
| 323 |
+
"special": true
|
| 324 |
+
},
|
| 325 |
+
"151683": {
|
| 326 |
+
"content": "<blank7>",
|
| 327 |
+
"lstrip": false,
|
| 328 |
+
"normalized": false,
|
| 329 |
+
"rstrip": false,
|
| 330 |
+
"single_word": false,
|
| 331 |
+
"special": true
|
| 332 |
+
},
|
| 333 |
+
"151684": {
|
| 334 |
+
"content": "<blank8>",
|
| 335 |
+
"lstrip": false,
|
| 336 |
+
"normalized": false,
|
| 337 |
+
"rstrip": false,
|
| 338 |
+
"single_word": false,
|
| 339 |
+
"special": true
|
| 340 |
+
},
|
| 341 |
+
"151685": {
|
| 342 |
+
"content": "<blank9>",
|
| 343 |
+
"lstrip": false,
|
| 344 |
+
"normalized": false,
|
| 345 |
+
"rstrip": false,
|
| 346 |
+
"single_word": false,
|
| 347 |
+
"special": true
|
| 348 |
+
},
|
| 349 |
+
"151686": {
|
| 350 |
+
"content": "<blank10>",
|
| 351 |
+
"lstrip": false,
|
| 352 |
+
"normalized": false,
|
| 353 |
+
"rstrip": false,
|
| 354 |
+
"single_word": false,
|
| 355 |
+
"special": true
|
| 356 |
+
},
|
| 357 |
+
"151687": {
|
| 358 |
+
"content": "<blank11>",
|
| 359 |
+
"lstrip": false,
|
| 360 |
+
"normalized": false,
|
| 361 |
+
"rstrip": false,
|
| 362 |
+
"single_word": false,
|
| 363 |
+
"special": true
|
| 364 |
+
},
|
| 365 |
+
"151688": {
|
| 366 |
+
"content": "<blank12>",
|
| 367 |
+
"lstrip": false,
|
| 368 |
+
"normalized": false,
|
| 369 |
+
"rstrip": false,
|
| 370 |
+
"single_word": false,
|
| 371 |
+
"special": true
|
| 372 |
+
},
|
| 373 |
+
"151689": {
|
| 374 |
+
"content": "<blank13>",
|
| 375 |
+
"lstrip": false,
|
| 376 |
+
"normalized": false,
|
| 377 |
+
"rstrip": false,
|
| 378 |
+
"single_word": false,
|
| 379 |
+
"special": true
|
| 380 |
+
},
|
| 381 |
+
"151690": {
|
| 382 |
+
"content": "<blank14>",
|
| 383 |
+
"lstrip": false,
|
| 384 |
+
"normalized": false,
|
| 385 |
+
"rstrip": false,
|
| 386 |
+
"single_word": false,
|
| 387 |
+
"special": true
|
| 388 |
+
},
|
| 389 |
+
"151691": {
|
| 390 |
+
"content": "<blank15>",
|
| 391 |
+
"lstrip": false,
|
| 392 |
+
"normalized": false,
|
| 393 |
+
"rstrip": false,
|
| 394 |
+
"single_word": false,
|
| 395 |
+
"special": true
|
| 396 |
+
},
|
| 397 |
+
"151692": {
|
| 398 |
+
"content": "<blank16>",
|
| 399 |
+
"lstrip": false,
|
| 400 |
+
"normalized": false,
|
| 401 |
+
"rstrip": false,
|
| 402 |
+
"single_word": false,
|
| 403 |
+
"special": true
|
| 404 |
+
},
|
| 405 |
+
"151693": {
|
| 406 |
+
"content": "<blank17>",
|
| 407 |
+
"lstrip": false,
|
| 408 |
+
"normalized": false,
|
| 409 |
+
"rstrip": false,
|
| 410 |
+
"single_word": false,
|
| 411 |
+
"special": true
|
| 412 |
+
},
|
| 413 |
+
"151694": {
|
| 414 |
+
"content": "<blank18>",
|
| 415 |
+
"lstrip": false,
|
| 416 |
+
"normalized": false,
|
| 417 |
+
"rstrip": false,
|
| 418 |
+
"single_word": false,
|
| 419 |
+
"special": true
|
| 420 |
+
},
|
| 421 |
+
"151695": {
|
| 422 |
+
"content": "<blank19>",
|
| 423 |
+
"lstrip": false,
|
| 424 |
+
"normalized": false,
|
| 425 |
+
"rstrip": false,
|
| 426 |
+
"single_word": false,
|
| 427 |
+
"special": true
|
| 428 |
+
},
|
| 429 |
+
"151696": {
|
| 430 |
+
"content": "<blank20>",
|
| 431 |
+
"lstrip": false,
|
| 432 |
+
"normalized": false,
|
| 433 |
+
"rstrip": false,
|
| 434 |
+
"single_word": false,
|
| 435 |
+
"special": true
|
| 436 |
+
},
|
| 437 |
+
"151697": {
|
| 438 |
+
"content": "<blank21>",
|
| 439 |
+
"lstrip": false,
|
| 440 |
+
"normalized": false,
|
| 441 |
+
"rstrip": false,
|
| 442 |
+
"single_word": false,
|
| 443 |
+
"special": true
|
| 444 |
+
},
|
| 445 |
+
"151698": {
|
| 446 |
+
"content": "<blank22>",
|
| 447 |
+
"lstrip": false,
|
| 448 |
+
"normalized": false,
|
| 449 |
+
"rstrip": false,
|
| 450 |
+
"single_word": false,
|
| 451 |
+
"special": true
|
| 452 |
+
},
|
| 453 |
+
"151699": {
|
| 454 |
+
"content": "<blank23>",
|
| 455 |
+
"lstrip": false,
|
| 456 |
+
"normalized": false,
|
| 457 |
+
"rstrip": false,
|
| 458 |
+
"single_word": false,
|
| 459 |
+
"special": true
|
| 460 |
+
},
|
| 461 |
+
"151700": {
|
| 462 |
+
"content": "<blank24>",
|
| 463 |
+
"lstrip": false,
|
| 464 |
+
"normalized": false,
|
| 465 |
+
"rstrip": false,
|
| 466 |
+
"single_word": false,
|
| 467 |
+
"special": true
|
| 468 |
+
},
|
| 469 |
+
"151701": {
|
| 470 |
+
"content": "<blank25>",
|
| 471 |
+
"lstrip": false,
|
| 472 |
+
"normalized": false,
|
| 473 |
+
"rstrip": false,
|
| 474 |
+
"single_word": false,
|
| 475 |
+
"special": true
|
| 476 |
+
},
|
| 477 |
+
"151702": {
|
| 478 |
+
"content": "<blank26>",
|
| 479 |
+
"lstrip": false,
|
| 480 |
+
"normalized": false,
|
| 481 |
+
"rstrip": false,
|
| 482 |
+
"single_word": false,
|
| 483 |
+
"special": true
|
| 484 |
+
},
|
| 485 |
+
"151703": {
|
| 486 |
+
"content": "<blank27>",
|
| 487 |
+
"lstrip": false,
|
| 488 |
+
"normalized": false,
|
| 489 |
+
"rstrip": false,
|
| 490 |
+
"single_word": false,
|
| 491 |
+
"special": true
|
| 492 |
+
},
|
| 493 |
+
"151704": {
|
| 494 |
+
"content": "<asr_text>",
|
| 495 |
+
"lstrip": false,
|
| 496 |
+
"normalized": false,
|
| 497 |
+
"rstrip": false,
|
| 498 |
+
"single_word": false,
|
| 499 |
+
"special": false
|
| 500 |
+
}
|
| 501 |
+
},
|
| 502 |
+
"additional_special_tokens": [
|
| 503 |
+
"<|im_start|>",
|
| 504 |
+
"<|im_end|>",
|
| 505 |
+
"<|object_ref_start|>",
|
| 506 |
+
"<|object_ref_end|>",
|
| 507 |
+
"<|box_start|>",
|
| 508 |
+
"<|box_end|>",
|
| 509 |
+
"<|quad_start|>",
|
| 510 |
+
"<|quad_end|>",
|
| 511 |
+
"<|vision_start|>",
|
| 512 |
+
"<|vision_end|>",
|
| 513 |
+
"<|vision_pad|>",
|
| 514 |
+
"<|image_pad|>",
|
| 515 |
+
"<|video_pad|>",
|
| 516 |
+
"<|audio_start|>",
|
| 517 |
+
"<|audio_end|>",
|
| 518 |
+
"<tts_pad>",
|
| 519 |
+
"<tts_text_bos>",
|
| 520 |
+
"<tts_text_bos_single>",
|
| 521 |
+
"<|audio_pad|>"
|
| 522 |
+
],
|
| 523 |
+
"audio_bos_token": "<|audio_start|>",
|
| 524 |
+
"audio_eos_token": "<|audio_end|>",
|
| 525 |
+
"audio_token": "<|audio_pad|>",
|
| 526 |
+
"bos_token": null,
|
| 527 |
+
"clean_up_tokenization_spaces": false,
|
| 528 |
+
"eos_token": "<|im_end|>",
|
| 529 |
+
"errors": "replace",
|
| 530 |
+
"extra_special_tokens": {
|
| 531 |
+
"audio_bos_token": "<|audio_start|>",
|
| 532 |
+
"audio_eos_token": "<|audio_end|>",
|
| 533 |
+
"audio_token": "<|audio_pad|>",
|
| 534 |
+
"image_token": "<|image_pad|>",
|
| 535 |
+
"video_token": "<|video_pad|>",
|
| 536 |
+
"vision_bos_token": "<|vision_start|>",
|
| 537 |
+
"vision_eos_token": "<|vision_end|>"
|
| 538 |
+
},
|
| 539 |
+
"image_token": "<|image_pad|>",
|
| 540 |
+
"model_max_length": 131072,
|
| 541 |
+
"pad_token": "<|endoftext|>",
|
| 542 |
+
"processor_class": "Qwen3ASRProcessor",
|
| 543 |
+
"split_special_tokens": false,
|
| 544 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 545 |
+
"unk_token": null,
|
| 546 |
+
"video_token": "<|video_pad|>",
|
| 547 |
+
"vision_bos_token": "<|vision_start|>",
|
| 548 |
+
"vision_eos_token": "<|vision_end|>"
|
| 549 |
+
}
|
vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|