alexmarques committed on
Commit
c37eee8
·
verified ·
1 Parent(s): 2d0d975

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tekken.json filter=lfs diff=lfs merge=lfs -text
consolidated.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee8b29ec0ff94e63eef233331ea7bc10545aab6a622e43dcf80781d08b8f6f7d
3
+ size 28392883664
convert_voxtral_hf_to_mistral.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
import argparse
import gc
import json
import os
import re

import torch
from huggingface_hub import snapshot_download
from safetensors.torch import safe_open
from safetensors.torch import save_file

from transformers import VoxtralConfig
26
+
27
+ # fmt: off
28
+ STATE_DICT_MAPPING = {
29
+ r"^language_model\.lm_head": r"output",
30
+ r"^language_model\.model\.norm": r"norm",
31
+ r"^language_model\.model\.embed_tokens": r"tok_embeddings",
32
+ r"^language_model\.model\.layers\.(\d+)\.input_layernorm": r"layers.\1.attention_norm",
33
+ r"^language_model\.model\.layers\.(\d+)\.post_attention_layernorm": r"layers.\1.ffn_norm",
34
+ r"^language_model\.model\.layers\.(\d+)\.self_attn\.(q|k|v|o)_proj": r"layers.\1.attention.w\2",
35
+ r"^language_model\.model\.layers\.(\d+)\.mlp\.gate_proj": r"layers.\1.feed_forward.w1",
36
+ r"^language_model\.model\.layers\.(\d+)\.mlp\.down_proj": r"layers.\1.feed_forward.w2",
37
+ r"^language_model\.model\.layers\.(\d+)\.mlp\.up_proj": r"layers.\1.feed_forward.w3",
38
+ r"language_model.model.embed_tokens": r"tok_embeddings",
39
+ r"audio_tower.conv1": r"mm_whisper_embeddings.whisper_encoder.conv_layers.0" ,
40
+ r"audio_tower.conv2": r"mm_whisper_embeddings.whisper_encoder.conv_layers.1" ,
41
+ r"audio_tower.layer_norm": r"mm_whisper_embeddings.whisper_encoder.transformer.norm" ,
42
+ r"audio_tower.layers.(\d+).self_attn.(q|k|v)_proj": r"mm_whisper_embeddings.whisper_encoder.transformer.layers.\1.attention.w\2" ,
43
+ r"audio_tower.layers.(\d+).self_attn.out_proj": r"mm_whisper_embeddings.whisper_encoder.transformer.layers.\1.attention.wo" ,
44
+ r"audio_tower.layers.(\d+).self_attn_layer_norm": r"mm_whisper_embeddings.whisper_encoder.transformer.layers.\1.attention_norm" ,
45
+ r"audio_tower.layers.(\d+).fc(\d+)": r"mm_whisper_embeddings.whisper_encoder.transformer.layers.\1.feed_forward.w\2" ,
46
+ r"audio_tower.layers.(\d+).final_layer_norm": r"mm_whisper_embeddings.whisper_encoder.transformer.layers.\1.ffn_norm" ,
47
+ r"multi_modal_projector.linear_1": r"mm_whisper_embeddings.audio_language_projection.0" ,
48
+ r"multi_modal_projector.linear_2": r"mm_whisper_embeddings.audio_language_projection.2" ,
49
+ }
50
+ # fmt: on
51
+
52
+ SKIP_KEYS = ["audio_tower.embed_positions.weight"]
53
+
54
+ def add_quantization_config(config, hf_config: VoxtralConfig):
55
+ quantization_config = hf_config.quantization_config
56
+ mistral_ignore = [] # keys to ignore in the quantization config
57
+ for hf_key in quantization_config["ignore"]:
58
+ mistral_key = map_hf_key_to_mistral(hf_key)
59
+ mistral_ignore.append(mistral_key)
60
+ quantization_config["ignore"] = mistral_ignore
61
+ config["quantization"] = quantization_config
62
+
63
+ return config
64
+
65
+ def map_hf_key_to_mistral(hf_key):
66
+ """Map a key from HF format to Mistral format"""
67
+ for pattern, replacement in STATE_DICT_MAPPING.items():
68
+ new_key, n_replace = re.subn(pattern, replacement, hf_key)
69
+ if n_replace > 0:
70
+ return new_key.replace("weight_scale", "qscale_weight")
71
+
72
+ # If no mapping found, return the original key
73
+ return hf_key.replace("weight_scale", "qscale_weight")
74
+
75
+
76
+ def permute_for_mistral_rope(tensor, n_heads, dim1, dim2):
77
+ """Reverse the ROPE permutation to get back to Mistral format."""
78
+ tensor = tensor.view(n_heads, 2, dim1 // n_heads // 2, dim2)
79
+ tensor = tensor.transpose(1, 2)
80
+ tensor = tensor.reshape(dim1, dim2)
81
+ return tensor
82
+
83
+
84
+ def convert_state_dict(hf_state_dict, config):
85
+ """Convert HF Voxtral state dict to Mistral format"""
86
+ mistral_dict = {}
87
+
88
+ num_attention_heads = config["n_heads"]
89
+ hidden_size = config["dim"]
90
+ head_dim = config["head_dim"]
91
+ num_key_value_heads = config["n_kv_heads"]
92
+ key_value_dim = head_dim * num_key_value_heads
93
+ query_dim = head_dim * num_attention_heads
94
+
95
+ for hf_key, tensor in hf_state_dict.items():
96
+ if hf_key in SKIP_KEYS:
97
+ continue
98
+
99
+ mistral_key = map_hf_key_to_mistral(hf_key)
100
+
101
+ if "language_model" in hf_key:
102
+ if hf_key.endswith("q_proj.weight"):
103
+ tensor = permute_for_mistral_rope(tensor, num_attention_heads, query_dim, hidden_size)
104
+ elif hf_key.endswith("q_proj.weight_scale") and tensor.size(0) == num_attention_heads:
105
+ tensor = permute_for_mistral_rope(tensor, num_attention_heads, query_dim, 1)
106
+ elif hf_key.endswith("k_proj.weight"):
107
+ tensor = permute_for_mistral_rope(tensor, num_key_value_heads, key_value_dim, hidden_size)
108
+ elif hf_key.endswith("k_proj.weight_scale") and tensor.size(0) == num_key_value_heads:
109
+ tensor = permute_for_mistral_rope(tensor, num_key_value_heads, key_value_dim, 1)
110
+
111
+ mistral_dict[mistral_key] = tensor
112
+
113
+ return mistral_dict
114
+
115
+
116
+ def write_model(
117
+ input_path_or_repo,
118
+ output_dir,
119
+ unquantized_model_path=None,
120
+ ):
121
+ print("Converting HF Voxtral model to Mistral format.")
122
+ os.makedirs(output_dir, exist_ok=True)
123
+
124
+ # Load the HF Voxtral model
125
+ print(f"Loading HF Voxtral model from {input_path_or_repo}...")
126
+ hf_config = VoxtralConfig.from_pretrained(input_path_or_repo)
127
+
128
+ if os.path.exists(input_path_or_repo):
129
+ local_path = input_path_or_repo
130
+ else:
131
+ local_path = snapshot_download(input_path_or_repo)
132
+
133
+ # Convert config
134
+ if unquantized_model_path is not None:
135
+ if os.path.exists(unquantized_model_path):
136
+ unquantized_model_path = unquantized_model_path
137
+ else:
138
+ unquantized_model_path = snapshot_download(unquantized_model_path)
139
+ config_path = os.path.join(unquantized_model_path, "params.json")
140
+ with open(config_path, "r") as f:
141
+ config = json.load(f)
142
+ config = add_quantization_config(config, hf_config)
143
+
144
+ with open(os.path.join(output_dir, "params.json"), "w") as f:
145
+ json.dump(config, f, indent=2)
146
+ else:
147
+ raise ValueError(f"Unquantized model config not found for {unquantized_model_path}")
148
+
149
+ # Convert state dict
150
+ print("Converting state dict...")
151
+ tensor_files = sorted([f for f in os.listdir(os.path.join(local_path)) if f.endswith(".safetensors")])
152
+
153
+ hf_state_dict = {}
154
+
155
+ for file in tensor_files:
156
+ file_path = os.path.join(local_path, file)
157
+ with safe_open(file_path, framework="pt", device="cuda") as f:
158
+ for key in f.keys():
159
+ hf_state_dict[key] = f.get_tensor(key)
160
+
161
+ mistral_state_dict = convert_state_dict(hf_state_dict, config)
162
+
163
+ # save the state dict
164
+ save_file(mistral_state_dict, os.path.join(output_dir, "consolidated.safetensors"))
165
+
166
+ del hf_state_dict, mistral_state_dict
167
+ gc.collect()
168
+ print("Model converted successfully.")
169
+
170
+ def write_tokenizer(input_path_or_repo: str, output_dir: str):
171
+ """Extract and save the tokenizer from Voxtral model"""
172
+ from transformers import MistralCommonTokenizer
173
+
174
+ print("Extracting tokenizer...")
175
+ tokenizer = MistralCommonTokenizer.from_pretrained(input_path_or_repo)
176
+ tokenizer.save_pretrained(output_dir)
177
+ print("Tokenizer saved successfully.")
178
+
179
+
180
+ def main():
181
+ parser = argparse.ArgumentParser(description="Convert HF Voxtral weights to Mistral format")
182
+ parser.add_argument(
183
+ "--input_path_or_repo",
184
+ type=str,
185
+ default="RedHatAI/Voxtral-Small-24B-2507-FP8-dynamic",
186
+ help="Path or repo containing HF Voxtral model",
187
+ )
188
+ parser.add_argument(
189
+ "--output_dir",
190
+ type=str,
191
+ default="Voxtral-Small-24B-2507-FP8-dynamic-converted",
192
+ help="Location to write Mistral model and tokenizer",
193
+ )
194
+ parser.add_argument(
195
+ "--skip_tokenizer",
196
+ action="store_true",
197
+ help="Skip tokenizer conversion"
198
+ )
199
+ parser.add_argument(
200
+ "--unquantized_model_path",
201
+ type=str,
202
+ default="mistralai/Voxtral-Small-24B-2507",
203
+ help="Path to the unquantized model",
204
+ )
205
+ args = parser.parse_args()
206
+
207
+ write_model(
208
+ args.input_path_or_repo,
209
+ args.output_dir,
210
+ unquantized_model_path=args.unquantized_model_path,
211
+ )
212
+
213
+ if not args.skip_tokenizer:
214
+ write_tokenizer(
215
+ args.input_path_or_repo,
216
+ args.output_dir,
217
+ )
218
+
219
+
220
+ if __name__ == "__main__":
221
+ main()
params.json ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dim": 5120,
3
+ "n_layers": 40,
4
+ "head_dim": 128,
5
+ "hidden_dim": 32768,
6
+ "n_heads": 32,
7
+ "n_kv_heads": 8,
8
+ "rope_theta": 100000000.0,
9
+ "norm_eps": 1e-05,
10
+ "vocab_size": 131072,
11
+ "max_position_embeddings": 32768,
12
+ "multimodal": {
13
+ "whisper_model_args": {
14
+ "encoder_args": {
15
+ "dim": 1280,
16
+ "n_layers": 32,
17
+ "head_dim": 64,
18
+ "hidden_dim": 5120,
19
+ "n_heads": 20,
20
+ "vocab_size": 51866,
21
+ "max_source_positions": 1500,
22
+ "audio_encoding_args": {
23
+ "sampling_rate": 16000,
24
+ "num_mel_bins": 128,
25
+ "hop_length": 160,
26
+ "window_size": 400
27
+ }
28
+ },
29
+ "downsample_args": {
30
+ "downsample_factor": 4
31
+ }
32
+ }
33
+ },
34
+ "quantization": {
35
+ "config_groups": {
36
+ "group_0": {
37
+ "format": "float-quantized",
38
+ "input_activations": {
39
+ "actorder": null,
40
+ "block_structure": null,
41
+ "dynamic": true,
42
+ "group_size": null,
43
+ "num_bits": 8,
44
+ "observer": null,
45
+ "observer_kwargs": {},
46
+ "strategy": "token",
47
+ "symmetric": true,
48
+ "type": "float"
49
+ },
50
+ "output_activations": null,
51
+ "targets": [
52
+ "Linear"
53
+ ],
54
+ "weights": {
55
+ "actorder": null,
56
+ "block_structure": null,
57
+ "dynamic": false,
58
+ "group_size": null,
59
+ "num_bits": 8,
60
+ "observer": "mse",
61
+ "observer_kwargs": {},
62
+ "strategy": "channel",
63
+ "symmetric": true,
64
+ "type": "float"
65
+ }
66
+ }
67
+ },
68
+ "format": "float-quantized",
69
+ "global_compression_ratio": null,
70
+ "ignore": [
71
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.0.attention.wk",
72
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.0.attention.wv",
73
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.0.attention.wq",
74
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.0.attention.wo",
75
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.0.feed_forward.w1",
76
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.0.feed_forward.w2",
77
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.1.attention.wk",
78
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.1.attention.wv",
79
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.1.attention.wq",
80
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.1.attention.wo",
81
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.1.feed_forward.w1",
82
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.1.feed_forward.w2",
83
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.2.attention.wk",
84
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.2.attention.wv",
85
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.2.attention.wq",
86
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.2.attention.wo",
87
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.2.feed_forward.w1",
88
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.2.feed_forward.w2",
89
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.3.attention.wk",
90
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.3.attention.wv",
91
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.3.attention.wq",
92
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.3.attention.wo",
93
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.3.feed_forward.w1",
94
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.3.feed_forward.w2",
95
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.4.attention.wk",
96
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.4.attention.wv",
97
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.4.attention.wq",
98
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.4.attention.wo",
99
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.4.feed_forward.w1",
100
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.4.feed_forward.w2",
101
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.5.attention.wk",
102
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.5.attention.wv",
103
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.5.attention.wq",
104
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.5.attention.wo",
105
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.5.feed_forward.w1",
106
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.5.feed_forward.w2",
107
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.6.attention.wk",
108
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.6.attention.wv",
109
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.6.attention.wq",
110
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.6.attention.wo",
111
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.6.feed_forward.w1",
112
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.6.feed_forward.w2",
113
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.7.attention.wk",
114
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.7.attention.wv",
115
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.7.attention.wq",
116
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.7.attention.wo",
117
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.7.feed_forward.w1",
118
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.7.feed_forward.w2",
119
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.8.attention.wk",
120
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.8.attention.wv",
121
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.8.attention.wq",
122
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.8.attention.wo",
123
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.8.feed_forward.w1",
124
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.8.feed_forward.w2",
125
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.9.attention.wk",
126
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.9.attention.wv",
127
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.9.attention.wq",
128
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.9.attention.wo",
129
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.9.feed_forward.w1",
130
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.9.feed_forward.w2",
131
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.10.attention.wk",
132
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.10.attention.wv",
133
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.10.attention.wq",
134
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.10.attention.wo",
135
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.10.feed_forward.w1",
136
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.10.feed_forward.w2",
137
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.11.attention.wk",
138
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.11.attention.wv",
139
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.11.attention.wq",
140
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.11.attention.wo",
141
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.11.feed_forward.w1",
142
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.11.feed_forward.w2",
143
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.12.attention.wk",
144
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.12.attention.wv",
145
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.12.attention.wq",
146
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.12.attention.wo",
147
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.12.feed_forward.w1",
148
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.12.feed_forward.w2",
149
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.13.attention.wk",
150
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.13.attention.wv",
151
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.13.attention.wq",
152
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.13.attention.wo",
153
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.13.feed_forward.w1",
154
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.13.feed_forward.w2",
155
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.14.attention.wk",
156
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.14.attention.wv",
157
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.14.attention.wq",
158
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.14.attention.wo",
159
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.14.feed_forward.w1",
160
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.14.feed_forward.w2",
161
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.15.attention.wk",
162
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.15.attention.wv",
163
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.15.attention.wq",
164
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.15.attention.wo",
165
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.15.feed_forward.w1",
166
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.15.feed_forward.w2",
167
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.16.attention.wk",
168
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.16.attention.wv",
169
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.16.attention.wq",
170
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.16.attention.wo",
171
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.16.feed_forward.w1",
172
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.16.feed_forward.w2",
173
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.17.attention.wk",
174
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.17.attention.wv",
175
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.17.attention.wq",
176
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.17.attention.wo",
177
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.17.feed_forward.w1",
178
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.17.feed_forward.w2",
179
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.18.attention.wk",
180
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.18.attention.wv",
181
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.18.attention.wq",
182
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.18.attention.wo",
183
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.18.feed_forward.w1",
184
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.18.feed_forward.w2",
185
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.19.attention.wk",
186
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.19.attention.wv",
187
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.19.attention.wq",
188
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.19.attention.wo",
189
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.19.feed_forward.w1",
190
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.19.feed_forward.w2",
191
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.20.attention.wk",
192
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.20.attention.wv",
193
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.20.attention.wq",
194
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.20.attention.wo",
195
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.20.feed_forward.w1",
196
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.20.feed_forward.w2",
197
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.21.attention.wk",
198
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.21.attention.wv",
199
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.21.attention.wq",
200
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.21.attention.wo",
201
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.21.feed_forward.w1",
202
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.21.feed_forward.w2",
203
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.22.attention.wk",
204
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.22.attention.wv",
205
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.22.attention.wq",
206
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.22.attention.wo",
207
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.22.feed_forward.w1",
208
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.22.feed_forward.w2",
209
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.23.attention.wk",
210
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.23.attention.wv",
211
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.23.attention.wq",
212
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.23.attention.wo",
213
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.23.feed_forward.w1",
214
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.23.feed_forward.w2",
215
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.24.attention.wk",
216
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.24.attention.wv",
217
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.24.attention.wq",
218
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.24.attention.wo",
219
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.24.feed_forward.w1",
220
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.24.feed_forward.w2",
221
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.25.attention.wk",
222
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.25.attention.wv",
223
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.25.attention.wq",
224
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.25.attention.wo",
225
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.25.feed_forward.w1",
226
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.25.feed_forward.w2",
227
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.26.attention.wk",
228
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.26.attention.wv",
229
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.26.attention.wq",
230
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.26.attention.wo",
231
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.26.feed_forward.w1",
232
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.26.feed_forward.w2",
233
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.27.attention.wk",
234
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.27.attention.wv",
235
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.27.attention.wq",
236
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.27.attention.wo",
237
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.27.feed_forward.w1",
238
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.27.feed_forward.w2",
239
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.28.attention.wk",
240
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.28.attention.wv",
241
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.28.attention.wq",
242
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.28.attention.wo",
243
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.28.feed_forward.w1",
244
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.28.feed_forward.w2",
245
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.29.attention.wk",
246
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.29.attention.wv",
247
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.29.attention.wq",
248
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.29.attention.wo",
249
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.29.feed_forward.w1",
250
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.29.feed_forward.w2",
251
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.30.attention.wk",
252
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.30.attention.wv",
253
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.30.attention.wq",
254
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.30.attention.wo",
255
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.30.feed_forward.w1",
256
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.30.feed_forward.w2",
257
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.31.attention.wk",
258
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.31.attention.wv",
259
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.31.attention.wq",
260
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.31.attention.wo",
261
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.31.feed_forward.w1",
262
+ "mm_whisper_embeddings.whisper_encoder.transformer.layers.31.feed_forward.w2",
263
+ "layers.0.attention.wq",
264
+ "layers.0.attention.wk",
265
+ "layers.0.attention.wv",
266
+ "layers.0.attention.wo",
267
+ "layers.1.attention.wq",
268
+ "layers.1.attention.wk",
269
+ "layers.1.attention.wv",
270
+ "layers.1.attention.wo",
271
+ "layers.2.attention.wq",
272
+ "layers.2.attention.wk",
273
+ "layers.2.attention.wv",
274
+ "layers.2.attention.wo",
275
+ "layers.3.attention.wq",
276
+ "layers.3.attention.wk",
277
+ "layers.3.attention.wv",
278
+ "layers.3.attention.wo",
279
+ "layers.4.attention.wq",
280
+ "layers.4.attention.wk",
281
+ "layers.4.attention.wv",
282
+ "layers.4.attention.wo",
283
+ "layers.5.attention.wq",
284
+ "layers.5.attention.wk",
285
+ "layers.5.attention.wv",
286
+ "layers.5.attention.wo",
287
+ "layers.6.attention.wq",
288
+ "layers.6.attention.wk",
289
+ "layers.6.attention.wv",
290
+ "layers.6.attention.wo",
291
+ "layers.7.attention.wq",
292
+ "layers.7.attention.wk",
293
+ "layers.7.attention.wv",
294
+ "layers.7.attention.wo",
295
+ "layers.8.attention.wq",
296
+ "layers.8.attention.wk",
297
+ "layers.8.attention.wv",
298
+ "layers.8.attention.wo",
299
+ "layers.9.attention.wq",
300
+ "layers.9.attention.wk",
301
+ "layers.9.attention.wv",
302
+ "layers.9.attention.wo",
303
+ "layers.10.attention.wq",
304
+ "layers.10.attention.wk",
305
+ "layers.10.attention.wv",
306
+ "layers.10.attention.wo",
307
+ "layers.11.attention.wq",
308
+ "layers.11.attention.wk",
309
+ "layers.11.attention.wv",
310
+ "layers.11.attention.wo",
311
+ "layers.12.attention.wq",
312
+ "layers.12.attention.wk",
313
+ "layers.12.attention.wv",
314
+ "layers.12.attention.wo",
315
+ "layers.13.attention.wq",
316
+ "layers.13.attention.wk",
317
+ "layers.13.attention.wv",
318
+ "layers.13.attention.wo",
319
+ "layers.14.attention.wq",
320
+ "layers.14.attention.wk",
321
+ "layers.14.attention.wv",
322
+ "layers.14.attention.wo",
323
+ "layers.15.attention.wq",
324
+ "layers.15.attention.wk",
325
+ "layers.15.attention.wv",
326
+ "layers.15.attention.wo",
327
+ "layers.16.attention.wq",
328
+ "layers.16.attention.wk",
329
+ "layers.16.attention.wv",
330
+ "layers.16.attention.wo",
331
+ "layers.17.attention.wq",
332
+ "layers.17.attention.wk",
333
+ "layers.17.attention.wv",
334
+ "layers.17.attention.wo",
335
+ "layers.18.attention.wq",
336
+ "layers.18.attention.wk",
337
+ "layers.18.attention.wv",
338
+ "layers.18.attention.wo",
339
+ "layers.19.attention.wq",
340
+ "layers.19.attention.wk",
341
+ "layers.19.attention.wv",
342
+ "layers.19.attention.wo",
343
+ "layers.20.attention.wq",
344
+ "layers.20.attention.wk",
345
+ "layers.20.attention.wv",
346
+ "layers.20.attention.wo",
347
+ "layers.21.attention.wq",
348
+ "layers.21.attention.wk",
349
+ "layers.21.attention.wv",
350
+ "layers.21.attention.wo",
351
+ "layers.22.attention.wq",
352
+ "layers.22.attention.wk",
353
+ "layers.22.attention.wv",
354
+ "layers.22.attention.wo",
355
+ "layers.23.attention.wq",
356
+ "layers.23.attention.wk",
357
+ "layers.23.attention.wv",
358
+ "layers.23.attention.wo",
359
+ "layers.24.attention.wq",
360
+ "layers.24.attention.wk",
361
+ "layers.24.attention.wv",
362
+ "layers.24.attention.wo",
363
+ "layers.25.attention.wq",
364
+ "layers.25.attention.wk",
365
+ "layers.25.attention.wv",
366
+ "layers.25.attention.wo",
367
+ "layers.26.attention.wq",
368
+ "layers.26.attention.wk",
369
+ "layers.26.attention.wv",
370
+ "layers.26.attention.wo",
371
+ "layers.27.attention.wq",
372
+ "layers.27.attention.wk",
373
+ "layers.27.attention.wv",
374
+ "layers.27.attention.wo",
375
+ "layers.28.attention.wq",
376
+ "layers.28.attention.wk",
377
+ "layers.28.attention.wv",
378
+ "layers.28.attention.wo",
379
+ "layers.29.attention.wq",
380
+ "layers.29.attention.wk",
381
+ "layers.29.attention.wv",
382
+ "layers.29.attention.wo",
383
+ "layers.30.attention.wq",
384
+ "layers.30.attention.wk",
385
+ "layers.30.attention.wv",
386
+ "layers.30.attention.wo",
387
+ "layers.31.attention.wq",
388
+ "layers.31.attention.wk",
389
+ "layers.31.attention.wv",
390
+ "layers.31.attention.wo",
391
+ "layers.32.attention.wq",
392
+ "layers.32.attention.wk",
393
+ "layers.32.attention.wv",
394
+ "layers.32.attention.wo",
395
+ "layers.33.attention.wq",
396
+ "layers.33.attention.wk",
397
+ "layers.33.attention.wv",
398
+ "layers.33.attention.wo",
399
+ "layers.34.attention.wq",
400
+ "layers.34.attention.wk",
401
+ "layers.34.attention.wv",
402
+ "layers.34.attention.wo",
403
+ "layers.35.attention.wq",
404
+ "layers.35.attention.wk",
405
+ "layers.35.attention.wv",
406
+ "layers.35.attention.wo",
407
+ "layers.36.attention.wq",
408
+ "layers.36.attention.wk",
409
+ "layers.36.attention.wv",
410
+ "layers.36.attention.wo",
411
+ "layers.37.attention.wq",
412
+ "layers.37.attention.wk",
413
+ "layers.37.attention.wv",
414
+ "layers.37.attention.wo",
415
+ "layers.38.attention.wq",
416
+ "layers.38.attention.wk",
417
+ "layers.38.attention.wv",
418
+ "layers.38.attention.wo",
419
+ "layers.39.attention.wq",
420
+ "layers.39.attention.wk",
421
+ "layers.39.attention.wv",
422
+ "layers.39.attention.wo",
423
+ "output",
424
+ "mm_whisper_embeddings.audio_language_projection.0",
425
+ "mm_whisper_embeddings.audio_language_projection.2"
426
+ ],
427
+ "kv_cache_scheme": null,
428
+ "quant_method": "compressed-tensors",
429
+ "quantization_status": "compressed",
430
+ "sparsity_config": {},
431
+ "transform_config": {},
432
+ "version": "0.11.1.a20250923"
433
+ }
434
+ }
tekken.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4aaf3836c2a5332f029ce85a7a62255c966f47b6797ef81dedd0ade9c862e4a8
3
+ size 14894206