rkazants committed on
Commit
d2ca102
·
verified ·
1 Parent(s): 42f204f

Upload 10 files

Browse files
config.json CHANGED
@@ -1,35 +1,27 @@
1
  {
 
2
  "architectures": [
3
  "MiniCPMO"
4
  ],
5
  "attention_dropout": 0.0,
6
  "audio_chunk_length": 1.0,
7
  "audio_config": {
8
- "_attn_implementation_autoset": true,
9
  "_name_or_path": "openai/whisper-medium",
10
- "activation_dropout": 0.0,
11
- "activation_function": "gelu",
12
- "apply_spec_augment": false,
13
  "architectures": [
14
  "MiniCPMWhisperEncoder"
15
  ],
16
- "attention_dropout": 0.0,
17
  "begin_suppress_tokens": [
18
  220,
19
  50257
20
  ],
21
  "bos_token_id": 50257,
22
- "classifier_proj_size": 256,
23
  "d_model": 1024,
24
  "decoder_attention_heads": 16,
25
  "decoder_ffn_dim": 1024,
26
- "decoder_layerdrop": 0.0,
27
  "decoder_layers": 1,
28
  "decoder_start_token_id": 50258,
29
- "dropout": 0.0,
30
  "encoder_attention_heads": 16,
31
  "encoder_ffn_dim": 4096,
32
- "encoder_layerdrop": 0.0,
33
  "encoder_layers": 1,
34
  "eos_token_id": 50257,
35
  "forced_decoder_ids": [
@@ -46,22 +38,10 @@
46
  50363
47
  ]
48
  ],
49
- "init_std": 0.02,
50
- "mask_feature_length": 10,
51
- "mask_feature_min_masks": 0,
52
- "mask_feature_prob": 0.0,
53
- "mask_time_length": 10,
54
- "mask_time_min_masks": 2,
55
- "mask_time_prob": 0.05,
56
  "max_length": 448,
57
- "max_source_positions": 1500,
58
- "max_target_positions": 448,
59
- "median_filter_width": 7,
60
  "model_type": "whisper",
61
  "num_hidden_layers": 24,
62
- "num_mel_bins": 80,
63
  "pad_token_id": 50257,
64
- "scale_embedding": false,
65
  "suppress_tokens": [
66
  1,
67
  2,
@@ -152,10 +132,7 @@
152
  50361,
153
  50362
154
  ],
155
- "torch_dtype": "float32",
156
- "use_cache": true,
157
- "use_weighted_layer_sum": false,
158
- "vocab_size": 51865
159
  },
160
  "audio_pool_step": 2,
161
  "auto_map": {
@@ -169,7 +146,7 @@
169
  "drop_vision_last_layer": false,
170
  "eos_token_id": 151645,
171
  "hidden_act": "silu",
172
- "hidden_size": 128,
173
  "image_size": 448,
174
  "init_audio": true,
175
  "init_tts": true,
@@ -191,25 +168,18 @@
191
  "rope_theta": 1000000.0,
192
  "slice_config": {
193
  "max_slice_nums": 9,
194
- "model_type": "minicpmv",
195
- "patch_size": 14,
196
- "scale_resolution": 448
197
  },
198
  "slice_mode": true,
199
- "sliding_window": 131072,
200
  "stream_input": false,
201
  "tie_word_embeddings": false,
202
  "torch_dtype": "bfloat16",
203
- "transformers_version": "4.50.0",
204
  "tts_config": {
205
- "_attn_implementation_autoset": true,
206
- "attn_implementation": "sdpa",
207
- "audio_bos_token_id": 21132,
208
- "aug_loss_weight": true,
209
  "hidden_size": 8,
210
  "intermediate_size": 4,
211
  "llm_dim": 4,
212
- "max_position_embeddings": 4096,
213
  "model_type": "conditional_chattts",
214
  "num_attention_heads": 1,
215
  "num_audio_tokens": 10,
@@ -217,19 +187,7 @@
217
  "num_hidden_layers": 1,
218
  "num_layers": 1,
219
  "num_mel_bins": 10,
220
- "num_spk_embs": 1,
221
- "num_text_tokens": 20,
222
- "num_vq": 4,
223
- "spk_emb_token_id": 21143,
224
- "streaming": true,
225
- "streaming_audio_chunk_size": 50,
226
- "streaming_text_chunk_size": 10,
227
- "streaming_text_reserved_len": 300,
228
- "text_eos_token_id": 21133,
229
- "use_llm_hidden_state": false,
230
- "use_mlp": true,
231
- "use_speaker_embedding": true,
232
- "use_text": true
233
  },
234
  "use_cache": true,
235
  "use_image_id": true,
@@ -237,16 +195,11 @@
237
  "version": 2.6,
238
  "vision_batch_size": 16,
239
  "vision_config": {
240
- "_attn_implementation_autoset": true,
241
- "attention_dropout": 0.0,
242
- "hidden_act": "gelu_pytorch_tanh",
243
  "hidden_size": 8,
244
  "image_size": 980,
245
  "intermediate_size": 8,
246
- "layer_norm_eps": 1e-06,
247
  "model_type": "siglip_vision_model",
248
  "num_attention_heads": 1,
249
- "num_channels": 3,
250
  "num_hidden_layers": 1,
251
  "patch_size": 14
252
  },
 
1
  {
2
+ "_name_or_path": "openbmb/MiniCPM-o-2_6",
3
  "architectures": [
4
  "MiniCPMO"
5
  ],
6
  "attention_dropout": 0.0,
7
  "audio_chunk_length": 1.0,
8
  "audio_config": {
 
9
  "_name_or_path": "openai/whisper-medium",
 
 
 
10
  "architectures": [
11
  "MiniCPMWhisperEncoder"
12
  ],
 
13
  "begin_suppress_tokens": [
14
  220,
15
  50257
16
  ],
17
  "bos_token_id": 50257,
 
18
  "d_model": 1024,
19
  "decoder_attention_heads": 16,
20
  "decoder_ffn_dim": 1024,
 
21
  "decoder_layers": 1,
22
  "decoder_start_token_id": 50258,
 
23
  "encoder_attention_heads": 16,
24
  "encoder_ffn_dim": 4096,
 
25
  "encoder_layers": 1,
26
  "eos_token_id": 50257,
27
  "forced_decoder_ids": [
 
38
  50363
39
  ]
40
  ],
 
 
 
 
 
 
 
41
  "max_length": 448,
 
 
 
42
  "model_type": "whisper",
43
  "num_hidden_layers": 24,
 
44
  "pad_token_id": 50257,
 
45
  "suppress_tokens": [
46
  1,
47
  2,
 
132
  50361,
133
  50362
134
  ],
135
+ "torch_dtype": "float32"
 
 
 
136
  },
137
  "audio_pool_step": 2,
138
  "auto_map": {
 
146
  "drop_vision_last_layer": false,
147
  "eos_token_id": 151645,
148
  "hidden_act": "silu",
149
+ "hidden_size": 168,
150
  "image_size": 448,
151
  "init_audio": true,
152
  "init_tts": true,
 
168
  "rope_theta": 1000000.0,
169
  "slice_config": {
170
  "max_slice_nums": 9,
171
+ "model_type": "minicpmv"
 
 
172
  },
173
  "slice_mode": true,
174
+ "sliding_window": null,
175
  "stream_input": false,
176
  "tie_word_embeddings": false,
177
  "torch_dtype": "bfloat16",
178
+ "transformers_version": "4.45.0",
179
  "tts_config": {
 
 
 
 
180
  "hidden_size": 8,
181
  "intermediate_size": 4,
182
  "llm_dim": 4,
 
183
  "model_type": "conditional_chattts",
184
  "num_attention_heads": 1,
185
  "num_audio_tokens": 10,
 
187
  "num_hidden_layers": 1,
188
  "num_layers": 1,
189
  "num_mel_bins": 10,
190
+ "num_text_tokens": 20
 
 
 
 
 
 
 
 
 
 
 
 
191
  },
192
  "use_cache": true,
193
  "use_image_id": true,
 
195
  "version": 2.6,
196
  "vision_batch_size": 16,
197
  "vision_config": {
 
 
 
198
  "hidden_size": 8,
199
  "image_size": 980,
200
  "intermediate_size": 8,
 
201
  "model_type": "siglip_vision_model",
202
  "num_attention_heads": 1,
 
203
  "num_hidden_layers": 1,
204
  "patch_size": 14
205
  },
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 151643,
4
  "eos_token_id": 151645,
5
- "transformers_version": "4.50.0"
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 151643,
4
  "eos_token_id": 151645,
5
+ "transformers_version": "4.45.0"
6
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2018dc34b16742d14676c91397c965320a65715d8f16bf1c03d48417052c0e73
3
- size 143987592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7263b402a96591c569978646e693a05384f6662c39a3bf4f94bbbeb74f4fc156
3
+ size 168558024
preprocessor_config.json CHANGED
@@ -4,7 +4,6 @@
4
  "AutoProcessor": "openbmb/MiniCPM-o-2_6--processing_minicpmo.MiniCPMOProcessor"
5
  },
6
  "chunk_length": 30,
7
- "dither": 0.0,
8
  "feature_extractor_type": "WhisperFeatureExtractor",
9
  "feature_size": 80,
10
  "hop_length": 160,
 
4
  "AutoProcessor": "openbmb/MiniCPM-o-2_6--processing_minicpmo.MiniCPMOProcessor"
5
  },
6
  "chunk_length": 30,
 
7
  "feature_extractor_type": "WhisperFeatureExtractor",
8
  "feature_size": 80,
9
  "hop_length": 160,
tokenizer_config.json CHANGED
@@ -515,7 +515,6 @@
515
  "clean_up_tokenization_spaces": false,
516
  "eos_token": "<|im_end|>",
517
  "errors": "replace",
518
- "extra_special_tokens": {},
519
  "model_max_length": 131072,
520
  "pad_token": "<|endoftext|>",
521
  "processor_class": "MiniCPMOProcessor",
 
515
  "clean_up_tokenization_spaces": false,
516
  "eos_token": "<|im_end|>",
517
  "errors": "replace",
 
518
  "model_max_length": 131072,
519
  "pad_token": "<|endoftext|>",
520
  "processor_class": "MiniCPMOProcessor",