Add files using upload-large-folder tool
Browse files- README.md +4 -31
- config.json +2 -3
- configuration_deepseek.py +3 -3
README.md
CHANGED
|
@@ -13,7 +13,7 @@ tags:
|
|
| 13 |
|
| 14 |
Meet Keye-VL-671B-A37B — the most powerful multi-modal language model in the Keye series to date.
|
| 15 |
|
| 16 |
-
As one of the largest and most capable MLLMs currently in existence, Keye-VL
|
| 17 |
|
| 18 |
#### Key Enhancements:
|
| 19 |
|
|
@@ -32,36 +32,9 @@ As one of the largest and most capable MLLMs currently in existence, Keye-VL 671
|
|
| 32 |
|
| 33 |
## Model Performance
|
| 34 |
|
| 35 |
-
 | **79.0** |
|
| 54 |
-
| | MMVU | 70.1 | - | 78.4 (fp8) | **86.6** |
|
| 55 |
-
| | TempCompass | **83.7** | - | 81.03 (fp8) | 77.75 |
|
| 56 |
-
| Text Recog./Doc/chart | TextVQA | **81.8** | - | - | 76.21 |
|
| 57 |
-
| | DocVQA_VAL | **96.9** | 96.52 | 96.5 | 95.39 |
|
| 58 |
-
| | ChartQA_TEST | **89.1** | 87.68 | - | 86.68 |
|
| 59 |
-
| | InfoVQA | **91.2** | - | 89.5 | 86.93 |
|
| 60 |
-
| | CharXiv (RQ) | 60.2 | 64.4 | 66.1 | **79.4** |
|
| 61 |
-
| | CharXiv (DQ) | 92.6 | 92.1 | - | **94.5** |
|
| 62 |
-
| | AI2D_TEST | 87.3 | 88.37 | 89.2 | **91.19** |
|
| 63 |
-
| Pure Text | AIME2025 | - | 85.83 | **89.7** | 83.3 |
|
| 64 |
-
| | GPQA | - | **72.78** | - | 71.21 |
|
| 65 |
|
| 66 |
## Quickstart
|
| 67 |
|
|
|
|
| 13 |
|
| 14 |
Meet Keye-VL-671B-A37B — the most powerful multi-modal language model in the Keye series to date.
|
| 15 |
|
| 16 |
+
As one of the largest and most capable MLLMs currently in existence, Keye-VL-671B-A37B demonstrates top-tier, and in some cases even leading, performance in text understanding and generation, complex visual perception and reasoning, comprehensive video understanding, and Olympiad-level mathematical reasoning.
|
| 17 |
|
| 18 |
#### Key Enhancements:
|
| 19 |
|
|
|
|
| 32 |
|
| 33 |
## Model Performance
|
| 34 |
|
| 35 |
+

|
| 36 |
+
|
| 37 |
+

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
## Quickstart
|
| 40 |
|
config.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
"attention_bias": false,
|
| 6 |
"attention_dropout": 0.0,
|
| 7 |
"auto_map": {
|
| 8 |
-
"AutoConfig": "configuration_deepseek.
|
| 9 |
},
|
| 10 |
"bos_token_id": 0,
|
| 11 |
"dtype": "bfloat16",
|
|
@@ -244,8 +244,7 @@
|
|
| 244 |
],
|
| 245 |
"attention_dropout": 0.0,
|
| 246 |
"auto_map": {
|
| 247 |
-
"AutoConfig": "configuration_deepseek.KeyeVisionConfig"
|
| 248 |
-
"AutoModel": "modeling_deepseek.SiglipVisionModel"
|
| 249 |
},
|
| 250 |
"has_learnable_position_embedding": true,
|
| 251 |
"hidden_act": "gelu_pytorch_tanh",
|
|
|
|
| 5 |
"attention_bias": false,
|
| 6 |
"attention_dropout": 0.0,
|
| 7 |
"auto_map": {
|
| 8 |
+
"AutoConfig": "configuration_deepseek.KeyeVLMoeConfig"
|
| 9 |
},
|
| 10 |
"bos_token_id": 0,
|
| 11 |
"dtype": "bfloat16",
|
|
|
|
| 244 |
],
|
| 245 |
"attention_dropout": 0.0,
|
| 246 |
"auto_map": {
|
| 247 |
+
"AutoConfig": "configuration_deepseek.KeyeVisionConfig"
|
|
|
|
| 248 |
},
|
| 249 |
"has_learnable_position_embedding": true,
|
| 250 |
"hidden_act": "gelu_pytorch_tanh",
|
configuration_deepseek.py
CHANGED
|
@@ -60,7 +60,7 @@ class KeyeVisionConfig(PretrainedConfig):
|
|
| 60 |
self.tokens_per_second = tokens_per_second
|
| 61 |
|
| 62 |
|
| 63 |
-
class
|
| 64 |
r"""
|
| 65 |
This is the configuration class to store the configuration of a [`KeyeModel`]. It is used to instantiate a
|
| 66 |
KeyeVLMoeForConditionalGeneration model according to the specified arguments, defining the model architecture.
|
|
@@ -164,7 +164,7 @@ class DeepseekR1Config(PretrainedConfig):
|
|
| 164 |
>>> configuration = model.config
|
| 165 |
```"""
|
| 166 |
|
| 167 |
-
model_type = "
|
| 168 |
sub_configs = {"vision_config": KeyeVisionConfig}
|
| 169 |
keys_to_ignore_at_inference = ["past_key_values"]
|
| 170 |
|
|
@@ -263,4 +263,4 @@ class DeepseekR1Config(PretrainedConfig):
|
|
| 263 |
**kwargs,
|
| 264 |
)
|
| 265 |
|
| 266 |
-
__all__ = ["
|
|
|
|
| 60 |
self.tokens_per_second = tokens_per_second
|
| 61 |
|
| 62 |
|
| 63 |
+
class KeyeVLMoeConfig(PretrainedConfig):
|
| 64 |
r"""
|
| 65 |
This is the configuration class to store the configuration of a [`KeyeModel`]. It is used to instantiate a
|
| 66 |
KeyeVLMoeForConditionalGeneration model according to the specified arguments, defining the model architecture.
|
|
|
|
| 164 |
>>> configuration = model.config
|
| 165 |
```"""
|
| 166 |
|
| 167 |
+
model_type = "KeyeVL"
|
| 168 |
sub_configs = {"vision_config": KeyeVisionConfig}
|
| 169 |
keys_to_ignore_at_inference = ["past_key_values"]
|
| 170 |
|
|
|
|
| 263 |
**kwargs,
|
| 264 |
)
|
| 265 |
|
| 266 |
+
__all__ = ["KeyeVLMoeConfig"]
|