Kwai-Keye committed on
Commit
e1152af
·
verified ·
1 Parent(s): dd7c51a

Add files using upload-large-folder tool

Browse files
Files changed (3) hide show
  1. README.md +4 -31
  2. config.json +2 -3
  3. configuration_deepseek.py +3 -3
README.md CHANGED
@@ -13,7 +13,7 @@ tags:
13
 
14
  Meet Keye-VL-671B-A37B — the most powerful multi-modal language model in the Keye series to date.
15
 
16
- As one of the largest and most capable MLLMs currently in existence, Keye-VL 671B demonstrates achieved top-tier and in some cases even leading performance in text understanding and generation, complex visual perception and reasoning, comprehensive video understanding, and Olympic-level mathematical reasoning.
17
 
18
  #### Key Enhancements:
19
 
@@ -32,36 +32,9 @@ As one of the largest and most capable MLLMs currently in existence, Keye-VL 671
32
 
33
  ## Model Performance
34
 
35
- ![Performance Comparison](figures/radar.png)
36
-
37
- | | Benchmarks | Seed1.5-VL thinking | dots.vlm1 | Qwen3-VL-235B-A22B thinking | Keye-VL-671B-A37B |
38
- | --------------------- | -------------- | :-----------------: | :-------: | :-------------------------: | :-------------------: |
39
- | STEM/Reasoning | MMMU_VAL | 77.9 | 80.11 | 80.6 | **83.78** |
40
- | | MMMU_Pro | 67.6 | 70.11 | 69.3 | **72.49** |
41
- | | MathVision | 68.7 | 69.64 | **74.6** | 69.11 |
42
- | | MathVista | 85.6 | 85.0 | 85.8 | **86.2** |
43
- | | OlympiadBench | 65.0 | - | - | **74.92** |
44
- | | VisuLogic | 35.0 | 32.2 | 34.4 | **35.4** |
45
- | General VQA | RealWorldQA | 78.4 | 79.08 | 81.3 | **86.54** |
46
- | | MMStar | 77.8 | 76.67 | 78.7 | **86.67** |
47
- | | MMBench-en | 89.9 | 89.32 | 90.6 | **95.74** |
48
- | | MMbench-cn | 89.1 | 88.24 | - | **94.27** |
49
- | | MMVP | 69.3 | 72.0 | - | **88.0** |
50
- | | V* | 89.0 | - | - | **90.05** |
51
- | | HallusionBench | 60.3 | 64.83 | 66.7 | **72.3** |
52
- | Video | VideoMME | 77.9 | - | **79.0** | **79.0** |
53
- | | LongVideoBench | 74.0 | - | 65.2 (fp8) | **79.0** |
54
- | | MMVU | 70.1 | - | 78.4 (fp8) | **86.6** |
55
- | | TempCompass | **83.7** | - | 81.03 (fp8) | 77.75 |
56
- | Text Recog./Doc/chart | TextVQA | **81.8** | - | - | 76.21 |
57
- | | DocVQA_VAL | **96.9** | 96.52 | 96.5 | 95.39 |
58
- | | ChartQA_TEST | **89.1** | 87.68 | - | 86.68 |
59
- | | InfoVQA | **91.2** | - | 89.5 | 86.93 |
60
- | | CharXiv (RQ) | 60.2 | 64.4 | 66.1 | **79.4** |
61
- | | CharXiv (DQ) | 92.6 | 92.1 | - | **94.5** |
62
- | | AI2D_TEST | 87.3 | 88.37 | 89.2 | **91.19** |
63
- | Pure Text | AIME2025 | - | 85.83 | **89.7** | 83.3 |
64
- | | GPQA | - | **72.78** | - | 71.21 |
65
 
66
  ## Quickstart
67
 
 
13
 
14
  Meet Keye-VL-671B-A37B — the most powerful multi-modal language model in the Keye series to date.
15
 
16
+ As one of the largest and most capable MLLMs currently in existence, Keye-VL-671B-A37B demonstrates top-tier, and in some cases even leading, performance in text understanding and generation, complex visual perception and reasoning, comprehensive video understanding, and Olympiad-level mathematical reasoning.
17
 
18
  #### Key Enhancements:
19
 
 
32
 
33
  ## Model Performance
34
 
35
+ ![Performance Comparison](https://github.com/Kwai-Keye/Keye/blob/main/asset/radar.png?raw=true)
36
+
37
+ ![Performance on Public Benchmarks](https://github.com/Kwai-Keye/Keye/blob/main/asset/performance.png?raw=true)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  ## Quickstart
40
 
config.json CHANGED
@@ -5,7 +5,7 @@
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
  "auto_map": {
8
- "AutoConfig": "configuration_deepseek.DeepseekR1Config"
9
  },
10
  "bos_token_id": 0,
11
  "dtype": "bfloat16",
@@ -244,8 +244,7 @@
244
  ],
245
  "attention_dropout": 0.0,
246
  "auto_map": {
247
- "AutoConfig": "configuration_deepseek.KeyeVisionConfig",
248
- "AutoModel": "modeling_deepseek.SiglipVisionModel"
249
  },
250
  "has_learnable_position_embedding": true,
251
  "hidden_act": "gelu_pytorch_tanh",
 
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
  "auto_map": {
8
+ "AutoConfig": "configuration_deepseek.KeyeVLMoeConfig"
9
  },
10
  "bos_token_id": 0,
11
  "dtype": "bfloat16",
 
244
  ],
245
  "attention_dropout": 0.0,
246
  "auto_map": {
247
+ "AutoConfig": "configuration_deepseek.KeyeVisionConfig"
 
248
  },
249
  "has_learnable_position_embedding": true,
250
  "hidden_act": "gelu_pytorch_tanh",
configuration_deepseek.py CHANGED
@@ -60,7 +60,7 @@ class KeyeVisionConfig(PretrainedConfig):
60
  self.tokens_per_second = tokens_per_second
61
 
62
 
63
- class DeepseekR1Config(PretrainedConfig):
64
  r"""
65
  This is the configuration class to store the configuration of a [`KeyeModel`]. It is used to instantiate a
66
  KeyeVLMoeForConditionalGeneration model according to the specified arguments, defining the model architecture.
@@ -164,7 +164,7 @@ class DeepseekR1Config(PretrainedConfig):
164
  >>> configuration = model.config
165
  ```"""
166
 
167
- model_type = "deepseek_r1"
168
  sub_configs = {"vision_config": KeyeVisionConfig}
169
  keys_to_ignore_at_inference = ["past_key_values"]
170
 
@@ -263,4 +263,4 @@ class DeepseekR1Config(PretrainedConfig):
263
  **kwargs,
264
  )
265
 
266
- __all__ = ["DeepseekR1Config"]
 
60
  self.tokens_per_second = tokens_per_second
61
 
62
 
63
+ class KeyeVLMoeConfig(PretrainedConfig):
64
  r"""
65
  This is the configuration class to store the configuration of a [`KeyeModel`]. It is used to instantiate a
66
  KeyeVLMoeForConditionalGeneration model according to the specified arguments, defining the model architecture.
 
164
  >>> configuration = model.config
165
  ```"""
166
 
167
+ model_type = "KeyeVL"
168
  sub_configs = {"vision_config": KeyeVisionConfig}
169
  keys_to_ignore_at_inference = ["past_key_values"]
170
 
 
263
  **kwargs,
264
  )
265
 
266
+ __all__ = ["KeyeVLMoeConfig"]