utdawn committed on
Commit
6c1e52f
·
verified ·
1 Parent(s): 0c40661

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +6 -6
README.md CHANGED
@@ -19,20 +19,20 @@ tags:
19
 
20
  | Benchmark | Qwen3-30B-A3B-Instruct-2507 | Ling-flash-2.0 | LLaDA2.0-flash-preview | LLaDA2.0-flash |
21
  | :---: | :---: | :---: | :---: | :---: |
22
- | **Average** | 79.47 | 78.03 | - | 79.32 |
23
  | **Knowledge** | | | | |
24
  | MMLU | 87.13 | 87.98 | 83.15 | 87.69 |
25
  | MMLU-Pro | 74.23 | 76.84 | 49.22 | 73.36 |
26
  | GPQA | 57.34 | 67.12 | 46.59 | 61.98 |
27
- | arc-c | 95.81 | 95.08 | | 95.93 |
28
  | CMMLU | 86.36 | 86.59 | 67.53 | 85.13 |
29
  | C-EVAL | 88.17 | 88.03 | 66.54 | 86.75 |
30
- | GAOKAO-Bench | 94.53 | 93.24 | - | 93.90 |
31
  | **Reasoning** | | | | |
32
  | SQuAD 2.0 | 89.51 | 81.32 | 85.61 | 90.00 |
33
  | DROP | 87.57 | 88.32 | 79.49 | 87.90 |
34
  | KOR-Bench | 68.00 | 68.96 | 37.26 | 64.24 |
35
- | HellaSwag | 86.31 | 81.59 | - | 84.97 |
36
  | **Coding** | | | | |
37
  | CRUXEval-O | 86.75 | 82.75 | 61.88 | 85.12 |
38
  | MBPP | 86.65 | 85.01 | 77.75 | 88.29 |
@@ -40,12 +40,12 @@ tags:
40
  | HumanEval | 93.29 | 85.98 | 80.49 | 94.51 |
41
  | Bigcodebench-Full | 41.49 | 40.70 | 30.44 | 41.58 |
42
  | LiveCodeBench | 41.63 | 44.11 | 28.58 | 42.29 |
43
- | Spider | 81.79 | 80.58 | - | 82.49 |
44
  | **Math** | | | | |
45
  | GSM8K | 96.36 | 95.45 | 89.01 | 96.06 |
46
  | MATH | 96.70 | 96.10 | 73.50 | 95.44 |
47
  | OlympiadBench | 77.59 | 76.19 | 47.78 | 74.07 |
48
- | AIME 2025 | 61.88 | 55.89 | - | 60.00 |
49
  | **Agent & Alignment** | | | | |
50
  | BFCL_Live | 73.19 | 67.57 | 74.11 | 75.43 |
51
  | IFEval-strict-prompt | 84.29 | 81.52 | 62.50 | 81.70 |
 
19
 
20
  | Benchmark | Qwen3-30B-A3B-Instruct-2507 | Ling-flash-2.0 | LLaDA2.0-flash-preview | LLaDA2.0-flash |
21
  | :---: | :---: | :---: | :---: | :---: |
22
+ | **Average** | 79.47 | 78.03 | 71.92 | 79.32 |
23
  | **Knowledge** | | | | |
24
  | MMLU | 87.13 | 87.98 | 83.15 | 87.69 |
25
  | MMLU-Pro | 74.23 | 76.84 | 49.22 | 73.36 |
26
  | GPQA | 57.34 | 67.12 | 46.59 | 61.98 |
27
+ | ARC-C | 95.81 | 95.08 | 93.90 | 95.93 |
28
  | CMMLU | 86.36 | 86.59 | 67.53 | 85.13 |
29
  | C-EVAL | 88.17 | 88.03 | 66.54 | 86.75 |
30
+ | GAOKAO-Bench | 94.53 | 93.24 | 86.12 | 93.90 |
31
  | **Reasoning** | | | | |
32
  | SQuAD 2.0 | 89.51 | 81.32 | 85.61 | 90.00 |
33
  | DROP | 87.57 | 88.32 | 79.49 | 87.90 |
34
  | KOR-Bench | 68.00 | 68.96 | 37.26 | 64.24 |
35
+ | HellaSwag | 86.31 | 81.59 | 86.00 | 84.97 |
36
  | **Coding** | | | | |
37
  | CRUXEval-O | 86.75 | 82.75 | 61.88 | 85.12 |
38
  | MBPP | 86.65 | 85.01 | 77.75 | 88.29 |
 
40
  | HumanEval | 93.29 | 85.98 | 80.49 | 94.51 |
41
  | Bigcodebench-Full | 41.49 | 40.70 | 30.44 | 41.58 |
42
  | LiveCodeBench | 41.63 | 44.11 | 28.58 | 42.29 |
43
+ | Spider | 81.79 | 80.58 | 81.37 | 82.49 |
44
  | **Math** | | | | |
45
  | GSM8K | 96.36 | 95.45 | 89.01 | 96.06 |
46
  | MATH | 96.70 | 96.10 | 73.50 | 95.44 |
47
  | OlympiadBench | 77.59 | 76.19 | 47.78 | 74.07 |
48
+ | AIME 2025 | 61.88 | 55.89 | 23.33 | 60.00 |
49
  | **Agent & Alignment** | | | | |
50
  | BFCL_Live | 73.19 | 67.57 | 74.11 | 75.43 |
51
  | IFEval-strict-prompt | 84.29 | 81.52 | 62.50 | 81.70 |