Update README.md
Browse files
README.md
CHANGED
|
@@ -19,20 +19,20 @@ tags:
|
|
| 19 |
|
| 20 |
| Benchmark | Qwen3-30B-A3B-Instruct-2507 | Ling-flash-2.0 | LLaDA2.0-flash-preview | LLaDA2.0-flash |
|
| 21 |
| :---: | :---: | :---: | :---: | :---: |
|
| 22 |
-
| **Average** | 79.47 | 78.03 |
|
| 23 |
| **Knowledge** | | | | |
|
| 24 |
| MMLU | 87.13 | 87.98 | 83.15 | 87.69 |
|
| 25 |
| MMLU-Pro | 74.23 | 76.84 | 49.22 | 73.36 |
|
| 26 |
| GPQA | 57.34 | 67.12 | 46.59 | 61.98 |
|
| 27 |
-
| arc-c | 95.81 | 95.08 | | 95.93 |
|
| 28 |
| CMMLU | 86.36 | 86.59 | 67.53 | 85.13 |
|
| 29 |
| C-EVAL | 88.17 | 88.03 | 66.54 | 86.75 |
|
| 30 |
-
| GAOKAO-Bench | 94.53 | 93.24 |
|
| 31 |
| **Reasoning** | | | | |
|
| 32 |
| SQuAD 2.0 | 89.51 | 81.32 | 85.61 | 90.00 |
|
| 33 |
| DROP | 87.57 | 88.32 | 79.49 | 87.90 |
|
| 34 |
| KOR-Bench | 68.00 | 68.96 | 37.26 | 64.24 |
|
| 35 |
-
| HellaSwag | 86.31 | 81.59 |
|
| 36 |
| **Coding** | | | | |
|
| 37 |
| CRUXEval-O | 86.75 | 82.75 | 61.88 | 85.12 |
|
| 38 |
| MBPP | 86.65 | 85.01 | 77.75 | 88.29 |
|
|
@@ -40,12 +40,12 @@ tags:
|
|
| 40 |
| HumanEval | 93.29 | 85.98 | 80.49 | 94.51 |
|
| 41 |
| Bigcodebench-Full | 41.49 | 40.70 | 30.44 | 41.58 |
|
| 42 |
| LiveCodeBench | 41.63 | 44.11 | 28.58 | 42.29 |
|
| 43 |
-
| Spider | 81.79 | 80.58 |
|
| 44 |
| **Math** | | | | |
|
| 45 |
| GSM8K | 96.36 | 95.45 | 89.01 | 96.06 |
|
| 46 |
| MATH | 96.70 | 96.10 | 73.50 | 95.44 |
|
| 47 |
| OlympiadBench | 77.59 | 76.19 | 47.78 | 74.07 |
|
| 48 |
-
| AIME 2025 | 61.88 | 55.89 |
|
| 49 |
| **Agent & Alignment** | | | | |
|
| 50 |
| BFCL_Live | 73.19 | 67.57 | 74.11 | 75.43 |
|
| 51 |
| IFEval-strict-prompt | 84.29 | 81.52 | 62.50 | 81.70 |
|
|
|
|
| 19 |
|
| 20 |
| Benchmark | Qwen3-30B-A3B-Instruct-2507 | Ling-flash-2.0 | LLaDA2.0-flash-preview | LLaDA2.0-flash |
|
| 21 |
| :---: | :---: | :---: | :---: | :---: |
|
| 22 |
+
| **Average** | 79.47 | 78.03 | 71.92 | 79.32 |
|
| 23 |
| **Knowledge** | | | | |
|
| 24 |
| MMLU | 87.13 | 87.98 | 83.15 | 87.69 |
|
| 25 |
| MMLU-Pro | 74.23 | 76.84 | 49.22 | 73.36 |
|
| 26 |
| GPQA | 57.34 | 67.12 | 46.59 | 61.98 |
|
| 27 |
+
| arc-c | 95.81 | 95.08 | 93.90 | 95.93 |
|
| 28 |
| CMMLU | 86.36 | 86.59 | 67.53 | 85.13 |
|
| 29 |
| C-EVAL | 88.17 | 88.03 | 66.54 | 86.75 |
|
| 30 |
+
| GAOKAO-Bench | 94.53 | 93.24 | 86.12 | 93.90 |
|
| 31 |
| **Reasoning** | | | | |
|
| 32 |
| SQuAD 2.0 | 89.51 | 81.32 | 85.61 | 90.00 |
|
| 33 |
| DROP | 87.57 | 88.32 | 79.49 | 87.90 |
|
| 34 |
| KOR-Bench | 68.00 | 68.96 | 37.26 | 64.24 |
|
| 35 |
+
| HellaSwag | 86.31 | 81.59 | 86.00 | 84.97 |
|
| 36 |
| **Coding** | | | | |
|
| 37 |
| CRUXEval-O | 86.75 | 82.75 | 61.88 | 85.12 |
|
| 38 |
| MBPP | 86.65 | 85.01 | 77.75 | 88.29 |
|
|
|
|
| 40 |
| HumanEval | 93.29 | 85.98 | 80.49 | 94.51 |
|
| 41 |
| Bigcodebench-Full | 41.49 | 40.70 | 30.44 | 41.58 |
|
| 42 |
| LiveCodeBench | 41.63 | 44.11 | 28.58 | 42.29 |
|
| 43 |
+
| Spider | 81.79 | 80.58 | 81.37 | 82.49 |
|
| 44 |
| **Math** | | | | |
|
| 45 |
| GSM8K | 96.36 | 95.45 | 89.01 | 96.06 |
|
| 46 |
| MATH | 96.70 | 96.10 | 73.50 | 95.44 |
|
| 47 |
| OlympiadBench | 77.59 | 76.19 | 47.78 | 74.07 |
|
| 48 |
+
| AIME 2025 | 61.88 | 55.89 | 23.33 | 60.00 |
|
| 49 |
| **Agent & Alignment** | | | | |
|
| 50 |
| BFCL_Live | 73.19 | 67.57 | 74.11 | 75.43 |
|
| 51 |
| IFEval-strict-prompt | 84.29 | 81.52 | 62.50 | 81.70 |
|