Add community evaluation results for DEEP-SWE, GPQA, HLE, SWE-BENCH_PRO

#12
by nielsr HF Staff - opened
.eval_results/deep-swe.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ - dataset:
2
+ id: datacurve/deep-swe
3
+ task_id: deep_swe
4
+ value: 46.2
5
+ source:
6
+ url: https://huggingface.co/zai-org/GLM-5.2
7
+ name: Model Card
.eval_results/gpqa.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ - dataset:
2
+ id: Idavidrein/gpqa
3
+ task_id: diamond
4
+ value: 91.2
5
+ source:
6
+ url: https://huggingface.co/zai-org/GLM-5.2
7
+ name: Model Card
.eval_results/hle.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - dataset:
2
+ id: cais/hle
3
+ task_id: hle
4
+ value: 40.5
5
+ source:
6
+ url: https://huggingface.co/zai-org/GLM-5.2
7
+ name: Model Card
8
+
9
+ - dataset:
10
+ id: cais/hle
11
+ task_id: hle
12
+ value: 54.7
13
+ source:
14
+ url: https://huggingface.co/zai-org/GLM-5.2
15
+ name: Model Card
16
+ notes: "With tools"
.eval_results/swe-bench_pro.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ - dataset:
2
+ id: ScaleAI/SWE-bench_Pro
3
+ task_id: SWE_Bench_Pro
4
+ value: 62.1
5
+ source:
6
+ url: https://huggingface.co/zai-org/GLM-5.2
7
+ name: Model Card