Add evaluation results

#43
by nielsr HF Staff - opened
.eval_results/swe_bench_pro.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ - dataset:
2
+     id: ScaleAI/SWE-bench_Pro
3
+     task_id: SWE_Bench_Pro
4
+   value: 44.3
5
+   source:
6
+     url: https://huggingface.co/papers/2603.00729
7
+     name: Qwen3-Coder-Next technical report
8
+     user: nielsr
9
+   notes: SWE-Agent as harness
.eval_results/swe_bench_verified.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ - dataset:
2
+     id: SWE-bench/SWE-bench_Verified
3
+     task_id: swe_bench_%_resolved
4
+   value: 70.6
5
+   source:
6
+     url: https://huggingface.co/papers/2603.00729
7
+     name: Qwen3-Coder-Next technical report
8
+     user: nielsr
9
+   notes: SWE-Agent as harness
.eval_results/terminal_bench_2.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ - dataset:
2
+     id: harborframework/terminal-bench-2.0
3
+     task_id: terminalbench_2
4
+   value: 36.2
5
+   source:
6
+     url: https://huggingface.co/Qwen/Qwen3-Coder-Next
7
+     name: Model Card
8
+     user: nielsr
9
+   notes: "agent: Terminus 2"