nmmursit commited on
Commit
89043b9
·
1 Parent(s): 7dea7c1

updated human arena data

Browse files
Files changed (27) hide show
  1. result/human_arena/{detail_9269ad1c.json → detail_075bdf8e.json} +7 -7
  2. result/human_arena/{detail_28fc12fe.json → detail_1406b4e7.json} +7 -7
  3. result/human_arena/{detail_41959a49.json → detail_2208a984.json} +8 -8
  4. result/human_arena/{detail_108710af.json → detail_2b3c8f6f.json} +9 -9
  5. result/human_arena/{detail_9039f703.json → detail_38a3ba66.json} +8 -8
  6. result/human_arena/{detail_c6101f2a.json → detail_3f0f56f0.json} +9 -9
  7. result/human_arena/{detail_7b0f26ef.json → detail_66536d64.json} +8 -8
  8. result/human_arena/{detail_5c24d08a.json → detail_6730ed29.json} +3 -3
  9. result/human_arena/{detail_15b52ac5.json → detail_6c246e0d.json} +9 -9
  10. result/human_arena/{detail_3ac28fb0.json → detail_9085338d.json} +9 -9
  11. result/human_arena/{detail_47471816.json → detail_94a01d1e.json} +6 -6
  12. result/human_arena/{detail_d442d95f.json → detail_97567fd5.json} +9 -9
  13. result/human_arena/{detail_5ff4c010.json → detail_9aa4b0bf.json} +9 -9
  14. result/human_arena/{detail_ce44ff08.json → detail_9b9bb2ba.json} +7 -7
  15. result/human_arena/{detail_d3d5ea7c.json → detail_a10ef85a.json} +8 -8
  16. result/human_arena/{detail_318acf27.json → detail_ad1efebc.json} +6 -6
  17. result/human_arena/{detail_6b53b3f3.json → detail_ad3f6c99.json} +7 -7
  18. result/human_arena/{detail_37a12169.json → detail_aded928d.json} +7 -7
  19. result/human_arena/{detail_da4d94c9.json → detail_b49185d7.json} +9 -9
  20. result/human_arena/{detail_74139fe9.json → detail_b8046ea9.json} +9 -9
  21. result/human_arena/{detail_2cb1472f.json → detail_c77ffeba.json} +9 -9
  22. result/human_arena/{detail_f8254c92.json → detail_cadcb1aa.json} +7 -7
  23. result/human_arena/{detail_12087995.json → detail_dbd2f986.json} +7 -7
  24. result/human_arena/{detail_633cccf3.json → detail_ead2378e.json} +7 -7
  25. result/human_arena/{detail_b1bdd263.json → detail_ee6f0443.json} +6 -6
  26. result/human_arena/{detail_78ed43f0.json → detail_f190d45f.json} +7 -7
  27. result/human_arena/{detail_d7aced26.json → detail_f64d9185.json} +9 -9
result/human_arena/{detail_9269ad1c.json → detail_075bdf8e.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "grok-4-1-fast-reasoning",
3
- "run_id": "9269ad1c",
4
  "category": "reasoning",
5
- "elo_rating": 1436.0,
6
  "wins": 52,
7
- "losses": 50,
8
- "ties": 23,
9
- "total_games": 125,
10
  "win_rate": 42.0,
11
- "votes": 125,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "grok-4-1-fast-reasoning",
3
+ "run_id": "075bdf8e",
4
  "category": "reasoning",
5
+ "elo_rating": 1463.0,
6
  "wins": 52,
7
+ "losses": 49,
8
+ "ties": 22,
9
+ "total_games": 123,
10
  "win_rate": 42.0,
11
+ "votes": 123,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_28fc12fe.json → detail_1406b4e7.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "grok-4-fast-reasoning",
3
- "run_id": "28fc12fe",
4
  "category": "reasoning",
5
- "elo_rating": 1393.0,
6
  "wins": 37,
7
- "losses": 62,
8
- "ties": 30,
9
- "total_games": 129,
10
  "win_rate": 29.0,
11
- "votes": 129,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "grok-4-fast-reasoning",
3
+ "run_id": "1406b4e7",
4
  "category": "reasoning",
5
+ "elo_rating": 1418.0,
6
  "wins": 37,
7
+ "losses": 61,
8
+ "ties": 29,
9
+ "total_games": 127,
10
  "win_rate": 29.0,
11
+ "votes": 127,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_41959a49.json → detail_2208a984.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "gpt-5-mini",
3
- "run_id": "41959a49",
4
  "category": "reasoning",
5
- "elo_rating": 1375.0,
6
- "wins": 15,
7
- "losses": 54,
8
  "ties": 24,
9
- "total_games": 93,
10
- "win_rate": 16.0,
11
- "votes": 93,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "gpt-5-mini",
3
+ "run_id": "2208a984",
4
  "category": "reasoning",
5
+ "elo_rating": 1401.0,
6
+ "wins": 14,
7
+ "losses": 53,
8
  "ties": 24,
9
+ "total_games": 91,
10
+ "win_rate": 15.0,
11
+ "votes": 91,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_108710af.json → detail_2b3c8f6f.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "grok-3",
3
- "run_id": "108710af",
4
  "category": "general",
5
- "elo_rating": 1546.0,
6
- "wins": 539,
7
- "losses": 234,
8
- "ties": 458,
9
- "total_games": 1231,
10
- "win_rate": 44.0,
11
- "votes": 1231,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "grok-3",
3
+ "run_id": "2b3c8f6f",
4
  "category": "general",
5
+ "elo_rating": 1539.0,
6
+ "wins": 105,
7
+ "losses": 63,
8
+ "ties": 21,
9
+ "total_games": 189,
10
+ "win_rate": 56.0,
11
+ "votes": 189,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_9039f703.json → detail_38a3ba66.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "gemma-3-27b-it",
3
- "run_id": "9039f703",
4
  "category": "general",
5
- "elo_rating": 1556.0,
6
- "wins": 14,
7
- "losses": 11,
8
  "ties": 0,
9
- "total_games": 25,
10
- "win_rate": 56.0,
11
- "votes": 25,
12
  "dtype": "bfloat16",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "gemma-3-27b-it",
3
+ "run_id": "38a3ba66",
4
  "category": "general",
5
+ "elo_rating": 1512.0,
6
+ "wins": 10,
7
+ "losses": 10,
8
  "ties": 0,
9
+ "total_games": 20,
10
+ "win_rate": 50.0,
11
+ "votes": 20,
12
  "dtype": "bfloat16",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_c6101f2a.json → detail_3f0f56f0.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "newmindai/Llama-3.3-70B-Instruct",
3
- "run_id": "c6101f2a",
4
  "category": "general",
5
- "elo_rating": 1489.0,
6
- "wins": 584,
7
- "losses": 473,
8
- "ties": 294,
9
- "total_games": 1351,
10
- "win_rate": 43.0,
11
- "votes": 1351,
12
  "dtype": "bfloat16",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "newmindai/Llama-3.3-70B-Instruct",
3
+ "run_id": "3f0f56f0",
4
  "category": "general",
5
+ "elo_rating": 1472.0,
6
+ "wins": 17,
7
+ "losses": 31,
8
+ "ties": 7,
9
+ "total_games": 55,
10
+ "win_rate": 31.0,
11
+ "votes": 55,
12
  "dtype": "bfloat16",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_7b0f26ef.json → detail_66536d64.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "grok-4-0709",
3
- "run_id": "7b0f26ef",
4
  "category": "general",
5
- "elo_rating": 1456.0,
6
- "wins": 75,
7
- "losses": 90,
8
  "ties": 31,
9
- "total_games": 196,
10
- "win_rate": 38.0,
11
- "votes": 196,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "grok-4-0709",
3
+ "run_id": "66536d64",
4
  "category": "general",
5
+ "elo_rating": 1449.0,
6
+ "wins": 71,
7
+ "losses": 89,
8
  "ties": 31,
9
+ "total_games": 191,
10
+ "win_rate": 37.0,
11
+ "votes": 191,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_5c24d08a.json → detail_6730ed29.json} RENAMED
@@ -1,8 +1,8 @@
1
  {
2
  "model_name": "openai/gpt-oss-20b",
3
- "run_id": "5c24d08a",
4
  "category": "reasoning",
5
- "elo_rating": 1302.0,
6
  "wins": 33,
7
  "losses": 66,
8
  "ties": 25,
@@ -11,6 +11,6 @@
11
  "votes": 124,
12
  "dtype": "bfloat16",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "openai/gpt-oss-20b",
3
+ "run_id": "6730ed29",
4
  "category": "reasoning",
5
+ "elo_rating": 1328.0,
6
  "wins": 33,
7
  "losses": 66,
8
  "ties": 25,
 
11
  "votes": 124,
12
  "dtype": "bfloat16",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_15b52ac5.json → detail_6c246e0d.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "gpt-4o-mini",
3
- "run_id": "15b52ac5",
4
  "category": "general",
5
- "elo_rating": 1398.0,
6
- "wins": 69,
7
- "losses": 94,
8
- "ties": 35,
9
- "total_games": 198,
10
- "win_rate": 35.0,
11
- "votes": 198,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "gpt-4o-mini",
3
+ "run_id": "6c246e0d",
4
  "category": "general",
5
+ "elo_rating": 1391.0,
6
+ "wins": 66,
7
+ "losses": 92,
8
+ "ties": 34,
9
+ "total_games": 192,
10
+ "win_rate": 34.0,
11
+ "votes": 192,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_3ac28fb0.json → detail_9085338d.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "grok-4",
3
- "run_id": "3ac28fb0",
4
  "category": "general",
5
- "elo_rating": 1634.0,
6
- "wins": 113,
7
- "losses": 64,
8
- "ties": 21,
9
- "total_games": 198,
10
- "win_rate": 57.0,
11
- "votes": 198,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "grok-4",
3
+ "run_id": "9085338d",
4
  "category": "general",
5
+ "elo_rating": 1628.0,
6
+ "wins": 111,
7
+ "losses": 60,
8
+ "ties": 20,
9
+ "total_games": 191,
10
+ "win_rate": 58.0,
11
+ "votes": 191,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_47471816.json → detail_94a01d1e.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "gemini-2.5-pro-preview-03-25",
3
- "run_id": "47471816",
4
  "category": "reasoning",
5
- "elo_rating": 1736.0,
6
- "wins": 92,
7
  "losses": 23,
8
  "ties": 7,
9
- "total_games": 122,
10
  "win_rate": 75.0,
11
- "votes": 122,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "gemini-2.5-pro-preview-03-25",
3
+ "run_id": "94a01d1e",
4
  "category": "reasoning",
5
+ "elo_rating": 1760.0,
6
+ "wins": 91,
7
  "losses": 23,
8
  "ties": 7,
9
+ "total_games": 121,
10
  "win_rate": 75.0,
11
+ "votes": 121,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_d442d95f.json → detail_97567fd5.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "meta-llama/Llama-3.3-70B-Instruct",
3
- "run_id": "d442d95f",
4
  "category": "general",
5
- "elo_rating": 1421.0,
6
- "wins": 649,
7
- "losses": 505,
8
- "ties": 417,
9
- "total_games": 1571,
10
- "win_rate": 41.0,
11
- "votes": 1571,
12
  "dtype": "bfloat16",
13
  "license": "Llama-3.3",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "meta-llama/Llama-3.3-70B-Instruct",
3
+ "run_id": "97567fd5",
4
  "category": "general",
5
+ "elo_rating": 1414.0,
6
+ "wins": 58,
7
+ "losses": 100,
8
+ "ties": 39,
9
+ "total_games": 197,
10
+ "win_rate": 29.0,
11
+ "votes": 197,
12
  "dtype": "bfloat16",
13
  "license": "Llama-3.3",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_5ff4c010.json → detail_9aa4b0bf.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "Qwen/QwQ-32B",
3
- "run_id": "5ff4c010",
4
  "category": "general",
5
- "elo_rating": 1477.0,
6
- "wins": 387,
7
- "losses": 788,
8
- "ties": 165,
9
- "total_games": 1340,
10
- "win_rate": 29.0,
11
- "votes": 1340,
12
  "dtype": "bfloat16",
13
  "license": "Apache 2.0",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "Qwen/QwQ-32B",
3
+ "run_id": "9aa4b0bf",
4
  "category": "general",
5
+ "elo_rating": 1472.0,
6
+ "wins": 21,
7
+ "losses": 22,
8
+ "ties": 13,
9
+ "total_games": 56,
10
+ "win_rate": 38.0,
11
+ "votes": 56,
12
  "dtype": "bfloat16",
13
  "license": "Apache 2.0",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_ce44ff08.json → detail_9b9bb2ba.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "gpt-4o",
3
- "run_id": "ce44ff08",
4
  "category": "general",
5
- "elo_rating": 1379.0,
6
- "wins": 71,
7
- "losses": 100,
8
  "ties": 30,
9
- "total_games": 201,
10
  "win_rate": 35.0,
11
- "votes": 201,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "gpt-4o",
3
+ "run_id": "9b9bb2ba",
4
  "category": "general",
5
+ "elo_rating": 1373.0,
6
+ "wins": 67,
7
+ "losses": 97,
8
  "ties": 30,
9
+ "total_games": 194,
10
  "win_rate": 35.0,
11
+ "votes": 194,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_d3d5ea7c.json → detail_a10ef85a.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "qwen-plus",
3
- "run_id": "d3d5ea7c",
4
  "category": "general",
5
- "elo_rating": 1524.0,
6
  "wins": 73,
7
- "losses": 77,
8
- "ties": 36,
9
- "total_games": 186,
10
- "win_rate": 39.0,
11
- "votes": 186,
12
  "dtype": "Unknown",
13
  "license": "Qwen",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "qwen-plus",
3
+ "run_id": "a10ef85a",
4
  "category": "general",
5
+ "elo_rating": 1518.0,
6
  "wins": 73,
7
+ "losses": 75,
8
+ "ties": 35,
9
+ "total_games": 183,
10
+ "win_rate": 40.0,
11
+ "votes": 183,
12
  "dtype": "Unknown",
13
  "license": "Qwen",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_318acf27.json → detail_ad1efebc.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "openai/gpt-oss-120b",
3
- "run_id": "318acf27",
4
  "category": "reasoning",
5
- "elo_rating": 1381.0,
6
  "wins": 50,
7
- "losses": 58,
8
  "ties": 18,
9
- "total_games": 126,
10
  "win_rate": 40.0,
11
- "votes": 126,
12
  "dtype": "bfloat16",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "openai/gpt-oss-120b",
3
+ "run_id": "ad1efebc",
4
  "category": "reasoning",
5
+ "elo_rating": 1408.0,
6
  "wins": 50,
7
+ "losses": 56,
8
  "ties": 18,
9
+ "total_games": 124,
10
  "win_rate": 40.0,
11
+ "votes": 124,
12
  "dtype": "bfloat16",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_6b53b3f3.json → detail_ad3f6c99.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "gpt-5",
3
- "run_id": "6b53b3f3",
4
  "category": "reasoning",
5
- "elo_rating": 1748.0,
6
- "wins": 76,
7
- "losses": 37,
8
  "ties": 14,
9
- "total_games": 127,
10
  "win_rate": 60.0,
11
- "votes": 127,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "gpt-5",
3
+ "run_id": "ad3f6c99",
4
  "category": "reasoning",
5
+ "elo_rating": 1773.0,
6
+ "wins": 74,
7
+ "losses": 36,
8
  "ties": 14,
9
+ "total_games": 124,
10
  "win_rate": 60.0,
11
+ "votes": 124,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_37a12169.json → detail_aded928d.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "deepseek-ai/DeepSeek-V3",
3
- "run_id": "37a12169",
4
  "category": "general",
5
- "elo_rating": 1535.0,
6
  "wins": 81,
7
- "losses": 82,
8
  "ties": 38,
9
- "total_games": 201,
10
- "win_rate": 40.0,
11
- "votes": 201,
12
  "dtype": "bfloat16",
13
  "license": "MIT",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "deepseek-ai/DeepSeek-V3",
3
+ "run_id": "aded928d",
4
  "category": "general",
5
+ "elo_rating": 1528.0,
6
  "wins": 81,
7
+ "losses": 71,
8
  "ties": 38,
9
+ "total_games": 190,
10
+ "win_rate": 43.0,
11
+ "votes": 190,
12
  "dtype": "bfloat16",
13
  "license": "MIT",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_da4d94c9.json → detail_b49185d7.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "newmindai/QwQ-32B",
3
- "run_id": "da4d94c9",
4
  "category": "reasoning",
5
- "elo_rating": 1295.0,
6
- "wins": 284,
7
- "losses": 941,
8
- "ties": 180,
9
- "total_games": 1405,
10
- "win_rate": 20.0,
11
- "votes": 1405,
12
  "dtype": "bfloat16",
13
  "license": "Apache 2.0",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "newmindai/QwQ-32B",
3
+ "run_id": "b49185d7",
4
  "category": "reasoning",
5
+ "elo_rating": 1377.0,
6
+ "wins": 6,
7
+ "losses": 24,
8
+ "ties": 12,
9
+ "total_games": 42,
10
+ "win_rate": 14.0,
11
+ "votes": 42,
12
  "dtype": "bfloat16",
13
  "license": "Apache 2.0",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_74139fe9.json → detail_b8046ea9.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "gemini-2.0-flash",
3
- "run_id": "74139fe9",
4
  "category": "general",
5
- "elo_rating": 1575.0,
6
- "wins": 99,
7
- "losses": 69,
8
- "ties": 25,
9
- "total_games": 193,
10
- "win_rate": 51.0,
11
- "votes": 193,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "gemini-2.0-flash",
3
+ "run_id": "b8046ea9",
4
  "category": "general",
5
+ "elo_rating": 1569.0,
6
+ "wins": 98,
7
+ "losses": 66,
8
+ "ties": 24,
9
+ "total_games": 188,
10
+ "win_rate": 52.0,
11
+ "votes": 188,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_2cb1472f.json → detail_c77ffeba.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "grok-3-mini-fast-beta",
3
- "run_id": "2cb1472f",
4
  "category": "reasoning",
5
- "elo_rating": 1393.0,
6
- "wins": 414,
7
- "losses": 483,
8
- "ties": 242,
9
- "total_games": 1139,
10
- "win_rate": 36.0,
11
- "votes": 1139,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "grok-3-mini-fast-beta",
3
+ "run_id": "c77ffeba",
4
  "category": "reasoning",
5
+ "elo_rating": 1420.0,
6
+ "wins": 27,
7
+ "losses": 63,
8
+ "ties": 24,
9
+ "total_games": 114,
10
+ "win_rate": 24.0,
11
+ "votes": 114,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_f8254c92.json → detail_cadcb1aa.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "qwen-turbo",
3
- "run_id": "f8254c92",
4
  "category": "general",
5
- "elo_rating": 1458.0,
6
  "wins": 42,
7
- "losses": 97,
8
  "ties": 42,
9
- "total_games": 181,
10
- "win_rate": 23.0,
11
- "votes": 181,
12
  "dtype": "Unknown",
13
  "license": "Qwen",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "qwen-turbo",
3
+ "run_id": "cadcb1aa",
4
  "category": "general",
5
+ "elo_rating": 1451.0,
6
  "wins": 42,
7
+ "losses": 95,
8
  "ties": 42,
9
+ "total_games": 179,
10
+ "win_rate": 24.0,
11
+ "votes": 179,
12
  "dtype": "Unknown",
13
  "license": "Qwen",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_12087995.json → detail_dbd2f986.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "llama-3.3-70b-versatile",
3
- "run_id": "12087995",
4
  "category": "general",
5
- "elo_rating": 1489.0,
6
- "wins": 69,
7
- "losses": 105,
8
  "ties": 27,
9
- "total_games": 201,
10
  "win_rate": 34.0,
11
- "votes": 201,
12
  "dtype": "bfloat16",
13
  "license": "Llama-3.3",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "llama-3.3-70b-versatile",
3
+ "run_id": "dbd2f986",
4
  "category": "general",
5
+ "elo_rating": 1482.0,
6
+ "wins": 66,
7
+ "losses": 103,
8
  "ties": 27,
9
+ "total_games": 196,
10
  "win_rate": 34.0,
11
+ "votes": 196,
12
  "dtype": "bfloat16",
13
  "license": "Llama-3.3",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_633cccf3.json → detail_ead2378e.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "gemini-3-pro-preview",
3
- "run_id": "633cccf3",
4
  "category": "reasoning",
5
- "elo_rating": 1627.0,
6
- "wins": 77,
7
  "losses": 28,
8
  "ties": 9,
9
- "total_games": 114,
10
- "win_rate": 68.0,
11
- "votes": 114,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "gemini-3-pro-preview",
3
+ "run_id": "ead2378e",
4
  "category": "reasoning",
5
+ "elo_rating": 1652.0,
6
+ "wins": 75,
7
  "losses": 28,
8
  "ties": 9,
9
+ "total_games": 112,
10
+ "win_rate": 67.0,
11
+ "votes": 112,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_b1bdd263.json → detail_ee6f0443.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "qwen-max",
3
- "run_id": "b1bdd263",
4
  "category": "general",
5
- "elo_rating": 1595.0,
6
  "wins": 72,
7
- "losses": 68,
8
  "ties": 37,
9
- "total_games": 177,
10
  "win_rate": 41.0,
11
- "votes": 177,
12
  "dtype": "Unknown",
13
  "license": "Qwen",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "qwen-max",
3
+ "run_id": "ee6f0443",
4
  "category": "general",
5
+ "elo_rating": 1589.0,
6
  "wins": 72,
7
+ "losses": 67,
8
  "ties": 37,
9
+ "total_games": 176,
10
  "win_rate": 41.0,
11
+ "votes": 176,
12
  "dtype": "Unknown",
13
  "license": "Qwen",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_78ed43f0.json → detail_f190d45f.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "gemini-2.5-flash-preview-04-17",
3
- "run_id": "78ed43f0",
4
  "category": "general",
5
- "elo_rating": 1614.0,
6
- "wins": 128,
7
  "losses": 40,
8
  "ties": 28,
9
- "total_games": 196,
10
- "win_rate": 65.0,
11
- "votes": 196,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "gemini-2.5-flash-preview-04-17",
3
+ "run_id": "f190d45f",
4
  "category": "general",
5
+ "elo_rating": 1607.0,
6
+ "wins": 120,
7
  "losses": 40,
8
  "ties": 28,
9
+ "total_games": 188,
10
+ "win_rate": 64.0,
11
+ "votes": 188,
12
  "dtype": "Unknown",
13
  "license": "Proprietary",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }
result/human_arena/{detail_d7aced26.json → detail_f64d9185.json} RENAMED
@@ -1,16 +1,16 @@
1
  {
2
  "model_name": "Qwen/Qwen2.5-72B-Instruct",
3
- "run_id": "d7aced26",
4
  "category": "general",
5
- "elo_rating": 1469.0,
6
- "wins": 393,
7
- "losses": 539,
8
- "ties": 298,
9
- "total_games": 1230,
10
- "win_rate": 32.0,
11
- "votes": 1230,
12
  "dtype": "bfloat16",
13
  "license": "Qwen",
14
- "evaluation_date": "2026-01-19",
15
  "evaluation_type": "Human Arena"
16
  }
 
1
  {
2
  "model_name": "Qwen/Qwen2.5-72B-Instruct",
3
+ "run_id": "f64d9185",
4
  "category": "general",
5
+ "elo_rating": 1506.0,
6
+ "wins": 24,
7
+ "losses": 21,
8
+ "ties": 10,
9
+ "total_games": 55,
10
+ "win_rate": 44.0,
11
+ "votes": 55,
12
  "dtype": "bfloat16",
13
  "license": "Qwen",
14
+ "evaluation_date": "2026-01-21",
15
  "evaluation_type": "Human Arena"
16
  }