updated human arena data
Browse files- result/human_arena/{detail_9269ad1c.json → detail_075bdf8e.json} +7 -7
- result/human_arena/{detail_28fc12fe.json → detail_1406b4e7.json} +7 -7
- result/human_arena/{detail_41959a49.json → detail_2208a984.json} +8 -8
- result/human_arena/{detail_108710af.json → detail_2b3c8f6f.json} +9 -9
- result/human_arena/{detail_9039f703.json → detail_38a3ba66.json} +8 -8
- result/human_arena/{detail_c6101f2a.json → detail_3f0f56f0.json} +9 -9
- result/human_arena/{detail_7b0f26ef.json → detail_66536d64.json} +8 -8
- result/human_arena/{detail_5c24d08a.json → detail_6730ed29.json} +3 -3
- result/human_arena/{detail_15b52ac5.json → detail_6c246e0d.json} +9 -9
- result/human_arena/{detail_3ac28fb0.json → detail_9085338d.json} +9 -9
- result/human_arena/{detail_47471816.json → detail_94a01d1e.json} +6 -6
- result/human_arena/{detail_d442d95f.json → detail_97567fd5.json} +9 -9
- result/human_arena/{detail_5ff4c010.json → detail_9aa4b0bf.json} +9 -9
- result/human_arena/{detail_ce44ff08.json → detail_9b9bb2ba.json} +7 -7
- result/human_arena/{detail_d3d5ea7c.json → detail_a10ef85a.json} +8 -8
- result/human_arena/{detail_318acf27.json → detail_ad1efebc.json} +6 -6
- result/human_arena/{detail_6b53b3f3.json → detail_ad3f6c99.json} +7 -7
- result/human_arena/{detail_37a12169.json → detail_aded928d.json} +7 -7
- result/human_arena/{detail_da4d94c9.json → detail_b49185d7.json} +9 -9
- result/human_arena/{detail_74139fe9.json → detail_b8046ea9.json} +9 -9
- result/human_arena/{detail_2cb1472f.json → detail_c77ffeba.json} +9 -9
- result/human_arena/{detail_f8254c92.json → detail_cadcb1aa.json} +7 -7
- result/human_arena/{detail_12087995.json → detail_dbd2f986.json} +7 -7
- result/human_arena/{detail_633cccf3.json → detail_ead2378e.json} +7 -7
- result/human_arena/{detail_b1bdd263.json → detail_ee6f0443.json} +6 -6
- result/human_arena/{detail_78ed43f0.json → detail_f190d45f.json} +7 -7
- result/human_arena/{detail_d7aced26.json → detail_f64d9185.json} +9 -9
result/human_arena/{detail_9269ad1c.json → detail_075bdf8e.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "grok-4-1-fast-reasoning",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "reasoning",
|
| 5 |
-
"elo_rating":
|
| 6 |
"wins": 52,
|
| 7 |
-
"losses":
|
| 8 |
-
"ties":
|
| 9 |
-
"total_games":
|
| 10 |
"win_rate": 42.0,
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "grok-4-1-fast-reasoning",
|
| 3 |
+
"run_id": "075bdf8e",
|
| 4 |
"category": "reasoning",
|
| 5 |
+
"elo_rating": 1463.0,
|
| 6 |
"wins": 52,
|
| 7 |
+
"losses": 49,
|
| 8 |
+
"ties": 22,
|
| 9 |
+
"total_games": 123,
|
| 10 |
"win_rate": 42.0,
|
| 11 |
+
"votes": 123,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_28fc12fe.json → detail_1406b4e7.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "grok-4-fast-reasoning",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "reasoning",
|
| 5 |
-
"elo_rating":
|
| 6 |
"wins": 37,
|
| 7 |
-
"losses":
|
| 8 |
-
"ties":
|
| 9 |
-
"total_games":
|
| 10 |
"win_rate": 29.0,
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "grok-4-fast-reasoning",
|
| 3 |
+
"run_id": "1406b4e7",
|
| 4 |
"category": "reasoning",
|
| 5 |
+
"elo_rating": 1418.0,
|
| 6 |
"wins": 37,
|
| 7 |
+
"losses": 61,
|
| 8 |
+
"ties": 29,
|
| 9 |
+
"total_games": 127,
|
| 10 |
"win_rate": 29.0,
|
| 11 |
+
"votes": 127,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_41959a49.json → detail_2208a984.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "gpt-5-mini",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "reasoning",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
"ties": 24,
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "gpt-5-mini",
|
| 3 |
+
"run_id": "2208a984",
|
| 4 |
"category": "reasoning",
|
| 5 |
+
"elo_rating": 1401.0,
|
| 6 |
+
"wins": 14,
|
| 7 |
+
"losses": 53,
|
| 8 |
"ties": 24,
|
| 9 |
+
"total_games": 91,
|
| 10 |
+
"win_rate": 15.0,
|
| 11 |
+
"votes": 91,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_108710af.json → detail_2b3c8f6f.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "grok-3",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
-
"ties":
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "grok-3",
|
| 3 |
+
"run_id": "2b3c8f6f",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1539.0,
|
| 6 |
+
"wins": 105,
|
| 7 |
+
"losses": 63,
|
| 8 |
+
"ties": 21,
|
| 9 |
+
"total_games": 189,
|
| 10 |
+
"win_rate": 56.0,
|
| 11 |
+
"votes": 189,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_9039f703.json → detail_38a3ba66.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "gemma-3-27b-it",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
"ties": 0,
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "gemma-3-27b-it",
|
| 3 |
+
"run_id": "38a3ba66",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1512.0,
|
| 6 |
+
"wins": 10,
|
| 7 |
+
"losses": 10,
|
| 8 |
"ties": 0,
|
| 9 |
+
"total_games": 20,
|
| 10 |
+
"win_rate": 50.0,
|
| 11 |
+
"votes": 20,
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_c6101f2a.json → detail_3f0f56f0.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "newmindai/Llama-3.3-70B-Instruct",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
-
"ties":
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "newmindai/Llama-3.3-70B-Instruct",
|
| 3 |
+
"run_id": "3f0f56f0",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1472.0,
|
| 6 |
+
"wins": 17,
|
| 7 |
+
"losses": 31,
|
| 8 |
+
"ties": 7,
|
| 9 |
+
"total_games": 55,
|
| 10 |
+
"win_rate": 31.0,
|
| 11 |
+
"votes": 55,
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_7b0f26ef.json → detail_66536d64.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "grok-4-0709",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
"ties": 31,
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "grok-4-0709",
|
| 3 |
+
"run_id": "66536d64",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1449.0,
|
| 6 |
+
"wins": 71,
|
| 7 |
+
"losses": 89,
|
| 8 |
"ties": 31,
|
| 9 |
+
"total_games": 191,
|
| 10 |
+
"win_rate": 37.0,
|
| 11 |
+
"votes": 191,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_5c24d08a.json → detail_6730ed29.json}
RENAMED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "openai/gpt-oss-20b",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "reasoning",
|
| 5 |
-
"elo_rating":
|
| 6 |
"wins": 33,
|
| 7 |
"losses": 66,
|
| 8 |
"ties": 25,
|
|
@@ -11,6 +11,6 @@
|
|
| 11 |
"votes": 124,
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "openai/gpt-oss-20b",
|
| 3 |
+
"run_id": "6730ed29",
|
| 4 |
"category": "reasoning",
|
| 5 |
+
"elo_rating": 1328.0,
|
| 6 |
"wins": 33,
|
| 7 |
"losses": 66,
|
| 8 |
"ties": 25,
|
|
|
|
| 11 |
"votes": 124,
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_15b52ac5.json → detail_6c246e0d.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "gpt-4o-mini",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
-
"ties":
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "gpt-4o-mini",
|
| 3 |
+
"run_id": "6c246e0d",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1391.0,
|
| 6 |
+
"wins": 66,
|
| 7 |
+
"losses": 92,
|
| 8 |
+
"ties": 34,
|
| 9 |
+
"total_games": 192,
|
| 10 |
+
"win_rate": 34.0,
|
| 11 |
+
"votes": 192,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_3ac28fb0.json → detail_9085338d.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "grok-4",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
-
"ties":
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "grok-4",
|
| 3 |
+
"run_id": "9085338d",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1628.0,
|
| 6 |
+
"wins": 111,
|
| 7 |
+
"losses": 60,
|
| 8 |
+
"ties": 20,
|
| 9 |
+
"total_games": 191,
|
| 10 |
+
"win_rate": 58.0,
|
| 11 |
+
"votes": 191,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_47471816.json → detail_94a01d1e.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "gemini-2.5-pro-preview-03-25",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "reasoning",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
"losses": 23,
|
| 8 |
"ties": 7,
|
| 9 |
-
"total_games":
|
| 10 |
"win_rate": 75.0,
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "gemini-2.5-pro-preview-03-25",
|
| 3 |
+
"run_id": "94a01d1e",
|
| 4 |
"category": "reasoning",
|
| 5 |
+
"elo_rating": 1760.0,
|
| 6 |
+
"wins": 91,
|
| 7 |
"losses": 23,
|
| 8 |
"ties": 7,
|
| 9 |
+
"total_games": 121,
|
| 10 |
"win_rate": 75.0,
|
| 11 |
+
"votes": 121,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_d442d95f.json → detail_97567fd5.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "meta-llama/Llama-3.3-70B-Instruct",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
-
"ties":
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Llama-3.3",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "meta-llama/Llama-3.3-70B-Instruct",
|
| 3 |
+
"run_id": "97567fd5",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1414.0,
|
| 6 |
+
"wins": 58,
|
| 7 |
+
"losses": 100,
|
| 8 |
+
"ties": 39,
|
| 9 |
+
"total_games": 197,
|
| 10 |
+
"win_rate": 29.0,
|
| 11 |
+
"votes": 197,
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Llama-3.3",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_5ff4c010.json → detail_9aa4b0bf.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "Qwen/QwQ-32B",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
-
"ties":
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Apache 2.0",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "Qwen/QwQ-32B",
|
| 3 |
+
"run_id": "9aa4b0bf",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1472.0,
|
| 6 |
+
"wins": 21,
|
| 7 |
+
"losses": 22,
|
| 8 |
+
"ties": 13,
|
| 9 |
+
"total_games": 56,
|
| 10 |
+
"win_rate": 38.0,
|
| 11 |
+
"votes": 56,
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Apache 2.0",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_ce44ff08.json → detail_9b9bb2ba.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "gpt-4o",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
"ties": 30,
|
| 9 |
-
"total_games":
|
| 10 |
"win_rate": 35.0,
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "gpt-4o",
|
| 3 |
+
"run_id": "9b9bb2ba",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1373.0,
|
| 6 |
+
"wins": 67,
|
| 7 |
+
"losses": 97,
|
| 8 |
"ties": 30,
|
| 9 |
+
"total_games": 194,
|
| 10 |
"win_rate": 35.0,
|
| 11 |
+
"votes": 194,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_d3d5ea7c.json → detail_a10ef85a.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "qwen-plus",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
"wins": 73,
|
| 7 |
-
"losses":
|
| 8 |
-
"ties":
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Qwen",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "qwen-plus",
|
| 3 |
+
"run_id": "a10ef85a",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1518.0,
|
| 6 |
"wins": 73,
|
| 7 |
+
"losses": 75,
|
| 8 |
+
"ties": 35,
|
| 9 |
+
"total_games": 183,
|
| 10 |
+
"win_rate": 40.0,
|
| 11 |
+
"votes": 183,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Qwen",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_318acf27.json → detail_ad1efebc.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "openai/gpt-oss-120b",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "reasoning",
|
| 5 |
-
"elo_rating":
|
| 6 |
"wins": 50,
|
| 7 |
-
"losses":
|
| 8 |
"ties": 18,
|
| 9 |
-
"total_games":
|
| 10 |
"win_rate": 40.0,
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "openai/gpt-oss-120b",
|
| 3 |
+
"run_id": "ad1efebc",
|
| 4 |
"category": "reasoning",
|
| 5 |
+
"elo_rating": 1408.0,
|
| 6 |
"wins": 50,
|
| 7 |
+
"losses": 56,
|
| 8 |
"ties": 18,
|
| 9 |
+
"total_games": 124,
|
| 10 |
"win_rate": 40.0,
|
| 11 |
+
"votes": 124,
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_6b53b3f3.json → detail_ad3f6c99.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "gpt-5",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "reasoning",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
"ties": 14,
|
| 9 |
-
"total_games":
|
| 10 |
"win_rate": 60.0,
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "gpt-5",
|
| 3 |
+
"run_id": "ad3f6c99",
|
| 4 |
"category": "reasoning",
|
| 5 |
+
"elo_rating": 1773.0,
|
| 6 |
+
"wins": 74,
|
| 7 |
+
"losses": 36,
|
| 8 |
"ties": 14,
|
| 9 |
+
"total_games": 124,
|
| 10 |
"win_rate": 60.0,
|
| 11 |
+
"votes": 124,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_37a12169.json → detail_aded928d.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "deepseek-ai/DeepSeek-V3",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
"wins": 81,
|
| 7 |
-
"losses":
|
| 8 |
"ties": 38,
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "MIT",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "deepseek-ai/DeepSeek-V3",
|
| 3 |
+
"run_id": "aded928d",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1528.0,
|
| 6 |
"wins": 81,
|
| 7 |
+
"losses": 71,
|
| 8 |
"ties": 38,
|
| 9 |
+
"total_games": 190,
|
| 10 |
+
"win_rate": 43.0,
|
| 11 |
+
"votes": 190,
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "MIT",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_da4d94c9.json → detail_b49185d7.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "newmindai/QwQ-32B",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "reasoning",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
-
"ties":
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Apache 2.0",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "newmindai/QwQ-32B",
|
| 3 |
+
"run_id": "b49185d7",
|
| 4 |
"category": "reasoning",
|
| 5 |
+
"elo_rating": 1377.0,
|
| 6 |
+
"wins": 6,
|
| 7 |
+
"losses": 24,
|
| 8 |
+
"ties": 12,
|
| 9 |
+
"total_games": 42,
|
| 10 |
+
"win_rate": 14.0,
|
| 11 |
+
"votes": 42,
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Apache 2.0",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_74139fe9.json → detail_b8046ea9.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "gemini-2.0-flash",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
-
"ties":
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "gemini-2.0-flash",
|
| 3 |
+
"run_id": "b8046ea9",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1569.0,
|
| 6 |
+
"wins": 98,
|
| 7 |
+
"losses": 66,
|
| 8 |
+
"ties": 24,
|
| 9 |
+
"total_games": 188,
|
| 10 |
+
"win_rate": 52.0,
|
| 11 |
+
"votes": 188,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_2cb1472f.json → detail_c77ffeba.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "grok-3-mini-fast-beta",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "reasoning",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
-
"ties":
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "grok-3-mini-fast-beta",
|
| 3 |
+
"run_id": "c77ffeba",
|
| 4 |
"category": "reasoning",
|
| 5 |
+
"elo_rating": 1420.0,
|
| 6 |
+
"wins": 27,
|
| 7 |
+
"losses": 63,
|
| 8 |
+
"ties": 24,
|
| 9 |
+
"total_games": 114,
|
| 10 |
+
"win_rate": 24.0,
|
| 11 |
+
"votes": 114,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_f8254c92.json → detail_cadcb1aa.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "qwen-turbo",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
"wins": 42,
|
| 7 |
-
"losses":
|
| 8 |
"ties": 42,
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Qwen",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "qwen-turbo",
|
| 3 |
+
"run_id": "cadcb1aa",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1451.0,
|
| 6 |
"wins": 42,
|
| 7 |
+
"losses": 95,
|
| 8 |
"ties": 42,
|
| 9 |
+
"total_games": 179,
|
| 10 |
+
"win_rate": 24.0,
|
| 11 |
+
"votes": 179,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Qwen",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_12087995.json → detail_dbd2f986.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "llama-3.3-70b-versatile",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
"ties": 27,
|
| 9 |
-
"total_games":
|
| 10 |
"win_rate": 34.0,
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Llama-3.3",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "llama-3.3-70b-versatile",
|
| 3 |
+
"run_id": "dbd2f986",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1482.0,
|
| 6 |
+
"wins": 66,
|
| 7 |
+
"losses": 103,
|
| 8 |
"ties": 27,
|
| 9 |
+
"total_games": 196,
|
| 10 |
"win_rate": 34.0,
|
| 11 |
+
"votes": 196,
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Llama-3.3",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_633cccf3.json → detail_ead2378e.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "gemini-3-pro-preview",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "reasoning",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
"losses": 28,
|
| 8 |
"ties": 9,
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "gemini-3-pro-preview",
|
| 3 |
+
"run_id": "ead2378e",
|
| 4 |
"category": "reasoning",
|
| 5 |
+
"elo_rating": 1652.0,
|
| 6 |
+
"wins": 75,
|
| 7 |
"losses": 28,
|
| 8 |
"ties": 9,
|
| 9 |
+
"total_games": 112,
|
| 10 |
+
"win_rate": 67.0,
|
| 11 |
+
"votes": 112,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_b1bdd263.json → detail_ee6f0443.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "qwen-max",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
"wins": 72,
|
| 7 |
-
"losses":
|
| 8 |
"ties": 37,
|
| 9 |
-
"total_games":
|
| 10 |
"win_rate": 41.0,
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Qwen",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "qwen-max",
|
| 3 |
+
"run_id": "ee6f0443",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1589.0,
|
| 6 |
"wins": 72,
|
| 7 |
+
"losses": 67,
|
| 8 |
"ties": 37,
|
| 9 |
+
"total_games": 176,
|
| 10 |
"win_rate": 41.0,
|
| 11 |
+
"votes": 176,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Qwen",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_78ed43f0.json → detail_f190d45f.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "gemini-2.5-flash-preview-04-17",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
"losses": 40,
|
| 8 |
"ties": 28,
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "gemini-2.5-flash-preview-04-17",
|
| 3 |
+
"run_id": "f190d45f",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1607.0,
|
| 6 |
+
"wins": 120,
|
| 7 |
"losses": 40,
|
| 8 |
"ties": 28,
|
| 9 |
+
"total_games": 188,
|
| 10 |
+
"win_rate": 64.0,
|
| 11 |
+
"votes": 188,
|
| 12 |
"dtype": "Unknown",
|
| 13 |
"license": "Proprietary",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
result/human_arena/{detail_d7aced26.json → detail_f64d9185.json}
RENAMED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
| 3 |
-
"run_id": "
|
| 4 |
"category": "general",
|
| 5 |
-
"elo_rating":
|
| 6 |
-
"wins":
|
| 7 |
-
"losses":
|
| 8 |
-
"ties":
|
| 9 |
-
"total_games":
|
| 10 |
-
"win_rate":
|
| 11 |
-
"votes":
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Qwen",
|
| 14 |
-
"evaluation_date": "2026-01-
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
| 3 |
+
"run_id": "f64d9185",
|
| 4 |
"category": "general",
|
| 5 |
+
"elo_rating": 1506.0,
|
| 6 |
+
"wins": 24,
|
| 7 |
+
"losses": 21,
|
| 8 |
+
"ties": 10,
|
| 9 |
+
"total_games": 55,
|
| 10 |
+
"win_rate": 44.0,
|
| 11 |
+
"votes": 55,
|
| 12 |
"dtype": "bfloat16",
|
| 13 |
"license": "Qwen",
|
| 14 |
+
"evaluation_date": "2026-01-21",
|
| 15 |
"evaluation_type": "Human Arena"
|
| 16 |
}
|