NowAI-Bench / data /leaderboard.json
bradnow's picture
Sync leaderboard: EOG + EVA 2026-06-10
ba5565d
Raw
History Blame Contribute Delete
3.07 kB
{
"updated": "2026-06-10T18:19:04.013Z",
"sources": {
"eog": {
"url": "https://raw.githubusercontent.com/EnterpriseOps-Gym/EnterpriseOps-Gym.github.io/main/script.js",
"hash": "sha256:f96e11145e82ffd4bb2654a9e7cf42f5ebd3b10f481aa1027be22a0d3a1badba",
"fetchedAt": "2026-06-10T18:19:04.012Z"
},
"eva": {
"url": "https://raw.githubusercontent.com/ServiceNow/eva/main/website/src/data/leaderboardStats.json",
"hash": "sha256:1f7a33021ffea8a3719c24cf31f75cfeff4b0271f77ef8abfd0063099081b288",
"fetchedAt": "2026-06-10T18:19:04.013Z"
}
},
"eog": {
"metricLabel": "Task Success Rate 路 Oracle mode",
"rows": [
{
"rank": 1,
"model": "Gemini 3.5 Flash (High)",
"org": "Google",
"type": "closed",
"score": 46,
"bar": 46
},
{
"rank": 2,
"model": "Claude Opus 4.6",
"org": "Anthropic",
"type": "closed",
"score": 44.6,
"bar": 45
},
{
"rank": 3,
"model": "GPT-5.5 (High)",
"org": "OpenAI",
"type": "closed",
"score": 41.2,
"bar": 41
},
{
"rank": 4,
"model": "Claude Opus 4.7 (High)",
"org": "Anthropic",
"type": "closed",
"score": 41.2,
"bar": 41
},
{
"rank": 5,
"model": "Claude Sonnet 4.6",
"org": "Anthropic",
"type": "closed",
"score": 40.4,
"bar": 40
}
]
},
"evaAccuracy": {
"metricLabel": "Pass@1",
"rows": [
{
"rank": 1,
"name": "Scribe v2.2 Realtime + GPT-5.4 + Eleven Flash v2 (ElevenAgents)",
"subtitle": "Mixed Models 路 Cascade",
"score": 0.67,
"ciLower": 0.62,
"ciUpper": 0.72,
"bar": 67
},
{
"rank": 2,
"name": "Grok Voice Think Fast 1.0",
"subtitle": "Grok 路 Speech-to-Speech",
"score": 0.59,
"ciLower": 0.54,
"ciUpper": 0.64,
"bar": 59
},
{
"rank": 3,
"name": "Nova 3 + GPT-5.4 + Sonic 3",
"subtitle": "Mixed Models 路 Cascade",
"score": 0.5,
"ciLower": 0.46,
"ciUpper": 0.54,
"bar": 50
}
]
},
"evaExperience": {
"metricLabel": "Pass@1",
"rows": [
{
"rank": 1,
"name": "Scribe v2.2 Realtime + Claude Haiku 4.5 + Eleven Flash v2 (ElevenAgents)",
"subtitle": "Mixed Models 路 Cascade",
"score": 0.82,
"ciLower": 0.79,
"ciUpper": 0.84,
"bar": 82
},
{
"rank": 2,
"name": "Gemini 3.1 Flash Live",
"subtitle": "Google 路 Speech-to-Speech",
"score": 0.59,
"ciLower": 0.56,
"ciUpper": 0.62,
"bar": 59
},
{
"rank": 3,
"name": "Grok Voice Think Fast 1.0",
"subtitle": "Grok 路 Speech-to-Speech",
"score": 0.57,
"ciLower": 0.53,
"ciUpper": 0.61,
"bar": 57
}
]
}
}