Spaces:

ServiceNow-AI
/

NowAI-Bench

Running

bradnow committed on Jun 3

Commit

b384f8e

1 Parent(s): 84f5427

feat: bars show absolute % of metric max (EOG/100, EVA/1.0)

Files changed (2) hide show

.claude/skills/sync-nowai-leaderboard/lib/transform.mjs CHANGED Viewed

@@ -52,7 +52,6 @@ export function vendorFromName(name) {
 export function deriveEog(eogArray) {
   const sorted = [...eogArray].sort((a, b) => b.avg - a.avg).slice(0, 5);
-  const top = sorted[0]?.avg ?? 0;
   return {
     metricLabel: 'Task Success Rate · Oracle mode',
     rows: sorted.map((m, i) => ({
@@ -61,7 +60,8 @@ export function deriveEog(eogArray) {
       org: vendorFromName(m.model),
       type: m.type,
       score: round1(m.avg),
-      bar: top ? Math.round((m.avg / top) * 100) : 0,
     })),
   };
 }
@@ -80,7 +80,6 @@ export function deriveEvaBoard(systems, metricKey) {
     .filter((x) => x.pooled && typeof x.pooled.point === 'number')
     .sort((a, b) => b.pooled.point - a.pooled.point)
     .slice(0, 5);
-  const top = scored[0]?.pooled.point ?? 0;
   return {
     metricLabel: 'Pass@1',
     rows: scored.map(({ s, pooled }, i) => ({
@@ -93,7 +92,8 @@ export function deriveEvaBoard(systems, metricKey) {
       score: round2(pooled.point),
       ciLower: round2(pooled.ci_lower),
       ciUpper: round2(pooled.ci_upper),
-      bar: top ? Math.round((pooled.point / top) * 100) : 0,
     })),
   };
 }

 export function deriveEog(eogArray) {
   const sorted = [...eogArray].sort((a, b) => b.avg - a.avg).slice(0, 5);
   return {
     metricLabel: 'Task Success Rate · Oracle mode',
     rows: sorted.map((m, i) => ({
       org: vendorFromName(m.model),
       type: m.type,
       score: round1(m.avg),
+      // bar = absolute % of max. EOG score is already a 0–100 success rate.
+      bar: Math.round(m.avg),
     })),
   };
 }
     .filter((x) => x.pooled && typeof x.pooled.point === 'number')
     .sort((a, b) => b.pooled.point - a.pooled.point)
     .slice(0, 5);
   return {
     metricLabel: 'Pass@1',
     rows: scored.map(({ s, pooled }, i) => ({
       score: round2(pooled.point),
       ciLower: round2(pooled.ci_lower),
       ciUpper: round2(pooled.ci_upper),
+      // bar = absolute % of max. EVA Pass@1 is a 0–1 proportion.
+      bar: Math.round(pooled.point * 100),
     })),
   };
 }

.claude/skills/sync-nowai-leaderboard/test/transform.test.mjs CHANGED Viewed

@@ -45,9 +45,9 @@ test('deriveEog sorts by avg desc, takes top 5, computes bars', () => {
   assert.deepEqual(out.rows.map(r => r.model), ['B', 'C', 'A']);
   assert.equal(out.rows[0].rank, 1);
   assert.equal(out.rows[0].score, 40);
-  assert.equal(out.rows[0].bar, 100);
-  assert.equal(out.rows[1].bar, 75);   // 30/40
-  assert.equal(out.rows[2].bar, 50);   // 20/40
   assert.equal(out.rows[0].type, 'open');
 });
@@ -84,11 +84,11 @@ test('deriveEvaBoard ranks by accuracy point, builds subtitle + ci + bars', () =
   assert.deepEqual(out.rows.map(r => r.name), ['Nova + GPT + Sonic', 'Gemini Live']);
   assert.equal(out.rows[0].score, 0.41);
   assert.equal(out.rows[0].subtitle, 'Mixed Models · Cascade');
-  assert.equal(out.rows[0].bar, 100);
   assert.equal(out.rows[0].ciLower, 0.39);
   assert.equal(out.rows[0].ciUpper, 0.43);
   assert.equal(out.rows[1].subtitle, 'Google · Speech-to-Speech');
-  assert.equal(out.rows[1].bar, 73); // 0.30/0.41 = 0.7317 → 73
 });
 test('deriveEvaBoard ranks experience board independently', () => {

   assert.deepEqual(out.rows.map(r => r.model), ['B', 'C', 'A']);
   assert.equal(out.rows[0].rank, 1);
   assert.equal(out.rows[0].score, 40);
+  assert.equal(out.rows[0].bar, 40);   // absolute: avg 40 of max 100
+  assert.equal(out.rows[1].bar, 30);   // avg 30
+  assert.equal(out.rows[2].bar, 20);   // avg 20
   assert.equal(out.rows[0].type, 'open');
 });
   assert.deepEqual(out.rows.map(r => r.name), ['Nova + GPT + Sonic', 'Gemini Live']);
   assert.equal(out.rows[0].score, 0.41);
   assert.equal(out.rows[0].subtitle, 'Mixed Models · Cascade');
+  assert.equal(out.rows[0].bar, 41);   // absolute: 0.41 × 100
   assert.equal(out.rows[0].ciLower, 0.39);
   assert.equal(out.rows[0].ciUpper, 0.43);
   assert.equal(out.rows[1].subtitle, 'Google · Speech-to-Speech');
+  assert.equal(out.rows[1].bar, 30); // 0.30 × 100
 });
 test('deriveEvaBoard ranks experience board independently', () => {