Spaces:
Running
Running
feat: bars show absolute % of metric max (EOG/100, EVA/1.0)
Browse files
.claude/skills/sync-nowai-leaderboard/lib/transform.mjs
CHANGED
|
@@ -52,7 +52,6 @@ export function vendorFromName(name) {
|
|
| 52 |
|
| 53 |
export function deriveEog(eogArray) {
|
| 54 |
const sorted = [...eogArray].sort((a, b) => b.avg - a.avg).slice(0, 5);
|
| 55 |
-
const top = sorted[0]?.avg ?? 0;
|
| 56 |
return {
|
| 57 |
metricLabel: 'Task Success Rate · Oracle mode',
|
| 58 |
rows: sorted.map((m, i) => ({
|
|
@@ -61,7 +60,8 @@ export function deriveEog(eogArray) {
|
|
| 61 |
org: vendorFromName(m.model),
|
| 62 |
type: m.type,
|
| 63 |
score: round1(m.avg),
|
| 64 |
-
bar: top > 0 ? Math.round((m.avg / top) * 100) : 0,
|
|
|
|
| 65 |
})),
|
| 66 |
};
|
| 67 |
}
|
|
@@ -80,7 +80,6 @@ export function deriveEvaBoard(systems, metricKey) {
|
|
| 80 |
.filter((x) => x.pooled && typeof x.pooled.point === 'number')
|
| 81 |
.sort((a, b) => b.pooled.point - a.pooled.point)
|
| 82 |
.slice(0, 5);
|
| 83 |
-
const top = scored[0]?.pooled.point ?? 0;
|
| 84 |
return {
|
| 85 |
metricLabel: 'Pass@1',
|
| 86 |
rows: scored.map(({ s, pooled }, i) => ({
|
|
@@ -93,7 +92,8 @@ export function deriveEvaBoard(systems, metricKey) {
|
|
| 93 |
score: round2(pooled.point),
|
| 94 |
ciLower: round2(pooled.ci_lower),
|
| 95 |
ciUpper: round2(pooled.ci_upper),
|
| 96 |
-
bar: top > 0 ? Math.round((pooled.point / top) * 100) : 0,
|
|
|
|
| 97 |
})),
|
| 98 |
};
|
| 99 |
}
|
|
|
|
| 52 |
|
| 53 |
export function deriveEog(eogArray) {
|
| 54 |
const sorted = [...eogArray].sort((a, b) => b.avg - a.avg).slice(0, 5);
|
|
|
|
| 55 |
return {
|
| 56 |
metricLabel: 'Task Success Rate · Oracle mode',
|
| 57 |
rows: sorted.map((m, i) => ({
|
|
|
|
| 60 |
org: vendorFromName(m.model),
|
| 61 |
type: m.type,
|
| 62 |
score: round1(m.avg),
|
| 63 |
+
// bar = absolute % of max. EOG score is already a 0–100 success rate.
|
| 64 |
+
bar: Math.round(m.avg),
|
| 65 |
})),
|
| 66 |
};
|
| 67 |
}
|
|
|
|
| 80 |
.filter((x) => x.pooled && typeof x.pooled.point === 'number')
|
| 81 |
.sort((a, b) => b.pooled.point - a.pooled.point)
|
| 82 |
.slice(0, 5);
|
|
|
|
| 83 |
return {
|
| 84 |
metricLabel: 'Pass@1',
|
| 85 |
rows: scored.map(({ s, pooled }, i) => ({
|
|
|
|
| 92 |
score: round2(pooled.point),
|
| 93 |
ciLower: round2(pooled.ci_lower),
|
| 94 |
ciUpper: round2(pooled.ci_upper),
|
| 95 |
+
// bar = absolute % of max. EVA Pass@1 is a 0–1 proportion.
|
| 96 |
+
bar: Math.round(pooled.point * 100),
|
| 97 |
})),
|
| 98 |
};
|
| 99 |
}
|
.claude/skills/sync-nowai-leaderboard/test/transform.test.mjs
CHANGED
|
@@ -45,9 +45,9 @@ test('deriveEog sorts by avg desc, takes top 5, computes bars', () => {
|
|
| 45 |
assert.deepEqual(out.rows.map(r => r.model), ['B', 'C', 'A']);
|
| 46 |
assert.equal(out.rows[0].rank, 1);
|
| 47 |
assert.equal(out.rows[0].score, 40);
|
| 48 |
-
assert.equal(out.rows[0].bar, 100);
|
| 49 |
-
assert.equal(out.rows[1].bar, 75);
|
| 50 |
-
assert.equal(out.rows[2].bar, 50);
|
| 51 |
assert.equal(out.rows[0].type, 'open');
|
| 52 |
});
|
| 53 |
|
|
@@ -84,11 +84,11 @@ test('deriveEvaBoard ranks by accuracy point, builds subtitle + ci + bars', () =
|
|
| 84 |
assert.deepEqual(out.rows.map(r => r.name), ['Nova + GPT + Sonic', 'Gemini Live']);
|
| 85 |
assert.equal(out.rows[0].score, 0.41);
|
| 86 |
assert.equal(out.rows[0].subtitle, 'Mixed Models · Cascade');
|
| 87 |
-
assert.equal(out.rows[0].bar, 100);
|
| 88 |
assert.equal(out.rows[0].ciLower, 0.39);
|
| 89 |
assert.equal(out.rows[0].ciUpper, 0.43);
|
| 90 |
assert.equal(out.rows[1].subtitle, 'Google · Speech-to-Speech');
|
| 91 |
-
assert.equal(out.rows[1].bar, 73);
|
| 92 |
});
|
| 93 |
|
| 94 |
test('deriveEvaBoard ranks experience board independently', () => {
|
|
|
|
| 45 |
assert.deepEqual(out.rows.map(r => r.model), ['B', 'C', 'A']);
|
| 46 |
assert.equal(out.rows[0].rank, 1);
|
| 47 |
assert.equal(out.rows[0].score, 40);
|
| 48 |
+
assert.equal(out.rows[0].bar, 40); // absolute: avg 40 of max 100
|
| 49 |
+
assert.equal(out.rows[1].bar, 30); // avg 30
|
| 50 |
+
assert.equal(out.rows[2].bar, 20); // avg 20
|
| 51 |
assert.equal(out.rows[0].type, 'open');
|
| 52 |
});
|
| 53 |
|
|
|
|
| 84 |
assert.deepEqual(out.rows.map(r => r.name), ['Nova + GPT + Sonic', 'Gemini Live']);
|
| 85 |
assert.equal(out.rows[0].score, 0.41);
|
| 86 |
assert.equal(out.rows[0].subtitle, 'Mixed Models · Cascade');
|
| 87 |
+
assert.equal(out.rows[0].bar, 41); // absolute: 0.41 × 100
|
| 88 |
assert.equal(out.rows[0].ciLower, 0.39);
|
| 89 |
assert.equal(out.rows[0].ciUpper, 0.43);
|
| 90 |
assert.equal(out.rows[1].subtitle, 'Google · Speech-to-Speech');
|
| 91 |
+
assert.equal(out.rows[1].bar, 30); // 0.30 × 100
|
| 92 |
});
|
| 93 |
|
| 94 |
test('deriveEvaBoard ranks experience board independently', () => {
|