Spaces:
Running
Running
Adding banner and sequence image
Browse files
app/src/content/article.mdx
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
---
|
| 2 |
-
title: "Are LLMs any good at the Science Game?
|
| 3 |
-
subtitle: "
|
| 4 |
description: "A benchmark for evaluating LLM scientific reasoning using the card game Eleusis, testing iterative hypothesis formation, calibration, and strategic experimentation."
|
| 5 |
authors:
|
| 6 |
- name: "David Louapre"
|
|
|
|
| 1 |
---
|
| 2 |
+
title: "Are LLMs any good at the Science Game?"
|
| 3 |
+
subtitle: "Evaluating scientific reasoning using the card game Eleusis"
|
| 4 |
description: "A benchmark for evaluating LLM scientific reasoning using the card game Eleusis, testing iterative hypothesis formation, calibration, and strategic experimentation."
|
| 5 |
authors:
|
| 6 |
- name: "David Louapre"
|
app/src/content/assets/data/overall_performance.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c620d1614704161071e6b3fdf51031228bc35a0aab8f70d6221f024a68e21e32
|
| 3 |
+
size 1413
|
app/src/content/assets/image/example_sequence.png
ADDED
|
Git LFS Details
|
app/src/content/chapters/eleusis/introduction.mdx
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
import Sidenote from "../../../components/Sidenote.astro";
|
| 2 |
-
import
|
|
|
|
|
|
|
| 3 |
|
| 4 |
Large language models are increasingly being deployed as tools for scientific research—analyzing data, generating hypotheses, and even designing experiments. But how well do they actually embody the scientific method?
|
| 5 |
|
|
@@ -25,9 +27,12 @@ Eleusis was designed by Robert Abbott explicitly to simulate the process of scie
|
|
| 25 |
|
| 26 |
It's a microcosm of the scientific method: the rule is a hidden law of nature, each card play is an experiment, and the sequence of accepted and rejected cards is the accumulating evidence.
|
| 27 |
|
| 28 |
-
<
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
We built a benchmark around Eleusis to evaluate LLMs on this iterative, hypothesis-driven reasoning. Rather than testing knowledge retrieval or instruction-following, our benchmark asks: can models act like scientists? Can they observe evidence, form hypotheses, design informative experiments, and refine their theories? Can they calibrate their confidence appropriately and know when they've gathered enough evidence to commit to a conclusion?
|
| 33 |
|
|
|
|
| 1 |
import Sidenote from "../../../components/Sidenote.astro";
|
| 2 |
+
import Image from "../../../components/Image.astro";
|
| 3 |
+
|
| 4 |
+
import exampleSequence from "../../assets/image/example_sequence.png";
|
| 5 |
|
| 6 |
Large language models are increasingly being deployed as tools for scientific research—analyzing data, generating hypotheses, and even designing experiments. But how well do they actually embody the scientific method?
|
| 7 |
|
|
|
|
| 27 |
|
| 28 |
It's a microcosm of the scientific method: the rule is a hidden law of nature, each card play is an experiment, and the sequence of accepted and rejected cards is the accumulating evidence.
|
| 29 |
|
| 30 |
+
<Image
|
| 31 |
+
src={exampleSequence}
|
| 32 |
+
alt="Example Eleusis game sequence with the secret rule 'alternating colors': mainline shows 5♠, K♥, J♠, A♦, 6♣ following the pattern, while the sideline below shows rejected cards 10♠ and 2♦"
|
| 33 |
+
caption="An example Eleusis game with the secret rule 'alternating colors'. The main line (top) shows accepted cards: 5♠ → K♥ → J♠ → A♦ → 6♣, each alternating between black and red. The sideline (bottom) shows rejected cards that would have violated the pattern."
|
| 34 |
+
id="fig-example-sequence"
|
| 35 |
+
/>
|
| 36 |
|
| 37 |
We built a benchmark around Eleusis to evaluate LLMs on this iterative, hypothesis-driven reasoning. Rather than testing knowledge retrieval or instruction-following, our benchmark asks: can models act like scientists? Can they observe evidence, form hypotheses, design informative experiments, and refine their theories? Can they calibrate their confidence appropriately and know when they've gathered enough evidence to commit to a conclusion?
|
| 38 |
|
app/src/content/embeds/banner.html
CHANGED
|
@@ -1,258 +1,449 @@
|
|
| 1 |
-
<div class="d3-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
<script>
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
if (
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
const minCircleSize = 4; // minimum diameter in pixels
|
| 37 |
-
const maxCircleSize = 12; // maximum diameter in pixels
|
| 38 |
-
|
| 39 |
-
// Generate spiral + bulge
|
| 40 |
-
const twoPi = Math.PI * 2;
|
| 41 |
-
const t = Float64Array.from({ length: numPoints }, () => Math.random() * (twoPi * numTurns));
|
| 42 |
-
const armIndices = Int16Array.from({ length: numPoints }, () => Math.floor(Math.random() * numArms));
|
| 43 |
-
const armOffsets = Float64Array.from(armIndices, (k) => k * (twoPi / numArms));
|
| 44 |
-
const theta = Float64Array.from(t, (tv, i) => tv + armOffsets[i] + d3.randomNormal.source(Math.random)(0, angleJitter)());
|
| 45 |
-
const rNorm = Float64Array.from(t, (tv) => Math.pow(tv / (twoPi * numTurns), 0.9));
|
| 46 |
-
const noiseScale = (rn) => posNoise * (0.8 + 0.6 * rn);
|
| 47 |
-
const noiseX = Float64Array.from(rNorm, (rn) => d3.randomNormal.source(Math.random)(0, noiseScale(rn))());
|
| 48 |
-
const noiseY = Float64Array.from(rNorm, (rn) => d3.randomNormal.source(Math.random)(0, noiseScale(rn))());
|
| 49 |
-
|
| 50 |
-
const xSpiral = Float64Array.from(theta, (th, i) => cx + a * rNorm[i] * Math.cos(th) + noiseX[i]);
|
| 51 |
-
const ySpiral = Float64Array.from(theta, (th, i) => cy + b * rNorm[i] * Math.sin(th) + noiseY[i]);
|
| 52 |
-
|
| 53 |
-
const bulgePoints = Math.floor(0.18 * numPoints);
|
| 54 |
-
const phiB = Float64Array.from({ length: bulgePoints }, () => twoPi * Math.random());
|
| 55 |
-
const rB = Float64Array.from({ length: bulgePoints }, () => Math.pow(Math.random(), 2.2) * 0.22);
|
| 56 |
-
const noiseXB = Float64Array.from({ length: bulgePoints }, () => d3.randomNormal.source(Math.random)(0, posNoise * 0.6)());
|
| 57 |
-
const noiseYB = Float64Array.from({ length: bulgePoints }, () => d3.randomNormal.source(Math.random)(0, posNoise * 0.6)());
|
| 58 |
-
const xBulge = Float64Array.from(phiB, (ph, i) => cx + a * rB[i] * Math.cos(ph) + noiseXB[i]);
|
| 59 |
-
const yBulge = Float64Array.from(phiB, (ph, i) => cy + b * rB[i] * Math.sin(ph) + noiseYB[i]);
|
| 60 |
-
|
| 61 |
-
// Concatenate
|
| 62 |
-
const X = Array.from(xSpiral).concat(Array.from(xBulge));
|
| 63 |
-
const Y = Array.from(ySpiral).concat(Array.from(yBulge));
|
| 64 |
-
const lenSpiral = xSpiral.length;
|
| 65 |
-
|
| 66 |
-
const zSpiral = Array.from(rNorm, (rn) => 1 - rn);
|
| 67 |
-
const maxRB = rB && rB.length ? (window.d3 && d3.max ? d3.max(rB) : Math.max.apply(null, Array.from(rB))) : 1;
|
| 68 |
-
const zBulge = Array.from(rB, (rb) => 1 - (maxRB ? rb / maxRB : 0));
|
| 69 |
-
const Zraw = zSpiral.concat(zBulge);
|
| 70 |
-
const sizesPx = Zraw.map((z) => minCircleSize + z * (maxCircleSize - minCircleSize)); // diameter in pixels
|
| 71 |
-
|
| 72 |
-
// Labels (same categories as Python version)
|
| 73 |
-
const labelOf = (i) => {
|
| 74 |
-
const z = Zraw[i];
|
| 75 |
-
if (z < 0.25) return 'tiny star';
|
| 76 |
-
if (z < 0.5) return 'small star';
|
| 77 |
-
if (z < 0.75) return 'medium star';
|
| 78 |
-
return 'large star';
|
| 79 |
-
};
|
| 80 |
-
|
| 81 |
-
// Sort by size ascending for z-index: small first, big last
|
| 82 |
-
const idx = d3.range(X.length).sort((i, j) => sizesPx[i] - sizesPx[j]);
|
| 83 |
-
|
| 84 |
-
// Colors: piecewise gradient [0 -> 0.5 -> 1]
|
| 85 |
-
const c0 = d3.rgb(78, 165, 183); // rgb(78, 165, 183)
|
| 86 |
-
const c1 = d3.rgb(206, 192, 250); // rgb(206, 192, 250)
|
| 87 |
-
const c2 = d3.rgb(232, 137, 171); // rgb(232, 137, 171)
|
| 88 |
-
const interp01 = d3.interpolateRgb(c0, c1);
|
| 89 |
-
const interp12 = d3.interpolateRgb(c1, c2);
|
| 90 |
-
const colorFor = (v) => {
|
| 91 |
-
const t = Math.max(0, Math.min(1, v));
|
| 92 |
-
return t <= 0.5 ? interp01(t / 0.5) : interp12((t - 0.5) / 0.5);
|
| 93 |
-
};
|
| 94 |
-
|
| 95 |
-
// Create SVG
|
| 96 |
-
const svg = d3.select(container).append('svg')
|
| 97 |
-
.attr('width', '100%')
|
| 98 |
-
.style('display', 'block')
|
| 99 |
-
.style('cursor', 'crosshair');
|
| 100 |
-
|
| 101 |
-
const render = () => {
|
| 102 |
-
const width = container.clientWidth || 800;
|
| 103 |
-
const height = Math.max(260, Math.round(width / 3)); // keep ~3:1, min height
|
| 104 |
-
svg.attr('width', width).attr('height', height);
|
| 105 |
-
|
| 106 |
-
const xScale = d3.scaleLinear().domain([0, 3]).range([0, width]);
|
| 107 |
-
const yScale = d3.scaleLinear().domain([0, 1]).range([height, 0]);
|
| 108 |
-
|
| 109 |
-
// Subtle stroke color depending on theme
|
| 110 |
-
const isDark = document.documentElement.getAttribute('data-theme') === 'dark';
|
| 111 |
-
const strokeColor = isDark ? 'rgba(255,255,255,0.18)' : 'rgba(0,0,0,0.12)';
|
| 112 |
-
const glowColor = isDark ? 'rgba(255,255,255,0.35)' : 'rgba(0,0,0,0.25)';
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
// Group for points (no blend mode for better print/PDF visibility)
|
| 116 |
-
const g = svg.selectAll('g.points').data([0]).join('g').attr('class', 'points');
|
| 117 |
-
|
| 118 |
-
// Ensure container can host an absolute tooltip
|
| 119 |
-
container.style.position = container.style.position || 'relative';
|
| 120 |
-
let tip = container.querySelector('.d3-tooltip');
|
| 121 |
-
let tipInner;
|
| 122 |
-
if (!tip) {
|
| 123 |
-
tip = document.createElement('div');
|
| 124 |
-
tip.className = 'd3-tooltip';
|
| 125 |
-
Object.assign(tip.style, {
|
| 126 |
-
position: 'absolute',
|
| 127 |
-
top: '0px',
|
| 128 |
-
left: '0px',
|
| 129 |
-
transform: 'translate(-9999px, -9999px)',
|
| 130 |
-
pointerEvents: 'none',
|
| 131 |
-
padding: '10px 12px',
|
| 132 |
-
borderRadius: '12px',
|
| 133 |
-
fontSize: '12px',
|
| 134 |
-
lineHeight: '1.35',
|
| 135 |
-
border: '1px solid var(--border-color)',
|
| 136 |
-
background: 'var(--surface-bg)',
|
| 137 |
-
color: 'var(--text-color)',
|
| 138 |
-
boxShadow: '0 8px 32px rgba(0,0,0,.28), 0 2px 8px rgba(0,0,0,.12)',
|
| 139 |
-
opacity: '0',
|
| 140 |
-
transition: 'opacity .12s ease',
|
| 141 |
-
backdropFilter: 'saturate(1.12) blur(8px)',
|
| 142 |
-
zIndex: '20'
|
| 143 |
-
});
|
| 144 |
-
tipInner = document.createElement('div');
|
| 145 |
-
tipInner.className = 'd3-tooltip__inner';
|
| 146 |
-
Object.assign(tipInner.style, {
|
| 147 |
-
textAlign: 'left',
|
| 148 |
-
display: 'flex',
|
| 149 |
-
flexDirection: 'column',
|
| 150 |
-
gap: '6px',
|
| 151 |
-
minWidth: '220px'
|
| 152 |
-
});
|
| 153 |
-
tip.appendChild(tipInner);
|
| 154 |
-
container.appendChild(tip);
|
| 155 |
-
} else {
|
| 156 |
-
tipInner = tip.querySelector('.d3-tooltip__inner') || tip;
|
| 157 |
-
}
|
| 158 |
-
|
| 159 |
-
// Final filter: remove small dots very close to the galaxy center (after placement)
|
| 160 |
-
const centerHoleRadius = 0.48; // elliptical radius threshold
|
| 161 |
-
const smallSizeThreshold = 7.5; // same notion as Python size cut
|
| 162 |
-
const rTotal = idx.map((i) => Math.sqrt(((X[i] - cx) / a) ** 2 + ((Y[i] - cy) / b) ** 2));
|
| 163 |
-
const idxFiltered = idx.filter((i, k) => !(rTotal[k] <= centerHoleRadius && sizesPx[i] < smallSizeThreshold));
|
| 164 |
-
|
| 165 |
-
const sel = g.selectAll('circle').data(idxFiltered, (i) => i);
|
| 166 |
-
sel.join(
|
| 167 |
-
(enter) => enter.append('circle')
|
| 168 |
-
.attr('cx', (i) => xScale(X[i]))
|
| 169 |
-
.attr('cy', (i) => yScale(Y[i]))
|
| 170 |
-
.attr('r', (i) => sizesPx[i] / 2)
|
| 171 |
-
.attr('fill', (i) => colorFor(Zraw[i]))
|
| 172 |
-
.attr('fill-opacity', 0.9)
|
| 173 |
-
.on('mouseenter', function (ev, i) {
|
| 174 |
-
d3.select(this).raise()
|
| 175 |
-
.style('filter', `drop-shadow(0 0 8px ${glowColor})`)
|
| 176 |
-
.transition().duration(120).ease(d3.easeCubicOut)
|
| 177 |
-
.attr('r', (sizesPx[i] / 2) * 1.25)
|
| 178 |
-
.attr('fill-opacity', 1);
|
| 179 |
-
const r = Math.sqrt(((X[i] - cx) / a) ** 2 + ((Y[i] - cy) / b) ** 2);
|
| 180 |
-
const type = i < lenSpiral ? 'spiral' : 'bulge';
|
| 181 |
-
const arm = i < lenSpiral ? (armIndices[i] + 1) : null;
|
| 182 |
-
tipInner.innerHTML =
|
| 183 |
-
`<div style="font-weight:800;letter-spacing:.1px;"><strong>${labelOf(i)}</strong></div>` +
|
| 184 |
-
`<div style="font-size:11px;color:var(--muted-color);margin-top:-4px;margin-bottom:2px;letter-spacing:.1px;"><strong>Type</strong> ${type}${arm ? ` (Arm ${arm})` : ''}</div>` +
|
| 185 |
-
`<div style="padding-top:6px;border-top:1px solid var(--border-color);"><strong>Position</strong> X ${X[i].toFixed(2)} · <strong>Y</strong> ${Y[i].toFixed(2)}</div>` +
|
| 186 |
-
`<div><strong>Distance</strong> Radius ${r.toFixed(3)} · <strong>Z</strong> ${Zraw[i].toFixed(3)}</div>` +
|
| 187 |
-
`<div><strong>Size</strong> ${sizesPx[i].toFixed(1)} px</div>`;
|
| 188 |
-
tip.style.opacity = '1';
|
| 189 |
-
})
|
| 190 |
-
.on('mousemove', (ev, i) => {
|
| 191 |
-
const [mx, my] = d3.pointer(ev, container);
|
| 192 |
-
const offsetX = 10, offsetY = 12;
|
| 193 |
-
tip.style.transform = `translate(${Math.round(mx + offsetX)}px, ${Math.round(my + offsetY)}px)`;
|
| 194 |
-
})
|
| 195 |
-
.on('mouseleave', function () {
|
| 196 |
-
tip.style.opacity = '0';
|
| 197 |
-
tip.style.transform = 'translate(-9999px, -9999px)';
|
| 198 |
-
d3.select(this)
|
| 199 |
-
.style('filter', null)
|
| 200 |
-
.transition().duration(120).ease(d3.easeCubicOut)
|
| 201 |
-
.attr('r', (i2) => sizesPx[i2] / 2)
|
| 202 |
-
.attr('fill-opacity', 0.9);
|
| 203 |
-
}),
|
| 204 |
-
(update) => update
|
| 205 |
-
.attr('cx', (i) => xScale(X[i]))
|
| 206 |
-
.attr('cy', (i) => yScale(Y[i]))
|
| 207 |
-
.attr('r', (i) => sizesPx[i] / 2)
|
| 208 |
-
.attr('fill', (i) => colorFor(Zraw[i]))
|
| 209 |
-
.attr('fill-opacity', 0.9)
|
| 210 |
-
.on('mouseenter', function (ev, i) {
|
| 211 |
-
d3.select(this).raise()
|
| 212 |
-
.style('filter', `drop-shadow(0 0 8px ${glowColor})`)
|
| 213 |
-
.transition().duration(120).ease(d3.easeCubicOut)
|
| 214 |
-
.attr('r', (sizesPx[i] / 2) * 1.25)
|
| 215 |
-
.attr('fill-opacity', 1);
|
| 216 |
-
const r = Math.sqrt(((X[i] - cx) / a) ** 2 + ((Y[i] - cy) / b) ** 2);
|
| 217 |
-
const type = i < lenSpiral ? 'spiral' : 'bulge';
|
| 218 |
-
const arm = i < lenSpiral ? (armIndices[i] + 1) : null;
|
| 219 |
-
tipInner.innerHTML =
|
| 220 |
-
`<div style="font-weight:800;letter-spacing:.1px;"><strong>${labelOf(i)}</strong></div>` +
|
| 221 |
-
`<div style="font-size:11px;color:var(--muted-color);margin-top:-4px;margin-bottom:2px;letter-spacing:.1px;"><strong>Type</strong> ${type}${arm ? ` (Arm ${arm})` : ''}</div>` +
|
| 222 |
-
`<div style="padding-top:6px;border-top:1px solid var(--border-color);"><strong>Position</strong> X ${X[i].toFixed(2)} · <strong>Y</strong> ${Y[i].toFixed(2)}</div>` +
|
| 223 |
-
`<div><strong>Distance</strong> Radius ${r.toFixed(3)} · <strong>Z</strong> ${Zraw[i].toFixed(3)}</div>` +
|
| 224 |
-
`<div><strong>Size</strong> ${sizesPx[i].toFixed(1)} px</div>`;
|
| 225 |
-
tip.style.opacity = '1';
|
| 226 |
-
})
|
| 227 |
-
.on('mousemove', (ev, i) => {
|
| 228 |
-
const [mx, my] = d3.pointer(ev, container);
|
| 229 |
-
const offsetX = 10, offsetY = 12;
|
| 230 |
-
tip.style.transform = `translate(${Math.round(mx + offsetX)}px, ${Math.round(my + offsetY)}px)`;
|
| 231 |
-
})
|
| 232 |
-
.on('mouseleave', function () {
|
| 233 |
-
tip.style.opacity = '0';
|
| 234 |
-
tip.style.transform = 'translate(-9999px, -9999px)';
|
| 235 |
-
d3.select(this)
|
| 236 |
-
.style('filter', null)
|
| 237 |
-
.transition().duration(120).ease(d3.easeCubicOut)
|
| 238 |
-
.attr('r', (i2) => sizesPx[i2] / 2)
|
| 239 |
-
.attr('fill-opacity', 0.9);
|
| 240 |
-
})
|
| 241 |
-
);
|
| 242 |
-
};
|
| 243 |
-
|
| 244 |
-
// First render + resize
|
| 245 |
-
if (window.ResizeObserver) {
|
| 246 |
-
const ro = new ResizeObserver(() => render());
|
| 247 |
-
ro.observe(container);
|
| 248 |
-
} else {
|
| 249 |
-
window.addEventListener('resize', render);
|
| 250 |
-
}
|
| 251 |
-
render();
|
| 252 |
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
<div class="d3-overall-performance"></div>
|
| 2 |
+
<style>
|
| 3 |
+
.d3-overall-performance {
|
| 4 |
+
width: 100%;
|
| 5 |
+
margin: 10px 0;
|
| 6 |
+
position: relative;
|
| 7 |
+
font-family: system-ui, -apple-system, sans-serif;
|
| 8 |
+
}
|
| 9 |
+
|
| 10 |
+
.d3-overall-performance svg {
|
| 11 |
+
display: block;
|
| 12 |
+
width: 100%;
|
| 13 |
+
height: auto;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
.d3-overall-performance .axes path,
|
| 17 |
+
.d3-overall-performance .axes line {
|
| 18 |
+
stroke: var(--axis-color, var(--text-color));
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
.d3-overall-performance .axes text {
|
| 22 |
+
fill: var(--tick-color, var(--muted-color));
|
| 23 |
+
font-size: 11px;
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
.d3-overall-performance .grid line {
|
| 27 |
+
stroke: var(--grid-color, rgba(0,0,0,.08));
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
.d3-overall-performance .axis-label {
|
| 31 |
+
font-size: 12px;
|
| 32 |
+
font-weight: 500;
|
| 33 |
+
fill: var(--text-color);
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
.d3-overall-performance .chart-title {
|
| 37 |
+
font-size: 16px;
|
| 38 |
+
font-weight: 600;
|
| 39 |
+
fill: var(--text-color);
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
.d3-overall-performance .point {
|
| 43 |
+
cursor: pointer;
|
| 44 |
+
transition: opacity 0.15s ease;
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
.d3-overall-performance .point:hover {
|
| 48 |
+
opacity: 0.8;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
.d3-overall-performance .point-label {
|
| 52 |
+
font-size: 11px;
|
| 53 |
+
fill: var(--text-color);
|
| 54 |
+
pointer-events: none;
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
.d3-overall-performance .header {
|
| 58 |
+
display: flex;
|
| 59 |
+
flex-wrap: wrap;
|
| 60 |
+
gap: 16px;
|
| 61 |
+
justify-content: space-between;
|
| 62 |
+
align-items: flex-start;
|
| 63 |
+
margin-top: 12px;
|
| 64 |
+
padding: 0 8px;
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
.d3-overall-performance .legend {
|
| 68 |
+
display: flex;
|
| 69 |
+
flex-direction: column;
|
| 70 |
+
align-items: flex-start;
|
| 71 |
+
gap: 6px;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
.d3-overall-performance .legend-title {
|
| 75 |
+
font-size: 12px;
|
| 76 |
+
font-weight: 700;
|
| 77 |
+
color: var(--text-color);
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
.d3-overall-performance .legend .items {
|
| 81 |
+
display: flex;
|
| 82 |
+
flex-wrap: wrap;
|
| 83 |
+
gap: 8px 14px;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
.d3-overall-performance .legend .item {
|
| 87 |
+
display: inline-flex;
|
| 88 |
+
align-items: center;
|
| 89 |
+
gap: 6px;
|
| 90 |
+
white-space: nowrap;
|
| 91 |
+
font-size: 12px;
|
| 92 |
+
color: var(--text-color);
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
.d3-overall-performance .legend .swatch {
|
| 96 |
+
width: 14px;
|
| 97 |
+
height: 14px;
|
| 98 |
+
border-radius: 50%;
|
| 99 |
+
border: 2px solid var(--border-color);
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
.d3-overall-performance .legend .swatch.filled {
|
| 103 |
+
border: none;
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
.d3-overall-performance .d3-tooltip {
|
| 107 |
+
position: absolute;
|
| 108 |
+
top: 0;
|
| 109 |
+
left: 0;
|
| 110 |
+
transform: translate(-9999px, -9999px);
|
| 111 |
+
pointer-events: none;
|
| 112 |
+
padding: 10px 12px;
|
| 113 |
+
border-radius: 8px;
|
| 114 |
+
font-size: 12px;
|
| 115 |
+
line-height: 1.4;
|
| 116 |
+
border: 1px solid var(--border-color);
|
| 117 |
+
background: var(--surface-bg);
|
| 118 |
+
color: var(--text-color);
|
| 119 |
+
box-shadow: 0 4px 24px rgba(0,0,0,.18);
|
| 120 |
+
opacity: 0;
|
| 121 |
+
transition: opacity 0.12s ease;
|
| 122 |
+
z-index: 10;
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
.d3-overall-performance .d3-tooltip .model-name {
|
| 126 |
+
font-weight: 600;
|
| 127 |
+
margin-bottom: 4px;
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
.d3-overall-performance .d3-tooltip .metric {
|
| 131 |
+
display: flex;
|
| 132 |
+
justify-content: space-between;
|
| 133 |
+
gap: 16px;
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
.d3-overall-performance .d3-tooltip .metric-label {
|
| 137 |
+
color: var(--muted-color);
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
.d3-overall-performance .d3-tooltip .metric-value {
|
| 141 |
+
font-weight: 500;
|
| 142 |
+
}
|
| 143 |
+
</style>
|
| 144 |
<script>
|
| 145 |
+
(() => {
|
| 146 |
+
/**
 * Run `cb` once the global D3 library is usable.
 *
 * When `window.d3.select` already exists the callback runs synchronously and
 * its return value is passed through. Otherwise a single shared
 * `<script id="d3-cdn-script">` tag is injected into `<head>` (or reused when
 * it is already present in the document) and `cb` runs after that script's
 * `load` event, provided D3 actually registered itself on `window`.
 *
 * @param {() => unknown} cb - Called with no arguments once D3 is available.
 * @returns {unknown} The result of `cb` on the synchronous path, otherwise `undefined`.
 */
const ensureD3 = (cb) => {
  const isD3Ready = () => window.d3 && typeof window.d3.select === 'function';

  if (isD3Ready()) return cb();

  const scriptId = 'd3-cdn-script';
  const scriptUrl = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';

  let s = document.getElementById(scriptId);
  if (!s) {
    s = document.createElement('script');
    s.id = scriptId;
    s.src = scriptUrl;
    document.head.appendChild(s);
  }

  // Re-check readiness on load: the event only says the script executed,
  // not that it exposed a usable `d3` global.
  s.addEventListener(
    'load',
    () => {
      if (isD3Ready()) cb();
    },
    { once: true },
  );

  // Without this the embed would simply stay blank when the CDN is
  // unreachable or blocked; surface the reason in the console instead.
  s.addEventListener(
    'error',
    () => {
      console.error(`Failed to load D3 from ${scriptUrl}; chart will not render.`);
    },
    { once: true },
  );
};
|
| 159 |
+
|
| 160 |
+
const bootstrap = () => {
|
| 161 |
+
const scriptEl = document.currentScript;
|
| 162 |
+
let container = scriptEl ? scriptEl.previousElementSibling : null;
|
| 163 |
+
if (!(container && container.classList && container.classList.contains('d3-overall-performance'))) {
|
| 164 |
+
const candidates = Array.from(document.querySelectorAll('.d3-overall-performance'))
|
| 165 |
+
.filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
|
| 166 |
+
container = candidates[candidates.length - 1] || null;
|
| 167 |
+
}
|
| 168 |
+
if (!container) return;
|
| 169 |
+
if (container.dataset) {
|
| 170 |
+
if (container.dataset.mounted === 'true') return;
|
| 171 |
+
container.dataset.mounted = 'true';
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
// Tooltip setup
|
| 175 |
+
container.style.position = container.style.position || 'relative';
|
| 176 |
+
const tip = document.createElement('div');
|
| 177 |
+
tip.className = 'd3-tooltip';
|
| 178 |
+
container.appendChild(tip);
|
| 179 |
+
|
| 180 |
+
// SVG setup
|
| 181 |
+
const svg = d3.select(container).append('svg');
|
| 182 |
+
const gRoot = svg.append('g');
|
| 183 |
+
|
| 184 |
+
// Chart groups
|
| 185 |
+
const gGrid = gRoot.append('g').attr('class', 'grid');
|
| 186 |
+
const gAxes = gRoot.append('g').attr('class', 'axes');
|
| 187 |
+
const gPoints = gRoot.append('g').attr('class', 'points');
|
| 188 |
+
const gLabels = gRoot.append('g').attr('class', 'labels');
|
| 189 |
+
|
| 190 |
+
// State
|
| 191 |
+
let data = null;
|
| 192 |
+
let width = 800;
|
| 193 |
+
let height = 450;
|
| 194 |
+
const margin = { top: 40, right: 120, bottom: 56, left: 72 };
|
| 195 |
+
|
| 196 |
+
// Scales
|
| 197 |
+
const xScale = d3.scaleLinear();
|
| 198 |
+
const yScale = d3.scaleLinear();
|
| 199 |
+
|
| 200 |
+
// Data loading
|
| 201 |
+
const JSON_PATHS = [
|
| 202 |
+
'/data/overall_performance.json',
|
| 203 |
+
'./assets/figures/overall_performance.json',
|
| 204 |
+
'../assets/figures/overall_performance.json',
|
| 205 |
+
'../../assets/figures/overall_performance.json'
|
| 206 |
+
];
|
| 207 |
|
| 208 |
+
/**
 * Fetch JSON from the first candidate URL that responds successfully.
 *
 * Candidates are tried strictly in order. A candidate is skipped when the
 * request rejects, the response is not `ok`, or the body is not valid JSON.
 *
 * @param {Iterable<string>} paths - Candidate URLs, most preferred first.
 * @returns {Promise<unknown>} The parsed JSON body of the first usable response.
 * @throws {Error} `"Data not found"` when no candidate works; the most recent
 *   failure (if any) is attached as `error.cause` to aid debugging.
 */
const fetchFirstAvailable = async (paths) => {
  let lastFailure;

  for (const p of paths) {
    try {
      const r = await fetch(p, { cache: 'no-cache' });
      if (r.ok) return await r.json();
      lastFailure = new Error(`HTTP ${r.status} for ${p}`);
    } catch (err) {
      // Best-effort probing: remember why this candidate failed and move on.
      lastFailure = err;
    }
  }

  throw new Error('Data not found', { cause: lastFailure });
};
|
| 217 |
+
|
| 218 |
+
/**
 * Recompute the chart size from the container and apply it to the SVG.
 *
 * Updates the enclosing `width` / `height` state, sets the SVG's `width`,
 * `height` and `viewBox` attributes, and offsets the root group by the
 * left/top margins.
 *
 * @returns {{ innerWidth: number, innerHeight: number }} Size of the plotting
 *   area inside the margins, never negative.
 */
function updateSize() {
  // `clientWidth` is 0 while the container is detached or hidden; fall back
  // to a sensible default so the first render still produces a chart.
  width = container.clientWidth || 800;
  height = Math.max(300, Math.round(width / 1.78)); // 16:9 aspect ratio

  svg
    .attr('width', width)
    .attr('height', height)
    .attr('viewBox', `0 0 ${width} ${height}`);
  gRoot.attr('transform', `translate(${margin.left},${margin.top})`);

  // Clamp at zero: a container narrower than the horizontal margins would
  // otherwise yield a negative extent and an inverted scale range.
  return {
    innerWidth: Math.max(0, width - margin.left - margin.right),
    innerHeight: Math.max(0, height - margin.top - margin.bottom),
  };
}
|
| 228 |
+
|
| 229 |
+
/**
 * Fill the tooltip with one model's metrics and place it next to the pointer.
 *
 * The tooltip is positioned relative to the container: to the right of the
 * cursor and vertically centred on it, flipped to the left when it would
 * overflow the chart width and clamped vertically to the chart height.
 *
 * @param {MouseEvent} event - Pointer event; only `clientX` / `clientY` are read.
 * @param {object} d - Datum bound to the hovered point. Reads `name`, `color`,
 *   `avg_score`, `avg_output_tokens_per_turn` and `is_open`.
 */
function showTooltip(event, d) {
  // Values come from a JSON file and are injected through innerHTML, so
  // escape them: a name such as "a<b" or "R&D" must render as text.
  const htmlEntities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => htmlEntities[ch]);

  const rect = container.getBoundingClientRect();
  const x = event.clientX - rect.left;
  const y = event.clientY - rect.top;

  const score = d.avg_score.toFixed(2);
  const tokensPerTurn = Math.round(d.avg_output_tokens_per_turn).toLocaleString();
  const licence = d.is_open ? 'Open' : 'Closed';

  tip.innerHTML = `
    <div class="model-name" style="color: ${escapeHtml(d.color)}">${escapeHtml(d.name)}</div>
    <div class="metric">
      <span class="metric-label">Score:</span>
      <span class="metric-value">${escapeHtml(score)}</span>
    </div>
    <div class="metric">
      <span class="metric-label">Tokens/Turn:</span>
      <span class="metric-value">${escapeHtml(tokensPerTurn)}</span>
    </div>
    <div class="metric">
      <span class="metric-label">Type:</span>
      <span class="metric-value">${licence}</span>
    </div>
  `;

  // Measure after the content is set; fall back to rough estimates when the
  // element reports no size yet.
  const tipWidth = tip.offsetWidth || 150;
  const tipHeight = tip.offsetHeight || 80;
  let tipX = x + 12;
  let tipY = y - tipHeight / 2;

  if (tipX + tipWidth > width) tipX = x - tipWidth - 12; // flip to the left of the cursor
  if (tipY < 0) tipY = 8;
  if (tipY + tipHeight > height) tipY = height - tipHeight - 8;

  tip.style.transform = `translate(${tipX}px, ${tipY}px)`;
  tip.style.opacity = '1';
}
|
| 262 |
+
|
| 263 |
+
/**
 * Hide the tooltip: fade it out and move it far off-screen so it cannot
 * cover the chart while invisible.
 */
function hideTooltip() {
  Object.assign(tip.style, {
    opacity: '0',
    transform: 'translate(-9999px, -9999px)',
  });
}
|
| 267 |
+
|
| 268 |
+
/**
 * Redraws the scatter plot (grid, axes, labels, title, points and point
 * labels) from the currently loaded `data`, using the size reported by
 * `updateSize()`. Does nothing while `data` is still unset.
 */
function render() {
  if (!data) {
    return;
  }

  const { innerWidth, innerHeight } = updateSize();
  const { models } = data;

  // Field accessors shared by the scales, the points and the labels.
  const tokensOf = (model) => model.avg_output_tokens_per_turn;
  const scoreOf = (model) => model.avg_score;

  // [min, max] of a field, widened on each side by 10% of its span.
  const paddedExtent = (accessor) => {
    const [low, high] = d3.extent(models, accessor);
    const margin = (high - low) * 0.1;
    return [low - margin, high + margin];
  };

  // Update scales
  const xDomain = paddedExtent(tokensOf);
  const yDomain = paddedExtent(scoreOf);
  xScale.domain(xDomain).range([0, innerWidth]).nice();
  yScale.domain(yDomain).range([innerHeight, 0]).nice();

  const toX = (value) => xScale(value);
  const toY = (value) => yScale(value);

  // Grid lines: one vertical line per x tick, one horizontal line per y tick.
  const verticalTicks = xScale.ticks(6);
  const horizontalTicks = yScale.ticks(6);

  gGrid
    .selectAll('.grid-x')
    .data(verticalTicks)
    .join('line')
    .attr('class', 'grid-x')
    .attr('x1', toX)
    .attr('x2', toX)
    .attr('y1', 0)
    .attr('y2', innerHeight);

  gGrid
    .selectAll('.grid-y')
    .data(horizontalTicks)
    .join('line')
    .attr('class', 'grid-y')
    .attr('x1', 0)
    .attr('x2', innerWidth)
    .attr('y1', toY)
    .attr('y2', toY);

  // Joins a one-element dataset so that exactly one matching node lives in
  // gAxes and is reused on every redraw.
  const single = (selector, tag) => gAxes.selectAll(selector).data([0]).join(tag);

  // Axes
  const bottomAxis = d3
    .axisBottom(xScale)
    .ticks(6)
    .tickFormat((tick) => tick.toLocaleString());
  const leftAxis = d3.axisLeft(yScale).ticks(6);

  single('.x-axis', 'g')
    .attr('class', 'x-axis')
    .attr('transform', `translate(0,${innerHeight})`)
    .call(bottomAxis);

  single('.y-axis', 'g')
    .attr('class', 'y-axis')
    .call(leftAxis);

  // Axis labels
  single('.x-label', 'text')
    .attr('class', 'x-label axis-label')
    .attr('x', innerWidth / 2)
    .attr('y', innerHeight + 44)
    .attr('text-anchor', 'middle')
    .text('Average Output Tokens per Turn');

  // The y label is rotated -90 degrees, so its `x` runs along the vertical
  // axis and its `y` is the horizontal offset left of the plot area.
  single('.y-label', 'text')
    .attr('class', 'y-label axis-label')
    .attr('x', -innerHeight / 2)
    .attr('y', -52)
    .attr('text-anchor', 'middle')
    .attr('transform', 'rotate(-90)')
    .text('Average Score');

  // Chart title
  single('.title', 'text')
    .attr('class', 'title chart-title')
    .attr('x', innerWidth / 2)
    .attr('y', -16)
    .attr('text-anchor', 'middle')
    .text('Overall Performance: Score vs Token Usage');

  // Points: radius follows the plot width, clamped to the range [8, 16].
  const pointRadius = Math.min(16, Math.max(8, innerWidth / 60));

  // Models flagged `is_open` are drawn as hollow rings, the others as
  // solid discs in the model's own color.
  gPoints
    .selectAll('.point')
    .data(models)
    .join('circle')
    .attr('class', 'point')
    .attr('cx', (model) => toX(tokensOf(model)))
    .attr('cy', (model) => toY(scoreOf(model)))
    .attr('r', pointRadius)
    .attr('fill', (model) => (model.is_open ? 'transparent' : model.color))
    .attr('stroke', (model) => model.color)
    .attr('stroke-width', (model) => (model.is_open ? 3 : 0))
    .on('mouseenter', showTooltip)
    .on('mousemove', showTooltip)
    .on('mouseleave', hideTooltip);

  // Point labels: model name placed just to the right of its point.
  const labelOffset = pointRadius + 6;

  gLabels
    .selectAll('.point-label')
    .data(models)
    .join('text')
    .attr('class', 'point-label')
    .attr('x', (model) => toX(tokensOf(model)) + labelOffset)
    .attr('y', (model) => toY(scoreOf(model)) + 4)
    .text((model) => model.name);
}
|
| 382 |
+
|
| 383 |
+
/**
 * Makes sure `container` holds a `.header` element containing a `.legend`
 * element (creating either <div> when it is missing), then fills the
 * legend with the fixed closed-model / open-model key.
 */
function buildLegend() {
  // Returns the first `.className` descendant of `parent`; when there is
  // none, appends a new <div> with that class to `parent` and returns it.
  const findOrCreateDiv = (parent, className) => {
    const existing = parent.querySelector(`.${className}`);
    if (existing) {
      return existing;
    }
    const created = document.createElement('div');
    created.className = className;
    parent.appendChild(created);
    return created;
  };

  const header = findOrCreateDiv(container, 'header');
  const legend = findOrCreateDiv(header, 'legend');

  // Static markup only: no data values are interpolated into this HTML.
  legend.innerHTML = `
    <div class="legend-title">Legend</div>
    <div class="items">
      <span class="item">
        <span class="swatch filled" style="background: #666"></span>
        <span>Closed model</span>
      </span>
      <span class="item">
        <span class="swatch" style="border-color: #666"></span>
        <span>Open model</span>
      </span>
    </div>
  `;
}
|
| 412 |
+
|
| 413 |
+
// Initialize: whatever fetchFirstAvailable(JSON_PATHS) resolves with becomes
// the chart data; the legend and the chart are then drawn.
fetchFirstAvailable(JSON_PATHS)
  .then((json) => {
    data = json;
    buildLegend();
    render();
  })
  .catch((err) => {
    // Show the failure inside the chart container. This handler also
    // receives anything thrown by the `.then` callback above.
    const pre = document.createElement('pre');
    Object.assign(pre.style, { color: 'red', padding: '16px' });
    pre.textContent = `Error loading data: ${err.message}`;
    container.appendChild(pre);
  });

// Resize handling: redraw when the container changes size, falling back to
// the window `resize` event where ResizeObserver is unavailable.
if (!window.ResizeObserver) {
  window.addEventListener('resize', render);
} else {
  const resizeObserver = new ResizeObserver(() => {
    render();
  });
  resizeObserver.observe(container);
}

// Theme change handling: redraw whenever the `data-theme` attribute of the
// root element changes.
const observer = new MutationObserver(() => {
  render();
});
observer.observe(document.documentElement, {
  attributeFilter: ['data-theme'],
  attributes: true,
});
|
| 441 |
+
};
|
| 442 |
|
| 443 |
+
// Hand `bootstrap` to `ensureD3` right away when the document is already
// past the loading phase; otherwise wait for a single DOMContentLoaded event.
if (document.readyState !== 'loading') {
  ensureD3(bootstrap);
} else {
  document.addEventListener(
    'DOMContentLoaded',
    () => {
      ensureD3(bootstrap);
    },
    { once: true },
  );
}
|
| 448 |
+
})();
|
| 449 |
+
</script>
|