dlouapre HF Staff committed on
Commit
95fefdd
·
1 Parent(s): 7f4144c

Adding banner and sequence image

Browse files
app/src/content/article.mdx CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- title: "Are LLMs any good at the Science Game?\n Evaluating scientific reasoning using the card game Eleusis"
3
- subtitle: "Testing LLM calibration and iterative hypothesis formation"
4
  description: "A benchmark for evaluating LLM scientific reasoning using the card game Eleusis, testing iterative hypothesis formation, calibration, and strategic experimentation."
5
  authors:
6
  - name: "David Louapre"
 
1
  ---
2
+ title: "Are LLMs any good at the Science Game?"
3
+ subtitle: "Evaluating scientific reasoning using the card game Eleusis"
4
  description: "A benchmark for evaluating LLM scientific reasoning using the card game Eleusis, testing iterative hypothesis formation, calibration, and strategic experimentation."
5
  authors:
6
  - name: "David Louapre"
app/src/content/assets/data/overall_performance.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c620d1614704161071e6b3fdf51031228bc35a0aab8f70d6221f024a68e21e32
3
+ size 1413
app/src/content/assets/image/example_sequence.png ADDED

Git LFS Details

  • SHA256: 00cd0c917a7347b49acf9a69cd150eea944bc65428f7cdae03d7e9a7576d7eee
  • Pointer size: 131 Bytes
  • Size of remote file: 301 kB
app/src/content/chapters/eleusis/introduction.mdx CHANGED
@@ -1,5 +1,7 @@
1
  import Sidenote from "../../../components/Sidenote.astro";
2
- import Note from "../../../components/Note.astro";
 
 
3
 
4
  Large language models are increasingly being deployed as tools for scientific research—analyzing data, generating hypotheses, and even designing experiments. But how well do they actually embody the scientific method?
5
 
@@ -25,9 +27,12 @@ Eleusis was designed by Robert Abbott explicitly to simulate the process of scie
25
 
26
  It's a microcosm of the scientific method: the rule is a hidden law of nature, each card play is an experiment, and the sequence of accepted and rejected cards is the accumulating evidence.
27
 
28
- <Note variant="info">
29
- **TODO**: Add figure showing an example Eleusis game sequence with the secret rule "alternating colors" (red, black, red, black...).
30
- </Note>
 
 
 
31
 
32
  We built a benchmark around Eleusis to evaluate LLMs on this iterative, hypothesis-driven reasoning. Rather than testing knowledge retrieval or instruction-following, our benchmark asks: can models act like scientists? Can they observe evidence, form hypotheses, design informative experiments, and refine their theories? Can they calibrate their confidence appropriately and know when they've gathered enough evidence to commit to a conclusion?
33
 
 
1
  import Sidenote from "../../../components/Sidenote.astro";
2
+ import Image from "../../../components/Image.astro";
3
+
4
+ import exampleSequence from "../../assets/image/example_sequence.png";
5
 
6
  Large language models are increasingly being deployed as tools for scientific research—analyzing data, generating hypotheses, and even designing experiments. But how well do they actually embody the scientific method?
7
 
 
27
 
28
  It's a microcosm of the scientific method: the rule is a hidden law of nature, each card play is an experiment, and the sequence of accepted and rejected cards is the accumulating evidence.
29
 
30
+ <Image
31
+ src={exampleSequence}
32
+ alt="Example Eleusis game sequence with the secret rule 'alternating colors': mainline shows 5♠, K♥, J♠, A♦, 6♣ following the pattern, while the sideline below shows rejected cards 10♠ and 2♦"
33
+ caption="An example Eleusis game with the secret rule 'alternating colors'. The mainline (top) shows accepted cards: 5♠ → K♥ → J♠ → A♦ → 6♣, each alternating between black and red. The sideline (bottom) shows rejected cards that would have violated the pattern."
34
+ id="fig-example-sequence"
35
+ />
36
 
37
  We built a benchmark around Eleusis to evaluate LLMs on this iterative, hypothesis-driven reasoning. Rather than testing knowledge retrieval or instruction-following, our benchmark asks: can models act like scientists? Can they observe evidence, form hypotheses, design informative experiments, and refine their theories? Can they calibrate their confidence appropriately and know when they've gathered enough evidence to commit to a conclusion?
38
 
app/src/content/embeds/banner.html CHANGED
@@ -1,258 +1,449 @@
1
- <div class="d3-galaxy" style="width:100%;margin:10px 0;aspect-ratio:3/1;min-height:260px;"></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  <script>
3
- (() => {
4
- const ensureD3 = (cb) => {
5
- if (window.d3 && typeof window.d3.select === 'function') return cb();
6
- let s = document.getElementById('d3-cdn-script');
7
- if (!s) {
8
- s = document.createElement('script');
9
- s.id = 'd3-cdn-script';
10
- s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
11
- document.head.appendChild(s);
12
- }
13
- const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); };
14
- s.addEventListener('load', onReady, { once: true });
15
- if (window.d3) onReady();
16
- };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- const bootstrap = () => {
19
- const mount = document.currentScript ? document.currentScript.previousElementSibling : null;
20
- const container = (mount && mount.querySelector && mount.querySelector('.d3-galaxy')) || document.querySelector('.d3-galaxy');
21
- if (!container) return;
22
- if (container.dataset) {
23
- if (container.dataset.mounted === 'true') return;
24
- container.dataset.mounted = 'true';
25
- }
26
- // Scene params (match previous Plotly ranges)
27
- const cx = 1.5, cy = 0.5;
28
- const a = 1.3, b = 0.45;
29
- const numPoints = 3000;
30
- const numArms = 3;
31
- const numTurns = 2.1;
32
- const angleJitter = 0.12;
33
- const posNoise = 0.015;
34
-
35
- // Circle size settings
36
- const minCircleSize = 4; // minimum diameter in pixels
37
- const maxCircleSize = 12; // maximum diameter in pixels
38
-
39
- // Generate spiral + bulge
40
- const twoPi = Math.PI * 2;
41
- const t = Float64Array.from({ length: numPoints }, () => Math.random() * (twoPi * numTurns));
42
- const armIndices = Int16Array.from({ length: numPoints }, () => Math.floor(Math.random() * numArms));
43
- const armOffsets = Float64Array.from(armIndices, (k) => k * (twoPi / numArms));
44
- const theta = Float64Array.from(t, (tv, i) => tv + armOffsets[i] + d3.randomNormal.source(Math.random)(0, angleJitter)());
45
- const rNorm = Float64Array.from(t, (tv) => Math.pow(tv / (twoPi * numTurns), 0.9));
46
- const noiseScale = (rn) => posNoise * (0.8 + 0.6 * rn);
47
- const noiseX = Float64Array.from(rNorm, (rn) => d3.randomNormal.source(Math.random)(0, noiseScale(rn))());
48
- const noiseY = Float64Array.from(rNorm, (rn) => d3.randomNormal.source(Math.random)(0, noiseScale(rn))());
49
-
50
- const xSpiral = Float64Array.from(theta, (th, i) => cx + a * rNorm[i] * Math.cos(th) + noiseX[i]);
51
- const ySpiral = Float64Array.from(theta, (th, i) => cy + b * rNorm[i] * Math.sin(th) + noiseY[i]);
52
-
53
- const bulgePoints = Math.floor(0.18 * numPoints);
54
- const phiB = Float64Array.from({ length: bulgePoints }, () => twoPi * Math.random());
55
- const rB = Float64Array.from({ length: bulgePoints }, () => Math.pow(Math.random(), 2.2) * 0.22);
56
- const noiseXB = Float64Array.from({ length: bulgePoints }, () => d3.randomNormal.source(Math.random)(0, posNoise * 0.6)());
57
- const noiseYB = Float64Array.from({ length: bulgePoints }, () => d3.randomNormal.source(Math.random)(0, posNoise * 0.6)());
58
- const xBulge = Float64Array.from(phiB, (ph, i) => cx + a * rB[i] * Math.cos(ph) + noiseXB[i]);
59
- const yBulge = Float64Array.from(phiB, (ph, i) => cy + b * rB[i] * Math.sin(ph) + noiseYB[i]);
60
-
61
- // Concatenate
62
- const X = Array.from(xSpiral).concat(Array.from(xBulge));
63
- const Y = Array.from(ySpiral).concat(Array.from(yBulge));
64
- const lenSpiral = xSpiral.length;
65
-
66
- const zSpiral = Array.from(rNorm, (rn) => 1 - rn);
67
- const maxRB = rB && rB.length ? (window.d3 && d3.max ? d3.max(rB) : Math.max.apply(null, Array.from(rB))) : 1;
68
- const zBulge = Array.from(rB, (rb) => 1 - (maxRB ? rb / maxRB : 0));
69
- const Zraw = zSpiral.concat(zBulge);
70
- const sizesPx = Zraw.map((z) => minCircleSize + z * (maxCircleSize - minCircleSize)); // diameter in pixels
71
-
72
- // Labels (same categories as Python version)
73
- const labelOf = (i) => {
74
- const z = Zraw[i];
75
- if (z < 0.25) return 'tiny star';
76
- if (z < 0.5) return 'small star';
77
- if (z < 0.75) return 'medium star';
78
- return 'large star';
79
- };
80
-
81
- // Sort by size ascending for z-index: small first, big last
82
- const idx = d3.range(X.length).sort((i, j) => sizesPx[i] - sizesPx[j]);
83
-
84
- // Colors: piecewise gradient [0 -> 0.5 -> 1]
85
- const c0 = d3.rgb(78, 165, 183); // rgb(78, 165, 183)
86
- const c1 = d3.rgb(206, 192, 250); // rgb(206, 192, 250)
87
- const c2 = d3.rgb(232, 137, 171); // rgb(232, 137, 171)
88
- const interp01 = d3.interpolateRgb(c0, c1);
89
- const interp12 = d3.interpolateRgb(c1, c2);
90
- const colorFor = (v) => {
91
- const t = Math.max(0, Math.min(1, v));
92
- return t <= 0.5 ? interp01(t / 0.5) : interp12((t - 0.5) / 0.5);
93
- };
94
-
95
- // Create SVG
96
- const svg = d3.select(container).append('svg')
97
- .attr('width', '100%')
98
- .style('display', 'block')
99
- .style('cursor', 'crosshair');
100
-
101
- const render = () => {
102
- const width = container.clientWidth || 800;
103
- const height = Math.max(260, Math.round(width / 3)); // keep ~3:1, min height
104
- svg.attr('width', width).attr('height', height);
105
-
106
- const xScale = d3.scaleLinear().domain([0, 3]).range([0, width]);
107
- const yScale = d3.scaleLinear().domain([0, 1]).range([height, 0]);
108
-
109
- // Subtle stroke color depending on theme
110
- const isDark = document.documentElement.getAttribute('data-theme') === 'dark';
111
- const strokeColor = isDark ? 'rgba(255,255,255,0.18)' : 'rgba(0,0,0,0.12)';
112
- const glowColor = isDark ? 'rgba(255,255,255,0.35)' : 'rgba(0,0,0,0.25)';
113
-
114
-
115
- // Group for points (no blend mode for better print/PDF visibility)
116
- const g = svg.selectAll('g.points').data([0]).join('g').attr('class', 'points');
117
-
118
- // Ensure container can host an absolute tooltip
119
- container.style.position = container.style.position || 'relative';
120
- let tip = container.querySelector('.d3-tooltip');
121
- let tipInner;
122
- if (!tip) {
123
- tip = document.createElement('div');
124
- tip.className = 'd3-tooltip';
125
- Object.assign(tip.style, {
126
- position: 'absolute',
127
- top: '0px',
128
- left: '0px',
129
- transform: 'translate(-9999px, -9999px)',
130
- pointerEvents: 'none',
131
- padding: '10px 12px',
132
- borderRadius: '12px',
133
- fontSize: '12px',
134
- lineHeight: '1.35',
135
- border: '1px solid var(--border-color)',
136
- background: 'var(--surface-bg)',
137
- color: 'var(--text-color)',
138
- boxShadow: '0 8px 32px rgba(0,0,0,.28), 0 2px 8px rgba(0,0,0,.12)',
139
- opacity: '0',
140
- transition: 'opacity .12s ease',
141
- backdropFilter: 'saturate(1.12) blur(8px)',
142
- zIndex: '20'
143
- });
144
- tipInner = document.createElement('div');
145
- tipInner.className = 'd3-tooltip__inner';
146
- Object.assign(tipInner.style, {
147
- textAlign: 'left',
148
- display: 'flex',
149
- flexDirection: 'column',
150
- gap: '6px',
151
- minWidth: '220px'
152
- });
153
- tip.appendChild(tipInner);
154
- container.appendChild(tip);
155
- } else {
156
- tipInner = tip.querySelector('.d3-tooltip__inner') || tip;
157
- }
158
-
159
- // Final filter: remove small dots very close to the galaxy center (after placement)
160
- const centerHoleRadius = 0.48; // elliptical radius threshold
161
- const smallSizeThreshold = 7.5; // same notion as Python size cut
162
- const rTotal = idx.map((i) => Math.sqrt(((X[i] - cx) / a) ** 2 + ((Y[i] - cy) / b) ** 2));
163
- const idxFiltered = idx.filter((i, k) => !(rTotal[k] <= centerHoleRadius && sizesPx[i] < smallSizeThreshold));
164
-
165
- const sel = g.selectAll('circle').data(idxFiltered, (i) => i);
166
- sel.join(
167
- (enter) => enter.append('circle')
168
- .attr('cx', (i) => xScale(X[i]))
169
- .attr('cy', (i) => yScale(Y[i]))
170
- .attr('r', (i) => sizesPx[i] / 2)
171
- .attr('fill', (i) => colorFor(Zraw[i]))
172
- .attr('fill-opacity', 0.9)
173
- .on('mouseenter', function (ev, i) {
174
- d3.select(this).raise()
175
- .style('filter', `drop-shadow(0 0 8px ${glowColor})`)
176
- .transition().duration(120).ease(d3.easeCubicOut)
177
- .attr('r', (sizesPx[i] / 2) * 1.25)
178
- .attr('fill-opacity', 1);
179
- const r = Math.sqrt(((X[i] - cx) / a) ** 2 + ((Y[i] - cy) / b) ** 2);
180
- const type = i < lenSpiral ? 'spiral' : 'bulge';
181
- const arm = i < lenSpiral ? (armIndices[i] + 1) : null;
182
- tipInner.innerHTML =
183
- `<div style="font-weight:800;letter-spacing:.1px;"><strong>${labelOf(i)}</strong></div>` +
184
- `<div style="font-size:11px;color:var(--muted-color);margin-top:-4px;margin-bottom:2px;letter-spacing:.1px;"><strong>Type</strong> ${type}${arm ? ` (Arm ${arm})` : ''}</div>` +
185
- `<div style="padding-top:6px;border-top:1px solid var(--border-color);"><strong>Position</strong> X ${X[i].toFixed(2)} · <strong>Y</strong> ${Y[i].toFixed(2)}</div>` +
186
- `<div><strong>Distance</strong> Radius ${r.toFixed(3)} · <strong>Z</strong> ${Zraw[i].toFixed(3)}</div>` +
187
- `<div><strong>Size</strong> ${sizesPx[i].toFixed(1)} px</div>`;
188
- tip.style.opacity = '1';
189
- })
190
- .on('mousemove', (ev, i) => {
191
- const [mx, my] = d3.pointer(ev, container);
192
- const offsetX = 10, offsetY = 12;
193
- tip.style.transform = `translate(${Math.round(mx + offsetX)}px, ${Math.round(my + offsetY)}px)`;
194
- })
195
- .on('mouseleave', function () {
196
- tip.style.opacity = '0';
197
- tip.style.transform = 'translate(-9999px, -9999px)';
198
- d3.select(this)
199
- .style('filter', null)
200
- .transition().duration(120).ease(d3.easeCubicOut)
201
- .attr('r', (i2) => sizesPx[i2] / 2)
202
- .attr('fill-opacity', 0.9);
203
- }),
204
- (update) => update
205
- .attr('cx', (i) => xScale(X[i]))
206
- .attr('cy', (i) => yScale(Y[i]))
207
- .attr('r', (i) => sizesPx[i] / 2)
208
- .attr('fill', (i) => colorFor(Zraw[i]))
209
- .attr('fill-opacity', 0.9)
210
- .on('mouseenter', function (ev, i) {
211
- d3.select(this).raise()
212
- .style('filter', `drop-shadow(0 0 8px ${glowColor})`)
213
- .transition().duration(120).ease(d3.easeCubicOut)
214
- .attr('r', (sizesPx[i] / 2) * 1.25)
215
- .attr('fill-opacity', 1);
216
- const r = Math.sqrt(((X[i] - cx) / a) ** 2 + ((Y[i] - cy) / b) ** 2);
217
- const type = i < lenSpiral ? 'spiral' : 'bulge';
218
- const arm = i < lenSpiral ? (armIndices[i] + 1) : null;
219
- tipInner.innerHTML =
220
- `<div style="font-weight:800;letter-spacing:.1px;"><strong>${labelOf(i)}</strong></div>` +
221
- `<div style="font-size:11px;color:var(--muted-color);margin-top:-4px;margin-bottom:2px;letter-spacing:.1px;"><strong>Type</strong> ${type}${arm ? ` (Arm ${arm})` : ''}</div>` +
222
- `<div style="padding-top:6px;border-top:1px solid var(--border-color);"><strong>Position</strong> X ${X[i].toFixed(2)} · <strong>Y</strong> ${Y[i].toFixed(2)}</div>` +
223
- `<div><strong>Distance</strong> Radius ${r.toFixed(3)} · <strong>Z</strong> ${Zraw[i].toFixed(3)}</div>` +
224
- `<div><strong>Size</strong> ${sizesPx[i].toFixed(1)} px</div>`;
225
- tip.style.opacity = '1';
226
- })
227
- .on('mousemove', (ev, i) => {
228
- const [mx, my] = d3.pointer(ev, container);
229
- const offsetX = 10, offsetY = 12;
230
- tip.style.transform = `translate(${Math.round(mx + offsetX)}px, ${Math.round(my + offsetY)}px)`;
231
- })
232
- .on('mouseleave', function () {
233
- tip.style.opacity = '0';
234
- tip.style.transform = 'translate(-9999px, -9999px)';
235
- d3.select(this)
236
- .style('filter', null)
237
- .transition().duration(120).ease(d3.easeCubicOut)
238
- .attr('r', (i2) => sizesPx[i2] / 2)
239
- .attr('fill-opacity', 0.9);
240
- })
241
- );
242
- };
243
-
244
- // First render + resize
245
- if (window.ResizeObserver) {
246
- const ro = new ResizeObserver(() => render());
247
- ro.observe(container);
248
- } else {
249
- window.addEventListener('resize', render);
250
- }
251
- render();
252
  };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
- if (document.readyState === 'loading') {
255
- document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
256
- } else { ensureD3(bootstrap); }
257
- })();
258
- </script>
 
 
 
1
+ <div class="d3-overall-performance"></div>
2
+ <style>
3
+ .d3-overall-performance {
4
+ width: 100%;
5
+ margin: 10px 0;
6
+ position: relative;
7
+ font-family: system-ui, -apple-system, sans-serif;
8
+ }
9
+
10
+ .d3-overall-performance svg {
11
+ display: block;
12
+ width: 100%;
13
+ height: auto;
14
+ }
15
+
16
+ .d3-overall-performance .axes path,
17
+ .d3-overall-performance .axes line {
18
+ stroke: var(--axis-color, var(--text-color));
19
+ }
20
+
21
+ .d3-overall-performance .axes text {
22
+ fill: var(--tick-color, var(--muted-color));
23
+ font-size: 11px;
24
+ }
25
+
26
+ .d3-overall-performance .grid line {
27
+ stroke: var(--grid-color, rgba(0,0,0,.08));
28
+ }
29
+
30
+ .d3-overall-performance .axis-label {
31
+ font-size: 12px;
32
+ font-weight: 500;
33
+ fill: var(--text-color);
34
+ }
35
+
36
+ .d3-overall-performance .chart-title {
37
+ font-size: 16px;
38
+ font-weight: 600;
39
+ fill: var(--text-color);
40
+ }
41
+
42
+ .d3-overall-performance .point {
43
+ cursor: pointer;
44
+ transition: opacity 0.15s ease;
45
+ }
46
+
47
+ .d3-overall-performance .point:hover {
48
+ opacity: 0.8;
49
+ }
50
+
51
+ .d3-overall-performance .point-label {
52
+ font-size: 11px;
53
+ fill: var(--text-color);
54
+ pointer-events: none;
55
+ }
56
+
57
+ .d3-overall-performance .header {
58
+ display: flex;
59
+ flex-wrap: wrap;
60
+ gap: 16px;
61
+ justify-content: space-between;
62
+ align-items: flex-start;
63
+ margin-top: 12px;
64
+ padding: 0 8px;
65
+ }
66
+
67
+ .d3-overall-performance .legend {
68
+ display: flex;
69
+ flex-direction: column;
70
+ align-items: flex-start;
71
+ gap: 6px;
72
+ }
73
+
74
+ .d3-overall-performance .legend-title {
75
+ font-size: 12px;
76
+ font-weight: 700;
77
+ color: var(--text-color);
78
+ }
79
+
80
+ .d3-overall-performance .legend .items {
81
+ display: flex;
82
+ flex-wrap: wrap;
83
+ gap: 8px 14px;
84
+ }
85
+
86
+ .d3-overall-performance .legend .item {
87
+ display: inline-flex;
88
+ align-items: center;
89
+ gap: 6px;
90
+ white-space: nowrap;
91
+ font-size: 12px;
92
+ color: var(--text-color);
93
+ }
94
+
95
+ .d3-overall-performance .legend .swatch {
96
+ width: 14px;
97
+ height: 14px;
98
+ border-radius: 50%;
99
+ border: 2px solid var(--border-color);
100
+ }
101
+
102
+ .d3-overall-performance .legend .swatch.filled {
103
+ border: none;
104
+ }
105
+
106
+ .d3-overall-performance .d3-tooltip {
107
+ position: absolute;
108
+ top: 0;
109
+ left: 0;
110
+ transform: translate(-9999px, -9999px);
111
+ pointer-events: none;
112
+ padding: 10px 12px;
113
+ border-radius: 8px;
114
+ font-size: 12px;
115
+ line-height: 1.4;
116
+ border: 1px solid var(--border-color);
117
+ background: var(--surface-bg);
118
+ color: var(--text-color);
119
+ box-shadow: 0 4px 24px rgba(0,0,0,.18);
120
+ opacity: 0;
121
+ transition: opacity 0.12s ease;
122
+ z-index: 10;
123
+ }
124
+
125
+ .d3-overall-performance .d3-tooltip .model-name {
126
+ font-weight: 600;
127
+ margin-bottom: 4px;
128
+ }
129
+
130
+ .d3-overall-performance .d3-tooltip .metric {
131
+ display: flex;
132
+ justify-content: space-between;
133
+ gap: 16px;
134
+ }
135
+
136
+ .d3-overall-performance .d3-tooltip .metric-label {
137
+ color: var(--muted-color);
138
+ }
139
+
140
+ .d3-overall-performance .d3-tooltip .metric-value {
141
+ font-weight: 500;
142
+ }
143
+ </style>
144
  <script>
145
+ (() => {
146
+ const ensureD3 = (cb) => {
147
+ if (window.d3 && typeof window.d3.select === 'function') return cb();
148
+ let s = document.getElementById('d3-cdn-script');
149
+ if (!s) {
150
+ s = document.createElement('script');
151
+ s.id = 'd3-cdn-script';
152
+ s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
153
+ document.head.appendChild(s);
154
+ }
155
+ const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); };
156
+ s.addEventListener('load', onReady, { once: true });
157
+ if (window.d3) onReady();
158
+ };
159
+
160
+ const bootstrap = () => {
161
+ const scriptEl = document.currentScript;
162
+ let container = scriptEl ? scriptEl.previousElementSibling : null;
163
+ if (!(container && container.classList && container.classList.contains('d3-overall-performance'))) {
164
+ const candidates = Array.from(document.querySelectorAll('.d3-overall-performance'))
165
+ .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
166
+ container = candidates[candidates.length - 1] || null;
167
+ }
168
+ if (!container) return;
169
+ if (container.dataset) {
170
+ if (container.dataset.mounted === 'true') return;
171
+ container.dataset.mounted = 'true';
172
+ }
173
+
174
+ // Tooltip setup
175
+ container.style.position = container.style.position || 'relative';
176
+ const tip = document.createElement('div');
177
+ tip.className = 'd3-tooltip';
178
+ container.appendChild(tip);
179
+
180
+ // SVG setup
181
+ const svg = d3.select(container).append('svg');
182
+ const gRoot = svg.append('g');
183
+
184
+ // Chart groups
185
+ const gGrid = gRoot.append('g').attr('class', 'grid');
186
+ const gAxes = gRoot.append('g').attr('class', 'axes');
187
+ const gPoints = gRoot.append('g').attr('class', 'points');
188
+ const gLabels = gRoot.append('g').attr('class', 'labels');
189
+
190
+ // State
191
+ let data = null;
192
+ let width = 800;
193
+ let height = 450;
194
+ const margin = { top: 40, right: 120, bottom: 56, left: 72 };
195
+
196
+ // Scales
197
+ const xScale = d3.scaleLinear();
198
+ const yScale = d3.scaleLinear();
199
+
200
+ // Data loading
201
+ const JSON_PATHS = [
202
+ '/data/overall_performance.json',
203
+ './assets/figures/overall_performance.json',
204
+ '../assets/figures/overall_performance.json',
205
+ '../../assets/figures/overall_performance.json'
206
+ ];
207
 
208
+ const fetchFirstAvailable = async (paths) => {
209
+ for (const p of paths) {
210
+ try {
211
+ const r = await fetch(p, { cache: 'no-cache' });
212
+ if (r.ok) return await r.json();
213
+ } catch (_) {}
214
+ }
215
+ throw new Error('Data not found');
216
+ };
217
+
218
+ function updateSize() {
219
+ width = container.clientWidth || 800;
220
+ height = Math.max(300, Math.round(width / 1.78)); // 16:9 aspect ratio
221
+ svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
222
+ gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
223
+ return {
224
+ innerWidth: width - margin.left - margin.right,
225
+ innerHeight: height - margin.top - margin.bottom
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  };
227
+ }
228
+
229
+ function showTooltip(event, d) {
230
+ const rect = container.getBoundingClientRect();
231
+ const x = event.clientX - rect.left;
232
+ const y = event.clientY - rect.top;
233
+
234
+ tip.innerHTML = `
235
+ <div class="model-name" style="color: ${d.color}">${d.name}</div>
236
+ <div class="metric">
237
+ <span class="metric-label">Score:</span>
238
+ <span class="metric-value">${d.avg_score.toFixed(2)}</span>
239
+ </div>
240
+ <div class="metric">
241
+ <span class="metric-label">Tokens/Turn:</span>
242
+ <span class="metric-value">${Math.round(d.avg_output_tokens_per_turn).toLocaleString()}</span>
243
+ </div>
244
+ <div class="metric">
245
+ <span class="metric-label">Type:</span>
246
+ <span class="metric-value">${d.is_open ? 'Open' : 'Closed'}</span>
247
+ </div>
248
+ `;
249
+
250
+ const tipWidth = tip.offsetWidth || 150;
251
+ const tipHeight = tip.offsetHeight || 80;
252
+ let tipX = x + 12;
253
+ let tipY = y - tipHeight / 2;
254
+
255
+ if (tipX + tipWidth > width) tipX = x - tipWidth - 12;
256
+ if (tipY < 0) tipY = 8;
257
+ if (tipY + tipHeight > height) tipY = height - tipHeight - 8;
258
+
259
+ tip.style.transform = `translate(${tipX}px, ${tipY}px)`;
260
+ tip.style.opacity = '1';
261
+ }
262
+
263
+ function hideTooltip() {
264
+ tip.style.opacity = '0';
265
+ tip.style.transform = 'translate(-9999px, -9999px)';
266
+ }
267
+
268
+ function render() {
269
+ if (!data) return;
270
+
271
+ const { innerWidth, innerHeight } = updateSize();
272
+ const models = data.models;
273
+
274
+ // Update scales
275
+ const xExtent = d3.extent(models, d => d.avg_output_tokens_per_turn);
276
+ const yExtent = d3.extent(models, d => d.avg_score);
277
+ const xPadding = (xExtent[1] - xExtent[0]) * 0.1;
278
+ const yPadding = (yExtent[1] - yExtent[0]) * 0.1;
279
+
280
+ xScale
281
+ .domain([xExtent[0] - xPadding, xExtent[1] + xPadding])
282
+ .range([0, innerWidth])
283
+ .nice();
284
+
285
+ yScale
286
+ .domain([yExtent[0] - yPadding, yExtent[1] + yPadding])
287
+ .range([innerHeight, 0])
288
+ .nice();
289
+
290
+ // Grid lines
291
+ const xTicks = xScale.ticks(6);
292
+ const yTicks = yScale.ticks(6);
293
+
294
+ gGrid.selectAll('.grid-x')
295
+ .data(xTicks)
296
+ .join('line')
297
+ .attr('class', 'grid-x')
298
+ .attr('x1', d => xScale(d))
299
+ .attr('x2', d => xScale(d))
300
+ .attr('y1', 0)
301
+ .attr('y2', innerHeight);
302
+
303
+ gGrid.selectAll('.grid-y')
304
+ .data(yTicks)
305
+ .join('line')
306
+ .attr('class', 'grid-y')
307
+ .attr('x1', 0)
308
+ .attr('x2', innerWidth)
309
+ .attr('y1', d => yScale(d))
310
+ .attr('y2', d => yScale(d));
311
+
312
+ // Axes
313
+ gAxes.selectAll('.x-axis')
314
+ .data([0])
315
+ .join('g')
316
+ .attr('class', 'x-axis')
317
+ .attr('transform', `translate(0,${innerHeight})`)
318
+ .call(d3.axisBottom(xScale).ticks(6).tickFormat(d => d.toLocaleString()));
319
+
320
+ gAxes.selectAll('.y-axis')
321
+ .data([0])
322
+ .join('g')
323
+ .attr('class', 'y-axis')
324
+ .call(d3.axisLeft(yScale).ticks(6));
325
+
326
+ // Axis labels
327
+ gAxes.selectAll('.x-label')
328
+ .data([0])
329
+ .join('text')
330
+ .attr('class', 'x-label axis-label')
331
+ .attr('x', innerWidth / 2)
332
+ .attr('y', innerHeight + 44)
333
+ .attr('text-anchor', 'middle')
334
+ .text('Average Output Tokens per Turn');
335
+
336
+ gAxes.selectAll('.y-label')
337
+ .data([0])
338
+ .join('text')
339
+ .attr('class', 'y-label axis-label')
340
+ .attr('x', -innerHeight / 2)
341
+ .attr('y', -52)
342
+ .attr('text-anchor', 'middle')
343
+ .attr('transform', 'rotate(-90)')
344
+ .text('Average Score');
345
+
346
+ // Chart title
347
+ gAxes.selectAll('.title')
348
+ .data([0])
349
+ .join('text')
350
+ .attr('class', 'title chart-title')
351
+ .attr('x', innerWidth / 2)
352
+ .attr('y', -16)
353
+ .attr('text-anchor', 'middle')
354
+ .text('Overall Performance: Score vs Token Usage');
355
+
356
+ // Points
357
+ const pointRadius = Math.max(8, Math.min(16, innerWidth / 60));
358
+
359
+ gPoints.selectAll('.point')
360
+ .data(models)
361
+ .join('circle')
362
+ .attr('class', 'point')
363
+ .attr('cx', d => xScale(d.avg_output_tokens_per_turn))
364
+ .attr('cy', d => yScale(d.avg_score))
365
+ .attr('r', pointRadius)
366
+ .attr('fill', d => d.is_open ? 'transparent' : d.color)
367
+ .attr('stroke', d => d.color)
368
+ .attr('stroke-width', d => d.is_open ? 3 : 0)
369
+ .on('mouseenter', showTooltip)
370
+ .on('mousemove', showTooltip)
371
+ .on('mouseleave', hideTooltip);
372
+
373
+ // Point labels
374
+ gLabels.selectAll('.point-label')
375
+ .data(models)
376
+ .join('text')
377
+ .attr('class', 'point-label')
378
+ .attr('x', d => xScale(d.avg_output_tokens_per_turn) + pointRadius + 6)
379
+ .attr('y', d => yScale(d.avg_score) + 4)
380
+ .text(d => d.name);
381
+ }
382
+
383
+ function buildLegend() {
384
+ let header = container.querySelector('.header');
385
+ if (!header) {
386
+ header = document.createElement('div');
387
+ header.className = 'header';
388
+ container.appendChild(header);
389
+ }
390
+
391
+ let legend = header.querySelector('.legend');
392
+ if (!legend) {
393
+ legend = document.createElement('div');
394
+ legend.className = 'legend';
395
+ header.appendChild(legend);
396
+ }
397
+
398
+ legend.innerHTML = `
399
+ <div class="legend-title">Legend</div>
400
+ <div class="items">
401
+ <span class="item">
402
+ <span class="swatch filled" style="background: #666"></span>
403
+ <span>Closed model</span>
404
+ </span>
405
+ <span class="item">
406
+ <span class="swatch" style="border-color: #666"></span>
407
+ <span>Open model</span>
408
+ </span>
409
+ </div>
410
+ `;
411
+ }
412
+
413
+ // Initialize
414
+ fetchFirstAvailable(JSON_PATHS)
415
+ .then(json => {
416
+ data = json;
417
+ buildLegend();
418
+ render();
419
+ })
420
+ .catch(err => {
421
+ const pre = document.createElement('pre');
422
+ pre.style.color = 'red';
423
+ pre.style.padding = '16px';
424
+ pre.textContent = `Error loading data: ${err.message}`;
425
+ container.appendChild(pre);
426
+ });
427
+
428
+ // Resize handling
429
+ if (window.ResizeObserver) {
430
+ new ResizeObserver(() => render()).observe(container);
431
+ } else {
432
+ window.addEventListener('resize', render);
433
+ }
434
+
435
+ // Theme change handling
436
+ const observer = new MutationObserver(() => render());
437
+ observer.observe(document.documentElement, {
438
+ attributes: true,
439
+ attributeFilter: ['data-theme']
440
+ });
441
+ };
442
 
443
+ if (document.readyState === 'loading') {
444
+ document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
445
+ } else {
446
+ ensureD3(bootstrap);
447
+ }
448
+ })();
449
+ </script>