dlouapre HF Staff committed on
Commit
cafe265
·
1 Parent(s): e0197ee

Improved charts

Browse files
app/plugins/rehype/post-citation.mjs CHANGED
@@ -299,20 +299,28 @@ export default function rehypeReferencesAndFootnotes() {
299
  }
300
  };
301
 
302
- // Find references container and normalize its list
303
- const findReferencesRoot = () => {
304
- let found = null;
305
  walk(tree, null, (node) => {
306
- if (found) return;
307
  if (!isElement(node)) return;
308
  const id = getAttr(node, 'id');
309
  if (id === 'references' || hasClass(node, 'references') || hasClass(node, 'bibliography')) {
310
- found = node;
 
 
 
311
  }
312
  });
313
  return found;
314
  };
315
 
 
 
 
 
 
 
316
  const toOrderedList = (container) => {
317
  // If there is already an <ol>, use it; otherwise convert common structures
318
  let ol = getChildren(container).find((c) => isElement(c) && c.tagName === 'ol');
@@ -340,15 +348,18 @@ export default function rehypeReferencesAndFootnotes() {
340
  return ol;
341
  };
342
 
343
- const refsRoot = findReferencesRoot();
344
- let refsOl = null;
 
345
  const refIdSet = new Set();
346
  const refIdToExternalHref = new Map();
347
 
348
- if (refsRoot) {
349
- refsOl = toOrderedList(refsRoot);
 
 
350
  // Collect item ids and linkify their content
351
- for (const li of getChildren(refsOl)) {
352
  if (!isElement(li) || li.tagName !== 'li') continue;
353
  if (!getAttr(li, 'id')) {
354
  // Try to find a nested element with id to promote
@@ -380,6 +391,7 @@ export default function rehypeReferencesAndFootnotes() {
380
  if (externalHref) refIdToExternalHref.set(String(id), externalHref);
381
  }
382
  }
 
383
  setAttr(refsRoot, 'data-built-refs', '1');
384
  }
385
 
 
299
  }
300
  };
301
 
302
+ // Find ALL references containers (there may be multiple from different MDX imports)
303
+ const findAllReferencesRoots = () => {
304
+ const found = [];
305
  walk(tree, null, (node) => {
 
306
  if (!isElement(node)) return;
307
  const id = getAttr(node, 'id');
308
  if (id === 'references' || hasClass(node, 'references') || hasClass(node, 'bibliography')) {
309
+ // Don't add if already found (shouldn't happen but be safe)
310
+ if (!found.includes(node)) {
311
+ found.push(node);
312
+ }
313
  }
314
  });
315
  return found;
316
  };
317
 
318
+ // Legacy function for backwards compatibility
319
+ const findReferencesRoot = () => {
320
+ const all = findAllReferencesRoots();
321
+ return all.length > 0 ? all[0] : null;
322
+ };
323
+
324
  const toOrderedList = (container) => {
325
  // If there is already an <ol>, use it; otherwise convert common structures
326
  let ol = getChildren(container).find((c) => isElement(c) && c.tagName === 'ol');
 
348
  return ol;
349
  };
350
 
351
+ // Process ALL references sections (there may be multiple from different MDX imports)
352
+ const allRefsRoots = findAllReferencesRoots();
353
+ let refsOl = null; // Keep track of the first one for backlink processing
354
  const refIdSet = new Set();
355
  const refIdToExternalHref = new Map();
356
 
357
+ for (const refsRoot of allRefsRoots) {
358
+ const currentOl = toOrderedList(refsRoot);
359
+ if (!refsOl) refsOl = currentOl; // Use first ol for backlinks
360
+
361
  // Collect item ids and linkify their content
362
+ for (const li of getChildren(currentOl)) {
363
  if (!isElement(li) || li.tagName !== 'li') continue;
364
  if (!getAttr(li, 'id')) {
365
  // Try to find a nested element with id to promote
 
391
  if (externalHref) refIdToExternalHref.set(String(id), externalHref);
392
  }
393
  }
394
+ // Mark each references section so Footer.astro can find them all
395
  setAttr(refsRoot, 'data-built-refs', '1');
396
  }
397
 
app/src/components/Footer.astro CHANGED
@@ -142,15 +142,82 @@ const { citationText, bibtex, licence, doi } = Astro.props as Props;
142
  return null;
143
  };
144
 
145
- const referencesEl = findFirstOutsideFooter([
146
- "#bibliography-references-list",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  "[data-bibliography-block]",
148
- "#references",
 
 
149
  "#refs",
150
- ".references",
151
  ".bibliography",
152
  ]);
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  // Try multiple selectors for footnotes
155
  const footnotesEl = findFirstOutsideFooter([
156
  "[data-built-footnotes]",
@@ -159,7 +226,6 @@ const { citationText, bibtex, licence, doi } = Astro.props as Props;
159
  "div.footnotes",
160
  ]);
161
 
162
- const movedRefs = moveIntoFooter(referencesEl, "References");
163
  const movedNotes = moveIntoFooter(footnotesEl, "Footnotes");
164
 
165
  if (movedRefs || movedNotes) {
@@ -196,8 +262,50 @@ const { citationText, bibtex, licence, doi } = Astro.props as Props;
196
  // Final attempt after a short delay
197
  setTimeout(attemptMove, 300);
198
 
199
- // Resize on window changes (e.g., fonts, layout)
200
- // No textarea auto-resize needed for <pre> blocks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  })();
202
  </script>
203
 
 
142
  return null;
143
  };
144
 
145
+ // Find ALL references/bibliography sections and consolidate them
146
+ const findAllOutsideFooter = (selectors) => {
147
+ const results = [];
148
+ const searchRoots = [contentRoot, document.body].filter(Boolean);
149
+ for (const root of searchRoots) {
150
+ for (const sel of selectors) {
151
+ const els = root.querySelectorAll(sel);
152
+ els.forEach(el => {
153
+ if (el && !footer.contains(el) && !results.includes(el)) {
154
+ results.push(el);
155
+ }
156
+ });
157
+ }
158
+ }
159
+ return results;
160
+ };
161
+
162
+ // Find all bibliography/references sections
163
+ // Note: post-citation.mjs adds data-built-refs="1" to processed sections
164
+ // We use multiple selectors to catch different formats, prioritizing data attributes
165
+ // over IDs (since duplicate IDs are invalid HTML and have undefined querySelector behavior)
166
+ const allRefsEls = findAllOutsideFooter([
167
+ "[data-built-refs]",
168
  "[data-bibliography-block]",
169
+ "#bibliography-references-list",
170
+ "section#references",
171
+ "div#references",
172
  "#refs",
173
+ ".references:not(ol)",
174
  ".bibliography",
175
  ]);
176
 
177
+ // Consolidate multiple bibliography sections into one
178
+ let movedRefs = false;
179
+ if (allRefsEls.length > 0) {
180
+ // Move the first one normally
181
+ movedRefs = moveIntoFooter(allRefsEls[0], "References");
182
+
183
+ // For additional bibliography sections, merge their list items into the first one
184
+ if (allRefsEls.length > 1) {
185
+ // Find the target ol - it's now inside the moved element within target
186
+ const targetOl = target.querySelector("ol.references") || target.querySelector("ol");
187
+
188
+ for (let i = 1; i < allRefsEls.length; i++) {
189
+ const extraEl = allRefsEls[i];
190
+ // Find ol inside the extra section (could be nested)
191
+ const extraOl = extraEl.querySelector("ol.references") || extraEl.querySelector("ol");
192
+
193
+ if (extraOl && targetOl) {
194
+ // Move all list items from extra bibliography to the consolidated one
195
+ const items = Array.from(extraOl.querySelectorAll(":scope > li"));
196
+ items.forEach(item => {
197
+ // Check if this reference already exists (by id) to avoid duplicates
198
+ const itemId = item.id;
199
+ if (itemId) {
200
+ // Use try-catch since CSS.escape might not be available in all browsers
201
+ try {
202
+ const escapedId = CSS.escape ? CSS.escape(itemId) : itemId.replace(/([^\w-])/g, '\\$1');
203
+ if (targetOl.querySelector(`#${escapedId}`)) {
204
+ return; // Skip duplicate
205
+ }
206
+ } catch (e) {
207
+ // If selector fails, check manually
208
+ const existing = Array.from(targetOl.querySelectorAll('li')).find(li => li.id === itemId);
209
+ if (existing) return;
210
+ }
211
+ }
212
+ targetOl.appendChild(item);
213
+ });
214
+ }
215
+ // Remove the now-empty extra bibliography section from the DOM
216
+ extraEl.remove();
217
+ }
218
+ }
219
+ }
220
+
221
  // Try multiple selectors for footnotes
222
  const footnotesEl = findFirstOutsideFooter([
223
  "[data-built-footnotes]",
 
226
  "div.footnotes",
227
  ]);
228
 
 
229
  const movedNotes = moveIntoFooter(footnotesEl, "Footnotes");
230
 
231
  if (movedRefs || movedNotes) {
 
262
  // Final attempt after a short delay
263
  setTimeout(attemptMove, 300);
264
 
265
+ // Watch for dynamically added content (e.g., lazy-loaded components)
266
+ // This catches references sections that might be added after initial render
267
+ const observer = new MutationObserver((mutations) => {
268
+ // Only re-run if we haven't fully processed yet or new ref sections appeared
269
+ if (footer.dataset.processed !== "true") {
270
+ attemptMove();
271
+ } else {
272
+ // Check if any new references sections were added
273
+ for (const mutation of mutations) {
274
+ for (const node of mutation.addedNodes) {
275
+ if (node.nodeType === 1) { // Element node
276
+ const el = node;
277
+ if (
278
+ el.id === "references" ||
279
+ el.classList?.contains("references") ||
280
+ el.classList?.contains("bibliography") ||
281
+ el.hasAttribute?.("data-built-refs")
282
+ ) {
283
+ // Reset processed flag and re-consolidate
284
+ footer.dataset.processed = "false";
285
+ attemptMove();
286
+ return;
287
+ }
288
+ // Also check for nested references
289
+ if (el.querySelector?.("[data-built-refs], #references, .references, .bibliography")) {
290
+ footer.dataset.processed = "false";
291
+ attemptMove();
292
+ return;
293
+ }
294
+ }
295
+ }
296
+ }
297
+ }
298
+ });
299
+
300
+ // Observe the main content area for changes
301
+ if (contentRoot) {
302
+ observer.observe(contentRoot, { childList: true, subtree: true });
303
+ }
304
+
305
+ // Stop observing after page is fully loaded + a delay
306
+ window.addEventListener("load", () => {
307
+ setTimeout(() => observer.disconnect(), 2000);
308
+ }, { once: true });
309
  })();
310
  </script>
311
 
app/src/content/assets/data/basic_metrics.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f67f1217568824b751da562d8106fae602792a64c38abb4b7c8bae75698249c0
3
- size 2716
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:847fb061c6643d04446b69249d9c56ba67ea1b502013fc57ff71366d36978a23
3
+ size 2817
app/src/content/assets/data/complexity_ratio.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a59aeba80d14b977f47948d2fcfbd818685df06b033c0b3bb6ee889ae976ab4
3
+ size 2386
app/src/content/assets/data/complexity_ratio.png ADDED

Git LFS Details

  • SHA256: 32c2783e40f3b71ac7c61a138d8af768a5a39721e0bae40298f54c0d5e60dac4
  • Pointer size: 130 Bytes
  • Size of remote file: 93.2 kB
app/src/content/assets/data/overall_performance.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ddb4557b07eab530ae73d9ce849c542f503fc3656166e9b6164034b5cba83bf
3
  size 2391
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5e079335d1cf6f5c53df229031920c15561847b7e65476ba93f6526669df8a8
3
  size 2391
app/src/content/assets/data/overall_performance.png CHANGED

Git LFS Details

  • SHA256: 02a2fedd1f6b603d295472aa3ceae73c0159a6b6e675311a6376e2323441bf3d
  • Pointer size: 130 Bytes
  • Size of remote file: 79 kB

Git LFS Details

  • SHA256: c00e19078eaea1aa2ef665814b5659f68a2ade00fb397bf78b47816b1312c37a
  • Pointer size: 130 Bytes
  • Size of remote file: 79 kB
app/src/content/assets/data/score_vs_recklessness.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fb4ed9d7a1296dd45431123b58fb52c0d5224a8cc4113cb1b53eab95b8fb610
3
+ size 3251
app/src/content/assets/data/score_vs_recklessness.png ADDED

Git LFS Details

  • SHA256: e42807adf23e46a8288607a425f3a20d97c0db484f9f62329347e6dfd011da7d
  • Pointer size: 130 Bytes
  • Size of remote file: 85.6 kB
app/src/content/assets/data/summary.txt CHANGED
@@ -25,17 +25,17 @@ Loaded colors for 17 models
25
  BASIC MODEL COMPARISON
26
  ============================================================
27
 
28
- model rounds_played total_score avg_score total_floored_score avg_floored_score total_turns total_output_tokens total_wall_clock avg_failed_guesses success_rate total_no_stakes_score avg_no_stakes_score avg_output_tokens_per_turn wall_clock_per_turn intra_rule_variance inter_rule_variance variance_ratio
29
- Claude Opus 4.5 78 1128 14.461538 1324 16.974359 852 4333716 86367.64 2.000000 0.833333 1598.0 20.487179 5086.521127 101.370469 25.000000 81.385983 0.307178
30
- Kimi K2 78 804 10.307692 1262 16.179487 975 12281540 101346.76 2.038462 0.769231 1481.0 18.987179 12596.451282 103.945395 25.538462 88.446496 0.288745
31
- Grok 4 1 Fast Reasoning 78 737 9.448718 1182 15.153846 998 8178655 120364.22 2.564103 0.717949 1441.0 18.474359 8195.045090 120.605431 25.243590 106.499829 0.237029
32
- Gpt 5.2 High 78 1158 14.846154 1174 15.051282 1205 3341037 73525.83 0.282051 0.948718 1505.0 19.294872 2772.644813 61.017286 24.628205 36.601709 0.672870
33
- Gpt 5 Mini Medium 78 942 12.076923 1052 13.487179 1261 3618399 58345.97 1.166667 0.705128 1325.0 16.987179 2869.467883 46.269603 39.141026 82.882051 0.472250
34
- Deepseek R1 78 511 6.551282 1036 13.282051 1104 9229131 165334.16 3.192308 0.641026 1331.0 17.064103 8359.720109 149.759203 29.628205 115.135043 0.257334
35
- Gemini 3 Flash Preview Low 78 817 10.474359 1024 13.128205 1315 1581524 12702.02 0.961538 0.705128 1226.0 15.717949 1202.679848 9.659331 29.923077 83.049573 0.360304
36
- Gpt Oss 120B 78 580 7.435897 1004 12.871795 1243 3190828 24633.15 2.153846 0.679487 1279.0 16.397436 2567.037812 19.817498 46.692308 78.676239 0.593474
37
- Gpt Oss 20B 78 131 1.679487 927 11.884615 1297 7009392 62397.50 2.974359 0.589744 1206.0 15.461538 5404.311488 48.109098 47.576923 88.239487 0.539180
38
- Claude Haiku 4.5 78 -37 -0.474359 894 11.461538 1254 6973411 57734.39 3.948718 0.564103 1198.0 15.358974 5560.933812 46.040183 45.102564 107.387350 0.419999
39
 
40
  Saved: results/260121_78_rounds/basic_metrics.csv
41
  Saved: results/260121_78_rounds/overall_performance.png
@@ -130,6 +130,8 @@ Saved: results/260121_78_rounds/excess_caution.png
130
  Saved: results/260121_78_rounds/excess_caution.json
131
  Saved: results/260121_78_rounds/caution_vs_failed_guesses.png
132
  Saved: results/260121_78_rounds/caution_vs_failed_guesses.json
 
 
133
 
134
  ============================================================
135
  RECKLESS GUESSING ANALYSIS
@@ -169,6 +171,58 @@ Longest streak: 8 consecutive wrong guesses
169
  Saved: results/260121_78_rounds/reckless_guessing.png
170
  Saved: results/260121_78_rounds/reckless_guessing.json
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  ============================================================
173
  PER-MODEL REPORTS
174
  ============================================================
 
25
  BASIC MODEL COMPARISON
26
  ============================================================
27
 
28
+ model rounds_played total_score avg_score total_floored_score avg_floored_score total_turns total_output_tokens total_wall_clock avg_failed_guesses success_rate counting_output_tokens total_no_stakes_score avg_no_stakes_score avg_output_tokens_per_turn wall_clock_per_turn intra_rule_variance inter_rule_variance variance_ratio
29
+ Claude Opus 4.5 78 1128 14.461538 1324 16.974359 756 4333716 86367.64 2.000000 0.833333 3430535 1598.0 20.487179 4537.744709 114.242910 25.000000 81.385983 0.307178
30
+ Kimi K2 78 804 10.307692 1262 16.179487 801 12281540 101346.76 2.038462 0.769231 5918992 1481.0 18.987179 7389.503121 126.525293 25.538462 88.446496 0.288745
31
+ Grok 4 1 Fast Reasoning 78 737 9.448718 1182 15.153846 795 8178655 120364.22 2.564103 0.717949 4559832 1441.0 18.474359 5735.637736 151.401535 25.243590 106.499829 0.237029
32
+ Gpt 5.2 High 78 1158 14.846154 1174 15.051282 1195 3341037 73525.83 0.282051 0.948718 3232254 1505.0 19.294872 2704.815063 61.527891 24.628205 36.601709 0.672870
33
+ Gpt 5 Mini Medium 78 942 12.076923 1052 13.487179 1163 3618399 58345.97 1.166667 0.705128 2998454 1325.0 16.987179 2578.206363 50.168504 39.141026 82.882051 0.472250
34
+ Deepseek R1 78 511 6.551282 1036 13.282051 851 9229131 165334.16 3.192308 0.641026 5944454 1331.0 17.064103 6985.257344 194.282209 29.628205 115.135043 0.257334
35
+ Gemini 3 Flash Preview Low 78 817 10.474359 1024 13.128205 1207 1581524 12702.02 0.961538 0.705128 1389850 1226.0 15.717949 1151.491301 10.523629 29.923077 83.049573 0.360304
36
+ Gpt Oss 120B 78 580 7.435897 1004 12.871795 1041 3190828 24633.15 2.153846 0.679487 2250622 1279.0 16.397436 2161.980788 23.662968 46.692308 78.676239 0.593474
37
+ Gpt Oss 20B 78 131 1.679487 927 11.884615 972 7009392 62397.50 2.974359 0.589744 3234713 1206.0 15.461538 3327.894033 64.194959 47.576923 88.239487 0.539180
38
+ Claude Haiku 4.5 78 -37 -0.474359 894 11.461538 848 6973411 57734.39 3.948718 0.564103 4053200 1198.0 15.358974 4779.716981 68.083007 45.102564 107.387350 0.419999
39
 
40
  Saved: results/260121_78_rounds/basic_metrics.csv
41
  Saved: results/260121_78_rounds/overall_performance.png
 
130
  Saved: results/260121_78_rounds/excess_caution.json
131
  Saved: results/260121_78_rounds/caution_vs_failed_guesses.png
132
  Saved: results/260121_78_rounds/caution_vs_failed_guesses.json
133
+ Saved: results/260121_78_rounds/score_vs_recklessness.png
134
+ Saved: results/260121_78_rounds/score_vs_recklessness.json
135
 
136
  ============================================================
137
  RECKLESS GUESSING ANALYSIS
 
171
  Saved: results/260121_78_rounds/reckless_guessing.png
172
  Saved: results/260121_78_rounds/reckless_guessing.json
173
 
174
+ ============================================================
175
+ COMPLEXITY RATIO ANALYSIS
176
+ ============================================================
177
+
178
+ Analyzed 9634 tentative rules with confidence >= 5
179
+ Using optimal k = 0.420 for aggregated complexity
180
+
181
+ Complexity Ratio by Model:
182
+ (Ratio = Tentative Complexity / Actual Complexity)
183
+
184
+ Model Median Q25 Q75 Count
185
+ Gpt Oss 120B 1.322 0.873 2.355 1182
186
+ Gpt Oss 20B 1.155 0.782 2.065 1219
187
+ Claude Haiku 4.5 1.054 0.736 2.000 1001
188
+ Deepseek R1 1.000 0.762 1.756 933
189
+ Gemini 3 Flash Preview Low 1.000 0.781 1.519 1016
190
+ Gpt 5 Mini Medium 1.000 0.765 1.664 939
191
+ Gpt 5.2 High 1.000 0.791 1.187 857
192
+ Grok 4 1 Fast Reasoning 1.000 0.777 1.657 938
193
+ Claude Opus 4.5 0.984 0.707 1.169 664
194
+ Kimi K2 0.976 0.622 1.275 885
195
+
196
+ Interpretation:
197
+ - Ratio > 1: Model tends to overcomplicate rules
198
+ - Ratio < 1: Model tends to oversimplify rules
199
+ - Ratio ≈ 1: Model matches actual rule complexity
200
+
201
+ Highest median: Gpt Oss 120B (1.322)
202
+ Lowest median: Kimi K2 (0.976)
203
+
204
+ Saved: results/260121_78_rounds/complexity_ratio.png
205
+ Saved: results/260121_78_rounds/complexity_ratio.json
206
+
207
+ ============================================================
208
+ OUTPUT TOKENS BY TURN
209
+ ============================================================
210
+
211
+ Saved: results/260121_78_rounds/tokens_by_turn.png
212
+ Saved: results/260121_78_rounds/tokens_by_turn.json
213
+
214
+ Tokens trend summary (early vs late turns):
215
+ Claude Haiku 4.5: early=3191, late=5889 (+84.5%)
216
+ Claude Opus 4.5: early=2649, late=8447 (+218.9%)
217
+ Deepseek R1: early=5083, late=10946 (+115.3%)
218
+ Gemini 3 Flash Preview Low: early=1046, late=1351 (+29.1%)
219
+ Gpt 5 Mini Medium: early=1241, late=4862 (+291.9%)
220
+ Gpt 5.2 High: early=963, late=5910 (+514.0%)
221
+ Gpt Oss 120B: early=1050, late=4475 (+326.2%)
222
+ Gpt Oss 20B: early=1744, late=7789 (+346.6%)
223
+ Grok 4 1 Fast Reasoning: early=2810, late=17827 (+534.4%)
224
+ Kimi K2: early=5545, late=10653 (+92.1%)
225
+
226
  ============================================================
227
  PER-MODEL REPORTS
228
  ============================================================
app/src/content/assets/data/tokens_by_turn.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ceb3f9cc62ed081b7c59ff9f58903166d3d267ab7f3ad73143b41682301dddd9
3
+ size 39913
app/src/content/assets/data/tokens_by_turn.png ADDED

Git LFS Details

  • SHA256: b1076a4296b75b27f7244a090fd4f28019c19989bcf4f74184578f31310acdce
  • Pointer size: 131 Bytes
  • Size of remote file: 280 kB
app/src/content/chapters/eleusis/appendix.mdx CHANGED
@@ -1,40 +1,8 @@
1
- import Accordion from "../../../components/Accordion.astro";
2
  import Note from "../../../components/Note.astro";
3
  import Sidenote from "../../../components/Sidenote.astro";
4
 
5
  ## Appendix: Detailed Methods
6
 
7
- ### Models Evaluated
8
-
9
- <Accordion title="Model configurations" open>
10
-
11
- We evaluated 10 models across 5 providers:
12
-
13
- | Model | Provider | Type |
14
- |-------|----------|------|
15
- | Claude Opus 4.5 | Anthropic | Proprietary |
16
- | Claude Haiku 4.5 | Anthropic | Proprietary |
17
- | GPT 5.2 High | OpenAI | Proprietary |
18
- | GPT 5 Mini Medium | OpenAI | Proprietary |
19
- | Gemini 3 Flash Preview Low | Google | Proprietary |
20
- | Grok 4.1 Fast Reasoning | xAI | Proprietary |
21
- | Kimi K2 | Moonshot (via HF) | Open weights |
22
- | DeepSeek R1 | DeepSeek (via HF) | Open weights |
23
- | GPT OSS 120B | Community (via HF) | Open weights |
24
- | GPT OSS 20B | Community (via HF) | Open weights |
25
-
26
- All models were evaluated with the following settings:
27
-
28
- | Parameter | Value |
29
- |-----------|-------|
30
- | Temperature | 0.7 |
31
- | Max tokens | 16384 |
32
- | Retries | 3 (on API failures) |
33
-
34
- Reasoning models were allowed their default reasoning budgets.
35
-
36
- </Accordion>
37
-
38
  ### Rule Checking
39
 
40
 
 
 
1
  import Note from "../../../components/Note.astro";
2
  import Sidenote from "../../../components/Sidenote.astro";
3
 
4
  ## Appendix: Detailed Methods
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  ### Rule Checking
7
 
8
 
app/src/content/chapters/eleusis/benchmark.mdx CHANGED
@@ -4,21 +4,19 @@ import Sidenote from "../../../components/Sidenote.astro";
4
 
5
  ### The Original Game
6
 
7
- In the original Eleusis card game, one player acts as the "dealer" (sometimes called "God" or "Nature") and secretly invents a rule determining which cards can be legally played. The other players (called "scientists") don't know this rule, they must discover it through experimentation.
8
-
9
- Players take turns playing cards from their hand onto a central "mainline." If a card satisfies the secret rule, the dealer accepts it and it is added to the mainline. If it violates the rule, it's rejected and placed in a "sideline" below the mainline at that position. Over time, the pattern of accepted and rejected cards provides evidence about the hidden rule.
10
 
11
  <Sidenote>
12
  The name "Eleusis" comes from the ancient Greek mystery cult, where initiates gradually discovered hidden truths.
13
  </Sidenote>
14
 
15
- At any point, a player can attempt to guess the rule; correctly identifying it ends the game, but a wrong guess incurs a penalty. The game continues until someone correctly identifies the rule. A specific scoring system rewards efficiency in discovering the rule while penalizing reckless guessing.
16
 
17
  ### Our Adaptation
18
 
19
  We adapted Eleusis into a single-player benchmark focused purely on the scientific reasoning process. By removing multi-player dynamics, we isolate the core challenge: hypothesis formation and testing under uncertainty.
20
 
21
- The game uses a standard 52-card deck with ranks 1–13 (Ace through King) and four suits. A secret rule determines whether each card is accepted or rejected. It is a deterministic function that takes the card being played and the current sequence of accepted cards (the "mainline"). The player maintains a hand of 12 cards, drawing a replacement after each play.
22
 
23
  On each turn, the player selects a card from their hand to play. If the card satisfies the secret rule, it joins the mainline; if rejected, it's placed in a sideline below the mainline at that position. While playing a card, the player may attempt to guess the rule. The game continues until the player correctly identifies the rule or reaches 30 turns.
24
 
@@ -31,7 +29,7 @@ For instance, a player who correctly identifies the rule on turn 13 with no wron
31
  This creates an interesting tension: guessing early yields more points if correct, but wrong guesses are costly. The optimal strategy requires accurately assessing one's own confidence and acting accordingly.
32
 
33
  ### Rule Library
34
- In the original game, the dealer has to invent a secret rule on the spot. However, for benchmarking LLMs, we need a fixed set of rules to ensure comparability across model runs. We created a library of 26 hand-crafted rules spanning a range of types and complexity. Some rules involve simply card properties (e.g., "only red cards"), while others depend on the sequence of previously accepted cards (e.g., "card rank must be higher than previous card"). The rule might involve rank, suits, color or a combination thereof, and may include positional dependencies.
35
 
36
  Here are some example rules from our library, with a tentative categorization:
37
 
@@ -57,7 +55,8 @@ The model is free to reason, but it is asked to output a structured response con
57
  4. **Confidence level**: A self-reported probability (0–10 scale, where 7 means "I estimate 70% chance my tentative rule is correct");
58
  5. **Guess decision**: Whether to formally try to guess the rule this turn, or not.
59
 
60
- Example output
 
61
  ```
62
  {
63
  "reasoning_summary": "To test if the rule depends on rank, I play a 4♣ (same rank as the starter 4♠) hoping to see if same-rank cards are accepted.",
@@ -68,6 +67,12 @@ Example output
68
  }
69
  ```
70
 
71
- **This structure lets us analyze not just whether models succeed, but *how* they reason:** Do they update hypotheses appropriately when evidence contradicts them? Do they explore strategically or play conservatively? Is their stated confidence calibrated to their actual accuracy? In particular, forcing the model to articulate a tentative rule and a confidence level in it (even if they don't want to guess it yet) allows us to (secretely) evaluate it nonetheless, which will be useful for measuring calibration and guessing abilities.
 
 
 
 
 
 
72
 
73
 
 
4
 
5
  ### The Original Game
6
 
7
+ To recap the core mechanics: players take turns playing cards onto a central "mainline." If a card satisfies the secret rule, it is accepted; otherwise it's rejected and placed in a "sideline" below that position. At any point, a player can attempt to guess the rule—correctly identifying it ends the game, but a wrong guess incurs a penalty.
 
 
8
 
9
  <Sidenote>
10
  The name "Eleusis" comes from the ancient Greek mystery cult, where initiates gradually discovered hidden truths.
11
  </Sidenote>
12
 
13
+ The scoring system rewards efficiency: discovering the rule quickly earns more points, while wrong guesses are penalized.
14
 
15
  ### Our Adaptation
16
 
17
  We adapted Eleusis into a single-player benchmark focused purely on the scientific reasoning process. By removing multi-player dynamics, we isolate the core challenge: hypothesis formation and testing under uncertainty.
18
 
19
+ The game uses a standard 52-card deck with ranks 1–13 (Ace through King) and four suits. The secret rule is a deterministic function of the card being played and the current mainline sequence. The player maintains a hand of 12 cards, drawing a replacement after each play.
20
 
21
  On each turn, the player selects a card from their hand to play. If the card satisfies the secret rule, it joins the mainline; if rejected, it's placed in a sideline below the mainline at that position. While playing a card, the player may attempt to guess the rule. The game continues until the player correctly identifies the rule or reaches 30 turns.
22
 
 
29
  This creates an interesting tension: guessing early yields more points if correct, but wrong guesses are costly. The optimal strategy requires accurately assessing one's own confidence and acting accordingly.
30
 
31
  ### Rule Library
32
+ In the original game, the dealer invents a secret rule on the spot. For benchmarking LLMs, we need a fixed set of rules to ensure comparability across runs. We created a library of 26 hand-crafted rules designed to cover the space of rule types (static properties, sequential dependencies, cyclic patterns) while remaining tractable to evaluate. Some rules involve simple card properties (e.g., "only red cards"), while others depend on the sequence of previously accepted cards (e.g., "card rank must be higher than previous card"). The rule might involve rank, suits, color or a combination thereof, and may include positional dependencies.
33
 
34
  Here are some example rules from our library, with a tentative categorization:
35
 
 
55
  4. **Confidence level**: A self-reported probability (0–10 scale, where 7 means "I estimate 70% chance my tentative rule is correct");
56
  5. **Guess decision**: Whether to formally try to guess the rule this turn, or not.
57
 
58
+ #### Example output
59
+
60
  ```
61
  {
62
  "reasoning_summary": "To test if the rule depends on rank, I play a 4♣ (same rank as the starter 4♠) hoping to see if same-rank cards are accepted.",
 
67
  }
68
  ```
69
 
70
+ **This structure lets us analyze not just whether models succeed, but *how* they reason:**
71
+
72
+ - Do they update hypotheses appropriately when evidence contradicts them?
73
+ - Do they explore strategically or play conservatively?
74
+ - Is their stated confidence calibrated to their actual accuracy?
75
+
76
+ Forcing the model to articulate a tentative rule and confidence level (even when not formally guessing) allows us to secretly evaluate it at every turn—useful for measuring calibration.
77
 
78
 
app/src/content/chapters/eleusis/introduction.mdx CHANGED
@@ -3,17 +3,17 @@ import Image from "../../../components/Image.astro";
3
 
4
  import exampleSequence from "../../assets/image/example_sequence.png";
5
 
6
- Large language models are increasingly being deployed as tools for scientific research : analyzing data, generating hypotheses, and even designing experiments. But how well do they actually embody the scientific method?
7
 
8
  <Sidenote>
9
  Read time: 15–20 minutes.
10
  </Sidenote>
11
 
12
- Most reasoning benchmarks test whether models can solve well-defined problems: given premises, derive a conclusion. The ARC challenge [@chollet2019measure], for instance, evaluates inductive reasoning on visual patterns. **These benchmarks capture important capabilities, but they miss something fundamental about how science actually works.**
13
 
14
  First, real scientific reasoning is not a single inference step. It's an iterative agentic process of observation, hypothesis formation, experimentation, and refinement, often spanning many cycles before reaching a conclusion. It requires not just logical ability, but also *strategic thinking*: which experiment to run next, how much evidence is enough, when to commit to a theory versus when to keep exploring.
15
 
16
- Also, beyond pure reasoning, effective science depends on psychological factors that are rarely evaluated: **calibration** (does my confidence match my actual accuracy?) [@lichtenstein1977calibration], **metacognition** (how certain am I about my uncertainty?) [@flavell1979metacognition], and resistance to **cognitive biases** like confirmation bias (seeking only evidence that supports my current hypothesis instead of trying to challenge it) [@nickerson1998confirmation]. A scientist who is brilliant at deduction but overconfident in weak theories will waste resources pursuing dead ends. One who is well-calibrated but overly cautious may never publish.
17
 
18
  We wanted to test whether LLMs can exhibit these deeper aspects of scientific reasoning. To do this, we turned to an unlikely source: a 1950s card game called Eleusis.
19
 
@@ -33,4 +33,4 @@ Eleusis was designed by @abbott1977eleusis explicitly to simulate the process of
33
 
34
  We built a benchmark around Eleusis to evaluate LLMs on this iterative, hypothesis-driven reasoning. Rather than testing knowledge retrieval or instruction-following, our benchmark asks: *can models act like scientists?* Can they observe evidence, form hypotheses, design informative experiments, and refine their theories? Can they calibrate their confidence appropriately and know when they've gathered enough evidence to commit to a conclusion?
35
 
36
- These skills are fundamental not just to science, but to debugging code, medical diagnosis, and everyday reasoning under uncertainty.
 
3
 
4
  import exampleSequence from "../../assets/image/example_sequence.png";
5
 
6
+ Large language models are increasingly being deployed as tools for scientific research: analyzing data, generating hypotheses, and even designing experiments. But how well do they actually embody the scientific method?
7
 
8
  <Sidenote>
9
  Read time: 15–20 minutes.
10
  </Sidenote>
11
 
12
+ Most reasoning benchmarks test whether models can solve well-defined problems with clear solutions. The ARC challenge [@chollet2019measure], for instance, evaluates inductive reasoning on visual patterns. **These benchmarks capture important capabilities, but they miss something fundamental about how science actually works.**
13
 
14
  First, real scientific reasoning is not a single inference step. It's an iterative agentic process of observation, hypothesis formation, experimentation, and refinement, often spanning many cycles before reaching a conclusion. It requires not just logical ability, but also *strategic thinking*: which experiment to run next, how much evidence is enough, when to commit to a theory versus when to keep exploring.
15
 
16
+ Moreover, effective science depends on psychological factors that are rarely evaluated: **calibration** (does my confidence match my actual accuracy?) [@lichtenstein1977calibration], **metacognition** (how certain am I about my uncertainty?) [@flavell1979metacognition], and resistance to **cognitive biases** like confirmation bias (seeking only evidence that supports my current hypothesis instead of trying to challenge it) [@nickerson1998confirmation]. A scientist who is brilliant at deduction but overconfident in weak theories will waste resources pursuing dead ends. One who is well-calibrated but overly cautious may never publish.
17
 
18
  We wanted to test whether LLMs can exhibit these deeper aspects of scientific reasoning. To do this, we turned to an unlikely source: a 1950s card game called Eleusis.
19
 
 
33
 
34
  We built a benchmark around Eleusis to evaluate LLMs on this iterative, hypothesis-driven reasoning. Rather than testing knowledge retrieval or instruction-following, our benchmark asks: *can models act like scientists?* Can they observe evidence, form hypotheses, design informative experiments, and refine their theories? Can they calibrate their confidence appropriately and know when they've gathered enough evidence to commit to a conclusion?
35
 
36
+ These skills matter beyond the laboratory: debugging code, diagnosing patients, and navigating everyday uncertainty all require the same iterative process of hypothesis and test.
app/src/content/chapters/eleusis/results.mdx CHANGED
@@ -6,9 +6,28 @@ import HtmlEmbed from "../../../components/HtmlEmbed.astro";
6
 
7
  ## 2. Results
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  ### Overall Performance
10
 
11
- We evaluated ten models on the Eleusis benchmark, including both proprietary and open-weight models. Performance is measured as the average score per turn. We also report token usage (output + reasoning) per turn to compare efficiency.
12
 
13
  <HtmlEmbed
14
  src="overall-performance.html"
@@ -19,15 +38,15 @@ We evaluated ten models on the Eleusis benchmark, including both proprietary and
19
 
20
  Performance varies dramatically among tested models.
21
 
22
- * **Claude Opus 4.5** achieves top performance with 17.0 score and moderate token usage. The open-weight model **Kimi K2 Thinking** comes second at 16.2 and performs competitively with the best proprietary models (outperforming GPT 5.2 High and being close to Claude Opus 4.5), but at the price of a 2.5× larger reasoning budget.
23
 
24
- * **GPT 5.2 High** and **Grok 4.1 Fast Reasoning** show a similar performance around 15, but GPT 5.2 High is 3 times more token efficient.
25
 
26
- * **GPT-5-Mini**, **GPT OSS-120B** and **Gemini 3 Flash Preview Low** cluster in the mid-tier (around 13) with low token usage. While Deepseek R1, an open-weight model specialized for reasoning tasks, achieves a similar score but with a much larger token count.
27
 
28
- * Finally, **GPT-OSS 20B** and **Claude Haiku 4.5** lag behind, scoring between 11 and 12 with moderate token usage.
29
 
30
- As we mentionned, this score reflects not only the pure model's ability to find the correct rule, but also its metacognitive skills: knowing when to commit, how confident it is, and how to balance exploration vs. exploitation. To distinguish these factors, we also computed an alternative "no-stakes" score that removes penalties for wrong guesses and counts tentative rules as guesses.
31
 
32
  ### Pure discovery versus metacognition
33
 
@@ -47,7 +66,7 @@ Even if using this alternative scoring does not change a lot the relative rankin
47
  * GPT 5.2 High and Claude Haiku 4.5 are the two models with the largest difference between raw and no-stakes scores (more than 4), suggesting they are the most penalized by wrong guesses or delayed guessing.
48
  * On the other hand, Gemini 3 Flash Preview Low and Kimi K2 have the smallest difference (less than 3) and benefit the least from this alternative scoring, indicating a better balance between discovery and metacognition.
49
 
50
- They might be two reasons for the difference between the raw and the no-stakes scores:
51
  1. The model is reckless and makes a lot of wrong guesses, incurring penalties.
52
  2. The model is too cautious and waits too long before guessing, missing out on points.
53
 
@@ -67,14 +86,28 @@ To estimate caution, we can compute on average **how many turns a model waits wh
67
  src="caution-vs-failed-guesses.html"
68
  caption="<strong>Figure 3:</strong> The caution-recklessness trade-off. Models in the upper-left are cautious (delay correct guesses); models in the lower-right are reckless (many failed guesses). The ideal position is lower-left: quick to commit when right, rarely wrong."
69
  id="fig-caution-reckless"
 
70
  />
71
 
72
- How should we interpret those values ? Knowing that a failed guess costs 2 points, while each turn of delay costs 1 point, the optimal number of failed guesses per round should be around 0.5 (i.e., 1 failed guess every 2 rounds) to balance the two sources of loss. We can see that most models are above that threshold, indicating **a clear tendency towards recklessness**. This is confirmed by the fact that they have a low caution value (most models wait around 1 turn or less on average before guessing when they have the correct rule).
73
 
74
- On the other hand, **GPT 5.2 High has a singular behavior** with very few failed guesses (0.28 per round) but a high caution (waiting 3.5 turns on average before guessing when it has the correct rule). Gemini 3 Flash Preview Low and GPT 5 Mini Medium are intermediate in both dimensions, Gemini achieving a better balance with on average 2 points lost due to caution and 2 points lost due to recklessness (1 failed guess every round on average).
 
 
75
 
76
  To try to understand deeper the causes of recklessness and caution, we now turn to an analysis of confidence and guessing strategies.
77
 
 
 
 
 
 
 
 
 
 
 
 
78
  ### Confidence and Calibration
79
 
80
  Models are asked to output their confidence level, with clear instructions on what it means (7 = 70% probability of being correct, etc.). Even when they don't guess, they report their tentative rule. When confidence ≥5, we test whether they would have guessed correctly, even if they didn't formally attempt to do so. **This allows us to evaluate calibration: does reported confidence match actual accuracy?** This is particularly relevant as modern neural networks have been shown to be poorly calibrated [@guo2017calibration].
@@ -83,17 +116,18 @@ Models are asked to output their confidence level, with clear instructions on wh
83
  src="calibration-curves.html"
84
  caption="<strong>Figure 4:</strong> Calibration curves for each model (for reported confidence ≥5). A perfectly calibrated model would follow the diagonal. Points below the line indicate overconfidence: they correspond to confidence levels where actual success rates are lower than reported. Click legend items to show/hide models."
85
  id="fig-calibration"
 
86
  />
87
 
88
  The calibration analysis reveals several patterns:
89
 
90
- - **All models are very overconfident** : for instance when they report 80% confidence, their actual success rates are often closer to 20% !
91
- - GPT 5.2 is the best calibrated model overall, being the closest to the diagonal line, although it is still slightly overconfident.
92
- - Even models with a strong performance like Claude Opus 4.5 and Kimi K2 show significant overconfidence.
93
 
94
- Is overconfidence a problem ? In our setting, not necessarily; it depends on how the model decides to act on it.
95
 
96
- **For a perfectly calibrated model**, as the expected loss for a failed guess is twice the expected opportunity cost of waiting one turn, **the optimal confidence threshold for guessing is 0.67** (i.e., guess when you believe your tentative rule has at least a 67% chance of being correct). But do model follow such a strategy ?
97
 
98
  For this, we can look at how often models guess at each reported confidence level. This is shown in the following figure. For each confidence level (from 5 to 10), we compute the guess rate: the fraction of turns the model actually attempts to guess when reporting that confidence.
99
 
@@ -102,16 +136,37 @@ For this, we can look at how often models guess at each reported confidence leve
102
  src="guess-rate.html"
103
  caption="<strong>Figure 5:</strong> Guess rate per confidence level. The optimal decision theoretic curve for a perfectly calibrated model should be a step at 67%. Click legend items to show/hide models."
104
  id="fig-confidence"
 
105
  />
106
 
107
  Once again, we observe significant differences from one model to another. Grok 4.1 and Gemini 3 will essentially only guess when very confident (9 or 10). Most other models will also often guess at confidence levels above 8 and rarely below. The two Claude models show different behaviors: Claude Opus 4.5 tends to guess more aggressively at confidence level 8, while Claude Haiku 4.5 often guesses even at confidence level 7.
108
 
109
- We can see that **models on average are more cautious than the optimal decision-theoretic strategy** for a perfectly calibrated model, which would guess as soon as confidence exceeds 67%. This is somehow a good thing for them, given that all models are overconfident. **By raising the threshold for guessing, they reduce the risk of wrong guesses and compensate for their poor calibration.**
110
 
111
- This is particularly true for Gemini 3 Flash Preview Low which is very cautious, guessing only 1/3 of the time at reported confidence 9 ! This compensates its overconfidence, which is probably what helps it achieve a good balance between failed guesses and lost opportunity cost. This is reflected in our "no-stakes" analysis by the fact that it's the model with the smallest difference between raw and no-stakes scores.
112
 
113
  The case of GPT 5.2 High is different: it is both fairly well calibrated and very cautious, leading to very few failed guesses but a high opportunity cost due to delayed guessing. This suggests that GPT 5.2 High could improve its performance by being more aggressive in guessing once it has a correct tentative rule, especially at confidence level 8.
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  ### Performance by Rule Complexity
117
 
@@ -143,20 +198,46 @@ The following plot breaks down the success rate of each model per complexity qua
143
  />
144
 
145
 
146
- Interestingly, code complexity (as measured by our combination of cyclomatic complexity and AST node count) doesn't perfectly predict difficulty, as semantic concepts also play a role. For instance a rule like "only face cards" has a complexity equivalent to "only A, 2 and 3", but the former is easier for models (and humans !) due to familiarity with the semantic category of face cards.
147
 
148
- Also rules involving rare events (low acceptance rate). Only aces is harder than "only even ranks" despite being simpler, simply because models need more evidence to confirm it.
 
 
149
 
150
- An interesting test: are symmetric rules equally difficult? For example, "only spades" vs "only non-spades" should be logically equivalent in difficulty, but models might have biases.
151
- For instance average score on "only spades" is 25, but "no spades" is 20.
152
 
153
  ### Complexity of rules produced
154
 
155
- #### Overly Complex Rules
156
- Failure mode: models have a tendency to produce over complicated rules, even if they were informed that the rule is typically one sentence. They can produce tentative rules like "...".
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
- TODO : Backup this with examples from logs and "guess complexity" vs "actual complexity".
159
 
 
160
 
161
- #### Overfitting Rules
162
- We have observed qualitative evidence of model producing overfit rules that explain all observations so far, but fail to generalize. For instance if all accepted cards so far are red, and happens to be only number cards (simply because no red face card has been tried yet), the model may hypothesize "only red number cards" rather than the simpler "only red cards."
 
6
 
7
  ## 2. Results
8
 
9
+ ### Models Evaluated
10
+
11
+ We evaluated ten frontier models from six labs, including both proprietary and open-weight models. Open-weight models were accessed via Hugging Face inference providers. Several models offer configurable reasoning levels, which we indicate when applicable.
12
+
13
+ | Model | Lab | Provider | Reasoning |
14
+ |-------|-----|----------|-----------|
15
+ | Claude Opus 4.5 | Anthropic | Anthropic | 16000 tok. |
16
+ | Claude Haiku 4.5 | Anthropic | Anthropic | 16000 tok. |
17
+ | GPT 5.2 | OpenAI | OpenAI | High |
18
+ | GPT 5 Mini | OpenAI | OpenAI | Medium |
19
+ | Gemini 3 Flash Preview | Google | Google | Low |
20
+ | Grok 4.1 | xAI | xAI | Fast |
21
+ | Kimi K2 Thinking | Moonshot | 🤗 Inference providers | — |
22
+ | DeepSeek R1 | DeepSeek | 🤗 Inference providers | — |
23
+ | GPT OSS 120B | Community | 🤗 Inference providers | — |
24
+ | GPT OSS 20B | Community | 🤗 Inference providers | — |
25
+
26
+ All models were evaluated with temperature 0.7 and max tokens of 16,384. Reasoning models were allowed their default reasoning budgets. Each model played 78 rounds (26 rules × 3 seeds).
27
+
28
  ### Overall Performance
29
 
30
+ Performance is measured as the average score per round. We also report token usage (output + reasoning) per turn to compare efficiency.
31
 
32
  <HtmlEmbed
33
  src="overall-performance.html"
 
38
 
39
  Performance varies dramatically among tested models.
40
 
41
+ * **Claude Opus 4.5** achieves top performance with a score of 17.0 and moderate token usage. The open-weight model **Kimi K2** comes second at 16.2, performing competitively with the best proprietary models, but at the cost of a larger reasoning budget.
42
 
43
+ * **GPT 5.2 High** and **Grok 4.1 Fast Reasoning** show similar performance around 15, but GPT 5.2 High is 2 times more token-efficient.
44
 
45
+ * **GPT 5 Mini Medium**, **GPT OSS 120B**, and **Gemini 3 Flash Preview Low** cluster in the mid-tier (around 13) with low token usage. **DeepSeek R1**, an open-weight model specialized for reasoning tasks, achieves a similar score but with a much larger token count.
46
 
47
+ * Finally, **GPT OSS 20B** and **Claude Haiku 4.5** lag behind, scoring between 11 and 12 with moderate token usage.
48
 
49
+ As mentioned earlier, this score reflects not only the model's ability to find the correct rule, but also its metacognitive skills: knowing when to commit, how confident to be, and how to balance exploration versus exploitation. To distinguish these factors, we computed an alternative "no-stakes" score that removes penalties for wrong guesses and counts tentative rules as guesses.
50
 
51
  ### Pure discovery versus metacognition
52
 
 
66
  * GPT 5.2 High and Claude Haiku 4.5 are the two models with the largest difference between raw and no-stakes scores (more than 4), suggesting they are the most penalized by wrong guesses or delayed guessing.
67
  * On the other hand, Gemini 3 Flash Preview Low and Kimi K2 have the smallest difference (less than 3) and benefit the least from this alternative scoring, indicating a better balance between discovery and metacognition.
68
 
69
+ There are two possible reasons for the gap between raw and no-stakes scores:
70
  1. The model is reckless and makes a lot of wrong guesses, incurring penalties.
71
  2. The model is too cautious and waits too long before guessing, missing out on points.
72
 
 
86
  src="caution-vs-failed-guesses.html"
87
  caption="<strong>Figure 3:</strong> The caution-recklessness trade-off. Models in the upper-left are cautious (delay correct guesses); models in the lower-right are reckless (many failed guesses). The ideal position is lower-left: quick to commit when right, rarely wrong."
88
  id="fig-caution-reckless"
89
+ wide
90
  />
91
 
92
+ How should we interpret these values? A failed guess costs 2 points, while each turn of delay costs 1 point, so the optimal number of failed guesses per round should be around 0.5 (one failed guess every two rounds) to balance both sources of loss. Most models exceed this threshold, indicating **a clear tendency towards recklessness**. This is confirmed by low caution values: most models wait around 1 turn or less on average before guessing when they have the correct rule.
93
 
94
+ **GPT 5.2 High stands out** with very few failed guesses (0.28 per round) but high caution—waiting 3.5 turns on average before guessing when it has the correct rule.
95
+
96
+ Gemini 3 Flash Preview Low and GPT 5 Mini Medium occupy an intermediate position. Gemini achieves a better balance, losing on average 2 points to caution and 2 points to recklessness (about one failed guess per round).
97
 
98
  To try to understand deeper the causes of recklessness and caution, we now turn to an analysis of confidence and guessing strategies.
99
 
100
+ A way to summarize this behavior is to compute a **boldness index** as the difference between the points lost by being reckless (failed guesses) and the points lost by being cautious (delayed correct guesses). A positive value indicates more loss due to recklessness, while a negative value indicates more loss due to caution. This is reported in the following chart.
101
+
102
+ <HtmlEmbed
103
+ src="score-vs-recklessness.html"
104
+ caption="<strong>Figure 3b:</strong> Score vs Boldness Index. The boldness index combines failed guesses and caution into a single metric (2 × failed guesses − caution). Models in the center have a decision strategy that balances recklessness and caution. Models on the left are losing points because of their excessive caution, while models on the right are losing points because of their recklessness."
105
+ id="fig-score-recklessness"
106
+ wide
107
+ />
108
+
109
+ A way to understand this chart is in terms of missed opportunity. Models in the center achieve a good balance between recklessness and caution, minimizing lost points. They perform at the best level their inductive abilities permit. Models on the left are too cautious, missing out on points by delaying correct guesses. At identical inductive ability, they could improve their score by guessing earlier. Models on the right are too reckless, losing points from frequent wrong guesses. At identical inductive ability, they could improve their score by being more cautious and guessing less often.
110
+
111
  ### Confidence and Calibration
112
 
113
  Models are asked to output their confidence level, with clear instructions on what it means (7 = 70% probability of being correct, etc.). Even when they don't guess, they report their tentative rule. When confidence ≥5, we test whether they would have guessed correctly, even if they didn't formally attempt to do so. **This allows us to evaluate calibration: does reported confidence match actual accuracy?** This is particularly relevant as modern neural networks have been shown to be poorly calibrated [@guo2017calibration].
 
116
  src="calibration-curves.html"
117
  caption="<strong>Figure 4:</strong> Calibration curves for each model (for reported confidence ≥5). A perfectly calibrated model would follow the diagonal. Points below the line indicate overconfidence: they correspond to confidence levels where actual success rates are lower than reported. Click legend items to show/hide models."
118
  id="fig-calibration"
119
+ wide
120
  />
121
 
122
  The calibration analysis reveals several patterns:
123
 
124
+ - **All models are overconfident**: when they report 80% confidence, their actual success rates are often closer to 20%.
125
+ - GPT 5.2 High is the best-calibrated model overall, staying closest to the diagonal, though still slightly overconfident.
126
+ - Even strong performers like Claude Opus 4.5 and Kimi K2 show significant overconfidence.
127
 
128
+ Is overconfidence a problem? In our setting, not necessarily—it depends on how the model acts on it.
129
 
130
+ **For a perfectly calibrated model**, since the expected loss from a failed guess is twice the opportunity cost of waiting one turn, **the optimal confidence threshold for guessing is 0.67** (guess when you believe your tentative rule has at least a 67% chance of being correct). But do models follow such a strategy?
131
 
132
  For this, we can look at how often models guess at each reported confidence level. This is shown in the following figure. For each confidence level (from 5 to 10), we compute the guess rate: the fraction of turns the model actually attempts to guess when reporting that confidence.
133
 
 
136
  src="guess-rate.html"
137
  caption="<strong>Figure 5:</strong> Guess rate per confidence level. The optimal decision theoretic curve for a perfectly calibrated model should be a step at 67%. Click legend items to show/hide models."
138
  id="fig-confidence"
139
+ wide
140
  />
141
 
142
  Once again, we observe significant differences from one model to another. Grok 4.1 and Gemini 3 will essentially only guess when very confident (9 or 10). Most other models will also often guess at confidence levels above 8 and rarely below. The two Claude models show different behaviors: Claude Opus 4.5 tends to guess more aggressively at confidence level 8, while Claude Haiku 4.5 often guesses even at confidence level 7.
143
 
144
+ **Models are on average more cautious than the optimal decision-theoretic strategy** for a perfectly calibrated model, which would guess as soon as confidence exceeds 67%. This actually benefits them, given their overconfidence: **by raising the threshold for guessing, they reduce the risk of wrong guesses and compensate for their poor calibration.**
145
 
146
+ This is particularly true for Gemini 3 Flash Preview Low, which is very cautious, guessing only 1/3 of the time at reported confidence 9. This compensates for its overconfidence and likely explains its good balance between failed guesses and lost opportunity cost—reflected in our "no-stakes" analysis by its having the smallest gap between raw and no-stakes scores.
147
 
148
  The case of GPT 5.2 High is different: it is both fairly well calibrated and very cautious, leading to very few failed guesses but a high opportunity cost due to delayed guessing. This suggests that GPT 5.2 High could improve its performance by being more aggressive in guessing once it has a correct tentative rule, especially at confidence level 8.
149
 
150
+ ### Reasoning effort vs turn count
151
+
152
+ To see whether models tend to think more per turn when the round is longer, we plotted the average number of output tokens per turn.
153
+
154
+ <HtmlEmbed
155
+ src="tokens-by-turn.html"
156
+ caption="<strong>Figure 5b:</strong> Average output tokens per turn across the game (in the 'no-stakes' counting scenario where all the rounds will last up to 30 turns). Each line shows how a model's reasoning effort evolves as the round progresses. Click legend items to show/hide models. Note: sample sizes decrease for later turns as games that end early don't contribute data."
157
+ id="fig-tokens-by-turn"
158
+ wide
159
+ />
160
+
161
+ The patterns reveal striking differences in how models allocate reasoning effort:
162
+
163
+ - Most models show a gradual increase in reasoning effort (token usage) as the turn number increases.
164
+
165
+ - **Grok 4.1 Fast Reasoning** stands out with dramatically increasing token usage, starting around 1,200 tokens per turn and reaching over 20,000 by turn 30. This suggests the model invests more reasoning effort as problems become harder to solve.
166
+
167
+ - **Gemini 3 Flash Preview Low** maintains remarkably flat token usage throughout, staying around 1,000-1,400 tokens regardless of turn number. This suggests a consistent, lightweight reasoning approach that doesn't scale with problem difficulty.
168
+
169
+ The general upward trend makes sense: later turns only occur in harder games where the rule hasn't been found yet, requiring more extensive reasoning. However, the magnitude of increase varies widely, from Gemini's flat profile to Grok's 15x increase.
170
 
171
  ### Performance by Rule Complexity
172
 
 
198
  />
199
 
200
 
201
+ Interestingly, code complexity (as measured by our combination of cyclomatic complexity and AST node count) doesn't perfectly predict difficulty, as semantic concepts also play a role. A rule like "only face cards" has complexity equivalent to "only A, 2 and 3", but the former is easier for models (and humans) due to familiarity with the semantic category of face cards.
202
 
203
+ Rules involving rare events also prove challenging. "Only aces" is harder than "only even ranks" despite being simpler, because models need more evidence to confirm it.
204
+
205
+ This raises an interesting question: are symmetric rules equally difficult? Logically, "only spades" and "no spades" should be equivalent in difficulty, but models might have biases. Indeed, the average score on "only spades" is 25, while "no spades" scores only 20.
206
 
 
 
207
 
208
  ### Complexity of rules produced
209
 
210
+ One common failure mode we observed is that models tend to produce overly complicated tentative rules, even though they were informed that rules are typically simple one-sentence statements. They also produce rules that fit all observed data so far, but fail to generalize to new cards because they are more complex than necessary.
211
+
212
+ As an illustration, here is an example of a tentative rule produced by Claude Haiku 4.5. The mainline state was as follows (rejected cards are in parentheses):
213
+
214
+ <p style={{fontStyle: 'italic', padding: '0.5em 1em', borderLeft: '3px solid var(--border-color)'}}>
215
+ 6♠ <span style={{color: '#e53935'}}>6♦</span> 9♠ (<span style={{color: '#e53935'}}>Q♥</span>) <span style={{color: '#e53935'}}>9♦</span> (9♣) 7♠ (<span style={{color: '#e53935'}}>5♦</span>) (<span style={{color: '#e53935'}}>J♦</span>) (<span style={{color: '#e53935'}}>A♦</span>) (<span style={{color: '#e53935'}}>Q♦</span>) (<span style={{color: '#e53935'}}>2♦</span>) (<span style={{color: '#e53935'}}>4♦</span>) (<span style={{color: '#e53935'}}>9♦</span>) (8♠) (A♠) (<span style={{color: '#e53935'}}>10♥</span>) (<span style={{color: '#e53935'}}>J♦</span>) (<span style={{color: '#e53935'}}>9♥</span>) <span style={{color: '#e53935'}}>7♦</span> 9♠ (<span style={{color: '#e53935'}}>A♥</span>) (<span style={{color: '#e53935'}}>8♥</span>)
216
+ </p>
217
+
218
+ The actual rule was *"Rank repeats in pairs"*. The tentative rule proposed by Haiku 4.5 at this stage of the game was:
219
+
220
+ > "Odd-positioned mainline cards must be spades, even-positioned mainline cards must be diamonds. Consecutive pairs of positions must have matching ranks. Additionally, each rank (6, 7, 9) can appear only twice on the mainline, meaning position 8 must be a diamond with a rank different from 6, 7, and 9, or the pattern breaks at position 8 with new rules."
221
+
222
+ This is overly complicated compared to the actual rule, and as you can read, it contains the actual rule "Consecutive pairs of positions must have matching ranks" but adds unnecessary constraints about suits and counts that do not generalize.
223
+
224
+ To quantify this, we computed the **complexity ratio**: the complexity of the model's tentative rule divided by the actual rule complexity, using the same code-based metric described above.
225
+
226
+ <HtmlEmbed
227
+ src="complexity-ratio.html"
228
+ caption="<strong>Figure 8:</strong> Median complexity ratio of tentative rules vs actual rules. A ratio > 1 indicates the model overcomplicates (hypothesizes more complex rules than necessary); < 1 indicates oversimplification. Whiskers show interquartile range. Only tentative rules with confidence ≥ 5 are included."
229
+ id="fig-complexity-ratio"
230
+ wide
231
+ />
232
+
233
+ The results reveal a clear tendency toward overcomplication among several models:
234
+
235
+ - **GPT OSS 120B and GPT OSS 20B** stand out with median ratios of 1.32 and 1.15 respectively, consistently hypothesizing more complex rules than necessary.
236
+ - **Claude Haiku 4.5** also tends to overcomplicate slightly (1.05) on average, but with high variance and many tentative rules being much more complex than needed.
237
+ - **Claude Opus 4.5, GPT 5.2 and Kimi K2** are the best calibrated, with median ratios close to 1.0 and moderate variance, suggesting they match rule complexity most accurately.
238
+ - Most models cluster around 1.0, indicating reasonable complexity calibration on average, but the wide interquartile ranges show substantial variation across individual games.
239
 
 
240
 
241
+ ### Summary
242
 
243
+ Our evaluation reveals substantial variation in how models approach the Eleusis task. Claude Opus 4.5 leads in overall performance, followed closely by the open-weight Kimi K2. All models exhibit overconfidence—reporting higher certainty than their accuracy warrants—but they partially compensate by being more cautious than decision theory would recommend. The caution-recklessness trade-off varies dramatically: GPT 5.2 High is extremely cautious (high success rate but slow to commit), while Claude Haiku 4.5 and DeepSeek R1 are reckless (many failed guesses). Rule complexity matters, but semantic familiarity and evidence availability also influence difficulty. Finally, models tend to overcomplicate their hypotheses—particularly the open-weight GPT OSS models—while Claude Opus 4.5, GPT 5.2 High, and Kimi K2 best match actual rule complexity.
 
app/src/content/embeds/banner-bar-chart.html ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="d3-banner-bar"></div>
2
+ <style>
3
+ .d3-banner-bar {
4
+ width: 100%;
5
+ margin: 10px 0;
6
+ position: relative;
7
+ font-family: system-ui, -apple-system, sans-serif;
8
+ }
9
+
10
+ .d3-banner-bar svg {
11
+ display: block;
12
+ width: 100%;
13
+ height: auto;
14
+ }
15
+
16
+ .d3-banner-bar .axes path,
17
+ .d3-banner-bar .axes line {
18
+ stroke: var(--axis-color, var(--text-color));
19
+ }
20
+
21
+ .d3-banner-bar .axes text {
22
+ fill: var(--tick-color, var(--muted-color));
23
+ font-size: 12px;
24
+ }
25
+
26
+ .d3-banner-bar .grid line {
27
+ stroke: var(--grid-color, rgba(0,0,0,.08));
28
+ }
29
+
30
+ .d3-banner-bar .axes text.axis-label {
31
+ font-size: 14px;
32
+ font-weight: 500;
33
+ fill: var(--text-color);
34
+ }
35
+
36
+ .d3-banner-bar .model-label {
37
+ font-size: 13px;
38
+ font-weight: 500;
39
+ }
40
+
41
+ .d3-banner-bar .bar {
42
+ cursor: pointer;
43
+ transition: opacity 0.15s ease;
44
+ }
45
+
46
+ .d3-banner-bar .bar:hover {
47
+ opacity: 0.8;
48
+ }
49
+
50
+ .d3-banner-bar .score-label {
51
+ font-size: 12px;
52
+ font-weight: 500;
53
+ fill: var(--text-color);
54
+ }
55
+
56
+ .d3-banner-bar .d3-tooltip {
57
+ position: absolute;
58
+ top: 0;
59
+ left: 0;
60
+ transform: translate(-9999px, -9999px);
61
+ pointer-events: none;
62
+ padding: 10px 12px;
63
+ border-radius: 8px;
64
+ font-size: 12px;
65
+ line-height: 1.4;
66
+ border: 1px solid var(--border-color);
67
+ background: var(--surface-bg);
68
+ color: var(--text-color);
69
+ box-shadow: 0 4px 24px rgba(0,0,0,.18);
70
+ opacity: 0;
71
+ transition: opacity 0.12s ease;
72
+ z-index: 10;
73
+ }
74
+
75
+ .d3-banner-bar .d3-tooltip .model-name {
76
+ font-weight: 600;
77
+ margin-bottom: 4px;
78
+ }
79
+
80
+ .d3-banner-bar .d3-tooltip .metric {
81
+ display: flex;
82
+ justify-content: space-between;
83
+ gap: 16px;
84
+ }
85
+
86
+ .d3-banner-bar .d3-tooltip .metric-label {
87
+ color: var(--muted-color);
88
+ }
89
+
90
+ .d3-banner-bar .d3-tooltip .metric-value {
91
+ font-weight: 500;
92
+ }
93
+ </style>
94
+ <script>
95
+ (() => {
96
+ const ensureD3 = (cb) => {
97
+ if (window.d3 && typeof window.d3.select === 'function') return cb();
98
+ let s = document.getElementById('d3-cdn-script');
99
+ if (!s) {
100
+ s = document.createElement('script');
101
+ s.id = 'd3-cdn-script';
102
+ s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
103
+ document.head.appendChild(s);
104
+ }
105
+ const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); };
106
+ s.addEventListener('load', onReady, { once: true });
107
+ if (window.d3) onReady();
108
+ };
109
+
110
+ const bootstrap = () => {
111
+ const scriptEl = document.currentScript;
112
+ let container = scriptEl ? scriptEl.previousElementSibling : null;
113
+ if (!(container && container.classList && container.classList.contains('d3-banner-bar'))) {
114
+ const candidates = Array.from(document.querySelectorAll('.d3-banner-bar'))
115
+ .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
116
+ container = candidates[candidates.length - 1] || null;
117
+ }
118
+ if (!container) return;
119
+ if (container.dataset) {
120
+ if (container.dataset.mounted === 'true') return;
121
+ container.dataset.mounted = 'true';
122
+ }
123
+
124
+ // Tooltip setup
125
+ container.style.position = container.style.position || 'relative';
126
+ const tip = document.createElement('div');
127
+ tip.className = 'd3-tooltip';
128
+ container.appendChild(tip);
129
+
130
+ // SVG setup
131
+ const svg = d3.select(container).append('svg');
132
+ const gRoot = svg.append('g');
133
+
134
+ // Chart groups
135
+ const gGrid = gRoot.append('g').attr('class', 'grid');
136
+ const gAxes = gRoot.append('g').attr('class', 'axes');
137
+ const gBars = gRoot.append('g').attr('class', 'bars');
138
+ const gLabels = gRoot.append('g').attr('class', 'labels');
139
+
140
+ // State
141
+ let data = null;
142
+ let width = 800;
143
+ let height = 450;
144
+ const margin = { top: 20, right: 60, bottom: 40, left: 20 };
145
+
146
+ // Scales
147
+ const xScale = d3.scaleLinear();
148
+ const yScale = d3.scaleBand();
149
+
150
+ // Data loading
151
+ const JSON_PATHS = [
152
+ '/data/overall_performance.json',
153
+ './assets/figures/overall_performance.json',
154
+ '../assets/figures/overall_performance.json',
155
+ '../../assets/figures/overall_performance.json'
156
+ ];
157
+
158
+ const fetchFirstAvailable = async (paths) => {
159
+ for (const p of paths) {
160
+ try {
161
+ const r = await fetch(p, { cache: 'no-cache' });
162
+ if (r.ok) return await r.json();
163
+ } catch (_) {}
164
+ }
165
+ throw new Error('Data not found');
166
+ };
167
+
168
+ function updateSize() {
169
+ width = container.clientWidth || 800;
170
+ // Height based on number of bars (will be set after data loads)
171
+ const numModels = data ? data.models.length : 10;
172
+ const barHeight = 36;
173
+ height = margin.top + margin.bottom + numModels * barHeight;
174
+ svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
175
+ gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
176
+ return {
177
+ innerWidth: width - margin.left - margin.right,
178
+ innerHeight: height - margin.top - margin.bottom
179
+ };
180
+ }
181
+
182
+ function showTooltip(event, d) {
183
+ const rect = container.getBoundingClientRect();
184
+ const x = event.clientX - rect.left;
185
+ const y = event.clientY - rect.top;
186
+
187
+ tip.innerHTML = `
188
+ <div class="model-name" style="color: ${d.color}">${d.name}</div>
189
+ <div class="metric">
190
+ <span class="metric-label">Score:</span>
191
+ <span class="metric-value">${d.avg_floored_score.toFixed(2)}</span>
192
+ </div>
193
+ <div class="metric">
194
+ <span class="metric-label">Tokens/Turn:</span>
195
+ <span class="metric-value">${Math.round(d.avg_output_tokens_per_turn).toLocaleString()}</span>
196
+ </div>
197
+ <div class="metric">
198
+ <span class="metric-label">Type:</span>
199
+ <span class="metric-value">${d.is_open ? 'Open' : 'Closed'}</span>
200
+ </div>
201
+ `;
202
+
203
+ const tipWidth = tip.offsetWidth || 150;
204
+ const tipHeight = tip.offsetHeight || 80;
205
+ let tipX = x + 12;
206
+ let tipY = y - tipHeight / 2;
207
+
208
+ if (tipX + tipWidth > width) tipX = x - tipWidth - 12;
209
+ if (tipY < 0) tipY = 8;
210
+ if (tipY + tipHeight > height) tipY = height - tipHeight - 8;
211
+
212
+ tip.style.transform = `translate(${tipX}px, ${tipY}px)`;
213
+ tip.style.opacity = '1';
214
+ }
215
+
216
+ function hideTooltip() {
217
+ tip.style.opacity = '0';
218
+ tip.style.transform = 'translate(-9999px, -9999px)';
219
+ }
220
+
221
+ // Calculate relative luminance and return black or white for best contrast
222
+ function getContrastColor(hexColor) {
223
+ const hex = hexColor.replace('#', '');
224
+ const r = parseInt(hex.substr(0, 2), 16) / 255;
225
+ const g = parseInt(hex.substr(2, 2), 16) / 255;
226
+ const b = parseInt(hex.substr(4, 2), 16) / 255;
227
+ // Relative luminance formula
228
+ const luminance = 0.299 * r + 0.587 * g + 0.114 * b;
229
+ return luminance > 0.5 ? '#000000' : '#ffffff';
230
+ }
231
+
232
+ function render() {
233
+ if (!data) return;
234
+
235
+ const { innerWidth, innerHeight } = updateSize();
236
+
237
+ // Sort models by score descending
238
+ const models = [...data.models].sort((a, b) => b.avg_floored_score - a.avg_floored_score);
239
+
240
+ // Update scales
241
+ xScale
242
+ .domain([0, d3.max(models, d => d.avg_floored_score) * 1.05])
243
+ .range([0, innerWidth])
244
+ .nice();
245
+
246
+ yScale
247
+ .domain(models.map(d => d.name))
248
+ .range([0, innerHeight])
249
+ .padding(0.25);
250
+
251
+ // Grid lines (vertical)
252
+ const xTicks = xScale.ticks(6);
253
+ gGrid.selectAll('.grid-x')
254
+ .data(xTicks)
255
+ .join('line')
256
+ .attr('class', 'grid-x')
257
+ .attr('x1', d => xScale(d))
258
+ .attr('x2', d => xScale(d))
259
+ .attr('y1', 0)
260
+ .attr('y2', innerHeight);
261
+
262
+ // X-axis (bottom)
263
+ gAxes.selectAll('.x-axis')
264
+ .data([0])
265
+ .join('g')
266
+ .attr('class', 'x-axis')
267
+ .attr('transform', `translate(0,${innerHeight})`)
268
+ .call(d3.axisBottom(xScale).ticks(6).tickSizeOuter(0));
269
+
270
+ // X-axis label
271
+ gAxes.selectAll('.x-label')
272
+ .data([0])
273
+ .join('text')
274
+ .attr('class', 'x-label axis-label')
275
+ .attr('x', innerWidth / 2)
276
+ .attr('y', innerHeight + 34)
277
+ .attr('text-anchor', 'middle')
278
+ .text('Average Score');
279
+
280
+ // Bars
281
+ const barHeight = yScale.bandwidth();
282
+ gBars.selectAll('.bar')
283
+ .data(models, d => d.name)
284
+ .join('rect')
285
+ .attr('class', 'bar')
286
+ .attr('x', 0)
287
+ .attr('y', d => yScale(d.name))
288
+ .attr('width', d => xScale(d.avg_floored_score))
289
+ .attr('height', barHeight)
290
+ .attr('fill', d => d.color)
291
+ .attr('rx', 3)
292
+ .attr('ry', 3)
293
+ .on('mouseenter', showTooltip)
294
+ .on('mousemove', showTooltip)
295
+ .on('mouseleave', hideTooltip);
296
+
297
+ // Model labels (inside bars)
298
+ gLabels.selectAll('.model-label')
299
+ .data(models, d => d.name)
300
+ .join('text')
301
+ .attr('class', 'model-label')
302
+ .attr('x', 8)
303
+ .attr('y', d => yScale(d.name) + barHeight / 2)
304
+ .attr('dy', '0.35em')
305
+ .attr('text-anchor', 'start')
306
+ .style('fill', d => getContrastColor(d.color))
307
+ .text(d => d.name);
308
+
309
+ // Score labels (end of bars)
310
+ gLabels.selectAll('.score-label')
311
+ .data(models, d => d.name)
312
+ .join('text')
313
+ .attr('class', 'score-label')
314
+ .attr('x', d => xScale(d.avg_floored_score) + 6)
315
+ .attr('y', d => yScale(d.name) + barHeight / 2)
316
+ .attr('dy', '0.35em')
317
+ .attr('text-anchor', 'start')
318
+ .text(d => d.avg_floored_score.toFixed(1));
319
+ }
320
+
321
+ // Initialize
322
+ fetchFirstAvailable(JSON_PATHS)
323
+ .then(json => {
324
+ data = json;
325
+ render();
326
+ })
327
+ .catch(err => {
328
+ const pre = document.createElement('pre');
329
+ pre.style.color = 'red';
330
+ pre.style.padding = '16px';
331
+ pre.textContent = `Error loading data: ${err.message}`;
332
+ container.appendChild(pre);
333
+ });
334
+
335
+ // Resize handling
336
+ if (window.ResizeObserver) {
337
+ new ResizeObserver(() => render()).observe(container);
338
+ } else {
339
+ window.addEventListener('resize', render);
340
+ }
341
+
342
+ // Theme change handling
343
+ const observer = new MutationObserver(() => render());
344
+ observer.observe(document.documentElement, {
345
+ attributes: true,
346
+ attributeFilter: ['data-theme']
347
+ });
348
+ };
349
+
350
+ if (document.readyState === 'loading') {
351
+ document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
352
+ } else {
353
+ ensureD3(bootstrap);
354
+ }
355
+ })();
356
+ </script>
app/src/content/embeds/banner.html CHANGED
@@ -1,59 +1,64 @@
1
- <div class="d3-banner-bar"></div>
2
  <style>
3
- .d3-banner-bar {
4
  width: 100%;
5
  margin: 10px 0;
6
  position: relative;
7
  font-family: system-ui, -apple-system, sans-serif;
8
  }
9
 
10
- .d3-banner-bar svg {
11
  display: block;
12
  width: 100%;
13
  height: auto;
14
  }
15
 
16
- .d3-banner-bar .axes path,
17
- .d3-banner-bar .axes line {
18
  stroke: var(--axis-color, var(--text-color));
19
  }
20
 
21
- .d3-banner-bar .axes text {
22
  fill: var(--tick-color, var(--muted-color));
23
- font-size: 12px;
24
  }
25
 
26
- .d3-banner-bar .grid line {
27
- stroke: var(--grid-color, rgba(0,0,0,.08));
28
  }
29
 
30
- .d3-banner-bar .axes text.axis-label {
31
- font-size: 14px;
32
  font-weight: 500;
33
  fill: var(--text-color);
34
  }
35
 
36
- .d3-banner-bar .model-label {
37
- font-size: 13px;
38
- font-weight: 500;
39
  }
40
 
41
- .d3-banner-bar .bar {
42
  cursor: pointer;
43
  transition: opacity 0.15s ease;
44
  }
45
 
46
- .d3-banner-bar .bar:hover {
47
  opacity: 0.8;
48
  }
49
 
50
- .d3-banner-bar .score-label {
51
- font-size: 12px;
52
- font-weight: 500;
53
  fill: var(--text-color);
 
 
 
 
 
 
 
54
  }
55
 
56
- .d3-banner-bar .d3-tooltip {
57
  position: absolute;
58
  top: 0;
59
  left: 0;
@@ -72,22 +77,22 @@
72
  z-index: 10;
73
  }
74
 
75
- .d3-banner-bar .d3-tooltip .model-name {
76
  font-weight: 600;
77
  margin-bottom: 4px;
78
  }
79
 
80
- .d3-banner-bar .d3-tooltip .metric {
81
  display: flex;
82
  justify-content: space-between;
83
  gap: 16px;
84
  }
85
 
86
- .d3-banner-bar .d3-tooltip .metric-label {
87
  color: var(--muted-color);
88
  }
89
 
90
- .d3-banner-bar .d3-tooltip .metric-value {
91
  font-weight: 500;
92
  }
93
  </style>
@@ -110,8 +115,8 @@
110
  const bootstrap = () => {
111
  const scriptEl = document.currentScript;
112
  let container = scriptEl ? scriptEl.previousElementSibling : null;
113
- if (!(container && container.classList && container.classList.contains('d3-banner-bar'))) {
114
- const candidates = Array.from(document.querySelectorAll('.d3-banner-bar'))
115
  .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
116
  container = candidates[candidates.length - 1] || null;
117
  }
@@ -129,48 +134,62 @@
129
 
130
  // SVG setup
131
  const svg = d3.select(container).append('svg');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  const gRoot = svg.append('g');
133
 
134
- // Chart groups
 
135
  const gGrid = gRoot.append('g').attr('class', 'grid');
136
  const gAxes = gRoot.append('g').attr('class', 'axes');
137
- const gBars = gRoot.append('g').attr('class', 'bars');
 
138
  const gLabels = gRoot.append('g').attr('class', 'labels');
139
 
140
  // State
141
  let data = null;
142
  let width = 800;
143
  let height = 450;
144
- const margin = { top: 20, right: 60, bottom: 40, left: 20 };
145
 
146
  // Scales
147
  const xScale = d3.scaleLinear();
148
- const yScale = d3.scaleBand();
149
 
150
  // Data loading
151
- const JSON_PATHS = [
152
- '/data/overall_performance.json',
153
- './assets/figures/overall_performance.json',
154
- '../assets/figures/overall_performance.json',
155
- '../../assets/figures/overall_performance.json'
156
- ];
157
-
158
- const fetchFirstAvailable = async (paths) => {
159
- for (const p of paths) {
160
- try {
161
- const r = await fetch(p, { cache: 'no-cache' });
162
- if (r.ok) return await r.json();
163
- } catch (_) {}
164
  }
165
- throw new Error('Data not found');
166
  };
167
 
168
  function updateSize() {
169
  width = container.clientWidth || 800;
170
- // Height based on number of bars (will be set after data loads)
171
- const numModels = data ? data.models.length : 10;
172
- const barHeight = 36;
173
- height = margin.top + margin.bottom + numModels * barHeight;
174
  svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
175
  gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
176
  return {
@@ -188,11 +207,19 @@
188
  <div class="model-name" style="color: ${d.color}">${d.name}</div>
189
  <div class="metric">
190
  <span class="metric-label">Score:</span>
191
- <span class="metric-value">${d.avg_floored_score.toFixed(2)}</span>
192
  </div>
193
  <div class="metric">
194
- <span class="metric-label">Tokens/Turn:</span>
195
- <span class="metric-value">${Math.round(d.avg_output_tokens_per_turn).toLocaleString()}</span>
 
 
 
 
 
 
 
 
196
  </div>
197
  <div class="metric">
198
  <span class="metric-label">Type:</span>
@@ -200,8 +227,8 @@
200
  </div>
201
  `;
202
 
203
- const tipWidth = tip.offsetWidth || 150;
204
- const tipHeight = tip.offsetHeight || 80;
205
  let tipX = x + 12;
206
  let tipY = y - tipHeight / 2;
207
 
@@ -218,38 +245,40 @@
218
  tip.style.transform = 'translate(-9999px, -9999px)';
219
  }
220
 
221
- // Calculate relative luminance and return black or white for best contrast
222
- function getContrastColor(hexColor) {
223
- const hex = hexColor.replace('#', '');
224
- const r = parseInt(hex.substr(0, 2), 16) / 255;
225
- const g = parseInt(hex.substr(2, 2), 16) / 255;
226
- const b = parseInt(hex.substr(4, 2), 16) / 255;
227
- // Relative luminance formula
228
- const luminance = 0.299 * r + 0.587 * g + 0.114 * b;
229
- return luminance > 0.5 ? '#000000' : '#ffffff';
230
- }
231
-
232
  function render() {
233
  if (!data) return;
234
 
235
  const { innerWidth, innerHeight } = updateSize();
 
236
 
237
- // Sort models by score descending
238
- const models = [...data.models].sort((a, b) => b.avg_floored_score - a.avg_floored_score);
239
-
240
- // Update scales
241
  xScale
242
- .domain([0, d3.max(models, d => d.avg_floored_score) * 1.05])
243
- .range([0, innerWidth])
244
- .nice();
245
 
 
 
 
246
  yScale
247
- .domain(models.map(d => d.name))
248
- .range([0, innerHeight])
249
- .padding(0.25);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
- // Grid lines (vertical)
252
- const xTicks = xScale.ticks(6);
253
  gGrid.selectAll('.grid-x')
254
  .data(xTicks)
255
  .join('line')
@@ -259,67 +288,125 @@
259
  .attr('y1', 0)
260
  .attr('y2', innerHeight);
261
 
262
- // X-axis (bottom)
 
 
 
 
 
 
 
 
 
 
263
  gAxes.selectAll('.x-axis')
264
  .data([0])
265
  .join('g')
266
  .attr('class', 'x-axis')
267
  .attr('transform', `translate(0,${innerHeight})`)
268
- .call(d3.axisBottom(xScale).ticks(6).tickSizeOuter(0));
269
 
270
- // X-axis label
 
 
 
 
 
 
271
  gAxes.selectAll('.x-label')
272
  .data([0])
273
  .join('text')
274
  .attr('class', 'x-label axis-label')
275
  .attr('x', innerWidth / 2)
276
- .attr('y', innerHeight + 34)
277
  .attr('text-anchor', 'middle')
278
- .text('Average Score');
279
 
280
- // Bars
281
- const barHeight = yScale.bandwidth();
282
- gBars.selectAll('.bar')
283
- .data(models, d => d.name)
284
- .join('rect')
285
- .attr('class', 'bar')
286
- .attr('x', 0)
287
- .attr('y', d => yScale(d.name))
288
- .attr('width', d => xScale(d.avg_floored_score))
289
- .attr('height', barHeight)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  .attr('fill', d => d.color)
291
- .attr('rx', 3)
292
- .attr('ry', 3)
293
  .on('mouseenter', showTooltip)
294
  .on('mousemove', showTooltip)
295
  .on('mouseleave', hideTooltip);
296
 
297
- // Model labels (inside bars)
298
- gLabels.selectAll('.model-label')
299
- .data(models, d => d.name)
300
- .join('text')
301
- .attr('class', 'model-label')
302
- .attr('x', 8)
303
- .attr('y', d => yScale(d.name) + barHeight / 2)
304
- .attr('dy', '0.35em')
305
- .attr('text-anchor', 'start')
306
- .style('fill', d => getContrastColor(d.color))
307
- .text(d => d.name);
 
308
 
309
- // Score labels (end of bars)
310
- gLabels.selectAll('.score-label')
311
  .data(models, d => d.name)
312
  .join('text')
313
- .attr('class', 'score-label')
314
- .attr('x', d => xScale(d.avg_floored_score) + 6)
315
- .attr('y', d => yScale(d.name) + barHeight / 2)
316
- .attr('dy', '0.35em')
317
- .attr('text-anchor', 'start')
318
- .text(d => d.avg_floored_score.toFixed(1));
 
 
 
 
 
 
 
 
319
  }
320
 
321
  // Initialize
322
- fetchFirstAvailable(JSON_PATHS)
 
323
  .then(json => {
324
  data = json;
325
  render();
 
1
+ <div class="d3-score-vs-recklessness"></div>
2
  <style>
3
+ .d3-score-vs-recklessness {
4
  width: 100%;
5
  margin: 10px 0;
6
  position: relative;
7
  font-family: system-ui, -apple-system, sans-serif;
8
  }
9
 
10
+ .d3-score-vs-recklessness svg {
11
  display: block;
12
  width: 100%;
13
  height: auto;
14
  }
15
 
16
+ .d3-score-vs-recklessness .axes path,
17
+ .d3-score-vs-recklessness .axes line {
18
  stroke: var(--axis-color, var(--text-color));
19
  }
20
 
21
+ .d3-score-vs-recklessness .axes text {
22
  fill: var(--tick-color, var(--muted-color));
23
+ font-size: 14px;
24
  }
25
 
26
+ .d3-score-vs-recklessness .grid line {
27
+ stroke: var(--grid-color, rgba(0,0,0,.15));
28
  }
29
 
30
+ .d3-score-vs-recklessness .axes text.axis-label {
31
+ font-size: 18px;
32
  font-weight: 500;
33
  fill: var(--text-color);
34
  }
35
 
36
+ .d3-score-vs-recklessness .x-axis text {
37
+ transform: translateY(4px);
 
38
  }
39
 
40
+ .d3-score-vs-recklessness .point {
41
  cursor: pointer;
42
  transition: opacity 0.15s ease;
43
  }
44
 
45
+ .d3-score-vs-recklessness .point:hover {
46
  opacity: 0.8;
47
  }
48
 
49
+ .d3-score-vs-recklessness .point-label {
50
+ font-size: 11px;
 
51
  fill: var(--text-color);
52
+ pointer-events: none;
53
+ }
54
+
55
+ .d3-score-vs-recklessness .annotation {
56
+ font-size: 11px;
57
+ font-style: italic;
58
+ fill: var(--muted-color);
59
  }
60
 
61
+ .d3-score-vs-recklessness .d3-tooltip {
62
  position: absolute;
63
  top: 0;
64
  left: 0;
 
77
  z-index: 10;
78
  }
79
 
80
+ .d3-score-vs-recklessness .d3-tooltip .model-name {
81
  font-weight: 600;
82
  margin-bottom: 4px;
83
  }
84
 
85
+ .d3-score-vs-recklessness .d3-tooltip .metric {
86
  display: flex;
87
  justify-content: space-between;
88
  gap: 16px;
89
  }
90
 
91
+ .d3-score-vs-recklessness .d3-tooltip .metric-label {
92
  color: var(--muted-color);
93
  }
94
 
95
+ .d3-score-vs-recklessness .d3-tooltip .metric-value {
96
  font-weight: 500;
97
  }
98
  </style>
 
115
  const bootstrap = () => {
116
  const scriptEl = document.currentScript;
117
  let container = scriptEl ? scriptEl.previousElementSibling : null;
118
+ if (!(container && container.classList && container.classList.contains('d3-score-vs-recklessness'))) {
119
+ const candidates = Array.from(document.querySelectorAll('.d3-score-vs-recklessness'))
120
  .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
121
  container = candidates[candidates.length - 1] || null;
122
  }
 
134
 
135
  // SVG setup
136
  const svg = d3.select(container).append('svg');
137
+
138
+ // Add gradient definition
139
+ const defs = svg.append('defs');
140
+ const gradient = defs.append('linearGradient')
141
+ .attr('id', 'recklessness-gradient')
142
+ .attr('x1', '0%')
143
+ .attr('x2', '100%')
144
+ .attr('y1', '0%')
145
+ .attr('y2', '0%');
146
+
147
+ // Gradient stops: red -> orange -> yellow -> green -> yellow -> orange -> red
148
+ gradient.append('stop').attr('offset', '0%').attr('stop-color', 'rgba(239, 83, 80, 0.25)'); // red
149
+ gradient.append('stop').attr('offset', '20%').attr('stop-color', 'rgba(255, 152, 0, 0.25)'); // orange
150
+ gradient.append('stop').attr('offset', '35%').attr('stop-color', 'rgba(255, 235, 59, 0.25)'); // yellow
151
+ gradient.append('stop').attr('offset', '50%').attr('stop-color', 'rgba(102, 187, 106, 0.35)'); // green
152
+ gradient.append('stop').attr('offset', '65%').attr('stop-color', 'rgba(255, 235, 59, 0.25)'); // yellow
153
+ gradient.append('stop').attr('offset', '80%').attr('stop-color', 'rgba(255, 152, 0, 0.25)'); // orange
154
+ gradient.append('stop').attr('offset', '100%').attr('stop-color', 'rgba(239, 83, 80, 0.25)'); // red
155
+
156
  const gRoot = svg.append('g');
157
 
158
+ // Chart groups (order matters for layering)
159
+ const gBackground = gRoot.append('g').attr('class', 'background');
160
  const gGrid = gRoot.append('g').attr('class', 'grid');
161
  const gAxes = gRoot.append('g').attr('class', 'axes');
162
+ const gAnnotations = gRoot.append('g').attr('class', 'annotations');
163
+ const gPoints = gRoot.append('g').attr('class', 'points');
164
  const gLabels = gRoot.append('g').attr('class', 'labels');
165
 
166
  // State
167
  let data = null;
168
  let width = 800;
169
  let height = 450;
170
+ const margin = { top: 20, right: 120, bottom: 56, left: 72 };
171
 
172
  // Scales
173
  const xScale = d3.scaleLinear();
174
+ const yScale = d3.scaleLinear();
175
 
176
  // Data loading
177
+ const DATA_URL = '/data/score_vs_recklessness.json';
178
+
179
+ // Helper function to create a 5-point star path
180
+ const starPath = (cx, cy, outerR, innerR) => {
181
+ const points = [];
182
+ for (let i = 0; i < 10; i++) {
183
+ const r = i % 2 === 0 ? outerR : innerR;
184
+ const angle = (Math.PI / 2) + (i * Math.PI / 5);
185
+ points.push([cx + r * Math.cos(angle), cy - r * Math.sin(angle)]);
 
 
 
 
186
  }
187
+ return 'M' + points.map(p => p.join(',')).join('L') + 'Z';
188
  };
189
 
190
  function updateSize() {
191
  width = container.clientWidth || 800;
192
+ height = Math.max(300, Math.round(width / 1.5));
 
 
 
193
  svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
194
  gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
195
  return {
 
207
  <div class="model-name" style="color: ${d.color}">${d.name}</div>
208
  <div class="metric">
209
  <span class="metric-label">Score:</span>
210
+ <span class="metric-value">${d.avg_floored_score.toFixed(1)}</span>
211
  </div>
212
  <div class="metric">
213
+ <span class="metric-label">Recklessness Index:</span>
214
+ <span class="metric-value">${d.recklessness_index.toFixed(2)}</span>
215
+ </div>
216
+ <div class="metric">
217
+ <span class="metric-label">Failed Guesses:</span>
218
+ <span class="metric-value">${d.avg_failed_guesses.toFixed(2)}</span>
219
+ </div>
220
+ <div class="metric">
221
+ <span class="metric-label">Caution:</span>
222
+ <span class="metric-value">${d.avg_caution.toFixed(2)}</span>
223
  </div>
224
  <div class="metric">
225
  <span class="metric-label">Type:</span>
 
227
  </div>
228
  `;
229
 
230
+ const tipWidth = tip.offsetWidth || 180;
231
+ const tipHeight = tip.offsetHeight || 120;
232
  let tipX = x + 12;
233
  let tipY = y - tipHeight / 2;
234
 
 
245
  tip.style.transform = 'translate(-9999px, -9999px)';
246
  }
247
 
 
 
 
 
 
 
 
 
 
 
 
248
  function render() {
249
  if (!data) return;
250
 
251
  const { innerWidth, innerHeight } = updateSize();
252
+ const models = data.models;
253
 
254
+ // Fixed symmetric X scale from -8 to 8
 
 
 
255
  xScale
256
+ .domain([-8, 8])
257
+ .range([0, innerWidth]);
 
258
 
259
+ // Y scale based on data
260
+ const yExtent = d3.extent(models, d => d.avg_floored_score);
261
+ const yPadding = (yExtent[1] - yExtent[0]) * 0.1;
262
  yScale
263
+ .domain([yExtent[0], yExtent[1] + yPadding])
264
+ .range([innerHeight, 0])
265
+ .nice();
266
+
267
+ // Background gradient rectangle
268
+ gBackground.selectAll('.bg-gradient')
269
+ .data([0])
270
+ .join('rect')
271
+ .attr('class', 'bg-gradient')
272
+ .attr('x', 0)
273
+ .attr('y', 0)
274
+ .attr('width', innerWidth)
275
+ .attr('height', innerHeight)
276
+ .attr('fill', 'url(#recklessness-gradient)');
277
+
278
+ // Grid lines
279
+ const xTicks = xScale.ticks(8);
280
+ const yTicks = yScale.ticks(6);
281
 
 
 
282
  gGrid.selectAll('.grid-x')
283
  .data(xTicks)
284
  .join('line')
 
288
  .attr('y1', 0)
289
  .attr('y2', innerHeight);
290
 
291
+ gGrid.selectAll('.grid-y')
292
+ .data(yTicks)
293
+ .join('line')
294
+ .attr('class', 'grid-y')
295
+ .attr('x1', 0)
296
+ .attr('x2', innerWidth)
297
+ .attr('y1', d => yScale(d))
298
+ .attr('y2', d => yScale(d));
299
+
300
+ // Axes with inner ticks
301
+ const tickSize = 6;
302
  gAxes.selectAll('.x-axis')
303
  .data([0])
304
  .join('g')
305
  .attr('class', 'x-axis')
306
  .attr('transform', `translate(0,${innerHeight})`)
307
+ .call(d3.axisBottom(xScale).ticks(8).tickSizeInner(-tickSize).tickSizeOuter(0));
308
 
309
+ gAxes.selectAll('.y-axis')
310
+ .data([0])
311
+ .join('g')
312
+ .attr('class', 'y-axis')
313
+ .call(d3.axisLeft(yScale).ticks(6).tickSizeInner(-tickSize).tickSizeOuter(0));
314
+
315
+ // Axis labels
316
  gAxes.selectAll('.x-label')
317
  .data([0])
318
  .join('text')
319
  .attr('class', 'x-label axis-label')
320
  .attr('x', innerWidth / 2)
321
+ .attr('y', innerHeight + 44)
322
  .attr('text-anchor', 'middle')
323
+ .text('Boldness Index');
324
 
325
+ gAxes.selectAll('.y-label')
326
+ .data([0])
327
+ .join('text')
328
+ .attr('class', 'y-label axis-label')
329
+ .attr('x', -innerHeight / 2)
330
+ .attr('y', -52)
331
+ .attr('text-anchor', 'middle')
332
+ .attr('transform', 'rotate(-90)')
333
+ .text('Score');
334
+
335
+ // Top annotations: Overcautious / Cautious / Measured / Bold / Reckless
336
+ const annotations = [
337
+ { label: 'Overcautious', color: 'rgba(239, 83, 80, 0.9)', pos: 0.07}, // red
338
+ { label: 'Cautious', color: 'rgba(255, 180, 0, 0.9)', pos: 0.25 }, // yellow/orange
339
+ { label: 'Measured', color: 'rgba(76, 175, 80, 0.9)', pos: 0.5 }, // green
340
+ { label: 'Bold', color: 'rgba(255, 180, 0, 0.9)', pos: 0.75 }, // yellow/orange
341
+ { label: 'Reckless', color: 'rgba(239, 83, 80, 0.9)', pos: 0.95 } // red
342
+ ];
343
+
344
+ gAnnotations.selectAll('.annotation-label')
345
+ .data(annotations, d => d.label)
346
+ .join('text')
347
+ .attr('class', 'annotation annotation-label')
348
+ .attr('x', d => d.pos * innerWidth)
349
+ .attr('y', 16)
350
+ .attr('text-anchor', d => d.pos === 0 ? 'start' : d.pos === 1 ? 'end' : 'middle')
351
+ .style('fill', d => d.color)
352
+ .style('font-weight', 'bold')
353
+ .style('font-size', '13px')
354
+ .text(d => d.label);
355
+
356
+ // Points
357
+ const pointRadius = Math.max(8, Math.min(14, innerWidth / 60));
358
+
359
+ // Closed models as filled circles
360
+ const closedModels = models.filter(d => !d.is_open);
361
+ gPoints.selectAll('.point-closed')
362
+ .data(closedModels, d => d.name)
363
+ .join('circle')
364
+ .attr('class', 'point point-closed')
365
+ .attr('cx', d => xScale(d.recklessness_index))
366
+ .attr('cy', d => yScale(d.avg_floored_score))
367
+ .attr('r', pointRadius)
368
  .attr('fill', d => d.color)
369
+ .attr('stroke', 'none')
 
370
  .on('mouseenter', showTooltip)
371
  .on('mousemove', showTooltip)
372
  .on('mouseleave', hideTooltip);
373
 
374
+ // Open models as stars
375
+ const openModels = models.filter(d => d.is_open);
376
+ gPoints.selectAll('.point-star')
377
+ .data(openModels, d => d.name)
378
+ .join('path')
379
+ .attr('class', 'point point-star')
380
+ .attr('d', d => starPath(xScale(d.recklessness_index), yScale(d.avg_floored_score), pointRadius * 1.2, pointRadius * 0.5))
381
+ .attr('fill', d => d.color)
382
+ .attr('stroke', 'none')
383
+ .on('mouseenter', showTooltip)
384
+ .on('mousemove', showTooltip)
385
+ .on('mouseleave', hideTooltip);
386
 
387
+ // Point labels with smart positioning
388
+ gLabels.selectAll('.point-label')
389
  .data(models, d => d.name)
390
  .join('text')
391
+ .attr('class', 'point-label')
392
+ .attr('x', d => {
393
+ const xPos = xScale(d.recklessness_index);
394
+ if (xPos > innerWidth - 100) {
395
+ return xPos - pointRadius - 6;
396
+ }
397
+ return xPos + pointRadius + 6;
398
+ })
399
+ .attr('y', d => yScale(d.avg_floored_score) + 4)
400
+ .attr('text-anchor', d => {
401
+ const xPos = xScale(d.recklessness_index);
402
+ return xPos > innerWidth - 100 ? 'end' : 'start';
403
+ })
404
+ .text(d => d.name);
405
  }
406
 
407
  // Initialize
408
+ fetch(DATA_URL, { cache: 'no-cache' })
409
+ .then(r => r.json())
410
  .then(json => {
411
  data = json;
412
  render();
app/src/content/embeds/complexity-ratio.html ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="d3-complexity-ratio"></div>
2
+ <style>
3
+ .d3-complexity-ratio {
4
+ width: 100%;
5
+ margin: 10px 0;
6
+ position: relative;
7
+ font-family: system-ui, -apple-system, sans-serif;
8
+ }
9
+
10
+ .d3-complexity-ratio svg {
11
+ display: block;
12
+ width: 100%;
13
+ height: auto;
14
+ }
15
+
16
+ .d3-complexity-ratio .axes path,
17
+ .d3-complexity-ratio .axes line {
18
+ stroke: var(--axis-color, var(--text-color));
19
+ }
20
+
21
+ .d3-complexity-ratio .axes text {
22
+ fill: var(--tick-color, var(--muted-color));
23
+ font-size: 11px;
24
+ }
25
+
26
+ .d3-complexity-ratio .grid line {
27
+ stroke: var(--grid-color, rgba(0,0,0,.08));
28
+ }
29
+
30
+ .d3-complexity-ratio .axes text.axis-label {
31
+ font-size: 14px;
32
+ font-weight: 500;
33
+ fill: var(--text-color);
34
+ }
35
+
36
+ .d3-complexity-ratio .reference-line {
37
+ stroke: var(--muted-color);
38
+ stroke-dasharray: 5, 5;
39
+ stroke-width: 1.5;
40
+ }
41
+
42
+ .d3-complexity-ratio .whisker-line {
43
+ stroke-width: 1.5;
44
+ }
45
+
46
+ .d3-complexity-ratio .whisker-cap {
47
+ stroke-width: 1.5;
48
+ }
49
+
50
+ .d3-complexity-ratio .model-point {
51
+ stroke-width: 2;
52
+ cursor: pointer;
53
+ }
54
+
55
+ .d3-complexity-ratio .model-point:hover {
56
+ stroke-width: 3;
57
+ }
58
+
59
+ .d3-complexity-ratio .ratio-label {
60
+ font-size: 11px;
61
+ fill: var(--muted-color);
62
+ }
63
+
64
+ .d3-complexity-ratio .legend-item {
65
+ cursor: default;
66
+ }
67
+
68
+ .d3-complexity-ratio .legend-text {
69
+ font-size: 11px;
70
+ fill: var(--text-color);
71
+ }
72
+
73
+ .d3-complexity-ratio .subtitle {
74
+ font-size: 11px;
75
+ fill: var(--muted-color);
76
+ }
77
+
78
+ .d3-complexity-ratio .d3-tooltip {
79
+ position: absolute;
80
+ top: 0;
81
+ left: 0;
82
+ transform: translate(-9999px, -9999px);
83
+ pointer-events: none;
84
+ padding: 10px 12px;
85
+ border-radius: 8px;
86
+ font-size: 12px;
87
+ line-height: 1.4;
88
+ border: 1px solid var(--border-color);
89
+ background: var(--surface-bg);
90
+ color: var(--text-color);
91
+ box-shadow: 0 4px 24px rgba(0,0,0,.18);
92
+ opacity: 0;
93
+ transition: opacity 0.12s ease;
94
+ z-index: 10;
95
+ }
96
+
97
+ .d3-complexity-ratio .d3-tooltip .model-name {
98
+ font-weight: 600;
99
+ margin-bottom: 4px;
100
+ }
101
+
102
+ .d3-complexity-ratio .d3-tooltip .metric {
103
+ display: flex;
104
+ justify-content: space-between;
105
+ gap: 16px;
106
+ }
107
+
108
+ .d3-complexity-ratio .d3-tooltip .metric-label {
109
+ color: var(--muted-color);
110
+ }
111
+
112
+ .d3-complexity-ratio .d3-tooltip .metric-value {
113
+ font-weight: 500;
114
+ }
115
+ </style>
116
+ <script>
117
+ (() => {
118
// Run `cb` as soon as D3 is usable, lazily injecting the shared CDN
// <script> tag (id "d3-cdn-script") the first time any embed needs it.
const ensureD3 = (cb) => {
  const d3Ready = () => window.d3 && typeof window.d3.select === 'function';
  if (d3Ready()) return cb();
  let loader = document.getElementById('d3-cdn-script');
  if (!loader) {
    loader = document.createElement('script');
    loader.id = 'd3-cdn-script';
    loader.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
    document.head.appendChild(loader);
  }
  const onReady = () => {
    if (d3Ready()) cb();
  };
  loader.addEventListener('load', onReady, { once: true });
  // The shared tag may already have finished loading; check immediately.
  if (window.d3) onReady();
};
131
+
132
+ const bootstrap = () => {
133
// Locate the host container: prefer the element immediately before this
// script tag, falling back to the last not-yet-mounted matching div.
const scriptEl = document.currentScript;
let container = scriptEl ? scriptEl.previousElementSibling : null;
if (!(container && container.classList && container.classList.contains('d3-complexity-ratio'))) {
  const candidates = Array.from(document.querySelectorAll('.d3-complexity-ratio'))
    .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
  container = candidates[candidates.length - 1] || null;
}
if (!container) return;
// Mount guard: initialize each container at most once.
if (container.dataset) {
  if (container.dataset.mounted === 'true') return;
  container.dataset.mounted = 'true';
}

// Tooltip setup (absolutely positioned inside the container)
container.style.position = container.style.position || 'relative';
const tip = document.createElement('div');
tip.className = 'd3-tooltip';
container.appendChild(tip);

// SVG setup
const svg = d3.select(container).append('svg');
const gRoot = svg.append('g');

// Chart groups — append order defines z-order (grid below points/labels)
const gGrid = gRoot.append('g').attr('class', 'grid');
const gReference = gRoot.append('g').attr('class', 'reference');
const gAxes = gRoot.append('g').attr('class', 'axes');
const gWhiskers = gRoot.append('g').attr('class', 'whiskers');
const gPoints = gRoot.append('g').attr('class', 'points');
const gLabels = gRoot.append('g').attr('class', 'labels');
const gLegend = gRoot.append('g').attr('class', 'legend');

// State (width/height are recomputed by updateSize() on every render)
let data = null;
let width = 800;
let height = 500;
// Wide left margin leaves room for model names on the categorical y axis.
const margin = { top: 30, right: 100, bottom: 60, left: 180 };

// Scales — domains/ranges are assigned in render()
const xScale = d3.scaleLinear();
const yScale = d3.scaleBand();

// Data loading
const DATA_URL = '/data/complexity_ratio.json';
177
+
178
// Fill the shared tooltip with `model`'s stats and place it beside the
// cursor, clamped so it never overflows the chart container.
function showTooltip(event, model) {
  const rect = container.getBoundingClientRect();
  const x = event.clientX - rect.left;
  const y = event.clientY - rect.top;

  // Classify the median ratio into a human-readable tendency
  // (±5% around 1.0 counts as "matches").
  const interpretation = model.median_ratio > 1.05
    ? 'Tends to overcomplicate'
    : model.median_ratio < 0.95
    ? 'Tends to oversimplify'
    : 'Matches complexity well';

  tip.innerHTML = `
    <div class="model-name" style="color: ${model.color}">${model.name}</div>
    <div class="metric">
      <span class="metric-label">Median ratio:</span>
      <span class="metric-value">${model.median_ratio.toFixed(2)}</span>
    </div>
    <div class="metric">
      <span class="metric-label">IQR:</span>
      <span class="metric-value">${model.q25.toFixed(2)} – ${model.q75.toFixed(2)}</span>
    </div>
    <div class="metric">
      <span class="metric-label">Samples:</span>
      <span class="metric-value">n=${model.count}</span>
    </div>
    <div class="metric" style="margin-top: 4px;">
      <span class="metric-label">Interpretation:</span>
      <span class="metric-value">${interpretation}</span>
    </div>
  `;

  // Fallback sizes cover the first call, before layout has happened.
  const tipWidth = tip.offsetWidth || 180;
  const tipHeight = tip.offsetHeight || 120;
  let tipX = x + 12;
  let tipY = y - tipHeight / 2;

  // Flip to the left of the cursor if it would overflow the right edge,
  // and clamp vertically inside the chart.
  if (tipX + tipWidth > width) tipX = x - tipWidth - 12;
  if (tipY < 0) tipY = 8;
  if (tipY + tipHeight > height) tipY = height - tipHeight - 8;

  tip.style.transform = `translate(${tipX}px, ${tipY}px)`;
  tip.style.opacity = '1';
}
221
+
222
// Fade the tooltip out and park it far off-screen.
function hideTooltip() {
  const { style } = tip;
  style.transform = 'translate(-9999px, -9999px)';
  style.opacity = '0';
}
226
+
227
// Measure the container, resize the <svg> (height = 55% of width, min 420),
// position the root group, and return the inner plot dimensions.
// Mutates the outer `width`/`height` used by the tooltip clamping.
function updateSize() {
  width = container.clientWidth || 800;
  height = Math.max(420, Math.round(width * 0.55));
  svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
  gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
  return {
    innerWidth: width - margin.left - margin.right,
    innerHeight: height - margin.top - margin.bottom
  };
}
237
+
238
// Redraw the dot-and-whisker chart (median complexity ratio per model with
// IQR whiskers). Idempotent: all selections use joins (keyed by model name
// where data-driven), so resize/theme re-renders update in place.
function render() {
  if (!data) return;

  const { innerWidth, innerHeight } = updateSize();

  // Sort models by median ratio (ascending - lowest at top)
  const models = [...data.models].sort((a, b) => a.median_ratio - b.median_ratio);

  // X scale: ratio values with 10% padding, never narrower than [0.6, 2.4]
  const xMin = d3.min(models, m => m.q25);
  const xMax = d3.max(models, m => m.q75);
  const xPadding = (xMax - xMin) * 0.1;
  xScale
    .domain([Math.min(0.6, xMin - xPadding), Math.max(2.4, xMax + xPadding)])
    .range([0, innerWidth]);

  // Y scale: categorical (model names)
  yScale
    .domain(models.map(m => m.name))
    .range([0, innerHeight])
    .padding(0.4);

  // Grid lines (vertical)
  const xTicks = xScale.ticks(8);
  gGrid.selectAll('.grid-x')
    .data(xTicks)
    .join('line')
    .attr('class', 'grid-x')
    .attr('x1', d => xScale(d))
    .attr('x2', d => xScale(d))
    .attr('y1', 0)
    .attr('y2', innerHeight);

  // Reference line at x=1 (a model that matches complexity exactly)
  gReference.selectAll('.reference-line')
    .data([1])
    .join('line')
    .attr('class', 'reference-line')
    .attr('x1', d => xScale(d))
    .attr('x2', d => xScale(d))
    .attr('y1', 0)
    .attr('y2', innerHeight);

  // Axes (negative inner tick size draws ticks inward)
  const tickSize = 6;

  gAxes.selectAll('.x-axis')
    .data([0])
    .join('g')
    .attr('class', 'x-axis')
    .attr('transform', `translate(0,${innerHeight})`)
    .call(d3.axisBottom(xScale)
      .ticks(8)
      .tickFormat(d3.format('.2f'))
      .tickSizeInner(-tickSize)
      .tickSizeOuter(0));

  gAxes.selectAll('.y-axis')
    .data([0])
    .join('g')
    .attr('class', 'y-axis')
    .call(d3.axisLeft(yScale)
      .tickSizeInner(-tickSize)
      .tickSizeOuter(0));

  // X-axis label
  gAxes.selectAll('.x-label')
    .data([0])
    .join('text')
    .attr('class', 'x-label axis-label')
    .attr('x', innerWidth / 2)
    .attr('y', innerHeight + 40)
    .attr('text-anchor', 'middle')
    .text('Complexity Ratio (Tentative / Actual)');

  // Subtitle
  gAxes.selectAll('.subtitle')
    .data([0])
    .join('text')
    .attr('class', 'subtitle')
    .attr('x', innerWidth / 2)
    .attr('y', innerHeight + 54)
    .attr('text-anchor', 'middle')
    .text('>1: Overcomplicates | <1: Oversimplifies | =1: Matches complexity');

  // Marker geometry derived from the band height
  const bandHeight = yScale.bandwidth();
  const capHeight = bandHeight * 0.4;
  const pointSize = Math.min(8, bandHeight * 0.35);

  // Whiskers (IQR lines)
  gWhiskers.selectAll('.whisker-line')
    .data(models, d => d.name)
    .join('line')
    .attr('class', 'whisker-line')
    .attr('x1', d => xScale(d.q25))
    .attr('x2', d => xScale(d.q75))
    .attr('y1', d => yScale(d.name) + bandHeight / 2)
    .attr('y2', d => yScale(d.name) + bandHeight / 2)
    .attr('stroke', d => d.color);

  // Left whisker caps
  gWhiskers.selectAll('.whisker-cap-left')
    .data(models, d => d.name)
    .join('line')
    .attr('class', 'whisker-cap whisker-cap-left')
    .attr('x1', d => xScale(d.q25))
    .attr('x2', d => xScale(d.q25))
    .attr('y1', d => yScale(d.name) + bandHeight / 2 - capHeight / 2)
    .attr('y2', d => yScale(d.name) + bandHeight / 2 + capHeight / 2)
    .attr('stroke', d => d.color);

  // Right whisker caps
  gWhiskers.selectAll('.whisker-cap-right')
    .data(models, d => d.name)
    .join('line')
    .attr('class', 'whisker-cap whisker-cap-right')
    .attr('x1', d => xScale(d.q75))
    .attr('x2', d => xScale(d.q75))
    .attr('y1', d => yScale(d.name) + bandHeight / 2 - capHeight / 2)
    .attr('y2', d => yScale(d.name) + bandHeight / 2 + capHeight / 2)
    .attr('stroke', d => d.color);

  // Model points - circles for closed, squares for open
  const closedModels = models.filter(m => !m.is_open);
  const openModels = models.filter(m => m.is_open);

  // Closed models: filled circles
  gPoints.selectAll('.model-point-circle')
    .data(closedModels, d => d.name)
    .join('circle')
    .attr('class', 'model-point model-point-circle')
    .attr('cx', d => xScale(d.median_ratio))
    .attr('cy', d => yScale(d.name) + bandHeight / 2)
    .attr('r', pointSize)
    .attr('fill', d => d.color)
    .attr('stroke', d => d.color)
    .on('mouseenter', (event, d) => showTooltip(event, d))
    .on('mousemove', (event, d) => showTooltip(event, d))
    .on('mouseleave', hideTooltip);

  // Open models: hollow squares
  gPoints.selectAll('.model-point-square')
    .data(openModels, d => d.name)
    .join('rect')
    .attr('class', 'model-point model-point-square')
    .attr('x', d => xScale(d.median_ratio) - pointSize)
    .attr('y', d => yScale(d.name) + bandHeight / 2 - pointSize)
    .attr('width', pointSize * 2)
    .attr('height', pointSize * 2)
    .attr('fill', 'none')
    .attr('stroke', d => d.color)
    .attr('stroke-width', 2)
    .on('mouseenter', (event, d) => showTooltip(event, d))
    .on('mousemove', (event, d) => showTooltip(event, d))
    .on('mouseleave', hideTooltip);

  // Ratio labels on the right, in the right margin
  gLabels.selectAll('.ratio-label')
    .data(models, d => d.name)
    .join('text')
    .attr('class', 'ratio-label')
    .attr('x', innerWidth + 8)
    .attr('y', d => yScale(d.name) + bandHeight / 2)
    .attr('dy', '0.35em')
    .text(d => `${d.median_ratio.toFixed(2)} (n=${d.count})`);

  // Legend (drawn above the plot area, right-aligned)
  const legendY = -15;
  const legendItems = [
    { label: 'Closed model', shape: 'circle' },
    { label: 'Open model', shape: 'square' }
  ];

  const legendGroup = gLegend.selectAll('.legend-item')
    .data(legendItems)
    .join('g')
    .attr('class', 'legend-item')
    .attr('transform', (d, i) => `translate(${innerWidth - 80 - i * 100}, ${legendY})`);

  // Per-item nested joins: each item renders only its own shape.
  legendGroup.selectAll('.legend-shape-circle')
    .data(d => d.shape === 'circle' ? [d] : [])
    .join('circle')
    .attr('class', 'legend-shape-circle')
    .attr('cx', 0)
    .attr('cy', 0)
    .attr('r', 5)
    .attr('fill', 'var(--muted-color)');

  legendGroup.selectAll('.legend-shape-square')
    .data(d => d.shape === 'square' ? [d] : [])
    .join('rect')
    .attr('class', 'legend-shape-square')
    .attr('x', -5)
    .attr('y', -5)
    .attr('width', 10)
    .attr('height', 10)
    .attr('fill', 'none')
    .attr('stroke', 'var(--muted-color)')
    .attr('stroke-width', 2);

  legendGroup.selectAll('.legend-text')
    .data(d => [d])
    .join('text')
    .attr('class', 'legend-text')
    .attr('x', 10)
    .attr('y', 0)
    .attr('dy', '0.35em')
    .text(d => d.label);
}
447
+
448
// Initialize: fetch the dataset once, then draw.
// NOTE(review): r.ok is not checked — a 404/HTML error page will surface
// as a JSON parse error in the catch below rather than an HTTP error.
fetch(DATA_URL, { cache: 'no-cache' })
  .then(r => r.json())
  .then(json => {
    data = json;
    render();
  })
  .catch(err => {
    // Surface load failures inline instead of failing silently.
    const pre = document.createElement('pre');
    pre.style.color = 'red';
    pre.style.padding = '16px';
    pre.textContent = `Error loading data: ${err.message}`;
    container.appendChild(pre);
  });

// Resize handling (render() is a no-op until data has loaded)
if (window.ResizeObserver) {
  new ResizeObserver(() => render()).observe(container);
} else {
  window.addEventListener('resize', render);
}

// Theme change handling: re-render when the root `data-theme` attribute
// flips so CSS-variable-driven colors are re-applied.
const observer = new MutationObserver(() => render());
observer.observe(document.documentElement, {
  attributes: true,
  attributeFilter: ['data-theme']
});
476
+ };
477
+
478
// Defer bootstrap until the DOM is parsed (so the container div exists),
// loading D3 first in either case.
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
} else {
  ensureD3(bootstrap);
}
483
+ })();
484
+ </script>
app/src/content/embeds/score-vs-recklessness.html ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="d3-score-vs-recklessness"></div>
2
+ <style>
3
+ .d3-score-vs-recklessness {
4
+ width: 100%;
5
+ margin: 10px 0;
6
+ position: relative;
7
+ font-family: system-ui, -apple-system, sans-serif;
8
+ }
9
+
10
+ .d3-score-vs-recklessness svg {
11
+ display: block;
12
+ width: 100%;
13
+ height: auto;
14
+ }
15
+
16
+ .d3-score-vs-recklessness .axes path,
17
+ .d3-score-vs-recklessness .axes line {
18
+ stroke: var(--axis-color, var(--text-color));
19
+ }
20
+
21
+ .d3-score-vs-recklessness .axes text {
22
+ fill: var(--tick-color, var(--muted-color));
23
+ font-size: 14px;
24
+ }
25
+
26
+ .d3-score-vs-recklessness .grid line {
27
+ stroke: var(--grid-color, rgba(0,0,0,.15));
28
+ }
29
+
30
+ .d3-score-vs-recklessness .axes text.axis-label {
31
+ font-size: 18px;
32
+ font-weight: 500;
33
+ fill: var(--text-color);
34
+ }
35
+
36
+ .d3-score-vs-recklessness .x-axis text {
37
+ transform: translateY(4px);
38
+ }
39
+
40
+ .d3-score-vs-recklessness .point {
41
+ cursor: pointer;
42
+ transition: opacity 0.15s ease;
43
+ }
44
+
45
+ .d3-score-vs-recklessness .point:hover {
46
+ opacity: 0.8;
47
+ }
48
+
49
+ .d3-score-vs-recklessness .point-label {
50
+ font-size: 11px;
51
+ fill: var(--text-color);
52
+ pointer-events: none;
53
+ }
54
+
55
+ .d3-score-vs-recklessness .annotation {
56
+ font-size: 11px;
57
+ font-style: italic;
58
+ fill: var(--muted-color);
59
+ }
60
+
61
+ .d3-score-vs-recklessness .d3-tooltip {
62
+ position: absolute;
63
+ top: 0;
64
+ left: 0;
65
+ transform: translate(-9999px, -9999px);
66
+ pointer-events: none;
67
+ padding: 10px 12px;
68
+ border-radius: 8px;
69
+ font-size: 12px;
70
+ line-height: 1.4;
71
+ border: 1px solid var(--border-color);
72
+ background: var(--surface-bg);
73
+ color: var(--text-color);
74
+ box-shadow: 0 4px 24px rgba(0,0,0,.18);
75
+ opacity: 0;
76
+ transition: opacity 0.12s ease;
77
+ z-index: 10;
78
+ }
79
+
80
+ .d3-score-vs-recklessness .d3-tooltip .model-name {
81
+ font-weight: 600;
82
+ margin-bottom: 4px;
83
+ }
84
+
85
+ .d3-score-vs-recklessness .d3-tooltip .metric {
86
+ display: flex;
87
+ justify-content: space-between;
88
+ gap: 16px;
89
+ }
90
+
91
+ .d3-score-vs-recklessness .d3-tooltip .metric-label {
92
+ color: var(--muted-color);
93
+ }
94
+
95
+ .d3-score-vs-recklessness .d3-tooltip .metric-value {
96
+ font-weight: 500;
97
+ }
98
+ </style>
99
+ <script>
100
+ (() => {
101
// Run `cb` as soon as D3 is usable, lazily injecting the shared CDN
// <script> tag (id "d3-cdn-script") the first time any embed needs it.
const ensureD3 = (cb) => {
  const d3Ready = () => window.d3 && typeof window.d3.select === 'function';
  if (d3Ready()) return cb();
  let loader = document.getElementById('d3-cdn-script');
  if (!loader) {
    loader = document.createElement('script');
    loader.id = 'd3-cdn-script';
    loader.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
    document.head.appendChild(loader);
  }
  const onReady = () => {
    if (d3Ready()) cb();
  };
  loader.addEventListener('load', onReady, { once: true });
  // The shared tag may already have finished loading; check immediately.
  if (window.d3) onReady();
};
114
+
115
+ const bootstrap = () => {
116
// Locate the host container: prefer the element immediately before this
// script tag, falling back to the last not-yet-mounted matching div.
const scriptEl = document.currentScript;
let container = scriptEl ? scriptEl.previousElementSibling : null;
if (!(container && container.classList && container.classList.contains('d3-score-vs-recklessness'))) {
  const candidates = Array.from(document.querySelectorAll('.d3-score-vs-recklessness'))
    .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
  container = candidates[candidates.length - 1] || null;
}
if (!container) return;
// Mount guard: initialize each container at most once.
if (container.dataset) {
  if (container.dataset.mounted === 'true') return;
  container.dataset.mounted = 'true';
}

// Tooltip setup (absolutely positioned inside the container)
container.style.position = container.style.position || 'relative';
const tip = document.createElement('div');
tip.className = 'd3-tooltip';
container.appendChild(tip);

// SVG setup
const svg = d3.select(container).append('svg');

// Add gradient definition
// NOTE(review): the gradient id is page-global; two instances of this embed
// on one page would define duplicate ids (the first one wins) — verify.
const defs = svg.append('defs');
const gradient = defs.append('linearGradient')
  .attr('id', 'recklessness-gradient')
  .attr('x1', '0%')
  .attr('x2', '100%')
  .attr('y1', '0%')
  .attr('y2', '0%');

// Gradient stops: red -> orange -> yellow -> green -> yellow -> orange -> red
gradient.append('stop').attr('offset', '0%').attr('stop-color', 'rgba(239, 83, 80, 0.25)'); // red
gradient.append('stop').attr('offset', '20%').attr('stop-color', 'rgba(255, 152, 0, 0.25)'); // orange
gradient.append('stop').attr('offset', '35%').attr('stop-color', 'rgba(255, 235, 59, 0.25)'); // yellow
gradient.append('stop').attr('offset', '50%').attr('stop-color', 'rgba(102, 187, 106, 0.35)'); // green
gradient.append('stop').attr('offset', '65%').attr('stop-color', 'rgba(255, 235, 59, 0.25)'); // yellow
gradient.append('stop').attr('offset', '80%').attr('stop-color', 'rgba(255, 152, 0, 0.25)'); // orange
gradient.append('stop').attr('offset', '100%').attr('stop-color', 'rgba(239, 83, 80, 0.25)'); // red

const gRoot = svg.append('g');

// Chart groups (order matters for layering: background lowest, labels top)
const gBackground = gRoot.append('g').attr('class', 'background');
const gGrid = gRoot.append('g').attr('class', 'grid');
const gAxes = gRoot.append('g').attr('class', 'axes');
const gAnnotations = gRoot.append('g').attr('class', 'annotations');
const gPoints = gRoot.append('g').attr('class', 'points');
const gLabels = gRoot.append('g').attr('class', 'labels');

// State (width/height are recomputed by updateSize() on every render)
let data = null;
let width = 800;
let height = 450;
// Wide right margin leaves room for point labels near the right edge.
const margin = { top: 20, right: 120, bottom: 56, left: 72 };

// Scales — domains/ranges are assigned in render()
const xScale = d3.scaleLinear();
const yScale = d3.scaleLinear();

// Data loading
const DATA_URL = '/data/score_vs_recklessness.json';
178
+
179
// Build an SVG path string for a 5-point star centered at (cx, cy),
// alternating between the outer and inner radius over 10 vertices.
const starPath = (cx, cy, outerR, innerR) => {
  const vertices = Array.from({ length: 10 }, (_, k) => {
    const radius = k % 2 === 0 ? outerR : innerR;
    const theta = (Math.PI / 2) + (k * Math.PI / 5);
    return `${cx + radius * Math.cos(theta)},${cy - radius * Math.sin(theta)}`;
  });
  return 'M' + vertices.join('L') + 'Z';
};
189
+
190
// Measure the container, resize the <svg> (3:2 aspect, min height 300),
// position the root group, and return the inner plot dimensions.
// Mutates the outer `width`/`height` used by the tooltip clamping.
function updateSize() {
  width = container.clientWidth || 800;
  height = Math.max(300, Math.round(width / 1.5));
  svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
  gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
  return {
    innerWidth: width - margin.left - margin.right,
    innerHeight: height - margin.top - margin.bottom
  };
}
200
+
201
// Fill the shared tooltip with model `d`'s stats and place it beside the
// cursor, clamped so it never overflows the chart container.
function showTooltip(event, d) {
  const rect = container.getBoundingClientRect();
  const x = event.clientX - rect.left;
  const y = event.clientY - rect.top;

  tip.innerHTML = `
    <div class="model-name" style="color: ${d.color}">${d.name}</div>
    <div class="metric">
      <span class="metric-label">Score:</span>
      <span class="metric-value">${d.avg_floored_score.toFixed(1)}</span>
    </div>
    <div class="metric">
      <span class="metric-label">Recklessness Index:</span>
      <span class="metric-value">${d.recklessness_index.toFixed(2)}</span>
    </div>
    <div class="metric">
      <span class="metric-label">Failed Guesses:</span>
      <span class="metric-value">${d.avg_failed_guesses.toFixed(2)}</span>
    </div>
    <div class="metric">
      <span class="metric-label">Caution:</span>
      <span class="metric-value">${d.avg_caution.toFixed(2)}</span>
    </div>
    <div class="metric">
      <span class="metric-label">Type:</span>
      <span class="metric-value">${d.is_open ? 'Open' : 'Closed'}</span>
    </div>
  `;

  // Fallback sizes cover the first call, before layout has happened.
  const tipWidth = tip.offsetWidth || 180;
  const tipHeight = tip.offsetHeight || 120;
  let tipX = x + 12;
  let tipY = y - tipHeight / 2;

  // Flip to the left of the cursor if it would overflow the right edge,
  // and clamp vertically inside the chart.
  if (tipX + tipWidth > width) tipX = x - tipWidth - 12;
  if (tipY < 0) tipY = 8;
  if (tipY + tipHeight > height) tipY = height - tipHeight - 8;

  tip.style.transform = `translate(${tipX}px, ${tipY}px)`;
  tip.style.opacity = '1';
}
242
+
243
// Fade the tooltip out and park it far off-screen.
function hideTooltip() {
  const { style } = tip;
  style.transform = 'translate(-9999px, -9999px)';
  style.opacity = '0';
}
247
+
248
// Redraw the score-vs-boldness scatter chart. Idempotent: every selection
// uses a join (keyed by model name where data-driven), so resize/theme
// re-renders update in place instead of duplicating nodes.
function render() {
  if (!data) return;

  const { innerWidth, innerHeight } = updateSize();
  const models = data.models;
  // Guard: an empty model list would give d3.extent() an
  // [undefined, undefined] result and NaN scale domains below.
  if (!Array.isArray(models) || models.length === 0) return;

  // Fixed symmetric X scale from -8 to 8
  xScale
    .domain([-8, 8])
    .range([0, innerWidth]);

  // Y scale based on data, with 10% headroom above the top point
  const yExtent = d3.extent(models, d => d.avg_floored_score);
  const yPadding = (yExtent[1] - yExtent[0]) * 0.1;
  yScale
    .domain([yExtent[0], yExtent[1] + yPadding])
    .range([innerHeight, 0])
    .nice();

  // Background gradient rectangle (red/orange/green bands from the defs)
  gBackground.selectAll('.bg-gradient')
    .data([0])
    .join('rect')
    .attr('class', 'bg-gradient')
    .attr('x', 0)
    .attr('y', 0)
    .attr('width', innerWidth)
    .attr('height', innerHeight)
    .attr('fill', 'url(#recklessness-gradient)');

  // Grid lines
  const xTicks = xScale.ticks(8);
  const yTicks = yScale.ticks(6);

  gGrid.selectAll('.grid-x')
    .data(xTicks)
    .join('line')
    .attr('class', 'grid-x')
    .attr('x1', d => xScale(d))
    .attr('x2', d => xScale(d))
    .attr('y1', 0)
    .attr('y2', innerHeight);

  gGrid.selectAll('.grid-y')
    .data(yTicks)
    .join('line')
    .attr('class', 'grid-y')
    .attr('x1', 0)
    .attr('x2', innerWidth)
    .attr('y1', d => yScale(d))
    .attr('y2', d => yScale(d));

  // Axes with inner ticks (negative inner size draws ticks inward)
  const tickSize = 6;
  gAxes.selectAll('.x-axis')
    .data([0])
    .join('g')
    .attr('class', 'x-axis')
    .attr('transform', `translate(0,${innerHeight})`)
    .call(d3.axisBottom(xScale).ticks(8).tickSizeInner(-tickSize).tickSizeOuter(0));

  gAxes.selectAll('.y-axis')
    .data([0])
    .join('g')
    .attr('class', 'y-axis')
    .call(d3.axisLeft(yScale).ticks(6).tickSizeInner(-tickSize).tickSizeOuter(0));

  // Axis labels
  gAxes.selectAll('.x-label')
    .data([0])
    .join('text')
    .attr('class', 'x-label axis-label')
    .attr('x', innerWidth / 2)
    .attr('y', innerHeight + 44)
    .attr('text-anchor', 'middle')
    .text('Boldness Index');

  gAxes.selectAll('.y-label')
    .data([0])
    .join('text')
    .attr('class', 'y-label axis-label')
    .attr('x', -innerHeight / 2)
    .attr('y', -52)
    .attr('text-anchor', 'middle')
    .attr('transform', 'rotate(-90)')
    .text('Score');

  // Top annotations: Overcautious / Cautious / Measured / Bold / Reckless
  const annotations = [
    { label: 'Overcautious', color: 'rgba(239, 83, 80, 0.9)', pos: 0.07 }, // red
    { label: 'Cautious', color: 'rgba(255, 180, 0, 0.9)', pos: 0.25 }, // yellow/orange
    { label: 'Measured', color: 'rgba(76, 175, 80, 0.9)', pos: 0.5 }, // green
    { label: 'Bold', color: 'rgba(255, 180, 0, 0.9)', pos: 0.75 }, // yellow/orange
    { label: 'Reckless', color: 'rgba(239, 83, 80, 0.9)', pos: 0.95 } // red
  ];

  gAnnotations.selectAll('.annotation-label')
    .data(annotations, d => d.label)
    .join('text')
    .attr('class', 'annotation annotation-label')
    .attr('x', d => d.pos * innerWidth)
    .attr('y', 16)
    // FIX: the previous strict checks (pos === 0 / pos === 1) never matched
    // the actual pos values (0.07 … 0.95), so every label was centered and
    // the edge labels could spill past the plot. Use thresholds instead.
    .attr('text-anchor', d => d.pos <= 0.1 ? 'start' : d.pos >= 0.9 ? 'end' : 'middle')
    .style('fill', d => d.color)
    .style('font-weight', 'bold')
    .style('font-size', '13px')
    .text(d => d.label);

  // Points (radius scales with chart width, clamped to [8, 14])
  const pointRadius = Math.max(8, Math.min(14, innerWidth / 60));

  // Closed models as filled circles
  const closedModels = models.filter(d => !d.is_open);
  gPoints.selectAll('.point-closed')
    .data(closedModels, d => d.name)
    .join('circle')
    .attr('class', 'point point-closed')
    .attr('cx', d => xScale(d.recklessness_index))
    .attr('cy', d => yScale(d.avg_floored_score))
    .attr('r', pointRadius)
    .attr('fill', d => d.color)
    .attr('stroke', 'none')
    .on('mouseenter', showTooltip)
    .on('mousemove', showTooltip)
    .on('mouseleave', hideTooltip);

  // Open models as stars
  const openModels = models.filter(d => d.is_open);
  gPoints.selectAll('.point-star')
    .data(openModels, d => d.name)
    .join('path')
    .attr('class', 'point point-star')
    .attr('d', d => starPath(xScale(d.recklessness_index), yScale(d.avg_floored_score), pointRadius * 1.2, pointRadius * 0.5))
    .attr('fill', d => d.color)
    .attr('stroke', 'none')
    .on('mouseenter', showTooltip)
    .on('mousemove', showTooltip)
    .on('mouseleave', hideTooltip);

  // Point labels with smart positioning: points within 100px of the right
  // edge get their label drawn to the left so it stays inside the plot.
  gLabels.selectAll('.point-label')
    .data(models, d => d.name)
    .join('text')
    .attr('class', 'point-label')
    .attr('x', d => {
      const xPos = xScale(d.recklessness_index);
      if (xPos > innerWidth - 100) {
        return xPos - pointRadius - 6;
      }
      return xPos + pointRadius + 6;
    })
    .attr('y', d => yScale(d.avg_floored_score) + 4)
    .attr('text-anchor', d => {
      const xPos = xScale(d.recklessness_index);
      return xPos > innerWidth - 100 ? 'end' : 'start';
    })
    .text(d => d.name);
}
406
+
407
// Initialize: fetch the dataset once, then draw.
// NOTE(review): r.ok is not checked — a 404/HTML error page will surface
// as a JSON parse error in the catch below rather than an HTTP error.
fetch(DATA_URL, { cache: 'no-cache' })
  .then(r => r.json())
  .then(json => {
    data = json;
    render();
  })
  .catch(err => {
    // Surface load failures inline instead of failing silently.
    const pre = document.createElement('pre');
    pre.style.color = 'red';
    pre.style.padding = '16px';
    pre.textContent = `Error loading data: ${err.message}`;
    container.appendChild(pre);
  });

// Resize handling (render() is a no-op until data has loaded)
if (window.ResizeObserver) {
  new ResizeObserver(() => render()).observe(container);
} else {
  window.addEventListener('resize', render);
}

// Theme change handling: re-render when the root `data-theme` attribute
// flips so CSS-variable-driven colors are re-applied.
const observer = new MutationObserver(() => render());
observer.observe(document.documentElement, {
  attributes: true,
  attributeFilter: ['data-theme']
});
435
+ };
436
+
437
// Defer bootstrap until the DOM is parsed (so the container div exists),
// loading D3 first in either case.
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
} else {
  ensureD3(bootstrap);
}
442
+ })();
443
+ </script>
app/src/content/embeds/tokens-by-turn.html ADDED
@@ -0,0 +1,487 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="d3-tokens-by-turn"></div>
2
+ <style>
3
+ .d3-tokens-by-turn {
4
+ width: 100%;
5
+ margin: 10px 0;
6
+ position: relative;
7
+ font-family: system-ui, -apple-system, sans-serif;
8
+ }
9
+
10
+ .d3-tokens-by-turn svg {
11
+ display: block;
12
+ width: 100%;
13
+ height: auto;
14
+ }
15
+
16
+ .d3-tokens-by-turn .axes path,
17
+ .d3-tokens-by-turn .axes line {
18
+ stroke: var(--axis-color, var(--text-color));
19
+ }
20
+
21
+ .d3-tokens-by-turn .axes text {
22
+ fill: var(--tick-color, var(--muted-color));
23
+ font-size: 11px;
24
+ }
25
+
26
+ .d3-tokens-by-turn .grid line {
27
+ stroke: var(--grid-color, rgba(0,0,0,.08));
28
+ }
29
+
30
+ .d3-tokens-by-turn .axes text.axis-label {
31
+ font-size: 14px;
32
+ font-weight: 500;
33
+ fill: var(--text-color);
34
+ }
35
+
36
+ .d3-tokens-by-turn .x-axis text {
37
+ transform: translateY(4px);
38
+ }
39
+
40
+ .d3-tokens-by-turn .tokens-line {
41
+ fill: none;
42
+ stroke-width: 1.5;
43
+ }
44
+
45
+ .d3-tokens-by-turn .data-point {
46
+ cursor: pointer;
47
+ transition: opacity 0.15s ease;
48
+ }
49
+
50
+ .d3-tokens-by-turn .data-point:hover {
51
+ opacity: 0.8;
52
+ }
53
+
54
+ .d3-tokens-by-turn .legend {
55
+ font-size: 11px;
56
+ }
57
+
58
+ .d3-tokens-by-turn .legend-item {
59
+ cursor: pointer;
60
+ }
61
+
62
+ .d3-tokens-by-turn .legend-item.dimmed .legend-line,
63
+ .d3-tokens-by-turn .legend-item.dimmed .legend-marker {
64
+ opacity: 0.3;
65
+ }
66
+
67
+ .d3-tokens-by-turn .legend-item.dimmed text {
68
+ opacity: 0.4;
69
+ }
70
+
71
+ .d3-tokens-by-turn .legend-text {
72
+ fill: var(--text-color);
73
+ }
74
+
75
+ .d3-tokens-by-turn .d3-tooltip {
76
+ position: absolute;
77
+ top: 0;
78
+ left: 0;
79
+ transform: translate(-9999px, -9999px);
80
+ pointer-events: none;
81
+ padding: 10px 12px;
82
+ border-radius: 8px;
83
+ font-size: 12px;
84
+ line-height: 1.4;
85
+ border: 1px solid var(--border-color);
86
+ background: var(--surface-bg);
87
+ color: var(--text-color);
88
+ box-shadow: 0 4px 24px rgba(0,0,0,.18);
89
+ opacity: 0;
90
+ transition: opacity 0.12s ease;
91
+ z-index: 10;
92
+ }
93
+
94
+ .d3-tokens-by-turn .d3-tooltip .model-name {
95
+ font-weight: 600;
96
+ margin-bottom: 4px;
97
+ }
98
+
99
+ .d3-tokens-by-turn .d3-tooltip .metric {
100
+ display: flex;
101
+ justify-content: space-between;
102
+ gap: 16px;
103
+ }
104
+
105
+ .d3-tokens-by-turn .d3-tooltip .metric-label {
106
+ color: var(--muted-color);
107
+ }
108
+
109
+ .d3-tokens-by-turn .d3-tooltip .metric-value {
110
+ font-weight: 500;
111
+ }
112
+ </style>
113
+ <script>
114
+ (() => {
115
+ const ensureD3 = (cb) => {
116
+ if (window.d3 && typeof window.d3.select === 'function') return cb();
117
+ let s = document.getElementById('d3-cdn-script');
118
+ if (!s) {
119
+ s = document.createElement('script');
120
+ s.id = 'd3-cdn-script';
121
+ s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
122
+ document.head.appendChild(s);
123
+ }
124
+ const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); };
125
+ s.addEventListener('load', onReady, { once: true });
126
+ if (window.d3) onReady();
127
+ };
128
+
129
+ const bootstrap = () => {
130
+ const scriptEl = document.currentScript;
131
+ let container = scriptEl ? scriptEl.previousElementSibling : null;
132
+ if (!(container && container.classList && container.classList.contains('d3-tokens-by-turn'))) {
133
+ const candidates = Array.from(document.querySelectorAll('.d3-tokens-by-turn'))
134
+ .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
135
+ container = candidates[candidates.length - 1] || null;
136
+ }
137
+ if (!container) return;
138
+ if (container.dataset) {
139
+ if (container.dataset.mounted === 'true') return;
140
+ container.dataset.mounted = 'true';
141
+ }
142
+
143
+ // Tooltip setup
144
+ container.style.position = container.style.position || 'relative';
145
+ const tip = document.createElement('div');
146
+ tip.className = 'd3-tooltip';
147
+ container.appendChild(tip);
148
+
149
+ // SVG setup
150
+ const svg = d3.select(container).append('svg');
151
+ const gRoot = svg.append('g');
152
+
153
+ // Chart groups (order matters for layering)
154
+ const gGrid = gRoot.append('g').attr('class', 'grid');
155
+ const gLines = gRoot.append('g').attr('class', 'lines');
156
+ const gPoints = gRoot.append('g').attr('class', 'points');
157
+ const gAxes = gRoot.append('g').attr('class', 'axes');
158
+ const gLegend = gRoot.append('g').attr('class', 'legend');
159
+
160
+ // State
161
+ let data = null;
162
+ let width = 800;
163
+ let height = 450;
164
+ const margin = { top: 20, right: 180, bottom: 56, left: 72 };
165
+ let hiddenModels = new Set();
166
+
167
+ // Scales
168
+ const xScale = d3.scaleLinear();
169
+ const yScale = d3.scaleLinear();
170
+
171
+ // Line generator
172
+ const line = d3.line()
173
+ .x(d => xScale(d.turn_number))
174
+ .y(d => yScale(d.avg_output_tokens));
175
+
176
+ // Data loading
177
+ const DATA_URL = '/data/tokens_by_turn.json';
178
+
179
+ function updateSize() {
180
+ width = container.clientWidth || 800;
181
+ height = Math.max(350, Math.round(width * 0.5));
182
+ svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
183
+ gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
184
+ return {
185
+ innerWidth: width - margin.left - margin.right,
186
+ innerHeight: height - margin.top - margin.bottom
187
+ };
188
+ }
189
+
190
+ function showTooltip(event, d, model) {
191
+ const rect = container.getBoundingClientRect();
192
+ const x = event.clientX - rect.left;
193
+ const y = event.clientY - rect.top;
194
+
195
+ tip.innerHTML = `
196
+ <div class="model-name" style="color: ${model.color}">${model.name}</div>
197
+ <div class="metric">
198
+ <span class="metric-label">Turn:</span>
199
+ <span class="metric-value">${d.turn_number}</span>
200
+ </div>
201
+ <div class="metric">
202
+ <span class="metric-label">Avg tokens:</span>
203
+ <span class="metric-value">${Math.round(d.avg_output_tokens).toLocaleString()}</span>
204
+ </div>
205
+ <div class="metric">
206
+ <span class="metric-label">Sample size:</span>
207
+ <span class="metric-value">${d.sample_count}</span>
208
+ </div>
209
+ `;
210
+
211
+ const tipWidth = tip.offsetWidth || 150;
212
+ const tipHeight = tip.offsetHeight || 100;
213
+ let tipX = x + 12;
214
+ let tipY = y - tipHeight / 2;
215
+
216
+ if (tipX + tipWidth > width) tipX = x - tipWidth - 12;
217
+ if (tipY < 0) tipY = 8;
218
+ if (tipY + tipHeight > height) tipY = height - tipHeight - 8;
219
+
220
+ tip.style.transform = `translate(${tipX}px, ${tipY}px)`;
221
+ tip.style.opacity = '1';
222
+ }
223
+
224
+ function hideTooltip() {
225
+ tip.style.opacity = '0';
226
+ tip.style.transform = 'translate(-9999px, -9999px)';
227
+ }
228
+
229
+ function toggleModel(modelName) {
230
+ if (hiddenModels.has(modelName)) {
231
+ hiddenModels.delete(modelName);
232
+ } else {
233
+ hiddenModels.add(modelName);
234
+ }
235
+ render();
236
+ }
237
+
238
+ // Helper function to create a 5-point star path
239
+ const starPath = (cx, cy, outerR, innerR) => {
240
+ const points = [];
241
+ for (let i = 0; i < 10; i++) {
242
+ const r = i % 2 === 0 ? outerR : innerR;
243
+ const angle = (Math.PI / 2) + (i * Math.PI / 5);
244
+ points.push([cx + r * Math.cos(angle), cy - r * Math.sin(angle)]);
245
+ }
246
+ return 'M' + points.map(p => p.join(',')).join('L') + 'Z';
247
+ };
248
+
249
+ function render() {
250
+ if (!data) return;
251
+
252
+ const { innerWidth, innerHeight } = updateSize();
253
+ const models = data.models;
254
+
255
+ // Find visible models and compute extents
256
+ const visibleModels = models.filter(m => !hiddenModels.has(m.name));
257
+
258
+ // X scale: turn number 1-30
259
+ xScale
260
+ .domain([1, 30])
261
+ .range([0, innerWidth]);
262
+
263
+ // Y scale: find max tokens across visible models
264
+ let maxTokens = 0;
265
+ visibleModels.forEach(m => {
266
+ m.tokens_by_turn.forEach(t => {
267
+ if (t.avg_output_tokens > maxTokens) maxTokens = t.avg_output_tokens;
268
+ });
269
+ });
270
+ maxTokens = Math.ceil(maxTokens / 2000) * 2000; // Round up to nearest 2000
271
+
272
+ yScale
273
+ .domain([0, maxTokens])
274
+ .range([innerHeight, 0]);
275
+
276
+ // Grid lines
277
+ const xTicks = d3.range(5, 31, 5); // 5, 10, 15, 20, 25, 30
278
+ const yTicks = yScale.ticks(6);
279
+
280
+ gGrid.selectAll('.grid-x')
281
+ .data(xTicks)
282
+ .join('line')
283
+ .attr('class', 'grid-x')
284
+ .attr('x1', d => xScale(d))
285
+ .attr('x2', d => xScale(d))
286
+ .attr('y1', 0)
287
+ .attr('y2', innerHeight);
288
+
289
+ gGrid.selectAll('.grid-y')
290
+ .data(yTicks)
291
+ .join('line')
292
+ .attr('class', 'grid-y')
293
+ .attr('x1', 0)
294
+ .attr('x2', innerWidth)
295
+ .attr('y1', d => yScale(d))
296
+ .attr('y2', d => yScale(d));
297
+
298
+ // Axes
299
+ const tickSize = 6;
300
+
301
+ gAxes.selectAll('.x-axis')
302
+ .data([0])
303
+ .join('g')
304
+ .attr('class', 'x-axis')
305
+ .attr('transform', `translate(0,${innerHeight})`)
306
+ .call(d3.axisBottom(xScale)
307
+ .tickValues([1, 5, 10, 15, 20, 25, 30])
308
+ .tickSizeInner(-tickSize)
309
+ .tickSizeOuter(0));
310
+
311
+ gAxes.selectAll('.y-axis')
312
+ .data([0])
313
+ .join('g')
314
+ .attr('class', 'y-axis')
315
+ .call(d3.axisLeft(yScale)
316
+ .ticks(6)
317
+ .tickFormat(d => d >= 1000 ? `${d/1000}k` : d)
318
+ .tickSizeInner(-tickSize)
319
+ .tickSizeOuter(0));
320
+
321
+ // Axis labels
322
+ gAxes.selectAll('.x-label')
323
+ .data([0])
324
+ .join('text')
325
+ .attr('class', 'x-label axis-label')
326
+ .attr('x', innerWidth / 2)
327
+ .attr('y', innerHeight + 44)
328
+ .attr('text-anchor', 'middle')
329
+ .text('Turn Number');
330
+
331
+ gAxes.selectAll('.y-label')
332
+ .data([0])
333
+ .join('text')
334
+ .attr('class', 'y-label axis-label')
335
+ .attr('x', -innerHeight / 2)
336
+ .attr('y', -52)
337
+ .attr('text-anchor', 'middle')
338
+ .attr('transform', 'rotate(-90)')
339
+ .text('Average Output Tokens');
340
+
341
+ // Lines for each model
342
+ gLines.selectAll('.tokens-line')
343
+ .data(visibleModels, d => d.name)
344
+ .join('path')
345
+ .attr('class', 'tokens-line')
346
+ .attr('d', d => line(d.tokens_by_turn))
347
+ .attr('stroke', d => d.color)
348
+ .attr('stroke-dasharray', d => d.is_open ? '6,3' : 'none');
349
+
350
+ // Data points
351
+ const allPoints = visibleModels.flatMap(model =>
352
+ model.tokens_by_turn.map(p => ({ ...p, model }))
353
+ );
354
+ const closedPoints = allPoints.filter(d => !d.model.is_open);
355
+ const openPoints = allPoints.filter(d => d.model.is_open);
356
+
357
+ // Circles for closed models
358
+ gPoints.selectAll('.data-point-circle')
359
+ .data(closedPoints, d => `${d.model.name}-${d.turn_number}`)
360
+ .join('circle')
361
+ .attr('class', 'data-point data-point-circle')
362
+ .attr('cx', d => xScale(d.turn_number))
363
+ .attr('cy', d => yScale(d.avg_output_tokens))
364
+ .attr('r', 3)
365
+ .attr('fill', d => d.model.color)
366
+ .attr('stroke', 'var(--surface-bg, white)')
367
+ .attr('stroke-width', 1)
368
+ .on('mouseenter', (event, d) => showTooltip(event, d, d.model))
369
+ .on('mousemove', (event, d) => showTooltip(event, d, d.model))
370
+ .on('mouseleave', hideTooltip);
371
+
372
+ // Stars for open models
373
+ gPoints.selectAll('.data-point-star')
374
+ .data(openPoints, d => `${d.model.name}-${d.turn_number}`)
375
+ .join('path')
376
+ .attr('class', 'data-point data-point-star')
377
+ .attr('d', d => starPath(
378
+ xScale(d.turn_number),
379
+ yScale(d.avg_output_tokens),
380
+ 5, 2.2
381
+ ))
382
+ .attr('fill', d => d.model.color)
383
+ .attr('stroke', 'var(--surface-bg, white)')
384
+ .attr('stroke-width', 0.6)
385
+ .on('mouseenter', (event, d) => showTooltip(event, d, d.model))
386
+ .on('mousemove', (event, d) => showTooltip(event, d, d.model))
387
+ .on('mouseleave', hideTooltip);
388
+
389
+ // Legend
390
+ const legendX = innerWidth + 16;
391
+ const legendItemHeight = 20;
392
+
393
+ gLegend.selectAll('.legend-item')
394
+ .data(models, d => d.name)
395
+ .join('g')
396
+ .attr('class', d => `legend-item ${hiddenModels.has(d.name) ? 'dimmed' : ''}`)
397
+ .attr('transform', (d, i) => `translate(${legendX}, ${i * legendItemHeight})`)
398
+ .each(function(d) {
399
+ const g = d3.select(this);
400
+ g.selectAll('*').remove();
401
+
402
+ // Line segment
403
+ g.append('line')
404
+ .attr('class', 'legend-line')
405
+ .attr('x1', 0)
406
+ .attr('x2', 20)
407
+ .attr('y1', 0)
408
+ .attr('y2', 0)
409
+ .attr('stroke', d.color)
410
+ .attr('stroke-width', 1.5)
411
+ .attr('stroke-dasharray', d.is_open ? '4,2' : 'none');
412
+
413
+ // Marker - circle for closed, star for open
414
+ if (d.is_open) {
415
+ g.append('path')
416
+ .attr('class', 'legend-marker')
417
+ .attr('d', starPath(10, 0, 5, 2.2))
418
+ .attr('fill', d.color);
419
+ } else {
420
+ g.append('circle')
421
+ .attr('class', 'legend-marker')
422
+ .attr('cx', 10)
423
+ .attr('cy', 0)
424
+ .attr('r', 3)
425
+ .attr('fill', d.color);
426
+ }
427
+
428
+ g.append('text')
429
+ .attr('class', 'legend-text')
430
+ .attr('x', 26)
431
+ .attr('y', 4)
432
+ .text(d.name);
433
+
434
+ g.style('cursor', 'pointer')
435
+ .on('click', () => toggleModel(d.name));
436
+ });
437
+
438
+ // Legend note about line styles
439
+ const noteY = models.length * legendItemHeight + 12;
440
+ gLegend.selectAll('.legend-note')
441
+ .data([0])
442
+ .join('text')
443
+ .attr('class', 'legend-note')
444
+ .attr('x', legendX)
445
+ .attr('y', noteY)
446
+ .attr('font-size', '10px')
447
+ .attr('fill', 'var(--muted-color)')
448
+ .text('Solid = Closed, Dashed = Open');
449
+ }
450
+
451
+ // Initialize
452
+ fetch(DATA_URL, { cache: 'no-cache' })
453
+ .then(r => r.json())
454
+ .then(json => {
455
+ data = json;
456
+ render();
457
+ })
458
+ .catch(err => {
459
+ const pre = document.createElement('pre');
460
+ pre.style.color = 'red';
461
+ pre.style.padding = '16px';
462
+ pre.textContent = `Error loading data: ${err.message}`;
463
+ container.appendChild(pre);
464
+ });
465
+
466
+ // Resize handling
467
+ if (window.ResizeObserver) {
468
+ new ResizeObserver(() => render()).observe(container);
469
+ } else {
470
+ window.addEventListener('resize', render);
471
+ }
472
+
473
+ // Theme change handling
474
+ const observer = new MutationObserver(() => render());
475
+ observer.observe(document.documentElement, {
476
+ attributes: true,
477
+ attributeFilter: ['data-theme']
478
+ });
479
+ };
480
+
481
+ if (document.readyState === 'loading') {
482
+ document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
483
+ } else {
484
+ ensureD3(bootstrap);
485
+ }
486
+ })();
487
+ </script>
app/src/styles/_layout.css CHANGED
@@ -195,4 +195,30 @@
195
  width: 100%;
196
  min-width: 0;
197
  }
198
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  width: 100%;
196
  min-width: 0;
197
  }
198
+ }
199
+
200
+ /* ============================================================================ */
201
+ /* Bibliography/References - hide inline sections that should be in footer */
202
+ /* ---------------------------------------------------------------------------- */
203
+ /* References sections with data-built-refs are generated per-chapter by
204
+ rehype-citation. The Footer.astro script consolidates them into the footer.
205
+ These styles ensure inline refs don't display in the main content area. */
206
+
207
+ /* References in main content should not be displayed - they belong in footer.
208
+ The Footer.astro script moves them; this CSS is a visual safeguard. */
209
+ main [data-built-refs],
210
+ main #references:not(.footer-processed),
211
+ main section.references:not(ol),
212
+ main div.references:not(ol),
213
+ main .bibliography:not(ol) {
214
+ /* Collapse to zero height to prevent layout impact, but keep in DOM for JS */
215
+ max-height: 0;
216
+ overflow: hidden;
217
+ margin: 0 !important;
218
+ padding: 0 !important;
219
+ border: none !important;
220
+ opacity: 0;
221
+ pointer-events: none;
222
+ }
223
+
224
+ /* Once moved to footer, these styles don't apply (not inside main) */
bibliography_fix.md ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bibliography System Fix for Multi-Chapter MDX Articles
2
+
3
+ This document describes changes made to the [Research Article Template](https://huggingface.co/spaces/tfrere/research-article-template) to fix bibliography/references placement when using multiple MDX chapter files.
4
+
5
+ ## The Problem
6
+
7
+ When an article is split into multiple MDX chapter files (e.g., `introduction.mdx`, `results.mdx`, etc.) that are imported into a main `article.mdx`, the bibliography appears at the end of each chapter instead of consolidated in the footer.
8
+
9
+ ### Root Cause
10
+
11
+ Astro compiles each MDX file independently through the remark/rehype pipeline. The `rehype-citation` plugin appends a `<section id="references">` to the end of **every** MDX file that contains citations.
12
+
13
+ This causes two issues:
14
+ 1. **Duplicate IDs**: Multiple `<section id="references">` elements (invalid HTML)
15
+ 2. **Scattered bibliographies**: References appear after each chapter instead of once at the end
16
+
17
+ ### Original Template Behavior
18
+
19
+ The original `Footer.astro` only looked for the **first** references section using `findFirstOutsideFooter()`. This worked for single-file articles but failed for multi-chapter structures.
20
+
21
+ ## The Solution
22
+
23
+ A three-phase approach: build-time marking, runtime consolidation, and a CSS display fallback.
24
+
25
+ ### Phase 1: Build-Time (post-citation.mjs)
26
+
27
+ Mark ALL references sections so they can be found at runtime.
28
+
29
+ ### Phase 2: Runtime (Footer.astro)
30
+
31
+ Consolidate all marked sections into the footer, merging list items and removing duplicates.
32
+
33
+ ### Phase 3: CSS Fallback (_layout.css)
34
+
35
+ Hide any unconsolidated sections as a visual safety net.
36
+
37
+ ---
38
+
39
+ ## Changes Made
40
+
41
+ ### 1. `app/plugins/rehype/post-citation.mjs`
42
+
43
+ **Change**: Find and process ALL references sections, not just the first one.
44
+
45
+ ```javascript
46
+ // BEFORE: Only found first section
47
+ const findReferencesRoot = () => {
48
+ let found = null;
49
+ walk(tree, null, (node) => {
50
+ if (found) return; // <-- Stopped after first match
51
+ // ...
52
+ });
53
+ return found;
54
+ };
55
+
56
+ // AFTER: Find ALL sections
57
+ const findAllReferencesRoots = () => {
58
+ const found = [];
59
+ walk(tree, null, (node) => {
60
+ if (!isElement(node)) return;
61
+ const id = getAttr(node, 'id');
62
+ if (id === 'references' || hasClass(node, 'references') || hasClass(node, 'bibliography')) {
63
+ if (!found.includes(node)) {
64
+ found.push(node);
65
+ }
66
+ }
67
+ });
68
+ return found;
69
+ };
70
+ ```
71
+
72
+ **Change**: Process all sections in a loop and mark each with `data-built-refs`.
73
+
74
+ ```javascript
75
+ // BEFORE: Single section processing
76
+ const refsRoot = findReferencesRoot();
77
+ if (refsRoot) {
78
+ // ... process single section
79
+ setAttr(refsRoot, 'data-built-refs', '1');
80
+ }
81
+
82
+ // AFTER: Loop through all sections
83
+ const allRefsRoots = findAllReferencesRoots();
84
+ for (const refsRoot of allRefsRoots) {
85
+ // ... process each section
86
+ setAttr(refsRoot, 'data-built-refs', '1');
87
+ }
88
+ ```
89
+
90
+ ---
91
+
92
+ ### 2. `app/src/components/Footer.astro`
93
+
94
+ **Change**: Add `[data-built-refs]` to selector list (was missing).
95
+
96
+ ```javascript
97
+ // BEFORE: Missing the data attribute selector
98
+ const allRefsEls = findAllOutsideFooter([
99
+ "#bibliography-references-list",
100
+ "[data-bibliography-block]", // <-- This doesn't exist
101
+ "#references",
102
+ // ...
103
+ ]);
104
+
105
+ // AFTER: Added data-built-refs and improved selector order
106
+ const allRefsEls = findAllOutsideFooter([
107
+ "[data-built-refs]", // <-- Added: what post-citation.mjs actually sets
108
+ "[data-bibliography-block]",
109
+ "#bibliography-references-list",
110
+ "section#references",
111
+ "div#references",
112
+ "#refs",
113
+ ".references:not(ol)",
114
+ ".bibliography",
115
+ ]);
116
+ ```
117
+
118
+ **Change**: Improved duplicate detection with CSS.escape fallback.
119
+
120
+ ```javascript
121
+ // BEFORE: Could fail if CSS.escape unavailable or ID has special chars
122
+ if (!itemId || !targetOl.querySelector(`#${CSS.escape(itemId)}`)) {
123
+ targetOl.appendChild(item);
124
+ }
125
+
126
+ // AFTER: Robust fallback
127
+ if (itemId) {
128
+ try {
129
+ const escapedId = CSS.escape ? CSS.escape(itemId) : itemId.replace(/([^\w-])/g, '\\$1');
130
+ if (targetOl.querySelector(`#${escapedId}`)) {
131
+ return; // Skip duplicate
132
+ }
133
+ } catch (e) {
134
+ // Manual check if selector fails
135
+ const existing = Array.from(targetOl.querySelectorAll('li')).find(li => li.id === itemId);
136
+ if (existing) return;
137
+ }
138
+ }
139
+ targetOl.appendChild(item);
140
+ ```
141
+
142
+ **Change**: Added MutationObserver to catch dynamically rendered content.
143
+
144
+ ```javascript
145
+ // Watch for dynamically added content (e.g., lazy-loaded components)
146
+ const observer = new MutationObserver((mutations) => {
147
+ if (footer.dataset.processed !== "true") {
148
+ attemptMove();
149
+ } else {
150
+ // Check if any new references sections were added
151
+ for (const mutation of mutations) {
152
+ for (const node of mutation.addedNodes) {
153
+ if (node.nodeType === 1) {
154
+ const el = node;
155
+ if (
156
+ el.id === "references" ||
157
+ el.classList?.contains("references") ||
158
+ el.hasAttribute?.("data-built-refs")
159
+ ) {
160
+ footer.dataset.processed = "false";
161
+ attemptMove();
162
+ return;
163
+ }
164
+ }
165
+ }
166
+ }
167
+ }
168
+ });
169
+
170
+ if (contentRoot) {
171
+ observer.observe(contentRoot, { childList: true, subtree: true });
172
+ }
173
+
174
+ // Stop observing after page is fully loaded
175
+ window.addEventListener("load", () => {
176
+ setTimeout(() => observer.disconnect(), 2000);
177
+ }, { once: true });
178
+ ```
179
+
180
+ ---
181
+
182
+ ### 3. `app/src/styles/_layout.css`
183
+
184
+ **Change**: Added CSS to hide any inline references sections that weren't consolidated.
185
+
186
+ ```css
187
+ /* Bibliography/References - hide inline sections that should be in footer */
188
+ /* These styles ensure inline refs don't display in the main content area. */
189
+
190
+ main [data-built-refs],
191
+ main #references:not(.footer-processed),
192
+ main section.references:not(ol),
193
+ main div.references:not(ol),
194
+ main .bibliography:not(ol) {
195
+ /* Collapse to zero height to prevent layout impact, but keep in DOM for JS */
196
+ max-height: 0;
197
+ overflow: hidden;
198
+ margin: 0 !important;
199
+ padding: 0 !important;
200
+ border: none !important;
201
+ opacity: 0;
202
+ pointer-events: none;
203
+ }
204
+ ```
205
+
206
+ ---
207
+
208
+ ## How It Works Now
209
+
210
+ 1. **Build time**: Each MDX chapter is compiled. `rehype-citation` adds a bibliography section to each. `post-citation.mjs` marks ALL of them with `data-built-refs="1"`.
211
+
212
+ 2. **Page load**: `Footer.astro` JavaScript runs:
213
+ - Finds all elements with `[data-built-refs]` or other bibliography selectors
214
+ - Moves the first section to the footer
215
+ - Extracts `<li>` items from subsequent sections and appends to the consolidated list
216
+ - Skips duplicates (same ID)
217
+ - Removes empty leftover sections
218
+
219
+ 3. **Visual fallback**: CSS hides any sections that might remain in the main content (timing edge cases).
220
+
221
+ ---
222
+
223
+ ## Testing
224
+
225
+ 1. Run `npm run dev` and open the article
226
+ 2. Scroll to the footer - all references should appear there
227
+ 3. Open browser dev tools:
228
+ - Search for `data-built-refs` - should only exist in footer
229
+ - Check that no `#references` sections remain in `<main>`
230
+ 4. Click citation links - should scroll to footer references
231
+
232
+ ---
233
+
234
+ ## Files Modified
235
+
236
+ | File | Change |
237
+ |------|--------|
238
+ | `app/plugins/rehype/post-citation.mjs` | Find and mark ALL references sections |
239
+ | `app/src/components/Footer.astro` | Improved selectors, robust deduplication, MutationObserver |
240
+ | `app/src/styles/_layout.css` | CSS fallback to hide unconsolidated sections |
241
+
242
+ ---
243
+
244
+ ## Upstream Contribution
245
+
246
+ These changes could be contributed back to the original template. The fix is backward-compatible:
247
+ - Single-file articles work exactly as before
248
+ - Multi-chapter articles now work correctly
249
+ - No configuration changes needed
interactive-charts.md CHANGED
@@ -409,9 +409,11 @@ For frameless embedding (like the banner):
409
  | 3 | `confidence_distribution.json` | Grouped histogram | Done (confidence-distribution.html) |
410
  | 4 | `score_vs_failed_guesses.json` | Scatter | TODO |
411
  | 5 | `excess_caution.json` | Box plot | TODO |
 
412
  | 6 | `caution_vs_failed_guesses.json` | Scatter | Done (caution-vs-failed-guesses.html) |
413
  | 7 | `by_rule.json` | Strip plot | Done (by-rule.html) |
414
  | 8 | `complexity_analysis.json` | Heatmap | Done (complexity-analysis.html) |
 
415
 
416
  ## Testing
417
 
 
409
  | 3 | `confidence_distribution.json` | Grouped histogram | Done (confidence-distribution.html) |
410
  | 4 | `score_vs_failed_guesses.json` | Scatter | TODO |
411
  | 5 | `excess_caution.json` | Box plot | TODO |
412
+ | 5b | `tokens_by_turn.json` | Multi-line | Done (tokens-by-turn.html) |
413
  | 6 | `caution_vs_failed_guesses.json` | Scatter | Done (caution-vs-failed-guesses.html) |
414
  | 7 | `by_rule.json` | Strip plot | Done (by-rule.html) |
415
  | 8 | `complexity_analysis.json` | Heatmap | Done (complexity-analysis.html) |
416
+ | 9 | `complexity_ratio.json` | Horizontal dot plot | Done (complexity-ratio.html) |
417
 
418
  ## Testing
419