dlouapre HF Staff committed on
Commit
cafe265
·
1 Parent(s): e0197ee

Improved charts

Browse files
app/plugins/rehype/post-citation.mjs CHANGED
@@ -299,20 +299,28 @@ export default function rehypeReferencesAndFootnotes() {
299
  }
300
  };
301
 
302
- // Find references container and normalize its list
303
- const findReferencesRoot = () => {
304
- let found = null;
305
  walk(tree, null, (node) => {
306
- if (found) return;
307
  if (!isElement(node)) return;
308
  const id = getAttr(node, 'id');
309
  if (id === 'references' || hasClass(node, 'references') || hasClass(node, 'bibliography')) {
310
- found = node;
 
 
 
311
  }
312
  });
313
  return found;
314
  };
315
 
 
 
 
 
 
 
316
  const toOrderedList = (container) => {
317
  // If there is already an <ol>, use it; otherwise convert common structures
318
  let ol = getChildren(container).find((c) => isElement(c) && c.tagName === 'ol');
@@ -340,15 +348,18 @@ export default function rehypeReferencesAndFootnotes() {
340
  return ol;
341
  };
342
 
343
- const refsRoot = findReferencesRoot();
344
- let refsOl = null;
 
345
  const refIdSet = new Set();
346
  const refIdToExternalHref = new Map();
347
 
348
- if (refsRoot) {
349
- refsOl = toOrderedList(refsRoot);
 
 
350
  // Collect item ids and linkify their content
351
- for (const li of getChildren(refsOl)) {
352
  if (!isElement(li) || li.tagName !== 'li') continue;
353
  if (!getAttr(li, 'id')) {
354
  // Try to find a nested element with id to promote
@@ -380,6 +391,7 @@ export default function rehypeReferencesAndFootnotes() {
380
  if (externalHref) refIdToExternalHref.set(String(id), externalHref);
381
  }
382
  }
 
383
  setAttr(refsRoot, 'data-built-refs', '1');
384
  }
385
 
 
299
  }
300
  };
301
 
302
+ // Find ALL references containers (there may be multiple from different MDX imports)
303
+ const findAllReferencesRoots = () => {
304
+ const found = [];
305
  walk(tree, null, (node) => {
 
306
  if (!isElement(node)) return;
307
  const id = getAttr(node, 'id');
308
  if (id === 'references' || hasClass(node, 'references') || hasClass(node, 'bibliography')) {
309
+ // Don't add if already found (shouldn't happen but be safe)
310
+ if (!found.includes(node)) {
311
+ found.push(node);
312
+ }
313
  }
314
  });
315
  return found;
316
  };
317
 
318
+ // Legacy function for backwards compatibility
319
+ const findReferencesRoot = () => {
320
+ const all = findAllReferencesRoots();
321
+ return all.length > 0 ? all[0] : null;
322
+ };
323
+
324
  const toOrderedList = (container) => {
325
  // If there is already an <ol>, use it; otherwise convert common structures
326
  let ol = getChildren(container).find((c) => isElement(c) && c.tagName === 'ol');
 
348
  return ol;
349
  };
350
 
351
+ // Process ALL references sections (there may be multiple from different MDX imports)
352
+ const allRefsRoots = findAllReferencesRoots();
353
+ let refsOl = null; // Keep track of the first one for backlink processing
354
  const refIdSet = new Set();
355
  const refIdToExternalHref = new Map();
356
 
357
+ for (const refsRoot of allRefsRoots) {
358
+ const currentOl = toOrderedList(refsRoot);
359
+ if (!refsOl) refsOl = currentOl; // Use first ol for backlinks
360
+
361
  // Collect item ids and linkify their content
362
+ for (const li of getChildren(currentOl)) {
363
  if (!isElement(li) || li.tagName !== 'li') continue;
364
  if (!getAttr(li, 'id')) {
365
  // Try to find a nested element with id to promote
 
391
  if (externalHref) refIdToExternalHref.set(String(id), externalHref);
392
  }
393
  }
394
+ // Mark each references section so Footer.astro can find them all
395
  setAttr(refsRoot, 'data-built-refs', '1');
396
  }
397
 
app/src/components/Footer.astro CHANGED
@@ -142,15 +142,82 @@ const { citationText, bibtex, licence, doi } = Astro.props as Props;
142
  return null;
143
  };
144
 
145
- const referencesEl = findFirstOutsideFooter([
146
- "#bibliography-references-list",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  "[data-bibliography-block]",
148
- "#references",
 
 
149
  "#refs",
150
- ".references",
151
  ".bibliography",
152
  ]);
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  // Try multiple selectors for footnotes
155
  const footnotesEl = findFirstOutsideFooter([
156
  "[data-built-footnotes]",
@@ -159,7 +226,6 @@ const { citationText, bibtex, licence, doi } = Astro.props as Props;
159
  "div.footnotes",
160
  ]);
161
 
162
- const movedRefs = moveIntoFooter(referencesEl, "References");
163
  const movedNotes = moveIntoFooter(footnotesEl, "Footnotes");
164
 
165
  if (movedRefs || movedNotes) {
@@ -196,8 +262,50 @@ const { citationText, bibtex, licence, doi } = Astro.props as Props;
196
  // Final attempt after a short delay
197
  setTimeout(attemptMove, 300);
198
 
199
- // Resize on window changes (e.g., fonts, layout)
200
- // No textarea auto-resize needed for <pre> blocks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  })();
202
  </script>
203
 
 
142
  return null;
143
  };
144
 
145
+ // Find ALL references/bibliography sections and consolidate them
146
+ const findAllOutsideFooter = (selectors) => {
147
+ const results = [];
148
+ const searchRoots = [contentRoot, document.body].filter(Boolean);
149
+ for (const root of searchRoots) {
150
+ for (const sel of selectors) {
151
+ const els = root.querySelectorAll(sel);
152
+ els.forEach(el => {
153
+ if (el && !footer.contains(el) && !results.includes(el)) {
154
+ results.push(el);
155
+ }
156
+ });
157
+ }
158
+ }
159
+ return results;
160
+ };
161
+
162
+ // Find all bibliography/references sections
163
+ // Note: post-citation.mjs adds data-built-refs="1" to processed sections
164
+ // We use multiple selectors to catch different formats, prioritizing data attributes
165
+ // over IDs (since duplicate IDs are invalid HTML and have undefined querySelector behavior)
166
+ const allRefsEls = findAllOutsideFooter([
167
+ "[data-built-refs]",
168
  "[data-bibliography-block]",
169
+ "#bibliography-references-list",
170
+ "section#references",
171
+ "div#references",
172
  "#refs",
173
+ ".references:not(ol)",
174
  ".bibliography",
175
  ]);
176
 
177
+ // Consolidate multiple bibliography sections into one
178
+ let movedRefs = false;
179
+ if (allRefsEls.length > 0) {
180
+ // Move the first one normally
181
+ movedRefs = moveIntoFooter(allRefsEls[0], "References");
182
+
183
+ // For additional bibliography sections, merge their list items into the first one
184
+ if (allRefsEls.length > 1) {
185
+ // Find the target ol - it's now inside the moved element within target
186
+ const targetOl = target.querySelector("ol.references") || target.querySelector("ol");
187
+
188
+ for (let i = 1; i < allRefsEls.length; i++) {
189
+ const extraEl = allRefsEls[i];
190
+ // Find ol inside the extra section (could be nested)
191
+ const extraOl = extraEl.querySelector("ol.references") || extraEl.querySelector("ol");
192
+
193
+ if (extraOl && targetOl) {
194
+ // Move all list items from extra bibliography to the consolidated one
195
+ const items = Array.from(extraOl.querySelectorAll(":scope > li"));
196
+ items.forEach(item => {
197
+ // Check if this reference already exists (by id) to avoid duplicates
198
+ const itemId = item.id;
199
+ if (itemId) {
200
+ // Use try-catch since CSS.escape might not be available in all browsers
201
+ try {
202
+ const escapedId = CSS.escape ? CSS.escape(itemId) : itemId.replace(/([^\w-])/g, '\\$1');
203
+ if (targetOl.querySelector(`#${escapedId}`)) {
204
+ return; // Skip duplicate
205
+ }
206
+ } catch (e) {
207
+ // If selector fails, check manually
208
+ const existing = Array.from(targetOl.querySelectorAll('li')).find(li => li.id === itemId);
209
+ if (existing) return;
210
+ }
211
+ }
212
+ targetOl.appendChild(item);
213
+ });
214
+ }
215
+ // Remove the now-empty extra bibliography section from the DOM
216
+ extraEl.remove();
217
+ }
218
+ }
219
+ }
220
+
221
  // Try multiple selectors for footnotes
222
  const footnotesEl = findFirstOutsideFooter([
223
  "[data-built-footnotes]",
 
226
  "div.footnotes",
227
  ]);
228
 
 
229
  const movedNotes = moveIntoFooter(footnotesEl, "Footnotes");
230
 
231
  if (movedRefs || movedNotes) {
 
262
  // Final attempt after a short delay
263
  setTimeout(attemptMove, 300);
264
 
265
+ // Watch for dynamically added content (e.g., lazy-loaded components)
266
+ // This catches references sections that might be added after initial render
267
+ const observer = new MutationObserver((mutations) => {
268
+ // Only re-run if we haven't fully processed yet or new ref sections appeared
269
+ if (footer.dataset.processed !== "true") {
270
+ attemptMove();
271
+ } else {
272
+ // Check if any new references sections were added
273
+ for (const mutation of mutations) {
274
+ for (const node of mutation.addedNodes) {
275
+ if (node.nodeType === 1) { // Element node
276
+ const el = node;
277
+ if (
278
+ el.id === "references" ||
279
+ el.classList?.contains("references") ||
280
+ el.classList?.contains("bibliography") ||
281
+ el.hasAttribute?.("data-built-refs")
282
+ ) {
283
+ // Reset processed flag and re-consolidate
284
+ footer.dataset.processed = "false";
285
+ attemptMove();
286
+ return;
287
+ }
288
+ // Also check for nested references
289
+ if (el.querySelector?.("[data-built-refs], #references, .references, .bibliography")) {
290
+ footer.dataset.processed = "false";
291
+ attemptMove();
292
+ return;
293
+ }
294
+ }
295
+ }
296
+ }
297
+ }
298
+ });
299
+
300
+ // Observe the main content area for changes
301
+ if (contentRoot) {
302
+ observer.observe(contentRoot, { childList: true, subtree: true });
303
+ }
304
+
305
+ // Stop observing after page is fully loaded + a delay
306
+ window.addEventListener("load", () => {
307
+ setTimeout(() => observer.disconnect(), 2000);
308
+ }, { once: true });
309
  })();
310
  </script>
311
 
app/src/content/assets/data/basic_metrics.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f67f1217568824b751da562d8106fae602792a64c38abb4b7c8bae75698249c0
3
- size 2716
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:847fb061c6643d04446b69249d9c56ba67ea1b502013fc57ff71366d36978a23
3
+ size 2817
app/src/content/assets/data/complexity_ratio.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a59aeba80d14b977f47948d2fcfbd818685df06b033c0b3bb6ee889ae976ab4
3
+ size 2386
app/src/content/assets/data/complexity_ratio.png ADDED

Git LFS Details

  • SHA256: 32c2783e40f3b71ac7c61a138d8af768a5a39721e0bae40298f54c0d5e60dac4
  • Pointer size: 130 Bytes
  • Size of remote file: 93.2 kB
app/src/content/assets/data/overall_performance.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ddb4557b07eab530ae73d9ce849c542f503fc3656166e9b6164034b5cba83bf
3
  size 2391
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5e079335d1cf6f5c53df229031920c15561847b7e65476ba93f6526669df8a8
3
  size 2391
app/src/content/assets/data/overall_performance.png CHANGED

Git LFS Details

  • SHA256: 02a2fedd1f6b603d295472aa3ceae73c0159a6b6e675311a6376e2323441bf3d
  • Pointer size: 130 Bytes
  • Size of remote file: 79 kB

Git LFS Details

  • SHA256: c00e19078eaea1aa2ef665814b5659f68a2ade00fb397bf78b47816b1312c37a
  • Pointer size: 130 Bytes
  • Size of remote file: 79 kB
app/src/content/assets/data/score_vs_recklessness.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fb4ed9d7a1296dd45431123b58fb52c0d5224a8cc4113cb1b53eab95b8fb610
3
+ size 3251
app/src/content/assets/data/score_vs_recklessness.png ADDED

Git LFS Details

  • SHA256: e42807adf23e46a8288607a425f3a20d97c0db484f9f62329347e6dfd011da7d
  • Pointer size: 130 Bytes
  • Size of remote file: 85.6 kB
app/src/content/assets/data/summary.txt CHANGED
@@ -25,17 +25,17 @@ Loaded colors for 17 models
25
  BASIC MODEL COMPARISON
26
  ============================================================
27
 
28
- model rounds_played total_score avg_score total_floored_score avg_floored_score total_turns total_output_tokens total_wall_clock avg_failed_guesses success_rate total_no_stakes_score avg_no_stakes_score avg_output_tokens_per_turn wall_clock_per_turn intra_rule_variance inter_rule_variance variance_ratio
29
- Claude Opus 4.5 78 1128 14.461538 1324 16.974359 852 4333716 86367.64 2.000000 0.833333 1598.0 20.487179 5086.521127 101.370469 25.000000 81.385983 0.307178
30
- Kimi K2 78 804 10.307692 1262 16.179487 975 12281540 101346.76 2.038462 0.769231 1481.0 18.987179 12596.451282 103.945395 25.538462 88.446496 0.288745
31
- Grok 4 1 Fast Reasoning 78 737 9.448718 1182 15.153846 998 8178655 120364.22 2.564103 0.717949 1441.0 18.474359 8195.045090 120.605431 25.243590 106.499829 0.237029
32
- Gpt 5.2 High 78 1158 14.846154 1174 15.051282 1205 3341037 73525.83 0.282051 0.948718 1505.0 19.294872 2772.644813 61.017286 24.628205 36.601709 0.672870
33
- Gpt 5 Mini Medium 78 942 12.076923 1052 13.487179 1261 3618399 58345.97 1.166667 0.705128 1325.0 16.987179 2869.467883 46.269603 39.141026 82.882051 0.472250
34
- Deepseek R1 78 511 6.551282 1036 13.282051 1104 9229131 165334.16 3.192308 0.641026 1331.0 17.064103 8359.720109 149.759203 29.628205 115.135043 0.257334
35
- Gemini 3 Flash Preview Low 78 817 10.474359 1024 13.128205 1315 1581524 12702.02 0.961538 0.705128 1226.0 15.717949 1202.679848 9.659331 29.923077 83.049573 0.360304
36
- Gpt Oss 120B 78 580 7.435897 1004 12.871795 1243 3190828 24633.15 2.153846 0.679487 1279.0 16.397436 2567.037812 19.817498 46.692308 78.676239 0.593474
37
- Gpt Oss 20B 78 131 1.679487 927 11.884615 1297 7009392 62397.50 2.974359 0.589744 1206.0 15.461538 5404.311488 48.109098 47.576923 88.239487 0.539180
38
- Claude Haiku 4.5 78 -37 -0.474359 894 11.461538 1254 6973411 57734.39 3.948718 0.564103 1198.0 15.358974 5560.933812 46.040183 45.102564 107.387350 0.419999
39
 
40
  Saved: results/260121_78_rounds/basic_metrics.csv
41
  Saved: results/260121_78_rounds/overall_performance.png
@@ -130,6 +130,8 @@ Saved: results/260121_78_rounds/excess_caution.png
130
  Saved: results/260121_78_rounds/excess_caution.json
131
  Saved: results/260121_78_rounds/caution_vs_failed_guesses.png
132
  Saved: results/260121_78_rounds/caution_vs_failed_guesses.json
 
 
133
 
134
  ============================================================
135
  RECKLESS GUESSING ANALYSIS
@@ -169,6 +171,58 @@ Longest streak: 8 consecutive wrong guesses
169
  Saved: results/260121_78_rounds/reckless_guessing.png
170
  Saved: results/260121_78_rounds/reckless_guessing.json
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  ============================================================
173
  PER-MODEL REPORTS
174
  ============================================================
 
25
  BASIC MODEL COMPARISON
26
  ============================================================
27
 
28
+ model rounds_played total_score avg_score total_floored_score avg_floored_score total_turns total_output_tokens total_wall_clock avg_failed_guesses success_rate counting_output_tokens total_no_stakes_score avg_no_stakes_score avg_output_tokens_per_turn wall_clock_per_turn intra_rule_variance inter_rule_variance variance_ratio
29
+ Claude Opus 4.5 78 1128 14.461538 1324 16.974359 756 4333716 86367.64 2.000000 0.833333 3430535 1598.0 20.487179 4537.744709 114.242910 25.000000 81.385983 0.307178
30
+ Kimi K2 78 804 10.307692 1262 16.179487 801 12281540 101346.76 2.038462 0.769231 5918992 1481.0 18.987179 7389.503121 126.525293 25.538462 88.446496 0.288745
31
+ Grok 4 1 Fast Reasoning 78 737 9.448718 1182 15.153846 795 8178655 120364.22 2.564103 0.717949 4559832 1441.0 18.474359 5735.637736 151.401535 25.243590 106.499829 0.237029
32
+ Gpt 5.2 High 78 1158 14.846154 1174 15.051282 1195 3341037 73525.83 0.282051 0.948718 3232254 1505.0 19.294872 2704.815063 61.527891 24.628205 36.601709 0.672870
33
+ Gpt 5 Mini Medium 78 942 12.076923 1052 13.487179 1163 3618399 58345.97 1.166667 0.705128 2998454 1325.0 16.987179 2578.206363 50.168504 39.141026 82.882051 0.472250
34
+ Deepseek R1 78 511 6.551282 1036 13.282051 851 9229131 165334.16 3.192308 0.641026 5944454 1331.0 17.064103 6985.257344 194.282209 29.628205 115.135043 0.257334
35
+ Gemini 3 Flash Preview Low 78 817 10.474359 1024 13.128205 1207 1581524 12702.02 0.961538 0.705128 1389850 1226.0 15.717949 1151.491301 10.523629 29.923077 83.049573 0.360304
36
+ Gpt Oss 120B 78 580 7.435897 1004 12.871795 1041 3190828 24633.15 2.153846 0.679487 2250622 1279.0 16.397436 2161.980788 23.662968 46.692308 78.676239 0.593474
37
+ Gpt Oss 20B 78 131 1.679487 927 11.884615 972 7009392 62397.50 2.974359 0.589744 3234713 1206.0 15.461538 3327.894033 64.194959 47.576923 88.239487 0.539180
38
+ Claude Haiku 4.5 78 -37 -0.474359 894 11.461538 848 6973411 57734.39 3.948718 0.564103 4053200 1198.0 15.358974 4779.716981 68.083007 45.102564 107.387350 0.419999
39
 
40
  Saved: results/260121_78_rounds/basic_metrics.csv
41
  Saved: results/260121_78_rounds/overall_performance.png
 
130
  Saved: results/260121_78_rounds/excess_caution.json
131
  Saved: results/260121_78_rounds/caution_vs_failed_guesses.png
132
  Saved: results/260121_78_rounds/caution_vs_failed_guesses.json
133
+ Saved: results/260121_78_rounds/score_vs_recklessness.png
134
+ Saved: results/260121_78_rounds/score_vs_recklessness.json
135
 
136
  ============================================================
137
  RECKLESS GUESSING ANALYSIS
 
171
  Saved: results/260121_78_rounds/reckless_guessing.png
172
  Saved: results/260121_78_rounds/reckless_guessing.json
173
 
174
+ ============================================================
175
+ COMPLEXITY RATIO ANALYSIS
176
+ ============================================================
177
+
178
+ Analyzed 9634 tentative rules with confidence >= 5
179
+ Using optimal k = 0.420 for aggregated complexity
180
+
181
+ Complexity Ratio by Model:
182
+ (Ratio = Tentative Complexity / Actual Complexity)
183
+
184
+ Model Median Q25 Q75 Count
185
+ Gpt Oss 120B 1.322 0.873 2.355 1182
186
+ Gpt Oss 20B 1.155 0.782 2.065 1219
187
+ Claude Haiku 4.5 1.054 0.736 2.000 1001
188
+ Deepseek R1 1.000 0.762 1.756 933
189
+ Gemini 3 Flash Preview Low 1.000 0.781 1.519 1016
190
+ Gpt 5 Mini Medium 1.000 0.765 1.664 939
191
+ Gpt 5.2 High 1.000 0.791 1.187 857
192
+ Grok 4 1 Fast Reasoning 1.000 0.777 1.657 938
193
+ Claude Opus 4.5 0.984 0.707 1.169 664
194
+ Kimi K2 0.976 0.622 1.275 885
195
+
196
+ Interpretation:
197
+ - Ratio > 1: Model tends to overcomplicate rules
198
+ - Ratio < 1: Model tends to oversimplify rules
199
+ - Ratio ≈ 1: Model matches actual rule complexity
200
+
201
+ Highest median: Gpt Oss 120B (1.322)
202
+ Lowest median: Kimi K2 (0.976)
203
+
204
+ Saved: results/260121_78_rounds/complexity_ratio.png
205
+ Saved: results/260121_78_rounds/complexity_ratio.json
206
+
207
+ ============================================================
208
+ OUTPUT TOKENS BY TURN
209
+ ============================================================
210
+
211
+ Saved: results/260121_78_rounds/tokens_by_turn.png
212
+ Saved: results/260121_78_rounds/tokens_by_turn.json
213
+
214
+ Tokens trend summary (early vs late turns):
215
+ Claude Haiku 4.5: early=3191, late=5889 (+84.5%)
216
+ Claude Opus 4.5: early=2649, late=8447 (+218.9%)
217
+ Deepseek R1: early=5083, late=10946 (+115.3%)
218
+ Gemini 3 Flash Preview Low: early=1046, late=1351 (+29.1%)
219
+ Gpt 5 Mini Medium: early=1241, late=4862 (+291.9%)
220
+ Gpt 5.2 High: early=963, late=5910 (+514.0%)
221
+ Gpt Oss 120B: early=1050, late=4475 (+326.2%)
222
+ Gpt Oss 20B: early=1744, late=7789 (+346.6%)
223
+ Grok 4 1 Fast Reasoning: early=2810, late=17827 (+534.4%)
224
+ Kimi K2: early=5545, late=10653 (+92.1%)
225
+
226
  ============================================================
227
  PER-MODEL REPORTS
228
  ============================================================
app/src/content/assets/data/tokens_by_turn.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ceb3f9cc62ed081b7c59ff9f58903166d3d267ab7f3ad73143b41682301dddd9
3
+ size 39913
app/src/content/assets/data/tokens_by_turn.png ADDED

Git LFS Details

  • SHA256: b1076a4296b75b27f7244a090fd4f28019c19989bcf4f74184578f31310acdce
  • Pointer size: 131 Bytes
  • Size of remote file: 280 kB
app/src/content/chapters/eleusis/appendix.mdx CHANGED
@@ -1,40 +1,8 @@
1
- import Accordion from "../../../components/Accordion.astro";
2
  import Note from "../../../components/Note.astro";
3
  import Sidenote from "../../../components/Sidenote.astro";
4
 
5
  ## Appendix: Detailed Methods
6
 
7
- ### Models Evaluated
8
-
9
- <Accordion title="Model configurations" open>
10
-
11
- We evaluated 10 models across 5 providers:
12
-
13
- | Model | Provider | Type |
14
- |-------|----------|------|
15
- | Claude Opus 4.5 | Anthropic | Proprietary |
16
- | Claude Haiku 4.5 | Anthropic | Proprietary |
17
- | GPT 5.2 High | OpenAI | Proprietary |
18
- | GPT 5 Mini Medium | OpenAI | Proprietary |
19
- | Gemini 3 Flash Preview Low | Google | Proprietary |
20
- | Grok 4.1 Fast Reasoning | xAI | Proprietary |
21
- | Kimi K2 | Moonshot (via HF) | Open weights |
22
- | DeepSeek R1 | DeepSeek (via HF) | Open weights |
23
- | GPT OSS 120B | Community (via HF) | Open weights |
24
- | GPT OSS 20B | Community (via HF) | Open weights |
25
-
26
- All models were evaluated with the following settings:
27
-
28
- | Parameter | Value |
29
- |-----------|-------|
30
- | Temperature | 0.7 |
31
- | Max tokens | 16384 |
32
- | Retries | 3 (on API failures) |
33
-
34
- Reasoning models were allowed their default reasoning budgets.
35
-
36
- </Accordion>
37
-
38
  ### Rule Checking
39
 
40
 
 
 
1
  import Note from "../../../components/Note.astro";
2
  import Sidenote from "../../../components/Sidenote.astro";
3
 
4
  ## Appendix: Detailed Methods
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  ### Rule Checking
7
 
8
 
app/src/content/chapters/eleusis/benchmark.mdx CHANGED
@@ -4,21 +4,19 @@ import Sidenote from "../../../components/Sidenote.astro";
4
 
5
  ### The Original Game
6
 
7
- In the original Eleusis card game, one player acts as the "dealer" (sometimes called "God" or "Nature") and secretly invents a rule determining which cards can be legally played. The other players (called "scientists") don't know this rule, they must discover it through experimentation.
8
-
9
- Players take turns playing cards from their hand onto a central "mainline." If a card satisfies the secret rule, the dealer accepts it and it is added to the mainline. If it violates the rule, it's rejected and placed in a "sideline" below the mainline at that position. Over time, the pattern of accepted and rejected cards provides evidence about the hidden rule.
10
 
11
  <Sidenote>
12
  The name "Eleusis" comes from the ancient Greek mystery cult, where initiates gradually discovered hidden truths.
13
  </Sidenote>
14
 
15
- At any point, a player can attempt to guess the rule; correctly identifying it ends the game, but a wrong guess incurs a penalty. The game continues until someone correctly identifies the rule. A specific scoring system rewards efficiency in discovering the rule while penalizing reckless guessing.
16
 
17
  ### Our Adaptation
18
 
19
  We adapted Eleusis into a single-player benchmark focused purely on the scientific reasoning process. By removing multi-player dynamics, we isolate the core challenge: hypothesis formation and testing under uncertainty.
20
 
21
- The game uses a standard 52-card deck with ranks 1–13 (Ace through King) and four suits. A secret rule determines whether each card is accepted or rejected. It is a deterministic function that takes the card being played and the current sequence of accepted cards (the "mainline"). The player maintains a hand of 12 cards, drawing a replacement after each play.
22
 
23
  On each turn, the player selects a card from their hand to play. If the card satisfies the secret rule, it joins the mainline; if rejected, it's placed in a sideline below the mainline at that position. While playing a card, the player may attempt to guess the rule. The game continues until the player correctly identifies the rule or reaches 30 turns.
24
 
@@ -31,7 +29,7 @@ For instance, a player who correctly identifies the rule on turn 13 with no wron
31
  This creates an interesting tension: guessing early yields more points if correct, but wrong guesses are costly. The optimal strategy requires accurately assessing one's own confidence and acting accordingly.
32
 
33
  ### Rule Library
34
- In the original game, the dealer has to invent a secret rule on the spot. However, for benchmarking LLMs, we need a fixed set of rules to ensure comparability across model runs. We created a library of 26 hand-crafted rules spanning a range of types and complexity. Some rules involve simply card properties (e.g., "only red cards"), while others depend on the sequence of previously accepted cards (e.g., "card rank must be higher than previous card"). The rule might involve rank, suits, color or a combination thereof, and may include positional dependencies.
35
 
36
  Here are some example rules from our library, with a tentative categorization:
37
 
@@ -57,7 +55,8 @@ The model is free to reason, but it is asked to output a structured response con
57
  4. **Confidence level**: A self-reported probability (0–10 scale, where 7 means "I estimate 70% chance my tentative rule is correct");
58
  5. **Guess decision**: Whether to formally try to guess the rule this turn, or not.
59
 
60
- Example output
 
61
  ```
62
  {
63
  "reasoning_summary": "To test if the rule depends on rank, I play a 4♣ (same rank as the starter 4♠) hoping to see if same-rank cards are accepted.",
@@ -68,6 +67,12 @@ Example output
68
  }
69
  ```
70
 
71
- **This structure lets us analyze not just whether models succeed, but *how* they reason:** Do they update hypotheses appropriately when evidence contradicts them? Do they explore strategically or play conservatively? Is their stated confidence calibrated to their actual accuracy? In particular, forcing the model to articulate a tentative rule and a confidence level in it (even if they don't want to guess it yet) allows us to (secretely) evaluate it nonetheless, which will be useful for measuring calibration and guessing abilities.
 
 
 
 
 
 
72
 
73
 
 
4
 
5
  ### The Original Game
6
 
7
+ To recap the core mechanics: players take turns playing cards onto a central "mainline." If a card satisfies the secret rule, it is accepted; otherwise it's rejected and placed in a "sideline" below that position. At any point, a player can attempt to guess the rule—correctly identifying it ends the game, but a wrong guess incurs a penalty.
 
 
8
 
9
  <Sidenote>
10
  The name "Eleusis" comes from the ancient Greek mystery cult, where initiates gradually discovered hidden truths.
11
  </Sidenote>
12
 
13
+ The scoring system rewards efficiency: discovering the rule quickly earns more points, while wrong guesses are penalized.
14
 
15
  ### Our Adaptation
16
 
17
  We adapted Eleusis into a single-player benchmark focused purely on the scientific reasoning process. By removing multi-player dynamics, we isolate the core challenge: hypothesis formation and testing under uncertainty.
18
 
19
+ The game uses a standard 52-card deck with ranks 1–13 (Ace through King) and four suits. The secret rule is a deterministic function of the card being played and the current mainline sequence. The player maintains a hand of 12 cards, drawing a replacement after each play.
20
 
21
  On each turn, the player selects a card from their hand to play. If the card satisfies the secret rule, it joins the mainline; if rejected, it's placed in a sideline below the mainline at that position. While playing a card, the player may attempt to guess the rule. The game continues until the player correctly identifies the rule or reaches 30 turns.
22
 
 
29
  This creates an interesting tension: guessing early yields more points if correct, but wrong guesses are costly. The optimal strategy requires accurately assessing one's own confidence and acting accordingly.
30
 
31
  ### Rule Library
32
+ In the original game, the dealer invents a secret rule on the spot. For benchmarking LLMs, we need a fixed set of rules to ensure comparability across runs. We created a library of 26 hand-crafted rules designed to cover the space of rule types (static properties, sequential dependencies, cyclic patterns) while remaining tractable to evaluate. Some rules involve simple card properties (e.g., "only red cards"), while others depend on the sequence of previously accepted cards (e.g., "card rank must be higher than previous card"). The rule might involve rank, suits, color or a combination thereof, and may include positional dependencies.
33
 
34
  Here are some example rules from our library, with a tentative categorization:
35
 
 
55
  4. **Confidence level**: A self-reported probability (0–10 scale, where 7 means "I estimate 70% chance my tentative rule is correct");
56
  5. **Guess decision**: Whether to formally try to guess the rule this turn, or not.
57
 
58
+ #### Example output
59
+
60
  ```
61
  {
62
  "reasoning_summary": "To test if the rule depends on rank, I play a 4♣ (same rank as the starter 4♠) hoping to see if same-rank cards are accepted.",
 
67
  }
68
  ```
69
 
70
+ **This structure lets us analyze not just whether models succeed, but *how* they reason:**
71
+
72
+ - Do they update hypotheses appropriately when evidence contradicts them?
73
+ - Do they explore strategically or play conservatively?
74
+ - Is their stated confidence calibrated to their actual accuracy?
75
+
76
+ Forcing the model to articulate a tentative rule and confidence level (even when not formally guessing) allows us to secretly evaluate it at every turn—useful for measuring calibration.
77
 
78
 
app/src/content/chapters/eleusis/introduction.mdx CHANGED
@@ -3,17 +3,17 @@ import Image from "../../../components/Image.astro";
3
 
4
  import exampleSequence from "../../assets/image/example_sequence.png";
5
 
6
- Large language models are increasingly being deployed as tools for scientific research : analyzing data, generating hypotheses, and even designing experiments. But how well do they actually embody the scientific method?
7
 
8
  <Sidenote>
9
  Read time: 15–20 minutes.
10
  </Sidenote>
11
 
12
- Most reasoning benchmarks test whether models can solve well-defined problems: given premises, derive a conclusion. The ARC challenge [@chollet2019measure], for instance, evaluates inductive reasoning on visual patterns. **These benchmarks capture important capabilities, but they miss something fundamental about how science actually works.**
13
 
14
  First, real scientific reasoning is not a single inference step. It's an iterative agentic process of observation, hypothesis formation, experimentation, and refinement, often spanning many cycles before reaching a conclusion. It requires not just logical ability, but also *strategic thinking*: which experiment to run next, how much evidence is enough, when to commit to a theory versus when to keep exploring.
15
 
16
- Also, beyond pure reasoning, effective science depends on psychological factors that are rarely evaluated: **calibration** (does my confidence match my actual accuracy?) [@lichtenstein1977calibration], **metacognition** (how certain am I about my uncertainty?) [@flavell1979metacognition], and resistance to **cognitive biases** like confirmation bias (seeking only evidence that supports my current hypothesis instead of trying to challenge it) [@nickerson1998confirmation]. A scientist who is brilliant at deduction but overconfident in weak theories will waste resources pursuing dead ends. One who is well-calibrated but overly cautious may never publish.
17
 
18
  We wanted to test whether LLMs can exhibit these deeper aspects of scientific reasoning. To do this, we turned to an unlikely source: a 1950s card game called Eleusis.
19
 
@@ -33,4 +33,4 @@ Eleusis was designed by @abbott1977eleusis explicitly to simulate the process of
33
 
34
  We built a benchmark around Eleusis to evaluate LLMs on this iterative, hypothesis-driven reasoning. Rather than testing knowledge retrieval or instruction-following, our benchmark asks: *can models act like scientists?* Can they observe evidence, form hypotheses, design informative experiments, and refine their theories? Can they calibrate their confidence appropriately and know when they've gathered enough evidence to commit to a conclusion?
35
 
36
- These skills are fundamental not just to science, but to debugging code, medical diagnosis, and everyday reasoning under uncertainty.
 
3
 
4
  import exampleSequence from "../../assets/image/example_sequence.png";
5
 
6
+ Large language models are increasingly being deployed as tools for scientific research: analyzing data, generating hypotheses, and even designing experiments. But how well do they actually embody the scientific method?
7
 
8
  <Sidenote>
9
  Read time: 15–20 minutes.
10
  </Sidenote>
11
 
12
+ Most reasoning benchmarks test whether models can solve well-defined problems with clear solutions. The ARC challenge [@chollet2019measure], for instance, evaluates inductive reasoning on visual patterns. **These benchmarks capture important capabilities, but they miss something fundamental about how science actually works.**
13
 
14
  First, real scientific reasoning is not a single inference step. It's an iterative agentic process of observation, hypothesis formation, experimentation, and refinement, often spanning many cycles before reaching a conclusion. It requires not just logical ability, but also *strategic thinking*: which experiment to run next, how much evidence is enough, when to commit to a theory versus when to keep exploring.
15
 
16
+ Moreover, effective science depends on psychological factors that are rarely evaluated: **calibration** (does my confidence match my actual accuracy?) [@lichtenstein1977calibration], **metacognition** (how certain am I about my uncertainty?) [@flavell1979metacognition], and resistance to **cognitive biases** like confirmation bias (seeking only evidence that supports my current hypothesis instead of trying to challenge it) [@nickerson1998confirmation]. A scientist who is brilliant at deduction but overconfident in weak theories will waste resources pursuing dead ends. One who is well-calibrated but overly cautious may never publish.
17
 
18
  We wanted to test whether LLMs can exhibit these deeper aspects of scientific reasoning. To do this, we turned to an unlikely source: a 1950s card game called Eleusis.
19
 
 
33
 
34
  We built a benchmark around Eleusis to evaluate LLMs on this iterative, hypothesis-driven reasoning. Rather than testing knowledge retrieval or instruction-following, our benchmark asks: *can models act like scientists?* Can they observe evidence, form hypotheses, design informative experiments, and refine their theories? Can they calibrate their confidence appropriately and know when they've gathered enough evidence to commit to a conclusion?
35
 
36
+ These skills matter beyond the laboratory: debugging code, diagnosing patients, and navigating everyday uncertainty all require the same iterative process of hypothesis and test.
app/src/content/chapters/eleusis/results.mdx CHANGED
@@ -6,9 +6,28 @@ import HtmlEmbed from "../../../components/HtmlEmbed.astro";
6
 
7
  ## 2. Results
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  ### Overall Performance
10
 
11
- We evaluated ten models on the Eleusis benchmark, including both proprietary and open-weight models. Performance is measured as the average score per turn. We also report token usage (output + reasoning) per turn to compare efficiency.
12
 
13
  <HtmlEmbed
14
  src="overall-performance.html"
@@ -19,15 +38,15 @@ We evaluated ten models on the Eleusis benchmark, including both proprietary and
19
 
20
  Performance varies dramatically among tested models.
21
 
22
- * **Claude Opus 4.5** achieves top performance with 17.0 score and moderate token usage. The open-weight model **Kimi K2 Thinking** comes second at 16.2 and performs competitively with the best proprietary models (outperforming GPT 5.2 High and being close to Claude Opus 4.5), but at the price of a 2.5× larger reasoning budget.
23
 
24
- * **GPT 5.2 High** and **Grok 4.1 Fast Reasoning** show a similar performance around 15, but GPT 5.2 High is 3 times more token efficient.
25
 
26
- * **GPT-5-Mini**, **GPT OSS-120B** and **Gemini 3 Flash Preview Low** cluster in the mid-tier (around 13) with low token usage. While Deepseek R1, an open-weight model specialized for reasoning tasks, achieves a similar score but with a much larger token count.
27
 
28
- * Finally, **GPT-OSS 20B** and **Claude Haiku 4.5** lag behind, scoring between 11 and 12 with moderate token usage.
29
 
30
- As we mentionned, this score reflects not only the pure model's ability to find the correct rule, but also its metacognitive skills: knowing when to commit, how confident it is, and how to balance exploration vs. exploitation. To distinguish these factors, we also computed an alternative "no-stakes" score that removes penalties for wrong guesses and counts tentative rules as guesses.
31
 
32
  ### Pure discovery versus metacognition
33
 
@@ -47,7 +66,7 @@ Even if using this alternative scoring does not change a lot the relative rankin
47
  * GPT 5.2 High and Claude Haiku 4.5 are the two models with the largest difference between raw and no-stakes scores (more than 4), suggesting they are the most penalized by wrong guesses or delayed guessing.
48
  * On the other hand, Gemini 3 Flash Preview Low and Kimi K2 have the smallest difference (less than 3) and benefit the least from this alternative scoring, indicating a better balance between discovery and metacognition.
49
 
50
- They might be two reasons for the difference between the raw and the no-stakes scores:
51
  1. The model is reckless and makes a lot of wrong guesses, incurring penalties.
52
  2. The model is too cautious and waits too long before guessing, missing out on points.
53
 
@@ -67,14 +86,28 @@ To estimate caution, we can compute on average **how many turns a model waits wh
67
  src="caution-vs-failed-guesses.html"
68
  caption="<strong>Figure 3:</strong> The caution-recklessness trade-off. Models in the upper-left are cautious (delay correct guesses); models in the lower-right are reckless (many failed guesses). The ideal position is lower-left: quick to commit when right, rarely wrong."
69
  id="fig-caution-reckless"
 
70
  />
71
 
72
- How should we interpret those values ? Knowing that a failed guess costs 2 points, while each turn of delay costs 1 point, the optimal number of failed guesses per round should be around 0.5 (i.e., 1 failed guess every 2 rounds) to balance the two sources of loss. We can see that most models are above that threshold, indicating **a clear tendency towards recklessness**. This is confirmed by the fact that they have a low caution value (most models wait around 1 turn or less on average before guessing when they have the correct rule).
73
 
74
- On the other hand, **GPT 5.2 High has a singular behavior** with very few failed guesses (0.28 per round) but a high caution (waiting 3.5 turns on average before guessing when it has the correct rule). Gemini 3 Flash Preview Low and GPT 5 Mini Medium are intermediate in both dimensions, Gemini achieving a better balance with on average 2 points lost due to caution and 2 points lost due to recklessness (1 failed guess every round on average).
 
 
75
 
76
  To try to understand deeper the causes of recklessness and caution, we now turn to an analysis of confidence and guessing strategies.
77
 
 
 
 
 
 
 
 
 
 
 
 
78
  ### Confidence and Calibration
79
 
80
  Models are asked to output their confidence level, with clear instructions on what it means (7 = 70% probability of being correct, etc.). Even when they don't guess, they report their tentative rule. When confidence ≥5, we test whether they would have guessed correctly, even if they didn't formally attempt to do so. **This allows us to evaluate calibration: does reported confidence match actual accuracy?** This is particularly relevant as modern neural networks have been shown to be poorly calibrated [@guo2017calibration].
@@ -83,17 +116,18 @@ Models are asked to output their confidence level, with clear instructions on wh
83
  src="calibration-curves.html"
84
  caption="<strong>Figure 4:</strong> Calibration curves for each model (for reported confidence ≥5). A perfectly calibrated model would follow the diagonal. Points below the line indicate overconfidence: they correspond to confidence levels where actual success rates are lower than reported. Click legend items to show/hide models."
85
  id="fig-calibration"
 
86
  />
87
 
88
  The calibration analysis reveals several patterns:
89
 
90
- - **All models are very overconfident** : for instance when they report 80% confidence, their actual success rates are often closer to 20% !
91
- - GPT 5.2 is the best calibrated model overall, being the closest to the diagonal line, although it is still slightly overconfident.
92
- - Even models with a strong performance like Claude Opus 4.5 and Kimi K2 show significant overconfidence.
93
 
94
- Is overconfidence a problem ? In our setting, not necessarily; it depends on how the model decides to act on it.
95
 
96
- **For a perfectly calibrated model**, as the expected loss for a failed guess is twice the expected opportunity cost of waiting one turn, **the optimal confidence threshold for guessing is 0.67** (i.e., guess when you believe your tentative rule has at least a 67% chance of being correct). But do model follow such a strategy ?
97
 
98
  For this, we can look at how often models guess at each reported confidence level. This is shown in the following figure. For each confidence level (from 5 to 10), we compute the guess rate: the fraction of turns the model actually attempts to guess when reporting that confidence.
99
 
@@ -102,16 +136,37 @@ For this, we can look at how often models guess at each reported confidence leve
102
  src="guess-rate.html"
103
  caption="<strong>Figure 5:</strong> Guess rate per confidence level. The optimal decision theoretic curve for a perfectly calibrated model should be a step at 67%. Click legend items to show/hide models."
104
  id="fig-confidence"
 
105
  />
106
 
107
  Once again, we observe significant differences from one model to another. Grok 4.1 and Gemini 3 will essentially only guess when very confident (9 or 10). Most other models will also often guess at confidence levels above 8 and rarely below. The two Claude models show different behaviors: Claude Opus 4.5 tends to guess more aggressively at confidence level 8, while Claude Haiku 4.5 often guesses even at confidence level 7.
108
 
109
- We can see that **models on average are more cautious than the optimal decision-theoretic strategy** for a perfectly calibrated model, which would guess as soon as confidence exceeds 67%. This is somehow a good thing for them, given that all models are overconfident. **By raising the threshold for guessing, they reduce the risk of wrong guesses and compensate for their poor calibration.**
110
 
111
- This is particularly true for Gemini 3 Flash Preview Low which is very cautious, guessing only 1/3 of the time at reported confidence 9 ! This compensates its overconfidence, which is probably what helps it achieve a good balance between failed guesses and lost opportunity cost. This is reflected in our "no-stakes" analysis by the fact that it's the model with the smallest difference between raw and no-stakes scores.
112
 
113
  The case of GPT 5.2 High is different: it is both fairly well calibrated and very cautious, leading to very few failed guesses but a high opportunity cost due to delayed guessing. This suggests that GPT 5.2 High could improve its performance by being more aggressive in guessing once it has a correct tentative rule, especially at confidence level 8.
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  ### Performance by Rule Complexity
117
 
@@ -143,20 +198,46 @@ The following plot breaks down the success rate of each model per complexity qua
143
  />
144
 
145
 
146
- Interestingly, code complexity (as measured by our combination of cyclomatic complexity and AST node count) doesn't perfectly predict difficulty, as semantic concepts also play a role. For instance a rule like "only face cards" has a complexity equivalent to "only A, 2 and 3", but the former is easier for models (and humans !) due to familiarity with the semantic category of face cards.
147
 
148
- Also rules involving rare events (low acceptance rate). Only aces is harder than "only even ranks" despite being simpler, simply because models need more evidence to confirm it.
 
 
149
 
150
- An interesting test: are symmetric rules equally difficult? For example, "only spades" vs "only non-spades" should be logically equivalent in difficulty, but models might have biases.
151
- For instance average score on "only spades" is 25, but "no spades" is 20.
152
 
153
  ### Complexity of rules produced
154
 
155
- #### Overly Complex Rules
156
- Failure mode: models have a tendency to produce over complicated rules, even if they were informed that the rule is typically one sentence. They can produce tentative rules like "...".
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
- TODO : Backup this with examples from logs and "guess complexity" vs "actual complexity".
159
 
 
160
 
161
- #### Overfitting Rules
162
- We have observed qualitative evidence of model producing overfit rules that explain all observations so far, but fail to generalize. For instance if all accepted cards so far are red, and happens to be only number cards (simply because no red face card has been tried yet), the model may hypothesize "only red number cards" rather than the simpler "only red cards."
 
6
 
7
  ## 2. Results
8
 
9
+ ### Models Evaluated
10
+
11
+ We evaluated ten frontier models from six labs, including both proprietary and open-weight models. Open-weight models were accessed via Hugging Face inference providers. Several models offer configurable reasoning levels, which we indicate when applicable.
12
+
13
+ | Model | Lab | Provider | Reasoning |
14
+ |-------|-----|----------|-----------|
15
+ | Claude Opus 4.5 | Anthropic | Anthropic | 16000 tok. |
16
+ | Claude Haiku 4.5 | Anthropic | Anthropic | 16000 tok. |
17
+ | GPT 5.2 | OpenAI | OpenAI | High |
18
+ | GPT 5 Mini | OpenAI | OpenAI | Medium |
19
+ | Gemini 3 Flash Preview | Google | Google | Low |
20
+ | Grok 4.1 | xAI | xAI | Fast |
21
+ | Kimi K2 Thinking | Moonshot | 🤗 Inference providers | — |
22
+ | DeepSeek R1 | DeepSeek | 🤗 Inference providers | — |
23
+ | GPT OSS 120B | Community | 🤗 Inference providers | — |
24
+ | GPT OSS 20B | Community | 🤗 Inference providers | — |
25
+
26
+ All models were evaluated with temperature 0.7 and max tokens of 16,384. Reasoning models were allowed their default reasoning budgets. Each model played 78 rounds (26 rules × 3 seeds).
27
+
28
  ### Overall Performance
29
 
30
+ Performance is measured as the average score per round. We also report token usage (output + reasoning) per turn to compare efficiency.
31
 
32
  <HtmlEmbed
33
  src="overall-performance.html"
 
38
 
39
  Performance varies dramatically among tested models.
40
 
41
+ * **Claude Opus 4.5** achieves top performance with a score of 17.0 and moderate token usage. The open-weight model **Kimi K2** comes second at 16.2, performing competitively with the best proprietary models, but at the cost of a larger reasoning budget.
42
 
43
+ * **GPT 5.2 High** and **Grok 4.1 Fast Reasoning** show similar performance around 15, but GPT 5.2 High is 2 times more token-efficient.
44
 
45
+ * **GPT 5 Mini Medium**, **GPT OSS 120B**, and **Gemini 3 Flash Preview Low** cluster in the mid-tier (around 13) with low token usage. **DeepSeek R1**, an open-weight model specialized for reasoning tasks, achieves a similar score but with a much larger token count.
46
 
47
+ * Finally, **GPT OSS 20B** and **Claude Haiku 4.5** lag behind, scoring between 11 and 12 with moderate token usage.
48
 
49
+ As mentioned earlier, this score reflects not only the model's ability to find the correct rule, but also its metacognitive skills: knowing when to commit, how confident to be, and how to balance exploration versus exploitation. To distinguish these factors, we computed an alternative "no-stakes" score that removes penalties for wrong guesses and counts tentative rules as guesses.
50
 
51
  ### Pure discovery versus metacognition
52
 
 
66
  * GPT 5.2 High and Claude Haiku 4.5 are the two models with the largest difference between raw and no-stakes scores (more than 4), suggesting they are the most penalized by wrong guesses or delayed guessing.
67
  * On the other hand, Gemini 3 Flash Preview Low and Kimi K2 have the smallest difference (less than 3) and benefit the least from this alternative scoring, indicating a better balance between discovery and metacognition.
68
 
69
+ There are two possible reasons for the gap between raw and no-stakes scores:
70
  1. The model is reckless and makes a lot of wrong guesses, incurring penalties.
71
  2. The model is too cautious and waits too long before guessing, missing out on points.
72
 
 
86
  src="caution-vs-failed-guesses.html"
87
  caption="<strong>Figure 3:</strong> The caution-recklessness trade-off. Models in the upper-left are cautious (delay correct guesses); models in the lower-right are reckless (many failed guesses). The ideal position is lower-left: quick to commit when right, rarely wrong."
88
  id="fig-caution-reckless"
89
+ wide
90
  />
91
 
92
+ How should we interpret these values? A failed guess costs 2 points, while each turn of delay costs 1 point, so the optimal number of failed guesses per round should be around 0.5 (one failed guess every two rounds) to balance both sources of loss. Most models exceed this threshold, indicating **a clear tendency towards recklessness**. This is confirmed by low caution values: most models wait around 1 turn or less on average before guessing when they have the correct rule.
93
 
94
+ **GPT 5.2 High stands out** with very few failed guesses (0.28 per round) but high caution—waiting 3.5 turns on average before guessing when it has the correct rule.
95
+
96
+ Gemini 3 Flash Preview Low and GPT 5 Mini Medium occupy an intermediate position. Gemini achieves a better balance, losing on average 2 points to caution and 2 points to recklessness (about one failed guess per round).
97
 
98
  To try to understand deeper the causes of recklessness and caution, we now turn to an analysis of confidence and guessing strategies.
99
 
100
+ A way to summarize this behavior is to compute a **boldness index** as the difference between the points lost by being reckless (failed guesses) and the points lost by being cautious (delayed correct guesses). A positive value indicates more loss due to recklessness, while a negative value indicates more loss due to caution. This is reported in the following chart.
101
+
102
+ <HtmlEmbed
103
+ src="score-vs-recklessness.html"
104
+ caption="<strong>Figure 3b:</strong> Score vs Boldness Index. The boldness index combines failed guesses and caution into a single metric (2 × failed guesses − caution). Models in the center have a decision strategy that balances recklessness and caution. Models on the left are losing points because of their excessive caution, while models on the right are losing points because of their recklessness."
105
+ id="fig-score-recklessness"
106
+ wide
107
+ />
108
+
109
+ A way to understand this chart is in terms of missed opportunity. Models in the center achieve a good balance between recklessness and caution, minimizing lost points. They perform at the best level their inductive abilities permit. Models on the left are too cautious, missing out on points by delaying correct guesses. At identical inductive ability, they could improve their score by guessing earlier. Models on the right are too reckless, losing points from frequent wrong guesses. At identical inductive ability, they could improve their score by being more cautious and guessing less often.
110
+
111
  ### Confidence and Calibration
112
 
113
  Models are asked to output their confidence level, with clear instructions on what it means (7 = 70% probability of being correct, etc.). Even when they don't guess, they report their tentative rule. When confidence ≥5, we test whether they would have guessed correctly, even if they didn't formally attempt to do so. **This allows us to evaluate calibration: does reported confidence match actual accuracy?** This is particularly relevant as modern neural networks have been shown to be poorly calibrated [@guo2017calibration].
 
116
  src="calibration-curves.html"
117
  caption="<strong>Figure 4:</strong> Calibration curves for each model (for reported confidence ≥5). A perfectly calibrated model would follow the diagonal. Points below the line indicate overconfidence: they correspond to confidence levels where actual success rates are lower than reported. Click legend items to show/hide models."
118
  id="fig-calibration"
119
+ wide
120
  />
121
 
122
  The calibration analysis reveals several patterns:
123
 
124
+ - **All models are overconfident**: when they report 80% confidence, their actual success rates are often closer to 20%.
125
+ - GPT 5.2 High is the best-calibrated model overall, staying closest to the diagonal, though still slightly overconfident.
126
+ - Even strong performers like Claude Opus 4.5 and Kimi K2 show significant overconfidence.
127
 
128
+ Is overconfidence a problem? In our setting, not necessarily—it depends on how the model acts on it.
129
 
130
+ **For a perfectly calibrated model**, since the expected loss from a failed guess is twice the opportunity cost of waiting one turn, **the optimal confidence threshold for guessing is 0.67** (guess when you believe your tentative rule has at least a 67% chance of being correct). But do models follow such a strategy?
131
 
132
  For this, we can look at how often models guess at each reported confidence level. This is shown in the following figure. For each confidence level (from 5 to 10), we compute the guess rate: the fraction of turns the model actually attempts to guess when reporting that confidence.
133
 
 
136
  src="guess-rate.html"
137
  caption="<strong>Figure 5:</strong> Guess rate per confidence level. The optimal decision theoretic curve for a perfectly calibrated model should be a step at 67%. Click legend items to show/hide models."
138
  id="fig-confidence"
139
+ wide
140
  />
141
 
142
  Once again, we observe significant differences from one model to another. Grok 4.1 and Gemini 3 will essentially only guess when very confident (9 or 10). Most other models will also often guess at confidence levels above 8 and rarely below. The two Claude models show different behaviors: Claude Opus 4.5 tends to guess more aggressively at confidence level 8, while Claude Haiku 4.5 often guesses even at confidence level 7.
143
 
144
+ **Models are on average more cautious than the optimal decision-theoretic strategy** for a perfectly calibrated model, which would guess as soon as confidence exceeds 67%. This actually benefits them, given their overconfidence: **by raising the threshold for guessing, they reduce the risk of wrong guesses and compensate for their poor calibration.**
145
 
146
+ This is particularly true for Gemini 3 Flash Preview Low, which is very cautious, guessing only 1/3 of the time at reported confidence 9. This compensates for its overconfidence and likely explains its good balance between failed guesses and lost opportunity cost—reflected in our "no-stakes" analysis by its having the smallest gap between raw and no-stakes scores.
147
 
148
  The case of GPT 5.2 High is different: it is both fairly well calibrated and very cautious, leading to very few failed guesses but a high opportunity cost due to delayed guessing. This suggests that GPT 5.2 High could improve its performance by being more aggressive in guessing once it has a correct tentative rule, especially at confidence level 8.
149
 
150
+ ### Reasoning effort vs turn count
151
+
152
+ To see whether models tend to think more per turn when the round is longer, we plotted the average number of output tokens per turn.
153
+
154
+ <HtmlEmbed
155
+ src="tokens-by-turn.html"
156
+ caption="<strong>Figure 5b:</strong> Average output tokens per turn across the game (in the 'no-stakes' counting scenario where all the rounds will last up to 30 turns). Each line shows how a model's reasoning effort evolves as the round progresses. Click legend items to show/hide models. Note: sample sizes decrease for later turns as games that end early don't contribute data."
157
+ id="fig-tokens-by-turn"
158
+ wide
159
+ />
160
+
161
+ The patterns reveal striking differences in how models allocate reasoning effort:
162
+
163
+ - Most models show a gradual increase in reasoning effort (token usage) as the turn number increases.
164
+
165
+ - **Grok 4.1 Fast Reasoning** stands out with dramatically increasing token usage, starting around 1,200 tokens per turn and reaching over 20,000 by turn 30. This suggests the model invests more reasoning effort as problems become harder to solve.
166
+
167
+ - **Gemini 3 Flash Preview Low** maintains remarkably flat token usage throughout, staying around 1,000-1,400 tokens regardless of turn number. This suggests a consistent, lightweight reasoning approach that doesn't scale with problem difficulty.
168
+
169
+ The general upward trend makes sense: later turns only occur in harder games where the rule hasn't been found yet, requiring more extensive reasoning. However, the magnitude of increase varies widely, from Gemini's flat profile to Grok's 15x increase.
170
 
171
  ### Performance by Rule Complexity
172
 
 
198
  />
199
 
200
 
201
+ Interestingly, code complexity (as measured by our combination of cyclomatic complexity and AST node count) doesn't perfectly predict difficulty, as semantic concepts also play a role. A rule like "only face cards" has complexity equivalent to "only A, 2 and 3", but the former is easier for models (and humans) due to familiarity with the semantic category of face cards.
202
 
203
+ Rules involving rare events also prove challenging. "Only aces" is harder than "only even ranks" despite being simpler, because models need more evidence to confirm it.
204
+
205
+ This raises an interesting question: are symmetric rules equally difficult? Logically, "only spades" and "no spades" should be equivalent in difficulty, but models might have biases. Indeed, the average score on "only spades" is 25, while "no spades" scores only 20.
206
 
 
 
207
 
208
  ### Complexity of rules produced
209
 
210
+ One common failure mode we observed is that models tend to produce overly complicated tentative rules, even though they were informed that rules are typically simple one-sentence statements. They also produce rules that fit all observed data so far, but fail to generalize to new cards because they are more complex than necessary.
211
+
212
+ As an illustration, here is an example of a tentative rule produced by Claude Haiku 4.5. The mainline state was as follows (rejected cards are in parentheses):
213
+
214
+ <p style={{fontStyle: 'italic', padding: '0.5em 1em', borderLeft: '3px solid var(--border-color)'}}>
215
+ 6♠ <span style={{color: '#e53935'}}>6♦</span> 9♠ (<span style={{color: '#e53935'}}>Q♥</span>) <span style={{color: '#e53935'}}>9♦</span> (9♣) 7♠ (<span style={{color: '#e53935'}}>5♦</span>) (<span style={{color: '#e53935'}}>J♦</span>) (<span style={{color: '#e53935'}}>A♦</span>) (<span style={{color: '#e53935'}}>Q♦</span>) (<span style={{color: '#e53935'}}>2♦</span>) (<span style={{color: '#e53935'}}>4♦</span>) (<span style={{color: '#e53935'}}>9♦</span>) (8♠) (A♠) (<span style={{color: '#e53935'}}>10♥</span>) (<span style={{color: '#e53935'}}>J♦</span>) (<span style={{color: '#e53935'}}>9♥</span>) <span style={{color: '#e53935'}}>7♦</span> 9♠ (<span style={{color: '#e53935'}}>A♥</span>) (<span style={{color: '#e53935'}}>8♥</span>)
216
+ </p>
217
+
218
+ The actual rule was *"Rank repeats in pairs"*. The tentative rule proposed by Haiku 4.5 at this stage of the game was:
219
+
220
+ > "Odd-positioned mainline cards must be spades, even-positioned mainline cards must be diamonds. Consecutive pairs of positions must have matching ranks. Additionally, each rank (6, 7, 9) can appear only twice on the mainline, meaning position 8 must be a diamond with a rank different from 6, 7, and 9, or the pattern breaks at position 8 with new rules."
221
+
222
+ This is overly complicated compared to the actual rule, and as you can read, it contains the actual rule "Consecutive pairs of positions must have matching ranks" but adds unnecessary constraints about suits and counts that do not generalize.
223
+
224
+ To quantify this, we computed the **complexity ratio**: the complexity of the model's tentative rule divided by the actual rule complexity, using the same code-based metric described above.
225
+
226
+ <HtmlEmbed
227
+ src="complexity-ratio.html"
228
+ caption="<strong>Figure 8:</strong> Median complexity ratio of tentative rules vs actual rules. A ratio > 1 indicates the model overcomplicates (hypothesizes more complex rules than necessary); < 1 indicates oversimplification. Whiskers show interquartile range. Only tentative rules with confidence ≥ 5 are included."
229
+ id="fig-complexity-ratio"
230
+ wide
231
+ />
232
+
233
+ The results reveal a clear tendency toward overcomplication among several models:
234
+
235
+ - **GPT OSS 120B and GPT OSS 20B** stand out with median ratios of 1.32 and 1.15 respectively, consistently hypothesizing more complex rules than necessary.
236
+ - **Claude Haiku 4.5** also tends to overcomplicate slightly (1.05) on average, but with high variance and many tentative rules being much more complex than needed.
237
+ - **Claude Opus 4.5, GPT 5.2 and Kimi K2** are the best calibrated, with median ratios close to 1.0 and moderate variance, suggesting they match rule complexity most accurately.
238
+ - Most models cluster around 1.0, indicating reasonable complexity calibration on average, but the wide interquartile ranges show substantial variation across individual games.
239
 
 
240
 
241
+ ### Summary
242
 
243
+ Our evaluation reveals substantial variation in how models approach the Eleusis task. Claude Opus 4.5 leads in overall performance, followed closely by the open-weight Kimi K2. All models exhibit overconfidence—reporting higher certainty than their accuracy warrants—but they partially compensate by being more cautious than decision theory would recommend. The caution-recklessness trade-off varies dramatically: GPT 5.2 High is extremely cautious (high success rate but slow to commit), while Claude Haiku 4.5 and DeepSeek R1 are reckless (many failed guesses). Rule complexity matters, but semantic familiarity and evidence availability also influence difficulty. Finally, models tend to overcomplicate their hypotheses—particularly the open-weight GPT OSS models—while Claude Opus 4.5, GPT 5.2 High, and Kimi K2 best match actual rule complexity.
 
app/src/content/embeds/banner-bar-chart.html ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="d3-banner-bar"></div>
2
+ <style>
3
+ .d3-banner-bar {
4
+ width: 100%;
5
+ margin: 10px 0;
6
+ position: relative;
7
+ font-family: system-ui, -apple-system, sans-serif;
8
+ }
9
+
10
+ .d3-banner-bar svg {
11
+ display: block;
12
+ width: 100%;
13
+ height: auto;
14
+ }
15
+
16
+ .d3-banner-bar .axes path,
17
+ .d3-banner-bar .axes line {
18
+ stroke: var(--axis-color, var(--text-color));
19
+ }
20
+
21
+ .d3-banner-bar .axes text {
22
+ fill: var(--tick-color, var(--muted-color));
23
+ font-size: 12px;
24
+ }
25
+
26
+ .d3-banner-bar .grid line {
27
+ stroke: var(--grid-color, rgba(0,0,0,.08));
28
+ }
29
+
30
+ .d3-banner-bar .axes text.axis-label {
31
+ font-size: 14px;
32
+ font-weight: 500;
33
+ fill: var(--text-color);
34
+ }
35
+
36
+ .d3-banner-bar .model-label {
37
+ font-size: 13px;
38
+ font-weight: 500;
39
+ }
40
+
41
+ .d3-banner-bar .bar {
42
+ cursor: pointer;
43
+ transition: opacity 0.15s ease;
44
+ }
45
+
46
+ .d3-banner-bar .bar:hover {
47
+ opacity: 0.8;
48
+ }
49
+
50
+ .d3-banner-bar .score-label {
51
+ font-size: 12px;
52
+ font-weight: 500;
53
+ fill: var(--text-color);
54
+ }
55
+
56
+ .d3-banner-bar .d3-tooltip {
57
+ position: absolute;
58
+ top: 0;
59
+ left: 0;
60
+ transform: translate(-9999px, -9999px);
61
+ pointer-events: none;
62
+ padding: 10px 12px;
63
+ border-radius: 8px;
64
+ font-size: 12px;
65
+ line-height: 1.4;
66
+ border: 1px solid var(--border-color);
67
+ background: var(--surface-bg);
68
+ color: var(--text-color);
69
+ box-shadow: 0 4px 24px rgba(0,0,0,.18);
70
+ opacity: 0;
71
+ transition: opacity 0.12s ease;
72
+ z-index: 10;
73
+ }
74
+
75
+ .d3-banner-bar .d3-tooltip .model-name {
76
+ font-weight: 600;
77
+ margin-bottom: 4px;
78
+ }
79
+
80
+ .d3-banner-bar .d3-tooltip .metric {
81
+ display: flex;
82
+ justify-content: space-between;
83
+ gap: 16px;
84
+ }
85
+
86
+ .d3-banner-bar .d3-tooltip .metric-label {
87
+ color: var(--muted-color);
88
+ }
89
+
90
+ .d3-banner-bar .d3-tooltip .metric-value {
91
+ font-weight: 500;
92
+ }
93
+ </style>
94
+ <script>
95
+ (() => {
96
+ const ensureD3 = (cb) => {
97
+ if (window.d3 && typeof window.d3.select === 'function') return cb();
98
+ let s = document.getElementById('d3-cdn-script');
99
+ if (!s) {
100
+ s = document.createElement('script');
101
+ s.id = 'd3-cdn-script';
102
+ s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
103
+ document.head.appendChild(s);
104
+ }
105
+ const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); };
106
+ s.addEventListener('load', onReady, { once: true });
107
+ if (window.d3) onReady();
108
+ };
109
+
110
+ const bootstrap = () => {
111
+ const scriptEl = document.currentScript;
112
+ let container = scriptEl ? scriptEl.previousElementSibling : null;
113
+ if (!(container && container.classList && container.classList.contains('d3-banner-bar'))) {
114
+ const candidates = Array.from(document.querySelectorAll('.d3-banner-bar'))
115
+ .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
116
+ container = candidates[candidates.length - 1] || null;
117
+ }
118
+ if (!container) return;
119
+ if (container.dataset) {
120
+ if (container.dataset.mounted === 'true') return;
121
+ container.dataset.mounted = 'true';
122
+ }
123
+
124
+ // Tooltip setup
125
+ container.style.position = container.style.position || 'relative';
126
+ const tip = document.createElement('div');
127
+ tip.className = 'd3-tooltip';
128
+ container.appendChild(tip);
129
+
130
+ // SVG setup
131
+ const svg = d3.select(container).append('svg');
132
+ const gRoot = svg.append('g');
133
+
134
+ // Chart groups
135
+ const gGrid = gRoot.append('g').attr('class', 'grid');
136
+ const gAxes = gRoot.append('g').attr('class', 'axes');
137
+ const gBars = gRoot.append('g').attr('class', 'bars');
138
+ const gLabels = gRoot.append('g').attr('class', 'labels');
139
+
140
+ // State
141
+ let data = null;
142
+ let width = 800;
143
+ let height = 450;
144
+ const margin = { top: 20, right: 60, bottom: 40, left: 20 };
145
+
146
+ // Scales
147
+ const xScale = d3.scaleLinear();
148
+ const yScale = d3.scaleBand();
149
+
150
+ // Data loading
151
+ const JSON_PATHS = [
152
+ '/data/overall_performance.json',
153
+ './assets/figures/overall_performance.json',
154
+ '../assets/figures/overall_performance.json',
155
+ '../../assets/figures/overall_performance.json'
156
+ ];
157
+
158
+ const fetchFirstAvailable = async (paths) => {
159
+ for (const p of paths) {
160
+ try {
161
+ const r = await fetch(p, { cache: 'no-cache' });
162
+ if (r.ok) return await r.json();
163
+ } catch (_) {}
164
+ }
165
+ throw new Error('Data not found');
166
+ };
167
+
168
+ function updateSize() {
169
+ width = container.clientWidth || 800;
170
+ // Height based on number of bars (will be set after data loads)
171
+ const numModels = data ? data.models.length : 10;
172
+ const barHeight = 36;
173
+ height = margin.top + margin.bottom + numModels * barHeight;
174
+ svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
175
+ gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
176
+ return {
177
+ innerWidth: width - margin.left - margin.right,
178
+ innerHeight: height - margin.top - margin.bottom
179
+ };
180
+ }
181
+
182
+ function showTooltip(event, d) {
183
+ const rect = container.getBoundingClientRect();
184
+ const x = event.clientX - rect.left;
185
+ const y = event.clientY - rect.top;
186
+
187
+ tip.innerHTML = `
188
+ <div class="model-name" style="color: ${d.color}">${d.name}</div>
189
+ <div class="metric">
190
+ <span class="metric-label">Score:</span>
191
+ <span class="metric-value">${d.avg_floored_score.toFixed(2)}</span>
192
+ </div>
193
+ <div class="metric">
194
+ <span class="metric-label">Tokens/Turn:</span>
195
+ <span class="metric-value">${Math.round(d.avg_output_tokens_per_turn).toLocaleString()}</span>
196
+ </div>
197
+ <div class="metric">
198
+ <span class="metric-label">Type:</span>
199
+ <span class="metric-value">${d.is_open ? 'Open' : 'Closed'}</span>
200
+ </div>
201
+ `;
202
+
203
+ const tipWidth = tip.offsetWidth || 150;
204
+ const tipHeight = tip.offsetHeight || 80;
205
+ let tipX = x + 12;
206
+ let tipY = y - tipHeight / 2;
207
+
208
+ if (tipX + tipWidth > width) tipX = x - tipWidth - 12;
209
+ if (tipY < 0) tipY = 8;
210
+ if (tipY + tipHeight > height) tipY = height - tipHeight - 8;
211
+
212
+ tip.style.transform = `translate(${tipX}px, ${tipY}px)`;
213
+ tip.style.opacity = '1';
214
+ }
215
+
216
+ function hideTooltip() {
217
+ tip.style.opacity = '0';
218
+ tip.style.transform = 'translate(-9999px, -9999px)';
219
+ }
220
+
221
+ // Calculate relative luminance and return black or white for best contrast
222
+ function getContrastColor(hexColor) {
223
+ const hex = hexColor.replace('#', '');
224
+ const r = parseInt(hex.substr(0, 2), 16) / 255;
225
+ const g = parseInt(hex.substr(2, 2), 16) / 255;
226
+ const b = parseInt(hex.substr(4, 2), 16) / 255;
227
+ // Relative luminance formula
228
+ const luminance = 0.299 * r + 0.587 * g + 0.114 * b;
229
+ return luminance > 0.5 ? '#000000' : '#ffffff';
230
+ }
231
+
232
+ function render() {
233
+ if (!data) return;
234
+
235
+ const { innerWidth, innerHeight } = updateSize();
236
+
237
+ // Sort models by score descending
238
+ const models = [...data.models].sort((a, b) => b.avg_floored_score - a.avg_floored_score);
239
+
240
+ // Update scales
241
+ xScale
242
+ .domain([0, d3.max(models, d => d.avg_floored_score) * 1.05])
243
+ .range([0, innerWidth])
244
+ .nice();
245
+
246
+ yScale
247
+ .domain(models.map(d => d.name))
248
+ .range([0, innerHeight])
249
+ .padding(0.25);
250
+
251
+ // Grid lines (vertical)
252
+ const xTicks = xScale.ticks(6);
253
+ gGrid.selectAll('.grid-x')
254
+ .data(xTicks)
255
+ .join('line')
256
+ .attr('class', 'grid-x')
257
+ .attr('x1', d => xScale(d))
258
+ .attr('x2', d => xScale(d))
259
+ .attr('y1', 0)
260
+ .attr('y2', innerHeight);
261
+
262
+ // X-axis (bottom)
263
+ gAxes.selectAll('.x-axis')
264
+ .data([0])
265
+ .join('g')
266
+ .attr('class', 'x-axis')
267
+ .attr('transform', `translate(0,${innerHeight})`)
268
+ .call(d3.axisBottom(xScale).ticks(6).tickSizeOuter(0));
269
+
270
+ // X-axis label
271
+ gAxes.selectAll('.x-label')
272
+ .data([0])
273
+ .join('text')
274
+ .attr('class', 'x-label axis-label')
275
+ .attr('x', innerWidth / 2)
276
+ .attr('y', innerHeight + 34)
277
+ .attr('text-anchor', 'middle')
278
+ .text('Average Score');
279
+
280
+ // Bars
281
+ const barHeight = yScale.bandwidth();
282
+ gBars.selectAll('.bar')
283
+ .data(models, d => d.name)
284
+ .join('rect')
285
+ .attr('class', 'bar')
286
+ .attr('x', 0)
287
+ .attr('y', d => yScale(d.name))
288
+ .attr('width', d => xScale(d.avg_floored_score))
289
+ .attr('height', barHeight)
290
+ .attr('fill', d => d.color)
291
+ .attr('rx', 3)
292
+ .attr('ry', 3)
293
+ .on('mouseenter', showTooltip)
294
+ .on('mousemove', showTooltip)
295
+ .on('mouseleave', hideTooltip);
296
+
297
+ // Model labels (inside bars)
298
+ gLabels.selectAll('.model-label')
299
+ .data(models, d => d.name)
300
+ .join('text')
301
+ .attr('class', 'model-label')
302
+ .attr('x', 8)
303
+ .attr('y', d => yScale(d.name) + barHeight / 2)
304
+ .attr('dy', '0.35em')
305
+ .attr('text-anchor', 'start')
306
+ .style('fill', d => getContrastColor(d.color))
307
+ .text(d => d.name);
308
+
309
+ // Score labels (end of bars)
310
+ gLabels.selectAll('.score-label')
311
+ .data(models, d => d.name)
312
+ .join('text')
313
+ .attr('class', 'score-label')
314
+ .attr('x', d => xScale(d.avg_floored_score) + 6)
315
+ .attr('y', d => yScale(d.name) + barHeight / 2)
316
+ .attr('dy', '0.35em')
317
+ .attr('text-anchor', 'start')
318
+ .text(d => d.avg_floored_score.toFixed(1));
319
+ }
320
+
321
+ // Initialize
322
+ fetchFirstAvailable(JSON_PATHS)
323
+ .then(json => {
324
+ data = json;
325
+ render();
326
+ })
327
+ .catch(err => {
328
+ const pre = document.createElement('pre');
329
+ pre.style.color = 'red';
330
+ pre.style.padding = '16px';
331
+ pre.textContent = `Error loading data: ${err.message}`;
332
+ container.appendChild(pre);
333
+ });
334
+
335
+ // Resize handling
336
+ if (window.ResizeObserver) {
337
+ new ResizeObserver(() => render()).observe(container);
338
+ } else {
339
+ window.addEventListener('resize', render);
340
+ }
341
+
342
+ // Theme change handling
343
+ const observer = new MutationObserver(() => render());
344
+ observer.observe(document.documentElement, {
345
+ attributes: true,
346
+ attributeFilter: ['data-theme']
347
+ });
348
+ };
349
+
350
+ if (document.readyState === 'loading') {
351
+ document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
352
+ } else {
353
+ ensureD3(bootstrap);
354
+ }
355
+ })();
356
+ </script>
app/src/content/embeds/banner.html CHANGED
@@ -1,59 +1,64 @@
1
- <div class="d3-banner-bar"></div>
2
  <style>
3
- .d3-banner-bar {
4
  width: 100%;
5
  margin: 10px 0;
6
  position: relative;
7
  font-family: system-ui, -apple-system, sans-serif;
8
  }
9
 
10
- .d3-banner-bar svg {
11
  display: block;
12
  width: 100%;
13
  height: auto;
14
  }
15
 
16
- .d3-banner-bar .axes path,
17
- .d3-banner-bar .axes line {
18
  stroke: var(--axis-color, var(--text-color));
19
  }
20
 
21
- .d3-banner-bar .axes text {
22
  fill: var(--tick-color, var(--muted-color));
23
- font-size: 12px;
24
  }
25
 
26
- .d3-banner-bar .grid line {
27
- stroke: var(--grid-color, rgba(0,0,0,.08));
28
  }
29
 
30
- .d3-banner-bar .axes text.axis-label {
31
- font-size: 14px;
32
  font-weight: 500;
33
  fill: var(--text-color);
34
  }
35
 
36
- .d3-banner-bar .model-label {
37
- font-size: 13px;
38
- font-weight: 500;
39
  }
40
 
41
- .d3-banner-bar .bar {
42
  cursor: pointer;
43
  transition: opacity 0.15s ease;
44
  }
45
 
46
- .d3-banner-bar .bar:hover {
47
  opacity: 0.8;
48
  }
49
 
50
- .d3-banner-bar .score-label {
51
- font-size: 12px;
52
- font-weight: 500;
53
  fill: var(--text-color);
 
 
 
 
 
 
 
54
  }
55
 
56
- .d3-banner-bar .d3-tooltip {
57
  position: absolute;
58
  top: 0;
59
  left: 0;
@@ -72,22 +77,22 @@
72
  z-index: 10;
73
  }
74
 
75
- .d3-banner-bar .d3-tooltip .model-name {
76
  font-weight: 600;
77
  margin-bottom: 4px;
78
  }
79
 
80
- .d3-banner-bar .d3-tooltip .metric {
81
  display: flex;
82
  justify-content: space-between;
83
  gap: 16px;
84
  }
85
 
86
- .d3-banner-bar .d3-tooltip .metric-label {
87
  color: var(--muted-color);
88
  }
89
 
90
- .d3-banner-bar .d3-tooltip .metric-value {
91
  font-weight: 500;
92
  }
93
  </style>
@@ -110,8 +115,8 @@
110
  const bootstrap = () => {
111
  const scriptEl = document.currentScript;
112
  let container = scriptEl ? scriptEl.previousElementSibling : null;
113
- if (!(container && container.classList && container.classList.contains('d3-banner-bar'))) {
114
- const candidates = Array.from(document.querySelectorAll('.d3-banner-bar'))
115
  .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
116
  container = candidates[candidates.length - 1] || null;
117
  }
@@ -129,48 +134,62 @@
129
 
130
  // SVG setup
131
  const svg = d3.select(container).append('svg');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  const gRoot = svg.append('g');
133
 
134
- // Chart groups
 
135
  const gGrid = gRoot.append('g').attr('class', 'grid');
136
  const gAxes = gRoot.append('g').attr('class', 'axes');
137
- const gBars = gRoot.append('g').attr('class', 'bars');
 
138
  const gLabels = gRoot.append('g').attr('class', 'labels');
139
 
140
  // State
141
  let data = null;
142
  let width = 800;
143
  let height = 450;
144
- const margin = { top: 20, right: 60, bottom: 40, left: 20 };
145
 
146
  // Scales
147
  const xScale = d3.scaleLinear();
148
- const yScale = d3.scaleBand();
149
 
150
  // Data loading
151
- const JSON_PATHS = [
152
- '/data/overall_performance.json',
153
- './assets/figures/overall_performance.json',
154
- '../assets/figures/overall_performance.json',
155
- '../../assets/figures/overall_performance.json'
156
- ];
157
-
158
- const fetchFirstAvailable = async (paths) => {
159
- for (const p of paths) {
160
- try {
161
- const r = await fetch(p, { cache: 'no-cache' });
162
- if (r.ok) return await r.json();
163
- } catch (_) {}
164
  }
165
- throw new Error('Data not found');
166
  };
167
 
168
  function updateSize() {
169
  width = container.clientWidth || 800;
170
- // Height based on number of bars (will be set after data loads)
171
- const numModels = data ? data.models.length : 10;
172
- const barHeight = 36;
173
- height = margin.top + margin.bottom + numModels * barHeight;
174
  svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
175
  gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
176
  return {
@@ -188,11 +207,19 @@
188
  <div class="model-name" style="color: ${d.color}">${d.name}</div>
189
  <div class="metric">
190
  <span class="metric-label">Score:</span>
191
- <span class="metric-value">${d.avg_floored_score.toFixed(2)}</span>
192
  </div>
193
  <div class="metric">
194
- <span class="metric-label">Tokens/Turn:</span>
195
- <span class="metric-value">${Math.round(d.avg_output_tokens_per_turn).toLocaleString()}</span>
 
 
 
 
 
 
 
 
196
  </div>
197
  <div class="metric">
198
  <span class="metric-label">Type:</span>
@@ -200,8 +227,8 @@
200
  </div>
201
  `;
202
 
203
- const tipWidth = tip.offsetWidth || 150;
204
- const tipHeight = tip.offsetHeight || 80;
205
  let tipX = x + 12;
206
  let tipY = y - tipHeight / 2;
207
 
@@ -218,38 +245,40 @@
218
  tip.style.transform = 'translate(-9999px, -9999px)';
219
  }
220
 
221
- // Calculate relative luminance and return black or white for best contrast
222
- function getContrastColor(hexColor) {
223
- const hex = hexColor.replace('#', '');
224
- const r = parseInt(hex.substr(0, 2), 16) / 255;
225
- const g = parseInt(hex.substr(2, 2), 16) / 255;
226
- const b = parseInt(hex.substr(4, 2), 16) / 255;
227
- // Relative luminance formula
228
- const luminance = 0.299 * r + 0.587 * g + 0.114 * b;
229
- return luminance > 0.5 ? '#000000' : '#ffffff';
230
- }
231
-
232
  function render() {
233
  if (!data) return;
234
 
235
  const { innerWidth, innerHeight } = updateSize();
 
236
 
237
- // Sort models by score descending
238
- const models = [...data.models].sort((a, b) => b.avg_floored_score - a.avg_floored_score);
239
-
240
- // Update scales
241
  xScale
242
- .domain([0, d3.max(models, d => d.avg_floored_score) * 1.05])
243
- .range([0, innerWidth])
244
- .nice();
245
 
 
 
 
246
  yScale
247
- .domain(models.map(d => d.name))
248
- .range([0, innerHeight])
249
- .padding(0.25);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
- // Grid lines (vertical)
252
- const xTicks = xScale.ticks(6);
253
  gGrid.selectAll('.grid-x')
254
  .data(xTicks)
255
  .join('line')
@@ -259,67 +288,125 @@
259
  .attr('y1', 0)
260
  .attr('y2', innerHeight);
261
 
262
- // X-axis (bottom)
 
 
 
 
 
 
 
 
 
 
263
  gAxes.selectAll('.x-axis')
264
  .data([0])
265
  .join('g')
266
  .attr('class', 'x-axis')
267
  .attr('transform', `translate(0,${innerHeight})`)
268
- .call(d3.axisBottom(xScale).ticks(6).tickSizeOuter(0));
269
 
270
- // X-axis label
 
 
 
 
 
 
271
  gAxes.selectAll('.x-label')
272
  .data([0])
273
  .join('text')
274
  .attr('class', 'x-label axis-label')
275
  .attr('x', innerWidth / 2)
276
- .attr('y', innerHeight + 34)
277
  .attr('text-anchor', 'middle')
278
- .text('Average Score');
279
 
280
- // Bars
281
- const barHeight = yScale.bandwidth();
282
- gBars.selectAll('.bar')
283
- .data(models, d => d.name)
284
- .join('rect')
285
- .attr('class', 'bar')
286
- .attr('x', 0)
287
- .attr('y', d => yScale(d.name))
288
- .attr('width', d => xScale(d.avg_floored_score))
289
- .attr('height', barHeight)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  .attr('fill', d => d.color)
291
- .attr('rx', 3)
292
- .attr('ry', 3)
293
  .on('mouseenter', showTooltip)
294
  .on('mousemove', showTooltip)
295
  .on('mouseleave', hideTooltip);
296
 
297
- // Model labels (inside bars)
298
- gLabels.selectAll('.model-label')
299
- .data(models, d => d.name)
300
- .join('text')
301
- .attr('class', 'model-label')
302
- .attr('x', 8)
303
- .attr('y', d => yScale(d.name) + barHeight / 2)
304
- .attr('dy', '0.35em')
305
- .attr('text-anchor', 'start')
306
- .style('fill', d => getContrastColor(d.color))
307
- .text(d => d.name);
 
308
 
309
- // Score labels (end of bars)
310
- gLabels.selectAll('.score-label')
311
  .data(models, d => d.name)
312
  .join('text')
313
- .attr('class', 'score-label')
314
- .attr('x', d => xScale(d.avg_floored_score) + 6)
315
- .attr('y', d => yScale(d.name) + barHeight / 2)
316
- .attr('dy', '0.35em')
317
- .attr('text-anchor', 'start')
318
- .text(d => d.avg_floored_score.toFixed(1));
 
 
 
 
 
 
 
 
319
  }
320
 
321
  // Initialize
322
- fetchFirstAvailable(JSON_PATHS)
 
323
  .then(json => {
324
  data = json;
325
  render();
 
1
+ <div class="d3-score-vs-recklessness"></div>
2
  <style>
3
+ .d3-score-vs-recklessness {
4
  width: 100%;
5
  margin: 10px 0;
6
  position: relative;
7
  font-family: system-ui, -apple-system, sans-serif;
8
  }
9
 
10
+ .d3-score-vs-recklessness svg {
11
  display: block;
12
  width: 100%;
13
  height: auto;
14
  }
15
 
16
+ .d3-score-vs-recklessness .axes path,
17
+ .d3-score-vs-recklessness .axes line {
18
  stroke: var(--axis-color, var(--text-color));
19
  }
20
 
21
+ .d3-score-vs-recklessness .axes text {
22
  fill: var(--tick-color, var(--muted-color));
23
+ font-size: 14px;
24
  }
25
 
26
+ .d3-score-vs-recklessness .grid line {
27
+ stroke: var(--grid-color, rgba(0,0,0,.15));
28
  }
29
 
30
+ .d3-score-vs-recklessness .axes text.axis-label {
31
+ font-size: 18px;
32
  font-weight: 500;
33
  fill: var(--text-color);
34
  }
35
 
36
+ .d3-score-vs-recklessness .x-axis text {
37
+ transform: translateY(4px);
 
38
  }
39
 
40
+ .d3-score-vs-recklessness .point {
41
  cursor: pointer;
42
  transition: opacity 0.15s ease;
43
  }
44
 
45
+ .d3-score-vs-recklessness .point:hover {
46
  opacity: 0.8;
47
  }
48
 
49
+ .d3-score-vs-recklessness .point-label {
50
+ font-size: 11px;
 
51
  fill: var(--text-color);
52
+ pointer-events: none;
53
+ }
54
+
55
+ .d3-score-vs-recklessness .annotation {
56
+ font-size: 11px;
57
+ font-style: italic;
58
+ fill: var(--muted-color);
59
  }
60
 
61
+ .d3-score-vs-recklessness .d3-tooltip {
62
  position: absolute;
63
  top: 0;
64
  left: 0;
 
77
  z-index: 10;
78
  }
79
 
80
+ .d3-score-vs-recklessness .d3-tooltip .model-name {
81
  font-weight: 600;
82
  margin-bottom: 4px;
83
  }
84
 
85
+ .d3-score-vs-recklessness .d3-tooltip .metric {
86
  display: flex;
87
  justify-content: space-between;
88
  gap: 16px;
89
  }
90
 
91
+ .d3-score-vs-recklessness .d3-tooltip .metric-label {
92
  color: var(--muted-color);
93
  }
94
 
95
+ .d3-score-vs-recklessness .d3-tooltip .metric-value {
96
  font-weight: 500;
97
  }
98
  </style>
 
115
  const bootstrap = () => {
116
  const scriptEl = document.currentScript;
117
  let container = scriptEl ? scriptEl.previousElementSibling : null;
118
+ if (!(container && container.classList && container.classList.contains('d3-score-vs-recklessness'))) {
119
+ const candidates = Array.from(document.querySelectorAll('.d3-score-vs-recklessness'))
120
  .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
121
  container = candidates[candidates.length - 1] || null;
122
  }
 
134
 
135
  // SVG setup
136
  const svg = d3.select(container).append('svg');
137
+
138
+ // Add gradient definition
139
+ const defs = svg.append('defs');
140
+ const gradient = defs.append('linearGradient')
141
+ .attr('id', 'recklessness-gradient')
142
+ .attr('x1', '0%')
143
+ .attr('x2', '100%')
144
+ .attr('y1', '0%')
145
+ .attr('y2', '0%');
146
+
147
+ // Gradient stops: red -> orange -> yellow -> green -> yellow -> orange -> red
148
+ gradient.append('stop').attr('offset', '0%').attr('stop-color', 'rgba(239, 83, 80, 0.25)'); // red
149
+ gradient.append('stop').attr('offset', '20%').attr('stop-color', 'rgba(255, 152, 0, 0.25)'); // orange
150
+ gradient.append('stop').attr('offset', '35%').attr('stop-color', 'rgba(255, 235, 59, 0.25)'); // yellow
151
+ gradient.append('stop').attr('offset', '50%').attr('stop-color', 'rgba(102, 187, 106, 0.35)'); // green
152
+ gradient.append('stop').attr('offset', '65%').attr('stop-color', 'rgba(255, 235, 59, 0.25)'); // yellow
153
+ gradient.append('stop').attr('offset', '80%').attr('stop-color', 'rgba(255, 152, 0, 0.25)'); // orange
154
+ gradient.append('stop').attr('offset', '100%').attr('stop-color', 'rgba(239, 83, 80, 0.25)'); // red
155
+
156
  const gRoot = svg.append('g');
157
 
158
+ // Chart groups (order matters for layering)
159
+ const gBackground = gRoot.append('g').attr('class', 'background');
160
  const gGrid = gRoot.append('g').attr('class', 'grid');
161
  const gAxes = gRoot.append('g').attr('class', 'axes');
162
+ const gAnnotations = gRoot.append('g').attr('class', 'annotations');
163
+ const gPoints = gRoot.append('g').attr('class', 'points');
164
  const gLabels = gRoot.append('g').attr('class', 'labels');
165
 
166
  // State
167
  let data = null;
168
  let width = 800;
169
  let height = 450;
170
+ const margin = { top: 20, right: 120, bottom: 56, left: 72 };
171
 
172
  // Scales
173
  const xScale = d3.scaleLinear();
174
+ const yScale = d3.scaleLinear();
175
 
176
  // Data loading
177
+ const DATA_URL = '/data/score_vs_recklessness.json';
178
+
179
+ // Helper function to create a 5-point star path
180
+ const starPath = (cx, cy, outerR, innerR) => {
181
+ const points = [];
182
+ for (let i = 0; i < 10; i++) {
183
+ const r = i % 2 === 0 ? outerR : innerR;
184
+ const angle = (Math.PI / 2) + (i * Math.PI / 5);
185
+ points.push([cx + r * Math.cos(angle), cy - r * Math.sin(angle)]);
 
 
 
 
186
  }
187
+ return 'M' + points.map(p => p.join(',')).join('L') + 'Z';
188
  };
189
 
190
  function updateSize() {
191
  width = container.clientWidth || 800;
192
+ height = Math.max(300, Math.round(width / 1.5));
 
 
 
193
  svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
194
  gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
195
  return {
 
207
  <div class="model-name" style="color: ${d.color}">${d.name}</div>
208
  <div class="metric">
209
  <span class="metric-label">Score:</span>
210
+ <span class="metric-value">${d.avg_floored_score.toFixed(1)}</span>
211
  </div>
212
  <div class="metric">
213
+ <span class="metric-label">Recklessness Index:</span>
214
+ <span class="metric-value">${d.recklessness_index.toFixed(2)}</span>
215
+ </div>
216
+ <div class="metric">
217
+ <span class="metric-label">Failed Guesses:</span>
218
+ <span class="metric-value">${d.avg_failed_guesses.toFixed(2)}</span>
219
+ </div>
220
+ <div class="metric">
221
+ <span class="metric-label">Caution:</span>
222
+ <span class="metric-value">${d.avg_caution.toFixed(2)}</span>
223
  </div>
224
  <div class="metric">
225
  <span class="metric-label">Type:</span>
 
227
  </div>
228
  `;
229
 
230
+ const tipWidth = tip.offsetWidth || 180;
231
+ const tipHeight = tip.offsetHeight || 120;
232
  let tipX = x + 12;
233
  let tipY = y - tipHeight / 2;
234
 
 
245
  tip.style.transform = 'translate(-9999px, -9999px)';
246
  }
247
 
 
 
 
 
 
 
 
 
 
 
 
248
  function render() {
249
  if (!data) return;
250
 
251
  const { innerWidth, innerHeight } = updateSize();
252
+ const models = data.models;
253
 
254
+ // Fixed symmetric X scale from -8 to 8
 
 
 
255
  xScale
256
+ .domain([-8, 8])
257
+ .range([0, innerWidth]);
 
258
 
259
+ // Y scale based on data
260
+ const yExtent = d3.extent(models, d => d.avg_floored_score);
261
+ const yPadding = (yExtent[1] - yExtent[0]) * 0.1;
262
  yScale
263
+ .domain([yExtent[0], yExtent[1] + yPadding])
264
+ .range([innerHeight, 0])
265
+ .nice();
266
+
267
+ // Background gradient rectangle
268
+ gBackground.selectAll('.bg-gradient')
269
+ .data([0])
270
+ .join('rect')
271
+ .attr('class', 'bg-gradient')
272
+ .attr('x', 0)
273
+ .attr('y', 0)
274
+ .attr('width', innerWidth)
275
+ .attr('height', innerHeight)
276
+ .attr('fill', 'url(#recklessness-gradient)');
277
+
278
+ // Grid lines
279
+ const xTicks = xScale.ticks(8);
280
+ const yTicks = yScale.ticks(6);
281
 
 
 
282
  gGrid.selectAll('.grid-x')
283
  .data(xTicks)
284
  .join('line')
 
288
  .attr('y1', 0)
289
  .attr('y2', innerHeight);
290
 
291
+ gGrid.selectAll('.grid-y')
292
+ .data(yTicks)
293
+ .join('line')
294
+ .attr('class', 'grid-y')
295
+ .attr('x1', 0)
296
+ .attr('x2', innerWidth)
297
+ .attr('y1', d => yScale(d))
298
+ .attr('y2', d => yScale(d));
299
+
300
+ // Axes with inner ticks
301
+ const tickSize = 6;
302
  gAxes.selectAll('.x-axis')
303
  .data([0])
304
  .join('g')
305
  .attr('class', 'x-axis')
306
  .attr('transform', `translate(0,${innerHeight})`)
307
+ .call(d3.axisBottom(xScale).ticks(8).tickSizeInner(-tickSize).tickSizeOuter(0));
308
 
309
+ gAxes.selectAll('.y-axis')
310
+ .data([0])
311
+ .join('g')
312
+ .attr('class', 'y-axis')
313
+ .call(d3.axisLeft(yScale).ticks(6).tickSizeInner(-tickSize).tickSizeOuter(0));
314
+
315
+ // Axis labels
316
  gAxes.selectAll('.x-label')
317
  .data([0])
318
  .join('text')
319
  .attr('class', 'x-label axis-label')
320
  .attr('x', innerWidth / 2)
321
+ .attr('y', innerHeight + 44)
322
  .attr('text-anchor', 'middle')
323
+ .text('Boldness Index');
324
 
325
+ gAxes.selectAll('.y-label')
326
+ .data([0])
327
+ .join('text')
328
+ .attr('class', 'y-label axis-label')
329
+ .attr('x', -innerHeight / 2)
330
+ .attr('y', -52)
331
+ .attr('text-anchor', 'middle')
332
+ .attr('transform', 'rotate(-90)')
333
+ .text('Score');
334
+
335
+ // Top annotations: Overcautious / Cautious / Measured / Bold / Reckless
336
+ const annotations = [
337
+ { label: 'Overcautious', color: 'rgba(239, 83, 80, 0.9)', pos: 0.07}, // red
338
+ { label: 'Cautious', color: 'rgba(255, 180, 0, 0.9)', pos: 0.25 }, // yellow/orange
339
+ { label: 'Measured', color: 'rgba(76, 175, 80, 0.9)', pos: 0.5 }, // green
340
+ { label: 'Bold', color: 'rgba(255, 180, 0, 0.9)', pos: 0.75 }, // yellow/orange
341
+ { label: 'Reckless', color: 'rgba(239, 83, 80, 0.9)', pos: 0.95 } // red
342
+ ];
343
+
344
+ gAnnotations.selectAll('.annotation-label')
345
+ .data(annotations, d => d.label)
346
+ .join('text')
347
+ .attr('class', 'annotation annotation-label')
348
+ .attr('x', d => d.pos * innerWidth)
349
+ .attr('y', 16)
350
+ .attr('text-anchor', d => d.pos === 0 ? 'start' : d.pos === 1 ? 'end' : 'middle')
351
+ .style('fill', d => d.color)
352
+ .style('font-weight', 'bold')
353
+ .style('font-size', '13px')
354
+ .text(d => d.label);
355
+
356
+ // Points
357
+ const pointRadius = Math.max(8, Math.min(14, innerWidth / 60));
358
+
359
+ // Closed models as filled circles
360
+ const closedModels = models.filter(d => !d.is_open);
361
+ gPoints.selectAll('.point-closed')
362
+ .data(closedModels, d => d.name)
363
+ .join('circle')
364
+ .attr('class', 'point point-closed')
365
+ .attr('cx', d => xScale(d.recklessness_index))
366
+ .attr('cy', d => yScale(d.avg_floored_score))
367
+ .attr('r', pointRadius)
368
  .attr('fill', d => d.color)
369
+ .attr('stroke', 'none')
 
370
  .on('mouseenter', showTooltip)
371
  .on('mousemove', showTooltip)
372
  .on('mouseleave', hideTooltip);
373
 
374
+ // Open models as stars
375
+ const openModels = models.filter(d => d.is_open);
376
+ gPoints.selectAll('.point-star')
377
+ .data(openModels, d => d.name)
378
+ .join('path')
379
+ .attr('class', 'point point-star')
380
+ .attr('d', d => starPath(xScale(d.recklessness_index), yScale(d.avg_floored_score), pointRadius * 1.2, pointRadius * 0.5))
381
+ .attr('fill', d => d.color)
382
+ .attr('stroke', 'none')
383
+ .on('mouseenter', showTooltip)
384
+ .on('mousemove', showTooltip)
385
+ .on('mouseleave', hideTooltip);
386
 
387
+ // Point labels with smart positioning
388
+ gLabels.selectAll('.point-label')
389
  .data(models, d => d.name)
390
  .join('text')
391
+ .attr('class', 'point-label')
392
+ .attr('x', d => {
393
+ const xPos = xScale(d.recklessness_index);
394
+ if (xPos > innerWidth - 100) {
395
+ return xPos - pointRadius - 6;
396
+ }
397
+ return xPos + pointRadius + 6;
398
+ })
399
+ .attr('y', d => yScale(d.avg_floored_score) + 4)
400
+ .attr('text-anchor', d => {
401
+ const xPos = xScale(d.recklessness_index);
402
+ return xPos > innerWidth - 100 ? 'end' : 'start';
403
+ })
404
+ .text(d => d.name);
405
  }
406
 
407
  // Initialize
408
+ fetch(DATA_URL, { cache: 'no-cache' })
409
+ .then(r => r.json())
410
  .then(json => {
411
  data = json;
412
  render();
app/src/content/embeds/complexity-ratio.html ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="d3-complexity-ratio"></div>
2
+ <style>
3
+ .d3-complexity-ratio {
4
+ width: 100%;
5
+ margin: 10px 0;
6
+ position: relative;
7
+ font-family: system-ui, -apple-system, sans-serif;
8
+ }
9
+
10
+ .d3-complexity-ratio svg {
11
+ display: block;
12
+ width: 100%;
13
+ height: auto;
14
+ }
15
+
16
+ .d3-complexity-ratio .axes path,
17
+ .d3-complexity-ratio .axes line {
18
+ stroke: var(--axis-color, var(--text-color));
19
+ }
20
+
21
+ .d3-complexity-ratio .axes text {
22
+ fill: var(--tick-color, var(--muted-color));
23
+ font-size: 11px;
24
+ }
25
+
26
+ .d3-complexity-ratio .grid line {
27
+ stroke: var(--grid-color, rgba(0,0,0,.08));
28
+ }
29
+
30
+ .d3-complexity-ratio .axes text.axis-label {
31
+ font-size: 14px;
32
+ font-weight: 500;
33
+ fill: var(--text-color);
34
+ }
35
+
36
+ .d3-complexity-ratio .reference-line {
37
+ stroke: var(--muted-color);
38
+ stroke-dasharray: 5, 5;
39
+ stroke-width: 1.5;
40
+ }
41
+
42
+ .d3-complexity-ratio .whisker-line {
43
+ stroke-width: 1.5;
44
+ }
45
+
46
+ .d3-complexity-ratio .whisker-cap {
47
+ stroke-width: 1.5;
48
+ }
49
+
50
+ .d3-complexity-ratio .model-point {
51
+ stroke-width: 2;
52
+ cursor: pointer;
53
+ }
54
+
55
+ .d3-complexity-ratio .model-point:hover {
56
+ stroke-width: 3;
57
+ }
58
+
59
+ .d3-complexity-ratio .ratio-label {
60
+ font-size: 11px;
61
+ fill: var(--muted-color);
62
+ }
63
+
64
+ .d3-complexity-ratio .legend-item {
65
+ cursor: default;
66
+ }
67
+
68
+ .d3-complexity-ratio .legend-text {
69
+ font-size: 11px;
70
+ fill: var(--text-color);
71
+ }
72
+
73
+ .d3-complexity-ratio .subtitle {
74
+ font-size: 11px;
75
+ fill: var(--muted-color);
76
+ }
77
+
78
+ .d3-complexity-ratio .d3-tooltip {
79
+ position: absolute;
80
+ top: 0;
81
+ left: 0;
82
+ transform: translate(-9999px, -9999px);
83
+ pointer-events: none;
84
+ padding: 10px 12px;
85
+ border-radius: 8px;
86
+ font-size: 12px;
87
+ line-height: 1.4;
88
+ border: 1px solid var(--border-color);
89
+ background: var(--surface-bg);
90
+ color: var(--text-color);
91
+ box-shadow: 0 4px 24px rgba(0,0,0,.18);
92
+ opacity: 0;
93
+ transition: opacity 0.12s ease;
94
+ z-index: 10;
95
+ }
96
+
97
+ .d3-complexity-ratio .d3-tooltip .model-name {
98
+ font-weight: 600;
99
+ margin-bottom: 4px;
100
+ }
101
+
102
+ .d3-complexity-ratio .d3-tooltip .metric {
103
+ display: flex;
104
+ justify-content: space-between;
105
+ gap: 16px;
106
+ }
107
+
108
+ .d3-complexity-ratio .d3-tooltip .metric-label {
109
+ color: var(--muted-color);
110
+ }
111
+
112
+ .d3-complexity-ratio .d3-tooltip .metric-value {
113
+ font-weight: 500;
114
+ }
115
+ </style>
116
+ <script>
117
+ (() => {
118
// Run `cb` as soon as D3 is usable, lazily injecting the shared CDN
// <script> tag (id "d3-cdn-script") the first time any embed needs it.
const ensureD3 = (cb) => {
  const d3Ready = () => window.d3 && typeof window.d3.select === 'function';
  if (d3Ready()) return cb();
  let loader = document.getElementById('d3-cdn-script');
  if (!loader) {
    loader = document.createElement('script');
    loader.id = 'd3-cdn-script';
    loader.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
    document.head.appendChild(loader);
  }
  const onReady = () => {
    if (d3Ready()) cb();
  };
  loader.addEventListener('load', onReady, { once: true });
  // The shared tag may already have finished loading; check immediately.
  if (window.d3) onReady();
};
131
+
132
+ const bootstrap = () => {
133
// Locate the host container: prefer the element immediately before this
// script tag, falling back to the last not-yet-mounted matching div.
const scriptEl = document.currentScript;
let container = scriptEl ? scriptEl.previousElementSibling : null;
if (!(container && container.classList && container.classList.contains('d3-complexity-ratio'))) {
  const candidates = Array.from(document.querySelectorAll('.d3-complexity-ratio'))
    .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
  container = candidates[candidates.length - 1] || null;
}
if (!container) return;
// Mount guard: initialize each container at most once.
if (container.dataset) {
  if (container.dataset.mounted === 'true') return;
  container.dataset.mounted = 'true';
}

// Tooltip setup (absolutely positioned inside the container)
container.style.position = container.style.position || 'relative';
const tip = document.createElement('div');
tip.className = 'd3-tooltip';
container.appendChild(tip);

// SVG setup
const svg = d3.select(container).append('svg');
const gRoot = svg.append('g');

// Chart groups — append order defines z-order (grid below points/labels)
const gGrid = gRoot.append('g').attr('class', 'grid');
const gReference = gRoot.append('g').attr('class', 'reference');
const gAxes = gRoot.append('g').attr('class', 'axes');
const gWhiskers = gRoot.append('g').attr('class', 'whiskers');
const gPoints = gRoot.append('g').attr('class', 'points');
const gLabels = gRoot.append('g').attr('class', 'labels');
const gLegend = gRoot.append('g').attr('class', 'legend');

// State (width/height are recomputed by updateSize() on every render)
let data = null;
let width = 800;
let height = 500;
// Wide left margin leaves room for model names on the categorical y axis.
const margin = { top: 30, right: 100, bottom: 60, left: 180 };

// Scales — domains/ranges are assigned in render()
const xScale = d3.scaleLinear();
const yScale = d3.scaleBand();

// Data loading
const DATA_URL = '/data/complexity_ratio.json';
177
+
178
// Fill the shared tooltip with `model`'s stats and place it beside the
// cursor, clamped so it never overflows the chart container.
function showTooltip(event, model) {
  const rect = container.getBoundingClientRect();
  const x = event.clientX - rect.left;
  const y = event.clientY - rect.top;

  // Classify the median ratio into a human-readable tendency
  // (±5% around 1.0 counts as "matches").
  const interpretation = model.median_ratio > 1.05
    ? 'Tends to overcomplicate'
    : model.median_ratio < 0.95
    ? 'Tends to oversimplify'
    : 'Matches complexity well';

  tip.innerHTML = `
    <div class="model-name" style="color: ${model.color}">${model.name}</div>
    <div class="metric">
      <span class="metric-label">Median ratio:</span>
      <span class="metric-value">${model.median_ratio.toFixed(2)}</span>
    </div>
    <div class="metric">
      <span class="metric-label">IQR:</span>
      <span class="metric-value">${model.q25.toFixed(2)} – ${model.q75.toFixed(2)}</span>
    </div>
    <div class="metric">
      <span class="metric-label">Samples:</span>
      <span class="metric-value">n=${model.count}</span>
    </div>
    <div class="metric" style="margin-top: 4px;">
      <span class="metric-label">Interpretation:</span>
      <span class="metric-value">${interpretation}</span>
    </div>
  `;

  // Fallback sizes cover the first call, before layout has happened.
  const tipWidth = tip.offsetWidth || 180;
  const tipHeight = tip.offsetHeight || 120;
  let tipX = x + 12;
  let tipY = y - tipHeight / 2;

  // Flip to the left of the cursor if it would overflow the right edge,
  // and clamp vertically inside the chart.
  if (tipX + tipWidth > width) tipX = x - tipWidth - 12;
  if (tipY < 0) tipY = 8;
  if (tipY + tipHeight > height) tipY = height - tipHeight - 8;

  tip.style.transform = `translate(${tipX}px, ${tipY}px)`;
  tip.style.opacity = '1';
}
221
+
222
// Fade the tooltip out and park it far off-screen.
function hideTooltip() {
  const { style } = tip;
  style.transform = 'translate(-9999px, -9999px)';
  style.opacity = '0';
}
226
+
227
// Measure the container, resize the <svg> (height = 55% of width, min 420),
// position the root group, and return the inner plot dimensions.
// Mutates the outer `width`/`height` used by the tooltip clamping.
function updateSize() {
  width = container.clientWidth || 800;
  height = Math.max(420, Math.round(width * 0.55));
  svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
  gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
  return {
    innerWidth: width - margin.left - margin.right,
    innerHeight: height - margin.top - margin.bottom
  };
}
237
+
238
// Redraw the dot-and-whisker chart (median complexity ratio per model with
// IQR whiskers). Idempotent: all selections use joins (keyed by model name
// where data-driven), so resize/theme re-renders update in place.
function render() {
  if (!data) return;

  const { innerWidth, innerHeight } = updateSize();

  // Sort models by median ratio (ascending - lowest at top)
  const models = [...data.models].sort((a, b) => a.median_ratio - b.median_ratio);

  // X scale: ratio values with 10% padding, never narrower than [0.6, 2.4]
  const xMin = d3.min(models, m => m.q25);
  const xMax = d3.max(models, m => m.q75);
  const xPadding = (xMax - xMin) * 0.1;
  xScale
    .domain([Math.min(0.6, xMin - xPadding), Math.max(2.4, xMax + xPadding)])
    .range([0, innerWidth]);

  // Y scale: categorical (model names)
  yScale
    .domain(models.map(m => m.name))
    .range([0, innerHeight])
    .padding(0.4);

  // Grid lines (vertical)
  const xTicks = xScale.ticks(8);
  gGrid.selectAll('.grid-x')
    .data(xTicks)
    .join('line')
    .attr('class', 'grid-x')
    .attr('x1', d => xScale(d))
    .attr('x2', d => xScale(d))
    .attr('y1', 0)
    .attr('y2', innerHeight);

  // Reference line at x=1 (a model that matches complexity exactly)
  gReference.selectAll('.reference-line')
    .data([1])
    .join('line')
    .attr('class', 'reference-line')
    .attr('x1', d => xScale(d))
    .attr('x2', d => xScale(d))
    .attr('y1', 0)
    .attr('y2', innerHeight);

  // Axes (negative inner tick size draws ticks inward)
  const tickSize = 6;

  gAxes.selectAll('.x-axis')
    .data([0])
    .join('g')
    .attr('class', 'x-axis')
    .attr('transform', `translate(0,${innerHeight})`)
    .call(d3.axisBottom(xScale)
      .ticks(8)
      .tickFormat(d3.format('.2f'))
      .tickSizeInner(-tickSize)
      .tickSizeOuter(0));

  gAxes.selectAll('.y-axis')
    .data([0])
    .join('g')
    .attr('class', 'y-axis')
    .call(d3.axisLeft(yScale)
      .tickSizeInner(-tickSize)
      .tickSizeOuter(0));

  // X-axis label
  gAxes.selectAll('.x-label')
    .data([0])
    .join('text')
    .attr('class', 'x-label axis-label')
    .attr('x', innerWidth / 2)
    .attr('y', innerHeight + 40)
    .attr('text-anchor', 'middle')
    .text('Complexity Ratio (Tentative / Actual)');

  // Subtitle
  gAxes.selectAll('.subtitle')
    .data([0])
    .join('text')
    .attr('class', 'subtitle')
    .attr('x', innerWidth / 2)
    .attr('y', innerHeight + 54)
    .attr('text-anchor', 'middle')
    .text('>1: Overcomplicates | <1: Oversimplifies | =1: Matches complexity');

  // Marker geometry derived from the band height
  const bandHeight = yScale.bandwidth();
  const capHeight = bandHeight * 0.4;
  const pointSize = Math.min(8, bandHeight * 0.35);

  // Whiskers (IQR lines)
  gWhiskers.selectAll('.whisker-line')
    .data(models, d => d.name)
    .join('line')
    .attr('class', 'whisker-line')
    .attr('x1', d => xScale(d.q25))
    .attr('x2', d => xScale(d.q75))
    .attr('y1', d => yScale(d.name) + bandHeight / 2)
    .attr('y2', d => yScale(d.name) + bandHeight / 2)
    .attr('stroke', d => d.color);

  // Left whisker caps
  gWhiskers.selectAll('.whisker-cap-left')
    .data(models, d => d.name)
    .join('line')
    .attr('class', 'whisker-cap whisker-cap-left')
    .attr('x1', d => xScale(d.q25))
    .attr('x2', d => xScale(d.q25))
    .attr('y1', d => yScale(d.name) + bandHeight / 2 - capHeight / 2)
    .attr('y2', d => yScale(d.name) + bandHeight / 2 + capHeight / 2)
    .attr('stroke', d => d.color);

  // Right whisker caps
  gWhiskers.selectAll('.whisker-cap-right')
    .data(models, d => d.name)
    .join('line')
    .attr('class', 'whisker-cap whisker-cap-right')
    .attr('x1', d => xScale(d.q75))
    .attr('x2', d => xScale(d.q75))
    .attr('y1', d => yScale(d.name) + bandHeight / 2 - capHeight / 2)
    .attr('y2', d => yScale(d.name) + bandHeight / 2 + capHeight / 2)
    .attr('stroke', d => d.color);

  // Model points - circles for closed, squares for open
  const closedModels = models.filter(m => !m.is_open);
  const openModels = models.filter(m => m.is_open);

  // Closed models: filled circles
  gPoints.selectAll('.model-point-circle')
    .data(closedModels, d => d.name)
    .join('circle')
    .attr('class', 'model-point model-point-circle')
    .attr('cx', d => xScale(d.median_ratio))
    .attr('cy', d => yScale(d.name) + bandHeight / 2)
    .attr('r', pointSize)
    .attr('fill', d => d.color)
    .attr('stroke', d => d.color)
    .on('mouseenter', (event, d) => showTooltip(event, d))
    .on('mousemove', (event, d) => showTooltip(event, d))
    .on('mouseleave', hideTooltip);

  // Open models: hollow squares
  gPoints.selectAll('.model-point-square')
    .data(openModels, d => d.name)
    .join('rect')
    .attr('class', 'model-point model-point-square')
    .attr('x', d => xScale(d.median_ratio) - pointSize)
    .attr('y', d => yScale(d.name) + bandHeight / 2 - pointSize)
    .attr('width', pointSize * 2)
    .attr('height', pointSize * 2)
    .attr('fill', 'none')
    .attr('stroke', d => d.color)
    .attr('stroke-width', 2)
    .on('mouseenter', (event, d) => showTooltip(event, d))
    .on('mousemove', (event, d) => showTooltip(event, d))
    .on('mouseleave', hideTooltip);

  // Ratio labels on the right, in the right margin
  gLabels.selectAll('.ratio-label')
    .data(models, d => d.name)
    .join('text')
    .attr('class', 'ratio-label')
    .attr('x', innerWidth + 8)
    .attr('y', d => yScale(d.name) + bandHeight / 2)
    .attr('dy', '0.35em')
    .text(d => `${d.median_ratio.toFixed(2)} (n=${d.count})`);

  // Legend (drawn above the plot area, right-aligned)
  const legendY = -15;
  const legendItems = [
    { label: 'Closed model', shape: 'circle' },
    { label: 'Open model', shape: 'square' }
  ];

  const legendGroup = gLegend.selectAll('.legend-item')
    .data(legendItems)
    .join('g')
    .attr('class', 'legend-item')
    .attr('transform', (d, i) => `translate(${innerWidth - 80 - i * 100}, ${legendY})`);

  // Per-item nested joins: each item renders only its own shape.
  legendGroup.selectAll('.legend-shape-circle')
    .data(d => d.shape === 'circle' ? [d] : [])
    .join('circle')
    .attr('class', 'legend-shape-circle')
    .attr('cx', 0)
    .attr('cy', 0)
    .attr('r', 5)
    .attr('fill', 'var(--muted-color)');

  legendGroup.selectAll('.legend-shape-square')
    .data(d => d.shape === 'square' ? [d] : [])
    .join('rect')
    .attr('class', 'legend-shape-square')
    .attr('x', -5)
    .attr('y', -5)
    .attr('width', 10)
    .attr('height', 10)
    .attr('fill', 'none')
    .attr('stroke', 'var(--muted-color)')
    .attr('stroke-width', 2);

  legendGroup.selectAll('.legend-text')
    .data(d => [d])
    .join('text')
    .attr('class', 'legend-text')
    .attr('x', 10)
    .attr('y', 0)
    .attr('dy', '0.35em')
    .text(d => d.label);
}
447
+
448
// Initialize: fetch the dataset once, then draw.
// NOTE(review): r.ok is not checked — a 404/HTML error page will surface
// as a JSON parse error in the catch below rather than an HTTP error.
fetch(DATA_URL, { cache: 'no-cache' })
  .then(r => r.json())
  .then(json => {
    data = json;
    render();
  })
  .catch(err => {
    // Surface load failures inline instead of failing silently.
    const pre = document.createElement('pre');
    pre.style.color = 'red';
    pre.style.padding = '16px';
    pre.textContent = `Error loading data: ${err.message}`;
    container.appendChild(pre);
  });

// Resize handling (render() is a no-op until data has loaded)
if (window.ResizeObserver) {
  new ResizeObserver(() => render()).observe(container);
} else {
  window.addEventListener('resize', render);
}

// Theme change handling: re-render when the root `data-theme` attribute
// flips so CSS-variable-driven colors are re-applied.
const observer = new MutationObserver(() => render());
observer.observe(document.documentElement, {
  attributes: true,
  attributeFilter: ['data-theme']
});
476
+ };
477
+
478
// Defer bootstrap until the DOM is parsed (so the container div exists),
// loading D3 first in either case.
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
} else {
  ensureD3(bootstrap);
}
483
+ })();
484
+ </script>
app/src/content/embeds/score-vs-recklessness.html ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="d3-score-vs-recklessness"></div>
2
+ <style>
3
+ .d3-score-vs-recklessness {
4
+ width: 100%;
5
+ margin: 10px 0;
6
+ position: relative;
7
+ font-family: system-ui, -apple-system, sans-serif;
8
+ }
9
+
10
+ .d3-score-vs-recklessness svg {
11
+ display: block;
12
+ width: 100%;
13
+ height: auto;
14
+ }
15
+
16
+ .d3-score-vs-recklessness .axes path,
17
+ .d3-score-vs-recklessness .axes line {
18
+ stroke: var(--axis-color, var(--text-color));
19
+ }
20
+
21
+ .d3-score-vs-recklessness .axes text {
22
+ fill: var(--tick-color, var(--muted-color));
23
+ font-size: 14px;
24
+ }
25
+
26
+ .d3-score-vs-recklessness .grid line {
27
+ stroke: var(--grid-color, rgba(0,0,0,.15));
28
+ }
29
+
30
+ .d3-score-vs-recklessness .axes text.axis-label {
31
+ font-size: 18px;
32
+ font-weight: 500;
33
+ fill: var(--text-color);
34
+ }
35
+
36
+ .d3-score-vs-recklessness .x-axis text {
37
+ transform: translateY(4px);
38
+ }
39
+
40
+ .d3-score-vs-recklessness .point {
41
+ cursor: pointer;
42
+ transition: opacity 0.15s ease;
43
+ }
44
+
45
+ .d3-score-vs-recklessness .point:hover {
46
+ opacity: 0.8;
47
+ }
48
+
49
+ .d3-score-vs-recklessness .point-label {
50
+ font-size: 11px;
51
+ fill: var(--text-color);
52
+ pointer-events: none;
53
+ }
54
+
55
+ .d3-score-vs-recklessness .annotation {
56
+ font-size: 11px;
57
+ font-style: italic;
58
+ fill: var(--muted-color);
59
+ }
60
+
61
+ .d3-score-vs-recklessness .d3-tooltip {
62
+ position: absolute;
63
+ top: 0;
64
+ left: 0;
65
+ transform: translate(-9999px, -9999px);
66
+ pointer-events: none;
67
+ padding: 10px 12px;
68
+ border-radius: 8px;
69
+ font-size: 12px;
70
+ line-height: 1.4;
71
+ border: 1px solid var(--border-color);
72
+ background: var(--surface-bg);
73
+ color: var(--text-color);
74
+ box-shadow: 0 4px 24px rgba(0,0,0,.18);
75
+ opacity: 0;
76
+ transition: opacity 0.12s ease;
77
+ z-index: 10;
78
+ }
79
+
80
+ .d3-score-vs-recklessness .d3-tooltip .model-name {
81
+ font-weight: 600;
82
+ margin-bottom: 4px;
83
+ }
84
+
85
+ .d3-score-vs-recklessness .d3-tooltip .metric {
86
+ display: flex;
87
+ justify-content: space-between;
88
+ gap: 16px;
89
+ }
90
+
91
+ .d3-score-vs-recklessness .d3-tooltip .metric-label {
92
+ color: var(--muted-color);
93
+ }
94
+
95
+ .d3-score-vs-recklessness .d3-tooltip .metric-value {
96
+ font-weight: 500;
97
+ }
98
+ </style>
99
+ <script>
100
+ (() => {
101
// Run `cb` as soon as D3 is usable, lazily injecting the shared CDN
// <script> tag (id "d3-cdn-script") the first time any embed needs it.
const ensureD3 = (cb) => {
  const d3Ready = () => window.d3 && typeof window.d3.select === 'function';
  if (d3Ready()) return cb();
  let loader = document.getElementById('d3-cdn-script');
  if (!loader) {
    loader = document.createElement('script');
    loader.id = 'd3-cdn-script';
    loader.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
    document.head.appendChild(loader);
  }
  const onReady = () => {
    if (d3Ready()) cb();
  };
  loader.addEventListener('load', onReady, { once: true });
  // The shared tag may already have finished loading; check immediately.
  if (window.d3) onReady();
};
114
+
115
+ const bootstrap = () => {
116
// Locate the host container: prefer the element immediately before this
// script tag, falling back to the last not-yet-mounted matching div.
const scriptEl = document.currentScript;
let container = scriptEl ? scriptEl.previousElementSibling : null;
if (!(container && container.classList && container.classList.contains('d3-score-vs-recklessness'))) {
  const candidates = Array.from(document.querySelectorAll('.d3-score-vs-recklessness'))
    .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
  container = candidates[candidates.length - 1] || null;
}
if (!container) return;
// Mount guard: initialize each container at most once.
if (container.dataset) {
  if (container.dataset.mounted === 'true') return;
  container.dataset.mounted = 'true';
}

// Tooltip setup (absolutely positioned inside the container)
container.style.position = container.style.position || 'relative';
const tip = document.createElement('div');
tip.className = 'd3-tooltip';
container.appendChild(tip);

// SVG setup
const svg = d3.select(container).append('svg');

// Add gradient definition
// NOTE(review): the gradient id is page-global; two instances of this embed
// on one page would define duplicate ids (the first one wins) — verify.
const defs = svg.append('defs');
const gradient = defs.append('linearGradient')
  .attr('id', 'recklessness-gradient')
  .attr('x1', '0%')
  .attr('x2', '100%')
  .attr('y1', '0%')
  .attr('y2', '0%');

// Gradient stops: red -> orange -> yellow -> green -> yellow -> orange -> red
gradient.append('stop').attr('offset', '0%').attr('stop-color', 'rgba(239, 83, 80, 0.25)'); // red
gradient.append('stop').attr('offset', '20%').attr('stop-color', 'rgba(255, 152, 0, 0.25)'); // orange
gradient.append('stop').attr('offset', '35%').attr('stop-color', 'rgba(255, 235, 59, 0.25)'); // yellow
gradient.append('stop').attr('offset', '50%').attr('stop-color', 'rgba(102, 187, 106, 0.35)'); // green
gradient.append('stop').attr('offset', '65%').attr('stop-color', 'rgba(255, 235, 59, 0.25)'); // yellow
gradient.append('stop').attr('offset', '80%').attr('stop-color', 'rgba(255, 152, 0, 0.25)'); // orange
gradient.append('stop').attr('offset', '100%').attr('stop-color', 'rgba(239, 83, 80, 0.25)'); // red

const gRoot = svg.append('g');

// Chart groups (order matters for layering: background lowest, labels top)
const gBackground = gRoot.append('g').attr('class', 'background');
const gGrid = gRoot.append('g').attr('class', 'grid');
const gAxes = gRoot.append('g').attr('class', 'axes');
const gAnnotations = gRoot.append('g').attr('class', 'annotations');
const gPoints = gRoot.append('g').attr('class', 'points');
const gLabels = gRoot.append('g').attr('class', 'labels');

// State (width/height are recomputed by updateSize() on every render)
let data = null;
let width = 800;
let height = 450;
// Wide right margin leaves room for point labels near the right edge.
const margin = { top: 20, right: 120, bottom: 56, left: 72 };

// Scales — domains/ranges are assigned in render()
const xScale = d3.scaleLinear();
const yScale = d3.scaleLinear();

// Data loading
const DATA_URL = '/data/score_vs_recklessness.json';
178
+
179
// Build an SVG path string for a 5-point star centered at (cx, cy),
// alternating between the outer and inner radius over 10 vertices.
const starPath = (cx, cy, outerR, innerR) => {
  const vertices = Array.from({ length: 10 }, (_, k) => {
    const radius = k % 2 === 0 ? outerR : innerR;
    const theta = (Math.PI / 2) + (k * Math.PI / 5);
    return `${cx + radius * Math.cos(theta)},${cy - radius * Math.sin(theta)}`;
  });
  return 'M' + vertices.join('L') + 'Z';
};
189
+
190
// Measure the container, resize the <svg> (3:2 aspect, min height 300),
// position the root group, and return the inner plot dimensions.
// Mutates the outer `width`/`height` used by the tooltip clamping.
function updateSize() {
  width = container.clientWidth || 800;
  height = Math.max(300, Math.round(width / 1.5));
  svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
  gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
  return {
    innerWidth: width - margin.left - margin.right,
    innerHeight: height - margin.top - margin.bottom
  };
}
200
+
201
// Fill the shared tooltip with model `d`'s stats and place it beside the
// cursor, clamped so it never overflows the chart container.
function showTooltip(event, d) {
  const rect = container.getBoundingClientRect();
  const x = event.clientX - rect.left;
  const y = event.clientY - rect.top;

  tip.innerHTML = `
    <div class="model-name" style="color: ${d.color}">${d.name}</div>
    <div class="metric">
      <span class="metric-label">Score:</span>
      <span class="metric-value">${d.avg_floored_score.toFixed(1)}</span>
    </div>
    <div class="metric">
      <span class="metric-label">Recklessness Index:</span>
      <span class="metric-value">${d.recklessness_index.toFixed(2)}</span>
    </div>
    <div class="metric">
      <span class="metric-label">Failed Guesses:</span>
      <span class="metric-value">${d.avg_failed_guesses.toFixed(2)}</span>
    </div>
    <div class="metric">
      <span class="metric-label">Caution:</span>
      <span class="metric-value">${d.avg_caution.toFixed(2)}</span>
    </div>
    <div class="metric">
      <span class="metric-label">Type:</span>
      <span class="metric-value">${d.is_open ? 'Open' : 'Closed'}</span>
    </div>
  `;

  // Fallback sizes cover the first call, before layout has happened.
  const tipWidth = tip.offsetWidth || 180;
  const tipHeight = tip.offsetHeight || 120;
  let tipX = x + 12;
  let tipY = y - tipHeight / 2;

  // Flip to the left of the cursor if it would overflow the right edge,
  // and clamp vertically inside the chart.
  if (tipX + tipWidth > width) tipX = x - tipWidth - 12;
  if (tipY < 0) tipY = 8;
  if (tipY + tipHeight > height) tipY = height - tipHeight - 8;

  tip.style.transform = `translate(${tipX}px, ${tipY}px)`;
  tip.style.opacity = '1';
}
242
+
243
// Fade the tooltip out and park it far off-screen.
function hideTooltip() {
  const { style } = tip;
  style.transform = 'translate(-9999px, -9999px)';
  style.opacity = '0';
}
247
+
248
// Redraw the score-vs-boldness scatter chart. Idempotent: every selection
// uses a join (keyed by model name where data-driven), so resize/theme
// re-renders update in place instead of duplicating nodes.
function render() {
  if (!data) return;

  const { innerWidth, innerHeight } = updateSize();
  const models = data.models;
  // Guard: an empty model list would give d3.extent() an
  // [undefined, undefined] result and NaN scale domains below.
  if (!Array.isArray(models) || models.length === 0) return;

  // Fixed symmetric X scale from -8 to 8
  xScale
    .domain([-8, 8])
    .range([0, innerWidth]);

  // Y scale based on data, with 10% headroom above the top point
  const yExtent = d3.extent(models, d => d.avg_floored_score);
  const yPadding = (yExtent[1] - yExtent[0]) * 0.1;
  yScale
    .domain([yExtent[0], yExtent[1] + yPadding])
    .range([innerHeight, 0])
    .nice();

  // Background gradient rectangle (red/orange/green bands from the defs)
  gBackground.selectAll('.bg-gradient')
    .data([0])
    .join('rect')
    .attr('class', 'bg-gradient')
    .attr('x', 0)
    .attr('y', 0)
    .attr('width', innerWidth)
    .attr('height', innerHeight)
    .attr('fill', 'url(#recklessness-gradient)');

  // Grid lines
  const xTicks = xScale.ticks(8);
  const yTicks = yScale.ticks(6);

  gGrid.selectAll('.grid-x')
    .data(xTicks)
    .join('line')
    .attr('class', 'grid-x')
    .attr('x1', d => xScale(d))
    .attr('x2', d => xScale(d))
    .attr('y1', 0)
    .attr('y2', innerHeight);

  gGrid.selectAll('.grid-y')
    .data(yTicks)
    .join('line')
    .attr('class', 'grid-y')
    .attr('x1', 0)
    .attr('x2', innerWidth)
    .attr('y1', d => yScale(d))
    .attr('y2', d => yScale(d));

  // Axes with inner ticks (negative inner size draws ticks inward)
  const tickSize = 6;
  gAxes.selectAll('.x-axis')
    .data([0])
    .join('g')
    .attr('class', 'x-axis')
    .attr('transform', `translate(0,${innerHeight})`)
    .call(d3.axisBottom(xScale).ticks(8).tickSizeInner(-tickSize).tickSizeOuter(0));

  gAxes.selectAll('.y-axis')
    .data([0])
    .join('g')
    .attr('class', 'y-axis')
    .call(d3.axisLeft(yScale).ticks(6).tickSizeInner(-tickSize).tickSizeOuter(0));

  // Axis labels
  gAxes.selectAll('.x-label')
    .data([0])
    .join('text')
    .attr('class', 'x-label axis-label')
    .attr('x', innerWidth / 2)
    .attr('y', innerHeight + 44)
    .attr('text-anchor', 'middle')
    .text('Boldness Index');

  gAxes.selectAll('.y-label')
    .data([0])
    .join('text')
    .attr('class', 'y-label axis-label')
    .attr('x', -innerHeight / 2)
    .attr('y', -52)
    .attr('text-anchor', 'middle')
    .attr('transform', 'rotate(-90)')
    .text('Score');

  // Top annotations: Overcautious / Cautious / Measured / Bold / Reckless
  const annotations = [
    { label: 'Overcautious', color: 'rgba(239, 83, 80, 0.9)', pos: 0.07 }, // red
    { label: 'Cautious', color: 'rgba(255, 180, 0, 0.9)', pos: 0.25 }, // yellow/orange
    { label: 'Measured', color: 'rgba(76, 175, 80, 0.9)', pos: 0.5 }, // green
    { label: 'Bold', color: 'rgba(255, 180, 0, 0.9)', pos: 0.75 }, // yellow/orange
    { label: 'Reckless', color: 'rgba(239, 83, 80, 0.9)', pos: 0.95 } // red
  ];

  gAnnotations.selectAll('.annotation-label')
    .data(annotations, d => d.label)
    .join('text')
    .attr('class', 'annotation annotation-label')
    .attr('x', d => d.pos * innerWidth)
    .attr('y', 16)
    // FIX: the previous strict checks (pos === 0 / pos === 1) never matched
    // the actual pos values (0.07 … 0.95), so every label was centered and
    // the edge labels could spill past the plot. Use thresholds instead.
    .attr('text-anchor', d => d.pos <= 0.1 ? 'start' : d.pos >= 0.9 ? 'end' : 'middle')
    .style('fill', d => d.color)
    .style('font-weight', 'bold')
    .style('font-size', '13px')
    .text(d => d.label);

  // Points (radius scales with chart width, clamped to [8, 14])
  const pointRadius = Math.max(8, Math.min(14, innerWidth / 60));

  // Closed models as filled circles
  const closedModels = models.filter(d => !d.is_open);
  gPoints.selectAll('.point-closed')
    .data(closedModels, d => d.name)
    .join('circle')
    .attr('class', 'point point-closed')
    .attr('cx', d => xScale(d.recklessness_index))
    .attr('cy', d => yScale(d.avg_floored_score))
    .attr('r', pointRadius)
    .attr('fill', d => d.color)
    .attr('stroke', 'none')
    .on('mouseenter', showTooltip)
    .on('mousemove', showTooltip)
    .on('mouseleave', hideTooltip);

  // Open models as stars
  const openModels = models.filter(d => d.is_open);
  gPoints.selectAll('.point-star')
    .data(openModels, d => d.name)
    .join('path')
    .attr('class', 'point point-star')
    .attr('d', d => starPath(xScale(d.recklessness_index), yScale(d.avg_floored_score), pointRadius * 1.2, pointRadius * 0.5))
    .attr('fill', d => d.color)
    .attr('stroke', 'none')
    .on('mouseenter', showTooltip)
    .on('mousemove', showTooltip)
    .on('mouseleave', hideTooltip);

  // Point labels with smart positioning: points within 100px of the right
  // edge get their label drawn to the left so it stays inside the plot.
  gLabels.selectAll('.point-label')
    .data(models, d => d.name)
    .join('text')
    .attr('class', 'point-label')
    .attr('x', d => {
      const xPos = xScale(d.recklessness_index);
      if (xPos > innerWidth - 100) {
        return xPos - pointRadius - 6;
      }
      return xPos + pointRadius + 6;
    })
    .attr('y', d => yScale(d.avg_floored_score) + 4)
    .attr('text-anchor', d => {
      const xPos = xScale(d.recklessness_index);
      return xPos > innerWidth - 100 ? 'end' : 'start';
    })
    .text(d => d.name);
}
406
+
407
// Initialize: fetch the dataset once, then draw.
// NOTE(review): r.ok is not checked — a 404/HTML error page will surface
// as a JSON parse error in the catch below rather than an HTTP error.
fetch(DATA_URL, { cache: 'no-cache' })
  .then(r => r.json())
  .then(json => {
    data = json;
    render();
  })
  .catch(err => {
    // Surface load failures inline instead of failing silently.
    const pre = document.createElement('pre');
    pre.style.color = 'red';
    pre.style.padding = '16px';
    pre.textContent = `Error loading data: ${err.message}`;
    container.appendChild(pre);
  });

// Resize handling (render() is a no-op until data has loaded)
if (window.ResizeObserver) {
  new ResizeObserver(() => render()).observe(container);
} else {
  window.addEventListener('resize', render);
}

// Theme change handling: re-render when the root `data-theme` attribute
// flips so CSS-variable-driven colors are re-applied.
const observer = new MutationObserver(() => render());
observer.observe(document.documentElement, {
  attributes: true,
  attributeFilter: ['data-theme']
});
435
+ };
436
+
437
// Defer bootstrap until the DOM is parsed (so the container div exists),
// loading D3 first in either case.
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
} else {
  ensureD3(bootstrap);
}
442
+ })();
443
+ </script>
app/src/content/embeds/tokens-by-turn.html ADDED
@@ -0,0 +1,487 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="d3-tokens-by-turn"></div>
2
+ <style>
3
+ .d3-tokens-by-turn {
4
+ width: 100%;
5
+ margin: 10px 0;
6
+ position: relative;
7
+ font-family: system-ui, -apple-system, sans-serif;
8
+ }
9
+
10
+ .d3-tokens-by-turn svg {
11
+ display: block;
12
+ width: 100%;
13
+ height: auto;
14
+ }
15
+
16
+ .d3-tokens-by-turn .axes path,
17
+ .d3-tokens-by-turn .axes line {
18
+ stroke: var(--axis-color, var(--text-color));
19
+ }
20
+
21
+ .d3-tokens-by-turn .axes text {
22
+ fill: var(--tick-color, var(--muted-color));
23
+ font-size: 11px;
24
+ }
25
+
26
+ .d3-tokens-by-turn .grid line {
27
+ stroke: var(--grid-color, rgba(0,0,0,.08));
28
+ }
29
+
30
+ .d3-tokens-by-turn .axes text.axis-label {
31
+ font-size: 14px;
32
+ font-weight: 500;
33
+ fill: var(--text-color);
34
+ }
35
+
36
+ .d3-tokens-by-turn .x-axis text {
37
+ transform: translateY(4px);
38
+ }
39
+
40
+ .d3-tokens-by-turn .tokens-line {
41
+ fill: none;
42
+ stroke-width: 1.5;
43
+ }
44
+
45
+ .d3-tokens-by-turn .data-point {
46
+ cursor: pointer;
47
+ transition: opacity 0.15s ease;
48
+ }
49
+
50
+ .d3-tokens-by-turn .data-point:hover {
51
+ opacity: 0.8;
52
+ }
53
+
54
+ .d3-tokens-by-turn .legend {
55
+ font-size: 11px;
56
+ }
57
+
58
+ .d3-tokens-by-turn .legend-item {
59
+ cursor: pointer;
60
+ }
61
+
62
+ .d3-tokens-by-turn .legend-item.dimmed .legend-line,
63
+ .d3-tokens-by-turn .legend-item.dimmed .legend-marker {
64
+ opacity: 0.3;
65
+ }
66
+
67
+ .d3-tokens-by-turn .legend-item.dimmed text {
68
+ opacity: 0.4;
69
+ }
70
+
71
+ .d3-tokens-by-turn .legend-text {
72
+ fill: var(--text-color);
73
+ }
74
+
75
+ .d3-tokens-by-turn .d3-tooltip {
76
+ position: absolute;
77
+ top: 0;
78
+ left: 0;
79
+ transform: translate(-9999px, -9999px);
80
+ pointer-events: none;
81
+ padding: 10px 12px;
82
+ border-radius: 8px;
83
+ font-size: 12px;
84
+ line-height: 1.4;
85
+ border: 1px solid var(--border-color);
86
+ background: var(--surface-bg);
87
+ color: var(--text-color);
88
+ box-shadow: 0 4px 24px rgba(0,0,0,.18);
89
+ opacity: 0;
90
+ transition: opacity 0.12s ease;
91
+ z-index: 10;
92
+ }
93
+
94
+ .d3-tokens-by-turn .d3-tooltip .model-name {
95
+ font-weight: 600;
96
+ margin-bottom: 4px;
97
+ }
98
+
99
+ .d3-tokens-by-turn .d3-tooltip .metric {
100
+ display: flex;
101
+ justify-content: space-between;
102
+ gap: 16px;
103
+ }
104
+
105
+ .d3-tokens-by-turn .d3-tooltip .metric-label {
106
+ color: var(--muted-color);
107
+ }
108
+
109
+ .d3-tokens-by-turn .d3-tooltip .metric-value {
110
+ font-weight: 500;
111
+ }
112
+ </style>
113
+ <script>
114
+ (() => {
115
+ const ensureD3 = (cb) => {
116
+ if (window.d3 && typeof window.d3.select === 'function') return cb();
117
+ let s = document.getElementById('d3-cdn-script');
118
+ if (!s) {
119
+ s = document.createElement('script');
120
+ s.id = 'd3-cdn-script';
121
+ s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
122
+ document.head.appendChild(s);
123
+ }
124
+ const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); };
125
+ s.addEventListener('load', onReady, { once: true });
126
+ if (window.d3) onReady();
127
+ };
128
+
129
+ const bootstrap = () => {
130
+ const scriptEl = document.currentScript;
131
+ let container = scriptEl ? scriptEl.previousElementSibling : null;
132
+ if (!(container && container.classList && container.classList.contains('d3-tokens-by-turn'))) {
133
+ const candidates = Array.from(document.querySelectorAll('.d3-tokens-by-turn'))
134
+ .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
135
+ container = candidates[candidates.length - 1] || null;
136
+ }
137
+ if (!container) return;
138
+ if (container.dataset) {
139
+ if (container.dataset.mounted === 'true') return;
140
+ container.dataset.mounted = 'true';
141
+ }
142
+
143
+ // Tooltip setup
144
+ container.style.position = container.style.position || 'relative';
145
+ const tip = document.createElement('div');
146
+ tip.className = 'd3-tooltip';
147
+ container.appendChild(tip);
148
+
149
+ // SVG setup
150
+ const svg = d3.select(container).append('svg');
151
+ const gRoot = svg.append('g');
152
+
153
+ // Chart groups (order matters for layering)
154
+ const gGrid = gRoot.append('g').attr('class', 'grid');
155
+ const gLines = gRoot.append('g').attr('class', 'lines');
156
+ const gPoints = gRoot.append('g').attr('class', 'points');
157
+ const gAxes = gRoot.append('g').attr('class', 'axes');
158
+ const gLegend = gRoot.append('g').attr('class', 'legend');
159
+
160
+ // State
161
+ let data = null;
162
+ let width = 800;
163
+ let height = 450;
164
+ const margin = { top: 20, right: 180, bottom: 56, left: 72 };
165
+ let hiddenModels = new Set();
166
+
167
+ // Scales
168
+ const xScale = d3.scaleLinear();
169
+ const yScale = d3.scaleLinear();
170
+
171
+ // Line generator
172
+ const line = d3.line()
173
+ .x(d => xScale(d.turn_number))
174
+ .y(d => yScale(d.avg_output_tokens));
175
+
176
+ // Data loading
177
+ const DATA_URL = '/data/tokens_by_turn.json';
178
+
179
+ function updateSize() {
180
+ width = container.clientWidth || 800;
181
+ height = Math.max(350, Math.round(width * 0.5));
182
+ svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
183
+ gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
184
+ return {
185
+ innerWidth: width - margin.left - margin.right,
186
+ innerHeight: height - margin.top - margin.bottom
187
+ };
188
+ }
189
+
190
+ function showTooltip(event, d, model) {
191
+ const rect = container.getBoundingClientRect();
192
+ const x = event.clientX - rect.left;
193
+ const y = event.clientY - rect.top;
194
+
195
+ tip.innerHTML = `
196
+ <div class="model-name" style="color: ${model.color}">${model.name}</div>
197
+ <div class="metric">
198
+ <span class="metric-label">Turn:</span>
199
+ <span class="metric-value">${d.turn_number}</span>
200
+ </div>
201
+ <div class="metric">
202
+ <span class="metric-label">Avg tokens:</span>
203
+ <span class="metric-value">${Math.round(d.avg_output_tokens).toLocaleString()}</span>
204
+ </div>
205
+ <div class="metric">
206
+ <span class="metric-label">Sample size:</span>
207
+ <span class="metric-value">${d.sample_count}</span>
208
+ </div>
209
+ `;
210
+
211
+ const tipWidth = tip.offsetWidth || 150;
212
+ const tipHeight = tip.offsetHeight || 100;
213
+ let tipX = x + 12;
214
+ let tipY = y - tipHeight / 2;
215
+
216
+ if (tipX + tipWidth > width) tipX = x - tipWidth - 12;
217
+ if (tipY < 0) tipY = 8;
218
+ if (tipY + tipHeight > height) tipY = height - tipHeight - 8;
219
+
220
+ tip.style.transform = `translate(${tipX}px, ${tipY}px)`;
221
+ tip.style.opacity = '1';
222
+ }
223
+
224
+ function hideTooltip() {
225
+ tip.style.opacity = '0';
226
+ tip.style.transform = 'translate(-9999px, -9999px)';
227
+ }
228
+
229
+ function toggleModel(modelName) {
230
+ if (hiddenModels.has(modelName)) {
231
+ hiddenModels.delete(modelName);
232
+ } else {
233
+ hiddenModels.add(modelName);
234
+ }
235
+ render();
236
+ }
237
+
238
+ // Helper function to create a 5-point star path
239
+ const starPath = (cx, cy, outerR, innerR) => {
240
+ const points = [];
241
+ for (let i = 0; i < 10; i++) {
242
+ const r = i % 2 === 0 ? outerR : innerR;
243
+ const angle = (Math.PI / 2) + (i * Math.PI / 5);
244
+ points.push([cx + r * Math.cos(angle), cy - r * Math.sin(angle)]);
245
+ }
246
+ return 'M' + points.map(p => p.join(',')).join('L') + 'Z';
247
+ };
248
+
249
+ function render() {
250
+ if (!data) return;
251
+
252
+ const { innerWidth, innerHeight } = updateSize();
253
+ const models = data.models;
254
+
255
+ // Find visible models and compute extents
256
+ const visibleModels = models.filter(m => !hiddenModels.has(m.name));
257
+
258
+ // X scale: turn number 1-30
259
+ xScale
260
+ .domain([1, 30])
261
+ .range([0, innerWidth]);
262
+
263
+ // Y scale: find max tokens across visible models
264
+ let maxTokens = 0;
265
+ visibleModels.forEach(m => {
266
+ m.tokens_by_turn.forEach(t => {
267
+ if (t.avg_output_tokens > maxTokens) maxTokens = t.avg_output_tokens;
268
+ });
269
+ });
270
+ maxTokens = Math.ceil(maxTokens / 2000) * 2000; // Round up to nearest 2000
271
+
272
+ yScale
273
+ .domain([0, maxTokens])
274
+ .range([innerHeight, 0]);
275
+
276
+ // Grid lines
277
+ const xTicks = d3.range(5, 31, 5); // 5, 10, 15, 20, 25, 30
278
+ const yTicks = yScale.ticks(6);
279
+
280
+ gGrid.selectAll('.grid-x')
281
+ .data(xTicks)
282
+ .join('line')
283
+ .attr('class', 'grid-x')
284
+ .attr('x1', d => xScale(d))
285
+ .attr('x2', d => xScale(d))
286
+ .attr('y1', 0)
287
+ .attr('y2', innerHeight);
288
+
289
+ gGrid.selectAll('.grid-y')
290
+ .data(yTicks)
291
+ .join('line')
292
+ .attr('class', 'grid-y')
293
+ .attr('x1', 0)
294
+ .attr('x2', innerWidth)
295
+ .attr('y1', d => yScale(d))
296
+ .attr('y2', d => yScale(d));
297
+
298
+ // Axes
299
+ const tickSize = 6;
300
+
301
+ gAxes.selectAll('.x-axis')
302
+ .data([0])
303
+ .join('g')
304
+ .attr('class', 'x-axis')
305
+ .attr('transform', `translate(0,${innerHeight})`)
306
+ .call(d3.axisBottom(xScale)
307
+ .tickValues([1, 5, 10, 15, 20, 25, 30])
308
+ .tickSizeInner(-tickSize)
309
+ .tickSizeOuter(0));
310
+
311
+ gAxes.selectAll('.y-axis')
312
+ .data([0])
313
+ .join('g')
314
+ .attr('class', 'y-axis')
315
+ .call(d3.axisLeft(yScale)
316
+ .ticks(6)
317
+ .tickFormat(d => d >= 1000 ? `${d/1000}k` : d)
318
+ .tickSizeInner(-tickSize)
319
+ .tickSizeOuter(0));
320
+
321
+ // Axis labels
322
+ gAxes.selectAll('.x-label')
323
+ .data([0])
324
+ .join('text')
325
+ .attr('class', 'x-label axis-label')
326
+ .attr('x', innerWidth / 2)
327
+ .attr('y', innerHeight + 44)
328
+ .attr('text-anchor', 'middle')
329
+ .text('Turn Number');
330
+
331
+ gAxes.selectAll('.y-label')
332
+ .data([0])
333
+ .join('text')
334
+ .attr('class', 'y-label axis-label')
335
+ .attr('x', -innerHeight / 2)
336
+ .attr('y', -52)
337
+ .attr('text-anchor', 'middle')
338
+ .attr('transform', 'rotate(-90)')
339
+ .text('Average Output Tokens');
340
+
341
+ // Lines for each model
342
+ gLines.selectAll('.tokens-line')
343
+ .data(visibleModels, d => d.name)
344
+ .join('path')
345
+ .attr('class', 'tokens-line')
346
+ .attr('d', d => line(d.tokens_by_turn))
347
+ .attr('stroke', d => d.color)
348
+ .attr('stroke-dasharray', d => d.is_open ? '6,3' : 'none');
349
+
350
+ // Data points
351
+ const allPoints = visibleModels.flatMap(model =>
352
+ model.tokens_by_turn.map(p => ({ ...p, model }))
353
+ );
354
+ const closedPoints = allPoints.filter(d => !d.model.is_open);
355
+ const openPoints = allPoints.filter(d => d.model.is_open);
356
+
357
+ // Circles for closed models
358
+ gPoints.selectAll('.data-point-circle')
359
+ .data(closedPoints, d => `${d.model.name}-${d.turn_number}`)
360
+ .join('circle')
361
+ .attr('class', 'data-point data-point-circle')
362
+ .attr('cx', d => xScale(d.turn_number))
363
+ .attr('cy', d => yScale(d.avg_output_tokens))
364
+ .attr('r', 3)
365
+ .attr('fill', d => d.model.color)
366
+ .attr('stroke', 'var(--surface-bg, white)')
367
+ .attr('stroke-width', 1)
368
+ .on('mouseenter', (event, d) => showTooltip(event, d, d.model))
369
+ .on('mousemove', (event, d) => showTooltip(event, d, d.model))
370
+ .on('mouseleave', hideTooltip);
371
+
372
+ // Stars for open models
373
+ gPoints.selectAll('.data-point-star')
374
+ .data(openPoints, d => `${d.model.name}-${d.turn_number}`)
375
+ .join('path')
376
+ .attr('class', 'data-point data-point-star')
377
+ .attr('d', d => starPath(
378
+ xScale(d.turn_number),
379
+ yScale(d.avg_output_tokens),
380
+ 5, 2.2
381
+ ))
382
+ .attr('fill', d => d.model.color)
383
+ .attr('stroke', 'var(--surface-bg, white)')
384
+ .attr('stroke-width', 0.6)
385
+ .on('mouseenter', (event, d) => showTooltip(event, d, d.model))
386
+ .on('mousemove', (event, d) => showTooltip(event, d, d.model))
387
+ .on('mouseleave', hideTooltip);
388
+
389
+ // Legend
390
+ const legendX = innerWidth + 16;
391
+ const legendItemHeight = 20;
392
+
393
+ gLegend.selectAll('.legend-item')
394
+ .data(models, d => d.name)
395
+ .join('g')
396
+ .attr('class', d => `legend-item ${hiddenModels.has(d.name) ? 'dimmed' : ''}`)
397
+ .attr('transform', (d, i) => `translate(${legendX}, ${i * legendItemHeight})`)
398
+ .each(function(d) {
399
+ const g = d3.select(this);
400
+ g.selectAll('*').remove();
401
+
402
+ // Line segment
403
+ g.append('line')
404
+ .attr('class', 'legend-line')
405
+ .attr('x1', 0)
406
+ .attr('x2', 20)
407
+ .attr('y1', 0)
408
+ .attr('y2', 0)
409
+ .attr('stroke', d.color)
410
+ .attr('stroke-width', 1.5)
411
+ .attr('stroke-dasharray', d.is_open ? '4,2' : 'none');
412
+
413
+ // Marker - circle for closed, star for open
414
+ if (d.is_open) {
415
+ g.append('path')
416
+ .attr('class', 'legend-marker')
417
+ .attr('d', starPath(10, 0, 5, 2.2))
418
+ .attr('fill', d.color);
419
+ } else {
420
+ g.append('circle')
421
+ .attr('class', 'legend-marker')
422
+ .attr('cx', 10)
423
+ .attr('cy', 0)
424
+ .attr('r', 3)
425
+ .attr('fill', d.color);
426
+ }
427
+
428
+ g.append('text')
429
+ .attr('class', 'legend-text')
430
+ .attr('x', 26)
431
+ .attr('y', 4)
432
+ .text(d.name);
433
+
434
+ g.style('cursor', 'pointer')
435
+ .on('click', () => toggleModel(d.name));
436
+ });
437
+
438
+ // Legend note about line styles
439
+ const noteY = models.length * legendItemHeight + 12;
440
+ gLegend.selectAll('.legend-note')
441
+ .data([0])
442
+ .join('text')
443
+ .attr('class', 'legend-note')
444
+ .attr('x', legendX)
445
+ .attr('y', noteY)
446
+ .attr('font-size', '10px')
447
+ .attr('fill', 'var(--muted-color)')
448
+ .text('Solid = Closed, Dashed = Open');
449
+ }
450
+
451
+ // Initialize
452
+ fetch(DATA_URL, { cache: 'no-cache' })
453
+ .then(r => r.json())
454
+ .then(json => {
455
+ data = json;
456
+ render();
457
+ })
458
+ .catch(err => {
459
+ const pre = document.createElement('pre');
460
+ pre.style.color = 'red';
461
+ pre.style.padding = '16px';
462
+ pre.textContent = `Error loading data: ${err.message}`;
463
+ container.appendChild(pre);
464
+ });
465
+
466
+ // Resize handling
467
+ if (window.ResizeObserver) {
468
+ new ResizeObserver(() => render()).observe(container);
469
+ } else {
470
+ window.addEventListener('resize', render);
471
+ }
472
+
473
+ // Theme change handling
474
+ const observer = new MutationObserver(() => render());
475
+ observer.observe(document.documentElement, {
476
+ attributes: true,
477
+ attributeFilter: ['data-theme']
478
+ });
479
+ };
480
+
481
+ if (document.readyState === 'loading') {
482
+ document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
483
+ } else {
484
+ ensureD3(bootstrap);
485
+ }
486
+ })();
487
+ </script>
app/src/styles/_layout.css CHANGED
@@ -195,4 +195,30 @@
195
  width: 100%;
196
  min-width: 0;
197
  }
198
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  width: 100%;
196
  min-width: 0;
197
  }
198
+ }
199
+
200
+ /* ============================================================================ */
201
+ /* Bibliography/References - hide inline sections that should be in footer */
202
+ /* ---------------------------------------------------------------------------- */
203
+ /* References sections with data-built-refs are generated per-chapter by
204
+ rehype-citation. The Footer.astro script consolidates them into the footer.
205
+ These styles ensure inline refs don't display in the main content area. */
206
+
207
+ /* References in main content should not be displayed - they belong in footer.
208
+ The Footer.astro script moves them; this CSS is a visual safeguard. */
209
+ main [data-built-refs],
210
+ main #references:not(.footer-processed),
211
+ main section.references:not(ol),
212
+ main div.references:not(ol),
213
+ main .bibliography:not(ol) {
214
+ /* Collapse to zero height to prevent layout impact, but keep in DOM for JS */
215
+ max-height: 0;
216
+ overflow: hidden;
217
+ margin: 0 !important;
218
+ padding: 0 !important;
219
+ border: none !important;
220
+ opacity: 0;
221
+ pointer-events: none;
222
+ }
223
+
224
+ /* Once moved to footer, these styles don't apply (not inside main) */
bibliography_fix.md ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bibliography System Fix for Multi-Chapter MDX Articles
2
+
3
+ This document describes changes made to the [Research Article Template](https://huggingface.co/spaces/tfrere/research-article-template) to fix bibliography/references placement when using multiple MDX chapter files.
4
+
5
+ ## The Problem
6
+
7
+ When an article is split into multiple MDX chapter files (e.g., `introduction.mdx`, `results.mdx`, etc.) that are imported into a main `article.mdx`, the bibliography appears at the end of each chapter instead of consolidated in the footer.
8
+
9
+ ### Root Cause
10
+
11
+ Astro compiles each MDX file independently through the remark/rehype pipeline. The `rehype-citation` plugin appends a `<section id="references">` to the end of **every** MDX file that contains citations.
12
+
13
+ This causes two issues:
14
+ 1. **Duplicate IDs**: Multiple `<section id="references">` elements (invalid HTML)
15
+ 2. **Scattered bibliographies**: References appear after each chapter instead of once at the end
16
+
17
+ ### Original Template Behavior
18
+
19
+ The original `Footer.astro` only looked for the **first** references section using `findFirstOutsideFooter()`. This worked for single-file articles but failed for multi-chapter structures.
20
+
21
+ ## The Solution
22
+
23
+ A three-phase approach: build-time marking, runtime consolidation, and a CSS display fallback.
24
+
25
+ ### Phase 1: Build-Time (post-citation.mjs)
26
+
27
+ Mark ALL references sections so they can be found at runtime.
28
+
29
+ ### Phase 2: Runtime (Footer.astro)
30
+
31
+ Consolidate all marked sections into the footer, merging list items and removing duplicates.
32
+
33
+ ### Phase 3: CSS Fallback (_layout.css)
34
+
35
+ Hide any unconsolidated sections as a visual safety net.
36
+
37
+ ---
38
+
39
+ ## Changes Made
40
+
41
+ ### 1. `app/plugins/rehype/post-citation.mjs`
42
+
43
+ **Change**: Find and process ALL references sections, not just the first one.
44
+
45
+ ```javascript
46
+ // BEFORE: Only found first section
47
+ const findReferencesRoot = () => {
48
+ let found = null;
49
+ walk(tree, null, (node) => {
50
+ if (found) return; // <-- Stopped after first match
51
+ // ...
52
+ });
53
+ return found;
54
+ };
55
+
56
+ // AFTER: Find ALL sections
57
+ const findAllReferencesRoots = () => {
58
+ const found = [];
59
+ walk(tree, null, (node) => {
60
+ if (!isElement(node)) return;
61
+ const id = getAttr(node, 'id');
62
+ if (id === 'references' || hasClass(node, 'references') || hasClass(node, 'bibliography')) {
63
+ if (!found.includes(node)) {
64
+ found.push(node);
65
+ }
66
+ }
67
+ });
68
+ return found;
69
+ };
70
+ ```
71
+
72
+ **Change**: Process all sections in a loop and mark each with `data-built-refs`.
73
+
74
+ ```javascript
75
+ // BEFORE: Single section processing
76
+ const refsRoot = findReferencesRoot();
77
+ if (refsRoot) {
78
+ // ... process single section
79
+ setAttr(refsRoot, 'data-built-refs', '1');
80
+ }
81
+
82
+ // AFTER: Loop through all sections
83
+ const allRefsRoots = findAllReferencesRoots();
84
+ for (const refsRoot of allRefsRoots) {
85
+ // ... process each section
86
+ setAttr(refsRoot, 'data-built-refs', '1');
87
+ }
88
+ ```
89
+
90
+ ---
91
+
92
+ ### 2. `app/src/components/Footer.astro`
93
+
94
+ **Change**: Add `[data-built-refs]` to selector list (was missing).
95
+
96
+ ```javascript
97
+ // BEFORE: Missing the data attribute selector
98
+ const allRefsEls = findAllOutsideFooter([
99
+ "#bibliography-references-list",
100
+ "[data-bibliography-block]", // <-- This doesn't exist
101
+ "#references",
102
+ // ...
103
+ ]);
104
+
105
+ // AFTER: Added data-built-refs and improved selector order
106
+ const allRefsEls = findAllOutsideFooter([
107
+ "[data-built-refs]", // <-- Added: what post-citation.mjs actually sets
108
+ "[data-bibliography-block]",
109
+ "#bibliography-references-list",
110
+ "section#references",
111
+ "div#references",
112
+ "#refs",
113
+ ".references:not(ol)",
114
+ ".bibliography",
115
+ ]);
116
+ ```
117
+
118
+ **Change**: Improved duplicate detection with CSS.escape fallback.
119
+
120
+ ```javascript
121
+ // BEFORE: Could fail if CSS.escape unavailable or ID has special chars
122
+ if (!itemId || !targetOl.querySelector(`#${CSS.escape(itemId)}`)) {
123
+ targetOl.appendChild(item);
124
+ }
125
+
126
+ // AFTER: Robust fallback
127
+ if (itemId) {
128
+ try {
129
+ const escapedId = CSS.escape ? CSS.escape(itemId) : itemId.replace(/([^\w-])/g, '\\$1');
130
+ if (targetOl.querySelector(`#${escapedId}`)) {
131
+ return; // Skip duplicate
132
+ }
133
+ } catch (e) {
134
+ // Manual check if selector fails
135
+ const existing = Array.from(targetOl.querySelectorAll('li')).find(li => li.id === itemId);
136
+ if (existing) return;
137
+ }
138
+ }
139
+ targetOl.appendChild(item);
140
+ ```
141
+
142
+ **Change**: Added MutationObserver to catch dynamically rendered content.
143
+
144
+ ```javascript
145
+ // Watch for dynamically added content (e.g., lazy-loaded components)
146
+ const observer = new MutationObserver((mutations) => {
147
+ if (footer.dataset.processed !== "true") {
148
+ attemptMove();
149
+ } else {
150
+ // Check if any new references sections were added
151
+ for (const mutation of mutations) {
152
+ for (const node of mutation.addedNodes) {
153
+ if (node.nodeType === 1) {
154
+ const el = node;
155
+ if (
156
+ el.id === "references" ||
157
+ el.classList?.contains("references") ||
158
+ el.hasAttribute?.("data-built-refs")
159
+ ) {
160
+ footer.dataset.processed = "false";
161
+ attemptMove();
162
+ return;
163
+ }
164
+ }
165
+ }
166
+ }
167
+ }
168
+ });
169
+
170
+ if (contentRoot) {
171
+ observer.observe(contentRoot, { childList: true, subtree: true });
172
+ }
173
+
174
+ // Stop observing after page is fully loaded
175
+ window.addEventListener("load", () => {
176
+ setTimeout(() => observer.disconnect(), 2000);
177
+ }, { once: true });
178
+ ```
179
+
180
+ ---
181
+
182
+ ### 3. `app/src/styles/_layout.css`
183
+
184
+ **Change**: Added CSS to hide any inline references sections that weren't consolidated.
185
+
186
+ ```css
187
+ /* Bibliography/References - hide inline sections that should be in footer */
188
+ /* These styles ensure inline refs don't display in the main content area. */
189
+
190
+ main [data-built-refs],
191
+ main #references:not(.footer-processed),
192
+ main section.references:not(ol),
193
+ main div.references:not(ol),
194
+ main .bibliography:not(ol) {
195
+ /* Collapse to zero height to prevent layout impact, but keep in DOM for JS */
196
+ max-height: 0;
197
+ overflow: hidden;
198
+ margin: 0 !important;
199
+ padding: 0 !important;
200
+ border: none !important;
201
+ opacity: 0;
202
+ pointer-events: none;
203
+ }
204
+ ```
205
+
206
+ ---
207
+
208
+ ## How It Works Now
209
+
210
+ 1. **Build time**: Each MDX chapter is compiled. `rehype-citation` adds a bibliography section to each. `post-citation.mjs` marks ALL of them with `data-built-refs="1"`.
211
+
212
+ 2. **Page load**: `Footer.astro` JavaScript runs:
213
+ - Finds all elements with `[data-built-refs]` or other bibliography selectors
214
+ - Moves the first section to the footer
215
+ - Extracts `<li>` items from subsequent sections and appends to the consolidated list
216
+ - Skips duplicates (same ID)
217
+ - Removes empty leftover sections
218
+
219
+ 3. **Visual fallback**: CSS hides any sections that might remain in the main content (timing edge cases).
220
+
221
+ ---
222
+
223
+ ## Testing
224
+
225
+ 1. Run `npm run dev` and open the article
226
+ 2. Scroll to the footer - all references should appear there
227
+ 3. Open browser dev tools:
228
+ - Search for `data-built-refs` - should only exist in footer
229
+ - Check that no `#references` sections remain in `<main>`
230
+ 4. Click citation links - should scroll to footer references
231
+
232
+ ---
233
+
234
+ ## Files Modified
235
+
236
+ | File | Change |
237
+ |------|--------|
238
+ | `app/plugins/rehype/post-citation.mjs` | Find and mark ALL references sections |
239
+ | `app/src/components/Footer.astro` | Improved selectors, robust deduplication, MutationObserver |
240
+ | `app/src/styles/_layout.css` | CSS fallback to hide unconsolidated sections |
241
+
242
+ ---
243
+
244
+ ## Upstream Contribution
245
+
246
+ These changes could be contributed back to the original template. The fix is backward-compatible:
247
+ - Single-file articles work exactly as before
248
+ - Multi-chapter articles now work correctly
249
+ - No configuration changes needed
interactive-charts.md CHANGED
@@ -409,9 +409,11 @@ For frameless embedding (like the banner):
409
  | 3 | `confidence_distribution.json` | Grouped histogram | Done (confidence-distribution.html) |
410
  | 4 | `score_vs_failed_guesses.json` | Scatter | TODO |
411
  | 5 | `excess_caution.json` | Box plot | TODO |
 
412
  | 6 | `caution_vs_failed_guesses.json` | Scatter | Done (caution-vs-failed-guesses.html) |
413
  | 7 | `by_rule.json` | Strip plot | Done (by-rule.html) |
414
  | 8 | `complexity_analysis.json` | Heatmap | Done (complexity-analysis.html) |
 
415
 
416
  ## Testing
417
 
 
409
  | 3 | `confidence_distribution.json` | Grouped histogram | Done (confidence-distribution.html) |
410
  | 4 | `score_vs_failed_guesses.json` | Scatter | TODO |
411
  | 5 | `excess_caution.json` | Box plot | TODO |
412
+ | 5b | `tokens_by_turn.json` | Multi-line | Done (tokens-by-turn.html) |
413
  | 6 | `caution_vs_failed_guesses.json` | Scatter | Done (caution-vs-failed-guesses.html) |
414
  | 7 | `by_rule.json` | Strip plot | Done (by-rule.html) |
415
  | 8 | `complexity_analysis.json` | Heatmap | Done (complexity-analysis.html) |
416
+ | 9 | `complexity_ratio.json` | Horizontal dot plot | Done (complexity-ratio.html) |
417
 
418
  ## Testing
419