Spaces:
Running
Running
Improved charts
Browse files- app/plugins/rehype/post-citation.mjs +22 -10
- app/src/components/Footer.astro +115 -7
- app/src/content/assets/data/basic_metrics.csv +2 -2
- app/src/content/assets/data/complexity_ratio.json +3 -0
- app/src/content/assets/data/complexity_ratio.png +3 -0
- app/src/content/assets/data/overall_performance.json +1 -1
- app/src/content/assets/data/overall_performance.png +2 -2
- app/src/content/assets/data/score_vs_recklessness.json +3 -0
- app/src/content/assets/data/score_vs_recklessness.png +3 -0
- app/src/content/assets/data/summary.txt +65 -11
- app/src/content/assets/data/tokens_by_turn.json +3 -0
- app/src/content/assets/data/tokens_by_turn.png +3 -0
- app/src/content/chapters/eleusis/appendix.mdx +0 -32
- app/src/content/chapters/eleusis/benchmark.mdx +13 -8
- app/src/content/chapters/eleusis/introduction.mdx +4 -4
- app/src/content/chapters/eleusis/results.mdx +106 -25
- app/src/content/embeds/banner-bar-chart.html +356 -0
- app/src/content/embeds/banner.html +200 -113
- app/src/content/embeds/complexity-ratio.html +484 -0
- app/src/content/embeds/score-vs-recklessness.html +443 -0
- app/src/content/embeds/tokens-by-turn.html +487 -0
- app/src/styles/_layout.css +27 -1
- bibliography_fix.md +249 -0
- interactive-charts.md +2 -0
app/plugins/rehype/post-citation.mjs
CHANGED
|
@@ -299,20 +299,28 @@ export default function rehypeReferencesAndFootnotes() {
|
|
| 299 |
}
|
| 300 |
};
|
| 301 |
|
| 302 |
-
// Find references
|
| 303 |
-
const
|
| 304 |
-
|
| 305 |
walk(tree, null, (node) => {
|
| 306 |
-
if (found) return;
|
| 307 |
if (!isElement(node)) return;
|
| 308 |
const id = getAttr(node, 'id');
|
| 309 |
if (id === 'references' || hasClass(node, 'references') || hasClass(node, 'bibliography')) {
|
| 310 |
-
found
|
|
|
|
|
|
|
|
|
|
| 311 |
}
|
| 312 |
});
|
| 313 |
return found;
|
| 314 |
};
|
| 315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
const toOrderedList = (container) => {
|
| 317 |
// If there is already an <ol>, use it; otherwise convert common structures
|
| 318 |
let ol = getChildren(container).find((c) => isElement(c) && c.tagName === 'ol');
|
|
@@ -340,15 +348,18 @@ export default function rehypeReferencesAndFootnotes() {
|
|
| 340 |
return ol;
|
| 341 |
};
|
| 342 |
|
| 343 |
-
|
| 344 |
-
|
|
|
|
| 345 |
const refIdSet = new Set();
|
| 346 |
const refIdToExternalHref = new Map();
|
| 347 |
|
| 348 |
-
|
| 349 |
-
|
|
|
|
|
|
|
| 350 |
// Collect item ids and linkify their content
|
| 351 |
-
for (const li of getChildren(
|
| 352 |
if (!isElement(li) || li.tagName !== 'li') continue;
|
| 353 |
if (!getAttr(li, 'id')) {
|
| 354 |
// Try to find a nested element with id to promote
|
|
@@ -380,6 +391,7 @@ export default function rehypeReferencesAndFootnotes() {
|
|
| 380 |
if (externalHref) refIdToExternalHref.set(String(id), externalHref);
|
| 381 |
}
|
| 382 |
}
|
|
|
|
| 383 |
setAttr(refsRoot, 'data-built-refs', '1');
|
| 384 |
}
|
| 385 |
|
|
|
|
| 299 |
}
|
| 300 |
};
|
| 301 |
|
| 302 |
+
// Find ALL references containers (there may be multiple from different MDX imports)
|
| 303 |
+
const findAllReferencesRoots = () => {
|
| 304 |
+
const found = [];
|
| 305 |
walk(tree, null, (node) => {
|
|
|
|
| 306 |
if (!isElement(node)) return;
|
| 307 |
const id = getAttr(node, 'id');
|
| 308 |
if (id === 'references' || hasClass(node, 'references') || hasClass(node, 'bibliography')) {
|
| 309 |
+
// Don't add if already found (shouldn't happen but be safe)
|
| 310 |
+
if (!found.includes(node)) {
|
| 311 |
+
found.push(node);
|
| 312 |
+
}
|
| 313 |
}
|
| 314 |
});
|
| 315 |
return found;
|
| 316 |
};
|
| 317 |
|
| 318 |
+
// Legacy function for backwards compatibility
|
| 319 |
+
const findReferencesRoot = () => {
|
| 320 |
+
const all = findAllReferencesRoots();
|
| 321 |
+
return all.length > 0 ? all[0] : null;
|
| 322 |
+
};
|
| 323 |
+
|
| 324 |
const toOrderedList = (container) => {
|
| 325 |
// If there is already an <ol>, use it; otherwise convert common structures
|
| 326 |
let ol = getChildren(container).find((c) => isElement(c) && c.tagName === 'ol');
|
|
|
|
| 348 |
return ol;
|
| 349 |
};
|
| 350 |
|
| 351 |
+
// Process ALL references sections (there may be multiple from different MDX imports)
|
| 352 |
+
const allRefsRoots = findAllReferencesRoots();
|
| 353 |
+
let refsOl = null; // Keep track of the first one for backlink processing
|
| 354 |
const refIdSet = new Set();
|
| 355 |
const refIdToExternalHref = new Map();
|
| 356 |
|
| 357 |
+
for (const refsRoot of allRefsRoots) {
|
| 358 |
+
const currentOl = toOrderedList(refsRoot);
|
| 359 |
+
if (!refsOl) refsOl = currentOl; // Use first ol for backlinks
|
| 360 |
+
|
| 361 |
// Collect item ids and linkify their content
|
| 362 |
+
for (const li of getChildren(currentOl)) {
|
| 363 |
if (!isElement(li) || li.tagName !== 'li') continue;
|
| 364 |
if (!getAttr(li, 'id')) {
|
| 365 |
// Try to find a nested element with id to promote
|
|
|
|
| 391 |
if (externalHref) refIdToExternalHref.set(String(id), externalHref);
|
| 392 |
}
|
| 393 |
}
|
| 394 |
+
// Mark each references section so Footer.astro can find them all
|
| 395 |
setAttr(refsRoot, 'data-built-refs', '1');
|
| 396 |
}
|
| 397 |
|
app/src/components/Footer.astro
CHANGED
|
@@ -142,15 +142,82 @@ const { citationText, bibtex, licence, doi } = Astro.props as Props;
|
|
| 142 |
return null;
|
| 143 |
};
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
"[data-bibliography-block]",
|
| 148 |
-
"#references",
|
|
|
|
|
|
|
| 149 |
"#refs",
|
| 150 |
-
".references",
|
| 151 |
".bibliography",
|
| 152 |
]);
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
// Try multiple selectors for footnotes
|
| 155 |
const footnotesEl = findFirstOutsideFooter([
|
| 156 |
"[data-built-footnotes]",
|
|
@@ -159,7 +226,6 @@ const { citationText, bibtex, licence, doi } = Astro.props as Props;
|
|
| 159 |
"div.footnotes",
|
| 160 |
]);
|
| 161 |
|
| 162 |
-
const movedRefs = moveIntoFooter(referencesEl, "References");
|
| 163 |
const movedNotes = moveIntoFooter(footnotesEl, "Footnotes");
|
| 164 |
|
| 165 |
if (movedRefs || movedNotes) {
|
|
@@ -196,8 +262,50 @@ const { citationText, bibtex, licence, doi } = Astro.props as Props;
|
|
| 196 |
// Final attempt after a short delay
|
| 197 |
setTimeout(attemptMove, 300);
|
| 198 |
|
| 199 |
-
//
|
| 200 |
-
//
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
})();
|
| 202 |
</script>
|
| 203 |
|
|
|
|
| 142 |
return null;
|
| 143 |
};
|
| 144 |
|
| 145 |
+
// Find ALL references/bibliography sections and consolidate them
|
| 146 |
+
const findAllOutsideFooter = (selectors) => {
|
| 147 |
+
const results = [];
|
| 148 |
+
const searchRoots = [contentRoot, document.body].filter(Boolean);
|
| 149 |
+
for (const root of searchRoots) {
|
| 150 |
+
for (const sel of selectors) {
|
| 151 |
+
const els = root.querySelectorAll(sel);
|
| 152 |
+
els.forEach(el => {
|
| 153 |
+
if (el && !footer.contains(el) && !results.includes(el)) {
|
| 154 |
+
results.push(el);
|
| 155 |
+
}
|
| 156 |
+
});
|
| 157 |
+
}
|
| 158 |
+
}
|
| 159 |
+
return results;
|
| 160 |
+
};
|
| 161 |
+
|
| 162 |
+
// Find all bibliography/references sections
|
| 163 |
+
// Note: post-citation.mjs adds data-built-refs="1" to processed sections
|
| 164 |
+
// We use multiple selectors to catch different formats, prioritizing data attributes
|
| 165 |
+
// over IDs (since duplicate IDs are invalid HTML and have undefined querySelector behavior)
|
| 166 |
+
const allRefsEls = findAllOutsideFooter([
|
| 167 |
+
"[data-built-refs]",
|
| 168 |
"[data-bibliography-block]",
|
| 169 |
+
"#bibliography-references-list",
|
| 170 |
+
"section#references",
|
| 171 |
+
"div#references",
|
| 172 |
"#refs",
|
| 173 |
+
".references:not(ol)",
|
| 174 |
".bibliography",
|
| 175 |
]);
|
| 176 |
|
| 177 |
+
// Consolidate multiple bibliography sections into one
|
| 178 |
+
let movedRefs = false;
|
| 179 |
+
if (allRefsEls.length > 0) {
|
| 180 |
+
// Move the first one normally
|
| 181 |
+
movedRefs = moveIntoFooter(allRefsEls[0], "References");
|
| 182 |
+
|
| 183 |
+
// For additional bibliography sections, merge their list items into the first one
|
| 184 |
+
if (allRefsEls.length > 1) {
|
| 185 |
+
// Find the target ol - it's now inside the moved element within target
|
| 186 |
+
const targetOl = target.querySelector("ol.references") || target.querySelector("ol");
|
| 187 |
+
|
| 188 |
+
for (let i = 1; i < allRefsEls.length; i++) {
|
| 189 |
+
const extraEl = allRefsEls[i];
|
| 190 |
+
// Find ol inside the extra section (could be nested)
|
| 191 |
+
const extraOl = extraEl.querySelector("ol.references") || extraEl.querySelector("ol");
|
| 192 |
+
|
| 193 |
+
if (extraOl && targetOl) {
|
| 194 |
+
// Move all list items from extra bibliography to the consolidated one
|
| 195 |
+
const items = Array.from(extraOl.querySelectorAll(":scope > li"));
|
| 196 |
+
items.forEach(item => {
|
| 197 |
+
// Check if this reference already exists (by id) to avoid duplicates
|
| 198 |
+
const itemId = item.id;
|
| 199 |
+
if (itemId) {
|
| 200 |
+
// Use try-catch since CSS.escape might not be available in all browsers
|
| 201 |
+
try {
|
| 202 |
+
const escapedId = CSS.escape ? CSS.escape(itemId) : itemId.replace(/([^\w-])/g, '\\$1');
|
| 203 |
+
if (targetOl.querySelector(`#${escapedId}`)) {
|
| 204 |
+
return; // Skip duplicate
|
| 205 |
+
}
|
| 206 |
+
} catch (e) {
|
| 207 |
+
// If selector fails, check manually
|
| 208 |
+
const existing = Array.from(targetOl.querySelectorAll('li')).find(li => li.id === itemId);
|
| 209 |
+
if (existing) return;
|
| 210 |
+
}
|
| 211 |
+
}
|
| 212 |
+
targetOl.appendChild(item);
|
| 213 |
+
});
|
| 214 |
+
}
|
| 215 |
+
// Remove the now-empty extra bibliography section from the DOM
|
| 216 |
+
extraEl.remove();
|
| 217 |
+
}
|
| 218 |
+
}
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
// Try multiple selectors for footnotes
|
| 222 |
const footnotesEl = findFirstOutsideFooter([
|
| 223 |
"[data-built-footnotes]",
|
|
|
|
| 226 |
"div.footnotes",
|
| 227 |
]);
|
| 228 |
|
|
|
|
| 229 |
const movedNotes = moveIntoFooter(footnotesEl, "Footnotes");
|
| 230 |
|
| 231 |
if (movedRefs || movedNotes) {
|
|
|
|
| 262 |
// Final attempt after a short delay
|
| 263 |
setTimeout(attemptMove, 300);
|
| 264 |
|
| 265 |
+
// Watch for dynamically added content (e.g., lazy-loaded components)
|
| 266 |
+
// This catches references sections that might be added after initial render
|
| 267 |
+
const observer = new MutationObserver((mutations) => {
|
| 268 |
+
// Only re-run if we haven't fully processed yet or new ref sections appeared
|
| 269 |
+
if (footer.dataset.processed !== "true") {
|
| 270 |
+
attemptMove();
|
| 271 |
+
} else {
|
| 272 |
+
// Check if any new references sections were added
|
| 273 |
+
for (const mutation of mutations) {
|
| 274 |
+
for (const node of mutation.addedNodes) {
|
| 275 |
+
if (node.nodeType === 1) { // Element node
|
| 276 |
+
const el = node;
|
| 277 |
+
if (
|
| 278 |
+
el.id === "references" ||
|
| 279 |
+
el.classList?.contains("references") ||
|
| 280 |
+
el.classList?.contains("bibliography") ||
|
| 281 |
+
el.hasAttribute?.("data-built-refs")
|
| 282 |
+
) {
|
| 283 |
+
// Reset processed flag and re-consolidate
|
| 284 |
+
footer.dataset.processed = "false";
|
| 285 |
+
attemptMove();
|
| 286 |
+
return;
|
| 287 |
+
}
|
| 288 |
+
// Also check for nested references
|
| 289 |
+
if (el.querySelector?.("[data-built-refs], #references, .references, .bibliography")) {
|
| 290 |
+
footer.dataset.processed = "false";
|
| 291 |
+
attemptMove();
|
| 292 |
+
return;
|
| 293 |
+
}
|
| 294 |
+
}
|
| 295 |
+
}
|
| 296 |
+
}
|
| 297 |
+
}
|
| 298 |
+
});
|
| 299 |
+
|
| 300 |
+
// Observe the main content area for changes
|
| 301 |
+
if (contentRoot) {
|
| 302 |
+
observer.observe(contentRoot, { childList: true, subtree: true });
|
| 303 |
+
}
|
| 304 |
+
|
| 305 |
+
// Stop observing after page is fully loaded + a delay
|
| 306 |
+
window.addEventListener("load", () => {
|
| 307 |
+
setTimeout(() => observer.disconnect(), 2000);
|
| 308 |
+
}, { once: true });
|
| 309 |
})();
|
| 310 |
</script>
|
| 311 |
|
app/src/content/assets/data/basic_metrics.csv
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:847fb061c6643d04446b69249d9c56ba67ea1b502013fc57ff71366d36978a23
|
| 3 |
+
size 2817
|
app/src/content/assets/data/complexity_ratio.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3a59aeba80d14b977f47948d2fcfbd818685df06b033c0b3bb6ee889ae976ab4
|
| 3 |
+
size 2386
|
app/src/content/assets/data/complexity_ratio.png
ADDED
|
Git LFS Details
|
app/src/content/assets/data/overall_performance.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2391
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5e079335d1cf6f5c53df229031920c15561847b7e65476ba93f6526669df8a8
|
| 3 |
size 2391
|
app/src/content/assets/data/overall_performance.png
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
app/src/content/assets/data/score_vs_recklessness.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6fb4ed9d7a1296dd45431123b58fb52c0d5224a8cc4113cb1b53eab95b8fb610
|
| 3 |
+
size 3251
|
app/src/content/assets/data/score_vs_recklessness.png
ADDED
|
Git LFS Details
|
app/src/content/assets/data/summary.txt
CHANGED
|
@@ -25,17 +25,17 @@ Loaded colors for 17 models
|
|
| 25 |
BASIC MODEL COMPARISON
|
| 26 |
============================================================
|
| 27 |
|
| 28 |
-
model rounds_played total_score avg_score total_floored_score avg_floored_score total_turns total_output_tokens total_wall_clock avg_failed_guesses success_rate total_no_stakes_score avg_no_stakes_score avg_output_tokens_per_turn wall_clock_per_turn intra_rule_variance inter_rule_variance variance_ratio
|
| 29 |
-
Claude Opus 4.5 78 1128 14.461538 1324 16.974359
|
| 30 |
-
Kimi K2 78 804 10.307692 1262 16.179487
|
| 31 |
-
Grok 4 1 Fast Reasoning 78 737 9.448718 1182 15.153846
|
| 32 |
-
Gpt 5.2 High 78 1158 14.846154 1174 15.051282
|
| 33 |
-
Gpt 5 Mini Medium 78 942 12.076923 1052 13.487179
|
| 34 |
-
Deepseek R1 78 511 6.551282 1036 13.282051
|
| 35 |
-
Gemini 3 Flash Preview Low 78 817 10.474359 1024 13.128205
|
| 36 |
-
Gpt Oss 120B 78 580 7.435897 1004 12.871795
|
| 37 |
-
Gpt Oss 20B 78 131 1.679487 927 11.884615
|
| 38 |
-
Claude Haiku 4.5 78 -37 -0.474359 894 11.461538
|
| 39 |
|
| 40 |
Saved: results/260121_78_rounds/basic_metrics.csv
|
| 41 |
Saved: results/260121_78_rounds/overall_performance.png
|
|
@@ -130,6 +130,8 @@ Saved: results/260121_78_rounds/excess_caution.png
|
|
| 130 |
Saved: results/260121_78_rounds/excess_caution.json
|
| 131 |
Saved: results/260121_78_rounds/caution_vs_failed_guesses.png
|
| 132 |
Saved: results/260121_78_rounds/caution_vs_failed_guesses.json
|
|
|
|
|
|
|
| 133 |
|
| 134 |
============================================================
|
| 135 |
RECKLESS GUESSING ANALYSIS
|
|
@@ -169,6 +171,58 @@ Longest streak: 8 consecutive wrong guesses
|
|
| 169 |
Saved: results/260121_78_rounds/reckless_guessing.png
|
| 170 |
Saved: results/260121_78_rounds/reckless_guessing.json
|
| 171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
============================================================
|
| 173 |
PER-MODEL REPORTS
|
| 174 |
============================================================
|
|
|
|
| 25 |
BASIC MODEL COMPARISON
|
| 26 |
============================================================
|
| 27 |
|
| 28 |
+
model rounds_played total_score avg_score total_floored_score avg_floored_score total_turns total_output_tokens total_wall_clock avg_failed_guesses success_rate counting_output_tokens total_no_stakes_score avg_no_stakes_score avg_output_tokens_per_turn wall_clock_per_turn intra_rule_variance inter_rule_variance variance_ratio
|
| 29 |
+
Claude Opus 4.5 78 1128 14.461538 1324 16.974359 756 4333716 86367.64 2.000000 0.833333 3430535 1598.0 20.487179 4537.744709 114.242910 25.000000 81.385983 0.307178
|
| 30 |
+
Kimi K2 78 804 10.307692 1262 16.179487 801 12281540 101346.76 2.038462 0.769231 5918992 1481.0 18.987179 7389.503121 126.525293 25.538462 88.446496 0.288745
|
| 31 |
+
Grok 4 1 Fast Reasoning 78 737 9.448718 1182 15.153846 795 8178655 120364.22 2.564103 0.717949 4559832 1441.0 18.474359 5735.637736 151.401535 25.243590 106.499829 0.237029
|
| 32 |
+
Gpt 5.2 High 78 1158 14.846154 1174 15.051282 1195 3341037 73525.83 0.282051 0.948718 3232254 1505.0 19.294872 2704.815063 61.527891 24.628205 36.601709 0.672870
|
| 33 |
+
Gpt 5 Mini Medium 78 942 12.076923 1052 13.487179 1163 3618399 58345.97 1.166667 0.705128 2998454 1325.0 16.987179 2578.206363 50.168504 39.141026 82.882051 0.472250
|
| 34 |
+
Deepseek R1 78 511 6.551282 1036 13.282051 851 9229131 165334.16 3.192308 0.641026 5944454 1331.0 17.064103 6985.257344 194.282209 29.628205 115.135043 0.257334
|
| 35 |
+
Gemini 3 Flash Preview Low 78 817 10.474359 1024 13.128205 1207 1581524 12702.02 0.961538 0.705128 1389850 1226.0 15.717949 1151.491301 10.523629 29.923077 83.049573 0.360304
|
| 36 |
+
Gpt Oss 120B 78 580 7.435897 1004 12.871795 1041 3190828 24633.15 2.153846 0.679487 2250622 1279.0 16.397436 2161.980788 23.662968 46.692308 78.676239 0.593474
|
| 37 |
+
Gpt Oss 20B 78 131 1.679487 927 11.884615 972 7009392 62397.50 2.974359 0.589744 3234713 1206.0 15.461538 3327.894033 64.194959 47.576923 88.239487 0.539180
|
| 38 |
+
Claude Haiku 4.5 78 -37 -0.474359 894 11.461538 848 6973411 57734.39 3.948718 0.564103 4053200 1198.0 15.358974 4779.716981 68.083007 45.102564 107.387350 0.419999
|
| 39 |
|
| 40 |
Saved: results/260121_78_rounds/basic_metrics.csv
|
| 41 |
Saved: results/260121_78_rounds/overall_performance.png
|
|
|
|
| 130 |
Saved: results/260121_78_rounds/excess_caution.json
|
| 131 |
Saved: results/260121_78_rounds/caution_vs_failed_guesses.png
|
| 132 |
Saved: results/260121_78_rounds/caution_vs_failed_guesses.json
|
| 133 |
+
Saved: results/260121_78_rounds/score_vs_recklessness.png
|
| 134 |
+
Saved: results/260121_78_rounds/score_vs_recklessness.json
|
| 135 |
|
| 136 |
============================================================
|
| 137 |
RECKLESS GUESSING ANALYSIS
|
|
|
|
| 171 |
Saved: results/260121_78_rounds/reckless_guessing.png
|
| 172 |
Saved: results/260121_78_rounds/reckless_guessing.json
|
| 173 |
|
| 174 |
+
============================================================
|
| 175 |
+
COMPLEXITY RATIO ANALYSIS
|
| 176 |
+
============================================================
|
| 177 |
+
|
| 178 |
+
Analyzed 9634 tentative rules with confidence >= 5
|
| 179 |
+
Using optimal k = 0.420 for aggregated complexity
|
| 180 |
+
|
| 181 |
+
Complexity Ratio by Model:
|
| 182 |
+
(Ratio = Tentative Complexity / Actual Complexity)
|
| 183 |
+
|
| 184 |
+
Model Median Q25 Q75 Count
|
| 185 |
+
Gpt Oss 120B 1.322 0.873 2.355 1182
|
| 186 |
+
Gpt Oss 20B 1.155 0.782 2.065 1219
|
| 187 |
+
Claude Haiku 4.5 1.054 0.736 2.000 1001
|
| 188 |
+
Deepseek R1 1.000 0.762 1.756 933
|
| 189 |
+
Gemini 3 Flash Preview Low 1.000 0.781 1.519 1016
|
| 190 |
+
Gpt 5 Mini Medium 1.000 0.765 1.664 939
|
| 191 |
+
Gpt 5.2 High 1.000 0.791 1.187 857
|
| 192 |
+
Grok 4 1 Fast Reasoning 1.000 0.777 1.657 938
|
| 193 |
+
Claude Opus 4.5 0.984 0.707 1.169 664
|
| 194 |
+
Kimi K2 0.976 0.622 1.275 885
|
| 195 |
+
|
| 196 |
+
Interpretation:
|
| 197 |
+
- Ratio > 1: Model tends to overcomplicate rules
|
| 198 |
+
- Ratio < 1: Model tends to oversimplify rules
|
| 199 |
+
- Ratio ≈ 1: Model matches actual rule complexity
|
| 200 |
+
|
| 201 |
+
Highest median: Gpt Oss 120B (1.322)
|
| 202 |
+
Lowest median: Kimi K2 (0.976)
|
| 203 |
+
|
| 204 |
+
Saved: results/260121_78_rounds/complexity_ratio.png
|
| 205 |
+
Saved: results/260121_78_rounds/complexity_ratio.json
|
| 206 |
+
|
| 207 |
+
============================================================
|
| 208 |
+
OUTPUT TOKENS BY TURN
|
| 209 |
+
============================================================
|
| 210 |
+
|
| 211 |
+
Saved: results/260121_78_rounds/tokens_by_turn.png
|
| 212 |
+
Saved: results/260121_78_rounds/tokens_by_turn.json
|
| 213 |
+
|
| 214 |
+
Tokens trend summary (early vs late turns):
|
| 215 |
+
Claude Haiku 4.5: early=3191, late=5889 (+84.5%)
|
| 216 |
+
Claude Opus 4.5: early=2649, late=8447 (+218.9%)
|
| 217 |
+
Deepseek R1: early=5083, late=10946 (+115.3%)
|
| 218 |
+
Gemini 3 Flash Preview Low: early=1046, late=1351 (+29.1%)
|
| 219 |
+
Gpt 5 Mini Medium: early=1241, late=4862 (+291.9%)
|
| 220 |
+
Gpt 5.2 High: early=963, late=5910 (+514.0%)
|
| 221 |
+
Gpt Oss 120B: early=1050, late=4475 (+326.2%)
|
| 222 |
+
Gpt Oss 20B: early=1744, late=7789 (+346.6%)
|
| 223 |
+
Grok 4 1 Fast Reasoning: early=2810, late=17827 (+534.4%)
|
| 224 |
+
Kimi K2: early=5545, late=10653 (+92.1%)
|
| 225 |
+
|
| 226 |
============================================================
|
| 227 |
PER-MODEL REPORTS
|
| 228 |
============================================================
|
app/src/content/assets/data/tokens_by_turn.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ceb3f9cc62ed081b7c59ff9f58903166d3d267ab7f3ad73143b41682301dddd9
|
| 3 |
+
size 39913
|
app/src/content/assets/data/tokens_by_turn.png
ADDED
|
Git LFS Details
|
app/src/content/chapters/eleusis/appendix.mdx
CHANGED
|
@@ -1,40 +1,8 @@
|
|
| 1 |
-
import Accordion from "../../../components/Accordion.astro";
|
| 2 |
import Note from "../../../components/Note.astro";
|
| 3 |
import Sidenote from "../../../components/Sidenote.astro";
|
| 4 |
|
| 5 |
## Appendix: Detailed Methods
|
| 6 |
|
| 7 |
-
### Models Evaluated
|
| 8 |
-
|
| 9 |
-
<Accordion title="Model configurations" open>
|
| 10 |
-
|
| 11 |
-
We evaluated 10 models across 5 providers:
|
| 12 |
-
|
| 13 |
-
| Model | Provider | Type |
|
| 14 |
-
|-------|----------|------|
|
| 15 |
-
| Claude Opus 4.5 | Anthropic | Proprietary |
|
| 16 |
-
| Claude Haiku 4.5 | Anthropic | Proprietary |
|
| 17 |
-
| GPT 5.2 High | OpenAI | Proprietary |
|
| 18 |
-
| GPT 5 Mini Medium | OpenAI | Proprietary |
|
| 19 |
-
| Gemini 3 Flash Preview Low | Google | Proprietary |
|
| 20 |
-
| Grok 4.1 Fast Reasoning | xAI | Proprietary |
|
| 21 |
-
| Kimi K2 | Moonshot (via HF) | Open weights |
|
| 22 |
-
| DeepSeek R1 | DeepSeek (via HF) | Open weights |
|
| 23 |
-
| GPT OSS 120B | Community (via HF) | Open weights |
|
| 24 |
-
| GPT OSS 20B | Community (via HF) | Open weights |
|
| 25 |
-
|
| 26 |
-
All models were evaluated with the following settings:
|
| 27 |
-
|
| 28 |
-
| Parameter | Value |
|
| 29 |
-
|-----------|-------|
|
| 30 |
-
| Temperature | 0.7 |
|
| 31 |
-
| Max tokens | 16384 |
|
| 32 |
-
| Retries | 3 (on API failures) |
|
| 33 |
-
|
| 34 |
-
Reasoning models were allowed their default reasoning budgets.
|
| 35 |
-
|
| 36 |
-
</Accordion>
|
| 37 |
-
|
| 38 |
### Rule Checking
|
| 39 |
|
| 40 |
|
|
|
|
|
|
|
| 1 |
import Note from "../../../components/Note.astro";
|
| 2 |
import Sidenote from "../../../components/Sidenote.astro";
|
| 3 |
|
| 4 |
## Appendix: Detailed Methods
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
### Rule Checking
|
| 7 |
|
| 8 |
|
app/src/content/chapters/eleusis/benchmark.mdx
CHANGED
|
@@ -4,21 +4,19 @@ import Sidenote from "../../../components/Sidenote.astro";
|
|
| 4 |
|
| 5 |
### The Original Game
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
Players take turns playing cards from their hand onto a central "mainline." If a card satisfies the secret rule, the dealer accepts it and it is added to the mainline. If it violates the rule, it's rejected and placed in a "sideline" below the mainline at that position. Over time, the pattern of accepted and rejected cards provides evidence about the hidden rule.
|
| 10 |
|
| 11 |
<Sidenote>
|
| 12 |
The name "Eleusis" comes from the ancient Greek mystery cult, where initiates gradually discovered hidden truths.
|
| 13 |
</Sidenote>
|
| 14 |
|
| 15 |
-
|
| 16 |
|
| 17 |
### Our Adaptation
|
| 18 |
|
| 19 |
We adapted Eleusis into a single-player benchmark focused purely on the scientific reasoning process. By removing multi-player dynamics, we isolate the core challenge: hypothesis formation and testing under uncertainty.
|
| 20 |
|
| 21 |
-
The game uses a standard 52-card deck with ranks 1–13 (Ace through King) and four suits.
|
| 22 |
|
| 23 |
On each turn, the player selects a card from their hand to play. If the card satisfies the secret rule, it joins the mainline; if rejected, it's placed in a sideline below the mainline at that position. While playing a card, the player may attempt to guess the rule. The game continues until the player correctly identifies the rule or reaches 30 turns.
|
| 24 |
|
|
@@ -31,7 +29,7 @@ For instance, a player who correctly identifies the rule on turn 13 with no wron
|
|
| 31 |
This creates an interesting tension: guessing early yields more points if correct, but wrong guesses are costly. The optimal strategy requires accurately assessing one's own confidence and acting accordingly.
|
| 32 |
|
| 33 |
### Rule Library
|
| 34 |
-
In the original game, the dealer
|
| 35 |
|
| 36 |
Here are some example rules from our library, with a tentative categorization:
|
| 37 |
|
|
@@ -57,7 +55,8 @@ The model is free to reason, but it is asked to output a structured response con
|
|
| 57 |
4. **Confidence level**: A self-reported probability (0–10 scale, where 7 means "I estimate 70% chance my tentative rule is correct");
|
| 58 |
5. **Guess decision**: Whether to formally try to guess the rule this turn, or not.
|
| 59 |
|
| 60 |
-
Example output
|
|
|
|
| 61 |
```
|
| 62 |
{
|
| 63 |
"reasoning_summary": "To test if the rule depends on rank, I play a 4♣ (same rank as the starter 4♠) hoping to see if same-rank cards are accepted.",
|
|
@@ -68,6 +67,12 @@ Example output
|
|
| 68 |
}
|
| 69 |
```
|
| 70 |
|
| 71 |
-
**This structure lets us analyze not just whether models succeed, but *how* they reason:**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
|
|
|
|
| 4 |
|
| 5 |
### The Original Game
|
| 6 |
|
| 7 |
+
To recap the core mechanics: players take turns playing cards onto a central "mainline." If a card satisfies the secret rule, it is accepted; otherwise it's rejected and placed in a "sideline" below that position. At any point, a player can attempt to guess the rule—correctly identifying it ends the game, but a wrong guess incurs a penalty.
|
|
|
|
|
|
|
| 8 |
|
| 9 |
<Sidenote>
|
| 10 |
The name "Eleusis" comes from the ancient Greek mystery cult, where initiates gradually discovered hidden truths.
|
| 11 |
</Sidenote>
|
| 12 |
|
| 13 |
+
The scoring system rewards efficiency: discovering the rule quickly earns more points, while wrong guesses are penalized.
|
| 14 |
|
| 15 |
### Our Adaptation
|
| 16 |
|
| 17 |
We adapted Eleusis into a single-player benchmark focused purely on the scientific reasoning process. By removing multi-player dynamics, we isolate the core challenge: hypothesis formation and testing under uncertainty.
|
| 18 |
|
| 19 |
+
The game uses a standard 52-card deck with ranks 1–13 (Ace through King) and four suits. The secret rule is a deterministic function of the card being played and the current mainline sequence. The player maintains a hand of 12 cards, drawing a replacement after each play.
|
| 20 |
|
| 21 |
On each turn, the player selects a card from their hand to play. If the card satisfies the secret rule, it joins the mainline; if rejected, it's placed in a sideline below the mainline at that position. While playing a card, the player may attempt to guess the rule. The game continues until the player correctly identifies the rule or reaches 30 turns.
|
| 22 |
|
|
|
|
| 29 |
This creates an interesting tension: guessing early yields more points if correct, but wrong guesses are costly. The optimal strategy requires accurately assessing one's own confidence and acting accordingly.
|
| 30 |
|
| 31 |
### Rule Library
|
| 32 |
+
In the original game, the dealer invents a secret rule on the spot. For benchmarking LLMs, we need a fixed set of rules to ensure comparability across runs. We created a library of 26 hand-crafted rules designed to cover the space of rule types (static properties, sequential dependencies, cyclic patterns) while remaining tractable to evaluate. Some rules involve simply card properties (e.g., "only red cards"), while others depend on the sequence of previously accepted cards (e.g., "card rank must be higher than previous card"). The rule might involve rank, suits, color or a combination thereof, and may include positional dependencies.
|
| 33 |
|
| 34 |
Here are some example rules from our library, with a tentative categorization:
|
| 35 |
|
|
|
|
| 55 |
4. **Confidence level**: A self-reported probability (0–10 scale, where 7 means "I estimate 70% chance my tentative rule is correct");
|
| 56 |
5. **Guess decision**: Whether to formally try to guess the rule this turn, or not.
|
| 57 |
|
| 58 |
+
#### Example output
|
| 59 |
+
|
| 60 |
```
|
| 61 |
{
|
| 62 |
"reasoning_summary": "To test if the rule depends on rank, I play a 4♣ (same rank as the starter 4♠) hoping to see if same-rank cards are accepted.",
|
|
|
|
| 67 |
}
|
| 68 |
```
|
| 69 |
|
| 70 |
+
**This structure lets us analyze not just whether models succeed, but *how* they reason:**
|
| 71 |
+
|
| 72 |
+
- Do they update hypotheses appropriately when evidence contradicts them?
|
| 73 |
+
- Do they explore strategically or play conservatively?
|
| 74 |
+
- Is their stated confidence calibrated to their actual accuracy?
|
| 75 |
+
|
| 76 |
+
Forcing the model to articulate a tentative rule and confidence level (even when not formally guessing) allows us to secretly evaluate it at every turn—useful for measuring calibration.
|
| 77 |
|
| 78 |
|
app/src/content/chapters/eleusis/introduction.mdx
CHANGED
|
@@ -3,17 +3,17 @@ import Image from "../../../components/Image.astro";
|
|
| 3 |
|
| 4 |
import exampleSequence from "../../assets/image/example_sequence.png";
|
| 5 |
|
| 6 |
-
Large language models are increasingly being deployed as tools for scientific research
|
| 7 |
|
| 8 |
<Sidenote>
|
| 9 |
Read time: 15–20 minutes.
|
| 10 |
</Sidenote>
|
| 11 |
|
| 12 |
-
Most reasoning benchmarks test whether models can solve well-defined problems
|
| 13 |
|
| 14 |
First, real scientific reasoning is not a single inference step. It's an iterative agentic process of observation, hypothesis formation, experimentation, and refinement, often spanning many cycles before reaching a conclusion. It requires not just logical ability, but also *strategic thinking*: which experiment to run next, how much evidence is enough, when to commit to a theory versus when to keep exploring.
|
| 15 |
|
| 16 |
-
|
| 17 |
|
| 18 |
We wanted to test whether LLMs can exhibit these deeper aspects of scientific reasoning. To do this, we turned to an unlikely source: a 1950s card game called Eleusis.
|
| 19 |
|
|
@@ -33,4 +33,4 @@ Eleusis was designed by @abbott1977eleusis explicitly to simulate the process of
|
|
| 33 |
|
| 34 |
We built a benchmark around Eleusis to evaluate LLMs on this iterative, hypothesis-driven reasoning. Rather than testing knowledge retrieval or instruction-following, our benchmark asks: *can models act like scientists?* Can they observe evidence, form hypotheses, design informative experiments, and refine their theories? Can they calibrate their confidence appropriately and know when they've gathered enough evidence to commit to a conclusion?
|
| 35 |
|
| 36 |
-
These skills
|
|
|
|
| 3 |
|
| 4 |
import exampleSequence from "../../assets/image/example_sequence.png";
|
| 5 |
|
| 6 |
+
Large language models are increasingly being deployed as tools for scientific research: analyzing data, generating hypotheses, and even designing experiments. But how well do they actually embody the scientific method?
|
| 7 |
|
| 8 |
<Sidenote>
|
| 9 |
Read time: 15–20 minutes.
|
| 10 |
</Sidenote>
|
| 11 |
|
| 12 |
+
Most reasoning benchmarks test whether models can solve well-defined problems with clear solutions. The ARC challenge [@chollet2019measure], for instance, evaluates inductive reasoning on visual patterns. **These benchmarks capture important capabilities, but they miss something fundamental about how science actually works.**
|
| 13 |
|
| 14 |
First, real scientific reasoning is not a single inference step. It's an iterative agentic process of observation, hypothesis formation, experimentation, and refinement, often spanning many cycles before reaching a conclusion. It requires not just logical ability, but also *strategic thinking*: which experiment to run next, how much evidence is enough, when to commit to a theory versus when to keep exploring.
|
| 15 |
|
| 16 |
+
Moreover, effective science depends on psychological factors that are rarely evaluated: **calibration** (does my confidence match my actual accuracy?) [@lichtenstein1977calibration], **metacognition** (how certain am I about my uncertainty?) [@flavell1979metacognition], and resistance to **cognitive biases** like confirmation bias (seeking only evidence that supports my current hypothesis instead of trying to challenge it) [@nickerson1998confirmation]. A scientist who is brilliant at deduction but overconfident in weak theories will waste resources pursuing dead ends. One who is well-calibrated but overly cautious may never publish.
|
| 17 |
|
| 18 |
We wanted to test whether LLMs can exhibit these deeper aspects of scientific reasoning. To do this, we turned to an unlikely source: a 1950s card game called Eleusis.
|
| 19 |
|
|
|
|
| 33 |
|
| 34 |
We built a benchmark around Eleusis to evaluate LLMs on this iterative, hypothesis-driven reasoning. Rather than testing knowledge retrieval or instruction-following, our benchmark asks: *can models act like scientists?* Can they observe evidence, form hypotheses, design informative experiments, and refine their theories? Can they calibrate their confidence appropriately and know when they've gathered enough evidence to commit to a conclusion?
|
| 35 |
|
| 36 |
+
These skills matter beyond the laboratory: debugging code, diagnosing patients, and navigating everyday uncertainty all require the same iterative process of hypothesis and test.
|
app/src/content/chapters/eleusis/results.mdx
CHANGED
|
@@ -6,9 +6,28 @@ import HtmlEmbed from "../../../components/HtmlEmbed.astro";
|
|
| 6 |
|
| 7 |
## 2. Results
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
### Overall Performance
|
| 10 |
|
| 11 |
-
|
| 12 |
|
| 13 |
<HtmlEmbed
|
| 14 |
src="overall-performance.html"
|
|
@@ -19,15 +38,15 @@ We evaluated ten models on the Eleusis benchmark, including both proprietary and
|
|
| 19 |
|
| 20 |
Performance varies dramatically among tested models.
|
| 21 |
|
| 22 |
-
* **Claude Opus 4.5** achieves top performance with 17.0
|
| 23 |
|
| 24 |
-
* **GPT 5.2 High** and **Grok 4.1 Fast Reasoning** show
|
| 25 |
|
| 26 |
-
* **GPT
|
| 27 |
|
| 28 |
-
* Finally, **GPT
|
| 29 |
|
| 30 |
-
As
|
| 31 |
|
| 32 |
### Pure discovery versus metacognition
|
| 33 |
|
|
@@ -47,7 +66,7 @@ Even if using this alternative scoring does not change a lot the relative rankin
|
|
| 47 |
* GPT 5.2 High and Claude Haiku 4.5 are the two models with the largest difference between raw and no-stakes scores (more than 4), suggesting they are the most penalized by wrong guesses or delayed guessing.
|
| 48 |
* On the other hand, Gemini 3 Flash Preview Low and Kimi K2 have the smallest difference (less than 3) and benefit the least from this alternative scoring, indicating a better balance between discovery and metacognition.
|
| 49 |
|
| 50 |
-
|
| 51 |
1. The model is reckless and makes a lot of wrong guesses, incurring penalties.
|
| 52 |
2. The model is too cautious and waits too long before guessing, missing out on points.
|
| 53 |
|
|
@@ -67,14 +86,28 @@ To estimate caution, we can compute on average **how many turns a model waits wh
|
|
| 67 |
src="caution-vs-failed-guesses.html"
|
| 68 |
caption="<strong>Figure 3:</strong> The caution-recklessness trade-off. Models in the upper-left are cautious (delay correct guesses); models in the lower-right are reckless (many failed guesses). The ideal position is lower-left: quick to commit when right, rarely wrong."
|
| 69 |
id="fig-caution-reckless"
|
|
|
|
| 70 |
/>
|
| 71 |
|
| 72 |
-
How should we interpret
|
| 73 |
|
| 74 |
-
|
|
|
|
|
|
|
| 75 |
|
| 76 |
To try to understand deeper the causes of recklessness and caution, we now turn to an analysis of confidence and guessing strategies.
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
### Confidence and Calibration
|
| 79 |
|
| 80 |
Models are asked to output their confidence level, with clear instructions on what it means (7 = 70% probability of being correct, etc.). Even when they don't guess, they report their tentative rule. When confidence ≥5, we test whether they would have guessed correctly, even if they didn't formally attempt to do so. **This allows us to evaluate calibration: does reported confidence match actual accuracy?** This is particularly relevant as modern neural networks have been shown to be poorly calibrated [@guo2017calibration].
|
|
@@ -83,17 +116,18 @@ Models are asked to output their confidence level, with clear instructions on wh
|
|
| 83 |
src="calibration-curves.html"
|
| 84 |
caption="<strong>Figure 4:</strong> Calibration curves for each model (for reported confidence ≥5). A perfectly calibrated model would follow the diagonal. Points below the line indicate overconfidence: they correspond to confidence levels where actual success rates are lower than reported. Click legend items to show/hide models."
|
| 85 |
id="fig-calibration"
|
|
|
|
| 86 |
/>
|
| 87 |
|
| 88 |
The calibration analysis reveals several patterns:
|
| 89 |
|
| 90 |
-
- **All models are
|
| 91 |
-
- GPT 5.2 is the best
|
| 92 |
-
- Even
|
| 93 |
|
| 94 |
-
Is overconfidence a problem
|
| 95 |
|
| 96 |
-
**For a perfectly calibrated model**,
|
| 97 |
|
| 98 |
For this, we can look at how often models guess at each reported confidence level. This is shown in the following figure. For each confidence level (from 5 to 10), we compute the guess rate: the fraction of turns the model actually attempts to guess when reporting that confidence.
|
| 99 |
|
|
@@ -102,16 +136,37 @@ For this, we can look at how often models guess at each reported confidence leve
|
|
| 102 |
src="guess-rate.html"
|
| 103 |
caption="<strong>Figure 5:</strong> Guess rate per confidence level. The optimal decision theoretic curve for a perfectly calibrated model should be a step at 67%. Click legend items to show/hide models."
|
| 104 |
id="fig-confidence"
|
|
|
|
| 105 |
/>
|
| 106 |
|
| 107 |
Once again, we observe significant differences from one model to another. Grok 4.1 and Gemini 3 will essentially only guess when very confident (9 or 10). Most other models will also often guess at confidence levels above 8 and rarely below. The two Claude models show different behaviors: Claude Opus 4.5 tends to guess more aggressively at confidence level 8, while Claude Haiku 4.5 often guesses even at confidence level 7.
|
| 108 |
|
| 109 |
-
|
| 110 |
|
| 111 |
-
This is particularly true for Gemini 3 Flash Preview Low which is very cautious, guessing only 1/3 of the time at reported confidence 9
|
| 112 |
|
| 113 |
The case of GPT 5.2 High is different: it is both fairly well calibrated and very cautious, leading to very few failed guesses but a high opportunity cost due to delayed guessing. This suggests that GPT 5.2 High could improve its performance by being more aggressive in guessing once it has a correct tentative rule, especially at confidence level 8.
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
### Performance by Rule Complexity
|
| 117 |
|
|
@@ -143,20 +198,46 @@ The following plot breaks down the success rate of each model per complexity qua
|
|
| 143 |
/>
|
| 144 |
|
| 145 |
|
| 146 |
-
Interestingly, code complexity (as measured by our combination of cyclomatic complexity and AST node count) doesn't perfectly predict difficulty, as semantic concepts also play a role.
|
| 147 |
|
| 148 |
-
|
|
|
|
|
|
|
| 149 |
|
| 150 |
-
An interesting test: are symmetric rules equally difficult? For example, "only spades" vs "only non-spades" should be logically equivalent in difficulty, but models might have biases.
|
| 151 |
-
For instance average score on "only spades" is 25, but "no spades" is 20.
|
| 152 |
|
| 153 |
### Complexity of rules produced
|
| 154 |
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
-
TODO : Backup this with examples from logs and "guess complexity" vs "actual complexity".
|
| 159 |
|
|
|
|
| 160 |
|
| 161 |
-
|
| 162 |
-
We have observed qualitative evidence of model producing overfit rules that explain all observations so far, but fail to generalize. For instance if all accepted cards so far are red, and happens to be only number cards (simply because no red face card has been tried yet), the model may hypothesize "only red number cards" rather than the simpler "only red cards."
|
|
|
|
| 6 |
|
| 7 |
## 2. Results
|
| 8 |
|
| 9 |
+
### Models Evaluated
|
| 10 |
+
|
| 11 |
+
We evaluated ten frontier models from six labs, including both proprietary and open-weight models. Open-weight models were accessed via Hugging Face inference providers. Several models offer configurable reasoning levels, which we indicate when applicable.
|
| 12 |
+
|
| 13 |
+
| Model | Lab | Provider | Reasoning |
|
| 14 |
+
|-------|-----|----------|-----------|
|
| 15 |
+
| Claude Opus 4.5 | Anthropic | Anthropic | 16000 tok. |
|
| 16 |
+
| Claude Haiku 4.5 | Anthropic | Anthropic | 16000 tok. |
|
| 17 |
+
| GPT 5.2 | OpenAI | OpenAI | High |
|
| 18 |
+
| GPT 5 Mini | OpenAI | OpenAI | Medium |
|
| 19 |
+
| Gemini 3 Flash Preview | Google | Google | Low |
|
| 20 |
+
| Grok 4.1 | xAI | xAI | Fast |
|
| 21 |
+
| Kimi K2 Thinking | Moonshot | 🤗 Inference providers | — |
|
| 22 |
+
| DeepSeek R1 | DeepSeek | 🤗 Inference providers | — |
|
| 23 |
+
| GPT OSS 120B | Community | 🤗 Inference providers | — |
|
| 24 |
+
| GPT OSS 20B | Community | 🤗 Inference providers | — |
|
| 25 |
+
|
| 26 |
+
All models were evaluated with temperature 0.7 and max tokens of 16,384. Reasoning models were allowed their default reasoning budgets. Each model played 78 rounds (26 rules × 3 seeds).
|
| 27 |
+
|
| 28 |
### Overall Performance
|
| 29 |
|
| 30 |
+
Performance is measured as the average score per round. We also report token usage (output + reasoning) per turn to compare efficiency.
|
| 31 |
|
| 32 |
<HtmlEmbed
|
| 33 |
src="overall-performance.html"
|
|
|
|
| 38 |
|
| 39 |
Performance varies dramatically among tested models.
|
| 40 |
|
| 41 |
+
* **Claude Opus 4.5** achieves top performance with a score of 17.0 and moderate token usage. The open-weight model **Kimi K2** comes second at 16.2, performing competitively with the best proprietary models, but at the cost of a larger reasoning budget.
|
| 42 |
|
| 43 |
+
* **GPT 5.2 High** and **Grok 4.1 Fast Reasoning** show similar performance around 15, but GPT 5.2 High is twice as token-efficient.
|
| 44 |
|
| 45 |
+
* **GPT 5 Mini Medium**, **GPT OSS 120B**, and **Gemini 3 Flash Preview Low** cluster in the mid-tier (around 13) with low token usage. **DeepSeek R1**, an open-weight model specialized for reasoning tasks, achieves a similar score but with a much larger token count.
|
| 46 |
|
| 47 |
+
* Finally, **GPT OSS 20B** and **Claude Haiku 4.5** lag behind, scoring between 11 and 12 with moderate token usage.
|
| 48 |
|
| 49 |
+
As mentioned earlier, this score reflects not only the model's ability to find the correct rule, but also its metacognitive skills: knowing when to commit, how confident to be, and how to balance exploration versus exploitation. To distinguish these factors, we computed an alternative "no-stakes" score that removes penalties for wrong guesses and counts tentative rules as guesses.
|
| 50 |
|
| 51 |
### Pure discovery versus metacognition
|
| 52 |
|
|
|
|
| 66 |
* GPT 5.2 High and Claude Haiku 4.5 are the two models with the largest difference between raw and no-stakes scores (more than 4), suggesting they are the most penalized by wrong guesses or delayed guessing.
|
| 67 |
* On the other hand, Gemini 3 Flash Preview Low and Kimi K2 have the smallest difference (less than 3) and benefit the least from this alternative scoring, indicating a better balance between discovery and metacognition.
|
| 68 |
|
| 69 |
+
There are two possible reasons for the gap between raw and no-stakes scores:
|
| 70 |
1. The model is reckless and makes a lot of wrong guesses, incurring penalties.
|
| 71 |
2. The model is too cautious and waits too long before guessing, missing out on points.
|
| 72 |
|
|
|
|
| 86 |
src="caution-vs-failed-guesses.html"
|
| 87 |
caption="<strong>Figure 3:</strong> The caution-recklessness trade-off. Models in the upper-left are cautious (delay correct guesses); models in the lower-right are reckless (many failed guesses). The ideal position is lower-left: quick to commit when right, rarely wrong."
|
| 88 |
id="fig-caution-reckless"
|
| 89 |
+
wide
|
| 90 |
/>
|
| 91 |
|
| 92 |
+
How should we interpret these values? A failed guess costs 2 points, while each turn of delay costs 1 point, so the optimal number of failed guesses per round should be around 0.5 (one failed guess every two rounds) to balance both sources of loss. Most models exceed this threshold, indicating **a clear tendency towards recklessness**. This is confirmed by low caution values: most models wait around 1 turn or less on average before guessing when they have the correct rule.
|
| 93 |
|
| 94 |
+
**GPT 5.2 High stands out** with very few failed guesses (0.28 per round) but high caution—waiting 3.5 turns on average before guessing when it has the correct rule.
|
| 95 |
+
|
| 96 |
+
Gemini 3 Flash Preview Low and GPT 5 Mini Medium occupy an intermediate position. Gemini achieves a better balance, losing on average 2 points to caution and 2 points to recklessness (about one failed guess per round).
|
| 97 |
|
| 98 |
To try to understand deeper the causes of recklessness and caution, we now turn to an analysis of confidence and guessing strategies.
|
| 99 |
|
| 100 |
+
A way to summarize this behavior is to compute a **boldness index** as the difference between the points lost by being reckless (failed guesses) and the points lost by being cautious (delayed correct guesses). A positive value indicates more loss due to recklessness, while a negative value indicates more loss due to caution. This is reported in the following chart.
|
| 101 |
+
|
| 102 |
+
<HtmlEmbed
|
| 103 |
+
src="score-vs-recklessness.html"
|
| 104 |
+
caption="<strong>Figure 3b:</strong> Score vs Boldness Index. The boldness index combines failed guesses and caution into a single metric (2 × failed guesses − caution). Models in the center have a decision strategy that balances recklessness and caution. Models on the left are losing points because of their excessive caution, while models on the right are losing points because of their recklessness."
|
| 105 |
+
id="fig-score-recklessness"
|
| 106 |
+
wide
|
| 107 |
+
/>
|
| 108 |
+
|
| 109 |
+
A way to understand this chart is in terms of missed opportunity. Models in the center achieve a good balance between recklessness and caution, minimizing lost points. They perform at the best permitted by their inductive abilities. Models on the left are too cautious, missing out on points by delaying correct guesses. At identical inductive ability, they could improve their score by guessing earlier. Models on the right are too reckless, losing points from frequent wrong guesses. At identical inductive ability, they could improve their score by being more cautious and guessing less often.
|
| 110 |
+
|
| 111 |
### Confidence and Calibration
|
| 112 |
|
| 113 |
Models are asked to output their confidence level, with clear instructions on what it means (7 = 70% probability of being correct, etc.). Even when they don't guess, they report their tentative rule. When confidence ≥5, we test whether they would have guessed correctly, even if they didn't formally attempt to do so. **This allows us to evaluate calibration: does reported confidence match actual accuracy?** This is particularly relevant as modern neural networks have been shown to be poorly calibrated [@guo2017calibration].
|
|
|
|
| 116 |
src="calibration-curves.html"
|
| 117 |
caption="<strong>Figure 4:</strong> Calibration curves for each model (for reported confidence ≥5). A perfectly calibrated model would follow the diagonal. Points below the line indicate overconfidence: they correspond to confidence levels where actual success rates are lower than reported. Click legend items to show/hide models."
|
| 118 |
id="fig-calibration"
|
| 119 |
+
wide
|
| 120 |
/>
|
| 121 |
|
| 122 |
The calibration analysis reveals several patterns:
|
| 123 |
|
| 124 |
+
- **All models are overconfident**: when they report 80% confidence, their actual success rates are often closer to 20%.
|
| 125 |
+
- GPT 5.2 High is the best-calibrated model overall, staying closest to the diagonal, though still slightly overconfident.
|
| 126 |
+
- Even strong performers like Claude Opus 4.5 and Kimi K2 show significant overconfidence.
|
| 127 |
|
| 128 |
+
Is overconfidence a problem? In our setting, not necessarily—it depends on how the model acts on it.
|
| 129 |
|
| 130 |
+
**For a perfectly calibrated model**, since the expected loss from a failed guess is twice the opportunity cost of waiting one turn, **the optimal confidence threshold for guessing is 0.67** (guess when you believe your tentative rule has at least a 67% chance of being correct). But do models follow such a strategy?
|
| 131 |
|
| 132 |
For this, we can look at how often models guess at each reported confidence level. This is shown in the following figure. For each confidence level (from 5 to 10), we compute the guess rate: the fraction of turns the model actually attempts to guess when reporting that confidence.
|
| 133 |
|
|
|
|
| 136 |
src="guess-rate.html"
|
| 137 |
caption="<strong>Figure 5:</strong> Guess rate per confidence level. The optimal decision theoretic curve for a perfectly calibrated model should be a step at 67%. Click legend items to show/hide models."
|
| 138 |
id="fig-confidence"
|
| 139 |
+
wide
|
| 140 |
/>
|
| 141 |
|
| 142 |
Once again, we observe significant differences from one model to another. Grok 4.1 and Gemini 3 will essentially only guess when very confident (9 or 10). Most other models will also often guess at confidence levels above 8 and rarely below. The two Claude models show different behaviors: Claude Opus 4.5 tends to guess more aggressively at confidence level 8, while Claude Haiku 4.5 often guesses even at confidence level 7.
|
| 143 |
|
| 144 |
+
**Models are on average more cautious than the optimal decision-theoretic strategy** for a perfectly calibrated model, which would guess as soon as confidence exceeds 67%. This actually benefits them, given their overconfidence: **by raising the threshold for guessing, they reduce the risk of wrong guesses and compensate for their poor calibration.**
|
| 145 |
|
| 146 |
+
This is particularly true for Gemini 3 Flash Preview Low, which is very cautious, guessing only 1/3 of the time at reported confidence 9. This compensates for its overconfidence and likely explains its good balance between failed guesses and lost opportunity cost—reflected in our "no-stakes" analysis by the fact that it has the smallest difference between raw and no-stakes scores.
|
| 147 |
|
| 148 |
The case of GPT 5.2 High is different: it is both fairly well calibrated and very cautious, leading to very few failed guesses but a high opportunity cost due to delayed guessing. This suggests that GPT 5.2 High could improve its performance by being more aggressive in guessing once it has a correct tentative rule, especially at confidence level 8.
|
| 149 |
|
| 150 |
+
### Reasoning effort vs turn count
|
| 151 |
+
|
| 152 |
+
To see whether models tend to think more per turn when the round is longer, we plotted the average number of output tokens per turn.
|
| 153 |
+
|
| 154 |
+
<HtmlEmbed
|
| 155 |
+
src="tokens-by-turn.html"
|
| 156 |
+
caption="<strong>Figure 5b:</strong> Average output tokens per turn across the game (in the 'no-stakes' setting, where every round lasts up to 30 turns). Each line shows how a model's reasoning effort evolves as the round progresses. Click legend items to show/hide models. Note: sample sizes decrease for later turns, as games that end early don't contribute data."
|
| 157 |
+
id="fig-tokens-by-turn"
|
| 158 |
+
wide
|
| 159 |
+
/>
|
| 160 |
+
|
| 161 |
+
The patterns reveal striking differences in how models allocate reasoning effort:
|
| 162 |
+
|
| 163 |
+
- Most models show a gradual increase in reasoning effort (token usage) as the turn number increases.
|
| 164 |
+
|
| 165 |
+
- **Grok 4.1 Fast Reasoning** stands out with dramatically increasing token usage, starting around 1,200 tokens per turn and reaching over 20,000 by turn 30. This suggests the model invests more reasoning effort as problems become harder to solve.
|
| 166 |
+
|
| 167 |
+
- **Gemini 3 Flash Preview Low** maintains remarkably flat token usage throughout, staying around 1,000-1,400 tokens regardless of turn number. This suggests a consistent, lightweight reasoning approach that doesn't scale with problem difficulty.
|
| 168 |
+
|
| 169 |
+
The general upward trend makes sense: later turns only occur in harder games where the rule hasn't been found yet, requiring more extensive reasoning. However, the magnitude of increase varies widely, from Gemini's flat profile to Grok's 15x increase.
|
| 170 |
|
| 171 |
### Performance by Rule Complexity
|
| 172 |
|
|
|
|
| 198 |
/>
|
| 199 |
|
| 200 |
|
| 201 |
+
Interestingly, code complexity (as measured by our combination of cyclomatic complexity and AST node count) doesn't perfectly predict difficulty, as semantic concepts also play a role. A rule like "only face cards" has complexity equivalent to "only A, 2 and 3", but the former is easier for models (and humans) due to familiarity with the semantic category of face cards.
|
| 202 |
|
| 203 |
+
Rules involving rare events also prove challenging. "Only aces" is harder than "only even ranks" despite being simpler, because models need more evidence to confirm it.
|
| 204 |
+
|
| 205 |
+
This raises an interesting question: are symmetric rules equally difficult? Logically, "only spades" and "no spades" should be equivalent in difficulty, but models might have biases. Indeed, the average score on "only spades" is 25, while "no spades" scores only 20.
|
| 206 |
|
|
|
|
|
|
|
| 207 |
|
| 208 |
### Complexity of rules produced
|
| 209 |
|
| 210 |
+
One common failure mode we observed is that models tend to produce overly complicated tentative rules, even though they were informed that rules are typically simple one-sentence statements. They also produce rules that fit all observed data so far, but fail to generalize to new cards because they are more complex than necessary.
|
| 211 |
+
|
| 212 |
+
As an illustration, here is an example of a tentative rule produced by Claude Haiku 4.5. The mainline state was as follows (rejected cards are shown in parentheses):
|
| 213 |
+
|
| 214 |
+
<p style={{fontStyle: 'italic', padding: '0.5em 1em', borderLeft: '3px solid var(--border-color)'}}>
|
| 215 |
+
6♠ <span style={{color: '#e53935'}}>6♦</span> 9♠ (<span style={{color: '#e53935'}}>Q♥</span>) <span style={{color: '#e53935'}}>9♦</span> (9♣) 7♠ (<span style={{color: '#e53935'}}>5♦</span>) (<span style={{color: '#e53935'}}>J♦</span>) (<span style={{color: '#e53935'}}>A♦</span>) (<span style={{color: '#e53935'}}>Q♦</span>) (<span style={{color: '#e53935'}}>2♦</span>) (<span style={{color: '#e53935'}}>4♦</span>) (<span style={{color: '#e53935'}}>9♦</span>) (8♠) (A♠) (<span style={{color: '#e53935'}}>10♥</span>) (<span style={{color: '#e53935'}}>J♦</span>) (<span style={{color: '#e53935'}}>9♥</span>) <span style={{color: '#e53935'}}>7♦</span> 9♠ (<span style={{color: '#e53935'}}>A♥</span>) (<span style={{color: '#e53935'}}>8♥</span>)
|
| 216 |
+
</p>
|
| 217 |
+
|
| 218 |
+
The actual rule was *"Rank repeats in pairs"*. The tentative rule proposed by Haiku 4.5 at this stage of the game was:
|
| 219 |
+
|
| 220 |
+
> "Odd-positioned mainline cards must be spades, even-positioned mainline cards must be diamonds. Consecutive pairs of positions must have matching ranks. Additionally, each rank (6, 7, 9) can appear only twice on the mainline, meaning position 8 must be a diamond with a rank different from 6, 7, and 9, or the pattern breaks at position 8 with new rules."
|
| 221 |
+
|
| 222 |
+
This is overly complicated compared to the actual rule, and as you can read, it contains the actual rule "Consecutive pairs of positions must have matching ranks" but adds unnecessary constraints about suits and counts that do not generalize.
|
| 223 |
+
|
| 224 |
+
To quantify this, we computed the **complexity ratio**: the complexity of the model's tentative rule divided by the actual rule complexity, using the same code-based metric described above.
|
| 225 |
+
|
| 226 |
+
<HtmlEmbed
|
| 227 |
+
src="complexity-ratio.html"
|
| 228 |
+
caption="<strong>Figure 8:</strong> Median complexity ratio of tentative rules vs actual rules. A ratio > 1 indicates the model overcomplicates (hypothesizes more complex rules than necessary); < 1 indicates oversimplification. Whiskers show interquartile range. Only tentative rules with confidence ≥ 5 are included."
|
| 229 |
+
id="fig-complexity-ratio"
|
| 230 |
+
wide
|
| 231 |
+
/>
|
| 232 |
+
|
| 233 |
+
The results reveal a clear tendency toward overcomplication among several models:
|
| 234 |
+
|
| 235 |
+
- **GPT OSS 120B and GPT OSS 20B** stand out with median ratios of 1.32 and 1.15 respectively, consistently hypothesizing more complex rules than necessary.
|
| 236 |
+
- **Claude Haiku 4.5** also tends to overcomplicate slightly (1.05) on average, but with high variance and many tentative rules being much more complex than needed.
|
| 237 |
+
- **Claude Opus 4.5, GPT 5.2 and Kimi K2** are the best calibrated, with median ratios close to 1.0 and moderate variance, suggesting they match rule complexity most accurately.
|
| 238 |
+
- Most models cluster around 1.0, indicating reasonable complexity calibration on average, but the wide interquartile ranges show substantial variation across individual games.
|
| 239 |
|
|
|
|
| 240 |
|
| 241 |
+
### Summary
|
| 242 |
|
| 243 |
+
Our evaluation reveals substantial variation in how models approach the Eleusis task. Claude Opus 4.5 leads in overall performance, followed closely by the open-weight Kimi K2. All models exhibit overconfidence—reporting higher certainty than their accuracy warrants—but they partially compensate by being more cautious than decision theory would recommend. The caution-recklessness trade-off varies dramatically: GPT 5.2 High is extremely cautious (high success rate but slow to commit), while Claude Haiku 4.5 and DeepSeek R1 are reckless (many failed guesses). Rule complexity matters, but semantic familiarity and evidence availability also influence difficulty. Finally, models tend to overcomplicate their hypotheses—particularly the open-weight GPT OSS models—while Claude Opus 4.5, GPT 5.2 High, and Kimi K2 best match actual rule complexity.
|
|
|
app/src/content/embeds/banner-bar-chart.html
ADDED
|
@@ -0,0 +1,356 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<div class="d3-banner-bar"></div>
|
| 2 |
+
<style>
|
| 3 |
+
.d3-banner-bar {
|
| 4 |
+
width: 100%;
|
| 5 |
+
margin: 10px 0;
|
| 6 |
+
position: relative;
|
| 7 |
+
font-family: system-ui, -apple-system, sans-serif;
|
| 8 |
+
}
|
| 9 |
+
|
| 10 |
+
.d3-banner-bar svg {
|
| 11 |
+
display: block;
|
| 12 |
+
width: 100%;
|
| 13 |
+
height: auto;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
.d3-banner-bar .axes path,
|
| 17 |
+
.d3-banner-bar .axes line {
|
| 18 |
+
stroke: var(--axis-color, var(--text-color));
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
.d3-banner-bar .axes text {
|
| 22 |
+
fill: var(--tick-color, var(--muted-color));
|
| 23 |
+
font-size: 12px;
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
.d3-banner-bar .grid line {
|
| 27 |
+
stroke: var(--grid-color, rgba(0,0,0,.08));
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
.d3-banner-bar .axes text.axis-label {
|
| 31 |
+
font-size: 14px;
|
| 32 |
+
font-weight: 500;
|
| 33 |
+
fill: var(--text-color);
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
.d3-banner-bar .model-label {
|
| 37 |
+
font-size: 13px;
|
| 38 |
+
font-weight: 500;
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
.d3-banner-bar .bar {
|
| 42 |
+
cursor: pointer;
|
| 43 |
+
transition: opacity 0.15s ease;
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
.d3-banner-bar .bar:hover {
|
| 47 |
+
opacity: 0.8;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
.d3-banner-bar .score-label {
|
| 51 |
+
font-size: 12px;
|
| 52 |
+
font-weight: 500;
|
| 53 |
+
fill: var(--text-color);
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
.d3-banner-bar .d3-tooltip {
|
| 57 |
+
position: absolute;
|
| 58 |
+
top: 0;
|
| 59 |
+
left: 0;
|
| 60 |
+
transform: translate(-9999px, -9999px);
|
| 61 |
+
pointer-events: none;
|
| 62 |
+
padding: 10px 12px;
|
| 63 |
+
border-radius: 8px;
|
| 64 |
+
font-size: 12px;
|
| 65 |
+
line-height: 1.4;
|
| 66 |
+
border: 1px solid var(--border-color);
|
| 67 |
+
background: var(--surface-bg);
|
| 68 |
+
color: var(--text-color);
|
| 69 |
+
box-shadow: 0 4px 24px rgba(0,0,0,.18);
|
| 70 |
+
opacity: 0;
|
| 71 |
+
transition: opacity 0.12s ease;
|
| 72 |
+
z-index: 10;
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
.d3-banner-bar .d3-tooltip .model-name {
|
| 76 |
+
font-weight: 600;
|
| 77 |
+
margin-bottom: 4px;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
.d3-banner-bar .d3-tooltip .metric {
|
| 81 |
+
display: flex;
|
| 82 |
+
justify-content: space-between;
|
| 83 |
+
gap: 16px;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
.d3-banner-bar .d3-tooltip .metric-label {
|
| 87 |
+
color: var(--muted-color);
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
.d3-banner-bar .d3-tooltip .metric-value {
|
| 91 |
+
font-weight: 500;
|
| 92 |
+
}
|
| 93 |
+
</style>
|
| 94 |
+
<script>
|
| 95 |
+
(() => {
|
| 96 |
+
const ensureD3 = (cb) => {
|
| 97 |
+
if (window.d3 && typeof window.d3.select === 'function') return cb();
|
| 98 |
+
let s = document.getElementById('d3-cdn-script');
|
| 99 |
+
if (!s) {
|
| 100 |
+
s = document.createElement('script');
|
| 101 |
+
s.id = 'd3-cdn-script';
|
| 102 |
+
s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
|
| 103 |
+
document.head.appendChild(s);
|
| 104 |
+
}
|
| 105 |
+
const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); };
|
| 106 |
+
s.addEventListener('load', onReady, { once: true });
|
| 107 |
+
if (window.d3) onReady();
|
| 108 |
+
};
|
| 109 |
+
|
| 110 |
+
const bootstrap = () => {
|
| 111 |
+
const scriptEl = document.currentScript;
|
| 112 |
+
let container = scriptEl ? scriptEl.previousElementSibling : null;
|
| 113 |
+
if (!(container && container.classList && container.classList.contains('d3-banner-bar'))) {
|
| 114 |
+
const candidates = Array.from(document.querySelectorAll('.d3-banner-bar'))
|
| 115 |
+
.filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
|
| 116 |
+
container = candidates[candidates.length - 1] || null;
|
| 117 |
+
}
|
| 118 |
+
if (!container) return;
|
| 119 |
+
if (container.dataset) {
|
| 120 |
+
if (container.dataset.mounted === 'true') return;
|
| 121 |
+
container.dataset.mounted = 'true';
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
// Tooltip setup
|
| 125 |
+
container.style.position = container.style.position || 'relative';
|
| 126 |
+
const tip = document.createElement('div');
|
| 127 |
+
tip.className = 'd3-tooltip';
|
| 128 |
+
container.appendChild(tip);
|
| 129 |
+
|
| 130 |
+
// SVG setup
|
| 131 |
+
const svg = d3.select(container).append('svg');
|
| 132 |
+
const gRoot = svg.append('g');
|
| 133 |
+
|
| 134 |
+
// Chart groups
|
| 135 |
+
const gGrid = gRoot.append('g').attr('class', 'grid');
|
| 136 |
+
const gAxes = gRoot.append('g').attr('class', 'axes');
|
| 137 |
+
const gBars = gRoot.append('g').attr('class', 'bars');
|
| 138 |
+
const gLabels = gRoot.append('g').attr('class', 'labels');
|
| 139 |
+
|
| 140 |
+
// State
|
| 141 |
+
let data = null;
|
| 142 |
+
let width = 800;
|
| 143 |
+
let height = 450;
|
| 144 |
+
const margin = { top: 20, right: 60, bottom: 40, left: 20 };
|
| 145 |
+
|
| 146 |
+
// Scales
|
| 147 |
+
const xScale = d3.scaleLinear();
|
| 148 |
+
const yScale = d3.scaleBand();
|
| 149 |
+
|
| 150 |
+
// Data loading
|
| 151 |
+
const JSON_PATHS = [
|
| 152 |
+
'/data/overall_performance.json',
|
| 153 |
+
'./assets/figures/overall_performance.json',
|
| 154 |
+
'../assets/figures/overall_performance.json',
|
| 155 |
+
'../../assets/figures/overall_performance.json'
|
| 156 |
+
];
|
| 157 |
+
|
| 158 |
+
const fetchFirstAvailable = async (paths) => {
|
| 159 |
+
for (const p of paths) {
|
| 160 |
+
try {
|
| 161 |
+
const r = await fetch(p, { cache: 'no-cache' });
|
| 162 |
+
if (r.ok) return await r.json();
|
| 163 |
+
} catch (_) {}
|
| 164 |
+
}
|
| 165 |
+
throw new Error('Data not found');
|
| 166 |
+
};
|
| 167 |
+
|
| 168 |
+
function updateSize() {
|
| 169 |
+
width = container.clientWidth || 800;
|
| 170 |
+
// Height based on number of bars (will be set after data loads)
|
| 171 |
+
const numModels = data ? data.models.length : 10;
|
| 172 |
+
const barHeight = 36;
|
| 173 |
+
height = margin.top + margin.bottom + numModels * barHeight;
|
| 174 |
+
svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
|
| 175 |
+
gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
|
| 176 |
+
return {
|
| 177 |
+
innerWidth: width - margin.left - margin.right,
|
| 178 |
+
innerHeight: height - margin.top - margin.bottom
|
| 179 |
+
};
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
function showTooltip(event, d) {
|
| 183 |
+
const rect = container.getBoundingClientRect();
|
| 184 |
+
const x = event.clientX - rect.left;
|
| 185 |
+
const y = event.clientY - rect.top;
|
| 186 |
+
|
| 187 |
+
tip.innerHTML = `
|
| 188 |
+
<div class="model-name" style="color: ${d.color}">${d.name}</div>
|
| 189 |
+
<div class="metric">
|
| 190 |
+
<span class="metric-label">Score:</span>
|
| 191 |
+
<span class="metric-value">${d.avg_floored_score.toFixed(2)}</span>
|
| 192 |
+
</div>
|
| 193 |
+
<div class="metric">
|
| 194 |
+
<span class="metric-label">Tokens/Turn:</span>
|
| 195 |
+
<span class="metric-value">${Math.round(d.avg_output_tokens_per_turn).toLocaleString()}</span>
|
| 196 |
+
</div>
|
| 197 |
+
<div class="metric">
|
| 198 |
+
<span class="metric-label">Type:</span>
|
| 199 |
+
<span class="metric-value">${d.is_open ? 'Open' : 'Closed'}</span>
|
| 200 |
+
</div>
|
| 201 |
+
`;
|
| 202 |
+
|
| 203 |
+
const tipWidth = tip.offsetWidth || 150;
|
| 204 |
+
const tipHeight = tip.offsetHeight || 80;
|
| 205 |
+
let tipX = x + 12;
|
| 206 |
+
let tipY = y - tipHeight / 2;
|
| 207 |
+
|
| 208 |
+
if (tipX + tipWidth > width) tipX = x - tipWidth - 12;
|
| 209 |
+
if (tipY < 0) tipY = 8;
|
| 210 |
+
if (tipY + tipHeight > height) tipY = height - tipHeight - 8;
|
| 211 |
+
|
| 212 |
+
tip.style.transform = `translate(${tipX}px, ${tipY}px)`;
|
| 213 |
+
tip.style.opacity = '1';
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
function hideTooltip() {
|
| 217 |
+
tip.style.opacity = '0';
|
| 218 |
+
tip.style.transform = 'translate(-9999px, -9999px)';
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
// Calculate relative luminance and return black or white for best contrast
|
| 222 |
+
function getContrastColor(hexColor) {
|
| 223 |
+
const hex = hexColor.replace('#', '');
|
| 224 |
+
const r = parseInt(hex.substr(0, 2), 16) / 255;
|
| 225 |
+
const g = parseInt(hex.substr(2, 2), 16) / 255;
|
| 226 |
+
const b = parseInt(hex.substr(4, 2), 16) / 255;
|
| 227 |
+
// Relative luminance formula
|
| 228 |
+
const luminance = 0.299 * r + 0.587 * g + 0.114 * b;
|
| 229 |
+
return luminance > 0.5 ? '#000000' : '#ffffff';
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
function render() {
|
| 233 |
+
if (!data) return;
|
| 234 |
+
|
| 235 |
+
const { innerWidth, innerHeight } = updateSize();
|
| 236 |
+
|
| 237 |
+
// Sort models by score descending
|
| 238 |
+
const models = [...data.models].sort((a, b) => b.avg_floored_score - a.avg_floored_score);
|
| 239 |
+
|
| 240 |
+
// Update scales
|
| 241 |
+
xScale
|
| 242 |
+
.domain([0, d3.max(models, d => d.avg_floored_score) * 1.05])
|
| 243 |
+
.range([0, innerWidth])
|
| 244 |
+
.nice();
|
| 245 |
+
|
| 246 |
+
yScale
|
| 247 |
+
.domain(models.map(d => d.name))
|
| 248 |
+
.range([0, innerHeight])
|
| 249 |
+
.padding(0.25);
|
| 250 |
+
|
| 251 |
+
// Grid lines (vertical)
|
| 252 |
+
const xTicks = xScale.ticks(6);
|
| 253 |
+
gGrid.selectAll('.grid-x')
|
| 254 |
+
.data(xTicks)
|
| 255 |
+
.join('line')
|
| 256 |
+
.attr('class', 'grid-x')
|
| 257 |
+
.attr('x1', d => xScale(d))
|
| 258 |
+
.attr('x2', d => xScale(d))
|
| 259 |
+
.attr('y1', 0)
|
| 260 |
+
.attr('y2', innerHeight);
|
| 261 |
+
|
| 262 |
+
// X-axis (bottom)
|
| 263 |
+
gAxes.selectAll('.x-axis')
|
| 264 |
+
.data([0])
|
| 265 |
+
.join('g')
|
| 266 |
+
.attr('class', 'x-axis')
|
| 267 |
+
.attr('transform', `translate(0,${innerHeight})`)
|
| 268 |
+
.call(d3.axisBottom(xScale).ticks(6).tickSizeOuter(0));
|
| 269 |
+
|
| 270 |
+
// X-axis label
|
| 271 |
+
gAxes.selectAll('.x-label')
|
| 272 |
+
.data([0])
|
| 273 |
+
.join('text')
|
| 274 |
+
.attr('class', 'x-label axis-label')
|
| 275 |
+
.attr('x', innerWidth / 2)
|
| 276 |
+
.attr('y', innerHeight + 34)
|
| 277 |
+
.attr('text-anchor', 'middle')
|
| 278 |
+
.text('Average Score');
|
| 279 |
+
|
| 280 |
+
// Bars
|
| 281 |
+
const barHeight = yScale.bandwidth();
|
| 282 |
+
gBars.selectAll('.bar')
|
| 283 |
+
.data(models, d => d.name)
|
| 284 |
+
.join('rect')
|
| 285 |
+
.attr('class', 'bar')
|
| 286 |
+
.attr('x', 0)
|
| 287 |
+
.attr('y', d => yScale(d.name))
|
| 288 |
+
.attr('width', d => xScale(d.avg_floored_score))
|
| 289 |
+
.attr('height', barHeight)
|
| 290 |
+
.attr('fill', d => d.color)
|
| 291 |
+
.attr('rx', 3)
|
| 292 |
+
.attr('ry', 3)
|
| 293 |
+
.on('mouseenter', showTooltip)
|
| 294 |
+
.on('mousemove', showTooltip)
|
| 295 |
+
.on('mouseleave', hideTooltip);
|
| 296 |
+
|
| 297 |
+
// Model labels (inside bars)
|
| 298 |
+
gLabels.selectAll('.model-label')
|
| 299 |
+
.data(models, d => d.name)
|
| 300 |
+
.join('text')
|
| 301 |
+
.attr('class', 'model-label')
|
| 302 |
+
.attr('x', 8)
|
| 303 |
+
.attr('y', d => yScale(d.name) + barHeight / 2)
|
| 304 |
+
.attr('dy', '0.35em')
|
| 305 |
+
.attr('text-anchor', 'start')
|
| 306 |
+
.style('fill', d => getContrastColor(d.color))
|
| 307 |
+
.text(d => d.name);
|
| 308 |
+
|
| 309 |
+
// Score labels (end of bars)
|
| 310 |
+
gLabels.selectAll('.score-label')
|
| 311 |
+
.data(models, d => d.name)
|
| 312 |
+
.join('text')
|
| 313 |
+
.attr('class', 'score-label')
|
| 314 |
+
.attr('x', d => xScale(d.avg_floored_score) + 6)
|
| 315 |
+
.attr('y', d => yScale(d.name) + barHeight / 2)
|
| 316 |
+
.attr('dy', '0.35em')
|
| 317 |
+
.attr('text-anchor', 'start')
|
| 318 |
+
.text(d => d.avg_floored_score.toFixed(1));
|
| 319 |
+
}
|
| 320 |
+
|
| 321 |
+
// Initialize
|
| 322 |
+
fetchFirstAvailable(JSON_PATHS)
|
| 323 |
+
.then(json => {
|
| 324 |
+
data = json;
|
| 325 |
+
render();
|
| 326 |
+
})
|
| 327 |
+
.catch(err => {
|
| 328 |
+
const pre = document.createElement('pre');
|
| 329 |
+
pre.style.color = 'red';
|
| 330 |
+
pre.style.padding = '16px';
|
| 331 |
+
pre.textContent = `Error loading data: ${err.message}`;
|
| 332 |
+
container.appendChild(pre);
|
| 333 |
+
});
|
| 334 |
+
|
| 335 |
+
// Resize handling
|
| 336 |
+
if (window.ResizeObserver) {
|
| 337 |
+
new ResizeObserver(() => render()).observe(container);
|
| 338 |
+
} else {
|
| 339 |
+
window.addEventListener('resize', render);
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
// Theme change handling
|
| 343 |
+
const observer = new MutationObserver(() => render());
|
| 344 |
+
observer.observe(document.documentElement, {
|
| 345 |
+
attributes: true,
|
| 346 |
+
attributeFilter: ['data-theme']
|
| 347 |
+
});
|
| 348 |
+
};
|
| 349 |
+
|
| 350 |
+
if (document.readyState === 'loading') {
|
| 351 |
+
document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
|
| 352 |
+
} else {
|
| 353 |
+
ensureD3(bootstrap);
|
| 354 |
+
}
|
| 355 |
+
})();
|
| 356 |
+
</script>
|
app/src/content/embeds/banner.html
CHANGED
|
@@ -1,59 +1,64 @@
|
|
| 1 |
-
<div class="d3-
|
| 2 |
<style>
|
| 3 |
-
.d3-
|
| 4 |
width: 100%;
|
| 5 |
margin: 10px 0;
|
| 6 |
position: relative;
|
| 7 |
font-family: system-ui, -apple-system, sans-serif;
|
| 8 |
}
|
| 9 |
|
| 10 |
-
.d3-
|
| 11 |
display: block;
|
| 12 |
width: 100%;
|
| 13 |
height: auto;
|
| 14 |
}
|
| 15 |
|
| 16 |
-
.d3-
|
| 17 |
-
.d3-
|
| 18 |
stroke: var(--axis-color, var(--text-color));
|
| 19 |
}
|
| 20 |
|
| 21 |
-
.d3-
|
| 22 |
fill: var(--tick-color, var(--muted-color));
|
| 23 |
-
font-size:
|
| 24 |
}
|
| 25 |
|
| 26 |
-
.d3-
|
| 27 |
-
stroke: var(--grid-color, rgba(0,0,0,.
|
| 28 |
}
|
| 29 |
|
| 30 |
-
.d3-
|
| 31 |
-
font-size:
|
| 32 |
font-weight: 500;
|
| 33 |
fill: var(--text-color);
|
| 34 |
}
|
| 35 |
|
| 36 |
-
.d3-
|
| 37 |
-
|
| 38 |
-
font-weight: 500;
|
| 39 |
}
|
| 40 |
|
| 41 |
-
.d3-
|
| 42 |
cursor: pointer;
|
| 43 |
transition: opacity 0.15s ease;
|
| 44 |
}
|
| 45 |
|
| 46 |
-
.d3-
|
| 47 |
opacity: 0.8;
|
| 48 |
}
|
| 49 |
|
| 50 |
-
.d3-
|
| 51 |
-
font-size:
|
| 52 |
-
font-weight: 500;
|
| 53 |
fill: var(--text-color);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
}
|
| 55 |
|
| 56 |
-
.d3-
|
| 57 |
position: absolute;
|
| 58 |
top: 0;
|
| 59 |
left: 0;
|
|
@@ -72,22 +77,22 @@
|
|
| 72 |
z-index: 10;
|
| 73 |
}
|
| 74 |
|
| 75 |
-
.d3-
|
| 76 |
font-weight: 600;
|
| 77 |
margin-bottom: 4px;
|
| 78 |
}
|
| 79 |
|
| 80 |
-
.d3-
|
| 81 |
display: flex;
|
| 82 |
justify-content: space-between;
|
| 83 |
gap: 16px;
|
| 84 |
}
|
| 85 |
|
| 86 |
-
.d3-
|
| 87 |
color: var(--muted-color);
|
| 88 |
}
|
| 89 |
|
| 90 |
-
.d3-
|
| 91 |
font-weight: 500;
|
| 92 |
}
|
| 93 |
</style>
|
|
@@ -110,8 +115,8 @@
|
|
| 110 |
const bootstrap = () => {
|
| 111 |
const scriptEl = document.currentScript;
|
| 112 |
let container = scriptEl ? scriptEl.previousElementSibling : null;
|
| 113 |
-
if (!(container && container.classList && container.classList.contains('d3-
|
| 114 |
-
const candidates = Array.from(document.querySelectorAll('.d3-
|
| 115 |
.filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
|
| 116 |
container = candidates[candidates.length - 1] || null;
|
| 117 |
}
|
|
@@ -129,48 +134,62 @@
|
|
| 129 |
|
| 130 |
// SVG setup
|
| 131 |
const svg = d3.select(container).append('svg');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
const gRoot = svg.append('g');
|
| 133 |
|
| 134 |
-
// Chart groups
|
|
|
|
| 135 |
const gGrid = gRoot.append('g').attr('class', 'grid');
|
| 136 |
const gAxes = gRoot.append('g').attr('class', 'axes');
|
| 137 |
-
const
|
|
|
|
| 138 |
const gLabels = gRoot.append('g').attr('class', 'labels');
|
| 139 |
|
| 140 |
// State
|
| 141 |
let data = null;
|
| 142 |
let width = 800;
|
| 143 |
let height = 450;
|
| 144 |
-
const margin = { top: 20, right:
|
| 145 |
|
| 146 |
// Scales
|
| 147 |
const xScale = d3.scaleLinear();
|
| 148 |
-
const yScale = d3.
|
| 149 |
|
| 150 |
// Data loading
|
| 151 |
-
const
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
try {
|
| 161 |
-
const r = await fetch(p, { cache: 'no-cache' });
|
| 162 |
-
if (r.ok) return await r.json();
|
| 163 |
-
} catch (_) {}
|
| 164 |
}
|
| 165 |
-
|
| 166 |
};
|
| 167 |
|
| 168 |
function updateSize() {
|
| 169 |
width = container.clientWidth || 800;
|
| 170 |
-
|
| 171 |
-
const numModels = data ? data.models.length : 10;
|
| 172 |
-
const barHeight = 36;
|
| 173 |
-
height = margin.top + margin.bottom + numModels * barHeight;
|
| 174 |
svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
|
| 175 |
gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
|
| 176 |
return {
|
|
@@ -188,11 +207,19 @@
|
|
| 188 |
<div class="model-name" style="color: ${d.color}">${d.name}</div>
|
| 189 |
<div class="metric">
|
| 190 |
<span class="metric-label">Score:</span>
|
| 191 |
-
<span class="metric-value">${d.avg_floored_score.toFixed(
|
| 192 |
</div>
|
| 193 |
<div class="metric">
|
| 194 |
-
<span class="metric-label">
|
| 195 |
-
<span class="metric-value">${
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
</div>
|
| 197 |
<div class="metric">
|
| 198 |
<span class="metric-label">Type:</span>
|
|
@@ -200,8 +227,8 @@
|
|
| 200 |
</div>
|
| 201 |
`;
|
| 202 |
|
| 203 |
-
const tipWidth = tip.offsetWidth ||
|
| 204 |
-
const tipHeight = tip.offsetHeight ||
|
| 205 |
let tipX = x + 12;
|
| 206 |
let tipY = y - tipHeight / 2;
|
| 207 |
|
|
@@ -218,38 +245,40 @@
|
|
| 218 |
tip.style.transform = 'translate(-9999px, -9999px)';
|
| 219 |
}
|
| 220 |
|
| 221 |
-
// Calculate relative luminance and return black or white for best contrast
|
| 222 |
-
function getContrastColor(hexColor) {
|
| 223 |
-
const hex = hexColor.replace('#', '');
|
| 224 |
-
const r = parseInt(hex.substr(0, 2), 16) / 255;
|
| 225 |
-
const g = parseInt(hex.substr(2, 2), 16) / 255;
|
| 226 |
-
const b = parseInt(hex.substr(4, 2), 16) / 255;
|
| 227 |
-
// Relative luminance formula
|
| 228 |
-
const luminance = 0.299 * r + 0.587 * g + 0.114 * b;
|
| 229 |
-
return luminance > 0.5 ? '#000000' : '#ffffff';
|
| 230 |
-
}
|
| 231 |
-
|
| 232 |
function render() {
|
| 233 |
if (!data) return;
|
| 234 |
|
| 235 |
const { innerWidth, innerHeight } = updateSize();
|
|
|
|
| 236 |
|
| 237 |
-
//
|
| 238 |
-
const models = [...data.models].sort((a, b) => b.avg_floored_score - a.avg_floored_score);
|
| 239 |
-
|
| 240 |
-
// Update scales
|
| 241 |
xScale
|
| 242 |
-
.domain([
|
| 243 |
-
.range([0, innerWidth])
|
| 244 |
-
.nice();
|
| 245 |
|
|
|
|
|
|
|
|
|
|
| 246 |
yScale
|
| 247 |
-
.domain(
|
| 248 |
-
.range([
|
| 249 |
-
.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
|
| 251 |
-
// Grid lines (vertical)
|
| 252 |
-
const xTicks = xScale.ticks(6);
|
| 253 |
gGrid.selectAll('.grid-x')
|
| 254 |
.data(xTicks)
|
| 255 |
.join('line')
|
|
@@ -259,67 +288,125 @@
|
|
| 259 |
.attr('y1', 0)
|
| 260 |
.attr('y2', innerHeight);
|
| 261 |
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
gAxes.selectAll('.x-axis')
|
| 264 |
.data([0])
|
| 265 |
.join('g')
|
| 266 |
.attr('class', 'x-axis')
|
| 267 |
.attr('transform', `translate(0,${innerHeight})`)
|
| 268 |
-
.call(d3.axisBottom(xScale).ticks(
|
| 269 |
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
gAxes.selectAll('.x-label')
|
| 272 |
.data([0])
|
| 273 |
.join('text')
|
| 274 |
.attr('class', 'x-label axis-label')
|
| 275 |
.attr('x', innerWidth / 2)
|
| 276 |
-
.attr('y', innerHeight +
|
| 277 |
.attr('text-anchor', 'middle')
|
| 278 |
-
.text('
|
| 279 |
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
.
|
| 284 |
-
.
|
| 285 |
-
.attr('
|
| 286 |
-
.attr('
|
| 287 |
-
.attr('
|
| 288 |
-
.
|
| 289 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
.attr('fill', d => d.color)
|
| 291 |
-
.attr('
|
| 292 |
-
.attr('ry', 3)
|
| 293 |
.on('mouseenter', showTooltip)
|
| 294 |
.on('mousemove', showTooltip)
|
| 295 |
.on('mouseleave', hideTooltip);
|
| 296 |
|
| 297 |
-
//
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
.
|
| 301 |
-
.
|
| 302 |
-
.attr('
|
| 303 |
-
.attr('
|
| 304 |
-
.attr('
|
| 305 |
-
.attr('
|
| 306 |
-
.
|
| 307 |
-
.
|
|
|
|
| 308 |
|
| 309 |
-
//
|
| 310 |
-
gLabels.selectAll('.
|
| 311 |
.data(models, d => d.name)
|
| 312 |
.join('text')
|
| 313 |
-
.attr('class', '
|
| 314 |
-
.attr('x', d =>
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
}
|
| 320 |
|
| 321 |
// Initialize
|
| 322 |
-
|
|
|
|
| 323 |
.then(json => {
|
| 324 |
data = json;
|
| 325 |
render();
|
|
|
|
| 1 |
+
<div class="d3-score-vs-recklessness"></div>
|
| 2 |
<style>
|
| 3 |
+
.d3-score-vs-recklessness {
|
| 4 |
width: 100%;
|
| 5 |
margin: 10px 0;
|
| 6 |
position: relative;
|
| 7 |
font-family: system-ui, -apple-system, sans-serif;
|
| 8 |
}
|
| 9 |
|
| 10 |
+
.d3-score-vs-recklessness svg {
|
| 11 |
display: block;
|
| 12 |
width: 100%;
|
| 13 |
height: auto;
|
| 14 |
}
|
| 15 |
|
| 16 |
+
.d3-score-vs-recklessness .axes path,
|
| 17 |
+
.d3-score-vs-recklessness .axes line {
|
| 18 |
stroke: var(--axis-color, var(--text-color));
|
| 19 |
}
|
| 20 |
|
| 21 |
+
.d3-score-vs-recklessness .axes text {
|
| 22 |
fill: var(--tick-color, var(--muted-color));
|
| 23 |
+
font-size: 14px;
|
| 24 |
}
|
| 25 |
|
| 26 |
+
.d3-score-vs-recklessness .grid line {
|
| 27 |
+
stroke: var(--grid-color, rgba(0,0,0,.15));
|
| 28 |
}
|
| 29 |
|
| 30 |
+
.d3-score-vs-recklessness .axes text.axis-label {
|
| 31 |
+
font-size: 18px;
|
| 32 |
font-weight: 500;
|
| 33 |
fill: var(--text-color);
|
| 34 |
}
|
| 35 |
|
| 36 |
+
.d3-score-vs-recklessness .x-axis text {
|
| 37 |
+
transform: translateY(4px);
|
|
|
|
| 38 |
}
|
| 39 |
|
| 40 |
+
.d3-score-vs-recklessness .point {
|
| 41 |
cursor: pointer;
|
| 42 |
transition: opacity 0.15s ease;
|
| 43 |
}
|
| 44 |
|
| 45 |
+
.d3-score-vs-recklessness .point:hover {
|
| 46 |
opacity: 0.8;
|
| 47 |
}
|
| 48 |
|
| 49 |
+
.d3-score-vs-recklessness .point-label {
|
| 50 |
+
font-size: 11px;
|
|
|
|
| 51 |
fill: var(--text-color);
|
| 52 |
+
pointer-events: none;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
.d3-score-vs-recklessness .annotation {
|
| 56 |
+
font-size: 11px;
|
| 57 |
+
font-style: italic;
|
| 58 |
+
fill: var(--muted-color);
|
| 59 |
}
|
| 60 |
|
| 61 |
+
.d3-score-vs-recklessness .d3-tooltip {
|
| 62 |
position: absolute;
|
| 63 |
top: 0;
|
| 64 |
left: 0;
|
|
|
|
| 77 |
z-index: 10;
|
| 78 |
}
|
| 79 |
|
| 80 |
+
.d3-score-vs-recklessness .d3-tooltip .model-name {
|
| 81 |
font-weight: 600;
|
| 82 |
margin-bottom: 4px;
|
| 83 |
}
|
| 84 |
|
| 85 |
+
.d3-score-vs-recklessness .d3-tooltip .metric {
|
| 86 |
display: flex;
|
| 87 |
justify-content: space-between;
|
| 88 |
gap: 16px;
|
| 89 |
}
|
| 90 |
|
| 91 |
+
.d3-score-vs-recklessness .d3-tooltip .metric-label {
|
| 92 |
color: var(--muted-color);
|
| 93 |
}
|
| 94 |
|
| 95 |
+
.d3-score-vs-recklessness .d3-tooltip .metric-value {
|
| 96 |
font-weight: 500;
|
| 97 |
}
|
| 98 |
</style>
|
|
|
|
| 115 |
const bootstrap = () => {
|
| 116 |
const scriptEl = document.currentScript;
|
| 117 |
let container = scriptEl ? scriptEl.previousElementSibling : null;
|
| 118 |
+
if (!(container && container.classList && container.classList.contains('d3-score-vs-recklessness'))) {
|
| 119 |
+
const candidates = Array.from(document.querySelectorAll('.d3-score-vs-recklessness'))
|
| 120 |
.filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
|
| 121 |
container = candidates[candidates.length - 1] || null;
|
| 122 |
}
|
|
|
|
| 134 |
|
| 135 |
// SVG setup
|
| 136 |
const svg = d3.select(container).append('svg');
|
| 137 |
+
|
| 138 |
+
// Add gradient definition
|
| 139 |
+
const defs = svg.append('defs');
|
| 140 |
+
const gradient = defs.append('linearGradient')
|
| 141 |
+
.attr('id', 'recklessness-gradient')
|
| 142 |
+
.attr('x1', '0%')
|
| 143 |
+
.attr('x2', '100%')
|
| 144 |
+
.attr('y1', '0%')
|
| 145 |
+
.attr('y2', '0%');
|
| 146 |
+
|
| 147 |
+
// Gradient stops: red -> orange -> yellow -> green -> yellow -> orange -> red
|
| 148 |
+
gradient.append('stop').attr('offset', '0%').attr('stop-color', 'rgba(239, 83, 80, 0.25)'); // red
|
| 149 |
+
gradient.append('stop').attr('offset', '20%').attr('stop-color', 'rgba(255, 152, 0, 0.25)'); // orange
|
| 150 |
+
gradient.append('stop').attr('offset', '35%').attr('stop-color', 'rgba(255, 235, 59, 0.25)'); // yellow
|
| 151 |
+
gradient.append('stop').attr('offset', '50%').attr('stop-color', 'rgba(102, 187, 106, 0.35)'); // green
|
| 152 |
+
gradient.append('stop').attr('offset', '65%').attr('stop-color', 'rgba(255, 235, 59, 0.25)'); // yellow
|
| 153 |
+
gradient.append('stop').attr('offset', '80%').attr('stop-color', 'rgba(255, 152, 0, 0.25)'); // orange
|
| 154 |
+
gradient.append('stop').attr('offset', '100%').attr('stop-color', 'rgba(239, 83, 80, 0.25)'); // red
|
| 155 |
+
|
| 156 |
const gRoot = svg.append('g');
|
| 157 |
|
| 158 |
+
// Chart groups (order matters for layering)
|
| 159 |
+
const gBackground = gRoot.append('g').attr('class', 'background');
|
| 160 |
const gGrid = gRoot.append('g').attr('class', 'grid');
|
| 161 |
const gAxes = gRoot.append('g').attr('class', 'axes');
|
| 162 |
+
const gAnnotations = gRoot.append('g').attr('class', 'annotations');
|
| 163 |
+
const gPoints = gRoot.append('g').attr('class', 'points');
|
| 164 |
const gLabels = gRoot.append('g').attr('class', 'labels');
|
| 165 |
|
| 166 |
// State
|
| 167 |
let data = null;
|
| 168 |
let width = 800;
|
| 169 |
let height = 450;
|
| 170 |
+
const margin = { top: 20, right: 120, bottom: 56, left: 72 };
|
| 171 |
|
| 172 |
// Scales
|
| 173 |
const xScale = d3.scaleLinear();
|
| 174 |
+
const yScale = d3.scaleLinear();
|
| 175 |
|
| 176 |
// Data loading
|
| 177 |
+
const DATA_URL = '/data/score_vs_recklessness.json';
|
| 178 |
+
|
| 179 |
+
// Helper function to create a 5-point star path
|
| 180 |
+
const starPath = (cx, cy, outerR, innerR) => {
|
| 181 |
+
const points = [];
|
| 182 |
+
for (let i = 0; i < 10; i++) {
|
| 183 |
+
const r = i % 2 === 0 ? outerR : innerR;
|
| 184 |
+
const angle = (Math.PI / 2) + (i * Math.PI / 5);
|
| 185 |
+
points.push([cx + r * Math.cos(angle), cy - r * Math.sin(angle)]);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
}
|
| 187 |
+
return 'M' + points.map(p => p.join(',')).join('L') + 'Z';
|
| 188 |
};
|
| 189 |
|
| 190 |
function updateSize() {
|
| 191 |
width = container.clientWidth || 800;
|
| 192 |
+
height = Math.max(300, Math.round(width / 1.5));
|
|
|
|
|
|
|
|
|
|
| 193 |
svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
|
| 194 |
gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
|
| 195 |
return {
|
|
|
|
| 207 |
<div class="model-name" style="color: ${d.color}">${d.name}</div>
|
| 208 |
<div class="metric">
|
| 209 |
<span class="metric-label">Score:</span>
|
| 210 |
+
<span class="metric-value">${d.avg_floored_score.toFixed(1)}</span>
|
| 211 |
</div>
|
| 212 |
<div class="metric">
|
| 213 |
+
<span class="metric-label">Recklessness Index:</span>
|
| 214 |
+
<span class="metric-value">${d.recklessness_index.toFixed(2)}</span>
|
| 215 |
+
</div>
|
| 216 |
+
<div class="metric">
|
| 217 |
+
<span class="metric-label">Failed Guesses:</span>
|
| 218 |
+
<span class="metric-value">${d.avg_failed_guesses.toFixed(2)}</span>
|
| 219 |
+
</div>
|
| 220 |
+
<div class="metric">
|
| 221 |
+
<span class="metric-label">Caution:</span>
|
| 222 |
+
<span class="metric-value">${d.avg_caution.toFixed(2)}</span>
|
| 223 |
</div>
|
| 224 |
<div class="metric">
|
| 225 |
<span class="metric-label">Type:</span>
|
|
|
|
| 227 |
</div>
|
| 228 |
`;
|
| 229 |
|
| 230 |
+
const tipWidth = tip.offsetWidth || 180;
|
| 231 |
+
const tipHeight = tip.offsetHeight || 120;
|
| 232 |
let tipX = x + 12;
|
| 233 |
let tipY = y - tipHeight / 2;
|
| 234 |
|
|
|
|
| 245 |
tip.style.transform = 'translate(-9999px, -9999px)';
|
| 246 |
}
|
| 247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
function render() {
|
| 249 |
if (!data) return;
|
| 250 |
|
| 251 |
const { innerWidth, innerHeight } = updateSize();
|
| 252 |
+
const models = data.models;
|
| 253 |
|
| 254 |
+
// Fixed symmetric X scale from -8 to 8
|
|
|
|
|
|
|
|
|
|
| 255 |
xScale
|
| 256 |
+
.domain([-8, 8])
|
| 257 |
+
.range([0, innerWidth]);
|
|
|
|
| 258 |
|
| 259 |
+
// Y scale based on data
|
| 260 |
+
const yExtent = d3.extent(models, d => d.avg_floored_score);
|
| 261 |
+
const yPadding = (yExtent[1] - yExtent[0]) * 0.1;
|
| 262 |
yScale
|
| 263 |
+
.domain([yExtent[0], yExtent[1] + yPadding])
|
| 264 |
+
.range([innerHeight, 0])
|
| 265 |
+
.nice();
|
| 266 |
+
|
| 267 |
+
// Background gradient rectangle
|
| 268 |
+
gBackground.selectAll('.bg-gradient')
|
| 269 |
+
.data([0])
|
| 270 |
+
.join('rect')
|
| 271 |
+
.attr('class', 'bg-gradient')
|
| 272 |
+
.attr('x', 0)
|
| 273 |
+
.attr('y', 0)
|
| 274 |
+
.attr('width', innerWidth)
|
| 275 |
+
.attr('height', innerHeight)
|
| 276 |
+
.attr('fill', 'url(#recklessness-gradient)');
|
| 277 |
+
|
| 278 |
+
// Grid lines
|
| 279 |
+
const xTicks = xScale.ticks(8);
|
| 280 |
+
const yTicks = yScale.ticks(6);
|
| 281 |
|
|
|
|
|
|
|
| 282 |
gGrid.selectAll('.grid-x')
|
| 283 |
.data(xTicks)
|
| 284 |
.join('line')
|
|
|
|
| 288 |
.attr('y1', 0)
|
| 289 |
.attr('y2', innerHeight);
|
| 290 |
|
| 291 |
+
gGrid.selectAll('.grid-y')
|
| 292 |
+
.data(yTicks)
|
| 293 |
+
.join('line')
|
| 294 |
+
.attr('class', 'grid-y')
|
| 295 |
+
.attr('x1', 0)
|
| 296 |
+
.attr('x2', innerWidth)
|
| 297 |
+
.attr('y1', d => yScale(d))
|
| 298 |
+
.attr('y2', d => yScale(d));
|
| 299 |
+
|
| 300 |
+
// Axes with inner ticks
|
| 301 |
+
const tickSize = 6;
|
| 302 |
gAxes.selectAll('.x-axis')
|
| 303 |
.data([0])
|
| 304 |
.join('g')
|
| 305 |
.attr('class', 'x-axis')
|
| 306 |
.attr('transform', `translate(0,${innerHeight})`)
|
| 307 |
+
.call(d3.axisBottom(xScale).ticks(8).tickSizeInner(-tickSize).tickSizeOuter(0));
|
| 308 |
|
| 309 |
+
gAxes.selectAll('.y-axis')
|
| 310 |
+
.data([0])
|
| 311 |
+
.join('g')
|
| 312 |
+
.attr('class', 'y-axis')
|
| 313 |
+
.call(d3.axisLeft(yScale).ticks(6).tickSizeInner(-tickSize).tickSizeOuter(0));
|
| 314 |
+
|
| 315 |
+
// Axis labels
|
| 316 |
gAxes.selectAll('.x-label')
|
| 317 |
.data([0])
|
| 318 |
.join('text')
|
| 319 |
.attr('class', 'x-label axis-label')
|
| 320 |
.attr('x', innerWidth / 2)
|
| 321 |
+
.attr('y', innerHeight + 44)
|
| 322 |
.attr('text-anchor', 'middle')
|
| 323 |
+
.text('Boldness Index');
|
| 324 |
|
| 325 |
+
gAxes.selectAll('.y-label')
|
| 326 |
+
.data([0])
|
| 327 |
+
.join('text')
|
| 328 |
+
.attr('class', 'y-label axis-label')
|
| 329 |
+
.attr('x', -innerHeight / 2)
|
| 330 |
+
.attr('y', -52)
|
| 331 |
+
.attr('text-anchor', 'middle')
|
| 332 |
+
.attr('transform', 'rotate(-90)')
|
| 333 |
+
.text('Score');
|
| 334 |
+
|
| 335 |
+
// Top annotations: Overcautious / Cautious / Measured / Bold / Reckless
|
| 336 |
+
const annotations = [
|
| 337 |
+
{ label: 'Overcautious', color: 'rgba(239, 83, 80, 0.9)', pos: 0.07}, // red
|
| 338 |
+
{ label: 'Cautious', color: 'rgba(255, 180, 0, 0.9)', pos: 0.25 }, // yellow/orange
|
| 339 |
+
{ label: 'Measured', color: 'rgba(76, 175, 80, 0.9)', pos: 0.5 }, // green
|
| 340 |
+
{ label: 'Bold', color: 'rgba(255, 180, 0, 0.9)', pos: 0.75 }, // yellow/orange
|
| 341 |
+
{ label: 'Reckless', color: 'rgba(239, 83, 80, 0.9)', pos: 0.95 } // red
|
| 342 |
+
];
|
| 343 |
+
|
| 344 |
+
gAnnotations.selectAll('.annotation-label')
|
| 345 |
+
.data(annotations, d => d.label)
|
| 346 |
+
.join('text')
|
| 347 |
+
.attr('class', 'annotation annotation-label')
|
| 348 |
+
.attr('x', d => d.pos * innerWidth)
|
| 349 |
+
.attr('y', 16)
|
| 350 |
+
.attr('text-anchor', d => d.pos === 0 ? 'start' : d.pos === 1 ? 'end' : 'middle')
|
| 351 |
+
.style('fill', d => d.color)
|
| 352 |
+
.style('font-weight', 'bold')
|
| 353 |
+
.style('font-size', '13px')
|
| 354 |
+
.text(d => d.label);
|
| 355 |
+
|
| 356 |
+
// Points
|
| 357 |
+
const pointRadius = Math.max(8, Math.min(14, innerWidth / 60));
|
| 358 |
+
|
| 359 |
+
// Closed models as filled circles
|
| 360 |
+
const closedModels = models.filter(d => !d.is_open);
|
| 361 |
+
gPoints.selectAll('.point-closed')
|
| 362 |
+
.data(closedModels, d => d.name)
|
| 363 |
+
.join('circle')
|
| 364 |
+
.attr('class', 'point point-closed')
|
| 365 |
+
.attr('cx', d => xScale(d.recklessness_index))
|
| 366 |
+
.attr('cy', d => yScale(d.avg_floored_score))
|
| 367 |
+
.attr('r', pointRadius)
|
| 368 |
.attr('fill', d => d.color)
|
| 369 |
+
.attr('stroke', 'none')
|
|
|
|
| 370 |
.on('mouseenter', showTooltip)
|
| 371 |
.on('mousemove', showTooltip)
|
| 372 |
.on('mouseleave', hideTooltip);
|
| 373 |
|
| 374 |
+
// Open models as stars
|
| 375 |
+
const openModels = models.filter(d => d.is_open);
|
| 376 |
+
gPoints.selectAll('.point-star')
|
| 377 |
+
.data(openModels, d => d.name)
|
| 378 |
+
.join('path')
|
| 379 |
+
.attr('class', 'point point-star')
|
| 380 |
+
.attr('d', d => starPath(xScale(d.recklessness_index), yScale(d.avg_floored_score), pointRadius * 1.2, pointRadius * 0.5))
|
| 381 |
+
.attr('fill', d => d.color)
|
| 382 |
+
.attr('stroke', 'none')
|
| 383 |
+
.on('mouseenter', showTooltip)
|
| 384 |
+
.on('mousemove', showTooltip)
|
| 385 |
+
.on('mouseleave', hideTooltip);
|
| 386 |
|
| 387 |
+
// Point labels with smart positioning
|
| 388 |
+
gLabels.selectAll('.point-label')
|
| 389 |
.data(models, d => d.name)
|
| 390 |
.join('text')
|
| 391 |
+
.attr('class', 'point-label')
|
| 392 |
+
.attr('x', d => {
|
| 393 |
+
const xPos = xScale(d.recklessness_index);
|
| 394 |
+
if (xPos > innerWidth - 100) {
|
| 395 |
+
return xPos - pointRadius - 6;
|
| 396 |
+
}
|
| 397 |
+
return xPos + pointRadius + 6;
|
| 398 |
+
})
|
| 399 |
+
.attr('y', d => yScale(d.avg_floored_score) + 4)
|
| 400 |
+
.attr('text-anchor', d => {
|
| 401 |
+
const xPos = xScale(d.recklessness_index);
|
| 402 |
+
return xPos > innerWidth - 100 ? 'end' : 'start';
|
| 403 |
+
})
|
| 404 |
+
.text(d => d.name);
|
| 405 |
}
|
| 406 |
|
| 407 |
// Initialize
|
| 408 |
+
fetch(DATA_URL, { cache: 'no-cache' })
|
| 409 |
+
.then(r => r.json())
|
| 410 |
.then(json => {
|
| 411 |
data = json;
|
| 412 |
render();
|
app/src/content/embeds/complexity-ratio.html
ADDED
|
@@ -0,0 +1,484 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<div class="d3-complexity-ratio"></div>
|
| 2 |
+
<style>
|
| 3 |
+
.d3-complexity-ratio {
|
| 4 |
+
width: 100%;
|
| 5 |
+
margin: 10px 0;
|
| 6 |
+
position: relative;
|
| 7 |
+
font-family: system-ui, -apple-system, sans-serif;
|
| 8 |
+
}
|
| 9 |
+
|
| 10 |
+
.d3-complexity-ratio svg {
|
| 11 |
+
display: block;
|
| 12 |
+
width: 100%;
|
| 13 |
+
height: auto;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
.d3-complexity-ratio .axes path,
|
| 17 |
+
.d3-complexity-ratio .axes line {
|
| 18 |
+
stroke: var(--axis-color, var(--text-color));
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
.d3-complexity-ratio .axes text {
|
| 22 |
+
fill: var(--tick-color, var(--muted-color));
|
| 23 |
+
font-size: 11px;
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
.d3-complexity-ratio .grid line {
|
| 27 |
+
stroke: var(--grid-color, rgba(0,0,0,.08));
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
.d3-complexity-ratio .axes text.axis-label {
|
| 31 |
+
font-size: 14px;
|
| 32 |
+
font-weight: 500;
|
| 33 |
+
fill: var(--text-color);
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
.d3-complexity-ratio .reference-line {
|
| 37 |
+
stroke: var(--muted-color);
|
| 38 |
+
stroke-dasharray: 5, 5;
|
| 39 |
+
stroke-width: 1.5;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
.d3-complexity-ratio .whisker-line {
|
| 43 |
+
stroke-width: 1.5;
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
.d3-complexity-ratio .whisker-cap {
|
| 47 |
+
stroke-width: 1.5;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
.d3-complexity-ratio .model-point {
|
| 51 |
+
stroke-width: 2;
|
| 52 |
+
cursor: pointer;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
.d3-complexity-ratio .model-point:hover {
|
| 56 |
+
stroke-width: 3;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
.d3-complexity-ratio .ratio-label {
|
| 60 |
+
font-size: 11px;
|
| 61 |
+
fill: var(--muted-color);
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
.d3-complexity-ratio .legend-item {
|
| 65 |
+
cursor: default;
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
.d3-complexity-ratio .legend-text {
|
| 69 |
+
font-size: 11px;
|
| 70 |
+
fill: var(--text-color);
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
.d3-complexity-ratio .subtitle {
|
| 74 |
+
font-size: 11px;
|
| 75 |
+
fill: var(--muted-color);
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
.d3-complexity-ratio .d3-tooltip {
|
| 79 |
+
position: absolute;
|
| 80 |
+
top: 0;
|
| 81 |
+
left: 0;
|
| 82 |
+
transform: translate(-9999px, -9999px);
|
| 83 |
+
pointer-events: none;
|
| 84 |
+
padding: 10px 12px;
|
| 85 |
+
border-radius: 8px;
|
| 86 |
+
font-size: 12px;
|
| 87 |
+
line-height: 1.4;
|
| 88 |
+
border: 1px solid var(--border-color);
|
| 89 |
+
background: var(--surface-bg);
|
| 90 |
+
color: var(--text-color);
|
| 91 |
+
box-shadow: 0 4px 24px rgba(0,0,0,.18);
|
| 92 |
+
opacity: 0;
|
| 93 |
+
transition: opacity 0.12s ease;
|
| 94 |
+
z-index: 10;
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
.d3-complexity-ratio .d3-tooltip .model-name {
|
| 98 |
+
font-weight: 600;
|
| 99 |
+
margin-bottom: 4px;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
.d3-complexity-ratio .d3-tooltip .metric {
|
| 103 |
+
display: flex;
|
| 104 |
+
justify-content: space-between;
|
| 105 |
+
gap: 16px;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
.d3-complexity-ratio .d3-tooltip .metric-label {
|
| 109 |
+
color: var(--muted-color);
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
.d3-complexity-ratio .d3-tooltip .metric-value {
|
| 113 |
+
font-weight: 500;
|
| 114 |
+
}
|
| 115 |
+
</style>
|
| 116 |
+
<script>
|
| 117 |
+
(() => {
|
| 118 |
+
const ensureD3 = (cb) => {
|
| 119 |
+
if (window.d3 && typeof window.d3.select === 'function') return cb();
|
| 120 |
+
let s = document.getElementById('d3-cdn-script');
|
| 121 |
+
if (!s) {
|
| 122 |
+
s = document.createElement('script');
|
| 123 |
+
s.id = 'd3-cdn-script';
|
| 124 |
+
s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
|
| 125 |
+
document.head.appendChild(s);
|
| 126 |
+
}
|
| 127 |
+
const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); };
|
| 128 |
+
s.addEventListener('load', onReady, { once: true });
|
| 129 |
+
if (window.d3) onReady();
|
| 130 |
+
};
|
| 131 |
+
|
| 132 |
+
const bootstrap = () => {
|
| 133 |
+
const scriptEl = document.currentScript;
|
| 134 |
+
let container = scriptEl ? scriptEl.previousElementSibling : null;
|
| 135 |
+
if (!(container && container.classList && container.classList.contains('d3-complexity-ratio'))) {
|
| 136 |
+
const candidates = Array.from(document.querySelectorAll('.d3-complexity-ratio'))
|
| 137 |
+
.filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
|
| 138 |
+
container = candidates[candidates.length - 1] || null;
|
| 139 |
+
}
|
| 140 |
+
if (!container) return;
|
| 141 |
+
if (container.dataset) {
|
| 142 |
+
if (container.dataset.mounted === 'true') return;
|
| 143 |
+
container.dataset.mounted = 'true';
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
// Tooltip setup
|
| 147 |
+
container.style.position = container.style.position || 'relative';
|
| 148 |
+
const tip = document.createElement('div');
|
| 149 |
+
tip.className = 'd3-tooltip';
|
| 150 |
+
container.appendChild(tip);
|
| 151 |
+
|
| 152 |
+
// SVG setup
|
| 153 |
+
const svg = d3.select(container).append('svg');
|
| 154 |
+
const gRoot = svg.append('g');
|
| 155 |
+
|
| 156 |
+
// Chart groups
|
| 157 |
+
const gGrid = gRoot.append('g').attr('class', 'grid');
|
| 158 |
+
const gReference = gRoot.append('g').attr('class', 'reference');
|
| 159 |
+
const gAxes = gRoot.append('g').attr('class', 'axes');
|
| 160 |
+
const gWhiskers = gRoot.append('g').attr('class', 'whiskers');
|
| 161 |
+
const gPoints = gRoot.append('g').attr('class', 'points');
|
| 162 |
+
const gLabels = gRoot.append('g').attr('class', 'labels');
|
| 163 |
+
const gLegend = gRoot.append('g').attr('class', 'legend');
|
| 164 |
+
|
| 165 |
+
// State
|
| 166 |
+
let data = null;
|
| 167 |
+
let width = 800;
|
| 168 |
+
let height = 500;
|
| 169 |
+
const margin = { top: 30, right: 100, bottom: 60, left: 180 };
|
| 170 |
+
|
| 171 |
+
// Scales
|
| 172 |
+
const xScale = d3.scaleLinear();
|
| 173 |
+
const yScale = d3.scaleBand();
|
| 174 |
+
|
| 175 |
+
// Data loading
|
| 176 |
+
const DATA_URL = '/data/complexity_ratio.json';
|
| 177 |
+
|
| 178 |
+
function showTooltip(event, model) {
|
| 179 |
+
const rect = container.getBoundingClientRect();
|
| 180 |
+
const x = event.clientX - rect.left;
|
| 181 |
+
const y = event.clientY - rect.top;
|
| 182 |
+
|
| 183 |
+
const interpretation = model.median_ratio > 1.05
|
| 184 |
+
? 'Tends to overcomplicate'
|
| 185 |
+
: model.median_ratio < 0.95
|
| 186 |
+
? 'Tends to oversimplify'
|
| 187 |
+
: 'Matches complexity well';
|
| 188 |
+
|
| 189 |
+
tip.innerHTML = `
|
| 190 |
+
<div class="model-name" style="color: ${model.color}">${model.name}</div>
|
| 191 |
+
<div class="metric">
|
| 192 |
+
<span class="metric-label">Median ratio:</span>
|
| 193 |
+
<span class="metric-value">${model.median_ratio.toFixed(2)}</span>
|
| 194 |
+
</div>
|
| 195 |
+
<div class="metric">
|
| 196 |
+
<span class="metric-label">IQR:</span>
|
| 197 |
+
<span class="metric-value">${model.q25.toFixed(2)} – ${model.q75.toFixed(2)}</span>
|
| 198 |
+
</div>
|
| 199 |
+
<div class="metric">
|
| 200 |
+
<span class="metric-label">Samples:</span>
|
| 201 |
+
<span class="metric-value">n=${model.count}</span>
|
| 202 |
+
</div>
|
| 203 |
+
<div class="metric" style="margin-top: 4px;">
|
| 204 |
+
<span class="metric-label">Interpretation:</span>
|
| 205 |
+
<span class="metric-value">${interpretation}</span>
|
| 206 |
+
</div>
|
| 207 |
+
`;
|
| 208 |
+
|
| 209 |
+
const tipWidth = tip.offsetWidth || 180;
|
| 210 |
+
const tipHeight = tip.offsetHeight || 120;
|
| 211 |
+
let tipX = x + 12;
|
| 212 |
+
let tipY = y - tipHeight / 2;
|
| 213 |
+
|
| 214 |
+
if (tipX + tipWidth > width) tipX = x - tipWidth - 12;
|
| 215 |
+
if (tipY < 0) tipY = 8;
|
| 216 |
+
if (tipY + tipHeight > height) tipY = height - tipHeight - 8;
|
| 217 |
+
|
| 218 |
+
tip.style.transform = `translate(${tipX}px, ${tipY}px)`;
|
| 219 |
+
tip.style.opacity = '1';
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
function hideTooltip() {
|
| 223 |
+
tip.style.opacity = '0';
|
| 224 |
+
tip.style.transform = 'translate(-9999px, -9999px)';
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
function updateSize() {
|
| 228 |
+
width = container.clientWidth || 800;
|
| 229 |
+
height = Math.max(420, Math.round(width * 0.55));
|
| 230 |
+
svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
|
| 231 |
+
gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
|
| 232 |
+
return {
|
| 233 |
+
innerWidth: width - margin.left - margin.right,
|
| 234 |
+
innerHeight: height - margin.top - margin.bottom
|
| 235 |
+
};
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
function render() {
|
| 239 |
+
if (!data) return;
|
| 240 |
+
|
| 241 |
+
const { innerWidth, innerHeight } = updateSize();
|
| 242 |
+
|
| 243 |
+
// Sort models by median ratio (ascending - lowest at top)
|
| 244 |
+
const models = [...data.models].sort((a, b) => a.median_ratio - b.median_ratio);
|
| 245 |
+
|
| 246 |
+
// X scale: ratio values with padding
|
| 247 |
+
const xMin = d3.min(models, m => m.q25);
|
| 248 |
+
const xMax = d3.max(models, m => m.q75);
|
| 249 |
+
const xPadding = (xMax - xMin) * 0.1;
|
| 250 |
+
xScale
|
| 251 |
+
.domain([Math.min(0.6, xMin - xPadding), Math.max(2.4, xMax + xPadding)])
|
| 252 |
+
.range([0, innerWidth]);
|
| 253 |
+
|
| 254 |
+
// Y scale: categorical (model names)
|
| 255 |
+
yScale
|
| 256 |
+
.domain(models.map(m => m.name))
|
| 257 |
+
.range([0, innerHeight])
|
| 258 |
+
.padding(0.4);
|
| 259 |
+
|
| 260 |
+
// Grid lines (vertical)
|
| 261 |
+
const xTicks = xScale.ticks(8);
|
| 262 |
+
gGrid.selectAll('.grid-x')
|
| 263 |
+
.data(xTicks)
|
| 264 |
+
.join('line')
|
| 265 |
+
.attr('class', 'grid-x')
|
| 266 |
+
.attr('x1', d => xScale(d))
|
| 267 |
+
.attr('x2', d => xScale(d))
|
| 268 |
+
.attr('y1', 0)
|
| 269 |
+
.attr('y2', innerHeight);
|
| 270 |
+
|
| 271 |
+
// Reference line at x=1
|
| 272 |
+
gReference.selectAll('.reference-line')
|
| 273 |
+
.data([1])
|
| 274 |
+
.join('line')
|
| 275 |
+
.attr('class', 'reference-line')
|
| 276 |
+
.attr('x1', d => xScale(d))
|
| 277 |
+
.attr('x2', d => xScale(d))
|
| 278 |
+
.attr('y1', 0)
|
| 279 |
+
.attr('y2', innerHeight);
|
| 280 |
+
|
| 281 |
+
// Axes
|
| 282 |
+
const tickSize = 6;
|
| 283 |
+
|
| 284 |
+
gAxes.selectAll('.x-axis')
|
| 285 |
+
.data([0])
|
| 286 |
+
.join('g')
|
| 287 |
+
.attr('class', 'x-axis')
|
| 288 |
+
.attr('transform', `translate(0,${innerHeight})`)
|
| 289 |
+
.call(d3.axisBottom(xScale)
|
| 290 |
+
.ticks(8)
|
| 291 |
+
.tickFormat(d3.format('.2f'))
|
| 292 |
+
.tickSizeInner(-tickSize)
|
| 293 |
+
.tickSizeOuter(0));
|
| 294 |
+
|
| 295 |
+
gAxes.selectAll('.y-axis')
|
| 296 |
+
.data([0])
|
| 297 |
+
.join('g')
|
| 298 |
+
.attr('class', 'y-axis')
|
| 299 |
+
.call(d3.axisLeft(yScale)
|
| 300 |
+
.tickSizeInner(-tickSize)
|
| 301 |
+
.tickSizeOuter(0));
|
| 302 |
+
|
| 303 |
+
// X-axis label
|
| 304 |
+
gAxes.selectAll('.x-label')
|
| 305 |
+
.data([0])
|
| 306 |
+
.join('text')
|
| 307 |
+
.attr('class', 'x-label axis-label')
|
| 308 |
+
.attr('x', innerWidth / 2)
|
| 309 |
+
.attr('y', innerHeight + 40)
|
| 310 |
+
.attr('text-anchor', 'middle')
|
| 311 |
+
.text('Complexity Ratio (Tentative / Actual)');
|
| 312 |
+
|
| 313 |
+
// Subtitle
|
| 314 |
+
gAxes.selectAll('.subtitle')
|
| 315 |
+
.data([0])
|
| 316 |
+
.join('text')
|
| 317 |
+
.attr('class', 'subtitle')
|
| 318 |
+
.attr('x', innerWidth / 2)
|
| 319 |
+
.attr('y', innerHeight + 54)
|
| 320 |
+
.attr('text-anchor', 'middle')
|
| 321 |
+
.text('>1: Overcomplicates | <1: Oversimplifies | =1: Matches complexity');
|
| 322 |
+
|
| 323 |
+
const bandHeight = yScale.bandwidth();
|
| 324 |
+
const capHeight = bandHeight * 0.4;
|
| 325 |
+
const pointSize = Math.min(8, bandHeight * 0.35);
|
| 326 |
+
|
| 327 |
+
// Whiskers (IQR lines)
|
| 328 |
+
gWhiskers.selectAll('.whisker-line')
|
| 329 |
+
.data(models, d => d.name)
|
| 330 |
+
.join('line')
|
| 331 |
+
.attr('class', 'whisker-line')
|
| 332 |
+
.attr('x1', d => xScale(d.q25))
|
| 333 |
+
.attr('x2', d => xScale(d.q75))
|
| 334 |
+
.attr('y1', d => yScale(d.name) + bandHeight / 2)
|
| 335 |
+
.attr('y2', d => yScale(d.name) + bandHeight / 2)
|
| 336 |
+
.attr('stroke', d => d.color);
|
| 337 |
+
|
| 338 |
+
// Left whisker caps
|
| 339 |
+
gWhiskers.selectAll('.whisker-cap-left')
|
| 340 |
+
.data(models, d => d.name)
|
| 341 |
+
.join('line')
|
| 342 |
+
.attr('class', 'whisker-cap whisker-cap-left')
|
| 343 |
+
.attr('x1', d => xScale(d.q25))
|
| 344 |
+
.attr('x2', d => xScale(d.q25))
|
| 345 |
+
.attr('y1', d => yScale(d.name) + bandHeight / 2 - capHeight / 2)
|
| 346 |
+
.attr('y2', d => yScale(d.name) + bandHeight / 2 + capHeight / 2)
|
| 347 |
+
.attr('stroke', d => d.color);
|
| 348 |
+
|
| 349 |
+
// Right whisker caps
|
| 350 |
+
gWhiskers.selectAll('.whisker-cap-right')
|
| 351 |
+
.data(models, d => d.name)
|
| 352 |
+
.join('line')
|
| 353 |
+
.attr('class', 'whisker-cap whisker-cap-right')
|
| 354 |
+
.attr('x1', d => xScale(d.q75))
|
| 355 |
+
.attr('x2', d => xScale(d.q75))
|
| 356 |
+
.attr('y1', d => yScale(d.name) + bandHeight / 2 - capHeight / 2)
|
| 357 |
+
.attr('y2', d => yScale(d.name) + bandHeight / 2 + capHeight / 2)
|
| 358 |
+
.attr('stroke', d => d.color);
|
| 359 |
+
|
| 360 |
+
// Model points - circles for closed, squares for open
|
| 361 |
+
const closedModels = models.filter(m => !m.is_open);
|
| 362 |
+
const openModels = models.filter(m => m.is_open);
|
| 363 |
+
|
| 364 |
+
// Closed models: circles
|
| 365 |
+
gPoints.selectAll('.model-point-circle')
|
| 366 |
+
.data(closedModels, d => d.name)
|
| 367 |
+
.join('circle')
|
| 368 |
+
.attr('class', 'model-point model-point-circle')
|
| 369 |
+
.attr('cx', d => xScale(d.median_ratio))
|
| 370 |
+
.attr('cy', d => yScale(d.name) + bandHeight / 2)
|
| 371 |
+
.attr('r', pointSize)
|
| 372 |
+
.attr('fill', d => d.color)
|
| 373 |
+
.attr('stroke', d => d.color)
|
| 374 |
+
.on('mouseenter', (event, d) => showTooltip(event, d))
|
| 375 |
+
.on('mousemove', (event, d) => showTooltip(event, d))
|
| 376 |
+
.on('mouseleave', hideTooltip);
|
| 377 |
+
|
| 378 |
+
// Open models: squares
|
| 379 |
+
gPoints.selectAll('.model-point-square')
|
| 380 |
+
.data(openModels, d => d.name)
|
| 381 |
+
.join('rect')
|
| 382 |
+
.attr('class', 'model-point model-point-square')
|
| 383 |
+
.attr('x', d => xScale(d.median_ratio) - pointSize)
|
| 384 |
+
.attr('y', d => yScale(d.name) + bandHeight / 2 - pointSize)
|
| 385 |
+
.attr('width', pointSize * 2)
|
| 386 |
+
.attr('height', pointSize * 2)
|
| 387 |
+
.attr('fill', 'none')
|
| 388 |
+
.attr('stroke', d => d.color)
|
| 389 |
+
.attr('stroke-width', 2)
|
| 390 |
+
.on('mouseenter', (event, d) => showTooltip(event, d))
|
| 391 |
+
.on('mousemove', (event, d) => showTooltip(event, d))
|
| 392 |
+
.on('mouseleave', hideTooltip);
|
| 393 |
+
|
| 394 |
+
// Ratio labels on the right
|
| 395 |
+
gLabels.selectAll('.ratio-label')
|
| 396 |
+
.data(models, d => d.name)
|
| 397 |
+
.join('text')
|
| 398 |
+
.attr('class', 'ratio-label')
|
| 399 |
+
.attr('x', innerWidth + 8)
|
| 400 |
+
.attr('y', d => yScale(d.name) + bandHeight / 2)
|
| 401 |
+
.attr('dy', '0.35em')
|
| 402 |
+
.text(d => `${d.median_ratio.toFixed(2)} (n=${d.count})`);
|
| 403 |
+
|
| 404 |
+
// Legend
|
| 405 |
+
const legendY = -15;
|
| 406 |
+
const legendItems = [
|
| 407 |
+
{ label: 'Closed model', shape: 'circle' },
|
| 408 |
+
{ label: 'Open model', shape: 'square' }
|
| 409 |
+
];
|
| 410 |
+
|
| 411 |
+
const legendGroup = gLegend.selectAll('.legend-item')
|
| 412 |
+
.data(legendItems)
|
| 413 |
+
.join('g')
|
| 414 |
+
.attr('class', 'legend-item')
|
| 415 |
+
.attr('transform', (d, i) => `translate(${innerWidth - 80 - i * 100}, ${legendY})`);
|
| 416 |
+
|
| 417 |
+
legendGroup.selectAll('.legend-shape-circle')
|
| 418 |
+
.data(d => d.shape === 'circle' ? [d] : [])
|
| 419 |
+
.join('circle')
|
| 420 |
+
.attr('class', 'legend-shape-circle')
|
| 421 |
+
.attr('cx', 0)
|
| 422 |
+
.attr('cy', 0)
|
| 423 |
+
.attr('r', 5)
|
| 424 |
+
.attr('fill', 'var(--muted-color)');
|
| 425 |
+
|
| 426 |
+
legendGroup.selectAll('.legend-shape-square')
|
| 427 |
+
.data(d => d.shape === 'square' ? [d] : [])
|
| 428 |
+
.join('rect')
|
| 429 |
+
.attr('class', 'legend-shape-square')
|
| 430 |
+
.attr('x', -5)
|
| 431 |
+
.attr('y', -5)
|
| 432 |
+
.attr('width', 10)
|
| 433 |
+
.attr('height', 10)
|
| 434 |
+
.attr('fill', 'none')
|
| 435 |
+
.attr('stroke', 'var(--muted-color)')
|
| 436 |
+
.attr('stroke-width', 2);
|
| 437 |
+
|
| 438 |
+
legendGroup.selectAll('.legend-text')
|
| 439 |
+
.data(d => [d])
|
| 440 |
+
.join('text')
|
| 441 |
+
.attr('class', 'legend-text')
|
| 442 |
+
.attr('x', 10)
|
| 443 |
+
.attr('y', 0)
|
| 444 |
+
.attr('dy', '0.35em')
|
| 445 |
+
.text(d => d.label);
|
| 446 |
+
}
|
| 447 |
+
|
| 448 |
+
// Initialize
|
| 449 |
+
fetch(DATA_URL, { cache: 'no-cache' })
|
| 450 |
+
.then(r => r.json())
|
| 451 |
+
.then(json => {
|
| 452 |
+
data = json;
|
| 453 |
+
render();
|
| 454 |
+
})
|
| 455 |
+
.catch(err => {
|
| 456 |
+
const pre = document.createElement('pre');
|
| 457 |
+
pre.style.color = 'red';
|
| 458 |
+
pre.style.padding = '16px';
|
| 459 |
+
pre.textContent = `Error loading data: ${err.message}`;
|
| 460 |
+
container.appendChild(pre);
|
| 461 |
+
});
|
| 462 |
+
|
| 463 |
+
// Resize handling
|
| 464 |
+
if (window.ResizeObserver) {
|
| 465 |
+
new ResizeObserver(() => render()).observe(container);
|
| 466 |
+
} else {
|
| 467 |
+
window.addEventListener('resize', render);
|
| 468 |
+
}
|
| 469 |
+
|
| 470 |
+
// Theme change handling
|
| 471 |
+
const observer = new MutationObserver(() => render());
|
| 472 |
+
observer.observe(document.documentElement, {
|
| 473 |
+
attributes: true,
|
| 474 |
+
attributeFilter: ['data-theme']
|
| 475 |
+
});
|
| 476 |
+
};
|
| 477 |
+
|
| 478 |
+
if (document.readyState === 'loading') {
|
| 479 |
+
document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
|
| 480 |
+
} else {
|
| 481 |
+
ensureD3(bootstrap);
|
| 482 |
+
}
|
| 483 |
+
})();
|
| 484 |
+
</script>
|
app/src/content/embeds/score-vs-recklessness.html
ADDED
|
@@ -0,0 +1,443 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<div class="d3-score-vs-recklessness"></div>
|
| 2 |
+
<style>
|
| 3 |
+
.d3-score-vs-recklessness {
|
| 4 |
+
width: 100%;
|
| 5 |
+
margin: 10px 0;
|
| 6 |
+
position: relative;
|
| 7 |
+
font-family: system-ui, -apple-system, sans-serif;
|
| 8 |
+
}
|
| 9 |
+
|
| 10 |
+
.d3-score-vs-recklessness svg {
|
| 11 |
+
display: block;
|
| 12 |
+
width: 100%;
|
| 13 |
+
height: auto;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
.d3-score-vs-recklessness .axes path,
|
| 17 |
+
.d3-score-vs-recklessness .axes line {
|
| 18 |
+
stroke: var(--axis-color, var(--text-color));
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
.d3-score-vs-recklessness .axes text {
|
| 22 |
+
fill: var(--tick-color, var(--muted-color));
|
| 23 |
+
font-size: 14px;
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
.d3-score-vs-recklessness .grid line {
|
| 27 |
+
stroke: var(--grid-color, rgba(0,0,0,.15));
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
.d3-score-vs-recklessness .axes text.axis-label {
|
| 31 |
+
font-size: 18px;
|
| 32 |
+
font-weight: 500;
|
| 33 |
+
fill: var(--text-color);
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
.d3-score-vs-recklessness .x-axis text {
|
| 37 |
+
transform: translateY(4px);
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
.d3-score-vs-recklessness .point {
|
| 41 |
+
cursor: pointer;
|
| 42 |
+
transition: opacity 0.15s ease;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
.d3-score-vs-recklessness .point:hover {
|
| 46 |
+
opacity: 0.8;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
.d3-score-vs-recklessness .point-label {
|
| 50 |
+
font-size: 11px;
|
| 51 |
+
fill: var(--text-color);
|
| 52 |
+
pointer-events: none;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
.d3-score-vs-recklessness .annotation {
|
| 56 |
+
font-size: 11px;
|
| 57 |
+
font-style: italic;
|
| 58 |
+
fill: var(--muted-color);
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
.d3-score-vs-recklessness .d3-tooltip {
|
| 62 |
+
position: absolute;
|
| 63 |
+
top: 0;
|
| 64 |
+
left: 0;
|
| 65 |
+
transform: translate(-9999px, -9999px);
|
| 66 |
+
pointer-events: none;
|
| 67 |
+
padding: 10px 12px;
|
| 68 |
+
border-radius: 8px;
|
| 69 |
+
font-size: 12px;
|
| 70 |
+
line-height: 1.4;
|
| 71 |
+
border: 1px solid var(--border-color);
|
| 72 |
+
background: var(--surface-bg);
|
| 73 |
+
color: var(--text-color);
|
| 74 |
+
box-shadow: 0 4px 24px rgba(0,0,0,.18);
|
| 75 |
+
opacity: 0;
|
| 76 |
+
transition: opacity 0.12s ease;
|
| 77 |
+
z-index: 10;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
.d3-score-vs-recklessness .d3-tooltip .model-name {
|
| 81 |
+
font-weight: 600;
|
| 82 |
+
margin-bottom: 4px;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
.d3-score-vs-recklessness .d3-tooltip .metric {
|
| 86 |
+
display: flex;
|
| 87 |
+
justify-content: space-between;
|
| 88 |
+
gap: 16px;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
.d3-score-vs-recklessness .d3-tooltip .metric-label {
|
| 92 |
+
color: var(--muted-color);
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
.d3-score-vs-recklessness .d3-tooltip .metric-value {
|
| 96 |
+
font-weight: 500;
|
| 97 |
+
}
|
| 98 |
+
</style>
|
| 99 |
+
<script>
|
| 100 |
+
(() => {
|
| 101 |
+
const ensureD3 = (cb) => {
|
| 102 |
+
if (window.d3 && typeof window.d3.select === 'function') return cb();
|
| 103 |
+
let s = document.getElementById('d3-cdn-script');
|
| 104 |
+
if (!s) {
|
| 105 |
+
s = document.createElement('script');
|
| 106 |
+
s.id = 'd3-cdn-script';
|
| 107 |
+
s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
|
| 108 |
+
document.head.appendChild(s);
|
| 109 |
+
}
|
| 110 |
+
const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); };
|
| 111 |
+
s.addEventListener('load', onReady, { once: true });
|
| 112 |
+
if (window.d3) onReady();
|
| 113 |
+
};
|
| 114 |
+
|
| 115 |
+
const bootstrap = () => {
|
| 116 |
+
const scriptEl = document.currentScript;
|
| 117 |
+
let container = scriptEl ? scriptEl.previousElementSibling : null;
|
| 118 |
+
if (!(container && container.classList && container.classList.contains('d3-score-vs-recklessness'))) {
|
| 119 |
+
const candidates = Array.from(document.querySelectorAll('.d3-score-vs-recklessness'))
|
| 120 |
+
.filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
|
| 121 |
+
container = candidates[candidates.length - 1] || null;
|
| 122 |
+
}
|
| 123 |
+
if (!container) return;
|
| 124 |
+
if (container.dataset) {
|
| 125 |
+
if (container.dataset.mounted === 'true') return;
|
| 126 |
+
container.dataset.mounted = 'true';
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
// Tooltip setup
|
| 130 |
+
container.style.position = container.style.position || 'relative';
|
| 131 |
+
const tip = document.createElement('div');
|
| 132 |
+
tip.className = 'd3-tooltip';
|
| 133 |
+
container.appendChild(tip);
|
| 134 |
+
|
| 135 |
+
// SVG setup
|
| 136 |
+
const svg = d3.select(container).append('svg');
|
| 137 |
+
|
| 138 |
+
// Add gradient definition
|
| 139 |
+
const defs = svg.append('defs');
|
| 140 |
+
const gradient = defs.append('linearGradient')
|
| 141 |
+
.attr('id', 'recklessness-gradient')
|
| 142 |
+
.attr('x1', '0%')
|
| 143 |
+
.attr('x2', '100%')
|
| 144 |
+
.attr('y1', '0%')
|
| 145 |
+
.attr('y2', '0%');
|
| 146 |
+
|
| 147 |
+
// Gradient stops: red -> orange -> yellow -> green -> yellow -> orange -> red
|
| 148 |
+
gradient.append('stop').attr('offset', '0%').attr('stop-color', 'rgba(239, 83, 80, 0.25)'); // red
|
| 149 |
+
gradient.append('stop').attr('offset', '20%').attr('stop-color', 'rgba(255, 152, 0, 0.25)'); // orange
|
| 150 |
+
gradient.append('stop').attr('offset', '35%').attr('stop-color', 'rgba(255, 235, 59, 0.25)'); // yellow
|
| 151 |
+
gradient.append('stop').attr('offset', '50%').attr('stop-color', 'rgba(102, 187, 106, 0.35)'); // green
|
| 152 |
+
gradient.append('stop').attr('offset', '65%').attr('stop-color', 'rgba(255, 235, 59, 0.25)'); // yellow
|
| 153 |
+
gradient.append('stop').attr('offset', '80%').attr('stop-color', 'rgba(255, 152, 0, 0.25)'); // orange
|
| 154 |
+
gradient.append('stop').attr('offset', '100%').attr('stop-color', 'rgba(239, 83, 80, 0.25)'); // red
|
| 155 |
+
|
| 156 |
+
const gRoot = svg.append('g');
|
| 157 |
+
|
| 158 |
+
// Chart groups (order matters for layering)
|
| 159 |
+
const gBackground = gRoot.append('g').attr('class', 'background');
|
| 160 |
+
const gGrid = gRoot.append('g').attr('class', 'grid');
|
| 161 |
+
const gAxes = gRoot.append('g').attr('class', 'axes');
|
| 162 |
+
const gAnnotations = gRoot.append('g').attr('class', 'annotations');
|
| 163 |
+
const gPoints = gRoot.append('g').attr('class', 'points');
|
| 164 |
+
const gLabels = gRoot.append('g').attr('class', 'labels');
|
| 165 |
+
|
| 166 |
+
// State
|
| 167 |
+
let data = null;
|
| 168 |
+
let width = 800;
|
| 169 |
+
let height = 450;
|
| 170 |
+
const margin = { top: 20, right: 120, bottom: 56, left: 72 };
|
| 171 |
+
|
| 172 |
+
// Scales
|
| 173 |
+
const xScale = d3.scaleLinear();
|
| 174 |
+
const yScale = d3.scaleLinear();
|
| 175 |
+
|
| 176 |
+
// Data loading
|
| 177 |
+
const DATA_URL = '/data/score_vs_recklessness.json';
|
| 178 |
+
|
| 179 |
+
// Helper function to create a 5-point star path
|
| 180 |
+
const starPath = (cx, cy, outerR, innerR) => {
|
| 181 |
+
const points = [];
|
| 182 |
+
for (let i = 0; i < 10; i++) {
|
| 183 |
+
const r = i % 2 === 0 ? outerR : innerR;
|
| 184 |
+
const angle = (Math.PI / 2) + (i * Math.PI / 5);
|
| 185 |
+
points.push([cx + r * Math.cos(angle), cy - r * Math.sin(angle)]);
|
| 186 |
+
}
|
| 187 |
+
return 'M' + points.map(p => p.join(',')).join('L') + 'Z';
|
| 188 |
+
};
|
| 189 |
+
|
| 190 |
+
function updateSize() {
|
| 191 |
+
width = container.clientWidth || 800;
|
| 192 |
+
height = Math.max(300, Math.round(width / 1.5));
|
| 193 |
+
svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
|
| 194 |
+
gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
|
| 195 |
+
return {
|
| 196 |
+
innerWidth: width - margin.left - margin.right,
|
| 197 |
+
innerHeight: height - margin.top - margin.bottom
|
| 198 |
+
};
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
function showTooltip(event, d) {
|
| 202 |
+
const rect = container.getBoundingClientRect();
|
| 203 |
+
const x = event.clientX - rect.left;
|
| 204 |
+
const y = event.clientY - rect.top;
|
| 205 |
+
|
| 206 |
+
tip.innerHTML = `
|
| 207 |
+
<div class="model-name" style="color: ${d.color}">${d.name}</div>
|
| 208 |
+
<div class="metric">
|
| 209 |
+
<span class="metric-label">Score:</span>
|
| 210 |
+
<span class="metric-value">${d.avg_floored_score.toFixed(1)}</span>
|
| 211 |
+
</div>
|
| 212 |
+
<div class="metric">
|
| 213 |
+
<span class="metric-label">Recklessness Index:</span>
|
| 214 |
+
<span class="metric-value">${d.recklessness_index.toFixed(2)}</span>
|
| 215 |
+
</div>
|
| 216 |
+
<div class="metric">
|
| 217 |
+
<span class="metric-label">Failed Guesses:</span>
|
| 218 |
+
<span class="metric-value">${d.avg_failed_guesses.toFixed(2)}</span>
|
| 219 |
+
</div>
|
| 220 |
+
<div class="metric">
|
| 221 |
+
<span class="metric-label">Caution:</span>
|
| 222 |
+
<span class="metric-value">${d.avg_caution.toFixed(2)}</span>
|
| 223 |
+
</div>
|
| 224 |
+
<div class="metric">
|
| 225 |
+
<span class="metric-label">Type:</span>
|
| 226 |
+
<span class="metric-value">${d.is_open ? 'Open' : 'Closed'}</span>
|
| 227 |
+
</div>
|
| 228 |
+
`;
|
| 229 |
+
|
| 230 |
+
const tipWidth = tip.offsetWidth || 180;
|
| 231 |
+
const tipHeight = tip.offsetHeight || 120;
|
| 232 |
+
let tipX = x + 12;
|
| 233 |
+
let tipY = y - tipHeight / 2;
|
| 234 |
+
|
| 235 |
+
if (tipX + tipWidth > width) tipX = x - tipWidth - 12;
|
| 236 |
+
if (tipY < 0) tipY = 8;
|
| 237 |
+
if (tipY + tipHeight > height) tipY = height - tipHeight - 8;
|
| 238 |
+
|
| 239 |
+
tip.style.transform = `translate(${tipX}px, ${tipY}px)`;
|
| 240 |
+
tip.style.opacity = '1';
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
function hideTooltip() {
|
| 244 |
+
tip.style.opacity = '0';
|
| 245 |
+
tip.style.transform = 'translate(-9999px, -9999px)';
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
function render() {
|
| 249 |
+
if (!data) return;
|
| 250 |
+
|
| 251 |
+
const { innerWidth, innerHeight } = updateSize();
|
| 252 |
+
const models = data.models;
|
| 253 |
+
|
| 254 |
+
// Fixed symmetric X scale from -8 to 8
|
| 255 |
+
xScale
|
| 256 |
+
.domain([-8, 8])
|
| 257 |
+
.range([0, innerWidth]);
|
| 258 |
+
|
| 259 |
+
// Y scale based on data
|
| 260 |
+
const yExtent = d3.extent(models, d => d.avg_floored_score);
|
| 261 |
+
const yPadding = (yExtent[1] - yExtent[0]) * 0.1;
|
| 262 |
+
yScale
|
| 263 |
+
.domain([yExtent[0], yExtent[1] + yPadding])
|
| 264 |
+
.range([innerHeight, 0])
|
| 265 |
+
.nice();
|
| 266 |
+
|
| 267 |
+
// Background gradient rectangle
|
| 268 |
+
gBackground.selectAll('.bg-gradient')
|
| 269 |
+
.data([0])
|
| 270 |
+
.join('rect')
|
| 271 |
+
.attr('class', 'bg-gradient')
|
| 272 |
+
.attr('x', 0)
|
| 273 |
+
.attr('y', 0)
|
| 274 |
+
.attr('width', innerWidth)
|
| 275 |
+
.attr('height', innerHeight)
|
| 276 |
+
.attr('fill', 'url(#recklessness-gradient)');
|
| 277 |
+
|
| 278 |
+
// Grid lines
|
| 279 |
+
const xTicks = xScale.ticks(8);
|
| 280 |
+
const yTicks = yScale.ticks(6);
|
| 281 |
+
|
| 282 |
+
gGrid.selectAll('.grid-x')
|
| 283 |
+
.data(xTicks)
|
| 284 |
+
.join('line')
|
| 285 |
+
.attr('class', 'grid-x')
|
| 286 |
+
.attr('x1', d => xScale(d))
|
| 287 |
+
.attr('x2', d => xScale(d))
|
| 288 |
+
.attr('y1', 0)
|
| 289 |
+
.attr('y2', innerHeight);
|
| 290 |
+
|
| 291 |
+
gGrid.selectAll('.grid-y')
|
| 292 |
+
.data(yTicks)
|
| 293 |
+
.join('line')
|
| 294 |
+
.attr('class', 'grid-y')
|
| 295 |
+
.attr('x1', 0)
|
| 296 |
+
.attr('x2', innerWidth)
|
| 297 |
+
.attr('y1', d => yScale(d))
|
| 298 |
+
.attr('y2', d => yScale(d));
|
| 299 |
+
|
| 300 |
+
// Axes with inner ticks
|
| 301 |
+
const tickSize = 6;
|
| 302 |
+
gAxes.selectAll('.x-axis')
|
| 303 |
+
.data([0])
|
| 304 |
+
.join('g')
|
| 305 |
+
.attr('class', 'x-axis')
|
| 306 |
+
.attr('transform', `translate(0,${innerHeight})`)
|
| 307 |
+
.call(d3.axisBottom(xScale).ticks(8).tickSizeInner(-tickSize).tickSizeOuter(0));
|
| 308 |
+
|
| 309 |
+
gAxes.selectAll('.y-axis')
|
| 310 |
+
.data([0])
|
| 311 |
+
.join('g')
|
| 312 |
+
.attr('class', 'y-axis')
|
| 313 |
+
.call(d3.axisLeft(yScale).ticks(6).tickSizeInner(-tickSize).tickSizeOuter(0));
|
| 314 |
+
|
| 315 |
+
// Axis labels
|
| 316 |
+
gAxes.selectAll('.x-label')
|
| 317 |
+
.data([0])
|
| 318 |
+
.join('text')
|
| 319 |
+
.attr('class', 'x-label axis-label')
|
| 320 |
+
.attr('x', innerWidth / 2)
|
| 321 |
+
.attr('y', innerHeight + 44)
|
| 322 |
+
.attr('text-anchor', 'middle')
|
| 323 |
+
.text('Boldness Index');
|
| 324 |
+
|
| 325 |
+
gAxes.selectAll('.y-label')
|
| 326 |
+
.data([0])
|
| 327 |
+
.join('text')
|
| 328 |
+
.attr('class', 'y-label axis-label')
|
| 329 |
+
.attr('x', -innerHeight / 2)
|
| 330 |
+
.attr('y', -52)
|
| 331 |
+
.attr('text-anchor', 'middle')
|
| 332 |
+
.attr('transform', 'rotate(-90)')
|
| 333 |
+
.text('Score');
|
| 334 |
+
|
| 335 |
+
// Top annotations: Overcautious / Cautious / Measured / Bold / Reckless
|
| 336 |
+
const annotations = [
|
| 337 |
+
{ label: 'Overcautious', color: 'rgba(239, 83, 80, 0.9)', pos: 0.07}, // red
|
| 338 |
+
{ label: 'Cautious', color: 'rgba(255, 180, 0, 0.9)', pos: 0.25 }, // yellow/orange
|
| 339 |
+
{ label: 'Measured', color: 'rgba(76, 175, 80, 0.9)', pos: 0.5 }, // green
|
| 340 |
+
{ label: 'Bold', color: 'rgba(255, 180, 0, 0.9)', pos: 0.75 }, // yellow/orange
|
| 341 |
+
{ label: 'Reckless', color: 'rgba(239, 83, 80, 0.9)', pos: 0.95 } // red
|
| 342 |
+
];
|
| 343 |
+
|
| 344 |
+
gAnnotations.selectAll('.annotation-label')
|
| 345 |
+
.data(annotations, d => d.label)
|
| 346 |
+
.join('text')
|
| 347 |
+
.attr('class', 'annotation annotation-label')
|
| 348 |
+
.attr('x', d => d.pos * innerWidth)
|
| 349 |
+
.attr('y', 16)
|
| 350 |
+
.attr('text-anchor', d => d.pos === 0 ? 'start' : d.pos === 1 ? 'end' : 'middle')
|
| 351 |
+
.style('fill', d => d.color)
|
| 352 |
+
.style('font-weight', 'bold')
|
| 353 |
+
.style('font-size', '13px')
|
| 354 |
+
.text(d => d.label);
|
| 355 |
+
|
| 356 |
+
// Points
|
| 357 |
+
const pointRadius = Math.max(8, Math.min(14, innerWidth / 60));
|
| 358 |
+
|
| 359 |
+
// Closed models as filled circles
|
| 360 |
+
const closedModels = models.filter(d => !d.is_open);
|
| 361 |
+
gPoints.selectAll('.point-closed')
|
| 362 |
+
.data(closedModels, d => d.name)
|
| 363 |
+
.join('circle')
|
| 364 |
+
.attr('class', 'point point-closed')
|
| 365 |
+
.attr('cx', d => xScale(d.recklessness_index))
|
| 366 |
+
.attr('cy', d => yScale(d.avg_floored_score))
|
| 367 |
+
.attr('r', pointRadius)
|
| 368 |
+
.attr('fill', d => d.color)
|
| 369 |
+
.attr('stroke', 'none')
|
| 370 |
+
.on('mouseenter', showTooltip)
|
| 371 |
+
.on('mousemove', showTooltip)
|
| 372 |
+
.on('mouseleave', hideTooltip);
|
| 373 |
+
|
| 374 |
+
// Open models as stars
|
| 375 |
+
const openModels = models.filter(d => d.is_open);
|
| 376 |
+
gPoints.selectAll('.point-star')
|
| 377 |
+
.data(openModels, d => d.name)
|
| 378 |
+
.join('path')
|
| 379 |
+
.attr('class', 'point point-star')
|
| 380 |
+
.attr('d', d => starPath(xScale(d.recklessness_index), yScale(d.avg_floored_score), pointRadius * 1.2, pointRadius * 0.5))
|
| 381 |
+
.attr('fill', d => d.color)
|
| 382 |
+
.attr('stroke', 'none')
|
| 383 |
+
.on('mouseenter', showTooltip)
|
| 384 |
+
.on('mousemove', showTooltip)
|
| 385 |
+
.on('mouseleave', hideTooltip);
|
| 386 |
+
|
| 387 |
+
// Point labels with smart positioning
|
| 388 |
+
gLabels.selectAll('.point-label')
|
| 389 |
+
.data(models, d => d.name)
|
| 390 |
+
.join('text')
|
| 391 |
+
.attr('class', 'point-label')
|
| 392 |
+
.attr('x', d => {
|
| 393 |
+
const xPos = xScale(d.recklessness_index);
|
| 394 |
+
if (xPos > innerWidth - 100) {
|
| 395 |
+
return xPos - pointRadius - 6;
|
| 396 |
+
}
|
| 397 |
+
return xPos + pointRadius + 6;
|
| 398 |
+
})
|
| 399 |
+
.attr('y', d => yScale(d.avg_floored_score) + 4)
|
| 400 |
+
.attr('text-anchor', d => {
|
| 401 |
+
const xPos = xScale(d.recklessness_index);
|
| 402 |
+
return xPos > innerWidth - 100 ? 'end' : 'start';
|
| 403 |
+
})
|
| 404 |
+
.text(d => d.name);
|
| 405 |
+
}
|
| 406 |
+
|
| 407 |
+
// Initialize
|
| 408 |
+
fetch(DATA_URL, { cache: 'no-cache' })
|
| 409 |
+
.then(r => r.json())
|
| 410 |
+
.then(json => {
|
| 411 |
+
data = json;
|
| 412 |
+
render();
|
| 413 |
+
})
|
| 414 |
+
.catch(err => {
|
| 415 |
+
const pre = document.createElement('pre');
|
| 416 |
+
pre.style.color = 'red';
|
| 417 |
+
pre.style.padding = '16px';
|
| 418 |
+
pre.textContent = `Error loading data: ${err.message}`;
|
| 419 |
+
container.appendChild(pre);
|
| 420 |
+
});
|
| 421 |
+
|
| 422 |
+
// Resize handling
|
| 423 |
+
if (window.ResizeObserver) {
|
| 424 |
+
new ResizeObserver(() => render()).observe(container);
|
| 425 |
+
} else {
|
| 426 |
+
window.addEventListener('resize', render);
|
| 427 |
+
}
|
| 428 |
+
|
| 429 |
+
// Theme change handling
|
| 430 |
+
const observer = new MutationObserver(() => render());
|
| 431 |
+
observer.observe(document.documentElement, {
|
| 432 |
+
attributes: true,
|
| 433 |
+
attributeFilter: ['data-theme']
|
| 434 |
+
});
|
| 435 |
+
};
|
| 436 |
+
|
| 437 |
+
if (document.readyState === 'loading') {
|
| 438 |
+
document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
|
| 439 |
+
} else {
|
| 440 |
+
ensureD3(bootstrap);
|
| 441 |
+
}
|
| 442 |
+
})();
|
| 443 |
+
</script>
|
app/src/content/embeds/tokens-by-turn.html
ADDED
|
@@ -0,0 +1,487 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<div class="d3-tokens-by-turn"></div>
|
| 2 |
+
<style>
|
| 3 |
+
.d3-tokens-by-turn {
|
| 4 |
+
width: 100%;
|
| 5 |
+
margin: 10px 0;
|
| 6 |
+
position: relative;
|
| 7 |
+
font-family: system-ui, -apple-system, sans-serif;
|
| 8 |
+
}
|
| 9 |
+
|
| 10 |
+
.d3-tokens-by-turn svg {
|
| 11 |
+
display: block;
|
| 12 |
+
width: 100%;
|
| 13 |
+
height: auto;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
.d3-tokens-by-turn .axes path,
|
| 17 |
+
.d3-tokens-by-turn .axes line {
|
| 18 |
+
stroke: var(--axis-color, var(--text-color));
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
.d3-tokens-by-turn .axes text {
|
| 22 |
+
fill: var(--tick-color, var(--muted-color));
|
| 23 |
+
font-size: 11px;
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
.d3-tokens-by-turn .grid line {
|
| 27 |
+
stroke: var(--grid-color, rgba(0,0,0,.08));
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
.d3-tokens-by-turn .axes text.axis-label {
|
| 31 |
+
font-size: 14px;
|
| 32 |
+
font-weight: 500;
|
| 33 |
+
fill: var(--text-color);
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
.d3-tokens-by-turn .x-axis text {
|
| 37 |
+
transform: translateY(4px);
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
.d3-tokens-by-turn .tokens-line {
|
| 41 |
+
fill: none;
|
| 42 |
+
stroke-width: 1.5;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
.d3-tokens-by-turn .data-point {
|
| 46 |
+
cursor: pointer;
|
| 47 |
+
transition: opacity 0.15s ease;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
.d3-tokens-by-turn .data-point:hover {
|
| 51 |
+
opacity: 0.8;
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
.d3-tokens-by-turn .legend {
|
| 55 |
+
font-size: 11px;
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
.d3-tokens-by-turn .legend-item {
|
| 59 |
+
cursor: pointer;
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
.d3-tokens-by-turn .legend-item.dimmed .legend-line,
|
| 63 |
+
.d3-tokens-by-turn .legend-item.dimmed .legend-marker {
|
| 64 |
+
opacity: 0.3;
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
.d3-tokens-by-turn .legend-item.dimmed text {
|
| 68 |
+
opacity: 0.4;
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
.d3-tokens-by-turn .legend-text {
|
| 72 |
+
fill: var(--text-color);
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
.d3-tokens-by-turn .d3-tooltip {
|
| 76 |
+
position: absolute;
|
| 77 |
+
top: 0;
|
| 78 |
+
left: 0;
|
| 79 |
+
transform: translate(-9999px, -9999px);
|
| 80 |
+
pointer-events: none;
|
| 81 |
+
padding: 10px 12px;
|
| 82 |
+
border-radius: 8px;
|
| 83 |
+
font-size: 12px;
|
| 84 |
+
line-height: 1.4;
|
| 85 |
+
border: 1px solid var(--border-color);
|
| 86 |
+
background: var(--surface-bg);
|
| 87 |
+
color: var(--text-color);
|
| 88 |
+
box-shadow: 0 4px 24px rgba(0,0,0,.18);
|
| 89 |
+
opacity: 0;
|
| 90 |
+
transition: opacity 0.12s ease;
|
| 91 |
+
z-index: 10;
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
.d3-tokens-by-turn .d3-tooltip .model-name {
|
| 95 |
+
font-weight: 600;
|
| 96 |
+
margin-bottom: 4px;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
.d3-tokens-by-turn .d3-tooltip .metric {
|
| 100 |
+
display: flex;
|
| 101 |
+
justify-content: space-between;
|
| 102 |
+
gap: 16px;
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
.d3-tokens-by-turn .d3-tooltip .metric-label {
|
| 106 |
+
color: var(--muted-color);
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
.d3-tokens-by-turn .d3-tooltip .metric-value {
|
| 110 |
+
font-weight: 500;
|
| 111 |
+
}
|
| 112 |
+
</style>
|
| 113 |
+
<script>
|
| 114 |
+
(() => {
|
| 115 |
+
const ensureD3 = (cb) => {
|
| 116 |
+
if (window.d3 && typeof window.d3.select === 'function') return cb();
|
| 117 |
+
let s = document.getElementById('d3-cdn-script');
|
| 118 |
+
if (!s) {
|
| 119 |
+
s = document.createElement('script');
|
| 120 |
+
s.id = 'd3-cdn-script';
|
| 121 |
+
s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
|
| 122 |
+
document.head.appendChild(s);
|
| 123 |
+
}
|
| 124 |
+
const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); };
|
| 125 |
+
s.addEventListener('load', onReady, { once: true });
|
| 126 |
+
if (window.d3) onReady();
|
| 127 |
+
};
|
| 128 |
+
|
| 129 |
+
const bootstrap = () => {
|
| 130 |
+
const scriptEl = document.currentScript;
|
| 131 |
+
let container = scriptEl ? scriptEl.previousElementSibling : null;
|
| 132 |
+
if (!(container && container.classList && container.classList.contains('d3-tokens-by-turn'))) {
|
| 133 |
+
const candidates = Array.from(document.querySelectorAll('.d3-tokens-by-turn'))
|
| 134 |
+
.filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
|
| 135 |
+
container = candidates[candidates.length - 1] || null;
|
| 136 |
+
}
|
| 137 |
+
if (!container) return;
|
| 138 |
+
if (container.dataset) {
|
| 139 |
+
if (container.dataset.mounted === 'true') return;
|
| 140 |
+
container.dataset.mounted = 'true';
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
// Tooltip setup
|
| 144 |
+
container.style.position = container.style.position || 'relative';
|
| 145 |
+
const tip = document.createElement('div');
|
| 146 |
+
tip.className = 'd3-tooltip';
|
| 147 |
+
container.appendChild(tip);
|
| 148 |
+
|
| 149 |
+
// SVG setup
|
| 150 |
+
const svg = d3.select(container).append('svg');
|
| 151 |
+
const gRoot = svg.append('g');
|
| 152 |
+
|
| 153 |
+
// Chart groups (order matters for layering)
|
| 154 |
+
const gGrid = gRoot.append('g').attr('class', 'grid');
|
| 155 |
+
const gLines = gRoot.append('g').attr('class', 'lines');
|
| 156 |
+
const gPoints = gRoot.append('g').attr('class', 'points');
|
| 157 |
+
const gAxes = gRoot.append('g').attr('class', 'axes');
|
| 158 |
+
const gLegend = gRoot.append('g').attr('class', 'legend');
|
| 159 |
+
|
| 160 |
+
// State
|
| 161 |
+
let data = null;
|
| 162 |
+
let width = 800;
|
| 163 |
+
let height = 450;
|
| 164 |
+
const margin = { top: 20, right: 180, bottom: 56, left: 72 };
|
| 165 |
+
let hiddenModels = new Set();
|
| 166 |
+
|
| 167 |
+
// Scales
|
| 168 |
+
const xScale = d3.scaleLinear();
|
| 169 |
+
const yScale = d3.scaleLinear();
|
| 170 |
+
|
| 171 |
+
// Line generator
|
| 172 |
+
const line = d3.line()
|
| 173 |
+
.x(d => xScale(d.turn_number))
|
| 174 |
+
.y(d => yScale(d.avg_output_tokens));
|
| 175 |
+
|
| 176 |
+
// Data loading
|
| 177 |
+
const DATA_URL = '/data/tokens_by_turn.json';
|
| 178 |
+
|
| 179 |
+
function updateSize() {
|
| 180 |
+
width = container.clientWidth || 800;
|
| 181 |
+
height = Math.max(350, Math.round(width * 0.5));
|
| 182 |
+
svg.attr('width', width).attr('height', height).attr('viewBox', `0 0 ${width} ${height}`);
|
| 183 |
+
gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
|
| 184 |
+
return {
|
| 185 |
+
innerWidth: width - margin.left - margin.right,
|
| 186 |
+
innerHeight: height - margin.top - margin.bottom
|
| 187 |
+
};
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
function showTooltip(event, d, model) {
|
| 191 |
+
const rect = container.getBoundingClientRect();
|
| 192 |
+
const x = event.clientX - rect.left;
|
| 193 |
+
const y = event.clientY - rect.top;
|
| 194 |
+
|
| 195 |
+
tip.innerHTML = `
|
| 196 |
+
<div class="model-name" style="color: ${model.color}">${model.name}</div>
|
| 197 |
+
<div class="metric">
|
| 198 |
+
<span class="metric-label">Turn:</span>
|
| 199 |
+
<span class="metric-value">${d.turn_number}</span>
|
| 200 |
+
</div>
|
| 201 |
+
<div class="metric">
|
| 202 |
+
<span class="metric-label">Avg tokens:</span>
|
| 203 |
+
<span class="metric-value">${Math.round(d.avg_output_tokens).toLocaleString()}</span>
|
| 204 |
+
</div>
|
| 205 |
+
<div class="metric">
|
| 206 |
+
<span class="metric-label">Sample size:</span>
|
| 207 |
+
<span class="metric-value">${d.sample_count}</span>
|
| 208 |
+
</div>
|
| 209 |
+
`;
|
| 210 |
+
|
| 211 |
+
const tipWidth = tip.offsetWidth || 150;
|
| 212 |
+
const tipHeight = tip.offsetHeight || 100;
|
| 213 |
+
let tipX = x + 12;
|
| 214 |
+
let tipY = y - tipHeight / 2;
|
| 215 |
+
|
| 216 |
+
if (tipX + tipWidth > width) tipX = x - tipWidth - 12;
|
| 217 |
+
if (tipY < 0) tipY = 8;
|
| 218 |
+
if (tipY + tipHeight > height) tipY = height - tipHeight - 8;
|
| 219 |
+
|
| 220 |
+
tip.style.transform = `translate(${tipX}px, ${tipY}px)`;
|
| 221 |
+
tip.style.opacity = '1';
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
function hideTooltip() {
|
| 225 |
+
tip.style.opacity = '0';
|
| 226 |
+
tip.style.transform = 'translate(-9999px, -9999px)';
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
function toggleModel(modelName) {
|
| 230 |
+
if (hiddenModels.has(modelName)) {
|
| 231 |
+
hiddenModels.delete(modelName);
|
| 232 |
+
} else {
|
| 233 |
+
hiddenModels.add(modelName);
|
| 234 |
+
}
|
| 235 |
+
render();
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
// Helper function to create a 5-point star path
|
| 239 |
+
const starPath = (cx, cy, outerR, innerR) => {
|
| 240 |
+
const points = [];
|
| 241 |
+
for (let i = 0; i < 10; i++) {
|
| 242 |
+
const r = i % 2 === 0 ? outerR : innerR;
|
| 243 |
+
const angle = (Math.PI / 2) + (i * Math.PI / 5);
|
| 244 |
+
points.push([cx + r * Math.cos(angle), cy - r * Math.sin(angle)]);
|
| 245 |
+
}
|
| 246 |
+
return 'M' + points.map(p => p.join(',')).join('L') + 'Z';
|
| 247 |
+
};
|
| 248 |
+
|
| 249 |
+
function render() {
|
| 250 |
+
if (!data) return;
|
| 251 |
+
|
| 252 |
+
const { innerWidth, innerHeight } = updateSize();
|
| 253 |
+
const models = data.models;
|
| 254 |
+
|
| 255 |
+
// Find visible models and compute extents
|
| 256 |
+
const visibleModels = models.filter(m => !hiddenModels.has(m.name));
|
| 257 |
+
|
| 258 |
+
// X scale: turn number 1-30
|
| 259 |
+
xScale
|
| 260 |
+
.domain([1, 30])
|
| 261 |
+
.range([0, innerWidth]);
|
| 262 |
+
|
| 263 |
+
// Y scale: find max tokens across visible models
|
| 264 |
+
let maxTokens = 0;
|
| 265 |
+
visibleModels.forEach(m => {
|
| 266 |
+
m.tokens_by_turn.forEach(t => {
|
| 267 |
+
if (t.avg_output_tokens > maxTokens) maxTokens = t.avg_output_tokens;
|
| 268 |
+
});
|
| 269 |
+
});
|
| 270 |
+
maxTokens = Math.ceil(maxTokens / 2000) * 2000; // Round up to nearest 2000
|
| 271 |
+
|
| 272 |
+
yScale
|
| 273 |
+
.domain([0, maxTokens])
|
| 274 |
+
.range([innerHeight, 0]);
|
| 275 |
+
|
| 276 |
+
// Grid lines
|
| 277 |
+
const xTicks = d3.range(5, 31, 5); // 5, 10, 15, 20, 25, 30
|
| 278 |
+
const yTicks = yScale.ticks(6);
|
| 279 |
+
|
| 280 |
+
gGrid.selectAll('.grid-x')
|
| 281 |
+
.data(xTicks)
|
| 282 |
+
.join('line')
|
| 283 |
+
.attr('class', 'grid-x')
|
| 284 |
+
.attr('x1', d => xScale(d))
|
| 285 |
+
.attr('x2', d => xScale(d))
|
| 286 |
+
.attr('y1', 0)
|
| 287 |
+
.attr('y2', innerHeight);
|
| 288 |
+
|
| 289 |
+
gGrid.selectAll('.grid-y')
|
| 290 |
+
.data(yTicks)
|
| 291 |
+
.join('line')
|
| 292 |
+
.attr('class', 'grid-y')
|
| 293 |
+
.attr('x1', 0)
|
| 294 |
+
.attr('x2', innerWidth)
|
| 295 |
+
.attr('y1', d => yScale(d))
|
| 296 |
+
.attr('y2', d => yScale(d));
|
| 297 |
+
|
| 298 |
+
// Axes
|
| 299 |
+
const tickSize = 6;
|
| 300 |
+
|
| 301 |
+
gAxes.selectAll('.x-axis')
|
| 302 |
+
.data([0])
|
| 303 |
+
.join('g')
|
| 304 |
+
.attr('class', 'x-axis')
|
| 305 |
+
.attr('transform', `translate(0,${innerHeight})`)
|
| 306 |
+
.call(d3.axisBottom(xScale)
|
| 307 |
+
.tickValues([1, 5, 10, 15, 20, 25, 30])
|
| 308 |
+
.tickSizeInner(-tickSize)
|
| 309 |
+
.tickSizeOuter(0));
|
| 310 |
+
|
| 311 |
+
gAxes.selectAll('.y-axis')
|
| 312 |
+
.data([0])
|
| 313 |
+
.join('g')
|
| 314 |
+
.attr('class', 'y-axis')
|
| 315 |
+
.call(d3.axisLeft(yScale)
|
| 316 |
+
.ticks(6)
|
| 317 |
+
.tickFormat(d => d >= 1000 ? `${d/1000}k` : d)
|
| 318 |
+
.tickSizeInner(-tickSize)
|
| 319 |
+
.tickSizeOuter(0));
|
| 320 |
+
|
| 321 |
+
// Axis labels
|
| 322 |
+
gAxes.selectAll('.x-label')
|
| 323 |
+
.data([0])
|
| 324 |
+
.join('text')
|
| 325 |
+
.attr('class', 'x-label axis-label')
|
| 326 |
+
.attr('x', innerWidth / 2)
|
| 327 |
+
.attr('y', innerHeight + 44)
|
| 328 |
+
.attr('text-anchor', 'middle')
|
| 329 |
+
.text('Turn Number');
|
| 330 |
+
|
| 331 |
+
gAxes.selectAll('.y-label')
|
| 332 |
+
.data([0])
|
| 333 |
+
.join('text')
|
| 334 |
+
.attr('class', 'y-label axis-label')
|
| 335 |
+
.attr('x', -innerHeight / 2)
|
| 336 |
+
.attr('y', -52)
|
| 337 |
+
.attr('text-anchor', 'middle')
|
| 338 |
+
.attr('transform', 'rotate(-90)')
|
| 339 |
+
.text('Average Output Tokens');
|
| 340 |
+
|
| 341 |
+
// Lines for each model
|
| 342 |
+
gLines.selectAll('.tokens-line')
|
| 343 |
+
.data(visibleModels, d => d.name)
|
| 344 |
+
.join('path')
|
| 345 |
+
.attr('class', 'tokens-line')
|
| 346 |
+
.attr('d', d => line(d.tokens_by_turn))
|
| 347 |
+
.attr('stroke', d => d.color)
|
| 348 |
+
.attr('stroke-dasharray', d => d.is_open ? '6,3' : 'none');
|
| 349 |
+
|
| 350 |
+
// Data points
|
| 351 |
+
const allPoints = visibleModels.flatMap(model =>
|
| 352 |
+
model.tokens_by_turn.map(p => ({ ...p, model }))
|
| 353 |
+
);
|
| 354 |
+
const closedPoints = allPoints.filter(d => !d.model.is_open);
|
| 355 |
+
const openPoints = allPoints.filter(d => d.model.is_open);
|
| 356 |
+
|
| 357 |
+
// Circles for closed models
|
| 358 |
+
gPoints.selectAll('.data-point-circle')
|
| 359 |
+
.data(closedPoints, d => `${d.model.name}-${d.turn_number}`)
|
| 360 |
+
.join('circle')
|
| 361 |
+
.attr('class', 'data-point data-point-circle')
|
| 362 |
+
.attr('cx', d => xScale(d.turn_number))
|
| 363 |
+
.attr('cy', d => yScale(d.avg_output_tokens))
|
| 364 |
+
.attr('r', 3)
|
| 365 |
+
.attr('fill', d => d.model.color)
|
| 366 |
+
.attr('stroke', 'var(--surface-bg, white)')
|
| 367 |
+
.attr('stroke-width', 1)
|
| 368 |
+
.on('mouseenter', (event, d) => showTooltip(event, d, d.model))
|
| 369 |
+
.on('mousemove', (event, d) => showTooltip(event, d, d.model))
|
| 370 |
+
.on('mouseleave', hideTooltip);
|
| 371 |
+
|
| 372 |
+
// Stars for open models
|
| 373 |
+
gPoints.selectAll('.data-point-star')
|
| 374 |
+
.data(openPoints, d => `${d.model.name}-${d.turn_number}`)
|
| 375 |
+
.join('path')
|
| 376 |
+
.attr('class', 'data-point data-point-star')
|
| 377 |
+
.attr('d', d => starPath(
|
| 378 |
+
xScale(d.turn_number),
|
| 379 |
+
yScale(d.avg_output_tokens),
|
| 380 |
+
5, 2.2
|
| 381 |
+
))
|
| 382 |
+
.attr('fill', d => d.model.color)
|
| 383 |
+
.attr('stroke', 'var(--surface-bg, white)')
|
| 384 |
+
.attr('stroke-width', 0.6)
|
| 385 |
+
.on('mouseenter', (event, d) => showTooltip(event, d, d.model))
|
| 386 |
+
.on('mousemove', (event, d) => showTooltip(event, d, d.model))
|
| 387 |
+
.on('mouseleave', hideTooltip);
|
| 388 |
+
|
| 389 |
+
// Legend
|
| 390 |
+
const legendX = innerWidth + 16;
|
| 391 |
+
const legendItemHeight = 20;
|
| 392 |
+
|
| 393 |
+
gLegend.selectAll('.legend-item')
|
| 394 |
+
.data(models, d => d.name)
|
| 395 |
+
.join('g')
|
| 396 |
+
.attr('class', d => `legend-item ${hiddenModels.has(d.name) ? 'dimmed' : ''}`)
|
| 397 |
+
.attr('transform', (d, i) => `translate(${legendX}, ${i * legendItemHeight})`)
|
| 398 |
+
.each(function(d) {
|
| 399 |
+
const g = d3.select(this);
|
| 400 |
+
g.selectAll('*').remove();
|
| 401 |
+
|
| 402 |
+
// Line segment
|
| 403 |
+
g.append('line')
|
| 404 |
+
.attr('class', 'legend-line')
|
| 405 |
+
.attr('x1', 0)
|
| 406 |
+
.attr('x2', 20)
|
| 407 |
+
.attr('y1', 0)
|
| 408 |
+
.attr('y2', 0)
|
| 409 |
+
.attr('stroke', d.color)
|
| 410 |
+
.attr('stroke-width', 1.5)
|
| 411 |
+
.attr('stroke-dasharray', d.is_open ? '4,2' : 'none');
|
| 412 |
+
|
| 413 |
+
// Marker - circle for closed, star for open
|
| 414 |
+
if (d.is_open) {
|
| 415 |
+
g.append('path')
|
| 416 |
+
.attr('class', 'legend-marker')
|
| 417 |
+
.attr('d', starPath(10, 0, 5, 2.2))
|
| 418 |
+
.attr('fill', d.color);
|
| 419 |
+
} else {
|
| 420 |
+
g.append('circle')
|
| 421 |
+
.attr('class', 'legend-marker')
|
| 422 |
+
.attr('cx', 10)
|
| 423 |
+
.attr('cy', 0)
|
| 424 |
+
.attr('r', 3)
|
| 425 |
+
.attr('fill', d.color);
|
| 426 |
+
}
|
| 427 |
+
|
| 428 |
+
g.append('text')
|
| 429 |
+
.attr('class', 'legend-text')
|
| 430 |
+
.attr('x', 26)
|
| 431 |
+
.attr('y', 4)
|
| 432 |
+
.text(d.name);
|
| 433 |
+
|
| 434 |
+
g.style('cursor', 'pointer')
|
| 435 |
+
.on('click', () => toggleModel(d.name));
|
| 436 |
+
});
|
| 437 |
+
|
| 438 |
+
// Legend note about line styles
|
| 439 |
+
const noteY = models.length * legendItemHeight + 12;
|
| 440 |
+
gLegend.selectAll('.legend-note')
|
| 441 |
+
.data([0])
|
| 442 |
+
.join('text')
|
| 443 |
+
.attr('class', 'legend-note')
|
| 444 |
+
.attr('x', legendX)
|
| 445 |
+
.attr('y', noteY)
|
| 446 |
+
.attr('font-size', '10px')
|
| 447 |
+
.attr('fill', 'var(--muted-color)')
|
| 448 |
+
.text('Solid = Closed, Dashed = Open');
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
// Initialize
|
| 452 |
+
fetch(DATA_URL, { cache: 'no-cache' })
|
| 453 |
+
.then(r => r.json())
|
| 454 |
+
.then(json => {
|
| 455 |
+
data = json;
|
| 456 |
+
render();
|
| 457 |
+
})
|
| 458 |
+
.catch(err => {
|
| 459 |
+
const pre = document.createElement('pre');
|
| 460 |
+
pre.style.color = 'red';
|
| 461 |
+
pre.style.padding = '16px';
|
| 462 |
+
pre.textContent = `Error loading data: ${err.message}`;
|
| 463 |
+
container.appendChild(pre);
|
| 464 |
+
});
|
| 465 |
+
|
| 466 |
+
// Resize handling
|
| 467 |
+
if (window.ResizeObserver) {
|
| 468 |
+
new ResizeObserver(() => render()).observe(container);
|
| 469 |
+
} else {
|
| 470 |
+
window.addEventListener('resize', render);
|
| 471 |
+
}
|
| 472 |
+
|
| 473 |
+
// Theme change handling
|
| 474 |
+
const observer = new MutationObserver(() => render());
|
| 475 |
+
observer.observe(document.documentElement, {
|
| 476 |
+
attributes: true,
|
| 477 |
+
attributeFilter: ['data-theme']
|
| 478 |
+
});
|
| 479 |
+
};
|
| 480 |
+
|
| 481 |
+
if (document.readyState === 'loading') {
|
| 482 |
+
document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
|
| 483 |
+
} else {
|
| 484 |
+
ensureD3(bootstrap);
|
| 485 |
+
}
|
| 486 |
+
})();
|
| 487 |
+
</script>
|
app/src/styles/_layout.css
CHANGED
|
@@ -195,4 +195,30 @@
|
|
| 195 |
width: 100%;
|
| 196 |
min-width: 0;
|
| 197 |
}
|
| 198 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
width: 100%;
|
| 196 |
min-width: 0;
|
| 197 |
}
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
/* ============================================================================ */
|
| 201 |
+
/* Bibliography/References - hide inline sections that should be in footer */
|
| 202 |
+
/* ---------------------------------------------------------------------------- */
|
| 203 |
+
/* References sections with data-built-refs are generated per-chapter by
|
| 204 |
+
rehype-citation. The Footer.astro script consolidates them into the footer.
|
| 205 |
+
These styles ensure inline refs don't display in the main content area. */
|
| 206 |
+
|
| 207 |
+
/* References in main content should not be displayed - they belong in footer.
|
| 208 |
+
The Footer.astro script moves them; this CSS is a visual safeguard. */
|
| 209 |
+
main [data-built-refs],
|
| 210 |
+
main #references:not(.footer-processed),
|
| 211 |
+
main section.references:not(ol),
|
| 212 |
+
main div.references:not(ol),
|
| 213 |
+
main .bibliography:not(ol) {
|
| 214 |
+
/* Collapse to zero height to prevent layout impact, but keep in DOM for JS */
|
| 215 |
+
max-height: 0;
|
| 216 |
+
overflow: hidden;
|
| 217 |
+
margin: 0 !important;
|
| 218 |
+
padding: 0 !important;
|
| 219 |
+
border: none !important;
|
| 220 |
+
opacity: 0;
|
| 221 |
+
pointer-events: none;
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
/* Once moved to footer, these styles don't apply (not inside main) */
|
bibliography_fix.md
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Bibliography System Fix for Multi-Chapter MDX Articles
|
| 2 |
+
|
| 3 |
+
This document describes changes made to the [Research Article Template](https://huggingface.co/spaces/tfrere/research-article-template) to fix bibliography/references placement when using multiple MDX chapter files.
|
| 4 |
+
|
| 5 |
+
## The Problem
|
| 6 |
+
|
| 7 |
+
When an article is split into multiple MDX chapter files (e.g., `introduction.mdx`, `results.mdx`, etc.) that are imported into a main `article.mdx`, the bibliography appears at the end of each chapter instead of being consolidated once in the footer.
|
| 8 |
+
|
| 9 |
+
### Root Cause
|
| 10 |
+
|
| 11 |
+
Astro compiles each MDX file independently through the remark/rehype pipeline. The `rehype-citation` plugin appends a `<section id="references">` to the end of **every** MDX file that contains citations.
|
| 12 |
+
|
| 13 |
+
This causes two issues:
|
| 14 |
+
1. **Duplicate IDs**: Multiple `<section id="references">` elements (invalid HTML)
|
| 15 |
+
2. **Scattered bibliographies**: References appear after each chapter instead of once at the end
|
| 16 |
+
|
| 17 |
+
### Original Template Behavior
|
| 18 |
+
|
| 19 |
+
The original `Footer.astro` only looked for the **first** references section using `findFirstOutsideFooter()`. This worked for single-file articles but failed for multi-chapter structures.
|
| 20 |
+
|
| 21 |
+
## The Solution
|
| 22 |
+
|
| 23 |
+
A three-phase approach: build-time marking, runtime consolidation, and a CSS fallback.
|
| 24 |
+
|
| 25 |
+
### Phase 1: Build-Time (post-citation.mjs)
|
| 26 |
+
|
| 27 |
+
Mark ALL references sections so they can be found at runtime.
|
| 28 |
+
|
| 29 |
+
### Phase 2: Runtime (Footer.astro)
|
| 30 |
+
|
| 31 |
+
Consolidate all marked sections into the footer, merging list items and removing duplicates.
|
| 32 |
+
|
| 33 |
+
### Phase 3: CSS Fallback (_layout.css)
|
| 34 |
+
|
| 35 |
+
Hide any unconsolidated sections as a visual safety net.
|
| 36 |
+
|
| 37 |
+
---
|
| 38 |
+
|
| 39 |
+
## Changes Made
|
| 40 |
+
|
| 41 |
+
### 1. `app/plugins/rehype/post-citation.mjs`
|
| 42 |
+
|
| 43 |
+
**Change**: Find and process ALL references sections, not just the first one.
|
| 44 |
+
|
| 45 |
+
```javascript
|
| 46 |
+
// BEFORE: Only found first section
|
| 47 |
+
const findReferencesRoot = () => {
|
| 48 |
+
let found = null;
|
| 49 |
+
walk(tree, null, (node) => {
|
| 50 |
+
if (found) return; // <-- Stopped after first match
|
| 51 |
+
// ...
|
| 52 |
+
});
|
| 53 |
+
return found;
|
| 54 |
+
};
|
| 55 |
+
|
| 56 |
+
// AFTER: Find ALL sections
|
| 57 |
+
const findAllReferencesRoots = () => {
|
| 58 |
+
const found = [];
|
| 59 |
+
walk(tree, null, (node) => {
|
| 60 |
+
if (!isElement(node)) return;
|
| 61 |
+
const id = getAttr(node, 'id');
|
| 62 |
+
if (id === 'references' || hasClass(node, 'references') || hasClass(node, 'bibliography')) {
|
| 63 |
+
if (!found.includes(node)) {
|
| 64 |
+
found.push(node);
|
| 65 |
+
}
|
| 66 |
+
}
|
| 67 |
+
});
|
| 68 |
+
return found;
|
| 69 |
+
};
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
**Change**: Process all sections in a loop and mark each with `data-built-refs`.
|
| 73 |
+
|
| 74 |
+
```javascript
|
| 75 |
+
// BEFORE: Single section processing
|
| 76 |
+
const refsRoot = findReferencesRoot();
|
| 77 |
+
if (refsRoot) {
|
| 78 |
+
// ... process single section
|
| 79 |
+
setAttr(refsRoot, 'data-built-refs', '1');
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
// AFTER: Loop through all sections
|
| 83 |
+
const allRefsRoots = findAllReferencesRoots();
|
| 84 |
+
for (const refsRoot of allRefsRoots) {
|
| 85 |
+
// ... process each section
|
| 86 |
+
setAttr(refsRoot, 'data-built-refs', '1');
|
| 87 |
+
}
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
---
|
| 91 |
+
|
| 92 |
+
### 2. `app/src/components/Footer.astro`
|
| 93 |
+
|
| 94 |
+
**Change**: Add `[data-built-refs]` to selector list (was missing).
|
| 95 |
+
|
| 96 |
+
```javascript
|
| 97 |
+
// BEFORE: Missing the data attribute selector
|
| 98 |
+
const allRefsEls = findAllOutsideFooter([
|
| 99 |
+
"#bibliography-references-list",
|
| 100 |
+
"[data-bibliography-block]", // <-- This doesn't exist
|
| 101 |
+
"#references",
|
| 102 |
+
// ...
|
| 103 |
+
]);
|
| 104 |
+
|
| 105 |
+
// AFTER: Added data-built-refs and improved selector order
|
| 106 |
+
const allRefsEls = findAllOutsideFooter([
|
| 107 |
+
"[data-built-refs]", // <-- Added: what post-citation.mjs actually sets
|
| 108 |
+
"[data-bibliography-block]",
|
| 109 |
+
"#bibliography-references-list",
|
| 110 |
+
"section#references",
|
| 111 |
+
"div#references",
|
| 112 |
+
"#refs",
|
| 113 |
+
".references:not(ol)",
|
| 114 |
+
".bibliography",
|
| 115 |
+
]);
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
**Change**: Improved duplicate detection with CSS.escape fallback.
|
| 119 |
+
|
| 120 |
+
```javascript
|
| 121 |
+
// BEFORE: Could fail if CSS.escape unavailable or ID has special chars
|
| 122 |
+
if (!itemId || !targetOl.querySelector(`#${CSS.escape(itemId)}`)) {
|
| 123 |
+
targetOl.appendChild(item);
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
// AFTER: Robust fallback
|
| 127 |
+
if (itemId) {
|
| 128 |
+
try {
|
| 129 |
+
const escapedId = CSS.escape ? CSS.escape(itemId) : itemId.replace(/([^\w-])/g, '\\$1');
|
| 130 |
+
if (targetOl.querySelector(`#${escapedId}`)) {
|
| 131 |
+
return; // Skip duplicate
|
| 132 |
+
}
|
| 133 |
+
} catch (e) {
|
| 134 |
+
// Manual check if selector fails
|
| 135 |
+
const existing = Array.from(targetOl.querySelectorAll('li')).find(li => li.id === itemId);
|
| 136 |
+
if (existing) return;
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
+
targetOl.appendChild(item);
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
**Change**: Added MutationObserver to catch dynamically rendered content.
|
| 143 |
+
|
| 144 |
+
```javascript
|
| 145 |
+
// Watch for dynamically added content (e.g., lazy-loaded components)
|
| 146 |
+
const observer = new MutationObserver((mutations) => {
|
| 147 |
+
if (footer.dataset.processed !== "true") {
|
| 148 |
+
attemptMove();
|
| 149 |
+
} else {
|
| 150 |
+
// Check if any new references sections were added
|
| 151 |
+
for (const mutation of mutations) {
|
| 152 |
+
for (const node of mutation.addedNodes) {
|
| 153 |
+
if (node.nodeType === 1) {
|
| 154 |
+
const el = node;
|
| 155 |
+
if (
|
| 156 |
+
el.id === "references" ||
|
| 157 |
+
el.classList?.contains("references") ||
|
| 158 |
+
el.hasAttribute?.("data-built-refs")
|
| 159 |
+
) {
|
| 160 |
+
footer.dataset.processed = "false";
|
| 161 |
+
attemptMove();
|
| 162 |
+
return;
|
| 163 |
+
}
|
| 164 |
+
}
|
| 165 |
+
}
|
| 166 |
+
}
|
| 167 |
+
}
|
| 168 |
+
});
|
| 169 |
+
|
| 170 |
+
if (contentRoot) {
|
| 171 |
+
observer.observe(contentRoot, { childList: true, subtree: true });
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
// Stop observing after page is fully loaded
|
| 175 |
+
window.addEventListener("load", () => {
|
| 176 |
+
setTimeout(() => observer.disconnect(), 2000);
|
| 177 |
+
}, { once: true });
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
+
---
|
| 181 |
+
|
| 182 |
+
### 3. `app/src/styles/_layout.css`
|
| 183 |
+
|
| 184 |
+
**Change**: Added CSS to hide any inline references sections that weren't consolidated.
|
| 185 |
+
|
| 186 |
+
```css
|
| 187 |
+
/* Bibliography/References - hide inline sections that should be in footer */
|
| 188 |
+
/* These styles ensure inline refs don't display in the main content area. */
|
| 189 |
+
|
| 190 |
+
main [data-built-refs],
|
| 191 |
+
main #references:not(.footer-processed),
|
| 192 |
+
main section.references:not(ol),
|
| 193 |
+
main div.references:not(ol),
|
| 194 |
+
main .bibliography:not(ol) {
|
| 195 |
+
/* Collapse to zero height to prevent layout impact, but keep in DOM for JS */
|
| 196 |
+
max-height: 0;
|
| 197 |
+
overflow: hidden;
|
| 198 |
+
margin: 0 !important;
|
| 199 |
+
padding: 0 !important;
|
| 200 |
+
border: none !important;
|
| 201 |
+
opacity: 0;
|
| 202 |
+
pointer-events: none;
|
| 203 |
+
}
|
| 204 |
+
```
|
| 205 |
+
|
| 206 |
+
---
|
| 207 |
+
|
| 208 |
+
## How It Works Now
|
| 209 |
+
|
| 210 |
+
1. **Build time**: Each MDX chapter is compiled. `rehype-citation` adds a bibliography section to each. `post-citation.mjs` marks ALL of them with `data-built-refs="1"`.
|
| 211 |
+
|
| 212 |
+
2. **Page load**: `Footer.astro` JavaScript runs:
|
| 213 |
+
- Finds all elements with `[data-built-refs]` or other bibliography selectors
|
| 214 |
+
- Moves the first section to the footer
|
| 215 |
+
- Extracts `<li>` items from subsequent sections and appends to the consolidated list
|
| 216 |
+
- Skips duplicates (same ID)
|
| 217 |
+
- Removes empty leftover sections
|
| 218 |
+
|
| 219 |
+
3. **Visual fallback**: CSS hides any sections that might remain in the main content (timing edge cases).
|
| 220 |
+
|
| 221 |
+
---
|
| 222 |
+
|
| 223 |
+
## Testing
|
| 224 |
+
|
| 225 |
+
1. Run `npm run dev` and open the article
|
| 226 |
+
2. Scroll to the footer - all references should appear there
|
| 227 |
+
3. Open browser dev tools:
|
| 228 |
+
- Search for `data-built-refs` - should only exist in footer
|
| 229 |
+
- Check that no `#references` sections remain in `<main>`
|
| 230 |
+
4. Click citation links - should scroll to footer references
|
| 231 |
+
|
| 232 |
+
---
|
| 233 |
+
|
| 234 |
+
## Files Modified
|
| 235 |
+
|
| 236 |
+
| File | Change |
|
| 237 |
+
|------|--------|
|
| 238 |
+
| `app/plugins/rehype/post-citation.mjs` | Find and mark ALL references sections |
|
| 239 |
+
| `app/src/components/Footer.astro` | Improved selectors, robust deduplication, MutationObserver |
|
| 240 |
+
| `app/src/styles/_layout.css` | CSS fallback to hide unconsolidated sections |
|
| 241 |
+
|
| 242 |
+
---
|
| 243 |
+
|
| 244 |
+
## Upstream Contribution
|
| 245 |
+
|
| 246 |
+
These changes could be contributed back to the original template. The fix is backward-compatible:
|
| 247 |
+
- Single-file articles work exactly as before
|
| 248 |
+
- Multi-chapter articles now work correctly
|
| 249 |
+
- No configuration changes needed
|
interactive-charts.md
CHANGED
|
@@ -409,9 +409,11 @@ For frameless embedding (like the banner):
|
|
| 409 |
| 3 | `confidence_distribution.json` | Grouped histogram | Done (confidence-distribution.html) |
|
| 410 |
| 4 | `score_vs_failed_guesses.json` | Scatter | TODO |
|
| 411 |
| 5 | `excess_caution.json` | Box plot | TODO |
|
|
|
|
| 412 |
| 6 | `caution_vs_failed_guesses.json` | Scatter | Done (caution-vs-failed-guesses.html) |
|
| 413 |
| 7 | `by_rule.json` | Strip plot | Done (by-rule.html) |
|
| 414 |
| 8 | `complexity_analysis.json` | Heatmap | Done (complexity-analysis.html) |
|
|
|
|
| 415 |
|
| 416 |
## Testing
|
| 417 |
|
|
|
|
| 409 |
| 3 | `confidence_distribution.json` | Grouped histogram | Done (confidence-distribution.html) |
|
| 410 |
| 4 | `score_vs_failed_guesses.json` | Scatter | TODO |
|
| 411 |
| 5 | `excess_caution.json` | Box plot | TODO |
|
| 412 |
+
| 5b | `tokens_by_turn.json` | Multi-line | Done (tokens-by-turn.html) |
|
| 413 |
| 6 | `caution_vs_failed_guesses.json` | Scatter | Done (caution-vs-failed-guesses.html) |
|
| 414 |
| 7 | `by_rule.json` | Strip plot | Done (by-rule.html) |
|
| 415 |
| 8 | `complexity_analysis.json` | Heatmap | Done (complexity-analysis.html) |
|
| 416 |
+
| 9 | `complexity_ratio.json` | Horizontal dot plot | Done (complexity-ratio.html) |
|
| 417 |
|
| 418 |
## Testing
|
| 419 |
|