Improved results and new assessment
- ASSESSMENT_V2.md +197 -0
- app/src/content/assets/data/basic_metrics.csv +2 -2
- app/src/content/assets/data/by_rule.json +2 -2
- app/src/content/assets/data/by_rule.png +2 -2
- app/src/content/assets/data/calibration_curves.json +2 -2
- app/src/content/assets/data/calibration_curves.png +2 -2
- app/src/content/assets/data/caution_vs_failed_guesses.json +2 -2
- app/src/content/assets/data/caution_vs_failed_guesses.png +2 -2
- app/src/content/assets/data/complexity_analysis.json +2 -2
- app/src/content/assets/data/complexity_analysis.png +2 -2
- app/src/content/assets/data/excess_caution.png +0 -3
- app/src/content/assets/data/{confidence_distribution.json → guess_rate.json} +2 -2
- app/src/content/assets/data/{confidence_distribution.png → guess_rate.png} +2 -2
- app/src/content/assets/data/model_claude_haiku_4_5.png +2 -2
- app/src/content/assets/data/model_claude_opus_4_5.png +2 -2
- app/src/content/assets/data/model_deepseek_r1.png +2 -2
- app/src/content/assets/data/model_gemini_3_flash_preview_low.png +2 -2
- app/src/content/assets/data/model_gpt_5_2_high.png +2 -2
- app/src/content/assets/data/model_gpt_5_mini_medium.png +2 -2
- app/src/content/assets/data/model_gpt_oss_120b.png +2 -2
- app/src/content/assets/data/model_gpt_oss_20b.png +2 -2
- app/src/content/assets/data/model_grok_4_1_fast_reasoning.png +2 -2
- app/src/content/assets/data/model_kimi_k2.png +2 -2
- app/src/content/assets/data/overall_performance.json +2 -2
- app/src/content/assets/data/overall_performance.png +2 -2
- app/src/content/assets/data/reckless_guessing.json +2 -2
- app/src/content/assets/data/reckless_guessing.png +2 -2
- app/src/content/assets/data/score_stack.json +2 -2
- app/src/content/assets/data/score_stack.png +2 -2
- app/src/content/assets/data/score_vs_failed_guesses.json +2 -2
- app/src/content/assets/data/score_vs_failed_guesses.png +2 -2
- app/src/content/assets/data/summary.txt +71 -71
- app/src/content/chapters/eleusis/benchmark.mdx +7 -3
- app/src/content/chapters/eleusis/introduction.mdx +2 -2
- app/src/content/chapters/eleusis/results.mdx +49 -85
- app/src/content/embeds/banner.html +6 -6
- app/src/content/embeds/by-rule.html +3 -3
- app/src/content/embeds/{confidence-distribution.html → guess-rate.html} +38 -42
- app/src/content/embeds/overall-performance.html +5 -5
- app/src/content/embeds/score-stack.html +20 -54
- app/src/content/embeds/score-vs-failed-guesses.html +5 -5
ASSESSMENT_V2.md
ADDED
@@ -0,0 +1,197 @@
# Revised Assessment: Eleusis Benchmark Article (v2)

## Executive Summary

The article has improved significantly since the first assessment. The **Results section is now well-structured** with a clear narrative arc: overall performance → the metacognition insight → caution/recklessness trade-off → calibration → performance by rule. The key message about metacognition is now prominent and supported by the logical flow.

The main remaining issues are:
1. **Data inconsistencies** between text and data files (numbers are outdated)
2. **The "Deeper Analysis" section** needs restructuring—much of it now duplicates the improved Results section
3. Minor typos

---

## 1. What's Working Well

### 1.1 Results Section Structure
The new structure is excellent:
```
Results
├── Overall Performance (intro)
├── Pure discovery vs metacognition (the key insight, early!)
├── Caution-Recklessness Trade-off (central analysis)
├── Confidence and Calibration (supporting evidence)
└── Performance by Rule (rule-level breakdown)
```

This addresses the main criticism from v1: readers now build understanding progressively and the metacognition insight is front and center.

### 1.2 Figure Flow
Figures now tell a coherent story:
- Fig 1: Overview (where does each model sit?)
- Fig 2: Score breakdown (what drives score differences?)
- Fig 3: Caution vs recklessness (the key trade-off)
- Fig 4: Calibration (why is timing hard?)
- Fig 5: Guess rate (how do models decide when to commit?)
- Fig 6-7: Rule-level analysis (drill-down)

### 1.3 New Guess Rate Analysis (Figure 5)
This is a valuable addition that wasn't in the original. It shows how models operationalize their confidence into actual decisions, connecting calibration to behavior.

### 1.4 Clear Messaging
Lines like "knowing when to commit is as important as finding the rule" now appear early and are reinforced throughout.

---

## 2. Critical Issues

### 2.1 Data Inconsistencies (Must Fix)

The text still uses outdated numbers. Current data (from `summary.txt` and `overall_performance.json`) vs text:

| Metric | In Text | Actual Data |
|--------|---------|-------------|
| Claude Opus 4.5 avg score | 15.9 (conclusion.mdx:10) | **17.0** (avg_floored_score) |
| Claude Opus 4.5 success rate | 92% (conclusion.mdx:10) | **83%** |
| Claude Haiku 4.5 success rate | 70% (conclusion.mdx:10) | **56%** |
| Claude Haiku 4.5 failed guesses | 7.5/round (analysis.mdx:15) | **3.95/round** |
| Kimi K2 avg score | 14.5 (analysis.mdx:60) | **16.2** |
| GPT OSS 120B score | 12.0 (analysis.mdx:60) | **12.9** |
| GPT 5.2 High early correct turns | 3.6 (multiple places) | **3.56** ✓ (close enough) |

**Action:** Audit all numbers in `results.mdx`, `analysis.mdx`, and `conclusion.mdx` against the latest data files.
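One way to make that audit repeatable is a small script. The sketch below is illustrative, not the project's actual tooling: it assumes `overall_performance.json` exposes the same `models` array (with `name` and `avg_floored_score` fields) that the D3 embeds read, and the regex heuristic will flag false positives for a human to review.

```python
import json
import re
from pathlib import Path

DATA = Path("app/src/content/assets/data")
CHAPTERS = Path("app/src/content/chapters/eleusis")

def canonical_scores():
    # overall_performance.json is assumed to expose the `models` array
    # (with `name` and `avg_floored_score`) that the D3 embeds read.
    perf = json.loads((DATA / "overall_performance.json").read_text())
    return {m["name"]: m["avg_floored_score"] for m in perf["models"]}

def numbers_near(text, needle, window=120):
    # Collect decimal numbers mentioned shortly after each model name.
    hits = []
    for match in re.finditer(re.escape(needle), text):
        snippet = text[match.end():match.end() + window]
        hits += [float(n) for n in re.findall(r"\d+\.\d+", snippet)]
    return hits

scores = canonical_scores()
for mdx in sorted(CHAPTERS.glob("*.mdx")):
    text = mdx.read_text()
    for model, score in scores.items():
        for claimed in numbers_near(text, model):
            if abs(claimed - score) > 0.5:  # loose tolerance; flag for human review
                print(f"{mdx.name}: {model} mentions {claimed}, data says {score:.1f}")
```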
### 2.2 Typos Still Present

| Location | Issue |
|----------|-------|
| results.mdx:20 | "closed to Claude Opus 4.5" → "close to" |
| results.mdx:85 | "overconfident : for instance" → remove space before colon |
| results.mdx:86 | "GPT 5.2 is the best calibrated" → "GPT 5.2 High" |
| results.mdx:102 | "THis is somehow" → "This is somehow" |

---

## 3. The "Deeper Analysis" Section

### 3.1 Current Problem

The "Deeper Analysis" section is now partially redundant. It covers:
1. **Metacognition** (duplicates Results § "Pure discovery vs metacognition")
2. **Learning Curves** (TODO, placeholder)
3. **Failure Modes** (valuable, keep)
4. **Open vs Closed Models** (brief, could be expanded)
5. **Symmetric Rules** (interesting niche finding)
6. **Confirmation Bias** (preliminary, incomplete)
7. **Qualitative Observations** (nice examples, but disconnected)

### 3.2 Recommended Restructure

Rename to "Discussion" and reorganize:

```markdown
## Discussion

### What Explains the Performance Gap?
- Brief synthesis: metacognition > raw ability
- The caution-recklessness trade-off determines ranking more than success rate
- Move the GPT 5.2 High / Claude Opus 4.5 / Claude Haiku 4.5 characterizations here
  (but avoid repeating numbers already in Results)

### Scientific Temperaments
- This is where the "scientific personality" framing could shine
- The Perfectionist (GPT 5.2 High): needs too much evidence
- The Pragmatist (Claude Opus 4.5): good-enough is good enough
- The Gambler (Claude Haiku 4.5): acts on insufficient evidence
- Link to real-world science: these map to actual failure modes in research

### Failure Modes [keep the accordion, it's excellent]
- Already well-written, just tighten the taxonomy

### Open vs Proprietary Models
- Currently too brief (1 paragraph)
- Could expand: why might open models trend reckless? (RLHF differences?)
- Kimi K2's success is notable—worth highlighting more

### Implications for AI-Assisted Science
- Currently in Conclusion but could be expanded here
- An overconfident assistant leads researchers astray
- An overcautious assistant wastes resources
- The calibration problem is particularly concerning

### Move to Appendix (or delete)
- Learning Curves (TODO) → either implement or remove
- Symmetric Rules → niche, move to appendix or cut
- Confirmation Bias → too preliminary, either expand significantly or cut
- Qualitative Observations → keep 1-2 good examples, cut the rest
```

### 3.3 Delete the Redundancy

The current Metacognition subsection (analysis.mdx:7-16) largely repeats what's now better expressed in Results. Either:
- Delete it entirely and rely on Results
- Or transform it into the "Scientific Temperaments" narrative frame (more memorable)

---

## 4. Missing Content (Lower Priority)

### 4.1 TODOs Still Present
- Learning curves figure (analysis.mdx:22) — either implement or remove the placeholder
- Failure mode distribution stacked bar (analysis.mdx:55) — nice to have, not critical

### 4.2 Human Baseline
Still missing. Consider adding a sentence like: "Without human performance data on the same rules, we cannot assess whether these success rates represent strong or weak performance in absolute terms—only that models differ substantially among themselves."

### 4.3 Example Turn Figure
Would still be valuable in benchmark.mdx to make the task concrete. A simple 3-panel showing:
```
[Board state] → [Model reasoning excerpt] → [Decision output]
```

---

## 5. Minor Polish

### 5.1 Model Name Consistency
Some inconsistencies remain:
- "Grok 4.1 Fast Reasoning" vs "Grok 4 1 Fast Reasoning" (in data)
- "DeepSeek R1" vs "Deepseek R1" (in data)
- Decide on one capitalization style and apply consistently

### 5.2 The "floored" Score
The article doesn't explain that scores below 0 are floored to 0. This affects interpretation—might be worth a brief mention in the Benchmark section or a sidenote.

### 5.3 Sidenote on Optimal Threshold
Results.mdx mentions the 0.67 optimal threshold but doesn't explain why. A brief derivation in a sidenote would help:
> For a perfectly calibrated model: guessing now at confidence p risks the 2-point penalty with probability (1−p), while waiting one more turn forfeits 1 point only in the case (probability p) where the current tentative rule is already correct. Guess when (1−p)×2 < p×1, which gives p > 2/3 ≈ 0.67.
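Spelled out, the suggested derivation uses only the two constants of the scoring rule (the 2-point wrong-guess penalty and the 1-point-per-turn decay):

$$\mathbb{E}[\text{loss} \mid \text{guess now}] = 2(1-p), \qquad \mathbb{E}[\text{loss} \mid \text{wait one turn}] = p \cdot 1$$

$$2(1-p) < p \;\Longleftrightarrow\; p > \tfrac{2}{3} \approx 0.67$$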
---

## 6. Summary of Recommended Actions

### Must Do
1. ☐ Fix all data inconsistencies (audit numbers against data files)
2. ☐ Fix typos listed in §2.2
3. ☐ Remove or transform redundant content in "Deeper Analysis"

### Should Do
4. ☐ Rename "Deeper Analysis" → "Discussion"
5. ☐ Restructure Discussion per §3.2
6. ☐ Either implement Learning Curves figure or remove the TODO

### Nice to Have
7. ☐ Add "Scientific Temperaments" framing
8. ☐ Add example turn figure in benchmark.mdx
9. ☐ Explain the score flooring mechanism
10. ☐ Expand Open vs Proprietary discussion

---

## 7. Overall Assessment

**Grade: B+ (up from B-)**

The structural problems identified in v1 are largely resolved. The article now tells a clear story: models vary in their "scientific temperament," and metacognition—knowing when you know—matters as much as raw reasoning ability.

The remaining work is mostly cleanup (data consistency, typos) and deciding what to do with the Deeper Analysis section. The article is close to publication-ready once the numbers are fixed.
app/src/content/assets/data/basic_metrics.csv
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:f67f1217568824b751da562d8106fae602792a64c38abb4b7c8bae75698249c0
+size 2716

app/src/content/assets/data/by_rule.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b9331eb6e257a86e681b479a1538f8c885bfc17aa788e67e508b142c0e2de38f
+size 30754

app/src/content/assets/data/by_rule.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/calibration_curves.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:2db00c0caa1dd76dea08cad41d51e54171853d0a7361bc3b27fac76680310687
+size 9460

app/src/content/assets/data/calibration_curves.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/caution_vs_failed_guesses.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:baed4a816edf6d9d421718d40f785a2bfd3eac4c0f9c33c33655d3a12a76690e
+size 2456

app/src/content/assets/data/caution_vs_failed_guesses.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/complexity_analysis.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5260de9ec24d856da4fbc955b2a1645e00eee76bde6289212ccdd73550594e11
+size 2363

app/src/content/assets/data/complexity_analysis.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/excess_caution.png
DELETED (Git LFS object removed)

app/src/content/assets/data/{confidence_distribution.json → guess_rate.json}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:ef7941e2df9815331f4b02172abf8c14fd147812b4ab5a39cff7e30ccf2e9ac6
+size 10684

app/src/content/assets/data/{confidence_distribution.png → guess_rate.png}
RENAMED
File without changes

app/src/content/assets/data/model_claude_haiku_4_5.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/model_claude_opus_4_5.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/model_deepseek_r1.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/model_gemini_3_flash_preview_low.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/model_gpt_5_2_high.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/model_gpt_5_mini_medium.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/model_gpt_oss_120b.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/model_gpt_oss_20b.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/model_grok_4_1_fast_reasoning.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/model_kimi_k2.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/overall_performance.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6ddb4557b07eab530ae73d9ce849c542f503fc3656166e9b6164034b5cba83bf
+size 2391

app/src/content/assets/data/overall_performance.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/reckless_guessing.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:98cdab33871f6d7dfec15536fa6e67c5f05ecf6ca12ff2db506303b68318ec0b
+size 14795

app/src/content/assets/data/reckless_guessing.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/score_stack.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3e41c2dc2bf2fb303a9a1c2550f9e4f7274812ddbd8a937ed68d7969558f5a1c
+size 2876

app/src/content/assets/data/score_stack.png
CHANGED (Git LFS object updated)

app/src/content/assets/data/score_vs_failed_guesses.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:dca6c8ada5a4856ee2227386c4fee89a4f06901708927b47d9b203a360a3bf52
+size 2294

app/src/content/assets/data/score_vs_failed_guesses.png
CHANGED (Git LFS object updated)
app/src/content/assets/data/summary.txt
CHANGED
@@ -26,16 +26,16 @@ BASIC MODEL COMPARISON
 ============================================================

 model rounds_played total_score avg_score total_floored_score avg_floored_score total_turns total_output_tokens total_wall_clock avg_failed_guesses success_rate total_no_stakes_score avg_no_stakes_score avg_output_tokens_per_turn wall_clock_per_turn intra_rule_variance inter_rule_variance variance_ratio
+Claude Opus 4.5 78 1128 14.461538 1324 16.974359 852 4333716 86367.64 2.000000 0.833333 1598.0 20.487179 5086.521127 101.370469 25.000000 81.385983 0.307178
+Kimi K2 78 804 10.307692 1262 16.179487 975 12281540 101346.76 2.038462 0.769231 1481.0 18.987179 12596.451282 103.945395 25.538462 88.446496 0.288745
+Grok 4 1 Fast Reasoning 78 737 9.448718 1182 15.153846 998 8178655 120364.22 2.564103 0.717949 1441.0 18.474359 8195.045090 120.605431 25.243590 106.499829 0.237029
+Gpt 5.2 High 78 1158 14.846154 1174 15.051282 1205 3341037 73525.83 0.282051 0.948718 1505.0 19.294872 2772.644813 61.017286 24.628205 36.601709 0.672870
+Gpt 5 Mini Medium 78 942 12.076923 1052 13.487179 1261 3618399 58345.97 1.166667 0.705128 1325.0 16.987179 2869.467883 46.269603 39.141026 82.882051 0.472250
+Deepseek R1 78 511 6.551282 1036 13.282051 1104 9229131 165334.16 3.192308 0.641026 1331.0 17.064103 8359.720109 149.759203 29.628205 115.135043 0.257334
+Gemini 3 Flash Preview Low 78 817 10.474359 1024 13.128205 1315 1581524 12702.02 0.961538 0.705128 1226.0 15.717949 1202.679848 9.659331 29.923077 83.049573 0.360304
+Gpt Oss 120B 78 580 7.435897 1004 12.871795 1243 3190828 24633.15 2.153846 0.679487 1279.0 16.397436 2567.037812 19.817498 46.692308 78.676239 0.593474
+Gpt Oss 20B 78 131 1.679487 927 11.884615 1297 7009392 62397.50 2.974359 0.589744 1206.0 15.461538 5404.311488 48.109098 47.576923 88.239487 0.539180
+Claude Haiku 4.5 78 -37 -0.474359 894 11.461538 1254 6973411 57734.39 3.948718 0.564103 1198.0 15.358974 5560.933812 46.040183 45.102564 107.387350 0.419999

 Saved: results/260121_78_rounds/basic_metrics.csv
 Saved: results/260121_78_rounds/overall_performance.png

@@ -44,8 +44,8 @@ Saved: results/260121_78_rounds/score_vs_failed_guesses.png
 Saved: results/260121_78_rounds/score_vs_failed_guesses.json
 Saved: results/260121_78_rounds/calibration_curves.png
 Saved: results/260121_78_rounds/calibration_curves.json
-Saved: results/260121_78_rounds/
-Saved: results/260121_78_rounds/
+Saved: results/260121_78_rounds/guess_rate.png
+Saved: results/260121_78_rounds/guess_rate.json
 Saved: results/260121_78_rounds/score_stack.png
 Saved: results/260121_78_rounds/score_stack.json

@@ -53,16 +53,16 @@ Saved: results/260121_78_rounds/score_stack.json
 COMPLEXITY ANALYSIS
 ============================================================

+Optimal K for aggregated complexity: 0.14
+Formula: complexity = cyclomatic + 0.14 * node_count
+Correlation with success_rate: -0.659

 Stats by complexity quartile:
+complexity_bin count avg_floored_score success_rate
+Q1 240 19.829167 0.920833
+Q2 150 14.973333 0.773333
+Q3 180 15.344444 0.794444
+Q4 210 5.295238 0.371429

 Saved: results/260121_78_rounds/complexity_analysis.png
 Saved: results/260121_78_rounds/complexity_analysis.json

@@ -71,34 +71,34 @@ Saved: results/260121_78_rounds/complexity_analysis.json
 BY-RULE ANALYSIS
 ============================================================

+Score by rule (sorted by avg_floored_score):
+rule_description count avg_floored_score std_floored_score success_rate
+Only red cards (hearts or diamonds). 30 25.633333 2.204749 1.000000
+Only cards of the suit spades. 30 25.200000 2.023994 1.000000
+Cards must alternate between red and black colors. Any card may start the line. 30 25.166667 2.640315 1.000000
+Only cards with an even rank (2,4,6,8,10,12). 30 24.300000 2.692903 1.000000
+The card must be of a different suit than the card just before it. Any card may start the line. 30 22.200000 6.477547 0.966667
+Only hearts, clubs, and diamonds allowed. Spades are forbidden. 30 20.666667 5.516954 0.966667
+Card rank must have opposite odd/even parity to the previous card's rank. Any card may start the line. 30 20.666667 5.148373 1.000000
+Only Aces (rank 1) . 30 20.366667 8.580183 0.933333
+The card must be of a different suit than but same color as the card just before it. Any card may start the line. 30 20.333333 5.541899 0.966667
+Only ranks that are prime numbers (2,3,5,7,11,13). 30 19.966667 6.965349 0.933333
+Only face cards (11,12,13). 30 19.833333 8.288269 0.900000
+Only spades and diamonds. 30 19.066667 4.487018 1.000000
+Suits must repeat in the cyclic order hearts → spades → clubs → diamonds → hearts... Any card may start the line. 30 16.766667 7.663993 0.900000
+Only cards between 1 and 7 inclusive. 30 14.466667 7.238467 0.900000
+Only black face cards. 30 11.466667 9.000894 0.700000
+Alternate face and number cards. Any card may start the line. 30 9.066667 9.409948 0.600000
+Each card must share at least one property with the previous card: same color, or same parity. Any card may start the line. 30 8.600000 9.193701 0.533333
+Each card must have a rank greater or equal to the previous card. Only Ace can start the line. 30 8.333333 9.400416 0.533333
+Only cards between 5 and 9 inclusive. 30 8.166667 7.153891 0.666667
+Only red cards whose rank is <=7. 30 7.700000 6.808463 0.666667
+Suits must appear in pairs: card 1 and 2 same suit, cards 3 and 4 same suit (different from 1 and 2), cards 5 and 6 same suit (different from 3 and 4), etc. 30 4.966667 6.321738 0.500000
+Face cards (11-13) must be red; number cards (1-10) must be black. 30 2.966667 5.816050 0.266667
+Hearts and spades form Group A; clubs and diamonds form Group B. Alternate between groups. Any card may start the line. 30 2.466667 5.612384 0.200000
+If the previous card was red, rank must increase or be equal; if black, rank must decrease or be equal. Starting card must be between 5 and 9 inclusive. 30 1.600000 4.022351 0.166667
+Face cards imposes the suit: if a face card is played, the next card must match its suit. Otherwise, the next card must be a different suit than it. 30 1.533333 3.598212 0.200000
+Rank repeats in pairs: ranks must come in doubles: (x, x), then (y, y) with y different from x, then (z, z) with z different from y, etc. 30 1.133333 3.549972 0.100000

 Saved: results/260121_78_rounds/by_rule.png
 Saved: results/260121_78_rounds/by_rule.json

@@ -139,32 +139,32 @@ Double-Down Rate: After a wrong guess, % of next turns with another guess
 (Only counts official guesses, not shadow/tentative guesses)

 Model Wrong Guesses Next Turn Guesses Double-Down %
+Grok 4 1 Fast Reasoning 200 108 54.0
+Deepseek R1 249 132 53.0
+Claude Haiku 4.5 308 161 52.3
+Kimi K2 159 67 42.1
+Gpt Oss 20B 232 97 41.8
+Claude Opus 4.5 156 50 32.1
+Gpt Oss 120B 168 37 22.0
+Gemini 3 Flash Preview Low 75 15 20.0
+Gpt 5 Mini Medium 91 8 8.8
+Gpt 5.2 High 22 0 0.0

 Wrong Guess Streak Statistics:
 Model Streaks Mean Length Max Length Total Wrong
+Grok 4 1 Fast Reasoning 103 1.94 8 200
+Deepseek R1 121 2.06 7 249
+Claude Haiku 4.5 157 1.96 7 308
+Kimi K2 100 1.59 7 159
+Gpt Oss 20B 141 1.65 7 232
+Claude Opus 4.5 115 1.36 5 156
+Gpt Oss 120B 133 1.26 5 168
+Gemini 3 Flash Preview Low 63 1.19 4 75
+Gpt 5 Mini Medium 85 1.07 3 91
+Gpt 5.2 High 22 1.00 1 22
+
+Longest streak: 8 consecutive wrong guesses
+- Grok 4 1 Fast Reasoning in round 67

 Saved: results/260121_78_rounds/reckless_guessing.png
 Saved: results/260121_78_rounds/reckless_guessing.json
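The double-down metric above is straightforward to recompute from round logs. A minimal sketch, assuming per-turn records with boolean `guessed` / `guess_correct` fields (hypothetical names; per the note above, only official guesses count, not shadow/tentative ones):

```python
def double_down_rate(rounds):
    """Share (%) of turns immediately after a wrong official guess that
    contain another official guess, across all rounds of one model.

    `rounds` is assumed to be a list of per-round turn lists, each turn
    a dict with boolean fields "guessed" and "guess_correct".
    """
    wrong, followed = 0, 0
    for turns in rounds:
        for prev, nxt in zip(turns, turns[1:]):
            if prev["guessed"] and not prev["guess_correct"]:
                wrong += 1
                followed += nxt["guessed"]
    return 100.0 * followed / wrong if wrong else 0.0
```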
app/src/content/chapters/eleusis/benchmark.mdx
CHANGED
@@ -24,11 +24,13 @@ The game uses a standard 52-card deck with ranks 1–13 (Ace through King) and f

 On each turn, the player selects a card from their hand to play. If the card satisfies the secret rule, it joins the mainline; if rejected, it's placed in a sideline below the mainline at that position. While playing a card, the player may attempt to guess the rule. The game continues until the player correctly identifies the rule or reaches 30 turns.

-When correctly guessing the rule, the player scores as many points as the number of
+When correctly guessing the rule, the player scores as many points as the number of turns remaining, and each wrong guess deducts a penalty of 2 points:

 $$\text{score} = (30 - \text{turns\_elapsed} + 1) - 2 \times \text{num\_wrong\_guesses}$$

-A player who correctly identifies the rule on turn 13 with no wrong guesses scores 18 points; one who made 3 wrong guesses along the way scores only 12.
+A player who correctly identifies the rule on turn 13 with no wrong guesses scores 18 points; one who made 3 wrong guesses along the way scores only 12. If penalties drive the score to zero or below, the round stops and the final score is recorded as zero (similar to a scientist having wasted all their resources).
+
+This creates an interesting tension: guessing early yields more points if correct, but wrong guesses are costly. The optimal strategy requires accurately assessing one's own confidence and acting accordingly.

 ### Rule Library

@@ -67,4 +69,6 @@ Example output
 }
 ```

-This structure lets us analyze not just whether models succeed, but *how* they reason: Do they update hypotheses appropriately when evidence contradicts them? Do they explore strategically or play conservatively? Is their stated confidence calibrated to their actual accuracy?
+This structure lets us analyze not just whether models succeed, but *how* they reason: Do they update hypotheses appropriately when evidence contradicts them? Do they explore strategically or play conservatively? Is their stated confidence calibrated to their actual accuracy? In particular, forcing the model to articulate a tentative rule and a confidence level (even when it doesn't want to guess yet) lets us secretly evaluate that rule anyway, which is useful for measuring calibration and guessing ability.
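The scoring rule in `benchmark.mdx` above, including the zero-flooring it now documents, fits in a few lines. This is a sketch for reference (`round_score` is a hypothetical helper, not the benchmark's actual code):

```python
def round_score(turns_elapsed: int, num_wrong_guesses: int, floor: bool = True) -> int:
    """Score for a round solved on turn `turns_elapsed` (1-30).

    A correct guess on turn t earns (30 - t + 1) points, minus 2 per wrong
    guess along the way. With floor=True, a total at or below zero ends the
    round and is recorded as zero (the "floored" score in the data files).
    """
    raw = (30 - turns_elapsed + 1) - 2 * num_wrong_guesses
    return max(0, raw) if floor else raw

assert round_score(13, 0) == 18  # the worked example from the text
assert round_score(13, 3) == 12
```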
app/src/content/chapters/eleusis/introduction.mdx
CHANGED
@@ -11,9 +11,9 @@ Large language models are increasingly being deployed as tools for scientific re

 Most reasoning benchmarks test whether models can solve well-defined problems: given premises, derive a conclusion. The ARC challenge, for instance, evaluates inductive reasoning on visual patterns. These benchmarks capture important capabilities, but they miss something fundamental about how science actually works.

-Real scientific reasoning is not a single inference step. It's an iterative agentic process of observation, hypothesis formation, experimentation, and refinement
+Real scientific reasoning is not a single inference step. It's an iterative agentic process of observation, hypothesis formation, experimentation, and refinement, often spanning many cycles before reaching a conclusion. It requires not just logical ability, but also *strategic* thinking: which experiment to run next, how much evidence is enough, when to commit to a theory versus when to keep exploring.

-Beyond pure reasoning, effective science depends on psychological factors that are rarely evaluated: **calibration** (does my confidence match my actual accuracy?), **metacognition** (how certain am I about my uncertainty?), and resistance to **cognitive biases** like confirmation bias (seeking only evidence that supports my current hypothesis). A scientist who is brilliant at deduction but overconfident in weak theories will waste resources pursuing dead ends. One who is well-calibrated but overly cautious may never publish.
+Beyond pure reasoning, effective science depends on psychological factors that are rarely evaluated: **calibration** (does my confidence match my actual accuracy?), **metacognition** (how certain am I about my uncertainty?), and resistance to **cognitive biases** like confirmation bias (seeking only evidence that supports my current hypothesis instead of trying to challenge it). A scientist who is brilliant at deduction but overconfident in weak theories will waste resources pursuing dead ends. One who is well-calibrated but overly cautious may never publish.

 We wanted to test whether LLMs can exhibit these deeper aspects of scientific reasoning. To do this, we turned to an unlikely source: a 1950s card game called Eleusis.
app/src/content/chapters/eleusis/results.mdx
CHANGED
@@ -8,7 +8,7 @@ import HtmlEmbed from "../../../components/HtmlEmbed.astro";

 ### Overall Performance

-We evaluated ten models on the Eleusis benchmark, including both proprietary and open-weight models. Performance is measured as the average score per turn. We also report token usage (output + reasoning) to compare efficiency.
+We evaluated ten models on the Eleusis benchmark, including both proprietary and open-weight models. Performance is measured as the average score per turn. We also report token usage (output + reasoning) per turn to compare efficiency.

 <HtmlEmbed
   src="overall-performance.html"

@@ -17,130 +17,94 @@
   wide
 />

-<HtmlEmbed
-  src="score-vs-failed-guesses.html"
-  caption="<strong>Figure 4:</strong> Score vs. failed guesses per round. Models in the upper-left are efficient (high scores, few wrong guesses). Models that guess recklessly appear on the right with low scores."
-  id="fig-guessing"
-/>
-Failed guesses tell only half the story. A model might avoid wrong guesses by being *too* cautious—waiting many turns after it already has the correct answer. To measure this, we tracked "early correct turns": how many consecutive turns a model's tentative rule was correct before it finally chose to guess.
-<HtmlEmbed
-  src="excess-caution.html"
-  caption="<strong>Figure 5:</strong> Distribution of early correct turns (waiting with the correct answer). Higher values indicate excessive caution—the model knew the answer but hesitated to commit. GPT 5.2 High stands out as extremely cautious, with a mean of 3.6 turns of unnecessary delay."
-  id="fig-excess-caution"
-/>
-The results reveal striking differences in guessing personalities:
-- **GPT 5.2 High** is remarkably cautious, waiting an average of 3.6 turns after finding the correct rule before guessing. In 87% of successful rounds, it waited at least one turn too long.
-- **Claude Opus 4.5** shows excellent timing—only 0.9 early correct turns on average, meaning it commits almost immediately after finding the answer.
-- **Claude Haiku 4.5** and **DeepSeek R1** are the least cautious (0.5 early turns), but this comes at a cost: they also have the highest failed guess rates.
-This visualization reveals distinct behavioral patterns:
-1. **Raw score**: The standard scoring (30 - turns - 2×wrong guesses)
-2. **Floored score**: Same formula, but negative scores are counted as zero
-3. **No-stakes score**: No penalty for wrong guesses, and tentative rules count as guesses
-The no-stakes gain (green) shows what models would gain if we simply tested their tentative rule each turn. Interestingly, this gain is relatively consistent across models (2.5–4.2 points), suggesting that most models form correct hypotheses at similar rates, but differ dramatically in their ability to *recognize* when they have the right answer.
-Under any scoring system, Claude Opus 4.5 and GPT 5.2 High remain the top performers. The ranking compression at no-stakes scores (15.4 to 20.5 vs raw -0.5 to 14.8) confirms that our scoring system appropriately rewards good metacognition—knowing when you know.
-<HtmlEmbed
-  src="reckless-guessing.html"
-  caption="<strong>Figure 7b:</strong> Double-down rate: how often a model guesses again immediately after a wrong guess. Higher values indicate more reckless behavior—the model keeps guessing despite recent failures."
-  id="fig-reckless-guessing"
-/>
+Performance varies dramatically among tested models. Claude Opus 4.5 achieves top performance with moderate token usage. The open-weight model Kimi K2 comes second and performs competitively with the best proprietary models, outperforming GPT 5.2 High and coming close to Claude Opus 4.5, but at the price of a 2.5× larger reasoning budget.
+
+GPT 5.2 High and Grok 4.1 Fast Reasoning show similar performance, but GPT 5.2 High is significantly more token efficient.
+
+GPT-5-Mini, GPT OSS-120B and Gemini 3 Flash Preview Low cluster in the mid-tier (around 13) with moderate token usage, while Deepseek R1, an open-weight model specialized for reasoning tasks, achieves a similar score with a much larger token count.
+
+Finally, GPT-OSS 20B and Claude Haiku 4.5 lag behind, scoring between 11 and 12 with moderate token usage.
+
+As we mentioned, this score reflects not only the model's pure ability to find the correct rule, but also its metacognitive skills: knowing when to commit, how confident it is, and how to balance exploration vs. exploitation. To distinguish these factors, we also computed an alternative "no-stakes" score that removes penalties for wrong guesses and counts tentative rules as guesses. This allows us to isolate pure rule-discovery ability from metacognitive skills.
+
+### Pure discovery versus metacognition
+
+The following chart shows the score of each model, and which score it would have achieved under a "no-stakes" scenario where guessing is free and systematic.
+
 <HtmlEmbed
+  src="score-stack.html"
+  caption="<strong>Figure 2:</strong> Score breakdown under alternative scoring systems. Blue shows raw score (standard scoring), while green shows no-stakes gain (the additional gain from removing wrong-guess penalties). Models sorted by total no-stakes score."
+  id="fig-score-stack"
+  wide
 />
+
+Even if this alternative scoring does not change the relative ranking of models much, it reveals important differences in their behavior. GPT 5.2 High and Claude Haiku 4.5 are the two models with the largest difference between raw and no-stakes scores (more than 4), while Gemini and Kimi K2 have the smallest difference (less than 3).
+
+There are two possible reasons for a difference between the raw and the no-stakes scores:
+1. The model is reckless and makes a lot of wrong guesses, incurring penalties.
+2. The model is too cautious and waits too long before guessing, missing out on points.
+
+We analyze these two aspects in more detail below.
+
+### The Caution-Recklessness Trade-off
+
+To estimate how reckless or cautious a model is, we can compute the average number of failed guesses per round (recklessness). It directly relates to how many points a model loses due to wrong guesses.
+
+To estimate caution, we can compute on average how many turns a model waits while holding the correct tentative rule before actually guessing it. This relates to how many points a model loses by waiting too long to commit.

 <Sidenote>
+This trade-off mirrors a fundamental tension in science: being overconfident too early risks false positives, leading to wasted resources and reputational damage; being overly cautious can delay discoveries and allow others to scoop you. Scientists must balance the two: publish too early and risk being wrong, or wait too long and lose priority (or, in our case, points).
 </Sidenote>

 <HtmlEmbed
+  src="caution-vs-failed-guesses.html"
+  caption="<strong>Figure 3:</strong> The caution-recklessness trade-off. Models in the upper-left are cautious (delay correct guesses); models in the lower-right are reckless (many failed guesses). The ideal position is lower-left: quick to commit when right, rarely wrong."
+  id="fig-caution-reckless"
 />

+How should we interpret these values? Knowing that a failed guess costs 2 points while each turn of delay costs 1 point, the optimal number of failed guesses per round should be around 0.5 (i.e., 1 failed guess every 2 rounds) to balance the two sources of loss. We can see that most models are above that threshold, indicating a tendency towards recklessness. This is confirmed by their low caution values (most models wait around 1 turn on average before guessing once they have the correct rule).
+
+On the other hand, GPT 5.2 High shows a singular behavior, with very few failed guesses (0.28 per round) but high caution (waiting 3.5 turns on average before guessing when it has the correct rule). Gemini 3 Flash Preview Low and GPT 5 Mini Medium are intermediate in both dimensions, Gemini achieving a better balance with on average 2 points lost to recklessness and 2 points lost to caution.
+
+To better understand the causes of recklessness and caution, we now turn to an analysis of confidence and guessing strategies.
+
+### Confidence and Calibration
+
+Models are asked to output their confidence level, with clear instructions on what it means (7 = 70% probability of being correct, etc.). Even when they don't guess, they report their tentative rule. When confidence ≥5, we test whether they would have guessed correctly, even if they didn't formally attempt to guess. This allows us to evaluate calibration: does reported confidence match actual accuracy?
+
+<HtmlEmbed
+  src="calibration-curves.html"
+  caption="<strong>Figure 4:</strong> Calibration curves for each model. A perfectly calibrated model would follow the diagonal. Points below the line indicate overconfidence: they correspond to confidence levels where actual success rates are lower than reported. Click legend items to show/hide models."
+  id="fig-calibration"
+/>
+
+The calibration analysis reveals several patterns:
+
+- **All models are overconfident**: for instance, when they report 80% confidence, their actual success rates are often closer to 20%!
+- GPT 5.2 High is the best calibrated model overall.
+- Even models with strong performance like Claude Opus 4.5 and Kimi K2 show significant overconfidence.
+
+Is overconfidence a problem? It depends on how the model decides to act on it.
+
+For a perfectly calibrated model, since the expected loss from a failed guess is twice the expected opportunity cost of waiting one turn, the optimal confidence threshold for guessing is 0.67 (i.e., guess when you believe your tentative rule has at least a 67% chance of being correct). But do models follow such a strategy? To see, we can look at how often models guess at each confidence level.
+
+<HtmlEmbed
+  src="guess-rate.html"
+  caption="<strong>Figure 5:</strong> Guess rate per confidence level. The optimal decision-theoretic curve for a perfectly calibrated model would be a step at 67%. Click legend items to show/hide models."
+  id="fig-confidence"
+/>
+
+We can see that some models like Grok 4.1 or Gemini 3 essentially only guess when very confident (9 or 10). Most other models guess at confidence levels above 8 and rarely below. The two Claude models show different behaviors: Claude Opus 4.5 tends to guess more aggressively at confidence level 8, while Claude Haiku 4.5 guesses even at confidence level 7.
+
+On average, then, models are more cautious than the optimal decision-theoretic strategy for a perfectly calibrated model, which would guess as soon as confidence exceeds 67%. This is, in a way, a good thing, given that all models are overconfident. By raising the bar for guessing, they reduce the risk of wrong guesses and compensate for their poor calibration.
+
+This is particularly true for Gemini 3 Flash Preview Low, which is very cautious despite being overconfident; this is probably what helps it achieve a good balance between failed guesses and lost opportunity cost. It is also consistent with the fact that it's the model with the smallest difference between raw and no-stakes scores.
+
+The case of GPT 5.2 High is different: it is both fairly well calibrated and very cautious, leading to very few failed guesses but a high opportunity cost due to delayed guessing. This suggests that GPT 5.2 High could improve its performance by being more aggressive in guessing once it has a correct tentative rule.

 ### Performance by Rule

@@ -152,7 +116,7 @@ The following figure breaks down performance by rule across all models and runs.

 <HtmlEmbed
   src="by-rule.html"
-  caption="<strong>Figure
+  caption="<strong>Figure 6:</strong> Score distribution by rule. Each row is a different rule, with individual run scores shown as colored dots (one per model run). Hover over rule names for details. The left column shows average success rate. Click legend items to show/hide models."
   id="fig-by-rule"
   wide
 />

@@ -163,7 +127,7 @@ The following plot breaks down the relative score of each model (as measured by

 <HtmlEmbed
   src="complexity-analysis.html"
-  caption="<strong>Figure
+  caption="<strong>Figure 7:</strong> Relationship between rule complexity and model performance. The heatmap shows relative scores (value > 1 means above-average performance) for each model across complexity quartiles. Hover over cells for details."
   id="fig-complexity"
 />
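The calibration procedure described in `results.mdx` reduces to a per-confidence-level success rate. A minimal sketch under assumed field names (the harness's real data layout may differ):

```python
from collections import defaultdict

def calibration_curve(turn_records):
    """Empirical success rate at each stated confidence level.

    `turn_records` is assumed to be an iterable of (confidence, correct)
    pairs: the stated confidence (an integer, where 7 means "70% sure")
    and whether the tentative rule matched the secret rule. Per the text,
    only turns with confidence >= 5 are tested.
    """
    hits = defaultdict(int)
    totals = defaultdict(int)
    for confidence, correct in turn_records:
        if confidence < 5:
            continue  # below the threshold at which tentative rules are tested
        totals[confidence] += 1
        hits[confidence] += bool(correct)
    # A perfectly calibrated model would satisfy rate ≈ confidence / 10 at
    # every level; points below that diagonal indicate overconfidence.
    return {c: hits[c] / totals[c] for c in sorted(totals)}
```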
app/src/content/embeds/banner.html
CHANGED
@@ -188,7 +188,7 @@
 <div class="model-name" style="color: ${d.color}">${d.name}</div>
 <div class="metric">
   <span class="metric-label">Score:</span>
-  <span class="metric-value">${d.
+  <span class="metric-value">${d.avg_floored_score.toFixed(2)}</span>
 </div>
 <div class="metric">
   <span class="metric-label">Tokens/Turn:</span>

@@ -235,11 +235,11 @@
 const { innerWidth, innerHeight } = updateSize();

 // Sort models by score descending
-const models = [...data.models].sort((a, b) => b.
+const models = [...data.models].sort((a, b) => b.avg_floored_score - a.avg_floored_score);

 // Update scales
 xScale
-  .domain([0, d3.max(models, d => d.
+  .domain([0, d3.max(models, d => d.avg_floored_score) * 1.05])
   .range([0, innerWidth])
   .nice();

@@ -285,7 +285,7 @@
 .attr('class', 'bar')
 .attr('x', 0)
 .attr('y', d => yScale(d.name))
-.attr('width', d => xScale(d.
+.attr('width', d => xScale(d.avg_floored_score))
 .attr('height', barHeight)
 .attr('fill', d => d.color)
 .attr('rx', 3)

@@ -311,11 +311,11 @@
 .data(models, d => d.name)
 .join('text')
 .attr('class', 'score-label')
-.attr('x', d => xScale(d.
+.attr('x', d => xScale(d.avg_floored_score) + 6)
 .attr('y', d => yScale(d.name) + barHeight / 2)
 .attr('dy', '0.35em')
 .attr('text-anchor', 'start')
-.text(d => d.
+.text(d => d.avg_floored_score.toFixed(1));
 }

 // Initialize
app/src/content/embeds/by-rule.html
CHANGED

@@ -234,7 +234,7 @@
       </div>
       <div class="metric">
         <span class="metric-label">Average Score:</span>
-        <span class="metric-value">${rule.
+        <span class="metric-value">${rule.avg_floored_score.toFixed(1)}</span>
       </div>
       <div class="metric">
         <span class="metric-label">Cyclomatic Complexity:</span>
@@ -311,7 +311,7 @@
     // Update scales
     const allScores = [];
     rules.forEach(rule => {
-      Object.values(rule.
+      Object.values(rule.floored_scores_by_model).forEach(scores => {
         allScores.push(...scores);
       });
     });
@@ -400,7 +400,7 @@
     // Data points
     const pointData = [];
     rules.forEach(rule => {
-      Object.entries(rule.
+      Object.entries(rule.floored_scores_by_model).forEach(([modelName, scores]) => {
         scores.forEach((score, seedIdx) => {
           const color = modelColors[modelName] || '#888888';
           pointData.push({
app/src/content/embeds/{confidence-distribution.html → guess-rate.html}
RENAMED

@@ -1,78 +1,78 @@
-<div class="d3-
+<div class="d3-guess-rate"></div>
 <style>
-  .d3-
+  .d3-guess-rate {
     width: 100%;
     margin: 10px 0;
     position: relative;
     font-family: system-ui, -apple-system, sans-serif;
   }

-  .d3-
+  .d3-guess-rate svg {
     display: block;
     width: 100%;
     height: auto;
   }

-  .d3-
-  .d3-
+  .d3-guess-rate .axes path,
+  .d3-guess-rate .axes line {
     stroke: var(--axis-color, var(--text-color));
   }

-  .d3-
+  .d3-guess-rate .axes text {
     fill: var(--tick-color, var(--muted-color));
     font-size: 11px;
   }

-  .d3-
+  .d3-guess-rate .grid line {
     stroke: var(--grid-color, rgba(0,0,0,.08));
   }

-  .d3-
+  .d3-guess-rate .axes text.axis-label {
     font-size: 14px;
     font-weight: 500;
     fill: var(--text-color);
   }

-  .d3-
+  .d3-guess-rate .x-axis text {
     transform: translateY(4px);
   }

-  .d3-
+  .d3-guess-rate .distribution-line {
     fill: none;
     stroke-width: 1.5;
   }

-  .d3-
+  .d3-guess-rate .data-point {
     cursor: pointer;
     transition: opacity 0.15s ease;
   }

-  .d3-
+  .d3-guess-rate .data-point:hover {
     opacity: 0.8;
   }

-  .d3-
+  .d3-guess-rate .legend {
     font-size: 11px;
   }

-  .d3-
+  .d3-guess-rate .legend-item {
     cursor: pointer;
   }

-  .d3-
-  .d3-
+  .d3-guess-rate .legend-item.dimmed .legend-line,
+  .d3-guess-rate .legend-item.dimmed .legend-marker {
     opacity: 0.3;
   }

-  .d3-
+  .d3-guess-rate .legend-item.dimmed text {
     opacity: 0.4;
   }

-  .d3-
+  .d3-guess-rate .legend-text {
     fill: var(--text-color);
   }

-  .d3-
+  .d3-guess-rate .d3-tooltip {
     position: absolute;
     top: 0;
     left: 0;
@@ -91,22 +91,22 @@
     z-index: 10;
   }

-  .d3-
+  .d3-guess-rate .d3-tooltip .model-name {
     font-weight: 600;
     margin-bottom: 4px;
   }

-  .d3-
+  .d3-guess-rate .d3-tooltip .metric {
     display: flex;
     justify-content: space-between;
     gap: 16px;
   }

-  .d3-
+  .d3-guess-rate .d3-tooltip .metric-label {
     color: var(--muted-color);
   }

-  .d3-
+  .d3-guess-rate .d3-tooltip .metric-value {
     font-weight: 500;
   }
 </style>
@@ -129,8 +129,8 @@
   const bootstrap = () => {
     const scriptEl = document.currentScript;
     let container = scriptEl ? scriptEl.previousElementSibling : null;
-    if (!(container && container.classList && container.classList.contains('d3-
-      const candidates = Array.from(document.querySelectorAll('.d3-
+    if (!(container && container.classList && container.classList.contains('d3-guess-rate'))) {
+      const candidates = Array.from(document.querySelectorAll('.d3-guess-rate'))
         .filter((el) => !(el.dataset && el.dataset.mounted === 'true'));
       container = candidates[candidates.length - 1] || null;
     }
@@ -171,10 +171,10 @@
   // Line generator
   const line = d3.line()
     .x(d => xScale(d.confidence_level))
-    .y(d => yScale(d.
+    .y(d => yScale(d.guess_rate));

   // Data loading
-  const DATA_URL = '/data/
+  const DATA_URL = '/data/guess_rate.json';

   function updateSize() {
     width = container.clientWidth || 800;
@@ -199,15 +199,15 @@
       <div class="model-name" style="color: ${model.color}">${model.name}</div>
       <div class="metric">
         <span class="metric-label">Confidence level:</span>
-        <span class="metric-value">${d.confidence_level
+        <span class="metric-value">${d.confidence_level}</span>
       </div>
       <div class="metric">
-        <span class="metric-label">
-        <span class="metric-value">${(d.
+        <span class="metric-label">Guess rate:</span>
+        <span class="metric-value">${(d.guess_rate * 100).toFixed(1)}%</span>
       </div>
       <div class="metric">
-        <span class="metric-label">
-        <span class="metric-value">${d.
+        <span class="metric-label">Guesses / Turns:</span>
+        <span class="metric-value">${d.guess_count} / ${d.total_turns}</span>
       </div>
     `;
@@ -250,14 +250,10 @@
       .domain([5, 10])
       .range([0, innerWidth]);

-    // Y scale:
-    const maxProportion = d3.max(visibleModels, m =>
-      d3.max(m.distribution, d => d.proportion)
-    ) || 0.8;
+    // Y scale: guess rate (0 to 1)
     yScale
-      .domain([0,
-      .range([innerHeight, 0])
-      .nice();
+      .domain([0, 1])
+      .range([innerHeight, 0]);

     // Grid lines
     const xTicks = [5, 6, 7, 8, 9, 10];
@@ -324,7 +320,7 @@
       .attr('y', -52)
       .attr('text-anchor', 'middle')
       .attr('transform', 'rotate(-90)')
-      .text('
+      .text('Guess Rate');

     // Lines for each model
     gLines.selectAll('.distribution-line')
@@ -358,7 +354,7 @@
       .join('circle')
       .attr('class', 'data-point data-point-circle')
       .attr('cx', d => xScale(d.confidence_level))
-      .attr('cy', d => yScale(d.
+      .attr('cy', d => yScale(d.guess_rate))
       .attr('r', 4)
       .attr('fill', d => d.model.color)
       .attr('stroke', 'var(--surface-bg, white)')
@@ -374,7 +370,7 @@
       .attr('class', 'data-point data-point-star')
       .attr('d', d => starPath(
         xScale(d.confidence_level),
-        yScale(d.
+        yScale(d.guess_rate),
         6, 2.6
       ))
       .attr('fill', d => d.model.color)
app/src/content/embeds/overall-performance.html
CHANGED

@@ -184,7 +184,7 @@
       <div class="model-name" style="color: ${d.color}">${d.name}</div>
       <div class="metric">
         <span class="metric-label">Score:</span>
-        <span class="metric-value">${d.
+        <span class="metric-value">${d.avg_floored_score.toFixed(2)}</span>
       </div>
       <div class="metric">
         <span class="metric-label">Tokens/Turn:</span>
@@ -222,7 +222,7 @@
     // Update scales
     const xExtent = d3.extent(models, d => d.avg_output_tokens_per_turn);
-    const yExtent = d3.extent(models, d => d.
+    const yExtent = d3.extent(models, d => d.avg_floored_score);
     const xPadding = (xExtent[1] - xExtent[0]) * 0.1;
     const yPadding = (yExtent[1] - yExtent[0]) * 0.1;

@@ -314,7 +314,7 @@
       .join('circle')
       .attr('class', 'point point-circle')
       .attr('cx', d => xScale(d.avg_output_tokens_per_turn))
-      .attr('cy', d => yScale(d.
+      .attr('cy', d => yScale(d.avg_floored_score))
       .attr('r', pointRadius)
       .attr('fill', d => d.color)
       .attr('stroke', 'none')
@@ -328,7 +328,7 @@
       .data(openModels, d => d.name)
       .join('path')
       .attr('class', 'point point-star')
-      .attr('d', d => starPath(xScale(d.avg_output_tokens_per_turn), yScale(d.
+      .attr('d', d => starPath(xScale(d.avg_output_tokens_per_turn), yScale(d.avg_floored_score), pointRadius * 1.2, pointRadius * 0.5))
       .attr('fill', d => d.color)
       .attr('stroke', 'none')
       .on('mouseenter', showTooltip)
@@ -341,7 +341,7 @@
       .join('text')
       .attr('class', 'point-label')
       .attr('x', d => xScale(d.avg_output_tokens_per_turn) + pointRadius + 6)
-      .attr('y', d => yScale(d.
+      .attr('y', d => yScale(d.avg_floored_score) + 4)
       .text(d => d.name);
   }
app/src/content/embeds/score-stack.html
CHANGED

@@ -168,8 +168,7 @@

   // Colors for segments
   const segmentColors = {
-
-    floored: '#E8973E', // Orange - flooring gain
+    floored: '#4A90D9', // Blue - floored score
     noStakes: '#5AAA5A' // Green - no-stakes gain
   };

@@ -198,14 +197,10 @@
     const y = event.clientY - rect.top;

     let segmentName, segmentValue, description;
-    if (segment === '
-      segmentName = '
-      segmentValue = d.
-      description = '
-    } else if (segment === 'floored') {
-      segmentName = 'Flooring Gain';
-      segmentValue = '+' + d.floored_delta.toFixed(2);
-      description = 'Gain if negative scores count as 0';
+    if (segment === 'floored') {
+      segmentName = 'Score';
+      segmentValue = d.avg_floored_score.toFixed(2);
+      description = 'Floored score (negative scores count as 0)';
     } else {
       segmentName = 'No-Stakes Gain';
       segmentValue = '+' + d.no_stakes_delta.toFixed(2);
@@ -221,11 +216,7 @@
     <div style="font-size: 11px; color: var(--muted-color); margin-top: 4px;">${description}</div>
     <hr style="border: none; border-top: 1px solid var(--border-color); margin: 8px 0;">
     <div class="metric">
-      <span class="metric-label">
-      <span class="metric-value">${d.avg_score.toFixed(2)}</span>
-    </div>
-    <div class="metric">
-      <span class="metric-label">Floored Score:</span>
+      <span class="metric-label">Score:</span>
       <span class="metric-value">${d.avg_floored_score.toFixed(2)}</span>
     </div>
     <div class="metric">
@@ -257,8 +248,8 @@

     const { innerWidth, innerHeight } = updateSize();

-    // Sort models by
-    const models = [...data.models].sort((a, b) => b.
+    // Sort models by floored score (descending)
+    const models = [...data.models].sort((a, b) => b.avg_floored_score - a.avg_floored_score);

     // Update scales
     const maxScore = d3.max(models, d => d.avg_no_stakes_score);
@@ -323,49 +314,28 @@
       const safeId = toClassName(d.name);

       // Calculate segment positions
-      //
-      const
-      const
-
-      // Floored delta starts where raw score ends (if positive) or at 0 (if raw was negative)
-      const flooredStart = rawEnd;
-      const flooredEnd = flooredStart + d.floored_delta;
+      // Floored score starts from 0
+      const flooredStart = 0;
+      const flooredEnd = d.avg_floored_score;

       // No-stakes delta starts where floored ends
       const noStakesStart = flooredEnd;
       const noStakesEnd = noStakesStart + d.no_stakes_delta;

-      //
-      gBars.selectAll(`.bar-
+      // Floored score segment (base)
+      gBars.selectAll(`.bar-floored-${safeId}`)
         .data([d])
         .join('rect')
-        .attr('class', `bar-segment bar-
-        .attr('x', xScale(
+        .attr('class', `bar-segment bar-floored-${safeId}`)
+        .attr('x', xScale(flooredStart))
         .attr('y', y)
-        .attr('width', Math.max(0, xScale(
+        .attr('width', Math.max(0, xScale(flooredEnd) - xScale(flooredStart)))
         .attr('height', barHeight)
-        .attr('fill', segmentColors.
-        .on('mouseenter', (e) => showTooltip(e, d, '
-        .on('mousemove', (e) => showTooltip(e, d, '
+        .attr('fill', segmentColors.floored)
+        .on('mouseenter', (e) => showTooltip(e, d, 'floored'))
+        .on('mousemove', (e) => showTooltip(e, d, 'floored'))
         .on('mouseleave', hideTooltip);

-      // Floored delta segment (only if positive)
-      if (d.floored_delta > 0.01) {
-        gBars.selectAll(`.bar-floored-${safeId}`)
-          .data([d])
-          .join('rect')
-          .attr('class', `bar-segment bar-floored-${safeId}`)
-          .attr('x', xScale(flooredStart))
-          .attr('y', y)
-          .attr('width', Math.max(0, xScale(flooredEnd) - xScale(flooredStart)))
-          .attr('height', barHeight)
-          .attr('fill', segmentColors.floored)
-          .attr('opacity', 0.5)
-          .on('mouseenter', (e) => showTooltip(e, d, 'floored'))
-          .on('mousemove', (e) => showTooltip(e, d, 'floored'))
-          .on('mouseleave', hideTooltip);
-      }
-
       // No-stakes delta segment (only if positive)
       if (d.no_stakes_delta > 0.01) {
         gBars.selectAll(`.bar-nostakes-${safeId}`)
@@ -386,13 +356,9 @@

     // Update legend
     legendDiv.innerHTML = `
-      <div class="legend-item">
-        <div class="legend-swatch" style="background: ${segmentColors.raw}"></div>
-        <span class="legend-label">Raw Score</span>
-      </div>
       <div class="legend-item">
         <div class="legend-swatch" style="background: ${segmentColors.floored}"></div>
-        <span class="legend-label">
+        <span class="legend-label">Score</span>
       </div>
       <div class="legend-item">
         <div class="legend-swatch" style="background: ${segmentColors.noStakes}"></div>
app/src/content/embeds/score-vs-failed-guesses.html
CHANGED

@@ -169,7 +169,7 @@
       <div class="model-name" style="color: ${d.color}">${d.name}</div>
       <div class="metric">
         <span class="metric-label">Score:</span>
-        <span class="metric-value">${d.
+        <span class="metric-value">${d.avg_floored_score.toFixed(2)}</span>
       </div>
       <div class="metric">
         <span class="metric-label">Failed Guesses:</span>
@@ -207,7 +207,7 @@
     // Update scales
     const xExtent = d3.extent(models, d => d.avg_failed_guesses);
-    const yExtent = d3.extent(models, d => d.
+    const yExtent = d3.extent(models, d => d.avg_floored_score);
     const xPadding = (xExtent[1] - xExtent[0]) * 0.1;
     const yPadding = (yExtent[1] - yExtent[0]) * 0.1;

@@ -299,7 +299,7 @@
       .join('circle')
       .attr('class', 'point point-circle')
       .attr('cx', d => xScale(d.avg_failed_guesses))
-      .attr('cy', d => yScale(d.
+      .attr('cy', d => yScale(d.avg_floored_score))
       .attr('r', pointRadius)
       .attr('fill', d => d.color)
       .attr('stroke', 'none')
@@ -313,7 +313,7 @@
       .data(openModels, d => d.name)
       .join('path')
       .attr('class', 'point point-star')
-      .attr('d', d => starPath(xScale(d.avg_failed_guesses), yScale(d.
+      .attr('d', d => starPath(xScale(d.avg_failed_guesses), yScale(d.avg_floored_score), pointRadius * 1.2, pointRadius * 0.5))
       .attr('fill', d => d.color)
       .attr('stroke', 'none')
       .on('mouseenter', showTooltip)
@@ -326,7 +326,7 @@
       .join('text')
       .attr('class', 'point-label')
       .attr('x', d => xScale(d.avg_failed_guesses) + pointRadius + 6)
-      .attr('y', d => yScale(d.
+      .attr('y', d => yScale(d.avg_floored_score) + 4)
       .text(d => d.name);
   }