pepijn223 HF Staff committed on
Commit
32be670
·
unverified ·
1 Parent(s): 255daaf

Switch to Clopper-Pearson CIs and STEP sequential testing

Browse files

- Replace Wilson CIs with exact Clopper-Pearson intervals in success
rate chart
- Switch from Barnard's exact test to STEP (TRI) for pairwise
statistical comparisons with Bonferroni correction (n_max=50)
- Recompute CLD assignments using STEP
- Remove separate "Sort by" controls; auto-sort by selected metric
- Add Leandro von Werra to authors and acknowledgments
- Link STEP project page and TRI blog throughout

Made-with: Cursor

app/src/content/article.mdx CHANGED
@@ -30,6 +30,9 @@ authors:
30
  - name: "Virgile Batto"
31
  url: "https://huggingface.co/VirgileBatto"
32
  affiliations: [1]
 
 
 
33
  - name: "Thomas Wolf"
34
  url: "https://huggingface.co/thomwolf"
35
  affiliations: [1]
 
30
  - name: "Virgile Batto"
31
  url: "https://huggingface.co/VirgileBatto"
32
  affiliations: [1]
33
+ - name: "Leandro von Werra"
34
+ url: "https://huggingface.co/lvwerra"
35
+ affiliations: [1]
36
  - name: "Thomas Wolf"
37
  url: "https://huggingface.co/thomwolf"
38
  affiliations: [1]
app/src/content/chapters/folding/07-evaluation.mdx CHANGED
@@ -62,4 +62,4 @@ Scores are summed across all rollouts in an experiment. With 10 L1 rollouts (max
62
 
63
  ### Statistical uncertainty
64
 
65
- With 20 rollouts per experiment, even large apparent differences can be statistically indistinguishable. We report **Wilson 90% confidence intervals** on all success rates and run formal pairwise significance tests. Running 50-100 rollouts per experiment would give tighter estimates but was not feasible for us across 11 experiments on real hardware.
 
62
 
63
  ### Statistical uncertainty
64
 
65
+ With 20 rollouts per experiment, even large apparent differences can be statistically indistinguishable. We report **Clopper-Pearson 90% confidence intervals** on all success rates and run formal pairwise significance tests using [STEP](https://tri-ml.github.io/step/) (Sequential Testing for Efficient Policy comparison). Running 50-100 rollouts per experiment would give tighter estimates but was not feasible for us across 11 experiments on real hardware.
app/src/content/chapters/folding/08-ablations.mdx CHANGED
@@ -114,7 +114,7 @@ Before interpreting success rates, it helps to understand *how* each experiment
114
 
115
  ### Which differences are real?
116
 
117
- With 20 rollouts per experiment, not every visible gap is real. We run **Barnard's exact test** on all 55 pairs with **Bonferroni correction** (α = 0.10, per-pair p < 0.0018), following [TRI's statistical evaluation framework](https://medium.com/toyotaresearch/statistical-thinking-for-robot-policy-evaluation-from-rigorous-a-b-testing-to-effective-0ae886fbd68d). The chart below shows the full **Bayesian Beta posterior** over each policy's true success rate. **CLD letters** above each violin indicate which experiments are statistically separable, policies sharing a letter are not significantly different.
118
 
119
  <HtmlEmbed
120
  id="statistical-analysis"
 
114
 
115
  ### Which differences are real?
116
 
117
+ With 20 rollouts per experiment, not every visible gap is real. We run **[STEP](https://tri-ml.github.io/step/)** (Sequential Testing for Efficient Policy comparison) on all 55 pairs with **Bonferroni correction** (family-wise α = 0.10, per-pair α = 0.10/55 ≈ 0.0018, max sample size = 50), following [TRI's statistical evaluation framework](https://medium.com/toyotaresearch/statistical-thinking-for-robot-policy-evaluation-from-rigorous-a-b-testing-to-effective-0ae886fbd68d). Unlike batch tests, STEP allows us to collect additional rollouts later without p-hacking concerns; we set the max sample size to 50, leaving room for future data. The chart below shows the full **Bayesian Beta posterior** over each policy's true success rate. **CLD letters** above each violin indicate which experiments are statistically separable: policies sharing a letter are not significantly different.
118
 
119
  <HtmlEmbed
120
  id="statistical-analysis"
app/src/content/chapters/folding/12-references.mdx CHANGED
@@ -17,6 +17,7 @@ This project would not have been possible without the contributions and support
17
  <HfUser username="nepyope" name="Martino Russi" />
18
  <HfUser username="Nico-robot" name="Nicolas Rabault" />
19
  <HfUser username="VirgileBatto" name="Virgile Batto" />
 
20
  <HfUser username="thomwolf" name="Thomas Wolf" />
21
  </div>
22
 
 
17
  <HfUser username="nepyope" name="Martino Russi" />
18
  <HfUser username="Nico-robot" name="Nicolas Rabault" />
19
  <HfUser username="VirgileBatto" name="Virgile Batto" />
20
+ <HfUser username="lvwerra" name="Leandro von Werra" />
21
  <HfUser username="thomwolf" name="Thomas Wolf" />
22
  </div>
23
 
app/src/content/embeds/folding/statistical-analysis.html CHANGED
@@ -34,7 +34,7 @@ svg text{font-family:system-ui,sans-serif}
34
  <body>
35
  <div class="wrap">
36
  <div class="insight purple">
37
- <strong>What this shows:</strong> Each violin is the Bayesian posterior distribution over a policy's true success rate, given the observed rollouts. Wide = high uncertainty; narrow = high certainty. <strong>CLD letters above</strong> summarise which policies are statistically separable (Bonferroni-corrected). Shared letter → not significantly different.
38
  </div>
39
  <div class="card">
40
  <div class="card-head">
@@ -49,10 +49,10 @@ svg text{font-family:system-ui,sans-serif}
49
  <div class="legend">
50
  <div class="li"><div class="lsw" style="background:#f7934f"></div>Series 1</div>
51
  <div class="li"><div class="lsw" style="background:#4dc98a"></div>Series 2</div>
52
- <div class="li" style="font-size:10px;color:#8b8fa8">Bold letter = CLD group (Bonferroni α=0.10/55)</div>
53
  </div>
54
  </div>
55
- <p class="note">Violins represent posterior uncertainty, not confidence intervals. Two overlapping violins can still be statistically distinct. Barnard's exact test with Bonferroni correction for 55 pairwise comparisons; CLD groups computed at α=0.10 per-pair threshold (p&lt;0.0018).</p>
56
  </div>
57
 
58
  <div class="tooltip" id="tooltip"></div>
@@ -75,11 +75,11 @@ const DATA = {
75
  '2.5 HQ+RABC+Relβ˜…': {total:[18,20], L1:[10,10], L2:[8,10], series:2},
76
  };
77
 
78
- // CLD assignments (Barnard's exact test, two-sided, Bonferroni Ξ±=0.10/55)
79
  const CLD = {
80
- total: {'2.5 HQ+RABC+Relβ˜…':'a','2.2 HQ+RABC+Rel':'ab','1.1 Ο€0':'bc','1.7 Rel+RABC':'bc','2.1 HQ':'bc','1.3 Relative':'bc','1.2 Ο€0.5':'c','2.4 HQ chunk45':'c','1.4 RABC low':'c','2.3 HQ+mirror':'c','1.5 RABC high':'c'},
81
- L1: {'2.2 HQ+RABC+Rel':'a','2.5 HQ+RABC+Relβ˜…':'a','1.1 Ο€0':'ab','1.7 Rel+RABC':'ab','1.3 Relative':'ab','2.1 HQ':'ab','1.2 Ο€0.5':'abc','2.4 HQ chunk45':'abc','1.4 RABC low':'bc','1.5 RABC high':'c','2.3 HQ+mirror':'c'},
82
- L2: {'2.5 HQ+RABC+Relβ˜…':'a','2.2 HQ+RABC+Rel':'ab','2.1 HQ':'b','2.3 HQ+mirror':'b','1.1 Ο€0':'b','1.2 Ο€0.5':'b','1.3 Relative':'b','1.4 RABC low':'b','1.5 RABC high':'b','1.7 Rel+RABC':'b','2.4 HQ chunk45':'b'},
83
  };
84
 
85
  // ── BETA DISTRIBUTION PDF ────────────────────────────────────────────────────
 
34
  <body>
35
  <div class="wrap">
36
  <div class="insight purple">
37
+ <strong>What this shows:</strong> Each violin is the Bayesian posterior distribution over a policy's true success rate, given the observed rollouts. Wide = high uncertainty; narrow = high certainty. <strong>CLD letters above</strong> summarise which policies are statistically separable (<a href="https://tri-ml.github.io/step/" style="color:#818cf8">STEP</a>, Bonferroni-corrected). Shared letter → not significantly different.
38
  </div>
39
  <div class="card">
40
  <div class="card-head">
 
49
  <div class="legend">
50
  <div class="li"><div class="lsw" style="background:#f7934f"></div>Series 1</div>
51
  <div class="li"><div class="lsw" style="background:#4dc98a"></div>Series 2</div>
52
+ <div class="li" style="font-size:10px;color:#8b8fa8">Bold letter = CLD group (STEP, Bonferroni α=0.10/55, n<sub>max</sub>=50)</div>
53
  </div>
54
  </div>
55
+ <p class="note">Violins represent posterior uncertainty, not confidence intervals. Two overlapping violins can still be statistically distinct. <a href="https://tri-ml.github.io/step/" style="color:#555e7a">STEP</a> sequential test with Bonferroni correction for 55 pairwise comparisons (family-wise α=0.10, per-pair α=0.10/55≈0.0018, n<sub>max</sub>=50).</p>
56
  </div>
57
 
58
  <div class="tooltip" id="tooltip"></div>
 
75
  '2.5 HQ+RABC+Relβ˜…': {total:[18,20], L1:[10,10], L2:[8,10], series:2},
76
  };
77
 
78
+ // CLD assignments (STEP sequential test, two-sided, Bonferroni α=0.10/55, n_max=50)
79
  const CLD = {
80
+ total: {'2.5 HQ+RABC+Relβ˜…':'a','2.2 HQ+RABC+Rel':'ab','1.1 Ο€0':'bc','1.7 Rel+RABC':'bc','2.1 HQ':'bc','1.3 Relative':'bc','1.2 Ο€0.5':'cd','2.4 HQ chunk45':'cd','1.4 RABC low':'cd','2.3 HQ+mirror':'cd','1.5 RABC high':'d'},
81
+ L1: {'2.2 HQ+RABC+Rel':'a','2.5 HQ+RABC+Relβ˜…':'a','1.1 Ο€0':'a','1.7 Rel+RABC':'a','1.3 Relative':'a','2.1 HQ':'a','1.2 Ο€0.5':'ab','2.4 HQ chunk45':'ab','1.4 RABC low':'ab','1.5 RABC high':'b','2.3 HQ+mirror':'b'},
82
+ L2: {'2.5 HQ+RABC+Relβ˜…':'a','2.2 HQ+RABC+Rel':'ab','2.1 HQ':'ab','2.3 HQ+mirror':'ab','1.1 Ο€0':'b','1.2 Ο€0.5':'b','1.3 Relative':'b','1.4 RABC low':'b','1.5 RABC high':'b','1.7 Rel+RABC':'b','2.4 HQ chunk45':'b'},
83
  };
84
 
85
  // ── BETA DISTRIBUTION PDF ────────────────────────────────────────────────────
app/src/content/embeds/folding/success-rates.html CHANGED
@@ -23,13 +23,6 @@
23
  .toggle-btn.active-total { background: rgba(167,139,250,0.18); border-color: #a78bfa; color: #a78bfa; }
24
  .toggle-btn.active-l1 { background: rgba(77,201,138,0.18); border-color: #4dc98a; color: #4dc98a; }
25
  .toggle-btn.active-l2 { background: rgba(248,113,113,0.18); border-color: #f87171; color: #f87171; }
26
- .divider { width: 1px; height: 22px; background: #2a2d3a; }
27
- .sort-btn {
28
- background: none; border: 1px solid #2a2d3a; color: #8b8fa8;
29
- font-size: 11px; padding: 4px 10px; border-radius: 6px; cursor: pointer; transition: all .15s;
30
- }
31
- .sort-btn:hover { color: #e8eaf0; border-color: #555; }
32
- .sort-btn.active { color: #e8eaf0; border-color: #555; background: #1a1d27; }
33
 
34
  /* ── Legend ── */
35
  .legend { display: flex; gap: 16px; justify-content: center; flex-wrap: wrap; margin-bottom: 4px; }
@@ -38,7 +31,7 @@
38
  .legend-dot { width: 11px; height: 11px; border-radius: 2px; }
39
  .legend-pip { width: 8px; height: 8px; border-radius: 50%; display: inline-block; }
40
 
41
- /* ── Wilson explanation box ── */
42
  .ci-note {
43
  background: rgba(255,255,255,0.03); border: 1px solid #2a2d3a;
44
  border-radius: 8px; padding: 9px 14px; margin-bottom: 10px;
@@ -78,11 +71,6 @@
78
  <button class="toggle-btn active-total" id="btn-total" onclick="toggleMetric('total')">Total SR</button>
79
  <button class="toggle-btn active-l1" id="btn-l1" onclick="toggleMetric('l1')">Level 1</button>
80
  <button class="toggle-btn active-l2" id="btn-l2" onclick="toggleMetric('l2')">Level 2</button>
81
- <div class="divider"></div>
82
- <span class="ctrl-label">Sort by:</span>
83
- <button class="sort-btn active" id="sort-total" onclick="setSort('total')">Total</button>
84
- <button class="sort-btn" id="sort-l1" onclick="setSort('l1')">L1</button>
85
- <button class="sort-btn" id="sort-l2" onclick="setSort('l2')">L2</button>
86
  </div>
87
 
88
  <!-- Legend -->
@@ -96,11 +84,11 @@
96
  </div>
97
  </div>
98
 
99
- <!-- Wilson CI explanation -->
100
  <div class="ci-note">
101
  <span class="ci-note-icon">ℹ</span>
102
  <span>
103
- <strong>Error bars = Wilson 90% CI</strong> (Total: n=20, L1/L2: n=10). Wide bars reflect small sample sizes.
104
  </span>
105
  </div>
106
 
@@ -144,23 +132,61 @@ const raw = [
144
  {label:"2.5 HQ+RABC+Relβ˜…", series:"2", total:90, l1:100, l2:80},
145
  ];
146
 
147
- // ── Wilson 90% CI ──────────────────────────────────────────────────────────
148
- function wilson(pct, n, z = 1.645) {
149
- const p = pct / 100;
150
- const denom = 1 + z * z / n;
151
- const center = (p + z * z / (2 * n)) / denom;
152
- const margin = (z / denom) * Math.sqrt(p * (1 - p) / n + z * z / (4 * n * n));
153
- return {
154
- lo: Math.max(0, (center - margin) * 100),
155
- hi: Math.min(100, (center + margin) * 100),
156
- };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  }
158
 
159
  // Pre-compute CIs and attach to data
160
  const data_with_ci = raw.map(d => {
161
  const out = {...d};
162
  ["total","l1","l2"].forEach(k => {
163
- out[k + "_ci"] = wilson(d[k], N[k]);
164
  });
165
  return out;
166
  });
@@ -175,17 +201,12 @@ window.toggleMetric = function(key) {
175
  const numActive = Object.values(active).filter(Boolean).length;
176
  if (active[key] && numActive === 1) return;
177
  active[key] = !active[key];
 
 
178
  updateLegend();
179
  render();
180
  };
181
 
182
- window.setSort = function(key) {
183
- sortKey = key;
184
- document.querySelectorAll(".sort-btn").forEach(b => b.classList.remove("active"));
185
- document.getElementById("sort-" + key).classList.add("active");
186
- render();
187
- };
188
-
189
  function updateLegend() {
190
  ["total","l1","l2"].forEach(k => {
191
  document.getElementById("btn-" + k).className = "toggle-btn" + (active[k] ? " active-"+k : "");
@@ -283,7 +304,7 @@ function render() {
283
  </span>
284
  </div>
285
  <div style="margin-top:5px;font-size:10px;color:#555">
286
- 90% Wilson CI Β· this metric: ${k}/${n} successes
287
  </div>
288
  ${exp.note ? `<div class="tooltip-note">${exp.note}</div>` : ""}
289
  `);
@@ -293,7 +314,7 @@ function render() {
293
  })
294
  .on("mouseleave", () => tooltip.style("opacity", 0));
295
 
296
- // Error bar (Wilson CI)
297
  groups.append("line").attr("class","error-bar")
298
  .attr("x1", cx).attr("x2", cx)
299
  .attr("y1", d => y(d[key + "_ci"].hi))
 
23
  .toggle-btn.active-total { background: rgba(167,139,250,0.18); border-color: #a78bfa; color: #a78bfa; }
24
  .toggle-btn.active-l1 { background: rgba(77,201,138,0.18); border-color: #4dc98a; color: #4dc98a; }
25
  .toggle-btn.active-l2 { background: rgba(248,113,113,0.18); border-color: #f87171; color: #f87171; }
 
 
 
 
 
 
 
26
 
27
  /* ── Legend ── */
28
  .legend { display: flex; gap: 16px; justify-content: center; flex-wrap: wrap; margin-bottom: 4px; }
 
31
  .legend-dot { width: 11px; height: 11px; border-radius: 2px; }
32
  .legend-pip { width: 8px; height: 8px; border-radius: 50%; display: inline-block; }
33
 
34
+ /* ── CI explanation box ── */
35
  .ci-note {
36
  background: rgba(255,255,255,0.03); border: 1px solid #2a2d3a;
37
  border-radius: 8px; padding: 9px 14px; margin-bottom: 10px;
 
71
  <button class="toggle-btn active-total" id="btn-total" onclick="toggleMetric('total')">Total SR</button>
72
  <button class="toggle-btn active-l1" id="btn-l1" onclick="toggleMetric('l1')">Level 1</button>
73
  <button class="toggle-btn active-l2" id="btn-l2" onclick="toggleMetric('l2')">Level 2</button>
 
 
 
 
 
74
  </div>
75
 
76
  <!-- Legend -->
 
84
  </div>
85
  </div>
86
 
87
+ <!-- Clopper-Pearson CI explanation -->
88
  <div class="ci-note">
89
  <span class="ci-note-icon">ℹ</span>
90
  <span>
91
+ <strong>Error bars = Clopper-Pearson 90% CI</strong> (Total: n=20, L1/L2: n=10). Wide bars reflect small sample sizes.
92
  </span>
93
  </div>
94
 
 
132
  {label:"2.5 HQ+RABC+Relβ˜…", series:"2", total:90, l1:100, l2:80},
133
  ];
134
 
135
// ── Clopper-Pearson exact 90% CI ────────────────────────────────────────────

/**
 * Natural logarithm of the gamma function, ln Γ(x).
 *
 * For x >= 0.5 the value comes from a Lanczos-style series (leading constant
 * plus eight coefficients, shift 7.5); smaller arguments are mapped onto that
 * range with the reflection identity Γ(x)·Γ(1 − x) = π / sin(πx).
 *
 * @param {number} x - Argument of the gamma function.
 * @returns {number} ln Γ(x).
 */
function lgamma(x) {
  if (x < 0.5) {
    return Math.log(Math.PI / Math.sin(Math.PI * x)) - lgamma(1 - x);
  }
  const coeffs = [
    676.5203681218851, -1259.1392167224028, 771.32342877765313, -176.61502916214059,
    12.507343278686905, -0.13857109526572012, 9.9843695780195716e-6, 1.5056327351493116e-7,
  ];
  // Work on a shifted copy instead of mutating the parameter.
  const z = x - 1;
  let series = 0.99999999999980993;
  for (let i = 0; i < coeffs.length; i++) {
    series += coeffs[i] / (z + i + 1);
  }
  const t = z + 8 - 0.5;
  return 0.5 * Math.log(2 * Math.PI) + (z + 0.5) * Math.log(t) - t + Math.log(series);
}

/**
 * Continued-fraction factor used by {@link betaI} to evaluate the regularized
 * incomplete beta function.
 *
 * The fraction is evaluated term by term; denominators that get closer to
 * zero than 1e-30 are replaced by 1e-30 so the recurrence never divides by
 * zero. Iteration stops once a step changes the product by less than 3e-12
 * (relative) or after 200 steps, whichever comes first; in the latter case
 * the best estimate so far is returned without signalling an error.
 *
 * @param {number} x - Evaluation point in (0, 1).
 * @param {number} a - First shape parameter (> 0).
 * @param {number} b - Second shape parameter (> 0).
 * @returns {number} Value of the continued fraction.
 */
function betaCf(x, a, b) {
  const MAX_ITER = 200;
  const EPS = 3e-12;
  const TINY = 1e-30;
  const guard = (v) => (Math.abs(v) < TINY ? TINY : v);

  const qab = a + b;
  const qap = a + 1;
  const qam = a - 1;
  let c = 1;
  let d = 1 / guard(1 - qab * x / qap);
  let h = d;
  for (let m = 1; m <= MAX_ITER; m++) {
    const m2 = 2 * m;
    // Even step of the recurrence.
    let aa = m * (b - m) * x / ((qam + m2) * (a + m2));
    d = 1 / guard(1 + aa * d);
    c = guard(1 + aa / c);
    h *= d * c;
    // Odd step of the recurrence.
    aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2));
    d = 1 / guard(1 + aa * d);
    c = guard(1 + aa / c);
    const del = d * c;
    h *= del;
    if (Math.abs(del - 1) < EPS) break;
  }
  return h;
}

/**
 * Regularized incomplete beta function I_x(a, b), i.e. the CDF of a
 * Beta(a, b) distribution evaluated at x.
 *
 * @param {number} x - Evaluation point; values <= 0 give 0 and values >= 1 give 1.
 * @param {number} a - First shape parameter (> 0).
 * @param {number} b - Second shape parameter (> 0).
 * @returns {number} I_x(a, b) in [0, 1].
 */
function betaI(x, a, b) {
  if (x <= 0) return 0;
  if (x >= 1) return 1;
  const bt = Math.exp(
    lgamma(a + b) - lgamma(a) - lgamma(b) + a * Math.log(x) + b * Math.log(1 - x)
  );
  // Use the symmetry I_x(a, b) = 1 − I_{1−x}(b, a) so the continued fraction
  // is always evaluated on the side of the distribution's bulk selected here.
  if (x < (a + 1) / (a + b + 2)) {
    return bt * betaCf(x, a, b) / a;
  }
  return 1 - bt * betaCf(1 - x, b, a) / b;
}

/**
 * Quantile function of the Beta(a, b) distribution, found by bisection on
 * {@link betaI} over [0, 1].
 *
 * At most 100 halvings are performed. The loop exits early once the midpoint
 * coincides with an endpoint of the bracket: from that point further halvings
 * cannot change the answer, so the result is identical to running all 100.
 *
 * @param {number} p - Target cumulative probability.
 * @param {number} a - First shape parameter (> 0).
 * @param {number} b - Second shape parameter (> 0).
 * @returns {number} x in [0, 1] with I_x(a, b) ≈ p.
 */
function betaInv(p, a, b) {
  const MAX_BISECTIONS = 100;
  let lo = 0;
  let hi = 1;
  for (let i = 0; i < MAX_BISECTIONS; i++) {
    const mid = (lo + hi) / 2;
    if (mid === lo || mid === hi) break; // bracket exhausted at double precision
    if (betaI(mid, a, b) < p) lo = mid; else hi = mid;
  }
  return (lo + hi) / 2;
}

/**
 * Exact (Clopper-Pearson) two-sided confidence interval for a binomial
 * proportion, expressed in percent.
 *
 * The success count is reconstructed as round(pct / 100 · n) and clamped to
 * [0, n], so a slightly out-of-range percentage cannot push invalid shape
 * parameters into the beta quantile. The bounds are the usual beta quantiles:
 * lower = Beta⁻¹(α/2; k, n − k + 1) (0 when k = 0) and
 * upper = Beta⁻¹(1 − α/2; k + 1, n − k) (1 when k = n).
 *
 * @param {number} pct - Observed success rate in percent (0–100).
 * @param {number} n - Number of trials.
 * @param {number} [alpha=0.10] - Total two-sided error rate (0.10 → 90% CI).
 * @returns {{lo: number, hi: number}} Interval bounds in percent. Both are NaN
 *   when `pct` or `n` is not finite or `n` is negative.
 */
function clopperPearson(pct, n, alpha = 0.10) {
  if (!Number.isFinite(pct) || !Number.isFinite(n) || n < 0) {
    return { lo: NaN, hi: NaN };
  }
  const k = Math.min(n, Math.max(0, Math.round(pct / 100 * n)));
  const lo = k === 0 ? 0 : betaInv(alpha / 2, k, n - k + 1);
  const hi = k === n ? 1 : betaInv(1 - alpha / 2, k + 1, n - k);
  return { lo: lo * 100, hi: hi * 100 };
}
184
 
185
  // Pre-compute CIs and attach to data
186
  const data_with_ci = raw.map(d => {
187
  const out = {...d};
188
  ["total","l1","l2"].forEach(k => {
189
+ out[k + "_ci"] = clopperPearson(d[k], N[k]);
190
  });
191
  return out;
192
  });
 
201
  const numActive = Object.values(active).filter(Boolean).length;
202
  if (active[key] && numActive === 1) return;
203
  active[key] = !active[key];
204
+ if (active[key]) sortKey = key;
205
+ else if (sortKey === key) sortKey = ["total","l1","l2"].find(k => active[k]);
206
  updateLegend();
207
  render();
208
  };
209
 
 
 
 
 
 
 
 
210
  function updateLegend() {
211
  ["total","l1","l2"].forEach(k => {
212
  document.getElementById("btn-" + k).className = "toggle-btn" + (active[k] ? " active-"+k : "");
 
304
  </span>
305
  </div>
306
  <div style="margin-top:5px;font-size:10px;color:#555">
307
+ 90% Clopper-Pearson CI Β· this metric: ${k}/${n} successes
308
  </div>
309
  ${exp.note ? `<div class="tooltip-note">${exp.note}</div>` : ""}
310
  `);
 
314
  })
315
  .on("mouseleave", () => tooltip.style("opacity", 0));
316
 
317
+ // Error bar (Clopper-Pearson CI)
318
  groups.append("line").attr("class","error-bar")
319
  .attr("x1", cx).attr("x2", cx)
320
  .attr("y1", d => y(d[key + "_ci"].hi))