<!--
  Source: robot-folding / app / src / content / embeds / folding / failure-analysis.html
  Last commit by pepijn223 (HF Staff): "Improve DAgger explainer, add conclusion and expand references" (f0f3d44, unverified)
-->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<style>
/* Theme tokens for the embed. The chart script reads --text, --subtext and
   --border from the root element each time it draws a chart. */
:root {
--bg: transparent;
--text: #e8eaf0;
--subtext: #8b8fa8;
--border: #2a2d3a;
--chart-bg: #1a1d27;
--btn-active-bg: #252835;
--btn-active-text: #e8eaf0;
--btn-active-border: #4a4d5a;
}
/* Reset: border-box sizing and no default margin/padding on any element. */
* { box-sizing: border-box; margin: 0; padding: 0; }
/* Page shell: the background resolves to --bg, which is transparent. */
body {
background: var(--bg);
font-family: system-ui, sans-serif;
color: var(--text);
padding: 0;
min-height: 100vh;
}
/* Centered content column, capped at 960px wide. */
.container {
max-width: 960px;
margin: 0 auto;
padding: 28px 24px 36px;
}
/* Row holding the level tabs; its bottom border is the baseline the tabs sit on. */
.tab-row {
display: flex;
gap: 4px;
margin-bottom: 20px;
border-bottom: 1px solid var(--border);
padding-bottom: 0;
}
/* A single tab button. position: relative with bottom: -1px shifts the tab
   1px down so it overlaps the row's bottom border. */
.tab {
padding: 7px 16px;
border: 1px solid transparent;
border-bottom: none;
background: none;
color: var(--subtext);
cursor: pointer;
font-family: inherit;
font-size: 13px;
letter-spacing: 0.05em;
text-transform: uppercase;
border-radius: 4px 4px 0 0;
transition: all 0.15s;
position: relative;
bottom: -1px;
}
.tab:hover { color: var(--text); background: var(--chart-bg); }
/* Selected tab: same background as the chart card, with a visible border. */
.tab.active {
color: var(--text);
background: var(--chart-bg);
border-color: var(--border);
border-bottom-color: var(--chart-bg);
}
/* Panels are hidden unless they carry the "active" class (toggled by showTab). */
.panel { display: none; }
.panel.active { display: block; }
/* Card around each chart. The script subtracts 40px from this element's
   clientWidth (the 20px left + 20px right padding) to size the SVG viewBox. */
.chart-wrap {
background: var(--chart-bg);
border: 1px solid var(--border);
border-radius: 6px;
padding: 20px 20px 12px;
}
/* Title on the left, Counts/Percentage toggle on the right; wraps on narrow widths. */
.chart-header {
display: flex;
justify-content: space-between;
align-items: center;
flex-wrap: wrap;
gap: 8px;
margin-bottom: 16px;
}
.chart-title {
font-size: 13px;
text-transform: uppercase;
letter-spacing: 0.08em;
color: var(--subtext);
}
/* Two-button segmented control; gap: 0 keeps the buttons flush. */
.mode-toggle {
display: flex;
gap: 0;
}
.mode-btn {
padding: 5px 14px;
font-size: 12px;
font-family: inherit;
cursor: pointer;
border: 1px solid var(--border);
background: none;
color: var(--subtext);
transition: all 0.15s;
letter-spacing: 0.04em;
}
/* Round only the outer corners; the last button drops its left border so the
   shared edge is not drawn twice. */
.mode-btn:first-child { border-radius: 4px 0 0 4px; }
.mode-btn:last-child { border-radius: 0 4px 4px 0; border-left: none; }
.mode-btn.active { background: var(--btn-active-bg); color: var(--btn-active-text); border-color: var(--btn-active-border); }
.mode-btn:hover:not(.active) { color: var(--text); }
/* Legend strip under each chart; its items are generated by the chart script. */
.legend {
display: flex;
flex-wrap: wrap;
gap: 10px 18px;
margin-top: 14px;
padding-top: 12px;
border-top: 1px solid var(--border);
}
.legend-item {
display: flex;
align-items: center;
gap: 6px;
font-size: 12px;
color: var(--subtext);
letter-spacing: 0.03em;
}
/* Colour chip; the background colour is supplied inline per subtask. */
.legend-swatch {
width: 10px;
height: 10px;
border-radius: 2px;
flex-shrink: 0;
}
/* NOTE(review): no element in this file uses .series-labels or .series-badge;
   confirm whether these rules are still needed. */
.series-labels {
display: flex;
gap: 16px;
margin-bottom: 14px;
}
.series-badge {
display: flex;
align-items: center;
gap: 6px;
font-size: 12px;
letter-spacing: 0.05em;
text-transform: uppercase;
color: var(--subtext);
}
.series-badge span {
width: 24px;
height: 3px;
border-radius: 2px;
display: inline-block;
}
/* Keep SVG text in the same font family as the page body. */
svg text { font-family: system-ui, sans-serif; }
/* Explanatory caption shown below each chart card. */
.note {
font-size: 12px;
color: var(--subtext);
margin-top: 10px;
line-height: 1.6;
letter-spacing: 0.02em;
}
/* Call-out above each chart. Border, body text and emphasis colours come from
   the theme tokens declared on :root (--border, --subtext, --text) so the box
   follows any change to those tokens; they resolve to the same values that
   were previously hard-coded here. The darker background and the orange left
   accent have no matching token and stay literal. */
.insight-box {
  background: #12151f;
  border: 1px solid var(--border);
  border-left: 3px solid #f7934f;
  border-radius: 4px;
  padding: 10px 14px;
  margin-bottom: 16px;
  font-size: 12px;
  color: var(--subtext);
  line-height: 1.6;
}
.insight-box strong { color: var(--text); }
/* Pill button that switches between the default subset of experiments and the
   full list. */
.filter-toggle {
  font-size: 12px;
  color: var(--subtext);
  cursor: pointer;
  border: 1px solid var(--border);
  background: none;
  padding: 4px 12px;
  border-radius: 14px;
  transition: all .15s;
  user-select: none;
  display: inline-flex;
  align-items: center;
  gap: 4px;
  font-family: inherit;
}
.filter-toggle:hover { color: var(--text); }
/* State while all experiments are shown. The tint is the accent colour
   #f7934f = rgb(247, 147, 79) at 12% alpha; the red channel previously read
   248, which did not match the border and text colour. */
.filter-toggle.showing-all {
  background: rgba(247,147,79,0.12);
  border-color: #f7934f;
  color: #f7934f;
}
</style>
</head>
<body>
<div class="container">
<!-- Level tabs (left) and the experiment filter pill (right).
     showTab() finds the tab to highlight by inspecting each tab's onclick
     attribute, so those attributes must keep the quoted panel id.
     Every button declares type="button" so it can never act as a submit
     control if this markup ends up inside a form. -->
<div class="tab-row" style="justify-content:space-between;align-items:center">
  <div style="display:flex;gap:4px">
    <button type="button" class="tab active" onclick="showTab('l1')">Level 1: Failure point</button>
    <button type="button" class="tab" onclick="showTab('l2')">Level 2: Failure point</button>
  </div>
  <button type="button" class="filter-toggle" id="fa-filter-btn" onclick="window._faToggleFilter()">Show all 11</button>
</div>
<!-- LEVEL 2 PANEL (hidden until its tab is selected) -->
<!-- NOTE(review): the insight text says "2.5: 0%" for Unfold failures, while the
     chart data lists 2 Unfold failures for experiment 2.5. Confirm which is intended. -->
<div class="panel" id="panel-l2">
  <div class="insight-box">
    <strong>Initial training (1.x):</strong> nearly all level 2 failures occur at Unfold (the robot never gets past step 1).&nbsp;
    <strong>Fine-tuned (2.x):</strong> Unfold failures collapse (2.5: 0%), but late-stage failures (Fold 3, Rotation) emerge: the model now reliably unfolds but precision degrades at the end.
  </div>
  <div class="chart-wrap">
    <div class="chart-header">
      <div class="chart-title">Where does the robot fail? Level 2 failed rollouts by subtask</div>
      <div class="mode-toggle">
        <button type="button" class="mode-btn active" id="mode-l2-abs" onclick="setMode('l2','abs')">Counts</button>
        <button type="button" class="mode-btn" id="mode-l2-pct" onclick="setMode('l2','pct')">Percentage</button>
      </div>
    </div>
    <!-- The accessible name is an attribute, not a <title> child, because the
         chart script removes every child of this SVG before it draws.
         height matches the 340 the script applies, so the card does not
         resize when the chart is first drawn. -->
    <svg id="chart-l2" role="img" aria-label="Stacked bar chart of Level 2 failed rollouts by subtask, one bar per experiment" width="100%" height="340" style="overflow:visible"></svg>
    <div class="legend" id="legend-l2"></div>
  </div>
  <p class="note">Each bar = one experiment, showing how its failed Level 2 rollouts distribute across subtasks. Only failed rollouts shown; successful rollouts are excluded. Toggle "Percentage" to compare failure distributions regardless of total failure count.</p>
</div>
<!-- LEVEL 1 PANEL (the panel shown on load; it carries the "active" class) -->
<div class="panel active" id="panel-l1">
  <div class="insight-box">
    <strong>Level 1 failures</strong> are more distributed since unfolding is given. Initial-training failures concentrate at Fold 2 and Fold 4 (mid-task precision). Fine-tuning nearly eliminates failures entirely; only 2.3 (mirroring) and 2.4 (chunk=45) regress significantly.
  </div>
  <div class="chart-wrap">
    <div class="chart-header">
      <div class="chart-title">Where does the robot fail? Level 1 failed rollouts by subtask</div>
      <div class="mode-toggle">
        <button type="button" class="mode-btn active" id="mode-l1-abs" onclick="setMode('l1','abs')">Counts</button>
        <button type="button" class="mode-btn" id="mode-l1-pct" onclick="setMode('l1','pct')">Percentage</button>
      </div>
    </div>
    <!-- The accessible name is an attribute, not a <title> child, because the
         chart script removes every child of this SVG before it draws.
         height matches the 340 the script applies, so the card does not
         resize when the chart is first drawn. -->
    <svg id="chart-l1" role="img" aria-label="Stacked bar chart of Level 1 failed rollouts by subtask, one bar per experiment" width="100%" height="340" style="overflow:visible"></svg>
    <div class="legend" id="legend-l1"></div>
  </div>
  <p class="note">Level 1 begins with the shirt already laid flat, so "Unfold" is not a failure point. Toggle "Percentage" to compare where each experiment struggles, independent of how many total failures it has.</p>
</div>
</div>
<script>
function _initFailureAnalysis() {
// Experiments in x-axis order. `id` is both the axis label and the lookup key
// into L2_FAILURES / L1_FAILURES, so the strings must match exactly.
// `series` (1 or 2) picks the axis-label colour and the side of the
// SERIES 1 / SERIES 2 divider the bar falls on.
const EXPERIMENTS = [
{ id:'1.1 π0', series:1 },
{ id:'1.2 π0.5', series:1 },
{ id:'1.3 Relative', series:1 },
{ id:'1.4 RABC low', series:1 },
{ id:'1.5 RABC high', series:1 },
{ id:'1.7 Rel+RABC', series:1 },
{ id:'2.1 HQ', series:2 },
{ id:'2.2 HQ+RABC+Rel', series:2 },
{ id:'2.3 HQ+mirror', series:2 },
{ id:'2.4 HQ chunk45', series:2 },
{ id:'2.5 HQ+RABC+Rel★', series:2 },
];
// Level 2 data: experiment id -> { subtask: count of failed rollouts }.
// A subtask missing from an entry is charted as 0.
const L2_FAILURES = {
'1.1 π0': { 'Unfold':10 },
'1.2 π0.5': { 'Unfold':9, 'Rotation':1 },
'1.3 Relative': { 'Unfold':10 },
'1.4 RABC low': { 'Unfold':10 },
'1.5 RABC high': { 'Unfold':9, 'Fold 1':1 },
'1.7 Rel+RABC': { 'Unfold':8, 'Fold 3':1, 'Rotation':1 },
'2.1 HQ': { 'Unfold':8, 'Rotation':1 },
'2.2 HQ+RABC+Rel': { 'Unfold':4, 'Rotation':1 },
'2.3 HQ+mirror': { 'Unfold':8, 'Fold 1':1 },
'2.4 HQ chunk45': { 'Unfold':9, 'Fold 3':1 },
'2.5 HQ+RABC+Rel★': { 'Unfold':2 },
};
// Level 1 data, same shape as L2_FAILURES. An empty object means no failures,
// which the chart labels "✓ 0 failures".
const L1_FAILURES = {
'1.1 π0': { 'Fold 2':1 },
'1.2 π0.5': { 'Rotation':4, 'Fold 4':2, 'Fold 2':1 },
'1.3 Relative': { 'Rotation':1, 'Fold 4':1 },
'1.4 RABC low': { 'Rotation':2, 'Fold 3':1, 'Fold 4':2, 'Fold 2':3 },
'1.5 RABC high': { 'Fold 3':2, 'Fold 2':6, 'Fold 1':1 },
'1.7 Rel+RABC': { 'Fold 4':1, 'Fold 2':1, 'Rotation':1 },
'2.1 HQ': { 'Fold 2':1, 'Fold 4':1 },
'2.2 HQ+RABC+Rel': { 'Fold 2':1 },
'2.3 HQ+mirror': { 'Fold 1':3, 'Fold 4':3, 'Fold 3':3 },
'2.4 HQ chunk45': { 'Rotation':2, 'Fold 4':3, 'Fold 3':1 },
'2.5 HQ+RABC+Rel★': {},
};
// Subtask lists per level. Array order is the stacking order (first key at the
// bottom of each bar) and the legend order. Level 1 has no 'Unfold' entry.
const SUBTASKS_L2 = ['Unfold','Fold 1','Fold 2','Fold 3','Fold 4','Rotation'];
const SUBTASKS_L1 = ['Fold 1','Fold 2','Fold 3','Fold 4','Rotation'];
// Fill colour per subtask, shared by the bars and the legend swatches.
const COLORS = {
'Unfold': '#ef4444',
'Fold 1': '#f97316',
'Fold 2': '#eab308',
'Fold 3': '#84cc16',
'Fold 4': '#22d3ee',
'Rotation': '#818cf8',
};
// Experiment ids left out of the chart until the filter pill is toggled to show all.
const HIDDEN_BY_DEFAULT = new Set(['1.4 RABC low','1.5 RABC high','2.3 HQ+mirror','2.4 HQ chunk45']);
// true: chart every experiment; false: omit the ids in HIDDEN_BY_DEFAULT.
let faShowAll = false;

/**
 * Click handler for the filter pill (#fa-filter-btn).
 *
 * Flips between the default subset and the full experiment list, updates the
 * pill's label and highlight, marks both charts stale, and redraws the chart
 * in the currently visible panel. The other chart is redrawn when its tab is
 * next shown.
 *
 * The "Show all N" label takes N from EXPERIMENTS so it stays correct when the
 * list changes. A missing button or missing active panel is tolerated instead
 * of throwing.
 */
window._faToggleFilter = function() {
  faShowAll = !faShowAll;

  const btn = document.getElementById("fa-filter-btn");
  if (btn) {
    btn.textContent = faShowAll ? "Key experiments" : `Show all ${EXPERIMENTS.length}`;
    btn.classList.toggle("showing-all", faShowAll);
  }

  // Both charts depend on the filter, so invalidate both render flags.
  rendered.l1 = false;
  rendered.l2 = false;

  const activePanel = document.querySelector('.panel.active');
  if (activePanel) {
    // Panel ids have the form "panel-<level>"; renderTab expects "<level>".
    renderTab(activePanel.id.replace('panel-',''));
  }
};
/**
 * Experiments to chart under the current filter setting.
 *
 * @returns {Array<{id: string, series: number}>} EXPERIMENTS itself when all
 *   are shown, otherwise a new array without the ids in HIDDEN_BY_DEFAULT.
 */
function getVisibleExperiments() {
  if (faShowAll) {
    return EXPERIMENTS;
  }
  return EXPERIMENTS.filter(function (experiment) {
    return HIDDEN_BY_DEFAULT.has(experiment.id) === false;
  });
}
// Display mode per level: 'abs' charts raw counts, 'pct' charts each bar as a
// percentage of that experiment's failures.
const modes = { l2: 'abs', l1: 'abs' };

/**
 * Switch one level's chart between counts and percentages, then redraw it.
 *
 * @param {string} level - 'l1' or 'l2'; also selects the mode-<level>-* buttons.
 * @param {string} mode  - 'abs' or 'pct'.
 */
function setMode(level, mode) {
  modes[level] = mode;

  // Highlight only the button whose suffix matches the chosen mode.
  for (const option of ['abs', 'pct']) {
    const button = document.getElementById(`mode-${level}-${option}`);
    button.classList.toggle('active', mode === option);
  }

  // Clear the render flag so renderTab does not return early.
  rendered[level] = false;
  renderTab(level);
}
/**
 * Read a CSS custom property from the document root.
 *
 * @param {string} name - Property name including the leading dashes, e.g. '--text'.
 * @returns {string} The computed value with surrounding whitespace trimmed.
 */
function cssVar(name) {
  const rootStyles = getComputedStyle(document.documentElement);
  const rawValue = rootStyles.getPropertyValue(name);
  return rawValue.trim();
}
/**
 * Draw (or redraw) a stacked bar chart of failed rollouts into an SVG and fill
 * the matching legend container.
 *
 * @param {string} svgId       - id of the <svg> element to draw into; existing children are removed.
 * @param {string} legendId    - id of the element whose innerHTML becomes the legend.
 * @param {Object} data        - experiment id -> { subtask: count }; missing entries count as 0.
 * @param {string[]} subtasks  - stack keys, bottom-to-top; also the legend order.
 * @param {Array<{id: string, series: number}>} experiments - bars to draw, in x-axis order.
 * @param {boolean} normalize  - true: each bar is scaled to percentages of its own total;
 *                               false: raw counts.
 */
function buildStackedBar(svgId, legendId, data, subtasks, experiments, normalize) {
  const subtextColor = cssVar('--subtext');
  const borderColor = cssVar('--border');

  const H = 340;
  const margin = { top: 30, right: 16, bottom: 80, left: 70 };

  const svgEl = document.getElementById(svgId);
  // 40 = the card's horizontal padding. If the card has no width (for example
  // it sits in a hidden container), clientWidth is 0; clamp so the inner width
  // never goes negative and the bars never get a negative width.
  const W = Math.max(svgEl.parentElement.clientWidth - 40, margin.left + margin.right);
  const innerW = W - margin.left - margin.right;
  const innerH = H - margin.top - margin.bottom;

  const svg = d3.select(svgEl)
    .attr('viewBox', `0 0 ${W} ${H}`)
    .attr('height', H);
  svg.selectAll('*').remove();

  const g = svg.append('g')
    .attr('transform', `translate(${margin.left},${margin.top})`);

  // One row per experiment with a count for every subtask, plus the row total.
  const expIds = experiments.map(a => a.id);
  const stackData = expIds.map(id => {
    const row = { id };
    subtasks.forEach(s => { row[s] = (data[id] && data[id][s]) || 0; });
    row._total = subtasks.reduce((sum, s) => sum + row[s], 0);
    return row;
  });

  // In percentage mode every non-empty bar is rescaled to sum to 100; the raw
  // total is kept so the label can still show n.
  let displayData;
  if (normalize) {
    displayData = stackData.map(row => {
      const out = { id: row.id, _total: row._total };
      subtasks.forEach(s => {
        out[s] = row._total > 0 ? (row[s] / row._total) * 100 : 0;
      });
      out._displayTotal = row._total > 0 ? 100 : 0;
      return out;
    });
  } else {
    displayData = stackData.map(row => ({ ...row, _displayTotal: row._total }));
  }

  // Fall back to 10 when there is nothing to chart so the y axis still has a range.
  const maxVal = normalize ? 100 : (d3.max(displayData, d => d._displayTotal) || 10);
  const x = d3.scaleBand().domain(expIds).range([0, innerW]).padding(0.28);
  const y = d3.scaleLinear().domain([0, maxVal]).range([innerH, 0]).nice();
  const stack = d3.stack().keys(subtasks)(displayData);

  // Grid lines
  g.append('g').attr('class', 'grid')
    .call(d3.axisLeft(y).tickSize(-innerW).tickFormat('').ticks(5))
    .call(gg => {
      gg.select('.domain').remove();
      gg.selectAll('line').attr('stroke', borderColor).attr('stroke-dasharray', '3,3');
    });

  // Stacked bars
  const layer = g.selectAll('.layer').data(stack).join('g')
    .attr('class', 'layer').attr('fill', d => COLORS[d.key] || '#666');
  layer.selectAll('rect').data(d => d).join('rect')
    .attr('x', d => x(d.data.id))
    .attr('y', d => y(d[1]))
    .attr('height', d => Math.max(0, y(d[0]) - y(d[1])))
    .attr('width', x.bandwidth())
    .attr('rx', 2)
    .attr('opacity', 0.88);

  // Labels on top: the raw total (or n=<total> in percentage mode); bars with
  // no failures get a dimmer "0 failures" marker at the baseline.
  g.selectAll('.bar-label').data(displayData).join('text')
    .attr('class', 'bar-label')
    .attr('x', d => x(d.id) + x.bandwidth() / 2)
    .attr('y', d => d._total === 0 ? y(0) - 4 : y(d._displayTotal) - 5)
    .attr('text-anchor', 'middle')
    .attr('fill', d => d._total === 0 ? borderColor : subtextColor)
    .attr('font-size', '11')
    .text(d => {
      if (d._total === 0) return '✓ 0 failures';
      return normalize ? `n=${d._total}` : d._total;
    });

  // Series divider line. Drawn only when both series are present; optional
  // chaining keeps a list with no series-1 experiments from throwing.
  const s1Last = experiments.filter(a => a.series === 1).pop()?.id;
  const s2First = experiments.find(a => a.series === 2)?.id;
  if (s1Last && s2First) {
    // With padding(0.28) the gap between bands is 0.28 * step, so 0.14 * step
    // past the last series-1 bar is the middle of that gap.
    const xDiv = x(s1Last) + x.bandwidth() + x.step() * 0.14;
    g.append('line')
      .attr('x1', xDiv).attr('x2', xDiv)
      .attr('y1', -22).attr('y2', innerH + 4)
      .attr('stroke', borderColor).attr('stroke-width', 1).attr('stroke-dasharray', '4,3');
    g.append('text').attr('x', xDiv - 6).attr('y', -18).attr('text-anchor', 'end')
      .attr('fill', '#f7934f').attr('font-size', '10').attr('letter-spacing', '0.06em').text('SERIES 1');
    g.append('text').attr('x', xDiv + 6).attr('y', -18).attr('text-anchor', 'start')
      .attr('fill', '#4dc98a').attr('font-size', '10').attr('letter-spacing', '0.06em').text('SERIES 2');
  }

  // Axes
  g.append('g')
    .call(d3.axisLeft(y).ticks(5).tickSize(4).tickFormat(d => normalize ? d + '%' : d))
    .call(gg => {
      gg.select('.domain').attr('stroke', borderColor);
      gg.selectAll('text').attr('fill', subtextColor).attr('font-size', '11');
      gg.selectAll('line').attr('stroke', borderColor);
    });
  g.append('g').attr('transform', `translate(0,${innerH})`)
    .call(d3.axisBottom(x).tickSize(0))
    .call(gg => {
      gg.select('.domain').attr('stroke', borderColor);
      gg.selectAll('text')
        // Tick labels use the same colours as the SERIES 1 / SERIES 2 captions.
        .attr('fill', d => {
          const a = experiments.find(a => a.id === d);
          return a?.series === 2 ? '#4dc98a' : '#f7934f';
        })
        .attr('font-size', '11')
        .attr('transform', 'rotate(-40)')
        .attr('text-anchor', 'end')
        .attr('dx', '-0.5em')
        .attr('dy', '0.3em');
    });

  // Y axis label
  svg.append('text').attr('transform', 'rotate(-90)')
    .attr('x', -(margin.top + innerH / 2)).attr('y', 10).attr('text-anchor', 'middle')
    .attr('fill', subtextColor).attr('font-size', '11')
    .text(normalize ? 'Failure distribution (%)' : 'Failed rollouts (n)');

  // Legend. Swatches fall back to the same '#666' the bars use when a subtask
  // has no entry in COLORS.
  const legendEl = document.getElementById(legendId);
  legendEl.innerHTML = subtasks.map(s => `
<div class="legend-item">
<div class="legend-swatch" style="background:${COLORS[s] || '#666'}"></div>
<span>${s}</span>
</div>
`).join('');
}
// Per-level flag: true once the chart has been drawn with the current mode and
// filter. Callers reset it to false to force a redraw.
const rendered = { l2: false, l1: false };

/**
 * Draw the chart for one level unless it is already up to date.
 *
 * @param {string} id - 'l1' or 'l2'. Any other value is flagged as rendered
 *   and nothing is drawn.
 */
function renderTab(id) {
  if (rendered[id]) {
    return;
  }
  rendered[id] = true;

  const asPercent = modes[id] === 'pct';
  const shown = getVisibleExperiments();

  switch (id) {
    case 'l2':
      buildStackedBar('chart-l2', 'legend-l2', L2_FAILURES, SUBTASKS_L2, shown, asPercent);
      break;
    case 'l1':
      buildStackedBar('chart-l1', 'legend-l1', L1_FAILURES, SUBTASKS_L1, shown, asPercent);
      break;
  }
}
/**
 * Activate the panel and tab for one level and make sure its chart is drawn.
 *
 * @param {string} id - Level key ('l1' or 'l2'); the panel element is "panel-<id>".
 *
 * If no such panel exists the call is a no-op, so an unknown id can no longer
 * clear every active state and then throw. Tabs are matched by looking for the
 * quoted id inside their inline onclick attribute; a tab without that
 * attribute is treated as not matching instead of raising a TypeError.
 */
function showTab(id) {
  const targetPanel = document.getElementById('panel-' + id);
  if (!targetPanel) return;

  document.querySelectorAll('.panel').forEach(p => {
    p.classList.toggle('active', p === targetPanel);
  });

  const quotedId = "'" + id + "'";
  document.querySelectorAll('.tab').forEach(t => {
    const handler = t.getAttribute('onclick') || '';
    t.classList.toggle('active', handler.includes(quotedId));
  });

  // The panel must be visible before drawing: the chart width is measured from the DOM.
  renderTab(id);
}
// The markup calls showTab(...) and setMode(...) from inline onclick
// attributes, which resolve names globally, so expose both on window.
window.showTab = showTab;
window.setMode = setMode;
// Draw the Level 1 chart right away; its panel is the one marked active in the markup.
renderTab('l1');
}
// Bootstrap: start immediately when D3 is already available as a global,
// otherwise load D3 from the CDN and start once it has loaded.
if (typeof d3 !== "undefined") {
  _initFailureAnalysis();
} else {
  // Block-scoped so the script element does not leak as a global variable.
  const d3Script = document.createElement("script");
  d3Script.src = "https://cdnjs.cloudflare.com/ajax/libs/d3/7.9.0/d3.min.js";
  d3Script.onload = _initFailureAnalysis;
  // Without this a blocked or failed download leaves the charts empty with no hint why.
  d3Script.onerror = function () {
    console.error("failure-analysis: could not load D3 from " + d3Script.src + "; charts were not rendered.");
  };
  document.head.appendChild(d3Script);
}
</script>
</body>
</html>