pepijn223 HF Staff committed on
Commit
516e6e9
·
unverified ·
1 Parent(s): 9a36854

Add action representation diagram explaining relative vs delta vs absolute

Browse files

Adapted from UMI paper (Chi et al., 2024). Shows why relative trajectory
avoids error accumulation and calibration requirements.

Made-with: Cursor

app/src/content/chapters/folding/08-ablations.mdx CHANGED
@@ -139,6 +139,15 @@ We hypothesise that the root cause is the difference in **multi-modality** betwe
139
 
140
  #### 2. Relative actions improve performance consistently
141
 
 
 
 
 
 
 
 
 
 
142
  Comparing π0.5 without relative actions (1.2: 20% total SR, 40% L1) to π0.5 with relative actions and quantile normalization (1.3: 35% total SR, 70% L1), and then to the full combination in 1.7 (40% total SR, 80% L1), shows that training with relative actions consistently improves performance. The trend is clear and shows up in every comparison we made.
143
 
144
  The effect size doesn't separate cleanly at 20 rollouts, but the direction is consistent. **Caveat:** π0.5 is likely pretrained with relative actions, so 1.3 and 1.7 fine-tune in a regime consistent with pretraining, while 1.2 fine-tunes against it.
 
139
 
140
  #### 2. Relative actions improve performance consistently
141
 
142
+ We use **relative trajectory** actions as defined by [UMI](https://arxiv.org/abs/2402.10329): each action in the chunk is an offset from the robot's current state at prediction time, not from the previous action. This avoids error accumulation (unlike true delta) and doesn't require a global coordinate frame (unlike absolute). LeRobot uses absolute actions by default — switching to relative trajectory was one of our key improvements.
143
+
144
+ <HtmlEmbed
145
+ id="action-representations"
146
+ src="folding/action-representations.html"
147
+ title="Action Representations"
148
+ desc="Relative trajectory (blue) references all actions to the current state. Delta (yellow) chains each action to the previous one, accumulating error. Absolute (red) requires a global coordinate frame. Diagram adapted from UMI (Chi et al., 2024)."
149
+ />
150
+
151
  Comparing π0.5 without relative actions (1.2: 20% total SR, 40% L1) to π0.5 with relative actions and quantile normalization (1.3: 35% total SR, 70% L1), and then to the full combination in 1.7 (40% total SR, 80% L1), shows that training with relative actions consistently improves performance. The trend is clear and shows up in every comparison we made.
152
 
153
  The effect size doesn't separate cleanly at 20 rollouts, but the direction is consistent. **Caveat:** π0.5 is likely pretrained with relative actions, so 1.3 and 1.7 fine-tune in a regime consistent with pretraining, while 1.2 fine-tunes against it.
app/src/content/embeds/folding/action-representations.html ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8"/>
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
6
+ <style>
7
+ * { box-sizing: border-box; margin: 0; padding: 0; }
8
+ body { background: transparent; font-family: system-ui, sans-serif; color: #e8eaf0; }
9
+ svg text { font-family: system-ui, sans-serif; }
10
+ </style>
11
+ </head>
12
+ <body>
13
+ <svg id="action-rep" style="overflow:visible"></svg>
14
+ <script>
15
/**
 * Draws the action-representation diagram into the `#action-rep` SVG.
 *
 * The figure plots a fixed toy pose trajectory and overlays three ways of
 * encoding the same actions:
 *   - absolute: one dot per pose,
 *   - relative trajectory: arrows from the state at an inference step to each
 *     of the following three poses,
 *   - delta: arrows chaining every pose to the next one.
 *
 * The SVG is emptied before drawing, so the function can be called again
 * (e.g. on resize) to redraw at the container's current width.
 * Expects the global `d3` and the `#action-rep` element to exist.
 */
function _initActionRep() {
  const svgEl = document.getElementById('action-rep');
  // Follow the container width (fallback 700 when it reports 0), capped at 760.
  const W = Math.min(svgEl.parentElement.clientWidth || 700, 760);
  const H = 320;
  svgEl.setAttribute('width', W);
  svgEl.setAttribute('height', H);

  const svg = d3.select('#action-rep').attr('width', W).attr('height', H);
  // Clear whatever a previous call drew.
  svg.selectAll('*').remove();

  const m = { top: 36, right: 30, bottom: 44, left: 50 };
  const w = W - m.left - m.right;
  const h = H - m.top - m.bottom;
  const g = svg.append('g').attr('transform', `translate(${m.left},${m.top})`);

  const GRID = '#2a2d3a';
  const SUB = '#8b8fa8';

  // Pose trajectory (ground truth)
  const poses = [
    {t:0, p:0.5}, {t:1, p:1.2}, {t:2, p:1.8}, {t:3, p:2.6},
    {t:4, p:3.0}, {t:5, p:3.9}, {t:6, p:4.3}, {t:7, p:5.2}, {t:8, p:5.8}
  ];

  const x = d3.scaleLinear().domain([0, 8]).range([0, w]);
  const y = d3.scaleLinear().domain([0, 6.5]).range([h, 0]);

  // Grid: dashed horizontal lines at the y ticks
  g.append('g').selectAll('line').data(y.ticks(5)).join('line')
    .attr('x1', 0).attr('x2', w).attr('y1', d => y(d)).attr('y2', d => y(d))
    .attr('stroke', GRID).attr('stroke-dasharray', '3,3');

  // Inference boundaries: dashed vertical markers at t=0 and t=4
  [0, 4].forEach(t => {
    g.append('line').attr('x1', x(t)).attr('x2', x(t)).attr('y1', -20).attr('y2', h + 8)
      .attr('stroke', '#555').attr('stroke-dasharray', '6,3').attr('stroke-width', 1);
    g.append('text').attr('x', x(t) + 4).attr('y', -8)
      .attr('fill', SUB).attr('font-size', 9)
      .text(t === 0 ? 'Inference at t=0' : 'Inference at t=4');
  });

  // Axes
  g.append('g').attr('transform', `translate(0,${h})`).call(d3.axisBottom(x).ticks(8).tickSize(0))
    .call(gg => { gg.select('.domain').attr('stroke', GRID); gg.selectAll('text').attr('fill', SUB).attr('font-size', 10); });
  g.append('g').call(d3.axisLeft(y).ticks(5).tickSize(0))
    .call(gg => { gg.select('.domain').attr('stroke', GRID); gg.selectAll('text').attr('fill', SUB).attr('font-size', 10); });

  g.append('text').attr('x', w / 2).attr('y', h + 36).attr('text-anchor', 'middle')
    .attr('fill', SUB).attr('font-size', 11).text('Time');
  g.append('text').attr('x', -h / 2).attr('y', -36).attr('text-anchor', 'middle')
    .attr('transform', 'rotate(-90)').attr('fill', SUB).attr('font-size', 11).text('Pose');

  // Colors
  const COL_REL = '#3b82f6';
  const COL_DELTA = '#f59e0b';
  const COL_ABS = '#ef4444';

  // --- ABSOLUTE: red dots at each pose ---
  poses.forEach(d => {
    g.append('circle').attr('cx', x(d.t)).attr('cy', y(d.p)).attr('r', 5)
      .attr('fill', COL_ABS).attr('stroke', '#1a1d27').attr('stroke-width', 1.5);
  });

  // --- RELATIVE: arrows from the state at inference time to each action in the chunk ---
  // Chunk 1: from the state at t=0 to the poses at t=1..3
  const chunk1Origin = poses[0];
  for (let i = 1; i <= 3; i++) {
    drawArrow(g, x(chunk1Origin.t), y(chunk1Origin.p), x(poses[i].t), y(poses[i].p), COL_REL, 1.5, 0.7);
  }
  // Chunk 2: from the state at t=4 to the poses at t=5..7
  const chunk2Origin = poses[4];
  for (let i = 5; i <= 7; i++) {
    drawArrow(g, x(chunk2Origin.t), y(chunk2Origin.p), x(poses[i].t), y(poses[i].p), COL_REL, 1.5, 0.7);
  }

  // --- DELTA: arrows from each pose to the next ---
  for (let i = 0; i < poses.length - 1; i++) {
    drawArrow(g, x(poses[i].t), y(poses[i].p), x(poses[i + 1].t), y(poses[i + 1].p), COL_DELTA, 1.2, 0.45);
  }

  // Legend: laid out right-to-left from the right edge, so the items are
  // visited in reverse and each one is placed using its measured text width.
  const legY = -24;
  const items = [
    { color: COL_REL, label: 'Relative trajectory (used here)', style: 'arrow' },
    { color: COL_DELTA, label: 'Delta (accumulates error)', style: 'arrow' },
    { color: COL_ABS, label: 'Absolute (needs calibration)', style: 'dot' },
  ];

  let lx = w - 10;
  items.slice().reverse().forEach(item => {
    const textEl = g.append('text').attr('y', legY + 4).attr('fill', item.color).attr('font-size', 10).attr('font-weight', 500).text(item.label);
    // The text must already be in the DOM for its width to be measurable.
    const tw = textEl.node().getComputedTextLength();
    textEl.attr('x', lx - tw);
    if (item.style === 'dot') {
      g.append('circle').attr('cx', lx - tw - 10).attr('cy', legY + 1).attr('r', 4)
        .attr('fill', item.color).attr('stroke', '#1a1d27').attr('stroke-width', 1);
    } else {
      g.append('line').attr('x1', lx - tw - 20).attr('x2', lx - tw - 6)
        .attr('y1', legY + 1).attr('y2', legY + 1)
        .attr('stroke', item.color).attr('stroke-width', 2);
      g.append('polygon')
        .attr('points', `${lx - tw - 6},${legY - 2} ${lx - tw - 6},${legY + 4} ${lx - tw - 2},${legY + 1}`)
        .attr('fill', item.color);
    }
    // Step left past this item's text and marker.
    lx = lx - tw - 28;
  });

  /**
   * Appends an arrow (shaft line + triangular head) from (x1, y1) to (x2, y2).
   *
   * The head tip stops 2px short of the target point and the shaft ends at
   * the base of the head. Nothing is drawn when the two points coincide (or
   * the length is not a finite positive number), because the direction is
   * undefined and would otherwise write NaN into every coordinate.
   *
   * @param {object} parent  d3 selection to append the shapes to
   * @param {number} x1      start x (pixels)
   * @param {number} y1      start y (pixels)
   * @param {number} x2      end x (pixels)
   * @param {number} y2      end y (pixels)
   * @param {string} color   stroke/fill color
   * @param {number} width   shaft stroke width
   * @param {number} opacity opacity applied to shaft and head
   */
  function drawArrow(parent, x1, y1, x2, y2, color, width, opacity) {
    const dx = x2 - x1;
    const dy = y2 - y1;
    const len = Math.sqrt(dx * dx + dy * dy);
    // `!(len > 0)` also rejects NaN.
    if (!(len > 0) || !Number.isFinite(len)) {
      return;
    }
    // Unit vector along the arrow.
    const ux = dx / len;
    const uy = dy / len;
    const headLen = 6;
    const tipX = x2 - ux * 2;
    const tipY = y2 - uy * 2;
    const baseX = tipX - ux * headLen;
    const baseY = tipY - uy * headLen;
    // Half-width of the head, perpendicular to the arrow direction.
    const perpX = -uy * 3;
    const perpY = ux * 3;

    parent.append('line')
      .attr('x1', x1).attr('y1', y1).attr('x2', baseX).attr('y2', baseY)
      .attr('stroke', color).attr('stroke-width', width).attr('opacity', opacity);
    parent.append('polygon')
      .attr('points', `${tipX},${tipY} ${baseX + perpX},${baseY + perpY} ${baseX - perpX},${baseY - perpY}`)
      .attr('fill', color).attr('opacity', opacity);
  }
}
139
+
140
/**
 * (Re)draws the diagram; used for the initial draw, as the d3 script's
 * `onload` callback, and as the window `resize` listener.
 */
function render() {
  // The resize listener is registered unconditionally, so it can fire while
  // the d3 script is still loading. Skip the draw instead of throwing a
  // ReferenceError; the script's onload callback draws once d3 exists.
  if (typeof d3 === 'undefined') {
    return;
  }
  _initActionRep();
}
143
+
144
// Bootstrap: draw immediately when d3 is already on the page, otherwise
// inject the d3 script from the CDN and draw once it has loaded.
if (typeof d3 !== "undefined") {
  render();
} else {
  // Block-scoped so no global `s` is created on the host page.
  const d3Script = document.createElement("script");
  d3Script.src = "https://cdnjs.cloudflare.com/ajax/libs/d3/7.9.0/d3.min.js";
  d3Script.onload = render;
  // Without this, a failed CDN load leaves an empty figure with no hint why.
  d3Script.onerror = () => {
    console.error(`action-representations: failed to load d3 from ${d3Script.src}`);
  };
  document.head.appendChild(d3Script);
}
// Redraw at the new container width whenever the window is resized.
window.addEventListener('resize', render);
153
+ </script>
154
+ </body>
155
+ </html>