thibaud frere commited on
Commit
59924a2
Β·
1 Parent(s): d431d95
This view is limited to 50 files because it contains too many changes. Β  See raw diff
Files changed (50) hide show
  1. app/scripts/latex-to-markdown/mdx-converter.mjs +161 -35
  2. app/scripts/latex-to-markdown/output/main.mdx +0 -0
  3. app/src/content/article.mdx +653 -5
  4. app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-approaches.png +0 -0
  5. app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-classical-limitations.png +0 -0
  6. app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-cost-accessibility.png +0 -0
  7. app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-planar-manipulator-floor-box.png +0 -0
  8. app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-planar-manipulator-floor-shelf.png +0 -0
  9. app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-planar-manipulator-floor.png +0 -0
  10. app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-planar-manipulator-free.png +0 -0
  11. app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-platforms.png +0 -0
  12. app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-so100-to-planar-manipulator.png +0 -0
  13. app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-agent-env.png +0 -0
  14. app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-duck-sim-vs-real.png +0 -0
  15. app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-hil-serl-examples.png +0 -0
  16. app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-learning-atlas.png +0 -0
  17. app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-learning-benefits.png +0 -0
  18. app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-many-ducks.png +0 -0
  19. app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-rl-algorithms-atlas.png +0 -0
  20. app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-rl-examples.png +0 -0
  21. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-act-decoder.png +0 -0
  22. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-act-encoder.png +0 -0
  23. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-act.png +0 -0
  24. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-action-vs-observation-distribution.png +0 -0
  25. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-async-inference.png +0 -0
  26. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-bc-trajectories.png +0 -0
  27. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-diffusion-policy.png +0 -0
  28. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-diffusion-robot-actions.png +0 -0
  29. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-diffusion-vs-flowmatching.png +0 -0
  30. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-issues-with-bc.png +0 -0
  31. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-latent-variable-model.png +0 -0
  32. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-many-latents.png +0 -0
  33. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-normalizing-flows.png +0 -0
  34. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-observation-action-mapping.png +0 -0
  35. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-queues.png +0 -0
  36. app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-task-effect-on-pairs.png +0 -0
  37. app/src/content/assets/image/{ch5 β†’ figures/ch5}/ch5-generalist-policies-timeline.png +0 -0
  38. app/src/content/assets/image/{ch5 β†’ figures/ch5}/ch5-ml-vs-robotics-foundation.png +0 -0
  39. app/src/content/assets/image/{ch5 β†’ figures/ch5}/ch5-pi0-sampling-timesteps.png +0 -0
  40. app/src/content/assets/image/{ch5 β†’ figures/ch5}/ch5-pi0.png +0 -0
  41. app/src/content/assets/image/{ch5 β†’ figures/ch5}/ch5-smolvla.png +0 -0
  42. app/src/content/assets/image/{ch5 β†’ figures/ch5}/ch5-trends.png +0 -0
  43. app/src/content/assets/{data β†’ image/figures/data}/somedata.json +0 -0
  44. assets/image/figures/ch1/ch1-lerobot-figure1.png +3 -0
  45. app/src/content/assets/image/misc/lerobot-team.jpeg β†’ assets/image/figures/ch2/ch2-approaches.png +2 -2
  46. assets/image/figures/ch2/ch2-classical-limitations.png +3 -0
  47. assets/image/figures/ch2/ch2-cost-accessibility.png +3 -0
  48. assets/image/figures/ch2/ch2-planar-manipulator-floor-box.png +3 -0
  49. assets/image/figures/ch2/ch2-planar-manipulator-floor-shelf.png +3 -0
  50. assets/image/figures/ch2/ch2-planar-manipulator-floor.png +3 -0
app/scripts/latex-to-markdown/mdx-converter.mjs CHANGED
@@ -62,33 +62,62 @@ Examples:
62
  */
63
  const usedComponents = new Set();
64
 
 
 
 
 
 
65
  /**
66
  * Add required component imports to the frontmatter
67
  * @param {string} content - MDX content
68
  * @returns {string} - Content with component imports
69
  */
 
 
 
 
 
 
 
 
 
 
 
70
  function addComponentImports(content) {
71
- console.log(' πŸ“¦ Adding component imports...');
72
 
73
- if (usedComponents.size === 0) {
74
- console.log(' ℹ️ No components to import');
75
- return content;
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  }
77
 
78
- // Create import statements
79
- const imports = Array.from(usedComponents)
80
- .map(component => `import ${component} from '../components/${component}.astro';`)
81
- .join('\n');
82
 
83
- console.log(` βœ… Importing: ${Array.from(usedComponents).join(', ')}`);
84
 
85
  // Insert imports after frontmatter
86
  const frontmatterEnd = content.indexOf('---', 3) + 3;
87
  if (frontmatterEnd > 2) {
88
- return content.slice(0, frontmatterEnd) + '\n\n' + imports + '\n' + content.slice(frontmatterEnd);
89
  } else {
90
  // No frontmatter, add at beginning
91
- return imports + '\n\n' + content;
92
  }
93
  }
94
 
@@ -98,54 +127,150 @@ function addComponentImports(content) {
98
  * @param {string} content - MDX content
99
  * @returns {string} - Content with ResponsiveImage components
100
  */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  function transformImages(content) {
102
- console.log(' πŸ–ΌοΈ Transforming images to ResponsiveImage components...');
103
 
104
  let hasImages = false;
105
 
106
- // Transform HTML figure/img to ResponsiveImage
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  content = content.replace(
108
  /<figure id="([^"]*)">\s*<img src="([^"]*)" \/>\s*<figcaption>\s*(.*?)\s*<\/figcaption>\s*<\/figure>/gs,
109
  (match, id, src, caption) => {
110
- // Clean up the source path for web
111
- const cleanSrc = src.replace(/.*\/output\/assets\//, '/assets/');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  hasImages = true;
113
- usedComponents.add('ResponsiveImage');
114
 
115
- return `<ResponsiveImage src="${cleanSrc}" alt="${caption.replace(/<[^>]*>/g, '')}" id="${id}" />`;
116
  }
117
  );
118
 
119
- // Transform Pandoc-style images: ![alt](src){#id attr="value"}
120
  content = content.replace(
121
  /!\[([^\]]*)\]\(([^)]+)\)(?:\{([^}]+)\})?/g,
122
  (match, alt, src, attributes) => {
123
- // Clean up the source path for web
124
- const cleanSrc = src.replace(/.*\/output\/assets\//, '/assets/');
125
  hasImages = true;
126
- usedComponents.add('ResponsiveImage');
127
-
128
- let props = [];
129
- props.push(`src="${cleanSrc}"`);
130
- if (alt) props.push(`alt="${alt}"`);
131
 
132
- // Parse attributes if present
133
  if (attributes) {
134
  const idMatch = attributes.match(/#([\w-]+)/);
135
- if (idMatch) props.push(`id="${idMatch[1]}"`);
136
-
137
- const widthMatch = attributes.match(/width="([^"]+)"/);
138
- if (widthMatch && widthMatch[1] !== '\\linewidth') {
139
- props.push(`width="${widthMatch[1]}"`);
140
- }
141
  }
142
 
143
- return `<ResponsiveImage ${props.join(' ')} />`;
144
  }
145
  );
146
 
147
  if (hasImages) {
148
- console.log(' βœ… ResponsiveImage component will be imported');
149
  }
150
 
151
  return content;
@@ -258,8 +383,9 @@ function cleanMdxSyntax(content) {
258
  function processMdxContent(content) {
259
  console.log('πŸ”§ Processing for Astro MDX compatibility...');
260
 
261
- // Clear previous component tracking
262
  usedComponents.clear();
 
263
 
264
  let processedContent = content;
265
 
 
62
  */
63
  const usedComponents = new Set();
64
 
65
+ /**
66
+ * Track individual image imports needed
67
+ */
68
+ const imageImports = new Map(); // src -> varName
69
+
70
  /**
71
  * Add required component imports to the frontmatter
72
  * @param {string} content - MDX content
73
  * @returns {string} - Content with component imports
74
  */
75
+ /**
76
+ * Generate a variable name from image path
77
+ * @param {string} src - Image source path
78
+ * @returns {string} - Valid variable name
79
+ */
80
+ function generateImageVarName(src) {
81
+ // Extract filename without extension and make it a valid JS variable
82
+ const filename = src.split('/').pop().replace(/\.[^.]+$/, '');
83
+ return filename.replace(/[^a-zA-Z0-9]/g, '_').replace(/^[0-9]/, 'img_$&');
84
+ }
85
+
86
  function addComponentImports(content) {
87
+ console.log(' πŸ“¦ Adding component and image imports...');
88
 
89
+ let imports = [];
90
+
91
+ // Add component imports
92
+ if (usedComponents.size > 0) {
93
+ const componentImports = Array.from(usedComponents)
94
+ .map(component => `import ${component} from '../components/${component}.astro';`);
95
+ imports.push(...componentImports);
96
+ console.log(` βœ… Importing components: ${Array.from(usedComponents).join(', ')}`);
97
+ }
98
+
99
+ // Add image imports
100
+ if (imageImports.size > 0) {
101
+ const imageImportStatements = Array.from(imageImports.entries())
102
+ .map(([src, varName]) => `import ${varName} from '${src}';`);
103
+ imports.push(...imageImportStatements);
104
+ console.log(` βœ… Importing ${imageImports.size} image(s)`);
105
  }
106
 
107
+ if (imports.length === 0) {
108
+ console.log(' ℹ️ No imports needed');
109
+ return content;
110
+ }
111
 
112
+ const importBlock = imports.join('\n');
113
 
114
  // Insert imports after frontmatter
115
  const frontmatterEnd = content.indexOf('---', 3) + 3;
116
  if (frontmatterEnd > 2) {
117
+ return content.slice(0, frontmatterEnd) + '\n\n' + importBlock + '\n' + content.slice(frontmatterEnd);
118
  } else {
119
  // No frontmatter, add at beginning
120
+ return importBlock + '\n\n' + content;
121
  }
122
  }
123
 
 
127
  * @param {string} content - MDX content
128
  * @returns {string} - Content with ResponsiveImage components
129
  */
130
+ /**
131
+ * Create ResponsiveImage component with import
132
+ * @param {string} src - Clean image source
133
+ * @param {string} alt - Alt text
134
+ * @param {string} id - Element ID
135
+ * @param {string} caption - Figure caption
136
+ * @param {string} width - Optional width
137
+ * @returns {string} - ResponsiveImage component markup
138
+ */
139
+ function createResponsiveImageComponent(src, alt = '', id = '', caption = '', width = '') {
140
+ const varName = generateImageVarName(src);
141
+ imageImports.set(src, varName);
142
+ usedComponents.add('ResponsiveImage');
143
+
144
+ const props = [];
145
+ props.push(`src={${varName}}`);
146
+ props.push('zoomable');
147
+ props.push('downloadable');
148
+ if (id) props.push(`id="${id}"`);
149
+ props.push('layout="fixed"');
150
+ if (alt) props.push(`alt="${alt}"`);
151
+ if (caption) props.push(`caption={'${caption}'}`);
152
+
153
+ return `<ResponsiveImage\n ${props.join('\n ')}\n/>`;
154
+ }
155
+
156
  function transformImages(content) {
157
+ console.log(' πŸ–ΌοΈ Transforming images to ResponsiveImage components with imports...');
158
 
159
  let hasImages = false;
160
 
161
+ // Helper function to clean source paths
162
+ const cleanSrcPath = (src) => {
163
+ return src.replace(/.*\/output\/assets\//, '../assets/')
164
+ .replace(/\/Users\/[^\/]+\/[^\/]+\/[^\/]+\/[^\/]+\/[^\/]+\/app\/scripts\/latex-to-markdown\/output\/assets\//, '../assets/');
165
+ };
166
+
167
+ // Helper to clean caption text
168
+ const cleanCaption = (caption) => {
169
+ return caption
170
+ .replace(/<[^>]*>/g, '') // Remove HTML tags
171
+ .replace(/\n/g, ' ') // Replace newlines with spaces
172
+ .replace(/\r/g, ' ') // Replace carriage returns with spaces
173
+ .replace(/\s+/g, ' ') // Replace multiple spaces with single space
174
+ .replace(/'/g, "\\'") // Escape quotes
175
+ .trim(); // Trim whitespace
176
+ };
177
+
178
+ // Helper to clean alt text
179
+ const cleanAltText = (alt, maxLength = 100) => {
180
+ const cleaned = alt
181
+ .replace(/<[^>]*>/g, '') // Remove HTML tags
182
+ .replace(/\n/g, ' ') // Replace newlines with spaces
183
+ .replace(/\r/g, ' ') // Replace carriage returns with spaces
184
+ .replace(/\s+/g, ' ') // Replace multiple spaces with single space
185
+ .trim(); // Trim whitespace
186
+
187
+ return cleaned.length > maxLength
188
+ ? cleaned.substring(0, maxLength) + '...'
189
+ : cleaned;
190
+ };
191
+
192
+ // 1. Transform complex HTML figures with style attributes
193
+ content = content.replace(
194
+ /<figure id="([^"]*)">\s*<img src="([^"]*)"(?:\s+style="([^"]*)")?\s*\/>\s*<figcaption>\s*(.*?)\s*<\/figcaption>\s*<\/figure>/gs,
195
+ (match, id, src, style, caption) => {
196
+ const cleanSrc = cleanSrcPath(src);
197
+ const cleanCap = cleanCaption(caption);
198
+ const altText = cleanAltText(cleanCap);
199
+ hasImages = true;
200
+
201
+ return createResponsiveImageComponent(cleanSrc, altText, id, cleanCap);
202
+ }
203
+ );
204
+
205
+ // 2. Transform standalone img tags with style
206
+ content = content.replace(
207
+ /<img src="([^"]*)"(?:\s+style="([^"]*)")?\s*(?:alt="([^"]*)")?\s*\/>/g,
208
+ (match, src, style, alt) => {
209
+ const cleanSrc = cleanSrcPath(src);
210
+ const cleanAlt = cleanAltText(alt || 'Figure');
211
+ hasImages = true;
212
+
213
+ return createResponsiveImageComponent(cleanSrc, cleanAlt);
214
+ }
215
+ );
216
+
217
+ // 3. Transform images within wrapfigure divs
218
+ content = content.replace(
219
+ /<div class="wrapfigure">\s*r[\d.]+\s*<img src="([^"]*)"[^>]*\/>\s*<\/div>/gs,
220
+ (match, src) => {
221
+ const cleanSrc = cleanSrcPath(src);
222
+ hasImages = true;
223
+
224
+ return createResponsiveImageComponent(cleanSrc, 'Figure');
225
+ }
226
+ );
227
+
228
+ // 4. Transform simple HTML figure/img without style
229
  content = content.replace(
230
  /<figure id="([^"]*)">\s*<img src="([^"]*)" \/>\s*<figcaption>\s*(.*?)\s*<\/figcaption>\s*<\/figure>/gs,
231
  (match, id, src, caption) => {
232
+ const cleanSrc = cleanSrcPath(src);
233
+ const cleanCap = cleanCaption(caption);
234
+ const altText = cleanAltText(cleanCap);
235
+ hasImages = true;
236
+
237
+ return createResponsiveImageComponent(cleanSrc, altText, id, cleanCap);
238
+ }
239
+ );
240
+
241
+ // 5. Clean up figures with minipage divs
242
+ content = content.replace(
243
+ /<figure id="([^"]*)">\s*<div class="minipage">\s*<img src="([^"]*)"[^>]*\/>\s*<\/div>\s*<figcaption[^>]*>(.*?)<\/figcaption>\s*<\/figure>/gs,
244
+ (match, id, src, caption) => {
245
+ const cleanSrc = cleanSrcPath(src);
246
+ const cleanCap = cleanCaption(caption);
247
+ const altText = cleanAltText(cleanCap);
248
  hasImages = true;
 
249
 
250
+ return createResponsiveImageComponent(cleanSrc, altText, id, cleanCap);
251
  }
252
  );
253
 
254
+ // 6. Transform Pandoc-style images: ![alt](src){#id attr="value"}
255
  content = content.replace(
256
  /!\[([^\]]*)\]\(([^)]+)\)(?:\{([^}]+)\})?/g,
257
  (match, alt, src, attributes) => {
258
+ const cleanSrc = cleanSrcPath(src);
259
+ const cleanAlt = cleanAltText(alt || 'Figure');
260
  hasImages = true;
 
 
 
 
 
261
 
262
+ let id = '';
263
  if (attributes) {
264
  const idMatch = attributes.match(/#([\w-]+)/);
265
+ if (idMatch) id = idMatch[1];
 
 
 
 
 
266
  }
267
 
268
+ return createResponsiveImageComponent(cleanSrc, cleanAlt, id);
269
  }
270
  );
271
 
272
  if (hasImages) {
273
+ console.log(' βœ… ResponsiveImage components with imports will be created');
274
  }
275
 
276
  return content;
 
383
  function processMdxContent(content) {
384
  console.log('πŸ”§ Processing for Astro MDX compatibility...');
385
 
386
+ // Clear previous tracking
387
  usedComponents.clear();
388
+ imageImports.clear();
389
 
390
  let processedContent = content;
391
 
app/scripts/latex-to-markdown/output/main.mdx CHANGED
The diff for this file is too large to render. See raw diff
 
app/src/content/article.mdx CHANGED
@@ -4,7 +4,48 @@ description: "Converted from LaTeX to MDX"
4
  date: "2025-09-18"
5
  ---
6
 
7
- import ResponsiveImage from "../components/ResponsiveImage.astro";
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  # Foreword
10
 
@@ -14,11 +55,11 @@ Nonetheless, we also hold that the wealth of research from both academia and ind
14
 
15
  This tutorial...
16
 
17
- - Does *not* aim to be a comprehensive guide to general field of robotics, manipulation or underactuated systems:Β [@sicilianoSpringerHandbookRobotics2016](#bibliography) andΒ [@tedrakeRoboticManipulationPerception](#bibliography), [@tedrakeUnderactuatedRoboticsAlgorithms](#bibliography) do this better than we ever could.
18
 
19
- - Does *not* aim to be an introduction to statistical or deep learning:Β [@shalev-shwartzUnderstandingMachineLearning2014](#bibliography) andΒ [@prince2023understanding](#bibliography) cover these subjects better than we ever could.
20
 
21
- - Does *not* aim to be a deep dive into Reinforcement Learning, Diffusion Models, or Flow Matching: invaluable works such asΒ [@suttonReinforcementLearningIntroduction2018](#bibliography),Β [@nakkiranStepbyStepDiffusionElementary2024](#bibliography), andΒ [@lipmanFlowMatchingGuide2024](#bibliography) do this better than we ever could.
22
 
23
Instead, our goal here is to provide an intuitive explanation as to why these disparate ideas have converged to form the exciting field of modern robot learning, driving the unprecedented progress we see today. In this spirit, we follow the adage: "a jack of all trades is a master of none, *but oftentimes better than a master of one*."
24
 
@@ -26,4 +67,611 @@ We sincerely hope this tutorial serves as a valuable starting point for your jou
26
 
27
  # Introduction
28
 
29
- <ResponsiveImage src="/assets/image/figures/ch1/ch1-lerobot-figure1.png" alt="lerobotΒ is the open-source library for end-to-end robotics developed by Hugging Face. The library is vertically integrated on the entire robotics stack, supporting low-level control of real-world robot devices, advanced data and inference optimizations, as well as SOTA robot learning methods with simple implementations in pure Pytorch." id="fig" />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  date: "2025-09-18"
5
  ---
6
 
7
+ import ResponsiveImage from '../components/ResponsiveImage.astro';
8
+ import ch1_lerobot_figure1 from '../assets/image/figures/ch1/ch1-lerobot-figure1.png';
9
+ import ch2_approaches from '../assets/image/figures/ch2/ch2-approaches.png';
10
+ import ch2_platforms from '../assets/image/figures/ch2/ch2-platforms.png';
11
+ import ch2_cost_accessibility from '../assets/image/figures/ch2/ch2-cost-accessibility.png';
12
+ import ch2_so100_to_planar_manipulator from '../assets/image/figures/ch2/ch2-so100-to-planar-manipulator.png';
13
+ import ch2_planar_manipulator_free from '../assets/image/figures/ch2/ch2-planar-manipulator-free.png';
14
+ import ch2_planar_manipulator_floor from '../assets/image/figures/ch2/ch2-planar-manipulator-floor.png';
15
+ import ch2_planar_manipulator_floor_shelf from '../assets/image/figures/ch2/ch2-planar-manipulator-floor-shelf.png';
16
+ import ch2_classical_limitations from '../assets/image/figures/ch2/ch2-classical-limitations.png';
17
+ import ch3_learning_benefits from '../assets/image/figures/ch3/ch3-learning-benefits.png';
18
+ import ch3_learning_atlas from '../assets/image/figures/ch3/ch3-learning-atlas.png';
19
+ import ch3_rl_examples from '../assets/image/figures/ch3/ch3-rl-examples.png';
20
+ import ch3_agent_env from '../assets/image/figures/ch3/ch3-agent-env.png';
21
+ import ch3_rl_algorithms_atlas from '../assets/image/figures/ch3/ch3-rl-algorithms-atlas.png';
22
+ import ch3_duck_sim_vs_real from '../assets/image/figures/ch3/ch3-duck-sim-vs-real.png';
23
+ import ch3_many_ducks from '../assets/image/figures/ch3/ch3-many-ducks.png';
24
+ import ch3_hil_serl_examples from '../assets/image/figures/ch3/ch3-hil-serl-examples.png';
25
+ import ch4_bc_trajectories from '../assets/image/figures/ch4/ch4-bc-trajectories.png';
26
+ import ch4_observation_action_mapping from '../assets/image/figures/ch4/ch4-observation-action-mapping.png';
27
+ import ch4_issues_with_bc from '../assets/image/figures/ch4/ch4-issues-with-bc.png';
28
+ import ch4_task_effect_on_pairs from '../assets/image/figures/ch4/ch4-task-effect-on-pairs.png';
29
+ import ch4_latent_variable_model from '../assets/image/figures/ch4/ch4-latent-variable-model.png';
30
+ import ch4_many_latents from '../assets/image/figures/ch4/ch4-many-latents.png';
31
+ import ch4_diffusion_robot_actions from '../assets/image/figures/ch4/ch4-diffusion-robot-actions.png';
32
+ import ch4_action_vs_observation_distribution from '../assets/image/figures/ch4/ch4-action-vs-observation-distribution.png';
33
+ import ch4_normalizing_flows from '../assets/image/figures/ch4/ch4-normalizing-flows.png';
34
+ import ch4_diffusion_vs_flowmatching from '../assets/image/figures/ch4/ch4-diffusion-vs-flowmatching.png';
35
+ import ch4_act from '../assets/image/figures/ch4/ch4-act.png';
36
+ import ch4_act_encoder from '../assets/image/figures/ch4/ch4-act-encoder.png';
37
+ import ch4_act_decoder from '../assets/image/figures/ch4/ch4-act-decoder.png';
38
+ import ch4_diffusion_policy from '../assets/image/figures/ch4/ch4-diffusion-policy.png';
39
+ import ch5_ml_vs_robotics_foundation from '../assets/image/figures/ch5/ch5-ml-vs-robotics-foundation.png';
40
+ import ch5_generalist_policies_timeline from '../assets/image/figures/ch5/ch5-generalist-policies-timeline.png';
41
+ import ch5_trends from '../assets/image/figures/ch5/ch5-trends.png';
42
+ import ch5_pi0 from '../assets/image/figures/ch5/ch5-pi0.png';
43
+ import ch5_smolvla from '../assets/image/figures/ch5/ch5-smolvla.png';
44
+ import ch2_planar_manipulator_floor_box from '../assets/image/figures/ch2/ch2-planar-manipulator-floor-box.png';
45
+ import ch4_async_inference from '../assets/image/figures/ch4/ch4-async-inference.png';
46
+ import ch4_queues from '../assets/image/figures/ch4/ch4-queues.png';
47
+ import ch5_pi0_sampling_timesteps from '../assets/image/figures/ch5/ch5-pi0-sampling-timesteps.png';
48
+
49
 
50
  # Foreword
51
 
 
55
 
56
  This tutorial...
57
 
58
+ - Does *not* aim to be a comprehensive guide to general field of robotics, manipulation or underactuated systems:Β @sicilianoSpringerHandbookRobotics2016 andΒ @tedrakeRoboticManipulationPerception, @tedrakeUnderactuatedRoboticsAlgorithms do this better than we ever could.
59
 
60
+ - Does *not* aim to be an introduction to statistical or deep learning:Β @shalev-shwartzUnderstandingMachineLearning2014 andΒ @prince2023understanding cover these subjects better than we ever could.
61
 
62
+ - Does *not* aim to be a deep dive into Reinforcement Learning, Diffusion Models, or Flow Matching: invaluable works such asΒ @suttonReinforcementLearningIntroduction2018,Β @nakkiranStepbyStepDiffusionElementary2024, andΒ @lipmanFlowMatchingGuide2024 do this better than we ever could.
63
 
64
Instead, our goal here is to provide an intuitive explanation as to why these disparate ideas have converged to form the exciting field of modern robot learning, driving the unprecedented progress we see today. In this spirit, we follow the adage: "a jack of all trades is a master of none, *but oftentimes better than a master of one*."
65
 
 
67
 
68
  # Introduction
69
 
70
+ <ResponsiveImage
71
+ src={ch1_lerobot_figure1}
72
+ zoomable
73
+ downloadable
74
+ id="fig:figure1"
75
+ layout="fixed"
76
+ alt="lerobot is the open-source library for end-to-end robotics developed by Hugging Face. The library is..."
77
+ caption={'lerobot is the open-source library for end-to-end robotics developed by Hugging Face. The library is vertically integrated on the entire robotics stack, supporting low-level control of real-world robot devices, advanced data and inference optimizations, as well as SOTA robot learning methods with simple implementations in pure Pytorch.'}
78
+ />
79
+
80
+ Autonomous robotics holds the premise of relieving humans from repetitive, tiring or dangerous manual tasks. Consequently, the field of robotics has been widely studied since its first inception in the 1950s. Lately, advancements in Machine Learning (ML) have sparked the development of a relatively new class of methods used to tackle robotics problems, leveraging large amounts of data and computation rather than human expertise and modeling skills to develop autonomous systems.
81
+
82
+ The frontier of robotics research is indeed increasingly moving away from classical model-based control paradigm, embracing the advancements made in ML, aiming to unlock (1) monolithic perception-to-action control pipelines and (2) multi-modal data-driven feature extraction strategies, together with (3) reduced reliance on precise models of the world and (4) a better positioning to benefit from the growing availability of open robotics data. While central problems in manipulation, locomotion and whole-body control demand knowledge of rigid-body dynamics, contact modeling, planning under uncertainty, recent results seem to indicate learning can prove just as effective as explicit modeling, sparking interest in the field of *robot learning*. This interest can be largely justified considering the significant challenges related to deriving accurate models of robot-environment interactions.
83
+
84
+ Moreover, since end-to-end learning on ever-growing collections of text and image data has historically been at the core of the development of *foundation models* capable of semantic reasoning across multiple modalities (images, text, audio, etc.), deriving robotics methods grounded in learning appears particularly consequential, especially as the number of openly available datasets continues to grow.
85
+
86
+ Robotics is, at its core, an inherently multidisciplinary field, requiring a wide range of expertise in both *software* and *hardware*. The integration of learning-based techniques further broadens this spectrum of skills, raising the bar for both research and practical applications. `lerobot`Β is an open-source library designed to integrate end-to-end with the entire robotics stack. With a strong focus on accessible, real-world robots <span class="text-hf-secondary">(1) `lerobot`Β supports many, openly available, robotic platforms</span> for manipulation, locomotion and even whole-body control. `lerobot`also implements a <span class="text-hf-secondary">(2) unified, low-level approach to reading/writing robot configurations</span> to extend support for other robot platforms with relatively low effort. The library introduces `LeRobotDataset`, <span class="text-hf-secondary">(3) a native robotics dataset’s format</span> currently being used by the community to efficiently record and share datasets. `lerobot`Β also supports many state-of-the-art (SOTA) algorithms in robot learning--mainly based on Reinforcement Learning (RL) and Behavioral Cloning (BC) techniques--with efficient implementations in Pytorch, and extended support to experimentation and experiments tracking. Lastly, `lerobot`Β defines a custom, optimized inference stack for robotic policies decoupling action planning from action execution, proving effective in guaranteeing more adaptability at runtime.
87
+
88
+ This tutorial serves the double purpose of providing useful references for the Science behind--and practical use of--common robot learning techniques. To this aim, we strive to provide a rigorous yet concise overview of the core concepts behind the techniques presented, paired with practical examples of how to use such techniques concretely, with code examples in `lerobot`, for researchers and practitioners interested in the field of robot learning. This tutorial is structured as follows:
89
+
90
+ - SectionΒ <a href="#sec:classical" data-reference-type="ref" data-reference="sec:classical">2</a> reviews classical robotics foundations, introducing the limitations of dynamics-based approaches to robotics.
91
+
92
+ - SectionΒ <a href="#sec:learning-rl" data-reference-type="ref" data-reference="sec:learning-rl">3</a> elaborates on the limitations of dynamics-based methods, and introduce RL as a practical approach to solve robotics problems, considering its upsides and potential limitations.
93
+
94
+ - SectionΒ <a href="#sec:learning-imitation" data-reference-type="ref" data-reference="sec:learning-imitation">4</a> further describes robot learning techniques that aim at solving single-tasks learning, leveraging BC techniques to autonomously reproduce specific expert demonstrations.
95
+
96
+ - SectionΒ <a href="#sec:learning-foundation" data-reference-type="ref" data-reference="sec:learning-foundation">5</a> presents recent contributions on developing generalist models for robotics applications, by learning from large corpora of multi-task Β multi-robot data (*robotics foundation models*).
97
+
98
+ Our goal with this tutorial is to provide an intuitive explanation of the reasons various disparate ideas from Machine Learning (ML) have converged and are powering the current evolution of Robotics, driving the unprecedented progress we see today. We complement our presentation of the most common and recent approaches in robot learning with practical code implementations using `lerobot`, and start here by presenting the dataset format introduced with `lerobot`.
99
+
100
+ ## `LeRobotDataset`
101
+
102
+ `LeRobotDataset` is a standardized dataset format designed to address the specific needs of robot learning research, and it provides unified and convenient access to robotics data across modalities, including sensorimotor readings, multiple camera feeds and teleoperation status. `LeRobotDataset` also accommodates storing general information regarding the data being collected, including textual descriptions of the task being performed by the teleoperator, the kind of robot used, and relevant measurement specifics like the frames per second at which the recording of both image and robot state’s streams are proceeding.
103
+
104
+ In this, `LeRobotDataset`Β provides a unified interface for handling multi-modal, time-series data, and it is designed to seamlessly integrate with the PyTorch and Hugging Face ecosystems. `LeRobotDataset`Β can be easily extended by users and it is highly customizable by users, and it already supports openly available data coming from a variety of embodiments supported in `lerobot`, ranging from manipulator platforms like the SO-100 arm and ALOHA-2 setup, to real-world humanoid arm and hands, as well as entirely simulation-based datasets, and self-driving cars. This dataset format is built to be both efficient for training and flexible enough to accommodate the diverse data types encountered in robotics, while promoting reproducibility and ease of use for users.
105
+
106
+ ### The dataset class design
107
+
108
+ A core design choice behind `LeRobotDataset`Β is separating the underlying data storage from the user-facing API. This allows for efficient storage while presenting the data in an intuitive, ready-to-use format.
109
+
110
+ Datasets are always organized into three main components:
111
+
112
+ - **Tabular Data**: Low-dimensional, high-frequency data such as joint states and actions are stored in efficient memory-mapped files, and typically offloaded to the more mature `datasets` library by Hugging Face, providing fast access with limited memory consumption.
113
+
114
+ - **Visual Data**: To handle large volumes of camera data, frames are concatenated and encoded into MP4 files. Frames from the same episode are always grouped together into the same video, and multiple videos are grouped together by camera. To reduce stress on the file system, groups of videos for the same camera view are also broken into multiple sub-directories, after a given threshold number.
115
+
116
+ - **Metadata**: A collection of JSON files which describes the dataset’s structure in terms of its metadata, serving as the relational counterpart to both the tabular and visual dimensions of data. Metadata includes the different feature schemas, frame rates, normalization statistics, and episode boundaries.
117
+
118
+ For scalability, and to support datasets with potentially millions of trajectories (resulting in hundreds of millions or billions of individual camera frames), we merge data from different episodes into the same high-level structure. Concretely, this means that any given tabular collection and video will not typically contain information about one episode only, but rather a concatenation of the information available in multiple episodes. This keeps the pressure on the file system limited, both locally and on remote storage providers like Hugging Face, though at the expense of relying more heavily on the relational, metadata parts of the dataset, which are used to reconstruct information such as at which position, in a given file, an episode starts or ends. An example structure for a given `LeRobotDataset`Β would appear as follows:
119
+
120
+ - `meta/info.json`: This is the central metadata file. It contains the complete dataset schema, defining all features (e.g., `observation.state`, `action`), their shapes, and data types. It also stores crucial information like the dataset’s frames-per-second (`fps`), `lerobot`’s version at the time of capture, and the path templates used to locate data and video files.
121
+
122
+ - `meta/stats.json`: This file stores aggregated statistics (mean, std, min, max) for each feature across the entire dataset, used for data normalization for most policy models and accessible externally via `dataset.meta.stats`.
123
+
124
+ - `meta/tasks.jsonl`: This file contains the mapping from natural language task descriptions to integer task indices, which are useful for task-conditioned policy training.
125
+
126
+ - `meta/episodes/*`: This directory contains metadata about each individual episode, such as its length, the corresponding task, and pointers to where its data is stored in the dataset’s files. For scalability, this information is stored in files rather than a single large JSON file.
127
+
128
+ - `data/*`: Contains the core frame-by-frame tabular data, using parquet files to allow for fast, memory-mapped access. To improve performance and handle large datasets, data from multiple episodes are concatenated into larger files. These files are organized into chunked subdirectories to keep the size of directories manageable. A single file typically contains data for more than one single episode.
129
+
130
+ - `videos/*`: Contains the MP4 video files for all visual observation streams. Similar to the `data/` directory, the video footage from multiple episodes is concatenated into single MP4 files. This strategy significantly reduces the number of files in the dataset, which is more efficient for modern filesystems.
131
+
132
+ ## Code Example: Batching a (Streaming) Dataset
133
+
134
+ This section provides an overview of how to access datasets hosted on Hugging Face using the `LeRobotDataset`Β class. Every dataset on the Hugging Face Hub contains the three main pillars presented above (Tabular, Visual and relational Metadata), and can be accessed with a single instruction.
135
+
136
+ In practice, most reinforcement learning (RL) and behavioral cloning (BC) algorithms tend to operate on stacks of observations and actions. For the sake of brevity, we will refer to joint spaces, and camera frames with the single term of *frame*. For instance, RL algorithms may use a history of previous frames $o_{t-H_o:t}$ to mitigate partial observability, and BC algorithms are in practice trained to regress chunks of multiple actions ($a_{t:t+H_a}$) rather than single controls. To accommodate these specifics of robot learning training, `LeRobotDataset`Β provides a native windowing operation, whereby users can define the *seconds* of a given window (before and after) around any given frame, by using the `delta_timestamps` functionality. Unavailable frames are appropriately padded, and a padding mask is also returned to filter out the padded frames. Notably, this all happens within the `LeRobotDataset`, and is entirely transparent to higher-level wrappers commonly used in training ML models such as `torch.utils.data.DataLoader`.
137
+
138
+ Conveniently, by using `LeRobotDataset`Β with a PyTorch `DataLoader` one can automatically collate the individual sample dictionaries from the dataset into a single dictionary of batched tensors for downstream training or inference. `LeRobotDataset`Β also natively supports streaming mode for datasets. Users can stream data of a large dataset hosted on the Hugging Face Hub, with a one-line change in their implementation. Streaming datasets support high-performance batch processing (ca. 80-100 it/s, depending on connectivity) and high levels of frame randomization, key features for practical BC algorithms which may otherwise be slow or operate on highly non-i.i.d. data. This feature is designed to improve accessibility so that large datasets can be processed by users without requiring large amounts of memory and storage.
139
+
140
+ <div class="pbox">
141
+
142
+ Batching a (Streaming) Dataset
143
+
144
+ ```python
145
+ import torch
146
+ from lerobot.datasets.lerobot_dataset import LeRobotDataset
147
+ from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset
148
+
149
+ delta_timestamps = {
150
+ "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame
151
+ }
152
+
153
+ # Optionally, use StreamingLeRobotDataset to avoid downloading the dataset
154
+ dataset = LeRobotDataset(
155
+ "lerobot/svla_so101_pickplace",
156
+ delta_timestamps=delta_timestamps
157
+ )
158
+
159
+ # Streams frames from the Hugging Face Hub without loading into memory
160
+ streaming_dataset = StreamingLeRobotDataset(
161
+ "lerobot/svla_so101_pickplace",
162
+ delta_timestamps=delta_timestamps
163
+ )
164
+
165
+ # Get the 100th frame in the dataset
166
+ sample = dataset[100]
167
+ print(sample)
168
+ #
169
+
170
+ batch_size=16
171
+ # wrap the dataset in a DataLoader to process it in batches for training purposes
172
+ data_loader = torch.utils.data.DataLoader(
173
+ dataset,
174
+ batch_size=batch_size
175
+ )
176
+
177
+ # Iterate over the DataLoader in a training loop
178
+ num_epochs = 1
179
+ device = "cuda" if torch.cuda.is_available() else "cpu"
180
+
181
+ for epoch in range(num_epochs):
182
+ for batch in data_loader:
183
+ # Move data to the appropriate device (e.g., GPU)
184
+ observations = batch["observation.state"].to(device)
185
+ actions = batch["action"].to(device)
186
+ images = batch["observation.images.wrist_camera"].to(device)
187
+
188
+ # Next, you can do amazing_model.forward(batch)
189
+ ...
190
+ ```
191
+
192
+ </div>
193
+
194
+ # Classical Robotics
195
+
196
+ <div class="epigraph">
197
+
198
+ *Know your enemy* \[...\]
199
+
200
+ Sun Tzu
201
+
202
+ </div>
203
+ <div class="callout">
204
+
205
+ TL;DR Learning-based approaches to robotics are motivated by the need to (1) generalize across tasks and embodiments, (2) reduce dependency on human expertise, and (3) leverage historical trends in the production of data--all traditionally overlooked by dynamics-based techniques.
206
+
207
+ </div>
208
+
209
+ ## Explicit and Implicit Models
210
+
211
+ <ResponsiveImage
212
+ src={ch2_approaches}
213
+ zoomable
214
+ downloadable
215
+ id="fig:generating-motion-atlas"
216
+ layout="fixed"
217
+ alt="Overview of methods to generate motion (clearly non-exhausitve, see @bekrisStateRobotMotion2024). Th..."
218
+ caption={'Overview of methods to generate motion (clearly non-exhaustive, see @bekrisStateRobotMotion2024). The different methods can be grouped based on whether they explicitly (dynamics-based) or implicitly (learning-based) model robot-environment interactions.'}
219
+ />
220
+
221
+ Robotics is concerned with producing artificial motion in the physical world in a useful, reliable and safe fashion. Thus, robotics is an inherently multi-disciplinary domain: producing autonomous motion in the physical world requires, at the very least, interfacing different software (motion planners) and hardware (motion executioners) components. Further, knowledge of mechanical, electrical, and software engineering, as well as rigid-body mechanics and control theory has proven quintessential in robotics since the field first developed in the 1950s. More recently, Machine Learning (ML) has also proved effective in robotics, complementing these more traditional disciplinesΒ @connellRobotLearning1993. As a direct consequence of its multi-disciplinary nature, robotics has developed as a rather wide array of methods, all concerned with the main purpose of <span class="text-hf-secondary">producing artificial motion in the physical world</span>.
222
+
223
+ Methods to produce robotics motion range from traditional *explicit* models--<span class="text-hf-secondary">dynamics-based</span>[^1] methods, leveraging precise descriptions of the mechanics of robots’ rigid bodies and their interactions with possible obstacles in the environment--to *implicit* models--<span class="text-hf-secondary">learning-based</span> methods, treating artificial motion as a statistical pattern to learn given multiple sensorimotor readingsΒ @agrawalComputationalSensorimotorLearning, @bekrisStateRobotMotion2024. A variety of methods have been developed between these two extrema. For instance, Β @hansenTemporalDifferenceLearning2022 show how learning-based systems can benefit from information on the physics of problems, complementing a traditional learning method such as Temporal Difference (TD)-learningΒ @suttonReinforcementLearningIntroduction2018 with Model-Predictive Control (MPC). Conversely, as explicit models may be relying on assumptions proving overly simplistic--or even unrealistic--in practice, learning can prove effective to improve modeling of complex phenomena or complement perceptionΒ @mccormacSemanticFusionDense3D2016. Such examples aim at demonstrating the richness of approaches to robotics, and FigureΒ <a href="#fig:generating-motion-atlas" data-reference-type="ref" data-reference="fig:generating-motion-atlas">2</a> graphically illustrates some of the most relevant techniques. Such a list is clearly far from being exhaustive, and we refer toΒ @bekrisStateRobotMotion2024 for a more comprehensive overview of both general and application-specific methods for motion generation. In this section, we wish to introduce the inherent benefits of <span class="text-hf-secondary">learning-based approaches to robotics</span>--the core focus of this tutorial.
224
+
225
+ ## Different Types of Motion
226
+
227
+ <ResponsiveImage
228
+ src={ch2_platforms}
229
+ zoomable
230
+ downloadable
231
+ id="fig:robotics-platforms-atlas"
232
+ layout="fixed"
233
+ alt="Different kinds of motions are achieved with potentially very different robotic platforms. From left..."
234
+ caption={'Different kinds of motions are achieved with potentially very different robotic platforms. From left to right, top to bottom: ViperX, SO-100, Boston Dynamics’ Spot, Open-Duck, 1X’s NEO, Boston Dynamics’ Atlas. This is an example list of robotic platforms and is (very) far from being exhaustive.'}
235
+ />
236
+
237
+ In the vast majority of instances, robotics deals with producing motion via actuating joints connecting nearly entirely-rigid links. A key distinction between focus areas in robotics is based on whether the generated motion modifies (1) the absolute state of the environment (via dexterity), (2) the relative state of the robot with respect to its environment (exercising mobility skills), or (3) a combination of the two (FigureΒ <a href="#fig:robotics-platforms-atlas" data-reference-type="ref" data-reference="fig:robotics-platforms-atlas">3</a>).
238
+
239
+ Effects such as (1) are typically achieved *through* the robot, i.e. generating motion to perform an action inducing a desirable modification, effectively *manipulating* the environment (manipulation). Motions like (2) may result in changes in the robot’s physical location within its environment. Generally, modifications to a robot’s location within its environment may be considered instances of the general *locomotion* problem, further specified as *wheeled* or *legged* locomotion based on whether a robot makes use of wheels or leg(s) to move in the environment. Lastly, an increased level of dynamism in the robot-environment interactions can be obtained by combining (1) and (2), thus designing systems capable of interacting with *and* moving within their environment. This category of problems is typically termed *mobile manipulation*, and is characterized by a typically much larger set of control variables compared to either locomotion or manipulation alone.
240
+
241
+ The traditional body of work developed since the very inception of robotics is increasingly complemented by learning-based approaches. ML has indeed proven particularly transformative across the entire robotics stack, first empowering planning-based techniques with improved state estimation used for traditional planningΒ @tangPerceptionNavigationAutonomous2023 and then end-to-end replacing controllers, effectively yielding perception-to-action methodsΒ @koberReinforcementLearningRobotics. Work in producing robots capable of navigating a diverse set of terrains demonstrated the promise of both dynamics and learning-based approaches for locomotionΒ @griffinWalkingStabilizationUsing2017, @jiDribbleBotDynamicLegged2023, @leeLearningQuadrupedalLocomotion2020, @margolisRapidLocomotionReinforcement2022, and recent works on whole-body control indicated the promise of learning-based approaches to generate rich motion on complex robots, including humanoidsΒ @zhangWoCoCoLearningWholeBody2024, @bjorckGR00TN1Open2025. Manipulation has also been widely studied, particularly considering its relevance for many impactful use-cases ranging from high-risk applications for humansΒ @fujitaDevelopmentRobotsNuclear2020, @alizadehComprehensiveSurveySpace2024 to manufacturingΒ @sannemanStateIndustrialRobotics2020. While explicit models have proven fundamental in achieving important milestones towards the development of modern robotics, recent works leveraging implicit models proved particularly promising in surpassing scalability and applicability challenges via learningΒ @koberReinforcementLearningRobotics.
242
+
243
+ ## Example: Planar Manipulation
244
+
245
+ Robot manipulators typically consist of a series of links and joints, articulated in a chain finally connected to an *end-effector*. Actuated joints are considered responsible for generating motion of the links, while the end effector is instead used to perform specific actions at the target location (e.g., grasping/releasing objects via closing/opening a gripper end-effector, using a specialized tool like a screwdriver, etc.).
246
+
247
+ Recently, the development of low-cost manipulators like the ALOHAΒ @zhaoLearningFineGrainedBimanual2023, ALOHA-2Β @aldacoALOHA2Enhanced, and SO-100/SO-101Β @knightStandardOpenSO100 platforms significantly lowered the barrier to entry to robotics, considering the increased accessibility of these robots compared to more traditional platforms like the Franka Emika Panda arm (FigureΒ <a href="#fig:robotic-platforms-costs" data-reference-type="ref" data-reference="fig:robotic-platforms-costs">4</a>).
248
+
249
+ <ResponsiveImage
250
+ src={ch2_cost_accessibility}
251
+ zoomable
252
+ downloadable
253
+ id="fig:robotic-platforms-costs"
254
+ layout="fixed"
255
+ alt="Cheaper, more accessible robots are starting to rival traditional platforms like the Panda arm platf..."
256
+ caption={'Cheaper, more accessible robots are starting to rival traditional platforms like the Panda arm platforms in adoption in resource-constrained scenarios. The SO-100, in particular, has a cost in the 100s of Euros, and can be entirely 3D-printed in hours, while the industrially-manufactured Panda arm costs tens of thousands of Euros and is not openly available.'}
257
+ />
258
+
259
+ Deriving an intuition as to why learning-based approaches are gaining popularity in the robotics community requires briefly analyzing traditional approaches for manipulation, leveraging tools like forward and inverse kinematics (FK, IK) and control theory. Providing a detailed overview of these methods falls (well) out of the scope of this tutorial, and we refer the reader to works includingΒ @sicilianoSpringerHandbookRobotics2016, @lynchModernRoboticsMechanics2017, @tedrakeRoboticManipulationPerception, @tedrakeUnderactuatedRoboticsAlgorithms for a much more comprehensive description of these techniques. Here, we mostly wish to highlight the benefits of ML over these traditional techniques.
260
+
261
+ <ResponsiveImage
262
+ src={ch2_so100_to_planar_manipulator}
263
+ zoomable
264
+ downloadable
265
+ id="fig:make-so100-planar-manipulator"
266
+ layout="fixed"
267
+ alt="The SO-100 arm is a 6-dof manipulator arm. Preventing some of its joints (shoulder pane, wrist flex ..."
268
+ caption={'The SO-100 arm is a 6-dof manipulator arm. Preventing some of its joints (shoulder pane, wrist flex and wrist roll) from actuating, it can be represented as a traditional 2-dof planar manipulator (the gripper joint in the end-effector is not considered towards the count of the degrees of freedom used to produce motion).'}
269
+ />
270
+
271
+ Consider the (simple) case where a SO-100 is restrained from actuating (1) the shoulder pane and (2) the wrist flex and roll motors. This effectively reduces the degrees of freedom of the SO-100 from the original 5+1 (5 joints + 1 gripper) to 2+1 (shoulder lift, elbow flex + gripper). As the end-effector does not impact motion in this model, the SO-100 is effectively reduced to the planar manipulator robot presented in FigureΒ <a href="#fig:make-so100-planar-manipulator" data-reference-type="ref" data-reference="fig:make-so100-planar-manipulator">5</a>, where spheres represent actuators, and solid lines indicate length-$l$ links from the base of the SO-100 to the end-effector (*ee*).
272
+
273
+ Further, let us make the simplifying assumption that actuators can produce rotations up to $2 \pi$ radians. In practice, this is seldom the case due to movement obstructions caused by the robot body itself (for instance, the shoulder lift cannot produce counter-clockwise movement due to the presence of the robot’s base used to secure the SO-100 to its support and host the robot bus), but we will introduce movement obstruction at a later stage.
274
+
275
+ All these simplifying assumptions leave us with the planar manipulator of FigureΒ <a href="#fig:planar-manipulation-simple" data-reference-type="ref" data-reference="fig:planar-manipulation-simple">6</a>, free to move its end-effector by controlling the angles $\theta_1$ and $\theta_2$, jointly referred to as the robot’s *configuration*, and indicated with $q = [\theta_1, \theta_2 ] \in [-\pi, +\pi]^2$. The axes attached to the joints indicate the associated reference frames, whereas circular arrows indicate the maximal feasible rotation allowed at each joint. In this tutorial, we do not cover topics related to spatial algebra, and we instead refer the reader to standard references for excellent explanations of the mechanics and theoretical foundations of producing motion on rigid bodies.
276
+
277
+ <figure id="fig:planar-manipulator-floor-shelf">
278
+ <ResponsiveImage
279
+ src={ch2_planar_manipulator_free}
280
+ zoomable
281
+ downloadable
282
+ id="fig:planar-manipulation-simple"
283
+ layout="fixed"
284
+ alt="Free to move"
285
+ caption={'Free to move'}
286
+ />
287
+ <ResponsiveImage
288
+ src={ch2_planar_manipulator_floor}
289
+ zoomable
290
+ downloadable
291
+ id="fig:planar-manipulator-floor"
292
+ layout="fixed"
293
+ alt="Constrained by the surface"
294
+ caption={'Constrained by the surface'}
295
+ />
296
+ <ResponsiveImage
297
+ src={ch2_planar_manipulator_floor_shelf}
298
+ zoomable
299
+ downloadable
300
+ id="fig:planar-manipulator-floor-shelf"
301
+ layout="fixed"
302
+ alt="Constrained by surface and (fixed) obstacle"
303
+ caption={'Constrained by surface and (fixed) obstacle'}
304
+ />
305
+ <figcaption>Planar, 2-dof schematic representation of the SO-100 manipulator under diverse deployment settings. From left to right: completely free to move; constrained by the presence of the surface; constrained by the surface and presence of obstacles. Circular arrows around each joint indicate the maximal rotation feasible at that joint.</figcaption>
306
+ </figure>
307
+
308
+ Considering the (toy) example presented in FigureΒ <a href="#fig:planar-manipulation-simple" data-reference-type="ref" data-reference="fig:planar-manipulation-simple">6</a>, then we can analytically write the end-effector’s position $p \in \mathbb R^2$ as a function of the robot’s configuration, $p = p(q), p: \mathcal Q \mapsto \mathbb R^2$. In particular, we have:
309
+ $$
310
+ `p(q) =
311
+ \begin
312
+ p_x(\theta_1, \theta_2)\\
313
+ p_y(\theta_1, \theta_2)
314
+ \end{pmatrix}
315
+ =
316
+ \begin{pmatrix}
317
+ l \cos(\theta_1) + l \cos(\theta_1 + \theta_2)\\
318
+ l \sin(\theta_1) + l \sin(\theta_1 + \theta_2)
319
+ \end{pmatrix}
320
+ \in S^{n=2}_{l_1+l_2} = \{ p(q) \in \mathbb R^2: \Vert p(q) \Vert_2^2 \leq (2l)^2, \ \forall q \in \mathcal Q \}`
321
+ $$
322
+
323
+
324
+ Deriving the end-effector’s *pose*--position *and* orientation--in some $m$-dimensional space $\boldsymbol{p} \in \mathcal{P} \subset \mathbb{R}^{m}$ starting from the configuration ${\textnormal{q}}\in \mathcal Q \subset \mathbb R^n$ of a $n$-joints robot is referred to as *forward kinematics* (FK), whereas identifying the configuration corresponding to any given target pose is termed *inverse kinematics* (IK). In that, FK is used to map a robot configuration into the corresponding end-effector pose, whereas IK is used to reconstruct the configuration(s) given an end-effector pose.
325
+
326
+ In the simplified case here considered (for which $\boldsymbol{p} \equiv p$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $p^*$ by solving analytically for $q: p(q) = f_{\text{FK}}(q) = p^*$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $\Vert p(q) - p^* \Vert_2^2$ is a natural candidate), yielding:
327
+
328
+ $\min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .$
330
+
331
+ Exact analytical solutions to IK are even less appealing when one considers the presence of obstacles in the robot’s workspace, resulting in constraints on the possible values of $q \in \mathcal Q \subseteq [-\pi, +\pi]^n \subset \mathbb R^n$ in the general case of $n$-links robots.
332
+
333
+ For instance, the robot in FigureΒ <a href="#fig:planar-manipulator-floor" data-reference-type="ref" data-reference="fig:planar-manipulator-floor">7</a> is (very naturally) obstructed by the presence of the surface upon which it rests: $\theta_1$ can now exclusively vary within $[0, \pi]$, while possible variations in $\theta_2$ depend on $\theta_1$ (when $\theta_1 \to 0$ or $\theta_1 \to \pi$, further downwards movements are restricted). Even for a simplified kinematic model, developing techniques to solveΒ eq.Β <a href="#eq:ik_problem" data-reference-type="ref" data-reference="eq:ik_problem">[eq:ik_problem]</a> is in general non-trivial in the presence of constraints, particularly considering that the feasible set of solutions $\mathcal Q$ may change across problems. FigureΒ <a href="#fig:planar-manipulator-floor-shelf" data-reference-type="ref" data-reference="fig:planar-manipulator-floor-shelf">9</a> provides an example of how the environment influences the feasible set considered, with a new set of constraints deriving from the position of a new obstacle.
334
+
335
+ However, IK--solving eq.Β <a href="#eq:ik_problem" data-reference-type="ref" data-reference="eq:ik_problem">[eq:ik_problem]</a> for a feasible $q$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate to this problem providing a length-$K$ succession of goal poses $\tau_K = [p^*_0, p^*_1, \dots p^*_K]$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $\tau_K$ via IK can prove prohibitively expensive, as tracking would require $K$ resolutions of eq.Β <a href="#eq:ik_problem" data-reference-type="ref" data-reference="eq:ik_problem">[eq:ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq.Β <a href="#eq:ik_problem" data-reference-type="ref" data-reference="eq:ik_problem">[eq:ik_problem]</a>. Let $J(q)$ denote the Jacobian matrix of (partial) derivatives of the FK-function $f_\text{FK}: \mathcal Q \mapsto \mathcal P$, such that $J(q) = \frac{\partial f_{FK}(q)}{\partial q }$. Then, one can apply the chain rule to any $p(q) = f_{\text{FK}}(q)$, deriving $\dot p = J(q) \dot q$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
336
+
337
+ Given a desired end-effector trajectory $\dot {p}^*(t)$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $\dot q(t)$ solving for joints’ *velocities* instead of *configurations*, $\dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2$.
339
+
340
+ UnlikeΒ eq.Β <a href="#eq:ik_problem" data-reference-type="ref" data-reference="eq:ik_problem">[eq:ik_problem]</a>, solving for $\dot q$ is much less dependent on the environment (typically, variations in velocity are constrained by physical limits on the actuators). Conveniently, eq.Β <a href="#eq:reg_ik_velocity" data-reference-type="ref" data-reference="eq:reg_ik_velocity">[eq:reg_ik_velocity]</a> also often admits the closed-form solution $\dot q = J(q)^+ \dot {p}^*$, where $J^+(q)$ denotes the Moore-Penrose pseudo-inverse of $J(q)$. Finally, discrete-time joint configurations $q$ can be reconstructed from joint velocities $\dot q$ using forward-integration on the continuous-time joint velocity, $q_{t+1} = q_t + \Delta t\,\dot q_t$ for a given $\Delta t$, resulting in tracking via diff-IK.
341
+
342
+ Following trajectories with diff-IK is a valid option in well-controlled and static environments (e.g., industrial manipulators in controlled manufacturing settings), and relies on the ability to define a set of target velocities to track $[\dot {p}^*_0, \dot {p}^*_1, \dots, \dot {p}^*_k ]$--an error-prone task largely requiring human expertise. Furthermore, diff-IK relies on the ability to (1) access $J(q) \, \forall q \in \mathcal Q$ and (2) compute its pseudo-inverse at every iteration of a given control cycle--a challenging assumption in highly dynamical settings, or for complex kinematic chains.
343
+
344
+ ### Adding Feedback Loops
345
+
346
+ While very effective when a goal trajectory has been well specified, the performance of diff-IK can degrade significantly in the presence of modeling/tracking errors, or in the presence of non-modeled dynamics in the environment.
347
+
348
+ <div class="wrapfigure">
349
+
350
+ r0.3 <ResponsiveImage
351
+ src={ch2_planar_manipulator_floor_box}
352
+ zoomable
353
+ downloadable
354
+ layout="fixed"
355
+ alt="image"
356
+ />
357
+ </div>
358
+
359
+ One such case is presented in FigureΒ <a href="#fig:planar-manipulator-box-velocity" data-reference-type="ref" data-reference="fig:planar-manipulator-box-velocity">[fig:planar-manipulator-box-velocity]</a>, where another rigid body other than the manipulator is moving in the environment along the horizontal axis, with velocity $\dot x_B$. Accounting analytically for the presence of this disturbance--for instance, to prevent the midpoint of the link from ever colliding with the object--requires access to $\dot x_B$ at least, to derive the equation characterizing the motion of the environment.
360
+
361
+ Less predictable disturbances however (e.g., $\dot x_B \leftarrow \dot x_B + {\varepsilon}, {\varepsilon}\sim N(0,1)$) may prove challenging to model analytically, and one could attain the same result of preventing link-object collision by adding a condition on the distance between the midpoint of $l$ and $x_B$, enforced through a feedback loop on the position of the robot and object at each control cycle.
362
+
363
+ To mitigate the effect of modeling errors, sensing noise and other disturbances, classical pipelines indeed do augment diff-IK with feedback control looping back quantities of interest. In practice, following a trajectory with a closed feedback loop might consist in backwarding the error between the target and measured pose, $\Delta p = p^*- p(q)$, hereby modifying the control applied to $\dot q = J(q)^+ (\dot {p}^*+ k_p \Delta p )$, with $k_p$ defined as the (proportional) gain.
364
+
365
+ More advanced techniques for control such as feedback linearization, PID control, Linear Quadratic Regulator (LQR) or Model-Predictive Control (MPC) can be employed to stabilize tracking and reject moderate perturbations, and we refer to for in-detail explanation of these concepts, or for a simple, intuitive example in the case of a point-mass system. Nonetheless, feedback control presents its challenges as well: tuning gains remains laborious and system-specific. Further, manipulation tasks present intermittent contacts inducing hybrid dynamics (mode switches) and discontinuities in the Jacobian, challenging the stability guarantees of the controller and thus often necessitating rather conservative gains and substantial hand-tuning.
366
+
367
+ We point the interested reader toΒ , , andΒ  for extended coverage of FK, IK, diff-IK and control for (diff-)IK.
368
+
369
+ ## Limitations of Dynamics-based Robotics
370
+
371
+ Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular applicative problem.
372
+
373
+ <ResponsiveImage
374
+ src={ch2_classical_limitations}
375
+ zoomable
376
+ downloadable
377
+ id="fig:classical-limitations"
378
+ layout="fixed"
379
+ alt="Dynamics-based approaches to robotics suffer from several limitations: (1) orchestrating multiple co..."
380
+ caption={'Dynamics-based approaches to robotics suffer from several limitations: (1) orchestrating multiple components poses integration challenges; (2) the need to develop custom processing pipelines for the sensing modalities and tasks considered hinders scalability; (3) simplified analytical models of physical phenomena (here friction at the gripper; credits to @antonovaReinforcementLearningPivoting2017) limit real-world performance. Lastly, (4) dynamics-based methods overlook trends in the availability and growth of robotics data.'}
381
+ />
382
+
383
+ Dynamics-based robotics pipelines have historically been <span class="text-hf-secondary">developed sequentially, engineering the different blocks</span> now within most architectures for specific purposes. That is, sensing, state estimation, mapping, planning, (diff-)IK, and low-level control have been traditionally developed as distinct modules with fixed interfaces. Pipelining these specific modules proved error-prone, and brittleness emerges--alongside compounding errors--whenever changes occur (e.g., changes in lighting for sensing, occlusion/failure of sensors, control failures). Adapting such a stack to new tasks or robotic platforms often entails re-specifying objectives, constraints, and heuristics at multiple stages, incurring significant engineering overhead.
384
+
385
+ Moreover, classical planners operate on compact, assumed-sufficient state representations; extending them to reason directly over raw, heterogeneous and noisy data streams is non-trivial. This results in a <span class="text-hf-secondary">limited scalability to multimodal data and multitask settings</span>, as incorporating high-dimensional perceptual inputs (RGB, depth, tactile, audio) traditionally required extensive engineering efforts to extract meaningful features for control. Also, the large number of tasks, coupled with the adoption of *per-task* planners, goal parameterizations, and safety constraints, results in an explosion in design and validation options, with little opportunity to reuse solutions across tasks.
386
+
387
+ Setting aside integration and scalability challenges: developing accurate modeling of contact, friction, and compliance for complicated systems remains difficult. Rigid-body approximations are often insufficient in the presence of deformable objects, and <span class="text-hf-secondary">relying on approximated models hinders real-world applicability</span> of the methods developed. In the case of complex, time-dependent and/or non-linear dynamics, even moderate mismatches in parameters, unmodeled evolutions, or grasp-induced couplings can qualitatively affect the observed dynamics.
388
+
389
+ Lastly, dynamics-based methods (naturally) overlook the rather recent <span class="text-hf-secondary">increase in availability of openly-available robotics datasets</span>. The curation of academic datasets by large centralized groups of human experts in robotics @collaborationOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 is now increasingly complemented by a <span class="text-hf-secondary">growing number of robotics datasets contributed in a decentralized fashion</span> by individuals with varied expertise. If not tangentially, dynamics-based approaches are not poised to maximally benefit from this trend, which holds the promise of allowing generalization in the space of tasks and embodiments, like data was the cornerstone for advancements in vision @alayracFlamingoVisualLanguage2022 and natural-language understanding @brownLanguageModelsAre2020.
390
+
391
+ Taken together, these limitations (FigureΒ <a href="#fig:classical-limitations" data-reference-type="ref" data-reference="fig:classical-limitations">10</a>) motivate the exploration of learning-based approaches that can (1) integrate perception and control more tightly, (2) adapt across tasks and embodiments with reduced expert modeling interventions and (3) scale gracefully in performance as more robotics data becomes available.
392
+
393
+ # Robot (Reinforcement) Learning
394
+
395
+ <div class="epigraph">
396
+
397
+ *Approximate the solution, not the problem* \[...\]
398
+
399
+ Richard Sutton
400
+
401
+ </div>
402
+ <div class="callout">
403
+
404
+ TL;DR The need for expensive high-fidelity simulators can be obviated by learning from real-world data, using sample-efficient algorithms that can safely train directly on hardware.
405
+
406
+ </div>
407
+ <ResponsiveImage
408
+ src={ch3_learning_benefits}
409
+ zoomable
410
+ downloadable
411
+ id="fig:robot-learning-upsides"
412
+ layout="fixed"
413
+ alt="Learning-based robotics streamlines perception-to-action by learning a (1) unified high-level contro..."
414
+ caption={'Learning-based robotics streamlines perception-to-action by learning a (1) unified high-level controller capable to take (2) high-dimensional, unstructured sensorimotor information. Learning (3) does not require a dynamics model and instead focuses on interaction data, and (4) empirically correlates with the scale of the data used.'}
415
+ />
416
+
417
+ Learning-based techniques for robotics naturally address the limitations presented in <a href="#sec:classical" data-reference-type="ref" data-reference="sec:classical">2</a> (Figure <a href="#fig:robot-learning-upsides" data-reference-type="ref" data-reference="fig:robot-learning-upsides">11</a>). Learning-based techniques typically rely on perception-to-action (*visuomotor policies*), thereby directly mapping sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components. Mapping sensorimotor inputs to actions directly also allows adding diverse input modalities, leveraging the automatic feature extraction characteristic of most modern learning systems. Further, learning-based approaches can in principle entirely bypass modeling efforts and instead rely exclusively on interaction data, proving transformative when dynamics are challenging to model or even entirely unknown. Lastly, learning for robotics (*robot learning*) is naturally well poised to leverage the growing amount of robotics data openly available, just as computer vision first and natural language processing later did historically benefit from large scale corpora of (possibly non curated) data, in great part overlooked by dynamics-based approaches.
418
+
419
+ Being a field at its relative nascent stages, no prevalent technique(s) proved distinctly better in robot learning. Still, two major classes of methods gained prominence: <span class="text-hf-secondary">reinforcement learning (RL)</span> and <span class="text-hf-secondary">Behavioral Cloning (BC)</span> (Figure <a href="#fig:robot-learning-atlas" data-reference-type="ref" data-reference="fig:robot-learning-atlas">12</a>). In this section, we provide a conceptual overview of applications of the former to robotics, as well as introduce practical examples of how to use RL within `lerobot`. We then introduce the major limitations RL suffers from, to introduce BC techniques in the next sections (<a href="#sec:learning-bc-single, sec:learning-bc-generalist" data-reference-type="ref" data-reference="sec:learning-bc-single, sec:learning-bc-generalist">[sec:learning-bc-single, sec:learning-bc-generalist]</a>).
420
+
421
+ <ResponsiveImage
422
+ src={ch3_learning_atlas}
423
+ zoomable
424
+ downloadable
425
+ id="fig:robot-learning-atlas"
426
+ layout="fixed"
427
+ alt="Overview of the robot learning methods implemented in lerobot."
428
+ caption={'Overview of the robot learning methods implemented in lerobot.'}
429
+ />
430
+
431
+ In Figure <a href="#fig:robot-learning-atlas" data-reference-type="ref" data-reference="fig:robot-learning-atlas">12</a> we decided to include generalist robot models @blackp0VisionLanguageActionFlow2024, @shukorSmolVLAVisionLanguageActionModel2025 alongside task-specific BC methods. While significantly different in spirit--*generalist* models are language-conditioned and use instructions to generate motion valid across many tasks, while *task-specific* models are typically not language-conditioned and used to perform a single task--foundation models are largely trained to reproduce trajectories contained in a large training set of input demonstrations. Thus, we argue generalist policies can indeed be grouped alongside other task-specific BC methods, as they both leverage similar training data and schemas.
432
+
433
+ FigureΒ <a href="#fig:robot-learning-atlas" data-reference-type="ref" data-reference="fig:robot-learning-atlas">12</a> illustrates this categorization graphically, explicitly listing all the robot learning policies currently available in `lerobot`: Action Chunking with Transformers (ACT)Β @zhaoLearningFineGrainedBimanual2023, Diffusion PolicyΒ @chiDiffusionPolicyVisuomotor2024, Vector-Quantized Behavior Transformer (VQ-BeT)Β @leeBehaviorGenerationLatent2024, $\pi_0$Β @blackp0VisionLanguageActionFlow2024, SmolVLAΒ @shukorSmolVLAVisionLanguageActionModel2025, Human-in-the-loop Sample-efficient RL (HIL-SERL)Β @luoPreciseDexterousRobotic2024 and TD-MPCΒ @hansenTemporalDifferenceLearning2022.
434
+
435
+ <ResponsiveImage
436
+ src={ch3_rl_examples}
437
+ zoomable
438
+ downloadable
439
+ id="fig:robotics-with-rl-examples"
440
+ layout="fixed"
441
+ alt="Examples of two different robotics tasks performed using RL. In the manipulation task (A) an agent l..."
442
+ caption={'Examples of two different robotics tasks performed using RL. In the manipulation task (A) an agent learns to reach for a yellow plastic block in its environment, and to put it inside of a box. In the locomotion task (B) an agent learns to move its center of mass sideways without falling.'}
443
+ />
444
+
445
+ Applications of RL to robotics have been long studied, to the point the relationship between these two disciplines has been compared to that between physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their interactive and sequential nature, many robotics problems can be directly mapped to RL problems. Figure <a href="#fig:robotics-with-rl-examples" data-reference-type="ref" data-reference="fig:robotics-with-rl-examples">13</a> depicts two such cases. Reaching for an object to move somewhere else in the scene is indeed a sequential problem where at each cycle the controller needs to adjust the position of the robotic arm based on its current configuration and the (possibly varying) position of the object. Figure <a href="#fig:robotics-with-rl-examples" data-reference-type="ref" data-reference="fig:robotics-with-rl-examples">13</a> also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation. While sliding to the side, the controller has to constantly keep adjusting to the robot’s proprioception to avoid failure (falling).
446
+
447
+ ## A (Concise) Introduction to RL
448
+
449
+ The RL frameworkΒ @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to model robotics problemsΒ @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) learning how to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents can improve via trial-and-error only, thus entirely bypassing the need to develop explicit models of the problem dynamics, and rather exploiting interaction data only. In RL, this feedback loop (FigureΒ <a href="#fig:rl-most-famous-pic" data-reference-type="ref" data-reference="fig:rl-most-famous-pic">14</a>) between actions and outcomes is established through the agent sensing a scalar quantity (*reward*).
450
+
451
+ <ResponsiveImage
452
+ src={ch3_agent_env}
453
+ zoomable
454
+ downloadable
455
+ id="fig:rl-most-famous-pic"
456
+ layout="fixed"
457
+ alt="Agent-Environment interaction diagram (image credits to @suttonReinforcementLearningIntroduction2018..."
458
+ caption={'Agent-Environment interaction diagram (image credits to @suttonReinforcementLearningIntroduction2018).'}
459
+ />
460
+
461
+ Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP) @bellmanMarkovianDecisionProcess1957. Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through MDP’s inherently stochastic formulation and (2) providing a theoretically sound framework for learning *without* an explicit dynamic model. While accommodating also a continuous time formulation, MDPs are typically considered in discrete time in RL, thus assuming interactions to atomically take place over the course of discrete *timesteps* $t=0,1,2,3, \dots, T$. MDPs allowing for an unbounded number of interactions ( $T \to + \infty$ ) are typically termed *infinite-horizon*, as opposed to *finite-horizon* MDPs in which $T$ cannot grow unbounded. Unless otherwise specified, we will only be referring to discrete-time finite-horizon (*episodic*) MDPs here.
462
+
463
+ Formally, a length-$T$ Markov Decision Process (MDP) is a tuple $\mathcal M = \langle \mathcal S, \mathcal A, \mathcal D, r, \gamma, \rho, T \rangle$, where:
464
+
465
+ - $\mathcal S$ is the *state space*; $s_t\in \mathcal S$ denotes the (possibly non-directly observable) environment state at time $t$. In robotics, states often comprise robot configuration and velocities ($q_t, \dot q_t$), and can accommodate sensor readings such as camera or audio streams.
466
+
467
+ - $\mathcal A$ is the *action space*; $a_t\in \mathcal A$ may represent joint torques, joint velocities, or even end-effector commands. In general, actions correspond to commands intervening on the configuration of the robot.
468
+
469
+ - $\mathcal D$ represents the (possibly non-deterministic) environment dynamics, with $\mathcal D: \mathcal S\times \mathcal A\times \mathcal S\mapsto [0, 1]$ corresponding to $\mathcal D\, (s_t, a_t, s_{t+1})= \mathbb P (s_{t+1}\vert s_t, a_t)$. For instance, for a planar manipulator dynamics could be considered deterministic when the environment is fully described (FigureΒ <a href="#fig:planar-manipulation-simple" data-reference-type="ref" data-reference="fig:planar-manipulation-simple">6</a>), and stochastic when unmodeled disturbances depending on non-observable parameters intervene (FigureΒ <a href="#fig:planar-manipulator-box-velocity" data-reference-type="ref" data-reference="fig:planar-manipulator-box-velocity">[fig:planar-manipulator-box-velocity]</a>).
470
+
471
+ - $r: \mathcal S\times \mathcal A\times \mathcal S\to \mathbb R$ is the *reward function*, weighing the transition $(s_t, a_t, s_{t+1})$ in the context of the achievement of an arbitrary goal. For instance, a simple reward function for quickly moving along the $x$ axis in 3D-space (Figure <a href="#fig:robotics-with-rl-examples" data-reference-type="ref" data-reference="fig:robotics-with-rl-examples">13</a>) could be based on the absolute position of the robot along the $x$ axis ($p_x$), include negative penalties for falling over (measured from $p_z$) and introduce bonuses $\dot p_x$ for speed, $r (s_t, a_t, s_{t+1})\equiv r(s_t) = p_{x_t} \cdot \dot p_{x_t} - \tfrac{1}{p_{z_t}}$.
472
+
473
+ Lastly, $\gamma \in [0,1]$ represents the discount factor regulating preference for immediate versus long-term reward (with an effective horizon equal to $\tfrac{1}{1-\gamma}$), and $\rho$ is the distribution, defined over $\mathcal S$, from which the MDP’s *initial* state is sampled, $s_0 \sim \rho$.
474
+
475
+ A length-$T$ *trajectory* is the (random) sequence
476
+ ``` math
477
+ \begin{equation}
478
+
479
+ \tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),
480
+ \end{equation}
481
+ ```
482
+ with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notation. Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
483
+ $$
484
+ \mathbb P(s_{t+1}\vert s_t, a_t, s_{t-1}, a_{t-1}, \dots, s_0, a_0 ) = \mathbb P (s_{t+1}\vert s_t, a_t)\\
485
+ \mathbb P(a_t\vert s_t, a_{t-1}, s_{t-1}, \dots, s_0, a_0) = \mathbb P(a_t\vert s_t)
486
+ $$
487
+ The probability of observing a given trajectory $\tau$ factorizes into
488
+ ``` math
489
+ \begin{equation}
490
+
491
+ \mathbb P(\tau) = \mathbb P (s_0) \prod_{t=0}^{T-1} \mathbb P (s_{t+1}\vert s_t, a_t)\ \mathbb P(a_t\vert s_t).
492
+ \end{equation}
493
+ ```
494
+
495
+ Policies $\mathbb P(a_t\vert s_t)$ are typically indicated as $\pi(a_t\vert s_t)$, and often parametrized via $\theta$, yielding $\pi_\theta (a_t\vert s_t)$. Policies are trained optimizing the (discounted) *return* associated to a given $\tau$, i.e. the (random) sum of measured rewards over trajectory:
496
+ ``` math
497
+ G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
498
+ ```
499
+ In that, agents seek to learn control strategies (*policies*, $\pi_\theta$) maximizing the expected return $\mathbb E_{\tau \sim \pi_\theta} G(\tau)$. For a given dynamics $\mathcal D$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $\Pi$, yielding the (maximization) target $J : \Pi \mapsto \mathbb R$
500
+ $$
501
+ J(\pi_\theta) = \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)],\\
502
+ \mathbb P_{\theta; \mathcal D} (\tau) = \rho \prod_{t=0}^{T-1} \mathcal D (s_t, a_t, s_{t+1})\ \pi_\theta (a_t\vert s_t).
503
+ $$
504
+
505
+
506
+ Because in the RL framework the agent is assumed to only be able to observe the environment dynamics and not to intervene on them,Β <a href="#eq:RL-j-function" data-reference-type="ref" data-reference="eq:RL-j-function">[eq:RL-j-function]</a> varies exclusively with the policy followed. In turn, MDPs naturally provide a framework to optimize over the space of the possible behaviors an agent might enact ($\pi \in \Pi$), searching for the *optimal policy* $\pi^* = \arg \max_{\theta} J(\pi_\theta)$, where $\theta$ is the parametrization adopted by the policy set $\Pi: \pi_\theta \in \Pi, \ \forall \theta$. Other than providing a target for policy search, $G(\tau)$ can also be used as a target to discriminate between states and state-action pairs. Given any state $s \in \mathcal S$--e.g., a given configuration of the robot--the *state-value* function
507
+ ``` math
508
+ V_\pi(s) = \mathbb E_{\tau \sim \pi} [G(\tau) \big \vert s_0 = s]
509
+ ```
510
+ can be used to discriminate between desirable and undesirable states in terms of long-term (discounted) reward maximization, under a given policy $\pi$. Similarly, the *state-action* value function also conditions the cumulative discounted reward on selecting action $a$ when in $s$, and thereafter acting according to $\pi$:
511
+ ``` math
512
+ Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
513
+ ```
514
+ Crucially, value functions are interrelated:
515
+ $$
516
+ Q_\pi(s_t, a_t) = \mathbb{E}_{s_{t+1}\sim \mathbb P(\bullet \vert s_t, a_t)} [r_t + \gamma V_\pi(s_{t+1})]\\
517
+ V_\pi(s_t) = \mathbb E_{a_t\sim \pi(\bullet \vert s_t)} [Q_\pi (s_t, a_t)]
518
+
519
+ $$
520
+ Inducing an ordering over states and state-action pairs under $\pi$, value functions are central to most RL algorithms. A variety of methods have been developed in RL as standalone attempts to find (approximate) solutions to the problem of maximizing cumulative reward (Figure <a href="#fig:rl-algos-atlas" data-reference-type="ref" data-reference="fig:rl-algos-atlas">15</a>).
521
+
522
+ <ResponsiveImage
523
+ src={ch3_rl_algorithms_atlas}
524
+ zoomable
525
+ downloadable
526
+ id="fig:rl-algos-atlas"
527
+ layout="fixed"
528
+ alt="Popular RL algorithms. See @SpinningUp2018 for a complete list of citations."
529
+ caption={'Popular RL algorithms. See @SpinningUp2018 for a complete list of citations.'}
530
+ />
531
+
532
+ Popular approaches to continuous state and action spaces--such as those studied within robotics--include @schulmanTrustRegionPolicy2017, @schulmanProximalPolicyOptimization2017, @haarnojaSoftActorCriticOffPolicy2018. Across manipulation @akkayaSolvingRubiksCube2019 and locomotion @leeLearningQuadrupedalLocomotion2020 problems, RL proved extremely effective in providing a platform to (1) adopt a unified, streamlined perception-to-action pipeline, (2) natively integrate proprioception with multi-modal high-dimensional sensor streams (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2024.
533
+
534
+ ## Real-world RL for Robotics
535
+
536
+ Streamlined end-to-end control pipelines, data-driven feature extraction and a disregard for explicit modeling in favor of interaction data are all features of RL for robotics. However, particularly in the context of real-world robotics, RL still suffers from limitations concerning machine safety and learning efficiency.
537
+
538
+ First, especially early in training, <span class="text-hf-secondary">actions are typically explorative, and thus erratic</span>. On physical systems, untrained policies may command high velocities, self-colliding configurations, or torques exceeding joint limits, leading to wear and potential hardware damage. Mitigating these risks requires external safeguards (e.g., watchdogs, safety monitors, emergency stops), often incurring a high degree of human supervision. Further, in the typical episodic setting considered in most robotics problems, experimentation is substantially slowed down by the need to manually reset the environment over the course of training, a time-consuming and brittle process.
539
+
540
+ Second, learning with a limited number of samples remains problematic in RL, <span class="text-hf-secondary">limiting the applicability of RL in real-world robotics due to consequently prohibitive timescales of training</span>. Even strong algorithms such as SAC @haarnojaSoftActorCriticOffPolicy2018 typically require a large number of transitions $\{ (s_t, a_t, r_t, s_{t+1})\}_{t=1}^N$. On hardware, generating these data is time-consuming and can even be prohibitive.
541
+
542
+ <ResponsiveImage
543
+ src={ch3_duck_sim_vs_real}
544
+ zoomable
545
+ downloadable
546
+ id="fig:synthetic-vs-real-duck"
547
+ layout="fixed"
548
+ alt="Simulated (left) vs. real-world (right) OpenDuck. Discrepancies in the simulation dynamics (reality ..."
549
+ caption={'Simulated (left) vs. real-world (right) OpenDuck. Discrepancies in the simulation dynamics (reality gap) pose risks to policy transfer.'}
550
+ />
551
+
552
+ Training RL policies in simulation @tobinDomainRandomizationTransferring2017 addresses both issues: it eliminates physical risk and dramatically increases throughput. Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) limiting the transfer of policies learned in simulation due to the discrepancy between real and simulated environments (*reality gap*, Figure <a href="#fig:synthetic-vs-real-duck" data-reference-type="ref" data-reference="fig:synthetic-vs-real-duck">16</a>). *Domain randomization* (DR) is a popular technique to overcome the reality gap, consisting in randomizing parameters of the simulated environment during training, to induce robustness to specific disturbances. In turn, DR is employed to increase the diversity of scenarios over the course of training, improving the chances of sim-to-real transfer @akkayaSolvingRubiksCube2019, @antonovaReinforcementLearningPivoting2017, @jiDribbleBotDynamicLegged2023. In practice, DR is performed further parametrizing the *simulator*’s dynamics $\mathcal D \equiv \mathcal D_\xi$ with a *dynamics* (random) vector $\xi$ drawn from an arbitrary distribution, $\xi \sim \Xi$. Over the course of training--typically at each episode’s reset--a new $\xi$ is drawn, and used to specify the environment’s dynamics for that episode. For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure <a href="#fig:ducks-on-terrains" data-reference-type="ref" data-reference="fig:ducks-on-terrains">17</a>), or the center of mass of an object for a manipulation task.
553
+
554
+ <ResponsiveImage
555
+ src={ch3_many_ducks}
556
+ zoomable
557
+ downloadable
558
+ id="fig:ducks-on-terrains"
559
+ layout="fixed"
560
+ alt="The same locomotion task can be carried out in different (simulated) domains (exemplified by the dif..."
561
+ caption={'The same locomotion task can be carried out in different (simulated) domains (exemplified by the difference in terrains) at training time, resulting in increased robustness over diverse environment dynamics.'}
562
+ />
563
+
564
+ While effective in transferring policies across the reality gap in real-world robotics @tobinDomainRandomizationTransferring2017, @akkayaSolvingRubiksCube2019, @jiDribbleBotDynamicLegged2023, @tiboniDomainRandomizationEntropy2024, DR often requires extensive manual engineering. First, identifying which parameters to randomize--i.e., the *support* $\text{supp} (\Xi)$ of $\Xi$--is an inherently task specific process. When locomoting over different terrains, choosing to randomize the friction coefficient is a reasonable choice, yet not completely resolutive as other factors (lighting conditions, external temperature, joints’ fatigue, etc.) may prove just as important, making selecting these parameters yet another source of brittleness.
565
+
566
+ Selecting the dynamics distribution $\Xi$ is also non-trivial. On the one hand, distributions with low entropy might risk causing failure at transfer time, due to the limited robustness induced over the course of training. On the other hand, excessive randomization may cause over-regularization and hinder performance. Consequently, the research community investigated approaches to automatically select the randomization distribution $\Xi$, using signals from the training process or tuning it to reproduce observed real-world trajectories. @akkayaSolvingRubiksCube2019 use a parametric uniform distribution $\mathcal U(a, b)$ as $\Xi$, widening the bounds as training progresses and the agent’s performance improves (AutoDR). While effective, AutoDR requires significant tuning--the bounds are widened by a fixed, pre-specified amount $\Delta$--and may disregard data when performance *does not* improve after a distribution update @tiboniDomainRandomizationEntropy2024. @tiboniDomainRandomizationEntropy2024 propose a similar method to AutoDR (DORAEMON) to evolve $\Xi$ based on training signal, but with the key difference of explicitly maximizing the entropy of parametric Beta distributions, inherently more flexible than uniform distributions. DORAEMON proves particularly effective at dynamically increasing the entropy levels of the training distribution by employing a max-entropy objective, under a performance-constraints formulation. Other approaches to automatic DR consist in specifically tuning $\Xi$ to align as much as possible the simulation and real-world domains. For instance, @chebotar2019closing interleave in-simulation policy training with repeated real-world policy rollouts used to adjust $\Xi$ based on real-world data, while @tiboniDROPOSimtoRealTransfer2023 leverage a single, pre-collected set of real-world trajectories and tune $\Xi$ under a simple likelihood objective.
567
+
568
+ While DR has shown promise, it does not address the main limitation that, even under the assumption that an ideal distribution $\Xi$ to sample from was indeed available, many robotics problems <span class="text-hf-secondary">cannot be simulated with high-enough fidelity under practical computational constraints</span> in the first place. Simulating contact-rich manipulation of possibly deformable or soft materials--i.e., *folding a piece of clothing*--can be costly and even time-intensive, limiting the benefits of in-simulation training.
569
+
570
+ A perhaps more fundamental limitation of RL for robotics is the general unavailability of complicated tasks’ *dense* reward function, the design of which is essentially based on human expertise and trial-and-error. In practice, *sparse* reward functions can be used to conclude whether one specific goal has been attained--*has this t-shirt been correctly folded?*--but unfortunately result in more challenging learning. As a result, despite notable successes, deploying RL directly on real-world robots at scale remains challenging.
571
+
572
+ To make the most of (1) the growing number of openly available datasets and (2) relatively inexpensive robots like the SO-100, RL could (1) be anchored in already-collected trajectories--limiting erratic and dangerous exploration--and (2) train in the real-world directly--bypassing the aforementioned issues with low-fidelity simulations. In such a context, sample-efficient learning is also paramount, as training on the real-world is inherently time-bottlenecked.
573
+
574
+ Off-policy algorithms like Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 tend to be more sample efficient than their on-policy counterparts @schulmanProximalPolicyOptimization2017, due to the presence of a *replay buffer* used over the course of the training. Other than allowing to re-use transitions $(s_t, a_t, r_t, s_{t+1})$ over the course of training, the replay buffer can also accommodate the injection of previously-collected data in the training process @ballEfficientOnlineReinforcement2023. Using expert demonstrations to guide learning together with learned rewards, RL training can effectively be carried out in the real-world @luoSERLSoftwareSuite2025. Interestingly, when completed with in-training human interventions, real-world RL agents have been shown to learn policies with near-perfect success rates on challenging manipulation tasks in 1-2 hours @luoPreciseDexterousRobotic2024.
575
+
576
+ #### Sample-efficient RL
577
+
578
+ In an MDP, the optimal policy $\pi^*$ can be derived from its associated $Q$-function, $Q_{\pi^*}$, and in particular the optimal action(s) $\mu(s_t)$ can be selected maximizing the optimal $Q$-functionΒ over the action space,
579
+ ``` math
580
+ \mu(s_t) = \arg\max_{a_t\in \mathcal A} Q_{\pi^*}(s_t, a_t).
581
+ ```
582
+ Interestingly, the $Q^*$-function satisfies a recursive relationship (*Bellman equation*) based on a very natural intuition [^2]:
583
+
584
+ > \[...\] If the optimal value $Q^*(s_{t+1}, a_{t+1})$ of the \[state\] $s_{t+1}$ was known for all possible actions $a_{t+1}$, then the optimal strategy is to select the action $a_{t+1}$ maximizing the expected value of $r_t + \gamma Q^*(s_{t+1}, a_{t+1})$
585
+ > ``` math
586
+ > Q^*(s_t, a_t) = \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} [r_t + \gamma \max_{a_{t+1} \in \mathcal A} Q^*(s_{t+1}, a_{t+1}) \big\vert s_t, a_t]
587
+ > ```
588
+
589
+ In turn, the optimal $Q$-function is guaranteed to be self-consistent by definition. *Value-iteration* methods exploit this relationship (and/or its state-value counterpart, $V^*(s_t)$ ) by iteratively updating an initial estimate of $Q^*$, $Q_0$, using the Bellman equation as update rule (*Q-learning*):
590
+ ``` math
591
+ Q_{i+1}(s_t, a_t) \leftarrow \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} [r_t + \gamma \max_{a_{t+1} \in \mathcal A} Q_i (s_{t+1}, a_{t+1}) \big\vert s_t, a_t], \quad i=0,1,2,\dots,K
592
+ ```
593
+ Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $Q_K \approx Q^*$ at each timestep. In fact, under certain assumptions on the MDP considered, $Q_K \to Q^* \, \text{as } K \to \infty$.
594
+
595
+ Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning proved difficult to scale to large $\mathcal S\times \mathcal A$ problems, in which merely storing $Q : \mathcal S\times \mathcal A\mapsto \mathbb R$ might be prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating large and even unstructured *state* spaces. DQN parametrizes the Q-function using a neural network with parameters $\theta$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $\delta_i$):
596
+ $$
597
+ `\mathcal L(\theta_i) = \mathbb E_{(s_t, a_t) \sim \chi(\bullet)}
598
+ \big[
599
+ (\underbrace{y_i - Q_{\theta_i}(s_t, a_t)}_{\delta_i})^2
600
+ \big],\\
601
+ y_i = \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} \big[ r_t + \gamma \max_{a_{t+1}\in \mathcal A} Q_{\theta_{i-1}} (s_{t+1}, a_{t+1}) \big], `
602
+ $$
603
+ where $\chi$ represents a behavior distribution over state-action pairs. Crucially, $\chi$ can in principle be different from the policy being followed, effectively allowing the reuse of prior data stored in a *replay buffer* in the form of $(s_t, a_t, r_t, s_{t+1})$ transitions, used to form the TD-target $y_i$, TD-error $\delta_i$ and loss function <a href="#eq:dqn-loss" data-reference-type="ref" data-reference="eq:dqn-loss">[eq:dqn-loss]</a> via Monte-Carlo (MC) estimates.
604
+
605
+ While effective in handling large, unstructured state spaces for discrete action-space problems, DQN's application to continuous control problems proved challenging. Indeed, in the case of high-capacity function approximators such as neural networks, solving $\max_{a_t \in \mathcal A} Q_\theta(s_t, a_t)$ at each timestep is simply unfeasible due to (1) the continuous nature of the action space ($\mathcal A\subset \mathbb R^n$ for some $n$) and (2) the impossibility of finding a cheap (ideally, closed-form) maximizer of $Q_\theta$. @silverDeterministicPolicyGradient2014 tackle this fundamental challenge by using a *deterministic* function of the state $s_t$ as policy, $\mu_\phi(s_t) = a_t$, parametrized by $\phi$. Thus, policies can be iteratively refined by updating $\phi$ along the direction:
606
+ ``` math
607
+ \begin{equation}
608
+
609
+ d_\phi = \mathbb E_{s_t \sim \mathbb P (\bullet)} [\nabla_\phi Q(s_t, a_t)\vert_{a_t = \mu_\phi(s_t)}] = \mathbb E_{s_t \sim \mathbb P(\bullet)} [\nabla_{a_t} Q(s_t, a_t) \vert_{a_t = \mu_\phi(s_t)} \cdot \nabla_\phi \mu_\phi(s_t)]
610
+ \end{equation}
611
+ ```
612
+ Provably, <a href="#eq:deterministic-pg" data-reference-type="ref" data-reference="eq:deterministic-pg">[eq:deterministic-pg]</a> is the *deterministic policy gradient* (DPG) of the policy $\mu_\phi$ @silverDeterministicPolicyGradient2014, so that updates $\phi_{k+1}\leftarrow \phi_k + \alpha d_\phi$ are guaranteed to increase the (deterministic) cumulative discounted reward, $J(\mu_\phi)$. @lillicrapContinuousControlDeep2019 extended DPG to the case of (1) high-dimensional unstructured observations and (2) continuous action spaces, introducing Deep Deterministic Policy Gradient (DDPG), an important algorithm in RL and its applications to robotics. DDPG adopts a modified TD-target compared to the one defined in <a href="#eq:TD-target" data-reference-type="ref" data-reference="eq:TD-target">[eq:TD-target]</a>, by maintaining a policy network used to select actions, yielding
613
+ ``` math
614
+ \begin{equation}
615
+
616
+ y_i = \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} \big[ r_t + \gamma Q_{\theta_{i-1}} (s_{t+1}, \mu_\phi(s_{t+1})) \big] .
617
+ \end{equation}
618
+ ```
619
+ Similarly to DQN, DDPG also employs the same replay buffer mechanism, reusing past transitions during training for increased sample efficiency and estimating the loss function via MC-estimates.
620
+
621
+ Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with <span class="text-hf-secondary">maximizing the discounted cumulative reward, while acting as randomly as possible</span>. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. To this end, MaxEnt revisits the RL objective $J (\pi)$ to specifically account for the policy entropy, $J(\pi) = \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]$. This modified objective results in the *soft* TD-target:
622
+ ``` math
623
+ \begin{equation}
624
+
625
+ y_i = \mathbb E_{s_{t+1} \sim \mathbb P( \bullet \vert s_t, a_t)} [r_t + \gamma \left( Q_{\theta_{i-1}} (s_{t+1}, a_{t+1}) - \alpha \log \pi_\phi(a_{t+1} \vert s_{t+1}) \right)], \quad a_{t+1} \sim \pi_\phi(\bullet \vert s_{t+1})
626
+ \end{equation}
627
+ ```
628
+ Similarly to DDPG, SAC also maintains an explicit policy, trained under the same MaxEnt framework for the maximization of <a href="#eq:J-soft" data-reference-type="ref" data-reference="eq:J-soft">[eq:J-soft]</a>, and updated using:
629
+ ``` math
630
+ \begin{equation}
631
+
632
+ \pi_{k+1} \leftarrow \arg\min_{\pi^\prime \in \Pi} \text{D}_{\text{KL}}\left(\pi^\prime (\bullet \vert s_t) \bigg\Vert \frac{\exp(Q_{\pi_k}(s_t, \bullet))}{Z_{\pi_k}(s_t)} \right)
633
+ \end{equation}
634
+ ```
635
+ The update rule provided in <a href="#eq:sac-policy-update" data-reference-type="ref" data-reference="eq:sac-policy-update">[eq:sac-policy-update]</a> optimizes the policy while projecting it on a set $\Pi$ of tractable distributions (e.g., Gaussians,Β @haarnojaReinforcementLearningDeep2017).
636
+
637
+ #### Sample-efficient, data-driven RL
638
+
639
+ Importantly, sampling $(s_t, a_t, r_t, s_{t+1})$ from the replay buffer $D$ conveniently allows approximating the previously introduced expectations for the TD-target and TD-error through Monte-Carlo (MC) estimates. The replay buffer $D$ also proves extremely useful in maintaining a history of previous transitions and using it for training, improving on sample efficiency. Furthermore, it also naturally provides an entry point to inject offline trajectories recorded, for instance, by a human demonstrator, into the training process.
640
+
641
+ Reinforcement Learning with Prior Data (RLPD)Β @ballEfficientOnlineReinforcement2023 is an Offline-to-Online RL algorithm leveraging prior data to effectively accelerate the training of a SAC agent. Unlike previous works on Offline-to-Online RL, RLPD avoids any pre-training and instead uses the available offline data $D_\text{offline}$ to improve online-learning from scratch. During each training step, transitions from both the offline and online replay buffers are sampled in equal proportion, and used in the underlying SAC routine.
642
+
643
+ #### Sample-efficient, data-driven, real-world RL
644
+
645
+ Despite the possibility of leveraging offline data for learning, the effectiveness of real-world RL training is still limited by the need to define a task-specific, hard-to-define reward function. Further, even assuming access to a well-defined reward function, typical robotics pipelines rely mostly on proprioceptive inputs augmented by camera streams of the environment. As such, even well-defined rewards would need to be derived from processed representations of unstructured observations, introducing brittleness. In their technical report, @luoSERLSoftwareSuite2025 empirically address the needs (1) to define a reward function and (2) to use it on image observations, by introducing a series of tools to allow for streamlined training of *reward classifiers* $c$, as well as jointly learning forward-backward controllers to speed up real-world RL. Reward classifiers are particularly useful in treating complex tasks--e.g., folding a t-shirt--for which a precise reward formulation is arbitrarily complex to obtain, or which require significant shaping and are more easily learned directly from demonstrations of success ($e^+$) or failure ($e^-$) states, $s \in \mathcal S$, with a natural choice for the state-conditioned reward function $r: \mathcal S \mapsto \mathbb R$ being $r(s) = \log c(e^+ \vert s)$. Further, @luoSERLSoftwareSuite2025 demonstrate the benefits of learning *forward* (executing the task from initial state to completion) and *backward* (resetting the environment to the initial state from completion) controllers, parametrized by separate policies.
646
+
647
+ Lastly, in order to improve the robustness of their approach to different goals while maintaining practical scalability, @luoSERLSoftwareSuite2025 introduced a modified state and action space, expressing proprioceptive configurations $q$ and actions $\dot q$ in the frame of the end-effector pose at $t=0$. By randomizing the initial pose of the end-effector ($s_0$), @luoSERLSoftwareSuite2025 achieved a similar result to that of manually randomizing the environment at every timestep, but with the benefit of keeping the environment in the same condition across multiple training episodes, achieving higher scalability of their method thanks to the increased practicality of their approach.
648
+
649
+ <ResponsiveImage
650
+ src={ch3_hil_serl_examples}
651
+ zoomable
652
+ downloadable
653
+ id="fig:hil-serl-blocks"
654
+ layout="fixed"
655
+ alt="(A) HIL-SERL allows for real-world training of high performance RL agents by building on top advance..."
656
+ caption={'(A) HIL-SERL allows for real-world training of high-performance RL agents by building on top of advancements presented by SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on a SO-100.'}
657
+ />
658
+
659
+ Building on off-policy deep Q-learning with replay buffers, entropy regularization for better exploration and performance, expert demonstrations to guide learning, and a series of tools and recommendations for real-world training using reward classifiers (FigureΒ <a href="#fig:hil-serl-blocks" data-reference-type="ref" data-reference="fig:hil-serl-blocks">18</a>),Β @luoPreciseDexterousRobotic2024 introduce human interactions during training, learning near-optimal policies in challenging real-world manipulation tasks in 1-2 hours.
660
+
661
+ Human in the Loop Sample Efficient Robot reinforcement Learning (HIL-SERL) @luoPreciseDexterousRobotic2024 augments offline-to-online RL with targeted human corrections during training, and employs prior data to (1) train a reward classifier and (2) bootstrap RL training on expert trajectories. While demonstrations provide the initial dataset seeding learning and constraining early exploration, interactive corrections allow a human supervisor to intervene on failure modes and supply targeted interventions to aid the learning process. Crucially, human interventions are stored in both the offline and online replay buffers, differently from the autonomous transitions generated at training time, which are stored in the online buffer only. Consequently, given an intervention timestep $k \in (0, T)$, length-$K$ human intervention data $\{ s^{\text{human}}_k, a^{\text{human}}_k, r^{\text{human}}_k, s^{\text{human}}_{k+1}\}_{k=1}^K$ is more likely to be sampled for off-policy learning than the data generated online during training, providing stronger supervision to the agent while still allowing for autonomous learning. Empirically, HIL-SERL attains near-perfect success rates on diverse manipulation tasks within 1-2 hours of training @luoPreciseDexterousRobotic2024, underscoring how offline datasets with online RL can markedly improve stability and data efficiency, and ultimately even allow real-world RL-training.
662
+
663
+ ### Code Example: Real-world RL
664
+
665
+ **TODO(fracapuano): work out rl training example**
666
+
667
+ ### Limitations of RL in Real-World Robotics: Simulators and Reward Design
668
+
669
+ Despite the advancements in real-world RL training, solving robotics tasks by training RL agents in the real world still suffers from the following limitations:
670
+
671
+ - In those instances where real-world training experience is prohibitively expensive to gatherΒ @degraveMagneticControlTokamak2022, @bellemareAutonomousNavigationStratospheric2020, in-simulation training is often the only option. However, high-fidelity simulators for real-world problems can be difficult to build and maintain, especially for contact-rich manipulation and tasks involving deformable or soft materials.
672
+
673
+ - Reward design poses an additional source of brittleness. Dense shaping terms are often required to guide exploration in long-horizon problems, but poorly tuned terms can lead to specification gaming or local optima. Sparse rewards avoid shaping but exacerbate credit assignment and slow down learning. In practice, complex behaviors require effort in shaping rewards: a brittle and error-prone process.
674
+
675
+ Advances in Behavioral Cloning (BC) from corpora of human demonstrations address both of these concerns. By learning in a supervised fashion to reproduce expert demonstrations, BC methods prove competitive while bypassing the need for simulated environments and hard-to-define reward functions.
676
+
677
+ # Robot (Imitation) Learning
app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-approaches.png RENAMED
File without changes
app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-classical-limitations.png RENAMED
File without changes
app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-cost-accessibility.png RENAMED
File without changes
app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-planar-manipulator-floor-box.png RENAMED
File without changes
app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-planar-manipulator-floor-shelf.png RENAMED
File without changes
app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-planar-manipulator-floor.png RENAMED
File without changes
app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-planar-manipulator-free.png RENAMED
File without changes
app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-platforms.png RENAMED
File without changes
app/src/content/assets/image/{ch2 β†’ figures/ch2}/ch2-so100-to-planar-manipulator.png RENAMED
File without changes
app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-agent-env.png RENAMED
File without changes
app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-duck-sim-vs-real.png RENAMED
File without changes
app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-hil-serl-examples.png RENAMED
File without changes
app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-learning-atlas.png RENAMED
File without changes
app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-learning-benefits.png RENAMED
File without changes
app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-many-ducks.png RENAMED
File without changes
app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-rl-algorithms-atlas.png RENAMED
File without changes
app/src/content/assets/image/{ch3 β†’ figures/ch3}/ch3-rl-examples.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-act-decoder.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-act-encoder.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-act.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-action-vs-observation-distribution.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-async-inference.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-bc-trajectories.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-diffusion-policy.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-diffusion-robot-actions.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-diffusion-vs-flowmatching.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-issues-with-bc.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-latent-variable-model.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-many-latents.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-normalizing-flows.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-observation-action-mapping.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-queues.png RENAMED
File without changes
app/src/content/assets/image/{ch4 β†’ figures/ch4}/ch4-task-effect-on-pairs.png RENAMED
File without changes
app/src/content/assets/image/{ch5 β†’ figures/ch5}/ch5-generalist-policies-timeline.png RENAMED
File without changes
app/src/content/assets/image/{ch5 β†’ figures/ch5}/ch5-ml-vs-robotics-foundation.png RENAMED
File without changes
app/src/content/assets/image/{ch5 β†’ figures/ch5}/ch5-pi0-sampling-timesteps.png RENAMED
File without changes
app/src/content/assets/image/{ch5 β†’ figures/ch5}/ch5-pi0.png RENAMED
File without changes
app/src/content/assets/image/{ch5 β†’ figures/ch5}/ch5-smolvla.png RENAMED
File without changes
app/src/content/assets/image/{ch5 β†’ figures/ch5}/ch5-trends.png RENAMED
File without changes
app/src/content/assets/{data β†’ image/figures/data}/somedata.json RENAMED
File without changes
assets/image/figures/ch1/ch1-lerobot-figure1.png ADDED

Git LFS Details

  • SHA256: a850d2b9170736a42366d65dd858408dcffafa3420a0c6cfd678bbdd29a196fa
  • Pointer size: 132 Bytes
  • Size of remote file: 2.86 MB
app/src/content/assets/image/misc/lerobot-team.jpeg β†’ assets/image/figures/ch2/ch2-approaches.png RENAMED
File without changes
assets/image/figures/ch2/ch2-classical-limitations.png ADDED

Git LFS Details

  • SHA256: 85742a774d8d1ad3e36fc50d89c5a69409bce98ebe6bdba734896156ba668aa8
  • Pointer size: 132 Bytes
  • Size of remote file: 4.74 MB
assets/image/figures/ch2/ch2-cost-accessibility.png ADDED

Git LFS Details

  • SHA256: 606cbb89fda90a2ddb22dc721ea978ffa9fe34a7f9f0bf1614b6ae53b4117411
  • Pointer size: 132 Bytes
  • Size of remote file: 1.96 MB
assets/image/figures/ch2/ch2-planar-manipulator-floor-box.png ADDED

Git LFS Details

  • SHA256: 3c856918ffb061c235d05e74df6310412f5b41ea907f0f12f55fed5c8b45590b
  • Pointer size: 130 Bytes
  • Size of remote file: 93.1 kB
assets/image/figures/ch2/ch2-planar-manipulator-floor-shelf.png ADDED

Git LFS Details

  • SHA256: e4abb239c45a576a02fc2cbd0d87f877b2c5f61dcac74e1b8c79a70ebacaca3e
  • Pointer size: 130 Bytes
  • Size of remote file: 83.6 kB
assets/image/figures/ch2/ch2-planar-manipulator-floor.png ADDED

Git LFS Details

  • SHA256: 4a2c70f2d7c903d9f16433a9ca44c10892fd0e10ca90e2d9b8438c3d25fa623a
  • Pointer size: 130 Bytes
  • Size of remote file: 58.9 kB