tfrere HF Staff commited on
Commit
759d176
·
1 Parent(s): 2b50d2d
app/scripts/latex-importer/mdx-converter.mjs CHANGED
@@ -222,38 +222,38 @@ ${imagesJson}
222
  }
223
 
224
  /**
225
- * Transform images to ResponsiveImage components
226
  * @param {string} content - MDX content
227
- * @returns {string} - Content with ResponsiveImage components
228
  */
229
  /**
230
- * Create ResponsiveImage component with import
231
  * @param {string} src - Clean image source
232
  * @param {string} alt - Alt text
233
  * @param {string} id - Element ID
234
  * @param {string} caption - Figure caption
235
  * @param {string} width - Optional width
236
- * @returns {string} - ResponsiveImage component markup
237
  */
238
- function createResponsiveImageComponent(src, alt = '', id = '', caption = '', width = '') {
239
  const varName = generateImageVarName(src);
240
  imageImports.set(src, varName);
241
- usedComponents.add('ResponsiveImage');
242
 
243
  const props = [];
244
  props.push(`src={${varName}}`);
245
  props.push('zoomable');
246
  props.push('downloadable');
247
  if (id) props.push(`id="${id}"`);
248
- props.push('layout="fixed"');
249
  if (alt) props.push(`alt="${alt}"`);
250
  if (caption) props.push(`caption={'${caption}'}`);
251
 
252
- return `<ResponsiveImage\n ${props.join('\n ')}\n/>`;
253
  }
254
 
255
  function transformImages(content) {
256
- console.log(' 🖼️ Transforming images to ResponsiveImage components with imports...');
257
 
258
  let hasImages = false;
259
 
@@ -297,7 +297,7 @@ function transformImages(content) {
297
  const altText = cleanAltText(cleanCap);
298
  hasImages = true;
299
 
300
- return createResponsiveImageComponent(cleanSrc, altText, id, cleanCap);
301
  }
302
  );
303
 
@@ -309,7 +309,7 @@ function transformImages(content) {
309
  const cleanAlt = cleanAltText(alt || 'Figure');
310
  hasImages = true;
311
 
312
- return createResponsiveImageComponent(cleanSrc, cleanAlt);
313
  }
314
  );
315
 
@@ -320,7 +320,7 @@ function transformImages(content) {
320
  const cleanSrc = cleanSrcPath(src);
321
  hasImages = true;
322
 
323
- return createResponsiveImageComponent(cleanSrc, 'Figure');
324
  }
325
  );
326
 
@@ -333,7 +333,7 @@ function transformImages(content) {
333
  const altText = cleanAltText(cleanCap);
334
  hasImages = true;
335
 
336
- return createResponsiveImageComponent(cleanSrc, altText, id, cleanCap);
337
  }
338
  );
339
 
@@ -346,11 +346,12 @@ function transformImages(content) {
346
  const altText = cleanAltText(cleanCap);
347
  hasImages = true;
348
 
349
- return createResponsiveImageComponent(cleanSrc, altText, id, cleanCap);
350
  }
351
  );
352
 
353
- // 6. Transform Pandoc-style images: ![alt](src){#id attr="value"}
 
354
  content = content.replace(
355
  /!\[([^\]]*)\]\(([^)]+)\)(?:\{([^}]+)\})?/g,
356
  (match, alt, src, attributes) => {
@@ -364,17 +365,121 @@ function transformImages(content) {
364
  if (idMatch) id = idMatch[1];
365
  }
366
 
367
- return createResponsiveImageComponent(cleanSrc, cleanAlt, id);
368
  }
369
  );
370
 
371
  if (hasImages) {
372
- console.log(' ✅ ResponsiveImage components with imports will be created');
373
  }
374
 
375
  return content;
376
  }
377
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  /**
379
  * Transform HTML spans with style attributes to appropriate components
380
  * @param {string} content - MDX content
@@ -951,6 +1056,7 @@ function processMdxContent(content, latexContent = '') {
951
  processedContent = cleanMdxSyntax(processedContent);
952
  processedContent = convertSubfiguresToMultiImage(processedContent);
953
  processedContent = transformImages(processedContent);
 
954
  processedContent = transformStyledSpans(processedContent);
955
  processedContent = transformHighlightSpans(processedContent);
956
  processedContent = fixEscapedMarkTags(processedContent);
 
222
  }
223
 
224
  /**
225
+ * Transform images to Image components
226
  * @param {string} content - MDX content
227
+ * @returns {string} - Content with Image components
228
  */
229
  /**
230
+ * Create Image component with import
231
  * @param {string} src - Clean image source
232
  * @param {string} alt - Alt text
233
  * @param {string} id - Element ID
234
  * @param {string} caption - Figure caption
235
  * @param {string} width - Optional width
236
+ * @returns {string} - Image component markup
237
  */
238
+ function createImageComponent(src, alt = '', id = '', caption = '', width = '') {
239
  const varName = generateImageVarName(src);
240
  imageImports.set(src, varName);
241
+ usedComponents.add('Image');
242
 
243
  const props = [];
244
  props.push(`src={${varName}}`);
245
  props.push('zoomable');
246
  props.push('downloadable');
247
  if (id) props.push(`id="${id}"`);
248
+ if (width) props.push(`width={${width}}`);
249
  if (alt) props.push(`alt="${alt}"`);
250
  if (caption) props.push(`caption={'${caption}'}`);
251
 
252
+ return `<Image\n ${props.join('\n ')}\n/>`;
253
  }
254
 
255
  function transformImages(content) {
256
+ console.log(' 🖼️ Transforming images to Image components with imports...');
257
 
258
  let hasImages = false;
259
 
 
297
  const altText = cleanAltText(cleanCap);
298
  hasImages = true;
299
 
300
+ return createImageComponent(cleanSrc, altText, id, cleanCap);
301
  }
302
  );
303
 
 
309
  const cleanAlt = cleanAltText(alt || 'Figure');
310
  hasImages = true;
311
 
312
+ return createImageComponent(cleanSrc, cleanAlt);
313
  }
314
  );
315
 
 
320
  const cleanSrc = cleanSrcPath(src);
321
  hasImages = true;
322
 
323
+ return createImageComponent(cleanSrc, 'Figure');
324
  }
325
  );
326
 
 
333
  const altText = cleanAltText(cleanCap);
334
  hasImages = true;
335
 
336
+ return createImageComponent(cleanSrc, altText, id, cleanCap);
337
  }
338
  );
339
 
 
346
  const altText = cleanAltText(cleanCap);
347
  hasImages = true;
348
 
349
+ return createImageComponent(cleanSrc, altText, id, cleanCap);
350
  }
351
  );
352
 
353
+
354
+ // 7. Transform Pandoc-style images: ![alt](src){#id attr="value"}
355
  content = content.replace(
356
  /!\[([^\]]*)\]\(([^)]+)\)(?:\{([^}]+)\})?/g,
357
  (match, alt, src, attributes) => {
 
365
  if (idMatch) id = idMatch[1];
366
  }
367
 
368
+ return createImageComponent(cleanSrc, cleanAlt, id);
369
  }
370
  );
371
 
372
  if (hasImages) {
373
+ console.log(' ✅ Image components with imports will be created');
374
  }
375
 
376
  return content;
377
  }
378
 
379
/**
 * Transform figures with Image components that still have separate figcaptions
 *
 * Collapses
 *   <figure> <Image .../> <span id="..."></span> <figcaption>...</figcaption> </figure>
 * into a single `<Image ... caption={'...'}/>`. Four layouts are handled, in
 * this order: plain span, HTML-escaped span wrapped in <p>, and the same two
 * inside a `<div class="minipage">`.
 *
 * The anchor id carried by the placeholder span is moved onto the Image as an
 * `id` prop (unless the Image already has one) so that existing
 * `<a href="#id">` cross-references keep a target.
 *
 * @param {string} content - MDX content
 * @returns {string} - Content with Image components using caption props
 */
function transformImageFigures(content) {
  console.log(' 🔧 Transforming figures with Image components and separate figcaptions...');

  let hasTransformed = false;

  // Tempered groups: props may not run past the end of the current figure and
  // the caption may not run past its own closing tag, so a figure that does not
  // fit a layout can never be merged with the figure that follows it.
  const PROPS = '((?:(?!</figure>)[\\s\\S])*?)';
  const CAPTION = '((?:(?!</figcaption>)[\\s\\S])*?)';
  const RAW_SPAN = '<span([^>]*)></span>';
  const ESCAPED_SPAN = '<p>&lt;span([^&]*)&gt;&lt;/span&gt;</p>';

  const buildPattern = (span, inMinipage) =>
    new RegExp(
      '<figure>\\s*' +
        (inMinipage ? '<div class="minipage">\\s*' : '') +
        `<Image${PROPS}/>\\s*${span}\\s*` +
        (inMinipage ? '</div>\\s*' : '') +
        `<figcaption>${CAPTION}</figcaption>\\s*</figure>`,
      'g'
    );

  // Turn raw figcaption HTML into text that is safe inside caption={'...'}.
  const cleanCaption = (raw) =>
    raw
      .replace(/<[^>]*>/g, '') // Remove HTML tags
      .replace(/\s+/g, ' ') // Collapse newlines / carriage returns / runs of spaces
      .replace(/\\/g, '\\\\') // Escape backslashes first (LaTeX such as \pi, \xi)
      .replace(/'/g, "\\'") // Escape quotes
      .trim(); // Trim whitespace

  const toImage = (match, imageProps, spanAttrs, caption) => {
    hasTransformed = true;

    const idMatch = /(?:^|\s)id="([^"]+)"/.exec(spanAttrs);
    const alreadyHasId = /(?:^|\s)id=/.test(imageProps);
    const idProp = idMatch && !alreadyHasId ? ` id="${idMatch[1]}"` : '';

    return `<Image${imageProps}${idProp} caption={'${cleanCaption(caption)}'}/>`;
  };

  const patterns = [
    buildPattern(RAW_SPAN, false),
    buildPattern(ESCAPED_SPAN, false),
    buildPattern(RAW_SPAN, true),
    buildPattern(ESCAPED_SPAN, true),
  ];

  for (const pattern of patterns) {
    content = content.replace(pattern, toImage);
  }

  if (hasTransformed) {
    console.log(' ✅ Transformed figures with Image components to use caption props');
  } else {
    console.log(' ℹ️ No figures with Image components and separate figcaptions found');
  }

  return content;
}
482
+
483
  /**
484
  * Transform HTML spans with style attributes to appropriate components
485
  * @param {string} content - MDX content
 
1056
  processedContent = cleanMdxSyntax(processedContent);
1057
  processedContent = convertSubfiguresToMultiImage(processedContent);
1058
  processedContent = transformImages(processedContent);
1059
+ processedContent = transformImageFigures(processedContent);
1060
  processedContent = transformStyledSpans(processedContent);
1061
  processedContent = transformHighlightSpans(processedContent);
1062
  processedContent = fixEscapedMarkTags(processedContent);
app/scripts/latex-importer/output/main.mdx CHANGED
@@ -19,7 +19,7 @@ tableOfContentsAutoCollapse: true
19
  ---
20
 
21
  import MultiImage from '../components/MultiImage.astro';
22
- import ResponsiveImage from '../components/ResponsiveImage.astro';
23
  import Quote from '../components/Quote.astro';
24
  import ch2_planar_manipulator_free from './assets/image/figures/ch2/ch2-planar-manipulator-free.png';
25
  import ch2_planar_manipulator_floor from './assets/image/figures/ch2/ch2-planar-manipulator-floor.png';
@@ -84,17 +84,12 @@ We sincerely hope this tutorial serves as a valuable starting point for your jou
84
 
85
  ## Introduction
86
 
87
- <figure>
88
- <ResponsiveImage
89
  src={ch1_lerobot_figure1}
90
  zoomable
91
  downloadable
92
- layout="fixed"
93
  alt="Figure"
94
- />
95
- <span id="figure1" style="position: absolute;"></span>
96
- <figcaption><code>lerobot</code> is the open-source library for end-to-end robotics developed by Hugging Face. The library is vertically integrated on the entire robotics stack, supporting low-level control of real-world robot devices, advanced data and inference optimizations, as well as SOTA robot learning methods with simple implementations in pure Pytorch.</figcaption>
97
- </figure>
98
 
99
  Autonomous robotics holds the promise of relieving humans from repetitive, tiring or dangerous manual tasks. Consequently, the field of robotics has been widely studied since its inception in the 1950s. Lately, advancements in Machine Learning (ML) have sparked the development of a relatively new class of methods used to tackle robotics problems, leveraging large amounts of data and computation rather than human expertise and modeling skills to develop autonomous systems.
100
 
@@ -293,17 +288,12 @@ TL;DR Learning-based approaches to robotics are motivated by the need to (1) gen
293
 
294
  ### Explicit and Implicit Models
295
 
296
- <figure>
297
- <ResponsiveImage
298
  src={ch2_approaches}
299
  zoomable
300
  downloadable
301
- layout="fixed"
302
  alt="Figure"
303
- />
304
- <span id="generating-motion-atlas" style="position: absolute;"></span>
305
- <figcaption>Overview of methods to generate motion (clearly non-exhausitve, see @bekrisStateRobotMotion2024). The different methods can be grouped based on whether they explicitly (<em>dynamics-based</em>) or implicitly (<em>learning-based</em>) model robot-environment interactions.</figcaption>
306
- </figure>
307
 
308
  Robotics is concerned with producing artificial motion in the physical world in a useful, reliable and safe fashion. Thus, robotics is an inherently multidisciplinary domain: producing autonomous motion in the physical world requires, at the very least, interfacing different software (motion planners) and hardware (motion executioners) components. Further, knowledge of mechanical, electrical, and software engineering, as well as rigid-body mechanics and control theory, has therefore proven quintessential in robotics since the field first developed in the 1950s. More recently, Machine Learning (ML) has also proved effective in robotics, complementing these more traditional disciplines @connellRobotLearning1993. As a direct consequence of its multidisciplinary nature, robotics has developed as a rather wide array of methods, all concerned with the main purpose of <mark>producing artificial motion in the physical world</mark>.
309
 
@@ -311,17 +301,12 @@ Methods to produce robotics motion range from traditional *explicit* models--<ma
311
 
312
  ### Different Types of Motion
313
 
314
- <figure>
315
- <ResponsiveImage
316
  src={ch2_platforms}
317
  zoomable
318
  downloadable
319
- layout="fixed"
320
  alt="Figure"
321
- />
322
- <span id="robotics-platforms-atlas" style="position: absolute;"></span>
323
- <figcaption>Different kinds of motions are achieved with potentially very different robotic platforms. From left to right, top to bottom: ViperX, SO-100, Boston Dynamics’ Spot, Open-Duck, 1X’s NEO, Boston Dynamics’ Atlas. This is an example list of robotic platforms and is (very) far from being exhaustive.</figcaption>
324
- </figure>
325
 
326
  In the vast majority of instances, robotics deals with producing motion via actuating joints connecting nearly entirely-rigid links. A key distinction between focus areas in robotics is based on whether the generated motion modifies (1) the absolute state of the environment (via dexterity), (2) the relative state of the robot with respect to its environment (exercising mobility skills), or (3) a combination of the two (Figure <a href="#robotics-platforms-atlas" data-reference-type="ref" data-reference="robotics-platforms-atlas">[robotics-platforms-atlas]</a>).
327
 
@@ -335,31 +320,21 @@ Robot manipulators typically consist of a series of links and joints, articulate
335
 
336
  Recently, the development of low-cost manipulators like the ALOHA @zhaoLearningFineGrainedBimanual2023 ALOHA-2 @aldacoALOHA2Enhanced and SO-100/SO-101 @knightStandardOpenSO100 platforms significantly lowered the barrier to entry to robotics, considering the increased accessibility of these robots compared to more traditional platforms like the Franka Emika Panda arm (Figure <a href="#robotic-platforms-costs" data-reference-type="ref" data-reference="robotic-platforms-costs">[robotic-platforms-costs]</a>).
337
 
338
- <figure>
339
- <ResponsiveImage
340
  src={ch2_cost_accessibility}
341
  zoomable
342
  downloadable
343
- layout="fixed"
344
  alt="Figure"
345
- />
346
- <span id="robotic-platforms-costs" style="position: absolute;"></span>
347
- <figcaption>Cheaper, more accessible robots are starting to rival traditional platforms like the Panda arm platforms in adoption in resource-constrained scenarios. The SO-100, in particular, has a cost in the 100s of Euros, and can be entirely 3D-printed in hours, while the industrially-manufactured Panda arm costs tens of thousands of Euros and is not openly available.</figcaption>
348
- </figure>
349
 
350
  Deriving an intuition as per why learning-based approaches are gaining popularity in the robotics community requires briefly analyzing traditional approaches for manipulation, leveraging tools like forward and inverse kinematics (FK, IK) and control theory. Providing a detailed overview of these methods falls (well) out of the scope of this tutorial, and we refer the reader to works including @sicilianoSpringerHandbookRobotics2016, @lynchModernRoboticsMechanics2017, @tedrakeRoboticManipulationPerception, @tedrakeUnderactuatedRoboticsAlgorithms for a much more comprehensive description of these techniques. Here, we mostly wish to highlight the benefits of ML over these traditional techniques
351
 
352
- <figure>
353
- <ResponsiveImage
354
  src={ch2_so100_to_planar_manipulator}
355
  zoomable
356
  downloadable
357
- layout="fixed"
358
  alt="Figure"
359
- />
360
- <span id="make-so100-planar-manipulator" style="position: absolute;"></span>
361
- <figcaption>The SO-100 arm is a 6-dof manipulator arm. Preventing some of its joints (shoulder pane, wrist flex and wrist roll) from actuating, it can be represented as a traditional 2-dof planar manipulator (the gripper joint in the end-effector is not considered towards the count of the degrees of freedom used to produce motion).</figcaption>
362
- </figure>
363
 
364
  Consider the (simple) case where a SO-100 is restrained from actuating (1) the shoulder pane and (2) the wrist flex and roll motors. This effectively reduces the degrees of freedom of the SO-100 from the original 5+1 (5 joints + 1 gripper) to 2+1 (shoulder lift, elbow flex + gripper). As the end-effector does not impact motion in this model, the SO-100 is effectively reduced to the planar manipulator robot presented in Figure <a href="#make-so100-planar-manipulator" data-reference-type="ref" data-reference="make-so100-planar-manipulator">[make-so100-planar-manipulator]</a>, where spheres represent actuators, and solid lines indicate length-$l$ links from the base of the SO-100 to the end-effector (*ee*).
365
 
@@ -437,11 +412,10 @@ While very effective when a goal trajectory has been well specified, the perform
437
 
438
  <div class="wrapfigure">
439
 
440
- r0.3 <ResponsiveImage
441
  src={ch2_planar_manipulator_floor_box}
442
  zoomable
443
  downloadable
444
- layout="fixed"
445
  alt="image"
446
  />
447
 
@@ -462,17 +436,12 @@ We point the interested reader to , , and  for extended coverage of FK, IK, di
462
 
463
  Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular applicative problem.
464
 
465
- <figure>
466
- <ResponsiveImage
467
  src={ch2_classical_limitations}
468
  zoomable
469
  downloadable
470
- layout="fixed"
471
  alt="Figure"
472
- />
473
- <span id="classical-limitations" style="position: absolute;"></span>
474
- <figcaption>Dynamics-based approaches to robotics suffer from several limitations: (1) orchestrating multiple components poses integration challenges; (2) the need to develop custom processing pipelines for the sensing modalities and tasks considered hinders scalability; (3) simplified analytical models of physical phenomena (here friction at the gripper; credits to @antonovaReinforcementLearningPivoting2017) limit real-world performance. Lastly, (4) dynamics-based methods overlook trends in the availability and growth of robotics data.</figcaption>
475
- </figure>
476
 
477
  Dynamics-based robotics pipelines have historically been <mark>developed sequentially, engineering the different blocks</mark> now within most architectures for specific purposes. That is, sensing, state estimation, mapping, planning, (diff-)IK, and low-level control have been traditionally developed as distinct modules with fixed interfaces. Pipelining these specific modules proved error-prone, and brittleness emerges--alongside compounding errors--whenever changes occur (e.g., changes in lighting for sensing, occlusion/failure of sensors, control failures). Adapting such a stack to new tasks or robotic platforms often entails re-specifying objectives, constraints, and heuristics at multiple stages, incurring significant engineering overhead.
478
 
@@ -495,17 +464,12 @@ Taken together, these limitations (Figure <a href="#classical-limitations" data
495
  TL;DR The need for expensive, high-fidelity simulators can be obviated learning from real-world data, using sample-efficient algorithms that can safely train directly on hardware.
496
 
497
  </div>
498
- <figure>
499
- <ResponsiveImage
500
  src={ch3_learning_benefits}
501
  zoomable
502
  downloadable
503
- layout="fixed"
504
  alt="Figure"
505
- />
506
- <span id="robot-learning-upsides" style="position: absolute;"></span>
507
- <figcaption>Learning-based robotics streamlines perception-to-action by learning a (1) unified high-level controller capable to take (2) high-dimensional, unstructured sensorimotor information. Learning (3) does not require a dynamics model and instead focuses on interaction data, and (4) empirically correlates with the scale of the data used. </figcaption>
508
- </figure>
509
 
510
  Learning-based techniques for robotics naturally address the limitations presented in Section <a href="#classical" data-reference-type="ref" data-reference="classical">[classical]</a> (Figure <a href="#robot-learning-upsides" data-reference-type="ref" data-reference="robot-learning-upsides">[robot-learning-upsides]</a>). In particular, learning-based techniques typically rely on monolithic prediction-to-action pipelines (*visuomotor policies*) which directly map sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components. Mapping sensory inputs to actions also makes it possible to incorporate diverse input modalities, leveraging the automatic feature extraction capabilities of modern learning systems. Moreover, learning-based approaches can, in principle, bypass explicit modeling altogether and instead rely solely on interaction data--an advantage that proves transformative when dynamics are difficult to model or entirely unknown. Lastly, learning for robotics (*robot learning*) is naturally well posed to leverage the growing amount of robotics data openly available, just as computer vision and natural language processing did historically benefit from large-scale corpora of data, in great part overlooked by dynamics-based approaches.
511
 
@@ -513,11 +477,10 @@ Being a field at its relative nascent stages, no prevalent technique(s) proves d
513
 
514
  <div class="wrapfigure">
515
 
516
- r0.3 <ResponsiveImage
517
  src={ch3_learning_atlas}
518
  zoomable
519
  downloadable
520
- layout="fixed"
521
  alt="image"
522
  />
523
 
@@ -526,17 +489,12 @@ r0.3 <ResponsiveImage
526
 
527
  In Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> we deliberately include generalist robot models @blackp0VisionLanguageActionFlow2024, @shukorSmolVLAVisionLanguageActionModel2025 alongside task-specific BC methods. While significantly different in spirit--*generalist* models are language-conditioned and use instructions to generate motion valid across many tasks, while *task-specific* models are typically not language-conditioned and used to perform a single task--*foundation* models are still largely trained to reproduce trajectories contained in a (large) training set of input demonstrations. Thus, we argue generalist policies can indeed be grouped alongside other task-specific BC methods, as they both leverage similar training data and schemas. Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> illustrates this categorization graphically, explicitly listing all the robot learning policies currently available in `lerobot`- Action Chunking with Transformers (ACT) @zhaoLearningFineGrainedBimanual2023, Diffusion Policy @chiDiffusionPolicyVisuomotor2024, Vector-Quantized Behavior Transformer (VQ-BeT) @leeBehaviorGenerationLatent2024, $\pi_0$ @blackp0VisionLanguageActionFlow2024, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025, Human-in-the-loop Sample-efficient RL (HIL-SERL) @luoPreciseDexterousRobotic2024 and TD-MPC @hansenTemporalDifferenceLearning2022.
528
 
529
- <figure>
530
- <ResponsiveImage
531
  src={ch3_rl_examples}
532
  zoomable
533
  downloadable
534
- layout="fixed"
535
  alt="Figure"
536
- />
537
- <span id="robotics-with-rl-examples" style="position: absolute;"></span>
538
- <figcaption>Examples of two different robotics tasks performed using RL. In the manipulation task (A) an agent learns to reach for a yellow plastic block in its environment, and to put it inside of a box. In the locomotion task (B) an agent learns to move its center of mass sideways without falling.</figcaption>
539
- </figure>
540
 
541
  Applications of RL to robotics have been studied long enough that the relationship between these two disciplines has been compared to that of physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their inherently interactive and sequential nature, robotics control problems can be directly cast as RL problems. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> presents two of such cases. Reaching for an object to then move it somewhere else in the scene is a sequential problem where over time the controller needs to adjust the position of the robot arm based on the current configuration and the (possibly varying) position of the object. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation- while sliding to the side, the controller needs to keep adjusting to the robot’s to avoid failure (falling).
542
 
@@ -544,17 +502,12 @@ Applications of RL to robotics have been studied long enough that the relationsh
544
 
545
  The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to tackle robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) capable to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents improve through trial and error, bypassing explicit models of the problem dynamics in favor of interaction data. In RL, this feedback loop between actions and outcomes (Figure <a href="#rl-most-famous-pic" data-reference-type="ref" data-reference="rl-most-famous-pic">[rl-most-famous-pic]</a>) is established through the agent sensing a scalar quantity (*reward*) measuring how desirable a given *transition* is for the accomplishment of its goal.
546
 
547
- <figure>
548
- <ResponsiveImage
549
  src={ch3_agent_env}
550
  zoomable
551
  downloadable
552
- layout="fixed"
553
  alt="Figure"
554
- />
555
- <span id="rl-most-famous-pic" style="position: absolute;"></span>
556
- <figcaption>Agent-Environment interaction diagram (image credits to @suttonReinforcementLearningIntroduction2018).</figcaption>
557
- </figure>
558
 
559
  Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP) @bellmanMarkovianDecisionProcess1957. Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through MDP’s inherently stochastic formulation and (2) providing a theoretically-sound framework for learning *without* an explicit model of the environment dynamics. While accommodating a continuous time formulation too, MDPs are typically considered in discrete time in RL, assuming interactions to atomically take place at discrete *timestep* $t=0,1,2,3, \dots, T$. MDPs allowing for an unbounded number of interactions ($T \to + \infty$) are termed *infinite-horizon*, and opposed to *finite-horizon* MDPs in which $T$ is finite. Unless diversely specified, we will only be referring to discrete-time finite-horizon (*episodic*) MDPs.
560
 
@@ -628,17 +581,12 @@ V_\pi(s_t) &= \mathbb E_{a_t\sim \pi(\bullet \vert s_t)} [Q_\pi (s_t, a_t)],
628
  ```
629
  inducing an ordering over states and state-action pairs under $\pi$, and value functions are thus central to most RL algorithms. A variety of algorithms have been developed in RL attempting to find (approximate) solutions to the problem of maximizing cumulative reward (we report some in Figure <a href="#rl-algos-atlas" data-reference-type="ref" data-reference="rl-algos-atlas">[rl-algos-atlas]</a>).
630
 
631
- <figure>
632
- <ResponsiveImage
633
  src={ch3_rl_algorithms_atlas}
634
  zoomable
635
  downloadable
636
- layout="fixed"
637
  alt="Figure"
638
- />
639
- <span id="rl-algos-atlas" style="position: absolute;"></span>
640
- <figcaption>Popular RL algorithms. See @SpinningUp2018 for a complete list of citations.</figcaption>
641
- </figure>
642
 
643
  Popular approaches to continuous state and action space--such as those studied within robotics--include ,  and . Across manipulation @akkayaSolvingRubiksCube2019 and locomotion problems @leeLearningQuadrupedalLocomotion2020, RL proved extremely effective in providing a platform to (1) leverage a unified, streamlined perception-to-action pipeline, (2) natively integrate propioperception with multi-modal high-dimensional sensory streams (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2025.
644
 
@@ -648,31 +596,21 @@ Streamlined end-to-end control pipelines, data-driven feature extraction and a d
648
 
649
  First, especially early in training, <mark>actions are typically explorative, and thus may be erratic</mark>. On physical systems, untrained policies may command high velocities, self-colliding configurations, or torques exceeding joint limits, leading to wear and potential hardware damage. Mitigating these risks requires external safeguards (e.g., watchdogs, safety monitors, emergency stops), often incurring a high degree of human supervision. Further, in the typical episodic setting considered in most robotics problems, experimentation is substantially slowed down by the need to manually reset the environment over the course of training, a time-consuming and error-prone process. Second, learning efficiently remains problematic in RL, <mark>limiting the applicability of RL in real-world robotics due to consequently prohibitive timescales of training</mark>. Even strong algorithms such as SAC @haarnojaSoftActorCriticOffPolicy2018 typically require a large number of transitions $\{ (s_t, a_t, r_t, s_{t+1})\}_{t=1}^N$. On real-world hardware, generating this data is time-consuming.
650
 
651
- <figure>
652
- <ResponsiveImage
653
  src={ch3_duck_sim_vs_real}
654
  zoomable
655
  downloadable
656
- layout="fixed"
657
  alt="Figure"
658
- />
659
- <span id="synthetic-vs-real-duck" style="position: absolute;"></span>
660
- <figcaption>Simulated (left) vs. real-world (right) OpenDuck. Discrepancies in the simulation dynamics (<em>reality gap</em>) pose risks to policy transfer.</figcaption>
661
- </figure>
662
 
663
  Training RL policies in simulation @tobinDomainRandomizationTransferring2017 addresses both issues, eliminating physical risk and dramatically increasing throughput. Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) limiting the possibilities to transfer the policies learned in simulation, due to the discrepancy between real and simulated environments (*reality gap*, Figure <a href="#synthetic-vs-real-duck" data-reference-type="ref" data-reference="synthetic-vs-real-duck">[synthetic-vs-real-duck]</a>). *Domain randomization* @tobinDomainRandomizationTransferring2017 (DR) is a popular technique to overcome the reality gap, and consists in randomizing the parameters of the simulated environment during training, aiming at inducing robustness to specific disturbances. In this, DR is typically employed to increase the diversity of scenarios over the course of training, improving the performance of sim-to-real transferred policies @akkayaSolvingRubiksCube2019, @antonovaReinforcementLearningPivoting2017, @jiDribbleBotDynamicLegged2023. In practice, DR is performed by training in simulation on simulated dynamics $\mathcal D$, further parametrized as $\mathcal D \equiv \mathcal D_\xi$, with a *dynamics* (random) vector $\xi$ drawn from an arbitrary distribution, $\xi \sim \Xi$. For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure <a href="#ducks-on-terrains" data-reference-type="ref" data-reference="ducks-on-terrains">[ducks-on-terrains]</a>), or the center of mass of an object for a manipulation task. Over the course of training--typically at each episode’s reset--a new $\xi$ is drawn, and used to specify the environment’s dynamics for that episode.
664
 
665
- <figure>
666
- <ResponsiveImage
667
  src={ch3_many_ducks}
668
  zoomable
669
  downloadable
670
- layout="fixed"
671
  alt="Figure"
672
- />
673
- <span id="ducks-on-terrains" style="position: absolute;"></span>
674
- <figcaption>The same locomotion task can be carried out in different (simulated) domains (exemplified by the difference in terrains) at training time, resulting to increased robustness over diverse environment dynamics.</figcaption>
675
- </figure>
676
 
677
  While effective in transferring policies across the reality gap in real-world robotics @tobinDomainRandomizationTransferring2017, @akkayaSolvingRubiksCube2019, @jiDribbleBotDynamicLegged2023, @tiboniDomainRandomizationEntropy2024, DR often requires extensive manual engineering. First, identifying which parameters to randomize--i.e., the *support* $\text{supp} (\Xi)$ of $\Xi$--is an inherently task-specific process. When locomoting over different terrains, choosing to randomize the friction coefficient is a reasonable choice, yet not a complete solution, as other factors (lighting conditions, external temperature, joints’ fatigue, etc.) may prove just as important in practice, making selecting these parameters yet another source of brittleness.
678
 
@@ -768,17 +706,12 @@ Reward classifiers are particularly useful in treating complex, dynamic tasks--e
768
 
769
  Lastly, in order to improve on the robustness of their approach to different goals while maintaining practical scalability, @luoSERLSoftwareSuite2025 introduced a modified state and action space, expressing proprioceptive configurations $q$ and actions $\dot q$ in the frame of the end-effector pose at $t=0$. Randomizing the initial pose of the end-effector ($s_0$), @luoSERLSoftwareSuite2025 achieved a similar result to that of manually randomizing the environment at every timestep, but with the benefit of maintaining the environment in the same condition across multiple training episodes, achieving higher scalability of their method thanks to the increased practicality of their approach.
770
 
771
- <figure>
772
- <ResponsiveImage
773
  src={ch3_hil_serl_examples}
774
  zoomable
775
  downloadable
776
- layout="fixed"
777
  alt="Figure"
778
- />
779
- <span id="hil-serl-blocks" style="position: absolute;"></span>
780
- <figcaption>(A) HIL-SERL allows for real-world training of high performance RL agents by building on top advancements presented by of SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on a real-world SO-100.</figcaption>
781
- </figure>
782
 
783
  Building on off-policy deep Q-learning with replay buffers, entropy regularization for better exploration, expert demonstrations to guide learning, and a series of tools and recommendations for real-world training using reward classifiers (Figure <a href="#hil-serl-blocks" data-reference-type="ref" data-reference="hil-serl-blocks">[hil-serl-blocks]</a>), @luoPreciseDexterousRobotic2024 introduce human interactions during training, learning near-optimal policies in challenging real-world manipulation tasks in 1-2 hours.
784
 
@@ -786,17 +719,12 @@ Human-in-the-Loop, Sample Efficient Robot reinforcement Learning (HIL-SERL) @lu
786
 
787
  #### Code Example- Real-world RL
788
 
789
- <figure>
790
- <ResponsiveImage
791
  src={ch3_hil_serl_architecture}
792
  zoomable
793
  downloadable
794
- layout="fixed"
795
  alt="Figure"
796
- />
797
- <span id="ch3-hil-serl-architecture" style="position: absolute;"></span>
798
- <figcaption>HIL-SERL is a SOTA RL algorithm for training control policies directly in the real-world. Its implementation in <code>lerobot</code> relies on a decoupled actor-learner architecture, communicating over processes (and possibly networks) with queues used to share (1) transitions <span class="math inline">(<em>s</em> <sub> <em>t</em> </sub>, <em>a</em> <sub> <em>t</em> </sub>, <em>r</em> <sub> <em>t</em> </sub>, <em>s</em> <sub> <em>t</em> + 1</sub>)</span> and (2) parameters <span class="math inline"> <em>θ</em> </span>.</figcaption>
799
- </figure>
800
 
801
  This example shows how to use the HIL-SERL implementation supported by `lerobot`. This code example is organized into four parts: we first show how to train a reward classifier from a custom set of demonstrations, then define the `Actor` and `Learner` components, and finally, we bring them together in a complete script showing how to use HIL-SERL in practice.
802
 
@@ -1066,33 +994,23 @@ Advances in learning to act from potentially large corpora of human demonstratio
1066
  TL;DR Behavioral Cloning provides a natural platform to learn from real-world interactions without the need to design any reward function, and generative models prove more effective than point-wise policies at dealing with multimodal demonstration datasets.
1067
 
1068
  </div>
1069
- <figure>
1070
- <ResponsiveImage
1071
  src={ch4_bc_trajectories}
1072
  zoomable
1073
  downloadable
1074
- layout="fixed"
1075
  alt="Figure"
1076
- />
1077
- <span id="ch4-bc-trajectories" style="position: absolute;"></span>
1078
- <figcaption>(A) Average (with standard deviation) evolution of the actuation levels over the first 5 recorded episodes in <a href="lerobot/svla_so101_pickplace" class="uri">lerobot/svla_so101_pickplace</a>. Proprioperceptive states provide invaluable to determine the robot’s state during an episode. (B) Camera frames are also recorded alongside measurements on the robot’s state, capturing information about the robot’s interaction with its environment.</figcaption>
1079
- </figure>
1080
 
1081
  Learning from human demonstrations provides a pragmatic alternative to the RL pipeline discussed in Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>. Indeed, especially in real-world robotics, online exploration is typically <mark>costly and potentially unsafe</mark>, and designing (dense) reward signals is a <mark>brittle and task-specific</mark> process. Further, even success detection itself often requires bespoke instrumentation, while episodic training demands reliable resets--all factors complicating training RL algorithms on hardware at scale. Behavioral Cloning (BC) sidesteps these constraints by <mark>casting control as an imitation learning problem</mark>, leveraging previously collected expert demonstrations to anchor the learned autonomous behavior. Most notably, by *learning-to-imitate*, autonomous systems naturally adhere to the objectives, preferences, and success criteria implicitly encoded in the data, which reduces early-stage exploratory failures and obviates hand-crafted reward shaping altogether.
1082
 
1083
  Formally, let $\mathcal D = \{ \tau^{(i)} \}_{i=1}^N$ be a set of expert trajectories, with $\tau^{(i)} = \{(o_t^{(i)}, a_t^{(i)})\}_{t=0}^{T_i}$ representing the $i$-th length-$T_i$ trajectory in $\mathcal D$, $o_t \in \mathcal O$ denoting observations (e.g., images and proprioception altogether), and $a_t \in \mathcal A$ the expert actions. Typically, observations $o \in \mathcal O$ consist of both image and proprioceptive information, while actions $a \in \mathcal A$ represent control specifications for the robot to execute, e.g. a joint configuration. Note that differently from Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>, in the imitation learning context $\mathcal D$ denotes an offline dataset collecting $N$ length-$T_i$ reward-free (expert) human trajectories $\tau^{(i)}$, and *not* the environment dynamics. Similarly, in this section $\tau^{(i)}$ represents a length-$T_i$ trajectory of observation-action pairs, which crucially *omits entirely any reward* information. Figure <a href="#ch4-bc-trajectories" data-reference-type="ref" data-reference="ch4-bc-trajectories">[ch4-bc-trajectories]</a> graphically shows trajectories in terms of the average evolution of the actuation on the 6 joints of a teleoperated SO-100 manipulator. Notice how proprioceptive states are captured jointly with camera frames over the course of the recorded episodes, providing a unified high-frame rate collection of both image and joint teleoperation data. Figure <a href="#ch4-observation-action-mapping" data-reference-type="ref" data-reference="ch4-observation-action-mapping">[ch4-observation-action-mapping]</a> shows $(o_t, a_t)$-pairs for the same dataset, with the actions performed by the human expert illustrated alongside the corresponding observation. 
In principle, (expert) trajectories $\tau^{(i)}$ can have different lengths since demonstrations might exhibit multi-modal strategies to attain the same goal, resulting in multiple, different behaviors.
1084
 
1085
- <figure>
1086
- <ResponsiveImage
1087
  src={ch4_observation_action_mapping}
1088
  zoomable
1089
  downloadable
1090
- layout="fixed"
1091
  alt="Figure"
1092
- />
1093
- <span id="ch4-observation-action-mapping" style="position: absolute;"></span>
1094
- <figcaption>Sample observations and action pairs over the course of a given trajectory recorded in <a href="lerobot/svla_so101_pickplace" class="uri">lerobot/svla_so101_pickplace</a>. Observations, comprising of both proprioperceptive and visual information, are recorded alongside the configuration of a second, leader robot controlled by a human expert, providing complete information for regressing actions given observations.</figcaption>
1095
- </figure>
1096
 
1097
  Behavioral Cloning (BC) @pomerleauALVINNAutonomousLand1988 aims at producing synthetic behaviors by learning the mapping from observations to actions, and in its most natural formulation can be effectively tackled as a *supervised* learning problem, consisting of learning the (deterministic) mapping $f: \mathcal O\mapsto \mathcal A, \ a_t = f(o_t)$ by solving
1098
  ``` math
@@ -1104,17 +1022,12 @@ Typically, the expert’s joint observation-action distribution $p: \mathcal O\t
1104
 
1105
  Despite the inherent challenges of learning from non-i.i.d. data, the BC formulation presents several operational advantages in robotics. First, training happens offline and naturally accommodates expert demonstration data, thereby severely limiting exploration risks by preventing the robot from performing dangerous actions altogether, by anchoring action in imitation. Second, reward design is entirely unnecessary in BC, as demonstrations already reflect human intent. The absence of rewards also prevents the risk of misalignment and specification gaming (*reward hacking*), otherwise inherent in purely reward-based RL @heessEmergenceLocomotionBehaviours2017. Third, because expert trajectories encode terminal conditions, success detection and resets are implicit in the dataset. Finally, empirical evidence suggests the performance of BC scales naturally with growing corpora of demonstrations collected across tasks, embodiments, and environments. Nonetheless, BC can, in principle, only reproduce behaviors that are at best as good as those of the demonstrator, and therefore offers no remedy for the suboptimal decisions that humans may enact. This limitation is particularly problematic in sequential decision-making tasks where expert demonstrations are scarce---either because data collection is costly or because human performance is inherently suboptimal. Yet, many robotics applications still benefit from relatively inexpensive pipelines for collecting high-quality human-generated trajectories, justifying the use of BC in such settings.
1106
 
1107
- <figure>
1108
- <ResponsiveImage
1109
  src={ch4_issues_with_bc}
1110
  zoomable
1111
  downloadable
1112
- layout="fixed"
1113
  alt="Figure"
1114
- />
1115
- <span id="ch4-issues-with-bc" style="position: absolute;"></span>
1116
- <figcaption>Point-wise policies suffer from limitations due to (A) covariate shifts and (B) poor approximation of multimodal demonstrations. (A) Small errors may drive the policy out of distribution, incuring in a vicious circle ultimately resulting in failure. (B) Both modes of reaching for a target object in the scene--either left or right-first--are equally as good and thus equally as likely to be present in a dataset of human demonstrations, ultimately resulting in multimodal demonstrations.</figcaption>
1117
- </figure>
1118
 
1119
  While conceptually elegant, *point-estimate policies* $f : \mathcal O\mapsto \mathcal A$ learned by solving eq. <a href="#loss-minimization-SL" data-reference-type="ref" data-reference="loss-minimization-SL">[loss-minimization-SL]</a> have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a> illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $\mathcal D$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $\epsilon$-prediction errors $0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon$ can quickly drive the policy into out-of-distribution states, incurring less confident generations and thus compounding errors (Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving real-world robotics problems, as multiple trajectories can be equally as good towards the accomplishment of a goal (e.g., symmetric grasps, Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the *generative model* $p(o, a)$ underlying the samples in $\mathcal D$, rather than explicitly learning a prediction function $f: a = f(o)$.
1120
 
@@ -1124,17 +1037,12 @@ Generative Models (GMs) aim to learn the stochastic process underlying the very
1124
 
1125
  #### Variational Auto-Encoders
1126
 
1127
- <figure>
1128
- <ResponsiveImage
1129
  src={ch4_task_effect_on_pairs}
1130
  zoomable
1131
  downloadable
1132
- layout="fixed"
1133
  alt="Figure"
1134
- />
1135
- <span id="ch4-task-effect-on-pairs" style="position: absolute;"></span>
1136
- <figcaption>Intuitively, latent variable in a single latent model may contain information regarding the task being performed, which directly results in the likelihood of the same observation-action pair being different for two different tasks. When (A) picking a block the likelihood of a wide gripper’s opening should be higher than narrower one, while it should be the opposite when (B) pushing the block.</figcaption>
1137
- </figure>
1138
 
1139
  A common inductive bias used in GM posits that samples $(o,a)$ are influenced by an unobservable latent variable $z \in Z$, resulting in:
1140
  ``` math
@@ -1142,17 +1050,12 @@ A common inductive bias used in GM posits samples $(o,a)$ are influenced from an
1142
  ```
1143
  Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics application, $z$ could be interpreted as some high level representation of the underlying task being performed by the human demonstrator. In such case, treating $p(o,a)$ as a marginalization over $\operatorname{supp}({Z})$ of the complete joint distribution $p(o,a,z)$ natively captures the effect different tasks have on the likelihood of observation-action pairs. Figure <a href="#ch4-task-effect-on-pairs" data-reference-type="ref" data-reference="ch4-task-effect-on-pairs">[ch4-task-effect-on-pairs]</a> graphically illustrates this concept in the case of a (A) picking and (B) pushing task, for which, nearing the target object, the likelihood of actions resulting in opening the gripper--the higher $q_6$, the wider the gripper’s opening--should intuitively be (A) high or (B) low, depending on the task performed. While the latent space $Z$ typically has a much richer structure than the set of all actual tasks performed, eq. <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> still provides a solid framework to learn joint distribution conditioned on unobservable yet relevant factors. Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a> represents this latent-variable framework in the context of a robotics application- the true, $z$-conditioned generative process assigns *likelihood* $p((o,a) \vert z)$ to the single $(o,a)$-pair. Using Bayes’ theorem, one can reconstruct the *posterior* distribution on $\operatorname{supp}({Z})$, $q_\theta(z \vert o,a)$ from the likelihood $p_\theta(o,a \vert z)$, *prior* $p_\theta(z)$ and *evidence* $p_\theta(o,a)$. VAEs approximate the latent variable model presented in eq. 
<a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> using an *approximate posterior* $q_\phi(z \vert o,a)$ while regressing parameters for a parametric likelihood, $p_\theta(o,a \vert z)$ (Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a>).
1144
 
1145
- <figure>
1146
- <ResponsiveImage
1147
  src={ch4_latent_variable_model}
1148
  zoomable
1149
  downloadable
1150
- layout="fixed"
1151
  alt="Figure"
1152
- />
1153
- <span id="ch4-latent-variable-model" style="position: absolute;"></span>
1154
- <figcaption>(A) The latent variable model in a robotics application regulates influence between observed (<span class="math inline"> <em>o</em>, <em>a</em>)</span> variables and an unobservable latent variable. (B) VAEs approximate exact latent variable models by means of variational inference.</figcaption>
1155
- </figure>
1156
 
1157
  Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can be written as:
1158
  <span id="evidence-definition-1" style="position: absolute;">
@@ -1241,17 +1144,12 @@ VAEs approximate probability distributions via a *single* latent variable model,
1241
  ```
1242
  where we explicitly showed the marginalization over the multiple latents in eq. <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>, and used the law of conditional probability and Markov property in eq. <a href="#BC-multi-latent-model-2" data-reference-type="ref" data-reference="BC-multi-latent-model-2">[BC-multi-latent-model-2]</a>. Also, for ease of notation, we will refer to observation-action pairs $o,a$ as $z_0$.
1243
 
1244
- <figure>
1245
- <ResponsiveImage
1246
  src={ch4_many_latents}
1247
  zoomable
1248
  downloadable
1249
- layout="fixed"
1250
  alt="Figure"
1251
- />
1252
- <span id="ch4-many-latents" style="position: absolute;"></span>
1253
- <figcaption>HMLV models posit the data generation process is influenced by a stack of Markov-dependent latent variables, with samples from the posterior distribution being progressively higher up in the hierarchy.</figcaption>
1254
- </figure>
1255
 
1256
  Similar to VAEs, it is generally not possible to assign an *exact* interpretation to the latent variables. Nevertheless, a reasonable application-driven intuition is that Hierarchical Markov Latent Variable (HMLV) models, by capturing hierarchical and decoupled interactions among latent variables, can reflect the different resolutions at which conditioning factors intervene. For example, in a robotics setting, one might naturally distinguish between high-level trajectory planning (higher up in the hierarchy, $t \to T$) and fine-grained motion adjustments (closer to empirical observations, $t \to 0$). In that, HMLV models thus provide a framework to perform variational inference via multiple, sequential sampling steps from different higher level distributions instead of approximating the generative process with a single-latent variable model. DMs are a particular instantiation of HMLV models for which the posterior is fixed to $q( z_t \vert z_{t-1}) = \mathcal N(z_{t-1} \sqrt{1-\beta_t}, \beta_t \mathbf{I})$, for a given $\beta_t \in \mathbb R^+$. In practice, $\beta_t$ is used to iteratively reduce the signal-to-noise ratio along the latents’ hierarchy, similarly to how a diffusion process influences the information of a physical system.
1257
 
@@ -1301,17 +1199,12 @@ In their seminal work on using DMs for variational inference, @hoDenoisingDiffu
1301
  ```
1302
  where the former term is equivalent to the reconstruction term in eq. <a href="#VAE-min-neg-ELBO" data-reference-type="ref" data-reference="VAE-min-neg-ELBO">[VAE-min-neg-ELBO]</a> and the latter term can be obtained in closed form.
1303
 
1304
- <figure>
1305
- <ResponsiveImage
1306
  src={ch4_diffusion_robot_actions}
1307
  zoomable
1308
  downloadable
1309
- layout="fixed"
1310
  alt="Figure"
1311
- />
1312
- <span id="diffusion-robot-actions" style="position: absolute;"></span>
1313
- <figcaption>DMs iteratively corrupt samples (left) from an unknown distribution into a quasi-standard Gaussian (center), learning the displacement field (right) that permits to reconstruct samples from the unknown target distribution by iteratively denoising samples of a tractable, easy-to-sample distribution.</figcaption>
1314
- </figure>
1315
 
1316
  Besides mathematical tractability of eq. <a href="#diffusion-likelihood-gradient" data-reference-type="ref" data-reference="diffusion-likelihood-gradient">[diffusion-likelihood-gradient]</a>, adopting Gaussian posteriors allows for a particularly intuitive interpretation of the training dynamics of DMs @permenterInterpretingImprovingDiffusion2024. As the hierarchical latent variables are repeatedly corrupted by applying increasingly more Gaussian noise, they progressively lose information about the original (unknown) sample $z_0$, converging toward a standard Gaussian which eventually contains no information at all (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>). Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a> illustrates this process on a simplified, bidimensional observation-action distribution, where we considered $o=q_2$ and $a=q^h_2$, with $q_2$ denoting the robot’s *elbow flex* actuation and $q^h_2$ the corresponding human teleoperator’s elbow flex. Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Notice how corrupted samples distribute differently from the most reasonable structure $a \simeq o$, further underscoring how diffusion corrupts both the individual samples and the global distribution (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, left and center). 
In this, using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples. Comparing the diffused samples to the original data points, one can derive an estimate of the total displacement induced by the diffusion process, and, under the assumption that the likelihood of the totally diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by *learning to reverse* such displacement. This key intuition allows one to write a simplified training objective[^4]:
1317
  <span id="diffusion-simplified-loss" style="position: absolute;">
@@ -1327,17 +1220,12 @@ Besides mathematical tractability of eq. <a href="#diffusion-likelihood-gradien
1327
  \end{align}
1328
  ```
1329
 
1330
- <figure>
1331
- <ResponsiveImage
1332
  src={ch4_action_vs_observation_distribution}
1333
  zoomable
1334
  downloadable
1335
- layout="fixed"
1336
  alt="Figure"
1337
- />
1338
- <span id="ch4-action-vs-observation-distribution" style="position: absolute;"></span>
1339
- <figcaption>A joint action-observation distribution, in the simplified case where the observation is the elbow-flex actuation in a SO-100, and the action is the recorded position for the same joint from the teleoperator arm. The motion recorded being teleoperated, the points distribute along a the diagonal.</figcaption>
1340
- </figure>
1341
 
1342
  In this simplified (minimization) objective, the optimization process differs from eq. <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
1343
 
@@ -1371,31 +1259,21 @@ FM proved very effective in a variety of applications, ranging from image @esse
1371
  ```
1372
  Conditional vector fields are defined not only over their argument $z$ and time $t$, but do also vary with respect to an auxiliary variable $z_0$, thereby extending the standard notion of a vector field to incorporate additional conditioning. Note that the traditional discrete-time noise-scheduler $\{\beta_t\}_{t=0}^T$ is now generalized to a continuous map $\beta : [0,1] \mapsto \mathbb R^+$. Crucially, @lipmanFlowMatchingGenerative2023 prove that by exclusively optimizing the vector field for individual data points $z_0 \in \mathcal D$, one also retrieves the optimal flow to morph the entire support of the initial distribution $p_0$ into $p_1 \ \text{s.t.} \mathcal D \sim p_1$.
1373
 
1374
- <figure>
1375
- <ResponsiveImage
1376
  src={ch4_normalizing_flows}
1377
  zoomable
1378
  downloadable
1379
- layout="fixed"
1380
  alt="Figure"
1381
- />
1382
- <span id="ch4-normalizing-flows" style="position: absolute;"></span>
1383
- <figcaption>Probability distributions can be modified differently by applying different vector fields, inducing different flows of mass across the same support (top versus bottom, using two different time-invariant 2D-fields <span class="math inline"> <em>u</em> <sub>1</sub>(<em>x</em>, <em>y</em>) = (<em>x</em>, 0)</span> and <span class="math inline">$u_2(x,y) = (x/\sqrt{2}, y/\sqrt{2})$</span>). Notice time flows <em>continuously</em> in <span class="math inline">[0, 1]</span>. FM models learn to approximate a target vector field, thereby producing arbitrary (goal) transformations of an easy-to-sample initial distribution.</figcaption>
1384
- </figure>
1385
 
1386
  While the noising schedule of DMs results in a stochastic process resembling a random (Brownian) walk, FM allows for more general--potentially, deterministic--likelihood and posterior parametrization. In the FM literature the likelihood and posterior probability densities defined along a HMLV model are typically referred to as a *probability path*, where the distributions for successive adjacent transitions in the HMLV model are related by the (normalized) flow between them (Figure <a href="#ch4-normalizing-flows" data-reference-type="ref" data-reference="ch4-normalizing-flows">[ch4-normalizing-flows]</a>). The inherent flexibility of FM is one of its key advantages over DMs, as it opens up the possibility of *learning* more efficient paths. For instance, one can design probability paths inspired by Optimal Transport (OT), a mathematical framework concerned with characterizing the most efficient morphings between probability distributions. Probability paths obtained through OT paths tend to be *straighter* than diffusion paths (Figure <a href="#ch4-diffusion-paths-versus-fm" data-reference-type="ref" data-reference="ch4-diffusion-paths-versus-fm">[ch4-diffusion-paths-versus-fm]</a>), which can lead to faster and more stable training, as well as empirically result in higher-quality generations with fewer denoising steps at inference time. In particular, by avoiding unnecessary backtracking associated with the inherent stochastic nature of both the noising and denoising process in DMs, test-time compute is typically significantly reduced in FM, while retaining comparable results @lipmanFlowMatchingGenerative2023.
1387
 
1388
- <figure>
1389
- <ResponsiveImage
1390
  src={ch4_diffusion_vs_flowmatching}
1391
  zoomable
1392
  downloadable
1393
- layout="fixed"
1394
  alt="Figure"
1395
- />
1396
- <span id="ch4-diffusion-paths-versus-fm" style="position: absolute;"></span>
1397
- <figcaption>Compared to diffusion, flow matching distorts distribution along a less randomic pattern, resulting in a clearer interpolation between source and target distribution. The visualization shows an example comparison between these two methods on joint distribution of robot observations and actions over <span class="math inline"> <em>T</em> = 50</span> steps.</figcaption>
1398
- </figure>
1399
 
1400
  In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in eq. <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce an arbitrary mass displacement, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, Conditional FM (CFM) defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, which in turn results in the target vector field $u(t, z_t) = z_1 - z_0$. FM models can then be trained with a simple regression objective defined as:
1401
  <span id="flow-matching-objective" style="position: absolute;">
@@ -1435,45 +1313,30 @@ In their work, @zhaoLearningFineGrainedBimanual2023 ablated using a GM to learn
1435
 
1436
  In ACT (Figure <a href="#ch4-act" data-reference-type="ref" data-reference="ch4-act">[ch4-act]</a>), inference for a given observation $o \in \mathcal O$ could be performed by (1) defining a prior $p_\omega(z \vert o)$ for the latent variable $z$ and (2) decoding an action chunk from a sampled latent $z \sim p_\omega(\bullet \vert o)$, similarly to how sampling from standard VAEs takes place, with the exception that vanilla VAEs typically pose $p(z\vert o) \equiv p(z) \sim \mathcal N(\mathbf{0}, \mathbf{I})$ and thus skip (1).
1437
 
1438
- <figure>
1439
- <ResponsiveImage
1440
  src={ch4_act_encoder}
1441
  zoomable
1442
  downloadable
1443
- layout="fixed"
1444
  alt="Figure"
1445
- />
1446
- <span id="ch4-act-encoder" style="position: absolute;"></span>
1447
- <figcaption>The CVAE encoder used in ACT. Input action chunks are first embedded and aggregated with positional embeddings, before being processed alongside embedded proprioperceptive information, and a learned <code>[CLS]</code> token used to aggregate input level information, and predict the style variable <span class="math inline"> <em>z</em> </span>. The encoder is exclusively used to <em>train</em> the decoder, and it is entirely disregarded at inference time.</figcaption>
1448
- </figure>
1449
 
1450
  However, the authors claim that using a deterministic procedure to sample $z$ benefits policy evaluation, and thus avoid using the conditional prior at all at inference time, effectively using the CVAE framework exclusively to train a more expressive decoder. At test time, @zhaoLearningFineGrainedBimanual2023 propose simply using $z = \mathbf{0}$, as the conditional prior on $z$ used in training is set to be a standard Gaussian. Further, conditioning on the observation $o$ is achieved through explicitly feeding proprioperceptive and visual observations to the decoder, $p_\theta(a \vert z, o)$ at test time. If at inference $z$ is sampled from a standard Gaussian, during training $z$ is sampled from an approximate posterior distribution $q_\phi(z \vert o, a)$, which, however, disregards image observations and exclusively uses proprioperceptive states to form $o$ for efficiency reasons.
1451
 
1452
- <figure>
1453
- <ResponsiveImage
1454
  src={ch4_act_decoder}
1455
  zoomable
1456
  downloadable
1457
- layout="fixed"
1458
  alt="Figure"
1459
- />
1460
- <span id="ch4-act-decoder" style="position: absolute;"></span>
1461
- <figcaption>The CVAE decoder used in ACT, comprising of a full encoder-decoder Transformer architecture. Camera observations from all <span class="math inline"> <em>n</em> </span> camera views are first embedded using pre-trained visual encoders, and then aggregated with the corresponding positional embeddings. Then, the proprioperceptive information and style variable <span class="math inline"> <em>z</em> </span> retrieved from the CVAE encoder, are fed to the encoder-decoder Transformer for inference. The encoder shares the matrices <span class="math inline"> <em>K</em>, <em>V</em> </span> with the decoder, and is trained to decode fixed position embeddings into action chunks.</figcaption>
1462
- </figure>
1463
 
1464
  #### Code Example: Training and Using ACT in Practice
1465
 
1466
- <figure>
1467
- <ResponsiveImage
1468
  src={ch4_act}
1469
  zoomable
1470
  downloadable
1471
- layout="fixed"
1472
  alt="Figure"
1473
- />
1474
- <span id="ch4-act" style="position: absolute;"></span>
1475
- <figcaption>Action Chunking with Transformer (ACT), as in @zhaoLearningFineGrainedBimanual2023. ACT introduces an action chunking paradigm to cope with high-dimensional multi-modal demonstration data, and a transformer-based CVAE architecture.</figcaption>
1476
- </figure>
1477
  <div class="pbox">
1478
 
1479
  Training ACT
@@ -1612,17 +1475,12 @@ In practice, conditioning on observation data is achieved conditioning the noise
1612
  ```
1613
  Note how in eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a> the noise regressor is conditioned on both the latent variable rank $t$ *and* on a stack of previous observations $o_{t-H_o:t}$. @chiDiffusionPolicyVisuomotor2024 claim the combination of (1) conditioning on a horizon of previous observations and (2) predicting multiple actions into the future allows DP to *commit to specific modes* in the data at inference time, which proves essential for good performance and avoiding indecisiveness.
1614
 
1615
- <figure>
1616
- <ResponsiveImage
1617
  src={ch4_diffusion_policy}
1618
  zoomable
1619
  downloadable
1620
- layout="fixed"
1621
  alt="Figure"
1622
- />
1623
- <span id="diffusion-policy-architecture" style="position: absolute;"></span>
1624
- <figcaption>The Diffusion Policy archicture, as in @chiDiffusionPolicyVisuomotor2024. A stack of <span class="math inline"> <em>H</em> <sub> <em>o</em> </sub> </span> previous observations is used as external conditioning to denoise a group of <span class="math inline"> <em>H</em> <sub> <em>a</em> </sub> </span> actions. Conditioning is performed at every layer of a U-Net block. Diffusion Policy allows to obtain fully-formed action chunks with as little as <span class="math inline"> <em>T</em> = 10</span> denoising steps.</figcaption>
1625
- </figure>
1626
 
1627
  Figure <a href="#diffusion-policy-architecture" data-reference-type="ref" data-reference="diffusion-policy-architecture">[diffusion-policy-architecture]</a> shows the convolution-based version of the architecture proposed by @chiDiffusionPolicyVisuomotor2024, illustrating inference on a single sample drawn from $\mathcal D$, for simplicity. The starting, arbitrarily noisy chunk of $H_a$ actions $\tilde a_{t:t+H_a}$ is first mapped to a (learned) high-dimensional space. Similarly, both image observations and poses are also embedded before being aggregated to the action embeddings. Then, a U-Net @ronnebergerUNetConvolutionalNetworks2015 is trained to regress the noise added into $\tilde a_{t:t+H_a}$, conditioned on observation information at every layer, thus seeking to optimize eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a>. At inference time, the noise predictor is used to predict the quantity of noise at every $t \in [T, \dots, 0 ]$ and iteratively subtract it from $\tilde a_{t:t+H_a}$, reversing the diffusion process simulated in training conditioned on $o_{t-H_o:t}$ to predict $a_{t:t+H_a}$.
1628
 
@@ -1759,19 +1617,12 @@ A robot may indeed execute an entire action chunk $\mathbf{A}_t$ *before* a new
1759
 
1760
  One can use the fact that policies output multiple actions at the same time to directly address (1) the lack of adaptiveness and (2) the presence of lags at runtime by decoupling action chunk *prediction* $\mathbf{A}$ from action *execution* $a_t \gets \text{PopFront}(\mathbf{A}_t)$. This decoupled stack, which we refer to as *asynchronous* (async) inference (<a href="#alg-async-inference" data-reference-type="ref" data-reference="alg-async-inference">[alg-async-inference]</a>), also enables optimized inference by allowing action-chunk inference to run on a separate machine, typically equipped with better computational resources than the ones onboard a robot. In async inference, a $\text{RobotClient}$ sends an observation $o_t$ to a $\text{PolicyServer}$, receiving an action chunk $\mathbf{A}_t$ once inference is complete (Figure <a href="#ch4-async-inference" data-reference-type="ref" data-reference="ch4-async-inference">[ch4-async-inference]</a>). In this, we avoid execution lags by triggering chunk prediction while the control loop is still consuming a previously available chunk, aggregating the previous and incoming chunks whenever the latter is available to the $\text{RobotClient}$. In turn, async-inference efficiently tightens the loop between action prediction and action execution, by increasing the frequency at which observations are processed for chunk prediction while not running inference at every timestep. Crucially, decoupling action prediction from action execution also allows to allocate more computational resources on a remote policy server sending actions to the robot client over the network.
1761
 
1762
- <figure>
1763
- <div class="minipage">
1764
- <ResponsiveImage
1765
  src={ch4_async_inference}
1766
  zoomable
1767
  downloadable
1768
- layout="fixed"
1769
  alt="Figure"
1770
- />
1771
- <span id="ch4-async-inference" style="position: absolute;"></span>
1772
- </div>
1773
- <figcaption><strong>Asynchronous inference</strong>. Illustration of the asynchronous inference stack. Note that the policy can be run on a remote server, possibly with GPUs.</figcaption>
1774
- </figure>
1775
  <div class="algorithm">
1776
 
1777
  <span id="alg-async-inference" style="position: absolute;"></span>
@@ -1796,19 +1647,12 @@ Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queu
1796
 
1797
  - **Sync-inference limit $(g=1)$.** As an extreme case, and in keeping with @zhaoLearningFineGrainedBimanual2023, an observation is sent at *every* timestep. The queue is therefore almost always filled, with only a minor saw-tooth due to $\Delta t/\mathbb E[\ell_s] < 1$. While maximally reactive, this setting incurs one forward pass per control tick and can prove prohibitively expensive on limited hardware. Importantly, because the client is consuming actions while the server computes the next chunk, the available queue never gets entirely filled.
1798
 
1799
- <figure>
1800
- <div class="minipage">
1801
- <ResponsiveImage
1802
  src={ch4_queues}
1803
  zoomable
1804
  downloadable
1805
- layout="fixed"
1806
  alt="Figure"
1807
- />
1808
- <span id="ch4-queues" style="position: absolute;"></span>
1809
- </div>
1810
- <figcaption>Action queue size evolution at runtime for various levels of <span class="math inline"> <em>g</em> </span> when (A) not filtering out observation based on joint-space similarity and (B) filtering out near-duplicates observation, measuring their similarity in joint-space.</figcaption>
1811
- </figure>
1812
 
1813
  Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> emphasizes the trade-off governed by $g$: small values of $g$ result in idle periods, whereas $g\approx 1$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $g\in(0,1)$ allows to strike a balance between reactivity and resource budgets. If not for the aforementioned similarity filter, the $\text{RobotClient}$ would send observations for processing every $(1 - g) H_a \cdot \Delta t$ seconds, receiving a new chunk of actions every $(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]$, on average. The presence of the filter for observation similarity dilates this processing time, and serves the scope of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. In particular, Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> results in a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue results empty.
1814
 
@@ -1947,33 +1791,23 @@ TL;DR Openly available, large-scale datasets and the development of stable-to-tr
1947
 
1948
  The advent of large models trained on internet-scale datasets has drastically influenced fields like Computer Vision (CV) and Natural Language Processing (NLP), shifting the previously task-specific paradigm towards combining (1) an initial, task-agnostic large-scale pre-training stage and (2) a task-specific adjustment phase. This *pre-train-and-adapt* paradigm has now largely replaced more classic approaches consisting of task-specific data collection, curation and model training in many subdomains within CV and NLP, and it is motivated by the main drawback of limited scalability for *task-specific approaches*, which have been traditionally more labor intensive. Factors including (1) the advancements in generalist models learned with self-supervision for perception @oquabDINOv2LearningRobust2024 or semantic understanding @devlinBERTPretrainingDeep2019 and (2) the popularization of collective efforts to aggregate large-scale openly available datasets @oneillOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 are increasingly pushing the field of robot learning towards the pre-train-and-adapt paradigm. This shift taps into the long-standing challenge of developing generalist robot policies, and holds the promise to surpass traditionally siloed approaches to robotics problems and develop a *foundation robotics model*. While Section <a href="#learning-imitation" data-reference-type="ref" data-reference="learning-imitation">[learning-imitation]</a> introduced methods for learning *single-task policies* such as ACT or Diffusion Policy, in this section we present advancements in developing *generalist, multi-task, policies*, capable of performing a wide range of tasks across different environments and embodiments, and guided by unstructured instructions typically given in plain, natural language.
1949
 
1950
- <figure>
1951
- <ResponsiveImage
1952
  src={ch5_ml_vs_robotics_foundation}
1953
  zoomable
1954
  downloadable
1955
- layout="fixed"
1956
  alt="Figure"
1957
- />
1958
- <span id="ch5-ml-vs-robotics-foundation" style="position: absolute;"></span>
1959
- <figcaption>Fields within ML such as Computer Vision and NLP converged on the development of foundation models, trained on a variety of large scale models and capable to perform multiple downstream tasks (top). Conversely, robotics suffered from limited standardization in terms of the architectures used, and siloed, task specific datasets, incurring in a high degree of fragmentation which traditionally hindered the development of generalist models for robotics in favour of task-specific models (bottom).</figcaption>
1960
- </figure>
1961
 
1962
  ### Preliminaries: Models and Data
1963
 
1964
  The remarkable success of foundation models in NLP and CV seems to be increasingly predicated on two core principles: architectural innovation and (joint) data-compute scaling. Indeed, the transformer architecture proved very effective in capturing long-range dependencies in a variety of data formats, and its stability and expressivity made it the *de facto* standard for modern large-scale models trained on internet-scale datasets. However, in stark contrast with large-scale NLP and CV datasets @raffelExploringLimitsTransfer2023, @ImageNet_VSS09, robotics has historically developed around small, task-specific datasets. In turn, this traditionally hindered scalability across problems as well as results, posing concrete challenges to developing general-purpose robot learning algorithms. Indeed, differently from the wealth of relatively readily-available task-agnostic text and images datasets on the internet, robotics data is *intrinsically embodied* and thus task-specific: datasets collected for *manipulation* differ significantly from *locomotion*. In particular, since each expert trajectory is tied to a specific robot platform and the operating conditions of its environment and task, data heterogeneity has long posed a *methodological* challenge for scaling robotics datasets via aggregation. Further, datasets consisting of expert demonstrations are (1) intrinsically more expensive to collect and (2) notoriously heterogeneous--different human experts may perform the same task in very different ways. Beyond this, heterogeneity also raises *conceptual* issues: naively mixing data across embodiments can induce negative transfer, as control strategies developed in isolation for different robot systems in different environments may even conflict when combined. 
Thus, the high degree of fragmentation of robotics datasets and tasks has traditionally led to the development of *specialist* policies, trained on small, task-specific datasets, developed to perform well at their designated task but that fail to generalize to new deployment scenarios (Figure <a href="#ch5-ml-vs-robotics-foundation" data-reference-type="ref" data-reference="ch5-ml-vs-robotics-foundation">[ch5-ml-vs-robotics-foundation]</a>).
1965
 
1966
- <figure>
1967
- <ResponsiveImage
1968
  src={ch5_generalist_policies_timeline}
1969
  zoomable
1970
  downloadable
1971
- layout="fixed"
1972
  alt="Figure"
1973
- />
1974
- <span id="ch5-generalist-policies-timeline" style="position: absolute;"></span>
1975
- <figcaption>Early efforts in the development of generalist models for robotics include BC-Zero @jangBCZZeroShotTask2022, RT-1 @brohanRT1RoboticsTransformer2023, and RT-2 @brohanRT2VisionLanguageActionModels2023: large scale models trained on thousands of demonstrations. The open release of the Open-X @oneillOpenXEmbodimentRobotic2025 and DROID datasets @khazatskyDROIDLargeScaleInTheWild2025 fostered the development of open source models: OpenVLA @kimOpenVLAOpenSourceVisionLanguageAction2024, <span class="math inline"> <em>π</em> <sub>0</sub> </span> @blackp0VisionLanguageActionFlow2024 and SmolVLA @shukorSmolVLAVisionLanguageActionModel2025.</figcaption>
1976
- </figure>
1977
 
1978
  Driven by the goal of developing generalist robot policies, the research community has increasingly explored how insights and techniques from other areas of ML can be integrated into robotics. Figure <a href="#ch5-generalist-policies-timeline" data-reference-type="ref" data-reference="ch5-generalist-policies-timeline">[ch5-generalist-policies-timeline]</a> shows a timeline of some of the most popular contributions attempting to develop generalist policies. Starting from BC-Zero, a latent variable model trained on 25k+ demonstrations, the field has now evolved into $\pi_0$, a transformer-based model trained on 10M+ demonstrations and exhibiting strong few-shot capabilities across tasks and embodiments. In between, Robotics Transformer 1 (RT-1) @brohanRT1RoboticsTransformer2023 represented a significant step in the direction of developing generalist robot policies over prior work including (1) BC-Zero @jangBCZZeroShotTask2022 and (2) Gato @reedGeneralistAgent2022, in that @brohanRT1RoboticsTransformer2023 use a much larger and diverse set of training tasks compared to both BC-Zero and Gato. In particular, RT-1 uses a transformer architecture, and is trained on as many as 130k human-recorded trajectories collected over 13 robots and over 17 months. RT-1 learns to process a history of camera images and a natural language instruction, and feeds the resulting sequence of high-dimensional tokens to a transformer, trained using a *classification loss on a discretized actions space* consisting of six different 256-bins, one for each joint of a 6-dof robotic arm.
1979
 
@@ -1983,17 +1817,12 @@ Traditionally, research efforts revolved around not only training models, but al
1983
 
1984
  Despite these advancements, the success of large, proprietary models like RT-1 and RT-2, highlighted a growing accessibility gap in robotics research, as training and deploying large-scale robotics foundation models requires computational resources simply unattainable for most research institutions. The OpenVLA project @kimOpenVLAOpenSourceVisionLanguageAction2024 emerged in direct contrast to traditionally closed-source efforts to develop VLAs. In particular, @kimOpenVLAOpenSourceVisionLanguageAction2024 trained OpenVLA by exclusively leveraging openly available data (970k+ trajectories from the Open-X dataset), and openly shared their training recipes alongside the model weights. Architecturally, OpenVLA integrates a pre-trained vision encoder to project visual tokens into the embedding space of the Llama2-7B @touvronLlama2Open2023 language-model backbone. The language model backbone is then used to predict *discrete action tokens* over 256 activation levels.
1985
 
1986
- <figure>
1987
- <ResponsiveImage
1988
  src={ch5_trends}
1989
  zoomable
1990
  downloadable
1991
- layout="fixed"
1992
  alt="Figure"
1993
- />
1994
- <span id="ch5-trends" style="position: absolute;"></span>
1995
- <figcaption>Robot learning is undergoing a paradigmatic shift: centralized data collections (A, left) are increasingly larger, often comprising millions of demonstrations, while (A, right) decentralized data collection efforts are becoming an alternative for large scale data collection. (B) Generalist models are also becoming increasingly smaller and easier to run on limited hardware.</figcaption>
1996
- </figure>
1997
 
1998
  Figure <a href="#ch5-trends" data-reference-type="ref" data-reference="ch5-trends">[ch5-trends]</a> shows the current trends in robot learning in terms of size and nature of the robotics datasets contributed, together with the size and accessibility of the available models. As datasets collected via centralized, cross-institutions cooperation of increasing size are made available for the research community, decentralized datasets collected by individual researchers and practitioners also gained traction, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are increasingly becoming much more compute-efficient, and as a result the models’ size has been consistently reducing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments.
1999
 
@@ -2013,17 +1842,12 @@ Recently, compute efficiency has also become a central focus in multi-modal rese
2013
 
2014
  $\pi_0$ @blackp0VisionLanguageActionFlow2024 introduce a VLA consisting of a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching. Images and language are embedded with PaliGemma, a VLM merging independently encoded visual and textual features deep in the network (*late-fusion*), while proprioceptive state and actions chunks are routed to a smaller *action expert*, initialized from scratch. The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and values matrices at each layer, maintaining specialization while efficiently allocating computation.
2015
 
2016
- <figure>
2017
- <ResponsiveImage
2018
  src={ch5_pi0}
2019
  zoomable
2020
  downloadable
2021
- layout="fixed"
2022
  alt="Figure"
2023
- />
2024
- <span id="ch5-pi0" style="position: absolute;"></span>
2025
- <figcaption>The <span class="math inline"> <em>π</em> <sub>0</sub> </span> architecture, as in @blackp0VisionLanguageActionFlow2024. Vision and language tokens are routed to a VLM backbone which is prevented from attending robot proprioperceptive states and action tokens, which are instead routed to a smaller subset of weights within the architecture referred to as "action expert". The architecture is trained with Flow Matching on 10M+ trajectories from a mixture of closed and openly available datasets.</figcaption>
2026
- </figure>
2027
 
2028
  Concretely, $\pi_0$ is a single, unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $f_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple cameras points $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process both the robot proprioperceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turn them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioperceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while *across* blocks, future blocks are masked out. 
Formally, this corresponds to using an attention mask like: $\mathbf{A} = \bordermatrix{ & \mathcal{T}_i & \mathcal{T}_q & \mathcal{T}_a \cr \mathcal{T}_i & \mathbf{1} & \mathbf{0} & \mathbf{0} \cr \mathcal{T}_q & \mathbf{1} & \mathbf{1} & \mathbf{0} \cr \mathcal{T}_a & \mathbf{1} & \mathbf{1} & \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block bidirectional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioperceptive tokens and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
2029
 
@@ -2058,11 +1882,10 @@ Flow matching  can be seen as a continuous time, deterministic generalization o
2058
 
2059
  <div class="wrapfigure">
2060
 
2061
- r0.4 <ResponsiveImage
2062
  src={ch5_pi0_sampling_timesteps}
2063
  zoomable
2064
  downloadable
2065
- layout="fixed"
2066
  alt="image"
2067
  />
2068
 
@@ -2141,17 +1964,12 @@ for epoch in range(num_epochs):
2141
 
2142
  With VLAs in the early stage of development compared to more mature LLMs and VLMs, much of the progress made on VLAs remains proprietary, with many releases exclusively sharing the weights while withholding the data used, full experimental details and essential methodological components of training. In contrast with this closed approach, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, which aims at democratizing the developments of robotics foundation models by open sourcing the model alongside the data used as well as the training recipes.
2143
 
2144
- <figure>
2145
- <ResponsiveImage
2146
  src={ch5_smolvla}
2147
  zoomable
2148
  downloadable
2149
- layout="fixed"
2150
  alt="Figure"
2151
- />
2152
- <span id="ch5-smolvla" style="position: absolute;"></span>
2153
- <figcaption>The SmolVLA architecture, as in @shukorSmolVLAVisionLanguageActionModel2025. SmolVLA is a compact MoE model trained with flow matching to denoise action chunks. Vision and language tokens are fed to a VLM backbone, and share information with the proprioperceptive and action tokens via the attention mechanism. The attention expert interleaves SA and CA layers for further conditioning on the visual features from the VLM backbone. SmolVLA skips computations and reduces the visual tokens, resulting in 7x less memory usage than <span class="math inline"> <em>π</em> <sub>0</sub> </span> (450M parameters vs. <span class="math inline"> <em>π</em> <sub>0</sub> </span>’s 3.3B).</figcaption>
2154
- </figure>
2155
 
2156
  While encouraging efforts like $\pi_0$ @blackp0VisionLanguageActionFlow2024 demonstrate the feasibility of open VLA systems, they remain (1) large and compute-intensive and (2) dependent on closed datasets collected via centralized efforts on costly robotic platforms, which ultimately hinders the accessibility of the method altogether. SmolVLA mitigates both these issues by (1) prioritizing a compact, compute-efficient VLA design and (2) targeting community-contributed datasets on accessible robotic platforms such as the SO-100 and SO-101 arms. Similarly to $\pi_0$, SmolVLA (Figure <a href="#ch5-smolvla" data-reference-type="ref" data-reference="ch5-smolvla">[ch5-smolvla]</a>) employs a MoE architecture combining a pretrained VLM backbone with a dedicated action expert, and trains with flow matching. To ensure efficiency and accessibility, SmolVLA adopts SmolVLM-2 @marafiotiSmolVLMRedefiningSmall2025 as its VLM backbone, considering SmolVLM-2’s reduced size and capability to process multiple image inputs alongside text items. SmolVLM-2 uses SigLIP @zhaiSigmoidLossLanguage2023 as vision encoder, producing visual features for a SmolLM2 language decoder @allalSmolLM2WhenSmol2025. Further, SmolVLA adopts a smaller action expert consisting of $\sim$100M parameters and an interleaved stack of self and cross-attention layers. To improve efficiency, the action expert adopts a reduced embedding dimension compared to the VLM backbone, resulting in $d_{v_\theta} = 0.75 d_{\text{VLM}}$. @shukorSmolVLAVisionLanguageActionModel2025’s design choices thus result in a much smaller size model compared to $\pi_0$, consisting of ca. 450M parameters versus $\pi_0$’s 3.3B parameters.
2157
 
 
19
  ---
20
 
21
  import MultiImage from '../components/MultiImage.astro';
22
+ import Image from '../components/Image.astro';
23
  import Quote from '../components/Quote.astro';
24
  import ch2_planar_manipulator_free from './assets/image/figures/ch2/ch2-planar-manipulator-free.png';
25
  import ch2_planar_manipulator_floor from './assets/image/figures/ch2/ch2-planar-manipulator-floor.png';
 
84
 
85
  ## Introduction
86
 
87
+ <Image
 
88
  src={ch1_lerobot_figure1}
89
  zoomable
90
  downloadable
 
91
  alt="Figure"
92
+ caption={'lerobot is the open-source library for end-to-end robotics developed by Hugging Face. The library is vertically integrated on the entire robotics stack, supporting low-level control of real-world robot devices, advanced data and inference optimizations, as well as SOTA robot learning methods with simple implementations in pure Pytorch.'}/>
 
 
 
93
 
94
  Autonomous robotics holds the premise of relieving humans from repetitive, tiring or dangerous manual tasks. Consequently, the field of robotics has been widely studied since its first inception in the 1950s. Lately, advancements in Machine Learning (ML) have sparked the development of a relatively new class of methods used to tackle robotics problems, leveraging large amounts of data and computation rather than human expertise and modeling skills to develop autonomous systems.
95
 
 
288
 
289
  ### Explicit and Implicit Models
290
 
291
+ <Image
 
292
  src={ch2_approaches}
293
  zoomable
294
  downloadable
 
295
  alt="Figure"
296
+ caption={'Overview of methods to generate motion (clearly non-exhaustive, see @bekrisStateRobotMotion2024). The different methods can be grouped based on whether they explicitly (dynamics-based) or implicitly (learning-based) model robot-environment interactions.'}/>
 
 
 
297
 
298
  Robotics is concerned with producing artificial motion in the physical world in a useful, reliable and safe fashion. Thus, robotics is an inherently multidisciplinary domain: producing autonomous motion in the physical world requires, at the very least, interfacing different software (motion planners) and hardware (motion executors) components. Further, knowledge of mechanical, electrical, and software engineering, as well as rigid-body mechanics and control theory have therefore proven quintessential in robotics since the field first developed in the 1950s. More recently, Machine Learning (ML) has also proved effective in robotics, complementing these more traditional disciplines @connellRobotLearning1993. As a direct consequence of its multidisciplinary nature, robotics has developed as a rather wide array of methods, all concerned with the main purpose of <mark>producing artificial motion in the physical world</mark>.
299
 
 
301
 
302
  ### Different Types of Motion
303
 
304
+ <Image
 
305
  src={ch2_platforms}
306
  zoomable
307
  downloadable
 
308
  alt="Figure"
309
+ caption={'Different kinds of motions are achieved with potentially very different robotic platforms. From left to right, top to bottom: ViperX, SO-100, Boston Dynamics’ Spot, Open-Duck, 1X’s NEO, Boston Dynamics’ Atlas. This is an example list of robotic platforms and is (very) far from being exhaustive.'}/>
 
 
 
310
 
311
  In the vast majority of instances, robotics deals with producing motion via actuating joints connecting nearly entirely-rigid links. A key distinction between focus areas in robotics is based on whether the generated motion modifies (1) the absolute state of the environment (via dexterity), (2) the relative state of the robot with respect to its environment (exercising mobility skills), or (3) a combination of the two (Figure <a href="#robotics-platforms-atlas" data-reference-type="ref" data-reference="robotics-platforms-atlas">[robotics-platforms-atlas]</a>).
312
 
 
320
 
321
  Recently, the development of low-cost manipulators like the ALOHA @zhaoLearningFineGrainedBimanual2023 ALOHA-2 @aldacoALOHA2Enhanced and SO-100/SO-101 @knightStandardOpenSO100 platforms significantly lowered the barrier to entry to robotics, considering the increased accessibility of these robots compared to more traditional platforms like the Franka Emika Panda arm (Figure <a href="#robotic-platforms-costs" data-reference-type="ref" data-reference="robotic-platforms-costs">[robotic-platforms-costs]</a>).
322
 
323
+ <Image
 
324
  src={ch2_cost_accessibility}
325
  zoomable
326
  downloadable
 
327
  alt="Figure"
328
+ caption={'Cheaper, more accessible robots are starting to rival traditional platforms like the Panda arm platforms in adoption in resource-constrained scenarios. The SO-100, in particular, has a cost in the 100s of Euros, and can be entirely 3D-printed in hours, while the industrially-manufactured Panda arm costs tens of thousands of Euros and is not openly available.'}/>
 
 
 
329
 
330
  Deriving an intuition as per why learning-based approaches are gaining popularity in the robotics community requires briefly analyzing traditional approaches for manipulation, leveraging tools like forward and inverse kinematics (FK, IK) and control theory. Providing a detailed overview of these methods falls (well) out of the scope of this tutorial, and we refer the reader to works including @sicilianoSpringerHandbookRobotics2016, @lynchModernRoboticsMechanics2017, @tedrakeRoboticManipulationPerception, @tedrakeUnderactuatedRoboticsAlgorithms for a much more comprehensive description of these techniques. Here, we mostly wish to highlight the benefits of ML over these traditional techniques
331
 
332
+ <Image
 
333
  src={ch2_so100_to_planar_manipulator}
334
  zoomable
335
  downloadable
 
336
  alt="Figure"
337
+ caption={'The SO-100 arm is a 6-dof manipulator arm. Preventing some of its joints (shoulder pan, wrist flex and wrist roll) from actuating, it can be represented as a traditional 2-dof planar manipulator (the gripper joint in the end-effector is not considered towards the count of the degrees of freedom used to produce motion).'}/>
 
 
 
338
 
339
  Consider the (simple) case where a SO-100 is restrained from actuating (1) the shoulder pan and (2) the wrist flex and roll motors. This effectively reduces the degrees of freedom of the SO-100 from the original 5+1 (5 joints + 1 gripper) to 2+1 (shoulder lift, elbow flex + gripper). As the end-effector does not impact motion in this model, the SO-100 is effectively reduced to the planar manipulator robot presented in Figure <a href="#make-so100-planar-manipulator" data-reference-type="ref" data-reference="make-so100-planar-manipulator">[make-so100-planar-manipulator]</a>, where spheres represent actuators, and solid lines indicate length-$l$ links from the base of the SO-100 to the end-effector (*ee*).
340
 
 
412
 
413
  <div class="wrapfigure">
414
 
415
+ r0.3 <Image
416
  src={ch2_planar_manipulator_floor_box}
417
  zoomable
418
  downloadable
 
419
  alt="image"
420
  />
421
 
 
436
 
437
  Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular applicative problem.
438
 
439
+ <Image
 
440
  src={ch2_classical_limitations}
441
  zoomable
442
  downloadable
 
443
  alt="Figure"
444
+ caption={'Dynamics-based approaches to robotics suffer from several limitations: (1) orchestrating multiple components poses integration challenges; (2) the need to develop custom processing pipelines for the sensing modalities and tasks considered hinders scalability; (3) simplified analytical models of physical phenomena (here friction at the gripper; credits to @antonovaReinforcementLearningPivoting2017) limit real-world performance. Lastly, (4) dynamics-based methods overlook trends in the availability and growth of robotics data.'}/>
 
 
 
445
 
446
  Dynamics-based robotics pipelines have historically been <mark>developed sequentially, engineering the different blocks</mark> now within most architectures for specific purposes. That is, sensing, state estimation, mapping, planning, (diff-)IK, and low-level control have been traditionally developed as distinct modules with fixed interfaces. Pipelining these specific modules proved error-prone, and brittleness emerges--alongside compounding errors--whenever changes occur (e.g., changes in lighting for sensing, occlusion/failure of sensors, control failures). Adapting such a stack to new tasks or robotic platforms often entails re-specifying objectives, constraints, and heuristics at multiple stages, incurring significant engineering overhead.
447
 
 
464
  TL;DR The need for expensive, high-fidelity simulators can be obviated by learning from real-world data, using sample-efficient algorithms that can safely train directly on hardware.
465
 
466
  </div>
467
+ <Image
 
468
  src={ch3_learning_benefits}
469
  zoomable
470
  downloadable
 
471
  alt="Figure"
472
+ caption={'Learning-based robotics streamlines perception-to-action by learning a (1) unified high-level controller capable to take (2) high-dimensional, unstructured sensorimotor information. Learning (3) does not require a dynamics model and instead focuses on interaction data, and (4) empirically correlates with the scale of the data used.'}/>
 
 
 
473
 
474
  Learning-based techniques for robotics naturally address the limitations presented in Section <a href="#classical" data-reference-type="ref" data-reference="classical">[classical]</a> (Figure <a href="#robot-learning-upsides" data-reference-type="ref" data-reference="robot-learning-upsides">[robot-learning-upsides]</a>). In particular, learning-based techniques typically rely on monolithic prediction-to-action pipelines (*visuomotor policies*) which directly map sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components. Mapping sensory inputs to actions also makes it possible to incorporate diverse input modalities, leveraging the automatic feature extraction capabilities of modern learning systems. Moreover, learning-based approaches can, in principle, bypass explicit modeling altogether and instead rely solely on interaction data--an advantage that proves transformative when dynamics are difficult to model or entirely unknown. Lastly, learning for robotics (*robot learning*) is naturally well posed to leverage the growing amount of robotics data openly available, just as computer vision and natural language processing did historically benefit from large-scale corpora of data, in great part overlooked by dynamics-based approaches.
475
 
 
477
 
478
  <div class="wrapfigure">
479
 
480
+ r0.3 <Image
481
  src={ch3_learning_atlas}
482
  zoomable
483
  downloadable
 
484
  alt="image"
485
  />
486
 
 
489
 
490
  In Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> we deliberately include generalist robot models @blackp0VisionLanguageActionFlow2024, @shukorSmolVLAVisionLanguageActionModel2025 alongside task-specific BC methods. While significantly different in spirit--*generalist* models are language-conditioned and use instructions to generate motion valid across many tasks, while *task-specific* models are typically not language-conditioned and used to perform a single task--*foundation* models are still largely trained to reproduce trajectories contained in a (large) training set of input demonstrations. Thus, we argue generalist policies can indeed be grouped alongside other task-specific BC methods, as they both leverage similar training data and schemas. Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> illustrates this categorization graphically, explicitly listing all the robot learning policies currently available in `lerobot`- Action Chunking with Transformers (ACT) @zhaoLearningFineGrainedBimanual2023, Diffusion Policy @chiDiffusionPolicyVisuomotor2024, Vector-Quantized Behavior Transformer (VQ-BeT) @leeBehaviorGenerationLatent2024, $\pi_0$ @blackp0VisionLanguageActionFlow2024, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025, Human-in-the-loop Sample-efficient RL (HIL-SERL) @luoPreciseDexterousRobotic2024 and TD-MPC @hansenTemporalDifferenceLearning2022.
491
 
492
+ <Image
 
493
  src={ch3_rl_examples}
494
  zoomable
495
  downloadable
 
496
  alt="Figure"
497
+ caption={'Examples of two different robotics tasks performed using RL. In the manipulation task (A) an agent learns to reach for a yellow plastic block in its environment, and to put it inside of a box. In the locomotion task (B) an agent learns to move its center of mass sideways without falling.'}/>
 
 
 
498
 
499
  Applications of RL to robotics have been studied long enough that the relationship between these two disciplines has been compared to that of physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their inherently interactive and sequential nature, robotics control problems can be directly cast as RL problems. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> presents two of such cases. Reaching for an object to then move it somewhere else in the scene is a sequential problem where over time the controller needs to adjust the position of the robot arm based on the current configuration and the (possibly varying) position of the object. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation- while sliding to the side, the controller needs to keep adjusting to the robot’s configuration to avoid failure (falling).
500
 
 
502
 
503
  The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to tackle robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) capable to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents improve through trial and error, bypassing explicit models of the problem dynamics in favor of interaction data. In RL, this feedback loop between actions and outcomes (Figure <a href="#rl-most-famous-pic" data-reference-type="ref" data-reference="rl-most-famous-pic">[rl-most-famous-pic]</a>) is established through the agent sensing a scalar quantity (*reward*) measuring how desirable a given *transition* is for the accomplishment of its goal.
504
 
505
+ <Image
 
506
  src={ch3_agent_env}
507
  zoomable
508
  downloadable
 
509
  alt="Figure"
510
+ caption={'Agent-Environment interaction diagram (image credits to @suttonReinforcementLearningIntroduction2018).'}/>
 
 
 
511
 
512
  Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP) @bellmanMarkovianDecisionProcess1957. Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through MDP’s inherently stochastic formulation and (2) providing a theoretically-sound framework for learning *without* an explicit model of the environment dynamics. While accommodating a continuous time formulation too, MDPs are typically considered in discrete time in RL, assuming interactions to atomically take place at discrete *timestep* $t=0,1,2,3, \dots, T$. MDPs allowing for an unbounded number of interactions ($T \to + \infty$) are termed *infinite-horizon*, and opposed to *finite-horizon* MDPs in which $T$ is finite. Unless diversely specified, we will only be referring to discrete-time finite-horizon (*episodic*) MDPs.
513
 
 
581
  ```
582
  inducing an ordering over states and state-action pairs under $\pi$, and value functions are thus central to most RL algorithms. A variety of algorithms have been developed in RL attempting to find (approximate) solutions to the problem of maximizing cumulative reward (we report some in Figure <a href="#rl-algos-atlas" data-reference-type="ref" data-reference="rl-algos-atlas">[rl-algos-atlas]</a>).
583
 
584
+ <Image
 
585
  src={ch3_rl_algorithms_atlas}
586
  zoomable
587
  downloadable
 
588
  alt="Figure"
589
+ caption={'Popular RL algorithms. See @SpinningUp2018 for a complete list of citations.'}/>
 
 
 
590
 
591
  Popular approaches to continuous state and action space--such as those studied within robotics--include ,  and . Across manipulation @akkayaSolvingRubiksCube2019 and locomotion problems @leeLearningQuadrupedalLocomotion2020, RL proved extremely effective in providing a platform to (1) leverage a unified, streamlined perception-to-action pipeline, (2) natively integrate propioperception with multi-modal high-dimensional sensory streams (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2025.
592
 
 
596
 
597
  First, especially early in training, <mark>actions are typically explorative, and thus may be erratic</mark>. On physical systems, untrained policies may command high velocities, self-colliding configurations, or torques exceeding joint limits, leading to wear and potential hardware damage. Mitigating these risks requires external safeguards (e.g., watchdogs, safety monitors, emergency stops), often incurring a high degree of human supervision. Further, in the typical episodic setting considered in most robotics problems, experimentation is substantially slowed down by the need to manually reset the environment over the course of training, a time-consuming and error-prone process. Second, learning efficiently remains problematic in RL, <mark>limiting the applicability of RL in real-world robotics due to consequently prohibitive timescales of training</mark>. Even strong algorithms such as SAC @haarnojaSoftActorCriticOffPolicy2018 typically require a large number of transitions $\{ (s_t, a_t, r_t, s_{t+1})\}_{t=1}^N$. On real-world hardware, generating this data is time-consuming.
598
 
599
+ <Image
 
600
  src={ch3_duck_sim_vs_real}
601
  zoomable
602
  downloadable
 
603
  alt="Figure"
604
+ caption={'Simulated (left) vs. real-world (right) OpenDuck. Discrepancies in the simulation dynamics (reality gap) pose risks to policy transfer.'}/>
 
 
 
605
 
606
  Training RL policies in simulation @tobinDomainRandomizationTransferring2017 addresses both issues, eliminating physical risk and dramatically increasing throughput. Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) limiting the possibilities to transfer the policies learned in simulation, due to the discrepancy between real and simulated environments (*reality gap*, Figure <a href="#synthetic-vs-real-duck" data-reference-type="ref" data-reference="synthetic-vs-real-duck">[synthetic-vs-real-duck]</a>). *Domain randomization* @tobinDomainRandomizationTransferring2017 (DR) is a popular technique to overcome the reality gap, and consists in randomizing the parameters of the simulated environment during training, aiming at inducing robustness to specific disturbances. In this, DR is typically employed to increase the diversity of scenarios over the course of training, improving on the performance of sim-to-real transferred policies @akkayaSolvingRubiksCube2019, @antonovaReinforcementLearningPivoting2017, @jiDribbleBotDynamicLegged2023. In practice, DR is performed training in simulation on simulated dynamics $\mathcal D$, further parametrized as $\mathcal D \equiv \mathcal D_\xi$, with a *dynamics* (random) vector $\xi$ drawn from an arbitrary distribution, $\xi \sim \Xi$. For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure <a href="#ducks-on-terrains" data-reference-type="ref" data-reference="ducks-on-terrains">[ducks-on-terrains]</a>), or the center of mass of an object for a manipulation task. Over the course of training--typically at each episode’s reset--a new $\xi$ is drawn, and used to specify the environment’s dynamics for that episode.
607
 
608
+ <Image
 
609
  src={ch3_many_ducks}
610
  zoomable
611
  downloadable
 
612
  alt="Figure"
613
+ caption={'The same locomotion task can be carried out in different (simulated) domains (exemplified by the difference in terrains) at training time, resulting in increased robustness over diverse environment dynamics.'}/>
 
 
 
614
 
615
  While effective in transferring policies across the reality gap in real-world robotics @tobinDomainRandomizationTransferring2017, @akkayaSolvingRubiksCube2019, @jiDribbleBotDynamicLegged2023, @tiboniDomainRandomizationEntropy2024, DR often requires extensive manual engineering. First, identifying which parameters to randomize--i.e., the *support* $\text{supp} (\Xi)$ of $\Xi$--is an inherently task specific process. When locomoting over different terrains, choosing to randomize the friction coefficient is a reasonable choice, yet not completely resolutive as other factors (lighting conditions, external temperature, joints’ fatigue, etc.) may prove just as important in practice, making selecting these parameters yet another source of brittleness.
616
 
 
706
 
707
  Lastly, in order to improve on the robustness of their approach to different goals while maintaining practical scalability, @luoSERLSoftwareSuite2025 introduced a modified state and action space, expressing proprioperceptive configurations $q$ and actions $\dot q$ in the frame of the end-effector pose at $t=0$. Randomizing the initial pose of the end-effector ($s_0$), @luoSERLSoftwareSuite2025 achieved a similar result to that of manually randomizing the environment at every timestep, but with the benefit of maintaining the environment in the same condition across multiple training episodes, achieving higher scalability of their method thanks to the increased practicality of their approach.
708
 
709
+ <Image
 
710
  src={ch3_hil_serl_examples}
711
  zoomable
712
  downloadable
 
713
  alt="Figure"
714
+ caption={'(A) HIL-SERL allows for real-world training of high performance RL agents by building on top of advancements presented by SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on a real-world SO-100.'}/>
 
 
 
715
 
716
  Building on off-policy deep Q-learning with replay buffers, entropy regularization for better exploration, expert demonstrations to guide learning, and a series of tools and recommendations for real-world training using reward classifiers (Figure <a href="#hil-serl-blocks" data-reference-type="ref" data-reference="hil-serl-blocks">[hil-serl-blocks]</a>), @luoPreciseDexterousRobotic2024 introduce human interactions during training, learning near-optimal policies in challenging real-world manipulation tasks in 1-2 hours.
717
 
 
719
 
720
  #### Code Example- Real-world RL
721
 
722
+ <Image
 
723
  src={ch3_hil_serl_architecture}
724
  zoomable
725
  downloadable
 
726
  alt="Figure"
727
+ caption={'HIL-SERL is a SOTA RL algorithm for training control policies directly in the real world. Its implementation in lerobot relies on a decoupled actor-learner architecture, communicating over processes (and possibly networks) with queues used to share (1) transitions (s_t, a_t, r_t, s_{t+1}) and (2) parameters θ.'}/>
 
 
 
728
 
729
  This example shows how to use the HIL-SERL implementation supported by `lerobot`. This code example is organized into four parts: we first show how to train a reward classifier from a custom set of demonstrations, then define the `Actor` and `Learner` components, and finally, we bring them together in a complete script showing how to use HIL-SERL in practice.
730
 
 
994
  TL;DR Behavioral Cloning provides a natural platform to learn from real-world interactions without the need to design any reward function, and generative models prove more effective than point-wise policies at dealing with multimodal demonstration datasets.
995
 
996
  </div>
997
+ <Image
 
998
  src={ch4_bc_trajectories}
999
  zoomable
1000
  downloadable
 
1001
  alt="Figure"
1002
+ caption={'(A) Average (with standard deviation) evolution of the actuation levels over the first 5 recorded episodes in lerobot/svla_so101_pickplace. Proprioperceptive states prove invaluable to determine the robot’s state during an episode. (B) Camera frames are also recorded alongside measurements on the robot’s state, capturing information about the robot’s interaction with its environment.'}/>
 
 
 
1003
 
1004
  Learning from human demonstrations provides a pragmatic alternative to the RL pipeline discussed in Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>. Indeed, especially in real-world robotics, online exploration is typically <mark>costly and potentially unsafe</mark>, and designing (dense) reward signals is a <mark>brittle and task-specific</mark> process. Further, even success detection itself often requires bespoke instrumentation, while episodic training demands reliable resets--all factors complicating training RL algorithms on hardware at scale. Behavioral Cloning (BC) sidesteps these constraints by <mark>casting control as an imitation learning problem</mark>, leveraging previously collected expert demonstrations to anchor the learned autonomous behavior. Most notably, by *learning-to-imitate*, autonomous systems naturally adhere to the objectives, preferences, and success criteria implicitly encoded in the data, which reduces early-stage exploratory failures and obviates hand-crafted reward shaping altogether.
1005
 
1006
  Formally, let $\mathcal D = \{ \tau^{(i)} \}_{i=1}^N$ be a set of expert trajectories, with $\tau^{(i)} = \{(o_t^{(i)}, a_t^{(i)})\}_{t=0}^{T_i}$ representing the $i$-th length-$T_i$ trajectory in $\mathcal D$, $o_t \in \mathcal O$ denoting observations (e.g., images and proprioception altogether), and $a_t \in \mathcal A$ the expert actions. Typically, observations $o \in \mathcal O$ consist of both image and proprioperceptive information, while actions $a \in \mathcal A$ represent control specifications for the robot to execute, e.g. a joint configuration. Note that differently from Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>, in the imitation learning context $\mathcal D$ denotes an offline dataset collecting $N$ length-$T_i$ reward-free (expert) human trajectories $\tau^{(i)}$, and *not* the environment dynamics. Similarily, in this section $\tau^{(i)}$ represent a length-$T_i$ trajectory of observation-action pairs, which crucially *omits entirely any reward* information. Figure <a href="#ch4-bc-trajectories" data-reference-type="ref" data-reference="ch4-bc-trajectories">[ch4-bc-trajectories]</a> graphically shows trajectories in terms of the average evolution of the actuation on the 6 joints of a teleoperated SO-100 manipulator. Notice how proprioperceptive states are captured jointly with camera frames over the course of the recorded episodes, providing a unified high-frame rate collection of both image and joint teleoperation data. Figure <a href="#ch4-observation-action-mapping" data-reference-type="ref" data-reference="ch4-observation-action-mapping">[ch4-observation-action-mapping]</a> shows $(o_t, a_t)$-pairs for the same dataset, with the actions performed by the human expert illustrated alongside the corresponding observation. 
In principle, (expert) trajectories $\tau^{(i)}$ can have different lengths since demonstrations might exhibit multi-modal strategies to attain the same goal, resulting in multiple, different behaviors.
1007
 
1008
+ <Image
 
1009
  src={ch4_observation_action_mapping}
1010
  zoomable
1011
  downloadable
 
1012
  alt="Figure"
1013
+ caption={'Sample observations and action pairs over the course of a given trajectory recorded in lerobot/svla_so101_pickplace. Observations, comprising of both proprioperceptive and visual information, are recorded alongside the configuration of a second, leader robot controlled by a human expert, providing complete information for regressing actions given observations.'}/>
 
 
 
1014
 
1015
  Behavioral Cloning (BC) @pomerleauALVINNAutonomousLand1988 aims at producing synthetic behaviors by learning the mapping from observations to actions, and in its most natural formulation can be effectively tackled as a *supervised* learning problem, consisting of learning the (deterministic) mapping $f: \mathcal O\mapsto \mathcal A, \ a_t = f(o_t)$ by solving
1016
  ``` math
 
1022
 
1023
  Despite the inherent challenges of learning from non-i.i.d. data, the BC formulation presents several operational advantages in robotics. First, training happens offline and naturally accommodates for expert, demonstration data, thereby severely limiting exploration risks by preventing the robot from performing dangerous actions altogether, by anchoring action in imitation. Second, reward design is entirely unnecessary in BC, as demonstrations already reflect human intent. The absence of rewards also prevents the risk of misalignment and specification gaming (*reward hacking*), otherwise inherent in purely reward-based RL @heessEmergenceLocomotionBehaviours2017. Third, because expert trajectories encode terminal conditions, success detection and resets are implicit in the dataset. Finally, empirical evidence suggests the performance of BC scales naturally with growing corpora of demonstrations collected across tasks, embodiments, and environments. Nonetheless, BC can, in principle, only reproduce behaviors that are at best as good as those of the demonstrator, and therefore offers no remedy for the suboptimal decisions that humans may enact. This limitation is particularly problematic in sequential decision-making tasks where expert demonstrations are scarce---either because data collection is costly or because human performance is inherently suboptimal. Yet, many robotics applications still benefit from relatively inexpensive pipelines for collecting high-quality human-generated trajectories, justifying the use of BC in such settings.
1024
 
1025
+ <Image
 
1026
  src={ch4_issues_with_bc}
1027
  zoomable
1028
  downloadable
 
1029
  alt="Figure"
1030
+ caption={'Point-wise policies suffer from limitations due to (A) covariate shifts and (B) poor approximation of multimodal demonstrations. (A) Small errors may drive the policy out of distribution, incurring in a vicious circle ultimately resulting in failure. (B) Both modes of reaching for a target object in the scene--either left or right-first--are equally as good and thus equally as likely to be present in a dataset of human demonstrations, ultimately resulting in multimodal demonstrations.'}/>
 
 
 
1031
 
1032
  While conceptually elegant, *point-estimate policies* $f : \mathcal O\mapsto \mathcal A$ learned by solving eq. <a href="#loss-minimization-SL" data-reference-type="ref" data-reference="loss-minimization-SL">[loss-minimization-SL]</a> have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a> illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $\mathcal D$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $\epsilon$-prediction errors $0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon$ can quickly drive the policy into out-of-distribution states, incurring in less confident generations and thus compounding errors (Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving real-world robotics problems, as multiple trajectories can be equally as good towards the accomplishment of a goal (e.g., symmetric grasps, Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the *generative model* $p(o, a)$ underlying the samples in $\mathcal D$, rather than explicitly learning a prediction function $f: a = f(o)$.
1033
 
 
1037
 
1038
  #### Variational Auto-Encoders
1039
 
1040
+ <Image
 
1041
  src={ch4_task_effect_on_pairs}
1042
  zoomable
1043
  downloadable
 
1044
  alt="Figure"
1045
+ caption={'Intuitively, latent variable in a single latent model may contain information regarding the task being performed, which directly results in the likelihood of the same observation-action pair being different for two different tasks. When (A) picking a block the likelihood of a wide gripper’s opening should be higher than narrower one, while it should be the opposite when (B) pushing the block.'}/>
 
 
 
1046
 
1047
  A common inductive bias used in GM posits samples $(o,a)$ are influenced from an unobservable latent variable $z \in Z$, resulting in:
1048
  ``` math
 
1050
  ```
1051
  Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics application, $z$ could be interpreted as some high level representation of the underlying task being performed by the human demonstrator. In such case, treating $p(o,a)$ as a marginalization over $\operatorname{supp}({Z})$ of the complete joint distribution $p(o,a,z)$ natively captures the effect different tasks have on the likelihood of observation-action pairs. Figure <a href="#ch4-task-effect-on-pairs" data-reference-type="ref" data-reference="ch4-task-effect-on-pairs">[ch4-task-effect-on-pairs]</a> graphically illustrates this concept in the case of a (A) picking and (B) pushing task, for which, nearing the target object, the likelihood of actions resulting in opening the gripper--the higher $q_6$, the wider the gripper’s opening--should intuitively be (A) high or (B) low, depending on the task performed. While the latent space $Z$ typically has a much richer structure than the set of all actual tasks performed, eq. <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> still provides a solid framework to learn joint distributions conditioned on unobservable yet relevant factors. Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a> represents this latent-variable framework in the context of a robotics application: the true, $z$-conditioned generative process assigns *likelihood* $p((o,a) \vert z)$ to the single $(o,a)$-pair. Using Bayes’ theorem, one can reconstruct the *posterior* distribution on $\operatorname{supp}({Z})$, $q_\theta(z \vert o,a)$ from the likelihood $p_\theta(o,a \vert z)$, *prior* $p_\theta(z)$ and *evidence* $p_\theta(o,a)$. VAEs approximate the latent variable model presented in eq. 
<a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> using an *approximate posterior* $q_\phi(z \vert o,a)$ while regressing parameters for a parametric likelihood, $p_\theta(o,a \vert z)$ (Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a>).
1052
 
1053
+ <Image
 
1054
  src={ch4_latent_variable_model}
1055
  zoomable
1056
  downloadable
 
1057
  alt="Figure"
1058
+ caption={'(A) The latent variable model in a robotics application regulates influence between observed ( o, a) variables and an unobservable latent variable. (B) VAEs approximate exact latent variable models by means of variational inference.'}/>
 
 
 
1059
 
1060
  Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can be written as:
1061
  <span id="evidence-definition-1" style="position: absolute;">
 
1144
  ```
1145
  where we explicitly showed the marginalization over the multiple latents in eq. <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>, and used the law of conditional probability and Markov property in eq. <a href="#BC-multi-latent-model-2" data-reference-type="ref" data-reference="BC-multi-latent-model-2">[BC-multi-latent-model-2]</a>. Also, for ease of notation, we will refer to observation-action pairs $o,a$ as $z_0$.
1146
 
1147
+ <Image
 
1148
  src={ch4_many_latents}
1149
  zoomable
1150
  downloadable
 
1151
  alt="Figure"
1152
+ caption={'HMLV models posit the data generation process is influenced by a stack of Markov-dependent latent variables, with samples from the posterior distribution being progressively higher up in the hierarchy.'}/>
 
 
 
1153
 
1154
  Similar to VAEs, it is generally not possible to assign an *exact* interpretation to the latent variables. Nevertheless, a reasonable application-driven intuition is that Hierarchical Markov Latent Variable (HMLV) models, by capturing hierarchical and decoupled interactions among latent variables, can reflect the different resolutions at which conditioning factors intervene. For example, in a robotics setting, one might naturally distinguish between high-level trajectory planning (higher up in the hierarchy, $t \to T$) and fine-grained motion adjustments (closer to empirical observations, $t \to 0$). In that, HMLV models thus provide a framework to perform variational inference via multiple, sequential sampling steps from different higher level distributions instead of approximating the generative process with a single-latent variable model. DMs are a particular instantiation of HMLV models for which the posterior is fixed to $q( z_t \vert z_{t-1}) = \mathcal N(z_{t-1} \sqrt{1-\beta_t}, \beta_t \mathbf{I})$, for a given $\beta_t \in \mathbb R^+$. In practice, $\beta_t$ is used to iteratively reduce the signal-to-noise ratio along the latents’ hierarchy, similarly to how a diffusion process influences the information of a physical system.
1155
 
 
1199
  ```
1200
  where the former term is equivalent to the reconstruction term in eq. <a href="#VAE-min-neg-ELBO" data-reference-type="ref" data-reference="VAE-min-neg-ELBO">[VAE-min-neg-ELBO]</a> and the latter term can be obtained in closed form.
1201
 
1202
+ <Image
 
1203
  src={ch4_diffusion_robot_actions}
1204
  zoomable
1205
  downloadable
 
1206
  alt="Figure"
1207
+ caption={'DMs iteratively corrupt samples (left) from an unknown distribution into a quasi-standard Gaussian (center), learning the displacement field (right) that permits to reconstruct samples from the unknown target distribution by iteratively denoising samples of a tractable, easy-to-sample distribution.'}/>
 
 
 
1208
 
1209
  Besides mathematical tractability of eq. <a href="#diffusion-likelihood-gradient" data-reference-type="ref" data-reference="diffusion-likelihood-gradient">[diffusion-likelihood-gradient]</a>, adopting Gaussian posteriors allows for a particularly intuitive interpretation of the training dynamics of DMs @permenterInterpretingImprovingDiffusion2024. As the hierarchical latent variables are repeatedly corrupted by applying increasingly more Gaussian noise, they progressively lose information about the original (unknown) sample $z_0$, converging toward a standard Gaussian which eventually contains no information at all (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>). Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a> illustrates this process on a simplified, bidimensional observation-action distribution, where we considered $o=q_2$ and $a=q^h_2$, with $q_2$ denoting the robot’s *elbow flex* actuation and $q^h_2$ the corresponding human teleoperator’s elbow flex. Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Notice how corrupted samples distribute differently from the most reasonable structure $a \simeq o$, further underscoring how diffusion corrupts both the individual samples and the global distribution (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, left and center). 
In this, using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples. Comparing the diffused samples to the original data points, one can derive an estimate of the total displacement induced by the diffusion process, and, under the assumption that the likelihood of the totally diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by *learning to reverse* such displacement. This key intuition allows one to write a simplified training objective[^4]:
1210
  <span id="diffusion-simplified-loss" style="position: absolute;">
 
1220
  \end{align}
1221
  ```
1222
 
1223
+ <Image
 
1224
  src={ch4_action_vs_observation_distribution}
1225
  zoomable
1226
  downloadable
 
1227
  alt="Figure"
1228
+ caption={'A joint action-observation distribution, in the simplified case where the observation is the elbow-flex actuation in a SO-100, and the action is the recorded position for the same joint from the teleoperator arm. The motion recorded being teleoperated, the points distribute along the diagonal.'}/>
 
 
 
1229
 
1230
  In this simplified (minimization) objective, the optimization process differs from eq. <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
1231
 
 
1259
  ```
1260
  Conditional vector fields are defined not only over their argument $z$ and time $t$, but do also vary with respect to an auxiliary variable $z_0$, thereby extending the standard notion of a vector field to incorporate additional conditioning. Note that the traditional discrete-time noise-scheduler $\{\beta_t\}_{t=0}^T$ is now generalized to a continuous map $\beta : [0,1] \mapsto \mathbb R^+$. Crucially, @lipmanFlowMatchingGenerative2023 prove that by exclusively optimizing the vector field for individual data points $z_0 \in \mathcal D$, one also retrieves the optimal flow to morph the entire support of the initial distribution $p_0$ into $p_1 \ \text{s.t.} \mathcal D \sim p_1$.
1261
 
1262
+ <Image
 
1263
  src={ch4_normalizing_flows}
1264
  zoomable
1265
  downloadable
 
1266
  alt="Figure"
1267
+ caption={'Probability distributions can be modified differently by applying different vector fields, inducing different flows of mass across the same support (top versus bottom, using two different time-invariant 2D-fields $u_1(x,y) = (x, 0)$ and $u_2(x,y) = (x/\sqrt{2}, y/\sqrt{2})$). Notice time flows continuously in [0, 1]. FM models learn to approximate a target vector field, thereby producing arbitrary (goal) transformations of an easy-to-sample initial distribution.'}/>
 
 
 
1268
 
1269
  While the noising schedule of DMs results in a stochastic process resembling a random (Brownian) walk, FM allows for more general--potentially, deterministic--likelihood and posterior parametrization. In the FM literature the likelihood and posterior probability densities defined along a HMLV model are typically referred to as a *probability path*, where the distributions for successive adjacent transitions in the HMLV model are related by the (normalized) flow between them (Figure <a href="#ch4-normalizing-flows" data-reference-type="ref" data-reference="ch4-normalizing-flows">[ch4-normalizing-flows]</a>). The inherent flexibility of FM is one of their key advantages over DMs, as it opens up the possibility of *learning* more efficient paths. For instance, one can design probability paths inspired by Optimal Transport (OT), a mathematical framework concerned with characterizing the most efficient morphings between probability distributions. Probability paths obtained through OT paths tend to be *straighter* than diffusion paths (Figure <a href="#ch4-diffusion-paths-versus-fm" data-reference-type="ref" data-reference="ch4-diffusion-paths-versus-fm">[ch4-diffusion-paths-versus-fm]</a>), which can lead to faster and more stable training, as well as empirically result in higher-quality generations with fewer denoising steps at inference time. In particular, by avoiding unnecessary backtracking associated with the inherent stochastic nature of both the noising and denoising process in DMs, test-time compute is typically significantly reduced in FM, while retaining comparable results @lipmanFlowMatchingGenerative2023.
1270
 
1271
+ <Image
 
1272
  src={ch4_diffusion_vs_flowmatching}
1273
  zoomable
1274
  downloadable
 
1275
  alt="Figure"
1276
+ caption={'Compared to diffusion, flow matching distorts the distribution along a less random pattern, resulting in a clearer interpolation between source and target distribution. The visualization shows an example comparison between these two methods on the joint distribution of robot observations and actions over T = 50 steps.'}/>
 
 
 
1277
 
1278
  In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in eq. <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce an arbitrary mass displacement, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, Conditional FM (CFM) defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, which in turn results in the target vector field $u(t, z_t) = z_1 - z_0$. FM models can then be trained with a simple regression objective defined as:
1279
  <span id="flow-matching-objective" style="position: absolute;">
 
1313
 
1314
  In ACT (Figure <a href="#ch4-act" data-reference-type="ref" data-reference="ch4-act">[ch4-act]</a>), inference for a given observation $o \in \mathcal O$ could be performed by (1) defining a prior $p_\omega(z \vert o)$ for the latent variable $z$ and (2) decoding an action chunk from a sampled latent $z \sim p_\omega(\bullet \vert o)$, similarly to how sampling from standard VAEs takes place, with the exception that vanilla VAEs typically pose $p(z\vert o) \equiv p(z) \sim \mathcal N(\mathbf{0}, \mathbf{I})$ and thus skip (1).
1315
 
1316
+ <Image
 
1317
  src={ch4_act_encoder}
1318
  zoomable
1319
  downloadable
 
1320
  alt="Figure"
1321
+ caption={'The CVAE encoder used in ACT. Input action chunks are first embedded and aggregated with positional embeddings, before being processed alongside embedded proprioperceptive information, and a learned [CLS] token used to aggregate input level information and predict the style variable z. The encoder is exclusively used to train the decoder, and it is entirely disregarded at inference time.'}/>
 
 
 
1322
 
1323
  However, the authors claim that using a deterministic procedure to sample $z$ benefits policy evaluation, and thus avoid using the conditional prior at all at inference time, effectively using the CVAE framework exclusively to train a more expressive decoder. At test time, @zhaoLearningFineGrainedBimanual2023 propose simply using $z = \mathbf{0}$, as the conditional prior on $z$ used in training is set to be a standard Gaussian. Further, conditioning on the observation $o$ is achieved through explicitly feeding proprioperceptive and visual observations to the decoder, $p_\theta(a \vert z, o)$ at test time. If at inference $z$ is sampled from a standard Gaussian, during training $z$ is sampled from an approximate posterior distribution $q_\phi(z \vert o, a)$, which, however, disregards image observations and exclusively uses proprioperceptive states to form $o$ for efficiency reasons.
1324
 
1325
+ <Image
 
1326
  src={ch4_act_decoder}
1327
  zoomable
1328
  downloadable
 
1329
  alt="Figure"
1330
+ caption={'The CVAE decoder used in ACT, comprising of a full encoder-decoder Transformer architecture. Camera observations from all n camera views are first embedded using pre-trained visual encoders, and then aggregated with the corresponding positional embeddings. Then, the proprioperceptive information and style variable z retrieved from the CVAE encoder, are fed to the encoder-decoder Transformer for inference. The encoder shares the matrices K, V with the decoder, and is trained to decode fixed position embeddings into action chunks.'}/>
 
 
 
1331
 
1332
  #### Code Example: Training and Using ACT in Practice
1333
 
1334
+ <Image
 
1335
  src={ch4_act}
1336
  zoomable
1337
  downloadable
 
1338
  alt="Figure"
1339
+ caption={'Action Chunking with Transformer (ACT), as in @zhaoLearningFineGrainedBimanual2023. ACT introduces an action chunking paradigm to cope with high-dimensional multi-modal demonstration data, and a transformer-based CVAE architecture.'}/>
 
 
 
1340
  <div class="pbox">
1341
 
1342
  Training ACT
 
1475
  ```
1476
  Note how in eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a> the noise regressor is conditioned on both the latent variable rank $t$ *and* on a stack of previous observations $o_{t-H_o:t}$. @chiDiffusionPolicyVisuomotor2024 claim the combination of (1) conditioning on a horizon of previous observations and (2) predicting multiple actions into the future allows DP to *commit to specific modes* in the data at inference time, which proves essential for good performance and avoiding indecisiveness.
1477
 
1478
+ <Image
 
1479
  src={ch4_diffusion_policy}
1480
  zoomable
1481
  downloadable
 
1482
  alt="Figure"
1483
+ caption={'The Diffusion Policy architecture, as in @chiDiffusionPolicyVisuomotor2024. A stack of $H_o$ previous observations is used as external conditioning to denoise a group of $H_a$ actions. Conditioning is performed at every layer of a U-Net block. Diffusion Policy allows to obtain fully-formed action chunks with as little as T = 10 denoising steps.'}/>
 
 
 
1484
 
1485
  Figure <a href="#diffusion-policy-architecture" data-reference-type="ref" data-reference="diffusion-policy-architecture">[diffusion-policy-architecture]</a> shows the convolution-based version of the architecture proposed by @chiDiffusionPolicyVisuomotor2024, illustrating inference on a single sample drawn from $\mathcal D$, for simplicity. The starting, arbitrarily noisy chunk of $H_a$ actions $\tilde a_{t:t+H_a}$ is first mapped to a (learned) high-dimensional space. Similarly, both image observations and poses are also embedded before being aggregated to the action embeddings. Then, a U-Net @ronnebergerUNetConvolutionalNetworks2015 is trained to regress the noise added into $\tilde a_{t:t+H_a}$, conditioned on observation information at every layer, thus seeking to optimize eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a>. At inference time, the noise predictor is used to predict the quantity of noise at every $t \in [T, \dots, 0 ]$ and iteratively subtract it from $\tilde a_{t:t+H_a}$, reversing the diffusion process simulated in training conditioned on $o_{t-H_o:t}$ to predict $a_{t:t+H_a}$.
1486
 
 
1617
 
1618
  One can use the fact that policies output multiple actions at the same time to directly address (1) the lack of adaptiveness and (2) the presence of lags at runtime by decoupling action chunk *prediction* $\mathbf{A}$ from action *execution* $a_t \gets \text{PopFront}(\mathbf{A}_t)$. This decoupled stack, which we refer to as *asynchronous* (async) inference (<a href="#alg-async-inference" data-reference-type="ref" data-reference="alg-async-inference">[alg-async-inference]</a>), also enables optimized inference by allowing action-chunk inference to run on a separate machine, typically equipped with better computational resources than the ones onboard a robot. In async inference, a $\text{RobotClient}$ sends an observation $o_t$ to a $\text{PolicyServer}$, receiving an action chunk $\mathbf{A}_t$ once inference is complete (Figure <a href="#ch4-async-inference" data-reference-type="ref" data-reference="ch4-async-inference">[ch4-async-inference]</a>). In this, we avoid execution lags by triggering chunk prediction while the control loop is still consuming a previously available chunk, aggregating the previous and incoming chunks whenever the latter is available to the $\text{RobotClient}$. In turn, async-inference tightens the loop between action prediction and action execution efficiently, by increasing the frequency at which observations are processed for chunk prediction while not running inference at every timestep. Crucially, decoupling action prediction from action execution also allows to allocate more computational resources on a remote policy server sending actions to the robot client over the network.
1619
 
1620
+ <Image
 
 
1621
  src={ch4_async_inference}
1622
  zoomable
1623
  downloadable
 
1624
  alt="Figure"
1625
+ caption={'Asynchronous inference. Illustration of the asynchronous inference stack. Note that the policy can be run on a remote server, possibly with GPUs.'}/>
 
 
 
 
1626
  <div class="algorithm">
1627
 
1628
  <span id="alg-async-inference" style="position: absolute;"></span>
 
1647
 
1648
  - **Sync-inference limit $(g=1)$.** As an extreme case, and in keeping with @zhaoLearningFineGrainedBimanual2023, an observation is sent at *every* timestep. The queue is therefore almost always filled, with only a minor saw-tooth due to $\Delta t/\mathbb E[\ell_s] < 1$. While maximally reactive, this setting incurs one forward pass per control tick and can prove prohibitively expensive on limited hardware. Importantly, because the client is consuming actions while the server computes the next chunk, the available queue never gets entirely filled.
1649
 
1650
+ <Image
 
 
1651
  src={ch4_queues}
1652
  zoomable
1653
  downloadable
 
1654
  alt="Figure"
1655
+ caption={'Action queue size evolution at runtime for various levels of g when (A) not filtering out observation based on joint-space similarity and (B) filtering out near-duplicates observation, measuring their similarity in joint-space.'}/>
 
 
 
 
1656
 
1657
  Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> emphasizes the trade-off governed by $g$: small values of $g$ result in idle periods, whereas $g\approx 1$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $g\in(0,1)$ allows to strike a balance between reactivity and resource budgets. If not for the aforementioned similarity filter, the $\text{RobotClient}$ would send observations for processing every $(1 - g) H_a \cdot \Delta t$ seconds, receiving a new chunk of actions every $(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]$, on average. The presence of the filter for observation similarity dilates this processing time, and serves the scope of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. In particular, Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> results in a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed because the queue is empty.
1658
 
 
1791
 
1792
  The advent of large models trained on internet-scale datasets has drastically influenced fields like Computer Vision (CV) and Natural Language Processing (NLP), shifting the previously task-specific paradigm towards combining (1) an initial, task-agnostic large-scale pre-training stage and a (2) task-specific, adjustment phase. This *pre-train-and-adapt* paradigm has now largely replaced more classic approaches consisting of task-specific data collection, curation and model training in many subdomains within CV and NLP, and it is motivated by the main drawback of limited scalability for *task-specific approaches*, which have been traditionally more labor intensive. Factors including (1) the advancements in generalist models learned with self-supervision for perception @oquabDINOv2LearningRobust2024 or semantic understanding @devlinBERTPretrainingDeep2019 and (2) the popularization of collective efforts to aggregate large-scale openly available datasets @oneillOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 are increasingly pushing the field of robot learning towards the pre-train-and-adapt paradigm. This shift taps into the long-standing challenge of developing generalist robot policies, and holds the promise to surpass traditionally siloed approaches to robotics problems and develop a *foundation robotics model*. While Section <a href="#learning-imitation" data-reference-type="ref" data-reference="learning-imitation">[learning-imitation]</a> introduced methods for learning *single-task policies* such as ACT or Diffusion Policy, in this section we present advancements in developing *generalist, multi-task, policies*, capable of performing a wide range of tasks across different environments and embodiments, and guided by unstructured instructions typically given in plain, natural language.
1793
 
1794
+ <Image
 
1795
  src={ch5_ml_vs_robotics_foundation}
1796
  zoomable
1797
  downloadable
 
1798
  alt="Figure"
1799
+ caption={'Fields within ML such as Computer Vision and NLP converged on the development of foundation models, trained on a variety of large scale models and capable to perform multiple downstream tasks (top). Conversely, robotics suffered from limited standardization in terms of the architectures used, and siloed, task specific datasets, incurring in a high degree of fragmentation which traditionally hindered the development of generalist models for robotics in favour of task-specific models (bottom).'}/>
 
 
 
1800
 
1801
  ### Preliminaries: Models and Data
1802
 
1803
  The remarkable success of foundation models in NLP and CV seems to be increasingly predicated on two core principles: architectural innovation and (joint) data-compute scaling. Indeed, the transformer architecture proved very effective in capturing long-range dependencies in a variety of data formats, and its stability and expressivity made it the *de facto* standard for modern large-scale models trained on internet-scale datasets. However, in stark contrast with large-scale NLP and CV datasets @raffelExploringLimitsTransfer2023, @ImageNet_VSS09, robotics has historically developed around small, task-specific datasets. In turn, this traditionally hindered scalability across problems as well as results, posing concrete challenges to developing general-purpose robot learning algorithms. Indeed, differently from the wealth of relatively readily-available task-agnostic text and image datasets on the internet, robotics data is *intrinsically embodied* and thus task-specific: datasets collected for *manipulation* differ significantly from those collected for *locomotion*. In particular, since each expert trajectory is tied to a specific robot platform and the operating conditions of its environment and task, data heterogeneity has long posed a *methodological* challenge for scaling robotics datasets via aggregation. Further, datasets consisting of expert demonstrations are (1) intrinsically more expensive to collect and (2) notoriously heterogeneous--different human experts may perform the same task in very different ways. Beyond this, heterogeneity also raises *conceptual* issues: naively mixing data across embodiments can induce negative transfer, as control strategies developed in isolation for different robot systems in different environments may even conflict when combined. 
Thus, the high degree of fragmentation of robotics datasets and tasks has traditionally led to the development of *specialist* policies, trained on small, task-specific datasets, developed to perform well at their designated task but that fail to generalize to new deployment scenarios (Figure <a href="#ch5-ml-vs-robotics-foundation" data-reference-type="ref" data-reference="ch5-ml-vs-robotics-foundation">[ch5-ml-vs-robotics-foundation]</a>).
1804
 
1805
+ <Image
 
1806
  src={ch5_generalist_policies_timeline}
1807
  zoomable
1808
  downloadable
 
1809
  alt="Figure"
1810
+ caption={'Early efforts in the development of generalist models for robotics include BC-Zero @jangBCZZeroShotTask2022, RT-1 @brohanRT1RoboticsTransformer2023, and RT-2 @brohanRT2VisionLanguageActionModels2023: large scale models trained on thousands of demonstrations. The open release of the Open-X @oneillOpenXEmbodimentRobotic2025 and DROID datasets @khazatskyDROIDLargeScaleInTheWild2025 fostered the development of open source models: OpenVLA @kimOpenVLAOpenSourceVisionLanguageAction2024, π 0 @blackp0VisionLanguageActionFlow2024 and SmolVLA @shukorSmolVLAVisionLanguageActionModel2025.'}/>
 
 
 
1811
 
1812
  Driven by the goal of developing generalist robot policies, the research community has increasingly explored how insights and techniques from other areas of ML can be integrated into robotics. Figure <a href="#ch5-generalist-policies-timeline" data-reference-type="ref" data-reference="ch5-generalist-policies-timeline">[ch5-generalist-policies-timeline]</a> shows a timeline of some of the most popular contributions attempting to develop generalist policies. Starting from BC-Zero, a latent variable model trained on 25k+ demonstrations, the field has now evolved into $\pi_0$, a transformer-based model trained on 10M+ demonstrations and exhibiting strong few-shot capabilities across tasks and embodiments. In between, Robotics Transformer 1 (RT-1) @brohanRT1RoboticsTransformer2023 represented a significant step in the direction of developing generalist robot policies over prior work including (1) BC-Zero @jangBCZZeroShotTask2022 and (2) Gato @reedGeneralistAgent2022, in that @brohanRT1RoboticsTransformer2023 use a much larger and more diverse set of training tasks compared to both BC-Zero and Gato. In particular, RT-1 uses a transformer architecture, and is trained on as many as 130k human-recorded trajectories collected over 13 robots and over 17 months. RT-1 learns to process a history of camera images and a natural language instruction, and feeds the resulting sequence of high-dimensional tokens to a transformer, trained using a *classification loss on a discretized action space* consisting of six different 256-bins, one for each joint of a 6-dof robotic arm.
1813
 
 
1817
 
1818
  Despite these advancements, the success of large, proprietary models like RT-1 and RT-2, highlighted a growing accessibility gap in robotics research, as training and deploying large-scale robotics foundation models requires computational resources simply unattainable for most research institutions. The OpenVLA project @kimOpenVLAOpenSourceVisionLanguageAction2024 emerged in direct contrast to traditionally closed-source efforts to develop VLAs. In particular, @kimOpenVLAOpenSourceVisionLanguageAction2024 trained OpenVLA by exclusively leveraging openly available data (970k+ trajectories from the Open-X dataset), and openly shared their training recipes alongside the model weights. Architecturally, OpenVLA integrates a pre-trained vision encoder to project visual tokens into the embedding space of the Llama2-7B @touvronLlama2Open2023 language-model backbone. The language model backbone is then used to predict *discrete action tokens* over 256 activation levels.
1819
 
1820
+ <Image
 
1821
  src={ch5_trends}
1822
  zoomable
1823
  downloadable
 
1824
  alt="Figure"
1825
+ caption={'Robot learning is undergoing a paradigmatic shift: centralized data collections (A, left) are increasingly larger, often comprising millions of demonstrations, while (A, right) decentralized data collection efforts are becoming an alternative for large scale data collection. (B) Generalist models are also becoming increasingly smaller and easier to run on limited hardware.'}/>
 
 
 
1826
 
1827
  Figure <a href="#ch5-trends" data-reference-type="ref" data-reference="ch5-trends">[ch5-trends]</a> shows the current trends in robot learning in terms of size and nature of the robotics datasets contributed, together with the size and accessibility of the available models. As datasets collected via centralized, cross-institutions cooperation of increasing size are made available for the research community, decentralized datasets collected by individual researchers and practitioners also gained traction, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are increasingly becoming much more compute-efficient, and as a result the models’ size has been consistently reducing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments.
1828
 
 
1842
 
1843
  $\pi_0$ @blackp0VisionLanguageActionFlow2024 introduces a VLA built on a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching. Images and language are embedded with PaliGemma, a VLM merging independently encoded visual and textual features deep in the network (*late-fusion*), while proprioceptive state and action chunks are routed to a smaller *action expert*, initialized from scratch. The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and value matrices at each layer, maintaining specialization while efficiently allocating computation.
1844
 
1845
+ <Image
 
1846
  src={ch5_pi0}
1847
  zoomable
1848
  downloadable
 
1849
  alt="Figure"
1850
+ caption={'The π 0 architecture, as in @blackp0VisionLanguageActionFlow2024. Vision and language tokens are routed to a VLM backbone which is prevented from attending to robot proprioceptive states and action tokens, which are instead routed to a smaller subset of weights within the architecture referred to as "action expert". The architecture is trained with Flow Matching on 10M+ trajectories from a mixture of closed and openly available datasets.'}/>
 
 
 
1851
 
1852
  Concretely, $\pi_0$ is a single, unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $f_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple camera viewpoints $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process both the robot proprioceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turn them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while *across* blocks, future blocks are masked out. 
Formally, this corresponds to using an attention mask like: $\mathbf{A} = \bordermatrix{ & \mathcal{T}_i & \mathcal{T}_q & \mathcal{T}_a \cr \mathcal{T}_i & \mathbf{1} & \mathbf{0} & \mathbf{0} \cr \mathcal{T}_q & \mathbf{1} & \mathbf{1} & \mathbf{0} \cr \mathcal{T}_a & \mathbf{1} & \mathbf{1} & \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block bidirectional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive tokens and action tokens, one can cache keys and values across denoising steps at runtime, resulting in a reduced computational footprint and faster inference.
1853
 
 
1882
 
1883
  <div class="wrapfigure">
1884
 
1885
+ r0.4 <Image
1886
  src={ch5_pi0_sampling_timesteps}
1887
  zoomable
1888
  downloadable
 
1889
  alt="image"
1890
  />
1891
 
 
1964
 
1965
  With VLAs in the early stage of development compared to more mature LLMs and VLMs, much of the progress made on VLAs remains proprietary, with many releases exclusively sharing the weights while withholding the data used, full experimental details and essential methodological components of training. In contrast with this closed approach, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, which aims at democratizing the developments of robotics foundation models by open sourcing the model alongside the data used as well as the training recipes.
1966
 
1967
+ <Image
 
1968
  src={ch5_smolvla}
1969
  zoomable
1970
  downloadable
 
1971
  alt="Figure"
1972
+ caption={'The SmolVLA architecture, as in @shukorSmolVLAVisionLanguageActionModel2025. SmolVLA is a compact MoE model trained with flow matching to denoise action chunks. Vision and language tokens are fed to a VLM backbone, and share information with the proprioceptive and action tokens via the attention mechanism. The action expert interleaves SA and CA layers for further conditioning on the visual features from the VLM backbone. SmolVLA skips computations and reduces the visual tokens, resulting in 7x less memory usage than π 0 (450M parameters vs. π 0 ’s 3.3B).'}/>
 
 
 
1973
 
1974
  While encouraging efforts like $\pi_0$ @blackp0VisionLanguageActionFlow2024 demonstrate the feasibility of open VLA systems, they remain (1) large and compute-intensive and (2) dependent on closed datasets collected via centralized efforts on costly robotic platforms, which ultimately hinders the accessibility of the method altogether. SmolVLA mitigates both these issues by (1) prioritizing a compact, compute-efficient VLA design and (2) targeting community-contributed datasets on accessible robotic platforms such as the SO-100 and SO-101 arms. Similarly to $\pi_0$, SmolVLA (Figure <a href="#ch5-smolvla" data-reference-type="ref" data-reference="ch5-smolvla">[ch5-smolvla]</a>) employs a MoE architecture combining a pretrained VLM backbone with a dedicated action expert, and trains with flow matching. To ensure efficiency and accessibility, SmolVLA adopts SmolVLM-2 @marafiotiSmolVLMRedefiningSmall2025 as its VLM backbone, considering SmolVLM-2’s reduced size and capability to process multiple image inputs alongside text items. SmolVLM-2 uses SigLIP @zhaiSigmoidLossLanguage2023 as vision encoder, producing visual features for a SmolLM2 language decoder @allalSmolLM2WhenSmol2025. Further, SmolVLA adopts a smaller action expert consisting of $\sim$100M parameters and an interleaved stack of self and cross-attention layers. To improve efficiency, the action expert adopts a reduced embedding dimension compared to the VLM backbone, resulting in $d_{v_\theta} = 0.75 d_{\text{VLM}}$. @shukorSmolVLAVisionLanguageActionModel2025’s design choices thus result in a much smaller size model compared to $\pi_0$, consisting of ca. 450M parameters versus $\pi_0$’s 3.3B parameters.
1975
 
app/src/components/Hero.astro CHANGED
@@ -101,12 +101,7 @@ const pdfFilename = `${slugify(pdfBase)}.pdf`;
101
  <section class="hero">
102
  <h1 class="hero-title" set:html={title} />
103
  <div class="hero-banner">
104
- <Image
105
- src="/src/content/assets/lerobot-logo-thumbnail.png"
106
- alt="LeRobot Logo"
107
- width={400}
108
- height={200}
109
- />
110
  {description && <p class="hero-desc">{description}</p>}
111
  </div>
112
  </section>
@@ -372,10 +367,6 @@ const pdfFilename = `${slugify(pdfBase)}.pdf`;
372
  max-width: 980px;
373
  margin: 0 auto;
374
  }
375
- .hero-banner img {
376
- width: 100%;
377
- height: auto;
378
- }
379
  .hero-desc {
380
  color: var(--muted-color);
381
  font-style: italic;
 
101
  <section class="hero">
102
  <h1 class="hero-title" set:html={title} />
103
  <div class="hero-banner">
104
+ <HtmlEmbed src="banner.html" frameless />
 
 
 
 
 
105
  {description && <p class="hero-desc">{description}</p>}
106
  </div>
107
  </section>
 
367
  max-width: 980px;
368
  margin: 0 auto;
369
  }
 
 
 
 
370
  .hero-desc {
371
  color: var(--muted-color);
372
  font-style: italic;
app/src/content/article.mdx CHANGED
@@ -19,7 +19,7 @@ tableOfContentsAutoCollapse: true
19
  ---
20
 
21
  import MultiImage from '../components/MultiImage.astro';
22
- import ResponsiveImage from '../components/ResponsiveImage.astro';
23
  import Quote from '../components/Quote.astro';
24
  import ch2_planar_manipulator_free from './assets/image/figures/ch2/ch2-planar-manipulator-free.png';
25
  import ch2_planar_manipulator_floor from './assets/image/figures/ch2/ch2-planar-manipulator-floor.png';
@@ -84,17 +84,12 @@ We sincerely hope this tutorial serves as a valuable starting point for your jou
84
 
85
  ## Introduction
86
 
87
- <figure>
88
- <ResponsiveImage
89
  src={ch1_lerobot_figure1}
90
  zoomable
91
  downloadable
92
- layout="fixed"
93
  alt="Figure"
94
- />
95
- <span id="figure1" style="position: absolute;"></span>
96
- <figcaption><code>lerobot</code> is the open-source library for end-to-end robotics developed by Hugging Face. The library is vertically integrated on the entire robotics stack, supporting low-level control of real-world robot devices, advanced data and inference optimizations, as well as SOTA robot learning methods with simple implementations in pure Pytorch.</figcaption>
97
- </figure>
98
 
99
  Autonomous robotics holds the promise of relieving humans from repetitive, tiring or dangerous manual tasks. Consequently, the field of robotics has been widely studied since its first inception in the 1950s. Lately, advancements in Machine Learning (ML) have sparked the development of a relatively new class of methods used to tackle robotics problems, leveraging large amounts of data and computation rather than human expertise and modeling skills to develop autonomous systems.
100
 
@@ -293,17 +288,12 @@ TL;DR Learning-based approaches to robotics are motivated by the need to (1) gen
293
 
294
  ### Explicit and Implicit Models
295
 
296
- <figure>
297
- <ResponsiveImage
298
  src={ch2_approaches}
299
  zoomable
300
  downloadable
301
- layout="fixed"
302
  alt="Figure"
303
- />
304
- <span id="generating-motion-atlas" style="position: absolute;"></span>
305
- <figcaption>Overview of methods to generate motion (clearly non-exhausitve, see @bekrisStateRobotMotion2024). The different methods can be grouped based on whether they explicitly (<em>dynamics-based</em>) or implicitly (<em>learning-based</em>) model robot-environment interactions.</figcaption>
306
- </figure>
307
 
308
  Robotics is concerned with producing artificial motion in the physical world in a useful, reliable and safe fashion. Thus, robotics is an inherently multidisciplinary domain: producing autonomous motion in the physical world requires, at the very least, interfacing different software (motion planners) and hardware (motion executioners) components. Further, knowledge of mechanical, electrical, and software engineering, as well as rigid-body mechanics and control theory have therefore proven quintessential in robotics since the field first developed in the 1950s. More recently, Machine Learning (ML) has also proved effective in robotics, complementing these more traditional disciplines @connellRobotLearning1993. As a direct consequence of its multidisciplinary nature, robotics has developed as a rather wide array of methods, all concerned with the main purpose of <mark>producing artificial motion in the physical world</mark>.
309
 
@@ -311,17 +301,12 @@ Methods to produce robotics motion range from traditional *explicit* models--<ma
311
 
312
  ### Different Types of Motion
313
 
314
- <figure>
315
- <ResponsiveImage
316
  src={ch2_platforms}
317
  zoomable
318
  downloadable
319
- layout="fixed"
320
  alt="Figure"
321
- />
322
- <span id="robotics-platforms-atlas" style="position: absolute;"></span>
323
- <figcaption>Different kinds of motions are achieved with potentially very different robotic platforms. From left to right, top to bottom: ViperX, SO-100, Boston Dynamics’ Spot, Open-Duck, 1X’s NEO, Boston Dynamics’ Atlas. This is an example list of robotic platforms and is (very) far from being exhaustive.</figcaption>
324
- </figure>
325
 
326
  In the vast majority of instances, robotics deals with producing motion via actuating joints connecting nearly entirely-rigid links. A key distinction between focus areas in robotics is based on whether the generated motion modifies (1) the absolute state of the environment (via dexterity), (2) the relative state of the robot with respect to its environment (exercising mobility skills), or (3) a combination of the two (Figure <a href="#robotics-platforms-atlas" data-reference-type="ref" data-reference="robotics-platforms-atlas">[robotics-platforms-atlas]</a>).
327
 
@@ -335,31 +320,21 @@ Robot manipulators typically consist of a series of links and joints, articulate
335
 
336
  Recently, the development of low-cost manipulators like the ALOHA @zhaoLearningFineGrainedBimanual2023 ALOHA-2 @aldacoALOHA2Enhanced and SO-100/SO-101 @knightStandardOpenSO100 platforms significantly lowered the barrier to entry to robotics, considering the increased accessibility of these robots compared to more traditional platforms like the Franka Emika Panda arm (Figure <a href="#robotic-platforms-costs" data-reference-type="ref" data-reference="robotic-platforms-costs">[robotic-platforms-costs]</a>).
337
 
338
- <figure>
339
- <ResponsiveImage
340
  src={ch2_cost_accessibility}
341
  zoomable
342
  downloadable
343
- layout="fixed"
344
  alt="Figure"
345
- />
346
- <span id="robotic-platforms-costs" style="position: absolute;"></span>
347
- <figcaption>Cheaper, more accessible robots are starting to rival traditional platforms like the Panda arm platforms in adoption in resource-constrained scenarios. The SO-100, in particular, has a cost in the 100s of Euros, and can be entirely 3D-printed in hours, while the industrially-manufactured Panda arm costs tens of thousands of Euros and is not openly available.</figcaption>
348
- </figure>
349
 
350
  Deriving an intuition as per why learning-based approaches are gaining popularity in the robotics community requires briefly analyzing traditional approaches for manipulation, leveraging tools like forward and inverse kinematics (FK, IK) and control theory. Providing a detailed overview of these methods falls (well) out of the scope of this tutorial, and we refer the reader to works including @sicilianoSpringerHandbookRobotics2016, @lynchModernRoboticsMechanics2017, @tedrakeRoboticManipulationPerception, @tedrakeUnderactuatedRoboticsAlgorithms for a much more comprehensive description of these techniques. Here, we mostly wish to highlight the benefits of ML over these traditional techniques
351
 
352
- <figure>
353
- <ResponsiveImage
354
  src={ch2_so100_to_planar_manipulator}
355
  zoomable
356
  downloadable
357
- layout="fixed"
358
  alt="Figure"
359
- />
360
- <span id="make-so100-planar-manipulator" style="position: absolute;"></span>
361
- <figcaption>The SO-100 arm is a 6-dof manipulator arm. Preventing some of its joints (shoulder pane, wrist flex and wrist roll) from actuating, it can be represented as a traditional 2-dof planar manipulator (the gripper joint in the end-effector is not considered towards the count of the degrees of freedom used to produce motion).</figcaption>
362
- </figure>
363
 
364
  Consider the (simple) case where a SO-100 is restrained from actuating (1) the shoulder pane and (2) the wrist flex and roll motors. This effectively reduces the degrees of freedom of the SO-100 from the original 5+1 (5 joints + 1 gripper) to 2+1 (shoulder lift, elbow flex + gripper). As the end-effector does not impact motion in this model, the SO-100 is effectively reduced to the planar manipulator robot presented in Figure <a href="#make-so100-planar-manipulator" data-reference-type="ref" data-reference="make-so100-planar-manipulator">[make-so100-planar-manipulator]</a>, where spheres represent actuators, and solid lines indicate length-$l$ links from the base of the SO-100 to the end-effector (*ee*).
365
 
@@ -437,11 +412,10 @@ While very effective when a goal trajectory has been well specified, the perform
437
 
438
  <div class="wrapfigure">
439
 
440
- r0.3 <ResponsiveImage
441
  src={ch2_planar_manipulator_floor_box}
442
  zoomable
443
  downloadable
444
- layout="fixed"
445
  alt="image"
446
  />
447
 
@@ -462,17 +436,12 @@ We point the interested reader to , , and  for extended coverage of FK, IK, di
462
 
463
  Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular applicative problem.
464
 
465
- <figure>
466
- <ResponsiveImage
467
  src={ch2_classical_limitations}
468
  zoomable
469
  downloadable
470
- layout="fixed"
471
  alt="Figure"
472
- />
473
- <span id="classical-limitations" style="position: absolute;"></span>
474
- <figcaption>Dynamics-based approaches to robotics suffer from several limitations: (1) orchestrating multiple components poses integration challenges; (2) the need to develop custom processing pipelines for the sensing modalities and tasks considered hinders scalability; (3) simplified analytical models of physical phenomena (here friction at the gripper; credits to @antonovaReinforcementLearningPivoting2017) limit real-world performance. Lastly, (4) dynamics-based methods overlook trends in the availability and growth of robotics data.</figcaption>
475
- </figure>
476
 
477
  Dynamics-based robotics pipelines have historically been <mark>developed sequentially, engineering the different blocks</mark> now within most architectures for specific purposes. That is, sensing, state estimation, mapping, planning, (diff-)IK, and low-level control have been traditionally developed as distinct modules with fixed interfaces. Pipelining these specific modules proved error-prone, and brittleness emerges--alongside compounding errors--whenever changes incur (e.g., changes in lighting for sensing, occlusion/failure of sensors, control failures). Adapting such a stack to new tasks or robotic platforms often entails re-specifying objectives, constraints, and heuristics at multiple stages, incurring significant engineering overhead.
478
 
@@ -495,17 +464,12 @@ Taken together, these limitations (Figure <a href="#classical-limitations" data
495
  TL;DR The need for expensive, high-fidelity simulators can be obviated by learning from real-world data, using sample-efficient algorithms that can safely train directly on hardware.
496
 
497
  </div>
498
- <figure>
499
- <ResponsiveImage
500
  src={ch3_learning_benefits}
501
  zoomable
502
  downloadable
503
- layout="fixed"
504
  alt="Figure"
505
- />
506
- <span id="robot-learning-upsides" style="position: absolute;"></span>
507
- <figcaption>Learning-based robotics streamlines perception-to-action by learning a (1) unified high-level controller capable to take (2) high-dimensional, unstructured sensorimotor information. Learning (3) does not require a dynamics model and instead focuses on interaction data, and (4) empirically correlates with the scale of the data used. </figcaption>
508
- </figure>
509
 
510
  Learning-based techniques for robotics naturally address the limitations presented in Section <a href="#classical" data-reference-type="ref" data-reference="classical">[classical]</a> (Figure <a href="#robot-learning-upsides" data-reference-type="ref" data-reference="robot-learning-upsides">[robot-learning-upsides]</a>). In particular, learning-based techniques typically rely on monolithic prediction-to-action pipelines (*visuomotor policies*) which directly map sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components. Mapping sensory inputs to actions also makes it possible to incorporate diverse input modalities, leveraging the automatic feature extraction capabilities of modern learning systems. Moreover, learning-based approaches can, in principle, bypass explicit modeling altogether and instead rely solely on interaction data--an advantage that proves transformative when dynamics are difficult to model or entirely unknown. Lastly, learning for robotics (*robot learning*) is naturally well posed to leverage the growing amount of robotics data openly available, just as computer vision and natural language processing did historically benefit from large-scale corpora of data, in great part overlooked by dynamics-based approaches.
511
 
@@ -513,11 +477,10 @@ Being a field at its relative nascent stages, no prevalent technique(s) proves d
513
 
514
  <div class="wrapfigure">
515
 
516
- r0.3 <ResponsiveImage
517
  src={ch3_learning_atlas}
518
  zoomable
519
  downloadable
520
- layout="fixed"
521
  alt="image"
522
  />
523
 
@@ -526,17 +489,12 @@ r0.3 <ResponsiveImage
526
 
527
  In Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> we deliberately include generalist robot models @blackp0VisionLanguageActionFlow2024, @shukorSmolVLAVisionLanguageActionModel2025 alongside task-specific BC methods. While significantly different in spirit--*generalist* models are language-conditioned and use instructions to generate motion valid across many tasks, while *task-specific* models are typically not language-conditioned and used to perform a single task--*foundation* models are still largely trained to reproduce trajectories contained in a (large) training set of input demonstrations. Thus, we argue generalist policies can indeed be grouped alongside other task-specific BC methods, as they both leverage similar training data and schemas. Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> illustrates this categorization graphically, explicitly listing all the robot learning policies currently available in `lerobot`- Action Chunking with Transformers (ACT) @zhaoLearningFineGrainedBimanual2023, Diffusion Policy @chiDiffusionPolicyVisuomotor2024, Vector-Quantized Behavior Transformer (VQ-BeT) @leeBehaviorGenerationLatent2024, $\pi_0$ @blackp0VisionLanguageActionFlow2024, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025, Human-in-the-loop Sample-efficient RL (HIL-SERL) @luoPreciseDexterousRobotic2024 and TD-MPC @hansenTemporalDifferenceLearning2022.
528
 
529
- <figure>
530
- <ResponsiveImage
531
  src={ch3_rl_examples}
532
  zoomable
533
  downloadable
534
- layout="fixed"
535
  alt="Figure"
536
- />
537
- <span id="robotics-with-rl-examples" style="position: absolute;"></span>
538
- <figcaption>Examples of two different robotics tasks performed using RL. In the manipulation task (A) an agent learns to reach for a yellow plastic block in its environment, and to put it inside of a box. In the locomotion task (B) an agent learns to move its center of mass sideways without falling.</figcaption>
539
- </figure>
540
 
541
  Applications of RL to robotics have been studied long enough that the relationship between these two disciplines has been compared to that of physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their inherently interactive and sequential nature, robotics control problems can be directly cast as RL problems. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> presents two such cases. Reaching for an object to then move it somewhere else in the scene is a sequential problem where over time the controller needs to adjust the position of the robot arm based on the current configuration and the (possibly varying) position of the object. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation- while sliding to the side, the controller needs to keep adjusting to the robot’s state to avoid failure (falling).
542
 
@@ -544,17 +502,12 @@ Applications of RL to robotics have been studied long enough that the relationsh
544
 
545
  The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to tackle robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) capable to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents improve through trial and error, bypassing explicit models of the problem dynamics in favor of interaction data. In RL, this feedback loop between actions and outcomes (Figure <a href="#rl-most-famous-pic" data-reference-type="ref" data-reference="rl-most-famous-pic">[rl-most-famous-pic]</a>) is established through the agent sensing a scalar quantity (*reward*) measuring how desirable a given *transition* is for the accomplishment of its goal.
546
 
547
- <figure>
548
- <ResponsiveImage
549
  src={ch3_agent_env}
550
  zoomable
551
  downloadable
552
- layout="fixed"
553
  alt="Figure"
554
- />
555
- <span id="rl-most-famous-pic" style="position: absolute;"></span>
556
- <figcaption>Agent-Environment interaction diagram (image credits to @suttonReinforcementLearningIntroduction2018).</figcaption>
557
- </figure>
558
 
559
  Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP) @bellmanMarkovianDecisionProcess1957. Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through MDP’s inherently stochastic formulation and (2) providing a theoretically-sound framework for learning *without* an explicit model of the environment dynamics. While accommodating a continuous time formulation too, MDPs are typically considered in discrete time in RL, assuming interactions to atomically take place at discrete *timestep* $t=0,1,2,3, \dots, T$. MDPs allowing for an unbounded number of interactions ($T \to + \infty$) are termed *infinite-horizon*, and opposed to *finite-horizon* MDPs in which $T$ is finite. Unless diversely specified, we will only be referring to discrete-time finite-horizon (*episodic*) MDPs.
560
 
@@ -628,17 +581,12 @@ V_\pi(s_t) &= \mathbb E_{a_t\sim \pi(\bullet \vert s_t)} [Q_\pi (s_t, a_t)],
628
  ```
629
  inducing an ordering over states and state-action pairs under $\pi$, and value functions are thus central to most RL algorithms. A variety of algorithms have been developed in RL attempting to find (approximate) solutions to the problem of maximizing cumulative reward (we report some in Figure <a href="#rl-algos-atlas" data-reference-type="ref" data-reference="rl-algos-atlas">[rl-algos-atlas]</a>).
630
 
631
- <figure>
632
- <ResponsiveImage
633
  src={ch3_rl_algorithms_atlas}
634
  zoomable
635
  downloadable
636
- layout="fixed"
637
  alt="Figure"
638
- />
639
- <span id="rl-algos-atlas" style="position: absolute;"></span>
640
- <figcaption>Popular RL algorithms. See @SpinningUp2018 for a complete list of citations.</figcaption>
641
- </figure>
642
 
643
  Popular approaches to continuous state and action space--such as those studied within robotics--include ,  and . Across manipulation @akkayaSolvingRubiksCube2019 and locomotion problems @leeLearningQuadrupedalLocomotion2020, RL proved extremely effective in providing a platform to (1) leverage a unified, streamlined perception-to-action pipeline, (2) natively integrate propioperception with multi-modal high-dimensional sensory streams (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2025.
644
 
@@ -648,31 +596,21 @@ Streamlined end-to-end control pipelines, data-driven feature extraction and a d
648
 
649
  First, especially early in training, <mark>actions are typically explorative, and thus may be erratic</mark>. On physical systems, untrained policies may command high velocities, self-colliding configurations, or torques exceeding joint limits, leading to wear and potential hardware damage. Mitigating these risks requires external safeguards (e.g., watchdogs, safety monitors, emergency stops), often incurring a high degree of human supervision. Further, in the typical episodic setting considered in most robotics problems, experimentation is substantially slowed down by the need to manually reset the environment over the course of training, a time-consuming and error-prone process. Second, learning efficiently remains problematic in RL, <mark>limiting the applicability of RL in real-world robotics due to consequently prohibitive timescales of training</mark>. Even strong algorithms such as SAC @haarnojaSoftActorCriticOffPolicy2018 typically require a large number of transitions $\{ (s_t, a_t, r_t, s_{t+1})\}_{t=1}^N$. On real-world hardware, generating this data is time-consuming.
650
 
651
- <figure>
652
- <ResponsiveImage
653
  src={ch3_duck_sim_vs_real}
654
  zoomable
655
  downloadable
656
- layout="fixed"
657
  alt="Figure"
658
- />
659
- <span id="synthetic-vs-real-duck" style="position: absolute;"></span>
660
- <figcaption>Simulated (left) vs. real-world (right) OpenDuck. Discrepancies in the simulation dynamics (<em>reality gap</em>) pose risks to policy transfer.</figcaption>
661
- </figure>
662
 
663
  Training RL policies in simulation @tobinDomainRandomizationTransferring2017 addresses both issues, eliminating physical risk and dramatically increasing throughput. Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) limiting the possibilities to transfer the policies learned in simulation, due to the discrepancy between real and simulated environments (*reality gap*, Figure <a href="#synthetic-vs-real-duck" data-reference-type="ref" data-reference="synthetic-vs-real-duck">[synthetic-vs-real-duck]</a>). *Domain randomization* @tobinDomainRandomizationTransferring2017 (DR) is a popular technique to overcome the reality gap, and consists in randomizing the parameters of the simulated environment during training, aiming at inducing robustness to specific disturbances. In this, DR is typically employed to increase the diversity of scenarios over the course of training, improving on the performance of sim-to-real transferred policies @akkayaSolvingRubiksCube2019, @antonovaReinforcementLearningPivoting2017, @jiDribbleBotDynamicLegged2023. In practice, DR is performed by training in simulation on simulated dynamics $\mathcal D$, further parametrized as $\mathcal D \equiv \mathcal D_\xi$, with a *dynamics* (random) vector $\xi$ drawn from an arbitrary distribution, $\xi \sim \Xi$. For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure <a href="#ducks-on-terrains" data-reference-type="ref" data-reference="ducks-on-terrains">[ducks-on-terrains]</a>), or the center of mass of an object for a manipulation task. Over the course of training--typically at each episode’s reset--a new $\xi$ is drawn, and used to specify the environment’s dynamics for that episode.
664
 
665
- <figure>
666
- <ResponsiveImage
667
  src={ch3_many_ducks}
668
  zoomable
669
  downloadable
670
- layout="fixed"
671
  alt="Figure"
672
- />
673
- <span id="ducks-on-terrains" style="position: absolute;"></span>
674
- <figcaption>The same locomotion task can be carried out in different (simulated) domains (exemplified by the difference in terrains) at training time, resulting to increased robustness over diverse environment dynamics.</figcaption>
675
- </figure>
676
 
677
  While effective in transferring policies across the reality gap in real-world robotics @tobinDomainRandomizationTransferring2017, @akkayaSolvingRubiksCube2019, @jiDribbleBotDynamicLegged2023, @tiboniDomainRandomizationEntropy2024, DR often requires extensive manual engineering. First, identifying which parameters to randomize--i.e., the *support* $\text{supp} (\Xi)$ of $\Xi$--is an inherently task-specific process. When locomoting over different terrains, choosing to randomize the friction coefficient is a reasonable choice, yet not completely resolutive as other factors (lighting conditions, external temperature, joints’ fatigue, etc.) may prove just as important in practice, making selecting these parameters yet another source of brittleness.
678
 
@@ -768,17 +706,12 @@ Reward classifiers are particularly useful in treating complex, dynamic tasks--e
768
 
769
  Lastly, in order to improve on the robustness of their approach to different goals while maintaining practical scalability, @luoSERLSoftwareSuite2025 introduced a modified state and action space, expressing proprioceptive configurations $q$ and actions $\dot q$ in the frame of the end-effector pose at $t=0$. Randomizing the initial pose of the end-effector ($s_0$), @luoSERLSoftwareSuite2025 achieved a similar result to that of manually randomizing the environment at every timestep, but with the benefit of maintaining the environment in the same condition across multiple training episodes, achieving higher scalability of their method thanks to the increased practicality of their approach.
770
 
771
- <figure>
772
- <ResponsiveImage
773
  src={ch3_hil_serl_examples}
774
  zoomable
775
  downloadable
776
- layout="fixed"
777
  alt="Figure"
778
- />
779
- <span id="hil-serl-blocks" style="position: absolute;"></span>
780
- <figcaption>(A) HIL-SERL allows for real-world training of high performance RL agents by building on top of advancements presented by SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on a real-world SO-100.</figcaption>
781
- </figure>
782
 
783
  Building on off-policy deep Q-learning with replay buffers, entropy regularization for better exploration, expert demonstrations to guide learning, and a series of tools and recommendations for real-world training using reward classifiers (Figure <a href="#hil-serl-blocks" data-reference-type="ref" data-reference="hil-serl-blocks">[hil-serl-blocks]</a>), @luoPreciseDexterousRobotic2024 introduce human interactions during training, learning near-optimal policies in challenging real-world manipulation tasks in 1-2 hours.
784
 
@@ -786,17 +719,12 @@ Human-in-the-Loop, Sample Efficient Robot reinforcement Learning (HIL-SERL) @lu
786
 
787
  #### Code Example- Real-world RL
788
 
789
- <figure>
790
- <ResponsiveImage
791
  src={ch3_hil_serl_architecture}
792
  zoomable
793
  downloadable
794
- layout="fixed"
795
  alt="Figure"
796
- />
797
- <span id="ch3-hil-serl-architecture" style="position: absolute;"></span>
798
- <figcaption>HIL-SERL is a SOTA RL algorithm for training control policies directly in the real-world. Its implementation in <code>lerobot</code> relies on a decoupled actor-learner architecture, communicating over processes (and possibly networks) with queues used to share (1) transitions <span class="math inline">(<em>s</em> <sub> <em>t</em> </sub>, <em>a</em> <sub> <em>t</em> </sub>, <em>r</em> <sub> <em>t</em> </sub>, <em>s</em> <sub> <em>t</em> + 1</sub>)</span> and (2) parameters <span class="math inline"> <em>θ</em> </span>.</figcaption>
799
- </figure>
800
 
801
  This example shows how to use the HIL-SERL implementation supported by `lerobot`. This code example is organized into four parts: we first show how to train a reward classifier from a custom set of demonstrations, then define the `Actor` and `Learner` components, and finally, we bring them together in a complete script showing how to use HIL-SERL in practice.
802
 
@@ -1066,33 +994,23 @@ Advances in learning to act from potentially large corpora of human demonstratio
1066
  TL;DR Behavioral Cloning provides a natural platform to learn from real-world interactions without the need to design any reward function, and generative models prove more effective than point-wise policies at dealing with multimodal demonstration datasets.
1067
 
1068
  </div>
1069
- <figure>
1070
- <ResponsiveImage
1071
  src={ch4_bc_trajectories}
1072
  zoomable
1073
  downloadable
1074
- layout="fixed"
1075
  alt="Figure"
1076
- />
1077
- <span id="ch4-bc-trajectories" style="position: absolute;"></span>
1078
- <figcaption>(A) Average (with standard deviation) evolution of the actuation levels over the first 5 recorded episodes in <a href="lerobot/svla_so101_pickplace" class="uri">lerobot/svla_so101_pickplace</a>. Proprioceptive states prove invaluable to determine the robot’s state during an episode. (B) Camera frames are also recorded alongside measurements on the robot’s state, capturing information about the robot’s interaction with its environment.</figcaption>
1079
- </figure>
1080
 
1081
  Learning from human demonstrations provides a pragmatic alternative to the RL pipeline discussed in Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>. Indeed, especially in real-world robotics, online exploration is typically <mark>costly and potentially unsafe</mark>, and designing (dense) reward signals is a <mark>brittle and task-specific</mark> process. Further, even success detection itself often requires bespoke instrumentation, while episodic training demands reliable resets--all factors complicating training RL algorithms on hardware at scale. Behavioral Cloning (BC) sidesteps these constraints by <mark>casting control as an imitation learning problem</mark>, leveraging previously collected expert demonstrations to anchor the learned autonomous behavior. Most notably, by *learning-to-imitate*, autonomous systems naturally adhere to the objectives, preferences, and success criteria implicitly encoded in the data, which reduces early-stage exploratory failures and obviates hand-crafted reward shaping altogether.
1082
 
1083
  Formally, let $\mathcal D = \{ \tau^{(i)} \}_{i=1}^N$ be a set of expert trajectories, with $\tau^{(i)} = \{(o_t^{(i)}, a_t^{(i)})\}_{t=0}^{T_i}$ representing the $i$-th length-$T_i$ trajectory in $\mathcal D$, $o_t \in \mathcal O$ denoting observations (e.g., images and proprioception altogether), and $a_t \in \mathcal A$ the expert actions. Typically, observations $o \in \mathcal O$ consist of both image and proprioceptive information, while actions $a \in \mathcal A$ represent control specifications for the robot to execute, e.g. a joint configuration. Note that differently from Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>, in the imitation learning context $\mathcal D$ denotes an offline dataset collecting $N$ length-$T_i$ reward-free (expert) human trajectories $\tau^{(i)}$, and *not* the environment dynamics. Similarly, in this section $\tau^{(i)}$ represents a length-$T_i$ trajectory of observation-action pairs, which crucially *omits entirely any reward* information. Figure <a href="#ch4-bc-trajectories" data-reference-type="ref" data-reference="ch4-bc-trajectories">[ch4-bc-trajectories]</a> graphically shows trajectories in terms of the average evolution of the actuation on the 6 joints of a teleoperated SO-100 manipulator. Notice how proprioceptive states are captured jointly with camera frames over the course of the recorded episodes, providing a unified high-frame rate collection of both image and joint teleoperation data. Figure <a href="#ch4-observation-action-mapping" data-reference-type="ref" data-reference="ch4-observation-action-mapping">[ch4-observation-action-mapping]</a> shows $(o_t, a_t)$-pairs for the same dataset, with the actions performed by the human expert illustrated alongside the corresponding observation. 
In principle, (expert) trajectories $\tau^{(i)}$ can have different lengths since demonstrations might exhibit multi-modal strategies to attain the same goal, resulting in multiple, different behaviors.
1084
 
1085
- <figure>
1086
- <ResponsiveImage
1087
  src={ch4_observation_action_mapping}
1088
  zoomable
1089
  downloadable
1090
- layout="fixed"
1091
  alt="Figure"
1092
- />
1093
- <span id="ch4-observation-action-mapping" style="position: absolute;"></span>
1094
- <figcaption>Sample observations and action pairs over the course of a given trajectory recorded in <a href="lerobot/svla_so101_pickplace" class="uri">lerobot/svla_so101_pickplace</a>. Observations, comprising of both proprioperceptive and visual information, are recorded alongside the configuration of a second, leader robot controlled by a human expert, providing complete information for regressing actions given observations.</figcaption>
1095
- </figure>
1096
 
1097
  Behavioral Cloning (BC) @pomerleauALVINNAutonomousLand1988 aims at producing synthetic behaviors by learning the mapping from observations to actions, and in its most natural formulation can be effectively tackled as a *supervised* learning problem, consisting of learning the (deterministic) mapping $f: \mathcal O\mapsto \mathcal A, \ a_t = f(o_t)$ by solving
1098
  ``` math
@@ -1104,17 +1022,12 @@ Typically, the expert’s joint observation-action distribution $p: \mathcal O\t
1104
 
1105
  Despite the inherent challenges of learning from non-i.i.d. data, the BC formulation presents several operational advantages in robotics. First, training happens offline and naturally accommodates expert demonstration data, thereby severely limiting exploration risks by preventing the robot from performing dangerous actions altogether, by anchoring action in imitation. Second, reward design is entirely unnecessary in BC, as demonstrations already reflect human intent. The absence of rewards also prevents the risk of misalignment and specification gaming (*reward hacking*), otherwise inherent in purely reward-based RL @heessEmergenceLocomotionBehaviours2017. Third, because expert trajectories encode terminal conditions, success detection and resets are implicit in the dataset. Finally, empirical evidence suggests the performance of BC scales naturally with growing corpora of demonstrations collected across tasks, embodiments, and environments. Nonetheless, BC can, in principle, only reproduce behaviors that are at best as good as those of the demonstrator, and therefore offers no remedy for the suboptimal decisions that humans may enact. This limitation is particularly problematic in sequential decision-making tasks where expert demonstrations are scarce---either because data collection is costly or because human performance is inherently suboptimal. Yet, many robotics applications still benefit from relatively inexpensive pipelines for collecting high-quality human-generated trajectories, justifying the use of BC in such settings.
1106
 
1107
- <figure>
1108
- <ResponsiveImage
1109
  src={ch4_issues_with_bc}
1110
  zoomable
1111
  downloadable
1112
- layout="fixed"
1113
  alt="Figure"
1114
- />
1115
- <span id="ch4-issues-with-bc" style="position: absolute;"></span>
1116
- <figcaption>Point-wise policies suffer from limitations due to (A) covariate shifts and (B) poor approximation of multimodal demonstrations. (A) Small errors may drive the policy out of distribution, incuring in a vicious circle ultimately resulting in failure. (B) Both modes of reaching for a target object in the scene--either left or right-first--are equally as good and thus equally as likely to be present in a dataset of human demonstrations, ultimately resulting in multimodal demonstrations.</figcaption>
1117
- </figure>
1118
 
1119
  While conceptually elegant, *point-estimate policies* $f : \mathcal O\mapsto \mathcal A$ learned by solving eq. <a href="#loss-minimization-SL" data-reference-type="ref" data-reference="loss-minimization-SL">[loss-minimization-SL]</a> have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a> illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $\mathcal D$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $\epsilon$-prediction errors $0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon$ can quickly drive the policy into out-of-distribution states, incurring less confident generations and thus compounding errors (Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving real-world robotics problems, as multiple trajectories can be equally as good towards the accomplishment of a goal (e.g., symmetric grasps, Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the *generative model* $p(o, a)$ underlying the samples in $\mathcal D$, rather than explicitly learning a prediction function $f: a = f(o)$.
1120
 
@@ -1124,17 +1037,12 @@ Generative Models (GMs) aim to learn the stochastic process underlying the very
1124
 
1125
  #### Variational Auto-Encoders
1126
 
1127
- <figure>
1128
- <ResponsiveImage
1129
  src={ch4_task_effect_on_pairs}
1130
  zoomable
1131
  downloadable
1132
- layout="fixed"
1133
  alt="Figure"
1134
- />
1135
- <span id="ch4-task-effect-on-pairs" style="position: absolute;"></span>
1136
- <figcaption>Intuitively, latent variable in a single latent model may contain information regarding the task being performed, which directly results in the likelihood of the same observation-action pair being different for two different tasks. When (A) picking a block the likelihood of a wide gripper’s opening should be higher than narrower one, while it should be the opposite when (B) pushing the block.</figcaption>
1137
- </figure>
1138
 
1139
  A common inductive bias used in GM posits samples $(o,a)$ are influenced from an unobservable latent variable $z \in Z$, resulting in:
1140
  ``` math
@@ -1142,17 +1050,12 @@ A common inductive bias used in GM posits samples $(o,a)$ are influenced from an
1142
  ```
1143
  Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics application, $z$ could be interpreted as some high level representation of the underlying task being performed by the human demonstrator. In such case, treating $p(o,a)$ as a marginalization over $\operatorname{supp}({Z})$ of the complete joint distribution $p(o,a,z)$ natively captures the effect different tasks have on the likelihood of observation-action pairs. Figure <a href="#ch4-task-effect-on-pairs" data-reference-type="ref" data-reference="ch4-task-effect-on-pairs">[ch4-task-effect-on-pairs]</a> graphically illustrates this concept in the case of a (A) picking and (B) pushing task, for which, nearing the target object, the likelihood of actions resulting in opening the gripper--the higher $q_6$, the wider the gripper’s opening--should intuitively be (A) high or (B) low, depending on the task performed. While the latent space $Z$ typically has a much richer structure than the set of all actual tasks performed, eq. <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> still provides a solid framework to learn joint distribution conditioned on unobservable yet relevant factors. Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a> represents this latent-variable framework in the context of a robotics application- the true, $z$-conditioned generative process assigns *likelihood* $p((o,a) \vert z)$ to the single $(o,a)$-pair. Using Bayes’ theorem, one can reconstruct the *posterior* distribution on $\operatorname{supp}({Z})$, $q_\theta(z \vert o,a)$ from the likelihood $p_\theta(o,a \vert z)$, *prior* $p_\theta(z)$ and *evidence* $p_\theta(o,a)$. VAEs approximate the latent variable model presented in eq. 
<a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> using an *approximate posterior* $q_\phi(z \vert o,a)$ while regressing parameters for a parametric likelihood, $p_\theta(o,a \vert z)$ (Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a>).
1144
 
1145
- <figure>
1146
- <ResponsiveImage
1147
  src={ch4_latent_variable_model}
1148
  zoomable
1149
  downloadable
1150
- layout="fixed"
1151
  alt="Figure"
1152
- />
1153
- <span id="ch4-latent-variable-model" style="position: absolute;"></span>
1154
- <figcaption>(A) The latent variable model in a robotics application regulates influence between observed (<span class="math inline"> <em>o</em>, <em>a</em>)</span> variables and an unobservable latent variable. (B) VAEs approximate exact latent variable models by means of variational inference.</figcaption>
1155
- </figure>
1156
 
1157
  Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can be written as:
1158
  <span id="evidence-definition-1" style="position: absolute;">
@@ -1241,17 +1144,12 @@ VAEs approximate probability distributions via a *single* latent variable model,
1241
  ```
1242
  where we explicitly showed the marginalization over the multiple latents in eq. <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>, and used the law of conditional probability and Markov property in eq. <a href="#BC-multi-latent-model-2" data-reference-type="ref" data-reference="BC-multi-latent-model-2">[BC-multi-latent-model-2]</a>. Also, for ease of notation, we will refer to observation-action pairs $o,a$ as $z_0$.
1243
 
1244
- <figure>
1245
- <ResponsiveImage
1246
  src={ch4_many_latents}
1247
  zoomable
1248
  downloadable
1249
- layout="fixed"
1250
  alt="Figure"
1251
- />
1252
- <span id="ch4-many-latents" style="position: absolute;"></span>
1253
- <figcaption>HMLV models posit the data generation process is influenced by a stack of Markov-dependent latent variables, with samples from the posterior distribution being progressively higher up in the hierarchy.</figcaption>
1254
- </figure>
1255
 
1256
  Similar to VAEs, it is generally not possible to assign an *exact* interpretation to the latent variables. Nevertheless, a reasonable application-driven intuition is that Hierarchical Markov Latent Variable (HMLV) models, by capturing hierarchical and decoupled interactions among latent variables, can reflect the different resolutions at which conditioning factors intervene. For example, in a robotics setting, one might naturally distinguish between high-level trajectory planning (higher up in the hierarchy, $t \to T$) and fine-grained motion adjustments (closer to empirical observations, $t \to 0$). In that, HMLV models thus provide a framework to perform variational inference via multiple, sequential sampling steps from different higher level distributions instead of approximating the generative process with a single-latent variable model. DMs are a particular instantiation of HMLV models for which the posterior is fixed to $q( z_t \vert z_{t-1}) = \mathcal N(z_{t-1} \sqrt{1-\beta_t}, \beta_t \mathbf{I})$, for a given $\beta_t \in \mathbb R^+$. In practice, $\beta_t$ is used to iteratively reduce the signal-to-noise ratio along the latents’ hierarchy, similarly to how a diffusion process influences the information of a physical system.
1257
 
@@ -1301,17 +1199,12 @@ In their seminal work on using DMs for variational inference, @hoDenoisingDiffu
1301
  ```
1302
  where the former term is equivalent to the reconstruction term in eq. <a href="#VAE-min-neg-ELBO" data-reference-type="ref" data-reference="VAE-min-neg-ELBO">[VAE-min-neg-ELBO]</a> and the latter term can be obtained in closed form.
1303
 
1304
- <figure>
1305
- <ResponsiveImage
1306
  src={ch4_diffusion_robot_actions}
1307
  zoomable
1308
  downloadable
1309
- layout="fixed"
1310
  alt="Figure"
1311
- />
1312
- <span id="diffusion-robot-actions" style="position: absolute;"></span>
1313
- <figcaption>DMs iteratively corrupt samples (left) from an unknown distribution into a quasi-standard Gaussian (center), learning the displacement field (right) that permits to reconstruct samples from the unknown target distribution by iteratively denoising samples of a tractable, easy-to-sample distribution.</figcaption>
1314
- </figure>
1315
 
1316
  Besides mathematical tractability of eq. <a href="#diffusion-likelihood-gradient" data-reference-type="ref" data-reference="diffusion-likelihood-gradient">[diffusion-likelihood-gradient]</a>, adopting Gaussian posteriors allows for a particularly intuitive interpretation of the training dynamics of DMs @permenterInterpretingImprovingDiffusion2024. As the hierarchical latent variables are repeatedly corrupted by applying increasingly more Gaussian noise, they progressively lose information about the original (unknown) sample $z_0$, converging toward a standard Gaussian which eventually contains no information at all (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>). Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a> illustrates this process on a simplified, bidimensional observation-action distribution, where we considered $o=q_2$ and $a=q^h_2$, with $q_2$ denoting the robot’s *elbow flex* actuation and $q^h_2$ the corresponding human teleoperator’s elbow flex. Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Notice how corrupted samples distribute differently from the most reasonable structure $a \simeq o$, further underscoring how diffusion corrupts both the individual samples and the global distribution (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, left and center). 
In this, using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples. Comparing the diffused samples to the original data points, one can derive an estimate of the total displacement induced by the diffusion process, and, under the assumption that the likelihood of the totally diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by *learning to reverse* such displacement. This key intuition allows to write a simplified training objective[^4]:
1317
  <span id="diffusion-simplified-loss" style="position: absolute;">
@@ -1327,17 +1220,12 @@ Besides mathematical tractability of eq. <a href="#diffusion-likelihood-gradien
1327
  \end{align}
1328
  ```
1329
 
1330
- <figure>
1331
- <ResponsiveImage
1332
  src={ch4_action_vs_observation_distribution}
1333
  zoomable
1334
  downloadable
1335
- layout="fixed"
1336
  alt="Figure"
1337
- />
1338
- <span id="ch4-action-vs-observation-distribution" style="position: absolute;"></span>
1339
- <figcaption>A joint action-observation distribution, in the simplified case where the observation is the elbow-flex actuation in a SO-100, and the action is the recorded position for the same joint from the teleoperator arm. The motion recorded being teleoperated, the points distribute along the diagonal.</figcaption>
1340
- </figure>
1341
 
1342
  In this simplified (minimization) objective, the optimization process differs from eq. <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
1343
 
@@ -1371,31 +1259,21 @@ FM proved very effective in a variety of applications, ranging from image @esse
1371
  ```
1372
  Conditional vector fields are defined not only over their argument $z$ and time $t$, but do also vary with respect to an auxiliary variable $z_0$, thereby extending the standard notion of a vector field to incorporate additional conditioning. Note that the traditional discrete-time noise-scheduler $\{\beta_t\}_{t=0}^T$ is now generalized to a continuous map $\beta : [0,1] \mapsto \mathbb R^+$. Crucially, @lipmanFlowMatchingGenerative2023 prove that by exclusively optimizing the vector field for individual data points $z_0 \in \mathcal D$, one also retrieves the optimal flow to morph the entire support of the initial distribution $p_0$ into $p_1 \ \text{s.t.} \mathcal D \sim p_1$.
1373
 
1374
- <figure>
1375
- <ResponsiveImage
1376
  src={ch4_normalizing_flows}
1377
  zoomable
1378
  downloadable
1379
- layout="fixed"
1380
  alt="Figure"
1381
- />
1382
- <span id="ch4-normalizing-flows" style="position: absolute;"></span>
1383
- <figcaption>Probability distributions can be modified differently by applying different vector fields, inducing different flows of mass across the same support (top versus bottom, using two different time-invariant 2D-fields <span class="math inline"> <em>u</em> <sub>1</sub>(<em>x</em>, <em>y</em>) = (<em>x</em>, 0)</span> and <span class="math inline">$u_2(x,y) = (x/\sqrt{2}, y/\sqrt{2})$</span>). Notice time flows <em>continuously</em> in <span class="math inline">[0, 1]</span>. FM models learn to approximate a target vector field, thereby producing arbitrary (goal) transformations of an easy-to-sample initial distribution.</figcaption>
1384
- </figure>
1385
 
1386
  While the noising schedule of DMs results in a stochastic process resembling a random (Brownian) walk, FM allows for more general--potentially, deterministic--likelihood and posterior parametrization. In the FM literature the likelihood and posterior probability densities defined along a HMLV model are typically referred to as a *probability path*, where the distributions for successive adjacent transitions in the HMLV model are related by the (normalized) flow between them (Figure <a href="#ch4-normalizing-flows" data-reference-type="ref" data-reference="ch4-normalizing-flows">[ch4-normalizing-flows]</a>). The inherent flexibility of FM is one of its key advantages over DMs, as it opens up the possibility of *learning* more efficient paths. For instance, one can design probability paths inspired by Optimal Transport (OT), a mathematical framework concerned with characterizing the most efficient morphings between probability distributions. Probability paths obtained through OT paths tend to be *straighter* than diffusion paths (Figure <a href="#ch4-diffusion-paths-versus-fm" data-reference-type="ref" data-reference="ch4-diffusion-paths-versus-fm">[ch4-diffusion-paths-versus-fm]</a>), which can lead to faster and more stable training, as well as empirically result in higher-quality generations with fewer denoising steps at inference time. In particular, by avoiding unnecessary backtracking associated with the inherent stochastic nature of both the noising and denoising process in DMs, test-time compute is typically significantly reduced in FM, while retaining comparable results @lipmanFlowMatchingGenerative2023.
1387
 
1388
- <figure>
1389
- <ResponsiveImage
1390
  src={ch4_diffusion_vs_flowmatching}
1391
  zoomable
1392
  downloadable
1393
- layout="fixed"
1394
  alt="Figure"
1395
- />
1396
- <span id="ch4-diffusion-paths-versus-fm" style="position: absolute;"></span>
1397
- <figcaption>Compared to diffusion, flow matching distorts the distribution along a less random pattern, resulting in a clearer interpolation between source and target distribution. The visualization shows an example comparison between these two methods on joint distribution of robot observations and actions over <span class="math inline"> <em>T</em> = 50</span> steps.</figcaption>
1398
- </figure>
1399
 
1400
  In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in eq. <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce an arbitrary mass displacement, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, Conditional FM (CFM) defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, which in turn results in the target vector field $u(t, z_t) = z_1 - z_0$. FM models can then be trained with a simple regression objective defined as:
1401
  <span id="flow-matching-objective" style="position: absolute;">
@@ -1435,45 +1313,30 @@ In their work, @zhaoLearningFineGrainedBimanual2023 ablated using a GM to learn
1435
 
1436
  In ACT (Figure <a href="#ch4-act" data-reference-type="ref" data-reference="ch4-act">[ch4-act]</a>), inference for a given observation $o \in \mathcal O$ could be performed by (1) defining a prior $p_\omega(z \vert o)$ for the latent variable $z$ and (2) decoding an action chunk from a sampled latent $z \sim p_\omega(\bullet \vert o)$, similarly to how sampling from standard VAEs takes place, with the exception that vanilla VAEs typically pose $p(z\vert o) \equiv p(z) \sim \mathcal N(\mathbf{0}, \mathbf{I})$ and thus skip (1).
1437
 
1438
- <figure>
1439
- <ResponsiveImage
1440
  src={ch4_act_encoder}
1441
  zoomable
1442
  downloadable
1443
- layout="fixed"
1444
  alt="Figure"
1445
- />
1446
- <span id="ch4-act-encoder" style="position: absolute;"></span>
1447
- <figcaption>The CVAE encoder used in ACT. Input action chunks are first embedded and aggregated with positional embeddings, before being processed alongside embedded proprioperceptive information, and a learned <code>[CLS]</code> token used to aggregate input level information, and predict the style variable <span class="math inline"> <em>z</em> </span>. The encoder is exclusively used to <em>train</em> the decoder, and it is entirely disregarded at inference time.</figcaption>
1448
- </figure>
1449
 
1450
  However, the authors claim that using a deterministic procedure to sample $z$ benefits policy evaluation, and thus avoid using the conditional prior at all at inference time, effectively using the CVAE framework exclusively to train a more expressive decoder. At test time, @zhaoLearningFineGrainedBimanual2023 propose simply using $z = \mathbf{0}$, as the conditional prior on $z$ used in training is set to be a standard Gaussian. Further, conditioning on the observation $o$ is achieved through explicitly feeding proprioperceptive and visual observations to the decoder, $p_\theta(a \vert z, o)$ at test time. If at inference $z$ is sampled from a standard Gaussian, during training $z$ is sampled from an approximate posterior distribution $q_\phi(z \vert o, a)$, which, however, disregards image observations and exclusively uses proprioperceptive states to form $o$ for efficiency reasons.
1451
 
1452
- <figure>
1453
- <ResponsiveImage
1454
  src={ch4_act_decoder}
1455
  zoomable
1456
  downloadable
1457
- layout="fixed"
1458
  alt="Figure"
1459
- />
1460
- <span id="ch4-act-decoder" style="position: absolute;"></span>
1461
- <figcaption>The CVAE decoder used in ACT, comprising of a full encoder-decoder Transformer architecture. Camera observations from all <span class="math inline"> <em>n</em> </span> camera views are first embedded using pre-trained visual encoders, and then aggregated with the corresponding positional embeddings. Then, the proprioperceptive information and style variable <span class="math inline"> <em>z</em> </span> retrieved from the CVAE encoder, are fed to the encoder-decoder Transformer for inference. The encoder shares the matrices <span class="math inline"> <em>K</em>, <em>V</em> </span> with the decoder, and is trained to decode fixed position embeddings into action chunks.</figcaption>
1462
- </figure>
1463
 
1464
  #### Code Example: Training and Using ACT in Practice
1465
 
1466
- <figure>
1467
- <ResponsiveImage
1468
  src={ch4_act}
1469
  zoomable
1470
  downloadable
1471
- layout="fixed"
1472
  alt="Figure"
1473
- />
1474
- <span id="ch4-act" style="position: absolute;"></span>
1475
- <figcaption>Action Chunking with Transformer (ACT), as in @zhaoLearningFineGrainedBimanual2023. ACT introduces an action chunking paradigm to cope with high-dimensional multi-modal demonstration data, and a transformer-based CVAE architecture.</figcaption>
1476
- </figure>
1477
  <div class="pbox">
1478
 
1479
  Training ACT
@@ -1612,17 +1475,12 @@ In practice, conditioning on observation data is achieved conditioning the noise
1612
  ```
1613
  Note how in eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a> the noise regressor is conditioned on both the latent variable rank $t$ *and* on a stack of previous observations $o_{t-H_o:t}$. @chiDiffusionPolicyVisuomotor2024 claim the combination of (1) conditioning on a horizon of previous observations and (2) predicting multiple actions into the future allows DP to *commit to specific modes* in the data at inference time, which proves essential for good performance and avoiding indecisiveness.
1614
 
1615
- <figure>
1616
- <ResponsiveImage
1617
  src={ch4_diffusion_policy}
1618
  zoomable
1619
  downloadable
1620
- layout="fixed"
1621
  alt="Figure"
1622
- />
1623
- <span id="diffusion-policy-architecture" style="position: absolute;"></span>
1624
- <figcaption>The Diffusion Policy architecture, as in @chiDiffusionPolicyVisuomotor2024. A stack of <span class="math inline"> <em>H</em> <sub> <em>o</em> </sub> </span> previous observations is used as external conditioning to denoise a group of <span class="math inline"> <em>H</em> <sub> <em>a</em> </sub> </span> actions. Conditioning is performed at every layer of a U-Net block. Diffusion Policy allows to obtain fully-formed action chunks with as little as <span class="math inline"> <em>T</em> = 10</span> denoising steps.</figcaption>
1625
- </figure>
1626
 
1627
  Figure <a href="#diffusion-policy-architecture" data-reference-type="ref" data-reference="diffusion-policy-architecture">[diffusion-policy-architecture]</a> shows the convolution-based version of the architecture proposed by @chiDiffusionPolicyVisuomotor2024, illustrating inference on a single sample drawn from $\mathcal D$, for simplicity. The starting, arbitrarily noisy chunk of $H_a$ actions $\tilde a_{t:t+H_a}$ is first mapped to a (learned) high-dimensional space. Similarly, both image observations and poses are also embedded before being aggregated to the action embeddings. Then, a U-Net @ronnebergerUNetConvolutionalNetworks2015 is trained to regress the noise added into $\tilde a_{t:t+H_a}$, conditioned on observation information at every layer, thus seeking to optimize eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a>. At inference time, the noise predictor is used to predict the quantity of noise at every $t \in [T, \dots, 0 ]$ and iteratively subtract it from $\tilde a_{t:t+H_a}$, reversing the diffusion process simulated in training conditioned on $o_{t-H_o:t}$ to predict $a_{t:t+H_a}$.
1628
 
@@ -1759,19 +1617,12 @@ A robot may indeed execute an entire action chunk $\mathbf{A}_t$ *before* a new
1759
 
1760
  One can use the fact that policies output multiple actions at the same time to directly address (1) the lack of adaptiveness and (2) the presence of lags at runtime by decoupling action chunk *prediction* $\mathbf{A}$ from action *execution* $a_t \gets \text{PopFront}(\mathbf{A}_t)$. This decoupled stack, which we refer to as *asynchronous* (async) inference (<a href="#alg-async-inference" data-reference-type="ref" data-reference="alg-async-inference">[alg-async-inference]</a>), also enables optimized inference by allowing action-chunk inference to run on a separate machine, typically equipped with better computational resources than the ones onboard a robot. In async inference, a $\text{RobotClient}$ sends an observation $o_t$ to a $\text{PolicyServer}$, receiving an action chunk $\mathbf{A}_t$ once inference is complete (Figure <a href="#ch4-async-inference" data-reference-type="ref" data-reference="ch4-async-inference">[ch4-async-inference]</a>). In this, we avoid execution lags by triggering chunk prediction while the control loop is still consuming a previously available chunk, aggregating the previous and incoming chunks whenever the latter is available to the $\text{RobotClient}$. In turn, async-inference tightens the loop between action prediction and action execution efficiently, by increasing the frequency at which observations are processed for chunk prediction while not running inference at every timestep. Crucially, decoupling action prediction from action execution also allows to allocate more computational resources on a remote policy server sending actions to the robot client over the network.
1761
 
1762
- <figure>
1763
- <div class="minipage">
1764
- <ResponsiveImage
1765
  src={ch4_async_inference}
1766
  zoomable
1767
  downloadable
1768
- layout="fixed"
1769
  alt="Figure"
1770
- />
1771
- <span id="ch4-async-inference" style="position: absolute;"></span>
1772
- </div>
1773
- <figcaption><strong>Asynchronous inference</strong>. Illustration of the asynchronous inference stack. Note that the policy can be run on a remote server, possibly with GPUs.</figcaption>
1774
- </figure>
1775
  <div class="algorithm">
1776
 
1777
  <span id="alg-async-inference" style="position: absolute;"></span>
@@ -1796,19 +1647,12 @@ Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queu
1796
 
1797
  - **Sync-inference limit $(g=1)$.** As an extreme case, and in keeping with @zhaoLearningFineGrainedBimanual2023, an observation is sent at *every* timestep. The queue is therefore almost always filled, with only a minor saw-tooth due to $\Delta t/\mathbb E[\ell_s] < 1$. While maximally reactive, this setting incurs one forward pass per control tick and can prove prohibitively expensive on limited hardware. Importantly, because the client is consuming actions while the server computes the next chunk, the available queue never gets entirely filled.
1798
 
1799
- <figure>
1800
- <div class="minipage">
1801
- <ResponsiveImage
1802
  src={ch4_queues}
1803
  zoomable
1804
  downloadable
1805
- layout="fixed"
1806
  alt="Figure"
1807
- />
1808
- <span id="ch4-queues" style="position: absolute;"></span>
1809
- </div>
1810
- <figcaption>Action queue size evolution at runtime for various levels of <span class="math inline"> <em>g</em> </span> when (A) not filtering out observation based on joint-space similarity and (B) filtering out near-duplicates observation, measuring their similarity in joint-space.</figcaption>
1811
- </figure>
1812
 
1813
  Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> emphasizes the trade-off governed by $g$: small values of $g$ result in idle periods, whereas $g\approx 1$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $g\in(0,1)$ allows one to strike a balance between reactivity and resource budgets. If not for the aforementioned similarity filter, the $\text{RobotClient}$ would send observations for processing every $(1 - g) H_a \cdot \Delta t$ seconds, receiving a new chunk of actions every $(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]$, on average. The presence of the filter for observation similarity dilates this processing time, and serves the purpose of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. In particular, Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> results in a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue is empty.
1814
 
@@ -1947,33 +1791,23 @@ TL;DR Openly available, large-scale datasets and the development of stable-to-tr
1947
 
1948
  The advent of large models trained on internet-scale datasets has drastically influenced fields like Computer Vision (CV) and Natural Language Processing (NLP), shifting the previously task-specific paradigm towards combining (1) an initial, task-agnostic large-scale pre-training stage and a (2) task-specific, adjustment phase. This *pre-train-and-adapt* paradigm has now largely replaced more classic approaches consisting of task-specific data collection, curation and model training in many subdomains within CV and NLP, and it is motivated by the main drawback of limited scalability for *task-specific approaches*, which have been traditionally more labor intensive. Factors including (1) the advancements in generalist models learned with self-supervision for perception @oquabDINOv2LearningRobust2024 or semantic understanding @devlinBERTPretrainingDeep2019 and (2) the popularization of collective efforts to aggregate large-scale openly available datasets @oneillOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 are increasingly pushing the field of robot learning towards the pre-train-and-adapt paradigm. This shift taps into the long-standing challenge of developing generalist robot policies, and holds the promise to surpass traditionally siloed approaches to robotics problems and develop a *foundation robotics model*. While Section <a href="#learning-imitation" data-reference-type="ref" data-reference="learning-imitation">[learning-imitation]</a> introduced methods for learning *single-task policies* such as ACT or Diffusion Policy, in this section we present advancements in developing *generalist, multi-task, policies*, capable of performing a wide range of tasks across different environments and embodiments, and guided by unstructured instructions typically given in plain, natural language.
1949
 
1950
- <figure>
1951
- <ResponsiveImage
1952
  src={ch5_ml_vs_robotics_foundation}
1953
  zoomable
1954
  downloadable
1955
- layout="fixed"
1956
  alt="Figure"
1957
- />
1958
- <span id="ch5-ml-vs-robotics-foundation" style="position: absolute;"></span>
1959
- <figcaption>Fields within ML such as Computer Vision and NLP converged on the development of foundation models, trained on a variety of large scale models and capable to perform multiple downstream tasks (top). Conversely, robotics suffered from limited standardization in terms of the architectures used, and siloed, task specific datasets, incurring in a high degree of fragmentation which traditionally hindered the development of generalist models for robotics in favour of task-specific models (bottom).</figcaption>
1960
- </figure>
1961
 
1962
  ### Preliminaries: Models and Data
1963
 
1964
  The remarkable success of foundation models in NLP and CV seems to be increasingly predicated on two core principles: architectural innovation and (joint) data-compute scaling. Indeed, the transformer architecture proved very effective in capturing long-range dependencies in a variety of data formats, and its stability and expressivity made it the *de facto* standard for modern large-scale models trained on internet-scale datasets. However, in stark contrast with large-scale NLP and CV datasets @raffelExploringLimitsTransfer2023, @ImageNet_VSS09, robotics has historically developed around small, task-specific datasets. In turn, this traditionally hindered scalability across problems as well as results, posing concrete challenges to developing general-purpose robot learning algorithms. Indeed, differently from the wealth of relatively readily-available task-agnostic text and images datasets on the internet, robotics data is *intrinsically embodied* and thus task-specific: datasets collected for *manipulation* differ significantly from *locomotion*. In particular, since each expert trajectory is tied to a specific robot platform and the operating conditions of its environment and task, data heterogeneity has long posed a *methodological* challenge for scaling robotics datasets via aggregation. Further, datasets consisting of expert demonstrations are (1) intrinsically more expensive to collect and (2) notoriously heterogeneous--different human experts may perform the same task in very different ways. Beyond this, heterogeneity also raises *conceptual* issues: naively mixing data across embodiments can induce negative transfer, as control strategies developed in isolation for different robot systems in different environments may even conflict when combined. 
Thus, the high degree of fragmentation of robotics datasets and tasks has traditionally led to the development of *specialist* policies, trained on small, task-specific datasets, developed to perform well at their designated task but that fail to generalize to new deployment scenarios (Figure <a href="#ch5-ml-vs-robotics-foundation" data-reference-type="ref" data-reference="ch5-ml-vs-robotics-foundation">[ch5-ml-vs-robotics-foundation]</a>).
1965
 
1966
- <figure>
1967
- <ResponsiveImage
1968
  src={ch5_generalist_policies_timeline}
1969
  zoomable
1970
  downloadable
1971
- layout="fixed"
1972
  alt="Figure"
1973
- />
1974
- <span id="ch5-generalist-policies-timeline" style="position: absolute;"></span>
1975
- <figcaption>Early efforts in the development of generalist models for robotics include BC-Zero @jangBCZZeroShotTask2022, RT-1 @brohanRT1RoboticsTransformer2023, and RT-2 @brohanRT2VisionLanguageActionModels2023: large scale models trained on thousands of demonstrations. The open release of the Open-X @oneillOpenXEmbodimentRobotic2025 and DROID datasets @khazatskyDROIDLargeScaleInTheWild2025 fostered the development of open source models: OpenVLA @kimOpenVLAOpenSourceVisionLanguageAction2024, <span class="math inline"> <em>π</em> <sub>0</sub> </span> @blackp0VisionLanguageActionFlow2024 and SmolVLA @shukorSmolVLAVisionLanguageActionModel2025.</figcaption>
1976
- </figure>
1977
 
1978
  Driven by the goal of developing generalist robot policies, the research community has increasingly explored how insights and techniques from other areas of ML can be integrated into robotics. Figure <a href="#ch5-generalist-policies-timeline" data-reference-type="ref" data-reference="ch5-generalist-policies-timeline">[ch5-generalist-policies-timeline]</a> shows a timeline of some of the most popular contributions attempting to develop generalist policies. Starting from BC-Zero, a latent variable model trained on 25k+ demonstrations, the field has now evolved into $\pi_0$, a transformer-based model trained on 10M+ demonstrations and exhibiting strong few-shot capabilities across tasks and embodiments. In between, Robotics Transformer 1 (RT-1) @brohanRT1RoboticsTransformer2023 represented a significant step in the direction of developing generalist robot policies over prior work including (1) BC-Zero @jangBCZZeroShotTask2022 and (2) Gato @reedGeneralistAgent2022, in that @brohanRT1RoboticsTransformer2023 use a much larger and more diverse set of training tasks compared to both BC-Zero and Gato. In particular, RT-1 uses a transformer architecture, and is trained on as many as 130k human-recorded trajectories collected over 13 robots and over 17 months. RT-1 learns to process a history of camera images and a natural language instruction, and feeds the resulting sequence of high-dimensional tokens to a transformer, trained using a *classification loss on a discretized action space* consisting of six different 256-bins, one for each joint of a 6-dof robotic arm.
1979
 
@@ -1983,17 +1817,12 @@ Traditionally, research efforts revolved around not only training models, but al
1983
 
1984
  Despite these advancements, the success of large, proprietary models like RT-1 and RT-2, highlighted a growing accessibility gap in robotics research, as training and deploying large-scale robotics foundation models requires computational resources simply unattainable for most research institutions. The OpenVLA project @kimOpenVLAOpenSourceVisionLanguageAction2024 emerged in direct contrast to traditionally closed-source efforts to develop VLAs. In particular, @kimOpenVLAOpenSourceVisionLanguageAction2024 trained OpenVLA by exclusively leveraging openly available data (970k+ trajectories from the Open-X dataset), and openly shared their training recipes alongside the model weights. Architecturally, OpenVLA integrates a pre-trained vision encoder to project visual tokens into the embedding space of the Llama2-7B @touvronLlama2Open2023 language-model backbone. The language model backbone is then used to predict *discrete action tokens* over 256 activation levels.
1985
 
1986
- <figure>
1987
- <ResponsiveImage
1988
  src={ch5_trends}
1989
  zoomable
1990
  downloadable
1991
- layout="fixed"
1992
  alt="Figure"
1993
- />
1994
- <span id="ch5-trends" style="position: absolute;"></span>
1995
- <figcaption>Robot learning is undergoing a paradigmatic shift: centralized data collections (A, left) are increasingly larger, often comprising millions of demonstrations, while (A, right) decentralized data collection efforts are becoming an alternative for large scale data collection. (B) Generalist models are also becoming increasingly smaller and easier to run on limited hardware.</figcaption>
1996
- </figure>
1997
 
1998
  Figure <a href="#ch5-trends" data-reference-type="ref" data-reference="ch5-trends">[ch5-trends]</a> shows the current trends in robot learning in terms of size and nature of the robotics datasets contributed, together with the size and accessibility of the available models. As datasets collected via centralized, cross-institutions cooperation of increasing size are made available for the research community, decentralized datasets collected by individual researchers and practitioners also gained traction, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are increasingly becoming much more compute-efficient, and as a result the models’ size has been consistently reducing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments.
1999
 
@@ -2013,17 +1842,12 @@ Recently, compute efficiency has also become a central focus in multi-modal rese
2013
 
2014
  $\pi_0$ @blackp0VisionLanguageActionFlow2024 introduce a VLA built on a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching. Images and language are embedded with PaliGemma, a VLM merging independently encoded visual and textual features deep in the network (*late-fusion*), while proprioceptive state and action chunks are routed to a smaller *action expert*, initialized from scratch. The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and value matrices at each layer, maintaining specialization while efficiently allocating computation.
2015
 
2016
- <figure>
2017
- <ResponsiveImage
2018
  src={ch5_pi0}
2019
  zoomable
2020
  downloadable
2021
- layout="fixed"
2022
  alt="Figure"
2023
- />
2024
- <span id="ch5-pi0" style="position: absolute;"></span>
2025
- <figcaption>The <span class="math inline"> <em>π</em> <sub>0</sub> </span> architecture, as in @blackp0VisionLanguageActionFlow2024. Vision and language tokens are routed to a VLM backbone which is prevented from attending robot proprioperceptive states and action tokens, which are instead routed to a smaller subset of weights within the architecture referred to as "action expert". The architecture is trained with Flow Matching on 10M+ trajectories from a mixture of closed and openly available datasets.</figcaption>
2026
- </figure>
2027
 
2028
  Concretely, $\pi_0$ is a single, unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $f_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple camera views $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process both the robot proprioceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turning them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while *across* blocks, future blocks are masked out. 
Formally, this corresponds to using an attention mask like: $\mathbf{A} = \bordermatrix{ & \mathcal{T}_i & \mathcal{T}_q & \mathcal{T}_a \cr \mathcal{T}_i & \mathbf{1} & \mathbf{0} & \mathbf{0} \cr \mathcal{T}_q & \mathbf{1} & \mathbf{1} & \mathbf{0} \cr \mathcal{T}_a & \mathbf{1} & \mathbf{1} & \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block bidirectional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive tokens and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
2029
 
@@ -2058,11 +1882,10 @@ Flow matching  can be seen as a continuous time, deterministic generalization o
2058
 
2059
  <div class="wrapfigure">
2060
 
2061
- r0.4 <ResponsiveImage
2062
  src={ch5_pi0_sampling_timesteps}
2063
  zoomable
2064
  downloadable
2065
- layout="fixed"
2066
  alt="image"
2067
  />
2068
 
@@ -2141,17 +1964,12 @@ for epoch in range(num_epochs):
2141
 
2142
  With VLAs in the early stage of development compared to more mature LLMs and VLMs, much of the progress made on VLAs remains proprietary, with many releases exclusively sharing the weights while withholding the data used, full experimental details and essential methodological components of training. In contrast with this closed approach, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, which aims at democratizing the developments of robotics foundation models by open sourcing the model alongside the data used as well as the training recipes.
2143
 
2144
- <figure>
2145
- <ResponsiveImage
2146
  src={ch5_smolvla}
2147
  zoomable
2148
  downloadable
2149
- layout="fixed"
2150
  alt="Figure"
2151
- />
2152
- <span id="ch5-smolvla" style="position: absolute;"></span>
2153
- <figcaption>The SmolVLA architecture, as in @shukorSmolVLAVisionLanguageActionModel2025. SmolVLA is a compact MoE model trained with flow matching to denoise action chunks. Vision and language tokens are fed to a VLM backbone, and share information with the proprioperceptive and action tokens via the attention mechanism. The attention expert interleaves SA and CA layers for further conditioning on the visual features from the VLM backbone. SmolVLA skips computations and reduces the visual tokens, resulting in 7x less memory usage than <span class="math inline"> <em>π</em> <sub>0</sub> </span> (450M parameters vs. <span class="math inline"> <em>π</em> <sub>0</sub> </span>’s 3.3B).</figcaption>
2154
- </figure>
2155
 
2156
  While encouraging efforts like $\pi_0$ @blackp0VisionLanguageActionFlow2024 demonstrate the feasibility of open VLA systems, they remain (1) large and compute-intensive and (2) dependent on closed datasets collected via centralized efforts on costly robotic platforms, which ultimately hinders the accessibility of the method altogether. SmolVLA mitigates both these issues by (1) prioritizing a compact, compute-efficient VLA design and (2) targeting community-contributed datasets on accessible robotic platforms such as the SO-100 and SO-101 arms. Similarly to $\pi_0$, SmolVLA (Figure <a href="#ch5-smolvla" data-reference-type="ref" data-reference="ch5-smolvla">[ch5-smolvla]</a>) employs a MoE architecture combining a pretrained VLM backbone with a dedicated action expert, and trains with flow matching. To ensure efficiency and accessibility, SmolVLA adopts SmolVLM-2 @marafiotiSmolVLMRedefiningSmall2025 as its VLM backbone, considering SmolVLM-2’s reduced size and capability to process multiple image inputs alongside text items. SmolVLM-2 uses SigLIP @zhaiSigmoidLossLanguage2023 as vision encoder, producing visual features for a SmolLM2 language decoder @allalSmolLM2WhenSmol2025. Further, SmolVLA adopts a smaller action expert consisting of $\sim$100M parameters and an interleaved stack of self and cross-attention layers. To improve efficiency, the action expert adopts a reduced embedding dimension compared to the VLM backbone, resulting in $d_{v_\theta} = 0.75 d_{\text{VLM}}$. @shukorSmolVLAVisionLanguageActionModel2025’s design choices thus result in a much smaller size model compared to $\pi_0$, consisting of ca. 450M parameters versus $\pi_0$’s 3.3B parameters.
2157
 
 
19
  ---
20
 
21
  import MultiImage from '../components/MultiImage.astro';
22
+ import Image from '../components/Image.astro';
23
  import Quote from '../components/Quote.astro';
24
  import ch2_planar_manipulator_free from './assets/image/figures/ch2/ch2-planar-manipulator-free.png';
25
  import ch2_planar_manipulator_floor from './assets/image/figures/ch2/ch2-planar-manipulator-floor.png';
 
84
 
85
  ## Introduction
86
 
87
+ <Image
 
88
  src={ch1_lerobot_figure1}
89
  zoomable
90
  downloadable
 
91
  alt="Figure"
92
+ caption={'lerobot is the open-source library for end-to-end robotics developed by Hugging Face. The library is vertically integrated on the entire robotics stack, supporting low-level control of real-world robot devices, advanced data and inference optimizations, as well as SOTA robot learning methods with simple implementations in pure Pytorch.'}/>
 
 
 
93
 
94
  Autonomous robotics holds the promise of relieving humans from repetitive, tiring or dangerous manual tasks. Consequently, the field of robotics has been widely studied since its inception in the 1950s. Lately, advancements in Machine Learning (ML) have sparked the development of a relatively new class of methods used to tackle robotics problems, leveraging large amounts of data and computation rather than human expertise and modeling skills to develop autonomous systems.
95
 
 
288
 
289
  ### Explicit and Implicit Models
290
 
291
+ <Image
 
292
  src={ch2_approaches}
293
  zoomable
294
  downloadable
 
295
  alt="Figure"
296
+ caption={'Overview of methods to generate motion (clearly non-exhaustive, see @bekrisStateRobotMotion2024). The different methods can be grouped based on whether they explicitly (dynamics-based) or implicitly (learning-based) model robot-environment interactions.'}/>
 
 
 
297
 
298
  Robotics is concerned with producing artificial motion in the physical world in useful, reliable and safe fashion. Thus, robotics is an inherently multidisciplinary domain: producing autonomous motion in the physical world requires, at the very least, interfacing different software (motion planners) and hardware (motion executors) components. Further, knowledge of mechanical, electrical, and software engineering, as well as rigid-body mechanics and control theory have therefore proven quintessential in robotics since the field first developed in the 1950s. More recently, Machine Learning (ML) has also proved effective in robotics, complementing these more traditional disciplines @connellRobotLearning1993. As a direct consequence of its multidisciplinary nature, robotics has developed as a rather wide array of methods, all concerned with the main purpose of <mark>producing artificial motion in the physical world</mark>.
299
 
 
301
 
302
  ### Different Types of Motion
303
 
304
+ <Image
 
305
  src={ch2_platforms}
306
  zoomable
307
  downloadable
 
308
  alt="Figure"
309
+ caption={'Different kinds of motions are achieved with potentially very different robotic platforms. From left to right, top to bottom: ViperX, SO-100, Boston Dynamics’ Spot, Open-Duck, 1X’s NEO, Boston Dynamics’ Atlas. This is an example list of robotic platforms and is (very) far from being exhaustive.'}/>
 
 
 
310
 
311
  In the vast majority of instances, robotics deals with producing motion via actuating joints connecting nearly entirely-rigid links. A key distinction between focus areas in robotics is based on whether the generated motion modifies (1) the absolute state of the environment (via dexterity), (2) the relative state of the robot with respect to its environment (exercising mobility skills), or (3) a combination of the two (Figure <a href="#robotics-platforms-atlas" data-reference-type="ref" data-reference="robotics-platforms-atlas">[robotics-platforms-atlas]</a>).
312
 
 
320
 
321
  Recently, the development of low-cost manipulators like the ALOHA @zhaoLearningFineGrainedBimanual2023, ALOHA-2 @aldacoALOHA2Enhanced and SO-100/SO-101 @knightStandardOpenSO100 platforms significantly lowered the barrier to entry to robotics, considering the increased accessibility of these robots compared to more traditional platforms like the Franka Emika Panda arm (Figure <a href="#robotic-platforms-costs" data-reference-type="ref" data-reference="robotic-platforms-costs">[robotic-platforms-costs]</a>).
322
 
323
+ <Image
 
324
  src={ch2_cost_accessibility}
325
  zoomable
326
  downloadable
 
327
  alt="Figure"
328
+ caption={'Cheaper, more accessible robots are starting to rival traditional platforms like the Panda arm platforms in adoption in resource-constrained scenarios. The SO-100, in particular, has a cost in the 100s of Euros, and can be entirely 3D-printed in hours, while the industrially-manufactured Panda arm costs tens of thousands of Euros and is not openly available.'}/>
 
 
 
329
 
330
  Deriving an intuition as to why learning-based approaches are gaining popularity in the robotics community requires briefly analyzing traditional approaches for manipulation, leveraging tools like forward and inverse kinematics (FK, IK) and control theory. Providing a detailed overview of these methods falls (well) out of the scope of this tutorial, and we refer the reader to works including @sicilianoSpringerHandbookRobotics2016, @lynchModernRoboticsMechanics2017, @tedrakeRoboticManipulationPerception, @tedrakeUnderactuatedRoboticsAlgorithms for a much more comprehensive description of these techniques. Here, we mostly wish to highlight the benefits of ML over these traditional techniques.
331
 
332
+ <Image
 
333
  src={ch2_so100_to_planar_manipulator}
334
  zoomable
335
  downloadable
 
336
  alt="Figure"
337
+ caption={'The SO-100 arm is a 6-dof manipulator arm. Preventing some of its joints (shoulder pan, wrist flex and wrist roll) from actuating, it can be represented as a traditional 2-dof planar manipulator (the gripper joint in the end-effector is not considered towards the count of the degrees of freedom used to produce motion).'}/>
 
 
 
338
 
339
  Consider the (simple) case where a SO-100 is restrained from actuating (1) the shoulder pan and (2) the wrist flex and roll motors. This effectively reduces the degrees of freedom of the SO-100 from the original 5+1 (5 joints + 1 gripper) to 2+1 (shoulder lift, elbow flex + gripper). As the end-effector does not impact motion in this model, the SO-100 is effectively reduced to the planar manipulator robot presented in Figure <a href="#make-so100-planar-manipulator" data-reference-type="ref" data-reference="make-so100-planar-manipulator">[make-so100-planar-manipulator]</a>, where spheres represent actuators, and solid lines indicate length-$l$ links from the base of the SO-100 to the end-effector (*ee*).
340
 
 
412
 
413
  <div class="wrapfigure">
414
 
415
+ <Image
416
  src={ch2_planar_manipulator_floor_box}
417
  zoomable
418
  downloadable
 
419
  alt="image"
420
  />
421
 
 
436
 
437
  Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular applicative problem.
438
 
439
+ <Image
 
440
  src={ch2_classical_limitations}
441
  zoomable
442
  downloadable
 
443
  alt="Figure"
444
+ caption={'Dynamics-based approaches to robotics suffer from several limitations: (1) orchestrating multiple components poses integration challenges; (2) the need to develop custom processing pipelines for the sensing modalities and tasks considered hinders scalability; (3) simplified analytical models of physical phenomena (here friction at the gripper; credits to @antonovaReinforcementLearningPivoting2017) limit real-world performance. Lastly, (4) dynamics-based methods overlook trends in the availability and growth of robotics data.'}/>
 
 
 
445
 
446
  Dynamics-based robotics pipelines have historically been <mark>developed sequentially, engineering the different blocks</mark> now within most architectures for specific purposes. That is, sensing, state estimation, mapping, planning, (diff-)IK, and low-level control have been traditionally developed as distinct modules with fixed interfaces. Pipelining these specific modules proved error-prone, and brittleness emerges--alongside compounding errors--whenever changes occur (e.g., changes in lighting for sensing, occlusion/failure of sensors, control failures). Adapting such a stack to new tasks or robotic platforms often entails re-specifying objectives, constraints, and heuristics at multiple stages, incurring significant engineering overhead.
447
 
 
464
  TL;DR The need for expensive, high-fidelity simulators can be obviated by learning from real-world data, using sample-efficient algorithms that can safely train directly on hardware.
465
 
466
  </div>
467
+ <Image
 
468
  src={ch3_learning_benefits}
469
  zoomable
470
  downloadable
 
471
  alt="Figure"
472
+ caption={'Learning-based robotics streamlines perception-to-action by learning a (1) unified high-level controller capable of taking (2) high-dimensional, unstructured sensorimotor information. Learning (3) does not require a dynamics model and instead focuses on interaction data, and (4) empirically correlates with the scale of the data used.'}/>
 
 
 
473
 
474
  Learning-based techniques for robotics naturally address the limitations presented in Section <a href="#classical" data-reference-type="ref" data-reference="classical">[classical]</a> (Figure <a href="#robot-learning-upsides" data-reference-type="ref" data-reference="robot-learning-upsides">[robot-learning-upsides]</a>). In particular, learning-based techniques typically rely on monolithic prediction-to-action pipelines (*visuomotor policies*) which directly map sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components. Mapping sensory inputs to actions also makes it possible to incorporate diverse input modalities, leveraging the automatic feature extraction capabilities of modern learning systems. Moreover, learning-based approaches can, in principle, bypass explicit modeling altogether and instead rely solely on interaction data--an advantage that proves transformative when dynamics are difficult to model or entirely unknown. Lastly, learning for robotics (*robot learning*) is naturally well posed to leverage the growing amount of robotics data openly available, just as computer vision and natural language processing did historically benefit from large-scale corpora of data, in great part overlooked by dynamics-based approaches.
475
 
 
477
 
478
  <div class="wrapfigure">
479
 
480
+ <Image
481
  src={ch3_learning_atlas}
482
  zoomable
483
  downloadable
 
484
  alt="image"
485
  />
486
 
 
489
 
490
  In Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> we deliberately include generalist robot models @blackp0VisionLanguageActionFlow2024, @shukorSmolVLAVisionLanguageActionModel2025 alongside task-specific BC methods. While significantly different in spirit--*generalist* models are language-conditioned and use instructions to generate motion valid across many tasks, while *task-specific* models are typically not language-conditioned and used to perform a single task--*foundation* models are still largely trained to reproduce trajectories contained in a (large) training set of input demonstrations. Thus, we argue generalist policies can indeed be grouped alongside other task-specific BC methods, as they both leverage similar training data and schemas. Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> illustrates this categorization graphically, explicitly listing all the robot learning policies currently available in `lerobot`- Action Chunking with Transformers (ACT) @zhaoLearningFineGrainedBimanual2023, Diffusion Policy @chiDiffusionPolicyVisuomotor2024, Vector-Quantized Behavior Transformer (VQ-BeT) @leeBehaviorGenerationLatent2024, $\pi_0$ @blackp0VisionLanguageActionFlow2024, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025, Human-in-the-loop Sample-efficient RL (HIL-SERL) @luoPreciseDexterousRobotic2024 and TD-MPC @hansenTemporalDifferenceLearning2022.
491
 
492
+ <Image
 
493
  src={ch3_rl_examples}
494
  zoomable
495
  downloadable
 
496
  alt="Figure"
497
+ caption={'Examples of two different robotics tasks performed using RL. In the manipulation task (A) an agent learns to reach for a yellow plastic block in its environment, and to put it inside of a box. In the locomotion task (B) an agent learns to move its center of mass sideways without falling.'}/>
 
 
 
498
 
499
  Applications of RL to robotics have been studied long enough that the relationship between these two disciplines has been compared to that of physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their inherently interactive and sequential nature, robotics control problems can be directly cast as RL problems. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> presents two such cases. Reaching for an object to then move it somewhere else in the scene is a sequential problem where over time the controller needs to adjust the position of the robot arm based on the current configuration and the (possibly varying) position of the object. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation- while sliding to the side, the controller needs to keep adjusting to the robot’s to avoid failure (falling).
500
 
 
502
 
503
  The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to tackle robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) capable to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents improve through trial and error, bypassing explicit models of the problem dynamics in favor of interaction data. In RL, this feedback loop between actions and outcomes (Figure <a href="#rl-most-famous-pic" data-reference-type="ref" data-reference="rl-most-famous-pic">[rl-most-famous-pic]</a>) is established through the agent sensing a scalar quantity (*reward*) measuring how desirable a given *transition* is for the accomplishment of its goal.
504
 
505
+ <Image
 
506
  src={ch3_agent_env}
507
  zoomable
508
  downloadable
 
509
  alt="Figure"
510
+ caption={'Agent-Environment interaction diagram (image credits to @suttonReinforcementLearningIntroduction2018).'}/>
 
 
 
511
 
512
  Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP) @bellmanMarkovianDecisionProcess1957. Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through MDP’s inherently stochastic formulation and (2) providing a theoretically-sound framework for learning *without* an explicit model of the environment dynamics. While accommodating a continuous time formulation too, MDPs are typically considered in discrete time in RL, assuming interactions to atomically take place at discrete *timestep* $t=0,1,2,3, \dots, T$. MDPs allowing for an unbounded number of interactions ($T \to + \infty$) are termed *infinite-horizon*, and opposed to *finite-horizon* MDPs in which $T$ is finite. Unless diversely specified, we will only be referring to discrete-time finite-horizon (*episodic*) MDPs.
513
 
 
581
  ```
582
  inducing an ordering over states and state-action pairs under $\pi$, and value functions are thus central to most RL algorithms. A variety of algorithms have been developed in RL attempting to find (approximate) solutions to the problem of maximizing cumulative reward (we report some in Figure <a href="#rl-algos-atlas" data-reference-type="ref" data-reference="rl-algos-atlas">[rl-algos-atlas]</a>).
583
 
584
+ <Image
 
585
  src={ch3_rl_algorithms_atlas}
586
  zoomable
587
  downloadable
 
588
  alt="Figure"
589
+ caption={'Popular RL algorithms. See @SpinningUp2018 for a complete list of citations.'}/>
 
 
 
590
 
591
  Popular approaches to continuous state and action space--such as those studied within robotics--include ,  and . Across manipulation @akkayaSolvingRubiksCube2019 and locomotion problems @leeLearningQuadrupedalLocomotion2020, RL proved extremely effective in providing a platform to (1) leverage a unified, streamlined perception-to-action pipeline, (2) natively integrate propioperception with multi-modal high-dimensional sensory streams (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2025.
592
 
 
596
 
597
  First, especially early in training, <mark>actions are typically explorative, and thus may be erratic</mark>. On physical systems, untrained policies may command high velocities, self-colliding configurations, or torques exceeding joint limits, leading to wear and potential hardware damage. Mitigating these risks requires external safeguards (e.g., watchdogs, safety monitors, emergency stops), often incurring a high degree of human supervision. Further, in the typical episodic setting considered in most robotics problems, experimentation is substantially slowed down by the need to manually reset the environment over the course of training, a time-consuming and error-prone process. Second, learning efficiently remains problematic in RL, <mark>limiting the applicability of RL in real-world robotics due to consequently prohibitive timescales of training</mark>. Even strong algorithms such as SAC @haarnojaSoftActorCriticOffPolicy2018 typically require a large number of transitions $\{ (s_t, a_t, r_t, s_{t+1})\}_{t=1}^N$. On real-world hardware, generating this data is time-consuming.
598
 
599
+ <Image
 
600
  src={ch3_duck_sim_vs_real}
601
  zoomable
602
  downloadable
 
603
  alt="Figure"
604
+ caption={'Simulated (left) vs. real-world (right) OpenDuck. Discrepancies in the simulation dynamics (reality gap) pose risks to policy transfer.'}/>
 
 
 
605
 
606
  Training RL policies in simulation @tobinDomainRandomizationTransferring2017 addresses both issues, eliminating physical risk and dramatically increasing throughput. Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) limiting the possibilities to transfer the policies learned in simulation, due to the discrepancy between real and simulated environments (*reality gap*, Figure <a href="#synthetic-vs-real-duck" data-reference-type="ref" data-reference="synthetic-vs-real-duck">[synthetic-vs-real-duck]</a>). *Domain randomization* @tobinDomainRandomizationTransferring2017 (DR) is a popular technique to overcome the reality gap, and consists in randomizing the parameters of the simulated environment during training, aiming at inducing robustness to specific disturbances. In this, DR is typically employed to increase the diversity of scenarios over the course of training, improving on the performance of sim-to-real transferred policies @akkayaSolvingRubiksCube2019, @antonovaReinforcementLearningPivoting2017, @jiDribbleBotDynamicLegged2023. In practice, DR is performed training in simulation on simulated dynamics $\mathcal D$, further parametrized as $\mathcal D \equiv \mathcal D_\xi$, with a *dynamics* (random) vector $\xi$ drawn from an arbitrary distribution, $\xi \sim \Xi$. For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure <a href="#ducks-on-terrains" data-reference-type="ref" data-reference="ducks-on-terrains">[ducks-on-terrains]</a>), or the center of mass of an object for a manipulation task. Over the course of training--typically at each episode’s reset--a new $\xi$ is drawn, and used to specify the environment’s dynamics for that episode.
607
 
608
+ <Image
 
609
  src={ch3_many_ducks}
610
  zoomable
611
  downloadable
 
612
  alt="Figure"
613
+ caption={'The same locomotion task can be carried out in different (simulated) domains (exemplified by the difference in terrains) at training time, resulting in increased robustness over diverse environment dynamics.'}/>
 
 
 
614
 
615
  While effective in transferring policies across the reality gap in real-world robotics @tobinDomainRandomizationTransferring2017, @akkayaSolvingRubiksCube2019, @jiDribbleBotDynamicLegged2023, @tiboniDomainRandomizationEntropy2024, DR often requires extensive manual engineering. First, identifying which parameters to randomize--i.e., the *support* $\text{supp} (\Xi)$ of $\Xi$--is an inherently task specific process. When locomoting over different terrains, choosing to randomize the friction coefficient is a reasonable choice, yet not completely resolutive as other factors (lighting conditions, external temperature, joints’ fatigue, etc.) may prove just as important in practice, making selecting these parameters yet another source of brittleness.
616
 
 
706
 
707
  Lastly, in order to improve on the robustness of their approach to different goals while maintaining practical scalability, @luoSERLSoftwareSuite2025 introduced a modified state and action space, expressing proprioceptive configurations $q$ and actions $\dot q$ in the frame of the end-effector pose at $t=0$. Randomizing the initial pose of the end-effector ($s_0$), @luoSERLSoftwareSuite2025 achieved a similar result to that of manually randomizing the environment at every timestep, but with the benefit of maintaining the environment in the same condition across multiple training episodes, achieving higher scalability of their method thanks to the increased practicality of their approach.
708
 
709
+ <Image
 
710
  src={ch3_hil_serl_examples}
711
  zoomable
712
  downloadable
 
713
  alt="Figure"
714
+ caption={'(A) HIL-SERL allows for real-world training of high performance RL agents by building on top of advancements presented by SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on a real-world SO-100.'}/>
 
 
 
715
 
716
  Building on off-policy deep Q-learning with replay buffers, entropy regularization for better exploration, expert demonstrations to guide learning, and a series of tools and recommendations for real-world training using reward classifiers (Figure <a href="#hil-serl-blocks" data-reference-type="ref" data-reference="hil-serl-blocks">[hil-serl-blocks]</a>), @luoPreciseDexterousRobotic2024 introduce human interactions during training, learning near-optimal policies in challenging real-world manipulation tasks in 1-2 hours.
717
 
 
719
 
720
  #### Code Example- Real-world RL
721
 
722
+ <Image
 
723
  src={ch3_hil_serl_architecture}
724
  zoomable
725
  downloadable
 
726
  alt="Figure"
727
+ caption={'HIL-SERL is a SOTA RL algorithm for training control policies directly in the real-world. Its implementation in lerobot relies on a decoupled actor-learner architecture, communicating over processes (and possibly networks) with queues used to share (1) transitions (s_t, a_t, r_t, s_{t+1}) and (2) parameters θ.'}/>
 
 
 
728
 
729
  This example shows how to use the HIL-SERL implementation supported by `lerobot`. This code example is organized into four parts: we first show how to train a reward classifier from a custom set of demonstrations, then define the `Actor` and `Learner` components, and finally, we bring them together in a complete script showing how to use HIL-SERL in practice.
730
 
 
994
  TL;DR Behavioral Cloning provides a natural platform to learn from real-world interactions without the need to design any reward function, and generative models prove more effective than point-wise policies at dealing with multimodal demonstration datasets.
995
 
996
  </div>
997
+ <Image
 
998
  src={ch4_bc_trajectories}
999
  zoomable
1000
  downloadable
 
1001
  alt="Figure"
1002
+ caption={'(A) Average (with standard deviation) evolution of the actuation levels over the first 5 recorded episodes in lerobot/svla_so101_pickplace. Proprioceptive states prove invaluable to determine the robot’s state during an episode. (B) Camera frames are also recorded alongside measurements on the robot’s state, capturing information about the robot’s interaction with its environment.'}/>
 
 
 
1003
 
1004
  Learning from human demonstrations provides a pragmatic alternative to the RL pipeline discussed in Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>. Indeed, especially in real-world robotics, online exploration is typically <mark>costly and potentially unsafe</mark>, and designing (dense) reward signals is a <mark>brittle and task-specific</mark> process. Further, even success detection itself often requires bespoke instrumentation, while episodic training demands reliable resets--all factors complicating training RL algorithms on hardware at scale. Behavioral Cloning (BC) sidesteps these constraints by <mark>casting control as an imitation learning problem</mark>, leveraging previously collected expert demonstrations to anchor the learned autonomous behavior. Most notably, by *learning-to-imitate*, autonomous systems naturally adhere to the objectives, preferences, and success criteria implicitly encoded in the data, which reduces early-stage exploratory failures and obviates hand-crafted reward shaping altogether.
1005
 
1006
  Formally, let $\mathcal D = \{ \tau^{(i)} \}_{i=1}^N$ be a set of expert trajectories, with $\tau^{(i)} = \{(o_t^{(i)}, a_t^{(i)})\}_{t=0}^{T_i}$ representing the $i$-th length-$T_i$ trajectory in $\mathcal D$, $o_t \in \mathcal O$ denoting observations (e.g., images and proprioception altogether), and $a_t \in \mathcal A$ the expert actions. Typically, observations $o \in \mathcal O$ consist of both image and proprioceptive information, while actions $a \in \mathcal A$ represent control specifications for the robot to execute, e.g. a joint configuration. Note that differently from Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>, in the imitation learning context $\mathcal D$ denotes an offline dataset collecting $N$ length-$T_i$ reward-free (expert) human trajectories $\tau^{(i)}$, and *not* the environment dynamics. Similarly, in this section $\tau^{(i)}$ represents a length-$T_i$ trajectory of observation-action pairs, which crucially *omits entirely any reward* information. Figure <a href="#ch4-bc-trajectories" data-reference-type="ref" data-reference="ch4-bc-trajectories">[ch4-bc-trajectories]</a> graphically shows trajectories in terms of the average evolution of the actuation on the 6 joints of a teleoperated SO-100 manipulator. Notice how proprioceptive states are captured jointly with camera frames over the course of the recorded episodes, providing a unified high-frame rate collection of both image and joint teleoperation data. Figure <a href="#ch4-observation-action-mapping" data-reference-type="ref" data-reference="ch4-observation-action-mapping">[ch4-observation-action-mapping]</a> shows $(o_t, a_t)$-pairs for the same dataset, with the actions performed by the human expert illustrated alongside the corresponding observation. 
In principle, (expert) trajectories $\tau^{(i)}$ can have different lengths since demonstrations might exhibit multi-modal strategies to attain the same goal, resulting in multiple, different behaviors.
1007
 
1008
+ <Image
 
1009
  src={ch4_observation_action_mapping}
1010
  zoomable
1011
  downloadable
 
1012
  alt="Figure"
1013
+ caption={'Sample observations and action pairs over the course of a given trajectory recorded in lerobot/svla_so101_pickplace. Observations, comprising both proprioceptive and visual information, are recorded alongside the configuration of a second, leader robot controlled by a human expert, providing complete information for regressing actions given observations.'}/>
 
 
 
1014
 
1015
  Behavioral Cloning (BC) @pomerleauALVINNAutonomousLand1988 aims at producing synthetic behaviors by learning the mapping from observations to actions, and in its most natural formulation can be effectively tackled as a *supervised* learning problem, consisting of learning the (deterministic) mapping $f: \mathcal O\mapsto \mathcal A, \ a_t = f(o_t)$ by solving
1016
  ``` math
 
1022
 
1023
  Despite the inherent challenges of learning from non-i.i.d. data, the BC formulation presents several operational advantages in robotics. First, training happens offline and naturally accommodates expert demonstration data, thereby severely limiting exploration risks by preventing the robot from performing dangerous actions altogether, by anchoring action in imitation. Second, reward design is entirely unnecessary in BC, as demonstrations already reflect human intent. The absence of rewards also prevents the risk of misalignment and specification gaming (*reward hacking*), otherwise inherent in purely reward-based RL @heessEmergenceLocomotionBehaviours2017. Third, because expert trajectories encode terminal conditions, success detection and resets are implicit in the dataset. Finally, empirical evidence suggests the performance of BC scales naturally with growing corpora of demonstrations collected across tasks, embodiments, and environments. Nonetheless, BC can, in principle, only reproduce behaviors that are at best as good as those of the demonstrator, and therefore offers no remedy for the suboptimal decisions that humans may enact. This limitation is particularly problematic in sequential decision-making tasks where expert demonstrations are scarce---either because data collection is costly or because human performance is inherently suboptimal. Yet, many robotics applications still benefit from relatively inexpensive pipelines for collecting high-quality human-generated trajectories, justifying the use of BC in such settings.
1024
 
1025
+ <Image
 
1026
  src={ch4_issues_with_bc}
1027
  zoomable
1028
  downloadable
 
1029
  alt="Figure"
1030
+ caption={'Point-wise policies suffer from limitations due to (A) covariate shifts and (B) poor approximation of multimodal demonstrations. (A) Small errors may drive the policy out of distribution, incurring a vicious circle ultimately resulting in failure. (B) Both modes of reaching for a target object in the scene--either left or right-first--are equally good and thus equally likely to be present in a dataset of human demonstrations, ultimately resulting in multimodal demonstrations.'}/>
 
 
 
1031
 
1032
  While conceptually elegant, *point-estimate policies* $f : \mathcal O\mapsto \mathcal A$ learned by solving eq. <a href="#loss-minimization-SL" data-reference-type="ref" data-reference="loss-minimization-SL">[loss-minimization-SL]</a> have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a> illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $\mathcal D$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $\epsilon$-prediction errors $0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon$ can quickly drive the policy into out-of-distribution states, incurring less confident generations and thus compounding errors (Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving real-world robotics problems, as multiple trajectories can be equally good towards the accomplishment of a goal (e.g., symmetric grasps, Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the *generative model* $p(o, a)$ underlying the samples in $\mathcal D$, rather than explicitly learning a prediction function $f: a = f(o)$.
1033
 
 
1037
 
1038
  #### Variational Auto-Encoders
1039
 
1040
+ <Image
 
1041
  src={ch4_task_effect_on_pairs}
1042
  zoomable
1043
  downloadable
 
1044
  alt="Figure"
1045
+ caption={'Intuitively, the latent variable in a single-latent model may contain information regarding the task being performed, which directly results in the likelihood of the same observation-action pair being different for two different tasks. When (A) picking a block the likelihood of a wide gripper’s opening should be higher than that of a narrower one, while it should be the opposite when (B) pushing the block.'}/>
 
 
 
1046
 
1047
  A common inductive bias used in GM posits that samples $(o,a)$ are influenced by an unobservable latent variable $z \in Z$, resulting in:
1048
  ``` math
 
1050
  ```
1051
  Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics application, $z$ could be interpreted as some high level representation of the underlying task being performed by the human demonstrator. In such case, treating $p(o,a)$ as a marginalization over $\operatorname{supp}({Z})$ of the complete joint distribution $p(o,a,z)$ natively captures the effect different tasks have on the likelihood of observation-action pairs. Figure <a href="#ch4-task-effect-on-pairs" data-reference-type="ref" data-reference="ch4-task-effect-on-pairs">[ch4-task-effect-on-pairs]</a> graphically illustrates this concept in the case of a (A) picking and (B) pushing task, for which, nearing the target object, the likelihood of actions resulting in opening the gripper--the higher $q_6$, the wider the gripper’s opening--should intuitively be (A) high or (B) low, depending on the task performed. While the latent space $Z$ typically has a much richer structure than the set of all actual tasks performed, eq. <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> still provides a solid framework to learn joint distributions conditioned on unobservable yet relevant factors. Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a> represents this latent-variable framework in the context of a robotics application: the true, $z$-conditioned generative process assigns *likelihood* $p((o,a) \vert z)$ to the single $(o,a)$-pair. Using Bayes’ theorem, one can reconstruct the *posterior* distribution on $\operatorname{supp}({Z})$, $p_\theta(z \vert o,a)$, from the likelihood $p_\theta(o,a \vert z)$, *prior* $p_\theta(z)$ and *evidence* $p_\theta(o,a)$. VAEs approximate the latent variable model presented in eq. 
<a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> using an *approximate posterior* $q_\phi(z \vert o,a)$ while regressing parameters for a parametric likelihood, $p_\theta(o,a \vert z)$ (Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a>).
1052
 
1053
+ <Image
 
1054
  src={ch4_latent_variable_model}
1055
  zoomable
1056
  downloadable
 
1057
  alt="Figure"
1058
+ caption={'(A) The latent variable model in a robotics application regulates influence between observed (o, a) variables and an unobservable latent variable. (B) VAEs approximate exact latent variable models by means of variational inference.'}/>
 
 
 
1059
 
1060
  Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can be written as:
1061
  <span id="evidence-definition-1" style="position: absolute;">
 
1144
  ```
1145
  where we explicitly showed the marginalization over the multiple latents in eq. <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>, and used the law of conditional probability and Markov property in eq. <a href="#BC-multi-latent-model-2" data-reference-type="ref" data-reference="BC-multi-latent-model-2">[BC-multi-latent-model-2]</a>. Also, for ease of notation, we will refer to observation-action pairs $o,a$ as $z_0$.
1146
 
1147
+ <Image
 
1148
  src={ch4_many_latents}
1149
  zoomable
1150
  downloadable
 
1151
  alt="Figure"
1152
+ caption={'HMLV models posit the data generation process is influenced by a stack of Markov-dependent latent variables, with samples from the posterior distribution being progressively higher up in the hierarchy.'}/>
 
 
 
1153
 
1154
  Similar to VAEs, it is generally not possible to assign an *exact* interpretation to the latent variables. Nevertheless, a reasonable application-driven intuition is that Hierarchical Markov Latent Variable (HMLV) models, by capturing hierarchical and decoupled interactions among latent variables, can reflect the different resolutions at which conditioning factors intervene. For example, in a robotics setting, one might naturally distinguish between high-level trajectory planning (higher up in the hierarchy, $t \to T$) and fine-grained motion adjustments (closer to empirical observations, $t \to 0$). In that, HMLV models thus provide a framework to perform variational inference via multiple, sequential sampling steps from different higher level distributions instead of approximating the generative process with a single-latent variable model. DMs are a particular instantiation of HMLV models for which the posterior is fixed to $q( z_t \vert z_{t-1}) = \mathcal N(z_{t-1} \sqrt{1-\beta_t}, \beta_t \mathbf{I})$, for a given $\beta_t \in \mathbb R^+$. In practice, $\beta_t$ is used to iteratively reduce the signal-to-noise ratio along the latents’ hierarchy, similarly to how a diffusion process influences the information of a physical system.
1155
 
 
1199
  ```
1200
  where the former term is equivalent to the reconstruction term in eq. <a href="#VAE-min-neg-ELBO" data-reference-type="ref" data-reference="VAE-min-neg-ELBO">[VAE-min-neg-ELBO]</a> and the latter term can be obtained in closed form.
1201
 
1202
+ <Image
 
1203
  src={ch4_diffusion_robot_actions}
1204
  zoomable
1205
  downloadable
 
1206
  alt="Figure"
1207
+ caption={'DMs iteratively corrupt samples (left) from an unknown distribution into a quasi-standard Gaussian (center), learning the displacement field (right) that permits to reconstruct samples from the unknown target distribution by iteratively denoising samples of a tractable, easy-to-sample distribution.'}/>
 
 
 
1208
 
1209
  Besides mathematical tractability of eq. <a href="#diffusion-likelihood-gradient" data-reference-type="ref" data-reference="diffusion-likelihood-gradient">[diffusion-likelihood-gradient]</a>, adopting Gaussian posteriors allows for a particularly intuitive interpretation of the training dynamics of DMs @permenterInterpretingImprovingDiffusion2024. As the hierarchical latent variables are repeatedly corrupted by applying increasingly more Gaussian noise, they progressively lose information about the original (unknown) sample $z_0$, converging toward a standard Gaussian which eventually contains no information at all (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>). Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a> illustrates this process on a simplified, bidimensional observation-action distribution, where we considered $o=q_2$ and $a=q^h_2$, with $q_2$ denoting the robot’s *elbow flex* actuation and $q^h_2$ the corresponding human teleoperator’s elbow flex. Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Notice how corrupted samples distribute differently from the most reasonable structure $a \simeq o$, further underscoring how diffusion corrupts both the individual samples and the global distribution (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, left and center). 
In this, using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples. Comparing the diffused samples to the original data points, one can derive an estimate of the total displacement induced by the diffusion process, and, under the assumption that the likelihood of the totally diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by *learning to reverse* such displacement. This key intuition allows one to write a simplified training objective[^4]:
1210
  <span id="diffusion-simplified-loss" style="position: absolute;">
 
1220
  \end{align}
1221
  ```
1222
 
1223
+ <Image
 
1224
  src={ch4_action_vs_observation_distribution}
1225
  zoomable
1226
  downloadable
 
1227
  alt="Figure"
1228
+ caption={'A joint action-observation distribution, in the simplified case where the observation is the elbow-flex actuation in a SO-100, and the action is the recorded position for the same joint from the teleoperator arm. The motion recorded being teleoperated, the points distribute along the diagonal.'}/>
 
 
 
1229
 
1230
  In this simplified (minimization) objective, the optimization process differs from eq. <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
1231
 
 
1259
  ```
1260
  Conditional vector fields are defined not only over their argument $z$ and time $t$, but do also vary with respect to an auxiliary variable $z_0$, thereby extending the standard notion of a vector field to incorporate additional conditioning. Note that the traditional discrete-time noise-scheduler $\{\beta_t\}_{t=0}^T$ is now generalized to a continuous map $\beta : [0,1] \mapsto \mathbb R^+$. Crucially, @lipmanFlowMatchingGenerative2023 prove that by exclusively optimizing the vector field for individual data points $z_0 \in \mathcal D$, one also retrieves the optimal flow to morph the entire support of the initial distribution $p_0$ into $p_1 \ \text{s.t.} \mathcal D \sim p_1$.
1261
 
1262
+ <Image
 
1263
  src={ch4_normalizing_flows}
1264
  zoomable
1265
  downloadable
 
1266
  alt="Figure"
1267
+ caption={'Probability distributions can be modified differently by applying different vector fields, inducing different flows of mass across the same support (top versus bottom, using two different time-invariant 2D-fields $u_1(x,y) = (x, 0)$ and $u_2(x,y) = (x/\sqrt{2}, y/\sqrt{2})$). Notice time flows continuously in [0, 1]. FM models learn to approximate a target vector field, thereby producing arbitrary (goal) transformations of an easy-to-sample initial distribution.'}/>
 
 
 
1268
 
1269
  While the noising schedule of DMs results in a stochastic process resembling a random (Brownian) walk, FM allows for more general--potentially, deterministic--likelihood and posterior parametrization. In the FM literature the likelihood and posterior probability densities defined along a HMLV model are typically referred to as a *probability path*, where the distributions for successive adjacent transitions in the HMLV model are related by the (normalized) flow between them (Figure <a href="#ch4-normalizing-flows" data-reference-type="ref" data-reference="ch4-normalizing-flows">[ch4-normalizing-flows]</a>). The inherent flexibility of FM is one of its key advantages over DMs, as it opens up the possibility of *learning* more efficient paths. For instance, one can design probability paths inspired by Optimal Transport (OT), a mathematical framework concerned with characterizing the most efficient morphings between probability distributions. Probability paths obtained through OT tend to be *straighter* than diffusion paths (Figure <a href="#ch4-diffusion-paths-versus-fm" data-reference-type="ref" data-reference="ch4-diffusion-paths-versus-fm">[ch4-diffusion-paths-versus-fm]</a>), which can lead to faster and more stable training, as well as empirically result in higher-quality generations with fewer denoising steps at inference time. In particular, by avoiding unnecessary backtracking associated with the inherent stochastic nature of both the noising and denoising process in DMs, test-time compute is typically significantly reduced in FM, while retaining comparable results @lipmanFlowMatchingGenerative2023.
1270
 
1271
+ <Image
 
1272
  src={ch4_diffusion_vs_flowmatching}
1273
  zoomable
1274
  downloadable
 
1275
  alt="Figure"
1276
+ caption={'Compared to diffusion, flow matching distorts the distribution along a less random pattern, resulting in a clearer interpolation between source and target distribution. The visualization shows an example comparison between these two methods on the joint distribution of robot observations and actions over T = 50 steps.'}/>
 
 
 
1277
 
1278
  In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in eq. <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce an arbitrary mass displacement, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, Conditional FM (CFM) defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, which in turn results in the target vector field $u(t, z_t) = z_1 - z_0$. FM models can then be trained with a simple regression objective defined as:
1279
  <span id="flow-matching-objective" style="position: absolute;">
 
1313
 
1314
  In ACT (Figure <a href="#ch4-act" data-reference-type="ref" data-reference="ch4-act">[ch4-act]</a>), inference for a given observation $o \in \mathcal O$ could be performed by (1) defining a prior $p_\omega(z \vert o)$ for the latent variable $z$ and (2) decoding an action chunk from a sampled latent $z \sim p_\omega(\bullet \vert o)$, similarly to how sampling from standard VAEs takes place, with the exception that vanilla VAEs typically pose $p(z\vert o) \equiv p(z) \sim \mathcal N(\mathbf{0}, \mathbf{I})$ and thus skip (1).
1315
 
1316
+ <Image
 
1317
  src={ch4_act_encoder}
1318
  zoomable
1319
  downloadable
 
1320
  alt="Figure"
1321
+ caption={'The CVAE encoder used in ACT. Input action chunks are first embedded and aggregated with positional embeddings, before being processed alongside embedded proprioceptive information, and a learned [CLS] token used to aggregate input level information, and predict the style variable z. The encoder is exclusively used to train the decoder, and it is entirely disregarded at inference time.'}/>
 
 
 
1322
 
1323
  However, the authors claim that using a deterministic procedure to sample $z$ benefits policy evaluation, and thus avoid using the conditional prior at all at inference time, effectively using the CVAE framework exclusively to train a more expressive decoder. At test time, @zhaoLearningFineGrainedBimanual2023 propose simply using $z = \mathbf{0}$, as the conditional prior on $z$ used in training is set to be a standard Gaussian. Further, conditioning on the observation $o$ is achieved through explicitly feeding proprioperceptive and visual observations to the decoder, $p_\theta(a \vert z, o)$ at test time. If at inference $z$ is sampled from a standard Gaussian, during training $z$ is sampled from an approximate posterior distribution $q_\phi(z \vert o, a)$, which, however, disregards image observations and exclusively uses proprioperceptive states to form $o$ for efficiency reasons.
1324
 
1325
+ <Image
 
1326
  src={ch4_act_decoder}
1327
  zoomable
1328
  downloadable
 
1329
  alt="Figure"
1330
+ caption={'The CVAE decoder used in ACT, comprising a full encoder-decoder Transformer architecture. Camera observations from all n camera views are first embedded using pre-trained visual encoders, and then aggregated with the corresponding positional embeddings. Then, the proprioperceptive information and style variable z retrieved from the CVAE encoder are fed to the encoder-decoder Transformer for inference. The encoder shares the matrices K, V with the decoder, and is trained to decode fixed position embeddings into action chunks.'}/>
 
 
 
1331
 
1332
  #### Code Example: Training and Using ACT in Practice
1333
 
1334
+ <Image
 
1335
  src={ch4_act}
1336
  zoomable
1337
  downloadable
 
1338
  alt="Figure"
1339
+ caption={'Action Chunking with Transformer (ACT), as in @zhaoLearningFineGrainedBimanual2023. ACT introduces an action chunking paradigm to cope with high-dimensional multi-modal demonstration data, and a transformer-based CVAE architecture.'}/>
 
 
 
1340
  <div class="pbox">
1341
 
1342
  Training ACT
 
1475
  ```
1476
  Note how in eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a> the noise regressor is conditioned on both the latent variable rank $t$ *and* on a stack of previous observations $o_{t-H_o:t}$. @chiDiffusionPolicyVisuomotor2024 claim the combination of (1) conditioning on a horizon of previous observations and (2) predicting multiple actions into the future allows DP to *commit to specific modes* in the data at inference time, which proves essential for good performance and avoiding indecisiveness.
1477
 
1478
+ <Image
 
1479
  src={ch4_diffusion_policy}
1480
  zoomable
1481
  downloadable
 
1482
  alt="Figure"
1483
+ caption={'The Diffusion Policy architecture, as in @chiDiffusionPolicyVisuomotor2024. A stack of $H_o$ previous observations is used as external conditioning to denoise a group of $H_a$ actions. Conditioning is performed at every layer of a U-Net block. Diffusion Policy allows to obtain fully-formed action chunks with as few as T = 10 denoising steps.'}/>
 
 
 
1484
 
1485
  Figure <a href="#diffusion-policy-architecture" data-reference-type="ref" data-reference="diffusion-policy-architecture">[diffusion-policy-architecture]</a> shows the convolution-based version of the architecture proposed by @chiDiffusionPolicyVisuomotor2024, illustrating inference on a single sample drawn from $\mathcal D$, for simplicity. The starting, arbitrarily noisy chunk of $H_a$ actions $\tilde a_{t:t+H_a}$ is first mapped to a (learned) high-dimensional space. Similarly, both image observations and poses are also embedded before being aggregated to the action embeddings. Then, a U-Net @ronnebergerUNetConvolutionalNetworks2015 is trained to regress the noise added into $\tilde a_{t:t+H_a}$, conditioned on observation information at every layer, thus seeking to optimize eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a>. At inference time, the noise predictor is used to predict the quantity of noise at every $t \in [T, \dots, 0 ]$ and iteratively subtract it from $\tilde a_{t:t+H_a}$, reversing the diffusion process simulated in training conditioned on $o_{t-H_o:t}$ to predict $a_{t:t+H_a}$.
1486
 
 
1617
 
1618
  One can use the fact that policies output multiple actions at the same time to directly address (1) the lack of adaptiveness and (2) the presence of lags at runtime by decoupling action chunk *prediction* $\mathbf{A}$ from action *execution* $a_t \gets \text{PopFront}(\mathbf{A}_t)$. This decoupled stack, which we refer to as *asynchronous* (async) inference (<a href="#alg-async-inference" data-reference-type="ref" data-reference="alg-async-inference">[alg-async-inference]</a>), also enables optimized inference by allowing action-chunk inference to run on a separate machine, typically equipped with better computational resources than the ones onboard a robot. In async inference, a $\text{RobotClient}$ sends an observation $o_t$ to a $\text{PolicyServer}$, receiving an action chunk $\mathbf{A}_t$ once inference is complete (Figure <a href="#ch4-async-inference" data-reference-type="ref" data-reference="ch4-async-inference">[ch4-async-inference]</a>). In this, we avoid execution lags by triggering chunk prediction while the control loop is still consuming a previously available chunk, aggregating the previous and incoming chunks whenever the latter is available to the $\text{RobotClient}$. In turn, async-inference efficiently tightens the loop between action prediction and action execution, by increasing the frequency at which observations are processed for chunk prediction while not running inference at every timestep. Crucially, decoupling action prediction from action execution also allows to allocate more computational resources on a remote policy server sending actions to the robot client over the network.
1619
 
1620
+ <Image
 
 
1621
  src={ch4_async_inference}
1622
  zoomable
1623
  downloadable
 
1624
  alt="Figure"
1625
+ caption={'Asynchronous inference. Illustration of the asynchronous inference stack. Note that the policy can be run on a remote server, possibly with GPUs.'}/>
 
 
 
 
1626
  <div class="algorithm">
1627
 
1628
  <span id="alg-async-inference" style="position: absolute;"></span>
 
1647
 
1648
  - **Sync-inference limit $(g=1)$.** As an extreme case, and in keeping with @zhaoLearningFineGrainedBimanual2023, an observation is sent at *every* timestep. The queue is therefore almost always filled, with only a minor saw-tooth due to $\Delta t/\mathbb E[\ell_s] < 1$. While maximally reactive, this setting incurs one forward pass per control tick and can prove prohibitively expensive on limited hardware. Importantly, because the client is consuming actions while the server computes the next chunk, the available queue never gets entirely filled.
1649
 
1650
+ <Image
 
 
1651
  src={ch4_queues}
1652
  zoomable
1653
  downloadable
 
1654
  alt="Figure"
1655
+ caption={'Action queue size evolution at runtime for various levels of g when (A) not filtering out observations based on joint-space similarity and (B) filtering out near-duplicate observations, measuring their similarity in joint-space.'}/>
 
 
 
 
1656
 
1657
  Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> emphasizes the trade-off governed by $g$: small values of $g$ result in idle periods, whereas $g\approx 1$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $g\in(0,1)$ allows to strike a balance between reactivity and resource budgets. If not for the aforementioned similarity filter, the $\text{RobotClient}$ would send observations for processing every $(1 - g) H_a \cdot \Delta t$ seconds, receiving a new chunk of actions every $(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]$, on average. The presence of the filter for observation similarity dilates this processing time, and serves the scope of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. In particular, Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> results in a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue is empty.
1658
 
 
1791
 
1792
  The advent of large models trained on internet-scale datasets has drastically influenced fields like Computer Vision (CV) and Natural Language Processing (NLP), shifting the previously task-specific paradigm towards combining (1) an initial, task-agnostic large-scale pre-training stage and a (2) task-specific, adjustment phase. This *pre-train-and-adapt* paradigm has now largely replaced more classic approaches consisting of task-specific data collection, curation and model training in many subdomains within CV and NLP, and it is motivated by the main drawback of limited scalability for *task-specific approaches*, which have been traditionally more labor intensive. Factors including (1) the advancements in generalist models learned with self-supervision for perception @oquabDINOv2LearningRobust2024 or semantic understanding @devlinBERTPretrainingDeep2019 and (2) the popularization of collective efforts to aggregate large-scale openly available datasets @oneillOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 are increasingly pushing the field of robot learning towards the pre-train-and-adapt paradigm. This shift taps into the long-standing challenge of developing generalist robot policies, and holds the promise to surpass traditionally siloed approaches to robotics problems and develop a *foundation robotics model*. While Section <a href="#learning-imitation" data-reference-type="ref" data-reference="learning-imitation">[learning-imitation]</a> introduced methods for learning *single-task policies* such as ACT or Diffusion Policy, in this section we present advancements in developing *generalist, multi-task, policies*, capable of performing a wide range of tasks across different environments and embodiments, and guided by unstructured instructions typically given in plain, natural language.
1793
 
1794
+ <Image
 
1795
  src={ch5_ml_vs_robotics_foundation}
1796
  zoomable
1797
  downloadable
 
1798
  alt="Figure"
1799
+ caption={'Fields within ML such as Computer Vision and NLP converged on the development of foundation models, trained on a variety of large-scale datasets and capable of performing multiple downstream tasks (top). Conversely, robotics suffered from limited standardization in terms of the architectures used, and siloed, task specific datasets, incurring a high degree of fragmentation which traditionally hindered the development of generalist models for robotics in favour of task-specific models (bottom).'}/>
 
 
 
1800
 
1801
  ### Preliminaries: Models and Data
1802
 
1803
  The remarkable success of foundation models in NLP and CV seems to be increasingly predicated on two core principles: architectural innovation and (joint) data-compute scaling. Indeed, the transformer architecture proved very effective in capturing long-range dependencies in a variety of data formats, and its stability and expressivity made it the *de facto* standard for modern large-scale models trained on internet-scale datasets. However, in stark contrast with large-scale NLP and CV datasets @raffelExploringLimitsTransfer2023, @ImageNet_VSS09, robotics has historically developed around small, task-specific datasets. In turn, this traditionally hindered scalability across problems as well as results, posing concrete challenges to developing general-purpose robot learning algorithms. Indeed, differently from the wealth of relatively readily-available task-agnostic text and image datasets on the internet, robotics data is *intrinsically embodied* and thus task-specific: datasets collected for *manipulation* differ significantly from *locomotion*. In particular, since each expert trajectory is tied to a specific robot platform and the operating conditions of its environment and task, data heterogeneity has long posed a *methodological* challenge for scaling robotics datasets via aggregation. Further, datasets consisting of expert demonstrations are (1) intrinsically more expensive to collect and (2) notoriously heterogeneous--different human experts may perform the same task in very different ways. Beyond this, heterogeneity also raises *conceptual* issues: naively mixing data across embodiments can induce negative transfer, as control strategies developed in isolation for different robot systems in different environments may even conflict when combined. 
Thus, the high degree of fragmentation of robotics datasets and tasks has traditionally led to the development of *specialist* policies, trained on small, task-specific datasets, developed to perform well at their designated task but that fail to generalize to new deployment scenarios (Figure <a href="#ch5-ml-vs-robotics-foundation" data-reference-type="ref" data-reference="ch5-ml-vs-robotics-foundation">[ch5-ml-vs-robotics-foundation]</a>).
1804
 
1805
+ <Image
 
1806
  src={ch5_generalist_policies_timeline}
1807
  zoomable
1808
  downloadable
 
1809
  alt="Figure"
1810
+ caption={'Early efforts in the development of generalist models for robotics include BC-Zero @jangBCZZeroShotTask2022, RT-1 @brohanRT1RoboticsTransformer2023, and RT-2 @brohanRT2VisionLanguageActionModels2023: large scale models trained on thousands of demonstrations. The open release of the Open-X @oneillOpenXEmbodimentRobotic2025 and DROID datasets @khazatskyDROIDLargeScaleInTheWild2025 fostered the development of open source models: OpenVLA @kimOpenVLAOpenSourceVisionLanguageAction2024, π 0 @blackp0VisionLanguageActionFlow2024 and SmolVLA @shukorSmolVLAVisionLanguageActionModel2025.'}/>
 
 
 
1811
 
1812
  Driven by the goal of developing generalist robot policies, the research community has increasingly explored how insights and techniques from other areas of ML can be integrated into robotics. Figure <a href="#ch5-generalist-policies-timeline" data-reference-type="ref" data-reference="ch5-generalist-policies-timeline">[ch5-generalist-policies-timeline]</a> shows a timeline of some of the most popular contributions attempting to develop generalist policies. Starting from BC-Zero, a latent variable model trained on 25k+ demonstrations, the field has now evolved into $\pi_0$, a transformer-based model trained on 10M+ demonstrations and exhibiting strong few-shot capabilities across tasks and embodiments. In between, Robotics Transformer 1 (RT-1) @brohanRT1RoboticsTransformer2023 represented a significant step in the direction of developing generalist robot policies over prior work including (1) BC-Zero @jangBCZZeroShotTask2022 and (2) Gato @reedGeneralistAgent2022, in that @brohanRT1RoboticsTransformer2023 use a much larger and more diverse set of training tasks compared to both BC-Zero and Gato. In particular, RT-1 uses a transformer architecture, and is trained on as many as 130k human-recorded trajectories collected over 13 robots and over 17 months. RT-1 learns to process a history of camera images and a natural language instruction, and feeds the resulting sequence of high-dimensional tokens to a transformer, trained using a *classification loss on a discretized action space* consisting of six different 256-bins, one for each joint of a 6-dof robotic arm.
1813
 
 
1817
 
1818
  Despite these advancements, the success of large, proprietary models like RT-1 and RT-2 highlighted a growing accessibility gap in robotics research, as training and deploying large-scale robotics foundation models requires computational resources simply unattainable for most research institutions. The OpenVLA project @kimOpenVLAOpenSourceVisionLanguageAction2024 emerged in direct contrast to traditionally closed-source efforts to develop VLAs. In particular, @kimOpenVLAOpenSourceVisionLanguageAction2024 trained OpenVLA by exclusively leveraging openly available data (970k+ trajectories from the Open-X dataset), and openly shared their training recipes alongside the model weights. Architecturally, OpenVLA integrates a pre-trained vision encoder to project visual tokens into the embedding space of the Llama2-7B @touvronLlama2Open2023 language-model backbone. The language model backbone is then used to predict *discrete action tokens* over 256 activation levels.
1819
 
1820
+ <Image
 
1821
  src={ch5_trends}
1822
  zoomable
1823
  downloadable
 
1824
  alt="Figure"
1825
+ caption={'Robot learning is undergoing a paradigmatic shift: centralized data collections (A, left) are increasingly larger, often comprising millions of demonstrations, while (A, right) decentralized data collection efforts are becoming an alternative for large scale data collection. (B) Generalist models are also becoming increasingly smaller and easier to run on limited hardware.'}/>
 
 
 
1826
 
1827
  Figure <a href="#ch5-trends" data-reference-type="ref" data-reference="ch5-trends">[ch5-trends]</a> shows the current trends in robot learning in terms of size and nature of the robotics datasets contributed, together with the size and accessibility of the available models. As datasets collected via centralized, cross-institutions cooperation of increasing size are made available for the research community, decentralized datasets collected by individual researchers and practitioners also gained traction, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are increasingly becoming much more compute-efficient, and as a result the models’ size has been consistently reducing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments.
1828
 
 
1842
 
1843
  $\pi_0$ @blackp0VisionLanguageActionFlow2024 introduces a VLA based on a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching. Images and language are embedded with PaliGemma, a VLM merging independently encoded visual and textual features deep in the network (*late-fusion*), while proprioceptive state and action chunks are routed to a smaller *action expert*, initialized from scratch. The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and value matrices at each layer, maintaining specialization while efficiently allocating computation.
1844
 
1845
+ <Image
 
1846
  src={ch5_pi0}
1847
  zoomable
1848
  downloadable
 
1849
  alt="Figure"
1850
+ caption={'The π 0 architecture, as in @blackp0VisionLanguageActionFlow2024. Vision and language tokens are routed to a VLM backbone which is prevented from attending to robot proprioceptive states and action tokens, which are instead routed to a smaller subset of weights within the architecture referred to as "action expert". The architecture is trained with Flow Matching on 10M+ trajectories from a mixture of closed and openly available datasets.'}/>
 
 
 
1851
 
1852
  Concretely, $\pi_0$ is a single, unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $f_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple camera viewpoints $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process both the robot proprioceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turn them into query, key and value matrices, and only share information with each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while *across* blocks, future blocks are masked out. 
Formally, this corresponds to using an attention mask like: $\mathbf{A} = \bordermatrix{ & \mathcal{T}_i & \mathcal{T}_q & \mathcal{T}_a \cr \mathcal{T}_i & \mathbf{1} & \mathbf{0} & \mathbf{0} \cr \mathcal{T}_q & \mathbf{1} & \mathbf{1} & \mathbf{0} \cr \mathcal{T}_a & \mathbf{1} & \mathbf{1} & \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block bidirectional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive tokens and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
1853
 
 
1882
 
1883
  <div class="wrapfigure">
1884
 
1885
+ <Image
1886
  src={ch5_pi0_sampling_timesteps}
1887
  zoomable
1888
  downloadable
 
1889
  alt="image"
1890
  />
1891
 
 
1964
 
1965
  With VLAs in the early stage of development compared to more mature LLMs and VLMs, much of the progress made on VLAs remains proprietary, with many releases exclusively sharing the weights while withholding the data used, full experimental details and essential methodological components of training. In contrast with this closed approach, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, which aims at democratizing the developments of robotics foundation models by open sourcing the model alongside the data used as well as the training recipes.
1966
 
1967
+ <Image
 
1968
  src={ch5_smolvla}
1969
  zoomable
1970
  downloadable
 
1971
  alt="Figure"
1972
+ caption={'The SmolVLA architecture, as in @shukorSmolVLAVisionLanguageActionModel2025. SmolVLA is a compact MoE model trained with flow matching to denoise action chunks. Vision and language tokens are fed to a VLM backbone, and share information with the proprioceptive and action tokens via the attention mechanism. The action expert interleaves SA and CA layers for further conditioning on the visual features from the VLM backbone. SmolVLA skips computations and reduces the visual tokens, resulting in 7x less memory usage than π 0 (450M parameters vs. π 0 ’s 3.3B).'}/>
 
 
 
1973
 
1974
  While encouraging efforts like $\pi_0$ @blackp0VisionLanguageActionFlow2024 demonstrate the feasibility of open VLA systems, they remain (1) large and compute-intensive and (2) dependent on closed datasets collected via centralized efforts on costly robotic platforms, which ultimately hinders the accessibility of the method altogether. SmolVLA mitigates both these issues by (1) prioritizing a compact, compute-efficient VLA design and (2) targeting community-contributed datasets on accessible robotic platforms such as the SO-100 and SO-101 arms. Similarly to $\pi_0$, SmolVLA (Figure <a href="#ch5-smolvla" data-reference-type="ref" data-reference="ch5-smolvla">[ch5-smolvla]</a>) employs a MoE architecture combining a pretrained VLM backbone with a dedicated action expert, and trains with flow matching. To ensure efficiency and accessibility, SmolVLA adopts SmolVLM-2 @marafiotiSmolVLMRedefiningSmall2025 as its VLM backbone, considering SmolVLM-2’s reduced size and capability to process multiple image inputs alongside text items. SmolVLM-2 uses SigLIP @zhaiSigmoidLossLanguage2023 as vision encoder, producing visual features for a SmolLM2 language decoder @allalSmolLM2WhenSmol2025. Further, SmolVLA adopts a smaller action expert consisting of $\sim$100M parameters and an interleaved stack of self and cross-attention layers. To improve efficiency, the action expert adopts a reduced embedding dimension compared to the VLM backbone, resulting in $d_{v_\theta} = 0.75 d_{\text{VLM}}$. @shukorSmolVLAVisionLanguageActionModel2025’s design choices thus result in a much smaller size model compared to $\pi_0$, consisting of ca. 450M parameters versus $\pi_0$’s 3.3B parameters.
1975
 
app/src/content/embeds/{banner.html → banner2.html} RENAMED
File without changes