Spaces:

lerobot
/

robot-learning-tutorial

Running

App Files Files Community

tfrere HF Staff committed on Oct 14, 2025

Commit

759d176

1 Parent(s): 2b50d2d

update

Browse files

Files changed (5) hide show

app/scripts/latex-importer/mdx-converter.mjs +123 -17
app/scripts/latex-importer/output/main.mdx +74 -256
app/src/components/Hero.astro +1 -10
app/src/content/article.mdx +74 -256
app/src/content/embeds/{banner.html → banner2.html} +0 -0

app/scripts/latex-importer/mdx-converter.mjs CHANGED Viewed

@@ -222,38 +222,38 @@ ${imagesJson}
 }
 /**
- * Transform images to ResponsiveImage components
  * @param {string} content - MDX content
- * @returns {string} - Content with ResponsiveImage components
  */
 /**
- * Create ResponsiveImage component with import
  * @param {string} src - Clean image source
  * @param {string} alt - Alt text
  * @param {string} id - Element ID
  * @param {string} caption - Figure caption
  * @param {string} width - Optional width
- * @returns {string} - ResponsiveImage component markup
  */
-function createResponsiveImageComponent(src, alt = '', id = '', caption = '', width = '') {
     const varName = generateImageVarName(src);
     imageImports.set(src, varName);
-    usedComponents.add('ResponsiveImage');
     const props = [];
     props.push(`src={${varName}}`);
     props.push('zoomable');
     props.push('downloadable');
     if (id) props.push(`id="${id}"`);
-    props.push('layout="fixed"');
     if (alt) props.push(`alt="${alt}"`);
     if (caption) props.push(`caption={'${caption}'}`);
-    return `<ResponsiveImage\n  ${props.join('\n  ')}\n/>`;
 }
 function transformImages(content) {
-    console.log('  🖼️  Transforming images to ResponsiveImage components with imports...');
     let hasImages = false;
@@ -297,7 +297,7 @@ function transformImages(content) {
             const altText = cleanAltText(cleanCap);
             hasImages = true;
-            return createResponsiveImageComponent(cleanSrc, altText, id, cleanCap);
         }
     );
@@ -309,7 +309,7 @@ function transformImages(content) {
             const cleanAlt = cleanAltText(alt || 'Figure');
             hasImages = true;
-            return createResponsiveImageComponent(cleanSrc, cleanAlt);
         }
     );
@@ -320,7 +320,7 @@ function transformImages(content) {
             const cleanSrc = cleanSrcPath(src);
             hasImages = true;
-            return createResponsiveImageComponent(cleanSrc, 'Figure');
         }
     );
@@ -333,7 +333,7 @@ function transformImages(content) {
             const altText = cleanAltText(cleanCap);
             hasImages = true;
-            return createResponsiveImageComponent(cleanSrc, altText, id, cleanCap);
         }
     );
@@ -346,11 +346,12 @@ function transformImages(content) {
             const altText = cleanAltText(cleanCap);
             hasImages = true;
-            return createResponsiveImageComponent(cleanSrc, altText, id, cleanCap);
         }
     );
-    // 6. Transform Pandoc-style images: ![alt](src){#id attr="value"}
     content = content.replace(
         /!\[([^\]]*)\]\(([^)]+)\)(?:\{([^}]+)\})?/g,
         (match, alt, src, attributes) => {
@@ -364,17 +365,121 @@ function transformImages(content) {
                 if (idMatch) id = idMatch[1];
             }
-            return createResponsiveImageComponent(cleanSrc, cleanAlt, id);
         }
     );
     if (hasImages) {
-        console.log('    ✅ ResponsiveImage components with imports will be created');
     }
     return content;
 }
 /**
  * Transform HTML spans with style attributes to appropriate components
  * @param {string} content - MDX content
@@ -951,6 +1056,7 @@ function processMdxContent(content, latexContent = '') {
     processedContent = cleanMdxSyntax(processedContent);
     processedContent = convertSubfiguresToMultiImage(processedContent);
     processedContent = transformImages(processedContent);
     processedContent = transformStyledSpans(processedContent);
     processedContent = transformHighlightSpans(processedContent);
     processedContent = fixEscapedMarkTags(processedContent);

 }
 /**
+ * Transform images to Image components
  * @param {string} content - MDX content
+ * @returns {string} - Content with Image components
  */
 /**
+ * Create Image component with import
  * @param {string} src - Clean image source
  * @param {string} alt - Alt text
  * @param {string} id - Element ID
  * @param {string} caption - Figure caption
  * @param {string} width - Optional width
+ * @returns {string} - Image component markup
  */
+function createImageComponent(src, alt = '', id = '', caption = '', width = '') {
     const varName = generateImageVarName(src);
     imageImports.set(src, varName);
+    usedComponents.add('Image');
     const props = [];
     props.push(`src={${varName}}`);
     props.push('zoomable');
     props.push('downloadable');
     if (id) props.push(`id="${id}"`);
+    if (width) props.push(`width={${width}}`);
     if (alt) props.push(`alt="${alt}"`);
     if (caption) props.push(`caption={'${caption}'}`);
+    return `<Image\n  ${props.join('\n  ')}\n/>`;
 }
 function transformImages(content) {
+    console.log('  🖼️  Transforming images to Image components with imports...');
     let hasImages = false;
             const altText = cleanAltText(cleanCap);
             hasImages = true;
+            return createImageComponent(cleanSrc, altText, id, cleanCap);
         }
     );
             const cleanAlt = cleanAltText(alt || 'Figure');
             hasImages = true;
+            return createImageComponent(cleanSrc, cleanAlt);
         }
     );
             const cleanSrc = cleanSrcPath(src);
             hasImages = true;
+            return createImageComponent(cleanSrc, 'Figure');
         }
     );
             const altText = cleanAltText(cleanCap);
             hasImages = true;
+            return createImageComponent(cleanSrc, altText, id, cleanCap);
         }
     );
             const altText = cleanAltText(cleanCap);
             hasImages = true;
+            return createImageComponent(cleanSrc, altText, id, cleanCap);
         }
     );
+    // 7. Transform Pandoc-style images: ![alt](src){#id attr="value"}
     content = content.replace(
         /!\[([^\]]*)\]\(([^)]+)\)(?:\{([^}]+)\})?/g,
         (match, alt, src, attributes) => {
                 if (idMatch) id = idMatch[1];
             }
+            return createImageComponent(cleanSrc, cleanAlt, id);
         }
     );
     if (hasImages) {
+        console.log('    ✅ Image components with imports will be created');
     }
     return content;
 }
+/**
+ * Transform figures with Image components that still have separate figcaptions
+ * @param {string} content - MDX content
+ * @returns {string} - Content with Image components using caption props
+ */
+function transformImageFigures(content) {
+    console.log('  🔧 Transforming figures with Image components and separate figcaptions...');
+    let hasTransformed = false;
+    // Transform figures with Image components that still have separate figcaptions
+    content = content.replace(
+        /<figure>\s*<Image([\s\S]*?)\/>\s*<span[^>]*><\/span>\s*<figcaption>([\s\S]*?)<\/figcaption>\s*<\/figure>/gs,
+        (match, imageProps, caption) => {
+            hasTransformed = true;
+            // Clean caption text
+            const cleanCap = caption
+                .replace(/<[^>]*>/g, '')          // Remove HTML tags
+                .replace(/\n/g, ' ')              // Replace newlines with spaces
+                .replace(/\r/g, ' ')              // Replace carriage returns with spaces
+                .replace(/\s+/g, ' ')             // Replace multiple spaces with single space
+                .replace(/'/g, "\\'")             // Escape quotes
+                .trim();                          // Trim whitespace
+            // Extract the Image component and add the caption prop
+            const imageComponent = `<Image${imageProps} caption={'${cleanCap}'}/>`;
+            return imageComponent;
+        }
+    );
+    // Also try a more flexible pattern that handles escaped HTML
+    content = content.replace(
+        /<figure>\s*<Image([\s\S]*?)\/>\s*<p>&lt;span[^&]*&gt;&lt;\/span&gt;<\/p>\s*<figcaption>([\s\S]*?)<\/figcaption>\s*<\/figure>/gs,
+        (match, imageProps, caption) => {
+            hasTransformed = true;
+            // Clean caption text
+            const cleanCap = caption
+                .replace(/<[^>]*>/g, '')          // Remove HTML tags
+                .replace(/\n/g, ' ')              // Replace newlines with spaces
+                .replace(/\r/g, ' ')              // Replace carriage returns with spaces
+                .replace(/\s+/g, ' ')             // Replace multiple spaces with single space
+                .replace(/'/g, "\\'")             // Escape quotes
+                .trim();                          // Trim whitespace
+            // Extract the Image component and add the caption prop
+            const imageComponent = `<Image${imageProps} caption={'${cleanCap}'}/>`;
+            return imageComponent;
+        }
+    );
+    // Handle figures with minipage divs
+    content = content.replace(
+        /<figure>\s*<div class="minipage">\s*<Image([\s\S]*?)\/>\s*<span[^>]*><\/span>\s*<\/div>\s*<figcaption>([\s\S]*?)<\/figcaption>\s*<\/figure>/gs,
+        (match, imageProps, caption) => {
+            hasTransformed = true;
+            // Clean caption text
+            const cleanCap = caption
+                .replace(/<[^>]*>/g, '')          // Remove HTML tags
+                .replace(/\n/g, ' ')              // Replace newlines with spaces
+                .replace(/\r/g, ' ')              // Replace carriage returns with spaces
+                .replace(/\s+/g, ' ')             // Replace multiple spaces with single space
+                .replace(/'/g, "\\'")             // Escape quotes
+                .trim();                          // Trim whitespace
+            // Extract the Image component and add the caption prop
+            const imageComponent = `<Image${imageProps} caption={'${cleanCap}'}/>`;
+            return imageComponent;
+        }
+    );
+    // Handle figures with minipage divs (escaped HTML version)
+    content = content.replace(
+        /<figure>\s*<div class="minipage">\s*<Image([\s\S]*?)\/>\s*<p>&lt;span[^&]*&gt;&lt;\/span&gt;<\/p>\s*<\/div>\s*<figcaption>([\s\S]*?)<\/figcaption>\s*<\/figure>/gs,
+        (match, imageProps, caption) => {
+            hasTransformed = true;
+            // Clean caption text
+            const cleanCap = caption
+                .replace(/<[^>]*>/g, '')          // Remove HTML tags
+                .replace(/\n/g, ' ')              // Replace newlines with spaces
+                .replace(/\r/g, ' ')              // Replace carriage returns with spaces
+                .replace(/\s+/g, ' ')             // Replace multiple spaces with single space
+                .replace(/'/g, "\\'")             // Escape quotes
+                .trim();                          // Trim whitespace
+            // Extract the Image component and add the caption prop
+            const imageComponent = `<Image${imageProps} caption={'${cleanCap}'}/>`;
+            return imageComponent;
+        }
+    );
+    if (hasTransformed) {
+        console.log('    ✅ Transformed figures with Image components to use caption props');
+    } else {
+        console.log('    ℹ️  No figures with Image components and separate figcaptions found');
+    }
+    return content;
+}
 /**
  * Transform HTML spans with style attributes to appropriate components
  * @param {string} content - MDX content
     processedContent = cleanMdxSyntax(processedContent);
     processedContent = convertSubfiguresToMultiImage(processedContent);
     processedContent = transformImages(processedContent);
+    processedContent = transformImageFigures(processedContent);
     processedContent = transformStyledSpans(processedContent);
     processedContent = transformHighlightSpans(processedContent);
     processedContent = fixEscapedMarkTags(processedContent);

app/scripts/latex-importer/output/main.mdx CHANGED Viewed

@@ -19,7 +19,7 @@ tableOfContentsAutoCollapse: true
 ---
 import MultiImage from '../components/MultiImage.astro';
-import ResponsiveImage from '../components/ResponsiveImage.astro';
 import Quote from '../components/Quote.astro';
 import ch2_planar_manipulator_free from './assets/image/figures/ch2/ch2-planar-manipulator-free.png';
 import ch2_planar_manipulator_floor from './assets/image/figures/ch2/ch2-planar-manipulator-floor.png';
@@ -84,17 +84,12 @@ We sincerely hope this tutorial serves as a valuable starting point for your jou
 ## Introduction
-<figure>
-<ResponsiveImage
   src={ch1_lerobot_figure1}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="figure1" style="position: absolute;"></span>
-<figcaption><code>lerobot</code> is the open-source library for end-to-end robotics developed by Hugging Face. The library is vertically integrated on the entire robotics stack, supporting low-level control of real-world robot devices, advanced data and inference optimizations, as well as SOTA robot learning methods with simple implementations in pure Pytorch.</figcaption>
-</figure>
 Autonomous robotics holds the promise of relieving humans from repetitive, tiring or dangerous manual tasks. Consequently, the field of robotics has been widely studied since its inception in the 1950s. Lately, advancements in Machine Learning (ML) have sparked the development of a relatively new class of methods used to tackle robotics problems, leveraging large amounts of data and computation rather than human expertise and modeling skills to develop autonomous systems.
@@ -293,17 +288,12 @@ TL;DR Learning-based approaches to robotics are motivated by the need to (1) gen
 ### Explicit and Implicit Models
-<figure>
-<ResponsiveImage
   src={ch2_approaches}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="generating-motion-atlas" style="position: absolute;"></span>
-<figcaption>Overview of methods to generate motion (clearly non-exhausitve, see @bekrisStateRobotMotion2024). The different methods can be grouped based on whether they explicitly (<em>dynamics-based</em>) or implicitly (<em>learning-based</em>) model robot-environment interactions.</figcaption>
-</figure>
 Robotics is concerned with producing artificial motion in the physical world in useful, reliable and safe fashion. Thus, robotics is an inherently multidisciplinary domain: producing autonomous motion in the physical world requires, at the very least, interfacing different software (motion planners) and hardware (motion executioners) components. Further, knowledge of mechanical, electrical, and software engineering, as well as rigid-body mechanics and control theory have therefore proven quintessential in robotics since the field first developed in the 1950s. More recently, Machine Learning (ML) has also proved effective in robotics, complementing these more traditional disciplines @connellRobotLearning1993. As a direct consequence of its multidisciplinary nature, robotics has developed as a rather wide array of methods, all concerned with the main purpose of <mark>producing artificial motion in the physical world</mark>.
@@ -311,17 +301,12 @@ Methods to produce robotics motion range from traditional *explicit* models--<ma
 ### Different Types of Motion
-<figure>
-<ResponsiveImage
   src={ch2_platforms}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="robotics-platforms-atlas" style="position: absolute;"></span>
-<figcaption>Different kinds of motions are achieved with potentially very different robotic platforms. From left to right, top to bottom: ViperX, SO-100, Boston Dynamics’ Spot, Open-Duck, 1X’s NEO, Boston Dynamics’ Atlas. This is an example list of robotic platforms and is (very) far from being exhaustive.</figcaption>
-</figure>
 In the vast majority of instances, robotics deals with producing motion via actuating joints connecting nearly entirely-rigid links. A key distinction between focus areas in robotics is based on whether the generated motion modifies (1) the absolute state of the environment (via dexterity), (2) the relative state of the robot with respect to its environment (exercising mobility skills), or (3) a combination of the two (Figure <a href="#robotics-platforms-atlas" data-reference-type="ref" data-reference="robotics-platforms-atlas">[robotics-platforms-atlas]</a>).
@@ -335,31 +320,21 @@ Robot manipulators typically consist of a series of links and joints, articulate
 Recently, the development of low-cost manipulators like the ALOHA @zhaoLearningFineGrainedBimanual2023 ALOHA-2 @aldacoALOHA2Enhanced and SO-100/SO-101 @knightStandardOpenSO100 platforms significantly lowered the barrier to entry to robotics, considering the increased accessibility of these robots compared to more traditional platforms like the Franka Emika Panda arm (Figure <a href="#robotic-platforms-costs" data-reference-type="ref" data-reference="robotic-platforms-costs">[robotic-platforms-costs]</a>).
-<figure>
-<ResponsiveImage
   src={ch2_cost_accessibility}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="robotic-platforms-costs" style="position: absolute;"></span>
-<figcaption>Cheaper, more accessible robots are starting to rival traditional platforms like the Panda arm platforms in adoption in resource-constrained scenarios. The SO-100, in particular, has a cost in the 100s of Euros, and can be entirely 3D-printed in hours, while the industrially-manufactured Panda arm costs tens of thousands of Euros and is not openly available.</figcaption>
-</figure>
 Deriving an intuition as per why learning-based approaches are gaining popularity in the robotics community requires briefly analyzing traditional approaches for manipulation, leveraging tools like forward and inverse kinematics (FK, IK) and control theory. Providing a detailed overview of these methods falls (well) out of the scope of this tutorial, and we refer the reader to works including @sicilianoSpringerHandbookRobotics2016, @lynchModernRoboticsMechanics2017, @tedrakeRoboticManipulationPerception, @tedrakeUnderactuatedRoboticsAlgorithms for a much more comprehensive description of these techniques. Here, we mostly wish to highlight the benefits of ML over these traditional techniques
-<figure>
-<ResponsiveImage
   src={ch2_so100_to_planar_manipulator}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="make-so100-planar-manipulator" style="position: absolute;"></span>
-<figcaption>The SO-100 arm is a 6-dof manipulator arm. Preventing some of its joints (shoulder pane, wrist flex and wrist roll) from actuating, it can be represented as a traditional 2-dof planar manipulator (the gripper joint in the end-effector is not considered towards the count of the degrees of freedom used to produce motion).</figcaption>
-</figure>
 Consider the (simple) case where a SO-100 is restrained from actuating (1) the shoulder pan and (2) the wrist flex and roll motors. This effectively reduces the degrees of freedom of the SO-100 from the original 5+1 (5 joints + 1 gripper) to 2+1 (shoulder lift, elbow flex + gripper). As the end-effector does not impact motion in this model, the SO-100 is effectively reduced to the planar manipulator robot presented in Figure <a href="#make-so100-planar-manipulator" data-reference-type="ref" data-reference="make-so100-planar-manipulator">[make-so100-planar-manipulator]</a>, where spheres represent actuators, and solid lines indicate length-$l$ links from the base of the SO-100 to the end-effector (*ee*).
@@ -437,11 +412,10 @@ While very effective when a goal trajectory has been well specified, the perform
 <div class="wrapfigure">
-r0.3 <ResponsiveImage
   src={ch2_planar_manipulator_floor_box}
   zoomable
   downloadable
-  layout="fixed"
   alt="image"
 />
@@ -462,17 +436,12 @@ We point the interested reader to , , and  for extended coverage of FK, IK, di
 Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular applicative problem.
-<figure>
-<ResponsiveImage
   src={ch2_classical_limitations}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="classical-limitations" style="position: absolute;"></span>
-<figcaption>Dynamics-based approaches to robotics suffer from several limitations: (1) orchestrating multiple components poses integration challenges; (2) the need to develop custom processing pipelines for the sensing modalities and tasks considered hinders scalability; (3) simplified analytical models of physical phenomena (here friction at the gripper; credits to @antonovaReinforcementLearningPivoting2017) limit real-world performance. Lastly, (4) dynamics-based methods overlook trends in the availability and growth of robotics data.</figcaption>
-</figure>
 Dynamics-based robotics pipelines have historically been <mark>developed sequentially, engineering the different blocks</mark> now within most architectures for specific purposes. That is, sensing, state estimation, mapping, planning, (diff-)IK, and low-level control have been traditionally developed as distinct modules with fixed interfaces. Pipelining these specific modules proved error-prone, and brittleness emerges--alongside compounding errors--whenever changes occur (e.g., changes in lighting for sensing, occlusion/failure of sensors, control failures). Adapting such a stack to new tasks or robotic platforms often entails re-specifying objectives, constraints, and heuristics at multiple stages, incurring significant engineering overhead.
@@ -495,17 +464,12 @@ Taken together, these limitations (Figure <a href="#classical-limitations" data
 TL;DR The need for expensive, high-fidelity simulators can be obviated by learning from real-world data, using sample-efficient algorithms that can safely train directly on hardware.
 </div>
-<figure>
-<ResponsiveImage
   src={ch3_learning_benefits}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="robot-learning-upsides" style="position: absolute;"></span>
-<figcaption>Learning-based robotics streamlines perception-to-action by learning a (1) unified high-level controller capable to take (2) high-dimensional, unstructured sensorimotor information. Learning (3) does not require a dynamics model and instead focuses on interaction data, and (4) empirically correlates with the scale of the data used. </figcaption>
-</figure>
 Learning-based techniques for robotics naturally address the limitations presented in Section <a href="#classical" data-reference-type="ref" data-reference="classical">[classical]</a> (Figure <a href="#robot-learning-upsides" data-reference-type="ref" data-reference="robot-learning-upsides">[robot-learning-upsides]</a>). In particular, learning-based techniques typically rely on monolithic prediction-to-action pipelines (*visuomotor policies*) which directly map sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components. Mapping sensory inputs to actions also makes it possible to incorporate diverse input modalities, leveraging the automatic feature extraction capabilities of modern learning systems. Moreover, learning-based approaches can, in principle, bypass explicit modeling altogether and instead rely solely on interaction data--an advantage that proves transformative when dynamics are difficult to model or entirely unknown. Lastly, learning for robotics (*robot learning*) is naturally well posed to leverage the growing amount of robotics data openly available, just as computer vision and natural language processing did historically benefit from large-scale corpora of data, in great part overlooked by dynamics-based approaches.
@@ -513,11 +477,10 @@ Being a field at its relative nascent stages, no prevalent technique(s) proves d
 <div class="wrapfigure">
-r0.3 <ResponsiveImage
   src={ch3_learning_atlas}
   zoomable
   downloadable
-  layout="fixed"
   alt="image"
 />
@@ -526,17 +489,12 @@ r0.3 <ResponsiveImage
 In Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> we deliberately include generalist robot models @blackp0VisionLanguageActionFlow2024, @shukorSmolVLAVisionLanguageActionModel2025 alongside task-specific BC methods. While significantly different in spirit--*generalist* models are language-conditioned and use instructions to generate motion valid across many tasks, while *task-specific* models are typically not language-conditioned and used to perform a single task--*foundation* models are still largely trained to reproduce trajectories contained in a (large) training set of input demonstrations. Thus, we argue generalist policies can indeed be grouped alongside other task-specific BC methods, as they both leverage similar training data and schemas. Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> illustrates this categorization graphically, explicitly listing all the robot learning policies currently available in `lerobot`- Action Chunking with Transformers (ACT) @zhaoLearningFineGrainedBimanual2023, Diffusion Policy @chiDiffusionPolicyVisuomotor2024, Vector-Quantized Behavior Transformer (VQ-BeT) @leeBehaviorGenerationLatent2024, $\pi_0$ @blackp0VisionLanguageActionFlow2024, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025, Human-in-the-loop Sample-efficient RL (HIL-SERL) @luoPreciseDexterousRobotic2024 and TD-MPC @hansenTemporalDifferenceLearning2022.
-<figure>
-<ResponsiveImage
   src={ch3_rl_examples}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="robotics-with-rl-examples" style="position: absolute;"></span>
-<figcaption>Examples of two different robotics tasks performed using RL. In the manipulation task (A) an agent learns to reach for a yellow plastic block in its environment, and to put it inside of a box. In the locomotion task (B) an agent learns to move its center of mass sideways without falling.</figcaption>
-</figure>
 Applications of RL to robotics have been studied long enough that the relationship between these two disciplines has been compared to that of physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their inherently interactive and sequential nature, robotics control problems can be directly cast as RL problems. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> presents two such cases. Reaching for an object to then move it somewhere else in the scene is a sequential problem where over time the controller needs to adjust the position of the robot arm based on the current configuration and the (possibly varying) position of the object. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation- while sliding to the side, the controller needs to keep adjusting to the robot’s to avoid failure (falling).
@@ -544,17 +502,12 @@ Applications of RL to robotics have been studied long enough that the relationsh
 The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to tackle robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) capable to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents improve through trial and error, bypassing explicit models of the problem dynamics in favor of interaction data. In RL, this feedback loop between actions and outcomes (Figure <a href="#rl-most-famous-pic" data-reference-type="ref" data-reference="rl-most-famous-pic">[rl-most-famous-pic]</a>) is established through the agent sensing a scalar quantity (*reward*) measuring how desirable a given *transition* is for the accomplishment of its goal.
-<figure>
-<ResponsiveImage
   src={ch3_agent_env}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="rl-most-famous-pic" style="position: absolute;"></span>
-<figcaption>Agent-Environment interaction diagram (image credits to @suttonReinforcementLearningIntroduction2018).</figcaption>
-</figure>
 Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP) @bellmanMarkovianDecisionProcess1957. Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through MDP’s inherently stochastic formulation and (2) providing a theoretically-sound framework for learning *without* an explicit model of the environment dynamics. While accommodating a continuous time formulation too, MDPs are typically considered in discrete time in RL, assuming interactions to atomically take place at discrete *timestep* $t=0,1,2,3, \dots, T$. MDPs allowing for an unbounded number of interactions ($T \to + \infty$) are termed *infinite-horizon*, and opposed to *finite-horizon* MDPs in which $T$ is finite. Unless otherwise specified, we will only be referring to discrete-time finite-horizon (*episodic*) MDPs.
@@ -628,17 +581,12 @@ V_\pi(s_t) &= \mathbb E_{a_t\sim \pi(\bullet \vert s_t)} [Q_\pi (s_t, a_t)],
 ```
 inducing an ordering over states and state-action pairs under $\pi$, and value functions are thus central to most RL algorithms. A variety of algorithms have been developed in RL attempting to find (approximate) solutions to the problem of maximizing cumulative reward (we report some in Figure <a href="#rl-algos-atlas" data-reference-type="ref" data-reference="rl-algos-atlas">[rl-algos-atlas]</a>).
-<figure>
-<ResponsiveImage
   src={ch3_rl_algorithms_atlas}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="rl-algos-atlas" style="position: absolute;"></span>
-<figcaption>Popular RL algorithms. See @SpinningUp2018 for a complete list of citations.</figcaption>
-</figure>
 Popular approaches to continuous state and action space--such as those studied within robotics--include ,  and . Across manipulation @akkayaSolvingRubiksCube2019 and locomotion problems @leeLearningQuadrupedalLocomotion2020, RL proved extremely effective in providing a platform to (1) leverage a unified, streamlined perception-to-action pipeline, (2) natively integrate proprioception with multi-modal high-dimensional sensory streams, (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2025.
@@ -648,31 +596,21 @@ Streamlined end-to-end control pipelines, data-driven feature extraction and a d
 First, especially early in training, <mark>actions are typically explorative, and thus may be erratic</mark>. On physical systems, untrained policies may command high velocities, self-colliding configurations, or torques exceeding joint limits, leading to wear and potential hardware damage. Mitigating these risks requires external safeguards (e.g., watchdogs, safety monitors, emergency stops), often incurring a high degree of human supervision. Further, in the typical episodic setting considered in most robotics problems, experimentation is substantially slowed down by the need to manually reset the environment over the course of training, a time-consuming and error-prone process. Second, learning efficiently remains problematic in RL, <mark>limiting the applicability of RL in real-world robotics due to consequently prohibitive timescales of training</mark>. Even strong algorithms such as SAC @haarnojaSoftActorCriticOffPolicy2018 typically require a large number of transitions $\{ (s_t, a_t, r_t, s_{t+1})\}_{t=1}^N$. On real-world hardware, generating this data is time-consuming.
-<figure>
-<ResponsiveImage
   src={ch3_duck_sim_vs_real}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="synthetic-vs-real-duck" style="position: absolute;"></span>
-<figcaption>Simulated (left) vs. real-world (right) OpenDuck. Discrepancies in the simulation dynamics (<em>reality gap</em>) pose risks to policy transfer.</figcaption>
-</figure>
 Training RL policies in simulation @tobinDomainRandomizationTransferring2017 addresses both issues, eliminating physical risk and dramatically increasing throughput. Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) limiting the possibilities to transfer the policies learned in simulation, due to the discrepancy between real and simulated environments (*reality gap*, Figure <a href="#synthetic-vs-real-duck" data-reference-type="ref" data-reference="synthetic-vs-real-duck">[synthetic-vs-real-duck]</a>). *Domain randomization* @tobinDomainRandomizationTransferring2017 (DR) is a popular technique to overcome the reality gap, and consists in randomizing the parameters of the simulated environment during training, aiming at inducing robustness to specific disturbances. In this, DR is typically employed to increase the diversity of scenarios over the course of training, improving the performance of sim-to-real transferred policies @akkayaSolvingRubiksCube2019, @antonovaReinforcementLearningPivoting2017, @jiDribbleBotDynamicLegged2023. In practice, DR is performed by training in simulation on simulated dynamics $\mathcal D$, further parametrized as $\mathcal D \equiv \mathcal D_\xi$, with a *dynamics* (random) vector $\xi$ drawn from an arbitrary distribution, $\xi \sim \Xi$. For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure <a href="#ducks-on-terrains" data-reference-type="ref" data-reference="ducks-on-terrains">[ducks-on-terrains]</a>), or the center of mass of an object for a manipulation task. Over the course of training--typically at each episode’s reset--a new $\xi$ is drawn, and used to specify the environment’s dynamics for that episode.
-<figure>
-<ResponsiveImage
   src={ch3_many_ducks}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ducks-on-terrains" style="position: absolute;"></span>
-<figcaption>The same locomotion task can be carried out in different (simulated) domains (exemplified by the difference in terrains) at training time, resulting to increased robustness over diverse environment dynamics.</figcaption>
-</figure>
 While effective in transferring policies across the reality gap in real-world robotics @tobinDomainRandomizationTransferring2017, @akkayaSolvingRubiksCube2019, @jiDribbleBotDynamicLegged2023, @tiboniDomainRandomizationEntropy2024, DR often requires extensive manual engineering. First, identifying which parameters to randomize--i.e., the *support* $\text{supp} (\Xi)$ of $\Xi$--is an inherently task-specific process. When locomoting over different terrains, choosing to randomize the friction coefficient is a reasonable choice, yet not a complete solution, as other factors (lighting conditions, external temperature, joints’ fatigue, etc.) may prove just as important in practice, making selecting these parameters yet another source of brittleness.
@@ -768,17 +706,12 @@ Reward classifiers are particularly useful in treating complex, dynamic tasks--e
 Lastly, in order to improve on the robustness of their approach to different goals while maintaining practical scalability, @luoSERLSoftwareSuite2025 introduced a modified state and action space, expressing proprioceptive configurations $q$ and actions $\dot q$ in the frame of the end-effector pose at $t=0$. Randomizing the initial pose of the end-effector ($s_0$), @luoSERLSoftwareSuite2025 achieved a similar result to that of manually randomizing the environment at every timestep, but with the benefit of maintaining the environment in the same condition across multiple training episodes, achieving higher scalability of their method thanks to the increased practicality of their approach.
-<figure>
-<ResponsiveImage
   src={ch3_hil_serl_examples}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="hil-serl-blocks" style="position: absolute;"></span>
-<figcaption>(A) HIL-SERL allows for real-world training of high performance RL agents by building on top of the advancements presented by SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on a real-world SO-100.</figcaption>
-</figure>
 Building on off-policy deep Q-learning with replay buffers, entropy regularization for better exploration, expert demonstrations to guide learning, and a series of tools and recommendations for real-world training using reward classifiers (Figure <a href="#hil-serl-blocks" data-reference-type="ref" data-reference="hil-serl-blocks">[hil-serl-blocks]</a>), @luoPreciseDexterousRobotic2024 introduce human interactions during training, learning near-optimal policies in challenging real-world manipulation tasks in 1-2 hours.
@@ -786,17 +719,12 @@ Human-in-the-Loop, Sample Efficient Robot reinforcement Learning (HIL-SERL) @lu
 #### Code Example- Real-world RL
-<figure>
-<ResponsiveImage
   src={ch3_hil_serl_architecture}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch3-hil-serl-architecture" style="position: absolute;"></span>
-<figcaption>HIL-SERL is a SOTA RL algorithm for training control policies directly in the real-world. Its implementation in <code>lerobot</code> relies on a decoupled actor-learner architecture, communicating over processes (and possibly networks) with queues used to share (1) transitions <span class="math inline">(<em>s</em> <sub> <em>t</em> </sub>, <em>a</em> <sub> <em>t</em> </sub>, <em>r</em> <sub> <em>t</em> </sub>, <em>s</em> <sub> <em>t</em> + 1</sub>)</span> and (2) parameters <span class="math inline"> <em>θ</em> </span>.</figcaption>
-</figure>
 This example shows how to use the HIL-SERL implementation supported by `lerobot`. This code example is organized into four parts: we first show how to train a reward classifier from a custom set of demonstrations, then define the `Actor` and `Learner` components, and finally, we bring them together in a complete script showing how to use HIL-SERL in practice.
@@ -1066,33 +994,23 @@ Advances in learning to act from potentially large corpora of human demonstratio
 TL;DR Behavioral Cloning provides a natural platform to learn from real-world interactions without the need to design any reward function, and generative models prove more effective than point-wise policies at dealing with multimodal demonstration datasets.
 </div>
-<figure>
-<ResponsiveImage
   src={ch4_bc_trajectories}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-bc-trajectories" style="position: absolute;"></span>
-<figcaption>(A) Average (with standard deviation) evolution of the actuation levels over the first 5 recorded episodes in <a href="lerobot/svla_so101_pickplace" class="uri">lerobot/svla_so101_pickplace</a>. Proprioperceptive states provide invaluable to determine the robot’s state during an episode. (B) Camera frames are also recorded alongside measurements on the robot’s state, capturing information about the robot’s interaction with its environment.</figcaption>
-</figure>
 Learning from human demonstrations provides a pragmatic alternative to the RL pipeline discussed in Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>. Indeed, especially in real-world robotics, online exploration is typically <mark>costly and potentially unsafe</mark>, and designing (dense) reward signals is a <mark>brittle and task-specific</mark> process. Further, even success detection itself often requires bespoke instrumentation, while episodic training demands reliable resets--all factors complicating training RL algorithms on hardware at scale. Behavioral Cloning (BC) sidesteps these constraints by <mark>casting control as an imitation learning problem</mark>, leveraging previously collected expert demonstrations to anchor the learned autonomous behavior. Most notably, by *learning-to-imitate*, autonomous systems naturally adhere to the objectives, preferences, and success criteria implicitly encoded in the data, which reduces early-stage exploratory failures and obviates hand-crafted reward shaping altogether.
 Formally, let $\mathcal D = \{ \tau^{(i)} \}_{i=1}^N$ be a set of expert trajectories, with $\tau^{(i)} = \{(o_t^{(i)}, a_t^{(i)})\}_{t=0}^{T_i}$ representing the $i$-th length-$T_i$ trajectory in $\mathcal D$, $o_t \in \mathcal O$ denoting observations (e.g., images and proprioception altogether), and $a_t \in \mathcal A$ the expert actions. Typically, observations $o \in \mathcal O$ consist of both image and proprioperceptive information, while actions $a \in \mathcal A$ represent control specifications for the robot to execute, e.g. a joint configuration. Note that differently from Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>, in the imitation learning context $\mathcal D$ denotes an offline dataset collecting $N$ length-$T_i$ reward-free (expert) human trajectories $\tau^{(i)}$, and *not* the environment dynamics. Similarily, in this section $\tau^{(i)}$ represent a length-$T_i$ trajectory of observation-action pairs, which crucially *omits entirely any reward* information. Figure <a href="#ch4-bc-trajectories" data-reference-type="ref" data-reference="ch4-bc-trajectories">[ch4-bc-trajectories]</a> graphically shows trajectories in terms of the average evolution of the actuation on the 6 joints of a teleoperated SO-100 manipulator. Notice how proprioperceptive states are captured jointly with camera frames over the course of the recorded episodes, providing a unified high-frame rate collection of both image and joint teleoperation data. Figure <a href="#ch4-observation-action-mapping" data-reference-type="ref" data-reference="ch4-observation-action-mapping">[ch4-observation-action-mapping]</a> shows $(o_t, a_t)$-pairs for the same dataset, with the actions performed by the human expert illustrated alongside the corresponding observation. 
In principle, (expert) trajectories $\tau^{(i)}$ can have different lengths since demonstrations might exhibit multi-modal strategies to attain the same goal, resulting in multiple, different behaviors.
-<figure>
-<ResponsiveImage
   src={ch4_observation_action_mapping}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-observation-action-mapping" style="position: absolute;"></span>
-<figcaption>Sample observations and action pairs over the course of a given trajectory recorded in <a href="lerobot/svla_so101_pickplace" class="uri">lerobot/svla_so101_pickplace</a>. Observations, comprising of both proprioperceptive and visual information, are recorded alongside the configuration of a second, leader robot controlled by a human expert, providing complete information for regressing actions given observations.</figcaption>
-</figure>
 Behavioral Cloning (BC) @pomerleauALVINNAutonomousLand1988 aims at producing synthetic behaviors by learning the mapping from observations to actions, and in its most natural formulation can be effectively tackled as a *supervised* learning problem, consisting of learning the (deterministic) mapping $f: \mathcal O\mapsto \mathcal A, \ a_t = f(o_t)$ by solving
 ``` math
@@ -1104,17 +1022,12 @@ Typically, the expert’s joint observation-action distribution $p: \mathcal O\t
 Despite the inherent challenges of learning from non-i.i.d. data, the BC formulation presents several operational advantages in robotics. First, training happens offline and naturally accommodates expert demonstration data, thereby severely limiting exploration risks by preventing the robot from performing dangerous actions altogether, by anchoring action in imitation. Second, reward design is entirely unnecessary in BC, as demonstrations already reflect human intent. The absence of rewards also prevents the risk of misalignment and specification gaming (*reward hacking*), otherwise inherent in purely reward-based RL @heessEmergenceLocomotionBehaviours2017. Third, because expert trajectories encode terminal conditions, success detection and resets are implicit in the dataset. Finally, empirical evidence suggests the performance of BC scales naturally with growing corpora of demonstrations collected across tasks, embodiments, and environments. Nonetheless, BC can, in principle, only reproduce behaviors that are at best as good as those of the demonstrator, and therefore offers no remedy for the suboptimal decisions that humans may enact. This limitation is particularly problematic in sequential decision-making tasks where expert demonstrations are scarce---either because data collection is costly or because human performance is inherently suboptimal. Yet, many robotics applications still benefit from relatively inexpensive pipelines for collecting high-quality human-generated trajectories, justifying the use of BC in such settings.
-<figure>
-<ResponsiveImage
   src={ch4_issues_with_bc}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-issues-with-bc" style="position: absolute;"></span>
-<figcaption>Point-wise policies suffer from limitations due to (A) covariate shifts and (B) poor approximation of multimodal demonstrations. (A) Small errors may drive the policy out of distribution, incuring in a vicious circle ultimately resulting in failure. (B) Both modes of reaching for a target object in the scene--either left or right-first--are equally as good and thus equally as likely to be present in a dataset of human demonstrations, ultimately resulting in multimodal demonstrations.</figcaption>
-</figure>
 While conceptually elegant, *point-estimate policies* $f : \mathcal O\mapsto \mathcal A$ learned by solving eq. <a href="#loss-minimization-SL" data-reference-type="ref" data-reference="loss-minimization-SL">[loss-minimization-SL]</a> have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a> illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $\mathcal D$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $\epsilon$-prediction errors $0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon$ can quickly drive the policy into out-of-distribution states, incurring less confident generations and thus compounding errors (Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving real-world robotics problems, as multiple trajectories can be equally as good towards the accomplishment of a goal (e.g., symmetric grasps, Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the *generative model* $p(o, a)$ underlying the samples in $\mathcal D$, rather than explicitly learning a prediction function $f: a = f(o)$.
@@ -1124,17 +1037,12 @@ Generative Models (GMs) aim to learn the stochastic process underlying the very
 #### Variational Auto-Encoders
-<figure>
-<ResponsiveImage
   src={ch4_task_effect_on_pairs}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-task-effect-on-pairs" style="position: absolute;"></span>
-<figcaption>Intuitively, latent variable in a single latent model may contain information regarding the task being performed, which directly results in the likelihood of the same observation-action pair being different for two different tasks. When (A) picking a block the likelihood of a wide gripper’s opening should be higher than narrower one, while it should be the opposite when (B) pushing the block.</figcaption>
-</figure>
 A common inductive bias used in GM posits samples $(o,a)$ are influenced from an unobservable latent variable $z \in Z$, resulting in:
 ``` math
@@ -1142,17 +1050,12 @@ A common inductive bias used in GM posits samples $(o,a)$ are influenced from an
 ```
 Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics application, $z$ could be interpreted as some high level representation of the underlying task being performed by the human demonstrator. In such case, treating $p(o,a)$ as a marginalization over $\operatorname{supp}({Z})$ of the complete joint distribution $p(o,a,z)$ natively captures the effect different tasks have on the likelihood of observation-action pairs. Figure <a href="#ch4-task-effect-on-pairs" data-reference-type="ref" data-reference="ch4-task-effect-on-pairs">[ch4-task-effect-on-pairs]</a> graphically illustrates this concept in the case of a (A) picking and (B) pushing task, for which, nearing the target object, the likelihood of actions resulting in opening the gripper--the higher $q_6$, the wider the gripper’s opening--should intuitively be (A) high or (B) low, depending on the task performed. While the latent space $Z$ typically has a much richer structure than the set of all actual tasks performed, eq. <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> still provides a solid framework to learn joint distribution conditioned on unobservable yet relevant factors. Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a> represents this latent-variable framework in the context of a robotics application- the true, $z$-conditioned generative process assigns *likelihood* $p((o,a) \vert z)$ to the single $(o,a)$-pair. Using Bayes’ theorem, one can reconstruct the *posterior* distribution on $\operatorname{supp}({Z})$, $q_\theta(z \vert o,a)$ from the likelihood $p_\theta(o,a \vert z)$, *prior* $p_\theta(z)$ and *evidence* $p_\theta(o,a)$. VAEs approximate the latent variable model presented in eq. 
<a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> using an *approximate posterior* $q_\phi(z \vert o,a)$ while regressing parameters for a parametric likelihood, $p_\theta(o,a \vert z)$ (Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a>).
-<figure>
-<ResponsiveImage
   src={ch4_latent_variable_model}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-latent-variable-model" style="position: absolute;"></span>
-<figcaption>(A) The latent variable model in a robotics application regulates influence between observed (<span class="math inline"> <em>o</em>, <em>a</em>)</span> variables and an unobservable latent variable. (B) VAEs approximate exact latent variable models by means of variational inference.</figcaption>
-</figure>
 Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can be written as:
 <span id="evidence-definition-1" style="position: absolute;">
@@ -1241,17 +1144,12 @@ VAEs approximate probability distributions via a *single* latent variable model,
 ```
 where we explicitly showed the marginalization over the multiple latents in eq. <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>, and used the law of conditional probability and Markov property in eq. <a href="#BC-multi-latent-model-2" data-reference-type="ref" data-reference="BC-multi-latent-model-2">[BC-multi-latent-model-2]</a>. Also, for ease of notation, we will refer to observation-action pairs $o,a$ as $z_0$.
-<figure>
-<ResponsiveImage
   src={ch4_many_latents}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-many-latents" style="position: absolute;"></span>
-<figcaption>HMLV models posit the data generation process is influenced by a stack of Markov-dependent latent variables, with samples from the posterior distribution being progressively higher up in the hierarchy.</figcaption>
-</figure>
 Similar to VAEs, it is generally not possible to assign an *exact* interpretation to the latent variables. Nevertheless, a reasonable application-driven intuition is that Hierarchical Markov Latent Variable (HMLV) models, by capturing hierarchical and decoupled interactions among latent variables, can reflect the different resolutions at which conditioning factors intervene. For example, in a robotics setting, one might naturally distinguish between high-level trajectory planning (higher up in the hierarchy, $t \to T$) and fine-grained motion adjustments (closer to empirical observations, $t \to 0$). In that, HMLV models thus provide a framework to perform variational inference via multiple, sequential sampling steps from different higher level distributions instead of approximating the generative process with a single-latent variable model. DMs are a particular instantiation of HMLV models for which the posterior is fixed to $q( z_t \vert z_{t-1}) = \mathcal N(z_t; \sqrt{1-\beta_t}\, z_{t-1}, \beta_t \mathbf{I})$, for a given $\beta_t \in \mathbb R^+$. In practice, $\beta_t$ is used to iteratively reduce the signal-to-noise ratio along the latents’ hierarchy, similarly to how a diffusion process influences the information of a physical system.
@@ -1301,17 +1199,12 @@ In their seminal work on using DMs for variational inference, @hoDenoisingDiffu
 ```
 where the former term is equivalent to the reconstruction term in eq. <a href="#VAE-min-neg-ELBO" data-reference-type="ref" data-reference="VAE-min-neg-ELBO">[VAE-min-neg-ELBO]</a> and the latter term can be obtained in closed form.
-<figure>
-<ResponsiveImage
   src={ch4_diffusion_robot_actions}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="diffusion-robot-actions" style="position: absolute;"></span>
-<figcaption>DMs iteratively corrupt samples (left) from an unknown distribution into a quasi-standard Gaussian (center), learning the displacement field (right) that permits to reconstruct samples from the unknown target distribution by iteratively denoising samples of a tractable, easy-to-sample distribution.</figcaption>
-</figure>
 Besides mathematical tractability of eq. <a href="#diffusion-likelihood-gradient" data-reference-type="ref" data-reference="diffusion-likelihood-gradient">[diffusion-likelihood-gradient]</a>, adopting Gaussian posteriors allows for a particularly intuitive interpretation of the training dynamics of DMs @permenterInterpretingImprovingDiffusion2024. As the hierarchical latent variables are repeatedly corrupted by applying increasingly more Gaussian noise, they progressively lose information about the original (unknown) sample $z_0$, converging toward a standard Gaussian which eventually contains no information at all (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>). Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a> illustrates this process on a simplified, bidimensional observation-action distribution, where we considered $o=q_2$ and $a=q^h_2$, with $q_2$ denoting the robot’s *elbow flex* actuation and $q^h_2$ the corresponding human teleoperator’s elbow flex. Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accouting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Notice how corrupted samples distribute differently from the most reasonable structure $a \simeq o$, further underscoring how diffusion corrupts both the individual samples and the global distribution (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, left and center). 
In this, using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples. Comparing the diffused samples to the original data points, one can derive an estimate of the total displacement induced by the diffusion process, and, under the assumption that the likelihood of the totally diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by *learning to reverse* such displacement. This key intuition allows one to write a simplified training objective[^4]:
 <span id="diffusion-simplified-loss" style="position: absolute;">
@@ -1327,17 +1220,12 @@ Besides mathematical tractability of eq. <a href="#diffusion-likelihood-gradien
 \end{align}
 ```
-<figure>
-<ResponsiveImage
   src={ch4_action_vs_observation_distribution}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-action-vs-observation-distribution" style="position: absolute;"></span>
-<figcaption>A joint action-observation distribution, in the simplified case where the observation is the elbow-flex actuation in a SO-100, and the action is the recorded position for the same joint from the teleoperator arm. The motion recorded being teleoperated, the points distribute along the diagonal.</figcaption>
-</figure>
 In this simplified (minimization) objective, the optimization process differs from eq. <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
@@ -1371,31 +1259,21 @@ FM proved very effective in a variety of applications, ranging from image @esse
 ```
 Conditional vector fields are defined not only over their argument $z$ and time $t$, but do also vary with respect to an auxiliary variable $z_0$, thereby extending the standard notion of a vector field to incorporate additional conditioning. Note that the traditional discrete-time noise-scheduler $\{\beta_t\}_{t=0}^T$ is now generalized to a continuous map $\beta : [0,1] \mapsto \mathbb R^+$. Crucially, @lipmanFlowMatchingGenerative2023 prove that by exclusively optimizing the vector field for individual data points $z_0 \in \mathcal D$, one also retrieves the optimal flow to morph the entire support of the initial distribution $p_0$ into $p_1 \ \text{s.t.} \mathcal D \sim p_1$.
-<figure>
-<ResponsiveImage
   src={ch4_normalizing_flows}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-normalizing-flows" style="position: absolute;"></span>
-<figcaption>Probability distributions can be modified differently by applying different vector fields, inducing different flows of mass across the same support (top versus bottom, using two different time-invariant 2D-fields <span class="math inline"> <em>u</em> <sub>1</sub>(<em>x</em>, <em>y</em>) = (<em>x</em>, 0)</span> and <span class="math inline">$u_2(x,y) = (x/\sqrt{2}, y/\sqrt{2})$</span>). Notice time flows <em>continuously</em> in <span class="math inline">[0, 1]</span>. FM models learn to approximate a target vector field, thereby producing arbitrary (goal) transformations of an easy-to-sample initial distribution.</figcaption>
-</figure>
 While the noising schedule of DMs results in a stochastic process resembling a random (Brownian) walk, FM allows for more general--potentially, deterministic--likelihood and posterior parametrization. In the FM literature the likelihood and posterior probability densities defined along a HMLV model are typically referred to as a *probability path*, where the distributions for successive adjacent transitions in the HMLV model are related by the (normalized) flow between them (Figure <a href="#ch4-normalizing-flows" data-reference-type="ref" data-reference="ch4-normalizing-flows">[ch4-normalizing-flows]</a>). The inherent flexibility of FM is one of its key advantages over DMs, as it opens up the possibility of *learning* more efficient paths. For instance, one can design probability paths inspired by Optimal Transport (OT), a mathematical framework concerned with characterizing the most efficient morphings between probability distributions. Probability paths obtained through OT paths tend to be *straighter* than diffusion paths (Figure <a href="#ch4-diffusion-paths-versus-fm" data-reference-type="ref" data-reference="ch4-diffusion-paths-versus-fm">[ch4-diffusion-paths-versus-fm]</a>), which can lead to faster and more stable training, as well as empirically result in higher-quality generations with fewer denoising steps at inference time. In particular, by avoiding unnecessary backtracking associated with the inherent stochastic nature of both the noising and denoising process in DMs, test-time compute is typically significantly reduced in FM, while retaining comparable results @lipmanFlowMatchingGenerative2023.
-<figure>
-<ResponsiveImage
   src={ch4_diffusion_vs_flowmatching}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-diffusion-paths-versus-fm" style="position: absolute;"></span>
-<figcaption>Compared to diffusion, flow matching distorts the distribution along a less random pattern, resulting in a clearer interpolation between source and target distribution. The visualization shows an example comparison between these two methods on the joint distribution of robot observations and actions over <span class="math inline"> <em>T</em> = 50</span> steps.</figcaption>
-</figure>
 In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in eq. <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce an arbitrary mass displacement, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, Conditional FM (CFM) defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, which in turn results in the target vector field $u(t, z_t) = z_1 - z_0$. FM models can then be trained with a simple regression objective defined as:
 <span id="flow-matching-objective" style="position: absolute;">
@@ -1435,45 +1313,30 @@ In their work, @zhaoLearningFineGrainedBimanual2023 ablated using a GM to learn
 In ACT (Figure <a href="#ch4-act" data-reference-type="ref" data-reference="ch4-act">[ch4-act]</a>), inference for a given observation $o \in \mathcal O$ could be performed by (1) defining a prior $p_\omega(z \vert o)$ for the latent variable $z$ and (2) decoding an action chunk from a sampled latent $z \sim p_\omega(\bullet \vert o)$, similarly to how sampling from standard VAEs takes place, with the exception that vanilla VAEs typically pose $p(z\vert o) \equiv p(z) \sim \mathcal N(\mathbf{0}, \mathbf{I})$ and thus skip (1).
-<figure>
-<ResponsiveImage
   src={ch4_act_encoder}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-act-encoder" style="position: absolute;"></span>
-<figcaption>The CVAE encoder used in ACT. Input action chunks are first embedded and aggregated with positional embeddings, before being processed alongside embedded proprioperceptive information, and a learned <code>[CLS]</code> token used to aggregate input level information, and predict the style variable <span class="math inline"> <em>z</em> </span>. The encoder is exclusively used to <em>train</em> the decoder, and it is entirely disregarded at inference time.</figcaption>
-</figure>
 However, the authors claim that using a deterministic procedure to sample $z$ benefits policy evaluation, and thus avoid using the conditional prior at all at inference time, effectively using the CVAE framework exclusively to train a more expressive decoder. At test time, @zhaoLearningFineGrainedBimanual2023 propose simply using $z = \mathbf{0}$, as the conditional prior on $z$ used in training is set to be a standard Gaussian. Further, conditioning on the observation $o$ is achieved through explicitly feeding proprioperceptive and visual observations to the decoder, $p_\theta(a \vert z, o)$ at test time. If at inference $z$ is sampled from a standard Gaussian, during training $z$ is sampled from an approximate posterior distribution $q_\phi(z \vert o, a)$, which, however, disregards image observations and exclusively uses proprioperceptive states to form $o$ for efficiency reasons.
-<figure>
-<ResponsiveImage
   src={ch4_act_decoder}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-act-decoder" style="position: absolute;"></span>
-<figcaption>The CVAE decoder used in ACT, comprising of a full encoder-decoder Transformer architecture. Camera observations from all <span class="math inline"> <em>n</em> </span> camera views are first embedded using pre-trained visual encoders, and then aggregated with the corresponding positional embeddings. Then, the proprioperceptive information and style variable <span class="math inline"> <em>z</em> </span> retrieved from the CVAE encoder, are fed to the encoder-decoder Transformer for inference. The encoder shares the matrices <span class="math inline"> <em>K</em>, <em>V</em> </span> with the decoder, and is trained to decode fixed position embeddings into action chunks.</figcaption>
-</figure>
 #### Code Example: Training and Using ACT in Practice
-<figure>
-<ResponsiveImage
   src={ch4_act}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-act" style="position: absolute;"></span>
-<figcaption>Action Chunking with Transformer (ACT), as in @zhaoLearningFineGrainedBimanual2023. ACT introduces an action chunking paradigm to cope with high-dimensional multi-modal demonstration data, and a transformer-based CVAE architecture.</figcaption>
-</figure>
 <div class="pbox">
 Training ACT
@@ -1612,17 +1475,12 @@ In practice, conditioning on observation data is achieved conditioning the noise
 ```
 Note how in eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a> the noise regressor is conditioned on both the latent variable rank $t$ *and* on a stack of previous observations $o_{t-H_o:t}$. @chiDiffusionPolicyVisuomotor2024 claim the combination of (1) conditioning on a horizon of previous observations and (2) predicting multiple actions into the future allows DP to *commit to specific modes* in the data at inference time, which proves essential for good performance and avoiding indecisiveness.
-<figure>
-<ResponsiveImage
   src={ch4_diffusion_policy}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="diffusion-policy-architecture" style="position: absolute;"></span>
-<figcaption>The Diffusion Policy archicture, as in @chiDiffusionPolicyVisuomotor2024. A stack of <span class="math inline"> <em>H</em> <sub> <em>o</em> </sub> </span> previous observations is used as external conditioning to denoise a group of <span class="math inline"> <em>H</em> <sub> <em>a</em> </sub> </span> actions. Conditioning is performed at every layer of a U-Net block. Diffusion Policy allows to obtain fully-formed action chunks with as little as <span class="math inline"> <em>T</em> = 10</span> denoising steps.</figcaption>
-</figure>
 Figure <a href="#diffusion-policy-architecture" data-reference-type="ref" data-reference="diffusion-policy-architecture">[diffusion-policy-architecture]</a> shows the convolution-based version of the architecture proposed by @chiDiffusionPolicyVisuomotor2024, illustrating inference on a single sample drawn from $\mathcal D$, for simplicity. The starting, arbitrarily noisy chunk of $H_a$ actions $\tilde a_{t:t+H_a}$ is first mapped to a (learned) high-dimensional space. Similarly, both image observations and poses are also embedded before being aggregated to the action embeddings. Then, a U-Net @ronnebergerUNetConvolutionalNetworks2015 is trained to regress the noise added into $\tilde a_{t:t+H_a}$, conditioned on observation information at every layer, thus seeking to optimize eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a>. At inference time, the noise predictor is used to predict the quantity of noise at every $t \in [T, \dots, 0 ]$ and iteratively subtract it from $\tilde a_{t:t+H_a}$, reversing the diffusion process simulated in training conditioned on $o_{t-H_o:t}$ to predict $a_{t:t+H_a}$.
@@ -1759,19 +1617,12 @@ A robot may indeed execute an entire action chunk $\mathbf{A}_t$ *before* a new
 One can use the fact that policies output multiple actions at the same time to directly address (1) the lack of adaptiveness and (2) the presence of lags at runtime by decoupling action chunk *prediction* $\mathbf{A}$ from action *execution* $a_t \gets \text{PopFront}(\mathbf{A}_t)$. This decoupled stack, which we refer to as *asynchronous* (async) inference (<a href="#alg-async-inference" data-reference-type="ref" data-reference="alg-async-inference">[alg-async-inference]</a>), also enables optimized inference by allowing action-chunk inference to run on a separate machine, typically equipped with better computational resources than the ones onboard a robot. In async inference, a $\text{RobotClient}$ sends an observation $o_t$ to a $\text{PolicyServer}$, receiving an action chunk $\mathbf{A}_t$ once inference is complete (Figure <a href="#ch4-async-inference" data-reference-type="ref" data-reference="ch4-async-inference">[ch4-async-inference]</a>). In this, we avoid execution lags by triggering chunk prediction while the control loop is still consuming a previously available chunk, aggregating the previous and incoming chunks whenever the latter is available to the $\text{RobotClient}$. In turn, async-inference tightens the loop between action prediction and action execution efficiently, by increasing the frequency at which observations are processed for chunk prediction while not running inference at every timestep. Crucially, decoupling action prediction from action execution also allows to allocate more computational resources on a remote policy server sending actions to the robot client over the network.
-<figure>
-<div class="minipage">
-<ResponsiveImage
   src={ch4_async_inference}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-async-inference" style="position: absolute;"></span>
-</div>
-<figcaption><strong>Asynchronous inference</strong>. Illustration of the asynchronous inference stack. Note that the policy can be run on a remote server, possibly with GPUs.</figcaption>
-</figure>
 <div class="algorithm">
 <span id="alg-async-inference" style="position: absolute;"></span>
@@ -1796,19 +1647,12 @@ Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queu
 - **Sync-inference limit $(g=1)$.** As an extreme case, and in keeping with @zhaoLearningFineGrainedBimanual2023, an observation is sent at *every* timestep. The queue is therefore almost always filled, with only a minor saw-tooth due to $\Delta t/\mathbb E[\ell_s] < 1$. While maximally reactive, this setting incurs one forward pass per control tick and can prove prohibitively expensive on limited hardware. Importantly, because the client is consuming actions while the server computes the next chunk, the available queue never gets entirely filled.
-<figure>
-<div class="minipage">
-<ResponsiveImage
   src={ch4_queues}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-queues" style="position: absolute;"></span>
-</div>
-<figcaption>Action queue size evolution at runtime for various levels of <span class="math inline"> <em>g</em> </span> when (A) not filtering out observation based on joint-space similarity and (B) filtering out near-duplicates observation, measuring their similarity in joint-space.</figcaption>
-</figure>
 Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> emphasizes the trade-off governed by $g$: small values of $g$ result in idle periods, whereas $g\approx 1$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $g\in(0,1)$ allows to strike a balance between reactivity against resource budgets. If not for the aforementioned similarity filter, the $\text{RobotClient}$ would send observations for processing every $(1 - g) H_a \cdot \Delta t$ seconds, receiving a new chunk of actions every $(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]$, on average. The presence of the filter for observation similarity dilates this processing time, and serves the scope of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. In particular, Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> results in a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue results empty.
@@ -1947,33 +1791,23 @@ TL;DR Openly available, large-scale datasets and the development of stable-to-tr
 The advent of large models trained on internet-scale datasets has drastically influenced fields like Computer Vision (CV) and Natural Language Processing (NLP), shifting the previously task-specific paradigm towards combining (1) an initial, task-agnostic large-scale pre-training stage and a (2) task-specific, adjustment phase. This *pre-train-and-adapt* paradigm has now largely replaced more classic approaches consisting of task-specific data collection, curation and model training in many subdomains within CV and NLP, and it is motivated by the main drawback of limited scalability for *task-specific approaches*, which have been traditionally more labor intensive. Factors including (1) the advancements in generalist models learned with self-supervision for perception @oquabDINOv2LearningRobust2024 or semantic understanding @devlinBERTPretrainingDeep2019 and (2) the popularization of collective efforts to aggregate large-scale openly available datasets @oneillOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 are increasingly pushing the field of robot learning towards the pre-train-and-adapt paradigm. This shift taps into the long-standing challenge of developing generalist robot policies, and holds the promise to surpass traditionally siloed approaches to robotics problems and develop a *foundation robotics model*. While Section <a href="#learning-imitation" data-reference-type="ref" data-reference="learning-imitation">[learning-imitation]</a> introduced methods for learning *single-task policies* such as ACT or Diffusion Policy, in this section we present advancements in developing *generalist, multi-task, policies*, capable of performing a wide range of tasks across different environments and embodiments, and guided by unstructured instructions typically given in plain, natural language.
-<figure>
-<ResponsiveImage
   src={ch5_ml_vs_robotics_foundation}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch5-ml-vs-robotics-foundation" style="position: absolute;"></span>
-<figcaption>Fields within ML such as Computer Vision and NLP converged on the development of foundation models, trained on a variety of large scale models and capable to perform multiple downstream tasks (top). Conversely, robotics suffered from limited standardization in terms of the architectures used, and siloed, task specific datasets, incurring in a high degree of fragmentation which traditionally hindered the development of generalist models for robotics in favour of task-specific models (bottom).</figcaption>
-</figure>
 ### Preliminaries: Models and Data
 The remarkable success of foundation models in NLP and CV seems to be increasingly predicated on two core principles: architectural innovation and (joint) data-compute scaling. Indeed, the transformer architecture proved very effective in capturing long-range dependencies in a variety of data formats, and its stability and expressivity made it the *de facto* standard for modern large-scale models trained on internet-scale datasets. However, in stark contrast with large-scale NLP and CV datasets @raffelExploringLimitsTransfer2023, @ImageNet_VSS09, robotics has historically developed around small, task-specific datasets. In turn, this traditionally hindered scalability across problems as well as results, posing concrete challenges to developing general-purpose robot learning algorithms. Indeed, differently from the wealth of relatively readily-available task-agnostic text and images datasets on the internet, robotics data is *intrinsically embodied* and thus task-specific: datasets collected for *manipulation* differ significantly from *locomotion*. In particular, since each expert trajectory is tied to a specific robot platform and the operating conditions of its environment and task, data heterogeneity has long posed a *methodological* challenge for scaling robotics datasets via aggregation. Further, datasets consisting of expert demonstrations are (1) intrinsically more expensive to collect and (2) notoriously heterogeneous--different human experts may perform the same task in very different ways. Beyond this, heterogeneity also raises *conceptual* issues: naively mixing data across embodiments can induce negative transfer, as control strategies developed in isolation for different robot systems in different environments may even conflict when combined. 
Thus, the high degree of fragmentation of robotics datasets and tasks has traditionally led to the development of *specialist* policies, trained on small, task-specific datasets, developed to perform well at their designated task but that fail to generalize to new deployment scenarios (Figure <a href="#ch5-ml-vs-robotics-foundation" data-reference-type="ref" data-reference="ch5-ml-vs-robotics-foundation">[ch5-ml-vs-robotics-foundation]</a>).
-<figure>
-<ResponsiveImage
   src={ch5_generalist_policies_timeline}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch5-generalist-policies-timeline" style="position: absolute;"></span>
-<figcaption>Early efforts in the development of generalist models for robotics include BC-Zero @jangBCZZeroShotTask2022, RT-1 @brohanRT1RoboticsTransformer2023, and RT-2 @brohanRT2VisionLanguageActionModels2023: large scale models trained on thousands of demonstrations. The open release of the Open-X @oneillOpenXEmbodimentRobotic2025 and DROID datasets @khazatskyDROIDLargeScaleInTheWild2025 fostered the development of open source models: OpenVLA @kimOpenVLAOpenSourceVisionLanguageAction2024, <span class="math inline"> <em>π</em> <sub>0</sub> </span> @blackp0VisionLanguageActionFlow2024 and SmolVLA @shukorSmolVLAVisionLanguageActionModel2025.</figcaption>
-</figure>
 Driven by the goal of developing generalist robot policies, the research community has increasingly explored how insights and techniques from other areas of ML can be integrated into robotics. Figure <a href="#ch5-generalist-policies-timeline" data-reference-type="ref" data-reference="ch5-generalist-policies-timeline">[ch5-generalist-policies-timeline]</a> shows a timeline of some of the most popular contributions attempting at developing generalist policies. Starting from BC-Zero, a latent variable model trained on 25k+ demonstrations, the field has now evolved into $\pi_0$, a transformer-based model trained on 10M+ demonstrations and exhibiting strong few-shot capabilities across tasks and embodiments. In between, Robotics Transformer 1 (RT-1) @brohanRT1RoboticsTransformer2023 represented a significant step in the direction of developing generalist robot policies over prior work including (1) BC-Zero @jangBCZZeroShotTask2022 and (2) Gato @reedGeneralistAgent2022, in that @brohanRT1RoboticsTransformer2023 use a much larger and diverse set of training tasks compared to both BC-Zero and Gato. In particular, RT-1 uses a transformer architecture, and is trained on as many as 130k human-recorded trajectories collected over 13 robots and over 17 months. RT-1 learns to process a history of camera images and a natural language instruction, and feeds the resulting sequence of high-dimensional tokens to a transformer, trained using a *classification loss on a discretized actions space* consisting of six different 256-bins, one for each joint of a 6-dof robotic arm.
@@ -1983,17 +1817,12 @@ Traditionally, research efforts revolved around not only training models, but al
 Despite these advancements, the success of large, proprietary models like RT-1 and RT-2, highlighted a growing accessibility gap in robotics research, as training and deploying large-scale robotics foundation models requires computational resources simply unattainable for most research institutions. The OpenVLA project @kimOpenVLAOpenSourceVisionLanguageAction2024 emerged in direct contrast to traditionally closed-source efforts to develop VLAs. In particular, @kimOpenVLAOpenSourceVisionLanguageAction2024 trained OpenVLA by exclusively leveraging openly available data (970k+ trajectories from the Open-X dataset), and openly shared their training recipes alongside the model weights. Architecturally, OpenVLA integrates a pre-trained vision encoder to project visual tokens into the embedding space of the Llama2-7B @touvronLlama2Open2023 language-model backbone. The language model backbone is then used to predict *discrete action tokens* over 256 activation levels.
-<figure>
-<ResponsiveImage
   src={ch5_trends}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch5-trends" style="position: absolute;"></span>
-<figcaption>Robot learning is undergoing a paradigmatic shift: centralized data collections (A, left) are increasingly larger, often comprising millions of demonstrations, while (A, right) decentralized data collection efforts are becoming an alternative for large scale data collection. (B) Generalist models are also becoming increasingly smaller and easier to run on limited hardware.</figcaption>
-</figure>
 Figure <a href="#ch5-trends" data-reference-type="ref" data-reference="ch5-trends">[ch5-trends]</a> shows the current trends in robot learning in terms of size and nature of the robotics datasets contributed, together with the size and accessibility of the available models. As datasets collected via centralized, cross-institutions cooperation of increasing size are made available for the research community, decentralized datasets collected by individual researchers and practitioners also gained traction, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are increasingly becoming much more compute-efficient, and as a result the models’ size has been consistently reducing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments.
@@ -2013,17 +1842,12 @@ Recently, compute efficiency has also become a central focus in multi-modal rese
 $\pi_0$ @blackp0VisionLanguageActionFlow2024 introduce a VLA consisting of a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching. Images and language are embedded with PaliGemma, a VLM merging independently encoded visual and textual features deep in the network (*late-fusion*), while proprioceptive state and actions chunks are routed to a smaller *action expert*, initialized from scratch. The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and values matrices at each layer, maintaining specialization while efficiently allocating computation.
-<figure>
-<ResponsiveImage
   src={ch5_pi0}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch5-pi0" style="position: absolute;"></span>
-<figcaption>The <span class="math inline"> <em>π</em> <sub>0</sub> </span> architecture, as in @blackp0VisionLanguageActionFlow2024. Vision and language tokens are routed to a VLM backbone which is prevented from attending robot proprioperceptive states and action tokens, which are instead routed to a smaller subset of weights within the architecture referred to as "action expert". The architecture is trained with Flow Matching on 10M+ trajectories from a mixture of closed and openly available datasets.</figcaption>
-</figure>
 Concretely, $\pi_0$ is a single, unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $f_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple cameras points $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process both the robot proprioperceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turn them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioperceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while *across* blocks, future blocks are masked out. 
Formally, this corresponds to using an attention mask like: $\mathbf{A} = \bordermatrix{ & \mathcal{T}_i & \mathcal{T}_q & \mathcal{T}_a \cr \mathcal{T}_i & \mathbf{1} & \mathbf{0} & \mathbf{0} \cr \mathcal{T}_q & \mathbf{1} & \mathbf{1} & \mathbf{0} \cr \mathcal{T}_a & \mathbf{1} & \mathbf{1} & \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block bidirectional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioperceptive tokens and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
@@ -2058,11 +1882,10 @@ Flow matching  can be seen as a continuous time, deterministic generalization o
 <div class="wrapfigure">
-r0.4 <ResponsiveImage
   src={ch5_pi0_sampling_timesteps}
   zoomable
   downloadable
-  layout="fixed"
   alt="image"
 />
@@ -2141,17 +1964,12 @@ for epoch in range(num_epochs):
 With VLAs in the early stage of development compared to more mature LLMs and VLMs, much of the progress made on VLAs remains proprietary, with many releases exclusively sharing the weights while withholding the data used, full experimental details and essential methodological components of training. In contrast with this closed approach, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, which aims at democratizing the developments of robotics foundation models by open sourcing the model alongside the data used as well as the training recipes.
-<figure>
-<ResponsiveImage
   src={ch5_smolvla}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch5-smolvla" style="position: absolute;"></span>
-<figcaption>The SmolVLA architecture, as in @shukorSmolVLAVisionLanguageActionModel2025. SmolVLA is a compact MoE model trained with flow matching to denoise action chunks. Vision and language tokens are fed to a VLM backbone, and share information with the proprioperceptive and action tokens via the attention mechanism. The attention expert interleaves SA and CA layers for further conditioning on the visual features from the VLM backbone. SmolVLA skips computations and reduces the visual tokens, resulting in 7x less memory usage than <span class="math inline"> <em>π</em> <sub>0</sub> </span> (450M parameters vs. <span class="math inline"> <em>π</em> <sub>0</sub> </span>’s 3.3B).</figcaption>
-</figure>
 While encouraging efforts like $\pi_0$ @blackp0VisionLanguageActionFlow2024 demonstrate the feasibility of open VLA systems, they remain (1) large and compute-intensive and (2) dependent on closed datasets collected via centralized efforts on costly robotic platforms, which ultimately hinders the accessibility of the method altogether. SmolVLA mitigates both these issues by (1) prioritizing a compact, compute-efficient VLA design and (2) targeting community-contributed datasets on accessible robotic platforms such as the SO-100 and SO-101 arms. Similarly to $\pi_0$, SmolVLA (Figure <a href="#ch5-smolvla" data-reference-type="ref" data-reference="ch5-smolvla">[ch5-smolvla]</a>) employs a MoE architecture combining a pretrained VLM backbone with a dedicated action expert, and trains with flow matching. To ensure efficiency and accessibility, SmolVLA adopts SmolVLM-2 @marafiotiSmolVLMRedefiningSmall2025 as its VLM backbone, considering SmolVLM-2’s reduced size and capability to process multiple image inputs alongside text items. SmolVLM-2 uses SigLIP @zhaiSigmoidLossLanguage2023 as vision encoder, producing visual features for a SmolLM2 language decoder @allalSmolLM2WhenSmol2025. Further, SmolVLA adopts a smaller action expert consisting of $\sim$100M parameters and an interleaved stack of self and cross-attention layers. To improve efficiency, the action expert adopts a reduced embedding dimension compared to the VLM backbone, resulting in $d_{v_\theta} = 0.75 d_{\text{VLM}}$. @shukorSmolVLAVisionLanguageActionModel2025’s design choices thus result in a much smaller size model compared to $\pi_0$, consisting of ca. 450M parameters versus $\pi_0$’s 3.3B parameters.

 ---
 import MultiImage from '../components/MultiImage.astro';
+import Image from '../components/Image.astro';
 import Quote from '../components/Quote.astro';
 import ch2_planar_manipulator_free from './assets/image/figures/ch2/ch2-planar-manipulator-free.png';
 import ch2_planar_manipulator_floor from './assets/image/figures/ch2/ch2-planar-manipulator-floor.png';
 ## Introduction
+<Image
   src={ch1_lerobot_figure1}
   zoomable
   downloadable
   alt="Figure"
+ caption={'lerobot is the open-source library for end-to-end robotics developed by Hugging Face. The library is vertically integrated on the entire robotics stack, supporting low-level control of real-world robot devices, advanced data and inference optimizations, as well as SOTA robot learning methods with simple implementations in pure Pytorch.'}/>
 Autonomous robotics holds the promise of relieving humans from repetitive, tiring or dangerous manual tasks. Consequently, the field of robotics has been widely studied since its first inception in the 1950s. Lately, advancements in Machine Learning (ML) have sparked the development of a relatively new class of methods used to tackle robotics problems, leveraging large amounts of data and computation rather than human expertise and modeling skills to develop autonomous systems.
 ### Explicit and Implicit Models
+<Image
   src={ch2_approaches}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Overview of methods to generate motion (clearly non-exhaustive, see @bekrisStateRobotMotion2024). The different methods can be grouped based on whether they explicitly (dynamics-based) or implicitly (learning-based) model robot-environment interactions.'}/>
 Robotics is concerned with producing artificial motion in the physical world in useful, reliable and safe fashion. Thus, robotics is an inherently multidisciplinary domain: producing autonomous motion in the physical world requires, at the very least, interfacing different software (motion planners) and hardware (motion executioners) components. Further, knowledge of mechanical, electrical, and software engineering, as well as rigid-body mechanics and control theory have therefore proven quintessential in robotics since the field first developed in the 1950s. More recently, Machine Learning (ML) has also proved effective in robotics, complementing these more traditional disciplines @connellRobotLearning1993. As a direct consequence of its multidisciplinary nature, robotics has developed as a rather wide array of methods, all concerned with the main purpose of <mark>producing artificial motion in the physical world</mark>.
 ### Different Types of Motion
+<Image
   src={ch2_platforms}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Different kinds of motions are achieved with potentially very different robotic platforms. From left to right, top to bottom: ViperX, SO-100, Boston Dynamics’ Spot, Open-Duck, 1X’s NEO, Boston Dynamics’ Atlas. This is an example list of robotic platforms and is (very) far from being exhaustive.'}/>
 In the vast majority of instances, robotics deals with producing motion via actuating joints connecting nearly entirely-rigid links. A key distinction between focus areas in robotics is based on whether the generated motion modifies (1) the absolute state of the environment (via dexterity), (2) the relative state of the robot with respect to its environment (exercising mobility skills), or (3) a combination of the two (Figure <a href="#robotics-platforms-atlas" data-reference-type="ref" data-reference="robotics-platforms-atlas">[robotics-platforms-atlas]</a>).
 Recently, the development of low-cost manipulators like the ALOHA @zhaoLearningFineGrainedBimanual2023, ALOHA-2 @aldacoALOHA2Enhanced and SO-100/SO-101 @knightStandardOpenSO100 platforms significantly lowered the barrier to entry to robotics, considering the increased accessibility of these robots compared to more traditional platforms like the Franka Emika Panda arm (Figure <a href="#robotic-platforms-costs" data-reference-type="ref" data-reference="robotic-platforms-costs">[robotic-platforms-costs]</a>).
+<Image
   src={ch2_cost_accessibility}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Cheaper, more accessible robots are starting to rival traditional platforms like the Panda arm in adoption in resource-constrained scenarios. The SO-100, in particular, has a cost in the 100s of Euros, and can be entirely 3D-printed in hours, while the industrially-manufactured Panda arm costs tens of thousands of Euros and is not openly available.'}/>
 Deriving an intuition as per why learning-based approaches are gaining popularity in the robotics community requires briefly analyzing traditional approaches for manipulation, leveraging tools like forward and inverse kinematics (FK, IK) and control theory. Providing a detailed overview of these methods falls (well) out of the scope of this tutorial, and we refer the reader to works including @sicilianoSpringerHandbookRobotics2016, @lynchModernRoboticsMechanics2017, @tedrakeRoboticManipulationPerception, @tedrakeUnderactuatedRoboticsAlgorithms for a much more comprehensive description of these techniques. Here, we mostly wish to highlight the benefits of ML over these traditional techniques.
+<Image
   src={ch2_so100_to_planar_manipulator}
   zoomable
   downloadable
   alt="Figure"
+ caption={'The SO-100 arm is a 6-dof manipulator arm. Preventing some of its joints (shoulder pan, wrist flex and wrist roll) from actuating, it can be represented as a traditional 2-dof planar manipulator (the gripper joint in the end-effector is not considered towards the count of the degrees of freedom used to produce motion).'}/>
 Consider the (simple) case where a SO-100 is restrained from actuating (1) the shoulder pan and (2) the wrist flex and roll motors. This effectively reduces the degrees of freedom of the SO-100 from the original 5+1 (5 joints + 1 gripper) to 2+1 (shoulder lift, elbow flex + gripper). As the end-effector does not impact motion in this model, the SO-100 is effectively reduced to the planar manipulator robot presented in Figure <a href="#make-so100-planar-manipulator" data-reference-type="ref" data-reference="make-so100-planar-manipulator">[make-so100-planar-manipulator]</a>, where spheres represent actuators, and solid lines indicate length-$l$ links from the base of the SO-100 to the end-effector (*ee*).
 <div class="wrapfigure">
+<Image
   src={ch2_planar_manipulator_floor_box}
   zoomable
   downloadable
   alt="image"
 />
 Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular applicative problem.
+<Image
   src={ch2_classical_limitations}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Dynamics-based approaches to robotics suffer from several limitations: (1) orchestrating multiple components poses integration challenges; (2) the need to develop custom processing pipelines for the sensing modalities and tasks considered hinders scalability; (3) simplified analytical models of physical phenomena (here friction at the gripper; credits to @antonovaReinforcementLearningPivoting2017) limit real-world performance. Lastly, (4) dynamics-based methods overlook trends in the availability and growth of robotics data.'}/>
 Dynamics-based robotics pipelines have historically been <mark>developed sequentially, engineering the different blocks</mark> now within most architectures for specific purposes. That is, sensing, state estimation, mapping, planning, (diff-)IK, and low-level control have been traditionally developed as distinct modules with fixed interfaces. Pipelining these specific modules proved error-prone, and brittleness emerges--alongside compounding errors--whenever changes occur (e.g., changes in lighting for sensing, occlusion/failure of sensors, control failures). Adapting such a stack to new tasks or robotic platforms often entails re-specifying objectives, constraints, and heuristics at multiple stages, incurring significant engineering overhead.
 TL;DR The need for expensive, high-fidelity simulators can be obviated by learning from real-world data, using sample-efficient algorithms that can safely train directly on hardware.
 </div>
+<Image
   src={ch3_learning_benefits}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Learning-based robotics streamlines perception-to-action by learning a (1) unified high-level controller capable of taking (2) high-dimensional, unstructured sensorimotor information. Learning (3) does not require a dynamics model and instead focuses on interaction data, and (4) empirically correlates with the scale of the data used.'}/>
 Learning-based techniques for robotics naturally address the limitations presented in Section <a href="#classical" data-reference-type="ref" data-reference="classical">[classical]</a> (Figure <a href="#robot-learning-upsides" data-reference-type="ref" data-reference="robot-learning-upsides">[robot-learning-upsides]</a>). In particular, learning-based techniques typically rely on monolithic prediction-to-action pipelines (*visuomotor policies*) which directly map sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components. Mapping sensory inputs to actions also makes it possible to incorporate diverse input modalities, leveraging the automatic feature extraction capabilities of modern learning systems. Moreover, learning-based approaches can, in principle, bypass explicit modeling altogether and instead rely solely on interaction data--an advantage that proves transformative when dynamics are difficult to model or entirely unknown. Lastly, learning for robotics (*robot learning*) is naturally well posed to leverage the growing amount of robotics data openly available, just as computer vision and natural language processing did historically benefit from large-scale corpora of data, in great part overlooked by dynamics-based approaches.
 <div class="wrapfigure">
+<Image
   src={ch3_learning_atlas}
   zoomable
   downloadable
   alt="image"
 />
 In Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> we deliberately include generalist robot models @blackp0VisionLanguageActionFlow2024, @shukorSmolVLAVisionLanguageActionModel2025 alongside task-specific BC methods. While significantly different in spirit--*generalist* models are language-conditioned and use instructions to generate motion valid across many tasks, while *task-specific* models are typically not language-conditioned and used to perform a single task--*foundation* models are still largely trained to reproduce trajectories contained in a (large) training set of input demonstrations. Thus, we argue generalist policies can indeed be grouped alongside other task-specific BC methods, as they both leverage similar training data and schemas. Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> illustrates this categorization graphically, explicitly listing all the robot learning policies currently available in `lerobot`- Action Chunking with Transformers (ACT) @zhaoLearningFineGrainedBimanual2023, Diffusion Policy @chiDiffusionPolicyVisuomotor2024, Vector-Quantized Behavior Transformer (VQ-BeT) @leeBehaviorGenerationLatent2024, $\pi_0$ @blackp0VisionLanguageActionFlow2024, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025, Human-in-the-loop Sample-efficient RL (HIL-SERL) @luoPreciseDexterousRobotic2024 and TD-MPC @hansenTemporalDifferenceLearning2022.
+<Image
   src={ch3_rl_examples}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Examples of two different robotics tasks performed using RL. In the manipulation task (A) an agent learns to reach for a yellow plastic block in its environment, and to put it inside of a box. In the locomotion task (B) an agent learns to move its center of mass sideways without falling.'}/>
 Applications of RL to robotics have been studied long enough that the relationship between these two disciplines has been compared to that of physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their inherently interactive and sequential nature, robotics control problems can be directly cast as RL problems. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> presents two of such cases. Reaching for an object to then move it somewhere else in the scene is a sequential problem where over time the controller needs to adjust the position of the robot arm based on the current configuration and the (possibly varying) position of the object. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation: while sliding to the side, the controller needs to keep adjusting to the robot’s configuration to avoid failure (falling).
 The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to tackle robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) capable to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents improve through trial and error, bypassing explicit models of the problem dynamics in favor of interaction data. In RL, this feedback loop between actions and outcomes (Figure <a href="#rl-most-famous-pic" data-reference-type="ref" data-reference="rl-most-famous-pic">[rl-most-famous-pic]</a>) is established through the agent sensing a scalar quantity (*reward*) measuring how desirable a given *transition* is for the accomplishment of its goal.
+<Image
   src={ch3_agent_env}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Agent-Environment interaction diagram (image credits to @suttonReinforcementLearningIntroduction2018).'}/>
 Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP) @bellmanMarkovianDecisionProcess1957. Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through MDP’s inherently stochastic formulation and (2) providing a theoretically-sound framework for learning *without* an explicit model of the environment dynamics. While accommodating a continuous time formulation too, MDPs are typically considered in discrete time in RL, assuming interactions to atomically take place at discrete *timesteps* $t=0,1,2,3, \dots, T$. MDPs allowing for an unbounded number of interactions ($T \to + \infty$) are termed *infinite-horizon*, and opposed to *finite-horizon* MDPs in which $T$ is finite. Unless otherwise specified, we will only be referring to discrete-time finite-horizon (*episodic*) MDPs.
 ```
 inducing an ordering over states and state-action pairs under $\pi$, and value functions are thus central to most RL algorithms. A variety of algorithms have been developed in RL attempting to find (approximate) solutions to the problem of maximizing cumulative reward (we report some in Figure <a href="#rl-algos-atlas" data-reference-type="ref" data-reference="rl-algos-atlas">[rl-algos-atlas]</a>).
+<Image
   src={ch3_rl_algorithms_atlas}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Popular RL algorithms. See @SpinningUp2018 for a complete list of citations.'}/>
 Popular approaches to continuous state and action space--such as those studied within robotics--include TRPO, PPO and SAC. Across manipulation @akkayaSolvingRubiksCube2019 and locomotion problems @leeLearningQuadrupedalLocomotion2020, RL proved extremely effective in providing a platform to (1) leverage a unified, streamlined perception-to-action pipeline, (2) natively integrate proprioception with multi-modal high-dimensional sensory streams, (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2025.
 First, especially early in training, <mark>actions are typically explorative, and thus may be erratic</mark>. On physical systems, untrained policies may command high velocities, self-colliding configurations, or torques exceeding joint limits, leading to wear and potential hardware damage. Mitigating these risks requires external safeguards (e.g., watchdogs, safety monitors, emergency stops), often incurring a high degree of human supervision. Further, in the typical episodic setting considered in most robotics problems, experimentation is substantially slowed down by the need to manually reset the environment over the course of training, a time-consuming and error-prone process. Second, learning efficiently remains problematic in RL, <mark>limiting the applicability of RL in real-world robotics due to consequently prohibitive timescales of training</mark>. Even strong algorithms such as SAC @haarnojaSoftActorCriticOffPolicy2018 typically require a large number of transitions $\{ (s_t, a_t, r_t, s_{t+1})\}_{t=1}^N$. On real-world hardware, generating this data is time-consuming.
+<Image
   src={ch3_duck_sim_vs_real}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Simulated (left) vs. real-world (right) OpenDuck. Discrepancies in the simulation dynamics (reality gap) pose risks to policy transfer.'}/>
 Training RL policies in simulation @tobinDomainRandomizationTransferring2017 addresses both issues, eliminating physical risk and dramatically increasing throughput. Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) limiting the possibilities to transfer the policies learned in simulation, due to the discrepancy between real and simulated environments (*reality gap*, Figure <a href="#synthetic-vs-real-duck" data-reference-type="ref" data-reference="synthetic-vs-real-duck">[synthetic-vs-real-duck]</a>). *Domain randomization* @tobinDomainRandomizationTransferring2017 (DR) is a popular technique to overcome the reality gap, and consists in randomizing the parameters of the simulated environment during training, aiming at inducing robustness to specific disturbances. In this, DR is typically employed to increase the diversity of scenarios over the course of training, improving on the performance of sim-to-real transferred policies @akkayaSolvingRubiksCube2019, @antonovaReinforcementLearningPivoting2017, @jiDribbleBotDynamicLegged2023. In practice, DR is performed training in simulation on simulated dynamics $\mathcal D$, further parametrized as $\mathcal D \equiv \mathcal D_\xi$, with a *dynamics* (random) vector $\xi$ drawn from an arbitrary distribution, $\xi \sim \Xi$. For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure <a href="#ducks-on-terrains" data-reference-type="ref" data-reference="ducks-on-terrains">[ducks-on-terrains]</a>), or the center of mass of an object for a manipulation task. Over the course of training--typically at each episode’s reset--a new $\xi$ is drawn, and used to specify the environment’s dynamics for that episode.
+<Image
   src={ch3_many_ducks}
   zoomable
   downloadable
   alt="Figure"
+ caption={'The same locomotion task can be carried out in different (simulated) domains (exemplified by the difference in terrains) at training time, resulting in increased robustness over diverse environment dynamics.'}/>
 While effective in transferring policies across the reality gap in real-world robotics @tobinDomainRandomizationTransferring2017, @akkayaSolvingRubiksCube2019, @jiDribbleBotDynamicLegged2023, @tiboniDomainRandomizationEntropy2024, DR often requires extensive manual engineering. First, identifying which parameters to randomize--i.e., the *support* $\text{supp} (\Xi)$ of $\Xi$--is an inherently task-specific process. When locomoting over different terrains, choosing to randomize the friction coefficient is a reasonable choice, yet not completely resolutive as other factors (lighting conditions, external temperature, joints’ fatigue, etc.) may prove just as important in practice, making selecting these parameters yet another source of brittleness.
 Lastly, in order to improve on the robustness of their approach to different goals while maintaining practical scalability, @luoSERLSoftwareSuite2025 introduced a modified state and action space, expressing proprioceptive configurations $q$ and actions $\dot q$ in the frame of the end-effector pose at $t=0$. Randomizing the initial pose of the end-effector ($s_0$), @luoSERLSoftwareSuite2025 achieved a similar result to that of manually randomizing the environment at every timestep, but with the benefit of maintaining the environment in the same condition across multiple training episodes, achieving higher scalability of their method thanks to the increased practicality of their approach.
+<Image
   src={ch3_hil_serl_examples}
   zoomable
   downloadable
   alt="Figure"
+ caption={'(A) HIL-SERL allows for real-world training of high performance RL agents by building on top of advancements presented by SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on a real-world SO-100.'}/>
 Building on off-policy deep Q-learning with replay buffers, entropy regularization for better exploration, expert demonstrations to guide learning, and a series of tools and recommendations for real-world training using reward classifiers (Figure <a href="#hil-serl-blocks" data-reference-type="ref" data-reference="hil-serl-blocks">[hil-serl-blocks]</a>), @luoPreciseDexterousRobotic2024 introduce human interactions during training, learning near-optimal policies in challenging real-world manipulation tasks in 1-2 hours.
 #### Code Example- Real-world RL
+<Image
   src={ch3_hil_serl_architecture}
   zoomable
   downloadable
   alt="Figure"
+ caption={'HIL-SERL is a SOTA RL algorithm for training control policies directly in the real world. Its implementation in lerobot relies on a decoupled actor-learner architecture, communicating over processes (and possibly networks) with queues used to share (1) transitions (s_t, a_t, r_t, s_{t+1}) and (2) parameters θ.'}/>
 This example shows how to use the HIL-SERL implementation supported by `lerobot`. This code example is organized into four parts: we first show how to train a reward classifier from a custom set of demonstrations, then define the `Actor` and `Learner` components, and finally, we bring them together in a complete script showing how to use HIL-SERL in practice.
 TL;DR Behavioral Cloning provides a natural platform to learn from real-world interactions without the need to design any reward function, and generative models prove more effective than point-wise policies at dealing with multimodal demonstration datasets.
 </div>
+<Image
   src={ch4_bc_trajectories}
   zoomable
   downloadable
   alt="Figure"
+ caption={'(A) Average (with standard deviation) evolution of the actuation levels over the first 5 recorded episodes in lerobot/svla_so101_pickplace. Proprioceptive states prove invaluable in determining the robot’s state during an episode. (B) Camera frames are also recorded alongside measurements on the robot’s state, capturing information about the robot’s interaction with its environment.'}/>
 Learning from human demonstrations provides a pragmatic alternative to the RL pipeline discussed in Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>. Indeed, especially in real-world robotics, online exploration is typically <mark>costly and potentially unsafe</mark>, and designing (dense) reward signals is a <mark>brittle and task-specific</mark> process. Further, even success detection itself often requires bespoke instrumentation, while episodic training demands reliable resets--all factors complicating training RL algorithms on hardware at scale. Behavioral Cloning (BC) sidesteps these constraints by <mark>casting control as an imitation learning problem</mark>, leveraging previously collected expert demonstrations to anchor the learned autonomous behavior. Most notably, by *learning-to-imitate*, autonomous systems naturally adhere to the objectives, preferences, and success criteria implicitly encoded in the data, which reduces early-stage exploratory failures and obviates hand-crafted reward shaping altogether.
 Formally, let $\mathcal D = \{ \tau^{(i)} \}_{i=1}^N$ be a set of expert trajectories, with $\tau^{(i)} = \{(o_t^{(i)}, a_t^{(i)})\}_{t=0}^{T_i}$ representing the $i$-th length-$T_i$ trajectory in $\mathcal D$, $o_t \in \mathcal O$ denoting observations (e.g., images and proprioception altogether), and $a_t \in \mathcal A$ the expert actions. Typically, observations $o \in \mathcal O$ consist of both image and proprioceptive information, while actions $a \in \mathcal A$ represent control specifications for the robot to execute, e.g. a joint configuration. Note that differently from Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>, in the imitation learning context $\mathcal D$ denotes an offline dataset collecting $N$ length-$T_i$ reward-free (expert) human trajectories $\tau^{(i)}$, and *not* the environment dynamics. Similarly, in this section $\tau^{(i)}$ represents a length-$T_i$ trajectory of observation-action pairs, which crucially *omits entirely any reward* information. Figure <a href="#ch4-bc-trajectories" data-reference-type="ref" data-reference="ch4-bc-trajectories">[ch4-bc-trajectories]</a> graphically shows trajectories in terms of the average evolution of the actuation on the 6 joints of a teleoperated SO-100 manipulator. Notice how proprioceptive states are captured jointly with camera frames over the course of the recorded episodes, providing a unified high-frame rate collection of both image and joint teleoperation data. Figure <a href="#ch4-observation-action-mapping" data-reference-type="ref" data-reference="ch4-observation-action-mapping">[ch4-observation-action-mapping]</a> shows $(o_t, a_t)$-pairs for the same dataset, with the actions performed by the human expert illustrated alongside the corresponding observation. 
In principle, (expert) trajectories $\tau^{(i)}$ can have different lengths since demonstrations might exhibit multi-modal strategies to attain the same goal, resulting in multiple, different behaviors.
+<Image
   src={ch4_observation_action_mapping}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Sample observations and action pairs over the course of a given trajectory recorded in lerobot/svla_so101_pickplace. Observations, comprising both proprioceptive and visual information, are recorded alongside the configuration of a second, leader robot controlled by a human expert, providing complete information for regressing actions given observations.'}/>
 Behavioral Cloning (BC) @pomerleauALVINNAutonomousLand1988 aims at producing synthetic behaviors by learning the mapping from observations to actions, and in its most natural formulation can be effectively tackled as a *supervised* learning problem, consisting of learning the (deterministic) mapping $f: \mathcal O\mapsto \mathcal A, \ a_t = f(o_t)$ by solving
 ``` math
 Despite the inherent challenges of learning from non-i.i.d. data, the BC formulation presents several operational advantages in robotics. First, training happens offline and naturally accommodates expert demonstration data, thereby severely limiting exploration risks by preventing the robot from performing dangerous actions altogether, by anchoring action in imitation. Second, reward design is entirely unnecessary in BC, as demonstrations already reflect human intent. The absence of rewards also prevents the risk of misalignment and specification gaming (*reward hacking*), otherwise inherent in purely reward-based RL @heessEmergenceLocomotionBehaviours2017. Third, because expert trajectories encode terminal conditions, success detection and resets are implicit in the dataset. Finally, empirical evidence suggests the performance of BC scales naturally with growing corpora of demonstrations collected across tasks, embodiments, and environments. Nonetheless, BC can, in principle, only reproduce behaviors that are at best as good as those of the demonstrator, and therefore offers no remedy for the suboptimal decisions that humans may enact. This limitation is particularly problematic in sequential decision-making tasks where expert demonstrations are scarce---either because data collection is costly or because human performance is inherently suboptimal. Yet, many robotics applications still benefit from relatively inexpensive pipelines for collecting high-quality human-generated trajectories, justifying the use of BC in such settings.
+<Image
   src={ch4_issues_with_bc}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Point-wise policies suffer from limitations due to (A) covariate shifts and (B) poor approximation of multimodal demonstrations. (A) Small errors may drive the policy out of distribution, triggering a vicious circle ultimately resulting in failure. (B) Both modes of reaching for a target object in the scene--either left or right-first--are equally as good and thus equally as likely to be present in a dataset of human demonstrations, ultimately resulting in multimodal demonstrations.'}/>
 While conceptually elegant, *point-estimate policies* $f : \mathcal O\mapsto \mathcal A$ learned by solving eq. <a href="#loss-minimization-SL" data-reference-type="ref" data-reference="loss-minimization-SL">[loss-minimization-SL]</a> have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a> illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $\mathcal D$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $\epsilon$-prediction errors $0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon$ can quickly drive the policy into out-of-distribution states, incurring less confident generations and thus compounding errors (Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving real-world robotics problems, as multiple trajectories can be equally as good towards the accomplishment of a goal (e.g., symmetric grasps, Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the *generative model* $p(o, a)$ underlying the samples in $\mathcal D$, rather than explicitly learning a prediction function $f: a = f(o)$.
 #### Variational Auto-Encoders
+<Image
   src={ch4_task_effect_on_pairs}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Intuitively, the latent variable in a single-latent model may contain information regarding the task being performed, which directly results in the likelihood of the same observation-action pair being different for two different tasks. When (A) picking a block the likelihood of a wide gripper’s opening should be higher than that of a narrower one, while it should be the opposite when (B) pushing the block.'}/>
 A common inductive bias used in GM posits samples $(o,a)$ are influenced by an unobservable latent variable $z \in Z$, resulting in:
 ``` math
 ```
 Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics application, $z$ could be interpreted as some high level representation of the underlying task being performed by the human demonstrator. In such case, treating $p(o,a)$ as a marginalization over $\operatorname{supp}({Z})$ of the complete joint distribution $p(o,a,z)$ natively captures the effect different tasks have on the likelihood of observation-action pairs. Figure <a href="#ch4-task-effect-on-pairs" data-reference-type="ref" data-reference="ch4-task-effect-on-pairs">[ch4-task-effect-on-pairs]</a> graphically illustrates this concept in the case of a (A) picking and (B) pushing task, for which, nearing the target object, the likelihood of actions resulting in opening the gripper--the higher $q_6$, the wider the gripper’s opening--should intuitively be (A) high or (B) low, depending on the task performed. While the latent space $Z$ typically has a much richer structure than the set of all actual tasks performed, eq. <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> still provides a solid framework to learn joint distribution conditioned on unobservable yet relevant factors. Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a> represents this latent-variable framework in the context of a robotics application- the true, $z$-conditioned generative process assigns *likelihood* $p((o,a) \vert z)$ to the single $(o,a)$-pair. Using Bayes’ theorem, one can reconstruct the *posterior* distribution on $\operatorname{supp}({Z})$, $q_\theta(z \vert o,a)$ from the likelihood $p_\theta(o,a \vert z)$, *prior* $p_\theta(z)$ and *evidence* $p_\theta(o,a)$. VAEs approximate the latent variable model presented in eq. 
<a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> using an *approximate posterior* $q_\phi(z \vert o,a)$ while regressing parameters for a parametric likelihood, $p_\theta(o,a \vert z)$ (Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a>).
+<Image
   src={ch4_latent_variable_model}
   zoomable
   downloadable
   alt="Figure"
+ caption={'(A) The latent variable model in a robotics application regulates influence between observed ( o, a) variables and an unobservable latent variable. (B) VAEs approximate exact latent variable models by means of variational inference.'}/>
 Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can be written as:
 <span id="evidence-definition-1" style="position: absolute;">
 ```
 where we explicitly showed the marginalization over the multiple latents in eq. <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>, and used the law of conditional probability and Markov property in eq. <a href="#BC-multi-latent-model-2" data-reference-type="ref" data-reference="BC-multi-latent-model-2">[BC-multi-latent-model-2]</a>. Also, for ease of notation, we will refer to observation-action pairs $o,a$ as $z_0$.
+<Image
   src={ch4_many_latents}
   zoomable
   downloadable
   alt="Figure"
+ caption={'HMLV models posit the data generation process is influenced by a stack of Markov-dependent latent variables, with samples from the posterior distribution being progressively higher up in the hierarchy.'}/>
 Similar to VAEs, it is generally not possible to assign an *exact* interpretation to the latent variables. Nevertheless, a reasonable application-driven intuition is that Hierarchical Markov Latent Variable (HMLV) models, by capturing hierarchical and decoupled interactions among latent variables, can reflect the different resolutions at which conditioning factors intervene. For example, in a robotics setting, one might naturally distinguish between high-level trajectory planning (higher up in the hierarchy, $t \to T$) and fine-grained motion adjustments (closer to empirical observations, $t \to 0$). In that, HMLV models thus provide a framework to perform variational inference via multiple, sequential sampling steps from different higher level distributions instead of approximating the generative process with a single-latent variable model. DMs are a particular instantiation of HMLV models for which the posterior is fixed to $q( z_t \vert z_{t-1}) = \mathcal N(z_{t-1} \sqrt{1-\beta_t}, \beta_t \mathbf{I})$, for a given $\beta_t \in \mathbb R^+$. In practice, $\beta_t$ is used to iteratively reduce the signal-to-noise ratio along the latents’ hierarchy, similarly to how a diffusion process influences the information of a physical system.
 ```
 where the former term is equivalent to the reconstruction term in eq. <a href="#VAE-min-neg-ELBO" data-reference-type="ref" data-reference="VAE-min-neg-ELBO">[VAE-min-neg-ELBO]</a> and the latter term can be obtained in closed form.
+<Image
   src={ch4_diffusion_robot_actions}
   zoomable
   downloadable
   alt="Figure"
+ caption={'DMs iteratively corrupt samples (left) from an unknown distribution into a quasi-standard Gaussian (center), learning the displacement field (right) that permits to reconstruct samples from the unknown target distribution by iteratively denoising samples of a tractable, easy-to-sample distribution.'}/>
 Besides mathematical tractability of eq. <a href="#diffusion-likelihood-gradient" data-reference-type="ref" data-reference="diffusion-likelihood-gradient">[diffusion-likelihood-gradient]</a>, adopting Gaussian posteriors allows for a particularly intuitive interpretation of the training dynamics of DMs @permenterInterpretingImprovingDiffusion2024. As the hierarchical latent variables are repeatedly corrupted by applying increasingly more Gaussian noise, they progressively lose information about the original (unknown) sample $z_0$, converging toward a standard Gaussian which eventually contains no information at all (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>). Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a> illustrates this process on a simplified, bidimensional observation-action distribution, where we considered $o=q_2$ and $a=q^h_2$, with $q_2$ denoting the robot’s *elbow flex* actuation and $q^h_2$ the corresponding human teleoperator’s elbow flex. Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Notice how corrupted samples distribute differently from the most reasonable structure $a \simeq o$, further underscoring how diffusion corrupts both the individual samples and the global distribution (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, left and center). 
In this, using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples. Comparing the diffused samples to the original data points, one can derive an estimate of the total displacement induced by the diffusion process, and, under the assumption that the likelihood of the totally diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by *learning to reverse* such displacement. This key intuition allows to write a simplified training objective[^4]:
 <span id="diffusion-simplified-loss" style="position: absolute;">
 \end{align}
 ```
+<Image
   src={ch4_action_vs_observation_distribution}
   zoomable
   downloadable
   alt="Figure"
+ caption={'A joint action-observation distribution, in the simplified case where the observation is the elbow-flex actuation in a SO-100, and the action is the recorded position for the same joint from the teleoperator arm. The motion recorded being teleoperated, the points distribute along the diagonal.'}/>
 In this simplified (minimization) objective, the optimization process differs from eq. <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
 ```
 Conditional vector fields are defined not only over their argument $z$ and time $t$, but do also vary with respect to an auxiliary variable $z_0$, thereby extending the standard notion of a vector field to incorporate additional conditioning. Note that the traditional discrete-time noise-scheduler $\{\beta_t\}_{t=0}^T$ is now generalized to a continuous map $\beta : [0,1] \mapsto \mathbb R^+$. Crucially, @lipmanFlowMatchingGenerative2023 prove that by exclusively optimizing the vector field for individual data points $z_0 \in \mathcal D$, one also retrieves the optimal flow to morph the entire support of the initial distribution $p_0$ into $p_1 \ \text{s.t.} \mathcal D \sim p_1$.
+<Image
   src={ch4_normalizing_flows}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Probability distributions can be modified differently by applying different vector fields, inducing different flows of mass across the same support (top versus bottom, using two different time-invariant 2D-fields $u_1(x,y) = (x, 0)$ and $u_2(x,y) = (x/\sqrt{2}, y/\sqrt{2})$). Notice time flows continuously in [0, 1]. FM models learn to approximate a target vector field, thereby producing arbitrary (goal) transformations of an easy-to-sample initial distribution.'}/>
 While the noising schedule of DMs results in a stochastic process resembling a random (Brownian) walk, FM allows for more general--potentially, deterministic--likelihood and posterior parametrization. In the FM literature the likelihood and posterior probability densities defined along a HMLV model are typically referred to as a *probability path*, where the distributions for successive adjacent transitions in the HMLV model are related by the (normalized) flow between them (Figure <a href="#ch4-normalizing-flows" data-reference-type="ref" data-reference="ch4-normalizing-flows">[ch4-normalizing-flows]</a>). The inherent flexibility of FM is one of their key advantages over DMs, as it opens up the possibility of *learning* more efficient paths. For instance, one can design probability paths inspired by Optimal Transport (OT), a mathematical framework concerned with characterizing the most efficient morphings between probability distributions. Probability paths obtained through OT paths tend to be *straighter* than diffusion paths (Figure <a href="#ch4-diffusion-paths-versus-fm" data-reference-type="ref" data-reference="ch4-diffusion-paths-versus-fm">[ch4-diffusion-paths-versus-fm]</a>), which can lead to faster and more stable training, as well as empirically result in higher-quality generations with fewer denoising steps at inference time. In particular, by avoiding unnecessary backtracking associated with the inherent stochastic nature of both the noising and denoising process in DMs, test-time compute is typically significantly reduced in FM, while retaining comparable results @lipmanFlowMatchingGenerative2023.
+<Image
   src={ch4_diffusion_vs_flowmatching}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Compared to diffusion, flow matching distorts distribution along a less random pattern, resulting in a clearer interpolation between source and target distribution. The visualization shows an example comparison between these two methods on joint distribution of robot observations and actions over T = 50 steps.'}/>
 In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in eq. <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce an arbitrary mass displacement, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, Conditional FM (CFM) defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, which in turn results in the target vector field $u(t, z_t) = z_1 - z_0$. FM models can then be trained with a simple regression objective defined as:
 <span id="flow-matching-objective" style="position: absolute;">
 In ACT (Figure <a href="#ch4-act" data-reference-type="ref" data-reference="ch4-act">[ch4-act]</a>), inference for a given observation $o \in \mathcal O$ could be performed by (1) defining a prior $p_\omega(z \vert o)$ for the latent variable $z$ and (2) decoding an action chunk from a sampled latent $z \sim p_\omega(\bullet \vert o)$, similarly to how sampling from standard VAEs takes place, with the exception that vanilla VAEs typically pose $p(z\vert o) \equiv p(z) \sim \mathcal N(\mathbf{0}, \mathbf{I})$ and thus skip (1).
+<Image
   src={ch4_act_encoder}
   zoomable
   downloadable
   alt="Figure"
+ caption={'The CVAE encoder used in ACT. Input action chunks are first embedded and aggregated with positional embeddings, before being processed alongside embedded proprioceptive information, and a learned [CLS] token used to aggregate input level information, and predict the style variable z. The encoder is exclusively used to train the decoder, and it is entirely disregarded at inference time.'}/>
 However, the authors claim that using a deterministic procedure to sample $z$ benefits policy evaluation, and thus avoid using the conditional prior at all at inference time, effectively using the CVAE framework exclusively to train a more expressive decoder. At test time, @zhaoLearningFineGrainedBimanual2023 propose simply using $z = \mathbf{0}$, as the conditional prior on $z$ used in training is set to be a standard Gaussian. Further, conditioning on the observation $o$ is achieved through explicitly feeding proprioceptive and visual observations to the decoder, $p_\theta(a \vert z, o)$ at test time. If at inference $z$ is sampled from a standard Gaussian, during training $z$ is sampled from an approximate posterior distribution $q_\phi(z \vert o, a)$, which, however, disregards image observations and exclusively uses proprioceptive states to form $o$ for efficiency reasons.
+<Image
   src={ch4_act_decoder}
   zoomable
   downloadable
   alt="Figure"
+ caption={'The CVAE decoder used in ACT, comprising a full encoder-decoder Transformer architecture. Camera observations from all n camera views are first embedded using pre-trained visual encoders, and then aggregated with the corresponding positional embeddings. Then, the proprioceptive information and style variable z retrieved from the CVAE encoder, are fed to the encoder-decoder Transformer for inference. The encoder shares the matrices K, V with the decoder, and is trained to decode fixed position embeddings into action chunks.'}/>
 #### Code Example: Training and Using ACT in Practice
+<Image
   src={ch4_act}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Action Chunking with Transformer (ACT), as in @zhaoLearningFineGrainedBimanual2023. ACT introduces an action chunking paradigm to cope with high-dimensional multi-modal demonstration data, and a transformer-based CVAE architecture.'}/>
 <div class="pbox">
 Training ACT
 ```
 Note how in eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a> the noise regressor is conditioned on both the latent variable rank $t$ *and* on a stack of previous observations $o_{t-H_o:t}$. @chiDiffusionPolicyVisuomotor2024 claim the combination of (1) conditioning on a horizon of previous observations and (2) predicting multiple actions into the future allows DP to *commit to specific modes* in the data at inference time, which proves essential for good performance and avoiding indecisiveness.
+<Image
   src={ch4_diffusion_policy}
   zoomable
   downloadable
   alt="Figure"
+ caption={'The Diffusion Policy architecture, as in @chiDiffusionPolicyVisuomotor2024. A stack of $H_o$ previous observations is used as external conditioning to denoise a group of $H_a$ actions. Conditioning is performed at every layer of a U-Net block. Diffusion Policy allows to obtain fully-formed action chunks with as few as T = 10 denoising steps.'}/>
 Figure <a href="#diffusion-policy-architecture" data-reference-type="ref" data-reference="diffusion-policy-architecture">[diffusion-policy-architecture]</a> shows the convolution-based version of the architecture proposed by @chiDiffusionPolicyVisuomotor2024, illustrating inference on a single sample drawn from $\mathcal D$, for simplicity. The starting, arbitrarily noisy chunk of $H_a$ actions $\tilde a_{t:t+H_a}$ is first mapped to a (learned) high-dimensional space. Similarly, both image observations and poses are also embedded before being aggregated to the action embeddings. Then, a U-Net @ronnebergerUNetConvolutionalNetworks2015 is trained to regress the noise added into $\tilde a_{t:t+H_a}$, conditioned on observation information at every layer, thus seeking to optimize eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a>. At inference time, the noise predictor is used to predict the quantity of noise at every $t \in [T, \dots, 0 ]$ and iteratively subtract it from $\tilde a_{t:t+H_a}$, reversing the diffusion process simulated in training conditioned on $o_{t-H_o:t}$ to predict $a_{t:t+H_a}$.
 One can use the fact that policies output multiple actions at the same time to directly address (1) the lack of adaptiveness and (2) the presence of lags at runtime by decoupling action chunk *prediction* $\mathbf{A}$ from action *execution* $a_t \gets \text{PopFront}(\mathbf{A}_t)$. This decoupled stack, which we refer to as *asynchronous* (async) inference (<a href="#alg-async-inference" data-reference-type="ref" data-reference="alg-async-inference">[alg-async-inference]</a>), also enables optimized inference by allowing action-chunk inference to run on a separate machine, typically equipped with better computational resources than the ones onboard a robot. In async inference, a $\text{RobotClient}$ sends an observation $o_t$ to a $\text{PolicyServer}$, receiving an action chunk $\mathbf{A}_t$ once inference is complete (Figure <a href="#ch4-async-inference" data-reference-type="ref" data-reference="ch4-async-inference">[ch4-async-inference]</a>). In this, we avoid execution lags by triggering chunk prediction while the control loop is still consuming a previously available chunk, aggregating the previous and incoming chunks whenever the latter is available to the $\text{RobotClient}$. In turn, async-inference tightens the loop between action prediction and action execution efficiently, by increasing the frequency at which observations are processed for chunk prediction while not running inference at every timestep. Crucially, decoupling action prediction from action execution also allows to allocate more computational resources on a remote policy server sending actions to the robot client over the network.
+<Image
   src={ch4_async_inference}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Asynchronous inference. Illustration of the asynchronous inference stack. Note that the policy can be run on a remote server, possibly with GPUs.'}/>
 <div class="algorithm">
 <span id="alg-async-inference" style="position: absolute;"></span>
 - **Sync-inference limit $(g=1)$.** As an extreme case, and in keeping with @zhaoLearningFineGrainedBimanual2023, an observation is sent at *every* timestep. The queue is therefore almost always filled, with only a minor saw-tooth due to $\Delta t/\mathbb E[\ell_s] < 1$. While maximally reactive, this setting incurs one forward pass per control tick and can prove prohibitively expensive on limited hardware. Importantly, because the client is consuming actions while the server computes the next chunk, the available queue never gets entirely filled.
+<Image
   src={ch4_queues}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Action queue size evolution at runtime for various levels of g when (A) not filtering out observations based on joint-space similarity and (B) filtering out near-duplicate observations, measuring their similarity in joint-space.'}/>
 Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> emphasizes the trade-off governed by $g$: small values of $g$ result in idle periods, whereas $g\approx 1$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $g\in(0,1)$ allows to strike a balance between reactivity and resource budgets. If not for the aforementioned similarity filter, the $\text{RobotClient}$ would send observations for processing every $(1 - g) H_a \cdot \Delta t$ seconds, receiving a new chunk of actions every $(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]$, on average. The presence of the filter for observation similarity dilates this processing time, and serves the scope of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. In particular, Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> results in a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue is empty.
 The advent of large models trained on internet-scale datasets has drastically influenced fields like Computer Vision (CV) and Natural Language Processing (NLP), shifting the previously task-specific paradigm towards combining (1) an initial, task-agnostic large-scale pre-training stage and a (2) task-specific, adjustment phase. This *pre-train-and-adapt* paradigm has now largely replaced more classic approaches consisting of task-specific data collection, curation and model training in many subdomains within CV and NLP, and it is motivated by the main drawback of limited scalability for *task-specific approaches*, which have been traditionally more labor intensive. Factors including (1) the advancements in generalist models learned with self-supervision for perception @oquabDINOv2LearningRobust2024 or semantic understanding @devlinBERTPretrainingDeep2019 and (2) the popularization of collective efforts to aggregate large-scale openly available datasets @oneillOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 are increasingly pushing the field of robot learning towards the pre-train-and-adapt paradigm. This shift taps into the long-standing challenge of developing generalist robot policies, and holds the promise to surpass traditionally siloed approaches to robotics problems and develop a *foundation robotics model*. While Section <a href="#learning-imitation" data-reference-type="ref" data-reference="learning-imitation">[learning-imitation]</a> introduced methods for learning *single-task policies* such as ACT or Diffusion Policy, in this section we present advancements in developing *generalist, multi-task, policies*, capable of performing a wide range of tasks across different environments and embodiments, and guided by unstructured instructions typically given in plain, natural language.
+<Image
   src={ch5_ml_vs_robotics_foundation}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Fields within ML such as Computer Vision and NLP converged on the development of foundation models, trained on a variety of large scale datasets and capable of performing multiple downstream tasks (top). Conversely, robotics suffered from limited standardization in terms of the architectures used, and siloed, task specific datasets, incurring a high degree of fragmentation which traditionally hindered the development of generalist models for robotics in favour of task-specific models (bottom).'}/>
 ### Preliminaries: Models and Data
 The remarkable success of foundation models in NLP and CV seems to be increasingly predicated on two core principles: architectural innovation and (joint) data-compute scaling. Indeed, the transformer architecture proved very effective in capturing long-range dependencies in a variety of data formats, and its stability and expressivity made it the *de facto* standard for modern large-scale models trained on internet-scale datasets. However, in stark contrast with large-scale NLP and CV datasets @raffelExploringLimitsTransfer2023, @ImageNet_VSS09, robotics has historically developed around small, task-specific datasets. In turn, this traditionally hindered scalability across problems as well as results, posing concrete challenges to developing general-purpose robot learning algorithms. Indeed, differently from the wealth of relatively readily-available task-agnostic text and images datasets on the internet, robotics data is *intrinsically embodied* and thus task-specific: datasets collected for *manipulation* differ significantly from *locomotion*. In particular, since each expert trajectory is tied to a specific robot platform and the operating conditions of its environment and task, data heterogeneity has long posed a *methodological* challenge for scaling robotics datasets via aggregation. Further, datasets consisting of expert demonstrations are (1) intrinsically more expensive to collect and (2) notoriously heterogeneous--different human experts may perform the same task in very different ways. Beyond this, heterogeneity also raises *conceptual* issues: naively mixing data across embodiments can induce negative transfer, as control strategies developed in isolation for different robot systems in different environments may even conflict when combined. 
Thus, the high degree of fragmentation of robotics datasets and tasks has traditionally led to the development of *specialist* policies, trained on small, task-specific datasets, developed to perform well at their designated task but that fail to generalize to new deployment scenarios (Figure <a href="#ch5-ml-vs-robotics-foundation" data-reference-type="ref" data-reference="ch5-ml-vs-robotics-foundation">[ch5-ml-vs-robotics-foundation]</a>).
+<Image
   src={ch5_generalist_policies_timeline}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Early efforts in the development of generalist models for robotics include BC-Zero @jangBCZZeroShotTask2022, RT-1 @brohanRT1RoboticsTransformer2023, and RT-2 @brohanRT2VisionLanguageActionModels2023: large scale models trained on thousands of demonstrations. The open release of the Open-X @oneillOpenXEmbodimentRobotic2025 and DROID datasets @khazatskyDROIDLargeScaleInTheWild2025 fostered the development of open source models: OpenVLA @kimOpenVLAOpenSourceVisionLanguageAction2024, π₀ @blackp0VisionLanguageActionFlow2024 and SmolVLA @shukorSmolVLAVisionLanguageActionModel2025.'}/>
 Driven by the goal of developing generalist robot policies, the research community has increasingly explored how insights and techniques from other areas of ML can be integrated into robotics. Figure <a href="#ch5-generalist-policies-timeline" data-reference-type="ref" data-reference="ch5-generalist-policies-timeline">[ch5-generalist-policies-timeline]</a> shows a timeline of some of the most popular contributions attempting at developing generalist policies. Starting from BC-Zero, a latent variable model trained on 25k+ demonstrations, the field has now evolved into $\pi_0$, a transformer-based model trained on 10M+ demonstrations and exhibiting strong few-shot capabilities across tasks and embodiments. In between, Robotics Transformer 1 (RT-1) @brohanRT1RoboticsTransformer2023 represented a significant step in the direction of developing generalist robot policies over prior work including (1) BC-Zero @jangBCZZeroShotTask2022 and (2) Gato @reedGeneralistAgent2022, in that @brohanRT1RoboticsTransformer2023 use a much larger and diverse set of training tasks compared to both BC-Zero and Gato. In particular, RT-1 uses a transformer architecture, and is trained on as many as 130k human-recorded trajectories collected over 13 robots and over 17 months. RT-1 learns to process a history of camera images and a natural language instruction, and feeds the resulting sequence of high-dimensional tokens to a transformer, trained using a *classification loss on a discretized actions space* consisting of six different 256-bins, one for each joint of a 6-dof robotic arm.
 Despite these advancements, the success of large, proprietary models like RT-1 and RT-2, highlighted a growing accessibility gap in robotics research, as training and deploying large-scale robotics foundation models requires computational resources simply unattainable for most research institutions. The OpenVLA project @kimOpenVLAOpenSourceVisionLanguageAction2024 emerged in direct contrast to traditionally closed-source efforts to develop VLAs. In particular, @kimOpenVLAOpenSourceVisionLanguageAction2024 trained OpenVLA by exclusively leveraging openly available data (970k+ trajectories from the Open-X dataset), and openly shared their training recipes alongside the model weights. Architecturally, OpenVLA integrates a pre-trained vision encoder to project visual tokens into the embedding space of the Llama2-7B @touvronLlama2Open2023 language-model backbone. The language model backbone is then used to predict *discrete action tokens* over 256 activation levels.
+<Image
   src={ch5_trends}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Robot learning is undergoing a paradigmatic shift: centralized data collections (A, left) are increasingly larger, often comprising millions of demonstrations, while (A, right) decentralized data collection efforts are becoming an alternative for large scale data collection. (B) Generalist models are also becoming increasingly smaller and easier to run on limited hardware.'}/>
 Figure <a href="#ch5-trends" data-reference-type="ref" data-reference="ch5-trends">[ch5-trends]</a> shows the current trends in robot learning in terms of size and nature of the robotics datasets contributed, together with the size and accessibility of the available models. As datasets collected via centralized, cross-institutions cooperation of increasing size are made available for the research community, decentralized datasets collected by individual researchers and practitioners also gained traction, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are increasingly becoming much more compute-efficient, and as a result the models’ size has been consistently reducing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments.
 $\pi_0$ @blackp0VisionLanguageActionFlow2024 introduce a VLA built on a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching. Images and language are embedded with PaliGemma, a VLM merging independently encoded visual and textual features deep in the network (*late-fusion*), while proprioceptive state and action chunks are routed to a smaller *action expert*, initialized from scratch. The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and values matrices at each layer, maintaining specialization while efficiently allocating computation.
+<Image
   src={ch5_pi0}
   zoomable
   downloadable
   alt="Figure"
+ caption={'The π₀ architecture, as in @blackp0VisionLanguageActionFlow2024. Vision and language tokens are routed to a VLM backbone which is prevented from attending robot proprioceptive states and action tokens, which are instead routed to a smaller subset of weights within the architecture referred to as "action expert". The architecture is trained with Flow Matching on 10M+ trajectories from a mixture of closed and openly available datasets.'}/>
 Concretely, $\pi_0$ is a single, unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $f_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple camera viewpoints $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process both the robot proprioceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turn them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while *across* blocks, future blocks are masked out. 
Formally, this corresponds to using an attention mask like: $\mathbf{A} = \bordermatrix{ & \mathcal{T}_i & \mathcal{T}_q & \mathcal{T}_a \cr \mathcal{T}_i & \mathbf{1} & \mathbf{0} & \mathbf{0} \cr \mathcal{T}_q & \mathbf{1} & \mathbf{1} & \mathbf{0} \cr \mathcal{T}_a & \mathbf{1} & \mathbf{1} & \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block bidirectional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive tokens and action tokens, one can cache keys and values across denoising steps at runtime, resulting in a reduced computational footprint and faster inference.
 <div class="wrapfigure">
+r0.4 <Image
   src={ch5_pi0_sampling_timesteps}
   zoomable
   downloadable
   alt="image"
 />
 With VLAs in the early stage of development compared to more mature LLMs and VLMs, much of the progress made on VLAs remains proprietary, with many releases exclusively sharing the weights while withholding the data used, full experimental details and essential methodological components of training. In contrast with this closed approach, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, which aims at democratizing the developments of robotics foundation models by open sourcing the model alongside the data used as well as the training recipes.
+<Image
   src={ch5_smolvla}
   zoomable
   downloadable
   alt="Figure"
+ caption={'The SmolVLA architecture, as in @shukorSmolVLAVisionLanguageActionModel2025. SmolVLA is a compact MoE model trained with flow matching to denoise action chunks. Vision and language tokens are fed to a VLM backbone, and share information with the proprioceptive and action tokens via the attention mechanism. The action expert interleaves SA and CA layers for further conditioning on the visual features from the VLM backbone. SmolVLA skips computations and reduces the visual tokens, resulting in 7x less memory usage than π₀ (450M parameters vs. π₀’s 3.3B).'}/>
 While encouraging efforts like $\pi_0$ @blackp0VisionLanguageActionFlow2024 demonstrate the feasibility of open VLA systems, they remain (1) large and compute-intensive and (2) dependent on closed datasets collected via centralized efforts on costly robotic platforms, which ultimately hinders the accessibility of the method altogether. SmolVLA mitigates both these issues by (1) prioritizing a compact, compute-efficient VLA design and (2) targeting community-contributed datasets on accessible robotic platforms such as the SO-100 and SO-101 arms. Similarly to $\pi_0$, SmolVLA (Figure <a href="#ch5-smolvla" data-reference-type="ref" data-reference="ch5-smolvla">[ch5-smolvla]</a>) employs a MoE architecture combining a pretrained VLM backbone with a dedicated action expert, and trains with flow matching. To ensure efficiency and accessibility, SmolVLA adopts SmolVLM-2 @marafiotiSmolVLMRedefiningSmall2025 as its VLM backbone, considering SmolVLM-2’s reduced size and capability to process multiple image inputs alongside text items. SmolVLM-2 uses SigLIP @zhaiSigmoidLossLanguage2023 as vision encoder, producing visual features for a SmolLM2 language decoder @allalSmolLM2WhenSmol2025. Further, SmolVLA adopts a smaller action expert consisting of $\sim$100M parameters and an interleaved stack of self and cross-attention layers. To improve efficiency, the action expert adopts a reduced embedding dimension compared to the VLM backbone, resulting in $d_{v_\theta} = 0.75 d_{\text{VLM}}$. @shukorSmolVLAVisionLanguageActionModel2025’s design choices thus result in a much smaller size model compared to $\pi_0$, consisting of ca. 450M parameters versus $\pi_0$’s 3.3B parameters.

app/src/components/Hero.astro CHANGED Viewed

@@ -101,12 +101,7 @@ const pdfFilename = `${slugify(pdfBase)}.pdf`;
 <section class="hero">
   <h1 class="hero-title" set:html={title} />
   <div class="hero-banner">
-    <Image
-      src="/src/content/assets/lerobot-logo-thumbnail.png"
-      alt="LeRobot Logo"
-      width={400}
-      height={200}
-    />
     {description && <p class="hero-desc">{description}</p>}
   </div>
 </section>
@@ -372,10 +367,6 @@ const pdfFilename = `${slugify(pdfBase)}.pdf`;
     max-width: 980px;
     margin: 0 auto;
   }
-  .hero-banner img {
-    width: 100%;
-    height: auto;
-  }
   .hero-desc {
     color: var(--muted-color);
     font-style: italic;

 <section class="hero">
   <h1 class="hero-title" set:html={title} />
   <div class="hero-banner">
+    <HtmlEmbed src="banner.html" frameless />
     {description && <p class="hero-desc">{description}</p>}
   </div>
 </section>
     max-width: 980px;
     margin: 0 auto;
   }
   .hero-desc {
     color: var(--muted-color);
     font-style: italic;

app/src/content/article.mdx CHANGED Viewed

@@ -19,7 +19,7 @@ tableOfContentsAutoCollapse: true
 ---
 import MultiImage from '../components/MultiImage.astro';
-import ResponsiveImage from '../components/ResponsiveImage.astro';
 import Quote from '../components/Quote.astro';
 import ch2_planar_manipulator_free from './assets/image/figures/ch2/ch2-planar-manipulator-free.png';
 import ch2_planar_manipulator_floor from './assets/image/figures/ch2/ch2-planar-manipulator-floor.png';
@@ -84,17 +84,12 @@ We sincerely hope this tutorial serves as a valuable starting point for your jou
 ## Introduction
-<figure>
-<ResponsiveImage
   src={ch1_lerobot_figure1}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="figure1" style="position: absolute;"></span>
-<figcaption><code>lerobot</code> is the open-source library for end-to-end robotics developed by Hugging Face. The library is vertically integrated on the entire robotics stack, supporting low-level control of real-world robot devices, advanced data and inference optimizations, as well as SOTA robot learning methods with simple implementations in pure Pytorch.</figcaption>
-</figure>
 Autonomous robotics holds the promise of relieving humans from repetitive, tiring or dangerous manual tasks. Consequently, the field of robotics has been widely studied since its inception in the 1950s. Lately, advancements in Machine Learning (ML) have sparked the development of a relatively new class of methods used to tackle robotics problems, leveraging large amounts of data and computation rather than human expertise and modeling skills to develop autonomous systems.
@@ -293,17 +288,12 @@ TL;DR Learning-based approaches to robotics are motivated by the need to (1) gen
 ### Explicit and Implicit Models
-<figure>
-<ResponsiveImage
   src={ch2_approaches}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="generating-motion-atlas" style="position: absolute;"></span>
-<figcaption>Overview of methods to generate motion (clearly non-exhausitve, see @bekrisStateRobotMotion2024). The different methods can be grouped based on whether they explicitly (<em>dynamics-based</em>) or implicitly (<em>learning-based</em>) model robot-environment interactions.</figcaption>
-</figure>
 Robotics is concerned with producing artificial motion in the physical world in a useful, reliable and safe fashion. Thus, robotics is an inherently multidisciplinary domain: producing autonomous motion in the physical world requires, at the very least, interfacing different software (motion planners) and hardware (motion executors) components. Further, knowledge of mechanical, electrical, and software engineering, as well as rigid-body mechanics and control theory, has therefore proven essential in robotics since the field first developed in the 1950s. More recently, Machine Learning (ML) has also proved effective in robotics, complementing these more traditional disciplines @connellRobotLearning1993. As a direct consequence of its multidisciplinary nature, robotics has developed as a rather wide array of methods, all concerned with the main purpose of <mark>producing artificial motion in the physical world</mark>.
@@ -311,17 +301,12 @@ Methods to produce robotics motion range from traditional *explicit* models--<ma
 ### Different Types of Motion
-<figure>
-<ResponsiveImage
   src={ch2_platforms}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="robotics-platforms-atlas" style="position: absolute;"></span>
-<figcaption>Different kinds of motions are achieved with potentially very different robotic platforms. From left to right, top to bottom: ViperX, SO-100, Boston Dynamics’ Spot, Open-Duck, 1X’s NEO, Boston Dynamics’ Atlas. This is an example list of robotic platforms and is (very) far from being exhaustive.</figcaption>
-</figure>
 In the vast majority of instances, robotics deals with producing motion via actuating joints connecting nearly entirely-rigid links. A key distinction between focus areas in robotics is based on whether the generated motion modifies (1) the absolute state of the environment (via dexterity), (2) the relative state of the robot with respect to its environment (exercising mobility skills), or (3) a combination of the two (Figure <a href="#robotics-platforms-atlas" data-reference-type="ref" data-reference="robotics-platforms-atlas">[robotics-platforms-atlas]</a>).
@@ -335,31 +320,21 @@ Robot manipulators typically consist of a series of links and joints, articulate
 Recently, the development of low-cost manipulators like the ALOHA @zhaoLearningFineGrainedBimanual2023 ALOHA-2 @aldacoALOHA2Enhanced and SO-100/SO-101 @knightStandardOpenSO100 platforms significantly lowered the barrier to entry to robotics, considering the increased accessibility of these robots compared to more traditional platforms like the Franka Emika Panda arm (Figure <a href="#robotic-platforms-costs" data-reference-type="ref" data-reference="robotic-platforms-costs">[robotic-platforms-costs]</a>).
-<figure>
-<ResponsiveImage
   src={ch2_cost_accessibility}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="robotic-platforms-costs" style="position: absolute;"></span>
-<figcaption>Cheaper, more accessible robots are starting to rival traditional platforms like the Panda arm platforms in adoption in resource-constrained scenarios. The SO-100, in particular, has a cost in the 100s of Euros, and can be entirely 3D-printed in hours, while the industrially-manufactured Panda arm costs tens of thousands of Euros and is not openly available.</figcaption>
-</figure>
 Deriving an intuition as to why learning-based approaches are gaining popularity in the robotics community requires briefly analyzing traditional approaches for manipulation, leveraging tools like forward and inverse kinematics (FK, IK) and control theory. Providing a detailed overview of these methods falls (well) out of the scope of this tutorial, and we refer the reader to works including @sicilianoSpringerHandbookRobotics2016, @lynchModernRoboticsMechanics2017, @tedrakeRoboticManipulationPerception, @tedrakeUnderactuatedRoboticsAlgorithms for a much more comprehensive description of these techniques. Here, we mostly wish to highlight the benefits of ML over these traditional techniques.
-<figure>
-<ResponsiveImage
   src={ch2_so100_to_planar_manipulator}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="make-so100-planar-manipulator" style="position: absolute;"></span>
-<figcaption>The SO-100 arm is a 6-dof manipulator arm. Preventing some of its joints (shoulder pane, wrist flex and wrist roll) from actuating, it can be represented as a traditional 2-dof planar manipulator (the gripper joint in the end-effector is not considered towards the count of the degrees of freedom used to produce motion).</figcaption>
-</figure>
 Consider the (simple) case where a SO-100 is restrained from actuating (1) the shoulder pan and (2) the wrist flex and roll motors. This effectively reduces the degrees of freedom of the SO-100 from the original 5+1 (5 joints + 1 gripper) to 2+1 (shoulder lift, elbow flex + gripper). As the end-effector does not impact motion in this model, the SO-100 is effectively reduced to the planar manipulator robot presented in Figure <a href="#make-so100-planar-manipulator" data-reference-type="ref" data-reference="make-so100-planar-manipulator">[make-so100-planar-manipulator]</a>, where spheres represent actuators, and solid lines indicate length-$l$ links from the base of the SO-100 to the end-effector (*ee*).
@@ -437,11 +412,10 @@ While very effective when a goal trajectory has been well specified, the perform
 <div class="wrapfigure">
-r0.3 <ResponsiveImage
   src={ch2_planar_manipulator_floor_box}
   zoomable
   downloadable
-  layout="fixed"
   alt="image"
 />
@@ -462,17 +436,12 @@ We point the interested reader to , , and  for extended coverage of FK, IK, di
 Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular applicative problem.
-<figure>
-<ResponsiveImage
   src={ch2_classical_limitations}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="classical-limitations" style="position: absolute;"></span>
-<figcaption>Dynamics-based approaches to robotics suffer from several limitations: (1) orchestrating multiple components poses integration challenges; (2) the need to develop custom processing pipelines for the sensing modalities and tasks considered hinders scalability; (3) simplified analytical models of physical phenomena (here friction at the gripper; credits to @antonovaReinforcementLearningPivoting2017) limit real-world performance. Lastly, (4) dynamics-based methods overlook trends in the availability and growth of robotics data.</figcaption>
-</figure>
 Dynamics-based robotics pipelines have historically been <mark>developed sequentially, engineering the different blocks</mark> now within most architectures for specific purposes. That is, sensing, state estimation, mapping, planning, (diff-)IK, and low-level control have been traditionally developed as distinct modules with fixed interfaces. Pipelining these specific modules proved error-prone, and brittleness emerges--alongside compounding errors--whenever changes occur (e.g., changes in lighting for sensing, occlusion/failure of sensors, control failures). Adapting such a stack to new tasks or robotic platforms often entails re-specifying objectives, constraints, and heuristics at multiple stages, incurring significant engineering overhead.
@@ -495,17 +464,12 @@ Taken together, these limitations (Figure <a href="#classical-limitations" data
 TL;DR The need for expensive, high-fidelity simulators can be obviated by learning from real-world data, using sample-efficient algorithms that can safely train directly on hardware.
 </div>
-<figure>
-<ResponsiveImage
   src={ch3_learning_benefits}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="robot-learning-upsides" style="position: absolute;"></span>
-<figcaption>Learning-based robotics streamlines perception-to-action by learning a (1) unified high-level controller capable to take (2) high-dimensional, unstructured sensorimotor information. Learning (3) does not require a dynamics model and instead focuses on interaction data, and (4) empirically correlates with the scale of the data used. </figcaption>
-</figure>
 Learning-based techniques for robotics naturally address the limitations presented in Section <a href="#classical" data-reference-type="ref" data-reference="classical">[classical]</a> (Figure <a href="#robot-learning-upsides" data-reference-type="ref" data-reference="robot-learning-upsides">[robot-learning-upsides]</a>). In particular, learning-based techniques typically rely on monolithic prediction-to-action pipelines (*visuomotor policies*) which directly map sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components. Mapping sensory inputs to actions also makes it possible to incorporate diverse input modalities, leveraging the automatic feature extraction capabilities of modern learning systems. Moreover, learning-based approaches can, in principle, bypass explicit modeling altogether and instead rely solely on interaction data--an advantage that proves transformative when dynamics are difficult to model or entirely unknown. Lastly, learning for robotics (*robot learning*) is naturally well positioned to leverage the growing amount of robotics data openly available, just as computer vision and natural language processing have historically benefited from large-scale corpora of data, in great part overlooked by dynamics-based approaches.
@@ -513,11 +477,10 @@ Being a field at its relative nascent stages, no prevalent technique(s) proves d
 <div class="wrapfigure">
-r0.3 <ResponsiveImage
   src={ch3_learning_atlas}
   zoomable
   downloadable
-  layout="fixed"
   alt="image"
 />
@@ -526,17 +489,12 @@ r0.3 <ResponsiveImage
 In Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> we deliberately include generalist robot models @blackp0VisionLanguageActionFlow2024, @shukorSmolVLAVisionLanguageActionModel2025 alongside task-specific BC methods. While significantly different in spirit--*generalist* models are language-conditioned and use instructions to generate motion valid across many tasks, while *task-specific* models are typically not language-conditioned and used to perform a single task--*foundation* models are still largely trained to reproduce trajectories contained in a (large) training set of input demonstrations. Thus, we argue generalist policies can indeed be grouped alongside other task-specific BC methods, as they both leverage similar training data and schemas. Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> illustrates this categorization graphically, explicitly listing all the robot learning policies currently available in `lerobot`- Action Chunking with Transformers (ACT) @zhaoLearningFineGrainedBimanual2023, Diffusion Policy @chiDiffusionPolicyVisuomotor2024, Vector-Quantized Behavior Transformer (VQ-BeT) @leeBehaviorGenerationLatent2024, $\pi_0$ @blackp0VisionLanguageActionFlow2024, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025, Human-in-the-loop Sample-efficient RL (HIL-SERL) @luoPreciseDexterousRobotic2024 and TD-MPC @hansenTemporalDifferenceLearning2022.
-<figure>
-<ResponsiveImage
   src={ch3_rl_examples}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="robotics-with-rl-examples" style="position: absolute;"></span>
-<figcaption>Examples of two different robotics tasks performed using RL. In the manipulation task (A) an agent learns to reach for a yellow plastic block in its environment, and to put it inside of a box. In the locomotion task (B) an agent learns to move its center of mass sideways without falling.</figcaption>
-</figure>
 Applications of RL to robotics have been studied long enough that the relationship between these two disciplines has been compared to that of physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their inherently interactive and sequential nature, robotics control problems can be directly cast as RL problems. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> presents two such cases. Reaching for an object to then move it somewhere else in the scene is a sequential problem where over time the controller needs to adjust the position of the robot arm based on the current configuration and the (possibly varying) position of the object. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation- while sliding to the side, the controller needs to keep adjusting to the robot’s configuration to avoid failure (falling).
@@ -544,17 +502,12 @@ Applications of RL to robotics have been studied long enough that the relationsh
 The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to tackle robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) capable to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents improve through trial and error, bypassing explicit models of the problem dynamics in favor of interaction data. In RL, this feedback loop between actions and outcomes (Figure <a href="#rl-most-famous-pic" data-reference-type="ref" data-reference="rl-most-famous-pic">[rl-most-famous-pic]</a>) is established through the agent sensing a scalar quantity (*reward*) measuring how desirable a given *transition* is for the accomplishment of its goal.
-<figure>
-<ResponsiveImage
   src={ch3_agent_env}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="rl-most-famous-pic" style="position: absolute;"></span>
-<figcaption>Agent-Environment interaction diagram (image credits to @suttonReinforcementLearningIntroduction2018).</figcaption>
-</figure>
 Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP) @bellmanMarkovianDecisionProcess1957. Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through MDP’s inherently stochastic formulation and (2) providing a theoretically-sound framework for learning *without* an explicit model of the environment dynamics. While accommodating a continuous time formulation too, MDPs are typically considered in discrete time in RL, assuming interactions to atomically take place at discrete *timesteps* $t=0,1,2,3, \dots, T$. MDPs allowing for an unbounded number of interactions ($T \to + \infty$) are termed *infinite-horizon*, as opposed to *finite-horizon* MDPs in which $T$ is finite. Unless otherwise specified, we will only be referring to discrete-time finite-horizon (*episodic*) MDPs.
@@ -628,17 +581,12 @@ V_\pi(s_t) &= \mathbb E_{a_t\sim \pi(\bullet \vert s_t)} [Q_\pi (s_t, a_t)],
 ```
 inducing an ordering over states and state-action pairs under $\pi$, and value functions are thus central to most RL algorithms. A variety of algorithms have been developed in RL attempting to find (approximate) solutions to the problem of maximizing cumulative reward (we report some in Figure <a href="#rl-algos-atlas" data-reference-type="ref" data-reference="rl-algos-atlas">[rl-algos-atlas]</a>).
-<figure>
-<ResponsiveImage
   src={ch3_rl_algorithms_atlas}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="rl-algos-atlas" style="position: absolute;"></span>
-<figcaption>Popular RL algorithms. See @SpinningUp2018 for a complete list of citations.</figcaption>
-</figure>
 Popular approaches to continuous state and action space--such as those studied within robotics--include ,  and . Across manipulation @akkayaSolvingRubiksCube2019 and locomotion problems @leeLearningQuadrupedalLocomotion2020, RL proved extremely effective in providing a platform to (1) leverage a unified, streamlined perception-to-action pipeline, (2) natively integrate proprioception with multi-modal high-dimensional sensory streams, (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2025.
@@ -648,31 +596,21 @@ Streamlined end-to-end control pipelines, data-driven feature extraction and a d
 First, especially early in training, <mark>actions are typically explorative, and thus may be erratic</mark>. On physical systems, untrained policies may command high velocities, self-colliding configurations, or torques exceeding joint limits, leading to wear and potential hardware damage. Mitigating these risks requires external safeguards (e.g., watchdogs, safety monitors, emergency stops), often incurring a high degree of human supervision. Further, in the typical episodic setting considered in most robotics problems, experimentation is substantially slowed down by the need to manually reset the environment over the course of training, a time-consuming and error-prone process. Second, learning efficiently remains problematic in RL, <mark>limiting the applicability of RL in real-world robotics due to consequently prohibitive timescales of training</mark>. Even strong algorithms such as SAC @haarnojaSoftActorCriticOffPolicy2018 typically require a large number of transitions $\{ (s_t, a_t, r_t, s_{t+1})\}_{t=1}^N$. On real-world hardware, generating this data is time-consuming.
-<figure>
-<ResponsiveImage
   src={ch3_duck_sim_vs_real}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="synthetic-vs-real-duck" style="position: absolute;"></span>
-<figcaption>Simulated (left) vs. real-world (right) OpenDuck. Discrepancies in the simulation dynamics (<em>reality gap</em>) pose risks to policy transfer.</figcaption>
-</figure>
 Training RL policies in simulation @tobinDomainRandomizationTransferring2017 addresses both issues, eliminating physical risk and dramatically increasing throughput. Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) limiting the possibilities to transfer the policies learned in simulation, due to the discrepancy between real and simulated environments (*reality gap*, Figure <a href="#synthetic-vs-real-duck" data-reference-type="ref" data-reference="synthetic-vs-real-duck">[synthetic-vs-real-duck]</a>). *Domain randomization* @tobinDomainRandomizationTransferring2017 (DR) is a popular technique to overcome the reality gap, and consists in randomizing the parameters of the simulated environment during training, aiming at inducing robustness to specific disturbances. In this, DR is typically employed to increase the diversity of scenarios over the course of training, improving the performance of sim-to-real transferred policies @akkayaSolvingRubiksCube2019, @antonovaReinforcementLearningPivoting2017, @jiDribbleBotDynamicLegged2023. In practice, DR is performed by training in simulation on simulated dynamics $\mathcal D$, further parametrized as $\mathcal D \equiv \mathcal D_\xi$, with a *dynamics* (random) vector $\xi$ drawn from an arbitrary distribution, $\xi \sim \Xi$. For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure <a href="#ducks-on-terrains" data-reference-type="ref" data-reference="ducks-on-terrains">[ducks-on-terrains]</a>), or the center of mass of an object for a manipulation task. Over the course of training--typically at each episode’s reset--a new $\xi$ is drawn, and used to specify the environment’s dynamics for that episode.
-<figure>
-<ResponsiveImage
   src={ch3_many_ducks}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ducks-on-terrains" style="position: absolute;"></span>
-<figcaption>The same locomotion task can be carried out in different (simulated) domains (exemplified by the difference in terrains) at training time, resulting to increased robustness over diverse environment dynamics.</figcaption>
-</figure>
 While effective in transferring policies across the reality gap in real-world robotics @tobinDomainRandomizationTransferring2017, @akkayaSolvingRubiksCube2019, @jiDribbleBotDynamicLegged2023, @tiboniDomainRandomizationEntropy2024, DR often requires extensive manual engineering. First, identifying which parameters to randomize--i.e., the *support* $\text{supp} (\Xi)$ of $\Xi$--is an inherently task-specific process. When locomoting over different terrains, choosing to randomize the friction coefficient is a reasonable choice, yet not completely sufficient as other factors (lighting conditions, external temperature, joints’ fatigue, etc.) may prove just as important in practice, making selecting these parameters yet another source of brittleness.
@@ -768,17 +706,12 @@ Reward classifiers are particularly useful in treating complex, dynamic tasks--e
 Lastly, in order to improve on the robustness of their approach to different goals while maintaining practical scalability, @luoSERLSoftwareSuite2025 introduced a modified state and action space, expressing proprioceptive configurations $q$ and actions $\dot q$ in the frame of the end-effector pose at $t=0$. Randomizing the initial pose of the end-effector ($s_0$), @luoSERLSoftwareSuite2025 achieved a similar result to that of manually randomizing the environment at every timestep, but with the benefit of maintaining the environment in the same condition across multiple training episodes, achieving higher scalability of their method thanks to the increased practicality of their approach.
-<figure>
-<ResponsiveImage
   src={ch3_hil_serl_examples}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="hil-serl-blocks" style="position: absolute;"></span>
-<figcaption>(A) HIL-SERL allows for real-world training of high performance RL agents by building on top advancements presented by of SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on a real-world SO-100.</figcaption>
-</figure>
 Building on off-policy deep Q-learning with replay buffers, entropy regularization for better exploration, expert demonstrations to guide learning, and a series of tools and recommendations for real-world training using reward classifiers (Figure <a href="#hil-serl-blocks" data-reference-type="ref" data-reference="hil-serl-blocks">[hil-serl-blocks]</a>), @luoPreciseDexterousRobotic2024 introduce human interactions during training, learning near-optimal policies in challenging real-world manipulation tasks in 1-2 hours.
@@ -786,17 +719,12 @@ Human-in-the-Loop, Sample Efficient Robot reinforcement Learning (HIL-SERL) @lu
 #### Code Example- Real-world RL
-<figure>
-<ResponsiveImage
   src={ch3_hil_serl_architecture}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch3-hil-serl-architecture" style="position: absolute;"></span>
-<figcaption>HIL-SERL is a SOTA RL algorithm for training control policies directly in the real-world. Its implementation in <code>lerobot</code> relies on a decoupled actor-learner architecture, communicating over processes (and possibly networks) with queues used to share (1) transitions <span class="math inline">(<em>s</em> <sub> <em>t</em> </sub>, <em>a</em> <sub> <em>t</em> </sub>, <em>r</em> <sub> <em>t</em> </sub>, <em>s</em> <sub> <em>t</em> + 1</sub>)</span> and (2) parameters <span class="math inline"> <em>θ</em> </span>.</figcaption>
-</figure>
 This example shows how to use the HIL-SERL implementation supported by `lerobot`. This code example is organized into four parts: we first show how to train a reward classifier from a custom set of demonstrations, then define the `Actor` and `Learner` components, and finally, we bring them together in a complete script showing how to use HIL-SERL in practice.
@@ -1066,33 +994,23 @@ Advances in learning to act from potentially large corpora of human demonstratio
 TL;DR Behavioral Cloning provides a natural platform to learn from real-world interactions without the need to design any reward function, and generative models prove more effective than point-wise policies at dealing with multimodal demonstration datasets.
 </div>
-<figure>
-<ResponsiveImage
   src={ch4_bc_trajectories}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-bc-trajectories" style="position: absolute;"></span>
-<figcaption>(A) Average (with standard deviation) evolution of the actuation levels over the first 5 recorded episodes in <a href="lerobot/svla_so101_pickplace" class="uri">lerobot/svla_so101_pickplace</a>. Proprioperceptive states prove invaluable to determine the robot’s state during an episode. (B) Camera frames are also recorded alongside measurements on the robot’s state, capturing information about the robot’s interaction with its environment.</figcaption>
-</figure>
 Learning from human demonstrations provides a pragmatic alternative to the RL pipeline discussed in Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>. Indeed, especially in real-world robotics, online exploration is typically <mark>costly and potentially unsafe</mark>, and designing (dense) reward signals is a <mark>brittle and task-specific</mark> process. Further, even success detection itself often requires bespoke instrumentation, while episodic training demands reliable resets--all factors complicating training RL algorithms on hardware at scale. Behavioral Cloning (BC) sidesteps these constraints by <mark>casting control as an imitation learning problem</mark>, leveraging previously collected expert demonstrations to anchor the learned autonomous behavior. Most notably, by *learning-to-imitate*, autonomous systems naturally adhere to the objectives, preferences, and success criteria implicitly encoded in the data, which reduces early-stage exploratory failures and obviates hand-crafted reward shaping altogether.
 Formally, let $\mathcal D = \{ \tau^{(i)} \}_{i=1}^N$ be a set of expert trajectories, with $\tau^{(i)} = \{(o_t^{(i)}, a_t^{(i)})\}_{t=0}^{T_i}$ representing the $i$-th length-$T_i$ trajectory in $\mathcal D$, $o_t \in \mathcal O$ denoting observations (e.g., images and proprioception altogether), and $a_t \in \mathcal A$ the expert actions. Typically, observations $o \in \mathcal O$ consist of both image and proprioperceptive information, while actions $a \in \mathcal A$ represent control specifications for the robot to execute, e.g. a joint configuration. Note that differently from Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>, in the imitation learning context $\mathcal D$ denotes an offline dataset collecting $N$ length-$T_i$ reward-free (expert) human trajectories $\tau^{(i)}$, and *not* the environment dynamics. Similarly, in this section $\tau^{(i)}$ represents a length-$T_i$ trajectory of observation-action pairs, which crucially *omits entirely any reward* information. Figure <a href="#ch4-bc-trajectories" data-reference-type="ref" data-reference="ch4-bc-trajectories">[ch4-bc-trajectories]</a> graphically shows trajectories in terms of the average evolution of the actuation on the 6 joints of a teleoperated SO-100 manipulator. Notice how proprioperceptive states are captured jointly with camera frames over the course of the recorded episodes, providing a unified high-frame rate collection of both image and joint teleoperation data. Figure <a href="#ch4-observation-action-mapping" data-reference-type="ref" data-reference="ch4-observation-action-mapping">[ch4-observation-action-mapping]</a> shows $(o_t, a_t)$-pairs for the same dataset, with the actions performed by the human expert illustrated alongside the corresponding observation. 
In principle, (expert) trajectories $\tau^{(i)}$ can have different lengths since demonstrations might exhibit multi-modal strategies to attain the same goal, resulting in multiple, different behaviors.
-<figure>
-<ResponsiveImage
   src={ch4_observation_action_mapping}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-observation-action-mapping" style="position: absolute;"></span>
-<figcaption>Sample observations and action pairs over the course of a given trajectory recorded in <a href="lerobot/svla_so101_pickplace" class="uri">lerobot/svla_so101_pickplace</a>. Observations, comprising of both proprioperceptive and visual information, are recorded alongside the configuration of a second, leader robot controlled by a human expert, providing complete information for regressing actions given observations.</figcaption>
-</figure>
 Behavioral Cloning (BC) @pomerleauALVINNAutonomousLand1988 aims at producing synthetic behaviors by learning the mapping from observations to actions, and in its most natural formulation can be effectively tackled as a *supervised* learning problem, consisting of learning the (deterministic) mapping $f: \mathcal O\mapsto \mathcal A, \ a_t = f(o_t)$ by solving
 ``` math
@@ -1104,17 +1022,12 @@ Typically, the expert’s joint observation-action distribution $p: \mathcal O\t
 Despite the inherent challenges of learning from non-i.i.d. data, the BC formulation presents several operational advantages in robotics. First, training happens offline and naturally accommodates expert demonstration data, thereby severely limiting exploration risks by preventing the robot from performing dangerous actions altogether, by anchoring action in imitation. Second, reward design is entirely unnecessary in BC, as demonstrations already reflect human intent. The absence of rewards also prevents the risk of misalignment and specification gaming (*reward hacking*), otherwise inherent in purely reward-based RL @heessEmergenceLocomotionBehaviours2017. Third, because expert trajectories encode terminal conditions, success detection and resets are implicit in the dataset. Finally, empirical evidence suggests the performance of BC scales naturally with growing corpora of demonstrations collected across tasks, embodiments, and environments. Nonetheless, BC can, in principle, only reproduce behaviors that are at best as good as those of the demonstrator, and therefore offers no remedy for the suboptimal decisions that humans may enact. This limitation is particularly problematic in sequential decision-making tasks where expert demonstrations are scarce---either because data collection is costly or because human performance is inherently suboptimal. Yet, many robotics applications still benefit from relatively inexpensive pipelines for collecting high-quality human-generated trajectories, justifying the use of BC in such settings.
-<figure>
-<ResponsiveImage
   src={ch4_issues_with_bc}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-issues-with-bc" style="position: absolute;"></span>
-<figcaption>Point-wise policies suffer from limitations due to (A) covariate shifts and (B) poor approximation of multimodal demonstrations. (A) Small errors may drive the policy out of distribution, incurring a vicious circle ultimately resulting in failure. (B) Both modes of reaching for a target object in the scene--either left or right-first--are equally as good and thus equally as likely to be present in a dataset of human demonstrations, ultimately resulting in multimodal demonstrations.</figcaption>
-</figure>
 While conceptually elegant, *point-estimate policies* $f : \mathcal O\mapsto \mathcal A$ learned by solving eq. <a href="#loss-minimization-SL" data-reference-type="ref" data-reference="loss-minimization-SL">[loss-minimization-SL]</a> have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a> illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $\mathcal D$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $\epsilon$-prediction errors $0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon$ can quickly drive the policy into out-of-distribution states, incurring less confident generations and thus compounding errors (Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving real-world robotics problems, as multiple trajectories can be equally as good towards the accomplishment of a goal (e.g., symmetric grasps, Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the *generative model* $p(o, a)$ underlying the samples in $\mathcal D$, rather than explicitly learning a prediction function $f: a = f(o)$.
@@ -1124,17 +1037,12 @@ Generative Models (GMs) aim to learn the stochastic process underlying the very
 #### Variational Auto-Encoders
-<figure>
-<ResponsiveImage
   src={ch4_task_effect_on_pairs}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-task-effect-on-pairs" style="position: absolute;"></span>
-<figcaption>Intuitively, latent variable in a single latent model may contain information regarding the task being performed, which directly results in the likelihood of the same observation-action pair being different for two different tasks. When (A) picking a block the likelihood of a wide gripper’s opening should be higher than narrower one, while it should be the opposite when (B) pushing the block.</figcaption>
-</figure>
 A common inductive bias used in GM posits samples $(o,a)$ are influenced from an unobservable latent variable $z \in Z$, resulting in:
 ``` math
@@ -1142,17 +1050,12 @@ A common inductive bias used in GM posits samples $(o,a)$ are influenced from an
 ```
 Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics application, $z$ could be interpreted as some high level representation of the underlying task being performed by the human demonstrator. In such case, treating $p(o,a)$ as a marginalization over $\operatorname{supp}({Z})$ of the complete joint distribution $p(o,a,z)$ natively captures the effect different tasks have on the likelihood of observation-action pairs. Figure <a href="#ch4-task-effect-on-pairs" data-reference-type="ref" data-reference="ch4-task-effect-on-pairs">[ch4-task-effect-on-pairs]</a> graphically illustrates this concept in the case of a (A) picking and (B) pushing task, for which, nearing the target object, the likelihood of actions resulting in opening the gripper--the higher $q_6$, the wider the gripper’s opening--should intuitively be (A) high or (B) low, depending on the task performed. While the latent space $Z$ typically has a much richer structure than the set of all actual tasks performed, eq. <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> still provides a solid framework to learn joint distribution conditioned on unobservable yet relevant factors. Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a> represents this latent-variable framework in the context of a robotics application: the true, $z$-conditioned generative process assigns *likelihood* $p((o,a) \vert z)$ to the single $(o,a)$-pair. Using Bayes’ theorem, one can reconstruct the *posterior* distribution on $\operatorname{supp}({Z})$, $q_\theta(z \vert o,a)$ from the likelihood $p_\theta(o,a \vert z)$, *prior* $p_\theta(z)$ and *evidence* $p_\theta(o,a)$. VAEs approximate the latent variable model presented in eq. 
<a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> using an *approximate posterior* $q_\phi(z \vert o,a)$ while regressing parameters for a parametric likelihood, $p_\theta(o,a \vert z)$ (Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a>).
-<figure>
-<ResponsiveImage
   src={ch4_latent_variable_model}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-latent-variable-model" style="position: absolute;"></span>
-<figcaption>(A) The latent variable model in a robotics application regulates influence between observed (<span class="math inline"> <em>o</em>, <em>a</em>)</span> variables and an unobservable latent variable. (B) VAEs approximate exact latent variable models by means of variational inference.</figcaption>
-</figure>
 Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can be written as:
 <span id="evidence-definition-1" style="position: absolute;">
@@ -1241,17 +1144,12 @@ VAEs approximate probability distributions via a *single* latent variable model,
 ```
 where we explicitly showed the marginalization over the multiple latents in eq. <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>, and used the law of conditional probability and Markov property in eq. <a href="#BC-multi-latent-model-2" data-reference-type="ref" data-reference="BC-multi-latent-model-2">[BC-multi-latent-model-2]</a>. Also, for ease of notation, we will refer to observation-action pairs $o,a$ as $z_0$.
-<figure>
-<ResponsiveImage
   src={ch4_many_latents}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-many-latents" style="position: absolute;"></span>
-<figcaption>HMLV models posit the data generation process is influenced by a stack of Markov-dependent latent variables, with samples from the posterior distribution being progressively higher up in the hierarchy.</figcaption>
-</figure>
 Similar to VAEs, it is generally not possible to assign an *exact* interpretation to the latent variables. Nevertheless, a reasonable application-driven intuition is that Hierarchical Markov Latent Variable (HMLV) models, by capturing hierarchical and decoupled interactions among latent variables, can reflect the different resolutions at which conditioning factors intervene. For example, in a robotics setting, one might naturally distinguish between high-level trajectory planning (higher up in the hierarchy, $t \to T$) and fine-grained motion adjustments (closer to empirical observations, $t \to 0$). In that, HMLV models thus provide a framework to perform variational inference via multiple, sequential sampling steps from different higher level distributions instead of approximating the generative process with a single-latent variable model. DMs are a particular instantiation of HMLV models for which the posterior is fixed to $q( z_t \vert z_{t-1}) = \mathcal N(z_{t-1} \sqrt{1-\beta_t}, \beta_t \mathbf{I})$, for a given $\beta_t \in \mathbb R^+$. In practice, $\beta_t$ is used to iteratively reduce the signal-to-noise ratio along the latents’ hierarchy, similarly to how a diffusion process influences the information of a physical system.
@@ -1301,17 +1199,12 @@ In their seminal work on using DMs for variational inference, @hoDenoisingDiffu
 ```
 where the former term is equivalent to the reconstruction term in eq. <a href="#VAE-min-neg-ELBO" data-reference-type="ref" data-reference="VAE-min-neg-ELBO">[VAE-min-neg-ELBO]</a> and the latter term can be obtained in closed form.
-<figure>
-<ResponsiveImage
   src={ch4_diffusion_robot_actions}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="diffusion-robot-actions" style="position: absolute;"></span>
-<figcaption>DMs iteratively corrupt samples (left) from an unknown distribution into a quasi-standard Gaussian (center), learning the displacement field (right) that permits to reconstruct samples from the unknown target distribution by iteratively denoising samples of a tractable, easy-to-sample distribution.</figcaption>
-</figure>
 Besides mathematical tractability of eq. <a href="#diffusion-likelihood-gradient" data-reference-type="ref" data-reference="diffusion-likelihood-gradient">[diffusion-likelihood-gradient]</a>, adopting Gaussian posteriors allows for a particularly intuitive interpretation of the training dynamics of DMs @permenterInterpretingImprovingDiffusion2024. As the hierarchical latent variables are repeatedly corrupted by applying increasingly more Gaussian noise, they progressively lose information about the original (unknown) sample $z_0$, converging toward a standard Gaussian which eventually contains no information at all (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>). Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a> illustrates this process on a simplified, bidimensional observation-action distribution, where we considered $o=q_2$ and $a=q^h_2$, with $q_2$ denoting the robot’s *elbow flex* actuation and $q^h_2$ the corresponding human teleoperator’s elbow flex. Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Notice how corrupted samples distribute differently from the most reasonable structure $a \simeq o$, further underscoring how diffusion corrupts both the individual samples and the global distribution (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, left and center). 
In this, using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples. Comparing the diffused samples to the original data points, one can derive an estimate of the total displacement induced by the diffusion process, and, under the assumption that the likelihood of the totally diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by *learning to reverse* such displacement. This key intuition allows to write a simplified training objective[^4]:
 <span id="diffusion-simplified-loss" style="position: absolute;">
@@ -1327,17 +1220,12 @@ Besides mathematical tractability of eq. <a href="#diffusion-likelihood-gradien
 \end{align}
 ```
-<figure>
-<ResponsiveImage
   src={ch4_action_vs_observation_distribution}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-action-vs-observation-distribution" style="position: absolute;"></span>
-<figcaption>A joint action-observation distribution, in the simplified case where the observation is the elbow-flex actuation in a SO-100, and the action is the recorded position for the same joint from the teleoperator arm. The motion recorded being teleoperated, the points distribute along the diagonal.</figcaption>
-</figure>
 In this simplified (minimization) objective, the optimization process differs from eq. <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
@@ -1371,31 +1259,21 @@ FM proved very effective in a variety of applications, ranging from image @esse
 ```
 Conditional vector fields are defined not only over their argument $z$ and time $t$, but do also vary with respect to an auxiliary variable $z_0$, thereby extending the standard notion of a vector field to incorporate additional conditioning. Note that the traditional discrete-time noise-scheduler $\{\beta_t\}_{t=0}^T$ is now generalized to a continuous map $\beta : [0,1] \mapsto \mathbb R^+$. Crucially, @lipmanFlowMatchingGenerative2023 prove that by exclusively optimizing the vector field for individual data points $z_0 \in \mathcal D$, one also retrieves the optimal flow to morph the entire support of the initial distribution $p_0$ into $p_1 \ \text{s.t.} \mathcal D \sim p_1$.
-<figure>
-<ResponsiveImage
   src={ch4_normalizing_flows}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-normalizing-flows" style="position: absolute;"></span>
-<figcaption>Probability distributions can be modified differently by applying different vector fields, inducing different flows of mass across the same support (top versus bottom, using two different time-invariant 2D-fields <span class="math inline"> <em>u</em> <sub>1</sub>(<em>x</em>, <em>y</em>) = (<em>x</em>, 0)</span> and <span class="math inline">$u_2(x,y) = (x/\sqrt{2}, y/\sqrt{2})$</span>). Notice time flows <em>continuously</em> in <span class="math inline">[0, 1]</span>. FM models learn to approximate a target vector field, thereby producing arbitrary (goal) transformations of an easy-to-sample initial distribution.</figcaption>
-</figure>
 While the noising schedule of DMs results in a stochastic process resembling a random (Brownian) walk, FM allows for more general--potentially, deterministic--likelihood and posterior parametrization. In the FM literature the likelihood and posterior probability densities defined along a HMLV model are typically referred to as a *probability path*, where the distributions for successive adjacent transitions in the HMLV model are related by the (normalized) flow between them (Figure <a href="#ch4-normalizing-flows" data-reference-type="ref" data-reference="ch4-normalizing-flows">[ch4-normalizing-flows]</a>). The inherent flexibility of FM is one of their key advantages over DMs, as it opens up the possibility of *learning* more efficient paths. For instance, one can design probability paths inspired by Optimal Transport (OT), a mathematical framework concerned with characterizing the most efficient morphings between probability distributions. Probability paths obtained through OT paths tend to be *straighter* than diffusion paths (Figure <a href="#ch4-diffusion-paths-versus-fm" data-reference-type="ref" data-reference="ch4-diffusion-paths-versus-fm">[ch4-diffusion-paths-versus-fm]</a>), which can lead to faster and more stable training, as well as empirically result in higher-quality generations with fewer denoising steps at inference time. In particular, by avoiding unnecessary backtracking associated with the inherent stochastic nature of both the noising and denoising process in DMs, test-time compute is typically significantly reduced in FM, while retaining comparable results @lipmanFlowMatchingGenerative2023.
-<figure>
-<ResponsiveImage
   src={ch4_diffusion_vs_flowmatching}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-diffusion-paths-versus-fm" style="position: absolute;"></span>
-<figcaption>Compared to diffusion, flow matching distorts distribution along a less randomic pattern, resulting in a clearer interpolation between source and target distribution. The visualization shows an example comparison between these two methods on joint distribution of robot observations and actions over <span class="math inline"> <em>T</em> = 50</span> steps.</figcaption>
-</figure>
 In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in eq. <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce an arbitrary mass displacement, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, Conditional FM (CFM) defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, which in turn results in the target vector field $u(t, z_t) = z_1 - z_0$. FM models can then be trained with a simple regression objective defined as:
 <span id="flow-matching-objective" style="position: absolute;">
@@ -1435,45 +1313,30 @@ In their work, @zhaoLearningFineGrainedBimanual2023 ablated using a GM to learn
 In ACT (Figure <a href="#ch4-act" data-reference-type="ref" data-reference="ch4-act">[ch4-act]</a>), inference for a given observation $o \in \mathcal O$ could be performed by (1) defining a prior $p_\omega(z \vert o)$ for the latent variable $z$ and (2) decoding an action chunk from a sampled latent $z \sim p_\omega(\bullet \vert o)$, similarly to how sampling from standard VAEs takes place, with the exception that vanilla VAEs typically pose $p(z\vert o) \equiv p(z) \sim \mathcal N(\mathbf{0}, \mathbf{I})$ and thus skip (1).
-<figure>
-<ResponsiveImage
   src={ch4_act_encoder}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-act-encoder" style="position: absolute;"></span>
-<figcaption>The CVAE encoder used in ACT. Input action chunks are first embedded and aggregated with positional embeddings, before being processed alongside embedded proprioperceptive information, and a learned <code>[CLS]</code> token used to aggregate input level information, and predict the style variable <span class="math inline"> <em>z</em> </span>. The encoder is exclusively used to <em>train</em> the decoder, and it is entirely disregarded at inference time.</figcaption>
-</figure>
 However, the authors claim that using a deterministic procedure to sample $z$ benefits policy evaluation, and thus avoid using the conditional prior at all at inference time, effectively using the CVAE framework exclusively to train a more expressive decoder. At test time, @zhaoLearningFineGrainedBimanual2023 propose simply using $z = \mathbf{0}$, as the conditional prior on $z$ used in training is set to be a standard Gaussian. Further, conditioning on the observation $o$ is achieved through explicitly feeding proprioperceptive and visual observations to the decoder, $p_\theta(a \vert z, o)$ at test time. If at inference $z$ is sampled from a standard Gaussian, during training $z$ is sampled from an approximate posterior distribution $q_\phi(z \vert o, a)$, which, however, disregards image observations and exclusively uses proprioperceptive states to form $o$ for efficiency reasons.
-<figure>
-<ResponsiveImage
   src={ch4_act_decoder}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-act-decoder" style="position: absolute;"></span>
-<figcaption>The CVAE decoder used in ACT, comprising of a full encoder-decoder Transformer architecture. Camera observations from all <span class="math inline"> <em>n</em> </span> camera views are first embedded using pre-trained visual encoders, and then aggregated with the corresponding positional embeddings. Then, the proprioperceptive information and style variable <span class="math inline"> <em>z</em> </span> retrieved from the CVAE encoder, are fed to the encoder-decoder Transformer for inference. The encoder shares the matrices <span class="math inline"> <em>K</em>, <em>V</em> </span> with the decoder, and is trained to decode fixed position embeddings into action chunks.</figcaption>
-</figure>
 #### Code Example: Training and Using ACT in Practice
-<figure>
-<ResponsiveImage
   src={ch4_act}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-act" style="position: absolute;"></span>
-<figcaption>Action Chunking with Transformer (ACT), as in @zhaoLearningFineGrainedBimanual2023. ACT introduces an action chunking paradigm to cope with high-dimensional multi-modal demonstration data, and a transformer-based CVAE architecture.</figcaption>
-</figure>
 <div class="pbox">
 Training ACT
@@ -1612,17 +1475,12 @@ In practice, conditioning on observation data is achieved conditioning the noise
 ```
 Note how in eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a> the noise regressor is conditioned on both the latent variable rank $t$ *and* on a stack of previous observations $o_{t-H_o:t}$. @chiDiffusionPolicyVisuomotor2024 claim the combination of (1) conditioning on a horizon of previous observations and (2) predicting multiple actions into the future allows DP to *commit to specific modes* in the data at inference time, which proves essential for good performance and avoiding indecisiveness.
-<figure>
-<ResponsiveImage
   src={ch4_diffusion_policy}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="diffusion-policy-architecture" style="position: absolute;"></span>
-<figcaption>The Diffusion Policy architecture, as in @chiDiffusionPolicyVisuomotor2024. A stack of <span class="math inline"> <em>H</em> <sub> <em>o</em> </sub> </span> previous observations is used as external conditioning to denoise a group of <span class="math inline"> <em>H</em> <sub> <em>a</em> </sub> </span> actions. Conditioning is performed at every layer of a U-Net block. Diffusion Policy allows to obtain fully-formed action chunks with as little as <span class="math inline"> <em>T</em> = 10</span> denoising steps.</figcaption>
-</figure>
 Figure <a href="#diffusion-policy-architecture" data-reference-type="ref" data-reference="diffusion-policy-architecture">[diffusion-policy-architecture]</a> shows the convolution-based version of the architecture proposed by @chiDiffusionPolicyVisuomotor2024, illustrating inference on a single sample drawn from $\mathcal D$, for simplicity. The starting, arbitrarily noisy chunk of $H_a$ actions $\tilde a_{t:t+H_a}$ is first mapped to a (learned) high-dimensional space. Similarly, both image observations and poses are also embedded before being aggregated to the action embeddings. Then, a U-Net @ronnebergerUNetConvolutionalNetworks2015 is trained to regress the noise added into $\tilde a_{t:t+H_a}$, conditioned on observation information at every layer, thus seeking to optimize eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a>. At inference time, the noise predictor is used to predict the quantity of noise at every $t \in [T, \dots, 0 ]$ and iteratively subtract it from $\tilde a_{t:t+H_a}$, reversing the diffusion process simulated in training conditioned on $o_{t-H_o:t}$ to predict $a_{t:t+H_a}$.
@@ -1759,19 +1617,12 @@ A robot may indeed execute an entire action chunk $\mathbf{A}_t$ *before* a new
 One can use the fact that policies output multiple actions at the same time to directly address (1) the lack of adaptiveness and (2) the presence of lags at runtime by decoupling action chunk *prediction* $\mathbf{A}$ from action *execution* $a_t \gets \text{PopFront}(\mathbf{A}_t)$. This decoupled stack, which we refer to as *asynchronous* (async) inference (<a href="#alg-async-inference" data-reference-type="ref" data-reference="alg-async-inference">[alg-async-inference]</a>), also enables optimized inference by allowing action-chunk inference to run on a separate machine, typically equipped with better computational resources than the ones onboard a robot. In async inference, a $\text{RobotClient}$ sends an observation $o_t$ to a $\text{PolicyServer}$, receiving an action chunk $\mathbf{A}_t$ once inference is complete (Figure <a href="#ch4-async-inference" data-reference-type="ref" data-reference="ch4-async-inference">[ch4-async-inference]</a>). In this, we avoid execution lags by triggering chunk prediction while the control loop is still consuming a previously available chunk, aggregating the previous and incoming chunks whenever the latter is available to the $\text{RobotClient}$. In turn, async-inference tightens the loop between action prediction and action execution efficiently, by increasing the frequency at which observations are processed for chunk prediction while not running inference at every timestep. Crucially, decoupling action prediction from action execution also allows to allocate more computational resources on a remote policy server sending actions to the robot client over the network.
-<figure>
-<div class="minipage">
-<ResponsiveImage
   src={ch4_async_inference}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-async-inference" style="position: absolute;"></span>
-</div>
-<figcaption><strong>Asynchronous inference</strong>. Illustration of the asynchronous inference stack. Note that the policy can be run on a remote server, possibly with GPUs.</figcaption>
-</figure>
 <div class="algorithm">
 <span id="alg-async-inference" style="position: absolute;"></span>
@@ -1796,19 +1647,12 @@ Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queu
 - **Sync-inference limit $(g=1)$.** As an extreme case, and in keeping with @zhaoLearningFineGrainedBimanual2023, an observation is sent at *every* timestep. The queue is therefore almost always filled, with only a minor saw-tooth due to $\Delta t/\mathbb E[\ell_s] < 1$. While maximally reactive, this setting incurs one forward pass per control tick and can prove prohibitively expensive on limited hardware. Importantly, because the client is consuming actions while the server computes the next chunk, the available queue never gets entirely filled.
-<figure>
-<div class="minipage">
-<ResponsiveImage
   src={ch4_queues}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch4-queues" style="position: absolute;"></span>
-</div>
-<figcaption>Action queue size evolution at runtime for various levels of <span class="math inline"> <em>g</em> </span> when (A) not filtering out observation based on joint-space similarity and (B) filtering out near-duplicates observation, measuring their similarity in joint-space.</figcaption>
-</figure>
 Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> emphasizes the trade-off governed by $g$: small values of $g$ result in idle periods, whereas $g\approx 1$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $g\in(0,1)$ allows to strike a balance between reactivity against resource budgets. If not for the aforementioned similarity filter, the $\text{RobotClient}$ would send observations for processing every $(1 - g) H_a \cdot \Delta t$ seconds, receiving a new chunk of actions every $(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]$, on average. The presence of the filter for observation similarity dilates this processing time, and serves the scope of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. In particular, Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> results in a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue results empty.
@@ -1947,33 +1791,23 @@ TL;DR Openly available, large-scale datasets and the development of stable-to-tr
 The advent of large models trained on internet-scale datasets has drastically influenced fields like Computer Vision (CV) and Natural Language Processing (NLP), shifting the previously task-specific paradigm towards combining (1) an initial, task-agnostic large-scale pre-training stage and a (2) task-specific, adjustment phase. This *pre-train-and-adapt* paradigm has now largely replaced more classic approaches consisting of task-specific data collection, curation and model training in many subdomains within CV and NLP, and it is motivated by the main drawback of limited scalability for *task-specific approaches*, which have been traditionally more labor intensive. Factors including (1) the advancements in generalist models learned with self-supervision for perception @oquabDINOv2LearningRobust2024 or semantic understanding @devlinBERTPretrainingDeep2019 and (2) the popularization of collective efforts to aggregate large-scale openly available datasets @oneillOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 are increasingly pushing the field of robot learning towards the pre-train-and-adapt paradigm. This shift taps into the long-standing challenge of developing generalist robot policies, and holds the promise to surpass traditionally siloed approaches to robotics problems and develop a *foundation robotics model*. While Section <a href="#learning-imitation" data-reference-type="ref" data-reference="learning-imitation">[learning-imitation]</a> introduced methods for learning *single-task policies* such as ACT or Diffusion Policy, in this section we present advancements in developing *generalist, multi-task, policies*, capable of performing a wide range of tasks across different environments and embodiments, and guided by unstructured instructions typically given in plain, natural language.
-<figure>
-<ResponsiveImage
   src={ch5_ml_vs_robotics_foundation}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch5-ml-vs-robotics-foundation" style="position: absolute;"></span>
-<figcaption>Fields within ML such as Computer Vision and NLP converged on the development of foundation models, trained on a variety of large scale models and capable to perform multiple downstream tasks (top). Conversely, robotics suffered from limited standardization in terms of the architectures used, and siloed, task specific datasets, incurring in a high degree of fragmentation which traditionally hindered the development of generalist models for robotics in favour of task-specific models (bottom).</figcaption>
-</figure>
 ### Preliminaries: Models and Data
 The remarkable success of foundation models in NLP and CV seems to be increasingly predicated on two core principles: architectural innovation and (joint) data-compute scaling. Indeed, the transformer architecture proved very effective in capturing long-range dependencies in a variety of data formats, and its stability and expressivity made it the *de facto* standard for modern large-scale models trained on internet-scale datasets. However, in stark contrast with large-scale NLP and CV datasets @raffelExploringLimitsTransfer2023, @ImageNet_VSS09, robotics has historically developed around small, task-specific datasets. In turn, this traditionally hindered scalability across problems as well as results, posing concrete challenges to developing general-purpose robot learning algorithms. Indeed, differently from the wealth of relatively readily-available task-agnostic text and images datasets on the internet, robotics data is *intrinsically embodied* and thus task-specific: datasets collected for *manipulation* differ significantly from *locomotion*. In particular, since each expert trajectory is tied to a specific robot platform and the operating conditions of its environment and task, data heterogeneity has long posed a *methodological* challenge for scaling robotics datasets via aggregation. Further, datasets consisting of expert demonstrations are (1) intrinsically more expensive to collect and (2) notoriously heterogeneous--different human experts may perform the same task in very different ways. Beyond this, heterogeneity also raises *conceptual* issues: naively mixing data across embodiments can induce negative transfer, as control strategies developed in isolation for different robot systems in different environments may even conflict when combined. 
Thus, the high degree of fragmentation of robotics datasets and tasks has traditionally led to the development of *specialist* policies, trained on small, task-specific datasets, developed to perform well at their designated task but that fail to generalize to new deployment scenarios (Figure <a href="#ch5-ml-vs-robotics-foundation" data-reference-type="ref" data-reference="ch5-ml-vs-robotics-foundation">[ch5-ml-vs-robotics-foundation]</a>).
-<figure>
-<ResponsiveImage
   src={ch5_generalist_policies_timeline}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch5-generalist-policies-timeline" style="position: absolute;"></span>
-<figcaption>Early efforts in the development of generalist models for robotics include BC-Zero @jangBCZZeroShotTask2022, RT-1 @brohanRT1RoboticsTransformer2023, and RT-2 @brohanRT2VisionLanguageActionModels2023: large scale models trained on thousands of demonstrations. The open release of the Open-X @oneillOpenXEmbodimentRobotic2025 and DROID datasets @khazatskyDROIDLargeScaleInTheWild2025 fostered the development of open source models: OpenVLA @kimOpenVLAOpenSourceVisionLanguageAction2024, <span class="math inline"> <em>π</em> <sub>0</sub> </span> @blackp0VisionLanguageActionFlow2024 and SmolVLA @shukorSmolVLAVisionLanguageActionModel2025.</figcaption>
-</figure>
 Driven by the goal of developing generalist robot policies, the research community has increasingly explored how insights and techniques from other areas of ML can be integrated into robotics. Figure <a href="#ch5-generalist-policies-timeline" data-reference-type="ref" data-reference="ch5-generalist-policies-timeline">[ch5-generalist-policies-timeline]</a> shows a timeline of some of the most popular contributions attempting at developing generalist policies. Starting from BC-Zero, a latent variable model trained on 25k+ demonstrations, the field has now evolved into $\pi_0$, a transformer-based model trained on 10M+ demonstrations and exhibiting strong few-shot capabilities across tasks and embodiments. In between, Robotics Transformer 1 (RT-1) @brohanRT1RoboticsTransformer2023 represented a significant step in the direction of developing a generalist robot policies over prior work including (1) BC-Zero @jangBCZZeroShotTask2022 and (2) Gato @reedGeneralistAgent2022, in that @brohanRT1RoboticsTransformer2023 use a much larger and diverse set of training tasks compared to both BC-Zero and Gato. In particular, RT-1 uses a transformer architecture, and is trained on as many as 130k human-recorded trajectories collected over 13 robots and over 17 months. RT-1 learns to process a history of camera images and a natural language instruction, and feeds the resulting sequence of high-dimensional tokens to a transformer, trained using a *classification loss on a discretized actions space* consisting of six different 256-bins, one for each joint of a 6-dof robotic arm.
@@ -1983,17 +1817,12 @@ Traditionally, research efforts revolved around not only training models, but al
 Despite these advancements, the success of large, proprietary models like RT-1 and RT-2, highlighted a growing accessibility gap in robotics research, as training and deploying large-scale robotics foundation models requires computational resources simply unattainable for most research institutions. The OpenVLA project @kimOpenVLAOpenSourceVisionLanguageAction2024 emerged in direct contrast to traditionally closed-source efforts to develop VLAs. In particular, @kimOpenVLAOpenSourceVisionLanguageAction2024 trained OpenVLA by exclusively leveraging openly available data (970k+ trajectories from the Open-X dataset), and openly shared their training recipes alongside the model weights. Architecturally, OpenVLA integrates a pre-trained vision encoder to project visual tokens into the embedding space of the Llama2-7B @touvronLlama2Open2023 language-model backbone. The language model backbone is then used to predict *discrete action tokens* over 256 activation levels.
-<figure>
-<ResponsiveImage
   src={ch5_trends}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch5-trends" style="position: absolute;"></span>
-<figcaption>Robot learning is undergoing a paradigmatic shift: centralized data collections (A, left) are increasingly larger, often comprising millions of demonstrations, while (A, right) decentralized data collection efforts are becoming an alternative for large scale data collection. (B) Generalist models are also becoming increasingly smaller and easier to run on limited hardware.</figcaption>
-</figure>
 Figure <a href="#ch5-trends" data-reference-type="ref" data-reference="ch5-trends">[ch5-trends]</a> shows the current trends in robot learning in terms of size and nature of the robotics datasets contributed, together with the size and accessibility of the available models. As datasets collected via centralized, cross-institutions cooperation of increasing size are made available for the research community, decentralized datasets collected by individual researchers and practitioners also gained traction, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are increasingly becoming much more compute-efficient, and as a result the models’ size has been consistently reducing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments.
@@ -2013,17 +1842,12 @@ Recently, compute efficiency has also become a central focus in multi-modal rese
 $\pi_0$ @blackp0VisionLanguageActionFlow2024 introduce a VLA consisting of a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching. Images and language are embedded with PaliGemma, a VLM merging independently encoded visual and textual features deep in the network (*late-fusion*), while proprioceptive state and actions chunks are routed to a smaller *action expert*, initialized from scratch. The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and values matrices at each layer, maintaining specialization while efficiently allocating computation.
-<figure>
-<ResponsiveImage
   src={ch5_pi0}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch5-pi0" style="position: absolute;"></span>
-<figcaption>The <span class="math inline"> <em>π</em> <sub>0</sub> </span> architecture, as in @blackp0VisionLanguageActionFlow2024. Vision and language tokens are routed to a VLM backbone which is prevented from attending robot proprioperceptive states and action tokens, which are instead routed to a smaller subset of weights within the architecture referred to as "action expert". The architecture is trained with Flow Matching on 10M+ trajectories from a mixture of closed and openly available datasets.</figcaption>
-</figure>
 Concretely, $\pi_0$ is a single, unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $f_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple cameras points $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process both the robot proprioperceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turn them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioperceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while *across* blocks, future blocks are masked out. 
Formally, this corresponds to using an attention mask like: $\mathbf{A} = \bordermatrix{ & \mathcal{T}_i & \mathcal{T}_q & \mathcal{T}_a \cr \mathcal{T}_i & \mathbf{1} & \mathbf{0} & \mathbf{0} \cr \mathcal{T}_q & \mathbf{1} & \mathbf{1} & \mathbf{0} \cr \mathcal{T}_a & \mathbf{1} & \mathbf{1} & \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block bidirectional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive tokens and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
@@ -2058,11 +1882,10 @@ Flow matching  can be seen as a continuous time, deterministic generalization o
 <div class="wrapfigure">
-r0.4 <ResponsiveImage
   src={ch5_pi0_sampling_timesteps}
   zoomable
   downloadable
-  layout="fixed"
   alt="image"
 />
@@ -2141,17 +1964,12 @@ for epoch in range(num_epochs):
 With VLAs in the early stage of development compared to more mature LLMs and VLMs, much of the progress made on VLAs remains proprietary, with many releases exclusively sharing the weights while withholding the data used, full experimental details and essential methodological components of training. In contrast with this closed approach, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, which aims at democratizing the developments of robotics foundation models by open sourcing the model alongside the data used as well as the training recipes.
-<figure>
-<ResponsiveImage
   src={ch5_smolvla}
   zoomable
   downloadable
-  layout="fixed"
   alt="Figure"
-/>
-<span id="ch5-smolvla" style="position: absolute;"></span>
-<figcaption>The SmolVLA architecture, as in @shukorSmolVLAVisionLanguageActionModel2025. SmolVLA is a compact MoE model trained with flow matching to denoise action chunks. Vision and language tokens are fed to a VLM backbone, and share information with the proprioperceptive and action tokens via the attention mechanism. The attention expert interleaves SA and CA layers for further conditioning on the visual features from the VLM backbone. SmolVLA skips computations and reduces the visual tokens, resulting in 7x less memory usage than <span class="math inline"> <em>π</em> <sub>0</sub> </span> (450M parameters vs. <span class="math inline"> <em>π</em> <sub>0</sub> </span>’s 3.3B).</figcaption>
-</figure>
 While encouraging efforts like $\pi_0$ @blackp0VisionLanguageActionFlow2024 demonstrate the feasibility of open VLA systems, they remain (1) large and compute-intensive and (2) dependent on closed datasets collected via centralized efforts on costly robotic platforms, which ultimately hinders the accessibility of the method altogether. SmolVLA mitigates both these issues by (1) prioritizing a compact, compute-efficient VLA design and (2) targeting community-contributed datasets on accessible robotic platforms such as the SO-100 and SO-101 arms. Similarly to $\pi_0$, SmolVLA (Figure <a href="#ch5-smolvla" data-reference-type="ref" data-reference="ch5-smolvla">[ch5-smolvla]</a>) employs a MoE architecture combining a pretrained VLM backbone with a dedicated action expert, and trains with flow matching. To ensure efficiency and accessibility, SmolVLA adopts SmolVLM-2 @marafiotiSmolVLMRedefiningSmall2025 as its VLM backbone, considering SmolVLM-2’s reduced size and capability to process multiple image inputs alongside text items. SmolVLM-2 uses SigLIP @zhaiSigmoidLossLanguage2023 as vision encoder, producing visual features for a SmolLM2 language decoder @allalSmolLM2WhenSmol2025. Further, SmolVLA adopts a smaller action expert consisting of $\sim$100M parameters and an interleaved stack of self and cross-attention layers. To improve efficiency, the action expert adopts a reduced embedding dimension compared to the VLM backbone, resulting in $d_{v_\theta} = 0.75 d_{\text{VLM}}$. @shukorSmolVLAVisionLanguageActionModel2025’s design choices thus result in a much smaller size model compared to $\pi_0$, consisting of ca. 450M parameters versus $\pi_0$’s 3.3B parameters.

 ---
 import MultiImage from '../components/MultiImage.astro';
+import Image from '../components/Image.astro';
 import Quote from '../components/Quote.astro';
 import ch2_planar_manipulator_free from './assets/image/figures/ch2/ch2-planar-manipulator-free.png';
 import ch2_planar_manipulator_floor from './assets/image/figures/ch2/ch2-planar-manipulator-floor.png';
 ## Introduction
+<Image
   src={ch1_lerobot_figure1}
   zoomable
   downloadable
   alt="Figure"
+ caption={'lerobot is the open-source library for end-to-end robotics developed by Hugging Face. The library is vertically integrated on the entire robotics stack, supporting low-level control of real-world robot devices, advanced data and inference optimizations, as well as SOTA robot learning methods with simple implementations in pure Pytorch.'}/>
 Autonomous robotics holds the premise of relieving humans from repetitive, tiring or dangerous manual tasks. Consequently, the field of robotics has been widely studied since its first inception in the 1950s. Lately, advancements in Machine Learning (ML) have sparked the development of a relatively new class of methods used to tackle robotics problems, leveraging large amounts of data and computation rather than human expertise and modeling skills to develop autonomous systems.
 ### Explicit and Implicit Models
+<Image
   src={ch2_approaches}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Overview of methods to generate motion (clearly non-exhaustive, see @bekrisStateRobotMotion2024). The different methods can be grouped based on whether they explicitly (dynamics-based) or implicitly (learning-based) model robot-environment interactions.'}/>
 Robotics is concerned with producing artificial motion in the physical world in useful, reliable and safe fashion. Thus, robotics is an inherently multi-disciplinary domain: producing autonomous motion in the physical world requires, at the very least, interfacing different software (motion planners) and hardware (motion executioners) components. Further, knowledge of mechanical, electrical, and software engineering, as well as rigid-body mechanics and control theory have therefore proven quintessential in robotics since the field first developed in the 1950s. More recently, Machine Learning (ML) has also proved effective in robotics, complementing these more traditional disciplines @connellRobotLearning1993. As a direct consequence of its multi-disciplinary nature, robotics has developed as a rather wide array of methods, all concerned with the main purpose of <mark>producing artificial motion in the physical world</mark>.
 ### Different Types of Motion
+<Image
   src={ch2_platforms}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Different kinds of motions are achieved with potentially very different robotic platforms. From left to right, top to bottom: ViperX, SO-100, Boston Dynamics’ Spot, Open-Duck, 1X’s NEO, Boston Dynamics’ Atlas. This is an example list of robotic platforms and is (very) far from being exhaustive.'}/>
 In the vast majority of instances, robotics deals with producing motion via actuating joints connecting nearly entirely-rigid links. A key distinction between focus areas in robotics is based on whether the generated motion modifies (1) the absolute state of the environment (via dexterity), (2) the relative state of the robot with respect to its environment (exercising mobility skills), or (3) a combination of the two (Figure <a href="#robotics-platforms-atlas" data-reference-type="ref" data-reference="robotics-platforms-atlas">[robotics-platforms-atlas]</a>).
 Recently, the development of low-cost manipulators like the ALOHA @zhaoLearningFineGrainedBimanual2023 ALOHA-2 @aldacoALOHA2Enhanced and SO-100/SO-101 @knightStandardOpenSO100 platforms significantly lowered the barrier to entry to robotics, considering the increased accessibility of these robots compared to more traditional platforms like the Franka Emika Panda arm (Figure <a href="#robotic-platforms-costs" data-reference-type="ref" data-reference="robotic-platforms-costs">[robotic-platforms-costs]</a>).
+<Image
   src={ch2_cost_accessibility}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Cheaper, more accessible robots are starting to rival traditional platforms like the Panda arm platforms in adoption in resource-constrained scenarios. The SO-100, in particular, has a cost in the 100s of Euros, and can be entirely 3D-printed in hours, while the industrially-manufactured Panda arm costs tens of thousands of Euros and is not openly available.'}/>
 Deriving an intuition as per why learning-based approaches are gaining popularity in the robotics community requires briefly analyzing traditional approaches for manipulation, leveraging tools like forward and inverse kinematics (FK, IK) and control theory. Providing a detailed overview of these methods falls (well) out of the scope of this tutorial, and we refer the reader to works including @sicilianoSpringerHandbookRobotics2016, @lynchModernRoboticsMechanics2017, @tedrakeRoboticManipulationPerception, @tedrakeUnderactuatedRoboticsAlgorithms for a much more comprehensive description of these techniques. Here, we mostly wish to highlight the benefits of ML over these traditional techniques.
+<Image
   src={ch2_so100_to_planar_manipulator}
   zoomable
   downloadable
   alt="Figure"
+ caption={'The SO-100 arm is a 6-dof manipulator arm. Preventing some of its joints (shoulder pane, wrist flex and wrist roll) from actuating, it can be represented as a traditional 2-dof planar manipulator (the gripper joint in the end-effector is not considered towards the count of the degrees of freedom used to produce motion).'}/>
 Consider the (simple) case where a SO-100 is restrained from actuating (1) the shoulder pane and (2) the wrist flex and roll motors. This effectively reduces the degrees of freedom of the SO-100 from the original 5+1 (5 joints + 1 gripper) to 2+1 (shoulder lift, elbow flex + gripper). As the end-effector does not impact motion in this model, the SO-100 is effectively reduced to the planar manipulator robot presented in Figure <a href="#make-so100-planar-manipulator" data-reference-type="ref" data-reference="make-so100-planar-manipulator">[make-so100-planar-manipulator]</a>, where spheres represent actuators, and solid lines indicate length-$l$ links from the base of the SO-100 to the end-effector (*ee*).
 <div class="wrapfigure">
+r0.3 <Image
   src={ch2_planar_manipulator_floor_box}
   zoomable
   downloadable
   alt="image"
 />
 Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular applicative problem.
+<Image
   src={ch2_classical_limitations}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Dynamics-based approaches to robotics suffer from several limitations: (1) orchestrating multiple components poses integration challenges; (2) the need to develop custom processing pipelines for the sensing modalities and tasks considered hinders scalability; (3) simplified analytical models of physical phenomena (here friction at the gripper; credits to @antonovaReinforcementLearningPivoting2017) limit real-world performance. Lastly, (4) dynamics-based methods overlook trends in the availability and growth of robotics data.'}/>
 Dynamics-based robotics pipelines have historically been <mark>developed sequentially, engineering the different blocks</mark> now within most architectures for specific purposes. That is, sensing, state estimation, mapping, planning, (diff-)IK, and low-level control have been traditionally developed as distinct modules with fixed interfaces. Pipelining these specific modules proved error-prone, and brittleness emerges--alongside compounding errors--whenever changes occur (e.g., changes in lighting for sensing, occlusion/failure of sensors, control failures). Adapting such a stack to new tasks or robotic platforms often entails re-specifying objectives, constraints, and heuristics at multiple stages, incurring significant engineering overhead.
 TL;DR The need for expensive, high-fidelity simulators can be obviated by learning from real-world data, using sample-efficient algorithms that can safely train directly on hardware.
 </div>
+<Image
   src={ch3_learning_benefits}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Learning-based robotics streamlines perception-to-action by learning a (1) unified high-level controller capable to take (2) high-dimensional, unstructured sensorimotor information. Learning (3) does not require a dynamics model and instead focuses on interaction data, and (4) empirically correlates with the scale of the data used.'}/>
 Learning-based techniques for robotics naturally address the limitations presented in Section <a href="#classical" data-reference-type="ref" data-reference="classical">[classical]</a> (Figure <a href="#robot-learning-upsides" data-reference-type="ref" data-reference="robot-learning-upsides">[robot-learning-upsides]</a>). In particular, learning-based techniques typically rely on monolithic prediction-to-action pipelines (*visuomotor policies*) which directly map sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components. Mapping sensory inputs to actions also makes it possible to incorporate diverse input modalities, leveraging the automatic feature extraction capabilities of modern learning systems. Moreover, learning-based approaches can, in principle, bypass explicit modeling altogether and instead rely solely on interaction data--an advantage that proves transformative when dynamics are difficult to model or entirely unknown. Lastly, learning for robotics (*robot learning*) is naturally well posed to leverage the growing amount of robotics data openly available, just as computer vision and natural language processing did historically benefit from large-scale corpora of data, in great part overlooked by dynamics-based approaches.
 <div class="wrapfigure">
+<Image
   src={ch3_learning_atlas}
   zoomable
   downloadable
   alt="image"
 />
 In Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> we deliberately include generalist robot models @blackp0VisionLanguageActionFlow2024, @shukorSmolVLAVisionLanguageActionModel2025 alongside task-specific BC methods. While significantly different in spirit--*generalist* models are language-conditioned and use instructions to generate motion valid across many tasks, while *task-specific* models are typically not language-conditioned and used to perform a single task--*foundation* models are still largely trained to reproduce trajectories contained in a (large) training set of input demonstrations. Thus, we argue generalist policies can indeed be grouped alongside other task-specific BC methods, as they both leverage similar training data and schemas. Figure <a href="#robot-learning-atlas" data-reference-type="ref" data-reference="robot-learning-atlas">[robot-learning-atlas]</a> illustrates this categorization graphically, explicitly listing all the robot learning policies currently available in `lerobot`: Action Chunking with Transformers (ACT) @zhaoLearningFineGrainedBimanual2023, Diffusion Policy @chiDiffusionPolicyVisuomotor2024, Vector-Quantized Behavior Transformer (VQ-BeT) @leeBehaviorGenerationLatent2024, $\pi_0$ @blackp0VisionLanguageActionFlow2024, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025, Human-in-the-loop Sample-efficient RL (HIL-SERL) @luoPreciseDexterousRobotic2024 and TD-MPC @hansenTemporalDifferenceLearning2022.
+<Image
   src={ch3_rl_examples}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Examples of two different robotics tasks performed using RL. In the manipulation task (A) an agent learns to reach for a yellow plastic block in its environment, and to put it inside of a box. In the locomotion task (B) an agent learns to move its center of mass sideways without falling.'}/>
 Applications of RL to robotics have been studied long enough that the relationship between these two disciplines has been compared to that of physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their inherently interactive and sequential nature, robotics control problems can be directly cast as RL problems. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> presents two such cases. Reaching for an object to then move it somewhere else in the scene is a sequential problem where over time the controller needs to adjust the position of the robot arm based on the current configuration and the (possibly varying) position of the object. Figure <a href="#robotics-with-rl-examples" data-reference-type="ref" data-reference="robotics-with-rl-examples">[robotics-with-rl-examples]</a> also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation: while sliding to the side, the controller needs to keep adjusting to the robot’s configuration to avoid failure (falling).
 The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to tackle robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) capable of *continuously behaving* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents improve through trial and error, bypassing explicit models of the problem dynamics in favor of interaction data. In RL, this feedback loop between actions and outcomes (Figure <a href="#rl-most-famous-pic" data-reference-type="ref" data-reference="rl-most-famous-pic">[rl-most-famous-pic]</a>) is established through the agent sensing a scalar quantity (*reward*) measuring how desirable a given *transition* is for the accomplishment of its goal.
+<Image
   src={ch3_agent_env}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Agent-Environment interaction diagram (image credits to @suttonReinforcementLearningIntroduction2018).'}/>
 Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP) @bellmanMarkovianDecisionProcess1957. Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through MDP’s inherently stochastic formulation and (2) providing a theoretically-sound framework for learning *without* an explicit model of the environment dynamics. While accommodating a continuous time formulation too, MDPs are typically considered in discrete time in RL, assuming interactions to atomically take place at discrete *timesteps* $t=0,1,2,3, \dots, T$. MDPs allowing for an unbounded number of interactions ($T \to + \infty$) are termed *infinite-horizon*, and opposed to *finite-horizon* MDPs in which $T$ is finite. Unless otherwise specified, we will only be referring to discrete-time finite-horizon (*episodic*) MDPs.
 ```
 inducing an ordering over states and state-action pairs under $\pi$, and value functions are thus central to most RL algorithms. A variety of algorithms have been developed in RL attempting to find (approximate) solutions to the problem of maximizing cumulative reward (we report some in Figure <a href="#rl-algos-atlas" data-reference-type="ref" data-reference="rl-algos-atlas">[rl-algos-atlas]</a>).
+<Image
   src={ch3_rl_algorithms_atlas}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Popular RL algorithms. See @SpinningUp2018 for a complete list of citations.'}/>
 Popular approaches to continuous state and action space--such as those studied within robotics--include ,  and . Across manipulation @akkayaSolvingRubiksCube2019 and locomotion problems @leeLearningQuadrupedalLocomotion2020, RL proved extremely effective in providing a platform to (1) leverage a unified, streamlined perception-to-action pipeline, (2) natively integrate proprioception with multi-modal high-dimensional sensory streams, (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2025.
 First, especially early in training, <mark>actions are typically explorative, and thus may be erratic</mark>. On physical systems, untrained policies may command high velocities, self-colliding configurations, or torques exceeding joint limits, leading to wear and potential hardware damage. Mitigating these risks requires external safeguards (e.g., watchdogs, safety monitors, emergency stops), often incurring a high degree of human supervision. Further, in the typical episodic setting considered in most robotics problems, experimentation is substantially slowed down by the need to manually reset the environment over the course of training, a time-consuming and error-prone process. Second, learning efficiently remains problematic in RL, <mark>limiting the applicability of RL in real-world robotics due to consequently prohibitive timescales of training</mark>. Even strong algorithms such as SAC @haarnojaSoftActorCriticOffPolicy2018 typically require a large number of transitions $\{ (s_t, a_t, r_t, s_{t+1})\}_{t=1}^N$. On real-world hardware, generating this data is time-consuming.
+<Image
   src={ch3_duck_sim_vs_real}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Simulated (left) vs. real-world (right) OpenDuck. Discrepancies in the simulation dynamics (reality gap) pose risks to policy transfer.'}/>
 Training RL policies in simulation @tobinDomainRandomizationTransferring2017 addresses both issues, eliminating physical risk and dramatically increasing throughput. Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) limiting the possibilities to transfer the policies learned in simulation, due to the discrepancy between real and simulated environments (*reality gap*, Figure <a href="#synthetic-vs-real-duck" data-reference-type="ref" data-reference="synthetic-vs-real-duck">[synthetic-vs-real-duck]</a>). *Domain randomization* @tobinDomainRandomizationTransferring2017 (DR) is a popular technique to overcome the reality gap, and consists in randomizing the parameters of the simulated environment during training, aiming at inducing robustness to specific disturbances. In this, DR is typically employed to increase the diversity of scenarios over the course of training, improving on the performance of sim-to-real transferred policies @akkayaSolvingRubiksCube2019, @antonovaReinforcementLearningPivoting2017, @jiDribbleBotDynamicLegged2023. In practice, DR is performed training in simulation on simulated dynamics $\mathcal D$, further parametrized as $\mathcal D \equiv \mathcal D_\xi$, with a *dynamics* (random) vector $\xi$ drawn from an arbitrary distribution, $\xi \sim \Xi$. For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure <a href="#ducks-on-terrains" data-reference-type="ref" data-reference="ducks-on-terrains">[ducks-on-terrains]</a>), or the center of mass of an object for a manipulation task. Over the course of training--typically at each episode’s reset--a new $\xi$ is drawn, and used to specify the environment’s dynamics for that episode.
+<Image
   src={ch3_many_ducks}
   zoomable
   downloadable
   alt="Figure"
+ caption={'The same locomotion task can be carried out in different (simulated) domains (exemplified by the difference in terrains) at training time, resulting in increased robustness over diverse environment dynamics.'}/>
 While effective in transferring policies across the reality gap in real-world robotics @tobinDomainRandomizationTransferring2017, @akkayaSolvingRubiksCube2019, @jiDribbleBotDynamicLegged2023, @tiboniDomainRandomizationEntropy2024, DR often requires extensive manual engineering. First, identifying which parameters to randomize--i.e., the *support* $\text{supp} (\Xi)$ of $\Xi$--is an inherently task specific process. When locomoting over different terrains, choosing to randomize the friction coefficient is a reasonable choice, yet not completely resolutive as other factors (lighting conditions, external temperature, joints’ fatigue, etc.) may prove just as important in practice, making selecting these parameters yet another source of brittleness.
 Lastly, in order to improve on the robustness of their approach to different goals while maintaining practical scalability, @luoSERLSoftwareSuite2025 introduced a modified state and action space, expressing proprioperceptive configurations $q$ and actions $\dot q$ in the frame of the end-effector pose at $t=0$. Randomizing the initial pose of the end-effector ($s_0$), @luoSERLSoftwareSuite2025 achieved a similar result to that of manually randomizing the environment at every timestep, but with the benefit of maintaining the environment in the same condition across multiple training episodes, achieving higher scalability of their method thanks to the increased practicality of their approach.
+<Image
   src={ch3_hil_serl_examples}
   zoomable
   downloadable
   alt="Figure"
+ caption={'(A) HIL-SERL allows for real-world training of high performance RL agents by building on top of advancements presented by SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on a real-world SO-100.'}/>
 Building on off-policy deep Q-learning with replay buffers, entropy regularization for better exploration, expert demonstrations to guide learning, and a series of tools and recommendations for real-world training using reward classifiers (Figure <a href="#hil-serl-blocks" data-reference-type="ref" data-reference="hil-serl-blocks">[hil-serl-blocks]</a>), @luoPreciseDexterousRobotic2024 introduce human interactions during training, learning near-optimal policies in challenging real-world manipulation tasks in 1-2 hours.
 #### Code Example- Real-world RL
+<Image
   src={ch3_hil_serl_architecture}
   zoomable
   downloadable
   alt="Figure"
+ caption={'HIL-SERL is a SOTA RL algorithm for training control policies directly in the real-world. Its implementation in lerobot relies on a decoupled actor-learner architecture, communicating over processes (and possibly networks) with queues used to share (1) transitions $(s_t, a_t, r_t, s_{t+1})$ and (2) parameters θ.'}/>
 This example shows how to use the HIL-SERL implementation supported by `lerobot`. This code example is organized into four parts: we first show how to train a reward classifier from a custom set of demonstrations, then define the `Actor` and `Learner` components, and finally, we bring them together in a complete script showing how to use HIL-SERL in practice.
 TL;DR Behavioral Cloning provides a natural platform to learn from real-world interactions without the need to design any reward function, and generative models prove more effective than point-wise policies at dealing with multimodal demonstration datasets.
 </div>
+<Image
   src={ch4_bc_trajectories}
   zoomable
   downloadable
   alt="Figure"
+ caption={'(A) Average (with standard deviation) evolution of the actuation levels over the first 5 recorded episodes in lerobot/svla_so101_pickplace. Proprioperceptive states prove invaluable to determine the robot’s state during an episode. (B) Camera frames are also recorded alongside measurements on the robot’s state, capturing information about the robot’s interaction with its environment.'}/>
 Learning from human demonstrations provides a pragmatic alternative to the RL pipeline discussed in Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>. Indeed, especially in real-world robotics, online exploration is typically <mark>costly and potentially unsafe</mark>, and designing (dense) reward signals is a <mark>brittle and task-specific</mark> process. Further, even success detection itself often requires bespoke instrumentation, while episodic training demands reliable resets--all factors complicating training RL algorithms on hardware at scale. Behavioral Cloning (BC) sidesteps these constraints by <mark>casting control as an imitation learning problem</mark>, leveraging previously collected expert demonstrations to anchor the learned autonomous behavior. Most notably, by *learning-to-imitate*, autonomous systems naturally adhere to the objectives, preferences, and success criteria implicitly encoded in the data, which reduces early-stage exploratory failures and obviates hand-crafted reward shaping altogether.
 Formally, let $\mathcal D = \{ \tau^{(i)} \}_{i=1}^N$ be a set of expert trajectories, with $\tau^{(i)} = \{(o_t^{(i)}, a_t^{(i)})\}_{t=0}^{T_i}$ representing the $i$-th length-$T_i$ trajectory in $\mathcal D$, $o_t \in \mathcal O$ denoting observations (e.g., images and proprioception altogether), and $a_t \in \mathcal A$ the expert actions. Typically, observations $o \in \mathcal O$ consist of both image and proprioperceptive information, while actions $a \in \mathcal A$ represent control specifications for the robot to execute, e.g. a joint configuration. Note that differently from Section <a href="#learning-rl" data-reference-type="ref" data-reference="learning-rl">[learning-rl]</a>, in the imitation learning context $\mathcal D$ denotes an offline dataset collecting $N$ length-$T_i$ reward-free (expert) human trajectories $\tau^{(i)}$, and *not* the environment dynamics. Similarly, in this section $\tau^{(i)}$ represents a length-$T_i$ trajectory of observation-action pairs, which crucially *omits entirely any reward* information. Figure <a href="#ch4-bc-trajectories" data-reference-type="ref" data-reference="ch4-bc-trajectories">[ch4-bc-trajectories]</a> graphically shows trajectories in terms of the average evolution of the actuation on the 6 joints of a teleoperated SO-100 manipulator. Notice how proprioperceptive states are captured jointly with camera frames over the course of the recorded episodes, providing a unified high-frame rate collection of both image and joint teleoperation data. Figure <a href="#ch4-observation-action-mapping" data-reference-type="ref" data-reference="ch4-observation-action-mapping">[ch4-observation-action-mapping]</a> shows $(o_t, a_t)$-pairs for the same dataset, with the actions performed by the human expert illustrated alongside the corresponding observation. 
In principle, (expert) trajectories $\tau^{(i)}$ can have different lengths since demonstrations might exhibit multi-modal strategies to attain the same goal, resulting in multiple, different behaviors.
+<Image
   src={ch4_observation_action_mapping}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Sample observations and action pairs over the course of a given trajectory recorded in lerobot/svla_so101_pickplace. Observations, comprising of both proprioperceptive and visual information, are recorded alongside the configuration of a second, leader robot controlled by a human expert, providing complete information for regressing actions given observations.'}/>
 Behavioral Cloning (BC) @pomerleauALVINNAutonomousLand1988 aims at producing synthetic behaviors by learning the mapping from observations to actions, and in its most natural formulation can be effectively tackled as a *supervised* learning problem, consisting of learning the (deterministic) mapping $f: \mathcal O\mapsto \mathcal A, \ a_t = f(o_t)$ by solving
 ``` math
 Despite the inherent challenges of learning from non-i.i.d. data, the BC formulation presents several operational advantages in robotics. First, training happens offline and naturally accommodates expert demonstration data, thereby severely limiting exploration risks by preventing the robot from performing dangerous actions altogether, by anchoring action in imitation. Second, reward design is entirely unnecessary in BC, as demonstrations already reflect human intent. The absence of rewards also prevents the risk of misalignment and specification gaming (*reward hacking*), otherwise inherent in purely reward-based RL @heessEmergenceLocomotionBehaviours2017. Third, because expert trajectories encode terminal conditions, success detection and resets are implicit in the dataset. Finally, empirical evidence suggests the performance of BC scales naturally with growing corpora of demonstrations collected across tasks, embodiments, and environments. Nonetheless, BC can, in principle, only reproduce behaviors that are at best as good as those of the demonstrator, and therefore offers no remedy for the suboptimal decisions that humans may enact. This limitation is particularly problematic in sequential decision-making tasks where expert demonstrations are scarce---either because data collection is costly or because human performance is inherently suboptimal. Yet, many robotics applications still benefit from relatively inexpensive pipelines for collecting high-quality human-generated trajectories, justifying the use of BC in such settings.
+<Image
   src={ch4_issues_with_bc}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Point-wise policies suffer from limitations due to (A) covariate shifts and (B) poor approximation of multimodal demonstrations. (A) Small errors may drive the policy out of distribution, incurring a vicious circle ultimately resulting in failure. (B) Both modes of reaching for a target object in the scene--either left or right-first--are equally good and thus equally likely to be present in a dataset of human demonstrations, ultimately resulting in multimodal demonstrations.'}/>
 While conceptually elegant, *point-estimate policies* $f : \mathcal O\mapsto \mathcal A$ learned by solving eq. <a href="#loss-minimization-SL" data-reference-type="ref" data-reference="loss-minimization-SL">[loss-minimization-SL]</a> have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a> illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $\mathcal D$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $\epsilon$-prediction errors $0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon$ can quickly drive the policy into out-of-distribution states, incurring less confident generations and thus compounding errors (Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving real-world robotics problems, as multiple trajectories can be equally good towards the accomplishment of a goal (e.g., symmetric grasps, Figure <a href="#ch4-issues-with-bc" data-reference-type="ref" data-reference="ch4-issues-with-bc">[ch4-issues-with-bc]</a>, right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the *generative model* $p(o, a)$ underlying the samples in $\mathcal D$, rather than explicitly learning a prediction function $f: a = f(o)$.
 #### Variational Auto-Encoders
+<Image
   src={ch4_task_effect_on_pairs}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Intuitively, the latent variable in a single-latent model may contain information regarding the task being performed, which directly results in the likelihood of the same observation-action pair being different for two different tasks. When (A) picking a block the likelihood of a wide gripper’s opening should be higher than that of a narrower one, while it should be the opposite when (B) pushing the block.'}/>
 A common inductive bias used in GM posits samples $(o,a)$ are influenced by an unobservable latent variable $z \in Z$, resulting in:
 ``` math
 ```
 Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics application, $z$ could be interpreted as some high level representation of the underlying task being performed by the human demonstrator. In such case, treating $p(o,a)$ as a marginalization over $\operatorname{supp}({Z})$ of the complete joint distribution $p(o,a,z)$ natively captures the effect different tasks have on the likelihood of observation-action pairs. Figure <a href="#ch4-task-effect-on-pairs" data-reference-type="ref" data-reference="ch4-task-effect-on-pairs">[ch4-task-effect-on-pairs]</a> graphically illustrates this concept in the case of a (A) picking and (B) pushing task, for which, nearing the target object, the likelihood of actions resulting in opening the gripper--the higher $q_6$, the wider the gripper’s opening--should intuitively be (A) high or (B) low, depending on the task performed. While the latent space $Z$ typically has a much richer structure than the set of all actual tasks performed, eq. <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> still provides a solid framework to learn joint distribution conditioned on unobservable yet relevant factors. Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a> represents this latent-variable framework in the context of a robotics application- the true, $z$-conditioned generative process assigns *likelihood* $p((o,a) \vert z)$ to the single $(o,a)$-pair. Using Bayes’ theorem, one can reconstruct the *posterior* distribution on $\operatorname{supp}({Z})$, $q_\theta(z \vert o,a)$ from the likelihood $p_\theta(o,a \vert z)$, *prior* $p_\theta(z)$ and *evidence* $p_\theta(o,a)$. VAEs approximate the latent variable model presented in eq. 
<a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a> using an *approximate posterior* $q_\phi(z \vert o,a)$ while regressing parameters for a parametric likelihood, $p_\theta(o,a \vert z)$ (Figure <a href="#ch4-latent-variable-model" data-reference-type="ref" data-reference="ch4-latent-variable-model">[ch4-latent-variable-model]</a>).
+<Image
   src={ch4_latent_variable_model}
   zoomable
   downloadable
   alt="Figure"
+ caption={'(A) The latent variable model in a robotics application regulates influence between observed (o, a) variables and an unobservable latent variable. (B) VAEs approximate exact latent variable models by means of variational inference.'}/>
 Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can be written as:
 <span id="evidence-definition-1" style="position: absolute;">
 ```
 where we explicitly showed the marginalization over the multiple latents in eq. <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>, and used the law of conditional probability and Markov property in eq. <a href="#BC-multi-latent-model-2" data-reference-type="ref" data-reference="BC-multi-latent-model-2">[BC-multi-latent-model-2]</a>. Also, for ease of notation, we will refer to observation-action pairs $o,a$ as $z_0$.
+<Image
   src={ch4_many_latents}
   zoomable
   downloadable
   alt="Figure"
+ caption={'HMLV models posit the data generation process is influenced by a stack of Markov-dependent latent variables, with samples from the posterior distribution being progressively higher up in the hierarchy.'}/>
 Similar to VAEs, it is generally not possible to assign an *exact* interpretation to the latent variables. Nevertheless, a reasonable application-driven intuition is that Hierarchical Markov Latent Variable (HMLV) models, by capturing hierarchical and decoupled interactions among latent variables, can reflect the different resolutions at which conditioning factors intervene. For example, in a robotics setting, one might naturally distinguish between high-level trajectory planning (higher up in the hierarchy, $t \to T$) and fine-grained motion adjustments (closer to empirical observations, $t \to 0$). In that, HMLV models thus provide a framework to perform variational inference via multiple, sequential sampling steps from different higher level distributions instead of approximating the generative process with a single-latent variable model. DMs are a particular instantiation of HMLV models for which the posterior is fixed to $q( z_t \vert z_{t-1}) = \mathcal N(z_{t-1} \sqrt{1-\beta_t}, \beta_t \mathbf{I})$, for a given $\beta_t \in \mathbb R^+$. In practice, $\beta_t$ is used to iteratively reduce the signal-to-noise ratio along the latents’ hierarchy, similarly to how a diffusion process influences the information of a physical system.
 ```
 where the former term is equivalent to the reconstruction term in eq. <a href="#VAE-min-neg-ELBO" data-reference-type="ref" data-reference="VAE-min-neg-ELBO">[VAE-min-neg-ELBO]</a> and the latter term can be obtained in closed form.
+<Image
   src={ch4_diffusion_robot_actions}
   zoomable
   downloadable
   alt="Figure"
+ caption={'DMs iteratively corrupt samples (left) from an unknown distribution into a quasi-standard Gaussian (center), learning the displacement field (right) that permits to reconstruct samples from the unknown target distribution by iteratively denoising samples of a tractable, easy-to-sample distribution.'}/>
 Besides mathematical tractability of eq. <a href="#diffusion-likelihood-gradient" data-reference-type="ref" data-reference="diffusion-likelihood-gradient">[diffusion-likelihood-gradient]</a>, adopting Gaussian posteriors allows for a particularly intuitive interpretation of the training dynamics of DMs @permenterInterpretingImprovingDiffusion2024. As the hierarchical latent variables are repeatedly corrupted by applying increasingly more Gaussian noise, they progressively lose information about the original (unknown) sample $z_0$, converging toward a standard Gaussian which eventually contains no information at all (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>). Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a> illustrates this process on a simplified, bidimensional observation-action distribution, where we considered $o=q_2$ and $a=q^h_2$, with $q_2$ denoting the robot’s *elbow flex* actuation and $q^h_2$ the corresponding human teleoperator’s elbow flex. Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Notice how corrupted samples distribute differently from the most reasonable structure $a \simeq o$, further underscoring how diffusion corrupts both the individual samples and the global distribution (Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, left and center). 
In this, using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples. Comparing the diffused samples to the original data points, one can derive an estimate of the total displacement induced by the diffusion process, and, under the assumption that the likelihood of the totally diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by *learning to reverse* such displacement. This key intuition allows one to write a simplified training objective[^4]:
 <span id="diffusion-simplified-loss" style="position: absolute;">
 \end{align}
 ```
+<Image
   src={ch4_action_vs_observation_distribution}
   zoomable
   downloadable
   alt="Figure"
+ caption={'A joint action-observation distribution, in the simplified case where the observation is the elbow-flex actuation in a SO-100, and the action is the recorded position for the same joint from the teleoperator arm. The motion recorded being teleoperated, the points distribute along the diagonal.'}/>
 In this simplified (minimization) objective, the optimization process differs from eq. <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
 ```
 Conditional vector fields are defined not only over their argument $z$ and time $t$, but do also vary with respect to an auxiliary variable $z_0$, thereby extending the standard notion of a vector field to incorporate additional conditioning. Note that the traditional discrete-time noise-scheduler $\{\beta_t\}_{t=0}^T$ is now generalized to a continuous map $\beta : [0,1] \mapsto \mathbb R^+$. Crucially, @lipmanFlowMatchingGenerative2023 prove that by exclusively optimizing the vector field for individual data points $z_0 \in \mathcal D$, one also retrieves the optimal flow to morph the entire support of the initial distribution $p_0$ into $p_1 \ \text{s.t.} \mathcal D \sim p_1$.
+<Image
   src={ch4_normalizing_flows}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Probability distributions can be modified differently by applying different vector fields, inducing different flows of mass across the same support (top versus bottom, using two different time-invariant 2D-fields $u_1(x,y) = (x, 0)$ and $u_2(x,y) = (x/\sqrt{2}, y/\sqrt{2})$). Notice time flows continuously in [0, 1]. FM models learn to approximate a target vector field, thereby producing arbitrary (goal) transformations of an easy-to-sample initial distribution.'}/>
 While the noising schedule of DMs results in a stochastic process resembling a random (Brownian) walk, FM allows for more general--potentially, deterministic--likelihood and posterior parametrization. In the FM literature the likelihood and posterior probability densities defined along a HMLV model are typically referred to as a *probability path*, where the distributions for successive adjacent transitions in the HMLV model are related by the (normalized) flow between them (Figure <a href="#ch4-normalizing-flows" data-reference-type="ref" data-reference="ch4-normalizing-flows">[ch4-normalizing-flows]</a>). The inherent flexibility of FM is one of their key advantages over DMs, as it opens up the possibility of *learning* more efficient paths. For instance, one can design probability paths inspired by Optimal Transport (OT), a mathematical framework concerned with characterizing the most efficient morphings between probability distributions. Probability paths obtained through OT paths tend to be *straighter* than diffusion paths (Figure <a href="#ch4-diffusion-paths-versus-fm" data-reference-type="ref" data-reference="ch4-diffusion-paths-versus-fm">[ch4-diffusion-paths-versus-fm]</a>), which can lead to faster and more stable training, as well as empirically result in higher-quality generations with fewer denoising steps at inference time. In particular, by avoiding unnecessary backtracking associated with the inherent stochastic nature of both the noising and denoising process in DMs, test-time compute is typically significantly reduced in FM, while retaining comparable results @lipmanFlowMatchingGenerative2023.
+<Image
   src={ch4_diffusion_vs_flowmatching}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Compared to diffusion, flow matching distorts distribution along a less random pattern, resulting in a clearer interpolation between source and target distribution. The visualization shows an example comparison between these two methods on joint distribution of robot observations and actions over T = 50 steps.'}/>
 In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in eq. <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce an arbitrary mass displacement, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, Conditional FM (CFM) defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, which in turn results in the target vector field $u(t, z_t) = z_1 - z_0$. FM models can then be trained with a simple regression objective defined as:
 <span id="flow-matching-objective" style="position: absolute;">
 In ACT (Figure <a href="#ch4-act" data-reference-type="ref" data-reference="ch4-act">[ch4-act]</a>), inference for a given observation $o \in \mathcal O$ could be performed by (1) defining a prior $p_\omega(z \vert o)$ for the latent variable $z$ and (2) decoding an action chunk from a sampled latent $z \sim p_\omega(\bullet \vert o)$, similarly to how sampling from standard VAEs takes place, with the exception that vanilla VAEs typically pose $p(z\vert o) \equiv p(z) \sim \mathcal N(\mathbf{0}, \mathbf{I})$ and thus skip (1).
+<Image
   src={ch4_act_encoder}
   zoomable
   downloadable
   alt="Figure"
+ caption={'The CVAE encoder used in ACT. Input action chunks are first embedded and aggregated with positional embeddings, before being processed alongside embedded proprioceptive information, and a learned [CLS] token used to aggregate input level information, and predict the style variable z. The encoder is exclusively used to train the decoder, and it is entirely disregarded at inference time.'}/>
 However, the authors claim that using a deterministic procedure to sample $z$ benefits policy evaluation, and thus avoid using the conditional prior at all at inference time, effectively using the CVAE framework exclusively to train a more expressive decoder. At test time, @zhaoLearningFineGrainedBimanual2023 propose simply using $z = \mathbf{0}$, as the conditional prior on $z$ used in training is set to be a standard Gaussian. Further, conditioning on the observation $o$ is achieved through explicitly feeding proprioceptive and visual observations to the decoder, $p_\theta(a \vert z, o)$ at test time. If at inference $z$ is sampled from a standard Gaussian, during training $z$ is sampled from an approximate posterior distribution $q_\phi(z \vert o, a)$, which, however, disregards image observations and exclusively uses proprioceptive states to form $o$ for efficiency reasons.
+<Image
   src={ch4_act_decoder}
   zoomable
   downloadable
   alt="Figure"
+ caption={'The CVAE decoder used in ACT, consisting of a full encoder-decoder Transformer architecture. Camera observations from all n camera views are first embedded using pre-trained visual encoders, and then aggregated with the corresponding positional embeddings. Then, the proprioceptive information and style variable z retrieved from the CVAE encoder, are fed to the encoder-decoder Transformer for inference. The encoder shares the matrices K, V with the decoder, and is trained to decode fixed position embeddings into action chunks.'}/>
 #### Code Example: Training and Using ACT in Practice
+<Image
   src={ch4_act}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Action Chunking with Transformer (ACT), as in @zhaoLearningFineGrainedBimanual2023. ACT introduces an action chunking paradigm to cope with high-dimensional multi-modal demonstration data, and a transformer-based CVAE architecture.'}/>
 <div class="pbox">
 Training ACT
 ```
 Note how in eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a> the noise regressor is conditioned on both the latent variable rank $t$ *and* on a stack of previous observations $o_{t-H_o:t}$. @chiDiffusionPolicyVisuomotor2024 claim the combination of (1) conditioning on a horizon of previous observations and (2) predicting multiple actions into the future allows DP to *commit to specific modes* in the data at inference time, which proves essential for good performance and avoiding indecisiveness.
+<Image
   src={ch4_diffusion_policy}
   zoomable
   downloadable
   alt="Figure"
+ caption={'The Diffusion Policy architecture, as in @chiDiffusionPolicyVisuomotor2024. A stack of $H_o$ previous observations is used as external conditioning to denoise a group of $H_a$ actions. Conditioning is performed at every layer of a U-Net block. Diffusion Policy allows to obtain fully-formed action chunks with as few as T = 10 denoising steps.'}/>
 Figure <a href="#diffusion-policy-architecture" data-reference-type="ref" data-reference="diffusion-policy-architecture">[diffusion-policy-architecture]</a> shows the convolution-based version of the architecture proposed by @chiDiffusionPolicyVisuomotor2024, illustrating inference on a single sample drawn from $\mathcal D$, for simplicity. The starting, arbitrarily noisy chunk of $H_a$ actions $\tilde a_{t:t+H_a}$ is first mapped to a (learned) high-dimensional space. Similarly, both image observations and poses are also embedded before being aggregated to the action embeddings. Then, a U-Net @ronnebergerUNetConvolutionalNetworks2015 is trained to regress the noise added into $\tilde a_{t:t+H_a}$, conditioned on observation information at every layer, thus seeking to optimize eq. <a href="#diffusion-policy-objective" data-reference-type="ref" data-reference="diffusion-policy-objective">[diffusion-policy-objective]</a>. At inference time, the noise predictor is used to predict the quantity of noise at every $t \in [T, \dots, 0 ]$ and iteratively subtract it from $\tilde a_{t:t+H_a}$, reversing the diffusion process simulated in training conditioned on $o_{t-H_o:t}$ to predict $a_{t:t+H_a}$.
 One can use the fact that policies output multiple actions at the same time to directly address (1) the lack of adaptiveness and (2) the presence of lags at runtime by decoupling action chunk *prediction* $\mathbf{A}$ from action *execution* $a_t \gets \text{PopFront}(\mathbf{A}_t)$. This decoupled stack, which we refer to as *asynchronous* (async) inference (<a href="#alg-async-inference" data-reference-type="ref" data-reference="alg-async-inference">[alg-async-inference]</a>), also enables optimized inference by allowing action-chunk inference to run on a separate machine, typically equipped with better computational resources than the ones onboard a robot. In async inference, a $\text{RobotClient}$ sends an observation $o_t$ to a $\text{PolicyServer}$, receiving an action chunk $\mathbf{A}_t$ once inference is complete (Figure <a href="#ch4-async-inference" data-reference-type="ref" data-reference="ch4-async-inference">[ch4-async-inference]</a>). In this, we avoid execution lags by triggering chunk prediction while the control loop is still consuming a previously available chunk, aggregating the previous and incoming chunks whenever the latter is available to the $\text{RobotClient}$. In turn, async-inference tightens the loop between action prediction and action execution efficiently, by increasing the frequency at which observations are processed for chunk prediction while not running inference at every timestep. Crucially, decoupling action prediction from action execution also allows allocating more computational resources on a remote policy server sending actions to the robot client over the network.
+<Image
   src={ch4_async_inference}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Asynchronous inference. Illustration of the asynchronous inference stack. Note that the policy can be run on a remote server, possibly with GPUs.'}/>
 <div class="algorithm">
 <span id="alg-async-inference" style="position: absolute;"></span>
 - **Sync-inference limit $(g=1)$.** As an extreme case, and in keeping with @zhaoLearningFineGrainedBimanual2023, an observation is sent at *every* timestep. The queue is therefore almost always filled, with only a minor saw-tooth due to $\Delta t/\mathbb E[\ell_s] < 1$. While maximally reactive, this setting incurs one forward pass per control tick and can prove prohibitively expensive on limited hardware. Importantly, because the client is consuming actions while the server computes the next chunk, the available queue never gets entirely filled.
+<Image
   src={ch4_queues}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Action queue size evolution at runtime for various levels of g when (A) not filtering out observations based on joint-space similarity and (B) filtering out near-duplicate observations, measuring their similarity in joint-space.'}/>
 Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> emphasizes the trade-off governed by $g$: small values of $g$ result in idle periods, whereas $g\approx 1$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $g\in(0,1)$ allows to strike a balance between reactivity and resource budgets. If not for the aforementioned similarity filter, the $\text{RobotClient}$ would send observations for processing every $(1 - g) H_a \cdot \Delta t$ seconds, receiving a new chunk of actions every $(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]$, on average. The presence of the filter for observation similarity dilates this processing time, and serves the purpose of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. In particular, Figure <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> results in a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in <a href="#ch4-queues" data-reference-type="ref" data-reference="ch4-queues">[ch4-queues]</a> highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue is empty.
 The advent of large models trained on internet-scale datasets has drastically influenced fields like Computer Vision (CV) and Natural Language Processing (NLP), shifting the previously task-specific paradigm towards combining (1) an initial, task-agnostic large-scale pre-training stage and a (2) task-specific, adjustment phase. This *pre-train-and-adapt* paradigm has now largely replaced more classic approaches consisting of task-specific data collection, curation and model training in many subdomains within CV and NLP, and it is motivated by the main drawback of limited scalability for *task-specific approaches*, which have been traditionally more labor intensive. Factors including (1) the advancements in generalist models learned with self-supervision for perception @oquabDINOv2LearningRobust2024 or semantic understanding @devlinBERTPretrainingDeep2019 and (2) the popularization of collective efforts to aggregate large-scale openly available datasets @oneillOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 are increasingly pushing the field of robot learning towards the pre-train-and-adapt paradigm. This shift taps into the long-standing challenge of developing generalist robot policies, and holds the promise to surpass traditionally siloed approaches to robotics problems and develop a *foundation robotics model*. While Section <a href="#learning-imitation" data-reference-type="ref" data-reference="learning-imitation">[learning-imitation]</a> introduced methods for learning *single-task policies* such as ACT or Diffusion Policy, in this section we present advancements in developing *generalist, multi-task, policies*, capable of performing a wide range of tasks across different environments and embodiments, and guided by unstructured instructions typically given in plain, natural language.
+<Image
   src={ch5_ml_vs_robotics_foundation}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Fields within ML such as Computer Vision and NLP converged on the development of foundation models, trained on a variety of large scale models and capable of performing multiple downstream tasks (top). Conversely, robotics suffered from limited standardization in terms of the architectures used, and siloed, task specific datasets, resulting in a high degree of fragmentation which traditionally hindered the development of generalist models for robotics in favour of task-specific models (bottom).'}/>
 ### Preliminaries: Models and Data
 The remarkable success of foundation models in NLP and CV seems to be increasingly predicated on two core principles: architectural innovation and (joint) data-compute scaling. Indeed, the transformer architecture proved very effective in capturing long-range dependencies in a variety of data formats, and its stability and expressivity made it the *de facto* standard for modern large-scale models trained on internet-scale datasets. However, in stark contrast with large-scale NLP and CV datasets @raffelExploringLimitsTransfer2023, @ImageNet_VSS09, robotics has historically developed around small, task-specific datasets. In turn, this traditionally hindered scalability across problems as well as results, posing concrete challenges to developing general-purpose robot learning algorithms. Indeed, differently from the wealth of relatively readily-available task-agnostic text and images datasets on the internet, robotics data is *intrinsically embodied* and thus task-specific: datasets collected for *manipulation* differ significantly from *locomotion*. In particular, since each expert trajectory is tied to a specific robot platform and the operating conditions of its environment and task, data heterogeneity has long posed a *methodological* challenge for scaling robotics datasets via aggregation. Further, datasets consisting of expert demonstrations are (1) intrinsically more expensive to collect and (2) notoriously heterogeneous--different human experts may perform the same task in very different ways. Beyond this, heterogeneity also raises *conceptual* issues: naively mixing data across embodiments can induce negative transfer, as control strategies developed in isolation for different robot systems in different environments may even conflict when combined. 
Thus, the high degree of fragmentation of robotics datasets and tasks has traditionally led to the development of *specialist* policies, trained on small, task-specific datasets, developed to perform well at their designated task but that fail to generalize to new deployment scenarios (Figure <a href="#ch5-ml-vs-robotics-foundation" data-reference-type="ref" data-reference="ch5-ml-vs-robotics-foundation">[ch5-ml-vs-robotics-foundation]</a>).
+<Image
   src={ch5_generalist_policies_timeline}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Early efforts in the development of generalist models for robotics include BC-Zero @jangBCZZeroShotTask2022, RT-1 @brohanRT1RoboticsTransformer2023, and RT-2 @brohanRT2VisionLanguageActionModels2023: large scale models trained on thousands of demonstrations. The open release of the Open-X @oneillOpenXEmbodimentRobotic2025 and DROID datasets @khazatskyDROIDLargeScaleInTheWild2025 fostered the development of open source models: OpenVLA @kimOpenVLAOpenSourceVisionLanguageAction2024, π₀ @blackp0VisionLanguageActionFlow2024 and SmolVLA @shukorSmolVLAVisionLanguageActionModel2025.'}/>
 Driven by the goal of developing generalist robot policies, the research community has increasingly explored how insights and techniques from other areas of ML can be integrated into robotics. Figure <a href="#ch5-generalist-policies-timeline" data-reference-type="ref" data-reference="ch5-generalist-policies-timeline">[ch5-generalist-policies-timeline]</a> shows a timeline of some of the most popular contributions attempting to develop generalist policies. Starting from BC-Zero, a latent variable model trained on 25k+ demonstrations, the field has now evolved into $\pi_0$, a transformer-based model trained on 10M+ demonstrations and exhibiting strong few-shot capabilities across tasks and embodiments. In between, Robotics Transformer 1 (RT-1) @brohanRT1RoboticsTransformer2023 represented a significant step in the direction of developing generalist robot policies over prior work including (1) BC-Zero @jangBCZZeroShotTask2022 and (2) Gato @reedGeneralistAgent2022, in that @brohanRT1RoboticsTransformer2023 use a much larger and diverse set of training tasks compared to both BC-Zero and Gato. In particular, RT-1 uses a transformer architecture, and is trained on as many as 130k human-recorded trajectories collected over 13 robots and over 17 months. RT-1 learns to process a history of camera images and a natural language instruction, and feeds the resulting sequence of high-dimensional tokens to a transformer, trained using a *classification loss on a discretized action space* consisting of six different 256-bins, one for each joint of a 6-dof robotic arm.
 Despite these advancements, the success of large, proprietary models like RT-1 and RT-2, highlighted a growing accessibility gap in robotics research, as training and deploying large-scale robotics foundation models requires computational resources simply unattainable for most research institutions. The OpenVLA project @kimOpenVLAOpenSourceVisionLanguageAction2024 emerged in direct contrast to traditionally closed-source efforts to develop VLAs. In particular, @kimOpenVLAOpenSourceVisionLanguageAction2024 trained OpenVLA by exclusively leveraging openly available data (970k+ trajectories from the Open-X dataset), and openly shared their training recipes alongside the model weights. Architecturally, OpenVLA integrates a pre-trained vision encoder to project visual tokens into the embedding space of the Llama2-7B @touvronLlama2Open2023 language-model backbone. The language model backbone is then used to predict *discrete action tokens* over 256 activation levels.
+<Image
   src={ch5_trends}
   zoomable
   downloadable
   alt="Figure"
+ caption={'Robot learning is undergoing a paradigmatic shift: centralized data collections (A, left) are increasingly larger, often comprising millions of demonstrations, while (A, right) decentralized data collection efforts are becoming an alternative for large scale data collection. (B) Generalist models are also becoming increasingly smaller and easier to run on limited hardware.'}/>
 Figure <a href="#ch5-trends" data-reference-type="ref" data-reference="ch5-trends">[ch5-trends]</a> shows the current trends in robot learning in terms of size and nature of the robotics datasets contributed, together with the size and accessibility of the available models. As datasets collected via centralized, cross-institutions cooperation of increasing size are made available for the research community, decentralized datasets collected by individual researchers and practitioners also gained traction, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are increasingly becoming much more compute-efficient, and as a result the models’ size has been consistently reducing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments.
 $\pi_0$ @blackp0VisionLanguageActionFlow2024 introduces a VLA built on a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching. Images and language are embedded with PaliGemma, a VLM merging independently encoded visual and textual features deep in the network (*late-fusion*), while proprioceptive state and action chunks are routed to a smaller *action expert*, initialized from scratch. The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and value matrices at each layer, maintaining specialization while efficiently allocating computation.
+<Image
   src={ch5_pi0}
   zoomable
   downloadable
   alt="Figure"
+ caption={'The π₀ architecture, as in @blackp0VisionLanguageActionFlow2024. Vision and language tokens are routed to a VLM backbone which is prevented from attending to robot proprioceptive states and action tokens, which are instead routed to a smaller subset of weights within the architecture referred to as "action expert". The architecture is trained with Flow Matching on 10M+ trajectories from a mixture of closed and openly available datasets.'}/>
 Concretely, $\pi_0$ is a single, unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $f_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple camera viewpoints $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process both the robot proprioceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turn them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while *across* blocks, future blocks are masked out. 
Formally, this corresponds to using an attention mask like: $\mathbf{A} = \bordermatrix{ & \mathcal{T}_i & \mathcal{T}_q & \mathcal{T}_a \cr \mathcal{T}_i & \mathbf{1} & \mathbf{0} & \mathbf{0} \cr \mathcal{T}_q & \mathbf{1} & \mathbf{1} & \mathbf{0} \cr \mathcal{T}_a & \mathbf{1} & \mathbf{1} & \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block bidirectional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive tokens and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
 <div class="wrapfigure">
+<Image
   src={ch5_pi0_sampling_timesteps}
   zoomable
   downloadable
   alt="image"
 />
 With VLAs in the early stage of development compared to more mature LLMs and VLMs, much of the progress made on VLAs remains proprietary, with many releases exclusively sharing the weights while withholding the data used, full experimental details and essential methodological components of training. In contrast with this closed approach, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, which aims at democratizing the developments of robotics foundation models by open sourcing the model alongside the data used as well as the training recipes.
+<Image
   src={ch5_smolvla}
   zoomable
   downloadable
   alt="Figure"
+ caption={'The SmolVLA architecture, as in @shukorSmolVLAVisionLanguageActionModel2025. SmolVLA is a compact MoE model trained with flow matching to denoise action chunks. Vision and language tokens are fed to a VLM backbone, and share information with the proprioceptive and action tokens via the attention mechanism. The action expert interleaves SA and CA layers for further conditioning on the visual features from the VLM backbone. SmolVLA skips computations and reduces the visual tokens, resulting in 7x less memory usage than π₀ (450M parameters vs. π₀’s 3.3B).'}/>
 While encouraging efforts like $\pi_0$ @blackp0VisionLanguageActionFlow2024 demonstrate the feasibility of open VLA systems, they remain (1) large and compute-intensive and (2) dependent on closed datasets collected via centralized efforts on costly robotic platforms, which ultimately hinders the accessibility of the method altogether. SmolVLA mitigates both these issues by (1) prioritizing a compact, compute-efficient VLA design and (2) targeting community-contributed datasets on accessible robotic platforms such as the SO-100 and SO-101 arms. Similarly to $\pi_0$, SmolVLA (Figure <a href="#ch5-smolvla" data-reference-type="ref" data-reference="ch5-smolvla">[ch5-smolvla]</a>) employs a MoE architecture combining a pretrained VLM backbone with a dedicated action expert, and trains with flow matching. To ensure efficiency and accessibility, SmolVLA adopts SmolVLM-2 @marafiotiSmolVLMRedefiningSmall2025 as its VLM backbone, considering SmolVLM-2’s reduced size and capability to process multiple image inputs alongside text items. SmolVLM-2 uses SigLIP @zhaiSigmoidLossLanguage2023 as vision encoder, producing visual features for a SmolLM2 language decoder @allalSmolLM2WhenSmol2025. Further, SmolVLA adopts a smaller action expert consisting of $\sim$100M parameters and an interleaved stack of self and cross-attention layers. To improve efficiency, the action expert adopts a reduced embedding dimension compared to the VLM backbone, resulting in $d_{v_\theta} = 0.75 d_{\text{VLM}}$. @shukorSmolVLAVisionLanguageActionModel2025’s design choices thus result in a much smaller size model compared to $\pi_0$, consisting of ca. 450M parameters versus $\pi_0$’s 3.3B parameters.

app/src/content/embeds/{banner.html → banner2.html} RENAMED Viewed

File without changes