diff --git "a/app/src/content/article.mdx" "b/app/src/content/article.mdx" --- "a/app/src/content/article.mdx" +++ "b/app/src/content/article.mdx" @@ -2,10 +2,10 @@ title: "Robot Learning: A Tutorial" authors: - name: "Francesco Capuano" - affiliations: [1, 2] - - name: "Adil Zouitine" affiliations: [2] - - name: "Pepijn Kooijmans" + - name: "Caroline Pascal" + affiliations: [2] + - name: "Adil Zouitine" affiliations: [2] - name: "Thomas Wolf" affiliations: [2] @@ -14,7 +14,7 @@ authors: affiliations: - name: "École Normale Supérieure Paris-Saclay" - name: "Hugging Face" -published: "Sep 18, 2025" +published: "Oct 14, 2025" tableOfContentsAutoCollapse: true --- @@ -38,6 +38,7 @@ import ch3_rl_algorithms_atlas from './assets/image/figures/ch3/ch3-rl-algorithm import ch3_duck_sim_vs_real from './assets/image/figures/ch3/ch3-duck-sim-vs-real.png'; import ch3_many_ducks from './assets/image/figures/ch3/ch3-many-ducks.png'; import ch3_hil_serl_examples from './assets/image/figures/ch3/ch3-hil-serl-examples.png'; +import ch3_hil_serl_architecture from './assets/image/figures/ch3/ch3-hil-serl-architecture.png'; import ch4_bc_trajectories from './assets/image/figures/ch4/ch4-bc-trajectories.png'; import ch4_observation_action_mapping from './assets/image/figures/ch4/ch4-observation-action-mapping.png'; import ch4_issues_with_bc from './assets/image/figures/ch4/ch4-issues-with-bc.png'; @@ -48,9 +49,9 @@ import ch4_diffusion_robot_actions from './assets/image/figures/ch4/ch4-diffusio import ch4_action_vs_observation_distribution from './assets/image/figures/ch4/ch4-action-vs-observation-distribution.png'; import ch4_normalizing_flows from './assets/image/figures/ch4/ch4-normalizing-flows.png'; import ch4_diffusion_vs_flowmatching from './assets/image/figures/ch4/ch4-diffusion-vs-flowmatching.png'; -import ch4_act from './assets/image/figures/ch4/ch4-act.png'; import ch4_act_encoder from './assets/image/figures/ch4/ch4-act-encoder.png'; import ch4_act_decoder from 
'./assets/image/figures/ch4/ch4-act-decoder.png'; +import ch4_act from './assets/image/figures/ch4/ch4-act.png'; import ch4_diffusion_policy from './assets/image/figures/ch4/ch4-diffusion-policy.png'; import ch4_async_inference from './assets/image/figures/ch4/ch4-async-inference.png'; import ch4_queues from './assets/image/figures/ch4/ch4-queues.png'; @@ -64,7 +65,7 @@ import ch5_smolvla from './assets/image/figures/ch5/ch5-smolvla.png'; ## Foreword -Robotics is an inherently multidisciplinary field, and is not witnessing unprecedented advancements since its inception in the 1960s. Yet, more than sixty years after the debut of Unimate, robots have still not fully integrated into the rich, unstructured, and dynamic world we humans inhabit. Over the decades, numerous disciplines have shown immense promise in tackling the challenges of creating autonomous systems. This tutorial takes a clear stance in the debate on whether modern Machine Learning can play a pivotal role in the development of autonomous robot systems: we believe this to be the case. +Robotics is an inherently multidisciplinary field, which is witnessing unprecedented advancements since its inception in the 1960s. Yet, more than sixty years after the debut of Unimate, robots have still not fully integrated into the rich, unstructured, and dynamic world we humans inhabit. Over the decades, numerous disciplines have shown immense promise in tackling the challenges of creating autonomous robotic systems. This tutorial takes a clear stance in the debate on whether modern Machine Learning can play a pivotal role in the development of autonomous robots: we believe this to be the case. Nonetheless, we also hold that the wealth of research from both academia and industry in classical robotics over the past six decades is, simply put, too valuable to be cast aside in favor of purely learning-based methods. 
However, the interplay between classical robotics and modern machine learning is still in its nascent stages, and the path to integration yet to be clearly defined. In turn our goal here is to present what we consider to be the most relevant approaches within robot learning today, while warmly extending an invite to collaborate to expand the breadth of this work! Start contributing today [here](https://github.com/fracapuano/robot-learning-tutorial). @@ -108,7 +109,7 @@ This tutorial serves the double purpose of providing useful references for the S - Section [learning-rl] elaborates on the limitations of dynamics-based methods, and introduce RL as a practical approach to solve robotics problems, considering its upsides and potential limitations. -- Section [robot-imitation-learning] further describes robot learning techniques that aim at solving single-tasks learning, leveraging BC techniques to autonomously reproduce specific expert demonstrations. +- Section [learning-imitation] further describes robot learning techniques that aim at solving single-tasks learning, leveraging BC techniques to autonomously reproduce specific expert demonstrations. - Section [learning-foundation] presents recent contributions on developing generalist models for robotics applications, by learning from large corpora of multi-task  multi-robot data (*robotics foundation models*). @@ -116,7 +117,7 @@ Our goal with this tutorial is to provide an intuitive explanation of the reason ### `LeRobotDataset` -`LeRobotDataset` is a standardized dataset format designed to address the specific needs of robot learning research, and it provides a unified and convenient access to robotics data across modalities, including sensorimotor readings, multiple camera feeds and teleoperation status. 
`LeRobotDataset` also accommodates for storing general information regarding the data being collected, including textual descriptions of the task being performed by the teleoperator, the kind of robot used, and relevant measurement specifics like the frames per second at which the recording of both image and robot state’s streams are proceeding. +`LeRobotDataset` is one of the most impactful features of `lerobot`, developed in keeping with the observation that robotics data is increasingly central in robot learning. Thus, `lerobot` defines a standardized dataset format designed to address the specific needs of robot learning research, providing a unified and convenient access to robotics data across modalities, including sensorimotor readings, multiple camera feeds and teleoperation status. `LeRobotDataset` also accommodates for storing general information regarding the data being collected, including textual descriptions of the task being performed by the teleoperator, the kind of robot used, and relevant measurement specifics like the frames per second at which the recording of both image and robot state’s streams are proceeding. In this, `LeRobotDataset` provides a unified interface for handling multi-modal, time-series data, and it is designed to seamlessly integrate with the PyTorch and Hugging Face ecosystems. `LeRobotDataset` can be easily extended by users and it is highly customizable by users, and it already supports openly available data coming from a variety of embodiments supported in `lerobot`, ranging from manipulator platforms like the SO-100 arm and ALOHA-2 setup, to real-world humanoid arm and hands, as well as entirely simulation-based datasets, and self-driving cars. This dataset format is built to be both efficient for training and flexible enough to accommodate the diverse data types encountered in robotics, while promoting reproducibility and ease of use for users. 
@@ -156,7 +157,71 @@ Conveniently, by using `LeRobotDataset` with a Pytorch `DataLoader` one can aut
-Batching a (Streaming) Dataset +Batching a (Streaming) Dataset +[https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch1/01_datasets.py](https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch1/01_datasets.py) + +```python +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset + +delta_timestamps = { + "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame +} + +# Optionally, use StreamingLeRobotDataset to avoid downloading the dataset +dataset = LeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Streams frames from the Hugging Face Hub without loading into memory +streaming_dataset = StreamingLeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Get the 100th frame in the dataset by +sample = dataset[100] +print(sample) +# { +# 'observation.state': tensor([...]), +# 'action': tensor([...]), +# 'observation.images.wrist_camera': tensor([3, C, H, W]), for delta timesteps +# ... +# } + +batch_size=16 +# wrap the dataset in a DataLoader to use process it batches for training purposes +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size +) + +# Iterate over the DataLoader in a training loop +num_epochs = 1 +device = "cuda" if torch.cuda.is_available() else "cpu" + +for epoch in range(num_epochs): + for batch in data_loader: + # Move data to the appropriate device (e.g., GPU) + observations = batch["observation.state"].to(device) + actions = batch["action"].to(device) + images = batch["observation.images.wrist_camera"].to(device) + + # Next, you can do amazing_model.forward(batch) + ... +``` + +
+ +### Code Example: Collecting Data + + +
+ +Record a Dataset +[https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch1/02_record_data.py](https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch1/02_record_data.py) ```python import torch @@ -418,7 +483,7 @@ Moreover, classical planners operate on compact, assumed-sufficient state repres Setting aside integration and scalability challenges: developing accurate modeling of contact, friction, and compliance for complicated systems remains difficult. Rigid-body approximations are often insufficient in the presence of deformable objects, and relying on approximated models hinders real-world applicability of the methods developed. In the case of complex, time-dependent and/or non-linear dynamics, even moderate mismatches in parameters, unmodeled evolutions, or grasp-induced couplings can qualitatively affect the observed dynamics. -Lastly, dynamics-based methods (naturally) overlook the rather recent increase in availability of openly-available robotics datasets. The curation of academic datasets by large centralized groups of human experts in robotics @collaborationOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 is now increasingly complemented by a growing number of robotics datasets contributed in a decentralized fashion by individuals with varied expertise. If not tangentially, dynamics-based approaches are not posed to maximally benefit from this trend, which holds the premise of allowing generalization in the space of tasks and embodiments, like data was the cornerstone for advancements in vision @alayracFlamingoVisualLanguage2022 and natural-language understanding @brownLanguageModelsAre2020. +Lastly, dynamics-based methods (naturally) overlook the rather recent increase in availability of openly-available robotics datasets. 
The curation of academic datasets by large centralized groups of human experts in robotics @oneillOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 is now increasingly complemented by a growing number of robotics datasets contributed in a decentralized fashion by individuals with varied expertise. If not tangentially, dynamics-based approaches are not posed to maximally benefit from this trend, which holds the premise of allowing generalization in the space of tasks and embodiments, like data was the cornerstone for advancements in vision @alayracFlamingoVisualLanguage2022 and natural-language understanding @brownLanguageModelsAre2020. Taken together, these limitations (Figure [classical-limitations]) motivate the exploration of learning-based approaches that can (1) integrate perception and control more tightly, (2) adapt across tasks and embodiments with reduced expert modeling interventions and (3) scale gracefully in performance as more robotics data becomes available. @@ -434,7 +499,7 @@ Richard Sutton
-TL;DR The need for expensive high-fidelity simulators can be obviated by learning from real-world data, using sample-efficient algorithms that can safely train directly on hardware. +TL;DR The need for expensive, high-fidelity simulators can be obviated by learning from real-world data, using sample-efficient algorithms that can safely train directly on hardware.
@@ -449,25 +514,24 @@ TL;DR The need for expensive high-fidelity simulators can be obviated by learnin
Learning-based robotics streamlines perception-to-action by learning a (1) unified high-level controller capable to take (2) high-dimensional, unstructured sensorimotor information. Learning (3) does not require a dynamics model and instead focuses on interaction data, and (4) empirically correlates with the scale of the data used.
-Learning-based techniques for robotics naturally address the limitations presented in [classical] (Figure [robot-learning-upsides]). Learning-based techniques typically rely on prediction-to-action (*visuomotor policies*), thereby directly mapping sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components. Mapping sensorimotor inputs to actions directly also allows to add diverse input modalities, leveraging the automatic feature extraction characteristic of most modern learning systems. Further, learning-based approaches can in principle entirely bypass modeling efforts and instead rely exclusively on interactions data, proving transformative when dynamics are challenging to model or even entirely unknown. Lastly, learning for robotics (*robot learning*) is naturally well posed to leverage the growing amount of robotics data openly available, just as computer vision first and natural language processing later did historically benefit from large scale corpora of (possibly non curated) data, in great part overlooked by dynamics-based approaches. +Learning-based techniques for robotics naturally address the limitations presented in Section [classical] (Figure [robot-learning-upsides]). In particular, learning-based techniques typically rely on monolithic prediction-to-action pipelines (*visuomotor policies*) which directly map sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components. Mapping sensory inputs to actions also makes it possible to incorporate diverse input modalities, leveraging the automatic feature extraction capabilities of modern learning systems. Moreover, learning-based approaches can, in principle, bypass explicit modeling altogether and instead rely solely on interaction data--an advantage that proves transformative when dynamics are difficult to model or entirely unknown. 
Lastly, learning for robotics (*robot learning*) is naturally well posed to leverage the growing amount of robotics data openly available, just as computer vision and natural language processing did historically benefit from large-scale corpora of data, in great part overlooked by dynamics-based approaches. -Being a field at its relative nascent stages, no prevalent technique(s) proved distinctly better better in robot learning. Still, two major classes of methods gained prominence- reinforcement learning (RL) and Behavioral Cloning (BC) (Figure [robot-learning-atlas]). In this section, we provide a conceptual overview of applications of the former to robotics, as well as introduce practical examples of how to use RL within `lerobot`. We then introduce the major limitations RL suffers from, to introduce BC techniques in the next sections ([learning-bc-single-sec-learning-bc-generalist]). +Being a field at its relatively nascent stage, no prevalent technique(s) proves distinctly better than any other in the domain of robot learning. Still, two major classes of methods gained prominence: Reinforcement Learning (RL) and Behavioral Cloning (BC) (Figure [robot-learning-atlas]). In this section, we provide a conceptual overview of applications of RL to robotics, as well as introduce practical examples of how to use RL within `lerobot`. We then introduce the major limitations RL suffers from, to introduce BC techniques in Section [learning-imitation] and Section [learning-foundation]. -
- + +r0.3 - -
Overview of the robot learning methods implemented in lerobot.
-
-In Figure [robot-learning-atlas] we decided to include generalist robot models @blackp0VisionLanguageActionFlow2024, @shukorSmolVLAVisionLanguageActionModel2025 alongside task-specific BC methods. While significant different in spirit--*generalist* models are language-conditioned and use instructions to generate motion valid across many tasks, while *task-specific* models are typically not language-conditioned and used to perform a single task--foundation models are largely trained to reproduce trajectories contained in a large training set of input demonstrations. Thus, we argue generalist policies can indeed be grouped alongside other task-specific BC methods, as they both leverage similar training data and schemas. + + -Figure [robot-learning-atlas] illustrates this categorization graphically, explicitly listing all the robot learning policies currently available in `lerobot`- Action Chunking with Transformers (ACT) @zhaoLearningFineGrainedBimanual2023, Diffusion Policy @chiDiffusionPolicyVisuomotor2024, Vector-Quantized Behavior Transformer (VQ-BeT) @leeBehaviorGenerationLatent2024, $\pi_0$ @blackp0VisionLanguageActionFlow2024, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025, Human-in-the-loop Sample-efficient RL (HIL-SERL) @luoPreciseDexterousRobotic2024 and TD-MPC @hansenTemporalDifferenceLearning2022. +In Figure [robot-learning-atlas] we deliberately include generalist robot models @blackp0VisionLanguageActionFlow2024, @shukorSmolVLAVisionLanguageActionModel2025 alongside task-specific BC methods. While significantly different in spirit--*generalist* models are language-conditioned and use instructions to generate motion valid across many tasks, while *task-specific* models are typically not language-conditioned and used to perform a single task--*foundation* models are still largely trained to reproduce trajectories contained in a (large) training set of input demonstrations. 
Thus, we argue generalist policies can indeed be grouped alongside other task-specific BC methods, as they both leverage similar training data and schemas. Figure [robot-learning-atlas] illustrates this categorization graphically, explicitly listing all the robot learning policies currently available in `lerobot`: Action Chunking with Transformers (ACT) @zhaoLearningFineGrainedBimanual2023, Diffusion Policy @chiDiffusionPolicyVisuomotor2024, Vector-Quantized Behavior Transformer (VQ-BeT) @leeBehaviorGenerationLatent2024, $\pi_0$ @blackp0VisionLanguageActionFlow2024, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025, Human-in-the-loop Sample-efficient RL (HIL-SERL) @luoPreciseDexterousRobotic2024 and TD-MPC @hansenTemporalDifferenceLearning2022.
Examples of two different robotics tasks performed using RL. In the manipulation task (A) an agent learns to reach for a yellow plastic block in its environment, and to put it inside of a box. In the locomotion task (B) an agent learns to move its center of mass sideways without falling.
-Applications of RL to robotics have been long studied, to the point the relationship between these two disciplines has been compared to that between physics and matematics @koberReinforcementLearningRobotics. Indeed, due to their interactive and sequential nature, many robotics problems can be directly mapped to RL problems. Figure [robotics-with-rl-examples] depicts two of such cases. Reaching for an object to move somewhere else in the scene is an indeed sequential problem where at each cycle the controller needs to adjust the position of the robotic arm based on their current configuration and the (possibly varying) position of the object. Figure [robotics-with-rl-examples] also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation. While sliding to the side, the controller has to constantly keep adjusting to the robot’s propioperception to avoid failure (falling). +Applications of RL to robotics have been studied long enough that the relationship between these two disciplines has been compared to that of physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their inherently interactive and sequential nature, robotics control problems can be directly cast as RL problems. Figure [robotics-with-rl-examples] presents two of such cases. Reaching for an object to then move it somewhere else in the scene is a sequential problem where over time the controller needs to adjust the position of the robot arm based on the current configuration and the (possibly varying) position of the object. Figure [robotics-with-rl-examples] also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation: while sliding to the side, the controller needs to keep adjusting to the robot’s proprioception to avoid failure (falling). 
### A (Concise) Introduction to RL -The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to model robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) learning how to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents can improve via trial-and-error only, thus entirely bypassing the need to develop explicit models of the problem dynamics, and rather exploiting interaction data only. In RL, this feedback loop (Figure [rl-most-famous-pic]) between actions and outcomes is established through the agent sensing a scalar quantity (*reward*). +The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to tackle robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) capable of *continuously behaving* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents improve through trial and error, bypassing explicit models of the problem dynamics in favor of interaction data. In RL, this feedback loop between actions and outcomes (Figure [rl-most-famous-pic]) is established through the agent sensing a scalar quantity (*reward*) measuring how desirable a given *transition* is for the accomplishment of its goal.
Agent-Environment interaction diagram (image credits to @suttonReinforcementLearningIntroduction2018).
-Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP) @bellmanMarkovianDecisionProcess1957. Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through MDP’s inherently stochastic formulation and (2) providing a theoretically sound framework for learning *without* an explicit dynamic model. While accommodating also a continuous time formulation, MDPs are typically considered in discrete time in RL, thus assuming interactions to atomically take place over the course of discrete *timestep* $t=0,1,2,3, \dots, T$. MDPs allowing for an unbounded number of interactions ( $T \to + \infty$ ) are typically termed *infinite-horizon*, and opposed to *finite-horizon* MDPs in which $T$ cannot grow unbounded. Unless diversely specified, we will only be referring to discrete-time finite-horizon (*episodic*) MDPs here. +Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP) @bellmanMarkovianDecisionProcess1957. Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through MDP’s inherently stochastic formulation and (2) providing a theoretically-sound framework for learning *without* an explicit model of the environment dynamics. While accommodating a continuous time formulation too, MDPs are typically considered in discrete time in RL, assuming interactions to atomically take place at discrete *timestep* $t=0,1,2,3, \dots, T$. MDPs allowing for an unbounded number of interactions ($T \to + \infty$) are termed *infinite-horizon*, and opposed to *finite-horizon* MDPs in which $T$ is finite. Unless diversely specified, we will only be referring to discrete-time finite-horizon (*episodic*) MDPs. 
Formally, a lenght-$T$ Markov Decision Process (MDP) is a tuple $\mathcal M = \langle \mathcal S, \mathcal A, \mathcal D, r, \gamma, \rho, T \rangle$, where: -- $\mathcal S$ is the *state space*; $s_t\in \mathcal S$ denotes the (possibly non-directly observable) environment state at time $t$. In robotics, states often comprise robot configuration and velocities ($q_t, \dot q_t$), and can accomodate sensor readings such as camera or audio streams. +- $\mathcal S$ is the *state space*; $s_t\in \mathcal S$ denotes the (possibly non-directly observable) environment state at time $t$. In robotics, states often comprise robot configuration and velocities ($q_t, \dot q_t$), and can also accomodate sensor readings such as camera or audio streams. -- $\mathcal A$ is the *action space*; $a_t\in \mathcal A$ may represent joint torques, joint velocities, or even end-effector commands. In general, actions correspond to commands intervenings on the configuration of the robot. +- $\mathcal A$ is the *action space*; $a_t\in \mathcal A$ may represent joint torques, joint velocities, or even end-effector commands at timestep $t$. In general, actions correspond to commands intervenings on the configuration of the robot. -- $\mathcal D$ represents the (possibly non-deterministic) environment dynamics, with $\mathcal D: \mathcal S\times \mathcal A\times \mathcal S\mapsto [0, 1]$ corresponding to $\mathcal D\, (s_t, a_t, s_{t+1})= \mathbb P (s_{t+1}\vert s_t, a_t)$. For instance, for a planar manipulator dynamics could be considered deterministic when the environment is fully described (Figure [planar-manipulation-simple]), and stochastic when unmodeled disturbances depending on non-observable parameters intervene (Figure [planar-manipulator-box-velocity]). +- $\mathcal D$ represents the (possibly non-deterministic) environment dynamics, with $\mathcal D: \mathcal S\times \mathcal A\times \mathcal S\mapsto [0, 1]$, $\mathcal D\, (s_t, a_t, s_{t+1})= \mathbb P (s_{t+1}\vert s_t, a_t)$. 
For instance, for a planar manipulator dynamics could be considered deterministic when the environment is fully described (Figure [planar-manipulation-simple]), and stochastic when unmodeled disturbances depending on non-observable parameters intervene (Figure [planar-manipulator-box-velocity]). -- $r- \mathcal S\times \mathcal A\times \mathcal S\to \mathbb R$ is the *reward function*, weighing the transition $(s_t, a_t, s_{t+1})$ in the context of the achievement of an arbitrary goal. For instance, a simple reward function for quickly moving the along the $x$ axis in 3D-space (Figure [robotics-with-rl-examples]) could be based on the absolute position of the robot along the $x$ axis ($p_x$), present negative penalties for falling over (measured from $p_z$) and a introduce bonuses $\dot p_x$ for speed, $r (s_t, a_t, s_{t+1})\equiv r(s_t) = p_{x_t} \cdot \dot p_{x_t} - \tfrac{1}{p_{z_t}}$. +- $r: \mathcal S\times \mathcal A\times \mathcal S\to \mathbb R$ is the *reward function*, weighing the transition $(s_t, a_t, s_{t+1})$ in the context of the achievement of an arbitrary goal. For instance, a simple reward function for quickly moving along the $x$ axis (Figure [robotics-with-rl-examples]) could be based on the absolute position of the robot along the $x$ axis ($p_{x_t}$), present negative penalties for falling over (measured from $p_{z_t}$) and introduce bonuses $\dot p_{x_t}$ for speed, $r (s_t, a_t, s_{t+1})\equiv r(s_t) = p_{x_t} \cdot \dot p_{x_t} - \tfrac{1}{p_{z_t}}$. -Lastly, $\gamma \in [0,1]$ represent the discount factor regulating preference for immediate versus long-term reward (with an effective horizon equal to $\tfrac{1}{1-\gamma}$), and $\rho$ is the distribution, defined over $\mathcal S$, the MDP’s *initial* state is sampled from, $s_0 \sim \rho$. 
+Lastly, $\gamma \in [0,1)$ represent the discount factor regulating preference for immediate versus long-term reward (with an effective horizon equal to $\tfrac{1}{1-\gamma}$), and $\rho$ is the distribution over $\mathcal S$ for the MDP’s *initial*, $s_0 \sim \rho$. -A length-$T$ *trajectory* is the (random) sequence +Therefore, a length-$T$ *trajectory* is the (random) sequence ``` math \htmlId{trajectory_definition}{\tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),} ``` -with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notation.Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*: +with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notation. Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--i.e., the *policy*--to be *Markovian*: ``` math \begin{align} \mathbb P(s_{t+1}\vert s_t, a_t, s_{t-1}, a_{t-1}, \dots s_0, a_0 ) &= \mathbb P (s_{t+1}\vert s_t, a_t) \\ -\mathbb P(a_t\vert s_t, a_{t-1}, s_{t-1}, s_0, a_0) &= \mathbb P(a_t\vert s_t) +\mathbb P(a_t\vert s_t, a_{t-1}, s_{t-1}, s_0, a_0) &= \mathbb P(a_t\vert s_t), \end{align} ``` -The probability of observing a given trajectory $\tau$ factorizes into +the probability of observing a given trajectory $\tau$ factorizes into: ``` math \htmlId{traj_prob}{\mathbb P(\tau) = \mathbb P (s_0) \prod_{t=0}^{T-1} \mathbb P (s_{t+1}\vert s_t, a_t)\ \mathbb P(a_t\vert s_t).} ``` -Policies $\mathbb P(a_t\vert s_t)$ are typically indicated as $\pi(a_t\vert s_t)$, and often parametrized via $\theta$, yielding $\pi_\theta (a_t\vert s_t)$. Policies are trained optimizing the (discounted) *return* associated to a given $\tau$, i.e. 
the (random) sum of measured rewards over trajectory: +Policies $\mathbb P(a_t\vert s_t)$ are typically indicated as $\pi(a_t\vert s_t)$, often parametrized via $\theta$, yielding $\pi_\theta (a_t\vert s_t)$, and are trained by optimizing the (discounted) *return* associated to a given $\tau$, i.e. the (random) sum of measured rewards over an arbitrary trajectory, ``` math G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t. ``` @@ -549,27 +613,27 @@ In that, agents seek to learn control strategies (*policies*, $\pi_\theta$) maxi \end{align} ``` -Because in the RL framework the agent is assumed to only be able to observe the environment dynamics and not to intervene on them, [RL-j-function] varies exclusively with the policy followed. In turn, MDPs naturally provide a framework to optimize over the space of the possible behaviors an agent might enact ($\pi \in \Pi$), searching for the *optimal policy* $\pi^* = \arg \max_{\theta} J(\pi_\theta)$, where $\theta$ is the parametrization adopted by the policy set $\Pi: \pi_\theta \in \Pi, \ \forall \theta$. Other than providing a target for policy search, $G(\tau)$ can also be used as a target to discriminate between states and state-action pairs. +Crucially, in the RL framework the agent is assumed to only *observe* the environment dynamics and not to intervene on them, and thus eq. [RL-j-function] varies exclusively with the policy followed. In turn, MDPs naturally provide a framework to optimize over the space of the possible behaviors an agent might enact ($\pi \in \Pi$), searching for the *optimal policy* $\pi^* = \arg \max_{\theta} J(\pi_\theta)$, where $\theta$ is the parametrization adopted by the policy set $\Pi: \pi_\theta \in \Pi, \ \forall \theta$. Besides providing a target for policy search, $G(\tau)$ can also be used to discriminate between states $s_t$ and $s_t, a_t$ pairs. 
Given any state $s \in \mathcal S$--e.g., given a configuration $q$ of a robot--the *state-value* function ``` math V_\pi(s) = \mathbb E_{\tau \sim \pi} [G(\tau) \big \vert s_0 = s] ``` -can be used to discriminate between desirable and undesirable state in terms of long-term (discounted) reward maximization, under a given policy $\pi$. Similarily, the *state-action* value function also conditions the cumulative discounted reward on selecting action $a$ when in $s$, and thereafter act according to $\pi$: +can be used to discriminate between desirable and undesirable state in terms of long-term (discounted) reward maximization, under a given policy $\pi$. Similarily, the *state-action* value function also conditions the cumulative discounted reward on selecting action $a$ when in $s$, and thereafter act according to $\pi$, ``` math -Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a] +Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]. ``` -Crucially, value functions are interrelated: +Importantly, value functions are interrelated: ``` math \begin{align} Q_\pi(s_t, a_t) &= \mathbb{E}_{s_{t+1}\sim \mathbb P(\bullet \vert s_t, a_t)} [r_t + \gamma V_\pi(s_{t+1})] \\ -V_\pi(s_t) &= \mathbb E_{a_t\sim \pi(\bullet \vert s_t)} [Q_\pi (s_t, a_t)] +V_\pi(s_t) &= \mathbb E_{a_t\sim \pi(\bullet \vert s_t)} [Q_\pi (s_t, a_t)], \end{align} ``` -Inducing an ordering over states and state-action pairs under $\pi$, value functions are central to most RL algorithms. A variety of methods have been developed in RL as standalone attemps to find (approximate) solutions to the problem of maximizing cumulative reward (Figure [rl-algos-atlas]). +inducing an ordering over states and state-action pairs under $\pi$, and value functions are thus central to most RL algorithms. 
A variety of algorithms have been developed in RL attempting to find (approximate) solutions to the problem of maximizing cumulative reward (we report some in Figure [rl-algos-atlas]).
Popular RL algorithms. See @SpinningUp2018 for a complete list of citations.
-Popular approaches to continuous state and action space--such as those studied within robotics--include @schulmanTrustRegionPolicy2017, @schulmanProximalPolicyOptimization2017, @haarnojaSoftActorCriticOffPolicy2018. Across manipulation @akkayaSolvingRubiksCube2019 and locomotion @leeLearningQuadrupedalLocomotion2020 problems, RL proved extremely effective in providing a platform to (1) adopt a unified, streamlined perception-to-action pipeline, (2) natively integrate propioperception with multi-modal high-dimensional sensor streams (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2024. +Popular approaches to continuous state and action space--such as those studied within robotics--include @schulmanTrustRegionPolicy2017, @schulmanProximalPolicyOptimization2017 and @haarnojaSoftActorCriticOffPolicy2018. Across manipulation @akkayaSolvingRubiksCube2019 and locomotion problems @leeLearningQuadrupedalLocomotion2020, RL proved extremely effective in providing a platform to (1) leverage a unified, streamlined perception-to-action pipeline, (2) natively integrate proprioception with multi-modal high-dimensional sensory streams, (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2025. 
However, particularly in the context of real-world robotics, RL still suffers from limitations concerning machine safety and learning efficiency. - -First, especially early in training, actions are typically explorative, and thus erractic. On physical systems, untrained policies may command high velocities, self-collisiding configurations, or torques exceeding joint limits, leading to wear and potential hardware damage. Mitigating these risks requires external safeguards (e.g., watchdogs, safety monitors, emergency stops), often incuring in a high degree of human supervision. Further, in the typical episodic setting considered in most robotics problems, experimentation is substantially slowed down by the need to manually reset the environment over the course of training, a time-consuming and brittle process. +Streamlined end-to-end control pipelines, data-driven feature extraction and a disregard for explicit modeling in favor of interaction data are all features of RL for robotics. However, RL still suffers from limitations concerning safety and learning efficiency, particularly pressing for real-world robotics applications. -Second, learning with a limited number of samples remains problematic in RL, limiting the applicability of RL in real-world robotics due to consequently prohibitive timescales of training. Even strong algorithms such as SAC @haarnojaSoftActorCriticOffPolicy2018 typically require a large numbers of transitions $\{ (s_t, a_t, r_t, s_{t+1})\}_{t=1}^N$. On hardware, generating these data is time-consuming and can even be prohibitive. +First, especially early in training, actions are typically explorative, and thus may be erratic. On physical systems, untrained policies may command high velocities, self-colliding configurations, or torques exceeding joint limits, leading to wear and potential hardware damage. 
Mitigating these risks requires external safeguards (e.g., watchdogs, safety monitors, emergency stops), often incurring in a high degree of human supervision. Further, in the typical episodic setting considered in most robotics problems, experimentation is substantially slowed down by the need to manually reset the environment over the course of training, a time-consuming and error-prone process. Second, learning efficiently remains problematic in RL, limiting the applicability of RL in real-world robotics due to consequently prohibitive timescales of training. Even strong algorithms such as SAC @haarnojaSoftActorCriticOffPolicy2018 typically require a large number of transitions $\{ (s_t, a_t, r_t, s_{t+1})\}_{t=1}^N$. On real-world hardware, generating this data is time-consuming.
Simulated (left) vs. real-world (right) OpenDuck. Discrepancies in the simulation dynamics (reality gap) pose risks to policy transfer.
-Training RL policies in simulation @tobinDomainRandomizationTransferring2017 addresses both issues: it eliminates physical risk and dramatically increases throughput. Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) limiting transferring policies learned in simulation due the discrepancy between real and simulated environments (*reality gap*, Figure [synthetic-vs-real-duck]). *Domain randomization* (DR) is a popular technique to overcome the reality gap, consisting in randomizing parameters of the simulated environment during training, to induce robustness to specific disturbances. In turn, DR is employed to increase the diversity of scenarios over the course of training, improving on the chances sim-to-real transfer @akkayaSolvingRubiksCube2019, @antonovaReinforcementLearningPivoting2017, @jiDribbleBotDynamicLegged2023. In practice, DR is performed further parametrizing the *simulator*’s dynamics $\mathcal D \equiv \mathcal D_\xi$ with a *dynamics* (random) vector $\xi$ drawn an arbitrary distribution, $\xi \sim \Xi$. Over the course of training--typically at each episode’s reset--a new $\xi$ is drawn, and used to specify the environment’s dynamics for that episode. For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure [ducks-on-terrains]), or the center of mass of an object for a manipulation task. +Training RL policies in simulation @tobinDomainRandomizationTransferring2017 addresses both issues, eliminating physical risk and dramatically increasing throughput. Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) 
limiting the possibilities to transfer the policies learned in simulation, due to the discrepancy between real and simulated environments (*reality gap*, Figure [synthetic-vs-real-duck]). *Domain randomization* @tobinDomainRandomizationTransferring2017 (DR) is a popular technique to overcome the reality gap, and consists in randomizing the parameters of the simulated environment during training, aiming at inducing robustness to specific disturbances. In this, DR is typically employed to increase the diversity of scenarios over the course of training, improving on the performance of sim-to-real transferred policies @akkayaSolvingRubiksCube2019, @antonovaReinforcementLearningPivoting2017, @jiDribbleBotDynamicLegged2023. In practice, DR is performed training in simulation on simulated dynamics $\mathcal D$, further parametrized as $\mathcal D \equiv \mathcal D_\xi$, with a *dynamics* (random) vector $\xi$ drawn from an arbitrary distribution, $\xi \sim \Xi$. For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure [ducks-on-terrains]), or the center of mass of an object for a manipulation task. Over the course of training--typically at each episode’s reset--a new $\xi$ is drawn, and used to specify the environment’s dynamics for that episode.
The same locomotion task can be carried out in different (simulated) domains (exemplified by the difference in terrains) at training time, resulting to increased robustness over diverse environment dynamics.
-While effective in transfering policies across the reality gap in real-world robotics @tobinDomainRandomizationTransferring2017, @akkayaSolvingRubiksCube2019, @jiDribbleBotDynamicLegged2023, @tiboniDomainRandomizationEntropy2024, DR often requires extensive manual engineering. First, identifying which parameters to randomize--i.e., the *support* $\text{supp} (\Xi)$ of $\Xi$--is an inherently task specific process. When locomoting over different terrains, choosing to randomize the friction coefficient is a reasonable choice, yet not completely resolutive as other factors (lightning conditions, external temperature, joints’ fatigue, etc.) may prove just as important, making selecting these parameters yet another source of brittlness. +While effective in transferring policies across the reality gap in real-world robotics @tobinDomainRandomizationTransferring2017, @akkayaSolvingRubiksCube2019, @jiDribbleBotDynamicLegged2023, @tiboniDomainRandomizationEntropy2024, DR often requires extensive manual engineering. First, identifying which parameters to randomize--i.e., the *support* $\text{supp} (\Xi)$ of $\Xi$--is an inherently task-specific process. When locomoting over different terrains, choosing to randomize the friction coefficient is a reasonable choice, yet not completely resolutive as other factors (lighting conditions, external temperature, joints’ fatigue, etc.) may prove just as important in practice, making selecting these parameters yet another source of brittleness. -Selecting the dynamics distribution $\Xi$ is also non-trivial. On the one hand, distributions with low entropy might risk to cause failure at transfer time, due to the limited robustness induced over the course of training. On the other hand, excessive randomization may cause over-regularization and hinder performance. 
Consequently, the research community investigated approaches to automatically select the randomization distribution $\Xi$, using signals from the training process or tuning it to reproduce observed real-world trajectories.  @akkayaSolvingRubiksCube2019 use a parametric uniform distribution $\mathcal U(a, b)$ as $\Xi$, widening the bounds as training progresses and the agent’s performance improves (AutoDR). While effective, AutoDR requires significant tuning--the bounds are widened by a fixed, pre-specified amount $\Delta$--and may disregard data when performance *does not* improve after a distribution update @tiboniDomainRandomizationEntropy2024.  @tiboniDomainRandomizationEntropy2024 propose a similar method to AutoDR (DORAEMON) to evolve $\Xi$ based on training signal, but with the key difference of explicitly maximizing the entropy of parametric Beta distributions, inherently more flexible than uniform distributions. DORAEMON proves particularly effective at dynamically increasing the entropy levels of the training distribution by employing a max-entropy objective, under performance constraints formulation. Other approaches to automatic DR consist in specifically tuning $\Xi$ to align as much as possible the simulation and real-world domains. For instance,  @chebotar2019closing interleave in-simulation policy training with repeated real-world policy rollouts used to adjust $\Xi$ based on real-world data, while  @tiboniDROPOSimtoRealTransfer2023 leverage a single, pre-collected set of real-world trajectories and tune $\Xi$ under a simple likelihood objective. +Selecting the dynamics distribution $\Xi$ is also non-trivial. On the one hand, distributions with low entropy might risk to cause failure at transfer time, due to the limited robustness induced over the course of training. On the other hand, excessive randomization may cause over-regularization and hinder performance @margolisRapidLocomotionReinforcement2022. 
Consequently, the research community investigated approaches to automatically select the randomization distribution $\Xi$, using signals from the training process or tuning it to reproduce observed real-world trajectories. @akkayaSolvingRubiksCube2019 use a parametric uniform distribution $\mathcal U(a, b)$ as $\Xi$, widening the bounds $a, b$ as training progresses and the agent’s performance improves (AutoDR). While effective, AutoDR requires significant tuning--the bounds are widened by a fixed, pre-specified amount $\Delta$--and may disregard data when performance *does not* improve after a distribution update @tiboniDomainRandomizationEntropy2024. @tiboniDomainRandomizationEntropy2024 propose a similar method to AutoDR (DORAEMON) to evolve $\Xi$ based on the training signal, but with the key difference of explicitly maximizing the entropy of a parametric Beta distribution--inherently more flexible than uniform distributions--with learned updates instead of fixed $\Delta$. In this, DORAEMON proves particularly effective at dynamically increasing the entropy levels of the training distribution by employing an outer-loop max-entropy objective, tackled under performance constraints in the inner-loop RL problem. Other approaches to automatically perform DR consist in specifically tuning $\Xi$ to align as much as possible the simulation and real-world domains. For instance, @chebotarClosingSimtorealLoop2019 interleave in-simulation policy training with repeated real-world policy rollouts used to adjust $\Xi$ based on real-world data, while @tiboniDROPOSimtoRealTransfer2023 leverage a single, pre-collected set of real-world trajectories and tune $\Xi$ under a simple likelihood objective. 
-While DR has shown promise, it does not address the main limitation that, even under the assumption that an ideal distribution $\Xi$ to sample from was indeed available, many robotics problems cannot be simulated with high-enough fidelity under practical computational constraints in the first place. Simulating contact-rich manipulation of possibly deformable or soft materials--i.e., *folding a piece of clothing*--can be costly and even time-intensive, limiting the benefits of in-simulation training. +While DR has shown promise, it does not address the main limitation that, even under the assumption that an ideal distribution $\Xi$ was available, many robotics problems cannot be simulated with high-enough fidelity under practical computational constraints. Simulating contact-rich manipulation of possibly deformable or soft materials--i.e., *folding a piece of clothing*--can prove time-intensive, limiting the benefits of in-simulation training. -A perhaps more foundamental limitation of RL for robotics is the general unavailability of complicated tasks’ *dense* reward function, the design of which is essentially based on human expertise and trial-and-error. In practice, *sparse* reward functions can be used to conclude whether one specific goal has been attained--*has this t-shirt been correctly folded?*--but unfortunately incur in more challenging learning. As a result, despite notable successes, deploying RL directly on real-world robots at scale remains challenging. +A perhaps more fundamental limitation of RL for robotics is the general unavailability of complicated tasks’ *dense* reward function, the design of which is essentially based on human expertise, ingenuity and trial-and-error. In practice, *sparse* reward functions can be used to conclude whether one specific goal has been attained--*has this t-shirt been correctly folded?*--but unfortunately incur in more challenging learning. 
As a result, despite notable successes, deploying RL directly on real-world robots at scale remains challenging. To make the most of (1) the growing number of openly available datasets and (2) relatively inexpensive robots like the SO-100, RL could (1) be anchored in already-collected trajectories--limiting erratic and dangerous exploration--and (2) train in the real-world directly--bypassing the aforementioned issues with low-fidelity simulations. In such a context, sample-efficient learning is also paramount, as training on the real-world is inherently time-bottlenecked. -Off-policy algorithms like Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 tend to be more sample efficient then their on-policy counterpart @schulmanProximalPolicyOptimization2017, due to the presence a *replay buffer* used over the course of the training. Other than allowing to re-use transitions $(s_t, a_t, r_t, s_{t+1})$ over the course of training, the replay buffer can also accomodate for the injection of previously-collected data in the training process @ballEfficientOnlineReinforcement2023. Using expert demonstrations to guide learning together with learned rewards, RL training can effectively be carried out in the real-world @luoSERLSoftwareSuite2025. Interestingly, when completed with in-training human interventions, real-world RL agents have been shown to learn policies with near-perfect success rates on challenging manipulation tasks in 1-2 hours @luoPreciseDexterousRobotic2024. +Off-policy algorithms like Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 tend to be more sample efficient than their on-policy counterpart @schulmanProximalPolicyOptimization2017, due to the presence of a *replay buffer* used over the course of training. Other than allowing to re-use past transitions $(s_t, a_t, r_t, s_{t+1})$, the replay buffer can also accommodate for the injection of previously-collected data in the training process @ballEfficientOnlineReinforcement2023. 
Using expert demonstrations to guide learning together with learned rewards, RL can be effectively carried out in the real-world @luoSERLSoftwareSuite2025. Interestingly, when complemented with in-training human interventions, real-world RL agents have been shown to learn policies with near-perfect success rates on challenging manipulation tasks in 1-2 hours @luoPreciseDexterousRobotic2024. ##### Sample-efficient RL -In an MDP, the optimal policy $\pi^*$ can be derived from its associated $Q$-function, $Q_{\pi^*}$, and in particular the optimal action(s) $\mu(s_t)$ can be selected maximizing the optimal $Q$-function over the action space, +In an MDP, the optimal policy $\pi^*$ can be derived from its associated $Q$-function, $Q^* \equiv Q_{\pi^*}$, and in particular the optimal action(s) $\mu(s_t)$ can be selected maximizing the optimal $Q$-function over the action space, ``` math -\mu(s_t) = \max_{a_t\in \mathcal A} Q_{\pi^*}(s_t, a_t). +\mu(s_t) = \max_{a_t\in \mathcal A} Q^*(s_t, a_t). ``` Interestingly, the $Q^*$-function satisfies a recursive relationship (*Bellman equation*) based on a very natural intuition [^2]: @@ -648,9 +710,9 @@ In turn, the optimal $Q$-function  is guaranteed to be self-consistent by defin ``` math Q_{i+1}(s_t, a_t) \leftarrow \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} [r_t + \gamma \max_{a_{t+1} \in \mathcal A} Q_i (s_{t+1}, a_{t+1}) \big\vert s_t, a_t], \quad i=0,1,2,\dots,K ``` -Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $Q_K \approx Q^*$ at each timestep. In fact, under certain assumptions on the MDP considered, $Q_K \to Q^* \, \text{as } K \to \infty$. +Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $Q_K \approx Q^*$ at each timestep. 
Indeed, one can show that under certain assumptions on the MDP considered, $Q_K \to Q^* \, \text{as } K \to \infty$. -Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning was found complicated to scale to large $\mathcal S\times \mathcal A$ problems, in which the storing of $Q : \mathcal S\times \mathcal A\mapsto \mathbb R$ alone might result prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MPDs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accomodating for large and even unstructured *state* spaces. +Effective in its early applications to small-scale discrete problems, vanilla Q-learning was found complicated to scale to large $\mathcal S\times \mathcal A$ problems, in which storing $Q : \mathcal S\times \mathcal A\mapsto \mathbb R$ alone might result prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating for large and even unstructured *state* spaces. 
DQN parametrizes the Q-function using a neural network with parameters $\theta$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $\delta_i$): @@ -663,27 +725,28 @@ Effective in its early applications to small-scale discrete problems and theoret y_i &= \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} \big[ r_t + \gamma \max_{a_t\in \mathcal A} Q_{\theta_{i-1}} (s_{t+1}, a_{t+1}) \big], \end{align} ``` -Where $\chi$ represents a behavior distribution over state-action pairs. Crucially, $\chi$ can in principle be different from the policy being followed, effectively allowing to reuse prior data stored in a *replay buffer* in the form of $(s_t, a_t, r_t, s_{t+1})$ transitions, used to form the TD-target $y_i$, TD-error $\delta_i$ and loss function [dqn-loss] via Monte-Carlo (MC) estimates. +where $\chi$ represents a behavior distribution over state-action pairs. Crucially, $\chi$ can in principle be different from the policy being followed, effectively allowing to reuse prior data stored in a *replay buffer* $D$ in the form of $(s_t, a_t, r_t, s_{t+1})$ transitions, used to form the TD-target $y_i$, TD-error $\delta_i$ and loss function eq. [dqn-loss] via Monte-Carlo (MC) estimates. -While effective in handling large, unstructured state spaces for discrete action-space problems, DQN application’s to continous control problems proved challenging. Indeed, in the case of high-capacity function approximators such as neural networks, solving $\max_{a_t \in \mathcal A} Q_\theta(s_t, a_t)$ at each timestep is simply unfeasible due to the (1) continous nature of the action space ($\mathcal A\subset \mathbb R^n$ for some $n$) and (2) impossibility to express the find a cheap (ideally, closed-form) solution to $Q_\theta$.  @silverDeterministicPolicyGradient2014 tackle this fundamental challenge by using a *deterministic* function of the state $s_t$ as policy, $\mu_\phi(s_t) = a_t$, parametrized by $\phi$. 
Thus, policies can be iteratively refined updating $\phi$ along the direction: +While effective in handling large, unstructured state spaces for discrete action-space problems, DQN’s application to continuous control problems proved challenging. Indeed, in the case of high-capacity function approximators such as neural networks, solving $\max_{a_t \in \mathcal A} Q_\theta(s_t, a_t)$ at each timestep is simply unfeasible due to the (1) continuous nature of the action space ($\mathcal A\subset \mathbb R^n$ for some $n$) and (2) impossibility to express the policy with a cheap (ideally, even closed-form) formulation, so that $\max Q_\theta$ could be solved analytically. @pmlr-v32-silver14 tackle these fundamental challenges by using a *deterministic* function of the state $s_t$ as policy, $\mu_\phi(s_t) = a_t$, parametrized by $\phi$. Thus, policies can be iteratively refined updating $\phi$ along the direction: ``` math \htmlId{deterministic-pg}{d_\phi = \mathbb E_{s_t \sim \mathbb P (\bullet)} [\nabla_\phi Q(s_t, a_t)\vert_{a_t = \mu_\phi(s_t)}] = \mathbb E_{s_t \sim \mathbb P(\bullet)} [\nabla_{a_t} Q(s_t, a_t) \vert_{a_t = \mu_\phi(s_t)} \cdot \nabla_\phi \mu(s_t)]} ``` -Provably, [deterministic-pg] is the *deterministic policy gradient* (DPG) of the policy $\mu_\phi$ @silverDeterministicPolicyGradient2014, so that updates $\phi_{k+1}\leftarrow \phi_k + \alpha d_\phi$ are guaranteed to increase the (deterministic) cumulative discounted reward, $J(\mu_\phi)$.  @lillicrapContinuousControlDeep2019 extended DPG to the case of (1) high-dimensional unstructured observations and (2) continuous action spaces, introducing Deep Deterministic Policy Gradient (DDPG), an important algorithm RL and its applications to robotics. DDPG adopts a modified TD-target compared to the one defined in [TD-target], by maintaining a policy network used to select actions, yielding +Provably, eq. 
[deterministic-pg] is the *deterministic policy gradient* (DPG) of the policy $\mu_\phi$ @pmlr-v32-silver14, so that updates $\phi_{k+1}\leftarrow \phi_k + \alpha d_\phi$ are guaranteed to increase the (deterministic) cumulative discounted reward, $J(\mu_\phi)$.  @lillicrapContinuousControlDeep2019a extended DPG to the case of (1) high-dimensional unstructured observations and (2) continuous action spaces, introducing Deep Deterministic Policy Gradient (DDPG), an important algorithm in RL and its applications to robotics. DDPG adopts a modified TD-target compared to eq. [TD-target], by maintaining a policy network used to select actions, yielding ``` math \htmlId{TD-target-ddpg}{y_i = \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} \big[ r_t + \gamma Q_{\theta_{i-1}} (s_{t+1}, \mu_\phi(s_{t+1})) \big] .} ``` -Similarily to DQN, DDPG also employs the same replay buffer mechanism, to reuse past transitions over training for increased sample efficiency and estimate the loss function via MC-estimates. +Similarily to DQN, DDPG also employs the same replay buffer mechanism, reusing past transitions over training for increased sample efficiency and estimate the loss function via MC-estimates. -Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with maximizing the discounted cumulative reward, while acting as randomly as possible. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. In that, MaxEnt revisits the RL objective $J (\pi)$ to specifically account for the policy entropy, +Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with maximizing the discounted cumulative reward, while acting as randomly as possible. 
MaxEnt RL @haarnojaReinforcementLearningDeep2017b has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. In that, MaxEnt revisits the RL objective $J (\pi)$ to specifically account for the policy entropy $\mathcal H(\pi (\bullet \vert s_t))$, ``` math \begin{align} - J(\pi) &= \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))] + J(\pi) &= \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]. + \end{align} ``` @@ -692,23 +755,25 @@ This modified objective results in the *soft* TD-target: ``` math \htmlId{soft-td-target}{y_i = \mathbb E_{s_{t+1} \sim \mathbb P( \bullet \vert s_t, a_t)} [r_t + \gamma \left( Q_{\theta_{i-1}} (s_{t+1}, a_{t+1}) - \alpha \log \pi_\phi(a_{t+1} \vert s_{t+1}) \right)], \quad a_{t+1} \sim \pi_\phi(\bullet \vert s_t)} ``` -Similarily to DDPG, SAC also maintains an explicit policy, trained under the same MaxEnt framework for the maximization of [J-soft], and updated using- +Similarily to DDPG, SAC also maintains an explicit policy, trained under the same MaxEnt framework for the maximization of eq. [J-soft], updated using- ``` math \htmlId{sac-policy-update}{\pi_{k+1} \leftarrow \arg\min_{\pi^\prime \in \Pi} \text{D}_{\text{KL}}\left(\pi^\prime (\bullet \vert s_t) \bigg\Vert \frac{\exp(Q_{\pi_k}(s_t, \bullet))}{Z_{\pi_k}(s_t)} \right)} ``` -The update rule provided in [sac-policy-update] optimizes the policy while projecting it on a set $\Pi$ of tractable distributions (e.g., Gaussians, @haarnojaReinforcementLearningDeep2017). +The update rule provided in eq. [sac-policy-update] optimizes the policy while projecting it on a set $\Pi$ of tractable distributions (e.g., Gaussians, @haarnojaReinforcementLearningDeep2017b). 
##### Sample-efficient, data-driven RL -Importantly, sampling $(s_t, a_t, r_t, s_{t+1})$ from the replay buffer $D$ conveniently allows to approximate the previously introduced expectations for TD-target and TD-error through Monte-Carlo (MC) estimates. The replay buffer $D$ also proves extremely useful in maintaining a history of previous transitions and using it for training, improving on sample efficiency. Furthermore, it also naturally provides an entry point to inject offline trajectories recorded, for instance, by a human demonstrator, into the training process. +Sampling $(s_t, a_t, r_t, s_{t+1})$ from the replay buffer $D$ conveniently allows to approximate expectations for TD-target and TD-error through Monte-Carlo (MC) estimates. The replay buffer $D$ also proves extremely useful in maintaining a history of previous transitions and using it for training, improving on sample efficiency. Furthermore, it also naturally provides an entry point to inject offline trajectories recorded by a human demonstrator into the training process. -Reinforcement Learning with Prior Data (RLPD) @ballEfficientOnlineReinforcement2023 is an Offline-to-Online RL algorithm leveraging prior data to effectively accelerate the training of a SAC agent. Unlike previous works on Offline-to-Online RL, RLPD avoids any pre-training and instead uses the available offline data $D_\text{offline}$ to improve online-learning from scratch. During each training step, transitions from both the offline and online replay buffers are sampled in equal proportion, and used in the underlying SAC routine. +Reinforcement Learning with Prior Data (RLPD) @ballEfficientOnlineReinforcement2023 is an Offline-to-Online RL algorithm leveraging prior data to effectively accelerate the training of a SAC agent. Unlike previous works on Offline-to-Online RL, RLPD avoids any pre-training and instead only uses the available offline data $D_\text{offline}$ to improve online-learning from scratch. 
During each training step, transitions from both the offline and online replay buffers are sampled in equal proportions, and used in the underlying SAC routine. Together with other implementation details (using LayerNorm layers to prevent value overestimation, and the use of ensembles techniques to form the TD-target), RLPD proves a particularly simple yet effective approach to use $D_\text{offline}$ for Offline-to-Online RL. ##### Sample-efficient, data-driven, real-world RL -Despite the possibility to leverage offline data for learning, the effectiveness of real-world RL training is still limited by the need to define a task-specific, hard-to-define reward function. Further, even assuming to have access to a well-defined reward function, typical robotics pipelines rely mostly on propioperceptive inputs augmented by camera streams of the environment. As such, even well-defined rewards would need to be derived from processed representations of unstructured observations, introducing brittleness. In their technical report, @luoSERLSoftwareSuite2025 empirically address the needs (1) to define a reward function and (2) to use it on image observations, by introducing a series of tools to allow for streamlined training of *reward classifiers* $c$, as well as jointly learn forward-backward controllers to speed up real-world RL. Reward classifiers are particularly useful in treating complex tasks--e.g., folding a t-shirt--for which a precise reward formulation is arbitrarily complex to obtain, or that do require significant shaping and are more easily learned directly from demonstrations of success ($e^+$) or failure ($e^-$) states, $s \in \mathcal S$, with a natural choice for the state-conditioned reward function being $r \mathcal S \mapsto \mathbb R$ being $r(s) = \log c(e^+ \ vert s )$. 
Further, @luoSERLSoftwareSuite2025 demonstrate the benefits of learning *forward* (executing the task from initial state to completion) and *backward* (resetting the environment to the initial state from completion) controllers, parametrized by separate policies. +Despite the possibility to leverage offline data for learning, the effectiveness of real-world RL training is still limited by the need to define a task-specific, hard-to-define reward function. Further, even assuming to have access to a well-defined reward function, typical robotics pipelines rely on augmenting proprioperceptive inputs with camera streams, and thus even well-defined rewards would need to be defined starting from unstructured observations--a challenging assumption in practice. In their technical report, @luoSERLSoftwareSuite2025 empirically address the needs (1) to define a reward function and (2) to use it starting from unstructured, image observations. In particular, it introduces a suite of tools streamlining training of *reward classifiers* $c$, as well as to jointly learn forward-backward controllers to speed up real-world RL.
+Reward classifiers are particularly useful in treating complex, dynamic tasks--e.g., folding a t-shirt--for which a precise reward formulation is arbitrarily complex to obtain, or that do require significant shaping and are more easily learned directly from demonstrations of success ($e^+$) or failure ($e^-$) states, rather than from a precise formulation of $r_t$, with a natural target for the reward classifier being $r(s) = \log c(e^+ \vert s)$. Furthermore, @luoSERLSoftwareSuite2025 demonstrate the benefits of learning separate (1) *forward* and (2) *backward* controllers--parametrized by separate policies--where (1) the former learns to execute a task to completion and (2) the latter learns to reset the environment to its initial state from terminal states, thereby aiding training in real-world episodic settings. + +Lastly, in order to improve on the robustness of their approach to different goals while maintaining practical scalability, @luoSERLSoftwareSuite2025 introduced a modified state and action space, expressing proprioperceptive configurations $q$ and actions $\dot q$ in the frame of the end-effector pose at $t=0$. Randomizing the initial pose of the end-effector ($s_0$), @luoSERLSoftwareSuite2025 achieved a similar result to that of manually randomizing the environment at every timestep, but with the benefit of maintaining the environment in the same condition across multiple training episodes, achieving higher scalability of their method thanks to the increased practicality of their approach.
-
(A) HIL-SERL allows for real-world training of high performance RL agents by building on top advancements presented by of SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on a SO-100.
+
(A) HIL-SERL allows for real-world training of high performance RL agents by building on top of advancements presented by SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on a real-world SO-100.
-Building on off-policy deep Q-learning with replay buffers, entropy regularization for better exploration and performance, expert demonstrations to guide learning, and a series of tools and recommendations for real-world training using reward classifiers (Figure [hil-serl-blocks]), @luoPreciseDexterousRobotic2024 introduce human interactions during training, learning near-optimal policies in challenging real-world manipulation tasks in 1-2 hours. +Building on off-policy deep Q-learning with replay buffers, entropy regularization for better exploration, expert demonstrations to guide learning, and a series of tools and recommendations for real-world training using reward classifiers (Figure [hil-serl-blocks]), @luoPreciseDexterousRobotic2024 introduce human interactions during training, learning near-optimal policies in challenging real-world manipulation tasks in 1-2 hours. -Human in the Loop Sample Efficient Robot reinforcement Learning (HIL-SERL) @luoPreciseDexterousRobotic2024 augments offline-to-online RL with targeted human corrections during training, and employs prior data to (1) train a reward classifier and (2) bootstrap RL training on expert trajectories. While demonstrations provide the initial dataset seeding learning and constraining early exploration, interactive corrections allow a human supervisor to intervene on failure modes and supply targeted interventions to aid the learning process. Crucially, human interventions are stored in both the offline and online replay buffers, differently from the autonomous transitions generated at training time and stored in the online buffer only. 
Consequently, given an intervention timestep $k \in (0, T)$, length-$K$ human intervention data $\{ s^{\text{human}}_k, a^{\text{human}}_k, r^{\text{human}}_k, s^{\text{human}}_{k+1},\}_{k=1}^K$ is more likely to be sampled for off-policy learning than the data generated online during training, providing stronger supervision to the agent while still allowing for autonomous learning. Empirically, HIL-SERL attains near-perfect success rates on diverse manipulation tasks within 1-2 hours of training @luoPreciseDexterousRobotic2024, underscoring how offline datasets with online RL can markedly improve stability and data efficiency, and ultimately even allow real-world RL-training. +Human-in-the-Loop, Sample Efficient Robot reinforcement Learning (HIL-SERL) @luoPreciseDexterousRobotic2024 augments offline-to-online RL with targeted human corrections during training, and employs prior data to (1) train a reward classifier and (2) bootstrap RL training on expert trajectories. While offline demonstrations provide the initial dataset seeding learning and constraining early exploration, interactive, online corrections allow a human supervisor to intervene on failure modes and supply targeted interventions, greatly aiding the learning process @luoPreciseDexterousRobotic2024. Crucially, human intervention data is stored in *both* the offline and online replay buffers, differently from the autonomous transitions generated at training time and stored in the online buffer only. In turn, given an intervention timestep $k \in (0, T)$, length-$K$ human intervention data $\{ s^{\text{human}}_k, a^{\text{human}}_k, r^{\text{human}}_k, s^{\text{human}}_{k+1},\}_{k=1}^K$ is more likely to be sampled than the data generated online during training, providing stronger supervision to the agent while still allowing for autonomous learning. 
Empirically, HIL-SERL attains near-perfect success rates (99%+) on diverse manipulation tasks within 1-2 hours of training @luoPreciseDexterousRobotic2024, underscoring how offline datasets with online RL can markedly improve stability and data efficiency, and ultimately even allow real-world RL-training. #### Code Example- Real-world RL -**TODO(fracapuano): work out rl training example** - -#### Limitations of RL in Real-World Robotics: Simulators and Reward Design - -Despite the advancements in real-world RL training, solving robotics training RL agents in the real world still suffers from the following limitations: - -- In those instances where real-world training experience is prohibitively expensive to gather @degraveMagneticControlTokamak2022, @bellemareAutonomousNavigationStratospheric2020, in-simulation training is often the only option. However, high-fidelity simulators for real-world problems can be difficult to build and maintain, especially for contact-rich manipulation and tasks involving deformable or soft materials. - -- Reward design poses an additional source of brittleness. Dense shaping terms are often required to guide exploration in long-horizon problems, but poorly tuned terms can lead to specification gaming or local optima. Sparse rewards avoid shaping but exacerbate credit assignment and slow down learning. In practice, complex behaviors require efforts shaping rewards: a britlle and error prone process. - -Advances in Behavioral Cloning (BC) from corpora of human demonstrations address both of these concerns. By learning in a supervised fashion to reproduce expert demonstrations, BC methods prove competitive while bypassing the need for simulated environments and hard-to-define reward functions. - -## Robot (Imitation) Learning - - -
- -*The best material model for a cat is another, or preferably the same cat* - -Norbert Wiener - -
-
- -TL;DR Behavioral Cloning provides a natural platform to learn from real-world interactions without the need to design any reward function, and generative models prove more effective than point-wise policies at dealing with multimodal demonstration datasets. - -
- -
(A) Average (with standard deviation) evolution of the actuation levels over the first 5 recorded episodes in lerobot/svla_so101_pickplace. Proprioperceptive state provide invaluable to determine the robot’s state during an episode. (B) Camera frames are also recorded alongside measurements on the robot’s state, capturing information about the robot’s interaction with its environment.
+ +
HIL-SERL is a SOTA RL algorithm for training control policies directly in the real world. Its implementation in lerobot relies on a decoupled actor-learner architecture, communicating over processes (and possibly networks) with queues used to share (1) transitions $(s_t, a_t, r_t, s_{t+1})$ and (2) parameters $\theta$.
-Learning from human demonstrations provides a pragmatic alternative to the reinforcement-learning pipeline discussed in Section [learning-rl]. Indeed, in real-world robotics online exploration is typically costly and potentially unsafe, and designing (dense) reward signals is a brittle and task-specific process. In general, success detection itself may often require bespoke instrumentation, while episodic training demands reliable resets--all factors complicating training RL algorithms on hardware at scale. Behavioral Cloning (BC) sidesteps these constraints by casting control an imitation learning problem, leveraging previously collected expert demonstrations. Most notably, by learning to imitate autonomous systems naturally adhere to the objectives, preferences, and success criteria implicitly encoded in the data, which obviates reduces early-stage exploratory failures and obviates hand-crafted reward shaping altogether. +This example shows how to use the HIL-SERL implementation supported by `lerobot`. This code example is organized into four parts: we first show how to train a reward classifier from a custom set of demonstrations, then define the `Actor` and `Learner` components, and finally, we bring them together in a complete script showing how to use HIL-SERL in practice. -Formally, let $\mathcal D = \{ \tau^{(i)} \}_{i=1}^N$ be a set of expert trajectories, with $\tau^{(i)} = \{(o_t^{(i)}, a_t^{(i)})\}_{t=0}^{T_i}$ representing the $i$-th trajectory in $\mathcal D$, $o_t \in \mathcal O$ denoting observations (e.g., images and proprioception altogether), and $a_t \in \mathcal A$ the expert actions. Typically, observations $o \in \mathcal O$ consist of both image and proprioperceptive information, while actions $a \in \mathcal A$ represent control specifications for the robot to execute, e.g. a joint configuration. 
Note that differently from Section [learning-rl], in the imitation learning context $\mathcal D$ denotes an offline dataset collecting $N$ length-$T_i$ reward-free (expert) human trajectories $\tau^{(i)}$, and *not* the environment dynamics. Similarily, in this section $\tau^{(i)}$ represent a length-$T_i$ trajectory of observation-action pairs, which crucially *omits entirely any reward* information. Figure [ch4-bc-trajectories] graphically shows trajectories in terms of the average evolution of the actuation on the 6 joints over a group of teleoperated episodes for the SO-100 manipulator. Notice how proprioperceptive states are captured jointly with camera frames over the course of the recorded episodes, providing a unified high-frame rate collection of teleoperation data. Figure [ch4-observation-action-mapping] shows $(o_t, a_t)$-pairs for the same dataset, with the actions performed by the human expert illustrated just alongside the corresponding observation. In principle, (expert) trajectories $\tau^{(i)}$ can have different lengths since demonstrations might exhibit multi-modal strategies to attain the same goal, resulting in possibly multiple, different behaviors. +At a higher level, the HIL-SERL architecture (Figure [ch3-hil-serl-architecture]) relies on two main components: -
- - -
Sample observations and action pairs over the course of a given trajectory recorded in lerobot/svla_so101_pickplace. Observations, comprising of both proprioperceptive and visual information, are recorded alongside the configuration of a second, leader robot controlled by a human expert, providing complete information for regressing actions given observation.
-
+- An `Actor`, running a frozen policy network used to interact with the environment and obtain observations. Observations are used to both condition the frozen actor in selecting the action to enact, and to form $(s_t, a_t, r_t, s_{t+1})$ transitions that are shared with the `Learner`. Rewards are inferred using a custom, learned reward classifier trained on a dataset of offline demonstrations. -Behavioral Cloning (BC) @pomerleauALVINNAutonomousLand1988a aims at synthetizing synthetic behaviors by learning the mapping from observations to actions, and in its most natural formulation can be effectively tackled as a *supevised* learning problem, consisting of learning the (deterministic) mapping $f: \mathcal O\mapsto \mathcal A, \ a_t = f(o_t)$ by solving -``` math -\htmlId{loss-minimization-SL}{\min_{f} \mathbb{E}_{(o_t, a_t) \sim p(\bullet)} \mathcal L(a_t, f(o_t)),} -``` -for a given risk function $\mathcal L: \mathcal A \times \mathcal A \mapsto \mathbb{R}, \ \mathcal L (a, a^\prime)$. +- A `Learner`, used to optimize the policy’s parameters $\theta$ for maximum expected return. The learner samples batches of offline data from online and offline buffers in equal proportion @ballEfficientOnlineReinforcement2023, and shares updated parameters with the `Actor`. -Typically, the expert’s joint observation-action distribution $p: \mathcal O\times \mathcal A\mapsto [0,1]$ such that $(o,a) \sim p(\bullet)$ is assumed to be unknown, in keeping with a classic Supervised Learning (SL) framework[^3]. However, differently from standard SL’s assumptions, the samples collected in $\mathcal D$, correspoding to observations of the underlying $p$ are *not* i.i.d., as expert demonstrations are collected *sequentially* in trajectories. 
In practice, this aspect can be partially mitigated by considering pairs in a non-sequential order--*shuffling* the samples in $\mathcal D$--so that the expected risk under $p$ can be approximated using MC estimates, although estimates may in general be less accurate. Another strategy to mitigate the impact of regressing over non-i.i.d. samples relies on the possibility of interleaving BC and data collection @rossReductionImitationLearning2011, aggregating multiple datasets iteratively. However, because we only consider the case where a single offline dataset $\mathcal D$ of (expert) trajectories is already available, dataset aggregation falls out of scope. +The HIL-SERL architecture presented in this example can be exclusively run locally, but the implementation in `lerobot` also allows the `Actor` and `Learner` to run on two separate machines connected by the network. -Despite the inherent challenges of learning on non-i.i.d. data, the BC formulation affords several operational advantages in robotics. First, training happens offline and typically uses expert human demonstration data, hereby severily limiting exploration risks by preventing the robot from performing dangerous actions altogether. Second, reward design is entirely unnecessary in BC, as demonstrations already reflect human intent and task completion. This also mitigates the risk of misalignment and specification gaming (*reward hacking*), otherwise inherent in purely reward-based RL @heessEmergenceLocomotionBehaviours2017. Third, because expert trajectories encode terminal conditions, success detection and resets are implicit in the dataset. Finally, BC scales naturally with growing corpora of demonstrations collected across tasks, embodiments, and environments. However, BC can in principle only learn behaviors that are, at most, as good as the one exhibited by the demonstrator, and thus critically provides no mitigation for the suboptimal decision making that might be enaced by humans. 
Still, while problematic in sequential-decision making problems for which expert demonstrations are not generally available--data migth be expensive to collect, or human performance may be inherently suboptimal--many robotics applications benefit from relative cheap pipelines to acquire high-quality trajectories generated by humans, thus justifying BC approaches. +
-
- - -
Point-wise policies suffer from limitations due to (A) covariate shifts and poor approximation of (B) multimodal demonstrations. (A) Initially small errors may drive the policy out of distribution, incuring in a vicious circle ultimately resulting in failure. (B) Both modes of reaching for a target object in a scene, either left or right-first, are equally as good and thus equally as likely to be present in a dataset of human demonstrations, ultimately resulting in multimodal demonstrations.
-
+Training a Reward Classifier +[https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch3/01_reward_classifier.py](https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch3/01_reward_classifier.py) -While conceptually elegant, point-estimate policies $f : \mathcal O\mapsto \mathcal A$ learned by solving [loss-minimization-SL] have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure [ch4-issues-with-bc] illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $\mathcal D$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $\epsilon$-prediction errors $0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon$ can quickly drive the policy into out-of-distribution states, incuring in less confident generations and thus errors compounding (Figure [ch4-issues-with-bc], left).Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving robotics problems, since multiple trajectories can be equally as good towards the accomplishment of a goal (e.g., symmetric grasps, Figure [ch4-issues-with-bc], right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the generative model $p(o, a)$ underlying the samples in $\mathcal D$, rather than an explicitly learning a prediction function $f(o) = a$. 
+```python +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset -### A (Concise) Introduction to Generative Models +delta_timestamps = { + "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame +} -Generative Models (GMs) aim to learn the stochastic process underlying the very generation of the data collected, and typically do so by fitting a probability distribution that approximates the unknown *data distribution*, $p$. In the case of BC, this unknown data distribution $p$ represents the expert’s joint distribution over $(o, a)$-pairs. Thus, given a finite set of $N$ pairs $\mathcal D = \{ (o,a)_i \}_{i=0}^N$ used as an imitation learning target (and thus assumed to be i.i.d.), GM seeks to learn a *parametric* distribution $p_\theta(o,a)$ such that (1) new samples $(o,a) \sim p_\theta(\bullet)$ resemble those stored in $\mathcal D$, and (2) high likelihood is assigned to the observed regions of the unobservable $p$. Likelihood-based learning provides a principled training objective to achieve both objectives, and it is thus extensively used in GM @prince2023understanding. +# Optionally, use StreamingLeRobotDataset to avoid downloading the dataset +dataset = LeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) -#### Variational Auto-Encoders +# Streams frames from the Hugging Face Hub without loading into memory +streaming_dataset = StreamingLeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) -
- - -
Intuitively, latent variable in a single latent model may contain information regarding the task being performed, which directly results in the likelihood of the same observation-action pair being different for two different tasks. When (A) picking a block the likelihood of a wide gripper’s opening should be higher than narrower one, while it should be the opposite when (B) pushing the block.
-
+# Get the 100th frame in the dataset by +sample = dataset[100] +print(sample) +# { +# 'observation.state': tensor([...]), +# 'action': tensor([...]), +# 'observation.images.wrist_camera': tensor([3, C, H, W]), for delta timesteps +# ... +# } -A common inductive bias used in GM posits samples $(o,a)$ are influenced from an unobservable latent variable $z \in Z$, resulting in -``` math -\htmlId{BC-latent-variable}{p (o,a) = \int_{\text{supp}({Z})} p(o,a \vert z) p(z)} -``` -Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics application, $z$ could be some high level representation of the underlying task being performed by the human demonstrator. In such case, treating $p(o,a)$ as a marginalization over $\text{supp}({Z})$ of the complete joint distribution $p(o,a,z)$ natively captures the effect different tasks have on the likelihood of observation-action pairs. Figure [ch4-task-effect-on-pairs] graphically illustrates this concept in the case of a (A) picking and (B) pushing task, for which, nearing the target object, the likelihood of actions resulting in opening the gripper--the higher $q_6$, the wider the gripper’s opening--should intuitively be (A) high or (B) low, depending on the task performed. While the latent space $Z$ typically has a much richer structure than the set of all actual tasks performed, [BC-latent-variable] still provides a solid framework to learn joint distribution conditioned on unobservable yet relevant factors. Figure [ch4-latent-variable-model] represents this framework of latent-variable for a robotics application- the true, $z$-conditioned generative process on assigns *likelihood* $p((o,a) \vert z)$ to the single $(o,a)$-pair. Using Bayes’ theorem, one can reconstruct the *posterior* distribution on $\text{supp}({Z})$, $q_\theta(z \vert o,a)$ from the likelihood $p_\theta(o,a \vert z)$, *prior* $p_\theta(z)$ and *evidence* $p_\theta(o,a)$. 
VAEs approximate the latent variable model presented in [BC-latent-variable]) using an *approximate posterior* $q_\phi(z \vert o,a)$ while regressing parameters for a parametric likelihood, $p_\theta(o,a \vert z)$ (Figure [ch4-latent-variable-model]). +batch_size=16 +# wrap the dataset in a DataLoader to use process it batches for training purposes +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size +) -
- - -
(A) The latent variable model in a robotics application regulates influence between observed ( o, a) variables and an unobservable latent variable. (B) VAEs approximate exact latent variable models by means of variational inference.
-
+# Iterate over the DataLoader in a training loop +num_epochs = 1 +device = "cuda" if torch.cuda.is_available() else "cpu" -Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can thus be written as: - - +for epoch in range(num_epochs): + for batch in data_loader: + # Move data to the appropriate device (e.g., GPU) + observations = batch["observation.state"].to(device) + actions = batch["action"].to(device) + images = batch["observation.images.wrist_camera"].to(device) -``` math -\begin{align} - \log p_\theta(\mathcal D) &= \log \sum_{i=0}^N p_\theta ((o,a)_i) \\ - &= \log \sum_{i=0}^N \int_{\text{supp}({Z})} p_\theta((o,a)_i \vert z) p(z) \\ - &= \log \sum_{i=0}^N \int_{\text{supp}({Z})} \frac{q_\theta(z \vert (o,a)_i)}{q_\theta(z \vert (o,a)_i)} \cdot p_\theta((o,a)_i \vert z) p(z) \\ - &= \log \sum_{i=0}^N \mathbb E_{z \sim p_\theta(\bullet \vert (o,a)_i)} [\frac{p(z)}{q_\theta(z \vert (o,a)_i)} \cdot p_\theta((o,a)_i \vert z)], -\end{align} + # Next, you can do amazing_model.forward(batch) + ... ``` -where we used [BC-latent-variable] in [evidence-definition-1], multiplied by $1 = \frac{q_\theta(z \vert (o,a)_i)}{q_\theta(z \vert (o,a)_i)}$ in [evidence-definition-2], and used the definition of expected value in [evidence-definition]. - -In the special case where one assumes distributions to be tractable, $p_\theta (\mathcal D)$ is typically tractable too, and $\max_\theta \log p_\theta(\mathcal D)$ provides a natural target for (point-wise) infering the unknown parameters $\theta$ of the generative model. Unfortunately, [evidence-definition] is rarely tractable when the distribution $p$ is modeled with approximators such as neural networks, especially for high-dimensional, unstructured data. 
-In their seminal work on Variational Auto-Encoders (VAEs), @kingmaAutoEncodingVariationalBayes2022 present two major contributions to learn complex latent-variable GMs on unstructured data, proposing (1) a tractable, variational lower-bound to [evidence-definition] as an optimization target to jointly learn likelihood and posterior and (2) high-capacity function approximators to model the likelihood $p_\theta(o,a\vert z)$ and (approximate) posterior distribution $q_\phi(z \vert o,a) \approx q_\theta(z \vert o,a)$. - -In particular, the lower bound on [evidence-definition] (Evidence LOwer Bound, *ELBO*) can be derived from [evidence-definition] applying Jensen’s inequality--$\log \mathbb{E}[\bullet] \geq \mathbb{E} [\log (\bullet)]$--yielding: - - +
+
-``` math -\begin{align} - \log p_\theta(\mathcal D) &\geq \sum_{i=0}^{N} \left( - \mathbb{E}_{z \sim p_\theta(\cdot \vert (o,a)_i)} \big[ \log p_\theta((o,a)_i \vert z) \big] - + \mathbb{E}_{z \sim p_\theta(\cdot \vert (o,a)_i)} [\log \left( \frac{p(z)}{q_\theta(z \vert (o,a)_i)} \right)] - \right) \\ - &= \sum_{i=0}^{N} \left( - \mathbb{E}_{z \sim p_\theta(\cdot \vert (o,a)_i)} \big[ \log p_\theta((o,a)_i \vert z) \big] - - \text{D}_{\text{KL}}\big[ q_\theta(z \vert (o,a)_i) \Vert p(z) \big] - \right) -\end{align} -``` -The true, generally intractable posterior $p_\theta (z \vert o,a)$ prevents computing both the expectation and KL divergence terms in [ELBO-intractable], and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $q_\phi(z \vert o,a)$, resulting in the final, tractable ELBO objective, - - +Defining the `Actor` +[https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch3/02_actor.py](https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch3/02_actor.py) -``` math -\begin{align} -\text{ELBO}_{\mathcal D}(\theta, \phi) = \sum_{i=0}^{N} \left( - \mathbb{E}_{z \sim q_\phi(\cdot \vert (o,a)_i)} \big[ \log p_\theta((o,a)_i \vert z) \big] - - \text{D}_{\text{KL}}\big[ q_\phi(z \vert (o,a)_i) \Vert p(z) \big] - \right) - -\end{align} -``` -From Jensen’s inequality, maximizing ELBO results in maximizing the log-likelihood of the data too, thus providing a natural, tractable optimization target. Indeed, expectations can be estimated using MC estimates from the learned distributions in [ELBO], while the KL-divergence term can typically be computed in closed-form (1) modeling $q_\phi$ as a Gaussian $q_\phi(z \vert o,a) = \mathcal N\big(\mu_\phi(o,a), \Sigma_\phi(o,a) \big)$ and (2) imposing a standard Gaussian prior on the latent space, $p(z) = \mathcal N(\mathbf{0}, \mathbf{I})$. 
+```python +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset -An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits a particularly interpretable factorization +delta_timestamps = { + "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame +} - - +# Optionally, use StreamingLeRobotDataset to avoid downloading the dataset +dataset = LeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) -``` math -\begin{align} -\min_{\theta, \phi} - \text{ELBO}_{\mathcal (o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi) \\ -\mathbf{L^{\text{rec}}}(\theta) &= \mathbb{E}_{z \sim q_\phi(\cdot \vert o,a} \big[ \log p_\theta(o,a \vert z) \big] \\ -\mathbf{L^{\text{reg}}}(\phi) &= \text{D}_{\text{KL}}\big[ q_\phi(z \vert o,a) \Vert p(z) \big] -\end{align} -``` +# Streams frames from the Hugging Face Hub without loading into memory +streaming_dataset = StreamingLeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) -For any given $(o,a)$ pair, the expected value term of [VAE-Lrec] is typically computed via MC estimates, resulting in +# Get the 100th frame in the dataset by +sample = dataset[100] +print(sample) +# { +# 'observation.state': tensor([...]), +# 'action': tensor([...]), +# 'observation.images.wrist_camera': tensor([3, C, H, W]), for delta timesteps +# ... 
+# } + +batch_size=16 +# wrap the dataset in a DataLoader to use process it batches for training purposes +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size +) + +# Iterate over the DataLoader in a training loop +num_epochs = 1 +device = "cuda" if torch.cuda.is_available() else "cpu" + +for epoch in range(num_epochs): + for batch in data_loader: + # Move data to the appropriate device (e.g., GPU) + observations = batch["observation.state"].to(device) + actions = batch["action"].to(device) + images = batch["observation.images.wrist_camera"].to(device) + + # Next, you can do amazing_model.forward(batch) + ... +``` + +
+
+ +Defining the `Learner` +[https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch3/03_learner.py](https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch3/03_learner.py) + +```python +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset + +delta_timestamps = { + "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame +} + +# Optionally, use StreamingLeRobotDataset to avoid downloading the dataset +dataset = LeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Streams frames from the Hugging Face Hub without loading into memory +streaming_dataset = StreamingLeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Get the 100th frame in the dataset by +sample = dataset[100] +print(sample) +# { +# 'observation.state': tensor([...]), +# 'action': tensor([...]), +# 'observation.images.wrist_camera': tensor([3, C, H, W]), for delta timesteps +# ... +# } + +batch_size=16 +# wrap the dataset in a DataLoader to use process it batches for training purposes +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size +) + +# Iterate over the DataLoader in a training loop +num_epochs = 1 +device = "cuda" if torch.cuda.is_available() else "cpu" + +for epoch in range(num_epochs): + for batch in data_loader: + # Move data to the appropriate device (e.g., GPU) + observations = batch["observation.state"].to(device) + actions = batch["action"].to(device) + images = batch["observation.images.wrist_camera"].to(device) + + # Next, you can do amazing_model.forward(batch) + ... +``` + +
+
+ +Using HIL-SERL +[https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch3/04_hil_serl.py](https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch3/04_hil_serl.py) + +```python +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset + +delta_timestamps = { + "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame +} + +# Optionally, use StreamingLeRobotDataset to avoid downloading the dataset +dataset = LeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Streams frames from the Hugging Face Hub without loading into memory +streaming_dataset = StreamingLeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Get the 100th frame in the dataset by +sample = dataset[100] +print(sample) +# { +# 'observation.state': tensor([...]), +# 'action': tensor([...]), +# 'observation.images.wrist_camera': tensor([3, C, H, W]), for delta timesteps +# ... +# } + +batch_size=16 +# wrap the dataset in a DataLoader to use process it batches for training purposes +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size +) + +# Iterate over the DataLoader in a training loop +num_epochs = 1 +device = "cuda" if torch.cuda.is_available() else "cpu" + +for epoch in range(num_epochs): + for batch in data_loader: + # Move data to the appropriate device (e.g., GPU) + observations = batch["observation.state"].to(device) + actions = batch["action"].to(device) + images = batch["observation.images.wrist_camera"].to(device) + + # Next, you can do amazing_model.forward(batch) + ... +``` + +
+
+#### Limitations of RL in Real-World Robotics: Simulators and Reward Design
+
+Despite the advancements in real-world RL training, training RL agents for real-world tasks still suffers from the following limitations:
+
+- In those instances where real-world training experience is prohibitively expensive to gather (e.g., Tokamak control @degraveMagneticControlTokamak2022, Autonomous Stratosphere Navigation @bellemareAutonomousNavigationStratospheric2020), in-simulation training is often the only viable option. However, high-fidelity simulators for real-world problems can be difficult to build and maintain, especially for contact-rich manipulation and tasks involving deformable or soft materials.
+
+- Reward design is a fundamental source of brittleness in real-world RL pipelines. While shaping dense rewards is often necessary to guide exploration in long-horizon tasks, the process is error-prone and heavily reliant on human expertise and intuition. Poorly tuned terms can lead to specification gaming or convergence to local optima, making reward shaping a critical challenge for applying RL in practice. Sparse rewards that only signal successful trajectories can avoid these pitfalls but typically result in much slower learning due to reduced supervision.
+
+Advances in learning to act from potentially large corpora of human demonstrations via Behavioral Cloning (BC) address both of these concerns. Although suffering from an inherent suboptimality--imitation learning can at most match the performance level of the demonstrator--learning to reproduce expert demonstrations via BC has proven increasingly competitive and practical, bypassing the need for simulated environments and hard-to-define reward functions.
+
+## Robot (Imitation) Learning
+
+
+ +*The best material model for a cat is another, or preferably the same cat* + +Norbert Wiener + +
+
+ +TL;DR Behavioral Cloning provides a natural platform to learn from real-world interactions without the need to design any reward function, and generative models prove more effective than point-wise policies at dealing with multimodal demonstration datasets. + +
+
+ + +
(A) Average (with standard deviation) evolution of the actuation levels over the first 5 recorded episodes in lerobot/svla_so101_pickplace. Proprioperceptive states prove invaluable in determining the robot’s state during an episode. (B) Camera frames are also recorded alongside measurements on the robot’s state, capturing information about the robot’s interaction with its environment.
+
+
+Learning from human demonstrations provides a pragmatic alternative to the RL pipeline discussed in Section [learning-rl]. Indeed, especially in real-world robotics, online exploration is typically costly and potentially unsafe, and designing (dense) reward signals is a brittle and task-specific process. Further, even success detection itself often requires bespoke instrumentation, while episodic training demands reliable resets--all factors complicating training RL algorithms on hardware at scale. Behavioral Cloning (BC) sidesteps these constraints by casting control as an imitation learning problem, leveraging previously collected expert demonstrations to anchor the learned autonomous behavior. Most notably, by *learning-to-imitate*, autonomous systems naturally adhere to the objectives, preferences, and success criteria implicitly encoded in the data, which reduces early-stage exploratory failures and obviates hand-crafted reward shaping altogether.
+
+Formally, let $\mathcal D = \{ \tau^{(i)} \}_{i=1}^N$ be a set of expert trajectories, with $\tau^{(i)} = \{(o_t^{(i)}, a_t^{(i)})\}_{t=0}^{T_i}$ representing the $i$-th length-$T_i$ trajectory in $\mathcal D$, $o_t \in \mathcal O$ denoting observations (e.g., images and proprioception altogether), and $a_t \in \mathcal A$ the expert actions. Typically, observations $o \in \mathcal O$ consist of both image and proprioperceptive information, while actions $a \in \mathcal A$ represent control specifications for the robot to execute, e.g. a joint configuration. Note that differently from Section [learning-rl], in the imitation learning context $\mathcal D$ denotes an offline dataset collecting $N$ length-$T_i$ reward-free (expert) human trajectories $\tau^{(i)}$, and *not* the environment dynamics. Similarly, in this section $\tau^{(i)}$ represents a length-$T_i$ trajectory of observation-action pairs, which crucially *omits entirely any reward* information. 
Figure [ch4-bc-trajectories] graphically shows trajectories in terms of the average evolution of the actuation on the 6 joints of a teleoperated SO-100 manipulator. Notice how proprioperceptive states are captured jointly with camera frames over the course of the recorded episodes, providing a unified high-frame rate collection of both image and joint teleoperation data. Figure [ch4-observation-action-mapping] shows $(o_t, a_t)$-pairs for the same dataset, with the actions performed by the human expert illustrated alongside the corresponding observation. In principle, (expert) trajectories $\tau^{(i)}$ can have different lengths since demonstrations might exhibit multi-modal strategies to attain the same goal, resulting in multiple, different behaviors. + +
+ + +
Sample observations and action pairs over the course of a given trajectory recorded in lerobot/svla_so101_pickplace. Observations, comprising both proprioperceptive and visual information, are recorded alongside the configuration of a second, leader robot controlled by a human expert, providing complete information for regressing actions given observations.
+
+
+Behavioral Cloning (BC) @pomerleauALVINNAutonomousLand1988 aims at producing synthetic behaviors by learning the mapping from observations to actions, and in its most natural formulation can be effectively tackled as a *supervised* learning problem, consisting of learning the (deterministic) mapping $f: \mathcal O\mapsto \mathcal A, \ a_t = f(o_t)$ by solving
+``` math
+\htmlId{loss-minimization-SL}{\min_{f} \mathbb{E}_{(o_t, a_t) \sim p(\bullet)} \mathcal L(a_t, f(o_t)),}
+```
+given an arbitrary risk function $\mathcal L: \mathcal A \times \mathcal A \mapsto \mathbb{R}, \ \mathcal L (a, a^\prime)$.
+
+Typically, the expert’s joint observation-action distribution $p: \mathcal O\times \mathcal A\mapsto [0,1]$ is assumed to be unknown, in keeping with a classic Supervised Learning (SL) framework[^3]. However, differently from standard SL assumptions, the samples collected in $\mathcal D$--realizations of the underlying $p$--are *not* i.i.d., as expert demonstrations are collected *sequentially* in the form of trajectories. In practice, this aspect can be partially mitigated by considering pairs in a non-sequential order--*shuffling* the samples in $\mathcal D$--so that the expected risk under $p$ can be approximated using MC estimates, although these estimates may in general be less accurate. Another strategy to mitigate the impact of regressing over non-i.i.d. samples relies on the possibility of interleaving BC and data collection (as in DAgger), aggregating multiple datasets iteratively. However, because we only consider the case where a single offline dataset $\mathcal D$ of trajectories is available and no more data can be collected, DAgger falls out of our scope.
+
+Despite the inherent challenges of learning from non-i.i.d. data, the BC formulation presents several operational advantages in robotics. 
First, training happens offline and naturally accommodates expert demonstration data, thereby severely limiting exploration risks by preventing the robot from performing dangerous actions altogether, by anchoring actions in imitation. Second, reward design is entirely unnecessary in BC, as demonstrations already reflect human intent. The absence of rewards also prevents the risk of misalignment and specification gaming (*reward hacking*), otherwise inherent in purely reward-based RL @heessEmergenceLocomotionBehaviours2017. Third, because expert trajectories encode terminal conditions, success detection and resets are implicit in the dataset. Finally, empirical evidence suggests the performance of BC scales naturally with growing corpora of demonstrations collected across tasks, embodiments, and environments. Nonetheless, BC can, in principle, only reproduce behaviors that are at best as good as those of the demonstrator, and therefore offers no remedy for the suboptimal decisions that humans may enact. This limitation is particularly problematic in sequential decision-making tasks where expert demonstrations are scarce---either because data collection is costly or because human performance is inherently suboptimal. Yet, many robotics applications still benefit from relatively inexpensive pipelines for collecting high-quality human-generated trajectories, justifying the use of BC in such settings.
+
+
+ + +
Point-wise policies suffer from limitations due to (A) covariate shifts and (B) poor approximation of multimodal demonstrations. (A) Small errors may drive the policy out of distribution, incurring a vicious circle ultimately resulting in failure. (B) Both modes of reaching for a target object in the scene--either left or right-first--are equally as good and thus equally as likely to be present in a dataset of human demonstrations, ultimately resulting in multimodal demonstrations.
+
+
+While conceptually elegant, *point-estimate policies* $f : \mathcal O\mapsto \mathcal A$ learned by solving eq. [loss-minimization-SL] have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure [ch4-issues-with-bc] illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $\mathcal D$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $\epsilon$-prediction errors $0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon$ can quickly drive the policy into out-of-distribution states, incurring less confident generations and thus compounding errors (Figure [ch4-issues-with-bc], left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving real-world robotics problems, as multiple trajectories can be equally as good towards the accomplishment of a goal (e.g., symmetric grasps, Figure [ch4-issues-with-bc], right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the *generative model* $p(o, a)$ underlying the samples in $\mathcal D$, rather than explicitly learning a prediction function $f: a = f(o)$.
+
+### A (Concise) Introduction to Generative Models
+
+Generative Models (GMs) aim to learn the stochastic process underlying the very generation of the data collected, and typically do so by fitting a probability distribution that approximates the unknown *data distribution*, $p$. In keeping with the GM literature, $p(x) \leftarrow \mathbb P(x), x \sim p$. 
In the case of BC, the unknown data distribution $p$ may represent the expert’s joint distribution over $(o, a)$-pairs. Thus, given a finite set of $N$ pairs $\mathcal D = \{ (o,a)_i \}_{i=0}^N$ available as an imitation learning target (and thus assumed to be i.i.d.), GMs seek to learn a *parametric* distribution $p_\theta(o,a)$ such that (1) new samples $(o,a) \sim p_\theta(\bullet)$ resemble those stored in $\mathcal D$, and (2) high likelihood is assigned to the *observed* regions of the *unobservable* $p$. Likelihood-based learning provides a principled training objective to achieve both goals, and it is thus extensively used in GMs @prince2023understanding. + +#### Variational Auto-Encoders + +
+ + +
Intuitively, the latent variable in a single-latent model may contain information regarding the task being performed, which directly results in the likelihood of the same observation-action pair being different for two different tasks. When (A) picking a block, the likelihood of a wide gripper’s opening should be higher than a narrower one, while it should be the opposite when (B) pushing the block.
+
+
+A common inductive bias used in GM posits samples $(o,a)$ are influenced by an unobservable latent variable $z \in Z$, resulting in:
+``` math
+\htmlId{BC-latent-variable}{p (o,a) = \int_{\operatorname{supp}({Z})} p(o,a \vert z) p(z)}
+```
+Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics application, $z$ could be interpreted as some high level representation of the underlying task being performed by the human demonstrator. In such case, treating $p(o,a)$ as a marginalization over $\operatorname{supp}({Z})$ of the complete joint distribution $p(o,a,z)$ natively captures the effect different tasks have on the likelihood of observation-action pairs. Figure [ch4-task-effect-on-pairs] graphically illustrates this concept in the case of a (A) picking and (B) pushing task, for which, nearing the target object, the likelihood of actions resulting in opening the gripper--the higher $q_6$, the wider the gripper’s opening--should intuitively be (A) high or (B) low, depending on the task performed. While the latent space $Z$ typically has a much richer structure than the set of all actual tasks performed, eq. [BC-latent-variable] still provides a solid framework to learn joint distributions conditioned on unobservable yet relevant factors. Figure [ch4-latent-variable-model] represents this latent-variable framework in the context of a robotics application: the true, $z$-conditioned generative process assigns *likelihood* $p((o,a) \vert z)$ to the single $(o,a)$-pair. Using Bayes’ theorem, one can reconstruct the *posterior* distribution on $\operatorname{supp}({Z})$, $q_\theta(z \vert o,a)$ from the likelihood $p_\theta(o,a \vert z)$, *prior* $p_\theta(z)$ and *evidence* $p_\theta(o,a)$. VAEs approximate the latent variable model presented in eq. [BC-latent-variable] using an *approximate posterior* $q_\phi(z \vert o,a)$ while regressing parameters for a parametric likelihood, $p_\theta(o,a \vert z)$ (Figure [ch4-latent-variable-model]).
+ + +
(A) The latent variable model in a robotics application regulates influence between observed (o, a) variables and an unobservable latent variable. (B) VAEs approximate exact latent variable models by means of variational inference.
+
+
+Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can be written as:
+
+
+
+``` math
+\begin{align}
+    \log p_\theta(\mathcal D) &= \sum_{i=0}^N \log p_\theta ((o,a)_i) \\
+    &= \sum_{i=0}^N \log \int_{\operatorname{supp}({Z})} p_\theta((o,a)_i \vert z) p(z) \\
+    &= \sum_{i=0}^N \log \int_{\operatorname{supp}({Z})} \frac{q_\theta(z \vert (o,a)_i)}{q_\theta(z \vert (o,a)_i)} \cdot p_\theta((o,a)_i \vert z) p(z) \\
+    &= \sum_{i=0}^N \log \mathbb E_{z \sim q_\theta(\bullet \vert (o,a)_i)} [\frac{p(z)}{q_\theta(z \vert (o,a)_i)} \cdot p_\theta((o,a)_i \vert z)],
+\end{align}
+```
+where we used eq. [BC-latent-variable] in eq. [evidence-definition-1], multiplied by $1 = \frac{q_\theta(z \vert (o,a)_i)}{q_\theta(z \vert (o,a)_i)}$ in eq. [evidence-definition-2], and used the definition of expected value in eq. [evidence-definition].
+
+In the special case where one assumes distributions to be tractable, $p_\theta (\mathcal D)$ is typically tractable too, and $\max_\theta \log p_\theta(\mathcal D)$ provides a natural target for (point-wise) inferring the unknown parameters $\theta$ of the generative model. Unfortunately, eq. [evidence-definition] is rarely tractable when the distribution $p$ is modeled with approximators such as neural networks, especially for high-dimensional, unstructured data.
+
+In their seminal work on Variational Auto-Encoders (VAEs), @kingma2013auto present two major contributions to learn complex latent-variable GMs from unstructured data, proposing (1) a tractable, variational lower-bound to eq. [evidence-definition] as an optimization target to jointly learn likelihood and posterior and (2) using high-capacity function approximators to model the likelihood $p_\theta(o,a\vert z)$ and (approximate) posterior distribution $q_\phi(z \vert o,a) \approx q_\theta(z \vert o,a)$.
+ +In particular, the lower bound on eq. [evidence-definition] (Evidence LOwer Bound, *ELBO*) can be derived from eq. [evidence-definition] applying Jensen’s inequality--$\log \mathbb{E}[\bullet] \geq \mathbb{E} [\log (\bullet)]$--yielding: + + + +``` math +\begin{align} + \log p_\theta(\mathcal D) &\geq \sum_{i=0}^{N} \left( + \mathbb{E}_{z \sim q_\theta(\bullet \vert (o,a)_i)} \big[ \log p_\theta((o,a)_i \vert z) \big] + + \mathbb{E}_{z \sim q_\theta(\bullet \vert (o,a)_i)} [\log \left( \frac{p(z)}{q_\theta(z \vert (o,a)_i)} \right)] + \right) \\ + &= \sum_{i=0}^{N} \left( + \mathbb{E}_{z \sim q_\theta(\bullet \vert (o,a)_i)} \big[ \log p_\theta((o,a)_i \vert z) \big] + - \text{D}_{\text{KL}}\big[ q_\theta(z \vert (o,a)_i) \Vert p(z) \big] + \right) +\end{align} +``` + +The true, generally intractable, posterior $q_\theta (z \vert o,a)$ prevents computing both the expectation and KL divergence terms in eq. [ELBO-intractable], and therefore @kingma2013auto propose deriving the ELBO using an *approximate* posterior $q_\phi(z \vert o,a)$, resulting in the final, tractable, ELBO objective, + + + +``` math +\begin{align} +\text{ELBO}_{\mathcal D}(\theta, \phi) = \sum_{i=0}^{N} \left( + \mathbb{E}_{z \sim q_\phi(\bullet \vert (o,a)_i)} \big[ \log p_\theta((o,a)_i \vert z) \big] + - \text{D}_{\text{KL}}\big[ q_\phi(z \vert (o,a)_i) \Vert p(z) \big] + \right) + +\end{align} +``` +From Jensen’s inequality, maximizing ELBO results in maximizing the log-likelihood of the data too, thus providing a natural, tractable optimization target. Indeed, expectations can be estimated using MC estimates from the learned distributions in eq. 
[ELBO], while the KL-divergence term can typically be computed in closed-form (1) modeling $q_\phi$ as a Gaussian $q_\phi(z \vert o,a) = \mathcal N\big(\mu_\phi(o,a), \Sigma_\phi(o,a) \big)$ with learned mean vector $\mu_\phi(o,a)$ and learned variance-covariance matrix $\Sigma_\phi(o,a)$ and (2) imposing a standard Gaussian prior on the latent space, $p(z) = \mathcal N(\mathbf{0}, \mathbf{I})$.
+
+An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits the particularly interpretable factorization (considering, without loss of generality, only one $(o,a) \sim \mathcal D$):
+
+
+
+``` math
+\begin{align}
+\min_{\theta, \phi} - \text{ELBO}_{\mathcal (o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi), \\
+\mathbf{L^{\text{rec}}}(\theta) &= -\mathbb{E}_{z \sim q_\phi(\bullet \vert o,a)} \big[ \log p_\theta(o,a \vert z) \big] \\
+\mathbf{L^{\text{reg}}}(\phi) &= \text{D}_{\text{KL}}\big[ q_\phi(z \vert o,a) \Vert p(z) \big].
+\end{align}
+```
+
+For any given $(o,a)$ pair, the expected value term in eq. [VAE-Lrec] is typically computed via MC estimates, resulting in

``` math
-\mathbb{E}_{z \sim q_\phi(\bullet \vert o,a)} \big[ \log p_\theta(o,a \vert z) \big] = \mathbf{L^{\text{rec}}} \approx - \frac{1}{n} \sum_{i=0}^n \log p_\theta(o,a \vert z_i). 
``` -Assuming $p_\theta(o,a \vert z)$ is parametrized as an isotropic Gaussian distribution with mean $\mu_\theta (z) \in \mathbb R^d$ and variance $\sigma^2$, the log-likelihood thus simplifies to- +Assuming $p_\theta(o,a \vert z)$ to be parametrized with an isotropic Gaussian distribution with mean $\mu_\theta (z) \in \mathbb R^d$ and variance $\sigma^2$, the log-likelihood thus simplifies to- ``` math \log p(o,a \vert z_i) = -\frac{1}{2\sigma^{2}} \big \Vert (o,a)-\mu_\theta(z_i) \big\Vert_2^2 -\frac{d}{2}\log(2\pi \sigma^{2}) \implies \mathbf{L^\text{rec}} \approx \frac {1}{n} \sum_{i=0}^n \big\Vert (o,a) - \mu_\theta(z_i) \big \Vert^2_2 ``` -Indeed, it is very common in practice to approximate from the learned likelihood $p_\theta(o,a \vert z)$ as a parametric distribution (e.g. Gaussians) parametrized by some learned vector of coefficients derived from $\mu_\theta (z), \ z \sim p (\bullet)$. In all such cases, learning a VAE corresponds to optimally *reconstructing* the examples in $\mathcal D$ by minimizing the L2-error--a very common *supervised learning* objective for regression targets--while regularizing the information compression into the latent, as under the common modeling choice $p(z) = \mathcal N (\mathbf{0}, \mathbf{I})$ [VAE-Lreg] regularizes the posterior limiting the expressivity of $q_\phi(z\vert o,a)$. +In practice, it is common to approximate the learned likelihood $p_\theta(o,a \vert z)$ with a parametric distribution (e.g., Gaussian) whose parameters are given by a learned coefficient vector derived from $\mu_\theta(z), \ z \sim p(\bullet)$. Under this formulation, learning a VAE amounts to (1) *reconstructing* the examples in $\mathcal{D}$ by minimizing (1) the reconstruction loss $\mathbf{L^{\text{rec}}}$--a standard *supervised learning* objective for regression--while (2) *regularizing* the latent representation by minimizing $\mathbf{L^{\text{reg}}}$. 
The latter enforces information compression, since with the common prior choice $p(z) = \mathcal{N}(\mathbf{0}, \mathbf{I})$ in eq. [VAE-Lreg], the regularizer constrains the posterior and thereby limits the expressivity of $q_\phi(z \vert o,a)$. #### Diffusion Models -VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to [BC-latent-variable], and solve the variational inference problem of jointly learning the likelihood $p_\theta$ and (approximate) posterior $q_\phi$ for such model. In that, the unknown data distribution $p(o,a)$ is effectively approximated via $\int_Z p(z) p_\theta(o,a \vert z)$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $p(o,a)$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $o,a$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure [ch4-many-latents]), resulting in +VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to eq. [BC-latent-variable], and solve the variational-inference problem of jointly learning the likelihood $p_\theta$ and (approximate) posterior $q_\phi$ for such model. 
In that, the unknown data distribution $p(o,a)$ is effectively approximated via $\int_Z p(z) p_\theta(o,a \vert z)$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a high-likelihood sample under the (unknown) $p(o,a)$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $o,a$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure [ch4-many-latents]), resulting in ``` math \begin{align} - p(\underbrace{o,a}_{= z_0}) &= \int_{\text{supp}({Z_0})} \int_{\text{supp}({Z_1})} \ldots \int_{\text{supp}({Z_T})} p(z_0, z_1, \dots z_T) \\ - p(z_0, z_1, \dots z_T) &= p(z_T) \prod_{t=0}^{T} p(z_{t-1} \vert z_t), + p(\underbrace{o,a}_{= z_0}) &= \int_{\operatorname{supp}({Z_0})} \int_{\operatorname{supp}({Z_1})} \ldots \int_{\operatorname{supp}({Z_T})} p(z_0, z_1, \dots z_T) \\ + p(z_0, z_1, \dots z_T) &= p(z_T) \prod_{t=1}^{T} p(z_{t-1} \vert z_t), \end{align} ``` -where we explicitly showed the marginalization over the multiple latents in [BC-multi-latent-model-1], and used the law of conditional probability and Markov property in [BC-multi-latent-model-2]. +where we explicitly showed the marginalization over the multiple latents in eq. [BC-multi-latent-model-1], and used the law of conditional probability and Markov property in eq. [BC-multi-latent-model-2]. Also, for ease of notation, we will refer to observation-action pairs $o,a$ as $z_0$.
HMLV models posit the data generation process is influenced by a stack of Markov-dependent latent variables, with samples from the posterior distribution being progressively higher up in the hierarchy.
-Similarily to VAEs, providing an exact interpretation for the latent variables is typically not possible. Still, one fairly reasonable application-driven intuition is that, by providing a model of the hierarchical, decoupled interaction of latent variables, Hierarchical Markov Latent Variable (HMLV) models attempt to capture the different resolutions at which different conditioning factors intervene, so that in a robotics application for instance, one could naturally distinguish between early-stage trajectory planning ($t \to T$) and fine-grained adjustments ($t \to 0$). In that, HMLV models thus provide a framework to perform variational inference via multiple, sequential sampling steps from different higher level distributions instead of approximating the generative process with a single-latent variable model. DMs are a particular instantiation of HMLV models for which the posterior $q( z_t \vert z_{t-1}) = \mathcal N(z_t \sqrt{1-\beta_t}, \beta_t \mathbf{I})$ for a given $\beta_t \in \mathbb R^+$, thereby iteratively reducing the signal-to-noise ratio as $\beta_t$ increases along the latents hierarchy. +Similar to VAEs, it is generally not possible to assign an *exact* interpretation to the latent variables. Nevertheless, a reasonable application-driven intuition is that Hierarchical Markov Latent Variable (HMLV) models, by capturing hierarchical and decoupled interactions among latent variables, can reflect the different resolutions at which conditioning factors intervene. For example, in a robotics setting, one might naturally distinguish between high-level trajectory planning (higher up in the hierarchy, $t \to T$) and fine-grained motion adjustments (closer to empirical observations, $t \to 0$). In that, HMLV models thus provide a framework to perform variational inference via multiple, sequential sampling steps from different higher level distributions instead of approximating the generative process with a single-latent variable model. 
DMs are a particular instantiation of HMLV models for which the posterior is fixed to $q( z_t \vert z_{t-1}) = \mathcal N(z_t \sqrt{1-\beta_t}, \beta_t \mathbf{I})$, for a given $\beta_t \in \mathbb R^+$. In practice, $\beta_t$ is used to iteratively reduce the signal-to-noise ratio along the latents’ hierarchy, similarly to how a diffusion process influences the information of a physical system.
+
+Just like VAEs, DMs attempt to learn to reproduce an underlying data distribution $p (o,a)$ given a collection of i.i.d. samples approximating the model posited to have generated the data in the first place (eq. [BC-multi-latent-model-1]). Similarly to VAEs, DMs approximate the process of sampling from the unknown $p(o,a)$ by (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution. However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $p(o,a)$. Crucially, as no information from the sample $(o,a)$ (denoted as $z_0 \equiv (o,a)$ for simplicity of notation) is assumed to be propagated throughout the chain of latents, the posterior $q(z_t \vert z_{t-1})$ assumes a relatively amicable structure in DMs, reducing complexity. The *true* likelihood $p(z_{t-1} \vert z_t)$ is instead typically approximated using the parametrization $p_\theta (z_{t-1} \vert z_t)$. In that, the information contained in the unknown data distribution is *reconstructed* via a process in which samples from a fixed distribution are iteratively turned into (ideally) high-likelihood samples under $p(o,a)$--a process referred to as *denoising*.
+
+Under such a model, we can express the log-likelihood of an arbitrary sample $z_0$ as:
+
+
+

-Just like VAEs, DMs attemp to learn to reproduce an underlying data distribution $p (o,a)$ given a collection of i.i.d. 
samples approximating the model posited to have generated the data in the first place ( [BC-multi-latent-model-1]). Similarily to VAEs, DMs approximate the process of sampling from the unknown $p(o,a)$ (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution. However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $p(o,a)$. Crucially, as no information from the sample $(o,a)$ (denoted as $z_0 \equiv (o,a)$ for the sake of notation) is assumed to be propagated throughout the chain of latents, the posterior $q(z_t \vert z_{t-1})$ assumes a relatively amicable structure in DMs, reducing complexity. The *true* likelihood $p(z_{t-1} \vert z_t)$ is instead typically approximated using the parametrization $p_\theta (z_{t-1} \vert z_t)$. In that, the information contained in the unknwon data distribution is *reconstructed* via a process in which samples from a fixed distribution are turned into (ideally) high-likelihood samples under $p(o,a)$--a process referred to as *denoising*. 
+``` math +\begin{align} + \log p_\theta (z_0) &= \log \int_{\operatorname{supp}({Z_1}) \times \operatorname{supp}({Z_2}) \times \dots \times \operatorname{supp}({Z_T})} p_\theta(\underbrace{z_0, z_1, z_2, \dots z_T}_{z_{0:T}}) \\ + &= \log \int_{\operatorname{supp}({Z_{1:T}})} \frac{p_\theta(z_{0:T}) \cdot q(z_{1:T} \vert z_0)}{q(z_{1:T} \vert z_0)} \\ + &= \log \mathbb{E}_{z_{1:T} \sim q(\bullet \vert z_0)} \bigg[ \frac{p_\theta(z_{0:T})}{q(z_{1:T} \vert z_0)} \bigg] \\ + &\geq \mathbb{E}_{z_{1:T} \sim q(\bullet \vert z_0)} \bigg[ \log \frac{p_\theta(z_{0:T})}{q(z_{1:T} \vert z_0)} \bigg] \\ + &= \mathbb{E}_{z_{1:T} \sim q(\bullet \vert z_0)} \bigg[ \log \frac{p(z_T) \prod_{t=1}^{T} p_\theta (z_{t-1} \vert z_t)}{\prod_{t=1}^T q(z_t \vert z_{t-1})} \bigg] \\ + &= \mathbb{E}_{z_{1:T} \sim q(\bullet \vert z_0)} \bigg[ \log \frac{p(z_T) \cdot p_\theta (z_0 \vert z_1) \prod_{t=2}^{T} p_\theta (z_{t-1} \vert z_t)}{q(z_T \vert z_{T-1}) \prod_{t=1}^{T-1} q(z_t \vert z_{t-1})} \bigg] \\ + &= \mathbb{E}_{z_{1:T} \sim q(\bullet \vert z_0)} \bigg[ \log \frac{p(z_T) \cdot p_\theta (z_0 \vert z_1) \prod_{t=1}^{T-1} p_\theta (z_{t} \vert z_{t+1})}{q(z_T \vert z_{T-1}) \prod_{t=1}^{T-1} q(z_t \vert z_{t-1})} \bigg] \\ + &= + \mathbb{E}_{z_{1:T} \sim q(\bullet \vert z_0)} \bigg[ \log \frac{p(z_T) \cdot p_\theta (z_0 \vert z_1)}{q(z_t \vert z_{t-1})} \bigg] + + \mathbb{E}_{z_{1:T} \sim q(\bullet \vert z_0)} \bigg[ \log \prod_{t=1}^{T-1} \frac{p_\theta (z_{t} \vert z_{t+1})}{q(z_t \vert z_{t-1})}\bigg] + \\ + &= + \mathbb{E}_{z_{1:T} \sim q(\bullet \vert z_0)} \big[ \log p_\theta (z_0 \vert z_1) \big] + + \mathbb{E}_{z_{1:T} \sim q(\bullet \vert z_0)} \bigg[ \log \frac{p (z_T)}{q(z_T \vert z_{T-1})} \bigg] + + \sum_{t=1}^{T-1} \mathbb{E}_{z_{1:T} \sim q(\bullet \vert z_0)} \bigg[ \log \frac{p_\theta (z_{t} \vert z_{t+1})}{q(z_t \vert z_{t-1})}\bigg] + \\ + &= + \mathbb{E}_{z_1 \sim q(\bullet \vert z_0)} \big[ \log p_\theta (z_0 \vert z_1) \big] + + \mathbb{E}_{z_{T-1:T} \sim 
q(\bullet \vert z_0)} \bigg[ \log \frac{p (z_T)}{q(z_T \vert z_{T-1})} \bigg] + + \sum_{t=1}^{T-1} \mathbb{E}_{z_{t-1:t+1} \sim q(\bullet \vert z_0)} \bigg[ \log \frac{p_\theta (z_{t} \vert z_{t+1})}{q(z_t \vert z_{t-1})}\bigg] + \\ + &= \mathbb{E}_{z_1 \sim q(\bullet \vert z_0)} \log p_\theta (z_0 \vert z_1) - \mathbb{E}_{z_{T-1} \sim q(\bullet \vert z_0)} \big[ \text{D}_{\text{KL}}(q(z_T \vert z_{T-1}) \Vert p(z_T) ) \big] \\ + &- \sum_{t=1}^{T-1} \mathbb{E}_{(z_{t-1}, z_{t+1}) \sim q(\bullet \vert z_0)} \big[ \text{D}_{\text{KL}}(q(z_t \vert z_{t-1}) \Vert p_\theta(z_t \vert z_{t+1}) ) \big], \notag +\end{align} +``` +where we: used eq. [BC-multi-latent-model-1] and multiplied by $1 = \tfrac{q(z_{1-T} \vert z_0)}{q(z_{1:T} \vert z_0)}$ in eq. [diffusion-1]; used Jensen’s inequality in eq. [diffusion-jensen]; used the law of conditional probability for both numerator and denominator in eq. [diffusion-2]; stepped forward and backward the products in the numerator and denominator products in eq. [diffusion-3], respectively; reindexed the product terms in eq. [diffusion-4]; removed out-of-expectation variables in eq. [diffusion-expectation-indices]; used the defintion of KL-divergence in eq. [diffusion-likelihood]. In turn, eq. [diffusion-likelihood] provides an optimization target to *learn* $p_\theta$ solving $\max_\theta \log p_\theta (\mathcal D)$. -Under such model, we can express the log-likelihood of an arbitrary sample as[^4] - +In their seminal work on using DMs for variational inference, @hoDenoisingDiffusionProbabilistic2020 introduce major contributions regarding solving $\min_\theta -\log p_\theta(z_0)$. In particular, @hoDenoisingDiffusionProbabilistic2020 exclusively adopt a *fixed, isotropic Gaussian posterior* in the form of $q(z_t \vert z_{t-1}) = \mathcal{N}(\sqrt{1-\beta_t}z_{t-1}, \beta_t \mathbf I)$. The choice of adopting Gaussians has profound implications on the generative process modeled. 
Indeed, under the (mild) assumption that the variance is sufficiently small $\beta_t \leq \eta, \eta \in \mathbb R^+$, @sohnLearningStructuredOutput2015 proved that the likelihood $p(z_{t-1} \vert z_t)$ is Gaussian as well, which allows for the particularly convenient parametrization of the approximate likelihood $p_\theta (z_{t-1} \vert z_t) = \mathcal N(\mu_\theta(z_t, t), \Sigma_\theta(z_t,t)), \ t \in [1,T]$, as well as for closed-form tractability of the KL-divergence terms in eq. [diffusion-likelihood]. Further, the posterior’s structure also enables the analytical description of the distribution of the $t$-th latent variable, $q(z_t \vert z_0) = \mathcal N (\sqrt{\bar{\alpha}_t}z_0, (1-\bar{\alpha}_t) \mathbf{I})$, with $\alpha_t = 1-\beta_t, \ \bar \alpha_t = \prod_{k=1}^t \alpha_k$, conveniently preventing iterative posterior sampling simplifying computing eq. [diffusion-likelihood]. It follows- + ``` math \begin{align} - \log p_\theta (\underbrace{o,a}_{= z_0}) = - &\mathbb{E}_{z_1 \sim q(\bullet \vert z_0)} \log p_\theta (z_0 \vert z_1) - \\ - &\mathbb{E}_{z_{T-1} \sim q(\bullet \vert z_0)} \big[ \text{D}_{\text{KL}}(q(z_T \vert z_{T-1}) \Vert p(z_T) ) \big] - \notag \\ - &\sum_{t=1}^{T-1} \mathbb{E}_{(z_{t-1}, z_{t+1}) \sim q(\bullet \vert z_0)} \big[ \text{D}_{\text{KL}}(q(z_t \vert z_{t-1}) \Vert p_\theta(z_t \vert z_{t-1}) ) \big], \notag + \nabla_\theta \log p_\theta (z_0) = \mathbb E_{z_1 \sim q(\bullet \vert z_0)} \nabla_\theta \log p_\theta (z_0 \vert z_1) - \sum_{t=1}^{T-1} \mathbb E_{z_{t-1}, z_{t+1} \sim q(\bullet \vert z_0)} \nabla_\theta \text{D}_{\text{KL}}(q(z_t \vert z_{t-1}) \Vert p_\theta(z_t \vert z_{t+1}), \end{align} ``` -providing an optimization target in the form of $\max_\theta \log p_\theta (\mathcal D)$. - -In their seminal work on using DMs for variational inference, @hoDenoisingDiffusionProbabilistic2020 introduce major contributions regarding solving $\min_\theta -\log p_\theta(o,a)$. 
In particular, @hoDenoisingDiffusionProbabilistic2020 exclusively adopt a fixed *Gaussian* posterior in the form of $q(z_t \vert z_{t-1}) = \mathcal{N}(\sqrt{1-\beta_t}z_{t-1}, \beta_t \mathbf I)$. The choice of adopting Gaussians has profound implications on the generative process modeled. Indeed, under the (mild) assumption that the variance is sufficiently small $\beta_t \leq \eta, \eta \in \mathbb R^+$, @sohl-dicksteinDeepUnsupervisedLearning2015 proved that the likelihood $p(z_{t-1} \vert z_t)$ is Gaussian as well, which allows for the particularly convenient parametrization of the approximate likelihood $p_\theta (x_{t-1} \vert x_t) = \mathcal N(\mu_\theta(x_t, t), \Sigma_\theta(x_t,t)), \ t \in [1,T]$, as well as for closed-form tractability of the KL-divergence terms in [diffusion-likelihood]. Further, the posterior’s structure also enables an analytical description for the distribution of the $t$-th latent variable, $q(z_t \vert z_0) = \mathcal N (\sqrt{\bar{\alpha}_t}z_0, (1-\bar{\alpha}_t) \mathbf{I})$, with $\alpha_t = 1-\beta_t, \ \bar \alpha_t = \prod_{k=1}^t \alpha_k$, which conveniently prevents iterative posterior sampling. +where the former term is equivalent to the reconstruction term in eq. [VAE-min-neg-ELBO] and the latter term can be obtained in closed form.
DMs iteratively corrupt samples (left) from an unknown distribution into a quasi-standard Gaussian (center), learning the displacement field (right) that permits reconstructing samples from the unknown target distribution by iteratively denoising samples of a tractable, easy-to-sample distribution.
-Finally, adopting Gaussian posteriors permits a particularly pleasing interpretation of the dynamics of training DMs @permenterInterpretingImprovingDiffusion2024. By using Gaussian posteriors, the hierarchical latent variables effectively lose increasingly more information circa the original (unknown) distribution’s sample, $z_0$, increasingly distributing according to a standard Gaussian and thus containing no information at all (Figure [diffusion-robot-actions]). Figure [diffusion-robot-actions] illustrates this procedure on a simplified, bidimensional observation-action distribution, where we considered $o=q_2$ and $a=q^h_2$, with $q_2$ representing the robot’s *elbow flex* actuation and $q^h_2$ the human teleoperator’s robot elbow flex. - -
- - -
A joint action-observation distribution, in the simplified case where the observation is the elbow-flex actuation in an SO-100, and the action is the recorded position for the same joint in the teleoperator arm. The recorded motion being teleoperated, the points distribute along the diagonal.
-
- -Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accouting for minor control inconsistencies (Figure [ch4-action-vs-observation-distribution]). Using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure [diffusion-robot-actions], $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples, and comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion. Under the only assumption that the likelihood of the diffused samples is low under the original unknown data distribution, then one can effectively approximate the unkwown distribution by learning to *reverse* such displacement. This key intuition allows to write a simplified training objective: +Besides mathematical tractability of eq. [diffusion-likelihood-gradient], adopting Gaussian posteriors allows for a particularly intuitive interpretation of the training dynamics of DMs @permenterInterpretingImprovingDiffusion2024. As the hierarchical latent variables are repeatedly corrupted by applying increasingly more Gaussian noise, they progressively lose information about the original (unknown) sample $z_0$, converging toward a standard Gaussian which eventually contains no information at all (Figure [diffusion-robot-actions]). Figure [diffusion-robot-actions] illustrates this process on a simplified, bidimensional observation-action distribution, where we considered $o=q_2$ and $a=q^h_2$, with $q_2$ denoting the robot’s *elbow flex* actuation and $q^h_2$ the corresponding human teleoperator’s elbow flex. 
Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure [ch4-action-vs-observation-distribution]). Notice how corrupted samples distribute differently from the most reasonable structure $a \simeq o$, further underscoring how diffusion corrupts both the individual samples and the global distribution (Figure [diffusion-robot-actions], left and center). In this, using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure [diffusion-robot-actions], $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples. Comparing the diffused samples to the original data points, one can derive an estimate of the total displacement induced by the diffusion process, and, under the assumption that the likelihood of the totally diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by *learning to reverse* such displacement. This key intuition allows writing a simplified training objective[^4]: @@ -1002,10 +1338,22 @@ Because the recorded behavior is teleoperated, measurements mostly distribute al \end{align} ``` -In this simplified (minimization) objective, the optimization process differs from [diffusion-likelihood] in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\}$ )) diffusion process starting from a sample of the target distribution. +
+ + +
A joint action-observation distribution, in the simplified case where the observation is the elbow-flex actuation in an SO-100, and the action is the recorded position for the same joint from the teleoperator arm. The recorded motion being teleoperated, the points distribute along the diagonal.
+
+ +In this simplified (minimization) objective, the optimization process differs from eq. [diffusion-likelihood] in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution. -By learning the total displacement from a generally, uninformative corrupted sample obtained diffusing information and a sample from an unknown distribution--significant ($\Vert \epsilon \Vert > 0$) whenever input and target distribution are sufficiently different-- @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution reversing the displacement, *denoising* samples. Interestingly, under the hypothesis real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-point manifold (such as perturbed, uniformative samples), and the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $p_\theta (z_{t-1} \vert z_t)$ is Gaussian, then sampling $z_{t-1} \sim p_\theta(\bullet \vert z_{t})$ corresponds to computing - +By learning the total displacement from a generally, uninformative corrupted sample obtained diffusing information and a sample from an unknown distribution @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution reversing the displacement, *denoising* samples. 
Interestingly, under the hypothesis that real-world data belongs to a single, higher-dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-point manifold (such as perturbed, uniformative samples), and the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $p_\theta (z_{t-1} \vert z_t)$ is Gaussian, sampling $z_{t-1} \sim p_\theta(\bullet \vert z_{t})$ corresponds to computing- + ``` math @@ -1013,26 +1361,26 @@ By learning the total displacement from a generally, uninformative corrupted sam z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}), \end{align} ``` -thus showing that the lower-level latent variables in a DM can be obtained by iteratively removing noise from the one-step higher order variable, using the noise regressor $\epsilon_\theta(z_t, t)$ learned minimizing [diffusion-simplified-loss]. +thus showing that the lower-level latent variables in a DM can be obtained by iteratively removing noise from the one-step higher order variable, using the noise regressor $\epsilon_\theta(z_t, t)$ learned minimizing eq. [diffusion-simplified-loss]. #### Flow Matching -The posterior parametrization adopted by DMs proved traditionally effective, yet it raised concerns circa its efficiency at inference time, where a possibly large of compute-expensive denoising steps are needed in order to recover a sample from the target distribution. 
Flow Matching (FM) @lipmanFlowMatchingGenerative2023 extends DMs to the general case of arbitrary, parametrized likelihood and posteriors, and in this defines a superseding class of GMs providing a unified framework for learning *continuous transformations* between distributions, encompassing and generalizing DMs. Instead of a *stochastic, discrete, multi-step* denoising process, FM aims to learn a *deterministic, continuous, differentiable flow* $\psi [0,1] \times Z \mapsto Z$, formalized starting from possibly time-dependent vector field $v: [0,1] \times Z \mapsto Z$ transporting samples from a simple prior distribution $p_0$--e.g., a standard Gaussian--to a more complex, potentially unknown data distribution $p_1$ over time. Note how FM models time $t \in [0,1]$ to be varying continuously while moving away *from* an easy-to-sample distribution $p_0$ *towards* the unknown data-distribution, $p_1$. This results in a continuous and deterministic trajectory for each sample, which can be more efficient to generate compared to the stochastic paths of DMs. Formally, FM can be fully characterized by an ordinary differential equation (ODE) relating instantaneous variations of flows with the underlying vector field, and hence providing complete trajectories over the distributions’ support when integrating over time, +The posterior parametrization adopted by DMs proved traditionally effective, yet it raised concerns circa its *efficiency* at inference time, where a possibly large number (hundreds) of compute-expensive denoising steps are needed in order to recover a sample from the target distribution. Flow Matching (FM) @lipmanFlowMatchingGenerative2023 extends DMs to the general case of arbitrary likelihood and posteriors, and in this defines a superseding class of GMs providing a unified framework for learning *continuous transformations* between distributions, encompassing and generalizing DMs. 
Instead of a *stochastic, discrete, multi-step* denoising process, FM aims to learn a *deterministic, continuous, differentiable flow* $\psi: [0,1] \times Z \mapsto Z$, formalized starting from a (possibly time-dependent) vector field $v: [0,1] \times Z \mapsto Z$ *transporting over time* samples from a simple prior distribution $p_0$--e.g., a standard Gaussian--to a more complex, typically unknown data distribution $p_1$. In this, FM accomodates for arbitrary intermediate distributions, breaking free from the particular case where posterior and likelihood are exclusively Gaussians. Note also how FM models time $t \in [0,1]$ to be varying continuously while moving away *from* an easy-to-sample distribution $p_0$ *towards* the unknown data-distribution, $p_1$. This results in a continuous (and deterministic) trajectory at inference, which is in practice more efficient compared to following stochastic paths like in DMs. Formally, FM can be fully characterized by an ordinary differential equation (ODE) relating instantaneous variations of flows with the underlying vector field, and hence providing complete trajectories over the distributions’ support when integrating over time, ``` math \begin{align} - \frac{d}{dt} \psi(z, t) &= v(t, \psi(t, z)) \\ - \psi(0, z) &= z + \frac{d}{dt} \psi(z, t) &= v(t, \psi(t, z)), \\ + \psi(0, z) &= z . \end{align} ``` +In practice, flow models learn to approximate these dynamics by estimating a vector field $v$ that matches the true, unknown $u$, so that the induced flows $\psi$ can approximate the ideal trajectories $\psi^*$. -FM proved very effective in a variety of applications, ranging from image @esserScalingRectifiedFlow2024 and video generation @polyakMovieGenCast2025 to robotics control @blackp0VisionLanguageActionFlow2024. 
Most notably, in their introductory work on FM for GM, @lipmanFlowMatchingGenerative2023 show how DMs can be seen as a specific instance of FM where the *conditional* target vector field $u$ approximated by the noise regressor corresponds to - +FM proved very effective in a variety of applications, ranging from image @esserScalingRectifiedFlow2024 and video generation @polyakMovieGenCast2025 to robotics control @blackp0VisionLanguageActionFlow2024. Most notably, in their introductory work on FM for GM, @lipmanFlowMatchingGenerative2023 show how DMs can be seen as a specific instance of FM where the *conditional* target vector field $v$ learned by the noise regressor ${\varepsilon}_\theta$ corresponds to: ``` math -\htmlId{fm-diffusion-vector-field}{u(t, z\vert z_0) = \frac{\frac{d}{dt}\alpha(1-t)}{1 - (\alpha(1-t))^2}(\alpha(1-t)z - z_0), \quad \alpha(t) = e^{-\frac12 \int_0^t \beta(s) ds}, \quad \forall z_0 \in \mathcal D} +\htmlId{fm-diffusion-vector-field}{u(t, z\vert z_0) = \frac{\frac{d}{dt}\alpha(1-t)}{1 - (\alpha(1-t))^2}(\alpha(1-t)z - z_0), \quad \alpha(t) = e^{-\frac12 \int_0^t \beta(s) ds}, \quad \forall z_0 \in \mathcal D.} ``` -Note that the traditional discrete-time noise-scheduler ${\beta_t}_{t=0}^T$ is now generalized to a continuous map $\beta : [0,1] \mapsto \mathbb R^+$. Crucially, @lipmanFlowMatchingGenerative2023 prove that by exclusively optimizing the vector field for individual data points $z_0 \in \mathcal D$ individually, one also retrieves the optimal flow to morph the entire support of the initial distribution $p_0$ into $p_1 \ \text{s.t.} \mathcal D \sim p_1$. +Conditional vector fields are defined not only over their argument $z$ and time $t$, but do also vary with respect to an auxiliary variable $z_0$, thereby extending the standard notion of a vector field to incorporate additional conditioning. 
Note that the traditional discrete-time noise-scheduler $\{\beta_t\}_{t=0}^T$ is now generalized to a continuous map $\beta : [0,1] \mapsto \mathbb R^+$. Crucially, @lipmanFlowMatchingGenerative2023 prove that by exclusively optimizing the vector field for individual data points $z_0 \in \mathcal D$, one also retrieves the optimal flow to morph the entire support of the initial distribution $p_0$ into $p_1 \ \text{s.t.} \mathcal D \sim p_1$.
-
Probability distributions can be modified by applying vector fields, resulting in a flow of mass in the support. When acting over time, vector fields can effectively change the distribution’s structure.
+
Probability distributions can be modified differently by applying different vector fields, inducing different flows of mass across the same support (top versus bottom, using two different time-invariant 2D-fields $u_1(x, y) = (x, 0)$ and $u_2(x,y) = (x/\sqrt{2}, y/\sqrt{2})$). Notice time flows continuously in $[0, 1]$. FM models learn to approximate a target vector field, thereby producing arbitrary (goal) transformations of an easy-to-sample initial distribution.
-While the noising schedule of DMs results in a stochastic process that resembles a random walk, FM allows for more general--potentially, deterministic--likelihood and posterior parametrization. In the FM literature the likelihood and posterior probability densities defined along a HMLV model are typically jointly referred to as a *probability path*, where the distributions for successive adjacent transitions in the HMLV model are related by the (normalized) flow between them (Figure [ch4-normalizing-flows]). The inherent flexibility of FM is one of their key advantages over DMs, as it opens up the possibility of *learning* more efficient paths. For instance, one can design probability paths inspired by Optimal Transport (OT)--a subdiscipline studying the problem of finding the most efficient way to morph one probability distribution into another. Probability paths obtained through OT paths tend to be *straighter* than diffusion paths (Figure [ch4-diffusion-paths-versus-fm]), which can lead to faster and more stable training, as well as higher-quality sample generation with fewer steps at inference time. By avoiding unnecessary backtracking associated with the inherent stochastic nature of both the noising and denoising process in DMs, test-time compute is typically significantly reduced, while retaining comparable results @lipmanFlowMatchingGenerative2023. +While the noising schedule of DMs results in a stochastic process resembling a random (Brownian) walk, FM allows for more general--potentially, deterministic--likelihood and posterior parametrization. In the FM literature the likelihood and posterior probability densities defined along a HMLV model are typically referred to as a *probability path*, where the distributions for successive adjacent transitions in the HMLV model are related by the (normalized) flow between them (Figure [ch4-normalizing-flows]).
The inherent flexibility of FM is one of its key advantages over DMs, as it opens up the possibility of *learning* more efficient paths. For instance, one can design probability paths inspired by Optimal Transport (OT), a mathematical framework concerned with characterizing the most efficient morphings between probability distributions. Probability paths obtained through OT tend to be *straighter* than diffusion paths (Figure [ch4-diffusion-paths-versus-fm]), which can lead to faster and more stable training, as well as empirically result in higher-quality generations with fewer denoising steps at inference time. In particular, by avoiding unnecessary backtracking associated with the inherent stochastic nature of both the noising and denoising process in DMs, test-time compute is typically significantly reduced in FM, while retaining comparable results @lipmanFlowMatchingGenerative2023.
Compared to diffusion, flow matching distorts the distribution along a less random pattern, resulting in a clearer interpolation between source and target distribution. The visualization shows an example comparison between these two methods on the joint distribution of robot observations and actions over $T = 50$ steps.
-In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in [fm-diffusion-vector-field], while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, CFM defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, resulting in the target vector field $u(t, z_t) = z_1 - z_0$. Then, an FM model can be trained with the simple regression objective defined as +In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in eq. [fm-diffusion-vector-field], while in principle the target vector field can be learned to induce an arbitrary mass displacement, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, Conditional FM (CFM) defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, which in turn results in the target vector field $u(t, z_t) = z_1 - z_0$. FM models can then be trained with a simple regression objective defined as: @@ -1071,15 +1419,15 @@ In practice, FM can be applied to generative modeling by learning a vector field \Vert v_\theta((1-t)z_0 + t z_1, t) - (z_1 - z_0) \Vert^2 \big], \quad t \sim \mathcal{U}([0,1]), \end{align} ``` -where $z_0 \sim p_0(\bullet)$ and $z_1 \sim p_1(\bullet)$.
Note how in [flow-matching-objective]--differently from [diffusion-simplified-loss]--time is assumed to be varying continuously $t \sim \mathcal U([0,1])$ rather than discretely $t \sim \mathcal U(\{0,1\})$, a key property of flow-based models. The objective in [flow-matching-objective] directly regresses the learned vector field onto the simple, straight path connecting a point from the prior and a point from the data, providing a simulation-free training procedure that is both stable and efficient. At inference time, samples are generated by starting with $z_0 \sim p_0$ and iteratively refined according to $\frac{dz}{dt} = v_\theta(z_t, t)$ for $t \in [0,1]$--an operation that can be numerically carried out with standard ODE solvers. +where $z_0 \sim p_0(\bullet)$ and $z_1 \sim p_1(\bullet)$. Note how in eq. [flow-matching-objective]--differently from eq. [diffusion-simplified-loss]--time is assumed to be varying continuously $t \sim \mathcal U([0,1])$ rather than discretely $t \sim \mathcal U(\{0, \Delta t, 2 \Delta t, \dots, 1 \})$, a key property of flow-based models. Therefore, the objective in eq. [flow-matching-objective] directly regresses the learned vector field onto the simple, straight path connecting a point from the prior and a point from the data, providing a simulation-free training procedure that is both stable and efficient. At inference time, samples are generated by starting with $z_0 \sim p_0$ and iteratively refined according to $\frac{dz}{dt} = v_\theta(z_t, t)$ for $t \in [0,1]$--an operation that can be numerically carried out with standard ODE solvers, and that in practice is often carried out numerically via forward-Euler integrating over tens of denoising steps. ### Action Chunking with Transformers -While GMs prove useful in learning complex, high-dimensional multi-modal distributions, they do not natively address the compouding errors problem characteristic of online, sequential predictions. 
In Action Chunking with Transformers (ACT), @zhaoLearningFineGrainedBimanual2023 present an application of VAEs to the problem of learning purely from offline trajectories, introduce a simple, yet effective method to mitigate error compounding, learning high-fidelity autonomous behaviors. Drawing inspiration from how humans plan to enact atomically sequences of the kind $a_{t:t+k}$ instead of single actions $a_t$, @zhaoLearningFineGrainedBimanual2023 propose learning a GM on a dataset of input demonstrations by modeling *action chunks*. Besides contributions to learning high-performance autonomous behaviors, @zhaoLearningFineGrainedBimanual2023 also introduce hardware contributions in the form of a low-cost bimanual robot setup (ALOHA) capable of performing fine-grained manipulation tasks, such as opening a lid, slotting a battery in its allotment or even prepare tape for application. +While GMs prove useful in learning complex, high-dimensional multi-modal distributions, they do not natively address the compounding errors problem characteristic of modeling online, sequential predictions. In Action Chunking with Transformers (ACT), @zhaoLearningFineGrainedBimanual2023 present an application of VAEs to the problem of learning purely from offline trajectories, and introduce a simple, yet effective method to mitigate error compounding, learning high-fidelity autonomous behaviors via BC. Drawing inspiration from how humans plan to enact *sequences* of actions $a_{t:t+k}$ instead of single actions $a_t$, @zhaoLearningFineGrainedBimanual2023 propose learning a GM on a dataset of input demonstrations by modeling *chunks* of multiple actions directly.
Besides contributions to learning high-performance autonomous behaviors, @zhaoLearningFineGrainedBimanual2023 also introduce hardware contributions in the form of a low-cost bimanual robot setup (ALOHA) capable of performing fine-grained manipulation tasks, such as opening a lid, slotting a battery in its allotment or even prepare tape for application. Notably, ALOHA bimanual setup costs just as much as a mono-arm Franka arm and can be assembled from easy-to-source parts, underscoring its higher accessibility. -On the robot learning side of their contributions, @zhaoLearningFineGrainedBimanual2023 adopt transformers as the architectural backbone to learn a *Conditional* VAE @sohnLearningStructuredOutput2015. Conditional VAEs are a variation of the more standard VAE formulation introducing a conditioning variable on sampling from the latent prior, allowing the modeling of *one-to-many* relationships between latent and data samples. Further, in stark contrast with previous work @florenceImplicitBehavioralCloning2022, @jannerPlanningDiffusionFlexible2022, @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $p_\theta(o,a)$ on observation and actions. While the *policy* distribution $p_\theta(a \vert o)$ can in principle be entirely described from its joint $p_\theta(o,a)$, it is often the case that the conditional distribution is intractable when using function approximators, as $p_\theta(a \vert o) = \tfrac{p_\theta(o,a)}{\int_\mathcal Ap_\theta(o,a)}$ and the integral in the denominator is typically intractable. Instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE @sohnLearningStructuredOutput2015 modeling the policy distribution directly $p (a \vert o)$. 
+@zhaoLearningFineGrainedBimanual2023 do also present significant algorithmic contributions related to synthesizing performant autonomous behaviors for the ALOHA setup, adopting transformers as the architectural backbone to learn a *Conditional* VAE @sohnLearningStructuredOutput2015 from demonstrations. Conditional VAEs are a variation of the standard VAE introducing an arbitrary conditioning on sampling from the latent prior, modeling *one-to-many* relationships between latent and data samples. Further, in stark contrast with previous work @florenceImplicitBehavioralCloning2022, @jannerPlanningDiffusionFlexible2022, @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $p_\theta(o,a)$ on observation and actions, and rather focus on the conditional $p_\theta(a \vert o)$. While the *policy* distribution $p_\theta(a \vert o)$ can in principle be entirely described from the joint $p_\theta(o,a)$, conditional distributions are often intractable when using function approximators, as $p_\theta(a \vert o) = \tfrac{p_\theta(o,a)}{\int_\mathcal Ap_\theta(o,a)}$, and the integral in the denominator is typically intractable. Thus, instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE @sohnLearningStructuredOutput2015 modeling the policy distribution directly, hence approximating $p (a \vert o)$. -In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in [ELBO], which is adapted to +In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in eq. [ELBO], which is adapted to: @@ -1092,70 +1440,188 @@ In practice, when learning from demonstrations adopting CVAEs results in a sligh \right) \end{align} ``` -Notice how in [c-ELBO] we are now also learning a new set of parameters $\omega$ for the prior distribution in the latent space.
Effectively, this enables conditioning latent-space sampling (and thus reconstruction) during training, and potentially inference, providing useful when learning inherently conditional distributions like policies. Further, ACT is trained as a $\beta$-CVAE @higgins2017beta, using a weight of the KL regularization term in [c-ELBO] as an hyperparameter regulating the information condensed in the latent space, where higher $\beta$ results in a less expressive latent space. +Notice how in eq. [c-ELBO] we are now also learning a new set of parameters $\omega$ for the prior distribution in the latent space. Effectively, this enables conditioning latent-space sampling (and thus reconstruction) during training (and potentially inference too), proving useful when learning inherently conditional distributions like policies. Further, ACT is trained as a $\beta$-CVAE @higgins2017beta, weighting the KL regularization term in eq. [c-ELBO] with a hyperparameter $\beta \in \mathbb R^+$ regulating the information condensed in the latent space, where *higher* $\beta$ results in a *less* expressive latent space. + +In their work, @zhaoLearningFineGrainedBimanual2023 ablated using a GM to learn from human demonstrations compared to a simpler, supervised objective, $\mathcal L_1(a,a^\prime) = \Vert a - a^\prime \Vert_1$. Interestingly, they found the performance of these two approaches to be comparable when learning from *scripted* demonstrations. That is, when learning from data collected rolling out a predetermined set of commands $[q^c_0, q^c_1, \dots]$, GM did *not* prove competitive compared to standard supervised learning.
However, when learning from human demonstrations--i.e., from data collected executing commands coming from a human controller $[q^h_0, q^h_1, \dots]$-- @zhaoLearningFineGrainedBimanual2023 found performance (defined as the success rate on a downstream task) to be severely (-33.3%) hindered by adopting a standard supervised learning objective compared to a richer, potentially more complex to learn variational objective. The result of such ablation reflects the multimodal nature of human demonstrations data, and is consistent with the findings presented by @florenceImplicitBehavioralCloning2022. The authors also ablate the action chunking paradigm, reporting significant performance gains deriving from using action chunking (1% vs. 44% success rate). To reduce acting open-loop, @zhaoLearningFineGrainedBimanual2023 also design an inference process consisting in performing inference at every timestep $t$ and then aggregate multiple chunks using an exponential moving average (EMA) on the overlapping chunks. -In their work, @zhaoLearningFineGrainedBimanual2023 ablated using a GM to learn from human demonstrations compared to a simpler, supervised objective, $\mathcal L_1(a,a^\prime) = \Vert a - a^\prime \Vert_1$. Interestingly, they found the performance of these two approaches to be comparable when learning from *scripted* demonstrations. That is, when learning from data collected rolling out a predetermined set of commands $[q^c_0, q^c_1, \dots]$, GM did *not* prove competitive compared to standard supervised learning.
However, when learning from human demonstrations--i.e., from data collected executing commands coming from a human controller $[q^h_0, q^h_1, \dots]$--they found performance (success rate on a downstream task) to be severily (-33.3%) hindered from adopting a standard supervised learning objective compared to a richer, potentially more complex to learn variational objective, in keeping with the multimodal nature of human demonstrations data and findings presented in @florenceImplicitBehavioralCloning2022. The authors also ablate the action chunking paradigm, reporting significant performance gains for performing action chunking (1% vs. 44% success rate). To avoid acting openloop, @zhaoLearningFineGrainedBimanual2023 design an inference process consisting in performing inference at every timestep $t$ and then aggregate overlapping chunks using chunks’ exponential moving average. +In ACT (Figure [ch4-act]), inference for a given observation $o \in \mathcal O$ could be performed by (1) defining a prior $p_\omega(z \vert o)$ for the latent variable $z$ and (2) decoding an action chunk from a sampled latent $z \sim p_\omega(\bullet \vert o)$, similarly to how sampling from standard VAEs takes place, with the exception that vanilla VAEs typically pose $p(z\vert o) \equiv p(z) \sim \mathcal N(\mathbf{0}, \mathbf{I})$ and thus skip (1).
- -
Action Chunking with Transformer (ACT), as in @zhaoLearningFineGrainedBimanual2023. ACT introduces an action chunking paradigm to cope with high-dimensional multi-modal demonstration data, and a transformer-based CVAE architecture.
+ +
The CVAE encoder used in ACT. Input action chunks are first embedded and aggregated with positional embeddings, before being processed alongside embedded proprioperceptive information, and a learned [CLS] token used to aggregate input level information, and predict the style variable z . The encoder is exclusively used to train the decoder, and it is entirely disregarded at inference time.
-In ACT (Figure [ch4-act]), inference for a given observation $o \in \mathcal O$ could be performed by (1) computing a prior $p_\omega(z \vert o)$ for the latent and (2) decoding an action chunk from a sampled latent $z \sim p_\omega(\bullet \vert o)$, similarily to how standard VAEs generate samples, with the exception that vanilla VAEs typically pose $p(z\vert o) \equiv p(z) \sim N(\mathbf{0}, \mathbf{I})$ and thus skip (1). +However, the authors claim that using a deterministic procedure to sample $z$ benefits policy evaluation, and thus avoid using the conditional prior at all at inference time, effectively using the CVAE framework exclusively to train a more expressive decoder. At test time, @zhaoLearningFineGrainedBimanual2023 propose simply using $z = \mathbf{0}$, as the conditional prior on $z$ used in training is set to be a standard Gaussian. Further, conditioning on the observation $o$ is achieved through explicitly feeding proprioperceptive and visual observations to the decoder, $p_\theta(a \vert z, o)$ at test time. While at inference $z$ is fixed to the mean of the standard Gaussian prior, during training $z$ is sampled from an approximate posterior distribution $q_\phi(z \vert o, a)$, which, however, disregards image observations and exclusively uses proprioperceptive states to form $o$ for efficiency reasons.
- -
The CVAE encoder used in ACT. Input action chunks are first embedded and aggregated with positional embeddings, before being processed alongside embedded proprioperceptive information, and a learned [CLS] token used to aggregate input level information, and predict the style variable z . The encoder is entirely disregarded at inference time.
+ +
The CVAE decoder used in ACT, comprising of a full encoder-decoder Transformer architecture. Camera observations from all n camera views are first embedded using pre-trained visual encoders, and then aggregated with the corresponding positional embeddings. Then, the proprioperceptive information and style variable z retrieved from the CVAE encoder, are fed to the encoder-decoder Transformer for inference. The encoder shares the matrices K, V with the decoder, and is trained to decode fixed position embeddings into action chunks.
-However, the authors claim using a deterministic procedure to derive $z$ may benefit policy evaluation, and thus avoid sampling from the conditional prior at all. At test time, instead, they simply use $z = \mathbf{0}$, as the conditional prior on $z$ used in training is set to be the unit Gaussian. At test time, conditioning on the observation $o$ is instead achieved through explicitly feeding proprioperceptive and visual observations to the decoder, $p_\theta(a \vert z, o)$, while during training $z$ is indeed sampled from the approximate posterior distribution $p_\phi(z \vert o, a)$, which, however, disregards image observations and exclusively uses proprioperceptive states to form $o$ for efficiency reasons (as the posterior $q_\phi$ is completely disregarded at test time). +#### Code Example: Training and Using ACT in Practice
- -
The CVAE decoder used in ACT, comprising of a full encoder-decoder Transformer architecture. Camera observations from all n camera views are first embedded using pre-trained visual encoders, and then concatenated to the corresponding positional embeddings. Then, alongside embeddings for the proprioperceptive information available and the style variable z retrieved from the CVAE encoder, the Transformer encoder shares the matrices K, Q with the Transformer decoder, trained to decode fixed position embeddings into action valid chunks.
+ +
Action Chunking with Transformer (ACT), as in @zhaoLearningFineGrainedBimanual2023. ACT introduces an action chunking paradigm to cope with high-dimensional multi-modal demonstration data, and a transformer-based CVAE architecture.
+
+ +Training ACT +[https://github.com/fracapuano/robot-learning-tutorial/snippets/ch4/01_training_act.py](https://github.com/fracapuano/robot-learning-tutorial/snippets/ch4/01_training_act.py) + +```python +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset + +delta_timestamps = { + "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame +} + +# Optionally, use StreamingLeRobotDataset to avoid downloading the dataset +dataset = LeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Streams frames from the Hugging Face Hub without loading into memory +streaming_dataset = StreamingLeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Get the 100th frame in the dataset by +sample = dataset[100] +print(sample) +# { +# 'observation.state': tensor([...]), +# 'action': tensor([...]), +# 'observation.images.wrist_camera': tensor([3, C, H, W]), for delta timesteps +# ... +# } + +batch_size=16 +# wrap the dataset in a DataLoader to use process it batches for training purposes +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size +) + +# Iterate over the DataLoader in a training loop +num_epochs = 1 +device = "cuda" if torch.cuda.is_available() else "cpu" + +for epoch in range(num_epochs): + for batch in data_loader: + # Move data to the appropriate device (e.g., GPU) + observations = batch["observation.state"].to(device) + actions = batch["action"].to(device) + images = batch["observation.images.wrist_camera"].to(device) + + # Next, you can do amazing_model.forward(batch) + ... +``` + +
+
+ +Using ACT +[https://github.com/fracapuano/robot-learning-tutorial/snippets/ch4/02_using_act.py](https://github.com/fracapuano/robot-learning-tutorial/snippets/ch4/02_using_act.py) + +```python +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset + +delta_timestamps = { + "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame +} + +# Optionally, use StreamingLeRobotDataset to avoid downloading the dataset +dataset = LeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Streams frames from the Hugging Face Hub without loading into memory +streaming_dataset = StreamingLeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Get the 100th frame in the dataset by +sample = dataset[100] +print(sample) +# { +# 'observation.state': tensor([...]), +# 'action': tensor([...]), +# 'observation.images.wrist_camera': tensor([3, C, H, W]), for delta timesteps +# ... +# } + +batch_size=16 +# wrap the dataset in a DataLoader to use process it batches for training purposes +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size +) + +# Iterate over the DataLoader in a training loop +num_epochs = 1 +device = "cuda" if torch.cuda.is_available() else "cpu" + +for epoch in range(num_epochs): + for batch in data_loader: + # Move data to the appropriate device (e.g., GPU) + observations = batch["observation.state"].to(device) + actions = batch["action"].to(device) + images = batch["observation.images.wrist_camera"].to(device) -#### Code Example: Learning ACT + # Next, you can do amazing_model.forward(batch) + ... +``` + +
### Diffusion Policy -DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarily to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $p(o,a)$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $p_\theta(o)$ given $p_\theta(o,a)$, DP’s rationale for modeling the data distribution via $p_\theta(a \vert o)$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations. +DMs have proven very effective in approximating complex, high-dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data, and their training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model expert demonstrations in a variety of simulated and real-world tasks.
Similarily to ACT @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $p(o,a)$, and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $p_\theta(o)$ given $p_\theta(o,a)$, DP’s choice to model the data distribution through $p_\theta(a \vert o)$ also stems from the computational burden of diffusion at test time: generating actions together with observations would require a large number of denoising steps--an unnecessarily slow and ultimately unhelpful process, given that robotics focuses on producing controls rather than reconstructing observations. -In practice, conditioning on observation data is achieved conditioning the added noise regressor $\epsilon_\theta$ introduced in [diffusion-simplified-loss] on a stack of $T_o$ observations, resulting in the *conditional* simplified diffusion objective +In practice, conditioning on observation data is achieved conditioning the noise regressor $\epsilon_\theta$ introduced in eq. [diffusion-simplified-loss] on a stack of $H_o$ observations, resulting in the *conditional*, simplified diffusion objective: ``` math \begin{align} \mathcal L(\theta) &= \mathbb{E}_{t, a_{t:t+H_a}, \epsilon} \big[ - \Vert \epsilon - \epsilon_\theta(\sqrt{\bar \alpha_t} a_{t:t+T_a} + \epsilon \sqrt{1 - \bar \alpha_t}, t, o_{t-T_o:t}) \Vert^2 \big], \\ + \Vert \epsilon - \epsilon_\theta(\sqrt{\bar \alpha_t} a_{t:t+H_a} + \epsilon \sqrt{1 - \bar \alpha_t}, t, o_{t-H_o:t}) \Vert^2 \big], \\ & t \sim \mathcal{U}(\{1,\dots,T\}), \quad - a_{t:t+T_a}, o_{t-T_o:t} \sim \mathcal{D}, \quad + a_{t:t+H_a}, o_{t-H_o:t} \sim \mathcal{D}, \quad \epsilon \sim \mathcal{N}(\mathbf{0},\mathbf{I}). 
\notag \end{align} ``` -Notice how in [diffusion-policy-objective] the noise regressor is conditioned both on the latent variable rank $t$ *and* on a stack of previous observations $o_{t-T_o-t}$.  @chiDiffusionPolicyVisuomotor2024 claim the combination of (1) conditioning on a horizon of previous observations and (2) predicting multiple actions into the future allows DP to *commit to specific modes* in the data at inference time, which proves essential for good performance and avoiding undecisiveness. +Note how in eq. [diffusion-policy-objective] the noise regressor is conditioned on both the latent variable rank $t$ *and* on a stack of previous observations $o_{t-H_o:t}$. @chiDiffusionPolicyVisuomotor2024 claim the combination of (1) conditioning on a horizon of previous observations and (2) predicting multiple actions into the future allows DP to *commit to specific modes* in the data at inference time, which proves essential for good performance and avoiding indecisiveness.
-
The Diffusion Policy archicture, as in @chiDiffusionPolicyVisuomotor2024. A stack of H o previous observations is used as external conditioning to denoise a group of H a actions. Conditioning is used at every layer of a U-Net block, and in practice allows to obtain fully-formed action chunks with as little as T = 10 denoising steps.
+
The Diffusion Policy architecture, as in @chiDiffusionPolicyVisuomotor2024. A stack of H o previous observations is used as external conditioning to denoise a group of H a actions. Conditioning is performed at every layer of a U-Net block. Diffusion Policy allows to obtain fully-formed action chunks with as few as T = 10 denoising steps.</Figcaption>
-Figure [diffusion-policy-architecture] shows the convolution-based version of the architecture proposed by @chiDiffusionPolicyVisuomotor2024, illustrating inference on a single sample from $\mathcal D$ for simplicity. An arbitrarily noisy chunk of $H_a$ actions $\tilde a_{t:t+H_a}$ is mapped to a learned high-dimensional space. Similarily, both image observations and poses are embedded before being aggregated to the action embeddings. Then, a U-Net @ronnebergerUNetConvolutionalNetworks2015 is trained to regress the noise added into $\tilde a_{t:t+H_a}$, using observation conditioning information at every layer and seeking to optimize [diffusion-policy-objective]. At inference time, the noise predictor is used to predict the quantity of noise at every $t \in [T, \dots, 0 ]$ and iteratively subtract it from $\tilde a_{t-t+T_a}$, reversing the diffusion process simulated in training conditioned on $o_{t-T_o:t}$ to predict $a_{t:t+T_a}$. +Figure [diffusion-policy-architecture] shows the convolution-based version of the architecture proposed by @chiDiffusionPolicyVisuomotor2024, illustrating inference on a single sample drawn from $\mathcal D$, for simplicity. The starting, arbitrarily noisy chunk of $H_a$ actions $\tilde a_{t:t+H_a}$ is first mapped to a (learned) high-dimensional space. Similarly, both image observations and poses are also embedded before being aggregated to the action embeddings. Then, a U-Net @ronnebergerUNetConvolutionalNetworks2015 is trained to regress the noise added into $\tilde a_{t:t+H_a}$, conditioned on observation information at every layer, thus seeking to optimize eq. [diffusion-policy-objective]. At inference time, the noise predictor is used to predict the quantity of noise at every $t \in [T, \dots, 0 ]$ and iteratively subtract it from $\tilde a_{t:t+H_a}$, reversing the diffusion process simulated in training conditioned on $o_{t-H_o:t}$ to predict $a_{t:t+H_a}$. + +DP can be trained with as few as 50-150 demos (ca.
15-60 minutes of teleoperation data), and exhibits strong performance on a variety of simulated and real-world tasks, including dexterous and deformable manipulation tasks such as sauce pouring and yoga-mat unrolling. Notably, the authors ablated the relevance of using RGB camera streams as input to their policy, and observed how high frame-rate visual observations can be used to attain performance (measured as success rate) comparable to that of state-based policies, which are typically trained in simulation with privileged information not directly available in real-world deployments. As high-frame rate RGB inputs naturally accommodate dynamic, fast changing environments, @chiDiffusionPolicyVisuomotor2024’s conclusion offers significant evidence for learning streamlined control policies directly from pixels. In their work, @chiDiffusionPolicyVisuomotor2024 also ablate the performance of DP against the size of the dataset collected, showing that DP reliably outperforms the considered baseline for all benchmark sizes considered. Further, in order to accelerate inference, @chiDiffusionPolicyVisuomotor2024 employ Denoising Diffusion Implicit Models @songDenoisingDiffusionImplicit2022, a variant of Denoising Diffusion Probabilistic Models @hoDenoisingDiffusionProbabilistic2020 (DDPM) adopting a strictly deterministic denoising paradigm (differently from DDPM’s natively stochastic one) inducing the same final distribution as DDPM’s, and yet resulting in 10x fewer denoising steps at inference time @chiDiffusionPolicyVisuomotor2024. Across a range of simulated and real-world tasks, @chiDiffusionPolicyVisuomotor2024 find DPs particularly performant when modeling $\epsilon_\theta$ with a transformer-based network, although the authors note the increased sensitivity of transformer networks to hyperparameters.
Thus, @chiDiffusionPolicyVisuomotor2024 explicitly recommend starting out with a simpler, convolution-based architecture for diffusion (Figure [diffusion-policy-architecture]), which is however reported to be biased towards learning low-frequency components @tancikFourierFeaturesLet2020, and thus may prove more challenging to train with non-smooth action sequences. + +#### Code Example: Training and Using Diffusion Policies in Practice + +
+ +Training Diffusion Policy +[https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch4/03_training_diffusion.py](https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch4/03_training_diffusion.py) + +```python +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset + +delta_timestamps = { + "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame +} + +# Optionally, use StreamingLeRobotDataset to avoid downloading the dataset +dataset = LeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Streams frames from the Hugging Face Hub without loading into memory +streaming_dataset = StreamingLeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Get the 100th frame in the dataset by +sample = dataset[100] +print(sample) +# { +# 'observation.state': tensor([...]), +# 'action': tensor([...]), +# 'observation.images.wrist_camera': tensor([3, C, H, W]), for delta timesteps +# ... +# } + +batch_size=16 +# wrap the dataset in a DataLoader to use process it batches for training purposes +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size +) + +# Iterate over the DataLoader in a training loop +num_epochs = 1 +device = "cuda" if torch.cuda.is_available() else "cpu" + +for epoch in range(num_epochs): + for batch in data_loader: + # Move data to the appropriate device (e.g., GPU) + observations = batch["observation.state"].to(device) + actions = batch["action"].to(device) + images = batch["observation.images.wrist_camera"].to(device) + + # Next, you can do amazing_model.forward(batch) + ... +``` + +
+
+ +Using Diffusion Policy +[https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch4/04_using_diffusion.py](https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch4/04_using_diffusion.py) + +```python +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset + +delta_timestamps = { + "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame +} + +# Optionally, use StreamingLeRobotDataset to avoid downloading the dataset +dataset = LeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Streams frames from the Hugging Face Hub without loading into memory +streaming_dataset = StreamingLeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Get the 100th frame in the dataset by +sample = dataset[100] +print(sample) +# { +# 'observation.state': tensor([...]), +# 'action': tensor([...]), +# 'observation.images.wrist_camera': tensor([3, C, H, W]), for delta timesteps +# ... +# } + +batch_size=16 +# wrap the dataset in a DataLoader to use process it batches for training purposes +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size +) + +# Iterate over the DataLoader in a training loop +num_epochs = 1 +device = "cuda" if torch.cuda.is_available() else "cpu" + +for epoch in range(num_epochs): + for batch in data_loader: + # Move data to the appropriate device (e.g., GPU) + observations = batch["observation.state"].to(device) + actions = batch["action"].to(device) + images = batch["observation.images.wrist_camera"].to(device) -Training using 50-150 demos (15-60 minutes of teleoperation data) DP achieves strong performance on a variety of simulated and real-world tasks, including dexterous and deformable manipulation tasks such as sauce pouring and mat unrolling. 
Notably, the authors ablated the relevance of using RGB camera streams as input to their policy, and observed how high frame-rate visual observations can be used to attain performance (measured as success rate) comparable to that of state-based policies, typically trained in simulation with priviledged information not directly available in real-world deployments. As high-frame rate RGB inputs naturally accomodate for dynamic, fast changing environments, @chiDiffusionPolicyVisuomotor2024’s conclusion offers significant evidence for learning streamlined control policies directly from pixels. In their work, @chiDiffusionPolicyVisuomotor2024 also ablate the performance of DP against their baseline against the size of the dataset collected, showing that DP outperforms the considered baseline for every benchmark size considered. Further, to accelerate inference, @chiDiffusionPolicyVisuomotor2024 employ Denoising Diffusion Implicit Models @songDenoisingDiffusionImplicit2022, a variant of Denoising Diffusion Probabilistic Models @hoDenoisingDiffusionProbabilistic2020 (DDPM) adopting a strictly deterministic denoising paradigm (differently from DDPM’s natively stochastic one) inducing the same final distribution’s as DDPM’s, and yet resulting in 10 times less denoising steps at inference time @chiDiffusionPolicyVisuomotor2024. Across a range of simulated and real-world tasks, @chiDiffusionPolicyVisuomotor2024 find DPs particularly performant when implementing a transformer-based network as $\epsilon_\theta$, although the authors note the increased sensitivity of transformer networks to hyperparameters and thus explicitly recommend starting out with a simpler, convolution-based architecture for diffusion (Figure [diffusion-policy-architecture]), which are however reported to be biased towards learning low-frequency components @tancikFourierFeaturesLet2020 and thus may prove more challenging to train with non-smooth action sequences. 
+ # Next, you can do amazing_model.forward(batch) + ... +``` -#### Code Example: Learning Diffusion Policies +
### Optimized Inference -Modern visuomotor policies output *action chunks*-sequences $\pi(o_t) = \mathbf{A}_t$ with $\mathbf{A}_t = \bigl(a_t,a_{t+1},\dots,a_{t+H_a}\bigr)$ being a sequence of $H_a \gg 1$ low-level commands enqueued in an action queue, originating from an environment observation, $o_t$. Predicting series of actions instead of single commands proved essential in learning complex, multi-modal behavior @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024. +Modern visuomotor policies output *action chunks*-sequences $\pi(o_t) = \bigl(a_t,a_{t+1},\dots,a_{t+H_a}\bigr) = \mathbf{A}_t$ with $\mathbf{A}_t$ a sequence of $H_a \gg 1$ low-level commands scheduled for execution in an action queue, all originating from a single environment observation, $o_t$. Predicting series of actions instead of single commands proved essential in learning complex, multi-modal behavior @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024, and it also holds the premise to be useful to optimize how inference is carried out in practice. -Typically, the robot executes the entire action chunk $\mathbf{A}_t$, before a new observation $o_{t+H_a}$ is passed to the policy $\pi$ to predict the next chunk. This results in open-loop inference in between observations captured every $H_a$ timesteps.  @zhaoLearningFineGrainedBimanual2023 adopts a different strategy whereby the robot controller interleaves chunk prediction $\mathbf{A}_t \gets \pi(o_t)$ and chunk consumption $a_t \gets \text{PopFront(MATHEXPR)}$, computing a new chunk of actions at every timestep $t$ and aggregating the predicted chunks on overlapping sections. While adaptive--every observation at every timestep $o_t$ is processed--such approaches rely on running inference continuously, which can be prohibitive in resource-constrained scenarios, such as edge deployments. 
+A robot may indeed execute an entire action chunk $\mathbf{A}_t$ *before* a new observation $o_{t+H_a}$ is passed to the policy $\pi$ to predict the next chunk, which would result in open-loop control between observations captured every $H_a$ timesteps. @zhaoLearningFineGrainedBimanual2023 adopt a different strategy, whereby the robot controller interleaves chunk prediction $\mathbf{A}_t \gets \pi(o_t)$ and chunk consumption $a_t \gets \text{PopFront(MATHEXPR)}$, and computes a new chunk of actions at every timestep $t$, to then aggregate the predicted chunks on overlapping sections. While adaptive--every observation at every timestep $o_t$ is processed--such an approach relies on running inference continuously, which can be prohibitive in resource-constrained scenarios, such as edge deployments. A less resource-intensive approach is to entirely exhaust the chunk $\mathbf{A}$ before predicting a new chunk of actions, a strategy we refer to as *synchronous* (sync) inference. Sync inference allocates computation every $H_a$ timesteps, resulting in a reduced computational burden (on average) at control time. In contrast, sync inference also inherently hinders the responsiveness of robot systems, introducing blind lags due to the robot being *idle* while computing $\mathbf{A}$. -A less resource-intensive approach is to entirely exhaust the chunk $\mathbf{A}$ before predicting a new chunk of actions, a strategy we refer to as *synchronous* (sync) inference. Sync inference efficiently allocates computation every $H_a$ timesteps, resulting in a reduced average computational burden at control time. In contrast, it inherently hinders the responsiveness of robot systems, introducing blind lags due to the robot being *idle* while computing $\mathbf{A}$. 
- -We directly assess the lack of adaptiveness of robot systems due to acting open-loop, and the presence of lags at runtime by decoupling action chunk prediction $\mathbf{A}$ from action execution $a_t \gets \text{PopFront}(\mathbf{A}_t)$, developing an *asynchronous* (async) inference stack ([alg-async-inference]), whereby a $\text{RobotClient}$ sends an observation $o_t$ to a $\text{PolicyServer}$, receiving an action chunk $\mathbf{A}_t$ once inference is complete ([ch4-async-inference]). In this, we avoid execution lags by triggering chunk prediction while the control loop is still consuming a previously available queue, aggregating it with the newly incoming queue whenever available. In turn, async-inference tightens the loop between action prediction and action execution, by increasing the frequency at which observations are processed for chunk prediction. Crucially, decoupling action prediction from action execution also directly allows to allocate more computational resources on a remote policy server sending actions to the robot client over networks, something which may prove very effective in resource-constrained scenarios such as low-power robots. +One can use the fact that policies output multiple actions at the same time to directly address (1) the lack of adaptiveness and (2) the presence of lags at runtime by decoupling action chunk *prediction* $\mathbf{A}$ from action *execution* $a_t \gets \text{PopFront}(\mathbf{A}_t)$. This decoupled stack, which we refer to as *asynchronous* (async) inference ([alg-async-inference]), also enables optimized inference by allowing action-chunk inference to run on a separate machine, typically equipped with better computational resources than the ones onboard a robot. In async inference, a $\text{RobotClient}$ sends an observation $o_t$ to a $\text{PolicyServer}$, receiving an action chunk $\mathbf{A}_t$ once inference is complete (Figure [ch4-async-inference]). 
In this, we avoid execution lags by triggering chunk prediction while the control loop is still consuming a previously available chunk, aggregating the previous and incoming chunks whenever the latter is available to the $\text{RobotClient}$. In turn, async-inference tightens the loop between action prediction and action execution efficiently, by increasing the frequency at which observations are processed for chunk prediction while not running inference at every timestep. Crucially, decoupling action prediction from action execution also allows allocating more computational resources on a remote policy server sending actions to the robot client over the network.
@@ -1202,7 +1785,7 @@ We directly assess the lack of adaptiveness of robot systems due to acting open-
- +
**Input:** horizon $T$, chunk size $H_a$, threshold $g\in[0,1]$ **Init:** capture $o_0$; send $o_0$ to ; receive $\mathbf{A}_0 \gets \pi(o_0)$ $a_t \gets \text{PopFront}(\mathbf{A}_t)$ ($a_t$) capture new observation, $o_{t+1}$ `async_handle` $\gets \text{AsyncInfer}(o_{t+1})$ $\tilde{\mathbf{A}}_{t+1} \gets \pi(o_{t+1})$ $\mathbf{A}_{t+1} \gets f(\mathbf{A}_t,\tilde{\mathbf{A}}_{t+1})$ @@ -1210,25 +1793,19 @@ We directly assess the lack of adaptiveness of robot systems due to acting open- $\mathbf{A}_{t+1} \gets \mathbf{A}_t$
- -
-##### Implementation details +In practice, *async* inference (1) tightens the control loop by capturing observations more often, eliminating idle gaps at runtime (2) and directly allows to run inference on more powerful computational resources than the ones typically available onboard autonomous robotic platforms. Algorithmically, one can attain (1) on the -side by consuming actions from a readily available queue until a given condition on the number of remaining actions in the queue ($\vert \mathbf{A}_t \vert / H_a < g$) is met. When this condition is triggered, a new observation of the environment is captured and sent to the (possibly remote) . To avoid redundant server calls and erratic behavior at runtime observations are compared in joint-space, and near-duplicates are dropped. Two observations are considered near-duplicates if their distance in joint-space falls under a predetermined threshold, $d_{\text{lim}} \in \mathbb R_+$. Importantly, should the queue available to the robot client eventually empty out, the most recent observation is processed regardless of similarity. -*Async* inference (1) tightens the control loop by capturing observations more often, directly eliminates idle gaps at runtime, and (2) directly allows to run inference on more powerful computational resources than the ones typically available onboard autonomous robotic platforms. +Interestingly, the behavior of async inference can be studied analytically. First, let $\ell$ be a random variable modeling the time needed to receive an action chunk $\mathbf{A}$ after sending an observation $o$, i.e. the sum of (1) the time to send across the observation $o$ between the and , $t_{C \to S}$ (2) the inference latency on the , $\ell_S$ and (3) the time to send $\mathbf{A}$ between the and , $t_{S \to C}$. 
Under the (reasonable) assumption of independence, $\mathbb E [\ell] = \mathbb E[t_{C \to S}] + \mathbb E[\ell_S] + \mathbb E[t_{S \to C}]$, which can be further simplified to $\mathbb E[\ell] \simeq \mathbb E[\ell_S]$, assuming communication time is (1) equal in both directions and (2) negligible with respect to the inference latency. Second, let $\Delta t$ be the environment’s control cycle. With a real-world frame-rate of 30 frames-per-second (fps), $\Delta t=33\text{ms}$. Consequently, exhausted queues at runtime--i.e. being idle awaiting for a new chunk--are avoided for $g \geq \frac{\mathbb E[\ell_S] / \Delta t}{H_a}$. In this, the action queue threshold $g$ below which to capture and send a new observation for processing plays a major role relatively to the availability of actions to the $\text{RobotClient}$. -Algorithmically, we attain (1) on the -side by consuming actions from a readily available queue until a threshold condition on the number of remaining actions in the queue ($\vert \mathbf{A}_t \vert / H_a < g$) is met. When this condition is triggered, a new observation of the environment is captured and sent to the (possibly remote) . To avoid redundant server calls and erratic behavior at runtime observations are compared in joint-space, and near-duplicates are dropped. Two observations are considered near-duplicates if their distance in joint-space is under a predetermined threshold, $\epsilon \in \mathbb R_+$. Importantly, when the queue available to robot client eventually becomes empty, the most recent observation is processed regardless of similarity. - -Interestingly, the behavior of async inference can be studied analytically. First, let $\ell$ be a random variable modeling the time needed to receive an action chunk $\mathbf{A}$ after sending an observation $o$, i.e. 
the sum of (1) the time to send across the observation $o$ between the and , $t_{C \to S}$ (2) the inference latency on the , $\ell_S$ and (3) the time to send $\mathbf{A}$ between the and , $t_{S \to C}$. Assuming independence, $\mathbb E [\ell] = \mathbb E[t_{C \to S}] + \mathbb E[\ell_S] + \mathbb E[t_{S \to C}]$ which can be further simplified to $\mathbb E[\ell] \simeq \mathbb E[\ell_S]$, assuming communication time is (1) equal in both directions and (2) negligible with respect to the inference latency. Second, let $\Delta t$ be the environment’s control cycle. With a real-world frame-rate of 30 frames per second, $\Delta t=33\text{ms}$. Consequently, exhausted queues at runtime-i.e. being idle awaiting for a new chunk-are avoided for $g \geq \frac{\mathbb E[\ell_S] / \Delta t}{H_a}$. In this, the queue threshold $g$ plays a major role relatively to the availability of actions to the . - -[ch4-queues] illustrates how the size of the action chunk $\lvert \mathbf{A}_t \rvert$ evolves over time for three representative values of $g$, detailing the following key scenarios: +Figure [ch4-queues] illustrates how the size of the action chunk $\lvert \mathbf{A}_t \rvert$ evolves over time for three representative values of $g$, detailing the following key scenarios: - **Sequential limit $(g=0)$.** The client drains the entire chunk before forwarding a new observation to the server. During the round-trip latency needed to compute the next chunk, the queue is empty, leaving the robot *incapable of acting*. This reproduces the behavior of a fully sequential deployment and results in an average of $\mathbb E[\ell_S]$ idle seconds. -- **Asynchronous inference $(g \in (0,1))$.** Allowing the client to consume $1-g$ of its available queue $\mathbf{A}_{t-1}$ before triggering inference for a new action queue $\mathbf{A}_{t}$, amortizing computation while keeping the queue from emptying. 
The overlap between successive chunks provides a buffer against modeling errors without the full cost of the $g=1$ regime. The updated queue $\mathbf{A}_t$ is obtained aggregating queues on the overlapping timesteps between $\mathbf{A}_{t-1}$ and the incoming $\tilde{\mathbf{A}}_{t}$. +- **Asynchronous inference $(g \in (0,1))$.** Allowing the client to consume a $1-g$ fraction of its available queue $\mathbf{A}_{t-1}$ *before* triggering inference for a new action queue $\mathbf{A}_{t}$, computation is amortized while keeping the queue from emptying. The overlap between successive chunks provides a buffer against modeling errors without the full cost of the $g=1$ regime. The updated queue $\mathbf{A}_t$ is obtained aggregating queues on the overlapping timesteps between $\mathbf{A}_{t-1}$ and the incoming $\tilde{\mathbf{A}}_{t}$. -- **Compute-intensive limit $(g=1)$.** As an extreme case, and in keeping with @zhaoLearningFineGrainedBimanual2023, an observation is sent at *every* timestep. The queue is therefore almost always filled, with only a minor saw-tooth due to$\Delta t/\mathbb E[\ell_s] < 1$. While maximally reactive, this setting incurs one forward pass per control tick and can prove prohibitively expensive on limited hardware. Importantly, because the client is consuming actions while the server computes the next chunk, the available queue never gets filled again. +- **Sync-inference limit $(g=1)$.** As an extreme case, and in keeping with @zhaoLearningFineGrainedBimanual2023, an observation is sent at *every* timestep. The queue is therefore almost always filled, with only a minor saw-tooth due to $\Delta t/\mathbb E[\ell_s] < 1$. While maximally reactive, this setting incurs one forward pass per control tick and can prove prohibitively expensive on limited hardware. Importantly, because the client is consuming actions while the server computes the next chunk, the available queue never gets entirely filled.
@@ -1243,10 +1820,130 @@ Interestingly, the behavior of async inference can be studied analytically. Firs
Action queue size evolution at runtime for various levels of g when (A) not filtering out observation based on joint-space similarity and (B) filtering out near-duplicates observation, measuring their similarity in joint-space.
-[ch4-queues] emphasizes the trade-off governed by $g$: small values place result in idle periods, whereas $g\approx 1$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $g\in(0,1)$ allows to strike a balance between reactivity against resource budgets. If not for the aforementioned similarity filter, the would send observations for processing every $(1 - g) H_a \cdot \Delta t$ seconds, receiving a new chunk of actions every $(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]$, on average. The presence of the observation similarity filter dilates this processing time, and serves the scope of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. In particular, [ch4-queues] results in a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in [ch4-queues] highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue results empty. + +Figure [ch4-queues] emphasizes the trade-off governed by $g$: small values of $g$ result in idle periods, whereas $g\approx 1$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $g\in(0,1)$ allows to strike a balance between reactivity against resource budgets. If not for the aforementioned similarity filter, the $\text{RobotClient}$ would send observations for processing every $(1 - g) H_a \cdot \Delta t$ seconds, receiving a new chunk of actions every $(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]$, on average. The presence of the filter for observation similarity dilates this processing time, and serves the scope of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. 
In particular, Figure [ch4-queues] results in a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in [ch4-queues] highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue results empty. #### Code Example- Using Async Inference +
+ +Spinning up a Remote Server +[https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch4/05_policy_server.py](https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch4/05_policy_server.py) + +```python +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset + +delta_timestamps = { + "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame +} + +# Optionally, use StreamingLeRobotDataset to avoid downloading the dataset +dataset = LeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Streams frames from the Hugging Face Hub without loading into memory +streaming_dataset = StreamingLeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Get the 100th frame in the dataset by +sample = dataset[100] +print(sample) +# { +# 'observation.state': tensor([...]), +# 'action': tensor([...]), +# 'observation.images.wrist_camera': tensor([3, C, H, W]), for delta timesteps +# ... +# } + +batch_size=16 +# wrap the dataset in a DataLoader to use process it batches for training purposes +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size +) + +# Iterate over the DataLoader in a training loop +num_epochs = 1 +device = "cuda" if torch.cuda.is_available() else "cpu" + +for epoch in range(num_epochs): + for batch in data_loader: + # Move data to the appropriate device (e.g., GPU) + observations = batch["observation.state"].to(device) + actions = batch["action"].to(device) + images = batch["observation.images.wrist_camera"].to(device) + + # Next, you can do amazing_model.forward(batch) + ... +``` + +
+
+ +Attaching a Robot Client +[https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch4/06_robot_client.py](https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch4/06_robot_client.py) + +```python +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset + +delta_timestamps = { + "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame +} + +# Optionally, use StreamingLeRobotDataset to avoid downloading the dataset +dataset = LeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Streams frames from the Hugging Face Hub without loading into memory +streaming_dataset = StreamingLeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Get the 100th frame in the dataset by +sample = dataset[100] +print(sample) +# { +# 'observation.state': tensor([...]), +# 'action': tensor([...]), +# 'observation.images.wrist_camera': tensor([3, C, H, W]), for delta timesteps +# ... +# } + +batch_size=16 +# wrap the dataset in a DataLoader to use process it batches for training purposes +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size +) + +# Iterate over the DataLoader in a training loop +num_epochs = 1 +device = "cuda" if torch.cuda.is_available() else "cpu" + +for epoch in range(num_epochs): + for batch in data_loader: + # Move data to the appropriate device (e.g., GPU) + observations = batch["observation.state"].to(device) + actions = batch["action"].to(device) + images = batch["observation.images.wrist_camera"].to(device) + + # Next, you can do amazing_model.forward(batch) + ... +``` + +
+ ## Generalist Robot Policies @@ -1259,11 +1956,11 @@ Robert A. Heinlein
-TL;DR Openly available large scale datasets and the development of stable, expressive and efficient architecture fostered research on the development of generalist robot policies that can operate across embodiment and tasks. +TL;DR Openly available, large-scale datasets and the development of stable-to-train, expressive and efficient architectures fostered research on the development of generalist robot policies that can operate across embodiments and tasks.
-The advent of large models trained on internet-scale datasets has drastically influenced fields like Computer Vision (CV) and Natural Language Processing (NLP), shifting the paradigm towards combining (1) an initial, task-agnostic large-scale pre-training stage and a (2) task-specific, adjustment phase. The pre-training/adaptation paradigm has now largely replaced more classic approaches consisting of task-specific data collection, curation and model training in many subdomains within CV and NLP, motivated by the main drawback of limited scalability for *task-specific approaches*, traditionally labor intensive. Factors including (1) the advancements in generalist models learned with self-supervision for perception @oquabDINOv2LearningRobust2024 or semantic understanding @devlinBERTPretrainingDeep2019 and (2) the popularization collective efforts to aggregate large-scale openly available datasets @collaborationOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 are increasingly pushing the field of robot learning towards the pre-train-and-adapt paradigm. This shift taps into the long-standing challenge of developing generalist robot policies, and holds the premise to surpass traditionally siloed approaches to robotics problems and develop a *foundation robotics model*. While Section [learning-bc-single] introduced methods for learning *single-task policies* such as ACT or Diffusion Policy, in this section we present advancements in developing *generalist, multi-task, policies*, capable of performing a wide range of tasks across different environments and embodiments, and guided by unstructured instructions given via natural language. 
+The advent of large models trained on internet-scale datasets has drastically influenced fields like Computer Vision (CV) and Natural Language Processing (NLP), shifting the previously task-specific paradigm towards combining (1) an initial, task-agnostic large-scale pre-training stage and a (2) task-specific, adjustment phase. This *pre-train-and-adapt* paradigm has now largely replaced more classic approaches consisting of task-specific data collection, curation and model training in many subdomains within CV and NLP, and it is motivated by the main drawback of limited scalability for *task-specific approaches*, which have been traditionally more labor intensive. Factors including (1) the advancements in generalist models learned with self-supervision for perception @oquabDINOv2LearningRobust2024 or semantic understanding @devlinBERTPretrainingDeep2019 and (2) the popularization of collective efforts to aggregate large-scale openly available datasets @oneillOpenXEmbodimentRobotic2025, @khazatskyDROIDLargeScaleInTheWild2025 are increasingly pushing the field of robot learning towards the pre-train-and-adapt paradigm. This shift taps into the long-standing challenge of developing generalist robot policies, and holds the premise to surpass traditionally siloed approaches to robotics problems and develop a *foundation robotics model*. While Section [learning-imitation] introduced methods for learning *single-task policies* such as ACT or Diffusion Policy, in this section we present advancements in developing *generalist, multi-task, policies*, capable of performing a wide range of tasks across different environments and embodiments, and guided by unstructured instructions typically given in plain, natural language.
[ch5-ml-vs-robotics-foundation]). +The remarkable success of foundation models in NLP and CV seems to be increasingly predicated on two core principles: architectural innovation and (joint) data-compute scaling. Indeed, the transformer architecture proved very effective in capturing long-range dependencies in a variety of data formats, and its stability and expressivity made it the *de facto* standard for modern large-scale models trained on internet-scale datasets. However, in stark contrast with large-scale NLP and CV datasets @raffelExploringLimitsTransfer2023, @ImageNet_VSS09, robotics has historically developed around small, task-specific datasets. In turn, this traditionally hindered scalability across problems as well as results, posing concrete challenges to developing general-purpose robot learning algorithms. Indeed, differently from the wealth of relatively readily-available task-agnostic text and images datasets on the internet, robotics data is *intrinsically embodied* and thus task-specific: datasets collected for *manipulation* differ significantly from *locomotion*. In particular, since each expert trajectory is tied to a specific robot platform and the operating conditions of its environment and task, data heterogeneity has long posed a *methodological* challenge for scaling robotics datasets via aggregation. Further, datasets consisting of expert demonstrations are (1) intrinsically more expensive to collect and (2) notoriously heterogeneous--different human experts may perform the same task in very different ways. Beyond this, heterogeneity also raises *conceptual* issues: naively mixing data across embodiments can induce negative transfer, as control strategies developed in isolation for different robot systems in different environments may even conflict when combined. 
Thus, the high degree of fragmentation of robotics datasets and tasks has traditionally led to the development of *specialist* policies, trained on small, task-specific datasets, developed to perform well at their designated task but that fail to generalize to new deployment scenarios (Figure [ch5-ml-vs-robotics-foundation]).
-
Early efforts in the development of generalist models for robotics include BC-Zero @jangBCZZeroShotTask2022, RT-1 @brohanRT1RoboticsTransformer2023, and RT-2 @brohanRT2VisionLanguageActionModels2023: large scale models trained on thousands of demonstrations. The open release of the Open-X @collaborationOpenXEmbodimentRobotic2025 and DROID datasets @khazatskyDROIDLargeScaleInTheWild2025 fostered the development of open source models: OpenVLA @kimOpenVLAOpenSourceVisionLanguageAction2024, π 0 @blackp0VisionLanguageActionFlow2024 and SmolVLA @shukorSmolVLAVisionLanguageActionModel2025.
+
Early efforts in the development of generalist models for robotics include BC-Zero @jangBCZZeroShotTask2022, RT-1 @brohanRT1RoboticsTransformer2023, and RT-2 @brohanRT2VisionLanguageActionModels2023: large scale models trained on thousands of demonstrations. The open release of the Open-X @oneillOpenXEmbodimentRobotic2025 and DROID datasets @khazatskyDROIDLargeScaleInTheWild2025 fostered the development of open source models: OpenVLA @kimOpenVLAOpenSourceVisionLanguageAction2024, π 0 @blackp0VisionLanguageActionFlow2024 and SmolVLA @shukorSmolVLAVisionLanguageActionModel2025.
-Motivated by the pursuit of generalist robot policies, the research community started investigating what and how to integrate from other domains within ML. Figure [ch5-generalist-policies-timeline] shows a timeline of some of the most popular contributions attempting at developing generalist policies. Starting from BC-Zero, a latent variable model trained on 25K+ demonstrations, the field has now evolved into $\pi_0$, a transformer-based model trained on 10M+ demonstrations and exhibiting strong few-shot capabilities across tasks and embodiments. For starters, Robotics Transformer 1 (RT-1) @brohanRT1RoboticsTransformer2023 represented a significant step in the direction of developing a generalist robot policies over prior work including (1) BC-Zero @jangBCZZeroShotTask2022 and (2) Gato @reedGeneralistAgent2022, in that @brohanRT1RoboticsTransformer2023 uses a much larger and diverse set of training tasks compared to both BC-Zero and Gato. In particular, RT-1 uses a transformer architecture, and is trained on as many as 130k human-recorded trajectories collected over 13 robots in the span on 17 months. RT-1 learns to process a history of camera images and a natural language instruction, and feeds the resulting sequence of high-dimensional tokens to a transformer, trained using a *classification loss on a discretized actions space* consisting of 6 256 bins, each for each joint of a 6-dof robotic arm. +Driven by the goal of developing generalist robot policies, the research community has increasingly explored how insights and techniques from other areas of ML can be integrated into robotics. Figure [ch5-generalist-policies-timeline] shows a timeline of some of the most popular contributions attempting at developing generalist policies. 
Starting from BC-Zero, a latent variable model trained on 25k+ demonstrations, the field has now evolved into $\pi_0$, a transformer-based model trained on 10M+ demonstrations and exhibiting strong few-shot capabilities across tasks and embodiments. In between, Robotics Transformer 1 (RT-1) @brohanRT1RoboticsTransformer2023 represented a significant step in the direction of developing a generalist robot policies over prior work including (1) BC-Zero @jangBCZZeroShotTask2022 and (2) Gato @reedGeneralistAgent2022, in that @brohanRT1RoboticsTransformer2023 use a much larger and diverse set of training tasks compared to both BC-Zero and Gato. In particular, RT-1 uses a transformer architecture, and is trained on as many as 130k human-recorded trajectories collected over 13 robots and over 17 months. RT-1 learns to process a history of camera images and a natural language instruction, and feeds the resulting sequence of high-dimensional tokens to a transformer, trained using a *classification loss on a discretized actions space* consisting of six different 256-bins, one for each joint of a 6-dof robotic arm. -Perhaps motivated by the contemporary successes of the transformer architecture in both CV and NLP, the same group of authors investigated using a discrete output space to model--inherently continuous--quantities such as actions, leveraging a (1) more powerful architecture and (2) scaling up the dataset used . In RT-2, @brohanRT2VisionLanguageActionModels2023 propose inheriting internet-scale semantic knowledge from large-scale multi-modal datasets to learn a single, *unified model* for robotics control. 
Such a model, termed *Vision-Language-Action* (VLA) in the original RT-2 paper, effectively casts robot control as a language modeling problem, and in particular as a Visual Question-Answering (VQ) task, whereby the output token space used to represent *string* tokens is shared with the *8-bits tokens* used to represent the 256 actuation levels of a 6-dof robot joint. In their work, @brohanRT2VisionLanguageActionModels2023 propose co-fine-tuning then-leading large-scale VLMs such as PaLIX @chenPaLIXScalingMultilingual2023 or PaLM-E @driessPaLMEEmbodiedMultimodal2023 on a mix of web and robotics data, thus complementing VQtraining with robotics-specific signal, learning to directly output robot actions in a shared token space for visual and language inputs. Using large models trained on internet-scale data as backbones for VLAs allows models to tap into the rich semantic knowledge embedded in the VLM’s parameters, interpret new commands as well as recognize unseen objects by connecting them to concepts acquired while pre-training. For instance, @brohanRT2VisionLanguageActionModels2023 show that while RT-2 has never been explicitly trained to repurpose tools for a hammering task, it can still combine its semantic understanding of images, so that when asked which object between (1) a piece of paper, (2) a pair of headphones or (3) a rock may be used instead of a hammer, it answers correctly, (3). +In a follow-up work, the same group of authors propose a modified method to learn generalist models, leveraging (1) a more powerful architecture and (2) scaling up the dataset used . In RT-2, @brohanRT2VisionLanguageActionModels2023 propose inheriting internet-scale semantic knowledge from large-scale multi-modal datasets to learn a single, *unified model* for robotics control. 
Such a model, termed *Vision-Language-Action* (VLA) in the original RT-2 paper, effectively casts robot control as a language-modeling problem, and in particular as a Visual Question-Answering (VQ) task, in which the output token space used to represent *textual tokens* is shared with the *8-bits tokens* used to represent the 256 ($2^8$) actuation levels of a 6-dof robot. In their work, @brohanRT2VisionLanguageActionModels2023 propose co-fine-tuning large-scale VLMs such as PaLIX @chenPaLIXScalingMultilingual2023 or PaLM-E @driessPaLMEEmbodiedMultimodal2023 on a mix of (1) web and (2) robotics data, complementing VQtraining with robotics-specific signal, and learning to directly output robot actions in a shared token space for visual and language inputs. In their work, the authors claim using large models trained on internet-scale data as backbones for VLAs allows models to tap into the rich semantic knowledge embedded in the VLM’s parameters, interpreting instructions and unseen objects by connecting them to concepts acquired while pre-training. For instance, @brohanRT2VisionLanguageActionModels2023 show that while RT-2 has never been explicitly trained to repurpose tools for a *hammering* task, it can still combine its semantic understanding of images, so that when asked which object between (1) a piece of paper, (2) a pair of headphones or (3) a rock may be used instead of a hammer, it correctly answers (3). -Traditionally, research involved not only training the model but also collecting the underlying data, a costly and time-consuming process--for instance, @jangBCZZeroShotTask2022 gathered 25K+ trajectories before training, while RT-1 required 130K+. In turn, the data used in robot learning research efforts have traditionally proved rather fragmented, tailored to the specific task considered by the specific group of researchers who collected it, ultimately hindering integration. 
The Open X-Embodiment project @collaborationOpenXEmbodimentRobotic2025 was a landmark effort to address the data fragmentation problem, curating the aggregation of 60 *existing* robotics datasets from 22 different robot embodiments and 21 institutions, resulting in a total 1.4M of cross-embodiments, cross-tasks, openly-available trajectories. Besides the contribution of an aggregate, large scale dataset, @collaborationOpenXEmbodimentRobotic2025 also demonstrated significant positive transfer *across tasks and embodiments*, showing that a single model trained on multi-embodiment data can outperform specialist models trained on their respective single-embodiment datasets. The Distributed Robot Interaction Dataset (DROID) @khazatskyDROIDLargeScaleInTheWild2025 represents another significant step towards addressing the problem of scarse and disaggregated data in robot learning, providing a unique dataset consisting of 75K+ human demonstrations collected in realistic (*in-the-wild*) manipulation settings, providing another cornerstone for building general-purpose robot policies. Recently, foundational datasets curated through large, centralized efforts, are increasingly complemented by decentralized, community-driven collection of robotics data. Software libraries as `lerobot` have been instrumental in enabling decentralized collection of large amounts of data, providing the infrastructure for researchers and practitioners to easily contribute trajectories from range of embodiments, democratizing data access via distributed collection. +Traditionally, research efforts revolved around not only training models, but also proposing datasets for the community, a costly and time-consuming process. Due to the aforementioned embodiment gap, the data used in research efforts in robot learning have traditionally proved rather fragmented, tailored to the specific task considered by the specific group of researchers who collected it, which ultimately hindered integration. 
The Open X-Embodiment project @oneillOpenXEmbodimentRobotic2025 was a landmark collaboration effort to address data fragmentation, by curating the aggregation of 60 *existing* robotics datasets from 22 different robot embodiments and 21 institutions across the world, and resulted in a total of 1.4M cross-embodiment, cross-task, openly-available trajectories. Besides the contribution of an aggregate, large scale dataset, @oneillOpenXEmbodimentRobotic2025 also demonstrated significant positive transfer *across tasks and embodiments*, showing that a single model trained on multi-embodiment data can outperform specialist models trained on their respective single-embodiment datasets. The Distributed Robot Interaction Dataset (DROID) @khazatskyDROIDLargeScaleInTheWild2025 represents another significant step towards addressing the problem of scarce and disaggregated data in robot learning, providing a unique dataset consisting of 75k+ human demonstrations collected in realistic (*in-the-wild*) manipulation settings, providing another cornerstone for building general-purpose robot policies. Recently, foundational datasets curated through large, centralized efforts, are increasingly complemented by decentralized, community-driven contributions of robotics data. Software libraries like `lerobot` have been instrumental in enabling decentralized collection of large amounts of data, providing the infrastructure for researchers and practitioners to easily contribute trajectories from a wide range of embodiments, democratizing data access via distributed collection. -The success of large, proprietary models like RT-1 and RT-2, highlighted a growing accessibility gap in robotics research, as training and deploying large-scale models requires computational resources simply unattainable for most research institutions.
The OpenVLA project @kimOpenVLAOpenSourceVisionLanguageAction2024 emerged in direct contrast of closed-source counterparts, as a community-driven effort to create powerful, openly available VLAs. In particular, @kimOpenVLAOpenSourceVisionLanguageAction2024 trained OpenVLA by exclusively leveraging openly available data (970K+ from the Open-X dataset), and share training recipes alongside the model weights. Architecturally, OpenVLA integrates a pre-trained vision encoder to project visual tokens into the embedding space of Llama2-7B @touvronLlama2Open2023 language model backbone. The language model backbone is then used to predict *discrete action tokens* over 256 activation levels. +Despite these advancements, the success of large, proprietary models like RT-1 and RT-2, highlighted a growing accessibility gap in robotics research, as training and deploying large-scale robotics foundation models requires computational resources simply unattainable for most research institutions. The OpenVLA project @kimOpenVLAOpenSourceVisionLanguageAction2024 emerged in direct contrast to traditionally closed-source efforts to develop VLAs. In particular, @kimOpenVLAOpenSourceVisionLanguageAction2024 trained OpenVLA by exclusively leveraging openly available data (970k+ trajectories from the Open-X dataset), and openly shared their training recipes alongside the model weights. Architecturally, OpenVLA integrates a pre-trained vision encoder to project visual tokens into the embedding space of the Llama2-7B @touvronLlama2Open2023 language-model backbone. The language model backbone is then used to predict *discrete action tokens* over 256 activation levels.
-
Robot learning is undergoing a paradigmatic shift: centralized data collections (A, left) are increasingly larger, often comprising Ms of demonstrations, and (A, right) decentralized approaches to data collection are also rising as an alternative for large scale data collection. (B) Generalist models are also becoming increasingly smaller and easier to run on limited hardware.
+
Robot learning is undergoing a paradigm shift: centralized data collections (A, left) are increasingly larger, often comprising millions of demonstrations, while (A, right) decentralized data collection efforts are becoming an alternative for large-scale data collection. (B) Generalist models are also becoming increasingly smaller and easier to run on limited hardware.
-Figure [ch5-trends] illustrates graphically the two most relevant trends in modern robot learning. As datasets collected via centralized, cross-institutions cooperation of increasing size are made available for the research community, decentralized datasets collected by individual researchers and practitioners have also gained traction recently, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are also becoming much more compute-efficient, and as a result the models’ size has been consistently reducing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments. +Figure [ch5-trends] shows the current trends in robot learning in terms of size and nature of the robotics datasets contributed, together with the size and accessibility of the available models. As datasets collected via centralized, cross-institutions cooperation of increasing size are made available for the research community, decentralized datasets collected by individual researchers and practitioners also gained traction, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are increasingly becoming much more compute-efficient, and as a result the models’ size has been consistently reducing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments. -### Modern VLAs +### VLAs -Modern recipes to train large scale VLAs extend early efforts to learn foundation models from large amounts of data via BC, introducing significant advancements concerning both architectural and procedural aspects. 
From an architectural perspective, modern VLAs such as $\pi_0$ @blackp0VisionLanguageActionFlow2024 leverage a *unified transformer model* for efficiency of computation, while maintaining specialized sub-components within the model for visual perception and action prediction, enabling cross-task performance via language conditioning. Crucially, modern VLAs including @blackp0VisionLanguageActionFlow2024\[$\pi_0$\] and @shukorSmolVLAVisionLanguageActionModel2025\[SmolVLA\] adopt *unified* transformer models employing disjoint set of weights (*experts*) for compute-efficient visual-semantic understanding and robotic control. Procedurally, modern VLAs complement advanced Vision-Language Model (VLM) backbones with action-specific modules (1) adopting mid-sized *action experts* to model continuous actions distributions $p (a_{t:t+H_a} \vert o_t)$--avoiding discrete action tokens entirely--and (2) relying on *action chunking*  as a strategy to reduce error compounding when predicting multiple actions learning from inherently non-i.i.d. data, such as demonstration data. +Modern recipes to train large scale VLAs extend early efforts to learn foundation models from large amounts of data via BC, introducing significant advancements concerning both architectural and procedural aspects. From an architectural perspective, modern VLAs such as $\pi_0$ @blackp0VisionLanguageActionFlow2024 leverage a *unified transformer model* for efficiency of computation, while maintaining specialized sub-components within the model for visual perception and action prediction, enabling cross-task performance via language conditioning. Crucially, modern VLAs including $\pi_0$ @blackp0VisionLanguageActionFlow2024 and SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 adopt *unified* transformer models employing disjoint sets of weights (*experts*) for both compute-efficient visual-semantic understanding as well as control.
Procedurally, VLAs complement advanced Vision-Language Model (VLM) backbones with action-specific modules (1) adopting mid-sized *action experts* to model continuous actions distributions $p (a_{t:t+H_a} \vert o_t)$--avoiding discrete action tokens entirely--and (2) relying on *action chunking*  as a strategy to reduce error compounding when predicting multiple actions learning from inherently non-i.i.d. data, such as demonstration data. -These architectural and procedural innovations present three benefits. First, developing architectures that exploit internet-scale pre-trained backbones allows to fully capitalizes on the vast world knowledge and skills state-of-the-art VLMs exhibit, preventig models from needing to learn visual, linguistic and semantic concepts from scratch. Second, using generative models for continuous action distributions allows to learn rich, multimodal data distributions, a much more likely scenario in the big-data regime typically tackled while developing generalist policies. Further, introducing two separate components for perception and action planning could enable using Mixture of Experts (MoE) architectures @fedusReviewSparseExpert2022, more efficient to run and thus resulting in faster inference--a key features for models deployed in real-world scenarios. This new paradigm has been at the core of some of the most capable generalist policies developed to date, capable to few-shot adapt to novel tasks and to perform highly dexterous manipulation tasks, ranging from end-to-end folding laundry, to bussing tables. +These architectural and procedural innovations present three benefits over task-specific methods. First, developing architectures that exploit internet-scale pre-trained backbones allows one to fully capitalize on the vast world knowledge and skills state-of-the-art VLMs exhibit, preventing models from needing to learn visual, linguistic and semantic concepts from scratch.
Second, using generative models for continuous action distributions allows to learn rich, multimodal data distributions, a much more likely scenario in the big-data regime which is typically tackled while developing generalist policies. Further, introducing separate components for perception and action planning enable using Mixture of Experts (MoE) architectures @fedusReviewSparseExpert2022, which are often more efficient to run--a key feature for models deployed in real-world scenarios. This new paradigm has been at the core of some of the most capable generalist policies developed to date, capable to few-shot adapt to novel tasks and to perform highly dexterous manipulation tasks ranging from end-to-end folding laundry to bussing tables @blackp0VisionLanguageActionFlow2024. #### VLMs for VLAs -VLMs are designed to process both visual and textual modalities--most commonly by taking both images and text as input and generating text conditioned on the visual context. Recent advances in VLMs have been driven by the success of LLMs, with many approaches building upon pretrained LLMs and adopting similar training paradigms to the ones used in language modeling. Typically, VLMs @alayracFlamingoVisualLanguage2022, @laurenconWhatMattersWhen2024, @linVILAPretrainingVisual2024 are constructed by integrating a pretrained vision encoder @radfordLearningTransferableVisual2021, @zhaiSigmoidLossLanguage2023, @finiMultimodalAutoregressivePretraining2024 with a pretrained LLM @grattafioriLlama3Herd2024, @jiangMistral7B2023. Training then proceeds in multiple multimodal stages, beginning with a large-scale pretraining on datasets containing image-text pairs @LAION-COCO, @kakaobrain2022coyo700m and interleaved vision-language corpora @OBELICS, @MMC4, all followed by a supervised fine-tuning stage on instruction-tuning datasets @LLaVA-1.5, @tong2024cambrian, @laurenconWhatMattersWhen2024. The inherent multimodal nature of VLMs enables them to jointly reason over vision and language. 
Pre-training on vast internet-scale datasets allows these models to associate visual patterns with textual descriptions, thereby acquiring a rich semantic understanding of the world--knowledge about objects, their properties, and relationships--without explicit supervision for each concept. In turn, integrating a VLM as a perception backbone for a VLA allows the complete model to inherit rich world knowledge, sidestepping the need to learn visual and semantic representations from scratch. In principle, this allows the robot to ground high-level natural language instructions in its visual context, and possibly recognize unseen objects by connecting them to pre-trained concepts absorbed during pre-training, improving on the possibility to generalize to novel scenarios. +VLMs are designed to handle both visual and textual modalities, most commonly by taking both images and text as inputs, generating text conditioned on the visual context. Recent advances in VLMs have been driven by the success of LLMs, with many approaches building upon pretrained LLMs and adopting similar training paradigms to the ones used in language modeling. Typically, VLMs @alayracFlamingoVisualLanguage2022, @laurenconWhatMattersWhen2024, @linVILAPretrainingVisual2024 are constructed by integrating a pretrained vision encoder @radfordLearningTransferableVisual2021, @zhaiSigmoidLossLanguage2023, @finiMultimodalAutoregressivePretraining2024 with a pretrained LLM @grattafioriLlama3Herd2024, @jiangMistral7B2023. Training then proceeds in multiple multimodal stages, beginning with a large-scale pretraining on datasets containing image-text pairs @LAION-COCO, @kakaobrain2022coyo700m and interleaved vision-language corpora @OBELICS, @MMC4, all followed by a supervised fine-tuning stage on instruction-tuning datasets @LLaVA-1.5, @tong2024cambrian, @laurenconWhatMattersWhen2024. The inherent multimodal nature of VLMs enables them to jointly reason over vision and language. 
Pre-training on vast internet-scale datasets allows these models to associate visual patterns with textual descriptions, thereby acquiring a rich semantic understanding of the world--knowledge about objects, their properties, and relationships--without explicit supervision for each concept. In turn, integrating VLMs as the perceptual backbone for VLAs allows the latter to inherit rich, contextual world knowledge from the VLM, sidestepping the need to re-learn visual and semantic representations. In principle, this also allows the robot to ground high-level natural language instructions in its visual context, and possibly recognize objects by connecting them to the pre-trained concepts absorbed during pre-training, improving on the possibility to generalize to novel scenarios. -Recently, compute efficiency has also become a central focus in VLM research. Several works aim to reduce training costs by using smaller, more diverse datasets @LLaVA-1.5, @InstructBLIP, @bai2025qwen25vl, @zhu2024minigpt, @tong2024cambrian, training smaller-scale models @marafiotiSmolVLMRedefiningSmall2025, @moondream, @minicmpv2024, or by adapting pretrained unimodal models by tuning only a small subset of parameters @shukor2023epalm, @vallaeys2024improveddepalm, @MAPL, @FROMAGe, @tsimpoukelli2021multimodalfrozen, @BLIP-2. While the majority of VLM research focuses on image and text modalities, recent work has demonstrated that similar techniques can be extended to integrate additional modalities, such as video and audio @wang2025internvideo2, @liu2024kangaroo, @zhang2025videollama, @kong2024audioflam--a particularly promising direction of research for robotics applications, where multiple sensor modalities can be integrated effectively. This trend towards efficiency is paramount for robotics applications, where policies must operate under the stringent constraints of real-world deployment. 
Indeed, robots often possess limited on-board computational resources and must react in real-time to dynamic environments. Smaller and faster VLMs have thus become quintessential for developing responsive autonomous systems, enabling high-frequency control loops by reducing the latency between perception and action. +Recently, compute efficiency has also become a central focus in multi-modal research. Several works aim to reduce training costs by using smaller, more diverse datasets @LLaVA-1.5, @InstructBLIP, @bai2025qwen25vl, @zhu2024minigpt, @tong2024cambrian, training smaller-scale models @marafiotiSmolVLMRedefiningSmall2025, @moondream, @minicmpv2024, or by adapting pretrained unimodal models by tuning only a small subset of parameters @shukor2023epalm, @vallaeys2024improveddepalm, @MAPL, @FROMAGe, @tsimpoukelli2021multimodalfrozen, @BLIP-2. While the majority of VLM research focuses on image and text modalities, recent work has also demonstrated that similar techniques can be extended to integrate additional modalities, such as video and audio @wang2025internvideo2, @liu2024kangaroo, @zhang2025videollama, @kong2024audioflam--a particularly promising direction of research for robotics applications, where multiple sensor modalities can be integrated effectively. This trend towards efficiency is paramount for robotics applications, where policies must operate under the stringent constraints of real-world deployment. ### $\pi_0$ -$\pi_0$ @blackp0VisionLanguageActionFlow2024 introduce a VLA consisting of a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching. Images and language are embedded with a late-fusion VLM (PaliGemma), while proprioceptive state and actions chunks are routed to a smaller action expert, initialized from scratch. 
The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and values matrices at each layer, maintaining specialization while efficiently allocating computation. +$\pi_0$ @blackp0VisionLanguageActionFlow2024 introduce a VLA consisting of a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching. Images and language are embedded with PaliGemma, a VLM merging independently encoded visual and textual features deep in the network (*late-fusion*), while proprioceptive state and actions chunks are routed to a smaller *action expert*, initialized from scratch. The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and values matrices at each layer, maintaining specialization while efficiently allocating computation.
-
The π 0 architecture, as in @blackp0VisionLanguageActionFlow2024. Vision and language tokens are routed to a VLM backbone which is prevented from attending robot proprioperceptive states and action tokens, which are instead routed to a smaller subset of weights within the architecture. The architecture is trained with Flow Matching on 10M+ trajectories from a mixture of closed and openly available datasets.
+
The π 0 architecture, as in @blackp0VisionLanguageActionFlow2024. Vision and language tokens are routed to a VLM backbone which is prevented from attending robot proprioperceptive states and action tokens, which are instead routed to a smaller subset of weights within the architecture referred to as "action expert". The architecture is trained with Flow Matching on 10M+ trajectories from a mixture of closed and openly available datasets.
-Concretely, $\pi_0$ is a unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $p_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple cameras points $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used processes the robot proprioperceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure [ch5-pi0]). The different expert networks operate separately in processing the respective inputs and turning them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioperceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while across blocks, future blocks are masked out. Formally, this corresponds to using the attention mask $\mathbf{A} = \bordermatrix{ \mathcal{T}_i \mathcal{T}_q \mathcal{T}_a \cr \mathcal{T}_i \mathbf{1} \mathbf{0} \mathbf{0} \cr \mathcal{T}_q \mathbf{1} \mathbf{1} \mathbf{0} \cr \mathcal{T}_a \mathbf{1} \mathbf{1} \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block directional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. 
*Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioperceptive and action tokens, one can cache keys and values across denoising steps at runtime time, incuring in a reduced computational footprint and faster inference. +Concretely, $\pi_0$ is a single, unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $f_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple cameras points $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process both the robot proprioperceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure [ch5-pi0]). The different expert networks operate separately in processing the respective inputs and turn them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioperceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while *across* blocks, future blocks are masked out. 
Formally, this corresponds to using an attention mask like: $\mathbf{A} = \bordermatrix{ \mathcal{T}_i \mathcal{T}_q \mathcal{T}_a \cr \mathcal{T}_i \mathbf{1} \mathbf{0} \mathbf{0} \cr \mathcal{T}_q \mathbf{1} \mathbf{1} \mathbf{0} \cr \mathcal{T}_a \mathbf{1} \mathbf{1} \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block directional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioperceptive tokens and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference. In $\pi_0$, both the VLM backbone and action expert are update using a *flow matching* loss, and in particular are updated minimizing: @@ -1363,16 +2060,16 @@ In $\pi_0$, both the VLM backbone and action expert are update using a *flow mat o_t, a_{t:t+H_a} \sim \mathcal D \notag \end{align} ``` +where the two experts parametrized by the separate weights $\phi, \theta$ interact with each other via self-attention layers only, so that the action expert $v_\theta$ internal computations also depend on the VLM backbone’s parameters $\phi$. Importantly, @blackp0VisionLanguageActionFlow2024 minimize eq. [pi0-loss] over both the multimodal backbone and action expert parameters, thus updating both the internal representations of the VLM and action-expert weights using BC-specific gradients.
In contrast, @driessKnowledgeInsulatingVisionLanguageAction2025 later show that failing to insulate the VLM knowledge from the flow matching gradients actually harms performance. -Where the experts parametrized by the separate weights $\phi, \theta$ interact with each other via self-attention layers only, so that the action expert $v_\theta$ internal computations also depend on the VLM backbone’s parameters $\phi$. Importantly, @blackp0VisionLanguageActionFlow2024 minimize [pi0-loss] over both the multimodal backbone and action expert parameters, thus updating the internal representations of the VLM using BC-specific gradients. In contrast, @driessKnowledgeInsulatingVisionLanguageAction2025 later show that failing to insulate the VLM knowledge from the flow matching gradients actually harms performance. Inference is performed iteratively refining action chunks while numerically forward-integrating the vector field predicted by the action expert, - +At runtime, inference is performed iteratively refining action chunks while numerically forward-integrating the vector field predicted by the action expert, ``` math \begin{equation} a_{t:t+H_a}^{\tau + \delta} = a_{t:t+H_a}^{\tau } + \delta v_\theta(a_{t:t+H_a}^{\tau }, o_t) \end{equation} ``` -Flow matching  can be seen as a continuous time, detetrministic generalization of Diffusion and has proven effective in modeling highly complex multi-modal distributions, including those over images and video. In turn, its application to large-scale data collections of multiple human behaviors across tasks and embodiments appears rather consequential, particularly considering how it can enable faster inference via a reduced number of denoising steps--as few as 10, in $\pi_0$. In particular, the action expert is model as a conditional flow matching model. Each action token embeds a noisy action $a_i^{\tau} \in a^\tau_{t:t+H_a}$, alongside a sinusoidal encoding of the *flow process* timestep $\tau$. 
The action expert then leverages full bidirectional attention across the $H_a$ action tokens provided, as well as attends to previous proprioperceptive and image-language tokens as well. Interestingly, differently from a standard flow matching pipeline @lipmanFlowMatchingGenerative2023, $\tau$ is *not* sampled from a uniform distribution $\tau \sim \mathcal U([0,1])$, but rather obtained from $\tau \sim \textrm{Beta}(1.5,1)$ defined on the $[0,s], s<1$ support (Figure [ch5-pi0-sampling-timesteps]). +Flow matching  can be seen as a continuous time, deterministic generalization of diffusion processes, and has proven effective in modeling highly complex multi-modal distributions, including those over images and video. In turn, the application of flow matching to large-scale datasets of multiple human behaviors across tasks and embodiments appears rather consequential, particularly considering how it can enable faster inference via a limited number of denoising steps at test time--as few as 10, in $\pi_0$. In particular, the action expert is implemented as a conditional flow matching model. Each action token embeds a noisy action $a_i^{\tau} \in a^\tau_{t:t+H_a}$, alongside a sinusoidal encoding of the *flow process* timestep $\tau$. The action expert then leverages full bidirectional attention across the $H_a$ action tokens provided, and also attends to previous proprioperceptive and image-language tokens. Interestingly, differently from a standard flow matching pipeline @lipmanFlowMatchingGenerative2023, $\tau$ is *not* sampled from a uniform distribution $\tau \sim \mathcal U([0,1])$, but rather obtained from $\tau \sim \textrm{Beta}(1.5,1)$ defined on the $[0,s], s<1$ support (Figure [ch5-pi0-sampling-timesteps]).
@@ -1387,17 +2084,77 @@ r0.4
-Using such Beta distribution emphasizes higher noise levels during training, a choice @blackp0VisionLanguageActionFlow2024 argue allows $\pi_0$to focus on learning the mean of the data distribution $\mathbb E[a_{t:t+H_a} \vert o_t]$ during training, in keeping with @esserScalingRectifiedFlow2024. To further optimize performance and reduce inference time, @blackp0VisionLanguageActionFlow2024 propose reducing the support of the timestep distribution to $[0,s], \ s < 1$, as for any forward-integration step size $\delta = 1-s$ timesteps above $s$ are never sampled at inference time. +Using such Beta distribution emphasizes higher noise levels during training, a choice @blackp0VisionLanguageActionFlow2024 argue allows $\pi_0$ to focus on learning to reconstruct the mean of the data distribution $\mathbb E[a_{t:t+H_a} \vert o_t]$ over an identity map during training, in keeping with @esserScalingRectifiedFlow2024. To further optimize performance and reduce inference time, @blackp0VisionLanguageActionFlow2024 propose reducing the support of the timestep distribution to $[0,s], \ s < 1$, as for any forward-integration step size $\delta = 1-s$ timesteps above $s$ are never sampled at inference time. -Besides adopting a MoE architecture with a VLM backbone initialized from a pre-trained model and trained jointly with an action expert via flow matching, $\pi_0$also relies on a unique pre-training corpus mixes open data of 10M+ trajectories, which @blackp0VisionLanguageActionFlow2024 claim to be the largest dataset used in building a foundational model in robotics to date. The dataset used to train $\pi_0$--referred to as $\pi$ dataset--comprises a private, undisclosed portion obtained via teleoperation aggregated to openly available datasets including Open-X and DROID, with $\approx 9.1\%$ of the $\pi$ being openly available. 
Open datasets such as DROID and Open-X are complemeneted with expert trajectories with of dexterous demonstrations tasks spanning 7 robot configurations and 68 different tasks.  @blackp0VisionLanguageActionFlow2024 show that pre-training on the $\pi$ dataset yields a broadly capable base model, which can be adapted via post-training on narrower high-quality task data, inducing fluent multi-stage behavior while retaining robustness. In particular, @blackp0VisionLanguageActionFlow2024 report that, across a variety of benchmarks, $\pi_0$pretrained on the $\pi$ dataset and post-trained on extra high-quality data demonstrations *consistently outperform* $\pi_0$trained from scratch (i.e., without pretraining on the $\pi$ dataset), further scoring the relevance of pretraining.  @blackp0VisionLanguageActionFlow2024 offer an intuition behind this finding: high-quality demonstrations of a given task typically do not contain mistakes, and how human demonstrator may recover from them. In turn, robot trained on high-quality data exclusively with BC may be incapable to recover from failure. Conversely, large scale collections of human demonstrations are typically much more diverse (if anything, for their sheer scale), and therefore typically contain rich and diverse information, which may prove suboptimal for any given task when considered in isolation but that proves invaluable in coupling with a small, narrower set of demonstrations. +Besides adopting a MoE architecture with a VLM backbone initialized from a pre-trained model and trained jointly with an action expert via flow matching, $\pi_0$ also relies on a unique pre-training corpus comprising of a mix of proprietary and open data totaling 10M+ trajectories, which in their work @blackp0VisionLanguageActionFlow2024 claim to be the largest dataset used to develop a foundational robotics model to date. 
The dataset used to train $\pi_0$--referred to as "the $\pi$ dataset"--comprises a private, undisclosed portion obtained via expert teleoperation as well as openly available datasets including Open-X and DROID, with only $\approx 9.1\%$ of the $\pi$ dataset being openly available. In the $\pi$ dataset, open datasets such as DROID and Open-X are complemented with expert trajectories consisting of demonstrations of dexterous tasks spanning 7 robot configurations and 68 different tasks. Crucially, @blackp0VisionLanguageActionFlow2024 show that pre-training on the $\pi$ dataset yields a broadly capable base model, which can be adapted via fine-tuning on narrower, higher-quality task data, which induces a fluent multi-stage behavior while retaining robustness. In particular, @blackp0VisionLanguageActionFlow2024 report that, across a variety of benchmarks, the version of $\pi_0$ pretrained on the $\pi$ dataset and fine-tuned on extra high-quality data demonstrations *consistently outperforms* a $\pi_0^{\text{scratch}}$ baseline trained entirely from scratch for a given specific task, which further underscores the relevance of pretraining on the $\pi$ dataset. @blackp0VisionLanguageActionFlow2024 also offer an intuition behind this finding: high-quality demonstrations of a given task tend to omit failure data, which inherently prevents an autonomous agent from learning how to recover from near-failure states. In turn, robots trained on high-quality data exclusively with BC may as well be entirely incapable of recovering from failure. Conversely, large scale collections of human demonstrations are typically much more diverse (if anything, for their sheer scale), and thus contain rich and diverse information, which may prove suboptimal for any given task when considered in isolation but which proves invaluable in coupling with a small, narrower set of demonstrations. 
-Lastly, @blackp0VisionLanguageActionFlow2024 present cross-embodiment experiments where they demonstrate $\pi_0$’s ability to control both mobile and static manipulator robots with varying arm embodiments. The emergence of cross-embodiment capabilities is largely to be attributed to the presence of large scale cross-embodiment data in the data mixture, handled by $\pi_0$defaulting to the maximal configuration size across the $\pi$ dataset, and zero-padding robots with fewer dof. In that $\pi_0$constantly processes 18 DoFs robots (two 6-DoF arms, two grippers, base, vertical torso), regardless of the kind of robot, and robots with fewer dofs are zero-padded. $\pi_0$also relies on three camera views, and uses masked image slots for training and deployment scenarios with fewer cameras. +Lastly, @blackp0VisionLanguageActionFlow2024 present cross-embodiment experiments where they demonstrate $\pi_0$’s ability to control both mobile and static manipulator robots with varying arm embodiments. The emergence of cross-embodiment capabilities is largely to be attributed to the presence of large scale cross-embodiment data in $\pi$ data mixture, which is in practice handled by $\pi_0$ outputting actions with maximal configuration size across the whole $\pi$ dataset, and zero-padding robots with fewer dofs. $\pi_0$ does also rely on exactly three camera views at both training and test time, and uses masked image slots for training and deployment scenarios with fewer cameras. #### Code Example: Using $\pi_0$ +
+ +Using $\pi_0$ +[https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch5/01_using_pi0.py](https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch5/01_using_pi0.py) + +```python +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset + +delta_timestamps = { + "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame +} + +# Optionally, use StreamingLeRobotDataset to avoid downloading the dataset +dataset = LeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Streams frames from the Hugging Face Hub without loading into memory +streaming_dataset = StreamingLeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Get the 100th frame in the dataset by +sample = dataset[100] +print(sample) +# { +# 'observation.state': tensor([...]), +# 'action': tensor([...]), +# 'observation.images.wrist_camera': tensor([3, C, H, W]), for delta timesteps +# ... +# } + +batch_size=16 +# wrap the dataset in a DataLoader to use process it batches for training purposes +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size +) + +# Iterate over the DataLoader in a training loop +num_epochs = 1 +device = "cuda" if torch.cuda.is_available() else "cpu" + +for epoch in range(num_epochs): + for batch in data_loader: + # Move data to the appropriate device (e.g., GPU) + observations = batch["observation.state"].to(device) + actions = batch["action"].to(device) + images = batch["observation.images.wrist_camera"].to(device) + + # Next, you can do amazing_model.forward(batch) + ... +``` + +
+ ### SmolVLA -VLAs remain in an early stage of development and are not yet as mature or widely adopted as LLMs and VLMs. Further, much of the impactful VLA progress remains proprietary, with many models sharing only weights while withholding full training details and essential methodological components. SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, aiming to democratize the developments of robotics foundation models by open sourcing model, training recipes and data used. +With VLAs in the early stage of development compared to more mature LLMs and VLMs, much of the progress made on VLAs remains proprietary, with many releases exclusively sharing the weights while withholding the data used, full experimental details and essential methodological components of training. In constrast with this closed approach, SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, which aims at democratizing the developments of robotics foundation models by open sourcing the model alongside the data used as well as the training recipes.
-
The SmolVLA architecture, as in @shukorSmolVLAVisionLanguageActionModel2025. SmolVLA is a compact MoE model trained with flow matching to denoise action chunks. Vision and language tokens are fed to a VLM backbone, and share information with the proprioperceptive and action tokens via the attention mechanism. The attention expert interleaves SA and CA layers for further conditioning on the visual features from the VLM backbone. SmolVLA skips computations and reduces the visual tokens, resulting in 6x less memory usage than π 0 .
+
The SmolVLA architecture, as in @shukorSmolVLAVisionLanguageActionModel2025. SmolVLA is a compact MoE model trained with flow matching to denoise action chunks. Vision and language tokens are fed to a VLM backbone, and share information with the proprioperceptive and action tokens via the attention mechanism. The attention expert interleaves SA and CA layers for further conditioning on the visual features from the VLM backbone. SmolVLA skips computations and reduces the visual tokens, resulting in 7x less memory usage than π 0 (450M parameters vs. π 0 ’s 3.3B).
-While encouraging efforts like $\pi_0$ @blackp0VisionLanguageActionFlow2024 demonstrate the feasibility of open VLA systems, they remain (1) large and compute-intensive and (2) dependent on closed datasets collected via centralized efforts on costly robotic platforms, ultimately hindering accessibility. SmolVLA mitigates both these accessibility issues by (1) prioritizing a compact, compute-efficient VLA design and (2) targeting community-contributed datasets on accessible robotic platforms such as the SO-100 and SO-101 arms. Similarly to $\pi_0$, SmolVLA (Figure [ch5-smolvla]) employs a MoE architecture combining a pretrained VLM backbone with a dedicated action expert, and trains with flow matching. To ensure efficiency and accessibility, SmolVLA adopts SmolVLM-2 @marafiotiSmolVLMRedefiningSmall2025 as its VLM backbone, considering SmolVLM-2’s reduced size and capability to process multiple image inputs alongside text items. SmolVLM-2 uses SigLIP @zhaiSigmoidLossLanguage2023 as vision encoder, producing visual features for a SmolLM2 language decoder @allalSmolLM2WhenSmol2025. Further, SmolVLA adopts a smaller action expert consisting of $\sim$100M parameters and an interleaved stack of self and cross-attention layers. To improve efficiency, the action expert adopts a reduced embedding dimension compared to the VLM backbone, resulting in $d_{v_\theta} = 0.75 d_{\text{VLM}}$. @shukorSmolVLAVisionLanguageActionModel2025’s design choices thus result in a much smaller size model compared to $\pi_0$, consisting of around 450M parameters versus $\pi_0$’s 3.3B parameters. +While encouraging efforts like $\pi_0$ @blackp0VisionLanguageActionFlow2024 demonstrate the feasibility of open VLA systems, they remain (1) large and compute-intensive and (2) dependent on closed datasets collected via centralized efforts on costly robotic platforms, which ultimately hinders the accessibility of the method altogether. 
SmolVLA mitigates both these issues by (1) prioritizing a compact, compute-efficient VLA design and (2) targeting community-contributed datasets on accessible robotic platforms such as the SO-100 and SO-101 arms. Similarly to $\pi_0$, SmolVLA (Figure [ch5-smolvla]) employs a MoE architecture combining a pretrained VLM backbone with a dedicated action expert, and trains with flow matching. To ensure efficiency and accessibility, SmolVLA adopts SmolVLM-2 @marafiotiSmolVLMRedefiningSmall2025 as its VLM backbone, considering SmolVLM-2’s reduced size and capability to process multiple image inputs alongside text items. SmolVLM-2 uses SigLIP @zhaiSigmoidLossLanguage2023 as vision encoder, producing visual features for a SmolLM2 language decoder @allalSmolLM2WhenSmol2025. Further, SmolVLA adopts a smaller action expert consisting of $\sim$100M parameters and an interleaved stack of self and cross-attention layers. To improve efficiency, the action expert adopts a reduced embedding dimension compared to the VLM backbone, resulting in $d_{v_\theta} = 0.75 d_{\text{VLM}}$. @shukorSmolVLAVisionLanguageActionModel2025’s design choices thus result in a much smaller size model compared to $\pi_0$, consisting of ca. 450M parameters versus $\pi_0$’s 3.3B parameters. -Effectively, SmolVLA consumes multi-view RGB images, a natural-language instruction, and a projected sensorimotor state token as inputs, together with the noised *action chunk* $\tilde{a_{t-t+H_a}}$ the action expert $v_\theta$ is trained to denoise. In particular, robot proprioperceptive states are projected into a shared token space with the VLM to match $d_{\text{VLM}}$, and successively projected into the expert’s token space. Similarily to $\pi_0$, SmolVLA adopts separate experts communicating exclusively through self-attention layers, which do not employ the same blockwise causal masking in favour of a simple causal masking, resulting in a lower triangular attention mask. 
+In practice, SmolVLA consumes multi-view RGB images, a natural-language instruction, and a projected sensorimotor state token as inputs, together with the noised *action chunk* $\tilde{a}_{t:t+H_a}$ the action expert $v_\theta$ is trained to denoise. The robot proprioperceptive states are projected to a shared token space with the VLM to match $d_{\text{VLM}}$, and successively projected into the expert’s token space. Similarly to $\pi_0$, SmolVLA adopts separate experts communicating exclusively through self-attention layers, which however do not employ blockwise causal attention masking and rather favour simple causal masking.

-In contrast with $\pi_0$, the action expert interleaves *cross-attention* (CA) and *self-attention* (SA) layers, a choice shown to yield higher success and smoother action chunks in practice. While in the expert SA layers, tokens are used to obtain queries, keys and values, CA layers use action tokens only as queries, and instead project visual, language and proprioperceptive tokens in a shared action space to obtain keys and values. Notably, keys and values can be cached as well, resulting in performance gains at inference time.
+In contrast with $\pi_0$, the action expert interleaves *cross-attention* (CA) and *self-attention* (SA) layers, a choice shown to yield higher success and smoother action chunks in practice. While in the expert SA layers tokens are used to obtain queries, keys and values, CA layers use action tokens only as queries, and instead project visual, language and proprioperceptive tokens from the VLM backbone to a shared embedding space to then obtain keys and values. Notably, keys and values can be cached here as well, resulting in performance gains at inference time.

-SmolVLA trims both token and layer compute. First, it *reduces visual tokens* via pixel shuffle to a fixed budget of 64 tokens per frame, foregoing tiling used during VLM pretraining for runtime efficiency. 
Second, it *skips upper VLM layers*: the action expert consumes features from the first $N$ decoder layers, with $N=L/2$ providing a good speed-performance trade-off and effectively halving downstream compute for the larger part of SmolVLA. Beyond model compactness, SmolVLA also contributes an inference stack that decouples action prediction from execution for responsiveness on modest hardware (Section [ch4-async-inference]). +SmolVLA also trims down both token and layer compute. First, it *reduces visual tokens* via pixel shuffling to a fixed budget of 64 tokens per frame, foregoing the tiling used during VLM pretraining for the sake of runtime efficiency. Second, it *skips upper VLM layers*, as only features from the first $N$ decoder layers, with $N=L/2$, are consumed, which provides a good speed-performance trade-off and effectively halves compute needs for the larger part of SmolVLA. Beyond model compactness, SmolVLA also contributes an inference stack that decouples action prediction from execution for responsiveness on modest hardware (Section [ch4-async-inference]). -Departing from reliance on proprietary datasets, SmolVLA pretrains exclusively on 450+ *community datasets*, totaling 20K+ trajectories. Because instructions in community contributed dataset can be noisy or missing, the authors re-annotate tasks with a small off-the-shelf VLM using frames sampled from the dataset, and standardize camera viewpoints by mapping sources to a consistent top/wrist/side ordering. At inference, similarily to $\pi_0$, SmolVLA integrates flow over 10 steps, resulting in fast inference. SmolVLA proves effective across a range of both real-world and simulated environments, rivaling $\pi_0$while being close to 40% faster and consuming 6x less memory. +Departing from reliance on proprietary datasets, SmolVLA pretrains exclusively on 450+ *community datasets*, totaling 20k+ trajectories. 
Because instructions in community contributed dataset can be noisy or missing, the authors re-annotate tasks with a small off-the-shelf VLM using frames sampled from the dataset, and standardize camera viewpoints by mapping sources to a consistent top/wrist/side ordering. At test time, similarily to $\pi_0$, SmolVLA forward-integrates flow over 10 steps, resulting in fast inference. SmolVLA proves effective across a range of both real-world and simulated environments, rivaling $\pi_0$ while being close to 40% faster and consuming 6x less memory @shukorSmolVLAVisionLanguageActionModel2025. #### Code Example: Using SmolVLA +
+ +Using SmolVLA +[https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch5/02_using_smolvla.py](https://github.com/fracapuano/robot-learning-tutorial/blob/main/snippets/ch5/02_using_smolvla.py) + +```python +import torch +from lerobot.datasets.lerobot_dataset import LeRobotDataset +from lerobot.datasets.streaming_dataset import StreamingLeRobotDataset + +delta_timestamps = { + "observation.images.wrist_camera": [-0.2, -0.1, 0.0] # 0.2, and 0.1 seconds *before* each frame +} + +# Optionally, use StreamingLeRobotDataset to avoid downloading the dataset +dataset = LeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Streams frames from the Hugging Face Hub without loading into memory +streaming_dataset = StreamingLeRobotDataset( + "lerobot/svla_so101_pickplace", + delta_timestamps=delta_timestamps +) + +# Get the 100th frame in the dataset by +sample = dataset[100] +print(sample) +# { +# 'observation.state': tensor([...]), +# 'action': tensor([...]), +# 'observation.images.wrist_camera': tensor([3, C, H, W]), for delta timesteps +# ... +# } + +batch_size=16 +# wrap the dataset in a DataLoader to use process it batches for training purposes +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size +) + +# Iterate over the DataLoader in a training loop +num_epochs = 1 +device = "cuda" if torch.cuda.is_available() else "cpu" + +for epoch in range(num_epochs): + for batch in data_loader: + # Move data to the appropriate device (e.g., GPU) + observations = batch["observation.state"].to(device) + actions = batch["action"].to(device) + images = batch["observation.images.wrist_camera"].to(device) + + # Next, you can do amazing_model.forward(batch) + ... +``` + +
+ ## Conclusions -This tutorial has chronicled the paradigmatic shift transforming robotics, from the structured, model-based methods of its classical era to the dynamic, data-driven approaches that define modern robot learning. We began by examining the limitations of traditional dynamics-based control, highlighting the brittleness and the significant engineering overhead required by traditional approaches, which in turn motivates more flexible, less model-intensive learning approaches. +This tutorial has charted the paradigmatic shift transforming robotics, tracing the evolution of robotics from structured, model-based methods to the dynamic, data-driven approaches that define modern robot learning. We began by examining the limitations of traditional dynamics-based control, namely its brittleness and significant engineering overhead, which motivate the adoption of more flexible, learning-based alternatives. Unlike scalable, data-driven techniques, conventional explicit models demand extensive human expertise, hindering wider accessibility and scalability of robotics. + +Our exploration traced a clear trajectory of progress, beginning with Reinforcement Learning (RL). While RL offers a powerful paradigm for learning through interaction, its application in robotics is complicated by challenges such as sample inefficiency, safety concerns in real-world training, and the complexities of reward design. We saw how modern approaches like HIL-SERL make real-world RL more feasible by incorporating training-time human guidance, datasets of previously collected data as well as learned reward classifiers. -Our exploration of learning-based techniques revealed a clear trajectory of progress. We began with Reinforcement Learning, acknowledging its power to learn through interaction but also its real-world challenges, particularly sample inefficiency and the complexities of reward design. 
We saw how modern, data-driven approaches like HIL-SERL can make real-world RL feasible by incorporating human guidance and prior data. The inherent difficulties of RL, however, naturally motivated a deeper dive into imitation learning. This led us to single-task policies, where Behavioral Cloning, powered by advanced generative models like Action Chunking with Transformers and Diffusion Policy, demonstrated the ability to learn complex, multimodal behaviors directly from expert demonstrations. This laid the groundwork for the current frontier: the development of generalist, language-conditioned Vision-Language-Action models. Architectures like $\pi_0$ and SmolVLA--leveraging powerful pre-trained backbones and sophisticated generative modeling techniques like flow matching--represent a significant leap towards building foundational models for robotics that can generalize across varied tasks and embodiments.
+Nonetheless, the inherent difficulties of RL increasingly motivate approaches based on imitation learning, capable of safely learning from limited numbers of real-world, reward-free expert demonstrations. In turn, the wider adoption of imitation learning led to the development of single-task policies, where advanced Behavioral Cloning techniques--implemented as state-conditioned generative models like Action Chunking with Transformers and Diffusion Policy--have demonstrated the ability to learn complex, multimodal behaviors from human demonstrations. These advancements laid the groundwork for the current frontier: generalist, language-conditioned Vision-Language-Action models capable of performing a variety of different real-world tasks in few- and zero-shot settings. By leveraging powerful pre-trained backbones and sophisticated generative methods like flow matching, models such as $\pi_0$ and SmolVLA represent a significant leap towards foundational models for robotics capable of generalizing across diverse tasks, and even robot embodiments. 
-A central theme throughout this work has been the critical role of openness in accelerating this progress. The recent explosion in capability is inseparable from the advent of large-scale, openly available datasets, the standardization of powerful and efficient model architectures, and the development of accessible, open-source software like `lerobot`. We argue the convergence towards an open approach to robotics is not merely a trend but a fundamental enabler, democratizing access to cutting-edge research in a traditionally siloed field like robotics. +A central theme of this work is the critical role of openness in accelerating this progress. The recent explosion in capability is inseparable from the advent of large-scale, openly available datasets, standardized, stable and accessible model architectures, and accessible, open-source software like `lerobot`. We argue this convergence on open-source robotics is not a mere trend but a fundamental enabler, democratizing access to research and unlocking the potential of large, decentralized efforts to advance the field. -We believe the path ahead for robot learning to be overly exciting, and filled with fundamental challenges we yet have to even scratch the surface of. The journey detailed in this tutorial, from the first principles to the state-of-the-art, equips researchers and practitioners alike with the context and the tools to chart their own journey in the future of robotics. +The journey detailed in this tutorial, from first principles to the state-of-the-art, aims to equip researchers and practitioners with the context and tools to begin their own explorations in open-source robot learning. [^1]: In here, we refer to both *kinematics* and *dynamics*-based control. [^2]: Quote from @mnihPlayingAtariDeep2013. The notation used has slightly been adapted for consistency with the rest of this tutorial. 
-[^3]: Throughout, we will adopt the terminology and notation for SL introduced in @shalev-shwartzUnderstandingMachineLearning2014
+[^3]: Throughout, we will adopt the terminology and notation for SL used in @shalev-shwartzUnderstandingMachineLearning2014

-[^4]: $o,a = z_0$ for the sake of notation. Steps omitted for brevity. See Section A in @hoDenoisingDiffusionProbabilistic2020 for a complete derivation.
+[^4]: See Section A in @hoDenoisingDiffusionProbabilistic2020 for a complete derivation.