thibaud frere commited on
Commit
1f9a800
·
1 Parent(s): 8fd03ae
Files changed (34) hide show
  1. app/.astro/astro/content.d.ts +2 -2
  2. app/scripts/latex-to-mdx/filters/equation-ids.lua +4 -3
  3. app/scripts/latex-to-mdx/output/main.md +38 -0
  4. app/scripts/latex-to-mdx/output/main.mdx +56 -4
  5. app/scripts/latex-to-mdx/post-processor.mjs +24 -0
  6. app/src/components/Hero.astro +1 -1
  7. app/src/content/article.mdx +56 -4
  8. app/src/content/{embeds → embeds2}/banner.html +0 -0
  9. app/src/content/{embeds → embeds2}/d3-bar.html +0 -0
  10. app/src/content/{embeds → embeds2}/d3-benchmark.html +0 -0
  11. app/src/content/{embeds → embeds2}/d3-confusion-matrix.html +0 -0
  12. app/src/content/{embeds → embeds2}/d3-evals-after-fix.html +0 -0
  13. app/src/content/{embeds → embeds2}/d3-evals-tpbug.html +0 -0
  14. app/src/content/{embeds → embeds2}/d3-line-quad.html +0 -0
  15. app/src/content/{embeds → embeds2}/d3-line.html +0 -0
  16. app/src/content/{embeds → embeds2}/d3-matrix.html +0 -0
  17. app/src/content/{embeds → embeds2}/d3-neural-network.html +0 -0
  18. app/src/content/{embeds → embeds2}/d3-pie-quad.html +0 -0
  19. app/src/content/{embeds → embeds2}/d3-pie.html +0 -0
  20. app/src/content/{embeds → embeds2}/d3-scatter.html +0 -0
  21. app/src/content/{embeds → embeds2}/demo/color-picker.html +0 -0
  22. app/src/content/{embeds → embeds2}/demo/content-structure.html +0 -0
  23. app/src/content/{embeds → embeds2}/demo/palettes.html +0 -0
  24. app/src/content/{embeds → embeds2}/original_embeds/plotly/banner.py +0 -0
  25. app/src/content/{embeds → embeds2}/original_embeds/plotly/bar.py +0 -0
  26. app/src/content/{embeds → embeds2}/original_embeds/plotly/heatmap.py +0 -0
  27. app/src/content/{embeds → embeds2}/original_embeds/plotly/line.py +0 -0
  28. app/src/content/{embeds → embeds2}/original_embeds/plotly/poetry.lock +0 -0
  29. app/src/content/{embeds → embeds2}/original_embeds/plotly/pyproject.toml +0 -0
  30. app/src/content/{embeds → embeds2}/plotly-line.html +0 -0
  31. app/src/content/{embeds → embeds2}/throughput-debug-1node.html +0 -0
  32. app/src/content/{embeds → embeds2}/throughput-drops-comparison.html +0 -0
  33. app/src/content/{embeds → embeds2}/throughput-weka-drops.html +0 -0
  34. app/src/content/{embeds → embeds2}/vibe-code-d3-embeds-directives.md +0 -0
app/.astro/astro/content.d.ts CHANGED
@@ -209,12 +209,12 @@ declare module 'astro:content' {
209
  data: any
210
  } & { render(): Render[".mdx"] };
211
  };
212
- "embeds": {
213
  "vibe-code-d3-embeds-directives.md": {
214
  id: "vibe-code-d3-embeds-directives.md";
215
  slug: "vibe-code-d3-embeds-directives";
216
  body: string;
217
- collection: "embeds";
218
  data: any
219
  } & { render(): Render[".md"] };
220
  };
 
209
  data: any
210
  } & { render(): Render[".mdx"] };
211
  };
212
+ "embeds2": {
213
  "vibe-code-d3-embeds-directives.md": {
214
  id: "vibe-code-d3-embeds-directives.md";
215
  slug: "vibe-code-d3-embeds-directives";
216
  body: string;
217
+ collection: "embeds2";
218
  data: any
219
  } & { render(): Render[".md"] };
220
  };
app/scripts/latex-to-mdx/filters/equation-ids.lua CHANGED
@@ -69,9 +69,10 @@ function Math(el)
69
 
70
  local new_math
71
  if has_align then
72
- -- For align environments, add the ID differently - KaTeX doesn't support \htmlId with align
73
- -- Instead, we'll add a span with the ID right before the align
74
- new_math = clean_math
 
75
  else
76
  -- For other math, wrap with \htmlId{}
77
  new_math = "\\htmlId{" .. clean_id .. "}{" .. clean_math .. "}"
 
69
 
70
  local new_math
71
  if has_align then
72
+ -- For align environments, KaTeX doesn't support \htmlId with align
73
+ -- Instead, we add a special marker that the post-processor will convert to a span
74
+ -- This span will serve as an anchor for references
75
+ new_math = "%%ALIGN_ANCHOR_ID{" .. clean_id .. "}%%\n" .. clean_math
76
  else
77
  -- For other math, wrap with \htmlId{}
78
  new_math = "\\htmlId{" .. clean_id .. "}{" .. clean_math .. "}"
app/scripts/latex-to-mdx/output/main.md CHANGED
@@ -248,6 +248,8 @@ Deriving the end-effector’s *pose*--position *and* orientation--in some $`m`$-
248
 
249
  In the simplified case here considered (for which $`\boldsymbol{p} \equiv p`$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $`p^*`$ by solving analytically for $`q: p(q) = f_{\text{FK}}(q) = p^*`$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $`\Vert p(q) - p^* \Vert_2^2`$ is a natural candidate), yielding:
250
 
 
 
251
  ``` math
252
  \begin{align}
253
  \min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .
@@ -262,6 +264,8 @@ For instance, the robot in Figure <a href="#planar-manipulator-floor" data-refe
262
  However, IK--solving eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> for a feasible $`q`$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate this problem providing a length-$`K`$ succession of goal poses $`\tau_K = [p^*_0, p^*_1, \dots p^*_K]`$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $`\tau_K`$ via IK can prove prohibitively expensive, as tracking would require $`K`$ resolutions of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a>. Let $`J(q)`$ denote the Jacobian matrix of (partial) derivatives of the FK-function $`f_\text{FK}: \mathcal Q \mapsto \mathcal P`$, such that $`J(q) = \frac{\partial f_{FK}(q)}{\partial q }`$. Then, one can apply the chain rule to any $`p(q) = f_{\text{FK}}(q)`$, deriving $`\dot p = J(q) \dot q`$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
263
 
264
  Given a desired end-effector trajectory $`\dot {p}^*(t)`$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $`\dot q(t)`$ solving for joints’ *velocities* instead of *configurations*,
 
 
265
  ``` math
266
  \begin{align}
267
  \dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2
@@ -390,6 +394,8 @@ A length-$`T`$ *trajectory* is the (random) sequence
390
  \htmlId{trajectory_definition}{\tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),}
391
  ```
392
  with per-step rewards defined as $`r_t = r (s_t, a_t, s_{t+1})`$ for ease of notation. Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
 
 
393
  ``` math
394
  \begin{align}
395
  \mathbb P(s_{t+1}\vert s_t, a_t, s_{t-1}, a_{t-1}, \dots s_0, a_0 ) &= \mathbb P (s_{t+1}\vert s_t, a_t) \\
@@ -406,6 +412,8 @@ Policies $`\mathbb P(a_t\vert s_t)`$ are typically indicated as $`\pi(a_t\vert s
406
  G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
407
  ```
408
  In that, agents seek to learn control strategies (*policies*, $`\pi_\theta`$) maximizing the expected return $`\mathbb E_{\tau \sim \pi_\theta} G(\tau)`$. For a given dynamics $`\mathcal D`$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $`\Pi`$, yielding the (maximization) target $`J : \Pi \mapsto \mathbb R`$
 
 
409
  ``` math
410
  \begin{align}
411
  J(\pi_\theta) &= \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)], \\
@@ -422,6 +430,8 @@ can be used to discriminate between desirable and undesirable state in terms of
422
  Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
423
  ```
424
  Crucially, value functions are interrelated:
 
 
425
  ``` math
426
  \begin{align}
427
  Q_\pi(s_t, a_t) &= \mathbb{E}_{s_{t+1}\sim \mathbb P(\bullet \vert s_t, a_t)} [r_t + \gamma V_\pi(s_{t+1})] \\
@@ -493,6 +503,8 @@ Q_{i+1}(s_t, a_t) \leftarrow \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t
493
  Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $`Q_K \approx Q^*`$ at each timestep. In fact, under certain assumptions on the MDP considered, $`Q_K \to Q^* \, \text{as } K \to \infty`$.
494
 
495
  Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning was found complicated to scale to large $`\mathcal S\times \mathcal A`$ problems, in which the storing of $`Q : \mathcal S\times \mathcal A\mapsto \mathbb R`$ alone might prove prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating large and even unstructured *state* spaces. DQN parametrizes the Q-function using a neural network with parameters $`\theta`$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $`\delta_i`$):
 
 
496
  ``` math
497
  \begin{align}
498
  \mathcal L(\theta_i) &= \mathbb E_{(s_t, a_t) \sim \chi(\bullet)}
@@ -515,6 +527,8 @@ Provably, <a href="#deterministic-pg" data-reference-type="ref" data-reference="
515
  Similarly to DQN, DDPG also employs the same replay buffer mechanism, to reuse past transitions over training for increased sample efficiency and estimate the loss function via MC-estimates.
516
 
517
  Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with \<span class="highlight"\>maximizing the discounted cumulative reward, while acting as randomly as possible\</span\>. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. In that, MaxEnt revisits the RL objective $`J (\pi)`$ to specifically account for the policy entropy,
 
 
518
  ``` math
519
  \begin{align}
520
  J(\pi) &= \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]
@@ -643,6 +657,8 @@ Intuitively, in the case of observation-action pairs $`(o, a)`$ for a robotics a
643
  </figure>
644
 
645
  Given a dataset $`\mathcal D`$ consisting of $`N`$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $`\theta`$ (in Bayesian terms, the *evidence* $`p_\theta(\mathcal D)`$) can thus be written as:
 
 
646
  ``` math
647
  \begin{align}
648
  \log p_\theta(\mathcal D) &= \log \sum_{i=0}^N p_\theta ((o,a)_i) \\
@@ -658,6 +674,8 @@ In the special case where one assumes distributions to be tractable, $`p_\theta
658
  In their seminal work on Variational Auto-Encoders (VAEs), @kingmaAutoEncodingVariationalBayes2022 present two major contributions to learn complex latent-variable GMs on unstructured data, proposing (1) a tractable, variational lower-bound to <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> as an optimization target to jointly learn likelihood and posterior and (2) high-capacity function approximators to model the likelihood $`p_\theta(o,a\vert z)`$ and (approximate) posterior distribution $`q_\phi(z \vert o,a) \approx q_\theta(z \vert o,a)`$.
659
 
660
  In particular, the lower bound on <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> (Evidence LOwer Bound, *ELBO*) can be derived from <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> applying Jensen’s inequality--$`\log \mathbb{E}[\bullet] \geq \mathbb{E} [\log (\bullet)]`$--yielding:
 
 
661
  ``` math
662
  \begin{align}
663
  \log p_\theta(\mathcal D) &\geq \sum_{i=0}^{N} \left(
@@ -671,6 +689,8 @@ In particular, the lower bound on <a href="#evidence-definition" data-reference
671
  \end{align}
672
  ```
673
  The true, generally intractable posterior $`p_\theta (z \vert o,a)`$ prevents computing both the expectation and KL divergence terms in <a href="#ELBO-intractable" data-reference-type="ref" data-reference="ELBO-intractable">[ELBO-intractable]</a>, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $`q_\phi(z \vert o,a)`$, resulting in the final, tractable ELBO objective,
 
 
674
  ``` math
675
  \begin{align}
676
  \text{ELBO}_{\mathcal D}(\theta, \phi) = \sum_{i=0}^{N} \left(
@@ -684,6 +704,8 @@ From Jensen’s inequality, maximizing ELBO results in maximizing the log-likeli
684
 
685
  An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits a particularly interpretable factorization
686
 
 
 
687
  ``` math
688
  \begin{align}
689
  \min_{\theta, \phi} - \text{ELBO}_{\mathcal (o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi) \\
@@ -705,6 +727,8 @@ Indeed, it is very common in practice to approximate from the learned likelihood
705
  #### Diffusion Models
706
 
707
  VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $`p_\theta`$ and (approximate) posterior $`q_\phi`$ for such model. In that, the unknown data distribution $`p(o,a)`$ is effectively approximated via $`\int_Z p(z) p_\theta(o,a \vert z)`$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $`p(o,a)`$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $`o,a`$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#ch4-many-latents" data-reference-type="ref" data-reference="ch4-many-latents">[ch4-many-latents]</a>), resulting in
 
 
708
  ``` math
709
  \begin{align}
710
  p(\underbrace{o,a}_{= z_0}) &= \int_{\text{supp}({Z_0})} \int_{\text{supp}({Z_1})} \ldots \int_{\text{supp}({Z_T})} p(z_0, z_1, \dots z_T) \\
@@ -724,6 +748,8 @@ Similarily to VAEs, providing an exact interpretation for the latent variables i
724
  Just like VAEs, DMs attempt to learn to reproduce an underlying data distribution $`p (o,a)`$ given a collection of i.i.d. samples approximating the model posited to have generated the data in the first place ( <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>). Similarly to VAEs, DMs approximate the process of sampling from the unknown $`p(o,a)`$ (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution. However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $`p(o,a)`$. Crucially, as no information from the sample $`(o,a)`$ (denoted as $`z_0 \equiv (o,a)`$ for the sake of notation) is assumed to be propagated throughout the chain of latents, the posterior $`q(z_t \vert z_{t-1})`$ assumes a relatively amicable structure in DMs, reducing complexity. The *true* likelihood $`p(z_{t-1} \vert z_t)`$ is instead typically approximated using the parametrization $`p_\theta (z_{t-1} \vert z_t)`$. In that, the information contained in the unknown data distribution is *reconstructed* via a process in which samples from a fixed distribution are turned into (ideally) high-likelihood samples under $`p(o,a)`$--a process referred to as *denoising*.
725
 
726
  Under such model, we can express the log-likelihood of an arbitrary sample as[^4]
 
 
727
  ``` math
728
  \begin{align}
729
  \log p_\theta (\underbrace{o,a}_{= z_0}) =
@@ -751,6 +777,8 @@ Finally, adopting Gaussian posteriors permits a particularly pleasing interpreta
751
  </figure>
752
 
753
  Because the recorded behavior is teleoperated, measurements mostly distribute along the line $`a = o + \eta, \eta \sim N(0,1)`$, with $`\eta`$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $`\mathcal O\times \mathcal A`$), whereby information *diffuses away* from the samples, and comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion. Under the only assumption that the likelihood of the diffused samples is low under the original unknown data distribution, then one can effectively approximate the unknown distribution by learning to *reverse* such displacement. This key intuition allows one to write a simplified training objective:
 
 
754
  ``` math
755
  \begin{align}
756
 
@@ -764,6 +792,8 @@ Because the recorded behavior is teleoperated, measurements mostly distribute al
764
  In this simplified (minimization) objective, the optimization process differs from <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $`p_\theta`$ directly, the parameters $`\theta`$ of the pairwise likelihood $`p_\theta(z_{t-1} \vert z_t)`$ are adjusted to *predict the total displacement* $`\epsilon`$ for a randomly long ($`t \sim \mathcal{U}(\{1,\dots,T\})`$) diffusion process starting from a sample of the target distribution.
765
 
766
  By learning the total displacement from a generally uninformative, corrupted sample obtained diffusing information and a sample from an unknown distribution--significant ($`\Vert \epsilon \Vert > 0`$) whenever input and target distribution are sufficiently different-- @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution reversing the displacement, *denoising* samples. Interestingly, under the hypothesis that real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-point manifold (such as perturbed, uninformative samples), and the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $`p_\theta (z_{t-1} \vert z_t)`$ is Gaussian, then sampling $`z_{t-1} \sim p_\theta(\bullet \vert z_{t})`$ corresponds to computing
 
 
767
  ``` math
768
  \begin{align}
769
  z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}),
@@ -804,6 +834,8 @@ While the noising schedule of DMs results in a stochastic process that resembles
804
  </figure>
805
 
806
  In practice, FM can be applied to generative modeling by learning a vector field regressor $`v_\theta(z, t)`$ to approximate a given target vector field $`u(t, z)`$. In the particular case of DMs, $`u(t, z)`$ is defined as in <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $`z_1 \sim p_1`$ and a sample from an easy-to-sample prior $`z_0 \sim p_0`$, CFM defines a simple path between them using *linear interpolation* between samples $`z_t = (1-t)z_0 + t z_1`$, resulting in the target vector field $`u(t, z_t) = z_1 - z_0`$. Then, an FM model can be trained with the simple regression objective defined as
 
 
807
  ``` math
808
  \begin{align}
809
 
@@ -820,6 +852,8 @@ While GMs prove useful in learning complex, high-dimensional multi-modal distrib
820
  On the robot learning side of their contributions, @zhaoLearningFineGrainedBimanual2023 adopt transformers as the architectural backbone to learn a *Conditional* VAE @sohnLearningStructuredOutput2015. Conditional VAEs are a variation of the more standard VAE formulation introducing a conditioning variable on sampling from the latent prior, allowing the modeling of *one-to-many* relationships between latent and data samples. Further, in stark contrast with previous work @florenceImplicitBehavioralCloning2022, @jannerPlanningDiffusionFlexible2022, @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $`p_\theta(o,a)`$ on observation and actions. While the *policy* distribution $`p_\theta(a \vert o)`$ can in principle be entirely described from its joint $`p_\theta(o,a)`$, it is often the case that the conditional distribution is intractable when using function approximators, as $`p_\theta(a \vert o) = \tfrac{p_\theta(o,a)}{\int_\mathcal Ap_\theta(o,a)}`$ and the integral in the denominator is typically intractable. Instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE @sohnLearningStructuredOutput2015 modeling the policy distribution directly $`p (a \vert o)`$.
821
 
822
  In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in <a href="#ELBO" data-reference-type="ref" data-reference="ELBO">[ELBO]</a>, which is adapted to
 
 
823
  ``` math
824
  \begin{align}
825
 
@@ -862,6 +896,8 @@ However, the authors claim using a deterministic procedure to derive $`z`$ may b
862
  DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $`p(o,a)`$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $`p_\theta(o)`$ given $`p_\theta(o,a)`$, DP’s rationale for modeling the data distribution via $`p_\theta(a \vert o)`$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
863
 
864
  In practice, conditioning on observation data is achieved conditioning the added noise regressor $`\epsilon_\theta`$ introduced in <a href="#diffusion-simplified-loss" data-reference-type="ref" data-reference="diffusion-simplified-loss">[diffusion-simplified-loss]</a> on a stack of $`T_o`$ observations, resulting in the *conditional* simplified diffusion objective
 
 
865
  ``` math
866
  \begin{align}
867
  \mathcal L(\theta) &= \mathbb{E}_{t, a_{t:t+H_a}, \epsilon} \big[
@@ -1033,6 +1069,8 @@ Concretely, $`\pi_0`$ is a unified transformer with two disjoint sets of weights
1033
  \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}`$ Note how *intra*-block directional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $`\mathbf{A}`$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
1034
 
1035
  In $`\pi_0`$, both the VLM backbone and action expert are updated using a *flow matching* loss, and in particular are updated minimizing:
 
 
1036
  ``` math
1037
  \begin{align}
1038
  \mathcal{L}(\phi, \theta) &=
 
248
 
249
  In the simplified case here considered (for which $`\boldsymbol{p} \equiv p`$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $`p^*`$ by solving analytically for $`q: p(q) = f_{\text{FK}}(q) = p^*`$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $`\Vert p(q) - p^* \Vert_2^2`$ is a natural candidate), yielding:
250
 
251
+ <span id="ik_problem" style="position: absolute;"></span>
252
+
253
  ``` math
254
  \begin{align}
255
  \min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .
 
264
  However, IK--solving eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> for a feasible $`q`$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate this problem providing a length-$`K`$ succession of goal poses $`\tau_K = [p^*_0, p^*_1, \dots p^*_K]`$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $`\tau_K`$ via IK can prove prohibitively expensive, as tracking would require $`K`$ resolutions of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a>. Let $`J(q)`$ denote the Jacobian matrix of (partial) derivatives of the FK-function $`f_\text{FK}: \mathcal Q \mapsto \mathcal P`$, such that $`J(q) = \frac{\partial f_{FK}(q)}{\partial q }`$. Then, one can apply the chain rule to any $`p(q) = f_{\text{FK}}(q)`$, deriving $`\dot p = J(q) \dot q`$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
265
 
266
  Given a desired end-effector trajectory $`\dot {p}^*(t)`$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $`\dot q(t)`$ solving for joints’ *velocities* instead of *configurations*,
267
+ <span id="reg_ik_velocity" style="position: absolute;"></span>
268
+
269
  ``` math
270
  \begin{align}
271
  \dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2
 
394
  \htmlId{trajectory_definition}{\tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),}
395
  ```
396
  with per-step rewards defined as $`r_t = r (s_t, a_t, s_{t+1})`$ for ease of notation. Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
397
+ <span id="dynamics_markovian" style="position: absolute;"></span>
398
+
399
  ``` math
400
  \begin{align}
401
  \mathbb P(s_{t+1}\vert s_t, a_t, s_{t-1}, a_{t-1}, \dots s_0, a_0 ) &= \mathbb P (s_{t+1}\vert s_t, a_t) \\
 
412
  G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
413
  ```
414
  In that, agents seek to learn control strategies (*policies*, $`\pi_\theta`$) maximizing the expected return $`\mathbb E_{\tau \sim \pi_\theta} G(\tau)`$. For a given dynamics $`\mathcal D`$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $`\Pi`$, yielding the (maximization) target $`J : \Pi \mapsto \mathbb R`$
415
+ <span id="RL-j-function" style="position: absolute;"></span>
416
+
417
  ``` math
418
  \begin{align}
419
  J(\pi_\theta) &= \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)], \\
 
430
  Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
431
  ```
432
  Crucially, value functions are interrelated:
433
+ <span id="q-as-v" style="position: absolute;"></span>
434
+
435
  ``` math
436
  \begin{align}
437
  Q_\pi(s_t, a_t) &= \mathbb{E}_{s_{t+1}\sim \mathbb P(\bullet \vert s_t, a_t)} [r_t + \gamma V_\pi(s_{t+1})] \\
 
503
  Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $`Q_K \approx Q^*`$ at each timestep. In fact, under certain assumptions on the MDP considered, $`Q_K \to Q^* \, \text{as } K \to \infty`$.
504
 
505
  Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning was found complicated to scale to large $`\mathcal S\times \mathcal A`$ problems, in which the storing of $`Q : \mathcal S\times \mathcal A\mapsto \mathbb R`$ alone might prove prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating large and even unstructured *state* spaces. DQN parametrizes the Q-function using a neural network with parameters $`\theta`$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $`\delta_i`$):
506
+ <span id="dqn-loss" style="position: absolute;"></span>
507
+
508
  ``` math
509
  \begin{align}
510
  \mathcal L(\theta_i) &= \mathbb E_{(s_t, a_t) \sim \chi(\bullet)}
 
527
  Similarly to DQN, DDPG also employs the same replay buffer mechanism, to reuse past transitions over training for increased sample efficiency and to estimate the loss function via MC-estimates.
528
 
529
  Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with <span class="highlight">maximizing the discounted cumulative reward, while acting as randomly as possible</span>. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. In that, MaxEnt revisits the RL objective $`J (\pi)`$ to specifically account for the policy entropy,
530
+ <span id="J-soft" style="position: absolute;"></span>
531
+
532
  ``` math
533
  \begin{align}
534
  J(\pi) &= \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]
 
657
  </figure>
658
 
659
  Given a dataset $`\mathcal D`$ consisting of $`N`$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $`\theta`$ (in Bayesian terms, the *evidence* $`p_\theta(\mathcal D)`$) can thus be written as:
660
+ <span id="evidence-definition-1" style="position: absolute;"></span>
661
+
662
  ``` math
663
  \begin{align}
664
  \log p_\theta(\mathcal D) &= \log \sum_{i=0}^N p_\theta ((o,a)_i) \\
 
674
  In their seminal work on Variational Auto-Encoders (VAEs), @kingmaAutoEncodingVariationalBayes2022 present two major contributions to learn complex latent-variable GMs on unstructured data, proposing (1) a tractable, variational lower-bound to <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> as an optimization target to jointly learn likelihood and posterior and (2) high-capacity function approximators to model the likelihood $`p_\theta(o,a\vert z)`$ and (approximate) posterior distribution $`q_\phi(z \vert o,a) \approx p_\theta(z \vert o,a)`$.
675
 
676
  In particular, the lower bound on <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> (Evidence LOwer Bound, *ELBO*) can be derived from <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> applying Jensen’s inequality--$`\log \mathbb{E}[\bullet] \geq \mathbb{E} [\log (\bullet)]`$--yielding:
677
+ <span id="ELBO-intractable" style="position: absolute;"></span>
678
+
679
  ``` math
680
  \begin{align}
681
  \log p_\theta(\mathcal D) &\geq \sum_{i=0}^{N} \left(
 
689
  \end{align}
690
  ```
691
  The true, generally intractable posterior $`p_\theta (z \vert o,a)`$ prevents computing both the expectation and KL divergence terms in <a href="#ELBO-intractable" data-reference-type="ref" data-reference="ELBO-intractable">[ELBO-intractable]</a>, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $`q_\phi(z \vert o,a)`$, resulting in the final, tractable ELBO objective,
692
+ <span id="ELBO" style="position: absolute;"></span>
693
+
694
  ``` math
695
  \begin{align}
696
  \text{ELBO}_{\mathcal D}(\theta, \phi) = \sum_{i=0}^{N} \left(
 
704
 
705
  An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits a particularly interpretable factorization
706
 
707
+ <span id="VAE-min-neg-ELBO" style="position: absolute;"></span>
708
+
709
  ``` math
710
  \begin{align}
711
  \min_{\theta, \phi} - \text{ELBO}_{\mathcal (o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi) \\
 
727
  #### Diffusion Models
728
 
729
  VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $`p_\theta`$ and (approximate) posterior $`q_\phi`$ for such model. In that, the unknown data distribution $`p(o,a)`$ is effectively approximated via $`\int_Z p(z) p_\theta(o,a \vert z)`$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $`p(o,a)`$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $`o,a`$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#ch4-many-latents" data-reference-type="ref" data-reference="ch4-many-latents">[ch4-many-latents]</a>), resulting in
730
+ <span id="BC-multi-latent-model-1" style="position: absolute;"></span>
731
+
732
  ``` math
733
  \begin{align}
734
  p(\underbrace{o,a}_{= z_0}) &= \int_{\text{supp}({Z_0})} \int_{\text{supp}({Z_1})} \ldots \int_{\text{supp}({Z_T})} p(z_0, z_1, \dots z_T) \\
 
748
  Just like VAEs, DMs attempt to learn to reproduce an underlying data distribution $`p (o,a)`$ given a collection of i.i.d. samples approximating the model posited to have generated the data in the first place ( <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>). Similarly to VAEs, DMs approximate the process of sampling from the unknown $`p(o,a)`$ by (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution. However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $`p(o,a)`$. Crucially, as no information from the sample $`(o,a)`$ (denoted as $`z_0 \equiv (o,a)`$ for the sake of notation) is assumed to be propagated throughout the chain of latents, the posterior $`q(z_t \vert z_{t-1})`$ assumes a relatively amicable structure in DMs, reducing complexity. The *true* likelihood $`p(z_{t-1} \vert z_t)`$ is instead typically approximated using the parametrization $`p_\theta (z_{t-1} \vert z_t)`$. In that, the information contained in the unknown data distribution is *reconstructed* via a process in which samples from a fixed distribution are turned into (ideally) high-likelihood samples under $`p(o,a)`$--a process referred to as *denoising*.
749
 
750
  Under such model, we can express the log-likelihood of an arbitrary sample as[^4]
751
+ <span id="diffusion-likelihood" style="position: absolute;"></span>
752
+
753
  ``` math
754
  \begin{align}
755
  \log p_\theta (\underbrace{o,a}_{= z_0}) =
 
777
  </figure>
778
 
779
  Because the recorded behavior is teleoperated, measurements mostly distribute along the line $`a = o + \eta, \eta \sim N(0,1)`$, with $`\eta`$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $`\mathcal O\times \mathcal A`$), whereby information *diffuses away* from the samples, and comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion. Under the only assumption that the likelihood of the diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by learning to *reverse* such displacement. This key intuition allows us to write a simplified training objective:
780
+ <span id="diffusion-simplified-loss" style="position: absolute;"></span>
781
+
782
  ``` math
783
  \begin{align}
784
 
 
792
  In this simplified (minimization) objective, the optimization process differs from <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $`p_\theta`$ directly, the parameters $`\theta`$ of the pairwise likelihood $`p_\theta(z_{t-1} \vert z_t)`$ are adjusted to *predict the total displacement* $`\epsilon`$ for a randomly long ($`t \sim \mathcal{U}(\{1,\dots,T\})`$) diffusion process starting from a sample of the target distribution.
793
 
794
  By learning the total displacement from a generally uninformative, corrupted sample obtained diffusing information and a sample from an unknown distribution--significant ($`\Vert \epsilon \Vert > 0`$) whenever input and target distribution are sufficiently different-- @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution reversing the displacement, *denoising* samples. Interestingly, under the hypothesis that real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-manifold point (such as perturbed, uninformative samples) to the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $`p_\theta (z_{t-1} \vert z_t)`$ is Gaussian, then sampling $`z_{t-1} \sim p_\theta(\bullet \vert z_{t})`$ corresponds to computing
795
+ <span id="diffusion-denoising-definition" style="position: absolute;"></span>
796
+
797
  ``` math
798
  \begin{align}
799
  z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}),
 
834
  </figure>
835
 
836
  In practice, FM can be applied to generative modeling by learning a vector field regressor $`v_\theta(z, t)`$ to approximate a given target vector field $`u(t, z)`$. In the particular case of DMs, $`u(t, z)`$ is defined as in <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $`z_1 \sim p_1`$ and a sample from an easy-to-sample prior $`z_0 \sim p_0`$, CFM defines a simple path between them using *linear interpolation* between samples $`z_t = (1-t)z_0 + t z_1`$, resulting in the target vector field $`u(t, z_t) = z_1 - z_0`$. Then, a FM model can be trained with the simple regression objective defined as
837
+ <span id="flow-matching-objective" style="position: absolute;"></span>
838
+
839
  ``` math
840
  \begin{align}
841
 
 
852
  On the robot learning side of their contributions, @zhaoLearningFineGrainedBimanual2023 adopt transformers as the architectural backbone to learn a *Conditional* VAE @sohnLearningStructuredOutput2015. Conditional VAEs are a variation of the more standard VAE formulation introducing a conditioning variable on sampling from the latent prior, allowing the modeling of *one-to-many* relationships between latent and data samples. Further, in stark contrast with previous work @florenceImplicitBehavioralCloning2022, @jannerPlanningDiffusionFlexible2022, @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $`p_\theta(o,a)`$ on observations and actions. While the *policy* distribution $`p_\theta(a \vert o)`$ can in principle be entirely described from its joint $`p_\theta(o,a)`$, it is often the case that the conditional distribution is intractable when using function approximators, as $`p_\theta(a \vert o) = \tfrac{p_\theta(o,a)}{\int_{\mathcal A} p_\theta(o,a) \, da}`$ and the integral in the denominator is typically intractable. Instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE @sohnLearningStructuredOutput2015 modeling the policy distribution directly $`p (a \vert o)`$.
853
 
854
  In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in <a href="#ELBO" data-reference-type="ref" data-reference="ELBO">[ELBO]</a>, which is adapted to
855
+ <span id="c-ELBO" style="position: absolute;"></span>
856
+
857
  ``` math
858
  \begin{align}
859
 
 
896
  DMs proved very effective in approximating complex high-dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $`p(o,a)`$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $`p_\theta(o)`$ given $`p_\theta(o,a)`$, DP’s rationale for modeling the data distribution via $`p_\theta(a \vert o)`$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
897
 
898
  In practice, conditioning on observation data is achieved conditioning the added noise regressor $`\epsilon_\theta`$ introduced in <a href="#diffusion-simplified-loss" data-reference-type="ref" data-reference="diffusion-simplified-loss">[diffusion-simplified-loss]</a> on a stack of $`T_o`$ observations, resulting in the *conditional* simplified diffusion objective
899
+ <span id="diffusion-policy-objective" style="position: absolute;"></span>
900
+
901
  ``` math
902
  \begin{align}
903
  \mathcal L(\theta) &= \mathbb{E}_{t, a_{t:t+H_a}, \epsilon} \big[
 
1069
  \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}`$ Note how *intra*-block directional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $`\mathbf{A}`$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
1070
 
1071
  In $`\pi_0`$, both the VLM backbone and action expert are updated using a *flow matching* loss, and in particular are updated minimizing:
1072
+ <span id="pi0-loss" style="position: absolute;"></span>
1073
+
1074
  ``` math
1075
  \begin{align}
1076
  \mathcal{L}(\phi, \theta) &=
app/scripts/latex-to-mdx/output/main.mdx CHANGED
@@ -347,6 +347,9 @@ Deriving the end-effector’s *pose*--position *and* orientation--in some $m$-di
347
 
348
  In the simplified case here considered (for which $\boldsymbol{p} \equiv p$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $p^*$ by solving analytically for $q: p(q) = f_{\text{FK}}(q) = p^*$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $\Vert p(q) - p^* \Vert_2^2$ is a natural candidate), yielding:
349
 
 
 
 
350
  ``` math
351
  \begin{align}
352
  \min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .
@@ -361,6 +364,9 @@ For instance, the robot in Figure <a href="#planar-manipulator-floor" data-refe
361
  However, IK--solving eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> for a feasible $q$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate this problem providing a length-$K$ succession of goal poses $\tau_K = [p^*_0, p^*_1, \dots p^*_K]$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $\tau_K$ via IK can prove prohibitively expensive, as tracking would require $K$ resolutions of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a>. Let $J(q)$ denote the Jacobian matrix of (partial) derivatives of the FK-function $f_\text{FK}: \mathcal Q \mapsto \mathcal P$, such that $J(q) = \frac{\partial f_{FK}(q)}{\partial q }$. Then, one can apply the chain rule to any $p(q) = f_{\text{FK}}(q)$, deriving $\dot p = J(q) \dot q$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
362
 
363
  Given a desired end-effector trajectory $\dot {p}^*(t)$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $\dot q(t)$ solving for joints’ *velocities* instead of *configurations*,
 
 
 
364
  ``` math
365
  \begin{align}
366
  \dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2
@@ -520,8 +526,9 @@ A length-$T$ *trajectory* is the (random) sequence
520
  ``` math
521
  \htmlId{trajectory_definition}{\tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),}
522
  ```
523
-
524
  with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notation.Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
 
 
525
 
526
  ``` math
527
  \begin{align}
@@ -529,17 +536,21 @@ with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notat
529
  \mathbb P(a_t\vert s_t, a_{t-1}, s_{t-1}, s_0, a_0) &= \mathbb P(a_t\vert s_t)
530
  \end{align}
531
  ```
 
532
  The probability of observing a given trajectory $\tau$ factorizes into
 
533
  ``` math
534
  \htmlId{traj_prob}{\mathbb P(\tau) = \mathbb P (s_0) \prod_{t=0}^{T-1} \mathbb P (s_{t+1}\vert s_t, a_t)\ \mathbb P(a_t\vert s_t).}
535
  ```
536
 
537
  Policies $\mathbb P(a_t\vert s_t)$ are typically indicated as $\pi(a_t\vert s_t)$, and often parametrized via $\theta$, yielding $\pi_\theta (a_t\vert s_t)$. Policies are trained optimizing the (discounted) *return* associated to a given $\tau$, i.e. the (random) sum of measured rewards over trajectory:
538
-
539
  ``` math
540
  G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
541
  ```
542
  In that, agents seek to learn control strategies (*policies*, $\pi_\theta$) maximizing the expected return $\mathbb E_{\tau \sim \pi_\theta} G(\tau)$. For a given dynamics $\mathcal D$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $\Pi$, yielding the (maximization) target $J : \Pi \mapsto \mathbb R$
 
 
 
543
  ``` math
544
  \begin{align}
545
  J(\pi_\theta) &= \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)], \\
@@ -556,8 +567,9 @@ can be used to discriminate between desirable and undesirable state in terms of
556
  ``` math
557
  Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
558
  ```
559
-
560
  Crucially, value functions are interrelated:
 
 
561
 
562
  ``` math
563
  \begin{align}
@@ -648,6 +660,9 @@ Q_{i+1}(s_t, a_t) \leftarrow \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t
648
  Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $Q_K \approx Q^*$ at each timestep. In fact, under certain assumptions on the MDP considered, $Q_K \to Q^* \, \text{as } K \to \infty$.
649
 
650
  Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning was found complicated to scale to large $\mathcal S\times \mathcal A$ problems, in which the storing of $Q : \mathcal S\times \mathcal A\mapsto \mathbb R$ alone might prove prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating large and even unstructured *state* spaces. DQN parametrizes the Q-function using a neural network with parameters $\theta$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $\delta_i$):
 
 
 
651
  ``` math
652
  \begin{align}
653
  \mathcal L(\theta_i) &= \mathbb E_{(s_t, a_t) \sim \chi(\bullet)}
@@ -672,6 +687,9 @@ Provably, <a href="#deterministic-pg" data-reference-type="ref" data-reference="
672
  Similarly to DQN, DDPG also employs the same replay buffer mechanism, to reuse past transitions over training for increased sample efficiency and to estimate the loss function via MC-estimates.
673
 
674
  Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with <span class="highlight">maximizing the discounted cumulative reward, while acting as randomly as possible</span>. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. In that, MaxEnt revisits the RL objective $J (\pi)$ to specifically account for the policy entropy,
 
 
 
675
  ``` math
676
  \begin{align}
677
  J(\pi) &= \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]
@@ -835,6 +853,9 @@ Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics app
835
  </figure>
836
 
837
  Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can thus be written as:
 
 
 
838
  ``` math
839
  \begin{align}
840
  \log p_\theta(\mathcal D) &= \log \sum_{i=0}^N p_\theta ((o,a)_i) \\
@@ -850,6 +871,9 @@ In the special case where one assumes distributions to be tractable, $p_\theta (
850
  In their seminal work on Variational Auto-Encoders (VAEs), @kingmaAutoEncodingVariationalBayes2022 present two major contributions to learn complex latent-variable GMs on unstructured data, proposing (1) a tractable, variational lower-bound to <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> as an optimization target to jointly learn likelihood and posterior and (2) high-capacity function approximators to model the likelihood $p_\theta(o,a\vert z)$ and (approximate) posterior distribution $q_\phi(z \vert o,a) \approx p_\theta(z \vert o,a)$.
851
 
852
  In particular, the lower bound on <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> (Evidence LOwer Bound, *ELBO*) can be derived from <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> applying Jensen’s inequality--$\log \mathbb{E}[\bullet] \geq \mathbb{E} [\log (\bullet)]$--yielding:
 
 
 
853
  ``` math
854
  \begin{align}
855
  \log p_\theta(\mathcal D) &\geq \sum_{i=0}^{N} \left(
@@ -862,8 +886,9 @@ In particular, the lower bound on <a href="#evidence-definition" data-reference
862
  \right)
863
  \end{align}
864
  ```
865
-
866
  The true, generally intractable posterior $p_\theta (z \vert o,a)$ prevents computing both the expectation and KL divergence terms in <a href="#ELBO-intractable" data-reference-type="ref" data-reference="ELBO-intractable">[ELBO-intractable]</a>, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $q_\phi(z \vert o,a)$, resulting in the final, tractable ELBO objective,
 
 
867
 
868
  ``` math
869
  \begin{align}
@@ -878,6 +903,9 @@ From Jensen’s inequality, maximizing ELBO results in maximizing the log-likeli
878
 
879
  An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits a particularly interpretable factorization
880
 
 
 
 
881
  ``` math
882
  \begin{align}
883
  \min_{\theta, \phi} - \text{ELBO}_{\mathcal (o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi) \\
@@ -900,6 +928,9 @@ Indeed, it is very common in practice to approximate from the learned likelihood
900
  #### Diffusion Models
901
 
902
  VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $p_\theta$ and (approximate) posterior $q_\phi$ for such model. In that, the unknown data distribution $p(o,a)$ is effectively approximated via $\int_Z p(z) p_\theta(o,a \vert z)$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $p(o,a)$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $o,a$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#ch4-many-latents" data-reference-type="ref" data-reference="ch4-many-latents">[ch4-many-latents]</a>), resulting in
 
 
 
903
  ``` math
904
  \begin{align}
905
  p(\underbrace{o,a}_{= z_0}) &= \int_{\text{supp}({Z_0})} \int_{\text{supp}({Z_1})} \ldots \int_{\text{supp}({Z_T})} p(z_0, z_1, \dots z_T) \\
@@ -925,6 +956,9 @@ Similarily to VAEs, providing an exact interpretation for the latent variables i
925
  Just like VAEs, DMs attempt to learn to reproduce an underlying data distribution $p (o,a)$ given a collection of i.i.d. samples approximating the model posited to have generated the data in the first place ( <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>). Similarly to VAEs, DMs approximate the process of sampling from the unknown $p(o,a)$ by (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution. However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $p(o,a)$. Crucially, as no information from the sample $(o,a)$ (denoted as $z_0 \equiv (o,a)$ for the sake of notation) is assumed to be propagated throughout the chain of latents, the posterior $q(z_t \vert z_{t-1})$ assumes a relatively amicable structure in DMs, reducing complexity. The *true* likelihood $p(z_{t-1} \vert z_t)$ is instead typically approximated using the parametrization $p_\theta (z_{t-1} \vert z_t)$. In that, the information contained in the unknown data distribution is *reconstructed* via a process in which samples from a fixed distribution are turned into (ideally) high-likelihood samples under $p(o,a)$--a process referred to as *denoising*.
926
 
927
  Under such model, we can express the log-likelihood of an arbitrary sample as[^4]
 
 
 
928
  ``` math
929
  \begin{align}
930
  \log p_\theta (\underbrace{o,a}_{= z_0}) =
@@ -964,6 +998,9 @@ Finally, adopting Gaussian posteriors permits a particularly pleasing interpreta
964
  </figure>
965
 
966
  Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples, and comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion. Under the only assumption that the likelihood of the diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by learning to *reverse* such displacement. This key intuition allows us to write a simplified training objective:
 
 
 
967
  ``` math
968
  \begin{align}
969
 
@@ -977,6 +1014,9 @@ Because the recorded behavior is teleoperated, measurements mostly distribute al
977
  In this simplified (minimization) objective, the optimization process differs from <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
978
 
979
  By learning the total displacement from a generally uninformative, corrupted sample--obtained by diffusing information away from a sample of an unknown distribution--significant ($\Vert \epsilon \Vert > 0$) whenever input and target distribution are sufficiently different, @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution by reversing the displacement, *denoising* samples. Interestingly, under the hypothesis that real-world data belongs to a single higher-dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-manifold point (such as perturbed, uninformative samples) to the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $p_\theta (z_{t-1} \vert z_t)$ is Gaussian, sampling $z_{t-1} \sim p_\theta(\bullet \vert z_{t})$ corresponds to computing
 
 
 
980
  ``` math
981
  \begin{align}
982
  z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}),
@@ -1030,6 +1070,9 @@ While the noising schedule of DMs results in a stochastic process that resembles
1030
  </figure>
1031
 
1032
  In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, CFM defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, resulting in the target vector field $u(t, z_t) = z_1 - z_0$. Then, a FM model can be trained with the simple regression objective defined as
 
 
 
1033
  ``` math
1034
  \begin{align}
1035
 
@@ -1046,6 +1089,9 @@ While GMs prove useful in learning complex, high-dimensional multi-modal distrib
1046
  On the robot learning side of their contributions, @zhaoLearningFineGrainedBimanual2023 adopt transformers as the architectural backbone to learn a *Conditional* VAE @sohnLearningStructuredOutput2015. Conditional VAEs are a variation of the more standard VAE formulation introducing a conditioning variable on sampling from the latent prior, allowing the modeling of *one-to-many* relationships between latent and data samples. Further, in stark contrast with previous work @florenceImplicitBehavioralCloning2022, @jannerPlanningDiffusionFlexible2022, @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $p_\theta(o,a)$ on observation and actions. While the *policy* distribution $p_\theta(a \vert o)$ can in principle be entirely described from its joint $p_\theta(o,a)$, it is often the case that the conditional distribution is intractable when using function approximators, as $p_\theta(a \vert o) = \tfrac{p_\theta(o,a)}{\int_\mathcal Ap_\theta(o,a)}$ and the integral in the denominator is typically intractable. Instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE @sohnLearningStructuredOutput2015 modeling the policy distribution directly $p (a \vert o)$.
1047
 
1048
  In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in <a href="#ELBO" data-reference-type="ref" data-reference="ELBO">[ELBO]</a>, which is adapted to
 
 
 
1049
  ``` math
1050
  \begin{align}
1051
 
@@ -1106,6 +1152,9 @@ However, the authors claim using a deterministic procedure to derive $z$ may ben
1106
  DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $p(o,a)$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $p_\theta(o)$ given $p_\theta(o,a)$, DP’s rationale for modeling the data distribution via $p_\theta(a \vert o)$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
1107
 
1108
  In practice, conditioning on observation data is achieved by conditioning the added noise regressor $\epsilon_\theta$ introduced in <a href="#diffusion-simplified-loss" data-reference-type="ref" data-reference="diffusion-simplified-loss">[diffusion-simplified-loss]</a> on a stack of $T_o$ observations, resulting in the *conditional* simplified diffusion objective
 
 
 
1109
  ``` math
1110
  \begin{align}
1111
  \mathcal L(\theta) &= \mathbb{E}_{t, a_{t:t+H_a}, \epsilon} \big[
@@ -1306,6 +1355,9 @@ $\pi_0$ @blackp0VisionLanguageActionFlow2024 introduce a VLA consisting of a Mo
1306
  Concretely, $\pi_0$ is a unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $p_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple camera viewpoints $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process the robot proprioceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turning them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while across blocks, future blocks are masked out. 
Formally, this corresponds to using the attention mask $\mathbf{A} = \bordermatrix{ & \mathcal{T}_i & \mathcal{T}_q & \mathcal{T}_a \cr \mathcal{T}_i & \mathbf{1} & \mathbf{0} & \mathbf{0} \cr \mathcal{T}_q & \mathbf{1} & \mathbf{1} & \mathbf{0} \cr \mathcal{T}_a & \mathbf{1} & \mathbf{1} & \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block bidirectional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
1307
 
1308
  In $\pi_0$, both the VLM backbone and action expert are updated using a *flow matching* loss, and in particular are updated minimizing:
 
 
 
1309
  ``` math
1310
  \begin{align}
1311
  \mathcal{L}(\phi, \theta) &=
 
347
 
348
  In the simplified case here considered (for which $\boldsymbol{p} \equiv p$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $p^*$ by solving analytically for $q: p(q) = f_{\text{FK}}(q) = p^*$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $\Vert p(q) - p^* \Vert_2^2$ is a natural candidate), yielding:
349
 
350
+ <span id="ik_problem" style="position: absolute;">
351
+ </span>
352
+
353
  ``` math
354
  \begin{align}
355
  \min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .
 
364
  However, IK--solving eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> for a feasible $q$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate this problem by providing a length-$K$ succession of goal poses $\tau_K = [p^*_0, p^*_1, \dots p^*_K]$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $\tau_K$ via IK can prove prohibitively expensive, as tracking would require $K$ resolutions of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a>. Let $J(q)$ denote the Jacobian matrix of (partial) derivatives of the FK-function $f_\text{FK}: \mathcal Q \mapsto \mathcal P$, such that $J(q) = \frac{\partial f_{FK}(q)}{\partial q }$. Then, one can apply the chain rule to any $p(q) = f_{\text{FK}}(q)$, deriving $\dot p = J(q) \dot q$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
365
 
366
  Given a desired end-effector trajectory $\dot {p}^*(t)$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $\dot q(t)$ solving for joints’ *velocities* instead of *configurations*,
367
+ <span id="reg_ik_velocity" style="position: absolute;">
368
+ </span>
369
+
370
  ``` math
371
  \begin{align}
372
  \dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2
 
526
  ``` math
527
  \htmlId{trajectory_definition}{\tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),}
528
  ```
 
529
  with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notation. Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
530
+ <span id="dynamics_markovian" style="position: absolute;">
531
+ </span>
532
 
533
  ``` math
534
  \begin{align}
 
536
  \mathbb P(a_t\vert s_t, a_{t-1}, s_{t-1}, s_0, a_0) &= \mathbb P(a_t\vert s_t)
537
  \end{align}
538
  ```
539
+
540
  The probability of observing a given trajectory $\tau$ factorizes into
541
+
542
  ``` math
543
  \htmlId{traj_prob}{\mathbb P(\tau) = \mathbb P (s_0) \prod_{t=0}^{T-1} \mathbb P (s_{t+1}\vert s_t, a_t)\ \mathbb P(a_t\vert s_t).}
544
  ```
545
 
546
  Policies $\mathbb P(a_t\vert s_t)$ are typically indicated as $\pi(a_t\vert s_t)$, and often parametrized via $\theta$, yielding $\pi_\theta (a_t\vert s_t)$. Policies are trained optimizing the (discounted) *return* associated to a given $\tau$, i.e. the (random) sum of measured rewards over trajectory:
 
547
  ``` math
548
  G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
549
  ```
550
  In that, agents seek to learn control strategies (*policies*, $\pi_\theta$) maximizing the expected return $\mathbb E_{\tau \sim \pi_\theta} G(\tau)$. For a given dynamics $\mathcal D$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $\Pi$, yielding the (maximization) target $J : \Pi \mapsto \mathbb R$
551
+ <span id="RL-j-function" style="position: absolute;">
552
+ </span>
553
+
554
  ``` math
555
  \begin{align}
556
  J(\pi_\theta) &= \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)], \\
 
567
  ``` math
568
  Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
569
  ```
 
570
  Crucially, value functions are interrelated:
571
+ <span id="q-as-v" style="position: absolute;">
572
+ </span>
573
 
574
  ``` math
575
  \begin{align}
 
660
  Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $Q_K \approx Q^*$ at each timestep. In fact, under certain assumptions on the MDP considered, $Q_K \to Q^* \, \text{as } K \to \infty$.
661
 
662
  Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning was found complicated to scale to large $\mathcal S\times \mathcal A$ problems, in which the storing of $Q : \mathcal S\times \mathcal A\mapsto \mathbb R$ alone might prove prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating large and even unstructured *state* spaces. DQN parametrizes the Q-function using a neural network with parameters $\theta$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $\delta_i$):
663
+ <span id="dqn-loss" style="position: absolute;">
664
+ </span>
665
+
666
  ``` math
667
  \begin{align}
668
  \mathcal L(\theta_i) &= \mathbb E_{(s_t, a_t) \sim \chi(\bullet)}
 
687
  Similarly to DQN, DDPG also employs the same replay buffer mechanism, to reuse past transitions over training for increased sample efficiency and estimate the loss function via MC-estimates.
688
 
689
  Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with <span class="highlight">maximizing the discounted cumulative reward, while acting as randomly as possible</span>. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. In that, MaxEnt revisits the RL objective $J (\pi)$ to specifically account for the policy entropy,
690
+ <span id="J-soft" style="position: absolute;">
691
+ </span>
692
+
693
  ``` math
694
  \begin{align}
695
  J(\pi) &= \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]
 
853
  </figure>
854
 
855
  Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can thus be written as:
856
+ <span id="evidence-definition-1" style="position: absolute;">
857
+ </span>
858
+
859
  ``` math
860
  \begin{align}
861
  \log p_\theta(\mathcal D) &= \log \sum_{i=0}^N p_\theta ((o,a)_i) \\
 
871
  In their seminal work on Variational Auto-Encoders (VAEs), @kingmaAutoEncodingVariationalBayes2022 present two major contributions to learn complex latent-variable GMs on unstructured data, proposing (1) a tractable, variational lower-bound to <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> as an optimization target to jointly learn likelihood and posterior and (2) high-capacity function approximators to model the likelihood $p_\theta(o,a\vert z)$ and (approximate) posterior distribution $q_\phi(z \vert o,a) \approx q_\theta(z \vert o,a)$.
872
 
873
  In particular, the lower bound on <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> (Evidence LOwer Bound, *ELBO*) can be derived from <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> applying Jensen’s inequality--$\log \mathbb{E}[\bullet] \geq \mathbb{E} [\log (\bullet)]$--yielding:
874
+ <span id="ELBO-intractable" style="position: absolute;">
875
+ </span>
876
+
877
  ``` math
878
  \begin{align}
879
  \log p_\theta(\mathcal D) &\geq \sum_{i=0}^{N} \left(
 
886
  \right)
887
  \end{align}
888
  ```
 
889
  The true, generally intractable posterior $p_\theta (z \vert o,a)$ prevents computing both the expectation and KL divergence terms in <a href="#ELBO-intractable" data-reference-type="ref" data-reference="ELBO-intractable">[ELBO-intractable]</a>, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $q_\phi(z \vert o,a)$, resulting in the final, tractable ELBO objective,
890
+ <span id="ELBO" style="position: absolute;">
891
+ </span>
892
 
893
  ``` math
894
  \begin{align}
 
903
 
904
  An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits a particularly interpretable factorization
905
 
906
+ <span id="VAE-min-neg-ELBO" style="position: absolute;">
907
+ </span>
908
+
909
  ``` math
910
  \begin{align}
911
  \min_{\theta, \phi} - \text{ELBO}_{\mathcal (o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi) \\
 
928
  #### Diffusion Models
929
 
930
  VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $p_\theta$ and (approximate) posterior $q_\phi$ for such model. In that, the unknown data distribution $p(o,a)$ is effectively approximated via $\int_Z p(z) p_\theta(o,a \vert z)$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $p(o,a)$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $o,a$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#ch4-many-latents" data-reference-type="ref" data-reference="ch4-many-latents">[ch4-many-latents]</a>), resulting in
931
+ <span id="BC-multi-latent-model-1" style="position: absolute;">
932
+ </span>
933
+
934
  ``` math
935
  \begin{align}
936
  p(\underbrace{o,a}_{= z_0}) &= \int_{\text{supp}({Z_0})} \int_{\text{supp}({Z_1})} \ldots \int_{\text{supp}({Z_T})} p(z_0, z_1, \dots z_T) \\
 
956
  Just like VAEs, DMs attempt to learn to reproduce an underlying data distribution $p (o,a)$ given a collection of i.i.d. samples approximating the model posited to have generated the data in the first place ( <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>). Similarly to VAEs, DMs approximate the process of sampling from the unknown $p(o,a)$ by (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution. However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $p(o,a)$. Crucially, as no information from the sample $(o,a)$ (denoted as $z_0 \equiv (o,a)$ for the sake of notation) is assumed to be propagated throughout the chain of latents, the posterior $q(z_t \vert z_{t-1})$ assumes a relatively amicable structure in DMs, reducing complexity. The *true* likelihood $p(z_{t-1} \vert z_t)$ is instead typically approximated using the parametrization $p_\theta (z_{t-1} \vert z_t)$. In that, the information contained in the unknown data distribution is *reconstructed* via a process in which samples from a fixed distribution are turned into (ideally) high-likelihood samples under $p(o,a)$--a process referred to as *denoising*.
957
 
958
  Under such model, we can express the log-likelihood of an arbitrary sample as[^4]
959
+ <span id="diffusion-likelihood" style="position: absolute;">
960
+ </span>
961
+
962
  ``` math
963
  \begin{align}
964
  \log p_\theta (\underbrace{o,a}_{= z_0}) =
 
998
  </figure>
999
 
1000
  Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples, and comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion. Under the only assumption that the likelihood of the diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by learning to *reverse* such displacement. This key intuition allows one to write a simplified training objective:
1001
+ <span id="diffusion-simplified-loss" style="position: absolute;">
1002
+ </span>
1003
+
1004
  ``` math
1005
  \begin{align}
1006
 
 
1014
  In this simplified (minimization) objective, the optimization process differs from <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
1015
 
1016
  By learning the total displacement from a generally uninformative, corrupted sample--obtained by diffusing information away from a sample of an unknown distribution--significant ($\Vert \epsilon \Vert > 0$) whenever input and target distribution are sufficiently different, @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution by reversing the displacement, *denoising* samples. Interestingly, under the hypothesis that real-world data belongs to a single higher-dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-manifold point (such as perturbed, uninformative samples) to the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $p_\theta (z_{t-1} \vert z_t)$ is Gaussian, sampling $z_{t-1} \sim p_\theta(\bullet \vert z_{t})$ corresponds to computing
1017
+ <span id="diffusion-denoising-definition" style="position: absolute;">
1018
+ </span>
1019
+
1020
  ``` math
1021
  \begin{align}
1022
  z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}),
 
1070
  </figure>
1071
 
1072
  In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, CFM defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, resulting in the target vector field $u(t, z_t) = z_1 - z_0$. Then, a FM model can be trained with the simple regression objective defined as
1073
+ <span id="flow-matching-objective" style="position: absolute;">
1074
+ </span>
1075
+
1076
  ``` math
1077
  \begin{align}
1078
 
 
1089
  On the robot learning side of their contributions, @zhaoLearningFineGrainedBimanual2023 adopt transformers as the architectural backbone to learn a *Conditional* VAE @sohnLearningStructuredOutput2015. Conditional VAEs are a variation of the more standard VAE formulation introducing a conditioning variable on sampling from the latent prior, allowing the modeling of *one-to-many* relationships between latent and data samples. Further, in stark contrast with previous work @florenceImplicitBehavioralCloning2022, @jannerPlanningDiffusionFlexible2022, @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $p_\theta(o,a)$ on observation and actions. While the *policy* distribution $p_\theta(a \vert o)$ can in principle be entirely described from its joint $p_\theta(o,a)$, it is often the case that the conditional distribution is intractable when using function approximators, as $p_\theta(a \vert o) = \tfrac{p_\theta(o,a)}{\int_\mathcal Ap_\theta(o,a)}$ and the integral in the denominator is typically intractable. Instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE @sohnLearningStructuredOutput2015 modeling the policy distribution directly $p (a \vert o)$.
1090
 
1091
  In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in <a href="#ELBO" data-reference-type="ref" data-reference="ELBO">[ELBO]</a>, which is adapted to
1092
+ <span id="c-ELBO" style="position: absolute;">
1093
+ </span>
1094
+
1095
  ``` math
1096
  \begin{align}
1097
 
 
1152
  DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $p(o,a)$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $p_\theta(o)$ given $p_\theta(o,a)$, DP’s rationale for modeling the data distribution via $p_\theta(a \vert o)$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
1153
 
1154
  In practice, conditioning on observation data is achieved by conditioning the added noise regressor $\epsilon_\theta$ introduced in <a href="#diffusion-simplified-loss" data-reference-type="ref" data-reference="diffusion-simplified-loss">[diffusion-simplified-loss]</a> on a stack of $T_o$ observations, resulting in the *conditional* simplified diffusion objective
1155
+ <span id="diffusion-policy-objective" style="position: absolute;">
1156
+ </span>
1157
+
1158
  ``` math
1159
  \begin{align}
1160
  \mathcal L(\theta) &= \mathbb{E}_{t, a_{t:t+H_a}, \epsilon} \big[
 
1355
  Concretely, $\pi_0$ is a unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $p_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple camera viewpoints $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process the robot proprioceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turning them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while across blocks, future blocks are masked out. 
Formally, this corresponds to using the attention mask $\mathbf{A} = \bordermatrix{ & \mathcal{T}_i & \mathcal{T}_q & \mathcal{T}_a \cr \mathcal{T}_i & \mathbf{1} & \mathbf{0} & \mathbf{0} \cr \mathcal{T}_q & \mathbf{1} & \mathbf{1} & \mathbf{0} \cr \mathcal{T}_a & \mathbf{1} & \mathbf{1} & \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block bidirectional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
1356
 
1357
  In $\pi_0$, both the VLM backbone and action expert are updated using a *flow matching* loss, and in particular are updated minimizing:
1358
+ <span id="pi0-loss" style="position: absolute;">
1359
+ </span>
1360
+
1361
  ``` math
1362
  \begin{align}
1363
  \mathcal{L}(\phi, \theta) &=
app/scripts/latex-to-mdx/post-processor.mjs CHANGED
@@ -300,6 +300,29 @@ function fixLinkTextContent(content) {
300
  return cleanedContent;
301
  }
302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  /**
304
  * Main post-processing function that applies all cleanup steps
305
  * @param {string} content - Raw Markdown content from Pandoc
@@ -315,6 +338,7 @@ export function postProcessMarkdown(content, inputDir = null) {
315
  processedContent = removeTexGroupingCommands(processedContent);
316
  processedContent = simplifyLatexDelimiters(processedContent);
317
  processedContent = removeOrphanedLabels(processedContent);
 
318
  processedContent = fixMathCommands(processedContent);
319
  processedContent = fixMatrixCommands(processedContent);
320
  processedContent = fixUnicodeIssues(processedContent);
 
300
  return cleanedContent;
301
  }
302
 
303
/**
 * Move align-block anchor markers out of fenced math blocks, emitting an
 * absolutely-positioned HTML span (a link target) right before each block.
 * @param {string} content - Markdown content
 * @returns {string} - Content with converted anchor spans
 */
function convertAlignAnchors(content) {
  console.log(' 🏷️ Converting align anchor markers to HTML spans...');

  // A marker line of the form %%ALIGN_ANCHOR_ID{...}%% directly after the
  // opening ``` math fence identifies the anchor id for that math block.
  const anchorPattern = /``` math\n%%ALIGN_ANCHOR_ID\{([^}]+)\}%%\n([\s\S]*?)\n```/g;
  let hits = 0;

  const rewritten = content.replace(anchorPattern, (_match, id, body) => {
    hits += 1;
    return `<span id="${id}" style="position: absolute;"></span>\n\n\`\`\` math\n${body}\n\`\`\``;
  });

  if (hits > 0) {
    console.log(` ✅ Converted ${hits} align anchor marker(s) to spans`);
  }

  return rewritten;
}
325
+
326
  /**
327
  * Main post-processing function that applies all cleanup steps
328
  * @param {string} content - Raw Markdown content from Pandoc
 
338
  processedContent = removeTexGroupingCommands(processedContent);
339
  processedContent = simplifyLatexDelimiters(processedContent);
340
  processedContent = removeOrphanedLabels(processedContent);
341
+ processedContent = convertAlignAnchors(processedContent);
342
  processedContent = fixMathCommands(processedContent);
343
  processedContent = fixMatrixCommands(processedContent);
344
  processedContent = fixUnicodeIssues(processedContent);
app/src/components/Hero.astro CHANGED
@@ -202,7 +202,7 @@ const pdfFilename = `${slugify(pdfBase)}.pdf`;
202
  /* Hero (full-width) */
203
  .hero {
204
  width: 100%;
205
- padding: 48px 16px 16px;
206
  text-align: center;
207
  }
208
  .hero-title {
 
202
  /* Hero (full-width) */
203
  .hero {
204
  width: 100%;
205
+ padding: 48px 16px 48px;
206
  text-align: center;
207
  }
208
  .hero-title {
app/src/content/article.mdx CHANGED
@@ -347,6 +347,9 @@ Deriving the end-effector’s *pose*--position *and* orientation--in some $m$-di
347
 
348
  In the simplified case here considered (for which $\boldsymbol{p} \equiv p$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $p^*$ by solving analytically for $q: p(q) = f_{\text{FK}}(q) = p^*$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $\Vert p(q) - p^* \Vert_2^2$ is a natural candidate), yielding:
349
 
 
 
 
350
  ``` math
351
  \begin{align}
352
  \min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .
@@ -361,6 +364,9 @@ For instance, the robot in Figure <a href="#planar-manipulator-floor" data-refe
361
  However, IK--solving eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> for a feasible $q$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate this problem by providing a length-$K$ succession of goal poses $\tau_K = [p^*_0, p^*_1, \dots p^*_K]$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $\tau_K$ via IK can prove prohibitively expensive, as tracking would require $K$ resolutions of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a>. Let $J(q)$ denote the Jacobian matrix of (partial) derivatives of the FK-function $f_\text{FK}: \mathcal Q \mapsto \mathcal P$, such that $J(q) = \frac{\partial f_{FK}(q)}{\partial q }$. Then, one can apply the chain rule to any $p(q) = f_{\text{FK}}(q)$, deriving $\dot p = J(q) \dot q$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
362
 
363
  Given a desired end-effector trajectory $\dot {p}^*(t)$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $\dot q(t)$ solving for joints’ *velocities* instead of *configurations*,
 
 
 
364
  ``` math
365
  \begin{align}
366
  \dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2
@@ -520,8 +526,9 @@ A length-$T$ *trajectory* is the (random) sequence
520
  ``` math
521
  \htmlId{trajectory_definition}{\tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),}
522
  ```
523
-
524
  with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notation.Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
 
 
525
 
526
  ``` math
527
  \begin{align}
@@ -529,17 +536,21 @@ with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notat
529
  \mathbb P(a_t\vert s_t, a_{t-1}, s_{t-1}, s_0, a_0) &= \mathbb P(a_t\vert s_t)
530
  \end{align}
531
  ```
 
532
  The probability of observing a given trajectory $\tau$ factorizes into
 
533
  ``` math
534
  \htmlId{traj_prob}{\mathbb P(\tau) = \mathbb P (s_0) \prod_{t=0}^{T-1} \mathbb P (s_{t+1}\vert s_t, a_t)\ \mathbb P(a_t\vert s_t).}
535
  ```
536
 
537
  Policies $\mathbb P(a_t\vert s_t)$ are typically indicated as $\pi(a_t\vert s_t)$, and often parametrized via $\theta$, yielding $\pi_\theta (a_t\vert s_t)$. Policies are trained optimizing the (discounted) *return* associated to a given $\tau$, i.e. the (random) sum of measured rewards over trajectory:
538
-
539
  ``` math
540
  G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
541
  ```
542
  In that, agents seek to learn control strategies (*policies*, $\pi_\theta$) maximizing the expected return $\mathbb E_{\tau \sim \pi_\theta} G(\tau)$. For a given dynamics $\mathcal D$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $\Pi$, yielding the (maximization) target $J : \Pi \mapsto \mathbb R$
 
 
 
543
  ``` math
544
  \begin{align}
545
  J(\pi_\theta) &= \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)], \\
@@ -556,8 +567,9 @@ can be used to discriminate between desirable and undesirable state in terms of
556
  ``` math
557
  Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
558
  ```
559
-
560
  Crucially, value functions are interrelated:
 
 
561
 
562
  ``` math
563
  \begin{align}
@@ -648,6 +660,9 @@ Q_{i+1}(s_t, a_t) \leftarrow \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t
648
  Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $Q_K \approx Q^*$ at each timestep. In fact, under certain assumptions on the MDP considered, $Q_K \to Q^* \, \text{as } K \to \infty$.
649
 
650
  Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning was found complicated to scale to large $\mathcal S\times \mathcal A$ problems, in which the storing of $Q : \mathcal S\times \mathcal A\mapsto \mathbb R$ alone might prove prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating large and even unstructured *state* spaces. DQN parametrizes the Q-function using a neural network with parameters $\theta$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $\delta_i$):
 
 
 
651
  ``` math
652
  \begin{align}
653
  \mathcal L(\theta_i) &= \mathbb E_{(s_t, a_t) \sim \chi(\bullet)}
@@ -672,6 +687,9 @@ Provably, <a href="#deterministic-pg" data-reference-type="ref" data-reference="
672
  Similarly to DQN, DDPG also employs the same replay buffer mechanism, reusing past transitions over training for increased sample efficiency and estimating the loss function via MC-estimates.
673
 
674
  Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with <span class="highlight">maximizing the discounted cumulative reward, while acting as randomly as possible</span>. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. In that, MaxEnt revisits the RL objective $J (\pi)$ to specifically account for the policy entropy,
 
 
 
675
  ``` math
676
  \begin{align}
677
  J(\pi) &= \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]
@@ -835,6 +853,9 @@ Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics app
835
  </figure>
836
 
837
  Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can thus be written as:
 
 
 
838
  ``` math
839
  \begin{align}
840
  \log p_\theta(\mathcal D) &= \log \sum_{i=0}^N p_\theta ((o,a)_i) \\
@@ -850,6 +871,9 @@ In the special case where one assumes distributions to be tractable, $p_\theta (
850
  In their seminal work on Variational Auto-Encoders (VAEs), @kingmaAutoEncodingVariationalBayes2022 present two major contributions to learn complex latent-variable GMs on unstructured data, proposing (1) a tractable, variational lower-bound to <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> as an optimization target to jointly learn likelihood and posterior and (2) high-capacity function approximators to model the likelihood $p_\theta(o,a\vert z)$ and (approximate) posterior distribution $q_\phi(z \vert o,a) \approx q_\theta(z \vert o,a)$.
851
 
852
  In particular, the lower bound on <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> (Evidence LOwer Bound, *ELBO*) can be derived from <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> applying Jensen’s inequality--$\log \mathbb{E}[\bullet] \geq \mathbb{E} [\log (\bullet)]$--yielding:
 
 
 
853
  ``` math
854
  \begin{align}
855
  \log p_\theta(\mathcal D) &\geq \sum_{i=0}^{N} \left(
@@ -862,8 +886,9 @@ In particular, the lower bound on <a href="#evidence-definition" data-reference
862
  \right)
863
  \end{align}
864
  ```
865
-
866
  The true, generally intractable posterior $p_\theta (z \vert o,a)$ prevents computing both the expectation and KL divergence terms in <a href="#ELBO-intractable" data-reference-type="ref" data-reference="ELBO-intractable">[ELBO-intractable]</a>, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $q_\phi(z \vert o,a)$, resulting in the final, tractable ELBO objective,
 
 
867
 
868
  ``` math
869
  \begin{align}
@@ -878,6 +903,9 @@ From Jensen’s inequality, maximizing ELBO results in maximizing the log-likeli
878
 
879
  An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits a particularly interpretable factorization
880
 
 
 
 
881
  ``` math
882
  \begin{align}
883
  \min_{\theta, \phi} - \text{ELBO}_{\mathcal (o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi) \\
@@ -900,6 +928,9 @@ Indeed, it is very common in practice to approximate from the learned likelihood
900
  #### Diffusion Models
901
 
902
  VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $p_\theta$ and (approximate) posterior $q_\phi$ for such model. In that, the unknown data distribution $p(o,a)$ is effectively approximated via $\int_Z p(z) p_\theta(o,a \vert z)$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $p(o,a)$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $o,a$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#ch4-many-latents" data-reference-type="ref" data-reference="ch4-many-latents">[ch4-many-latents]</a>), resulting in
 
 
 
903
  ``` math
904
  \begin{align}
905
  p(\underbrace{o,a}_{= z_0}) &= \int_{\text{supp}({Z_0})} \int_{\text{supp}({Z_1})} \ldots \int_{\text{supp}({Z_T})} p(z_0, z_1, \dots z_T) \\
@@ -925,6 +956,9 @@ Similarily to VAEs, providing an exact interpretation for the latent variables i
925
  Just like VAEs, DMs attempt to learn to reproduce an underlying data distribution $p (o,a)$ given a collection of i.i.d. samples, approximating the model posited to have generated the data in the first place ( <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>). Similarly to VAEs, DMs approximate the process of sampling from the unknown $p(o,a)$ by (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution. However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $p(o,a)$. Crucially, as no information from the sample $(o,a)$ (denoted as $z_0 \equiv (o,a)$ for the sake of notation) is assumed to be propagated throughout the chain of latents, the posterior $q(z_t \vert z_{t-1})$ assumes a relatively amicable structure in DMs, reducing complexity. The *true* likelihood $p(z_{t-1} \vert z_t)$ is instead typically approximated using the parametrization $p_\theta (z_{t-1} \vert z_t)$. In that, the information contained in the unknown data distribution is *reconstructed* via a process in which samples from a fixed distribution are turned into (ideally) high-likelihood samples under $p(o,a)$--a process referred to as *denoising*.
926
 
927
  Under such model, we can express the log-likelihood of an arbitrary sample as[^4]
 
 
 
928
  ``` math
929
  \begin{align}
930
  \log p_\theta (\underbrace{o,a}_{= z_0}) =
@@ -964,6 +998,9 @@ Finally, adopting Gaussian posteriors permits a particularly pleasing interpreta
964
  </figure>
965
 
966
  Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples, and by comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion. Under the only assumption that the likelihood of the diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by learning to *reverse* such displacement. This key intuition allows writing a simplified training objective:
 
 
 
967
  ``` math
968
  \begin{align}
969
 
@@ -977,6 +1014,9 @@ Because the recorded behavior is teleoperated, measurements mostly distribute al
977
  In this simplified (minimization) objective, the optimization process differs from <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
978
 
979
  By learning the total displacement between a generally uninformative, corrupted sample--obtained by diffusing information away--and a sample from an unknown distribution--significant ($\Vert \epsilon \Vert > 0$) whenever input and target distribution are sufficiently different-- @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution by reversing the displacement, *denoising* samples. Interestingly, under the hypothesis that real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-manifold point (such as perturbed, uninformative samples) to the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back onto the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $p_\theta (z_{t-1} \vert z_t)$ is Gaussian, sampling $z_{t-1} \sim p_\theta(\bullet \vert z_{t})$ corresponds to computing
 
 
 
980
  ``` math
981
  \begin{align}
982
  z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}),
@@ -1030,6 +1070,9 @@ While the noising schedule of DMs results in a stochastic process that resembles
1030
  </figure>
1031
 
1032
  In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, CFM defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, resulting in the target vector field $u(t, z_t) = z_1 - z_0$. Then, a FM model can be trained with the simple regression objective defined as
 
 
 
1033
  ``` math
1034
  \begin{align}
1035
 
@@ -1046,6 +1089,9 @@ While GMs prove useful in learning complex, high-dimensional multi-modal distrib
1046
  On the robot learning side of their contributions, @zhaoLearningFineGrainedBimanual2023 adopt transformers as the architectural backbone to learn a *Conditional* VAE @sohnLearningStructuredOutput2015. Conditional VAEs are a variation of the more standard VAE formulation introducing a conditioning variable on sampling from the latent prior, allowing the modeling of *one-to-many* relationships between latent and data samples. Further, in stark contrast with previous work @florenceImplicitBehavioralCloning2022, @jannerPlanningDiffusionFlexible2022, @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $p_\theta(o,a)$ on observation and actions. While the *policy* distribution $p_\theta(a \vert o)$ can in principle be entirely described from its joint $p_\theta(o,a)$, it is often the case that the conditional distribution is intractable when using function approximators, as $p_\theta(a \vert o) = \tfrac{p_\theta(o,a)}{\int_\mathcal Ap_\theta(o,a)}$ and the integral in the denominator is typically intractable. Instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE @sohnLearningStructuredOutput2015 modeling the policy distribution directly $p (a \vert o)$.
1047
 
1048
  In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in <a href="#ELBO" data-reference-type="ref" data-reference="ELBO">[ELBO]</a>, which is adapted to
 
 
 
1049
  ``` math
1050
  \begin{align}
1051
 
@@ -1106,6 +1152,9 @@ However, the authors claim using a deterministic procedure to derive $z$ may ben
1106
  DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $p(o,a)$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $p_\theta(o)$ given $p_\theta(o,a)$, DP’s rationale for modeling the data distribution via $p_\theta(a \vert o)$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
1107
 
1108
  In practice, conditioning on observation data is achieved conditioning the added noise regressor $\epsilon_\theta$ introduced in <a href="#diffusion-simplified-loss" data-reference-type="ref" data-reference="diffusion-simplified-loss">[diffusion-simplified-loss]</a> on a stack of $T_o$ observations, resulting in the *conditional* simplified diffusion objective
 
 
 
1109
  ``` math
1110
  \begin{align}
1111
  \mathcal L(\theta) &= \mathbb{E}_{t, a_{t:t+H_a}, \epsilon} \big[
@@ -1306,6 +1355,9 @@ $\pi_0$ @blackp0VisionLanguageActionFlow2024 introduce a VLA consisting of a Mo
1306
  Concretely, $\pi_0$ is a unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $p_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple camera viewpoints $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process the robot proprioperceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turning them into query, key and value matrices, and only share information with each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioperceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while across blocks, future blocks are masked out.
Formally, this corresponds to using the attention mask $\mathbf{A} = \begin{array}{c|ccc} & \mathcal{T}_i & \mathcal{T}_q & \mathcal{T}_a \\ \hline \mathcal{T}_i & \mathbf{1} & \mathbf{0} & \mathbf{0} \\ \mathcal{T}_q & \mathbf{1} & \mathbf{1} & \mathbf{0} \\ \mathcal{T}_a & \mathbf{1} & \mathbf{1} & \mathbf{1} \end{array}, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block bidirectional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioperceptive and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
1307
 
1308
  In $\pi_0$, both the VLM backbone and action expert are updated using a *flow matching* loss, in particular minimizing:
 
 
 
1309
  ``` math
1310
  \begin{align}
1311
  \mathcal{L}(\phi, \theta) &=
 
347
 
348
  In the simplified case here considered (for which $\boldsymbol{p} \equiv p$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $p^*$ by solving analytically for $q: p(q) = f_{\text{FK}}(q) = p^*$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $\Vert p(q) - p^* \Vert_2^2$ is a natural candidate), yielding:
349
 
350
+ <span id="ik_problem" style="position: absolute;">
351
+ </span>
352
+
353
  ``` math
354
  \begin{align}
355
  \min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .
 
364
  However, IK--solving eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> for a feasible $q$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate to this problem providing a length-$K$ succession of goal poses $\tau_K = [p^*_0, p^*_1, \dots p^*_K]$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $\tau_K$ via IK can prove prohibitively expensive, as tracking would require $K$ resolutions of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a>. Let $J(q)$ denote the Jacobian matrix of (partial) derivatives of the FK-function $f_\text{FK}- \mathcal Q \mapsto \mathcal P$, such that $J(q) = \frac{\partial f_{FK}(q)}{\partial q }$. Then, one can apply the chain rule to any $p(q) = f_{\text{FK}}(q)$, deriving $\dot p = J(q) \dot q$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
365
 
366
  Given a desired end-effector trajectory $\dot {p}^*(t)$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $\dot q(t)$ solving for joints’ *velocities* instead of *configurations*,
367
+ <span id="reg_ik_velocity" style="position: absolute;">
368
+ </span>
369
+
370
  ``` math
371
  \begin{align}
372
  \dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2
 
526
  ``` math
527
  \htmlId{trajectory_definition}{\tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),}
528
  ```
 
529
  with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notation.Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
530
+ <span id="dynamics_markovian" style="position: absolute;">
531
+ </span>
532
 
533
  ``` math
534
  \begin{align}
 
536
  \mathbb P(a_t\vert s_t, a_{t-1}, s_{t-1}, s_0, a_0) &= \mathbb P(a_t\vert s_t)
537
  \end{align}
538
  ```
539
+
540
  The probability of observing a given trajectory $\tau$ factorizes into
541
+
542
  ``` math
543
  \htmlId{traj_prob}{\mathbb P(\tau) = \mathbb P (s_0) \prod_{t=0}^{T-1} \mathbb P (s_{t+1}\vert s_t, a_t)\ \mathbb P(a_t\vert s_t).}
544
  ```
545
 
546
  Policies $\mathbb P(a_t\vert s_t)$ are typically indicated as $\pi(a_t\vert s_t)$, and often parametrized via $\theta$, yielding $\pi_\theta (a_t\vert s_t)$. Policies are trained optimizing the (discounted) *return* associated to a given $\tau$, i.e. the (random) sum of measured rewards over trajectory:
 
547
  ``` math
548
  G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
549
  ```
550
  In that, agents seek to learn control strategies (*policies*, $\pi_\theta$) maximizing the expected return $\mathbb E_{\tau \sim \pi_\theta} G(\tau)$. For a given dynamics $\mathcal D$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $\Pi$, yielding the (maximization) target $J : \Pi \mapsto \mathbb R$
551
+ <span id="RL-j-function" style="position: absolute;">
552
+ </span>
553
+
554
  ``` math
555
  \begin{align}
556
  J(\pi_\theta) &= \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)], \\
 
567
  ``` math
568
  Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
569
  ```
 
570
  Crucially, value functions are interrelated:
571
+ <span id="q-as-v" style="position: absolute;">
572
+ </span>
573
 
574
  ``` math
575
  \begin{align}
 
660
  Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $Q_K \approx Q^*$ at each timestep. In fact, under certain assumptions on the MDP considered, $Q_K \to Q^* \, \text{as } K \to \infty$.
661
 
662
  Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning was found difficult to scale to large $\mathcal S\times \mathcal A$ problems, in which the storing of $Q : \mathcal S\times \mathcal A\mapsto \mathbb R$ alone might prove prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating for large and even unstructured *state* spaces. DQN parametrizes the Q-function using a neural network with parameters $\theta$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $\delta_i$):
663
+ <span id="dqn-loss" style="position: absolute;">
664
+ </span>
665
+
666
  ``` math
667
  \begin{align}
668
  \mathcal L(\theta_i) &= \mathbb E_{(s_t, a_t) \sim \chi(\bullet)}
 
687
  Similarly to DQN, DDPG also employs the same replay buffer mechanism, to reuse past transitions over training for increased sample efficiency and estimate the loss function via MC-estimates.
688
 
689
  Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with <span class="highlight">maximizing the discounted cumulative reward, while acting as randomly as possible</span>. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. In that, MaxEnt revisits the RL objective $J (\pi)$ to specifically account for the policy entropy,
690
+ <span id="J-soft" style="position: absolute;">
691
+ </span>
692
+
693
  ``` math
694
  \begin{align}
695
  J(\pi) &= \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]
 
853
  </figure>
854
 
855
  Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can thus be written as:
856
+ <span id="evidence-definition-1" style="position: absolute;">
857
+ </span>
858
+
859
  ``` math
860
  \begin{align}
861
  \log p_\theta(\mathcal D) &= \log \sum_{i=0}^N p_\theta ((o,a)_i) \\
 
871
  In their seminal work on Variational Auto-Encoders (VAEs), @kingmaAutoEncodingVariationalBayes2022 present two major contributions to learn complex latent-variable GMs on unstructured data, proposing (1) a tractable, variational lower-bound to <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> as an optimization target to jointly learn likelihood and posterior and (2) high-capacity function approximators to model the likelihood $p_\theta(o,a\vert z)$ and (approximate) posterior distribution $q_\phi(z \vert o,a) \approx q_\theta(z \vert o,a)$.
872
 
873
  In particular, the lower bound on <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> (Evidence LOwer Bound, *ELBO*) can be derived from <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> applying Jensen’s inequality--$\log \mathbb{E}[\bullet] \geq \mathbb{E} [\log (\bullet)]$--yielding:
874
+ <span id="ELBO-intractable" style="position: absolute;">
875
+ </span>
876
+
877
  ``` math
878
  \begin{align}
879
  \log p_\theta(\mathcal D) &\geq \sum_{i=0}^{N} \left(
 
886
  \right)
887
  \end{align}
888
  ```
 
889
  The true, generally intractable posterior $p_\theta (z \vert o,a)$ prevents computing both the expectation and KL divergence terms in <a href="#ELBO-intractable" data-reference-type="ref" data-reference="ELBO-intractable">[ELBO-intractable]</a>, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $q_\phi(z \vert o,a)$, resulting in the final, tractable ELBO objective,
890
+ <span id="ELBO" style="position: absolute;">
891
+ </span>
892
 
893
  ``` math
894
  \begin{align}
 
903
 
904
  An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits a particularly interpretable factorization
905
 
906
+ <span id="VAE-min-neg-ELBO" style="position: absolute;">
907
+ </span>
908
+
909
  ``` math
910
  \begin{align}
911
  \min_{\theta, \phi} - \text{ELBO}_{\mathcal (o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi) \\
 
928
  #### Diffusion Models
929
 
930
  VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $p_\theta$ and (approximate) posterior $q_\phi$ for such model. In that, the unknown data distribution $p(o,a)$ is effectively approximated via $\int_Z p(z) p_\theta(o,a \vert z)$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $p(o,a)$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $o,a$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#ch4-many-latents" data-reference-type="ref" data-reference="ch4-many-latents">[ch4-many-latents]</a>), resulting in
931
+ <span id="BC-multi-latent-model-1" style="position: absolute;">
932
+ </span>
933
+
934
  ``` math
935
  \begin{align}
936
  p(\underbrace{o,a}_{= z_0}) &= \int_{\text{supp}({Z_0})} \int_{\text{supp}({Z_1})} \ldots \int_{\text{supp}({Z_T})} p(z_0, z_1, \dots z_T) \\
 
956
  Just like VAEs, DMs attempt to learn to reproduce an underlying data distribution $p (o,a)$ given a collection of i.i.d. samples approximating the model posited to have generated the data in the first place ( <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>). Similarly to VAEs, DMs approximate the process of sampling from the unknown $p(o,a)$ by (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution. However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $p(o,a)$. Crucially, as no information from the sample $(o,a)$ (denoted as $z_0 \equiv (o,a)$ for the sake of notation) is assumed to be propagated throughout the chain of latents, the posterior $q(z_t \vert z_{t-1})$ assumes a relatively amicable structure in DMs, reducing complexity. The *true* likelihood $p(z_{t-1} \vert z_t)$ is instead typically approximated using the parametrization $p_\theta (z_{t-1} \vert z_t)$. In that, the information contained in the unknown data distribution is *reconstructed* via a process in which samples from a fixed distribution are turned into (ideally) high-likelihood samples under $p(o,a)$--a process referred to as *denoising*.
957
 
958
  Under such model, we can express the log-likelihood of an arbitrary sample as[^4]
959
+ <span id="diffusion-likelihood" style="position: absolute;">
960
+ </span>
961
+
962
  ``` math
963
  \begin{align}
964
  \log p_\theta (\underbrace{o,a}_{= z_0}) =
 
998
  </figure>
999
 
1000
  Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples, and comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion. Under the only assumption that the likelihood of the diffused samples is low under the original unknown data distribution, then one can effectively approximate the unknown distribution by learning to *reverse* such displacement. This key intuition allows us to write a simplified training objective:
1001
+ <span id="diffusion-simplified-loss" style="position: absolute;">
1002
+ </span>
1003
+
1004
  ``` math
1005
  \begin{align}
1006
 
 
1014
  In this simplified (minimization) objective, the optimization process differs from <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
1015
 
1016
  By learning the total displacement between a generally uninformative, corrupted sample--obtained by diffusing information away--and a sample from an unknown distribution--significant ($\Vert \epsilon \Vert > 0$) whenever input and target distribution are sufficiently different-- @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution reversing the displacement, *denoising* samples. Interestingly, under the hypothesis real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-manifold point (such as perturbed, uninformative samples) to the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $p_\theta (z_{t-1} \vert z_t)$ is Gaussian, then sampling $z_{t-1} \sim p_\theta(\bullet \vert z_{t})$ corresponds to computing
1017
+ <span id="diffusion-denoising-definition" style="position: absolute;">
1018
+ </span>
1019
+
1020
  ``` math
1021
  \begin{align}
1022
  z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}),
 
1070
  </figure>
1071
 
1072
  In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, CFM defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, resulting in the target vector field $u(t, z_t) = z_1 - z_0$. Then, a FM model can be trained with the simple regression objective defined as
1073
+ <span id="flow-matching-objective" style="position: absolute;">
1074
+ </span>
1075
+
1076
  ``` math
1077
  \begin{align}
1078
 
 
1089
  On the robot learning side of their contributions, @zhaoLearningFineGrainedBimanual2023 adopt transformers as the architectural backbone to learn a *Conditional* VAE @sohnLearningStructuredOutput2015. Conditional VAEs are a variation of the more standard VAE formulation introducing a conditioning variable on sampling from the latent prior, allowing the modeling of *one-to-many* relationships between latent and data samples. Further, in stark contrast with previous work @florenceImplicitBehavioralCloning2022, @jannerPlanningDiffusionFlexible2022, @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $p_\theta(o,a)$ on observation and actions. While the *policy* distribution $p_\theta(a \vert o)$ can in principle be entirely described from its joint $p_\theta(o,a)$, it is often the case that the conditional distribution is intractable when using function approximators, as $p_\theta(a \vert o) = \tfrac{p_\theta(o,a)}{\int_\mathcal Ap_\theta(o,a)}$ and the integral in the denominator is typically intractable. Instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE @sohnLearningStructuredOutput2015 modeling the policy distribution directly $p (a \vert o)$.
1090
 
1091
  In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in <a href="#ELBO" data-reference-type="ref" data-reference="ELBO">[ELBO]</a>, which is adapted to
1092
+ <span id="c-ELBO" style="position: absolute;">
1093
+ </span>
1094
+
1095
  ``` math
1096
  \begin{align}
1097
 
 
1152
  DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $p(o,a)$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $p_\theta(o)$ given $p_\theta(o,a)$, DP’s rationale for modeling the data distribution via $p_\theta(a \vert o)$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
1153
 
1154
  In practice, conditioning on observation data is achieved conditioning the added noise regressor $\epsilon_\theta$ introduced in <a href="#diffusion-simplified-loss" data-reference-type="ref" data-reference="diffusion-simplified-loss">[diffusion-simplified-loss]</a> on a stack of $T_o$ observations, resulting in the *conditional* simplified diffusion objective
1155
+ <span id="diffusion-policy-objective" style="position: absolute;">
1156
+ </span>
1157
+
1158
  ``` math
1159
  \begin{align}
1160
  \mathcal L(\theta) &= \mathbb{E}_{t, a_{t:t+H_a}, \epsilon} \big[
 
1355
  Concretely, $\pi_0$ is a unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $p_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple camera viewpoints $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process the robot proprioceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turning them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while across blocks, future blocks are masked out. 
Formally, this corresponds to using the attention mask $\mathbf{A} = \bordermatrix{ \mathcal{T}_i \mathcal{T}_q \mathcal{T}_a \cr \mathcal{T}_i \mathbf{1} \mathbf{0} \mathbf{0} \cr \mathcal{T}_q \mathbf{1} \mathbf{1} \mathbf{0} \cr \mathcal{T}_a \mathbf{1} \mathbf{1} \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block directional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
1356
 
1357
  In $\pi_0$, both the VLM backbone and action expert are updated using a *flow matching* loss, and in particular are updated minimizing:
1358
+ <span id="pi0-loss" style="position: absolute;">
1359
+ </span>
1360
+
1361
  ``` math
1362
  \begin{align}
1363
  \mathcal{L}(\phi, \theta) &=
app/src/content/{embeds → embeds2}/banner.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/d3-bar.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/d3-benchmark.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/d3-confusion-matrix.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/d3-evals-after-fix.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/d3-evals-tpbug.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/d3-line-quad.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/d3-line.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/d3-matrix.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/d3-neural-network.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/d3-pie-quad.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/d3-pie.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/d3-scatter.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/demo/color-picker.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/demo/content-structure.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/demo/palettes.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/original_embeds/plotly/banner.py RENAMED
File without changes
app/src/content/{embeds → embeds2}/original_embeds/plotly/bar.py RENAMED
File without changes
app/src/content/{embeds → embeds2}/original_embeds/plotly/heatmap.py RENAMED
File without changes
app/src/content/{embeds → embeds2}/original_embeds/plotly/line.py RENAMED
File without changes
app/src/content/{embeds → embeds2}/original_embeds/plotly/poetry.lock RENAMED
File without changes
app/src/content/{embeds → embeds2}/original_embeds/plotly/pyproject.toml RENAMED
File without changes
app/src/content/{embeds → embeds2}/plotly-line.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/throughput-debug-1node.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/throughput-drops-comparison.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/throughput-weka-drops.html RENAMED
File without changes
app/src/content/{embeds → embeds2}/vibe-code-d3-embeds-directives.md RENAMED
File without changes