thibaud frere
committed on
Commit
·
1f9a800
1
Parent(s):
8fd03ae
update
Browse files- app/.astro/astro/content.d.ts +2 -2
- app/scripts/latex-to-mdx/filters/equation-ids.lua +4 -3
- app/scripts/latex-to-mdx/output/main.md +38 -0
- app/scripts/latex-to-mdx/output/main.mdx +56 -4
- app/scripts/latex-to-mdx/post-processor.mjs +24 -0
- app/src/components/Hero.astro +1 -1
- app/src/content/article.mdx +56 -4
- app/src/content/{embeds → embeds2}/banner.html +0 -0
- app/src/content/{embeds → embeds2}/d3-bar.html +0 -0
- app/src/content/{embeds → embeds2}/d3-benchmark.html +0 -0
- app/src/content/{embeds → embeds2}/d3-confusion-matrix.html +0 -0
- app/src/content/{embeds → embeds2}/d3-evals-after-fix.html +0 -0
- app/src/content/{embeds → embeds2}/d3-evals-tpbug.html +0 -0
- app/src/content/{embeds → embeds2}/d3-line-quad.html +0 -0
- app/src/content/{embeds → embeds2}/d3-line.html +0 -0
- app/src/content/{embeds → embeds2}/d3-matrix.html +0 -0
- app/src/content/{embeds → embeds2}/d3-neural-network.html +0 -0
- app/src/content/{embeds → embeds2}/d3-pie-quad.html +0 -0
- app/src/content/{embeds → embeds2}/d3-pie.html +0 -0
- app/src/content/{embeds → embeds2}/d3-scatter.html +0 -0
- app/src/content/{embeds → embeds2}/demo/color-picker.html +0 -0
- app/src/content/{embeds → embeds2}/demo/content-structure.html +0 -0
- app/src/content/{embeds → embeds2}/demo/palettes.html +0 -0
- app/src/content/{embeds → embeds2}/original_embeds/plotly/banner.py +0 -0
- app/src/content/{embeds → embeds2}/original_embeds/plotly/bar.py +0 -0
- app/src/content/{embeds → embeds2}/original_embeds/plotly/heatmap.py +0 -0
- app/src/content/{embeds → embeds2}/original_embeds/plotly/line.py +0 -0
- app/src/content/{embeds → embeds2}/original_embeds/plotly/poetry.lock +0 -0
- app/src/content/{embeds → embeds2}/original_embeds/plotly/pyproject.toml +0 -0
- app/src/content/{embeds → embeds2}/plotly-line.html +0 -0
- app/src/content/{embeds → embeds2}/throughput-debug-1node.html +0 -0
- app/src/content/{embeds → embeds2}/throughput-drops-comparison.html +0 -0
- app/src/content/{embeds → embeds2}/throughput-weka-drops.html +0 -0
- app/src/content/{embeds → embeds2}/vibe-code-d3-embeds-directives.md +0 -0
app/.astro/astro/content.d.ts
CHANGED
|
@@ -209,12 +209,12 @@ declare module 'astro:content' {
|
|
| 209 |
data: any
|
| 210 |
} & { render(): Render[".mdx"] };
|
| 211 |
};
|
| 212 |
-
"
|
| 213 |
"vibe-code-d3-embeds-directives.md": {
|
| 214 |
id: "vibe-code-d3-embeds-directives.md";
|
| 215 |
slug: "vibe-code-d3-embeds-directives";
|
| 216 |
body: string;
|
| 217 |
-
collection: "
|
| 218 |
data: any
|
| 219 |
} & { render(): Render[".md"] };
|
| 220 |
};
|
|
|
|
| 209 |
data: any
|
| 210 |
} & { render(): Render[".mdx"] };
|
| 211 |
};
|
| 212 |
+
"embeds2": {
|
| 213 |
"vibe-code-d3-embeds-directives.md": {
|
| 214 |
id: "vibe-code-d3-embeds-directives.md";
|
| 215 |
slug: "vibe-code-d3-embeds-directives";
|
| 216 |
body: string;
|
| 217 |
+
collection: "embeds2";
|
| 218 |
data: any
|
| 219 |
} & { render(): Render[".md"] };
|
| 220 |
};
|
app/scripts/latex-to-mdx/filters/equation-ids.lua
CHANGED
|
@@ -69,9 +69,10 @@ function Math(el)
|
|
| 69 |
|
| 70 |
local new_math
|
| 71 |
if has_align then
|
| 72 |
-
-- For align environments,
|
| 73 |
-
-- Instead, we
|
| 74 |
-
|
|
|
|
| 75 |
else
|
| 76 |
-- For other math, wrap with \htmlId{}
|
| 77 |
new_math = "\\htmlId{" .. clean_id .. "}{" .. clean_math .. "}"
|
|
|
|
| 69 |
|
| 70 |
local new_math
|
| 71 |
if has_align then
|
| 72 |
+
-- For align environments, KaTeX doesn't support \htmlId with align
|
| 73 |
+
-- Instead, we add a special marker that the post-processor will convert to a span
|
| 74 |
+
-- This span will serve as an anchor for references
|
| 75 |
+
new_math = "%%ALIGN_ANCHOR_ID{" .. clean_id .. "}%%\n" .. clean_math
|
| 76 |
else
|
| 77 |
-- For other math, wrap with \htmlId{}
|
| 78 |
new_math = "\\htmlId{" .. clean_id .. "}{" .. clean_math .. "}"
|
app/scripts/latex-to-mdx/output/main.md
CHANGED
|
@@ -248,6 +248,8 @@ Deriving the end-effector’s *pose*--position *and* orientation--in some $`m`$-
|
|
| 248 |
|
| 249 |
In the simplified case here considered (for which $`\boldsymbol{p} \equiv p`$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $`p^*`$ by solving analytically for $`q: p(q) = f_{\text{FK}}(q) = p^*`$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $`\Vert p(q) - p^* \Vert_2^2`$ is a natural candidate), yielding:
|
| 250 |
|
|
|
|
|
|
|
| 251 |
``` math
|
| 252 |
\begin{align}
|
| 253 |
\min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .
|
|
@@ -262,6 +264,8 @@ For instance, the robot in Figure <a href="#planar-manipulator-floor" data-refe
|
|
| 262 |
However, IK--solving eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> for a feasible $`q`$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate this problem providing a length-$`K`$ succession of goal poses $`\tau_K = [p^*_0, p^*_1, \dots p^*_K]`$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $`\tau_K`$ via IK can prove prohibitively expensive, as tracking would require $`K`$ resolutions of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a>. Let $`J(q)`$ denote the Jacobian matrix of (partial) derivatives of the FK-function $`f_\text{FK}: \mathcal Q \mapsto \mathcal P`$, such that $`J(q) = \frac{\partial f_{FK}(q)}{\partial q }`$. Then, one can apply the chain rule to any $`p(q) = f_{\text{FK}}(q)`$, deriving $`\dot p = J(q) \dot q`$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
|
| 263 |
|
| 264 |
Given a desired end-effector trajectory $`\dot {p}^*(t)`$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $`\dot q(t)`$ solving for joints’ *velocities* instead of *configurations*,
|
|
|
|
|
|
|
| 265 |
``` math
|
| 266 |
\begin{align}
|
| 267 |
\dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2
|
|
@@ -390,6 +394,8 @@ A length-$`T`$ *trajectory* is the (random) sequence
|
|
| 390 |
\htmlId{trajectory_definition}{\tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),}
|
| 391 |
```
|
| 392 |
with per-step rewards defined as $`r_t = r (s_t, a_t, s_{t+1})`$ for ease of notation. Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
|
|
|
|
|
|
|
| 393 |
``` math
|
| 394 |
\begin{align}
|
| 395 |
\mathbb P(s_{t+1}\vert s_t, a_t, s_{t-1}, a_{t-1}, \dots s_0, a_0 ) &= \mathbb P (s_{t+1}\vert s_t, a_t) \\
|
|
@@ -406,6 +412,8 @@ Policies $`\mathbb P(a_t\vert s_t)`$ are typically indicated as $`\pi(a_t\vert s
|
|
| 406 |
G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
|
| 407 |
```
|
| 408 |
In that, agents seek to learn control strategies (*policies*, $`\pi_\theta`$) maximizing the expected return $`\mathbb E_{\tau \sim \pi_\theta} G(\tau)`$. For a given dynamics $`\mathcal D`$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $`\Pi`$, yielding the (maximization) target $`J : \Pi \mapsto \mathbb R`$
|
|
|
|
|
|
|
| 409 |
``` math
|
| 410 |
\begin{align}
|
| 411 |
J(\pi_\theta) &= \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)], \\
|
|
@@ -422,6 +430,8 @@ can be used to discriminate between desirable and undesirable state in terms of
|
|
| 422 |
Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
|
| 423 |
```
|
| 424 |
Crucially, value functions are interrelated:
|
|
|
|
|
|
|
| 425 |
``` math
|
| 426 |
\begin{align}
|
| 427 |
Q_\pi(s_t, a_t) &= \mathbb{E}_{s_{t+1}\sim \mathbb P(\bullet \vert s_t, a_t)} [r_t + \gamma V_\pi(s_{t+1})] \\
|
|
@@ -493,6 +503,8 @@ Q_{i+1}(s_t, a_t) \leftarrow \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t
|
|
| 493 |
Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $`Q_K \approx Q^*`$ at each timestep. In fact, under certain assumptions on the MDP considered, $`Q_K \to Q^* \, \text{as } K \to \infty`$.
|
| 494 |
|
| 495 |
Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning was found complicated to scale to large $`\mathcal S\times \mathcal A`$ problems, in which the storing of $`Q : \mathcal S\times \mathcal A\mapsto \mathbb R`$ alone might prove prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating large and even unstructured *state* spaces. DQN parametrizes the Q-function using a neural network with parameters $`\theta`$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $`\delta_i`$):
|
|
|
|
|
|
|
| 496 |
``` math
|
| 497 |
\begin{align}
|
| 498 |
\mathcal L(\theta_i) &= \mathbb E_{(s_t, a_t) \sim \chi(\bullet)}
|
|
@@ -515,6 +527,8 @@ Provably, <a href="#deterministic-pg" data-reference-type="ref" data-reference="
|
|
| 515 |
Similarly to DQN, DDPG also employs the same replay buffer mechanism, to reuse past transitions over training for increased sample efficiency and estimate the loss function via MC-estimates.
|
| 516 |
|
| 517 |
Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with \<span class="highlight"\>maximizing the discounted cumulative reward, while acting as randomly as possible\</span\>. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. In that, MaxEnt revisits the RL objective $`J (\pi)`$ to specifically account for the policy entropy,
|
|
|
|
|
|
|
| 518 |
``` math
|
| 519 |
\begin{align}
|
| 520 |
J(\pi) &= \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]
|
|
@@ -643,6 +657,8 @@ Intuitively, in the case of observation-action pairs $`(o, a)`$ for a robotics a
|
|
| 643 |
</figure>
|
| 644 |
|
| 645 |
Given a dataset $`\mathcal D`$ consisting of $`N`$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $`\theta`$ (in Bayesian terms, the *evidence* $`p_\theta(\mathcal D)`$) can thus be written as:
|
|
|
|
|
|
|
| 646 |
``` math
|
| 647 |
\begin{align}
|
| 648 |
\log p_\theta(\mathcal D) &= \log \sum_{i=0}^N p_\theta ((o,a)_i) \\
|
|
@@ -658,6 +674,8 @@ In the special case where one assumes distributions to be tractable, $`p_\theta
|
|
| 658 |
In their seminal work on Variational Auto-Encoders (VAEs), @kingmaAutoEncodingVariationalBayes2022 present two major contributions to learn complex latent-variable GMs on unstructured data, proposing (1) a tractable, variational lower-bound to <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> as an optimization target to jointly learn likelihood and posterior and (2) high-capacity function approximators to model the likelihood $`p_\theta(o,a\vert z)`$ and (approximate) posterior distribution $`q_\phi(z \vert o,a) \approx q_\theta(z \vert o,a)`$.
|
| 659 |
|
| 660 |
In particular, the lower bound on <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> (Evidence LOwer Bound, *ELBO*) can be derived from <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> applying Jensen’s inequality--$`\log \mathbb{E}[\bullet] \geq \mathbb{E} [\log (\bullet)]`$--yielding:
|
|
|
|
|
|
|
| 661 |
``` math
|
| 662 |
\begin{align}
|
| 663 |
\log p_\theta(\mathcal D) &\geq \sum_{i=0}^{N} \left(
|
|
@@ -671,6 +689,8 @@ In particular, the lower bound on <a href="#evidence-definition" data-reference
|
|
| 671 |
\end{align}
|
| 672 |
```
|
| 673 |
The true, generally intractable posterior $`p_\theta (z \vert o,a)`$ prevents computing both the expectation and KL divergence terms in <a href="#ELBO-intractable" data-reference-type="ref" data-reference="ELBO-intractable">[ELBO-intractable]</a>, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $`q_\phi(z \vert o,a)`$, resulting in the final, tractable ELBO objective,
|
|
|
|
|
|
|
| 674 |
``` math
|
| 675 |
\begin{align}
|
| 676 |
\text{ELBO}_{\mathcal D}(\theta, \phi) = \sum_{i=0}^{N} \left(
|
|
@@ -684,6 +704,8 @@ From Jensen’s inequality, maximizing ELBO results in maximizing the log-likeli
|
|
| 684 |
|
| 685 |
An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits a particularly interpretable factorization
|
| 686 |
|
|
|
|
|
|
|
| 687 |
``` math
|
| 688 |
\begin{align}
|
| 689 |
\min_{\theta, \phi} - \text{ELBO}_{\mathcal (o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi) \\
|
|
@@ -705,6 +727,8 @@ Indeed, it is very common in practice to approximate from the learned likelihood
|
|
| 705 |
#### Diffusion Models
|
| 706 |
|
| 707 |
VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $`p_\theta`$ and (approximate) posterior $`q_\phi`$ for such model. In that, the unknown data distribution $`p(o,a)`$ is effectively approximated via $`\int_Z p(z) p_\theta(o,a \vert z)`$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $`p(o,a)`$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $`o,a`$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#ch4-many-latents" data-reference-type="ref" data-reference="ch4-many-latents">[ch4-many-latents]</a>), resulting in
|
|
|
|
|
|
|
| 708 |
``` math
|
| 709 |
\begin{align}
|
| 710 |
p(\underbrace{o,a}_{= z_0}) &= \int_{\text{supp}({Z_0})} \int_{\text{supp}({Z_1})} \ldots \int_{\text{supp}({Z_T})} p(z_0, z_1, \dots z_T) \\
|
|
@@ -724,6 +748,8 @@ Similarily to VAEs, providing an exact interpretation for the latent variables i
|
|
| 724 |
Just like VAEs, DMs attempt to learn to reproduce an underlying data distribution $`p (o,a)`$ given a collection of i.i.d. samples approximating the model posited to have generated the data in the first place ( <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>). Similarly to VAEs, DMs approximate the process of sampling from the unknown $`p(o,a)`$ (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution. However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $`p(o,a)`$. Crucially, as no information from the sample $`(o,a)`$ (denoted as $`z_0 \equiv (o,a)`$ for the sake of notation) is assumed to be propagated throughout the chain of latents, the posterior $`q(z_t \vert z_{t-1})`$ assumes a relatively amicable structure in DMs, reducing complexity. The *true* likelihood $`p(z_{t-1} \vert z_t)`$ is instead typically approximated using the parametrization $`p_\theta (z_{t-1} \vert z_t)`$. In that, the information contained in the unknown data distribution is *reconstructed* via a process in which samples from a fixed distribution are turned into (ideally) high-likelihood samples under $`p(o,a)`$--a process referred to as *denoising*.
|
| 725 |
|
| 726 |
Under such model, we can express the log-likelihood of an arbitrary sample as[^4]
|
|
|
|
|
|
|
| 727 |
``` math
|
| 728 |
\begin{align}
|
| 729 |
\log p_\theta (\underbrace{o,a}_{= z_0}) =
|
|
@@ -751,6 +777,8 @@ Finally, adopting Gaussian posteriors permits a particularly pleasing interpreta
|
|
| 751 |
</figure>
|
| 752 |
|
| 753 |
Because the recorded behavior is teleoperated, measurements mostly distribute along the line $`a = o + \eta, \eta \sim N(0,1)`$, with $`\eta`$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $`\mathcal O\times \mathcal A`$), whereby information *diffuses away* from the samples, and comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion. Under the only assumption that the likelihood of the diffused samples is low under the original unknown data distribution, then one can effectively approximate the unknown distribution by learning to *reverse* such displacement. This key intuition allows one to write a simplified training objective:
|
|
|
|
|
|
|
| 754 |
``` math
|
| 755 |
\begin{align}
|
| 756 |
|
|
@@ -764,6 +792,8 @@ Because the recorded behavior is teleoperated, measurements mostly distribute al
|
|
| 764 |
In this simplified (minimization) objective, the optimization process differs from <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $`p_\theta`$ directly, the parameters $`\theta`$ of the pairwise likelihood $`p_\theta(z_{t-1} \vert z_t)`$ are adjusted to *predict the total displacement* $`\epsilon`$ for a randomly long ($`t \sim \mathcal{U}(\{1,\dots,T\})`$) diffusion process starting from a sample of the target distribution.
|
| 765 |
|
| 766 |
By learning the total displacement from a generally uninformative, corrupted sample obtained diffusing information and a sample from an unknown distribution--significant ($`\Vert \epsilon \Vert > 0`$) whenever input and target distribution are sufficiently different-- @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution reversing the displacement, *denoising* samples. Interestingly, under the hypothesis real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-point manifold (such as perturbed, uninformative samples), and the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $`p_\theta (z_{t-1} \vert z_t)`$ is Gaussian, then sampling $`z_{t-1} \sim p_\theta(\bullet \vert z_{t})`$ corresponds to computing
|
|
|
|
|
|
|
| 767 |
``` math
|
| 768 |
\begin{align}
|
| 769 |
z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}),
|
|
@@ -804,6 +834,8 @@ While the noising schedule of DMs results in a stochastic process that resembles
|
|
| 804 |
</figure>
|
| 805 |
|
| 806 |
In practice, FM can be applied to generative modeling by learning a vector field regressor $`v_\theta(z, t)`$ to approximate a given target vector field $`u(t, z)`$. In the particular case of DMs, $`u(t, z)`$ is defined as in <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $`z_1 \sim p_1`$ and a sample from an easy-to-sample prior $`z_0 \sim p_0`$, CFM defines a simple path between them using *linear interpolation* between samples $`z_t = (1-t)z_0 + t z_1`$, resulting in the target vector field $`u(t, z_t) = z_1 - z_0`$. Then, a FM model can be trained with the simple regression objective defined as
|
|
|
|
|
|
|
| 807 |
``` math
|
| 808 |
\begin{align}
|
| 809 |
|
|
@@ -820,6 +852,8 @@ While GMs prove useful in learning complex, high-dimensional multi-modal distrib
|
|
| 820 |
On the robot learning side of their contributions, @zhaoLearningFineGrainedBimanual2023 adopt transformers as the architectural backbone to learn a *Conditional* VAE @sohnLearningStructuredOutput2015. Conditional VAEs are a variation of the more standard VAE formulation introducing a conditioning variable on sampling from the latent prior, allowing the modeling of *one-to-many* relationships between latent and data samples. Further, in stark contrast with previous work @florenceImplicitBehavioralCloning2022, @jannerPlanningDiffusionFlexible2022, @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $`p_\theta(o,a)`$ on observation and actions. While the *policy* distribution $`p_\theta(a \vert o)`$ can in principle be entirely described from its joint $`p_\theta(o,a)`$, it is often the case that the conditional distribution is intractable when using function approximators, as $`p_\theta(a \vert o) = \tfrac{p_\theta(o,a)}{\int_\mathcal Ap_\theta(o,a)}`$ and the integral in the denominator is typically intractable. Instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE @sohnLearningStructuredOutput2015 modeling the policy distribution directly $`p (a \vert o)`$.
|
| 821 |
|
| 822 |
In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in <a href="#ELBO" data-reference-type="ref" data-reference="ELBO">[ELBO]</a>, which is adapted to
|
|
|
|
|
|
|
| 823 |
``` math
|
| 824 |
\begin{align}
|
| 825 |
|
|
@@ -862,6 +896,8 @@ However, the authors claim using a deterministic procedure to derive $`z`$ may b
|
|
| 862 |
DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $`p(o,a)`$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $`p_\theta(o)`$ given $`p_\theta(o,a)`$, DP’s rationale for modeling the data distribution via $`p_\theta(a \vert o)`$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
|
| 863 |
|
| 864 |
In practice, conditioning on observation data is achieved conditioning the added noise regressor $`\epsilon_\theta`$ introduced in <a href="#diffusion-simplified-loss" data-reference-type="ref" data-reference="diffusion-simplified-loss">[diffusion-simplified-loss]</a> on a stack of $`T_o`$ observations, resulting in the *conditional* simplified diffusion objective
|
|
|
|
|
|
|
| 865 |
``` math
|
| 866 |
\begin{align}
|
| 867 |
\mathcal L(\theta) &= \mathbb{E}_{t, a_{t:t+H_a}, \epsilon} \big[
|
|
@@ -1033,6 +1069,8 @@ Concretely, $`\pi_0`$ is a unified transformer with two disjoint sets of weights
|
|
| 1033 |
\quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}`$ Note how *intra*-block directional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $`\mathbf{A}`$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
|
| 1034 |
|
| 1035 |
In $`\pi_0`$, both the VLM backbone and action expert are updated using a *flow matching* loss, and in particular are updated minimizing:
|
|
|
|
|
|
|
| 1036 |
``` math
|
| 1037 |
\begin{align}
|
| 1038 |
\mathcal{L}(\phi, \theta) &=
|
|
|
|
| 248 |
|
| 249 |
In the simplified case here considered (for which $`\boldsymbol{p} \equiv p`$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $`p^*`$ by solving analytically for $`q: p(q) = f_{\text{FK}}(q) = p^*`$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $`\Vert p(q) - p^* \Vert_2^2`$ is a natural candidate), yielding:
|
| 250 |
|
| 251 |
+
<span id="ik_problem" style="position: absolute;"></span>
|
| 252 |
+
|
| 253 |
``` math
|
| 254 |
\begin{align}
|
| 255 |
\min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .
|
|
|
|
| 264 |
However, IK--solving eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> for a feasible $`q`$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate to this problem providing a length-$`K`$ succession of goal poses $`\tau_K = [p^*_0, p^*_1, \dots p^*_K]`$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $`\tau_K`$ via IK can prove prohibitively expensive, as tracking would require $`K`$ resolutions of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a>. Let $`J(q)`$ denote the Jacobian matrix of (partial) derivatives of the FK-function $`f_\text{FK}: \mathcal Q \mapsto \mathcal P`$, such that $`J(q) = \frac{\partial f_{FK}(q)}{\partial q }`$. Then, one can apply the chain rule to any $`p(q) = f_{\text{FK}}(q)`$, deriving $`\dot p = J(q) \dot q`$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
|
| 265 |
|
| 266 |
Given a desired end-effector trajectory $`\dot {p}^*(t)`$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $`\dot q(t)`$ solving for joints’ *velocities* instead of *configurations*,
|
| 267 |
+
<span id="reg_ik_velocity" style="position: absolute;"></span>
|
| 268 |
+
|
| 269 |
``` math
|
| 270 |
\begin{align}
|
| 271 |
\dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2
|
|
|
|
| 394 |
\htmlId{trajectory_definition}{\tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),}
|
| 395 |
```
|
| 396 |
with per-step rewards defined as $`r_t = r (s_t, a_t, s_{t+1})`$ for ease of notation. Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
|
| 397 |
+
<span id="dynamics_markovian" style="position: absolute;"></span>
|
| 398 |
+
|
| 399 |
``` math
|
| 400 |
\begin{align}
|
| 401 |
\mathbb P(s_{t+1}\vert s_t, a_t, s_{t-1}, a_{t-1}, \dots s_0, a_0 ) &= \mathbb P (s_{t+1}\vert s_t, a_t) \\
|
|
|
|
| 412 |
G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
|
| 413 |
```
|
| 414 |
In that, agents seek to learn control strategies (*policies*, $`\pi_\theta`$) maximizing the expected return $`\mathbb E_{\tau \sim \pi_\theta} G(\tau)`$. For a given dynamics $`\mathcal D`$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $`\Pi`$, yielding the (maximization) target $`J : \Pi \mapsto \mathbb R`$
|
| 415 |
+
<span id="RL-j-function" style="position: absolute;"></span>
|
| 416 |
+
|
| 417 |
``` math
|
| 418 |
\begin{align}
|
| 419 |
J(\pi_\theta) &= \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)], \\
|
|
|
|
| 430 |
Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
|
| 431 |
```
|
| 432 |
Crucially, value functions are interrelated:
|
| 433 |
+
<span id="q-as-v" style="position: absolute;"></span>
|
| 434 |
+
|
| 435 |
``` math
|
| 436 |
\begin{align}
|
| 437 |
Q_\pi(s_t, a_t) &= \mathbb{E}_{s_{t+1}\sim \mathbb P(\bullet \vert s_t, a_t)} [r_t + \gamma V_\pi(s_{t+1})] \\
|
|
|
|
| 503 |
Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $`Q_K \approx Q^*`$ at each timestep. In fact, under certain assumptions on the MDP considered, $`Q_K \to Q^* \, \text{as } K \to \infty`$.
|
| 504 |
|
| 505 |
Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning was found complicated to scale to large $`\mathcal S\times \mathcal A`$ problems, in which the storing of $`Q : \mathcal S\times \mathcal A\mapsto \mathbb R`$ alone might prove prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating for large and even unstructured *state* spaces. DQN parametrizes the Q-function using a neural network with parameters $`\theta`$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $`\delta_i`$):
|
| 506 |
+
<span id="dqn-loss" style="position: absolute;"></span>
|
| 507 |
+
|
| 508 |
``` math
|
| 509 |
\begin{align}
|
| 510 |
\mathcal L(\theta_i) &= \mathbb E_{(s_t, a_t) \sim \chi(\bullet)}
|
|
|
|
| 527 |
Similarly to DQN, DDPG also employs the same replay buffer mechanism, to reuse past transitions over training for increased sample efficiency and estimate the loss function via MC-estimates.
|
| 528 |
|
| 529 |
Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with <span class="highlight">maximizing the discounted cumulative reward, while acting as randomly as possible</span>. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. In that, MaxEnt revisits the RL objective $`J (\pi)`$ to specifically account for the policy entropy,
|
| 530 |
+
<span id="J-soft" style="position: absolute;"></span>
|
| 531 |
+
|
| 532 |
``` math
|
| 533 |
\begin{align}
|
| 534 |
J(\pi) &= \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]
|
|
|
|
| 657 |
</figure>
|
| 658 |
|
| 659 |
Given a dataset $`\mathcal D`$ consisting of $`N`$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $`\theta`$ (in Bayesian terms, the *evidence* $`p_\theta(\mathcal D)`$) can thus be written as:
|
| 660 |
+
<span id="evidence-definition-1" style="position: absolute;"></span>
|
| 661 |
+
|
| 662 |
``` math
|
| 663 |
\begin{align}
|
| 664 |
\log p_\theta(\mathcal D) &= \log \sum_{i=0}^N p_\theta ((o,a)_i) \\
|
|
|
|
| 674 |
In their seminal work on Variational Auto-Encoders (VAEs), @kingmaAutoEncodingVariationalBayes2022 present two major contributions to learn complex latent-variable GMs on unstructured data, proposing (1) a tractable, variational lower-bound to <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> as an optimization target to jointly learn likelihood and posterior and (2) high-capacity function approximators to model the likelihood $`p_\theta(o,a\vert z)`$ and (approximate) posterior distribution $`q_\phi(z \vert o,a) \approx p_\theta(z \vert o,a)`$.
|
| 675 |
|
| 676 |
In particular, the lower bound on <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> (Evidence LOwer Bound, *ELBO*) can be derived from <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> applying Jensen’s inequality--$`\log \mathbb{E}[\bullet] \geq \mathbb{E} [\log (\bullet)]`$--yielding:
|
| 677 |
+
<span id="ELBO-intractable" style="position: absolute;"></span>
|
| 678 |
+
|
| 679 |
``` math
|
| 680 |
\begin{align}
|
| 681 |
\log p_\theta(\mathcal D) &\geq \sum_{i=0}^{N} \left(
|
|
|
|
| 689 |
\end{align}
|
| 690 |
```
|
| 691 |
The true, generally intractable posterior $`p_\theta (z \vert o,a)`$ prevents computing both the expectation and KL divergence terms in <a href="#ELBO-intractable" data-reference-type="ref" data-reference="ELBO-intractable">[ELBO-intractable]</a>, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $`q_\phi(z \vert o,a)`$, resulting in the final, tractable ELBO objective,
|
| 692 |
+
<span id="ELBO" style="position: absolute;"></span>
|
| 693 |
+
|
| 694 |
``` math
|
| 695 |
\begin{align}
|
| 696 |
\text{ELBO}_{\mathcal D}(\theta, \phi) = \sum_{i=0}^{N} \left(
|
|
|
|
| 704 |
|
| 705 |
An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits a particularly interpretable factorization
|
| 706 |
|
| 707 |
+
<span id="VAE-min-neg-ELBO" style="position: absolute;"></span>
|
| 708 |
+
|
| 709 |
``` math
|
| 710 |
\begin{align}
|
| 711 |
\min_{\theta, \phi} - \text{ELBO}_{(o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi) \\
|
|
|
|
| 727 |
#### Diffusion Models
|
| 728 |
|
| 729 |
VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $`p_\theta`$ and (approximate) posterior $`q_\phi`$ for such model. In that, the unknown data distribution $`p(o,a)`$ is effectively approximated via $`\int_Z p(z) p_\theta(o,a \vert z)`$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $`p(o,a)`$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $`o,a`$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#ch4-many-latents" data-reference-type="ref" data-reference="ch4-many-latents">[ch4-many-latents]</a>), resulting in
|
| 730 |
+
<span id="BC-multi-latent-model-1" style="position: absolute;"></span>
|
| 731 |
+
|
| 732 |
``` math
|
| 733 |
\begin{align}
|
| 734 |
p(\underbrace{o,a}_{= z_0}) &= \int_{\text{supp}({Z_0})} \int_{\text{supp}({Z_1})} \ldots \int_{\text{supp}({Z_T})} p(z_0, z_1, \dots z_T) \\
|
|
|
|
| 748 |
Just like VAEs, DMs attempt to learn to reproduce an underlying data distribution $`p (o,a)`$ given a collection of i.i.d. samples approximating the model posited to have generated the data in the first place ( <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>). Similarly to VAEs, DMs approximate the process of sampling from the unknown $`p(o,a)`$ (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution. However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $`p(o,a)`$. Crucially, as no information from the sample $`(o,a)`$ (denoted as $`z_0 \equiv (o,a)`$ for the sake of notation) is assumed to be propagated throughout the chain of latents, the posterior $`q(z_t \vert z_{t-1})`$ assumes a relatively amicable structure in DMs, reducing complexity. The *true* likelihood $`p(z_{t-1} \vert z_t)`$ is instead typically approximated using the parametrization $`p_\theta (z_{t-1} \vert z_t)`$. In that, the information contained in the unknown data distribution is *reconstructed* via a process in which samples from a fixed distribution are turned into (ideally) high-likelihood samples under $`p(o,a)`$--a process referred to as *denoising*.
|
| 749 |
|
| 750 |
Under such model, we can express the log-likelihood of an arbitrary sample as[^4]
|
| 751 |
+
<span id="diffusion-likelihood" style="position: absolute;"></span>
|
| 752 |
+
|
| 753 |
``` math
|
| 754 |
\begin{align}
|
| 755 |
\log p_\theta (\underbrace{o,a}_{= z_0}) =
|
|
|
|
| 777 |
</figure>
|
| 778 |
|
| 779 |
Because the recorded behavior is teleoperated, measurements mostly distribute along the line $`a = o + \eta, \eta \sim N(0,1)`$, with $`\eta`$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $`\mathcal O\times \mathcal A`$), whereby information *diffuses away* from the samples, and comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion. Under the only assumption that the likelihood of the diffused samples is low under the original unknown data distribution, then one can effectively approximate the unknown distribution by learning to *reverse* such displacement. This key intuition allows to write a simplified training objective:
|
| 780 |
+
<span id="diffusion-simplified-loss" style="position: absolute;"></span>
|
| 781 |
+
|
| 782 |
``` math
|
| 783 |
\begin{align}
|
| 784 |
|
|
|
|
| 792 |
In this simplified (minimization) objective, the optimization process differs from <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $`p_\theta`$ directly, the parameters $`\theta`$ of the pairwise likelihood $`p_\theta(z_{t-1} \vert z_t)`$ are adjusted to *predict the total displacement* $`\epsilon`$ for a randomly long ($`t \sim \mathcal{U}(\{1,\dots,T\})`$) diffusion process starting from a sample of the target distribution.
|
| 793 |
|
| 794 |
By learning the total displacement from a generally uninformative corrupted sample obtained diffusing information and a sample from an unknown distribution--significant ($`\Vert \epsilon \Vert > 0`$) whenever input and target distribution are sufficiently different-- @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution reversing the displacement, *denoising* samples. Interestingly, under the hypothesis real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-point manifold (such as perturbed, uninformative samples), and the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $`p_\theta (z_{t-1} \vert z_t)`$ is Gaussian, then sampling $`z_{t-1} \sim p_\theta(\bullet \vert z_{t})`$ corresponds to computing
|
| 795 |
+
<span id="diffusion-denoising-definition" style="position: absolute;"></span>
|
| 796 |
+
|
| 797 |
``` math
|
| 798 |
\begin{align}
|
| 799 |
z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}),
|
|
|
|
| 834 |
</figure>
|
| 835 |
|
| 836 |
In practice, FM can be applied to generative modeling by learning a vector field regressor $`v_\theta(z, t)`$ to approximate a given target vector field $`u(t, z)`$. In the particular case of DMs, $`u(t, z)`$ is defined as in <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $`z_1 \sim p_1`$ and a sample from an easy-to-sample prior $`z_0 \sim p_0`$, CFM defines a simple path between them using *linear interpolation* between samples $`z_t = (1-t)z_0 + t z_1`$, resulting in the target vector field $`u(t, z_t) = z_1 - z_0`$. Then, a FM model can be trained with the simple regression objective defined as
|
| 837 |
+
<span id="flow-matching-objective" style="position: absolute;"></span>
|
| 838 |
+
|
| 839 |
``` math
|
| 840 |
\begin{align}
|
| 841 |
|
|
|
|
| 852 |
On the robot learning side of their contributions, @zhaoLearningFineGrainedBimanual2023 adopt transformers as the architectural backbone to learn a *Conditional* VAE @sohnLearningStructuredOutput2015. Conditional VAEs are a variation of the more standard VAE formulation introducing a conditioning variable on sampling from the latent prior, allowing the modeling of *one-to-many* relationships between latent and data samples. Further, in stark contrast with previous work @florenceImplicitBehavioralCloning2022, @jannerPlanningDiffusionFlexible2022, @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $`p_\theta(o,a)`$ on observation and actions. While the *policy* distribution $`p_\theta(a \vert o)`$ can in principle be entirely described from its joint $`p_\theta(o,a)`$, it is often the case that the conditional distribution is intractable when using function approximators, as $`p_\theta(a \vert o) = \tfrac{p_\theta(o,a)}{\int_\mathcal Ap_\theta(o,a)}`$ and the integral in the denominator is typically intractable. Instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE @sohnLearningStructuredOutput2015 modeling the policy distribution directly $`p (a \vert o)`$.
|
| 853 |
|
| 854 |
In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in <a href="#ELBO" data-reference-type="ref" data-reference="ELBO">[ELBO]</a>, which is adapted to
|
| 855 |
+
<span id="c-ELBO" style="position: absolute;"></span>
|
| 856 |
+
|
| 857 |
``` math
|
| 858 |
\begin{align}
|
| 859 |
|
|
|
|
| 896 |
DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $`p(o,a)`$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $`p_\theta(o)`$ given $`p_\theta(o,a)`$, DP’s rationale for modeling the data distribution via $`p_\theta(a \vert o)`$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
|
| 897 |
|
| 898 |
In practice, conditioning on observation data is achieved by conditioning the added noise regressor $`\epsilon_\theta`$ introduced in <a href="#diffusion-simplified-loss" data-reference-type="ref" data-reference="diffusion-simplified-loss">[diffusion-simplified-loss]</a> on a stack of $`T_o`$ observations, resulting in the *conditional* simplified diffusion objective
|
| 899 |
+
<span id="diffusion-policy-objective" style="position: absolute;"></span>
|
| 900 |
+
|
| 901 |
``` math
|
| 902 |
\begin{align}
|
| 903 |
\mathcal L(\theta) &= \mathbb{E}_{t, a_{t:t+H_a}, \epsilon} \big[
|
|
|
|
| 1069 |
\quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}`$ Note how *intra*-block directional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $`\mathbf{A}`$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
|
| 1070 |
|
| 1071 |
In $`\pi_0`$, both the VLM backbone and action expert are updated using a *flow matching* loss, and in particular are updated minimizing:
|
| 1072 |
+
<span id="pi0-loss" style="position: absolute;"></span>
|
| 1073 |
+
|
| 1074 |
``` math
|
| 1075 |
\begin{align}
|
| 1076 |
\mathcal{L}(\phi, \theta) &=
|
app/scripts/latex-to-mdx/output/main.mdx
CHANGED
|
@@ -347,6 +347,9 @@ Deriving the end-effector’s *pose*--position *and* orientation--in some $m$-di
|
|
| 347 |
|
| 348 |
In the simplified case here considered (for which $\boldsymbol{p} \equiv p$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $p^*$ by solving analytically for $q: p(q) = f_{\text{FK}}(q) = p^*$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $\Vert p(q) - p^* \Vert_2^2$ is a natural candidate), yielding:
|
| 349 |
|
|
|
|
|
|
|
|
|
|
| 350 |
``` math
|
| 351 |
\begin{align}
|
| 352 |
\min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .
|
|
@@ -361,6 +364,9 @@ For instance, the robot in Figure <a href="#planar-manipulator-floor" data-refe
|
|
| 361 |
However, IK--solving eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> for a feasible $q$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate to this problem providing a length-$K$ succession of goal poses $\tau_K = [p^*_0, p^*_1, \dots p^*_K]$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $\tau_K$ via IK can prove prohibitively expensive, as tracking would require $K$ resolutions of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a>. Let $J(q)$ denote the Jacobian matrix of (partial) derivatives of the FK-function $f_\text{FK}: \mathcal Q \mapsto \mathcal P$, such that $J(q) = \frac{\partial f_{FK}(q)}{\partial q }$. Then, one can apply the chain rule to any $p(q) = f_{\text{FK}}(q)$, deriving $\dot p = J(q) \dot q$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
|
| 362 |
|
| 363 |
Given a desired end-effector trajectory $\dot {p}^*(t)$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $\dot q(t)$ solving for joints’ *velocities* instead of *configurations*,
|
|
|
|
|
|
|
|
|
|
| 364 |
``` math
|
| 365 |
\begin{align}
|
| 366 |
\dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2
|
|
@@ -520,8 +526,9 @@ A length-$T$ *trajectory* is the (random) sequence
|
|
| 520 |
``` math
|
| 521 |
\htmlId{trajectory_definition}{\tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),}
|
| 522 |
```
|
| 523 |
-
|
| 524 |
with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notation. Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
|
|
|
|
|
|
|
| 525 |
|
| 526 |
``` math
|
| 527 |
\begin{align}
|
|
@@ -529,17 +536,21 @@ with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notat
|
|
| 529 |
\mathbb P(a_t\vert s_t, a_{t-1}, s_{t-1}, s_0, a_0) &= \mathbb P(a_t\vert s_t)
|
| 530 |
\end{align}
|
| 531 |
```
|
|
|
|
| 532 |
The probability of observing a given trajectory $\tau$ factorizes into
|
|
|
|
| 533 |
``` math
|
| 534 |
\htmlId{traj_prob}{\mathbb P(\tau) = \mathbb P (s_0) \prod_{t=0}^{T-1} \mathbb P (s_{t+1}\vert s_t, a_t)\ \mathbb P(a_t\vert s_t).}
|
| 535 |
```
|
| 536 |
|
| 537 |
Policies $\mathbb P(a_t\vert s_t)$ are typically indicated as $\pi(a_t\vert s_t)$, and often parametrized via $\theta$, yielding $\pi_\theta (a_t\vert s_t)$. Policies are trained optimizing the (discounted) *return* associated to a given $\tau$, i.e. the (random) sum of measured rewards over trajectory:
|
| 538 |
-
|
| 539 |
``` math
|
| 540 |
G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
|
| 541 |
```
|
| 542 |
In that, agents seek to learn control strategies (*policies*, $\pi_\theta$) maximizing the expected return $\mathbb E_{\tau \sim \pi_\theta} G(\tau)$. For a given dynamics $\mathcal D$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $\Pi$, yielding the (maximization) target $J : \Pi \mapsto \mathbb R$
|
|
|
|
|
|
|
|
|
|
| 543 |
``` math
|
| 544 |
\begin{align}
|
| 545 |
J(\pi_\theta) &= \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)], \\
|
|
@@ -556,8 +567,9 @@ can be used to discriminate between desirable and undesirable state in terms of
|
|
| 556 |
``` math
|
| 557 |
Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
|
| 558 |
```
|
| 559 |
-
|
| 560 |
Crucially, value functions are interrelated:
|
|
|
|
|
|
|
| 561 |
|
| 562 |
``` math
|
| 563 |
\begin{align}
|
|
@@ -648,6 +660,9 @@ Q_{i+1}(s_t, a_t) \leftarrow \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t
|
|
| 648 |
Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $Q_K \approx Q^*$ at each timestep. In fact, under certain assumptions on the MDP considered, $Q_K \to Q^* \, \text{as } K \to \infty$.
|
| 649 |
|
| 650 |
Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning was found complicated to scale to large $\mathcal S\times \mathcal A$ problems, in which the storing of $Q : \mathcal S\times \mathcal A\mapsto \mathbb R$ alone might prove prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating for large and even unstructured *state* spaces. DQN parametrizes the Q-function using a neural network with parameters $\theta$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $\delta_i$):
|
|
|
|
|
|
|
|
|
|
| 651 |
``` math
|
| 652 |
\begin{align}
|
| 653 |
\mathcal L(\theta_i) &= \mathbb E_{(s_t, a_t) \sim \chi(\bullet)}
|
|
@@ -672,6 +687,9 @@ Provably, <a href="#deterministic-pg" data-reference-type="ref" data-reference="
|
|
| 672 |
Similarly to DQN, DDPG also employs the same replay buffer mechanism, to reuse past transitions over training for increased sample efficiency and estimate the loss function via MC-estimates.
|
| 673 |
|
| 674 |
Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with <span class="highlight">maximizing the discounted cumulative reward, while acting as randomly as possible</span>. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. In that, MaxEnt revisits the RL objective $J (\pi)$ to specifically account for the policy entropy,
|
|
|
|
|
|
|
|
|
|
| 675 |
``` math
|
| 676 |
\begin{align}
|
| 677 |
J(\pi) &= \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]
|
|
@@ -835,6 +853,9 @@ Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics app
|
|
| 835 |
</figure>
|
| 836 |
|
| 837 |
Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can thus be written as:
|
|
|
|
|
|
|
|
|
|
| 838 |
``` math
|
| 839 |
\begin{align}
|
| 840 |
\log p_\theta(\mathcal D) &= \log \sum_{i=0}^N p_\theta ((o,a)_i) \\
|
|
@@ -850,6 +871,9 @@ In the special case where one assumes distributions to be tractable, $p_\theta (
|
|
| 850 |
In their seminal work on Variational Auto-Encoders (VAEs), @kingmaAutoEncodingVariationalBayes2022 present two major contributions to learn complex latent-variable GMs on unstructured data, proposing (1) a tractable, variational lower-bound to <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> as an optimization target to jointly learn likelihood and posterior and (2) high-capacity function approximators to model the likelihood $p_\theta(o,a\vert z)$ and (approximate) posterior distribution $q_\phi(z \vert o,a) \approx q_\theta(z \vert o,a)$.
|
| 851 |
|
| 852 |
In particular, the lower bound on <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> (Evidence LOwer Bound, *ELBO*) can be derived from <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> applying Jensen’s inequality--$\log \mathbb{E}[\bullet] \geq \mathbb{E} [\log (\bullet)]$--yielding:
|
|
|
|
|
|
|
|
|
|
| 853 |
``` math
|
| 854 |
\begin{align}
|
| 855 |
\log p_\theta(\mathcal D) &\geq \sum_{i=0}^{N} \left(
|
|
@@ -862,8 +886,9 @@ In particular, the lower bound on <a href="#evidence-definition" data-reference
|
|
| 862 |
\right)
|
| 863 |
\end{align}
|
| 864 |
```
|
| 865 |
-
|
| 866 |
The true, generally intractable posterior $p_\theta (z \vert o,a)$ prevents computing both the expectation and KL divergence terms in <a href="#ELBO-intractable" data-reference-type="ref" data-reference="ELBO-intractable">[ELBO-intractable]</a>, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $q_\phi(z \vert o,a)$, resulting in the final, tractable ELBO objective,
|
|
|
|
|
|
|
| 867 |
|
| 868 |
``` math
|
| 869 |
\begin{align}
|
|
@@ -878,6 +903,9 @@ From Jensen’s inequality, maximizing ELBO results in maximizing the log-likeli
|
|
| 878 |
|
| 879 |
An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits a particularly interpretable factorization
|
| 880 |
|
|
|
|
|
|
|
|
|
|
| 881 |
``` math
|
| 882 |
\begin{align}
|
| 883 |
\min_{\theta, \phi} - \text{ELBO}_{\mathcal (o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi) \\
|
|
@@ -900,6 +928,9 @@ Indeed, it is very common in practice to approximate from the learned likelihood
|
|
| 900 |
#### Diffusion Models
|
| 901 |
|
| 902 |
VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $p_\theta$ and (approximate) posterior $q_\phi$ for such model. In that, the unknown data distribution $p(o,a)$ is effectively approximated via $\int_Z p(z) p_\theta(o,a \vert z)$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $p(o,a)$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $o,a$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#ch4-many-latents" data-reference-type="ref" data-reference="ch4-many-latents">[ch4-many-latents]</a>), resulting in
|
|
|
|
|
|
|
|
|
|
| 903 |
``` math
|
| 904 |
\begin{align}
|
| 905 |
p(\underbrace{o,a}_{= z_0}) &= \int_{\text{supp}({Z_0})} \int_{\text{supp}({Z_1})} \ldots \int_{\text{supp}({Z_T})} p(z_0, z_1, \dots z_T) \\
|
|
@@ -925,6 +956,9 @@ Similarily to VAEs, providing an exact interpretation for the latent variables i
|
|
| 925 |
Just like VAEs, DMs attempt to learn to reproduce an underlying data distribution $p (o,a)$ given a collection of i.i.d. samples approximating the model posited to have generated the data in the first place ( <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>). Similarly to VAEs, DMs approximate the process of sampling from the unknown $p(o,a)$ by (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution. However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $p(o,a)$. Crucially, as no information from the sample $(o,a)$ (denoted as $z_0 \equiv (o,a)$ for the sake of notation) is assumed to be propagated throughout the chain of latents, the posterior $q(z_t \vert z_{t-1})$ assumes a relatively amicable structure in DMs, reducing complexity. The *true* likelihood $p(z_{t-1} \vert z_t)$ is instead typically approximated using the parametrization $p_\theta (z_{t-1} \vert z_t)$. In that, the information contained in the unknown data distribution is *reconstructed* via a process in which samples from a fixed distribution are turned into (ideally) high-likelihood samples under $p(o,a)$--a process referred to as *denoising*.
|
| 926 |
|
| 927 |
Under such model, we can express the log-likelihood of an arbitrary sample as[^4]
|
|
|
|
|
|
|
|
|
|
| 928 |
``` math
|
| 929 |
\begin{align}
|
| 930 |
\log p_\theta (\underbrace{o,a}_{= z_0}) =
|
|
@@ -964,6 +998,9 @@ Finally, adopting Gaussian posteriors permits a particularly pleasing interpreta
|
|
| 964 |
</figure>
|
| 965 |
|
| 966 |
Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples, and comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion. Under the only assumption that the likelihood of the diffused samples is low under the original unknown data distribution, then one can effectively approximate the unknown distribution by learning to *reverse* such displacement. This key intuition allows us to write a simplified training objective:
|
|
|
|
|
|
|
|
|
|
| 967 |
``` math
|
| 968 |
\begin{align}
|
| 969 |
|
|
@@ -977,6 +1014,9 @@ Because the recorded behavior is teleoperated, measurements mostly distribute al
|
|
| 977 |
In this simplified (minimization) objective, the optimization process differs from <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
|
| 978 |
|
| 979 |
By learning the total displacement from a generally uninformative corrupted sample obtained diffusing information and a sample from an unknown distribution--significant ($\Vert \epsilon \Vert > 0$) whenever input and target distribution are sufficiently different-- @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution reversing the displacement, *denoising* samples. Interestingly, under the hypothesis real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function between any off-manifold point (such as perturbed, uninformative samples) and the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $p_\theta (z_{t-1} \vert z_t)$ is Gaussian, then sampling $z_{t-1} \sim p_\theta(\bullet \vert z_{t})$ corresponds to computing
|
|
|
|
|
|
|
|
|
|
| 980 |
``` math
|
| 981 |
\begin{align}
|
| 982 |
z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}),
|
|
@@ -1030,6 +1070,9 @@ While the noising schedule of DMs results in a stochastic process that resembles
|
|
| 1030 |
</figure>
|
| 1031 |
|
| 1032 |
In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, CFM defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, resulting in the target vector field $u(t, z_t) = z_1 - z_0$. Then, a FM model can be trained with the simple regression objective defined as
|
|
|
|
|
|
|
|
|
|
| 1033 |
``` math
|
| 1034 |
\begin{align}
|
| 1035 |
|
|
@@ -1046,6 +1089,9 @@ While GMs prove useful in learning complex, high-dimensional multi-modal distrib
|
|
| 1046 |
On the robot learning side of their contributions, @zhaoLearningFineGrainedBimanual2023 adopt transformers as the architectural backbone to learn a *Conditional* VAE @sohnLearningStructuredOutput2015. Conditional VAEs are a variation of the more standard VAE formulation introducing a conditioning variable on sampling from the latent prior, allowing the modeling of *one-to-many* relationships between latent and data samples. Further, in stark contrast with previous work @florenceImplicitBehavioralCloning2022, @jannerPlanningDiffusionFlexible2022, @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $p_\theta(o,a)$ on observation and actions. While the *policy* distribution $p_\theta(a \vert o)$ can in principle be entirely described from its joint $p_\theta(o,a)$, it is often the case that the conditional distribution is intractable when using function approximators, as $p_\theta(a \vert o) = \tfrac{p_\theta(o,a)}{\int_\mathcal Ap_\theta(o,a)}$ and the integral in the denominator is typically intractable. Instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE @sohnLearningStructuredOutput2015 modeling the policy distribution directly $p (a \vert o)$.
|
| 1047 |
|
| 1048 |
In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in <a href="#ELBO" data-reference-type="ref" data-reference="ELBO">[ELBO]</a>, which is adapted to
|
|
|
|
|
|
|
|
|
|
| 1049 |
``` math
|
| 1050 |
\begin{align}
|
| 1051 |
|
|
@@ -1106,6 +1152,9 @@ However, the authors claim using a deterministic procedure to derive $z$ may ben
|
|
| 1106 |
DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $p(o,a)$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $p_\theta(o)$ given $p_\theta(o,a)$, DP’s rationale for modeling the data distribution via $p_\theta(a \vert o)$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
|
| 1107 |
|
| 1108 |
In practice, conditioning on observation data is achieved conditioning the added noise regressor $\epsilon_\theta$ introduced in <a href="#diffusion-simplified-loss" data-reference-type="ref" data-reference="diffusion-simplified-loss">[diffusion-simplified-loss]</a> on a stack of $T_o$ observations, resulting in the *conditional* simplified diffusion objective
|
|
|
|
|
|
|
|
|
|
| 1109 |
``` math
|
| 1110 |
\begin{align}
|
| 1111 |
\mathcal L(\theta) &= \mathbb{E}_{t, a_{t:t+H_a}, \epsilon} \big[
|
|
@@ -1306,6 +1355,9 @@ $\pi_0$ @blackp0VisionLanguageActionFlow2024 introduce a VLA consisting of a Mo
|
|
| 1306 |
Concretely, $\pi_0$ is a unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $p_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple camera viewpoints $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process the robot proprioperceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turning them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioperceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while across blocks, future blocks are masked out. 
Formally, this corresponds to using the attention mask $\mathbf{A} = \bordermatrix{ \mathcal{T}_i \mathcal{T}_q \mathcal{T}_a \cr \mathcal{T}_i \mathbf{1} \mathbf{0} \mathbf{0} \cr \mathcal{T}_q \mathbf{1} \mathbf{1} \mathbf{0} \cr \mathcal{T}_a \mathbf{1} \mathbf{1} \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$ Note how *intra*-block directional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioperceptive and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
|
| 1307 |
|
| 1308 |
In $\pi_0$, both the VLM backbone and action expert are updated using a *flow matching* loss, and in particular are updated minimizing:
|
|
|
|
|
|
|
|
|
|
| 1309 |
``` math
|
| 1310 |
\begin{align}
|
| 1311 |
\mathcal{L}(\phi, \theta) &=
|
|
|
|
| 347 |
|
| 348 |
In the simplified case here considered (for which $\boldsymbol{p} \equiv p$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $p^*$ by solving analytically for $q: p(q) = f_{\text{FK}}(q) = p^*$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $\Vert p(q) - p^* \Vert_2^2$ is a natural candidate), yielding:
|
| 349 |
|
| 350 |
+
<span id="ik_problem" style="position: absolute;">
|
| 351 |
+
</span>
|
| 352 |
+
|
| 353 |
``` math
|
| 354 |
\begin{align}
|
| 355 |
\min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .
|
|
|
|
| 364 |
However, IK--solving eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> for a feasible $q$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate this problem by providing a length-$K$ succession of goal poses $\tau_K = [p^*_0, p^*_1, \dots p^*_K]$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $\tau_K$ via IK can prove prohibitively expensive, as tracking would require $K$ resolutions of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a>. Let $J(q)$ denote the Jacobian matrix of (partial) derivatives of the FK-function $f_\text{FK}: \mathcal Q \mapsto \mathcal P$, such that $J(q) = \frac{\partial f_{FK}(q)}{\partial q }$. Then, one can apply the chain rule to any $p(q) = f_{\text{FK}}(q)$, deriving $\dot p = J(q) \dot q$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
|
| 365 |
|
| 366 |
Given a desired end-effector trajectory $\dot {p}^*(t)$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $\dot q(t)$ solving for joints’ *velocities* instead of *configurations*,
|
| 367 |
+
<span id="reg_ik_velocity" style="position: absolute;">
|
| 368 |
+
</span>
|
| 369 |
+
|
| 370 |
``` math
|
| 371 |
\begin{align}
|
| 372 |
\dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2
|
|
|
|
| 526 |
``` math
|
| 527 |
\htmlId{trajectory_definition}{\tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),}
|
| 528 |
```
|
|
|
|
| 529 |
with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notation. Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
|
| 530 |
+
<span id="dynamics_markovian" style="position: absolute;">
|
| 531 |
+
</span>
|
| 532 |
|
| 533 |
``` math
|
| 534 |
\begin{align}
|
|
|
|
| 536 |
\mathbb P(a_t\vert s_t, a_{t-1}, s_{t-1}, s_0, a_0) &= \mathbb P(a_t\vert s_t)
|
| 537 |
\end{align}
|
| 538 |
```
|
| 539 |
+
|
| 540 |
The probability of observing a given trajectory $\tau$ factorizes into
|
| 541 |
+
|
| 542 |
``` math
|
| 543 |
\htmlId{traj_prob}{\mathbb P(\tau) = \mathbb P (s_0) \prod_{t=0}^{T-1} \mathbb P (s_{t+1}\vert s_t, a_t)\ \mathbb P(a_t\vert s_t).}
|
| 544 |
```
|
| 545 |
|
| 546 |
Policies $\mathbb P(a_t\vert s_t)$ are typically indicated as $\pi(a_t\vert s_t)$, and often parametrized via $\theta$, yielding $\pi_\theta (a_t\vert s_t)$. Policies are trained optimizing the (discounted) *return* associated to a given $\tau$, i.e. the (random) sum of measured rewards over trajectory:
|
|
|
|
| 547 |
``` math
|
| 548 |
G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
|
| 549 |
```
|
| 550 |
In that, agents seek to learn control strategies (*policies*, $\pi_\theta$) maximizing the expected return $\mathbb E_{\tau \sim \pi_\theta} G(\tau)$. For a given dynamics $\mathcal D$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $\Pi$, yielding the (maximization) target $J : \Pi \mapsto \mathbb R$
|
| 551 |
+
<span id="RL-j-function" style="position: absolute;">
|
| 552 |
+
</span>
|
| 553 |
+
|
| 554 |
``` math
|
| 555 |
\begin{align}
|
| 556 |
J(\pi_\theta) &= \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)], \\
|
|
|
|
| 567 |
``` math
|
| 568 |
Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
|
| 569 |
```
|
|
|
|
| 570 |
Crucially, value functions are interrelated:
|
| 571 |
+
<span id="q-as-v" style="position: absolute;">
|
| 572 |
+
</span>
|
| 573 |
|
| 574 |
``` math
|
| 575 |
\begin{align}
|
|
|
|
| 660 |
Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $Q_K \approx Q^*$ at each timestep. In fact, under certain assumptions on the MDP considered, $Q_K \to Q^* \, \text{as } K \to \infty$.
|
| 661 |
|
| 662 |
Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning proved difficult to scale to large $\mathcal S\times \mathcal A$ problems, in which the storing of $Q : \mathcal S\times \mathcal A\mapsto \mathbb R$ alone might prove prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating large and even unstructured *state* spaces. DQN parametrizes the Q-function using a neural network with parameters $\theta$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $\delta_i$):
|
| 663 |
+
<span id="dqn-loss" style="position: absolute;">
|
| 664 |
+
</span>
|
| 665 |
+
|
| 666 |
``` math
|
| 667 |
\begin{align}
|
| 668 |
\mathcal L(\theta_i) &= \mathbb E_{(s_t, a_t) \sim \chi(\bullet)}
|
|
|
|
| 687 |
Similarly to DQN, DDPG also employs the same replay buffer mechanism, to reuse past transitions over training for increased sample efficiency and estimate the loss function via MC-estimates.
|
| 688 |
|
| 689 |
Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with <span class="highlight">maximizing the discounted cumulative reward, while acting as randomly as possible</span>. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. In that, MaxEnt revisits the RL objective $J (\pi)$ to specifically account for the policy entropy,
|
| 690 |
+
<span id="J-soft" style="position: absolute;">
|
| 691 |
+
</span>
|
| 692 |
+
|
| 693 |
``` math
|
| 694 |
\begin{align}
|
| 695 |
J(\pi) &= \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]
|
|
|
|
| 853 |
</figure>
|
| 854 |
|
| 855 |
Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can thus be written as:
|
| 856 |
+
<span id="evidence-definition-1" style="position: absolute;">
|
| 857 |
+
</span>
|
| 858 |
+
|
| 859 |
``` math
|
| 860 |
\begin{align}
|
| 861 |
\log p_\theta(\mathcal D) &= \log \sum_{i=0}^N p_\theta ((o,a)_i) \\
|
|
|
|
| 871 |
In their seminal work on Variational Auto-Encoders (VAEs), @kingmaAutoEncodingVariationalBayes2022 present two major contributions to learn complex latent-variable GMs on unstructured data, proposing (1) a tractable, variational lower-bound to <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> as an optimization target to jointly learn likelihood and posterior and (2) high-capacity function approximators to model the likelihood $p_\theta(o,a\vert z)$ and (approximate) posterior distribution $q_\phi(z \vert o,a) \approx q_\theta(z \vert o,a)$.
|
| 872 |
|
| 873 |
In particular, the lower bound on <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> (Evidence LOwer Bound, *ELBO*) can be derived from <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> applying Jensen’s inequality--$\log \mathbb{E}[\bullet] \geq \mathbb{E} [\log (\bullet)]$--yielding:
|
| 874 |
+
<span id="ELBO-intractable" style="position: absolute;">
|
| 875 |
+
</span>
|
| 876 |
+
|
| 877 |
``` math
|
| 878 |
\begin{align}
|
| 879 |
\log p_\theta(\mathcal D) &\geq \sum_{i=0}^{N} \left(
|
|
|
|
| 886 |
\right)
|
| 887 |
\end{align}
|
| 888 |
```
|
|
|
|
| 889 |
The true, generally intractable posterior $p_\theta (z \vert o,a)$ prevents computing both the expectation and KL divergence terms in <a href="#ELBO-intractable" data-reference-type="ref" data-reference="ELBO-intractable">[ELBO-intractable]</a>, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $q_\phi(z \vert o,a)$, resulting in the final, tractable ELBO objective,
|
| 890 |
+
<span id="ELBO" style="position: absolute;">
|
| 891 |
+
</span>
|
| 892 |
|
| 893 |
``` math
|
| 894 |
\begin{align}
|
|
|
|
| 903 |
|
| 904 |
An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits a particularly interpretable factorization
|
| 905 |
|
| 906 |
+
<span id="VAE-min-neg-ELBO" style="position: absolute;">
|
| 907 |
+
</span>
|
| 908 |
+
|
| 909 |
``` math
|
| 910 |
\begin{align}
|
| 911 |
\min_{\theta, \phi} - \text{ELBO}_{\mathcal (o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi) \\
|
|
|
|
| 928 |
#### Diffusion Models
|
| 929 |
|
| 930 |
VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $p_\theta$ and (approximate) posterior $q_\phi$ for such model. In that, the unknown data distribution $p(o,a)$ is effectively approximated via $\int_Z p(z) p_\theta(o,a \vert z)$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $p(o,a)$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $o,a$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#ch4-many-latents" data-reference-type="ref" data-reference="ch4-many-latents">[ch4-many-latents]</a>), resulting in
|
| 931 |
+
<span id="BC-multi-latent-model-1" style="position: absolute;">
|
| 932 |
+
</span>
|
| 933 |
+
|
| 934 |
``` math
|
| 935 |
\begin{align}
|
| 936 |
p(\underbrace{o,a}_{= z_0}) &= \int_{\text{supp}({Z_0})} \int_{\text{supp}({Z_1})} \ldots \int_{\text{supp}({Z_T})} p(z_0, z_1, \dots z_T) \\
|
|
|
|
| 956 |
Just like VAEs, DMs attempt to learn to reproduce an underlying data distribution $p (o,a)$ given a collection of i.i.d. samples approximating the model posited to have generated the data in the first place ( <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>). Similarly to VAEs, DMs approximate the process of sampling from the unknown $p(o,a)$ by (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution. However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $p(o,a)$. Crucially, as no information from the sample $(o,a)$ (denoted as $z_0 \equiv (o,a)$ for the sake of notation) is assumed to be propagated throughout the chain of latents, the posterior $q(z_t \vert z_{t-1})$ assumes a relatively amicable structure in DMs, reducing complexity. The *true* likelihood $p(z_{t-1} \vert z_t)$ is instead typically approximated using the parametrization $p_\theta (z_{t-1} \vert z_t)$. In that, the information contained in the unknown data distribution is *reconstructed* via a process in which samples from a fixed distribution are turned into (ideally) high-likelihood samples under $p(o,a)$--a process referred to as *denoising*.
|
| 957 |
|
| 958 |
Under such model, we can express the log-likelihood of an arbitrary sample as[^4]
|
| 959 |
+
<span id="diffusion-likelihood" style="position: absolute;">
|
| 960 |
+
</span>
|
| 961 |
+
|
| 962 |
``` math
|
| 963 |
\begin{align}
|
| 964 |
\log p_\theta (\underbrace{o,a}_{= z_0}) =
|
|
|
|
| 998 |
</figure>
|
| 999 |
|
| 1000 |
Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples, and comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion. Under the only assumption that the likelihood of the diffused samples is low under the original unknown data distribution, then one can effectively approximate the unknown distribution by learning to *reverse* such displacement. This key intuition allows us to write a simplified training objective:
|
| 1001 |
+
<span id="diffusion-simplified-loss" style="position: absolute;">
|
| 1002 |
+
</span>
|
| 1003 |
+
|
| 1004 |
``` math
|
| 1005 |
\begin{align}
|
| 1006 |
|
|
|
|
| 1014 |
In this simplified (minimization) objective, the optimization process differs from <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
|
| 1015 |
|
| 1016 |
By learning the total displacement from a generally uninformative, corrupted sample obtained by diffusing information away from a sample of an unknown distribution--significant ($\Vert \epsilon \Vert > 0$) whenever input and target distribution are sufficiently different-- @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution by reversing the displacement, *denoising* samples. Interestingly, under the hypothesis that real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-manifold point (such as perturbed, uninformative samples) to the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $p_\theta (z_{t-1} \vert z_t)$ is Gaussian, then sampling $z_{t-1} \sim p_\theta(\bullet \vert z_{t})$ corresponds to computing
|
| 1017 |
+
<span id="diffusion-denoising-definition" style="position: absolute;">
|
| 1018 |
+
</span>
|
| 1019 |
+
|
| 1020 |
``` math
|
| 1021 |
\begin{align}
|
| 1022 |
z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}),
|
|
|
|
| 1070 |
</figure>
|
| 1071 |
|
| 1072 |
In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, CFM defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, resulting in the target vector field $u(t, z_t) = z_1 - z_0$. Then, an FM model can be trained with the simple regression objective defined as
|
| 1073 |
+
<span id="flow-matching-objective" style="position: absolute;">
|
| 1074 |
+
</span>
|
| 1075 |
+
|
| 1076 |
``` math
|
| 1077 |
\begin{align}
|
| 1078 |
|
|
|
|
| 1089 |
On the robot learning side of their contributions, @zhaoLearningFineGrainedBimanual2023 adopt transformers as the architectural backbone to learn a *Conditional* VAE @sohnLearningStructuredOutput2015. Conditional VAEs are a variation of the more standard VAE formulation introducing a conditioning variable on sampling from the latent prior, allowing the modeling of *one-to-many* relationships between latent and data samples. Further, in stark contrast with previous work @florenceImplicitBehavioralCloning2022, @jannerPlanningDiffusionFlexible2022, @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $p_\theta(o,a)$ on observation and actions. While the *policy* distribution $p_\theta(a \vert o)$ can in principle be entirely described from its joint $p_\theta(o,a)$, it is often the case that the conditional distribution is intractable when using function approximators, as $p_\theta(a \vert o) = \tfrac{p_\theta(o,a)}{\int_\mathcal Ap_\theta(o,a)}$ and the integral in the denominator is typically intractable. Instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE @sohnLearningStructuredOutput2015 modeling the policy distribution directly $p (a \vert o)$.
|
| 1090 |
|
| 1091 |
In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in <a href="#ELBO" data-reference-type="ref" data-reference="ELBO">[ELBO]</a>, which is adapted to
|
| 1092 |
+
<span id="c-ELBO" style="position: absolute;">
|
| 1093 |
+
</span>
|
| 1094 |
+
|
| 1095 |
``` math
|
| 1096 |
\begin{align}
|
| 1097 |
|
|
|
|
| 1152 |
DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $p(o,a)$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $p_\theta(o)$ given $p_\theta(o,a)$, DP’s rationale for modeling the data distribution via $p_\theta(a \vert o)$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
|
| 1153 |
|
| 1154 |
In practice, conditioning on observation data is achieved conditioning the added noise regressor $\epsilon_\theta$ introduced in <a href="#diffusion-simplified-loss" data-reference-type="ref" data-reference="diffusion-simplified-loss">[diffusion-simplified-loss]</a> on a stack of $T_o$ observations, resulting in the *conditional* simplified diffusion objective
|
| 1155 |
+
<span id="diffusion-policy-objective" style="position: absolute;">
|
| 1156 |
+
</span>
|
| 1157 |
+
|
| 1158 |
``` math
|
| 1159 |
\begin{align}
|
| 1160 |
\mathcal L(\theta) &= \mathbb{E}_{t, a_{t:t+H_a}, \epsilon} \big[
|
|
|
|
| 1355 |
Concretely, $\pi_0$ is a unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $p_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple camera viewpoints $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process the robot proprioceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turning them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while across blocks, future blocks are masked out.
Formally, this corresponds to using the attention mask $\mathbf{A} = \bordermatrix{ & \mathcal{T}_i & \mathcal{T}_q & \mathcal{T}_a \cr \mathcal{T}_i & \mathbf{1} & \mathbf{0} & \mathbf{0} \cr \mathcal{T}_q & \mathbf{1} & \mathbf{1} & \mathbf{0} \cr \mathcal{T}_a & \mathbf{1} & \mathbf{1} & \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$. Note how *intra*-block bidirectional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
|
| 1356 |
|
| 1357 |
In $\pi_0$, both the VLM backbone and action expert are updated using a *flow matching* loss, and in particular are updated by minimizing:
|
| 1358 |
+
<span id="pi0-loss" style="position: absolute;">
|
| 1359 |
+
</span>
|
| 1360 |
+
|
| 1361 |
``` math
|
| 1362 |
\begin{align}
|
| 1363 |
\mathcal{L}(\phi, \theta) &=
|
app/scripts/latex-to-mdx/post-processor.mjs
CHANGED
|
@@ -300,6 +300,29 @@ function fixLinkTextContent(content) {
|
|
| 300 |
return cleanedContent;
|
| 301 |
}
|
| 302 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
/**
|
| 304 |
* Main post-processing function that applies all cleanup steps
|
| 305 |
* @param {string} content - Raw Markdown content from Pandoc
|
|
@@ -315,6 +338,7 @@ export function postProcessMarkdown(content, inputDir = null) {
|
|
| 315 |
processedContent = removeTexGroupingCommands(processedContent);
|
| 316 |
processedContent = simplifyLatexDelimiters(processedContent);
|
| 317 |
processedContent = removeOrphanedLabels(processedContent);
|
|
|
|
| 318 |
processedContent = fixMathCommands(processedContent);
|
| 319 |
processedContent = fixMatrixCommands(processedContent);
|
| 320 |
processedContent = fixUnicodeIssues(processedContent);
|
|
|
|
| 300 |
return cleanedContent;
|
| 301 |
}
|
| 302 |
|
| 303 |
+
/**
|
| 304 |
+
* Convert align anchor markers to proper HTML spans outside math blocks
|
| 305 |
+
* @param {string} content - Markdown content
|
| 306 |
+
* @returns {string} - Content with converted anchor spans
|
| 307 |
+
*/
|
| 308 |
+
function convertAlignAnchors(content) {
|
| 309 |
+
console.log(' 🏷️ Converting align anchor markers to HTML spans...');
|
| 310 |
+
|
| 311 |
+
let convertedCount = 0;
|
| 312 |
+
|
| 313 |
+
// Find and replace align anchor markers with proper spans outside math blocks
|
| 314 |
+
content = content.replace(/``` math\n%%ALIGN_ANCHOR_ID\{([^}]+)\}%%\n([\s\S]*?)\n```/g, (match, anchorId, mathContent) => {
|
| 315 |
+
convertedCount++;
|
| 316 |
+
return `<span id="${anchorId}" style="position: absolute;"></span>\n\n\`\`\` math\n${mathContent}\n\`\`\``;
|
| 317 |
+
});
|
| 318 |
+
|
| 319 |
+
if (convertedCount > 0) {
|
| 320 |
+
console.log(` ✅ Converted ${convertedCount} align anchor marker(s) to spans`);
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
return content;
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
/**
|
| 327 |
* Main post-processing function that applies all cleanup steps
|
| 328 |
* @param {string} content - Raw Markdown content from Pandoc
|
|
|
|
| 338 |
processedContent = removeTexGroupingCommands(processedContent);
|
| 339 |
processedContent = simplifyLatexDelimiters(processedContent);
|
| 340 |
processedContent = removeOrphanedLabels(processedContent);
|
| 341 |
+
processedContent = convertAlignAnchors(processedContent);
|
| 342 |
processedContent = fixMathCommands(processedContent);
|
| 343 |
processedContent = fixMatrixCommands(processedContent);
|
| 344 |
processedContent = fixUnicodeIssues(processedContent);
|
app/src/components/Hero.astro
CHANGED
|
@@ -202,7 +202,7 @@ const pdfFilename = `${slugify(pdfBase)}.pdf`;
|
|
| 202 |
/* Hero (full-width) */
|
| 203 |
.hero {
|
| 204 |
width: 100%;
|
| 205 |
-
padding: 48px 16px
|
| 206 |
text-align: center;
|
| 207 |
}
|
| 208 |
.hero-title {
|
|
|
|
| 202 |
/* Hero (full-width) */
|
| 203 |
.hero {
|
| 204 |
width: 100%;
|
| 205 |
+
padding: 48px 16px 48px;
|
| 206 |
text-align: center;
|
| 207 |
}
|
| 208 |
.hero-title {
|
app/src/content/article.mdx
CHANGED
|
@@ -347,6 +347,9 @@ Deriving the end-effector’s *pose*--position *and* orientation--in some $m$-di
|
|
| 347 |
|
| 348 |
In the simplified case here considered (for which $\boldsymbol{p} \equiv p$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $p^*$ by solving analytically for $q: p(q) = f_{\text{FK}}(q) = p^*$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $\Vert p(q) - p^* \Vert_2^2$ is a natural candidate), yielding:
|
| 349 |
|
|
|
|
|
|
|
|
|
|
| 350 |
``` math
|
| 351 |
\begin{align}
|
| 352 |
\min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .
|
|
@@ -361,6 +364,9 @@ For instance, the robot in Figure <a href="#planar-manipulator-floor" data-refe
|
|
| 361 |
However, IK--solving eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> for a feasible $q$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate this problem by providing a length-$K$ succession of goal poses $\tau_K = [p^*_0, p^*_1, \dots p^*_K]$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $\tau_K$ via IK can prove prohibitively expensive, as tracking would require $K$ resolutions of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a>. Let $J(q)$ denote the Jacobian matrix of (partial) derivatives of the FK-function $f_\text{FK}: \mathcal Q \mapsto \mathcal P$, such that $J(q) = \frac{\partial f_{FK}(q)}{\partial q }$. Then, one can apply the chain rule to any $p(q) = f_{\text{FK}}(q)$, deriving $\dot p = J(q) \dot q$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
|
| 362 |
|
| 363 |
Given a desired end-effector trajectory $\dot {p}^*(t)$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $\dot q(t)$ solving for joints’ *velocities* instead of *configurations*,
|
|
|
|
|
|
|
|
|
|
| 364 |
``` math
|
| 365 |
\begin{align}
|
| 366 |
\dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2
|
|
@@ -520,8 +526,9 @@ A length-$T$ *trajectory* is the (random) sequence
|
|
| 520 |
``` math
|
| 521 |
\htmlId{trajectory_definition}{\tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),}
|
| 522 |
```
|
| 523 |
-
|
| 524 |
with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notation. Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
|
|
|
|
|
|
|
| 525 |
|
| 526 |
``` math
|
| 527 |
\begin{align}
|
|
@@ -529,17 +536,21 @@ with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notat
|
|
| 529 |
\mathbb P(a_t\vert s_t, a_{t-1}, s_{t-1}, s_0, a_0) &= \mathbb P(a_t\vert s_t)
|
| 530 |
\end{align}
|
| 531 |
```
|
|
|
|
| 532 |
The probability of observing a given trajectory $\tau$ factorizes into
|
|
|
|
| 533 |
``` math
|
| 534 |
\htmlId{traj_prob}{\mathbb P(\tau) = \mathbb P (s_0) \prod_{t=0}^{T-1} \mathbb P (s_{t+1}\vert s_t, a_t)\ \mathbb P(a_t\vert s_t).}
|
| 535 |
```
|
| 536 |
|
| 537 |
Policies $\mathbb P(a_t\vert s_t)$ are typically indicated as $\pi(a_t\vert s_t)$, and often parametrized via $\theta$, yielding $\pi_\theta (a_t\vert s_t)$. Policies are trained optimizing the (discounted) *return* associated to a given $\tau$, i.e. the (random) sum of measured rewards over trajectory:
|
| 538 |
-
|
| 539 |
``` math
|
| 540 |
G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
|
| 541 |
```
|
| 542 |
In that, agents seek to learn control strategies (*policies*, $\pi_\theta$) maximizing the expected return $\mathbb E_{\tau \sim \pi_\theta} G(\tau)$. For a given dynamics $\mathcal D$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $\Pi$, yielding the (maximization) target $J : \Pi \mapsto \mathbb R$
|
|
|
|
|
|
|
|
|
|
| 543 |
``` math
|
| 544 |
\begin{align}
|
| 545 |
J(\pi_\theta) &= \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)], \\
|
|
@@ -556,8 +567,9 @@ can be used to discriminate between desirable and undesirable state in terms of
|
|
| 556 |
``` math
|
| 557 |
Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
|
| 558 |
```
|
| 559 |
-
|
| 560 |
Crucially, value functions are interrelated:
|
|
|
|
|
|
|
| 561 |
|
| 562 |
``` math
|
| 563 |
\begin{align}
|
|
@@ -648,6 +660,9 @@ Q_{i+1}(s_t, a_t) \leftarrow \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t
|
|
| 648 |
Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $Q_K \approx Q^*$ at each timestep. In fact, under certain assumptions on the MDP considered, $Q_K \to Q^* \, \text{as } K \to \infty$.
|
| 649 |
|
| 650 |
Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning was found complicated to scale to large $\mathcal S\times \mathcal A$ problems, in which the storing of $Q : \mathcal S\times \mathcal A\mapsto \mathbb R$ alone might prove prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating large and even unstructured *state* spaces. DQN parametrizes the Q-function using a neural network with parameters $\theta$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $\delta_i$):
|
|
|
|
|
|
|
|
|
|
| 651 |
``` math
|
| 652 |
\begin{align}
|
| 653 |
\mathcal L(\theta_i) &= \mathbb E_{(s_t, a_t) \sim \chi(\bullet)}
|
|
@@ -672,6 +687,9 @@ Provably, <a href="#deterministic-pg" data-reference-type="ref" data-reference="
|
|
| 672 |
Similarly to DQN, DDPG also employs the same replay buffer mechanism to reuse past transitions over training for increased sample efficiency and to estimate the loss function via MC-estimates.
|
| 673 |
|
| 674 |
Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with <span class="highlight">maximizing the discounted cumulative reward, while acting as randomly as possible</span>. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. In that, MaxEnt revisits the RL objective $J (\pi)$ to specifically account for the policy entropy,
|
|
|
|
|
|
|
|
|
|
| 675 |
``` math
|
| 676 |
\begin{align}
|
| 677 |
J(\pi) &= \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]
|
|
@@ -835,6 +853,9 @@ Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics app
|
|
| 835 |
</figure>
|
| 836 |
|
| 837 |
Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can thus be written as:
|
|
|
|
|
|
|
|
|
|
| 838 |
``` math
|
| 839 |
\begin{align}
|
| 840 |
\log p_\theta(\mathcal D) &= \log \sum_{i=0}^N p_\theta ((o,a)_i) \\
|
|
@@ -850,6 +871,9 @@ In the special case where one assumes distributions to be tractable, $p_\theta (
|
|
| 850 |
In their seminal work on Variational Auto-Encoders (VAEs), @kingmaAutoEncodingVariationalBayes2022 present two major contributions to learn complex latent-variable GMs on unstructured data, proposing (1) a tractable, variational lower-bound to <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> as an optimization target to jointly learn likelihood and posterior and (2) high-capacity function approximators to model the likelihood $p_\theta(o,a\vert z)$ and (approximate) posterior distribution $q_\phi(z \vert o,a) \approx p_\theta(z \vert o,a)$.
|
| 851 |
|
| 852 |
In particular, the lower bound on <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> (Evidence LOwer Bound, *ELBO*) can be derived from <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> applying Jensen’s inequality--$\log \mathbb{E}[\bullet] \geq \mathbb{E} [\log (\bullet)]$--yielding:
|
|
|
|
|
|
|
|
|
|
| 853 |
``` math
|
| 854 |
\begin{align}
|
| 855 |
\log p_\theta(\mathcal D) &\geq \sum_{i=0}^{N} \left(
|
|
@@ -862,8 +886,9 @@ In particular, the lower bound on <a href="#evidence-definition" data-reference
|
|
| 862 |
\right)
|
| 863 |
\end{align}
|
| 864 |
```
|
| 865 |
-
|
| 866 |
The true, generally intractable posterior $p_\theta (z \vert o,a)$ prevents computing both the expectation and KL divergence terms in <a href="#ELBO-intractable" data-reference-type="ref" data-reference="ELBO-intractable">[ELBO-intractable]</a>, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $q_\phi(z \vert o,a)$, resulting in the final, tractable ELBO objective,
|
|
|
|
|
|
|
| 867 |
|
| 868 |
``` math
|
| 869 |
\begin{align}
|
|
@@ -878,6 +903,9 @@ From Jensen’s inequality, maximizing ELBO results in maximizing the log-likeli
|
|
| 878 |
|
| 879 |
An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits a particularly interpretable factorization
|
| 880 |
|
|
|
|
|
|
|
|
|
|
| 881 |
``` math
|
| 882 |
\begin{align}
|
| 883 |
\min_{\theta, \phi} - \text{ELBO}_{\mathcal (o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi) \\
|
|
@@ -900,6 +928,9 @@ Indeed, it is very common in practice to approximate from the learned likelihood
|
|
| 900 |
#### Diffusion Models
|
| 901 |
|
| 902 |
VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $p_\theta$ and (approximate) posterior $q_\phi$ for such model. In that, the unknown data distribution $p(o,a)$ is effectively approximated via $\int_Z p(z) p_\theta(o,a \vert z)$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $p(o,a)$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $o,a$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#ch4-many-latents" data-reference-type="ref" data-reference="ch4-many-latents">[ch4-many-latents]</a>), resulting in
|
|
|
|
|
|
|
|
|
|
| 903 |
``` math
|
| 904 |
\begin{align}
|
| 905 |
p(\underbrace{o,a}_{= z_0}) &= \int_{\text{supp}({Z_0})} \int_{\text{supp}({Z_1})} \ldots \int_{\text{supp}({Z_T})} p(z_0, z_1, \dots z_T) \\
|
|
@@ -925,6 +956,9 @@ Similarily to VAEs, providing an exact interpretation for the latent variables i
|
|
| 925 |
Just like VAEs, DMs attempt to learn to reproduce an underlying data distribution $p (o,a)$ given a collection of i.i.d. samples approximating the model posited to have generated the data in the first place ( <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>). Similarly to VAEs, DMs approximate the process of sampling from the unknown $p(o,a)$ by (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution. However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $p(o,a)$. Crucially, as no information from the sample $(o,a)$ (denoted as $z_0 \equiv (o,a)$ for the sake of notation) is assumed to be propagated throughout the chain of latents, the posterior $q(z_t \vert z_{t-1})$ assumes a relatively amicable structure in DMs, reducing complexity. The *true* likelihood $p(z_{t-1} \vert z_t)$ is instead typically approximated using the parametrization $p_\theta (z_{t-1} \vert z_t)$. In that, the information contained in the unknown data distribution is *reconstructed* via a process in which samples from a fixed distribution are turned into (ideally) high-likelihood samples under $p(o,a)$--a process referred to as *denoising*.
|
| 926 |
|
| 927 |
Under such model, we can express the log-likelihood of an arbitrary sample as[^4]
|
|
|
|
|
|
|
|
|
|
| 928 |
``` math
|
| 929 |
\begin{align}
|
| 930 |
\log p_\theta (\underbrace{o,a}_{= z_0}) =
|
|
@@ -964,6 +998,9 @@ Finally, adopting Gaussian posteriors permits a particularly pleasing interpreta
|
|
| 964 |
</figure>
|
| 965 |
|
| 966 |
Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples, and comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion. Under the only assumption that the likelihood of the diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by learning to *reverse* such displacement. This key intuition allows one to write a simplified training objective:
|
|
|
|
|
|
|
|
|
|
| 967 |
``` math
|
| 968 |
\begin{align}
|
| 969 |
|
|
@@ -977,6 +1014,9 @@ Because the recorded behavior is teleoperated, measurements mostly distribute al
|
|
| 977 |
In this simplified (minimization) objective, the optimization process differs from <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
|
| 978 |
|
| 979 |
By learning the total displacement from a generally uninformative, corrupted sample obtained by diffusing information away from a sample of an unknown distribution--significant ($\Vert \epsilon \Vert > 0$) whenever input and target distribution are sufficiently different-- @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution by reversing the displacement, *denoising* samples. Interestingly, under the hypothesis that real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-manifold point (such as perturbed, uninformative samples) to the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $p_\theta (z_{t-1} \vert z_t)$ is Gaussian, then sampling $z_{t-1} \sim p_\theta(\bullet \vert z_{t})$ corresponds to computing
|
|
|
|
|
|
|
|
|
|
| 980 |
``` math
|
| 981 |
\begin{align}
|
| 982 |
z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}),
|
|
@@ -1030,6 +1070,9 @@ While the noising schedule of DMs results in a stochastic process that resembles
|
|
| 1030 |
</figure>
|
| 1031 |
|
| 1032 |
In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, CFM defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, resulting in the target vector field $u(t, z_t) = z_1 - z_0$. Then, an FM model can be trained with the simple regression objective defined as
|
|
|
|
|
|
|
|
|
|
| 1033 |
``` math
|
| 1034 |
\begin{align}
|
| 1035 |
|
|
@@ -1046,6 +1089,9 @@ While GMs prove useful in learning complex, high-dimensional multi-modal distrib
|
|
| 1046 |
On the robot learning side of their contributions, @zhaoLearningFineGrainedBimanual2023 adopt transformers as the architectural backbone to learn a *Conditional* VAE @sohnLearningStructuredOutput2015. Conditional VAEs are a variation of the more standard VAE formulation introducing a conditioning variable on sampling from the latent prior, allowing the modeling of *one-to-many* relationships between latent and data samples. Further, in stark contrast with previous work @florenceImplicitBehavioralCloning2022, @jannerPlanningDiffusionFlexible2022, @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $p_\theta(o,a)$ on observation and actions. While the *policy* distribution $p_\theta(a \vert o)$ can in principle be entirely described from its joint $p_\theta(o,a)$, it is often the case that the conditional distribution is intractable when using function approximators, as $p_\theta(a \vert o) = \tfrac{p_\theta(o,a)}{\int_{\mathcal A} p_\theta(o,a)\, da}$ and the integral in the denominator is typically intractable. Instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE @sohnLearningStructuredOutput2015 directly modeling the policy distribution $p (a \vert o)$.
|
| 1047 |
|
| 1048 |
In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in <a href="#ELBO" data-reference-type="ref" data-reference="ELBO">[ELBO]</a>, which is adapted to
|
|
|
|
|
|
|
|
|
|
| 1049 |
``` math
|
| 1050 |
\begin{align}
|
| 1051 |
|
|
@@ -1106,6 +1152,9 @@ However, the authors claim using a deterministic procedure to derive $z$ may ben
|
|
| 1106 |
DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $p(o,a)$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $p_\theta(o)$ given $p_\theta(o,a)$, DP’s rationale for modeling the data distribution via $p_\theta(a \vert o)$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
|
| 1107 |
|
| 1108 |
In practice, conditioning on observation data is achieved conditioning the added noise regressor $\epsilon_\theta$ introduced in <a href="#diffusion-simplified-loss" data-reference-type="ref" data-reference="diffusion-simplified-loss">[diffusion-simplified-loss]</a> on a stack of $T_o$ observations, resulting in the *conditional* simplified diffusion objective
|
|
|
|
|
|
|
|
|
|
| 1109 |
``` math
|
| 1110 |
\begin{align}
|
| 1111 |
\mathcal L(\theta) &= \mathbb{E}_{t, a_{t:t+H_a}, \epsilon} \big[
|
|
@@ -1306,6 +1355,9 @@ $\pi_0$ @blackp0VisionLanguageActionFlow2024 introduce a VLA consisting of a Mo
|
|
| 1306 |
Concretely, $\pi_0$ is a unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $p_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple camera viewpoints $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process the robot proprioceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turning them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while across blocks, future blocks are masked out.
Formally, this corresponds to using the attention mask $\mathbf{A} = \bordermatrix{ & \mathcal{T}_i & \mathcal{T}_q & \mathcal{T}_a \cr \mathcal{T}_i & \mathbf{1} & \mathbf{0} & \mathbf{0} \cr \mathcal{T}_q & \mathbf{1} & \mathbf{1} & \mathbf{0} \cr \mathcal{T}_a & \mathbf{1} & \mathbf{1} & \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$. Note how *intra*-block bidirectional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
|
| 1307 |
|
| 1308 |
In $\pi_0$, both the VLM backbone and action expert are updated using a *flow matching* loss, in particular minimizing:
|
|
|
|
|
|
|
|
|
|
| 1309 |
``` math
|
| 1310 |
\begin{align}
|
| 1311 |
\mathcal{L}(\phi, \theta) &=
|
|
|
|
| 347 |
|
| 348 |
In the simplified case here considered (for which $\boldsymbol{p} \equiv p$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $p^*$ by solving analytically for $q: p(q) = f_{\text{FK}}(q) = p^*$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $\Vert p(q) - p^* \Vert_2^2$ is a natural candidate), yielding:
|
| 349 |
|
| 350 |
+
<span id="ik_problem" style="position: absolute;">
|
| 351 |
+
</span>
|
| 352 |
+
|
| 353 |
``` math
|
| 354 |
\begin{align}
|
| 355 |
\min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .
|
|
|
|
| 364 |
However, IK--solving eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> for a feasible $q$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate this problem by providing a length-$K$ succession of goal poses $\tau_K = [p^*_0, p^*_1, \dots p^*_K]$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $\tau_K$ via IK can prove prohibitively expensive, as tracking would require $K$ resolutions of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. <a href="#ik_problem" data-reference-type="ref" data-reference="ik_problem">[ik_problem]</a>. Let $J(q)$ denote the Jacobian matrix of (partial) derivatives of the FK-function $f_\text{FK}: \mathcal Q \mapsto \mathcal P$, such that $J(q) = \frac{\partial f_{FK}(q)}{\partial q }$. Then, one can apply the chain rule to any $p(q) = f_{\text{FK}}(q)$, deriving $\dot p = J(q) \dot q$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
|
| 365 |
|
| 366 |
Given a desired end-effector trajectory $\dot {p}^*(t)$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $\dot q(t)$ solving for joints’ *velocities* instead of *configurations*,
|
| 367 |
+
<span id="reg_ik_velocity" style="position: absolute;">
|
| 368 |
+
</span>
|
| 369 |
+
|
| 370 |
``` math
|
| 371 |
\begin{align}
|
| 372 |
\dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2
|
|
|
|
| 526 |
``` math
|
| 527 |
\htmlId{trajectory_definition}{\tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_{T-1}, a_{T-1}, r_{T-1}, s_T),}
|
| 528 |
```
|
|
|
|
| 529 |
with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notation. Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
|
| 530 |
+
<span id="dynamics_markovian" style="position: absolute;">
|
| 531 |
+
</span>
|
| 532 |
|
| 533 |
``` math
|
| 534 |
\begin{align}
|
|
|
|
| 536 |
\mathbb P(a_t\vert s_t, a_{t-1}, s_{t-1}, s_0, a_0) &= \mathbb P(a_t\vert s_t)
|
| 537 |
\end{align}
|
| 538 |
```
|
| 539 |
+
|
| 540 |
The probability of observing a given trajectory $\tau$ factorizes into
|
| 541 |
+
|
| 542 |
``` math
|
| 543 |
\htmlId{traj_prob}{\mathbb P(\tau) = \mathbb P (s_0) \prod_{t=0}^{T-1} \mathbb P (s_{t+1}\vert s_t, a_t)\ \mathbb P(a_t\vert s_t).}
|
| 544 |
```
|
| 545 |
|
| 546 |
Policies $\mathbb P(a_t\vert s_t)$ are typically indicated as $\pi(a_t\vert s_t)$, and often parametrized via $\theta$, yielding $\pi_\theta (a_t\vert s_t)$. Policies are trained optimizing the (discounted) *return* associated to a given $\tau$, i.e. the (random) sum of measured rewards over trajectory:
|
|
|
|
| 547 |
``` math
|
| 548 |
G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
|
| 549 |
```
|
| 550 |
In that, agents seek to learn control strategies (*policies*, $\pi_\theta$) maximizing the expected return $\mathbb E_{\tau \sim \pi_\theta} G(\tau)$. For a given dynamics $\mathcal D$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $\Pi$, yielding the (maximization) target $J : \Pi \mapsto \mathbb R$
|
| 551 |
+
<span id="RL-j-function" style="position: absolute;">
|
| 552 |
+
</span>
|
| 553 |
+
|
| 554 |
``` math
|
| 555 |
\begin{align}
|
| 556 |
J(\pi_\theta) &= \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)], \\
|
|
|
|
| 567 |
``` math
|
| 568 |
Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
|
| 569 |
```
|
|
|
|
| 570 |
Crucially, value functions are interrelated:
|
| 571 |
+
<span id="q-as-v" style="position: absolute;">
|
| 572 |
+
</span>
|
| 573 |
|
| 574 |
``` math
|
| 575 |
\begin{align}
|
|
|
|
| 660 |
Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $Q_K \approx Q^*$ at each timestep. In fact, under certain assumptions on the MDP considered, $Q_K \to Q^* \, \text{as } K \to \infty$.
|
| 661 |
|
| 662 |
Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning was found complicated to scale to large $\mathcal S\times \mathcal A$ problems, in which the storing of $Q : \mathcal S\times \mathcal A\mapsto \mathbb R$ alone might prove prohibitive. Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics. In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating for large and even unstructured *state* spaces. DQN parametrizes the Q-function using a neural network with parameters $\theta$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $\delta_i$):
|
| 663 |
+
<span id="dqn-loss" style="position: absolute;">
|
| 664 |
+
</span>
|
| 665 |
+
|
| 666 |
``` math
|
| 667 |
\begin{align}
|
| 668 |
\mathcal L(\theta_i) &= \mathbb E_{(s_t, a_t) \sim \chi(\bullet)}
|
|
|
|
| 687 |
Similarly to DQN, DDPG also employs the same replay buffer mechanism, to reuse past transitions over training for increased sample efficiency and estimate the loss function via MC-estimates.
|
| 688 |
|
| 689 |
Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with <span class="highlight">maximizing the discounted cumulative reward, while acting as randomly as possible</span>. MaxEnt RL @haarnojaReinforcementLearningDeep2017 has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation. In that, MaxEnt revisits the RL objective $J (\pi)$ to specifically account for the policy entropy,
|
| 690 |
+
<span id="J-soft" style="position: absolute;">
|
| 691 |
+
</span>
|
| 692 |
+
|
| 693 |
``` math
|
| 694 |
\begin{align}
|
| 695 |
J(\pi) &= \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} [r_t + \alpha \mathcal H(\pi (\bullet \vert s_t))]
|
|
|
|
| 853 |
</figure>
|
| 854 |
|
| 855 |
Given a dataset $\mathcal D$ consisting of $N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta$ (in Bayesian terms, the *evidence* $p_\theta(\mathcal D)$) can thus be written as:
|
| 856 |
+
<span id="evidence-definition-1" style="position: absolute;">
|
| 857 |
+
</span>
|
| 858 |
+
|
| 859 |
``` math
|
| 860 |
\begin{align}
|
| 861 |
\log p_\theta(\mathcal D) &= \log \sum_{i=0}^N p_\theta ((o,a)_i) \\
|
|
|
|
| 871 |
In their seminal work on Variational Auto-Encoders (VAEs), @kingmaAutoEncodingVariationalBayes2022 present two major contributions to learn complex latent-variable GMs on unstructured data, proposing (1) a tractable, variational lower-bound to <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> as an optimization target to jointly learn likelihood and posterior and (2) high-capacity function approximators to model the likelihood $p_\theta(o,a\vert z)$ and (approximate) posterior distribution $q_\phi(z \vert o,a) \approx p_\theta(z \vert o,a)$.
|
| 872 |
|
| 873 |
In particular, the lower bound on <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> (Evidence LOwer Bound, *ELBO*) can be derived from <a href="#evidence-definition" data-reference-type="ref" data-reference="evidence-definition">[evidence-definition]</a> applying Jensen’s inequality--$\log \mathbb{E}[\bullet] \geq \mathbb{E} [\log (\bullet)]$--yielding:
|
| 874 |
+
<span id="ELBO-intractable" style="position: absolute;">
|
| 875 |
+
</span>
|
| 876 |
+
|
| 877 |
``` math
|
| 878 |
\begin{align}
|
| 879 |
\log p_\theta(\mathcal D) &\geq \sum_{i=0}^{N} \left(
|
|
|
|
| 886 |
\right)
|
| 887 |
\end{align}
|
| 888 |
```
|
|
|
|
| 889 |
The true, generally intractable posterior $p_\theta (z \vert o,a)$ prevents computing both the expectation and KL divergence terms in <a href="#ELBO-intractable" data-reference-type="ref" data-reference="ELBO-intractable">[ELBO-intractable]</a>, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $q_\phi(z \vert o,a)$, resulting in the final, tractable ELBO objective,
|
| 890 |
+
<span id="ELBO" style="position: absolute;">
|
| 891 |
+
</span>
|
| 892 |
|
| 893 |
``` math
|
| 894 |
\begin{align}
|
|
|
|
| 903 |
|
| 904 |
An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits a particularly interpretable factorization
|
| 905 |
|
| 906 |
+
<span id="VAE-min-neg-ELBO" style="position: absolute;">
|
| 907 |
+
</span>
|
| 908 |
+
|
| 909 |
``` math
|
| 910 |
\begin{align}
|
| 911 |
\min_{\theta, \phi} - \text{ELBO}_{(o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi) \\
|
|
|
|
| 928 |
#### Diffusion Models
|
| 929 |
|
| 930 |
VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#BC-latent-variable" data-reference-type="ref" data-reference="BC-latent-variable">[BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $p_\theta$ and (approximate) posterior $q_\phi$ for such model. In that, the unknown data distribution $p(o,a)$ is effectively approximated via $\int_Z p(z) p_\theta(o,a \vert z)$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $p(o,a)$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $o,a$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#ch4-many-latents" data-reference-type="ref" data-reference="ch4-many-latents">[ch4-many-latents]</a>), resulting in
|
| 931 |
+
<span id="BC-multi-latent-model-1" style="position: absolute;">
|
| 932 |
+
</span>
|
| 933 |
+
|
| 934 |
``` math
|
| 935 |
\begin{align}
|
| 936 |
p(\underbrace{o,a}_{= z_0}) &= \int_{\text{supp}({Z_0})} \int_{\text{supp}({Z_1})} \ldots \int_{\text{supp}({Z_T})} p(z_0, z_1, \dots z_T) \\
|
|
|
|
| 956 |
Just like VAEs, DMs attempt to learn to reproduce an underlying data distribution $p (o,a)$ given a collection of i.i.d. samples approximating the model posited to have generated the data in the first place ( <a href="#BC-multi-latent-model-1" data-reference-type="ref" data-reference="BC-multi-latent-model-1">[BC-multi-latent-model-1]</a>). Similarly to VAEs, DMs approximate the process of sampling from the unknown $p(o,a)$ (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution. However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $p(o,a)$. Crucially, as no information from the sample $(o,a)$ (denoted as $z_0 \equiv (o,a)$ for the sake of notation) is assumed to be propagated throughout the chain of latents, the posterior $q(z_t \vert z_{t-1})$ assumes a relatively amicable structure in DMs, reducing complexity. The *true* likelihood $p(z_{t-1} \vert z_t)$ is instead typically approximated using the parametrization $p_\theta (z_{t-1} \vert z_t)$. In that, the information contained in the unknown data distribution is *reconstructed* via a process in which samples from a fixed distribution are turned into (ideally) high-likelihood samples under $p(o,a)$--a process referred to as *denoising*.
|
| 957 |
|
| 958 |
Under such model, we can express the log-likelihood of an arbitrary sample as[^4]
|
| 959 |
+
<span id="diffusion-likelihood" style="position: absolute;">
|
| 960 |
+
</span>
|
| 961 |
+
|
| 962 |
``` math
|
| 963 |
\begin{align}
|
| 964 |
\log p_\theta (\underbrace{o,a}_{= z_0}) =
|
|
|
|
| 998 |
</figure>
|
| 999 |
|
| 1000 |
Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim \mathcal N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure <a href="#ch4-action-vs-observation-distribution" data-reference-type="ref" data-reference="ch4-action-vs-observation-distribution">[ch4-action-vs-observation-distribution]</a>). Using Gaussian posteriors--i.e., adding Gaussian noise--effectively simulates a *Brownian motion* for the elements in the distribution’s support (in Figure <a href="#diffusion-robot-actions" data-reference-type="ref" data-reference="diffusion-robot-actions">[diffusion-robot-actions]</a>, $\mathcal O\times \mathcal A$), whereby information *diffuses away* from the samples, and comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion. Under the only assumption that the likelihood of the diffused samples is low under the original unknown data distribution, then one can effectively approximate the unknown distribution by learning to *reverse* such displacement. This key intuition allows one to write a simplified training objective:
|
| 1001 |
+
<span id="diffusion-simplified-loss" style="position: absolute;">
|
| 1002 |
+
</span>
|
| 1003 |
+
|
| 1004 |
``` math
|
| 1005 |
\begin{align}
|
| 1006 |
|
|
|
|
| 1014 |
In this simplified (minimization) objective, the optimization process differs from <a href="#diffusion-likelihood" data-reference-type="ref" data-reference="diffusion-likelihood">[diffusion-likelihood]</a> in that, rather than maximizing $p_\theta$ directly, the parameters $\theta$ of the pairwise likelihood $p_\theta(z_{t-1} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon$ for a randomly long ($t \sim \mathcal{U}(\{1,\dots,T\})$) diffusion process starting from a sample of the target distribution.
|
| 1015 |
|
| 1016 |
By learning the total displacement between a generally uninformative corrupted sample--obtained by diffusing a sample from an unknown distribution--and the original sample--a displacement that is significant ($\Vert \epsilon \Vert > 0$) whenever input and target distribution are sufficiently different--@hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution by reversing the displacement, i.e., by *denoising* samples. Interestingly, under the hypothesis that real-world data belongs to a single higher-dimensional manifold (the Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function between any off-manifold point (such as perturbed, uninformative samples) and the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back onto the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $p_\theta (z_{t-1} \vert z_t)$ is Gaussian, sampling $z_{t-1} \sim p_\theta(\bullet \vert z_{t})$ corresponds to computing
|
| 1017 |
+
<span id="diffusion-denoising-definition" style="position: absolute;">
|
| 1018 |
+
</span>
|
| 1019 |
+
|
| 1020 |
``` math
|
| 1021 |
\begin{align}
|
| 1022 |
z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}),
|
|
|
|
| 1070 |
</figure>
|
| 1071 |
|
| 1072 |
In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in <a href="#fm-diffusion-vector-field" data-reference-type="ref" data-reference="fm-diffusion-vector-field">[fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, CFM defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, resulting in the target vector field $u(t, z_t) = z_1 - z_0$. Then, a FM model can be trained with the simple regression objective defined as
|
| 1073 |
+
<span id="flow-matching-objective" style="position: absolute;">
|
| 1074 |
+
</span>
|
| 1075 |
+
|
| 1076 |
``` math
|
| 1077 |
\begin{align}
|
| 1078 |
|
|
|
|
| 1089 |
On the robot learning side of their contributions, @zhaoLearningFineGrainedBimanual2023 adopt transformers as the architectural backbone to learn a *Conditional* VAE @sohnLearningStructuredOutput2015. Conditional VAEs are a variation of the more standard VAE formulation introducing a conditioning variable on sampling from the latent prior, allowing the modeling of *one-to-many* relationships between latent and data samples. Further, in stark contrast with previous work @florenceImplicitBehavioralCloning2022, @jannerPlanningDiffusionFlexible2022, @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $p_\theta(o,a)$ on observation and actions. While the *policy* distribution $p_\theta(a \vert o)$ can in principle be entirely described from its joint $p_\theta(o,a)$, it is often the case that the conditional distribution is intractable when using function approximators, as $p_\theta(a \vert o) = \tfrac{p_\theta(o,a)}{\int_{\mathcal A} p_\theta(o,a)\, da}$ and the integral in the denominator is typically intractable. Instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE @sohnLearningStructuredOutput2015 directly modeling the policy distribution $p (a \vert o)$.
|
| 1090 |
|
| 1091 |
In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in <a href="#ELBO" data-reference-type="ref" data-reference="ELBO">[ELBO]</a>, which is adapted to
|
| 1092 |
+
<span id="c-ELBO" style="position: absolute;">
|
| 1093 |
+
</span>
|
| 1094 |
+
|
| 1095 |
``` math
|
| 1096 |
\begin{align}
|
| 1097 |
|
|
|
|
| 1152 |
DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $p(o,a)$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $p_\theta(o)$ given $p_\theta(o,a)$, DP’s rationale for modeling the data distribution via $p_\theta(a \vert o)$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
|
| 1153 |
|
| 1154 |
In practice, conditioning on observation data is achieved conditioning the added noise regressor $\epsilon_\theta$ introduced in <a href="#diffusion-simplified-loss" data-reference-type="ref" data-reference="diffusion-simplified-loss">[diffusion-simplified-loss]</a> on a stack of $T_o$ observations, resulting in the *conditional* simplified diffusion objective
|
| 1155 |
+
<span id="diffusion-policy-objective" style="position: absolute;">
|
| 1156 |
+
</span>
|
| 1157 |
+
|
| 1158 |
``` math
|
| 1159 |
\begin{align}
|
| 1160 |
\mathcal L(\theta) &= \mathbb{E}_{t, a_{t:t+H_a}, \epsilon} \big[
|
|
|
|
| 1355 |
Concretely, $\pi_0$ is a unified transformer with two disjoint sets of weights $\phi, \theta$. A larger VLM backbone $p_\phi$ initialized from Gemma 2.6B processes multiple image frames obtained from multiple camera viewpoints $[\{ I_t \}_{t=1}^n]$, as well as a language instruction $[\ell_t]$ used to describe the task considered. Concurrently, a 300M-parameter *action expert* based on a similar transformer architecture is used to process the robot proprioceptive state $q_t$ and an action chunk $a_{t:t+H_a}$ (Figure <a href="#ch5-pi0" data-reference-type="ref" data-reference="ch5-pi0">[ch5-pi0]</a>). The different expert networks operate separately in processing the respective inputs and turning them into query, key and value matrices, and only share information between each other via self-attention layers. The outputs from the VLM backbone are disregarded, while the vector field regressed by the action expert is used to iteratively refine the action process. In particular, $\pi_0$ uses a *blockwise causal attention mask* over tokens belonging to three separate blocks: (1) image and language tokens $\mathcal T_i$ obtained from $[\{ I_t \}_{t=1}^n, \ell_t]$, (2) proprioceptive tokens $\mathcal T_q$ obtained from $q_t$, and (3) the action tokens $\mathcal T_a$ for items in the chunk $a^{\tau}_{t:t+H_a}$ at time $\tau$ in the flow-matching process. Notably, *within* each block the attention operations are bidirectional, while across blocks, future blocks are masked out.
Formally, this corresponds to using the attention mask $\mathbf{A} = \bordermatrix{ & \mathcal{T}_i & \mathcal{T}_q & \mathcal{T}_a \cr \mathcal{T}_i & \mathbf{1} & \mathbf{0} & \mathbf{0} \cr \mathcal{T}_q & \mathbf{1} & \mathbf{1} & \mathbf{0} \cr \mathcal{T}_a & \mathbf{1} & \mathbf{1} & \mathbf{1} \cr }, \quad \mathbf{1}: \text{Bidirectional Attention}, \ \mathbf{0}: \text{Masked Attention}$. Note how *intra*-block bidirectional attention allows tokens to communicate freely, while *inter*-block communication is mediated by the attention mask $\mathbf{A}$. *Blockwise causal masking* effectively prevents the pre-trained perception-language tokens from attending to robotics-tokens, likely out of distribution for VLM backbones traditionally trained on large corpora of internet, non-robotics, data. Crucially, because communication is obstructed between image-language tokens, proprioceptive and action tokens, one can cache keys and values across denoising steps at runtime, incurring a reduced computational footprint and faster inference.
|
| 1356 |
|
| 1357 |
In $\pi_0$, both the VLM backbone and action expert are updated using a *flow matching* loss, in particular minimizing:
|
| 1358 |
+
<span id="pi0-loss" style="position: absolute;">
|
| 1359 |
+
</span>
|
| 1360 |
+
|
| 1361 |
``` math
|
| 1362 |
\begin{align}
|
| 1363 |
\mathcal{L}(\phi, \theta) &=
|
app/src/content/{embeds → embeds2}/banner.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/d3-bar.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/d3-benchmark.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/d3-confusion-matrix.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/d3-evals-after-fix.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/d3-evals-tpbug.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/d3-line-quad.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/d3-line.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/d3-matrix.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/d3-neural-network.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/d3-pie-quad.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/d3-pie.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/d3-scatter.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/demo/color-picker.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/demo/content-structure.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/demo/palettes.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/original_embeds/plotly/banner.py
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/original_embeds/plotly/bar.py
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/original_embeds/plotly/heatmap.py
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/original_embeds/plotly/line.py
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/original_embeds/plotly/poetry.lock
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/original_embeds/plotly/pyproject.toml
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/plotly-line.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/throughput-debug-1node.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/throughput-drops-comparison.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/throughput-weka-drops.html
RENAMED
|
File without changes
|
app/src/content/{embeds → embeds2}/vibe-code-d3-embeds-directives.md
RENAMED
|
File without changes
|