thibaud frere committed on
Commit
52bc805
·
1 Parent(s): 59924a2

update article

Browse files
app/scripts/latex-to-markdown/index.mjs CHANGED
@@ -2,6 +2,7 @@
2
 
3
  import { join, dirname } from 'path';
4
  import { fileURLToPath } from 'url';
 
5
  import { convertLatexToMarkdown } from './latex-converter.mjs';
6
  import { convertToMdx } from './mdx-converter.mjs';
7
  import { cleanBibliography } from './bib-cleaner.mjs';
@@ -12,6 +13,7 @@ const __dirname = dirname(__filename);
12
  // Default configuration
13
  const DEFAULT_INPUT = join(__dirname, 'input', 'main.tex');
14
  const DEFAULT_OUTPUT = join(__dirname, 'output');
 
15
 
16
  function parseArgs() {
17
  const args = process.argv.slice(2);
@@ -110,6 +112,15 @@ function main() {
110
 
111
  console.log('📝 Converting Markdown to MDX...');
112
  convertToMdx(markdownFile, mdxFile);
 
 
 
 
 
 
 
 
 
113
  }
114
 
115
  } catch (error) {
 
2
 
3
  import { join, dirname } from 'path';
4
  import { fileURLToPath } from 'url';
5
+ import { copyFileSync } from 'fs';
6
  import { convertLatexToMarkdown } from './latex-converter.mjs';
7
  import { convertToMdx } from './mdx-converter.mjs';
8
  import { cleanBibliography } from './bib-cleaner.mjs';
 
13
  // Default configuration
14
  const DEFAULT_INPUT = join(__dirname, 'input', 'main.tex');
15
  const DEFAULT_OUTPUT = join(__dirname, 'output');
16
+ const ASTRO_CONTENT_PATH = join(__dirname, '..', '..', 'src', 'content', 'article.mdx');
17
 
18
  function parseArgs() {
19
  const args = process.argv.slice(2);
 
112
 
113
  console.log('📝 Converting Markdown to MDX...');
114
  convertToMdx(markdownFile, mdxFile);
115
+
116
+ // Copy MDX to Astro content directory
117
+ console.log('📋 Copying MDX to Astro content directory...');
118
+ try {
119
+ copyFileSync(mdxFile, ASTRO_CONTENT_PATH);
120
+ console.log(` ✅ Copied to ${ASTRO_CONTENT_PATH}`);
121
+ } catch (error) {
122
+ console.warn(` ⚠️ Failed to copy MDX to Astro: ${error.message}`);
123
+ }
124
  }
125
 
126
  } catch (error) {
app/scripts/latex-to-markdown/mdx-converter.mjs CHANGED
@@ -356,6 +356,47 @@ date: "${new Date().toISOString().split('T')[0]}"
356
  return content;
357
  }
358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  /**
360
  * Clean up MDX-incompatible syntax
361
  * @param {string} content - MDX content
@@ -391,6 +432,7 @@ function processMdxContent(content) {
391
 
392
  // Apply each transformation step sequentially
393
  processedContent = ensureFrontmatter(processedContent);
 
394
  processedContent = cleanMdxSyntax(processedContent);
395
  processedContent = transformImages(processedContent);
396
  processedContent = transformStyledSpans(processedContent);
 
356
  return content;
357
  }
358
 
359
/**
 * Collapse stray newlines inside short inline ($...$) math spans.
 *
 * LaTeX-to-Markdown conversion sometimes hard-wraps inline math across
 * lines, which breaks MDX rendering. This pass rejoins those spans into a
 * single line while leaving display math ($$...$$) and ordinary prose
 * untouched.
 *
 * @param {string} content - MDX content
 * @returns {string} - Content with cleaned math blocks
 */
function cleanSingleLineMathNewlines(content) {
  console.log(' 🔢 Cleaning newlines in single-line math blocks...');

  let cleanedCount = 0;

  // Match short single-dollar spans only; the 200-char cap avoids swallowing
  // whole paragraphs when a stray dollar sign appears in prose.
  const cleanedContent = content.replace(
    /\$([^$]{1,200}?)\$/g,
    (match, mathContent, offset, source) => {
      // Skip matches adjacent to another '$': those are the interior of a
      // display-math ($$...$$) block, or the prose gap right after one,
      // where the $/$$ pairing is off. Rewriting them would strip newlines
      // from display math and glue surrounding prose onto the delimiters.
      const prevChar = source[offset - 1];
      const nextChar = source[offset + match.length];
      if (prevChar === '$' || nextChar === '$') {
        return match;
      }

      // Only rejoin spans containing simple newlines; a blank line is a
      // paragraph break, which inline math never legitimately contains.
      if (!mathContent.includes('\n') || mathContent.includes('\n\n')) {
        return match;
      }

      cleanedCount++;

      // Flatten newlines and collapse whitespace runs, preserving the math
      // expression itself.
      const cleanedMath = mathContent.replace(/\s+/g, ' ').trim();
      return `$${cleanedMath}$`;
    }
  );

  if (cleanedCount > 0) {
    console.log(` ✅ Cleaned ${cleanedCount} single-line math block(s) with newlines`);
  }

  return cleanedContent;
}
399
+
400
  /**
401
  * Clean up MDX-incompatible syntax
402
  * @param {string} content - MDX content
 
432
 
433
  // Apply each transformation step sequentially
434
  processedContent = ensureFrontmatter(processedContent);
435
+ processedContent = cleanSingleLineMathNewlines(processedContent);
436
  processedContent = cleanMdxSyntax(processedContent);
437
  processedContent = transformImages(processedContent);
438
  processedContent = transformStyledSpans(processedContent);
app/scripts/latex-to-markdown/output/main.mdx CHANGED
@@ -325,8 +325,7 @@ Deriving the end-effector’s *pose*--position *and* orientation--in some $m$-di
325
 
326
  In the simplified case here considered (for which $\boldsymbol{p} \equiv p$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $p^*$ by solving analytically for $q: p(q) = f_{\text{FK}}(q) = p^*$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $\Vert p(q) - p^* \Vert_2^2$ is a natural candidate), yielding:
327
 
328
- $\min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, .
329
- $
330
 
331
  Exact analytical solutions to IK are even less appealing when one considers the presence of obstacles in the robot’s workspace, resulting in constraints on the possible values of $q \in \mathcal Q \subseteq [-\pi, +\pi]^n \subset \mathbb R^n$ in the general case of $n$-links robots.
332
 
@@ -334,8 +333,7 @@ For instance, the robot in Figure <a href="#fig:planar-manipulator-floor" data-
334
 
335
  However, IK--solving eq. <a href="#eq:ik_problem" data-reference-type="ref" data-reference="eq:ik_problem">[eq:ik_problem]</a> for a feasible $q$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate to this problem providing a length-$K$ succession of goal poses $\tau_K = [p^*_0, p^*_1, \dots p^*_K]$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $\tau_K$ via IK can prove prohibitively expensive, as tracking would require $K$ resolutions of eq. <a href="#eq:ik_problem" data-reference-type="ref" data-reference="eq:ik_problem">[eq:ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. <a href="#eq:ik_problem" data-reference-type="ref" data-reference="eq:ik_problem">[eq:ik_problem]</a>. Let $J(q)$ denote the Jacobian matrix of (partial) derivatives of the FK-function $f_\text{FK}: \mathcal Q \mapsto \mathcal P$, such that $J(q) = \frac{\partial f_{FK}(q)}{\partial q }$. Then, one can apply the chain rule to any $p(q) = f_{\text{FK}}(q)$, deriving $\dot p = J(q) \dot q$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
336
 
337
- Given a desired end-effector trajectory $\dot {p}^*(t)$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $\dot q(t)$ solving for joints’ *velocities* instead of *configurations*, $\dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2
338
- $
339
 
340
  Unlike eq. <a href="#eq:ik_problem" data-reference-type="ref" data-reference="eq:ik_problem">[eq:ik_problem]</a>, solving for $\dot q$ is much less dependent on the environment (typically, variations in velocity are constrained by physical limits on the actuators). Conveniently, eq. <a href="#eq:reg_ik_velocity" data-reference-type="ref" data-reference="eq:reg_ik_velocity">[eq:reg_ik_velocity]</a> also often admits the closed-form solution $\dot q = J(q)^+ \dot {p}^*$, where $J^+(q)$ denotes the Moore-Penrose pseudo-inverse of $J(q)$. Finally, discrete-time joint configurations $q$ can be reconstructed from joint velocities $\dot q$ using forward-integration on the continuous-time joint velocity , $q_{t+1} = q_t + \Delta t\,\dot q_t$ for a given $\Delta t$, resulting in tracking via diff-IK.
341
 
@@ -480,11 +478,7 @@ A length-$T$ *trajectory* is the (random) sequence
480
  \end{equation}
481
  ```
482
  with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notation.Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
483
- $$
484
- `\mathbb P(s_{t+1}\vert s_t, a_t, s_{t-1}, a_{t-1}, \dots s_0, a_0 ) = \mathbb P (s_{t+1}\vert s_t, a_t)\\
485
- \mathbb P(a_t\vert s_t, a_{t-1}, s_{t-1}, s_0, a_0) = \mathbb P(a_t\vert s_t) `
486
- $$
487
- The probability of observing a given trajectory $\tau$ factorizes into
488
  ``` math
489
  \begin{equation}
490
 
@@ -492,11 +486,7 @@ $$
492
  \end{equation}
493
  ```
494
 
495
- Policies $\mathbb P(a_t\vert s_t)$ are typically indicated as $\pi(a_t\vert s_t)$, and often parametrized via $\theta$, yielding $\pi_\theta (a_t\vert s_t)$. Policies are trained optimizing the (discounted) *return* associated to a given $\tau$, i.e. the (random) sum of measured rewards over trajectory:
496
- ``` math
497
- G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t.
498
- ```
499
- In that, agents seek to learn control strategies (*policies*, $\pi_\theta$) maximizing the expected return $\mathbb E_{\tau \sim \pi_\theta} G(\tau)$. For a given dynamics $\mathcal D$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $\Pi$, yielding the (maximization) target $J : \Pi \mapsto \mathbb R$
500
  $$
501
  `J(\pi_\theta) = \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)],\\
502
  \mathbb P_{\theta; \mathcal D} (\tau) = \rho \prod_{t=0}^{T-1} \mathcal D (s_t, a_t, s_{t+1})\ \pi_\theta (a_t\vert s_t).`
@@ -512,12 +502,7 @@ can be used to discriminate between desirable and undesirable state in terms of
512
  Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
513
  ```
514
  Crucially, value functions are interrelated:
515
- $$
516
- `Q_\pi(s_t, a_t) = \mathbb{E}_{s_{t+1}\sim \mathbb P(\bullet \vert s_t, a_t)} [r_t + \gamma V_\pi(s_{t+1})]\\
517
- V_\pi(s_t) = \mathbb E_{a_t\sim \pi(\bullet \vert s_t)} [Q_\pi (s_t, a_t)]
518
- `
519
- $$
520
- Inducing an ordering over states and state-action pairs under $\pi$, value functions are central to most RL algorithms. A variety of methods have been developed in RL as standalone attemps to find (approximate) solutions to the problem of maximizing cumulative reward (Figure <a href="#fig:rl-algos-atlas" data-reference-type="ref" data-reference="fig:rl-algos-atlas">15</a>).
521
 
522
  <ResponsiveImage
523
  src={ch3_rl_algorithms_atlas}
@@ -599,8 +584,7 @@ $$
599
  (\underbrace{y_i - Q_{\theta_i}(s_t, a_t)}_{\delta_i})^2
600
  \big],\\
601
  y_i = \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} \big[ r_t + \gamma \max_{a_t\in \mathcal A} Q_{\theta_{i-1}} (s_{t+1}, a_{t+1}) \big], `
602
- $$
603
- Where $\chi$ represents a behavior distribution over state-action pairs. Crucially, $\chi$ can in principle be different from the policy being followed, effectively allowing to reuse prior data stored in a *replay buffer* in the form of $(s_t, a_t, r_t, s_{t+1})$ transitions, used to form the TD-target $y_i$, TD-error $\delta_i$ and loss function <a href="#eq:dqn-loss" data-reference-type="ref" data-reference="eq:dqn-loss">[eq:dqn-loss]</a> via Monte-Carlo (MC) estimates.
604
 
605
  While effective in handling large, unstructured state spaces for discrete action-space problems, DQN application’s to continous control problems proved challenging. Indeed, in the case of high-capacity function approximators such as neural networks, solving $\max_{a_t \in \mathcal A} Q_\theta(s_t, a_t)$ at each timestep is simply unfeasible due to the (1) continous nature of the action space ($\mathcal A\subset \mathbb R^n$ for some $n$) and (2) impossibility to express the find a cheap (ideally, closed-form) solution to $Q_\theta$.  @silverDeterministicPolicyGradient2014 tackle this fundamental challenge by using a *deterministic* function of the state $s_t$ as policy, $\mu_\phi(s_t) = a_t$, parametrized by $\phi$. Thus, policies can be iteratively refined updating $\phi$ along the direction:
606
  ``` math
@@ -795,8 +779,7 @@ $$
795
  \mathbb{E}_{z \sim p_\theta(\cdot \vert (o,a)_i)} \big[ \log p_\theta((o,a)_i \vert z) \big]
796
  - \text{D}_{\text{KL}}\big[ q_\theta(z \vert (o,a)_i) \Vert p(z) \big]
797
  \right) `
798
- $$
799
- The true, generally intractable posterior $p_\theta (z \vert o,a)$ prevents computing both the expectation and KL divergence terms in <a href="#eq:ELBO-intractable" data-reference-type="ref" data-reference="eq:ELBO-intractable">[eq:ELBO-intractable]</a>, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $q_\phi(z \vert o,a)$, resulting in the final, tractable ELBO objective, $\text{ELBO}_{\mathcal D}(\theta, \phi) = \sum_{i=0}^{N} \left(
800
  \mathbb{E}_{z \sim q_\phi(\cdot \vert (o,a)_i)} \big[ \log p_\theta((o,a)_i \vert z) \big]
801
  - \text{D}_{\text{KL}}\big[ q_\phi(z \vert (o,a)_i) \Vert p(z) \big]
802
  \right)
@@ -851,8 +834,7 @@ $$
851
  \mathbb{E}_{z_1 \sim q(\bullet \vert z_0)} \log p_\theta (z_0 \vert z_1) -\\
852
  \mathbb{E}_{z_{T-1} \sim q(\bullet \vert z_0)} \big[ \text{D}_{\text{KL}}(q(z_T \vert z_{T-1}) \Vert p(z_T) ) \big] - \notag\\
853
  \sum_{t=1}^{T-1} \mathbb{E}_{(z_{t-1}, z_{t+1}) \sim q(\bullet \vert z_0)} \big[ \text{D}_{\text{KL}}(q(z_t \vert z_{t-1}) \Vert p_\theta(z_t \vert z_{t-1}) ) \big], \notag`
854
- $$
855
- providing an optimization target in the form of $\max_\theta \log p_\theta (\mathcal D)$.
856
 
857
  In their seminal work on using DMs for variational inference, @hoDenoisingDiffusionProbabilistic2020 introduce major contributions regarding solving $\min_\theta -\log p_\theta(o,a)$. In particular, @hoDenoisingDiffusionProbabilistic2020 exclusively adopt a fixed *Gaussian* posterior in the form of $q(z_t \vert z_{t-1}) = \mathcal{N}(\sqrt{1-\beta_t}z_{t-1}, \beta_t \mathbf I)$. The choice of adopting Gaussians has profound implications on the generative process modeled. Indeed, under the (mild) assumption that the variance is sufficiently small $\beta_t \leq \eta, \eta \in \mathbb R^+$, @sohl-dicksteinDeepUnsupervisedLearning2015 proved that the likelihood $p(z_{t-1} \vert z_t)$ is Gaussian as well, which allows for the particularly convenient parametrization of the approximate likelihood $p_\theta (x_{t-1} \vert x_t) = \mathcal N(\mu_\theta(x_t, t), \Sigma_\theta(x_t,t)), \ t \in [1,T]$, as well as for closed-form tractability of the KL-divergence terms in <a href="#eq:diffusion-likelihood" data-reference-type="ref" data-reference="eq:diffusion-likelihood">[eq:diffusion-likelihood]</a>. Further, the posterior’s structure also enables an analytical description for the distribution of the $t$-th latent variable, $q(z_t \vert z_0) = \mathcal N (\sqrt{\bar{\alpha}_t}z_0, (1-\bar{\alpha}_t) \mathbf{I})$, with $\alpha_t = 1-\beta_t, \ \bar \alpha_t = \prod_{k=1}^t \alpha_k$, which conveniently prevents iterative posterior sampling.
858
 
@@ -891,10 +873,7 @@ By learning the total displacement from a generally, uninformative corrupted sam
891
  ### Flow Matching
892
 
893
  The posterior parametrization adopted by DMs proved traditionally effective, yet it raised concerns circa its efficiency at inference time, where a possibly large of compute-expensive denoising steps are needed in order to recover a sample from the target distribution. Flow Matching (FM) @lipmanFlowMatchingGenerative2023 extends DMs to the general case of arbitrary, parametrized likelihood and posteriors, and in this defines a superseding class of GMs providing a unified framework for learning *continuous transformations* between distributions, encompassing and generalizing DMs. Instead of a *stochastic, discrete, multi-step* denoising process, FM aims to learn a *deterministic, continuous, differentiable flow* $\psi [0,1] \times Z \mapsto Z$, formalized starting from possibly time-dependent vector field $v: [0,1] \times Z \mapsto Z$ transporting samples from a simple prior distribution $p_0$--e.g., a standard Gaussian--to a more complex, potentially unknown data distribution $p_1$ over time. Note how FM models time $t \in [0,1]$ to be varying continuously while moving away *from* an easy-to-sample distribution $p_0$ *towards* the unknown data-distribution, $p_1$. This results in a continuous and deterministic trajectory for each sample, which can be more efficient to generate compared to the stochastic paths of DMs. Formally, FM can be fully characterized by an ordinary differential equation (ODE) relating instantaneous variations of flows with the underlying vector field, and hence providing complete trajectories over the distributions’ support when integrating over time,
894
- $$
895
- `\frac{d}{dt} \psi(z, t) = v(t, \psi(t, z))\\
896
- \psi(0, z) = z`
897
- $$
898
 
899
 
900
  FM proved very effective in a variety of applications, ranging from image @esserScalingRectifiedFlow2024 and video generation @polyakMovieGenCast2025 to robotics control @blackp0VisionLanguageActionFlow2024. Most notably, in their introductory work on FM for GM, @lipmanFlowMatchingGenerative2023 show how DMs can be seen as a specific instance of FM where the *conditional* target vector field $u$ approximated by the noise regressor corresponds to
@@ -928,9 +907,7 @@ While the noising schedule of DMs results in a stochastic process that resembles
928
  caption={'Compared to diffusion, flow matching distorts distribution along a less randomic pattern, resulting in a clearer interpolation between source and target distribution. The visualization shows an example comparison between these two methods on joint distribution of robot observations and actions over T = 50 steps.'}
929
  />
930
 
931
- In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in <a href="#eq:fm-diffusion-vector-field" data-reference-type="ref" data-reference="eq:fm-diffusion-vector-field">[eq:fm-diffusion-vector-field]</a>, while in priciple the target vector field can be learned to induce a particular transportation, or fixed according to OT. Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, CFM defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, resulting in the target vector field $u(t, z_t) = z_1 - z_0$. Then, a FM model can be trained with the simple regression objective defined as $
932
- \mathcal L(\theta) = \mathbb{E}_{t, z_0, z_1} \big[
933
- \Vert v_\theta((1-t)z_0 + t z_1, t) - (z_1 - z_0) \Vert^2 \big], \quad t \sim \mathcal{U}([0,1]),$ where $z_0 \sim p_0(\bullet)$ and $z_1 \sim p_1(\bullet)$. Note how in <a href="#eq:flow-matching-objective" data-reference-type="ref" data-reference="eq:flow-matching-objective">[eq:flow-matching-objective]</a>--differently from <a href="#eq:diffusion-simplified-loss" data-reference-type="ref" data-reference="eq:diffusion-simplified-loss">[eq:diffusion-simplified-loss]</a>--time is assumed to be varying continuously $t \sim \mathcal U([0,1])$ rather than discretely $t \sim \mathcal U(\{0,1\})$, a key property of flow-based models. The objective in <a href="#eq:flow-matching-objective" data-reference-type="ref" data-reference="eq:flow-matching-objective">[eq:flow-matching-objective]</a> directly regresses the learned vector field onto the simple, straight path connecting a point from the prior and a point from the data, providing a simulation-free training procedure that is both stable and efficient. At inference time, samples are generated by starting with $z_0 \sim p_0$ and iteratively refined according to $\frac{dz}{dt} = v_\theta(z_t, t)$ for $t \in [0,1]$--an operation that can be numerically carried out with standard ODE solvers.
934
 
935
  ## Action Chunking with Transformers
936
 
@@ -1189,8 +1166,7 @@ $$
1189
  \tau \sim \mathrm{Beta}_{[0,s]}(1.5,1), \quad
1190
  \epsilon \sim \mathcal{N}(\mathbf{0}, \mathbf{I}), \quad
1191
  o_t, a_{t:t+H_a} \sim \mathcal D \notag`
1192
- $$
1193
- Where the experts parametrized by the separate weights $\phi, \theta$ interact with each other via self-attention layers only, so that the action expert $v_\theta$ internal computations also depend on the VLM backbone’s parameters $\phi$. Importantly, @blackp0VisionLanguageActionFlow2024 minimize <a href="#eq:pi0-loss" data-reference-type="ref" data-reference="eq:pi0-loss">[eq:pi0-loss]</a> over both the multimodal backbone and action expert parameters, thus updating the internal representations of the VLM using BC-specific gradients. In contrast, @driessKnowledgeInsulatingVisionLanguageAction2025 later show that failing to insulate the VLM knowledge from the flow matching gradients actually harms performance. Inference is performed iteratively refining action chunks while numerically forward-integrating the vector field predicted by the action expert,
1194
  ``` math
1195
  \begin{equation}
1196
  a_{t:t+H_a}^{\tau + \delta} = a_{t:t+H_a}^{\tau } + \delta v_\theta(a_{t:t+H_a}^{\tau }, o_t)
 
325
 
326
  In the simplified case here considered (for which $\boldsymbol{p} \equiv p$, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector’s location to reach a goal position $p^*$ by solving analytically for $q: p(q) = f_{\text{FK}}(q) = p^*$. However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $\Vert p(q) - p^* \Vert_2^2$ is a natural candidate), yielding:
327
 
328
+ $\min_{q \in \mathcal Q} \Vert p(q) - p^* \Vert_2^2 \, . $
 
329
 
330
  Exact analytical solutions to IK are even less appealing when one considers the presence of obstacles in the robot’s workspace, resulting in constraints on the possible values of $q \in \mathcal Q \subseteq [-\pi, +\pi]^n \subset \mathbb R^n$ in the general case of $n$-links robots.
331
 
 
333
 
334
  However, IK--solving eq. <a href="#eq:ik_problem" data-reference-type="ref" data-reference="eq:ik_problem">[eq:ik_problem]</a> for a feasible $q$--only proves useful in determining information regarding the robot’s configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose. Expert-defined trajectories obviate to this problem providing a length-$K$ succession of goal poses $\tau_K = [p^*_0, p^*_1, \dots p^*_K]$ for tracking. In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts. However, tracking $\tau_K$ via IK can prove prohibitively expensive, as tracking would require $K$ resolutions of eq. <a href="#eq:ik_problem" data-reference-type="ref" data-reference="eq:ik_problem">[eq:ik_problem]</a> (one for each target pose). *Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. <a href="#eq:ik_problem" data-reference-type="ref" data-reference="eq:ik_problem">[eq:ik_problem]</a>. Let $J(q)$ denote the Jacobian matrix of (partial) derivatives of the FK-function $f_\text{FK}: \mathcal Q \mapsto \mathcal P$, such that $J(q) = \frac{\partial f_{FK}(q)}{\partial q }$. Then, one can apply the chain rule to any $p(q) = f_{\text{FK}}(q)$, deriving $\dot p = J(q) \dot q$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
335
 
336
+ Given a desired end-effector trajectory $\dot {p}^*(t)$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $\dot q(t)$ solving for joints’ *velocities* instead of *configurations*, $\dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot {p}^*(t) \rVert_2^2 $
 
337
 
338
  Unlike eq. <a href="#eq:ik_problem" data-reference-type="ref" data-reference="eq:ik_problem">[eq:ik_problem]</a>, solving for $\dot q$ is much less dependent on the environment (typically, variations in velocity are constrained by physical limits on the actuators). Conveniently, eq. <a href="#eq:reg_ik_velocity" data-reference-type="ref" data-reference="eq:reg_ik_velocity">[eq:reg_ik_velocity]</a> also often admits the closed-form solution $\dot q = J(q)^+ \dot {p}^*$, where $J^+(q)$ denotes the Moore-Penrose pseudo-inverse of $J(q)$. Finally, discrete-time joint configurations $q$ can be reconstructed from joint velocities $\dot q$ using forward-integration on the continuous-time joint velocity , $q_{t+1} = q_t + \Delta t\,\dot q_t$ for a given $\Delta t$, resulting in tracking via diff-IK.
339
 
 
478
  \end{equation}
479
  ```
480
  with per-step rewards defined as $r_t = r (s_t, a_t, s_{t+1})$ for ease of notation.Interestingly, assuming both the environment dynamics and conditional distribution over actions given states--the *policy*--to be *Markovian*:
481
+ $$\mathbb P(s_{t+1}\vert s_t, a_t, s_{t-1}, a_{t-1}, \dots s_0, a_0 ) = \mathbb P (s_{t+1}\vert s_t, a_t)\\ \mathbb P(a_t\vert s_t, a_{t-1}, s_{t-1}, s_0, a_0) = \mathbb P(a_t\vert s_t) $$ The probability of observing a given trajectory $\tau$ factorizes into
 
 
 
 
482
  ``` math
483
  \begin{equation}
484
 
 
486
  \end{equation}
487
  ```
488
 
489
+ Policies $\mathbb P(a_t\vert s_t)$ are typically indicated as $\pi(a_t\vert s_t)$, and often parametrized via $\theta$, yielding $\pi_\theta (a_t\vert s_t)$. Policies are trained optimizing the (discounted) *return* associated to a given $\tau$, i.e. the (random) sum of measured rewards over trajectory: ``` math G(\tau) = \sum_{t=0}^{T-1} \gamma^{t} r_t. ``` In that, agents seek to learn control strategies (*policies*,$\pi_\theta$) maximizing the expected return $\mathbb E_{\tau \sim \pi_\theta} G(\tau)$. For a given dynamics $\mathcal D$--i.e., for a given problem--taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $\Pi$, yielding the (maximization) target $J : \Pi \mapsto \mathbb R$
 
 
 
 
490
  $$
491
  `J(\pi_\theta) = \mathbb E_{\tau \sim \mathbb P_{\theta; \mathcal D}} [G(\tau)],\\
492
  \mathbb P_{\theta; \mathcal D} (\tau) = \rho \prod_{t=0}^{T-1} \mathcal D (s_t, a_t, s_{t+1})\ \pi_\theta (a_t\vert s_t).`
 
502
  Q_\pi(s,a) = \mathbb E_{\tau \sim \pi} [G (\tau) \big \vert s_0 = s, a_0=a]
503
  ```
504
  Crucially, value functions are interrelated:
505
+ $$Q_\pi(s_t, a_t) = \mathbb{E}_{s_{t+1}\sim \mathbb P(\bullet \vert s_t, a_t)} [r_t + \gamma V_\pi(s_{t+1})]\\ V_\pi(s_t) = \mathbb E_{a_t\sim \pi(\bullet \vert s_t)} [Q_\pi (s_t, a_t)] $$ Inducing an ordering over states and state-action pairs under $\pi$, value functions are central to most RL algorithms. A variety of methods have been developed in RL as standalone attempts to find (approximate) solutions to the problem of maximizing cumulative reward (Figure <a href="#fig:rl-algos-atlas" data-reference-type="ref" data-reference="fig:rl-algos-atlas">15</a>).
 
 
 
 
 
506
 
507
  <ResponsiveImage
508
  src={ch3_rl_algorithms_atlas}
 
584
  (\underbrace{y_i - Q_{\theta_i}(s_t, a_t)}_{\delta_i})^2
585
  \big],\\
586
  y_i = \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} \big[ r_t + \gamma \max_{a_t\in \mathcal A} Q_{\theta_{i-1}} (s_{t+1}, a_{t+1}) \big], `
587
+ $$ Where $\chi$ represents a behavior distribution over state-action pairs. Crucially, $\chi$ can in principle be different from the policy being followed, effectively allowing to reuse prior data stored in a *replay buffer* in the form of $(s_t, a_t, r_t, s_{t+1})$ transitions, used to form the TD-target $y_i$, TD-error $\delta_i$ and loss function <a href="#eq:dqn-loss" data-reference-type="ref" data-reference="eq:dqn-loss">[eq:dqn-loss]</a> via Monte-Carlo (MC) estimates.
 
588
 
589
  While effective in handling large, unstructured state spaces for discrete action-space problems, DQN application’s to continous control problems proved challenging. Indeed, in the case of high-capacity function approximators such as neural networks, solving $\max_{a_t \in \mathcal A} Q_\theta(s_t, a_t)$ at each timestep is simply unfeasible due to the (1) continous nature of the action space ($\mathcal A\subset \mathbb R^n$ for some $n$) and (2) impossibility to express the find a cheap (ideally, closed-form) solution to $Q_\theta$.  @silverDeterministicPolicyGradient2014 tackle this fundamental challenge by using a *deterministic* function of the state $s_t$ as policy, $\mu_\phi(s_t) = a_t$, parametrized by $\phi$. Thus, policies can be iteratively refined updating $\phi$ along the direction:
590
  ``` math
 
779
  \mathbb{E}_{z \sim p_\theta(\cdot \vert (o,a)_i)} \big[ \log p_\theta((o,a)_i \vert z) \big]
780
  - \text{D}_{\text{KL}}\big[ q_\theta(z \vert (o,a)_i) \Vert p(z) \big]
781
  \right) `
782
+ $$ The true, generally intractable posterior $p_\theta (z \vert o,a)$ prevents computing both the expectation and KL divergence terms in <a href="#eq:ELBO-intractable" data-reference-type="ref" data-reference="eq:ELBO-intractable">[eq:ELBO-intractable]</a>, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $q_\phi(z \vert o,a)$, resulting in the final, tractable ELBO objective, $\text{ELBO}_{\mathcal D}(\theta, \phi) = \sum_{i=0}^{N} \left(
 
783
  \mathbb{E}_{z \sim q_\phi(\cdot \vert (o,a)_i)} \big[ \log p_\theta((o,a)_i \vert z) \big]
784
  - \text{D}_{\text{KL}}\big[ q_\phi(z \vert (o,a)_i) \Vert p(z) \big]
785
  \right)
 
834
  \mathbb{E}_{z_1 \sim q(\bullet \vert z_0)} \log p_\theta (z_0 \vert z_1) -\\
835
  \mathbb{E}_{z_{T-1} \sim q(\bullet \vert z_0)} \big[ \text{D}_{\text{KL}}(q(z_T \vert z_{T-1}) \Vert p(z_T) ) \big] - \notag\\
836
  \sum_{t=1}^{T-1} \mathbb{E}_{(z_{t-1}, z_{t+1}) \sim q(\bullet \vert z_0)} \big[ \text{D}_{\text{KL}}(q(z_t \vert z_{t-1}) \Vert p_\theta(z_t \vert z_{t-1}) ) \big], \notag`
837
+ $$ providing an optimization target in the form of $\max_\theta \log p_\theta (\mathcal D)$.
 
  In their seminal work on using DMs for variational inference, @hoDenoisingDiffusionProbabilistic2020 introduce major contributions regarding solving $\min_\theta -\log p_\theta(o,a)$. In particular, @hoDenoisingDiffusionProbabilistic2020 exclusively adopt a fixed *Gaussian* posterior in the form of $q(z_t \vert z_{t-1}) = \mathcal{N}(\sqrt{1-\beta_t}z_{t-1}, \beta_t \mathbf I)$. The choice of adopting Gaussians has profound implications on the generative process modeled. Indeed, under the (mild) assumption that the variance is sufficiently small $\beta_t \leq \eta, \eta \in \mathbb R^+$, @sohl-dicksteinDeepUnsupervisedLearning2015 proved that the likelihood $p(z_{t-1} \vert z_t)$ is Gaussian as well, which allows for the particularly convenient parametrization of the approximate likelihood $p_\theta (z_{t-1} \vert z_t) = \mathcal N(\mu_\theta(z_t, t), \Sigma_\theta(z_t,t)), \ t \in [1,T]$, as well as for closed-form tractability of the KL-divergence terms in <a href="#eq:diffusion-likelihood" data-reference-type="ref" data-reference="eq:diffusion-likelihood">[eq:diffusion-likelihood]</a>. Further, the posterior’s structure also enables an analytical description for the distribution of the $t$-th latent variable, $q(z_t \vert z_0) = \mathcal N (\sqrt{\bar{\alpha}_t}z_0, (1-\bar{\alpha}_t) \mathbf{I})$, with $\alpha_t = 1-\beta_t, \ \bar \alpha_t = \prod_{k=1}^t \alpha_k$, which conveniently avoids the need for iterative posterior sampling.
 
 
  ### Flow Matching

  The posterior parametrization adopted by DMs proved traditionally effective, yet it raised concerns about its efficiency at inference time, where a possibly large number of compute-expensive denoising steps are needed in order to recover a sample from the target distribution. Flow Matching (FM) @lipmanFlowMatchingGenerative2023 extends DMs to the general case of arbitrary, parametrized likelihood and posteriors, and in doing so defines a superseding class of GMs providing a unified framework for learning *continuous transformations* between distributions, encompassing and generalizing DMs. Instead of a *stochastic, discrete, multi-step* denoising process, FM aims to learn a *deterministic, continuous, differentiable flow* $\psi: [0,1] \times Z \mapsto Z$, formalized starting from a possibly time-dependent vector field $v: [0,1] \times Z \mapsto Z$ transporting samples from a simple prior distribution $p_0$--e.g., a standard Gaussian--to a more complex, potentially unknown data distribution $p_1$ over time. Note how FM models time $t \in [0,1]$ to be varying continuously while moving away *from* an easy-to-sample distribution $p_0$ *towards* the unknown data-distribution, $p_1$. This results in a continuous and deterministic trajectory for each sample, which can be more efficient to generate compared to the stochastic paths of DMs. Formally, FM can be fully characterized by an ordinary differential equation (ODE) relating instantaneous variations of flows with the underlying vector field, and hence providing complete trajectories over the distributions’ support when integrating over time,
+ $$\frac{d}{dt} \psi(t, z) = v(t, \psi(t, z)), \qquad \psi(0, z) = z$$
 
 
 
  FM proved very effective in a variety of applications, ranging from image generation @esserScalingRectifiedFlow2024 and video generation @polyakMovieGenCast2025 to robotics control @blackp0VisionLanguageActionFlow2024. Most notably, in their introductory work on FM for GM, @lipmanFlowMatchingGenerative2023 show how DMs can be seen as a specific instance of FM where the *conditional* target vector field $u$ approximated by the noise regressor corresponds to
 
  caption={'Compared to diffusion, flow matching distorts the distribution along a less random pattern, resulting in a clearer interpolation between source and target distribution. The visualization shows an example comparison between these two methods on the joint distribution of robot observations and actions over T = 50 steps.'}
  />

+ In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$. In the particular case of DMs, $u(t, z)$ is defined as in <a href="#eq:fm-diffusion-vector-field" data-reference-type="ref" data-reference="eq:fm-diffusion-vector-field">[eq:fm-diffusion-vector-field]</a>, while in principle the target vector field can be learned to induce a particular transportation, or fixed according to optimal transport (OT). Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, conditional flow matching (CFM) defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, resulting in the target vector field $u(t, z_t) = z_1 - z_0$. Then, a FM model can be trained with the simple regression objective defined as $\mathcal L(\theta) = \mathbb{E}_{t, z_0, z_1} \big[ \Vert v_\theta((1-t)z_0 + t z_1, t) - (z_1 - z_0) \Vert^2 \big], \quad t \sim \mathcal{U}([0,1]),$ where $z_0 \sim p_0(\bullet)$ and $z_1 \sim p_1(\bullet)$. Note how in <a href="#eq:flow-matching-objective" data-reference-type="ref" data-reference="eq:flow-matching-objective">[eq:flow-matching-objective]</a>--differently from <a href="#eq:diffusion-simplified-loss" data-reference-type="ref" data-reference="eq:diffusion-simplified-loss">[eq:diffusion-simplified-loss]</a>--time is assumed to be varying continuously $t \sim \mathcal U([0,1])$ rather than discretely $t \sim \mathcal U(\{0,1\})$, a key property of flow-based models. The objective in <a href="#eq:flow-matching-objective" data-reference-type="ref" data-reference="eq:flow-matching-objective">[eq:flow-matching-objective]</a> directly regresses the learned vector field onto the simple, straight path connecting a point from the prior and a point from the data, providing a simulation-free training procedure that is both stable and efficient.
At inference time, samples are generated by starting from $z_0 \sim p_0$ and iteratively refining it according to $\frac{dz}{dt} = v_\theta(z_t, t)$ for $t \in [0,1]$--an operation that can be numerically carried out with standard ODE solvers.
 
 

  ## Action Chunking with Transformers

 
  \tau \sim \mathrm{Beta}_{[0,s]}(1.5,1), \quad
  \epsilon \sim \mathcal{N}(\mathbf{0}, \mathbf{I}), \quad
  o_t, a_{t:t+H_a} \sim \mathcal D \notag
+ $$ where the experts, parametrized by the separate weights $\phi, \theta$, interact with each other via self-attention layers only, so that the action expert $v_\theta$’s internal computations also depend on the VLM backbone’s parameters $\phi$. Importantly, @blackp0VisionLanguageActionFlow2024 minimize <a href="#eq:pi0-loss" data-reference-type="ref" data-reference="eq:pi0-loss">[eq:pi0-loss]</a> over both the multimodal backbone and action expert parameters, thus updating the internal representations of the VLM using BC-specific gradients. In contrast, @driessKnowledgeInsulatingVisionLanguageAction2025 later show that failing to insulate the VLM knowledge from the flow matching gradients actually harms performance. Inference is performed by iteratively refining action chunks while numerically forward-integrating the vector field predicted by the action expert,
 
  ``` math
  \begin{equation}
  a_{t:t+H_a}^{\tau + \delta} = a_{t:t+H_a}^{\tau } + \delta v_\theta(a_{t:t+H_a}^{\tau }, o_t)
app/src/content/article.mdx CHANGED
The diff for this file is too large to render. See raw diff