thibaud frere commited on
Commit
0ec6423
·
1 Parent(s): 52bc805
app/scripts/latex-to-markdown/latex-converter.mjs CHANGED
@@ -226,7 +226,7 @@ export function convertLatexToMarkdown(inputFile, outputDir) {
226
  const mediaDir = join(outputDir, 'assets', 'image');
227
  ensureDirectory(mediaDir);
228
  const inputDir = dirname(inputFile);
229
- const pandocCommand = `pandoc "${preprocessedFile}" -f latex+latex_macros -t gfm+tex_math_dollars --wrap=none ${bibOption} --extract-media="${mediaDir}" --resource-path="${inputDir}" -o "${outputFile}"`;
230
 
231
  console.log(` Running: ${pandocCommand}`);
232
  execSync(pandocCommand, { stdio: 'pipe' });
 
226
  const mediaDir = join(outputDir, 'assets', 'image');
227
  ensureDirectory(mediaDir);
228
  const inputDir = dirname(inputFile);
229
+ const pandocCommand = `pandoc "${preprocessedFile}" -f latex+latex_macros -t gfm+tex_math_dollars --shift-heading-level-by=1 --wrap=none ${bibOption} --extract-media="${mediaDir}" --resource-path="${inputDir}" -o "${outputFile}"`;
230
 
231
  console.log(` Running: ${pandocCommand}`);
232
  execSync(pandocCommand, { stdio: 'pipe' });
app/scripts/latex-to-markdown/mdx-converter.mjs CHANGED
@@ -3,6 +3,7 @@
3
  import { readFileSync, writeFileSync, existsSync } from 'fs';
4
  import { join, dirname, basename, extname } from 'path';
5
  import { fileURLToPath } from 'url';
 
6
 
7
  const __filename = fileURLToPath(import.meta.url);
8
  const __dirname = dirname(__filename);
@@ -334,22 +335,40 @@ function transformReferenceLinks(content) {
334
  );
335
  }
336
 
 
337
  /**
338
  * Fix frontmatter and ensure proper MDX format
339
  * @param {string} content - MDX content
 
340
  * @returns {string} - Content with proper frontmatter
341
  */
342
- function ensureFrontmatter(content) {
343
  console.log(' 📄 Ensuring proper frontmatter...');
344
 
345
  if (!content.startsWith('---')) {
346
- const frontmatter = `---
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  title: "Research Article"
348
- description: "Converted from LaTeX to MDX"
349
- date: "${new Date().toISOString().split('T')[0]}"
350
  ---
351
 
352
  `;
 
 
 
353
  return frontmatter + content;
354
  }
355
 
@@ -357,41 +376,125 @@ date: "${new Date().toISOString().split('T')[0]}"
357
  }
358
 
359
  /**
360
- * Clean newlines from single-line math blocks that contain them
361
  * @param {string} content - MDX content
362
  * @returns {string} - Content with cleaned math blocks
363
  */
364
  function cleanSingleLineMathNewlines(content) {
365
- console.log(' 🔢 Cleaning newlines in single-line math blocks...');
366
 
367
  let cleanedCount = 0;
368
 
369
- // Find single dollar math blocks that contain newlines BUT are short enough to be single-line math
370
- // Use a more restrictive approach: max 200 chars and only simple newlines (not paragraph breaks)
371
- const cleanedContent = content.replace(/\$([^$]{1,200}?)\$/g, (match, mathContent) => {
372
- // Only process if:
373
- // 1. It contains newlines
374
- // 2. It's not too long (likely not a multi-paragraph match)
375
- // 3. It doesn't contain double newlines (paragraph breaks)
376
- if (mathContent.includes('\n') &&
377
- !mathContent.includes('\n\n') &&
378
- mathContent.length <= 200) {
379
-
380
  cleanedCount++;
381
 
382
- // Remove newlines and normalize whitespace, but preserve math structure
383
  const cleanedMath = mathContent
384
- .replace(/\n+/g, ' ') // Replace newlines with spaces
 
385
  .replace(/\s+/g, ' ') // Normalize multiple spaces to single
386
  .trim(); // Remove leading/trailing spaces
387
 
388
  return `$${cleanedMath}$`;
389
  }
390
- return match; // Keep original if doesn't meet criteria
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  });
392
 
393
  if (cleanedCount > 0) {
394
- console.log(` ✅ Cleaned ${cleanedCount} single-line math block(s) with newlines`);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
  }
396
 
397
  return cleanedContent;
@@ -419,9 +522,10 @@ function cleanMdxSyntax(content) {
419
  /**
420
  * Main MDX processing function that applies all transformations
421
  * @param {string} content - Raw Markdown content
 
422
  * @returns {string} - Processed MDX content compatible with Astro
423
  */
424
- function processMdxContent(content) {
425
  console.log('🔧 Processing for Astro MDX compatibility...');
426
 
427
  // Clear previous tracking
@@ -431,12 +535,15 @@ function processMdxContent(content) {
431
  let processedContent = content;
432
 
433
  // Apply each transformation step sequentially
434
- processedContent = ensureFrontmatter(processedContent);
435
  processedContent = cleanSingleLineMathNewlines(processedContent);
 
 
436
  processedContent = cleanMdxSyntax(processedContent);
437
  processedContent = transformImages(processedContent);
438
  processedContent = transformStyledSpans(processedContent);
439
  processedContent = transformReferenceLinks(processedContent);
 
440
 
441
  // Add component imports at the end
442
  processedContent = addComponentImports(processedContent);
@@ -459,8 +566,20 @@ function convertToMdx(inputFile, outputFile) {
459
  console.log('🔄 Reading Markdown file...');
460
  const markdownContent = readFileSync(inputFile, 'utf8');
461
 
 
 
 
 
 
 
 
 
 
 
 
 
462
  // Apply modular MDX processing
463
- const mdxContent = processMdxContent(markdownContent);
464
 
465
  console.log('💾 Writing MDX file...');
466
  writeFileSync(outputFile, mdxContent);
 
3
  import { readFileSync, writeFileSync, existsSync } from 'fs';
4
  import { join, dirname, basename, extname } from 'path';
5
  import { fileURLToPath } from 'url';
6
+ import { extractAndGenerateFrontmatter } from './metadata-extractor.mjs';
7
 
8
  const __filename = fileURLToPath(import.meta.url);
9
  const __dirname = dirname(__filename);
 
335
  );
336
  }
337
 
338
+
339
  /**
340
  * Fix frontmatter and ensure proper MDX format
341
  * @param {string} content - MDX content
342
+ * @param {string} latexContent - Original LaTeX content for metadata extraction
343
  * @returns {string} - Content with proper frontmatter
344
  */
345
+ function ensureFrontmatter(content, latexContent = '') {
346
  console.log(' 📄 Ensuring proper frontmatter...');
347
 
348
  if (!content.startsWith('---')) {
349
+ let frontmatter;
350
+
351
+ if (latexContent) {
352
+ // Extract metadata from LaTeX using dedicated module
353
+ frontmatter = extractAndGenerateFrontmatter(latexContent);
354
+ console.log(' ✅ Generated frontmatter from LaTeX metadata');
355
+ } else {
356
+ // Fallback frontmatter
357
+ const currentDate = new Date().toLocaleDateString('en-US', {
358
+ year: 'numeric',
359
+ month: 'short',
360
+ day: '2-digit'
361
+ });
362
+ frontmatter = `---
363
  title: "Research Article"
364
+ published: "${currentDate}"
365
+ tableOfContentsAutoCollapse: true
366
  ---
367
 
368
  `;
369
+ console.log(' ✅ Generated basic frontmatter');
370
+ }
371
+
372
  return frontmatter + content;
373
  }
374
 
 
376
  }
377
 
378
  /**
379
+ * Clean newlines from single-dollar math blocks ($...$) ONLY
380
  * @param {string} content - MDX content
381
  * @returns {string} - Content with cleaned math blocks
382
  */
383
  function cleanSingleLineMathNewlines(content) {
384
+ console.log(' 🔢 Cleaning newlines in single-dollar math blocks ($...$)...');
385
 
386
  let cleanedCount = 0;
387
 
388
+ // ULTRA STRICT: Only target single dollar blocks ($...$) that contain newlines
389
+ // Use dotall flag (s) to match newlines with .*, and ensure we don't match $$
390
+ const cleanedContent = content.replace(/\$(?!\$)([\s\S]*?)\$(?!\$)/g, (match, mathContent) => {
391
+ // Only process if the content contains newlines
392
+ if (mathContent.includes('\n')) {
 
 
 
 
 
 
393
  cleanedCount++;
394
 
395
+ // Remove ALL newlines and carriage returns, normalize whitespace
396
  const cleanedMath = mathContent
397
+ .replace(/\n+/g, ' ') // Replace all newlines with spaces
398
+ .replace(/\r+/g, ' ') // Replace carriage returns with spaces
399
  .replace(/\s+/g, ' ') // Normalize multiple spaces to single
400
  .trim(); // Remove leading/trailing spaces
401
 
402
  return `$${cleanedMath}$`;
403
  }
404
+ return match; // Keep original if no newlines
405
+ });
406
+
407
+ if (cleanedCount > 0) {
408
+ console.log(` ✅ Cleaned ${cleanedCount} single-dollar math block(s) with newlines`);
409
+ }
410
+
411
+ return cleanedContent;
412
+ }
413
+
414
+ /**
415
+ * Add proper line breaks around display math blocks ($$...$$)
416
+ * @param {string} content - MDX content
417
+ * @returns {string} - Content with properly spaced display math
418
+ */
419
+ function formatDisplayMathBlocks(content) {
420
+ console.log(' 📐 Formatting display math blocks with proper spacing...');
421
+
422
+ let formattedCount = 0;
423
+
424
+ // Find all $$...$$$ blocks (display math) and ensure proper line breaks
425
+ // Very strict: only matches exactly $$ followed by content followed by $$
426
+ const formattedContent = content.replace(/\$\$([\s\S]*?)\$\$/g, (match, mathContent) => {
427
+ formattedCount++;
428
+
429
+ // Clean up the math content - trim whitespace but preserve structure
430
+ const cleanedMath = mathContent.trim();
431
+
432
+ // Return with proper line breaks before and after
433
+ return `\n$$\n${cleanedMath}\n$$\n`;
434
+ });
435
+
436
+ if (formattedCount > 0) {
437
+ console.log(` ✅ Formatted ${formattedCount} display math block(s) with proper spacing`);
438
+ }
439
+
440
+ return formattedContent;
441
+ }
442
+
443
+ /**
444
+ * Clean newlines from figcaption content
445
+ * @param {string} content - MDX content
446
+ * @returns {string} - Content with cleaned figcaptions
447
+ */
448
+ function cleanFigcaptionNewlines(content) {
449
+ console.log(' 📝 Cleaning newlines in figcaption elements...');
450
+
451
+ let cleanedCount = 0;
452
+
453
+ // Find all <figcaption>...</figcaption> blocks and remove internal newlines
454
+ const cleanedContent = content.replace(/<figcaption([^>]*)>([\s\S]*?)<\/figcaption>/g, (match, attributes, captionContent) => {
455
+ // Only process if the content contains newlines
456
+ if (captionContent.includes('\n')) {
457
+ cleanedCount++;
458
+
459
+ // Remove newlines and normalize whitespace
460
+ const cleanedCaption = captionContent
461
+ .replace(/\n+/g, ' ') // Replace newlines with spaces
462
+ .replace(/\s+/g, ' ') // Normalize multiple spaces
463
+ .trim(); // Trim whitespace
464
+
465
+ return `<figcaption${attributes}>${cleanedCaption}</figcaption>`;
466
+ }
467
+
468
+ return match; // Return unchanged if no newlines
469
  });
470
 
471
  if (cleanedCount > 0) {
472
+ console.log(` ✅ Cleaned ${cleanedCount} figcaption element(s)`);
473
+ } else {
474
+ console.log(` ℹ️ No figcaption elements with newlines found`);
475
+ }
476
+
477
+ return cleanedContent;
478
+ }
479
+
480
+ /**
481
+ * Remove HTML comments from MDX content
482
+ * @param {string} content - MDX content
483
+ * @returns {string} - Content without HTML comments
484
+ */
485
+ function removeHtmlComments(content) {
486
+ console.log(' 🗑️ Removing HTML comments...');
487
+
488
+ let removedCount = 0;
489
+
490
+ // Remove all HTML comments <!-- ... -->
491
+ const cleanedContent = content.replace(/<!--[\s\S]*?-->/g, () => {
492
+ removedCount++;
493
+ return '';
494
+ });
495
+
496
+ if (removedCount > 0) {
497
+ console.log(` ✅ Removed ${removedCount} HTML comment(s)`);
498
  }
499
 
500
  return cleanedContent;
 
522
  /**
523
  * Main MDX processing function that applies all transformations
524
  * @param {string} content - Raw Markdown content
525
+ * @param {string} latexContent - Original LaTeX content for metadata extraction
526
  * @returns {string} - Processed MDX content compatible with Astro
527
  */
528
+ function processMdxContent(content, latexContent = '') {
529
  console.log('🔧 Processing for Astro MDX compatibility...');
530
 
531
  // Clear previous tracking
 
535
  let processedContent = content;
536
 
537
  // Apply each transformation step sequentially
538
+ processedContent = ensureFrontmatter(processedContent, latexContent);
539
  processedContent = cleanSingleLineMathNewlines(processedContent);
540
+ processedContent = formatDisplayMathBlocks(processedContent);
541
+ processedContent = removeHtmlComments(processedContent);
542
  processedContent = cleanMdxSyntax(processedContent);
543
  processedContent = transformImages(processedContent);
544
  processedContent = transformStyledSpans(processedContent);
545
  processedContent = transformReferenceLinks(processedContent);
546
+ processedContent = cleanFigcaptionNewlines(processedContent);
547
 
548
  // Add component imports at the end
549
  processedContent = addComponentImports(processedContent);
 
566
  console.log('🔄 Reading Markdown file...');
567
  const markdownContent = readFileSync(inputFile, 'utf8');
568
 
569
+ // Try to read original LaTeX file for metadata extraction
570
+ let latexContent = '';
571
+ try {
572
+ const inputDir = dirname(inputFile);
573
+ const latexFile = join(inputDir, '..', 'input', 'main.tex');
574
+ if (existsSync(latexFile)) {
575
+ latexContent = readFileSync(latexFile, 'utf8');
576
+ }
577
+ } catch (error) {
578
+ // Ignore LaTeX reading errors - we'll use fallback frontmatter
579
+ }
580
+
581
  // Apply modular MDX processing
582
+ const mdxContent = processMdxContent(markdownContent, latexContent);
583
 
584
  console.log('💾 Writing MDX file...');
585
  writeFileSync(outputFile, mdxContent);
app/scripts/latex-to-markdown/metadata-extractor.mjs ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * LaTeX Metadata Extractor
3
+ * Extracts document metadata from LaTeX files for frontmatter generation
4
+ */
5
+
6
+ /**
7
+ * Extract metadata from LaTeX content
8
+ * @param {string} latexContent - Raw LaTeX content
9
+ * @returns {object} - Extracted metadata object
10
+ */
11
+ export function extractLatexMetadata(latexContent) {
12
+ const metadata = {};
13
+
14
+ // Extract title
15
+ const titleMatch = latexContent.match(/\\title\s*\{\s*([^}]+)\s*\}/s);
16
+ if (titleMatch) {
17
+ metadata.title = titleMatch[1]
18
+ .replace(/\n/g, ' ')
19
+ .trim();
20
+ }
21
+
22
+ // Extract authors with their specific affiliations
23
+ const authors = [];
24
+ const authorMatches = latexContent.matchAll(/\\authorOne\[[^\]]*\]\{([^}]+)\}/g);
25
+
26
+ for (const match of authorMatches) {
27
+ const fullAuthorInfo = match[1];
28
+
29
+ // Determine affiliations based on macros present
30
+ const affiliations = [];
31
+ if (fullAuthorInfo.includes('\\ensps')) {
32
+ affiliations.push(1); // École Normale Supérieure
33
+ }
34
+ if (fullAuthorInfo.includes('\\hf')) {
35
+ affiliations.push(2); // Hugging Face
36
+ }
37
+
38
+ // Clean author name by removing macros
39
+ let authorName = fullAuthorInfo
40
+ .replace(/\\ensps/g, '') // Remove École macro
41
+ .replace(/\\hf/g, '') // Remove Hugging Face macro
42
+ .replace(/\s+/g, ' ') // Normalize whitespace
43
+ .trim();
44
+
45
+ // Skip empty authors or placeholder entries
46
+ if (authorName && authorName !== '...') {
47
+ authors.push({
48
+ name: authorName,
49
+ affiliations: affiliations.length > 0 ? affiliations : [2] // Default to HF if no macro
50
+ });
51
+ }
52
+ }
53
+
54
+ if (authors.length > 0) {
55
+ metadata.authors = authors;
56
+ }
57
+
58
+ // Extract affiliations - create the two distinct affiliations
59
+ metadata.affiliations = [
60
+ {
61
+ name: "École Normale Supérieure Paris-Saclay"
62
+ },
63
+ {
64
+ name: "Hugging Face"
65
+ }
66
+ ];
67
+
68
+ // Extract date if available (common LaTeX patterns)
69
+ const datePatterns = [
70
+ /\\date\s*\{([^}]+)\}/,
71
+ /\\newcommand\s*\{\\date\}\s*\{([^}]+)\}/,
72
+ ];
73
+
74
+ for (const pattern of datePatterns) {
75
+ const dateMatch = latexContent.match(pattern);
76
+ if (dateMatch) {
77
+ metadata.published = dateMatch[1].trim();
78
+ break;
79
+ }
80
+ }
81
+
82
+ // Fallback to current date if no date found
83
+ if (!metadata.published) {
84
+ metadata.published = new Date().toLocaleDateString('en-US', {
85
+ year: 'numeric',
86
+ month: 'short',
87
+ day: '2-digit'
88
+ });
89
+ }
90
+
91
+ return metadata;
92
+ }
93
+
94
+ /**
95
+ * Generate YAML frontmatter from metadata object
96
+ * @param {object} metadata - Metadata object
97
+ * @returns {string} - YAML frontmatter string
98
+ */
99
+ export function generateFrontmatter(metadata) {
100
+ let frontmatter = '---\n';
101
+
102
+ // Title
103
+ if (metadata.title) {
104
+ frontmatter += `title: "${metadata.title}"\n`;
105
+ }
106
+
107
+ // Authors
108
+ if (metadata.authors && metadata.authors.length > 0) {
109
+ frontmatter += 'authors:\n';
110
+ metadata.authors.forEach(author => {
111
+ frontmatter += ` - name: "${author.name}"\n`;
112
+ if (author.url) {
113
+ frontmatter += ` url: "${author.url}"\n`;
114
+ }
115
+ frontmatter += ` affiliations: [${author.affiliations.join(', ')}]\n`;
116
+ });
117
+ }
118
+
119
+ // Affiliations
120
+ if (metadata.affiliations && metadata.affiliations.length > 0) {
121
+ frontmatter += 'affiliations:\n';
122
+ metadata.affiliations.forEach((affiliation, index) => {
123
+ frontmatter += ` - name: "${affiliation.name}"\n`;
124
+ if (affiliation.url) {
125
+ frontmatter += ` url: "${affiliation.url}"\n`;
126
+ }
127
+ });
128
+ }
129
+
130
+ // Publication date
131
+ if (metadata.published) {
132
+ frontmatter += `published: "${metadata.published}"\n`;
133
+ }
134
+
135
+ // Additional metadata
136
+ if (metadata.doi) {
137
+ frontmatter += `doi: "${metadata.doi}"\n`;
138
+ }
139
+
140
+ if (metadata.description) {
141
+ frontmatter += `description: "${metadata.description}"\n`;
142
+ }
143
+
144
+ if (metadata.licence) {
145
+ frontmatter += `licence: >\n ${metadata.licence}\n`;
146
+ }
147
+
148
+ if (metadata.tags && metadata.tags.length > 0) {
149
+ frontmatter += 'tags:\n';
150
+ metadata.tags.forEach(tag => {
151
+ frontmatter += ` - ${tag}\n`;
152
+ });
153
+ }
154
+
155
+ // Default Astro configuration
156
+ frontmatter += 'tableOfContentsAutoCollapse: true\n';
157
+ frontmatter += '---\n\n';
158
+
159
+ return frontmatter;
160
+ }
161
+
162
+ /**
163
+ * Extract and generate frontmatter from LaTeX content
164
+ * @param {string} latexContent - Raw LaTeX content
165
+ * @returns {string} - Complete YAML frontmatter
166
+ */
167
+ export function extractAndGenerateFrontmatter(latexContent) {
168
+ const metadata = extractLatexMetadata(latexContent);
169
+ return generateFrontmatter(metadata);
170
+ }
app/scripts/latex-to-markdown/output/main.md CHANGED
@@ -1,4 +1,4 @@
1
- # Foreword
2
 
3
  Robotics is an inherently multidisciplinary field, and is now witnessing unprecedented advancements since its inception in the 1960s. Yet, more than sixty years after the debut of Unimate, robots have still not fully integrated into the rich, unstructured, and dynamic world we humans inhabit. Over the decades, numerous disciplines have shown immense promise in tackling the challenges of creating autonomous systems. This tutorial takes a clear stance in the debate on whether modern Machine Learning can play a pivotal role in the development of autonomous robot systems: we believe this to be the case.
4
 
@@ -16,7 +16,7 @@ Instead, our goal here is to provide an intuitive explanation as per why these d
16
 
17
  We sincerely hope this tutorial serves as a valuable starting point for your journey into robot learning.
18
 
19
- # Introduction
20
 
21
  <figure id="fig:figure1">
22
  <img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch1/ch1-lerobot-figure1.png" />
@@ -43,13 +43,13 @@ This tutorial serves the double purpose of providing useful references for the S
43
 
44
  Our goal with this tutorial is to provide an intuitive explanation of the reasons various disparate ideas from Machine Learning (ML) have converged and are powering the current evolution of Robotics, driving the unprecedented progress we see today. We complement our presentation of the most common and recent approaches in robot learning with practical code implementations using `lerobot`, and start here by presenting the dataset format introduced with `lerobot`.
45
 
46
- ## `LeRobotDataset`
47
 
48
  `LeRobotDataset` is a standardized dataset format designed to address the specific needs of robot learning research, and it provides a unified and convenient access to robotics data across modalities, including sensorimotor readings, multiple camera feeds and teleoperation status. `LeRobotDataset` also accommodates for storing general information regarding the data being collected, including textual descriptions of the task being performed by the teleoperator, the kind of robot used, and relevant measurement specifics like the frames per second at which the recording of both image and robot state’s streams are proceeding.
49
 
50
  In this, `LeRobotDataset` provides a unified interface for handling multi-modal, time-series data, and it is designed to seamlessly integrate with the PyTorch and Hugging Face ecosystems. `LeRobotDataset` can be easily extended by users and it is highly customizable by users, and it already supports openly available data coming from a variety of embodiments supported in `lerobot`, ranging from manipulator platforms like the SO-100 arm and ALOHA-2 setup, to real-world humanoid arm and hands, as well as entirely simulation-based datasets, and self-driving cars. This dataset format is built to be both efficient for training and flexible enough to accommodate the diverse data types encountered in robotics, while promoting reproducibility and ease of use for users.
51
 
52
- ### The dataset class design
53
 
54
  A core design choice behind `LeRobotDataset` is separating the underlying data storage from the user-facing API. This allows for efficient storage while presenting the data in an intuitive, ready-to-use format.
55
 
@@ -75,7 +75,7 @@ For scalability, and to support datasets with potentially millions of trajectori
75
 
76
  - `videos/*`: Contains the MP4 video files for all visual observation streams. Similar to the `data/` directory, the video footage from multiple episodes is concatenated into single MP4 files. This strategy significantly reduces the number of files in the dataset, which is more efficient for modern filesystems.
77
 
78
- ## Code Example: Batching a (Streaming) Dataset
79
 
80
  This section provides an overview of how to access datasets hosted on Hugging Face using the `LeRobotDataset` class. Every dataset on the Hugging Face Hub containing the three main pillars presented above (Tabular, Visual and relational Metadata), and can be assessed with a single instruction.
81
 
@@ -142,7 +142,7 @@ for epoch in range(num_epochs):
142
 
143
  </div>
144
 
145
- # Classical Robotics
146
 
147
  <div class="epigraph">
148
 
@@ -158,7 +158,7 @@ TL;DR Learning-based approaches to robotics are motivated by the need to (1) gen
158
 
159
  </div>
160
 
161
- ## Explicit and Implicit Models
162
 
163
  <figure id="fig:generating-motion-atlas">
164
  <img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch2/ch2-approaches.png" style="width:50.0%" />
@@ -169,7 +169,7 @@ Robotics is concerned with producing artificial motion in the physical world in
169
 
170
  Methods to produce robotics motion range from traditional *explicit* models--<span style="color: hf2">dynamics-based</span>[^1] methods, leveraging precise descriptions of the mechanics of robots’ rigid bodies and their interactions with eventual obstacles in the environment--to *implicit* models--<span style="color: hf2">learning-based</span> methods, treating artificial motion as a statistical pattern to learn given multiple sensorimotor readings @agrawalComputationalSensorimotorLearning, @bekrisStateRobotMotion2024. A variety of methods have been developed between these two extrema. For instance,  @hansenTemporalDifferenceLearning2022 show how learning-based systems can benefit from information on the physics of problems, complementing a traditional learning method such as Temporal Difference (TD)-learning @suttonReinforcementLearningIntroduction2018 with Model-Predictive Control (MPC). Conversely, as explicit models may be relying on assumptions proving overly simplistic--or even unrealistic--in practice, learning can prove effective to improve modeling of complex phenomena or complement perception @mccormacSemanticFusionDense3D2016. Such examples aim at demonstrating the richness of approaches to robotics, and Figure <a href="#fig:generating-motion-atlas" data-reference-type="ref" data-reference="fig:generating-motion-atlas">2</a> graphically illustrates some of the most relevant techniques. Such a list is clearly far from being exhaustive, and we refer to @bekrisStateRobotMotion2024 for a more comprehensive overview of both general and application-specific methods for motion generation. In this section, we wish to introduce the inherent benefits of <span style="color: hf2">learning-based approaches to robotics</span>--the core focus on this tutorial.
171
 
172
- ## Different Types of Motion
173
 
174
  <figure id="fig:robotics-platforms-atlas">
175
  <img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch2/ch2-platforms.png" style="width:70.0%" />
@@ -182,7 +182,7 @@ Effects such as (1) are typically achieved *through* the robot, i.e. generating
182
 
183
  The traditional body of work developed since the very inception of robotics is increasingly complemented by learning-based approaches. ML has indeed proven particularly transformative across the entire robotics stack, first empowering planning-based techniques with improved state estimation used for traditional planning @tangPerceptionNavigationAutonomous2023 and then end-to-end replacing controllers, effectively yielding perception-to-action methods @koberReinforcementLearningRobotics. Work in producing robots capable of navigating a diverse set of terrains demonstrated the premise of both dynamics and learning-based approaches for locomotion @griffinWalkingStabilizationUsing2017, @jiDribbleBotDynamicLegged2023, @leeLearningQuadrupedalLocomotion2020, @margolisRapidLocomotionReinforcement2022, and recent works on whole-body control indicated the premise of learning-based approaches to generate rich motion on complex robots, including humanoids @zhangWoCoCoLearningWholeBody2024, @bjorckGR00TN1Open2025. Manipulation has also been widely studied, particularly considering its relevance for many impactful use-cases ranging from high-risk applications for humans @fujitaDevelopmentRobotsNuclear2020, @alizadehComprehensiveSurveySpace2024 to manufacturing @sannemanStateIndustrialRobotics2020. While explicit models have proven fundamental in achieving important milestones towards the development of modern robotics, recent works leveraging implicit models proved particularly promising in surpassing scalability and applicability challenges via learning @koberReinforcementLearningRobotics.
184
 
185
- ## Example: Planar Manipulation
186
 
187
  Robot manipulators typically consist of a series of links and joints, articulated in a chain finally connected to an *end-effector*. Actuated joints are considered responsible for generating motion of the links, while the end effector is instead used to perform specific actions at the target location (e.g., grasping/releasing objects via closing/opening a gripper end-effector, using a specialized tool like a screwdriver, etc.).
188
 
@@ -258,7 +258,7 @@ Unlike eq. <a href="#eq:ik_problem" data-reference-type="ref" data-reference="
258
 
259
  Following trajectories with diff-IK is a valid option in well-controlled and static environments (e.g., industrial manipulators in controlled manufacturing settings), and relies on the ability to define a set of target velocities to track $`[\dot {p}^*_0, \dot {p}^*_1, \dots, \dot {p}^*_k ]`$--an error-prone task largely requiring human expertise. Furthermore, diff-IK relies on the ability to (1) access $`J(q) \, \forall q \in \mathcal Q`$ and (2) compute its pseudo-inverse at every iteration of a given control cycle--a challenging assumption in highly dynamical settings, or for complex kinematic chains.
260
 
261
- ### Adding Feedback Loops
262
 
263
  While very effective when a goal trajectory has been well specified, the performance of diff-IK can degrade significantly in the presence of modeling/tracking errors, or in the presence of non-modeled dynamics in the environment.
264
 
@@ -278,7 +278,7 @@ More advanced techniques for control consisting in feedback linearization, PID c
278
 
279
  We point the interested reader to , , and  for extended coverage of FK, IK, diff-IK and control for (diff-)IK.
280
 
281
- ## Limitations of Dynamics-based Robotics
282
 
283
  Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular applicative problem.
284
 
@@ -297,7 +297,7 @@ Lastly, dynamics-based methods (naturally) overlook the rather recent <span styl
297
 
298
  Taken together, these limitations (Figure <a href="#fig:classical-limitations" data-reference-type="ref" data-reference="fig:classical-limitations">10</a>) motivate the exploration of learning-based approaches that can (1) integrate perception and control more tightly, (2) adapt across tasks and embodiments with reduced expert modeling interventions and (3) scale gracefully in performance as more robotics data becomes available.
299
 
300
- # Robot (Reinforcement) Learning
301
 
302
  <div class="epigraph">
303
 
@@ -338,7 +338,7 @@ Figure <a href="#fig:robot-learning-atlas" data-reference-type="ref" data-refer
338
 
339
  Applications of RL to robotics have been long studied, to the point the relationship between these two disciplines has been compared to that between physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their interactive and sequential nature, many robotics problems can be directly mapped to RL problems. Figure <a href="#fig:robotics-with-rl-examples" data-reference-type="ref" data-reference="fig:robotics-with-rl-examples">13</a> depicts two of such cases. Reaching for an object to move somewhere else in the scene is indeed a sequential problem where at each cycle the controller needs to adjust the position of the robotic arm based on its current configuration and the (possibly varying) position of the object. Figure <a href="#fig:robotics-with-rl-examples" data-reference-type="ref" data-reference="fig:robotics-with-rl-examples">13</a> also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation. While sliding to the side, the controller has to constantly keep adjusting to the robot’s proprioception to avoid failure (falling).
340
 
341
- ## A (Concise) Introduction to RL
342
 
343
  The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to model robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) learning how to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents can improve via trial-and-error only, thus entirely bypassing the need to develop explicit models of the problem dynamics, and rather exploiting interaction data only. In RL, this feedback loop (Figure <a href="#fig:rl-most-famous-pic" data-reference-type="ref" data-reference="fig:rl-most-famous-pic">14</a>) between actions and outcomes is established through the agent sensing a scalar quantity (*reward*).
344
 
@@ -415,7 +415,7 @@ $$
415
 
416
  Popular approaches to continuous state and action space--such as those studied within robotics--include @schulmanTrustRegionPolicy2017, @schulmanProximalPolicyOptimization2017, @haarnojaSoftActorCriticOffPolicy2018. Across manipulation @akkayaSolvingRubiksCube2019 and locomotion @leeLearningQuadrupedalLocomotion2020 problems, RL proved extremely effective in providing a platform to (1) adopt a unified, streamlined perception-to-action pipeline, (2) natively integrate proprioception with multi-modal high-dimensional sensor streams, (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2024.
417
 
418
- ## Real-world RL for Robotics
419
 
420
  Streamlined end-to-end control pipelines, data-driven feature extraction and a disregard for explicit modeling in favor of interaction data are all features of RL for robotics. However, particularly in the context of real-world robotics, RL still suffers from limitations concerning machine safety and learning efficiency.
421
 
@@ -447,7 +447,7 @@ To make the most of (1) the growing number of openly available datasets and (2)
447
 
448
  Off-policy algorithms like Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 tend to be more sample efficient than their on-policy counterpart @schulmanProximalPolicyOptimization2017, due to the presence of a *replay buffer* used over the course of the training. Other than allowing to re-use transitions $`(s_t, a_t, r_t, s_{t+1})`$ over the course of training, the replay buffer can also accommodate for the injection of previously-collected data in the training process @ballEfficientOnlineReinforcement2023. Using expert demonstrations to guide learning together with learned rewards, RL training can effectively be carried out in the real-world @luoSERLSoftwareSuite2025. Interestingly, when completed with in-training human interventions, real-world RL agents have been shown to learn policies with near-perfect success rates on challenging manipulation tasks in 1-2 hours @luoPreciseDexterousRobotic2024.
449
 
450
- #### Sample-efficient RL
451
 
452
  In an MDP, the optimal policy $`\pi^*`$ can be derived from its associated $`Q`$-function, $`Q_{\pi^*}`$, and in particular the optimal action(s) $`\mu(s_t)`$ can be selected maximizing the optimal $`Q`$-function over the action space,
453
  ``` math
@@ -508,13 +508,13 @@ Similarily to DDPG, SAC also maintains an explicit policy, trained under the sam
508
  ```
509
  The update rule provided in <a href="#eq:sac-policy-update" data-reference-type="ref" data-reference="eq:sac-policy-update">[eq:sac-policy-update]</a> optimizes the policy while projecting it on a set $`\Pi`$ of tractable distributions (e.g., Gaussians, @haarnojaReinforcementLearningDeep2017).
510
 
511
- #### Sample-efficient, data-driven RL
512
 
513
  Importantly, sampling $`(s_t, a_t, r_t, s_{t+1})`$ from the replay buffer $`D`$ conveniently allows to approximate the previously introduced expectations for TD-target and TD-error through Monte-Carlo (MC) estimates. The replay buffer $`D`$ also proves extremely useful in maintaining a history of previous transitions and using it for training, improving on sample efficiency. Furthermore, it also naturally provides an entry point to inject offline trajectories recorded, for instance, by a human demonstrator, into the training process.
514
 
515
  Reinforcement Learning with Prior Data (RLPD) @ballEfficientOnlineReinforcement2023 is an Offline-to-Online RL algorithm leveraging prior data to effectively accelerate the training of a SAC agent. Unlike previous works on Offline-to-Online RL, RLPD avoids any pre-training and instead uses the available offline data $`D_\text{offline}`$ to improve online-learning from scratch. During each training step, transitions from both the offline and online replay buffers are sampled in equal proportion, and used in the underlying SAC routine.
516
 
517
- #### Sample-efficient, data-driven, real-world RL
518
 
519
  Despite the possibility to leverage offline data for learning, the effectiveness of real-world RL training is still limited by the need to define a task-specific, hard-to-define reward function. Further, even assuming to have access to a well-defined reward function, typical robotics pipelines rely mostly on proprioceptive inputs augmented by camera streams of the environment. As such, even well-defined rewards would need to be derived from processed representations of unstructured observations, introducing brittleness. In their technical report, @luoSERLSoftwareSuite2025 empirically address the needs (1) to define a reward function and (2) to use it on image observations, by introducing a series of tools to allow for streamlined training of *reward classifiers* $`c`$, as well as jointly learn forward-backward controllers to speed up real-world RL. Reward classifiers are particularly useful in treating complex tasks--e.g., folding a t-shirt--for which a precise reward formulation is arbitrarily complex to obtain, or that do require significant shaping and are more easily learned directly from demonstrations of success ($`e^+`$) or failure ($`e^-`$) states, $`s \in \mathcal S`$, with a natural choice for the state-conditioned reward function $`r: \mathcal S \mapsto \mathbb R`$ being $`r(s) = \log c(e^+ \vert s)`$. Further, @luoSERLSoftwareSuite2025 demonstrate the benefits of learning *forward* (executing the task from initial state to completion) and *backward* (resetting the environment to the initial state from completion) controllers, parametrized by separate policies.
520
 
@@ -529,11 +529,11 @@ Building on off-policy deep Q-learning with replay buffers, entropy regularizati
529
 
530
  Human in the Loop Sample Efficient Robot reinforcement Learning (HIL-SERL) @luoPreciseDexterousRobotic2024 augments offline-to-online RL with targeted human corrections during training, and employs prior data to (1) train a reward classifier and (2) bootstrap RL training on expert trajectories. While demonstrations provide the initial dataset seeding learning and constraining early exploration, interactive corrections allow a human supervisor to intervene on failure modes and supply targeted interventions to aid the learning process. Crucially, human interventions are stored in both the offline and online replay buffers, differently from the autonomous transitions generated at training time and stored in the online buffer only. Consequently, given an intervention timestep $`k \in (0, T)`$, length-$`K`$ human intervention data $`\{ s^{\text{human}}_k, a^{\text{human}}_k, r^{\text{human}}_k, s^{\text{human}}_{k+1},\}_{k=1}^K`$ is more likely to be sampled for off-policy learning than the data generated online during training, providing stronger supervision to the agent while still allowing for autonomous learning. Empirically, HIL-SERL attains near-perfect success rates on diverse manipulation tasks within 1-2 hours of training @luoPreciseDexterousRobotic2024, underscoring how offline datasets with online RL can markedly improve stability and data efficiency, and ultimately even allow real-world RL-training.
531
 
532
- ### Code Example: Real-world RL
533
 
534
  **TODO(fracapuano): work out rl training example**
535
 
536
- ### Limitations of RL in Real-World Robotics: Simulators and Reward Design
537
 
538
  Despite the advancements in real-world RL training, solving robotics training RL agents in the real world still suffers from the following limitations:
539
 
@@ -543,7 +543,7 @@ Despite the advancements in real-world RL training, solving robotics training RL
543
 
544
  Advances in Behavioral Cloning (BC) from corpora of human demonstrations address both of these concerns. By learning in a supervised fashion to reproduce expert demonstrations, BC methods prove competitive while bypassing the need for simulated environments and hard-to-define reward functions.
545
 
546
- # Robot (Imitation) Learning
547
 
548
  <div class="epigraph">
549
 
@@ -593,11 +593,11 @@ Despite the inherent challenges of learning on non-i.i.d. data, the BC formulati
593
 
594
  While conceptually elegant, point-estimate policies $`f : \mathcal O\mapsto \mathcal A`$ learned by solving <a href="#eq:loss-minimization-SL" data-reference-type="ref" data-reference="eq:loss-minimization-SL">[eq:loss-minimization-SL]</a> have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure <a href="#fig:ch4-issues-with-bc" data-reference-type="ref" data-reference="fig:ch4-issues-with-bc">21</a> illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $`\mathcal D`$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $`\epsilon`$-prediction errors $`0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon`$ can quickly drive the policy into out-of-distribution states, incurring in less confident generations and thus errors compounding (Figure <a href="#fig:ch4-issues-with-bc" data-reference-type="ref" data-reference="fig:ch4-issues-with-bc">21</a>, left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving robotics problems, since multiple trajectories can be equally as good towards the accomplishment of a goal (e.g., symmetric grasps, Figure <a href="#fig:ch4-issues-with-bc" data-reference-type="ref" data-reference="fig:ch4-issues-with-bc">21</a>, right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the generative model $`p(o, a)`$ underlying the samples in $`\mathcal D`$, rather than explicitly learning a prediction function $`f(o) = a`$.
595
 
596
- ## A (Concise) Introduction to Generative Models
597
 
598
  Generative Models (GMs) aim to learn the stochastic process underlying the very generation of the data collected, and typically do so by fitting a probability distribution that approximates the unknown *data distribution*, $`p`$. In the case of BC, this unknown data distribution $`p`$ represents the expert’s joint distribution over $`(o, a)`$-pairs. Thus, given a finite set of $`N`$ pairs $`\mathcal D = \{ (o,a)_i \}_{i=0}^N`$ used as an imitation learning target (and thus assumed to be i.i.d.), GM seeks to learn a *parametric* distribution $`p_\theta(o,a)`$ such that (1) new samples $`(o,a) \sim p_\theta(\bullet)`$ resemble those stored in $`\mathcal D`$, and (2) high likelihood is assigned to the observed regions of the unobservable $`p`$. Likelihood-based learning provides a principled training objective to achieve both objectives, and it is thus extensively used in GM @prince2023understanding.
599
 
600
- ### Variational Auto-Encoders
601
 
602
  <figure id="fig:ch4-task-effect-on-pairs">
603
  <img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch4/ch4-task-effect-on-pairs.png" style="width:80.0%" />
@@ -668,7 +668,7 @@ Assuming $`p_\theta(o,a \vert z)`$ is parametrized as an isotropic Gaussian dist
668
  ```
669
  Indeed, it is very common in practice to approximate from the learned likelihood $`p_\theta(o,a \vert z)`$ as a parametric distribution (e.g. Gaussians) parametrized by some learned vector of coefficients derived from $`\mu_\theta (z), \ z \sim p (\bullet)`$. In all such cases, learning a VAE corresponds to optimally *reconstructing* the examples in $`\mathcal D`$ by minimizing the L2-error--a very common *supervised learning* objective for regression targets--while regularizing the information compression into the latent, as under the common modeling choice $`p(z) = \mathcal N (\mathbf{0}, \mathbf{I})`$ <a href="#eq:VAE-Lreg" data-reference-type="ref" data-reference="eq:VAE-Lreg">[eq:VAE-Lreg]</a> regularizes the posterior limiting the expressivity of $`q_\phi(z\vert o,a)`$.
670
 
671
- ### Diffusion Models
672
 
673
  VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#eq:BC-latent-variable" data-reference-type="ref" data-reference="eq:BC-latent-variable">[eq:BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $`p_\theta`$ and (approximate) posterior $`q_\phi`$ for such model. In that, the unknown data distribution $`p(o,a)`$ is effectively approximated via $`\int_Z p(z) p_\theta(o,a \vert z)`$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $`p(o,a)`$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $`o,a`$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#fig:ch4-many-latents" data-reference-type="ref" data-reference="fig:ch4-many-latents">24</a>), resulting in
674
  $$
@@ -719,7 +719,7 @@ In this simplified (minimization) objective, the optimization process differs fr
719
 
720
  By learning the total displacement from a generally uninformative, corrupted sample obtained diffusing information and a sample from an unknown distribution--significant ($`\Vert \epsilon \Vert > 0`$) whenever input and target distribution are sufficiently different-- @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution reversing the displacement, *denoising* samples. Interestingly, under the hypothesis real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-manifold point (such as perturbed, uninformative samples), and the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $`p_\theta (z_{t-1} \vert z_t)`$ is Gaussian, then sampling $`z_{t-1} \sim p_\theta(\bullet \vert z_{t})`$ corresponds to computing $`z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}), `$ thus showing that the lower-level latent variables in a DM can be obtained by iteratively removing noise from the one-step higher order variable, using the noise regressor $`\epsilon_\theta(z_t, t)`$ learned minimizing <a href="#eq:diffusion-simplified-loss" data-reference-type="ref" data-reference="eq:diffusion-simplified-loss">[eq:diffusion-simplified-loss]</a>.
721
 
722
- ### Flow Matching
723
 
724
  The posterior parametrization adopted by DMs proved traditionally effective, yet it raised concerns regarding its efficiency at inference time, where a possibly large number of compute-expensive denoising steps are needed in order to recover a sample from the target distribution. Flow Matching (FM) @lipmanFlowMatchingGenerative2023 extends DMs to the general case of arbitrary, parametrized likelihood and posteriors, and in this defines a superseding class of GMs providing a unified framework for learning *continuous transformations* between distributions, encompassing and generalizing DMs. Instead of a *stochastic, discrete, multi-step* denoising process, FM aims to learn a *deterministic, continuous, differentiable flow* $`\psi: [0,1] \times Z \mapsto Z`$, formalized starting from a possibly time-dependent vector field $`v: [0,1] \times Z \mapsto Z`$ transporting samples from a simple prior distribution $`p_0`$--e.g., a standard Gaussian--to a more complex, potentially unknown data distribution $`p_1`$ over time. Note how FM models time $`t \in [0,1]`$ to be varying continuously while moving away *from* an easy-to-sample distribution $`p_0`$ *towards* the unknown data-distribution, $`p_1`$. This results in a continuous and deterministic trajectory for each sample, which can be more efficient to generate compared to the stochastic paths of DMs. Formally, FM can be fully characterized by an ordinary differential equation (ODE) relating instantaneous variations of flows with the underlying vector field, and hence providing complete trajectories over the distributions’ support when integrating over time,
725
  $$
@@ -753,7 +753,7 @@ In practice, FM can be applied to generative modeling by learning a vector field
753
  \mathcal L(\theta) = \mathbb{E}_{t, z_0, z_1} \big[
754
  \Vert v_\theta((1-t)z_0 + t z_1, t) - (z_1 - z_0) \Vert^2 \big], \quad t \sim \mathcal{U}([0,1]),`$ where $`z_0 \sim p_0(\bullet)`$ and $`z_1 \sim p_1(\bullet)`$. Note how in <a href="#eq:flow-matching-objective" data-reference-type="ref" data-reference="eq:flow-matching-objective">[eq:flow-matching-objective]</a>--differently from <a href="#eq:diffusion-simplified-loss" data-reference-type="ref" data-reference="eq:diffusion-simplified-loss">[eq:diffusion-simplified-loss]</a>--time is assumed to be varying continuously $`t \sim \mathcal U([0,1])`$ rather than discretely $`t \sim \mathcal U(\{0,1\})`$, a key property of flow-based models. The objective in <a href="#eq:flow-matching-objective" data-reference-type="ref" data-reference="eq:flow-matching-objective">[eq:flow-matching-objective]</a> directly regresses the learned vector field onto the simple, straight path connecting a point from the prior and a point from the data, providing a simulation-free training procedure that is both stable and efficient. At inference time, samples are generated by starting with $`z_0 \sim p_0`$ and iteratively refined according to $`\frac{dz}{dt} = v_\theta(z_t, t)`$ for $`t \in [0,1]`$--an operation that can be numerically carried out with standard ODE solvers.
755
 
756
- ## Action Chunking with Transformers
757
 
758
  While GMs prove useful in learning complex, high-dimensional multi-modal distributions, they do not natively address the compounding errors problem characteristic of online, sequential predictions. In Action Chunking with Transformers (ACT), @zhaoLearningFineGrainedBimanual2023 present an application of VAEs to the problem of learning purely from offline trajectories, and introduce a simple, yet effective method to mitigate error compounding, learning high-fidelity autonomous behaviors. Drawing inspiration from how humans plan to atomically enact sequences of the kind $`a_{t:t+k}`$ instead of single actions $`a_t`$, @zhaoLearningFineGrainedBimanual2023 propose learning a GM on a dataset of input demonstrations by modeling *action chunks*. Besides contributions to learning high-performance autonomous behaviors, @zhaoLearningFineGrainedBimanual2023 also introduce hardware contributions in the form of a low-cost bimanual robot setup (ALOHA) capable of performing fine-grained manipulation tasks, such as opening a lid, slotting a battery in its allotment or even preparing tape for application.
759
 
@@ -786,9 +786,9 @@ However, the authors claim using a deterministic procedure to derive $`z`$ may b
786
  <figcaption>The CVAE decoder used in ACT, comprising of a full encoder-decoder Transformer architecture. Camera observations from all <span class="math inline"><em>n</em></span> camera views are first embedded using pre-trained visual encoders, and then concatenated to the corresponding positional embeddings. Then, alongside embeddings for the proprioperceptive information available and the style variable <span class="math inline"><em>z</em></span> retrieved from the CVAE encoder, the Transformer encoder shares the matrices <span class="math inline"><em>K</em>, <em>Q</em></span> with the Transformer decoder, trained to decode fixed position embeddings into action valid chunks.</figcaption>
787
  </figure>
788
 
789
- ### Code Example: Learning ACT
790
 
791
- ## Diffusion Policy
792
 
793
  DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $`p(o,a)`$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $`p_\theta(o)`$ given $`p_\theta(o,a)`$, DP’s rationale for modeling the data distribution via $`p_\theta(a \vert o)`$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
794
 
@@ -811,9 +811,9 @@ Figure <a href="#fig:diffusion-policy-architecture" data-reference-type="ref" d
811
 
812
  Training using 50-150 demos (15-60 minutes of teleoperation data) DP achieves strong performance on a variety of simulated and real-world tasks, including dexterous and deformable manipulation tasks such as sauce pouring and mat unrolling. Notably, the authors ablated the relevance of using RGB camera streams as input to their policy, and observed how high frame-rate visual observations can be used to attain performance (measured as success rate) comparable to that of state-based policies, typically trained in simulation with privileged information not directly available in real-world deployments. As high-frame rate RGB inputs naturally accommodate for dynamic, fast changing environments, @chiDiffusionPolicyVisuomotor2024’s conclusion offers significant evidence for learning streamlined control policies directly from pixels. In their work, @chiDiffusionPolicyVisuomotor2024 also ablate the performance of DP against their baseline against the size of the dataset collected, showing that DP outperforms the considered baseline for every benchmark size considered. Further, to accelerate inference, @chiDiffusionPolicyVisuomotor2024 employ Denoising Diffusion Implicit Models @songDenoisingDiffusionImplicit2022, a variant of Denoising Diffusion Probabilistic Models @hoDenoisingDiffusionProbabilistic2020 (DDPM) adopting a strictly deterministic denoising paradigm (differently from DDPM’s natively stochastic one) inducing the same final distribution as DDPM’s, and yet resulting in 10 times fewer denoising steps at inference time @chiDiffusionPolicyVisuomotor2024. 
Across a range of simulated and real-world tasks, @chiDiffusionPolicyVisuomotor2024 find DPs particularly performant when implementing a transformer-based network as $`\epsilon_\theta`$, although the authors note the increased sensitivity of transformer networks to hyperparameters and thus explicitly recommend starting out with a simpler, convolution-based architecture for diffusion (Figure <a href="#fig:diffusion-policy-architecture" data-reference-type="ref" data-reference="fig:diffusion-policy-architecture">32</a>), which are however reported to be biased towards learning low-frequency components @tancikFourierFeaturesLet2020 and thus may prove more challenging to train with non-smooth action sequences.
813
 
814
- ### Code Example: Learning Diffusion Policies
815
 
816
- ## Optimized Inference
817
 
818
  Modern visuomotor policies output *action chunks*--sequences $`\pi(o_t) = \mathbf{A}_t`$ with $`\mathbf{A}_t = \bigl(a_t,a_{t+1},\dots,a_{t+H_a}\bigr)`$ being a sequence of $`H_a \gg 1`$ low-level commands enqueued in an action queue, originating from an environment observation, $`o_t`$. Predicting series of actions instead of single commands proved essential in learning complex, multi-modal behavior @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024.
819
 
@@ -844,7 +844,7 @@ $`\mathbf{A}_{t+1} \gets \mathbf{A}_t`$
844
 
845
  </div>
846
 
847
- #### Implementation details
848
 
849
  *Async* inference (1) tightens the control loop by capturing observations more often, directly eliminates idle gaps at runtime, and (2) directly allows to run inference on more powerful computational resources than the ones typically available onboard autonomous robotic platforms.
850
 
@@ -869,9 +869,9 @@ Interestingly, the behavior of async inference can be studied analytically. Firs
869
 
870
  <a href="#fig:ch4-queues" data-reference-type="ref" data-reference="fig:ch4-queues">34</a> emphasizes the trade-off governed by $`g`$: small values result in idle periods, whereas $`g\approx 1`$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $`g\in(0,1)`$ allows to strike a balance between reactivity and resource budgets. If not for the aforementioned similarity filter, the system would send observations for processing every $`(1 - g) H_a \cdot \Delta t`$ seconds, receiving a new chunk of actions every $`(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]`$, on average. The presence of the observation similarity filter dilates this processing time, and serves the scope of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. In particular, <a href="#fig:ch4-queues" data-reference-type="ref" data-reference="fig:ch4-queues">34</a> results in a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in <a href="#fig:ch4-queues" data-reference-type="ref" data-reference="fig:ch4-queues">34</a> highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue runs empty.
871
 
872
- ### Code Example: Using Async Inference
873
 
874
- # Generalist Robot Policies
875
 
876
  <div class="epigraph">
877
 
@@ -894,7 +894,7 @@ The advent of large models trained on internet-scale datasets has drastically in
894
  <figcaption>Fields within ML such as Computer Vision and NLP converged on the development of foundation models, trained on a variety of large scale models and capable to perform multiple downstream tasks (top). Conversely, robotics suffered from limited standardization in terms of the architectures used, and siloed, task specific datasets, incurring in a high degree of fragmentation which traditionally hindered the development of generalist models for robotics in favour of task-specific models (bottom).</figcaption>
895
  </figure>
896
 
897
- ## Preliminaries: Models and Data
898
 
899
  The remarkable success of foundation models in NLP and CV is predicated on two core principles: architectural innovation and joint data-compute scaling. The transformer architecture proved instrumental in capturing long-range dependencies in sequential data such as text, and its stability and expressivity made it the *de facto* standard for modern large-scale models trained on internet-scale amounts of data. In stark contrast with popular NLP @raffelExploringLimitsTransfer2023 and CV @ImageNet_VSS09 general-purpose datasets, the field of robotics has historically developed around task-specific datasets which hinders scalability across problems, resulting in a concrete data deficit for general-purpose robot learning. Unlike the wealth of relatively readily available text and images on the internet, robotics data is intrinsically embodied--datasets collected for a manipulation robot typically differ entirely from locomotion datasets. Further, datasets consisting of expert demonstrations are (1) intrinsically expensive to collect (2) and notoriously heterogeneous--different human experts may perform the same task optimally yet in very different ways. In particular, since each expert trajectory is tied to a specific robot platform and the operating conditions of its environment and task, data heterogeneity has long posed a *methodological* challenge for scaling robotics datasets via aggregation. Beyond this, heterogeneity also raises *conceptual* issues: naively mixing data across embodiments can induce negative transfer, as control strategies developed in isolation for different robot systems in different environments may even conflict when combined. 
Thus, the high degree of fragmentation of robotics datasets and tasks has traditionally led to the development of *specialist* policies, trained on small, task-specific datasets, and which excel at their designated task but fail to generalize to new situations (Figure <a href="#fig:ch5-ml-vs-robotics-foundation" data-reference-type="ref" data-reference="fig:ch5-ml-vs-robotics-foundation">35</a>).
900
 
@@ -918,19 +918,19 @@ The success of large, proprietary models like RT-1 and RT-2, highlighted a growi
918
 
919
  Figure <a href="#fig:ch5-trends" data-reference-type="ref" data-reference="fig:ch5-trends">37</a> illustrates graphically the two most relevant trends in modern robot learning. As datasets collected via centralized, cross-institutions cooperation of increasing size are made available for the research community, decentralized datasets collected by individual researchers and practitioners have also gained traction recently, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are also becoming much more compute-efficient, and as a result the models’ size has been consistently reducing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments.
920
 
921
- ## Modern VLAs
922
 
923
  Modern recipes to train large scale VLAs extend early efforts to learn foundation models from large amounts of data via BC, introducing significant advancements concerning both architectural and procedural aspects. From an architectural perspective, modern VLAs such as $`\pi_0`$ @blackp0VisionLanguageActionFlow2024 leverage a *unified transformer model* for efficiency of computation, while maintaining specialized sub-components within the model for visual perception and action prediction, enabling cross-task performance via language conditioning. Crucially, modern VLAs including @blackp0VisionLanguageActionFlow2024\[$`\pi_0`$\] and @shukorSmolVLAVisionLanguageActionModel2025\[SmolVLA\] adopt *unified* transformer models employing disjoint set of weights (*experts*) for compute-efficient visual-semantic understanding and robotic control. Procedurally, modern VLAs complement advanced Vision-Language Model (VLM) backbones with action-specific modules (1) adopting mid-sized *action experts* to model continuous actions distributions $`p (a_{t:t+H_a} \vert o_t)`$--avoiding discrete action tokens entirely--and (2) relying on *action chunking*  as a strategy to reduce error compounding when predicting multiple actions learning from inherently non-i.i.d. data, such as demonstration data.
924
 
925
  These architectural and procedural innovations present three benefits. First, developing architectures that exploit internet-scale pre-trained backbones allows models to fully capitalize on the vast world knowledge and skills state-of-the-art VLMs exhibit, preventing models from needing to learn visual, linguistic and semantic concepts from scratch. Second, using generative models for continuous action distributions allows learning rich, multimodal data distributions, a much more likely scenario in the big-data regime typically tackled while developing generalist policies. Third, introducing two separate components for perception and action planning could enable using Mixture of Experts (MoE) architectures @fedusReviewSparseExpert2022, more efficient to run and thus resulting in faster inference--a key feature for models deployed in real-world scenarios. This new paradigm has been at the core of some of the most capable generalist policies developed to date, capable of few-shot adapting to novel tasks and of performing highly dexterous manipulation tasks, ranging from end-to-end folding laundry, to bussing tables.
926
 
927
- ### VLMs for VLAs
928
 
929
  VLMs are designed to process both visual and textual modalities--most commonly by taking both images and text as input and generating text conditioned on the visual context. Recent advances in VLMs have been driven by the success of LLMs, with many approaches building upon pretrained LLMs and adopting similar training paradigms to the ones used in language modeling. Typically, VLMs @alayracFlamingoVisualLanguage2022, @laurenconWhatMattersWhen2024, @linVILAPretrainingVisual2024 are constructed by integrating a pretrained vision encoder @radfordLearningTransferableVisual2021, @zhaiSigmoidLossLanguage2023, @finiMultimodalAutoregressivePretraining2024 with a pretrained LLM @grattafioriLlama3Herd2024, @jiangMistral7B2023. Training then proceeds in multiple multimodal stages, beginning with a large-scale pretraining on datasets containing image-text pairs @LAION-COCO, @kakaobrain2022coyo700m and interleaved vision-language corpora @OBELICS, @MMC4, all followed by a supervised fine-tuning stage on instruction-tuning datasets @LLaVA-1.5, @tong2024cambrian, @laurenconWhatMattersWhen2024. The inherent multimodal nature of VLMs enables them to jointly reason over vision and language. Pre-training on vast internet-scale datasets allows these models to associate visual patterns with textual descriptions, thereby acquiring a rich semantic understanding of the world--knowledge about objects, their properties, and relationships--without explicit supervision for each concept. In turn, integrating a VLM as a perception backbone for a VLA allows the complete model to inherit rich world knowledge, sidestepping the need to learn visual and semantic representations from scratch. In principle, this allows the robot to ground high-level natural language instructions in its visual context, and possibly recognize unseen objects by connecting them to pre-trained concepts absorbed during pre-training, improving on the possibility to generalize to novel scenarios.
930
 
931
  Recently, compute efficiency has also become a central focus in VLM research. Several works aim to reduce training costs by using smaller, more diverse datasets @LLaVA-1.5, @InstructBLIP, @bai2025qwen25vl, @zhu2024minigpt, @tong2024cambrian, training smaller-scale models @marafiotiSmolVLMRedefiningSmall2025, @moondream, @minicmpv2024, or by adapting pretrained unimodal models by tuning only a small subset of parameters @shukor2023epalm, @vallaeys2024improveddepalm, @MAPL, @FROMAGe, @tsimpoukelli2021multimodalfrozen, @BLIP-2. While the majority of VLM research focuses on image and text modalities, recent work has demonstrated that similar techniques can be extended to integrate additional modalities, such as video and audio @wang2025internvideo2, @liu2024kangaroo, @zhang2025videollama, @kong2024audioflam--a particularly promising direction of research for robotics applications, where multiple sensor modalities can be integrated effectively. This trend towards efficiency is paramount for robotics applications, where policies must operate under the stringent constraints of real-world deployment. Indeed, robots often possess limited on-board computational resources and must react in real-time to dynamic environments. Smaller and faster VLMs have thus become quintessential for developing responsive autonomous systems, enabling high-frequency control loops by reducing the latency between perception and action.
932
 
933
- ## $`\pi_0`$
934
 
935
  $`\pi_0`$ @blackp0VisionLanguageActionFlow2024 introduces a VLA based on a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching. Images and language are embedded with a late-fusion VLM (PaliGemma), while proprioceptive state and action chunks are routed to a smaller action expert, initialized from scratch. The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and value matrices at each layer, maintaining specialization while efficiently allocating computation.
936
 
@@ -982,9 +982,9 @@ Besides adopting a MoE architecture with a VLM backbone initialized from a pre-t
982
 
983
  Lastly, @blackp0VisionLanguageActionFlow2024 present cross-embodiment experiments where they demonstrate $`\pi_0`$’s ability to control both mobile and static manipulator robots with varying arm embodiments. The emergence of cross-embodiment capabilities is largely to be attributed to the presence of large scale cross-embodiment data in the data mixture, handled by $`\pi_0`$ defaulting to the maximal configuration size across the $`\pi`$ dataset, and zero-padding robots with fewer DoFs. In practice, $`\pi_0`$ constantly processes 18-DoF robots (two 6-DoF arms, two grippers, base, vertical torso), regardless of the kind of robot, and robots with fewer DoFs are zero-padded. $`\pi_0`$ also relies on three camera views, and uses masked image slots for training and deployment scenarios with fewer cameras.
984
 
985
- ### Code Example: Using $`\pi_0`$
986
 
987
- ## SmolVLA
988
 
989
  VLAs remain in an early stage of development and are not yet as mature or widely adopted as LLMs and VLMs. Further, much of the impactful VLA progress remains proprietary, with many models sharing only weights while withholding full training details and essential methodological components. SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, aiming to democratize the development of robotics foundation models by open-sourcing the model, training recipes and data used.
990
 
@@ -1003,9 +1003,9 @@ SmolVLA trims both token and layer compute. First, it *reduces visual tokens* vi
1003
 
1004
  Departing from reliance on proprietary datasets, SmolVLA pretrains exclusively on 450+ *community datasets*, totaling 20K+ trajectories. Because instructions in community-contributed datasets can be noisy or missing, the authors re-annotate tasks with a small off-the-shelf VLM using frames sampled from the dataset, and standardize camera viewpoints by mapping sources to a consistent top/wrist/side ordering. At inference, similarly to $`\pi_0`$, SmolVLA integrates flow over 10 steps, resulting in fast inference. SmolVLA proves effective across a range of both real-world and simulated environments, rivaling $`\pi_0`$ while being close to 40% faster and consuming 6x less memory.
1005
 
1006
- ### Code Example: Using SmolVLA
1007
 
1008
- # Conclusions
1009
 
1010
  This tutorial has chronicled the paradigmatic shift transforming robotics, from the structured, model-based methods of its classical era to the dynamic, data-driven approaches that define modern robot learning. We began by examining the limitations of traditional dynamics-based control, highlighting the brittleness and the significant engineering overhead required by traditional approaches, which in turn motivates more flexible, less model-intensive learning approaches.
1011
 
 
1
+ ## Foreword
2
 
3
  Robotics is an inherently multidisciplinary field, and is now witnessing unprecedented advancements since its inception in the 1960s. Yet, more than sixty years after the debut of Unimate, robots have still not fully integrated into the rich, unstructured, and dynamic world we humans inhabit. Over the decades, numerous disciplines have shown immense promise in tackling the challenges of creating autonomous systems. This tutorial takes a clear stance in the debate on whether modern Machine Learning can play a pivotal role in the development of autonomous robot systems: we believe this to be the case.
4
 
 
16
 
17
  We sincerely hope this tutorial serves as a valuable starting point for your journey into robot learning.
18
 
19
+ ## Introduction
20
 
21
  <figure id="fig:figure1">
22
  <img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch1/ch1-lerobot-figure1.png" />
 
43
 
44
  Our goal with this tutorial is to provide an intuitive explanation of the reasons various disparate ideas from Machine Learning (ML) have converged and are powering the current evolution of Robotics, driving the unprecedented progress we see today. We complement our presentation of the most common and recent approaches in robot learning with practical code implementations using `lerobot`, and start here by presenting the dataset format introduced with `lerobot`.
45
 
46
+ ### `LeRobotDataset`
47
 
48
  `LeRobotDataset` is a standardized dataset format designed to address the specific needs of robot learning research, and it provides a unified and convenient access to robotics data across modalities, including sensorimotor readings, multiple camera feeds and teleoperation status. `LeRobotDataset` also accommodates for storing general information regarding the data being collected, including textual descriptions of the task being performed by the teleoperator, the kind of robot used, and relevant measurement specifics like the frames per second at which the recording of both image and robot state’s streams are proceeding.
49
 
50
  In this, `LeRobotDataset` provides a unified interface for handling multi-modal, time-series data, and it is designed to seamlessly integrate with the PyTorch and Hugging Face ecosystems. `LeRobotDataset` can be easily extended and highly customized by users, and it already supports openly available data coming from a variety of embodiments supported in `lerobot`, ranging from manipulator platforms like the SO-100 arm and ALOHA-2 setup, to real-world humanoid arm and hands, as well as entirely simulation-based datasets, and self-driving cars. This dataset format is built to be both efficient for training and flexible enough to accommodate the diverse data types encountered in robotics, while promoting reproducibility and ease of use.
51
 
52
+ #### The dataset class design
53
 
54
  A core design choice behind `LeRobotDataset` is separating the underlying data storage from the user-facing API. This allows for efficient storage while presenting the data in an intuitive, ready-to-use format.
55
 
 
75
 
76
  - `videos/*`: Contains the MP4 video files for all visual observation streams. Similar to the `data/` directory, the video footage from multiple episodes is concatenated into single MP4 files. This strategy significantly reduces the number of files in the dataset, which is more efficient for modern filesystems.
77
 
78
+ ### Code Example: Batching a (Streaming) Dataset
79
 
80
  This section provides an overview of how to access datasets hosted on Hugging Face using the `LeRobotDataset` class. Every dataset on the Hugging Face Hub contains the three main pillars presented above (Tabular, Visual and relational Metadata), and can be accessed with a single instruction.
81
 
 
142
 
143
  </div>
144
 
145
+ ## Classical Robotics
146
 
147
  <div class="epigraph">
148
 
 
158
 
159
  </div>
160
 
161
+ ### Explicit and Implicit Models
162
 
163
  <figure id="fig:generating-motion-atlas">
164
  <img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch2/ch2-approaches.png" style="width:50.0%" />
 
169
 
170
  Methods to produce robotics motion range from traditional *explicit* models--<span style="color: hf2">dynamics-based</span>[^1] methods, leveraging precise descriptions of the mechanics of robots’ rigid bodies and their interactions with eventual obstacles in the environment--to *implicit* models--<span style="color: hf2">learning-based</span> methods, treating artificial motion as a statistical pattern to learn given multiple sensorimotor readings @agrawalComputationalSensorimotorLearning, @bekrisStateRobotMotion2024. A variety of methods have been developed between these two extrema. For instance,  @hansenTemporalDifferenceLearning2022 show how learning-based systems can benefit from information on the physics of problems, complementing a traditional learning method such as Temporal Difference (TD)-learning @suttonReinforcementLearningIntroduction2018 with Model-Predictive Control (MPC). Conversely, as explicit models may be relying on assumptions proving overly simplistic--or even unrealistic--in practice, learning can prove effective to improve modeling of complex phenomena or complement perception @mccormacSemanticFusionDense3D2016. Such examples aim at demonstrating the richness of approaches to robotics, and Figure <a href="#fig:generating-motion-atlas" data-reference-type="ref" data-reference="fig:generating-motion-atlas">2</a> graphically illustrates some of the most relevant techniques. Such a list is clearly far from being exhaustive, and we refer to @bekrisStateRobotMotion2024 for a more comprehensive overview of both general and application-specific methods for motion generation. In this section, we wish to introduce the inherent benefits of <span style="color: hf2">learning-based approaches to robotics</span>--the core focus of this tutorial.
171
 
172
+ ### Different Types of Motion
173
 
174
  <figure id="fig:robotics-platforms-atlas">
175
  <img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch2/ch2-platforms.png" style="width:70.0%" />
 
182
 
183
  The traditional body of work developed since the very inception of robotics is increasingly complemented by learning-based approaches. ML has indeed proven particularly transformative across the entire robotics stack, first empowering planning-based techniques with improved state estimation used for traditional planning @tangPerceptionNavigationAutonomous2023 and then end-to-end replacing controllers, effectively yielding perception-to-action methods @koberReinforcementLearningRobotics. Work in producing robots capable of navigating a diverse set of terrains demonstrated the promise of both dynamics- and learning-based approaches for locomotion @griffinWalkingStabilizationUsing2017, @jiDribbleBotDynamicLegged2023, @leeLearningQuadrupedalLocomotion2020, @margolisRapidLocomotionReinforcement2022, and recent works on whole-body control indicated the promise of learning-based approaches to generate rich motion on complex robots, including humanoids @zhangWoCoCoLearningWholeBody2024, @bjorckGR00TN1Open2025. Manipulation has also been widely studied, particularly considering its relevance for many impactful use-cases ranging from high-risk applications for humans @fujitaDevelopmentRobotsNuclear2020, @alizadehComprehensiveSurveySpace2024 to manufacturing @sannemanStateIndustrialRobotics2020. While explicit models have proven fundamental in achieving important milestones towards the development of modern robotics, recent works leveraging implicit models proved particularly promising in surpassing scalability and applicability challenges via learning @koberReinforcementLearningRobotics.
184
 
185
+ ### Example: Planar Manipulation
186
 
187
  Robot manipulators typically consist of a series of links and joints, articulated in a chain finally connected to an *end-effector*. Actuated joints are considered responsible for generating motion of the links, while the end effector is instead used to perform specific actions at the target location (e.g., grasping/releasing objects via closing/opening a gripper end-effector, using a specialized tool like a screwdriver, etc.).
188
 
 
258
 
259
  Following trajectories with diff-IK is a valid option in well-controlled and static environments (e.g., industrial manipulators in controlled manufacturing settings), and relies on the ability to define a set of target velocities to track $`[\dot {p}^*_0, \dot {p}^*_1, \dots, \dot {p}^*_k ]`$--an error-prone task largely requiring human expertise. Furthermore, diff-IK relies on the ability to (1) access $`J(q) \, \forall q \in \mathcal Q`$ and (2) compute its pseudo-inverse at every iteration of a given control cycle--a challenging assumption in highly dynamical settings, or for complex kinematic chains.
260
 
261
+ #### Adding Feedback Loops
262
 
263
  While very effective when a goal trajectory has been well specified, the performance of diff-IK can degrade significantly in the presence of modeling/tracking errors, or in the presence of non-modeled dynamics in the environment.
264
 
 
278
 
279
  We point the interested reader to , , and  for extended coverage of FK, IK, diff-IK and control for (diff-)IK.
280
 
281
+ ### Limitations of Dynamics-based Robotics
282
 
283
  Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular applicative problem.
284
 
 
297
 
298
  Taken together, these limitations (Figure <a href="#fig:classical-limitations" data-reference-type="ref" data-reference="fig:classical-limitations">10</a>) motivate the exploration of learning-based approaches that can (1) integrate perception and control more tightly, (2) adapt across tasks and embodiments with reduced expert modeling interventions and (3) scale gracefully in performance as more robotics data becomes available.
299
 
300
+ ## Robot (Reinforcement) Learning
301
 
302
  <div class="epigraph">
303
 
 
338
 
339
  Applications of RL to robotics have been long studied, to the point the relationship between these two disciplines has been compared to that between physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their interactive and sequential nature, many robotics problems can be directly mapped to RL problems. Figure <a href="#fig:robotics-with-rl-examples" data-reference-type="ref" data-reference="fig:robotics-with-rl-examples">13</a> depicts two such cases. Reaching for an object to move somewhere else in the scene is indeed a sequential problem where at each cycle the controller needs to adjust the position of the robotic arm based on its current configuration and the (possibly varying) position of the object. Figure <a href="#fig:robotics-with-rl-examples" data-reference-type="ref" data-reference="fig:robotics-with-rl-examples">13</a> also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation. While sliding to the side, the controller has to constantly keep adjusting to the robot’s proprioception to avoid failure (falling).
340
 
341
+ ### A (Concise) Introduction to RL
342
 
343
  The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to model robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) learning how to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents can improve via trial-and-error only, thus entirely bypassing the need to develop explicit models of the problem dynamics, and rather exploiting interaction data only. In RL, this feedback loop (Figure <a href="#fig:rl-most-famous-pic" data-reference-type="ref" data-reference="fig:rl-most-famous-pic">14</a>) between actions and outcomes is established through the agent sensing a scalar quantity (*reward*).
344
 
 
415
 
416
  Popular approaches to continuous state and action space--such as those studied within robotics--include @schulmanTrustRegionPolicy2017, @schulmanProximalPolicyOptimization2017, @haarnojaSoftActorCriticOffPolicy2018. Across manipulation @akkayaSolvingRubiksCube2019 and locomotion @leeLearningQuadrupedalLocomotion2020 problems, RL proved extremely effective in providing a platform to (1) adopt a unified, streamlined perception-to-action pipeline, (2) natively integrate propioperception with multi-modal high-dimensional sensor streams (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2024.
417
 
418
+ ### Real-world RL for Robotics
419
 
420
  Streamlined end-to-end control pipelines, data-driven feature extraction and a disregard for explicit modeling in favor of interaction data are all features of RL for robotics. However, particularly in the context of real-world robotics, RL still suffers from limitations concerning machine safety and learning efficiency.
421
 
 
447
 
448
  Off-policy algorithms like Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 tend to be more sample-efficient than their on-policy counterparts @schulmanProximalPolicyOptimization2017, due to the presence of a *replay buffer* used over the course of the training. Other than allowing to re-use transitions $`(s_t, a_t, r_t, s_{t+1})`$ over the course of training, the replay buffer can also accommodate the injection of previously-collected data in the training process @ballEfficientOnlineReinforcement2023. Using expert demonstrations to guide learning together with learned rewards, RL training can effectively be carried out in the real world @luoSERLSoftwareSuite2025. Interestingly, when complemented with in-training human interventions, real-world RL agents have been shown to learn policies with near-perfect success rates on challenging manipulation tasks in 1-2 hours @luoPreciseDexterousRobotic2024.
449
 
450
+ ##### Sample-efficient RL
451
 
452
  In an MDP, the optimal policy $`\pi^*`$ can be derived from its associated $`Q`$-function, $`Q_{\pi^*}`$, and in particular the optimal action(s) $`\mu(s_t)`$ can be selected maximizing the optimal $`Q`$-function over the action space,
453
  ``` math
 
508
  ```
509
  The update rule provided in <a href="#eq:sac-policy-update" data-reference-type="ref" data-reference="eq:sac-policy-update">[eq:sac-policy-update]</a> optimizes the policy while projecting it on a set $`\Pi`$ of tractable distributions (e.g., Gaussians, @haarnojaReinforcementLearningDeep2017).
510
 
511
+ ##### Sample-efficient, data-driven RL
512
 
513
  Importantly, sampling $`(s_t, a_t, r_t, s_{t+1})`$ from the replay buffer $`D`$ conveniently allows to approximate the previously introduced expectations for TD-target and TD-error through Monte-Carlo (MC) estimates. The replay buffer $`D`$ also proves extremely useful in maintaining a history of previous transitions and using it for training, improving on sample efficiency. Furthermore, it also naturally provides an entry point to inject offline trajectories recorded, for instance, by a human demonstrator, into the training process.
514
 
515
  Reinforcement Learning with Prior Data (RLPD) @ballEfficientOnlineReinforcement2023 is an Offline-to-Online RL algorithm leveraging prior data to effectively accelerate the training of a SAC agent. Unlike previous works on Offline-to-Online RL, RLPD avoids any pre-training and instead uses the available offline data $`D_\text{offline}`$ to improve online-learning from scratch. During each training step, transitions from both the offline and online replay buffers are sampled in equal proportion, and used in the underlying SAC routine.
516
 
517
+ ##### Sample-efficient, data-driven, real-world RL
518
 
519
  Despite the possibility to leverage offline data for learning, the effectiveness of real-world RL training is still limited by the need to define a task-specific, hard-to-define reward function. Further, even assuming to have access to a well-defined reward function, typical robotics pipelines rely mostly on proprioceptive inputs augmented by camera streams of the environment. As such, even well-defined rewards would need to be derived from processed representations of unstructured observations, introducing brittleness. In their technical report, @luoSERLSoftwareSuite2025 empirically address the needs (1) to define a reward function and (2) to use it on image observations, by introducing a series of tools to allow for streamlined training of *reward classifiers* $`c`$, as well as jointly learn forward-backward controllers to speed up real-world RL. Reward classifiers are particularly useful in treating complex tasks--e.g., folding a t-shirt--for which a precise reward formulation is arbitrarily complex to obtain, or that do require significant shaping and are more easily learned directly from demonstrations of success ($`e^+`$) or failure ($`e^-`$) states, $`s \in \mathcal S`$, with a natural choice for the state-conditioned reward function $`r : \mathcal S \mapsto \mathbb R`$ being $`r(s) = \log c(e^+ \vert s)`$. Further, @luoSERLSoftwareSuite2025 demonstrate the benefits of learning *forward* (executing the task from initial state to completion) and *backward* (resetting the environment to the initial state from completion) controllers, parametrized by separate policies.
520
 
 
529
 
530
  Human in the Loop Sample Efficient Robot reinforcement Learning (HIL-SERL) @luoPreciseDexterousRobotic2024 augments offline-to-online RL with targeted human corrections during training, and employs prior data to (1) train a reward classifier and (2) bootstrap RL training on expert trajectories. While demonstrations provide the initial dataset seeding learning and constraining early exploration, interactive corrections allow a human supervisor to intervene on failure modes and supply targeted interventions to aid the learning process. Crucially, human interventions are stored in both the offline and online replay buffers, differently from the autonomous transitions generated at training time and stored in the online buffer only. Consequently, given an intervention timestep $`k \in (0, T)`$, length-$`K`$ human intervention data $`\{ s^{\text{human}}_k, a^{\text{human}}_k, r^{\text{human}}_k, s^{\text{human}}_{k+1},\}_{k=1}^K`$ is more likely to be sampled for off-policy learning than the data generated online during training, providing stronger supervision to the agent while still allowing for autonomous learning. Empirically, HIL-SERL attains near-perfect success rates on diverse manipulation tasks within 1-2 hours of training @luoPreciseDexterousRobotic2024, underscoring how offline datasets with online RL can markedly improve stability and data efficiency, and ultimately even allow real-world RL-training.
531
 
532
+ #### Code Example: Real-world RL
533
 
534
  **TODO(fracapuano): work out rl training example**
535
 
536
+ #### Limitations of RL in Real-World Robotics: Simulators and Reward Design
537
 
538
  Despite the advancements in real-world RL training, solving robotics problems by training RL agents in the real world still suffers from the following limitations:
539
 
 
543
 
544
  Advances in Behavioral Cloning (BC) from corpora of human demonstrations address both of these concerns. By learning in a supervised fashion to reproduce expert demonstrations, BC methods prove competitive while bypassing the need for simulated environments and hard-to-define reward functions.
545
 
546
+ ## Robot (Imitation) Learning
547
 
548
  <div class="epigraph">
549
 
 
593
 
594
  While conceptually elegant, point-estimate policies $`f : \mathcal O\mapsto \mathcal A`$ learned by solving <a href="#eq:loss-minimization-SL" data-reference-type="ref" data-reference="eq:loss-minimization-SL">[eq:loss-minimization-SL]</a> have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure <a href="#fig:ch4-issues-with-bc" data-reference-type="ref" data-reference="fig:ch4-issues-with-bc">21</a> illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $`\mathcal D`$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $`\epsilon`$-prediction errors $`0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon`$ can quickly drive the policy into out-of-distribution states, incurring less confident generations and thus compounding errors (Figure <a href="#fig:ch4-issues-with-bc" data-reference-type="ref" data-reference="fig:ch4-issues-with-bc">21</a>, left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving robotics problems, since multiple trajectories can be equally as good towards the accomplishment of a goal (e.g., symmetric grasps, Figure <a href="#fig:ch4-issues-with-bc" data-reference-type="ref" data-reference="fig:ch4-issues-with-bc">21</a>, right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the generative model $`p(o, a)`$ underlying the samples in $`\mathcal D`$, rather than explicitly learning a prediction function $`f(o) = a`$.
595
 
596
+ ### A (Concise) Introduction to Generative Models
597
 
598
  Generative Models (GMs) aim to learn the stochastic process underlying the very generation of the data collected, and typically do so by fitting a probability distribution that approximates the unknown *data distribution*, $`p`$. In the case of BC, this unknown data distribution $`p`$ represents the expert’s joint distribution over $`(o, a)`$-pairs. Thus, given a finite set of $`N`$ pairs $`\mathcal D = \{ (o,a)_i \}_{i=0}^N`$ used as an imitation learning target (and thus assumed to be i.i.d.), GM seeks to learn a *parametric* distribution $`p_\theta(o,a)`$ such that (1) new samples $`(o,a) \sim p_\theta(\bullet)`$ resemble those stored in $`\mathcal D`$, and (2) high likelihood is assigned to the observed regions of the unobservable $`p`$. Likelihood-based learning provides a principled training objective to achieve both objectives, and it is thus extensively used in GM @prince2023understanding.
599
 
600
+ #### Variational Auto-Encoders
601
 
602
  <figure id="fig:ch4-task-effect-on-pairs">
603
  <img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch4/ch4-task-effect-on-pairs.png" style="width:80.0%" />
 
668
  ```
669
  Indeed, it is very common in practice to approximate from the learned likelihood $`p_\theta(o,a \vert z)`$ as a parametric distribution (e.g. Gaussians) parametrized by some learned vector of coefficients derived from $`\mu_\theta (z), \ z \sim p (\bullet)`$. In all such cases, learning a VAE corresponds to optimally *reconstructing* the examples in $`\mathcal D`$ by minimizing the L2-error--a very common *supervised learning* objective for regression targets--while regularizing the information compression into the latent, as under the common modeling choice $`p(z) = \mathcal N (\mathbf{0}, \mathbf{I})`$ <a href="#eq:VAE-Lreg" data-reference-type="ref" data-reference="eq:VAE-Lreg">[eq:VAE-Lreg]</a> regularizes the posterior limiting the expressivity of $`q_\phi(z\vert o,a)`$.
670
 
671
+ #### Diffusion Models
672
 
673
  VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#eq:BC-latent-variable" data-reference-type="ref" data-reference="eq:BC-latent-variable">[eq:BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $`p_\theta`$ and (approximate) posterior $`q_\phi`$ for such model. In that, the unknown data distribution $`p(o,a)`$ is effectively approximated via $`\int_Z p(z) p_\theta(o,a \vert z)`$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $`p(o,a)`$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $`o,a`$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#fig:ch4-many-latents" data-reference-type="ref" data-reference="fig:ch4-many-latents">24</a>), resulting in
674
  $$
 
719
 
720
  By learning the total displacement from a generally uninformative corrupted sample obtained diffusing information and a sample from an unknown distribution--significant ($`\Vert \epsilon \Vert > 0`$) whenever input and target distribution are sufficiently different-- @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution reversing the displacement, *denoising* samples. Interestingly, under the hypothesis that real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-point manifold (such as perturbed, uninformative samples), and the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $`p_\theta (z_{t-1} \vert z_t)`$ is Gaussian, then sampling $`z_{t-1} \sim p_\theta(\bullet \vert z_{t})`$ corresponds to computing $`z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}), `$ thus showing that the lower-level latent variables in a DM can be obtained by iteratively removing noise from the one-step higher order variable, using the noise regressor $`\epsilon_\theta(z_t, t)`$ learned minimizing <a href="#eq:diffusion-simplified-loss" data-reference-type="ref" data-reference="eq:diffusion-simplified-loss">[eq:diffusion-simplified-loss]</a>.
721
 
722
+ #### Flow Matching
723
 
724
  The posterior parametrization adopted by DMs proved traditionally effective, yet it raised concerns regarding its efficiency at inference time, where a possibly large number of compute-expensive denoising steps are needed in order to recover a sample from the target distribution. Flow Matching (FM) @lipmanFlowMatchingGenerative2023 extends DMs to the general case of arbitrary, parametrized likelihood and posteriors, and in this defines a superseding class of GMs providing a unified framework for learning *continuous transformations* between distributions, encompassing and generalizing DMs. Instead of a *stochastic, discrete, multi-step* denoising process, FM aims to learn a *deterministic, continuous, differentiable flow* $`\psi: [0,1] \times Z \mapsto Z`$, formalized starting from a possibly time-dependent vector field $`v: [0,1] \times Z \mapsto Z`$ transporting samples from a simple prior distribution $`p_0`$--e.g., a standard Gaussian--to a more complex, potentially unknown data distribution $`p_1`$ over time. Note how FM models time $`t \in [0,1]`$ to be varying continuously while moving away *from* an easy-to-sample distribution $`p_0`$ *towards* the unknown data-distribution, $`p_1`$. This results in a continuous and deterministic trajectory for each sample, which can be more efficient to generate compared to the stochastic paths of DMs. Formally, FM can be fully characterized by an ordinary differential equation (ODE) relating instantaneous variations of flows with the underlying vector field, and hence providing complete trajectories over the distributions’ support when integrating over time,
725
  $$
 
753
  \mathcal L(\theta) = \mathbb{E}_{t, z_0, z_1} \big[
754
  \Vert v_\theta((1-t)z_0 + t z_1, t) - (z_1 - z_0) \Vert^2 \big], \quad t \sim \mathcal{U}([0,1]),`$ where $`z_0 \sim p_0(\bullet)`$ and $`z_1 \sim p_1(\bullet)`$. Note how in <a href="#eq:flow-matching-objective" data-reference-type="ref" data-reference="eq:flow-matching-objective">[eq:flow-matching-objective]</a>--differently from <a href="#eq:diffusion-simplified-loss" data-reference-type="ref" data-reference="eq:diffusion-simplified-loss">[eq:diffusion-simplified-loss]</a>--time is assumed to be varying continuously $`t \sim \mathcal U([0,1])`$ rather than discretely $`t \sim \mathcal U(\{0,1\})`$, a key property of flow-based models. The objective in <a href="#eq:flow-matching-objective" data-reference-type="ref" data-reference="eq:flow-matching-objective">[eq:flow-matching-objective]</a> directly regresses the learned vector field onto the simple, straight path connecting a point from the prior and a point from the data, providing a simulation-free training procedure that is both stable and efficient. At inference time, samples are generated by starting with $`z_0 \sim p_0`$ and iteratively refined according to $`\frac{dz}{dt} = v_\theta(z_t, t)`$ for $`t \in [0,1]`$--an operation that can be numerically carried out with standard ODE solvers.
755
 
756
+ ### Action Chunking with Transformers
757
 
758
  While GMs prove useful in learning complex, high-dimensional multi-modal distributions, they do not natively address the compounding errors problem characteristic of online, sequential predictions. In Action Chunking with Transformers (ACT), @zhaoLearningFineGrainedBimanual2023 present an application of VAEs to the problem of learning purely from offline trajectories, and introduce a simple, yet effective method to mitigate error compounding, learning high-fidelity autonomous behaviors. Drawing inspiration from how humans plan to atomically enact sequences of the kind $`a_{t:t+k}`$ instead of single actions $`a_t`$, @zhaoLearningFineGrainedBimanual2023 propose learning a GM on a dataset of input demonstrations by modeling *action chunks*. Besides contributions to learning high-performance autonomous behaviors, @zhaoLearningFineGrainedBimanual2023 also introduce hardware contributions in the form of a low-cost bimanual robot setup (ALOHA) capable of performing fine-grained manipulation tasks, such as opening a lid, slotting a battery in its allotment or even preparing tape for application.
759
 
 
786
  <figcaption>The CVAE decoder used in ACT, comprising a full encoder-decoder Transformer architecture. Camera observations from all <span class="math inline"><em>n</em></span> camera views are first embedded using pre-trained visual encoders, and then concatenated to the corresponding positional embeddings. Then, alongside embeddings for the proprioceptive information available and the style variable <span class="math inline"><em>z</em></span> retrieved from the CVAE encoder, the Transformer encoder shares the matrices <span class="math inline"><em>K</em>, <em>Q</em></span> with the Transformer decoder, trained to decode fixed position embeddings into valid action chunks.</figcaption>
787
  </figure>
788
 
789
+ #### Code Example: Learning ACT
790
 
791
+ ### Diffusion Policy
792
 
793
  DMs proved very effective in approximating complex, high-dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformers @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $`p(o,a)`$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $`p_\theta(o)`$ given $`p_\theta(o,a)`$, DP’s rationale for modeling the data distribution via $`p_\theta(a \vert o)`$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
794
 
 
811
 
812
  Training using 50-150 demos (15-60 minutes of teleoperation data), DP achieves strong performance on a variety of simulated and real-world tasks, including dexterous and deformable manipulation tasks such as sauce pouring and mat unrolling. Notably, the authors ablated the relevance of using RGB camera streams as input to their policy, and observed how high frame-rate visual observations can be used to attain performance (measured as success rate) comparable to that of state-based policies, typically trained in simulation with privileged information not directly available in real-world deployments. As high-frame rate RGB inputs naturally accommodate dynamic, fast changing environments, @chiDiffusionPolicyVisuomotor2024’s conclusion offers significant evidence for learning streamlined control policies directly from pixels. In their work, @chiDiffusionPolicyVisuomotor2024 also ablate the performance of DP against their baseline with respect to the size of the dataset collected, showing that DP outperforms the considered baseline for every benchmark size considered. Further, to accelerate inference, @chiDiffusionPolicyVisuomotor2024 employ Denoising Diffusion Implicit Models @songDenoisingDiffusionImplicit2022, a variant of Denoising Diffusion Probabilistic Models @hoDenoisingDiffusionProbabilistic2020 (DDPM) adopting a strictly deterministic denoising paradigm (differently from DDPM’s natively stochastic one) inducing the same final distribution as DDPM’s, and yet resulting in 10 times fewer denoising steps at inference time @chiDiffusionPolicyVisuomotor2024. 
Across a range of simulated and real-world tasks, @chiDiffusionPolicyVisuomotor2024 find DPs particularly performant when implementing a transformer-based network as $`\epsilon_\theta`$, although the authors note the increased sensitivity of transformer networks to hyperparameters and thus explicitly recommend starting out with a simpler, convolution-based architecture for diffusion (Figure <a href="#fig:diffusion-policy-architecture" data-reference-type="ref" data-reference="fig:diffusion-policy-architecture">32</a>), which are however reported to be biased towards learning low-frequency components @tancikFourierFeaturesLet2020 and thus may prove more challenging to train with non-smooth action sequences.
813
 
814
+ #### Code Example: Learning Diffusion Policies
815
 
816
+ ### Optimized Inference
817
 
818
  Modern visuomotor policies output *action chunks*-sequences $`\pi(o_t) = \mathbf{A}_t`$ with $`\mathbf{A}_t = \bigl(a_t,a_{t+1},\dots,a_{t+H_a}\bigr)`$ being a sequence of $`H_a \gg 1`$ low-level commands enqueued in an action queue, originating from an environment observation, $`o_t`$. Predicting series of actions instead of single commands proved essential in learning complex, multi-modal behavior @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024.
819
 
 
844
 
845
  </div>
846
 
847
+ ##### Implementation details
848
 
849
  *Async* inference (1) tightens the control loop by capturing observations more often, directly eliminates idle gaps at runtime, and (2) directly allows to run inference on more powerful computational resources than the ones typically available onboard autonomous robotic platforms.
850
 
 
869
 
870
  <a href="#fig:ch4-queues" data-reference-type="ref" data-reference="fig:ch4-queues">34</a> emphasizes the trade-off governed by $`g`$: small values result in idle periods, whereas $`g\approx 1`$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $`g\in(0,1)`$ allows striking a balance between reactivity and resource budgets. If not for the aforementioned similarity filter, the system would send observations for processing every $`(1 - g) H_a \cdot \Delta t`$ seconds, receiving a new chunk of actions every $`(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]`$, on average. The presence of the observation similarity filter dilates this processing time, and serves the purpose of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. In particular, <a href="#fig:ch4-queues" data-reference-type="ref" data-reference="fig:ch4-queues">34</a> results in a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in <a href="#fig:ch4-queues" data-reference-type="ref" data-reference="fig:ch4-queues">34</a> highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue becomes empty.
871
 
872
+ #### Code Example: Using Async Inference
873
 
874
+ ## Generalist Robot Policies
875
 
876
  <div class="epigraph">
877
 
 
894
  <figcaption>Fields within ML such as Computer Vision and NLP converged on the development of foundation models, trained on a variety of large-scale datasets and capable of performing multiple downstream tasks (top). Conversely, robotics suffered from limited standardization in terms of the architectures used, and siloed, task specific datasets, incurring a high degree of fragmentation which traditionally hindered the development of generalist models for robotics in favour of task-specific models (bottom).</figcaption>
895
  </figure>
896
 
897
+ ### Preliminaries: Models and Data
898
 
899
  The remarkable success of foundation models in NLP and CV is predicated on two core principles: architectural innovation and joint data-compute scaling. The transformer architecture proved instrumental in capturing long-range dependencies in sequential data such as text, and its stability and expressivity made it the *de facto* standard for modern large-scale models trained on internet-scale amounts of data. In stark contrast with popular NLP @raffelExploringLimitsTransfer2023 and CV @ImageNet_VSS09 general-purpose datasets, the field of robotics has historically developed around task-specific datasets which hinders scalability across problems, resulting in a concrete data deficit for general-purpose robot learning. Unlike the wealth of relatively readily available text and images on the internet, robotics data is intrinsically embodied--datasets collected for a manipulation robot typically differ entirely from locomotion datasets. Further, datasets consisting of expert demonstrations are (1) intrinsically expensive to collect (2) and notoriously heterogeneous--different human experts may perform the same task optimally yet in very different ways. In particular, since each expert trajectory is tied to a specific robot platform and the operating conditions of its environment and task, data heterogeneity has long posed a *methodological* challenge for scaling robotics datasets via aggregation. Beyond this, heterogeneity also raises *conceptual* issues: naively mixing data across embodiments can induce negative transfer, as control strategies developed in isolation for different robot systems in different environments may even conflict when combined. 
Thus, the high degree of fragmentation of robotics datasets and tasks has traditionally led to the development of *specialist* policies, trained on small, task-specific datasets, and which excel at their designated task but fail to generalize to new situations (Figure <a href="#fig:ch5-ml-vs-robotics-foundation" data-reference-type="ref" data-reference="fig:ch5-ml-vs-robotics-foundation">35</a>).
900
 
 
918
 
919
  Figure <a href="#fig:ch5-trends" data-reference-type="ref" data-reference="fig:ch5-trends">37</a> illustrates graphically the two most relevant trends in modern robot learning. As datasets collected via centralized, cross-institutions cooperation of increasing size are made available for the research community, decentralized datasets collected by individual researchers and practitioners have also gained traction recently, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are also becoming much more compute-efficient, and as a result the models’ size has been consistently reducing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments.
920
 
921
+ ### Modern VLAs
922
 
923
  Modern recipes to train large scale VLAs extend early efforts to learn foundation models from large amounts of data via BC, introducing significant advancements concerning both architectural and procedural aspects. From an architectural perspective, modern VLAs such as $`\pi_0`$ @blackp0VisionLanguageActionFlow2024 leverage a *unified transformer model* for efficiency of computation, while maintaining specialized sub-components within the model for visual perception and action prediction, enabling cross-task performance via language conditioning. Crucially, modern VLAs including @blackp0VisionLanguageActionFlow2024\[$`\pi_0`$\] and @shukorSmolVLAVisionLanguageActionModel2025\[SmolVLA\] adopt *unified* transformer models employing disjoint set of weights (*experts*) for compute-efficient visual-semantic understanding and robotic control. Procedurally, modern VLAs complement advanced Vision-Language Model (VLM) backbones with action-specific modules (1) adopting mid-sized *action experts* to model continuous actions distributions $`p (a_{t:t+H_a} \vert o_t)`$--avoiding discrete action tokens entirely--and (2) relying on *action chunking*  as a strategy to reduce error compounding when predicting multiple actions learning from inherently non-i.i.d. data, such as demonstration data.
924
 
925
  These architectural and procedural innovations present three benefits. First, developing architectures that exploit internet-scale pre-trained backbones allows models to fully capitalize on the vast world knowledge and skills state-of-the-art VLMs exhibit, preventing models from needing to learn visual, linguistic and semantic concepts from scratch. Second, using generative models for continuous action distributions allows learning rich, multimodal data distributions, a much more likely scenario in the big-data regime typically tackled while developing generalist policies. Further, introducing two separate components for perception and action planning could enable using Mixture of Experts (MoE) architectures @fedusReviewSparseExpert2022, more efficient to run and thus resulting in faster inference--a key feature for models deployed in real-world scenarios. This new paradigm has been at the core of some of the most capable generalist policies developed to date, capable of few-shot adapting to novel tasks and performing highly dexterous manipulation tasks, ranging from end-to-end folding laundry, to bussing tables.
926
 
927
+ #### VLMs for VLAs
928
 
929
  VLMs are designed to process both visual and textual modalities--most commonly by taking both images and text as input and generating text conditioned on the visual context. Recent advances in VLMs have been driven by the success of LLMs, with many approaches building upon pretrained LLMs and adopting similar training paradigms to the ones used in language modeling. Typically, VLMs @alayracFlamingoVisualLanguage2022, @laurenconWhatMattersWhen2024, @linVILAPretrainingVisual2024 are constructed by integrating a pretrained vision encoder @radfordLearningTransferableVisual2021, @zhaiSigmoidLossLanguage2023, @finiMultimodalAutoregressivePretraining2024 with a pretrained LLM @grattafioriLlama3Herd2024, @jiangMistral7B2023. Training then proceeds in multiple multimodal stages, beginning with a large-scale pretraining on datasets containing image-text pairs @LAION-COCO, @kakaobrain2022coyo700m and interleaved vision-language corpora @OBELICS, @MMC4, all followed by a supervised fine-tuning stage on instruction-tuning datasets @LLaVA-1.5, @tong2024cambrian, @laurenconWhatMattersWhen2024. The inherent multimodal nature of VLMs enables them to jointly reason over vision and language. Pre-training on vast internet-scale datasets allows these models to associate visual patterns with textual descriptions, thereby acquiring a rich semantic understanding of the world--knowledge about objects, their properties, and relationships--without explicit supervision for each concept. In turn, integrating a VLM as a perception backbone for a VLA allows the complete model to inherit rich world knowledge, sidestepping the need to learn visual and semantic representations from scratch. In principle, this allows the robot to ground high-level natural language instructions in its visual context, and possibly recognize unseen objects by connecting them to pre-trained concepts absorbed during pre-training, improving on the possibility to generalize to novel scenarios.
930
 
931
  Recently, compute efficiency has also become a central focus in VLM research. Several works aim to reduce training costs by using smaller, more diverse datasets @LLaVA-1.5, @InstructBLIP, @bai2025qwen25vl, @zhu2024minigpt, @tong2024cambrian, training smaller-scale models @marafiotiSmolVLMRedefiningSmall2025, @moondream, @minicmpv2024, or by adapting pretrained unimodal models by tuning only a small subset of parameters @shukor2023epalm, @vallaeys2024improveddepalm, @MAPL, @FROMAGe, @tsimpoukelli2021multimodalfrozen, @BLIP-2. While the majority of VLM research focuses on image and text modalities, recent work has demonstrated that similar techniques can be extended to integrate additional modalities, such as video and audio @wang2025internvideo2, @liu2024kangaroo, @zhang2025videollama, @kong2024audioflam--a particularly promising direction of research for robotics applications, where multiple sensor modalities can be integrated effectively. This trend towards efficiency is paramount for robotics applications, where policies must operate under the stringent constraints of real-world deployment. Indeed, robots often possess limited on-board computational resources and must react in real-time to dynamic environments. Smaller and faster VLMs have thus become quintessential for developing responsive autonomous systems, enabling high-frequency control loops by reducing the latency between perception and action.
932
 
933
+ ### $`\pi_0`$
934
 
935
  $`\pi_0`$ @blackp0VisionLanguageActionFlow2024 introduce a VLA built on a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching. Images and language are embedded with a late-fusion VLM (PaliGemma), while proprioceptive state and action chunks are routed to a smaller action expert, initialized from scratch. The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and value matrices at each layer, maintaining specialization while efficiently allocating computation.
936
 
 
982
 
983
  Lastly, @blackp0VisionLanguageActionFlow2024 present cross-embodiment experiments where they demonstrate $`\pi_0`$’s ability to control both mobile and static manipulator robots with varying arm embodiments. The emergence of cross-embodiment capabilities is largely to be attributed to the presence of large scale cross-embodiment data in the data mixture, handled by $`\pi_0`$ defaulting to the maximal configuration size across the $`\pi`$ dataset, and zero-padding robots with fewer DoFs. In that, $`\pi_0`$ constantly processes 18-DoF robots (two 6-DoF arms, two grippers, base, vertical torso), regardless of the kind of robot, and robots with fewer DoFs are zero-padded. $`\pi_0`$ also relies on three camera views, and uses masked image slots for training and deployment scenarios with fewer cameras.
984
 
985
+ #### Code Example: Using $`\pi_0`$
986
 
987
+ ### SmolVLA
988
 
989
  VLAs remain in an early stage of development and are not yet as mature or widely adopted as LLMs and VLMs. Further, much of the impactful VLA progress remains proprietary, with many models sharing only weights while withholding full training details and essential methodological components. SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, aiming to democratize the developments of robotics foundation models by open sourcing model, training recipes and data used.
990
 
 
1003
 
1004
  Departing from reliance on proprietary datasets, SmolVLA pretrains exclusively on 450+ *community datasets*, totaling 20K+ trajectories. Because instructions in community-contributed datasets can be noisy or missing, the authors re-annotate tasks with a small off-the-shelf VLM using frames sampled from the dataset, and standardize camera viewpoints by mapping sources to a consistent top/wrist/side ordering. At inference, similarly to $`\pi_0`$, SmolVLA integrates flow over 10 steps, resulting in fast inference. SmolVLA proves effective across a range of both real-world and simulated environments, rivaling $`\pi_0`$ while being close to 40% faster and consuming 6x less memory.
1005
 
1006
+ #### Code Example: Using SmolVLA
1007
 
1008
+ ## Conclusions
1009
 
1010
  This tutorial has chronicled the paradigmatic shift transforming robotics, from the structured, model-based methods of its classical era to the dynamic, data-driven approaches that define modern robot learning. We began by examining the limitations of traditional dynamics-based control, highlighting the brittleness and the significant engineering overhead required by traditional approaches, which in turn motivates more flexible, less model-intensive learning approaches.
1011
 
app/scripts/latex-to-markdown/output/main.mdx CHANGED
The diff for this file is too large to render. See raw diff
 
app/src/content/article.mdx CHANGED
The diff for this file is too large to render. See raw diff
 
app/src/styles/_base.css CHANGED
@@ -1,9 +1,17 @@
1
  @import "https://fonts.googleapis.com/css2?family=Source+Sans+Pro:ital,wght@0,200..900;1,200..900&display=swap";
2
 
3
- html { font-size: 16px; line-height: 1.6; }
 
 
 
 
 
 
 
4
 
5
- .content-grid main { color: var(--text-color); }
6
- .content-grid main p { margin: 0 0 var(--spacing-3); }
 
7
 
8
  .content-grid main h2 {
9
  font-weight: 600;
@@ -14,6 +22,12 @@ html { font-size: 16px; line-height: 1.6; }
14
  border-bottom: 1px solid var(--border-color);
15
  }
16
 
 
 
 
 
 
 
17
  .content-grid main h3 {
18
  font-weight: 700;
19
  font-size: clamp(18px, 2.1vw, 22px);
@@ -29,25 +43,51 @@ html { font-size: 16px; line-height: 1.6; }
29
  margin: var(--spacing-8) 0 var(--spacing-4);
30
  }
31
 
32
- .content-grid main a { color: var(--primary-color); text-decoration: none; border-bottom: 1px solid var(--link-underline); }
33
- .content-grid main a:hover { color: var(--primary-color-hover); border-bottom: 1px solid var(--link-underline-hover); }
 
 
 
 
 
 
 
 
34
 
35
  /* Do not underline heading links inside the article (not the TOC) */
36
  .content-grid main h2 a,
37
  .content-grid main h3 a,
38
  .content-grid main h4 a,
39
  .content-grid main h5 a,
40
- .content-grid main h6 a { color: inherit; border-bottom: none; text-decoration: none; }
 
 
 
 
 
41
  .content-grid main h2 a:hover,
42
  .content-grid main h3 a:hover,
43
  .content-grid main h4 a:hover,
44
  .content-grid main h5 a:hover,
45
- .content-grid main h6 a:hover { color: inherit; border-bottom: none; text-decoration: none; }
 
 
 
 
46
 
47
  .content-grid main ul,
48
- .content-grid main ol { padding-left: 24px; margin: 0 0 var(--spacing-3); }
49
- .content-grid main li { margin-bottom: var(--spacing-2); }
50
- .content-grid main li:last-child { margin-bottom: 0; }
 
 
 
 
 
 
 
 
 
51
 
52
  .content-grid main blockquote {
53
  border-left: 2px solid var(--border-color);
@@ -57,7 +97,11 @@ html { font-size: 16px; line-height: 1.6; }
57
  margin: var(--spacing-4) 0;
58
  }
59
 
60
- .content-grid main hr { border: none; border-bottom: 1px solid var(--border-color); margin: var(--spacing-5) 0; }
 
 
 
 
61
 
62
  .muted {
63
  color: var(--muted-color);
 
1
  @import "https://fonts.googleapis.com/css2?family=Source+Sans+Pro:ital,wght@0,200..900;1,200..900&display=swap";
2
 
3
+ html {
4
+ font-size: 16px;
5
+ line-height: 1.6;
6
+ }
7
+
8
+ .content-grid main {
9
+ color: var(--text-color);
10
+ }
11
 
12
+ .content-grid main p {
13
+ margin: 0 0 var(--spacing-3);
14
+ }
15
 
16
  .content-grid main h2 {
17
  font-weight: 600;
 
22
  border-bottom: 1px solid var(--border-color);
23
  }
24
 
25
+ .content-grid main h2:first-child {
26
+ margin-top: 0;
27
+ }
28
+
29
+
30
+
31
  .content-grid main h3 {
32
  font-weight: 700;
33
  font-size: clamp(18px, 2.1vw, 22px);
 
43
  margin: var(--spacing-8) 0 var(--spacing-4);
44
  }
45
 
46
+ .content-grid main a {
47
+ color: var(--primary-color);
48
+ text-decoration: none;
49
+ border-bottom: 1px solid var(--link-underline);
50
+ }
51
+
52
+ .content-grid main a:hover {
53
+ color: var(--primary-color-hover);
54
+ border-bottom: 1px solid var(--link-underline-hover);
55
+ }
56
 
57
  /* Do not underline heading links inside the article (not the TOC) */
58
  .content-grid main h2 a,
59
  .content-grid main h3 a,
60
  .content-grid main h4 a,
61
  .content-grid main h5 a,
62
+ .content-grid main h6 a {
63
+ color: inherit;
64
+ border-bottom: none;
65
+ text-decoration: none;
66
+ }
67
+
68
  .content-grid main h2 a:hover,
69
  .content-grid main h3 a:hover,
70
  .content-grid main h4 a:hover,
71
  .content-grid main h5 a:hover,
72
+ .content-grid main h6 a:hover {
73
+ color: inherit;
74
+ border-bottom: none;
75
+ text-decoration: none;
76
+ }
77
 
78
  .content-grid main ul,
79
+ .content-grid main ol {
80
+ padding-left: 24px;
81
+ margin: 0 0 var(--spacing-3);
82
+ }
83
+
84
+ .content-grid main li {
85
+ margin-bottom: var(--spacing-2);
86
+ }
87
+
88
+ .content-grid main li:last-child {
89
+ margin-bottom: 0;
90
+ }
91
 
92
  .content-grid main blockquote {
93
  border-left: 2px solid var(--border-color);
 
97
  margin: var(--spacing-4) 0;
98
  }
99
 
100
+ .content-grid main hr {
101
+ border: none;
102
+ border-bottom: 1px solid var(--border-color);
103
+ margin: var(--spacing-5) 0;
104
+ }
105
 
106
  .muted {
107
  color: var(--muted-color);