thibaud frere
committed on
Commit
·
0ec6423
1
Parent(s):
52bc805
update
Browse files- app/scripts/latex-to-markdown/latex-converter.mjs +1 -1
- app/scripts/latex-to-markdown/mdx-converter.mjs +143 -24
- app/scripts/latex-to-markdown/metadata-extractor.mjs +170 -0
- app/scripts/latex-to-markdown/output/main.md +40 -40
- app/scripts/latex-to-markdown/output/main.mdx +0 -0
- app/src/content/article.mdx +0 -0
- app/src/styles/_base.css +55 -11
app/scripts/latex-to-markdown/latex-converter.mjs
CHANGED
|
@@ -226,7 +226,7 @@ export function convertLatexToMarkdown(inputFile, outputDir) {
|
|
| 226 |
const mediaDir = join(outputDir, 'assets', 'image');
|
| 227 |
ensureDirectory(mediaDir);
|
| 228 |
const inputDir = dirname(inputFile);
|
| 229 |
-
const pandocCommand = `pandoc "${preprocessedFile}" -f latex+latex_macros -t gfm+tex_math_dollars --wrap=none ${bibOption} --extract-media="${mediaDir}" --resource-path="${inputDir}" -o "${outputFile}"`;
|
| 230 |
|
| 231 |
console.log(` Running: ${pandocCommand}`);
|
| 232 |
execSync(pandocCommand, { stdio: 'pipe' });
|
|
|
|
| 226 |
const mediaDir = join(outputDir, 'assets', 'image');
|
| 227 |
ensureDirectory(mediaDir);
|
| 228 |
const inputDir = dirname(inputFile);
|
| 229 |
+
const pandocCommand = `pandoc "${preprocessedFile}" -f latex+latex_macros -t gfm+tex_math_dollars --shift-heading-level-by=1 --wrap=none ${bibOption} --extract-media="${mediaDir}" --resource-path="${inputDir}" -o "${outputFile}"`;
|
| 230 |
|
| 231 |
console.log(` Running: ${pandocCommand}`);
|
| 232 |
execSync(pandocCommand, { stdio: 'pipe' });
|
app/scripts/latex-to-markdown/mdx-converter.mjs
CHANGED
|
@@ -3,6 +3,7 @@
|
|
| 3 |
import { readFileSync, writeFileSync, existsSync } from 'fs';
|
| 4 |
import { join, dirname, basename, extname } from 'path';
|
| 5 |
import { fileURLToPath } from 'url';
|
|
|
|
| 6 |
|
| 7 |
const __filename = fileURLToPath(import.meta.url);
|
| 8 |
const __dirname = dirname(__filename);
|
|
@@ -334,22 +335,40 @@ function transformReferenceLinks(content) {
|
|
| 334 |
);
|
| 335 |
}
|
| 336 |
|
|
|
|
| 337 |
/**
|
| 338 |
* Fix frontmatter and ensure proper MDX format
|
| 339 |
* @param {string} content - MDX content
|
|
|
|
| 340 |
* @returns {string} - Content with proper frontmatter
|
| 341 |
*/
|
| 342 |
-
function ensureFrontmatter(content) {
|
| 343 |
console.log(' 📄 Ensuring proper frontmatter...');
|
| 344 |
|
| 345 |
if (!content.startsWith('---')) {
|
| 346 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
title: "Research Article"
|
| 348 |
-
|
| 349 |
-
|
| 350 |
---
|
| 351 |
|
| 352 |
`;
|
|
|
|
|
|
|
|
|
|
| 353 |
return frontmatter + content;
|
| 354 |
}
|
| 355 |
|
|
@@ -357,41 +376,125 @@ date: "${new Date().toISOString().split('T')[0]}"
|
|
| 357 |
}
|
| 358 |
|
| 359 |
/**
|
| 360 |
-
* Clean newlines from single-
|
| 361 |
* @param {string} content - MDX content
|
| 362 |
* @returns {string} - Content with cleaned math blocks
|
| 363 |
*/
|
| 364 |
function cleanSingleLineMathNewlines(content) {
|
| 365 |
-
console.log(' 🔢 Cleaning newlines in single-
|
| 366 |
|
| 367 |
let cleanedCount = 0;
|
| 368 |
|
| 369 |
-
//
|
| 370 |
-
// Use
|
| 371 |
-
const cleanedContent = content.replace(/\$([
|
| 372 |
-
// Only process if
|
| 373 |
-
|
| 374 |
-
// 2. It's not too long (likely not a multi-paragraph match)
|
| 375 |
-
// 3. It doesn't contain double newlines (paragraph breaks)
|
| 376 |
-
if (mathContent.includes('\n') &&
|
| 377 |
-
!mathContent.includes('\n\n') &&
|
| 378 |
-
mathContent.length <= 200) {
|
| 379 |
-
|
| 380 |
cleanedCount++;
|
| 381 |
|
| 382 |
-
// Remove newlines and
|
| 383 |
const cleanedMath = mathContent
|
| 384 |
-
.replace(/\n+/g, ' ') // Replace newlines with spaces
|
|
|
|
| 385 |
.replace(/\s+/g, ' ') // Normalize multiple spaces to single
|
| 386 |
.trim(); // Remove leading/trailing spaces
|
| 387 |
|
| 388 |
return `$${cleanedMath}$`;
|
| 389 |
}
|
| 390 |
-
return match; // Keep original if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
});
|
| 392 |
|
| 393 |
if (cleanedCount > 0) {
|
| 394 |
-
console.log(` ✅ Cleaned ${cleanedCount}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
}
|
| 396 |
|
| 397 |
return cleanedContent;
|
|
@@ -419,9 +522,10 @@ function cleanMdxSyntax(content) {
|
|
| 419 |
/**
|
| 420 |
* Main MDX processing function that applies all transformations
|
| 421 |
* @param {string} content - Raw Markdown content
|
|
|
|
| 422 |
* @returns {string} - Processed MDX content compatible with Astro
|
| 423 |
*/
|
| 424 |
-
function processMdxContent(content) {
|
| 425 |
console.log('🔧 Processing for Astro MDX compatibility...');
|
| 426 |
|
| 427 |
// Clear previous tracking
|
|
@@ -431,12 +535,15 @@ function processMdxContent(content) {
|
|
| 431 |
let processedContent = content;
|
| 432 |
|
| 433 |
// Apply each transformation step sequentially
|
| 434 |
-
processedContent = ensureFrontmatter(processedContent);
|
| 435 |
processedContent = cleanSingleLineMathNewlines(processedContent);
|
|
|
|
|
|
|
| 436 |
processedContent = cleanMdxSyntax(processedContent);
|
| 437 |
processedContent = transformImages(processedContent);
|
| 438 |
processedContent = transformStyledSpans(processedContent);
|
| 439 |
processedContent = transformReferenceLinks(processedContent);
|
|
|
|
| 440 |
|
| 441 |
// Add component imports at the end
|
| 442 |
processedContent = addComponentImports(processedContent);
|
|
@@ -459,8 +566,20 @@ function convertToMdx(inputFile, outputFile) {
|
|
| 459 |
console.log('🔄 Reading Markdown file...');
|
| 460 |
const markdownContent = readFileSync(inputFile, 'utf8');
|
| 461 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 462 |
// Apply modular MDX processing
|
| 463 |
-
const mdxContent = processMdxContent(markdownContent);
|
| 464 |
|
| 465 |
console.log('💾 Writing MDX file...');
|
| 466 |
writeFileSync(outputFile, mdxContent);
|
|
|
|
| 3 |
import { readFileSync, writeFileSync, existsSync } from 'fs';
|
| 4 |
import { join, dirname, basename, extname } from 'path';
|
| 5 |
import { fileURLToPath } from 'url';
|
| 6 |
+
import { extractAndGenerateFrontmatter } from './metadata-extractor.mjs';
|
| 7 |
|
| 8 |
const __filename = fileURLToPath(import.meta.url);
|
| 9 |
const __dirname = dirname(__filename);
|
|
|
|
| 335 |
);
|
| 336 |
}
|
| 337 |
|
| 338 |
+
|
| 339 |
/**
|
| 340 |
* Fix frontmatter and ensure proper MDX format
|
| 341 |
* @param {string} content - MDX content
|
| 342 |
+
* @param {string} latexContent - Original LaTeX content for metadata extraction
|
| 343 |
* @returns {string} - Content with proper frontmatter
|
| 344 |
*/
|
| 345 |
+
function ensureFrontmatter(content, latexContent = '') {
|
| 346 |
console.log(' 📄 Ensuring proper frontmatter...');
|
| 347 |
|
| 348 |
if (!content.startsWith('---')) {
|
| 349 |
+
let frontmatter;
|
| 350 |
+
|
| 351 |
+
if (latexContent) {
|
| 352 |
+
// Extract metadata from LaTeX using dedicated module
|
| 353 |
+
frontmatter = extractAndGenerateFrontmatter(latexContent);
|
| 354 |
+
console.log(' ✅ Generated frontmatter from LaTeX metadata');
|
| 355 |
+
} else {
|
| 356 |
+
// Fallback frontmatter
|
| 357 |
+
const currentDate = new Date().toLocaleDateString('en-US', {
|
| 358 |
+
year: 'numeric',
|
| 359 |
+
month: 'short',
|
| 360 |
+
day: '2-digit'
|
| 361 |
+
});
|
| 362 |
+
frontmatter = `---
|
| 363 |
title: "Research Article"
|
| 364 |
+
published: "${currentDate}"
|
| 365 |
+
tableOfContentsAutoCollapse: true
|
| 366 |
---
|
| 367 |
|
| 368 |
`;
|
| 369 |
+
console.log(' ✅ Generated basic frontmatter');
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
return frontmatter + content;
|
| 373 |
}
|
| 374 |
|
|
|
|
| 376 |
}
|
| 377 |
|
| 378 |
/**
 * Clean newlines from single-dollar inline math blocks ($...$) ONLY.
 *
 * Display math ($$...$$) must be left untouched. The previous regex
 * (/\$(?!\$)([\s\S]*?)\$(?!\$)/) could start a match on the SECOND "$" of a
 * "$$" delimiter and swallow the closing pair, corrupting multi-line display
 * math (e.g. "$$x\ny$$" became "$x y$$"). A lookbehind on the opening
 * delimiter plus a "$"-free content class prevents that.
 *
 * @param {string} content - MDX content
 * @returns {string} - Content with cleaned math blocks
 */
function cleanSingleLineMathNewlines(content) {
  console.log(' 🔢 Cleaning newlines in single-dollar math blocks ($...$)...');

  let cleanedCount = 0;

  // ULTRA STRICT: only single-dollar spans — the opening "$" is neither
  // preceded nor followed by another "$", the content contains no "$", and
  // the closing "$" is not followed by "$".
  const cleanedContent = content.replace(/(?<!\$)\$(?!\$)([^$]*?)\$(?!\$)/g, (match, mathContent) => {
    // Only rewrite spans that actually contain newlines
    if (mathContent.includes('\n')) {
      cleanedCount++;

      // Remove ALL newlines and carriage returns, normalize whitespace
      const cleanedMath = mathContent
        .replace(/\n+/g, ' ')  // Replace all newlines with spaces
        .replace(/\r+/g, ' ')  // Replace carriage returns with spaces
        .replace(/\s+/g, ' ')  // Normalize multiple spaces to single
        .trim();               // Remove leading/trailing spaces

      return `$${cleanedMath}$`;
    }
    return match; // Keep original if no newlines
  });

  if (cleanedCount > 0) {
    console.log(` ✅ Cleaned ${cleanedCount} single-dollar math block(s) with newlines`);
  }

  return cleanedContent;
}
|
| 413 |
+
|
| 414 |
+
/**
 * Add proper line breaks around display math blocks ($$...$$).
 * @param {string} content - MDX content
 * @returns {string} - Content with properly spaced display math
 */
function formatDisplayMathBlocks(content) {
  console.log(' 📐 Formatting display math blocks with proper spacing...');

  let formattedCount = 0;

  // Rewrap every $$...$$ span so the delimiters sit on their own lines,
  // trimming surrounding whitespace from the math body itself.
  const formattedContent = content.replace(/\$\$([\s\S]*?)\$\$/g, (_whole, body) => {
    formattedCount += 1;
    return `\n$$\n${body.trim()}\n$$\n`;
  });

  if (formattedCount > 0) {
    console.log(` ✅ Formatted ${formattedCount} display math block(s) with proper spacing`);
  }

  return formattedContent;
}
|
| 442 |
+
|
| 443 |
+
/**
 * Clean newlines from figcaption content.
 * @param {string} content - MDX content
 * @returns {string} - Content with cleaned figcaptions
 */
function cleanFigcaptionNewlines(content) {
  console.log(' 📝 Cleaning newlines in figcaption elements...');

  let cleanedCount = 0;

  // Collapse internal newlines of every <figcaption ...>...</figcaption>
  // span onto a single line, preserving the tag's attributes.
  const cleanedContent = content.replace(
    /<figcaption([^>]*)>([\s\S]*?)<\/figcaption>/g,
    (whole, attrs, caption) => {
      if (!caption.includes('\n')) {
        return whole; // already single-line: leave untouched
      }
      cleanedCount += 1;
      const flattened = caption
        .replace(/\n+/g, ' ')
        .replace(/\s+/g, ' ')
        .trim();
      return `<figcaption${attrs}>${flattened}</figcaption>`;
    }
  );

  if (cleanedCount > 0) {
    console.log(` ✅ Cleaned ${cleanedCount} figcaption element(s)`);
  } else {
    console.log(` ℹ️ No figcaption elements with newlines found`);
  }

  return cleanedContent;
}
|
| 479 |
+
|
| 480 |
+
/**
 * Remove HTML comments from MDX content.
 * @param {string} content - MDX content
 * @returns {string} - Content without HTML comments
 */
function removeHtmlComments(content) {
  console.log(' 🗑️ Removing HTML comments...');

  let removedCount = 0;

  // Strip every <!-- ... --> span (non-greedy; spans newlines)
  const cleanedContent = content.replace(/<!--[\s\S]*?-->/g, () => {
    removedCount += 1;
    return '';
  });

  if (removedCount > 0) {
    console.log(` ✅ Removed ${removedCount} HTML comment(s)`);
  }

  return cleanedContent;
}
|
|
|
|
| 522 |
/**
|
| 523 |
* Main MDX processing function that applies all transformations
|
| 524 |
* @param {string} content - Raw Markdown content
|
| 525 |
+
* @param {string} latexContent - Original LaTeX content for metadata extraction
|
| 526 |
* @returns {string} - Processed MDX content compatible with Astro
|
| 527 |
*/
|
| 528 |
+
function processMdxContent(content, latexContent = '') {
|
| 529 |
console.log('🔧 Processing for Astro MDX compatibility...');
|
| 530 |
|
| 531 |
// Clear previous tracking
|
|
|
|
| 535 |
let processedContent = content;
|
| 536 |
|
| 537 |
// Apply each transformation step sequentially
|
| 538 |
+
processedContent = ensureFrontmatter(processedContent, latexContent);
|
| 539 |
processedContent = cleanSingleLineMathNewlines(processedContent);
|
| 540 |
+
processedContent = formatDisplayMathBlocks(processedContent);
|
| 541 |
+
processedContent = removeHtmlComments(processedContent);
|
| 542 |
processedContent = cleanMdxSyntax(processedContent);
|
| 543 |
processedContent = transformImages(processedContent);
|
| 544 |
processedContent = transformStyledSpans(processedContent);
|
| 545 |
processedContent = transformReferenceLinks(processedContent);
|
| 546 |
+
processedContent = cleanFigcaptionNewlines(processedContent);
|
| 547 |
|
| 548 |
// Add component imports at the end
|
| 549 |
processedContent = addComponentImports(processedContent);
|
|
|
|
| 566 |
console.log('🔄 Reading Markdown file...');
|
| 567 |
const markdownContent = readFileSync(inputFile, 'utf8');
|
| 568 |
|
| 569 |
+
// Try to read original LaTeX file for metadata extraction
|
| 570 |
+
let latexContent = '';
|
| 571 |
+
try {
|
| 572 |
+
const inputDir = dirname(inputFile);
|
| 573 |
+
const latexFile = join(inputDir, '..', 'input', 'main.tex');
|
| 574 |
+
if (existsSync(latexFile)) {
|
| 575 |
+
latexContent = readFileSync(latexFile, 'utf8');
|
| 576 |
+
}
|
| 577 |
+
} catch (error) {
|
| 578 |
+
// Ignore LaTeX reading errors - we'll use fallback frontmatter
|
| 579 |
+
}
|
| 580 |
+
|
| 581 |
// Apply modular MDX processing
|
| 582 |
+
const mdxContent = processMdxContent(markdownContent, latexContent);
|
| 583 |
|
| 584 |
console.log('💾 Writing MDX file...');
|
| 585 |
writeFileSync(outputFile, mdxContent);
|
app/scripts/latex-to-markdown/metadata-extractor.mjs
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * Extract document metadata (title, authors, affiliations, publication date)
 * from raw LaTeX source for frontmatter generation.
 * @param {string} latexContent - Raw LaTeX content
 * @returns {object} - Extracted metadata object
 */
export function extractLatexMetadata(latexContent) {
  const metadata = {};

  // Title: first \title{...} occurrence (flat braces only — nested braces
  // inside the title are not supported by this pattern)
  const title = latexContent.match(/\\title\s*\{\s*([^}]+)\s*\}/s);
  if (title) {
    metadata.title = title[1].replace(/\n/g, ' ').trim();
  }

  // Authors: one \authorOne[...]{...} entry per author. Affiliation macros
  // present inside the braces select the institution indices below.
  const collected = [];
  for (const entry of latexContent.matchAll(/\\authorOne\[[^\]]*\]\{([^}]+)\}/g)) {
    const raw = entry[1];

    const affiliations = [];
    if (raw.includes('\\ensps')) {
      affiliations.push(1); // École Normale Supérieure
    }
    if (raw.includes('\\hf')) {
      affiliations.push(2); // Hugging Face
    }

    // Drop the affiliation macros to obtain the bare author name
    const name = raw
      .replace(/\\ensps/g, '')
      .replace(/\\hf/g, '')
      .replace(/\s+/g, ' ')
      .trim();

    // Skip empty authors or placeholder entries
    if (name && name !== '...') {
      collected.push({
        name,
        affiliations: affiliations.length > 0 ? affiliations : [2] // default to HF
      });
    }
  }
  if (collected.length > 0) {
    metadata.authors = collected;
  }

  // Fixed pair of affiliations referenced by the indices assigned above
  metadata.affiliations = [
    { name: "École Normale Supérieure Paris-Saclay" },
    { name: "Hugging Face" }
  ];

  // Publication date: \date{...} or \newcommand{\date}{...}
  for (const pattern of [/\\date\s*\{([^}]+)\}/, /\\newcommand\s*\{\\date\}\s*\{([^}]+)\}/]) {
    const found = latexContent.match(pattern);
    if (found) {
      metadata.published = found[1].trim();
      break;
    }
  }

  // Fall back to today's date, formatted like "Jan 01, 2025"
  if (!metadata.published) {
    metadata.published = new Date().toLocaleDateString('en-US', {
      year: 'numeric',
      month: 'short',
      day: '2-digit'
    });
  }

  return metadata;
}
|
| 93 |
+
|
| 94 |
+
/**
 * Generate YAML frontmatter from a metadata object.
 *
 * Fix over the previous version: values are now escaped before being placed
 * inside YAML double-quoted scalars — raw interpolation produced invalid
 * frontmatter whenever a title/name/description contained '"' or '\'.
 *
 * @param {object} metadata - Metadata object; recognized optional keys:
 *   title, authors, affiliations, published, doi, description, licence, tags
 * @returns {string} - Complete YAML frontmatter string ("---" fenced,
 *   followed by a blank line)
 */
export function generateFrontmatter(metadata) {
  // Render a value as a YAML double-quoted scalar, escaping backslashes
  // and double quotes so the output stays parseable.
  const quote = (value) => `"${String(value).replace(/\\/g, '\\\\').replace(/"/g, '\\"')}"`;

  let frontmatter = '---\n';

  // Title
  if (metadata.title) {
    frontmatter += `title: ${quote(metadata.title)}\n`;
  }

  // Authors
  if (metadata.authors && metadata.authors.length > 0) {
    frontmatter += 'authors:\n';
    metadata.authors.forEach((author) => {
      frontmatter += `  - name: ${quote(author.name)}\n`;
      if (author.url) {
        frontmatter += `    url: ${quote(author.url)}\n`;
      }
      frontmatter += `    affiliations: [${author.affiliations.join(', ')}]\n`;
    });
  }

  // Affiliations
  if (metadata.affiliations && metadata.affiliations.length > 0) {
    frontmatter += 'affiliations:\n';
    metadata.affiliations.forEach((affiliation) => {
      frontmatter += `  - name: ${quote(affiliation.name)}\n`;
      if (affiliation.url) {
        frontmatter += `    url: ${quote(affiliation.url)}\n`;
      }
    });
  }

  // Publication date
  if (metadata.published) {
    frontmatter += `published: ${quote(metadata.published)}\n`;
  }

  // Additional metadata
  if (metadata.doi) {
    frontmatter += `doi: ${quote(metadata.doi)}\n`;
  }

  if (metadata.description) {
    frontmatter += `description: ${quote(metadata.description)}\n`;
  }

  if (metadata.licence) {
    frontmatter += `licence: >\n  ${metadata.licence}\n`;
  }

  if (metadata.tags && metadata.tags.length > 0) {
    frontmatter += 'tags:\n';
    metadata.tags.forEach((tag) => {
      frontmatter += `  - ${tag}\n`;
    });
  }

  // Default Astro configuration
  frontmatter += 'tableOfContentsAutoCollapse: true\n';
  frontmatter += '---\n\n';

  return frontmatter;
}
|
| 161 |
+
|
| 162 |
+
/**
 * Extract and generate frontmatter from LaTeX content.
 * Convenience wrapper: metadata extraction followed by YAML rendering.
 * @param {string} latexContent - Raw LaTeX content
 * @returns {string} - Complete YAML frontmatter
 */
export function extractAndGenerateFrontmatter(latexContent) {
  return generateFrontmatter(extractLatexMetadata(latexContent));
}
|
app/scripts/latex-to-markdown/output/main.md
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
|
| 2 |
|
| 3 |
Robotics is an inherently multidisciplinary field, and is now witnessing unprecedented advancements since its inception in the 1960s. Yet, more than sixty years after the debut of Unimate, robots have still not fully integrated into the rich, unstructured, and dynamic world we humans inhabit. Over the decades, numerous disciplines have shown immense promise in tackling the challenges of creating autonomous systems. This tutorial takes a clear stance in the debate on whether modern Machine Learning can play a pivotal role in the development of autonomous robot systems: we believe this to be the case.
|
| 4 |
|
|
@@ -16,7 +16,7 @@ Instead, our goal here is to provide an intuitive explanation as per why these d
|
|
| 16 |
|
| 17 |
We sincerely hope this tutorial serves as a valuable starting point for your journey into robot learning.
|
| 18 |
|
| 19 |
-
|
| 20 |
|
| 21 |
<figure id="fig:figure1">
|
| 22 |
<img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch1/ch1-lerobot-figure1.png" />
|
|
@@ -43,13 +43,13 @@ This tutorial serves the double purpose of providing useful references for the S
|
|
| 43 |
|
| 44 |
Our goal with this tutorial is to provide an intuitive explanation of the reasons various disparate ideas from Machine Learning (ML) have converged and are powering the current evolution of Robotics, driving the unprecedented progress we see today. We complement our presentation of the most common and recent approaches in robot learning with practical code implementations using `lerobot`, and start here by presenting the dataset format introduced with `lerobot`.
|
| 45 |
|
| 46 |
-
|
| 47 |
|
| 48 |
`LeRobotDataset` is a standardized dataset format designed to address the specific needs of robot learning research, and it provides a unified and convenient access to robotics data across modalities, including sensorimotor readings, multiple camera feeds and teleoperation status. `LeRobotDataset` also accommodates for storing general information regarding the data being collected, including textual descriptions of the task being performed by the teleoperator, the kind of robot used, and relevant measurement specifics like the frames per second at which the recording of both image and robot state’s streams are proceeding.
|
| 49 |
|
| 50 |
In this, `LeRobotDataset` provides a unified interface for handling multi-modal, time-series data, and it is designed to seamlessly integrate with the PyTorch and Hugging Face ecosystems. `LeRobotDataset` can be easily extended by users and it is highly customizable by users, and it already supports openly available data coming from a variety of embodiments supported in `lerobot`, ranging from manipulator platforms like the SO-100 arm and ALOHA-2 setup, to real-world humanoid arm and hands, as well as entirely simulation-based datasets, and self-driving cars. This dataset format is built to be both efficient for training and flexible enough to accommodate the diverse data types encountered in robotics, while promoting reproducibility and ease of use for users.
|
| 51 |
|
| 52 |
-
|
| 53 |
|
| 54 |
A core design choice behind `LeRobotDataset` is separating the underlying data storage from the user-facing API. This allows for efficient storage while presenting the data in an intuitive, ready-to-use format.
|
| 55 |
|
|
@@ -75,7 +75,7 @@ For scalability, and to support datasets with potentially millions of trajectori
|
|
| 75 |
|
| 76 |
- `videos/*`: Contains the MP4 video files for all visual observation streams. Similar to the `data/` directory, the video footage from multiple episodes is concatenated into single MP4 files. This strategy significantly reduces the number of files in the dataset, which is more efficient for modern filesystems.
|
| 77 |
|
| 78 |
-
|
| 79 |
|
| 80 |
This section provides an overview of how to access datasets hosted on Hugging Face using the `LeRobotDataset` class. Every dataset on the Hugging Face Hub contains the three main pillars presented above (Tabular, Visual and relational Metadata), and can be accessed with a single instruction.
|
| 81 |
|
|
@@ -142,7 +142,7 @@ for epoch in range(num_epochs):
|
|
| 142 |
|
| 143 |
</div>
|
| 144 |
|
| 145 |
-
|
| 146 |
|
| 147 |
<div class="epigraph">
|
| 148 |
|
|
@@ -158,7 +158,7 @@ TL;DR Learning-based approaches to robotics are motivated by the need to (1) gen
|
|
| 158 |
|
| 159 |
</div>
|
| 160 |
|
| 161 |
-
|
| 162 |
|
| 163 |
<figure id="fig:generating-motion-atlas">
|
| 164 |
<img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch2/ch2-approaches.png" style="width:50.0%" />
|
|
@@ -169,7 +169,7 @@ Robotics is concerned with producing artificial motion in the physical world in
|
|
| 169 |
|
| 170 |
Methods to produce robotics motion range from traditional *explicit* models--<span style="color: hf2">dynamics-based</span>[^1] methods, leveraging precise descriptions of the mechanics of robots’ rigid bodies and their interactions with eventual obstacles in the environment--to *implicit* models--<span style="color: hf2">learning-based</span> methods, treating artificial motion as a statistical pattern to learn given multiple sensorimotor readings @agrawalComputationalSensorimotorLearning, @bekrisStateRobotMotion2024. A variety of methods have been developed between these two extrema. For instance, @hansenTemporalDifferenceLearning2022 show how learning-based systems can benefit from information on the physics of problems, complementing a traditional learning method such as Temporal Difference (TD)-learning @suttonReinforcementLearningIntroduction2018 with Model-Predictive Control (MPC). Conversely, as explicit models may be relying on assumptions proving overly simplistic--or even unrealistic--in practice, learning can prove effective to improve modeling of complex phenomena or complement perception @mccormacSemanticFusionDense3D2016. Such examples aim at demonstrating the richness of approaches to robotics, and Figure <a href="#fig:generating-motion-atlas" data-reference-type="ref" data-reference="fig:generating-motion-atlas">2</a> graphically illustrates some of the most relevant techniques. Such a list is clearly far from being exhaustive, and we refer to @bekrisStateRobotMotion2024 for a more comprehensive overview of both general and application-specific methods for motion generation. In this section, we wish to introduce the inherent benefits of <span style="color: hf2">learning-based approaches to robotics</span>--the core focus on this tutorial.
|
| 171 |
|
| 172 |
-
|
| 173 |
|
| 174 |
<figure id="fig:robotics-platforms-atlas">
|
| 175 |
<img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch2/ch2-platforms.png" style="width:70.0%" />
|
|
@@ -182,7 +182,7 @@ Effects such as (1) are typically achieved *through* the robot, i.e. generating
|
|
| 182 |
|
| 183 |
The traditional body of work developed since the very inception of robotics is increasingly complemented by learning-based approaches. ML has indeed proven particularly transformative across the entire robotics stack, first empowering planning-based techniques with improved state estimation used for traditional planning @tangPerceptionNavigationAutonomous2023 and then end-to-end replacing controllers, effectively yielding perception-to-action methods @koberReinforcementLearningRobotics. Work in producing robots capable of navigating a diverse set of terrains demonstrated the premise of both dynamics and learning-based approaches for locomotion @griffinWalkingStabilizationUsing2017, @jiDribbleBotDynamicLegged2023, @leeLearningQuadrupedalLocomotion2020, @margolisRapidLocomotionReinforcement2022, and recent works on whole-body control indicated the premise of learning-based approaches to generate rich motion on complex robots, including humanoids @zhangWoCoCoLearningWholeBody2024, @bjorckGR00TN1Open2025. Manipulation has also been widely studied, particularly considering its relevance for many impactful use-cases ranging from high-risk applications for humans @fujitaDevelopmentRobotsNuclear2020, @alizadehComprehensiveSurveySpace2024 to manufacturing @sannemanStateIndustrialRobotics2020. While explicit models have proven fundamental in achieving important milestones towards the development of modern robotics, recent works leveraging implicit models proved particularly promising in surpassing scalability and applicability challenges via learning @koberReinforcementLearningRobotics.
|
| 184 |
|
| 185 |
-
|
| 186 |
|
| 187 |
Robot manipulators typically consist of a series of links and joints, articulated in a chain finally connected to an *end-effector*. Actuated joints are considered responsible for generating motion of the links, while the end effector is instead used to perform specific actions at the target location (e.g., grasping/releasing objects via closing/opening a gripper end-effector, using a specialized tool like a screwdriver, etc.).
|
| 188 |
|
|
@@ -258,7 +258,7 @@ Unlike eq. <a href="#eq:ik_problem" data-reference-type="ref" data-reference="
|
|
| 258 |
|
| 259 |
Following trajectories with diff-IK is a valid option in well-controlled and static environments (e.g., industrial manipulators in controlled manufacturing settings), and relies on the ability to define a set of target velocities to track $`[\dot {p}^*_0, \dot {p}^*_1, \dots, \dot {p}^*_k ]`$--an error-prone task largely requiring human expertise. Furthermore, diff-IK relies on the ability to (1) access $`J(q) \, \forall q \in \mathcal Q`$ and (2) compute its pseudo-inverse at every iteration of a given control cycle--a challenging assumption in highly dynamical settings, or for complex kinematic chains.
|
| 260 |
|
| 261 |
-
|
| 262 |
|
| 263 |
While very effective when a goal trajectory has been well specified, the performance of diff-IK can degrade significantly in the presence of modeling/tracking errors, or in the presence of non-modeled dynamics in the environment.
|
| 264 |
|
|
@@ -278,7 +278,7 @@ More advanced techniques for control consisting in feedback linearization, PID c
|
|
| 278 |
|
| 279 |
We point the interested reader to , , and for extended coverage of FK, IK, diff-IK and control for (diff-)IK.
|
| 280 |
|
| 281 |
-
|
| 282 |
|
| 283 |
Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular applicative problem.
|
| 284 |
|
|
@@ -297,7 +297,7 @@ Lastly, dynamics-based methods (naturally) overlook the rather recent <span styl
|
|
| 297 |
|
| 298 |
Taken together, these limitations (Figure <a href="#fig:classical-limitations" data-reference-type="ref" data-reference="fig:classical-limitations">10</a>) motivate the exploration of learning-based approaches that can (1) integrate perception and control more tightly, (2) adapt across tasks and embodiments with reduced expert modeling interventions and (3) scale gracefully in performance as more robotics data becomes available.
|
| 299 |
|
| 300 |
-
|
| 301 |
|
| 302 |
<div class="epigraph">
|
| 303 |
|
|
@@ -338,7 +338,7 @@ Figure <a href="#fig:robot-learning-atlas" data-reference-type="ref" data-refer
|
|
| 338 |
|
| 339 |
Applications of RL to robotics have been long studied, to the point the relationship between these two disciplines has been compared to that between physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their interactive and sequential nature, many robotics problems can be directly mapped to RL problems. Figure <a href="#fig:robotics-with-rl-examples" data-reference-type="ref" data-reference="fig:robotics-with-rl-examples">13</a> depicts two of such cases. Reaching for an object to move somewhere else in the scene is indeed a sequential problem where at each cycle the controller needs to adjust the position of the robotic arm based on its current configuration and the (possibly varying) position of the object. Figure <a href="#fig:robotics-with-rl-examples" data-reference-type="ref" data-reference="fig:robotics-with-rl-examples">13</a> also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation. While sliding to the side, the controller has to constantly keep adjusting to the robot’s proprioception to avoid failure (falling).
|
| 340 |
|
| 341 |
-
|
| 342 |
|
| 343 |
The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to model robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) learning how to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents can improve via trial-and-error only, thus entirely bypassing the need to develop explicit models of the problem dynamics, and rather exploiting interaction data only. In RL, this feedback loop (Figure <a href="#fig:rl-most-famous-pic" data-reference-type="ref" data-reference="fig:rl-most-famous-pic">14</a>) between actions and outcomes is established through the agent sensing a scalar quantity (*reward*).
|
| 344 |
|
|
@@ -415,7 +415,7 @@ $$
|
|
| 415 |
|
| 416 |
Popular approaches to continuous state and action space--such as those studied within robotics--include @schulmanTrustRegionPolicy2017, @schulmanProximalPolicyOptimization2017, @haarnojaSoftActorCriticOffPolicy2018. Across manipulation @akkayaSolvingRubiksCube2019 and locomotion @leeLearningQuadrupedalLocomotion2020 problems, RL proved extremely effective in providing a platform to (1) adopt a unified, streamlined perception-to-action pipeline, (2) natively integrate proprioception with multi-modal high-dimensional sensor streams, (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2024.
|
| 417 |
|
| 418 |
-
|
| 419 |
|
| 420 |
Streamlined end-to-end control pipelines, data-driven feature extraction and a disregard for explicit modeling in favor of interaction data are all features of RL for robotics. However, particularly in the context of real-world robotics, RL still suffers from limitations concerning machine safety and learning efficiency.
|
| 421 |
|
|
@@ -447,7 +447,7 @@ To make the most of (1) the growing number of openly available datasets and (2)
|
|
| 447 |
|
| 448 |
Off-policy algorithms like Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 tend to be more sample efficient than their on-policy counterpart @schulmanProximalPolicyOptimization2017, due to the presence of a *replay buffer* used over the course of the training. Other than allowing to re-use transitions $`(s_t, a_t, r_t, s_{t+1})`$ over the course of training, the replay buffer can also accommodate the injection of previously-collected data in the training process @ballEfficientOnlineReinforcement2023. Using expert demonstrations to guide learning together with learned rewards, RL training can effectively be carried out in the real-world @luoSERLSoftwareSuite2025. Interestingly, when completed with in-training human interventions, real-world RL agents have been shown to learn policies with near-perfect success rates on challenging manipulation tasks in 1-2 hours @luoPreciseDexterousRobotic2024.
|
| 449 |
|
| 450 |
-
|
| 451 |
|
| 452 |
In an MDP, the optimal policy $`\pi^*`$ can be derived from its associated $`Q`$-function, $`Q_{\pi^*}`$, and in particular the optimal action(s) $`\mu(s_t)`$ can be selected maximizing the optimal $`Q`$-function over the action space,
|
| 453 |
``` math
|
|
@@ -508,13 +508,13 @@ Similarily to DDPG, SAC also maintains an explicit policy, trained under the sam
|
|
| 508 |
```
|
| 509 |
The update rule provided in <a href="#eq:sac-policy-update" data-reference-type="ref" data-reference="eq:sac-policy-update">[eq:sac-policy-update]</a> optimizes the policy while projecting it on a set $`\Pi`$ of tractable distributions (e.g., Gaussians, @haarnojaReinforcementLearningDeep2017).
|
| 510 |
|
| 511 |
-
|
| 512 |
|
| 513 |
Importantly, sampling $`(s_t, a_t, r_t, s_{t+1})`$ from the replay buffer $`D`$ conveniently allows to approximate the previously introduced expectations for TD-target and TD-error through Monte-Carlo (MC) estimates. The replay buffer $`D`$ also proves extremely useful in maintaining a history of previous transitions and using it for training, improving on sample efficiency. Furthermore, it also naturally provides an entry point to inject offline trajectories recorded, for instance, by a human demonstrator, into the training process.
|
| 514 |
|
| 515 |
Reinforcement Learning with Prior Data (RLPD) @ballEfficientOnlineReinforcement2023 is an Offline-to-Online RL algorithm leveraging prior data to effectively accelerate the training of a SAC agent. Unlike previous works on Offline-to-Online RL, RLPD avoids any pre-training and instead uses the available offline data $`D_\text{offline}`$ to improve online-learning from scratch. During each training step, transitions from both the offline and online replay buffers are sampled in equal proportion, and used in the underlying SAC routine.
|
| 516 |
|
| 517 |
-
|
| 518 |
|
| 519 |
Despite the possibility to leverage offline data for learning, the effectiveness of real-world RL training is still limited by the need to define a task-specific, hard-to-define reward function. Further, even assuming to have access to a well-defined reward function, typical robotics pipelines rely mostly on proprioceptive inputs augmented by camera streams of the environment. As such, even well-defined rewards would need to be derived from processed representations of unstructured observations, introducing brittleness. In their technical report, @luoSERLSoftwareSuite2025 empirically address the needs (1) to define a reward function and (2) to use it on image observations, by introducing a series of tools to allow for streamlined training of *reward classifiers* $`c`$, as well as jointly learn forward-backward controllers to speed up real-world RL. Reward classifiers are particularly useful in treating complex tasks--e.g., folding a t-shirt--for which a precise reward formulation is arbitrarily complex to obtain, or that do require significant shaping and are more easily learned directly from demonstrations of success ($`e^+`$) or failure ($`e^-`$) states, $`s \in \mathcal S`$, with a natural choice for the state-conditioned reward function $`r: \mathcal S \mapsto \mathbb R`$ being $`r(s) = \log c(e^+ \vert s)`$. Further, @luoSERLSoftwareSuite2025 demonstrate the benefits of learning *forward* (executing the task from initial state to completion) and *backward* (resetting the environment to the initial state from completion) controllers, parametrized by separate policies.
|
| 520 |
|
|
@@ -529,11 +529,11 @@ Building on off-policy deep Q-learning with replay buffers, entropy regularizati
|
|
| 529 |
|
| 530 |
Human in the Loop Sample Efficient Robot reinforcement Learning (HIL-SERL) @luoPreciseDexterousRobotic2024 augments offline-to-online RL with targeted human corrections during training, and employs prior data to (1) train a reward classifier and (2) bootstrap RL training on expert trajectories. While demonstrations provide the initial dataset seeding learning and constraining early exploration, interactive corrections allow a human supervisor to intervene on failure modes and supply targeted interventions to aid the learning process. Crucially, human interventions are stored in both the offline and online replay buffers, differently from the autonomous transitions generated at training time and stored in the online buffer only. Consequently, given an intervention timestep $`k \in (0, T)`$, length-$`K`$ human intervention data $`\{ s^{\text{human}}_k, a^{\text{human}}_k, r^{\text{human}}_k, s^{\text{human}}_{k+1}\}_{k=1}^K`$ is more likely to be sampled for off-policy learning than the data generated online during training, providing stronger supervision to the agent while still allowing for autonomous learning. Empirically, HIL-SERL attains near-perfect success rates on diverse manipulation tasks within 1-2 hours of training @luoPreciseDexterousRobotic2024, underscoring how offline datasets with online RL can markedly improve stability and data efficiency, and ultimately even allow real-world RL-training.
|
| 531 |
|
| 532 |
-
|
| 533 |
|
| 534 |
**TODO(fracapuano): work out rl training example**
|
| 535 |
|
| 536 |
-
|
| 537 |
|
| 538 |
Despite the advancements in real-world RL training, solving robotics training RL agents in the real world still suffers from the following limitations:
|
| 539 |
|
|
@@ -543,7 +543,7 @@ Despite the advancements in real-world RL training, solving robotics training RL
|
|
| 543 |
|
| 544 |
Advances in Behavioral Cloning (BC) from corpora of human demonstrations address both of these concerns. By learning in a supervised fashion to reproduce expert demonstrations, BC methods prove competitive while bypassing the need for simulated environments and hard-to-define reward functions.
|
| 545 |
|
| 546 |
-
|
| 547 |
|
| 548 |
<div class="epigraph">
|
| 549 |
|
|
@@ -593,11 +593,11 @@ Despite the inherent challenges of learning on non-i.i.d. data, the BC formulati
|
|
| 593 |
|
| 594 |
While conceptually elegant, point-estimate policies $`f : \mathcal O\mapsto \mathcal A`$ learned by solving <a href="#eq:loss-minimization-SL" data-reference-type="ref" data-reference="eq:loss-minimization-SL">[eq:loss-minimization-SL]</a> have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure <a href="#fig:ch4-issues-with-bc" data-reference-type="ref" data-reference="fig:ch4-issues-with-bc">21</a> illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $`\mathcal D`$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $`\epsilon`$-prediction errors $`0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon`$ can quickly drive the policy into out-of-distribution states, incurring in less confident generations and thus errors compounding (Figure <a href="#fig:ch4-issues-with-bc" data-reference-type="ref" data-reference="fig:ch4-issues-with-bc">21</a>, left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving robotics problems, since multiple trajectories can be equally as good towards the accomplishment of a goal (e.g., symmetric grasps, Figure <a href="#fig:ch4-issues-with-bc" data-reference-type="ref" data-reference="fig:ch4-issues-with-bc">21</a>, right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the generative model $`p(o, a)`$ underlying the samples in $`\mathcal D`$, rather than explicitly learning a prediction function $`f(o) = a`$.
|
| 595 |
|
| 596 |
-
|
| 597 |
|
| 598 |
Generative Models (GMs) aim to learn the stochastic process underlying the very generation of the data collected, and typically do so by fitting a probability distribution that approximates the unknown *data distribution*, $`p`$. In the case of BC, this unknown data distribution $`p`$ represents the expert’s joint distribution over $`(o, a)`$-pairs. Thus, given a finite set of $`N`$ pairs $`\mathcal D = \{ (o,a)_i \}_{i=0}^N`$ used as an imitation learning target (and thus assumed to be i.i.d.), GM seeks to learn a *parametric* distribution $`p_\theta(o,a)`$ such that (1) new samples $`(o,a) \sim p_\theta(\bullet)`$ resemble those stored in $`\mathcal D`$, and (2) high likelihood is assigned to the observed regions of the unobservable $`p`$. Likelihood-based learning provides a principled training objective to achieve both objectives, and it is thus extensively used in GM @prince2023understanding.
|
| 599 |
|
| 600 |
-
|
| 601 |
|
| 602 |
<figure id="fig:ch4-task-effect-on-pairs">
|
| 603 |
<img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch4/ch4-task-effect-on-pairs.png" style="width:80.0%" />
|
|
@@ -668,7 +668,7 @@ Assuming $`p_\theta(o,a \vert z)`$ is parametrized as an isotropic Gaussian dist
|
|
| 668 |
```
|
| 669 |
Indeed, it is very common in practice to approximate from the learned likelihood $`p_\theta(o,a \vert z)`$ as a parametric distribution (e.g. Gaussians) parametrized by some learned vector of coefficients derived from $`\mu_\theta (z), \ z \sim p (\bullet)`$. In all such cases, learning a VAE corresponds to optimally *reconstructing* the examples in $`\mathcal D`$ by minimizing the L2-error--a very common *supervised learning* objective for regression targets--while regularizing the information compression into the latent, as under the common modeling choice $`p(z) = \mathcal N (\mathbf{0}, \mathbf{I})`$ <a href="#eq:VAE-Lreg" data-reference-type="ref" data-reference="eq:VAE-Lreg">[eq:VAE-Lreg]</a> regularizes the posterior limiting the expressivity of $`q_\phi(z\vert o,a)`$.
|
| 670 |
|
| 671 |
-
|
| 672 |
|
| 673 |
VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#eq:BC-latent-variable" data-reference-type="ref" data-reference="eq:BC-latent-variable">[eq:BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $`p_\theta`$ and (approximate) posterior $`q_\phi`$ for such model. In that, the unknown data distribution $`p(o,a)`$ is effectively approximated via $`\int_Z p(z) p_\theta(o,a \vert z)`$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $`p(o,a)`$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $`o,a`$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#fig:ch4-many-latents" data-reference-type="ref" data-reference="fig:ch4-many-latents">24</a>), resulting in
|
| 674 |
$$
|
|
@@ -719,7 +719,7 @@ In this simplified (minimization) objective, the optimization process differs fr
|
|
| 719 |
|
| 720 |
By learning the total displacement from a generally uninformative, corrupted sample obtained diffusing information and a sample from an unknown distribution--significant ($`\Vert \epsilon \Vert > 0`$) whenever input and target distribution are sufficiently different-- @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution reversing the displacement, *denoising* samples. Interestingly, under the hypothesis real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-manifold point (such as perturbed, uninformative samples) to the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $`p_\theta (z_{t-1} \vert z_t)`$ is Gaussian, then sampling $`z_{t-1} \sim p_\theta(\bullet \vert z_{t})`$ corresponds to computing $`z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I})`$, thus showing that the lower-level latent variables in a DM can be obtained by iteratively removing noise from the one-step higher order variable, using the noise regressor $`\epsilon_\theta(z_t, t)`$ learned minimizing <a href="#eq:diffusion-simplified-loss" data-reference-type="ref" data-reference="eq:diffusion-simplified-loss">[eq:diffusion-simplified-loss]</a>.
|
| 721 |
|
| 722 |
-
|
| 723 |
|
| 724 |
The posterior parametrization adopted by DMs proved traditionally effective, yet it raised concerns about its efficiency at inference time, where a possibly large number of compute-expensive denoising steps are needed in order to recover a sample from the target distribution. Flow Matching (FM) @lipmanFlowMatchingGenerative2023 extends DMs to the general case of arbitrary, parametrized likelihood and posteriors, and in this defines a superseding class of GMs providing a unified framework for learning *continuous transformations* between distributions, encompassing and generalizing DMs. Instead of a *stochastic, discrete, multi-step* denoising process, FM aims to learn a *deterministic, continuous, differentiable flow* $`\psi: [0,1] \times Z \mapsto Z`$, formalized starting from a possibly time-dependent vector field $`v: [0,1] \times Z \mapsto Z`$ transporting samples from a simple prior distribution $`p_0`$--e.g., a standard Gaussian--to a more complex, potentially unknown data distribution $`p_1`$ over time. Note how FM models time $`t \in [0,1]`$ to be varying continuously while moving away *from* an easy-to-sample distribution $`p_0`$ *towards* the unknown data-distribution, $`p_1`$. This results in a continuous and deterministic trajectory for each sample, which can be more efficient to generate compared to the stochastic paths of DMs. Formally, FM can be fully characterized by an ordinary differential equation (ODE) relating instantaneous variations of flows with the underlying vector field, and hence providing complete trajectories over the distributions’ support when integrating over time,
|
| 725 |
$$
|
|
@@ -753,7 +753,7 @@ In practice, FM can be applied to generative modeling by learning a vector field
|
|
| 753 |
\mathcal L(\theta) = \mathbb{E}_{t, z_0, z_1} \big[
|
| 754 |
\Vert v_\theta((1-t)z_0 + t z_1, t) - (z_1 - z_0) \Vert^2 \big], \quad t \sim \mathcal{U}([0,1]),`$ where $`z_0 \sim p_0(\bullet)`$ and $`z_1 \sim p_1(\bullet)`$. Note how in <a href="#eq:flow-matching-objective" data-reference-type="ref" data-reference="eq:flow-matching-objective">[eq:flow-matching-objective]</a>--differently from <a href="#eq:diffusion-simplified-loss" data-reference-type="ref" data-reference="eq:diffusion-simplified-loss">[eq:diffusion-simplified-loss]</a>--time is assumed to be varying continuously $`t \sim \mathcal U([0,1])`$ rather than discretely $`t \sim \mathcal U(\{0,1\})`$, a key property of flow-based models. The objective in <a href="#eq:flow-matching-objective" data-reference-type="ref" data-reference="eq:flow-matching-objective">[eq:flow-matching-objective]</a> directly regresses the learned vector field onto the simple, straight path connecting a point from the prior and a point from the data, providing a simulation-free training procedure that is both stable and efficient. At inference time, samples are generated by starting with $`z_0 \sim p_0`$ and iteratively refined according to $`\frac{dz}{dt} = v_\theta(z_t, t)`$ for $`t \in [0,1]`$--an operation that can be numerically carried out with standard ODE solvers.
|
| 755 |
|
| 756 |
-
|
| 757 |
|
| 758 |
While GMs prove useful in learning complex, high-dimensional multi-modal distributions, they do not natively address the compounding errors problem characteristic of online, sequential predictions. In Action Chunking with Transformers (ACT), @zhaoLearningFineGrainedBimanual2023 present an application of VAEs to the problem of learning purely from offline trajectories, and introduce a simple, yet effective method to mitigate error compounding, learning high-fidelity autonomous behaviors. Drawing inspiration from how humans plan to atomically enact sequences of the kind $`a_{t:t+k}`$ instead of single actions $`a_t`$, @zhaoLearningFineGrainedBimanual2023 propose learning a GM on a dataset of input demonstrations by modeling *action chunks*. Besides contributions to learning high-performance autonomous behaviors, @zhaoLearningFineGrainedBimanual2023 also introduce hardware contributions in the form of a low-cost bimanual robot setup (ALOHA) capable of performing fine-grained manipulation tasks, such as opening a lid, slotting a battery in its allotment or even preparing tape for application.
|
| 759 |
|
|
@@ -786,9 +786,9 @@ However, the authors claim using a deterministic procedure to derive $`z`$ may b
|
|
| 786 |
<figcaption>The CVAE decoder used in ACT, comprising a full encoder-decoder Transformer architecture. Camera observations from all <span class="math inline"><em>n</em></span> camera views are first embedded using pre-trained visual encoders, and then concatenated to the corresponding positional embeddings. Then, alongside embeddings for the proprioceptive information available and the style variable <span class="math inline"><em>z</em></span> retrieved from the CVAE encoder, the Transformer encoder shares the matrices <span class="math inline"><em>K</em>, <em>Q</em></span> with the Transformer decoder, trained to decode fixed position embeddings into valid action chunks.</figcaption>
|
| 787 |
</figure>
|
| 788 |
|
| 789 |
-
|
| 790 |
|
| 791 |
-
|
| 792 |
|
| 793 |
DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformers @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $`p(o,a)`$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $`p_\theta(o)`$ given $`p_\theta(o,a)`$, DP’s rationale for modeling the data distribution via $`p_\theta(a \vert o)`$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
|
| 794 |
|
|
@@ -811,9 +811,9 @@ Figure <a href="#fig:diffusion-policy-architecture" data-reference-type="ref" d
|
|
| 811 |
|
| 812 |
Training using 50-150 demos (15-60 minutes of teleoperation data) DP achieves strong performance on a variety of simulated and real-world tasks, including dexterous and deformable manipulation tasks such as sauce pouring and mat unrolling. Notably, the authors ablated the relevance of using RGB camera streams as input to their policy, and observed how high frame-rate visual observations can be used to attain performance (measured as success rate) comparable to that of state-based policies, typically trained in simulation with privileged information not directly available in real-world deployments. As high-frame rate RGB inputs naturally accommodate for dynamic, fast changing environments, @chiDiffusionPolicyVisuomotor2024’s conclusion offers significant evidence for learning streamlined control policies directly from pixels. In their work, @chiDiffusionPolicyVisuomotor2024 also ablate the performance of DP against their baseline with respect to the size of the dataset collected, showing that DP outperforms the considered baseline for every benchmark size considered. Further, to accelerate inference, @chiDiffusionPolicyVisuomotor2024 employ Denoising Diffusion Implicit Models @songDenoisingDiffusionImplicit2022, a variant of Denoising Diffusion Probabilistic Models @hoDenoisingDiffusionProbabilistic2020 (DDPM) adopting a strictly deterministic denoising paradigm (differently from DDPM’s natively stochastic one) inducing the same final distribution as DDPM’s, and yet resulting in 10 times fewer denoising steps at inference time @chiDiffusionPolicyVisuomotor2024. 
Across a range of simulated and real-world tasks, @chiDiffusionPolicyVisuomotor2024 find DPs particularly performant when implementing a transformer-based network as $`\epsilon_\theta`$, although the authors note the increased sensitivity of transformer networks to hyperparameters and thus explicitly recommend starting out with a simpler, convolution-based architecture for diffusion (Figure <a href="#fig:diffusion-policy-architecture" data-reference-type="ref" data-reference="fig:diffusion-policy-architecture">32</a>), which are however reported to be biased towards learning low-frequency components @tancikFourierFeaturesLet2020 and thus may prove more challenging to train with non-smooth action sequences.
|
| 813 |
|
| 814 |
-
|
| 815 |
|
| 816 |
-
|
| 817 |
|
| 818 |
Modern visuomotor policies output *action chunks*--sequences $`\pi(o_t) = \mathbf{A}_t`$ with $`\mathbf{A}_t = \bigl(a_t,a_{t+1},\dots,a_{t+H_a}\bigr)`$ being a sequence of $`H_a \gg 1`$ low-level commands enqueued in an action queue, originating from an environment observation, $`o_t`$. Predicting series of actions instead of single commands proved essential in learning complex, multi-modal behavior @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024.
|
| 819 |
|
|
@@ -844,7 +844,7 @@ $`\mathbf{A}_{t+1} \gets \mathbf{A}_t`$
|
|
| 844 |
|
| 845 |
</div>
|
| 846 |
|
| 847 |
-
|
| 848 |
|
| 849 |
*Async* inference (1) tightens the control loop by capturing observations more often, directly eliminating idle gaps at runtime, and (2) directly allows running inference on more powerful computational resources than the ones typically available onboard autonomous robotic platforms.
|
| 850 |
|
|
@@ -869,9 +869,9 @@ Interestingly, the behavior of async inference can be studied analytically. Firs
|
|
| 869 |
|
| 870 |
<a href="#fig:ch4-queues" data-reference-type="ref" data-reference="fig:ch4-queues">34</a> emphasizes the trade-off governed by $`g`$: small values result in idle periods, whereas $`g\approx 1`$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $`g\in(0,1)`$ allows to strike a balance between reactivity and resource budgets. If not for the aforementioned similarity filter, the system would send observations for processing every $`(1 - g) H_a \cdot \Delta t`$ seconds, receiving a new chunk of actions every $`(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]`$, on average. The presence of the observation similarity filter dilates this processing time, and serves the scope of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. In particular, <a href="#fig:ch4-queues" data-reference-type="ref" data-reference="fig:ch4-queues">34</a> results in a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in <a href="#fig:ch4-queues" data-reference-type="ref" data-reference="fig:ch4-queues">34</a> highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue runs empty.
|
| 871 |
|
| 872 |
-
|
| 873 |
|
| 874 |
-
|
| 875 |
|
| 876 |
<div class="epigraph">
|
| 877 |
|
|
@@ -894,7 +894,7 @@ The advent of large models trained on internet-scale datasets has drastically in
|
|
| 894 |
<figcaption>Fields within ML such as Computer Vision and NLP converged on the development of foundation models, trained on a variety of large-scale datasets and capable of performing multiple downstream tasks (top). Conversely, robotics suffered from limited standardization in terms of the architectures used, and siloed, task-specific datasets, incurring a high degree of fragmentation which traditionally hindered the development of generalist models for robotics in favour of task-specific models (bottom).</figcaption>
|
| 895 |
</figure>
|
| 896 |
|
| 897 |
-
|
| 898 |
|
| 899 |
The remarkable success of foundation models in NLP and CV is predicated on two core principles: architectural innovation and joint data-compute scaling. The transformer architecture proved instrumental in capturing long-range dependencies in sequential data such as text, and its stability and expressivity made it the *de facto* standard for modern large-scale models trained on internet-scale amounts of data. In stark contrast with popular NLP @raffelExploringLimitsTransfer2023 and CV @ImageNet_VSS09 general-purpose datasets, the field of robotics has historically developed around task-specific datasets, hindering scalability across problems and resulting in a concrete data deficit for general-purpose robot learning. Unlike the wealth of relatively readily available text and images on the internet, robotics data is intrinsically embodied--datasets collected for a manipulation robot typically differ entirely from locomotion datasets. Further, datasets consisting of expert demonstrations are (1) intrinsically expensive to collect and (2) notoriously heterogeneous--different human experts may perform the same task optimally yet in very different ways. In particular, since each expert trajectory is tied to a specific robot platform and the operating conditions of its environment and task, data heterogeneity has long posed a *methodological* challenge for scaling robotics datasets via aggregation. Beyond this, heterogeneity also raises *conceptual* issues: naively mixing data across embodiments can induce negative transfer, as control strategies developed in isolation for different robot systems in different environments may even conflict when combined. 
Thus, the high degree of fragmentation of robotics datasets and tasks has traditionally led to the development of *specialist* policies, trained on small, task-specific datasets, and which excel at their designated task but fail to generalize to new situations (Figure <a href="#fig:ch5-ml-vs-robotics-foundation" data-reference-type="ref" data-reference="fig:ch5-ml-vs-robotics-foundation">35</a>).
|
| 900 |
|
|
@@ -918,19 +918,19 @@ The success of large, proprietary models like RT-1 and RT-2, highlighted a growi
|
|
| 918 |
|
| 919 |
Figure <a href="#fig:ch5-trends" data-reference-type="ref" data-reference="fig:ch5-trends">37</a> illustrates graphically the two most relevant trends in modern robot learning. As datasets collected via centralized, cross-institutions cooperation of increasing size are made available for the research community, decentralized datasets collected by individual researchers and practitioners have also gained traction recently, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are also becoming much more compute-efficient, and as a result the models’ size has been consistently reducing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments.
|
| 920 |
|
| 921 |
-
|
| 922 |
|
| 923 |
Modern recipes to train large scale VLAs extend early efforts to learn foundation models from large amounts of data via BC, introducing significant advancements concerning both architectural and procedural aspects. From an architectural perspective, modern VLAs such as $`\pi_0`$ @blackp0VisionLanguageActionFlow2024 leverage a *unified transformer model* for efficiency of computation, while maintaining specialized sub-components within the model for visual perception and action prediction, enabling cross-task performance via language conditioning. Crucially, modern VLAs including @blackp0VisionLanguageActionFlow2024\[$`\pi_0`$\] and @shukorSmolVLAVisionLanguageActionModel2025\[SmolVLA\] adopt *unified* transformer models employing disjoint set of weights (*experts*) for compute-efficient visual-semantic understanding and robotic control. Procedurally, modern VLAs complement advanced Vision-Language Model (VLM) backbones with action-specific modules (1) adopting mid-sized *action experts* to model continuous actions distributions $`p (a_{t:t+H_a} \vert o_t)`$--avoiding discrete action tokens entirely--and (2) relying on *action chunking* as a strategy to reduce error compounding when predicting multiple actions learning from inherently non-i.i.d. data, such as demonstration data.
|
| 924 |
|
| 925 |
These architectural and procedural innovations present three benefits. First, developing architectures that exploit internet-scale pre-trained backbones allows fully capitalizing on the vast world knowledge and skills state-of-the-art VLMs exhibit, preventing models from needing to learn visual, linguistic and semantic concepts from scratch. Second, using generative models for continuous action distributions allows learning rich, multimodal data distributions, a much more likely scenario in the big-data regime typically tackled while developing generalist policies. Further, introducing two separate components for perception and action planning could enable using Mixture of Experts (MoE) architectures @fedusReviewSparseExpert2022, more efficient to run and thus resulting in faster inference--a key feature for models deployed in real-world scenarios. This new paradigm has been at the core of some of the most capable generalist policies developed to date, capable of few-shot adapting to novel tasks and of performing highly dexterous manipulation tasks, ranging from end-to-end laundry folding to bussing tables.
|
| 926 |
|
| 927 |
-
|
| 928 |
|
| 929 |
VLMs are designed to process both visual and textual modalities--most commonly by taking both images and text as input and generating text conditioned on the visual context. Recent advances in VLMs have been driven by the success of LLMs, with many approaches building upon pretrained LLMs and adopting similar training paradigms to the ones used in language modeling. Typically, VLMs @alayracFlamingoVisualLanguage2022, @laurenconWhatMattersWhen2024, @linVILAPretrainingVisual2024 are constructed by integrating a pretrained vision encoder @radfordLearningTransferableVisual2021, @zhaiSigmoidLossLanguage2023, @finiMultimodalAutoregressivePretraining2024 with a pretrained LLM @grattafioriLlama3Herd2024, @jiangMistral7B2023. Training then proceeds in multiple multimodal stages, beginning with a large-scale pretraining on datasets containing image-text pairs @LAION-COCO, @kakaobrain2022coyo700m and interleaved vision-language corpora @OBELICS, @MMC4, all followed by a supervised fine-tuning stage on instruction-tuning datasets @LLaVA-1.5, @tong2024cambrian, @laurenconWhatMattersWhen2024. The inherent multimodal nature of VLMs enables them to jointly reason over vision and language. Pre-training on vast internet-scale datasets allows these models to associate visual patterns with textual descriptions, thereby acquiring a rich semantic understanding of the world--knowledge about objects, their properties, and relationships--without explicit supervision for each concept. In turn, integrating a VLM as a perception backbone for a VLA allows the complete model to inherit rich world knowledge, sidestepping the need to learn visual and semantic representations from scratch. In principle, this allows the robot to ground high-level natural language instructions in its visual context, and possibly recognize unseen objects by connecting them to pre-trained concepts absorbed during pre-training, improving on the possibility to generalize to novel scenarios.
|
| 930 |
|
| 931 |
Recently, compute efficiency has also become a central focus in VLM research. Several works aim to reduce training costs by using smaller, more diverse datasets @LLaVA-1.5, @InstructBLIP, @bai2025qwen25vl, @zhu2024minigpt, @tong2024cambrian, training smaller-scale models @marafiotiSmolVLMRedefiningSmall2025, @moondream, @minicmpv2024, or by adapting pretrained unimodal models by tuning only a small subset of parameters @shukor2023epalm, @vallaeys2024improveddepalm, @MAPL, @FROMAGe, @tsimpoukelli2021multimodalfrozen, @BLIP-2. While the majority of VLM research focuses on image and text modalities, recent work has demonstrated that similar techniques can be extended to integrate additional modalities, such as video and audio @wang2025internvideo2, @liu2024kangaroo, @zhang2025videollama, @kong2024audioflam--a particularly promising direction of research for robotics applications, where multiple sensor modalities can be integrated effectively. This trend towards efficiency is paramount for robotics applications, where policies must operate under the stringent constraints of real-world deployment. Indeed, robots often possess limited on-board computational resources and must react in real-time to dynamic environments. Smaller and faster VLMs have thus become quintessential for developing responsive autonomous systems, enabling high-frequency control loops by reducing the latency between perception and action.
|
| 932 |
|
| 933 |
-
|
| 934 |
|
| 935 |
$`\pi_0`$ @blackp0VisionLanguageActionFlow2024 introduce a VLA consisting of a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching. Images and language are embedded with a late-fusion VLM (PaliGemma), while proprioceptive state and actions chunks are routed to a smaller action expert, initialized from scratch. The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and values matrices at each layer, maintaining specialization while efficiently allocating computation.
|
| 936 |
|
|
@@ -982,9 +982,9 @@ Besides adopting a MoE architecture with a VLM backbone initialized from a pre-t
|
|
| 982 |
|
| 983 |
Lastly, @blackp0VisionLanguageActionFlow2024 present cross-embodiment experiments where they demonstrate $`\pi_0`$’s ability to control both mobile and static manipulator robots with varying arm embodiments. The emergence of cross-embodiment capabilities is largely to be attributed to the presence of large scale cross-embodiment data in the data mixture, handled by $`\pi_0`$ defaulting to the maximal configuration size across the $`\pi`$ dataset, and zero-padding robots with fewer DoFs. In practice, $`\pi_0`$ constantly processes 18-DoF robots (two 6-DoF arms, two grippers, base, vertical torso), regardless of the kind of robot, and robots with fewer DoFs are zero-padded. $`\pi_0`$ also relies on three camera views, and uses masked image slots for training and deployment scenarios with fewer cameras.
|
| 984 |
|
| 985 |
-
|
| 986 |
|
| 987 |
-
|
| 988 |
|
| 989 |
VLAs remain in an early stage of development and are not yet as mature or widely adopted as LLMs and VLMs. Further, much of the impactful VLA progress remains proprietary, with many models sharing only weights while withholding full training details and essential methodological components. SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, aiming to democratize the developments of robotics foundation models by open sourcing model, training recipes and data used.
|
| 990 |
|
|
@@ -1003,9 +1003,9 @@ SmolVLA trims both token and layer compute. First, it *reduces visual tokens* vi
|
|
| 1003 |
|
| 1004 |
Departing from reliance on proprietary datasets, SmolVLA pretrains exclusively on 450+ *community datasets*, totaling 20K+ trajectories. Because instructions in community-contributed datasets can be noisy or missing, the authors re-annotate tasks with a small off-the-shelf VLM using frames sampled from the dataset, and standardize camera viewpoints by mapping sources to a consistent top/wrist/side ordering. At inference, similarly to $`\pi_0`$, SmolVLA integrates flow over 10 steps, resulting in fast inference. SmolVLA proves effective across a range of both real-world and simulated environments, rivaling $`\pi_0`$ while being close to 40% faster and consuming 6x less memory.
|
| 1005 |
|
| 1006 |
-
|
| 1007 |
|
| 1008 |
-
|
| 1009 |
|
| 1010 |
This tutorial has chronicled the paradigmatic shift transforming robotics, from the structured, model-based methods of its classical era to the dynamic, data-driven approaches that define modern robot learning. We began by examining the limitations of traditional dynamics-based control, highlighting the brittleness and the significant engineering overhead required by traditional approaches, which in turn motivates more flexible, less model-intensive learning approaches.
|
| 1011 |
|
|
|
|
| 1 |
+
## Foreword
|
| 2 |
|
| 3 |
Robotics is an inherently multidisciplinary field, and is now witnessing unprecedented advancements since its inception in the 1960s. Yet, more than sixty years after the debut of Unimate, robots have still not fully integrated into the rich, unstructured, and dynamic world we humans inhabit. Over the decades, numerous disciplines have shown immense promise in tackling the challenges of creating autonomous systems. This tutorial takes a clear stance in the debate on whether modern Machine Learning can play a pivotal role in the development of autonomous robot systems: we believe this to be the case.
|
| 4 |
|
|
|
|
| 16 |
|
| 17 |
We sincerely hope this tutorial serves as a valuable starting point for your journey into robot learning.
|
| 18 |
|
| 19 |
+
## Introduction
|
| 20 |
|
| 21 |
<figure id="fig:figure1">
|
| 22 |
<img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch1/ch1-lerobot-figure1.png" />
|
|
|
|
| 43 |
|
| 44 |
Our goal with this tutorial is to provide an intuitive explanation of the reasons various disparate ideas from Machine Learning (ML) have converged and are powering the current evolution of Robotics, driving the unprecedented progress we see today. We complement our presentation of the most common and recent approaches in robot learning with practical code implementations using `lerobot`, and start here by presenting the dataset format introduced with `lerobot`.
|
| 45 |
|
| 46 |
+
### `LeRobotDataset`
|
| 47 |
|
| 48 |
`LeRobotDataset` is a standardized dataset format designed to address the specific needs of robot learning research, and it provides unified and convenient access to robotics data across modalities, including sensorimotor readings, multiple camera feeds and teleoperation status. `LeRobotDataset` also accommodates storing general information regarding the data being collected, including textual descriptions of the task being performed by the teleoperator, the kind of robot used, and relevant measurement specifics like the frames per second at which both the image and robot-state streams are recorded.
|
| 49 |
|
| 50 |
In this, `LeRobotDataset` provides a unified interface for handling multi-modal, time-series data, and it is designed to seamlessly integrate with the PyTorch and Hugging Face ecosystems. `LeRobotDataset` can be easily extended and is highly customizable by users, and it already supports openly available data coming from a variety of embodiments supported in `lerobot`, ranging from manipulator platforms like the SO-100 arm and ALOHA-2 setup, to real-world humanoid arms and hands, as well as entirely simulation-based datasets, and self-driving cars. This dataset format is built to be both efficient for training and flexible enough to accommodate the diverse data types encountered in robotics, while promoting reproducibility and ease of use.
|
| 51 |
|
| 52 |
+
#### The dataset class design
|
| 53 |
|
| 54 |
A core design choice behind `LeRobotDataset` is separating the underlying data storage from the user-facing API. This allows for efficient storage while presenting the data in an intuitive, ready-to-use format.
|
| 55 |
|
|
|
|
| 75 |
|
| 76 |
- `videos/*`: Contains the MP4 video files for all visual observation streams. Similar to the `data/` directory, the video footage from multiple episodes is concatenated into single MP4 files. This strategy significantly reduces the number of files in the dataset, which is more efficient for modern filesystems.
|
| 77 |
|
| 78 |
+
### Code Example: Batching a (Streaming) Dataset
|
| 79 |
|
| 80 |
This section provides an overview of how to access datasets hosted on Hugging Face using the `LeRobotDataset` class. Every dataset on the Hugging Face Hub contains the three main pillars presented above (Tabular, Visual and relational Metadata), and can be accessed with a single instruction.
|
| 81 |
|
|
|
|
| 142 |
|
| 143 |
</div>
|
| 144 |
|
| 145 |
+
## Classical Robotics
|
| 146 |
|
| 147 |
<div class="epigraph">
|
| 148 |
|
|
|
|
| 158 |
|
| 159 |
</div>
|
| 160 |
|
| 161 |
+
### Explicit and Implicit Models
|
| 162 |
|
| 163 |
<figure id="fig:generating-motion-atlas">
|
| 164 |
<img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch2/ch2-approaches.png" style="width:50.0%" />
|
|
|
|
| 169 |
|
| 170 |
Methods to produce robotics motion range from traditional *explicit* models--<span style="color: hf2">dynamics-based</span>[^1] methods, leveraging precise descriptions of the mechanics of robots’ rigid bodies and their interactions with potential obstacles in the environment--to *implicit* models--<span style="color: hf2">learning-based</span> methods, treating artificial motion as a statistical pattern to learn given multiple sensorimotor readings @agrawalComputationalSensorimotorLearning, @bekrisStateRobotMotion2024. A variety of methods have been developed between these two extrema. For instance, @hansenTemporalDifferenceLearning2022 show how learning-based systems can benefit from information on the physics of problems, complementing a traditional learning method such as Temporal Difference (TD)-learning @suttonReinforcementLearningIntroduction2018 with Model-Predictive Control (MPC). Conversely, as explicit models may be relying on assumptions proving overly simplistic--or even unrealistic--in practice, learning can prove effective to improve modeling of complex phenomena or complement perception @mccormacSemanticFusionDense3D2016. Such examples aim at demonstrating the richness of approaches to robotics, and Figure <a href="#fig:generating-motion-atlas" data-reference-type="ref" data-reference="fig:generating-motion-atlas">2</a> graphically illustrates some of the most relevant techniques. Such a list is clearly far from being exhaustive, and we refer to @bekrisStateRobotMotion2024 for a more comprehensive overview of both general and application-specific methods for motion generation. In this section, we wish to introduce the inherent benefits of <span style="color: hf2">learning-based approaches to robotics</span>--the core focus of this tutorial.
|
| 171 |
|
| 172 |
+
### Different Types of Motion
|
| 173 |
|
| 174 |
<figure id="fig:robotics-platforms-atlas">
|
| 175 |
<img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch2/ch2-platforms.png" style="width:70.0%" />
|
|
|
|
| 182 |
|
| 183 |
The traditional body of work developed since the very inception of robotics is increasingly complemented by learning-based approaches. ML has indeed proven particularly transformative across the entire robotics stack, first empowering planning-based techniques with improved state estimation used for traditional planning @tangPerceptionNavigationAutonomous2023 and then end-to-end replacing controllers, effectively yielding perception-to-action methods @koberReinforcementLearningRobotics. Work in producing robots capable of navigating a diverse set of terrains demonstrated the promise of both dynamics and learning-based approaches for locomotion @griffinWalkingStabilizationUsing2017, @jiDribbleBotDynamicLegged2023, @leeLearningQuadrupedalLocomotion2020, @margolisRapidLocomotionReinforcement2022, and recent works on whole-body control indicated the promise of learning-based approaches to generate rich motion on complex robots, including humanoids @zhangWoCoCoLearningWholeBody2024, @bjorckGR00TN1Open2025. Manipulation has also been widely studied, particularly considering its relevance for many impactful use-cases ranging from high-risk applications for humans @fujitaDevelopmentRobotsNuclear2020, @alizadehComprehensiveSurveySpace2024 to manufacturing @sannemanStateIndustrialRobotics2020. While explicit models have proven fundamental in achieving important milestones towards the development of modern robotics, recent works leveraging implicit models proved particularly promising in surpassing scalability and applicability challenges via learning @koberReinforcementLearningRobotics.
|
| 184 |
|
| 185 |
+
### Example: Planar Manipulation
|
| 186 |
|
| 187 |
Robot manipulators typically consist of a series of links and joints, articulated in a chain finally connected to an *end-effector*. Actuated joints are considered responsible for generating motion of the links, while the end effector is instead used to perform specific actions at the target location (e.g., grasping/releasing objects via closing/opening a gripper end-effector, using a specialized tool like a screwdriver, etc.).
|
| 188 |
|
|
|
|
| 258 |
|
| 259 |
Following trajectories with diff-IK is a valid option in well-controlled and static environments (e.g., industrial manipulators in controlled manufacturing settings), and relies on the ability to define a set of target velocities to track $`[\dot {p}^*_0, \dot {p}^*_1, \dots, \dot {p}^*_k ]`$--an error-prone task largely requiring human expertise. Furthermore, diff-IK relies on the ability to (1) access $`J(q) \, \forall q \in \mathcal Q`$ and (2) compute its pseudo-inverse at every iteration of a given control cycle--a challenging assumption in highly dynamical settings, or for complex kinematic chains.
|
| 260 |
|
| 261 |
+
#### Adding Feedback Loops
|
| 262 |
|
| 263 |
While very effective when a goal trajectory has been well specified, the performance of diff-IK can degrade significantly in the presence of modeling/tracking errors, or in the presence of non-modeled dynamics in the environment.
|
| 264 |
|
|
|
|
| 278 |
|
| 279 |
We point the interested reader to , , and for extended coverage of FK, IK, diff-IK and control for (diff-)IK.
|
| 280 |
|
| 281 |
+
### Limitations of Dynamics-based Robotics
|
| 282 |
|
| 283 |
Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects). While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular applicative problem.
|
| 284 |
|
|
|
|
| 297 |
|
| 298 |
Taken together, these limitations (Figure <a href="#fig:classical-limitations" data-reference-type="ref" data-reference="fig:classical-limitations">10</a>) motivate the exploration of learning-based approaches that can (1) integrate perception and control more tightly, (2) adapt across tasks and embodiments with reduced expert modeling interventions and (3) scale gracefully in performance as more robotics data becomes available.
|
| 299 |
|
| 300 |
+
## Robot (Reinforcement) Learning
|
| 301 |
|
| 302 |
<div class="epigraph">
|
| 303 |
|
|
|
|
| 338 |
|
| 339 |
Applications of RL to robotics have been long studied, to the point the relationship between these two disciplines has been compared to that between physics and mathematics @koberReinforcementLearningRobotics. Indeed, due to their interactive and sequential nature, many robotics problems can be directly mapped to RL problems. Figure <a href="#fig:robotics-with-rl-examples" data-reference-type="ref" data-reference="fig:robotics-with-rl-examples">13</a> depicts two such cases. Reaching for an object to move somewhere else in the scene is indeed a sequential problem where at each cycle the controller needs to adjust the position of the robotic arm based on its current configuration and the (possibly varying) position of the object. Figure <a href="#fig:robotics-with-rl-examples" data-reference-type="ref" data-reference="fig:robotics-with-rl-examples">13</a> also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation. While sliding to the side, the controller has to constantly keep adjusting to the robot’s proprioception to avoid failure (falling).
|
| 340 |
|
| 341 |
+
### A (Concise) Introduction to RL
|
| 342 |
|
| 343 |
The RL framework @suttonReinforcementLearningIntroduction2018, which we briefly introduce here, has often been used to model robotics problems @koberReinforcementLearningRobotics. RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) learning how to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*). Crucially for robotics, RL agents can improve via trial-and-error only, thus entirely bypassing the need to develop explicit models of the problem dynamics, and rather exploiting interaction data only. In RL, this feedback loop (Figure <a href="#fig:rl-most-famous-pic" data-reference-type="ref" data-reference="fig:rl-most-famous-pic">14</a>) between actions and outcomes is established through the agent sensing a scalar quantity (*reward*).
|
| 344 |
|
|
|
|
| 415 |
|
| 416 |
Popular approaches to continuous state and action space--such as those studied within robotics--include @schulmanTrustRegionPolicy2017, @schulmanProximalPolicyOptimization2017, @haarnojaSoftActorCriticOffPolicy2018. Across manipulation @akkayaSolvingRubiksCube2019 and locomotion @leeLearningQuadrupedalLocomotion2020 problems, RL proved extremely effective in providing a platform to (1) adopt a unified, streamlined perception-to-action pipeline, (2) natively integrate proprioception with multi-modal high-dimensional sensor streams, (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets. For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics, @tangDeepReinforcementLearning2024.
|
| 417 |
|
| 418 |
+
### Real-world RL for Robotics
|
| 419 |
|
| 420 |
Streamlined end-to-end control pipelines, data-driven feature extraction and a disregard for explicit modeling in favor of interaction data are all features of RL for robotics. However, particularly in the context of real-world robotics, RL still suffers from limitations concerning machine safety and learning efficiency.
|
| 421 |
|
|
|
|
| 447 |
|
| 448 |
Off-policy algorithms like Soft Actor-Critic (SAC) @haarnojaSoftActorCriticOffPolicy2018 tend to be more sample efficient than their on-policy counterparts @schulmanProximalPolicyOptimization2017, due to the presence of a *replay buffer* used over the course of the training. Other than allowing the re-use of transitions $`(s_t, a_t, r_t, s_{t+1})`$ over the course of training, the replay buffer can also accommodate the injection of previously-collected data in the training process @ballEfficientOnlineReinforcement2023. Using expert demonstrations to guide learning together with learned rewards, RL training can effectively be carried out in the real world @luoSERLSoftwareSuite2025. Interestingly, when complemented with in-training human interventions, real-world RL agents have been shown to learn policies with near-perfect success rates on challenging manipulation tasks in 1-2 hours @luoPreciseDexterousRobotic2024.
|
| 449 |
|
| 450 |
+
##### Sample-efficient RL
|
| 451 |
|
| 452 |
In an MDP, the optimal policy $`\pi^*`$ can be derived from its associated $`Q`$-function, $`Q_{\pi^*}`$, and in particular the optimal action(s) $`\mu(s_t)`$ can be selected maximizing the optimal $`Q`$-function over the action space,
|
| 453 |
``` math
|
|
|
|
| 508 |
```
|
| 509 |
The update rule provided in <a href="#eq:sac-policy-update" data-reference-type="ref" data-reference="eq:sac-policy-update">[eq:sac-policy-update]</a> optimizes the policy while projecting it on a set $`\Pi`$ of tractable distributions (e.g., Gaussians, @haarnojaReinforcementLearningDeep2017).
|
| 510 |
|
| 511 |
+
##### Sample-efficient, data-driven RL
|
| 512 |
|
| 513 |
Importantly, sampling $`(s_t, a_t, r_t, s_{t+1})`$ from the replay buffer $`D`$ conveniently allows to approximate the previously introduced expectations for TD-target and TD-error through Monte-Carlo (MC) estimates. The replay buffer $`D`$ also proves extremely useful in maintaining a history of previous transitions and using it for training, improving on sample efficiency. Furthermore, it also naturally provides an entry point to inject offline trajectories recorded, for instance, by a human demonstrator, into the training process.
|
| 514 |
|
| 515 |
Reinforcement Learning with Prior Data (RLPD) @ballEfficientOnlineReinforcement2023 is an Offline-to-Online RL algorithm leveraging prior data to effectively accelerate the training of a SAC agent. Unlike previous works on Offline-to-Online RL, RLPD avoids any pre-training and instead uses the available offline data $`D_\text{offline}`$ to improve online-learning from scratch. During each training step, transitions from both the offline and online replay buffers are sampled in equal proportion, and used in the underlying SAC routine.
|
| 516 |
|
| 517 |
+
##### Sample-efficient, data-driven, real-world RL
|
| 518 |
|
| 519 |
Despite the possibility to leverage offline data for learning, the effectiveness of real-world RL training is still limited by the need for a task-specific, hard-to-define reward function. Further, even assuming to have access to a well-defined reward function, typical robotics pipelines rely mostly on proprioceptive inputs augmented by camera streams of the environment. As such, even well-defined rewards would need to be derived from processed representations of unstructured observations, introducing brittleness. In their technical report, @luoSERLSoftwareSuite2025 empirically address the needs (1) to define a reward function and (2) to use it on image observations, by introducing a series of tools to allow for streamlined training of *reward classifiers* $`c`$, as well as jointly learn forward-backward controllers to speed up real-world RL. Reward classifiers are particularly useful in treating complex tasks--e.g., folding a t-shirt--for which a precise reward formulation is arbitrarily complex to obtain, or that do require significant shaping and are more easily learned directly from demonstrations of success ($`e^+`$) or failure ($`e^-`$) states, $`s \in \mathcal S`$, with a natural choice for the state-conditioned reward function $`r: \mathcal S \mapsto \mathbb R`$ being $`r(s) = \log c(e^+ \vert s)`$. Further, @luoSERLSoftwareSuite2025 demonstrate the benefits of learning *forward* (executing the task from initial state to completion) and *backward* (resetting the environment to the initial state from completion) controllers, parametrized by separate policies.
|
| 520 |
|
|
|
|
| 529 |
|
| 530 |
Human in the Loop Sample Efficient Robot Reinforcement Learning (HIL-SERL) @luoPreciseDexterousRobotic2024 augments offline-to-online RL with targeted human corrections during training, and employs prior data to (1) train a reward classifier and (2) bootstrap RL training on expert trajectories. While demonstrations provide the initial dataset seeding learning and constraining early exploration, interactive corrections allow a human supervisor to intervene on failure modes and supply targeted interventions to aid the learning process. Crucially, human interventions are stored in both the offline and online replay buffers, differently from the autonomous transitions generated at training time and stored in the online buffer only. Consequently, given an intervention timestep $`k \in (0, T)`$, length-$`K`$ human intervention data $`\{ s^{\text{human}}_k, a^{\text{human}}_k, r^{\text{human}}_k, s^{\text{human}}_{k+1}\}_{k=1}^K`$ is more likely to be sampled for off-policy learning than the data generated online during training, providing stronger supervision to the agent while still allowing for autonomous learning. Empirically, HIL-SERL attains near-perfect success rates on diverse manipulation tasks within 1-2 hours of training @luoPreciseDexterousRobotic2024, underscoring how offline datasets with online RL can markedly improve stability and data efficiency, and ultimately even allow real-world RL-training.
|
| 531 |
|
| 532 |
+
#### Code Example: Real-world RL
|
| 533 |
|
| 534 |
**TODO(fracapuano): work out rl training example**
|
| 535 |
|
| 536 |
+
#### Limitations of RL in Real-World Robotics: Simulators and Reward Design
|
| 537 |
|
| 538 |
Despite the advancements in real-world RL training, solving robotics tasks by training RL agents in the real world still suffers from the following limitations:
|
| 539 |
|
|
|
|
| 543 |
|
| 544 |
Advances in Behavioral Cloning (BC) from corpora of human demonstrations address both of these concerns. By learning in a supervised fashion to reproduce expert demonstrations, BC methods prove competitive while bypassing the need for simulated environments and hard-to-define reward functions.
|
| 545 |
|
| 546 |
+
## Robot (Imitation) Learning
|
| 547 |
|
| 548 |
<div class="epigraph">
|
| 549 |
|
|
|
|
| 593 |
|
| 594 |
While conceptually elegant, point-estimate policies $`f : \mathcal O\mapsto \mathcal A`$ learned by solving <a href="#eq:loss-minimization-SL" data-reference-type="ref" data-reference="eq:loss-minimization-SL">[eq:loss-minimization-SL]</a> have been observed to suffer from (1) compounding errors @rossReductionImitationLearning2011 and (2) poor fit to multimodal distributions @florenceImplicitBehavioralCloning2022, @keGraspingChopsticksCombating2020. Figure <a href="#fig:ch4-issues-with-bc" data-reference-type="ref" data-reference="fig:ch4-issues-with-bc">21</a> illustrates these two key issues related to learning *explicit policies* @florenceImplicitBehavioralCloning2022. Besides sequentiality in $`\mathcal D`$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $`\epsilon`$-prediction errors $`0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon`$ can quickly drive the policy into out-of-distribution states, incurring in less confident generations and thus errors compounding (Figure <a href="#fig:ch4-issues-with-bc" data-reference-type="ref" data-reference="fig:ch4-issues-with-bc">21</a>, left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving robotics problems, since multiple trajectories can be equally as good towards the accomplishment of a goal (e.g., symmetric grasps, Figure <a href="#fig:ch4-issues-with-bc" data-reference-type="ref" data-reference="fig:ch4-issues-with-bc">21</a>, right). In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands @florenceImplicitBehavioralCloning2022. To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the generative model $`p(o, a)`$ underlying the samples in $`\mathcal D`$, rather than explicitly learning a prediction function $`f(o) = a`$.
|
| 595 |
|
| 596 |
+
### A (Concise) Introduction to Generative Models
|
| 597 |
|
| 598 |
Generative Models (GMs) aim to learn the stochastic process underlying the very generation of the data collected, and typically do so by fitting a probability distribution that approximates the unknown *data distribution*, $`p`$. In the case of BC, this unknown data distribution $`p`$ represents the expert’s joint distribution over $`(o, a)`$-pairs. Thus, given a finite set of $`N`$ pairs $`\mathcal D = \{ (o,a)_i \}_{i=0}^N`$ used as an imitation learning target (and thus assumed to be i.i.d.), GM seeks to learn a *parametric* distribution $`p_\theta(o,a)`$ such that (1) new samples $`(o,a) \sim p_\theta(\bullet)`$ resemble those stored in $`\mathcal D`$, and (2) high likelihood is assigned to the observed regions of the unobservable $`p`$. Likelihood-based learning provides a principled training objective to achieve both objectives, and it is thus extensively used in GM @prince2023understanding.
|
| 599 |
|
| 600 |
+
#### Variational Auto-Encoders
|
| 601 |
|
| 602 |
<figure id="fig:ch4-task-effect-on-pairs">
|
| 603 |
<img src="/Users/thibaudfrere/Documents/work-projects/huggingface/research-article-template/app/scripts/latex-to-markdown/output/assets/image/figures/ch4/ch4-task-effect-on-pairs.png" style="width:80.0%" />
|
|
|
|
| 668 |
```
|
| 669 |
Indeed, it is very common in practice to approximate from the learned likelihood $`p_\theta(o,a \vert z)`$ as a parametric distribution (e.g. Gaussians) parametrized by some learned vector of coefficients derived from $`\mu_\theta (z), \ z \sim p (\bullet)`$. In all such cases, learning a VAE corresponds to optimally *reconstructing* the examples in $`\mathcal D`$ by minimizing the L2-error--a very common *supervised learning* objective for regression targets--while regularizing the information compression into the latent, as under the common modeling choice $`p(z) = \mathcal N (\mathbf{0}, \mathbf{I})`$ <a href="#eq:VAE-Lreg" data-reference-type="ref" data-reference="eq:VAE-Lreg">[eq:VAE-Lreg]</a> regularizes the posterior limiting the expressivity of $`q_\phi(z\vert o,a)`$.
|
| 670 |
|
| 671 |
+
#### Diffusion Models
|
| 672 |
|
| 673 |
VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to <a href="#eq:BC-latent-variable" data-reference-type="ref" data-reference="eq:BC-latent-variable">[eq:BC-latent-variable]</a>, and solve the variational inference problem of jointly learning the likelihood $`p_\theta`$ and (approximate) posterior $`q_\phi`$ for such model. In that, the unknown data distribution $`p(o,a)`$ is effectively approximated via $`\int_Z p(z) p_\theta(o,a \vert z)`$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $`p(o,a)`$. Diffusion Models (DMs) @hoDenoisingDiffusionProbabilistic2020 are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution--*variational inference*--by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $`o,a`$ itself. In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure <a href="#fig:ch4-many-latents" data-reference-type="ref" data-reference="fig:ch4-many-latents">24</a>), resulting in
|
| 674 |
$$
|
|
|
|
| 719 |
|
| 720 |
By learning the total displacement from a generally uninformative corrupted sample obtained diffusing information and a sample from an unknown distribution--significant ($`\Vert \epsilon \Vert > 0`$) whenever input and target distribution are sufficiently different-- @hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution reversing the displacement, *denoising* samples. Interestingly, under the hypothesis real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function between any point off the manifold (such as perturbed, uninformative samples) and the data manifold itself. Following this gradient--i.e., denoising a sample from an uninformative distribution--corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection. Indeed, under the assumption that $`p_\theta (z_{t-1} \vert z_t)`$ is Gaussian, then sampling $`z_{t-1} \sim p_\theta(\bullet \vert z_{t})`$ corresponds to computing $`z_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( z_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf{0}, \mathbf{I}), `$ thus showing that the lower-level latent variables in a DM can be obtained by iteratively removing noise from the one-step higher order variable, using the noise regressor $`\epsilon_\theta(z_t, t)`$ learned minimizing <a href="#eq:diffusion-simplified-loss" data-reference-type="ref" data-reference="eq:diffusion-simplified-loss">[eq:diffusion-simplified-loss]</a>.
|
| 721 |
|
| 722 |
+
#### Flow Matching
|
| 723 |
|
| 724 |
The posterior parametrization adopted by DMs proved traditionally effective, yet it raised concerns regarding its efficiency at inference time, where a possibly large number of compute-expensive denoising steps are needed in order to recover a sample from the target distribution. Flow Matching (FM) @lipmanFlowMatchingGenerative2023 extends DMs to the general case of arbitrary, parametrized likelihood and posteriors, and in this defines a superseding class of GMs providing a unified framework for learning *continuous transformations* between distributions, encompassing and generalizing DMs. Instead of a *stochastic, discrete, multi-step* denoising process, FM aims to learn a *deterministic, continuous, differentiable flow* $`\psi: [0,1] \times Z \mapsto Z`$, formalized starting from a possibly time-dependent vector field $`v: [0,1] \times Z \mapsto Z`$ transporting samples from a simple prior distribution $`p_0`$--e.g., a standard Gaussian--to a more complex, potentially unknown data distribution $`p_1`$ over time. Note how FM models time $`t \in [0,1]`$ to be varying continuously while moving away *from* an easy-to-sample distribution $`p_0`$ *towards* the unknown data-distribution, $`p_1`$. This results in a continuous and deterministic trajectory for each sample, which can be more efficient to generate compared to the stochastic paths of DMs. Formally, FM can be fully characterized by an ordinary differential equation (ODE) relating instantaneous variations of flows with the underlying vector field, and hence providing complete trajectories over the distributions’ support when integrating over time,
|
| 725 |
$$
|
|
|
|
| 753 |
\mathcal L(\theta) = \mathbb{E}_{t, z_0, z_1} \big[
|
| 754 |
\Vert v_\theta((1-t)z_0 + t z_1, t) - (z_1 - z_0) \Vert^2 \big], \quad t \sim \mathcal{U}([0,1]),`$ where $`z_0 \sim p_0(\bullet)`$ and $`z_1 \sim p_1(\bullet)`$. Note how in <a href="#eq:flow-matching-objective" data-reference-type="ref" data-reference="eq:flow-matching-objective">[eq:flow-matching-objective]</a>--differently from <a href="#eq:diffusion-simplified-loss" data-reference-type="ref" data-reference="eq:diffusion-simplified-loss">[eq:diffusion-simplified-loss]</a>--time is assumed to be varying continuously $`t \sim \mathcal U([0,1])`$ rather than discretely $`t \sim \mathcal U(\{0,1\})`$, a key property of flow-based models. The objective in <a href="#eq:flow-matching-objective" data-reference-type="ref" data-reference="eq:flow-matching-objective">[eq:flow-matching-objective]</a> directly regresses the learned vector field onto the simple, straight path connecting a point from the prior and a point from the data, providing a simulation-free training procedure that is both stable and efficient. At inference time, samples are generated by starting with $`z_0 \sim p_0`$ and iteratively refined according to $`\frac{dz}{dt} = v_\theta(z_t, t)`$ for $`t \in [0,1]`$--an operation that can be numerically carried out with standard ODE solvers.
|
| 755 |
|
| 756 |
+
### Action Chunking with Transformers
|
| 757 |
|
| 758 |
While GMs prove useful in learning complex, high-dimensional multi-modal distributions, they do not natively address the compounding errors problem characteristic of online, sequential predictions. In Action Chunking with Transformers (ACT), @zhaoLearningFineGrainedBimanual2023 present an application of VAEs to the problem of learning purely from offline trajectories, and introduce a simple, yet effective method to mitigate error compounding, learning high-fidelity autonomous behaviors. Drawing inspiration from how humans plan to enact atomically sequences of the kind $`a_{t:t+k}`$ instead of single actions $`a_t`$, @zhaoLearningFineGrainedBimanual2023 propose learning a GM on a dataset of input demonstrations by modeling *action chunks*. Besides contributions to learning high-performance autonomous behaviors, @zhaoLearningFineGrainedBimanual2023 also introduce hardware contributions in the form of a low-cost bimanual robot setup (ALOHA) capable of performing fine-grained manipulation tasks, such as opening a lid, slotting a battery in its allotment or even preparing tape for application.
|
| 759 |
|
|
|
|
| 786 |
<figcaption>The CVAE decoder used in ACT, comprising a full encoder-decoder Transformer architecture. Camera observations from all <span class="math inline"><em>n</em></span> camera views are first embedded using pre-trained visual encoders, and then concatenated to the corresponding positional embeddings. Then, alongside embeddings for the proprioceptive information available and the style variable <span class="math inline"><em>z</em></span> retrieved from the CVAE encoder, the Transformer encoder shares the matrices <span class="math inline"><em>K</em>, <em>Q</em></span> with the Transformer decoder, trained to decode fixed position embeddings into valid action chunks.</figcaption>
|
| 787 |
</figure>
|
| 788 |
|
| 789 |
+
#### Code Example: Learning ACT
|
| 790 |
|
| 791 |
+
### Diffusion Policy
|
| 792 |
|
| 793 |
DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images @hoDenoisingDiffusionProbabilistic2020 or videos @polyakMovieGenCast2025, thanks to their inherent capability to deal with multimodal data and training stability. In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks. Similarly to Action Chunking with Transformer @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $`p(o,a)`$ and (2) predict multiple actions into the future instead of a single action. Besides the intractability of the observations’ marginal $`p_\theta(o)`$ given $`p_\theta(o,a)`$, DP’s rationale for modeling the data distribution via $`p_\theta(a \vert o)`$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
|
| 794 |
|
|
|
|
| 811 |
|
| 812 |
Training using 50-150 demos (15-60 minutes of teleoperation data) DP achieves strong performance on a variety of simulated and real-world tasks, including dexterous and deformable manipulation tasks such as sauce pouring and mat unrolling. Notably, the authors ablated the relevance of using RGB camera streams as input to their policy, and observed how high frame-rate visual observations can be used to attain performance (measured as success rate) comparable to that of state-based policies, typically trained in simulation with privileged information not directly available in real-world deployments. As high-frame rate RGB inputs naturally accommodate for dynamic, fast changing environments, @chiDiffusionPolicyVisuomotor2024’s conclusion offers significant evidence for learning streamlined control policies directly from pixels. In their work, @chiDiffusionPolicyVisuomotor2024 also ablate the performance of DP against their baseline as a function of the size of the dataset collected, showing that DP outperforms the considered baseline for every benchmark size considered. Further, to accelerate inference, @chiDiffusionPolicyVisuomotor2024 employ Denoising Diffusion Implicit Models @songDenoisingDiffusionImplicit2022, a variant of Denoising Diffusion Probabilistic Models @hoDenoisingDiffusionProbabilistic2020 (DDPM) adopting a strictly deterministic denoising paradigm (differently from DDPM’s natively stochastic one) inducing the same final distribution as DDPM’s, and yet resulting in 10 times less denoising steps at inference time @chiDiffusionPolicyVisuomotor2024. 
Across a range of simulated and real-world tasks, @chiDiffusionPolicyVisuomotor2024 find DPs particularly performant when implementing a transformer-based network as $`\epsilon_\theta`$, although the authors note the increased sensitivity of transformer networks to hyperparameters and thus explicitly recommend starting out with a simpler, convolution-based architecture for diffusion (Figure <a href="#fig:diffusion-policy-architecture" data-reference-type="ref" data-reference="fig:diffusion-policy-architecture">32</a>), which are however reported to be biased towards learning low-frequency components @tancikFourierFeaturesLet2020 and thus may prove more challenging to train with non-smooth action sequences.
|
| 813 |
|
| 814 |
+
#### Code Example: Learning Diffusion Policies
|
| 815 |
|
| 816 |
+
### Optimized Inference
|
| 817 |
|
| 818 |
Modern visuomotor policies output *action chunks*-sequences $`\pi(o_t) = \mathbf{A}_t`$ with $`\mathbf{A}_t = \bigl(a_t,a_{t+1},\dots,a_{t+H_a}\bigr)`$ being a sequence of $`H_a \gg 1`$ low-level commands enqueued in an action queue, originating from an environment observation, $`o_t`$. Predicting series of actions instead of single commands proved essential in learning complex, multi-modal behavior @zhaoLearningFineGrainedBimanual2023, @chiDiffusionPolicyVisuomotor2024.
|
| 819 |
|
|
|
|
| 844 |
|
| 845 |
</div>
|
| 846 |
|
| 847 |
+
##### Implementation details
|
| 848 |
|
| 849 |
*Async* inference (1) tightens the control loop by capturing observations more often, directly eliminates idle gaps at runtime, and (2) directly allows to run inference on more powerful computational resources than the ones typically available onboard autonomous robotic platforms.
|
| 850 |
|
|
|
|
| 869 |
|
| 870 |
<a href="#fig:ch4-queues" data-reference-type="ref" data-reference="fig:ch4-queues">34</a> emphasizes the trade-off governed by $`g`$: small values place result in idle periods, whereas $`g\approx 1`$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $`g\in(0,1)`$ allows to strike a balance between reactivity against resource budgets. If not for the aforementioned similarity filter, the would send observations for processing every $`(1 - g) H_a \cdot \Delta t`$ seconds, receiving a new chunk of actions every $`(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]`$, on average. The presence of the observation similarity filter dilates this processing time, and serves the scope of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk. In particular, <a href="#fig:ch4-queues" data-reference-type="ref" data-reference="fig:ch4-queues">34</a> results in a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in <a href="#fig:ch4-queues" data-reference-type="ref" data-reference="fig:ch4-queues">34</a> highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue results empty.
|
| 871 |
|
| 872 |
+
#### Code Example: Using Async Inference
|
| 873 |
|
| 874 |
+
## Generalist Robot Policies
|
| 875 |
|
| 876 |
<div class="epigraph">
|
| 877 |
|
|
|
|
| 894 |
<figcaption>Fields within ML such as Computer Vision and NLP converged on the development of foundation models, trained on a variety of large scale models and capable to perform multiple downstream tasks (top). Conversely, robotics suffered from limited standardization in terms of the architectures used, and siloed, task specific datasets, incurring in a high degree of fragmentation which traditionally hindered the development of generalist models for robotics in favour of task-specific models (bottom).</figcaption>
|
| 895 |
</figure>
|
| 896 |
|
| 897 |
+
### Preliminaries: Models and Data
|
| 898 |
|
| 899 |
The remarkable success of foundation models in NLP and CV is predicated on two core principles: architectural innovation and joint data-compute scaling. The transformer architecture proved instrumental in capturing long-range dependencies in sequential data such as text, and its stability and expressivity made it the *de facto* standard for modern large-scale models trained on internet-scale amounts of data. In stark contrast with popular NLP @raffelExploringLimitsTransfer2023 and CV @ImageNet_VSS09 general-purpose datasets, the field of robotics has historically developed around task-specific datasets which hinders scalability across problems, resulting in a concrete data deficit for general-purpose robot learning. Unlike the wealth of relatively readily available text and images on the internet, robotics data is intrinsically embodied--datasets collected for a manipulation robot typically differ entirely from locomotion datasets. Further, datasets consisting of expert demonstrations are (1) intrinsically expensive to collect (2) and notoriously heterogeneous--different human experts may perform the same task optimally yet in very different ways. In particular, since each expert trajectory is tied to a specific robot platform and the operating conditions of its environment and task, data heterogeneity has long posed a *methodological* challenge for scaling robotics datasets via aggregation. Beyond this, heterogeneity also raises *conceptual* issues: naively mixing data across embodiments can induce negative transfer, as control strategies developed in isolation for different robot systems in different environments may even conflict when combined. 
Thus, the high degree of fragmentation of robotics datasets and tasks has traditionally led to the development of *specialist* policies, trained on small, task-specific datasets, and which excel at their designated task but fail to generalize to new situations (Figure <a href="#fig:ch5-ml-vs-robotics-foundation" data-reference-type="ref" data-reference="fig:ch5-ml-vs-robotics-foundation">35</a>).
|
| 900 |
|
|
|
|
| 918 |
|
| 919 |
Figure <a href="#fig:ch5-trends" data-reference-type="ref" data-reference="fig:ch5-trends">37</a> illustrates graphically the two most relevant trends in modern robot learning. As datasets collected via centralized, cross-institutions cooperation of increasing size are made available for the research community, decentralized datasets collected by individual researchers and practitioners have also gained traction recently, closing the gap with academic benchmarks thanks to community-contributed datasets. Further, models used across tasks and embodiments are also becoming much more compute-efficient, and as a result the models’ size has been consistently reducing over time, with consequent gains for autonomous robots in real-world, resource-constrained environments.
|
| 920 |
|
| 921 |
+
### Modern VLAs
|
| 922 |
|
| 923 |
Modern recipes to train large scale VLAs extend early efforts to learn foundation models from large amounts of data via BC, introducing significant advancements concerning both architectural and procedural aspects. From an architectural perspective, modern VLAs such as $`\pi_0`$ @blackp0VisionLanguageActionFlow2024 leverage a *unified transformer model* for efficiency of computation, while maintaining specialized sub-components within the model for visual perception and action prediction, enabling cross-task performance via language conditioning. Crucially, modern VLAs including @blackp0VisionLanguageActionFlow2024\[$`\pi_0`$\] and @shukorSmolVLAVisionLanguageActionModel2025\[SmolVLA\] adopt *unified* transformer models employing disjoint sets of weights (*experts*) for compute-efficient visual-semantic understanding and robotic control. Procedurally, modern VLAs complement advanced Vision-Language Model (VLM) backbones with action-specific modules (1) adopting mid-sized *action experts* to model continuous actions distributions $`p (a_{t:t+H_a} \vert o_t)`$--avoiding discrete action tokens entirely--and (2) relying on *action chunking* as a strategy to reduce error compounding when predicting multiple actions learning from inherently non-i.i.d. data, such as demonstration data.
|
| 924 |
|
| 925 |
These architectural and procedural innovations present three benefits. First, developing architectures that exploit internet-scale pre-trained backbones allows to fully capitalize on the vast world knowledge and skills state-of-the-art VLMs exhibit, preventing models from needing to learn visual, linguistic and semantic concepts from scratch. Second, using generative models for continuous action distributions allows to learn rich, multimodal data distributions, a much more likely scenario in the big-data regime typically tackled while developing generalist policies. Further, introducing two separate components for perception and action planning could enable using Mixture of Experts (MoE) architectures @fedusReviewSparseExpert2022, more efficient to run and thus resulting in faster inference--a key feature for models deployed in real-world scenarios. This new paradigm has been at the core of some of the most capable generalist policies developed to date, capable of few-shot adapting to novel tasks and of performing highly dexterous manipulation tasks, ranging from end-to-end folding laundry, to bussing tables.
|
| 926 |
|
| 927 |
+
#### VLMs for VLAs
|
| 928 |
|
| 929 |
VLMs are designed to process both visual and textual modalities--most commonly by taking both images and text as input and generating text conditioned on the visual context. Recent advances in VLMs have been driven by the success of LLMs, with many approaches building upon pretrained LLMs and adopting similar training paradigms to the ones used in language modeling. Typically, VLMs @alayracFlamingoVisualLanguage2022, @laurenconWhatMattersWhen2024, @linVILAPretrainingVisual2024 are constructed by integrating a pretrained vision encoder @radfordLearningTransferableVisual2021, @zhaiSigmoidLossLanguage2023, @finiMultimodalAutoregressivePretraining2024 with a pretrained LLM @grattafioriLlama3Herd2024, @jiangMistral7B2023. Training then proceeds in multiple multimodal stages, beginning with a large-scale pretraining on datasets containing image-text pairs @LAION-COCO, @kakaobrain2022coyo700m and interleaved vision-language corpora @OBELICS, @MMC4, all followed by a supervised fine-tuning stage on instruction-tuning datasets @LLaVA-1.5, @tong2024cambrian, @laurenconWhatMattersWhen2024. The inherent multimodal nature of VLMs enables them to jointly reason over vision and language. Pre-training on vast internet-scale datasets allows these models to associate visual patterns with textual descriptions, thereby acquiring a rich semantic understanding of the world--knowledge about objects, their properties, and relationships--without explicit supervision for each concept. In turn, integrating a VLM as a perception backbone for a VLA allows the complete model to inherit rich world knowledge, sidestepping the need to learn visual and semantic representations from scratch. In principle, this allows the robot to ground high-level natural language instructions in its visual context, and possibly recognize unseen objects by connecting them to pre-trained concepts absorbed during pre-training, improving on the possibility to generalize to novel scenarios.
|
| 930 |
|
| 931 |
Recently, compute efficiency has also become a central focus in VLM research. Several works aim to reduce training costs by using smaller, more diverse datasets @LLaVA-1.5, @InstructBLIP, @bai2025qwen25vl, @zhu2024minigpt, @tong2024cambrian, training smaller-scale models @marafiotiSmolVLMRedefiningSmall2025, @moondream, @minicmpv2024, or by adapting pretrained unimodal models by tuning only a small subset of parameters @shukor2023epalm, @vallaeys2024improveddepalm, @MAPL, @FROMAGe, @tsimpoukelli2021multimodalfrozen, @BLIP-2. While the majority of VLM research focuses on image and text modalities, recent work has demonstrated that similar techniques can be extended to integrate additional modalities, such as video and audio @wang2025internvideo2, @liu2024kangaroo, @zhang2025videollama, @kong2024audioflam--a particularly promising direction of research for robotics applications, where multiple sensor modalities can be integrated effectively. This trend towards efficiency is paramount for robotics applications, where policies must operate under the stringent constraints of real-world deployment. Indeed, robots often possess limited on-board computational resources and must react in real-time to dynamic environments. Smaller and faster VLMs have thus become quintessential for developing responsive autonomous systems, enabling high-frequency control loops by reducing the latency between perception and action.
|
| 932 |
|
| 933 |
+
### $`\pi_0`$
|
| 934 |
|
| 935 |
$`\pi_0`$ @blackp0VisionLanguageActionFlow2024 introduces a VLA based on a MoE architecture consisting of (1) a pre-trained VLM backbone (Gemma 2.6B @teamGemma2Improving2024) and (2) a dedicated action expert used to generate continuous actions via flow matching. Images and language are embedded with a late-fusion VLM (PaliGemma), while proprioceptive state and actions chunks are routed to a smaller action expert, initialized from scratch. The two separate experts communicate via self-attention layers, but maintain disjoint weights to obtain query, key and values matrices at each layer, maintaining specialization while efficiently allocating computation.
|
| 936 |
|
|
|
|
| 982 |
|
| 983 |
Lastly, @blackp0VisionLanguageActionFlow2024 present cross-embodiment experiments where they demonstrate $`\pi_0`$’s ability to control both mobile and static manipulator robots with varying arm embodiments. The emergence of cross-embodiment capabilities is largely to be attributed to the presence of large scale cross-embodiment data in the data mixture, handled by $`\pi_0`$ defaulting to the maximal configuration size across the $`\pi`$ dataset, and zero-padding robots with fewer DoFs. In that, $`\pi_0`$ constantly processes 18-DoF robots (two 6-DoF arms, two grippers, base, vertical torso), regardless of the kind of robot, and robots with fewer DoFs are zero-padded. $`\pi_0`$ also relies on three camera views, and uses masked image slots for training and deployment scenarios with fewer cameras.
|
| 984 |
|
| 985 |
+
#### Code Example: Using $`\pi_0`$
|
| 986 |
|
| 987 |
+
### SmolVLA
|
| 988 |
|
| 989 |
VLAs remain in an early stage of development and are not yet as mature or widely adopted as LLMs and VLMs. Further, much of the impactful VLA progress remains proprietary, with many models sharing only weights while withholding full training details and essential methodological components. SmolVLA @shukorSmolVLAVisionLanguageActionModel2025 is an entirely open-source research effort, aiming to democratize the developments of robotics foundation models by open sourcing model, training recipes and data used.
|
| 990 |
|
|
|
|
| 1003 |
|
| 1004 |
Departing from reliance on proprietary datasets, SmolVLA pretrains exclusively on 450+ *community datasets*, totaling 20K+ trajectories. Because instructions in community-contributed datasets can be noisy or missing, the authors re-annotate tasks with a small off-the-shelf VLM using frames sampled from the dataset, and standardize camera viewpoints by mapping sources to a consistent top/wrist/side ordering. At inference, similarly to $`\pi_0`$, SmolVLA integrates flow over 10 steps, resulting in fast inference. SmolVLA proves effective across a range of both real-world and simulated environments, rivaling $`\pi_0`$ while being close to 40% faster and consuming 6x less memory.
|
| 1005 |
|
| 1006 |
+
#### Code Example: Using SmolVLA
|
| 1007 |
|
| 1008 |
+
## Conclusions
|
| 1009 |
|
| 1010 |
This tutorial has chronicled the paradigmatic shift transforming robotics, from the structured, model-based methods of its classical era to the dynamic, data-driven approaches that define modern robot learning. We began by examining the limitations of traditional dynamics-based control, highlighting the brittleness and the significant engineering overhead required by traditional approaches, which in turn motivates more flexible, less model-intensive learning approaches.
|
| 1011 |
|
app/scripts/latex-to-markdown/output/main.mdx
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app/src/content/article.mdx
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app/src/styles/_base.css
CHANGED
|
@@ -1,9 +1,17 @@
|
|
| 1 |
@import "https://fonts.googleapis.com/css2?family=Source+Sans+Pro:ital,wght@0,200..900;1,200..900&display=swap";
|
| 2 |
|
| 3 |
-
html {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
-
.content-grid main {
|
| 6 |
-
|
|
|
|
| 7 |
|
| 8 |
.content-grid main h2 {
|
| 9 |
font-weight: 600;
|
|
@@ -14,6 +22,12 @@ html { font-size: 16px; line-height: 1.6; }
|
|
| 14 |
border-bottom: 1px solid var(--border-color);
|
| 15 |
}
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
.content-grid main h3 {
|
| 18 |
font-weight: 700;
|
| 19 |
font-size: clamp(18px, 2.1vw, 22px);
|
|
@@ -29,25 +43,51 @@ html { font-size: 16px; line-height: 1.6; }
|
|
| 29 |
margin: var(--spacing-8) 0 var(--spacing-4);
|
| 30 |
}
|
| 31 |
|
| 32 |
-
.content-grid main a {
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
/* Do not underline heading links inside the article (not the TOC) */
|
| 36 |
.content-grid main h2 a,
|
| 37 |
.content-grid main h3 a,
|
| 38 |
.content-grid main h4 a,
|
| 39 |
.content-grid main h5 a,
|
| 40 |
-
.content-grid main h6 a {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
.content-grid main h2 a:hover,
|
| 42 |
.content-grid main h3 a:hover,
|
| 43 |
.content-grid main h4 a:hover,
|
| 44 |
.content-grid main h5 a:hover,
|
| 45 |
-
.content-grid main h6 a:hover {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
.content-grid main ul,
|
| 48 |
-
.content-grid main ol {
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
.content-grid main blockquote {
|
| 53 |
border-left: 2px solid var(--border-color);
|
|
@@ -57,7 +97,11 @@ html { font-size: 16px; line-height: 1.6; }
|
|
| 57 |
margin: var(--spacing-4) 0;
|
| 58 |
}
|
| 59 |
|
| 60 |
-
.content-grid main hr {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
.muted {
|
| 63 |
color: var(--muted-color);
|
|
|
|
| 1 |
@import "https://fonts.googleapis.com/css2?family=Source+Sans+Pro:ital,wght@0,200..900;1,200..900&display=swap";
|
| 2 |
|
| 3 |
+
html {
|
| 4 |
+
font-size: 16px;
|
| 5 |
+
line-height: 1.6;
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
+
.content-grid main {
|
| 9 |
+
color: var(--text-color);
|
| 10 |
+
}
|
| 11 |
|
| 12 |
+
.content-grid main p {
|
| 13 |
+
margin: 0 0 var(--spacing-3);
|
| 14 |
+
}
|
| 15 |
|
| 16 |
.content-grid main h2 {
|
| 17 |
font-weight: 600;
|
|
|
|
| 22 |
border-bottom: 1px solid var(--border-color);
|
| 23 |
}
|
| 24 |
|
| 25 |
+
.content-grid main h2:first-child {
|
| 26 |
+
margin-top: 0;
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
.content-grid main h3 {
|
| 32 |
font-weight: 700;
|
| 33 |
font-size: clamp(18px, 2.1vw, 22px);
|
|
|
|
| 43 |
margin: var(--spacing-8) 0 var(--spacing-4);
|
| 44 |
}
|
| 45 |
|
| 46 |
+
.content-grid main a {
|
| 47 |
+
color: var(--primary-color);
|
| 48 |
+
text-decoration: none;
|
| 49 |
+
border-bottom: 1px solid var(--link-underline);
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
.content-grid main a:hover {
|
| 53 |
+
color: var(--primary-color-hover);
|
| 54 |
+
border-bottom: 1px solid var(--link-underline-hover);
|
| 55 |
+
}
|
| 56 |
|
| 57 |
/* Do not underline heading links inside the article (not the TOC) */
|
| 58 |
.content-grid main h2 a,
|
| 59 |
.content-grid main h3 a,
|
| 60 |
.content-grid main h4 a,
|
| 61 |
.content-grid main h5 a,
|
| 62 |
+
.content-grid main h6 a {
|
| 63 |
+
color: inherit;
|
| 64 |
+
border-bottom: none;
|
| 65 |
+
text-decoration: none;
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
.content-grid main h2 a:hover,
|
| 69 |
.content-grid main h3 a:hover,
|
| 70 |
.content-grid main h4 a:hover,
|
| 71 |
.content-grid main h5 a:hover,
|
| 72 |
+
.content-grid main h6 a:hover {
|
| 73 |
+
color: inherit;
|
| 74 |
+
border-bottom: none;
|
| 75 |
+
text-decoration: none;
|
| 76 |
+
}
|
| 77 |
|
| 78 |
.content-grid main ul,
|
| 79 |
+
.content-grid main ol {
|
| 80 |
+
padding-left: 24px;
|
| 81 |
+
margin: 0 0 var(--spacing-3);
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
.content-grid main li {
|
| 85 |
+
margin-bottom: var(--spacing-2);
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
.content-grid main li:last-child {
|
| 89 |
+
margin-bottom: 0;
|
| 90 |
+
}
|
| 91 |
|
| 92 |
.content-grid main blockquote {
|
| 93 |
border-left: 2px solid var(--border-color);
|
|
|
|
| 97 |
margin: var(--spacing-4) 0;
|
| 98 |
}
|
| 99 |
|
| 100 |
+
.content-grid main hr {
|
| 101 |
+
border: none;
|
| 102 |
+
border-bottom: 1px solid var(--border-color);
|
| 103 |
+
margin: var(--spacing-5) 0;
|
| 104 |
+
}
|
| 105 |
|
| 106 |
.muted {
|
| 107 |
color: var(--muted-color);
|