diff --git a/app/.astro/astro/content.d.ts b/app/.astro/astro/content.d.ts
index 1e4488d8d711ac4d5200d3550c9be64f0ba79f34..0faa3fd603c845052b1120c8e0aaa1fbeec7f171 100644
--- a/app/.astro/astro/content.d.ts
+++ b/app/.astro/astro/content.d.ts
@@ -152,6 +152,13 @@ declare module 'astro:content' {
type ContentEntryMap = {
"chapters": {
+"00_abstract.mdx": {
+ id: "00_abstract.mdx";
+ slug: "00_abstract";
+ body: string;
+ collection: "chapters";
+ data: any
+} & { render(): Render[".mdx"] };
"01_introduction.mdx": {
id: "01_introduction.mdx";
slug: "01_introduction";
@@ -180,13 +187,6 @@ declare module 'astro:content' {
collection: "chapters";
data: any
} & { render(): Render[".mdx"] };
-"05_foundation_models.mdx": {
- id: "05_foundation_models.mdx";
- slug: "05_foundation_models";
- body: string;
- collection: "chapters";
- data: any
-} & { render(): Render[".mdx"] };
"06_next_directions.mdx": {
id: "06_next_directions.mdx";
slug: "06_next_directions";
diff --git a/app/src/content/article.mdx b/app/src/content/article.mdx
index d63eb0e920422c468c5dcec21abaa3185303f931..c5eea5b5bc380b2b561734b5072e38c8806b3dab 100644
--- a/app/src/content/article.mdx
+++ b/app/src/content/article.mdx
@@ -1,49 +1,55 @@
---
-title: "Bringing paper to life:\n A modern template for\n scientific writing"
-subtitle: "Publish‑ready workflow that lets you focus on ideas, not infrastructure"
-description: "Publish‑ready workflow that lets you focus on ideas, not infrastructure"
+title: "Robot Learning: A Tutorial"
+subtitle: "From Classical Robotics to Foundation Models"
+description: "A comprehensive guide to modern robot learning techniques"
+date: "2025-09-17"
authors:
- - name: "Thibaud Frere"
- url: "https://huggingface.co/tfrere"
- affiliations: [1]
+ - name: "Francesco Capuano"
+ affiliations: [1, 2]
+ - name: "Adil Zouitine"
+ affiliations: [2]
+ - name: "Pepijn Kooijmans"
+ affiliations: [2]
+ - name: "Thomas Wolf"
+ affiliations: [2]
+ - name: "Michel Aractingi"
+ affiliations: [2]
affiliations:
+ - name: "École Normale Supérieure Paris-Saclay"
+ url: "https://ens-paris-saclay.fr"
- name: "Hugging Face"
url: "https://huggingface.co"
-published: "Sep. 01, 2025"
-doi: 10.1234/abcd.efgh
-licence: >
- Diagrams and text are licensed under CC‑BY 4.0 with the source available on Hugging Face, unless noted otherwise.
- Figures reused from other sources are excluded and marked in their captions (“Figure from …”).
tags:
- - research
- - template
-tableOfContentsAutoCollapse: true
+ - robotics
+ - machine-learning
+ - tutorial
+bibliography: bibliography.bib
+converted_from: "LaTeX"
---
-import Introduction from "./chapters/introduction.mdx";
-import BestPractices from "./chapters/best-pratices.mdx";
-import WritingYourContent from "./chapters/writing-your-content.mdx";
-import AvailableBlocks from "./chapters/markdown.mdx";
-import GettingStarted from "./chapters/getting-started.mdx";
-import Markdown from "./chapters/markdown.mdx";
-import Components from "./chapters/components.mdx";
-import Greetings from "./chapters/greetings.mdx";
-import VibeCodingCharts from "./chapters/vibe-coding-charts.mdx";
+import Abstract from "./chapters/00_abstract.mdx";
+import Chapter01Introduction from "./chapters/01_introduction.mdx";
+import Chapter02ClassicRobotics from "./chapters/02_classic_robotics.mdx";
+import Chapter03ReinforcementLearning from "./chapters/03_reinforcement_learning.mdx";
+import Chapter04ImitationLearning from "./chapters/04_imitation_learning.mdx";
+import Chapter06NextDirections from "./chapters/06_next_directions.mdx";
+import Chapter07Conclusions from "./chapters/07_conclusions.mdx";
+import AForeword from "./chapters/A_foreword.mdx";
-
-
+
-
+
-
+
-
+
-
+
-
+
-
+
+
diff --git a/app/src/content/assets/audio/audio-example.wav b/app/src/content/assets/audio/audio-example.wav
deleted file mode 100644
index d0a5d0390c67bae54736cfcafb9067898e419c99..0000000000000000000000000000000000000000
--- a/app/src/content/assets/audio/audio-example.wav
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:552f71aef82738f9b5c9f1d6be495e0f83cec0eabf485066628badb3283cb4b8
-size 48830444
diff --git a/app/src/content/assets/data/against_baselines copy.csv b/app/src/content/assets/data/against_baselines copy.csv
deleted file mode 100644
index d2bbd2200fa92a4b0f34c47b019e92462670cd0a..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/against_baselines copy.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a5e6173a1541b9798278da1729f1e357c0711d2e270f68aa4af8eae962f146dd
-size 53573
diff --git a/app/src/content/assets/data/against_baselines.csv b/app/src/content/assets/data/against_baselines.csv
deleted file mode 100644
index d2bbd2200fa92a4b0f34c47b019e92462670cd0a..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/against_baselines.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a5e6173a1541b9798278da1729f1e357c0711d2e270f68aa4af8eae962f146dd
-size 53573
diff --git a/app/src/content/assets/data/against_baselines_deduplicated.csv b/app/src/content/assets/data/against_baselines_deduplicated.csv
deleted file mode 100644
index e180b302141004f1b9a0fe4a565d45dd3ca3f102..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/against_baselines_deduplicated.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:56d18f581eff719023eb87c695e0e11770738d7872c8b9dac9bc23d9b0ef560b
-size 32738
diff --git a/app/src/content/assets/data/all_ratings_luis.csv b/app/src/content/assets/data/all_ratings_luis.csv
deleted file mode 100644
index 249dacd118d20c655c64bf3d7c3dbd203eeb9477..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/all_ratings_luis.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:104433529e7d9c8a3bd297be1138e9e87677a666953d1362c517ec389c6c9172
-size 64966
diff --git a/app/src/content/assets/data/banner_visualisation_data.csv b/app/src/content/assets/data/banner_visualisation_data.csv
deleted file mode 100644
index 3a7e33d02407b7c9cd2cd29f25b18d5f853641ff..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/banner_visualisation_data.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b155d8c319b1788befe716017fecca580768157feee6221f3af44b7bb9f9c7e5
-size 81995
diff --git a/app/src/content/assets/data/banner_visualisation_data_enriched.csv b/app/src/content/assets/data/banner_visualisation_data_enriched.csv
deleted file mode 100644
index d429317e79e2d4c29eae0083fabae777a9720c5c..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/banner_visualisation_data_enriched.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:98eba5e5db19f482da8a3b26498c2fa633afa458f5b75e23d2dca24e24cc7596
-size 844651
diff --git a/app/src/content/assets/data/formatting_filters.csv b/app/src/content/assets/data/formatting_filters.csv
deleted file mode 100644
index afcb8e40eafdcf74d5a1194c39771b25ef7c5878..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/formatting_filters.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e5218781e5f018891311410d684785a3c661ca3cd25d2ac62bf45e6bb7d69e78
-size 63268
diff --git a/app/src/content/assets/data/image_correspondence_filters.csv b/app/src/content/assets/data/image_correspondence_filters.csv
deleted file mode 100644
index 409176cf9b54c834ef062c34ad0908565f169e59..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/image_correspondence_filters.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:64a8af61666421e33d02bf0e52d9df576a6a831677910b3631e8b02069e380a6
-size 60206
diff --git a/app/src/content/assets/data/internal_deduplication.csv b/app/src/content/assets/data/internal_deduplication.csv
deleted file mode 100644
index a55377c1b56e823092ac474af9d1650b0e8835d4..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/internal_deduplication.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d6b6bf0d84fe1bc67436c70f9a8d5919627e9c2bc9c3f931f4af80c01be22649
-size 47060
diff --git a/app/src/content/assets/data/llm_benchmarks.json b/app/src/content/assets/data/llm_benchmarks.json
deleted file mode 100644
index b62d7f67c0dd8f809cd3065a9ce86030d29d3ff4..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/llm_benchmarks.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:03eeac176ae9abab0c36d798678d80aaa14228ae71d0f9134127cc3cc0d00196
-size 1088
diff --git a/app/src/content/assets/data/mnist-variant-model.json b/app/src/content/assets/data/mnist-variant-model.json
deleted file mode 100644
index f6d4cc17abe497e553a2ac87687770c7f7047e3f..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/mnist-variant-model.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7dca86e85be46c1fca6a4e2503786e88e3f8d4609fb7284c8a1479620a5827da
-size 4315
diff --git a/app/src/content/assets/data/relevance_filters.csv b/app/src/content/assets/data/relevance_filters.csv
deleted file mode 100644
index 2cce8f33805f443a7f17a99e2d83376d9bac9dc8..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/relevance_filters.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:69acb8bc0b80b2c664d821b1c06d67af315e67d8a706cf9e5d351e4468392cc6
-size 63236
diff --git a/app/src/content/assets/data/remove_ch.csv b/app/src/content/assets/data/remove_ch.csv
deleted file mode 100644
index 733d7d6a4b82735330ad7b46136dbe42f7705956..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/remove_ch.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:869fc4724af7e9c868b6024f472f9ae0f6468b74ef61db101438f80610828abb
-size 28837
diff --git a/app/src/content/assets/data/s25_ratings.csv b/app/src/content/assets/data/s25_ratings.csv
deleted file mode 100644
index 9eececb8353846ae8c3dbee4171df39ea5a9c209..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/s25_ratings.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ca22654a0302da0ca335420b0a89cd770cea560b11f2a9f9f25927877d7ed231
-size 61626
diff --git a/app/src/content/assets/data/ss_vs_s1.csv b/app/src/content/assets/data/ss_vs_s1.csv
deleted file mode 100644
index a56d5aa8a6a94d96d9fcebb65a1f262ac03956c1..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/ss_vs_s1.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3f076631fcad76129ed8cab03c72a61965b465e1f3e7fa8dc68b7c7a9275616b
-size 28041
diff --git a/app/src/content/assets/data/vision.csv b/app/src/content/assets/data/vision.csv
deleted file mode 100644
index f5ded8db9f405c12478eeb75bf10ae326d83f43a..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/vision.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d28bd13dc3a9ff100c82e8c9dc59270563b865383d09cf28c5aba5812bfa75ee
-size 10913
diff --git a/app/src/content/assets/data/visual_dependency_filters.csv b/app/src/content/assets/data/visual_dependency_filters.csv
deleted file mode 100644
index 3cea8673216195618e97ad920d6db48b85412a5e..0000000000000000000000000000000000000000
--- a/app/src/content/assets/data/visual_dependency_filters.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a967b10ba4a1034f4d6da250d267a6af51722c3f6dbae0ef0221a62d53502d69
-size 60114
diff --git a/app/src/content/assets/image/ch1/ch1-lerobot-figure1.png b/app/src/content/assets/image/ch1/ch1-lerobot-figure1.png
new file mode 100644
index 0000000000000000000000000000000000000000..9a43981b7d60df842224ee6bff9be820809b36b6
--- /dev/null
+++ b/app/src/content/assets/image/ch1/ch1-lerobot-figure1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a850d2b9170736a42366d65dd858408dcffafa3420a0c6cfd678bbdd29a196fa
+size 2861318
diff --git a/app/src/content/assets/image/ch2/ch2-approaches.png b/app/src/content/assets/image/ch2/ch2-approaches.png
new file mode 100644
index 0000000000000000000000000000000000000000..161aac09e5cae1c51d7a24deb2038ad80358e8cb
--- /dev/null
+++ b/app/src/content/assets/image/ch2/ch2-approaches.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d07f3166fd9efe5b0823ecca63166c019b6fb9dcc912f7b1ae0fd209a25ba274
+size 93262
diff --git a/app/src/content/assets/image/ch2/ch2-classical-limitations.png b/app/src/content/assets/image/ch2/ch2-classical-limitations.png
new file mode 100644
index 0000000000000000000000000000000000000000..969684eb34a3f473e0a0df8ec491c27144d69613
--- /dev/null
+++ b/app/src/content/assets/image/ch2/ch2-classical-limitations.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85742a774d8d1ad3e36fc50d89c5a69409bce98ebe6bdba734896156ba668aa8
+size 4739243
diff --git a/app/src/content/assets/image/ch2/ch2-cost-accessibility.png b/app/src/content/assets/image/ch2/ch2-cost-accessibility.png
new file mode 100644
index 0000000000000000000000000000000000000000..17aa82045475dc0e0537649285e4abd0a9aefd2b
--- /dev/null
+++ b/app/src/content/assets/image/ch2/ch2-cost-accessibility.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:606cbb89fda90a2ddb22dc721ea978ffa9fe34a7f9f0bf1614b6ae53b4117411
+size 1962263
diff --git a/app/src/content/assets/image/ch2/ch2-planar-manipulator-floor-box.png b/app/src/content/assets/image/ch2/ch2-planar-manipulator-floor-box.png
new file mode 100644
index 0000000000000000000000000000000000000000..608b518385558b273d591d7f76d1d2804ece01b8
--- /dev/null
+++ b/app/src/content/assets/image/ch2/ch2-planar-manipulator-floor-box.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c856918ffb061c235d05e74df6310412f5b41ea907f0f12f55fed5c8b45590b
+size 93114
diff --git a/app/src/content/assets/image/ch2/ch2-planar-manipulator-floor-shelf.png b/app/src/content/assets/image/ch2/ch2-planar-manipulator-floor-shelf.png
new file mode 100644
index 0000000000000000000000000000000000000000..47c539881d7b58df4b4493093ab6b780c349a476
--- /dev/null
+++ b/app/src/content/assets/image/ch2/ch2-planar-manipulator-floor-shelf.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4abb239c45a576a02fc2cbd0d87f877b2c5f61dcac74e1b8c79a70ebacaca3e
+size 83589
diff --git a/app/src/content/assets/image/ch2/ch2-planar-manipulator-floor.png b/app/src/content/assets/image/ch2/ch2-planar-manipulator-floor.png
new file mode 100644
index 0000000000000000000000000000000000000000..1f19ca65db5de85acc43ca8240987b99fd298231
--- /dev/null
+++ b/app/src/content/assets/image/ch2/ch2-planar-manipulator-floor.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a2c70f2d7c903d9f16433a9ca44c10892fd0e10ca90e2d9b8438c3d25fa623a
+size 58946
diff --git a/app/src/content/assets/image/ch2/ch2-planar-manipulator-free.png b/app/src/content/assets/image/ch2/ch2-planar-manipulator-free.png
new file mode 100644
index 0000000000000000000000000000000000000000..42d6dc9662903b2563663a9b409a8dc83f69906f
--- /dev/null
+++ b/app/src/content/assets/image/ch2/ch2-planar-manipulator-free.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d860153a76720749a50a6d06c7bcb9886f5605a867f130f66810597ca3f5299
+size 44656
diff --git a/app/src/content/assets/image/ch2/ch2-platforms.png b/app/src/content/assets/image/ch2/ch2-platforms.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ccc153ed092d5493052d1ddede64094ae6b4068
--- /dev/null
+++ b/app/src/content/assets/image/ch2/ch2-platforms.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:baf76deb1a68b859d1e702bc7d0b4173a6b34b56d4bdf75c4748e80eb1934aad
+size 3616534
diff --git a/app/src/content/assets/image/ch2/ch2-so100-to-planar-manipulator.png b/app/src/content/assets/image/ch2/ch2-so100-to-planar-manipulator.png
new file mode 100644
index 0000000000000000000000000000000000000000..d4bc70f800df876a10b6fdb4ac51c2544b2977fb
--- /dev/null
+++ b/app/src/content/assets/image/ch2/ch2-so100-to-planar-manipulator.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:731806e912421ee3f3fcd10c24b5f5e9f4dd448f859e8213f8f11c0821fcbf59
+size 1555756
diff --git a/app/src/content/assets/image/ch3/ch3-agent-env.png b/app/src/content/assets/image/ch3/ch3-agent-env.png
new file mode 100644
index 0000000000000000000000000000000000000000..9d3ac5a9b05c8c48faf8660a5cac80737392110f
--- /dev/null
+++ b/app/src/content/assets/image/ch3/ch3-agent-env.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43c8641128f72b994a7269561fd6beaf2fbe0d73bb19f58ade559e271de1de31
+size 42614
diff --git a/app/src/content/assets/image/ch3/ch3-duck-sim-vs-real.png b/app/src/content/assets/image/ch3/ch3-duck-sim-vs-real.png
new file mode 100644
index 0000000000000000000000000000000000000000..142a5ea15f01aee271c1775e26a6a2c7bc4aedcc
--- /dev/null
+++ b/app/src/content/assets/image/ch3/ch3-duck-sim-vs-real.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c682cfebec3bf21f579a687d4f6a34d6f7cff225397e081188c39ca3b3def1e7
+size 1762155
diff --git a/app/src/content/assets/image/ch3/ch3-hil-serl-examples.png b/app/src/content/assets/image/ch3/ch3-hil-serl-examples.png
new file mode 100644
index 0000000000000000000000000000000000000000..d665f43d5ed8972fc76399ed8caedd9fee4b373e
--- /dev/null
+++ b/app/src/content/assets/image/ch3/ch3-hil-serl-examples.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae41b09a8a8412b28994425565438a897f827b3a2048d6832c2be7884b40a2af
+size 7216604
diff --git a/app/src/content/assets/image/ch3/ch3-learning-atlas.png b/app/src/content/assets/image/ch3/ch3-learning-atlas.png
new file mode 100644
index 0000000000000000000000000000000000000000..6aceb0b7ccaefebf0bb854ab012eca0cc3ac5da2
--- /dev/null
+++ b/app/src/content/assets/image/ch3/ch3-learning-atlas.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:124d586210aa9b3a110c712c4eff3629d0064a507c9c77bf937dd00cc959428c
+size 178001
diff --git a/app/src/content/assets/image/ch3/ch3-learning-benefits.png b/app/src/content/assets/image/ch3/ch3-learning-benefits.png
new file mode 100644
index 0000000000000000000000000000000000000000..89684d039e24b897517612c222ef6e979f42a7c2
--- /dev/null
+++ b/app/src/content/assets/image/ch3/ch3-learning-benefits.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c23f98c050afb75098f34a2bca49fa30ebb4a2b373447c36ba62612854253ff3
+size 6936585
diff --git a/app/src/content/assets/image/ch3/ch3-many-ducks.png b/app/src/content/assets/image/ch3/ch3-many-ducks.png
new file mode 100644
index 0000000000000000000000000000000000000000..7605bcb2ba0f2abcd7213a4ca092e792db08c504
--- /dev/null
+++ b/app/src/content/assets/image/ch3/ch3-many-ducks.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:418bdeff168978207fcc623db74d25b86d11f27d1100a28238bc1591901b93de
+size 4872198
diff --git a/app/src/content/assets/image/ch3/ch3-rl-algorithms-atlas.png b/app/src/content/assets/image/ch3/ch3-rl-algorithms-atlas.png
new file mode 100644
index 0000000000000000000000000000000000000000..95e818db1704eb52f601c8d5a32f215b7cf7620c
--- /dev/null
+++ b/app/src/content/assets/image/ch3/ch3-rl-algorithms-atlas.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2aa853e6067e7bd06cfa0d12250d4277fbe2020b8a2b817c005b084c49c905d5
+size 194522
diff --git a/app/src/content/assets/image/ch3/ch3-rl-examples.png b/app/src/content/assets/image/ch3/ch3-rl-examples.png
new file mode 100644
index 0000000000000000000000000000000000000000..06de5007b9f0c10c23f79a2af13865a701916662
--- /dev/null
+++ b/app/src/content/assets/image/ch3/ch3-rl-examples.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:edb1fa24ee3d279302980016809eab038fc43037156b8d7cadae7fa5b9dddbba
+size 9051359
diff --git a/app/src/content/assets/image/ch4/ch4-act-decoder.png b/app/src/content/assets/image/ch4/ch4-act-decoder.png
new file mode 100644
index 0000000000000000000000000000000000000000..9a09fcb99bb717287ca74d165a3ca5d6983febba
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-act-decoder.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:578074c47e65992422e9cb991949b1d63598aded2098dfde3925a33dfd55e481
+size 3180391
diff --git a/app/src/content/assets/image/ch4/ch4-act-encoder.png b/app/src/content/assets/image/ch4/ch4-act-encoder.png
new file mode 100644
index 0000000000000000000000000000000000000000..f587680a13512bae2fe83b3b472ea54a273293e5
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-act-encoder.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ceeeccb9dd7e791f215f71ee422d9adfb8c2ff1d2417a851e31ba6a6715aaf7
+size 874336
diff --git a/app/src/content/assets/image/ch4/ch4-act.png b/app/src/content/assets/image/ch4/ch4-act.png
new file mode 100644
index 0000000000000000000000000000000000000000..1f884e4a57994ca4a50e979ce8a7595bd02afc6f
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-act.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:318b6f77277c5e8fcf51e2aba63154ee99052e2bcff2af0387fb3cfd1d07cff7
+size 1517348
diff --git a/app/src/content/assets/image/ch4/ch4-action-vs-observation-distribution.png b/app/src/content/assets/image/ch4/ch4-action-vs-observation-distribution.png
new file mode 100644
index 0000000000000000000000000000000000000000..fc82dc6c86ce40126b00697f13a43cc563fe4b4d
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-action-vs-observation-distribution.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7db4ecc0d54d9cab6b8a16017c81bfd9b7fd5d7997bcdd645ccf57167f7efcf2
+size 274240
diff --git a/app/src/content/assets/image/ch4/ch4-async-inference.png b/app/src/content/assets/image/ch4/ch4-async-inference.png
new file mode 100644
index 0000000000000000000000000000000000000000..73aae17126c70f3fca8651ef62b7d519c81e6f58
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-async-inference.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:850ebb6e6ad809edc48597a89cf8e25b2664b9137ca4602ae14f164524f8d232
+size 282300
diff --git a/app/src/content/assets/image/ch4/ch4-bc-trajectories.png b/app/src/content/assets/image/ch4/ch4-bc-trajectories.png
new file mode 100644
index 0000000000000000000000000000000000000000..d577a6966244c54eb3738bd61af13232a603145a
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-bc-trajectories.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ede85dbb8f12b3cced4dc0e12f97e3713d8432953183840f99e8534998d7f3b
+size 2253030
diff --git a/app/src/content/assets/image/ch4/ch4-diffusion-policy.png b/app/src/content/assets/image/ch4/ch4-diffusion-policy.png
new file mode 100644
index 0000000000000000000000000000000000000000..56da7917d95a1592faafde62702170fac438f903
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-diffusion-policy.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3cb644c79fd016e77c78bd7fcf185908b18fb127f656003eb577349cfb6da40
+size 2805702
diff --git a/app/src/content/assets/image/ch4/ch4-diffusion-robot-actions.png b/app/src/content/assets/image/ch4/ch4-diffusion-robot-actions.png
new file mode 100644
index 0000000000000000000000000000000000000000..43d8ce2193bdaeecb172de160290392aaf4000c0
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-diffusion-robot-actions.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a59b816b60a53784127e3dcf0aad612ba14474bde57e1c2b73b670665d1b70ec
+size 8927638
diff --git a/app/src/content/assets/image/ch4/ch4-diffusion-vs-flowmatching.png b/app/src/content/assets/image/ch4/ch4-diffusion-vs-flowmatching.png
new file mode 100644
index 0000000000000000000000000000000000000000..2f4898e0c4db3a001354cc9a78d40e7537b34359
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-diffusion-vs-flowmatching.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aef138f5120025b0bad73788bc8b3af91f27331af3b49bafb09b15037944fa12
+size 189022
diff --git a/app/src/content/assets/image/ch4/ch4-issues-with-bc.png b/app/src/content/assets/image/ch4/ch4-issues-with-bc.png
new file mode 100644
index 0000000000000000000000000000000000000000..789283d5085bae36ebaf062bd157007988e2dd23
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-issues-with-bc.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b726d8aa64534e8cbec4a0084fd86e4dfcc0b17685559970006a573dd326459
+size 1560808
diff --git a/app/src/content/assets/image/ch4/ch4-latent-variable-model.png b/app/src/content/assets/image/ch4/ch4-latent-variable-model.png
new file mode 100644
index 0000000000000000000000000000000000000000..62a7ade0557696ee25c61d10ef323ca1ec9bb077
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-latent-variable-model.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5b1f48d4dc011d5a20b1d5bccc5cde750f4ffab4b8c48bb5b04529a18aa0390
+size 983775
diff --git a/app/src/content/assets/image/ch4/ch4-many-latents.png b/app/src/content/assets/image/ch4/ch4-many-latents.png
new file mode 100644
index 0000000000000000000000000000000000000000..d972eb9694fe47d81d7a5bff66f78edd80c83e57
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-many-latents.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f5421aae5c9e9735de598fca1a5c68ef7fd28c8b31112c4675356f6deda9b29
+size 222323
diff --git a/app/src/content/assets/image/ch4/ch4-normalizing-flows.png b/app/src/content/assets/image/ch4/ch4-normalizing-flows.png
new file mode 100644
index 0000000000000000000000000000000000000000..cf51b8de51af38c0ea807889d8056d41c524c2d5
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-normalizing-flows.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51f73d09b35b8ccd5685c6b26f7615f8d6ab3df7d045b2502e9232bfe33beace
+size 278482
diff --git a/app/src/content/assets/image/ch4/ch4-observation-action-mapping.png b/app/src/content/assets/image/ch4/ch4-observation-action-mapping.png
new file mode 100644
index 0000000000000000000000000000000000000000..6206870edf17a28bafe36ca0c5631a62b14f5a6a
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-observation-action-mapping.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1a4a70971ea4c7cf73c089a70e4bc9dd1b5aba43021016fea8b323ad2642c53
+size 2081981
diff --git a/app/src/content/assets/image/ch4/ch4-queues.png b/app/src/content/assets/image/ch4/ch4-queues.png
new file mode 100644
index 0000000000000000000000000000000000000000..c1e912ba8a2d5b254ea9d990ba8dbab491cb22ed
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-queues.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d3072c26d0419ee4b19f4ebd10c66e117e113514326eb3e7864057644c305d7
+size 1971787
diff --git a/app/src/content/assets/image/ch4/ch4-task-effect-on-pairs.png b/app/src/content/assets/image/ch4/ch4-task-effect-on-pairs.png
new file mode 100644
index 0000000000000000000000000000000000000000..6fa47c83e5ba456655b025bd651aea0fc6feeeaa
--- /dev/null
+++ b/app/src/content/assets/image/ch4/ch4-task-effect-on-pairs.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0423b4760f661afa6b81a896a473a4bfc50737b0ecef76fa75051eb6ccf69896
+size 1186204
diff --git a/app/src/content/assets/image/ch5/ch5-generalist-policies-timeline.png b/app/src/content/assets/image/ch5/ch5-generalist-policies-timeline.png
new file mode 100644
index 0000000000000000000000000000000000000000..d85a308d7665bd9c6fab4b0f59f622b0e1599745
--- /dev/null
+++ b/app/src/content/assets/image/ch5/ch5-generalist-policies-timeline.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98f0efdb30302f2fd582bbec379007ef3d2188171f0d700014539560b5d29a9f
+size 121521
diff --git a/app/src/content/assets/image/ch5/ch5-ml-vs-robotics-foundation.png b/app/src/content/assets/image/ch5/ch5-ml-vs-robotics-foundation.png
new file mode 100644
index 0000000000000000000000000000000000000000..0327c71faf9a48c757b6a6f3027f7e54cac6f0e7
--- /dev/null
+++ b/app/src/content/assets/image/ch5/ch5-ml-vs-robotics-foundation.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e858e0c5c2d7246e097c8e048d7c378c0ce20c922e66ceac8db8dbb2c5598e79
+size 3389240
diff --git a/app/src/content/assets/image/ch5/ch5-pi0-sampling-timesteps.png b/app/src/content/assets/image/ch5/ch5-pi0-sampling-timesteps.png
new file mode 100644
index 0000000000000000000000000000000000000000..84401c9e5468cef66fcd2cdf2014f0c103003c93
--- /dev/null
+++ b/app/src/content/assets/image/ch5/ch5-pi0-sampling-timesteps.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c27d0d34e08154b42692d1a3ea142ef7742ab50547211e9b22f16d79d14fbb3
+size 186917
diff --git a/app/src/content/assets/image/ch5/ch5-pi0.png b/app/src/content/assets/image/ch5/ch5-pi0.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ea364ceb9691e4ea9928caac2ee6a32860a52d3
--- /dev/null
+++ b/app/src/content/assets/image/ch5/ch5-pi0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:689a7d0a94d116edce122d8c9010aa456ae7d1d816f5684513711d36c94ebb89
+size 1242717
diff --git a/app/src/content/assets/image/ch5/ch5-smolvla.png b/app/src/content/assets/image/ch5/ch5-smolvla.png
new file mode 100644
index 0000000000000000000000000000000000000000..488341b99047ecfad012127baa3a759354577853
--- /dev/null
+++ b/app/src/content/assets/image/ch5/ch5-smolvla.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49575d51c64eb320c588673fb9b33d1d0a3de7f6af7165a18c35ffb40af93e7a
+size 1333430
diff --git a/app/src/content/assets/image/ch5/ch5-trends.png b/app/src/content/assets/image/ch5/ch5-trends.png
new file mode 100644
index 0000000000000000000000000000000000000000..b399968a1d56a98ce0f4af3d1458cf903a1e1471
--- /dev/null
+++ b/app/src/content/assets/image/ch5/ch5-trends.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:357708ec69852658d69c5f3ec3d9c5805939fdaa0d13150f6777731579db09fe
+size 636731
diff --git a/app/src/content/assets/image/misc/lerobot-team.jpeg b/app/src/content/assets/image/misc/lerobot-team.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..330c9a79b9751bf86ffe5ce84a9aaac88ac5d7e6
--- /dev/null
+++ b/app/src/content/assets/image/misc/lerobot-team.jpeg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b79149533fb8602ee423c91c068100657745045bfd1507a6a61e30d58c65877
+size 170202
diff --git a/app/src/content/assets/image/placeholder-wide.png b/app/src/content/assets/image/placeholder-wide.png
deleted file mode 100644
index 0010420a4ede3b75de2154c4e1fd864a600a76f8..0000000000000000000000000000000000000000
--- a/app/src/content/assets/image/placeholder-wide.png
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:200daf8ade0c7f035d883fefa9a12d6ba7cca504b1d5571774748c3c90639103
-size 34642
diff --git a/app/src/content/assets/image/placeholder.png b/app/src/content/assets/image/placeholder.png
deleted file mode 100644
index 1248e48424b9a83c2230f63e5db5344c9821eb39..0000000000000000000000000000000000000000
--- a/app/src/content/assets/image/placeholder.png
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:af82403ec775e8ed0139a70d034e000cb567b6766a3187d246800f0384d57ea9
-size 677135
diff --git a/app/src/content/assets/image/visual-vocabulary-poster.png b/app/src/content/assets/image/visual-vocabulary-poster.png
deleted file mode 100644
index 418527bd50ea464437b59626ad2f8e86dd8ce78a..0000000000000000000000000000000000000000
--- a/app/src/content/assets/image/visual-vocabulary-poster.png
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:62f72a7eeabc611d4b312c882589bae9369d49e39dd40e2d17e68c77399efc11
-size 915038
diff --git a/app/src/content/bibliography.bib b/app/src/content/bibliography.bib
index 794f652ed80d8ac490001ed4c8827be4fac47ce0..f327c76ba04fe23bc3ae59904848bf0289602306 100644
--- a/app/src/content/bibliography.bib
+++ b/app/src/content/bibliography.bib
@@ -1,130 +1,2246 @@
-@inproceedings{vaswani2017attention,
- title={Attention Is All You Need},
- author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {
- }Lukasz and Polosukhin, Illia},
- booktitle={Advances in Neural Information Processing Systems},
- year={2017}
-}
-
-@book{mckinney2017python,
- title={Python for Data Analysis},
- author={McKinney, Wes},
- publisher={O'Reilly Media},
- address={Sebastopol, CA},
- year={2017},
- edition={2},
- isbn={978-1491957660}
-}
-
-@inproceedings{he2016resnet,
- title={Deep Residual Learning for Image Recognition},
- author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
- booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
- pages={770--778},
- year={2016},
- doi={10.1109/CVPR.2016.90},
- url={https://doi.org/10.1109/CVPR.2016.90}
-}
-
-@article{silver2017mastering,
- title={Mastering the game of Go without human knowledge},
- author={Silver, David and Schrittwieser, Julian and Simonyan, Karen and Antonoglou, Ioannis and Huang, Aja and others},
- journal={Nature},
- volume={550},
- number={7676},
- pages={354--359},
- year={2017},
- month={oct},
- doi={10.1038/nature24270},
- url={https://www.nature.com/articles/nature24270}
-}
-
-@techreport{openai2023gpt4,
- title={GPT-4 Technical Report},
- author={{OpenAI}},
- institution={OpenAI},
- year={2023},
- number={arXiv:2303.08774},
- archivePrefix={arXiv},
- eprint={2303.08774},
- primaryClass={cs.CL},
- url={https://arxiv.org/abs/2303.08774}
-}
-
-@phdthesis{doe2020thesis,
- title={Learning Efficient Representations for Large-Scale Visual Recognition},
- author={Doe, Jane},
- school={Massachusetts Institute of Technology},
- address={Cambridge, MA},
- year={2020},
- doi={10.5555/mit-2020-xyz}
-}
-
-@incollection{cover2006entropy,
- title={Entropy, Relative Entropy, and Mutual Information},
- author={Cover, Thomas M. and Thomas, Joy A.},
- booktitle={Elements of Information Theory},
- publisher={Wiley},
- address={Hoboken, NJ},
- edition={2},
- year={2006},
- pages={13--55},
- isbn={978-0471241959}
-}
-
-@misc{zenodo2021dataset,
- title={ImageNet-21K Subset (Version 2.0)},
- author={Smith, John and Lee, Alice and Kumar, Ravi},
- year={2021},
- howpublished={Dataset on Zenodo},
- doi={10.5281/zenodo.1234567},
- url={https://doi.org/10.5281/zenodo.1234567},
- note={Accessed 2025-09-01}
-}
-
-@misc{sklearn2024,
- title={scikit-learn: Machine Learning in Python (Version 1.4)},
- author={Pedregosa, Fabian and Varoquaux, Ga{"e}l and Gramfort, Alexandre and others},
- year={2024},
- howpublished={Software},
- doi={10.5281/zenodo.592264},
- url={https://scikit-learn.org}
-}
-
-@inproceedings{smith2024privacy,
- title={Privacy-Preserving Training with Low-Precision Secure Aggregation},
- author={Smith, Emily and Zhang, Wei and Rossi, Marco and Patel, Neha},
- booktitle={Proceedings of the 41st International Conference on Machine Learning},
- editor={Smith, A. and Johnson, B.},
- series={Proceedings of Machine Learning Research},
- volume={235},
- pages={12345--12367},
- address={Vienna, Austria},
- publisher={PMLR},
- month={jul},
- year={2024},
- url={https://proceedings.mlr.press/v235/}
-}
-
-@article{kingma2015adam,
- title={Adam: A Method for Stochastic Optimization},
- author={Kingma, Diederik P. and Ba, Jimmy},
- journal={International Conference on Learning Representations (ICLR)},
- year={2015},
- archivePrefix={arXiv},
- eprint={1412.6980},
- primaryClass={cs.LG},
- url={https://arxiv.org/abs/1412.6980}
-}
-
-@misc{raffel2020t5,
- title={Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
- author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and others},
- year={2020},
- howpublished={arXiv preprint},
- archivePrefix={arXiv},
- eprint={1910.10683},
- primaryClass={cs.LG},
- doi={10.48550/arXiv.1910.10683},
- url={https://arxiv.org/abs/1910.10683}
+@misc{agibot-world-contributorsAgiBotWorldColosseo2025,
+ title = {AgiBot World Colosseo: A Large-scale Manipulation Platform for {Scalable} and {Intelligent Embodied Systems}},
+ shorttitle = {AgiBot World Colosseo},
+ author = {{AgiBot-World-Contributors} and Bu, Qingwen and Cai, Jisong and Chen, Li and Cui, Xiuqi and Ding, Yan and Feng, Siyuan and Gao, Shenyuan and He, Xindong and Hu, Xuan and Huang, Xu and Jiang, Shu and Jiang, Yuxin and Jing, Cheng and Li, Hongyang and Li, Jialu and Liu, Chiming and Liu, Yi and Lu, Yuxiang and Luo, Jianlan and Luo, Ping and Mu, Yao and Niu, Yuehan and Pan, Yixuan and Pang, Jiangmiao and Qiao, Yu and Ren, Guanghui and Ruan, Cheng and Shan, Jiaqi and Shen, Yongjian and Shi, Chengshi and Shi, Mingkang and Shi, Modi and Sima, Chonghao and Song, Jianheng and Wang, Huijie and Wang, Wenhao and Wei, Dafeng and Xie, Chengen and Xu, Guo and Yan, Junchi and Yang, Cunbiao and Yang, Lei and Yang, Shukai and Yao, Maoqing and Zeng, Jia and Zhang, Chi and Zhang, Qinglin and Zhao, Bin and Zhao, Chengyue and Zhao, Jiaqi and Zhu, Jianchao},
+ year = {2025},
+ month = aug,
+ number = {arXiv:2503.06669},
+ eprint = {2503.06669},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2503.06669},
+ urldate = {2025-08-27},
+ abstract = {We explore how scalable robot data can address real-world challenges for generalized robotic manipulation. Introducing AgiBot World, a large-scale platform comprising over 1 million trajectories across 217 tasks in five deployment scenarios, we achieve an order-of-magnitude increase in data scale compared to existing datasets. Accelerated by a standardized collection pipeline with human-in-the-loop verification, AgiBot World guarantees high-quality and diverse data distribution. It is extensible from grippers to dexterous hands and visuo-tactile sensors for fine-grained skill acquisition. Building on top of data, we introduce Genie Operator-1 (GO-1), a novel generalist policy that leverages latent action representations to maximize data utilization, demonstrating predictable performance scaling with increased data volume. Policies pre-trained on our dataset achieve an average performance improvement of 30% over those trained on Open X-Embodiment, both in in-domain and out-of-distribution scenarios. GO-1 exhibits exceptional capability in real-world dexterous and long-horizon tasks, achieving over 60% success rate on complex tasks and outperforming prior RDT approach by 32%. By open-sourcing the dataset, tools, and models, we aim to democratize access to large-scale, high-quality robot data, advancing the pursuit of scalable and general-purpose intelligence.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/TGP4C7GA/AgiBot-World-Contributors et al. - 2025 - AgiBot World Colosseo A Large-scale Manipulation Platform for Scalable and Intelligent Embodied Sys.pdf;/Users/fracapuano/Zotero/storage/IC7BUHWR/2503.html}
+}
+
+@article{agrawalComputationalSensorimotorLearning,
+ title = {Computational {Sensorimotor Learning}},
+ author = {Agrawal, Pulkit},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/KSDX9GA2/Agrawal - Computational Sensorimotor Learning.pdf}
+}
+
+@misc{akkayaSolvingRubiksCube2019,
+ title = {Solving {Rubik}'s {Cube} with a {Robot Hand}},
+ author = {Akkaya, Ilge and Andrychowicz, Marcin and Chociej, Maciek and Litwin, Mateusz and McGrew, Bob and Petron, Arthur and Paino, Alex and Plappert, Matthias and Powell, Glenn and Ribas, Raphael and Schneider, Jonas and Tezak, Nikolas and Tworek, Jerry and Welinder, Peter and Weng, Lilian and Yuan, Qiming and Zaremba, Wojciech and Zhang, Lei},
+ year = {2019},
+ month = oct,
+ number = {arXiv:1910.07113},
+ eprint = {1910.07113},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1910.07113},
+ urldate = {2025-08-26},
+ abstract = {We demonstrate that models trained only in simulation can be used to solve a manipulation problem of unprecedented complexity on a real robot. This is made possible by two key components: a novel algorithm, which we call automatic domain randomization (ADR) and a robot platform built for machine learning. ADR automatically generates a distribution over randomized environments of ever-increasing difficulty. Control policies and vision state estimators trained with ADR exhibit vastly improved sim2real transfer. For control policies, memory-augmented models trained on an ADR-generated distribution of environments show clear signs of emergent meta-learning at test time. The combination of ADR with our custom robot platform allows us to solve a Rubik's cube with a humanoid robot hand, which involves both control and state estimation problems. Videos summarizing our results are available: https://openai.com/blog/solving-rubiks-cube/},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Robotics,Statistics - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/5HNZLG9D/OpenAI et al. - 2019 - Solving Rubik's Cube with a Robot Hand.pdf;/Users/fracapuano/Zotero/storage/WSM7BJ4I/1910.html}
+}
+
+@misc{alayracFlamingoVisualLanguage2022,
+ title = {Flamingo: A {Visual Language Model} for Few-Shot Learning},
+ shorttitle = {Flamingo},
+ author = {Alayrac, Jean-Baptiste and Donahue, Jeff and Luc, Pauline and Miech, Antoine and Barr, Iain and Hasson, Yana and Lenc, Karel and Mensch, Arthur and Millican, Katie and Reynolds, Malcolm and Ring, Roman and Rutherford, Eliza and Cabi, Serkan and Han, Tengda and Gong, Zhitao and Samangooei, Sina and Monteiro, Marianne and Menick, Jacob and Borgeaud, Sebastian and Brock, Andrew and Nematzadeh, Aida and Sharifzadeh, Sahand and Binkowski, Mikolaj and Barreira, Ricardo and Vinyals, Oriol and Zisserman, Andrew and Simonyan, Karen},
+ year = {2022},
+ month = nov,
+ number = {arXiv:2204.14198},
+ eprint = {2204.14198},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2204.14198},
+ urldate = {2025-08-27},
+ abstract = {Building models that can be rapidly adapted to novel tasks using only a handful of annotated examples is an open challenge for multimodal machine learning research. We introduce Flamingo, a family of Visual Language Models (VLM) with this ability. We propose key architectural innovations to: (i) bridge powerful pretrained vision-only and language-only models, (ii) handle sequences of arbitrarily interleaved visual and textual data, and (iii) seamlessly ingest images or videos as inputs. Thanks to their flexibility, Flamingo models can be trained on large-scale multimodal web corpora containing arbitrarily interleaved text and images, which is key to endow them with in-context few-shot learning capabilities. We perform a thorough evaluation of our models, exploring and measuring their ability to rapidly adapt to a variety of image and video tasks. These include open-ended tasks such as visual question-answering, where the model is prompted with a question which it has to answer; captioning tasks, which evaluate the ability to describe a scene or an event; and close-ended tasks such as multiple-choice visual question-answering. For tasks lying anywhere on this spectrum, a single Flamingo model can achieve a new state of the art with few-shot learning, simply by prompting the model with task-specific examples. On numerous benchmarks, Flamingo outperforms models fine-tuned on thousands of times more task-specific data.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/QZ69HN5K/Alayrac et al. - 2022 - Flamingo a Visual Language Model for Few-Shot Learning.pdf;/Users/fracapuano/Zotero/storage/JMAD5HJY/2204.html}
+}
+
+@article{aldacoALOHA2Enhanced,
+ title = {ALOHA 2: An Enhanced Low-Cost Hardware for {Bimanual Teleoperation}},
+ author = {Aldaco, Jorge and Armstrong, Travis and Baruch, Robert and Bingham, Jeff and Chan, Sanky and Dwibedi, Debidatta and Finn, Chelsea and Florence, Pete and Goodrich, Spencer and Gramlich, Wayne and Herzog, Alexander and Hoech, Jonathan and Nguyen, Thinh and Storz, Ian and Tabanpour, Baruch and Tompson, Jonathan and Wahid, Ayzaan and Wahrburg, Ted and Xu, Sichun and Yaroshenko, Sergey and Zhao, Tony Z},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/LDEJG62Q/Aldaco et al. - ALOHA 2 An Enhanced Low-Cost Hardware for Bimanual Teleoperation.pdf}
+}
+
+@article{alizadehComprehensiveSurveySpace2024,
+ title = {A Comprehensive Survey of Space Robotic Manipulators for On-Orbit Servicing},
+ author = {Alizadeh, Mohammad and Zhu, Zheng H.},
+ year = {2024},
+ month = oct,
+ journal = {Frontiers in Robotics and AI},
+ volume = {11},
+ publisher = {Frontiers},
+ issn = {2296-9144},
+ doi = {10.3389/frobt.2024.1470950},
+ urldate = {2025-08-26},
+ abstract = {On-Orbit Servicing (OOS) robots are transforming space exploration by enabling vital maintenance and repair of spacecraft directly in space. However, achieving precise and safe manipulation in microgravity necessitates overcoming significant challenges. This survey delves into four crucial areas essential for successful OOS manipulation: object state estimation, motion planning, and feedback control. Techniques from traditional vision to advanced X-ray and neural network methods are explored for object state estimation. Strategies for fuel-optimized trajectories, docking maneuvers, and collision avoidance are examined in motion planning. The survey also explores control methods for various scenarios, including cooperative manipulation and handling uncertainties, in feedback control. Additionally, this survey examines how Machine learning techniques can further propel OOS robots towards more complex and delicate tasks in space.},
+ langid = {english},
+ keywords = {control,machine learning,motion planning,on-orbit servicing,pose estimation,robotic manipulator,space robots},
+ file = {/Users/fracapuano/Zotero/storage/VA36KZYY/Alizadeh and Zhu - 2024 - A comprehensive survey of space robotic manipulators for on-orbit servicing.pdf}
+}
+
+@misc{allalSmolLM2WhenSmol2025,
+ title = {SmolLM2: {When Smol Goes Big} -- Data-Centric Training of a {Small Language Model}},
+ shorttitle = {SmolLM2},
+ author = {Allal, Loubna Ben and Lozhkov, Anton and Bakouch, Elie and Bl{\'a}zquez, Gabriel Mart{\'i}n and Penedo, Guilherme and Tunstall, Lewis and Marafioti, Andr{\'e}s and Kydl{\'i}{\v c}ek, Hynek and Lajar{\'i}n, Agust{\'i}n Piqueres and Srivastav, Vaibhav and Lochner, Joshua and Fahlgren, Caleb and Nguyen, Xuan-Son and Fourrier, Cl{\'e}mentine and Burtenshaw, Ben and Larcher, Hugo and Zhao, Haojun and Zakka, Cyril and Morlon, Mathieu and Raffel, Colin and von Werra, Leandro and Wolf, Thomas},
+ year = {2025},
+ month = feb,
+ number = {arXiv:2502.02737},
+ eprint = {2502.02737},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2502.02737},
+ urldate = {2025-09-09},
+ abstract = {While large language models have facilitated breakthroughs in many applications of artificial intelligence, their inherent largeness makes them computationally expensive and challenging to deploy in resource-constrained settings. In this paper, we document the development of SmolLM2, a state-of-the-art "small" (1.7 billion parameter) language model (LM). To attain strong performance, we overtrain SmolLM2 on {\textasciitilde}11 trillion tokens of data using a multi-stage training process that mixes web text with specialized math, code, and instruction-following data. We additionally introduce new specialized datasets (FineMath, Stack-Edu, and SmolTalk) at stages where we found existing datasets to be problematically small or low-quality. To inform our design decisions, we perform both small-scale ablations as well as a manual refinement process that updates the dataset mixing rates at each stage based on the performance at the previous stage. Ultimately, we demonstrate that SmolLM2 outperforms other recent small LMs including Qwen2.5-1.5B and Llama3.2-1B. To facilitate future research on LM development as well as applications of small LMs, we release both SmolLM2 as well as all of the datasets we prepared in the course of this project.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computation and Language},
+ file = {/Users/fracapuano/Zotero/storage/I7XDMSV7/Allal et al. - 2025 - SmolLM2 When Smol Goes Big -- Data-Centric Training of a Small Language Model.pdf;/Users/fracapuano/Zotero/storage/6MLZI84T/2502.html}
+}
+
+@misc{antonovaReinforcementLearningPivoting2017,
+ title = {Reinforcement {Learning} for {Pivoting Task}},
+ author = {Antonova, Rika and Cruciani, Silvia and Smith, Christian and Kragic, Danica},
+ year = {2017},
+ month = mar,
+ number = {arXiv:1703.00472},
+ eprint = {1703.00472},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1703.00472},
+ urldate = {2025-08-25},
+ abstract = {In this work we propose an approach to learn a robust policy for solving the pivoting task. Recently, several model-free continuous control algorithms were shown to learn successful policies without prior knowledge of the dynamics of the task. However, obtaining successful policies required thousands to millions of training episodes, limiting the applicability of these approaches to real hardware. We developed a training procedure that allows us to use a simple custom simulator to learn policies robust to the mismatch of simulation vs robot. In our experiments, we demonstrate that the policy learned in the simulator is able to pivot the object to the desired target angle on the real robot. We also show generalization to an object with different inertia, shape, mass and friction properties than those used during training. This result is a step towards making model-free reinforcement learning available for solving robotics tasks via pre-training in simulators that offer only an imprecise match to the real-world dynamics.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/WRZCHVGB/Antonova et al. - 2017 - Reinforcement Learning for Pivoting Task.pdf;/Users/fracapuano/Zotero/storage/WJEJ2VGU/1703.html}
+}
+
+@article{aractingiControllingSolo12Quadruped2023,
+ title = {Controlling the Solo12 Quadruped Robot with Deep Reinforcement Learning},
+ author = {Aractingi, Michel and L{\'e}ziart, Pierre-Alexandre and Flayols, Thomas and Perez, Julien and Silander, Tomi and Sou{\`e}res, Philippe},
+ year = {2023},
+ month = jul,
+ journal = {Scientific Reports},
+ volume = {13},
+ number = {1},
+ pages = {11945},
+ publisher = {Nature Publishing Group},
+ issn = {2045-2322},
+ doi = {10.1038/s41598-023-38259-7},
+ urldate = {2025-08-27},
+ abstract = {Quadruped robots require robust and general locomotion skills to exploit their mobility potential in complex and challenging environments. In this work, we present an implementation of a robust end-to-end learning-based controller on the Solo12 quadruped. Our method is based on deep reinforcement learning of joint impedance references. The resulting control policies follow a commanded velocity reference while being efficient in its energy consumption and easy to deploy. We detail the learning procedure and method for transfer on the real robot. We show elaborate experiments. Finally, we present experimental results of the learned locomotion on various grounds indoors and outdoors. These results show that the Solo12 robot is a suitable open-source platform for research combining learning and control because of the easiness in transferring and deploying learned controllers.},
+ copyright = {2023 The Author(s)},
+ langid = {english},
+ keywords = {Computer science,Information technology},
+ file = {/Users/fracapuano/Zotero/storage/84ZFT7RP/Aractingi et al. - 2023 - Controlling the Solo12 quadruped robot with deep reinforcement learning.pdf}
+}
+
+@misc{bai2025qwen25vl,
+ title = {Qwen2.5-{VL} Technical Report},
+ author = {Bai, Shuai and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Song, Sibo and Dang, Kai and Wang, Peng and Wang, Shijie and Tang, Jun and Zhong, Humen and Zhu, Yuanzhi and Yang, Mingkun and Li, Zhaohai and Wan, Jianqiang and Wang, Pengfei and Ding, Wei and Fu, Zheren and Xu, Yiheng and Ye, Jiabo and Zhang, Xi and Xie, Tianbao and Cheng, Zesen and Zhang, Hang and Yang, Zhibo and Xu, Haiyang and Lin, Junyang},
+ year = {2025},
+ eprint = {2502.13923},
+ primaryclass = {cs.CV},
+ archiveprefix = {arXiv}
+}
+
+@misc{ballEfficientOnlineReinforcement2023,
+ title = {Efficient {Online Reinforcement Learning} with {Offline Data}},
+ author = {Ball, Philip J. and Smith, Laura and Kostrikov, Ilya and Levine, Sergey},
+ year = {2023},
+ month = may,
+ number = {arXiv:2302.02948},
+ eprint = {2302.02948},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2302.02948},
+ urldate = {2025-08-30},
+ abstract = {Sample efficiency and exploration remain major challenges in online reinforcement learning (RL). A powerful approach that can be applied to address these issues is the inclusion of offline data, such as prior trajectories from a human expert or a sub-optimal exploration policy. Previous methods have relied on extensive modifications and additional complexity to ensure the effective use of this data. Instead, we ask: can we simply apply existing off-policy methods to leverage offline data when learning online? In this work, we demonstrate that the answer is yes; however, a set of minimal but important changes to existing off-policy RL algorithms are required to achieve reliable performance. We extensively ablate these design choices, demonstrating the key factors that most affect performance, and arrive at a set of recommendations that practitioners can readily apply, whether their data comprise a small number of expert demonstrations or large volumes of sub-optimal trajectories. We see that correct application of these simple recommendations can provide a $\mathbf{2.5\times}$ improvement over existing approaches across a diverse set of competitive benchmarks, with no additional computational overhead. We have released our code at https://github.com/ikostrikov/rlpd.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/MUKA5D2V/Ball et al. - 2023 - Efficient Online Reinforcement Learning with Offline Data.pdf;/Users/fracapuano/Zotero/storage/IKURHC3D/2302.html}
+}
+
+@misc{bekrisStateRobotMotion2024,
+ title = {The {State} of {Robot Motion Generation}},
+ author = {Bekris, Kostas E. and Doerr, Joe and Meng, Patrick and Tangirala, Sumanth},
+ year = {2024},
+ month = oct,
+ number = {arXiv:2410.12172},
+ eprint = {2410.12172},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2410.12172},
+ urldate = {2025-08-26},
+ abstract = {This paper reviews the large spectrum of methods for generating robot motion proposed over the 50 years of robotics research culminating in recent developments. It crosses the boundaries of methodologies, typically not surveyed together, from those that operate over explicit models to those that learn implicit ones. The paper discusses the current state-of-the-art as well as properties of varying methodologies, highlighting opportunities for integration.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/DMJJZFDZ/Bekris et al. - 2024 - The State of Robot Motion Generation.pdf;/Users/fracapuano/Zotero/storage/TL42IRAN/2410.html}
+}
+
+@article{bellemareAutonomousNavigationStratospheric2020,
+ title = {Autonomous Navigation of Stratospheric Balloons Using Reinforcement Learning},
+ author = {Bellemare, Marc G. and Candido, Salvatore and Castro, Pablo Samuel and Gong, Jun and Machado, Marlos C. and Moitra, Subhodeep and Ponda, Sameera S. and Wang, Ziyu},
+ year = {2020},
+ month = dec,
+ journal = {Nature},
+ volume = {588},
+ number = {7836},
+ pages = {77--82},
+ publisher = {Nature Publishing Group},
+ issn = {1476-4687},
+ doi = {10.1038/s41586-020-2939-8},
+ urldate = {2025-08-31},
+ abstract = {Efficiently navigating a superpressure balloon in the stratosphere1 requires the integration of a multitude of cues, such as wind speed and solar elevation, and the process is complicated by forecast errors and sparse wind measurements. Coupled with the need to make decisions in real time, these factors rule out the use of conventional control techniques2,3. Here we describe the use of reinforcement learning4,5 to create a high-performing flight controller. Our algorithm uses data augmentation6,7 and a self-correcting design to overcome the key technical challenge of reinforcement learning from imperfect data, which has proved to be a major obstacle to its application to physical systems8. We deployed our controller to station Loon superpressure balloons at multiple locations across the globe, including a 39-day controlled experiment over the Pacific Ocean. Analyses show that the controller outperforms Loon's previous algorithm and is robust to the natural diversity in stratospheric winds. These results demonstrate that reinforcement learning is an effective solution to real-world autonomous control problems in which neither conventional methods nor human intervention suffice, offering clues about what may be needed to create artificially intelligent agents that continuously interact with real, dynamic environments.},
+ copyright = {2020 The Author(s), under exclusive licence to Springer Nature Limited},
+ langid = {english},
+ keywords = {Aerospace engineering,Computer science}
+}
+
+@article{bellmanMarkovianDecisionProcess1957,
+ title = {A {Markovian Decision Process}},
+ author = {Bellman, Richard},
+ year = {1957},
+ journal = {Journal of Mathematics and Mechanics},
+ volume = {6},
+ number = {5},
+ eprint = {24900506},
+ eprinttype = {jstor},
+ pages = {679--684},
+ publisher = {Indiana University Mathematics Department},
+ issn = {0095-9057},
+ urldate = {2025-08-30}
+}
+
+@misc{beyerPaliGemmaVersatile3B2024,
+ title = {PaliGemma: {A} Versatile 3B VLM for Transfer},
+ shorttitle = {PaliGemma},
+ author = {Beyer, Lucas and Steiner, Andreas and Pinto, Andr{\'e} Susano and Kolesnikov, Alexander and Wang, Xiao and Salz, Daniel and Neumann, Maxim and Alabdulmohsin, Ibrahim and Tschannen, Michael and Bugliarello, Emanuele and Unterthiner, Thomas and Keysers, Daniel and Koppula, Skanda and Liu, Fangyu and Grycner, Adam and Gritsenko, Alexey and Houlsby, Neil and Kumar, Manoj and Rong, Keran and Eisenschlos, Julian and Kabra, Rishabh and Bauer, Matthias and Bo{\v s}njak, Matko and Chen, Xi and Minderer, Matthias and Voigtlaender, Paul and Bica, Ioana and Balazevic, Ivana and Puigcerver, Joan and Papalampidi, Pinelopi and Henaff, Olivier and Xiong, Xi and Soricut, Radu and Harmsen, Jeremiah and Zhai, Xiaohua},
+ year = {2024},
+ month = oct,
+ number = {arXiv:2407.07726},
+ eprint = {2407.07726},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2407.07726},
+ urldate = {2025-09-08},
+ abstract = {PaliGemma is an open Vision-Language Model (VLM) that is based on the SigLIP-So400m vision encoder and the Gemma-2B language model. It is trained to be a versatile and broadly knowledgeable base model that is effective to transfer. It achieves strong performance on a wide variety of open-world tasks. We evaluate PaliGemma on almost 40 diverse tasks including standard VLM benchmarks, but also more specialized tasks such as remote-sensing and segmentation.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/IPDYNWC4/Beyer et al. - 2024 - PaliGemma A versatile 3B VLM for transfer.pdf;/Users/fracapuano/Zotero/storage/R7UVD9WC/2407.html}
+}
+
+@misc{bjorckGR00TN1Open2025,
+ title = {GR00T N1: {An Open Foundation Model} for {Generalist Humanoid Robots}},
+ shorttitle = {GR00T N1},
+ author = {Bjorck, Johan and Casta{\~n}eda, Fernando and Cherniadev, Nikita and Da, Xingye and Ding, Runyu and Fan, Linxi "Jim" and Fang, Yu and Fox, Dieter and Hu, Fengyuan and Huang, Spencer and Jang, Joel and Jiang, Zhenyu and Kautz, Jan and Kundalia, Kaushil and Lao, Lawrence and Li, Zhiqi and Lin, Zongyu and Lin, Kevin and Liu, Guilin and Llontop, Edith and Magne, Loic and Mandlekar, Ajay and Narayan, Avnish and Nasiriany, Soroush and Reed, Scott and Tan, You Liang and Wang, Guanzhi and Wang, Zu and Wang, Jing and Wang, Qi and Xiang, Jiannan and Xie, Yuqi and Xu, Yinzhen and Xu, Zhenjia and Ye, Seonghyeon and Yu, Zhiding and Zhang, Ao and Zhang, Hao and Zhao, Yizhou and Zheng, Ruijie and Zhu, Yuke},
+ year = {2025},
+ month = mar,
+ number = {arXiv:2503.14734},
+ eprint = {2503.14734},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2503.14734},
+ urldate = {2025-08-26},
+ abstract = {General-purpose robots need a versatile body and an intelligent mind. Recent advancements in humanoid robots have shown great promise as a hardware platform for building generalist autonomy in the human world. A robot foundation model, trained on massive and diverse data sources, is essential for enabling the robots to reason about novel situations, robustly handle real-world variability, and rapidly learn new tasks. To this end, we introduce GR00T N1, an open foundation model for humanoid robots. GR00T N1 is a Vision-Language-Action (VLA) model with a dual-system architecture. The vision-language module (System 2) interprets the environment through vision and language instructions. The subsequent diffusion transformer module (System 1) generates fluid motor actions in real time. Both modules are tightly coupled and jointly trained end-to-end. We train GR00T N1 with a heterogeneous mixture of real-robot trajectories, human videos, and synthetically generated datasets. We show that our generalist robot model GR00T N1 outperforms the state-of-the-art imitation learning baselines on standard simulation benchmarks across multiple robot embodiments. Furthermore, we deploy our model on the Fourier GR-1 humanoid robot for language-conditioned bimanual manipulation tasks, achieving strong performance with high data efficiency.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/BDNSKFA6/NVIDIA et al. - 2025 - GR00T N1 An Open Foundation Model for Generalist Humanoid Robots.pdf;/Users/fracapuano/Zotero/storage/FENU9PQR/2503.html}
+}
+
+@misc{black$p_0$VisionLanguageActionFlow2024,
+ title = {$\pi_0$: A Vision-Language-Action Flow Model for {General Robot Control}},
+ shorttitle = {$\pi_0$},
+ author = {Black, Kevin and Brown, Noah and Driess, Danny and Esmail, Adnan and Equi, Michael and Finn, Chelsea and Fusai, Niccolo and Groom, Lachy and Hausman, Karol and Ichter, Brian and Jakubczak, Szymon and Jones, Tim and Ke, Liyiming and Levine, Sergey and {Li-Bell}, Adrian and Mothukuri, Mohith and Nair, Suraj and Pertsch, Karl and Shi, Lucy Xiaoyang and Tanner, James and Vuong, Quan and Walling, Anna and Wang, Haohuan and Zhilinsky, Ury},
+ year = {2024},
+ month = oct,
+ number = {arXiv:2410.24164},
+ eprint = {2410.24164},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2410.24164},
+ urldate = {2025-08-28},
+ abstract = {Robot learning holds tremendous promise to unlock the full potential of flexible, general, and dexterous robot systems, as well as to address some of the deepest questions in artificial intelligence. However, bringing robot learning to the level of generality required for effective real-world systems faces major obstacles in terms of data, generalization, and robustness. In this paper, we discuss how generalist robot policies (i.e., robot foundation models) can address these challenges, and how we can design effective generalist robot policies for complex and highly dexterous tasks. We propose a novel flow matching architecture built on top of a pre-trained vision-language model (VLM) to inherit Internet-scale semantic knowledge. We then discuss how this model can be trained on a large and diverse dataset from multiple dexterous robot platforms, including single-arm robots, dual-arm robots, and mobile manipulators. We evaluate our model in terms of its ability to perform tasks in zero shot after pre-training, follow language instructions from people and from a high-level VLM policy, and its ability to acquire new skills via fine-tuning. Our results cover a wide variety of tasks, such as laundry folding, table cleaning, and assembling boxes.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/GUEM37NZ/Black et al. - 2024 - $π_0$ A Vision-Language-Action Flow Model for General Robot Control.pdf;/Users/fracapuano/Zotero/storage/FHYXZWF8/2410.html}
+}
+
+@inproceedings{BLIP-2,
+ title = {BLIP-2: Bootstrapping Language-Image Pre-Training with Frozen Image Encoders and Large Language Models},
+ booktitle = {Proceedings of the 40th International Conference on Machine Learning},
+ author = {Li, Junnan and Li, Dongxu and Savarese, Silvio and Hoi, Steven},
+ year = {2023},
+ series = {ICML'23},
+ publisher = {JMLR.org},
+ address = {Honolulu, Hawaii, USA},
+ abstract = {The cost of vision-and-language pre-training has become increasingly prohibitive due to end-to-end training of large-scale models. This paper proposes BLIP-2, a generic and efficient pretraining strategy that bootstraps vision-language pre-training from off-the-shelf frozen pretrained image encoders and frozen large language models. BLIP-2 bridges the modality gap with a lightweight Querying Transformer, which is pretrained in two stages. The first stage bootstraps vision-language representation learning from a frozen image encoder. The second stage bootstraps vision-to-language generative learning from a frozen language model. BLIP-2 achieves state-of-the-art performance on various vision-language tasks, despite having significantly fewer trainable parameters than existing methods. For example, our model outperforms Flamingo80B by 8.7% on zero-shot VQAv2 with 54x fewer trainable parameters. We also demonstrate the model's capabilities of zero-shot image-to-text generation that can follow natural language instructions.},
+ articleno = {814}
+}
+
+@misc{brohanRT1RoboticsTransformer2023,
+ title = {RT-1: {Robotics Transformer} for Real-World Control at {Scale}},
+ shorttitle = {RT-1},
+ author = {Brohan, Anthony and Brown, Noah and Carbajal, Justice and Chebotar, Yevgen and Dabis, Joseph and Finn, Chelsea and Gopalakrishnan, Keerthana and Hausman, Karol and Herzog, Alex and Hsu, Jasmine and Ibarz, Julian and Ichter, Brian and Irpan, Alex and Jackson, Tomas and Jesmonth, Sally and Joshi, Nikhil J. and Julian, Ryan and Kalashnikov, Dmitry and Kuang, Yuheng and Leal, Isabel and Lee, Kuang-Huei and Levine, Sergey and Lu, Yao and Malla, Utsav and Manjunath, Deeksha and Mordatch, Igor and Nachum, Ofir and Parada, Carolina and Peralta, Jodilyn and Perez, Emily and Pertsch, Karl and Quiambao, Jornell and Rao, Kanishka and Ryoo, Michael and Salazar, Grecia and Sanketi, Pannag and Sayed, Kevin and Singh, Jaspiar and Sontakke, Sumedh and Stone, Austin and Tan, Clayton and Tran, Huong and Vanhoucke, Vincent and Vega, Steve and Vuong, Quan and Xia, Fei and Xiao, Ted and Xu, Peng and Xu, Sichun and Yu, Tianhe and Zitkovich, Brianna},
+ year = {2023},
+ month = aug,
+ number = {arXiv:2212.06817},
+ eprint = {2212.06817},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2212.06817},
+ urldate = {2025-09-07},
+ abstract = {By transferring knowledge from large, diverse, task-agnostic datasets, modern machine learning models can solve specific downstream tasks either zero-shot or with small task-specific datasets to a high level of performance. While this capability has been demonstrated in other fields such as computer vision, natural language processing or speech recognition, it remains to be shown in robotics, where the generalization capabilities of the models are particularly critical due to the difficulty of collecting real-world robotic data. We argue that one of the keys to the success of such general robotic models lies with open-ended task-agnostic training, combined with high-capacity architectures that can absorb all of the diverse, robotic data. In this paper, we present a model class, dubbed Robotics Transformer, that exhibits promising scalable model properties. We verify our conclusions in a study of different model classes and their ability to generalize as a function of the data size, model size, and data diversity based on a large-scale data collection on real robots performing real-world tasks. The project's website and videos can be found at robotics-transformer1.github.io},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/TTBN3M5Y/Brohan et al. - 2023 - RT-1 Robotics Transformer for Real-World Control at Scale.pdf;/Users/fracapuano/Zotero/storage/DK3D593W/2212.html}
+}
+
+@misc{brohanRT2VisionLanguageActionModels2023,
+ title = {RT-2: Vision-Language-Action Models Transfer Web Knowledge to {Robotic Control}},
+ shorttitle = {RT-2},
+ author = {Brohan, Anthony and Brown, Noah and Carbajal, Justice and Chebotar, Yevgen and Chen, Xi and Choromanski, Krzysztof and Ding, Tianli and Driess, Danny and Dubey, Avinava and Finn, Chelsea and Florence, Pete and Fu, Chuyuan and Arenas, Montse Gonzalez and Gopalakrishnan, Keerthana and Han, Kehang and Hausman, Karol and Herzog, Alexander and Hsu, Jasmine and Ichter, Brian and Irpan, Alex and Joshi, Nikhil and Julian, Ryan and Kalashnikov, Dmitry and Kuang, Yuheng and Leal, Isabel and Lee, Lisa and Lee, Tsang-Wei Edward and Levine, Sergey and Lu, Yao and Michalewski, Henryk and Mordatch, Igor and Pertsch, Karl and Rao, Kanishka and Reymann, Krista and Ryoo, Michael and Salazar, Grecia and Sanketi, Pannag and Sermanet, Pierre and Singh, Jaspiar and Singh, Anikait and Soricut, Radu and Tran, Huong and Vanhoucke, Vincent and Vuong, Quan and Wahid, Ayzaan and Welker, Stefan and Wohlhart, Paul and Wu, Jialin and Xia, Fei and Xiao, Ted and Xu, Peng and Xu, Sichun and Yu, Tianhe and Zitkovich, Brianna},
+ year = {2023},
+ month = jul,
+ number = {arXiv:2307.15818},
+ eprint = {2307.15818},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2307.15818},
+ urldate = {2025-09-07},
+ abstract = {We study how vision-language models trained on Internet-scale data can be incorporated directly into end-to-end robotic control to boost generalization and enable emergent semantic reasoning. Our goal is to enable a single end-to-end trained model to both learn to map robot observations to actions and enjoy the benefits of large-scale pretraining on language and vision-language data from the web. To this end, we propose to co-fine-tune state-of-the-art vision-language models on both robotic trajectory data and Internet-scale vision-language tasks, such as visual question answering. In contrast to other approaches, we propose a simple, general recipe to achieve this goal: in order to fit both natural language responses and robotic actions into the same format, we express the actions as text tokens and incorporate them directly into the training set of the model in the same way as natural language tokens. We refer to such category of models as vision-language-action models (VLA) and instantiate an example of such a model, which we call RT-2. Our extensive evaluation (6k evaluation trials) shows that our approach leads to performant robotic policies and enables RT-2 to obtain a range of emergent capabilities from Internet-scale training. This includes significantly improved generalization to novel objects, the ability to interpret commands not present in the robot training data (such as placing an object onto a particular number or icon), and the ability to perform rudimentary reasoning in response to user commands (such as picking up the smallest or largest object, or the one closest to another object). We further show that incorporating chain of thought reasoning allows RT-2 to perform multi-stage semantic reasoning, for example figuring out which object to pick up for use as an improvised hammer (a rock), or which type of drink is best suited for someone who is tired (an energy drink).},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/CZHMNYPG/Brohan et al. - 2023 - RT-2 Vision-Language-Action Models Transfer Web Knowledge to Robotic Control.pdf;/Users/fracapuano/Zotero/storage/WN2E7AZH/2307.html}
+}
+
+@misc{brownLanguageModelsAre2020,
+ title = {Language {Models} Are Few-Shot Learners},
+ author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and {Herbert-Voss}, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
+ year = {2020},
+ month = jul,
+ number = {arXiv:2005.14165},
+ eprint = {2005.14165},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2005.14165},
+ urldate = {2025-08-28},
+ abstract = {Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computation and Language},
+ file = {/Users/fracapuano/Zotero/storage/L6J45ZW7/Brown et al. - 2020 - Language Models are Few-Shot Learners.pdf;/Users/fracapuano/Zotero/storage/52DC5AT2/2005.html}
+}
+
+@article{burridgeSequentialCompositionDynamically1999b,
+ title = {Sequential {Composition} of {Dynamically Dexterous Robot Behaviors}},
+ author = {Burridge, R. R. and Rizzi, A. A. and Koditschek, D. E.},
+ year = {1999},
+ month = jun,
+ journal = {The International Journal of Robotics Research},
+ volume = {18},
+ number = {6},
+ pages = {534--555},
+ issn = {0278-3649, 1741-3176},
+ doi = {10.1177/02783649922066385},
+ urldate = {2025-08-26},
+ abstract = {We report on our efforts to develop a sequential robot controllercomposition technique in the context of dexterous ``batting'' maneuvers. A robot with a flat paddle is required to strike repeatedly at a thrown ball until the ball is brought to rest on the paddle at a specified location. The robot's reachable workspace is blocked by an obstacle that disconnects the free space formed when the ball and paddle remain in contact, forcing the machine to ``let go'' for a time to bring the ball to the desired state. The controller compositions we create guarantee that a ball introduced in the ``safe workspace'' remains there and is ultimately brought to the goal. We report on experimental results from an implementation of these formal composition methods, and present descriptive statistics characterizing the experiments.},
+ copyright = {https://journals.sagepub.com/page/policies/text-and-data-mining-license},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/TFZQ6EHJ/Burridge et al. - 1999 - Sequential Composition of Dynamically Dexterous Robot Behaviors.pdf}
+}
+
+@misc{cadene2024lerobot,
+ title = {LeRobot: State-of-the-art Machine Learning for Real-World Robotics in Pytorch},
+ author = {Cadene, Remi and Alibert, Simon and Soare, Alexander and Gallouedec, Quentin and Zouitine, Adil and Palma, Steven and Kooijmans, Pepijn and Aractingi, Michel and Shukor, Mustafa and Aubakirova, Dana and Russi, Martino and Capuano, Francesco and Pascal, Caroline and Choghari, Jade and Moss, Jess and Wolf, Thomas},
+ year = {2024}
+}
+
+@misc{cadeneLeRobotStateoftheartMachine,
+ title = {LeRobot: State-of-the-art Machine Learning for Real-World Robotics in {Pytorch}},
+ author = {Cadene, Remi},
+ year = {2024}
+}
+
+@misc{cadeneLeRobotStateoftheartMachine2024,
+ title = {LeRobot: State-of-the-art Machine Learning for Real-World Robotics in {Pytorch}},
+ author = {Cadene, Remi and Alibert, Simon and Soare, Alexander and Gallouedec, Quentin and Zouitine, Adil and Palma, Steven and Kooijmans, Pepijn and Aractingi, Michel and Shukor, Mustafa and Aubakirova, Dana and Russi, Martino and Capuano, Francesco and Pascal, Caroline and Choghari, Jade and Moss, Jess and Wolf, Thomas},
+ year = {2024}
+}
+
+@misc{caronEmergingPropertiesSelfSupervised2021,
+ title = {Emerging {Properties} in Self-Supervised Vision Transformers},
+ author = {Caron, Mathilde and Touvron, Hugo and Misra, Ishan and J{\'e}gou, Herv{\'e} and Mairal, Julien and Bojanowski, Piotr and Joulin, Armand},
+ year = {2021},
+ month = may,
+ number = {arXiv:2104.14294},
+ eprint = {2104.14294},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2104.14294},
+ urldate = {2025-09-07},
+ abstract = {In this paper, we question if self-supervised learning provides new properties to Vision Transformer (ViT) that stand out compared to convolutional networks (convnets). Beyond the fact that adapting self-supervised methods to this architecture works particularly well, we make the following observations: first, self-supervised ViT features contain explicit information about the semantic segmentation of an image, which does not emerge as clearly with supervised ViTs, nor with convnets. Second, these features are also excellent k-NN classifiers, reaching 78.3% top-1 on ImageNet with a small ViT. Our study also underlines the importance of momentum encoder, multi-crop training, and the use of small patches with ViTs. We implement our findings into a simple self-supervised method, called DINO, which we interpret as a form of self-distillation with no labels. We show the synergy between DINO and ViTs by achieving 80.1% top-1 on ImageNet in linear evaluation with ViT-Base.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computer Vision and Pattern Recognition},
+ file = {/Users/fracapuano/Zotero/storage/AYIY6DTF/Caron et al. - 2021 - Emerging Properties in Self-Supervised Vision Transformers.pdf;/Users/fracapuano/Zotero/storage/EKA7ZN2P/2104.html}
+}
+
+@inproceedings{chebotar2019closing,
+ title = {Closing the Sim-to-Real Loop: {Adapting} Simulation Randomization with Real World Experience},
+ booktitle = {2019 International Conference on Robotics and Automation ({ICRA})},
+ author = {Chebotar, Yevgen and Handa, Ankur and Makoviychuk, Viktor and Macklin, Miles and Issac, Jan and Ratliff, Nathan and Fox, Dieter},
+ year = {2019},
+ pages = {8973--8979},
+ publisher = {IEEE}
+}
+
+@inproceedings{chebotarClosingSimtorealLoop2019,
+ title = {Closing the Sim-to-Real Loop: {Adapting} Simulation Randomization with Real World Experience},
+ shorttitle = {Closing the Sim-to-Real Loop},
+ booktitle = {2019 {International Conference} on {Robotics} and {Automation} ({ICRA})},
+ author = {Chebotar, Yevgen and Handa, Ankur and Makoviychuk, Viktor and Macklin, Miles and Issac, Jan and Ratliff, Nathan and Fox, Dieter},
+ year = {2019},
+ pages = {8973--8979},
+ publisher = {IEEE},
+ urldate = {2025-08-31}
+}
+
+@misc{chenPaLIXScalingMultilingual2023,
+ title = {PaLI-X: {On Scaling} up a {Multilingual Vision} and {Language Model}},
+ shorttitle = {PaLI-X},
+ author = {Chen, Xi and Djolonga, Josip and Padlewski, Piotr and Mustafa, Basil and Changpinyo, Soravit and Wu, Jialin and Ruiz, Carlos Riquelme and Goodman, Sebastian and Wang, Xiao and Tay, Yi and Shakeri, Siamak and Dehghani, Mostafa and Salz, Daniel and Lucic, Mario and Tschannen, Michael and Nagrani, Arsha and Hu, Hexiang and Joshi, Mandar and Pang, Bo and Montgomery, Ceslee and Pietrzyk, Paulina and Ritter, Marvin and Piergiovanni, A. J. and Minderer, Matthias and Pavetic, Filip and Waters, Austin and Li, Gang and Alabdulmohsin, Ibrahim and Beyer, Lucas and Amelot, Julien and Lee, Kenton and Steiner, Andreas Peter and Li, Yang and Keysers, Daniel and Arnab, Anurag and Xu, Yuanzhong and Rong, Keran and Kolesnikov, Alexander and Seyedhosseini, Mojtaba and Angelova, Anelia and Zhai, Xiaohua and Houlsby, Neil and Soricut, Radu},
+ year = {2023},
+ month = may,
+ number = {arXiv:2305.18565},
+ eprint = {2305.18565},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2305.18565},
+ urldate = {2025-09-07},
+ abstract = {We present the training recipe and results of scaling up PaLI-X, a multilingual vision and language model, both in terms of size of the components and the breadth of its training task mixture. Our model achieves new levels of performance on a wide-range of varied and complex tasks, including multiple image-based captioning and question-answering tasks, image-based document understanding and few-shot (in-context) learning, as well as object detection, video question answering, and video captioning. PaLI-X advances the state-of-the-art on most vision-and-language benchmarks considered (25+ of them). Finally, we observe emerging capabilities, such as complex counting and multilingual object detection, tasks that are not explicitly in the training mix.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/UES2DMFM/Chen et al. - 2023 - PaLI-X On Scaling up a Multilingual Vision and Language Model.pdf;/Users/fracapuano/Zotero/storage/LEGNNSHS/2305.html}
+}
+
+@misc{chiDiffusionPolicyVisuomotor2024,
+ title = {Diffusion {Policy}: {Visuomotor Policy Learning} via {Action Diffusion}},
+ shorttitle = {Diffusion {Policy}},
+ author = {Chi, Cheng and Xu, Zhenjia and Feng, Siyuan and Cousineau, Eric and Du, Yilun and Burchfiel, Benjamin and Tedrake, Russ and Song, Shuran},
+ year = {2024},
+ month = mar,
+ number = {arXiv:2303.04137},
+ eprint = {2303.04137},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2303.04137},
+ urldate = {2025-08-28},
+ abstract = {This paper introduces Diffusion Policy, a new way of generating robot behavior by representing a robot's visuomotor policy as a conditional denoising diffusion process. We benchmark Diffusion Policy across 12 different tasks from 4 different robot manipulation benchmarks and find that it consistently outperforms existing state-of-the-art robot learning methods with an average improvement of 46.9%. Diffusion Policy learns the gradient of the action-distribution score function and iteratively optimizes with respect to this gradient field during inference via a series of stochastic Langevin dynamics steps. We find that the diffusion formulation yields powerful advantages when used for robot policies, including gracefully handling multimodal action distributions, being suitable for high-dimensional action spaces, and exhibiting impressive training stability. To fully unlock the potential of diffusion models for visuomotor policy learning on physical robots, this paper presents a set of key technical contributions including the incorporation of receding horizon control, visual conditioning, and the time-series diffusion transformer. We hope this work will help motivate a new generation of policy learning techniques that are able to leverage the powerful generative modeling capabilities of diffusion models. Code, data, and training details is publicly available diffusion-policy.cs.columbia.edu},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/7XRY3GJX/Chi et al. - 2024 - Diffusion Policy Visuomotor Policy Learning via Action Diffusion.pdf;/Users/fracapuano/Zotero/storage/BBBPKKMZ/2303.html}
+}
+
+@misc{collaborationOpenXEmbodimentRobotic2025,
+ title = {Open X-Embodiment: {Robotic Learning Datasets} and RT-X Models},
+ shorttitle = {Open X-Embodiment},
+ author = {Collaboration, Open X.-Embodiment and O'Neill, Abby and Rehman, Abdul and Gupta, Abhinav and Maddukuri, Abhiram and Gupta, Abhishek and Padalkar, Abhishek and Lee, Abraham and Pooley, Acorn and Gupta, Agrim and Mandlekar, Ajay and Jain, Ajinkya and Tung, Albert and Bewley, Alex and Herzog, Alex and Irpan, Alex and Khazatsky, Alexander and Rai, Anant and Gupta, Anchit and Wang, Andrew and Kolobov, Andrey and Singh, Anikait and Garg, Animesh and Kembhavi, Aniruddha and Xie, Annie and Brohan, Anthony and Raffin, Antonin and Sharma, Archit and Yavary, Arefeh and Jain, Arhan and Balakrishna, Ashwin and Wahid, Ayzaan and {Burgess-Limerick}, Ben and Kim, Beomjoon and Sch{\"o}lkopf, Bernhard and Wulfe, Blake and Ichter, Brian and Lu, Cewu and Xu, Charles and Le, Charlotte and Finn, Chelsea and Wang, Chen and Xu, Chenfeng and Chi, Cheng and Huang, Chenguang and Chan, Christine and Agia, Christopher and Pan, Chuer and Fu, Chuyuan and Devin, Coline and Xu, Danfei and Morton, Daniel and Driess, Danny and Chen, Daphne and Pathak, Deepak and Shah, Dhruv and B{\"u}chler, Dieter and Jayaraman, Dinesh and Kalashnikov, Dmitry and Sadigh, Dorsa and Johns, Edward and Foster, Ethan and Liu, Fangchen and Ceola, Federico and Xia, Fei and Zhao, Feiyu and Frujeri, Felipe Vieira and Stulp, Freek and Zhou, Gaoyue and Sukhatme, Gaurav S. and Salhotra, Gautam and Yan, Ge and Feng, Gilbert and Schiavi, Giulio and Berseth, Glen and Kahn, Gregory and Yang, Guangwen and Wang, Guanzhi and Su, Hao and Fang, Hao-Shu and Shi, Haochen and Bao, Henghui and Amor, Heni Ben and Christensen, Henrik I. 
and Furuta, Hiroki and Bharadhwaj, Homanga and Walke, Homer and Fang, Hongjie and Ha, Huy and Mordatch, Igor and Radosavovic, Ilija and Leal, Isabel and Liang, Jacky and {Abou-Chakra}, Jad and Kim, Jaehyung and Drake, Jaimyn and Peters, Jan and Schneider, Jan and Hsu, Jasmine and Vakil, Jay and Bohg, Jeannette and Bingham, Jeffrey and Wu, Jeffrey and Gao, Jensen and Hu, Jiaheng and Wu, Jiajun and Wu, Jialin and Sun, Jiankai and Luo, Jianlan and Gu, Jiayuan and Tan, Jie and Oh, Jihoon and Wu, Jimmy and Lu, Jingpei and Yang, Jingyun and Malik, Jitendra and Silv{\'e}rio, Jo{\~a}o and Hejna, Joey and Booher, Jonathan and Tompson, Jonathan and Yang, Jonathan and Salvador, Jordi and Lim, Joseph J. and Han, Junhyek and Wang, Kaiyuan and Rao, Kanishka and Pertsch, Karl and Hausman, Karol and Go, Keegan and Gopalakrishnan, Keerthana and Goldberg, Ken and Byrne, Kendra and Oslund, Kenneth and Kawaharazuka, Kento and Black, Kevin and Lin, Kevin and Zhang, Kevin and Ehsani, Kiana and Lekkala, Kiran and Ellis, Kirsty and Rana, Krishan and Srinivasan, Krishnan and Fang, Kuan and Singh, Kunal Pratap and Zeng, Kuo-Hao and Hatch, Kyle and Hsu, Kyle and Itti, Laurent and Chen, Lawrence Yunliang and Pinto, Lerrel and {Fei-Fei}, Li and Tan, Liam and Fan, Linxi "Jim" and Ott, Lionel and Lee, Lisa and Weihs, Luca and Chen, Magnum and Lepert, Marion and Memmel, Marius and Tomizuka, Masayoshi and Itkina, Masha and Castro, Mateo Guaman and Spero, Max and Du, Maximilian and Ahn, Michael and Yip, Michael C. and Zhang, Mingtong and Ding, Mingyu and Heo, Minho and Srirama, Mohan Kumar and Sharma, Mohit and Kim, Moo Jin and Irshad, Muhammad Zubair and Kanazawa, Naoaki and Hansen, Nicklas and Heess, Nicolas and Joshi, Nikhil J. and Suenderhauf, Niko and Liu, Ning and Palo, Norman Di and Shafiullah, Nur Muhammad Mahi and Mees, Oier and Kroemer, Oliver and Bastani, Osbert and Sanketi, Pannag R. 
and Miller, Patrick "Tree" and Yin, Patrick and Wohlhart, Paul and Xu, Peng and Fagan, Peter David and Mitrano, Peter and Sermanet, Pierre and Abbeel, Pieter and Sundaresan, Priya and Chen, Qiuyu and Vuong, Quan and Rafailov, Rafael and Tian, Ran and Doshi, Ria and {Mart{\'i}n-Mart{\'i}n}, Roberto and Baijal, Rohan and Scalise, Rosario and Hendrix, Rose and Lin, Roy and Qian, Runjia and Zhang, Ruohan and Mendonca, Russell and Shah, Rutav and Hoque, Ryan and Julian, Ryan and Bustamante, Samuel and Kirmani, Sean and Levine, Sergey and Lin, Shan and Moore, Sherry and Bahl, Shikhar and Dass, Shivin and Sonawani, Shubham and Tulsiani, Shubham and Song, Shuran and Xu, Sichun and Haldar, Siddhant and Karamcheti, Siddharth and Adebola, Simeon and Guist, Simon and Nasiriany, Soroush and Schaal, Stefan and Welker, Stefan and Tian, Stephen and Ramamoorthy, Subramanian and Dasari, Sudeep and Belkhale, Suneel and Park, Sungjae and Nair, Suraj and Mirchandani, Suvir and Osa, Takayuki and Gupta, Tanmay and Harada, Tatsuya and Matsushima, Tatsuya and Xiao, Ted and Kollar, Thomas and Yu, Tianhe and Ding, Tianli and Davchev, Todor and Zhao, Tony Z. and Armstrong, Travis and Darrell, Trevor and Chung, Trinity and Jain, Vidhi and Kumar, Vikash and Vanhoucke, Vincent and Guizilini, Vitor and Zhan, Wei and Zhou, Wenxuan and Burgard, Wolfram and Chen, Xi and Chen, Xiangyu and Wang, Xiaolong and Zhu, Xinghao and Geng, Xinyang and Liu, Xiyuan and Liangwei, Xu and Li, Xuanlin and Pang, Yansong and Lu, Yao and Ma, Yecheng Jason and Kim, Yejin and Chebotar, Yevgen and Zhou, Yifan and Zhu, Yifeng and Wu, Yilin and Xu, Ying and Wang, Yixuan and Bisk, Yonatan and Dou, Yongqiang and Cho, Yoonyoung and Lee, Youngwoon and Cui, Yuchen and Cao, Yue and Wu, Yueh-Hua and Tang, Yujin and Zhu, Yuke and Zhang, Yunchu and Jiang, Yunfan and Li, Yunshuang and Li, Yunzhu and Iwasawa, Yusuke and Matsuo, Yutaka and Ma, Zehan and Xu, Zhuo and Cui, Zichen Jeff and Zhang, Zichen and Fu, Zipeng and Lin, Zipeng},
+ year = {2025},
+ month = may,
+ number = {arXiv:2310.08864},
+ eprint = {2310.08864},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2310.08864},
+ urldate = {2025-09-08},
+ abstract = {Large, high-capacity models trained on diverse datasets have shown remarkable successes on efficiently tackling downstream applications. In domains from NLP to Computer Vision, this has led to a consolidation of pretrained models, with general pretrained backbones serving as a starting point for many applications. Can such a consolidation happen in robotics? Conventionally, robotic learning methods train a separate model for every application, every robot, and even every environment. Can we instead train generalist X-robot policy that can be adapted efficiently to new robots, tasks, and environments? In this paper, we provide datasets in standardized data formats and models to make it possible to explore this possibility in the context of robotic manipulation, alongside experimental results that provide an example of effective X-robot policies. We assemble a dataset from 22 different robots collected through a collaboration between 21 institutions, demonstrating 527 skills (160266 tasks). We show that a high-capacity model trained on this data, which we call RT-X, exhibits positive transfer and improves the capabilities of multiple robots by leveraging experience from other platforms. More details can be found on the project website https://robotics-transformer-x.github.io.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/2U73MMVN/Collaboration et al. - 2025 - Open X-Embodiment Robotic Learning Datasets and RT-X Models.pdf;/Users/fracapuano/Zotero/storage/PX7IHY32/2310.html}
+}
+
+@book{connellRobotLearning1993,
+ title = {Robot {Learning}},
+ editor = {Connell, Jonathan H. and Mahadevan, Sridhar},
+ year = {1993},
+ publisher = {Springer US},
+ address = {Boston, MA},
+ doi = {10.1007/978-1-4615-3184-5},
+ urldate = {2025-08-28},
+ copyright = {http://www.springer.com/tdm},
+ isbn = {978-1-4613-6396-5 978-1-4615-3184-5},
+ keywords = {algorithms,artificial intelligence,artificial life,autonom,autonomous robot,genetic algorithms,intelligence,learning,Navigation,programming,proving,robot,uncertainty}
+}
+
+@article{degraveMagneticControlTokamak2022,
+ title = {Magnetic Control of Tokamak Plasmas through Deep Reinforcement Learning},
+ author = {Degrave, Jonas and Felici, Federico and Buchli, Jonas and Neunert, Michael and Tracey, Brendan and Carpanese, Francesco and Ewalds, Timo and Hafner, Roland and Abdolmaleki, Abbas and {de las Casas}, Diego and Donner, Craig and Fritz, Leslie and Galperti, Cristian and Huber, Andrea and Keeling, James and Tsimpoukelli, Maria and Kay, Jackie and Merle, Antoine and Moret, Jean-Marc and Noury, Seb and Pesamosca, Federico and Pfau, David and Sauter, Olivier and Sommariva, Cristian and Coda, Stefano and Duval, Basil and Fasoli, Ambrogio and Kohli, Pushmeet and Kavukcuoglu, Koray and Hassabis, Demis and Riedmiller, Martin},
+ year = {2022},
+ month = feb,
+ journal = {Nature},
+ volume = {602},
+ number = {7897},
+ pages = {414--419},
+ publisher = {Nature Publishing Group},
+ issn = {1476-4687},
+ doi = {10.1038/s41586-021-04301-9},
+ urldate = {2025-08-31},
+ abstract = {Nuclear fusion using magnetic confinement, in particular in the tokamak configuration, is a promising path towards sustainable energy. A core challenge is to shape and maintain a high-temperature plasma within the tokamak vessel. This requires high-dimensional, high-frequency, closed-loop control using magnetic actuator coils, further complicated by the diverse requirements across a wide range of plasma configurations. In this work, we introduce a previously undescribed architecture for tokamak magnetic controller design that autonomously learns to command the full set of control coils. This architecture meets control objectives specified at a high level, at the same time satisfying physical and operational constraints. This approach has unprecedented flexibility and generality in problem specification and yields a notable reduction in design effort to produce new plasma configurations. We successfully produce and control a diverse set of plasma configurations on the Tokamak {\`a} Configuration Variable1,2, including elongated, conventional shapes, as well as advanced configurations, such as negative triangularity and `snowflake' configurations. Our approach achieves accurate tracking of the location, current and shape for these configurations. We also demonstrate sustained `droplets' on TCV, in which two separate plasmas are maintained simultaneously within the vessel. This represents a notable advance for tokamak feedback control, showing the potential of reinforcement learning to accelerate research in the fusion domain, and is one of the most challenging real-world systems to which reinforcement learning has been applied.},
+ copyright = {2022 The Author(s)},
+ langid = {english},
+ keywords = {Computer science,Magnetically confined plasmas,Nuclear fusion and fission},
+ file = {/Users/fracapuano/Zotero/storage/EZ4EAU84/Degrave et al. - 2022 - Magnetic control of tokamak plasmas through deep reinforcement learning.pdf}
+}
+
+@misc{devlinBERTPretrainingDeep2019,
+ title = {{BERT}: Pre-training of {Deep Bidirectional Transformers} for {Language Understanding}},
+ shorttitle = {{BERT}},
+ author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
+ year = {2019},
+ month = may,
+ number = {arXiv:1810.04805},
+ eprint = {1810.04805},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1810.04805},
+ urldate = {2025-09-08},
+ abstract = {We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computation and Language},
+ file = {/Users/fracapuano/Zotero/storage/AJ3SRLHF/Devlin et al. - 2019 - BERT Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf;/Users/fracapuano/Zotero/storage/LNIKJNIW/1810.html}
+}
+
+@misc{driessKnowledgeInsulatingVisionLanguageAction2025,
+ title = {Knowledge Insulating Vision-Language-Action Models: {Train Fast}, {Run Fast}, {Generalize Better}},
+ shorttitle = {Knowledge Insulating Vision-Language-Action Models},
+ author = {Driess, Danny and Springenberg, Jost Tobias and Ichter, Brian and Yu, Lili and {Li-Bell}, Adrian and Pertsch, Karl and Ren, Allen Z. and Walke, Homer and Vuong, Quan and Shi, Lucy Xiaoyang and Levine, Sergey},
+ year = {2025},
+ month = may,
+ number = {arXiv:2505.23705},
+ eprint = {2505.23705},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2505.23705},
+ urldate = {2025-09-09},
+ abstract = {Vision-language-action (VLA) models provide a powerful approach to training control policies for physical systems, such as robots, by combining end-to-end learning with transfer of semantic knowledge from web-scale vision-language model (VLM) training. However, the constraints of real-time control are often at odds with the design of VLMs: the most powerful VLMs have tens or hundreds of billions of parameters, presenting an obstacle to real-time inference, and operate on discrete tokens rather than the continuous-valued outputs that are required for controlling robots. To address this challenge, recent VLA models have used specialized modules for efficient continuous control, such as action experts or continuous output heads, which typically require adding new untrained parameters to the pretrained VLM backbone. While these modules improve real-time and control capabilities, it remains an open question whether they preserve or degrade the semantic knowledge contained in the pretrained VLM, and what effect they have on the VLA training dynamics. In this paper, we study this question in the context of VLAs that include a continuous diffusion or flow matching action expert, showing that naively including such experts significantly harms both training speed and knowledge transfer. We provide an extensive analysis of various design choices, their impact on performance and knowledge transfer, and propose a technique for insulating the VLM backbone during VLA training that mitigates this issue. Videos are available at https://pi.website/research/knowledge_insulation.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/QHTS9JIC/Driess et al. - 2025 - Knowledge Insulating Vision-Language-Action Models Train Fast, Run Fast, Generalize Better.pdf;/Users/fracapuano/Zotero/storage/3U9FCXRB/2505.html}
+}
+
+@misc{driessPaLMEEmbodiedMultimodal2023,
+ title = {{PaLM-E}: {An Embodied Multimodal Language Model}},
+ shorttitle = {{PaLM-E}},
+ author = {Driess, Danny and Xia, Fei and Sajjadi, Mehdi S. M. and Lynch, Corey and Chowdhery, Aakanksha and Ichter, Brian and Wahid, Ayzaan and Tompson, Jonathan and Vuong, Quan and Yu, Tianhe and Huang, Wenlong and Chebotar, Yevgen and Sermanet, Pierre and Duckworth, Daniel and Levine, Sergey and Vanhoucke, Vincent and Hausman, Karol and Toussaint, Marc and Greff, Klaus and Zeng, Andy and Mordatch, Igor and Florence, Pete},
+ year = {2023},
+ month = mar,
+ number = {arXiv:2303.03378},
+ eprint = {2303.03378},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2303.03378},
+ urldate = {2025-09-07},
+ abstract = {Large language models excel at a wide range of complex tasks. However, enabling general inference in the real world, e.g., for robotics problems, raises the challenge of grounding. We propose embodied language models to directly incorporate real-world continuous sensor modalities into language models and thereby establish the link between words and percepts. Input to our embodied language model are multi-modal sentences that interleave visual, continuous state estimation, and textual input encodings. We train these encodings end-to-end, in conjunction with a pre-trained large language model, for multiple embodied tasks including sequential robotic manipulation planning, visual question answering, and captioning. Our evaluations show that PaLM-E, a single large embodied multimodal model, can address a variety of embodied reasoning tasks, from a variety of observation modalities, on multiple embodiments, and further, exhibits positive transfer: the model benefits from diverse joint training across internet-scale language, vision, and visual-language domains. Our largest model, PaLM-E-562B with 562B parameters, in addition to being trained on robotics tasks, is a visual-language generalist with state-of-the-art performance on OK-VQA, and retains generalist language capabilities with increasing scale.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/PQSPI784/Driess et al. - 2023 - PaLM-E An Embodied Multimodal Language Model.pdf;/Users/fracapuano/Zotero/storage/K3PJVSGB/2303.html}
+}
+
+@misc{esserScalingRectifiedFlow2024,
+ title = {Scaling {Rectified Flow Transformers} for High-Resolution Image Synthesis},
+ author = {Esser, Patrick and Kulal, Sumith and Blattmann, Andreas and Entezari, Rahim and M{\"u}ller, Jonas and Saini, Harry and Levi, Yam and Lorenz, Dominik and Sauer, Axel and Boesel, Frederic and Podell, Dustin and Dockhorn, Tim and English, Zion and Lacey, Kyle and Goodwin, Alex and Marek, Yannik and Rombach, Robin},
+ year = {2024},
+ month = mar,
+ number = {arXiv:2403.03206},
+ eprint = {2403.03206},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2403.03206},
+ urldate = {2025-09-07},
+ abstract = {Diffusion models create data from noise by inverting the forward paths of data towards noise and have emerged as a powerful generative modeling technique for high-dimensional, perceptual data such as images and videos. Rectified flow is a recent generative model formulation that connects data and noise in a straight line. Despite its better theoretical properties and conceptual simplicity, it is not yet decisively established as standard practice. In this work, we improve existing noise sampling techniques for training rectified flow models by biasing them towards perceptually relevant scales. Through a large-scale study, we demonstrate the superior performance of this approach compared to established diffusion formulations for high-resolution text-to-image synthesis. Additionally, we present a novel transformer-based architecture for text-to-image generation that uses separate weights for the two modalities and enables a bidirectional flow of information between image and text tokens, improving text comprehension, typography, and human preference ratings. We demonstrate that this architecture follows predictable scaling trends and correlates lower validation loss to improved text-to-image synthesis as measured by various metrics and human evaluations. Our largest models outperform state-of-the-art models, and we will make our experimental data, code, and model weights publicly available.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computer Vision and Pattern Recognition},
+ file = {/Users/fracapuano/Zotero/storage/23TGK9JM/Esser et al. - 2024 - Scaling Rectified Flow Transformers for High-Resolution Image Synthesis.pdf;/Users/fracapuano/Zotero/storage/W2CRYPZY/2403.html}
+}
+
+@misc{fedusReviewSparseExpert2022,
+ title = {A {Review} of {Sparse Expert Models} in {Deep Learning}},
+ author = {Fedus, William and Dean, Jeff and Zoph, Barret},
+ year = {2022},
+ month = sep,
+ number = {arXiv:2209.01667},
+ eprint = {2209.01667},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2209.01667},
+ urldate = {2025-09-08},
+ abstract = {Sparse expert models are a thirty-year old concept re-emerging as a popular architecture in deep learning. This class of architecture encompasses Mixture-of-Experts, Switch Transformers, Routing Networks, BASE layers, and others, all with the unifying idea that each example is acted on by a subset of the parameters. By doing so, the degree of sparsity decouples the parameter count from the compute per example allowing for extremely large, but efficient models. The resulting models have demonstrated significant improvements across diverse domains such as natural language processing, computer vision, and speech recognition. We review the concept of sparse expert models, provide a basic description of the common algorithms, contextualize the advances in the deep learning era, and conclude by highlighting areas for future work.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/MZXG2WMJ/Fedus et al. - 2022 - A Review of Sparse Expert Models in Deep Learning.pdf;/Users/fracapuano/Zotero/storage/GLZINJYC/2209.html}
+}
+
+@misc{finiMultimodalAutoregressivePretraining2024,
+ title = {Multimodal Autoregressive Pre-training of {Large Vision Encoders}},
+ author = {Fini, Enrico and Shukor, Mustafa and Li, Xiujun and Dufter, Philipp and Klein, Michal and Haldimann, David and Aitharaju, Sai and da Costa, Victor Guilherme Turrisi and B{\'e}thune, Louis and Gan, Zhe and Toshev, Alexander T. and Eichner, Marcin and Nabi, Moin and Yang, Yinfei and Susskind, Joshua M. and {El-Nouby}, Alaaeldin},
+ year = {2024},
+ month = nov,
+ number = {arXiv:2411.14402},
+ eprint = {2411.14402},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2411.14402},
+ urldate = {2025-09-09},
+ abstract = {We introduce a novel method for pre-training of large-scale vision encoders. Building on recent advancements in autoregressive pre-training of vision models, we extend this framework to a multimodal setting, i.e., images and text. In this paper, we present AIMV2, a family of generalist vision encoders characterized by a straightforward pre-training process, scalability, and remarkable performance across a range of downstream tasks. This is achieved by pairing the vision encoder with a multimodal decoder that autoregressively generates raw image patches and text tokens. Our encoders excel not only in multimodal evaluations but also in vision benchmarks such as localization, grounding, and classification. Notably, our AIMV2-3B encoder achieves 89.5% accuracy on ImageNet-1k with a frozen trunk. Furthermore, AIMV2 consistently outperforms state-of-the-art contrastive models (e.g., CLIP, SigLIP) in multimodal image understanding across diverse settings.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/ULTX55I6/Fini et al. - 2024 - Multimodal Autoregressive Pre-training of Large Vision Encoders.pdf;/Users/fracapuano/Zotero/storage/SUG2W6A9/2411.html}
+}
+
+@inproceedings{florenceImplicitBehavioralCloning2022,
+ title = {Implicit {Behavioral Cloning}},
+ booktitle = {Proceedings of the 5th {Conference} on {Robot Learning}},
+ author = {Florence, Pete and Lynch, Corey and Zeng, Andy and Ramirez, Oscar A. and Wahid, Ayzaan and Downs, Laura and Wong, Adrian and Lee, Johnny and Mordatch, Igor and Tompson, Jonathan},
+ year = {2022},
+ month = jan,
+ pages = {158--168},
+ publisher = {PMLR},
+ issn = {2640-3498},
+ urldate = {2025-09-01},
+ abstract = {We find that across a wide range of robot policy learning scenarios, treating supervised policy learning with an implicit model generally performs better, on average, than commonly used explicit models. We present extensive experiments on this finding, and we provide both intuitive insight and theoretical arguments distinguishing the properties of implicit models compared to their explicit counterparts, particularly with respect to approximating complex, potentially discontinuous and multi-valued (set-valued) functions. On robotic policy learning tasks we show that implicit behavior-cloning policies with energy-based models (EBM) often outperform common explicit (Mean Square Error, or Mixture Density) behavior-cloning policies, including on tasks with high-dimensional action spaces and visual image inputs. We find these policies provide competitive results or outperform state-of-the-art offline reinforcement learning methods on the challenging human-expert tasks from the D4RL benchmark suite, despite using no reward information. In the real world, robots with implicit policies can learn complex and remarkably subtle behaviors on contact-rich tasks from human demonstrations, including tasks with high combinatorial complexity and tasks requiring 1mm precision.},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/Q8I5E862/Florence et al. - 2022 - Implicit Behavioral Cloning.pdf}
+}
+
+@misc{FROMAGe,
+ title = {Grounding Language Models to Images for Multimodal Inputs and Outputs},
+ author = {Koh, Jing Yu and Salakhutdinov, Ruslan and Fried, Daniel},
+ year = {2023}
+}
+
+@article{fujitaDevelopmentRobotsNuclear2020,
+ title = {Development of {Robots} for {Nuclear Power Plants} and {Their Application} to {New Fields}},
+ author = {Fujita, Jun and Soda, Daisuke and Murata, Chotaro and Tsuhari, Hiroyuki},
+ year = {2020},
+ volume = {57},
+ number = {4},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/K349QTEG/Fujita et al. - 2020 - Development of Robots for Nuclear Power Plants and Their Application to New Fields.pdf}
+}
+
+@misc{grattafioriLlama3Herd2024,
+ title = {The {Llama} 3 {Herd} of {Models}},
+ author = {Grattafiori, Aaron and Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and {Al-Dahle}, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Vaughan, Alex and Yang, Amy and Fan, Angela and Goyal, Anirudh and Hartshorn, Anthony and Yang, Aobo and Mitra, Archi and Sravankumar, Archie and Korenev, Artem and Hinsvark, Arthur and Rao, Arun and Zhang, Aston and Rodriguez, Aurelien and Gregerson, Austen and Spataru, Ava and Roziere, Baptiste and Biron, Bethany and Tang, Binh and Chern, Bobbie and Caucheteux, Charlotte and Nayak, Chaya and Bi, Chloe and Marra, Chris and McConnell, Chris and Keller, Christian and Touret, Christophe and Wu, Chunyang and Wong, Corinne and Ferrer, Cristian Canton and Nikolaidis, Cyrus and Allonsius, Damien and Song, Daniel and Pintz, Danielle and Livshits, Danny and Wyatt, Danny and Esiobu, David and Choudhary, Dhruv and Mahajan, Dhruv and {Garcia-Olano}, Diego and Perino, Diego and Hupkes, Dieuwke and Lakomkin, Egor and AlBadawy, Ehab and Lobanova, Elina and Dinan, Emily and Smith, Eric Michael and Radenovic, Filip and Guzm{\'a}n, Francisco and Zhang, Frank and Synnaeve, Gabriel and Lee, Gabrielle and Anderson, Georgia Lewis and Thattai, Govind and Nail, Graeme and Mialon, Gregoire and Pang, Guan and Cucurell, Guillem and Nguyen, Hailey and Korevaar, Hannah and Xu, Hu and Touvron, Hugo and Zarov, Iliyan and Ibarra, Imanol Arrieta and Kloumann, Isabel and Misra, Ishan and Evtimov, Ivan and Zhang, Jack and Copet, Jade and Lee, Jaewon and Geffert, Jan and Vranes, Jana and Park, Jason and Mahadeokar, Jay and Shah, Jeet and van der Linde, Jelmer and Billock, Jennifer and Hong, Jenny and Lee, Jenya and Fu, Jeremy and Chi, Jianfeng and Huang, Jianyu and Liu, Jiawen and Wang, Jie and Yu, Jiecao and Bitton, Joanna and Spisak, Joe and Park, Jongsoo and Rocca, Joseph and Johnstun, Joshua and Saxe, Joshua and Jia, Junteng and Alwala, Kalyan Vasuden and Prasad, Karthik and Upasani, Kartikeya and 
+Plawiak, Kate and Li, Ke and Heafield, Kenneth and Stone, Kevin and {El-Arini}, Khalid and Iyer, Krithika and Malik, Kshitiz and Chiu, Kuenley and Bhalla, Kunal and Lakhotia, Kushal and {Rantala-Yeary}, Lauren and van der Maaten, Laurens and Chen, Lawrence and Tan, Liang and Jenkins, Liz and Martin, Louis and Madaan, Lovish and Malo, Lubo and Blecher, Lukas and Landzaat, Lukas and de Oliveira, Luke and Muzzi, Madeline and Pasupuleti, Mahesh and Singh, Mannat and Paluri, Manohar and Kardas, Marcin and Tsimpoukelli, Maria and Oldham, Mathew and Rita, Mathieu and Pavlova, Maya and Kambadur, Melanie and Lewis, Mike and Si, Min and Singh, Mitesh Kumar and Hassan, Mona and Goyal, Naman and Torabi, Narjes and Bashlykov, Nikolay and Bogoychev, Nikolay and Chatterji, Niladri and Zhang, Ning and Duchenne, Olivier and {\c C}elebi, Onur and Alrassy, Patrick and Zhang, Pengchuan and Li, Pengwei and Vasic, Petar and Weng, Peter and Bhargava, Prajjwal and Dubal, Pratik and Krishnan, Praveen and Koura, Punit Singh and Xu, Puxin and He, Qing and Dong, Qingxiao and Srinivasan, Ragavan and Ganapathy, Raj and Calderer, Ramon and Cabral, Ricardo Silveira and Stojnic, Robert and Raileanu, Roberta and Maheswari, Rohan and Girdhar, Rohit and Patel, Rohit and Sauvestre, Romain and Polidoro, Ronnie and Sumbaly, Roshan and Taylor, Ross and Silva, Ruan and Hou, Rui and Wang, Rui and Hosseini, Saghar and Chennabasappa, Sahana and Singh, Sanjay and Bell, Sean and Kim, Seohyun Sonia and Edunov, Sergey and Nie, Shaoliang and Narang, Sharan and Raparthy, Sharath and Shen, Sheng and Wan, Shengye and Bhosale, Shruti and Zhang, Shun and Vandenhende, Simon and Batra, Soumya and Whitman, Spencer and Sootla, Sten and Collot, Stephane and Gururangan, Suchin and Borodinsky, Sydney and Herman, Tamar and Fowler, Tara and Sheasha, Tarek and Georgiou, Thomas and Scialom, Thomas and Speckbacher, Tobias and Mihaylov, Todor and Xiao, Tong and Karn, Ujjwal and Goswami, Vedanuj and Gupta, Vibhor and Ramanathan, 
+Vignesh and Kerkez, Viktor and Gonguet, Vincent and Do, Virginie and Vogeti, Vish and Albiero, V{\'i}tor and Petrovic, Vladan and Chu, Weiwei and Xiong, Wenhan and Fu, Wenyin and Meers, Whitney and Martinet, Xavier and Wang, Xiaodong and Wang, Xiaofang and Tan, Xiaoqing Ellen and Xia, Xide and Xie, Xinfeng and Jia, Xuchao and Wang, Xuewei and Goldschlag, Yaelle and Gaur, Yashesh and Babaei, Yasmine and Wen, Yi and Song, Yiwen and Zhang, Yuchen and Li, Yue and Mao, Yuning and Coudert, Zacharie Delpierre and Yan, Zheng and Chen, Zhengxing and Papakipos, Zoe and Singh, Aaditya and Srivastava, Aayushi and Jain, Abha and Kelsey, Adam and Shajnfeld, Adam and Gangidi, Adithya and Victoria, Adolfo and Goldstand, Ahuva and Menon, Ajay and Sharma, Ajay and Boesenberg, Alex and Baevski, Alexei and Feinstein, Allie and Kallet, Amanda and Sangani, Amit and Teo, Amos and Yunus, Anam and Lupu, Andrei and Alvarado, Andres and Caples, Andrew and Gu, Andrew and Ho, Andrew and Poulton, Andrew and Ryan, Andrew and Ramchandani, Ankit and Dong, Annie and Franco, Annie and Goyal, Anuj and Saraf, Aparajita and Chowdhury, Arkabandhu and Gabriel, Ashley and Bharambe, Ashwin and Eisenman, Assaf and Yazdan, Azadeh and James, Beau and Maurer, Ben and Leonhardi, Benjamin and Huang, Bernie and Loyd, Beth and Paola, Beto De and Paranjape, Bhargavi and Liu, Bing and Wu, Bo and Ni, Boyu and Hancock, Braden and Wasti, Bram and Spence, Brandon and Stojkovic, Brani and Gamido, Brian and Montalvo, Britt and Parker, Carl and Burton, Carly and Mejia, Catalina and Liu, Ce and Wang, Changhan and Kim, Changkyu and Zhou, Chao and Hu, Chester and Chu, Ching-Hsiang and Cai, Chris and Tindal, Chris and Feichtenhofer, Christoph and Gao, Cynthia and Civin, Damon and Beaty, Dana and Kreymer, Daniel and Li, Daniel and Adkins, David and Xu, David and Testuggine, Davide and David, Delia and Parikh, Devi and Liskovich, Diana and Foss, Didem and Wang, Dingkang and Le, Duc and Holland, Dustin and Dowling, Edward and 
+Jamil, Eissa and Montgomery, Elaine and Presani, Eleonora and Hahn, Emily and Wood, Emily and Le, Eric-Tuan and Brinkman, Erik and Arcaute, Esteban and Dunbar, Evan and Smothers, Evan and Sun, Fei and Kreuk, Felix and Tian, Feng and Kokkinos, Filippos and Ozgenel, Firat and Caggioni, Francesco and Kanayet, Frank and Seide, Frank and Florez, Gabriela Medina and Schwarz, Gabriella and Badeer, Gada and Swee, Georgia and Halpern, Gil and Herman, Grant and Sizov, Grigory and Zhang, Guangyi and Lakshminarayanan, Guna and Inan, Hakan and Shojanazeri, Hamid and Zou, Han and Wang, Hannah and Zha, Hanwen and Habeeb, Haroun and Rudolph, Harrison and Suk, Helen and Aspegren, Henry and Goldman, Hunter and Zhan, Hongyuan and Damlaj, Ibrahim and Molybog, Igor and Tufanov, Igor and Leontiadis, Ilias and Veliche, Irina-Elena and Gat, Itai and Weissman, Jake and Geboski, James and Kohli, James and Lam, Janice and Asher, Japhet and Gaya, Jean-Baptiste and Marcus, Jeff and Tang, Jeff and Chan, Jennifer and Zhen, Jenny and Reizenstein, Jeremy and Teboul, Jeremy and Zhong, Jessica and Jin, Jian and Yang, Jingyi and Cummings, Joe and Carvill, Jon and Shepard, Jon and McPhie, Jonathan and Torres, Jonathan and Ginsburg, Josh and Wang, Junjie and Wu, Kai and U, Kam Hou and Saxena, Karan and Khandelwal, Kartikay and Zand, Katayoun and Matosich, Kathy and Veeraraghavan, Kaushik and Michelena, Kelly and Li, Keqian and Jagadeesh, Kiran and Huang, Kun and Chawla, Kunal and Huang, Kyle and Chen, Lailin and Garg, Lakshya and A, Lavender and Silva, Leandro and Bell, Lee and Zhang, Lei and Guo, Liangpeng and Yu, Licheng and Moshkovich, Liron and Wehrstedt, Luca and Khabsa, Madian and Avalani, Manav and Bhatt, Manish and Mankus, Martynas and Hasson, Matan and Lennie, Matthew and Reso, Matthias and Groshev, Maxim and Naumov, Maxim and Lathi, Maya and Keneally, Meghan and Liu, Miao and Seltzer, Michael L. 
+and Valko, Michal and Restrepo, Michelle and Patel, Mihir and Vyatskov, Mik and Samvelyan, Mikayel and Clark, Mike and Macey, Mike and Wang, Mike and Hermoso, Miquel Jubert and Metanat, Mo and Rastegari, Mohammad and Bansal, Munish and Santhanam, Nandhini and Parks, Natascha and White, Natasha and Bawa, Navyata and Singhal, Nayan and Egebo, Nick and Usunier, Nicolas and Mehta, Nikhil and Laptev, Nikolay Pavlovich and Dong, Ning and Cheng, Norman and Chernoguz, Oleg and Hart, Olivia and Salpekar, Omkar and Kalinli, Ozlem and Kent, Parkin and Parekh, Parth and Saab, Paul and Balaji, Pavan and Rittner, Pedro and Bontrager, Philip and Roux, Pierre and Dollar, Piotr and Zvyagina, Polina and Ratanchandani, Prashant and Yuvraj, Pritish and Liang, Qian and Alao, Rachad and Rodriguez, Rachel and Ayub, Rafi and Murthy, Raghotham and Nayani, Raghu and Mitra, Rahul and Parthasarathy, Rangaprabhu and Li, Raymond and Hogan, Rebekkah and Battey, Robin and Wang, Rocky and Howes, Russ and Rinott, Ruty and Mehta, Sachin and Siby, Sachin and Bondu, Sai Jayesh and Datta, Samyak and Chugh, Sara and Hunt, Sara and Dhillon, Sargun and Sidorov, Sasha and Pan, Satadru and Mahajan, Saurabh and Verma, Saurabh and Yamamoto, Seiji and Ramaswamy, Sharadh and Lindsay, Shaun and Lindsay, Shaun and Feng, Sheng and Lin, Shenghao and Zha, Shengxin Cindy and Patil, Shishir and Shankar, Shiva and Zhang, Shuqiang and Zhang, Shuqiang and Wang, Sinong and Agarwal, Sneha and Sajuyigbe, Soji and Chintala, Soumith and Max, Stephanie and Chen, Stephen and Kehoe, Steve and Satterfield, Steve and Govindaprasad, Sudarshan and Gupta, Sumit and Deng, Summer and Cho, Sungmin and Virk, Sunny and Subramanian, Suraj and Choudhury, Sy and Goldman, Sydney and Remez, Tal and Glaser, Tamar and Best, Tamara and Koehler, Thilo and Robinson, Thomas and Li, Tianhe and Zhang, Tianjun and Matthews, Tim and Chou, Timothy and Shaked, Tzook and Vontimitta, Varun and Ajayi, Victoria and Montanez, Victoria and Mohan, Vijai and 
+Kumar, Vinay Satish and Mangla, Vishal and Ionescu, Vlad and Poenaru, Vlad and Mihailescu, Vlad Tiberiu and Ivanov, Vladimir and Li, Wei and Wang, Wenchen and Jiang, Wenwen and Bouaziz, Wes and Constable, Will and Tang, Xiaocheng and Wu, Xiaojian and Wang, Xiaolan and Wu, Xilun and Gao, Xinbo and Kleinman, Yaniv and Chen, Yanjun and Hu, Ye and Jia, Ye and Qi, Ye and Li, Yenda and Zhang, Yilin and Zhang, Ying and Adi, Yossi and Nam, Youngjin and Wang, Yu and Zhao, Yu and Hao, Yuchen and Qian, Yundi and Li, Yunlu and He, Yuzi and Rait, Zach and DeVito, Zachary and Rosnbrick, Zef and Wen, Zhaoduo and Yang, Zhenyu and Zhao, Zhiwei and Ma, Zhiyu},
+ year = {2024},
+ month = nov,
+ number = {arXiv:2407.21783},
+ eprint = {2407.21783},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2407.21783},
+ urldate = {2025-09-09},
+ abstract = {Modern artificial intelligence (AI) systems are powered by foundation models. This paper presents a new set of foundation models, called Llama 3. It is a herd of language models that natively support multilinguality, coding, reasoning, and tool usage. Our largest model is a dense Transformer with 405B parameters and a context window of up to 128K tokens. This paper presents an extensive empirical evaluation of Llama 3. We find that Llama 3 delivers comparable quality to leading language models such as GPT-4 on a plethora of tasks. We publicly release Llama 3, including pre-trained and post-trained versions of the 405B parameter language model and our Llama Guard 3 model for input and output safety. The paper also presents the results of experiments in which we integrate image, video, and speech capabilities into Llama 3 via a compositional approach. We observe this approach performs competitively with the state-of-the-art on image, video, and speech recognition tasks. The resulting models are not yet being broadly released as they are still under development.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition},
+ file = {/Users/fracapuano/Zotero/storage/88PJ48EN/Grattafiori et al. - 2024 - The Llama 3 Herd of Models.pdf;/Users/fracapuano/Zotero/storage/2LLAWX8L/2407.html}
+}
+
+@inproceedings{griffinWalkingStabilizationUsing2017,
+ title = {Walking {Stabilization Using Step Timing} and {Location Adjustment} on the {Humanoid Robot}, {Atlas}},
+ booktitle = {2017 {IEEE}/RSJ International Conference on {Intelligent Robots} and {Systems} ({IROS})},
+ author = {Griffin, Robert J. and Wiedebach, Georg and Bertrand, Sylvain and Leonessa, Alexander and Pratt, Jerry},
+ year = {2017},
+ month = sep,
+ eprint = {1703.00477},
+ primaryclass = {cs},
+ pages = {667--673},
+ doi = {10.1109/IROS.2017.8202223},
+ urldate = {2025-08-26},
+ abstract = {While humans are highly capable of recovering from external disturbances and uncertainties that result in large tracking errors, humanoid robots have yet to reliably mimic this level of robustness. Essential to this is the ability to combine traditional "ankle strategy" balancing with step timing and location adjustment techniques. In doing so, the robot is able to step quickly to the necessary location to continue walking. In this work, we present both a new swing speed up algorithm to adjust the step timing, allowing the robot to set the foot down more quickly to recover from errors in the direction of the current capture point dynamics, and a new algorithm to adjust the desired footstep, expanding the base of support to utilize the center of pressure (CoP)-based ankle strategy for balance. We then utilize the desired centroidal moment pivot (CMP) to calculate the momentum rate of change for our inverse-dynamics based whole-body controller. We present simulation and experimental results using this work, and discuss performance limitations and potential improvements.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/SSNAZ6U4/Griffin et al. - 2017 - Walking Stabilization Using Step Timing and Location Adjustment on the Humanoid Robot, Atlas.pdf;/Users/fracapuano/Zotero/storage/VP885PA9/1703.html}
+}
+
+@misc{haarnojaReinforcementLearningDeep2017,
+ title = {Reinforcement {Learning} with Deep Energy-Based Policies},
+ author = {Haarnoja, Tuomas and Tang, Haoran and Abbeel, Pieter and Levine, Sergey},
+ year = {2017},
+ month = jul,
+ number = {arXiv:1702.08165},
+ eprint = {1702.08165},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1702.08165},
+ urldate = {2025-08-31},
+ abstract = {We propose a method for learning expressive energy-based policies for continuous states and actions, which has been feasible only in tabular domains before. We apply our method to learning maximum entropy policies, resulting into a new algorithm, called soft Q-learning, that expresses the optimal policy via a Boltzmann distribution. We use the recently proposed amortized Stein variational gradient descent to learn a stochastic sampling network that approximates samples from this distribution. The benefits of the proposed algorithm include improved exploration and compositionality that allows transferring skills between tasks, which we confirm in simulated experiments with swimming and walking robots. We also draw a connection to actor-critic methods, which can be viewed performing approximate inference on the corresponding energy-based model.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/PXCR4TCT/Haarnoja et al. - 2017 - Reinforcement Learning with Deep Energy-Based Policies.pdf;/Users/fracapuano/Zotero/storage/VUXXX9B7/1702.html}
+}
+
+@misc{haarnojaReinforcementLearningDeep2017a,
+ title = {Reinforcement {Learning} with Deep Energy-Based Policies},
+ author = {Haarnoja, Tuomas and Tang, Haoran and Abbeel, Pieter and Levine, Sergey},
+ year = {2017},
+ month = jul,
+ number = {arXiv:1702.08165},
+ eprint = {1702.08165},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1702.08165},
+ urldate = {2025-08-31},
+ abstract = {We propose a method for learning expressive energy-based policies for continuous states and actions, which has been feasible only in tabular domains before. We apply our method to learning maximum entropy policies, resulting into a new algorithm, called soft Q-learning, that expresses the optimal policy via a Boltzmann distribution. We use the recently proposed amortized Stein variational gradient descent to learn a stochastic sampling network that approximates samples from this distribution. The benefits of the proposed algorithm include improved exploration and compositionality that allows transferring skills between tasks, which we confirm in simulated experiments with swimming and walking robots. We also draw a connection to actor-critic methods, which can be viewed performing approximate inference on the corresponding energy-based model.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/T84UBYDJ/Haarnoja et al. - 2017 - Reinforcement Learning with Deep Energy-Based Policies.pdf;/Users/fracapuano/Zotero/storage/53SJ2ED8/1702.html}
+}
+
+@inproceedings{haarnojaReinforcementLearningDeep2017b,
+ title = {Reinforcement {Learning} with Deep Energy-Based Policies},
+ booktitle = {Proceedings of the 34th {International Conference} on {Machine Learning}},
+ author = {Haarnoja, Tuomas and Tang, Haoran and Abbeel, Pieter and Levine, Sergey},
+ year = {2017},
+ month = jul,
+ pages = {1352--1361},
+ publisher = {PMLR},
+ issn = {2640-3498},
+ urldate = {2025-08-31},
+ abstract = {We propose a method for learning expressive energy-based policies for continuous states and actions, which has been feasible only in tabular domains before. We apply our method to learning maximum entropy policies, resulting into a new algorithm, called soft Q-learning, that expresses the optimal policy via a Boltzmann distribution. We use the recently proposed amortized Stein variational gradient descent to learn a stochastic sampling network that approximates samples from this distribution. The benefits of the proposed algorithm include improved exploration and compositionality that allows transferring skills between tasks, which we confirm in simulated experiments with swimming and walking robots. We also draw a connection to actor-critic methods, which can be viewed performing approximate inference on the corresponding energy-based model.},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/C59BJ4GU/Haarnoja et al. - 2017 - Reinforcement Learning with Deep Energy-Based Policies.pdf}
+}
+
+@misc{haarnojaSoftActorCriticOffPolicy2018,
+ title = {Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a {Stochastic Actor}},
+ shorttitle = {Soft Actor-Critic},
+ author = {Haarnoja, Tuomas and Zhou, Aurick and Abbeel, Pieter and Levine, Sergey},
+ year = {2018},
+ month = aug,
+ number = {arXiv:1801.01290},
+ eprint = {1801.01290},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1801.01290},
+ urldate = {2025-08-29},
+ abstract = {Model-free deep reinforcement learning (RL) algorithms have been demonstrated on a range of challenging decision making and control tasks. However, these methods typically suffer from two major challenges: very high sample complexity and brittle convergence properties, which necessitate meticulous hyperparameter tuning. Both of these challenges severely limit the applicability of such methods to complex, real-world domains. In this paper, we propose soft actor-critic, an off-policy actor-critic deep RL algorithm based on the maximum entropy reinforcement learning framework. In this framework, the actor aims to maximize expected reward while also maximizing entropy. That is, to succeed at the task while acting as randomly as possible. Prior deep RL methods based on this framework have been formulated as Q-learning methods. By combining off-policy updates with a stable stochastic actor-critic formulation, our method achieves state-of-the-art performance on a range of continuous control benchmark tasks, outperforming prior on-policy and off-policy methods. Furthermore, we demonstrate that, in contrast to other off-policy algorithms, our approach is very stable, achieving very similar performance across different random seeds.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Statistics - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/HG6UQIRM/Haarnoja et al. - 2018 - Soft Actor-Critic Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor.pdf;/Users/fracapuano/Zotero/storage/RKG3J7MX/1801.html}
+}
+
+@misc{hansenTemporalDifferenceLearning2022,
+ title = {Temporal {Difference Learning} for {Model Predictive Control}},
+ author = {Hansen, Nicklas and Wang, Xiaolong and Su, Hao},
+ year = {2022},
+ month = jul,
+ number = {arXiv:2203.04955},
+ eprint = {2203.04955},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2203.04955},
+ urldate = {2025-08-25},
+ abstract = {Data-driven model predictive control has two key advantages over model-free methods: a potential for improved sample efficiency through model learning, and better performance as computational budget for planning increases. However, it is both costly to plan over long horizons and challenging to obtain an accurate model of the environment. In this work, we combine the strengths of model-free and model-based methods. We use a learned task-oriented latent dynamics model for local trajectory optimization over a short horizon, and use a learned terminal value function to estimate long-term return, both of which are learned jointly by temporal difference learning. Our method, TD-MPC, achieves superior sample efficiency and asymptotic performance over prior work on both state and image-based continuous control tasks from DMControl and Meta-World. Code and video results are available at https://nicklashansen.github.io/td-mpc.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/TZF8LCDG/Hansen et al. - 2022 - Temporal Difference Learning for Model Predictive Control.pdf;/Users/fracapuano/Zotero/storage/WU2WWWQE/2203.html}
+}
+
+@misc{heessEmergenceLocomotionBehaviours2017,
+ title = {Emergence of {Locomotion Behaviours} in {Rich Environments}},
+ author = {Heess, Nicolas and TB, Dhruva and Sriram, Srinivasan and Lemmon, Jay and Merel, Josh and Wayne, Greg and Tassa, Yuval and Erez, Tom and Wang, Ziyu and Eslami, S. M. Ali and Riedmiller, Martin and Silver, David},
+ year = {2017},
+ month = jul,
+ number = {arXiv:1707.02286},
+ eprint = {1707.02286},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1707.02286},
+ urldate = {2025-09-02},
+ abstract = {The reinforcement learning paradigm allows, in principle, for complex behaviours to be learned directly from simple reward signals. In practice, however, it is common to carefully hand-design the reward function to encourage a particular solution, or to derive it from demonstration data. In this paper explore how a rich environment can help to promote the learning of complex behavior. Specifically, we train agents in diverse environmental contexts, and find that this encourages the emergence of robust behaviours that perform well across a suite of tasks. We demonstrate this principle for locomotion -- behaviours that are known for their sensitivity to the choice of reward. We train several simulated bodies on a diverse set of challenging terrains and obstacles, using a simple reward function based on forward progress. Using a novel scalable variant of policy gradient reinforcement learning, our agents learn to run, jump, crouch and turn as required by the environment without explicit reward-based guidance. A visual depiction of highlights of the learned behavior can be viewed following https://youtu.be/hx_bgoTF7bs .},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence},
+ file = {/Users/fracapuano/Zotero/storage/9DZ8XEVY/Heess et al. - 2017 - Emergence of Locomotion Behaviours in Rich Environments.pdf;/Users/fracapuano/Zotero/storage/JUB2Q3WH/1707.html}
+}
+
+@inproceedings{higgins2017beta,
+ title = {Beta-{VAE}: {Learning} Basic Visual Concepts with a Constrained Variational Framework},
+ booktitle = {International Conference on Learning Representations},
+ author = {Higgins, Irina and Matthey, Loic and Pal, Arka and Burgess, Christopher and Glorot, Xavier and Botvinick, Matthew and Mohamed, Shakir and Lerchner, Alexander},
+ year = {2017}
+}
+
+@misc{hoDenoisingDiffusionProbabilistic2020,
+ title = {Denoising {Diffusion Probabilistic Models}},
+ author = {Ho, Jonathan and Jain, Ajay and Abbeel, Pieter},
+ year = {2020},
+ month = dec,
+ number = {arXiv:2006.11239},
+ eprint = {2006.11239},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2006.11239},
+ urldate = {2025-09-03},
+ abstract = {We present high quality image synthesis results using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. Our best results are obtained by training on a weighted variational bound designed according to a novel connection between diffusion probabilistic models and denoising score matching with Langevin dynamics, and our models naturally admit a progressive lossy decompression scheme that can be interpreted as a generalization of autoregressive decoding. On the unconditional CIFAR10 dataset, we obtain an Inception score of 9.46 and a state-of-the-art FID score of 3.17. On 256x256 LSUN, we obtain sample quality similar to ProgressiveGAN. Our implementation is available at https://github.com/hojonathanho/diffusion},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/DE655AYQ/Ho et al. - 2020 - Denoising Diffusion Probabilistic Models.pdf;/Users/fracapuano/Zotero/storage/NVIS47ZH/2006.html}
+}
+
+@article{hwangboLearningAgileDynamic2019,
+ title = {Learning Agile and Dynamic Motor Skills for Legged Robots},
+ author = {Hwangbo, Jemin and Lee, Joonho and Dosovitskiy, Alexey and Bellicoso, Dario and Tsounis, Vassilios and Koltun, Vladlen and Hutter, Marco},
+ year = {2019},
+ month = jan,
+ journal = {Science Robotics},
+ volume = {4},
+ number = {26},
+ pages = {eaau5872},
+ publisher = {American Association for the Advancement of Science},
+ doi = {10.1126/scirobotics.aau5872},
+ urldate = {2025-08-27},
+ abstract = {Legged robots pose one of the greatest challenges in robotics. Dynamic and agile maneuvers of animals cannot be imitated by existing methods that are crafted by humans. A compelling alternative is reinforcement learning, which requires minimal craftsmanship and promotes the natural evolution of a control policy. However, so far, reinforcement learning research for legged robots is mainly limited to simulation, and only few and comparably simple examples have been deployed on real systems. The primary reason is that training with real robots, particularly with dynamically balancing systems, is complicated and expensive. In the present work, we introduce a method for training a neural network policy in simulation and transferring it to a state-of-the-art legged system, thereby leveraging fast, automated, and cost-effective data generation schemes. The approach is applied to the ANYmal robot, a sophisticated medium-dog--sized quadrupedal system. Using policies trained in simulation, the quadrupedal machine achieves locomotion skills that go beyond what had been achieved with prior methods: ANYmal is capable of precisely and energy-efficiently following high-level body velocity commands, running faster than before, and recovering from falling even in complex configurations.},
+ file = {/Users/fracapuano/Zotero/storage/9V3X2F7R/Hwangbo et al. - 2019 - Learning agile and dynamic motor skills for legged robots.pdf}
+}
+
+@inproceedings{ImageNet_VSS09,
+ title = {Construction and Analysis of a Large Scale Image Ontology},
+ author = {Deng, J. and Li, K. and Do, M. and Su, H. and {Fei-Fei}, L.},
+ year = {2009},
+ publisher = {Vision Sciences Society}
+}
+
+@inproceedings{InstructBLIP,
+ title = {InstructBLIP: {Towards} General-Purpose Vision-Language Models with Instruction Tuning},
+ booktitle = {Thirty-Seventh Conference on Neural Information Processing Systems},
+ author = {Dai, Wenliang and Li, Junnan and Li, Dongxu and Tiong, Anthony and Zhao, Junqi and Wang, Weisheng and Li, Boyang and Fung, Pascale and Hoi, Steven},
+ year = {2023}
+}
+
+@misc{jangBCZZeroShotTask2022,
+ title = {BC-Z: Zero-Shot Task Generalization with {Robotic Imitation Learning}},
+ shorttitle = {BC-Z},
+ author = {Jang, Eric and Irpan, Alex and Khansari, Mohi and Kappler, Daniel and Ebert, Frederik and Lynch, Corey and Levine, Sergey and Finn, Chelsea},
+ year = {2022},
+ month = feb,
+ number = {arXiv:2202.02005},
+ eprint = {2202.02005},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2202.02005},
+ urldate = {2025-09-01},
+ abstract = {In this paper, we study the problem of enabling a vision-based robotic manipulation system to generalize to novel tasks, a long-standing challenge in robot learning. We approach the challenge from an imitation learning perspective, aiming to study how scaling and broadening the data collected can facilitate such generalization. To that end, we develop an interactive and flexible imitation learning system that can learn from both demonstrations and interventions and can be conditioned on different forms of information that convey the task, including pre-trained embeddings of natural language or videos of humans performing the task. When scaling data collection on a real robot to more than 100 distinct tasks, we find that this system can perform 24 unseen manipulation tasks with an average success rate of 44%, without any robot demonstrations for those tasks.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/YDG2WMDC/Jang et al. - 2022 - BC-Z Zero-Shot Task Generalization with Robotic Imitation Learning.pdf;/Users/fracapuano/Zotero/storage/ZZ47RG6V/2202.html}
+}
+
+@misc{jannerPlanningDiffusionFlexible2022,
+ title = {Planning with {Diffusion} for {Flexible Behavior Synthesis}},
+ author = {Janner, Michael and Du, Yilun and Tenenbaum, Joshua B. and Levine, Sergey},
+ year = {2022},
+ month = dec,
+ number = {arXiv:2205.09991},
+ eprint = {2205.09991},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2205.09991},
+ urldate = {2025-09-03},
+ abstract = {Model-based reinforcement learning methods often use learning only for the purpose of estimating an approximate dynamics model, offloading the rest of the decision-making work to classical trajectory optimizers. While conceptually simple, this combination has a number of empirical shortcomings, suggesting that learned models may not be well-suited to standard trajectory optimization. In this paper, we consider what it would look like to fold as much of the trajectory optimization pipeline as possible into the modeling problem, such that sampling from the model and planning with it become nearly identical. The core of our technical approach lies in a diffusion probabilistic model that plans by iteratively denoising trajectories. We show how classifier-guided sampling and image inpainting can be reinterpreted as coherent planning strategies, explore the unusual and useful properties of diffusion-based planning methods, and demonstrate the effectiveness of our framework in control settings that emphasize long-horizon decision-making and test-time flexibility.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/6S28T733/Janner et al. - 2022 - Planning with Diffusion for Flexible Behavior Synthesis.pdf;/Users/fracapuano/Zotero/storage/DRH9ZWCG/2205.html}
+}
+
+@misc{jiangMistral7B2023,
+ title = {Mistral 7B},
+ author = {Jiang, Albert Q. and Sablayrolles, Alexandre and Mensch, Arthur and Bamford, Chris and Chaplot, Devendra Singh and de las Casas, Diego and Bressand, Florian and Lengyel, Gianna and Lample, Guillaume and Saulnier, Lucile and Lavaud, L{\'e}lio Renard and Lachaux, Marie-Anne and Stock, Pierre and Scao, Teven Le and Lavril, Thibaut and Wang, Thomas and Lacroix, Timoth{\'e}e and Sayed, William El},
+ year = {2023},
+ month = oct,
+ number = {arXiv:2310.06825},
+ eprint = {2310.06825},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2310.06825},
+ urldate = {2025-09-09},
+ abstract = {We introduce Mistral 7B v0.1, a 7-billion-parameter language model engineered for superior performance and efficiency. Mistral 7B outperforms Llama 2 13B across all evaluated benchmarks, and Llama 1 34B in reasoning, mathematics, and code generation. Our model leverages grouped-query attention (GQA) for faster inference, coupled with sliding window attention (SWA) to effectively handle sequences of arbitrary length with a reduced inference cost. We also provide a model fine-tuned to follow instructions, Mistral 7B -- Instruct, that surpasses the Llama 2 13B -- Chat model both on human and automated benchmarks. Our models are released under the Apache 2.0 license.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/JJX9Q8J9/Jiang et al. - 2023 - Mistral 7B.pdf;/Users/fracapuano/Zotero/storage/WTMQBRW3/2310.html}
+}
+
+@misc{jiDribbleBotDynamicLegged2023,
+ title = {DribbleBot: {Dynamic Legged Manipulation} in the {Wild}},
+ shorttitle = {DribbleBot},
+ author = {Ji, Yandong and Margolis, Gabriel B. and Agrawal, Pulkit},
+ year = {2023},
+ month = apr,
+ number = {arXiv:2304.01159},
+ eprint = {2304.01159},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2304.01159},
+ urldate = {2025-08-26},
+ abstract = {DribbleBot (Dexterous Ball Manipulation with a Legged Robot) is a legged robotic system that can dribble a soccer ball under the same real-world conditions as humans (i.e., in-the-wild). We adopt the paradigm of training policies in simulation using reinforcement learning and transferring them into the real world. We overcome critical challenges of accounting for variable ball motion dynamics on different terrains and perceiving the ball using body-mounted cameras under the constraints of onboard computing. Our results provide evidence that current quadruped platforms are well-suited for studying dynamic whole-body control problems involving simultaneous locomotion and manipulation directly from sensory observations.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/ABSRE4C4/Ji et al. - 2023 - DribbleBot Dynamic Legged Manipulation in the Wild.pdf;/Users/fracapuano/Zotero/storage/ADI4QNCY/2304.html}
+}
+
+@misc{kakaobrain2022coyo700m,
+ title = {COYO-700M: Image-text Pair Dataset},
+ author = {Byeon, Minwoo and Park, Beomhee and Kim, Haecheon and Lee, Sungjun and Baek, Woonhyuk and Kim, Saehoon},
+ year = {2022}
+}
+
+@misc{kaplanScalingLawsNeural2020,
+ title = {Scaling {Laws} for {Neural Language Models}},
+ author = {Kaplan, Jared and McCandlish, Sam and Henighan, Tom and Brown, Tom B. and Chess, Benjamin and Child, Rewon and Gray, Scott and Radford, Alec and Wu, Jeffrey and Amodei, Dario},
+ year = {2020},
+ month = jan,
+ number = {arXiv:2001.08361},
+ eprint = {2001.08361},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2001.08361},
+ urldate = {2025-09-07},
+ abstract = {We study empirical scaling laws for language model performance on the cross-entropy loss. The loss scales as a power-law with model size, dataset size, and the amount of compute used for training, with some trends spanning more than seven orders of magnitude. Other architectural details such as network width or depth have minimal effects within a wide range. Simple equations govern the dependence of overfitting on model/dataset size and the dependence of training speed on model size. These relationships allow us to determine the optimal allocation of a fixed compute budget. Larger models are significantly more sample-efficient, such that optimally compute-efficient training involves training very large models on a relatively modest amount of data and stopping significantly before convergence.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/MI5AGWBH/Kaplan et al. - 2020 - Scaling Laws for Neural Language Models.pdf;/Users/fracapuano/Zotero/storage/SBZT8DDY/2001.html}
+}
+
+@misc{keGraspingChopsticksCombating2020,
+ title = {Grasping with {Chopsticks}: {Combating Covariate Shift} in Model-free Imitation Learning for {Fine Manipulation}},
+ shorttitle = {Grasping with {Chopsticks}},
+ author = {Ke, Liyiming and Wang, Jingqiang and Bhattacharjee, Tapomayukh and Boots, Byron and Srinivasa, Siddhartha},
+ year = {2020},
+ month = nov,
+ number = {arXiv:2011.06719},
+ eprint = {2011.06719},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2011.06719},
+ urldate = {2025-09-01},
+ abstract = {Billions of people use chopsticks, a simple yet versatile tool, for fine manipulation of everyday objects. The small, curved, and slippery tips of chopsticks pose a challenge for picking up small objects, making them a suitably complex test case. This paper leverages human demonstrations to develop an autonomous chopsticks-equipped robotic manipulator. Due to the lack of accurate models for fine manipulation, we explore model-free imitation learning, which traditionally suffers from the covariate shift phenomenon that causes poor generalization. We propose two approaches to reduce covariate shift, neither of which requires access to an interactive expert or a model, unlike previous approaches. First, we alleviate single-step prediction errors by applying an invariant operator to increase the data support at critical steps for grasping. Second, we generate synthetic corrective labels by adding bounded noise and combining parametric and non-parametric methods to prevent error accumulation. We demonstrate our methods on a real chopstick-equipped robot that we built, and observe the agent's success rate increase from 37.3% to 80%, which is comparable to the human expert performance of 82.6%.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/ZUPECLSW/Ke et al. - 2020 - Grasping with Chopsticks Combating Covariate Shift in Model-free Imitation Learning for Fine Manipu.pdf;/Users/fracapuano/Zotero/storage/X7PX638S/2011.html}
+}
+
+@article{khatibRealTimeObstancleAvoidance1986,
+ title = {Real-{Time Obstacle Avoidance} for {Manipulators} and {Mobile Robots}},
+ author = {Khatib, Oussama},
+ year = {1986},
+ journal = {The International Journal of Robotics Research},
+ volume = {5}
+}
+
+@misc{khazatskyDROIDLargeScaleInTheWild2025,
+ title = {DROID: A Large-Scale In-The-Wild Robot Manipulation Dataset},
+ shorttitle = {DROID},
+ author = {Khazatsky, Alexander and Pertsch, Karl and Nair, Suraj and Balakrishna, Ashwin and Dasari, Sudeep and Karamcheti, Siddharth and Nasiriany, Soroush and Srirama, Mohan Kumar and Chen, Lawrence Yunliang and Ellis, Kirsty and Fagan, Peter David and Hejna, Joey and Itkina, Masha and Lepert, Marion and Ma, Yecheng Jason and Miller, Patrick Tree and Wu, Jimmy and Belkhale, Suneel and Dass, Shivin and Ha, Huy and Jain, Arhan and Lee, Abraham and Lee, Youngwoon and Memmel, Marius and Park, Sungjae and Radosavovic, Ilija and Wang, Kaiyuan and Zhan, Albert and Black, Kevin and Chi, Cheng and Hatch, Kyle Beltran and Lin, Shan and Lu, Jingpei and Mercat, Jean and Rehman, Abdul and Sanketi, Pannag R. and Sharma, Archit and Simpson, Cody and Vuong, Quan and Walke, Homer Rich and Wulfe, Blake and Xiao, Ted and Yang, Jonathan Heewon and Yavary, Arefeh and Zhao, Tony Z. and Agia, Christopher and Baijal, Rohan and Castro, Mateo Guaman and Chen, Daphne and Chen, Qiuyu and Chung, Trinity and Drake, Jaimyn and Foster, Ethan Paul and Gao, Jensen and Guizilini, Vitor and Herrera, David Antonio and Heo, Minho and Hsu, Kyle and Hu, Jiaheng and Irshad, Muhammad Zubair and Jackson, Donovon and Le, Charlotte and Li, Yunshuang and Lin, Kevin and Lin, Roy and Ma, Zehan and Maddukuri, Abhiram and Mirchandani, Suvir and Morton, Daniel and Nguyen, Tony and O'Neill, Abigail and Scalise, Rosario and Seale, Derick and Son, Victor and Tian, Stephen and Tran, Emi and Wang, Andrew E. and Wu, Yilin and Xie, Annie and Yang, Jingyun and Yin, Patrick and Zhang, Yunchu and Bastani, Osbert and Berseth, Glen and Bohg, Jeannette and Goldberg, Ken and Gupta, Abhinav and Gupta, Abhishek and Jayaraman, Dinesh and Lim, Joseph J. and Malik, Jitendra and {Mart{\'i}n-Mart{\'i}n}, Roberto and Ramamoorthy, Subramanian and Sadigh, Dorsa and Song, Shuran and Wu, Jiajun and Yip, Michael C. and Zhu, Yuke and Kollar, Thomas and Levine, Sergey and Finn, Chelsea},
+ year = {2025},
+ month = apr,
+ number = {arXiv:2403.12945},
+ eprint = {2403.12945},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2403.12945},
+ urldate = {2025-09-08},
+ abstract = {The creation of large, diverse, high-quality robot manipulation datasets is an important stepping stone on the path toward more capable and robust robotic manipulation policies. However, creating such datasets is challenging: collecting robot manipulation data in diverse environments poses logistical and safety challenges and requires substantial investments in hardware and human labour. As a result, even the most general robot manipulation policies today are mostly trained on data collected in a small number of environments with limited scene and task diversity. In this work, we introduce DROID (Distributed Robot Interaction Dataset), a diverse robot manipulation dataset with 76k demonstration trajectories or 350 hours of interaction data, collected across 564 scenes and 84 tasks by 50 data collectors in North America, Asia, and Europe over the course of 12 months. We demonstrate that training with DROID leads to policies with higher performance and improved generalization ability. We open source the full dataset, policy learning code, and a detailed guide for reproducing our robot hardware setup.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/XZ5Y4HZS/Khazatsky et al. - 2025 - DROID A Large-Scale In-The-Wild Robot Manipulation Dataset.pdf;/Users/fracapuano/Zotero/storage/N2Z72XLK/2403.html}
+}
+
+@misc{kimOpenVLAOpenSourceVisionLanguageAction2024,
+ title = {OpenVLA: An Open-Source Vision-Language-Action Model},
+ shorttitle = {OpenVLA},
+ author = {Kim, Moo Jin and Pertsch, Karl and Karamcheti, Siddharth and Xiao, Ted and Balakrishna, Ashwin and Nair, Suraj and Rafailov, Rafael and Foster, Ethan and Lam, Grace and Sanketi, Pannag and Vuong, Quan and Kollar, Thomas and Burchfiel, Benjamin and Tedrake, Russ and Sadigh, Dorsa and Levine, Sergey and Liang, Percy and Finn, Chelsea},
+ year = {2024},
+ month = sep,
+ number = {arXiv:2406.09246},
+ eprint = {2406.09246},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2406.09246},
+ urldate = {2025-09-08},
+ abstract = {Large policies pretrained on a combination of Internet-scale vision-language data and diverse robot demonstrations have the potential to change how we teach robots new skills: rather than training new behaviors from scratch, we can fine-tune such vision-language-action (VLA) models to obtain robust, generalizable policies for visuomotor control. Yet, widespread adoption of VLAs for robotics has been challenging as 1) existing VLAs are largely closed and inaccessible to the public, and 2) prior work fails to explore methods for efficiently fine-tuning VLAs for new tasks, a key component for adoption. Addressing these challenges, we introduce OpenVLA, a 7B-parameter open-source VLA trained on a diverse collection of 970k real-world robot demonstrations. OpenVLA builds on a Llama 2 language model combined with a visual encoder that fuses pretrained features from DINOv2 and SigLIP. As a product of the added data diversity and new model components, OpenVLA demonstrates strong results for generalist manipulation, outperforming closed models such as RT-2-X (55B) by 16.5% in absolute task success rate across 29 tasks and multiple robot embodiments, with 7x fewer parameters. We further show that we can effectively fine-tune OpenVLA for new settings, with especially strong generalization results in multi-task environments involving multiple objects and strong language grounding abilities, and outperform expressive from-scratch imitation learning methods such as Diffusion Policy by 20.4%. We also explore compute efficiency; as a separate contribution, we show that OpenVLA can be fine-tuned on consumer GPUs via modern low-rank adaptation methods and served efficiently via quantization without a hit to downstream success rate. Finally, we release model checkpoints, fine-tuning notebooks, and our PyTorch codebase with built-in support for training VLAs at scale on Open X-Embodiment datasets.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/XR2SX8WG/Kim et al. - 2024 - OpenVLA An Open-Source Vision-Language-Action Model.pdf;/Users/fracapuano/Zotero/storage/63Q96WRV/2406.html}
+}
+
+@misc{kingmaAutoEncodingVariationalBayes2022,
+ title = {Auto-{Encoding Variational Bayes}},
+ author = {Kingma, Diederik P. and Welling, Max},
+ year = {2022},
+ month = dec,
+ number = {arXiv:1312.6114},
+ eprint = {1312.6114},
+ primaryclass = {stat},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1312.6114},
+ urldate = {2025-09-02},
+ abstract = {How can we perform efficient inference and learning in directed probabilistic models, in the presence of continuous latent variables with intractable posterior distributions, and large datasets? We introduce a stochastic variational inference and learning algorithm that scales to large datasets and, under some mild differentiability conditions, even works in the intractable case. Our contributions are two-fold. First, we show that a reparameterization of the variational lower bound yields a lower bound estimator that can be straightforwardly optimized using standard stochastic gradient methods. Second, we show that for i.i.d. datasets with continuous latent variables per datapoint, posterior inference can be made especially efficient by fitting an approximate inference model (also called a recognition model) to the intractable posterior using the proposed lower bound estimator. Theoretical advantages are reflected in experimental results.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/IT7VNQ4U/Kingma and Welling - 2022 - Auto-Encoding Variational Bayes.pdf;/Users/fracapuano/Zotero/storage/HQT22HP5/1312.html}
+}
+
+@misc{knightStandardOpenSO100,
+ title = {Standard Open {SO-100} \& {SO-101} Arms},
+ author = {Knight, Rob and Kooijmans, Pepijn and Wolf, Thomas and Alibert, Simon and Aractingi, Michel and Aubakirova, Dana and Zouitine, Adil and Russi, Martino and Palma, Steven and Pascal, Caroline and Cadene, Remi}
+}
+
+@article{koberReinforcementLearningRobotics,
+ title = {Reinforcement {Learning} in {Robotics}: {A Survey}},
+ author = {Kober, Jens and Bagnell, J. Andrew and Peters, Jan},
+ year = {2013},
+ journal = {The International Journal of Robotics Research},
+ volume = {32},
+ number = {11},
+ pages = {1238--1274},
+ doi = {10.1177/0278364913495721},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/72PRHGKL/Kober et al. - Reinforcement Learning in Robotics A Survey.pdf}
+}
+
+@inproceedings{kong2024audioflam,
+ title = {Audio Flamingo: A Novel Audio Language Model with Few-Shot Learning and Dialogue Abilities},
+ booktitle = {International Conference on Machine Learning},
+ author = {Kong, Zhifeng and Goel, Arushi and Badlani, Rohan and Ping, Wei and Valle, Rafael and Catanzaro, Bryan},
+ year = {2024},
+ pages = {25125--25148},
+ publisher = {PMLR}
+}
+
+@misc{kumarRMARapidMotor2021,
+ title = {{RMA}: {Rapid Motor Adaptation} for {Legged Robots}},
+ shorttitle = {{RMA}},
+ author = {Kumar, Ashish and Fu, Zipeng and Pathak, Deepak and Malik, Jitendra},
+ year = {2021},
+ month = jul,
+ number = {arXiv:2107.04034},
+ eprint = {2107.04034},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2107.04034},
+ urldate = {2025-08-27},
+ abstract = {Successful real-world deployment of legged robots would require them to adapt in real-time to unseen scenarios like changing terrains, changing payloads, wear and tear. This paper presents Rapid Motor Adaptation (RMA) algorithm to solve this problem of real-time online adaptation in quadruped robots. RMA consists of two components: a base policy and an adaptation module. The combination of these components enables the robot to adapt to novel situations in fractions of a second. RMA is trained completely in simulation without using any domain knowledge like reference trajectories or predefined foot trajectory generators and is deployed on the A1 robot without any fine-tuning. We train RMA on a varied terrain generator using bioenergetics-inspired rewards and deploy it on a variety of difficult terrains including rocky, slippery, deformable surfaces in environments with grass, long vegetation, concrete, pebbles, stairs, sand, etc. RMA shows state-of-the-art performance across diverse real-world as well as simulation experiments. Video results at https://ashish-kmr.github.io/rma-legged-robots/},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/TMYICHS6/Kumar et al. - 2021 - RMA Rapid Motor Adaptation for Legged Robots.pdf;/Users/fracapuano/Zotero/storage/TFY2EU8I/2107.html}
+}
+
+@misc{laiActionChunkingConditional2025,
+ title = {Action Chunking as Conditional Policy Compression},
+ author = {Lai, Lucy and Huang, Ann and Gershman, Samuel},
+ year = {2025},
+ month = jun,
+ publisher = {OSF},
+ doi = {10.31234/osf.io/z8yrv_v2},
+ urldate = {2025-09-02},
+ abstract = {Many skills in our everyday lives are learned by sequencing actions towards a desired goal. The action sequence can become a ``chunk'' when individual actions are grouped together and executed as one unit, making them more efficient to store and execute. While chunking has been studied extensively across various domains, a puzzle remains as to why and under what conditions action chunking occurs. To tackle these questions, we develop a model of conditional policy compression---the reduction in cognitive cost by conditioning on an additional source of information---to explain the origin of chunking. We argue that chunking is a result of optimizing the trade-off between reward and conditional policy complexity. Chunking compresses policies when there is temporal structure in the environment that can be leveraged for action selection, reducing the amount of memory necessary to encode the policy. We experimentally confirm our model's predictions, showing that chunking reduces conditional policy complexity and reaction times. Chunking also increases with working memory load, consistent with the hypothesis that the degree of policy compression scales with the scarcity of cognitive resources. Finally, chunking also reduces overall working memory load, freeing cognitive resources for the benefit of other, not-chunked information.},
+ archiveprefix = {OSF},
+ langid = {american},
+ keywords = {action selection,chunking,habits,reinforcement learning,resource-rationality,working memory}
+}
+
+@article{laiActionChunkingConditional2025a,
+ title = {Action Chunking as Conditional Policy Compression},
+ author = {Lai, Lucy and Huang, Ann Z. X. and Gershman, Samuel J.},
+ year = {2025},
+ month = nov,
+ journal = {Cognition},
+ volume = {264},
+ pages = {106201},
+ issn = {1873-7838},
+ doi = {10.1016/j.cognition.2025.106201},
+ abstract = {Many skills in our everyday lives are learned by sequencing actions towards a desired goal. The action sequence can become a "chunk" when individual actions are grouped together and executed as one unit, making them more efficient to store and execute. While chunking has been studied extensively across various domains, a puzzle remains as to why and under what conditions action chunking occurs. To tackle these questions, we develop a model of conditional policy compression-the reduction in cognitive cost by conditioning on an additional source of information-to explain the origin of chunking. We argue that chunking is a result of optimizing the trade-off between reward and conditional policy complexity. Chunking compresses policies when there is temporal structure in the environment that can be leveraged for action selection, reducing the amount of memory necessary to encode the policy. We experimentally confirm our model's predictions, showing that chunking reduces conditional policy complexity and reaction times. Chunking also increases with working memory load, consistent with the hypothesis that the degree of policy compression scales with the scarcity of cognitive resources. Finally, chunking also reduces overall working memory load, freeing cognitive resources for the benefit of other, not-chunked information.},
+ langid = {english},
+ pmid = {40602234},
+ keywords = {Action selection,Adult,Chunking,Cognition,Decision making,Female,Humans,Information bottleneck,Male,Memory Short-Term,Models Psychological,Psychomotor Performance,Reaction Time,Reinforcement learning,Resource rationality,Reward,Young Adult}
+}
+
+@article{LAION-COCO,
+ title = {{LAION COCO}: {600M} Synthetic Captions from {LAION2B-EN}},
+ author = {Schuhmann, C and K{\"o}pf, A and Vencu, R and Coombes, T and Beaumont, R},
+ year = {2022},
+ journal = {URL https://laion.ai/blog/laion-coco}
+}
+
+@misc{laurenconWhatMattersWhen2024,
+ title = {What Matters When Building Vision-Language Models?},
+ author = {Lauren{\c c}on, Hugo and Tronchon, L{\'e}o and Cord, Matthieu and Sanh, Victor},
+ year = {2024},
+ month = may,
+ number = {arXiv:2405.02246},
+ eprint = {2405.02246},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2405.02246},
+ urldate = {2025-09-09},
+ abstract = {The growing interest in vision-language models (VLMs) has been driven by improvements in large language models and vision transformers. Despite the abundance of literature on this subject, we observe that critical decisions regarding the design of VLMs are often not justified. We argue that these unsupported decisions impede progress in the field by making it difficult to identify which choices improve model performance. To address this issue, we conduct extensive experiments around pre-trained models, architecture choice, data, and training methods. Our consolidation of findings includes the development of Idefics2, an efficient foundational VLM of 8 billion parameters. Idefics2 achieves state-of-the-art performance within its size category across various multimodal benchmarks, and is often on par with models four times its size. We release the model (base, instructed, and chat) along with the datasets created for its training.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition},
+ file = {/Users/fracapuano/Zotero/storage/8H6NRPU7/Laurençon et al. - 2024 - What matters when building vision-language models.pdf;/Users/fracapuano/Zotero/storage/H3NETYXA/2405.html}
+}
+
+@misc{leeBehaviorGenerationLatent2024,
+ title = {Behavior {Generation} with {Latent Actions}},
+ author = {Lee, Seungjae and Wang, Yibin and Etukuru, Haritheja and Kim, H. Jin and Shafiullah, Nur Muhammad Mahi and Pinto, Lerrel},
+ year = {2024},
+ month = jun,
+ number = {arXiv:2403.03181},
+ eprint = {2403.03181},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2403.03181},
+ urldate = {2025-08-28},
+ abstract = {Generative modeling of complex behaviors from labeled datasets has been a longstanding problem in decision making. Unlike language or image generation, decision making requires modeling actions - continuous-valued vectors that are multimodal in their distribution, potentially drawn from uncurated sources, where generation errors can compound in sequential prediction. A recent class of models called Behavior Transformers (BeT) addresses this by discretizing actions using k-means clustering to capture different modes. However, k-means struggles to scale for high-dimensional action spaces or long sequences, and lacks gradient information, and thus BeT suffers in modeling long-range actions. In this work, we present Vector-Quantized Behavior Transformer (VQ-BeT), a versatile model for behavior generation that handles multimodal action prediction, conditional generation, and partial observations. VQ-BeT augments BeT by tokenizing continuous actions with a hierarchical vector quantization module. Across seven environments including simulated manipulation, autonomous driving, and robotics, VQ-BeT improves on state-of-the-art models such as BeT and Diffusion Policies. Importantly, we demonstrate VQ-BeT's improved ability to capture behavior modes while accelerating inference speed 5x over Diffusion Policies. Videos and code can be found https://sjlee.cc/vq-bet},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/IA93ENCH/Lee et al. - 2024 - Behavior Generation with Latent Actions.pdf;/Users/fracapuano/Zotero/storage/KBVF7GQL/2403.html}
+}
+
+@article{leeLearningQuadrupedalLocomotion2020,
+ title = {Learning {Quadrupedal Locomotion} over {Challenging Terrain}},
+ author = {Lee, Joonho and Hwangbo, Jemin and Wellhausen, Lorenz and Koltun, Vladlen and Hutter, Marco},
+ year = {2020},
+ month = oct,
+ journal = {Science Robotics},
+ volume = {5},
+ number = {47},
+ eprint = {2010.11251},
+ primaryclass = {cs},
+ pages = {eabc5986},
+ issn = {2470-9476},
+ doi = {10.1126/scirobotics.abc5986},
+ urldate = {2025-08-26},
+ abstract = {Some of the most challenging environments on our planet are accessible to quadrupedal animals but remain out of reach for autonomous machines. Legged locomotion can dramatically expand the operational domains of robotics. However, conventional controllers for legged locomotion are based on elaborate state machines that explicitly trigger the execution of motion primitives and reflexes. These designs have escalated in complexity while falling short of the generality and robustness of animal locomotion. Here we present a radically robust controller for legged locomotion in challenging natural environments. We present a novel solution to incorporating proprioceptive feedback in locomotion control and demonstrate remarkable zero-shot generalization from simulation to natural environments. The controller is trained by reinforcement learning in simulation. It is based on a neural network that acts on a stream of proprioceptive signals. The trained controller has taken two generations of quadrupedal ANYmal robots to a variety of natural environments that are beyond the reach of prior published work in legged locomotion. The controller retains its robustness under conditions that have never been encountered during training: deformable terrain such as mud and snow, dynamic footholds such as rubble, and overground impediments such as thick vegetation and gushing water. The presented work opens new frontiers for robotics and indicates that radical robustness in natural environments can be achieved by training in much simpler domains.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Computer Science - Robotics,Computer Science - Systems and Control,Electrical Engineering and Systems Science - Systems and Control},
+ file = {/Users/fracapuano/Zotero/storage/8B9EF2CE/Lee et al. - 2020 - Learning Quadrupedal Locomotion over Challenging Terrain.pdf}
+}
+
+@misc{lillicrapContinuousControlDeep2019,
+ title = {Continuous Control with Deep Reinforcement Learning},
+ author = {Lillicrap, Timothy P. and Hunt, Jonathan J. and Pritzel, Alexander and Heess, Nicolas and Erez, Tom and Tassa, Yuval and Silver, David and Wierstra, Daan},
+ year = {2019},
+ month = jul,
+ number = {arXiv:1509.02971},
+ eprint = {1509.02971},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1509.02971},
+ urldate = {2025-08-31},
+ abstract = {We adapt the ideas underlying the success of Deep Q-Learning to the continuous action domain. We present an actor-critic, model-free algorithm based on the deterministic policy gradient that can operate over continuous action spaces. Using the same learning algorithm, network architecture and hyper-parameters, our algorithm robustly solves more than 20 simulated physics tasks, including classic problems such as cartpole swing-up, dexterous manipulation, legged locomotion and car driving. Our algorithm is able to find policies whose performance is competitive with those found by a planning algorithm with full access to the dynamics of the domain and its derivatives. We further demonstrate that for many of the tasks the algorithm can learn policies end-to-end: directly from raw pixel inputs.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/2VN6TMVK/Lillicrap et al. - 2019 - Continuous control with deep reinforcement learning.pdf;/Users/fracapuano/Zotero/storage/4FQ4W5VE/1509.html}
+}
+
+% NOTE(review): duplicate of lillicrapContinuousControlDeep2019 (same work, only Zotero file paths differ) — consider consolidating citations onto a single key.
+@misc{lillicrapContinuousControlDeep2019a,
+ title = {Continuous Control with Deep Reinforcement Learning},
+ author = {Lillicrap, Timothy P. and Hunt, Jonathan J. and Pritzel, Alexander and Heess, Nicolas and Erez, Tom and Tassa, Yuval and Silver, David and Wierstra, Daan},
+ year = {2019},
+ month = jul,
+ number = {arXiv:1509.02971},
+ eprint = {1509.02971},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1509.02971},
+ urldate = {2025-08-31},
+ abstract = {We adapt the ideas underlying the success of Deep Q-Learning to the continuous action domain. We present an actor-critic, model-free algorithm based on the deterministic policy gradient that can operate over continuous action spaces. Using the same learning algorithm, network architecture and hyper-parameters, our algorithm robustly solves more than 20 simulated physics tasks, including classic problems such as cartpole swing-up, dexterous manipulation, legged locomotion and car driving. Our algorithm is able to find policies whose performance is competitive with those found by a planning algorithm with full access to the dynamics of the domain and its derivatives. We further demonstrate that for many of the tasks the algorithm can learn policies end-to-end: directly from raw pixel inputs.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/HYMPB9F5/Lillicrap et al. - 2019 - Continuous control with deep reinforcement learning.pdf;/Users/fracapuano/Zotero/storage/EKCXMJNQ/1509.html}
+}
+
+@misc{linVILAPretrainingVisual2024,
+ title = {{VILA}: On Pre-training for {Visual Language Models}},
+ shorttitle = {{VILA}},
+ author = {Lin, Ji and Yin, Hongxu and Ping, Wei and Lu, Yao and Molchanov, Pavlo and Tao, Andrew and Mao, Huizi and Kautz, Jan and Shoeybi, Mohammad and Han, Song},
+ year = {2024},
+ month = may,
+ number = {arXiv:2312.07533},
+ eprint = {2312.07533},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2312.07533},
+ urldate = {2025-09-09},
+ abstract = {Visual language models (VLMs) rapidly progressed with the recent success of large language models. There have been growing efforts on visual instruction tuning to extend the LLM with visual inputs, but lacks an in-depth study of the visual language pre-training process, where the model learns to perform joint modeling on both modalities. In this work, we examine the design options for VLM pre-training by augmenting LLM towards VLM through step-by-step controllable comparisons. We introduce three main findings: (1) freezing LLMs during pre-training can achieve decent zero-shot performance, but lack in-context learning capability, which requires unfreezing the LLM; (2) interleaved pre-training data is beneficial whereas image-text pairs alone are not optimal; (3) re-blending text-only instruction data to image-text data during instruction fine-tuning not only remedies the degradation of text-only tasks, but also boosts VLM task accuracy. With an enhanced pre-training recipe we build VILA, a Visual Language model family that consistently outperforms the state-of-the-art models, e.g., LLaVA-1.5, across main benchmarks without bells and whistles. Multi-modal pre-training also helps unveil appealing properties of VILA, including multi-image reasoning, enhanced in-context learning, and better world knowledge.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computer Vision and Pattern Recognition},
+ file = {/Users/fracapuano/Zotero/storage/DNA6AFRL/Lin et al. - 2024 - VILA On Pre-training for Visual Language Models.pdf;/Users/fracapuano/Zotero/storage/K32IJ2A3/2312.html}
+}
+
+@misc{lipmanFlowMatchingGenerative2023,
+ title = {Flow {Matching} for {Generative Modeling}},
+ author = {Lipman, Yaron and Chen, Ricky T. Q. and {Ben-Hamu}, Heli and Nickel, Maximilian and Le, Matt},
+ year = {2023},
+ month = feb,
+ number = {arXiv:2210.02747},
+ eprint = {2210.02747},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2210.02747},
+ urldate = {2025-09-07},
+ abstract = {We introduce a new paradigm for generative modeling built on Continuous Normalizing Flows (CNFs), allowing us to train CNFs at unprecedented scale. Specifically, we present the notion of Flow Matching (FM), a simulation-free approach for training CNFs based on regressing vector fields of fixed conditional probability paths. Flow Matching is compatible with a general family of Gaussian probability paths for transforming between noise and data samples -- which subsumes existing diffusion paths as specific instances. Interestingly, we find that employing FM with diffusion paths results in a more robust and stable alternative for training diffusion models. Furthermore, Flow Matching opens the door to training CNFs with other, non-diffusion probability paths. An instance of particular interest is using Optimal Transport (OT) displacement interpolation to define the conditional probability paths. These paths are more efficient than diffusion paths, provide faster training and sampling, and result in better generalization. Training CNFs using Flow Matching on ImageNet leads to consistently better performance than alternative diffusion-based methods in terms of both likelihood and sample quality, and allows fast and reliable sample generation using off-the-shelf numerical ODE solvers.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Statistics - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/YFZTRGJ3/Lipman et al. - 2023 - Flow Matching for Generative Modeling.pdf;/Users/fracapuano/Zotero/storage/QUKPDHWR/2210.html}
+}
+
+@misc{lipmanFlowMatchingGuide2024,
+ title = {Flow {Matching Guide} and {Code}},
+ author = {Lipman, Yaron and Havasi, Marton and Holderrieth, Peter and Shaul, Neta and Le, Matt and Karrer, Brian and Chen, Ricky T. Q. and {Lopez-Paz}, David and {Ben-Hamu}, Heli and Gat, Itai},
+ year = {2024},
+ month = dec,
+ number = {arXiv:2412.06264},
+ eprint = {2412.06264},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2412.06264},
+ urldate = {2025-09-09},
+ abstract = {Flow Matching (FM) is a recent framework for generative modeling that has achieved state-of-the-art performance across various domains, including image, video, audio, speech, and biological structures. This guide offers a comprehensive and self-contained review of FM, covering its mathematical foundations, design choices, and extensions. By also providing a PyTorch package featuring relevant examples (e.g., image and text generation), this work aims to serve as a resource for both novice and experienced researchers interested in understanding, applying and further developing FM.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/6MGQ5AZ2/Lipman et al. - 2024 - Flow Matching Guide and Code.pdf;/Users/fracapuano/Zotero/storage/IKHZ75PU/2412.html}
+}
+
+@article{liu2024kangaroo,
+ title = {Kangaroo: {A} Powerful Video-Language Model Supporting Long-Context Video Input},
+ author = {Liu, Jiajun and Wang, Yibing and Ma, Hanghang and Wu, Xiaoping and Ma, Xiaoqi and Wei, Xiaoming and Jiao, Jianbin and Wu, Enhua and Hu, Jie},
+ year = {2024},
+ journal = {arXiv preprint arXiv:2408.15542},
+ eprint = {2408.15542},
+ archiveprefix = {arXiv}
+}
+
+@inproceedings{LLaVA-1.5,
+ title = {Improved Baselines with Visual Instruction Tuning},
+ booktitle = {NeurIPS 2023 Workshop on Instruction Tuning and Instruction Following},
+ author = {Liu, Haotian and Li, Chunyuan and Li, Yuheng and Lee, Yong Jae},
+ year = {2023}
+}
+
+@misc{luoPreciseDexterousRobotic2024,
+ title = {Precise and {Dexterous Robotic Manipulation} via Human-in-the-Loop Reinforcement Learning},
+ author = {Luo, Jianlan and Xu, Charles and Wu, Jeffrey and Levine, Sergey},
+ year = {2024},
+ month = oct,
+ number = {arXiv:2410.21845},
+ eprint = {2410.21845},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2410.21845},
+ urldate = {2025-08-28},
+ abstract = {Reinforcement learning (RL) holds great promise for enabling autonomous acquisition of complex robotic manipulation skills, but realizing this potential in real-world settings has been challenging. We present a human-in-the-loop vision-based RL system that demonstrates impressive performance on a diverse set of dexterous manipulation tasks, including dynamic manipulation, precision assembly, and dual-arm coordination. Our approach integrates demonstrations and human corrections, efficient RL algorithms, and other system-level design choices to learn policies that achieve near-perfect success rates and fast cycle times within just 1 to 2.5 hours of training. We show that our method significantly outperforms imitation learning baselines and prior RL approaches, with an average 2x improvement in success rate and 1.8x faster execution. Through extensive experiments and analysis, we provide insights into the effectiveness of our approach, demonstrating how it learns robust, adaptive policies for both reactive and predictive control strategies. Our results suggest that RL can indeed learn a wide range of complex vision-based manipulation policies directly in the real world within practical training times. We hope this work will inspire a new generation of learned robotic manipulation techniques, benefiting both industrial applications and research advancements. Videos and code are available at our project website https://hil-serl.github.io/.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/LEL37N2D/Luo et al. - 2024 - Precise and Dexterous Robotic Manipulation via Human-in-the-Loop Reinforcement Learning.pdf;/Users/fracapuano/Zotero/storage/VT83SIPT/2410.html}
+}
+
+@misc{luoSERLSoftwareSuite2025,
+ title = {{SERL}: {A Software Suite} for Sample-Efficient Robotic Reinforcement Learning},
+ shorttitle = {{SERL}},
+ author = {Luo, Jianlan and Hu, Zheyuan and Xu, Charles and Tan, You Liang and Berg, Jacob and Sharma, Archit and Schaal, Stefan and Finn, Chelsea and Gupta, Abhishek and Levine, Sergey},
+ year = {2025},
+ month = mar,
+ number = {arXiv:2401.16013},
+ eprint = {2401.16013},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2401.16013},
+ urldate = {2025-08-31},
+ abstract = {In recent years, significant progress has been made in the field of robotic reinforcement learning (RL), enabling methods that handle complex image observations, train in the real world, and incorporate auxiliary data, such as demonstrations and prior experience. However, despite these advances, robotic RL remains hard to use. It is acknowledged among practitioners that the particular implementation details of these algorithms are often just as important (if not more so) for performance as the choice of algorithm. We posit that a significant challenge to widespread adoption of robotic RL, as well as further development of robotic RL methods, is the comparative inaccessibility of such methods. To address this challenge, we developed a carefully implemented library containing a sample efficient off-policy deep RL method, together with methods for computing rewards and resetting the environment, a high-quality controller for a widely-adopted robot, and a number of challenging example tasks. We provide this library as a resource for the community, describe its design choices, and present experimental results. Perhaps surprisingly, we find that our implementation can achieve very efficient learning, acquiring policies for PCB board assembly, cable routing, and object relocation between 25 to 50 minutes of training per policy on average, improving over state-of-the-art results reported for similar tasks in the literature. These policies achieve perfect or near-perfect success rates, extreme robustness even under perturbations, and exhibit emergent recovery and correction behaviors. We hope that these promising results and our high-quality open-source implementation will provide a tool for the robotics community to facilitate further developments in robotic RL. Our code, documentation, and videos can be found at https://serl-robot.github.io/},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/IFYQTF4K/Luo et al. - 2025 - SERL A Software Suite for Sample-Efficient Robotic Reinforcement Learning.pdf;/Users/fracapuano/Zotero/storage/5B67QZDM/2401.html}
+}
+
+@book{lynchModernRoboticsMechanics2017,
+ title = {Modern {Robotics}: {Mechanics}, {Planning}, and {Control}},
+ shorttitle = {Modern {Robotics}},
+ author = {Lynch, Kevin M. and Park, Frank C.},
+ year = {2017},
+ month = may,
+ edition = {1},
+ publisher = {Cambridge University Press},
+ doi = {10.1017/9781316661239},
+ urldate = {2025-08-25},
+ abstract = {This introduction to robotics offers a distinct and unified perspective of the mechanics, planning and control of robots. Ideal for self-learning, or for courses, as it assumes only freshman-level physics, ordinary differential equations, linear algebra and a little bit of computing background. Modern Robotics presents the state-of-the-art, screw-theoretic techniques capturing the most salient physical features of a robot in an intuitive geometrical way. With numerous exercises at the end of each chapter, accompanying software written to reinforce the concepts in the book and video lectures aimed at changing the classroom experience, this is the go-to textbook for learning about this fascinating subject.},
+ copyright = {https://www.cambridge.org/core/terms},
+ isbn = {978-1-316-66123-9 978-1-107-15630-2 978-1-316-60984-2},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/S9E6NIQ8/Lynch and Park - 2017 - Modern Robotics Mechanics, Planning, and Control.pdf}
+}
+
+@inproceedings{MAPL,
+ title = {MAPL: Parameter-efficient Adaptation of Unimodal Pre-Trained Models for Vision-Language Few-Shot Prompting},
+ booktitle = {Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics},
+ author = {Ma{\~n}as, Oscar and Rodriguez Lopez, Pau and Ahmadi, Saba and Nematzadeh, Aida and Goyal, Yash and Agrawal, Aishwarya},
+ editor = {Vlachos, Andreas and Augenstein, Isabelle},
+ year = {2023},
+ month = may,
+ pages = {2523--2548},
+ publisher = {Association for Computational Linguistics},
+ address = {Dubrovnik, Croatia},
+ doi = {10.18653/v1/2023.eacl-main.185},
+ abstract = {Large pre-trained models have proved to be remarkable zero- and (prompt-based) few-shot learners in unimodal vision and language tasks. We propose MAPL, a simple and parameter-efficient method that reuses frozen pre-trained unimodal models and leverages their strong generalization capabilities in multimodal vision-language (VL) settings. MAPL learns a lightweight mapping between the representation spaces of unimodal models using aligned image-text data, and can generalize to unseen VL tasks from just a few in-context examples. The small number of trainable parameters makes MAPL effective at low-data and in-domain learning. Moreover, MAPL's modularity enables easy extension to other pre-trained models. Extensive experiments on several visual question answering and image captioning benchmarks show that MAPL achieves superior or competitive performance compared to similar methods while training orders of magnitude fewer parameters. MAPL can be trained in just a few hours using modest computational resources and public datasets. We release our code and pre-trained model weights at {$<$}a href="https://github.com/oscmansan/mapl"{$>$}https://github.com/oscmansan/mapl{$<$}/a{$>$}.}
+}
+
+@misc{marafiotiSmolVLMRedefiningSmall2025,
+ title = {SmolVLM: {Redefining} Small and Efficient Multimodal Models},
+ shorttitle = {SmolVLM},
+ author = {Marafioti, Andr{\'e}s and Zohar, Orr and Farr{\'e}, Miquel and Noyan, Merve and Bakouch, Elie and Cuenca, Pedro and Zakka, Cyril and Allal, Loubna Ben and Lozhkov, Anton and Tazi, Nouamane and Srivastav, Vaibhav and Lochner, Joshua and Larcher, Hugo and Morlon, Mathieu and Tunstall, Lewis and von Werra, Leandro and Wolf, Thomas},
+ year = {2025},
+ month = apr,
+ number = {arXiv:2504.05299},
+ eprint = {2504.05299},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2504.05299},
+ urldate = {2025-09-09},
+ abstract = {Large Vision-Language Models (VLMs) deliver exceptional performance but require significant computational resources, limiting their deployment on mobile and edge devices. Smaller VLMs typically mirror design choices of larger models, such as extensive image tokenization, leading to inefficient GPU memory usage and constrained practicality for on-device applications. We introduce SmolVLM, a series of compact multimodal models specifically engineered for resource-efficient inference. We systematically explore architectural configurations, tokenization strategies, and data curation optimized for low computational overhead. Through this, we identify key design choices that yield substantial performance gains on image and video tasks with minimal memory footprints. Our smallest model, SmolVLM-256M, uses less than 1GB GPU memory during inference and outperforms the 300-times larger Idefics-80B model, despite an 18-month development gap. Our largest model, at 2.2B parameters, rivals state-of-the-art VLMs consuming twice the GPU memory. SmolVLM models extend beyond static images, demonstrating robust video comprehension capabilities. Our results emphasize that strategic architectural optimizations, aggressive yet efficient tokenization, and carefully curated training data significantly enhance multimodal performance, facilitating practical, energy-efficient deployments at significantly smaller scales.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition},
+ file = {/Users/fracapuano/Zotero/storage/5P2KTYKZ/Marafioti et al. - 2025 - SmolVLM Redefining small and efficient multimodal models.pdf;/Users/fracapuano/Zotero/storage/ILVVMXNG/2504.html}
+}
+
+@misc{margolisRapidLocomotionReinforcement2022,
+ title = {Rapid {Locomotion} via {Reinforcement Learning}},
+ author = {Margolis, Gabriel B. and Yang, Ge and Paigwar, Kartik and Chen, Tao and Agrawal, Pulkit},
+ year = {2022},
+ month = may,
+ number = {arXiv:2205.02824},
+ eprint = {2205.02824},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2205.02824},
+ urldate = {2025-08-26},
+ abstract = {Agile maneuvers such as sprinting and high-speed turning in the wild are challenging for legged robots. We present an end-to-end learned controller that achieves record agility for the MIT Mini Cheetah, sustaining speeds up to 3.9 m/s. This system runs and turns fast on natural terrains like grass, ice, and gravel and responds robustly to disturbances. Our controller is a neural network trained in simulation via reinforcement learning and transferred to the real world. The two key components are (i) an adaptive curriculum on velocity commands and (ii) an online system identification strategy for sim-to-real transfer leveraged from prior work. Videos of the robot's behaviors are available at: https://agility.csail.mit.edu/},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/URXYM9ZM/Margolis et al. - 2022 - Rapid Locomotion via Reinforcement Learning.pdf;/Users/fracapuano/Zotero/storage/S7PRP8ZT/2205.html}
+}
+
+@misc{margolisWalkTheseWays2022,
+ title = {Walk {These Ways}: {Tuning Robot Control} for {Generalization} with {Multiplicity} of {Behavior}},
+ shorttitle = {Walk {These Ways}},
+ author = {Margolis, Gabriel B. and Agrawal, Pulkit},
+ year = {2022},
+ month = dec,
+ number = {arXiv:2212.03238},
+ eprint = {2212.03238},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2212.03238},
+ urldate = {2025-08-27},
+ abstract = {Learned locomotion policies can rapidly adapt to diverse environments similar to those experienced during training but lack a mechanism for fast tuning when they fail in an out-of-distribution test environment. This necessitates a slow and iterative cycle of reward and environment redesign to achieve good performance on a new task. As an alternative, we propose learning a single policy that encodes a structured family of locomotion strategies that solve training tasks in different ways, resulting in Multiplicity of Behavior (MoB). Different strategies generalize differently and can be chosen in real-time for new tasks or environments, bypassing the need for time-consuming retraining. We release a fast, robust open-source MoB locomotion controller, Walk These Ways, that can execute diverse gaits with variable footswing, posture, and speed, unlocking diverse downstream tasks: crouching, hopping, high-speed running, stair traversal, bracing against shoves, rhythmic dance, and more. Video and code release: https://gmargo11.github.io/walk-these-ways/},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Robotics,Computer Science - Systems and Control,Electrical Engineering and Systems Science - Systems and Control},
+ file = {/Users/fracapuano/Zotero/storage/KPNWQYU7/Margolis and Agrawal - 2022 - Walk These Ways Tuning Robot Control for Generalization with Multiplicity of Behavior.pdf;/Users/fracapuano/Zotero/storage/EVSJWCYV/2212.html}
+}
+
+@misc{mccormacSemanticFusionDense3D2016,
+ title = {{SemanticFusion}: Dense {3D} Semantic Mapping with {Convolutional Neural Networks}},
+ shorttitle = {SemanticFusion},
+ author = {McCormac, John and Handa, Ankur and Davison, Andrew and Leutenegger, Stefan},
+ year = {2016},
+ month = sep,
+ number = {arXiv:1609.05130},
+ eprint = {1609.05130},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1609.05130},
+ urldate = {2025-08-28},
+ abstract = {Ever more robust, accurate and detailed mapping using visual sensing has proven to be an enabling factor for mobile robots across a wide variety of applications. For the next level of robot intelligence and intuitive user interaction, maps need extend beyond geometry and appearence - they need to contain semantics. We address this challenge by combining Convolutional Neural Networks (CNNs) and a state of the art dense Simultaneous Localisation and Mapping (SLAM) system, ElasticFusion, which provides long-term dense correspondence between frames of indoor RGB-D video even during loopy scanning trajectories. These correspondences allow the CNN's semantic predictions from multiple view points to be probabilistically fused into a map. This not only produces a useful semantic 3D map, but we also show on the NYUv2 dataset that fusing multiple predictions leads to an improvement even in the 2D semantic labelling over baseline single frame predictions. We also show that for a smaller reconstruction dataset with larger variation in prediction viewpoint, the improvement over single frame segmentation increases. Our system is efficient enough to allow real-time interactive use at frame-rates of approximately 25Hz.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computer Vision and Pattern Recognition},
+ file = {/Users/fracapuano/Zotero/storage/3ASZ9WL8/McCormac et al. - 2016 - SemanticFusion Dense 3D Semantic Mapping with Convolutional Neural Networks.pdf;/Users/fracapuano/Zotero/storage/VGUFP4FL/1609.html}
+}
+
+@misc{minicmpv2024,
+ title = {{MiniCPM-V}: A {GPT-4V} Level {MLLM} on Your Phone},
+ author = {Yao, Yuan and Yu, Tianyu and Zhang, Ao and Wang, Chongyi and Cui, Junbo and Zhu, Hongji and Cai, Tianchi and Li, Haoyu and Zhao, Weilin and He, Zhihui and Chen, Qianyu and Zhou, Huarong and Zou, Zhensheng and Zhang, Haoye and Hu, Shengding and Zheng, Zhi and Zhou, Jie and Cai, Jie and Han, Xu and Zeng, Guoyang and Li, Dahai and Liu, Zhiyuan and Sun, Maosong},
+ year = {2024},
+ eprint = {2408.01800},
+ primaryclass = {cs.CV},
+ archiveprefix = {arXiv}
+}
+
+@inproceedings{MMC4,
+ title = {Multimodal {C4}: {An} Open, Billion-Scale Corpus of Images Interleaved with Text},
+ booktitle = {Thirty-Seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track},
+ author = {Zhu, Wanrong and Hessel, Jack and Awadalla, Anas and Gadre, Samir Yitzhak and Dodge, Jesse and Fang, Alex and Yu, Youngjae and Schmidt, Ludwig and Wang, William Yang and Choi, Yejin},
+ year = {2023}
+}
+
+@misc{mnihPlayingAtariDeep2013,
+ title = {Playing {Atari} with {Deep Reinforcement Learning}},
+ author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin},
+ year = {2013},
+ month = dec,
+ number = {arXiv:1312.5602},
+ eprint = {1312.5602},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1312.5602},
+ urldate = {2025-08-31},
+ abstract = {We present the first deep learning model to successfully learn control policies directly from high-dimensional sensory input using reinforcement learning. The model is a convolutional neural network, trained with a variant of Q-learning, whose input is raw pixels and whose output is a value function estimating future rewards. We apply our method to seven Atari 2600 games from the Arcade Learning Environment, with no adjustment of the architecture or learning algorithm. We find that it outperforms all previous approaches on six of the games and surpasses a human expert on three of them.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/WVHMEBJ5/Mnih et al. - 2013 - Playing Atari with Deep Reinforcement Learning.pdf;/Users/fracapuano/Zotero/storage/MQIFGTV7/1312.html}
+}
+
+@misc{moondream,
+ title = {Moondream},
+ author = {Korrapati, Vik},
+ year = {2024},
+ howpublished = {Online}
+}
+
+@article{mooreRobotsNuclearPower,
+ title = {Robots for Nuclear Power Plants},
+ author = {Moore, Taylor},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/IMLZMTF3/Moore - Robots for nuclear power plants.pdf}
+}
+
+@misc{nakkiranStepbyStepDiffusionElementary2024,
+ title = {Step-by-{Step Diffusion}: {An Elementary Tutorial}},
+ shorttitle = {Step-by-{Step Diffusion}},
+ author = {Nakkiran, Preetum and Bradley, Arwen and Zhou, Hattie and Advani, Madhu},
+ year = {2024},
+ month = jun,
+ number = {arXiv:2406.08929},
+ eprint = {2406.08929},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2406.08929},
+ urldate = {2025-09-04},
+ abstract = {We present an accessible first course on diffusion models and flow matching for machine learning, aimed at a technical audience with no diffusion experience. We try to simplify the mathematical details as much as possible (sometimes heuristically), while retaining enough precision to derive correct algorithms.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Statistics - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/F8X6FZUI/Nakkiran et al. - 2024 - Step-by-Step Diffusion An Elementary Tutorial.pdf;/Users/fracapuano/Zotero/storage/CR78HTMU/2406.html}
+}
+
+@inproceedings{OBELICS,
+ title = {OBELICS: {An} Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents},
+ booktitle = {Thirty-Seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track},
+ author = {Lauren{\c c}on, Hugo and Saulnier, Lucile and Tronchon, Leo and Bekman, Stas and Singh, Amanpreet and Lozhkov, Anton and Wang, Thomas and Karamcheti, Siddharth and Rush, Alexander M and Kiela, Douwe and Cord, Matthieu and Sanh, Victor},
+ year = {2023}
+}
+
+@misc{openaiGPT4TechnicalReport2024,
+ title = {GPT-4 Technical Report},
+ author = {OpenAI and Achiam, Josh and Adler, Steven and Agarwal, Sandhini and Ahmad, Lama and Akkaya, Ilge and Aleman, Florencia Leoni and Almeida, Diogo and Altenschmidt, Janko and Altman, Sam and Anadkat, Shyamal and Avila, Red and Babuschkin, Igor and Balaji, Suchir and Balcom, Valerie and Baltescu, Paul and Bao, Haiming and Bavarian, Mohammad and Belgum, Jeff and Bello, Irwan and Berdine, Jake and {Bernadett-Shapiro}, Gabriel and Berner, Christopher and Bogdonoff, Lenny and Boiko, Oleg and Boyd, Madelaine and Brakman, Anna-Luisa and Brockman, Greg and Brooks, Tim and Brundage, Miles and Button, Kevin and Cai, Trevor and Campbell, Rosie and Cann, Andrew and Carey, Brittany and Carlson, Chelsea and Carmichael, Rory and Chan, Brooke and Chang, Che and Chantzis, Fotis and Chen, Derek and Chen, Sully and Chen, Ruby and Chen, Jason and Chen, Mark and Chess, Ben and Cho, Chester and Chu, Casey and Chung, Hyung Won and Cummings, Dave and Currier, Jeremiah and Dai, Yunxing and Decareaux, Cory and Degry, Thomas and Deutsch, Noah and Deville, Damien and Dhar, Arka and Dohan, David and Dowling, Steve and Dunning, Sheila and Ecoffet, Adrien and Eleti, Atty and Eloundou, Tyna and Farhi, David and Fedus, Liam and Felix, Niko and Fishman, Sim{\'o}n Posada and Forte, Juston and Fulford, Isabella and Gao, Leo and Georges, Elie and Gibson, Christian and Goel, Vik and Gogineni, Tarun and Goh, Gabriel and {Gontijo-Lopes}, Rapha and Gordon, Jonathan and Grafstein, Morgan and Gray, Scott and Greene, Ryan and Gross, Joshua and Gu, Shixiang Shane and Guo, Yufei and Hallacy, Chris and Han, Jesse and Harris, Jeff and He, Yuchen and Heaton, Mike and Heidecke, Johannes and Hesse, Chris and Hickey, Alan and Hickey, Wade and Hoeschele, Peter and Houghton, Brandon and Hsu, Kenny and Hu, Shengli and Hu, Xin and Huizinga, Joost and Jain, Shantanu and Jain, Shawn and Jang, Joanne and Jiang, Angela and Jiang, Roger and Jin, Haozhun and Jin, Denny and Jomoto, Shino and Jonn, Billie and Jun, 
+Heewoo and Kaftan, Tomer and Kaiser, {\L}ukasz and Kamali, Ali and Kanitscheider, Ingmar and Keskar, Nitish Shirish and Khan, Tabarak and Kilpatrick, Logan and Kim, Jong Wook and Kim, Christina and Kim, Yongjik and Kirchner, Jan Hendrik and Kiros, Jamie and Knight, Matt and Kokotajlo, Daniel and Kondraciuk, {\L}ukasz and Kondrich, Andrew and Konstantinidis, Aris and Kosic, Kyle and Krueger, Gretchen and Kuo, Vishal and Lampe, Michael and Lan, Ikai and Lee, Teddy and Leike, Jan and Leung, Jade and Levy, Daniel and Li, Chak Ming and Lim, Rachel and Lin, Molly and Lin, Stephanie and Litwin, Mateusz and Lopez, Theresa and Lowe, Ryan and Lue, Patricia and Makanju, Anna and Malfacini, Kim and Manning, Sam and Markov, Todor and Markovski, Yaniv and Martin, Bianca and Mayer, Katie and Mayne, Andrew and McGrew, Bob and McKinney, Scott Mayer and McLeavey, Christine and McMillan, Paul and McNeil, Jake and Medina, David and Mehta, Aalok and Menick, Jacob and Metz, Luke and Mishchenko, Andrey and Mishkin, Pamela and Monaco, Vinnie and Morikawa, Evan and Mossing, Daniel and Mu, Tong and Murati, Mira and Murk, Oleg and M{\'e}ly, David and Nair, Ashvin and Nakano, Reiichiro and Nayak, Rajeev and Neelakantan, Arvind and Ngo, Richard and Noh, Hyeonwoo and Ouyang, Long and O'Keefe, Cullen and Pachocki, Jakub and Paino, Alex and Palermo, Joe and Pantuliano, Ashley and Parascandolo, Giambattista and Parish, Joel and Parparita, Emy and Passos, Alex and Pavlov, Mikhail and Peng, Andrew and Perelman, Adam and Peres, Filipe de Avila Belbute and Petrov, Michael and Pinto, Henrique Ponde de Oliveira and Michael and Pokorny and Pokrass, Michelle and Pong, Vitchyr H. 
+and Powell, Tolly and Power, Alethea and Power, Boris and Proehl, Elizabeth and Puri, Raul and Radford, Alec and Rae, Jack and Ramesh, Aditya and Raymond, Cameron and Real, Francis and Rimbach, Kendra and Ross, Carl and Rotsted, Bob and Roussez, Henri and Ryder, Nick and Saltarelli, Mario and Sanders, Ted and Santurkar, Shibani and Sastry, Girish and Schmidt, Heather and Schnurr, David and Schulman, John and Selsam, Daniel and Sheppard, Kyla and Sherbakov, Toki and Shieh, Jessica and Shoker, Sarah and Shyam, Pranav and Sidor, Szymon and Sigler, Eric and Simens, Maddie and Sitkin, Jordan and Slama, Katarina and Sohl, Ian and Sokolowsky, Benjamin and Song, Yang and Staudacher, Natalie and Such, Felipe Petroski and Summers, Natalie and Sutskever, Ilya and Tang, Jie and Tezak, Nikolas and Thompson, Madeleine B. and Tillet, Phil and Tootoonchian, Amin and Tseng, Elizabeth and Tuggle, Preston and Turley, Nick and Tworek, Jerry and Uribe, Juan Felipe Cer{\'o}n and Vallone, Andrea and Vijayvergiya, Arun and Voss, Chelsea and Wainwright, Carroll and Wang, Justin Jay and Wang, Alvin and Wang, Ben and Ward, Jonathan and Wei, Jason and Weinmann, C. J. and Welihinda, Akila and Welinder, Peter and Weng, Jiayi and Weng, Lilian and Wiethoff, Matt and Willner, Dave and Winter, Clemens and Wolrich, Samuel and Wong, Hannah and Workman, Lauren and Wu, Sherwin and Wu, Jeff and Wu, Michael and Xiao, Kai and Xu, Tao and Yoo, Sarah and Yu, Kevin and Yuan, Qiming and Zaremba, Wojciech and Zellers, Rowan and Zhang, Chong and Zhang, Marvin and Zhao, Shengjia and Zheng, Tianhao and Zhuang, Juntang and Zhuk, William and Zoph, Barret},
+ year = {2024},
+ month = mar,
+ number = {arXiv:2303.08774},
+ eprint = {2303.08774},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2303.08774},
+ urldate = {2025-08-27},
+ abstract = {We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers. GPT-4 is a Transformer-based model pre-trained to predict the next token in a document. The post-training alignment process results in improved performance on measures of factuality and adherence to desired behavior. A core component of this project was developing infrastructure and optimization methods that behave predictably across a wide range of scales. This allowed us to accurately predict some aspects of GPT-4's performance based on models trained with no more than 1/1,000th the compute of GPT-4.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
+ file = {/Users/fracapuano/Zotero/storage/9CJAC5WC/OpenAI et al. - 2024 - GPT-4 Technical Report.pdf;/Users/fracapuano/Zotero/storage/8VS6FA7G/2303.html}
+}
+
+@misc{OpenXEmbodimentRobotic,
+ title = {Open X-Embodiment: {Robotic Learning Datasets} and RT-X Models},
+ shorttitle = {Open X-Embodiment},
+ author = {{Open X-Embodiment Collaboration}},
+ year = {2023},
+ urldate = {2025-08-27},
+ abstract = {Project page for Open X-Embodiment: Robotic Learning Datasets and RT-X Models.},
+ howpublished = {https://robotics-transformer-x.github.io/},
+ file = {/Users/fracapuano/Zotero/storage/5DS9SYCH/robotics-transformer-x.github.io.html}
+}
+
+@misc{oquabDINOv2LearningRobust2024,
+ title = {DINOv2: {Learning Robust Visual Features} without {Supervision}},
+ shorttitle = {DINOv2},
+ author = {Oquab, Maxime and Darcet, Timoth{\'e}e and Moutakanni, Th{\'e}o and Vo, Huy and Szafraniec, Marc and Khalidov, Vasil and Fernandez, Pierre and Haziza, Daniel and Massa, Francisco and {El-Nouby}, Alaaeldin and Assran, Mahmoud and Ballas, Nicolas and Galuba, Wojciech and Howes, Russell and Huang, Po-Yao and Li, Shang-Wen and Misra, Ishan and Rabbat, Michael and Sharma, Vasu and Synnaeve, Gabriel and Xu, Hu and Jegou, Herv{\'e} and Mairal, Julien and Labatut, Patrick and Joulin, Armand and Bojanowski, Piotr},
+ year = {2024},
+ month = feb,
+ number = {arXiv:2304.07193},
+ eprint = {2304.07193},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2304.07193},
+ urldate = {2025-09-07},
+ abstract = {The recent breakthroughs in natural language processing for model pretraining on large quantities of data have opened the way for similar foundation models in computer vision. These models could greatly simplify the use of images in any system by producing all-purpose visual features, i.e., features that work across image distributions and tasks without finetuning. This work shows that existing pretraining methods, especially self-supervised methods, can produce such features if trained on enough curated data from diverse sources. We revisit existing approaches and combine different techniques to scale our pretraining in terms of data and model size. Most of the technical contributions aim at accelerating and stabilizing the training at scale. In terms of data, we propose an automatic pipeline to build a dedicated, diverse, and curated image dataset instead of uncurated data, as typically done in the self-supervised literature. In terms of models, we train a ViT model (Dosovitskiy et al., 2020) with 1B parameters and distill it into a series of smaller models that surpass the best available all-purpose features, OpenCLIP (Ilharco et al., 2021) on most of the benchmarks at image and pixel levels.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computer Vision and Pattern Recognition},
+ file = {/Users/fracapuano/Zotero/storage/QUP9C62G/Oquab et al. - 2024 - DINOv2 Learning Robust Visual Features without Supervision.pdf;/Users/fracapuano/Zotero/storage/G5P2WXLM/2304.html}
+}
+
+@misc{permenterInterpretingImprovingDiffusion2024,
+ title = {Interpreting and {Improving Diffusion Models} from an {Optimization Perspective}},
+ author = {Permenter, Frank and Yuan, Chenyang},
+ year = {2024},
+ month = jun,
+ number = {arXiv:2306.04848},
+ eprint = {2306.04848},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2306.04848},
+ urldate = {2025-09-03},
+ abstract = {Denoising is intuitively related to projection. Indeed, under the manifold hypothesis, adding random noise is approximately equivalent to orthogonal perturbation. Hence, learning to denoise is approximately learning to project. In this paper, we use this observation to interpret denoising diffusion models as approximate gradient descent applied to the Euclidean distance function. We then provide straight-forward convergence analysis of the DDIM sampler under simple assumptions on the projection error of the denoiser. Finally, we propose a new gradient-estimation sampler, generalizing DDIM using insights from our theoretical results. In as few as 5-10 function evaluations, our sampler achieves state-of-the-art FID scores on pretrained CIFAR-10 and CelebA models and can generate high quality samples on latent diffusion models.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Mathematics - Optimization and Control,Statistics - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/45F7R93S/Permenter and Yuan - 2024 - Interpreting and Improving Diffusion Models from an Optimization Perspective.pdf;/Users/fracapuano/Zotero/storage/9EAM4RZH/2306.html}
+}
+
+@misc{pieterabbeelL5DDPGSAC2021,
+ title = {L5 {DDPG} and {SAC} ({Foundations} of Deep RL Series)},
+ author = {Abbeel, Pieter},
+ year = {2021},
+ month = aug,
+ urldate = {2025-09-01},
+ abstract = {Lecture 5 of a 6-lecture series on the Foundations of Deep RL Topic: Deep Deterministic Policy Gradients (DDPG) and Soft Actor Critic (SAC) Instructor: Pieter Abbeel}
+}
+
+@inproceedings{pmlr-v32-silver14,
+ title = {Deterministic Policy Gradient Algorithms},
+ booktitle = {Proceedings of the 31st International Conference on Machine Learning},
+ author = {Silver, David and Lever, Guy and Heess, Nicolas and Degris, Thomas and Wierstra, Daan and Riedmiller, Martin},
+ editor = {Xing, Eric P. and Jebara, Tony},
+ year = {2014},
+ month = jun,
+ series = {Proceedings of Machine Learning Research},
+ volume = {32},
+ pages = {387--395},
+ publisher = {PMLR},
+ address = {Bejing, China},
+ abstract = {In this paper we consider deterministic policy gradient algorithms for reinforcement learning with continuous actions. The deterministic policy gradient has a particularly appealing form: it is the expected gradient of the action-value function. This simple form means that the deterministic policy gradient can be estimated much more efficiently than the usual stochastic policy gradient. To ensure adequate exploration, we introduce an off-policy actor-critic algorithm that learns a deterministic target policy from an exploratory behaviour policy. Deterministic policy gradient algorithms outperformed their stochastic counterparts in several benchmark problems, particularly in high-dimensional action spaces.}
+}
+
+@inproceedings{PolicyGradientMethods,
+ title = {Policy Gradient Methods for Reinforcement Learning with Function Approximation},
+ booktitle = {Advances in {Neural Information Processing Systems}},
+ author = {Sutton, Richard S. and McAllester, David and Singh, Satinder and Mansour, Yishay},
+ year = {1999},
+ volume = {12},
+ publisher = {MIT Press},
+ urldate = {2025-08-31}
+}
+
+@misc{polyakMovieGenCast2025,
+ title = {Movie {Gen}: {A Cast} of {Media Foundation Models}},
+ shorttitle = {Movie {Gen}},
+ author = {Polyak, Adam and Zohar, Amit and Brown, Andrew and Tjandra, Andros and Sinha, Animesh and Lee, Ann and Vyas, Apoorv and Shi, Bowen and Ma, Chih-Yao and Chuang, Ching-Yao and Yan, David and Choudhary, Dhruv and Wang, Dingkang and Sethi, Geet and Pang, Guan and Ma, Haoyu and Misra, Ishan and Hou, Ji and Wang, Jialiang and Jagadeesh, Kiran and Li, Kunpeng and Zhang, Luxin and Singh, Mannat and Williamson, Mary and Le, Matt and Yu, Matthew and Singh, Mitesh Kumar and Zhang, Peizhao and Vajda, Peter and Duval, Quentin and Girdhar, Rohit and Sumbaly, Roshan and Rambhatla, Sai Saketh and Tsai, Sam and Azadi, Samaneh and Datta, Samyak and Chen, Sanyuan and Bell, Sean and Ramaswamy, Sharadh and Sheynin, Shelly and Bhattacharya, Siddharth and Motwani, Simran and Xu, Tao and Li, Tianhe and Hou, Tingbo and Hsu, Wei-Ning and Yin, Xi and Dai, Xiaoliang and Taigman, Yaniv and Luo, Yaqiao and Liu, Yen-Cheng and Wu, Yi-Chiao and Zhao, Yue and Kirstain, Yuval and He, Zecheng and He, Zijian and Pumarola, Albert and Thabet, Ali and Sanakoyeu, Artsiom and Mallya, Arun and Guo, Baishan and Araya, Boris and Kerr, Breena and Wood, Carleigh and Liu, Ce and Peng, Cen and Vengertsev, Dimitry and Schonfeld, Edgar and Blanchard, Elliot and {Juefei-Xu}, Felix and Nord, Fraylie and Liang, Jeff and Hoffman, John and Kohler, Jonas and Fire, Kaolin and Sivakumar, Karthik and Chen, Lawrence and Yu, Licheng and Gao, Luya and Georgopoulos, Markos and Moritz, Rashel and Sampson, Sara K. and Li, Shikai and Parmeggiani, Simone and Fine, Steve and Fowler, Tara and Petrovic, Vladan and Du, Yuming},
+ year = {2025},
+ month = feb,
+ number = {arXiv:2410.13720},
+ eprint = {2410.13720},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2410.13720},
+ urldate = {2025-09-06},
+ abstract = {We present Movie Gen, a cast of foundation models that generates high-quality, 1080p HD videos with different aspect ratios and synchronized audio. We also show additional capabilities such as precise instruction-based video editing and generation of personalized videos based on a user's image. Our models set a new state-of-the-art on multiple tasks: text-to-video synthesis, video personalization, video editing, video-to-audio generation, and text-to-audio generation. Our largest video generation model is a 30B parameter transformer trained with a maximum context length of 73K video tokens, corresponding to a generated video of 16 seconds at 16 frames-per-second. We show multiple technical innovations and simplifications on the architecture, latent spaces, training objectives and recipes, data curation, evaluation protocols, parallelization techniques, and inference optimizations that allow us to reap the benefits of scaling pre-training data, model size, and training compute for training large scale media generation models. We hope this paper helps the research community to accelerate progress and innovation in media generation models. All videos from this paper are available at https://go.fb.me/MovieGenResearchVideos.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Electrical Engineering and Systems Science - Image and Video Processing},
+ file = {/Users/fracapuano/Zotero/storage/KGDELBPH/Polyak et al. - 2025 - Movie Gen A Cast of Media Foundation Models.pdf;/Users/fracapuano/Zotero/storage/LV8WPFVU/2410.html}
+}
+
+@inproceedings{pomerleauALVINNAutonomousLand1988,
+ title = {ALVINN: {An Autonomous Land Vehicle} in a {Neural Network}},
+ shorttitle = {ALVINN},
+ booktitle = {Advances in {Neural Information Processing Systems}},
+ author = {Pomerleau, Dean A.},
+ year = {1988},
+ volume = {1},
+ publisher = {Morgan-Kaufmann},
+ urldate = {2025-09-03},
+ abstract = {ALVINN (Autonomous Land Vehicle In a Neural Network) is a 3-layer back-propagation network designed for the task of road following. Cur(cid:173) rently ALVINN takes images from a camera and a laser range finder as input and produces as output the direction the vehicle should travel in order to follow the road. Training has been conducted using simulated road images. Successful tests on the Carnegie Mellon autonomous navigation test vehicle indicate that the network can effectively follow real roads under certain field conditions. The representation developed to perfOIm the task differs dra(cid:173) matically when the networlc is trained under various conditions, suggesting the possibility of a novel adaptive autonomous navigation system capable of tailoring its processing to the conditions at hand.},
+ file = {/Users/fracapuano/Zotero/storage/BT7UE8MA/Pomerleau - 1988 - ALVINN An Autonomous Land Vehicle in a Neural Network.pdf}
+}
+
+@inproceedings{pomerleauALVINNAutonomousLand1988a,
+ title = {ALVINN: {An Autonomous Land Vehicle} in a {Neural Network}},
+ shorttitle = {ALVINN},
+ booktitle = {Advances in {Neural Information Processing Systems}},
+ author = {Pomerleau, Dean A.},
+ year = {1988},
+ volume = {1},
+ publisher = {Morgan-Kaufmann},
+ urldate = {2025-09-01},
+ abstract = {ALVINN (Autonomous Land Vehicle In a Neural Network) is a 3-layer back-propagation network designed for the task of road following. Cur(cid:173) rently ALVINN takes images from a camera and a laser range finder as input and produces as output the direction the vehicle should travel in order to follow the road. Training has been conducted using simulated road images. Successful tests on the Carnegie Mellon autonomous navigation test vehicle indicate that the network can effectively follow real roads under certain field conditions. The representation developed to perfOIm the task differs dra(cid:173) matically when the networlc is trained under various conditions, suggesting the possibility of a novel adaptive autonomous navigation system capable of tailoring its processing to the conditions at hand.},
+ file = {/Users/fracapuano/Zotero/storage/P64K7XYH/Pomerleau - 1988 - ALVINN An Autonomous Land Vehicle in a Neural Network.pdf}
+}
+
+@book{prince2023understanding,
+ title = {Understanding Deep Learning},
+ author = {Prince, Simon J.D.},
+ year = {2023},
+ publisher = {The MIT Press}
+}
+
+@misc{radfordLearningTransferableVisual2021,
+ title = {Learning {Transferable Visual Models From Natural Language Supervision}},
+ author = {Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
+ year = {2021},
+ month = feb,
+ number = {arXiv:2103.00020},
+ eprint = {2103.00020},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2103.00020},
+ urldate = {2025-09-09},
+ abstract = {State-of-the-art computer vision systems are trained to predict a fixed set of predetermined object categories. This restricted form of supervision limits their generality and usability since additional labeled data is needed to specify any other visual concept. Learning directly from raw text about images is a promising alternative which leverages a much broader source of supervision. We demonstrate that the simple pre-training task of predicting which caption goes with which image is an efficient and scalable way to learn SOTA image representations from scratch on a dataset of 400 million (image, text) pairs collected from the internet. After pre-training, natural language is used to reference learned visual concepts (or describe new ones) enabling zero-shot transfer of the model to downstream tasks. We study the performance of this approach by benchmarking on over 30 different existing computer vision datasets, spanning tasks such as OCR, action recognition in videos, geo-localization, and many types of fine-grained object classification. The model transfers non-trivially to most tasks and is often competitive with a fully supervised baseline without the need for any dataset specific training. For instance, we match the accuracy of the original ResNet-50 on ImageNet zero-shot without needing to use any of the 1.28 million training examples it was trained on. We release our code and pre-trained model weights at https://github.com/OpenAI/CLIP.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/9RAM5ZIE/Radford et al. - 2021 - Learning Transferable Visual Models From Natural Language Supervision.pdf;/Users/fracapuano/Zotero/storage/YIEJ6PCB/2103.html}
+}
+
+@misc{raffelExploringLimitsTransfer2023,
+ title = {Exploring the {Limits} of {Transfer Learning} with a Unified Text-to-Text Transformer},
+ author = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.},
+ year = {2023},
+ month = sep,
+ number = {arXiv:1910.10683},
+ eprint = {1910.10683},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1910.10683},
+ urldate = {2025-09-07},
+ abstract = {Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts all text-based language problems into a text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled data sets, transfer approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration with scale and our new ``Colossal Clean Crawled Corpus'', we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. To facilitate future work on transfer learning for NLP, we release our data set, pre-trained models, and code.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning,Statistics - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/F7VN7TZA/Raffel et al. - 2023 - Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer.pdf;/Users/fracapuano/Zotero/storage/YALEE6N9/1910.html}
+}
+
+@misc{reedGeneralistAgent2022,
+ title = {A {Generalist Agent}},
+ author = {Reed, Scott and Zolna, Konrad and Parisotto, Emilio and Colmenarejo, Sergio Gomez and Novikov, Alexander and {Barth-Maron}, Gabriel and Gimenez, Mai and Sulsky, Yury and Kay, Jackie and Springenberg, Jost Tobias and Eccles, Tom and Bruce, Jake and Razavi, Ali and Edwards, Ashley and Heess, Nicolas and Chen, Yutian and Hadsell, Raia and Vinyals, Oriol and Bordbar, Mahyar and de Freitas, Nando},
+ year = {2022},
+ month = nov,
+ number = {arXiv:2205.06175},
+ eprint = {2205.06175},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2205.06175},
+ urldate = {2025-09-07},
+ abstract = {Inspired by progress in large-scale language modeling, we apply a similar approach towards building a single generalist agent beyond the realm of text outputs. The agent, which we refer to as Gato, works as a multi-modal, multi-task, multi-embodiment generalist policy. The same network with the same weights can play Atari, caption images, chat, stack blocks with a real robot arm and much more, deciding based on its context whether to output text, joint torques, button presses, or other tokens. In this report we describe the model and the data, and document the current capabilities of Gato.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/VDNMGQB4/Reed et al. - 2022 - A Generalist Agent.pdf;/Users/fracapuano/Zotero/storage/9Y4ZMZIL/2205.html}
+}
+
+@misc{ronnebergerUNetConvolutionalNetworks2015,
+ title = {U-{Net}: {Convolutional Networks} for {Biomedical Image Segmentation}},
+ shorttitle = {U-{Net}},
+ author = {Ronneberger, Olaf and Fischer, Philipp and Brox, Thomas},
+ year = {2015},
+ month = may,
+ number = {arXiv:1505.04597},
+ eprint = {1505.04597},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1505.04597},
+ urldate = {2025-09-06},
+ abstract = {There is large consent that successful training of deep networks requires many thousand annotated training samples. In this paper, we present a network and training strategy that relies on the strong use of data augmentation to use the available annotated samples more efficiently. The architecture consists of a contracting path to capture context and a symmetric expanding path that enables precise localization. We show that such a network can be trained end-to-end from very few images and outperforms the prior best method (a sliding-window convolutional network) on the ISBI challenge for segmentation of neuronal structures in electron microscopic stacks. Using the same network trained on transmitted light microscopy images (phase contrast and DIC) we won the ISBI cell tracking challenge 2015 in these categories by a large margin. Moreover, the network is fast. Segmentation of a 512x512 image takes less than a second on a recent GPU. The full implementation (based on Caffe) and the trained networks are available at http://lmb.informatik.uni-freiburg.de/people/ronneber/u-net .},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computer Vision and Pattern Recognition},
+ file = {/Users/fracapuano/Zotero/storage/7H54LXUZ/Ronneberger et al. - 2015 - U-Net Convolutional Networks for Biomedical Image Segmentation.pdf;/Users/fracapuano/Zotero/storage/4NZ6ZRGI/1505.html}
+}
+
+@misc{rossReductionImitationLearning2011,
+ title = {A {Reduction} of {Imitation Learning} and {Structured Prediction} to No-Regret Online Learning},
+ author = {Ross, Stephane and Gordon, Geoffrey J. and Bagnell, J. Andrew},
+ year = {2011},
+ month = mar,
+ number = {arXiv:1011.0686},
+ eprint = {1011.0686},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1011.0686},
+ urldate = {2025-09-02},
+ abstract = {Sequential prediction problems such as imitation learning, where future observations depend on previous predictions (actions), violate the common i.i.d. assumptions made in statistical learning. This leads to poor performance in theory and often in practice. Some recent approaches provide stronger guarantees in this setting, but remain somewhat unsatisfactory as they train either non-stationary or stochastic policies and require a large number of iterations. In this paper, we propose a new iterative algorithm, which trains a stationary deterministic policy, that can be seen as a no regret algorithm in an online learning setting. We show that any such no regret algorithm, combined with additional reduction assumptions, must find a policy with good performance under the distribution of observations it induces in such sequential settings. We demonstrate that this new approach outperforms previous approaches on two challenging imitation learning problems and a benchmark sequence labeling problem.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Statistics - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/PFDE9IUH/Ross et al. - 2011 - A Reduction of Imitation Learning and Structured Prediction to No-Regret Online Learning.pdf;/Users/fracapuano/Zotero/storage/7VA6XGEA/1011.html}
+}
+
+@misc{sannemanStateIndustrialRobotics2020,
+ title = {The {State} of {Industrial Robotics}: {Emerging Technologies}, {Challenges}, and {Key Research Directions}},
+ shorttitle = {The {State} of {Industrial Robotics}},
+ author = {Sanneman, Lindsay and Fourie, Christopher and Shah, Julie A.},
+ year = {2020},
+ month = oct,
+ number = {arXiv:2010.14537},
+ eprint = {2010.14537},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2010.14537},
+ urldate = {2025-08-26},
+ abstract = {Robotics and related technologies are central to the ongoing digitization and advancement of manufacturing. In recent years, a variety of strategic initiatives around the world including "Industry 4.0", introduced in Germany in 2011 have aimed to improve and connect manufacturing technologies in order to optimize production processes. In this work, we study the changing technological landscape of robotics and "internet-of-things" (IoT)-based connective technologies over the last 7-10 years in the wake of Industry 4.0. We interviewed key players within the European robotics ecosystem, including robotics manufacturers and integrators, original equipment manufacturers (OEMs), and applied industrial research institutions and synthesize our findings in this paper. We first detail the state-of-the-art robotics and IoT technologies we observed and that the companies discussed during our interviews. We then describe the processes the companies follow when deciding whether and how to integrate new technologies, the challenges they face when integrating these technologies, and some immediate future technological avenues they are exploring in robotics and IoT. Finally, based on our findings, we highlight key research directions for the robotics community that can enable improved capabilities in the context of manufacturing.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/8ETI44WZ/Sanneman et al. - 2020 - The State of Industrial Robotics Emerging Technologies, Challenges, and Key Research Directions.pdf;/Users/fracapuano/Zotero/storage/Y37S4WE2/2010.html}
+}
+
+@misc{ScholargoogleusercontentcomScholarbibqinfo88G_QluoYI4J,
+ title = {Scholar.Googleusercontent.Com/Scholar.Bib?Q=info:88G_QluoYI4J:Scholar.Google.Com/&output=citation&scisdr=CgIQg4SNEO7moXYtjoc:AAZF9b8AAAAAaLQrlocZcsFJirMs3WpUvW3zxvM&scisig=AAZF9b8AAAAAaLQrlgE-ix1Lq0FaNEP0Mj37mGU&scisf=4&ct=citation&cd=-1&hl=en},
+ urldate = {2025-08-31},
+ howpublished = {https://scholar.googleusercontent.com/scholar.bib?q=info:88G_QluoYI4J:scholar.google.com/&output=citation&scisdr=CgIQg4SNEO7moXYtjoc:AAZF9b8AAAAAaLQrlocZcsFJirMs3WpUvW3zxvM&scisig=AAZF9b8AAAAAaLQrlgE-ix1Lq0FaNEP0Mj37mGU&scisf=4&ct=citation&cd=-1&hl=en},
+ file = {/Users/fracapuano/Zotero/storage/9DKD7T9B/scholar.html}
+}
+
+@misc{schulmanProximalPolicyOptimization2017,
+ title = {Proximal {Policy Optimization Algorithms}},
+ author = {Schulman, John and Wolski, Filip and Dhariwal, Prafulla and Radford, Alec and Klimov, Oleg},
+ year = {2017},
+ month = aug,
+ number = {arXiv:1707.06347},
+ eprint = {1707.06347},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1707.06347},
+ urldate = {2025-08-29},
+ abstract = {We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a "surrogate" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more general, and have better sample complexity (empirically). Our experiments test PPO on a collection of benchmark tasks, including simulated robotic locomotion and Atari game playing, and we show that PPO outperforms other online policy gradient methods, and overall strikes a favorable balance between sample complexity, simplicity, and wall-time.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/DGQ79LDQ/Schulman et al. - 2017 - Proximal Policy Optimization Algorithms.pdf;/Users/fracapuano/Zotero/storage/ISS4QTB9/1707.html}
+}
+
+@misc{schulmanTrustRegionPolicy2017,
+ title = {Trust {Region Policy Optimization}},
+ author = {Schulman, John and Levine, Sergey and Moritz, Philipp and Jordan, Michael I. and Abbeel, Pieter},
+ year = {2017},
+ month = apr,
+ number = {arXiv:1502.05477},
+ eprint = {1502.05477},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1502.05477},
+ urldate = {2025-08-29},
+ abstract = {We describe an iterative procedure for optimizing policies, with guaranteed monotonic improvement. By making several approximations to the theoretically-justified procedure, we develop a practical algorithm, called Trust Region Policy Optimization (TRPO). This algorithm is similar to natural policy gradient methods and is effective for optimizing large nonlinear policies such as neural networks. Our experiments demonstrate its robust performance on a wide variety of tasks: learning simulated robotic swimming, hopping, and walking gaits; and playing Atari games using images of the screen as input. Despite its approximations that deviate from the theory, TRPO tends to give monotonic improvement, with little tuning of hyperparameters.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/MC469UHX/Schulman et al. - 2017 - Trust Region Policy Optimization.pdf;/Users/fracapuano/Zotero/storage/V7M6LZV3/1502.html}
+}
+
+@book{shalev-shwartzUnderstandingMachineLearning2014,
+ title = {Understanding {Machine Learning}: {From Theory} to {Algorithms}},
+ shorttitle = {Understanding {Machine Learning}},
+ author = {{Shalev-Shwartz}, Shai and {Ben-David}, Shai},
+ year = {2014},
+ month = may,
+ edition = {1},
+ publisher = {Cambridge University Press},
+ doi = {10.1017/CBO9781107298019},
+ urldate = {2025-09-01},
+ abstract = {Machine learning is one of the fastest growing areas of computer science, with far-reaching applications. The aim of this textbook is to introduce machine learning, and the algorithmic paradigms it offers, in a principled way. The book provides a theoretical account of the fundamentals underlying machine learning and the mathematical derivations that transform these principles into practical algorithms. Following a presentation of the basics, the book covers a wide array of central topics unaddressed by previous textbooks. These include a discussion of the computational complexity of learning and the concepts of convexity and stability; important algorithmic paradigms including stochastic gradient descent, neural networks, and structured output learning; and emerging theoretical concepts such as the PAC-Bayes approach and compression-based bounds. Designed for advanced undergraduates or beginning graduates, the text makes the fundamentals and algorithms of machine learning accessible to students and non-expert readers in statistics, computer science, mathematics and engineering.},
+ copyright = {https://www.cambridge.org/core/terms},
+ isbn = {978-1-107-05713-5 978-1-107-29801-9},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/KTKPACDG/Shalev-Shwartz and Ben-David - 2014 - Understanding Machine Learning From Theory to Algorithms.pdf}
+}
+
+@article{shazeerOUTRAGEOUSLYLARGENEURAL2017,
+ title = {OUTRAGEOUSLY LARGE NEURAL NETWORKS: THE SPARSELY-GATED MIXTURE-OF-EXPERTS LAYER},
+ author = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
+ year = {2017},
+ abstract = {The capacity of a neural network to absorb information is limited by its number of parameters. Conditional computation, where parts of the network are active on a per-example basis, has been proposed in theory as a way of dramatically increasing model capacity without a proportional increase in computation. In practice, however, there are significant algorithmic and performance challenges. In this work, we address these challenges and finally realize the promise of conditional computation, achieving greater than 1000x improvements in model capacity with only minor losses in computational efficiency on modern GPU clusters. We introduce a Sparsely-Gated Mixture-of-Experts layer (MoE), consisting of up to thousands of feed-forward sub-networks. A trainable gating network determines a sparse combination of these experts to use for each example. We apply the MoE to the tasks of language modeling and machine translation, where model capacity is critical for absorbing the vast quantities of knowledge available in the training corpora. We present model architectures in which a MoE with up to 137 billion parameters is applied convolutionally between stacked LSTM layers. On large language modeling and machine translation benchmarks, these models achieve significantly better results than state-of-the-art at lower computational cost.},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/QHJRU8HX/Shazeer et al. - 2017 - OUTRAGEOUSLY LARGE NEURAL NETWORKS THE SPARSELY-GATED MIXTURE-OF-EXPERTS LAYER.pdf}
+}
+
+@misc{shazeerOutrageouslyLargeNeural2017a,
+ title = {Outrageously {Large Neural Networks}: The Sparsely-Gated Mixture-of-Experts Layer},
+ shorttitle = {Outrageously {Large Neural Networks}},
+ author = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
+ year = {2017},
+ month = jan,
+ number = {arXiv:1701.06538},
+ eprint = {1701.06538},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1701.06538},
+ urldate = {2025-09-08},
+ abstract = {The capacity of a neural network to absorb information is limited by its number of parameters. Conditional computation, where parts of the network are active on a per-example basis, has been proposed in theory as a way of dramatically increasing model capacity without a proportional increase in computation. In practice, however, there are significant algorithmic and performance challenges. In this work, we address these challenges and finally realize the promise of conditional computation, achieving greater than 1000x improvements in model capacity with only minor losses in computational efficiency on modern GPU clusters. We introduce a Sparsely-Gated Mixture-of-Experts layer (MoE), consisting of up to thousands of feed-forward sub-networks. A trainable gating network determines a sparse combination of these experts to use for each example. We apply the MoE to the tasks of language modeling and machine translation, where model capacity is critical for absorbing the vast quantities of knowledge available in the training corpora. We present model architectures in which a MoE with up to 137 billion parameters is applied convolutionally between stacked LSTM layers. On large language modeling and machine translation benchmarks, these models achieve significantly better results than state-of-the-art at lower computational cost.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Statistics - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/DJX78PLY/Shazeer et al. - 2017 - Outrageously Large Neural Networks The Sparsely-Gated Mixture-of-Experts Layer.pdf;/Users/fracapuano/Zotero/storage/I4T8DUPG/1701.html}
+}
+
+@inproceedings{shukor2023epalm,
+ title = {{eP-ALM}: {Efficient} Perceptual Augmentation of Language Models},
+ booktitle = {Proceedings of the {IEEE}/{CVF} International Conference on Computer Vision},
+ author = {Shukor, Mustafa and Dancette, Corentin and Cord, Matthieu},
+ year = {2023},
+ pages = {22056--22069}
+}
+
+@misc{shukorSmolVLAVisionLanguageActionModel2025,
+ title = {SmolVLA: A Vision-Language-Action Model for {Affordable} and {Efficient Robotics}},
+ shorttitle = {SmolVLA},
+ author = {Shukor, Mustafa and Aubakirova, Dana and Capuano, Francesco and Kooijmans, Pepijn and Palma, Steven and Zouitine, Adil and Aractingi, Michel and Pascal, Caroline and Russi, Martino and Marafioti, Andres and Alibert, Simon and Cord, Matthieu and Wolf, Thomas and Cadene, Remi},
+ year = {2025},
+ month = jun,
+ number = {arXiv:2506.01844},
+ eprint = {2506.01844},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2506.01844},
+ urldate = {2025-08-28},
+ abstract = {Vision-language models (VLMs) pretrained on large-scale multimodal datasets encode rich visual and linguistic knowledge, making them a strong foundation for robotics. Rather than training robotic policies from scratch, recent approaches adapt VLMs into vision-language-action (VLA) models that enable natural language-driven perception and control. However, existing VLAs are typically massive--often with billions of parameters--leading to high training costs and limited real-world deployability. Moreover, they rely on academic and industrial datasets, overlooking the growing availability of community-collected data from affordable robotic platforms. In this work, we present SmolVLA, a small, efficient, and community-driven VLA that drastically reduces both training and inference costs, while retaining competitive performance. SmolVLA is designed to be trained on a single GPU and deployed on consumer-grade GPUs or even CPUs. To further improve responsiveness, we introduce an asynchronous inference stack decoupling perception and action prediction from action execution, allowing higher control rates with chunked action generation. Despite its compact size, SmolVLA achieves performance comparable to VLAs that are 10x larger. We evaluate SmolVLA on a range of both simulated as well as real-world robotic benchmarks and release all code, pretrained models, and training data.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/Y64M6XLX/Shukor et al. - 2025 - SmolVLA A Vision-Language-Action Model for Affordable and Efficient Robotics.pdf;/Users/fracapuano/Zotero/storage/FNNQTK8Q/2506.html}
+}
+
+@book{sicilianoSpringerHandbookRobotics2016,
+ title = {Springer {Handbook} of {Robotics}},
+ editor = {Siciliano, Bruno and Khatib, Oussama},
+ year = {2016},
+ series = {Springer {Handbooks}},
+ publisher = {Springer International Publishing},
+ address = {Cham},
+ doi = {10.1007/978-3-319-32552-1},
+ urldate = {2025-08-26},
+ copyright = {https://www.springer.com/tdm},
+ isbn = {978-3-319-32550-7 978-3-319-32552-1},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/JHG94GYG/Siciliano and Khatib - 2016 - Springer Handbook of Robotics.pdf}
+}
+
+@misc{SignYourAccount,
+ title = {Sign in to Your Account},
+ urldate = {2025-09-02},
+ howpublished = {https://login.microsoftonline.com/cc95de1b-97f5-4f93-b4ba-fe68b852cf91/login},
+ file = {/Users/fracapuano/Zotero/storage/AP6JNKS8/login.html}
+}
+
+@article{silverDeterministicPolicyGradient,
+ title = {Deterministic {Policy Gradient Algorithms}},
+ author = {Silver, David and Lever, Guy and Heess, Nicolas and Degris, Thomas and Wierstra, Daan and Riedmiller, Martin},
+ year = {2014},
+ abstract = {In this paper we consider deterministic policy gradient algorithms for reinforcement learning with continuous actions. The deterministic policy gradient has a particularly appealing form: it is the expected gradient of the action-value function. This simple form means that the deterministic policy gradient can be estimated much more efficiently than the usual stochastic policy gradient. To ensure adequate exploration, we introduce an off-policy actor-critic algorithm that learns a deterministic target policy from an exploratory behaviour policy. We demonstrate that deterministic policy gradient algorithms can significantly outperform their stochastic counterparts in high-dimensional action spaces.},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/IMFSXA3G/Silver et al. - Deterministic Policy Gradient Algorithms.pdf}
+}
+
+@inproceedings{silverDeterministicPolicyGradient2014,
+ title = {Deterministic {Policy Gradient Algorithms}},
+ booktitle = {Proceedings of the 31st {International Conference} on {Machine Learning}},
+ author = {Silver, David and Lever, Guy and Heess, Nicolas and Degris, Thomas and Wierstra, Daan and Riedmiller, Martin},
+ year = {2014},
+ month = jan,
+ pages = {387--395},
+ publisher = {PMLR},
+ issn = {1938-7228},
+ urldate = {2025-08-31},
+ abstract = {In this paper we consider deterministic policy gradient algorithms for reinforcement learning with continuous actions. The deterministic policy gradient has a particularly appealing form: it is the expected gradient of the action-value function. This simple form means that the deterministic policy gradient can be estimated much more efficiently than the usual stochastic policy gradient. To ensure adequate exploration, we introduce an off-policy actor-critic algorithm that learns a deterministic target policy from an exploratory behaviour policy. Deterministic policy gradient algorithms outperformed their stochastic counterparts in several benchmark problems, particularly in high-dimensional action spaces.},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/YI9JNYPV/Silver et al. - 2014 - Deterministic Policy Gradient Algorithms.pdf}
+}
+
+@article{silverDeterministicPolicyGradienta,
+ title = {Deterministic {Policy Gradient Algorithms}},
+ author = {Silver, David and Lever, Guy and Heess, Nicolas and Degris, Thomas and Wierstra, Daan and Riedmiller, Martin},
+ year = {2014},
+ abstract = {In this paper we consider deterministic policy gradient algorithms for reinforcement learning with continuous actions. The deterministic policy gradient has a particularly appealing form: it is the expected gradient of the action-value function. This simple form means that the deterministic policy gradient can be estimated much more efficiently than the usual stochastic policy gradient. To ensure adequate exploration, we introduce an off-policy actor-critic algorithm that learns a deterministic target policy from an exploratory behaviour policy. We demonstrate that deterministic policy gradient algorithms can significantly outperform their stochastic counterparts in high-dimensional action spaces.},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/VWQNLK9R/Silver et al. - Deterministic Policy Gradient Algorithms.pdf}
+}
+
+@misc{sohl-dicksteinDeepUnsupervisedLearning2015,
+ title = {Deep {Unsupervised Learning} Using {Nonequilibrium Thermodynamics}},
+ author = {{Sohl-Dickstein}, Jascha and Weiss, Eric A. and Maheswaranathan, Niru and Ganguli, Surya},
+ year = {2015},
+ month = nov,
+ number = {arXiv:1503.03585},
+ eprint = {1503.03585},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1503.03585},
+ urldate = {2025-09-04},
+ abstract = {A central problem in machine learning involves modeling complex data-sets using highly flexible families of probability distributions in which learning, sampling, inference, and evaluation are still analytically or computationally tractable. Here, we develop an approach that simultaneously achieves both flexibility and tractability. The essential idea, inspired by non-equilibrium statistical physics, is to systematically and slowly destroy structure in a data distribution through an iterative forward diffusion process. We then learn a reverse diffusion process that restores structure in data, yielding a highly flexible and tractable generative model of the data. This approach allows us to rapidly learn, sample from, and evaluate probabilities in deep generative models with thousands of layers or time steps, as well as to compute conditional and posterior probabilities under the learned model. We additionally release an open source reference implementation of the algorithm.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Condensed Matter - Disordered Systems and Neural Networks,Quantitative Biology - Neurons and Cognition,Statistics - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/YZ5GBG5Z/Sohl-Dickstein et al. - 2015 - Deep Unsupervised Learning using Nonequilibrium Thermodynamics.pdf;/Users/fracapuano/Zotero/storage/97PKSBVT/1503.html}
+}
+
+@inproceedings{sohnLearningStructuredOutput2015,
+ title = {Learning {Structured Output Representation} Using {Deep Conditional Generative Models}},
+ booktitle = {Advances in {Neural Information Processing Systems}},
+ author = {Sohn, Kihyuk and Lee, Honglak and Yan, Xinchen},
+ year = {2015},
+ volume = {28},
+ publisher = {Curran Associates, Inc.},
+ urldate = {2025-09-02},
+ abstract = {Supervised deep learning has been successfully applied for many recognition problems in machine learning and computer vision. Although it can approximate a complex many-to-one function very well when large number of training data is provided, the lack of probabilistic inference of the current supervised deep learning methods makes it difficult to model a complex structured output representations. In this work, we develop a scalable deep conditional generative model for structured output variables using Gaussian latent variables. The model is trained efficiently in the framework of stochastic gradient variational Bayes, and allows a fast prediction using stochastic feed-forward inference. In addition, we provide novel strategies to build a robust structured prediction algorithms, such as recurrent prediction network architecture, input noise-injection and multi-scale prediction training methods. In experiments, we demonstrate the effectiveness of our proposed algorithm in comparison to the deterministic deep neural network counterparts in generating diverse but realistic output representations using stochastic inference. Furthermore, the proposed schemes in training methods and architecture design were complimentary, which leads to achieve strong pixel-level object segmentation and semantic labeling performance on Caltech-UCSD Birds 200 and the subset of Labeled Faces in the Wild dataset.},
+ file = {/Users/fracapuano/Zotero/storage/T6QP2WB3/Sohn et al. - 2015 - Learning Structured Output Representation using Deep Conditional Generative Models.pdf}
+}
+
+@misc{songDenoisingDiffusionImplicit2022,
+ title = {Denoising {Diffusion Implicit Models}},
+ author = {Song, Jiaming and Meng, Chenlin and Ermon, Stefano},
+ year = {2022},
+ month = oct,
+ number = {arXiv:2010.02502},
+ eprint = {2010.02502},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2010.02502},
+ urldate = {2025-09-06},
+ abstract = {Denoising diffusion probabilistic models (DDPMs) have achieved high quality image generation without adversarial training, yet they require simulating a Markov chain for many steps to produce a sample. To accelerate sampling, we present denoising diffusion implicit models (DDIMs), a more efficient class of iterative implicit probabilistic models with the same training procedure as DDPMs. In DDPMs, the generative process is defined as the reverse of a Markovian diffusion process. We construct a class of non-Markovian diffusion processes that lead to the same training objective, but whose reverse process can be much faster to sample from. We empirically demonstrate that DDIMs can produce high quality samples $10 {\textbackslash}times$ to $50 {\textbackslash}times$ faster in terms of wall-clock time compared to DDPMs, allow us to trade off computation for sample quality, and can perform semantically meaningful image interpolation directly in the latent space.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/INI2LBQV/Song et al. - 2022 - Denoising Diffusion Implicit Models.pdf;/Users/fracapuano/Zotero/storage/GE2U4XU7/2010.html}
+}
+
+@article{SpinningUp2018,
+ title = {Spinning up in Deep Reinforcement Learning},
+ author = {Achiam, Joshua},
+ year = {2018}
+}
+
+@misc{SuttonBartoBook,
+ title = {Sutton \& {Barto Book}: {Reinforcement Learning}: {An Introduction}},
+ urldate = {2025-08-28},
+ howpublished = {http://incompleteideas.net/book/the-book-2nd.html},
+ file = {/Users/fracapuano/Zotero/storage/A3QZFGPB/the-book-2nd.html}
+}
+
+@inproceedings{suttonPolicyGradientMethods1999,
+ title = {Policy {Gradient Methods} for {Reinforcement Learning} with {Function Approximation}},
+ booktitle = {Advances in {Neural Information Processing Systems}},
+ author = {Sutton, Richard S and McAllester, David and Singh, Satinder and Mansour, Yishay},
+ year = {1999},
+ volume = {12},
+ publisher = {MIT Press},
+ urldate = {2025-08-31},
+ abstract = {Function approximation is essential to reinforcement learning, but the standard approach of approximating a value function and deter(cid:173) mining a policy from it has so far proven theoretically intractable. In this paper we explore an alternative approach in which the policy is explicitly represented by its own function approximator, indepen(cid:173) dent of the value function, and is updated according to the gradient of expected reward with respect to the policy parameters. Williams's REINFORCE method and actor-critic methods are examples of this approach. Our main new result is to show that the gradient can be written in a form suitable for estimation from experience aided by an approximate action-value or advantage function. Using this result, we prove for the first time that a version of policy iteration with arbitrary differentiable function approximation is convergent to a locally optimal policy.},
+ file = {/Users/fracapuano/Zotero/storage/4EKJMS5H/Sutton et al. - 1999 - Policy Gradient Methods for Reinforcement Learning with Function Approximation.pdf}
+}
+
+@inproceedings{suttonPolicyGradientMethods1999a,
+ title = {Policy {Gradient Methods} for {Reinforcement Learning} with {Function Approximation}},
+ booktitle = {Advances in {Neural Information Processing Systems}},
+ author = {Sutton, Richard S and McAllester, David and Singh, Satinder and Mansour, Yishay},
+ year = {1999},
+ volume = {12},
+ publisher = {MIT Press},
+ urldate = {2025-08-31},
+ abstract = {Function approximation is essential to reinforcement learning, but the standard approach of approximating a value function and deter(cid:173) mining a policy from it has so far proven theoretically intractable. In this paper we explore an alternative approach in which the policy is explicitly represented by its own function approximator, indepen(cid:173) dent of the value function, and is updated according to the gradient of expected reward with respect to the policy parameters. Williams's REINFORCE method and actor-critic methods are examples of this approach. Our main new result is to show that the gradient can be written in a form suitable for estimation from experience aided by an approximate action-value or advantage function. Using this result, we prove for the first time that a version of policy iteration with arbitrary differentiable function approximation is convergent to a locally optimal policy.},
+ file = {/Users/fracapuano/Zotero/storage/JNPS7AMN/Sutton et al. - 1999 - Policy Gradient Methods for Reinforcement Learning with Function Approximation.pdf}
+}
+
+@book{suttonReinforcementLearningIntroduction2018,
+ title = {Reinforcement Learning: An Introduction},
+ shorttitle = {Reinforcement Learning},
+ author = {Sutton, Richard S. and Barto, Andrew G.},
+ year = {2018},
+ series = {Adaptive Computation and Machine Learning Series},
+ edition = {Second edition},
+ publisher = {The MIT Press},
+ address = {Cambridge, Massachusetts},
+ abstract = {"Reinforcement learning, one of the most active research areas in artificial intelligence, is a computational approach to learning whereby an agent tries to maximize the total amount of reward it receives while interacting with a complex, uncertain environment. In Reinforcement Learning, Richard Sutton and Andrew Barto provide a clear and simple account of the field's key ideas and algorithms."--},
+ isbn = {978-0-262-03924-6},
+ langid = {english},
+ lccn = {Q325.6 .R45 2018},
+ keywords = {Reinforcement learning},
+ file = {/Users/fracapuano/Zotero/storage/CJB8FNNL/Sutton and Barto - 2018 - Reinforcement learning an introduction.pdf}
+}
+
+@misc{tancikFourierFeaturesLet2020,
+ title = {Fourier {Features Let Networks Learn High Frequency Functions} in {Low Dimensional Domains}},
+ author = {Tancik, Matthew and Srinivasan, Pratul P. and Mildenhall, Ben and {Fridovich-Keil}, Sara and Raghavan, Nithin and Singhal, Utkarsh and Ramamoorthi, Ravi and Barron, Jonathan T. and Ng, Ren},
+ year = {2020},
+ month = jun,
+ number = {arXiv:2006.10739},
+ eprint = {2006.10739},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2006.10739},
+ urldate = {2025-09-06},
+ abstract = {We show that passing input points through a simple Fourier feature mapping enables a multilayer perceptron (MLP) to learn high-frequency functions in low-dimensional problem domains. These results shed light on recent advances in computer vision and graphics that achieve state-of-the-art results by using MLPs to represent complex 3D objects and scenes. Using tools from the neural tangent kernel (NTK) literature, we show that a standard MLP fails to learn high frequencies both in theory and in practice. To overcome this spectral bias, we use a Fourier feature mapping to transform the effective NTK into a stationary kernel with a tunable bandwidth. We suggest an approach for selecting problem-specific Fourier features that greatly improves the performance of MLPs for low-dimensional regression tasks relevant to the computer vision and graphics communities.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/AYWWN7ME/Tancik et al. - 2020 - Fourier Features Let Networks Learn High Frequency Functions in Low Dimensional Domains.pdf;/Users/fracapuano/Zotero/storage/68Q4Y4LM/2006.html}
+}
+
+@misc{tangDeepReinforcementLearning2024,
+ title = {Deep {Reinforcement Learning} for {Robotics}: {A Survey} of Real-World Successes},
+ shorttitle = {Deep {Reinforcement Learning} for {Robotics}},
+ author = {Tang, Chen and Abbatematteo, Ben and Hu, Jiaheng and Chandra, Rohan and {Mart{\'i}n-Mart{\'i}n}, Roberto and Stone, Peter},
+ year = {2024},
+ month = sep,
+ number = {arXiv:2408.03539},
+ eprint = {2408.03539},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2408.03539},
+ urldate = {2025-08-29},
+ abstract = {Reinforcement learning (RL), particularly its combination with deep neural networks referred to as deep RL (DRL), has shown tremendous promise across a wide range of applications, suggesting its potential for enabling the development of sophisticated robotic behaviors. Robotics problems, however, pose fundamental difficulties for the application of RL, stemming from the complexity and cost of interacting with the physical world. This article provides a modern survey of DRL for robotics, with a particular focus on evaluating the real-world successes achieved with DRL in realizing several key robotic competencies. Our analysis aims to identify the key factors underlying those exciting successes, reveal underexplored areas, and provide an overall characterization of the status of DRL in robotics. We highlight several important avenues for future work, emphasizing the need for stable and sample-efficient real-world RL paradigms, holistic approaches for discovering and integrating various competencies to tackle complex long-horizon, open-world tasks, and principled development and evaluation procedures. This survey is designed to offer insights for both RL practitioners and roboticists toward harnessing RL's power to create generally capable real-world robotic systems.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/ZTX4VSMA/Tang et al. - 2024 - Deep Reinforcement Learning for Robotics A Survey of Real-World Successes.pdf;/Users/fracapuano/Zotero/storage/WDVGKFL3/2408.html}
+}
+
+@article{tangDeepReinforcementLearning2025,
+ title = {Deep {Reinforcement Learning} for {Robotics}: {A Survey} of Real-World Successes},
+ shorttitle = {Deep {Reinforcement Learning} for {Robotics}},
+ author = {Tang, Chen and Abbatematteo, Ben and Hu, Jiaheng and Chandra, Rohan and {Mart{\'i}n-Mart{\'i}n}, Roberto and Stone, Peter},
+ year = {2025},
+ month = may,
+ journal = {Annual Review of Control, Robotics, and Autonomous Systems},
+ volume = {8},
+ number = {Volume 8, 2025},
+ pages = {153--188},
+ publisher = {Annual Reviews},
+ issn = {2573-5144},
+ doi = {10.1146/annurev-control-030323-022510},
+ urldate = {2025-08-29},
+ abstract = {Reinforcement learning (RL), particularly its combination with deep neural networks, referred to as deep RL (DRL), has shown tremendous promise across a wide range of applications, suggesting its potential for enabling the development of sophisticated robotic behaviors. Robotics problems, however, pose fundamental difficulties for the application of RL, stemming from the complexity and cost of interacting with the physical world. This article provides a modern survey of DRL for robotics, with a particular focus on evaluating the real-world successes achieved with DRL in realizing several key robotic competencies. Our analysis aims to identify the key factors underlying those exciting successes, reveal underexplored areas, and provide an overall characterization of the status of DRL in robotics. We highlight several important avenues for future work, emphasizing the need for stable and sample-efficient real-world RL paradigms; holistic approaches for discovering and integrating various competencies to tackle complex long-horizon, open-world tasks; and principled development and evaluation procedures. This survey is designed to offer insights for both RL practitioners and roboticists toward harnessing RL's power to create generally capable real-world robotic systems.},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/CCNUWJ73/Tang et al. - 2025 - Deep Reinforcement Learning for Robotics A Survey of Real-World Successes.pdf;/Users/fracapuano/Zotero/storage/UVIIIEXP/Tang et al. - 2025 - Deep Reinforcement Learning for Robotics A Survey of Real-World Successes.pdf;/Users/fracapuano/Zotero/storage/EUKPASJ2/annurev-control-030323-022510.html}
+}
+
+@article{tangPerceptionNavigationAutonomous2023,
+ title = {Perception and {Navigation} in {Autonomous Systems} in the {Era} of {Learning}: {A Survey}},
+ shorttitle = {Perception and {Navigation} in {Autonomous Systems} in the {Era} of {Learning}},
+ author = {Tang, Yang and Zhao, Chaoqiang and Wang, Jianrui and Zhang, Chongzhen and Sun, Qiyu and Zheng, Weixing and Du, Wenli and Qian, Feng and Kurths, Juergen},
+ year = {2023},
+ month = dec,
+ journal = {IEEE Transactions on Neural Networks and Learning Systems},
+ volume = {34},
+ number = {12},
+ eprint = {2001.02319},
+ primaryclass = {cs},
+ pages = {9604--9624},
+ issn = {2162-237X, 2162-2388},
+ doi = {10.1109/TNNLS.2022.3167688},
+ urldate = {2025-08-27},
+ abstract = {Autonomous systems possess the features of inferring their own state, understanding their surroundings, and performing autonomous navigation. With the applications of learning systems, like deep learning and reinforcement learning, the visual-based self-state estimation, environment perception and navigation capabilities of autonomous systems have been efficiently addressed, and many new learning-based algorithms have surfaced with respect to autonomous visual perception and navigation. In this review, we focus on the applications of learning-based monocular approaches in ego-motion perception, environment perception and navigation in autonomous systems, which is different from previous reviews that discussed traditional methods. First, we delineate the shortcomings of existing classical visual simultaneous localization and mapping (vSLAM) solutions, which demonstrate the necessity to integrate deep learning techniques. Second, we review the visual-based environmental perception and understanding methods based on deep learning, including deep learning-based monocular depth estimation, monocular ego-motion prediction, image enhancement, object detection, semantic segmentation, and their combinations with traditional vSLAM frameworks. Then, we focus on the visual navigation based on learning systems, mainly including reinforcement learning and deep reinforcement learning. Finally, we examine several challenges and promising directions discussed and concluded in related research of learning systems in the era of computer science and robotics.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computer Vision and Pattern Recognition},
+ file = {/Users/fracapuano/Zotero/storage/D3YRY6XE/Tang et al. - 2023 - Perception and Navigation in Autonomous Systems in the Era of Learning A Survey.pdf;/Users/fracapuano/Zotero/storage/SAYN9GG9/2001.html}
+}
+
+@misc{teamGemma2Improving2024,
+ title = {Gemma 2: {Improving Open Language Models} at a {Practical Size}},
+ shorttitle = {Gemma 2},
+ author = {{Gemma Team} and Riviere, Morgane and Pathak, Shreya and Sessa, Pier Giuseppe and Hardin, Cassidy and Bhupatiraju, Surya and Hussenot, L{\'e}onard and Mesnard, Thomas and Shahriari, Bobak and Ram{\'e}, Alexandre and Ferret, Johan and Liu, Peter and Tafti, Pouya and Friesen, Abe and Casbon, Michelle and Ramos, Sabela and Kumar, Ravin and Lan, Charline Le and Jerome, Sammy and Tsitsulin, Anton and Vieillard, Nino and Stanczyk, Piotr and Girgin, Sertan and Momchev, Nikola and Hoffman, Matt and Thakoor, Shantanu and Grill, Jean-Bastien and Neyshabur, Behnam and Bachem, Olivier and Walton, Alanna and Severyn, Aliaksei and Parrish, Alicia and Ahmad, Aliya and Hutchison, Allen and Abdagic, Alvin and Carl, Amanda and Shen, Amy and Brock, Andy and Coenen, Andy and Laforge, Anthony and Paterson, Antonia and Bastian, Ben and Piot, Bilal and Wu, Bo and Royal, Brandon and Chen, Charlie and Kumar, Chintu and Perry, Chris and Welty, Chris and {Choquette-Choo}, Christopher A. and Sinopalnikov, Danila and Weinberger, David and Vijaykumar, Dimple and Rogozi{\'n}ska, Dominika and Herbison, Dustin and Bandy, Elisa and Wang, Emma and Noland, Eric and Moreira, Erica and Senter, Evan and Eltyshev, Evgenii and Visin, Francesco and Rasskin, Gabriel and Wei, Gary and Cameron, Glenn and Martins, Gus and Hashemi, Hadi and {Klimczak-Pluci{\'n}ska}, Hanna and Batra, Harleen and Dhand, Harsh and Nardini, Ivan and Mein, Jacinda and Zhou, Jack and Svensson, James and Stanway, Jeff and Chan, Jetha and Zhou, Jin Peng and Carrasqueira, Joana and Iljazi, Joana and Becker, Jocelyn and Fernandez, Joe and van Amersfoort, Joost and Gordon, Josh and Lipschultz, Josh and Newlan, Josh and Ji, Ju-yeong and Mohamed, Kareem and Badola, Kartikeya and Black, Kat and Millican, Katie and McDonell, Keelin and Nguyen, Kelvin and Sodhia, Kiranbir and Greene, Kish and Sjoesund, Lars Lowe and Usui, Lauren and Sifre, Laurent and Heuermann, Lena and Lago, Leticia and McNealus, Lilly and Soares, Livio Baldini 
and Kilpatrick, Logan and Dixon, Lucas and Martins, Luciano and Reid, Machel and Singh, Manvinder and Iverson, Mark and G{\"o}rner, Martin and Velloso, Mat and Wirth, Mateo and Davidow, Matt and Miller, Matt and Rahtz, Matthew and Watson, Matthew and Risdal, Meg and Kazemi, Mehran and Moynihan, Michael and Zhang, Ming and Kahng, Minsuk and Park, Minwoo and Rahman, Mofi and Khatwani, Mohit and Dao, Natalie and Bardoliwalla, Nenshad and Devanathan, Nesh and Dumai, Neta and Chauhan, Nilay and Wahltinez, Oscar and Botarda, Pankil and Barnes, Parker and Barham, Paul and Michel, Paul and Jin, Pengchong and Georgiev, Petko and Culliton, Phil and Kuppala, Pradeep and Comanescu, Ramona and Merhej, Ramona and Jana, Reena and Rokni, Reza Ardeshir and Agarwal, Rishabh and Mullins, Ryan and Saadat, Samaneh and Carthy, Sara Mc and Perrin, Sarah and Arnold, S{\'e}bastien M. R. and Krause, Sebastian and Dai, Shengyang and Garg, Shruti and Sheth, Shruti and Ronstrom, Sue and Chan, Susan and Jordan, Timothy and Yu, Ting and Eccles, Tom and Hennigan, Tom and Kocisky, Tomas and Doshi, Tulsee and Jain, Vihan and Yadav, Vikas and Meshram, Vilobh and Dharmadhikari, Vishal and Barkley, Warren and Wei, Wei and Ye, Wenming and Han, Woohyun and Kwon, Woosuk and Xu, Xiang and Shen, Zhe and Gong, Zhitao and Wei, Zichuan and Cotruta, Victor and Kirk, Phoebe and Rao, Anand and Giang, Minh and Peran, Ludovic and Warkentin, Tris and Collins, Eli and Barral, Joelle and Ghahramani, Zoubin and Hadsell, Raia and Sculley, D. and Banks, Jeanine and Dragan, Anca and Petrov, Slav and Vinyals, Oriol and Dean, Jeff and Hassabis, Demis and Kavukcuoglu, Koray and Farabet, Clement and Buchatskaya, Elena and Borgeaud, Sebastian and Fiedel, Noah and Joulin, Armand and Kenealy, Kathleen and Dadashi, Robert and Andreev, Alek},
+ year = {2024},
+ month = aug,
+ number = {arXiv:2408.00118},
+ eprint = {2408.00118},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2408.00118},
+ urldate = {2025-09-08},
+ abstract = {In this work, we introduce Gemma 2, a new addition to the Gemma family of lightweight, state-of-the-art open models, ranging in scale from 2 billion to 27 billion parameters. In this new version, we apply several known technical modifications to the Transformer architecture, such as interleaving local-global attentions (Beltagy et al., 2020a) and group-query attention (Ainslie et al., 2023). We also train the 2B and 9B models with knowledge distillation (Hinton et al., 2015) instead of next token prediction. The resulting models deliver the best performance for their size, and even offer competitive alternatives to models that are 2-3 times bigger. We release all our models to the community.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
+ file = {/Users/fracapuano/Zotero/storage/NTLZNFPL/Team et al. - 2024 - Gemma 2 Improving Open Language Models at a Practical Size.pdf;/Users/fracapuano/Zotero/storage/GKX7JFK3/2408.html}
+}
+
+@misc{tedrakeRoboticManipulationPerception,
+ title = {Robotic {Manipulation}. {Perception}, {Planning} and {Control}.},
+ author = {Tedrake, Russ}
+}
+
+@misc{tedrakeUnderactuatedRoboticsAlgorithms,
+ title = {Underactuated {Robotics}. {Algorithms} for {Walking}, {Running}, {Swimming}, {Flying}, and {Manipulation}},
+ author = {Tedrake, Russ}
+}
+
+@book{thrunPROBABILISTICROBOTICS,
+ title = {Probabilistic Robotics},
+ author = {Thrun, Sebastian and Burgard, Wolfram and Fox, Dieter},
+ year = {2005},
+ publisher = {MIT Press},
+ langid = {english},
+ file = {/Users/fracapuano/Zotero/storage/UKNC34V7/Thrun et al. - PROBABILISTIC ROBOTICS.pdf}
+}
+
+@misc{tiboniDomainRandomizationEntropy2024,
+ title = {Domain {Randomization} via {Entropy Maximization}},
+ author = {Tiboni, Gabriele and Klink, Pascal and Peters, Jan and Tommasi, Tatiana and D'Eramo, Carlo and Chalvatzaki, Georgia},
+ year = {2024},
+ month = mar,
+ number = {arXiv:2311.01885},
+ eprint = {2311.01885},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2311.01885},
+ urldate = {2025-08-30},
+ abstract = {Varying dynamics parameters in simulation is a popular Domain Randomization (DR) approach for overcoming the reality gap in Reinforcement Learning (RL). Nevertheless, DR heavily hinges on the choice of the sampling distribution of the dynamics parameters, since high variability is crucial to regularize the agent's behavior but notoriously leads to overly conservative policies when randomizing excessively. In this paper, we propose a novel approach to address sim-to-real transfer, which automatically shapes dynamics distributions during training in simulation without requiring real-world data. We introduce DOmain RAndomization via Entropy MaximizatiON (DORAEMON), a constrained optimization problem that directly maximizes the entropy of the training distribution while retaining generalization capabilities. In achieving this, DORAEMON gradually increases the diversity of sampled dynamics parameters as long as the probability of success of the current policy is sufficiently high. We empirically validate the consistent benefits of DORAEMON in obtaining highly adaptive and generalizable policies, i.e. solving the task at hand across the widest range of dynamics parameters, as opposed to representative baselines from the DR literature. Notably, we also demonstrate the Sim2Real applicability of DORAEMON through its successful zero-shot transfer in a robotic manipulation setup under unknown real-world parameters.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/T5KH6GM9/Tiboni et al. - 2024 - Domain Randomization via Entropy Maximization.pdf;/Users/fracapuano/Zotero/storage/KRE436NC/2311.html}
+}
+
+@misc{tiboniDROPOSimtoRealTransfer2023,
+ title = {{DROPO}: Sim-to-Real Transfer with {Offline Domain Randomization}},
+ shorttitle = {DROPO},
+ author = {Tiboni, Gabriele and Arndt, Karol and Kyrki, Ville},
+ year = {2023},
+ month = jan,
+ number = {arXiv:2201.08434},
+ eprint = {2201.08434},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2201.08434},
+ urldate = {2025-08-31},
+ abstract = {In recent years, domain randomization over dynamics parameters has gained a lot of traction as a method for sim-to-real transfer of reinforcement learning policies in robotic manipulation; however, finding optimal randomization distributions can be difficult. In this paper, we introduce DROPO, a novel method for estimating domain randomization distributions for safe sim-to-real transfer. Unlike prior work, DROPO only requires a limited, precollected offline dataset of trajectories, and explicitly models parameter uncertainty to match real data using a likelihood-based approach. We demonstrate that DROPO is capable of recovering dynamic parameter distributions in simulation and finding a distribution capable of compensating for an unmodeled phenomenon. We also evaluate the method in two zero-shot sim-to-real transfer scenarios, showing successful domain transfer and improved performance over prior methods.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/Q875LPZF/Tiboni et al. - 2023 - DROPO Sim-to-Real Transfer with Offline Domain Randomization.pdf;/Users/fracapuano/Zotero/storage/2NQ4L37P/2201.html}
+}
+
+@misc{tobinDomainRandomizationTransferring2017,
+ title = {Domain {Randomization} for {Transferring Deep Neural Networks} from {Simulation} to the {Real World}},
+ author = {Tobin, Josh and Fong, Rachel and Ray, Alex and Schneider, Jonas and Zaremba, Wojciech and Abbeel, Pieter},
+ year = {2017},
+ month = mar,
+ number = {arXiv:1703.06907},
+ eprint = {1703.06907},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.1703.06907},
+ urldate = {2025-08-30},
+ abstract = {Bridging the 'reality gap' that separates simulated robotics from experiments on hardware could accelerate robotic research through improved data availability. This paper explores domain randomization, a simple technique for training models on simulated images that transfer to real images by randomizing rendering in the simulator. With enough variability in the simulator, the real world may appear to the model as just another variation. We focus on the task of object localization, which is a stepping stone to general robotic manipulation skills. We find that it is possible to train a real-world object detector that is accurate to $1.5$cm and robust to distractors and partial occlusions using only data from a simulator with non-realistic random textures. To demonstrate the capabilities of our detectors, we show they can be used to perform grasping in a cluttered environment. To our knowledge, this is the first successful transfer of a deep neural network trained only on simulated RGB images (without pre-training on real images) to the real world for the purpose of robotic control.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/TYJZAD9R/Tobin et al. - 2017 - Domain Randomization for Transferring Deep Neural Networks from Simulation to the Real World.pdf;/Users/fracapuano/Zotero/storage/C9QS7DES/1703.html}
+}
+
+@article{tong2024cambrian,
+ title = {Cambrian-1: {A} Fully Open, Vision-Centric Exploration of Multimodal {LLMs}},
+ author = {Tong, Peter and Brown, Ellis and Wu, Penghao and Woo, Sanghyun and IYER, Adithya Jairam Vedagiri and Akula, Sai Charitha and Yang, Shusheng and Yang, Jihan and Middepogu, Manoj and Wang, Ziteng and others},
+ year = {2024},
+ journal = {Advances in Neural Information Processing Systems},
+ volume = {37},
+ pages = {87310--87356}
+}
+
+@misc{touvronLlama2Open2023,
+ title = {Llama 2: {Open Foundation} and Fine-Tuned Chat Models},
+ shorttitle = {Llama 2},
+ author = {Touvron, Hugo and Martin, Louis and Stone, Kevin and Albert, Peter and Almahairi, Amjad and Babaei, Yasmine and Bashlykov, Nikolay and Batra, Soumya and Bhargava, Prajjwal and Bhosale, Shruti and Bikel, Dan and Blecher, Lukas and Ferrer, Cristian Canton and Chen, Moya and Cucurull, Guillem and Esiobu, David and Fernandes, Jude and Fu, Jeremy and Fu, Wenyin and Fuller, Brian and Gao, Cynthia and Goswami, Vedanuj and Goyal, Naman and Hartshorn, Anthony and Hosseini, Saghar and Hou, Rui and Inan, Hakan and Kardas, Marcin and Kerkez, Viktor and Khabsa, Madian and Kloumann, Isabel and Korenev, Artem and Koura, Punit Singh and Lachaux, Marie-Anne and Lavril, Thibaut and Lee, Jenya and Liskovich, Diana and Lu, Yinghai and Mao, Yuning and Martinet, Xavier and Mihaylov, Todor and Mishra, Pushkar and Molybog, Igor and Nie, Yixin and Poulton, Andrew and Reizenstein, Jeremy and Rungta, Rashi and Saladi, Kalyan and Schelten, Alan and Silva, Ruan and Smith, Eric Michael and Subramanian, Ranjan and Tan, Xiaoqing Ellen and Tang, Binh and Taylor, Ross and Williams, Adina and Kuan, Jian Xiang and Xu, Puxin and Yan, Zheng and Zarov, Iliyan and Zhang, Yuchen and Fan, Angela and Kambadur, Melanie and Narang, Sharan and Rodriguez, Aurelien and Stojnic, Robert and Edunov, Sergey and Scialom, Thomas},
+ year = {2023},
+ month = jul,
+ number = {arXiv:2307.09288},
+ eprint = {2307.09288},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2307.09288},
+ urldate = {2025-09-08},
+ abstract = {In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our models outperform open-source chat models on most benchmarks we tested, and based on our human evaluations for helpfulness and safety, may be a suitable substitute for closed-source models. We provide a detailed description of our approach to fine-tuning and safety improvements of Llama 2-Chat in order to enable the community to build on our work and contribute to the responsible development of LLMs.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
+ file = {/Users/fracapuano/Zotero/storage/VKQFSEUF/Touvron et al. - 2023 - Llama 2 Open Foundation and Fine-Tuned Chat Models.pdf;/Users/fracapuano/Zotero/storage/N6MFUQCF/2307.html}
+}
+
+@article{tsimpoukelli2021multimodalfrozen,
+ title = {Multimodal Few-Shot Learning with Frozen Language Models},
+ author = {Tsimpoukelli, Maria and Menick, Jacob L and Cabi, Serkan and Eslami, {\relax SM} and Vinyals, Oriol and Hill, Felix},
+ year = {2021},
+ journal = {Advances in Neural Information Processing Systems},
+ volume = {34},
+ pages = {200--212}
+}
+
+@article{vallaeys2024improveddepalm,
+ title = {Improved Baselines for Data-Efficient Perceptual Augmentation of {LLMs}},
+ author = {Vallaeys, Th{\'e}ophane and Shukor, Mustafa and Cord, Matthieu and Verbeek, Jakob},
+ year = {2024},
+ journal = {arXiv preprint arXiv:2403.13499},
+ eprint = {2403.13499},
+ archiveprefix = {arXiv}
+}
+
+@article{wang2025internvideo2,
+ title = {{InternVideo2.5}: {Empowering} Video {MLLMs} with Long and Rich Context Modeling},
+ author = {Wang, Yi and Li, Xinhao and Yan, Ziang and He, Yinan and Yu, Jiashuo and Zeng, Xiangyu and Wang, Chenting and Ma, Changlian and Huang, Haian and Gao, Jianfei and others},
+ year = {2025},
+ journal = {arXiv preprint arXiv:2501.12386},
+ eprint = {2501.12386},
+ archiveprefix = {arXiv}
+}
+
+@misc{zhaiSigmoidLossLanguage2023,
+ title = {Sigmoid {Loss} for Language Image Pre-Training},
+ author = {Zhai, Xiaohua and Mustafa, Basil and Kolesnikov, Alexander and Beyer, Lucas},
+ year = {2023},
+ month = sep,
+ number = {arXiv:2303.15343},
+ eprint = {2303.15343},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2303.15343},
+ urldate = {2025-09-09},
+ abstract = {We propose a simple pairwise Sigmoid loss for Language-Image Pre-training (SigLIP). Unlike standard contrastive learning with softmax normalization, the sigmoid loss operates solely on image-text pairs and does not require a global view of the pairwise similarities for normalization. The sigmoid loss simultaneously allows further scaling up the batch size, while also performing better at smaller batch sizes. Combined with Locked-image Tuning, with only four TPUv4 chips, we train a SigLiT model that achieves 84.5% ImageNet zero-shot accuracy in two days. The disentanglement of the batch size from the loss further allows us to study the impact of examples vs pairs and negative to positive ratio. Finally, we push the batch size to the extreme, up to one million, and find that the benefits of growing batch size quickly diminish, with a more reasonable batch size of 32k being sufficient. We release our models at https://github.com/google-research/big_vision and hope our research motivates further explorations in improving the quality and efficiency of language-image pre-training.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition},
+ file = {/Users/fracapuano/Zotero/storage/Z39H5W8R/Zhai et al. - 2023 - Sigmoid Loss for Language Image Pre-Training.pdf;/Users/fracapuano/Zotero/storage/IYX9QALK/2303.html}
+}
+
+@article{zhang2025videollama,
+ title = {VideoLLaMA 3: {Frontier} Multimodal Foundation Models for Image and Video Understanding},
+ author = {Zhang, Boqiang and Li, Kehan and Cheng, Zesen and Hu, Zhiqiang and Yuan, Yuqian and Chen, Guanzheng and Leng, Sicong and Jiang, Yuming and Zhang, Hang and Li, Xin and others},
+ year = {2025},
+ journal = {arXiv preprint arXiv:2501.13106},
+ eprint = {2501.13106},
+ archiveprefix = {arXiv}
+}
+
+@misc{zhangWoCoCoLearningWholeBody2024,
+ title = {WoCoCo: Learning Whole-Body Humanoid Control with {Sequential Contacts}},
+ shorttitle = {WoCoCo},
+ author = {Zhang, Chong and Xiao, Wenli and He, Tairan and Shi, Guanya},
+ year = {2024},
+ month = nov,
+ number = {arXiv:2406.06005},
+ eprint = {2406.06005},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2406.06005},
+ urldate = {2025-08-26},
+ abstract = {Humanoid activities involving sequential contacts are crucial for complex robotic interactions and operations in the real world and are traditionally solved by model-based motion planning, which is time-consuming and often relies on simplified dynamics models. Although model-free reinforcement learning (RL) has become a powerful tool for versatile and robust whole-body humanoid control, it still requires tedious task-specific tuning and state machine design and suffers from long-horizon exploration issues in tasks involving contact sequences. In this work, we propose WoCoCo (Whole-Body Control with Sequential Contacts), a unified framework to learn whole-body humanoid control with sequential contacts by naturally decomposing the tasks into separate contact stages. Such decomposition facilitates simple and general policy learning pipelines through task-agnostic reward and sim-to-real designs, requiring only one or two task-related terms to be specified for each task. We demonstrated that end-to-end RL-based controllers trained with WoCoCo enable four challenging whole-body humanoid tasks involving diverse contact sequences in the real world without any motion priors: 1) versatile parkour jumping, 2) box loco-manipulation, 3) dynamic clap-and-tap dancing, and 4) cliffside climbing. We further show that WoCoCo is a general framework beyond humanoid by applying it in 22-DoF dinosaur robot loco-manipulation tasks.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Graphics,Computer Science - Robotics,Computer Science - Systems and Control,Electrical Engineering and Systems Science - Systems and Control},
+ file = {/Users/fracapuano/Zotero/storage/2SYII7A2/Zhang et al. - 2024 - WoCoCo Learning Whole-Body Humanoid Control with Sequential Contacts.pdf;/Users/fracapuano/Zotero/storage/C6ZJPZEV/2406.html}
+}
+
+@misc{zhaoLearningFineGrainedBimanual2023,
+ title = {Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware},
+ author = {Zhao, Tony Z. and Kumar, Vikash and Levine, Sergey and Finn, Chelsea},
+ year = {2023},
+ month = apr,
+ number = {arXiv:2304.13705},
+ eprint = {2304.13705},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ doi = {10.48550/arXiv.2304.13705},
+ urldate = {2025-08-26},
+ abstract = {Fine manipulation tasks, such as threading cable ties or slotting a battery, are notoriously difficult for robots because they require precision, careful coordination of contact forces, and closed-loop visual feedback. Performing these tasks typically requires high-end robots, accurate sensors, or careful calibration, which can be expensive and difficult to set up. Can learning enable low-cost and imprecise hardware to perform these fine manipulation tasks? We present a low-cost system that performs end-to-end imitation learning directly from real demonstrations, collected with a custom teleoperation interface. Imitation learning, however, presents its own challenges, particularly in high-precision domains: errors in the policy can compound over time, and human demonstrations can be non-stationary. To address these challenges, we develop a simple yet novel algorithm, Action Chunking with Transformers (ACT), which learns a generative model over action sequences. ACT allows the robot to learn 6 difficult tasks in the real world, such as opening a translucent condiment cup and slotting a battery with 80-90% success, with only 10 minutes worth of demonstrations. Project website: https://tonyzhaozh.github.io/aloha/},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Machine Learning,Computer Science - Robotics},
+ file = {/Users/fracapuano/Zotero/storage/4P7GCF3I/Zhao et al. - 2023 - Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware.pdf;/Users/fracapuano/Zotero/storage/3BC9S3Z2/2304.html}
+}
+
+@misc{zhongPracticalBlockwiseNeural2018,
+ title = {Practical Block-wise Neural Network Architecture Generation},
+ author = {Zhong, Zhao and Yan, Junjie and Wu, Wei and Shao, Jing and Liu, Cheng-Lin},
+ year = {2018},
+ month = may,
+ number = {arXiv:1708.05552},
+ eprint = {1708.05552},
+ primaryclass = {cs},
+ publisher = {arXiv},
+ urldate = {2023-05-05},
+ abstract = {Convolutional neural networks have gained a remarkable success in computer vision. However, most usable network architectures are hand-crafted and usually require expertise and elaborate design. In this paper, we provide a block-wise network generation pipeline called BlockQNN which automatically builds high-performance networks using the Q-Learning paradigm with epsilon-greedy exploration strategy. The optimal network block is constructed by the learning agent which is trained sequentially to choose component layers. We stack the block to construct the whole auto-generated network. To accelerate the generation process, we also propose a distributed asynchronous framework and an early stop strategy. The block-wise generation brings unique advantages: (1) it performs competitive results in comparison to the hand-crafted state-of-the-art networks on image classification, additionally, the best network generated by BlockQNN achieves 3.54% top-1 error rate on CIFAR-10 which beats all existing auto-generate networks. (2) in the meanwhile, it offers tremendous reduction of the search space in designing networks which only spends 3 days with 32 GPUs, and (3) moreover, it has strong generalizability that the network built on CIFAR also performs well on a larger-scale ImageNet dataset.},
+ archiveprefix = {arXiv},
+ keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
+ file = {/Users/fracapuano/Zotero/storage/7ZJWPCRW/Zhong et al. - 2018 - Practical Block-wise Neural Network Architecture G.pdf;/Users/fracapuano/Zotero/storage/ZI2R395F/Zhong et al. - 2018 - Practical Block-wise Neural Network Architecture G.html}
+}
+
+@inproceedings{zhu2024minigpt,
+ title = {MiniGPT-4: {Enhancing} Vision-Language Understanding with Advanced Large Language Models},
+ booktitle = {The Twelfth International Conference on Learning Representations},
+ author = {Zhu, Deyao and Chen, Jun and Shen, Xiaoqian and Li, Xiang and Elhoseiny, Mohamed},
+ year = {2024}
+}
+
+@misc{zotero-item-169,
+ type = {Misc}
}
diff --git a/app/src/content/chapters/00_abstract.mdx b/app/src/content/chapters/00_abstract.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..2d1252d23e8614a7e96b7293c40af70b15f5f982
--- /dev/null
+++ b/app/src/content/chapters/00_abstract.mdx
@@ -0,0 +1,7 @@
+Robot learning is at an inflection point, driven by rapid advancements in machine learning and the growing availability of large-scale robotics data.
+This shift from classical, model-based methods to data-driven, learning-based paradigms is unlocking unprecedented capabilities in autonomous systems.
+This tutorial navigates the landscape of modern robot learning, charting a course from the foundational principles of Reinforcement Learning and Behavioral Cloning to generalist, language-conditioned models capable of operating across diverse tasks and even robot embodiments.
+This work is intended as a guide for researchers and practitioners, and our goal is to equip the reader with the conceptual understanding and hands-on tools necessary to understand and contribute to developments in robot learning.\
+
+Code: **[https://github.com/huggingface/lerobot](https://github.com/huggingface/lerobot)**\
+Date: **2025-09-17**
\ No newline at end of file
diff --git a/app/src/content/chapters/01_introduction.mdx b/app/src/content/chapters/01_introduction.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..3c3a9a3c0f62594b443cae8ff4b3ef7691e775fa
--- /dev/null
+++ b/app/src/content/chapters/01_introduction.mdx
@@ -0,0 +1,107 @@
+import ResponsiveImage from '../../components/ResponsiveImage.astro'
+import Ch1LerobotFigure1 from '../assets/image/ch1/ch1-lerobot-figure1.png'
+
+
+# Introduction
+
+
+
+***LeRobot** is the open-source library for end-to-end robotics developed by Hugging Face. The library is vertically integrated on the entire robotics stack, supporting low-level control of real-world robot devices, advanced data and inference optimizations, as well as SOTA robot learning methods with simple implementations in pure Pytorch.*
+
+Autonomous robotics holds the promise of relieving humans from repetitive, tiring or dangerous manual tasks.
+Consequently, the field of robotics has been widely studied since its first inception in the 1950s.
+Lately, advancements in Machine Learning (ML) have sparked the development of a relatively new class of methods used to tackle robotics problems, leveraging large amounts of data and computation rather than human expertise and modeling skills to develop autonomous systems.
+
+The frontier of robotics research is indeed increasingly moving away from the classical model-based control paradigm, embracing the advancements made in ML, aiming to unlock (1) monolithic perception-to-action control pipelines and (2) multi-modal data-driven feature extraction strategies, together with (3) reduced reliance on precise models of the world and (4) a better positioning to benefit from the growing availability of open robotics data.
+While central problems in manipulation, locomotion and whole-body control demand knowledge of rigid-body dynamics, contact modeling, and planning under uncertainty, recent results seem to indicate learning can prove just as effective as explicit modeling, sparking interest in the field of *robot learning*.
+This interest can be largely justified considering the significant challenges related to deriving accurate models of robot-environment interactions.
+
+Moreover, since end-to-end learning on ever-growing collections of text and image data has historically been at the core of the development of *foundation models* capable of semantic reasoning across multiple modalities (images, text, audio, etc.), deriving robotics methods grounded in learning appears particularly consequential, especially as the number of openly available datasets continues to grow.
+
+Robotics is, at its core, an inherently multidisciplinary field, requiring a wide range of expertise in both *software* and *hardware*.
+The integration of learning-based techniques further broadens this spectrum of skills, raising the bar for both research and practical applications.
+**LeRobot** is an open-source library designed to integrate end-to-end with the entire robotics stack, with a strong focus on accessible, real-world robots for manipulation, locomotion and even whole-body control.
+**LeRobot** also implements a modular interface to extend support for other robot platforms with relatively low effort.
+The library introduces 'LeRobotDataset', currently being used by the community to efficiently record and share datasets.
+**LeRobot** also supports many state-of-the-art (SOTA) algorithms in robot learning---mainly based on Reinforcement Learning (RL) and Behavioral Cloning (BC) techniques---with efficient implementations in Pytorch, and extended support for experimentation and experiment tracking.
+Lastly, **LeRobot** defines a custom, optimized inference stack for robotic policies decoupling action planning from action execution, proving effective in guaranteeing more adaptability at runtime.
+
+This tutorial serves the double purpose of providing useful references for the Science behind---and practical use of---common robot learning techniques.
+To this aim, we strive to provide a rigorous yet concise overview of the core concepts behind the techniques presented, paired with practical examples of how to use such techniques concretely, with code examples in **LeRobot**, for researchers and practitioners interested in the field of robot learning.
+This tutorial is structured as follows:
+
+- the following section reviews classical robotics foundations, introducing the limitations of dynamics-based approaches to robotics.
+
+- the following section elaborates on the limitations of dynamics-based methods, and introduces RL as a practical approach to solve robotics problems, considering its upsides and potential limitations.
+
+- the following section further describes robot learning techniques that address single-task learning, leveraging BC techniques to autonomously reproduce specific expert demonstrations.
+
+- the following section presents recent contributions on developing generalist models for robotics applications, by learning from large corpora of multi-task & multi-robot data (*robotics foundation models*).
+
+Our goal with this tutorial is to provide an intuitive explanation of the reasons various disparate ideas from Machine Learning (ML) have converged and are powering the current evolution of Robotics, driving the unprecedented progress we see today.
+We complement our presentation of the most common and recent approaches in robot learning with practical code implementations using **LeRobot**, and start here by presenting the dataset format introduced with **LeRobot**.
+
+## 'LeRobotDataset'
+
+'LeRobotDataset' is a standardized dataset format designed to address the specific needs of robot learning research, and it provides a unified and convenient access to robotics data across modalities, including sensorimotor readings, multiple camera feeds and teleoperation status.
+'LeRobotDataset' also accommodates storing general information regarding the data being collected, including textual descriptions of the task being performed by the teleoperator, the kind of robot used, and relevant measurement specifics like the frames per second at which the recording of both image and robot state's streams are proceeding.
+
+In this, 'LeRobotDataset' provides a unified interface for handling multi-modal, time-series data, and it is designed to seamlessly integrate with the PyTorch and Hugging Face ecosystems.
+'LeRobotDataset' can be easily extended and customized by users, and it already supports openly available data coming from a variety of embodiments supported in **LeRobot**, ranging from manipulator platforms like the SO-100 arm and ALOHA-2 setup, to real-world humanoid arm and hands, as well as entirely simulation-based datasets, and self-driving cars.
+This dataset format is built to be both efficient for training and flexible enough to accommodate the diverse data types encountered in robotics, while promoting reproducibility and ease of use for users.
+
+### The dataset class design
+
+A core design choice behind 'LeRobotDataset' is separating the underlying data storage from the user-facing API.
+This allows for efficient storage while presenting the data in an intuitive, ready-to-use format.
+
+Datasets are always organized into three main components:
+
+- **Tabular Data**: Low-dimensional, high-frequency data such as joint states and actions are stored in efficient memory-mapped files, and typically offloaded to the more mature `datasets` library by Hugging Face, providing fast access with limited memory consumption.
+
+- **Visual Data**: To handle large volumes of camera data, frames are concatenated and encoded into MP4 files. Frames from the same episode are always grouped together into the same video, and multiple videos are grouped together by camera. To reduce stress on the file system, groups of videos for the same camera view are also broken into multiple sub-directories, after a given threshold number.
+
+- **Metadata**: A collection of JSON files which describe the dataset's structure, serving as the relational counterpart to both the tabular and visual dimensions of data. Metadata include the different feature schema, frame rates, normalization statistics, and episode boundaries.
+
+For scalability, and to support datasets with potentially millions of trajectories (resulting in hundreds of millions or billions of individual camera frames), we merge data from different episodes into the same high-level structure.
+Concretely, this means that any given tabular collection and video will not typically contain information about one episode only, but rather a concatenation of the information available in multiple episodes.
+This keeps the pressure on the file system limited, both locally and on remote storage providers like Hugging Face, though at the expense of leveraging more heavily relational-like, metadata parts of the dataset, which are used to reconstruct information such as at which position, in a given file, an episode starts or ends.
+An example structure for a given 'LeRobotDataset' would appear as follows:
+
+- `meta/info.json`: This is the central metadata file. It contains the complete dataset schema, defining all features (e.g., `observation.state`, `action`), their shapes, and data types. It also stores crucial information like the dataset's frames-per-second (`fps`), **LeRobot**'s version at the time of capture, and the path templates used to locate data and video files.
+
+- `meta/stats.json`: This file stores aggregated statistics (mean, std, min, max) for each feature across the entire dataset, used for data normalization for most policy models and accessible externally via `dataset.meta.stats`.
+
+- `meta/tasks.jsonl`: This file contains the mapping from natural language task descriptions to integer task indices, which are useful for task-conditioned policy training.
+
+- `meta/episodes/*` This directory contains metadata about each individual episode, such as its length, the corresponding task, and pointers to where its data is stored in the dataset's files. For scalability, this information is stored in files rather than a single large JSON file.
+
+- `data/*`: Contains the core frame-by-frame tabular data, using parquet files to allow for fast, memory-mapped access. To improve performance and handle large datasets, data from multiple episodes are concatenated into larger files. These files are organized into chunked subdirectories to keep the size of directories manageable. A single file typically contains data for more than one single episode.
+
+- `videos/*`: Contains the MP4 video files for all visual observation streams. Similar to the `data/` directory, the video footage from multiple episodes is concatenated into single MP4 files. This strategy significantly reduces the number of files in the dataset, which is more efficient for modern filesystems.
+
+## Code Example: Batching a (Streaming) Dataset
+
+This section provides an overview of how to access datasets hosted on Hugging Face using the 'LeRobotDataset' class.
+Every dataset on the Hugging Face Hub contains the three main pillars presented above (Tabular, Visual and relational Metadata), and can be accessed with a single instruction.
+
+In practice, most reinforcement learning (RL) and behavioral cloning (BC) algorithms tend to operate on stacks of observations and actions.
+For the sake of brevity, we will refer to joint spaces, and camera frames with the single term of *frame*.
+For instance, RL algorithms may use a history of previous frames $o_\{t-H_o:t\}$ to mitigate partial observability, and BC algorithms are in practice trained to regress chunks of multiple actions ($a_\{t:t+H_a\}$) rather than single controls.
+To accommodate these specifics of robot learning training, 'LeRobotDataset' provides a native windowing operation, whereby users can define the *seconds* of a given window (before and after) around any given frame, by using the `delta_timestamps` functionality.
+Unavailable frames are appropriately padded, and a padding mask is also returned to filter out the padded frames.
+Notably, this all happens within the 'LeRobotDataset', and is entirely transparent to higher level wrappers commonly used in training ML models such as `torch.utils.data.DataLoader`.
+
+Conveniently, by using 'LeRobotDataset' with a Pytorch `DataLoader` one can automatically collate the individual sample dictionaries from the dataset into a single dictionary of batched tensors for downstream training or inference.
+'LeRobotDataset' also natively supports streaming mode for datasets.
+Users can stream data of a large dataset hosted on the Hugging Face Hub, with a one-line change in their implementation.
+Streaming datasets support high-performance batch processing (ca. 80-100 it/s, varying with connectivity) and high levels of frame randomization, key features for practical BC algorithms which may otherwise be slow or operate on highly non-i.i.d. data.
+This feature is designed to improve accessibility so that large datasets can be processed by users without requiring large amounts of memory and storage.
+
+::: pbox
+Batching a (Streaming) Dataset
+
+``` python
+```
+:::
\ No newline at end of file
diff --git a/app/src/content/chapters/02_classic_robotics.mdx b/app/src/content/chapters/02_classic_robotics.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..40ca176d35afdbbc7a7046c4f125b7024888723b
--- /dev/null
+++ b/app/src/content/chapters/02_classic_robotics.mdx
@@ -0,0 +1,196 @@
+import ResponsiveImage from '../../components/ResponsiveImage.astro'
+import Ch2Approaches from '../assets/image/ch2/ch2-approaches.png'
+import Ch2Platforms from '../assets/image/ch2/ch2-platforms.png'
+import Ch2CostAccessibility from '../assets/image/ch2/ch2-cost-accessibility.png'
+import Ch2So100ToPlanarManipulator from '../assets/image/ch2/ch2-so100-to-planar-manipulator.png'
+import Ch2PlanarManipulatorFree from '../assets/image/ch2/ch2-planar-manipulator-free.png'
+import Ch2PlanarManipulatorFloorBox from '../assets/image/ch2/ch2-planar-manipulator-floor-box.png'
+import Ch2ClassicalLimitations from '../assets/image/ch2/ch2-classical-limitations.png'
+
+
+# Classical Robotics
+
+::: epigraph
+*Know your enemy* [\...]
+
+Sun Tzu
+:::
+
+> **TL;DR**
+> Learning-based approaches to robotics are motivated by the need to (1) generalize across tasks and embodiments (2) reduce dependency on human expertise (3) leverage historical trends on the production of data---all traditionally overlooked by dynamics-based techniques.
+
+## Explicit and Implicit Models
+
+
+
+*Overview of methods to generate motion (clearly non-exhaustive, see [@bekrisStateRobotMotion2024]). The different methods can be grouped based on whether they explicitly (*dynamics-based*) or implicitly (*learning-based*) model robot-environment interactions.*
+
+Robotics is concerned with producing artificial motion in the physical world in useful, reliable and safe fashion.
+Thus, robotics is an inherently multidisciplinary domain: producing autonomous motion in the physical world requires, at the very least, interfacing different software (motion planners) and hardware (motion executors) components.
+Further, knowledge of mechanical, electrical, and software engineering, as well as rigid-body mechanics and control theory, has proven quintessential in robotics since the field first developed in the 1950s.
+More recently, Machine Learning (ML) has also proved effective in robotics, complementing these more traditional disciplines [@connellRobotLearning1993].
+As a direct consequence of its multidisciplinary nature, robotics has developed a rather wide array of methods, all concerned with the main purpose of producing artificial motion in the physical world.
+
+Methods to produce robotic motion range from traditional *explicit* models---*dynamics-based*[^1] methods, leveraging precise descriptions of the mechanics of robots' rigid bodies and their interactions with any obstacles in the environment---to *implicit* models---*learning-based* methods, treating artificial motion as a statistical pattern to learn given multiple sensorimotor readings [@agrawalComputationalSensorimotorLearning,bekrisStateRobotMotion2024].
+A variety of methods have been developed between these two extrema.
+For instance, @hansenTemporalDifferenceLearning2022 show how learning-based systems can benefit from information on the physics of problems, complementing a traditional learning method such as Temporal Difference (TD)-learning [@suttonReinforcementLearningIntroduction2018] with Model-Predictive Control (MPC).
+Conversely, as explicit models may be relying on assumptions proving overly simplistic---or even unrealistic---in practice, learning can prove effective to improve modeling of complex phenomena or complement perception [@mccormacSemanticFusionDense3D2016].
+Such examples aim at demonstrating the richness of approaches to robotics, and the figure above graphically illustrates some of the most relevant techniques.
+Such a list is clearly far from being exhaustive, and we refer to @bekrisStateRobotMotion2024 for a more comprehensive overview of both general and application-specific methods for motion generation.
+In this section, we wish to introduce the inherent benefits of learning-based methods---the core focus of this tutorial.
+
+## Different Types of Motion
+
+
+
+*Different kinds of motions are achieved with potentially very different robotic platforms. From left to right, top to bottom: ViperX, SO-100, Boston Dynamics' Spot, Open-Duck, 1X's NEO, Boston Dynamics' Atlas. This is an example list of robotic platforms and is (very) far from being exhaustive.*
+
+In the vast majority of instances, robotics deals with producing motion via actuating joints connecting nearly entirely-rigid links.
+A key distinction between focus areas in robotics is based on whether the generated motion modifies (1) the absolute state of the environment (via dexterity), (2) the relative state of the robot with respect to its environment (exercising mobility skills), or (3) a combination of the two (see the figure above).
+
+Effects such as (1) are typically achieved *through* the robot, i.e. generating motion to perform an action inducing a desirable modification, effectively *manipulating* the environment (manipulation).
+Motions like (2) may result in changes in the robot's physical location within its environment.
+Generally, modifications to a robot's location within its environment may be considered instances of the general *locomotion* problem, further specified as *wheeled* or *legged* locomotion based on whether a robot makes use of wheels or leg(s) to move in the environment.
+Lastly, an increased level of dynamism in the robot-environment interactions can be obtained by combining (1) and (2), thus designing systems capable of interacting with *and* moving within their environment.
+This category of problems is typically termed *mobile manipulation*, and is characterized by a typically much larger set of control variables compared to either locomotion or manipulation alone.
+
+The traditional body of work developed since the very inception of robotics is increasingly complemented by learning-based approaches.
+ML has indeed proven particularly transformative across the entire robotics stack, first empowering planning-based techniques with improved state estimation used for traditional planning [@tangPerceptionNavigationAutonomous2023] and then end-to-end replacing controllers, effectively yielding perception-to-action methods [@koberReinforcementLearningRobotics].
+Work in producing robots capable of navigating a diverse set of terrains demonstrated the promise of both dynamics and learning-based approaches for locomotion [@griffinWalkingStabilizationUsing2017,jiDribbleBotDynamicLegged2023,leeLearningQuadrupedalLocomotion2020,margolisRapidLocomotionReinforcement2022], and recent works on whole-body control indicated the promise of learning-based approaches to generate rich motion on complex robots, including humanoids [@zhangWoCoCoLearningWholeBody2024,bjorckGR00TN1Open2025].
+Manipulation has also been widely studied, particularly considering its relevance for many impactful use-cases ranging from high-risk applications for humans [@fujitaDevelopmentRobotsNuclear2020,alizadehComprehensiveSurveySpace2024] to manufacturing [@sannemanStateIndustrialRobotics2020].
+While explicit models have proven fundamental in achieving important milestones towards the development of modern robotics, recent works leveraging implicit models proved particularly promising in surpassing scalability and applicability challenges via learning [@koberReinforcementLearningRobotics].
+
+## Example: Planar Manipulation
+
+Robot manipulators typically consist of a series of links and joints, articulated in a chain finally connected to an *end-effector*.
+Actuated joints are considered responsible for generating motion of the links, while the end effector is instead used to perform specific actions at the target location (e.g., grasping/releasing objects via closing/opening a gripper end-effector, using a specialized tool like a screwdriver, etc.).
+
+Recently, the development of low-cost manipulators like the ALOHA [@zhaoLearningFineGrainedBimanual2023], ALOHA-2 [@aldacoALOHA2Enhanced] and SO-100/SO-101 [@knightStandardOpenSO100] platforms significantly lowered the barrier to entry to robotics, considering the increased accessibility of these robots compared to more traditional platforms like the Franka Emika Panda arm (see the figure below).
+
+
+
+*Cheaper, more accessible robots are starting to rival traditional platforms like the Panda arm platforms in adoption in resource-constrained scenarios. The SO-100, in particular, has a cost in the 100s of Euros, and can be entirely 3D-printed in hours, while the industrially-manufactured Panda arm costs tens of thousands of Euros and is not openly available.*
+
+Deriving an intuition as per why learning-based approaches are gaining popularity in the robotics community requires briefly analyzing traditional approaches for manipulation, leveraging tools like forward and inverse kinematics (FK, IK) and control theory.
+Providing a detailed overview of these methods falls (well) out of the scope of this tutorial, and we refer the reader to works including @sicilianoSpringerHandbookRobotics2016, @lynchModernRoboticsMechanics2017, @tedrakeRoboticManipulationPerception and @tedrakeUnderactuatedRoboticsAlgorithms for a much more comprehensive description of these techniques.
+Here, we mostly wish to highlight the benefits of ML over these traditional techniques.
+
+
+
+*The SO-100 arm is a 6-dof manipulator arm. Preventing some of its joints (shoulder pane, wrist flex and wrist roll) from actuating, it can be represented as a traditional 2-dof planar manipulator (the gripper joint in the end-effector is not considered towards the count of the degrees of freedom used to produce motion).*
+
+Consider the (simple) case where a SO-100 is restrained from actuating (1) the shoulder pane and (2) the wrist flex and roll motors.
+This effectively reduces the degrees of freedom of the SO-100 from the original 5+1 (5 joints + 1 gripper) to 2+1 (shoulder lift, elbow flex + gripper).
+As the end-effector does not impact motion in this model, the SO-100 is effectively reduced to the planar manipulator robot presented in Figure the referenced figure, where spheres represent actuators, and solid lines indicate length-$l$ links from the base of the SO-100 to the end-effector (*ee*).
+
+Further, let us make the simplifying assumption that actuators can produce rotations up to $2 \pi$ radians.
+In practice, this is seldom the case due to movement obstructions caused by the robot body itself (for instance, the shoulder lift cannot produce counter-clockwise movement due to the presence of the robot's base used to secure the SO-100 to its support and host the robot bus), but we will introduce movement obstruction at a later stage.
+
+All these simplifying assumptions leave us with the planar manipulator of Figure the referenced figure, free of moving its end-effector by controlling the angles $\theta_1$ and $\theta_2$, jointly referred to as the robot's *configuration*, and indicated with $q = [\theta_1, \theta_2 ] \in [-\pi, +\pi]^2$.
+The axis attached to the joints indicate the associated reference frame, whereas circular arrows indicate the maximal feasible rotation allowed at each joint.
+In this tutorial, we do not cover topics related to spatial algebra, and we instead refer the reader to @lynchModernRoboticsMechanics2017 [Chapter 2] and @tedrakeRoboticManipulationPerception [Chapter 3] for excellent explanations of the mechanics and theoretical foundations of producing motion on rigid bodies.
+
+
+
+*Free to move*
+
+Considering the (toy) example presented in Figure the referenced figure, then we can analytically write the end-effector's position $p \in \mathbb R^2$ as a function of the robot's configuration, $p = p(q), p: \mathcal Q \mapsto \mathbb R^2$.
+In particular, we have:
+$$\begin\{equation*\}
+p(q) =
+\begin\{pmatrix\}
+p_x(\theta_1, \theta_2) \\
+p_y(\theta_1, \theta_2)
+\end\{pmatrix\}
+=
+\begin\{pmatrix\}
+l \cos(\theta_1) + l \cos(\theta_1 + \theta_2) \\
+l \sin(\theta_1) + l \sin(\theta_1 + \theta_2)
+\end\{pmatrix\}
+\in S^\{n=2\}_\{l_1+l_2\} = \{ p(q) \in \mathbb R^2: \Vert p(q) \Vert_2^2 \leq (2l)^2, \ \forall q \in \mathcal Q \}
+\end\{equation*\}$$
+
+Deriving the end-effector's *pose*---position *and* orientation---in some $m$-dimensional space $\vec\{p\} \in \mathcal\{P\} \subset \mathbb\{R\}^\{m\}$ starting from the configuration $q \in \mathcal Q \subset \mathbb R^n$ of a $n$-joint robot is referred to as *forward kinematics* (FK), whereas identifying the configuration corresponding to any given target pose is termed *inverse kinematics* (IK).
+In that, FK is used to map a robot configuration into the corresponding end-effector pose, whereas IK is used to reconstruct the configuration(s) given an end-effector pose.
+
+In the simplified case here considered (for which $\vec\{p\} \equiv p $, as the orientation of the end-effector is disregarded for simplicity), one can solve the problem of controlling the end-effector's location to reach a goal position $ p^*$ by solving analytically for $q: p(q) = f_\{\text\{FK\}\}(q) = p^*$.
+However, in the general case, one might not be able to solve this problem analytically, and can typically resort to iterative optimization methods comparing candidate solutions using a loss function (in the simplest case, $\Vert p(q) - p^* \Vert_2^2$ is a natural candidate), yielding:
+
+$$\min_\{q \in \mathcal Q\} \Vert p(q) - p^* \Vert_2^2 \, .
+$$
+
+Exact analytical solutions to IK are even less appealing when one considers the presence of obstacles in the robot's workspace, resulting in constraints on the possible values of $q \in \mathcal Q \subseteq [-\pi, +\pi]^n \subset \mathbb R^n $ in the general case of $ n$-links robots.
+
+For instance, the robot in Figure the referenced figure is (very naturally) obstructed by the presence of the surface upon which it rests: $\theta_1$ can now exclusively vary within $[0, \pi]$, while possible variations in $\theta_2$ depend on $\theta_1$ (when $\theta_1 \to 0$ or $\theta_1 \to \pi$, further downwards movements are restricted).
+Even for a simplified kinematic model, developing techniques to solve eq. Section eq:ik_problem\ is in general non-trivial in the presence of constraints, particularly considering that the feasible set of solutions $\mathcal Q $ may change across problems.
+Figure the referenced figure provides an example of how the environment influences the feasible set considered, with a new set of constraints deriving from the position of a new obstacle.
+
+However, IK---solving eq. Section eq:ik_problem\ for a feasible $ q$---only proves useful in determining information regarding the robot's configuration in the goal pose, and crucially does not provide information on the *trajectory* to follow over time to reach a target pose.
+Expert-defined trajectories obviate this problem by providing a length-$K$ succession of goal poses $\tau_K = [p^*_0, p^*_1, \dots p^*_K]$ for tracking.
+In practice, trajectories can also be obtained automatically through *motion planning* algorithms, thus avoiding expensive trajectory definition from human experts.
+However, tracking $\tau_K $ via IK can prove prohibitively expensive, as tracking would require $ K$ resolutions of eq. Section eq:ik_problem\ (one for each target pose).
+*Differential* inverse kinematics (diff-IK) complements IK via closed-form solution of a variant of eq. Section eq:ik_problem\.
+Let $J(q)$ denote the Jacobian matrix of (partial) derivatives of the FK-function $f_\text\{FK\}: \mathcal Q \mapsto \mathcal P $, such that $ J(q) = \frac\{\partial f_\{FK\}(q)\}\{\partial q \}$.
+Then, one can apply the chain rule to any $p(q) = f_\{\text\{FK\}\}(q)$, deriving $\dot p = J(q) \dot q$, and thus finally relating variations in the robot configurations to variations in pose, thereby providing a platform for control.
+
+Given a desired end-effector trajectory $\dot\{p\}^*(t)$ (1) indicating anchor regions in space and (2) how much time to spend in each region, diff-IK finds $\dot q(t)$ solving for joints' *velocities* instead of *configurations*,
+
+$$\dot q(t) = \arg\min_\nu \; \lVert J(q(t)) \nu - \dot\{p\}^* (t) \rVert_2^2
+$$
+
+Unlike eq. Section eq:ik_problem\, solving for $\dot q$ is much less dependent on the environment (typically, variations in velocity are constrained by physical limits on the actuators).
+Conveniently, eq. Section eq:reg_ik_velocity\ also often admits the closed-form solution $\dot q = J(q)^+ \dot\{p\}^*$, where $J^+(q)$ denotes the Moore-Penrose pseudo-inverse of $J(q)$.
+Finally, discrete-time joint configurations $q$ can be reconstructed from joint velocities $\dot q$ using forward-integration on the continuous-time joint velocity, $q_\{t+1\} = q_t + \Delta t\,\dot q_t$ for a given $\Delta t$, resulting in tracking via diff-IK.
+
+Following trajectories with diff-IK is a valid option in well-controlled and static environments (e.g., industrial manipulators in controlled manufacturing settings), and relies on the ability to define a set of target velocities to track $[\dot\{p\}^*_0, \dot\{p\}^*_1, \dots, \dot\{p\}^*_k ]$---an error-prone task largely requiring human expertise.
+Furthermore, diff-IK relies on the ability to (1) access $J(q) \, \forall q \in \mathcal Q$ and (2) compute its pseudo-inverse at every iteration of a given control cycle---a challenging assumption in highly dynamical settings, or for complex kinematic chains.
+
+### Adding Feedback Loops
+
+While very effective when a goal trajectory has been well specified, the performance of diff-IK can degrade significantly in the presence of modeling/tracking errors, or in the presence of non-modeled dynamics in the environment.
+
+
+
+One such case is presented in Figure the referenced figure, where another rigid body other than the manipulator is moving in the environment along the horizontal axis, with velocity $\dot x_B$.
+Accounting analytically for the presence of this disturbance---for instance, to prevent the midpoint of the link from ever colliding with the object---requires access to $\dot x_B$ at least, to derive the equation characterizing the motion of the environment.
+
+Less predictable disturbances however (e.g., $\dot x_B \leftarrow \dot x_B + \epsilon, \epsilon \sim \mathcal N(0,1)$) may prove challenging to model analytically, and one could attain the same result of preventing link-object collision by adding a condition on the distance between the midpoint of $l$ and $x_B$, enforced through a feedback loop on the position of the robot and object at each control cycle.
+
+To mitigate the effect of modeling errors, sensing noise and other disturbances, classical pipelines indeed do augment diff-IK with feedback control looping back quantities of interest.
+In practice, following a trajectory with a closed feedback loop might consist in backwarding the error between the target and measured pose, $\Delta p = p^* - p(q)$, hereby modifying the control applied to $\dot q = J(q)^+ (\dot\{p\}^* + k_p \Delta p )$, with $k_p$ defined as the (proportional) gain.
+
+More advanced techniques for control consisting in feedback linearization, PID control, Linear Quadratic Regulator (LQR) or Model-Predictive Control (MPC) can be employed to stabilize tracking and reject moderate perturbations, and we refer to @sicilianoSpringerHandbookRobotics2016 [Chapter 8] for in-detail explanation of these concepts, or [@tedrakeRoboticManipulationPerception Chapter 8] for a simple, intuitive example in the case of a point-mass system.
+Nonetheless, feedback control presents its challenges as well: tuning gains remains laborious and system-specific.
+Further, manipulation tasks present intermittent contacts inducing hybrid dynamics (mode switches) and discontinuities in the Jacobian, challenging the stability guarantees of the controller and thus often necessitating rather conservative gains and substantial hand-tuning.
+
+We point the interested reader to @sicilianoSpringerHandbookRobotics2016 [Chapter 2,7,8], @lynchModernRoboticsMechanics2017 [Chapter 6,11], and @tedrakeRoboticManipulationPerception [Chapter 3,8] for extended coverage of FK, IK, diff-IK and control for (diff-)IK.
+
+## Limitations of Dynamics-based Robotics
+
+Despite the last 60+ years of robotics research, autonomous robots are still largely incapable of performing tasks at human-level performance in the physical world generalizing across (1) robot embodiments (different manipulators, different locomotion platforms, etc.) and (2) tasks (tying shoe-laces, manipulating a diverse set of objects).
+While essential in the early development of robotics, the aforementioned methods require significant human expertise to be used in practice, and are typically specific to a particular applicative problem.
+
+
+
+*Dynamics-based approaches to robotics suffer from several limitations: (1) orchestrating multiple components poses integration challenges; (2) the need to develop custom processing pipelines for the sensing modalities and tasks considered hinders scalability; (3) simplified analytical models of physical phenomena (here friction at the gripper; credits to [@antonovaReinforcementLearningPivoting2017]) limit real-world performance. Lastly, (4) dynamics-based methods overlook trends in the availability and growth of robotics data.*
+
+Dynamics-based robotics pipelines have historically been organized as collections of modules, each developed within most architectures for specific purposes.
+That is, sensing, state estimation, mapping, planning, (diff-)IK, and low-level control have been traditionally developed as distinct modules with fixed interfaces.
+Pipelining these specific modules proved error-prone, and brittleness emerges---alongside compounding errors---whenever changes occur (e.g., changes in lighting for sensing, occlusion/failure of sensors, control failures).
+Adapting such a stack to new tasks or robotic platforms often entails re-specifying objectives, constraints, and heuristics at multiple stages, incurring significant engineering overhead.
+
+Moreover, classical planners operate on compact, assumed-sufficient state representations; extending them to reason directly over raw, heterogeneous and noisy data streams is non-trivial.
+This results in a perception bottleneck, as incorporating high-dimensional perceptual inputs (RGB, depth, tactile, audio) traditionally required extensive engineering efforts to extract meaningful features for control.
+Also, the large number of tasks, coupled with the adoption of *per-task* planners, goal parameterizations, and safety constraints, results in an explosion in design and validation options, with little opportunity to reuse solutions across tasks.
+
+Setting aside integration and scalability challenges: developing accurate modeling of contact, friction, and compliance for complicated systems remains difficult.
+Rigid-body approximations are often insufficient in the presence of deformable objects, limiting the applicability of the methods developed.
+In the case of complex, time-dependent and/or non-linear dynamics, even moderate mismatches in parameters, unmodeled evolutions, or grasp-induced couplings can qualitatively affect the observed dynamics.
+
+Lastly, dynamics-based methods (naturally) overlook the rather recent growth in openly available robotics data.
+The curation of academic datasets by large centralized groups of human experts in robotics [@collaborationOpenXEmbodimentRobotic2025, khazatskyDROIDLargeScaleInTheWild2025] is now increasingly complemented by a growing stream of data collected by individuals with varied expertise.
+If not tangentially, dynamics-based approaches are not poised to maximally benefit from this trend, which holds the premise of allowing generalization in the space of tasks and embodiments, like data was the cornerstone for advancements in vision [@alayracFlamingoVisualLanguage2022] and natural-language understanding [@brownLanguageModelsAre2020].
+
+Taken together, these limitations (Figure Section fig:classical-limitations) motivate the exploration of learning-based approaches that can (1) integrate perception and control more tightly, (2) adapt across tasks and embodiments with reduced expert modeling interventions and (3) scale gracefully in performance as more robotics data becomes available.
+
+[^1]: In here, we refer to both *kinematics* and *dynamics*-based control.
\ No newline at end of file
diff --git a/app/src/content/chapters/03_reinforcement_learning.mdx b/app/src/content/chapters/03_reinforcement_learning.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..4a18748d68b725624f7c7e3642ae9ed14eae2ad5
--- /dev/null
+++ b/app/src/content/chapters/03_reinforcement_learning.mdx
@@ -0,0 +1,317 @@
+import ResponsiveImage from '../../components/ResponsiveImage.astro'
+import Ch3LearningBenefits from '../assets/image/ch3/ch3-learning-benefits.png'
+import Ch3LearningAtlas from '../assets/image/ch3/ch3-learning-atlas.png'
+import Ch3RlExamples from '../assets/image/ch3/ch3-rl-examples.png'
+import Ch3AgentEnv from '../assets/image/ch3/ch3-agent-env.png'
+import Ch3RlAlgorithmsAtlas from '../assets/image/ch3/ch3-rl-algorithms-atlas.png'
+import Ch3DuckSimVsReal from '../assets/image/ch3/ch3-duck-sim-vs-real.png'
+import Ch3ManyDucks from '../assets/image/ch3/ch3-many-ducks.png'
+import Ch3HilSerlExamples from '../assets/image/ch3/ch3-hil-serl-examples.png'
+
+
+# Robot (Reinforcement) Learning
+
+::: epigraph
+*Approximate the solution, not the problem* [\...]
+
+Richard Sutton
+:::
+
+> **TL;DR**
+> The need for expensive high-fidelity simulators can be obviated by learning from real-world data, using sample-efficient algorithms that can safely train directly on hardware.
+
+
+
+*Learning-based robotics streamlines perception-to-action by learning a (1) unified high-level controller capable to take (2) high-dimensional, unstructured sensorimotor information. Learning (3) does not require a dynamics model and instead focuses on interaction data, and (4) empirically correlates with
+the scale of the data used.*
+
+Learning-based techniques for robotics naturally address the limitations presented in the following section\ (Figure Section fig:robot-learning-upsides).
+Learning-based techniques typically rely on prediction-to-action (*visuomotor policies*), thereby directly mapping sensorimotor inputs to predicted actions, streamlining control policies by removing the need to interface multiple components.
+Mapping sensorimotor inputs to actions directly also allows to add diverse input modalities, leveraging the automatic feature extraction characteristic of most modern learning systems.
+Further, learning-based approaches can in principle entirely bypass modeling efforts and instead rely exclusively on interactions data, proving transformative when dynamics are challenging to model or even entirely unknown.
+Lastly, learning for robotics (*robot learning*) is naturally well posed to leverage the growing amount of robotics data openly available, just as computer vision first and natural language processing later did historically benefit from large scale corpora of (possibly non curated) data, in great part overlooked by dynamics-based approaches.
+
+Being a field at its relatively nascent stages, no prevalent technique(s) proved distinctly better in robot learning.
+Still, two major classes of methods gained prominence: reinforcement learning (RL) and behavioral cloning (BC) (Figure Section fig:robot-learning-atlas).
+In this section, we provide a conceptual overview of applications of the former to robotics, as well as introduce practical examples of how to use RL within **LeRobot**.
+We then introduce the major limitations RL suffers from, to introduce BC techniques in the next sections (the following section, sec:learning-bc-generalist).
+
+
+
+*Overview of the robot learning methods implemented in **LeRobot**.*
+
+In Figure the referenced figure we decided to include generalist robot models [@blackPi0VisionLanguageActionFlow2024,shukorSmolVLAVisionLanguageActionModel2025] alongside task-specific BC methods.
+While significantly different in spirit---*generalist* models are language-conditioned and use instructions to generate motion valid across many tasks, while *task-specific* models are typically not language-conditioned and used to perform a single task---foundation models are largely trained to reproduce trajectories contained in a large training set of input demonstrations.
+Thus, we argue generalist policies can indeed be grouped alongside other task-specific BC methods, as they both leverage similar training data and schemas.
+
+Figure the referenced figure illustrates this categorization graphically, explicitly listing all the robot learning policies currently available in **LeRobot**: Action Chunking with Transformers (ACT) [@zhaoLearningFineGrainedBimanual2023], Diffusion Policy [@chiDiffusionPolicyVisuomotor2024], Vector-Quantized Behavior Transformer (VQ-BeT) [@leeBehaviorGenerationLatent2024], $\pi_0$ [@blackPi0VisionLanguageActionFlow2024], SmolVLA [@shukorSmolVLAVisionLanguageActionModel2025], Human-in-the-loop Sample-efficient RL (HIL-SERL) [@luoPreciseDexterousRobotic2024] and TD-MPC [@hansenTemporalDifferenceLearning2022].
+
+
+
+*Examples of two different robotics tasks performed using RL. In the manipulation task (A) an agent learns to reach for a yellow plastic block in its environment, and to put it inside of a box. In the locomotion task (B) an agent learns to move its center of mass sideways without falling.*
+
+Applications of RL to robotics have been long studied, to the point the relationship between these two disciplines has been compared to that between physics and mathematics [@koberReinforcementLearningRobotics].
+Indeed, due to their interactive and sequential nature, many robotics problems can be directly mapped to RL problems.
+Figure the referenced figure depicts two of such cases.
+Reaching for an object to move somewhere else in the scene is indeed a sequential problem where at each cycle the controller needs to adjust the position of the robotic arm based on its current configuration and the (possibly varying) position of the object.
+Figure the referenced figure also shows an example of a locomotion problem, where sequentiality is inherent in the problem formulation.
+While sliding to the side, the controller has to constantly keep adjusting to the robot's proprioception to avoid failure (falling).
+
+## A (Concise) Introduction to RL
+
+The RL framework [@suttonReinforcementLearningIntroduction2018], which we briefly introduce here, has often been used to model robotics problems [@koberReinforcementLearningRobotics].
+RL is a subfield within ML fundamentally concerned with the development of autonomous systems (*agents*) learning how to *continuously behave* in an evolving environment, developing (ideally, well-performing) control strategies (*policies*).
+Crucially for robotics, RL agents can improve via trial-and-error only, thus entirely bypassing the need to develop explicit models of the problem dynamics, and rather exploiting interaction data only.
+In RL, this feedback loop (Figure Section fig:rl-most-famous-pic) between actions and outcomes is established through the agent sensing a scalar quantity (*reward*).
+
+
+
+*Agent-Environment interaction diagram (image credits to [@suttonReinforcementLearningIntroduction2018]).*
+
+Formally, interactions between an agent and its environment are typically modeled via a Markov Decision Process (MDP) [@bellmanMarkovianDecisionProcess1957].
+Representing robotics problems via MDPs offers several advantages, including (1) incorporating uncertainty through MDP's inherently stochastic formulation and (2) providing a theoretically sound framework for learning *without* an explicit dynamic model.
+While accommodating also a continuous time formulation, MDPs are typically considered in discrete time in RL, thus assuming interactions to atomically take place over the course of discrete *timestep* $t=0,1,2,3, \dots, T $.
+MDPs allowing for an unbounded number of interactions ( $ T \to + \infty $ ) are typically termed *infinite-horizon*, and opposed to *finite-horizon* MDPs in which $ T$ cannot grow unbounded.
+Unless diversely specified, we will only be referring to discrete-time finite-horizon (*episodic*) MDPs here.
+
+Formally, a length-$T$ Markov Decision Process (MDP) is a tuple $\mathcal M = \langle \mathcal\{S\}, \mathcal\{A\}, \mathcal\{D\}, r, \gamma, \rho, T \rangle$, where:
+
+- $\mathcal\{S\}$ is the *state space*; $s_t \in \mathcal\{S\}$ denotes the (possibly non-directly observable) environment state at time $t $. In robotics, states often comprise robot configuration and velocities ($ q_t, \dot q_t$), and can accommodate sensor readings such as camera or audio streams.
+
+- $\mathcal\{A\}$ is the *action space*; $a_t \in \mathcal\{A\}$ may represent joint torques, joint velocities, or even end-effector commands. In general, actions correspond to commands intervening on the configuration of the robot.
+
+- $\mathcal\{D\}$ represents the (possibly non-deterministic) environment dynamics, with $\mathcal\{D\}: \mathcal\{S\} \times \mathcal\{A\} \times \mathcal\{S\} \mapsto [0, 1]$ corresponding to $\mathcal\{D\} \, (s_t, a_t, s_\{t+1\}) = \mathbb\{P\}(s_\{t+1\} | s_t, a_t)$. For instance, for a planar manipulator dynamics could be considered deterministic when the environment is fully described (Figure Section fig:planar-manipulation-simple), and stochastic when unmodeled disturbances depending on non-observable parameters intervene (Figure Section fig:planar-manipulator-box-velocity).
+
+- $r: \mathcal\{S\} \times \mathcal\{A\} \times \mathcal\{S\} \to \mathbb R$ is the *reward function*, weighing the transition $(s_t, a_t, s_\{t+1\})$ in the context of the achievement of an arbitrary goal. For instance, a simple reward function for quickly moving the robot along the $x$ axis in 3D-space (Figure Section fig:robotics-with-rl-examples) could be based on the absolute position of the robot along the $x$ axis ($p_x$), present negative penalties for falling over (measured from $p_z$) and introduce bonuses $\dot p_x$ for speed, $r (s_t, a_t, s_\{t+1\}) \equiv r(s_t) = p_\{x_t\} \cdot \dot p_\{x_t\} - \tfrac\{1\}\{p_\{z_t\}\}$.
+
+Lastly, $\gamma \in [0,1]$ represents the discount factor regulating preference for immediate versus long-term reward (with an effective horizon equal to $\tfrac\{1\}\{1-\gamma\}$), and $\rho$ is the distribution, defined over $\mathcal\{S\}$, the MDP's *initial* state is sampled from, $s_0 \sim \rho $.
+
+A length-$T$ *trajectory* is the (random) sequence
+$$\tau = (s_0, a_0, r_0, s_1, a_1, r_1, \dots, s_\{T-1\}, a_\{T-1\}, r_\{T-1\}, s_T),$$
+with per-step rewards defined as $r_t = r(s_t, a_t, s_\{t+1\})$ for ease of notation. Interestingly, assuming both the environment dynamics and conditional distribution over actions given states---the *policy*---to be *Markovian*:
+
+$$\begin\{aligned\}
+\mathbb P(s_\{t+1\} \vert s_t, a_t, s_\{t-1\}, a_\{t-1\}, \dots, s_0, a_0 ) &= \mathbb P (s_\{t+1\} \vert s_t, a_t) \\
+\mathbb P(a_t \vert s_t, a_\{t-1\}, s_\{t-1\}, \dots, s_0, a_0) &= \mathbb P(a_t \vert s_t)
+\end\{aligned\}$$
+
+The probability of observing a given trajectory $\tau$ factorizes into
+$$\mathbb P(\tau) = \mathbb P (s_0) \prod_\{t=0\}^\{T-1\} \mathbb P (s_\{t+1\} \vert s_t, a_t) \ \mathbb P(a_t \vert s_t).$$
+
+Policies $\mathbb P(a_t \vert s_t)$ are typically indicated as $\pi(a_t \vert s_t)$, and often parametrized via $\theta$, yielding $\pi_\theta (a_t \vert s_t )$.
+Policies are trained optimizing the (discounted) *return* associated to a given $\tau$, i.e. the (random) sum of measured rewards over trajectory:
+$$G(\tau) = \sum_\{t=0\}^\{T-1\} \gamma^\{t\} r_t.$$
+In that, agents seek to learn control strategies (*policies*, $\pi_\theta$) maximizing the expected return $\mathbb E_\{\tau \sim \pi_\theta\} G(\tau)$.
+For a given dynamics $\mathcal D$---i.e., for a given problem---taking the expectation over the (possibly random) trajectories resulting from acting according to a certain policy provides a direct, goal-conditioned ordering in the space of all the possible policies $\Pi $, yielding the (maximization) target $ J : \Pi \mapsto \mathbb R$
+
+$$\begin\{aligned\}
+J(\pi_\theta) &= \mathbb E_\{\tau \sim \mathbb P_\{\theta; \mathcal D\}\} \left[ G(\tau) \right], \\
+\mathbb P_\{\theta; \mathcal D\} (\tau) &= \rho \prod_\{t=0\}^\{T-1\} \mathcal D (s_t, a_t, s_\{t+1\}) \ \pi_\theta (a_t \vert s_t).
+\end\{aligned\}$$
+
+Because in the RL framework the agent is assumed to only be able to observe the environment dynamics and not to intervene on them, the objective $J(\pi_\theta)$ varies exclusively with the policy followed.
+In turn, MDPs naturally provide a framework to optimize over the space of the possible behaviors an agent might enact ($\pi \in \Pi$), searching for the *optimal policy* $\pi^* = \arg \max_\{\theta\} J(\pi_\theta)$, where $\theta$ is the parametrization adopted by the policy set $\Pi: \pi_\theta \in \Pi, \ \forall \theta $.
+Other than providing a target for policy search, $ G(\tau)$ can also be used as a target to discriminate between states and state-action pairs.
+Given any state $s \in \mathcal\{S\}$---e.g., a given configuration of the robot---the *state-value* function
+$$V_\pi(s) = \mathbb E_\{\tau \sim \pi\} \left[ G(\tau) \big \vert s_0 = s \right]$$
+can be used to discriminate between desirable and undesirable states in terms of long-term (discounted) reward maximization, under a given policy $\pi$.
+Similarly, the *state-action* value function also conditions the cumulative discounted reward on selecting action $ a$ when in $s$, and thereafter acting according to $\pi $:
+$$Q_\pi(s,a) = \mathbb E_\{\tau \sim \pi\} \left[ G (\tau) \big \vert s_0 = s, a_0=a \right]$$
+Crucially, value functions are interrelated:
+
+$$\begin\{aligned\}
+Q_\pi(s_t, a_t) &= \mathbb\{E\}_\{s_\{t+1\} \sim \mathbb P(\bullet \vert s_t, a_t)\} \left[ r_t + \gamma V_\pi(s_\{t+1\}) \right] \\
+V_\pi(s_t) &= \mathbb E_\{a_t \sim \pi(\bullet \vert s_t)\} \left[ Q_\pi (s_t, a_t) \right]
+\end\{aligned\}$$
+
+Inducing an ordering over states and state-action pairs under $\pi$, value functions are central to most RL algorithms.
+A variety of methods have been developed in RL as standalone attempts to find (approximate) solutions to the problem of maximizing cumulative reward (Figure Section fig:rl-algos-atlas).
+
+
+
+*Popular RL algorithms. See [@SpinningUp2018] for a complete list of citations.*
+
+Popular approaches to continuous state and action space---such as those studied within robotics---include @schulmanTrustRegionPolicy2017, @schulmanProximalPolicyOptimization2017, and @haarnojaSoftActorCriticOffPolicy2018.
+Across manipulation [@akkayaSolvingRubiksCube2019] and locomotion [@leeLearningQuadrupedalLocomotion2020] problems, RL proved extremely effective in providing a platform to (1) adopt a unified, streamlined perception-to-action pipeline, (2) natively integrate proprioception with multi-modal high-dimensional sensor streams, (3) disregard a description of the environment dynamics, by focusing on observed interaction data rather than modeling, and (4) anchor policies in the experience collected and stored in datasets.
+For a more complete survey of applications of RL to robotics, we refer the reader to @koberReinforcementLearningRobotics and @tangDeepReinforcementLearning2024.
+
+## Real-world RL for Robotics
+
+Streamlined end-to-end control pipelines, data-driven feature extraction and a disregard for explicit modeling in favor of interaction data are all features of RL for robotics.
+However, particularly in the context of real-world robotics, RL still suffers from limitations concerning machine safety and learning efficiency.
+
+First, especially early in training, exploration can be unsafe on real hardware.
+On physical systems, untrained policies may command high velocities, self-colliding configurations, or torques exceeding joint limits, leading to wear and potential hardware damage.
+Mitigating these risks requires external safeguards (e.g., watchdogs, safety monitors, emergency stops), often incurring a high degree of human supervision.
+Further, in the typical episodic setting considered in most robotics problems, experimentation is substantially slowed down by the need to manually reset the environment over the course of training, a time-consuming and brittle process.
+
+Second, learning with a limited number of samples remains problematic in RL (*sample inefficiency*).
+Even strong algorithms such as SAC [@haarnojaSoftActorCriticOffPolicy2018] typically require a large number of transitions $\{ (s_t, a_t, r_t, s_\{t+1\}) \}_\{t=1\}^N$.
+On hardware, generating these data is time-consuming and can even be prohibitive.
+
+
+
+*Simulated (left) vs. real-world (right) OpenDuck. Discrepancies in the simulation dynamics (*reality gap*) pose risks to policy transfer.*
+
+Training RL policies in simulation [@tobinDomainRandomizationTransferring2017] addresses both issues: it eliminates physical risk and dramatically increases throughput.
+Yet, simulators require significant modeling effort, and rely on assumptions (simplified physical modeling, instantaneous actuation, static environmental conditions, etc.) limiting the transfer of policies learned in simulation due to the discrepancy between real and simulated environments (*reality gap*, Figure Section fig:synthetic-vs-real-duck).
+*Domain randomization* (DR) is a popular technique to overcome the reality gap, consisting in randomizing parameters of the simulated environment during training, to induce robustness to specific disturbances.
+In turn, DR is employed to increase the diversity of scenarios over the course of training, improving the chances of sim-to-real transfer [@akkayaSolvingRubiksCube2019,antonovaReinforcementLearningPivoting2017,jiDribbleBotDynamicLegged2023].
+In practice, DR is performed further parametrizing the *simulator*'s dynamics $\mathcal D \equiv \mathcal D_\xi$ with a *dynamics* (random) vector $\xi$ drawn from an arbitrary distribution, $\xi \sim \Xi$.
+Over the course of training---typically at each episode's reset---a new $\xi$ is drawn, and used to specify the environment's dynamics for that episode.
+For instance, one could decide to randomize the friction coefficient of the surface in a locomotion task (Figure Section fig:ducks-on-terrains), or the center of mass of an object for a manipulation task.
+
+
+
+*The same locomotion task can be carried out in different (simulated) domains (exemplified by the difference in terrains) at training time, resulting in increased robustness over diverse environment dynamics.*
+
+While effective in transferring policies across the reality gap in real-world robotics [@tobinDomainRandomizationTransferring2017,akkayaSolvingRubiksCube2019, jiDribbleBotDynamicLegged2023,tiboniDomainRandomizationEntropy2024], DR often requires extensive manual engineering.
+First, identifying which parameters to randomize---i.e., the *support* $\text\{supp\} (\Xi)$ of $\Xi$---is an inherently task specific process.
+When locomoting over different terrains, choosing to randomize the friction coefficient is a reasonable choice, yet not entirely sufficient as other factors (lighting conditions, external temperature, joints' fatigue, etc.) may prove just as important, making the selection of these parameters yet another source of brittleness.
+
+Selecting the dynamics distribution $\Xi$ is also non-trivial.
+On the one hand, distributions with low entropy risk causing failure at transfer time, due to the limited robustness induced over the course of training.
+On the other hand, excessive randomization may cause over-regularization and hinder performance.
+Consequently, the research community investigated approaches to automatically select the randomization distribution $\Xi$, using signals from the training process or tuning it to reproduce observed real-world trajectories.
+ @akkayaSolvingRubiksCube2019 use a parametric uniform distribution $\mathcal U(a, b)$ as $\Xi$, widening the bounds as training progresses and the agent's performance improves (AutoDR).
+While effective, AutoDR requires significant tuning---the bounds are widened by a fixed, pre-specified amount $\Delta$---and may disregard data when performance *does not* improve after a distribution update [@tiboniDomainRandomizationEntropy2024].
+ @tiboniDomainRandomizationEntropy2024 propose a similar method to AutoDR (DORAEMON) to evolve $\Xi$ based on training signal, but with the key difference of explicitly maximizing the entropy of parametric Beta distributions, inherently more flexible than uniform distributions.
+DORAEMON proves particularly effective at dynamically increasing the entropy levels of the training distribution by employing a max-entropy objective, under performance constraints formulation.
+Other approaches to automatic DR consist in specifically tuning $\Xi$ to align as much as possible the simulation and real-world domains.
+For instance, @chebotar2019closing interleave in-simulation policy training with repeated real-world policy rollouts used to adjust $\Xi$ based on real-world data, while @tiboniDROPOSimtoRealTransfer2023 leverage a single, pre-collected set of real-world trajectories and tune $\Xi$ under a simple likelihood objective.
+
+While DR has shown promise, it does not address the main limitation that, even under the assumption that an ideal distribution $\Xi$ to sample from was indeed available, many robotics problems remain difficult to simulate accurately in the first place.
+Simulating contact-rich manipulation of possibly deformable or soft materials---i.e., *folding a piece of clothing*---can be costly and even time-intensive, limiting the benefits of in-simulation training.
+
+A perhaps more fundamental limitation of RL for robotics is the general unavailability of complicated tasks' *dense* reward function, the design of which is essentially based on human expertise and trial-and-error.
+In practice, *sparse* reward functions can be used to conclude whether one specific goal has been attained---*has this t-shirt been correctly folded?*---but unfortunately result in a more challenging learning problem.
+As a result, despite notable successes, deploying RL directly on real-world robots at scale remains challenging.
+
+To make the most of (1) the growing number of openly available datasets and (2) relatively inexpensive robots like the SO-100, RL could (1) be anchored in already-collected trajectories---limiting erratic and dangerous exploration---and (2) train in the real-world directly---bypassing the aforementioned issues with low-fidelity simulations.
+In such a context, sample-efficient learning is also paramount, as training on the real-world is inherently time-bottlenecked.
+
+Off-policy algorithms like Soft Actor-Critic (SAC) [@haarnojaSoftActorCriticOffPolicy2018] tend to be more sample efficient than their on-policy counterparts [@schulmanProximalPolicyOptimization2017], due to the presence of a *replay buffer* used over the course of training.
+Other than allowing the re-use of transitions $(s_t, a_t, r_t, s_\{t+1\})$ over the course of training, the replay buffer can also accommodate the injection of previously-collected data in the training process [@ballEfficientOnlineReinforcement2023].
+Using expert demonstrations to guide learning together with learned rewards, RL training can effectively be carried out in the real-world [@luoSERLSoftwareSuite2025].
+Interestingly, when complemented with in-training human interventions, real-world RL agents have been shown to learn policies with near-perfect success rates on challenging manipulation tasks in 1-2 hours [@luoPreciseDexterousRobotic2024].
+
+#### Sample-efficient RL
+
+In an MDP, the optimal policy $\pi^*$ can be derived from its associated Q-function, $Q_\{\pi^*\}$, and in particular the optimal action(s) $\mu(s_t)$ can be selected maximizing the optimal Q-function over the action space,
+$$\mu(s_t) = \max_\{a_t \in \mathcal A\} Q_\{\pi^*\}(s_t, a_t).$$
+Interestingly, the Q\^*-function satisfies a recursive relationship (*Bellman equation*) based on a very natural intuition
+[^1]:
+
+> [\...] If the optimal value $Q^*(s_\{t+1\}, a_\{t+1\})$ of the [state] $s_\{t+1\}$ was known for all possible actions $a_\{t+1\}$, then the optimal strategy is to select the action $a_\{t+1\}$ maximizing the expected value of $r_t + \gamma Q^*(s_\{t+1\}, a_\{t+1\})$
+> $$Q^*(s_t, a_t) = \mathbb E_\{s_\{t+1\} \sim \mathbb P(\bullet \vert s_t, a_t)\} \left[ r_t + \gamma \max_\{a_\{t+1\} \in \mathcal A\} Q^*(s_\{t+1\}, a_\{t+1\}) \big\vert s_t, a_t \right]$$
+
+In turn, the optimal Q-function is guaranteed to be self-consistent by definition.
+*Value-iteration* methods exploit this relationship (and/or its state-value counterpart, $V^*(s_t)$ ) by iteratively updating an initial estimate of Q\^*, $Q_k$ using the Bellman equation as update rule (*Q-learning*):
+$$Q_\{i+1\}(s_t, a_t) \leftarrow \mathbb E_\{s_\{t+1\} \sim \mathbb P(\bullet \vert s_t, a_t)\} \left[ r_t + \gamma \max_\{a_\{t+1\} \in \mathcal A\} Q_i (s_\{t+1\}, a_\{t+1\}) \big\vert s_t, a_t \right], \quad i=0,1,2,\dots,K$$
+Then, one can derive the (ideally, near-optimal) policy by explicitly maximizing over the action space the final (ideally, near-optimal) estimate $Q_K \approx Q^*$ at each timestep.
+In fact, under certain assumptions on the MDP considered, $Q_K \to Q^* \, \text\{as \} K \to \infty$.
+
+Effective in its early applications to small-scale discrete problems and theoretically sound, vanilla Q-learning proved difficult to scale to large $\mathcal\{S\} \times \mathcal\{A\}$ problems, in which storing $Q : \mathcal\{S\} \times \mathcal\{A\} \mapsto \mathbb R$ alone might prove prohibitive.
+Also, vanilla Q-learning is not directly usable for *continuous*, unstructured state-action space MDPs, such as those considered in robotics.
+In their seminal work on *Deep Q-Learning* (DQN), @mnihPlayingAtariDeep2013 propose learning Q-values using deep convolutional neural networks, thereby accommodating large and even unstructured *state* spaces.
+DQN parametrizes the Q-function using a neural network with parameters $\theta$, updating the parameters by sequentially minimizing the expected squared temporal-difference error (TD-error, $\delta_i$):
+
+$$\begin\{aligned\}
+ \mathcal L(\theta_i) &= \mathbb E_\{(s_t, a_t) \sim \chi(\bullet)\}
+ \big[
+ (\underbrace\{y_i - Q_\{\theta_i\}(s_t, a_t)\}_\{\delta_i\})^2
+ \big], \\
+ y_i &= \mathbb E_\{s_\{t+1\} \sim \mathbb P(\bullet \vert s_t, a_t)\} \big[ r_t + \gamma \max_\{a_\{t+1\} \in \mathcal A\} Q_\{\theta_\{i-1\}\} (s_\{t+1\}, a_\{t+1\}) \big],
+\end\{aligned\}$$
+
+Where $\chi$ represents a behavior distribution over state-action pairs.
+Crucially, $\chi$ can in principle be different from the policy being followed, effectively allowing the reuse of prior data stored in a *replay buffer* in the form of $(s_t, a_t, r_t, s_\{t+1\})$ transitions, used to form the TD-target $y_i$, the TD-error $\delta_i$, and the loss function via Monte-Carlo (MC) estimates.
+
+While effective in handling large, unstructured state spaces for discrete action-space problems, DQN's application to continuous control problems proved challenging.
+Indeed, in the case of high-capacity function approximators such as neural networks, solving $\max_\{a_t \in \mathcal A\} Q_\theta(s_t, a_t)$ at each timestep is simply unfeasible due to (1) the continuous nature of the action space ($\mathcal\{A\} \subset \mathbb R^n$ for some $n$) and (2) the impossibility of finding a cheap (ideally, closed-form) solution to the maximization of $Q_\theta$.
+ @silverDeterministicPolicyGradient2014 tackle this fundamental challenge by using a *deterministic* function of the state $ s_t$ as policy, $\mu_\phi(s_t) = a_t$, parametrized by $\phi$. Thus, policies can be iteratively refined updating $\phi$ along the direction:
+$$\begin\{equation\}
+
+ d_\phi = \mathbb E_\{s_t \sim \mathbb P (\bullet)\} \left[ \nabla_\phi Q(s_t, a_t)\vert_\{a_t = \mu_\phi(s_t)\} \right] = \mathbb E_\{s_t \sim \mathbb P(\bullet)\} \left[ \nabla_\{a_t\} Q(s_t, a_t) \vert_\{a_t = \mu_\phi(s_t)\} \cdot \nabla_\phi \mu(s_t) \right]
+\end\{equation\}$$
+Provably, this update direction is the *deterministic policy gradient* (DPG) of the policy $\mu_\phi$ [@silverDeterministicPolicyGradient2014], so that updates $\phi_\{k+1\} \leftarrow \phi_k + \alpha d_\phi$ are guaranteed to increase the (deterministic) cumulative discounted reward, $J(\mu_\phi)$.
+ @lillicrapContinuousControlDeep2019 extended DPG to the case of (1) high-dimensional unstructured observations and (2) continuous action spaces, introducing Deep Deterministic Policy Gradient (DDPG), an important algorithm in RL and its applications to robotics.
+DDPG adopts a modified TD-target compared to the one used in DQN, by maintaining a policy network used to select actions, yielding
+$$\begin{equation}
+y_i = \mathbb E_{s_{t+1} \sim \mathbb P(\bullet \vert s_t, a_t)} \big[ r_t + \gamma Q_{\theta_{i-1}} (s_{t+1}, \mu_\phi(s_{t+1})) \big] .
+\end{equation}$$
+Similarly to DQN, DDPG also employs the same replay buffer mechanism, to reuse past transitions over training for increased sample efficiency and estimate the loss function via MC-estimates.
+
+Soft Actor-Critic (SAC) [@haarnojaSoftActorCriticOffPolicy2018] is a derivation of DDPG in the max-entropy (MaxEnt) RL framework, in which RL agents are tasked with maximizing the expected return while also acting as randomly as possible.
+MaxEnt RL [@haarnojaReinforcementLearningDeep2017] has proven particularly robust thanks to the development of diverse behaviors, incentivized by its entropy-regularization formulation.
+In that, MaxEnt revisits the RL objective $J(\pi)$ to specifically account for the policy entropy,
+
+$$J(\pi) = \sum_{t=0}^T \mathbb{E}_{(s_t, a_t) \sim \chi} \left[ r_t + \alpha \mathcal H(\pi (\bullet \vert s_t)) \right]$$
+
+This modified objective results in the *soft* TD-target:
+$$\begin\{equation\}
+ y_i = \mathbb E_\{s_\{t+1\} \sim \mathbb P( \bullet \vert s_t, a_t)\} \left[ r_t + \gamma \left( Q_\{\theta_\{i-1\}\} (s_\{t+1\}, a_\{t+1\}) - \alpha \log \pi_\phi(a_\{t+1\} \vert s_\{t+1\}) \right) \right], \quad a_\{t+1\} \sim \pi_\phi(\bullet \vert s_t)
+\end\{equation\}$$
+Similarly to DDPG, SAC also maintains an explicit policy, trained under the same MaxEnt framework for the maximization of this entropy-regularized objective, and updated using:
+$$\begin\{equation\}
+
+ \pi_\{k+1\} \leftarrow \arg\min_\{\pi^\prime \in \Pi\} \text\{D\}_\{\text\{KL\}\} \left(\pi^\prime (\bullet \vert s_t) \bigg\Vert \frac\{\exp(Q_\{\pi_k\}(s_t, \bullet))\}\{Z_\{\pi_k\}(s_t)\} \right)
+\end\{equation\}$$
+This update rule optimizes the policy while projecting it on a set $\Pi$ of tractable distributions (e.g., Gaussians, @haarnojaReinforcementLearningDeep2017).
+
+#### Sample-efficient, data-driven RL
+
+Importantly, sampling $(s_t, a_t, r_t, s_\{t+1\})$ from the replay buffer $D $ conveniently allows to approximate the previously introduced expectations for TD-target and TD-error through Monte-Carlo (MC) estimates.
+The replay buffer $ D$ also proves extremely useful in maintaining a history of previous transitions and using it for training, improving on sample efficiency.
+Furthermore, it also naturally provides an entry point to inject offline trajectories recorded, for instance, by a human demonstrator, into the training process.
+
+Reinforcement Learning with Prior Data (RLPD) [@ballEfficientOnlineReinforcement2023] is an Offline-to-Online RL algorithm leveraging prior data to effectively accelerate the training of a SAC agent.
+Unlike previous works on Offline-to-Online RL, RLPD avoids any pre-training and instead uses the available offline data $D_\text\{offline\}$ to improve online-learning from scratch.
+During each training step, transitions from both the offline and online replay buffers are sampled in equal proportion, and used in the underlying SAC routine.
+
+#### Sample-efficient, data-driven, real-world RL
+
+Despite the possibility to leverage offline data for learning, the effectiveness of real-world RL training is still limited by the need to define a task-specific, hard-to-define reward function.
+Further, even assuming access to a well-defined reward function, typical robotics pipelines rely mostly on proprioceptive inputs augmented by camera streams of the environment.
+As such, even well-defined rewards would need to be derived from processed representations of unstructured observations, introducing brittleness.
+In their technical report, @luoSERLSoftwareSuite2025 empirically address the needs (1) to define a reward function and (2) to use it on image observations, by introducing a series of tools to allow for streamlined training of *reward classifiers* $c $, as well as jointly learn forward-backward controllers to speed up real-world RL.
+Reward classifiers are particularly useful in treating complex tasks---e.g., folding a t-shirt---for which a precise reward formulation is arbitrarily complex to obtain, or that do require significant shaping and are more easily learned directly from demonstrations of success ($e^+$) or failure ($e^-$) states, $s \in \mathcal\{S\}$, with a natural choice for the state-conditioned reward function $r: \mathcal S \mapsto \mathbb R$ being $r(s) = \log c(e^+ \vert s)$.
+Further, @luoSERLSoftwareSuite2025 demonstrate the benefits of learning *forward* (executing the task from initial state to completion) and *backward* (resetting the environment to the initial state from completion) controllers, parametrized by separate policies.
+
+Lastly, in order to improve on the robustness of their approach to different goals while maintaining practical scalability, @luoSERLSoftwareSuite2025 introduced a modified state and action space, expressing proprioceptive configurations $q$ and actions $\dot q$ in the frame of the end-effector pose at $t=0$.
+Randomizing the initial pose of the end-effector ($s_0$), @luoSERLSoftwareSuite2025 achieved a similar result to that of having to manually randomize the environment at every timestep, but with the benefit of maintaining the environment in the same condition across multiple training episodes, achieving higher scalability thanks to the increased practicality of their approach.
+
+
+
+*(A) HIL-SERL allows for real-world training of high performance RL agents by building on top of the advancements presented by SAC, RLPD and SERL. (B) Example of human intervention during a HIL-SERL training process on a SO-100.*
+
+Building on off-policy deep Q-learning with replay buffers, entropy regularization for better exploration and performance, expert demonstrations to guide learning, and a series of tools and recommendations for real-world training using reward classifiers (Figure Section fig:hil-serl-blocks), @luoPreciseDexterousRobotic2024 introduce human interactions during training, learning near-optimal policies in challenging real-world manipulation tasks in 1-2 hours.
+
+Human in the Loop Sample Efficient Robot reinforcement Learning (HIL-SERL) [@luoPreciseDexterousRobotic2024] augments offline-to-online RL with targeted human corrections during training, and employs prior data to (1) train a reward classifier and (2) bootstrap RL training on expert trajectories.
+While demonstrations provide the initial dataset seeding learning and constraining early exploration, interactive corrections allow a human supervisor to intervene on failure modes and supply targeted interventions to aid the learning process.
+Crucially, human interventions are stored in both the offline and online replay buffers, differently from the autonomous transitions generated at training time and stored in the online buffer only.
+Consequently, given an intervention timestep $k \in (0, T)$, length-$K$ human intervention data $\{ s^\{\text\{human\}\}_k, a^\{\text\{human\}\}_k, r^\{\text\{human\}\}_k, s^\{\text\{human\}\}_\{k+1\},\}_\{k=1\}^K$ is more likely to be sampled for off-policy learning than the data generated online during training, providing stronger supervision to the agent while still allowing for autonomous learning.
+Empirically, HIL-SERL attains near-perfect success rates on diverse manipulation tasks within 1-2 hours of training [@luoPreciseDexterousRobotic2024], underscoring how offline datasets with online RL can markedly improve stability and data efficiency, and ultimately even allow real-world RL-training.
+
+### Code Example: Real-world RL
+
+**TODO(fracapuano): work out rl training example**
+
+### Limitations of RL in Real-World Robotics: Simulators and Reward Design
+
+Despite the advancements in real-world RL training, training RL agents in the real world still suffers from the following limitations:
+
+- In those instances where real-world training experience is prohibitively expensive to gather [@degraveMagneticControlTokamak2022, bellemareAutonomousNavigationStratospheric2020], in-simulation training is often the only option. However, high-fidelity simulators for real-world problems can be difficult to build and maintain, especially for contact-rich manipulation and tasks involving deformable or soft materials.
+
+- Reward design poses an additional source of brittleness. Dense shaping terms are often required to guide exploration in long-horizon problems, but poorly tuned terms can lead to specification gaming or local optima. Sparse rewards avoid shaping but exacerbate credit assignment and slow down learning. In practice, complex behaviors require significant reward-shaping effort: a brittle and error-prone process.
+
+Advances in Behavioral Cloning (BC) from corpora of human demonstrations address both of these concerns.
+By learning in a supervised fashion to reproduce expert demonstrations, BC methods prove competitive while bypassing the need for simulated environments and hard-to-define reward functions.
+
+[^1]: Quote from @mnihPlayingAtariDeep2013. The notation used has slightly been adapted for consistency with the rest of this tutorial.
\ No newline at end of file
diff --git a/app/src/content/chapters/04_imitation_learning.mdx b/app/src/content/chapters/04_imitation_learning.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..6c518b641eebf0f5232a25359a88e18125b411ef
--- /dev/null
+++ b/app/src/content/chapters/04_imitation_learning.mdx
@@ -0,0 +1,468 @@
+import ResponsiveImage from '../../components/ResponsiveImage.astro'
+import Ch4BcTrajectories from '../assets/image/ch4/ch4-bc-trajectories.png'
+import Ch4ObservationActionMapping from '../assets/image/ch4/ch4-observation-action-mapping.png'
+import Ch4IssuesWithBc from '../assets/image/ch4/ch4-issues-with-bc.png'
+import Ch4TaskEffectOnPairs from '../assets/image/ch4/ch4-task-effect-on-pairs.png'
+import Ch4LatentVariableModel from '../assets/image/ch4/ch4-latent-variable-model.png'
+import Ch4ManyLatents from '../assets/image/ch4/ch4-many-latents.png'
+import Ch4DiffusionRobotActions from '../assets/image/ch4/ch4-diffusion-robot-actions.png'
+import Ch4ActionVsObservationDistribution from '../assets/image/ch4/ch4-action-vs-observation-distribution.png'
+import Ch4NormalizingFlows from '../assets/image/ch4/ch4-normalizing-flows.png'
+import Ch4DiffusionVsFlowmatching from '../assets/image/ch4/ch4-diffusion-vs-flowmatching.png'
+import Ch4Act from '../assets/image/ch4/ch4-act.png'
+import Ch4ActEncoder from '../assets/image/ch4/ch4-act-encoder.png'
+import Ch4ActDecoder from '../assets/image/ch4/ch4-act-decoder.png'
+import Ch4DiffusionPolicy from '../assets/image/ch4/ch4-diffusion-policy.png'
+import Ch4AsyncInference from '../assets/image/ch4/ch4-async-inference.png'
+import Ch4Queues from '../assets/image/ch4/ch4-queues.png'
+
+
+# Robot (Imitation) Learning
+
+::: epigraph
+*The best material model for a cat is another, or preferably the same cat*
+
+Norbert Wiener
+:::
+
+> **TL;DR**
+> Behavioral Cloning provides a natural platform to learn from real-world interactions without the need to design any reward function, and generative models prove more effective than point-wise policies at dealing with multimodal demonstration datasets.
+
+
+
+*(A) Average (with standard deviation) evolution of the actuation levels over the first 5 recorded episodes in /svla_so101_pickplace. Proprioceptive states provide invaluable information to determine the robot's state during an episode. (B) Camera frames are also recorded alongside measurements on the robot's state, capturing information about the robot's interaction with its environment.*
+
+Learning from human demonstrations provides a pragmatic alternative to the reinforcement-learning pipeline discussed in the following section.
+Indeed, in real-world robotics online exploration is typically unsafe and costly, and designing (dense) reward signals is a brittle, error-prone process.
+In general, success detection itself may often require bespoke instrumentation, while episodic training demands reliable resets---all factors complicating training RL algorithms on hardware at scale.
+Behavioral Cloning (BC) sidesteps these constraints by casting control as an imitation learning problem, leveraging previously collected expert demonstrations.
+Most notably, by learning to imitate, autonomous systems naturally adhere to the objectives, preferences, and success criteria implicitly encoded in the data, which reduces early-stage exploratory failures and obviates hand-crafted reward shaping altogether.
+
+Formally, let $\mathcal D = \{ \tau^\{(i)\} \}_\{i=1\}^N$ be a set of expert trajectories, with $\tau^\{(i)\} = \{(o_t^\{(i)\}, a_t^\{(i)\})\}_\{t=0\}^\{T_i\}$ representing the $i$-th trajectory in $\mathcal D $, $ o_t \in \mathcal\{O\}$ denoting observations (e.g., images and proprioception altogether), and $a_t \in \mathcal\{A\}$ the expert actions.
+Typically, observations $o \in \mathcal\{O\}$ consist of both image and proprioceptive information, while actions $a \in \mathcal\{A\}$ represent control specifications for the robot to execute, e.g. a joint configuration.
+Note that differently from the following section, in the imitation learning context $\mathcal D $ denotes an offline dataset collecting $ N$ length-$T_i$ reward-free (expert) human trajectories $\tau^\{(i)\}$, and *not* the environment dynamics.
+Similarly, in this section $\tau^\{(i)\}$ represents a length-$T_i$ trajectory of observation-action pairs, which crucially *omits any reward information* entirely.
+The figure above graphically shows trajectories in terms of the average evolution of the actuation on the 6 joints over a group of teleoperated episodes for the SO-100 manipulator.
+Notice how proprioceptive states are captured jointly with camera frames over the course of the recorded episodes, providing a unified high-frame rate collection of teleoperation data.
+Figure the referenced figure shows $(o_t, a_t)$-pairs for the same dataset, with the actions performed by the human expert illustrated just alongside the corresponding observation.
+In principle, (expert) trajectories $\tau^\{(i)\}$ can have different lengths since demonstrations might exhibit multi-modal strategies to attain the same goal, resulting in possibly multiple, different behaviors.
+
+
+
+*Sample observations and action pairs over the course of a given trajectory recorded in /svla_so101_pickplace. Observations, comprising both proprioceptive and visual information, are recorded alongside the configuration of a second, leader robot controlled by a human expert, providing complete information for regressing actions given observations.*
+
+Behavioral Cloning (BC) [@pomerleauALVINNAutonomousLand1988a] aims at synthesizing behaviors by learning the mapping from observations to actions, and in its most natural formulation can be effectively tackled as a *supervised* learning problem, consisting of learning the (deterministic) mapping $f: \mathcal\{O\} \mapsto \mathcal\{A\}, \ a_t = f(o_t)$ by solving
+$$\begin\{equation\}
+
+ \min_\{f\} \mathbb\{E\}_\{(o_t, a_t) \sim p(\bullet)\} \mathcal L(a_t, f(o_t)),
+\end\{equation\}$$
+for a given risk function $\mathcal L: \mathcal A \times \mathcal A \mapsto \mathbb\{R\}, \ \mathcal L (a, a^\prime)$.
+
+Typically, the expert's joint observation-action distribution $p: \mathcal\{O\} \times \mathcal\{A\} \mapsto [0,1]$ such that $(o,a) \sim p(\bullet)$ is assumed to be unknown, in keeping with a classic Supervised Learning (SL) framework[^1].
+However, differently from standard SL's assumptions, the samples collected in $\mathcal D$, corresponding to observations of the underlying $p$, are *not* i.i.d., as expert demonstrations are collected *sequentially* in trajectories.
+In practice, this aspect can be partially mitigated by considering pairs in a non-sequential order---*shuffling* the samples in $\mathcal D $---so that the expected risk under $ p$ can be approximated using MC estimates, although estimates may in general be less accurate.
+Another strategy to mitigate the impact of regressing over non-i.i.d. samples relies on the possibility of interleaving BC and data collection [@rossReductionImitationLearning2011], aggregating multiple datasets iteratively.
+However, because we only consider the case where a single offline dataset $\mathcal D $ of (expert) trajectories is already available, dataset aggregation falls out of scope.
+
+Despite the inherent challenges of learning on non-i.i.d. data, the BC formulation affords several operational advantages in robotics.
+First, training happens offline and typically uses expert human demonstration data, thereby severely limiting exploration risks by preventing the robot from performing dangerous actions altogether.
+Second, reward design is entirely unnecessary in BC, as demonstrations already reflect human intent and task completion.
+This also mitigates the risk of misalignment and specification gaming (*reward hacking*), otherwise inherent in purely reward-based RL [@heessEmergenceLocomotionBehaviours2017].
+Third, because expert trajectories encode terminal conditions, success detection and resets are implicit in the dataset.
+Finally, BC scales naturally with growing corpora of demonstrations collected across tasks, embodiments, and environments.
+However, BC can in principle only learn behaviors that are, at most, as good as the one exhibited by the demonstrator, and thus critically provides no mitigation for the suboptimal decision making that might be enacted by humans.
+Still, while problematic in sequential-decision making problems for which expert demonstrations are not generally available---data might be expensive to collect, or human performance may be inherently suboptimal---many robotics applications benefit from relatively cheap pipelines to acquire high-quality trajectories generated by humans, thus justifying BC approaches.
+
+
+
+*Point-wise policies suffer from limitations due to (A) covariate shifts and poor approximation of (B) multimodal demonstrations. (A) Initially small errors may drive the policy out of distribution, incuring in a vicious circle ultimately resulting in failure. (B) Both modes of reaching for a target object in a scene, either left or right-first, are equally as good and thus equally as likely to be present in a dataset of human demonstrations, ultimately resulting in multimodal demonstrations.*
+
+While conceptually elegant, point-estimate policies $f : \mathcal\{O\} \mapsto \mathcal\{A\}$ learned by solving the supervised objective above have been observed to suffer from (1) compounding errors [@rossReductionImitationLearning2011] and (2) poor fit to multimodal distributions [@florenceImplicitBehavioralCloning2022, keGraspingChopsticksCombating2020].
+Figure the referenced figure illustrates these two key issues related to learning *explicit policies* [@florenceImplicitBehavioralCloning2022].
+Besides sequentiality in $\mathcal D$, compounding errors due to *covariate shift* may also prove catastrophic, as even small $\epsilon$-prediction errors $0 < \Vert \mu(o_t) - a_t \Vert \leq \epsilon$ can quickly drive the policy into out-of-distribution states, incurring less confident predictions and thus compounding errors (Figure the referenced figure, left). Moreover, point-estimate policies typically fail to learn *multimodal* targets, which are very common in human demonstrations solving robotics problems, since multiple trajectories can be equally as good towards the accomplishment of a goal (e.g., symmetric grasps, Figure the referenced figure, right).
+In particular, unimodal regressors tend to average across modes, yielding indecisive or even unsafe commands [@florenceImplicitBehavioralCloning2022].
+To address poor multimodal fitting, @florenceImplicitBehavioralCloning2022 propose learning the generative model $p(o, a)$ underlying the samples in $\mathcal D$, rather than explicitly learning a prediction function $f(o) = a$.
+
+## A (Concise) Introduction to Generative Models
+
+Generative Models (GMs) aim to learn the stochastic process underlying the very generation of the data collected, and typically do so by fitting a probability distribution that approximates the unknown *data distribution*, $ p$.
+In the case of BC, this unknown data distribution $p$ represents the expert's joint distribution over $(o, a)$-pairs.
+Thus, given a finite set of $N$ pairs $\mathcal D = \{ (o,a)_i \}_\{i=0\}^N $ used as an imitation learning target (and thus assumed to be i.i.d.), GM seeks to learn a *parametric* distribution $ p_\theta(o,a)$ such that (1) new samples $(o,a) \sim p_\theta(\bullet)$ resemble those stored in $\mathcal D $, and (2) high likelihood is assigned to the observed regions of the unobservable $ p$.
+Likelihood-based learning provides a principled training objective to achieve both objectives, and it is thus extensively used in GM [@prince2023understanding].
+
+### Variational Auto-Encoders
+
+
+
+*Intuitively, latent variable in a single latent model may contain information regarding the task being performed, which directly results in the likelihood of the same observation-action pair being different for two different tasks. When (A) picking a block the likelihood of a wide gripper's opening should be higher than narrower one, while it should be the opposite when (B) pushing the block.*
+
+A common inductive bias used in GM posits samples $(o,a)$ are influenced by an unobservable latent variable $z \in Z$, resulting in
+$$\begin\{equation\}
+
+ p (o,a) = \int_\{\text\{supp\}\{Z\}\} p(o,a \vert z) p(z)
+\end\{equation\}$$
+Intuitively, in the case of observation-action pairs $(o, a)$ for a robotics application, $z $ could be some high level representation of the underlying task being performed by the human demonstrator.
+In such case, treating $ p(o,a)$ as a marginalization over $\text\{supp\}\{Z\}$ of the complete joint distribution $p(o,a,z)$ natively captures the effect different tasks have on the likelihood of observation-action pairs.
+Figure the referenced figure graphically illustrates this concept in the case of a (A) picking and (B) pushing task, for which, nearing the target object, the likelihood of actions resulting in opening the gripper---the higher $q_6$, the wider the gripper's opening---should intuitively be (A) high or (B) low, depending on the task performed.
+While the latent space $Z $ typically has a much richer structure than the set of all actual tasks performed, the referenced figure still provides a solid framework to learn joint distributions conditioned on unobservable yet relevant factors.
+Figure the referenced figure represents this latent-variable framework for a robotics application: the true, $ z$-conditioned generative process assigns *likelihood* $p((o,a) \vert z)$ to the single $(o,a)$-pair.
+Using Bayes' theorem, one can reconstruct the *posterior* distribution on $\text\{supp\}\{Z\}$, $q_\theta(z \vert o,a)$ from the likelihood $p_\theta(o,a \vert z)$, *prior* $p_\theta(z)$ and *evidence* $p_\theta(o,a)$.
+VAEs approximate the latent variable model presented in Section eq:BC-latent-variable using an *approximate posterior* $q_\phi(z \vert o,a)$ while regressing parameters for a parametric likelihood, $p_\theta(o,a \vert z)$ (Figure Section fig:ch4-latent-variable-model).
+
+
+
+*(A) The latent variable model in a robotics application regulates influence between observed ($o,a$) variables and an unobservable latent variable. (B) VAEs approximate exact latent variable models by means of variational inference.*
+
+Given a dataset $\mathcal D $ consisting of $ N$ i.i.d. observation-action pairs, the log-likelihood of all datapoints under $\theta $ (in Bayesian terms, the *evidence* $ p_\theta(\mathcal D)$) can thus be written as:
+
+$$\log p_\theta(\mathcal D) &= \sum_\{i=0\}^N \log p_\theta ((o,a)_i)
+ &= \sum_\{i=0\}^N \log \int_\{\text\{supp\}\{Z\}\} p_\theta((o,a)_i \vert z) p(z)
+ &= \sum_\{i=0\}^N \log \int_\{\text\{supp\}\{Z\}\} \frac\{q_\theta(z \vert (o,a)_i)\}\{q_\theta(z \vert (o,a)_i)\} \cdot p_\theta((o,a)_i \vert z) p(z)
+ &= \sum_\{i=0\}^N \log \mathbb E_\{z \sim q_\theta(\bullet \vert (o,a)_i)\} \left[ \frac\{p(z)\}\{q_\theta(z \vert (o,a)_i)\} \cdot p_\theta((o,a)_i \vert z) \right], $$
+
+where we used the i.i.d. factorization of the evidence in the first step, the latent-variable marginalization in the second, multiplied by $1 = \frac\{q_\theta(z \vert (o,a)_i)\}\{q_\theta(z \vert (o,a)_i)\}$ in the third, and used the definition of expected value in the last.
+
+In the special case where one assumes distributions to be tractable, $p_\theta (\mathcal D)$ is typically tractable too, and $\max_\theta \log p_\theta(\mathcal D)$ provides a natural target for (point-wise) inferring the unknown parameters $\theta $ of the generative model.
+Unfortunately, the referenced figure is rarely tractable when the distribution $ p$ is modeled with approximators such as neural networks, especially for high-dimensional, unstructured data.
+
+In their seminal work on Variational Auto-Encoders (VAEs), @kingmaAutoEncodingVariationalBayes2022 present two major contributions to learn complex latent-variable GMs on unstructured data, proposing (1) a tractable, variational lower-bound to the referenced figure as an optimization target to jointly learn likelihood and posterior and (2) high-capacity function approximators to model the likelihood $p_\theta(o,a\vert z)$ and (approximate) posterior distribution $q_\phi(z \vert o,a) \approx q_\theta(z \vert o,a)$.
+
+In particular, the lower bound on the referenced figure (Evidence LOwer Bound, *ELBO*) can be derived from the referenced figure applying Jensen's inequality---$\log \mathbb\{E\}[\bullet] \geq \mathbb\{E\} [\log (\bullet)]$---yielding:
+
+$$\log p_\theta(\mathcal D) &\geq \sum_\{i=0\}^\{N\} \left(
+ \mathbb\{E\}_\{z \sim q_\theta(\cdot \vert (o,a)_i)\} \big[ \log p_\theta((o,a)_i \vert z) \big]
+ + \mathbb\{E\}_\{z \sim q_\theta(\cdot \vert (o,a)_i)\} \left[ \log \left( \frac\{p(z)\}\{q_\theta(z \vert (o,a)_i)\} \right) \right]
+ \right)
+ &= \sum_\{i=0\}^\{N\} \left(
+ \mathbb\{E\}_\{z \sim q_\theta(\cdot \vert (o,a)_i)\} \big[ \log p_\theta((o,a)_i \vert z) \big]
+ - \text\{D\}_\{\text\{KL\}\} \big[ q_\theta(z \vert (o,a)_i) \Vert p(z) \big]
+ \right) $$
+
+The true, generally intractable posterior $q_\theta (z \vert o,a)$ prevents computing both the expectation and KL divergence terms in the referenced figure, and therefore @kingmaAutoEncodingVariationalBayes2022 propose deriving the ELBO using an *approximate* posterior $q_\phi(z \vert o,a)$, resulting in the final, tractable ELBO objective,
+
+$$\text\{ELBO\}_\{\mathcal D\}(\theta, \phi) = \sum_\{i=0\}^\{N\} \left(
+ \mathbb\{E\}_\{z \sim q_\phi(\cdot \vert (o,a)_i)\} \big[ \log p_\theta((o,a)_i \vert z) \big]
+ - \text\{D\}_\{\text\{KL\}\} \big[ q_\phi(z \vert (o,a)_i) \Vert p(z) \big]
+ \right)
+ $$
+
+From Jensen's inequality, maximizing ELBO results in maximizing the log-likelihood of the data too, thus providing a natural, tractable optimization target.
+Indeed, expectations can be estimated using MC estimates from the learned distributions in the referenced figure, while the KL-divergence term can typically be computed in closed-form (1) modeling $q_\phi $ as a Gaussian $ q_\phi(z \vert o,a) = \mathcal N\big(\mu_\phi(o,a), \Sigma_\phi(o,a) \big)$ and (2) imposing a standard Gaussian prior on the latent space, $p(z) = \mathcal N(\mathbf\{0\}, \mathbf\{I\})$.
+
+An intuitive explanation of the learning dynamics of VAEs can be given considering the equivalent case of *minimizing the negative ELBO*, which admits a particularly interpretable factorization
+
+$$\min_{\theta, \phi} - \text{ELBO}_{(o,a) \sim \mathcal D}(\theta, \phi) &= \min_{\theta, \phi}\mathbf{L^{\text{rec}}}(\theta) + \mathbf{L^{\text{reg}}}(\phi)
+
+\mathbf{L^{\text{rec}}}(\theta) &= - \mathbb{E}_{z \sim q_\phi(\cdot \vert o,a)} \big[ \log p_\theta(o,a \vert z) \big]
+
+\mathbf{L^{\text{reg}}}(\phi) &= \text{D}_{\text{KL}} \big[ q_\phi(z \vert o,a) \Vert p(z) \big]$$
+
+For any given $(o,a)$ pair, the expected value term of the referenced figure is typically computed via MC estimates, resulting in
+$$-\mathbb{E}_{z \sim q_\phi(\bullet \vert o,a)} \big[ \log p_\theta(o,a \vert z) \big] = \mathbf{L^{\text{rec}}} \approx - \frac{1}{n} \sum_{i=0}^n \log p_\theta(o,a \vert z_i).$$
+Assuming $p_\theta(o,a \vert z)$ is parametrized as an isotropic Gaussian distribution with mean $\mu_\theta (z) \in \mathbb R^d$ and variance $\sigma^2$, the log-likelihood thus simplifies to:
+$$\log p(o,a \vert z_i) = -\frac\{1\}\{2\sigma^\{2\}\} \big \Vert (o,a)-\mu_\theta(z_i) \big\Vert_2^2 -\frac\{d\}\{2\}\log(2\pi \sigma^\{2\}) \implies \mathbf\{L^\text\{rec\}\} \approx \frac \{1\}\{n\} \sum_\{i=0\}^n \big\Vert (o,a) - \mu_\theta(z_i) \big \Vert^2_2$$
+Indeed, it is very common in practice to model the learned likelihood $p_\theta(o,a \vert z)$ as a parametric distribution (e.g., a Gaussian) parametrized by some learned vector of coefficients derived from $\mu_\theta (z), \ z \sim p (\bullet)$.
+In all such cases, learning a VAE corresponds to optimally *reconstructing* the examples in $\mathcal D $ by minimizing the L2-error---a very common *supervised learning* objective for regression targets---while regularizing the information compression into the latent, as under the common modeling choice $ p(z) = \mathcal N (\mathbf\{0\}, \mathbf\{I\})$ the referenced figure regularizes the posterior limiting the expressivity of $q_\phi(z\vert o,a)$.
+
+### Diffusion Models
+
+VAEs approximate probability distributions via a *single* latent variable model, assuming the underlying unknown distribution can be factored according to the referenced figure, and solve the variational inference problem of jointly learning the likelihood $p_\theta $ and (approximate) posterior $ q_\phi $ for such model.
+In that, the unknown data distribution $ p(o,a)$ is effectively approximated via $\int_Z p(z) p_\theta(o,a \vert z)$, and the underlying generative process reproduced by (1) sampling a latent variable and (2) learning to decode it into a (ideally) high-likelihood sample under the (unknown) $p(o,a)$.
+Diffusion Models (DMs) [@hoDenoisingDiffusionProbabilistic2020] are another class of GMs which treat the similar problem of approximating an underlying unknown data distribution---*variational inference*---by *partially* extending VAEs to the case where *multiple* latent variables influence each other and the generative process underlying $o,a$ itself.
+In particular, DMs posit the generative process can be decomposed to a series of piece-wise (Markovian) interactions between (latent) variables (Figure Section fig:ch4-many-latents), resulting in
+
+$$p(\underbrace\{o,a\}_\{= z_0\}) &= \int_\{\text\{supp\}\{Z_0\}\} \int_\{\text\{supp\}\{Z_1\}\} \dots \int_\{\text\{supp\}\{Z_T\}\} p(z_0, z_1, \dots z_T)
+
+ p(z_0, z_1, \dots z_T) &= p(z_T) \prod_\{t=1\}^\{T\} p(z_\{t-1\} \vert z_t), $$
+
+where we explicitly showed the marginalization over the multiple latents in the referenced figure, and used the law of conditional probability and Markov property in the referenced figure.
+
+
+
+*HMLV models posit the data generation process is influenced by a stack of Markov-dependent latent variables, with samples from the posterior distribution being progressively higher up in the hierarchy.*
+
+Similarly to VAEs, providing an exact interpretation for the latent variables is typically not possible.
+Still, one fairly reasonable application-driven intuition is that, by providing a model of the hierarchical, decoupled interaction of latent variables, Hierarchical Markov Latent Variable (HMLV) models attempt to capture the different resolutions at which different conditioning factors intervene, so that in a robotics application for instance, one could naturally distinguish between early-stage trajectory planning ($t \to T $) and fine-grained adjustments ($ t \to 0$).
+In that, HMLV models thus provide a framework to perform variational inference via multiple, sequential sampling steps from different higher level distributions instead of approximating the generative process with a single-latent variable model.
+DMs are a particular instantiation of HMLV models for which the posterior $q( z_t \vert z_\{t-1\}) = \mathcal N(\sqrt\{1-\beta_t\} z_\{t-1\}, \beta_t \mathbf\{I\})$ for a given $\beta_t \in \mathbb R^+$, thereby iteratively reducing the signal-to-noise ratio as $\beta_t $ increases along the latents hierarchy.
+
+Just like VAEs, DMs attempt to learn to reproduce an underlying data distribution $ p (o,a)$ given a collection of i.i.d. samples approximating the model posited to have generated the data in the first place (Section eq:BC-multi-latent-model-1).
+Similarly to VAEs, DMs approximate the process of sampling from the unknown $p(o,a)$ by (1) sampling from an easy-to-sample distribution (e.g., Gaussian) and (2) learning to reconstruct high-likelihood samples under the unknown distribution.
+However, in stark contrast with VAEs, the easy-to-sample distribution contains *no mutual information* regarding the data distribution $p(o,a)$.
+Crucially, as no information from the sample $(o,a)$ (denoted as $z_0 \equiv (o,a)$ for the sake of notation) is assumed to be propagated throughout the chain of latents, the posterior $q(z_t \vert z_\{t-1\})$ assumes a relatively amenable structure in DMs, reducing complexity.
+The *true* likelihood $p(z_\{t-1\} \vert z_t)$ is instead typically approximated using the parametrization $p_\theta (z_\{t-1\} \vert z_t)$.
+In that, the information contained in the unknown data distribution is *reconstructed* via a process in which samples from a fixed distribution are turned into (ideally) high-likelihood samples under $p(o,a)$---a process referred to as *denoising*.
+
+Under such model, we can lower-bound the log-likelihood of an arbitrary sample as[^2]
+
+$$\log p_\theta (\underbrace\{o,a\}_\{= z_0\}) \geq
+ &\mathbb\{E\}_\{z_1 \sim q(\bullet \vert z_0)\} \log p_\theta (z_0 \vert z_1) -
+
+ &\mathbb\{E\}_\{z_\{T-1\} \sim q(\bullet \vert z_0)\} \big[ \text\{D\}_\{\text\{KL\}\} (q(z_T \vert z_\{T-1\}) \Vert p(z_T) ) \big] - \notag
+
+ &\sum_\{t=1\}^\{T-1\} \mathbb\{E\}_\{(z_\{t-1\}, z_\{t+1\}) \sim q(\bullet \vert z_0)\} \big[ \text\{D\}_\{\text\{KL\}\} (q(z_t \vert z_\{t-1\}) \Vert p_\theta(z_t \vert z_\{t+1\}) ) \big], \notag$$
+
+providing an optimization target in the form of $\max_\theta \log p_\theta (\mathcal D)$.
+
+In their seminal work on using DMs for variational inference, @hoDenoisingDiffusionProbabilistic2020 introduce major contributions regarding solving $\min_\theta -\log p_\theta(o,a)$.
+In particular, @hoDenoisingDiffusionProbabilistic2020 exclusively adopt a fixed *Gaussian* posterior in the form of $q(z_t \vert z_\{t-1\}) = \mathcal\{N\}(\sqrt\{1-\beta_t\}z_\{t-1\}, \beta_t \mathbf I)$.
+The choice of adopting Gaussians has profound implications on the generative process modeled.
+Indeed, under the (mild) assumption that the variance is sufficiently small $\beta_t \leq \eta, \eta \in \mathbb R^+$, @sohl-dicksteinDeepUnsupervisedLearning2015 proved that the likelihood $p(z_\{t-1\} \vert z_t)$ is Gaussian as well, which allows for the particularly convenient parametrization of the approximate likelihood $p_\theta (z_\{t-1\} \vert z_t) = \mathcal N(\mu_\theta(z_t, t), \Sigma_\theta(z_t,t)), \ t \in [1,T]$, as well as for closed-form tractability of the KL-divergence terms in the referenced figure.
+Further, the posterior's structure also enables an analytical description for the distribution of the $t $-th latent variable, $ q(z_t \vert z_0) = \mathcal N (\sqrt\{\bar\{\alpha\}_t\}z_0, (1-\bar\{\alpha\}_t) \mathbf\{I\})$, with $\alpha_t = 1-\beta_t, \ \bar \alpha_t = \prod_\{k=1\}^t \alpha_k $, which conveniently avoids iterative posterior sampling.
+
+
+
+*DMs iteratively corrupt samples (left) from an unknown distribution into a quasi-standard Gaussian (center), learning the displacement field (right) that permits to reconstruct samples from the unknown target distribution by iteratively denoising samples of a tractable, easy-to-sample distribution.*
+
+Finally, adopting Gaussian posteriors permits a particularly pleasing interpretation of the dynamics of training DMs [@permenterInterpretingImprovingDiffusion2024].
+By using Gaussian posteriors, the hierarchical latent variables effectively lose increasingly more information about the original (unknown) distribution's sample, $ z_0$, increasingly distributing according to a standard Gaussian and thus containing no information at all (Figure Section fig:diffusion-robot-actions).
+Figure the referenced figure illustrates this procedure on a simplified, bidimensional observation-action distribution, where we considered $o=q_2$ and $a=q^h_2$, with $q_2$ representing the robot's *elbow flex* actuation and $q^h_2$ the human teleoperator's robot elbow flex.
+
+
+
+*A joint action-observation distribution, in the simplified case where the observation is the elbow-flex actuation in a SO-100, and the action is the recorded position for the same joint in the teleoperator arm. The motion recorded being teleoperated, the points distribute along the diagonal.*
+
+Because the recorded behavior is teleoperated, measurements mostly distribute along the line $a = o + \eta, \eta \sim \mathcal N(0,1)$, with $\eta$-variability accounting for minor control inconsistencies (Figure Section fig:ch4-action-vs-observation-distribution).
+Using Gaussian posteriors---i.e., adding Gaussian noise---effectively simulates a *Brownian motion* for the elements in the distribution's support (in Figure the referenced figure, $\mathcal\{O\} \times \mathcal\{A\}$), whereby information *diffuses away* from the samples, and comparing the diffused samples to the original data points one can derive an estimate of the total displacement induced by diffusion.
+Under the only assumption that the likelihood of the diffused samples is low under the original unknown data distribution, one can effectively approximate the unknown distribution by learning to *reverse* such displacement.
+This key intuition allows to write a simplified training objective:
+
+$$
+ \mathcal L(\theta) = \mathbb\{E\}_\{t, z_0, \epsilon\} \big[
+ \Vert \epsilon - \epsilon_\theta(\sqrt\{\bar \alpha_t\} z_0 + \epsilon \sqrt\{1 - \bar \alpha_t\}, t) \Vert^2 \big], \quad t \sim \mathcal\{U\}(\{1,\dots,T\}), \quad
+ z_0 \sim \mathcal\{D\}, \quad
+ \epsilon \sim \mathcal\{N\}(\mathbf\{0\},\mathbf\{I\}).$$
+
+In this simplified (minimization) objective, the optimization process differs from the referenced figure in that, rather than maximizing $p_\theta$ directly, the parameters $\theta $ of the pairwise likelihood $ p_\theta(z_\{t-1\} \vert z_t)$ are adjusted to *predict the total displacement* $\epsilon $ for a randomly long ($ t \sim \mathcal\{U\}(\\{1,\dots,T\\})$) diffusion process starting from a sample of the target distribution.
+
+By learning the total displacement between a generally uninformative, corrupted sample---obtained by diffusing information away from a sample of the unknown distribution---and the original sample---a displacement that is significant ($\Vert \epsilon \Vert > 0$) whenever input and target distributions are sufficiently different---@hoDenoisingDiffusionProbabilistic2020 show that one can approximate the underlying distribution by reversing the displacement, *denoising* samples.
+Interestingly, under the hypothesis that real-world data belongs to a single higher dimensional manifold (Manifold Hypothesis), @permenterInterpretingImprovingDiffusion2024 show that diffusion learns the gradient of a distance function from any off-manifold point (such as perturbed, uninformative samples) to the data manifold itself.
+Following this gradient---i.e., denoising a sample from an uninformative distribution---corresponds to projecting back into the manifold, yielding a procedure to sample from unknown distributions by means of Euclidean projection.
+Indeed, under the assumption that $p_\theta (z_\{t-1\} \vert z_t)$ is Gaussian, then sampling $z_\{t-1\} \sim p_\theta(\bullet \vert z_\{t\})$ corresponds to computing
+
+$$z_\{t-1\} = \frac\{1\}\{\sqrt\{\alpha_t\}\} \left( z_t - \frac\{\beta_t\}\{\sqrt\{1 - \bar\alpha_t\}\} \epsilon_\theta(z_t, t) \right) + \sigma_t \epsilon, \quad \epsilon \sim \mathcal N(\mathbf\{0\}, \mathbf\{I\}), $$
+
+thus showing that the lower-level latent variables in a DM can be obtained by iteratively removing noise from the one-step higher order variable, using the noise regressor $\epsilon_\theta(z_t, t)$ learned minimizing the referenced figure.
+
+### Flow Matching
+
+The posterior parametrization adopted by DMs proved traditionally effective, yet it raised concerns about its efficiency at inference time, where a possibly large number of compute-expensive denoising steps are needed in order to recover a sample from the target distribution.
+Flow Matching (FM) [@lipmanFlowMatchingGenerative2023] extends DMs to the general case of arbitrary, parametrized likelihood and posteriors, and in this defines a superseding class of GMs providing a unified framework for learning *continuous transformations* between distributions, encompassing and generalizing DMs.
+Instead of a *stochastic, discrete, multi-step* denoising process, FM aims to learn a *deterministic, continuous, differentiable flow* $\psi: [0,1] \times Z \to Z $, formalized starting from a possibly time-dependent vector field $ v: [0,1] \times Z \to Z $ transporting samples from a simple prior distribution $ p_0$---e.g., a standard Gaussian---to a more complex, potentially unknown data distribution $p_1$ over time.
+Note how FM models time $t \in [0,1]$ to be varying continuously while moving away *from* an easy-to-sample distribution $p_0$ *towards* the unknown data-distribution, $p_1$.
+This results in a continuous and deterministic trajectory for each sample, which can be more efficient to generate compared to the stochastic paths of DMs.
+Formally, FM can be fully characterized by an ordinary differential equation (ODE) relating instantaneous variations of flows with the underlying vector field, and hence providing complete trajectories over the distributions' support when integrating over time,
+
+$$\frac\{d\}\{dt\} \psi(t, z) &= v(t, \psi(t, z))
+
+ \psi(0, z) &= z$$
+
+FM proved very effective in a variety of applications, ranging from image [@esserScalingRectifiedFlow2024] and video generation [@polyakMovieGenCast2025] to robotics control [@blackPi0VisionLanguageActionFlow2024].
+Most notably, in their introductory work on FM for GM, @lipmanFlowMatchingGenerative2023 show how DMs can be seen as a specific instance of FM where the *conditional* target vector field $u$ approximated by the noise regressor corresponds to
+$$\begin\{equation\}
+
+ u(t, z\vert z_0) = \frac\{\frac\{d\}\{dt\}\alpha(1-t)\}\{1 - (\alpha(1-t))^2\}(\alpha(1-t)z - z_0), \quad \alpha(t) = e^\{-\frac12 \int_0^t \beta(s) ds\}, \quad \forall z_0 \in \mathcal D
+\end\{equation\}$$
+Note that the traditional discrete-time noise-scheduler $\{\beta_t\}_\{t=0\}^T$ is now generalized to a continuous map $\beta : [0,1] \mapsto \mathbb R^+$.
+Crucially, @lipmanFlowMatchingGenerative2023 prove that by exclusively optimizing the vector field for individual data points $z_0 \in \mathcal D $, one also retrieves the optimal flow to morph the entire support of the initial distribution $ p_0$ into $p_1 \ \text\{s.t.\} \mathcal D \sim p_1$.
+
+
+
+*Probability distributions can be modified applying vector fields resulting in a flow of mass in the support. When acting over time, vector fields can effectively change the distribution's structure.*
+
+While the noising schedule of DMs results in a stochastic process that resembles a random walk, FM allows for more general---potentially, deterministic---likelihood and posterior parametrization.
+In the FM literature the likelihood and posterior probability densities defined along a HMLV model are typically jointly referred to as a *probability path*, where the distributions for successive adjacent transitions in the HMLV model are related by the (normalized) flow between them (Figure Section fig:ch4-normalizing-flows).
+The inherent flexibility of FM is one of their key advantages over DMs, as it opens up the possibility of *learning* more efficient paths.
+For instance, one can design probability paths inspired by Optimal Transport (OT)---a subdiscipline studying the problem of finding the most efficient way to morph one probability distribution into another.
+Probability paths obtained through OT paths tend to be *straighter* than diffusion paths (Figure Section fig:ch4-diffusion-paths-versus-fm), which can lead to faster and more stable training, as well as higher-quality sample generation with fewer steps at inference time.
+By avoiding unnecessary backtracking associated with the inherent stochastic nature of both the noising and denoising process in DMs, test-time compute is typically significantly reduced, while retaining comparable results [@lipmanFlowMatchingGenerative2023].
+
+
+
+*Compared to diffusion, flow matching distorts distribution along a less randomic pattern, resulting in a clearer interpolation between source and target distribution. The visualization shows an example comparison between these two methods on joint distribution of robot observations and actions over $T=50$ steps.*
+
+In practice, FM can be applied to generative modeling by learning a vector field regressor $v_\theta(z, t)$ to approximate a given target vector field $u(t, z)$.
+In the particular case of DMs, $u(t, z)$ is defined as in the referenced figure, while in principle the target vector field can be learned to induce a particular transportation, or fixed according to OT.
+Given a sample from the data distribution $z_1 \sim p_1$ and a sample from an easy-to-sample prior $z_0 \sim p_0$, Conditional Flow Matching (CFM) defines a simple path between them using *linear interpolation* between samples $z_t = (1-t)z_0 + t z_1$, resulting in the target vector field $u(t, z_t) = z_1 - z_0$.
+Then, a FM model can be trained with the simple regression objective defined as
+
+$$
+ \mathcal L(\theta) = \mathbb\{E\}_\{t, z_0, z_1\} \big[
+ \Vert v_\theta((1-t)z_0 + t z_1, t) - (z_1 - z_0) \Vert^2 \big], \quad t \sim \mathcal\{U\}([0,1]),$$
+
+where $z_0 \sim p_0(\bullet)$ and $z_1 \sim p_1(\bullet)$. Note how in the referenced figure---differently from the referenced figure---time is assumed to be varying continuously $t \sim \mathcal U([0,1])$ rather than discretely $t \sim \mathcal U(\\{0,1\\})$, a key property of flow-based models.
+The objective in the referenced figure directly regresses the learned vector field onto the simple, straight path connecting a point from the prior and a point from the data, providing a simulation-free training procedure that is both stable and efficient.
+At inference time, samples are generated by starting with $z_0 \sim p_0$ and iteratively refined according to $\frac\{dz\}\{dt\} = v_\theta(z_t, t)$ for $t \in [0,1]$---an operation that can be numerically carried out with standard ODE solvers.
+
+## Action Chunking with Transformers
+
+While GMs prove useful in learning complex, high-dimensional multi-modal distributions, they do not natively address the compounding errors problem characteristic of online, sequential predictions.
+In Action Chunking with Transformers (ACT), @zhaoLearningFineGrainedBimanual2023 present an application of VAEs to the problem of learning purely from offline trajectories, introducing a simple, yet effective method to mitigate error compounding and learning high-fidelity autonomous behaviors.
+Drawing inspiration from how humans plan to atomically enact sequences of actions $a_\{t:t+k\}$ instead of single actions $a_t $, @zhaoLearningFineGrainedBimanual2023 propose learning a GM on a dataset of input demonstrations by modeling *action chunks*.
+Besides contributions to learning high-performance autonomous behaviors, @zhaoLearningFineGrainedBimanual2023 also introduce hardware contributions in the form of a low-cost bimanual robot setup (ALOHA) capable of performing fine-grained manipulation tasks, such as opening a lid, slotting a battery in its allotment or even preparing tape for application.
+
+On the robot learning side of their contributions, @zhaoLearningFineGrainedBimanual2023 adopt transformers as the architectural backbone to learn a *Conditional* VAE [@sohnLearningStructuredOutput2015].
+Conditional VAEs are a variation of the more standard VAE formulation introducing a conditioning variable on sampling from the latent prior, allowing the modeling of *one-to-many* relationships between latent and data samples.
+Further, in stark contrast with previous work [@florenceImplicitBehavioralCloning2022; @jannerPlanningDiffusionFlexible2022], @zhaoLearningFineGrainedBimanual2023 do not learn a full joint $ p_\theta(o,a)$ on observation and actions.
+While the *policy* distribution $p_\theta(a \vert o)$ can in principle be entirely described from its joint $p_\theta(o,a)$, it is often the case that the conditional distribution is intractable when using function approximators, as $p_\theta(a \vert o) = \tfrac\{p_\theta(o,a)\}\{\int_\mathcal\{A\} p_\theta(o,a)\}$ and the integral in the denominator is typically intractable.
+Instead of modeling the full joint using a vanilla VAE, @zhaoLearningFineGrainedBimanual2023 propose learning a *conditional* VAE [@sohnLearningStructuredOutput2015] modeling the policy distribution directly $p (a \vert o)$.
+
+In practice, when learning from demonstrations adopting CVAEs results in a slight modification to the VAE objective in the referenced figure, which is adapted to
+
+$$
+ \text\{ELBO\}_\{\mathcal D\}(\theta, \phi, \omega) = \sum_\{i=0\}^\{N\} \left(
+ \mathbb\{E\}_\{z \sim q_\phi(\cdot \vert o_i, a_i)\} \big[ \log p_\theta(a_i \vert z, o_i) \big]
+ - \text\{D\}_\{\text\{KL\}\} \big[ q_\phi(z \vert o_i, a_i) \Vert p_\omega(z \vert o_i) \big]
+ \right)$$
+
+Notice how in the referenced figure we are now also learning a new set of parameters $\omega$ for the prior distribution in the latent space.
+Effectively, this enables conditioning latent-space sampling (and thus reconstruction) during training, and potentially inference, proving useful when learning inherently conditional distributions like policies.
+Further, ACT is trained as a $\beta$-CVAE [@higgins2017beta], using the weight $\beta$ of the KL regularization term in the referenced figure as a hyperparameter regulating the information condensed in the latent space, where a higher $\beta$ results in a less expressive latent space.
+
+In their work, @zhaoLearningFineGrainedBimanual2023 ablated using a GM to learn from human demonstrations compared to a simpler, supervised objective, $\mathcal L_1(a,a^\prime) = \Vert a - a^\prime \Vert_1$.
+Interestingly, they found the performance of these two approaches to be comparable when learning from *scripted* demonstrations.
+That is, when learning from data collected rolling out a predetermined set of commands $[q^c_0, q^c_1, \dots]$, GM did *not* prove competitive compared to standard supervised learning.
+However, when learning from human demonstrations---i.e., from data collected executing commands coming from a human controller $[q^h_0, q^h_1, \dots]$---they found performance (success rate on a downstream task) to be severely degraded (-33.3%) when replacing the generative model with the supervised objective. The authors also ablate the action chunking paradigm, reporting significant performance gains for performing action chunking. To avoid acting open-loop, @zhaoLearningFineGrainedBimanual2023 design an inference process consisting in performing inference at every timestep $t $ and then aggregating overlapping chunks using the chunks' exponential moving average.
+
+
+
+*Action Chunking with Transformer (ACT), as in [@zhaoLearningFineGrainedBimanual2023]. ACT introduces an action chunking paradigm to cope with high-dimensional multi-modal demonstration data, and a transformer-based CVAE architecture.*
+
+In ACT (Figure Section fig:ch4-act), inference for a given observation $ o \in \mathcal O $ could be performed by (1) computing a prior $ p_\omega(z \vert o)$ for the latent and (2) decoding an action chunk from a sampled latent $z \sim p_\omega(\bullet \vert o)$, similarly to how standard VAEs generate samples, with the exception that vanilla VAEs typically pose $p(z\vert o) \equiv p(z) \sim \mathcal N(\mathbf\{0\}, \mathbf\{I\})$ and thus skip (1).
+
+
+
+*The CVAE encoder used in ACT. Input action chunks are first embedded and aggregated with positional embeddings, before being processed alongside embedded proprioceptive information, and a learned `[CLS]` token used to aggregate input-level information and predict the style variable $ z$. The encoder is entirely disregarded at inference time.*
+
+However, the authors claim using a deterministic procedure to derive $z $ may benefit policy evaluation, and thus avoid sampling from the conditional prior at all.
+At test time, instead, they simply use $ z = \mathbf\{0\}$, as the conditional prior on $z $ used in training is set to be the unit Gaussian.
+At test time, conditioning on the observation $ o$ is instead achieved through explicitly feeding proprioceptive and visual observations to the decoder, $p_\theta(a \vert z, o)$, while during training $z $ is indeed sampled from the approximate posterior distribution $ q_\phi(z \vert o, a)$, which, however, disregards image observations and exclusively uses proprioceptive states to form $o $ for efficiency reasons (as the posterior $ q_\phi $ is completely disregarded at test time).
+
+
+
+*The CVAE decoder used in ACT, comprising a full encoder-decoder Transformer architecture. Camera observations from all $n$ camera views are first embedded using pre-trained visual encoders, and then concatenated to the corresponding positional embeddings. Then, alongside embeddings for the available proprioceptive information and the style variable $z$ retrieved from the CVAE encoder, the Transformer encoder shares the matrices $K,V$ with the Transformer decoder, trained to decode fixed position embeddings into valid action chunks.*
+
+### Code Example: Learning ACT
+
+## Diffusion Policy
+
+DMs proved very effective in approximating complex highly dimensional distributions, such as distributions over images [@hoDenoisingDiffusionProbabilistic2020] or videos [@polyakMovieGenCast2025], thanks to their inherent capability to deal with multimodal data and training stability.
+In Diffusion Policy (DP), @chiDiffusionPolicyVisuomotor2024 present an application of DMs to the field of robot learning, leveraging diffusion to model human expert demonstrations in a variety of simulated and real-world tasks.
+Similarly to Action Chunking with Transformer [@zhaoLearningFineGrainedBimanual2023], @chiDiffusionPolicyVisuomotor2024 (1) adopt a modified *observation-conditioned target distribution* instead of the full joint $p(o,a)$ and (2) predict multiple actions into the future instead of a single action.
+Besides the intractability of the observations' marginal $p_\theta(o)$ given $p_\theta(o,a)$, DP's rationale for modeling the data distribution via $p_\theta(a \vert o)$ stems from the rather test-time compute intensive nature of diffusion, whereby generating actions *alongside* observations is likely to result in higher complexity and thus a likely larger number of denoising operations, which would prove ultimately pointless considering robotics applications rely on the capability to generate controls rather than reproducing observations.
+
+In practice, conditioning on observation data is achieved by conditioning the added-noise regressor $\epsilon_\theta$ introduced in the referenced figure on a stack of $T_o$ observations, resulting in the *conditional* simplified diffusion objective
+
+$$\begin{aligned}
+\mathcal{L}(\theta) &= \mathbb{E}_{t,\, a_{t:t+T_a},\, \epsilon} \big[
+ \Vert \epsilon - \epsilon_\theta(\sqrt{\bar \alpha_t}\, a_{t:t+T_a} + \epsilon \sqrt{1 - \bar \alpha_t},\, t,\, o_{t-T_o:t}) \Vert^2 \big], \\
+ & t \sim \mathcal{U}(\{1,\dots,T\}), \quad
+ a_{t:t+T_a},\, o_{t-T_o:t} \sim \mathcal{D}, \quad
+ \epsilon \sim \mathcal{N}(\mathbf{0},\mathbf{I}).
+\end{aligned}$$
+
+Notice how in the referenced figure the noise regressor is conditioned both on the latent variable rank $t$ *and* on a stack of previous observations $o_{t-T_o:t}$.
+@chiDiffusionPolicyVisuomotor2024 claim the combination of (1) conditioning on a horizon of previous observations and (2) predicting multiple actions into the future allows DP to *commit to specific modes* in the data at inference time, which proves essential for good performance and avoiding indecisiveness.
+
+
+
+*The Diffusion Policy architecture, as in [@chiDiffusionPolicyVisuomotor2024]. A stack of $H_o$ previous observations is used as external conditioning to denoise a group of $H_a$ actions. Conditioning is used at every layer of a U-Net block, and in practice allows to obtain fully-formed action chunks with as little as $T=10$ denoising steps.*
+
+The referenced figure shows the convolution-based version of the architecture proposed by @chiDiffusionPolicyVisuomotor2024, illustrating inference on a single sample from $\mathcal D$ for simplicity.
+An arbitrarily noisy chunk of $H_a$ actions $\tilde a_{t:t+H_a}$ is mapped to a learned high-dimensional space.
+Similarly, both image observations and poses are embedded before being aggregated to the action embeddings.
+Then, a U-Net [@ronnebergerUNetConvolutionalNetworks2015] is trained to regress the noise added into $\tilde a_{t:t+H_a}$, using observation conditioning information at every layer and seeking to optimize the objective above.
+At inference time, the noise predictor is used to predict the quantity of noise at every $t \in [T, \dots, 0]$ and iteratively subtract it from $\tilde a_{t:t+T_a}$, reversing the diffusion process simulated in training conditioned on $o_{t-T_o:t}$ to predict $a_{t:t+T_a}$.
+
+Trained using 50-150 demos (15-60 minutes of teleoperation data), DP achieves strong performance on a variety of simulated and real-world tasks, including dexterous and deformable manipulation tasks such as sauce pouring and mat unrolling.
+Notably, the authors ablated the relevance of using RGB camera streams as input to their policy, and observed how high frame-rate visual observations can be used to attain performance (measured as success rate) comparable to that of state-based policies, typically trained in simulation with privileged information not directly available in real-world deployments.
+As high frame-rate RGB inputs naturally accommodate dynamic, fast-changing environments, @chiDiffusionPolicyVisuomotor2024's conclusion offers significant evidence for learning streamlined control policies directly from pixels.
+In their work, @chiDiffusionPolicyVisuomotor2024 also ablate the performance of DP against their baseline as a function of the size of the dataset collected, showing that DP outperforms the considered baseline for every benchmark size considered.
+Further, to accelerate inference, @chiDiffusionPolicyVisuomotor2024 employ Denoising Diffusion Implicit Models [@songDenoisingDiffusionImplicit2022], a variant of Denoising Diffusion Probabilistic Models (DDPM) [@hoDenoisingDiffusionProbabilistic2020] adopting a strictly deterministic denoising paradigm (differently from DDPM's natively stochastic one) inducing the same final distribution as DDPM, and yet resulting in 10 times fewer denoising steps at inference time [@chiDiffusionPolicyVisuomotor2024].
+Across a range of simulated and real-world tasks, @chiDiffusionPolicyVisuomotor2024 find DPs particularly performant when implementing a transformer-based network as $\epsilon_\theta$, although the authors note the increased sensitivity of transformer networks to hyperparameters and thus explicitly recommend starting out with a simpler, convolution-based architecture for diffusion (Figure fig:diffusion-policy-architecture), which is however reported to be biased towards learning low-frequency components [@tancikFourierFeaturesLet2020] and thus may prove more challenging to train with non-smooth action sequences.
+
+### Code Example: Learning Diffusion Policies
+
+## Optimized Inference
+
+Modern visuomotor policies output *action chunks*---sequences $\pi(o_t) = \mathbf{A}_t$ with $\mathbf{A}_t = \bigl(a_t, a_{t+1}, \dots, a_{t+H_a}\bigr)$ being a sequence of $H_a \gg 1$ low-level commands enqueued in an action queue, originating from an environment observation, $o_t$.
+Predicting series of actions instead of single commands proved essential in learning complex, multi-modal behavior [@zhaoLearningFineGrainedBimanual2023; @chiDiffusionPolicyVisuomotor2024].
+
+Typically, the robot executes the entire action chunk $\mathbf{A}_t$, before a new observation $o_{t+H_a}$ is passed to the policy $\pi$ to predict the next chunk.
+This results in open-loop inference in between observations captured every $H_a$ timesteps.
+@zhaoLearningFineGrainedBimanual2023 adopts a different strategy whereby the robot controller interleaves chunk prediction $\mathbf{A}_t \gets \pi(o_t)$ and chunk consumption $a_t \gets \textbf{PopFront}(\mathbf{A}_t)$, computing a new chunk of actions at every timestep $t$ and aggregating the predicted chunks on overlapping sections.
+While adaptive---every observation at every timestep $o_t$ is processed---such approaches rely on running inference continuously, which can be prohibitive in resource-constrained scenarios, such as edge deployments.
+
+A less resource-intensive approach is to entirely exhaust the chunk $\mathbf{A}$ before predicting a new chunk of actions, a strategy we refer to as *synchronous* (sync) inference.
+Sync inference efficiently allocates computation every $H_a$ timesteps, resulting in a reduced average computational burden at control time.
+In contrast, it inherently hinders the responsiveness of robot systems, introducing blind lags due to the robot being *idle* while computing $\mathbf{A}$.
+
+We directly assess the lack of adaptiveness of robot systems due to acting open-loop, and the presence of lags at runtime, by decoupling action chunk prediction $\mathbf{A}$ from action execution $a_t \gets \textbf{PopFront}(\mathbf{A}_t)$, developing an *asynchronous* (async) inference stack (Algorithm alg:async-inference), whereby a **RobotClient** sends an observation $o_t$ to a **PolicyServer**, receiving an action chunk $\mathbf{A}_t$ once inference is complete (Figure fig:ch4-async-inference).
+In this, we avoid execution lags by triggering chunk prediction while the control loop is still consuming a previously available queue, aggregating it with the newly incoming queue whenever available.
+In turn, async-inference tightens the loop between action prediction and action execution, by increasing the frequency at which observations are processed for chunk prediction.
+Crucially, decoupling action prediction from action execution also directly allows to allocate more computational resources on a remote policy server sending actions to the robot client over networks, something which may prove very effective in resource-constrained scenarios such as low-power robots.
+
+
+
+***Asynchronous inference**. Illustration of the asynchronous inference stack. Note that the policy can be run on a remote server, possibly with GPUs.*
+
+```
+Algorithm: Asynchronous inference control loop (RobotClient)    [alg:async-inference]
+
+Input: horizon T, chunk size H_a, threshold g ∈ [0, 1]
+Init:  capture o_0; send o_0 to PolicyServer; receive A_0 ← π(o_0)
+
+for t = 0 to T:
+    a_t ← PopFront(A_t)
+    Execute(a_t)                               # execute action at step t
+    if |A_t| / H_a < g:                        # queue below threshold
+        capture new observation o_{t+1}
+        if NeedsProcessing(o_{t+1}):           # similarity filter; an empty queue forces processing
+            async_handle ← AsyncInfer(o_{t+1}) # trigger new chunk prediction (non-blocking)
+            Ã_{t+1} ← π(o_{t+1})               # new queue is predicted with the policy
+            A_{t+1} ← f(A_t, Ã_{t+1})          # aggregate overlaps (if any)
+    if NotCompleted(async_handle):
+        A_{t+1} ← A_t                          # no update on queue (inference is not over just yet)
+```
+
+#### Implementation details
+
+*Async* inference (1) tightens the control loop by capturing observations more often, directly eliminates idle gaps at runtime, and (2) directly allows to run inference on more powerful computational resources than the ones typically available onboard autonomous robotic platforms.
+
+Algorithmically, we attain (1) on the **RobotClient**-side by consuming actions from a readily available queue until a threshold condition on the number of remaining actions in the queue ($\vert \mathbf{A}_t \vert / H_a < g$) is met. When this condition is triggered, a new observation of the environment is captured and sent to the (possibly remote) **PolicyServer**.
+To avoid redundant server calls and erratic behavior at runtime, observations are compared in joint-space, and near-duplicates are dropped.
+Two observations are considered near-duplicates if their distance in joint-space is under a predetermined threshold, $\epsilon \in \mathbb R_+$.
+Importantly, when the queue available to the robot client eventually becomes empty, the most recent observation is processed regardless of similarity.
+
+Interestingly, the behavior of async inference can be studied analytically. First, let $\ell$ be a random variable modeling the time needed to receive an action chunk $\mathbf{A}$ after sending an observation $o$, i.e. the sum of (1) the time to send across the observation $o$ between the **RobotClient** and **PolicyServer**, $t_{C \to S}$, (2) the inference latency on the **PolicyServer**, $\ell_S$, and (3) the time to send $\mathbf{A}$ between the **PolicyServer** and **RobotClient**, $t_{S \to C}$. Assuming independence, $\mathbb E [\ell] = \mathbb E[t_{C \to S}] + \mathbb E[\ell_S] + \mathbb E[t_{S \to C}]$, which can be further simplified to $\mathbb E[\ell] \simeq \mathbb E[\ell_S]$, assuming communication time is (1) equal in both directions and (2) negligible with respect to the inference latency. Second, let $\Delta t$ be the environment's control cycle. With a real-world frame-rate of 30 frames per second, $\Delta t=33\text{ms}$. Consequently, exhausted queues at runtime---i.e. being idle awaiting a new chunk---are avoided for $g \geq \frac{\mathbb E[\ell_S] / \Delta t}{H_a}$. In this, the queue threshold $g$ plays a major role relatively to the availability of actions to the **RobotClient**.
+
+The referenced figure illustrates how the size of the action chunk $\lvert \mathbf{A}_t \rvert$ evolves over time for three representative values of $g$, detailing the following key scenarios:
+
+- **Sequential limit $(g=0)$.** The client drains the entire chunk before forwarding a new observation to the server. During the round-trip latency needed to compute the next chunk, the queue is empty, leaving the robot *incapable of acting*. This reproduces the behavior of a fully sequential deployment and results in an average of $\mathbb E[\ell_S]$ idle seconds.
+
+- **Asynchronous inference $(g \in (0,1))$.** The client consumes a fraction $1-g$ of its available queue $\mathbf{A}_{t-1}$ before triggering inference for a new action queue $\mathbf{A}_{t}$, amortizing computation while keeping the queue from emptying. The overlap between successive chunks provides a buffer against modeling errors without the full cost of the $g=1$ regime. The updated queue $\mathbf{A}_t$ is obtained aggregating queues on the overlapping timesteps between $\mathbf{A}_{t-1}$ and the incoming $\tilde{\mathbf{A}}_{t}$.
+
+- **Compute-intensive limit $(g=1)$.** As an extreme case, and in keeping with @zhaoLearningFineGrainedBimanual2023, an observation is sent at *every* timestep. The queue is therefore almost always filled, with only a minor saw-tooth due to $\Delta t/\mathbb E[\ell_S] < 1$. While maximally reactive, this setting incurs one forward pass per control tick and can prove prohibitively expensive on limited hardware. Importantly, because the client is consuming actions while the server computes the next chunk, the available queue never gets completely refilled.
+
+
+
+*Action queue size evolution at runtime for various levels of $g$ when (A) not filtering out observations based on joint-space similarity and (B) filtering out near-duplicate observations, measuring their similarity in joint-space.*
+
+The referenced figure emphasizes the trade-off governed by $g$: small values result in idle periods, whereas $g \approx 1$ assumes a highly accurate model and pays a significant compute price. In practice, choosing $g\in(0,1)$ strikes a balance between reactivity and resource budgets.
+If not for the aforementioned similarity filter, the **RobotClient** would send observations for processing every $(1 - g) H_a \cdot \Delta t$ seconds, receiving a new chunk of actions every $(1 - g) H_a \cdot \Delta t + \mathbb E[\ell_S]$ seconds, on average.
+The presence of the observation similarity filter dilates this processing time, and serves the scope of avoiding the robot stalling due to the queue being constantly integrated with an incoming, nearly identical, action chunk.
+In particular, the referenced figure shows a queue which is filled with incoming actions *unless* near-duplicate observations are filtered out from the processing pipeline. For clarity, the red arrow in the referenced figure highlights a timestep where the observation similarity mechanism is bypassed, forcing a (nearly identical) observation to be processed as the queue results empty.
+
+### Code Example: Using Async Inference
+
+[^1]: Throughout, we will adopt the terminology and notation for SL introduced in @shalev-shwartzUnderstandingMachineLearning2014
+
+[^2]: $o,a = z_0$ for the sake of notation. Steps omitted for brevity. See Section A in @hoDenoisingDiffusionProbabilistic2020 for a complete derivation.
\ No newline at end of file
diff --git a/app/src/content/chapters/06_next_directions.mdx b/app/src/content/chapters/06_next_directions.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..c23fdc6cdbfce1d0719a224577736ef45099370d
--- /dev/null
+++ b/app/src/content/chapters/06_next_directions.mdx
@@ -0,0 +1,8 @@
+\- Post training VLAs
+- From Imitation to Refinement
+- EXPO
+
+\- World Models for robotics
+- Cosmos
+- World Models (1X)
+- Sima and Genie 1
\ No newline at end of file
diff --git a/app/src/content/chapters/07_conclusions.mdx b/app/src/content/chapters/07_conclusions.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..f179bc4923a3e5e4c68e02b386157e085f991c71
--- /dev/null
+++ b/app/src/content/chapters/07_conclusions.mdx
@@ -0,0 +1,18 @@
+# Conclusions
+
+This tutorial has chronicled the paradigmatic shift transforming robotics, from the structured, model-based methods of its classical era to the dynamic, data-driven approaches that define modern robot learning.
+We began by examining the limitations of traditional dynamics-based control, highlighting the brittleness and the significant engineering overhead required by traditional approaches, which in turn motivates more flexible, less model-intensive learning approaches.
+
+Our exploration of learning-based techniques revealed a clear trajectory of progress.
+We began with Reinforcement Learning, acknowledging its power to learn through interaction but also its real-world challenges, particularly sample inefficiency and the complexities of reward design.
+We saw how modern, data-driven approaches like HIL-SERL can make real-world RL feasible by incorporating human guidance and prior data.
+The inherent difficulties of RL, however, naturally motivated a deeper dive into imitation learning. This led us to single-task policies, where Behavioral Cloning, powered by advanced generative models like Action Chunking with Transformers and Diffusion Policy, demonstrated the ability to learn complex, multimodal behaviors directly from expert demonstrations.
+This laid the groundwork for the current frontier: the development of generalist, language-conditioned Vision-Language-Action models.
+Architectures like $\pi_0$ and SmolVLA---leveraging powerful pre-trained backbones and sophisticated generative modeling techniques like flow matching---represent a significant leap towards building foundational models for robotics that can generalize across varied tasks and embodiments.
+
+A central theme throughout this work has been the critical role of openness in accelerating this progress.
+The recent explosion in capability is inseparable from the advent of large-scale, openly available datasets, the standardization of powerful and efficient model architectures, and the development of accessible, open-source software like **LeRobot**.
+We argue the convergence towards an open approach to robotics is not merely a trend but a fundamental enabler, democratizing access to cutting-edge research in a traditionally siloed field like robotics.
+
+We believe the path ahead for robot learning to be extremely exciting, and filled with fundamental challenges we have yet to even scratch the surface of.
+The journey detailed in this tutorial, from the first principles to the state-of-the-art, equips researchers and practitioners alike with the context and the tools to chart their own journey in the future of robotics.
\ No newline at end of file
diff --git a/app/src/content/chapters/A_foreword.mdx b/app/src/content/chapters/A_foreword.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..a6138e7cad2ed4bc9074f1c125a76da664187421
--- /dev/null
+++ b/app/src/content/chapters/A_foreword.mdx
@@ -0,0 +1,25 @@
+# Foreword
+
+Robotics is an inherently multidisciplinary field, and is now witnessing unprecedented advancements since its inception in the 1960s.
+Yet, more than sixty years after the debut of Unimate, robots have still not fully integrated into the rich, unstructured, and dynamic world we humans inhabit.
+Over the decades, numerous disciplines have shown immense promise in tackling the challenges of creating autonomous systems.
+This tutorial takes a clear stance in the debate on whether modern Machine
+Learning can play a pivotal role in the development of
+autonomous robot systems: we believe this to be the case.
+
+Nonetheless, we also hold that the wealth of research from both academia and industry in classical robotics over the past six decades is, simply put, too valuable to be cast aside in favor of purely learning-based methods.
+However, the interplay between classical robotics and modern machine learning is still in its nascent stages, and the path to integration yet to be clearly defined.
+In turn, our goal here is to present what we consider to be the most relevant approaches within robot learning today, while warmly extending an invitation to collaborate to expand the breadth of this work! Start contributing today [here](https://github.com/fracapuano/robot-learning-tutorial).
+
+This tutorial...
+
+- Does *not* aim to be a comprehensive guide to the general field of robotics, manipulation or underactuated systems: @sicilianoSpringerHandbookRobotics2016, @tedrakeRoboticManipulationPerception and @tedrakeUnderactuatedRoboticsAlgorithms do this better than we ever could.
+
+- Does *not* aim to be an introduction to statistical or deep learning: @shalev-shwartzUnderstandingMachineLearning2014 and @prince2023understanding cover these subjects better than we ever could.
+
+- Does *not* aim to be a deep dive into Reinforcement Learning, Diffusion Models, or Flow Matching: invaluable works such as @suttonReinforcementLearningIntroduction2018, @nakkiranStepbyStepDiffusionElementary2024, and @lipmanFlowMatchingGuide2024 do this better than we ever could.
+
+Instead, our goal here is to provide an intuitive explanation as per why these disparate ideas have converged to form the exciting field of modern robot learning, driving the unprecedented progress we see today.
+In this spirit, we follow the adage: "a jack of all trades is a master of none, *but oftentimes better than a master of one*."
+
+We sincerely hope this tutorial serves as a valuable starting point for your journey into robot learning.
\ No newline at end of file
diff --git a/app/src/content/chapters/best-pratices.mdx b/app/src/content/chapters/best-pratices.mdx
deleted file mode 100644
index d987e138f94f7bd4ab78b18d6649b1b4725e7102..0000000000000000000000000000000000000000
--- a/app/src/content/chapters/best-pratices.mdx
+++ /dev/null
@@ -1,64 +0,0 @@
-
-import visualPoster from '../assets/image/visual-vocabulary-poster.png';
-import Note from '../../components/Note.astro';
-import ResponsiveImage from '../../components/ResponsiveImage.astro';
-
-
-## Best Practices
-
-### Short sections
-Break content into **small, purpose‑driven sections**. Each section should answer a **single question** or support one idea. This improves **scanability**, helps readers navigate with the TOC, and makes later edits safer.
-
-### Clear, minimal annotations
-Favor **concise captions** and callouts that clarify what to look at and why it matters. In code, **highlight just the lines** that carry the idea; avoid verbose commentary. **Precision beats volume**.
-
-### Explain math notation
-**Introduce symbols and variables** the first time they appear, and prefer **well‑known identities** over custom shorthand. When formulas carry the message, add one sentence of **plain‑language interpretation** right after.
-
-
-
-
- For example, in linear regression with features $x \in \mathbb{R}^d$, weights $w \in \mathbb{R}^d$, and bias $b$, the prediction is:
-
- $$
- \hat{y} = w^\top x + b
- $$
-
- A common training objective is the mean squared error over $N$ samples:
-
- $$
- \mathcal{L}(w,b) = \frac{1}{N} \sum_{i=1}^{N} (w^\top x_i + b - y_i)^2
- $$
-
- Interpretation: the model fits a hyperplane that minimizes the average squared prediction error.
-
-
-
-
-
-{/* ### Use the right color
-A palette encodes **meaning** (categories, magnitudes, oppositions), preserves **readability** and **accessibility** (sufficient contrast, color‑vision safety), and ensures **perceptually smooth transitions**. The three families below illustrate when to use **categorical**, **sequential**, or **diverging** colors and how they evolve from the same **reference hue**.
-
-
-
-
-
-
- You can choose a color from the palette to update palettes and copy them to your clipboard.
-
-
- It will be applied to the whole page.
-
- */}
-
-### Use the right chart
-
-Picking the right visualization depends on your goal (compare values, show distribution, part-to-whole, trends, relationships, etc.). The Visual Vocabulary poster below provides a concise mapping from **analytical task** to **chart types**.
-
-
-Financial-Times A handy reference to select chart types by purpose — click to enlarge.'}
-/>
diff --git a/app/src/content/chapters/components.mdx b/app/src/content/chapters/components.mdx
deleted file mode 100644
index f7465b29a0ed2fc8da12c1432a509419eb4b3708..0000000000000000000000000000000000000000
--- a/app/src/content/chapters/components.mdx
+++ /dev/null
@@ -1,297 +0,0 @@
-import { Image } from 'astro:assets';
-import placeholder from '../assets/image/placeholder.png';
-import audioDemo from '../assets/audio/audio-example.wav';
-import HtmlEmbed from '../../components/HtmlEmbed.astro';
-import Sidenote from '../../components/Sidenote.astro';
-import Wide from '../../components/Wide.astro';
-import Note from '../../components/Note.astro';
-import FullWidth from '../../components/FullWidth.astro';
-import Accordion from '../../components/Accordion.astro';
-import ResponsiveImage from '../../components/ResponsiveImage.astro';
-
-## Components
-
-**All** the following **components** are available in the **article.mdx** file. You can also create your own **components** by creating a new file in the `/components` folder.
-You have to import them in the **.mdx** file you want to use them in.
-
-
-
-
-
-
-
-
-
-
-
-### ResponsiveImage
-
-**Responsive images** automatically generate an optimized `srcset` and `sizes` so the browser downloads the most appropriate file for the current viewport and DPR. You can also request multiple output formats (e.g., **AVIF**, **WebP**, fallback **PNG/JPEG**) and control **lazy loading/decoding** for better **performance**.
-
-
- Credit: RCA Indian Head Test Pattern'}
-/>
-
-| Prop | Required | Description
-|------------------------|----------|-------------------------------------------------------
-| `zoomable` | No | Adds a zoomable lightbox (Medium-like).
-| `downloadable` | No | Adds a download button to fetch the image file.
-| `loading="lazy"` | No | Lazy loads the image.
-| `caption` | No | Adds a caption and credit.
-| `id` | No | Adds an `id` to the outer figure for deep-linking and cross-references.
-
-
-
-```mdx
-import ResponsiveImage from '../components/ResponsiveImage.astro'
-import myImage from './assets/image/placeholder.jpg'
-
-
-
-Credit: Photo by Author'}
-/>
-```
-
-
-
-### Placement
-
-Use these helpers when you need to step outside the main content flow: **Sidenotes** for contextual side notes, **Wide** to extend beyond the main column, and **Full-width** for full-width, immersive sections.
-
-#### Sidenotes
-
-
- This paragraph presents a **key idea** concisely.
-
- **Side note** for brief context or a definition.
-
-
-
-
-```mdx
-import Sidenote from '../components/Sidenote.astro'
-
-
- Main paragraph with the core idea.
- Short side note.
-
-```
-
-
-
-#### Wide example
-
-
-
demo wide
-
-
-
-```mdx
-import Wide from '../components/Wide.astro'
-
-
- Your content here...
-
-```
-
-
-#### Full-width example
-
-
-
demo full-width
-
-
-
-```mdx
-import FullWidth from '../components/FullWidth.astro'
-
-
- Your content here...
-
-```
-
-
-
-### Accordion
-
-Can be used like this `some content`. You can pass any children content.
-
-
-
Text, lists, images, code blocks, etc.
-
-
Item one
-
Item two
-
-
-
-
- | Prop | Required | Description
-|-------------|----------|----------------------------------------------------------------------------------
-| `src` | Yes | Path to the embed file in the `embeds` folder.
-| `title` | No | Short title displayed above the card.
-| `desc` | No | Short description displayed below the card. Supports inline HTML (e.g., links).
-| `frameless` | No | Removes the card background and border for seamless embeds.
-| `align` | No | Aligns the title/description text. One of `left` (default), `center`, `right`.
-| `id` | No | Adds an `id` to the outer figure for deep-linking and cross-references.
-
-
-
-````mdx
-import Accordion from '../components/Accordion.astro'
-
-
-
Free content with markdown and MDX components.
-
-
-
- | Prop | Required | Description
-|-------------|----------|----------------------------------------------------------------------------------
-| `src` | Yes | Path to the embed file in the `embeds` folder.
-| `title` | No | Short title displayed above the card.
-| `desc` | No | Short description displayed below the card. Supports inline HTML (e.g., links).
-| `frameless` | No | Removes the card background and border for seamless embeds.
-| `align` | No | Aligns the title/description text. One of `left` (default), `center`, `right`.
-| `id` | No | Adds an `id` to the outer figure for deep-linking and cross-references.
-
-
-
-```ts
-function greet(name: string) {
- console.log(`Hello, ${name}`);
-}
-
-greet("Astro");
-```
-
-````
-
-
-### Note
-
-Small contextual callout for tips, caveats, or emphasis.
-
-
- Use notes to surface context without breaking reading flow.
-
-
-
- Operation completed successfully.
-
-
-
- Be careful: this action cannot be undone.
-
-
-
- Plain note without header. Useful for short clarifications.
-
-
-| Prop | Required | Type | Description |
-|----------|----------|------------------------------|-------------------------------------|
-| `title` | No | string | Short title displayed in header |
-| `emoji` | No | string | Emoji displayed before the title |
-| `class` | No | string | Extra classes for custom styling |
-| `variant`| No | 'info' | 'success' | 'danger' | Visual intent of the note |
-
-
-```mdx
-import Note from '../../components/Note.astro'
-
-
- Use notes to surface context without breaking reading flow.
-
-
-
- Operation completed successfully.
-
-
-
- Be careful: this action cannot be undone.
-
-
-
- Plain note without header. Useful for short clarifications.
-
-```
-
-
-
-### Iframes
-
-You can embed external content in your article using **iframes**. For example, **TrackIO**, **Gradio** or even **Github code embeds** can be used this way.
-
-Gradio embed example
-
-
-
-
-
-
-
-```mdx
-
-
-
-```
-
-
-
-### HtmlEmbed
-
-The main purpose of the ```HtmlEmbed``` component is to **embed** a **Plotly** or **D3.js** chart in your article. **Libraries** are already imported in the template.
-
-They exist in the `app/src/content/embeds` folder.
-
-For researchers who want to stay in **Python** while targeting **D3**, the [d3blocks](https://github.com/d3blocks/d3blocks) library lets you create interactive D3 charts with only a few lines of code. In **2025**, **D3** often provides more flexibility and a more web‑native rendering than **Plotly** for custom visualizations.
-
-
-
-
-| Prop | Required | Description
-|-------------|----------|----------------------------------------------------------------------------------
-| `src` | Yes | Path to the embed file in the `embeds` folder.
-| `title` | No | Short title displayed above the card.
-| `desc` | No | Short description displayed below the card. Supports inline HTML (e.g., links).
-| `frameless` | No | Removes the card background and border for seamless embeds.
-| `align` | No | Aligns the title/description text. One of `left` (default), `center`, `right`.
-| `id` | No | Adds an `id` to the outer figure for deep-linking and cross-references.
-| `data` | No | Path (string) or array of paths (string[]) to data file(s) consumed by the embed.
-| `config` | No | Optional object for embed options (e.g., `{ defaultMetric: 'average_rank' }`).
-
-
-```mdx
-import HtmlEmbed from '../components/HtmlEmbed.astro'
-
-
-
-
-```
-
-
-
-#### Data
-
-If you need to link your **HTML embeds** to **data files**, there is an **`assets/data`** folder for this.
-As long as your files are there, they will be served from the **`public/data`** folder.
-You can fetch them with this address: **`[domain]/data/your-data.ext`**
-
-Be careful, unlike images, data files are not optimized by Astro. You need to optimize them manually.
diff --git a/app/src/content/chapters/debug-components.mdx b/app/src/content/chapters/debug-components.mdx
deleted file mode 100644
index 731ab530025af7e43a9c10f40c25fcd57831c93d..0000000000000000000000000000000000000000
--- a/app/src/content/chapters/debug-components.mdx
+++ /dev/null
@@ -1,37 +0,0 @@
-import Accordion from '../../components/Accordion.astro';
-import HtmlEmbed from '../../components/HtmlEmbed.astro';
-import ResponsiveImage from '../../components/ResponsiveImage.astro';
-import Wide from '../../components/Wide.astro';
-import FullWidth from '../../components/FullWidth.astro';
-import Note from '../../components/Note.astro';
-
-| Prop | Required |
-|------------------------|----------|
-| `zoomable` | No |
-| `downloadable` | No |
-| `loading="lazy"` | No |
-| `caption` | No |
-
-
- | Prop | Required | Description
-|-------------|----------|----------------------------------------------------------------------------------
-| `src` | Yes | Path to the embed file in the `embeds` folder.
-| `title` | No | Short title displayed above the card.
-| `desc` | No | Short description displayed below the card. Supports inline HTML (e.g., links).
-| `frameless` | No | Removes the card background and border for seamless embeds.
-| `align` | No | Aligns the title/description text. One of `left` (default), `center`, `right`.
-
-
-
-
-
Simple example
-
-
-
- ```mdx
- import HtmlEmbed from '../components/HtmlEmbed.astro'
-
-
-
- ```
-
diff --git a/app/src/content/chapters/getting-started.mdx b/app/src/content/chapters/getting-started.mdx
deleted file mode 100644
index 6ce1a282707eae24c6052694e0a3e96f1fffaad4..0000000000000000000000000000000000000000
--- a/app/src/content/chapters/getting-started.mdx
+++ /dev/null
@@ -1,82 +0,0 @@
-import Sidenote from '../../components/Sidenote.astro';
-import Note from '../../components/Note.astro';
-
-## Getting Started
-
-### Installation
-
-The recommended way is to **duplicate this Space** on **Hugging Face** rather than cloning it directly:
-
-1. Open the Space: **[🤗 science-blog-template](https://huggingface.co/spaces/tfrere/science-blog-template)** and click `Duplicate this Space`.
-2. Give it a **name**, choose **visibility**, and keep the **free CPU instance**.
-3. **Clone** your new Space repository.
-```bash
-git clone git@hf.co:spaces//
-cd
-```
-
-4. Use **Node.js 20 or newer**. To manage versions, consider using **nvm**
- - macOS/Linux: see [nvm-sh](https://github.com/nvm-sh/nvm)
- - Windows: see [nvm-windows](https://github.com/coreybutler/nvm-windows)
-
-```bash
-nvm install 20
-nvm use 20
-node -v
-```
-
-5. Install lfs and pull files from the repository.
-```bash
-git lfs install
-git lfs pull
-```
-If you attempt to push binary files without Git LFS installed, you will encounter an error.
-
-
-6. Install dependencies.
-
-```bash
-cd app
-npm install
-```
-
- Alternatively, you can use **Yarn** as your package manager.
-
-
-
-
- And that's it!
-
-**You're ready to go!** 🎉
-
-### Development
-
-```bash
-npm run dev
-```
-
-Once started, the dev server is available at `http://localhost:4321`.
-
-### Build
-
-```bash
-npm run build
-```
-
-
-### Deploy
-
-**Every push** automatically triggers a **build** and **deploy** on Spaces.
-```bash
-# Make edits locally, then:
-git add .
-git commit -m "Update content"
-git push
-```
-
-
-Serving the `dist/` directory on any static host is enough to deliver the site.
-
-
-A [slugified-title].pdf and thumb.jpg are also generated at build time. You can find them in the public folder and point to them at `[domain]/public/thumb.jpg`.
-
diff --git a/app/src/content/chapters/greetings.mdx b/app/src/content/chapters/greetings.mdx
deleted file mode 100644
index c92c269defafd5b9a7b4108e44aa0faf5a3234da..0000000000000000000000000000000000000000
--- a/app/src/content/chapters/greetings.mdx
+++ /dev/null
@@ -1,15 +0,0 @@
-## Greetings
-
-Huge thanks to the following people for their **precious feedbacks**!
-
-import HfUser from '../../components/HfUser.astro';
-
-
-
-
-
-
-
-
-
-
diff --git a/app/src/content/chapters/introduction.mdx b/app/src/content/chapters/introduction.mdx
deleted file mode 100644
index 32e2049aad28666b45d5e74a3d33e3f9db16b7aa..0000000000000000000000000000000000000000
--- a/app/src/content/chapters/introduction.mdx
+++ /dev/null
@@ -1,71 +0,0 @@
-import Sidenote from "../../components/Sidenote.astro";
-
-
- Welcome to this single‑page **research article template**. It helps you publish **clear**, **modern**, and **interactive technical writing** with **minimal setup**.
-
- Grounded in up to date good practices in web dev, it favors **interactive explanations**, **clear notation**, and **inspectable examples** over static snapshots.
-
-
- Reading time: 20–25 minutes.
-
-
-
-#### Features
-
-
-
- Markdown-based
- KaTeX math
- Syntax highlighting
- Citations in all flavors
- Footnotes
- Table of contents
- Mermaid diagrams
- Plotly ready
- D3.js ready
- HTML embeds
- Gradio app embeds
- Dataviz color palettes
- Optimized images
- Lightweight bundle
- SEO friendly
- Automatic build
- Automatic PDF export
- Dark theme
- Mobile friendly
-
-
- If you have questions, remarks or suggestions, open a discussion on the Community tab!
-
-
-
-## Introduction
-The web offers what static PDFs can’t: **interactive diagrams**, **progressive notation**, and **exploratory views** that show how ideas behave. This template treats **interactive artifacts**—figures, math, code, and inspectable experiments—as **first‑class** alongside prose, helping readers **build intuition** instead of skimming results.
-
-### Who is this for
-
-Ideal for anyone creating **web‑native** and **interactive** content with **minimal setup**:
-
-- For **scientists** writing modern web‑native papers
-- For **educators** building explorable lessons.
-
-**No web knowledge required**—just write in **Markdown**.
-
-This is not a CMS or a multi‑page blog—it's a **focused**, **single‑page**, **MDX‑first** workflow.
-
-### Inspired by Distill
-
-
-This project stands in the direct continuity of [Distill](https://distill.pub/) (2016–2021). Our goal is to carry that spirit forward and push it even further: **accessible scientific writing**, **high‑quality interactive explanations**, and **reproducible**, production‑ready demos.
-
-To give you a sense of what inspired this template, here is a short, curated list of **well‑designed** and often **interactive** works from Distill:
-
-- [Growing Neural Cellular Automata](https://distill.pub/2020/growing-ca/)
-- [Activation Atlas](https://distill.pub/2019/activation-atlas/)
-- [Handwriting with a Neural Network](https://distill.pub/2016/handwriting/)
-- [The Building Blocks of Interpretability](https://distill.pub/2018/building-blocks/)
-
-
- I'm always excited to discover more great examples—please share your favorites in the Community tab!
-
-
\ No newline at end of file
diff --git a/app/src/content/chapters/markdown.mdx b/app/src/content/chapters/markdown.mdx
deleted file mode 100644
index 6d62a4a7b6dfa78f61365fcbbfefac1606762ab7..0000000000000000000000000000000000000000
--- a/app/src/content/chapters/markdown.mdx
+++ /dev/null
@@ -1,368 +0,0 @@
-import placeholder from '../assets/image/placeholder.png';
-import audioDemo from '../assets/audio/audio-example.wav';
-import HtmlEmbed from '../../components/HtmlEmbed.astro';
-import Sidenote from '../../components/Sidenote.astro';
-import Wide from '../../components/Wide.astro';
-import Note from '../../components/Note.astro';
-import FullWidth from '../../components/FullWidth.astro';
-import Accordion from '../../components/Accordion.astro';
-import ResponsiveImage from '../../components/ResponsiveImage.astro';
-
-## Markdown
-
-All the following **markdown features** are available **natively** in the `article.mdx` file. See also the complete [**Markdown documentation**](https://www.markdownguide.org/basic-syntax/).
-
-
-
-
-
-
-
-
-
-
-
-
-
-### Math
-
-KaTeX is used for math rendering. You can use **inline** notation `$...$` or **block** `$$...$$` notation. As an example, this is an **inline** math equation: $x^2 + y^2 = z^2$ and this is a **block**:
-
-$$
-\mathrm{Attention}(Q,K,V)=\mathrm{softmax}\!\left(\frac{QK^\top}{\sqrt{d_k}}\right) V
-$$
-
-
-```mdx
-$x^2 + y^2 = z^2$
-
-$$
-\mathrm{Attention}(Q,K,V)=\mathrm{softmax}\!\left(\frac{QK^\top}{\sqrt{d_k}}\right) V
-$$
-```
-
-
-### Code
-
-Use inline code with backticks \`...\` or \`\`\` fenced code blocks \`\`\` with a language for syntax highlighting (e.g., \`python\`).
-
-As an example, here is inline code: `greet("Astro")` and below is a block.
-
-
-```python
-def greet(name: str) -> None:
- print(f"Hello, {name}!")
-```
-
-
-````mdx
-`greet("Astro")`
-
-```python
-def greet(name: str) -> None:
- print(f"Hello, {name}!")
-```
-````
-
-
-
-### Code output
-
-If you want to display the output of a code block, you can use the `:::output` directive. If it's directly below the code block, it will adapt to the code block's styling.
-
-```python
-def greet(name: str) -> None:
- print(f"Hello, {name}!")
-
-greet("Astro")
-```
-:::output
-Hello, Astro!
-:::
-
-Or it can also be used at a standalone block.
-
-:::output
-Hello i'm a standalone output block.
-:::
-
-
-```python
-print("This script prints a very very long line to check overflow behavior.")
-```
-:::output
-This script prints a very very long line to check overflow behavior.
-:::
-
-
-
-
-````mdx
-```python
-def greet(name: str) -> None:
- print(f"Hello, {name}!")
-
-greet("Astro")
-```
-:::output
-Hello, Astro!
-:::
-
-Or you can also use it at a standalone block.
-
-:::output
-Hello i'm a standalone outputs block.
-:::
-````
-
-
-### Citation
-
-The **citation keys** come from `app/src/content/bibliography.bib`.
-
-**Citation** use the `@` syntax (e.g., `[@vaswani2017attention]` or `@vaswani2017attention` in narrative form) and are **automatically** collected to render the **bibliography** at the end of the article.
-
-1) In-text citation with brackets: [@vaswani2017attention].
-
-2) Narrative citation: As shown by @kingma2015adam, stochastic optimization is widely used.
-
-3) Multiple citations and a footnote together: see [@mckinney2017python; @he2016resnet] for related work.
-
-4) All citations in one group: [@vaswani2017attention; @mckinney2017python; @he2016resnet; @silver2017mastering; @openai2023gpt4; @doe2020thesis; @cover2006entropy; @zenodo2021dataset; @sklearn2024; @smith2024privacy; @kingma2015adam; @raffel2020t5].
-
-
-```mdx
-1) In-text citation with brackets: [@vaswani2017attention].
-
-2) Narrative citation: As shown by @kingma2015adam, stochastic optimization is widely used.
-
-3) Multiple citations and a footnote together: see [@mckinney2017python; @he2016resnet] for related work.
-
-4) All citations in one group: [@vaswani2017attention; @mckinney2017python; @he2016resnet; @silver2017mastering; @openai2023gpt4; @doe2020thesis; @cover2006entropy; @zenodo2021dataset; @sklearn2024; @smith2024privacy; @kingma2015adam; @raffel2020t5].
-```
-
-
-You can change the citation style in the `astro.config.mjs` file. There are several styles available: `apa`, `vancouver`, `harvard1`, `chicago`, `mla`. Default is `apa`.
-
-### Footnote
-
-**Footnote** use an identifier like `[^f1]` and a definition anywhere in the document, e.g., `[^f1]: Your explanation`. They are **numbered** and **listed automatically** at the end of the article.
-
-1) Footnote attached to the sentence above[^f1].
-
-[^f1]: Footnote attached to the sentence above.
-
-2) Multi-paragraph footnote example[^f2].
-
-[^f2]: Multi-paragraph footnote. First paragraph.
-
- Second paragraph with a link to [Astro](https://astro.build).
-
-2) Footnote containing a list[^f3].
-
-[^f3]: Footnote with a list:
-
- - First item
- - Second item
-
-3) Footnote with an inline code and an indented code block[^f4].
-
-[^f4]: Footnote with code snippet:
-
- ```ts
- function add(a: number, b: number) {
- return a + b;
- }
- ```
- Result: `add(2, 3) === 5`.
-
-4) Footnote that includes citation inside[^f5] and another footnote[^f1].
-
-[^f5]: Footnote containing citation [@vaswani2017attention] and [@kingma2015adam].
-
-
-```mdx
-1) Footnote attached to the sentence above[^f1].
-
-2) Multi-paragraph footnote example[^f2].
-
-2) Footnote containing a list[^f3].
-
-3) Footnote with an inline code and an indented code block[^f4].
-
-4) Footnote that includes citation inside[^f5].
-
-[^f1]: Footnote attached to the sentence above.
-
-[^f2]: Multi-paragraph footnote. First paragraph.
-
- Second paragraph with a link to [Astro](https://astro.build).
-
-[^f3]: Footnote with a list:
-
- - First item
- - Second item
-
-[^f4]: Footnote with code snippet:
-
- function add(a: number, b: number) {
- return a + b;
- }
-
- Result: `add(2, 3) === 5`.
-
-[^f5]: Footnote containing citation [@vaswani2017attention] and [@kingma2015adam].
-```
-
-
-
-### Reference
-
-In research articles, you may have to make references to anything. They are basically html anchors. They can be used internally in the article or externally in other articles.
-
-1. **Title**
- Each title is automatically generated with a slugged version from the citation key. ( slugged title from the citation key )
- like for example, the id `#mermaid-diagrams` is generated from the `Mermaid diagrams` title.
-
**Example** [Mermaid diagrams](#mermaid-diagram)
-
-2. **Image and chart**
- You can make a link to an image or a chart by adding an ID on it. `` then you can link to it with a link like `Fig 1`.
-
**Example** [Chart 1](#neural-network-mnist-like) or [Fig 1](#placeholder-image)
-
-
-```mdx
- #### Mermaid diagrams
- [Mermaid diagrams](#mermaid-diagrams)
-
-
- [Chart 1](#neural-network-mnist-like)
-
-
- [Fig 1](#placeholder-image)
-```
-
-
-
-### Mermaid diagram
-
-Native mermaid diagrams are supported (use a \`\`\`mermaid\`\`\` code fence). You can use the live editor to create your diagram and copy the code to your article.
-
-```mermaid
-erDiagram
- DATASET ||--o{ SAMPLE : contains
- RUN }o--o{ SAMPLE : uses
- RUN ||--|| MODEL : trains
- RUN ||--o{ METRIC : logs
-
- DATASET {
- string id
- string name
- }
-
- SAMPLE {
- string id
- string uri
- }
-
- MODEL {
- string id
- string framework
- }
-
- RUN {
- string id
- date startedAt
- }
-
- METRIC {
- string name
- float value
- }
-```
-
-
-````mdx
-```mermaid
-erDiagram
- DATASET ||--o{ SAMPLE : contains
- RUN }o--o{ SAMPLE : uses
- RUN ||--|| MODEL : trains
- RUN ||--o{ METRIC : logs
-
- DATASET {
- string id
- string name
- }
-
- SAMPLE {
- string id
- string uri
- }
-
- MODEL {
- string id
- string framework
- }
-
- RUN {
- string id
- date startedAt
- }
-
- METRIC {
- string name
- float value
- }
-```
-````
-
-
-
-### Separator
-
-Use `---` on its own line to insert a horizontal separator between sections. This is a standard Markdown “thematic break”. Don’t confuse it with the `---` used at the very top of the file to delimit the frontmatter.
-
----
-
-
-```mdx
-Intro paragraph.
-
----
-
-Next section begins here.
-```
-
-
-### Table
-
-Use pipe tables like `| Column |` with header separator `| --- |`.
-
-| Method | Score |
-|---|---|
-| A | 0.78 |
-| B | 0.86 |
-
-
-```mdx
-| Method | Score |
-| --- | --- |
-| A | 0.78 |
-| B | 0.86 |
-```
-
-
-### Audio
-
-Embed audio using ``.
-
-