paradise-wujie committed on
Commit
96f3405
·
verified ·
1 Parent(s): c65fa5e

Upload fine-tuned GR00T model for stack cube task

Browse files
README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - robotics
5
+ - embodied-ai
6
+ - imitation-learning
7
+ - groot
8
+ - easymimic
9
+ ---
10
+
11
+ # EasyMimic Stack Cube Task - Fine-tuned GR00T Model
12
+
13
+ 这是基于 GR00T-N1.5-3B 在 "把一个方块堆在另一个方块上" 任务上微调的模型。
14
+
15
+ ## 模型信息
16
+
17
+ - **基础模型**: GR00T-N1.5-3B
18
+ - **任务**: Stack one cube on another cube
19
+ - **训练数据**:
20
+ - 人手演示: 110 episodes (11 videos × 10 segments)
21
+ - 机械臂演示: 20 episodes (2 videos × 10 segments)
22
+ - **训练步数**: 3000 steps
23
+ - **最终损失**: 0.0326
24
+
25
+ ## 训练配置
26
+
27
+ - Batch size: 8
28
+ - Learning rate: 5e-5
29
+ - 优化器: AdamW
30
+ - 微调组件: Projector + Diffusion Model
31
+ - 数据平衡: 启用 dataset weights 和 trajectory weights 平衡
32
+
33
+ ## 使用方法
34
+
35
+ ```python
36
+ # Requires NVIDIA's Isaac-GR00T package (`gr00t`); stock transformers does not provide the "gr00t_n1_5" model type declared in config.json.
37
+ from gr00t.model.gr00t_n1 import GR00T_N1_5
38
+
39
+ model = GR00T_N1_5.from_pretrained("paradise-wujie/easymimic-stack-cube-groot")
40
+ ```
41
+
42
+ ## 训练日志
43
+
44
+ 完整训练日志和代码请查看: https://github.com/KKqdtjo/MyEasyMimic
45
+
46
+ ## 引用
47
+
48
+ 如果使用此模型,请引用 EasyMimic 论文:
49
+
50
+ ```bibtex
51
+ @article{easymimic2024,
52
+ title={EasyMimic: Learning Robotic Manipulation from Human Demonstrations},
53
+ author={...},
54
+ journal={...},
55
+ year={2024}
56
+ }
57
+ ```
config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 32,
3
+ "action_head_cfg": {
4
+ "action_dim": 32,
5
+ "action_horizon": 16,
6
+ "add_pos_embed": true,
7
+ "backbone_embedding_dim": 2048,
8
+ "diffusion_model_cfg": {
9
+ "attention_head_dim": 48,
10
+ "cross_attention_dim": 2048,
11
+ "dropout": 0.2,
12
+ "final_dropout": true,
13
+ "interleave_self_attention": true,
14
+ "norm_type": "ada_norm",
15
+ "num_attention_heads": 32,
16
+ "num_layers": 16,
17
+ "output_dim": 1024,
18
+ "positional_embeddings": null
19
+ },
20
+ "hidden_size": 1024,
21
+ "input_embedding_dim": 1536,
22
+ "max_action_dim": 32,
23
+ "max_state_dim": 64,
24
+ "model_dtype": "float32",
25
+ "noise_beta_alpha": 1.5,
26
+ "noise_beta_beta": 1.0,
27
+ "noise_s": 0.999,
28
+ "num_inference_timesteps": 4,
29
+ "num_target_vision_tokens": 32,
30
+ "num_timestep_buckets": 1000,
31
+ "tune_diffusion_model": true,
32
+ "tune_projector": true,
33
+ "use_vlln": true,
34
+ "vl_self_attention_cfg": {
35
+ "attention_head_dim": 64,
36
+ "dropout": 0.2,
37
+ "final_dropout": true,
38
+ "num_attention_heads": 32,
39
+ "num_layers": 4,
40
+ "positional_embeddings": null
41
+ }
42
+ },
43
+ "action_horizon": 16,
44
+ "architectures": [
45
+ "GR00T_N1_5"
46
+ ],
47
+ "attn_implementation": null,
48
+ "backbone_cfg": {
49
+ "eagle_path": "NVEagle/eagle_er-qwen3_1_7B-Siglip2_400M_stage1_5_128gpu_er_v7_1mlp_nops",
50
+ "load_bf16": false,
51
+ "project_to_dim": null,
52
+ "reproject_vision": false,
53
+ "select_layer": 12,
54
+ "tune_llm": false,
55
+ "tune_visual": true,
56
+ "use_flash_attention": true
57
+ },
58
+ "compute_dtype": "bfloat16",
59
+ "hidden_size": 2048,
60
+ "model_dtype": "float32",
61
+ "model_type": "gr00t_n1_5",
62
+ "torch_dtype": "bfloat16",
63
+ "transformers_version": "4.51.3"
64
+ }
experiment_cfg/metadata.json ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "new_embodiment": {
3
+ "statistics": {
4
+ "state": {
5
+ "single_arm_eef_xyz": {
6
+ "max": [
7
+ 44.82416915893555,
8
+ 74.993896484375,
9
+ 7.081694602966309
10
+ ],
11
+ "min": [
12
+ -57.89995193481445,
13
+ -10.476190567016602,
14
+ -68.30281066894531
15
+ ],
16
+ "mean": [
17
+ -7.281844013035281,
18
+ 29.96592333167826,
19
+ -25.618637384699355
20
+ ],
21
+ "std": [
22
+ 23.979941732549644,
23
+ 24.416230415004133,
24
+ 19.43252338277194
25
+ ],
26
+ "q01": [
27
+ -56.414066314697266,
28
+ -7.399267196655273,
29
+ -60.488399505615234
30
+ ],
31
+ "q99": [
32
+ 44.62605285644531,
33
+ 68.88888549804688,
34
+ 5.503016471862793
35
+ ]
36
+ },
37
+ "single_arm_eef_rpy": {
38
+ "max": [
39
+ 3.1413190364837646,
40
+ 2.808302879333496,
41
+ 0.547160267829895
42
+ ],
43
+ "min": [
44
+ -15.91375732421875,
45
+ -0.42427751421928406,
46
+ -100.0
47
+ ],
48
+ "mean": [
49
+ -12.876365340030208,
50
+ 2.1934915151454386,
51
+ -89.06816732241624
52
+ ],
53
+ "std": [
54
+ 4.438364737648521,
55
+ 0.7593419392130816,
56
+ 30.92162555839885
57
+ ],
58
+ "q01": [
59
+ -14.98973274230957,
60
+ -0.34675517678260803,
61
+ -100.0
62
+ ],
63
+ "q99": [
64
+ 3.1062824726104736,
65
+ 2.759462833404541,
66
+ 0.455392986536026
67
+ ]
68
+ },
69
+ "gripper": {
70
+ "max": [
71
+ 31.742507934570312
72
+ ],
73
+ "min": [
74
+ 0.0
75
+ ],
76
+ "mean": [
77
+ 8.453739841643333
78
+ ],
79
+ "std": [
80
+ 7.863242578422885
81
+ ],
82
+ "q01": [
83
+ 0.0
84
+ ],
85
+ "q99": [
86
+ 27.7469482421875
87
+ ]
88
+ }
89
+ },
90
+ "action": {
91
+ "single_arm_eef_xyz": {
92
+ "max": [
93
+ 45.03376388549805,
94
+ 73.33577728271484,
95
+ 77.4813461303711
96
+ ],
97
+ "min": [
98
+ -60.38243865966797,
99
+ -10.664224624633789,
100
+ -71.25431060791016
101
+ ],
102
+ "mean": [
103
+ -7.273999378171898,
104
+ 28.893785842769137,
105
+ -25.522589628544708
106
+ ],
107
+ "std": [
108
+ 24.13997737859988,
109
+ 23.95556826391448,
110
+ 23.182489643655508
111
+ ],
112
+ "q01": [
113
+ -56.567874908447266,
114
+ -8.664224624633789,
115
+ -62.96855163574219
116
+ ],
117
+ "q99": [
118
+ 45.03376388549805,
119
+ 67.33577728271484,
120
+ 32.89389419555664
121
+ ]
122
+ },
123
+ "single_arm_eef_rpy": {
124
+ "max": [
125
+ 3.1413190364837646,
126
+ 2.612942695617676,
127
+ 0.547160267829895
128
+ ],
129
+ "min": [
130
+ -14.98973274230957,
131
+ -0.42427751421928406,
132
+ -100.0
133
+ ],
134
+ "mean": [
135
+ -13.402412390908477,
136
+ 2.3359100808290307,
137
+ -89.40878290362456
138
+ ],
139
+ "std": [
140
+ 4.616244254847214,
141
+ 0.8055287371739884,
142
+ 30.77510217813324
143
+ ],
144
+ "q01": [
145
+ -14.98973274230957,
146
+ -0.35010501742362976,
147
+ -100.0
148
+ ],
149
+ "q99": [
150
+ 3.107100009918213,
151
+ 2.612942695617676,
152
+ 0.455392986536026
153
+ ]
154
+ },
155
+ "gripper": {
156
+ "max": [
157
+ 32.0
158
+ ],
159
+ "min": [
160
+ 0.0
161
+ ],
162
+ "mean": [
163
+ 5.719525878968183
164
+ ],
165
+ "std": [
166
+ 8.653996030874026
167
+ ],
168
+ "q01": [
169
+ 0.0
170
+ ],
171
+ "q99": [
172
+ 28.0
173
+ ]
174
+ }
175
+ }
176
+ },
177
+ "modalities": {
178
+ "video": {
179
+ "webcam": {
180
+ "resolution": [
181
+ 640,
182
+ 480
183
+ ],
184
+ "channels": 3,
185
+ "fps": 30.0
186
+ }
187
+ },
188
+ "state": {
189
+ "single_arm_eef_xyz": {
190
+ "absolute": true,
191
+ "rotation_type": null,
192
+ "shape": [
193
+ 3
194
+ ],
195
+ "continuous": true
196
+ },
197
+ "single_arm_eef_rpy": {
198
+ "absolute": true,
199
+ "rotation_type": null,
200
+ "shape": [
201
+ 3
202
+ ],
203
+ "continuous": true
204
+ },
205
+ "gripper": {
206
+ "absolute": true,
207
+ "rotation_type": null,
208
+ "shape": [
209
+ 1
210
+ ],
211
+ "continuous": true
212
+ }
213
+ },
214
+ "action": {
215
+ "single_arm_eef_xyz": {
216
+ "absolute": true,
217
+ "rotation_type": null,
218
+ "shape": [
219
+ 3
220
+ ],
221
+ "continuous": true
222
+ },
223
+ "single_arm_eef_rpy": {
224
+ "absolute": true,
225
+ "rotation_type": null,
226
+ "shape": [
227
+ 3
228
+ ],
229
+ "continuous": true
230
+ },
231
+ "gripper": {
232
+ "absolute": true,
233
+ "rotation_type": null,
234
+ "shape": [
235
+ 1
236
+ ],
237
+ "continuous": true
238
+ }
239
+ }
240
+ },
241
+ "embodiment_tag": "new_embodiment"
242
+ }
243
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b01a948ae542e4f9f1ff096165c1c2767aba566694cc92510e7335ce6c7ff07b
3
+ size 4999367032
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff4379eecf11fdd50745bd3acac9295d707b1aa46415aacd1df938c07bcb9875
3
+ size 2586705312
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff