Upload fine-tuned GR00T model for stack cube task

Browse files

Files changed (6) hide show

README.md +57 -0
config.json +64 -0
experiment_cfg/metadata.json +243 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+---
+license: apache-2.0
+tags:
+- robotics
+- embodied-ai
+- imitation-learning
+- groot
+- easymimic
+---
+# EasyMimic Stack Cube Task - Fine-tuned GR00T Model
+这是基于 GR00T-N1.5-3B 在 "把一个方块堆在另一个方块上" 任务上微调的模型。
+## 模型信息
+- **基础模型**: GR00T-N1.5-3B
+- **任务**: Stack one cube on another cube
+- **训练数据**:
+  - 人手演示: 110 episodes (11 videos × 10 segments)
+  - 机械臂演示: 20 episodes (2 videos × 10 segments)
+- **训练步数**: 3000 steps
+- **最终损失**: 0.0326
+## 训练配置
+- Batch size: 8
+- Learning rate: 5e-5
+- 优化器: AdamW
+- 微调组件: Projector + Diffusion Model
+- 数据平衡: 启用 dataset weights 和 trajectory weights 平衡
+## 使用方法
+```python
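+# Requires NVIDIA Isaac-GR00T (https://github.com/NVIDIA/Isaac-GR00T): stock
+# transformers has no built-in "gr00t_n1_5" model type, and this repo ships no
+# tokenizer files, so AutoModel/AutoTokenizer alone cannot load it.
+from gr00t.model.gr00t_n1 import GR00T_N1_5
+model = GR00T_N1_5.from_pretrained("paradise-wujie/easymimic-stack-cube-groot")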
+```
+## 训练日志
+完整训练日志和代码请查看: https://github.com/KKqdtjo/MyEasyMimic
+## 引用
+如果使用此模型，请引用 EasyMimic 论文：
+```bibtex
+@article{easymimic2024,
+  title={EasyMimic: Learning Robotic Manipulation from Human Demonstrations},
+  author={...},
+  journal={...},
+  year={2024}
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,64 @@

+{
+  "action_dim": 32,
+  "action_head_cfg": {
+    "action_dim": 32,
+    "action_horizon": 16,
+    "add_pos_embed": true,
+    "backbone_embedding_dim": 2048,
+    "diffusion_model_cfg": {
+      "attention_head_dim": 48,
+      "cross_attention_dim": 2048,
+      "dropout": 0.2,
+      "final_dropout": true,
+      "interleave_self_attention": true,
+      "norm_type": "ada_norm",
+      "num_attention_heads": 32,
+      "num_layers": 16,
+      "output_dim": 1024,
+      "positional_embeddings": null
+    },
+    "hidden_size": 1024,
+    "input_embedding_dim": 1536,
+    "max_action_dim": 32,
+    "max_state_dim": 64,
+    "model_dtype": "float32",
+    "noise_beta_alpha": 1.5,
+    "noise_beta_beta": 1.0,
+    "noise_s": 0.999,
+    "num_inference_timesteps": 4,
+    "num_target_vision_tokens": 32,
+    "num_timestep_buckets": 1000,
+    "tune_diffusion_model": true,
+    "tune_projector": true,
+    "use_vlln": true,
+    "vl_self_attention_cfg": {
+      "attention_head_dim": 64,
+      "dropout": 0.2,
+      "final_dropout": true,
+      "num_attention_heads": 32,
+      "num_layers": 4,
+      "positional_embeddings": null
+    }
+  },
+  "action_horizon": 16,
+  "architectures": [
+    "GR00T_N1_5"
+  ],
+  "attn_implementation": null,
+  "backbone_cfg": {
+    "eagle_path": "NVEagle/eagle_er-qwen3_1_7B-Siglip2_400M_stage1_5_128gpu_er_v7_1mlp_nops",
+    "load_bf16": false,
+    "project_to_dim": null,
+    "reproject_vision": false,
+    "select_layer": 12,
+    "tune_llm": false,
+    "tune_visual": true,
+    "use_flash_attention": true
+  },
+  "compute_dtype": "bfloat16",
+  "hidden_size": 2048,
+  "model_dtype": "float32",
+  "model_type": "gr00t_n1_5",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3"
+}

experiment_cfg/metadata.json ADDED Viewed

	@@ -0,0 +1,243 @@

+{
+    "new_embodiment": {
+        "statistics": {
+            "state": {
+                "single_arm_eef_xyz": {
+                    "max": [
+                        44.82416915893555,
+                        74.993896484375,
+                        7.081694602966309
+                    ],
+                    "min": [
+                        -57.89995193481445,
+                        -10.476190567016602,
+                        -68.30281066894531
+                    ],
+                    "mean": [
+                        -7.281844013035281,
+                        29.96592333167826,
+                        -25.618637384699355
+                    ],
+                    "std": [
+                        23.979941732549644,
+                        24.416230415004133,
+                        19.43252338277194
+                    ],
+                    "q01": [
+                        -56.414066314697266,
+                        -7.399267196655273,
+                        -60.488399505615234
+                    ],
+                    "q99": [
+                        44.62605285644531,
+                        68.88888549804688,
+                        5.503016471862793
+                    ]
+                },
+                "single_arm_eef_rpy": {
+                    "max": [
+                        3.1413190364837646,
+                        2.808302879333496,
+                        0.547160267829895
+                    ],
+                    "min": [
+                        -15.91375732421875,
+                        -0.42427751421928406,
+                        -100.0
+                    ],
+                    "mean": [
+                        -12.876365340030208,
+                        2.1934915151454386,
+                        -89.06816732241624
+                    ],
+                    "std": [
+                        4.438364737648521,
+                        0.7593419392130816,
+                        30.92162555839885
+                    ],
+                    "q01": [
+                        -14.98973274230957,
+                        -0.34675517678260803,
+                        -100.0
+                    ],
+                    "q99": [
+                        3.1062824726104736,
+                        2.759462833404541,
+                        0.455392986536026
+                    ]
+                },
+                "gripper": {
+                    "max": [
+                        31.742507934570312
+                    ],
+                    "min": [
+                        0.0
+                    ],
+                    "mean": [
+                        8.453739841643333
+                    ],
+                    "std": [
+                        7.863242578422885
+                    ],
+                    "q01": [
+                        0.0
+                    ],
+                    "q99": [
+                        27.7469482421875
+                    ]
+                }
+            },
+            "action": {
+                "single_arm_eef_xyz": {
+                    "max": [
+                        45.03376388549805,
+                        73.33577728271484,
+                        77.4813461303711
+                    ],
+                    "min": [
+                        -60.38243865966797,
+                        -10.664224624633789,
+                        -71.25431060791016
+                    ],
+                    "mean": [
+                        -7.273999378171898,
+                        28.893785842769137,
+                        -25.522589628544708
+                    ],
+                    "std": [
+                        24.13997737859988,
+                        23.95556826391448,
+                        23.182489643655508
+                    ],
+                    "q01": [
+                        -56.567874908447266,
+                        -8.664224624633789,
+                        -62.96855163574219
+                    ],
+                    "q99": [
+                        45.03376388549805,
+                        67.33577728271484,
+                        32.89389419555664
+                    ]
+                },
+                "single_arm_eef_rpy": {
+                    "max": [
+                        3.1413190364837646,
+                        2.612942695617676,
+                        0.547160267829895
+                    ],
+                    "min": [
+                        -14.98973274230957,
+                        -0.42427751421928406,
+                        -100.0
+                    ],
+                    "mean": [
+                        -13.402412390908477,
+                        2.3359100808290307,
+                        -89.40878290362456
+                    ],
+                    "std": [
+                        4.616244254847214,
+                        0.8055287371739884,
+                        30.77510217813324
+                    ],
+                    "q01": [
+                        -14.98973274230957,
+                        -0.35010501742362976,
+                        -100.0
+                    ],
+                    "q99": [
+                        3.107100009918213,
+                        2.612942695617676,
+                        0.455392986536026
+                    ]
+                },
+                "gripper": {
+                    "max": [
+                        32.0
+                    ],
+                    "min": [
+                        0.0
+                    ],
+                    "mean": [
+                        5.719525878968183
+                    ],
+                    "std": [
+                        8.653996030874026
+                    ],
+                    "q01": [
+                        0.0
+                    ],
+                    "q99": [
+                        28.0
+                    ]
+                }
+            }
+        },
+        "modalities": {
+            "video": {
+                "webcam": {
+                    "resolution": [
+                        640,
+                        480
+                    ],
+                    "channels": 3,
+                    "fps": 30.0
+                }
+            },
+            "state": {
+                "single_arm_eef_xyz": {
+                    "absolute": true,
+                    "rotation_type": null,
+                    "shape": [
+                        3
+                    ],
+                    "continuous": true
+                },
+                "single_arm_eef_rpy": {
+                    "absolute": true,
+                    "rotation_type": null,
+                    "shape": [
+                        3
+                    ],
+                    "continuous": true
+                },
+                "gripper": {
+                    "absolute": true,
+                    "rotation_type": null,
+                    "shape": [
+                        1
+                    ],
+                    "continuous": true
+                }
+            },
+            "action": {
+                "single_arm_eef_xyz": {
+                    "absolute": true,
+                    "rotation_type": null,
+                    "shape": [
+                        3
+                    ],
+                    "continuous": true
+                },
+                "single_arm_eef_rpy": {
+                    "absolute": true,
+                    "rotation_type": null,
+                    "shape": [
+                        3
+                    ],
+                    "continuous": true
+                },
+                "gripper": {
+                    "absolute": true,
+                    "rotation_type": null,
+                    "shape": [
+                        1
+                    ],
+                    "continuous": true
+                }
+            }
+        },
+        "embodiment_tag": "new_embodiment"
+    }
+}

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b01a948ae542e4f9f1ff096165c1c2767aba566694cc92510e7335ce6c7ff07b
+size 4999367032

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff4379eecf11fdd50745bd3acac9295d707b1aa46415aacd1df938c07bcb9875
+size 2586705312

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff