aidanandrews committed on
Commit
3d355f2
·
verified ·
1 Parent(s): 81027e4

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GR00T-N1.5-3B LoRA Fine-tuned Model
2
+
3
+ This is a LoRA fine-tuned checkpoint of [nvidia/GR00T-N1.5-3B](https://huggingface.co/nvidia/GR00T-N1.5-3B) trained on single front camera data.
4
+
5
+ ## Model Details
6
+
7
+ - **Base Model**: nvidia/GR00T-N1.5-3B
8
+ - **Training Method**: LoRA (Low-Rank Adaptation)
9
+ - **Training Steps**: 100,000
10
+ - **Final Training Loss**: 0.053
11
+
12
+ ## Training Configuration
13
+
14
+ ### LoRA Parameters
15
+ - **Rank (r)**: 8
16
+ - **Alpha**: 16
17
+ - **Dropout**: 0.1
18
+ - **Target Modules**: to_q, to_k, to_v (attention layers only)
19
+ - **Trainable Parameters**: 1,638,400 (0.06% of total)
20
+
21
+ ### Training Parameters
22
+ - **Batch Size**: 2 per GPU
23
+ - **Learning Rate**: 1e-4
24
+ - **Weight Decay**: 1e-5
25
+ - **Warmup Ratio**: 0.05
26
+ - **Optimizer**: AdamW
27
+ - **LR Scheduler**: Cosine
28
+ - **Training Duration**: ~1h 52m (6719 seconds)
29
+ - **Training Speed**: 14.88 steps/second
30
+
31
+ ### Model Components Tuned
32
+ - **LLM Backbone**: ❌ Frozen
33
+ - **Vision Tower**: ❌ Frozen
34
+ - **Action Head Projector**: ✅ Tuned
35
+ - **Diffusion Model**: ✅ Tuned
36
+
37
+ ## Dataset
38
+
39
+ - **Embodiment**: SO-100 robot with single front camera
40
+ - **Camera Resolution**: 320x240
41
+ - **FPS**: 30
42
+ - **Action Dimensions**: 6 (5 DoF arm + 1 gripper)
43
+ - **Action Horizon**: 16 timesteps
44
+ - **Video Backend**: torchvision_av
45
+
46
+ ## Usage
47
+
48
+ This is a LoRA adapter that must be loaded on top of the base model:
49
+
50
+ ```python
51
+ from gr00t.model.gr00t_n1 import GR00T_N1_5
52
+ from peft import PeftModel
53
+
54
+ # Load base model
55
+ base_model = GR00T_N1_5.from_pretrained("nvidia/GR00T-N1.5-3B")
56
+
57
+ # Load LoRA adapter
58
+ model = PeftModel.from_pretrained(base_model, "path/to/this/checkpoint")
59
+
60
+ # Use for inference
61
+ model.eval()
62
+ ```
63
+
64
+ ## Model Architecture
65
+
66
+ - **Action Dimension**: 32 (max)
67
+ - **Action Horizon**: 16
68
+ - **Hidden Size**: 2048
69
+ - **Compute Dtype**: bfloat16
70
+ - **Diffusion Timesteps**: 4 (inference)
71
+
72
+ ## Training Hardware
73
+
74
+ - **GPUs**: 1x NVIDIA GPU
75
+ - **Compute Dtype**: bfloat16
76
+ - **TF32**: Enabled
77
+ - **Gradient Checkpointing**: Disabled
78
+
79
+ ## Citation
80
+
81
+ If you use this model, please cite the original GR00T paper and model:
82
+
83
+ ```bibtex
84
+ @misc{gr00t2024,
85
+ title={GR00T: Generalist Robot Policy},
86
+ author={NVIDIA},
87
+ year={2024},
88
+ url={https://huggingface.co/nvidia/GR00T-N1.5-3B}
89
+ }
90
+ ```
91
+
92
+ ## License
93
+
94
+ Inherits license from nvidia/GR00T-N1.5-3B base model.
adapter_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/home/aidan/.cache/huggingface/hub/models--nvidia--GR00T-N1.5-3B/snapshots/869830fc749c35f34771aa5209f923ac57e4564e",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 8,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "to_q",
29
+ "to_k",
30
+ "to_v"
31
+ ],
32
+ "target_parameters": null,
33
+ "task_type": "CAUSAL_LM",
34
+ "trainable_token_indices": null,
35
+ "use_dora": false,
36
+ "use_qalora": false,
37
+ "use_rslora": false
38
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:231bacf00771ddba3e3c923c466521664e89f79e4646ae3faadb72b968b2b32d
3
+ size 6571800
config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "nvidia/GR00T-N1.5-3B",
3
+ "action_dim": 32,
4
+ "action_head_cfg": {
5
+ "action_dim": 32,
6
+ "action_horizon": 16,
7
+ "add_pos_embed": true,
8
+ "backbone_embedding_dim": 2048,
9
+ "diffusion_model_cfg": {
10
+ "attention_head_dim": 48,
11
+ "cross_attention_dim": 2048,
12
+ "dropout": 0.2,
13
+ "final_dropout": true,
14
+ "interleave_self_attention": true,
15
+ "norm_type": "ada_norm",
16
+ "num_attention_heads": 32,
17
+ "num_layers": 16,
18
+ "output_dim": 1024,
19
+ "positional_embeddings": null
20
+ },
21
+ "hidden_size": 1024,
22
+ "input_embedding_dim": 1536,
23
+ "max_action_dim": 32,
24
+ "max_state_dim": 64,
25
+ "model_dtype": "float32",
26
+ "noise_beta_alpha": 1.5,
27
+ "noise_beta_beta": 1.0,
28
+ "noise_s": 0.999,
29
+ "num_inference_timesteps": 4,
30
+ "num_target_vision_tokens": 32,
31
+ "num_timestep_buckets": 1000,
32
+ "tune_diffusion_model": true,
33
+ "tune_projector": true,
34
+ "use_vlln": true,
35
+ "vl_self_attention_cfg": {
36
+ "attention_head_dim": 64,
37
+ "dropout": 0.2,
38
+ "final_dropout": true,
39
+ "num_attention_heads": 32,
40
+ "num_layers": 4,
41
+ "positional_embeddings": null
42
+ }
43
+ },
44
+ "action_horizon": 16,
45
+ "architectures": [
46
+ "GR00T_N1_5"
47
+ ],
48
+ "attn_implementation": null,
49
+ "backbone_cfg": {
50
+ "eagle_path": "NVEagle/eagle_er-qwen3_1_7B-Siglip2_400M_stage1_5_128gpu_er_v7_1mlp_nops",
51
+ "load_bf16": false,
52
+ "project_to_dim": null,
53
+ "reproject_vision": false,
54
+ "select_layer": 12,
55
+ "tune_llm": false,
56
+ "tune_visual": true,
57
+ "use_flash_attention": true
58
+ },
59
+ "compute_dtype": "bfloat16",
60
+ "hidden_size": 2048,
61
+ "model_dtype": "float32",
62
+ "model_type": "gr00t_n1_5",
63
+ "torch_dtype": "bfloat16",
64
+ "transformers_version": "4.51.3"
65
+ }
experiment_cfg/metadata.json ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "new_embodiment": {
3
+ "statistics": {
4
+ "state": {
5
+ "single_arm": {
6
+ "max": [
7
+ 1.853501319885254,
8
+ 1.5128743648529053,
9
+ 1.3364235162734985,
10
+ 1.6018670797348022,
11
+ 1.4484314918518066
12
+ ],
13
+ "min": [
14
+ -1.8044019937515259,
15
+ -1.5987982749938965,
16
+ -1.6248823404312134,
17
+ -1.7460963726043701,
18
+ -1.4346222877502441
19
+ ],
20
+ "mean": [
21
+ 0.0625106617808342,
22
+ 0.1884286254644394,
23
+ -0.3740880787372589,
24
+ 1.1956137418746948,
25
+ 0.5744051933288574
26
+ ],
27
+ "std": [
28
+ 0.6124866008758545,
29
+ 0.7113936543464661,
30
+ 0.7794482111930847,
31
+ 0.5560765862464905,
32
+ 0.5381472706794739
33
+ ],
34
+ "q01": [
35
+ -1.5892086327075958,
36
+ -1.592660903930664,
37
+ -1.5972639322280884,
38
+ -1.641760230064392,
39
+ -1.107420951128006
40
+ ],
41
+ "q99": [
42
+ 1.4181279838085175,
43
+ 1.1615070104599,
44
+ 1.3318204879760742,
45
+ 1.6003326177597046,
46
+ 1.4453628063201904
47
+ ]
48
+ },
49
+ "gripper": {
50
+ "max": [
51
+ 1.2029346227645874
52
+ ],
53
+ "min": [
54
+ -0.0030687106773257256
55
+ ],
56
+ "mean": [
57
+ 0.484174907207489
58
+ ],
59
+ "std": [
60
+ 0.36618679761886597
61
+ ],
62
+ "q01": [
63
+ 0.007671777158975601
64
+ ],
65
+ "q99": [
66
+ 1.1937284469604492
67
+ ]
68
+ }
69
+ },
70
+ "action": {
71
+ "single_arm": {
72
+ "max": [
73
+ 1.9747153520584106,
74
+ 1.245896577835083,
75
+ 1.7599055767059326,
76
+ 1.6371572017669678,
77
+ 2.2570366859436035
78
+ ],
79
+ "min": [
80
+ -1.9072037935256958,
81
+ -1.8703792095184326,
82
+ -1.640225887298584,
83
+ -1.7752491235733032,
84
+ -1.4346222877502441
85
+ ],
86
+ "mean": [
87
+ 0.04739709198474884,
88
+ 0.1307937055826187,
89
+ -0.40392717719078064,
90
+ 1.205496907234192,
91
+ 0.5910767912864685
92
+ ],
93
+ "std": [
94
+ 0.6170614361763,
95
+ 0.7358222007751465,
96
+ 0.8430852890014648,
97
+ 0.565430223941803,
98
+ 0.5715965032577515
99
+ ],
100
+ "q01": [
101
+ -1.619895726442337,
102
+ -1.835089087486267,
103
+ -1.6310198307037354,
104
+ -1.6310198307037354,
105
+ -1.1001328229904175
106
+ ],
107
+ "q99": [
108
+ 1.4024008512496948,
109
+ 1.118543028831482,
110
+ 1.744562029838562,
111
+ 1.6279510259628296,
112
+ 2.118177652359009
113
+ ]
114
+ },
115
+ "gripper": {
116
+ "max": [
117
+ 1.2474309206008911
118
+ ],
119
+ "min": [
120
+ -0.6689789295196533
121
+ ],
122
+ "mean": [
123
+ 0.23903973400592804
124
+ ],
125
+ "std": [
126
+ 0.5927625298500061
127
+ ],
128
+ "q01": [
129
+ -0.6352231502532959
130
+ ],
131
+ "q99": [
132
+ 1.16917884349823
133
+ ]
134
+ }
135
+ }
136
+ },
137
+ "modalities": {
138
+ "video": {
139
+ "webcam": {
140
+ "resolution": [
141
+ 320,
142
+ 240
143
+ ],
144
+ "channels": 3,
145
+ "fps": 30.0
146
+ }
147
+ },
148
+ "state": {
149
+ "single_arm": {
150
+ "absolute": true,
151
+ "rotation_type": null,
152
+ "shape": [
153
+ 5
154
+ ],
155
+ "continuous": true
156
+ },
157
+ "gripper": {
158
+ "absolute": true,
159
+ "rotation_type": null,
160
+ "shape": [
161
+ 1
162
+ ],
163
+ "continuous": true
164
+ }
165
+ },
166
+ "action": {
167
+ "single_arm": {
168
+ "absolute": true,
169
+ "rotation_type": null,
170
+ "shape": [
171
+ 5
172
+ ],
173
+ "continuous": true
174
+ },
175
+ "gripper": {
176
+ "absolute": true,
177
+ "rotation_type": null,
178
+ "shape": [
179
+ 1
180
+ ],
181
+ "continuous": true
182
+ }
183
+ }
184
+ },
185
+ "embodiment_tag": "new_embodiment"
186
+ }
187
+ }
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff