Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

README.md +94 -0
adapter_config.json +38 -0
adapter_model.safetensors +3 -0
config.json +65 -0
experiment_cfg/metadata.json +187 -0
trainer_state.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,94 @@

+# GR00T-N1.5-3B LoRA Fine-tuned Model
+This is a LoRA fine-tuned checkpoint of [nvidia/GR00T-N1.5-3B](https://huggingface.co/nvidia/GR00T-N1.5-3B), trained on data from a single front camera.
+## Model Details
+- **Base Model**: nvidia/GR00T-N1.5-3B
+- **Training Method**: LoRA (Low-Rank Adaptation)
+- **Training Steps**: 100,000
+- **Final Training Loss**: 0.053
+## Training Configuration
+### LoRA Parameters
+- **Rank (r)**: 8
+- **Alpha**: 16
+- **Dropout**: 0.1
+- **Target Modules**: to_q, to_k, to_v (attention layers only)
+- **Trainable Parameters**: 1,638,400 (0.06% of total)
+### Training Parameters
+- **Batch Size**: 2 per GPU
+- **Learning Rate**: 1e-4
+- **Weight Decay**: 1e-5
+- **Warmup Ratio**: 0.05
+- **Optimizer**: AdamW
+- **LR Scheduler**: Cosine
+- **Training Duration**: ~1h 52m (6719 seconds)
+- **Training Speed**: 14.88 steps/second
+### Model Components Tuned
+- **LLM Backbone**: ❌ Frozen
+- **Vision Tower**: ❌ Frozen
+- **Action Head Projector**: ✅ Tuned
+- **Diffusion Model**: ✅ Tuned
+## Dataset
+- **Embodiment**: SO-100 robot with single front camera
+- **Camera Resolution**: 320x240
+- **FPS**: 30
+- **Action Dimensions**: 6 (5 DoF arm + 1 gripper)
+- **Action Horizon**: 16 timesteps
+- **Video Backend**: torchvision_av
+## Usage
+This is a LoRA adapter that must be loaded on top of the base model:
+```python
+from gr00t.model.gr00t_n1 import GR00T_N1_5
+from peft import PeftModel
+# Load base model
+base_model = GR00T_N1_5.from_pretrained("nvidia/GR00T-N1.5-3B")
+# Load LoRA adapter
+model = PeftModel.from_pretrained(base_model, "path/to/this/checkpoint")
+# Use for inference
+model.eval()
+```
+## Model Architecture
+- **Action Dimension**: 32 (max)
+- **Action Horizon**: 16
+- **Hidden Size**: 2048
+- **Compute Dtype**: bfloat16
+- **Diffusion Timesteps**: 4 (inference)
+## Training Hardware
+- **GPUs**: 1x NVIDIA GPU
+- **Compute Dtype**: bfloat16
+- **TF32**: Enabled
+- **Gradient Checkpointing**: Disabled
+## Citation
+If you use this model, please cite the original GR00T paper and model:
+```bibtex
+@misc{nvidia2025gr00tn1,
+  title={GR00T N1: An Open Foundation Model for Generalist Humanoid Robots},
+  author={NVIDIA},
+  year={2025},
+  eprint={2503.14734},
+  archivePrefix={arXiv},
+  url={https://huggingface.co/nvidia/GR00T-N1.5-3B}
+}
+```
+## License
+This adapter inherits the license of the [nvidia/GR00T-N1.5-3B](https://huggingface.co/nvidia/GR00T-N1.5-3B) base model.

adapter_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "nvidia/GR00T-N1.5-3B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": "869830fc749c35f34771aa5209f923ac57e4564e",
+  "target_modules": [
+    "to_q",
+    "to_k",
+    "to_v"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:231bacf00771ddba3e3c923c466521664e89f79e4646ae3faadb72b968b2b32d
+size 6571800

config.json ADDED Viewed

	@@ -0,0 +1,65 @@

+{
+  "_name_or_path": "nvidia/GR00T-N1.5-3B",
+  "action_dim": 32,
+  "action_head_cfg": {
+    "action_dim": 32,
+    "action_horizon": 16,
+    "add_pos_embed": true,
+    "backbone_embedding_dim": 2048,
+    "diffusion_model_cfg": {
+      "attention_head_dim": 48,
+      "cross_attention_dim": 2048,
+      "dropout": 0.2,
+      "final_dropout": true,
+      "interleave_self_attention": true,
+      "norm_type": "ada_norm",
+      "num_attention_heads": 32,
+      "num_layers": 16,
+      "output_dim": 1024,
+      "positional_embeddings": null
+    },
+    "hidden_size": 1024,
+    "input_embedding_dim": 1536,
+    "max_action_dim": 32,
+    "max_state_dim": 64,
+    "model_dtype": "float32",
+    "noise_beta_alpha": 1.5,
+    "noise_beta_beta": 1.0,
+    "noise_s": 0.999,
+    "num_inference_timesteps": 4,
+    "num_target_vision_tokens": 32,
+    "num_timestep_buckets": 1000,
+    "tune_diffusion_model": true,
+    "tune_projector": true,
+    "use_vlln": true,
+    "vl_self_attention_cfg": {
+      "attention_head_dim": 64,
+      "dropout": 0.2,
+      "final_dropout": true,
+      "num_attention_heads": 32,
+      "num_layers": 4,
+      "positional_embeddings": null
+    }
+  },
+  "action_horizon": 16,
+  "architectures": [
+    "GR00T_N1_5"
+  ],
+  "attn_implementation": null,
+  "backbone_cfg": {
+    "eagle_path": "NVEagle/eagle_er-qwen3_1_7B-Siglip2_400M_stage1_5_128gpu_er_v7_1mlp_nops",
+    "load_bf16": false,
+    "project_to_dim": null,
+    "reproject_vision": false,
+    "select_layer": 12,
+    "tune_llm": false,
+    "tune_visual": true,
+    "use_flash_attention": true
+  },
+  "compute_dtype": "bfloat16",
+  "hidden_size": 2048,
+  "model_dtype": "float32",
+  "model_type": "gr00t_n1_5",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3"
+}

experiment_cfg/metadata.json ADDED Viewed

	@@ -0,0 +1,187 @@

+{
+    "new_embodiment": {
+        "statistics": {
+            "state": {
+                "single_arm": {
+                    "max": [
+                        1.853501319885254,
+                        1.5128743648529053,
+                        1.3364235162734985,
+                        1.6018670797348022,
+                        1.4484314918518066
+                    ],
+                    "min": [
+                        -1.8044019937515259,
+                        -1.5987982749938965,
+                        -1.6248823404312134,
+                        -1.7460963726043701,
+                        -1.4346222877502441
+                    ],
+                    "mean": [
+                        0.0625106617808342,
+                        0.1884286254644394,
+                        -0.3740880787372589,
+                        1.1956137418746948,
+                        0.5744051933288574
+                    ],
+                    "std": [
+                        0.6124866008758545,
+                        0.7113936543464661,
+                        0.7794482111930847,
+                        0.5560765862464905,
+                        0.5381472706794739
+                    ],
+                    "q01": [
+                        -1.5892086327075958,
+                        -1.592660903930664,
+                        -1.5972639322280884,
+                        -1.641760230064392,
+                        -1.107420951128006
+                    ],
+                    "q99": [
+                        1.4181279838085175,
+                        1.1615070104599,
+                        1.3318204879760742,
+                        1.6003326177597046,
+                        1.4453628063201904
+                    ]
+                },
+                "gripper": {
+                    "max": [
+                        1.2029346227645874
+                    ],
+                    "min": [
+                        -0.0030687106773257256
+                    ],
+                    "mean": [
+                        0.484174907207489
+                    ],
+                    "std": [
+                        0.36618679761886597
+                    ],
+                    "q01": [
+                        0.007671777158975601
+                    ],
+                    "q99": [
+                        1.1937284469604492
+                    ]
+                }
+            },
+            "action": {
+                "single_arm": {
+                    "max": [
+                        1.9747153520584106,
+                        1.245896577835083,
+                        1.7599055767059326,
+                        1.6371572017669678,
+                        2.2570366859436035
+                    ],
+                    "min": [
+                        -1.9072037935256958,
+                        -1.8703792095184326,
+                        -1.640225887298584,
+                        -1.7752491235733032,
+                        -1.4346222877502441
+                    ],
+                    "mean": [
+                        0.04739709198474884,
+                        0.1307937055826187,
+                        -0.40392717719078064,
+                        1.205496907234192,
+                        0.5910767912864685
+                    ],
+                    "std": [
+                        0.6170614361763,
+                        0.7358222007751465,
+                        0.8430852890014648,
+                        0.565430223941803,
+                        0.5715965032577515
+                    ],
+                    "q01": [
+                        -1.619895726442337,
+                        -1.835089087486267,
+                        -1.6310198307037354,
+                        -1.6310198307037354,
+                        -1.1001328229904175
+                    ],
+                    "q99": [
+                        1.4024008512496948,
+                        1.118543028831482,
+                        1.744562029838562,
+                        1.6279510259628296,
+                        2.118177652359009
+                    ]
+                },
+                "gripper": {
+                    "max": [
+                        1.2474309206008911
+                    ],
+                    "min": [
+                        -0.6689789295196533
+                    ],
+                    "mean": [
+                        0.23903973400592804
+                    ],
+                    "std": [
+                        0.5927625298500061
+                    ],
+                    "q01": [
+                        -0.6352231502532959
+                    ],
+                    "q99": [
+                        1.16917884349823
+                    ]
+                }
+            }
+        },
+        "modalities": {
+            "video": {
+                "webcam": {
+                    "resolution": [
+                        320,
+                        240
+                    ],
+                    "channels": 3,
+                    "fps": 30.0
+                }
+            },
+            "state": {
+                "single_arm": {
+                    "absolute": true,
+                    "rotation_type": null,
+                    "shape": [
+                        5
+                    ],
+                    "continuous": true
+                },
+                "gripper": {
+                    "absolute": true,
+                    "rotation_type": null,
+                    "shape": [
+                        1
+                    ],
+                    "continuous": true
+                }
+            },
+            "action": {
+                "single_arm": {
+                    "absolute": true,
+                    "rotation_type": null,
+                    "shape": [
+                        5
+                    ],
+                    "continuous": true
+                },
+                "gripper": {
+                    "absolute": true,
+                    "rotation_type": null,
+                    "shape": [
+                        1
+                    ],
+                    "continuous": true
+                }
+            }
+        },
+        "embodiment_tag": "new_embodiment"
+    }
+}

trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff