Upload bridge_orig LoRA adapter (r=32, 195k steps)

Browse files

Files changed (4) hide show

README.md +101 -0
adapter_config.json +45 -0
adapter_model.safetensors +3 -0
dataset_statistics.json +127 -0

README.md ADDED Viewed

	@@ -0,0 +1,101 @@

+---
+base_model: openvla/openvla-7b
+library_name: peft
+license: mit
+tags:
+  - openvla
+  - vla
+  - robotics
+  - lora
+  - bridgedata-v2
+datasets:
+  - bridge_orig
+---
+# OpenVLA-7B + BridgeData V2 LoRA adapter
+LoRA adapter (rank 32) fine-tuned on top of [`openvla/openvla-7b`](https://huggingface.co/openvla/openvla-7b)
+on the **BridgeData V2** dataset (`bridge_orig` from the official Bridge V2 project website),
+following the standard LoRA fine-tune recipe in the [OpenVLA repo](https://github.com/openvla/openvla).
+## Files
+- `adapter_model.safetensors` — LoRA weights (484,458,600 bytes ≈ 462 MiB)
+- `adapter_config.json` — PEFT config (`r=32`, `alpha=16`, `dropout=0.0`)
+- `dataset_statistics.json` — bridge_orig action normalization stats (needed by `predict_action(unnorm_key="bridge_orig")`)
+## Training setup
+| | |
+|---|---|
+| Base model | `openvla/openvla-7b` |
+| Dataset | `bridge_orig` (BridgeData V2, project-website version) |
+| LoRA rank | 32 |
+| LoRA alpha | 16 |
+| LoRA dropout | 0.0 |
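+| Target modules | all linear layers listed under `target_modules` in `adapter_config.json`: `q_proj`/`k_proj`/`v_proj`/`o_proj`, `gate_proj`/`up_proj`/`down_proj`, `lm_head`, plus `q`/`kv`/`qkv`/`proj` and `fc1`–`fc3` |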
+| Batch size | 16 per GPU |
+| Grad accumulation | 1 |
+| Effective batch | 16 × 8 GPUs = 128 |
+| Learning rate | 5e-4 |
+| Image augmentation | enabled (random resized crop, scale ≈ 0.9) |
+| Hardware | 8× NVIDIA A100-SXM4-80GB |
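+| Steps | 195,000 gradient steps (≈ 2.5 × 10⁷ training samples at effective batch 128, i.e. ≈ 11.7 passes over the 2,135,463 transitions reported in `dataset_statistics.json`) |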
+| Precision | bf16, FlashAttention-2 |
+Training command (script: `vla-scripts/finetune.py`):
+```bash
+torchrun --standalone --nnodes 1 --nproc-per-node 8 vla-scripts/finetune.py \
+  --vla_path openvla/openvla-7b \
+  --data_root_dir <path-to-rlds-data> \
+  --dataset_name bridge_orig \
+  --run_root_dir runs --adapter_tmp_dir adapter-tmp \
+  --lora_rank 32 --batch_size 16 --grad_accumulation_steps 1 \
+  --learning_rate 5e-4 --image_aug True \
+  --save_steps 5000 --max_steps 200000
+```
+## Quick offline evaluation
+On 98 frames sampled from the bridge_orig **val** split (3 episodes, open-loop teacher-forcing — no simulator), per-dimension MAE was:
+| dim | dx | dy | dz | dRoll | dPitch | dYaw | gripper |
+|---|---|---|---|---|---|---|---|
+| MAE | 0.004 | 0.007 | 0.007 | 0.033 | 0.041 | 0.040 | 0.053 |
+For context, bridge_orig action `q99` magnitudes are roughly `3e-2`–`4e-2` for translation and `0.08`–`0.2` for rotation, and the gripper takes values in `{0,1}`. The numbers above measure **single-step open-loop accuracy**, not closed-loop task success.
+## Usage
+```python
+import torch
+from transformers import AutoModelForVision2Seq, AutoProcessor
+from peft import PeftModel
+processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)
+base = AutoModelForVision2Seq.from_pretrained(
+    "openvla/openvla-7b",
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    trust_remote_code=True,
+).to("cuda")
+vla = PeftModel.from_pretrained(base, "RalphFH/openvla-7b")
+# Load action normalization statistics for predict_action
+import json, huggingface_hub
+stats_path = huggingface_hub.hf_hub_download("RalphFH/openvla-7b", "dataset_statistics.json")
+vla.norm_stats = json.load(open(stats_path))
+from PIL import Image
+img = Image.open("some_observation.png").convert("RGB")
+inputs = processor("In: What action should the robot take to pick up the carrot?\nOut:", img).to("cuda", dtype=torch.bfloat16)
+action = vla.predict_action(**inputs, unnorm_key="bridge_orig", do_sample=False)
+print(action)  # 7-D: [dx, dy, dz, dRoll, dPitch, dYaw, gripper]
+```
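+If you prefer to run without the adapter wrapper at inference, you can merge the LoRA weights into the base model first with `vla = vla.merge_and_unload()` (the call returns the merged model, so keep the result).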
+## License
+MIT (matches OpenVLA upstream).

adapter_config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "OpenVLAForActionPrediction",
+    "parent_library": "transformers_modules.openvla-7b.modeling_prismatic"
+  },
+  "base_model_name_or_path": "openvla/openvla-7b",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": "gaussian",
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q",
+    "o_proj",
+    "kv",
+    "gate_proj",
+    "up_proj",
+    "q_proj",
+    "fc3",
+    "lm_head",
+    "k_proj",
+    "fc2",
+    "fc1",
+    "proj",
+    "qkv",
+    "down_proj",
+    "v_proj"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:751eae3aadce9f02b5185b0cdef8ea43b4c644f7f1ac4ffa3b93a5fba3463063
+size 484458600

dataset_statistics.json ADDED Viewed

	@@ -0,0 +1,127 @@

+{
+  "bridge_orig": {
+    "action": {
+      "mean": [
+        0.0002334193413844332,
+        0.0001300490548601374,
+        -0.0001276246621273458,
+        -0.00015565502690151334,
+        -0.0004039333143737167,
+        0.0002355769247515127,
+        0.5764579772949219
+      ],
+      "std": [
+        0.009765916503965855,
+        0.013689138926565647,
+        0.012667354196310043,
+        0.02853417582809925,
+        0.0306379534304142,
+        0.07691461592912674,
+        0.49737000465393066
+      ],
+      "max": [
+        0.41691166162490845,
+        0.25864794850349426,
+        0.21218234300613403,
+        3.122201919555664,
+        1.8618112802505493,
+        6.280478477478027,
+        1.0
+      ],
+      "min": [
+        -0.4007510244846344,
+        -0.13874775171279907,
+        -0.22553899884223938,
+        -3.2010786533355713,
+        -1.8618112802505493,
+        -6.279075622558594,
+        0.0
+      ],
+      "q01": [
+        -0.02872725307941437,
+        -0.04170349963009357,
+        -0.026093858778476715,
+        -0.08092105075716972,
+        -0.09288699507713317,
+        -0.20718276381492615,
+        0.0
+      ],
+      "q99": [
+        0.028309678435325586,
+        0.040855254605412394,
+        0.040161586627364146,
+        0.08192047759890528,
+        0.07792850524187081,
+        0.20382574498653397,
+        1.0
+      ],
+      "mask": [
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        false
+      ]
+    },
+    "proprio": {
+      "mean": [
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "std": [
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "max": [
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "min": [
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "q01": [
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "q99": [
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ]
+    },
+    "num_transitions": 2135463,
+    "num_trajectories": 60064
+  }
+}