Add CPU inference script, update README with model details and perf stats

Browse files

Files changed (9) hide show

.gitattributes +0 -34
.gitignore +3 -0
HF_README.md +167 -0
README.md +3 -139
inference.py +28 -18
inference_cpu.py +61 -0
modeling_openvla_micro.py +5 -2
pyproject.toml +2 -1
train_shim.py +205 -292

.gitattributes CHANGED Viewed

@@ -1,35 +1 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text























1	*.pt filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

@@ -11,4 +11,7 @@ dist/
 *.egg-info/
 .venv/
 venv/

 *.egg-info/
 .venv/
 venv/
+*.pt
+*.bin
+*.safetensors

HF_README.md ADDED Viewed

	@@ -0,0 +1,167 @@

+---
+license: mit
+language:
+- en
+library_name: transformers
+pipeline_tag: reinforcement-learning
+tags:
+- robotics
+- vla
+- vision-language-action
+- openvla
+- omnivla
+- robot
+- qwen
+- dinov2
+- siglip
+datasets:
+- libero_90
+- cast
+model-index:
+- name: openvla-micro
+  results: []
+---
+# OpenVLA-Micro
+**A drop-in replacement for OmniVLA 7B that runs ~14× faster with 0.997 action cosine similarity.**
+OpenVLA-Micro is a compact Vision-Language-Action model that replaces OmniVLA 7B (DINOv2-L + SigLIP-so400m + Llama-2 7B) with a much smaller stack (DINOv2-S/14 + SigLIP-B/16 + Qwen2.5 0.5B) while preserving compatibility with OmniVLA's pretrained action head through a learned hidden-state shim (896→4096).
+## Model Architecture
+| Component | Module | Output Shape |
+|---|---|---|
+| Vision (DINO) | `DINOv2-S/14` (facebook/dinov2-small) | 256 tokens × 384d → MLP → 8704 |
+| Vision (SigLIP) | `SigLIP-B/16` (google/siglip-base-patch16-224) | 196 tokens × 768d → MLP → 8704 |
+| Projector | Linear(8704→896) + GELU + Linear(896→896) | 452 tokens × 896d |
+| LLM | `Qwen2.5-0.5B` (24 layers, 896 hidden, 14 attention heads / 2 KV heads) | Variable × 896d |
+| Shim | Linear(896→2048) + GELU + Linear(2048→4096) | 32 action tokens × 4096d |
+| Action Head | OmniVLA's pretrained head (unchanged) | 8 chunks × 7-DoF |
+Total parameters: **~0.6B** (vs 7B for OmniVLA/OpenVLA).
+## Performance
+### vs OmniVLA 7B (teacher)
+| Metric | Value | Note |
+|---|---|---|
+| Hidden state cosine | **0.63** | Last-layer hidden states at the action-token positions |
+| Action cosine | **0.997** | After OmniVLA action head |
+| Action MSE | ~0.001 | Effectively identical predictions |
+| Inference speed | **~14× faster** | 0.5B vs 7B LLM |
+Trained via hidden-state distillation from OmniVLA 7B on 1000 CAST episodes (17k timesteps of data). The shim was trained for ~21k optimizer steps (plateaued at step ~8k).
+### vs OpenVLA 7B (original)
+OpenVLA-Micro is distilled from *OmniVLA*, which is itself a fine-tune of *OpenVLA* with 32 action tokens and a modified action head. A direct comparison with the original OpenVLA is therefore not apples-to-apples (the action tokenization differs), and the figures above were measured against OmniVLA only. Because OpenVLA uses the same 7B backbone, the ~14× speedup and near-lossless action quality are expected to carry over to OpenVLA as well.
+## Quick Start
+```python
+from PIL import Image
+from modeling_openvla_micro import OpenVLAMicro
+model = OpenVLAMicro.from_pretrained("theguy21/openvla-micro", device="cuda")
+model.eval()
+image = Image.open("demo.jpg").convert("RGB")
+action = model.predict_action(image, "pick up the red block")
+print(action)  # [0.12, -0.03, 0.45, -0.01, 0.22, 0.08, -0.15]
+```
+### CLI — GPU
+```bash
+python inference.py --image demo.jpg "pick up the red block"
+```
+### CLI — CPU / Edge
+```bash
+# Standard CPU (~6GB RAM, 3-5 sec/step)
+python inference_cpu.py --image demo.jpg "pick up the red block"
+# Low-RAM CPU (~2.5GB RAM, requires bitsandbytes)
+python inference_cpu.py --low-ram --image demo.jpg "pick up the red block"
+```
+### As an OmniVLA drop-in replacement
+Use `OpenVLAMicroWrapper` (from `model_wrapper.py`) to expose the same forward interface as OmniVLA's `VLAForActionPrediction`:
+```python
+import torch
+from model_wrapper import OpenVLAMicroWrapper
+from modeling_openvla_micro import DinoSigLIPEncoder, CombinedProjector, ShimMLP
+ckpt = torch.load("openvla-micro-distill.pt", map_location="cpu")
+ve = DinoSigLIPEncoder()
+ve.load_state_dict(ckpt["model"]["vision_backbone"])
+# ... (see model_wrapper.py for full example)
+output = vla(input_ids, attention_mask, pixel_values, labels=labels, output_hidden_states=True)
+actions_hidden_states = extract_actions(output.hidden_states[-1], labels)
+predicted_actions = omnivla_action_head.predict_action(actions_hidden_states, modality_id)
+```
+## Architecture Diagram
+```
+Image (224×224)
+  ├── DINOv2-S/14 → 256 patches × 384d → ShimMLP(384→8704)
+  └── SigLIP-B/16 → 196 patches × 768d → ShimMLP(768→8704)
+       └── Concat (452 tokens) → Linear(8704→896) → GELU → Linear(896→896)
+              └── Qwen2.5 0.5B (24 layers, 896 hidden)
+                     └── Hidden State Shim (896→2048→4096)
+                            └── OmniVLA Action Head (pretrained, frozen)
+                                   └── 8 chunks × 7-DoF actions
+```
+## Files
+| File | Size | Description |
+|---|---|---|
+| `modeling_openvla_micro.py` | 15 KB | Model definitions |
+| `model_wrapper.py` | 11 KB | OmniVLA-compatible interface |
+| `inference.py` | 1.5 KB | GPU/CPU CLI inference |
+| `inference_cpu.py` | 2 KB | Edge device inference (with low-RAM mode) |
+| `train_shim.py` | 15 KB | Reference shim training script |
+| `config.json` | 1.2 KB | Model configuration |
+| `openvla-micro-merged.pt` | 1.6 GB | Base checkpoint (no shim, 896-dim output) |
+| `openvla-micro-distill.pt` | 1.6 GB | Full checkpoint (with baked-in shim, 4096-dim) |
+**Which checkpoint to use?**
+- `openvla-micro-distill.pt` — **recommended**. Outputs 4096-dim hidden states that plug directly into OmniVLA's action head. One-step inference.
+- `openvla-micro-merged.pt` — base model only (896-dim). Use if you want to train your own shim or action head.
+## Requirements
+```
+torch>=2.0.0
+torchvision>=0.15.0
+transformers>=4.38.0
+timm>=0.9.0
+Pillow>=10.0.0
+numpy>=1.24.0
+```
+For low-RAM CPU: `bitsandbytes>=0.43.0`
+## Training the Shim
+```bash
+python train_shim.py \
+    --cache-dir ./teacher_cache \
+    --data-dir ./dataset \
+    --base-model openvla-micro-merged.pt \
+    --teacher-dim 4096
+```
+See `train_shim.py` for full options. The script expects pre-cached teacher hidden states; adapt `DistillDataset` to your format.
+## License
+MIT

README.md CHANGED Viewed

@@ -1,143 +1,7 @@
----
-license: mit
-language:
-- en
-library_name: transformers
-pipeline_tag: reinforcement-learning
-tags:
-- robotics
-- vla
-- vision-language-action
-- openvla
-- omnivla
-- robot
-- qwen
-- dinov2
-- siglip
-datasets:
-- libero_90
-- cast
-model-index:
-- name: openvla-micro
-  results: []
----
 # OpenVLA-Micro
-**A drop-in replacement for OmniVLA 7B that runs ~14× faster with 0.997 action cosine similarity.**
-OpenVLA-Micro is a compact Vision-Language-Action model that replaces the bulky OmniVLA 7B architecture (DINOv2-L + SigLIP-so400m + Llama-2 7B) with a much smaller stack (DINOv2-S + SigLIP-B + Qwen2.5 0.5B) while preserving compatibility with OmniVLA's action head through a learned hidden state shim (896→4096).
-| Property | OpenVLA-Micro | OmniVLA 7B |
-|---|---|---|
-| Vision encoder | DINOv2-S (384d) + SigLIP-B/16 (768d) | DINOv2-L (1024d) + SigLIP-so400m (1152d) |
-| LLM | Qwen2.5 0.5B (896 hidden) | Llama-2 7B (4096 hidden) |
-| Total params | ~0.6B | ~7B |
-| Hidden state dim | 4096 (via learned shim) | 4096 (native) |
-| Action head | OmniVLA compatible | Native |
-| Action cos similarity | **0.997** vs teacher | 1.0 (reference) |
-## Performance
-Trained on 1000 CAST episodes (17k steps) via hidden-state distillation from OmniVLA 7B:
-- **Hidden state cosine**: 0.63 vs teacher
-- **Action cosine**: 0.997 vs teacher
-- **Action MSE**: near-zero (effectively identical predictions)
-## Quick Start
-```python
-from PIL import Image
-from modeling_openvla_micro import OpenVLAMicro
-model = OpenVLAMicro.from_pretrained("theguy21/openvla-micro", device="cuda")
-model.eval()
-image = Image.open("demo.jpg").convert("RGB")
-action = model.predict_action(image, "pick up the red block")
-print(action)
-```
-### CLI
-```bash
-python inference.py --checkpoint theguy21/openvla-micro --image demo.jpg "pick up the red block"
-```
-### As an OmniVLA drop-in replacement
-Use `OpenVLAMicroWrapper` (from `model_wrapper.py`) to expose the same forward interface as OmniVLA's `VLAForActionPrediction`:
-```python
-from model_wrapper import OpenVLAMicroWrapper
-from modeling_openvla_micro import DinoSigLIPEncoder, CombinedProjector, ShimMLP
-# Load components from checkpoint
-ckpt = torch.load("openvla-micro-distill.pt", map_location="cpu")
-ve = DinoSigLIPEncoder()
-ve.load_state_dict(ckpt["model"]["vision_backbone"])
-# ... (see model_wrapper.py for full example)
-# Forward pass compatible with OmniVLA action head
-output = vla(input_ids, attention_mask, pixel_values, labels=labels, output_hidden_states=True)
-actions_hidden_states = extract_actions(output.hidden_states[-1], labels)
-predicted_actions = omnivla_action_head.predict_action(actions_hidden_states, modality_id)
-```
-## Architecture
-```
-Image (224×224)
-  ├── DINOv2-S/14 → 256 patches × 384d → ShimMLP(384→8704)
-  └── SigLIP-B/16 → 196 patches × 768d → ShimMLP(768→8704)
-       └── Concat (452 tokens) → Linear(8704→896) → GELU → Linear(896→896)
-              └── Qwen2.5 0.5B (24 layers, 896 hidden)
-                     └── Hidden State Shim (896→2048→4096)
-                            └── OmniVLA Action Head (pretrained)
-                                   └── 8 chunks × 4-DoF actions
-```
-## Files
-| File | Size | Description |
-|---|---|---|
-| `modeling_openvla_micro.py` | 15 KB | Model definitions (DinoSigLIPEncoder, CombinedProjector, ShimMLP, OpenVLAMicro) |
-| `model_wrapper.py` | 11 KB | OmniVLA-compatible wrapper (OpenVLAMicroWrapper) |
-| `inference.py` | 1.5 KB | Standalone CLI inference |
-| `config.json` | 1.2 KB | Model configuration |
-| `openvla-micro-distill.pt` | 1.6 GB | Full checkpoint with baked-in shim |
-## Requirements
-```
-torch>=2.0.0
-torchvision>=0.15.0
-transformers>=4.38.0
-timm>=0.9.0
-Pillow>=10.0.0
-numpy>=1.24.0
-```
-## Training Details
-The shim (896→2048→4096 MLP) was trained to minimize MSE between the Qwen2.5 0.5B last hidden state and the cached OmniVLA 7B hidden states, keeping all other components frozen. Training used 1000 CAST episodes (17k steps) with bf16 precision on a single 24GB GPU.
-- **Optimizer**: AdamW (lr=5e-5, cosine schedule)
-- **Batch**: 8 micro-batch × 4 grad accum = 32 effective
-- **Training steps**: 21k (plateaued at step ~8k)
-- **Precision**: bfloat16
-## Citation
-```bibtex
-@misc{openvla-micro-2026,
-  title = {OpenVLA-Micro: A Compact Drop-in Replacement for OmniVLA 7B},
-  author = {},
-  year = {2026},
-}
-```
-## License
-MIT

 # OpenVLA-Micro
+A drop-in replacement for OmniVLA 7B that runs ~14× faster with 0.997 action cosine similarity.
+Swaps DINOv2-L + SigLIP-so400m + Llama-2 7B → DINOv2-S/14 + SigLIP-B/16 + Qwen2.5 0.5B (~0.6B total), with a learned 896→4096 shim to stay compatible with OmniVLA's action head.
+See [HF_README.md](HF_README.md) or https://huggingface.co/theguy21/openvla-micro for full details.

inference.py CHANGED Viewed

@@ -1,38 +1,48 @@
 """
 Standalone inference script for OpenVLA-Micro.
 Usage:
-    python inference.py --checkpoint openvla-micro-merged.pt --image demo.jpg "pick up the red block"
 """
 import argparse
 from PIL import Image
 from modeling_openvla_micro import OpenVLAMicro
 def main():
     parser = argparse.ArgumentParser(description="OpenVLA-Micro inference")
-    parser.add_argument("--checkpoint", type=str, default="openvla-micro-merged.pt",
-                        help="Path to a local checkpoint or a Hugging Face repo ID")
-    parser.add_argument("--image", type=str, required=True,
-                        help="Path to input image")
-    parser.add_argument("--device", type=str, default="cpu",
-                        help="Device: cpu or cuda")
-    parser.add_argument("instruction", type=str, nargs="?",
-                        default="pick up the red block",
-                        help="Task instruction")
     args = parser.parse_args()
-    # Load model
-    print(f"Loading OpenVLA-Micro from {args.checkpoint}...")
-    model = OpenVLAMicro.from_pretrained(args.checkpoint, device=args.device)
     model.eval()
-    print(f"Model loaded on {args.device}")
-    # Load image
     image = Image.open(args.image).convert("RGB")
     print(f"Image: {image.size}")
-    # Run inference
     print(f"Instruction: {args.instruction}")
     action = model.predict_action(image, args.instruction)
     print(f"Action (7-DoF): {action}")

 """
 Standalone inference script for OpenVLA-Micro.
 Usage:
+    # GPU inference (HF hub)
+    python inference.py --image demo.jpg "pick up the red block"
+    # From a local .pt file
+    python inference.py --checkpoint openvla-micro-distill.pt --image demo.jpg "pick up the red block"
+    # CPU inference
+    python inference.py --device cpu --image demo.jpg "pick up the red block"
 """
 import argparse
 from PIL import Image
 from modeling_openvla_micro import OpenVLAMicro
 def main():
     parser = argparse.ArgumentParser(description="OpenVLA-Micro inference")
+    parser.add_argument("--checkpoint", type=str, default="theguy21/openvla-micro",
+                        help="HF repo ID or path to local .pt checkpoint")
+    parser.add_argument("--image", type=str, required=True, help="Input image path")
+    parser.add_argument("--device", type=str, default="auto",
+                        help="Device: auto, cuda, or cpu")
+    parser.add_argument("instruction", type=str, nargs="?", default="pick up the red block",
+                        help="Task instruction (positional, optional)")
     args = parser.parse_args()
+    device = args.device
+    if device == "auto":
+        import torch
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+    llm_kwargs = {}
+    if device == "cpu":
+        llm_kwargs["torch_dtype"] = "float32"
+    print(f"Loading OpenVLA-Micro from {args.checkpoint} on {device}...")
+    model = OpenVLAMicro.from_pretrained(args.checkpoint, device=device, llm_kwargs=llm_kwargs)
     model.eval()
+    n_params = sum(p.numel() for p in model.parameters()) / 1e6
+    print(f"Model loaded ({n_params:.0f}M params)")
     image = Image.open(args.image).convert("RGB")
     print(f"Image: {image.size}")
     print(f"Instruction: {args.instruction}")
     action = model.predict_action(image, args.instruction)
     print(f"Action (7-DoF): {action}")

inference_cpu.py ADDED Viewed

	@@ -0,0 +1,61 @@

+"""
+Edge device / CPU inference for OpenVLA-Micro.
+This script is optimized for resource-constrained environments.
+Two modes:
+  1. Standard CPU  – float32, ~3-5 sec/step on modern x86, ~6GB RAM
+  2. Low-RAM (4-bit) – uses bitsandbytes 4-bit quantization, ~2.5GB RAM,
+                       slightly slower but usable on 4GB devices like RPi 5
+                       with sufficient swap.
+Usage:
+    python inference_cpu.py --image demo.jpg "pick up the red block"
+    python inference_cpu.py --low-ram --image demo.jpg "pick up the red block"
+    python inference_cpu.py --checkpoint ./openvla-micro-distill.pt --image demo.jpg "pick up the red block"
+"""
+import argparse
+from PIL import Image
+from modeling_openvla_micro import OpenVLAMicro
+def main():
+    parser = argparse.ArgumentParser(description="OpenVLA-Micro CPU/edge inference")
+    parser.add_argument("--checkpoint", type=str, default="theguy21/openvla-micro",
+                        help="HF repo ID or local .pt path")
+    parser.add_argument("--image", type=str, required=True, help="Input image path")
+    parser.add_argument("--low-ram", action="store_true",
+                        help="4-bit quantized LLM (~2.5GB peak, requires bitsandbytes)")
+    parser.add_argument("instruction", type=str, nargs="?", default="pick up the red block",
+                        help="Task instruction (positional, optional)")
+    args = parser.parse_args()
+    device = "cpu"
+    llm_kwargs = {}
+    if args.low_ram:
+        print("Low-RAM mode: 4-bit quantization (requires bitsandbytes)")
+        llm_kwargs = {
+            "load_in_4bit": True,
+            "bnb_4bit_compute_dtype": "float32",
+            "bnb_4bit_use_double_quant": True,
+        }
+    else:
+        print("Standard CPU mode: float32 (~6GB RAM)")
+        llm_kwargs["torch_dtype"] = "float32"
+    print(f"Loading OpenVLA-Micro from {args.checkpoint} on CPU...")
+    model = OpenVLAMicro.from_pretrained(args.checkpoint, device=device, llm_kwargs=llm_kwargs)
+    model.eval()
+    n_params = sum(p.numel() for p in model.parameters()) / 1e6
+    print(f"Model loaded ({n_params:.0f}M params)")
+    image = Image.open(args.image).convert("RGB")
+    print(f"Image: {image.size}")
+    print(f"Instruction: {args.instruction}")
+    action = model.predict_action(image, args.instruction)
+    print(f"Action (7-DoF): {action}")
+if __name__ == "__main__":
+    main()

modeling_openvla_micro.py CHANGED Viewed

@@ -286,7 +286,8 @@ class OpenVLAMicro(nn.Module):
         )
     @classmethod
-    def from_pretrained(cls, checkpoint_path: Union[str, Path], device: str = "cpu"):
         checkpoint_path = cls._resolve_checkpoint_path(checkpoint_path)
         ckpt = torch.load(checkpoint_path, map_location="cpu")
@@ -304,10 +305,12 @@ class OpenVLAMicro(nn.Module):
         llm_id = "Qwen/Qwen2.5-0.5B"
         config = AutoConfig.from_pretrained(llm_id)
         config.use_flash_attention_2 = False
         llm = AutoModelForCausalLM.from_pretrained(
             llm_id,
             config=config,
-            torch_dtype=torch.bfloat16,
         )
         # --- Tokenizer ---

         )
     @classmethod
+    def from_pretrained(cls, checkpoint_path: Union[str, Path], device: str = "cpu",
+                         **kwargs):
         checkpoint_path = cls._resolve_checkpoint_path(checkpoint_path)
         ckpt = torch.load(checkpoint_path, map_location="cpu")
         llm_id = "Qwen/Qwen2.5-0.5B"
         config = AutoConfig.from_pretrained(llm_id)
         config.use_flash_attention_2 = False
+        llm_kwargs = kwargs.pop("llm_kwargs", {})
+        llm_kwargs.setdefault("torch_dtype", torch.bfloat16)
         llm = AutoModelForCausalLM.from_pretrained(
             llm_id,
             config=config,
+            **llm_kwargs,
         )
         # --- Tokenizer ---

pyproject.toml CHANGED Viewed

@@ -19,6 +19,7 @@ dependencies = [
 [project.scripts]
 openvla-micro = "inference:main"
 [tool.setuptools]
-py-modules = ["inference", "modeling_openvla_micro"]

 [project.scripts]
 openvla-micro = "inference:main"
+openvla-micro-cpu = "inference_cpu:main"
 [tool.setuptools]
+py-modules = ["inference", "inference_cpu", "modeling_openvla_micro", "model_wrapper"]

train_shim.py CHANGED Viewed

@@ -1,8 +1,22 @@
 """
-Continue shim-only training (no action head) on cached distill data.
-Resumes from the previous best shim checkpoint.
 """
-import argparse, json, os, sys
 from pathlib import Path
 import numpy as np
@@ -13,50 +27,50 @@ from torch.utils.data import Dataset, DataLoader
 from PIL import Image
 from tqdm import tqdm
-sys.path.insert(0, os.path.expanduser("~/openvla-micro"))
-sys.path.insert(0, "/mnt/steamdrive/openvla-micro")
-from model_wrapper import IMAGENET_MEAN as IMAGENET_MEAN_4D, IMAGENET_STD as IMAGENET_STD_4D, SIGLIP_MEAN, SIGLIP_STD
-IMAGENET_MEAN = IMAGENET_MEAN_4D.view(3, 1, 1)
-IMAGENET_STD = IMAGENET_STD_4D.view(3, 1, 1)
-# ── Paths ──
-CACHE_DIR = Path("/mnt/steamdrive/openvla_cache")
-CAST_DIR = Path("/mnt/steamdrive/cast_converted")
-CKPT_DIR = Path("/mnt/steamdrive/omnivla_checkpoints")
-RUN_BASE = Path("/mnt/steamdrive/openvla-micro/runs_distill_shim")
-# ── Constants ──
-NUM_ACTION_TOKENS = 32
-ACTION_DIM = 4
-NUM_ACTIONS_CHUNK = 8
-def to_siglip(pv: torch.Tensor) -> torch.Tensor:
-    return (pv * IMAGENET_STD.to(pv.device) + IMAGENET_MEAN.to(pv.device) - SIGLIP_MEAN.to(pv.device)) / SIGLIP_STD.to(pv.device)
-# ── Dataset (same as original) ──
 class DistillDataset(Dataset):
-    def __init__(self, cache_dir: Path, cast_dir: Path, split="train", val_ratio=0.1, max_episodes=0):
-        self.cast_dir = cast_dir
-        self.cache_dir = cache_dir
-        cache_files = sorted(cache_dir.glob("episode_*.pt"))
         n = len(cache_files)
-        if max_episodes > 0:
-            cache_files = cache_files[:max_episodes]
-            n = len(cache_files)
         split_idx = int(n * (1 - val_ratio))
         files = cache_files[:split_idx] if split == "train" else cache_files[split_idx:]
         self.index = []
         for cf in files:
             d = torch.load(cf, weights_only=True)
-            T = d["num_steps"]
-            for t in range(T):
                 self.index.append((cf, t))
         self._cache = {}
-        self._instruction_cache = {}
         print(f"  [{split}] {len(self.index)} steps from {len(files)} episodes", flush=True)
     def __len__(self):
@@ -67,345 +81,244 @@ class DistillDataset(Dataset):
         cf_str = str(cf_path)
         if cf_str not in self._cache:
             self._cache[cf_str] = torch.load(cf_path, weights_only=True)
-            if len(self._cache) > 2:
-                oldest = next(iter(self._cache))
-                del self._cache[oldest]
-        ep_data = self._cache[cf_str]
-        ep_id = ep_data["episode_id"]
-        hs_target = ep_data["hidden_states"][t].float()
-        if ep_id not in self._instruction_cache:
-            instr_path = self.cast_dir / ep_id / "instructions.json"
-            with open(instr_path, "r") as f:
-                self._instruction_cache[ep_id] = json.load(f)
-        instr = self._instruction_cache[ep_id][t]
-        if isinstance(instr, list):
-            instr = instr[0]
-        instr = str(instr).strip()
-        ep_dir = self.cast_dir / ep_id
         from torchvision.transforms.functional import resize as tv_resize
-        cur = tv_resize(Image.open(ep_dir / "img" / f"step_{t:04d}.png").convert("RGB"), 224)
-        cur = torch.tensor(np.array(cur, dtype=np.float32) / 255.0).permute(2, 0, 1)
-        cur = (cur - IMAGENET_MEAN) / IMAGENET_STD
-        return {
-            "cur_img": cur,
-            "hs_target": hs_target,
-            "instruction": instr,
-        }
-class DistillCollator:
-    def __init__(self, tokenizer, action_token_ids, num_action_tokens=NUM_ACTION_TOKENS):
-        self.tokenizer = tokenizer
-        self.action_token_ids = action_token_ids
-        self.num_action_tokens = num_action_tokens
-    def __call__(self, batch):
-        texts = []
-        cur_imgs = []
-        hs_targets = []
-        for item in batch:
-            texts.append(item["instruction"])
-            cur_imgs.append(item["cur_img"])
-            hs_targets.append(item["hs_target"])
-        cur = torch.stack(cur_imgs)  # (B, 3, 224, 224)
-        hs_target = torch.stack(hs_targets)  # (B, 32, 4096)
-        chat = []
-        for t in texts:
-            chat.append([
-                {"role": "system", "content": "You are a helpful assistant."},
-                {"role": "user", "content": f"What action should the robot take to {t.lower()}?"},
-                {"role": "assistant", "content": " ".join([f"<ACTION_{i}>" for i in range(self.num_action_tokens)])},
-            ])
-        tok = self.tokenizer.apply_chat_template(
-            chat, tokenize=True, add_generation_prompt=False, return_dict=True, return_tensors="pt", padding=True,
-        )
-        input_ids = tok["input_ids"]
-        attention_mask = tok["attention_mask"]
-        return {
-            "cur_img": cur,
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-            "hs_target": hs_target,
-        }
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--max-steps", type=int, default=50000)
     parser.add_argument("--batch-size", type=int, default=8)
     parser.add_argument("--lr", type=float, default=5e-5)
-    parser.add_argument("--weight-decay", type=float, default=0.01)
     parser.add_argument("--grad-accum", type=int, default=4)
-    parser.add_argument("--log-every", type=int, default=50)
     parser.add_argument("--val-every", type=int, default=500)
     parser.add_argument("--save-every", type=int, default=5000)
-    parser.add_argument("--num-workers", type=int, default=0)
-    parser.add_argument("--resume", type=str,
-                        default=str(RUN_BASE / "shim_best.pt"))
-    parser.add_argument("--run-name", type=str, default="shim_continued")
     args = parser.parse_args()
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    assert device.type == "cuda"
     print(f"Device: {device}")
-    run_dir = RUN_BASE.parent / args.run_name
-    run_dir.mkdir(exist_ok=True, parents=True)
-    # ── 1. Models ──
-    from modeling_openvla_micro import DinoSigLIPEncoder, CombinedProjector, ShimMLP
-    from transformers import AutoModelForCausalLM, AutoTokenizer
-    print("\n[1] Loading base model (vision encoder, projector, Qwen)...")
-    distill_ckpt = torch.load(os.path.expanduser("~/openvla-micro/openvla-micro-distill.pt"),
-                               map_location="cpu", weights_only=False)
-    msd = distill_ckpt["model"]
-    ve = DinoSigLIPEncoder()
     ve.load_state_dict(msd["vision_backbone"])
-    ve.to(device=device, dtype=torch.bfloat16).eval()
-    for p in ve.parameters():
-        p.requires_grad_(False)
-    projector = CombinedProjector(ShimMLP(384), ShimMLP(768),
-                                   nn.Linear(8704, 896), nn.Linear(896, 896))
     projector.load_state_dict(msd["projector"])
-    projector.to(device=device, dtype=torch.bfloat16).eval()
-    for p in projector.parameters():
-        p.requires_grad_(False)
-    qwen = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B", torch_dtype=torch.bfloat16)
-    qwen_sd = {k.replace("llm.", "", 1): v for k, v in msd["llm_backbone"].items()}
-    qwen.load_state_dict(qwen_sd)
-    qwen.to(device=device, dtype=torch.bfloat16).eval()
-    for p in qwen.parameters():
-        p.requires_grad_(False)
     tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B", use_fast=True)
     tokenizer.add_tokens([f"<ACTION_{i}>" for i in range(NUM_ACTION_TOKENS)])
-    # ── 2. Shim (trainable) ──
-    print("\n[2] Loading shim...")
-    shim = nn.Sequential(nn.Linear(896, 2048), nn.GELU(), nn.Linear(2048, 4096))
-    if args.resume and os.path.exists(args.resume):
-        shim_sd = torch.load(args.resume, map_location="cpu", weights_only=True)
-        shim.load_state_dict(shim_sd)
         print(f"  Resumed from {args.resume}")
-    else:
-        print("  Starting from scratch (random init)!")
-    shim.to(device=device, dtype=torch.bfloat16).train()
-    # Pose projector (trainable)
-    pose_proj = nn.Sequential(
-        nn.Linear(4, 896), nn.GELU(),
-    ).to(device=device, dtype=torch.bfloat16).train()
     # ── 3. Data ──
-    print("\n[3] Setting up data...")
-    action_token_ids = tokenizer.convert_tokens_to_ids([f"<ACTION_{i}>" for i in range(NUM_ACTION_TOKENS)])
-    train_ds = DistillDataset(CACHE_DIR, CAST_DIR, split="train")
-    val_ds = DistillDataset(CACHE_DIR, CAST_DIR, split="val")
-    collator = DistillCollator(tokenizer, action_token_ids)
-    train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True,
-                              collate_fn=collator, num_workers=args.num_workers, pin_memory=True)
-    val_loader = DataLoader(val_ds, batch_size=1, shuffle=False,
-                            collate_fn=collator, num_workers=0, pin_memory=True)
     # ── 4. Optimizer ──
-    opt = torch.optim.AdamW([
-        {"params": shim.parameters(), "lr": args.lr},
-        {"params": pose_proj.parameters(), "lr": args.lr},
-    ], weight_decay=args.weight_decay)
     sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=args.max_steps)
     # ── 5. Training ──
-    print("\n[4] Training (shim only, no action head)...")
-    dino = ve.dino_featurizer
-    siglip = ve.siglip_featurizer
-    dino.eval(); siglip.eval()
-    projector.eval(); qwen.eval()
     def encode_image(cur):
-        """Encode a single image → (B, 452, 896) vision features."""
         with torch.no_grad():
-            dino_f = dino(cur.to(dtype=torch.bfloat16))
-            if isinstance(dino_f, (list, tuple)):
-                dino_f = dino_f[0]
-            dino_f = dino_f[:, 1:]  # drop cls
-            siglip_f = siglip(to_siglip(cur).to(dtype=torch.bfloat16))
-            if isinstance(siglip_f, (list, tuple)):
-                siglip_f = siglip_f[0]
-            siglip_f = siglip_f[:, 1:]  # drop cls
-            B = cur.shape[0]
-            D = 1152
-            def pad(feat, ed):
-                p = torch.zeros(B, feat.shape[1], D, device=device, dtype=torch.bfloat16)
-                p[..., :ed] = feat[..., :ed]
-                return p
-            combined = torch.cat([pad(dino_f, 384), pad(siglip_f, 768)], dim=1)
-            return projector(combined)  # (B, 452, 896)
-    # Find action token offset in template
-    dummy_tok = tokenizer.apply_chat_template(
-        [{"role": "system", "content": "You are a helpful assistant."},
-         {"role": "user", "content": "test"},
-         {"role": "assistant", "content": " ".join([f"<ACTION_{i}>" for i in range(NUM_ACTION_TOKENS)])}],
-        tokenize=True, add_generation_prompt=False, return_dict=True, return_tensors="pt",
-    )
-    dummy_ids = dummy_tok["input_ids"].squeeze(0)
-    action_pos = torch.where(
-        (dummy_ids >= action_token_ids[0]) & (dummy_ids <= action_token_ids[-1])
-    )[0]
-    action_offset_in_text = action_pos[0].item()
-    NUM_VIS = 452
-    print(f"  Action tokens start at position {action_offset_in_text} in text sequence")
-    print(f"  Vision tokens: {NUM_VIS}")
     global_step = 0
-    best_val_loss = float("inf")
     train_iter = iter(train_loader)
     pbar = tqdm(total=args.max_steps, desc="Train")
     while global_step < args.max_steps:
         shim.train()
-        pose_proj.train()
         opt.zero_grad()
         accum_loss = 0.0
-        for micro_step in range(args.grad_accum):
             try:
                 batch = next(train_iter)
             except StopIteration:
                 train_iter = iter(train_loader)
                 batch = next(train_iter)
-            cur_img = batch["cur_img"].to(device, dtype=torch.bfloat16)
             inp = batch["input_ids"].to(device)
             am = batch["attention_mask"].to(device)
-            hs_target = batch["hs_target"].to(device, dtype=torch.bfloat16)
             B = cur_img.shape[0]
-            # Encode image
             vis = encode_image(cur_img)
-            # Text embeddings
-            embed = qwen.get_input_embeddings()(inp)
-            # Multimodal input
-            mm_embeds = torch.cat([embed[:, :1, :], vis, embed[:, 1:, :]], dim=1)
-            mm_attn = torch.cat([am[:, :1],
-                                 torch.ones(B, vis.shape[1], dtype=am.dtype, device=device),
-                                 am[:, 1:]], dim=1)
-            # Action mask
-            act_start = 1 + NUM_VIS + action_offset_in_text - 1
-            action_mask_mm = torch.zeros(mm_embeds.shape[:2], dtype=torch.bool, device=device)
             for i in range(B):
-                start = act_start
-                end = start + NUM_ACTION_TOKENS
-                if end <= mm_embeds.shape[1]:
-                    action_mask_mm[i, start:end] = True
-            mm_embeds = mm_embeds * ~action_mask_mm.unsqueeze(-1)
-            # Qwen forward
-            try:
-                with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
-                    out = qwen(inputs_embeds=mm_embeds, attention_mask=mm_attn,
-                               labels=None, output_hidden_states=True, return_dict=True)
-            except Exception as exc:
-                print(f"\n[train] forward failed at step {global_step} micro_step {micro_step}: {exc}", flush=True)
-                raise
             hs_all = out.hidden_states[-1]
-            hs_act_qwen = torch.stack([hs_all[i, action_mask_mm[i]] for i in range(B)], dim=0)
-            hs_shimmed = shim(hs_act_qwen)  # (B, 32, 4096)
-            # State MSE (shim gradients)
             loss = F.mse_loss(hs_shimmed, hs_target)
             (loss / args.grad_accum).backward()
             accum_loss += loss.item()
-        # Gradient clipping & step
         torch.nn.utils.clip_grad_norm_(shim.parameters(), 1.0)
-        torch.nn.utils.clip_grad_norm_(pose_proj.parameters(), 1.0)
         opt.step()
         sched.step()
         global_step += 1
-        # Logging
-        if global_step % args.log_every == 0:
-            lr_cur = opt.param_groups[0]["lr"]
             with torch.no_grad():
-                cos = F.cosine_similarity(hs_shimmed.float().reshape(-1, 4096),
-                                          hs_target.float().reshape(-1, 4096), dim=-1).mean().item()
-                nd = (hs_shimmed.float() - hs_target.float()).norm(dim=-1).mean().item()
-            pbar.set_postfix({
-                "loss": f"{accum_loss/args.grad_accum:.5f}",
-                "cos": f"{cos:.4f}",
-                "nd": f"{nd:.2f}",
-                "lr": f"{lr_cur:.1e}",
-            })
         # Validation
         if global_step % args.val_every == 0:
             shim.eval()
-            val_loss, val_cos, val_nd, nv = 0.0, 0.0, 0.0, 0
             with torch.no_grad():
-                for vbatch in val_loader:
-                    cur_img = vbatch["cur_img"].to(device, dtype=torch.bfloat16)
-                    inp = vbatch["input_ids"].to(device)
-                    am = vbatch["attention_mask"].to(device)
-                    hs_target = vbatch["hs_target"].to(device, dtype=torch.bfloat16)
-                    Bv = cur_img.shape[0]
-                    vis = encode_image(cur_img)
-                    embed = qwen.get_input_embeddings()(inp)
-                    mm_embeds = torch.cat([embed[:, :1, :], vis, embed[:, 1:, :]], dim=1)
-                    mm_attn = torch.cat([am[:, :1],
-                                         torch.ones(Bv, NUM_VIS, dtype=am.dtype, device=device),
-                                         am[:, 1:]], dim=1)
-                    act_start_v = 1 + NUM_VIS + action_offset_in_text - 1
-                    act_mask_v = torch.zeros(mm_embeds.shape[:2], dtype=torch.bool, device=device)
                     for i in range(Bv):
-                        s = act_start_v
-                        e = s + NUM_ACTION_TOKENS
-                        if e <= mm_embeds.shape[1]:
-                            act_mask_v[i, s:e] = True
-                    mm_embeds = mm_embeds * ~act_mask_v.unsqueeze(-1)
-                    out = qwen(inputs_embeds=mm_embeds, attention_mask=mm_attn,
-                               labels=None, output_hidden_states=True, return_dict=True)
-                    hs_all = out.hidden_states[-1]
-                    hs_act = torch.stack([hs_all[i, act_mask_v[i]] for i in range(Bv)], dim=0)
-                    hs_shimmed = shim(hs_act)
-                    val_loss += F.mse_loss(hs_shimmed, hs_target).item()
-                    val_cos += F.cosine_similarity(hs_shimmed.float().reshape(-1, 4096),
-                                                    hs_target.float().reshape(-1, 4096), dim=-1).mean().item()
-                    val_nd += (hs_shimmed.float() - hs_target.float()).norm(dim=-1).mean().item()
                     nv += 1
-            val_loss /= nv; val_cos /= nv; val_nd /= nv
-            print(f"\n─── Val @ step {global_step}: loss={val_loss:.5f}  cos={val_cos:.4f}  nd={val_nd:.2f} ───", flush=True)
-            if val_loss < best_val_loss:
-                best_val_loss = val_loss
                 torch.save(shim.state_dict(), run_dir / "shim_best.pt")
-                torch.save(pose_proj.state_dict(), run_dir / "pose_projector_best.pt")
-                print(f"  → New best saved (loss={val_loss:.5f})")
         if global_step % args.save_every == 0:
-            ckpt_dir = run_dir / f"step_{global_step}"
-            ckpt_dir.mkdir(exist_ok=True)
-            torch.save(shim.state_dict(), ckpt_dir / "shim.pt")
-            torch.save(pose_proj.state_dict(), ckpt_dir / "pose_projector.pt")
-        pbar.update(1)
     pbar.close()
-    print(f"\nDone! Best val loss: {best_val_loss:.5f}")
 if __name__ == "__main__":

 """
+Train the hidden state shim (896→4096) for OpenVLA-Micro.
+The shim maps Qwen2.5 0.5B's 896-dim hidden states to match a teacher
+LLM's 4096-dim space (e.g., Llama-2, Llama-3). This lets the small model
+drive OmniVLA's pretrained action head with near-zero accuracy loss.
+Workflow:
+  1. Cache your teacher's hidden states on your dataset
+  2. Run this script to train the shim
+  3. Bake the shim into the checkpoint with bake_shim.py
+Usage:
+    python train_shim.py --cache-dir ./my_cache --base-model theguy21/openvla-micro
+For the full training pipeline used in openvla-micro-distill, see:
+  https://huggingface.co/theguy21/openvla-micro
 """
+import argparse, json, os
 from pathlib import Path
 import numpy as np
 from PIL import Image
 from tqdm import tqdm
+from modeling_openvla_micro import DinoSigLIPEncoder, CombinedProjector, ShimMLP
+from model_wrapper import IMAGENET_MEAN as IM4D, IMAGENET_STD as IS4D, SIGLIP_MEAN, SIGLIP_STD
+from transformers import AutoModelForCausalLM, AutoTokenizer
+IMAGENET_MEAN = IM4D.view(3, 1, 1)
+IMAGENET_STD = IS4D.view(3, 1, 1)
+NUM_ACTION_TOKENS = 32  # OmniVLA uses 8 chunks × 4 DoF
+NUM_VIS = 452  # 256 dino patches + 196 siglip patches
+def to_siglip(pv):
+    """Swap the ImageNet normalization of ``pv`` for the SigLIP one.
+
+    The ImageNet normalization is undone (multiply by the std, add the mean)
+    and the SigLIP mean/std are applied instead. All four statistics tensors
+    are moved to ``pv.device`` before they are used.
+    """
+    dev = pv.device
+    # Undo the ImageNet normalization first ...
+    denormed = pv * IMAGENET_STD.to(dev) + IMAGENET_MEAN.to(dev)
+    # ... then normalize again with the SigLIP statistics.
+    return (denormed - SIGLIP_MEAN.to(dev)) / SIGLIP_STD.to(dev)
+# ─────────────────────────────────────────────────────────────
+# Dataset — ADAPT THE IMAGE/INSTRUCTION LOGIC TO YOUR FORMAT
+# ─────────────────────────────────────────────────────────────
 class DistillDataset(Dataset):
+    """
+    Per-step distillation samples built from cached ``episode_*.pt`` files.
+
+    Each episode_*.pt is expected to contain:
+        episode_id: str
+        num_steps: int
+        hidden_states: Tensor[T, 32, teacher_dim]
+        (optional) instructions: list[str] of length T
+    Image paths are constructed as {data_dir}/{episode_id}/img/step_{t:04d}.png
+    Override _load_image / _get_instruction for custom formats.
+    NOTE(review): no _load_image / _get_instruction hooks are visible in this
+    view of the class — confirm they exist before relying on this advice.
+    """
+    def __init__(self, cache_dir, data_dir, split="train", val_ratio=0.1):
+        self.data_dir = Path(data_dir)
+        # Sorted so the train/val split below is deterministic across runs.
+        cache_files = sorted(Path(cache_dir).glob("episode_*.pt"))
         n = len(cache_files)
+        # The first (1 - val_ratio) share of episodes is "train"; any other
+        # value of `split` selects the remaining episodes.
         split_idx = int(n * (1 - val_ratio))
         files = cache_files[:split_idx] if split == "train" else cache_files[split_idx:]
+        # Flat list of (cache file, step index) pairs, one entry per step.
         self.index = []
         for cf in files:
             d = torch.load(cf, weights_only=True)
+            for t in range(d["num_steps"]):
                 self.index.append((cf, t))
+        # Loaded episodes keyed by file path; entries are never evicted.
         self._cache = {}
+        # NOTE(review): _instr_cache is not read anywhere in the visible code.
+        self._instr_cache = {}
         print(f"  [{split}] {len(self.index)} steps from {len(files)} episodes", flush=True)
     def __len__(self):
+        # NOTE(review): cf_path and t are not bound in the lines visible here —
+        # presumably they are unpacked from self.index[...]; confirm against
+        # the full file.
         cf_str = str(cf_path)
         if cf_str not in self._cache:
             self._cache[cf_str] = torch.load(cf_path, weights_only=True)
+        ep = self._cache[cf_str]
+        ep_id = ep["episode_id"]
+        # Image
         from torchvision.transforms.functional import resize as tv_resize
+        img = tv_resize(Image.open(self.data_dir / ep_id / "img" / f"step_{t:04d}.png").convert("RGB"), 224)
+        # Scale to [0, 1], reorder HWC -> CHW, then apply ImageNet normalization.
+        img = torch.tensor(np.array(img, dtype=np.float32) / 255.0).permute(2, 0, 1)
+        img = (img - IMAGENET_MEAN) / IMAGENET_STD
+        # Instruction
+        if "instructions" in ep:
+            instr = ep["instructions"][t]
+            # A per-step entry may itself be a list; its first element is used.
+            if isinstance(instr, list):
+                instr = instr[0]
+        else:
+            # Fallback text when the episode carries no instructions.
+            instr = "move forward"
+        return {"cur_img": img, "hs_target": ep["hidden_states"][t].float(), "instruction": str(instr).strip()}
+def find_action_offset(tokenizer, action_token_ids):
+    """Return the index of the first action token in a templated dummy prompt.
+
+    A three-turn conversation (system / user / assistant) is rendered with
+    ``tokenizer.apply_chat_template``. The user turn is the fixed string
+    ``"test"`` and the assistant turn holds the ``NUM_ACTION_TOKENS``
+    placeholders ``<ACTION_0> ... <ACTION_{N-1}>`` separated by spaces, so the
+    returned position is specific to this dummy prompt.
+
+    Args:
+        tokenizer: Tokenizer exposing ``apply_chat_template``.
+        action_token_ids: Ids of the action placeholder tokens. Only the first
+            and last entries are used, as the inclusive bounds of the id range
+            that counts as an action token.
+
+    Returns:
+        int: Position of the first token id that falls inside that range.
+
+    Raises:
+        IndexError: If no token id of the templated prompt falls inside the
+            range, e.g. because the ``<ACTION_i>`` tokens were never added to
+            the tokenizer and were split into sub-word pieces.
+    """
+    dummy = tokenizer.apply_chat_template(
+        [{"role": "system", "content": "You are a helpful assistant."},
+         {"role": "user", "content": "test"},
+         {"role": "assistant", "content": " ".join([f"<ACTION_{i}>" for i in range(NUM_ACTION_TOKENS)])}],
+        tokenize=True, add_generation_prompt=False, return_dict=True, return_tensors="pt",
+    )
+    ids = dummy["input_ids"].squeeze(0)
+    pos = torch.where((ids >= action_token_ids[0]) & (ids <= action_token_ids[-1]))[0]
+    # Fail with an actionable message instead of an opaque tensor indexing
+    # error; the exception type stays IndexError for existing callers.
+    if pos.numel() == 0:
+        raise IndexError(
+            "no action token ids found in the templated dummy prompt; "
+            "make sure the <ACTION_i> tokens were added to the tokenizer"
+        )
+    return pos[0].item()
 def main():
+    """Parse CLI arguments, load the frozen base model, and train the shim.
+
+    The vision encoder, projector and LLM are loaded from the checkpoint given
+    by ``--base-model`` and frozen; only ``shim`` is optimized (AdamW with a
+    cosine schedule). The loss is the MSE between the shimmed hidden states at
+    the action-token positions and the cached ``hs_target``. The best-validation
+    weights and periodic snapshots of the shim are written under the
+    ``--run-name`` directory.
+    """
     parser = argparse.ArgumentParser()
+    parser.add_argument("--cache-dir", type=str, required=True)
+    parser.add_argument("--data-dir", type=str, required=True,
+                        help="Dataset root with {episode_id}/img/step_*.png")
+    # NOTE(review): this value is passed to torch.load as a local path below,
+    # while the default looks like a Hub repo id — confirm the intended usage.
+    parser.add_argument("--base-model", type=str, default="theguy21/openvla-micro")
+    parser.add_argument("--teacher-dim", type=int, default=4096)
+    parser.add_argument("--max-steps", type=int, default=10000)
     parser.add_argument("--batch-size", type=int, default=8)
     parser.add_argument("--lr", type=float, default=5e-5)
     parser.add_argument("--grad-accum", type=int, default=4)
     parser.add_argument("--val-every", type=int, default=500)
     parser.add_argument("--save-every", type=int, default=5000)
+    parser.add_argument("--resume", type=str, default=None)
+    parser.add_argument("--run-name", type=str, default="shim_run")
+    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
     args = parser.parse_args()
+    device = torch.device(args.device)
+    dtype = torch.bfloat16
     print(f"Device: {device}")
+    # Output directory for checkpoints; parent directories are not created.
+    run_dir = Path(args.run_name)
+    run_dir.mkdir(exist_ok=True)
+    # ── 1. Load base model ──
+    print("\n[1] Loading base model...")
+    # NOTE(review): weights_only=False unpickles arbitrary objects — only load
+    # checkpoints from a trusted source.
+    ckpt = torch.load(os.path.expanduser(args.base_model), map_location="cpu", weights_only=False)
+    msd = ckpt["model"]
+    # Vision encoder: loaded from the checkpoint and frozen.
+    ve = DinoSigLIPEncoder().eval()
     ve.load_state_dict(msd["vision_backbone"])
+    ve.to(device, dtype=dtype)
+    for p in ve.parameters(): p.requires_grad_(False)
+    # Projector: loaded from the checkpoint and frozen.
+    projector = CombinedProjector(ShimMLP(384), ShimMLP(768), nn.Linear(8704, 896), nn.Linear(896, 896))
     projector.load_state_dict(msd["projector"])
+    projector.to(device, dtype=dtype).eval()
+    for p in projector.parameters(): p.requires_grad_(False)
+    # LLM: architecture from the Hub, weights from the checkpoint (the first
+    # "llm." in each key is stripped), then frozen.
+    llm = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B", torch_dtype=dtype)
+    llm_sd = {k.replace("llm.", "", 1): v for k, v in msd["llm_backbone"].items()}
+    llm.load_state_dict(llm_sd)
+    llm.to(device, dtype=dtype).eval()
+    for p in llm.parameters(): p.requires_grad_(False)
     tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B", use_fast=True)
     tokenizer.add_tokens([f"<ACTION_{i}>" for i in range(NUM_ACTION_TOKENS)])
+    action_token_ids = tokenizer.convert_tokens_to_ids([f"<ACTION_{i}>" for i in range(NUM_ACTION_TOKENS)])
+    action_offset = find_action_offset(tokenizer, action_token_ids)
+    print(f"  Action tokens at position {action_offset}")
+    # ── 2. Shim ──
+    print("\n[2] Building shim...")
+    # Two-layer MLP: 896 -> 2048 -> teacher_dim; the only trainable module.
+    shim = nn.Sequential(nn.Linear(896, 2048), nn.GELU(), nn.Linear(2048, args.teacher_dim))
+    if args.resume:
+        shim.load_state_dict(torch.load(args.resume, map_location="cpu"))
         print(f"  Resumed from {args.resume}")
+    shim.to(device, dtype=dtype).train()
     # ── 3. Data ──
+    print("\n[3] Loading data...")
+    train_ds = DistillDataset(args.cache_dir, args.data_dir, split="train")
+    val_ds = DistillDataset(args.cache_dir, args.data_dir, split="val")
+    def collate(batch):
+        # Stacks images/targets and tokenizes one chat prompt per sample.
+        # NOTE(review): tv_resize is imported but not used in this function.
+        from torchvision.transforms.functional import resize as tv_resize
+        texts, imgs, hs = [], [], []
+        for b in batch:
+            texts.append(b["instruction"])
+            imgs.append(b["cur_img"])
+            hs.append(b["hs_target"])
+        cur = torch.stack(imgs)
+        hs_target = torch.stack(hs)
+        # Same three-turn layout as find_action_offset, but the user turn here
+        # embeds the (lower-cased) instruction.
+        chat = [[{"role": "system", "content": "You are a helpful assistant."},
+                 {"role": "user", "content": f"What action should the robot take to {t.lower()}?"},
+                 {"role": "assistant", "content": " ".join([f"<ACTION_{i}>" for i in range(NUM_ACTION_TOKENS)])}]
+                for t in texts]
+        tok = tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=False,
+                                            return_dict=True, return_tensors="pt", padding=True)
+        return {"cur_img": cur, "input_ids": tok["input_ids"], "attention_mask": tok["attention_mask"],
+                "hs_target": hs_target}
+    train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, collate_fn=collate, num_workers=0)
+    val_loader = DataLoader(val_ds, batch_size=1, shuffle=False, collate_fn=collate, num_workers=0)
     # ── 4. Optimizer ──
+    opt = torch.optim.AdamW(shim.parameters(), lr=args.lr, weight_decay=0.01)
     sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=args.max_steps)
     # ── 5. Training ──
+    print(f"\n[4] Training...")
+    dino, siglip = ve.dino_featurizer, ve.siglip_featurizer
     def encode_image(cur):
+        # Runs both featurizers without gradients and returns the projector
+        # output for the concatenated, zero-padded features.
         with torch.no_grad():
+            df = dino(cur)
+            if isinstance(df, (list, tuple)): df = df[0]
+            # Drop the first token along dim 1 (presumably the CLS token).
+            df = df[:, 1:]
+            sf = siglip(to_siglip(cur))
+            if isinstance(sf, (list, tuple)): sf = sf[0]
+            sf = sf[:, 1:]
+            B = cur.shape[0]; D = 1152
+            def pad(f, ed):
+                # Zero tensor of width D holding the first `ed` features of f.
+                p = torch.zeros(B, f.shape[1], D, device=device, dtype=dtype)
+                p[..., :ed] = f[..., :ed]; return p
+            return projector(torch.cat([pad(df, 384), pad(sf, 768)], dim=1))
+    best_loss = float("inf")
     global_step = 0
     train_iter = iter(train_loader)
     pbar = tqdm(total=args.max_steps, desc="Train")
     while global_step < args.max_steps:
         shim.train()
         opt.zero_grad()
         accum_loss = 0.0
+        # One optimizer step accumulates gradients over grad_accum micro-batches.
+        for _ in range(args.grad_accum):
             try:
                 batch = next(train_iter)
             except StopIteration:
+                # Loader exhausted: restart it and keep going.
                 train_iter = iter(train_loader)
                 batch = next(train_iter)
+            cur_img = batch["cur_img"].to(device, dtype=dtype)
             inp = batch["input_ids"].to(device)
             am = batch["attention_mask"].to(device)
+            hs_target = batch["hs_target"].to(device, dtype=dtype)
             B = cur_img.shape[0]
             vis = encode_image(cur_img)
+            embed = llm.get_input_embeddings()(inp)
+            # Insert the vision tokens right after the first text token.
+            mm = torch.cat([embed[:, :1, :], vis, embed[:, 1:, :]], dim=1)
+            mm_attn = torch.cat([am[:, :1], torch.ones(B, NUM_VIS, dtype=am.dtype, device=device), am[:, 1:]], dim=1)
+            # Equals NUM_VIS + action_offset: the text position shifted by the
+            # inserted vision tokens.
+            # NOTE(review): action_offset was measured on a dummy prompt whose
+            # user turn is "test", but the same offset is applied to every
+            # sample although the real prompts embed the instruction — confirm
+            # the action tokens really sit at this position.
+            act_start = 1 + NUM_VIS + action_offset - 1
+            mask = torch.zeros(B, mm.shape[1], dtype=torch.bool, device=device)
             for i in range(B):
+                end = act_start + NUM_ACTION_TOKENS
+                # Rows whose window would run past the sequence stay unmasked.
+                if end <= mm.shape[1]:
+                    mask[i, act_start:end] = True
+            # Zero the input embeddings at the masked (action) positions.
+            mm = mm * ~mask.unsqueeze(-1)
+            with torch.autocast(device_type=device.type, dtype=dtype):
+                out = llm(inputs_embeds=mm, attention_mask=mm_attn, labels=None, output_hidden_states=True, return_dict=True)
             hs_all = out.hidden_states[-1]
+            # Last-layer hidden states at the masked positions, per sample.
+            hs_act = torch.stack([hs_all[i, mask[i]] for i in range(B)], dim=0)
+            hs_shimmed = shim(hs_act)
             loss = F.mse_loss(hs_shimmed, hs_target)
             (loss / args.grad_accum).backward()
             accum_loss += loss.item()
         torch.nn.utils.clip_grad_norm_(shim.parameters(), 1.0)
         opt.step()
         sched.step()
         global_step += 1
+        if global_step % 100 == 0:
             with torch.no_grad():
+                # Cosine similarity is computed on the last micro-batch only;
+                # the displayed loss is the mean over all micro-batches.
+                cos = F.cosine_similarity(hs_shimmed.float().reshape(-1, args.teacher_dim),
+                                          hs_target.float().reshape(-1, args.teacher_dim), dim=-1).mean().item()
+            pbar.set_postfix({"loss": f"{accum_loss/args.grad_accum:.5f}", "cos": f"{cos:.4f}"})
+        pbar.update(1)
         # Validation
         if global_step % args.val_every == 0:
             shim.eval()
+            v_loss, v_cos, nv = 0.0, 0.0, 0
             with torch.no_grad():
+                # Same pipeline as the training step, run over the whole
+                # validation loader (batch size 1) without autocast.
+                for vb in val_loader:
+                    ci = vb["cur_img"].to(device, dtype=dtype)
+                    ip = vb["input_ids"].to(device)
+                    am = vb["attention_mask"].to(device)
+                    ht = vb["hs_target"].to(device, dtype=dtype)
+                    Bv = ci.shape[0]
+                    vi = encode_image(ci)
+                    em = llm.get_input_embeddings()(ip)
+                    mm = torch.cat([em[:, :1, :], vi, em[:, 1:, :]], dim=1)
+                    ma = torch.cat([am[:, :1], torch.ones(Bv, NUM_VIS, dtype=am.dtype, device=device), am[:, 1:]], dim=1)
+                    mk = torch.zeros(Bv, mm.shape[1], dtype=torch.bool, device=device)
                     for i in range(Bv):
+                        e = 1 + NUM_VIS + action_offset - 1 + NUM_ACTION_TOKENS
+                        if e <= mm.shape[1]:
+                            mk[i, 1 + NUM_VIS + action_offset - 1:e] = True
+                    mm = mm * ~mk.unsqueeze(-1)
+                    o = llm(inputs_embeds=mm, attention_mask=ma, labels=None, output_hidden_states=True, return_dict=True)
+                    ha = torch.stack([o.hidden_states[-1][i, mk[i]] for i in range(Bv)], dim=0)
+                    hs = shim(ha)
+                    v_loss += F.mse_loss(hs, ht).item()
+                    v_cos += F.cosine_similarity(hs.float().reshape(-1, args.teacher_dim),
+                                                  ht.float().reshape(-1, args.teacher_dim), dim=-1).mean().item()
                     nv += 1
+            # Average the per-batch metrics over the validation batches.
+            v_loss /= nv; v_cos /= nv
+            print(f"\n─── Val @ {global_step}: loss={v_loss:.5f} cos={v_cos:.4f} ───", flush=True)
+            # Keep only the shim weights with the lowest validation loss so far.
+            if v_loss < best_loss:
+                best_loss = v_loss
                 torch.save(shim.state_dict(), run_dir / "shim_best.pt")
+                print(f"  → Saved best (loss={v_loss:.5f})")
         if global_step % args.save_every == 0:
+            # Periodic snapshot, independent of the validation result.
+            d = run_dir / f"step_{global_step}"; d.mkdir(exist_ok=True)
+            torch.save(shim.state_dict(), d / "shim.pt")
     pbar.close()
+    print(f"\nDone! Best val loss: {best_loss:.5f}")
 if __name__ == "__main__":