atharva-pantheon
/

rltoken-encoder

+"""RL Token encoder-decoder for MolmoAct2 (RLT Stage 1) — PyTorch port.
+Faithful port of openpi's ``pi0_rl.py`` (Xu et al. 2025, "RL Tokens") to PyTorch
+for the frozen MolmoAct2 lerobot fork. Differences from my earlier
+``rlt_logit_autoencoder.py`` (which was wrong): that one MLP-reconstructed the
+2048-D action logits; THIS reconstructs the VLA's **per-token prefix hidden
+states** ``(M, dim)`` with a transformer encoder + autoregressive decoder, so
+the single ``z_rl`` token is forced to regenerate the whole prefix — the real
+RLT bottleneck, and what todo Phase 3 specifies.
+Design (matches the reference):
+  encoder: append a learned <rl> query to the prefix embeddings (b, M, dim),
+           run bidirectional pre-norm transformer blocks (RMSNorm + SwiGLU),
+           read the query position  ->  z_rl (b, dim).
+  decoder: autoregressive. input  [z_rl, z̄_1 … z̄_{M-1}], causal mask,
+           predict [z̄_1 … z̄_M]; output_proj.
+  loss:    per-token squared-L2 recon (sum over dim, masked mean over tokens),
+           targets stop-gradiented. VLA is a frozen server here, so there is no
+           L_vla term (alpha = 0): we only train the encoder/decoder.
+z_rl is full-dim (= dim), exactly like the reference — the bottleneck is the
+sequence compression (M tokens -> 1), not a narrow feature dim. Downstream SAC
+consumes z_rl as its (frozen) RLT state.
+"""
+from __future__ import annotations
+import math
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+@dataclass
+class RLTokenConfig:
+    dim: int = 2560          # MolmoAct2 VLM hidden width (cached embeddings are 2560-D)
+    num_layers: int = 2
+    num_heads: int = 8       # 2560 / 8 = 320 head_dim
+    mlp_dim: int = 8192
+class _Block(nn.Module):
+    """Pre-norm transformer block: MHA + SwiGLU FFN, RMSNorm. Matches the ref."""
+    def __init__(self, dim: int, num_heads: int, mlp_dim: int):
+        super().__init__()
+        assert dim % num_heads == 0, f"dim {dim} not divisible by num_heads {num_heads}"
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.attn_norm = nn.RMSNorm(dim)
+        self.q_proj = nn.Linear(dim, dim, bias=False)
+        self.k_proj = nn.Linear(dim, dim, bias=False)
+        self.v_proj = nn.Linear(dim, dim, bias=False)
+        self.o_proj = nn.Linear(dim, dim, bias=False)
+        self.ffn_norm = nn.RMSNorm(dim)
+        self.ffn_gate = nn.Linear(dim, mlp_dim, bias=False)
+        self.ffn_up = nn.Linear(dim, mlp_dim, bias=False)
+        self.ffn_down = nn.Linear(mlp_dim, dim, bias=False)
+    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor | None) -> torch.Tensor:
+        b, s, d = x.shape
+        h = self.attn_norm(x)
+        q = self.q_proj(h).view(b, s, self.num_heads, self.head_dim).transpose(1, 2)  # (b,nh,s,hd)
+        k = self.k_proj(h).view(b, s, self.num_heads, self.head_dim).transpose(1, 2)
+        v = self.v_proj(h).view(b, s, self.num_heads, self.head_dim).transpose(1, 2)
+        # attn_mask: (b, s, s) bool, True = attend. -> (b,1,s,s) for SDPA additive.
+        am = None
+        if attn_mask is not None:
+            am = torch.zeros(b, 1, s, s, dtype=x.dtype, device=x.device)
+            am = am.masked_fill(~attn_mask[:, None, :, :], float("-inf"))
+        attn = F.scaled_dot_product_attention(q, k, v, attn_mask=am)  # (b,nh,s,hd)
+        attn = attn.transpose(1, 2).reshape(b, s, d)
+        x = x + self.o_proj(attn)
+        h = self.ffn_norm(x)
+        x = x + self.ffn_down(F.silu(self.ffn_gate(h)) * self.ffn_up(h))
+        return x
+class RLTokenEncoder(nn.Module):
+    """Compress prefix embeddings (b, M, dim) -> z_rl (b, dim) via a learned query."""
+    def __init__(self, cfg: RLTokenConfig):
+        super().__init__()
+        self.rl_query = nn.Parameter(torch.randn(1, 1, cfg.dim) * 0.02)
+        self.layers = nn.ModuleList(_Block(cfg.dim, cfg.num_heads, cfg.mlp_dim) for _ in range(cfg.num_layers))
+    def forward(self, prefix: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+        b, m, d = prefix.shape
+        query = self.rl_query.expand(b, 1, d)
+        x = torch.cat([prefix, query], dim=1)  # (b, M+1, dim)
+        if mask is not None:
+            ext = torch.cat([mask, torch.ones(b, 1, dtype=torch.bool, device=mask.device)], dim=1)
+            attn_mask = ext[:, None, :] & ext[:, :, None]  # (b, M+1, M+1) bidirectional
+        else:
+            attn_mask = None
+        for layer in self.layers:
+            x = layer(x, attn_mask)
+        return x[:, -1, :]  # z_rl at the query position
+class RLTokenDecoder(nn.Module):
+    """Autoregressively reconstruct prefix embeddings from z_rl."""
+    def __init__(self, cfg: RLTokenConfig):
+        super().__init__()
+        self.layers = nn.ModuleList(_Block(cfg.dim, cfg.num_heads, cfg.mlp_dim) for _ in range(cfg.num_layers))
+        self.output_proj = nn.Linear(cfg.dim, cfg.dim, bias=False)
+    def forward(self, z_rl: torch.Tensor, target: torch.Tensor, mask: torch.Tensor | None = None,
+                context_dropout: float = 0.0) -> torch.Tensor:
+        # input [z_rl, z̄_1..z̄_{M-1}] -> predict [z̄_1..z̄_M]
+        b, m, d = target.shape
+        ctx = target[:, :-1, :]
+        # Context dropout (train only): randomly zero teacher-forced context tokens
+        # so the decoder cannot reconstruct purely from the true-previous-token leak
+        # and is forced to route information through z_rl. Off (0.0) = bare reference.
+        if self.training and context_dropout > 0.0:
+            keep = (torch.rand(b, m - 1, 1, device=target.device) >= context_dropout).to(target.dtype)
+            ctx = ctx * keep
+        dec_in = torch.cat([z_rl[:, None, :], ctx], dim=1)  # (b, M, dim)
+        causal = torch.tril(torch.ones(m, m, dtype=torch.bool, device=target.device))[None]  # (1,M,M)
+        if mask is not None:
+            key_valid = torch.cat([torch.ones(b, 1, dtype=torch.bool, device=mask.device), mask[:, :-1]], dim=1)
+            attn_mask = causal & key_valid[:, None, :]  # (b, M, M)
+        else:
+            attn_mask = causal.expand(b, m, m)
+        x = dec_in
+        for layer in self.layers:
+            x = layer(x, attn_mask)
+        return self.output_proj(x)
+class RLTokenAutoencoder(nn.Module):
+    """Encoder + decoder. forward() returns (z_rl, recon_loss) for training."""
+    def __init__(self, cfg: RLTokenConfig | None = None):
+        super().__init__()
+        self.cfg = cfg or RLTokenConfig()
+        self.encoder = RLTokenEncoder(self.cfg)
+        self.decoder = RLTokenDecoder(self.cfg)
+    def encode(self, prefix: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+        return self.encoder(prefix, mask)
+    def forward(self, prefix: torch.Tensor, mask: torch.Tensor | None = None, context_dropout: float = 0.0):
+        # Targets are stop-gradiented (frozen VLA features). detach() = jax.lax.stop_gradient.
+        target = prefix.detach()
+        z_rl = self.encoder(target, mask)
+        pred = self.decoder(z_rl, target, mask, context_dropout=context_dropout)
+        per_token = (pred - target).pow(2).sum(dim=-1)  # (b, M) squared-L2 per token
+        if mask is not None:
+            per_token = per_token * mask
+            denom = mask.sum(dim=1).clamp(min=1)
+            recon = (per_token.sum(dim=1) / denom)  # (b,)
+        else:
+            recon = per_token.mean(dim=1)
+        return z_rl, recon.mean()
+if __name__ == "__main__":
+    # Self-test on COMPRESSIBLE data: each sequence is a per-sample latent c
+    # broadcast across positions + a small FIXED positional pattern. So one z_rl
+    # can capture c. Fair ablation = FIRST-token recon: position 0 sees ONLY
+    # z_rl (no AR context), so it isolates whether z_rl carries information.
+    torch.manual_seed(0)
+    cfg = RLTokenConfig(dim=64, num_layers=2, num_heads=4, mlp_dim=128)  # tiny for CPU
+    ae = RLTokenAutoencoder(cfg)
+    opt = torch.optim.AdamW(ae.parameters(), lr=1e-3)
+    B, M = 32, 12
+    pos_pattern = torch.randn(M, cfg.dim) * 0.3  # fixed per-position offset
+    def batch():
+        c = torch.randn(B, cfg.dim)                       # per-sample latent
+        x = c[:, None, :] + pos_pattern[None]             # (B, M, dim), compressible
+        return x, torch.ones(B, M, dtype=torch.bool)
+    for step in range(600):
+        x, mask = batch()
+        z, loss = ae(x, mask)
+        opt.zero_grad(); loss.backward(); opt.step()
+        if step % 150 == 0 or step == 599:
+            print(f"step {step:3d}  recon={loss.item():.4f}")
+    ae.eval()
+    with torch.no_grad():
+        x, mask = batch()
+        z, _ = ae(x, mask)
+        def first_tok_err(zt):
+            pred = ae.decoder(zt, x, mask)
+            return (pred[:, 0] - x[:, 0]).pow(2).sum(-1).mean().item()  # token-0 only
+        real0 = first_tok_err(z)
+        zero0 = first_tok_err(torch.zeros_like(z))
+        shuf0 = first_tok_err(z[torch.randperm(B)])
+    print(f"first-token recon: real={real0:.3f}  zeroed={zero0:.3f}  shuffled={shuf0:.3f}")
+    ok = real0 < 0.3 * zero0 and real0 < 0.3 * shuf0
+    print("SELF-TEST:", "PASS ✅ (z_rl carries the prefix latent)" if ok else "FAIL ❌")