Upload folder using huggingface_hub
Browse files- configuration_actioncodec.py +228 -0
- modeling_actioncodec.py +541 -0
- modular_actioncodec.py +779 -0
- rvq.py +522 -0
configuration_actioncodec.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
from typing import Any, Dict
|
| 3 |
+
|
| 4 |
+
from transformers import AutoConfig, PretrainedConfig
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class ActionCodecConfig(PretrainedConfig):
    """Configuration for the :class:`ActionCodec` model.

    Bundles three groups of hyper-parameters:

    * ``embodiment_config`` — a mapping from embodiment name to a dict
      describing that robot's action space (``action_dim``, ``freq``,
      ``duration``, ``description``). When omitted, three reference
      embodiments (LIBERO Franka, Bridge WidowX, DROID Franka) are used.
    * encoder/decoder — Perceiver transformer sizes and options.
    * vq_* — vector-quantizer settings, shared by both ``"vq"`` and
      ``"rvq"`` quantizer types.

    Args:
        embodiment_config: Per-embodiment action-space descriptions, or
            ``None`` to use the built-in defaults. The dict is deep-copied
            so callers cannot mutate the config afterwards.
        n_tokens: Total number of discrete tokens per action sequence.
        n_quantizers: Number of (residual) quantizer levels.
        z_dim: Latent dimension fed to the quantizer.
        vq_type: ``"vq"`` (single codebook) or ``"rvq"`` (residual VQ).
        vq_codebook_size: Entries per codebook.
        vq_commitment_weight: Commitment-loss weight.
        vq_decay: EMA decay for codebook updates.
        vq_kmeans_init: Whether to initialize codebooks with k-means.
        vq_threshold_ema_dead_code: EMA count below which a code is
            considered dead and re-sampled.
        vq_quantizer_dropout: Quantizer dropout rate (RVQ only).
        encoder_dim / encoder_n_layers / encoder_n_heads: Encoder size.
        encoder_add_self_attn: Add self-attention blocks in the encoder.
        encoder_add_causal_mask: Apply a causal mask in the encoder.
        encoder_pos_encoding_type: Positional encoding type (e.g. "fourier").
        decoder_*: Same options for the decoder.
        decoder_cls_size: Number of CLS slots in the decoder.
        **kwargs: Forwarded to :class:`transformers.PretrainedConfig`.
    """

    model_type = "action_codec"

    def __init__(
        self,
        embodiment_config: Dict[str, Any] | None = None,
        n_tokens: int = 16,
        n_quantizers: int = 1,
        z_dim: int = 512,
        vq_type: str = "vq",
        vq_codebook_size: int = 2048,
        vq_commitment_weight: float = 0.25,
        vq_decay: float = 0.99,
        vq_kmeans_init: bool = True,
        vq_threshold_ema_dead_code: int = 2,
        vq_quantizer_dropout: float = 0.25,
        encoder_dim: int = 256,
        encoder_n_layers: int = 6,
        encoder_n_heads: int = 8,
        encoder_add_self_attn: bool = False,
        encoder_add_causal_mask: bool = False,
        encoder_pos_encoding_type: str = "fourier",
        decoder_dim: int = 256,
        decoder_n_layers: int = 6,
        decoder_n_heads: int = 8,
        decoder_add_self_attn: bool = False,
        decoder_add_causal_mask: bool = False,
        decoder_pos_encoding_type: str = "fourier",
        decoder_cls_size: int = 1,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if embodiment_config is None:
            default_config = {
                "franka_libero_20hz": {
                    "action_dim": 7,
                    "freq": 20,
                    "duration": 1,
                    "description": "20Hz 7-dim action for 1s. Delta eef position (xyz), orientation (rpy), and gripper position (1 open/0 close).",
                },
                "widowx_bridge_5hz": {
                    "action_dim": 7,
                    "freq": 5,
                    "duration": 1,
                    "description": "5Hz 7-dim action for 1s. Delta eef position (xyz), orientation (rpy), and gripper position (1 open/0 close).",
                },
                "franka_droid_15hz": {
                    "action_dim": 7,
                    "freq": 15,
                    "duration": 1,
                    "description": "15Hz 7-dim action for 1s. Delta eef position (xyz), orientation (rpy), and gripper position (1 open/0 close).",
                },
            }
            self.embodiment_config = copy.deepcopy(default_config)
        else:
            # Deep copy so later mutation of the caller's dict cannot
            # silently alter this config (and vice versa).
            self.embodiment_config = copy.deepcopy(embodiment_config)

        self.n_tokens = n_tokens
        self.n_quantizers = n_quantizers
        self.z_dim = z_dim

        self.encoder_dim = encoder_dim
        self.encoder_n_layers = encoder_n_layers
        self.encoder_n_heads = encoder_n_heads
        self.encoder_add_self_attn = encoder_add_self_attn
        self.encoder_add_causal_mask = encoder_add_causal_mask
        self.encoder_pos_encoding_type = encoder_pos_encoding_type

        self.decoder_dim = decoder_dim
        self.decoder_n_layers = decoder_n_layers
        self.decoder_n_heads = decoder_n_heads
        self.decoder_add_self_attn = decoder_add_self_attn
        self.decoder_add_causal_mask = decoder_add_causal_mask
        self.decoder_pos_encoding_type = decoder_pos_encoding_type
        self.decoder_cls_size = decoder_cls_size

        self.vq_type = vq_type
        self.vq_codebook_size = vq_codebook_size
        self.vq_commitment_weight = vq_commitment_weight
        self.vq_decay = vq_decay
        self.vq_kmeans_init = vq_kmeans_init
        self.vq_threshold_ema_dead_code = vq_threshold_ema_dead_code
        self.vq_quantizer_dropout = vq_quantizer_dropout
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class ActionCodecConfigOld(PretrainedConfig):
    """Legacy ActionCodec configuration.

    Kept for backward compatibility with older checkpoints: stores the
    dotted class paths of the encoder/decoder/VQ modules together with
    their constructor kwargs instead of flat hyper-parameters.
    """

    model_type = "action_codec"

    def __init__(
        self,
        horizon: int = 20,
        action_dim: int = 7,
        action_encoding: str = "independent_v2",
        horizon_patch_size: int = 1,
        encoder_class: str = "action_codec.modules.perceiver.PerceiverEncoder",
        decoder_class: str = "action_codec.modules.perceiver.PerceiverDecoder",
        vq_class: str = "vector_quantize_pytorch.VectorQuantize",
        encoder_kwargs: Dict[str, Any] = None,
        decoder_kwargs: Dict[str, Any] = None,
        vq_kwargs: Dict[str, Any] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.horizon = horizon
        self.action_dim = action_dim
        self.action_encoding = action_encoding
        self.horizon_patch_size = horizon_patch_size
        self.encoder_class = encoder_class
        self.decoder_class = decoder_class
        self.vq_class = vq_class

        # Fall back to the historical default hyper-parameters whenever a
        # kwargs dict is not supplied; otherwise take a shallow copy so the
        # caller's dict is not shared with this config.
        if encoder_kwargs is None:
            self.encoder_kwargs = {
                "dim": 384,
                "in_len": horizon,
                "out_len": 16,
                "num_layers": 12,
                "num_heads": 4,
                "output_round": -1.0,
            }
        else:
            self.encoder_kwargs = dict(encoder_kwargs)

        if decoder_kwargs is None:
            self.decoder_kwargs = {
                "dim": 384,
                "in_len": 16,
                "out_len": horizon,
                "num_layers": 12,
                "num_heads": 4,
            }
        else:
            self.decoder_kwargs = dict(decoder_kwargs)

        if vq_kwargs is None:
            self.vq_kwargs = {
                "dim": 512,
                "codebook_size": 2048,
                "kmeans_init": True,
                "kmeans_iters": 10,
                "decay": 0.99,
                "commitment_weight": 0.25,
                "rotation_trick": False,
                "threshold_ema_dead_code": 2,
                "use_cosine_sim": False,
                "codebook_diversity_loss_weight": 0.0,
            }
        else:
            self.vq_kwargs = dict(vq_kwargs)
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class BPEActionCodecConfig(PretrainedConfig):
    """Configuration for the BPE variant of the action codec.

    Mirrors :class:`ActionCodecConfigOld`: component classes are referenced
    by dotted path and configured through per-component kwargs dicts.
    """

    model_type = "bpe_action_codec"

    def __init__(
        self,
        horizon: int = 20,
        action_dim: int = 7,
        action_encoding: str = "independent_v2",
        horizon_patch_size: int = 1,
        encoder_class: str = "action_codec.modules.perceiver.PerceiverEncoder",
        decoder_class: str = "action_codec.modules.perceiver.PerceiverDecoder",
        vq_class: str = "vector_quantize_pytorch.VectorQuantize",
        encoder_kwargs: Dict[str, Any] = None,
        decoder_kwargs: Dict[str, Any] = None,
        vq_kwargs: Dict[str, Any] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.horizon = horizon
        self.action_dim = action_dim
        self.action_encoding = action_encoding
        self.horizon_patch_size = horizon_patch_size
        self.encoder_class = encoder_class
        self.decoder_class = decoder_class
        self.vq_class = vq_class

        # Historical defaults, used only when the corresponding kwargs dict
        # is not provided by the caller.
        default_encoder_kwargs = {
            "dim": 384,
            "in_len": horizon,
            "out_len": 16,
            "num_layers": 12,
            "num_heads": 4,
            "output_round": -1.0,
        }
        default_decoder_kwargs = {
            "dim": 384,
            "in_len": 16,
            "out_len": horizon,
            "num_layers": 12,
            "num_heads": 4,
        }
        default_vq_kwargs = {
            "dim": 512,
            "codebook_size": 2048,
            "kmeans_init": True,
            "kmeans_iters": 10,
            "decay": 0.99,
            "commitment_weight": 0.25,
            "rotation_trick": False,
            "threshold_ema_dead_code": 2,
            "use_cosine_sim": False,
            "codebook_diversity_loss_weight": 0.0,
        }

        # Copy caller-supplied dicts so they are not shared with the config.
        self.encoder_kwargs = default_encoder_kwargs if encoder_kwargs is None else dict(encoder_kwargs)
        self.decoder_kwargs = default_decoder_kwargs if decoder_kwargs is None else dict(decoder_kwargs)
        self.vq_kwargs = default_vq_kwargs if vq_kwargs is None else dict(vq_kwargs)
| 223 |
+
|
| 224 |
+
|
| 225 |
+
# Register the custom model types with the HF Auto machinery so that
# AutoConfig.from_pretrained(...) can resolve them by their model_type.
AutoConfig.register("action_codec", ActionCodecConfig)
AutoConfig.register("bpe_action_codec", BPEActionCodecConfig)

__all__ = ["ActionCodecConfig", "BPEActionCodecConfig"]
|
modeling_actioncodec.py
ADDED
|
@@ -0,0 +1,541 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
|
| 3 |
+
import einops
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import AutoModel, PreTrainedModel
|
| 7 |
+
from vector_quantize_pytorch import VectorQuantize
|
| 8 |
+
|
| 9 |
+
from .configuration_actioncodec import ActionCodecConfig
|
| 10 |
+
from .modular_actioncodec import PerceiverDecoder, PerceiverEncoder
|
| 11 |
+
from .rvq import ResidualVectorQuantize
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def trim_trailing_zeros(arr: np.ndarray) -> list[list]:
    """Strip trailing zeros from every row of a 2-D array.

    Args:
        arr: Array of shape (b, n), e.g. zero-padded token-id sequences.

    Returns:
        A list of b Python lists; row i is truncated just after its last
        non-zero element. A row containing only zeros becomes ``[]``; an
        array with zero rows yields ``[]``.
    """
    if arr.shape[0] == 0:
        return []

    n = arr.shape[1]

    is_nonzero = arr != 0
    # argmax over the reversed mask finds the first True from the right,
    # i.e. the position of the last non-zero element of each row.
    flipped_mask = np.flip(is_nonzero, axis=1)
    last_nonzero_indices = n - 1 - np.argmax(flipped_mask, axis=1)
    # For an all-zero row argmax returns 0, which would wrongly keep the
    # whole row; multiplying by the any-nonzero mask forces its length to 0.
    any_nonzero_in_row = is_nonzero.any(axis=1)
    new_lengths = (last_nonzero_indices + 1) * any_nonzero_in_row

    return [arr[i, :length].tolist() for i, length in enumerate(new_lengths)]
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ActionCodec(PreTrainedModel):
|
| 31 |
+
config_class = ActionCodecConfig
|
| 32 |
+
|
| 33 |
+
def __init__(self, config: ActionCodecConfig):
|
| 34 |
+
super().__init__(config)
|
| 35 |
+
self.default_embodiment_id = 0
|
| 36 |
+
|
| 37 |
+
self.encoder = PerceiverEncoder(config)
|
| 38 |
+
self.decoder = PerceiverDecoder(config)
|
| 39 |
+
|
| 40 |
+
if config.vq_type == "vq":
|
| 41 |
+
assert config.n_quantizers == 1, "Only one quantizer is supported for VQ"
|
| 42 |
+
self.vq = VectorQuantize(
|
| 43 |
+
dim=config.z_dim,
|
| 44 |
+
codebook_size=config.vq_codebook_size,
|
| 45 |
+
commitment_weight=config.vq_commitment_weight,
|
| 46 |
+
decay=config.vq_decay,
|
| 47 |
+
kmeans_init=config.vq_kmeans_init,
|
| 48 |
+
threshold_ema_dead_code=config.vq_threshold_ema_dead_code,
|
| 49 |
+
rotation_trick=False,
|
| 50 |
+
straight_through=True,
|
| 51 |
+
)
|
| 52 |
+
elif config.vq_type == "rvq":
|
| 53 |
+
assert config.n_quantizers > 1, "At least two quantizers are supported for RVQ"
|
| 54 |
+
self.vq = ResidualVectorQuantize(
|
| 55 |
+
dim=config.z_dim,
|
| 56 |
+
n_codebooks=config.n_quantizers,
|
| 57 |
+
codebook_size=config.vq_codebook_size,
|
| 58 |
+
codebook_dim=config.z_dim,
|
| 59 |
+
quantizer_dropout=config.vq_quantizer_dropout,
|
| 60 |
+
commitment=config.vq_commitment_weight,
|
| 61 |
+
)
|
| 62 |
+
else:
|
| 63 |
+
raise NotImplementedError(f"VQ type {config.vq_type} not implemented")
|
| 64 |
+
|
| 65 |
+
self.vocab_size = config.vq_codebook_size
|
| 66 |
+
self.num_quantizers = config.n_quantizers
|
| 67 |
+
self.n_tokens_per_quantizer = config.n_tokens // config.n_quantizers
|
| 68 |
+
|
| 69 |
+
def expand_embodiment(self, embodiment_config: dict):
|
| 70 |
+
"""
|
| 71 |
+
Delegates expansion to the underlying Encoder and Decoder.
|
| 72 |
+
This allows the Codec to adapt to new robots dynamically.
|
| 73 |
+
"""
|
| 74 |
+
self.encoder.expand_embodiment(embodiment_config)
|
| 75 |
+
self.decoder.expand_embodiment(embodiment_config)
|
| 76 |
+
self.config.embodiment_config.update(embodiment_config)
|
| 77 |
+
return self
|
| 78 |
+
|
| 79 |
+
def _encode(
|
| 80 |
+
self,
|
| 81 |
+
x: torch.Tensor,
|
| 82 |
+
embodiment_ids: torch.Tensor | int | None = None,
|
| 83 |
+
padding_mask: torch.Tensor | None = None,
|
| 84 |
+
) -> torch.Tensor:
|
| 85 |
+
"""Encode action sequences into latent representations.
|
| 86 |
+
|
| 87 |
+
Args:
|
| 88 |
+
x (torch.Tensor): Action sequences to encode. Shape: (b, seq_len, max_action_dim).
|
| 89 |
+
Assumes that the action dimension is zero-padded to the max action dimension.
|
| 90 |
+
`seq_len` is supposed to be `int(duration * freq)` for each embodiment and padded to the max sequence length.
|
| 91 |
+
embodiment_ids (torch.Tensor | int): Embodiment IDs. Shape: (b,).
|
| 92 |
+
If int, the same embodiment ID is repeated for all sequences in the batch.
|
| 93 |
+
It specifies the embodiment to encode.
|
| 94 |
+
padding_mask (Optional[torch.Tensor], optional): Padding mask, where `False` values indicate padding. Shape: (b, seq_len). Defaults to None.
|
| 95 |
+
It is used to mask the padding tokens on `seq_len` dimension.
|
| 96 |
+
|
| 97 |
+
Returns:
|
| 98 |
+
torch.Tensor: Encoded latent representations. Shape: (b, n_tokens_per_quantizer, z_dim).
|
| 99 |
+
"""
|
| 100 |
+
embodiment_ids = embodiment_ids if embodiment_ids is not None else self.default_embodiment_id
|
| 101 |
+
z_e = self.encoder(x, embodiment_ids, padding_mask)
|
| 102 |
+
return z_e
|
| 103 |
+
|
| 104 |
+
def _quantize(self, z_e: torch.Tensor, return_perplexity: bool = True) -> List[torch.Tensor]:
|
| 105 |
+
if isinstance(self.vq, ResidualVectorQuantize):
|
| 106 |
+
z_q, indices, _, commitment_loss, codebook_loss = self.vq(z_e)
|
| 107 |
+
commit_loss = commitment_loss.mean() + codebook_loss.mean()
|
| 108 |
+
elif isinstance(self.vq, VectorQuantize):
|
| 109 |
+
z_q, indices, commit_loss = self.vq(z_e)
|
| 110 |
+
else:
|
| 111 |
+
raise NotImplementedError(f"VQ type {type(self.vq)} not implemented")
|
| 112 |
+
|
| 113 |
+
if return_perplexity:
|
| 114 |
+
if len(indices.size()) < 3:
|
| 115 |
+
indices = indices.unsqueeze(-1)
|
| 116 |
+
perplexity = []
|
| 117 |
+
for k in range(indices.size(-1)):
|
| 118 |
+
this_indices = indices[:, :, k]
|
| 119 |
+
indices_count = torch.bincount(this_indices.view(-1), minlength=self.vq.codebook_size)
|
| 120 |
+
if torch.distributed.is_initialized() and torch.distributed.get_world_size() > 1:
|
| 121 |
+
torch.distributed.all_reduce(indices_count)
|
| 122 |
+
this_avg_probs = indices_count.float() / indices_count.sum()
|
| 123 |
+
perplexity.append(((-(this_avg_probs * torch.log(this_avg_probs + 1e-10)).sum()).exp().item()))
|
| 124 |
+
else:
|
| 125 |
+
perplexity = 0
|
| 126 |
+
|
| 127 |
+
return z_q, indices, perplexity, commit_loss
|
| 128 |
+
|
| 129 |
+
def _dequantize(self, indices: torch.Tensor) -> torch.Tensor:
|
| 130 |
+
if self.num_quantizers == 1:
|
| 131 |
+
if len(indices.size()) == 3:
|
| 132 |
+
indices = indices.squeeze(-1)
|
| 133 |
+
if isinstance(self.vq, ResidualVectorQuantize):
|
| 134 |
+
z_q = self.vq.from_codes(indices)[0]
|
| 135 |
+
else:
|
| 136 |
+
z_q = self.vq.get_output_from_indices(indices)
|
| 137 |
+
return z_q
|
| 138 |
+
|
| 139 |
+
def _decode(
|
| 140 |
+
self, z_q: torch.Tensor, embodiment_ids: torch.Tensor | int | None = None, durations: torch.Tensor | None = None
|
| 141 |
+
) -> torch.Tensor:
|
| 142 |
+
embodiment_ids = embodiment_ids if embodiment_ids is not None else self.default_embodiment_id
|
| 143 |
+
x_recon, padding_mask = self.decoder(z_q, embodiment_ids, durations)
|
| 144 |
+
return x_recon, padding_mask
|
| 145 |
+
|
| 146 |
+
@torch.no_grad()
|
| 147 |
+
def encode(
|
| 148 |
+
self,
|
| 149 |
+
x: np.ndarray,
|
| 150 |
+
embodiment_ids: List[int] | int | None = None,
|
| 151 |
+
padding_mask: List[bool] | None = None,
|
| 152 |
+
) -> List[List[int]]:
|
| 153 |
+
"""Encode action sequences into latent representations.
|
| 154 |
+
|
| 155 |
+
Args:
|
| 156 |
+
x (np.ndarray): Action sequences to encode. Shape: (b, seq_len, max_action_dim).
|
| 157 |
+
Assumes that the action dimension is zero-padded to the max action dimension.
|
| 158 |
+
`seq_len` is supposed to be `int(duration * freq)` for each embodiment and padded to the max sequence length.
|
| 159 |
+
embodiment_ids (List[int] | int): Embodiment IDs. Shape: (b,).
|
| 160 |
+
If int, the same embodiment ID is repeated for all sequences in the batch.
|
| 161 |
+
It specifies the embodiment to encode.
|
| 162 |
+
padding_mask (List[bool] | None): Padding mask, where `False` values indicate padding. Shape: (b, seq_len). Defaults to None.
|
| 163 |
+
It is used to mask the padding tokens on `seq_len` dimension.
|
| 164 |
+
|
| 165 |
+
Returns:
|
| 166 |
+
List[List[int]]: List of token sequences. Shape: (b, n_tokens).
|
| 167 |
+
"""
|
| 168 |
+
self.eval()
|
| 169 |
+
embodiment_ids = embodiment_ids if embodiment_ids is not None else self.default_embodiment_id
|
| 170 |
+
|
| 171 |
+
with torch.no_grad():
|
| 172 |
+
x_tensor = torch.tensor(x, dtype=self.dtype, device=self.device)
|
| 173 |
+
if not isinstance(embodiment_ids, int):
|
| 174 |
+
embodiment_ids = torch.tensor(embodiment_ids, dtype=torch.long, device=self.device)
|
| 175 |
+
if padding_mask is not None:
|
| 176 |
+
padding_mask = torch.tensor(padding_mask, dtype=torch.bool, device=self.device)
|
| 177 |
+
|
| 178 |
+
z_e = self._encode(x_tensor, embodiment_ids, padding_mask)
|
| 179 |
+
_, indices, _, _ = self._quantize(z_e, return_perplexity=False)
|
| 180 |
+
if len(indices.size()) > 2:
|
| 181 |
+
codes_list = einops.rearrange(indices, "b n s -> b (s n)").cpu()
|
| 182 |
+
else:
|
| 183 |
+
codes_list = indices.cpu()
|
| 184 |
+
codes_list = codes_list.tolist()
|
| 185 |
+
return codes_list
|
| 186 |
+
|
| 187 |
+
@torch.no_grad()
|
| 188 |
+
def decode(
|
| 189 |
+
self, tokens: List[List[int]], embodiment_ids: List[int] | int | None = None, durations: List[float] | None = None
|
| 190 |
+
) -> np.ndarray:
|
| 191 |
+
self.eval()
|
| 192 |
+
embodiment_ids = embodiment_ids if embodiment_ids is not None else self.default_embodiment_id
|
| 193 |
+
tokens = torch.tensor(tokens, dtype=torch.long, device=self.device)
|
| 194 |
+
if not isinstance(embodiment_ids, int):
|
| 195 |
+
embodiment_ids = torch.tensor(embodiment_ids, dtype=torch.long, device=self.device)
|
| 196 |
+
if durations is not None:
|
| 197 |
+
durations = torch.tensor(durations, dtype=torch.float32, device=self.device)
|
| 198 |
+
|
| 199 |
+
b, n = tokens.shape
|
| 200 |
+
assert n % self.n_tokens_per_quantizer == 0, (
|
| 201 |
+
f"Expected {self.n_tokens_per_quantizer} tokens per quantizer, got {n} in total."
|
| 202 |
+
)
|
| 203 |
+
indices = einops.rearrange(tokens, "b (n m) -> b m n", m=self.n_tokens_per_quantizer)
|
| 204 |
+
z_q = self._dequantize(indices)
|
| 205 |
+
x_recon, padding_mask = self._decode(z_q, embodiment_ids, durations)
|
| 206 |
+
return x_recon.cpu().numpy(), padding_mask.cpu().numpy()
|
| 207 |
+
|
| 208 |
+
# def sparse_encode(
|
| 209 |
+
# self,
|
| 210 |
+
# x: np.ndarray,
|
| 211 |
+
# search_num: int = 10,
|
| 212 |
+
# threshold: float = 0.1,
|
| 213 |
+
# action_encoding: str | None = None,
|
| 214 |
+
# remove_padding: bool = True,
|
| 215 |
+
# ) -> List[List[int]]:
|
| 216 |
+
# """
|
| 217 |
+
# Sparse encoding with adaptive token selection based on reconstruction error threshold.
|
| 218 |
+
# Uses quaternary search to find optimal token length.
|
| 219 |
+
|
| 220 |
+
# Args:
|
| 221 |
+
# x: Input action arrays of shape (b, n, d)
|
| 222 |
+
# search_num: Maximum number of search iterations
|
| 223 |
+
# threshold: Reconstruction error threshold
|
| 224 |
+
# action_encoding: Action encoding type
|
| 225 |
+
# remove_padding: Whether to remove trailing zeros
|
| 226 |
+
|
| 227 |
+
# Returns:
|
| 228 |
+
# List of sparse token sequences
|
| 229 |
+
# """
|
| 230 |
+
# self.eval()
|
| 231 |
+
# with torch.no_grad():
|
| 232 |
+
# x_tensor = self._numpy_to_tensor(x)
|
| 233 |
+
|
| 234 |
+
# # Get initial encoding
|
| 235 |
+
# z_e = self._encode(x_tensor, action_encoding)
|
| 236 |
+
# _, indices, _, _ = self._quantize(z_e, return_perplexity=False)
|
| 237 |
+
|
| 238 |
+
# # Convert indices to proper format
|
| 239 |
+
# if len(indices.size()) > 2:
|
| 240 |
+
# indices_flat = einops.rearrange(indices, "b n s -> b (s n)")
|
| 241 |
+
# else:
|
| 242 |
+
# indices_flat = indices
|
| 243 |
+
|
| 244 |
+
# # Use quaternary search to find optimal token lengths
|
| 245 |
+
# optimal_lengths = self._quaternary_search(x_tensor, indices_flat, threshold, search_num, action_encoding)
|
| 246 |
+
|
| 247 |
+
# # Create final sparse tokens based on optimal lengths
|
| 248 |
+
# final_tokens = self._create_sparse_tokens_from_lengths(indices_flat, optimal_lengths)
|
| 249 |
+
|
| 250 |
+
# # Convert to list format
|
| 251 |
+
# if remove_padding:
|
| 252 |
+
# final_tokens = trim_trailing_zeros(final_tokens.cpu().numpy())
|
| 253 |
+
# else:
|
| 254 |
+
# final_tokens = final_tokens.cpu().tolist()
|
| 255 |
+
|
| 256 |
+
# return final_tokens
|
| 257 |
+
|
| 258 |
+
# def _quaternary_search(
|
| 259 |
+
# self,
|
| 260 |
+
# x_tensor: torch.Tensor,
|
| 261 |
+
# indices_flat: torch.Tensor,
|
| 262 |
+
# threshold: float,
|
| 263 |
+
# search_num: int,
|
| 264 |
+
# action_encoding: str | None = None,
|
| 265 |
+
# ) -> torch.Tensor:
|
| 266 |
+
# """
|
| 267 |
+
# Quaternary search to find optimal token lengths for each batch item.
|
| 268 |
+
# Returns tensor of shape (batch_size,) containing optimal lengths.
|
| 269 |
+
# """
|
| 270 |
+
# batch_size, seq_len = indices_flat.shape
|
| 271 |
+
|
| 272 |
+
# # Initialize search bounds
|
| 273 |
+
# device = indices_flat.device
|
| 274 |
+
# left = torch.ones(batch_size, dtype=torch.long, device=device)
|
| 275 |
+
# right = torch.full((batch_size,), seq_len, dtype=torch.long, device=device)
|
| 276 |
+
|
| 277 |
+
# # Perform quaternary search
|
| 278 |
+
# for _ in range(search_num):
|
| 279 |
+
# # Calculate three division points
|
| 280 |
+
# range_size = right - left
|
| 281 |
+
# q1 = left + range_size // 4
|
| 282 |
+
# q2 = left + range_size // 2
|
| 283 |
+
# q3 = left + 3 * range_size // 4
|
| 284 |
+
|
| 285 |
+
# # Ensure q1, q2, q3 are within bounds and distinct
|
| 286 |
+
# q1 = torch.clamp(q1, left, right)
|
| 287 |
+
# q2 = torch.clamp(q2, q1 + 1, right)
|
| 288 |
+
# q3 = torch.clamp(q3, q2 + 1, right)
|
| 289 |
+
|
| 290 |
+
# # Create test lengths: [left, q1, q2, q3, right]
|
| 291 |
+
# test_lengths = torch.stack([left, q1, q2, q3, right], dim=1) # (batch_size, 5)
|
| 292 |
+
|
| 293 |
+
# # Calculate errors for all test lengths
|
| 294 |
+
# errors = self._calculate_errors_for_lengths(x_tensor, indices_flat, test_lengths, action_encoding)
|
| 295 |
+
|
| 296 |
+
# # Update search bounds based on results (vectorized)
|
| 297 |
+
# # Find which lengths meet threshold for each batch item
|
| 298 |
+
# meets_threshold = errors <= threshold
|
| 299 |
+
|
| 300 |
+
# # For each batch item, find the smallest length that meets threshold
|
| 301 |
+
# valid_indices = torch.argmax(meets_threshold.float(), dim=1) # First True index
|
| 302 |
+
# has_valid = meets_threshold.any(dim=1) # Whether any length meets threshold
|
| 303 |
+
|
| 304 |
+
# # Create batch indices for advanced indexing
|
| 305 |
+
# batch_indices = torch.arange(batch_size, device=device)
|
| 306 |
+
|
| 307 |
+
# # Get the smallest valid length for each batch
|
| 308 |
+
# smallest_valid_lengths = test_lengths[batch_indices, valid_indices]
|
| 309 |
+
|
| 310 |
+
# # Update bounds based on results
|
| 311 |
+
# # If has valid length, use it; otherwise use longest length
|
| 312 |
+
# right = torch.where(has_valid, smallest_valid_lengths, test_lengths[:, -1])
|
| 313 |
+
|
| 314 |
+
# # Update left bound: if we found a valid length and it's not the first one,
|
| 315 |
+
# # use the previous length; otherwise keep current left
|
| 316 |
+
# prev_lengths = torch.where(valid_indices > 0, test_lengths[batch_indices, valid_indices - 1], left)
|
| 317 |
+
# left = torch.where(has_valid & (valid_indices > 0), prev_lengths, left)
|
| 318 |
+
|
| 319 |
+
# # Check convergence
|
| 320 |
+
# if (right - left).max() <= 1:
|
| 321 |
+
# break
|
| 322 |
+
|
| 323 |
+
# return right # Return optimal lengths
|
| 324 |
+
|
| 325 |
+
# def _calculate_errors_for_lengths(
|
| 326 |
+
# self,
|
| 327 |
+
# x_tensor: torch.Tensor,
|
| 328 |
+
# indices_flat: torch.Tensor,
|
| 329 |
+
# test_lengths: torch.Tensor,
|
| 330 |
+
# action_encoding: str | None = None,
|
| 331 |
+
# ) -> torch.Tensor:
|
| 332 |
+
# """
|
| 333 |
+
# Calculate reconstruction errors for given token lengths.
|
| 334 |
+
|
| 335 |
+
# Args:
|
| 336 |
+
# x_tensor: Original input tensor (batch_size, ...)
|
| 337 |
+
# indices_flat: Full token indices (batch_size, seq_len)
|
| 338 |
+
# test_lengths: Test lengths tensor (batch_size, num_tests)
|
| 339 |
+
# action_encoding: Action encoding type
|
| 340 |
+
|
| 341 |
+
# Returns:
|
| 342 |
+
# Error tensor (batch_size, num_tests)
|
| 343 |
+
# """
|
| 344 |
+
# # Create sparse tokens for all test lengths (vectorized)
|
| 345 |
+
# batch_size, num_tests = test_lengths.shape
|
| 346 |
+
# seq_len = indices_flat.shape[1]
|
| 347 |
+
# device = indices_flat.device
|
| 348 |
+
|
| 349 |
+
# # Create position tensor for all combinations
|
| 350 |
+
# positions = torch.arange(seq_len, device=device).unsqueeze(0).unsqueeze(0) # (1, 1, seq_len)
|
| 351 |
+
# positions = positions.expand(batch_size, num_tests, -1) # (batch_size, num_tests, seq_len)
|
| 352 |
+
|
| 353 |
+
# # Create length mask: positions < test_lengths
|
| 354 |
+
# length_mask = positions < test_lengths.unsqueeze(2) # (batch_size, num_tests, seq_len)
|
| 355 |
+
|
| 356 |
+
# # Create sparse tokens using advanced indexing
|
| 357 |
+
# sparse_tokens = torch.where(
|
| 358 |
+
# length_mask,
|
| 359 |
+
# indices_flat.unsqueeze(1).expand(-1, num_tests, -1),
|
| 360 |
+
# torch.zeros_like(indices_flat).unsqueeze(1).expand(-1, num_tests, -1),
|
| 361 |
+
# )
|
| 362 |
+
|
| 363 |
+
# # Reshape for parallel processing
|
| 364 |
+
# sparse_flat = sparse_tokens.view(batch_size * num_tests, seq_len)
|
| 365 |
+
|
| 366 |
+
# # Decode all sparse tokens in parallel
|
| 367 |
+
# reconstructed_flat = self._decode_sparse_tokens(sparse_flat, action_encoding)
|
| 368 |
+
|
| 369 |
+
# # Reshape back and calculate errors
|
| 370 |
+
# reconstructed = reconstructed_flat.view(batch_size, num_tests, *x_tensor.shape[1:])
|
| 371 |
+
|
| 372 |
+
# # Calculate errors
|
| 373 |
+
# x_expanded = x_tensor.unsqueeze(1).expand(-1, num_tests, -1, -1)
|
| 374 |
+
# errors = (x_expanded - reconstructed).abs().mean((-1, -2)) # (batch_size, num_tests)
|
| 375 |
+
|
| 376 |
+
# return errors
|
| 377 |
+
|
| 378 |
+
# def _decode_sparse_tokens(self, sparse_tokens: torch.Tensor, action_encoding: str | None = None) -> torch.Tensor:
|
| 379 |
+
# """Decode sparse tokens to reconstructed data."""
|
| 380 |
+
# batch_size, seq_len = sparse_tokens.shape
|
| 381 |
+
|
| 382 |
+
# # Convert to proper indices format for dequantization
|
| 383 |
+
# if self.num_quantizers > 1:
|
| 384 |
+
# seq_len_per_quantizer = seq_len // self.num_quantizers
|
| 385 |
+
# if seq_len % self.num_quantizers != 0:
|
| 386 |
+
# raise ValueError("Sequence length must be divisible by num_quantizers")
|
| 387 |
+
|
| 388 |
+
# indices_for_decode = sparse_tokens.view(batch_size, self.num_quantizers, seq_len_per_quantizer).transpose(
|
| 389 |
+
# 1, 2
|
| 390 |
+
# ) # (batch_size, seq_len_per_quantizer, num_quantizers)
|
| 391 |
+
# else:
|
| 392 |
+
# indices_for_decode = sparse_tokens.unsqueeze(-1) # (batch_size, seq_len, 1)
|
| 393 |
+
|
| 394 |
+
# # Dequantize and decode
|
| 395 |
+
# z_q = self._dequantize(indices_for_decode)
|
| 396 |
+
# reconstructed = self._decode(z_q, action_encoding)
|
| 397 |
+
|
| 398 |
+
# return reconstructed
|
| 399 |
+
|
| 400 |
+
# def _create_sparse_tokens_from_lengths(
|
| 401 |
+
# self, indices_flat: torch.Tensor, optimal_lengths: torch.Tensor
|
| 402 |
+
# ) -> torch.Tensor:
|
| 403 |
+
# """Create sparse tokens based on optimal lengths (vectorized)."""
|
| 404 |
+
# batch_size, seq_len = indices_flat.shape
|
| 405 |
+
# device = indices_flat.device
|
| 406 |
+
|
| 407 |
+
# # Create position mask for all batch items simultaneously
|
| 408 |
+
# positions = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, -1) # (batch_size, seq_len)
|
| 409 |
+
# length_mask = positions < optimal_lengths.unsqueeze(1) # (batch_size, seq_len)
|
| 410 |
+
|
| 411 |
+
# # Apply mask to create sparse tokens
|
| 412 |
+
# result = torch.where(length_mask, indices_flat, torch.zeros_like(indices_flat))
|
| 413 |
+
|
| 414 |
+
# return result
|
| 415 |
+
|
| 416 |
+
def forward(self, x: torch.Tensor, embodiment_ids: int | None = None, padding_mask: List[bool] | None = None):
    """Tokenize action sequences by delegating to :meth:`encode`.

    Calling the module directly is therefore equivalent to calling ``encode``;
    all arguments are forwarded unchanged.

    Args:
        x: Action sequences to encode.
        embodiment_ids: Embodiment id(s) selecting which robot config applies.
        padding_mask: Per-timestep validity mask (``False`` marks padding).

    Returns:
        Whatever :meth:`encode` returns (the discrete action codes).
    """
    return self.encode(x, embodiment_ids, padding_mask)
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
# Register with transformers' Auto API so `AutoModel.from_config(ActionCodecConfig(...))`
# and `AutoModel.from_pretrained(...)` resolve to ActionCodec.
AutoModel.register(ActionCodecConfig, ActionCodec)

# Public API of this module.
__all__ = ["ActionCodec"]
|
| 423 |
+
|
| 424 |
+
|
| 425 |
+
# Smoke test: exercises encode/decode, dynamic embodiment expansion, and
# mixed-embodiment batches. Run as a script; raises AssertionError on failure.
if __name__ == "__main__":
    print("=== ActionCodec Comprehensive Test ===\n")

    # 1. Configuration Setup (RVQ enabled with n_quantizers=4)
    initial_config = {
        "robot_A": {"action_dim": 7, "freq": 10, "duration": 1, "description": "Robot A"},
    }

    # We set n_quantizers=4 to test Residual VQ logic
    config = ActionCodecConfig(
        embodiment_config=initial_config,
        n_tokens=16,  # Total tokens per sequence (latent_len * n_quantizers)
        n_quantizers=4,  # RVQ depth
        vq_type="rvq",
        vq_codebook_size=256,
        encoder_dim=128,
        decoder_dim=128,
    )

    # Expected latent sequence length = n_tokens / n_quantizers = 16 / 4 = 4
    latent_seq_len = int(config.n_tokens // config.n_quantizers)
    print(f"Config: {config.n_quantizers} quantizers, {latent_seq_len} latent vectors per sequence.")

    codec = ActionCodec(config)
    codec.eval()  # inference mode: disables dropout etc.

    # 2. Basic Encode/Decode Test
    print("\n--- Test 1: Basic Encode/Decode ---")
    batch_size = 2
    seq_len_A = 10  # 10Hz * 1s

    # Create random action data for Robot A (ID 0)
    x = np.random.randn(batch_size, seq_len_A, 7).astype(np.float32)
    # Masking: Second item in batch is half padding
    padding_mask = np.ones((batch_size, seq_len_A), dtype=bool)
    padding_mask[1, 5:] = False

    embodiment_ids = [0, 0]

    # Encode
    codes = codec.encode(x, embodiment_ids, padding_mask)
    print(f"Encoded codes shape (list length): {len(codes)} x {len(codes[0])}")

    # Validate code length
    assert len(codes[0]) == config.n_tokens, f"Expected {config.n_tokens} tokens, got {len(codes[0])}"

    # Decode
    x_recon, recon_mask = codec.decode(codes, embodiment_ids)
    print(f"Reconstructed shape: {x_recon.shape}")
    print(f"Recon mask shape: {recon_mask.shape}")

    assert x_recon.shape == (batch_size, seq_len_A, 7)  # Should imply zero-padding to max dim 7

    # 3. Expansion Test
    print("\n--- Test 2: Dynamic Expansion ---")
    new_robot_config = {"robot_B": {"action_dim": 10, "freq": 20, "duration": 1, "description": "Robot B (Larger)"}}

    print("Expanding codec to include Robot B (10 dims, 20Hz)...")
    codec.expand_embodiment(new_robot_config)

    # Both encoder and decoder must now accept the larger action dimension.
    assert codec.encoder.max_action_dim == 10
    assert codec.decoder.max_action_dim == 10
    print("✅ Expansion successful.")

    # 4. Mixed Batch Test (Old + New Robot)
    print("\n--- Test 3: Mixed Batch Inference ---")

    # Batch: [Robot A, Robot B]
    # Robot A: 10Hz, 1s -> 10 steps. Dims 7.
    # Robot B: 20Hz, 1s -> 20 steps. Dims 10.
    # Batch Max Steps: 20. Batch Max Dims: 10.

    batch_x_mixed = np.zeros((2, 20, 10), dtype=np.float32)

    # Fill Robot A data (index 0)
    data_A = np.random.randn(10, 7)
    batch_x_mixed[0, :10, :7] = data_A

    # Fill Robot B data (index 1)
    data_B = np.random.randn(20, 10)
    batch_x_mixed[1, :20, :10] = data_B

    # Embodiment IDs: 0 for A, 1 for B
    # Note: expand_embodiment appends. Original was 0, new is 1.
    mixed_ids = [0, 1]

    # Encode Mask
    mixed_mask = np.zeros((2, 20), dtype=bool)
    mixed_mask[0, :10] = True
    mixed_mask[1, :20] = True

    print("Encoding mixed batch...")
    mixed_codes = codec.encode(batch_x_mixed, mixed_ids, mixed_mask)

    print("Decoding mixed batch...")
    # Explicit durations (optional, but good for verification if we wanted to override defaults)
    durations = [1, 1]
    x_recon_mixed, dec_mask_mixed = codec.decode(mixed_codes, mixed_ids, durations)

    print(f"Mixed Recon Shape: {x_recon_mixed.shape}")

    # Validation
    # Robot A output check (mask should be True for first 10, False for rest)
    valid_A = dec_mask_mixed[0].sum()
    valid_B = dec_mask_mixed[1].sum()

    print(f"Valid steps detected by Decoder: Robot A={valid_A}, Robot B={valid_B}")

    assert valid_A == 10
    assert valid_B == 20

    # Check dimensionality preservation
    # Robot A's reconstruction in dims 7-9 should be noise or zero (depending on implementation),
    # but dims 0-6 should contain signal.
    print("✅ Mixed batch processed successfully.")

    print("\n✨ All systems go.")
|
modular_actioncodec.py
ADDED
|
@@ -0,0 +1,779 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
from copy import deepcopy
|
| 3 |
+
from typing import List, Literal, Optional, Tuple, Union
|
| 4 |
+
|
| 5 |
+
import einops
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
|
| 11 |
+
from .configuration_actioncodec import ActionCodecConfig
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def apply_rotary_pos_emb(x: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    """Rotate interleaved (even, odd) feature pairs of ``x`` by the given angles.

    The rotation is computed in float32 for numerical stability and the result
    is cast back to the input dtype. ``sin``/``cos`` must broadcast against the
    even/odd halves of ``x``'s last dimension.
    """
    in_dtype = x.dtype
    xf, sf, cf = (t.to(torch.float32) for t in (x, sin, cos))

    # Split the last dimension into interleaved pairs (x1 = even slots, x2 = odd slots).
    even, odd = xf[..., 0::2], xf[..., 1::2]

    # 2-D rotation of each (even, odd) pair, written back into interleaved layout.
    out = torch.empty_like(xf)
    out[..., 0::2] = even * cf - odd * sf
    out[..., 1::2] = even * sf + odd * cf

    return out.to(in_dtype)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def attention_op(
|
| 35 |
+
q: torch.Tensor,
|
| 36 |
+
k: torch.Tensor,
|
| 37 |
+
v: torch.Tensor,
|
| 38 |
+
mask: torch.Tensor | None = None,
|
| 39 |
+
is_causal: bool = False,
|
| 40 |
+
) -> torch.Tensor:
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
Args:
|
| 44 |
+
q (torch.Tensor): (*b, h, l, d)
|
| 45 |
+
k (torch.Tensor): (*b, k, s, d)
|
| 46 |
+
v (torch.Tensor): (*b, k, s, d)
|
| 47 |
+
mask (torch.Tensor | None, optional): (*b, l, s), where `True` indicates the element should take part in attention. Defaults to None.
|
| 48 |
+
is_causal (bool, optional): Whether to apply causal mask. Defaults to False.
|
| 49 |
+
|
| 50 |
+
Returns:
|
| 51 |
+
torch.Tensor: (*b, h, l, d)
|
| 52 |
+
"""
|
| 53 |
+
heads, kv_heads = q.shape[-3], k.shape[-3]
|
| 54 |
+
if heads != kv_heads:
|
| 55 |
+
assert heads % kv_heads == 0, f"q_heads must be divisible by kv_heads, but got {heads} and {kv_heads}"
|
| 56 |
+
heads_per_kv_head = heads // kv_heads
|
| 57 |
+
k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim=1), (k, v))
|
| 58 |
+
|
| 59 |
+
if mask is not None:
|
| 60 |
+
if mask.dim() == 3:
|
| 61 |
+
mask = mask.unsqueeze(1)
|
| 62 |
+
mask = mask.expand(mask.shape[0], heads, -1, -1)
|
| 63 |
+
|
| 64 |
+
out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=is_causal)
|
| 65 |
+
return out
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class L2Norm(nn.Module):
    """Project the trailing feature dimension onto the unit L2 sphere."""

    def forward(self, x: torch.Tensor):
        # Same computation as F.normalize(x, p=2, dim=-1), including the 1e-12
        # clamp that guards against division by zero for all-zero vectors.
        denom = x.norm(p=2, dim=-1, keepdim=True).clamp_min(1e-12)
        return x / denom
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class Attention(nn.Module):
    """Multi-head attention with optional grouped-query heads and q/k normalization.

    Supports self-attention (``context is None``) and cross-attention, optional
    rotary position embeddings, and zero-initialized output projection so the
    module can start as a no-op residual branch.

    Args:
        hidden_size (int): Hidden size of the input tensor.
        num_heads (int): Number of attention heads.
        num_kv_heads (int, optional): Number of key/value heads (GQA). Defaults to None (= num_heads).
        qk_norm (Literal["l2", "ln", "none"], optional): Type of normalization to apply to query/key. Defaults to "none".
        bias (bool, optional): Whether to use bias in linear layers. Defaults to False.
        zero_init_output (bool, optional): Zero-initialize the output projection. Defaults to False.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int | None = None,
        qk_norm: Literal["l2", "ln", "none"] = "none",
        bias: bool = False,
        zero_init_output: bool = False,
    ):
        super().__init__()
        num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
        # Per-head feature size; assumes hidden_size is divisible by num_heads.
        self.dim = hidden_size // num_heads
        self.num_heads, self.num_kv_heads = num_heads, num_kv_heads

        self.q_proj = nn.Linear(hidden_size, hidden_size, bias=bias)
        # k/v projections are sized for num_kv_heads, which may be < num_heads (GQA).
        self.k_proj = nn.Linear(hidden_size, self.dim * num_kv_heads, bias=bias)
        self.v_proj = nn.Linear(hidden_size, self.dim * num_kv_heads, bias=bias)
        self.out_proj = nn.Linear(hidden_size, hidden_size, bias=bias)

        if qk_norm == "l2":
            self.q_norm = L2Norm()
            self.k_norm = L2Norm()
        elif qk_norm == "ln":
            # Affine-free LayerNorm over the per-head dimension.
            self.q_norm = nn.LayerNorm(self.dim, elementwise_affine=False)
            self.k_norm = nn.LayerNorm(self.dim, elementwise_affine=False)
        else:
            self.q_norm = nn.Identity()
            self.k_norm = nn.Identity()

        if zero_init_output:
            # With a zero output projection the attention branch contributes
            # nothing at init, which stabilizes residual training.
            nn.init.zeros_(self.out_proj.weight)
            if self.out_proj.bias is not None:
                nn.init.zeros_(self.out_proj.bias)

    def forward(
        self,
        x: torch.Tensor,
        context: torch.Tensor | None = None,
        mask: torch.Tensor | None = None,
        rotary_pos_emb: Tuple[torch.Tensor, torch.Tensor] | None = None,
        is_causal: bool = False,
    ) -> torch.Tensor:
        """Attend queries from `x` over `context` (self-attention when context is None).

        Args:
            x: (b, l, hidden_size) query sequence.
            context: (b, s, hidden_size) key/value sequence, or None for self-attention.
            mask: boolean attention mask forwarded to `attention_op` (True = attend).
            rotary_pos_emb: optional (sin, cos) pair applied to both q and k.
            is_causal: apply a causal mask inside `attention_op`.

        Returns:
            (b, l, hidden_size) attended output.
        """
        context = x if context is None else context

        q = self.q_proj(x)
        k, v = self.k_proj(context), self.v_proj(context)

        q = einops.rearrange(q, "b l (h d) -> b h l d", h=self.num_heads)
        k = einops.rearrange(k, "b s (h d) -> b h s d", h=self.num_kv_heads)
        v = einops.rearrange(v, "b s (h d) -> b h s d", h=self.num_kv_heads)

        # Normalize after the head split so the norm acts on per-head features.
        q, k = self.q_norm(q), self.k_norm(k)

        if rotary_pos_emb is not None:
            # Rotary embeddings are applied after normalization, to both q and k.
            q, k = map(lambda t: apply_rotary_pos_emb(t, *rotary_pos_emb), (q, k))

        out = attention_op(q, k, v, mask=mask, is_causal=is_causal)
        out = einops.rearrange(out, "b h l d -> b l (h d)")
        out = self.out_proj(out)

        return out
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class PositionalEmbedding(nn.Module):
    """Sequence positional embeddings: deterministic sin/cos or random Fourier features.

    Args:
        dim (int): Embedding dimension (assumed even for the fourier variant,
            which concatenates cos and sin halves).
        encoding_type: "sincos" for classic transformer embeddings indexed by
            integer position, or "fourier" for random Fourier features of real
            timestamps derived from a sampling frequency.
        scale (float): Std-dev multiplier for the random fourier frequencies.
    """

    def __init__(
        self,
        dim: int,
        encoding_type: Literal["sincos", "fourier"] = "sincos",
        scale: float = 2.0,
    ):
        super().__init__()
        self.dim = dim
        self.encoding_type = encoding_type

        if encoding_type == "fourier":
            # Random frequencies are persisted so checkpoints reproduce the embedding.
            self.register_buffer("freqs", torch.randn(dim // 2) * scale, persistent=True)
        elif encoding_type == "sincos":
            pass
        else:
            raise ValueError(f"encoding_type must be 'sincos' or 'fourier', but got {encoding_type}")

    def _create_sincos_emb(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
        """Standard transformer sin/cos table of shape (seq_len, dim)."""
        position = torch.arange(seq_len, device=device, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) * -(math.log(10000.0) / self.dim)
        )

        pos_emb = torch.zeros(seq_len, self.dim, device=device, dtype=dtype)
        pos_emb[:, 0::2] = torch.sin(position * div_term).to(dtype)
        pos_emb[:, 1::2] = torch.cos(position * div_term).to(dtype)

        return pos_emb

    def _create_fourier_emb(self, timestamps: torch.Tensor, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
        """Random Fourier features cos/sin(2*pi*f*t), shape (b, t, dim)."""
        pos_emb = torch.einsum("b t, d -> b t d", timestamps, 2 * np.pi * self.freqs).to(device, torch.float32)
        pos_emb = torch.cat([pos_emb.cos(), pos_emb.sin()], dim=-1).to(dtype)
        return pos_emb

    def forward(
        self, x: torch.Tensor, freq: Optional[Union[float, torch.Tensor]] = None, dtype: torch.dtype = torch.float32
    ) -> torch.Tensor:
        """Return positional embeddings of shape (b, t, dim), scaled by 0.1.

        Only x's batch size, sequence length and device are used; its values are
        ignored. `freq` (scalar or per-batch tensor of sampling frequencies) is
        required for the fourier variant.
        """
        b, t = x.shape[0], x.shape[1]
        device = x.device

        if self.encoding_type == "sincos":
            pos_emb = self._create_sincos_emb(t, device, dtype)
            pos_emb = pos_emb.unsqueeze(0).expand(b, -1, -1)
            # 0.1 keeps the embedding small relative to token features.
            return pos_emb * 0.1

        elif self.encoding_type == "fourier":
            if freq is None:
                raise ValueError(
                    "freq must be provided when encoding_type is 'fourier'. Please provide the sequence frequency."
                )
            # Accept any Python scalar (int or float): an int freq previously fell
            # through this branch and crashed in the einsum below.
            if isinstance(freq, (int, float)):
                freq = torch.tensor(float(freq), dtype=dtype, device=device)[None].expand(b)
            # timestamps[b, t] = t / freq[b]: real time of each step in seconds.
            timestamps = torch.einsum("t, b -> b t", torch.arange(t, dtype=dtype, device=device), 1 / freq)
            pos_emb = self._create_fourier_emb(timestamps, device, dtype)
            return pos_emb * 0.1
        else:
            raise ValueError(f"Unknown encoding_type: {self.encoding_type}")
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
class SinusoidalPositionalEmbedding(PositionalEmbedding):
    """Convenience wrapper: a PositionalEmbedding fixed to the 'sincos' variant."""

    def __init__(self, dim: int):
        super().__init__(dim=dim, encoding_type="sincos")

    def forward(self, x: torch.Tensor, pos: Optional[torch.Tensor] = None) -> torch.Tensor:
        # `pos` is accepted for interface compatibility but ignored — sincos
        # embeddings depend only on the sequence length of `x`.
        return super().forward(x, freq=None)
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
class FeedForward(nn.Module):
    """Two-layer GELU MLP: hidden_size -> intermediate_size -> hidden_size."""

    def __init__(self, hidden_size: int, intermediate_size: int, bias: bool = False):
        super().__init__()
        # Attribute names are part of the contract: callers reach into
        # `mlp.down_proj` for zero-initialization.
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=bias)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=bias)
        self.act_fn = nn.GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.act_fn(self.up_proj(x))
        return self.down_proj(hidden)
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
class LayerScale(nn.Module):
    """Learnable per-channel gain, initialized to a small constant (LayerScale)."""

    def __init__(self, dim, init_val=1e-2):
        super().__init__()
        # One gain per channel, all starting at init_val.
        self.scale = nn.Parameter(init_val * torch.ones(dim))

    def forward(self, x):
        return self.scale * x
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
class PerceiverTransformerBlock(nn.Module):
    """Perceiver-style block: cross-attention to a context, optional self-attention, then MLP.

    Each sub-layer is pre-norm with a residual connection, dropout, and optional
    LayerScale gating.

    Args:
        dim: Hidden size of both the latent queries and the context.
        num_heads: Attention heads for both attention sub-layers.
        mlp_ratio: MLP intermediate size = mlp_ratio * dim.
        dropout: Dropout applied after each sub-layer.
        qk_norm: Query/key normalization mode passed to Attention.
        layer_scale: Gate each residual branch with LayerScale.
        zero_init_output: Zero-init attention output and MLP down projections.
        add_self_attn: Insert a self-attention sub-layer between cross-attn and MLP.
        add_causal_mask: Make that self-attention causal.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: int = 4,
        dropout: float = 0.0,
        qk_norm: str = "ln",
        layer_scale: bool = True,
        zero_init_output: bool = False,
        add_self_attn: bool = False,
        add_causal_mask: bool = False,
    ):
        super().__init__()
        self.add_self_attn = add_self_attn
        self.add_causal_mask = add_causal_mask

        # NOTE(review): eps=1e-2 is unusually large for LayerNorm (default 1e-5);
        # presumably intentional for stability — confirm before changing.
        self.norm1 = nn.LayerNorm(dim, eps=1e-2)
        self.cross_attn = Attention(
            hidden_size=dim, num_heads=num_heads, qk_norm=qk_norm, bias=False, zero_init_output=zero_init_output
        )

        if add_self_attn:
            self.norm_self_attn = nn.LayerNorm(dim, eps=1e-2)
            self.self_attn = Attention(
                hidden_size=dim, num_heads=num_heads, qk_norm=qk_norm, bias=False, zero_init_output=zero_init_output
            )
        else:
            self.self_attn = None

        self.norm2 = nn.LayerNorm(dim, eps=1e-2)
        self.mlp = FeedForward(hidden_size=dim, intermediate_size=int(mlp_ratio * dim), bias=True)
        self.dropout = nn.Dropout(dropout)

        # NOTE(review): attn_scale is shared (same parameters) between the
        # cross-attention and self-attention residuals — confirm this is intended.
        self.attn_scale = LayerScale(dim) if layer_scale else nn.Identity()
        self.mlp_scale = LayerScale(dim) if layer_scale else nn.Identity()

        if zero_init_output:
            # Zero the MLP output too, so the whole block is identity at init.
            nn.init.zeros_(self.mlp.down_proj.weight)
            if self.mlp.down_proj.bias is not None:
                nn.init.zeros_(self.mlp.down_proj.bias)

    def forward(
        self,
        x: torch.Tensor,
        context: torch.Tensor,
        context_mask: Optional[torch.Tensor] = None,
        rotary_pos_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> torch.Tensor:
        """Update latents `x` (b, l, dim) by attending over `context` (b, s, dim).

        `context_mask` (True = attend) masks context positions; `rotary_pos_emb`
        is forwarded to both attention sub-layers. Returns (b, l, dim).
        """
        # Cross-attention sub-layer (pre-norm, residual).
        residual = x
        x = self.norm1(x)
        x = self.cross_attn(x=x, context=context, mask=context_mask, rotary_pos_emb=rotary_pos_emb, is_causal=False)
        x = self.dropout(x)
        x = self.attn_scale(x)
        x = x + residual

        # Optional (possibly causal) self-attention over the latents.
        if self.add_self_attn:
            residual = x
            x = self.norm_self_attn(x)
            x = self.self_attn(
                x=x,
                context=None,
                mask=None,
                rotary_pos_emb=rotary_pos_emb,
                is_causal=self.add_causal_mask,
            )
            x = self.dropout(x)
            x = self.attn_scale(x)
            x = x + residual

        # MLP sub-layer (pre-norm, residual).
        residual = x
        x = self.norm2(x)
        x = self.mlp(x)
        x = self.dropout(x)
        x = self.mlp_scale(x)
        x = x + residual

        return x
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
class EmbodimentEmbedding(nn.Module):
    """Learned per-embodiment token sequences.

    Each embodiment id maps to a flat vector of size ``out_len * out_dim`` that
    is reshaped to ``(out_len, out_dim)`` tokens on lookup. Ids are assigned by
    registration order of ``embodiment_config`` keys.
    """

    def __init__(self, embodiment_config: dict, out_len: int, out_dim: int) -> None:
        super().__init__()
        self.out_len, self.out_dim = out_len, out_dim

        self.embodiment_config = embodiment_config
        self.num_embodiments = len(self.embodiment_config)

        self.embedding = nn.Embedding(self.num_embodiments, out_dim * out_len)

    @torch.no_grad()
    def expand_embodiment(self, embodiment_config: dict):
        """Append new embodiments, preserving existing embedding rows.

        New rows get fresh init. The replacement embedding is created on the same
        device/dtype as the old weights, so expansion also works for modules that
        were already moved to an accelerator or cast to another precision
        (a bare ``nn.Embedding`` would land on CPU/float32).
        """
        for k in embodiment_config.keys():
            assert k not in self.embodiment_config.keys(), f"embodiment '{k}' is already registered"
        self.embodiment_config.update(embodiment_config)
        self.num_embodiments = len(self.embodiment_config)

        extra_embodiments = len(embodiment_config)

        old_weights = torch.clone(self.embedding.weight)
        self.embedding = nn.Embedding(self.num_embodiments, self.out_dim * self.out_len).to(
            device=old_weights.device, dtype=old_weights.dtype
        )
        # Copy the previously learned rows back; only the appended tail is new.
        self.embedding.weight.data[:-extra_embodiments] = old_weights
        return self

    def keys(self) -> list[str]:
        """Embodiment names in registration order (the order defines the ids)."""
        return list(self.embodiment_config.keys())

    def ids_to_keys(self, ids: torch.Tensor) -> List[str]:
        keys = self.keys()  # hoisted: one dict traversal for the whole batch
        return [keys[i] for i in ids]

    def keys_to_ids(self, keys: List[str]) -> torch.Tensor:
        all_keys = self.keys()  # hoisted out of the comprehension
        return torch.tensor([all_keys.index(k) for k in keys])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Look up token sequences: ids (b,) -> tokens (b, out_len, out_dim)."""
        return einops.rearrange(self.embedding(x), "b (l d) -> b l d", d=self.out_dim)
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
class PerceiverEncoder(nn.Module):
    """Perceiver-style action encoder.

    A fixed set of learned per-embodiment query ("cls") tokens cross-attends over
    the projected action sequence; the updated queries are projected down to the
    latent dimension ``z_dim``.
    """

    def __init__(self, config: ActionCodecConfig):
        super().__init__()
        self.config = config
        # Own copy so in-place embodiment expansion cannot mutate the shared config.
        self.embodiment_config = deepcopy(config.embodiment_config)

        # Number of latent query tokens produced per quantizer level.
        out_len = int(config.n_tokens // config.n_quantizers)
        dim = config.encoder_dim

        # Per-embodiment metadata, indexable by embodiment id (registration order).
        _action_dim, _freq, _duration = list(), list(), list()
        for k, v in self.embodiment_config.items():
            _action_dim.append(v["action_dim"])
            _freq.append(v["freq"])
            _duration.append(v["duration"])
        # Non-persistent: derivable from embodiment_config, so not checkpointed.
        self.register_buffer("_action_dim", torch.tensor(_action_dim), persistent=False)
        self.register_buffer("_freq", torch.tensor(_freq), persistent=False)
        self.register_buffer("_duration", torch.tensor(_duration), persistent=False)

        # Inputs are expected zero-padded on the feature axis to the largest action_dim.
        self.max_action_dim = max(v["action_dim"] for v in self.embodiment_config.values())
        self.input_proj = nn.Linear(self.max_action_dim, dim)

        # Learned queries: one sequence of `out_len` tokens per embodiment.
        self.cls_tokens = EmbodimentEmbedding(self.embodiment_config, out_len, dim)

        # Queries use index-based sincos positions; keys/values use the config's
        # choice (e.g. timestamp-based fourier positions driven by `freq`).
        self.pos_emb_q = PositionalEmbedding(dim, encoding_type="sincos")
        self.pos_emb_kv = PositionalEmbedding(dim, encoding_type=config.encoder_pos_encoding_type)

        self.layers = nn.ModuleList(
            [
                PerceiverTransformerBlock(
                    dim=dim,
                    num_heads=config.encoder_n_heads,
                    add_self_attn=config.encoder_add_self_attn,
                    add_causal_mask=config.encoder_add_causal_mask,
                )
                for _ in range(config.encoder_n_layers)
            ]
        )

        self.output_proj = nn.Linear(dim, config.z_dim)
        self._init_weights()

    def _init_weights(self):
        """Truncated-normal init (std=0.02) for projections and query tokens; zero biases."""
        nn.init.trunc_normal_(self.input_proj.weight, std=0.02)
        if self.input_proj.bias is not None:
            nn.init.zeros_(self.input_proj.bias)
        nn.init.trunc_normal_(self.output_proj.weight, std=0.02)
        if self.output_proj.bias is not None:
            nn.init.zeros_(self.output_proj.bias)

        nn.init.trunc_normal_(self.cls_tokens.embedding.weight, std=0.02)

    @torch.no_grad()
    def expand_embodiment(self, embodiment_config: dict):
        """Register additional embodiments, growing input_proj if action_dim grows.

        Existing weights are preserved; only the new input columns (if any) are
        freshly initialized.
        """
        self.cls_tokens.expand_embodiment(embodiment_config)
        self.embodiment_config = self.cls_tokens.embodiment_config
        _action_dim, _freq, _duration = list(), list(), list()
        for k, v in self.embodiment_config.items():
            _action_dim.append(v["action_dim"])
            _freq.append(v["freq"])
            _duration.append(v["duration"])
        # NOTE(review): plain assignment recreates these buffers on CPU — looks
        # like expansion is expected to happen before moving to an accelerator;
        # confirm against callers.
        self._action_dim = torch.tensor(_action_dim)
        self._freq = torch.tensor(_freq)
        self._duration = torch.tensor(_duration)

        max_action_dim = max(v["action_dim"] for v in self.embodiment_config.values())
        if max_action_dim > self.max_action_dim:
            # Grow the input projection, copying old columns into the new layer;
            # the extra columns keep the fresh nn.Linear initialization.
            old_weights = torch.clone(self.input_proj.weight)
            old_bias = torch.clone(self.input_proj.bias)
            self.input_proj = nn.Linear(max_action_dim, self.config.encoder_dim)
            self.input_proj.weight.data[:, : self.max_action_dim] = old_weights
            self.input_proj.bias.data = old_bias
            self.max_action_dim = max_action_dim

        return self

    def forward(
        self,
        x: torch.Tensor,
        embodiment_ids: torch.Tensor | int,
        padding_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Encode action sequences into latent representations.

        Args:
            x (torch.Tensor): Action sequences to encode. Shape: (b, seq_len, max_action_dim).
                Assumes that the action dimension is zero-padded to the max action dimension.
                `seq_len` is supposed to be `int(duration * freq)` for each embodiment and padded to the max sequence length.
            embodiment_ids (torch.Tensor | int): Embodiment IDs. Shape: (b,).
                If int, the same embodiment ID is repeated for all sequences in the batch.
                It specifies the embodiment to encode.
            padding_mask (Optional[torch.Tensor], optional): Padding mask, where `False` values indicate padding. Shape: (b, seq_len). Defaults to None.
                It is used to mask the padding tokens on `seq_len` dimension.

        Returns:
            torch.Tensor: Encoded latent representations. Shape: (b, n_tokens_per_quantizer, z_dim).
        """
        b, seq_len, _ = x.shape

        x = self.input_proj(x)

        if isinstance(embodiment_ids, int):
            embodiment_ids = torch.tensor([embodiment_ids], dtype=torch.long, device=x.device).repeat(b)

        # (b, out_len, dim) learned queries for each sequence's embodiment.
        cls_tokens = self.cls_tokens(embodiment_ids)

        # Per-sequence sampling frequency, consumed by fourier positional embeddings.
        freqs = self._freq[embodiment_ids].to(x.device, x.dtype)

        pos_emb_q = self.pos_emb_q(cls_tokens)
        pos_emb_kv = self.pos_emb_kv(x, freqs)

        cls_tokens = cls_tokens + pos_emb_q
        x = x + pos_emb_kv

        if padding_mask is not None:
            # Broadcast the (b, seq_len) key mask over every query position.
            padding_mask = padding_mask.unsqueeze(1).expand(-1, cls_tokens.shape[1], -1)

        for layer in self.layers:
            cls_tokens = layer(x=cls_tokens, context=x, context_mask=padding_mask)

        return self.output_proj(cls_tokens)
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
class PerceiverDecoder(nn.Module):
    """Perceiver-style decoder.

    Embodiment-specific query ("cls") tokens cross-attend to the latent tokens
    produced by the encoder/quantizer and are projected to per-timestep action
    vectors, zero-padded to the largest action dimension across embodiments.
    """

    def __init__(self, config: ActionCodecConfig):
        super().__init__()
        self.config = config
        # Private copy so later expand_embodiment() calls don't mutate the shared config.
        self.embodiment_config = deepcopy(config.embodiment_config)

        dim = config.decoder_dim

        # Per-embodiment metadata (action dim, control frequency, default duration),
        # stored as non-persistent buffers: they follow the module across devices
        # but are not written to checkpoints.
        _action_dim, _freq, _duration = list(), list(), list()
        for k, v in self.embodiment_config.items():
            _action_dim.append(v["action_dim"])
            _freq.append(v["freq"])
            _duration.append(v["duration"])
        self.register_buffer("_action_dim", torch.tensor(_action_dim), persistent=False)
        self.register_buffer("_freq", torch.tensor(_freq), persistent=False)
        self.register_buffer("_duration", torch.tensor(_duration), persistent=False)

        # Output head is sized to the largest action dimension; smaller
        # embodiments use a zero-padded prefix of it.
        self.max_action_dim = max(v["action_dim"] for v in self.embodiment_config.values())
        self.input_proj = nn.Linear(config.z_dim, dim)

        # Learned query tokens, one set per embodiment.
        self.cls_tokens = EmbodimentEmbedding(self.embodiment_config, config.decoder_cls_size, dim)

        # Separate positional embeddings for the queries (configurable type)
        # and the latent keys/values (fixed sin-cos).
        self.pos_emb_q = PositionalEmbedding(dim, encoding_type=config.decoder_pos_encoding_type)
        self.pos_emb_kv = PositionalEmbedding(dim, encoding_type="sincos")

        self.layers = nn.ModuleList(
            [
                PerceiverTransformerBlock(
                    dim=dim,
                    num_heads=config.decoder_n_heads,
                    add_self_attn=config.decoder_add_self_attn,
                    add_causal_mask=config.decoder_add_causal_mask,
                )
                for _ in range(config.decoder_n_layers)
            ]
        )

        self.output_proj = nn.Linear(dim, self.max_action_dim)
        self._init_weights()

    def _init_weights(self):
        # Truncated-normal init for projections and query embeddings; zero biases.
        nn.init.trunc_normal_(self.input_proj.weight, std=0.02)
        if self.input_proj.bias is not None:
            nn.init.zeros_(self.input_proj.bias)
        nn.init.trunc_normal_(self.output_proj.weight, std=0.02)
        if self.output_proj.bias is not None:
            nn.init.zeros_(self.output_proj.bias)
        nn.init.trunc_normal_(self.cls_tokens.embedding.weight, std=0.02)

    @torch.no_grad()
    def expand_embodiment(self, embodiment_config: dict):
        """Register additional embodiments in place, preserving existing weights.

        Rebuilds the metadata buffers and, when a new embodiment has a larger
        action dimension, grows `output_proj` by copying the old rows into a
        freshly initialized larger layer, so existing embodiments keep
        bit-identical outputs on their valid dimensions.

        NOTE(review): the buffers are re-bound with plain attribute assignment
        (no device/dtype handling) — if the module lives on GPU these new
        tensors land on CPU until the next `.to(device)`; confirm callers
        move the module afterwards.
        """
        self.cls_tokens.expand_embodiment(embodiment_config)
        self.embodiment_config = self.cls_tokens.embodiment_config

        _action_dim, _freq, _duration = list(), list(), list()
        for k, v in self.embodiment_config.items():
            _action_dim.append(v["action_dim"])
            _freq.append(v["freq"])
            _duration.append(v["duration"])
        self._action_dim = torch.tensor(_action_dim)
        self._freq = torch.tensor(_freq)
        self._duration = torch.tensor(_duration)

        max_action_dim = max(v["action_dim"] for v in self.embodiment_config.values())

        if max_action_dim > self.max_action_dim:
            # Grow the output head; copy old rows so old embodiments decode identically.
            old_weights = torch.clone(self.output_proj.weight)
            old_bias = torch.clone(self.output_proj.bias)

            self.output_proj = nn.Linear(self.config.decoder_dim, max_action_dim)

            self.output_proj.weight.data[: self.max_action_dim, :] = old_weights
            self.output_proj.bias.data[: self.max_action_dim] = old_bias

            self.max_action_dim = max_action_dim

        return self

    def forward(
        self, x: torch.Tensor, embodiment_ids: torch.Tensor | int, durations: torch.Tensor | None = None
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Decode latent representations into action sequences.

        Args:
            x (torch.Tensor): Latent representations to decode. Shape: (b, n_tokens_per_quantizer, z_dim).
            embodiment_ids (torch.Tensor | int): Embodiment IDs. Shape: (b,).
                If int, the same embodiment ID is repeated for all sequences in the batch.
                It specifies the embodiment to decode.
            durations (torch.Tensor | None, optional): Duration of each action sequence. Shape: (b,).
                If `None`, the duration is inferred from the default values in `embodiment_config`.

        Returns:
            tuple[torch.Tensor, torch.Tensor]:
                - Decoded action sequences. Shape: (b, max_horizon, max_action_dim), where
                  `max_horizon` is the largest `int(duration * freq)` in the batch and the
                  action dimension is zero-padded to the max action dimension.
                - Boolean padding mask. Shape: (b, max_horizon); True marks valid timesteps
                  (the first `int(duration * freq)` steps of each sequence).
        """
        b, seq_len, _ = x.shape
        x = self.input_proj(x)

        if isinstance(embodiment_ids, int):
            embodiment_ids = torch.tensor([embodiment_ids], dtype=torch.long, device=x.device).repeat(b)

        cls_tokens = self.cls_tokens(embodiment_ids)

        # Per-sample horizon = duration * control frequency; mask marks valid steps.
        freqs = self._freq[embodiment_ids]
        durations = self._duration[embodiment_ids] if durations is None else durations
        action_horizons = (durations * freqs).long()
        max_horizon = action_horizons.max().item()
        padding_mask = torch.arange(max_horizon, device=x.device).expand(b, -1) < action_horizons.unsqueeze(1)

        # With a single query token per embodiment, tile it across the horizon;
        # positional embeddings then distinguish the timesteps.
        if self.config.decoder_cls_size == 1:
            cls_tokens = cls_tokens.repeat(1, max_horizon, 1)

        pos_emb_q = self.pos_emb_q(cls_tokens, freqs)
        pos_emb_kv = self.pos_emb_kv(x)

        cls_tokens = cls_tokens + pos_emb_q
        x = x + pos_emb_kv

        # Queries cross-attend to the latent tokens.
        for layer in self.layers:
            cls_tokens = layer(x=cls_tokens, context=x)

        output = self.output_proj(cls_tokens)

        return output, padding_mask
|
| 600 |
+
|
| 601 |
+
|
| 602 |
+
if __name__ == "__main__":
    # Smoke test: verifies that expand_embodiment() preserves old-embodiment
    # behavior exactly while enabling new, larger embodiments and mixed batches.
    # ------------------------------------------
    # 1. Initialization
    # ------------------------------------------
    print("=== Test 1: Initialization ===")

    # Define initial config with two smaller robots
    initial_embodiment_config = {
        "robot_small_7d": {"action_dim": 7, "freq": 20, "duration": 1, "description": "Original Robot"},
        "robot_tiny_3d": {"action_dim": 3, "freq": 10, "duration": 2, "description": "Tiny Robot"},
    }

    config = ActionCodecConfig(embodiment_config=initial_embodiment_config)

    # Set seed for reproducibility
    torch.manual_seed(42)

    encoder = PerceiverEncoder(config)
    decoder = PerceiverDecoder(config)

    # eval() so dropout/etc. cannot break the invariance comparisons below.
    encoder.eval()
    decoder.eval()
    print("✅ Models initialized successfully.")

    # ------------------------------------------
    # 2. Baseline Inference (Before Expansion)
    # ------------------------------------------
    print("\n=== Test 2: Baseline Inference (Before Expansion) ===")

    # Simulate Robot 1 (7-dim) data
    # Max action dim currently is 7.
    batch_size = 1
    seq_len = 20  # 20Hz * 1s

    # Input: (1, 20, 7)
    input_action_v0 = torch.randn(batch_size, seq_len, 7)
    emb_id_v0 = torch.tensor([0], dtype=torch.long)  # ID 0 -> robot_small_7d

    with torch.no_grad():
        z_ref = encoder(input_action_v0, emb_id_v0)
        rec_action_ref, _ = decoder(z_ref, emb_id_v0)

    print(f"Reference Latent Shape: {z_ref.shape}")
    print(f"Reference Recon Shape: {rec_action_ref.shape}")

    # ------------------------------------------
    # 3. Model Expansion (Add New Embodiment)
    # ------------------------------------------
    print("\n=== Test 3: Model Expansion ===")

    # Add a larger robot: 10-dim, high frequency
    new_embodiment_config = {
        "robot_large_10d": {"action_dim": 10, "freq": 30, "duration": 1, "description": "New Large Robot"}
    }

    print(f"Expanding from Max Dim {encoder.max_action_dim} to 10...")
    encoder.expand_embodiment(new_embodiment_config)
    decoder.expand_embodiment(new_embodiment_config)

    # Verify buffer updates
    assert encoder._action_dim[-1] == 10
    assert encoder.max_action_dim == 10
    assert decoder.max_action_dim == 10
    print(f"✅ Expansion successful. New Encoder Input Dim: {encoder.input_proj.weight.shape[1]}")
    print(f"✅ New Decoder Output Dim: {decoder.output_proj.weight.shape[0]}")

    # ------------------------------------------
    # 4. Encoder Invariance Check
    # ------------------------------------------
    print("\n=== Test 4: Encoder Invariance Check ===")

    # Pad old data (7 dims) to new max dim (10 dims) with ZEROS.
    # Zero padding must be a no-op for the expanded input projection.
    input_action_padded = torch.zeros(batch_size, seq_len, 10)
    input_action_padded[:, :, :7] = input_action_v0

    with torch.no_grad():
        z_new = encoder(input_action_padded, emb_id_v0)

    # Compare latents
    diff_z = (z_ref - z_new).abs().max().item()
    print(f"Latent Difference (Max Abs): {diff_z:.8f}")

    if diff_z < 1e-6:
        print("✅ PASS: Encoder produces identical latents for old data.")
    else:
        print("❌ FAIL: Encoder outputs changed after expansion!")

    # ------------------------------------------
    # 5. Decoder Invariance Check
    # ------------------------------------------
    print("\n=== Test 5: Decoder Invariance Check ===")

    with torch.no_grad():
        # Feed old latent to expanded decoder
        rec_action_new_full, _ = decoder(z_ref, emb_id_v0)

    # Output shape should be (1, 20, 10)
    print(f"Expanded Decoder Output Shape: {rec_action_new_full.shape}")

    # Slice first 7 dims, should match reference
    rec_action_new_sliced = rec_action_new_full[:, :, :7]

    diff_rec = (rec_action_ref - rec_action_new_sliced).abs().max().item()
    print(f"Reconstruction Difference (Max Abs on valid dims): {diff_rec:.8f}")

    if diff_rec < 1e-6:
        print("✅ PASS: Decoder produces identical action values for valid dimensions.")
    else:
        print("❌ FAIL: Decoder outputs changed!")

    # Check phantom dimensions (7-9)
    # For old embodiment, these are driven by random weights and should be random
    new_dims_mean = rec_action_new_full[:, :, 7:].abs().mean().item()
    print(f"Values in new phantom dimensions (should be random garbage): {new_dims_mean:.4f}")

    # ------------------------------------------
    # 6. New Embodiment Inference
    # ------------------------------------------
    print("\n=== Test 6: New Embodiment Inference ===")

    # ID 2 -> robot_large_10d
    emb_id_new = torch.tensor([2], dtype=torch.long)
    seq_len_new = 30  # 30Hz * 1s

    input_action_new = torch.randn(1, seq_len_new, 10)

    with torch.no_grad():
        z_large = encoder(input_action_new, emb_id_new)
        rec_large, mask_large = decoder(z_large, emb_id_new)

    print(f"New Embodiment Output Shape: {rec_large.shape}")

    if rec_large.shape == (1, 30, 10):
        print("✅ PASS: New embodiment handled correctly with full dimensions.")
    else:
        print(f"❌ FAIL: Expected (1, 30, 10), got {rec_large.shape}")

    # ------------------------------------------
    # 7. Mixed Batch Processing (Masking)
    # ------------------------------------------
    print("\n=== Test 7: Mixed Batch Processing ===")

    # Batch size 2: [Robot 0 (20Hz, 7dim), Robot 2 (30Hz, 10dim)]
    mixed_emb_ids = torch.tensor([0, 2], dtype=torch.long)

    # Max seq len is 30. Max action dim is 10.
    batch_input = torch.zeros(2, 30, 10)

    # Fill data
    # Batch 0: Length 20, Dim 7 valid
    batch_input[0, :20, :7] = torch.randn(20, 7)
    # Batch 1: Length 30, Dim 10 valid
    batch_input[1, :30, :10] = torch.randn(30, 10)

    # Encoder Mask: True = Valid
    enc_padding_mask = torch.zeros(2, 30, dtype=torch.bool)
    enc_padding_mask[0, :20] = True
    enc_padding_mask[1, :30] = True

    print("Running mixed batch...")
    with torch.no_grad():
        z_mixed = encoder(batch_input, mixed_emb_ids, padding_mask=enc_padding_mask)
        rec_mixed, dec_padding_mask = decoder(z_mixed, mixed_emb_ids)

    print(f"Mixed Reconstruction Shape: {rec_mixed.shape}")  # Should be (2, 30, 10)

    # Verify Decoder Generated Mask
    valid_len_0 = dec_padding_mask[0].sum().item()
    valid_len_1 = dec_padding_mask[1].sum().item()

    print(f"Decoder Mask Valid Lengths: Batch 0={valid_len_0}, Batch 1={valid_len_1}")

    if valid_len_0 == 20 and valid_len_1 == 30:
        print("✅ PASS: Decoder correctly generated masks based on frequency and duration.")
    else:
        print("❌ FAIL: Decoder masks are incorrect.")

    print("\n✨ All Tests Completed ✨")
|
rvq.py
ADDED
|
@@ -0,0 +1,522 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Union
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
import torch.distributed as dist
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
from einops import rearrange
|
| 9 |
+
from vector_quantize_pytorch import VectorQuantize as torchVQ
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def sample_vectors(samples, num):
    """Draw `num` row vectors from `samples` (shape (N, D)).

    Sampling is without replacement when N >= num (random permutation prefix),
    otherwise with replacement. The result is always cast to fp32.
    """
    n_available = samples.shape[0]
    dev = samples.device
    if n_available < num:
        # Not enough rows: draw with replacement.
        picks = torch.randint(0, n_available, (num,), device=dev)
    else:
        # Enough rows: random permutation prefix (no replacement).
        picks = torch.randperm(n_available, device=dev)[:num]
    return samples[picks].float()  # (num, D), fp32
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def ema_inplace(moving_avg, new, decay):
    """In-place EMA step: moving_avg <- decay * moving_avg + (1 - decay) * new.

    `moving_avg` and `new` share a shape — (codebook_size,) or
    (codebook_size, D'). `new` is cast to fp32 before the update.
    """
    decayed = moving_avg.data.mul_(decay)
    decayed.add_(new.float(), alpha=1.0 - decay)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _neg_sq_dists(samples, means):
    """Negative squared Euclidean distances between rows of `samples` and `means`.

    samples: (N, D), means: (K, D) -> (N, K), computed in fp32 via the
    expansion ||s - m||^2 = ||s||^2 - 2 s·m + ||m||^2.
    """
    s = samples.float()
    m = means.float()
    return -(
        s.pow(2).sum(1, keepdim=True)  # (N, 1)
        - 2 * s @ m.t()  # (N, K)
        + m.pow(2).sum(1, keepdim=True).t()  # (1, K)
    )


def kmeans(samples, num_clusters, num_iters=10):
    """Lloyd's k-means on `samples` (shape (N, D)), entirely in fp32.

    Centroids are initialized from random rows of `samples` (see
    `sample_vectors`); clusters that receive no samples keep their previous
    centroid. The duplicated distance computation of the original has been
    hoisted into `_neg_sq_dists`, and the dead `torch.float32` tuple
    assignment removed.

    Args:
        samples (torch.Tensor): (N, D) data points.
        num_clusters (int): number of centroids K.
        num_iters (int): Lloyd iterations.

    Returns:
        tuple: (means, bins) —
            means (torch.Tensor): (K, D) final centroids, fp32.
            bins (torch.Tensor): (K,) fp32 cluster sizes under the final assignment.
    """
    dim = samples.shape[-1]
    means = sample_vectors(samples, num_clusters).float()  # (K, D)

    for _ in range(num_iters):
        # Assign each sample to its nearest centroid.
        buckets = _neg_sq_dists(samples, means).max(dim=-1).indices  # (N,)
        bins = torch.bincount(buckets, minlength=num_clusters)  # (K,)
        zero_mask = bins == 0  # (K,)
        bins_min_clamped = bins.masked_fill(zero_mask, 1)  # avoid division by zero

        # Recompute centroids as the mean of their assigned samples.
        new_means = buckets.new_zeros(num_clusters, dim, dtype=torch.float32)  # (K, D)
        new_means.scatter_add_(0, buckets.unsqueeze(1).expand(-1, dim), samples.float())
        new_means = new_means / bins_min_clamped[..., None]  # (K, D)
        # Empty clusters retain their old centroid.
        means = torch.where(zero_mask[..., None], means, new_means)  # (K, D)

    # Final assignment, used only to report cluster sizes.
    buckets = _neg_sq_dists(samples, means).max(dim=-1).indices  # (N,)
    bins = torch.bincount(buckets, minlength=num_clusters).float()  # (K,)

    return means, bins
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class VectorQuantize(nn.Module):
    """Single EMA-updated vector quantizer.

    Inputs of size `input_dim` are (optionally) projected down to
    `codebook_dim`, snapped to the nearest codebook row by squared Euclidean
    distance, and projected back. The codebook lives in buffers and is
    maintained by exponential moving averages plus dead-code resampling —
    not by gradient descent; gradients reach the encoder through a
    straight-through estimator. All quantizer arithmetic is forced to fp32.

    Fixes vs. the previous revision:
      * `replace_dead_codes` / `init_codebook` only sampled / ran k-means when
        `dist.is_initialized() and rank == 0`, so single-process runs filled
        dead codes (and the initial codebook) with zeros. Rank 0 — or the sole
        process — now always does the real work.
      * The local variable `dist` in `forward` shadowed the module alias
        `torch.distributed as dist`; renamed to `sq_dists`.
      * `decode_code`'s shape comment corrected to (B, T, D').
    """

    def __init__(
        self,
        input_dim,
        codebook_size,
        codebook_dim,
        commitment=1.0,
        decay=0.99,  # EMA decay
        epsilon=1e-5,  # Laplace smoothing epsilon
        threshold_ema_dead=2,  # dead-code replacement threshold (EMA cluster size)
        kmeans_init=True,  # initialize the codebook from the first batch via k-means
        kmeans_iters=10,  # k-means iterations for that init
        rotation_trick=False,  # stored but currently unused in forward
        **kwargs,
    ):
        super().__init__()
        self.input_dim = input_dim
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim
        self.commitment = commitment
        self.decay = decay
        self.epsilon = epsilon
        self.threshold_ema_dead = threshold_ema_dead
        self.kmeans_init = kmeans_init
        self.kmeans_iters = kmeans_iters
        self.rotation_trick = rotation_trick

        # Factorized codes (ViT-VQGAN style): project only when dims differ.
        if self.input_dim != self.codebook_dim:
            self.in_project = nn.Linear(input_dim, codebook_dim)
            self.out_project = nn.Linear(codebook_dim, input_dim)
        else:
            self.in_project = nn.Identity()
            self.out_project = nn.Identity()

        # Codebook starts at zeros when k-means init is pending (it is filled
        # from the first batch), otherwise random normal.
        init_fn = torch.zeros if kmeans_init else lambda x, y: torch.randn(x, y)
        self.register_buffer("codebook", init_fn(codebook_size, codebook_dim).float())  # (K, D'), fp32
        self.register_buffer("inited", torch.tensor([not kmeans_init], dtype=torch.bool))  # (1,)
        self.register_buffer("cluster_size", torch.zeros(codebook_size).float())  # (K,) EMA usage counts
        self.register_buffer("embed_avg", self.codebook.clone().float())  # (K, D') EMA vector sums

    def ema_update(self, encodings, embed_onehot):
        """EMA-update the codebook from one batch.

        Args:
            encodings: (B*T, D') encoder outputs (cast to fp32 here).
            embed_onehot: (B*T, K) one-hot assignment matrix.
        """
        encodings = encodings.float()
        embed_onehot = embed_onehot.float()
        cluster_size_new = embed_onehot.sum(0)  # (K,)
        embed_sum = encodings.t() @ embed_onehot  # (D', K)

        # Aggregate assignment statistics across ranks before the EMA step.
        if dist.is_initialized():
            dist.all_reduce(cluster_size_new, op=dist.ReduceOp.SUM)
            dist.all_reduce(embed_sum, op=dist.ReduceOp.SUM)

        ema_inplace(self.cluster_size, cluster_size_new, self.decay)  # (K,)
        ema_inplace(self.embed_avg, embed_sum.t(), self.decay)  # (K, D')

        # Laplace smoothing keeps near-empty clusters from exploding the division.
        cluster_size = (self.cluster_size + self.epsilon) / (
            self.cluster_size.sum() + self.codebook_size * self.epsilon
        )  # (K,)
        cluster_size = cluster_size * self.cluster_size.sum()  # (K,)
        self.codebook.copy_(self.embed_avg / cluster_size.unsqueeze(1))  # (K, D')

    def replace_dead_codes(self, encodings):
        """Resample codebook rows whose EMA usage fell below the threshold.

        Args:
            encodings: (B*T, D') current-batch encoder outputs used as the
                resampling pool.
        """
        if self.threshold_ema_dead == 0:
            return

        dead_mask = self.cluster_size < self.threshold_ema_dead  # (K,)
        if dead_mask.any():
            # BUGFIX: previously gated on `dist.is_initialized() and rank == 0`,
            # which made single-process runs take the zero-placeholder branch
            # and silently replace dead codes with zeros.
            if not dist.is_initialized() or dist.get_rank() == 0:
                samples = sample_vectors(encodings.float(), self.codebook_size)  # (K, D'), fp32
                print(f"Replace {dead_mask.sum().item()} dead codes")
            else:
                samples = torch.zeros_like(self.codebook).float()  # placeholder, overwritten by broadcast

            # Keep all ranks' codebooks identical.
            if dist.is_initialized():
                dist.broadcast(samples, src=0)

            self.codebook[dead_mask] = samples[: dead_mask.sum()].to(self.codebook.dtype)

    def init_codebook(self, encodings):
        """One-time k-means initialization of the codebook and EMA state.

        Args:
            encodings: (B*T, D') first-batch encoder outputs.
        """
        if self.inited.item():
            return

        # BUGFIX: same gating issue as replace_dead_codes — single-process
        # runs previously skipped k-means and kept an all-zero codebook while
        # still marking `inited`. Rank 0 / the sole process now runs k-means.
        if not dist.is_initialized() or dist.get_rank() == 0:
            embed, cluster_sizes = kmeans(
                encodings.float(), self.codebook_size, self.kmeans_iters
            )  # (K, D'), (K,), fp32
        else:
            embed = torch.zeros(self.codebook_size, self.codebook_dim, device=encodings.device).float()
            cluster_sizes = torch.zeros(self.codebook_size, device=encodings.device, dtype=torch.float32)

        # Broadcast so every rank starts from the same codebook.
        if dist.is_initialized():
            dist.broadcast(embed, src=0)
            dist.broadcast(cluster_sizes, src=0)

        self.codebook.copy_(embed)  # (K, D')
        self.embed_avg.copy_(embed.clone())  # (K, D')
        self.cluster_size.copy_(cluster_sizes.float())  # (K,)
        self.inited.fill_(True)

    def forward(self, z):
        """Quantize `z` of shape (B, T, input_dim).

        Returns:
            z_q (torch.Tensor): (B, T, input_dim) quantized output with
                straight-through gradients.
            commit_loss (torch.Tensor): scalar commitment loss, scaled by
                `self.commitment`.
            codebook_loss (torch.Tensor): scalar 0.0 — the codebook is
                EMA-updated, not trained by a loss.
            indices (torch.Tensor): (B, T) chosen codebook indices.
            z_e (torch.Tensor): (B, T, codebook_dim) pre-quantization projections.
        """
        # Force module params/buffers and inputs to fp32 (Module.to is in-place).
        self = self.to(torch.float32)
        z = z.float()
        z_e = self.in_project(z).float()

        # Flatten batch and time for the nearest-neighbour search
        # (equivalent to einops rearrange "b t d -> (b t) d").
        encodings = z_e.reshape(-1, z_e.shape[-1]).float()  # (B*T, D')

        # Lazy k-means initialization on the first batch.
        if self.kmeans_init and not self.inited.item():
            self.init_codebook(encodings)

        # Squared Euclidean distance to every codebook row. Renamed from
        # `dist`, which shadowed the `torch.distributed as dist` alias.
        sq_dists = (
            encodings.pow(2).sum(1, keepdim=True)
            - 2 * encodings @ self.codebook.float().t()
            + self.codebook.float().pow(2).sum(1, keepdim=True).t()
        )  # (B*T, K)
        indices = (-sq_dists).max(1)[1]  # argmin distance, (B*T,)

        indices = indices.view(z.size(0), -1)  # (B, T)
        z_q = self.decode_code(indices).float()
        commit_loss = F.mse_loss(z_e, z_q.detach()) * self.commitment

        # Codebook maintenance only while actually training with grads enabled.
        if self.training and torch.is_grad_enabled():
            embed_onehot = F.one_hot(indices.view(-1), self.codebook_size).float()
            self.ema_update(encodings, embed_onehot)
            self.replace_dead_codes(encodings)

        # Straight-through estimator: identity in forward, grads flow to z_e.
        z_q = (z_q - z_e).detach() + z_e
        z_q = self.out_project(z_q).float()

        return (
            z_q,
            commit_loss,
            torch.tensor(0.0, device=z.device, dtype=torch.float32),
            indices,
            z_e,
        )

    def decode_code(self, embed_id):
        """Look up codebook vectors: embed_id (B, T) -> (B, T, D'), fp32."""
        return F.embedding(embed_id, self.codebook).float()
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
# class VectorQuantize(nn.Module):
|
| 221 |
+
# """
|
| 222 |
+
# Implementation of VQ similar to Karpathy's repo:
|
| 223 |
+
# https://github.com/karpathy/deep-vector-quantization
|
| 224 |
+
# Additionally uses following tricks from Improved VQGAN
|
| 225 |
+
# (https://arxiv.org/pdf/2110.04627.pdf):
|
| 226 |
+
# 1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
|
| 227 |
+
# for improved codebook usage
|
| 228 |
+
# 2. l2-normalized codes: Converts euclidean distance to cosine similarity which
|
| 229 |
+
# improves training stability
|
| 230 |
+
# """
|
| 231 |
+
|
| 232 |
+
# def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int):
|
| 233 |
+
# super().__init__()
|
| 234 |
+
# self.codebook_size = codebook_size
|
| 235 |
+
# self.codebook_dim = codebook_dim
|
| 236 |
+
|
| 237 |
+
# self.in_proj = nn.Linear(input_dim, codebook_dim)
|
| 238 |
+
# self.out_proj = nn.Linear(codebook_dim, input_dim)
|
| 239 |
+
# self.codebook = nn.Embedding(codebook_size, codebook_dim)
|
| 240 |
+
|
| 241 |
+
# def forward(self, z: torch.Tensor):
|
| 242 |
+
# """
|
| 243 |
+
# Args:
|
| 244 |
+
# z (torch.Tensor): shape (b, t, d)
|
| 245 |
+
|
| 246 |
+
# Returns:
|
| 247 |
+
# z_q (torch.Tensor): shape (b, t, d)
|
| 248 |
+
# commitment_loss (torch.Tensor): shape (1)
|
| 249 |
+
# codebook_loss (torch.Tensor): shape (1)
|
| 250 |
+
# indices (torch.Tensor): shape (b, t)
|
| 251 |
+
# z_e (torch.Tensor): shape (b, t, d)
|
| 252 |
+
# """
|
| 253 |
+
|
| 254 |
+
# # Factorized codes (ViT-VQGAN) Project input into low-dimensional space
|
| 255 |
+
# z_e = self.in_proj(z)
|
| 256 |
+
# z_q, indices = self.decode_latents(z_e)
|
| 257 |
+
|
| 258 |
+
# commitment_loss = F.mse_loss(z_e, z_q.detach()) * 0.25
|
| 259 |
+
# codebook_loss = F.mse_loss(z_q, z_e.detach())
|
| 260 |
+
|
| 261 |
+
# z_q = z_e + (z_q - z_e).detach() # noop in forward pass, straight-through gradient estimator in backward pass
|
| 262 |
+
|
| 263 |
+
# z_q = self.out_proj(z_q)
|
| 264 |
+
|
| 265 |
+
# return z_q, commitment_loss, codebook_loss, indices, z_e
|
| 266 |
+
|
| 267 |
+
# def embed_code(self, embed_id):
|
| 268 |
+
# return F.embedding(embed_id, self.codebook.weight)
|
| 269 |
+
|
| 270 |
+
# def decode_code(self, embed_id):
|
| 271 |
+
# return self.embed_code(embed_id)
|
| 272 |
+
|
| 273 |
+
# def decode_latents(self, latents: torch.Tensor):
|
| 274 |
+
# codebook = self.codebook.weight
|
| 275 |
+
# encodings = rearrange(latents, "b t d -> (b t) d")
|
| 276 |
+
|
| 277 |
+
# cosine_similarity = F.cosine_similarity(encodings[None], codebook[:, None], dim=-1)
|
| 278 |
+
# indices = cosine_similarity.max(dim=0)[1]
|
| 279 |
+
# indices = rearrange(indices, "(b t) -> b t", b=latents.size(0))
|
| 280 |
+
|
| 281 |
+
# # encodings = F.normalize(encodings)
|
| 282 |
+
# # codebook = F.normalize(codebook)
|
| 283 |
+
# # dist = (
|
| 284 |
+
# # encodings.pow(2).sum(1, keepdim=True)
|
| 285 |
+
# # - 2 * encodings @ codebook.t()
|
| 286 |
+
# # + codebook.pow(2).sum(1, keepdim=True).t()
|
| 287 |
+
# # )
|
| 288 |
+
# # indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
|
| 289 |
+
|
| 290 |
+
# z_q = self.decode_code(indices)
|
| 291 |
+
# return z_q, indices
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
class ResidualVectorQuantize(nn.Module):
|
| 295 |
+
def __init__(
|
| 296 |
+
self,
|
| 297 |
+
dim: int = 256,
|
| 298 |
+
n_codebooks: int = 4,
|
| 299 |
+
codebook_size: int = 512,
|
| 300 |
+
codebook_dim: Union[int, list] = 8,
|
| 301 |
+
quantizer_dropout: float = 0.25,
|
| 302 |
+
commitment: float = 0.25,
|
| 303 |
+
decay: float = 0.99,
|
| 304 |
+
epsilon: float = 1e-5,
|
| 305 |
+
threshold_ema_dead: int = 2,
|
| 306 |
+
kmeans_init: bool = True,
|
| 307 |
+
kmeans_iters: int = 10,
|
| 308 |
+
rotation_trick: bool = False,
|
| 309 |
+
):
|
| 310 |
+
super().__init__()
|
| 311 |
+
if isinstance(codebook_dim, int):
|
| 312 |
+
codebook_dim = [codebook_dim for _ in range(n_codebooks)]
|
| 313 |
+
|
| 314 |
+
self.n_codebooks = n_codebooks
|
| 315 |
+
self.codebook_dim = codebook_dim
|
| 316 |
+
self.codebook_size = codebook_size
|
| 317 |
+
|
| 318 |
+
self.quantizers = nn.ModuleList(
|
| 319 |
+
[
|
| 320 |
+
VectorQuantize(
|
| 321 |
+
input_dim=dim,
|
| 322 |
+
codebook_size=codebook_size,
|
| 323 |
+
codebook_dim=codebook_dim[i],
|
| 324 |
+
commitment=commitment,
|
| 325 |
+
decay=decay,
|
| 326 |
+
epsilon=epsilon,
|
| 327 |
+
threshold_ema_dead=threshold_ema_dead,
|
| 328 |
+
kmeans_init=kmeans_init,
|
| 329 |
+
kmeans_iters=kmeans_iters,
|
| 330 |
+
rotation_trick=rotation_trick,
|
| 331 |
+
)
|
| 332 |
+
for i in range(n_codebooks)
|
| 333 |
+
]
|
| 334 |
+
)
|
| 335 |
+
self.quantizer_dropout = quantizer_dropout
|
| 336 |
+
|
| 337 |
+
def forward(self, z, n_quantizers: int = None):
|
| 338 |
+
"""Quantized the input tensor using a fixed set of `n` codebooks and returns
|
| 339 |
+
the corresponding codebook vectors
|
| 340 |
+
Parameters
|
| 341 |
+
----------
|
| 342 |
+
z : Tensor[B x D x T]
|
| 343 |
+
n_quantizers : int, optional
|
| 344 |
+
No. of quantizers to use
|
| 345 |
+
(n_quantizers < self.n_codebooks ex: for quantizer dropout)
|
| 346 |
+
Note: if `self.quantizer_dropout` is True, this argument is ignored
|
| 347 |
+
when in training mode, and a random number of quantizers is used.
|
| 348 |
+
Returns
|
| 349 |
+
-------
|
| 350 |
+
dict
|
| 351 |
+
A dictionary with the following keys:
|
| 352 |
+
|
| 353 |
+
"z" : Tensor[B x D x T]
|
| 354 |
+
Quantized continuous representation of input
|
| 355 |
+
"codes" : Tensor[B x N x T]
|
| 356 |
+
Codebook indices for each codebook
|
| 357 |
+
(quantized discrete representation of input)
|
| 358 |
+
"latents" : Tensor[B x N*D x T]
|
| 359 |
+
Projected latents (continuous representation of input before quantization)
|
| 360 |
+
"vq/commitment_loss" : Tensor[1]
|
| 361 |
+
Commitment loss to train encoder to predict vectors closer to codebook
|
| 362 |
+
entries
|
| 363 |
+
"vq/codebook_loss" : Tensor[1]
|
| 364 |
+
Codebook loss to update the codebook
|
| 365 |
+
"""
|
| 366 |
+
z_q, residual = 0, z
|
| 367 |
+
commitment_loss, codebook_loss = 0, 0
|
| 368 |
+
|
| 369 |
+
codebook_indices, latents = [], []
|
| 370 |
+
|
| 371 |
+
if n_quantizers is None:
|
| 372 |
+
n_quantizers = self.n_codebooks
|
| 373 |
+
if self.training:
|
| 374 |
+
n_quantizers = torch.ones((z.shape[0],)) * self.n_codebooks + 1
|
| 375 |
+
dropout = torch.randint(1, self.n_codebooks + 1, (z.shape[0],))
|
| 376 |
+
n_dropout = int(z.shape[0] * self.quantizer_dropout)
|
| 377 |
+
n_quantizers[:n_dropout] = dropout[:n_dropout]
|
| 378 |
+
n_quantizers = n_quantizers.to(z.device)
|
| 379 |
+
|
| 380 |
+
for i, quantizer in enumerate(self.quantizers):
|
| 381 |
+
if self.training is False and i >= n_quantizers:
|
| 382 |
+
break
|
| 383 |
+
|
| 384 |
+
z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(residual)
|
| 385 |
+
|
| 386 |
+
# Create mask to apply quantizer dropout
|
| 387 |
+
mask = torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
|
| 388 |
+
z_q = z_q + z_q_i * mask[:, None, None]
|
| 389 |
+
residual = residual - z_q_i
|
| 390 |
+
|
| 391 |
+
# Sum losses
|
| 392 |
+
commitment_loss += (commitment_loss_i * mask).mean()
|
| 393 |
+
codebook_loss += (codebook_loss_i * mask).mean()
|
| 394 |
+
|
| 395 |
+
codebook_indices.append(indices_i)
|
| 396 |
+
latents.append(z_e_i)
|
| 397 |
+
|
| 398 |
+
codes = torch.stack(codebook_indices, dim=-1)
|
| 399 |
+
latents = torch.cat(latents, dim=1)
|
| 400 |
+
|
| 401 |
+
return z_q, codes, latents, commitment_loss, codebook_loss
|
| 402 |
+
|
| 403 |
+
def from_codes(self, codes: torch.Tensor):
    """Reconstruct the continuous representation from quantized codes.

    Parameters
    ----------
    codes : Tensor[B x T x N]
        Quantized discrete representation of the input; ``codes[..., i]``
        holds the indices for codebook ``i``. (The original docstring said
        ``B x N x T``, but the ``codes[..., i]`` indexing puts the codebook
        axis last — confirm against callers.)

    Returns
    -------
    Tensor
        Quantized continuous representation: the sum of each codebook's
        decoded-and-projected latents.
    Tensor
        Concatenation of the per-codebook latents along ``dim=-1``.
        NOTE(review): ``from_latents`` concatenates along ``dim=1`` —
        confirm which axis is intended for this codec's layout.
    Tensor
        The input ``codes``, returned unchanged.
    """
    z_q = 0.0
    z_p = []
    n_codebooks = codes.shape[-1]
    for i in range(n_codebooks):
        # Look up the codebook vectors for quantizer i.
        z_p_i = self.quantizers[i].decode_code(codes[..., i])
        z_p.append(z_p_i)

        # Fix: project with `out_proj`, matching `from_latents` and the DAC
        # reference implementation. The original called a non-existent
        # `out_project`, which would raise AttributeError at runtime.
        z_q_i = self.quantizers[i].out_proj(z_p_i)
        z_q = z_q + z_q_i
    return z_q, torch.cat(z_p, dim=-1), codes
|
| 424 |
+
|
| 425 |
+
def from_latents(self, latents: torch.Tensor):
    """Quantize pre-projection latents back into the continuous space.

    Parameters
    ----------
    latents : Tensor[B x N x T]
        Continuous representation of the input after projection; each
        quantizer owns a contiguous slice of the channel axis.

    Returns
    -------
    Tensor[B x D x T]
        Quantized representation in the full projected space.
    Tensor[B x N x T]
        Quantized representation in the latent space.
    Tensor
        Per-codebook code indices, stacked along ``dim=1``.
    """
    quantized = 0
    projected = []
    code_list = []
    # Channel offsets of each quantizer's slice along the latent axis.
    offsets = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])

    # Use only as many codebooks as the provided latents can fill.
    usable = int(np.where(offsets <= latents.shape[1])[0].max())
    for idx in range(usable):
        start, stop = offsets[idx], offsets[idx + 1]
        z_p_i, codes_i = self.quantizers[idx].decode_latents(latents[:, start:stop, :])
        projected.append(z_p_i)
        code_list.append(codes_i)
        # Accumulate each codebook's contribution in the output space.
        quantized = quantized + self.quantizers[idx].out_proj(z_p_i)

    return quantized, torch.cat(projected, dim=1), torch.stack(code_list, dim=1)
|
| 457 |
+
|
| 458 |
+
|
| 459 |
+
class IndependentVectorQuantize(nn.Module):
    """A bank of independent vector quantizers, one per codebook.

    The input to :meth:`forward` is expected as ``[B, num_codebooks, D]``;
    codebook slice ``i`` is quantized by its own underlying ``torchVQ``
    instance, and per-slice results are concatenated back together.
    """

    def __init__(self, num_codebooks: int = 1, **kwargs):
        super().__init__()
        # One independent quantizer per codebook; kwargs are forwarded verbatim.
        self.vector_quantizers = nn.ModuleList([torchVQ(**kwargs) for _ in range(num_codebooks)])
        self.num_codebooks = num_codebooks
        self.codebook_size = self.vector_quantizers[0].codebook_size

    @property
    def ema_update(self):
        # Per-codebook EMA-update values from the underlying quantizers.
        return [quantizer.ema_update for quantizer in self.vector_quantizers]

    @property
    def codebook(self):
        # Stacked view of all codebooks along a new leading axis.
        return torch.stack([quantizer.codebook for quantizer in self.vector_quantizers], dim=0)

    @codebook.setter
    def codebook(self, codes: List[torch.Tensor]):
        assert len(codes) == self.num_codebooks, "Number of codebooks must match"
        # NOTE(review): `separate_codebook_per_head` is never assigned on this
        # class, so reaching this line raises AttributeError — confirm whether
        # it should be a constructor argument or belongs to torchVQ.
        if not self.separate_codebook_per_head:
            codes = rearrange(codes, "... -> 1 ...")

        for idx, code in enumerate(codes):
            self.vector_quantizers[idx].codebook.copy_(code)

    def get_codes_from_indices(self, indices: torch.Tensor):
        # Decode each codebook's indices independently, then rejoin slices.
        per_codebook = [
            self.vector_quantizers[i].get_codes_from_indices(indices[..., i : i + 1])
            for i in range(self.num_codebooks)
        ]
        return torch.cat(per_codebook, dim=-2)

    def get_output_from_indices(self, indices: torch.Tensor):
        # Same pattern as get_codes_from_indices, but for the output space.
        per_codebook = [
            self.vector_quantizers[i].get_output_from_indices(indices[..., i : i + 1])
            for i in range(self.num_codebooks)
        ]
        return torch.cat(per_codebook, dim=-2)

    def update_in_place_optimizer(self):
        # Propagate the in-place optimizer step to every quantizer.
        for quantizer in self.vector_quantizers:
            quantizer.update_in_place_optimizer()

    def forward(self, x: torch.Tensor, *args, **kwargs):
        """Quantize each codebook slice of ``x`` with its own quantizer.

        Returns the concatenated quantized tensor, the concatenated code
        indices, and the mean commitment loss across codebooks.
        """
        assert x.shape[1] == self.num_codebooks
        results = [self.vector_quantizers[i](x[:, i : i + 1]) for i in range(self.num_codebooks)]
        quantized = torch.cat([q for q, _, _ in results], dim=-2)
        indices = torch.cat([idx for _, idx, _ in results], dim=-1)
        total_commit = sum(loss for _, _, loss in results)
        return quantized, indices, total_commit / self.num_codebooks
|
| 510 |
+
|
| 511 |
+
|
| 512 |
+
if __name__ == "__main__":
    # Smoke test: quantize a [B, num_codebooks, dim] tensor with independent VQs.
    vq = IndependentVectorQuantize(
        num_codebooks=16,
        dim=256,
        codebook_size=2048,
        decay=0.8,  # the exponential moving average decay, lower means the dictionary will change faster
        commitment_weight=1.0,  # the weight on the commitment loss
    )

    x = torch.randn(1, 16, 256)
    # Expected per forward's cat dims: quantized (1, 16, 256), indices (1, 16),
    # scalar commit loss. The previous "(1, 1024, 256), (1, 1024), (1)" comment
    # looks stale (copied from a different configuration) — confirm.
    quantized, indices, commit_loss = vq(x)
|