ZibinDong commited on
Commit
1bc40b2
·
verified ·
1 Parent(s): ae7b990

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -2,10 +2,6 @@
2
  "architectures": [
3
  "ActionCodec"
4
  ],
5
- "auto_map": {
6
- "AutoConfig": "configuration_actioncodec.ActionCodecConfig",
7
- "AutoModel": "modeling_actioncodec.ActionCodec"
8
- },
9
  "decoder_add_causal_mask": false,
10
  "decoder_add_self_attn": false,
11
  "decoder_cls_size": 1,
 
2
  "architectures": [
3
  "ActionCodec"
4
  ],
 
 
 
 
5
  "decoder_add_causal_mask": false,
6
  "decoder_add_self_attn": false,
7
  "decoder_cls_size": 1,
configuration_actioncodec.py CHANGED
@@ -225,4 +225,6 @@ class BPEActionCodecConfig(PretrainedConfig):
225
  AutoConfig.register("action_codec", ActionCodecConfig)
226
  AutoConfig.register("bpe_action_codec", BPEActionCodecConfig)
227
 
 
 
228
  __all__ = ["ActionCodecConfig", "BPEActionCodecConfig"]
 
225
  AutoConfig.register("action_codec", ActionCodecConfig)
226
  AutoConfig.register("bpe_action_codec", BPEActionCodecConfig)
227
 
228
+ ActionCodecConfig.register_for_auto_class()
229
+
230
  __all__ = ["ActionCodecConfig", "BPEActionCodecConfig"]
modeling_actioncodec.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import List
2
 
3
  import einops
4
  import numpy as np
@@ -28,17 +28,67 @@ def trim_trailing_zeros(arr: np.ndarray) -> list[np.ndarray]:
28
 
29
 
30
  class ActionCodec(PreTrainedModel):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  config_class = ActionCodecConfig
32
 
33
  def __init__(self, config: ActionCodecConfig):
 
 
 
 
 
 
 
 
 
 
34
  super().__init__(config)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  self.default_embodiment_id = 0
36
 
 
37
  self.encoder = PerceiverEncoder(config)
38
  self.decoder = PerceiverDecoder(config)
39
 
 
40
  if config.vq_type == "vq":
41
- assert config.n_quantizers == 1, "Only one quantizer is supported for VQ"
 
 
 
42
  self.vq = VectorQuantize(
43
  dim=config.z_dim,
44
  codebook_size=config.vq_codebook_size,
@@ -50,7 +100,10 @@ class ActionCodec(PreTrainedModel):
50
  straight_through=True,
51
  )
52
  elif config.vq_type == "rvq":
53
- assert config.n_quantizers > 1, "At least two quantizers are supported for RVQ"
 
 
 
54
  self.vq = ResidualVectorQuantize(
55
  dim=config.z_dim,
56
  n_codebooks=config.n_quantizers,
@@ -60,17 +113,57 @@ class ActionCodec(PreTrainedModel):
60
  commitment=config.vq_commitment_weight,
61
  )
62
  else:
63
- raise NotImplementedError(f"VQ type {config.vq_type} not implemented")
64
 
 
65
  self.vocab_size = config.vq_codebook_size
66
  self.num_quantizers = config.n_quantizers
67
  self.n_tokens_per_quantizer = config.n_tokens // config.n_quantizers
68
 
69
  def expand_embodiment(self, embodiment_config: dict):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  """
71
- Delegates expansion to the underlying Encoder and Decoder.
72
- This allows the Codec to adapt to new robots dynamically.
73
- """
 
 
 
 
 
 
 
74
  self.encoder.expand_embodiment(embodiment_config)
75
  self.decoder.expand_embodiment(embodiment_config)
76
  self.config.embodiment_config.update(embodiment_config)
@@ -101,7 +194,28 @@ class ActionCodec(PreTrainedModel):
101
  z_e = self.encoder(x, embodiment_ids, padding_mask)
102
  return z_e
103
 
104
- def _quantize(self, z_e: torch.Tensor, return_perplexity: bool = True) -> List[torch.Tensor]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  if isinstance(self.vq, ResidualVectorQuantize):
106
  z_q, indices, _, commitment_loss, codebook_loss = self.vq(z_e)
107
  commit_loss = commitment_loss.mean() + codebook_loss.mean()
@@ -127,18 +241,50 @@ class ActionCodec(PreTrainedModel):
127
  return z_q, indices, perplexity, commit_loss
128
 
129
  def _dequantize(self, indices: torch.Tensor) -> torch.Tensor:
 
 
 
 
 
 
 
 
 
 
 
130
  if self.num_quantizers == 1:
131
  if len(indices.size()) == 3:
132
  indices = indices.squeeze(-1)
133
  if isinstance(self.vq, ResidualVectorQuantize):
134
  z_q = self.vq.from_codes(indices)[0]
135
- else:
136
  z_q = self.vq.get_output_from_indices(indices)
 
 
137
  return z_q
138
 
139
  def _decode(
140
  self, z_q: torch.Tensor, embodiment_ids: torch.Tensor | int | None = None, durations: torch.Tensor | None = None
141
- ) -> torch.Tensor:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  embodiment_ids = embodiment_ids if embodiment_ids is not None else self.default_embodiment_id
143
  x_recon, padding_mask = self.decoder(z_q, embodiment_ids, durations)
144
  return x_recon, padding_mask
@@ -146,275 +292,331 @@ class ActionCodec(PreTrainedModel):
146
  @torch.no_grad()
147
  def encode(
148
  self,
149
- x: np.ndarray,
150
- embodiment_ids: List[int] | int | None = None,
151
- padding_mask: List[bool] | None = None,
 
152
  ) -> List[List[int]]:
153
- """Encode action sequences into latent representations.
 
 
 
154
 
155
  Args:
156
- x (np.ndarray): Action sequences to encode. Shape: (b, seq_len, max_action_dim).
 
157
  Assumes that the action dimension is zero-padded to the max action dimension.
158
- `seq_len` is supposed to be `int(duration * freq)` for each embodiment and padded to the max sequence length.
159
- embodiment_ids (List[int] | int): Embodiment IDs. Shape: (b,).
160
- If int, the same embodiment ID is repeated for all sequences in the batch.
161
- It specifies the embodiment to encode.
162
- padding_mask (List[bool] | None): Padding mask, where `False` values indicate padding. Shape: (b, seq_len). Defaults to None.
163
- It is used to mask the padding tokens on `seq_len` dimension.
 
 
 
 
164
 
165
  Returns:
166
- List[List[int]]: List of token sequences. Shape: (b, n_tokens).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  """
168
  self.eval()
169
- embodiment_ids = embodiment_ids if embodiment_ids is not None else self.default_embodiment_id
170
 
171
- with torch.no_grad():
 
 
 
 
 
172
  x_tensor = torch.tensor(x, dtype=self.dtype, device=self.device)
173
- if not isinstance(embodiment_ids, int):
174
- embodiment_ids = torch.tensor(embodiment_ids, dtype=torch.long, device=self.device)
175
- if padding_mask is not None:
176
- padding_mask = torch.tensor(padding_mask, dtype=torch.bool, device=self.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
- z_e = self._encode(x_tensor, embodiment_ids, padding_mask)
 
179
  _, indices, _, _ = self._quantize(z_e, return_perplexity=False)
 
 
180
  if len(indices.size()) > 2:
181
  codes_list = einops.rearrange(indices, "b n s -> b (s n)").cpu()
182
  else:
183
  codes_list = indices.cpu()
 
184
  codes_list = codes_list.tolist()
185
  return codes_list
186
 
187
  @torch.no_grad()
188
  def decode(
189
- self, tokens: List[List[int]], embodiment_ids: List[int] | int | None = None, durations: List[float] | None = None
190
- ) -> np.ndarray:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  self.eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  embodiment_ids = embodiment_ids if embodiment_ids is not None else self.default_embodiment_id
193
- tokens = torch.tensor(tokens, dtype=torch.long, device=self.device)
194
- if not isinstance(embodiment_ids, int):
195
- embodiment_ids = torch.tensor(embodiment_ids, dtype=torch.long, device=self.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  if durations is not None:
197
- durations = torch.tensor(durations, dtype=torch.float32, device=self.device)
198
-
199
- b, n = tokens.shape
200
- assert n % self.n_tokens_per_quantizer == 0, (
201
- f"Expected {self.n_tokens_per_quantizer} tokens per quantizer, got {n} in total."
202
- )
203
- indices = einops.rearrange(tokens, "b (n m) -> b m n", m=self.n_tokens_per_quantizer)
204
- z_q = self._dequantize(indices)
205
- x_recon, padding_mask = self._decode(z_q, embodiment_ids, durations)
206
- return x_recon.cpu().numpy(), padding_mask.cpu().numpy()
207
-
208
- # def sparse_encode(
209
- # self,
210
- # x: np.ndarray,
211
- # search_num: int = 10,
212
- # threshold: float = 0.1,
213
- # action_encoding: str | None = None,
214
- # remove_padding: bool = True,
215
- # ) -> List[List[int]]:
216
- # """
217
- # Sparse encoding with adaptive token selection based on reconstruction error threshold.
218
- # Uses quaternary search to find optimal token length.
219
-
220
- # Args:
221
- # x: Input action arrays of shape (b, n, d)
222
- # search_num: Maximum number of search iterations
223
- # threshold: Reconstruction error threshold
224
- # action_encoding: Action encoding type
225
- # remove_padding: Whether to remove trailing zeros
226
-
227
- # Returns:
228
- # List of sparse token sequences
229
- # """
230
- # self.eval()
231
- # with torch.no_grad():
232
- # x_tensor = self._numpy_to_tensor(x)
233
-
234
- # # Get initial encoding
235
- # z_e = self._encode(x_tensor, action_encoding)
236
- # _, indices, _, _ = self._quantize(z_e, return_perplexity=False)
237
-
238
- # # Convert indices to proper format
239
- # if len(indices.size()) > 2:
240
- # indices_flat = einops.rearrange(indices, "b n s -> b (s n)")
241
- # else:
242
- # indices_flat = indices
243
-
244
- # # Use quaternary search to find optimal token lengths
245
- # optimal_lengths = self._quaternary_search(x_tensor, indices_flat, threshold, search_num, action_encoding)
246
-
247
- # # Create final sparse tokens based on optimal lengths
248
- # final_tokens = self._create_sparse_tokens_from_lengths(indices_flat, optimal_lengths)
249
-
250
- # # Convert to list format
251
- # if remove_padding:
252
- # final_tokens = trim_trailing_zeros(final_tokens.cpu().numpy())
253
- # else:
254
- # final_tokens = final_tokens.cpu().tolist()
255
-
256
- # return final_tokens
257
-
258
- # def _quaternary_search(
259
- # self,
260
- # x_tensor: torch.Tensor,
261
- # indices_flat: torch.Tensor,
262
- # threshold: float,
263
- # search_num: int,
264
- # action_encoding: str | None = None,
265
- # ) -> torch.Tensor:
266
- # """
267
- # Quaternary search to find optimal token lengths for each batch item.
268
- # Returns tensor of shape (batch_size,) containing optimal lengths.
269
- # """
270
- # batch_size, seq_len = indices_flat.shape
271
-
272
- # # Initialize search bounds
273
- # device = indices_flat.device
274
- # left = torch.ones(batch_size, dtype=torch.long, device=device)
275
- # right = torch.full((batch_size,), seq_len, dtype=torch.long, device=device)
276
-
277
- # # Perform quaternary search
278
- # for _ in range(search_num):
279
- # # Calculate three division points
280
- # range_size = right - left
281
- # q1 = left + range_size // 4
282
- # q2 = left + range_size // 2
283
- # q3 = left + 3 * range_size // 4
284
-
285
- # # Ensure q1, q2, q3 are within bounds and distinct
286
- # q1 = torch.clamp(q1, left, right)
287
- # q2 = torch.clamp(q2, q1 + 1, right)
288
- # q3 = torch.clamp(q3, q2 + 1, right)
289
-
290
- # # Create test lengths: [left, q1, q2, q3, right]
291
- # test_lengths = torch.stack([left, q1, q2, q3, right], dim=1) # (batch_size, 5)
292
-
293
- # # Calculate errors for all test lengths
294
- # errors = self._calculate_errors_for_lengths(x_tensor, indices_flat, test_lengths, action_encoding)
295
-
296
- # # Update search bounds based on results (vectorized)
297
- # # Find which lengths meet threshold for each batch item
298
- # meets_threshold = errors <= threshold
299
-
300
- # # For each batch item, find the smallest length that meets threshold
301
- # valid_indices = torch.argmax(meets_threshold.float(), dim=1) # First True index
302
- # has_valid = meets_threshold.any(dim=1) # Whether any length meets threshold
303
-
304
- # # Create batch indices for advanced indexing
305
- # batch_indices = torch.arange(batch_size, device=device)
306
-
307
- # # Get the smallest valid length for each batch
308
- # smallest_valid_lengths = test_lengths[batch_indices, valid_indices]
309
-
310
- # # Update bounds based on results
311
- # # If has valid length, use it; otherwise use longest length
312
- # right = torch.where(has_valid, smallest_valid_lengths, test_lengths[:, -1])
313
-
314
- # # Update left bound: if we found a valid length and it's not the first one,
315
- # # use the previous length; otherwise keep current left
316
- # prev_lengths = torch.where(valid_indices > 0, test_lengths[batch_indices, valid_indices - 1], left)
317
- # left = torch.where(has_valid & (valid_indices > 0), prev_lengths, left)
318
-
319
- # # Check convergence
320
- # if (right - left).max() <= 1:
321
- # break
322
-
323
- # return right # Return optimal lengths
324
-
325
- # def _calculate_errors_for_lengths(
326
- # self,
327
- # x_tensor: torch.Tensor,
328
- # indices_flat: torch.Tensor,
329
- # test_lengths: torch.Tensor,
330
- # action_encoding: str | None = None,
331
- # ) -> torch.Tensor:
332
- # """
333
- # Calculate reconstruction errors for given token lengths.
334
-
335
- # Args:
336
- # x_tensor: Original input tensor (batch_size, ...)
337
- # indices_flat: Full token indices (batch_size, seq_len)
338
- # test_lengths: Test lengths tensor (batch_size, num_tests)
339
- # action_encoding: Action encoding type
340
-
341
- # Returns:
342
- # Error tensor (batch_size, num_tests)
343
- # """
344
- # # Create sparse tokens for all test lengths (vectorized)
345
- # batch_size, num_tests = test_lengths.shape
346
- # seq_len = indices_flat.shape[1]
347
- # device = indices_flat.device
348
-
349
- # # Create position tensor for all combinations
350
- # positions = torch.arange(seq_len, device=device).unsqueeze(0).unsqueeze(0) # (1, 1, seq_len)
351
- # positions = positions.expand(batch_size, num_tests, -1) # (batch_size, num_tests, seq_len)
352
-
353
- # # Create length mask: positions < test_lengths
354
- # length_mask = positions < test_lengths.unsqueeze(2) # (batch_size, num_tests, seq_len)
355
-
356
- # # Create sparse tokens using advanced indexing
357
- # sparse_tokens = torch.where(
358
- # length_mask,
359
- # indices_flat.unsqueeze(1).expand(-1, num_tests, -1),
360
- # torch.zeros_like(indices_flat).unsqueeze(1).expand(-1, num_tests, -1),
361
- # )
362
-
363
- # # Reshape for parallel processing
364
- # sparse_flat = sparse_tokens.view(batch_size * num_tests, seq_len)
365
-
366
- # # Decode all sparse tokens in parallel
367
- # reconstructed_flat = self._decode_sparse_tokens(sparse_flat, action_encoding)
368
-
369
- # # Reshape back and calculate errors
370
- # reconstructed = reconstructed_flat.view(batch_size, num_tests, *x_tensor.shape[1:])
371
-
372
- # # Calculate errors
373
- # x_expanded = x_tensor.unsqueeze(1).expand(-1, num_tests, -1, -1)
374
- # errors = (x_expanded - reconstructed).abs().mean((-1, -2)) # (batch_size, num_tests)
375
-
376
- # return errors
377
-
378
- # def _decode_sparse_tokens(self, sparse_tokens: torch.Tensor, action_encoding: str | None = None) -> torch.Tensor:
379
- # """Decode sparse tokens to reconstructed data."""
380
- # batch_size, seq_len = sparse_tokens.shape
381
-
382
- # # Convert to proper indices format for dequantization
383
- # if self.num_quantizers > 1:
384
- # seq_len_per_quantizer = seq_len // self.num_quantizers
385
- # if seq_len % self.num_quantizers != 0:
386
- # raise ValueError("Sequence length must be divisible by num_quantizers")
387
-
388
- # indices_for_decode = sparse_tokens.view(batch_size, self.num_quantizers, seq_len_per_quantizer).transpose(
389
- # 1, 2
390
- # ) # (batch_size, seq_len_per_quantizer, num_quantizers)
391
- # else:
392
- # indices_for_decode = sparse_tokens.unsqueeze(-1) # (batch_size, seq_len, 1)
393
-
394
- # # Dequantize and decode
395
- # z_q = self._dequantize(indices_for_decode)
396
- # reconstructed = self._decode(z_q, action_encoding)
397
-
398
- # return reconstructed
399
-
400
- # def _create_sparse_tokens_from_lengths(
401
- # self, indices_flat: torch.Tensor, optimal_lengths: torch.Tensor
402
- # ) -> torch.Tensor:
403
- # """Create sparse tokens based on optimal lengths (vectorized)."""
404
- # batch_size, seq_len = indices_flat.shape
405
- # device = indices_flat.device
406
-
407
- # # Create position mask for all batch items simultaneously
408
- # positions = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, -1) # (batch_size, seq_len)
409
- # length_mask = positions < optimal_lengths.unsqueeze(1) # (batch_size, seq_len)
410
-
411
- # # Apply mask to create sparse tokens
412
- # result = torch.where(length_mask, indices_flat, torch.zeros_like(indices_flat))
413
-
414
- # return result
415
-
416
- def forward(self, x: torch.Tensor, embodiment_ids: int | None = None, padding_mask: List[bool] | None = None):
417
- return self.encode(x, embodiment_ids, padding_mask)
418
 
419
 
420
  AutoModel.register(ActionCodecConfig, ActionCodec)
 
1
+ from typing import List, Tuple, Union
2
 
3
  import einops
4
  import numpy as np
 
28
 
29
 
30
  class ActionCodec(PreTrainedModel):
31
+ """ActionCodec: A neural codec for encoding and decoding robot action sequences.
32
+
33
+ This model uses a Perceiver-based encoder-decoder architecture with vector quantization
34
+ to convert continuous action sequences into discrete token sequences. It supports
35
+ multiple robot embodiments with different action dimensions and control frequencies.
36
+
37
+ The model supports two vector quantization types:
38
+ - VQ (Vector Quantization): Single quantizer
39
+ - RVQ (Residual Vector Quantization): Multiple quantizers for hierarchical encoding
40
+
41
+ Key features:
42
+ - Multi-embodiment support: Handle different robots with varying action dimensions
43
+ - Dynamic expansion: Add new robot configurations without retraining
44
+ - Flexible input/output: Support numpy arrays and torch tensors
45
+ """
46
+
47
  config_class = ActionCodecConfig
48
 
49
  def __init__(self, config: ActionCodecConfig):
50
+ """Initialize the ActionCodec model.
51
+
52
+ Args:
53
+ config (ActionCodecConfig): Model configuration containing hyperparameters
54
+ and embodiment configurations.
55
+
56
+ Raises:
57
+ ValueError: If configuration parameters are invalid.
58
+ NotImplementedError: If the specified VQ type is not supported.
59
+ """
60
  super().__init__(config)
61
+
62
+ # Validate configuration
63
+ if config.n_tokens % config.n_quantizers != 0:
64
+ raise ValueError(f"n_tokens ({config.n_tokens}) must be divisible by n_quantizers ({config.n_quantizers})")
65
+
66
+ if config.n_quantizers < 1:
67
+ raise ValueError(f"n_quantizers must be at least 1, got {config.n_quantizers}")
68
+
69
+ if config.vq_codebook_size < 1:
70
+ raise ValueError(f"vq_codebook_size must be at least 1, got {config.vq_codebook_size}")
71
+
72
+ if config.z_dim < 1:
73
+ raise ValueError(f"z_dim must be at least 1, got {config.z_dim}")
74
+
75
+ if not isinstance(config.embodiment_config, dict) or len(config.embodiment_config) == 0:
76
+ raise ValueError(
77
+ "embodiment_config must be a non-empty dictionary mapping embodiment names to configurations"
78
+ )
79
+
80
  self.default_embodiment_id = 0
81
 
82
+ # Initialize encoder and decoder
83
  self.encoder = PerceiverEncoder(config)
84
  self.decoder = PerceiverDecoder(config)
85
 
86
+ # Initialize vector quantizer based on type
87
  if config.vq_type == "vq":
88
+ if config.n_quantizers != 1:
89
+ raise ValueError(
90
+ f"VQ type requires n_quantizers=1, got {config.n_quantizers}. Use RVQ type for multiple quantizers."
91
+ )
92
  self.vq = VectorQuantize(
93
  dim=config.z_dim,
94
  codebook_size=config.vq_codebook_size,
 
100
  straight_through=True,
101
  )
102
  elif config.vq_type == "rvq":
103
+ if config.n_quantizers < 2:
104
+ raise ValueError(
105
+ f"RVQ type requires n_quantizers >= 2, got {config.n_quantizers}. Use VQ type for single quantizer."
106
+ )
107
  self.vq = ResidualVectorQuantize(
108
  dim=config.z_dim,
109
  n_codebooks=config.n_quantizers,
 
113
  commitment=config.vq_commitment_weight,
114
  )
115
  else:
116
+ raise NotImplementedError(f"VQ type '{config.vq_type}' not implemented. Supported types: 'vq', 'rvq'")
117
 
118
+ # Store quantization-related attributes
119
  self.vocab_size = config.vq_codebook_size
120
  self.num_quantizers = config.n_quantizers
121
  self.n_tokens_per_quantizer = config.n_tokens // config.n_quantizers
122
 
123
  def expand_embodiment(self, embodiment_config: dict):
124
+ """Dynamically expand the model to support new robot embodiments.
125
+
126
+ This method allows adding new robot configurations to the codec without retraining
127
+ the entire model. It updates the encoder and decoder to handle the new action dimensions
128
+ and frequencies while preserving existing functionality for previously configured robots.
129
+
130
+ Args:
131
+ embodiment_config (dict): Dictionary mapping embodiment names to their configurations.
132
+ Each configuration should be a dict with keys:
133
+ - "action_dim" (int): Action dimensionality for this embodiment.
134
+ - "freq" (float): Control frequency in Hz.
135
+ - "duration" (float): Default action sequence duration in seconds.
136
+ - "description" (str, optional): Human-readable description.
137
+
138
+ Example:
139
+ {
140
+ "robot_B": {
141
+ "action_dim": 10,
142
+ "freq": 20,
143
+ "duration": 1.0,
144
+ "description": "10-dim robot at 20Hz"
145
+ }
146
+ }
147
+
148
+ Returns:
149
+ ActionCodec: Returns self for method chaining.
150
+
151
+ Note:
152
+ - New embodiment keys must not already exist in the current configuration.
153
+ - The model will automatically update max_action_dim if the new embodiment
154
+ has a larger action dimension.
155
+ - Existing embodiments will continue to work with their original configurations.
156
  """
157
+ if not isinstance(embodiment_config, dict):
158
+ raise TypeError(f"embodiment_config must be a dict, got {type(embodiment_config)}")
159
+ if len(embodiment_config) == 0:
160
+ raise ValueError("embodiment_config cannot be empty")
161
+
162
+ # Check for duplicate keys
163
+ overlapping_keys = set(embodiment_config.keys()) & set(self.config.embodiment_config.keys())
164
+ if overlapping_keys:
165
+ raise ValueError(f"The following embodiment keys already exist and cannot be redefined: {overlapping_keys}")
166
+
167
  self.encoder.expand_embodiment(embodiment_config)
168
  self.decoder.expand_embodiment(embodiment_config)
169
  self.config.embodiment_config.update(embodiment_config)
 
194
  z_e = self.encoder(x, embodiment_ids, padding_mask)
195
  return z_e
196
 
197
+ def _quantize(
198
+ self, z_e: torch.Tensor, return_perplexity: bool = True
199
+ ) -> Tuple[torch.Tensor, torch.Tensor, Union[float, List[float]], torch.Tensor]:
200
+ """Quantize encoded representations using vector quantization.
201
+
202
+ Args:
203
+ z_e (torch.Tensor): Encoded latent representations to quantize.
204
+ Shape: (b, n_tokens_per_quantizer, z_dim).
205
+ return_perplexity (bool, optional): Whether to compute and return perplexity.
206
+ Defaults to True.
207
+
208
+ Returns:
209
+ Tuple[torch.Tensor, torch.Tensor, Union[float, List[float]], torch.Tensor]:
210
+ A tuple containing:
211
+ - z_q (torch.Tensor): Quantized representations.
212
+ Shape: (b, n_tokens_per_quantizer, z_dim).
213
+ - indices (torch.Tensor): Quantization indices.
214
+ Shape: (b, n_tokens_per_quantizer) for VQ or (b, n_tokens_per_quantizer, n_quantizers) for RVQ.
215
+ - perplexity (Union[float, List[float]]): Codebook perplexity.
216
+ Float for single quantizer, List[float] for multiple quantizers.
217
+ - commit_loss (torch.Tensor): Commitment loss scalar tensor.
218
+ """
219
  if isinstance(self.vq, ResidualVectorQuantize):
220
  z_q, indices, _, commitment_loss, codebook_loss = self.vq(z_e)
221
  commit_loss = commitment_loss.mean() + codebook_loss.mean()
 
241
  return z_q, indices, perplexity, commit_loss
242
 
243
  def _dequantize(self, indices: torch.Tensor) -> torch.Tensor:
244
+ """Dequantize token indices back to continuous latent representations.
245
+
246
+ Args:
247
+ indices (torch.Tensor): Quantization indices. Shape depends on quantizer type:
248
+ - For VQ: (b, n_tokens) or (b, n_tokens, 1)
249
+ - For RVQ: (b, n_tokens_per_quantizer, n_quantizers)
250
+
251
+ Returns:
252
+ torch.Tensor: Dequantized latent representations.
253
+ Shape: (b, n_tokens_per_quantizer, z_dim)
254
+ """
255
  if self.num_quantizers == 1:
256
  if len(indices.size()) == 3:
257
  indices = indices.squeeze(-1)
258
  if isinstance(self.vq, ResidualVectorQuantize):
259
  z_q = self.vq.from_codes(indices)[0]
260
+ elif isinstance(self.vq, VectorQuantize):
261
  z_q = self.vq.get_output_from_indices(indices)
262
+ else:
263
+ raise NotImplementedError(f"VQ type {type(self.vq)} not implemented in _dequantize")
264
  return z_q
265
 
266
  def _decode(
267
  self, z_q: torch.Tensor, embodiment_ids: torch.Tensor | int | None = None, durations: torch.Tensor | None = None
268
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
269
+ """Decode quantized latent representations into action sequences.
270
+
271
+ Args:
272
+ z_q (torch.Tensor): Quantized latent representations.
273
+ Shape: (b, n_tokens_per_quantizer, z_dim).
274
+ embodiment_ids (Union[torch.Tensor, int, None], optional): Embodiment IDs.
275
+ Shape: (b,) if tensor. If int, the same embodiment ID is used for all
276
+ sequences. Defaults to None, which uses `self.default_embodiment_id`.
277
+ durations (torch.Tensor | None, optional): Duration of each action sequence in seconds.
278
+ Shape: (b,). If None, uses default duration from embodiment_config.
279
+ Defaults to None.
280
+
281
+ Returns:
282
+ Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
283
+ - x_recon (torch.Tensor): Reconstructed action sequences.
284
+ Shape: (b, seq_len, max_action_dim).
285
+ - padding_mask (torch.Tensor): Padding mask indicating valid timesteps.
286
+ Shape: (b, seq_len), where True indicates valid timesteps.
287
+ """
288
  embodiment_ids = embodiment_ids if embodiment_ids is not None else self.default_embodiment_id
289
  x_recon, padding_mask = self.decoder(z_q, embodiment_ids, durations)
290
  return x_recon, padding_mask
 
292
  @torch.no_grad()
293
  def encode(
294
  self,
295
+ x: Union[np.ndarray, torch.Tensor],
296
+ embodiment_ids: Union[List[int], int, None] = None,
297
+ padding_mask: Union[List[bool], np.ndarray, torch.Tensor, None] = None,
298
+ **kwargs,
299
  ) -> List[List[int]]:
300
+ """Encode action sequences into latent representations (token indices).
301
+
302
+ This method converts action sequences into discrete token indices using the encoder
303
+ and vector quantizer. The input can be either a numpy array or torch tensor.
304
 
305
  Args:
306
+ x (Union[np.ndarray, torch.Tensor]): Action sequences to encode.
307
+ Shape: (b, seq_len, max_action_dim).
308
  Assumes that the action dimension is zero-padded to the max action dimension.
309
+ `seq_len` is supposed to be `int(duration * freq)` for each embodiment and
310
+ padded to the max sequence length.
311
+ embodiment_ids (Union[List[int], int, None], optional): Embodiment IDs.
312
+ Shape: (b,) if list. If int, the same embodiment ID is repeated for all
313
+ sequences in the batch. It specifies the embodiment to encode.
314
+ Defaults to None, which uses `self.default_embodiment_id`.
315
+ padding_mask (Union[List[bool], np.ndarray, torch.Tensor, None], optional):
316
+ Padding mask, where `False` values indicate padding. Shape: (b, seq_len).
317
+ Defaults to None. It is used to mask the padding tokens on `seq_len` dimension.
318
+ **kwargs: Additional keyword arguments (currently unused, reserved for future use).
319
 
320
  Returns:
321
+ List[List[int]]: List of token sequences. Shape: (b, n_tokens), where n_tokens
322
+ is determined by the model configuration (typically `config.n_tokens`).
323
+
324
+ Raises:
325
+ ValueError: If input shapes are invalid or incompatible with the model configuration.
326
+ TypeError: If input types are not supported.
327
+
328
+ Examples:
329
+ >>> import numpy as np
330
+ >>> # Using numpy array
331
+ >>> x = np.random.randn(2, 10, 7).astype(np.float32)
332
+ >>> tokens = model.encode(x, embodiment_ids=[0, 0])
333
+ >>> # Using torch tensor
334
+ >>> x_tensor = torch.randn(2, 10, 7)
335
+ >>> tokens = model.encode(x_tensor, embodiment_ids=[0, 0])
336
  """
337
  self.eval()
 
338
 
339
+ # Validate and convert input x
340
+ if isinstance(x, np.ndarray):
341
+ if x.ndim != 3:
342
+ raise ValueError(
343
+ f"Expected 3D input array (batch, seq_len, action_dim), got {x.ndim}D array with shape {x.shape}"
344
+ )
345
  x_tensor = torch.tensor(x, dtype=self.dtype, device=self.device)
346
+ elif isinstance(x, torch.Tensor):
347
+ if x.ndim != 3:
348
+ raise ValueError(
349
+ f"Expected 3D tensor (batch, seq_len, action_dim), got {x.ndim}D tensor with shape {x.shape}"
350
+ )
351
+ x_tensor = x.to(dtype=self.dtype, device=self.device)
352
+ else:
353
+ raise TypeError(f"Input x must be numpy.ndarray or torch.Tensor, got {type(x)}")
354
+
355
+ # Validate batch size
356
+ batch_size = x_tensor.shape[0]
357
+ if batch_size == 0:
358
+ raise ValueError("Batch size must be at least 1")
359
+
360
+ # Handle embodiment_ids
361
+ embodiment_ids = embodiment_ids if embodiment_ids is not None else self.default_embodiment_id
362
+ if isinstance(embodiment_ids, int):
363
+ if not 0 <= embodiment_ids < len(self.config.embodiment_config):
364
+ raise ValueError(
365
+ f"embodiment_id {embodiment_ids} is out of range [0, {len(self.config.embodiment_config)}). "
366
+ f"Available embodiment IDs: {list(range(len(self.config.embodiment_config)))}"
367
+ )
368
+ embodiment_ids_tensor = torch.tensor([embodiment_ids] * batch_size, dtype=torch.long, device=self.device)
369
+ elif isinstance(embodiment_ids, list):
370
+ if len(embodiment_ids) != batch_size:
371
+ raise ValueError(
372
+ f"Length of embodiment_ids ({len(embodiment_ids)}) must match batch size ({batch_size})"
373
+ )
374
+ for eid in embodiment_ids:
375
+ if not isinstance(eid, int) or not 0 <= eid < len(self.config.embodiment_config):
376
+ raise ValueError(
377
+ f"Invalid embodiment_id {eid}. Must be an integer in range [0, {len(self.config.embodiment_config)})"
378
+ )
379
+ embodiment_ids_tensor = torch.tensor(embodiment_ids, dtype=torch.long, device=self.device)
380
+ else:
381
+ raise TypeError(f"embodiment_ids must be int, List[int], or None, got {type(embodiment_ids)}")
382
+
383
+ # Handle padding_mask
384
+ padding_mask_tensor = None
385
+ if padding_mask is not None:
386
+ if isinstance(padding_mask, (list, np.ndarray)):
387
+ padding_mask_tensor = torch.tensor(padding_mask, dtype=torch.bool, device=self.device)
388
+ elif isinstance(padding_mask, torch.Tensor):
389
+ padding_mask_tensor = padding_mask.to(dtype=torch.bool, device=self.device)
390
+ else:
391
+ raise TypeError(
392
+ f"padding_mask must be List[bool], np.ndarray, torch.Tensor, or None, got {type(padding_mask)}"
393
+ )
394
+ if padding_mask_tensor.shape != (batch_size, x_tensor.shape[1]):
395
+ raise ValueError(
396
+ f"padding_mask shape {padding_mask_tensor.shape} does not match expected shape "
397
+ f"({batch_size}, {x_tensor.shape[1]})"
398
+ )
399
 
400
+ with torch.no_grad():
401
+ z_e = self._encode(x_tensor, embodiment_ids_tensor, padding_mask_tensor)
402
  _, indices, _, _ = self._quantize(z_e, return_perplexity=False)
403
+
404
+ # Reshape indices: for RVQ, indices shape is (b, n, s), for VQ it's (b, n)
405
  if len(indices.size()) > 2:
406
  codes_list = einops.rearrange(indices, "b n s -> b (s n)").cpu()
407
  else:
408
  codes_list = indices.cpu()
409
+
410
  codes_list = codes_list.tolist()
411
  return codes_list
412
 
413
  @torch.no_grad()
414
  def decode(
415
+ self,
416
+ tokens: Union[List[List[int]], np.ndarray, torch.Tensor],
417
+ embodiment_ids: Union[List[int], int, None] = None,
418
+ durations: Union[List[float], np.ndarray, torch.Tensor, None] = None,
419
+ **kwargs,
420
+ ) -> Tuple[np.ndarray, np.ndarray]:
421
+ """Decode token sequences into action sequences.
422
+
423
+ This method reconstructs action sequences from discrete token indices using the
424
+ vector quantizer and decoder. The input tokens can be a list of lists, numpy array,
425
+ or torch tensor.
426
+
427
+ Args:
428
+ tokens (Union[List[List[int]], np.ndarray, torch.Tensor]): Token sequences to decode.
429
+ Shape: (b, n_tokens), where n_tokens must be divisible by `n_tokens_per_quantizer`.
430
+ For RVQ, tokens are interleaved: [q0_t0, q1_t0, ..., qN_t0, q0_t1, ...].
431
+ embodiment_ids (Union[List[int], int, None], optional): Embodiment IDs.
432
+ Shape: (b,) if list. If int, the same embodiment ID is repeated for all
433
+ sequences in the batch. It specifies the embodiment to decode.
434
+ Defaults to None, which uses `self.default_embodiment_id`.
435
+ durations (Union[List[float], np.ndarray, torch.Tensor, None], optional):
436
+ Duration of each action sequence in seconds. Shape: (b,).
437
+ If None, the duration is inferred from the default values in `embodiment_config`.
438
+ Defaults to None.
439
+ **kwargs: Additional keyword arguments (currently unused, reserved for future use).
440
+
441
+ Returns:
442
+ Tuple[np.ndarray, np.ndarray]: A tuple containing:
443
+ - reconstructed_actions: Reconstructed action sequences.
444
+ Shape: (b, seq_len, max_action_dim).
445
+ - padding_mask: Padding mask indicating valid timesteps.
446
+ Shape: (b, seq_len), where True indicates valid timesteps.
447
+
448
+ Raises:
449
+ ValueError: If token sequence length is invalid or incompatible with the model configuration.
450
+ TypeError: If input types are not supported.
451
+
452
+ Examples:
453
+ >>> # Using list of lists
454
+ >>> tokens = [[1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14, 15, 16]]
455
+ >>> actions, mask = model.decode(tokens, embodiment_ids=[0, 0])
456
+ >>> # Using numpy array
457
+ >>> tokens_np = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
458
+ >>> actions, mask = model.decode(tokens_np, embodiment_ids=[0, 0])
459
+ >>> # Using torch tensor
460
+ >>> tokens_tensor = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
461
+ >>> actions, mask = model.decode(tokens_tensor, embodiment_ids=[0, 0])
462
+ """
463
  self.eval()
464
+
465
+ # Validate and convert input tokens
466
+ if isinstance(tokens, list):
467
+ if not all(isinstance(seq, list) for seq in tokens):
468
+ raise TypeError("If tokens is a list, all elements must be lists")
469
+ if len(tokens) == 0:
470
+ raise ValueError("Tokens list cannot be empty")
471
+ if not all(isinstance(val, (int, np.integer)) for seq in tokens for val in seq):
472
+ raise TypeError("All token values must be integers")
473
+ tokens_tensor = torch.tensor(tokens, dtype=torch.long, device=self.device)
474
+ elif isinstance(tokens, np.ndarray):
475
+ if tokens.ndim != 2:
476
+ raise ValueError(
477
+ f"Expected 2D array (batch, n_tokens), got {tokens.ndim}D array with shape {tokens.shape}"
478
+ )
479
+ if not np.issubdtype(tokens.dtype, np.integer):
480
+ raise TypeError(f"Tokens array must have integer dtype, got {tokens.dtype}")
481
+ tokens_tensor = torch.tensor(tokens, dtype=torch.long, device=self.device)
482
+ elif isinstance(tokens, torch.Tensor):
483
+ if tokens.ndim != 2:
484
+ raise ValueError(
485
+ f"Expected 2D tensor (batch, n_tokens), got {tokens.ndim}D tensor with shape {tokens.shape}"
486
+ )
487
+ if not tokens.dtype.is_integer:
488
+ raise TypeError(f"Tokens tensor must have integer dtype, got {tokens.dtype}")
489
+ tokens_tensor = tokens.to(dtype=torch.long, device=self.device)
490
+ else:
491
+ raise TypeError(f"tokens must be List[List[int]], np.ndarray, or torch.Tensor, got {type(tokens)}")
492
+
493
+ batch_size, n_tokens = tokens_tensor.shape
494
+ if batch_size == 0:
495
+ raise ValueError("Batch size must be at least 1")
496
+ if n_tokens == 0:
497
+ raise ValueError("Token sequence length must be at least 1")
498
+
499
+ # Validate token sequence length
500
+ if n_tokens % self.n_tokens_per_quantizer != 0:
501
+ raise ValueError(
502
+ f"Token sequence length ({n_tokens}) must be divisible by tokens per quantizer "
503
+ f"({self.n_tokens_per_quantizer}). Total tokens: {n_tokens}, "
504
+ f"Expected multiple of: {self.n_tokens_per_quantizer}. "
505
+ f"Number of quantizers: {self.num_quantizers}, Total tokens per sequence: {self.config.n_tokens}"
506
+ )
507
+
508
+ # Validate token values are within codebook range
509
+ if tokens_tensor.min() < 0 or tokens_tensor.max() >= self.vocab_size:
510
+ raise ValueError(
511
+ f"Token values must be in range [0, {self.vocab_size}), "
512
+ f"got range [{tokens_tensor.min().item()}, {tokens_tensor.max().item()}]"
513
+ )
514
+
515
+ # Handle embodiment_ids
516
  embodiment_ids = embodiment_ids if embodiment_ids is not None else self.default_embodiment_id
517
+ if isinstance(embodiment_ids, int):
518
+ if not 0 <= embodiment_ids < len(self.config.embodiment_config):
519
+ raise ValueError(
520
+ f"embodiment_id {embodiment_ids} is out of range [0, {len(self.config.embodiment_config)}). "
521
+ f"Available embodiment IDs: {list(range(len(self.config.embodiment_config)))}"
522
+ )
523
+ embodiment_ids_tensor = torch.tensor([embodiment_ids] * batch_size, dtype=torch.long, device=self.device)
524
+ elif isinstance(embodiment_ids, list):
525
+ if len(embodiment_ids) != batch_size:
526
+ raise ValueError(
527
+ f"Length of embodiment_ids ({len(embodiment_ids)}) must match batch size ({batch_size})"
528
+ )
529
+ for eid in embodiment_ids:
530
+ if not isinstance(eid, int) or not 0 <= eid < len(self.config.embodiment_config):
531
+ raise ValueError(
532
+ f"Invalid embodiment_id {eid}. Must be an integer in range [0, {len(self.config.embodiment_config)})"
533
+ )
534
+ embodiment_ids_tensor = torch.tensor(embodiment_ids, dtype=torch.long, device=self.device)
535
+ else:
536
+ raise TypeError(f"embodiment_ids must be int, List[int], or None, got {type(embodiment_ids)}")
537
+
538
+ # Handle durations
539
+ durations_tensor = None
540
  if durations is not None:
541
+ if isinstance(durations, (list, np.ndarray)):
542
+ durations_tensor = torch.tensor(durations, dtype=torch.float32, device=self.device)
543
+ elif isinstance(durations, torch.Tensor):
544
+ durations_tensor = durations.to(dtype=torch.float32, device=self.device)
545
+ else:
546
+ raise TypeError(
547
+ f"durations must be List[float], np.ndarray, torch.Tensor, or None, got {type(durations)}"
548
+ )
549
+ if durations_tensor.ndim != 1:
550
+ raise ValueError(
551
+ f"durations must be 1D, got {durations_tensor.ndim}D with shape {durations_tensor.shape}"
552
+ )
553
+ if len(durations_tensor) != batch_size:
554
+ raise ValueError(f"Length of durations ({len(durations_tensor)}) must match batch size ({batch_size})")
555
+ if (durations_tensor <= 0).any():
556
+ raise ValueError("All durations must be positive")
557
+
558
+ # Reshape tokens for dequantization: (b, n_tokens) -> (b, n_tokens_per_quantizer, n_quantizers)
559
+ indices = einops.rearrange(tokens_tensor, "b (n m) -> b m n", m=self.n_tokens_per_quantizer)
560
+
561
+ with torch.no_grad():
562
+ z_q = self._dequantize(indices)
563
+ x_recon, padding_mask = self._decode(z_q, embodiment_ids_tensor, durations_tensor)
564
+
565
+ return x_recon.float().cpu().numpy(), padding_mask.float().cpu().numpy()
566
+
567
+ def forward(
568
+ self,
569
+ x: Union[torch.Tensor, np.ndarray],
570
+ embodiment_ids: Union[torch.Tensor, int, List[int], None] = None,
571
+ padding_mask: Union[torch.Tensor, List[bool], np.ndarray, None] = None,
572
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
573
+ """Forward pass through the full ActionCodec pipeline.
574
+
575
+ This method performs encoding, quantization, and decoding in a single forward pass.
576
+ It is primarily used during training to compute reconstruction loss and commitment loss.
577
+ Both numpy arrays and torch tensors are supported as input.
578
+
579
+ Args:
580
+ x (Union[torch.Tensor, np.ndarray]): Action sequences to process.
581
+ Shape: (b, seq_len, max_action_dim).
582
+ embodiment_ids (Union[torch.Tensor, int, List[int], None], optional):
583
+ Embodiment IDs. Shape: (b,) if tensor or list. If int, same ID for all sequences.
584
+ Defaults to None, which uses `self.default_embodiment_id`.
585
+ padding_mask (Union[torch.Tensor, List[bool], np.ndarray, None], optional):
586
+ Padding mask. Shape: (b, seq_len). Defaults to None.
587
+
588
+ Returns:
589
+ Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
590
+ - x_recon (torch.Tensor): Reconstructed action sequences.
591
+ Shape: (b, seq_len, max_action_dim).
592
+ - recon_mask (torch.Tensor): Reconstruction mask indicating valid timesteps.
593
+ Shape: (b, seq_len), where True indicates valid timesteps.
594
+
595
+ Note:
596
+ - For inference use cases, prefer using `encode()` and `decode()` methods separately.
597
+ - If you need token indices, use the `encode()` method instead.
598
+ """
599
+ # Convert numpy array to torch tensor if needed
600
+ if isinstance(x, np.ndarray):
601
+ x = torch.tensor(x, dtype=self.dtype, device=self.device)
602
+
603
+ # Handle embodiment_ids conversion
604
+ if isinstance(embodiment_ids, list):
605
+ embodiment_ids = torch.tensor(embodiment_ids, device=x.device, dtype=torch.long)
606
+ elif isinstance(embodiment_ids, int):
607
+ # Keep as int, will be handled by _encode
608
+ pass
609
+
610
+ # Handle padding_mask conversion
611
+ if isinstance(padding_mask, (list, np.ndarray)):
612
+ padding_mask = torch.tensor(padding_mask, device=x.device, dtype=torch.bool)
613
+
614
+ # Full forward pass: encode -> quantize -> decode
615
+ z_e = self._encode(x, embodiment_ids, padding_mask)
616
+ z_q, indices, perplexity, commit_loss = self._quantize(z_e, return_perplexity=True)
617
+ x_recon, recon_mask = self._decode(z_q, embodiment_ids)
618
+
619
+ return x_recon, recon_mask
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
 
621
 
622
# Register with transformers' Auto* machinery. `register_for_auto_class` mirrors
# the pattern used in configuration_actioncodec.py so that trust_remote_code
# auto-mapping works without an explicit `auto_map` entry in config.json.
AutoModel.register(ActionCodecConfig, ActionCodec)
ActionCodec.register_for_auto_class("AutoModel")