jadechoghari committed
Commit 393ff84 · verified · 1 Parent(s): 44c81f0

Upload FAST tokenizer trained on lerobot/libero_video

processing_action_tokenizer.py CHANGED
@@ -1,140 +1,241 @@
 import logging
-from typing import ClassVar
+from typing import ClassVar, List, Optional
 
 import numpy as np
-from scipy.fft import dct
-from scipy.fft import idct
+import pywt
 from tokenizers import ByteLevelBPETokenizer
 from tokenizers.trainers import BpeTrainer
 from transformers import PreTrainedTokenizerFast
 from transformers.processing_utils import ProcessorMixin
 
 
-class UniversalActionProcessor(ProcessorMixin):
+class WaveletActionProcessor(ProcessorMixin):
     attributes: ClassVar[list[str]] = ["bpe_tokenizer"]
     bpe_tokenizer_class: str = "AutoTokenizer"
 
     def __init__(
         self,
         bpe_tokenizer: PreTrainedTokenizerFast,
-        scale: float = 10,
-        vocab_size: int = 1024,
+        wavelet: str = "db1",
+        level: int = 2,
+        scale: float = 10.0,
         min_token: int = 0,
         *,
-        action_dim: int | None = None,
-        time_horizon: int | None = None,
+        action_dim: Optional[int] = None,
+        time_horizon: Optional[int] = None,
     ):
+        self.wavelet = wavelet
+        self.level = level
         self.scale = scale
-        self.vocab_size = vocab_size
-        self.min_token = min_token
-
-        # Action horizon and dimension needed during decoding. These can be specified
-        # in three ways (in order of priority):
-        # 1. passed in as kwargs to decode()
-        # 2. in the constructor
-        # 3. cached from the last time decode() was called
+        self.min_token = int(min_token)
+
+        # Used for decode (same logic as FAST)
         self.time_horizon = time_horizon
         self.action_dim = action_dim
         self.called_time_horizon = time_horizon
         self.called_action_dim = action_dim
 
+        # Cache the wavelet coefficient layout needed for decoding.
+        # We keep one slice structure per dimension (they are typically
+        # identical for a fixed T/wavelet/level).
+        self._coeff_slices_per_dim = None  # list of slice dicts
+        self._n_coeff = None  # number of wavelet coeffs per dim after coeffs_to_array
+
         super().__init__(bpe_tokenizer)
 
-    def __call__(self, action_chunk: np.array) -> np.array:
+    def _ensure_coeff_layout(self, T: int, D: int):
+        """Cache coeff slices and coeff vector length for given (T, wavelet, level)."""
+        if (
+            self._coeff_slices_per_dim is not None
+            and self._n_coeff is not None
+            and self.called_time_horizon == T
+            and self.called_action_dim == D
+        ):
+            return
+
+        dummy = np.zeros(T, dtype=np.float32)
+
+        slices_per_dim = []
+        n_coeff = None
+        for _ in range(D):
+            coeffs = pywt.wavedec(dummy, self.wavelet, level=self.level)
+            arr, slc = pywt.coeffs_to_array(coeffs)
+            slices_per_dim.append(slc)
+            if n_coeff is None:
+                n_coeff = int(arr.shape[0])
+
+        self._coeff_slices_per_dim = slices_per_dim
+        self._n_coeff = n_coeff
+
+    def __call__(self, action_chunk: np.ndarray) -> List[List[int]]:
+        """
+        Encode actions to BPE tokens.
+
+        action_chunk: (T, D) or (B, T, D)
+        returns: List[List[int]] (batch of token id lists)
+        """
         assert action_chunk.ndim <= 3, "Only 3 dimensions supported: [batch, timesteps, action_dim]"
         if action_chunk.ndim == 2:
             action_chunk = action_chunk[None, ...]
 
-        # Cache the time horizon and action dimension for decoding
-        self.called_time_horizon = action_chunk.shape[-2]
-        self.called_action_dim = action_chunk.shape[-1]
+        B, T, D = action_chunk.shape
 
-        dct_coeff = dct(action_chunk, axis=1, norm="ortho")
-        dct_coeff = np.around(dct_coeff * self.scale)
-        tokens = []
-        for elem in dct_coeff:
-            token_str = "".join(map(chr, np.maximum(elem.flatten() - self.min_token, 0).astype(int)))
-            tokens.append(self.bpe_tokenizer(token_str)["input_ids"])
-        return tokens
+        # Cache for decoding
+        self.called_time_horizon, self.called_action_dim = T, D
+        self._ensure_coeff_layout(T, D)
+
+        batch_tokens: List[List[int]] = []
+        for i in range(B):
+            # Wavelet per dim -> flattened coeffs of length (n_coeff * D)
+            coeffs_by_dim = []
+            for d in range(D):
+                coeffs = pywt.wavedec(action_chunk[i, :, d], self.wavelet, level=self.level)
+                flat, _ = pywt.coeffs_to_array(coeffs)  # shape (n_coeff,)
+                coeffs_by_dim.append(flat)
+
+            coeff_mat = np.stack(coeffs_by_dim, axis=1)  # (n_coeff, D)
+            flat_all = coeff_mat.reshape(-1)  # (n_coeff * D,)
+
+            quant = np.around(flat_all * self.scale).astype(int)
+            shifted = (quant - self.min_token).astype(int)
+
+            # Safety checks: shifted values must be valid Unicode code points
+            if shifted.min() < 0:
+                # min_token was not low enough for these coefficients
+                raise ValueError(
+                    f"Shifted tokens became negative (min={shifted.min()}). "
+                    f"min_token={self.min_token} is too high; re-fit or lower min_token."
+                )
+            if shifted.max() > 0x10FFFF:
+                raise ValueError(
+                    f"Shifted tokens exceed the Unicode maximum (max={shifted.max()}). "
+                    "Reduce scale or re-fit the min/max range."
+                )
+
+            token_str = "".join(chr(int(x)) for x in shifted)
+            batch_tokens.append(self.bpe_tokenizer(token_str)["input_ids"])
+
+        return batch_tokens
 
     def decode(
         self,
-        tokens: list[list[int]],
+        tokens: List[List[int]],
         *,
-        time_horizon: int | None = None,
-        action_dim: int | None = None,
-    ) -> np.array:
-        self.time_horizon = time_horizon or self.time_horizon or self.called_time_horizon
-        self.action_dim = action_dim or self.action_dim or self.called_action_dim
+        time_horizon: Optional[int] = None,
+        action_dim: Optional[int] = None,
+    ) -> np.ndarray:
+        """
+        Decode BPE tokens back to actions.
+
+        tokens: List[List[int]] (batch)
+        returns: (B, T, D)
+        """
+        T = time_horizon or self.time_horizon or self.called_time_horizon
+        D = action_dim or self.action_dim or self.called_action_dim
 
-        # Cache the time horizon and action dimension for the next call
-        self.called_time_horizon = self.time_horizon
-        self.called_action_dim = self.action_dim
+        assert T is not None and D is not None, (
+            "Tokenizer not initialized: call encode() once or pass time_horizon and action_dim."
+        )
 
-        assert (
-            self.time_horizon is not None and self.action_dim is not None
-        ), "Tokenizer not initialized, call encode() once or pass in time_horizon and action_dim."
+        # Cache for the next call and ensure the coefficient layout exists
+        self.time_horizon, self.action_dim = T, D
+        self.called_time_horizon, self.called_action_dim = T, D
+        self._ensure_coeff_layout(T, D)
 
         decoded_actions = []
-        for token in tokens:
-            try:
-                decoded_tokens = self.bpe_tokenizer.decode(token)
-                decoded_dct_coeff = np.array(list(map(ord, decoded_tokens))) + self.min_token
-                decoded_dct_coeff = decoded_dct_coeff.reshape(-1, self.action_dim)
-                assert (
-                    decoded_dct_coeff.shape
-                    == (
-                        self.time_horizon,
-                        self.action_dim,
-                    )
-                ), f"Decoded DCT coefficients have shape {decoded_dct_coeff.shape}, expected ({self.time_horizon}, {self.action_dim})"
-            except Exception as e:
-                print(f"Error decoding tokens: {e}")
-                print(f"Tokens: {token}")
-                decoded_dct_coeff = np.zeros((self.time_horizon, self.action_dim))
-            decoded_actions.append(idct(decoded_dct_coeff / self.scale, axis=0, norm="ortho"))
-        return np.stack(decoded_actions)
+        for tok_list in tokens:
+            # Decode token ids back to the string of coefficient characters
+            s = self.bpe_tokenizer.decode(tok_list, clean_up_tokenization_spaces=False)
+            ints = np.array([ord(c) for c in s], dtype=np.int64)
+
+            # Unshift and dequantize
+            quant = ints + self.min_token
+            flat_coeffs = quant.astype(np.float32) / self.scale  # (n_coeff * D,)
+
+            # Reshape to (n_coeff, D)
+            expected = self._n_coeff * D
+            if flat_coeffs.shape[0] != expected:
+                raise ValueError(
+                    f"Decoded coeff length mismatch: got {flat_coeffs.shape[0]}, expected {expected} "
+                    f"(T={T}, D={D}, n_coeff={self._n_coeff}). "
+                    "This usually means you decoded with different T/D than encoding."
+                )
+
+            coeff_mat = flat_coeffs.reshape(self._n_coeff, D)
+
+            # Inverse wavelet per dimension
+            recon = np.zeros((T, D), dtype=np.float32)
+            for d in range(D):
+                arr = coeff_mat[:, d]
+                coeff_list = pywt.array_to_coeffs(
+                    arr,
+                    self._coeff_slices_per_dim[d],
+                    output_format="wavedec",
+                )
+                sig = pywt.waverec(coeff_list, self.wavelet)
+                recon[:, d] = sig[:T]  # waverec can return a slightly longer signal due to padding
+
+            decoded_actions.append(recon)
+
+        return np.stack(decoded_actions, axis=0)
 
     @classmethod
     def fit(
         cls,
-        action_data: list[np.array],
-        scale: float = 10,
+        action_data: List[np.ndarray],  # each (T, D)
+        wavelet: str = "db1",
+        level: int = 2,
+        scale: float = 10.0,
         vocab_size: int = 1024,
         *,
-        time_horizon: int | None = None,
-        action_dim: int | None = None,
-    ) -> "UniversalActionProcessor":
-        # Run DCT over all inputs
-        dct_tokens = [dct(a, axis=0, norm="ortho").flatten() for a in action_data]
-
-        # Quantize and find min token
-        max_token = int(np.around(np.concatenate(dct_tokens) * scale).max())
-        min_token = int(np.around(np.concatenate(dct_tokens) * scale).min())
-        min_vocab_size = max_token - min_token
-
-        assert (
-            min_vocab_size <= vocab_size
-        ), f"Vocab size {vocab_size} is too small for the range of tokens {min_vocab_size}"
-        if min_vocab_size + 100 > vocab_size:
+        time_horizon: Optional[int] = None,
+        action_dim: Optional[int] = None,
+    ) -> "WaveletActionProcessor":
+        """Fit a BPE tokenizer on wavelet-quantized coefficient streams."""
+        # Compute quantized coefficient streams to estimate the min/max token range
+        all_streams = []
+        for a in action_data:
+            assert a.ndim == 2, "Each item must be (T, D)"
+            T, D = a.shape
+            # Wavelet per dim -> flatten to (n_coeff * D,)
+            coeffs_by_dim = []
+            for d in range(D):
+                coeffs = pywt.wavedec(a[:, d], wavelet, level=level)
+                flat, _ = pywt.coeffs_to_array(coeffs)
+                coeffs_by_dim.append(flat)
+            coeff_mat = np.stack(coeffs_by_dim, axis=1)
+            stream = np.around(coeff_mat.reshape(-1) * scale).astype(int)
+            all_streams.append(stream)
+
+        all_vals = np.concatenate(all_streams)
+        min_token = int(all_vals.min())
+        max_token = int(all_vals.max())
+
+        token_range = max_token - min_token + 1
+        if token_range > vocab_size:
+            raise ValueError(
+                f"Vocab size {vocab_size} too small for token range {token_range}. "
+                "Increase vocab_size or reduce scale."
+            )
+        if token_range + 100 > vocab_size:
             logging.warning(
-                f"Initial alphabet size {min_vocab_size} is almost as large as the vocab"
-                f"size {vocab_size}, consider increasing vocab size"
+                f"Initial alphabet size {token_range} is close to vocab_size {vocab_size}. "
+                "Consider increasing vocab_size for better BPE merges."
             )
 
-        # Make token iterator for BPE training
         def _token_iter():
-            for tokens in dct_tokens:
-                rounded_tokens = np.around(tokens * scale) - min_token
-                rounded_tokens = rounded_tokens.astype(int)
-                string = "".join(map(chr, rounded_tokens))
-                yield string
+            for stream in all_streams:
+                shifted = (stream - min_token).astype(int)
+                # No clamping: every value must already be >= 0
+                yield "".join(chr(int(x)) for x in shifted)
 
-        # Train BPE tokenizer
+        # Train BPE
         bpe = ByteLevelBPETokenizer()
-
-        # Set up the entire range of possible tokens as the initial alphabet
-        alphabet = [chr(i) for i in range(max_token - min_token + 1)]
+        alphabet = [chr(i) for i in range(token_range)]
         trainer = BpeTrainer(
             vocab_size=vocab_size,
             min_frequency=2,
@@ -143,15 +244,19 @@ class UniversalActionProcessor(ProcessorMixin):
             initial_alphabet=alphabet,
             max_token_length=10000,
         )
-
-        # Train the inner tokenizer (don't use ByteLevelBPETokenizer.train_from_iterator()
-        # because it doesn't support custom alphabets)
        bpe._tokenizer.train_from_iterator(_token_iter(), trainer=trainer)
 
+        # Infer T/D defaults if not provided
+        if time_horizon is None:
+            time_horizon = int(action_data[0].shape[0])
+        if action_dim is None:
+            action_dim = int(action_data[0].shape[1])
+
         return cls(
             PreTrainedTokenizerFast(tokenizer_object=bpe, clean_up_tokenization_spaces=False),
+            wavelet=wavelet,
+            level=level,
             scale=scale,
-            vocab_size=vocab_size,
             min_token=min_token,
             time_horizon=time_horizon,
            action_dim=action_dim,
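
A minimal round-trip sketch of the new processor (assumed usage pieced together from the code above; the toy dataset is illustrative, matching the (T=10, D=6) layout in processor_config.json):

import numpy as np

# Toy dataset: 100 action chunks of shape (T=10, D=6)
chunks = [np.random.randn(10, 6).astype(np.float32) for _ in range(100)]

proc = WaveletActionProcessor.fit(chunks, wavelet="db1", level=2, scale=10.0, vocab_size=1024)

tokens = proc(chunks[0])     # List[List[int]]; a (T, D) input is treated as a batch of 1
recon = proc.decode(tokens)  # np.ndarray of shape (1, 10, 6)

# Reconstruction error stems only from rounding coefficients to multiples of 1/scale
print(np.abs(recon[0] - chunks[0]).max())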
processor_config.json CHANGED
@@ -1,11 +1,12 @@
 {
   "action_dim": 6,
   "auto_map": {
-    "AutoProcessor": "processing_action_tokenizer.UniversalActionProcessor"
+    "AutoProcessor": "processing_action_tokenizer.WaveletActionProcessor"
   },
-  "min_token": -32,
-  "processor_class": "UniversalActionProcessor",
+  "level": 2,
+  "min_token": -20,
+  "processor_class": "WaveletActionProcessor",
   "scale": 10.0,
   "time_horizon": 10,
-  "vocab_size": 1024
+  "wavelet": "db1"
 }
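
As a quick check on these values: for time_horizon=10 with wavelet="db1" and level=2, the per-dimension coefficient vector the processor tokenizes is slightly longer than the chunk itself (a sketch, assuming pywt is installed; these lengths are what _ensure_coeff_layout caches):

import numpy as np
import pywt

coeffs = pywt.wavedec(np.zeros(10, dtype=np.float32), "db1", level=2)
arr, slices = pywt.coeffs_to_array(coeffs)

# [cA2, cD2, cD1] lengths for T=10 should be [3, 3, 5] -> n_coeff = 11,
# which is why decode() trims the reconstruction back to T samples
print([len(c) for c in coeffs], arr.shape)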
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,11 +1,11 @@
 {
   "added_tokens_decoder": {},
   "auto_map": {
-    "AutoProcessor": "processing_action_tokenizer.UniversalActionProcessor"
+    "AutoProcessor": "processing_action_tokenizer.WaveletActionProcessor"
   },
   "clean_up_tokenization_spaces": false,
   "extra_special_tokens": {},
   "model_max_length": 1000000000000000019884624838656,
-  "processor_class": "UniversalActionProcessor",
+  "processor_class": "WaveletActionProcessor",
   "tokenizer_class": "PreTrainedTokenizerFast"
 }
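
With the updated auto_map, the processor loads through AutoProcessor (a sketch; "<user>/<repo>" is a placeholder for this repository's Hub id, and loading custom code requires trust_remote_code=True):

import numpy as np
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("<user>/<repo>", trust_remote_code=True)

action_chunk = np.zeros((10, 6), dtype=np.float32)  # (time_horizon, action_dim) per processor_config.json
tokens = processor(action_chunk)
actions = processor.decode(tokens)  # (1, 10, 6)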