anonymous08041999 committed on Jun 13, 2025

Commit

286e7c6

verified ·

1 Parent(s): 0a47a3b

Upload folder using huggingface_hub

Browse files

Files changed (23) hide show

.gitattributes +1 -0
action_tokenizer.py +431 -0
config.json +320 -0
configuration_spatialvla.py +119 -0
generation_config.json +8 -0
global_step1500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
global_step1500/mp_rank_00_model_states.pt +3 -0
latest +1 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +0 -0
modeling_gemma2.py +1283 -0
modeling_spatialvla.py +599 -0
preprocessor_config.json +28 -0
processing_spatialvla.py +259 -0
processor_config.json +327 -0
rng_state.pth +3 -0
special_tokens_map.json +39 -0
tokenizer.json +3 -0
tokenizer_config.json +0 -0
trainer_state.json +0 -0
training_args.bin +3 -0
zero_to_fp32.py +674 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

action_tokenizer.py ADDED Viewed

	@@ -0,0 +1,431 @@

+"""
+action_tokenizer.py
+Extension class; wraps base LLM/VLM tokenizer with logic to discretize and tokenize continuous robot actions.
+"""
+from typing import List, Union, Dict, Optional
+import numpy as np
+from transformers import PreTrainedTokenizerBase
+from scipy.stats import norm
+import torch
+# Template for action token strings: the integer index is zero-padded to five digits, e.g. index 42 -> '<ACTION00042>'.
+ACTION_TOKEN = '<ACTION{:05d}>'
+class ActionTokenizer:
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_bins: int = 256,
+        min_action: int = -1,
+        max_action: int = 1,
+    ):
+        self._vocab_size = num_bins
+        self.tokenizer = tokenizer
+        self.min_action, self.max_action = min_action, max_action
+        self.bin_centers = np.linspace(min_action, max_action, num_bins)
+        # add special action tokens to language tokenizer
+        token_list = [ACTION_TOKEN.format(i) for i in range(self._vocab_size)]
+        self.token_array = np.array(token_list)
+        num_new_tokens = self.tokenizer.add_tokens(token_list, special_tokens=True)
+        print(f"Add {num_new_tokens} TRANSLATION TOKENS, tokenizer vocab size {self.tokenizer.vocab_size} / {len(tokenizer)}")
+        self.action_token_begin_idx = self.token_start_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[0])
+        self.token_end_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[-1])
+    def __call__(self, action: np.ndarray) -> List[str]:
+        """Discretize continuous actions to tokens.
+        action: np.ndarray, (n, 7), continuous actions in Cartesian or Spherical coordinates.
+        return: np.ndarray, (n, 7), tokens.
+        """
+        action = np.clip(action, a_min=float(self.min_action), a_max=float(self.max_action))
+        ids = np.digitize(action, self.bin_centers, right=True)  # [0, 255]
+        return self.token_array[ids]
+    def decode_token_ids_to_actions(self, action_token_id: np.ndarray) -> np.ndarray:
+        """decode token ids to continuous actions.
+        action_token_id: np.ndarray, (n, 7), token ids.
+        return: np.ndarray, (n, 7), continuous actions
+        """
+        ids = action_token_id - self.action_token_begin_idx
+        ids = np.clip(ids, a_min=0, a_max=self._vocab_size - 1)
+        return self.bin_centers[ids]
+    @property
+    def vocab_size(self) -> int:
+        return self._vocab_size
+class TranslationTokenizer:
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_bins: Dict,
+        bin_policy: Optional[Dict] = None,
+        use_spherical: bool = True,
+    ):
+        self.tokenizer = tokenizer
+        self.num_theta_bins = num_bins["theta_bins"]
+        self.num_phi_bins = num_bins["phi_bins"]
+        self.num_r_bins = num_bins["r_bins"]
+        self.use_spherical = use_spherical
+        # for indexing
+        self.NP = self.num_phi_bins * self.num_r_bins
+        # add special action tokens to language tokenizer
+        self._vocab_size = self.num_theta_bins * self.num_phi_bins * self.num_r_bins
+        token_list = [ACTION_TOKEN.format(i) for i in range(self._vocab_size)]
+        self.token_array = np.array(token_list)
+        num_new_tokens = self.tokenizer.add_tokens(token_list, special_tokens=True)
+        print(f"Add {num_new_tokens} TRANSLATION TOKENS, tokenizer vocab size {self.tokenizer.vocab_size} / {len(tokenizer)}")
+        self.token_start_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[0])
+        self.token_end_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[-1])
+        self.set_bins(bin_policy)
+    def set_bins(self, bin_policy):
+        self.theta_bins = np.array(bin_policy["theta_bins"])
+        self.phi_bins = np.array(bin_policy["phi_bins"])
+        self.r_bins = np.array(bin_policy["r_bins"])
+    def cartesian_to_spherical(self, x, y, z):
+        theta = np.arctan2(np.sqrt(x**2 + y**2), z)  # polar angle
+        phi = np.arctan2(y, x)  # azimuthal angle
+        r = np.sqrt(x**2 + y**2 + z**2)
+        return theta, phi, r
+    def spherical_to_cartesian(self, theta, phi, r):
+        x = r * np.sin(theta) * np.cos(phi)
+        y = r * np.sin(theta) * np.sin(phi)
+        z = r * np.cos(theta)
+        return x, y, z
+    def __call__(self, action: np.ndarray) -> List[str]:
+        """Discretize continuous actions to tokens.
+        action: np.ndarray, (n, 3), continuous actions in Cartesian or Spherical coordinates.
+        return: np.ndarray, (n,), tokens.
+        """
+        if self.use_spherical:
+            theta, phi, r = self.cartesian_to_spherical(action[:, 0], action[:, 1], action[:, 2])
+        else:
+            theta, phi, r = action[:, 0], action[:, 1], action[:, 2]
+        disc_theta = np.digitize(theta, self.theta_bins[1:-1]) # b
+        disc_phi = np.digitize(phi, self.phi_bins[1:-1])
+        disc_r = np.digitize(r, self.r_bins[1:-1])
+        ids = disc_theta * self.NP + disc_phi * self.num_r_bins + disc_r
+        return self.token_array[ids]
+    def decode_token_ids_to_actions(self, action_token_id: np.ndarray) -> np.ndarray:
+        """decode token ids to continuous actions.
+        action_token_id: np.ndarray, (n,), token ids.
+        return: np.ndarray, (n, 3), continuous actions
+        """
+        action_token_id = np.clip(action_token_id, self.token_start_idx, self.token_end_idx)
+        ids = action_token_id - self.token_start_idx
+        disc_theta, disc_phi, disc_r = ids // self.NP, (ids % self.NP) // self.num_r_bins, ids % self.num_r_bins
+        theta = 0.5 * (self.theta_bins[disc_theta] + self.theta_bins[disc_theta + 1])
+        phi = 0.5 * (self.phi_bins[disc_phi] + self.phi_bins[disc_phi + 1])
+        r = 0.5 * (self.r_bins[disc_r] + self.r_bins[disc_r + 1])
+        # clip action to [-1, 1], due to the spherical coordinate action space is the circumscribed sphere of the Cartesian action space.
+        x, y, z = self.spherical_to_cartesian(theta, phi, r) if self.use_spherical else (theta, phi, r)
+        x, y, z = np.clip([x, y, z], -1, 1)
+        return np.stack((x, y, z), axis=1)
+    @property
+    def vocab_size(self) -> int:
+        return self._vocab_size
+class RotationTokenizer:
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_bins: Dict,
+        bin_policy: Optional[Dict] = None,
+        array_begin_idx=None,
+    ):
+        self.tokenizer = tokenizer
+        self.num_roll_bins = num_bins["roll_bins"] # M
+        self.num_pitch_bins = num_bins["pitch_bins"] # N
+        self.num_yaw_bins = num_bins["yaw_bins"] # P
+        self.array_begin_idx = array_begin_idx
+        # for indexing
+        self.NP = self.num_pitch_bins * self.num_yaw_bins
+        # add special action tokens to language tokenizer
+        self._vocab_size = self.num_roll_bins * self.num_pitch_bins * self.num_yaw_bins
+        token_list = [ACTION_TOKEN.format(i + self.array_begin_idx) for i in range(self._vocab_size)]
+        self.token_array = np.array(token_list)
+        num_new_tokens = self.tokenizer.add_tokens(token_list, special_tokens=True)
+        print(f"Add {num_new_tokens} ROTATION TOKENS to tokenizer, tokenizer vocab size {self.tokenizer.vocab_size} / {len(tokenizer)}")
+        self.token_start_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[0])
+        self.token_end_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[-1])
+        self.set_bins(bin_policy)
+    def set_bins(self, bin_policy):
+        self.roll_bins = np.array(bin_policy["roll_bins"])
+        self.pitch_bins = np.array(bin_policy["pitch_bins"])
+        self.yaw_bins = np.array(bin_policy["yaw_bins"])
+    def __call__(self, action: np.ndarray) -> List[str]:
+        """Discretize continuous actions to tokens.
+        action: np.ndarray, (n, 3), continuous actions in Cartesian or Spherical coordinates.
+        return: np.ndarray, (n,), tokens.
+        """
+        roll, pitch, yaw = action[:, 0], action[:, 1], action[:, 2]
+        disc_roll = np.clip(np.digitize(roll, self.roll_bins) - 1, 0, self.num_roll_bins - 1)
+        disc_pitch = np.clip(np.digitize(pitch, self.pitch_bins) - 1, 0, self.num_pitch_bins - 1)
+        disc_yaw = np.clip(np.digitize(yaw, self.yaw_bins) - 1, 0, self.num_yaw_bins - 1)
+        ids = disc_roll * self.NP + disc_pitch * self.num_yaw_bins + disc_yaw
+        return self.token_array[ids]
+    def decode_token_ids_to_actions(self, action_token_id: Union[np.int64, np.ndarray]) -> np.ndarray:
+        """decode token ids to continuous actions.
+        action_token_id: np.ndarray, (n,), token ids.
+        return: np.ndarray, (n, 3), continuous actions
+        """
+        action_token_id = np.clip(action_token_id, a_min=self.token_start_idx, a_max=self.token_end_idx)
+        ids = action_token_id - self.token_start_idx
+        disc_roll, disc_pitch, disc_yaw = ids // self.NP, (ids % self.NP) // self.num_yaw_bins, ids % self.num_yaw_bins
+        roll = 0.5 * (self.roll_bins[disc_roll] + self.roll_bins[disc_roll + 1])
+        pitch = 0.5 * (self.pitch_bins[disc_pitch] + self.pitch_bins[disc_pitch + 1])
+        yaw = 0.5 * (self.yaw_bins[disc_yaw] + self.yaw_bins[disc_yaw + 1])
+        return np.stack((roll, pitch, yaw), axis=1)
+    @property
+    def vocab_size(self) -> int:
+        return self._vocab_size
+class GripperTokenzier:
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_bins: int = 2,
+        array_begin_idx = None,
+    ) -> None:
+        self.tokenizer = tokenizer
+        self.num_bins = num_bins
+        self.array_begin_idx = array_begin_idx
+        token_list = [ACTION_TOKEN.format(i + self.array_begin_idx) for i in range(self.num_bins)]
+        self.token_array = np.array(token_list)
+        num_new_tokens = self.tokenizer.add_tokens(token_list, special_tokens=True)
+        print(f"Add {num_new_tokens} GRIPPER TOKENS to tokenizer, tokenizer vocab size {self.tokenizer.vocab_size} / {len(tokenizer)}")
+        self.token_start_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[0])
+        self.token_end_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[-1])
+    def __call__(self, action: np.ndarray) -> List[str]:
+        """Discretize continuous actions to tokens.
+        action: np.ndarray, (n,), continuous actions in Cartesian or Spherical coordinates.
+        return: np.ndarray, (n,), tokens.
+        """
+        ids = np.where(action >= 0.5, 1, 0)
+        return self.token_array[ids]
+    def decode_token_ids_to_actions(self, action_token_id: np.ndarray) -> np.ndarray:
+        """decode token ids to continuous actions.
+        action_token_id: np.ndarray, (n,), token ids.
+        return: np.ndarray, (n, 1), continuous actions
+        """
+        action_token_id = np.clip(action_token_id, self.token_start_idx, self.token_end_idx)
+        ids = action_token_id - self.token_start_idx
+        actions = np.where(ids == 0, 0., 1.)
+        return actions[:, None]
+    @property
+    def vocab_size(self) -> int:
+        return self.num_bins
+class SpatialActionTokenizer:
+    range_bins = {
+        "translation": {
+            "theta_bins": (0.0, np.pi),
+            "phi_bins": (-np.pi, np.pi),
+            "r_bins": (0.0, np.sqrt(3)),
+        },
+        "rotation": {
+            "roll_bins": (-1.0, 1.0),
+            "pitch_bins": (-1.0, 1.0),
+            "yaw_bins": (-1.0, 1.0),
+        },
+    }
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_bins: Dict,
+        gs_params: Dict = None,
+        bin_policy: Dict = None,
+        use_spherical: bool = True,
+        min_sigma: float = 0.0,
+        min_action: float = -1.0,
+        max_action: float = 1.0,
+    ):
+        """set bin_policy if exist, otherwise, caculate bin_policy from gs_params or use uniform bin grids.
+        gs_params: Optional[Dict],
+        bin_policy: Optional[Dict],
+        """
+        self.tokenizer = tokenizer
+        self.min_action, self.max_action = min_action, max_action
+        self.num_bins = num_bins
+        self.min_sigma = min_sigma
+        # set bin policy
+        self.bin_policy = bin_policy if bin_policy else self.get_bin_policy(gs_params, self.min_sigma)
+        self.translation_tokenizer = TranslationTokenizer(
+            self.tokenizer,
+            self.num_bins["translation"],
+            self.bin_policy["translation"],
+            use_spherical=use_spherical
+        )
+        self.rotation_tokenizer = RotationTokenizer(
+            self.tokenizer,
+            self.num_bins["rotation"],
+            self.bin_policy["rotation"],
+            array_begin_idx=self.translation_tokenizer.vocab_size,
+        )
+        self.gripper_tokenizer = GripperTokenzier(
+            self.tokenizer,
+            self.num_bins["gripper"],
+            array_begin_idx=self.translation_tokenizer.vocab_size + self.rotation_tokenizer.vocab_size
+        )
+        self._vocab_size = self.translation_tokenizer.vocab_size + self.rotation_tokenizer.vocab_size + self.gripper_tokenizer.vocab_size
+    def __call__(self, action: np.ndarray) -> List[str]:
+        """Discretize continuous actions to tokens.
+        action: np.ndarray, (n, 7), continuous actions in Cartesian coordinates.
+        return: np.ndarray, (n, 3), tokens.
+        """
+        if len(action.shape) == 1:
+            assert action.shape[0] == 7, f"action dim mismatch, got action shape: {action.shape}"
+            action = action.reshape(1, 7)
+        assert action.shape[1] == 7, f"action dim mismatch, got action shape: {action.shape}"
+        action = np.clip(action, a_min=self.min_action, a_max=self.max_action)
+        trans_tokens = self.translation_tokenizer(action[:, :3]) # (n,)
+        rot_tokens = self.rotation_tokenizer(action[:, 3:6]) # (n,)
+        grip_tokens = self.gripper_tokenizer(action[:, 6]) # (n,)
+        return np.stack((trans_tokens, rot_tokens, grip_tokens), axis=1) # (n, 3)
+    def decode_token_ids_to_actions(self, action_token_ids: np.ndarray) -> np.ndarray:
+        """decode token ids to continuous actions.
+        action_token_ids: np.ndarray, (n, 3), token ids.
+        """
+        if len(action_token_ids.shape) == 1:
+            assert action_token_ids.shape[0] == 3, f"action token id numbers mismatich, need 3 got {action_token_ids.shape[0]}"
+            action_token_ids = action_token_ids.reshape(1, 3)
+        assert action_token_ids.shape[1] == 3, f"token id numbers mismatich, need 3 got {action_token_ids.shape[1]}"
+        trans_action = self.translation_tokenizer.decode_token_ids_to_actions(action_token_ids[:, 0]) # (n, 3)
+        rot_action = self.rotation_tokenizer.decode_token_ids_to_actions(action_token_ids[:, 1]) # (n, 3)
+        grip_action = self.gripper_tokenizer.decode_token_ids_to_actions(action_token_ids[:, 2]) # (n, 1)
+        return np.concatenate((trans_action, rot_action, grip_action), axis=1) # (n, 7)
+    @property
+    def vocab_size(self) -> int:
+        return self._vocab_size
+    @property
+    def action_token_begin_idx(self) -> int:
+        return self.translation_tokenizer.token_start_idx
+    def get_bin_policy(self, gs_params=None, min_sigma=0.0):
+        bin_policy = {
+            "translation": {"theta_bins": None, "phi_bins": None, "r_bins": None},
+            "rotation": {"roll_bins": None, "pitch_bins": None, "yaw_bins": None}
+        }
+        if gs_params is None:
+            for bin_type in self.range_bins.keys():
+                for bin_key in self.range_bins[bin_type].keys():
+                    bin_policy[bin_type][bin_key] = np.linspace(*self.range_bins[bin_type][bin_key], self.num_bins[bin_type][bin_key] + 1)
+            print(f"use unifrom bin grids ... \n{bin_policy}")
+        else:
+            for bin_type in self.range_bins.keys():
+                for bin_key in self.range_bins[bin_type].keys():
+                    mu = gs_params[bin_key.split("_")[0].lower()]["mu"]
+                    sigma = max(gs_params[bin_key.split("_")[0].lower()]["sigma"], min_sigma)
+                    bin_bound_prob = np.linspace(
+                        norm.cdf(self.range_bins[bin_type][bin_key][0], loc=mu, scale=sigma),
+                        norm.cdf(self.range_bins[bin_type][bin_key][1], loc=mu, scale=sigma),
+                        self.num_bins[bin_type][bin_key] + 1,
+                    )
+                    bin_boundary = norm.ppf(bin_bound_prob, loc=mu, scale=sigma)
+                    bin_policy[bin_type][bin_key] = np.clip(
+                            bin_boundary,
+                            self.range_bins[bin_type][bin_key][0],
+                            self.range_bins[bin_type][bin_key][1],
+                        ).tolist() # for serialize
+            print(f"caculate bin grids from gaussians \n{bin_policy}")
+        return bin_policy
+    def get_norm_meshgrid(self, bin_policy):
+        grids = []
+        policy = {k1: {k2: np.array(v2) for k2, v2 in v1.items()} for k1, v1 in bin_policy.items()}
+        # NOTE: use unify k,v order of range_bins (tpr, rpy)
+        for bin_type in self.range_bins.keys():
+            bounds = []
+            for bin_key in self.range_bins[bin_type].keys():
+                minb, maxb = self.range_bins[bin_type][bin_key][0], self.range_bins[bin_type][bin_key][1]
+                bin_boundary = policy[bin_type][bin_key]
+                bin_center = (bin_boundary[:-1] + bin_boundary[1:]) / 2
+                bin_center = np.concatenate([np.array([minb]),bin_center,np.array([maxb])]) # padding
+                bin_center = (bin_center - minb) /  (maxb - minb) # nomalize (m, n, k)
+                bounds.append(bin_center)
+            # generate grids
+            grid_x, grid_y, grid_z = np.meshgrid(*bounds)
+            grids += [np.stack([grid_x, grid_y, grid_z], -1).reshape(-1, 3)]
+        return grids[0], grids[1] # (N, 3)
+    def spatial_embedding_adaption(self, gs_params, embeddings: torch.nn.Embedding, min_sigma=0.0, adpt_feature=False):
+        """
+        gs_params0, gs_params1: Dict
+        embeddings: tensor (S,E)
+        """
+        from scipy.interpolate import griddata
+        new_policy = self.get_bin_policy(gs_params, min_sigma=min_sigma)
+        trans_grids0, rot_grids0 = self.get_norm_meshgrid(self.bin_policy)
+        trans_grids1, rot_grids1 = self.get_norm_meshgrid(new_policy)
+        print("overwrite bin policy and tokenizer bins ...")
+        self.bin_policy = new_policy
+        self.min_sigma = min_sigma
+        self.translation_tokenizer.set_bins(new_policy["translation"])
+        self.rotation_tokenizer.set_bins(new_policy["rotation"])
+        if adpt_feature:
+            emb_data = embeddings.weight.data # (S, e)
+            _, E = emb_data.shape
+            # translation
+            m, n, k = (self.num_bins["translation"][k] for k in ["theta_bins", "phi_bins", "r_bins"])
+            N = m*n*k
+            trans_emb_data = emb_data[:N,].reshape(m, n, k, -1).permute(3, 0, 1, 2) # (e, m, n, k)
+            pad_emb = torch.nn.functional.pad(trans_emb_data, (1, 1, 1, 1, 1, 1), "replicate").permute(1, 2, 3, 0).reshape(-1, E)
+            adpt_trans_emb = griddata(trans_grids0, pad_emb.float(), trans_grids1, method='linear')
+            adpt_trans_emb = adpt_trans_emb.reshape(m+2, n+2, k+2, E)[1:-1, 1:-1, 1:-1,]
+            # rotation
+            m1, n1, k1 = (self.num_bins["rotation"][k] for k in ["roll_bins", "pitch_bins", "yaw_bins"])
+            M = m1*n1*k1
+            rot_emb_data = emb_data[N : N + M,].reshape(m1, n1, k1, -1).permute(3, 0, 1, 2) # (e, m, n, k)
+            pad_emb = torch.nn.functional.pad(rot_emb_data, (1, 1, 1, 1, 1, 1), "replicate").permute(1, 2, 3, 0).reshape(-1, E)
+            adpt_rot_emb = griddata(rot_grids0, pad_emb.float(), rot_grids1, method='linear')
+            adpt_rot_emb = adpt_rot_emb.reshape(m1+2, n1+2, k1+2, E)[1:-1, 1:-1, 1:-1,]
+            # set data
+            device, dtype = embeddings.weight.data.device, embeddings.weight.data.dtype
+            embeddings.weight.data[:N] = torch.Tensor(adpt_trans_emb.reshape(-1, E), device=device).to(dtype)
+            embeddings.weight.data[N:N+M] = torch.Tensor(adpt_rot_emb.reshape(-1, E), device=device).to(dtype)
+            print("DONE! adapt spatial embedding to new gaussian distributation finished.")
+            print(embeddings.weight.data)

config.json ADDED Viewed

	@@ -0,0 +1,320 @@

+{
+  "_name_or_path": "/mnt/data1/datasets/vla/vla-finetuning/spatialvla-4b-224-sft-bridge",
+  "_vocab_size": 265347,
+  "action_token_begin_idx": 257153,
+  "architectures": [
+    "SpatialVLAForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_spatialvla.SpatialVLAConfig",
+    "AutoModel": "modeling_spatialvla.SpatialVLAForConditionalGeneration"
+  },
+  "bos_token_id": 2,
+  "ego3d_patch_reso": 2,
+  "eos_token_id": 1,
+  "hidden_size": 2048,
+  "image_token_index": 257152,
+  "model_type": "spatialvla",
+  "n_freqs": 8,
+  "num_hidden_layers": 26,
+  "pad_token_id": 0,
+  "projection_dim": 2304,
+  "spatial_token_num": 8194,
+  "text_config": {
+    "_attn_implementation_autoset": true,
+    "architectures": [
+      "Gemma2ForCausalLM"
+    ],
+    "eos_token_id": [
+      1,
+      107
+    ],
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 2304,
+    "intermediate_size": 9216,
+    "model_type": "gemma2",
+    "num_hidden_layers": 26,
+    "num_image_tokens": 256,
+    "num_key_value_heads": 4,
+    "tie_word_embeddings": false,
+    "torch_dtype": "bfloat16",
+    "vocab_size": 265347
+  },
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.0",
+  "use_spatial_token": true,
+  "use_vision_zoe": true,
+  "vision_config": {
+    "hidden_size": 1152,
+    "intermediate_size": 4304,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 27,
+    "num_image_tokens": 256,
+    "num_positions": 256,
+    "patch_size": 14,
+    "projection_dim": 2304,
+    "torch_dtype": "bfloat16",
+    "vision_use_head": false
+  },
+  "vision_zoe_config": {
+    "_attn_implementation_autoset": true,
+    "_name_or_path": "Intel/zoedepth-nyu-kitti",
+    "add_cross_attention": false,
+    "add_projection": false,
+    "architectures": [
+      "ZoeDepthForDepthEstimation"
+    ],
+    "attractor_alpha": 1000,
+    "attractor_gamma": 2,
+    "attractor_kind": "mean",
+    "backbone": null,
+    "backbone_config": {
+      "_attn_implementation_autoset": false,
+      "_name_or_path": "",
+      "add_cross_attention": false,
+      "add_fpn": false,
+      "architectures": null,
+      "attention_probs_dropout_prob": 0.0,
+      "auxiliary_channels": 256,
+      "auxiliary_concat_input": false,
+      "auxiliary_loss_weight": 0.4,
+      "auxiliary_num_convs": 1,
+      "bad_words_ids": null,
+      "begin_suppress_tokens": null,
+      "bos_token_id": null,
+      "chunk_size_feed_forward": 0,
+      "cross_attention_hidden_size": null,
+      "decoder_start_token_id": null,
+      "diversity_penalty": 0.0,
+      "do_sample": false,
+      "drop_path_rate": 0.1,
+      "early_stopping": false,
+      "encoder_no_repeat_ngram_size": 0,
+      "eos_token_id": null,
+      "exponential_decay_length_penalty": null,
+      "finetuning_task": null,
+      "forced_bos_token_id": null,
+      "forced_eos_token_id": null,
+      "hidden_act": "gelu",
+      "hidden_dropout_prob": 0.0,
+      "hidden_size": 1024,
+      "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1"
+      },
+      "image_size": 384,
+      "initializer_range": 0.02,
+      "intermediate_size": 4096,
+      "is_decoder": false,
+      "is_encoder_decoder": false,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1
+      },
+      "layer_norm_eps": 1e-12,
+      "layer_scale_init_value": 0.1,
+      "length_penalty": 1.0,
+      "max_length": 20,
+      "min_length": 0,
+      "model_type": "beit",
+      "no_repeat_ngram_size": 0,
+      "num_attention_heads": 16,
+      "num_beam_groups": 1,
+      "num_beams": 1,
+      "num_channels": 3,
+      "num_hidden_layers": 24,
+      "num_return_sequences": 1,
+      "out_features": [
+        "stage6",
+        "stage12",
+        "stage18",
+        "stage24"
+      ],
+      "out_indices": [
+        6,
+        12,
+        18,
+        24
+      ],
+      "output_attentions": false,
+      "output_hidden_states": false,
+      "output_scores": false,
+      "pad_token_id": null,
+      "patch_size": 16,
+      "pool_scales": [
+        1,
+        2,
+        3,
+        6
+      ],
+      "prefix": null,
+      "problem_type": null,
+      "pruned_heads": {},
+      "remove_invalid_values": false,
+      "repetition_penalty": 1.0,
+      "reshape_hidden_states": false,
+      "return_dict": true,
+      "return_dict_in_generate": false,
+      "semantic_loss_ignore_index": 255,
+      "sep_token_id": null,
+      "stage_names": [
+        "stem",
+        "stage1",
+        "stage2",
+        "stage3",
+        "stage4",
+        "stage5",
+        "stage6",
+        "stage7",
+        "stage8",
+        "stage9",
+        "stage10",
+        "stage11",
+        "stage12",
+        "stage13",
+        "stage14",
+        "stage15",
+        "stage16",
+        "stage17",
+        "stage18",
+        "stage19",
+        "stage20",
+        "stage21",
+        "stage22",
+        "stage23",
+        "stage24"
+      ],
+      "suppress_tokens": null,
+      "task_specific_params": null,
+      "temperature": 1.0,
+      "tf_legacy_loss": false,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": true,
+      "tokenizer_class": null,
+      "top_k": 50,
+      "top_p": 1.0,
+      "torch_dtype": null,
+      "torchscript": false,
+      "typical_p": 1.0,
+      "use_absolute_position_embeddings": false,
+      "use_auxiliary_head": true,
+      "use_bfloat16": false,
+      "use_mask_token": false,
+      "use_mean_pooling": true,
+      "use_relative_position_bias": true,
+      "use_shared_relative_position_bias": false,
+      "vocab_size": 8192
+    },
+    "backbone_hidden_size": 1024,
+    "bad_words_ids": null,
+    "batch_norm_eps": 1e-05,
+    "begin_suppress_tokens": null,
+    "bin_centers_type": "softplus",
+    "bin_configurations": [
+      {
+        "max_depth": 10.0,
+        "min_depth": 0.001,
+        "n_bins": 64,
+        "name": "nyu"
+      },
+      {
+        "max_depth": 80.0,
+        "min_depth": 0.001,
+        "n_bins": 64,
+        "name": "kitti"
+      }
+    ],
+    "bin_embedding_dim": 128,
+    "bos_token_id": null,
+    "bottleneck_features": 256,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "fusion_hidden_size": 256,
+    "head_in_index": -1,
+    "hidden_act": "gelu",
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_temp": 50.0,
+    "min_length": 0,
+    "min_temp": 0.0212,
+    "model_type": "zoedepth",
+    "neck_hidden_sizes": [
+      256,
+      512,
+      1024,
+      1024
+    ],
+    "no_repeat_ngram_size": 0,
+    "num_attractors": [
+      16,
+      8,
+      4,
+      1
+    ],
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_patch_transformer_layers": 4,
+    "num_relative_features": 32,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_transformer_hidden_size": 128,
+    "patch_transformer_intermediate_size": 1024,
+    "patch_transformer_num_attention_heads": 4,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "readout_type": "project",
+    "reassemble_factors": [
+      4,
+      2,
+      1,
+      0.5
+    ],
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "bfloat16",
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_batch_norm_in_fusion_residual": false,
+    "use_bfloat16": false,
+    "use_bias_in_fusion_residual": null,
+    "use_pretrained_backbone": false
+  }
+}

configuration_spatialvla.py ADDED Viewed

	@@ -0,0 +1,119 @@

+# coding=utf-8
+# Copyright 2024 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from transformers import CONFIG_MAPPING, AutoConfig
+logger = logging.get_logger(__name__)
+class SpatialVLAConfig(PretrainedConfig):
+    model_type = "spatialvla"
+    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "vision_zoe_config": AutoConfig}
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        ignore_index=-100,
+        image_token_index=256000,
+        vocab_size=257152,
+        projection_dim=2048,
+        hidden_size=2048,
+        vision_zoe_config=None,
+        action_token_begin_idx=None,
+        spatial_token_num=259,
+        use_spatial_token=False,
+        ego3d_patch_reso=4,
+        n_freqs=8,
+        use_vision_zoe=True,
+        **kwargs,
+    ):
+        self._ignore_index = ignore_index
+        self.image_token_index = image_token_index
+        self._vocab_size = vocab_size
+        self.projection_dim = projection_dim
+        self.hidden_size = hidden_size
+        self.vision_config = vision_config
+        self.is_encoder_decoder = False
+        if isinstance(self.vision_config, dict):
+            vision_config["model_type"] = (
+                vision_config["model_type"] if "model_type" in vision_config else "siglip_vision_model"
+            )
+            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            self.vision_config = CONFIG_MAPPING["siglip_vision_model"](
+                intermediate_size=4096,
+                hidden_size=1152,
+                patch_size=14,
+                image_size=224,
+                num_hidden_layers=27,
+                num_attention_heads=16,
+                vocab_size=257152,
+                vision_use_head=False,
+            )
+        self.text_config = text_config
+        if isinstance(self.text_config, dict):
+            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "gemma2"
+            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            self.text_config = CONFIG_MAPPING["gemma2"](
+                hidden_size=2048,
+                num_hidden_layers=18,
+                intermediate_size=16384,
+                num_attention_heads=8,
+                num_key_value_heads=1,
+                is_encoder_decoder=False,
+                vocab_size=vocab_size,
+            )
+        self.text_config.num_image_tokens = (self.vision_config.image_size // self.vision_config.patch_size) ** 2
+        self.vision_config.projection_dim = projection_dim
+        # vision zoe config
+        self.vision_zoe_config = vision_zoe_config
+        if isinstance(self.vision_zoe_config, dict):
+            vision_zoe_config["model_type"] = vision_zoe_config["model_type"] if "model_type" in vision_zoe_config else "zoedepth"
+            self.vision_zoe_config = CONFIG_MAPPING[vision_zoe_config["model_type"]](**vision_zoe_config)
+        else:
+            pass
+        # additional attributes
+        self.action_token_begin_idx = action_token_begin_idx
+        self.spatial_token_num = spatial_token_num
+        self.use_spatial_token = use_spatial_token
+        self.ego3d_patch_reso = ego3d_patch_reso
+        self.n_freqs = n_freqs
+        self.use_vision_zoe = use_vision_zoe
+        super().__init__(**kwargs)
+    @property
+    def ignore_index(self):
+        warnings.warn(
+            "The `ignore_index` attribute is deprecated and will be removed in v4.47.",
+            FutureWarning,
+        )
+        return self._ignore_index
+    @ignore_index.setter
+    def ignore_index(self, value):
+        self._ignore_index = value
+    def to_dict(self):
+        output = super().to_dict()
+        output.pop("_ignore_index", None)
+        return output

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.47.0"
+}

global_step1500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7cc77a7b4e090619ccaf9e644968c00f7cf21aa7518a686a04e4c591e36b9828
+size 13497318524

global_step1500/mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4bbae45bafabe6c2d34ef51f4d02acda1136fbf706da94fc03b9c50069d7c7ed
+size 8056300410

latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step1500

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d2323d8f682402232e99e6007c9b6dd0c23d81086cf2fae65a485b8a8368606
+size 4969426016

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ad3f49e667e57a004fa212faa5f21477166b5c5ce2bfa3b5ac0e20986aa09c0
+size 3086476734

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_gemma2.py ADDED Viewed

	@@ -0,0 +1,1283 @@

+# Custom Gemma2 implementation with flash_attention_2 support,
+# adapted from https://github.com/huggingface/transformers/blob/v4.47.0/src/transformers/models/gemma2/modeling_gemma2.py
+# coding=utf-8
+# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, HybridCache
+from transformers.generation import GenerationMixin
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal,
+    is_torch_greater_or_equal,
+    logging,
+    replace_return_docstrings,
+    is_flash_attn_greater_or_equal_2_10,
+)
+from transformers import Gemma2Config
+# The flash-attention helper is imported only when flash-attn 2 is available; otherwise the name stays undefined.
+if is_flash_attn_2_available():
+    from transformers.modeling_flash_attention_utils import _flash_attention_forward
+# flex_attention is imported only on torch >= 2.5; otherwise the name stays undefined.
+if is_torch_greater_or_equal("2.5"):
+    from torch.nn.attention.flex_attention import flex_attention
+logger = logging.get_logger(__name__)
+# Checkpoint and config names, presumably for docstring generation (not referenced in the visible code).
+_CHECKPOINT_FOR_DOC = "google/gemma2-7b"
+_CONFIG_FOR_DOC = "Gemma2Config"
+class Gemma2RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.zeros(dim))
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+    def forward(self, x):
+        output = self._norm(x.float())
+        # Llama does x.to(float16) * w whilst Gemma2 is (x * w).to(float16)
+        # See https://github.com/huggingface/transformers/pull/29402
+        output = output * (1.0 + self.weight.float())
+        return output.type_as(x)
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
+class Gemma2MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_activation]
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+class Gemma2RotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim))
+        self.register_buffer("inv_freq", tensor=inv_freq, persistent=False)
+    @torch.no_grad()
+    def forward(self, x, position_ids, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        self.inv_freq.to(x.device)
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # Force float32 since bfloat16 loses precision on long contexts
+        # See https://github.com/huggingface/transformers/pull/29285
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def eager_attention_forward(
+    config: Gemma2Config,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    mask: Optional[torch.Tensor],
+    **_kwargs,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    key_states = repeat_kv(key, config.num_key_value_groups)
+    value_states = repeat_kv(value, config.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * config.scaling
+    if config.attn_logit_softcapping is not None:
+        attn_weights = attn_weights / config.attn_logit_softcapping
+        attn_weights = torch.tanh(attn_weights)
+        attn_weights = attn_weights * config.attn_logit_softcapping
+    if mask is not None:  # no matter the length, we just slice it
+        causal_mask = mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+    # upcast attention to fp32
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=config.attention_dropout, training=config.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+def flash_attention_forward(
+    config: Gemma2Config,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    mask: Optional[torch.Tensor],
+    target_dtype: torch.dtype = torch.float16,
+    **_kwargs,
+) -> Tuple[torch.Tensor, None]:
+    # NOTE: None mask cause un defined https://github.com/huggingface/transformers/blob/c8c8dffbe45ebef0a8dba4a51024e5e5e498596b/src/transformers/models/gemma2/modeling_gemma2.py#L211
+    seq_len = query.shape[2]
+    if mask is not None:
+        query = query[:, :, :seq_len]
+        value = value[:, :, :seq_len]
+    # TODO: These transpose are quite inefficient but Flash Attention requires the layout
+    # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor rotary embedding
+    query_states = query.transpose(1, 2)
+    key_states = key.transpose(1, 2)
+    value_states = value.transpose(1, 2)
+    dropout_rate = config.attention_dropout if config.training else 0.0
+    input_dtype = query_states.dtype
+    if input_dtype == torch.float32:
+        query_states = query_states.to(target_dtype)
+        key_states = key_states.to(target_dtype)
+        value_states = value_states.to(target_dtype)
+    attn_output = _flash_attention_forward(
+        query_states,
+        key_states,
+        value_states,
+        mask,
+        seq_len,
+        dropout=dropout_rate,
+        softmax_scale=config.scaling,
+        is_causal=config.is_causal,
+        sliding_window=config.sliding_window,
+        use_top_left_mask=config._flash_attn_uses_top_left_mask,
+        softcap=config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
+    )
+    return attn_output, None
+def flex_attention_forward(
+    config: Gemma2Config,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    mask: Optional[torch.Tensor],
+    output_attentions: bool = False,
+    **_kwargs,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    def tanh_softcap(score, b, h, q_idx, kv_idx):
+        soft_cap = config.attn_logit_softcapping
+        score = soft_cap * torch.tanh(score / soft_cap)
+        if mask is not None:
+            return score + mask[b][0][q_idx][kv_idx]
+        return score
+    attn_output = flex_attention(
+        query,
+        key,
+        value,
+        score_mod=tanh_softcap,
+        enable_gqa=True,
+        scale=config.scaling,
+        return_lse=output_attentions,
+    )
+    if not output_attentions:
+        attn_weights = None
+    else:
+        attn_output, attn_weights = attn_output
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+def sdpa_attention_forward(
+    config: Gemma2Config,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    mask: Optional[torch.Tensor],
+    **_kwargs,
+) -> Tuple[torch.Tensor, None]:
+    key = repeat_kv(key, config.num_key_value_groups)
+    value = repeat_kv(value, config.num_key_value_groups)
+    causal_mask = mask
+    if mask is not None:
+        causal_mask = causal_mask[:, :, :, : key.shape[-2]]
+    # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+    # Reference: https://github.com/pytorch/pytorch/issues/112577.
+    if query.device.type == "cuda" and causal_mask is not None:
+        query = query.contiguous()
+        key = key.contiguous()
+        value = value.contiguous()
+    # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+    # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+    is_causal = True if causal_mask is None and query.shape[1] > 1 else False
+    attn_output = torch.nn.functional.scaled_dot_product_attention(
+        query,
+        key,
+        value,
+        attn_mask=causal_mask,
+        dropout_p=config.attention_dropout if config.training else 0.0,
+        is_causal=is_causal,
+        scale=config.scaling,
+    )
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, None
+# Dispatch table from attention implementation name to the function that computes it.
+GEMMA2_ATTENTION_FUNCTION = {
+    "flash_attention_2": flash_attention_forward,
+    "flex_attention": flex_attention_forward,
+    "eager": eager_attention_forward,
+    "sdpa": sdpa_attention_forward,
+}
+class Gemma2Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper
+    Queries and keys are rotated with rotary position embeddings, and keys/values use
+    `num_key_value_heads` heads. The attention kernel is looked up in `GEMMA2_ATTENTION_FUNCTION`
+    by `config._attn_implementation` on every forward call.
+    """
+    def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = config.head_dim
+        self.num_key_value_heads = config.num_key_value_heads
+        # Number of query heads that share one key/value head.
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+        self.scaling = config.query_pre_attn_scalar**-0.5
+        # Even-indexed layers use the sliding window; odd-indexed layers get None.
+        # NOTE(review): `layer_idx % 2` raises TypeError when layer_idx is left at its default of None.
+        self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None
+        self.attn_logit_softcapping = config.attn_logit_softcapping
+        if self.hidden_size % self.num_heads != 0:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+        self.rotary_emb = Gemma2RotaryEmbedding(
+            self.head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+        # NOTE: gemma2 do not include _flash_attn_uses_top_left_mask for flash attention
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Project, rotate, optionally update the cache, attend, and apply the output projection.
+        Returns:
+            `(attn_output, attn_weights, past_key_value)`; `attn_weights` is `None` unless
+            `output_attentions` is true. `use_cache` is accepted but not read in this method.
+        """
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        # Split the projections into heads and move the head axis ahead of the sequence axis.
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        # value_states is passed only so the rotary module can read its device and dtype.
+        cos, sin = self.rotary_emb(value_states, position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {
+                "sin": sin,
+                "cos": cos,
+                "sliding_window": self.sliding_window,
+                "cache_position": cache_position,
+            }
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # The sdpa and flash paths return no weights, so flex_attention is used when weights are requested.
+        # NOTE(review): `flex_attention` is only imported on torch >= 2.5 - confirm this fallback on older torch.
+        if output_attentions and self.config._attn_implementation in ["sdpa", "flash_attention_2"]:
+            logger.warning_once("Setting `attention_type` to `flex_attention` because `output_attentions=True`")
+            attention_type = "flex_attention"
+        else:
+            attention_type = self.config._attn_implementation
+        # The module itself is passed as the `config` argument of the attention function.
+        attn_output, attn_weights = GEMMA2_ATTENTION_FUNCTION[attention_type](
+            self, query_states, key_states, value_states, attention_mask, output_attentions=output_attentions
+        )
+        # Merge the heads back into a single feature axis before the output projection.
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class Gemma2FlashAttention2(Gemma2Attention):
+    def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
+        super().__init__(config, layer_idx)
+        self.config._attn_implementation = "flash_attention_2"
+        logger.warning_once(
+            "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
+            "attribute of the `GemmaAttention` class! It will be removed in v4.48"
+        )
+class Gemma2SdpaAttention(Gemma2Attention):
+    def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
+        super().__init__(config, layer_idx)
+        self.config._attn_implementation = "sdpa"
+        logger.warning_once(
+            "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
+            "attribute of the `GemmaAttention` class! It will be removed in v4.48"
+        )
+class Gemma2DecoderLayer(nn.Module):
+    """One decoder layer: self-attention and an MLP, each wrapped in a norm before and a norm after.
+    Even-indexed layers (`is_sliding`) additionally restrict the attention mask to a sliding window.
+    """
+    def __init__(self, config: Gemma2Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.config = config
+        # True for even layer indices.
+        self.is_sliding = not bool(layer_idx % 2)
+        self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
+        self.mlp = Gemma2MLP(config)
+        self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.sliding_window = config.sliding_window
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """Apply the layer and return a tuple starting with the new hidden states.
+        The attention weights are appended when `output_attentions` is true, then the cache object
+        returned by the attention module when `use_cache` is true.
+        """
+        if self.is_sliding and attention_mask is not None:  # efficient SDPA and no padding
+            # Flash-attn is a 2D tensor
+            if self.config._attn_implementation == "flash_attention_2":
+                if past_key_value is not None:  # when decoding
+                    # Keep only the last `sliding_window` columns of the 2D mask.
+                    attention_mask = attention_mask[:, -self.sliding_window :]
+            else:
+                min_dtype = torch.finfo(hidden_states.dtype).min
+                # True on and below the diagonal shifted down by `sliding_window`, i.e. for entries whose
+                # row index exceeds the column index by at least `sliding_window`.
+                sliding_window_mask = torch.tril(
+                    torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-self.sliding_window
+                )
+                # Those entries are overwritten with the most negative value of the hidden-state dtype.
+                attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
+                if attention_mask.shape[-1] <= 1:  # when decoding
+                    attention_mask = attention_mask[:, :, :, -self.sliding_window :]
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+        )
+        # The attention output is normalised before it is added back to the residual.
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        # Feed-forward sub-block, with the same norm / sublayer / norm / residual pattern.
+        hidden_states = self.pre_feedforward_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+# Shared docstring text passed to `add_start_docstrings` for the Gemma2 model classes below.
+GEMMA2_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`Gemma2Config`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+@add_start_docstrings(
+    "The bare Gemma2 Model outputting raw hidden-states without any specific head on top.",
+    GEMMA2_START_DOCSTRING,
+)
+class Gemma2PreTrainedModel(PreTrainedModel):
+    config_class = Gemma2Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Gemma2DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    _supports_quantized_cache = False
+    _supports_static_cache = True
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+    @classmethod
+    def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False):
+        """
+        Overloads `PreTrainedModel._check_and_enable_sdpa` so as to DISABLE torch SDPA by default on Gemma2 models.
+        SDPA reduces the model performance on Gemma2 because of the logits softcapping.
+        """
+        config = super()._check_and_enable_sdpa(config, hard_check_only=hard_check_only)
+        # if using the default path -> swap sdpa by eager
+        if not hard_check_only and config._attn_implementation == "sdpa":
+            config._attn_implementation = "eager"
+        return config
+# Shared description of the forward-pass arguments of the Gemma2 models.
+GEMMA2_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+            `past_key_values`).
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+            Two formats are allowed:
+            - a [`~cache_utils.Cache`] instance, see our
+            [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
+            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+            cache format.
+            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            legacy cache format will be returned.
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+            of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
+            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+            the complete sequence length.
+"""
+@add_start_docstrings(
+    "The bare Gemma2 Model outputting raw hidden-states without any specific head on top.",
+    GEMMA2_START_DOCSTRING,
+)
+class Gemma2Model(Gemma2PreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Gemma2DecoderLayer`]
+    Args:
+        config: Gemma2Config
+    """
+    def __init__(self, config: Gemma2Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        if getattr(config, "pretraining_tp", 1) != 1:
+            logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+    @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[HybridCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        if use_cache and past_key_values is None and not self.training:
+            batch_size, seq_len, _ = inputs_embeds.shape
+            past_key_values = HybridCache(
+                self.config,
+                batch_size=batch_size,
+                max_cache_len=seq_len,
+                device=self.device,
+                dtype=inputs_embeds.dtype,
+            )
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+        # embed positions
+        hidden_states = inputs_embeds
+        # normalized
+        # Gemma2 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
+        # See https://github.com/huggingface/transformers/pull/29402
+        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
+        hidden_states = hidden_states * normalizer
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    causal_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    cache_position,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=causal_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = past_key_values if use_cache else None
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+    @torch.no_grad()
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: HybridCache,
+        output_attentions: bool,
+    ):
+        # Flash Attention currently doesn't support static cache but Gemma2 work only with static cache.
+        # So we will pass in attention mask as is in any case, not only when ther's padding. Then we'll use its shape
+        # to cut out keys/values trailing 0 used in static cache. This workaround should be compile compatible
+        # as it doesn't cause dynamic control issues.
+        if self.config._attn_implementation == "flash_attention_2":
+            return attention_mask
+        dtype, device = input_tensor.dtype, input_tensor.device
+        sequence_length = input_tensor.shape[1]
+        if isinstance(past_key_values, HybridCache):
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = attention_mask.shape[-1] if attention_mask is not None else input_tensor.shape[1]
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            device=device,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+        return causal_mask
+    @staticmethod
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            device (`torch.device`):
+                The device to plcae the 4D attention mask on.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`torch.Tensor`):
+                Batch size.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+        return causal_mask
+class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Gemma2Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[HybridCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        num_logits_to_keep: int = 0,
+        **loss_kwargs,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+            num_logits_to_keep (`int`, *optional*):
+                Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, GemmaForCausalLM
+        >>> model = GemmaForCausalLM.from_pretrained("google/gemma-2-9b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
+        >>> prompt = "What is your favorite condiment?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "What is your favorite condiment?"
+        ```"""
+        if self.training and self.config._attn_implementation != "eager":
+            logger.warning_once(
+                "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
+                f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
+            )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
+        if self.config.final_logit_softcapping is not None:
+            logits = logits / self.config.final_logit_softcapping
+            logits = torch.tanh(logits)
+            logits = logits * self.config.final_logit_softcapping
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        num_logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten: has a special cache type, `HybridCache`
+        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+        # Exception 1: when passing input_embeds, input_ids may be missing entries
+        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+        if past_key_values is not None:
+            if inputs_embeds is not None:  # Exception 1
+                input_ids = input_ids[:, -cache_position.shape[0] :]
+            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, cache_position]
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+                # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s
+                # `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride
+                # during the decoding. Here, simply using `.contiguous()` is not sufficient as in the
+                # batch size = 1 case, `position_ids` is already contiguous but with varying stride
+                # which retriggers a capture.
+                position_ids = position_ids.clone(memory_format=torch.contiguous_format)
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and cache_position[0] == 0:
+            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
+        else:
+            # The clone here is for the same reason as for `position_ids`.
+            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
+        if (
+            isinstance(past_key_values, HybridCache)
+            and attention_mask.ndim == 2
+            and not self.config._attn_implementation == "flash_attention_2"
+        ):
+            if model_inputs["inputs_embeds"] is not None:
+                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
+                device = model_inputs["inputs_embeds"].device
+            else:
+                batch_size, sequence_length = model_inputs["input_ids"].shape
+                device = model_inputs["input_ids"].device
+            attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position(
+                attention_mask,
+                sequence_length=sequence_length,
+                target_length=past_key_values.get_max_cache_shape(),
+                dtype=self.lm_head.weight.dtype,
+                device=device,
+                cache_position=cache_position,
+                batch_size=batch_size,
+            )
+        if num_logits_to_keep is not None:
+            model_inputs["num_logits_to_keep"] = num_logits_to_keep
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "use_cache": use_cache,
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+@add_start_docstrings(
+    """
+    The Gemma2 Model transformer with a sequence classification head on top (linear layer).
+    [`Gemma2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-2) do.
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    GEMMA2_START_DOCSTRING,
+)
+class Gemma2ForSequenceClassification(Gemma2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = Gemma2Model(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+                sequence_lengths = sequence_lengths % input_ids.shape[-1]
+                sequence_lengths = sequence_lengths.to(logits.device)
+            else:
+                sequence_lengths = -1
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+@add_start_docstrings(
+    """
+    The Gemma2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states
+    output) e.g. for Named-Entity-Recognition (NER) tasks.
+    """,
+    GEMMA2_START_DOCSTRING,
+)
+class Gemma2ForTokenClassification(Gemma2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = Gemma2Model(config)
+        if getattr(config, "classifier_dropout", None) is not None:
+            classifier_dropout = config.classifier_dropout
+        elif getattr(config, "hidden_dropout", None) is not None:
+            classifier_dropout = config.hidden_dropout
+        else:
+            classifier_dropout = 0.1
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.score = nn.Linear(config.hidden_size, config.num_labels)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0]
+        sequence_output = self.dropout(sequence_output)
+        logits = self.score(sequence_output)
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.config)
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )

modeling_spatialvla.py ADDED Viewed

	@@ -0,0 +1,599 @@

+# coding=utf-8
+# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union, Dict
+import os
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.linalg import inv
+import torchvision.transforms.functional as TF
+import torch.nn.functional as F
+from transformers.cache_utils import Cache, HybridCache, StaticCache
+from transformers.generation import GenerationMixin
+from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
+from transformers.utils import (
+    ModelOutput,
+    logging,
+)
+from .configuration_spatialvla import SpatialVLAConfig
+from .modeling_gemma2 import Gemma2ForCausalLM
+from transformers import AutoModel, ZoeDepthForDepthEstimation
+SIGLIP_MEAN, SIGLIP_STD = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
+ZOE_MEAN, ZOE_STD = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
+logger = logging.get_logger(__name__)
+from transformers import StoppingCriteria, StoppingCriteriaList
+class StopOnReasoningTag(StoppingCriteria):
+    def __init__(self, tokenizer, tag="<Reasoning>"):
+        self.tag_token_ids = tokenizer.tokenizer.encode(tag, add_special_tokens=False)[:-1]
+        # self.tag_token_ids = tmp.input_ids[:-1]
+        self.tag_length = len(self.tag_token_ids)
+    def __call__(self, input_ids, scores, **kwargs):
+        # Get the last tokens of the generated sequence
+        generated_tokens = input_ids[0].tolist()
+        # print("Hehe:", type(generated_tokens))
+        # print("lmao:", generated_tokens.shape)
+        # print("generated tokens",generated_tokens)
+        # print("tag length",self.tag_length)
+        # print("tag token",self.tag_token_ids)
+        # if len(generated_tokens) < tag_length:
+        #     return False
+        return generated_tokens[-self.tag_length:] == self.tag_token_ids
+class Ego3DPositionEmbeddingMLP(nn.Module):
+    """Absolute pos embedding, learned.
+    https://github.com/kwea123/nerf_pl/blob/52aeb387da64a9ad9a0f914ea9b049ffc598b20c/models/nerf.py#L4
+    """
+    def __init__(self, in_channels=3, num_pos_feats=768, n_freqs=8, logscale=True):
+        super(Ego3DPositionEmbeddingMLP, self).__init__()
+        self.n_freqs = n_freqs
+        self.freq_out_channels = in_channels * (2 * n_freqs + 1)
+        if logscale:
+            freq_bands = 2 ** torch.linspace(0, n_freqs - 1, n_freqs)
+        else:
+            freq_bands = torch.linspace(1, 2 ** (n_freqs - 1), n_freqs)
+        center = torch.tensor([0., 0., 2.]).repeat(in_channels // 3)
+        self.register_buffer("freq_bands", freq_bands, persistent=False)
+        self.register_buffer("center", center, persistent=False)
+        self.position_embedding_head = nn.Sequential(
+            nn.Linear(self.freq_out_channels, num_pos_feats),
+            nn.LayerNorm(num_pos_feats),
+            nn.ReLU(),
+            nn.Linear(num_pos_feats, num_pos_feats),
+        )
+        self._reset_parameters()
+    def _reset_parameters(self):
+        """init with small weights to maintain stable training."""
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p, gain=0.01)
+    @torch.no_grad()
+    def frequency_encoding(self, xyz):
+        """
+        Embeds x to (x, sin(2^k x), cos(2^k x), ...)
+        Different from the paper, "x" is also in the output
+        See https://github.com/bmild/nerf/issues/12
+        x \in [-2, 2]
+        y \in [-2, 2]
+        z \in [0., 4]
+        Inputs:
+            x: (b n m)
+        Outputs:
+            out: (b n o)
+        """
+        xyz_n = ((xyz - self.center) / 2.0).to(self.freq_bands.dtype)
+        xyz_feq = xyz_n.unsqueeze(-1) * self.freq_bands  # (b n m 1)
+        sin_xyz, cos_xyz = torch.sin(xyz_feq), torch.cos(xyz_feq)  # (b n m nf)
+        encoding = torch.cat([xyz_n.unsqueeze(-1), sin_xyz, cos_xyz], -1).reshape(*xyz.shape[:2], -1)
+        return encoding
+    def forward(self, xyz):
+        """Forward pass, xyz is (B, N, 3or6), output (B, N, F)."""
+        freq_encoding = self.frequency_encoding(xyz)
+        position_embedding = self.position_embedding_head(freq_encoding)
+        return position_embedding
+def process_zoe(pixel_values, pad_mode="reflect", output_size=(384, 512)):
+    """https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/zoedepth/image_processing_zoedepth.py"""
+    # h, w = images.shape[-2:]
+    # pad
+    ph, pw = 31, 31  # int((h / 2)**0.5 * 3), int((w / 2)**0.5 * 3) # 32, 31
+    images = F.pad(pixel_values, (pw, pw, ph, ph), mode=pad_mode)
+    # resize
+    size = (384, 384)  # get_resize_output_image_size
+    images = F.interpolate(images, size=size, mode="bicubic", align_corners=True)
+    # zoe: padding -> resize -> nomalize. we follow `nomalize -> padding -> resize` from siglip
+    images = TF.normalize(images, mean=ZOE_MEAN, std=ZOE_STD)
+    return images, ph, pw
+@dataclass
+class SpatialVLACausalLMOutputWithPast(ModelOutput):
+    """Output container returned by `SpatialVLAForConditionalGeneration.forward`."""
+    # Weighted training loss; only set when `labels` are passed to `forward`.
+    loss: Optional[torch.FloatTensor] = None
+    # Logits produced by the language model.
+    logits: torch.FloatTensor = None
+    # Key/value cache forwarded from the language model outputs.
+    past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None
+    # Hidden states forwarded from the language model outputs.
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    # Attention weights forwarded from the language model outputs.
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    # Projected image features; None when `forward` received no `pixel_values`.
+    image_hidden_states: Optional[torch.FloatTensor] = None
+class SpatialVLAMultiModalProjector(nn.Module):
+    def __init__(self, config: SpatialVLAConfig):
+        super().__init__()
+        self.linear = nn.Linear(config.vision_config.hidden_size, config.vision_config.projection_dim, bias=True)
+    def forward(self, image_features):
+        hidden_states = self.linear(image_features)
+        return hidden_states
+class SpatialVLAPreTrainedModel(PreTrainedModel):
+    config_class = SpatialVLAConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["SpatialVLAMultiModalProjector", "ZoeDepthForDepthEstimation", "Ego3DPositionEmbeddingMLP"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_cache_class = True
+    _supports_quantized_cache = True
+    _supports_static_cache = True
+    _supports_cache_class = True
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    def _init_weights(self, module):
+        std = (
+            self.config.initializer_range
+            if hasattr(self.config, "initializer_range")
+            else self.config.text_config.initializer_range
+        )
+        if hasattr(module, "class_embedding"):
+            module.class_embedding.data.normal_(mean=0.0, std=std)
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMixin):
+    def __init__(self, config: SpatialVLAConfig, vision_model=None, vision_zoe_model=None, projector_model=None, language_model=None):
+        """Assemble the vision tower, projector, language model and the optional 3D branch.
+        Args:
+            config: model configuration; `vision_config` and `text_config` are always
+                read, `vision_zoe_config` only when `use_vision_zoe` is set.
+            vision_model: pre-built vision tower; created from `config.vision_config` if falsy.
+            vision_zoe_model: pre-built depth estimator; created from `config.vision_zoe_config` if falsy.
+            projector_model: pre-built projector; a `SpatialVLAMultiModalProjector` is created if falsy.
+            language_model: pre-built language model; a `Gemma2ForCausalLM` is created if None.
+        """
+        super().__init__(config)
+        self.vision_tower = vision_model or AutoModel.from_config(config=config.vision_config)
+        self.multi_modal_projector = projector_model or SpatialVLAMultiModalProjector(config)
+        self.vocab_size = config.text_config.vocab_size
+        if language_model is None:
+            language_model = Gemma2ForCausalLM(config=config.text_config)
+        # Re-export the language model's tied-weight keys under this model's prefix.
+        if language_model._tied_weights_keys is not None:
+            self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
+        self.language_model = language_model
+        if config.use_vision_zoe:
+            self.vision_zoe_model = vision_zoe_model or ZoeDepthForDepthEstimation(config.vision_zoe_config)
+            # Each patch contributes reso**2 sample points with 3 coordinates each.
+            self.position_embedding_3d = Ego3DPositionEmbeddingMLP(
+                config.ego3d_patch_reso**2 * 3, num_pos_feats=config.vision_config.hidden_size, n_freqs=config.n_freqs
+            )
+            # register buffer
+            patch_size, reso, image_size = config.vision_config.patch_size, config.ego3d_patch_reso, config.vision_config.image_size
+            # Grid of sample positions with stride patch_size // reso, shifted to the cell centres.
+            y, x = torch.meshgrid(torch.arange(0, image_size, patch_size // reso), torch.arange(0, image_size, patch_size // reso), indexing="ij")  # (h//sp w//sp)
+            y, x = y + patch_size / reso / 2, x + patch_size / reso / 2
+            # Homogeneous pixel coordinates (x, y, 1), flattened over the grid.
+            uv_h = torch.stack([x, y, torch.ones_like(x)], dim=0).reshape(3, -1)  # (3 hw)
+            self.register_buffer("uv_h", uv_h, persistent=False)
+        # shared spatial embeddings for <ACTION> <IMG>
+        if config.use_spatial_token:
+            self.spatial_embed_tokens = nn.Embedding(self.config.spatial_token_num, config.text_config.hidden_size)
+        else:
+            self.spatial_embed_tokens = None
+        # -1 stands in when the config defines no pad token.
+        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        # Not set here; `criteria_value()` reads it, so it has to be assigned from outside.
+        self.processor = None
+    def criteria_value(self):
+        self.criteria = StoppingCriteriaList([StopOnReasoningTag(self.processor, "<Reasoning>")])
+    def backproject_patch(self, K: torch.Tensor, depth: torch.Tensor, patch_size=14, reso=2) -> torch.Tensor:
+        """
+        Backproject depth map to 3D points in camera coordinate.
+        Args:
+            K: camera intrinsic matrix (b 3 3)
+            depth: depth map (b 1 h w)
+            patch_size: patch size for siglip
+            reso: reso^2 -> sample points in each patch
+        patch sz = 14  ......
+        ┌────────┬────────┐
+        │ ─    ─ │ ─    ─ │
+        │ points │        ├─ ─ ─
+        │ ─    ─ │ ─    ─ │
+        ├────────┼────────┤
+        │ ─    ─ │ ─    ─ │
+        │        │        │
+        │ ─    ─ │ ─    ─ │
+        └────────┴────────┘
+        reso=2───►points=4
+            │
+            │
+        """
+        b, c, h, w = depth.shape
+        hp, wp = h // patch_size, w // patch_size
+        sub_hp = sub_wp = reso
+        patch_depth = F.interpolate(depth, size=(hp * reso, wp * reso), mode="area").reshape(b, c, -1)
+        p_cam = (inv(K.float()) @ self.uv_h.float()) * patch_depth  # (b 3 3) @ (3 hw) -> (b 3 hw) * (b 1 hw) -> (b 3 hw)
+        patch_p_cam = p_cam.reshape(b, 3, hp, sub_hp, wp, sub_wp).permute(0, 2, 4, 3, 5, 1).reshape(b, hp * wp, -1)
+        return patch_p_cam
+    def get_input_embeddings(self):
+        """Return the input embedding module of the wrapped language model."""
+        return self.language_model.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        """Replace the input embedding module of the wrapped language model."""
+        self.language_model.set_input_embeddings(value)
+    def get_output_embeddings(self):
+        """Return the output embedding module of the wrapped language model."""
+        return self.language_model.get_output_embeddings()
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the output embedding module of the wrapped language model."""
+        self.language_model.set_output_embeddings(new_embeddings)
+    def set_decoder(self, decoder):
+        """Forward `decoder` to the wrapped language model's `set_decoder`."""
+        self.language_model.set_decoder(decoder)
+    def get_decoder(self):
+        """Return the decoder of the wrapped language model."""
+        return self.language_model.get_decoder()
+    def tie_weights(self):
+        """Delegate weight tying to the wrapped language model and return its result."""
+        return self.language_model.tie_weights()
+    def resize_token_embeddings(
+        self,
+        new_num_tokens: Optional[int] = None,
+        pad_to_multiple_of: Optional[int] = None,
+        mean_resizing: bool = True,
+    ) -> nn.Embedding:
+        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
+        vocab_size = model_embeds.weight.shape[0]
+        self.config.text_config.vocab_size = self.vocab_size = self.config._vocab_size = vocab_size
+        self.tie_weights()
+        return model_embeds
+    def _update_causal_mask(
+        self,
+        attention_mask,
+        token_type_ids,
+        past_key_values,
+        cache_position,
+        input_ids=None,
+        inputs_embeds=None,
+        is_training: bool = False,
+    ):
+        """Build the attention mask that is handed to the language model.
+        With flash attention the 2D mask is returned unchanged when it contains a 0.0,
+        otherwise None. A 4D `attention_mask` is returned as is. In all other cases an
+        additive mask of shape (batch, 1, sequence_length, target_length) is created,
+        where blocked positions hold the minimum value of `self.dtype` and allowed
+        positions hold 0.
+        NOTE(review): in training mode positions whose `token_type_ids` equal 0 are
+        opened for every query; presumably these are the prefix (image + prompt)
+        tokens -- confirm against the processor.
+        """
+        if self.config.text_config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+        using_static_cache = isinstance(past_key_values, StaticCache)
+        min_dtype = torch.finfo(self.dtype).min
+        inputs_lead_dim = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
+        sequence_length = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+        # Number of key positions: the cache capacity for fixed-size caches, otherwise
+        # the attention-mask width (or positions seen so far when no mask is given).
+        if using_static_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        elif isinstance(past_key_values, HybridCache):
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else cache_position[0] + sequence_length + 1
+            )
+        if attention_mask is not None and attention_mask.dim() == 4:
+            return attention_mask
+        # Start fully blocked, then open positions step by step.
+        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=self.dtype, device=cache_position.device)
+        if sequence_length != 1:
+            # Training: strictly upper-triangular blocking. Otherwise: the first
+            # `sequence_length` key columns are opened for every query.
+            if is_training: causal_mask = torch.triu(causal_mask, diagonal=1)
+            else: causal_mask[:, :sequence_length] = 0.0
+        # Keep the blocking value only for keys located after the query's cache position.
+        causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(inputs_lead_dim, 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            mask_length = attention_mask.shape[-1]
+            # A sum of 0 means "open in the causal mask" and "0 in the attention mask",
+            # i.e. a padded key that must be blocked.
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
+            padding_mask = padding_mask == 0
+            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(padding_mask, min_dtype)
+            if is_training:
+                # Open every key whose token type is 0, regardless of the query position.
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0)
+        return causal_mask
+    def get_image_features(self, pixel_values: torch.FloatTensor, intrinsic: torch.FloatTensor):
+        """Encode images into language-model input features.
+        The images are normalized and passed through the vision tower. When
+        `config.use_vision_zoe` is set, a depth map is predicted, backprojected to 3D
+        points per patch with `intrinsic`, embedded, and added to the vision features.
+        The result is projected and divided by sqrt(text hidden size).
+        Args:
+            pixel_values: image batch; normalization is applied inside this method.
+            intrinsic: camera intrinsics forwarded to `backproject_patch`; only read
+                when `config.use_vision_zoe` is set.
+        Returns:
+            The projected, scaled image features.
+        """
+        siglip_pixel_values = TF.normalize(pixel_values, mean=SIGLIP_MEAN, std=SIGLIP_STD)
+        image_outputs = self.vision_tower(siglip_pixel_values)
+        # ego3d position encoding
+        if self.config.use_vision_zoe:
+            zoe_pixel_values, ph, pw = process_zoe(pixel_values, pad_mode="reflect")
+            # Depth estimation and backprojection run without gradients; only the
+            # position-embedding MLP below is differentiable.
+            with torch.no_grad():
+                pvh, pvw = pixel_values.shape[-2:]
+                depth = self.vision_zoe_model(pixel_values=zoe_pixel_values).predicted_depth
+                # Resize the prediction back to the padded input size, then crop the padding.
+                depth = F.interpolate(
+                    depth.unsqueeze(1),
+                    size=(pvh+2*ph, pvw+2*pw),
+                    mode="bicubic",
+                    align_corners=True,
+                )[..., ph:-ph, pw:-pw]
+                xyz = self.backproject_patch(
+                    intrinsic, depth, patch_size=self.config.vision_config.patch_size, reso=self.config.ego3d_patch_reso
+                )  # (b, n, 3*4)
+            pos_embed_3d = self.position_embedding_3d(xyz)
+            selected_image_feature = image_outputs.last_hidden_state + pos_embed_3d
+        else:
+            selected_image_feature = image_outputs.last_hidden_state
+        image_features = self.multi_modal_projector(selected_image_feature)
+        image_features = image_features / (self.config.text_config.hidden_size**0.5)
+        return image_features
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        actions: Optional[torch.FloatTensor] = None,
+        intrinsic: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        num_logits_to_keep: int = 0,
+    ) -> Union[Tuple, SpatialVLACausalLMOutputWithPast]:
+        output_attentions = output_attentions or self.config.output_attentions
+        output_hidden_states = output_hidden_states or self.config.output_hidden_states
+        return_dict = return_dict or self.config.use_return_dict
+        is_training = token_type_ids is not None and labels is not None
+        if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids).clone() # avoid checkpint grad True
+        if self.config.use_spatial_token:
+            spatial_selected = (input_ids >= self.config.action_token_begin_idx) & (input_ids < self.config.action_token_begin_idx + self.config.spatial_token_num)
+            inputs_embeds[spatial_selected] = inputs_embeds[spatial_selected] * 0.0 + self.spatial_embed_tokens(input_ids[spatial_selected] - self.config.action_token_begin_idx)
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device)
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0) + 1  # Paligemma positions are 1-indexed
+        # merge
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values, intrinsic)
+            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
+            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
+            if inputs_embeds[special_image_mask].numel() != image_features.numel():
+                image_tokens_in_text = torch.sum(input_ids == self.config.image_token_index)
+                raise ValueError(
+                    f"Number of images does not match number of special image tokens in the input text. "
+                    f"Got {image_tokens_in_text} image tokens in the text but {image_features.shape[0] * image_features.shape[1]} "
+                    "tokens from image embeddings."
+                )
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+        # mask out pad-token-ids in labels for BC
+        if labels is not None and self.pad_token_id in labels:
+            logger.warning_once(
+                "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. ",
+                "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.",
+            )
+            labels = torch.where(input_ids == self.pad_token_id, self.config.ignore_index, labels)
+        causal_mask = self._update_causal_mask(
+            attention_mask, token_type_ids, past_key_values, cache_position, input_ids, inputs_embeds, is_training
+        )
+        outputs = self.language_model(
+            attention_mask=causal_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+            num_logits_to_keep=num_logits_to_keep,
+        )
+        logits = outputs.logits
+        # print("logits", logits.shape)
+        loss = None
+        if labels is not None:
+            logits = logits.float()
+            shift_logits = logits[..., :-1, :]
+            shift_labels = labels[..., 1:]
+            mask = (shift_labels >= self.action_tokenizer.translation_tokenizer.token_start_idx) & (
+                shift_labels <= self.action_tokenizer.gripper_tokenizer.token_end_idx
+            )
+            if attention_mask is not None:
+                shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
+                shift_logits_action = shift_logits[(shift_attention_mask & mask).to(logits.device) != 0].contiguous()
+                shift_labels_action = shift_labels[(shift_attention_mask & mask).to(shift_labels.device) != 0].contiguous()
+                shift_logits_reason = shift_logits[(shift_attention_mask & ~mask).to(logits.device) != 0].contiguous()
+                shift_labels_reason = shift_labels[(shift_attention_mask & ~mask).to(shift_labels.device) != 0].contiguous()
+            else:
+                shift_logits_action = shift_logits[mask].contiguous()
+                shift_logits_reason = shift_logits[~mask].contiguous()
+                shift_labels_action = shift_labels[mask].contiguous()
+                shift_labels_reason = shift_labels[~mask].contiguous()
+            loss_fct = nn.CrossEntropyLoss()
+            flat_logits_action = shift_logits_action.view(-1, self.config.text_config.vocab_size)
+            flat_labels_action = shift_labels_action.view(-1).to(shift_logits.device)
+            loss_action = loss_fct(flat_logits_action, flat_labels_action)
+            flat_logits_reason = shift_logits_reason.view(-1, self.config.text_config.vocab_size)
+            flat_labels_reason = shift_labels_reason.view(-1).to(shift_logits.device)
+            l1_loss_reasoning =  loss_fct(flat_logits_reason, flat_labels_reason)
+            loss = 0.7*loss_action + 0.3*l1_loss_reasoning
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return SpatialVLACausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+    # AR inference
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        pixel_values=None,
+        intrinsic=None,
+        attention_mask=None,
+        token_type_ids=None,
+        use_cache=True,
+        num_logits_to_keep=None,
+        labels=None,
+        **kwargs,
+    ):
+        """Build the keyword arguments for one generation step.
+        Starts from the language model's own `prepare_inputs_for_generation` and adds
+        the multimodal inputs on top.
+        Returns:
+            The dict of model inputs for `forward`.
+        """
+        model_inputs = self.language_model.prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            cache_position=cache_position,
+            use_cache=use_cache,
+            num_logits_to_keep=num_logits_to_keep,
+            token_type_ids=token_type_ids,
+            **kwargs,
+        )
+        # Positions are shifted by one, matching the 1-indexed positions used in `forward`.
+        if model_inputs.get("position_ids") is not None:
+            model_inputs["position_ids"] += 1
+        # Images are only passed on the first step (cache position 0).
+        if cache_position[0] == 0:
+            model_inputs["pixel_values"] = pixel_values
+        is_training = token_type_ids is not None and labels is not None
+        # With a HybridCache the mask is precomputed here on the first step.
+        if cache_position[0] == 0 and isinstance(past_key_values, HybridCache):
+            causal_mask = self._update_causal_mask(attention_mask, token_type_ids, past_key_values, cache_position, input_ids, inputs_embeds, is_training)
+            model_inputs["attention_mask"] = causal_mask
+        model_inputs["intrinsic"] = intrinsic
+        return model_inputs
+    # @torch.no_grad()
+    @torch.inference_mode()
+    def predict_action(
+        self,
+        model_inputs,
+    ) -> torch.Tensor:
+        model_inputs = model_inputs.to(torch.bfloat16).to(self.device)
+        input_len = model_inputs["input_ids"].shape[-1]
+        generation_outputs = self.generate(**model_inputs, max_new_tokens=256, stopping_criteria=self.criteria,do_sample=False)
+        return generation_outputs[:,input_len:]
+    @torch.no_grad()
+    def predict_action_with_attentions(
+        self,
+        model_inputs: Dict[str, torch.Tensor],
+        return_attentions: bool = True,
+    ) -> Tuple[torch.Tensor, Tuple]:
+        """Greedily generate up to 256 new tokens and also return the attention weights.
+        Unlike `predict_action`, no stopping criteria are passed to `generate`.
+        Args:
+            model_inputs: processor output; it must support `.to(device, dtype)`.
+            return_attentions: forwarded to `generate` as `output_attentions`.
+        Returns:
+            (generated_ids, attentions): the generated ids without the prompt part and
+            the `attentions` entry of the generation output.
+        """
+        model_inputs = model_inputs.to(self.device, torch.bfloat16)
+        input_len = model_inputs["input_ids"].shape[-1]
+        model_outputs = self.generate(
+            **model_inputs,
+            max_new_tokens=256,
+            do_sample=False,
+            # return_attentions=return_attentions,
+            output_attentions=return_attentions,
+            return_dict_in_generate=True,
+        )
+        # Drop the prompt tokens; keep only what was generated.
+        generated_ids = model_outputs["sequences"][:, input_len:]
+        attentions = model_outputs["attentions"]
+        return generated_ids, attentions
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+        *model_args,
+        config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
+        cache_dir: Optional[Union[str, os.PathLike]] = None,
+        ignore_mismatched_sizes: bool = False,
+        force_download: bool = False,
+        local_files_only: bool = False,
+        token: Optional[Union[str, bool]] = None,
+        revision: str = "main",
+        use_safetensors: Optional[bool] = None,
+        weights_only: bool = True,
+        **kwargs,
+    ):
+        """Load the model, then sync the spatial embeddings into the language model.
+        All arguments are forwarded unchanged to the parent `from_pretrained`. When
+        `config.use_spatial_token` is set, the last `spatial_token_num` rows of the
+        language model's token embedding are overwritten with `spatial_embed_tokens`.
+        Returns:
+            The loaded model.
+        """
+        model = super().from_pretrained(
+            pretrained_model_name_or_path,
+            *model_args,
+            config=config,
+            cache_dir=cache_dir,
+            ignore_mismatched_sizes=ignore_mismatched_sizes,
+            force_download=force_download,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+            use_safetensors=use_safetensors,
+            weights_only=weights_only,
+            **kwargs,
+        )
+        if model.config.use_spatial_token:
+            # Copy the shared spatial table into the tail of the token embedding matrix.
+            model.language_model.model.embed_tokens.weight.data[-model.config.spatial_token_num:] = model.spatial_embed_tokens.weight.data
+        return model

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_spatialvla.SpatialVLAProcessor"
+  },
+  "do_convert_rgb": null,
+  "do_normalize": false,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SiglipImageProcessor",
+  "image_seq_length": 256,
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "SpatialVLAProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 224,
+    "width": 224
+  }
+}

processing_spatialvla.py ADDED Viewed

	@@ -0,0 +1,259 @@

+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from typing import List, Optional, Union, Dict
+import numpy as np
+import torch
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput, is_valid_image
+from transformers.processing_utils import Unpack, _validate_images_text_input_order, ProcessorMixin
+from transformers.tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput
+from transformers.utils import logging
+from transformers.models.paligemma.processing_paligemma import (
+    make_batched_images,
+    build_string_from_input,
+    _is_str_or_image,
+    PaliGemmaProcessorKwargs,
+    IMAGE_TOKEN,
+    EXTRA_TOKENS
+)
+from .action_tokenizer import SpatialActionTokenizer
+logger = logging.get_logger(__name__)
+class SpatialVLAProcessor(ProcessorMixin):
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template"]
+    image_processor_class = "SiglipImageProcessor"
+    tokenizer_class = ("GemmaTokenizer", "GemmaTokenizerFast")
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        chat_template=None,
+        statistics: Optional[dict] = None,
+        bin_policy=None,
+        intrinsic_config=None,
+        action_config=None,
+        num_obs_steps=1,
+        obs_delta=1,
+        action_chunk_size=1,
+        min_sigma=0.0,
+        **kwargs,
+    ):
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+        if not hasattr(image_processor, "image_seq_length"):
+            raise ValueError("Image processor is missing an `image_seq_length` attribute.")
+        self.image_seq_length = image_processor.image_seq_length
+        if not hasattr(tokenizer, "image_token"):
+            image_token = AddedToken(IMAGE_TOKEN, normalized=False, special=True)
+            tokens_to_add = {"additional_special_tokens": [image_token]}
+            tokenizer.add_special_tokens(tokens_to_add)
+            self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+        else:
+            self.image_token_id = tokenizer.image_token_id
+        tokenizer.add_tokens(EXTRA_TOKENS)
+        tokenizer.add_bos_token = False
+        tokenizer.add_eos_token = False
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+        # action tokenizer
+        self.statistics = statistics if statistics else {}
+        self.bin_policy = bin_policy
+        self.min_sigma = min_sigma
+        self.intrinsic_config = intrinsic_config
+        self.action_config = action_config
+        self.num_obs_steps = num_obs_steps
+        self.obs_delta = obs_delta
+        self.action_chunk_size = action_chunk_size
+        self.dataset_intrinsics = {}
+        height, width = image_processor.size["height"], image_processor.size["width"]
+        # scale intrinsic matrix
+        for k, v in intrinsic_config.items():
+            K = torch.tensor(v["intrinsic"]).float()
+            K[:2] *= torch.tensor([width / v["width"], height / v["height"]])[:, None]
+            self.dataset_intrinsics[k] = K
+        self.action_tokenizer = SpatialActionTokenizer(
+            tokenizer=tokenizer, num_bins=action_config["num_bins"],
+            bin_policy=bin_policy, use_spherical=action_config["use_spherical"],
+            min_sigma=min_sigma,
+        )
+    def __call__(
+        self,
+        reasoning: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        unnorm_key: Optional[str] = None,
+        suffix_actions: Optional[np.array] = None, # (t e)
+        **kwargs: Unpack[PaliGemmaProcessorKwargs],
+    ) -> BatchFeature:
+        images, text = _validate_images_text_input_order(images, text)
+        output_kwargs = self._merge_kwargs(
+            PaliGemmaProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if suffix_actions is not None:
+            action_tokens = self.action_tokenizer(suffix_actions) # (n,3)
+            suffix="".join(action_tokens.flatten())
+            suffix = f"{suffix}<Reasoning>: {reasoning}"
+        else:
+            suffix = output_kwargs["text_kwargs"].pop("suffix", None)
+        return_token_type_ids = True if suffix is not None else False
+        if images is None:
+            raise ValueError("`images` are expected as arguments to a `PaliGemmaProcessor` instance.")
+        if text is None:
+            logger.warning_once( "You are using PaliGemma without a text prefix. It will perform as a picture-captioning model.")
+            text = ""
+        if _is_str_or_image(text):
+            text = [text]
+        elif isinstance(text, list) and _is_str_or_image(text[0]):
+            pass
+        if text is not None and images is not None:
+            if not any(IMAGE_TOKEN in sample for sample in text):
+                if isinstance(text, List) and isinstance(images, List):
+                    if len(images) != len(text):
+                        raise ValueError(
+                            f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image or list of images."
+                        )
+                if is_valid_image(images):
+                    images = [[images]]
+                elif isinstance(images, list) and is_valid_image(images[0]):
+                    images = [[image] for image in images]
+                elif not (isinstance(images, list) and isinstance(images[0], list) and is_valid_image(images[0][0])):
+                    raise ValueError("images must be an image, list of images or list of list of images")
+                if suffix is not None and _is_str_or_image(suffix): suffix = [suffix]
+                if suffix is not None: suffix = [sfx + self.tokenizer.eos_token for sfx in suffix]
+                print(f"suffix: {suffix}")
+                input_strings = [
+                    build_string_from_input(
+                        prompt=prompt,
+                        bos_token=self.tokenizer.bos_token,
+                        image_seq_len=self.image_seq_length,
+                        image_token=IMAGE_TOKEN,
+                        num_images=len(image_list) if isinstance(image_list, list) else 1,
+                    )
+                    for prompt, image_list in zip(text, images)
+                ]
+                images = make_batched_images(images)
+            else:
+                expanded_samples = []
+                for sample in text:
+                    expanded_sample = sample.replace(IMAGE_TOKEN, IMAGE_TOKEN * self.image_seq_length)
+                    bos_rfind_index = expanded_sample.rfind(IMAGE_TOKEN)
+                    bos_index = bos_rfind_index + len(IMAGE_TOKEN) if bos_rfind_index != -1 else 0
+                    expanded_sample = (
+                        expanded_sample[:bos_index] + self.tokenizer.bos_token + expanded_sample[bos_index:]
+                    )
+                    expanded_samples.append(expanded_sample)
+                input_strings = [f"{sample}\n" for sample in expanded_samples]
+        pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
+        if output_kwargs["text_kwargs"].get("max_length", None) is not None:
+            output_kwargs["text_kwargs"]["max_length"] += self.image_seq_length
+        inputs = self.tokenizer(
+            input_strings,
+            text_pair=suffix,
+            return_token_type_ids=return_token_type_ids,
+            **output_kwargs["text_kwargs"],
+        )
+        # print(suffix)
+        intrinsic = self.dataset_intrinsics[unnorm_key] if unnorm_key in self.dataset_intrinsics else self.dataset_intrinsics["default"]
+        return_data = {**inputs, "pixel_values": pixel_values, "intrinsic": intrinsic}
+        if return_token_type_ids:
+            labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
+            return_data.update({"labels": labels})
+        return BatchFeature(data=return_data)
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Gemma
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+    def decode_actions(
+        self,
+        generation_outputs: torch.Tensor,
+        unnorm_key: Optional[str] = None,
+    ) -> Dict[str, torch.Tensor]:
+        action_token_num = 3  # translation + rotation + gripper
+        predicted_action_token_ids = generation_outputs[0, : action_token_num * self.action_chunk_size].detach().cpu().long().numpy()
+        assert self.tokenizer.eos_token != predicted_action_token_ids[-1], "[error] actions contain EOS token, please check you truncation settings!"
+        if predicted_action_token_ids.shape[0] < action_token_num * self.action_chunk_size:  # pad with zeros
+            logger.warning(f"Padding zero action!")
+            predicted_action_token_ids = np.concatenate(
+                [
+                    predicted_action_token_ids,
+                    np.zeros(action_token_num * self.action_chunk_size - predicted_action_token_ids.shape[0], dtype=np.longlong),
+                ]
+            )
+        predicted_action_token_ids = predicted_action_token_ids.reshape(-1, action_token_num)
+        normalized_action_chunks = self.action_tokenizer.decode_token_ids_to_actions(predicted_action_token_ids)
+        if unnorm_key is None:
+            logger.warning(f"unnorm_key {unnorm_key} is not in statistics, use next one")
+            unnorm_key = next(self.statistics.keys())
+        action_norm_stats = self.statistics[unnorm_key]["action"]
+        action_dim = len(action_norm_stats["q01"])
+        mask = np.array(action_norm_stats.get("mask", np.ones(action_dim)), dtype=bool)
+        action_high, action_low = np.array(action_norm_stats["q99"]), np.array(action_norm_stats["q01"])
+        actions = []
+        for normalized_actions in normalized_action_chunks:
+            action = np.where(
+                mask,
+                0.5 * (normalized_actions + 1) * (action_high - action_low) + action_low,
+                normalized_actions,
+            )
+            actions.append(action)
+        actions = np.stack(actions)
+        return {"actions": actions, "action_ids": predicted_action_token_ids}

processor_config.json ADDED Viewed

	@@ -0,0 +1,327 @@

+{
+  "action_chunk_size": 4,
+  "action_config": {
+    "distribution": "gaussian",
+    "num_bins": {
+      "gripper": 2,
+      "rotation": {
+        "pitch_bins": 16,
+        "roll_bins": 16,
+        "yaw_bins": 16
+      },
+      "total": 8194,
+      "translation": {
+        "phi_bins": 32,
+        "r_bins": 8,
+        "theta_bins": 16
+      }
+    },
+    "use_spherical": true
+  },
+  "auto_map": {
+    "AutoProcessor": "processing_spatialvla.SpatialVLAProcessor"
+  },
+  "bin_policy": {
+    "rotation": {
+      "pitch_bins": [
+        -1.0,
+        -0.4236293919771139,
+        -0.2973624970533583,
+        -0.21059576820767317,
+        -0.14044938844843713,
+        -0.0791789125851777,
+        -0.023048480293744636,
+        0.030167161843358437,
+        0.08204200739679071,
+        0.13389374587953162,
+        0.18703587338481154,
+        0.24302765601977616,
+        0.30406026229156,
+        0.37378821800324374,
+        0.45971873753598247,
+        0.5836276162507279,
+        0.9999999999999991
+      ],
+      "roll_bins": [
+        -0.9999999999999999,
+        -0.48696292418679255,
+        -0.3676073739484146,
+        -0.28549591499691584,
+        -0.21907612836502022,
+        -0.16103745543314568,
+        -0.10784881328909159,
+        -0.05740408497876547,
+        -0.00821079709993185,
+        0.040983744804115825,
+        0.0914324636886914,
+        0.144628635967148,
+        0.20268023967111456,
+        0.269122809861373,
+        0.35127995163586373,
+        0.4707654855904555,
+        0.9999999999999944
+      ],
+      "yaw_bins": [
+        -1.0,
+        -0.4473279373756505,
+        -0.3332741619243962,
+        -0.25494122059754437,
+        -0.19161826850058544,
+        -0.1363039890445066,
+        -0.08562203792073503,
+        -0.03756062019257189,
+        0.009304860859811767,
+        0.05616950282205181,
+        0.1042282501882964,
+        0.15490516155832307,
+        0.21021078414249433,
+        0.2735184749468475,
+        0.35182078330381356,
+        0.465787139096136,
+        0.9999999999999982
+      ]
+    },
+    "translation": {
+      "phi_bins": [
+        -3.141592653589793,
+        -2.611427824867527,
+        -2.250204012654159,
+        -1.9664312602343461,
+        -1.727567317192397,
+        -1.5180333466123621,
+        -1.3290717520482633,
+        -1.1552219136523942,
+        -0.9928174267972283,
+        -0.8392525074770641,
+        -0.6925871222960145,
+        -0.5513178350935227,
+        -0.41423640072445,
+        -0.28033770999881874,
+        -0.14875675757685075,
+        -0.018723165750234833,
+        0.11047361805186211,
+        0.2395128839618976,
+        0.3690681218889241,
+        0.49983192073784344,
+        0.6325427359682341,
+        0.7680163128439619,
+        0.9071854848022353,
+        1.0511538919389105,
+        1.2012725735857557,
+        1.359254858953288,
+        1.52735781547609,
+        1.708685638209645,
+        1.9077325684228925,
+        2.1314415012063312,
+        2.3915198815314898,
+        2.710422326959981,
+        3.141592653589793
+      ],
+      "r_bins": [
+        0.0,
+        0.24715317617636928,
+        0.3738653185927623,
+        0.4741546344271254,
+        0.5660713758244397,
+        0.6591763123588074,
+        0.7640208367398835,
+        0.905077308623254,
+        1.7320508075688772
+      ],
+      "theta_bins": [
+        0.0,
+        0.9482227818534477,
+        1.232949635587941,
+        1.4288683204982662,
+        1.586471048273713,
+        1.7230822806307542,
+        1.8470152323808435,
+        1.9631023836372554,
+        2.0745890527961355,
+        2.1839605665055863,
+        2.2933911513280534,
+        2.405063409356251,
+        2.521491080766048,
+        2.6459805006534918,
+        2.7834919014248793,
+        2.942634872432456,
+        3.141592653589793
+      ]
+    }
+  },
+  "intrinsic_config": {
+    "bridge_orig/1.0.0": {
+      "height": 480,
+      "intrinsic": [
+        [
+          623.588,
+          0,
+          319.501
+        ],
+        [
+          0,
+          623.588,
+          239.545
+        ],
+        [
+          0,
+          0,
+          1
+        ]
+      ],
+      "width": 640
+    },
+    "default": {
+      "height": 480,
+      "intrinsic": [
+        [
+          623.588,
+          0,
+          319.501
+        ],
+        [
+          0,
+          623.588,
+          239.545
+        ],
+        [
+          0,
+          0,
+          1
+        ]
+      ],
+      "width": 640
+    }
+  },
+  "min_sigma": 0.0,
+  "num_obs_steps": 1,
+  "obs_delta": 1,
+  "processor_class": "SpatialVLAProcessor",
+  "statistics": {
+    "bridge_orig/1.0.0": {
+      "action": {
+        "mask": [
+          true,
+          true,
+          true,
+          true,
+          true,
+          true,
+          false
+        ],
+        "max": [
+          0.055814191699028015,
+          0.09974314272403717,
+          0.07338187843561172,
+          0.41116073727607727,
+          0.3018309473991394,
+          6.236903190612793,
+          1.0
+        ],
+        "mean": [
+          0.00020478814258240163,
+          0.00012579727626871318,
+          -0.00013988478167448193,
+          -0.00017113501962739974,
+          -0.0003538677701726556,
+          0.00019135206821374595,
+          0.5760049223899841
+        ],
+        "min": [
+          -0.0696982890367508,
+          -0.0885118767619133,
+          -0.06311047077178955,
+          -0.3737139105796814,
+          -0.3136279881000519,
+          -6.244088649749756,
+          0.0
+        ],
+        "q01": [
+          -0.02925512194633484,
+          -0.04143750108778477,
+          -0.025954971089959145,
+          -0.08004292100667953,
+          -0.09390476904809475,
+          -0.204636562615633,
+          0.0
+        ],
+        "q99": [
+          0.028467297554016113,
+          0.04052329249680042,
+          0.040265134535729885,
+          0.0807134248316288,
+          0.08023637719452381,
+          0.2036343589425087,
+          1.0
+        ],
+        "std": [
+          0.009861940518021584,
+          0.013633579015731812,
+          0.012660318985581398,
+          0.028013890609145164,
+          0.030946530401706696,
+          0.08104098588228226,
+          0.49418240785598755
+        ]
+      },
+      "num_trajectories": 4173,
+      "num_transitions": 147976,
+      "proprio": {
+        "max": [
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0
+        ],
+        "mean": [
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0
+        ],
+        "min": [
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0
+        ],
+        "q01": [
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0
+        ],
+        "q99": [
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0
+        ],
+        "std": [
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0
+        ]
+      }
+    }
+  }
+}

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:540f3db439cbbdd36aca5c70433757fc4c1857e7fffc16f42148fcf92b711029
+size 14244

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e537c6b0e9bf4083bd38cbee2a5d8aa973899c9e5eb91cf1a0e975838b005d5
+size 36157779

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dac8141dbe5733e9bd51a1f98aadd5a772028488a0ba49edfd727a9e437da291
+size 7544

zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,674 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import json
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# Set to a truthy value to make the helpers in this script print verbose diagnostics.
+debug = 0
+# load to cpu: this device is passed as ``map_location`` to every ``torch.load`` call below
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict``.
+    For every param group the per-rank flat tensors are concatenated into one vector, then each
+    parameter listed in the first model state's ``param_shapes`` is sliced out of that vector in
+    order and stored under its name.
+    Args:
+        - ``state_dict``: mapping that receives ``name -> tensor`` entries (modified in place)
+        - ``world_size``: number of ranks; also used for the alignment check
+        - ``fp32_flat_groups``: indexed as ``[rank][param_group]``
+        - ``zero_model_states``: parsed model states; only element 0 is read
+    Raises:
+        - ``ValueError``: the aligned consumed size of a group differs from its aligned available size
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # group i of every rank, concatenated in rank order
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset restarts at 0 for every param group; avail_numel is rebound to this group's size
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # shapes without a callable ``numel`` are treated as plain sequences of dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage 3 checkpoint into ``state_dict``.
+    Each parameter is re-assembled by taking the same ``[offset, offset + partitioned_numel)``
+    slice from every rank's flat tensor, concatenating the slices in rank order, cutting the
+    result to the parameter's element count and reshaping it.
+    Args:
+        - ``state_dict``: mapping that receives ``name -> tensor`` entries (modified in place)
+        - ``world_size``: number of ranks
+        - ``fp32_flat_groups``: one flat tensor per rank
+        - ``zero_model_states``: parsed model states; only element 0 is read
+    Raises:
+        - ``ValueError``: the consumed element count differs from the available one
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # every rank is assumed to hold a flat tensor of the same size as rank 0 - TODO confirm
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    # offset is a position inside a single rank's flat tensor, shared by all ranks
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering Sharded Weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    # convert the per-rank position into a total element count so it is comparable with avail_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        state_dict_split = split_torch_state_dict_into_shards(state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    # Save the model
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors}
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard, output_path)
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)