0xZohar committed on
Commit 398283d · verified · 1 Parent(s): c18faa3

Upload code/cube3d/training/engine.py

Files changed (1)
  1. code/cube3d/training/engine.py +1120 -0
code/cube3d/training/engine.py ADDED
@@ -0,0 +1,1120 @@
from typing import Union, Optional, Tuple

import torch
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR
from peft import LoraConfig, get_peft_model, TaskType
from tqdm import tqdm
from transformers import CLIPTextModelWithProjection, CLIPTokenizerFast

from cube3d.inference.logits_postprocesses import process_logits
from cube3d.inference.utils import (
    load_config,
    load_model_weights,
    parse_structured,
    load_model_weights_adaption,
)
from cube3d.model.autoencoder.one_d_autoencoder import OneDAutoEncoder
from cube3d.model.gpt.dual_stream_roformer import DualStreamRoformer
from cube3d.model.transformers.cache import Cache
from cube3d.model.transformers.rope import precompute_freqs_cis
from cube3d.training.utils import positional_encoding
from cube3d.config import HF_CACHE_DIR

class Engine:
    def __init__(
        self,
        config_path: str,
        gpt_ckpt_path: str,
        shape_ckpt_path: str,
        save_gpt_ckpt_path: str,
        device: torch.device,
        mode: str,
    ):
        """
        Initializes the engine with the given configuration and checkpoint paths.
        Args:
            config_path (str): Path to the configuration file.
            gpt_ckpt_path (str): Path to the GPT model checkpoint file.
            shape_ckpt_path (str): Path to the fine-tuned GPT checkpoint; loaded when mode is 'test'.
            shape_ckpt_path (str): Path to the shape model checkpoint file.
            device (torch.device): The device to run the models on (e.g., 'cpu' or 'cuda').
            mode (str): Either 'train' or 'test'; in 'test' mode the fine-tuned GPT weights are loaded.
        Attributes:
            cfg (dict): Loaded configuration from the config file.
            device (torch.device): The device to run the models on.
            gpt_model (DualStreamRoformer): The GPT model initialized and loaded with weights.
            shape_model (OneDAutoEncoder): The shape model initialized and loaded with weights.
            text_model (CLIPTextModelWithProjection): The text model initialized from a pretrained model.
            text_tokenizer (CLIPTokenizerFast): The tokenizer for the text model.
            max_new_tokens (int): Maximum number of new tokens for the shape model.
            min_id (int): Minimum ID for the shape model codes.
            max_id (int): Maximum ID for the shape model codes.
        """

        self.cfg = load_config(config_path)
        self.device = device

        self.gpt_model = DualStreamRoformer(
            parse_structured(DualStreamRoformer.Config, self.cfg.gpt_model)
        )

        # ------ training load: in 'test' mode, restore the fine-tuned GPT weights
        if mode == 'test':
            load_model_weights(
                self.gpt_model,
                save_gpt_ckpt_path,
            )

        self.gpt_model = self.gpt_model.to(self.device)

        self.shape_model = OneDAutoEncoder(
            parse_structured(OneDAutoEncoder.Config, self.cfg.shape_model)
        )
        load_model_weights(
            self.shape_model,
            shape_ckpt_path,
        )
        self.shape_model = self.shape_model.eval().to(self.device)

        # copy the VQ codebook into the GPT token-embedding table
        with torch.no_grad():
            codebook = self.shape_model.bottleneck.block.get_codebook()
            codebook = self.gpt_model.shape_proj(codebook).detach()
            self.gpt_model.transformer.wte.weight.data[: codebook.shape[0]] = codebook

        self.text_model = CLIPTextModelWithProjection.from_pretrained(
            self.cfg.text_model_pretrained_model_name_or_path,
            force_download=False,
            device_map=self.device,
            cache_dir=HF_CACHE_DIR,
        ).eval()
        print("text_model device:", self.text_model.device)
        self.text_tokenizer = CLIPTokenizerFast.from_pretrained(
            self.cfg.text_model_pretrained_model_name_or_path,
            cache_dir=HF_CACHE_DIR,
        )

        self.max_new_tokens = self.shape_model.cfg.num_encoder_latents
        self.min_id = 0
        self.max_id = self.shape_model.cfg.num_codes
        self.max_token_length = 110  # 110 for 'bottom'; 310 for 'car'

        self.x_prembeds = None

    @torch.inference_mode()
    def prepare_conditions_with_bbox(
        self,
        cond: torch.Tensor,
        bounding_box_tensor: Optional[torch.Tensor] = None,
    ):
        """
        Prepares condition embeddings by incorporating bounding box information.

        Concatenates bounding box embeddings to the existing condition tensor if the model
        supports bounding box projection. If no bounding box is provided, uses zero padding.

        Args:
            cond (torch.Tensor): The input condition embeddings tensor of shape (B, seq_len, dim).
            bounding_box_tensor (Optional[torch.Tensor], optional): The size of the bounding box
                as (x, y, z) dimensions represented as a tensor of shape (B, 3). If None, uses
                zero padding for the bounding box embeddings.

        Returns:
            torch.Tensor: The condition tensor with bounding box embeddings concatenated along
                the sequence dimension if bounding box projection is supported, otherwise
                returns the original condition tensor unchanged.
        """
        if not hasattr(self.gpt_model, "bbox_proj"):
            return cond

        if bounding_box_tensor is None:
            B = cond.shape[0]
            bounding_box_tensor = torch.zeros((B, 3), dtype=cond.dtype, device=self.device)

        bbox_emb = self.gpt_model.bbox_proj(bounding_box_tensor).unsqueeze(dim=1).expand(cond.shape[0], -1, -1)

        cond = torch.cat([cond, bbox_emb], dim=1)
        return cond

    @torch.inference_mode()
    def prepare_conditions_with_bboxs(
        self,
        cond: torch.Tensor,
        bounding_box_tensor: Optional[torch.Tensor] = None,
    ):
        """
        Identical to `prepare_conditions_with_bbox`; see that method for details.
        """
        return self.prepare_conditions_with_bbox(cond, bounding_box_tensor)

    @torch.inference_mode()
    def prepare_inputs(
        self,
        prompts: list[str],
        guidance_scale: float,
        bounding_box_xyz: Optional[Tuple[float]] = None,
    ):
        """
        Prepares the input embeddings for the model based on the provided prompts and guidance scale.
        Args:
            prompts (list[str]): A list of prompt strings to be encoded.
            guidance_scale (float): A scaling factor for guidance. If greater than 0.0, additional processing is applied.
            bounding_box_xyz (Optional[Tuple[float]], optional): The size of the bounding box for generation
                as (x, y, z) dimensions. Each value must be between 0 and 1.925. If None,
                uses default bounding box sizing.
        Returns:
            tuple: A tuple containing:
                - embed (torch.Tensor): The encoded input embeddings.
                - cond (torch.Tensor): The condition embeddings, which may include unconditional embeddings if guidance_scale is greater than 0.0.
        """
        prompt_embeds = self.run_clip(prompts)  # [1, 77, 1536]

        with torch.autocast(self.device.type, dtype=torch.bfloat16):
            # embed holds only the BOS token embedding: [1, 1, 1536]
            embed = self.encode_input(prompt_embeds, self.gpt_model.shape_bos_id)

        if bounding_box_xyz is not None:
            cond_bbox = torch.atleast_2d(torch.tensor(bounding_box_xyz)).to(self.device)
            uncond_bbox = torch.zeros_like(cond_bbox).to(self.device)
        else:
            cond_bbox = None
            uncond_bbox = None

        cond = self.prepare_conditions_with_bbox(prompt_embeds, cond_bbox)
        if guidance_scale > 0.0:
            # duplicate the batch so conditional/unconditional logits can later be
            # split with chunk(2) for classifier-free guidance
            embed = torch.cat([embed, embed], dim=0)
            uncond_embeds = self.run_clip([""] * len(prompts))
            uncond = self.prepare_conditions_with_bbox(uncond_embeds, uncond_bbox)
            cond = torch.cat([cond, uncond], dim=0)

        return embed, cond

    @torch.inference_mode()
    def canonical_inputs(
        self,
        input_ids: torch.Tensor,
        mask: torch.Tensor,
    ):
        """
        Canonicalizes a batch of token IDs by zeroing out the flattened
        position/rotation slots (the first x_num + y_num + z_num + rot_num
        columns) and setting the first slot to 1.
        Args:
            input_ids (torch.Tensor): The input token IDs.
            mask (torch.Tensor): The attention mask for the inputs (currently unused).
        Returns:
            torch.Tensor: The canonicalized input IDs.
        """
        x_num = 213
        y_num = 217
        z_num = 529
        rot_num = 24
        xyz = x_num + y_num + z_num + rot_num
        input_ids[:, :xyz] = 0
        input_ids[:, 0] = 1
        return input_ids

    @torch.inference_mode()
    def run_clip(self, text_inputs):
        """
        Processes the given text inputs using a text tokenizer and a text model, and returns the encoded text embeddings.
        Args:
            text_inputs (str or List[str]): The input text or list of texts to be processed.
        Returns:
            torch.Tensor: The encoded text embeddings.
        """
        text_inputs = self.text_tokenizer(
            text_inputs,
            max_length=self.text_tokenizer.model_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        with torch.no_grad():
            text_inputs = {k: v.to(self.device) for k, v in text_inputs.items()}
            # use full precision for the text encoder
            with torch.autocast(device_type=self.device.type, enabled=False):
                encoded = self.text_model(**text_inputs)
            if self.gpt_model.cfg.use_pooled_text_embed:
                embed = encoded.text_embeds.unsqueeze(1)  # [bs, 1, 512]
            else:
                embed = encoded.last_hidden_state  # [bs, 77, 512]
            embed = self.gpt_model.encode_text(embed)

        return embed

    @torch.inference_mode()
    def encode_input(self, inputs: torch.Tensor, bos: int):
        """
        Encodes the beginning of sequence (BOS) token for the given input tensor.
        Args:
            inputs (torch.Tensor): The input tensor containing sequences.
            bos (int): The beginning of sequence token ID.
        Returns:
            torch.Tensor: The encoded BOS token embeddings.
        """

        b = inputs.shape[0]
        bos_embed = self.gpt_model.encode_token(
            torch.full(
                (b, 1),
                fill_value=bos,
                dtype=torch.long,
                device=self.device,
            )
        )
        return bos_embed

    @torch.inference_mode()
    def run_gpt(
        self,
        prompts: list[str],
        use_kv_cache: bool,
        guidance_scale: float = 3.0,
        top_p: Optional[float] = None,
        bounding_box_xyz: Optional[Tuple[float]] = None,
    ):
        """
        Autoregressively generates shape token IDs using the GPT model based on the provided prompts.
        Args:
            prompts (list[str]): A list of input prompts to generate from.
            use_kv_cache (bool): Whether to use key-value caching for faster generation.
            guidance_scale (float, optional): The scale for classifier-free guidance during generation. Default is 3.0.
            top_p (float, optional): The cumulative probability threshold for nucleus sampling.
                If None, argmax selection is performed (deterministic generation). Otherwise, the smallest set of tokens with cumulative probability ≥ top_p is kept (stochastic generation).
            bounding_box_xyz (Optional[Tuple[float]], optional): The size of the bounding box for generation
                as (x, y, z) dimensions. Each value must be between 0 and 1.925. If None,
                uses default bounding box sizing.
        Returns:
            torch.Tensor: A tensor containing the generated token IDs.
        """
        # embed holds the BOS embedding; cond holds the (optionally doubled) conditions
        embed, cond = self.prepare_inputs(prompts, guidance_scale, bounding_box_xyz)

        output_ids = []

        batch_size, input_seq_len, dim = embed.shape
        max_seq_len = input_seq_len + self.max_new_tokens
        embed_buffer = torch.zeros(
            (batch_size, max_seq_len, dim), dtype=embed.dtype, device=embed.device
        )
        embed_buffer[:, :input_seq_len, :].copy_(embed)
        cond_len = cond.shape[1]
        kv_cache = None
        if use_kv_cache:
            kv_cache = self.gpt_model.init_kv_cache(
                batch_size,
                cond_len,
                self.max_new_tokens + 1,  # +1 for the BOS token
                torch.bfloat16,
                embed.device,
            )

        with torch.autocast(self.device.type, dtype=torch.bfloat16):
            for i in tqdm(range(self.max_new_tokens), desc="generating"):
                curr_pos_id = torch.tensor([i], dtype=torch.long, device=embed.device)
                logits = self.gpt_model(
                    embed_buffer,
                    cond,
                    kv_cache=kv_cache,
                    curr_pos_id=curr_pos_id if use_kv_cache else None,
                    decode=(i > 0) if use_kv_cache else False,
                )
                if use_kv_cache:
                    logits = logits[:, 0, ...]
                else:
                    logits = logits[:, i, ...]

                logits = logits[..., self.min_id : self.max_id]

                if guidance_scale > 0.0:
                    logits, uncond_logits = logits.float().chunk(2, dim=0)
                    # linearly decay the guidance strength over the generated sequence
                    gamma = (
                        guidance_scale * (self.max_new_tokens - i) / self.max_new_tokens
                    )
                    logits = (1 + gamma) * logits - gamma * uncond_logits
                next_id = process_logits(
                    logits,
                    top_p=top_p,
                )
                output_ids.append(next_id)
                next_embed = self.gpt_model.encode_token(next_id)
                if guidance_scale > 0.0:
                    next_embed = torch.cat([next_embed, next_embed], dim=0)
                embed_buffer[:, i + input_seq_len, :].copy_(next_embed.squeeze(1))

        return torch.cat(output_ids, dim=1)

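    # Note on the guidance schedule above: classifier-free guidance combines the
    # conditional and unconditional logits as
    #     logits = (1 + gamma) * cond_logits - gamma * uncond_logits,
    # where gamma decays linearly from guidance_scale at the first token to zero
    # at the last, so early tokens follow the prompt tightly while later tokens
    # lean more on the learned prior.
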
    @torch.inference_mode()
    def run_shape_decode(
        self,
        output_ids: torch.Tensor,
        resolution_base: float = 8.0,
        chunk_size: int = 100_000,
    ):
        """
        Decodes the shape from the given output IDs and extracts the geometry.
        Args:
            output_ids (torch.Tensor): The tensor containing the output IDs.
            resolution_base (float, optional): The base resolution for geometry extraction. Defaults to 8.0.
            chunk_size (int, optional): The chunk size for processing. Defaults to 100,000.
        Returns:
            tuple: A tuple containing the vertices and faces of the mesh.
        """
        shape_ids = (
            output_ids[:, : self.shape_model.cfg.num_encoder_latents, ...]
            .clamp_(0, self.shape_model.cfg.num_codes - 1)
            .view(-1, self.shape_model.cfg.num_encoder_latents)
        )
        latents = self.shape_model.decode_indices(shape_ids)

        mesh_v_f, _ = self.shape_model.extract_geometry(
            latents,
            resolution_base=resolution_base,
            chunk_size=chunk_size,
            use_warp=True,
        )
        return mesh_v_f

    @torch.inference_mode()
    def t2s(
        self,
        prompts: list[str],
        use_kv_cache: bool,
        guidance_scale: float = 3.0,
        resolution_base: float = 8.0,
        chunk_size: int = 100_000,
        top_p: Optional[float] = None,
        bounding_box_xyz: Optional[Tuple[float]] = None,
    ):
        """
        Generates a 3D mesh from text prompts using the GPT model and shape decoder.
        Args:
            prompts (list[str]): A list of text prompts to guide the generation.
            use_kv_cache (bool): Whether to use key-value caching for the GPT model.
            guidance_scale (float, optional): The scale of guidance for the GPT model. Default is 3.0.
            resolution_base (float, optional): The base resolution for the shape decoder. Default is 8.0.
            chunk_size (int, optional): The chunk size for processing the shape decoding. Default is 100,000.
            top_p (float, optional): The cumulative probability threshold for nucleus sampling.
                If None, argmax selection is performed (deterministic generation). Otherwise, the smallest set of tokens with cumulative probability ≥ top_p is kept (stochastic generation).
            bounding_box_xyz (Tuple[float] | None, optional): The size of the bounding box for the generated mesh
                as (x, y, z) dimensions. Each value must be between 0 and 1.925. If None,
                uses default bounding box sizing.
        Returns:
            mesh_v_f: The generated 3D mesh vertices and faces.
        """
        output_ids = self.run_gpt(
            prompts, use_kv_cache, guidance_scale, top_p, bounding_box_xyz
        )
        with torch.autocast(self.device.type, dtype=torch.bfloat16):
            mesh_v_f = self.run_shape_decode(output_ids, resolution_base, chunk_size)
        return mesh_v_f

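# Example inference usage (a minimal sketch; the paths below are placeholders,
# not files shipped with this commit):
#
#   device = torch.device("cuda")
#   engine = Engine(
#       config_path="config.yaml",
#       gpt_ckpt_path="gpt.pt",
#       shape_ckpt_path="shape.pt",
#       save_gpt_ckpt_path="gpt_finetuned.pt",
#       device=device,
#       mode="test",
#   )
#   mesh_v_f = engine.t2s(["A cube"], use_kv_cache=True, guidance_scale=3.0)
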
class EngineFast(Engine):
    def __init__(
        self,
        config_path: str,
        gpt_ckpt_path: str,
        shape_ckpt_path: str,
        save_gpt_ckpt_path: str,
        device: torch.device,
        mode: str,
    ):
        """
        Initializes the CUDA-graph-accelerated engine with the given configuration and checkpoint paths.
        Args:
            config_path (str): Path to the configuration file.
            gpt_ckpt_path (str): Path to the GPT checkpoint file.
            shape_ckpt_path (str): Path to the shape checkpoint file.
            save_gpt_ckpt_path (str): Path to the fine-tuned GPT checkpoint; loaded when mode is 'test'.
            device (torch.device): The device to run the inference on (must be CUDA).
            mode (str): Either 'train' or 'test'.
        """

        assert (
            device.type == "cuda"
        ), "EngineFast is only supported on cuda devices, please use Engine on non-cuda devices"

        super().__init__(config_path, gpt_ckpt_path, shape_ckpt_path, save_gpt_ckpt_path, device, mode)

        # CUDA Graph params
        self.graph = torch.cuda.CUDAGraph()
        self.embed_buffer = torch.Tensor()
        self.cond_buffer = torch.Tensor()
        self.logits_buffer = torch.Tensor()
        self.curr_pos_id = torch.tensor([0], dtype=torch.long, device=self.device)
        self.kv_cache: list[Cache] = []

        # graph capture is disabled here; call _warmup_and_capture_graph() manually if needed
        # self._warmup_and_capture_graph()

    def _warmup_and_capture_graph(self):
        """
        Warms up the model by running a series of forward passes and captures the CUDA graph for efficient execution.
        This method performs the following steps:
        1. Prepares the input embeddings and conditions using a warmup prompt.
        2. Initializes buffers for embeddings and conditions.
        3. Initializes the key-value cache for the GPT model.
        4. Runs a series of warmup passes to prefill the model and generate logits.
        5. Captures the CUDA graph for the model's forward pass to optimize future executions.
        """

        warmup_prompt = "A cube"
        embed, cond = self.prepare_inputs([warmup_prompt], guidance_scale=3.0)

        batch_size, input_seq_len, dim = embed.shape
        max_seq_len = input_seq_len + self.max_new_tokens
        self.embed_buffer = torch.zeros(
            (batch_size, max_seq_len, dim), dtype=embed.dtype, device=self.device
        )
        self.embed_buffer[:, :input_seq_len, :].copy_(embed)

        self.cond_buffer = torch.empty_like(cond)
        self.cond_buffer.copy_(cond)
        cond_len = self.cond_buffer.shape[1]

        # Initialize kv_cache for the first time
        self.kv_cache = self.gpt_model.init_kv_cache(
            batch_size,
            cond_len,
            self.max_new_tokens + 1,  # +1 for the BOS token
            torch.bfloat16,
            self.device,
        )

        num_warmup_passes = 10

        with torch.autocast(self.device.type, dtype=torch.bfloat16):
            self._set_curr_pos_id(0)
            _ = self._prefill_and_return_logits()

            for x in range(1, num_warmup_passes):
                self._set_curr_pos_id(x)
                self.logits_buffer = self.gpt_model(
                    embed=self.embed_buffer,
                    cond=self.cond_buffer,
                    kv_cache=self.kv_cache,
                    curr_pos_id=self.curr_pos_id,
                    decode=False,  # warmup runs eagerly; decode=True is used only in the captured graph
                )

        side_stream = torch.cuda.Stream(device=self.device)
        with torch.cuda.graph(self.graph, stream=side_stream):
            with torch.autocast(self.device.type, dtype=torch.bfloat16):
                self.logits_buffer = self.gpt_model(
                    embed=self.embed_buffer,
                    cond=self.cond_buffer,
                    kv_cache=self.kv_cache,
                    curr_pos_id=self.curr_pos_id,
                    decode=True,
                )

    def _reset_kv_cache(self):
        """
        Resets the key-value cache by setting all key and value states to zero.
        This method iterates through each cache in the `kv_cache` attribute and
        calls the `zero_()` method on both `key_states` and `value_states` to
        reset them to their initial state.
        """

        for cache in self.kv_cache:
            cache.key_states.zero_()
            cache.value_states.zero_()

    def _prefill_and_return_logits(self) -> torch.Tensor:
        """
        Prefills the model's key-value cache and returns the logits.
        This method resets the key-value cache and then performs a forward pass
        through the GPT model in eager mode to prefill the logits.
        Returns:
            torch.Tensor: The logits at the first sequence position.
        """

        self._reset_kv_cache()

        # Prefill is always eager
        prefill_logits = self.gpt_model(
            embed=self.embed_buffer,
            cond=self.cond_buffer,
            kv_cache=self.kv_cache,
            curr_pos_id=self.curr_pos_id,
            decode=False,
        )

        return prefill_logits[:, 0, ...]

    def _set_curr_pos_id(self, pos: int):
        """
        Set the current position ID.
        This method updates the `curr_pos_id` attribute with the given position.
        Args:
            pos (int): The position ID to set.
        """

        self.curr_pos_id.copy_(
            torch.tensor([pos], dtype=torch.long, device=self.device)
        )

    def run_gpt(
        self,
        prompts: list[str],
        use_kv_cache: bool,
        guidance_scale: float = 3.0,
        top_p: Optional[float] = None,
        bounding_box_xyz: Optional[Tuple[float]] = None,
    ):
        """
        Runs the GPT model to generate shape tokens based on the provided prompts.
        Args:
            prompts (list[str]): A list of input prompts for the GPT model. Only a single prompt is supported.
            use_kv_cache (bool): Flag indicating whether to use key-value caching. (Currently not used)
            guidance_scale (float, optional): The scale factor for guidance. Default is 3.0.
            top_p (float, optional): The cumulative probability threshold for nucleus sampling.
                If None, argmax selection is performed. Otherwise, the smallest
                set of tokens with cumulative probability ≥ top_p is kept.
            bounding_box_xyz (Tuple[float] | None, optional): The size of the bounding box for the generated mesh
                as (x, y, z) dimensions. Each value must be between 0 and 1.925. If None,
                uses default bounding box sizing.
        Returns:
            torch.Tensor: A tensor containing the generated output token IDs.
        Raises:
            AssertionError: If the batch size is greater than 1.
        """

        embed, cond = self.prepare_inputs(prompts, guidance_scale, bounding_box_xyz)
        assert len(prompts) == 1, "batch size > 1 is not supported for EngineFast"

        batch_size, input_seq_len, _ = embed.shape
        self.embed_buffer.zero_()
        self.embed_buffer[:, :input_seq_len, :].copy_(embed)

        assert self.cond_buffer.shape == cond.shape
        self.cond_buffer.copy_(cond)

        output_ids = torch.zeros(
            (batch_size // 2, self.max_new_tokens), dtype=torch.int, device=self.device
        )

        with torch.autocast(self.device.type, dtype=torch.bfloat16):
            self._set_curr_pos_id(0)

            logits = self._prefill_and_return_logits()
            logits = logits[..., self.min_id : self.max_id]  # [2, 16387]
            if guidance_scale > 0.0:
                logits, uncond_logits = logits.float().chunk(2, dim=0)
                gamma = guidance_scale
                logits = (1 + gamma) * logits - gamma * uncond_logits

            next_id = process_logits(logits, top_p=top_p)

            output_ids[:, 0] = next_id.squeeze()
            next_embed = self.gpt_model.encode_token(next_id)
            next_embed = next_embed.repeat(2, 1, 1)
            self.embed_buffer[:, input_seq_len, :].copy_(next_embed.squeeze(1))

            for i in tqdm(range(1, self.max_new_tokens), desc="generating"):
                self._set_curr_pos_id(i)
                self.graph.replay()

                logits = self.logits_buffer[:, 0, ...]

                logits = logits[..., self.min_id : self.max_id]
                if guidance_scale > 0.0:
                    logits, uncond_logits = logits.float().chunk(2, dim=0)
                    # linearly decay the guidance strength over the generated sequence
                    gamma = (
                        guidance_scale * (self.max_new_tokens - i) / self.max_new_tokens
                    )
                    logits = (1 + gamma) * logits - gamma * uncond_logits
                next_id = process_logits(logits, top_p=top_p)

                output_ids[:, i] = next_id.squeeze()
                next_embed = self.gpt_model.encode_token(next_id)
                next_embed = next_embed.repeat(2, 1, 1)
                self.embed_buffer[:, i + input_seq_len, :].copy_(next_embed.squeeze(1))

        return output_ids

    def pad_id_and_attn(self, inputs_ids, attention_mask):
        # bos: begin of sequence, eos: end of sequence, pad: padding token
        # reserve one space for `bos`; the pad_id will be replaced by `bos`
        place_holder = torch.ones_like(inputs_ids[:, [0]])  # batch x 1

        # prepend the shape BOS token to the id sequence
        inputs_ids = torch.cat(
            (place_holder * self.gpt_model.shape_bos_id, inputs_ids),
            dim=1
        )

        # extend the attention mask to cover the BOS token and one extra slot
        attention_mask = torch.cat(
            (place_holder, place_holder, attention_mask),
            dim=1
        )
        return inputs_ids, attention_mask

    def precompute_freqs_cis_position(self, b, x_l, y_l, z_l, device):
        """
        Precomputes rotary (RoPE) frequency tensors for the x, y, and z position axes.
        Args:
            b (int): Batch size.
            x_l (int): Number of x-axis positions.
            y_l (int): Number of y-axis positions.
            z_l (int): Number of z-axis positions.
            device: Device on which to allocate the position ids.
        Returns:
            tuple: (x_freqs_cis, y_freqs_cis, z_freqs_cis) frequency tensors, one per axis.
        """
        x_ids = torch.arange(x_l, dtype=torch.long, device=device)  # shape (t)
        x_ids = x_ids.unsqueeze_(0).expand(b, -1)

        x_freqs_cis = precompute_freqs_cis(
            dim=self.gpt_model.cfg.n_embd // self.gpt_model.cfg.n_head * 4,  # 128 * 4
            t=x_ids,
            theta=self.gpt_model.cfg.rope_theta,  # 10000.0
        )

        y_ids = torch.arange(y_l, dtype=torch.long, device=device)  # shape (t)
        y_ids = y_ids.unsqueeze_(0).expand(b, -1)

        y_freqs_cis = precompute_freqs_cis(
            dim=self.gpt_model.cfg.n_embd // self.gpt_model.cfg.n_head * 4,  # 128 * 4
            t=y_ids,
            theta=self.gpt_model.cfg.rope_theta,  # 10000.0
        )

        z_ids = torch.arange(z_l, dtype=torch.long, device=device)  # shape (t)
        z_ids = z_ids.unsqueeze_(0).expand(b, -1)

        z_freqs_cis = precompute_freqs_cis(
            dim=self.gpt_model.cfg.n_embd // self.gpt_model.cfg.n_head * 4,  # 128 * 4
            t=z_ids,
            theta=self.gpt_model.cfg.rope_theta,  # 10000.0
        )

        return x_freqs_cis, y_freqs_cis, z_freqs_cis

    def fwd_gpt(
        self,
        prompts: list[str],
        inputs_ids: list[torch.Tensor],
        latent: list[torch.Tensor],
        use_kv_cache: bool,
        guidance_scale: float = 3.0,
        top_p: Optional[float] = None,
        bounding_box_xyz: Optional[Tuple[float]] = None,
        strategy: int = None,
        mode: str = 'train'
    ):
        """
        Runs a teacher-forced forward pass of the GPT model over ground-truth token ids,
        applying one of several masking strategies, and returns the logits for training.
        Args:
            prompts (list[str]): Input prompts (unused here; the condition is the BOS embedding).
            inputs_ids (torch.Tensor): Structured ground-truth ids; channel 0 holds the rotation id
                and the last channels hold the data/x/y/z ids. Padded positions are -1.
            latent (torch.Tensor): Shape latents (currently unused).
            use_kv_cache (bool): Unused; this forward pass is always eager.
            guidance_scale (float, optional): Unused in the teacher-forced pass. Default is 3.0.
            top_p (float, optional): Unused in the teacher-forced pass.
            bounding_box_xyz (Tuple[float] | None, optional): Bounding box sizes forwarded to the
                condition embeddings.
            strategy (int, optional): Masking strategy to apply; sampled uniformly from {0, 1, 2, 3}
                in 'train' mode.
            mode (str, optional): 'train' or 'test'. Default is 'train'.
        Returns:
            tuple: (logits, inputs_ids, strategy, mask, cut_idx) where logits are restricted to the
            shape-code id range, inputs_ids are the canonicalized ids, mask is the random mask used
            (or None), and cut_idx marks the first padded position per sample.
        """
        with torch.no_grad():
            attention_mask = inputs_ids != -1
            # first padded position per sample (padding is -1 everywhere)
            cut_idx = (attention_mask == False)[:, :, -3].int().argmax(dim=1)
            dat_id = inputs_ids[:, :, -6].long()
            dat_id = torch.where(torch.arange(dat_id.shape[1], device=dat_id.device)[None, :] >= cut_idx[:, None], self.gpt_model.dat_num, dat_id)

        inputs_embeds = self.gpt_model.dte(dat_id)

        # flatten rotation id; padded positions get the out-of-range id rot_num
        r_id = inputs_ids[:, :, 0]
        r_id = torch.where(torch.arange(r_id.shape[1], device=r_id.device)[None, :] >= cut_idx[:, None], self.gpt_model.rot_num, r_id)

        # flatten position ids; padded positions get the out-of-range ids x_num/y_num/z_num
        x_id = inputs_ids[:, :, -5]
        y_id = inputs_ids[:, :, -4]
        z_id = inputs_ids[:, :, -3]

        x_id = torch.where(torch.arange(x_id.shape[1], device=x_id.device)[None, :] >= cut_idx[:, None], self.gpt_model.x_num, x_id)
        y_id = torch.where(torch.arange(y_id.shape[1], device=y_id.device)[None, :] >= cut_idx[:, None], self.gpt_model.y_num, y_id)
        z_id = torch.where(torch.arange(z_id.shape[1], device=z_id.device)[None, :] >= cut_idx[:, None], self.gpt_model.z_num, z_id)

        inputs_ids[:, :, 0] = r_id.clone()
        inputs_ids[:, :, -6] = dat_id.clone()
        inputs_ids[:, :, -5] = x_id.clone()
        inputs_ids[:, :, -4] = y_id.clone()
        inputs_ids[:, :, -3] = z_id.clone()

        # mask tokens: fixed strategy in 'test' mode, sampled uniformly in 'train' mode
        strategy = strategy if mode == 'test' else torch.randint(0, 4, (1,)).item()

        if strategy == 0:
            # mask all x/y/z ids inside the valid region (the id num + 1 is the mask token)
            x_id = torch.where(torch.arange(x_id.shape[1], device=x_id.device)[None, :] < cut_idx[:, None], self.gpt_model.x_num + 1, x_id)
            y_id = torch.where(torch.arange(y_id.shape[1], device=y_id.device)[None, :] < cut_idx[:, None], self.gpt_model.y_num + 1, y_id)
            z_id = torch.where(torch.arange(z_id.shape[1], device=z_id.device)[None, :] < cut_idx[:, None], self.gpt_model.z_num + 1, z_id)
            mask = None
        elif strategy == 1:
            # mask all x/y/z ids and all rotation ids
            x_id = torch.where(torch.arange(x_id.shape[1], device=x_id.device)[None, :] < cut_idx[:, None], self.gpt_model.x_num + 1, x_id)
            y_id = torch.where(torch.arange(y_id.shape[1], device=y_id.device)[None, :] < cut_idx[:, None], self.gpt_model.y_num + 1, y_id)
            z_id = torch.where(torch.arange(z_id.shape[1], device=z_id.device)[None, :] < cut_idx[:, None], self.gpt_model.z_num + 1, z_id)
            r_id = torch.where(torch.arange(r_id.shape[1], device=r_id.device)[None, :] < cut_idx[:, None], self.gpt_model.rot_num + 1, r_id)
            mask = None
        elif strategy == 2:
            # mask all x/y/z ids; mask a random fraction of rotation ids
            x_id = torch.where(torch.arange(x_id.shape[1], device=x_id.device)[None, :] < cut_idx[:, None], self.gpt_model.x_num + 1, x_id)
            y_id = torch.where(torch.arange(y_id.shape[1], device=y_id.device)[None, :] < cut_idx[:, None], self.gpt_model.y_num + 1, y_id)
            z_id = torch.where(torch.arange(z_id.shape[1], device=z_id.device)[None, :] < cut_idx[:, None], self.gpt_model.z_num + 1, z_id)
            mask = (torch.arange(r_id.shape[1], device=r_id.device)[None, :] < cut_idx[:, None]) & (torch.rand(r_id.shape, device=r_id.device) > torch.empty(1, device=r_id.device).uniform_(0.0, 1.0).item())
            r_id = torch.where(mask, self.gpt_model.rot_num + 1, r_id)
        else:
            # mask a random fraction of positions jointly across x/y/z
            mask = (torch.arange(x_id.shape[1], device=x_id.device)[None, :] < cut_idx[:, None]) & (torch.rand(x_id.shape, device=x_id.device) > torch.empty(1, device=r_id.device).uniform_(0.0, 1.0).item())
            x_id = torch.where(mask, self.gpt_model.x_num + 1, x_id)
            y_id = torch.where(mask, self.gpt_model.y_num + 1, y_id)
            z_id = torch.where(mask, self.gpt_model.z_num + 1, z_id)

        rembeds_from_id = self.gpt_model.rte(r_id)
        xembeds_from_id = self.gpt_model.xte(x_id)
        yembeds_from_id = self.gpt_model.yte(y_id)
        zembeds_from_id = self.gpt_model.zte(z_id)

        # interleave data/rotation/position embeddings per token: [b, L, 5, 1536]
        embeds_from_id = torch.stack([inputs_embeds.clone(), rembeds_from_id, yembeds_from_id, xembeds_from_id, zembeds_from_id], dim=2)
        embeds_from_id = embeds_from_id.view(xembeds_from_id.shape[0], xembeds_from_id.shape[1] * 5, xembeds_from_id.shape[2])  # [b, 5 * L, 1536]

        inputs_embeds = self.prepare_conditions_with_bboxs(inputs_embeds, bounding_box_xyz)

        # prepend the BOS embedding to the id sequence
        place_holder = torch.ones_like(inputs_ids[:, 0, 0]).long()  # batch
        bos_embed = self.gpt_model.encode_token(place_holder * self.gpt_model.shape_bos_id)  # [b, 1536]
        embeds_from_id = torch.cat([bos_embed[:, None, :], embeds_from_id], dim=1)

        # the condition passed to the model is just the BOS embedding
        inputs_embeds = bos_embed.unsqueeze(1)

        # Prefill is always eager
        prefill_logits = self.gpt_model(
            embed=embeds_from_id,
            cond=inputs_embeds,
            kv_cache=None,
            curr_pos_id=None,
            decode=False,
        )

        logits = prefill_logits[..., self.min_id : self.max_id]

        return logits, inputs_ids, strategy, mask, cut_idx

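    # Summary of the masking curriculum in fwd_gpt: strategy 0 masks every x/y/z
    # position id; strategy 1 additionally masks every rotation id; strategy 2
    # masks every position id plus a random fraction of rotation ids; strategy 3
    # masks a random fraction of positions jointly across x/y/z. The mask token
    # for each vocabulary is its size plus one.
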
    def t2t(
        self,
        prompts: list[str],
        inputs_ids: list[torch.Tensor],
        latent: list[torch.Tensor],
        use_kv_cache: bool,
        guidance_scale: float = 3.0,
        resolution_base: float = 8.0,
        chunk_size: int = 100_000,
        top_p: Optional[float] = None,
        bounding_box_xyz: Optional[Tuple[float]] = None,
        strategy: int = None,
        mode: str = None
    ):
        """
        Training entry point: runs the teacher-forced GPT forward pass over ground-truth tokens.
        Args:
            prompts (list[str]): A list of text prompts to guide the generation.
            inputs_ids (torch.Tensor): Structured ground-truth ids (see `fwd_gpt`).
            latent (torch.Tensor): Shape latents (currently unused).
            use_kv_cache (bool): Unused; the forward pass is always eager.
            guidance_scale (float, optional): Unused in the teacher-forced pass. Default is 3.0.
            resolution_base (float, optional): Unused in this training path.
            chunk_size (int, optional): Unused in this training path.
            top_p (float, optional): Unused in the teacher-forced pass.
            bounding_box_xyz (Tuple[float] | None, optional): Bounding box sizes forwarded to the
                condition embeddings.
            strategy (int, optional): Masking strategy forwarded to `fwd_gpt`.
            mode (str, optional): 'train' or 'test', forwarded to `fwd_gpt`.
        Returns:
            tuple: The (logits, inputs_ids, strategy, mask, cut_idx) tuple from `fwd_gpt`.
        """
        logits = self.fwd_gpt(
            prompts, inputs_ids, latent, use_kv_cache, guidance_scale, top_p, bounding_box_xyz, strategy, mode
        )

        return logits

    def configure_optimizers(
        self,
        train_config
    ):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object. Only the `lm_head` module is considered;
        all other modules are skipped.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.gpt_model.named_modules():
            # only optimize the lm_head parameters
            if mn != 'lm_head':
                continue
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)
                elif '_norm.weight' in pn:
                    no_decay.add(fpn)

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.gpt_model.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay

        # the full-coverage asserts are disabled here since only lm_head parameters are collected
        # assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        # assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
        #     % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

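    # Example (a sketch; `train_config` is assumed to be any object exposing
    # weight_decay, learning_rate, and betas, e.g. a SimpleNamespace):
    #
    #   from types import SimpleNamespace
    #   train_config = SimpleNamespace(weight_decay=0.1, learning_rate=1e-4, betas=(0.9, 0.95))
    #   optimizer = engine.configure_optimizers(train_config)
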
    def configure_optimizers_lora(
        self,
        train_config
    ):
        """
        Collects every parameter of the GPT model that requires gradients (e.g. LoRA
        adapter weights) into a single AdamW optimizer, with no weight-decay grouping.
        """

        optim_groups = (p for p in self.gpt_model.parameters() if p.requires_grad)
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def configure_optimizers_lora_linear(
        self,
        train_config
    ):
        """
        Separates the parameters of selected head/embedding modules into weight-decay and
        no-weight-decay buckets, adds all trainable (LoRA) parameters as a third group, and
        returns an AdamW optimizer together with a cosine-annealing LR scheduler.
        """
        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.gpt_model.named_modules():
            # only consider the head and coordinate/rotation embedding modules
            if mn not in ('ldr_head', 'ldr_proj', 'dte', 'xte', 'yte', 'zte', 'rte'):
                continue
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)
                elif '_norm.weight' in pn:
                    no_decay.add(fpn)

        param_dict = {pn: p for pn, p in self.gpt_model.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        lora_optim_groups = [p for p in self.gpt_model.parameters() if p.requires_grad]

        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
            {"params": lora_optim_groups},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)

        scheduler = CosineAnnealingLR(
            optimizer,
            T_max=train_config.max_iters,
            eta_min=train_config.learning_rate * 0.01
        )
        return optimizer, scheduler

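    # CosineAnnealingLR decays the learning rate from train_config.learning_rate
    # down to 1% of it (eta_min) over train_config.max_iters steps along a cosine
    # curve, so train_config must also expose max_iters for the scheduler-returning
    # variants.
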
    def configure_optimizers_scratch_linear(
        self,
        train_config
    ):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object together with a cosine-annealing LR scheduler.
        """
        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.gpt_model.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)
                elif '_norm.weight' in pn:
                    no_decay.add(fpn)

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.gpt_model.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay

        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
            % (str(param_dict.keys() - union_params), )

        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)

        scheduler = CosineAnnealingLR(
            optimizer,
            T_max=train_config.max_iters,
            eta_min=train_config.learning_rate * 0.01
        )
        return optimizer, scheduler
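
# Example training setup (a minimal sketch; data loading and the loss itself live
# outside this file, and `train_config`, `prompts`, `inputs_ids`, and `latent`
# are assumed inputs):
#
#   engine = EngineFast("config.yaml", "gpt.pt", "shape.pt", "gpt_finetuned.pt",
#                       torch.device("cuda"), mode="train")
#   optimizer, scheduler = engine.configure_optimizers_scratch_linear(train_config)
#   logits, ids, strategy, mask, cut_idx = engine.t2t(
#       prompts, inputs_ids, latent, use_kv_cache=False, mode="train",
#   )
#   # ...compute a loss on `logits`, then optimizer.step() and scheduler.step()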