import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    PreTrainedTokenizerBase,
    GenerationConfig,
    Qwen2_5_VLForConditionalGeneration,
    AutoProcessor
    
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from peft import PeftConfig, LoraConfig

import random
from typing import Tuple, Optional, List, Union
import logging
import functools

from larm.task.base_model import BaseModel
from larm.common.registry import registry

from .weaver import MemGenWeaver
from .trigger import MemGenTrigger, NanoTrigger
from .utils import (
    CONVERSATION_TEMPLATE, 
    load_state_dict_from_safetensor, 
    fix_model_parameters,
    log_trainable_params,
)

# Decorator to log function calls in blue
def log_function_call(func):
    """Decorator to log function calls with blue color."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        func_name = func.__name__
        # logging.info(f"\033[94m[CALL] {func_name}\033[0m") # blue color
        return func(*args, **kwargs)
    return wrapper

# @log_function_call
def get_next_token(next_token_logits: torch.Tensor, do_sample: bool, temperature: float) -> torch.Tensor:
    """
    Selects the next token from model logits.

    Two modes are supported:
    1. Sampling mode (do_sample=True and temperature>0):  
       Apply temperature scaling to the logits, compute a probability distribution with softmax, 
       and randomly sample one token.
    2. Greedy mode (do_sample=False or temperature==0):  
       Select the token with the highest probability (argmax).

    Args:
        next_token_logits (torch.Tensor): 
            Logits for the next tokens, shape [batch_size, vocab_size].
        do_sample (bool): 
            Whether to perform stochastic sampling. If False, greedy decoding is used.
        temperature (float): 
            Sampling temperature. Higher values make the distribution flatter (more randomness), 
            while lower values make it sharper (more deterministic). 
            When set to 0, greedy decoding is enforced.

    Returns:
        torch.Tensor: 
            The selected next token indices, shape [batch_size, 1].
    """
    if len(next_token_logits.shape) != 2:
        raise ValueError("Input logits must be a 2D tensor [batch_size, vocab_size]")
    if do_sample and temperature != 0:
        # Apply temperature scaling and sample from the resulting probability distribution
        probs = F.softmax(next_token_logits / temperature, dim=-1)
        return torch.multinomial(probs, num_samples=1)
    else:
        # Greedy decoding: pick the token with the highest probability
        return torch.argmax(next_token_logits, dim=-1, keepdim=True)


# @log_function_call
def generate_position_ids(attention_mask: torch.Tensor) -> torch.Tensor:
    """
    Generate position ID tensor based on the given attention mask.

    The position IDs are computed as the cumulative count of non-padding tokens 
    within each sequence. Padding tokens are always assigned position ID 0.

    Args:
        attention_mask (torch.Tensor): 
            A tensor of shape (batch_size, sequence_length). 
            Typically, 1 indicates a valid (non-padding) token and 0 indicates a padding token.

    Returns:
        torch.Tensor: 
            A tensor of shape (batch_size, sequence_length) containing position IDs.
            - For non-padding tokens: position IDs start at 0 and increase consecutively.
            - For padding tokens: position ID is always 0.
    """
    position_ids = (attention_mask.cumsum(-1) - 1).clamp(min=0)
    position_ids.masked_fill_(attention_mask == 0, 0)
    return position_ids

@log_function_call
def is_conversation(input_ids: torch.Tensor, tokenizer) -> bool:
    """
    Check whether the given input IDs represent a conversation format.  
    Only the first sample in the batch is inspected.

    The function verifies whether the sequence contains at least one pair 
    of special tokens: <|im_start|> and <|im_end|>.

    Args:
        input_ids (torch.Tensor): 
            Tensor of shape (batch_size, seq_len) containing token IDs.
        tokenizer: 
            A HuggingFace tokenizer used to obtain the special token IDs.

    Returns:
        bool: 
            True if the sequence contains both <|im_start|> and <|im_end|>, 
            False otherwise.
    """
    if len(input_ids.shape) != 2:
        raise ValueError("input_ids must be a 2D tensor of shape (batch_size, seq_len)")
    
    seq = input_ids[0].tolist()

    # Encode the special tokens to obtain their ID sequences
    im_start_ids = tokenizer.encode("<|im_start|>", add_special_tokens=False)
    im_end_ids   = tokenizer.encode("<|im_end|>",   add_special_tokens=False)

    # Check if the sequence contains at least one <|im_start|> and one <|im_end|>
    has_start = any(seq[i:i+len(im_start_ids)] == im_start_ids for i in range(len(seq) - len(im_start_ids) + 1))
    has_end   = any(seq[i:i+len(im_end_ids)]   == im_end_ids   for i in range(len(seq) - len(im_end_ids) + 1))
    
    logging.info(f"has_start: {has_start}, has_end: {has_end}")
    # logging.info(f"seq: {seq}")
    logging.info(f"im_start_ids: {im_start_ids}")
    logging.info(f"im_end_ids: {im_end_ids}")

    return has_start and has_end


@log_function_call
def postprocess_assistant_labels(
    input_ids: torch.Tensor,
    labels: torch.Tensor,
    tokenizer
) -> torch.Tensor:
    """
    Mask out labels corresponding to the `<|im_start|>assistant` marker.  

    This ensures that the special tokens used to indicate the start of the assistant's
    response do not contribute to the loss during training.

    Args:
        input_ids (torch.Tensor): 
            Tensor of shape (batch_size, seq_len) containing the conversation token IDs.
        labels (torch.Tensor): 
            Tensor of shape (batch_size, seq_len) containing training labels. 
            A value of -100 indicates positions that should be ignored in loss computation.
        tokenizer: 
            A HuggingFace tokenizer used to encode the `<|im_start|>assistant\n` marker.

    Returns:
        torch.Tensor: 
            The modified labels tensor with positions corresponding to 
            `<|im_start|>assistant\n` masked as -100.
    """
    if tokenizer.chat_template != CONVERSATION_TEMPLATE:
        raise ValueError(
            "Invalid tokenizer.chat_template detected.\n"
            f"Expected:\n{CONVERSATION_TEMPLATE}\n\n"
            f"Got:\n{tokenizer.chat_template}\n\n"
            "Please ensure that you are using the correct conversation template."
        )
    
    # Encode the token sequence for "<|im_start|>assistant\n"
    pattern_ids: List[int] = tokenizer.encode("<|im_start|>assistant\n", add_special_tokens=False)

    batch_size, seq_len = input_ids.shape
    new_labels = labels.clone()

    for b in range(batch_size):
        seq = input_ids[b].tolist()
        for i in range(len(seq) - len(pattern_ids) + 1):
            # Mask positions matching the pattern
            if seq[i : i + len(pattern_ids)] == pattern_ids:
                new_labels[b, i : i + len(pattern_ids)] = -100

    return new_labels

# @log_function_call
def check_ends_with_delimiter(
    input_ids: torch.Tensor, tokenizer: PreTrainedTokenizerBase, delimiters: List[str]
) -> torch.Tensor:
    """
    Check whether each sequence in the batch ends with any of the specified delimiter strings.

    Args:
        input_ids (torch.Tensor): 
            Tensor of shape (batch_size, seq_len) containing token IDs for each sequence.
        tokenizer (PreTrainedTokenizerBase): 
            HuggingFace tokenizer used to decode input_ids back to text.
        delimiters (List[str]): 
            A list of delimiter strings to check against. 
            If a sequence ends with any of these delimiters, it is marked as True.

    Returns:
        torch.Tensor: 
            A boolean tensor of shape (batch_size, 1), where each entry indicates 
            whether the corresponding sequence ends with one of the delimiters.
    """
    batch_size = input_ids.size(0)

    # Initialize result tensor: False by default
    augmentation_decisions = torch.zeros(batch_size, 1, dtype=torch.bool, device=input_ids.device)

    # Fast path: compare suffix in token ID space if delimiter id tensors precomputed
    delimiter_id_tensors = getattr(getattr(tokenizer, "memgen_model_ref", None), "_delimiter_id_tensors", None)
    if delimiter_id_tensors is None:
        try:
            delimiter_id_tensors = [
                torch.tensor(tokenizer.encode(d, add_special_tokens=False), dtype=torch.long, device=input_ids.device)
                for d in delimiters
            ]
        except Exception:
            delimiter_id_tensors = []

    for i in range(batch_size):
        seq = input_ids[i]
        matched = False
        for did in delimiter_id_tensors:
            L = did.numel()
            if L == 0 or L > seq.numel():
                continue
            if torch.equal(seq[-L:], did):
                matched = True
                break
        if not matched:
            # Fallback to decode once when necessary
            decoded = tokenizer.decode(seq.tolist())
            for aug_str in delimiters:
                if decoded.endswith(aug_str):
                    matched = True
                    break
        augmentation_decisions[i] = matched
    
    return augmentation_decisions


@registry.register_model("latmem")
class LatentMemoryModel(BaseModel):

    @log_function_call
    def __init__(
        self, 
        reasoner_model_name: str, 
        weaver_model_name: str,
        prompt_latents_len: int,
        inference_latents_len: int,
        weaver_peft_config: Optional[PeftConfig] = None,
        trigger_model_name: str = None,   
        trigger_peft_config: Optional[PeftConfig] = None,
        max_prompt_aug_num: int = 1,     
        max_inference_aug_num: int = 5,
    ):   
        super().__init__()

        # # build reasoner LLM
        # self.model = AutoModelForCausalLM.from_pretrained(
        #     reasoner_model_name, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
        # )
        
        # todo: add vlm: qwen2.5-vl
        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            reasoner_model_name, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
        )
        
        self.processor = AutoProcessor.from_pretrained(reasoner_model_name)
        # For text-only models, processor IS the tokenizer; for VLMs, processor has a tokenizer attribute
        if hasattr(self.processor, 'tokenizer'):
            self.tokenizer = self.processor.tokenizer
        else:
            self.tokenizer = self.processor
        self.config = self.model.config
        
        # build weaver LLM
        self.weaver = MemGenWeaver(
            weaver_model_name, prompt_latents_len, inference_latents_len, weaver_peft_config
        )
        
        # build trigger LLM
        self.trigger = NanoTrigger()   # always return true
        if trigger_model_name is not None:
            self.trigger = MemGenTrigger(
                trigger_model_name, trigger_peft_config
            )
            logging.info(f"Use Trigger: {trigger_model_name}")
        
        # projection layers for mapping embeddings between reasoner and weaver
        # map reasoner input embeddings to weaver input embeddings
        self.reasoner_to_weaver = nn.Linear(
            self.model.config.hidden_size, self.weaver.config.hidden_size, dtype=torch.bfloat16
        ) # NOTE: 2048 -> 1536
        # Map weaver hidden states to reasoner input embeddings
        self.weaver_to_reasoner = nn.Linear(
            self.weaver.config.hidden_size, self.model.config.hidden_size, dtype=torch.bfloat16
        ) # NOTE: 1536 -> 2048

        self.delimiters: List[str] = [",", ".", "\n"]  # delimiters for detecting augmentation points
        # self.delimiters: List[str] = [".", "\n"] 
        self.max_prompt_aug_num = max_prompt_aug_num  # insert latents after input prompt
        self.max_inference_aug_num = max_inference_aug_num  # insert latents after specified delimiters

        # Validate augmentation configuration
        if max_prompt_aug_num == 0 and max_inference_aug_num == 0:
            logging.warning(
                "⚠️  Both max_prompt_aug_num and max_inference_aug_num are set to 0. "
                "This means no latent memory will be used. "
                "If you are training, this will cause errors because no trainable parameters will be involved. "
                "For training, at least one augmentation type must be enabled (> 0)."
            )

        # Precompute delimiter token ID sequences for fast suffix checks (avoid repeated decode)
        try:
            self._delimiter_id_tensors = [
                torch.tensor(self.tokenizer.encode(d, add_special_tokens=False), dtype=torch.long)
                for d in self.delimiters
            ]
        except Exception:
            # Fallback: empty list; code path will behave as if no delimiter found
            self._delimiter_id_tensors = []

        # postprocess
        self._postprocess_models()
        
        self.warnings_issued = {}
        self.model_tags = None
        log_trainable_params(self)

    @log_function_call
    def add_model_tags(self, tags: Union[list[str], str]) -> None:
        r"""
        Add custom tags into the model that gets pushed to the Hugging Face Hub. Will
        not overwrite existing tags in the model.

        Args:
            tags (`Union[list[str], str]`):
                The desired tags to inject in the model

        Examples:

        ```python
        from transformers import AutoModel

        model = AutoModel.from_pretrained("google-bert/bert-base-cased")

        model.add_model_tags(["custom", "custom-bert"])

        # Push the model to your namespace with the name "my-custom-bert".
        model.push_to_hub("my-custom-bert")
        ```
        """
        if isinstance(tags, str):
            tags = [tags]

        if self.model_tags is None:
            self.model_tags = []

        for tag in tags:
            if tag not in self.model_tags:
                self.model_tags.append(tag)
    
    @log_function_call
    def _postprocess_models(self):
        """
        Postprocess the components of the latent memory model: reasoner, weaver, trigger, and tokenizer.

        Steps:
            1. Freeze all parameters of the reasoner (no gradient updates).
            2. Cast all models to bfloat16 for memory and compute efficiency.
            3. Ensure the tokenizer has a valid pad token:
            - If pad token is missing, use the EOS token as the pad token.
            - Set `padding_side` to "left" for compatibility with generation tasks.
            4. Standardize the tokenizer's chat template to `CONVERSATION_TEMPLATE`.
        """
        # Freeze all parameters of the reasoner by default
        fix_model_parameters(self.model)

        # Convert all sub-models to bfloat16
        self.model = self.model.bfloat16()
        self.weaver = self.weaver.bfloat16()
        self.trigger = self.trigger.bfloat16()

        # Ensure tokenizer has a pad token
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
            self.tokenizer.padding_side = "left"
            logging.info(
                f"Tokenizer has no pad token. Using EOS token ({self.tokenizer.eos_token}) as pad token."
            )
        
        # Synchronize model config with tokenizer's special tokens to avoid warnings
        # Update model config
        if hasattr(self.model, 'config'):
            self.model.config.pad_token_id = self.tokenizer.pad_token_id
            self.model.config.bos_token_id = self.tokenizer.bos_token_id
            self.model.config.eos_token_id = self.tokenizer.eos_token_id
        
        # Update generation config
        if hasattr(self.model, 'generation_config'):
            if self.model.generation_config is not None:
                self.model.generation_config.pad_token_id = self.tokenizer.pad_token_id
                self.model.generation_config.bos_token_id = self.tokenizer.bos_token_id
                self.model.generation_config.eos_token_id = self.tokenizer.eos_token_id
        
        # Also update processor's tokenizer if it exists
        if hasattr(self, 'processor') and hasattr(self.processor, 'tokenizer'):
            self.processor.tokenizer.pad_token_id = self.tokenizer.pad_token_id
            self.processor.tokenizer.bos_token_id = self.tokenizer.bos_token_id
            self.processor.tokenizer.eos_token_id = self.tokenizer.eos_token_id
            # IMPORTANT: Do NOT override VLM chat templates; they encode image placeholders.
            is_vlm = isinstance(self.model, Qwen2_5_VLForConditionalGeneration) or hasattr(self.processor, 'image_processor')
            if not is_vlm:
                self.processor.tokenizer.chat_template = CONVERSATION_TEMPLATE

        # Normalize the tokenizer's chat template (skip for VLMs)
        is_vlm = isinstance(self.model, Qwen2_5_VLForConditionalGeneration) or hasattr(getattr(self, 'processor', None), 'image_processor')
        if not is_vlm:
            self.tokenizer.chat_template = CONVERSATION_TEMPLATE
        
        logging.info(
            f"Synchronized special tokens - pad_token_id: {self.tokenizer.pad_token_id}, "
            f"bos_token_id: {self.tokenizer.bos_token_id}, eos_token_id: {self.tokenizer.eos_token_id}"
        )

    @property
    def device(self):
        assert self.model.device == self.weaver.device == self.trigger.device
        return self.model.device
    
    
    @log_function_call
    def _forward(
        self, 
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        labels: torch.Tensor,
        pixel_values: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[Tuple[int, int, int]] = None,
        **kwargs
    ) -> torch.Tensor:
        # preprocess inputs
        assert input_ids.shape == attention_mask.shape == labels.shape
        
        tokenizer = self.tokenizer
        reasoner = self.model
        weaver = self.weaver
        delimiters = self.delimiters
        max_augment_num = self.max_inference_aug_num  # Limit the number of inference augmentation points to avoid excessive augmentation
        device = self.device
        embeds_dtype = reasoner.get_input_embeddings().weight.dtype
        B, _ = input_ids.shape
        hidden_size = reasoner.config.hidden_size

        # select augment idx
        augmentation_indices = self._select_augment_points_after_delimiter(
            input_ids, labels, delimiters, tokenizer, max_augment_num
        )
        logging.info(f"augmentation_indices: {augmentation_indices}")
        
        # origin inputs embeds (use fused embeddings when image inputs are provided)
        if pixel_values is not None:
            with torch.no_grad():
                seed_outputs = reasoner(
                    input_ids=input_ids,
                    pixel_values=pixel_values,
                    image_grid_thw=image_grid_thw,
                    attention_mask=attention_mask,
                    return_dict=True,
                    output_hidden_states=True,
                )
            inputs_embeds = seed_outputs.hidden_states[0]
        else:
            inputs_embeds = reasoner.get_input_embeddings()(input_ids)
        
        # Initialize the start index and empty tensors for accumulating processed segments
        current_start_idx = 0
        current_inputs_embeds = torch.empty((B, 0, hidden_size), device=device, dtype=embeds_dtype)
        current_attention_mask = torch.empty((B, 0), device=device, dtype=attention_mask.dtype)
        current_latents_mask = torch.empty((B, 0), device=device, dtype=torch.bool)

        # Prepare incremental weaver-side embeds to avoid re-projecting the whole history
        current_weaver_inputs_embeds = torch.empty(
            (B, 0, self.weaver.config.hidden_size), device=device, dtype=self.weaver_to_reasoner.weight.dtype
        )

        # Iterate over the selected augmentation points
        for aug_point_idx in augmentation_indices:
            # Slice the current segment of original embeddings and attention mask
            segment_inputs_embeds = inputs_embeds[:, current_start_idx:aug_point_idx]
            segment_attention_mask = attention_mask[:, current_start_idx:aug_point_idx]
            segment_latents_mask = torch.zeros((B, segment_inputs_embeds.size(1)), device=device, dtype=torch.bool)

            # Concatenate the current segment to the accumulated embeddings and masks
            current_inputs_embeds = torch.cat([current_inputs_embeds, segment_inputs_embeds], dim=1)
            current_attention_mask = torch.cat([current_attention_mask, segment_attention_mask], dim=1)
            current_position_ids = generate_position_ids(current_attention_mask)
            current_latents_mask = torch.cat([current_latents_mask, segment_latents_mask], dim=1)

            # Project only the newly added segment to weaver space and append (incremental)
            segment_weaver_inputs = self.reasoner_to_weaver(segment_inputs_embeds)
            current_weaver_inputs_embeds = torch.cat([current_weaver_inputs_embeds, segment_weaver_inputs], dim=1)

            # Determine whether this point is the end of the prompt (prompt augmentation)
            is_prompt_end_aug = (labels[:, aug_point_idx] != -100).all() and (labels[:, aug_point_idx-1] == -100).all().item()
            # Depending on type, use weaver to augment prompt or inference
            if is_prompt_end_aug:
                logging.info(f"[augment] idx={aug_point_idx} TYPE=PROMPT")
                weaver_hidden_states, attn_mask, pos_ids = weaver.augment_prompt(
                    current_weaver_inputs_embeds, current_attention_mask, current_position_ids
                )
            else:
                logging.info(f"[augment] idx={aug_point_idx} TYPE=INFERENCE")
                weaver_hidden_states, attn_mask, pos_ids = weaver.augment_inference(
                    current_weaver_inputs_embeds, current_attention_mask, current_position_ids
                ) 

            # Map weaver hidden states back to reasoner embeddings
            latent_inputs_embeds = self.weaver_to_reasoner(weaver_hidden_states) # NOTE

            # Update accumulated embeddings and masks with the newly augmented segment
            current_inputs_embeds = torch.cat([current_inputs_embeds, latent_inputs_embeds], dim=1)
            current_attention_mask = torch.cat([current_attention_mask, attn_mask], dim=1)
            current_start_idx = aug_point_idx
            
            # Update latent mask for the newly added latent embeddings
            latent_mask = torch.ones((B, latent_inputs_embeds.size(1)), device=device, dtype=torch.bool)
            current_latents_mask = torch.cat([current_latents_mask, latent_mask], dim=1)

            # Keep weaver-side embeds in sync by appending the latent hidden states directly
            current_weaver_inputs_embeds = torch.cat([current_weaver_inputs_embeds, weaver_hidden_states], dim=1)
            
        # Process the remaining segment after the last augmentation point
        remaining_inputs_embeds = inputs_embeds[:, current_start_idx:]
        remaining_attention_mask = attention_mask[:, current_start_idx:]
        latent_mask = torch.zeros((B, remaining_attention_mask.size(1)), device=device, dtype=torch.bool)
        
        current_inputs_embeds = torch.cat([current_inputs_embeds, remaining_inputs_embeds], dim=1)
        current_attention_mask = torch.cat([current_attention_mask, remaining_attention_mask], dim=1)
        current_position_ids = generate_position_ids(current_attention_mask)
        current_latents_mask = torch.cat([current_latents_mask, latent_mask], dim=1)

        reasoner_outputs = reasoner(
            inputs_embeds=current_inputs_embeds,
            attention_mask=current_attention_mask,
            position_ids=current_position_ids
        )
        logits = reasoner_outputs.logits
        
        # Identify valid positions in logits (positions that should contribute to loss)
        shifted = torch.zeros_like(current_latents_mask)
        shifted[:, :-1] = current_latents_mask[:, 1:]
        valid_mask = ~shifted
        
        valid_logits = logits[valid_mask].view(logits.size(0), -1, logits.size(2))  
        # assert shifted.sum() == current_latents_mask.sum()
        # assert valid_logits.shape[:2] == input_ids.shape
        return valid_logits
    
    # @log_function_call
    def _instructional_forward(
        self, 
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        labels: torch.Tensor,
        pixel_values: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[Tuple[int, int, int]] = None,
        **kwargs
    ) -> torch.Tensor:
        """
        Forward pass for single-turn instructional data (no multi-turn conversation required).

        This method is used for instruction-following tasks (SFT), where the input
        consists of a single instruction and the corresponding labels. It directly
        delegates to the single-turn forward method `_forward`.

        Args:
            input_ids (torch.Tensor): Tensor of shape (batch_size, seq_len) containing input token IDs.
            attention_mask (torch.Tensor): Tensor indicating padding positions.
            labels (torch.Tensor): Tensor containing the target labels for supervised fine-tuning.
            **kwargs: Additional keyword arguments passed to `_forward`.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: 
                - logits: The output logits from the model for each input token.
                - labels: The same as input labels, used for loss computation.
        """
        logits = self._forward(
            input_ids,
            attention_mask,
            labels,
            pixel_values=pixel_values,
            image_grid_thw=image_grid_thw,
            **kwargs,
        )
        # For Instruction SFT, labels remain the same as input
        return logits, labels

    @log_function_call
    def _conversational_forward(
        self, 
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        labels: torch.Tensor,
        pixel_values: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[Tuple[int, int, int]] = None,
        **kwargs
    ) -> torch.Tensor:
        """
        Forward pass for conversational (multi-turn) data.

        Multi-turn forward is constructed by sequentially calling the single-turn forward
        for each conversation turn. Latents inserted in turn i-1 are not visible to turn i.

        Args:
            input_ids (torch.Tensor): Input token IDs, shape (1, seq_len). Batch size must be 1.
            attention_mask (torch.Tensor): Attention mask for input tokens.
            labels (torch.Tensor): Target labels for supervised fine-tuning (-100 for ignore positions).
            **kwargs: Additional arguments passed to `_forward`.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]:
                - all_logits: Logits for the entire sequence, with zeros for unsupervised positions.
                - all_labels: Labels for the entire sequence, with -100 for unsupervised positions.
        """
        assert input_ids.shape[0] == 1, "Conversational SFT currently only supports batch_size = 1"
        seq_len = input_ids.shape[1]
        vocab_size = self.config.vocab_size
        device = input_ids.device

        # Identify single-turn segments within the conversation based on labels
        label_row = labels[0]
        should_supervise = label_row != -100
        if not should_supervise.any():
            raise ValueError("At least one completion segment is required")

        # Compute the start and end indices of valid supervised segments
        valid_mask = should_supervise.int()
        diff = torch.diff(torch.cat([torch.tensor([0], device=device), valid_mask]))
        valid_starts = (diff == 1).nonzero(as_tuple=True)[0].tolist()  # Transition 0 -> 1
        ends = (diff == -1).nonzero(as_tuple=True)[0].tolist()          # Transition 1 -> 0
        if len(ends) < len(valid_starts):
            ends.append(seq_len)
        assert len(valid_starts) == len(ends)
        
        # Build triplets (start of previous segment, start of supervised segment, end of supervised segment)
        triplets = []
        start = 0
        for s, e in zip(valid_starts, ends):
            triplets.append((start, s, e))
            start = e
        
        # If there are more segments than allowed, randomly select self.max_prompt_aug_num segments
        # Note: max_prompt_aug_num=0 means train all turns but without prompt augmentation (only inference aug)
        if self.max_prompt_aug_num == 0:
            # Train all turns, but without prompt-level augmentation (inference aug may still be used)
            select_turns = [1] * len(triplets)
        elif len(triplets) <= self.max_prompt_aug_num:
            select_turns = [1] * len(triplets)
        else:
            triplets_num = len(triplets)
            selected_indices = set(random.sample(range(triplets_num), self.max_prompt_aug_num))
            select_turns = [1 if i in selected_indices else 0 for i in range(triplets_num)]

        # Initialize tensors to store logits and labels for the entire sequence
        all_logits = torch.zeros(1, seq_len, vocab_size, device=device)
        all_labels = torch.full((1, seq_len), -100, device=device)

        # Loop over each conversation turn and perform single-turn forward if supervised
        for triplet, should_supervise in zip(triplets, select_turns):
            start, valid_start, end = triplet
            if should_supervise:
                cur_input_ids = input_ids[0, :end].unsqueeze(0)
                cur_attention = attention_mask[0, :end].unsqueeze(0)
                # cur_labels only used for _forward, does not represent the true supervision range
                cur_labels = labels[0, :end].clone().unsqueeze(0)
                cur_labels[0, :valid_start] = -100  # Mask tokens before supervision start

                # Single-turn forward for the current conversation segment
                logits = self._forward(
                    cur_input_ids,
                    cur_attention,
                    cur_labels,
                    pixel_values=pixel_values,
                    image_grid_thw=image_grid_thw,
                    **kwargs,
                )
                
                # Update overall logits and labels with the results of this segment
                all_logits[0, start:end, :] = logits[0, start:end, :]
                all_labels[0, start:end] = labels[0, start:end]

        # Return logits and labels:
        # - supervised positions retain computed logits and original labels
        # - unsupervised positions have logits = 0 and labels = -100
        return all_logits, all_labels

    @log_function_call
    def forward(
        self, 
        input_ids: torch.Tensor, 
        attention_mask: torch.Tensor,
        labels: torch.Tensor,
        pixel_values: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[Tuple[int, int, int]] = None,
        **kwargs
    ):  
        tokenizer = self.tokenizer

        # Ensure labels are provided, required for training the reasoning processor
        assert labels is not None, "Reasoning Processor requires input labels for training"
        
        # Check augmentation configuration in training mode
        if self.training and self.max_prompt_aug_num == 0 and self.max_inference_aug_num == 0:
            raise ValueError(
                "Cannot train MemGen model with both max_prompt_aug_num=0 and max_inference_aug_num=0. "
                "At least one augmentation type must be enabled during training. "
                "If you want to use the base model without augmentation, please use the reasoner model directly. "
                "For inference/evaluation without augmentation, set the model to eval mode: model.eval()"
            )
        
        # Determine whether the input is single-turn (instruction) or multi-turn (conversation)
        forward_func = self._instructional_forward
        if is_conversation(input_ids, tokenizer):
            # For conversational data, mask assistant tokens in labels
            logging.info("is_conversation: True")
            labels = postprocess_assistant_labels(input_ids, labels, tokenizer)
            forward_func = self._conversational_forward
        
        batch_size = 1  # Currently process one sequence per batch
        iter_num = input_ids.size(0) // batch_size

        # Forward pass per batch
        logits, supervised_labels = [], []
        for i in range(iter_num):
            batch_input_ids = input_ids[i * batch_size: (i + 1) * batch_size]
            batch_attention_mask = attention_mask[i * batch_size: (i + 1) * batch_size]
            batch_labels = labels[i * batch_size: (i + 1) * batch_size]

            # Call the appropriate forward function (instruction or conversation)
            batch_logits, batch_supervised_labels = forward_func( # forward_func --> _forward
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask,
                labels=batch_labels,
                pixel_values=pixel_values,
                image_grid_thw=image_grid_thw,
                **kwargs
            )
            logits.append(batch_logits)
            supervised_labels.append(batch_supervised_labels)
        
        # Concatenate results from all batches
        all_logits = torch.concat(logits, dim=0)
        all_labels = torch.concat(supervised_labels, dim=0)

        # Compute causal language modeling loss (shifted by one)
        shift_logits = all_logits[..., :-1, :].contiguous()
        shift_labels = all_labels[..., 1:].contiguous()
        # assert shift_logits.shape[:-1] == shift_labels.shape
        loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        # Return model outputs
        outputs = CausalLMOutputWithPast(loss=loss, logits=all_logits)
        outputs.supervised_labels = all_labels  # Positions in input_ids that are supervised
        return outputs

    
    @log_function_call
    @torch.no_grad()
    def generate(
        self, 
        input_ids: torch.Tensor, 
        attention_mask: torch.Tensor,
        generation_config: GenerationConfig = None,
        return_augmentation_mask: bool = False,
        pixel_values: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[Tuple[int, int, int]] = None,
        **kwargs
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 
        
        tokenizer = self.tokenizer
        reasoner = self.model
        weaver = self.weaver
        trigger = self.trigger
        delimiters = self.delimiters
        max_augment_num = self.max_inference_aug_num
        invalid_token_id = -100

        # preproecess inputs
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        max_new_tokens = generation_config.max_new_tokens
        do_sample = generation_config.do_sample
        temperature = generation_config.temperature    # control reasoner generate and trigger generate
        pad_token_id = tokenizer.pad_token_id
        # Prefer chat end token (<|im_end|>) if available; fall back to tokenizer.eos
        try:
            im_end_ids = tokenizer.encode("<|im_end|>", add_special_tokens=False)
            if isinstance(im_end_ids, list) and len(im_end_ids) == 1:
                eos_token_id = im_end_ids[0]
            else:
                eos_token_id = tokenizer.eos_token_id
        except Exception:
            eos_token_id = tokenizer.eos_token_id
        prompt_len = input_ids.size(1)
        generation_config = GenerationConfig(
            do_sample=do_sample,
            temperature=temperature,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            # use_cache=True
        )
        
        with torch.no_grad():
            outputs = reasoner(
                input_ids=input_ids,
                pixel_values=pixel_values,
                image_grid_thw=image_grid_thw,
                attention_mask=attention_mask,
                return_dict=True,
                output_hidden_states=True,
            )
        
        # Use embedding layer output (hidden_states[0]) as inputs_embeds to ensure compatibility with model's embedding space.
        fused_embeds = outputs.hidden_states[0]  # Embedding output after token & vision embedding projection
        # logging.info(f"Seed embeds shape (from hidden_states[0]): {fused_embeds.shape}")

        inputs_embeds = fused_embeds

        B, _, hidden_size = inputs_embeds.shape
        device = inputs_embeds.device

        try:
            current_inputs_embeds = inputs_embeds
            current_attention_mask = attention_mask
            current_position_ids = generate_position_ids(current_attention_mask)
            current_input_ids = input_ids

            # Apply prompt augmentation only if max_prompt_aug_num > 0
            if self.max_prompt_aug_num > 0:
                logging.info("[Generation] Applying prompt augmentation")
                weaver_inputs_embeds = self.reasoner_to_weaver(current_inputs_embeds)
                weaver_hidden_states, attn_mask, pos_ids = weaver.augment_prompt(
                    weaver_inputs_embeds, current_attention_mask, current_position_ids
                )
                latent_inputs_embeds = self.weaver_to_reasoner(weaver_hidden_states)

                # Concatenate initial augmented prompt
                current_inputs_embeds = torch.cat([current_inputs_embeds, latent_inputs_embeds], dim=1)
                current_attention_mask = torch.cat([current_attention_mask, attn_mask], dim=1)
                current_position_ids = torch.cat([current_position_ids, pos_ids], dim=1)
            else:
                logging.info("[Generation] Skipping prompt augmentation (max_prompt_aug_num=0)")

            # Generation Loop Initialization
            sentence_augment_count = torch.zeros(B, dtype=torch.int, device=device)
            augmentation_pos = torch.full((B, max_new_tokens), fill_value=invalid_token_id, device=device)
            inserted_embeds: List[List[torch.Tensor]] = [[] for _ in range(B)]
            for i in range(max_new_tokens):
                
                # If all sequences in the batch have already generated an EOS token, stop early
                if (current_input_ids[:, -1] == eos_token_id).all():
                    break   
                
                # Check if all sequences have reached the maximum number of augmentations
                if (sentence_augment_count >= max_augment_num).all():
                    # Adjust the remaining generation length
                    generation_config.max_new_tokens = max_new_tokens - i

                    # Perform generation for the remaining tokens using the reasoner
                    generated = reasoner.generate(
                        inputs_embeds=current_inputs_embeds,
                        attention_mask=current_attention_mask,
                        generation_config=generation_config,
                    )
                    current_input_ids = torch.cat([current_input_ids, generated], dim=1)
                    break

                outputs = reasoner(
                    inputs_embeds=current_inputs_embeds,
                    attention_mask=current_attention_mask,
                    position_ids=current_position_ids,
                    output_hidden_states=False,
                )
                current_inputs_embeds, current_attention_mask, current_position_ids, current_input_ids = self._append_one_step(
                    outputs, current_inputs_embeds, current_attention_mask, current_position_ids, current_input_ids, do_sample, temperature
                )
 
                if i == max_new_tokens - 1:  
                    break 

                # Determine which sentences in the batch should be augmented
                augment_decision = self._should_augment(
                    current_input_ids, current_attention_mask, sentence_augment_count=sentence_augment_count, 
                    do_sample=do_sample, temperature=temperature  
                )
                augmentation_pos[:, i + 1] = augment_decision
                augment_indices = torch.where(augment_decision == 1)[0]

                # If there are sentences to augment, apply augmentation; others remain with left padding
                if len(augment_indices) > 0:
                    # Increment the augmentation count for sentences that are being augmented
                    sentence_augment_count[augment_indices] += 1

                    # Select embeddings, attention masks, and position IDs for sentences to be augmented
                    candidate_inputs_embeds = current_inputs_embeds[augment_indices]
                    candidate_attention_mask = current_attention_mask[augment_indices]
                    candidate_position_ids = current_position_ids[augment_indices]
                    
                    # Perform inference augmentation using the weaver
                    weaver_inputs_embeds = self.reasoner_to_weaver(candidate_inputs_embeds)
                    weaver_hidden_states, attn_mask, _ = weaver.augment_inference(
                        weaver_inputs_embeds, candidate_attention_mask, candidate_position_ids
                    )
                    latent_inputs_embeds = self.weaver_to_reasoner(weaver_hidden_states)
                    
                    candidate_inputs_embeds = torch.cat([candidate_inputs_embeds, latent_inputs_embeds], dim=1)
                    candidate_attention_mask = torch.cat([candidate_attention_mask, attn_mask], dim=1)
                    
                    # Create a single merged tensor for all sequences
                    new_len = candidate_inputs_embeds.size(1)
                    merged_inputs_embeds = torch.zeros((B, new_len, hidden_size), device=device, dtype=current_inputs_embeds.dtype)
                    merged_attention_mask = torch.zeros((B, new_len), device=device, dtype=current_attention_mask.dtype)
                    
                    # Directly place augmented and non-augmented sequences
                    merged_inputs_embeds[augment_indices] = candidate_inputs_embeds
                    merged_attention_mask[augment_indices] = candidate_attention_mask
                    
                    # Non-augmented sequences now include both -100 and 0
                    non_augment_indices = torch.where(augment_decision != 1)[0]
                    if len(non_augment_indices) > 0:
                        non_aug_inputs_embeds = current_inputs_embeds[non_augment_indices]
                        non_aug_attention_mask = current_attention_mask[non_augment_indices]
                        non_aug_inputs_embeds, non_aug_attention_mask, _ = self._left_pad(
                            non_aug_inputs_embeds, non_aug_attention_mask, None, weaver.inference_latents_num
                        )
                        merged_inputs_embeds[non_augment_indices] = non_aug_inputs_embeds
                        merged_attention_mask[non_augment_indices] = non_aug_attention_mask
                    
                    current_inputs_embeds = merged_inputs_embeds
                    current_attention_mask = merged_attention_mask
                    current_position_ids = generate_position_ids(current_attention_mask)
                    
                    # Record inserted embeds for post-processing
                    for idx, embed in zip(augment_indices, latent_inputs_embeds):
                        inserted_embeds[idx].append(embed.clone().detach().cpu())
        except Exception:
            logging.exception(
                f"[generate] Exception. do_sample={do_sample}, temperature={temperature}, pad_id={pad_token_id}, eos_id={eos_token_id}, "
                f"pixel_present={pixel_values is not None}, max_new_tokens={max_new_tokens}"
            )
            raise
        
        # postprocess
        new_generated_len = current_input_ids.size(1) - prompt_len
        augmentation_pos = augmentation_pos[:, :new_generated_len]
         
        if not return_augmentation_mask:
            return current_input_ids
        else:
            return current_input_ids, augmentation_pos

    
    @classmethod
    @log_function_call
    def from_config(cls, config):
        # reasoner configs
        reasoner_model_name = config.get("reasoner_model_name", None)
        max_prompt_aug_num = config.get("max_prompt_aug_num", None)
        max_inference_aug_num = config.get("max_inference_aug_num", None)

        # processor configs
        weaver_configs = config.get("weaver")
        weaver_model_name = weaver_configs.get("weaver_model_name", None)
        prompt_latents_len = weaver_configs.get("prompt_latents_len", 8)
        inference_latents_len = weaver_configs.get("inference_latents_len", 2)
        weaver_use_peft = weaver_configs.get("use_peft", True)
        weaver_peft_config = weaver_configs.get("peft_config", None) if weaver_use_peft else None

        if weaver_peft_config is not None:  
            weaver_peft_config = LoraConfig(**weaver_peft_config)
        
        # trigger configs
        trigger_configs = config.get("trigger")
        trigger_model_name = trigger_configs.get("trigger_model_name", None)
        trigger_use_peft = trigger_configs.get("use_peft", True)
        trigger_peft_config = trigger_configs.get("peft_config", None) if trigger_use_peft else None

        if trigger_peft_config is not None:
            trigger_peft_config = LoraConfig(**trigger_peft_config)

        model = cls(
            reasoner_model_name,
            # weaver configs
            weaver_model_name=weaver_model_name,
            prompt_latents_len=prompt_latents_len,
            inference_latents_len=inference_latents_len,
            weaver_peft_config=weaver_peft_config,
            # trigger configs
            trigger_model_name=trigger_model_name,
            trigger_peft_config=trigger_peft_config,
            # augmentations
            max_prompt_aug_num=max_prompt_aug_num,
            max_inference_aug_num=max_inference_aug_num
        )

        # load model state dict
        load_model_path = config.get("load_model_path", None)
        if load_model_path is not None:
            model_state_dict = load_state_dict_from_safetensor(load_model_path)
            model.load_state_dict(model_state_dict, strict=False)
            logging.info(f"Load model state dict from: {load_model_path}")

        return model
    

    # @log_function_call
    @torch.no_grad()
    def _append_one_step(
        self,
        reasoner_outputs, 
        current_inputs_embeds: torch.Tensor,
        current_attention_mask: torch.Tensor,
        current_position_ids: torch.Tensor,
        current_input_ids: torch.Tensor,
        do_sample: bool,
        temperature: float
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        reasoner = self.model
        B = current_inputs_embeds.size(0)
        
        # Append next token
        next_token_logits = reasoner_outputs.logits[:, -1]
        next_token_ids = get_next_token(next_token_logits, do_sample, temperature)
        current_input_ids = torch.cat([current_input_ids, next_token_ids], dim=1)
        
        # Append next token embeds
        next_token_embeds = reasoner.get_input_embeddings()(next_token_ids)
        current_inputs_embeds = torch.cat([current_inputs_embeds, next_token_embeds], dim=1)
        
        # Append attention mask
        attn_mask = torch.ones((B, 1), dtype=current_attention_mask.dtype, device=current_attention_mask.device)
        current_attention_mask = torch.cat([current_attention_mask, attn_mask], dim=1)
        
        # Append position ids
        next_position_id = current_position_ids[:, -1:] + 1
        current_position_ids = torch.cat([current_position_ids, next_position_id], dim=1)

        return current_inputs_embeds, current_attention_mask, current_position_ids, current_input_ids
    
    # @log_function_call
    def _select_augment_points_after_delimiter(
        self,
        input_ids: torch.Tensor,
        labels: torch.Tensor,
        delimiters: List[str],
        tokenizer: PreTrainedTokenizerBase,
        max_num: int = 10,
    ) -> List[int]:
        """
        Select positions in a sequence suitable for data augmentation based on labels and delimiters.

        This function identifies two types of augmentation points:
        1. **Prompt augmentation points**: positions where the label transitions from -100 to a valid token.
        - Typically corresponds to the start of the assistant's response.
        - Can be disabled if max_prompt_aug_num=0.
        2. **Inference augmentation points**: positions inside valid label regions that follow a delimiter.
        - Only the first `max_num` points are kept.

        Args:
            input_ids (torch.Tensor): Tensor of shape (batch_size, seq_len) containing token IDs.
            labels (torch.Tensor): Tensor of shape (batch_size, seq_len), where -100 indicates
                                positions that should not contribute to loss.
            delimiters (List[str]): List of string delimiters used to determine inference augmentation points.
            tokenizer (PreTrainedTokenizerBase): Tokenizer used to decode input_ids for delimiter matching.
            max_num (int, optional): Maximum number of inference augmentation points to select. Default is 10.

        Returns:
            List[int]: Sorted list of selected augmentation positions in the sequence.

        Raises:
            ValueError: If the prompt augmentation point count is not exactly one when it's expected.
            RuntimeError: If no valid augmentation points are found when both types are enabled.
        """
        assert input_ids.shape == labels.shape
        B, seq_len = input_ids.size(0), input_ids.size(1)

        prompt_augment_idx = []
        inference_augment_idx = []

        for i in range(1, seq_len):  # Skip the first token and last token for augmentation
            # Detect the boundary between prompt and label for prompt augmentation
            if (labels[:, i] != -100).all() and (labels[:, i - 1] == -100).all():
                prompt_augment_idx.append(i)

            # Detect valid label regions for inference augmentation
            elif (labels[:, i] != -100).all() and (labels[:, i - 1] != -100).all():
                batch_tokens_before_i = input_ids[:, :i]
                # Assume check_ends_with_delimiter is defined
                if any(check_ends_with_delimiter(batch_tokens_before_i, tokenizer, delimiters)):
                    inference_augment_idx.append(i)
        
        # Validate prompt augmentation point exists (data format check)
        if len(prompt_augment_idx) != 1:
            raise ValueError("Single-turn forward must have exactly one prompt augment index in data format")

        final_points = []
        
        # Include prompt augmentation point only if max_prompt_aug_num > 0
        if self.max_prompt_aug_num > 0:
            final_points = prompt_augment_idx[:1]
            logging.info(f"[Augmentation] Prompt augmentation enabled: adding point at {prompt_augment_idx[0]}")
        else:
            logging.info(f"[Augmentation] Prompt augmentation disabled (max_prompt_aug_num=0)")

        # Limit the number of inference augmentation points to max_num
        if len(inference_augment_idx) > max_num:
            inference_augment_idx = inference_augment_idx[:max_num]

        final_points.extend(inference_augment_idx)
        
        # Allow empty augmentation points if both are disabled
        if len(final_points) == 0:
            if self.max_prompt_aug_num == 0 and max_num == 0:
                logging.warning("[Augmentation] No augmentation points selected (both prompt and inference disabled)")
                return []
            else:
                raise RuntimeError("No valid augmentation points found despite augmentation being enabled")
        
        final_points.sort()
        logging.info(f"[Augmentation] Final augmentation points: {final_points}")
        return final_points

    # @log_function_call
    @torch.no_grad()
    def _should_augment(
        self, 
        input_ids, 
        attention_mask, 
        sentence_augment_count: torch.Tensor, 
        do_sample: bool, 
        temperature: float = 0.0
    ) -> torch.Tensor:
        """
        Determine whether each sentence in the batch should be augmented based on
        the model's strategy and trigger predictions.

        aug_mask values:
            - -100: No sampling or augmentation decision not applicable
            - 0   : Sampled but trigger indicates no augmentation
            - 1   : Sampled and trigger indicates augmentation

        Args:
            input_ids (torch.Tensor): Input token IDs of shape (batch_size, seq_len)
            attention_mask (torch.Tensor): Attention mask for input_ids
            sentence_augment_count (torch.Tensor): Tracks how many times each sentence has been augmented
            do_sample (bool): Whether to sample from the trigger output
            temperature (float): Sampling temperature

        Returns:
            torch.Tensor: Augmentation mask for each sentence in the batch
        """
        tokenizer = self.tokenizer
        delimiters = self.delimiters
        trigger = self.trigger
        max_augment_num = self.max_inference_aug_num

        batch_size = input_ids.size(0)

        # Initialize aug_mask with -100, meaning no augmentation by default
        aug_mask = torch.full((batch_size,), -100, dtype=torch.long, device=input_ids.device)

        # Mark sentences ending with delimiters as candidates for augmentation (set to 0)
        ends_with_delimiters = check_ends_with_delimiter(input_ids, tokenizer, delimiters).squeeze(1)
        aug_mask[ends_with_delimiters] = 0

        # If a sentence has already reached the max augmentation count, reset to -100
        over_limit = (sentence_augment_count >= max_augment_num)
        aug_mask[over_limit] = -100

        # Apply trigger model only on sentences that are candidates (aug_mask != -100)
        trigger_indices = (aug_mask != -100).nonzero(as_tuple=True)[0]
        if trigger_indices.numel() > 0:
            trigger_logits = trigger(
                input_ids=input_ids[trigger_indices],
                attention_mask=attention_mask[trigger_indices],
            )
            last_token_logits = trigger_logits[:, -1]  # [num_trigger_samples, 2]

            # Sample the next token to decide whether to augment
            next_tokens = get_next_token(last_token_logits, do_sample, temperature).view(-1)

            # Update aug_mask: 0 = no augmentation, 1 = augment
            aug_mask[trigger_indices] = next_tokens

        return aug_mask

    
    # @log_function_call
    @torch.no_grad()
    def _left_pad(
        self,
        input_embeds: torch.Tensor,
        attention_mask: torch.Tensor,
        position_ids: torch.Tensor,
        pad_num: int
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        
        if input_embeds is not None:
            B, L, D = input_embeds.shape
            pad_embeds = torch.zeros((B, pad_num, D), dtype=input_embeds.dtype, device=input_embeds.device)
            input_embeds = torch.cat([pad_embeds, input_embeds], dim=1)  # [B, pad_num + L, D]
        
        if attention_mask is not None:
            B = attention_mask.size(0)
            pad_mask = torch.zeros((B, pad_num), dtype=attention_mask.dtype, device=attention_mask.device)
            attention_mask = torch.cat([pad_mask, attention_mask], dim=1)  # [B, pad_num + L]
        
        if position_ids is not None:
            B = position_ids.size(0)
            pad_pos = torch.zeros((B, pad_num), dtype=position_ids.dtype, device=position_ids.device)
            position_ids = torch.cat([pad_pos, position_ids], dim=1)  # [B, pad_num + L]

        return input_embeds, attention_mask, position_ids
    
    @log_function_call
    @torch.no_grad()
    def _left_clip_pad_tokens(
        self, inputs_embeds: torch.Tensor, attention_mask: torch.Tensor, position_ids: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Left-trim padding tokens based on the attention mask.

        This function identifies the leftmost padding (attention_mask=0) in each
        sequence. If all sequences in the batch have at least some padding on the
        left, it trims the batch by the minimal left-padding length to remove
        unnecessary computation.

        Args:
            inputs_embeds (torch.Tensor): Input embeddings of shape (B, L, D)
            attention_mask (torch.Tensor): Attention mask of shape (B, L)
            position_ids (torch.Tensor): Position IDs of shape (B, L)

        Returns:
            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 
                Trimmed inputs_embeds, attention_mask, and position_ids
        """
        B, L, D = inputs_embeds.shape
        device = inputs_embeds.device

        # Find the index of the first non-padding token in each sequence
        first_nonpad_idx = []
        for b in range(B):
            nonzero = (attention_mask[b] != 0).nonzero(as_tuple=True)[0]
            if len(nonzero) == 0:
                # Entire row is padding; can potentially trim the whole sequence
                first_nonpad_idx.append(L)
            else:
                first_nonpad_idx.append(nonzero[0].item())
        
        # Determine the minimum number of left-padding tokens across the batch
        min_pad = min(first_nonpad_idx)

        # If no padding on the left, return original tensors
        if min_pad == 0:
            return inputs_embeds, attention_mask, position_ids

        # Trim the left-padding from all sequences in the batch
        inputs_embeds = inputs_embeds[:, min_pad:, :]
        attention_mask = attention_mask[:, min_pad:]
        position_ids = position_ids[:, min_pad:]

        return inputs_embeds, attention_mask, position_ids
    
    
    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
        """
        Enable gradient checkpointing for all sub-modules that support it.
        """
        kwargs = gradient_checkpointing_kwargs or {}
        for module in [self.model, getattr(self, "weaver", None),
                        getattr(self, "trigger", None)]:
            if hasattr(module, "gradient_checkpointing_enable"):
                module.gradient_checkpointing_enable(**kwargs)

    def gradient_checkpointing_disable(self):
        """
        Disable gradient checkpointing for all sub-modules that support it.
        """
        for module in [self.model, getattr(self, "weaver", None),
                        getattr(self, "trigger", None)]:
            if hasattr(module, "gradient_checkpointing_disable"):
                module.gradient_checkpointing_disable()

    @property
    def is_gradient_checkpointing(self):
        """
        Check if gradient checkpointing is enabled for any sub-module.
        """
        for module in [self.model, getattr(self, "weaver", None),
                        getattr(self, "trigger", None)]:
            if hasattr(module, "is_gradient_checkpointing"):
                if module.is_gradient_checkpointing:
                    return True
        return False