Bailan-Alex commited on
Commit
2f3e169
·
verified ·
1 Parent(s): c906e52

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: D2F Eval
3
- emoji: 🏃
4
- colorFrom: gray
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 5.49.1
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: D2F-eval
3
+ app_file: generate_llada_demo_block.py
 
 
4
  sdk: gradio
5
+ sdk_version: 5.49.0
 
 
6
  ---
 
 
eval_dream.py ADDED
@@ -0,0 +1,1155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import gc
3
+ import time
4
+ import json
5
+ from datetime import timedelta
6
+ from typing import List, Optional, Tuple, Type, TypeVar, Union
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import torch.distributions as dists
10
+ import transformers
11
+ from accelerate import (
12
+ Accelerator,
13
+ InitProcessGroupKwargs,
14
+ )
15
+ from datasets import Dataset
16
+ from packaging import version
17
+ from tqdm import tqdm
18
+ from peft import PeftConfig, PeftModel
19
+ import numpy as np
20
+
21
+ from lm_eval import utils
22
+ from lm_eval.api.instance import Instance
23
+ from lm_eval.api.model import LM
24
+ from lm_eval.api.registry import register_model
25
+ from lm_eval.models.utils import get_dtype
26
+ from lm_eval.__main__ import cli_evaluate
27
+
28
+ eval_logger = logging.getLogger(__name__)
29
+ T = TypeVar("T", bound="LM")
30
+ import random
31
def set_seed(seed):
    """Seed every RNG used here (torch, random, numpy) and force deterministic cuDNN."""
    for seeder in (torch.manual_seed, random.seed, np.random.seed):
        seeder(seed)
    # Trade cuDNN autotuning for reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
38
+
39
def shift_logits(logits):
    """Shift logits one position to the right along the sequence axis.

    Output position t holds the logits originally at t-1; position 0 is
    filled with the constant 1.0 (no real prediction exists for it).
    """
    head = torch.ones_like(logits[:, :1, :])
    tail = logits[:, :-1, :]
    return torch.cat([head, tail], dim=1)
44
+
45
def create_full_block_attention_mask(prompt_length, max_length, block_size, device=None, dtype=None):
    """
    Creates a complete attention mask for the entire sequence with block-based causal attention.

    Prompt rows attend only to the prompt. Each subsequent block of
    `block_size` rows attends to the prompt, to every earlier block, and to
    itself (full attention within the block) — i.e. to every column strictly
    before its own block end. Disallowed positions hold -inf, allowed hold 0.

    Args:
        prompt_length: Length of the prompt (first irregular block)
        max_length: Maximum total sequence length
        block_size: Size of each regular block
        device: Device to create tensor on
        dtype: Data type for the attention mask

    Returns:
        attention_mask: Tensor of shape [1, 1, max_length, max_length]
    """
    # Use the provided dtype or default to bfloat16
    if dtype is None:
        dtype = torch.bfloat16

    # Initialize mask with -inf (no attention)
    attention_mask = torch.full((1, 1, max_length, max_length), -torch.inf, device=device, dtype=dtype)

    # Block 0: Prompt (can see itself)
    attention_mask[:, :, :prompt_length, :prompt_length] = 0

    # Calculate the number of regular blocks after prompt
    remaining_length = max_length - prompt_length
    num_blocks = (remaining_length + block_size - 1) // block_size

    # Each regular block sees prompt + all earlier blocks + itself, which is
    # exactly the column range [0, block_end). One slice replaces the original
    # O(num_blocks^2) inner loop with an identical result.
    for b in range(num_blocks):
        block_start = prompt_length + b * block_size
        block_end = min(prompt_length + (b + 1) * block_size, max_length)
        attention_mask[:, :, block_start:block_end, :block_end] = 0

    return attention_mask
91
+
92
def extract_attention_mask(full_mask, start_pos, input_length, cache_length):
    """
    Extract the relevant portion of attention mask for current forward pass.

    Args:
        full_mask: Complete attention mask [1, 1, max_length, max_length]
        start_pos: Starting position in the full sequence
        input_length: Length of current input sequence
        cache_length: Length of cached sequence

    Returns:
        attention_mask: Extracted mask [1, 1, input_length, cache_length + input_length]
    """
    end_pos = start_pos + input_length

    # Rows: the positions currently being fed through the model.
    rows = full_mask[:, :, start_pos:end_pos, :]

    # Columns: first the cached prefix, then the current input's own span.
    cache_cols = rows[:, :, :, :cache_length]
    input_cols = rows[:, :, :, start_pos:end_pos]

    return torch.cat([cache_cols, input_cols], dim=-1)
120
+
121
def build_custom_float_attention_mask(input_ids, prompt_length, block_size, device=None, dtype=None):
    """Build a per-sample block-causal float attention mask.

    Every row can attend to its sample's prompt; generated positions are split
    into `block_size` blocks where each block attends to all earlier blocks and
    fully to itself. Disallowed positions hold -inf, allowed hold 0.

    Args:
        input_ids: [B, seq_len] token ids (only the shape is used).
        prompt_length: per-sample prompt lengths, indexable as prompt_length[i].
        block_size: size of each generation block.
        device / dtype: placement for the returned mask (dtype defaults to float32).

    Returns:
        attn_mask: Tensor of shape [B, 1, seq_len, seq_len].
    """
    B, seq_len = input_ids.shape
    # Use the provided dtype or default to float32
    if dtype is None:
        dtype = torch.float32
    # Initialize to all -inf
    attn_mask = torch.full((B, 1, seq_len, seq_len), float('-inf'), dtype=dtype, device=device)

    for i in range(B):
        p = prompt_length[i]
        # 1. Prompt part: each token can attend to the entire prompt
        attn_mask[i, :, :, :p] = 0.0

        # 2. Block division: divide into blocks starting from prompt_length.
        # A block sees the prompt, every earlier block, and itself — i.e. every
        # column before its own block end. One slice replaces the original
        # O(num_blocks^2) inner loop with an identical result.
        num_blocks = (seq_len - p + block_size - 1) // block_size
        for b in range(num_blocks):
            block_start = p + b * block_size
            block_end = min(block_start + block_size, seq_len)
            attn_mask[i, :, block_start:block_end, :block_end] = 0.0

    return attn_mask
151
+
152
def top_p_logits(logits, top_p=None):
    """Nucleus (top-p) filtering: tokens outside the smallest probability mass
    exceeding `top_p` are masked to the dtype's minimum. Returns a new tensor."""
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cum_probs = F.softmax(sorted_logits, dim=-1).cumsum(dim=-1)

    # Drop everything past the nucleus, shifted right by one so the first
    # token that crosses the threshold is still kept.
    drop_sorted = cum_probs > top_p
    drop_sorted = torch.roll(drop_sorted, shifts=1, dims=-1)
    drop_sorted[..., 0] = False

    # Scatter the sorted-order decisions back to vocabulary order.
    drop = torch.zeros_like(logits, dtype=torch.bool, device=logits.device)
    drop.scatter_(-1, sorted_indices, drop_sorted)
    return logits.masked_fill(drop, torch.finfo(logits.dtype).min)
164
+
165
def top_k_logits(logits, top_k=None):
    """Keep only the `top_k` highest logits; everything below the k-th value is
    masked to the dtype's minimum. Returns a new tensor."""
    top_k = min(top_k, logits.size(-1))  # Safety check
    kth_value = torch.topk(logits, top_k)[0][..., -1, None]
    below_kth = logits < kth_value
    return logits.masked_fill(below_kth, torch.finfo(logits.dtype).min)
171
+
172
def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confidence=False, neg_entropy=False):
    """Sample token ids from logits and compute per-position confidence scores.

    Args:
        logits: [..., vocab] unnormalized scores (last dim is the vocabulary).
        temperature: 0 -> greedy argmax; >0 -> scaled Categorical sampling.
        top_p / top_k: optional nucleus / top-k filtering applied before softmax.
        margin_confidence: if True, confidence = p(top1) - p(top2).
        neg_entropy: if True, confidence = sum(p * log p) (negative entropy).

    Returns:
        (confidence, x0, initial_confidence): sampled ids `x0`, the probability
        of each chosen token `initial_confidence`, and `confidence` per the
        selected strategy (defaults to a copy of `initial_confidence`).
    """
    if temperature > 0:
        logits = logits / temperature
    if top_p is not None and top_p < 1:
        logits = top_p_logits(logits, top_p)
    if top_k is not None:
        logits = top_k_logits(logits, top_k)
    probs = torch.softmax(logits, dim=-1)

    if temperature > 0:
        try:
            x0 = dists.Categorical(probs=probs).sample()
            initial_confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
        except Exception:
            # Fall back to greedy if sampling fails (e.g. degenerate probs).
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
            initial_confidence, x0 = probs.max(dim=-1)
    else:
        initial_confidence, x0 = probs.max(dim=-1)

    # Save initial confidence
    confidence = initial_confidence.clone()

    if margin_confidence:
        sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
        # Confidence = top1 - top2; `[..., i]` generalizes beyond 2-D input
        # (behavior unchanged for the original [batch, vocab] case).
        confidence = sorted_probs[..., 0] - sorted_probs[..., 1]

    if neg_entropy:
        epsilon = 1e-10
        log_probs = torch.log(probs + epsilon)
        confidence = torch.sum(probs * log_probs, dim=-1)

    return confidence, x0, initial_confidence
207
+
208
+ @register_model("dream_lora")
209
+ class DreamLoRA(LM):
210
    def __init__(
        self,
        pretrained: Union[str, transformers.PreTrainedModel],
        lora_path: str,
        batch_size: Optional[Union[int, str]] = 1,
        device: Optional[str] = "cuda",
        dtype: Optional[Union[str, torch.dtype]] = "auto",
        max_new_tokens: Optional[int] = 128,
        max_length: Optional[int] = 2048,  # Updated to match example code
        add_bos_token: Optional[bool] = False,
        nll_type: Optional[str] = "mc",
        log_type: Optional[str] = "ftb",
        mc_num: Optional[int] = 128,
        classifier_free_guidance: Optional[float] = 1.0,
        sampling_eps: Optional[float] = 1e-3,
        diffusion_steps: Optional[int] = 128,
        trust_remote_code: Optional[bool] = True,
        parallelize: Optional[bool] = False,
        autogptq: Optional[Union[bool, str]] = False,
        temperature: Optional[float] = 0.2,  # Updated default
        top_p: Optional[float] = None,  # Updated default
        top_k: Optional[float] = None,
        alg: Optional[str] = "entropy",
        alg_temp: Optional[float] = 0.0,
        escape_until: Optional[bool] = False,
        block_size: Optional[int] = 4,  # Updated to match example code
        mask_token_id: Optional[int] = 151666,  # Added mask_token_id parameter
        block_add_threshold: Optional[float] = 0.5,  # Added block_add_threshold parameter
        decoded_token_threshold: Optional[float] = 0.9,  # annotation fixed: default is a float ratio
        skip_threshold: Optional[float] = 1.0,  # Added skip_threshold parameter
        sampling_strategy: Optional[str] = "default",  # Added sampling_strategy parameter
        save_dir: Optional[str] = None,
        **kwargs,
    ) -> None:
        """lm-eval `LM` wrapper for a Dream diffusion model with a LoRA adapter.

        Sets up device placement (optionally via `accelerate`), loads the base
        model + LoRA through `_create_model_and_tokenizer`, and records the
        generation / loglikelihood hyperparameters used by the other methods.
        `pretrained` must be a model path/name string (asserted below);
        `lora_path` is a PEFT adapter location.
        """
        super().__init__()

        # prepare for parallelism
        assert isinstance(device, str)
        assert isinstance(pretrained, str)
        assert isinstance(batch_size, (int, str))

        gpus = torch.cuda.device_count()
        # Very long timeout so slow multi-node startups don't kill the process group.
        accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
        accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
        if accelerator.num_processes > 1:
            self.accelerator = accelerator

        if "npu" in accelerator.device.type:
            gpus = torch.npu.device_count()

        # using one process with no model parallelism
        if not (parallelize or accelerator.num_processes > 1):
            # use user-passed device
            device_list = set(
                ["cuda", "cpu"]
                + [f"cuda:{i}" for i in range(gpus)]
                + ["mps", "mps:0"]
                + [f"npu:{i}" for i in range(gpus)]
            )
            if device and device in device_list:
                self._device = torch.device(device)
                eval_logger.info(f"Using device '{device}'")
                if device in ("mps", "mps:0") and version.parse(
                    torch.__version__
                ) < version.parse("2.1"):
                    raise RuntimeError(
                        f"mps requires torch >= 2.1. You have {torch.__version__}"
                    )
            else:
                eval_logger.info("Device not specified")
                eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
                self._device = (
                    torch.device("cuda")
                    if torch.cuda.is_available()
                    else torch.device("cpu")
                )
        else:  # Parallelism managed by accelerate
            if device != "cuda":
                eval_logger.info(
                    f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
                )
            # TODO: include in warning that `load_in_8bit` etc. affect this too
            self._device = (
                self.accelerator.device
                if hasattr(self, "accelerator")
                else torch.device(device)
            )

        self.batch_size_per_gpu = batch_size
        if isinstance(batch_size, str):
            self.batch_size_per_gpu = int(batch_size)

        # Save LoRA path and block_size
        self.lora_path = lora_path
        self.block_size = block_size
        self.block_add_threshold = block_add_threshold  # decode-progress ratio that triggers appending a new block
        self.skip_threshold = skip_threshold  # confidence needed to commit a token early
        self.sampling_strategy = sampling_strategy  # "default" | "neg_entropy" | "margin_confidence" (see sample_tokens)
        self.decoded_token_threshold = decoded_token_threshold  # decoded ratio marking a block "complete"
        self.save_dir = save_dir

        # Add metric tracking
        self.total_forward_passes = 0
        self.total_generated_tokens = 0
        self.total_prompts = 0
        # Add time and token statistics
        self.total_generation_time = 0.0
        self.total_block_tokens = 0  # Number of blocks * block_size
        self.total_actual_tokens = 0  # Actual generated tokens (excluding EOS)
        self.total_non_eos_tokens = 0  # Total non-EOS tokens in the entire sequence
        self.all_generation_times = []
        self.all_block_tokens = []
        self.all_actual_tokens = []
        self.all_non_eos_tokens = []

        # Save target_dtype for later use
        self.target_dtype = get_dtype(dtype)

        self._create_model_and_tokenizer(pretrained, dtype, trust_remote_code)

        if isinstance(pretrained, str):
            if gpus >= 1 or str(self.device) == "mps":
                # TODO: can remove this whole snippet except in the mps case, perhaps?
                if not (parallelize or autogptq or hasattr(self, "accelerator")):
                    # place model onto device requested manually,
                    # if not using HF Accelerate or device_map
                    # or any other option that preloads model onto device
                    try:
                        self.model.to(self.device)
                    except ValueError:
                        eval_logger.debug(
                            "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
                        )
            # multigpu data-parallel support when launched with accelerate
            if gpus > 1:
                if accelerator.num_processes > 1:
                    if parallelize:
                        eval_logger.warning(
                            "You are both using a HF Accelerate `device_map` (`--model_args parallelize=True`) and launching via `accelerate launch`. This will attempt to do model and data parallelism depending on the resources available."
                        )
                    elif gpus > accelerator.num_processes:
                        eval_logger.warning(
                            "WARNING: The number of total system GPUs does not match the number of spawned processes. "
                            "If you would like to use data parallelism, please launch the script "
                            "with 'accelerate launch *script*'. "
                            f"Current run will proceed with {accelerator.num_processes} devices."
                        )
                    if self.accelerator.is_local_main_process:
                        eval_logger.info(
                            f"Using {gpus} devices with data parallelism"
                        )

                    self._device = torch.device(f"{accelerator.device}")
                    self.accelerator = accelerator

                    self._rank = self.accelerator.local_process_index
                    self._world_size = self.accelerator.num_processes
                else:
                    # if we aren't launching via accelerate, ditch
                    self._rank = 0
                    self._world_size = 1
                # NOTE(review): when gpus <= 1, _rank/_world_size are presumably the
                # defaults set by the LM base class __init__ — confirm against lm-eval.
        else:
            # if a PreTrainedModel was passed into HFLM, we forgo distributed setup.
            eval_logger.warning(
                "Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration"
            )
            self._rank = 0
            self._world_size = 1

        self.max_length = max_length
        self.add_bos_token = add_bos_token
        # generation params
        self.max_new_tokens = max_new_tokens
        self.diffusion_steps = diffusion_steps
        self.temperature = temperature
        self.top_p = top_p
        self.top_k = top_k
        self.alg = alg
        self.alg_temp = alg_temp
        self.escape_until = escape_until
        self.block_size = block_size
        self.mask_token_id = mask_token_id

        # loglikelihood params
        self.nll_type = nll_type
        self.log_type = log_type
        self.mc_num = mc_num
        self.classifier_free_guidance = classifier_free_guidance
        self.sampling_eps = sampling_eps
399
+
400
    @property
    def batch_size(self):
        """Per-device batch size, as required by the lm-eval `LM` interface."""
        return self.batch_size_per_gpu
403
+
404
    @property
    def device(self):
        """torch.device this instance places tensors/model on (set in __init__)."""
        return self._device
407
+
408
    @property
    def rank(self):
        """Process rank for data-parallel evaluation (0 when single-process)."""
        return self._rank
411
+
412
    @property
    def world_size(self):
        """Number of data-parallel processes (1 when single-process)."""
        return self._world_size
415
+
416
    def _create_model_and_tokenizer(self, pretrained, dtype, trust_remote_code):
        """Load the Dream base model, wrap it with the LoRA adapter from
        `self.lora_path`, cast/move it, and load the matching tokenizer."""
        # Get correct data type
        from model_cache.dream.model_dream import DreamModel
        from model_cache.dream.configuration_dream import DreamConfig
        target_dtype = get_dtype(dtype)

        # Load base model, using DreamModel and DreamConfig
        model_config = DreamConfig.from_pretrained(pretrained)
        self.model = DreamModel.from_pretrained(
            pretrained,
            config=model_config,
            torch_dtype=target_dtype,
            trust_remote_code=False,
        ).eval()

        # Load LoRA config and model
        # NOTE(review): `config` is otherwise unused; loading it presumably serves
        # to validate lora_path early — confirm before removing.
        config = PeftConfig.from_pretrained(self.lora_path)
        self.model = PeftModel.from_pretrained(self.model, self.lora_path)

        # Only convert data type if target_dtype is not None and not "auto"
        if target_dtype is not None and target_dtype != "auto":
            self.model = self.model.to(target_dtype)

        # Move to specified device
        self.model = self.model.to(self.device)

        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            pretrained, trust_remote_code=trust_remote_code
        )
445
+
446
    def tok_decode(self, tokens, skip_special_tokens=True):
        """Decode token ids to text via the underlying tokenizer."""
        return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
448
+
449
    def tok_encode(self, text, add_special_tokens=True):
        """Tokenize `text` and return the `input_ids` tensor ("pt" tensors)."""
        return self.tokenizer(
            text, return_tensors="pt", add_special_tokens=add_special_tokens
        ).input_ids
453
+
454
+ @classmethod
455
+ def create_from_arg_string(
456
+ cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
457
+ ) -> T:
458
+ """
459
+ Creates an instance of the LM class using the given argument string and additional config.
460
+
461
+ Parameters:
462
+ - arg_string: A string containing arguments in the format key1=value1,key2=value2.
463
+ - additional_config: Optional dictionary containing additional configuration parameters.
464
+
465
+ Returns:
466
+ - Instance of the LM class.
467
+ """
468
+ additional_config = {} if additional_config is None else additional_config
469
+ args = utils.simple_parse_args_string(arg_string)
470
+ args2 = {k: v for k, v in additional_config.items() if v is not None}
471
+ return cls(**args, **args2)
472
+
473
+ def apply_chat_template(
474
+ self, chat_history, add_generation_prompt: bool = True
475
+ ) -> str:
476
+ """
477
+ Method to apply a chat template to a list of chat history between user and model.
478
+ """
479
+ chat_templated = self.tokenizer.apply_chat_template(
480
+ chat_history,
481
+ tokenize=False,
482
+ add_generation_prompt=add_generation_prompt,
483
+ continue_final_message=not add_generation_prompt,
484
+ )
485
+
486
+ return chat_templated
487
+
488
    @property
    def tokenizer_name(self) -> str:
        """Tokenizer identifier with "/" replaced so it is filesystem-safe."""
        return self.tokenizer.name_or_path.replace("/", "__")
491
+
492
+ def _count_non_eos_tokens_before_truncation(self, generated_sequence, prompt_length):
493
+ """
494
+ Unified token counting function: counts non-EOS tokens in the generated sequence (before truncation).
495
+ """
496
+ # Get the generated part (excluding the prompt)
497
+ generated_tokens = generated_sequence[prompt_length:]
498
+ # Count non-EOS tokens
499
+ eos_token_id = self.tokenizer.eos_token_id
500
+ if eos_token_id is not None:
501
+ # If it's a tensor, convert to list for counting
502
+ if hasattr(generated_tokens, 'tolist'):
503
+ generated_tokens_list = generated_tokens.tolist()
504
+ else:
505
+ generated_tokens_list = generated_tokens
506
+ non_eos_count = sum(1 for token in generated_tokens_list if token != eos_token_id)
507
+ else:
508
+ non_eos_count = len(generated_tokens)
509
+ return non_eos_count
510
+
511
    def _generate_batch(self, prompts: List[str]) -> List[str]:
        """Generate one response per prompt (sequentially) via block generation.

        Prompts are optionally prefixed with BOS, tokenized, left-truncated so
        prompt + `max_new_tokens` fits in `max_length`, then passed to
        `_generate_block_single`, which returns EOS-truncated text.
        """
        if self.add_bos_token:
            prompts = [self.tokenizer.bos_token + p for p in prompts]

        responses = []

        # Generate for each prompt individually (block generation usually processes one by one)
        for i, prompt in enumerate(prompts):
            # tokenize
            prompt_ids = self.tokenizer.encode(prompt)
            prompt_tensor = torch.tensor([prompt_ids], device=self.device, dtype=torch.long)

            # Left-truncate over-long prompts so generation still has room.
            if len(prompt_ids) > self.max_length - self.max_new_tokens:
                eval_logger.warning(f"Prompt length {len(prompt_ids)} is larger than {self.max_length-self.max_new_tokens}, cutoff on the left side")
                prompt_tensor = prompt_tensor[:, -(self.max_length-self.max_new_tokens):]

            # Use generate_block_single method to generate, returns EOS-truncated response text
            response = self._generate_block_single(prompt_tensor)
            responses.append(response)

        return responses
532
+
533
+ def _generate_block_single(self, prompt):
534
+ """
535
+ Generates a response for a single prompt using parallel block generation, based on KV cache,
536
+ and using pre-generated attention masks.
537
+ Returns: EOS-truncated response text.
538
+ """
539
+ self.model.eval()
540
+
541
+ mask_id = self.mask_token_id
542
+ block_size = self.block_size
543
+ block_add_threshold = self.block_add_threshold
544
+ skip_threshold = self.skip_threshold
545
+ decoded_token_threshold = self.decoded_token_threshold
546
+
547
+ # Pre-generate full attention mask, using model's data type
548
+ prompt_length = prompt.shape[1]
549
+ full_attention_mask = create_full_block_attention_mask(
550
+ prompt_length=prompt_length,
551
+ max_length=self.max_length,
552
+ block_size=block_size,
553
+ device=self.device,
554
+ dtype=self.target_dtype if self.target_dtype is not None and self.target_dtype != "auto" else torch.bfloat16
555
+ )
556
+
557
+ with torch.inference_mode():
558
+ # Initialization
559
+ x_t = prompt.to(self.device)
560
+
561
+ # Track block states - state can be: 'active', 'to_cache', 'in_cache'
562
+ # Added 'is_complete' field to indicate whether it's a complete state (True) or incomplete (False)
563
+ block_states = {
564
+ 0: {
565
+ 'start_pos': 0,
566
+ 'end_pos': prompt.shape[1],
567
+ 'mask_count': 0,
568
+ 'total_masks': prompt.shape[1],
569
+ 'state': 'to_cache', # prompt ready for caching immediately
570
+ 'is_complete': True, # prompt is always in a complete state
571
+ },
572
+ }
573
+
574
+ # Initialize cache
575
+ past_key_values = None
576
+ last_logits = None
577
+
578
+ current_blocks = 0 # Number of active blocks
579
+ step = 0
580
+ eos_detected = False # EOS detection flag
581
+
582
+ while current_blocks >= 0:
583
+ step += 1
584
+
585
+ # Check if a new block needs to be added
586
+ if len(block_states)-1 < (self.max_new_tokens // block_size) and not eos_detected:
587
+ last_block_id = len(block_states) - 1
588
+ current_progress = (block_states[last_block_id]['total_masks'] -
589
+ block_states[last_block_id]['mask_count']) / block_states[last_block_id]['total_masks']
590
+ if current_progress >= block_add_threshold:
591
+ # Add new block - defaults to incomplete state
592
+ new_block_id = len(block_states)
593
+ new_start_pos = x_t.shape[1]
594
+ x_t = torch.cat([x_t, torch.tensor([[mask_id] * block_size]).to(self.device)], dim=1)
595
+
596
+ block_states[new_block_id] = {
597
+ 'start_pos': new_start_pos,
598
+ 'end_pos': new_start_pos + block_size,
599
+ 'mask_count': block_size,
600
+ 'total_masks': block_size,
601
+ 'state': 'active',
602
+ 'is_complete': False, # New block defaults to incomplete state
603
+ }
604
+ current_blocks += 1
605
+
606
+ # At the beginning of each loop, update block completion states
607
+ self._update_block_completion_states(block_states, decoded_token_threshold)
608
+ # Check if there are still mask tokens
609
+ mask_index = (x_t == mask_id)
610
+ if mask_index.sum() == 0 and current_blocks == 0:
611
+ break
612
+
613
+ # Determine which blocks need to be added to cache
614
+ blocks_to_cache = [bid for bid, state in block_states.items()
615
+ if state['state'] == 'to_cache']
616
+
617
+ # Determine the part to process
618
+ cache_length = 0 if past_key_values is None else past_key_values.get_seq_length()
619
+
620
+ # Determine content to add to cache
621
+ update_kvcache = 0
622
+ if blocks_to_cache:
623
+ # Find the earliest block that needs to be cached
624
+ earliest_block_id = min(blocks_to_cache)
625
+ earliest_pos = block_states[earliest_block_id]['start_pos']
626
+
627
+ # Find the latest block that needs to be cached
628
+ latest_block_id = max(blocks_to_cache)
629
+ latest_pos = block_states[latest_block_id]['end_pos']
630
+
631
+ # Update cache for all blocks within this range
632
+ update_kvcache = latest_pos - earliest_pos
633
+
634
+ # Create input sequence for forward pass
635
+ process_start_pos = cache_length
636
+
637
+ if update_kvcache > 0:
638
+ # Need to update cache - use completed blocks
639
+ earliest_block_to_cache = min(blocks_to_cache)
640
+ input_seq = x_t[:, block_states[earliest_block_to_cache]['start_pos']:]
641
+ process_start_pos = block_states[earliest_block_to_cache]['start_pos']
642
+ else:
643
+ # Only process active blocks
644
+ active_blocks = [bid for bid in block_states.keys() if block_states[bid]['state'] == 'active']
645
+ if active_blocks:
646
+ # Get all active blocks after the cache
647
+ earliest_active_after_cache = float('inf')
648
+ for bid in active_blocks:
649
+ if block_states[bid]['start_pos'] >= cache_length:
650
+ earliest_active_after_cache = min(earliest_active_after_cache, block_states[bid]['start_pos'])
651
+
652
+ if earliest_active_after_cache < float('inf'):
653
+ input_seq = x_t[:, earliest_active_after_cache:]
654
+ process_start_pos = earliest_active_after_cache
655
+ else:
656
+ # No active blocks after cache, this shouldn't happen
657
+ input_seq = x_t[:, cache_length:]
658
+ # If cache length is already equal to or exceeds sequence length, exit
659
+ if cache_length >= x_t.shape[1]:
660
+ print(f"Cache length ({cache_length}) >= sequence length ({x_t.shape[1]}) at step {step}. Exiting generation loop.")
661
+ raise Exception("Cache length >= sequence length")
662
+ else:
663
+ # No active blocks, but might have blocks to cache in next iteration
664
+ break
665
+
666
+ # Check if input_seq is empty
667
+ if input_seq.shape[1] == 0:
668
+ print(f"Warning: input_seq is empty at step {step}. Breaking generation loop.")
669
+ raise Exception("input_seq is empty")
670
+
671
+ # Extract attention mask for current input from the pre-generated full mask
672
+ input_length = input_seq.shape[1]
673
+ attention_mask = extract_attention_mask(
674
+ full_mask=full_attention_mask,
675
+ start_pos=process_start_pos,
676
+ input_length=input_length,
677
+ cache_length=cache_length
678
+ )
679
+
680
+ # Forward pass
681
+ outputs = self.model(
682
+ input_seq,
683
+ attention_mask=attention_mask,
684
+ past_key_values=past_key_values,
685
+ use_cache=True,
686
+ update_kvcache=update_kvcache,
687
+ )
688
+
689
+ # If needed, update cache
690
+ if update_kvcache > 0:
691
+ # Store logits of the last position for next token prediction
692
+ cache_end_idx = update_kvcache - 1
693
+ last_logits = outputs.logits[:, cache_end_idx, :].unsqueeze(1)
694
+
695
+ # Update cache
696
+ past_key_values = outputs.past_key_values
697
+
698
+ # Mark blocks as cached
699
+ for block_id in blocks_to_cache:
700
+ block_states[block_id]['state'] = 'in_cache'
701
+
702
+ # Get correctly shifted logits for prediction
703
+ logits = self._shift_logits(outputs.logits, last_logit=last_logits)
704
+
705
+ # Process mask tokens for each active block
706
+ blocks_to_deactivate = []
707
+
708
+ for block_id in sorted(block_states.keys()):
709
+ if block_states[block_id]['state'] != 'active':
710
+ continue
711
+
712
+ # Get mask positions for this block
713
+ block_start = block_states[block_id]['start_pos']
714
+ block_end = block_states[block_id]['end_pos']
715
+ block_mask_index = mask_index.clone()
716
+ block_mask_index[:, :block_start] = False
717
+ block_mask_index[:, block_end:] = False
718
+
719
+ # If the current block has no masks, skip it
720
+ if block_mask_index.sum() == 0:
721
+ blocks_to_deactivate.append(block_id)
722
+ continue
723
+
724
+ # Calculate relative position for logits
725
+ logit_offset = block_start - process_start_pos
726
+ block_rel_positions = torch.where(block_mask_index[0, block_start:block_end])[0]
727
+
728
+ if block_rel_positions.size(0) > 0:
729
+ # Get logits for masked positions
730
+ block_mask_logits = logits[:, logit_offset + block_rel_positions, :]
731
+
732
+ # Sample tokens
733
+ confidence, x0, initial_confidence = sample_tokens(
734
+ block_mask_logits.squeeze(0),
735
+ self.temperature,
736
+ top_p=self.top_p,
737
+ top_k=self.top_k,
738
+ neg_entropy=(self.sampling_strategy == "neg_entropy"),
739
+ margin_confidence=(self.sampling_strategy == "margin_confidence")
740
+ )
741
+
742
+ # Apply different sampling strategies based on the block's complete/incomplete state
743
+ is_complete = block_states[block_id]['is_complete']
744
+
745
+ if is_complete:
746
+ # Complete state: apply confidence threshold, if no high confidence, select highest
747
+ high_conf_indices = torch.where(initial_confidence > skip_threshold)[0]
748
+
749
+ if len(high_conf_indices) == 0:
750
+ number_transfer_tokens = 1
751
+ _, transfer_index = torch.topk(confidence, number_transfer_tokens)
752
+ else:
753
+ transfer_index = torch.tensor([], device=self.device, dtype=torch.long)
754
+
755
+ # Merge indices
756
+ all_indices = torch.unique(torch.cat([transfer_index, high_conf_indices]))
757
+ else:
758
+ # Incomplete state: only apply confidence threshold, if none exceed, select no tokens
759
+ high_conf_indices = torch.where(initial_confidence > skip_threshold)[0]
760
+ all_indices = high_conf_indices
761
+
762
+ # Update tokens
763
+ if len(all_indices) > 0:
764
+ x0_ = torch.zeros_like(x0, device=self.device, dtype=torch.long) + mask_id
765
+ x0_[all_indices] = x0[all_indices].clone()
766
+
767
+ # Map indices back to original positions
768
+ for i, idx in enumerate(all_indices):
769
+ abs_pos = block_start + block_rel_positions[idx]
770
+ x_t[0, abs_pos] = x0_[idx]
771
+
772
+ # Update block state
773
+ block_states[block_id]['mask_count'] -= len(all_indices)
774
+
775
+ # Check EOS token
776
+ eos_token_id = self.tokenizer.eos_token_id
777
+ if eos_token_id is not None:
778
+ for idx in all_indices:
779
+ if x0[idx].item() == eos_token_id:
780
+ eos_detected = True
781
+ break
782
+
783
+ # If no masks remain in this block, deactivate it
784
+ mask_index = (x_t == mask_id)
785
+ block_mask_index = mask_index.clone()
786
+ block_mask_index[:, :block_start] = False
787
+ block_mask_index[:, block_end:] = False
788
+ if block_mask_index.sum() == 0:
789
+ blocks_to_deactivate.append(block_id)
790
+ continue
791
+
792
+ # Deactivate completed blocks and mark them for caching in the next iteration
793
+ for block_id in blocks_to_deactivate:
794
+ if block_states[block_id]['state'] == 'active':
795
+ # Check if all preceding blocks are already non-active
796
+ can_deactivate = True
797
+ for prev_block_id in range(block_id):
798
+ if prev_block_id in block_states and block_states[prev_block_id]['state'] == 'active':
799
+ can_deactivate = False
800
+ break
801
+
802
+ # Only mark the current block as 'to_cache' if all preceding blocks are non-active
803
+ if can_deactivate:
804
+ block_states[block_id]['state'] = 'to_cache'
805
+ current_blocks -= 1
806
+ # If there are active blocks before, keep current block as active (do nothing)
807
+
808
+ # Safety check
809
+ if step > 10000:
810
+ print(f"WARNING: Hit safety check at step {step}. Exiting generation loop.")
811
+ break
812
+
813
+ # First, calculate non-EOS tokens for the full generated sequence
814
+ generated_sequence = x_t[0, prompt.shape[1]:].tolist()
815
+ non_eos_tokens = self._count_non_eos_tokens_before_truncation(
816
+ x_t[0].tolist(), prompt.shape[1]
817
+ )
818
+
819
+ # Accumulate to total tokens
820
+ if not hasattr(self, 'total_generated_tokens'):
821
+ self.total_generated_tokens = 0
822
+ self.total_generated_tokens += non_eos_tokens
823
+
824
+ # Generate EOS-truncated response text (consistent with other file logic)
825
+ response = self.tokenizer.decode(generated_sequence).split(self.tokenizer.eos_token)[0]
826
+
827
+ return response
828
+
829
+ def _update_block_completion_states(self, block_states, decoded_token_threshold):
830
+ """
831
+ Updates the complete/incomplete state of blocks.
832
+ Iterates through blocks from front to back. If a block's decoded token count
833
+ is greater than the threshold, the next block to its right (if it exists)
834
+ is set to a complete state.
835
+ """
836
+ for block_id in sorted(block_states.keys()):
837
+ # if block_id == 0: # Skip prompt block
838
+ # continue
839
+
840
+ # Calculate decoded tokens for the current block
841
+ decoded_tokens = block_states[block_id]['total_masks'] - block_states[block_id]['mask_count']
842
+ decode_ratio = decoded_tokens / block_states[block_id]['total_masks']
843
+ # If the current block's decoded token count is greater than the threshold,
844
+ # then the next block (if it exists) is set to a complete state.
845
+ # print("decode_ratio",decode_ratio)
846
+ # print("decoded_token_threshold",decoded_token_threshold)
847
+ if decode_ratio >= decoded_token_threshold:
848
+ next_block_id = block_id + 1
849
+ if next_block_id in block_states:
850
+ block_states[next_block_id]['is_complete'] = True
851
+
852
+ def _shift_logits(self, logits, last_logit=None, block_size=None):
853
+ """Shifts logits to the right by one position, for autoregressive generation"""
854
+ # Check if logits are empty
855
+ if logits.shape[1] == 0:
856
+ print("Warning: logits sequence length is 0, returning empty logits")
857
+ raise Exception("logits sequence length is 0")
858
+
859
+ shifted_logits = torch.zeros_like(logits)
860
+ shifted_logits[:, 1:, :] = logits[:, :-1, :]
861
+ if last_logit is not None:
862
+ shifted_logits[:, 0, :] = last_logit
863
+ return shifted_logits
864
+ shifted_logits[:, 0, :] = 1.0
865
+ return shifted_logits
866
+
867
def generate_until(self, requests: List[Instance], disable_tqdm: bool = False):
    """Generate a completion for each request and report throughput stats.

    Requests are processed in batches of ``self.batch_size``. Unless
    ``self.escape_until`` is set, each response is truncated at the first
    occurrence of any stop string.

    NOTE(review): the stop strings come from ``gen_args[0]['until']`` for
    the whole batch — this assumes every request in a batch shares the
    same 'until' list; confirm against the caller.
    """
    res = []

    # Running counters for the final throughput report.
    if not hasattr(self, 'total_generated_tokens'):
        self.total_generated_tokens = 0
    num_tokens = 0
    num_nfe = 0  # Number of Forward Evaluations

    pbar = tqdm(
        total=len(requests),
        disable=(disable_tqdm or (self.rank != 0)),
        desc="Running generate_until requests",
    )

    start_time = time.time()

    for batch_idx in range(0, len(requests), self.batch_size):
        batch_requests = requests[batch_idx : batch_idx + self.batch_size]
        contexts, gen_args = zip(*[req.arguments for req in batch_requests])
        responses = self._generate_batch(contexts)
        if not self.escape_until:
            # Cut each response at the first stop sequence, if any.
            for i, r in enumerate(responses):
                for s in gen_args[0]['until']:
                    r = r.split(s)[0]
                responses[i] = r

        res.extend(responses)
        pbar.update(len(contexts))

    end_time = time.time()
    total_time = end_time - start_time

    # Accumulate statistics.
    num_tokens = self.total_generated_tokens
    # NOTE(review): NFE is estimated as diffusion_steps * num_requests,
    # not measured per sample — it overcounts if generation stops early.
    num_nfe = self.diffusion_steps * len(requests)  # Estimate NFE

    # Save final statistics.
    final_stats = {
        'processed_samples': len(requests),
        'total_samples': len(requests),
        'total_tokens': num_tokens,
        'total_nfe': num_nfe,
        'total_time': total_time,
        'tokens_per_second': num_tokens / total_time if total_time > 0 else 0,
        'nfe_per_token': num_nfe / num_tokens if num_tokens > 0 else 0,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    }

    # Persist responses and statistics, one file pair per rank.
    if self.save_dir is not None:
        import os
        os.makedirs(self.save_dir, exist_ok=True)

        # Save response results.
        save_path = os.path.join(self.save_dir, f'rank_{self.rank}_responses.jsonl')
        with open(save_path, 'w', encoding='utf-8') as f:
            for r in res:
                f.write(json.dumps(r, ensure_ascii=False) + '\n')

        # Save statistics results.
        stats_path = os.path.join(self.save_dir, f'rank_{self.rank}_final_stats.json')
        with open(stats_path, 'w', encoding='utf-8') as f:
            json.dump(final_stats, f, ensure_ascii=False, indent=2)

    # Print final statistics.
    print("\n" + "="*60)
    print("=== Final Statistics ===")
    print("="*60)
    print(f"Processed Samples: {final_stats['processed_samples']}")
    print(f"Total Samples: {final_stats['total_samples']}")
    print(f"Total Tokens: {final_stats['total_tokens']}")
    print(f"Total NFE: {final_stats['total_nfe']}")
    print(f"Total Time: {final_stats['total_time']:.4f}s")
    print(f"Tokens/Second: {final_stats['tokens_per_second']:.2f}")
    print(f"NFE/Token: {final_stats['nfe_per_token']:.4f}")
    print(f"Completion Time: {final_stats['timestamp']}")
    print("="*60)

    return res
947
+
948
+ def _forward_process(self, batch):
949
+ b, l = batch.shape
950
+ # sample from U[0, 1] following https://arxiv.org/pdf/2107.00630 I.1
951
+ u0 = torch.rand(1, device=batch.device, dtype=torch.float32)
952
+ indices = torch.arange(b, device=batch.device).float()
953
+ t = (u0 + indices / b) % 1
954
+
955
+ p_mask = (1 - self.sampling_eps) * t + self.sampling_eps
956
+
957
+ p_mask = p_mask[:, None].repeat(1, l)
958
+
959
+ mask_indices = torch.rand((b, l), device=batch.device) < p_mask
960
+ # always unmask bos and eos
961
+ mask_indices[:, 0] = False
962
+ mask_indices[:, -1] = False
963
+
964
+ noisy_batch = torch.where(mask_indices, self.mask_token_id, batch)
965
+ return noisy_batch, p_mask
966
+
967
@torch.no_grad()
def get_logits(self, batch, prompt_index):
    '''
    Forward the model and return BOS-aligned logits for the batch.

    prompt_index : 1D bool tensor, length=batch.shape[1]; marks the
        conditioning (prompt) positions. Only used when classifier-free
        guidance is enabled.
    '''
    if self.classifier_free_guidance > 1.:
        assert len(prompt_index) == batch.shape[1]
        prompt_index = prompt_index.unsqueeze(0).repeat(batch.shape[0], 1)
        # Unconditional branch: same batch with the prompt fully masked.
        un_batch = batch.clone()
        un_batch[prompt_index] = self.mask_token_id
        batch = torch.cat([batch, un_batch])

    input = batch

    with torch.amp.autocast('cuda', dtype=torch.bfloat16):
        logits = self.model(input).logits
        # since bos always unmask, the first logits will not be used
        logits = torch.cat([logits[:, :1], logits[:, :-1]], dim=1)

    if self.classifier_free_guidance > 1.:
        logits, un_logits = torch.chunk(logits, 2, dim=0)
        # NOTE(review): the guard checks self.classifier_free_guidance but
        # the combination uses self.cfg — presumably these are the same
        # value; confirm they are kept in sync.
        logits = un_logits + self.cfg * (logits - un_logits)
    return logits[:, :batch.shape[1]]
990
+
991
@torch.no_grad()
def _eval_target_nll_mc(self, prefix, target):
    """Monte-Carlo estimate of the negative log-likelihood.

    Repeatedly corrupts the sequence with the diffusion forward process
    and averages the importance-weighted cross-entropy over the masked
    positions of the scored span.

    NOTE(review): when ``prefix`` is None, ``len(prefix)`` below would
    raise — presumably callers always pass a prefix tensor; confirm.
    """
    if prefix is None:
        seq = target[None, :]
    else:
        seq = torch.concatenate([prefix, target])[None, :]
    seq = seq.repeat((self.batch_size, 1)).to(self.device)

    # prompt_index marks the conditioning span: the prefix for 'ftb'
    # (front-to-back), otherwise the target side.
    if self.log_type == 'ftb':
        prompt_index = torch.arange(seq.shape[1], device=self.device) < len(prefix)
    else:
        prompt_index = torch.arange(seq.shape[1], device=self.device) >= len(prefix)

    loss_acc = []
    for _ in range(max(self.mc_num // self.batch_size, 1)):
        perturbed_seq = seq.clone()
        perturbed_seq_, p_mask = self._forward_process(seq)
        # Only noise the scored span; the conditioning span stays clean.
        if self.log_type == 'ftb':
            perturbed_seq[:, -len(target):] = perturbed_seq_[:, -len(target):]
        elif self.log_type == 'btf':
            perturbed_seq[:, :len(prefix)] = perturbed_seq_[:, :len(prefix)]
        elif self.log_type == 'union':
            perturbed_seq = perturbed_seq_
        else:
            raise NotImplementedError(self.log_type)

        mask_indices = perturbed_seq == self.mask_token_id
        logits = self.get_logits(perturbed_seq, prompt_index)
        # Importance weighting: divide each token loss by its masking
        # probability (standard diffusion-LM NLL estimator).
        loss = F.cross_entropy(logits[mask_indices], seq[mask_indices], reduction='none') / p_mask[mask_indices]
        loss = loss.sum() / self.batch_size
        loss_acc.append(loss.item())

    return sum(loss_acc) / len(loss_acc)
1026
+
1027
@torch.no_grad()
def _eval_target_nll_ar(self, prefix, target):
    """Exact autoregressive-style NLL via progressive unmasking.

    Builds one corrupted copy of the scored span per position (a
    triangular masking pattern), batches all copies through the model,
    and sums the cross-entropy of each position predicted at the step
    where it is the first remaining mask. For log_type 'ftb' the target
    span is scored given the prefix; for 'btf' the prefix span is scored
    given the target.
    """
    prefix, target = prefix.unsqueeze(0), target.unsqueeze(0)  # 1*l1, 1*l2
    assert self.log_type in ['ftb', 'btf']
    assert self.nll_type in ['ar_ftb', 'ar_btf']

    # prompt_index marks the conditioning span.
    if self.log_type == 'ftb':
        prompt_index = torch.arange(prefix.shape[1] + target.shape[1], device=self.device) < prefix.shape[1]
    else:
        prompt_index = torch.arange(prefix.shape[1] + target.shape[1], device=self.device) >= prefix.shape[1]

    # One row per scored position.
    if self.log_type == 'ftb':
        perturbed_ = target.repeat(target.shape[1], 1).clone().contiguous()  # l2*l2
    else:
        perturbed_ = prefix.repeat(prefix.shape[1], 1).clone().contiguous()  # l1*l1

    # Triangular mask: row i masks position i and everything after it
    # ('ar_ftb') or before it ('ar_btf').
    mask_index = torch.ones((perturbed_.shape[1], perturbed_.shape[1]), dtype=torch.bool)
    if self.nll_type == 'ar_ftb':
        mask_index = torch.triu(mask_index)
    else:
        mask_index = torch.tril(mask_index)
    perturbed_[mask_index] = self.mask_token_id
    # Re-attach the clean conditioning span to every row.
    if self.log_type == 'ftb':
        perturbed_seq = torch.cat([prefix.repeat(perturbed_.shape[0], 1), perturbed_], dim=-1)
    else:
        perturbed_seq = torch.cat([perturbed_, target.repeat(perturbed_.shape[0], 1)], dim=-1)

    # Forward all rows in chunks of batch_size; keep logits on CPU.
    logits_ = []
    num = len(perturbed_seq) // self.batch_size if len(perturbed_seq) % self.batch_size == 0 else len(perturbed_seq) // self.batch_size + 1
    for i in range(num):
        end = (i + 1) * self.batch_size if (i + 1) * self.batch_size < len(perturbed_seq) else len(perturbed_seq)
        perturbed_seq_ = perturbed_seq[i * self.batch_size: end]
        perturbed_seq_ = perturbed_seq_.to(self.device)
        if len(perturbed_seq_.shape) == 1:
            perturbed_seq_ = perturbed_seq_.unsqueeze(0)
        logits = self.get_logits(perturbed_seq_, prompt_index)
        logits_.append(logits.cpu())
    logits = torch.cat(logits_, dim=0)

    # Reduce the triangle to its diagonal: row i contributes exactly the
    # prediction for position i.
    temp_index = torch.ones((perturbed_.shape[1], perturbed_.shape[1]), dtype=torch.bool)
    if self.nll_type == 'ar_ftb':
        temp_index = torch.triu(temp_index, diagonal=1)
    else:
        temp_index = torch.tril(temp_index, diagonal=-1)
    mask_index[temp_index] = False
    # Pad the selector with zeros over the conditioning span so it indexes
    # the full concatenated sequence.
    if self.log_type == 'ftb':
        logits_index = torch.cat([torch.zeros((perturbed_.shape[1], prefix.shape[1]), dtype=torch.bool), mask_index], dim=-1)
    else:
        logits_index = torch.cat([mask_index, torch.zeros((perturbed_.shape[1], target.shape[1]), dtype=torch.bool)], dim=-1)

    if self.log_type == 'ftb':
        loss = F.cross_entropy(logits[logits_index], target[0], reduction='sum').cpu().item()
    else:
        loss = F.cross_entropy(logits[logits_index], prefix[0], reduction='sum').cpu().item()
    return loss
1082
+
1083
+ def _encode_pair(self, context, continuation):
1084
+ if self.add_bos_token:
1085
+ context = self.tokenizer.bos_token + context
1086
+
1087
+ n_spaces = len(context) - len(context.rstrip())
1088
+ if n_spaces > 0:
1089
+ continuation = context[-n_spaces:] + continuation
1090
+ context = context[:-n_spaces]
1091
+
1092
+ whole_enc = self.tokenizer.encode(context + continuation) + [self.tokenizer.eos_token_id]
1093
+ context_enc = self.tokenizer.encode(context)
1094
+
1095
+ context_enc_len = len(context_enc)
1096
+ continuation_enc = whole_enc[context_enc_len:]
1097
+
1098
+ # by default truncate on the left
1099
+ cutoff_length = max(len(whole_enc) - self.max_length, 0)
1100
+ if cutoff_length > 0:
1101
+ eval_logger.warning(f"Text length {len(whole_enc)} is larger than {self.max_length}, cutoff on the left side")
1102
+ context_remain = context_enc_len-cutoff_length
1103
+ if context_remain > 0:
1104
+ context_enc = context_enc[-context_remain:]
1105
+ else:
1106
+ eval_logger.warning(f"All context (prompt) is truncated.")
1107
+ context_enc = ""
1108
+ continuation_enc = whole_enc[-self.max_length:]
1109
+ return context_enc, continuation_enc
1110
+
1111
def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
    """Compute loglikelihood for (prefix, target) requests.

    The NLL estimator is selected by ``self.nll_type``: 'mc' uses the
    Monte-Carlo masked estimator (length-normalized when
    ``self.log_type == 'union'``); 'ar_ftb'/'ar_btf' use the exact
    autoregressive-style evaluation.

    Returns:
        One (loglikelihood, is_greedy) tuple per request. Greedy-decoding
        detection is not implemented, so is_greedy is always 0.0.
    """
    def _tokenize(e):
        # Encode each pair; keep the raw text alongside for debugging.
        prefix, target = self._encode_pair(e["prefix"], e["target"])
        return {
            "prefix_text": e["prefix"],
            "target_text": e["target"],
            "prefix": prefix,
            "target": target,
        }

    # Build the dataset directly from the request args.
    # (Removed a dead `ds = []` that was immediately overwritten.)
    ds = Dataset.from_list(
        [{"prefix": req.args[0], "target": req.args[1]} for req in requests]
    )
    print(ds[0])
    ds = ds.map(_tokenize)
    ds = ds.with_format("torch")

    out = []
    with torch.no_grad():
        for elem in tqdm(ds, desc="Computing likelihood..."):
            prefix = elem["prefix"]
            target = elem["target"]
            # likelihood calculations are modified from
            # https://github.com/ML-GSAI/SMDM/blob/main/evaluate_diff.py
            if self.nll_type == 'mc':
                ll = -self._eval_target_nll_mc(prefix, target)
                if self.log_type == 'union':
                    ll = ll / (len(target) + len(prefix))
            elif self.nll_type == 'ar_ftb' or self.nll_type == 'ar_btf':
                ll = -self._eval_target_nll_ar(prefix, target)
            else:
                raise NotImplementedError(self.nll_type)

            # TODO: greedy decoding
            is_target_greedy_dec = False

            out.append((ll, 1.0 if is_target_greedy_dec else 0.0))
    return out
1148
+
1149
def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
    # Rolling (windowed) loglikelihood is not supported by this wrapper.
    raise NotImplementedError
1151
+
1152
+
1153
if __name__ == "__main__":
    # Fix all RNG seeds for reproducible evaluation, then hand control to
    # the lm-eval-harness CLI entry point.
    set_seed(1234)
    cli_evaluate()
eval_dream.sh ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Per-task evaluation configuration. All variables are parallel
# space-separated lists: index i configures task i.
tasks="gsm8k_cot mbpp minerva_math"
nshots="8 3 4"
lengths="256 256 256"
temperatures="0 0 0"
limits="10000 10000 10000"
block_sizes="32 48 64"
block_add_thresholds="0.1 0.1 0.1"
decoded_token_thresholds="0.95 0.95 0.95"
skip_thresholds="0.9 0.9 0.9"
# "none" disables top-p sampling for that task.
top_ps="none none none"
dtypes="bfloat16 bfloat16 bfloat16"
sampling_strategies="default default default"



# HumanEval has its own configuration and is run in a separate loop below.
humaneval_nshots="0"
humaneval_lengths="256"
humaneval_temperatures="0"
humaneval_limits="10000"
humaneval_diffusion_steps="256"
humaneval_block_sizes="32"
humaneval_block_add_thresholds="0.9"
humaneval_decoded_token_thresholds="0.95"
humaneval_skip_thresholds="0.95"
humaneval_top_ps="none"
humaneval_dtypes="bfloat16"
humaneval_sampling_strategies="default"



# Base model plus the LoRA adapters to evaluate on top of it.
base_model=Dream-org/Dream-v0-Base-7B


lora_models=(
    "SJTU-Deng-Lab/D2F_Dream_Base_7B_Lora"
)
37
+
38
+
39
# Split the space-separated configuration strings into bash arrays.
read -ra TASKS_ARRAY <<< "$tasks"
read -ra NSHOTS_ARRAY <<< "$nshots"
read -ra LENGTH_ARRAY <<< "$lengths"
read -ra TEMP_ARRAY <<< "$temperatures"
read -ra LIMITS_ARRAY <<< "$limits"
read -ra BLOCK_SIZES_ARRAY <<< "$block_sizes"
read -ra BLOCK_ADD_THRESHOLDS_ARRAY <<< "$block_add_thresholds"
read -ra DECODED_TOKEN_THRESHOLDS_ARRAY <<< "$decoded_token_thresholds"
read -ra SKIP_THRESHOLDS_ARRAY <<< "$skip_thresholds"
read -ra TOP_PS_ARRAY <<< "$top_ps"
read -ra DTYPES_ARRAY <<< "$dtypes"
read -ra SAMPLING_STRATEGIES_ARRAY <<< "$sampling_strategies"


# Same for the HumanEval-specific configuration.
read -ra HUMANEVAL_NSHOTS_ARRAY <<< "$humaneval_nshots"
read -ra HUMANEVAL_LENGTHS_ARRAY <<< "$humaneval_lengths"
read -ra HUMANEVAL_TEMP_ARRAY <<< "$humaneval_temperatures"
read -ra HUMANEVAL_LIMITS_ARRAY <<< "$humaneval_limits"
read -ra HUMANEVAL_DIFFUSION_STEPS_ARRAY <<< "$humaneval_diffusion_steps"
read -ra HUMANEVAL_BLOCK_SIZES_ARRAY <<< "$humaneval_block_sizes"
read -ra HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY <<< "$humaneval_block_add_thresholds"
read -ra HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY <<< "$humaneval_decoded_token_thresholds"
read -ra HUMANEVAL_SKIP_THRESHOLDS_ARRAY <<< "$humaneval_skip_thresholds"
read -ra HUMANEVAL_TOP_PS_ARRAY <<< "$humaneval_top_ps"
read -ra HUMANEVAL_DTYPES_ARRAY <<< "$humaneval_dtypes"
read -ra HUMANEVAL_SAMPLING_STRATEGIES_ARRAY <<< "$humaneval_sampling_strategies"
65
+
66
+
67
# Sanity check: every per-task array must have one entry per task.
array_length=${#TASKS_ARRAY[@]}
if [[ ${#NSHOTS_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#LENGTH_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#TEMP_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#LIMITS_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#BLOCK_SIZES_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#BLOCK_ADD_THRESHOLDS_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#DECODED_TOKEN_THRESHOLDS_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#SKIP_THRESHOLDS_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#TOP_PS_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#SAMPLING_STRATEGIES_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#DTYPES_ARRAY[@]} -ne $array_length ]]; then
    echo "Error: All configuration arrays must have the same length!"
    echo "Tasks: ${#TASKS_ARRAY[@]}, Nshots: ${#NSHOTS_ARRAY[@]}, Lengths: ${#LENGTH_ARRAY[@]}, Temperatures: ${#TEMP_ARRAY[@]}, Limits: ${#LIMITS_ARRAY[@]}, Block sizes: ${#BLOCK_SIZES_ARRAY[@]}, Block thresholds: ${#BLOCK_ADD_THRESHOLDS_ARRAY[@]}, Decoded token thresholds: ${#DECODED_TOKEN_THRESHOLDS_ARRAY[@]}, Skip thresholds: ${#SKIP_THRESHOLDS_ARRAY[@]}, Top_ps: ${#TOP_PS_ARRAY[@]}, Sampling strategies: ${#SAMPLING_STRATEGIES_ARRAY[@]}, Dtypes: ${#DTYPES_ARRAY[@]}"
    exit 1
fi


# Same sanity check for the HumanEval configuration arrays.
humaneval_array_length=${#HUMANEVAL_NSHOTS_ARRAY[@]}
if [[ ${#HUMANEVAL_LENGTHS_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_TEMP_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_LIMITS_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_DIFFUSION_STEPS_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_BLOCK_SIZES_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_SKIP_THRESHOLDS_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_TOP_PS_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_DTYPES_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[@]} -ne $humaneval_array_length ]]; then
    echo "Error: All HumanEval configuration arrays must have the same length!"
    echo "HumanEval Nshots: ${#HUMANEVAL_NSHOTS_ARRAY[@]}, Lengths: ${#HUMANEVAL_LENGTHS_ARRAY[@]}, Temperatures: ${#HUMANEVAL_TEMP_ARRAY[@]}, Limits: ${#HUMANEVAL_LIMITS_ARRAY[@]}, Diffusion steps: ${#HUMANEVAL_DIFFUSION_STEPS_ARRAY[@]}, Block sizes: ${#HUMANEVAL_BLOCK_SIZES_ARRAY[@]}, Block thresholds: ${#HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[@]}, Decoded token thresholds: ${#HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[@]}, Skip thresholds: ${#HUMANEVAL_SKIP_THRESHOLDS_ARRAY[@]}, Top_ps: ${#HUMANEVAL_TOP_PS_ARRAY[@]}, Dtypes: ${#HUMANEVAL_DTYPES_ARRAY[@]}, Sampling strategies: ${#HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[@]}"
    exit 1
fi
101
+
102
# Allow lm-eval-harness to execute generated code (needed for code tasks).
export HF_ALLOW_CODE_EVAL=1
# Outer loop: evaluate each LoRA adapter on top of the shared base model.
for lora_model in "${lora_models[@]}"; do
    lora_model_name="$lora_model"
    echo "===================================================================="
    echo "Evaluating LoRA model: $lora_model_name"
    echo "===================================================================="



    # --- HumanEval runs (separate config; no --limit flag) ---
    for i in "${!HUMANEVAL_NSHOTS_ARRAY[@]}"; do
        # Output directory name encodes the full configuration.
        output_path="evals_dream${lora_model_name}/humaneval-ns${HUMANEVAL_NSHOTS_ARRAY[$i]}-len${HUMANEVAL_LENGTHS_ARRAY[$i]}-temp${HUMANEVAL_TEMP_ARRAY[$i]}-limit${HUMANEVAL_LIMITS_ARRAY[$i]}-diffsteps${HUMANEVAL_DIFFUSION_STEPS_ARRAY[$i]}-block${HUMANEVAL_BLOCK_SIZES_ARRAY[$i]}-thresh${HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[$i]}-decodethresh${HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[$i]}-skip${HUMANEVAL_SKIP_THRESHOLDS_ARRAY[$i]}-topp${HUMANEVAL_TOP_PS_ARRAY[$i]}-dtype${HUMANEVAL_DTYPES_ARRAY[$i]}-sampling${HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[$i]}"
        echo "Running HumanEval evaluation $((i+1))/${humaneval_array_length} for $lora_model_name..."
        echo "HumanEval Config: Shots: ${HUMANEVAL_NSHOTS_ARRAY[$i]}, Length: ${HUMANEVAL_LENGTHS_ARRAY[$i]}, Temperature: ${HUMANEVAL_TEMP_ARRAY[$i]}, Limit: ${HUMANEVAL_LIMITS_ARRAY[$i]}, Diffusion Steps: ${HUMANEVAL_DIFFUSION_STEPS_ARRAY[$i]}, Block Size: ${HUMANEVAL_BLOCK_SIZES_ARRAY[$i]}, Block Add Threshold: ${HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[$i]}, Decoded Token Threshold: ${HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[$i]}, Skip Threshold: ${HUMANEVAL_SKIP_THRESHOLDS_ARRAY[$i]}, Top_p: ${HUMANEVAL_TOP_PS_ARRAY[$i]}, Sampling Strategy: ${HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[$i]}, Dtype: ${HUMANEVAL_DTYPES_ARRAY[$i]}; Output: $output_path"

        # Omit the top_p argument entirely when it is "none".
        if [[ "${HUMANEVAL_TOP_PS_ARRAY[$i]}" == "none" ]]; then
            humaneval_model_args="pretrained=${base_model},lora_path=${lora_model},max_new_tokens=${HUMANEVAL_LENGTHS_ARRAY[$i]},diffusion_steps=${HUMANEVAL_DIFFUSION_STEPS_ARRAY[$i]},temperature=${HUMANEVAL_TEMP_ARRAY[$i]},add_bos_token=true,escape_until=true,block_size=${HUMANEVAL_BLOCK_SIZES_ARRAY[$i]},block_add_threshold=${HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[$i]},skip_threshold=${HUMANEVAL_SKIP_THRESHOLDS_ARRAY[$i]},decoded_token_threshold=${HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[$i]},dtype=${HUMANEVAL_DTYPES_ARRAY[$i]},sampling_strategy=${HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[$i]},save_dir=${output_path}"
        else
            humaneval_model_args="pretrained=${base_model},lora_path=${lora_model},max_new_tokens=${HUMANEVAL_LENGTHS_ARRAY[$i]},diffusion_steps=${HUMANEVAL_DIFFUSION_STEPS_ARRAY[$i]},temperature=${HUMANEVAL_TEMP_ARRAY[$i]},top_p=${HUMANEVAL_TOP_PS_ARRAY[$i]},add_bos_token=true,escape_until=true,block_size=${HUMANEVAL_BLOCK_SIZES_ARRAY[$i]},block_add_threshold=${HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[$i]},skip_threshold=${HUMANEVAL_SKIP_THRESHOLDS_ARRAY[$i]},decoded_token_threshold=${HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[$i]},dtype=${HUMANEVAL_DTYPES_ARRAY[$i]},sampling_strategy=${HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[$i]},save_dir=${output_path}"
        fi

        CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --main_process_port 29520 --num_processes 8 eval_dream.py --model dream_lora \
            --model_args $humaneval_model_args \
            --tasks humaneval \
            --num_fewshot ${HUMANEVAL_NSHOTS_ARRAY[$i]} \
            --batch_size 1 \
            --output_path $output_path \
            --log_samples \
            --confirm_run_unsafe_code
    done

    ### NOTICE: use postprocess for humaneval
    # python postprocess_code.py {the samples_xxx.jsonl file under output_path}


    # --- Regular task runs (gsm8k_cot, mbpp, minerva_math, ...) ---
    # Note: diffusion_steps is set equal to the generation length here.
    for i in "${!TASKS_ARRAY[@]}"; do
        output_path="evals_dream${lora_model_name}/${TASKS_ARRAY[$i]}-ns${NSHOTS_ARRAY[$i]}-len${LENGTH_ARRAY[$i]}-temp${TEMP_ARRAY[$i]}-limit${LIMITS_ARRAY[$i]}-diffsteps${LENGTH_ARRAY[$i]}-block${BLOCK_SIZES_ARRAY[$i]}-thresh${BLOCK_ADD_THRESHOLDS_ARRAY[$i]}-decodethresh${DECODED_TOKEN_THRESHOLDS_ARRAY[$i]}-skip${SKIP_THRESHOLDS_ARRAY[$i]}-topp${TOP_PS_ARRAY[$i]}-dtype${DTYPES_ARRAY[$i]}-sampling${SAMPLING_STRATEGIES_ARRAY[$i]}"
        echo "Task: ${TASKS_ARRAY[$i]}, Shots: ${NSHOTS_ARRAY[$i]}, Length: ${LENGTH_ARRAY[$i]}, Temperature: ${TEMP_ARRAY[$i]}, Limit: ${LIMITS_ARRAY[$i]}, Block Size: ${BLOCK_SIZES_ARRAY[$i]}, Block Add Threshold: ${BLOCK_ADD_THRESHOLDS_ARRAY[$i]}, Decoded Token Threshold: ${DECODED_TOKEN_THRESHOLDS_ARRAY[$i]}, Skip Threshold: ${SKIP_THRESHOLDS_ARRAY[$i]}, Top_p: ${TOP_PS_ARRAY[$i]}, Sampling Strategy: ${SAMPLING_STRATEGIES_ARRAY[$i]}, Dtype: ${DTYPES_ARRAY[$i]}; Output: $output_path"

        # Omit the top_p argument entirely when it is "none".
        if [[ "${TOP_PS_ARRAY[$i]}" == "none" ]]; then
            model_args="pretrained=${base_model},lora_path=${lora_model},max_new_tokens=${LENGTH_ARRAY[$i]},diffusion_steps=${LENGTH_ARRAY[$i]},add_bos_token=true,temperature=${TEMP_ARRAY[$i]},block_size=${BLOCK_SIZES_ARRAY[$i]},block_add_threshold=${BLOCK_ADD_THRESHOLDS_ARRAY[$i]},skip_threshold=${SKIP_THRESHOLDS_ARRAY[$i]},decoded_token_threshold=${DECODED_TOKEN_THRESHOLDS_ARRAY[$i]},dtype=${DTYPES_ARRAY[$i]},sampling_strategy=${SAMPLING_STRATEGIES_ARRAY[$i]},save_dir=${output_path}"
        else
            model_args="pretrained=${base_model},lora_path=${lora_model},max_new_tokens=${LENGTH_ARRAY[$i]},diffusion_steps=${LENGTH_ARRAY[$i]},add_bos_token=true,temperature=${TEMP_ARRAY[$i]},top_p=${TOP_PS_ARRAY[$i]},block_size=${BLOCK_SIZES_ARRAY[$i]},block_add_threshold=${BLOCK_ADD_THRESHOLDS_ARRAY[$i]},skip_threshold=${SKIP_THRESHOLDS_ARRAY[$i]},decoded_token_threshold=${DECODED_TOKEN_THRESHOLDS_ARRAY[$i]},dtype=${DTYPES_ARRAY[$i]},sampling_strategy=${SAMPLING_STRATEGIES_ARRAY[$i]},save_dir=${output_path}"
        fi

        CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --main_process_port 29520 --num_processes 8 eval_dream.py --model dream_lora \
            --model_args $model_args \
            --tasks ${TASKS_ARRAY[$i]} \
            --limit ${LIMITS_ARRAY[$i]} \
            --num_fewshot ${NSHOTS_ARRAY[$i]} \
            --batch_size 1 \
            --output_path $output_path \
            --log_samples \
            --confirm_run_unsafe_code
    done
done

echo "All evaluations completed!"
eval_dream_d2f_vllm.py ADDED
@@ -0,0 +1,764 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import gc
3
+ import time
4
+ import json
5
+ from datetime import timedelta
6
+ from typing import List, Optional, Tuple, Type, TypeVar, Union
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import torch.distributions as dists
10
+ import transformers
11
+ from accelerate import (
12
+ Accelerator,
13
+ InitProcessGroupKwargs,
14
+ )
15
+ from datasets import Dataset
16
+ from packaging import version
17
+ from tqdm import tqdm
18
+ from peft import PeftConfig, PeftModel
19
+ import numpy as np
20
+
21
+ from lm_eval import utils
22
+ from lm_eval.api.instance import Instance
23
+ from lm_eval.api.model import LM
24
+ from lm_eval.api.registry import register_model
25
+ from lm_eval.models.utils import get_dtype
26
+ from lm_eval.__main__ import cli_evaluate
27
+
28
+ eval_logger = logging.getLogger(__name__)
29
+ T = TypeVar("T", bound="LM")
30
+ import random
31
def set_seed(seed):
    """Make runs reproducible: seed all RNGs and pin cuDNN to deterministic mode."""
    for seeder in (torch.manual_seed, random.seed, np.random.seed):
        seeder(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
38
+
39
def shift_logits(logits):
    """Shift logits right by one along dim 1; position 0 is filled with 1.0."""
    out = torch.zeros_like(logits)
    out[:, 0, :] = 1.0
    out[:, 1:, :] = logits[:, :-1, :]
    return out
44
+
45
def create_full_block_attention_mask(prompt_length, max_length, block_size, device=None, dtype=None):
    """
    Build a block-causal additive attention mask for the whole sequence.

    The prompt (positions [0, prompt_length)) attends to itself. Each
    subsequent block of ``block_size`` tokens attends to the prompt,
    every earlier block, and itself — i.e. to every position before its
    own block end. Allowed positions are 0, disallowed are -inf.

    Args:
        prompt_length: Length of the prompt (first irregular block)
        max_length: Maximum total sequence length
        block_size: Size of each regular block
        device: Device to create tensor on
        dtype: Data type for the attention mask (defaults to bfloat16)

    Returns:
        attention_mask: Tensor of shape [1, 1, max_length, max_length]
    """
    if dtype is None:
        dtype = torch.bfloat16

    # Initialize mask with -inf (no attention).
    attention_mask = torch.full((1, 1, max_length, max_length), -torch.inf, device=device, dtype=dtype)

    # Block 0: prompt attends to itself.
    attention_mask[:, :, :prompt_length, :prompt_length] = 0

    # Number of regular blocks after the prompt (ceiling division).
    remaining_length = max_length - prompt_length
    num_blocks = (remaining_length + block_size - 1) // block_size

    # The prompt, all previous blocks, and the current block occupy the
    # contiguous range [0, block_end), so each block's visibility is a
    # single slice assignment. (The original per-previous-block inner
    # loop performed O(num_blocks**2) slice writes with the same result.)
    for b in range(num_blocks):
        block_start = prompt_length + b * block_size
        block_end = min(block_start + block_size, max_length)
        attention_mask[:, :, block_start:block_end, :block_end] = 0

    return attention_mask
91
+
92
def extract_attention_mask(full_mask, start_pos, input_length, cache_length):
    """
    Slice the precomputed full attention mask for one forward pass.

    Rows are the `input_length` tokens currently fed in (absolute positions
    `start_pos .. start_pos + input_length`); columns cover the cached
    positions followed by the current input positions.

    NOTE(review): the current-input columns are copied from
    full_mask[..., start_pos:end_pos], which lines up with the output layout
    only when cache_length == start_pos — this appears to be the caller's
    invariant; confirm.

    Returns:
        attention_mask of shape [1, 1, input_length, cache_length + input_length]
    """
    end_pos = start_pos + input_length
    width = cache_length + input_length

    piece = torch.full(
        (1, 1, input_length, width),
        -torch.inf,
        device=full_mask.device,
        dtype=full_mask.dtype,
    )

    rows = full_mask[:, :, start_pos:end_pos, :]
    # Columns for the cached prefix, then columns for the current input.
    piece[:, :, :, :cache_length] = rows[:, :, :, :cache_length]
    piece[:, :, :, cache_length:] = rows[:, :, :, start_pos:end_pos]
    return piece
120
+
121
def build_custom_float_attention_mask(input_ids, prompt_length, block_size, device=None, dtype=None):
    """Build a per-sample additive attention mask (0 = attend, -inf = blocked).

    Every token may attend to its sample's full prompt; positions after the
    prompt are grouped into blocks of `block_size` with full attention inside
    a block and causal attention across blocks.

    Args:
        input_ids: [B, seq_len] token ids (only the shape is used).
        prompt_length: per-sample prompt lengths, indexable as prompt_length[i].
        block_size: size of each regular block after the prompt.
        device, dtype: placement/type of the mask (dtype defaults to float32).

    Returns:
        attn_mask of shape [B, 1, seq_len, seq_len].
    """
    B, seq_len = input_ids.shape
    if dtype is None:
        dtype = torch.float32
    # Start fully blocked; open allowed regions per sample.
    attn_mask = torch.full((B, 1, seq_len, seq_len), float('-inf'), dtype=dtype, device=device)
    for i in range(B):
        p = int(prompt_length[i])
        # 1. Prompt part: every token can attend to the entire prompt.
        attn_mask[i, :, :, :p] = 0.0

        # 2. Block division starting at the prompt boundary. A block attends
        # to the prompt, all earlier blocks, and itself — i.e. to every
        # column up to its own end, so one slice per block suffices
        # (replaces the original O(num_blocks^2) previous-block loop).
        num_blocks = (seq_len - p + block_size - 1) // block_size
        for b in range(num_blocks):
            block_start = p + b * block_size
            block_end = min(block_start + block_size, seq_len)
            attn_mask[i, :, block_start:block_end, :block_end] = 0.0

    return attn_mask
151
+
152
def top_p_logits(logits, top_p=None):
    """Nucleus (top-p) filtering.

    Keeps the smallest set of highest-probability tokens whose cumulative
    probability exceeds `top_p`; every other logit is set to the dtype's
    minimum value so it can never be sampled.
    """
    desc_logits, desc_idx = torch.sort(logits, descending=True)
    cum_probs = F.softmax(desc_logits, dim=-1).cumsum(dim=-1)
    drop = cum_probs > top_p
    # Shift right so the first token that crosses the threshold is kept.
    drop[..., 1:] = drop[..., :-1].clone()
    drop[..., 0] = 0

    remove_mask = torch.zeros_like(logits, dtype=torch.bool, device=logits.device)
    remove_mask.scatter_(-1, desc_idx, drop)
    return logits.masked_fill(remove_mask, torch.finfo(logits.dtype).min)
164
+
165
def top_k_logits(logits, top_k=None):
    """Top-k filtering: keep only the `top_k` largest logits per row.

    Everything below the k-th largest value is set to the dtype's minimum so
    it can never be sampled.
    """
    k = min(top_k, logits.size(-1))  # guard against k larger than the vocab
    kth_value = torch.topk(logits, k)[0][..., -1, None]
    below_kth = logits < kth_value
    return logits.masked_fill(below_kth, torch.finfo(logits.dtype).min)
171
+
172
def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confidence=False, neg_entropy=False):
    """Pick one token per row and report a confidence score for it.

    With temperature > 0 the token is drawn from the (optionally top-p /
    top-k filtered) softmax distribution; otherwise the argmax is taken.
    `confidence` defaults to the chosen token's probability, but becomes the
    top1 - top2 margin when `margin_confidence` is set, or the negative
    entropy of the distribution when `neg_entropy` is set (the latter wins
    if both are set). `initial_confidence` is always the chosen token's
    probability.

    Returns:
        (confidence, x0, initial_confidence)
    """
    if temperature > 0:
        logits = logits / temperature
    if top_p is not None and top_p < 1:
        logits = top_p_logits(logits, top_p)
    if top_k is not None:
        logits = top_k_logits(logits, top_k)
    probs = torch.softmax(logits, dim=-1)

    if temperature > 0:
        try:
            x0 = dists.Categorical(probs=probs).sample()
            initial_confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
        except:  # fall back to greedy if sampling fails (e.g. degenerate probs)
            initial_confidence, x0 = probs.max(dim=-1)
    else:
        initial_confidence, x0 = probs.max(dim=-1)

    confidence = initial_confidence.clone()

    if margin_confidence:
        ranked, _ = torch.sort(probs, dim=-1, descending=True)
        # Gap between the two most likely tokens.
        confidence = ranked[:, 0] - ranked[:, 1]

    if neg_entropy:
        eps = 1e-10
        confidence = torch.sum(probs * torch.log(probs + eps), dim=-1)

    return confidence, x0, initial_confidence
207
+
208
+ @register_model("dream_lora")
209
+ class DreamLoRA(LM):
210
+ def __init__(
211
+ self,
212
+ pretrained: Union[str, transformers.PreTrainedModel],
213
+ lora_path: str,
214
+ batch_size: Optional[Union[int, str]] = 1,
215
+ device: Optional[str] = "cuda",
216
+ dtype: Optional[Union[str, torch.dtype]] = "auto",
217
+ max_new_tokens: Optional[int] = 128,
218
+ max_length: Optional[int] = 2048, # Updated to match example code
219
+ add_bos_token: Optional[bool] = False,
220
+ nll_type: Optional[str] = "mc",
221
+ log_type: Optional[str] = "ftb",
222
+ mc_num: Optional[int] = 128,
223
+ classifier_free_guidance: Optional[float] = 1.0,
224
+ sampling_eps: Optional[float] = 1e-3,
225
+ diffusion_steps: Optional[int] = 128,
226
+ trust_remote_code: Optional[bool] = True,
227
+ parallelize: Optional[bool] = False,
228
+ autogptq: Optional[Union[bool, str]] = False,
229
+ temperature: Optional[float] = 0.2, # Updated default
230
+ top_p: Optional[float] = None, # Updated default
231
+ top_k: Optional[float] = None,
232
+ alg: Optional[str] = "entropy",
233
+ alg_temp: Optional[float] = 0.0,
234
+ escape_until: Optional[bool] = False,
235
+ block_size: Optional[int] = 4, # Updated to match example code
236
+ mask_token_id: Optional[int] = 151666, # Added mask_token_id parameter
237
+ block_add_threshold: Optional[float] = 0.5, # Added block_add_threshold parameter
238
+ decoded_token_threshold: Optional[int] = 0.9, # Added decoded_token_threshold parameter
239
+ skip_threshold: Optional[float] = 1.0, # Added skip_threshold parameter
240
+ sampling_strategy: Optional[str] = "default", # Added sampling_strategy parameter
241
+ save_dir: Optional[str] = None,
242
+ **kwargs,
243
+ ) -> None:
244
+ super().__init__()
245
+
246
+ # prepare for parallelism
247
+ assert isinstance(device, str)
248
+ assert isinstance(pretrained, str)
249
+ assert isinstance(batch_size, (int, str))
250
+
251
+ gpus = torch.cuda.device_count()
252
+ accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
253
+ accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
254
+ if accelerator.num_processes > 1:
255
+ self.accelerator = accelerator
256
+
257
+ if "npu" in accelerator.device.type:
258
+ gpus = torch.npu.device_count()
259
+
260
+ # using one process with no model parallelism
261
+ if not (parallelize or accelerator.num_processes > 1):
262
+ # use user-passed device
263
+ device_list = set(
264
+ ["cuda", "cpu"]
265
+ + [f"cuda:{i}" for i in range(gpus)]
266
+ + ["mps", "mps:0"]
267
+ + [f"npu:{i}" for i in range(gpus)]
268
+ )
269
+ if device and device in device_list:
270
+ self._device = torch.device(device)
271
+ eval_logger.info(f"Using device '{device}'")
272
+ if device in ("mps", "mps:0") and version.parse(
273
+ torch.__version__
274
+ ) < version.parse("2.1"):
275
+ raise RuntimeError(
276
+ f"mps requires torch >= 2.1. You have {torch.__version__}"
277
+ )
278
+ else:
279
+ eval_logger.info("Device not specified")
280
+ eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
281
+ self._device = (
282
+ torch.device("cuda")
283
+ if torch.cuda.is_available()
284
+ else torch.device("cpu")
285
+ )
286
+ else: # Parallelism managed by accelerate
287
+ if device != "cuda":
288
+ eval_logger.info(
289
+ f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
290
+ )
291
+ # TODO: include in warning that `load_in_8bit` etc. affect this too
292
+ self._device = (
293
+ self.accelerator.device
294
+ if hasattr(self, "accelerator")
295
+ else torch.device(device)
296
+ )
297
+
298
+ self.batch_size_per_gpu = batch_size
299
+ if isinstance(batch_size, str):
300
+ self.batch_size_per_gpu = int(batch_size)
301
+
302
+ # Save LoRA path and block_size
303
+ self.lora_path = lora_path
304
+ self.block_size = block_size
305
+ self.block_add_threshold = block_add_threshold # New block_add_threshold attribute
306
+ self.skip_threshold = skip_threshold # New skip_threshold attribute
307
+ self.sampling_strategy = sampling_strategy # Save sampling strategy parameter
308
+ self.decoded_token_threshold = decoded_token_threshold # New decoded_token_threshold attribute
309
+ self.save_dir = save_dir
310
+
311
+ # Add metric tracking
312
+ self.total_forward_passes = 0
313
+ self.total_generated_tokens = 0
314
+ self.total_prompts = 0
315
+ # Add time and token statistics
316
+ self.total_generation_time = 0.0
317
+ self.total_block_tokens = 0 # Number of blocks * block_size
318
+ self.total_actual_tokens = 0 # Actual generated tokens (excluding EOS)
319
+ self.total_non_eos_tokens = 0 # Total non-EOS tokens in the entire sequence
320
+ self.all_generation_times = []
321
+ self.all_block_tokens = []
322
+ self.all_actual_tokens = []
323
+ self.all_non_eos_tokens = []
324
+
325
+ # Save target_dtype for later use
326
+ self.target_dtype = get_dtype(dtype)
327
+
328
+ # if isinstance(pretrained, str):
329
+ # if gpus >= 1 or str(self.device) == "mps":
330
+ # # TODO: can remove this whole snippet except in the mps case, perhaps?
331
+ # if not (parallelize or autogptq or hasattr(self, "accelerator")):
332
+ # # place model onto device requested manually,
333
+ # # if not using HF Accelerate or device_map
334
+ # # or any other option that preloads model onto device
335
+ # try:
336
+ # self.model.to(self.device)
337
+ # except ValueError:
338
+ # eval_logger.debug(
339
+ # "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
340
+ # )
341
+ # # multigpu data-parallel support when launched with accelerate
342
+ # if gpus > 1:
343
+ # if accelerator.num_processes > 1:
344
+ # if parallelize:
345
+ # eval_logger.warning(
346
+ # "You are both using a HF Accelerate `device_map` (`--model_args parallelize=True`) and launching via `accelerate launch`. This will attempt to do model and data parallelism depending on the resources available."
347
+ # )
348
+ # elif gpus > accelerator.num_processes:
349
+ # eval_logger.warning(
350
+ # "WARNING: The number of total system GPUs does not match the number of spawned processes. "
351
+ # "If you would like to use data parallelism, please launch the script "
352
+ # "with 'accelerate launch *script*'. "
353
+ # f"Current run will proceed with {accelerator.num_processes} devices."
354
+ # )
355
+ # if self.accelerator.is_local_main_process:
356
+ # eval_logger.info(
357
+ # f"Using {gpus} devices with data parallelism"
358
+ # )
359
+
360
+ # self._device = torch.device(f"{accelerator.device}")
361
+ # self.accelerator = accelerator
362
+
363
+ # self._rank = self.accelerator.local_process_index
364
+ # self._world_size = self.accelerator.num_processes
365
+ # else:
366
+ # # if we aren't launching via accelerate, ditch
367
+ # self._rank = 0
368
+ # self._world_size = 1
369
+ # else:
370
+ # # if a PreTrainedModel was passed into HFLM, we forgo distributed setup.
371
+ # eval_logger.warning(
372
+ # "Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration"
373
+ # )
374
+ # self._rank = 0
375
+ # self._world_size = 1
376
+
377
+ self.max_length = max_length
378
+ self.add_bos_token = add_bos_token
379
+ # generation params
380
+ self.max_new_tokens = max_new_tokens
381
+ self.diffusion_steps = diffusion_steps
382
+ self.temperature = temperature
383
+ self.top_p = top_p
384
+ self.top_k = top_k
385
+ self.alg = alg
386
+ self.alg_temp = alg_temp
387
+ self.escape_until = escape_until
388
+ self.block_size = block_size
389
+ self.mask_token_id = mask_token_id
390
+
391
+ # loglikelihood params
392
+ self.nll_type = nll_type
393
+ self.log_type = log_type
394
+ self.mc_num = mc_num
395
+ self.classifier_free_guidance = classifier_free_guidance
396
+ self.sampling_eps = sampling_eps
397
+
398
+ self._create_model_and_tokenizer(pretrained, dtype, trust_remote_code)
399
+
400
    @property
    def batch_size(self):
        # Per-GPU batch size used to chunk likelihood evaluation.
        return self.batch_size_per_gpu
403
+
404
    @property
    def device(self):
        # torch.device resolved in __init__ (user-requested or accelerator's).
        return self._device
407
+
408
    @property
    def rank(self):
        # Data-parallel process index; used to name per-rank output files.
        # NOTE(review): `_rank` is not assigned anywhere in this class's
        # visible code — presumably the lm-eval `LM` base sets it; verify.
        return self._rank
411
+
412
    @property
    def world_size(self):
        # Total number of data-parallel processes.
        # NOTE(review): `_world_size` is not assigned anywhere in this class's
        # visible code — presumably the lm-eval `LM` base sets it; verify.
        return self._world_size
415
+
416
    def _create_model_and_tokenizer(self, pretrained, dtype, trust_remote_code):
        """Instantiate the d2f_vllm engine (Dream diffusion LM + LoRA) and tokenizer.

        Note: `dtype` and `trust_remote_code` are accepted but not forwarded;
        the engine-side settings below are hard-coded.
        """
        from d2f_vllm import LLM, SamplingParams

        self.LLM = LLM(
            pretrained,
            lora_path=self.lora_path,
            use_lora=True,
            model_name="dream",
            model_type="diffusion_lm",
            enforce_eager=True,
            tensor_parallel_size=1,
            gpu_memory_utilization=0.60,
            max_num_batched_tokens=2048,
            max_num_seqs=20,
            max_model_len=1024,
            accept_threshold=self.skip_threshold,
            complete_threshold=self.decoded_token_threshold,
            # NOTE(review): the engine receives the complement of
            # block_add_threshold — confirm this inversion is intended.
            add_new_block_threshold=1-self.block_add_threshold,
            kv_cache_layout="unified"
        )
        self.tokenizer = self.LLM.tokenizer
        # One SamplingParams reused for every generate_until call.
        self.sampling_params = SamplingParams(temperature=self.temperature, max_tokens=self.max_new_tokens)
438
+
439
+
440
+ def tok_decode(self, tokens, skip_special_tokens=True):
441
+ return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
442
+
443
+ def tok_encode(self, text, add_special_tokens=True):
444
+ return self.tokenizer(
445
+ text, return_tensors="pt", add_special_tokens=add_special_tokens
446
+ ).input_ids
447
+
448
+ @classmethod
449
+ def create_from_arg_string(
450
+ cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
451
+ ) -> T:
452
+ """
453
+ Creates an instance of the LM class using the given argument string and additional config.
454
+
455
+ Parameters:
456
+ - arg_string: A string containing arguments in the format key1=value1,key2=value2.
457
+ - additional_config: Optional dictionary containing additional configuration parameters.
458
+
459
+ Returns:
460
+ - Instance of the LM class.
461
+ """
462
+ additional_config = {} if additional_config is None else additional_config
463
+ args = utils.simple_parse_args_string(arg_string)
464
+ args2 = {k: v for k, v in additional_config.items() if v is not None}
465
+ return cls(**args, **args2)
466
+
467
+ def apply_chat_template(
468
+ self, chat_history, add_generation_prompt: bool = True
469
+ ) -> str:
470
+ """
471
+ Method to apply a chat template to a list of chat history between user and model.
472
+ """
473
+ chat_templated = self.tokenizer.apply_chat_template(
474
+ chat_history,
475
+ tokenize=False,
476
+ add_generation_prompt=add_generation_prompt,
477
+ continue_final_message=not add_generation_prompt,
478
+ )
479
+
480
+ return chat_templated
481
+
482
+ @property
483
+ def tokenizer_name(self) -> str:
484
+ return self.tokenizer.name_or_path.replace("/", "__")
485
+
486
+ def generate_until(self, requests: List[Instance], disable_tqdm: bool = False):
487
+ res = []
488
+
489
+ # Initialize statistics counters
490
+ if not hasattr(self, 'total_generated_tokens'):
491
+ self.total_generated_tokens = 0
492
+ num_tokens = 0
493
+ num_nfe = 0 # Number of Forward Evaluations
494
+
495
+ prompts, gen_args = [], []
496
+ print("Preparing prompts...")
497
+ for req in tqdm(requests):
498
+ prompts.append(self.tokenizer.bos_token + req.arguments[0])
499
+ gen_args.append(req.arguments[1])
500
+
501
+ start_time = time.time()
502
+
503
+ outputs = self.LLM.generate(prompts, self.sampling_params)
504
+
505
+ end_time = time.time()
506
+ total_time = end_time - start_time
507
+
508
+ # Accumulate statistics
509
+ res = [output['text'] for output in outputs]
510
+ num_tokens = sum(len(output['token_ids']) for output in outputs)
511
+ num_nfe = sum(output['n_diff_steps'] for output in outputs)
512
+
513
+ # Save final statistics
514
+ final_stats = {
515
+ 'processed_samples': len(requests),
516
+ 'total_samples': len(requests),
517
+ 'total_tokens': num_tokens,
518
+ 'total_nfe': num_nfe,
519
+ 'total_time': total_time,
520
+ 'tokens_per_second': num_tokens / total_time if total_time > 0 else 0,
521
+ 'nfe_per_token': num_nfe / num_tokens if num_tokens > 0 else 0,
522
+ 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
523
+ }
524
+
525
+ # Save statistics to file
526
+ if self.save_dir is not None:
527
+ import os
528
+ os.makedirs(self.save_dir, exist_ok=True)
529
+
530
+ # Save response results
531
+ save_path = os.path.join(self.save_dir, f'rank_{self.rank}_responses.jsonl')
532
+ with open(save_path, 'w', encoding='utf-8') as f:
533
+ for r in res:
534
+ f.write(json.dumps(r, ensure_ascii=False) + '\n')
535
+
536
+ # Save statistics results
537
+ stats_path = os.path.join(self.save_dir, f'rank_{self.rank}_final_stats.json')
538
+ with open(stats_path, 'w', encoding='utf-8') as f:
539
+ json.dump(final_stats, f, ensure_ascii=False, indent=2)
540
+
541
+ # Print final statistics
542
+ print("\n" + "="*60)
543
+ print("=== Final Statistics ===")
544
+ print("="*60)
545
+ print(f"Processed Samples: {final_stats['processed_samples']}")
546
+ print(f"Total Samples: {final_stats['total_samples']}")
547
+ print(f"Total Tokens: {final_stats['total_tokens']}")
548
+ print(f"Total NFE: {final_stats['total_nfe']}")
549
+ print(f"Total Time: {final_stats['total_time']:.4f}s")
550
+ print(f"Tokens/Second: {final_stats['tokens_per_second']:.2f}")
551
+ print(f"NFE/Token: {final_stats['nfe_per_token']:.4f}")
552
+ print(f"Completion Time: {final_stats['timestamp']}")
553
+ print("="*60)
554
+
555
+ return res
556
+
557
+ def _forward_process(self, batch):
558
+ b, l = batch.shape
559
+ # sample from U[0, 1] following https://arxiv.org/pdf/2107.00630 I.1
560
+ u0 = torch.rand(1, device=batch.device, dtype=torch.float32)
561
+ indices = torch.arange(b, device=batch.device).float()
562
+ t = (u0 + indices / b) % 1
563
+
564
+ p_mask = (1 - self.sampling_eps) * t + self.sampling_eps
565
+
566
+ p_mask = p_mask[:, None].repeat(1, l)
567
+
568
+ mask_indices = torch.rand((b, l), device=batch.device) < p_mask
569
+ # always unmask bos and eos
570
+ mask_indices[:, 0] = False
571
+ mask_indices[:, -1] = False
572
+
573
+ noisy_batch = torch.where(mask_indices, self.mask_token_id, batch)
574
+ return noisy_batch, p_mask
575
+
576
+ @torch.no_grad()
577
+ def get_logits(self, batch, prompt_index):
578
+ '''
579
+ prompt_index : 1D bool tensor, length=batch.shape[1]
580
+ '''
581
+ if self.classifier_free_guidance > 1.:
582
+ assert len(prompt_index) == batch.shape[1]
583
+ prompt_index = prompt_index.unsqueeze(0).repeat(batch.shape[0], 1)
584
+ un_batch = batch.clone()
585
+ un_batch[prompt_index] = self.mask_token_id
586
+ batch = torch.cat([batch, un_batch])
587
+
588
+ input = batch
589
+
590
+ with torch.amp.autocast('cuda', dtype=torch.bfloat16):
591
+ logits = self.model(input).logits
592
+ # since bos always unmask, the first logits will not be used
593
+ logits = torch.cat([logits[:,:1], logits[:, :-1]], dim=1)
594
+
595
+ if self.classifier_free_guidance > 1.:
596
+ logits, un_logits = torch.chunk(logits, 2, dim=0)
597
+ logits = un_logits + self.cfg * (logits - un_logits)
598
+ return logits[:, :batch.shape[1]]
599
+
600
    @torch.no_grad()
    def _eval_target_nll_mc(self, prefix, target):
        """Monte-Carlo estimate of the target's negative log-likelihood.

        Repeatedly noises the (prefix + target) sequence with
        `_forward_process`, scores the masked positions with the model, and
        averages the importance-weighted cross-entropy over ~`mc_num` samples
        processed in chunks of `batch_size`. Which region gets noised depends
        on `log_type`: 'ftb' noises only the target, 'btf' only the prefix,
        'union' the whole sequence.
        """
        if prefix is None:
            seq = target[None, :]
        else:
            seq = torch.concatenate([prefix, target])[None, :]
        # One identical row per batch slot; each row gets independent noise.
        seq = seq.repeat((self.batch_size, 1)).to(self.device)

        # Prompt positions (for CFG inside get_logits): prefix for 'ftb',
        # otherwise the target region.
        if self.log_type == 'ftb':
            prompt_index = torch.arange(seq.shape[1], device=self.device) < len(prefix)
        else:
            prompt_index = torch.arange(seq.shape[1], device=self.device) >= len(prefix)

        loss_acc = []
        for _ in range(max(self.mc_num // self.batch_size, 1)):
            perturbed_seq = seq.clone()
            perturbed_seq_, p_mask = self._forward_process(seq)
            # Keep the noise only in the region being scored.
            if self.log_type == 'ftb':
                perturbed_seq[:, -len(target):] = perturbed_seq_[:, -len(target):]
            elif self.log_type == 'btf':
                perturbed_seq[:, :len(prefix)] = perturbed_seq_[:, :len(prefix)]
            elif self.log_type == 'union':
                perturbed_seq = perturbed_seq_
            else:
                raise NotImplementedError(self.log_type)

            mask_indices = perturbed_seq == self.mask_token_id
            logits = self.get_logits(perturbed_seq, prompt_index)
            # Importance-weight each masked token's CE by its mask probability.
            loss = F.cross_entropy(logits[mask_indices], seq[mask_indices], reduction='none') / p_mask[mask_indices]
            loss = loss.sum() / self.batch_size
            loss_acc.append(loss.item())

        return sum(loss_acc) / len(loss_acc)
635
+
636
    @torch.no_grad()
    def _eval_target_nll_ar(self, prefix, target):
        """Exact autoregressive-style NLL via progressive unmasking.

        Builds one sequence copy per scored position ('ftb' scores the target
        given the prefix, 'btf' the prefix given the target). Copy i masks a
        triangular suffix ('ar_ftb', left-to-right) or prefix ('ar_btf',
        right-to-left) of the scored region, so each copy reveals exactly one
        new position; the losses at those positions sum to the sequence NLL.
        """
        prefix, target = prefix.unsqueeze(0), target.unsqueeze(0)  # 1*l1, 1*l2
        assert self.log_type in ['ftb', 'btf']
        assert self.nll_type in ['ar_ftb', 'ar_btf']

        # Prompt positions for CFG inside get_logits.
        if self.log_type == 'ftb':
            prompt_index = torch.arange(prefix.shape[1] + target.shape[1], device=self.device) < prefix.shape[1]
        else:
            prompt_index = torch.arange(prefix.shape[1] + target.shape[1], device=self.device) >= prefix.shape[1]

        # One copy of the scored region per position in it.
        if self.log_type == 'ftb':
            perturbed_ = target.repeat(target.shape[1], 1).clone().contiguous()  # l2*l2
        else:
            perturbed_ = prefix.repeat(prefix.shape[1], 1).clone().contiguous()  # l1*l1

        # Triangular mask pattern: row i hides positions i.. (triu) or ..i (tril).
        mask_index = torch.ones((perturbed_.shape[1], perturbed_.shape[1]), dtype=torch.bool)
        if self.nll_type == 'ar_ftb':
            mask_index = torch.triu(mask_index)
        else:
            mask_index = torch.tril(mask_index)
        perturbed_[mask_index] = self.mask_token_id
        # Re-attach the conditioning side unmodified.
        if self.log_type == 'ftb':
            perturbed_seq = torch.cat([prefix.repeat(perturbed_.shape[0], 1), perturbed_], dim=-1)
        else:
            perturbed_seq = torch.cat([perturbed_, target.repeat(perturbed_.shape[0], 1)], dim=-1)

        # Score all copies in batch_size chunks, accumulating logits on CPU.
        logits_ = []
        num = len(perturbed_seq) // self.batch_size if len(perturbed_seq) % self.batch_size == 0 else len(perturbed_seq) // self.batch_size + 1
        for i in range(num):
            end = (i + 1) * self.batch_size if (i + 1) * self.batch_size < len(perturbed_seq) else len(perturbed_seq)
            perturbed_seq_ = perturbed_seq[i * self.batch_size: end]
            perturbed_seq_ = perturbed_seq_.to(self.device)
            if len(perturbed_seq_.shape) == 1:
                perturbed_seq_ = perturbed_seq_.unsqueeze(0)
            logits = self.get_logits(perturbed_seq_, prompt_index)
            logits_.append(logits.cpu())
        logits = torch.cat(logits_, dim=0)

        # Narrow the triangular mask to its diagonal: per row, only the single
        # newly-revealed position contributes to the loss.
        temp_index = torch.ones((perturbed_.shape[1], perturbed_.shape[1]), dtype=torch.bool)
        if self.nll_type == 'ar_ftb':
            temp_index = torch.triu(temp_index, diagonal=1)
        else:
            temp_index = torch.tril(temp_index, diagonal=-1)
        mask_index[temp_index] = False
        # Pad with False over the conditioning side to index the full logits.
        if self.log_type == 'ftb':
            logits_index = torch.cat([torch.zeros((perturbed_.shape[1], prefix.shape[1]), dtype=torch.bool), mask_index], dim=-1)
        else:
            logits_index = torch.cat([mask_index, torch.zeros((perturbed_.shape[1], target.shape[1]), dtype=torch.bool)], dim=-1)

        if self.log_type == 'ftb':
            loss = F.cross_entropy(logits[logits_index], target[0], reduction='sum').cpu().item()
        else:
            loss = F.cross_entropy(logits[logits_index], prefix[0], reduction='sum').cpu().item()
        return loss
691
+
692
+ def _encode_pair(self, context, continuation):
693
+ if self.add_bos_token:
694
+ context = self.tokenizer.bos_token + context
695
+
696
+ n_spaces = len(context) - len(context.rstrip())
697
+ if n_spaces > 0:
698
+ continuation = context[-n_spaces:] + continuation
699
+ context = context[:-n_spaces]
700
+
701
+ whole_enc = self.tokenizer.encode(context + continuation) + [self.tokenizer.eos_token_id]
702
+ context_enc = self.tokenizer.encode(context)
703
+
704
+ context_enc_len = len(context_enc)
705
+ continuation_enc = whole_enc[context_enc_len:]
706
+
707
+ # by default truncate on the left
708
+ cutoff_length = max(len(whole_enc) - self.max_length, 0)
709
+ if cutoff_length > 0:
710
+ eval_logger.warning(f"Text length {len(whole_enc)} is larger than {self.max_length}, cutoff on the left side")
711
+ context_remain = context_enc_len-cutoff_length
712
+ if context_remain > 0:
713
+ context_enc = context_enc[-context_remain:]
714
+ else:
715
+ eval_logger.warning(f"All context (prompt) is truncated.")
716
+ context_enc = ""
717
+ continuation_enc = whole_enc[-self.max_length:]
718
+ return context_enc, continuation_enc
719
+
720
+ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
721
+ def _tokenize(e):
722
+ prefix, target = self._encode_pair(e["prefix"], e["target"])
723
+ return {
724
+ "prefix_text": e["prefix"],
725
+ "target_text": e["target"],
726
+ "prefix": prefix,
727
+ "target": target,
728
+ }
729
+
730
+ ds = []
731
+ ds = [{"prefix": req.args[0], "target": req.args[1]} for req in requests]
732
+ ds = Dataset.from_list(ds)
733
+ print(ds[0])
734
+ ds = ds.map(_tokenize)
735
+ ds = ds.with_format("torch")
736
+
737
+ out = []
738
+ with torch.no_grad():
739
+ for elem in tqdm(ds, desc="Computing likelihood..."):
740
+ prefix = elem["prefix"]
741
+ target = elem["target"]
742
+ # likelihood calculations are modified from https://github.com/ML-GSAI/SMDM/blob/main/evaluate_diff.py
743
+ if self.nll_type == 'mc':
744
+ ll = -self._eval_target_nll_mc(prefix, target)
745
+ if self.log_type == 'union':
746
+ ll = ll / (len(target) + len(prefix))
747
+ elif self.nll_type == 'ar_ftb' or self.nll_type == 'ar_btf':
748
+ ll = -self._eval_target_nll_ar(prefix, target)
749
+ else:
750
+ raise NotImplementedError(self.nll_type)
751
+
752
+ # TODO: greedy decoding
753
+ is_target_greedy_dec = False
754
+
755
+ out.append((ll, 1.0 if is_target_greedy_dec else 0.0))
756
+ return out
757
+
758
    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
        """Rolling (full-sequence) log-likelihood is not supported by this adapter."""
        raise NotImplementedError
760
+
761
+
762
# Entry point: seed all RNGs for reproducibility, then hand control to the
# lm-eval-harness CLI (which instantiates `dream_lora` via --model_args).
if __name__ == "__main__":
    set_seed(1234)
    cli_evaluate()
eval_dream_d2f_vllm.sh ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tasks="gsm8k_cot mbpp minerva_math"
2
+ nshots="8 3 4"
3
+ lengths="256 256 256"
4
+ temperatures="0 0 0"
5
+ limits="10000 10000 10000"
6
+ block_sizes="32 48 64"
7
+ block_add_thresholds="0.1 0.1 0.1"
8
+ decoded_token_thresholds="0.95 0.95 0.95"
9
+ skip_thresholds="0.9 0.9 0.9"
10
+ top_ps="none none none"
11
+ dtypes="bfloat16 bfloat16 bfloat16"
12
+ sampling_strategies="default default default"
13
+
14
+ humaneval_nshots="0"
15
+ humaneval_lengths="256"
16
+ humaneval_temperatures="0"
17
+ humaneval_limits="10000"
18
+ humaneval_diffusion_steps="256"
19
+ humaneval_block_sizes="32"
20
+ humaneval_block_add_thresholds="0.9"
21
+ humaneval_decoded_token_thresholds="0.95"
22
+ humaneval_skip_thresholds="0.95"
23
+ humaneval_top_ps="none"
24
+ humaneval_dtypes="bfloat16"
25
+ humaneval_sampling_strategies="default"
26
+
27
+ base_model=Dream-org/Dream-v0-Base-7B
28
+
29
+ lora_models=(
30
+ "SJTU-Deng-Lab/D2F_Dream_Base_7B_Lora"
31
+ )
32
+
33
+ read -ra TASKS_ARRAY <<< "$tasks"
34
+ read -ra NSHOTS_ARRAY <<< "$nshots"
35
+ read -ra LENGTH_ARRAY <<< "$lengths"
36
+ read -ra TEMP_ARRAY <<< "$temperatures"
37
+ read -ra LIMITS_ARRAY <<< "$limits"
38
+ read -ra BLOCK_SIZES_ARRAY <<< "$block_sizes"
39
+ read -ra BLOCK_ADD_THRESHOLDS_ARRAY <<< "$block_add_thresholds"
40
+ read -ra DECODED_TOKEN_THRESHOLDS_ARRAY <<< "$decoded_token_thresholds"
41
+ read -ra SKIP_THRESHOLDS_ARRAY <<< "$skip_thresholds"
42
+ read -ra TOP_PS_ARRAY <<< "$top_ps"
43
+ read -ra DTYPES_ARRAY <<< "$dtypes"
44
+ read -ra SAMPLING_STRATEGIES_ARRAY <<< "$sampling_strategies"
45
+
46
+ read -ra HUMANEVAL_NSHOTS_ARRAY <<< "$humaneval_nshots"
47
+ read -ra HUMANEVAL_LENGTHS_ARRAY <<< "$humaneval_lengths"
48
+ read -ra HUMANEVAL_TEMP_ARRAY <<< "$humaneval_temperatures"
49
+ read -ra HUMANEVAL_LIMITS_ARRAY <<< "$humaneval_limits"
50
+ read -ra HUMANEVAL_DIFFUSION_STEPS_ARRAY <<< "$humaneval_diffusion_steps"
51
+ read -ra HUMANEVAL_BLOCK_SIZES_ARRAY <<< "$humaneval_block_sizes"
52
+ read -ra HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY <<< "$humaneval_block_add_thresholds"
53
+ read -ra HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY <<< "$humaneval_decoded_token_thresholds"
54
+ read -ra HUMANEVAL_SKIP_THRESHOLDS_ARRAY <<< "$humaneval_skip_thresholds"
55
+ read -ra HUMANEVAL_TOP_PS_ARRAY <<< "$humaneval_top_ps"
56
+ read -ra HUMANEVAL_DTYPES_ARRAY <<< "$humaneval_dtypes"
57
+ read -ra HUMANEVAL_SAMPLING_STRATEGIES_ARRAY <<< "$humaneval_sampling_strategies"
58
+
59
# Sanity check: the general-task arrays must all be parallel (one entry per run).
array_length=${#TASKS_ARRAY[@]}
for count in "${#NSHOTS_ARRAY[@]}" "${#LENGTH_ARRAY[@]}" "${#TEMP_ARRAY[@]}" \
             "${#LIMITS_ARRAY[@]}" "${#BLOCK_SIZES_ARRAY[@]}" \
             "${#BLOCK_ADD_THRESHOLDS_ARRAY[@]}" "${#DECODED_TOKEN_THRESHOLDS_ARRAY[@]}" \
             "${#SKIP_THRESHOLDS_ARRAY[@]}" "${#TOP_PS_ARRAY[@]}" \
             "${#SAMPLING_STRATEGIES_ARRAY[@]}" "${#DTYPES_ARRAY[@]}"; do
    if [[ $count -ne $array_length ]]; then
        echo "Error: All configuration arrays must have the same length!"
        exit 1
    fi
done

# Same parallelism check for the HumanEval-specific arrays.
humaneval_array_length=${#HUMANEVAL_NSHOTS_ARRAY[@]}
for count in "${#HUMANEVAL_LENGTHS_ARRAY[@]}" "${#HUMANEVAL_TEMP_ARRAY[@]}" \
             "${#HUMANEVAL_LIMITS_ARRAY[@]}" "${#HUMANEVAL_DIFFUSION_STEPS_ARRAY[@]}" \
             "${#HUMANEVAL_BLOCK_SIZES_ARRAY[@]}" "${#HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[@]}" \
             "${#HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[@]}" "${#HUMANEVAL_SKIP_THRESHOLDS_ARRAY[@]}" \
             "${#HUMANEVAL_TOP_PS_ARRAY[@]}" "${#HUMANEVAL_DTYPES_ARRAY[@]}" \
             "${#HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[@]}"; do
    if [[ $count -ne $humaneval_array_length ]]; then
        echo "Error: All HumanEval configuration arrays must have the same length!"
        exit 1
    fi
done
90
+
91
# Allow lm-eval-harness to execute generated code (required by humaneval).
export HF_ALLOW_CODE_EVAL=1
for lora_model in "${lora_models[@]}"; do
    lora_model_name="$lora_model"
    echo "===================================================================="
    echo "Evaluating LoRA model: $lora_model_name"
    echo "===================================================================="

    # --- HumanEval sweep (explicit diffusion-steps axis, no --limit) ---
    for i in "${!HUMANEVAL_NSHOTS_ARRAY[@]}"; do
        output_path="evals_dream${lora_model_name}/humaneval-ns${HUMANEVAL_NSHOTS_ARRAY[$i]}-len${HUMANEVAL_LENGTHS_ARRAY[$i]}-temp${HUMANEVAL_TEMP_ARRAY[$i]}-limit${HUMANEVAL_LIMITS_ARRAY[$i]}-diffsteps${HUMANEVAL_DIFFUSION_STEPS_ARRAY[$i]}-block${HUMANEVAL_BLOCK_SIZES_ARRAY[$i]}-thresh${HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[$i]}-decodethresh${HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[$i]}-skip${HUMANEVAL_SKIP_THRESHOLDS_ARRAY[$i]}-topp${HUMANEVAL_TOP_PS_ARRAY[$i]}-dtype${HUMANEVAL_DTYPES_ARRAY[$i]}-sampling${HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[$i]}"
        echo "Running HumanEval evaluation $((i+1))/${humaneval_array_length} for $lora_model_name..."
        # "none" means: omit top_p entirely so the model default applies.
        if [[ "${HUMANEVAL_TOP_PS_ARRAY[$i]}" == "none" ]]; then
            humaneval_model_args="pretrained=${base_model},lora_path=${lora_model},max_new_tokens=${HUMANEVAL_LENGTHS_ARRAY[$i]},diffusion_steps=${HUMANEVAL_DIFFUSION_STEPS_ARRAY[$i]},temperature=${HUMANEVAL_TEMP_ARRAY[$i]},add_bos_token=true,escape_until=true,block_size=${HUMANEVAL_BLOCK_SIZES_ARRAY[$i]},block_add_threshold=${HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[$i]},skip_threshold=${HUMANEVAL_SKIP_THRESHOLDS_ARRAY[$i]},decoded_token_threshold=${HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[$i]},dtype=${HUMANEVAL_DTYPES_ARRAY[$i]},sampling_strategy=${HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[$i]},save_dir=${output_path}"
        else
            humaneval_model_args="pretrained=${base_model},lora_path=${lora_model},max_new_tokens=${HUMANEVAL_LENGTHS_ARRAY[$i]},diffusion_steps=${HUMANEVAL_DIFFUSION_STEPS_ARRAY[$i]},temperature=${HUMANEVAL_TEMP_ARRAY[$i]},top_p=${HUMANEVAL_TOP_PS_ARRAY[$i]},add_bos_token=true,escape_until=true,block_size=${HUMANEVAL_BLOCK_SIZES_ARRAY[$i]},block_add_threshold=${HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[$i]},skip_threshold=${HUMANEVAL_SKIP_THRESHOLDS_ARRAY[$i]},decoded_token_threshold=${HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[$i]},dtype=${HUMANEVAL_DTYPES_ARRAY[$i]},sampling_strategy=${HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[$i]},save_dir=${output_path}"
        fi
        # Fix: expansions are now quoted so args/paths survive word splitting
        # and globbing (previously $humaneval_model_args / $output_path were bare).
        CUDA_VISIBLE_DEVICES=5 accelerate launch --main_process_port 29520 --num_processes 1 eval_dream_d2f_vllm.py --model dream_lora \
            --model_args "$humaneval_model_args" \
            --tasks humaneval \
            --num_fewshot "${HUMANEVAL_NSHOTS_ARRAY[$i]}" \
            --batch_size 1 \
            --output_path "$output_path" \
            --log_samples \
            --confirm_run_unsafe_code
    done

    # --- General task sweep (diffusion_steps deliberately tied to length) ---
    for i in "${!TASKS_ARRAY[@]}"; do
        output_path="evals_dream${lora_model_name}/${TASKS_ARRAY[$i]}-ns${NSHOTS_ARRAY[$i]}-len${LENGTH_ARRAY[$i]}-temp${TEMP_ARRAY[$i]}-limit${LIMITS_ARRAY[$i]}-diffsteps${LENGTH_ARRAY[$i]}-block${BLOCK_SIZES_ARRAY[$i]}-thresh${BLOCK_ADD_THRESHOLDS_ARRAY[$i]}-decodethresh${DECODED_TOKEN_THRESHOLDS_ARRAY[$i]}-skip${SKIP_THRESHOLDS_ARRAY[$i]}-topp${TOP_PS_ARRAY[$i]}-dtype${DTYPES_ARRAY[$i]}-sampling${SAMPLING_STRATEGIES_ARRAY[$i]}"
        if [[ "${TOP_PS_ARRAY[$i]}" == "none" ]]; then
            model_args="pretrained=${base_model},lora_path=${lora_model},max_new_tokens=${LENGTH_ARRAY[$i]},diffusion_steps=${LENGTH_ARRAY[$i]},add_bos_token=true,temperature=${TEMP_ARRAY[$i]},block_size=${BLOCK_SIZES_ARRAY[$i]},block_add_threshold=${BLOCK_ADD_THRESHOLDS_ARRAY[$i]},skip_threshold=${SKIP_THRESHOLDS_ARRAY[$i]},decoded_token_threshold=${DECODED_TOKEN_THRESHOLDS_ARRAY[$i]},dtype=${DTYPES_ARRAY[$i]},sampling_strategy=${SAMPLING_STRATEGIES_ARRAY[$i]},save_dir=${output_path}"
        else
            model_args="pretrained=${base_model},lora_path=${lora_model},max_new_tokens=${LENGTH_ARRAY[$i]},diffusion_steps=${LENGTH_ARRAY[$i]},add_bos_token=true,temperature=${TEMP_ARRAY[$i]},top_p=${TOP_PS_ARRAY[$i]},block_size=${BLOCK_SIZES_ARRAY[$i]},block_add_threshold=${BLOCK_ADD_THRESHOLDS_ARRAY[$i]},skip_threshold=${SKIP_THRESHOLDS_ARRAY[$i]},decoded_token_threshold=${DECODED_TOKEN_THRESHOLDS_ARRAY[$i]},dtype=${DTYPES_ARRAY[$i]},sampling_strategy=${SAMPLING_STRATEGIES_ARRAY[$i]},save_dir=${output_path}"
        fi
        CUDA_VISIBLE_DEVICES=5 accelerate launch --main_process_port 29520 --num_processes 1 eval_dream_d2f_vllm.py --model dream_lora \
            --model_args "$model_args" \
            --tasks "${TASKS_ARRAY[$i]}" \
            --limit "${LIMITS_ARRAY[$i]}" \
            --num_fewshot "${NSHOTS_ARRAY[$i]}" \
            --batch_size 1 \
            --output_path "$output_path" \
            --log_samples \
            --confirm_run_unsafe_code
    done
done

echo "All evaluations completed!"
eval_llada.py ADDED
@@ -0,0 +1,1198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import gc
3
+ import json
4
+ import time # Add time module
5
+ from datetime import timedelta
6
+ from typing import List, Optional, Tuple, Type, TypeVar, Union, Dict
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import torch.distributions as dists
10
+ import transformers
11
+ from transformers import AutoTokenizer
12
+ from peft import LoraConfig, get_peft_model
13
+ from accelerate import (
14
+ Accelerator,
15
+ InitProcessGroupKwargs,
16
+ )
17
+ from datasets import Dataset
18
+ from packaging import version
19
+ from tqdm import tqdm
20
+ from peft import PeftConfig, PeftModel
21
+ import numpy as np # Add numpy import
22
+ import os
23
+ import jinja2
24
+
25
+ # Import LLaDA model related modules
26
+ from model_cache.llada.modeling_llada import LLaDAModelLM
27
+ from model_cache.llada.configuration_llada import LLaDAConfig
28
+
29
+ from lm_eval import utils
30
+ from lm_eval.api.instance import Instance
31
+ from lm_eval.api.model import TemplateLM
32
+ from lm_eval.api.registry import register_model
33
+ from lm_eval.models.utils import get_dtype
34
+ from lm_eval.__main__ import cli_evaluate
35
+
36
eval_logger = logging.getLogger(__name__)
T = TypeVar("T", bound="TemplateLM")

import random


def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and force deterministic cuDNN."""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

    # Trade kernel-selection speed for reproducible convolution results.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
47
+
48
def create_full_block_attention_mask(prompt_length, max_length, block_size, device=None, dtype=None):
    """Build the full block-causal attention mask for a whole sequence.

    A query position may attend to a key position iff the key's block index
    is <= the query's block index, where the prompt is block 0 and every
    subsequent run of ``block_size`` tokens forms the next block. That gives
    full attention inside a block and causal attention across blocks.

    Args:
        prompt_length: Length of the prompt (the first, irregular block).
        max_length: Total sequence length covered by the mask.
        block_size: Size of each regular block after the prompt.
        device: Device to allocate the mask on.
        dtype: Float dtype for the mask (defaults to bfloat16).

    Returns:
        Tensor of shape [1, 1, max_length, max_length] with 0 where attention
        is allowed and -inf where it is blocked.
    """
    if dtype is None:
        dtype = torch.bfloat16

    positions = torch.arange(max_length, device=device)
    # Block index per position: prompt tokens are block 0, then one block per
    # `block_size` tokens (negative floor-div results are discarded by where()).
    block_idx = torch.where(
        positions < prompt_length,
        torch.zeros_like(positions),
        (positions - prompt_length) // block_size + 1,
    )
    # Row q (query) may see column k (key) iff block_idx[k] <= block_idx[q].
    allowed = block_idx.unsqueeze(1) >= block_idx.unsqueeze(0)

    attention_mask = torch.full((max_length, max_length), -torch.inf, device=device, dtype=dtype)
    attention_mask[allowed] = 0
    return attention_mask.unsqueeze(0).unsqueeze(0)
94
+
95
def extract_attention_mask(full_mask, start_pos, input_length, cache_length):
    """Slice the precomputed full mask for one incremental forward pass.

    The returned mask has one row per current input position and columns for
    the cached prefix (absolute positions 0..cache_length) followed by the
    current input span itself.

    Args:
        full_mask: Complete attention mask [1, 1, max_length, max_length].
        start_pos: Absolute starting position of the current input.
        input_length: Number of positions being fed this step.
        cache_length: Number of positions already held in the KV cache.

    Returns:
        Mask of shape [1, 1, input_length, cache_length + input_length].
    """
    end_pos = start_pos + input_length
    # Rows for the positions processed this step.
    rows = full_mask[:, :, start_pos:end_pos, :]
    # NOTE(review): positions in (cache_length, start_pos), if any, are not
    # represented in the output -- presumably callers keep start_pos aligned
    # with the cache; verify at the call site.
    cached_cols = rows[:, :, :, :cache_length]
    current_cols = rows[:, :, :, start_pos:end_pos]
    return torch.cat([cached_cols, current_cols], dim=-1)
123
+
124
def build_custom_float_attention_mask(input_ids, prompt_length, block_size, device=None, dtype=None):
    """Build a batched block-causal float attention mask.

    Same block rule as ``create_full_block_attention_mask`` but with a
    per-sequence prompt length: every position may attend to any position
    whose block index (prompt = block 0, then one block per ``block_size``
    tokens) is <= its own.

    Args:
        input_ids: Token ids of shape [B, seq_len] (only the shape is used).
        prompt_length: Indexable per-sequence prompt lengths (one per batch row).
        block_size: Size of each regular block after the prompt.
        device: Device to create the mask on.
        dtype: Float dtype for the mask (defaults to float32).

    Returns:
        Mask of shape [B, 1, seq_len, seq_len]: 0.0 allowed, -inf blocked.
    """
    B, seq_len = input_ids.shape
    if dtype is None:
        dtype = torch.float32
    attn_mask = torch.full((B, 1, seq_len, seq_len), float('-inf'), dtype=dtype, device=device)

    positions = torch.arange(seq_len, device=device)
    for i in range(B):
        pl = prompt_length[i]
        # Block index per position for this sequence.
        blk = torch.where(
            positions < pl,
            torch.zeros_like(positions),
            (positions - pl) // block_size + 1,
        )
        allowed = blk.unsqueeze(1) >= blk.unsqueeze(0)
        attn_mask[i, 0][allowed] = 0.0

    return attn_mask
167
+
168
def top_p_logits(logits, top_p=None):
    """Mask logits outside the nucleus (top-p) set to the dtype minimum.

    Keeps the smallest set of highest-probability tokens whose cumulative
    softmax probability reaches ``top_p`` (the token that crosses the
    threshold is kept).
    """
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative = F.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
    drop_sorted = cumulative > top_p
    # Shift right so the first token crossing the threshold is retained;
    # roll wraps the last flag to slot 0, which we then clear.
    drop_sorted = drop_sorted.roll(1, dims=-1)
    drop_sorted[..., 0] = False

    drop = torch.zeros_like(logits, dtype=torch.bool).scatter_(-1, sorted_indices, drop_sorted)
    return logits.masked_fill(drop, torch.finfo(logits.dtype).min)
180
+
181
def top_k_logits(logits, top_k=None):
    """Mask all but the ``top_k`` highest logits to the dtype minimum."""
    k = min(top_k, logits.size(-1))  # never ask for more than the vocab size
    kth_best = torch.topk(logits, k)[0][..., -1, None]
    return logits.masked_fill(logits < kth_best, torch.finfo(logits.dtype).min)
187
+
188
def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confidence=False, neg_entropy=False):
    """Pick one token per position from ``logits`` and score its confidence.

    With ``temperature > 0`` the token is sampled from the (optionally
    top-p / top-k filtered) categorical distribution; otherwise it is the
    argmax. The returned ``confidence`` defaults to the chosen token's
    probability but can be redefined by the flags below.

    Args:
        logits: Logits of shape [..., vocab]. ``margin_confidence`` assumes a
            2D [batch, vocab] input (it indexes ``sorted_probs[:, 0]``).
        temperature: Softmax temperature; <= 0 means greedy decoding.
        top_p: Nucleus filtering threshold, applied when < 1.
        top_k: Top-k filtering, applied when not None.
        margin_confidence: Report (top1 - top2) probability as confidence.
        neg_entropy: Report negative entropy (sum p*log p) as confidence.

    Returns:
        (confidence, x0, initial_confidence): the possibly-redefined
        confidence, the chosen token ids, and the raw probability of the
        chosen token.
    """
    if temperature > 0:
        logits = logits / temperature
    if top_p is not None and top_p < 1:
        logits = top_p_logits(logits, top_p)
    if top_k is not None:
        logits = top_k_logits(logits, top_k)
    probs = torch.softmax(logits, dim=-1)

    if temperature > 0:
        try:
            x0 = dists.Categorical(probs=probs).sample()
            initial_confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
        except Exception:
            # Fix: was a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit. Fall back to greedy selection
            # when sampling fails (e.g. invalid probabilities).
            initial_confidence, x0 = probs.max(dim=-1)
    else:
        initial_confidence, x0 = probs.max(dim=-1)

    # Keep the raw probability of the chosen token before any redefinition.
    confidence = initial_confidence.clone()

    if margin_confidence:
        sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
        top1_probs = sorted_probs[:, 0]
        top2_probs = sorted_probs[:, 1]
        # Margin between the two most likely tokens.
        confidence = top1_probs - top2_probs

    if neg_entropy:
        epsilon = 1e-10
        log_probs = torch.log(probs + epsilon)
        # Negative entropy: closer to 0 means a more peaked distribution.
        confidence = torch.sum(probs * log_probs, dim=-1)

    return confidence, x0, initial_confidence
223
+
224
+ @register_model("dream_lora")
225
+ class DreamLoRA(TemplateLM):
226
    def __init__(
        self,
        pretrained: Union[str, transformers.PreTrainedModel],
        lora_path: str,
        batch_size: Optional[Union[int, str]] = 1,
        device: Optional[str] = "cuda",
        dtype: Optional[Union[str, torch.dtype]] = "auto",
        max_new_tokens: Optional[int] = 128,
        max_length: Optional[int] = 4096,
        add_bos_token: Optional[bool] = False,
        nll_type: Optional[str] = "mc",
        log_type: Optional[str] = "ftb",
        mc_num: Optional[int] = 128,
        classifier_free_guidance: Optional[float] = 1.0,
        sampling_eps: Optional[float] = 1e-3,
        diffusion_steps: Optional[int] = 128,
        trust_remote_code: Optional[bool] = True,
        parallelize: Optional[bool] = False,
        autogptq: Optional[Union[bool, str]] = False,
        temperature: Optional[float] = 0.2,
        top_p: Optional[float] = None,
        top_k: Optional[float] = None,
        alg: Optional[str] = "entropy",
        alg_temp: Optional[float] = 0.0,
        escape_until: Optional[bool] = False,
        block_size: Optional[int] = 4,
        mask_token_id: Optional[int] = 126336,  # presumably the LLaDA mask id -- TODO confirm
        block_add_threshold: Optional[float] = 0.5,
        decoded_token_threshold: Optional[float] = 0.9,
        skip_threshold: Optional[float] = 1.0,
        sampling_strategy: Optional[str] = "default",
        save_dir: Optional[str] = None,
        show_speed: Optional[bool] = True,
        **kwargs,
    ) -> None:
        """Build the LoRA-augmented LLaDA diffusion-LM evaluation wrapper.

        Resolves the torch device (honoring accelerate / parallelize), loads
        the base model plus LoRA adapter via ``_create_model_and_tokenizer``,
        and stores the block-diffusion generation and log-likelihood
        hyper-parameters on the instance.

        Args:
            pretrained: HF path of the base model (must be a str here).
            lora_path: Path of the PEFT/LoRA adapter to attach.
            batch_size: Per-GPU batch size (int or numeric string).
            device: Requested device; overridden under accelerate/parallelize.
            dtype: Torch dtype or "auto".
            max_new_tokens, diffusion_steps, temperature, top_p, top_k, alg,
                alg_temp: generation-time settings.
            max_length: Maximum total sequence length for generation masks.
            block_size, block_add_threshold, decoded_token_threshold,
                skip_threshold, sampling_strategy: block-diffusion controls.
            nll_type, log_type, mc_num, classifier_free_guidance,
                sampling_eps: log-likelihood estimation settings.
            escape_until: When True, `until` stop strings are not applied.
            mask_token_id: Token id used for masked (undecoded) positions.
            save_dir: Optional directory for per-sample outputs.
            show_speed: Whether to report generation speed statistics.
            **kwargs: Ignored extras (kept for CLI arg-string compatibility).
        """
        super().__init__()

        # This wrapper only supports string model paths and explicit devices.
        assert isinstance(device, str)
        assert isinstance(pretrained, str)
        assert isinstance(batch_size, (int, str))

        gpus = torch.cuda.device_count()
        # Very long NCCL timeout so multi-process runs do not die during slow evals.
        accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
        accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
        if accelerator.num_processes > 1:
            self.accelerator = accelerator

        if "npu" in accelerator.device.type:
            gpus = torch.npu.device_count()

        # Single process with no model parallelism: honor the requested device.
        if not (parallelize or accelerator.num_processes > 1):
            # use user-passed device
            device_list = set(
                ["cuda", "cpu"]
                + [f"cuda:{i}" for i in range(gpus)]
                + ["mps", "mps:0"]
                + [f"npu:{i}" for i in range(gpus)]
            )
            if device and device in device_list:
                self._device = torch.device(device)
                eval_logger.info(f"Using device '{device}'")
                if device in ("mps", "mps:0") and version.parse(
                    torch.__version__
                ) < version.parse("2.1"):
                    raise RuntimeError(
                        f"mps requires torch >= 2.1. You have {torch.__version__}"
                    )
            else:
                eval_logger.info("Device not specified")
                eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
                self._device = (
                    torch.device("cuda")
                    if torch.cuda.is_available()
                    else torch.device("cpu")
                )
        else:  # Parallelism managed by accelerate
            if device != "cuda":
                eval_logger.info(
                    f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
                )
            # TODO: include in warning that `load_in_8bit` etc. affect this too
            self._device = (
                self.accelerator.device
                if hasattr(self, "accelerator")
                else torch.device(device)
            )

        # Normalize string batch sizes (e.g. "8") to int.
        self.batch_size_per_gpu = batch_size
        if isinstance(batch_size, str):
            self.batch_size_per_gpu = int(batch_size)

        # Block-diffusion decoding controls (consumed by generation code).
        self.lora_path = lora_path
        self.block_size = block_size
        self.block_add_threshold = block_add_threshold
        self.skip_threshold = skip_threshold
        self.sampling_strategy = sampling_strategy
        self.decoded_token_threshold = decoded_token_threshold

        # Resolved torch dtype, reused later when building attention masks.
        self.target_dtype = get_dtype(dtype)

        self._create_model_and_tokenizer(pretrained, dtype, trust_remote_code)

        if isinstance(pretrained, str):
            if gpus >= 1 or str(self.device) == "mps":
                # TODO: can remove this whole snippet except in the mps case, perhaps?
                if not (parallelize or autogptq or hasattr(self, "accelerator")):
                    # place model onto device requested manually,
                    # if not using HF Accelerate or device_map
                    # or any other option that preloads model onto device
                    try:
                        self.model.to(self.device)
                    except ValueError:
                        eval_logger.debug(
                            "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
                        )
            # multigpu data-parallel support when launched with accelerate
            if gpus > 1:
                if accelerator.num_processes > 1:
                    if parallelize:
                        eval_logger.warning(
                            "You are both using a HF Accelerate `device_map` (`--model_args parallelize=True`) and launching via `accelerate launch`. This will attempt to do model and data parallelism depending on the resources available."
                        )
                    elif gpus > accelerator.num_processes:
                        eval_logger.warning(
                            "WARNING: The number of total system GPUs does not match the number of spawned processes. "
                            "If you would like to use data parallelism, please launch the script "
                            "with 'accelerate launch *script*'. "
                            f"Current run will proceed with {accelerator.num_processes} devices."
                        )
                    if self.accelerator.is_local_main_process:
                        eval_logger.info(
                            f"Using {gpus} devices with data parallelism"
                        )

                    self._device = torch.device(f"{accelerator.device}")
                    self.accelerator = accelerator

                    self._rank = self.accelerator.local_process_index
                    self._world_size = self.accelerator.num_processes
                else:
                    # if we aren't launching via accelerate, ditch
                    self._rank = 0
                    self._world_size = 1
        else:
            # if a PreTrainedModel was passed into HFLM, we forgo distributed setup.
            eval_logger.warning(
                "Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration"
            )
            self._rank = 0
            self._world_size = 1

        # NOTE(review): when `pretrained` is a str and gpus <= 1, neither
        # self._rank nor self._world_size is assigned above -- verify that
        # the rank/world_size properties are never read in that configuration.
        self.max_length = max_length
        self.add_bos_token = add_bos_token
        # generation params
        self.max_new_tokens = max_new_tokens
        self.diffusion_steps = diffusion_steps
        self.temperature = temperature
        self.top_p = top_p
        self.top_k = top_k
        self.alg = alg
        self.alg_temp = alg_temp
        self.escape_until = escape_until
        self.block_size = block_size  # NOTE(review): duplicate of the assignment above
        self.mask_token_id = mask_token_id

        # loglikelihood params
        self.nll_type = nll_type
        self.log_type = log_type
        self.mc_num = mc_num
        self.classifier_free_guidance = classifier_free_guidance
        self.sampling_eps = sampling_eps

        # Add backend attribute, consistent with LLaDA.py
        self.backend = "causal"

        # Add truncation attribute, consistent with LLaDA.py
        self.truncation = False

        self.save_dir = save_dir
        self.show_speed = show_speed
410
+
411
    @property
    def batch_size(self):
        # Per-GPU batch size (normalized to int in __init__).
        return self.batch_size_per_gpu
414
+
415
    @property
    def eot_token_id(self):
        # We use EOT because end of *text* is more accurate for what we're
        # doing than end of *sentence*; maps to the tokenizer's EOS id.
        return self.tokenizer.eos_token_id
419
+
420
    @property
    def device(self):
        # Torch device resolved during __init__.
        return self._device
423
+
424
    @property
    def rank(self):
        # Process rank for distributed evaluation (set in __init__).
        return self._rank
427
+
428
    @property
    def world_size(self):
        # Number of data-parallel processes (set in __init__).
        return self._world_size
431
+
432
+ def _create_model_and_tokenizer(self, pretrained, dtype, trust_remote_code):
433
+ # Get correct data type
434
+ target_dtype = get_dtype(dtype)
435
+
436
+ # Load LLaDA model and configuration
437
+ config = LLaDAConfig.from_pretrained(pretrained)
438
+ self.model = LLaDAModelLM.from_pretrained(
439
+ pretrained,
440
+ config=config,
441
+ torch_dtype=target_dtype,
442
+ trust_remote_code=False,
443
+ ).eval()
444
+
445
+ # Load LoRA configuration and model
446
+ peft_config = PeftConfig.from_pretrained(self.lora_path)
447
+ self.model = PeftModel.from_pretrained(self.model, self.lora_path)
448
+
449
+ # Convert data type only when target_dtype is not None and not "auto"
450
+ if target_dtype is not None and target_dtype != "auto":
451
+ self.model = self.model.to(target_dtype)
452
+
453
+ # Move to specified device
454
+ self.model = self.model.to(self.device)
455
+
456
+ # Load tokenizer
457
+ self.tokenizer = AutoTokenizer.from_pretrained(
458
+ pretrained, trust_remote_code=trust_remote_code
459
+ )
460
+
461
+ def tok_encode(
462
+ self, string: str, left_truncate_len=None, add_special_tokens=None
463
+ ) -> List[int]:
464
+ """ """
465
+ # default for None - empty dict, use predefined tokenizer param
466
+ # used for all models except for CausalLM or predefined value
467
+ special_tokens_kwargs = {}
468
+
469
+ # by default for CausalLM - false or self.add_bos_token is set
470
+ if add_special_tokens is None:
471
+ if self.backend == "causal":
472
+ special_tokens_kwargs = {
473
+ "add_special_tokens": False or self.add_bos_token
474
+ }
475
+ # otherwise the method explicitly defines the value
476
+ else:
477
+ special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
478
+
479
+ encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
480
+
481
+ # left-truncate the encoded context to be at most `left_truncate_len` tokens long
482
+ if left_truncate_len:
483
+ encoding = encoding[-left_truncate_len:]
484
+ return encoding
485
+
486
+ def tok_batch_encode(
487
+ self,
488
+ strings: List[str],
489
+ padding_side: str = "left",
490
+ left_truncate_len: int = None,
491
+ truncation: bool = False,
492
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
493
+ # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
494
+ old_padding_side = self.tokenizer.padding_side
495
+ self.tokenizer.padding_side = padding_side
496
+
497
+ add_special_tokens = {}
498
+ if self.backend == "causal":
499
+ add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
500
+
501
+ encoding = self.tokenizer(
502
+ strings,
503
+ truncation=truncation,
504
+ padding="longest",
505
+ return_tensors="pt",
506
+ **add_special_tokens,
507
+ )
508
+ if left_truncate_len:
509
+ original_lengths = encoding["input_ids"].size(1)
510
+ if original_lengths > left_truncate_len:
511
+ eval_logger.warn(
512
+ f"Left truncation applied. Original sequence length was {original_lengths}, "
513
+ f"truncating to last {left_truncate_len} tokens. Some content will be lost.",
514
+ )
515
+ encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
516
+ encoding["attention_mask"] = encoding["attention_mask"][
517
+ :, -left_truncate_len:
518
+ ]
519
+ self.tokenizer.padding_side = old_padding_side
520
+
521
+ return encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)
522
+
523
    def tok_decode(self, tokens, skip_special_tokens=True):
        """Decode token ids back to text via the underlying tokenizer."""
        return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
525
+
526
+
527
+
528
+ def _count_tokens_after_truncation(self, response_text: str, until_terms: List[str] = None) -> int:
529
+ """
530
+ Unified token counting function: calculates the number of non-126081 tokens after truncating the response.
531
+ """
532
+ # Apply truncation based on until parameters
533
+ truncated_text = response_text
534
+ if until_terms and not self.escape_until:
535
+ for term in until_terms:
536
+ if len(term) > 0:
537
+ truncated_text = truncated_text.split(term)[0]
538
+
539
+ # Re-tokenize processed answer and count non-126081 tokens
540
+ generated_answer_ids = torch.tensor(self.tokenizer(truncated_text)["input_ids"])
541
+ return int((generated_answer_ids != 126081).sum())
542
+
543
+ @classmethod
544
+ def create_from_arg_string(
545
+ cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
546
+ ) -> T:
547
+ """
548
+ Creates an instance of the LM class using the given argument string and additional config.
549
+
550
+ Parameters:
551
+ - arg_string: A string containing arguments in the format key1=value1,key2=value2.
552
+ - additional_config: Optional dictionary containing additional configuration parameters.
553
+
554
+ Returns:
555
+ - Instance of the LM class.
556
+ """
557
+ additional_config = {} if additional_config is None else additional_config
558
+ args = utils.simple_parse_args_string(arg_string)
559
+ args2 = {k: v for k, v in additional_config.items() if v is not None}
560
+ return cls(**args, **args2)
561
+
562
+ def apply_chat_template(
563
+ self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
564
+ ) -> str:
565
+ """
566
+ Method to apply a chat template to a list of chat history between user and model.
567
+ """
568
+ try:
569
+ chat_templated = self.tokenizer.apply_chat_template(
570
+ chat_history,
571
+ tokenize=False,
572
+ add_generation_prompt=add_generation_prompt,
573
+ continue_final_message=not add_generation_prompt,
574
+ )
575
+ except jinja2.exceptions.TemplateError:
576
+ eval_logger.warning(
577
+ "Failed to apply chat template. removing the system role in chat history."
578
+ )
579
+ chat_history = [msg for msg in chat_history if msg["role"] != "system"]
580
+ chat_templated = self.tokenizer.apply_chat_template(
581
+ chat_history,
582
+ tokenize=False,
583
+ add_generation_prompt=add_generation_prompt,
584
+ continue_final_message=not add_generation_prompt,
585
+ )
586
+
587
+ return chat_templated
588
+
589
    @property
    def tokenizer_name(self) -> str:
        # Filesystem-safe tokenizer identifier: path slashes replaced by "__".
        return self.tokenizer.name_or_path.replace("/", "__")
592
+
593
    def _generate_block_single(self, prompt):
        """
        Generate a completion for one prompt using parallel block generation
        with a KV cache and pre-generated attention masks.

        Masked blocks are appended progressively; each block moves through the
        states 'active' -> 'to_cache' -> 'in_cache'. Fully decoded prefix
        blocks are promoted into the model KV cache so later forward passes
        only re-run the still-active suffix.

        Parameters:
        - prompt: token tensor of shape (1, prompt_len) on any device.

        Returns: generated_sequence (List[int]) - List of generated token IDs
        (everything after the prompt; may still contain special ids).
        """
        self.model.eval()

        mask_id = self.mask_token_id
        block_size = self.block_size
        block_add_threshold = self.block_add_threshold
        skip_threshold = self.skip_threshold

        # Pre-generate the full attention mask, using the model's data type
        prompt_length = prompt.shape[1]
        full_attention_mask = create_full_block_attention_mask(
            prompt_length=prompt_length,
            max_length=self.max_length,
            block_size=block_size,
            device=self.device,
            dtype=self.target_dtype if self.target_dtype is not None and self.target_dtype != "auto" else torch.bfloat16
        )

        with torch.inference_mode():
            # Initialization
            x_t = prompt.to(self.device)

            # Track block states - states can be: 'active', 'to_cache', 'in_cache'
            # Added 'is_complete' field to indicate whether it's a complete state (True) or incomplete state (False)
            block_states = {
                0: {
                    'start_pos': 0,
                    'end_pos': prompt.shape[1],
                    'mask_count': 0,
                    'total_masks': prompt.shape[1],
                    'state': 'to_cache',  # Prompt is immediately ready for caching
                    'is_complete': True,  # Prompt is always in a complete state
                },
            }

            # Initialize cache
            past_key_values = None

            current_blocks = 0  # Number of active blocks
            step = 0
            eos_detected = False  # EOS detection flag
            cache_length = 0  # Number of positions already committed to the KV cache
            while current_blocks >= 0:
                step += 1

                # Check if a new block needs to be added (until the max_new_tokens
                # budget is reached or an EOS token has been sampled)
                if len(block_states)-1 < (self.max_new_tokens // block_size) and not eos_detected:
                    last_block_id = len(block_states) - 1
                    current_progress = (block_states[last_block_id]['total_masks'] -
                                        block_states[last_block_id]['mask_count']) / block_states[last_block_id]['total_masks']
                    if current_progress >= block_add_threshold:
                        # Add new block of all-mask tokens at the end of the sequence
                        new_block_id = len(block_states)
                        new_start_pos = x_t.shape[1]
                        x_t = torch.cat([x_t, torch.tensor([[mask_id] * block_size]).to(self.device)], dim=1)

                        block_states[new_block_id] = {
                            'start_pos': new_start_pos,
                            'end_pos': new_start_pos + block_size,
                            'mask_count': block_size,
                            'total_masks': block_size,
                            'state': 'active',
                            'is_complete': False,  # New block defaults to an incomplete state
                        }
                        current_blocks += 1

                # At the beginning of each loop, update the block's complete/incomplete states
                self._update_block_completion_states(block_states, self.decoded_token_threshold)
                # Check if there are still mask tokens
                mask_index = (x_t == mask_id)
                if mask_index.sum() == 0 and current_blocks == 0:
                    break

                # Determine which blocks need to be added to the cache
                blocks_to_cache = [bid for bid, state in block_states.items()
                                   if state['state'] == 'to_cache']

                # Determine the part to be processed
                update_kvcache = 0
                if blocks_to_cache:
                    # Find the earliest block to be cached
                    earliest_block_id = min(blocks_to_cache)
                    earliest_pos = block_states[earliest_block_id]['start_pos']

                    # Find the latest block to be cached
                    latest_block_id = max(blocks_to_cache)
                    latest_pos = block_states[latest_block_id]['end_pos']

                    # Update the cache for all blocks within this range
                    update_kvcache = latest_pos - earliest_pos

                # Create input sequence for forward pass
                process_start_pos = cache_length

                if update_kvcache > 0:
                    # Need to update cache - use completed blocks
                    earliest_block_to_cache = min(blocks_to_cache)
                    input_seq = x_t[:, block_states[earliest_block_to_cache]['start_pos']:]
                    process_start_pos = block_states[earliest_block_to_cache]['start_pos']
                else:
                    # Only process active blocks
                    active_blocks = [bid for bid, state in block_states.items() if state['state'] == 'active']
                    if active_blocks:
                        # Get all active blocks after caching
                        earliest_active_after_cache = float('inf')
                        for bid in active_blocks:
                            if block_states[bid]['start_pos'] >= cache_length:
                                earliest_active_after_cache = min(earliest_active_after_cache, block_states[bid]['start_pos'])

                        if earliest_active_after_cache < float('inf'):
                            input_seq = x_t[:, earliest_active_after_cache:]
                            process_start_pos = earliest_active_after_cache
                        else:
                            # No active blocks after caching, this should not happen
                            input_seq = x_t[:, cache_length:]
                            # If cache length is already equal to or exceeds sequence length, exit
                            if cache_length >= x_t.shape[1]:
                                print(f"Cache length ({cache_length}) >= sequence length ({x_t.shape[1]}) at step {step}. Exiting generation loop.")
                                raise Exception("Cache length >= sequence length")
                    else:
                        # No active blocks, but blocks might need to be cached in the next iteration
                        break

                # Check if input_seq is empty
                if input_seq.shape[1] == 0:
                    print(f"Warning: input_seq is empty at step {step}. Breaking generation loop.")
                    raise Exception("input_seq is empty")

                # Extract the attention mask for the current input from the pre-generated full mask
                input_length = input_seq.shape[1]
                attention_mask = extract_attention_mask(
                    full_mask=full_attention_mask,
                    start_pos=process_start_pos,
                    input_length=input_length,
                    cache_length=cache_length
                )

                outputs = self.model(
                    input_seq,
                    attention_bias=attention_mask,
                    past_key_values=past_key_values,
                    use_cache=True,
                    update_kvcache=update_kvcache+cache_length,
                )

                # Get current logits - LLaDA model directly uses logits, no shifting needed
                logits = outputs.logits

                # Update cache if needed
                if update_kvcache > 0:
                    # Update cache
                    past_key_values = outputs.past_key_values

                    # Mark blocks as cached
                    for block_id in blocks_to_cache:
                        block_states[block_id]['state'] = 'in_cache'

                # Process mask tokens for each active block
                blocks_to_deactivate = []

                for block_id in sorted(block_states.keys()):
                    if block_states[block_id]['state'] != 'active':
                        continue

                    # Get mask positions for this block
                    block_start = block_states[block_id]['start_pos']
                    block_end = block_states[block_id]['end_pos']
                    block_mask_index = mask_index.clone()
                    block_mask_index[:, :block_start] = False
                    block_mask_index[:, block_end:] = False

                    # Skip if the current block has no masks
                    if block_mask_index.sum() == 0:
                        blocks_to_deactivate.append(block_id)
                        continue

                    # Calculate relative position of logits
                    logit_offset = block_start - process_start_pos
                    block_rel_positions = torch.where(block_mask_index[0, block_start:block_end])[0]

                    if block_rel_positions.size(0) > 0:
                        # Get logits for masked positions
                        block_mask_logits = logits[:, logit_offset + block_rel_positions, :]

                        # Sample tokens
                        confidence, x0, initial_confidence = sample_tokens(
                            block_mask_logits.squeeze(0),
                            self.temperature,
                            top_p=self.top_p,
                            top_k=self.top_k,
                            neg_entropy=(self.sampling_strategy == "neg_entropy"),
                            margin_confidence=(self.sampling_strategy == "margin_confidence")
                        )

                        # Use different sampling strategies based on the block's complete/incomplete state
                        is_complete = block_states[block_id]['is_complete']

                        if is_complete:
                            # Complete state: apply confidence threshold, if no high confidence, select the highest
                            high_conf_indices = torch.where(initial_confidence > skip_threshold)[0]

                            if len(high_conf_indices) == 0:
                                number_transfer_tokens = 1
                                _, transfer_index = torch.topk(confidence, number_transfer_tokens)
                            else:
                                transfer_index = torch.tensor([], device=self.device, dtype=torch.long)

                            # Merge indices
                            all_indices = torch.unique(torch.cat([transfer_index, high_conf_indices]))
                        else:
                            # Incomplete state: only apply confidence threshold, if no tokens exceed the threshold, select none
                            high_conf_indices = torch.where(initial_confidence > skip_threshold)[0]
                            all_indices = high_conf_indices

                        # Update tokens
                        if len(all_indices) > 0:
                            x0_ = torch.zeros_like(x0, device=self.device, dtype=torch.long) + mask_id
                            x0_[all_indices] = x0[all_indices].clone()

                            # Map indices back to original positions
                            for i, idx in enumerate(all_indices):
                                abs_pos = block_start + block_rel_positions[idx]
                                x_t[0, abs_pos] = x0_[idx]

                            # Update block state
                            block_states[block_id]['mask_count'] -= len(all_indices)

                            # Check for EOS token
                            # NOTE(review): 126081 is hard-coded as the EOS id —
                            # presumably the LLaDA eos token; confirm against the tokenizer.
                            eos_token_id = 126081
                            if eos_token_id is not None:
                                for idx in all_indices:
                                    if x0[idx].item() == eos_token_id:
                                        eos_detected = True
                                        break

                    # Deactivate this block if no masks remain
                    mask_index = (x_t == mask_id)
                    block_mask_index = mask_index.clone()
                    block_mask_index[:, :block_start] = False
                    block_mask_index[:, block_end:] = False
                    if block_mask_index.sum() == 0:
                        blocks_to_deactivate.append(block_id)
                        continue

                # Deactivate completed blocks and mark them for caching in the next iteration
                for block_id in blocks_to_deactivate:
                    if block_states[block_id]['state'] == 'active':
                        # Check if all preceding blocks are already in a non-active state
                        can_deactivate = True
                        for prev_block_id in range(block_id):
                            if prev_block_id in block_states and block_states[prev_block_id]['state'] == 'active':
                                can_deactivate = False
                                break

                        # Only mark the current block as 'to_cache' if all preceding blocks are not active
                        if can_deactivate:
                            block_states[block_id]['state'] = 'to_cache'
                            current_blocks -= 1
                        # If there are active preceding blocks, keep the current block in active state (do nothing)

                if update_kvcache > 0:
                    cache_length += update_kvcache
                # Safety check
                if step > 10000:
                    print(f"WARNING: Hit safety check at step {step}. Exiting generation loop.")
                    break

            # NOTE(review): current_text is decoded but never used — looks like leftover debugging.
            current_text = self.tokenizer.decode(x_t[0, prompt.shape[1]:].tolist(),skip_special_tokens=False)

            # Generate final answer
            generated_sequence = x_t[0, prompt.shape[1]:].tolist()

            return generated_sequence
872
+
873
+
874
+
875
    def generate_until(self, requests: List[Instance], disable_tqdm: bool = False):
        """
        Serve lm-eval-harness ``generate_until`` requests one prompt at a time.

        Each request carries (context_string, gen_kwargs). The context is
        tokenized, left-truncated to fit the context window, decoded with
        ``_generate_block_single``, cut at the first ``until`` stop string
        (unless ``escape_until``), and appended to the result list. Optional
        throughput statistics are printed and written under ``save_dir``.
        """
        res = []
        start_time = time.time()

        # Statistics variables
        num_tokens = 0
        num_nfe = 0

        bar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)), desc="Running generate_until requests")

        for i, req in enumerate(requests):
            question = req.args[0]
            # print("question:",question)
            # exit()
            gen_kwargs = req.args[1]

            # Process input in LLaDA.py style
            # print("Self.add_bos_token:", self.add_bos_token)
            contexts = [question]
            if self.add_bos_token:
                contexts = [self.tokenizer.bos_token + p for p in contexts]

            # Use the same tokenization method as LLaDA.py
            context_enc, attn_masks = self.tok_batch_encode(
                contexts,
                truncation=self.truncation,
            )

            input_ids = context_enc[0].unsqueeze(0)  # Take the first one and add batch dimension

            # Add length check: keep room for max_new_tokens by cutting the prompt on the left
            if input_ids.shape[1] > self.max_length - self.max_new_tokens:
                eval_logger.warning(f"Prompt length {input_ids.shape[1]} is larger than {self.max_length-self.max_new_tokens}, cutoff on the left side")
                input_ids = input_ids[:, -(self.max_length-self.max_new_tokens):]

            # Generate token IDs
            generated_answer = self._generate_block_single(input_ids)

            # Use tokenizer.batch_decode for decoding, consistent with LLaDA.py
            cont_toks_list = self.tokenizer.batch_decode([generated_answer], skip_special_tokens=True)
            s = cont_toks_list[0]  # Take the first (and only) result

            # Use unified token counting function
            if self.show_speed:
                num_tokens += self._count_tokens_after_truncation(s, gen_kwargs.get("until", []))
                num_nfe += 1  # NFE uses simplified statistics (fixed to 1)

            # Handle until truncation in LLaDA.py style
            if not self.escape_until:
                for term in gen_kwargs.get("until", []):
                    if len(term) > 0:
                        s = s.split(term)[0]

            res.append(s)
            bar.update(1)

        bar.close()

        # Save statistics only at the end (one JSON file per rank)
        if self.save_dir is not None:
            os.makedirs(self.save_dir, exist_ok=True)
            final_time = time.time()
            total_time = final_time - start_time

            final_stats = {
                "processed_samples": len(res),
                "total_samples": len(requests),
                "total_tokens": int(num_tokens),
                "total_nfe": int(num_nfe),
                "total_time": total_time,
                "tokens_per_second": float(num_tokens) / total_time if total_time > 0 else 0.0,
                "nfe_per_token": float(num_nfe) / float(num_tokens) if num_tokens > 0 else 0.0,
                "timestamp": final_time
            }
            final_stats_path = os.path.join(self.save_dir, f'rank_{self.rank}_final_stats.json')
            with open(final_stats_path, 'w', encoding='utf-8') as f:
                json.dump(final_stats, f, ensure_ascii=False, indent=2)

        if self.show_speed:
            final_time = time.time()
            total_time = final_time - start_time
            print(f"\n=== Final Statistics ===")
            print(f"Processed samples: {len(res)}")
            print(f"Total tokens: {num_tokens}")
            print(f"Total time: {total_time:.2f} seconds")
            print(f"Throughput: {num_tokens / total_time:.2f} tokens/s")
            print(f"Total NFE: {num_nfe}")

        return res
966
+
967
+ def _forward_process(self, batch):
968
+ b, l = batch.shape
969
+ # sample from U[0, 1] following https://arxiv.org/pdf/2107.00630 I.1
970
+ u0 = torch.rand(1, device=batch.device, dtype=torch.float32)
971
+ indices = torch.arange(b, device=batch.device).float()
972
+ t = (u0 + indices / b) % 1
973
+
974
+ p_mask = (1 - self.sampling_eps) * t + self.sampling_eps
975
+
976
+ p_mask = p_mask[:, None].repeat(1, l)
977
+
978
+ mask_indices = torch.rand((b, l), device=batch.device) < p_mask
979
+ # always unmask bos and eos
980
+ mask_indices[:, 0] = False
981
+ mask_indices[:, -1] = False
982
+
983
+ noisy_batch = torch.where(mask_indices, self.mask_token_id, batch)
984
+ return noisy_batch, p_mask
985
+
986
    @torch.no_grad()
    def get_logits(self, batch, prompt_index):
        '''
        Run the model on a (possibly CFG-doubled) batch and return shifted logits.

        prompt_index : 1D bool tensor, length=batch.shape[1]; marks prompt positions
        so the unconditional copy can mask them for classifier-free guidance.
        '''
        if self.classifier_free_guidance > 1.:
            assert len(prompt_index) == batch.shape[1]
            prompt_index = prompt_index.unsqueeze(0).repeat(batch.shape[0], 1)
            # Unconditional copy: same batch with the prompt positions masked out.
            un_batch = batch.clone()
            un_batch[prompt_index] = self.mask_token_id
            batch = torch.cat([batch, un_batch])

        input = batch

        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
            logits = self.model(input).logits
            # since bos always unmask, the first logits will not be used
            logits = torch.cat([logits[:,:1], logits[:, :-1]], dim=1)

        if self.classifier_free_guidance > 1.:
            # NOTE(review): the gate checks self.classifier_free_guidance but
            # scales with self.cfg — confirm these hold the same value.
            logits, un_logits = torch.chunk(logits, 2, dim=0)
            logits = un_logits + self.cfg * (logits - un_logits)
        return logits[:, :batch.shape[1]]
1009
+
1010
    @torch.no_grad()
    def _eval_target_nll_mc(self, prefix, target):
        """
        Monte-Carlo estimate of the target NLL under the diffusion model.

        Repeats the (prefix + target) sequence batch_size times, perturbs it
        with the forward masking process, and averages the importance-weighted
        cross-entropy over mc_num // batch_size rounds.

        NOTE(review): the ``prefix is None`` branch later calls ``len(prefix)``
        when building prompt_index — that path would raise; confirm whether
        prefix can actually be None here.
        """
        if prefix is None:
            seq = target[None, :]
        else:
            seq = torch.concatenate([prefix, target])[None, :]
        seq = seq.repeat((self.batch_size, 1)).to(self.device)

        # ftb scores the target given the prefix; otherwise the roles are flipped.
        if self.log_type == 'ftb':
            prompt_index = torch.arange(seq.shape[1], device=self.device) < len(prefix)
        else:
            prompt_index = torch.arange(seq.shape[1], device=self.device) >= len(prefix)

        loss_acc = []
        for _ in range(max(self.mc_num // self.batch_size, 1)):
            perturbed_seq = seq.clone()
            perturbed_seq_, p_mask = self._forward_process(seq)
            # Restrict the perturbation to the scored span depending on log_type.
            if self.log_type == 'ftb':
                perturbed_seq[:, -len(target):] = perturbed_seq_[:, -len(target):]
            elif self.log_type == 'btf':
                perturbed_seq[:, :len(prefix)] = perturbed_seq_[:, :len(prefix)]
            elif self.log_type == 'union':
                perturbed_seq = perturbed_seq_
            else:
                raise NotImplementedError(self.log_type)

            mask_indices = perturbed_seq == self.mask_token_id
            logits = self.get_logits(perturbed_seq, prompt_index)
            # Importance weighting: divide each masked token's loss by its masking probability.
            loss = F.cross_entropy(logits[mask_indices], seq[mask_indices], reduction='none') / p_mask[mask_indices]
            loss = loss.sum() / self.batch_size
            loss_acc.append(loss.item())

        return sum(loss_acc) / len(loss_acc)
1045
+
1046
    @torch.no_grad()
    def _eval_target_nll_ar(self, prefix, target):
        """
        Exact autoregressive-style NLL of the scored span.

        Builds one row per scored position, with a triangular masking pattern
        (triu for ar_ftb, tril for ar_btf) so each row reveals one more token,
        then sums the cross-entropy of each position's prediction. Rows are
        evaluated in mini-batches of self.batch_size.
        """
        prefix, target = prefix.unsqueeze(0), target.unsqueeze(0)  # 1*l1, 1*l2
        assert self.log_type in ['ftb', 'btf']
        assert self.nll_type in ['ar_ftb', 'ar_btf']

        if self.log_type == 'ftb':
            prompt_index = torch.arange(prefix.shape[1] + target.shape[1], device=self.device) < prefix.shape[1]
        else:
            prompt_index = torch.arange(prefix.shape[1] + target.shape[1], device=self.device) >= prefix.shape[1]

        # One row per position of the scored span (target for ftb, prefix for btf).
        if self.log_type == 'ftb':
            perturbed_ = target.repeat(target.shape[1], 1).clone().contiguous()  # l2*l2
        else:
            perturbed_ = prefix.repeat(prefix.shape[1], 1).clone().contiguous()  # l1*l1

        # Triangular mask: row i hides positions >= i (ar_ftb) or <= i (ar_btf).
        mask_index = torch.ones((perturbed_.shape[1], perturbed_.shape[1]), dtype=torch.bool)
        if self.nll_type == 'ar_ftb':
            mask_index = torch.triu(mask_index)
        else:
            mask_index = torch.tril(mask_index)
        perturbed_[mask_index] = self.mask_token_id
        if self.log_type == 'ftb':
            perturbed_seq = torch.cat([prefix.repeat(perturbed_.shape[0], 1), perturbed_], dim=-1)
        else:
            perturbed_seq = torch.cat([perturbed_, target.repeat(perturbed_.shape[0], 1)], dim=-1)

        # Forward the rows in chunks of batch_size, collecting logits on CPU.
        logits_ = []
        num = len(perturbed_seq) // self.batch_size if len(perturbed_seq) % self.batch_size == 0 else len(perturbed_seq) // self.batch_size + 1
        for i in range(num):
            end = (i + 1) * self.batch_size if (i + 1) * self.batch_size < len(perturbed_seq) else len(perturbed_seq)
            perturbed_seq_ = perturbed_seq[i * self.batch_size: end]
            perturbed_seq_ = perturbed_seq_.to(self.device)
            if len(perturbed_seq_.shape) == 1:
                perturbed_seq_ = perturbed_seq_.unsqueeze(0)
            logits = self.get_logits(perturbed_seq_, prompt_index)
            logits_.append(logits.cpu())
        logits = torch.cat(logits_, dim=0)

        # Keep only the diagonal of the triangular mask: the single position
        # each row is responsible for predicting.
        temp_index = torch.ones((perturbed_.shape[1], perturbed_.shape[1]), dtype=torch.bool)
        if self.nll_type == 'ar_ftb':
            temp_index = torch.triu(temp_index, diagonal=1)
        else:
            temp_index = torch.tril(temp_index, diagonal=-1)
        mask_index[temp_index] = False
        if self.log_type == 'ftb':
            logits_index = torch.cat([torch.zeros((perturbed_.shape[1], prefix.shape[1]), dtype=torch.bool), mask_index], dim=-1)
        else:
            logits_index = torch.cat([mask_index, torch.zeros((perturbed_.shape[1], target.shape[1]), dtype=torch.bool)], dim=-1)

        if self.log_type == 'ftb':
            loss = F.cross_entropy(logits[logits_index], target[0], reduction='sum').cpu().item()
        else:
            loss = F.cross_entropy(logits[logits_index], prefix[0], reduction='sum').cpu().item()
        return loss
1101
+
1102
+ def _encode_pair(self, context, continuation):
1103
+ if self.add_bos_token:
1104
+ context = self.tokenizer.bos_token + context
1105
+
1106
+ n_spaces = len(context) - len(context.rstrip())
1107
+ if n_spaces > 0:
1108
+ continuation = context[-n_spaces:] + continuation
1109
+ context = context[:-n_spaces]
1110
+
1111
+ whole_enc = self.tokenizer.encode(context + continuation) + [self.tokenizer.eos_token_id]
1112
+ context_enc = self.tokenizer.encode(context)
1113
+
1114
+ context_enc_len = len(context_enc)
1115
+ continuation_enc = whole_enc[context_enc_len:]
1116
+
1117
+ # by default truncate on the left
1118
+ cutoff_length = max(len(whole_enc) - self.max_length, 0)
1119
+ if cutoff_length > 0:
1120
+ eval_logger.warning(f"Text length {len(whole_enc)} is larger than {self.max_length}, cutoff on the left side")
1121
+ context_remain = context_enc_len-cutoff_length
1122
+ if context_remain > 0:
1123
+ context_enc = context_enc[-context_remain:]
1124
+ else:
1125
+ eval_logger.warning(f"All context (prompt) is truncated.")
1126
+ context_enc = ""
1127
+ continuation_enc = whole_enc[-self.max_length:]
1128
+ return context_enc, continuation_enc
1129
+
1130
    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        """
        Score lm-eval-harness loglikelihood requests.

        Each request carries (prefix, target) strings; pairs are tokenized via
        ``_encode_pair`` and scored with either the Monte-Carlo estimator
        ('mc') or the exact autoregressive-style estimator ('ar_ftb'/'ar_btf').

        Returns a list of (log-likelihood, is_greedy) tuples; is_greedy is
        currently always 0.0 (greedy check not implemented).
        """
        def _tokenize(e):
            # Attach token ids alongside the raw text for each example.
            prefix, target = self._encode_pair(e["prefix"], e["target"])
            return {
                "prefix_text": e["prefix"],
                "target_text": e["target"],
                "prefix": prefix,
                "target": target,
            }

        ds = []
        ds = [{"prefix": req.args[0], "target": req.args[1]} for req in requests]
        ds = Dataset.from_list(ds)
        # NOTE(review): debug print of the first example — consider removing.
        print(ds[0])
        ds = ds.map(_tokenize)
        ds = ds.with_format("torch")

        out = []
        with torch.no_grad():
            for elem in tqdm(ds, desc="Computing likelihood..."):
                prefix = elem["prefix"]
                target = elem["target"]
                # likelihood calculations are modified from https://github.com/ML-GSAI/SMDM/blob/main/evaluate_diff.py
                if self.nll_type == 'mc':
                    ll = -self._eval_target_nll_mc(prefix, target)
                    if self.log_type == 'union':
                        ll = ll / (len(target) + len(prefix))
                elif self.nll_type == 'ar_ftb' or self.nll_type == 'ar_btf':
                    ll = -self._eval_target_nll_ar(prefix, target)
                else:
                    raise NotImplementedError(self.nll_type)

                # TODO: greedy decoding
                is_target_greedy_dec = False

                out.append((ll, 1.0 if is_target_greedy_dec else 0.0))
        return out
1167
+
1168
    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
        """Rolling (windowed) loglikelihood is not supported by this wrapper."""
        raise NotImplementedError
1170
+
1171
    def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
        """Token-level loglikelihood batching is not supported by this wrapper."""
        raise NotImplementedError
1173
+
1174
+
1175
+ def _update_block_completion_states(self, block_states, decoded_token_threshold):
1176
+ """
1177
+ Updates the complete/incomplete state of blocks.
1178
+ Iterates through blocks from front to back. If a block's decoded token count exceeds the threshold, the next block to its right (if it exists) is set to a complete state.
1179
+ """
1180
+ for block_id in sorted(block_states.keys()):
1181
+ # if block_id == 0: # Skip prompt block
1182
+ # continue
1183
+
1184
+ # Calculate decoded tokens for the current block
1185
+ decoded_tokens = block_states[block_id]['total_masks'] - block_states[block_id]['mask_count']
1186
+ decode_ratio = decoded_tokens / block_states[block_id]['total_masks']
1187
+ # If current block's decoded token count exceeds the threshold, the next block (if exists) is set to a complete state
1188
+ # print("decode_ratio",decode_ratio)
1189
+ # print("decoded_token_threshold",decoded_token_threshold)
1190
+ if decode_ratio >= decoded_token_threshold:
1191
+ next_block_id = block_id + 1
1192
+ if next_block_id in block_states:
1193
+ block_states[next_block_id]['is_complete'] = True
1194
+
1195
+
1196
if __name__ == "__main__":
    # Fix the RNG seed for reproducible evaluation runs, then hand control to
    # the evaluation-harness CLI entry point.
    set_seed(1234)
    cli_evaluate()
eval_llada.sh ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Batch evaluation driver for D2F LLaDA LoRA checkpoints.
# Runs HumanEval plus a configurable set of harness tasks for each LoRA model.
# Each per-task setting below is a space-separated list; position i of every
# list configures run i, so all lists must have the same length (checked below).

tasks="gsm8k mbpp minerva_math"
nshots="4 3 0"
lengths="512 512 512"
temperatures="0 0 0"
limits="10000 10000 10000"
block_sizes="64 32 32"
block_add_thresholds="0.7 0.9 0.1"
decoded_token_thresholds="0.95 0.95 0.95"
skip_thresholds="0.9 0.9 0.9"
top_ps="none none none"
dtypes="bfloat16 bfloat16 bfloat16"
sampling_strategies="default default default"

# HumanEval has its own (single-run) configuration lists.
humaneval_nshots="0"
humaneval_lengths="512"
humaneval_temperatures="0"
humaneval_limits="10000"
humaneval_diffusion_steps="512"
humaneval_block_sizes="32"
humaneval_block_add_thresholds="0.1"
humaneval_decoded_token_thresholds="0.95"
humaneval_skip_thresholds="0.9"
humaneval_top_ps="none"
humaneval_dtypes="bfloat16"
humaneval_sampling_strategies="default"

# Base model and the LoRA adapters to evaluate on top of it.
base_model=GSAI-ML/LLaDA-8B-Instruct

lora_models=(
    "SJTU-Deng-Lab/D2F_LLaDA_Instruct_8B_Lora"
)

# Split the space-separated config strings into bash arrays.
read -ra TASKS_ARRAY <<< "$tasks"
read -ra NSHOTS_ARRAY <<< "$nshots"
read -ra LENGTH_ARRAY <<< "$lengths"
read -ra TEMP_ARRAY <<< "$temperatures"
read -ra LIMITS_ARRAY <<< "$limits"
read -ra BLOCK_SIZES_ARRAY <<< "$block_sizes"
read -ra BLOCK_ADD_THRESHOLDS_ARRAY <<< "$block_add_thresholds"
read -ra DECODED_TOKEN_THRESHOLDS_ARRAY <<< "$decoded_token_thresholds"
read -ra SKIP_THRESHOLDS_ARRAY <<< "$skip_thresholds"
read -ra TOP_PS_ARRAY <<< "$top_ps"
read -ra DTYPES_ARRAY <<< "$dtypes"
read -ra SAMPLING_STRATEGIES_ARRAY <<< "$sampling_strategies"

read -ra HUMANEVAL_NSHOTS_ARRAY <<< "$humaneval_nshots"
read -ra HUMANEVAL_LENGTHS_ARRAY <<< "$humaneval_lengths"
read -ra HUMANEVAL_TEMP_ARRAY <<< "$humaneval_temperatures"
read -ra HUMANEVAL_LIMITS_ARRAY <<< "$humaneval_limits"
read -ra HUMANEVAL_DIFFUSION_STEPS_ARRAY <<< "$humaneval_diffusion_steps"
read -ra HUMANEVAL_BLOCK_SIZES_ARRAY <<< "$humaneval_block_sizes"
read -ra HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY <<< "$humaneval_block_add_thresholds"
read -ra HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY <<< "$humaneval_decoded_token_thresholds"
read -ra HUMANEVAL_SKIP_THRESHOLDS_ARRAY <<< "$humaneval_skip_thresholds"
read -ra HUMANEVAL_TOP_PS_ARRAY <<< "$humaneval_top_ps"
read -ra HUMANEVAL_DTYPES_ARRAY <<< "$humaneval_dtypes"
read -ra HUMANEVAL_SAMPLING_STRATEGIES_ARRAY <<< "$humaneval_sampling_strategies"

# Sanity check: every per-task list must be the same length as the task list.
array_length=${#TASKS_ARRAY[@]}
if [[ ${#NSHOTS_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#LENGTH_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#TEMP_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#LIMITS_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#BLOCK_SIZES_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#BLOCK_ADD_THRESHOLDS_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#DECODED_TOKEN_THRESHOLDS_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#SKIP_THRESHOLDS_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#TOP_PS_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#SAMPLING_STRATEGIES_ARRAY[@]} -ne $array_length ]] || \
   [[ ${#DTYPES_ARRAY[@]} -ne $array_length ]]; then
    echo "Error: All configuration arrays must have the same length!"
    echo "Tasks: ${#TASKS_ARRAY[@]}, Nshots: ${#NSHOTS_ARRAY[@]}, Lengths: ${#LENGTH_ARRAY[@]}, Temperatures: ${#TEMP_ARRAY[@]}, Limits: ${#LIMITS_ARRAY[@]}, Block sizes: ${#BLOCK_SIZES_ARRAY[@]}, Block thresholds: ${#BLOCK_ADD_THRESHOLDS_ARRAY[@]}, Decoded token thresholds: ${#DECODED_TOKEN_THRESHOLDS_ARRAY[@]}, Skip thresholds: ${#SKIP_THRESHOLDS_ARRAY[@]}, Top_ps: ${#TOP_PS_ARRAY[@]}, Sampling strategies: ${#SAMPLING_STRATEGIES_ARRAY[@]}, Dtypes: ${#DTYPES_ARRAY[@]}"
    exit 1
fi

# Same sanity check for the HumanEval configuration lists.
humaneval_array_length=${#HUMANEVAL_NSHOTS_ARRAY[@]}
if [[ ${#HUMANEVAL_LENGTHS_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_TEMP_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_LIMITS_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_DIFFUSION_STEPS_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_BLOCK_SIZES_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_SKIP_THRESHOLDS_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_TOP_PS_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_DTYPES_ARRAY[@]} -ne $humaneval_array_length ]] || \
   [[ ${#HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[@]} -ne $humaneval_array_length ]]; then
    echo "Error: All HumanEval configuration arrays must have the same length!"
    echo "HumanEval Nshots: ${#HUMANEVAL_NSHOTS_ARRAY[@]}, Lengths: ${#HUMANEVAL_LENGTHS_ARRAY[@]}, Temperatures: ${#HUMANEVAL_TEMP_ARRAY[@]}, Limits: ${#HUMANEVAL_LIMITS_ARRAY[@]}, Diffusion steps: ${#HUMANEVAL_DIFFUSION_STEPS_ARRAY[@]}, Block sizes: ${#HUMANEVAL_BLOCK_SIZES_ARRAY[@]}, Block thresholds: ${#HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[@]}, Decoded token thresholds: ${#HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[@]}, Skip thresholds: ${#HUMANEVAL_SKIP_THRESHOLDS_ARRAY[@]}, Top_ps: ${#HUMANEVAL_TOP_PS_ARRAY[@]}, Dtypes: ${#HUMANEVAL_DTYPES_ARRAY[@]}, Sampling strategies: ${#HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[@]}"
    exit 1
fi

# Allow HumanEval's generated-code execution inside the harness.
export HF_ALLOW_CODE_EVAL=1
for lora_model in "${lora_models[@]}"; do
    lora_model_name="$lora_model"
    echo "===================================================================="
    echo "Evaluating LoRA model: $lora_model_name"
    echo "===================================================================="

    # --- HumanEval runs (no chat template; escape_until enabled) ---
    for i in "${!HUMANEVAL_NSHOTS_ARRAY[@]}"; do
        output_path="eval_llada${lora_model_name}/humaneval-ns${HUMANEVAL_NSHOTS_ARRAY[$i]}-len${HUMANEVAL_LENGTHS_ARRAY[$i]}-temp${HUMANEVAL_TEMP_ARRAY[$i]}-limit${HUMANEVAL_LIMITS_ARRAY[$i]}-diffsteps${HUMANEVAL_DIFFUSION_STEPS_ARRAY[$i]}-block${HUMANEVAL_BLOCK_SIZES_ARRAY[$i]}-thresh${HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[$i]}-decodethresh${HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[$i]}-skip${HUMANEVAL_SKIP_THRESHOLDS_ARRAY[$i]}-topp${HUMANEVAL_TOP_PS_ARRAY[$i]}-dtype${HUMANEVAL_DTYPES_ARRAY[$i]}-sampling${HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[$i]}"
        echo "Running HumanEval evaluation $((i+1))/${humaneval_array_length} for $lora_model_name..."
        echo "HumanEval Config: Shots: ${HUMANEVAL_NSHOTS_ARRAY[$i]}, Length: ${HUMANEVAL_LENGTHS_ARRAY[$i]}, Temperature: ${HUMANEVAL_TEMP_ARRAY[$i]}, Limit: ${HUMANEVAL_LIMITS_ARRAY[$i]}, Diffusion Steps: ${HUMANEVAL_DIFFUSION_STEPS_ARRAY[$i]}, Block Size: ${HUMANEVAL_BLOCK_SIZES_ARRAY[$i]}, Block Add Threshold: ${HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[$i]}, Decoded Token Threshold: ${HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[$i]}, Skip Threshold: ${HUMANEVAL_SKIP_THRESHOLDS_ARRAY[$i]}, Top_p: ${HUMANEVAL_TOP_PS_ARRAY[$i]}, Sampling Strategy: ${HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[$i]}, Dtype: ${HUMANEVAL_DTYPES_ARRAY[$i]}; Output: $output_path"

        # "none" top_p means the sampler's default: omit top_p from model_args.
        if [[ "${HUMANEVAL_TOP_PS_ARRAY[$i]}" == "none" ]]; then
            humaneval_model_args="pretrained=${base_model},lora_path=${lora_model},max_new_tokens=${HUMANEVAL_LENGTHS_ARRAY[$i]},diffusion_steps=${HUMANEVAL_DIFFUSION_STEPS_ARRAY[$i]},temperature=${HUMANEVAL_TEMP_ARRAY[$i]},add_bos_token=true,escape_until=true,block_size=${HUMANEVAL_BLOCK_SIZES_ARRAY[$i]},block_add_threshold=${HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[$i]},skip_threshold=${HUMANEVAL_SKIP_THRESHOLDS_ARRAY[$i]},decoded_token_threshold=${HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[$i]},dtype=${HUMANEVAL_DTYPES_ARRAY[$i]},sampling_strategy=${HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[$i]},save_dir=${output_path}"
        else
            humaneval_model_args="pretrained=${base_model},lora_path=${lora_model},max_new_tokens=${HUMANEVAL_LENGTHS_ARRAY[$i]},diffusion_steps=${HUMANEVAL_DIFFUSION_STEPS_ARRAY[$i]},temperature=${HUMANEVAL_TEMP_ARRAY[$i]},top_p=${HUMANEVAL_TOP_PS_ARRAY[$i]},add_bos_token=true,escape_until=true,block_size=${HUMANEVAL_BLOCK_SIZES_ARRAY[$i]},block_add_threshold=${HUMANEVAL_BLOCK_ADD_THRESHOLDS_ARRAY[$i]},skip_threshold=${HUMANEVAL_SKIP_THRESHOLDS_ARRAY[$i]},decoded_token_threshold=${HUMANEVAL_DECODED_TOKEN_THRESHOLDS_ARRAY[$i]},dtype=${HUMANEVAL_DTYPES_ARRAY[$i]},sampling_strategy=${HUMANEVAL_SAMPLING_STRATEGIES_ARRAY[$i]},save_dir=${output_path}"
        fi

        CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --main_process_port 29520 --num_processes 8 eval_llada.py --model dream_lora \
            --model_args $humaneval_model_args \
            --tasks humaneval \
            --num_fewshot ${HUMANEVAL_NSHOTS_ARRAY[$i]} \
            --batch_size 1 \
            --output_path $output_path \
            --log_samples \
            --confirm_run_unsafe_code
    done

    ### NOTICE: use postprocess for humaneval
    # python postprocess_code.py {the samples_xxx.jsonl file under output_path}

    # --- Remaining harness tasks (chat template + multiturn fewshot) ---
    for i in "${!TASKS_ARRAY[@]}"; do
        output_path="eval_llada${lora_model_name}/${TASKS_ARRAY[$i]}-ns${NSHOTS_ARRAY[$i]}-len${LENGTH_ARRAY[$i]}-temp${TEMP_ARRAY[$i]}-limit${LIMITS_ARRAY[$i]}-diffsteps${LENGTH_ARRAY[$i]}-block${BLOCK_SIZES_ARRAY[$i]}-thresh${BLOCK_ADD_THRESHOLDS_ARRAY[$i]}-decodethresh${DECODED_TOKEN_THRESHOLDS_ARRAY[$i]}-skip${SKIP_THRESHOLDS_ARRAY[$i]}-topp${TOP_PS_ARRAY[$i]}-dtype${DTYPES_ARRAY[$i]}-sampling${SAMPLING_STRATEGIES_ARRAY[$i]}"
        echo "Task: ${TASKS_ARRAY[$i]}, Shots: ${NSHOTS_ARRAY[$i]}, Length: ${LENGTH_ARRAY[$i]}, Temperature: ${TEMP_ARRAY[$i]}, Limit: ${LIMITS_ARRAY[$i]}, Block Size: ${BLOCK_SIZES_ARRAY[$i]}, Block Add Threshold: ${BLOCK_ADD_THRESHOLDS_ARRAY[$i]}, Decoded Token Threshold: ${DECODED_TOKEN_THRESHOLDS_ARRAY[$i]}, Skip Threshold: ${SKIP_THRESHOLDS_ARRAY[$i]}, Top_p: ${TOP_PS_ARRAY[$i]}, Sampling Strategy: ${SAMPLING_STRATEGIES_ARRAY[$i]}, Dtype: ${DTYPES_ARRAY[$i]}; Output: $output_path"

        # "none" top_p means the sampler's default: omit top_p from model_args.
        if [[ "${TOP_PS_ARRAY[$i]}" == "none" ]]; then
            model_args="pretrained=${base_model},lora_path=${lora_model},max_new_tokens=${LENGTH_ARRAY[$i]},diffusion_steps=${LENGTH_ARRAY[$i]},add_bos_token=true,temperature=${TEMP_ARRAY[$i]},block_size=${BLOCK_SIZES_ARRAY[$i]},block_add_threshold=${BLOCK_ADD_THRESHOLDS_ARRAY[$i]},skip_threshold=${SKIP_THRESHOLDS_ARRAY[$i]},decoded_token_threshold=${DECODED_TOKEN_THRESHOLDS_ARRAY[$i]},dtype=${DTYPES_ARRAY[$i]},sampling_strategy=${SAMPLING_STRATEGIES_ARRAY[$i]},save_dir=${output_path}"
        else
            model_args="pretrained=${base_model},lora_path=${lora_model},max_new_tokens=${LENGTH_ARRAY[$i]},diffusion_steps=${LENGTH_ARRAY[$i]},add_bos_token=true,temperature=${TEMP_ARRAY[$i]},top_p=${TOP_PS_ARRAY[$i]},block_size=${BLOCK_SIZES_ARRAY[$i]},block_add_threshold=${BLOCK_ADD_THRESHOLDS_ARRAY[$i]},skip_threshold=${SKIP_THRESHOLDS_ARRAY[$i]},decoded_token_threshold=${DECODED_TOKEN_THRESHOLDS_ARRAY[$i]},dtype=${DTYPES_ARRAY[$i]},sampling_strategy=${SAMPLING_STRATEGIES_ARRAY[$i]},save_dir=${output_path}"
        fi

        CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --main_process_port 29520 --num_processes 8 eval_llada.py --model dream_lora \
            --model_args $model_args \
            --tasks ${TASKS_ARRAY[$i]} \
            --limit ${LIMITS_ARRAY[$i]} \
            --num_fewshot ${NSHOTS_ARRAY[$i]} \
            --batch_size 1 \
            --output_path $output_path \
            --log_samples \
            --confirm_run_unsafe_code \
            --apply_chat_template \
            --fewshot_as_multiturn
    done
done

echo "All evaluations completed!"
generate_llada_demo_ar.py ADDED
@@ -0,0 +1,660 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.distributions as dists
4
+ import transformers
5
+ from transformers import AutoTokenizer, AutoModelForCausalLM
6
+ from peft import PeftModel, PeftConfig
7
+ import numpy as np
8
+ import random
9
+ import time
10
+ import os
11
+ from typing import List, Dict, Optional, Tuple, Iterator, Set
12
+ import gradio as gr
13
+ import gc
14
+
15
+ # Suppress some Hugging Face warnings
16
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
17
+
18
+ # Import necessary model classes
19
+ # Assuming these custom classes are in the correct path
20
+ from model_cache.llada.modeling_llada import LLaDAModelLM
21
+ from model_cache.llada.configuration_llada import LLaDAConfig
22
+
23
+ # --- Helper Functions (Unchanged) ---
24
def set_seed(seed):
    """Seed every RNG in use (torch CPU/CUDA, stdlib random, numpy) for reproducibility."""
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # Force deterministic cuDNN kernels; disables autotuning for repeatability.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
27
+
28
def create_full_block_attention_mask(prompt_length, max_length, block_size, device=None, dtype=None):
    """Build a (1, 1, max_length, max_length) additive attention bias for
    block-wise diffusion decoding.

    Prompt tokens attend bidirectionally within the prompt; each generated
    block attends to the prompt, all earlier blocks, and itself (a
    block-causal pattern). Allowed positions hold 0, disallowed -inf.

    Args:
        prompt_length: number of prompt tokens at the start of the sequence.
        max_length: total sequence length the mask covers.
        block_size: size of each generated block after the prompt.
        device: torch device for the mask (default: current default device).
        dtype: mask dtype; defaults to torch.bfloat16.

    Returns:
        Additive bias tensor of shape (1, 1, max_length, max_length).
    """
    if dtype is None:
        dtype = torch.bfloat16
    attention_mask = torch.full((1, 1, max_length, max_length), -torch.inf, device=device, dtype=dtype)
    # Prompt region: full bidirectional attention.
    attention_mask[:, :, :prompt_length, :prompt_length] = 0
    remaining_length = max_length - prompt_length
    num_blocks = (remaining_length + block_size - 1) // block_size  # ceil division
    for b in range(num_blocks):
        block_start = prompt_length + b * block_size
        block_end = min(prompt_length + (b + 1) * block_size, max_length)
        # The prompt, every previous block, and this block form the single
        # contiguous span [0, block_end), so one slice assignment replaces the
        # original per-previous-block inner loop (O(B) instead of O(B^2)).
        attention_mask[:, :, block_start:block_end, :block_end] = 0
    return attention_mask
42
+
43
def extract_attention_mask(full_mask, start_pos, input_length, cache_length):
    """Slice the precomputed full bias down to the sub-mask for one forward pass.

    The result has `input_length` query rows over `cache_length` cached key
    positions followed by the current input span's own `input_length` keys.
    """
    end_pos = start_pos + input_length
    total_length = cache_length + input_length
    sub_mask = torch.full(
        (1, 1, input_length, total_length),
        -torch.inf,
        device=full_mask.device,
        dtype=full_mask.dtype,
    )
    # Keys [0, cache_length) come from the cached prefix; the remainder is the
    # square self-attention region of the current input span.
    sub_mask[:, :, :, :cache_length] = full_mask[:, :, start_pos:end_pos, :cache_length]
    sub_mask[:, :, :, cache_length:] = full_mask[:, :, start_pos:end_pos, start_pos:end_pos]
    return sub_mask
49
+
50
def top_p_logits(logits, top_p=None):
    """Nucleus (top-p) filtering: mask every logit outside the smallest token
    set whose cumulative probability exceeds `top_p`. The top-1 token always
    survives. Masked entries are set to the dtype's minimum value.
    """
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    drop_sorted = cum_probs > top_p
    # Shift right by one so the token that first crosses the threshold is kept.
    drop_sorted[..., 1:] = drop_sorted[..., :-1].clone()
    drop_sorted[..., 0] = 0
    # Map the drop decisions back from sorted order to vocabulary order.
    drop = torch.zeros_like(logits, dtype=torch.bool, device=logits.device)
    drop = drop.scatter_(-1, sorted_indices, drop_sorted)
    return logits.masked_fill(drop, torch.finfo(logits.dtype).min)
60
+
61
def top_k_logits(logits, top_k=None):
    """Keep only the `top_k` largest logits per row; mask the rest to the
    dtype's minimum value. `top_k` is clamped to the vocabulary size.
    """
    k = min(top_k, logits.size(-1))
    # k-th largest value per row, kept broadcastable against `logits`.
    kth_value = torch.topk(logits, k)[0][..., -1, None]
    return logits.masked_fill(logits < kth_value, torch.finfo(logits.dtype).min)
66
+
67
def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confidence=False, neg_entropy=False):
    """Sample one token per position and report a per-position confidence.

    With temperature > 0, samples from the (temperature/top-p/top-k filtered)
    categorical distribution; otherwise takes the argmax.

    Args:
        logits: (positions, vocab) tensor of unnormalized scores.
        temperature: softmax temperature; 0 means greedy.
        top_p: optional nucleus-filtering threshold (applied when < 1).
        top_k: optional top-k filtering.
        margin_confidence: report top1-top2 probability gap as confidence.
        neg_entropy: report negative entropy as confidence (overrides margin).

    Returns:
        (confidence, x0, initial_confidence) where x0 are the sampled token
        ids and initial_confidence is always the sampled token's probability.
    """
    if temperature > 0:
        logits = logits / temperature
    if top_p is not None and top_p < 1:
        logits = top_p_logits(logits, top_p)
    if top_k is not None:
        logits = top_k_logits(logits, top_k)
    probs = torch.softmax(logits, dim=-1)
    if temperature > 0:
        try:
            x0 = dists.Categorical(probs=probs).sample()
            initial_confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
        # BUGFIX: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit. Keep the greedy fallback for genuine sampling errors
        # (e.g. invalid probabilities) but only for Exception subclasses.
        except Exception:
            initial_confidence, x0 = probs.max(dim=-1)
    else:
        initial_confidence, x0 = probs.max(dim=-1)
    confidence = initial_confidence.clone()
    if margin_confidence:
        sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
        # Gap between the two most likely tokens.
        confidence = sorted_probs[:, 0] - sorted_probs[:, 1]
    if neg_entropy:
        epsilon = 1e-10
        # Negative entropy: higher (closer to 0) means a peakier distribution.
        confidence = torch.sum(probs * torch.log(probs + epsilon), dim=-1)
    return confidence, x0, initial_confidence
86
+
87
+
88
class D2FInference:
    """Block-wise parallel ("diffusion") decoder for LLaDA with a D2F LoRA adapter.

    Generation appends fixed-size blocks of mask tokens after the prompt and
    repeatedly denoises them. Several blocks may be partially decoded at once
    ("active"); once every earlier block is finished, a block is frozen into
    the KV cache ('to_cache' -> 'in_cache') so later steps only re-process the
    still-active suffix.

    Constructor kwargs are injected as attributes via __dict__.update; the
    code below reads: pretrained_path, lora_path, device, dtype, max_length,
    temperature, top_p, top_k, mask_token_id.
    """

    # CSS injected into the Gradio demo: model headers, the stats card, and
    # tall auto-scrolling output textboxes.
    CSS = """
    .gradio-container {
        font-family: -apple-system, BlinkMacSystemFont, sans-serif;
    }
    .model-header {
        font-size: 1.2em;
        font-weight: bold;
        margin-bottom: 10px;
        padding: 8px;
        border-radius: 5px;
        text-align: center;
    }
    .d2f-header {
        background-color: #DBEAFE;
        color: #1E40AF;
    }
    .llama-header {
        background-color: #FEF3C7;
        color: #92400E;
    }
    .stats-container {
        padding: 15px;
        border: 1px solid #10B981;
        border-radius: 8px;
        background-color: #F0FDF4;
        margin-top: 10px;
        margin-bottom: 20px;
    }
    .output-textbox textarea {
        font-size: 1.5em !important;
        line-height: 1.6 !important;
        height: 70vh !important;
        overflow-y: auto !important;
    }
    """

    def __init__(self, **kwargs):
        """Resolve device/dtype from kwargs, then load model + tokenizer."""
        print("Initializing D2F-LLaDA model...")
        self.device = torch.device(kwargs.get("device", "cuda:3") if torch.cuda.is_available() else "cpu")
        # NOTE(review): this overwrites self.device with the raw string from
        # kwargs (e.g. "cuda:0"); torch accepts device strings, so it works,
        # but the torch.device() above is effectively dead when "device" is
        # passed — confirm intended.
        self.__dict__.update(kwargs)
        if self.dtype == "bfloat16" and torch.cuda.is_bf16_supported(): self.target_dtype = torch.bfloat16
        elif self.dtype == "float16": self.target_dtype = torch.float16
        else: self.target_dtype = torch.float32
        self._setup_model(self.pretrained_path, self.lora_path)
        print("D2F-LLaDA model and tokenizer setup complete.")

    def _setup_model(self, pretrained_path, lora_path):
        """Load the base LLaDA model, wrap it with the D2F LoRA adapter, and
        prepare the tokenizer (pad falls back to eos)."""
        config = LLaDAConfig.from_pretrained(pretrained_path)
        self.model = LLaDAModelLM.from_pretrained(pretrained_path, config=config, torch_dtype=self.target_dtype).eval()
        self.model = PeftModel.from_pretrained(self.model, lora_path)
        self.model = self.model.to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_path)
        if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token

    def _apply_chat_template(self, prompt: str) -> str:
        """Wrap a bare user prompt in the tokenizer's chat template."""
        chat_history = [{"role": "user", "content": prompt}]
        return self.tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)

    def _update_block_completion_states(self, block_states, decoded_token_threshold):
        """Mark block b+1 'is_complete' once block b has decoded at least
        `decoded_token_threshold` of its tokens (completion cascades in id order)."""
        for block_id in sorted(block_states.keys()):
            decoded_tokens = block_states[block_id]['total_masks'] - block_states[block_id]['mask_count']
            if block_states[block_id]['total_masks'] > 0:
                decode_ratio = decoded_tokens / block_states[block_id]['total_masks']
                if decode_ratio >= decoded_token_threshold:
                    if (next_block_id := block_id + 1) in block_states:
                        block_states[next_block_id]['is_complete'] = True

    @torch.inference_mode()
    def stream(
        self,
        prompt_text: str,
        max_new_tokens: int,
        block_size: int,
        block_add_threshold: float,
        decoded_token_threshold: float,
        skip_threshold: float
    ) -> Iterator[Tuple[str, Optional[Dict]]]:
        """Stream generation for one prompt.

        Yields (partial_text, None) after every denoising step, and finally
        (final_text, stats) where stats has total_time, tokens_generated, and
        tokens_per_second.
        """
        start_time = time.time()

        input_ids = self.tokenizer(self._apply_chat_template(prompt_text), return_tensors="pt").input_ids.to(self.device)
        prompt_length = input_ids.shape[1]

        # Precompute the full block-causal bias once; per-step masks are slices of it.
        full_attention_mask = create_full_block_attention_mask(prompt_length, self.max_length, block_size, self.device, self.target_dtype)
        x_t = input_ids
        # Block 0 is the prompt: fully decoded and immediately cacheable.
        block_states = {0: {'start_pos': 0, 'end_pos': prompt_length, 'mask_count': 0, 'total_masks': prompt_length, 'state': 'to_cache', 'is_complete': True}}
        past_key_values, current_blocks, step, eos_detected, cache_length = None, 0, 0, False, 0

        yield "", None

        tokens_generated = 0

        while True:
            step += 1
            updated_block_ids = set()

            # Phase 1: append a fresh block of masks once the newest block has
            # decoded at least `block_add_threshold` of its tokens.
            if len(block_states) - 1 < (max_new_tokens // block_size) and not eos_detected:
                last_block_id = max(block_states.keys())
                progress_ratio = (block_states[last_block_id]['total_masks'] - block_states[last_block_id]['mask_count']) / block_states[last_block_id]['total_masks'] if block_states[last_block_id]['total_masks'] > 0 else 1.0
                if progress_ratio >= block_add_threshold:
                    new_block_id = last_block_id + 1; new_start_pos = x_t.shape[1]
                    if new_start_pos + block_size <= self.max_length:
                        x_t = torch.cat([x_t, torch.full((1, block_size), self.mask_token_id, device=self.device, dtype=torch.long)], dim=1)
                        block_states[new_block_id] = {'start_pos': new_start_pos, 'end_pos': new_start_pos + block_size, 'mask_count': block_size, 'total_masks': block_size, 'state': 'active', 'is_complete': False}
                        current_blocks += 1

            self._update_block_completion_states(block_states, decoded_token_threshold)
            # Done when nothing is masked and no block remains active.
            if (x_t == self.mask_token_id).sum() == 0 and current_blocks == 0: break

            # Phase 2: pick the input span. If any finished block awaits
            # caching, process from its start so its KV entries get written;
            # otherwise process from the first active, uncached block.
            blocks_to_cache = [bid for bid, state in block_states.items() if state['state'] == 'to_cache']
            update_kvcache = 0
            if blocks_to_cache:
                start_pos, end_pos = block_states[min(blocks_to_cache)]['start_pos'], block_states[max(blocks_to_cache)]['end_pos']
                update_kvcache = end_pos - start_pos; input_seq, process_start_pos = x_t[:, start_pos:], start_pos
            else:
                active_blocks = [bid for bid, state in block_states.items() if state['state'] == 'active' and state['start_pos'] >= cache_length]
                if not active_blocks: break
                start_pos = min(block_states[bid]['start_pos'] for bid in active_blocks); input_seq, process_start_pos = x_t[:, start_pos:], start_pos

            if input_seq.shape[1] == 0: break

            attention_mask = extract_attention_mask(full_mask=full_attention_mask,
                                                    start_pos=process_start_pos,
                                                    input_length=input_seq.shape[1],
                                                    cache_length=cache_length)

            # update_kvcache tells the model how far (in absolute positions)
            # the KV cache should extend after this call.
            outputs = self.model(input_seq,
                                 attention_bias=attention_mask,
                                 past_key_values=past_key_values,
                                 use_cache=True,
                                 update_kvcache=update_kvcache + cache_length)

            if update_kvcache > 0:
                past_key_values = outputs.past_key_values
                for bid in blocks_to_cache:
                    block_states[bid]['state'] = 'in_cache'

            # Phase 3: for every active block, commit all masked positions
            # whose sampled-token confidence clears `skip_threshold`.
            blocks_to_deactivate = []
            for block_id, state in block_states.items():
                if state['state'] != 'active':
                    continue

                block_mask_locs = (x_t[0, state['start_pos']:state['end_pos']] == self.mask_token_id).nonzero().squeeze(-1)

                if block_mask_locs.numel() == 0:
                    blocks_to_deactivate.append(block_id)
                    continue

                # Translate absolute positions into offsets within this
                # forward pass's logits.
                logit_offset = state['start_pos'] - process_start_pos
                block_mask_logits = outputs.logits[:, logit_offset + block_mask_locs, :]
                _, x0, initial_confidence = sample_tokens(block_mask_logits.squeeze(0), self.temperature, self.top_p, self.top_k)
                all_indices = (initial_confidence > skip_threshold).nonzero().squeeze(-1)

                # A 'complete' block must make progress every step: if nothing
                # cleared the threshold, force-commit the single best token.
                if state['is_complete'] and all_indices.numel() == 0 and block_mask_logits.numel() > 0:
                    all_indices = torch.tensor([torch.argmax(initial_confidence)], device=self.device)

                if all_indices.numel() > 0:
                    updated_block_ids.add(block_id)
                    positions_to_update = state['start_pos'] + block_mask_locs[all_indices]
                    x_t[0, positions_to_update] = x0[all_indices]
                    state['mask_count'] -= all_indices.numel()
                    tokens_generated += all_indices.numel()

                    if self.tokenizer.eos_token_id in x0[all_indices]:
                        eos_detected = True

                if state['mask_count'] == 0:
                    blocks_to_deactivate.append(block_id)

            # Phase 4: a finished block becomes cacheable only once no earlier
            # block is still active (cache order must match position order).
            for bid in blocks_to_deactivate:
                if block_states[bid]['state'] == 'active' and all(block_states.get(i, {}).get('state') != 'active' for i in range(bid)):
                    block_states[bid]['state'] = 'to_cache'
                    current_blocks -= 1

            if update_kvcache > 0:
                cache_length += update_kvcache

            # Stream the text decoded so far (mask positions dropped).
            generated_ids = x_t[0, prompt_length:]
            valid_ids = generated_ids[generated_ids != self.mask_token_id]
            live_text = self.tokenizer.decode(valid_ids, skip_special_tokens=True)

            yield live_text, None

        total_time = time.time() - start_time
        final_generated_ids = x_t[0, prompt_length:]
        eos_positions = (final_generated_ids == self.tokenizer.eos_token_id).nonzero()

        # Truncate everything after (and including) the first EOS.
        if eos_positions.numel() > 0:
            final_generated_ids = final_generated_ids[:eos_positions[0, 0] + 1]

        final_text = self.tokenizer.decode(final_generated_ids, skip_special_tokens=True)

        # Throughput is measured over the truncated output (EOS included).
        tokens_incl_eos = len(final_generated_ids)
        tokens_per_second = tokens_incl_eos / total_time if total_time > 0 else 0

        stats = {
            "total_time": total_time,
            "tokens_generated": tokens_incl_eos,
            "tokens_per_second": tokens_per_second
        }

        # Release GPU memory held by the cache and the full mask.
        if past_key_values is not None:
            del past_key_values
        del full_attention_mask
        torch.cuda.empty_cache()

        yield final_text, stats
296
+
297
+
298
class LlamaInference:
    """Streaming autoregressive baseline used for speed comparison against D2F."""

    def __init__(self, **kwargs):
        """Resolve the device from kwargs and load the model/tokenizer.

        Config keys (e.g. model_id, device) are injected as attributes via
        __dict__.update, mirroring D2FInference's construction style.
        """
        print("Initializing LLaMA model...")
        self.device = torch.device(kwargs.get("device", "cuda:4") if torch.cuda.is_available() else "cpu")
        self.__dict__.update(kwargs)
        self._setup_model(self.model_id)
        print("LLaMA model and tokenizer setup complete.")

    def _setup_model(self, model_id):
        """Load tokenizer and weights; guarantee eos/pad tokens exist."""
        print(f"Loading LLaMA model {model_id} on {self.device}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
            device_map=self.device
        ).eval()

        if self.tokenizer.eos_token is None:
            self.tokenizer.eos_token = "</s>"

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def _apply_chat_template(self, prompt):
        """Wrap a bare user prompt in the tokenizer's chat template."""
        chat_history = [{"role": "user", "content": prompt}]
        return self.tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)

    @torch.inference_mode()
    def stream(
        self,
        prompt_text: str,
        max_new_tokens: int,
        temperature: float = 0.0,
        top_p: float = 0.9,
        top_k: int = None
    ) -> Iterator[Tuple[str, str]]:
        """Token-by-token streaming generation.

        Yields (partial_text, None) after each token, then (final_text, stats)
        with total_time / tokens_generated / tokens_per_second.

        BUGFIX: the original passed use_cache=True but never fed
        past_key_values back, so every step re-ran the model over the entire
        sequence (O(n) work per token, O(n^2) per generation). We now keep the
        KV cache and feed only the newly sampled token each step.
        """
        start_time = time.time()

        formatted_prompt = self._apply_chat_template(prompt_text)
        input_ids = self.tokenizer(formatted_prompt, return_tensors="pt").input_ids.to(self.device)
        prompt_length = input_ids.shape[1]

        yield "", None

        tokens_generated = 0
        generated_ids = input_ids.clone()
        past_key_values = None
        next_input = generated_ids  # full prompt on the first step, one token after

        for _ in range(max_new_tokens):
            outputs = self.model(next_input, past_key_values=past_key_values, use_cache=True)
            past_key_values = outputs.past_key_values
            next_token_logits = outputs.logits[:, -1, :]

            if temperature > 0:
                next_token_logits = next_token_logits / temperature
                if top_p is not None and top_p < 1:
                    next_token_logits = top_p_logits(next_token_logits, top_p)
                if top_k is not None:
                    next_token_logits = top_k_logits(next_token_logits, top_k)
                probs = torch.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                # Greedy decoding when temperature is 0.
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

            generated_ids = torch.cat([generated_ids, next_token], dim=-1)
            next_input = next_token
            tokens_generated += 1

            if next_token[0, 0].item() == self.tokenizer.eos_token_id:
                break

            generated_text = self.tokenizer.decode(
                generated_ids[0, prompt_length:],
                skip_special_tokens=True
            )

            yield generated_text, None

            del outputs

        total_time = time.time() - start_time
        tokens_per_second = tokens_generated / total_time if total_time > 0 else 0

        final_text = self.tokenizer.decode(generated_ids[0, prompt_length:], skip_special_tokens=True)

        stats = {
            "total_time": total_time,
            "tokens_generated": tokens_generated,
            "tokens_per_second": tokens_per_second
        }

        # Release the cache and working tensors before handing back control.
        del past_key_values
        del generated_ids
        torch.cuda.empty_cache()

        yield final_text, stats
394
+
395
+
396
+ # --- Comparison Helper Functions ---
397
def create_comparison_html(d2f_results, llama_results):
    """Render the side-by-side performance table for the two models.

    Args:
        d2f_results / llama_results: dicts with keys "tokens_generated",
            "total_time" (seconds), and "tokens_per_second".

    Returns:
        An HTML string for a gr.HTML component.
    """
    d_tokens = d2f_results["tokens_generated"]
    d_time = d2f_results["total_time"]
    d_tokens_per_sec = d2f_results["tokens_per_second"]

    a_tokens = llama_results["tokens_generated"]
    a_time = llama_results["total_time"]
    a_tokens_per_sec = llama_results["tokens_per_second"]

    if a_tokens_per_sec > 0:
        speedup = d_tokens_per_sec / a_tokens_per_sec
    else:
        speedup = 0

    # BUGFIX: the original ternary divided by a_time unconditionally in its
    # else-branch (ZeroDivisionError when a_time == 0) and claimed one side was
    # "0.0x faster" on ties. Guard both divisions and fall back to "-".
    if d_time > 0 and d_time < a_time:
        time_comparison = f"D2F-LLaDA is {a_time / d_time:.1f}x faster"
    elif a_time > 0 and a_time < d_time:
        time_comparison = f"LLaMA3 is {d_time / a_time:.1f}x faster"
    else:
        time_comparison = "-"

    if speedup > 1:
        speed_comparison = f"D2F-LLaDA is {speedup:.1f}x faster"
    elif 0 < speedup < 1:
        speed_comparison = f"LLaMA3 is {1 / speedup:.1f}x faster"
    else:
        speed_comparison = "-"

    comparison_html = f"""
    <div class="stats-container" style="background-color: #F9FAFB; border-color: #6366F1;">
        <h3>⚡ Performance Comparison</h3>
        <table style="width:100%; text-align: left; border-collapse: collapse;">
            <tr style="background-color: #EEF2FF;">
                <th style="padding: 8px; border: 1px solid #ddd;">Metric</th>
                <th style="padding: 8px; border: 1px solid #ddd;">D2F-LLaDA-Instruct-8B</th>
                <th style="padding: 8px; border: 1px solid #ddd;">LLaMA3-Instruct-8B</th>
                <th style="padding: 8px; border: 1px solid #ddd;">Difference</th>
            </tr>
            <tr>
                <td style="padding: 8px; border: 1px solid #ddd;">Total tokens</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{d_tokens}</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{a_tokens}</td>
                <td style="padding: 8px; border: 1px solid #ddd;">-</td>
            </tr>
            <tr>
                <td style="padding: 8px; border: 1px solid #ddd;">Generation time</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{d_time:.2f}s</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{a_time:.2f}s</td>
                <td style="padding: 8px; border: 1px solid #ddd;">
                    {time_comparison}
                </td>
            </tr>
            <tr>
                <td style="padding: 8px; border: 1px solid #ddd;">Tokens per second</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{d_tokens_per_sec:.2f}</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{a_tokens_per_sec:.2f}</td>
                <td style="padding: 8px; border: 1px solid #ddd;">
                    {speed_comparison}
                </td>
            </tr>
        </table>
    </div>
    """

    return comparison_html
448
+
449
+
450
def create_stats_html(model_name, results):
    """Render a small completion-summary card for one model's run.

    `results` carries "total_time", "tokens_generated", "tokens_per_second".
    """
    total_time = results["total_time"]
    n_tokens = results["tokens_generated"]
    tps = results["tokens_per_second"]
    return f"""
    <div class="stats-container">
        <h3>✓ {model_name} Generation Complete</h3>
        <ul>
            <li><b>Total time:</b> {total_time:.2f} seconds</li>
            <li><b>Tokens generated:</b> {n_tokens}</li>
            <li><b>Tokens per second:</b> {tps:.2f}</li>
        </ul>
    </div>
    """
463
+
464
+
465
+ # --- Main Interface ---
466
+ if __name__ == "__main__":
467
+ os.environ["CUDA_VISIBLE_DEVICES"] = "3,4"
468
+
469
+ torch.cuda.empty_cache()
470
+
471
+ d2f_config = {
472
+ "pretrained_path": "GSAI-ML/LLaDA-8B-Instruct",
473
+ "lora_path": "SJTU-Deng-Lab/D2F_LLaDA_Instruct_8B_Lora",
474
+ "device": "cuda:0",
475
+ "dtype": "bfloat16",
476
+ "max_length": 4096,
477
+ "temperature": 0.0,
478
+ "top_p": None,
479
+ "top_k": None,
480
+ "mask_token_id": 126336,
481
+ "sampling_strategy": "default",
482
+ }
483
+
484
+ llama_config = {
485
+ "model_id": "meta-llama/Llama-3.1-8B-Instruct",
486
+ "device": "cuda:1",
487
+ }
488
+
489
+ set_seed(42)
490
+
491
+ d2f_engine = D2FInference(**d2f_config)
492
+ llama_engine = LlamaInference(**llama_config)
493
+
494
+ with gr.Blocks(css=D2FInference.CSS, theme=gr.themes.Soft()) as demo:
495
+ gr.Markdown("# 🚀 D2F-LLaDA vs LLaMA3: Speed Comparison")
496
+
497
+ with gr.Row():
498
+ with gr.Column(scale=1):
499
+ prompt_input = gr.Textbox(
500
+ label="Enter your question",
501
+ placeholder="Example: Natalia sold clips to...",
502
+ lines=5
503
+ )
504
+ generate_button = gr.Button("🚀 Run Speed Comparison", variant="primary")
505
+
506
+ with gr.Accordion("⚙️ D2F-LLaDA Parameter Settings", open=True):
507
+ with gr.Row():
508
+ max_new_tokens_slider = gr.Slider(
509
+ minimum=64, maximum=2048, value=1024, step=64,
510
+ label="Max Tokens to Generate"
511
+ )
512
+ block_size_slider = gr.Slider(
513
+ minimum=16, maximum=128, value=32, step=16,
514
+ label="Block Size"
515
+ )
516
+ with gr.Row():
517
+ block_add_thresh_slider = gr.Slider(
518
+ minimum=0.0, maximum=1.0, value=0.1, step=0.05,
519
+ label="Block Add Threshold"
520
+ )
521
+ decoded_token_thresh_slider = gr.Slider(
522
+ minimum=0.0, maximum=1.0, value=0.5, step=0.05,
523
+ label="Decoding Completion Threshold"
524
+ )
525
+ skip_thresh_slider = gr.Slider(
526
+ minimum=0.0, maximum=1.0, value=0.9, step=0.01,
527
+ label="Skip Threshold"
528
+ )
529
+
530
+ comparison_output = gr.HTML(label="Performance Comparison", elem_id="comparison-container")
531
+
532
+ with gr.Row():
533
+ with gr.Column(scale=1):
534
+ gr.HTML("<div class='model-header d2f-header'>✨ D2F-LLaDA-Instruct-8B (Parallel Decoding)</div>")
535
+ d2f_output = gr.Textbox(
536
+ label="D2F-LLaDA Output",
537
+ interactive=False,
538
+ elem_classes=["output-textbox"]
539
+ )
540
+ d2f_status = gr.HTML(label="D2F-LLaDA Stats")
541
+
542
+ with gr.Column(scale=1):
543
+ gr.HTML("<div class='model-header llama-header'>🔄 LLaMA3-Instruct-8B (Standard)</div>")
544
+ llama_output = gr.Textbox(
545
+ label="LLaMA3 Output",
546
+ interactive=False,
547
+ elem_classes=["output-textbox"]
548
+ )
549
+ llama_status = gr.HTML(label="LLaMA3 Stats")
550
+
551
+ gr.Examples(
552
+ examples=[
553
+ ["Solve the equation x² - 6x + 8 = 0. First, explain what a quadratic equation is and why it can have up to two solutions. Then solve this equation using three different methods: factoring, completing the square, and the quadratic formula. For each method, explain the mathematical reasoning behind it, show all steps in detail, and discuss when this particular method is most useful. Finally, verify your solutions by substituting them back into the original equation.", 1024, 32, 0.1, 0.55, 0.9],
554
+ ["A circular swimming pool has a diameter of 8 meters. Calculate the pool's circumference and area. First, explain the relationship between diameter, radius, circumference, and area of a circle, including the role of π in these formulas. Then perform the calculations using π ≈ 3.14159. Next, estimate how much water (in cubic meters) would be needed to fill this pool if it has a uniform depth of 1.5 meters. Finally, calculate how much it would cost to fill this pool if water costs $2.50 per cubic meter. Show all steps and include appropriate units in your answer.", 1024, 32, 0.1, 0.5, 0.9],
555
+ ["A movie theater offers a loyalty card that costs $15 and gives a 15% discount on all tickets. If a regular movie ticket costs $10, how many tickets would you need to buy to make the loyalty card worthwhile? First, explain the concept of a break-even point. Then set up an equation to find when the total cost with the card equals the total cost without the card. Solve this equation step by step, showing all your work. Finally, interpret your answer in the context of the problem.", 1024, 32, 0.1, 0.5, 0.9],
556
+ ],
557
+ inputs=[
558
+ prompt_input, max_new_tokens_slider, block_size_slider,
559
+ block_add_thresh_slider, decoded_token_thresh_slider, skip_thresh_slider
560
+ ],
561
+ label="Examples (Math Problems)"
562
+ )
563
+
564
+ def run_models_streaming(
565
+ prompt_text,
566
+ max_new_tokens,
567
+ block_size,
568
+ block_add_threshold,
569
+ decoded_token_threshold,
570
+ skip_threshold
571
+ ):
572
+ torch.cuda.empty_cache()
573
+
574
+ d2f_generator = d2f_engine.stream(
575
+ prompt_text=prompt_text,
576
+ max_new_tokens=max_new_tokens,
577
+ block_size=block_size,
578
+ block_add_threshold=block_add_threshold,
579
+ decoded_token_threshold=decoded_token_threshold,
580
+ skip_threshold=skip_threshold
581
+ )
582
+
583
+ llama_generator = llama_engine.stream(
584
+ prompt_text=prompt_text,
585
+ max_new_tokens=max_new_tokens
586
+ )
587
+
588
+ d2f_text = ""
589
+ llama_text = ""
590
+ d2f_stats = None
591
+ llama_stats = None
592
+
593
+ yield d2f_text, llama_text, "", "", ""
594
+
595
+ d2f_done = False
596
+ llama_done = False
597
+
598
+ while not (d2f_done and llama_done):
599
+ if not d2f_done:
600
+ try:
601
+ new_d2f_text, new_d2f_stats = next(d2f_generator)
602
+ d2f_text = new_d2f_text
603
+ if new_d2f_stats is not None:
604
+ d2f_stats = new_d2f_stats
605
+ d2f_done = True
606
+ except StopIteration:
607
+ d2f_done = True
608
+
609
+ if not llama_done:
610
+ try:
611
+ new_llama_text, new_llama_stats = next(llama_generator)
612
+ llama_text = new_llama_text
613
+ if new_llama_stats is not None:
614
+ llama_stats = new_llama_stats
615
+ llama_done = True
616
+ except StopIteration:
617
+ llama_done = True
618
+
619
+ d2f_status_html = create_stats_html("D2F-LLaDA", d2f_stats) if d2f_stats else ""
620
+ llama_status_html = create_stats_html("LLaMA3", llama_stats) if llama_stats else ""
621
+
622
+ comparison = ""
623
+ if d2f_done and llama_done and d2f_stats and llama_stats:
624
+ comparison = create_comparison_html(d2f_stats, llama_stats)
625
+
626
+ yield d2f_text, llama_text, d2f_status_html, llama_status_html, comparison
627
+
628
+ # MODIFICATION: Removed the _js parameter from here
629
+ generate_button.click(
630
+ fn=run_models_streaming,
631
+ inputs=[
632
+ prompt_input, max_new_tokens_slider, block_size_slider,
633
+ block_add_thresh_slider, decoded_token_thresh_slider, skip_thresh_slider
634
+ ],
635
+ outputs=[
636
+ d2f_output, llama_output,
637
+ d2f_status, llama_status,
638
+ comparison_output
639
+ ]
640
+ )
641
+
642
+ # MODIFICATION: Added a hidden HTML component with a script for auto-scrolling
643
+ # This method is compatible with older Gradio versions.
644
+ gr.HTML(
645
+ """
646
+ <script>
647
+ function_to_run = () => {
648
+ const textboxes = document.querySelectorAll('.output-textbox textarea');
649
+ textboxes.forEach(textbox => {
650
+ textbox.scrollTop = textbox.scrollHeight;
651
+ });
652
+ }
653
+ // Run the function every 250ms to ensure autoscrolling
654
+ setInterval(function_to_run, 250);
655
+ </script>
656
+ """,
657
+ visible=False
658
+ )
659
+
660
+ demo.queue().launch(share=True)
generate_llada_demo_block.py ADDED
@@ -0,0 +1,630 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.distributions as dists
4
+ import transformers
5
+ from transformers import AutoTokenizer
6
+ from peft import PeftModel, PeftConfig
7
+ import numpy as np
8
+ import random
9
+ import time
10
+ import os
11
+ from typing import List, Dict, Optional, Tuple, Iterator, Set
12
+ import gradio as gr
13
+ import ipdb
14
+ # Suppress some Hugging Face warnings
15
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
16
+
17
+ # Import necessary model classes
18
+ from model_cache.llada.modeling_llada import LLaDAModelLM
19
+ from model_cache.llada.configuration_llada import LLaDAConfig
20
+
21
+ # --- Helper Functions (Unchanged) ---
22
def set_seed(seed):
    """Seed every RNG in use (torch CPU/CUDA, stdlib random, numpy) for reproducibility."""
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # Deterministic cuDNN kernels; autotuning disabled for repeatability.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
25
def create_full_block_attention_mask(prompt_length, max_length, block_size, device=None, dtype=None):
    """Build a (1, 1, max_length, max_length) additive attention bias for
    block-wise diffusion decoding: the prompt attends bidirectionally to
    itself, each generated block attends to the prompt, all earlier blocks,
    and itself. Allowed positions hold 0, disallowed -inf.
    """
    if dtype is None:
        dtype = torch.bfloat16
    attention_mask = torch.full((1, 1, max_length, max_length), -torch.inf, device=device, dtype=dtype)
    # Prompt region: full bidirectional attention.
    attention_mask[:, :, :prompt_length, :prompt_length] = 0
    remaining_length = max_length - prompt_length
    num_blocks = (remaining_length + block_size - 1) // block_size  # ceil division
    for b in range(num_blocks):
        block_start = prompt_length + b * block_size
        block_end = min(prompt_length + (b + 1) * block_size, max_length)
        # Prompt + previous blocks + this block form the contiguous span
        # [0, block_end); one slice assignment replaces the original
        # per-previous-block inner loop (O(B) instead of O(B^2)).
        attention_mask[:, :, block_start:block_end, :block_end] = 0
    return attention_mask
39
def extract_attention_mask(full_mask, start_pos, input_length, cache_length):
    """Slice the full bias to the sub-mask for one forward pass:
    `input_length` query rows over `cache_length` cached keys followed by the
    current span's own `input_length` keys.
    """
    end_pos = start_pos + input_length
    total_length = cache_length + input_length
    sub_mask = torch.full(
        (1, 1, input_length, total_length),
        -torch.inf,
        device=full_mask.device,
        dtype=full_mask.dtype,
    )
    # Cached-prefix keys first, then the square self-attention region.
    sub_mask[:, :, :, :cache_length] = full_mask[:, :, start_pos:end_pos, :cache_length]
    sub_mask[:, :, :, cache_length:] = full_mask[:, :, start_pos:end_pos, start_pos:end_pos]
    return sub_mask
45
def top_p_logits(logits, top_p=None):
    """Nucleus (top-p) filtering: suppress tokens outside the smallest set
    whose cumulative probability exceeds ``top_p``."""
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    drop_sorted = cum_probs > top_p
    # Shift right so the first token that crosses the threshold is kept.
    drop_sorted[..., 1:] = drop_sorted[..., :-1].clone()
    drop_sorted[..., 0] = 0
    drop = torch.zeros_like(logits, dtype=torch.bool).scatter_(-1, sorted_indices, drop_sorted)
    return logits.masked_fill(drop, torch.finfo(logits.dtype).min)
55
def top_k_logits(logits, top_k=None):
    """Keep only the ``top_k`` largest logits; the rest drop to dtype-min."""
    top_k = min(top_k, logits.size(-1))  # guard against k > vocab size
    kth_value = torch.topk(logits, top_k)[0][..., -1, None]
    return logits.masked_fill(logits < kth_value, torch.finfo(logits.dtype).min)
60
def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confidence=False, neg_entropy=False):
    """Sample token ids from logits and return per-token confidence scores.

    Args:
        logits: token logits (last dim = vocabulary).
        temperature: 0 means greedy argmax; >0 enables Categorical sampling.
        top_p / top_k: optional nucleus / top-k filtering before sampling.
        margin_confidence: score = gap between top-1 and top-2 probabilities
            (indexing assumes 2-D ``probs``).
        neg_entropy: score = negative entropy of the distribution.

    Returns:
        (confidence, x0, initial_confidence) where ``x0`` are the chosen ids
        and ``initial_confidence`` is always the probability of the chosen
        token, regardless of which confidence variant was requested.
    """
    if temperature > 0:
        logits = logits / temperature
    if top_p is not None and top_p < 1:
        logits = top_p_logits(logits, top_p)
    if top_k is not None:
        logits = top_k_logits(logits, top_k)
    probs = torch.softmax(logits, dim=-1)
    if temperature > 0:
        try:
            x0 = dists.Categorical(probs=probs).sample()
            initial_confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
        except Exception:
            # Fall back to greedy if sampling fails (e.g. degenerate probs).
            # Was a bare `except:`, which also swallowed KeyboardInterrupt.
            initial_confidence, x0 = probs.max(dim=-1)
    else:
        initial_confidence, x0 = probs.max(dim=-1)
    confidence = initial_confidence.clone()
    if margin_confidence:
        sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
        confidence = sorted_probs[:, 0] - sorted_probs[:, 1]
    if neg_entropy:
        epsilon = 1e-10  # avoid log(0)
        confidence = torch.sum(probs * torch.log(probs + epsilon), dim=-1)
    return confidence, x0, initial_confidence
79
+
80
+
81
class DreamLoRAInference:
    """Block-wise diffusion inference engine (LLaDA base + D2F LoRA adapter)
    wired to a Gradio demo UI.

    ``CSS`` below is injected verbatim into ``gr.Blocks(css=...)``; it styles
    the token visualization grid, the per-block status box, scrollbars and the
    two-column layout.  It is runtime data — do not reformat its contents.
    """

    CSS = """
    /* Fixed height, scrollable visualization container */
    #viz-container {
        height: 500px;
        overflow-y: auto !important;
        border: 1px solid #E5E7EB;
        border-radius: 8px;
        padding: 10px;
        position: relative;
    }
    .block-container {
        display: inline-block; border: 2px solid transparent; border-radius: 8px;
        padding: 5px; margin: 4px 0; transition: border-color 0.3s, box-shadow 0.3s;
    }
    .block-updating {
        border-color: #FF4500 !important;
        box-shadow: 0 0 8px rgba(255, 69, 0, 0.7);
    }
    .token { padding: 2px 4px; margin: 2px; border-radius: 4px; display: inline-block; line-height: 1.4; font-family: monospace; }
    .token.prompt { background-color: #E5E7EB; color: #4B5563; }
    .token.gen-0 { background-color: #DBEAFE; color: #1E40AF; } /* Blue */
    .token.gen-1 { background-color: #D1FAE5; color: #065F46; } /* Green */
    .token.gen-2 { background-color: #FEF3C7; color: #92400E; } /* Yellow */
    .token.gen-3 { background-color: #FEE2E2; color: #991B1B; } /* Red */
    .token.gen-4 { background-color: #E0E7FF; color: #3730A3; } /* Indigo */
    .token.gen-5 { background-color: #F3E8FF; color: #6B21A8; } /* Purple */
    .token.mask { background-color: #F3F4F6; color: #9CA3AF; border: 1px dashed #D1D5DB; }

    /* Independent status box styles */
    #status-container {
        height: 300px;
        overflow-y: auto !important;
        margin-top: 10px; padding: 15px; border: 1px solid #E5E7EB; border-radius: 8px; background-color: #F9FAFB;
        position: relative;
    }
    #status-container h4 { margin-top: 0; }
    .status-line { font-family: monospace; font-size: 13px; margin-bottom: 5px; margin-top: 5px; padding: 2px 4px; border-radius: 3px;}
    #stats-output { padding: 15px; border: 1px solid #10B981; border-radius: 8px; background-color: #F0FDF4; margin-top: 10px; }

    /* Scroll anchor */
    .scroll-anchor {
        height: 1px;
        width: 100%;
    }

    /* Force scrollbar styles */
    #viz-container::-webkit-scrollbar, #status-container::-webkit-scrollbar {
        width: 10px !important;
        background-color: #f5f5f5 !important;
    }
    #viz-container::-webkit-scrollbar-thumb, #status-container::-webkit-scrollbar-thumb {
        background-color: #888 !important;
        border-radius: 5px !important;
    }
    #viz-container::-webkit-scrollbar-track, #status-container::-webkit-scrollbar-track {
        background-color: #f5f5f5 !important;
        border-radius: 5px !important;
    }

    /* Column height alignment */
    .left-column, .right-column {
        display: flex;
        flex-direction: column;
        height: auto !important;
        min-height: 800px;
    }

    .live-text-container, .viz-status-container {
        display: flex;
        flex-direction: column;
        flex: 1;
        overflow: visible;
    }

    #live-text-output, #stats-output {
        margin-bottom: 20px;
    }

    /* Fix for bottom content being cut off */
    .container {
        padding-bottom: 40px;
    }

    /* Make sure content is fully visible */
    .gradio-container {
        overflow-y: visible !important;
    }

    /* Add padding to bottom of page */
    .footer {
        margin-top: 30px;
        padding-bottom: 30px;
    }
    """
176
+
177
+ def __init__(self, **kwargs):
178
+ print("Initializing DreamLoRAInference...")
179
+ self.device = torch.device(kwargs.get("device", "cuda") if torch.cuda.is_available() else "cpu")
180
+ self.__dict__.update(kwargs)
181
+ if self.dtype == "bfloat16" and torch.cuda.is_bf16_supported(): self.target_dtype = torch.bfloat16
182
+ elif self.dtype == "float16": self.target_dtype = torch.float16
183
+ else: self.target_dtype = torch.float32
184
+ self._setup_model(self.pretrained_path, self.lora_path)
185
+ print("Model and tokenizer setup complete.")
186
+
187
+ def _setup_model(self, pretrained_path, lora_path):
188
+ config = LLaDAConfig.from_pretrained(pretrained_path)
189
+ self.model = LLaDAModelLM.from_pretrained(pretrained_path, config=config, torch_dtype=self.target_dtype).eval()
190
+ self.model = PeftModel.from_pretrained(self.model, lora_path)
191
+ self.model = self.model.to(self.device)
192
+ self.tokenizer = AutoTokenizer.from_pretrained(pretrained_path)
193
+ if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token
194
+
195
+ def _apply_chat_template(self, prompt):
196
+ chat_history = [{"role": "user", "content": prompt}]
197
+ return self.tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
198
+
199
+ def _update_block_completion_states(self, block_states, decoded_token_threshold):
200
+ for block_id in sorted(block_states.keys()):
201
+ decoded_tokens = block_states[block_id]['total_masks'] - block_states[block_id]['mask_count']
202
+ if block_states[block_id]['total_masks'] > 0:
203
+ decode_ratio = decoded_tokens / block_states[block_id]['total_masks']
204
+ if decode_ratio >= decoded_token_threshold:
205
+ if (next_block_id := block_id + 1) in block_states:
206
+ block_states[next_block_id]['is_complete'] = True
207
+
208
    # Render visualization part (excluding prompt status info)
    def _render_visualization_html(self, step: int, x_t: torch.Tensor, block_states: Dict, cache_length: int, updated_block_ids: Set[int]) -> str:
        """Render the generated blocks of ``x_t`` as an HTML token grid.

        Masked positions render as '░'; decoded tokens are colour-coded by
        block (gen-0..gen-5, cycling).  Blocks listed in ``updated_block_ids``
        get a highlight border.  The embedded <script> keeps #viz-container
        scrolled to the bottom as new frames arrive.
        """
        # Millisecond timestamp makes element ids unique for every frame.
        timestamp = int(time.time() * 1000)

        html_parts = []
        for block_id in sorted(k for k in block_states.keys() if k > 0): # Only render generated part (block_id > 0)
            state = block_states[block_id]
            container_classes = ["block-container"]
            if block_id in updated_block_ids: container_classes.append("block-updating")
            html_parts.append(f'<div class="{" ".join(container_classes)}" id="block-{block_id}-{timestamp}">')
            block_tokens = x_t[0, state['start_pos']:state['end_pos']]
            for token_id in block_tokens:
                token_id_int = token_id.item()
                token_classes = ["token"]
                if token_id_int == self.mask_token_id:
                    token_str = '░'; token_classes.append("mask")
                else:
                    token_str = self.tokenizer.decode([token_id_int], skip_special_tokens=False)
                    # Escape HTML-special characters so decoded tokens cannot inject markup.
                    token_str = token_str.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                    token_classes.append(f"gen-{(block_id - 1) % 6}")
                html_parts.append(f'<span class="{" ".join(token_classes)}">{token_str}</span>')
            html_parts.append('</div>')

        # Invisible anchor element the JS scrolls into view.
        html_parts.append(f'<div class="scroll-anchor" id="viz-anchor-{timestamp}"></div>')

        # Doubled braces below are f-string escapes for literal JS braces.
        complete_html = f"""
        <div class="viz-content" id="viz-content-{timestamp}">
            {''.join(html_parts)}
        </div>

        <script>
            function executeVizScroll() {{
                const container = document.getElementById('viz-container');
                const anchor = document.getElementById('viz-anchor-{timestamp}');
                if (container && anchor) {{
                    try {{
                        container.scrollTo(0, container.scrollHeight);
                        container.scrollTop = container.scrollHeight;
                        anchor.scrollIntoView({{behavior: 'auto', block: 'end'}});
                    }} catch (e) {{
                        console.error('Scroll error:', e);
                    }}
                }}
            }}

            setTimeout(executeVizScroll, 10);
            setTimeout(executeVizScroll, 50);
            setTimeout(executeVizScroll, 150);
            setTimeout(executeVizScroll, 300);

            try {{
                const vizContent = document.getElementById('viz-content-{timestamp}');
                const vizContainer = document.getElementById('viz-container');

                if (vizContent && vizContainer) {{
                    const resizeObserver = new ResizeObserver(() => {{
                        executeVizScroll();
                    }});
                    resizeObserver.observe(vizContent);

                    const mutationObserver = new MutationObserver(() => {{
                        executeVizScroll();
                    }});
                    mutationObserver.observe(vizContainer, {{
                        childList: true,
                        subtree: true,
                        characterData: true
                    }});
                }}
            }} catch (e) {{
                console.error('Observer error:', e);
            }}
        </script>
        """

        return complete_html
284
+
285
    # Render status box part (only shows generation block information)
    def _render_status_html(self, step: int, block_states: Dict, cache_length: int) -> str:
        """Render a per-block status summary (position, state, fill progress)
        as HTML for the #status-container panel, with auto-scroll JS."""
        # Millisecond timestamp makes element ids unique for every frame.
        timestamp = int(time.time() * 1000)

        html_parts = []
        html_parts.append(f'<h4>Generation Block Status (Step: {step}, Cache Length: {cache_length})</h4>')
        for block_id in [k for k in sorted(block_states.keys()) if k > 0]:
            state = block_states[block_id]
            block_type = f"Block {block_id}"
            # Number of positions already decoded in this block.
            masks_filled = state['total_masks'] - state['mask_count']
            color_class = f"gen-{(block_id - 1) % 6}"
            status_line = f'<b>{block_type.ljust(8)}</b>: Pos=[{str(state["start_pos"]).rjust(4)}:{str(state["end_pos"]).ljust(4)}] | State=\'{state["state"].ljust(8)}\' | Filled={str(masks_filled).rjust(2)}/{state["total_masks"]}'
            html_parts.append(f'<p class="status-line token {color_class}" id="status-line-{block_id}-{timestamp}">{status_line}</p>')

        # Invisible anchor element the JS scrolls into view.
        html_parts.append(f'<div class="scroll-anchor" id="status-anchor-{timestamp}"></div>')

        # Doubled braces below are f-string escapes for literal JS braces.
        complete_html = f"""
        <div class="status-content" id="status-content-{timestamp}">
            {''.join(html_parts)}
        </div>

        <script>
            function executeStatusScroll() {{
                const container = document.getElementById('status-container');
                const anchor = document.getElementById('status-anchor-{timestamp}');
                if (container && anchor) {{
                    try {{
                        container.scrollTo(0, container.scrollHeight);
                        container.scrollTop = container.scrollHeight;
                        anchor.scrollIntoView({{behavior: 'auto', block: 'end'}});
                    }} catch (e) {{
                        console.error('Status scroll error:', e);
                    }}
                }}
            }}

            setTimeout(executeStatusScroll, 10);
            setTimeout(executeStatusScroll, 50);
            setTimeout(executeStatusScroll, 150);
            setTimeout(executeStatusScroll, 300);

            try {{
                const statusContent = document.getElementById('status-content-{timestamp}');
                const statusContainer = document.getElementById('status-container');

                if (statusContent && statusContainer) {{
                    const resizeObserver = new ResizeObserver(() => {{
                        executeStatusScroll();
                    }});
                    resizeObserver.observe(statusContent);

                    const mutationObserver = new MutationObserver(() => {{
                        executeStatusScroll();
                    }});
                    mutationObserver.observe(statusContainer, {{
                        childList: true,
                        subtree: true,
                        characterData: true
                    }});
                }}
            }} catch (e) {{
                console.error('Status observer error:', e);
            }}
        </script>
        """

        return complete_html
352
+
353
+ @torch.inference_mode()
354
+ def stream_and_capture_for_gradio(
355
+ self,
356
+ prompt_text: str,
357
+ max_new_tokens: int,
358
+ block_size: int,
359
+ block_add_threshold: float,
360
+ decoded_token_threshold: float,
361
+ skip_threshold: float
362
+ ) -> Iterator[Tuple[str, List[Tuple[str, str]], str, str, str]]:
363
+
364
+ start_time = time.time()
365
+ captured_frames: List[Tuple[str, str]] = []
366
+
367
+ # Initialization
368
+ ipdb.set_trace()
369
+ input_ids = self.tokenizer(self._apply_chat_template(prompt_text), return_tensors="pt").input_ids.to(self.device)
370
+ prompt_length = input_ids.shape[1]
371
+
372
+ full_attention_mask = create_full_block_attention_mask(prompt_length, self.max_length, block_size, self.device, self.target_dtype)
373
+ x_t = input_ids
374
+ block_states = {0: {'start_pos': 0, 'end_pos': prompt_length, 'mask_count': 0, 'total_masks': prompt_length, 'state': 'to_cache', 'is_complete': True}}
375
+ past_key_values, current_blocks, step, eos_detected, cache_length = None, 0, 0, False, 0
376
+
377
+ # Capture initial state
378
+ initial_viz_html = self._render_visualization_html(0, x_t, block_states, 0, set())
379
+ initial_status_html = self._render_status_html(0, block_states, 0)
380
+ captured_frames.append((initial_viz_html, initial_status_html))
381
+
382
+ yield "", captured_frames, "Initializing generation process...", "Initializing visualization...", "Initializing block status..."
383
+
384
+ # Main generation loop
385
+ while True:
386
+ step += 1
387
+ updated_block_ids: Set[int] = set()
388
+
389
+ if len(block_states) - 1 < (max_new_tokens // block_size) and not eos_detected:
390
+ last_block_id = max(block_states.keys())
391
+ progress = (block_states[last_block_id]['total_masks'] - block_states[last_block_id]['mask_count']) / block_states[last_block_id]['total_masks'] if block_states[last_block_id]['total_masks'] > 0 else 1.0
392
+ if progress >= block_add_threshold:
393
+ new_block_id = last_block_id + 1; new_start_pos = x_t.shape[1]
394
+ if new_start_pos + block_size <= self.max_length:
395
+ x_t = torch.cat([x_t, torch.full((1, block_size), self.mask_token_id, device=self.device, dtype=torch.long)], dim=1)
396
+ block_states[new_block_id] = {'start_pos': new_start_pos, 'end_pos': new_start_pos + block_size, 'mask_count': block_size, 'total_masks': block_size, 'state': 'active', 'is_complete': False}
397
+ current_blocks += 1
398
+
399
+ self._update_block_completion_states(block_states, decoded_token_threshold)
400
+ if (x_t == self.mask_token_id).sum() == 0 and current_blocks == 0: break
401
+
402
+
403
+
404
+ #### D2F-BLOCK ####
405
+ blocks_to_cache = [bid for bid, state in block_states.items() if state['state'] == 'to_cache']
406
+ update_kvcache = 0
407
+ if blocks_to_cache:
408
+ start_pos, end_pos = block_states[min(blocks_to_cache)]['start_pos'], block_states[max(blocks_to_cache)]['end_pos']
409
+ update_kvcache = end_pos - start_pos; input_seq, process_start_pos = x_t[:, start_pos:], start_pos
410
+ else:
411
+ active_blocks = [bid for bid, state in block_states.items() if state['state'] == 'active' and state['start_pos'] >= cache_length]
412
+ if not active_blocks: break
413
+ start_pos = min(block_states[bid]['start_pos'] for bid in active_blocks); input_seq, process_start_pos = x_t[:, start_pos:], start_pos
414
+
415
+ if input_seq.shape[1] == 0: break
416
+
417
+ attention_mask = extract_attention_mask(full_attention_mask, process_start_pos, input_seq.shape[1], cache_length)
418
+ outputs = self.model(input_seq, attention_bias=attention_mask, past_key_values=past_key_values, use_cache=True, update_kvcache=update_kvcache + cache_length)
419
+ if update_kvcache > 0:
420
+ past_key_values = outputs.past_key_values
421
+ for bid in blocks_to_cache: block_states[bid]['state'] = 'in_cache'
422
+
423
+ blocks_to_deactivate = []
424
+ for block_id, state in block_states.items():
425
+ if state['state'] != 'active': continue
426
+ block_mask_locs = (x_t[0, state['start_pos']:state['end_pos']] == self.mask_token_id).nonzero().squeeze(-1)
427
+ if block_mask_locs.numel() == 0:
428
+ blocks_to_deactivate.append(block_id); continue
429
+ logit_offset = state['start_pos'] - process_start_pos
430
+ block_mask_logits = outputs.logits[:, logit_offset + block_mask_locs, :]
431
+ _, x0, initial_confidence = sample_tokens(block_mask_logits.squeeze(0), self.temperature, self.top_p, self.top_k)
432
+ all_indices = (initial_confidence > skip_threshold).nonzero().squeeze(-1)
433
+ if state['is_complete'] and all_indices.numel() == 0 and block_mask_logits.numel() > 0:
434
+ all_indices = torch.tensor([torch.argmax(initial_confidence)], device=self.device)
435
+
436
+ if all_indices.numel() > 0:
437
+ updated_block_ids.add(block_id)
438
+ positions_to_update = state['start_pos'] + block_mask_locs[all_indices]
439
+ x_t[0, positions_to_update] = x0[all_indices]; state['mask_count'] -= all_indices.numel()
440
+ if self.tokenizer.eos_token_id in x0[all_indices]: eos_detected = True
441
+ if state['mask_count'] == 0: blocks_to_deactivate.append(block_id)
442
+
443
+ for bid in blocks_to_deactivate:
444
+ if block_states[bid]['state'] == 'active' and all(block_states.get(i, {}).get('state') != 'active' for i in range(bid)):
445
+ block_states[bid]['state'] = 'to_cache'; current_blocks -= 1
446
+ if update_kvcache > 0: cache_length += update_kvcache
447
+
448
+ #### FlexMDM Cache Update ####
449
+
450
+
451
+
452
+
453
+
454
+ # Capture current step's visualization and status frames
455
+ generated_ids = x_t[0, prompt_length:]
456
+ valid_ids = generated_ids[generated_ids != self.mask_token_id]
457
+ live_text = self.tokenizer.decode(valid_ids, skip_special_tokens=True)
458
+
459
+ current_viz_html = self._render_visualization_html(step, x_t, block_states, cache_length, updated_block_ids)
460
+ current_status_html = self._render_status_html(step, block_states, cache_length)
461
+ captured_frames.append((current_viz_html, current_status_html))
462
+
463
+ yield live_text, captured_frames, "Generating...", "Generating...", "Generating..."
464
+
465
+
466
+
467
+ # Final output
468
+ total_time = time.time() - start_time
469
+ final_generated_ids = x_t[0, prompt_length:]
470
+ eos_positions = (final_generated_ids == self.tokenizer.eos_token_id).nonzero()
471
+ if eos_positions.numel() > 0:
472
+ final_generated_ids = final_generated_ids[:eos_positions[0, 0] + 1]
473
+
474
+ final_text = self.tokenizer.decode(final_generated_ids, skip_special_tokens=True)
475
+ final_viz_html = self._render_visualization_html(step, x_t, block_states, cache_length, set())
476
+ final_status_html = self._render_status_html(step, block_states, cache_length)
477
+ captured_frames.append((final_viz_html, final_status_html))
478
+
479
+ tokens_incl_eos = len(final_generated_ids)
480
+ tokens_excl_eos = len(final_generated_ids[final_generated_ids != self.tokenizer.eos_token_id])
481
+ stats_text = f"""
482
+ ### ✅ Generation Complete!
483
+ ---
484
+ - **Total time:** `{total_time:.2f} seconds`
485
+ - **Tokens generated (incl. EOS):** `{tokens_incl_eos}`
486
+ - **Tokens generated (excl. EOS):** `{tokens_excl_eos}`
487
+ - **Tokens per second:** `{(tokens_incl_eos / total_time):.2f}`
488
+ """
489
+
490
+ yield final_text, captured_frames, stats_text, "Generation complete, playback starting soon", "Generation complete, playback starting soon"
491
+
492
+
493
# --- Gradio UI and Event Handlers ---
if __name__ == "__main__":
    # NOTE(review): this runs after `import torch`; CUDA_VISIBLE_DEVICES only
    # takes effect if CUDA has not been initialized yet — confirm ordering.
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    config = {
        "pretrained_path": "GSAI-ML/LLaDA-8B-Instruct",
        "lora_path": "SJTU-Deng-Lab/D2F_LLaDA_Instruct_8B_Lora",
        "device": "cuda", "dtype": "bfloat16", "max_length": 4096,
        "temperature": 0.0, "top_p": None, "top_k": None, "mask_token_id": 126336,
        "sampling_strategy": "default",
    }
    set_seed(42)
    inference_engine = DreamLoRAInference(**config)

    # Gradio helper: replay the captured (viz, status) HTML frame pairs with a
    # fixed delay between frames.
    def animate_visualization(html_frames_list: List[Tuple[str, str]], delay: float) -> Iterator[Tuple[str, str]]:
        if not html_frames_list:
            yield "No visualization data captured", "No status data captured"
            return
        for viz_frame, status_frame in html_frames_list:
            yield viz_frame, status_frame
            time.sleep(delay)

    # Global auto-scroll JS injected once into the page.
    auto_scroll_js = """
    <script>
    function globalForceScroll() {
        // Scroll visualization container
        var vizContainer = document.getElementById('viz-container');
        if (vizContainer) {
            vizContainer.scrollTop = vizContainer.scrollHeight;
        }

        // Scroll status container
        var statusContainer = document.getElementById('status-container');
        if (statusContainer) {
            statusContainer.scrollTop = statusContainer.scrollHeight;
        }

        // Scroll all anchors
        var anchors = document.querySelectorAll('.scroll-anchor');
        anchors.forEach(function(anchor) {
            try {
                anchor.scrollIntoView({behavior: 'auto', block: 'end'});
            } catch(e) {}
        });
    }

    // Periodic scrolling
    setInterval(globalForceScroll, 200);

    document.addEventListener('DOMContentLoaded', function() {
        // Monitor content changes
        var observer = new MutationObserver(function(mutations) {
            globalForceScroll();
        });

        observer.observe(document.body, {
            childList: true,
            subtree: true,
            characterData: true
        });

        // Initial scrolling
        setTimeout(globalForceScroll, 100);
        setTimeout(globalForceScroll, 500);
        setTimeout(globalForceScroll, 1000);
    });
    </script>
    """

    with gr.Blocks(css=DreamLoRAInference.CSS, theme=gr.themes.Soft()) as demo:
        html_frames_state = gr.State([])

        gr.Markdown("# ✨ D2F-LLaDA: Real-time Text vs. Slow-motion Visualization")
        gr.Markdown("Left side shows real-time streaming output. Right side plays back the decoding process visualization after generation completes.")

        # Inject global auto-scroll JS
        gr.HTML(auto_scroll_js)

        with gr.Row():
            # --- Left Column ---
            with gr.Column(scale=2, elem_classes=["left-column"]):
                prompt_input = gr.Textbox(label="Enter your question", placeholder="Example: Natalia sold clips to...", lines=5)
                generate_button = gr.Button("🚀 Generate & Visualize", variant="primary")
                with gr.Group(elem_classes=["live-text-container"]):
                    live_text_output = gr.Textbox(label="Real-time Generation Output", interactive=False, lines=25, elem_id="live-text-output")
                    stats_output = gr.Markdown(label="Generation Statistics", elem_id="stats-output")

            # --- Right Column ---
            with gr.Column(scale=3, elem_classes=["right-column"]):
                with gr.Accordion("⚙️ Parameter Settings", open=True):
                    with gr.Row():
                        max_new_tokens_slider = gr.Slider(minimum=64, maximum=2048, value=1024, step=64, label="Max Tokens to Generate")
                        block_size_slider = gr.Slider(minimum=16, maximum=128, value=32, step=16, label="Block Size")
                    with gr.Row():
                        block_add_thresh_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.05, label="Block Add Threshold")
                        decoded_token_thresh_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.05, label="Decoding Completion Threshold")
                        skip_thresh_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.01, label="Skip Threshold")
                    delay_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.05, label="Playback Delay (seconds)", info="Adjust visualization playback speed.")

                with gr.Group(elem_classes=["viz-status-container"]):
                    visualization_output = gr.HTML(label="Generation Process Visualization", elem_id="viz-container")
                    status_output_html = gr.HTML(label="Generation Block Status", elem_id="status-container")

        gr.Examples(
            examples=[
                ["Solve the equation x² - 6x + 8 = 0. First, explain what a quadratic equation is and why it can have up to two solutions. Then solve this equation using three different methods: factoring, completing the square, and the quadratic formula. For each method, explain the mathematical reasoning behind it, show all steps in detail, and discuss when this particular method is most useful. Finally, verify your solutions by substituting them back into the original equation.", 1024, 32, 0.1, 0.55, 0.9, 0.1],

                ["A circular swimming pool has a diameter of 8 meters. Calculate the pool's circumference and area. First, explain the relationship between diameter, radius, circumference, and area of a circle, including the role of π in these formulas. Then perform the calculations using π ≈ 3.14159. Next, estimate how much water (in cubic meters) would be needed to fill this pool if it has a uniform depth of 1.5 meters. Finally, calculate how much it would cost to fill this pool if water costs $2.50 per cubic meter. Show all steps and include appropriate units in your answer.", 1024, 32, 0.1, 0.5, 0.9, 0.1],

                ["A movie theater offers a loyalty card that costs $15 and gives a 15% discount on all tickets. If a regular movie ticket costs $10, how many tickets would you need to buy to make the loyalty card worthwhile? First, explain the concept of a break-even point. Then set up an equation to find when the total cost with the card equals the total cost without the card. Solve this equation step by step, showing all your work. Finally, interpret your answer in the context of the problem.", 1024, 32, 0.1, 0.5, 0.9, 0.1],
            ],
            inputs=[
                prompt_input, max_new_tokens_slider, block_size_slider, block_add_thresh_slider,
                decoded_token_thresh_slider, skip_thresh_slider, delay_slider
            ],
            label="Examples (Math Problems)"
        )

        # --- Event Handling Chain ---
        inputs_list = [
            prompt_input, max_new_tokens_slider, block_size_slider,
            block_add_thresh_slider, decoded_token_thresh_slider, skip_thresh_slider
        ]
        # (Removed a leftover `ipdb.set_trace()` breakpoint here — it blocked
        # the process before the UI could even be constructed.)
        generation_event = generate_button.click(
            fn=inference_engine.stream_and_capture_for_gradio,
            inputs=inputs_list,
            outputs=[live_text_output, html_frames_state, stats_output, visualization_output, status_output_html]
        )

        # After generation finishes, replay the captured frames as an animation.
        generation_event.then(
            fn=animate_visualization,
            inputs=[html_frames_state, delay_slider],
            outputs=[visualization_output, status_output_html]
        )

    demo.queue().launch(share=True)
model_cache/dream/configuration_dream.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # coding=utf-8
3
+ # Copyright 2024 The Dream team, HKUNLP Group and the HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """Dream model configuration"""
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.modeling_rope_utils import rope_config_validation
20
+ from transformers.utils import logging
21
+
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+
26
class DreamConfig(PretrainedConfig):
    """Configuration class for the Dream diffusion language model."""

    model_type = "Dream"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=False,  # cache not used in diffusion
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
        mask_token_id=151666,
        pad_token_id=151643,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        # The window size is only meaningful when the feature is enabled.
        self.sliding_window = sliding_window if use_sliding_window else None
        self.max_window_layers = max_window_layers

        # Backward compatibility: older configs may omit the KV-head count,
        # in which case it defaults to the attention-head count (no GQA).
        self.num_key_value_heads = (
            num_attention_heads if num_key_value_heads is None else num_key_value_heads
        )
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_dropout = attention_dropout
        # Validate the rotary-embedding parameters.  BC: configs that still
        # carry a 'type' field get it mirrored into 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        # Set after super().__init__ so these always win over kwargs defaults.
        self.mask_token_id = mask_token_id
        self.pad_token_id = pad_token_id
88
+
model_cache/dream/generation_utils.py ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Dream team, HKUNLP Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import warnings
17
+ import copy
18
+ from dataclasses import dataclass
19
+ from typing import Any, Dict, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.distributions as dists
23
+ from torch.nn import functional as F
24
+ from transformers import __version__
25
+ from transformers.generation.configuration_utils import (
26
+ GenerationConfig
27
+ )
28
+ from transformers.utils import (
29
+ ModelOutput,
30
+ is_torchdynamo_compiling,
31
+ logging,
32
+ )
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
+
37
def top_p_logits(logits, top_p=None):
    """Nucleus (top-p) filtering: mask every logit outside the smallest set of
    tokens whose cumulative probability exceeds `top_p`.

    Masked positions are set to the dtype's minimum so a later softmax gives
    them (effectively) zero probability. Returns a new tensor; `logits` is not
    modified in place.
    """
    sorted_vals, sorted_idx = torch.sort(logits, descending=True)
    cum_probs = F.softmax(sorted_vals, dim=-1).cumsum(dim=-1)
    drop_sorted = cum_probs > top_p
    # Shift right so the token that first crosses the threshold is kept.
    drop_sorted[..., 1:] = drop_sorted[..., :-1].clone()
    drop_sorted[..., 0] = False
    # Scatter the "drop" flags back from sorted order to vocabulary order.
    drop = torch.zeros_like(logits, dtype=torch.bool, device=logits.device)
    drop = drop.scatter_(-1, sorted_idx, drop_sorted)
    return logits.masked_fill(drop, torch.finfo(logits.dtype).min)
49
+
50
def top_k_logits(logits, top_k=None):
    """Keep only the `top_k` largest logits along the last dim; mask the rest
    to the dtype's minimum. `top_k` is clamped to the vocabulary size."""
    k = min(top_k, logits.size(-1))  # safety: never request more than vocab
    kth_value = torch.topk(logits, k)[0][..., -1, None]
    below_kth = logits < kth_value
    return logits.masked_fill(below_kth, torch.finfo(logits.dtype).min)
56
+
57
+
58
def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confidence=False, neg_entropy=False):
    """Sample token ids from `logits` and return a per-position confidence.

    Args:
        logits: unnormalized scores over the vocabulary, shape (..., vocab).
        temperature: > 0 enables stochastic (categorical) sampling after
            dividing the logits by the temperature; 0 means greedy argmax.
        top_p: nucleus-filtering threshold, applied when < 1.
        top_k: top-k filtering, applied when not None.
        margin_confidence: if True, confidence is p(top1) - p(top2)
            instead of the sampled token's probability.
        neg_entropy: if True, confidence is the negative entropy of the
            distribution (always <= 0; higher means more peaked).

    Returns:
        (confidence, x0): confidence scores and sampled token ids, both with
        the leading shape of `logits` minus the vocab dim.
    """
    if temperature > 0:
        logits = logits / temperature
    if top_p is not None and top_p < 1:
        logits = top_p_logits(logits, top_p)
    if top_k is not None:
        logits = top_k_logits(logits, top_k)
    probs = torch.softmax(logits, dim=-1)

    if temperature > 0:
        try:
            x0 = dists.Categorical(probs=probs).sample()
            confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
        except Exception:
            # Fix: was a bare `except:` which also swallowed KeyboardInterrupt/
            # SystemExit. Keep the greedy fallback (used when the filtered
            # distribution is degenerate, e.g. contains NaN/all-zero rows).
            confidence, x0 = probs.max(dim=-1)
    else:
        confidence, x0 = probs.max(dim=-1)

    if margin_confidence:
        sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
        # Confidence as the gap between the two most likely tokens.
        # Generalized from `[:, 0]` to `[..., 0]` so inputs need not be 2-D
        # (identical result for the original 2-D case).
        top1_probs = sorted_probs[..., 0]
        top2_probs = sorted_probs[..., 1]
        confidence = top1_probs - top2_probs

    if neg_entropy:
        epsilon = 1e-10  # guards log(0)
        log_probs = torch.log(probs + epsilon)
        confidence = torch.sum(probs * log_probs, dim=-1)

    return confidence, x0
91
+
92
+
93
@dataclass
class DreamModelOutput(ModelOutput):
    """Return container for `diffusion_generate` when `return_dict_in_generate`
    is set.

    - sequences: final denoised token ids, shape (batch, max_length).
    - history: per-step snapshots of the sequence, only populated when
      `output_history` is enabled in the generation config.
    """
    sequences: torch.LongTensor = None
    history: Optional[Tuple[torch.FloatTensor]] = None
97
+
98
+
99
class DreamGenerationConfig(GenerationConfig):
    """Generation configuration for Dream's diffusion decoding.

    Subclasses `transformers.GenerationConfig` so it interoperates with the HF
    save/load machinery, but re-declares only the knobs the diffusion sampler
    uses and adds diffusion-specific parameters (`steps`, `eps`, `alg`,
    `alg_temp`) plus a `mask_token_id` special token.

    NOTE(review): the parent `__init__` is deliberately not called; attributes
    are set directly and `validate` is overridden as a no-op below.
    """

    def __init__(self, **kwargs):
        # Sampling parameters shared with standard HF generation.
        self.temperature: float = kwargs.pop("temperature", 0.0)
        self.top_p: Optional[float] = kwargs.pop("top_p", None)
        self.top_k: Optional[int] = kwargs.pop("top_k", None)
        self.max_length = kwargs.pop("max_length", 20)
        self.max_new_tokens = kwargs.pop("max_new_tokens", None)
        # diffusion specific params
        self.eps: float = kwargs.pop("eps", 1e-3)  # final timestep of the linspace schedule
        self.steps: int = kwargs.pop("steps", 512)  # number of denoising iterations
        self.alg: str = kwargs.pop("alg", 'origin')  # 'origin' | 'maskgit_plus' | 'topk_margin' | 'entropy'
        self.alg_temp: Optional[float] = kwargs.pop("alg_temp", None)  # softmax temperature for confidence-based unmask order

        # Parameters that define the output variables of `generate`
        self.num_return_sequences: int = kwargs.pop("num_return_sequences", 1)
        self.return_dict_in_generate: bool = kwargs.pop("return_dict_in_generate", False)
        self.output_history: bool = kwargs.pop("output_history", False)

        # Special tokens that can be used at generation time
        self.mask_token_id = kwargs.pop("mask_token_id", None)
        self.pad_token_id = kwargs.pop("pad_token_id", None)
        self.bos_token_id = kwargs.pop("bos_token_id", None)
        self.eos_token_id = kwargs.pop("eos_token_id", None)

        # Wild card
        self.generation_kwargs = kwargs.pop("generation_kwargs", {})

        # The remaining attributes do not parametrize `.generate()`, but are informative and/or used by the hub
        # interface.
        self._from_model_config = kwargs.pop("_from_model_config", False)
        self._commit_hash = kwargs.pop("_commit_hash", None)
        self.transformers_version = kwargs.pop("transformers_version", __version__)

        # Additional attributes without default values
        if not self._from_model_config:
            # we don't want to copy values from the model config if we're initializing a `GenerationConfig` from a
            # model's default configuration file
            for key, value in kwargs.items():
                try:
                    setattr(self, key, value)
                except AttributeError as err:
                    logger.error(f"Can't set {key} with value {value} for {self}")
                    raise err

        # Validate the values of the attributes
        self.validate(is_init=True)

    def validate(self, is_init=False):
        # NOTE(review): intentionally a no-op — presumably the parent class's
        # validation rejects diffusion-specific settings (e.g. temperature=0.0
        # with sampling knobs set); confirm before re-enabling it.
        pass
148
+
149
class DreamGenerationMixin:
    """Mixin adding diffusion-based decoding (`diffusion_generate`) to a Dream model.

    The hosting model must be callable as `self(x, attention_mask, tok_idx)`
    returning an object with `.logits`, and must expose `self.config`,
    `self.generation_config` and `self.device` (standard `PreTrainedModel`
    attributes).
    """

    @staticmethod
    def _expand_inputs_for_generation(
        expand_size: int = 1,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None
    ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
        """Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]"""
        # Do not call torch.repeat_interleave if expand_size is 1 because it clones
        # the input tensor and thus requires more memory although no change is applied
        if expand_size == 1:
            return input_ids, attention_mask
        if input_ids is not None:
            input_ids = input_ids.repeat_interleave(expand_size, dim=0)
        if attention_mask is not None:
            attention_mask = attention_mask.repeat_interleave(expand_size, dim=0)
        return input_ids, attention_mask

    def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
        """Performs validation related to the resulting generated length"""

        # Can't throw warnings/exceptions during compilation
        if is_torchdynamo_compiling():
            return

        # 1. Max length warnings related to poor parameterization
        if has_default_max_length and generation_config.max_new_tokens is None and generation_config.max_length == 20:
            # 20 is the default max_length of the generation config
            warnings.warn(
                f"Using the model-agnostic default `max_length` (={generation_config.max_length}) to control the "
                "generation length. We recommend setting `max_new_tokens` to control the maximum length of the "
                "generation.",
                UserWarning,
            )
        if input_ids_length >= generation_config.max_length:
            input_ids_string = "input_ids"
            raise ValueError(
                f"Input length of {input_ids_string} is {input_ids_length}, but `max_length` is set to"
                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
                " increasing `max_length` or, better yet, setting `max_new_tokens`."
            )

    def _prepare_generated_length(
        self,
        generation_config,
        has_default_max_length,
        input_ids_length,
    ):
        """Prepared max and min length in generation configs to avoid clashes between similar attributes"""

        if generation_config.max_new_tokens is not None:
            if not has_default_max_length and generation_config.max_length is not None:
                logger.warning(
                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
                    "Please refer to the documentation for more information. "
                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
                )
            generation_config.max_length = generation_config.max_new_tokens + input_ids_length

        elif has_default_max_length:
            if generation_config.max_length == DreamGenerationConfig().max_length:
                # Default max_length counts *new* tokens, so offset by the prompt
                # length, then clamp to the model's positional capacity.
                generation_config.max_length = generation_config.max_length + input_ids_length
                max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
                if max_position_embeddings is not None:
                    generation_config.max_length = min(generation_config.max_length, max_position_embeddings)

        return generation_config

    def _prepare_generation_config(
        self, generation_config: Optional[DreamGenerationConfig], **kwargs: Dict
    ) -> DreamGenerationConfig:
        """
        Prepares the base generation config, then applies any generation configuration options from kwargs. This
        function handles retrocompatibility with respect to configuration files.
        """
        # priority: `generation_config` argument > `model.generation_config` (the default generation config)
        using_model_generation_config = False
        if generation_config is None:
            generation_config = DreamGenerationConfig.from_model_config(self.config)
            using_model_generation_config = True

        # `torch.compile` can't compile `copy.deepcopy`, arguments in `kwargs` that are part of `generation_config`
        # will mutate the object with `.update`. As such, passing these arguments through `kwargs` is disabled -- an
        # exception will be raised in `_validate_model_kwargs`
        if not is_torchdynamo_compiling():
            generation_config = copy.deepcopy(generation_config)
            _kwargs = generation_config.update(**kwargs)
            # If `generation_config` is provided, let's fallback ALL special tokens to the default values for the model
            if not using_model_generation_config:
                if generation_config.bos_token_id is None:
                    generation_config.bos_token_id = self.generation_config.bos_token_id
                if generation_config.eos_token_id is None:
                    generation_config.eos_token_id = self.generation_config.eos_token_id
                if generation_config.pad_token_id is None:
                    generation_config.pad_token_id = self.generation_config.pad_token_id
                if generation_config.mask_token_id is None:
                    generation_config.mask_token_id = self.generation_config.mask_token_id

        return generation_config

    def _prepare_special_tokens(
        self,
        generation_config: DreamGenerationConfig,
        device: Optional[Union[torch.device, str]] = None,
    ):
        """
        Prepares the special tokens for generation, overwriting the generation config with their processed versions
        converted to tensor.
        Note that `generation_config` is changed in place and stops being serializable after this method is called.
        That is no problem if called within `generate` (`generation_config` is a local copy that doesn't leave the
        function). However, if called outside `generate`, consider creating a copy of `generation_config` first.
        """

        # Convert special tokens to tensors
        def _tensor_or_none(token, device=None):
            if token is None:
                return token

            device = device if device is not None else self.device
            if isinstance(token, torch.Tensor):
                return token.to(device)
            return torch.tensor(token, device=device, dtype=torch.long)

        bos_token_tensor = _tensor_or_none(generation_config.bos_token_id, device=device)
        eos_token_tensor = _tensor_or_none(generation_config.eos_token_id, device=device)
        pad_token_tensor = _tensor_or_none(generation_config.pad_token_id, device=device)
        mask_token_tensor = _tensor_or_none(generation_config.mask_token_id, device=device)

        # We can have more than one eos token. Always treat it as a 1D tensor (when it exists).
        if eos_token_tensor is not None and eos_token_tensor.ndim == 0:
            eos_token_tensor = eos_token_tensor.unsqueeze(0)

        # Set pad token if unset (and there are conditions to do so)
        if pad_token_tensor is None and eos_token_tensor is not None:
            pad_token_tensor = eos_token_tensor[0]
            logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{pad_token_tensor} for open-end generation.")

        # Update generation config with the updated special tokens tensors
        # NOTE: this must be written into a different attribute name than the one holding the original special tokens
        # (in their non-tensor form), in order to enable end-to-end compilation. See
        # https://pytorch.org/docs/stable/torch.compiler_cudagraph_trees.html#limitations
        generation_config._bos_token_tensor = bos_token_tensor
        generation_config._eos_token_tensor = eos_token_tensor
        generation_config._pad_token_tensor = pad_token_tensor
        generation_config._mask_token_tensor = mask_token_tensor

    @torch.no_grad()
    def diffusion_generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        generation_config: Optional[DreamGenerationConfig] = None,
        **kwargs,
    ) -> Union[DreamModelOutput, torch.LongTensor]:
        """Generate sequences by iterative mask-denoising (diffusion decoding).

        `inputs` is the prompt token ids; the sequence is padded to
        `max_length` with `mask_token_id` and progressively unmasked by
        `_sample`. Optional `generation_tokens_hook_func` /
        `generation_logits_hook_func` kwargs let callers intervene at each
        step.
        """
        # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
        generation_config = self._prepare_generation_config(generation_config, **kwargs)
        generation_tokens_hook_func = kwargs.pop("generation_tokens_hook_func", lambda step, x, logits: x)
        generation_logits_hook_func = kwargs.pop("generation_logits_hook_func", lambda step, x, logits: logits)

        # 2. Define model inputs
        assert inputs is not None
        input_ids = inputs
        device = input_ids.device
        attention_mask = kwargs.pop("attention_mask", None)
        self._prepare_special_tokens(generation_config, device=device)

        # 3. Prepare `max_length`.
        input_ids_length = input_ids.shape[-1]
        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
        generation_config = self._prepare_generated_length(
            generation_config=generation_config,
            has_default_max_length=has_default_max_length,
            input_ids_length=input_ids_length,
        )

        self._validate_generated_length(generation_config, input_ids_length, has_default_max_length)

        # 4. Check input_ids
        if not is_torchdynamo_compiling() and self.device.type != input_ids.device.type:
            warnings.warn(
                "You are calling .generate() with the `input_ids` being on a device type different"
                f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model"
                f" is on {self.device.type}. You may experience unexpected behaviors or slower generation."
                " Please make sure that you have put `input_ids` to the"
                f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before"
                " running `.generate()`.",
                UserWarning,
            )
        if (
            hasattr(generation_config, "pad_token_id") and
            torch.any(input_ids == generation_config.pad_token_id) and
            attention_mask is None
        ):
            warnings.warn(
                "Padding was detected but no attention mask is passed here. For correct "
                "generation results, please set `attention_mask` when batch-padding inputs.",
                UserWarning,
            )

        input_ids, attention_mask = self._expand_inputs_for_generation(
            expand_size=generation_config.num_return_sequences,
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        result = self._sample(
            input_ids,
            attention_mask=attention_mask,
            generation_config=generation_config,
            generation_tokens_hook_func=generation_tokens_hook_func,
            generation_logits_hook_func=generation_logits_hook_func
        )
        return result

    def _sample(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.LongTensor],
        generation_config: DreamGenerationConfig,
        generation_tokens_hook_func,
        generation_logits_hook_func
    ) -> Union[DreamModelOutput, torch.LongTensor]:
        """Core denoising loop: run `steps` iterations, each time scoring the
        still-masked positions and committing a subset of them to sampled
        tokens according to `alg`.
        """
        # init values
        output_history = generation_config.output_history
        return_dict_in_generate = generation_config.return_dict_in_generate
        max_length = generation_config.max_length
        mask_token_id = generation_config.mask_token_id
        steps = generation_config.steps
        eps = generation_config.eps
        alg = generation_config.alg
        alg_temp = generation_config.alg_temp
        temperature = generation_config.temperature
        top_p = generation_config.top_p
        top_k = generation_config.top_k

        histories = [] if (return_dict_in_generate and output_history) else None

        # pad input_ids to max_length
        x = F.pad(input_ids, (0, max_length - input_ids.shape[1]), value=mask_token_id)

        if attention_mask is not None and torch.any(attention_mask == 0.0):
            # we do not mask the [MASK] tokens so value = 1.0
            attention_mask = F.pad(attention_mask, (0, max_length - attention_mask.shape[1]), value=1.0)
            # Per-token position index that skips padding (padding gets a dummy 1).
            tok_idx = attention_mask.long().cumsum(-1) - 1
            tok_idx.masked_fill_(attention_mask == 0, 1)
            # attention_mask is of shape [B, N]
            # broadcast to [B, 1, N, N]
            attention_mask = torch.logical_and(
                attention_mask.unsqueeze(1).unsqueeze(-2),
                attention_mask.unsqueeze(1).unsqueeze(-1),
            )
        else:
            tok_idx = None
            attention_mask = "full"

        # Schedule goes from t=1 down to t=eps over `steps` intervals.
        timesteps = torch.linspace(1, eps, steps + 1, device=x.device)

        # this allows user-defined token control of the intermediate steps
        x = generation_tokens_hook_func(None, x, None)
        for i in range(steps):
            mask_index = (x == mask_token_id)
            logits = self(x, attention_mask, tok_idx).logits
            # Shift logits one position right so index i scores the token at
            # position i. NOTE(review): presumably matches a next-token-style
            # training shift — confirm against the training code.
            logits = torch.cat([logits[:,:1], logits[:, :-1]], dim=1)

            # this allows user-defined logits control of the intermediate steps
            logits = generation_logits_hook_func(i, x, logits)

            mask_logits = logits[mask_index]
            t = timesteps[i]
            s = timesteps[i + 1]

            if alg == 'origin':
                # Each masked position is independently unmasked with
                # probability 1 - s/t (everything on the last step).
                p_transfer = 1 - s / t if i < steps - 1 else 1
                x0 = torch.zeros_like(x[mask_index], device=self.device, dtype=torch.long) + mask_token_id
                transfer_index_t_s = torch.rand(*x0.shape, device=self.device) < p_transfer
                _, x0[transfer_index_t_s]= sample_tokens(mask_logits[transfer_index_t_s], temperature=temperature, top_p=top_p, top_k=top_k)
                x[mask_index] = x0.clone()
            else:
                # Confidence-ordered unmasking: score every masked position,
                # then commit the top-`number_transfer_tokens` of them.
                if alg == 'maskgit_plus':
                    confidence, x0 = sample_tokens(mask_logits, temperature=temperature, top_p=top_p, top_k=top_k)
                elif alg == 'topk_margin':
                    confidence, x0 = sample_tokens(mask_logits, temperature=temperature, top_p=top_p, top_k=top_k, margin_confidence=True)
                elif alg == 'entropy':
                    confidence, x0 = sample_tokens(mask_logits, temperature, top_p=top_p, top_k=top_k, neg_entropy=True)
                else:
                    raise RuntimeError(f"Unknown alg: {alg}")
                # Average masked count per batch row; the schedule decides how
                # many of them to commit this step.
                num_mask_token = mask_index.sum() / mask_index.shape[0]
                number_transfer_tokens = int(num_mask_token * (1 - s / t)) if i < steps - 1 else int(num_mask_token)
                full_confidence = torch.full_like(x, -torch.inf, device=self.device, dtype=logits.dtype)
                full_confidence[mask_index] = confidence
                if number_transfer_tokens > 0:
                    if alg_temp is None or alg_temp == 0:
                        # Deterministic: unmask the highest-confidence positions.
                        _, transfer_index = torch.topk(full_confidence, number_transfer_tokens)
                    else:
                        # Stochastic: sample unmask positions ~ softmax(confidence / alg_temp).
                        full_confidence = full_confidence / alg_temp
                        full_confidence = F.softmax(full_confidence, dim=-1)
                        transfer_index = torch.multinomial(full_confidence, num_samples=number_transfer_tokens)
                    x_ = torch.zeros_like(x, device=self.device, dtype=torch.long) + mask_token_id
                    x_[mask_index] = x0.clone()
                    row_indices = torch.arange(x.size(0), device=self.device).unsqueeze(1).expand_as(transfer_index)
                    x[row_indices,transfer_index] = x_[row_indices,transfer_index]

            # this allows user-defined token control of the intermediate steps
            x = generation_tokens_hook_func(i, x, logits)

            if histories is not None:
                histories.append(x.clone())

        if return_dict_in_generate:
            return DreamModelOutput(
                sequences=x,
                history=histories,
            )
        else:
            return x
model_cache/dream/model_dream.py ADDED
@@ -0,0 +1,1029 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # NOTE(review): removed an accidental paste of Hugging Face model-page chrome
+ # (site navigation, file listing, "36.8 kB" metadata) that had been copied in
+ # as comments. Original file: Dream-org/Dream-v0-Instruct-7B, modeling_dream.py.
56
+ # # coding=utf-8
57
+ # Copyright 2024 The Dream team, HKUNLP Group and the HuggingFace Inc. team. All rights reserved.
58
+ #
59
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
60
+ # and OPT and Qwen implementations in this library. It has been modified from its
61
+ # original forms to accommodate minor architectural differences compared
62
+ # to GPT-NeoX and OPT and Qwen used by the Meta AI and Qwen team that trained the model.
63
+ #
64
+ # Licensed under the Apache License, Version 2.0 (the "License");
65
+ # you may not use this file except in compliance with the License.
66
+ # You may obtain a copy of the License at
67
+ #
68
+ # http://www.apache.org/licenses/LICENSE-2.0
69
+ #
70
+ # Unless required by applicable law or agreed to in writing, software
71
+ # distributed under the License is distributed on an "AS IS" BASIS,
72
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
73
+ # See the License for the specific language governing permissions and
74
+ # limitations under the License.
75
+ """PyTorch Dream model."""
76
+ from transformers import Qwen2Model
77
+ from torch.nn.attention.flex_attention import flex_attention
78
+ import math
79
+ from typing import List, Optional, Tuple, Union
80
+ import os
81
+ import torch
82
+ import torch.utils.checkpoint
83
+ from torch import nn
84
+
85
+ from transformers.activations import ACT2FN
86
+ from transformers.cache_utils import Cache, DynamicCache
87
+ from transformers.modeling_outputs import (
88
+ BaseModelOutput,
89
+ MaskedLMOutput,
90
+ BaseModelOutputWithPast,
91
+ CausalLMOutputWithPast
92
+ )
93
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
94
+ from transformers.modeling_utils import PreTrainedModel
95
+ from transformers.utils import (
96
+ add_start_docstrings,
97
+ add_start_docstrings_to_model_forward,
98
+ is_flash_attn_2_available,
99
+ is_flash_attn_greater_or_equal_2_10,
100
+ logging,
101
+ )
102
+ from transformers import PretrainedConfig
103
+ from model_cache.dream.configuration_dream import DreamConfig
104
+ from model_cache.dream.generation_utils import DreamGenerationMixin, DreamGenerationConfig
105
+ if is_flash_attn_2_available():
106
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
107
+
108
+
109
+ logger = logging.get_logger(__name__)
110
+
111
+ from transformers import Qwen2ForCausalLM
112
+ _CHECKPOINT_FOR_DOC = "Dream-7B"
113
+ _CONFIG_FOR_DOC = "DreamConfig"
114
+
115
+
116
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Dream
117
class DreamRMSNorm(nn.Module):
    """Root-mean-square layer norm (T5-style): scales by 1/RMS of the last
    dim and a learned per-feature weight; no mean subtraction, no bias."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        DreamRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Compute in float32 for numerical stability, cast back at the end.
        orig_dtype = hidden_states.dtype
        hs = hidden_states.to(torch.float32)
        mean_square = hs.pow(2).mean(dim=-1, keepdim=True)
        normalized = hs * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(orig_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
135
+
136
+
137
+ # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Dream
138
class DreamRotaryEmbedding(nn.Module):
    """Rotary position embedding (RoPE) table generator.

    Produces (cos, sin) tensors for the given position ids; supports the HF
    rope-scaling variants via `ROPE_INIT_FUNCTIONS` (including dynamic types
    that recompute `inv_freq` as sequences grow).
    """

    def __init__(
        self,
        dim=None,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        rope_type="default",
        config: Optional[DreamConfig] = None,
    ):
        super().__init__()
        # TODO (joao): remove the `if` below, only used for BC
        self.rope_kwargs = {}
        if config is None:
            # Legacy path: individual args instead of a config object.
            logger.warning_once(
                "`DreamRotaryEmbedding` can now be fully parameterized by passing the model config through the "
                "`config` argument. All other arguments will be removed in v4.46"
            )
            self.rope_kwargs = {
                "rope_type": rope_type,
                "factor": scaling_factor,
                "dim": dim,
                "base": base,
                "max_position_embeddings": max_position_embeddings,
            }
            self.rope_type = rope_type
            self.max_seq_len_cached = max_position_embeddings
            self.original_max_seq_len = max_position_embeddings
        else:
            # BC: "rope_type" was originally "type"
            if config.rope_scaling is not None:
                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
            else:
                self.rope_type = "default"
            self.max_seq_len_cached = config.max_position_embeddings
            self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    def reset_parameters(self):
        """Recompute `inv_freq` on the buffer's current device (e.g. after a
        device move or re-initialization)."""
        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, self.inv_freq.device, **self.rope_kwargs)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq


    def _dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config, device, seq_len=seq_len, **self.rope_kwargs
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
            self.max_seq_len_cached = seq_len

        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @torch.no_grad()
    def forward(self, x, position_ids):
        """Return (cos, sin) of shape (batch, seq, dim), cast to `x.dtype`;
        `x` is only used for its device and dtype."""
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
229
+
230
+
231
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
232
def rotate_half(x):
    """Rotates half the hidden dims of the input: (a, b) -> (-b, a)."""
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat((-second, first), dim=-1)
237
+
238
+
239
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
240
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*): Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which `cos`/`sin` are unsqueezed so they broadcast
            against q/k: use 1 for [batch, heads, seq, head_dim] layouts and 2
            for [batch, seq, heads, head_dim] layouts.

    Returns:
        `tuple(torch.Tensor)`: the rotated query and key tensors.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    def _rotate(t):
        # Inlined rotate_half: map halves (a, b) of the last dim to (-b, a).
        half = t.shape[-1] // 2
        return torch.cat((-t[..., half:], t[..., :half]), dim=-1)

    rotated_q = q * cos + _rotate(q) * sin
    rotated_k = k * cos + _rotate(k) * sin
    return rotated_q, rotated_k
264
+
265
+
266
# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Dream
class DreamMLP(nn.Module):
    """Gated feed-forward block: down_proj(act(gate_proj(x)) * up_proj(x))."""

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_state):
        """Project up with gating, then back down to the hidden size."""
        gated = self.act_fn(self.gate_proj(hidden_state))
        return self.down_proj(gated * self.up_proj(hidden_state))
279
+
280
+
281
# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Expand key/value heads for grouped-query attention.

    Equivalent to torch.repeat_interleave(x, dim=1, repeats=n_rep): the input goes
    from (batch, num_key_value_heads, seqlen, head_dim) to
    (batch, num_attention_heads, seqlen, head_dim).
    """
    if n_rep == 1:
        # Nothing to repeat — return the input unchanged (no copy).
        return hidden_states
    batch, kv_heads, slen, head_dim = hidden_states.shape
    expanded = hidden_states[:, :, None, :, :].expand(batch, kv_heads, n_rep, slen, head_dim)
    return expanded.reshape(batch, kv_heads * n_rep, slen, head_dim)
292
+
293
+
294
class DreamAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    Note: `is_causal` is False — no causal mask is implied by this module; any masking
    must be supplied through `attention_mask`.
    """

    def __init__(self, config: DreamConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            # Without a layer index, `past_key_value.update` cannot address this
            # layer's slot in the cache.
            # Fixed: the original message dropped the word "lead" ("...and will to errors").
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will lead "
                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = False
        self.attention_dropout = config.attention_dropout

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.rotary_emb = DreamRotaryEmbedding(config=self.config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Eager (matmul + fp32 softmax) attention forward pass.

        Returns:
            (attn_output, attn_weights or None, past_key_value).
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (bsz, seq, heads*dim) -> (bsz, heads, seq, dim)
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32 for a numerically stable softmax
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
399
+
400
+
401
class DreamSdpaAttention(DreamAttention):
    """
    Dream attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `DreamAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    # Adapted from DreamAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        update_kvcache: torch.int32 = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """SDPA-based forward pass; falls back to eager attention when attention
        weights are requested (SDPA cannot return them).

        Note: `update_kvcache` is accepted for signature parity but unused here.
        """
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "DreamModel is using DreamSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

        batch_size, seq_len, _ = hidden_states.size()

        # Project and reshape to (batch, heads, seq, head_dim) in one chain per tensor.
        query_states = (
            self.q_proj(hidden_states).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        )
        key_states = (
            self.k_proj(hidden_states)
            .view(batch_size, seq_len, self.num_key_value_heads, self.head_dim)
            .transpose(1, 2)
        )
        value_states = (
            self.v_proj(hidden_states)
            .view(batch_size, seq_len, self.num_key_value_heads, self.head_dim)
            .transpose(1, 2)
        )

        if position_embeddings is not None:
            cos, sin = position_embeddings
        else:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs
        # with custom attn_mask; see https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attention_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=False,  # hard coded: this model attends bidirectionally
        )

        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
496
class DreamFlexAttention(DreamAttention):
    """
    SDPA-based Dream attention with block-wise control over the KV cache.

    Inherits its weights from `DreamAttention`; only the forward pass differs.
    The extra `update_kvcache` argument selects how `past_key_value` is used:
      * update_kvcache == 0: cached K/V are read and concatenated in front of the
        current ones, but the cache itself is left untouched;
      * update_kvcache > 0: only the first `update_kvcache` positions of the
        current K/V are committed to the cache; the remaining positions are used
        for this call only.
    """

    # Adapted from DreamAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        update_kvcache: torch.int32 = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if output_attentions:
            # SDPA cannot return attention weights; fall back to the eager path.
            # NOTE(review): the eager fallback does not receive `update_kvcache`,
            # so it always performs a plain cache update — confirm this is intended.
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "DreamModel is using DreamSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (bsz, seq, heads*dim) -> (bsz, heads, seq, head_dim)
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            if update_kvcache == 0:
                # Read-only use of the cache: prepend cached K/V for this layer
                # without mutating the cache.
                past_key_states, past_value_states = past_key_value[self.layer_idx]
                key_states=torch.cat([past_key_states, key_states], dim=2)
                value_states=torch.cat([past_value_states, value_states], dim=2)
                # Specific to RoPE models
            else:
                # Commit only the first `update_kvcache` positions to the cache.
                # NOTE(review): cos/sin are sliced along dim 1, which assumes a
                # (bsz, seq_len, rotary_dim) layout — confirm against the rotary module.
                cache_kwargs = {"sin": sin[:,:update_kvcache,:], "cos": cos[:,:update_kvcache,:], "cache_position": cache_position[:update_kvcache]}
                new_key_states, new_value_states = past_key_value.update(key_states[:,:,:update_kvcache, :], value_states[:,:,:update_kvcache, : ], self.layer_idx, cache_kwargs)
                # Attend over (cache incl. the just-committed prefix) + the
                # uncommitted tail of the current block.
                key_states = torch.cat([new_key_states,key_states[:,:,update_kvcache:,:]], dim=2)
                value_states = torch.cat([new_value_states,value_states[:,:,update_kvcache:,:]], dim=2)
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        if attention_mask is not None:  # no matter the length, we just slice it
            # Clone so any downstream in-place edits cannot corrupt the caller's mask.
            atte_mask = attention_mask[:,:, :, : key_states.shape[-2]].clone()

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=atte_mask if attention_mask is not None else None,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=False,  # hard coded: masking comes only from `attention_mask`
        )
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
635
+
636
class DreamDecoderLayer(nn.Module):
    """One Dream transformer layer: pre-norm self-attention and a gated MLP,
    each wrapped in a residual connection."""

    def __init__(self, config: DreamConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        if config.sliding_window and config._attn_implementation != "flash_attention_2":
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                "unexpected results may be encountered."
            )

        # self.self_attn = Dream_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
        # Hard-wired to the KV-cache-aware SDPA variant regardless of
        # config._attn_implementation.
        self.self_attn = DreamFlexAttention(config, layer_idx)

        self.mlp = DreamMLP(config)
        self.input_layernorm = DreamRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = DreamRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        update_kvcache: torch.int32 = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input of shape `(batch, seq_len, embed_dim)`.
            update_kvcache (`torch.int32`, *optional*): number of leading positions to
                commit to the KV cache (0 = read-only cache use); forwarded to the
                attention module.
            attention_mask (`torch.FloatTensor`, *optional*): additive attention mask.
            output_attentions (`bool`, *optional*): whether to also return attention
                weights.
            use_cache (`bool`, *optional*): whether to also return the (possibly
                updated) key/value cache.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached K/V states.
            cache_position (`torch.LongTensor`, *optional*): positions of the input
                tokens in the sequence.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`,
                *optional*): precomputed (cos, sin) rotary embeddings.
            kwargs: ignored; accepted for FSDP and similar wrappers.
        """
        # Self-attention sub-block (pre-norm residual).
        attn_out, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            update_kvcache=update_kvcache,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
        )
        hidden_states = hidden_states + attn_out

        # Feed-forward sub-block (pre-norm residual).
        hidden_states = hidden_states + self.mlp(self.post_attention_layernorm(hidden_states))

        outputs = (hidden_states,)
        if output_attentions:
            outputs = outputs + (self_attn_weights,)
        if use_cache:
            outputs = outputs + (present_key_value,)
        return outputs
722
+
723
class DreamPreTrainedModel(PreTrainedModel):
    """Base class hooking DreamConfig into the HF PreTrainedModel machinery
    (weight init, cache/SDPA capability flags, checkpoint loading)."""

    config_class = DreamConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["DreamDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True

    def _init_weights(self, module):
        """Normal(0, initializer_range) init for Linear/Embedding weights;
        zero biases and zero the padding embedding row."""
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
        *model_args,
        config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
        cache_dir: Optional[Union[str, os.PathLike]] = None,
        ignore_mismatched_sizes: bool = False,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
        use_safetensors: Optional[bool] = None,
        weights_only: bool = True,
        **kwargs,
    ):
        """Load weights as usual, then re-load the generation config through
        `DreamGenerationConfig` so its extra attributes survive."""
        model = super().from_pretrained(
            pretrained_model_name_or_path,
            *model_args,
            config=config,
            cache_dir=cache_dir,
            ignore_mismatched_sizes=ignore_mismatched_sizes,
            force_download=force_download,
            local_files_only=local_files_only,
            token=token,
            revision=revision,
            use_safetensors=use_safetensors,
            weights_only=weights_only,
            **kwargs,
        )
        # NOTE(Lin): we need to override the generation config because the one
        # loaded in `from_pretrained` does not include all the attributes of
        # DreamGenerationConfig.
        model.generation_config = DreamGenerationConfig.from_pretrained(
            pretrained_model_name_or_path,
            cache_dir=cache_dir,
            force_download=force_download,
            resume_download=kwargs.get("resume_download", None),
            proxies=kwargs.get("proxies", None),
            local_files_only=local_files_only,
            token=token,
            revision=revision,
            subfolder=kwargs.get("subfolder", ""),
            _from_auto=kwargs.get("_from_auto", False),
            _from_pipeline=kwargs.get("_from_pipeline", None),
        )
        return model
798
+
799
class DreamBaseModel(DreamPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DreamDecoderLayer`]
    Args:
        config: DreamConfig
    """

    def __init__(self, config: DreamConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [DreamDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self._attn_implementation = config._attn_implementation
        self.norm = DreamRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # Shared rotary embedding: (cos, sin) are computed once per forward and
        # passed down to every layer.
        self.rotary_emb = DreamRotaryEmbedding(config=config)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        update_kvcache: torch.int32 = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """Run the embedding layer, all decoder layers, and the final norm.

        `update_kvcache` is forwarded to every layer and controls how much of
        the current block is committed to the KV cache (see DreamFlexAttention).
        Returns a `BaseModelOutputWithPast` (or a tuple when `return_dict=False`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                # BUGFIX: `DreamDecoderLayer.forward` takes `update_kvcache` as its
                # second positional parameter. The original call omitted that slot,
                # shifting every following argument by one (attention_mask landed in
                # update_kvcache, position_ids in attention_mask, ...). Pass None
                # explicitly — caching is disabled under gradient checkpointing.
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    None,  # update_kvcache
                    attention_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    update_kvcache=update_kvcache,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
936
+
937
+
938
+ class DreamModel(DreamGenerationMixin, DreamPreTrainedModel):
939
+ _tied_weights_keys = ["lm_head.weight"]
940
+
941
+ def __init__(self, config):
942
+ super().__init__(config)
943
+ self.model = DreamBaseModel(config)
944
+ self.vocab_size = config.vocab_size
945
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
946
+
947
+ # Initialize weights and apply final processing
948
+ self.post_init()
949
+
950
+ def reset_rope_parameters(self):
951
+ self.model.rotary_emb.reset_parameters()
952
+ for layer in self.model.layers:
953
+ layer.self_attn.rotary_emb.reset_parameters()
954
+
955
+ def get_input_embeddings(self):
956
+ return self.model.embed_tokens
957
+
958
+ def set_input_embeddings(self, value):
959
+ self.model.embed_tokens = value
960
+
961
+ def get_output_embeddings(self):
962
+ return self.lm_head
963
+
964
+ def set_output_embeddings(self, new_embeddings):
965
+ self.lm_head = new_embeddings
966
+
967
+ def set_decoder(self, decoder):
968
+ self.model = decoder
969
+
970
+ def get_decoder(self):
971
+ return self.model
972
+
973
+ def forward(
974
+ self,
975
+ input_ids: torch.LongTensor = None,
976
+ attention_mask: Optional[torch.Tensor] = None,
977
+ update_kvcache: torch.int32 = None,
978
+ position_ids: Optional[torch.LongTensor] = None,
979
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
980
+ inputs_embeds: Optional[torch.FloatTensor] = None,
981
+ labels: Optional[torch.LongTensor] = None,
982
+ use_cache: Optional[bool] = None,
983
+ output_attentions: Optional[bool] = None,
984
+ output_hidden_states: Optional[bool] = None,
985
+ return_dict: Optional[bool] = None,
986
+ cache_position: Optional[torch.LongTensor] = None,
987
+ num_logits_to_keep: int = 0,
988
+ **loss_kwargs,
989
+ ) -> Union[Tuple, MaskedLMOutput]:
990
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
991
+ output_hidden_states = (
992
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
993
+ )
994
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
995
+
996
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
997
+ outputs = self.model(
998
+ input_ids=input_ids,
999
+ attention_mask=attention_mask,
1000
+ update_kvcache=update_kvcache,
1001
+ position_ids=position_ids,
1002
+ past_key_values=past_key_values,
1003
+ inputs_embeds=inputs_embeds,
1004
+ use_cache=use_cache,
1005
+ output_attentions=output_attentions,
1006
+ output_hidden_states=output_hidden_states,
1007
+ return_dict=return_dict,
1008
+ cache_position=cache_position,
1009
+ )
1010
+
1011
+ hidden_states = outputs[0]
1012
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1013
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
1014
+
1015
+ loss = None
1016
+ if labels is not None:
1017
+ loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
1018
+
1019
+ if not return_dict:
1020
+ output = (logits,) + outputs[1:]
1021
+ return (loss,) + output if loss is not None else output
1022
+
1023
+ return CausalLMOutputWithPast(
1024
+ loss=loss,
1025
+ logits=logits,
1026
+ past_key_values=outputs.past_key_values,
1027
+ hidden_states=outputs.hidden_states,
1028
+ attentions=outputs.attentions,
1029
+ )
model_cache/llada/__pycache__/configuration_llada.cpython-310.pyc ADDED
Binary file (6.24 kB). View file
 
model_cache/llada/__pycache__/configuration_llada.cpython-312.pyc ADDED
Binary file (8.26 kB). View file
 
model_cache/llada/__pycache__/modeling_llada.cpython-310.pyc ADDED
Binary file (40.3 kB). View file
 
model_cache/llada/__pycache__/modeling_llada.cpython-312.pyc ADDED
Binary file (72.6 kB). View file
 
model_cache/llada/configuration_llada.py ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLaDA configuration
3
+ """
4
+ from transformers import AutoConfig, PretrainedConfig
5
+
6
+ from enum import Enum
7
+ from os import PathLike
8
+ from typing import Union
9
+ from dataclasses import asdict, dataclass, field
10
+ from glob import glob
11
+ from pathlib import Path
12
+ from typing import (
13
+ Any,
14
+ Dict,
15
+ Iterable,
16
+ List,
17
+ Optional,
18
+ Tuple,
19
+ Type,
20
+ TypeVar,
21
+ Union,
22
+ cast,
23
+ )
24
+
25
+
26
+ __all__ = [
27
+ "ActivationType",
28
+ "ActivationCheckpointingStrategy",
29
+ "BlockType",
30
+ "LayerNormType",
31
+ "InitFnType",
32
+ "ModelConfig",
33
+ ]
34
+
35
+ PathOrStr = Union[str, PathLike]
36
+
37
+
38
+ class StrEnum(str, Enum):
39
+ """
40
+ This is equivalent to Python's :class:`enum.StrEnum` since version 3.11.
41
+ We include this here for compatibility with older version of Python.
42
+ """
43
+
44
+ def __str__(self) -> str:
45
+ return self.value
46
+
47
+ def __repr__(self) -> str:
48
+ return f"'{str(self)}'"
49
+
50
+
51
+ class LayerNormType(StrEnum):
52
+ default = "default"
53
+ """
54
+ The default LayerNorm implementation, equivalent to PyTorch's built-in version.
55
+ """
56
+
57
+ low_precision = "low_precision"
58
+ """
59
+ A low-precision version of the default LayerNorm.
60
+ """
61
+
62
+ rms = "rms"
63
+ """
64
+ An RMSNorm implementation. When using ``torch.compile`` this is
65
+ probably the fastest implementation.
66
+ """
67
+
68
+ gemma_rms = "gemma_rms"
69
+ """
70
+ An RMSNorm implementation by gemmma. When using ``torch.compile`` this is
71
+ probably the fastest implementation.
72
+ """
73
+
74
+ amd_compatible = "amd_compatible"
75
+ """
76
+ LayerNorm implemented manually to work around an issue with ROCm.
77
+ """
78
+
79
+
80
+ class ActivationType(StrEnum):
81
+ gelu = "gelu"
82
+ relu = "relu"
83
+ silu = "silu"
84
+ swiglu = "swiglu"
85
+
86
+
87
+ class BlockType(StrEnum):
88
+ sequential = "sequential"
89
+ parallel = "parallel"
90
+
91
+ llama = "llama"
92
+ """
93
+ A block similar to the sequential block with slightly different
94
+ implementations of operations like attention to imitate the behavior of Llama.
95
+ """
96
+
97
+
98
+ class InitFnType(StrEnum):
99
+ mitchell = "mitchell"
100
+ """
101
+ The strategy suggested to us by Mitchell Wortsman from UW.
102
+ This uses a truncated normal distribution with an adaptive standard deviation that depends
103
+ on the size of the weights as well as the depth of the layer.
104
+ """
105
+
106
+ normal = "normal"
107
+ """
108
+ All weights are initialized from the same normal distribution.
109
+ """
110
+
111
+ kaiming_normal = "kaiming_normal"
112
+ """
113
+ All weights are initialized with the Kaiming method from a normal distribution.
114
+ Note this currently won't work with FSDP.
115
+ """
116
+
117
+ fan_in = "fan_in"
118
+ """
119
+ "Fan-in variance scaling", i.e. normal with a standard deviation of ``1/sqrt(d_in)`` where ``d_in``
120
+ is the input dimensionality of the kernel.
121
+ """
122
+
123
+ full_megatron = "full_megatron"
124
+ """
125
+ This is what metaseq calls "full megatron init". It is the init used for Llama 2.
126
+ """
127
+
128
+
129
+ @dataclass
130
+ class ModelConfig():
131
+ """
132
+ LLaDA (model) configuration.
133
+ """
134
+
135
+ # Note that the defaults for these attributes are equivalent to the base GPT2 model.
136
+
137
+ d_model: int = 768
138
+ """
139
+ The hidden size of the model.
140
+ """
141
+
142
+ n_heads: int = 12
143
+ """
144
+ The number of self-attention heads.
145
+ """
146
+
147
+ n_kv_heads: Optional[int] = None
148
+ """
149
+ The number of heads to use for keys and values. Defaults to `n_heads`.
150
+ Set this to ``None`` or ``n_heads`` for normal multi-head attention.
151
+ Set this to 1 for multi-query attention.
152
+ Set it to some in-between value for Llama2-style grouped query attention.
153
+ """
154
+
155
+ n_layers: int = 12
156
+ """
157
+ The number of layers/blocks.
158
+ """
159
+
160
+ mlp_ratio: int = 4
161
+ """
162
+ The ratio of the inner MLP dimensionality to ``d_model``.
163
+ This is only used when ``mlp_hidden_size`` is not set.
164
+ """
165
+
166
+ mlp_hidden_size: Optional[int] = None
167
+ """
168
+ Set the exact hidden size for the MLP. Otherwise the inner MLP hidden size will be set to `mlp_ratio * d_model`.
169
+ """
170
+
171
+ activation_type: ActivationType = ActivationType.swiglu
172
+ """
173
+ The activation function to use within the MLP layers.
174
+ """
175
+
176
+ block_type: BlockType = BlockType.sequential
177
+ """
178
+ The transformer block implementation.
179
+ """
180
+
181
+ block_group_size: int = 1
182
+ """
183
+ The number of blocks to group together into a single parent block.
184
+ This has no affect on the number of parameters in the model and is only used to wrap groups
185
+ of blocks together with a single FSDP wrapper during training.
186
+ """
187
+
188
+ alibi: bool = False
189
+ """
190
+ If ``True``, use ALiBi embeddings. Mutually exclusive with ``rope``.
191
+ """
192
+
193
+ alibi_bias_max: float = 8.0
194
+ """
195
+ Maximum absolute value of ALiBi bias.
196
+ """
197
+
198
+ rope: bool = False
199
+ """
200
+ Use rotary positional embeddings (RoPE). Mutually exclusive with ``alibi``.
201
+ """
202
+
203
+ rope_full_precision: bool = True
204
+ """
205
+ If ``True``, apply RoPE embeddings at full precision regardless of the input type. Otherwise,
206
+ apply RoPE at the precision of the input.
207
+ """
208
+
209
+ flash_attention: bool = False
210
+ """
211
+ If ``True``, use ``FlashAttention``.
212
+ """
213
+
214
+ attention_dropout: float = 0.1
215
+ """
216
+ The dropout probability within the attention modules.
217
+ """
218
+
219
+ multi_query_attention: Optional[bool] = None
220
+ """
221
+ Use the Multi-Query formulation of attention used in PaLM. This reduces the number of parameters
222
+ and is more efficient during inference.
223
+ """
224
+
225
+ attention_layer_norm: bool = False
226
+ """
227
+ Apply layer norm to the keys and queries within the attention mechanism.
228
+ This can help stabilize training.
229
+ """
230
+
231
+ residual_dropout: float = 0.1
232
+ """
233
+ The dropout probability for the MLP and attention output within each block.
234
+ """
235
+
236
+ embedding_dropout: float = 0.1
237
+ """
238
+ The dropout probability for embeddings.
239
+ """
240
+
241
+ input_emb_norm: bool = False
242
+ """
243
+ An input hidden_states norm implementation by gemmma.
244
+ """
245
+
246
+ layer_norm_type: LayerNormType = LayerNormType.default
247
+ """
248
+ The layernorm implementation to use.
249
+ """
250
+
251
+ layer_norm_with_affine: bool = True
252
+ """
253
+ Whether to include bias and weight parameters for the layer norms.
254
+ This only affects layer norms that are immediately followed by a linear layer in the forward pass,
255
+ so everything except QK-norms. To turn off affines for QK norms as well, set :attr:`attention_layer_norm_with_affine`
256
+ to ``False``.
257
+ """
258
+
259
+ rms_norm_eps: float = 1e-05
260
+ """
261
+ The rms layernorm eps param.
262
+ """
263
+
264
+ attention_layer_norm_with_affine: bool = True
265
+ """
266
+ Toggle affine transform for the QK norms.
267
+ """
268
+
269
+ max_sequence_length: int = 1024
270
+ """
271
+ The maximum input sequence length supported by the model.
272
+ """
273
+
274
+ rope_theta: float = 10000.0
275
+ """
276
+ The rope base param.
277
+ """
278
+
279
+ include_qkv_bias: Optional[bool] = False
280
+ """
281
+ Whether or not to include bias parameters in qkv linear layers.
282
+ """
283
+
284
+ include_bias: bool = False
285
+ """
286
+ Whether or not to include bias parameters in linear layers.
287
+ In PaLM, they got rid of all bias terms because they found that large
288
+ models tend to have near 0 bias terms anyway.
289
+ """
290
+
291
+ bias_for_layer_norm: Optional[bool] = None
292
+ """
293
+ Whether or not to include bias parameters in layer norm.
294
+ This is separate from the include_bias parameter, because of a ROCm crash when biases are disabled in
295
+ layer norm.
296
+ When this is None (the default), it inherits the setting from include_bias.
297
+ """
298
+
299
+ scale_logits: bool = False
300
+ """
301
+ If ``True``, scale the output logits by ``1 / sqrt(d_model)``.
302
+ """
303
+
304
+ vocab_size: int = 50257
305
+ """
306
+ Vocabulary size of the model.
307
+ """
308
+
309
+ embedding_size: Optional[int] = 50304
310
+ """
311
+ The number of embeddings, i.e. the number of tokens. If set to ``None`` it will default
312
+ to ``vocab_size``. If ``vocab_size`` is not a multiple of 128, setting this to the
313
+ next multiple of 128 that's greater than ``vocab_size`` can improve throughput
314
+ substantially.
315
+ """
316
+
317
+ weight_tying: bool = True
318
+ """
319
+ Whether to tie output linear weights to the input embedding.
320
+ """
321
+
322
+ eos_token_id: int = 50256
323
+ """
324
+ The ID of the end-of-sentence special token.
325
+ """
326
+
327
+ pad_token_id: int = 50256
328
+ """
329
+ The ID of the token to use for padding. Defaults to the ID of the EOS token.
330
+ """
331
+
332
+ mask_token_id: Optional[int] = 50256
333
+ """
334
+ The ID of the token to use for mask token. Defaults to the ID of the EOS token.
335
+ """
336
+
337
+ init_device: Optional[str] = None
338
+ """
339
+ The torch device to use when initializing the model parameters, e.g. "cpu", "cuda:0", "meta".
340
+ """
341
+
342
+ init_fn: InitFnType = InitFnType.normal
343
+ """
344
+ The weight initialization strategy.
345
+ """
346
+
347
+ init_std: float = 0.02
348
+ """
349
+ The standard deviation to use when initializing weights with a "fixed distribution" ``init_fn``, such
350
+ as "normal".
351
+ """
352
+
353
+ init_cutoff_factor: Optional[float] = None
354
+ """
355
+ A positive factor used to scale the cutoff values when initializing weights with a "fixed distribution" ``init_fn``, such
356
+ as "normal". Setting this to None means values are not cutoff.
357
+ """
358
+
359
+ precision: Optional[str] = None
360
+ """
361
+ Precision used to train/evaluate with. You shouldn't set this directly.
362
+ See :data:`TrainConfig.precision` instead.
363
+ """
364
+
365
+ @property
366
+ def effective_n_kv_heads(self) -> int:
367
+ if self.n_kv_heads is None:
368
+ if self.multi_query_attention is True:
369
+ return 1
370
+ else:
371
+ return self.n_heads
372
+ else:
373
+ if self.multi_query_attention is None:
374
+ return self.n_kv_heads
375
+ if self.multi_query_attention:
376
+ n_kv_heads_should_be = 1
377
+ else:
378
+ n_kv_heads_should_be = self.n_heads
379
+ if self.n_kv_heads == n_kv_heads_should_be:
380
+ return n_kv_heads_should_be
381
+ else:
382
+ raise Exception(
383
+ "You can't set `multi_query_attention` and `n_kv_heads` at the same time."
384
+ )
385
+
386
+ class ActivationCheckpointingStrategy(StrEnum):
387
+ whole_layer = "whole_layer"
388
+ """
389
+ Checkpoint every transformer layer.
390
+ """
391
+
392
+ one_in_two = "one_in_two"
393
+ """
394
+ Checkpoint one in two transformer layers.
395
+ """
396
+
397
+ one_in_three = "one_in_three"
398
+ """
399
+ Checkpoint one in three transformer layers.
400
+ """
401
+
402
+ one_in_four = "one_in_four"
403
+ """
404
+ Checkpoint one in four transformer layers.
405
+ """
406
+
407
+ two_in_three = "two_in_three"
408
+ """
409
+ Checkpoint two out of every three transformer layers.
410
+ """
411
+
412
+ three_in_four = "three_in_four"
413
+ """
414
+ Checkpoint three out of four of every transformer layers.
415
+ """
416
+
417
+ four_in_five = "four_in_five"
418
+ """
419
+ Checkpoint four out of five of every transformer layers.
420
+ """
421
+
422
+ nine_in_ten = "nine_in_ten"
423
+ """
424
+ Checkpoint nine out of ten of every transformer layers.
425
+ """
426
+
427
+ fine_grained = "fine_grained"
428
+ """
429
+ Focus checkpointing on where it is cheap to recompute and saves most memory.
430
+ """
431
+
432
+
433
+ class LLaDAConfig(PretrainedConfig):
434
+ model_type = "llada"
435
+ keys_to_ignore_at_inference = ["past_key_values"] # TODO: confirm
436
+
437
+ def __init__(self, use_cache: bool = False, **kwargs):
438
+ model_config = ModelConfig()
439
+ all_kwargs = model_config.__dict__
440
+ all_kwargs.update(kwargs)
441
+ all_kwargs.update({"use_cache": use_cache})
442
+ all_kwargs.update(
443
+ {
444
+ "architectures": all_kwargs.get("architectures", ["LLaDAModelLM"])
445
+ }
446
+ )
447
+ super().__init__(**all_kwargs)
448
+
449
+ @property
450
+ def num_attention_heads(self):
451
+ return self.n_heads
452
+
453
+ @property
454
+ def num_hidden_layers(self):
455
+ return self.n_layers
456
+
457
+ @property
458
+ def hidden_size(self):
459
+ return self.d_model
460
+
461
+
462
+ # Register the config class so that it is available for transformer pipelines, auto-loading etc.
463
+ AutoConfig.register("llada", LLaDAConfig)
model_cache/llada/modeling_llada.py ADDED
@@ -0,0 +1,1504 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import math
5
+ import sys
6
+ from abc import abstractmethod
7
+ from collections import defaultdict
8
+ from functools import partial
9
+ from typing import (
10
+ Callable,
11
+ Dict,
12
+ Iterable,
13
+ List,
14
+ NamedTuple,
15
+ Optional,
16
+ Sequence,
17
+ Set,
18
+ Tuple,
19
+ cast,
20
+ )
21
+ from dataclasses import fields
22
+ from typing import List, Optional, Tuple, Union
23
+
24
+ import torch
25
+ import torch.backends.cuda
26
+ import torch.nn as nn
27
+ import torch.nn.functional as F
28
+ from torch import einsum
29
+ from transformers import PreTrainedModel
30
+ from transformers.modeling_outputs import CausalLMOutputWithPast
31
+ from transformers.models.auto import AutoModel
32
+ from transformers.cache_utils import Cache
33
+
34
+ from .configuration_llada import (
35
+ LLaDAConfig,
36
+ StrEnum,
37
+ InitFnType,
38
+ ActivationType,
39
+ BlockType,
40
+ LayerNormType,
41
+ ModelConfig,
42
+ ActivationCheckpointingStrategy,
43
+ )
44
+
45
+ if sys.version_info.minor > 8:
46
+ from collections.abc import MutableMapping
47
+ elif sys.version_info.minor == 8:
48
+ from typing import MutableMapping
49
+ else:
50
+ raise SystemExit("This script supports Python 3.8 or higher")
51
+
52
+ __all__ = [
53
+ "LayerNormBase",
54
+ "LayerNorm",
55
+ "RMSLayerNorm",
56
+ "GemmaRMSLayerNorm",
57
+ "RotaryEmbedding",
58
+ "Activation",
59
+ "GELU",
60
+ "ReLU",
61
+ "SwiGLU",
62
+ "LLaDABlock",
63
+ "LLaDASequentialBlock",
64
+ "LLaDAModel",
65
+ "LLaDAOutput",
66
+ "LLaDAGenerateOutput",
67
+ ]
68
+
69
+
70
+ log = logging.getLogger(__name__)
71
+
72
+
73
+ class ModuleType(StrEnum):
74
+ in_module = "in"
75
+ out_module = "out"
76
+ emb = "emb"
77
+ final_out = "final_out"
78
+
79
+
80
+ def init_weights(
81
+ config: ModelConfig,
82
+ module: Union[nn.Linear, nn.Embedding],
83
+ d: Optional[int] = None,
84
+ layer_id: Optional[int] = None,
85
+ std_factor: float = 1.0,
86
+ type_of_module: Optional[ModuleType] = None,
87
+ ) -> None:
88
+ """
89
+ Initialize weights of a linear or embedding module.
90
+
91
+ :param config: The model config.
92
+ :param module: The linear or embedding submodule to initialize.
93
+ :param d: The effective input dimensionality of the weights. This could be smaller than the actual dimensions
94
+ for fused layers.
95
+ :param layer_id: When set, the standard deviation for the "mitchell" method will be adjusted by
96
+ ``1 / sqrt(2 * (layer_id + 1))``.
97
+ """
98
+ d = d if d is not None else config.d_model
99
+ if config.init_fn == InitFnType.normal:
100
+ std = config.init_std * std_factor
101
+ if config.init_cutoff_factor is not None:
102
+ cutoff_value = config.init_cutoff_factor * std
103
+ nn.init.trunc_normal_(module.weight, mean=0.0, std=std, a=-cutoff_value, b=cutoff_value)
104
+ else:
105
+ nn.init.normal_(module.weight, mean=0.0, std=std)
106
+ elif config.init_fn == InitFnType.mitchell:
107
+ std = std_factor / math.sqrt(d)
108
+ if layer_id is not None:
109
+ std = std / math.sqrt(2 * (layer_id + 1))
110
+ nn.init.trunc_normal_(module.weight, mean=0.0, std=std, a=-3 * std, b=3 * std)
111
+ elif config.init_fn == InitFnType.kaiming_normal:
112
+ nn.init.kaiming_normal_(module.weight, nonlinearity="relu")
113
+ elif config.init_fn == InitFnType.fan_in:
114
+ std = std_factor / math.sqrt(d)
115
+ nn.init.normal_(module.weight, mean=0.0, std=std)
116
+ elif config.init_fn == InitFnType.full_megatron:
117
+ if type_of_module is None:
118
+ raise RuntimeError(f"When using the {InitFnType.full_megatron} init, every module must have a type.")
119
+
120
+ cutoff_factor = config.init_cutoff_factor
121
+ if cutoff_factor is None:
122
+ cutoff_factor = 3
123
+
124
+ if type_of_module == ModuleType.in_module:
125
+ # for att_proj (same as QKV), ff_proj
126
+ std = config.init_std
127
+ elif type_of_module == ModuleType.out_module:
128
+ # for attn_out, ff_out
129
+ std = config.init_std / math.sqrt(2.0 * config.n_layers)
130
+ elif type_of_module == ModuleType.emb:
131
+ # positional embeddings (wpe)
132
+ # token embeddings (wte)
133
+ std = config.init_std
134
+ elif type_of_module == ModuleType.final_out:
135
+ # final output (ff_out)
136
+ std = config.d_model**-0.5
137
+ else:
138
+ raise RuntimeError(f"Unknown module type '{type_of_module}'")
139
+ nn.init.trunc_normal_(
140
+ module.weight,
141
+ mean=0.0,
142
+ std=std,
143
+ a=-cutoff_factor * std,
144
+ b=cutoff_factor * std,
145
+ )
146
+ else:
147
+ raise NotImplementedError(config.init_fn)
148
+
149
+ if isinstance(module, nn.Linear):
150
+ if module.bias is not None:
151
+ nn.init.zeros_(module.bias)
152
+
153
+ if config.init_fn == InitFnType.normal and getattr(module, "_is_residual", False):
154
+ with torch.no_grad():
155
+ module.weight.div_(math.sqrt(2 * config.n_layers))
156
+
157
+
158
+ def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False):
159
+ """
160
+ Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the dtype when ``check_neg_inf``
161
+ is ``True`` and to replace ``float("inf")`` with the maximum value of the dtype when ``check_pos_inf`` is ``True``.
162
+ """
163
+ if check_neg_inf:
164
+ x.masked_fill_(x == float("-inf"), torch.finfo(x.dtype).min)
165
+ if check_pos_inf:
166
+ x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max)
167
+
168
+
169
+ def activation_checkpoint_function(cfg: ModelConfig):
170
+ preserve_rng_state = (
171
+ (cfg.attention_dropout == 0.0) and (cfg.embedding_dropout == 0.0) and (cfg.residual_dropout == 0.0)
172
+ )
173
+ from torch.utils.checkpoint import checkpoint
174
+
175
+ return partial(
176
+ checkpoint,
177
+ preserve_rng_state=preserve_rng_state,
178
+ use_reentrant=False,
179
+ )
180
+
181
+
182
+ class BufferCache(dict, MutableMapping[str, torch.Tensor]):
183
+ """
184
+ Cache for attention biases and other things that would normally be stored as buffers.
185
+ We avoid using buffers because we've run into various issues doing so with FSDP.
186
+ In general it appears the way FSDP handles buffers is not well-defined.
187
+ It doesn't shard them but apparently it does synchronize them across processes, which we want to avoid
188
+ since (A) it isn't necessary, and (B) we sometimes have `-inf` in these biases which might get turned into
189
+ NaNs when they're synchronized due to casting or some other issue.
190
+ """
191
+
192
+
193
+ def _non_meta_init_device(config: ModelConfig) -> torch.device:
194
+ if config.init_device is not None and config.init_device != "meta":
195
+ return torch.device(config.init_device)
196
+ else:
197
+ return torch.device("cuda" if torch.cuda.is_available() else "cpu")
198
+
199
+
200
+ class Dropout(nn.Dropout):
201
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
202
+ if self.p == 0.0:
203
+ return input
204
+ else:
205
+ return F.dropout(input, self.p, self.training, self.inplace)
206
+
207
+
208
+ class LayerNormBase(nn.Module):
209
+ def __init__(
210
+ self,
211
+ config: ModelConfig,
212
+ *,
213
+ size: Optional[int] = None,
214
+ elementwise_affine: Optional[bool] = True,
215
+ eps: float = 1e-05,
216
+ ):
217
+ super().__init__()
218
+ self.config = config
219
+ self.eps = eps
220
+ self.normalized_shape = (size or config.d_model,)
221
+ if elementwise_affine or (elementwise_affine is None and self.config.layer_norm_with_affine):
222
+ self.weight = nn.Parameter(torch.ones(self.normalized_shape, device=config.init_device))
223
+ use_bias = self.config.bias_for_layer_norm
224
+ if use_bias is None:
225
+ use_bias = self.config.include_bias
226
+ if use_bias:
227
+ self.bias = nn.Parameter(torch.zeros(self.normalized_shape, device=config.init_device))
228
+ else:
229
+ self.register_parameter("bias", None)
230
+ else:
231
+ self.register_parameter("bias", None)
232
+ self.register_parameter("weight", None)
233
+
234
+ @abstractmethod
235
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
236
+ raise NotImplementedError
237
+
238
+ @classmethod
239
+ def build(cls, config: ModelConfig, size: Optional[int] = None, **kwargs) -> LayerNormBase:
240
+ if config.layer_norm_type == LayerNormType.default:
241
+ return LayerNorm(config, size=size, low_precision=False, **kwargs)
242
+ elif config.layer_norm_type == LayerNormType.low_precision:
243
+ return LayerNorm(config, size=size, low_precision=True, **kwargs)
244
+ elif config.layer_norm_type == LayerNormType.rms:
245
+ return RMSLayerNorm(config, size=size, **kwargs)
246
+ elif config.layer_norm_type == LayerNormType.gemma_rms:
247
+ return GemmaRMSLayerNorm(config, size=size, **kwargs)
248
+ else:
249
+ raise NotImplementedError(f"Unknown LayerNorm type: '{config.layer_norm_type}'")
250
+
251
+ def _cast_if_autocast_enabled(self, tensor: torch.Tensor, dtype: Optional[torch.dtype] = None) -> torch.Tensor:
252
+ # NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the separate function
253
+ # `is_autocast_cpu_enabled()` for CPU autocast.
254
+ # See https://github.com/pytorch/pytorch/issues/110966.
255
+ if tensor.device.type == "cuda" and torch.is_autocast_enabled():
256
+ return tensor.to(dtype=dtype if dtype is not None else torch.get_autocast_gpu_dtype())
257
+ elif tensor.device.type == "cpu" and torch.is_autocast_cpu_enabled():
258
+ return tensor.to(dtype=dtype if dtype is not None else torch.get_autocast_cpu_dtype())
259
+ else:
260
+ return tensor
261
+
262
+ def reset_parameters(self):
263
+ if self.weight is not None:
264
+ torch.nn.init.ones_(self.weight) # type: ignore
265
+ if self.bias is not None:
266
+ torch.nn.init.zeros_(self.bias) # type: ignore
267
+
268
+
269
class LayerNorm(LayerNormBase):
    """
    The default :class:`LayerNorm` implementation which can optionally run in low precision.
    """

    def __init__(
        self,
        config: ModelConfig,
        size: Optional[int] = None,
        low_precision: bool = False,
        elementwise_affine: Optional[bool] = None,
        eps: float = 1e-05,
    ):
        super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=eps)
        # When True, inputs/params are cast to the autocast dtype before the norm.
        self.low_precision = low_precision

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not self.low_precision:
            return F.layer_norm(x, self.normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps)
        # Low-precision path: downcast input and affine params to the autocast
        # dtype, then run layer norm itself with autocast disabled.
        device_type = x.device.type
        x_cast = self._cast_if_autocast_enabled(x)
        weight_cast = self._cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
        bias_cast = self._cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
        with torch.autocast(enabled=False, device_type=device_type):
            return F.layer_norm(
                x_cast, self.normalized_shape, weight=weight_cast, bias=bias_cast, eps=self.eps
            )
299
+
300
+
301
class RMSLayerNorm(LayerNormBase):
    """
    RMS layer norm, a simplified :class:`LayerNorm` implementation.
    """

    def __init__(
        self,
        config: ModelConfig,
        size: Optional[int] = None,
        elementwise_affine: Optional[bool] = None,
        eps: float = 1e-5,
    ):
        # NOTE: the `eps` argument is intentionally ignored here; the effective
        # epsilon always comes from `config.rms_norm_eps` (same as GemmaRMSLayerNorm).
        super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=config.rms_norm_eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize in float32 for numerical stability, then cast back.
        input_dtype = x.dtype
        x_f32 = x.to(torch.float32)
        mean_sq = (x_f32 * x_f32).mean(dim=-1, keepdim=True)
        normed = (x_f32 * torch.rsqrt(mean_sq + self.eps)).to(input_dtype)

        # Optional affine transform (weight/bias come from LayerNormBase).
        if self.weight is None:
            return normed
        if self.bias is None:
            return self.weight * normed
        return self.weight * normed + self.bias
333
+
334
+
335
class GemmaRMSLayerNorm(LayerNormBase):
    """
    Gemma RMS layer norm, a simplified :class:`LayerNorm` implementation.
    """

    def __init__(
        self,
        config: ModelConfig,
        size: Optional[int] = None,
        elementwise_affine: Optional[bool] = None,
        eps: float = 1e-5,
    ):
        # NOTE: the `eps` argument is intentionally ignored here; the effective
        # epsilon always comes from `config.rms_norm_eps`.
        super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=config.rms_norm_eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Gemma-style RMS norm: computed in float32 with autocast disabled, and
        # the affine scale is (1 + weight) rather than weight.
        with torch.autocast(enabled=False, device_type=x.device.type):
            input_dtype = x.dtype
            x_f32 = x.to(torch.float32)
            mean_sq = x_f32.pow(2).mean(-1, keepdim=True)
            normed = (x_f32 * torch.rsqrt(mean_sq + self.eps)).to(input_dtype)

            if self.weight is None:
                return normed
            scaled = normed * (1 + self.weight)
            return scaled if self.bias is None else scaled + self.bias
364
+
365
+
366
class RotaryEmbedding(nn.Module):
    """
    [Rotary positional embeddings (RoPE)](https://arxiv.org/abs/2104.09864).

    Sin/cos tables are built once (in full precision) and memoized in the shared
    ``BufferCache``, so all layers reuse the same buffers.
    """

    def __init__(self, config: ModelConfig, cache: BufferCache):
        super().__init__()
        self.config = config
        self.__cache = cache
        # Warm up cache.
        self.rope_theta = config.rope_theta
        self.get_rotary_embedding(config.max_sequence_length, _non_meta_init_device(config))

    def get_rotary_embedding(self, seq_len: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(pos_sin, pos_cos)`` tables of shape ``(1, 1, seq_len, head_dim)``.

        Serves from the cache whenever the cached tables cover ``seq_len``;
        otherwise rebuilds them (in float32, with autocast disabled) and caches.
        """
        if (
            (pos_sin := self.__cache.get("rope_pos_sin")) is not None
            and (pos_cos := self.__cache.get("rope_pos_cos")) is not None
            and pos_sin.shape[-2] >= seq_len
            and pos_cos.shape[-2] >= seq_len
        ):
            # Cache hit: migrate to the requested device if needed, then slice to length.
            if pos_sin.device != device:
                pos_sin = pos_sin.to(device)
                self.__cache["rope_pos_sin"] = pos_sin
            if pos_cos.device != device:
                pos_cos = pos_cos.to(device)
                self.__cache["rope_pos_cos"] = pos_cos
            return pos_sin[:, :, :seq_len, :], pos_cos[:, :, :seq_len, :]

        # Cache miss: rebuild the full tables in float32.
        with torch.autocast(device.type, enabled=False):
            dim = self.config.d_model // self.config.n_heads
            # Standard RoPE inverse frequencies: theta^(-2i/dim) for i in [0, dim/2).
            inv_freq = 1.0 / (self.rope_theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim))
            seq = torch.arange(seq_len, device=device, dtype=torch.float)
            freqs = einsum("i , j -> i j", seq, inv_freq)
            # Duplicate the frequency table so it spans the whole head dimension.
            positions = torch.cat((freqs, freqs), dim=-1)
            pos_sin, pos_cos = positions.sin()[None, None, :, :], positions.cos()[None, None, :, :]
            self.__cache["rope_pos_sin"] = pos_sin
            self.__cache["rope_pos_cos"] = pos_cos
        return pos_sin, pos_cos

    def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
        """Split the head dim in two halves and rotate: ``(x1, x2) -> (-x2, x1)``."""
        B, nh, T, hs = x.size()
        x = x.view(B, nh, T, 2, hs // 2)
        x1, x2 = x.unbind(dim=-2)
        return torch.cat((-x2, x1), dim=-1)

    def apply_rotary_pos_emb(self, pos_sin: torch.Tensor, pos_cos: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
        """Apply the rotary transform to ``t`` using precomputed sin/cos tables."""
        return ((t * pos_cos) + (self.rotate_half(t) * pos_sin)).to(t.dtype)

    def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Rotate queries and keys; outputs are cast back to the input dtypes."""
        if self.config.rope_full_precision:
            q_, k_ = q.float(), k.float()
        else:
            q_, k_ = q, k

        with torch.autocast(q.device.type, enabled=False):
            query_len, key_len = q_.shape[-2], k_.shape[-2]  # could be different if layer_past not None
            pos_sin, pos_cos = self.get_rotary_embedding(key_len, q_.device)
            pos_sin = pos_sin.type_as(q_)
            pos_cos = pos_cos.type_as(q_)
            # Queries only cover the last `query_len` positions of the key sequence
            # (earlier positions live in the KV cache).
            q_ = self.apply_rotary_pos_emb(
                pos_sin[:, :, key_len - query_len : key_len, :],
                pos_cos[:, :, key_len - query_len : key_len, :],
                q_,
            )
            k_ = self.apply_rotary_pos_emb(pos_sin, pos_cos, k_)
        return q_.type_as(q), k_.type_as(k)
432
+
433
+
434
class Activation(nn.Module):
    """Abstract base class for the MLP activation functions used by this model."""

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config

    @abstractmethod
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        raise NotImplementedError

    @property
    @abstractmethod
    def output_multiplier(self) -> float:
        """Ratio of the activation's output width to its input width."""
        raise NotImplementedError

    @classmethod
    def build(cls, config: ModelConfig) -> Activation:
        """Construct the activation selected by ``config.activation_type``."""
        act_type = config.activation_type
        if act_type == ActivationType.gelu:
            return cast(Activation, GELU(approximate="none"))
        if act_type == ActivationType.relu:
            return cast(Activation, ReLU(inplace=False))
        if act_type == ActivationType.silu:
            return cast(Activation, SiLU(inplace=False))
        if act_type == ActivationType.swiglu:
            return SwiGLU(config)
        raise NotImplementedError(f"Unknown activation: '{config.activation_type}'")
460
+
461
+
462
class GELU(nn.GELU):
    """GELU activation; preserves the hidden width (multiplier 1)."""

    @property
    def output_multiplier(self) -> float:
        return 1.0
466
+
467
+
468
class ReLU(nn.ReLU):
    """ReLU activation; preserves the hidden width (multiplier 1)."""

    @property
    def output_multiplier(self) -> float:
        return 1.0
472
+
473
class SiLU(nn.SiLU):
    """SiLU (swish) activation; preserves the hidden width (multiplier 1)."""

    @property
    def output_multiplier(self) -> float:
        return 1.0
477
+
478
class SwiGLU(Activation):
    """SwiGLU: splits the last dim in half and gates one half with SiLU of the other."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        value, gate = x.chunk(2, dim=-1)
        return F.silu(gate) * value

    @property
    def output_multiplier(self) -> float:
        # The chunk halves the hidden dimension.
        return 0.5
486
+
487
+
488
def causal_attention_bias(seq_len: int, device: torch.device) -> torch.FloatTensor:
    """Build an additive causal attention bias of shape ``(1, 1, seq_len, seq_len)``.

    Entries strictly above the diagonal (future positions) are set to the most
    negative finite float so softmax effectively masks them; the rest are 0.
    """
    mask = torch.ones(seq_len, seq_len, device=device, dtype=torch.float).triu_(diagonal=1)
    bias = mask.masked_fill(mask == 1, torch.finfo(mask.dtype).min)
    return bias.view(1, 1, seq_len, seq_len)  # type: ignore
495
+
496
+
497
def get_causal_attention_bias(cache: BufferCache, seq_len: int, device: torch.device) -> torch.Tensor:
    """Fetch the causal attention bias from ``cache``, rebuilding it when absent or too small."""
    cached = cache.get("causal_attention_bias")
    if cached is not None and cached.shape[-1] >= seq_len:
        # Cache hit: just make sure the tensor lives on the right device.
        if cached.device != device:
            cached = cached.to(device)
            cache["causal_attention_bias"] = cached
        return cached
    # Cache miss: rebuild in full precision.
    with torch.autocast(device.type, enabled=False):
        causal_bias = causal_attention_bias(seq_len, device)
        cache["causal_attention_bias"] = causal_bias
    return causal_bias
507
+
508
+
509
def alibi_attention_bias(seq_len: int, config: ModelConfig, device: torch.device) -> torch.FloatTensor:
    """Build the ALiBi attention bias of shape ``(1, n_heads, seq_len, seq_len)``.

    Each entry is ``-|i - j|`` scaled per head by ``2**-m``, so attention decays
    linearly with distance at a head-specific rate.
    """
    positions = torch.arange(1 - seq_len, 1, dtype=torch.float, device=device)

    # shape: (1, 1, seq_len, seq_len) -- negative absolute distance |i - j|.
    distance = -(positions.view(1, 1, 1, seq_len) - positions.view(1, 1, seq_len, 1)).abs()

    # Per-head slope exponents, shape: (n_heads,).
    slopes = torch.arange(1, config.n_heads + 1, dtype=torch.float, device=device)
    slopes = slopes * (config.alibi_bias_max / config.n_heads)

    # shape: (1, n_heads, seq_len, seq_len)
    return distance * (1.0 / (2 ** slopes.view(1, config.n_heads, 1, 1)))  # type: ignore
522
+
523
+
524
class LLaDABlock(nn.Module):
    """
    A base class for transformer block implementations.

    Owns the pieces shared by all block variants: dropout, optional q/k norms,
    the activation, the attention/FF output projections, rotary embeddings, and
    the (optional) flash-attention kernel. Subclasses wire up the input
    projections and the forward pass.
    """

    def __init__(self, layer_id: int, config: ModelConfig, cache: BufferCache):
        super().__init__()
        self.layer_id = layer_id
        self.config = config
        # MLP hidden size: explicit `mlp_hidden_size` wins over `mlp_ratio * d_model`.
        self.hidden_size = (
            config.mlp_hidden_size if config.mlp_hidden_size is not None else config.mlp_ratio * config.d_model
        )
        self.__cache = cache
        assert config.d_model % config.n_heads == 0

        # Set via `set_activation_checkpointing`; None means no fine-grained checkpointing.
        self._activation_checkpoint_fn = None

        # Dropout.
        self.dropout = Dropout(config.residual_dropout)

        # Layer norms.
        self.k_norm: Optional[LayerNormBase] = None
        self.q_norm: Optional[LayerNormBase] = None
        if config.attention_layer_norm:
            self.k_norm = LayerNormBase.build(
                config,
                size=(config.d_model // config.n_heads) * config.effective_n_kv_heads,
                elementwise_affine=config.attention_layer_norm_with_affine,
            )
            self.q_norm = LayerNormBase.build(config, elementwise_affine=config.attention_layer_norm_with_affine)

        # Activation function.
        self.act = Activation.build(config)
        # The activation may shrink the hidden dim (e.g. SwiGLU's multiplier is
        # 0.5); the resulting width must still be an integer.
        assert (self.act.output_multiplier * self.hidden_size) % 1 == 0

        # Attention output projection.
        self.attn_out = nn.Linear(
            config.d_model, config.d_model, bias=config.include_bias, device=config.init_device
        )

        # Feed-forward output projection.
        self.ff_out = nn.Linear(
            int(self.act.output_multiplier * self.hidden_size),
            config.d_model,
            bias=config.include_bias,
            device=config.init_device,
        )
        self.ff_out._is_residual = True  # type: ignore

        # Rotary embeddings.
        if self.config.rope:
            self.rotary_emb = RotaryEmbedding(config, self.__cache)

        # Optional flash-attention kernel; stays None (torch SDPA fallback) when
        # the `flash_attn` package is not installed.
        self.flash_attn_func = None
        if config.flash_attention:
            try:
                from flash_attn import flash_attn_func  # type: ignore

                self.flash_attn_func = flash_attn_func
            except ModuleNotFoundError:
                pass

    def reset_parameters(self):
        """Re-initialize this block's norms and output projection weights."""
        if self.k_norm is not None:
            self.k_norm.reset_parameters()
        if self.q_norm is not None:
            self.q_norm.reset_parameters()
        init_weights(
            self.config,
            self.attn_out,
            d=self.config.d_model,
            layer_id=self.layer_id,
            type_of_module=ModuleType.out_module,
        )
        init_weights(
            self.config,
            self.ff_out,
            d=self.ff_out.in_features,
            layer_id=self.layer_id,
            type_of_module=ModuleType.out_module,
        )

    def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointingStrategy]):
        # Only fine-grained checkpointing is handled inside the block; coarser
        # strategies are applied by LLaDABlockGroup / the model.
        if strategy == ActivationCheckpointingStrategy.fine_grained:
            self._activation_checkpoint_fn = activation_checkpoint_function(self.config)
        else:
            self._activation_checkpoint_fn = None

    @classmethod
    def _cast_attn_bias(cls, bias: torch.Tensor, input_dtype: torch.dtype) -> torch.Tensor:
        """Cast an attention bias to the autocast (or input) dtype, keeping it finite."""
        target_dtype = input_dtype
        # NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the separate function
        # `is_autocast_cpu_enabled()` for CPU autocast.
        # See https://github.com/pytorch/pytorch/issues/110966.
        if bias.device.type == "cuda" and torch.is_autocast_enabled():
            target_dtype = torch.get_autocast_gpu_dtype()
        elif bias.device.type == "cpu" and torch.is_autocast_cpu_enabled():
            target_dtype = torch.get_autocast_cpu_dtype()
        if bias.dtype != target_dtype:
            bias = bias.to(target_dtype)
            # Down-casting can turn large negative values into -inf, which would
            # produce NaNs in softmax; clamp them back to the dtype minimum.
            ensure_finite_(bias, check_neg_inf=True, check_pos_inf=False)
        return bias

    def _scaled_dot_product_attention(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
        dropout_p: float = 0.0,
        is_causal: bool = False,
    ) -> torch.Tensor:
        """
        Computes scaled dot product attention on query, key and value tensors, using an optional
        attention mask if passed, and applying dropout if a probability greater than 0.0 is specified.

        Inputs are head-first: ``(B, n_heads, T, head_dim)``. Attention is always
        non-causal here (diffusion-style full attention).
        """
        if self.flash_attn_func is not None and attn_mask is None:
            # flash_attn expects (B, T, n_heads, head_dim); transpose in and out.
            r = self.flash_attn_func(
                q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), dropout_p=dropout_p, causal=False
            )
            return r.transpose(1, 2)
        else:
            # torch's sdpa doesn't support GQA, so we're doing this
            assert k.size(1) == v.size(1)
            num_kv_heads = k.size(1)
            num_q_heads = q.size(1)
            if num_q_heads != num_kv_heads:
                # Expand each kv head to match the query head count (GQA).
                assert num_q_heads % num_kv_heads == 0
                k = k.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
                v = v.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
            # Modify: MDM set causal to False, and with no attn_mask.
            return F.scaled_dot_product_attention(
                q,
                k,
                v,
                attn_mask=attn_mask,
                dropout_p=dropout_p,
                is_causal=False,
            )

    def attention(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        attention_bias: Optional[torch.Tensor] = None,
        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
        """Full attention step: head reshaping, optional KV cache, RoPE, SDPA, output projection.

        :param q, k, v: projections of shape ``(B, T, ...)`` produced by the subclass.
        :param layer_past: optional cached ``(key, value)`` to prepend along the sequence dim.
        :param use_cache: when True, also return the (possibly extended) ``(k, v)`` pair.
        """
        B, T, C = q.size()  # batch size, sequence length, d_model
        dtype = k.dtype

        # Optionally apply layer norm to keys and queries.
        if self.q_norm is not None and self.k_norm is not None:
            q = self.q_norm(q).to(dtype=dtype)
            k = self.k_norm(k).to(dtype=dtype)

        # Move head forward to be next to the batch dim.
        # shape: (B, nh, T, hs)
        q = q.view(B, T, self.config.n_heads, C // self.config.n_heads).transpose(1, 2)
        # shape: (B, n_kv_h, T, hs)
        k = k.view(B, T, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2)
        # shape: (B, n_kv_h, T, hs)
        v = v.view(B, T, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2)

        if layer_past is not None:
            # Prepend cached keys/values along the sequence dimension.
            past_key, past_value = layer_past
            k = torch.cat((past_key, k), dim=-2)
            v = torch.cat((past_value, v), dim=-2)

        present = (k, v) if use_cache else None
        query_len, key_len = q.shape[-2], k.shape[-2]  # could be different if layer_past not None

        if self.config.rope:
            # Apply rotary embeddings.
            q, k = self.rotary_emb(q, k)

        # if attention_bias is not None:
        #     # Resize and cast attention bias.
        #     # The current dtype of the attention bias might not match the dtype that the SDP attn function will
        #     # run in if AMP is enabled, and this can be a problem if some tokens are masked out due to padding
        #     # as down-casting the attention bias to the autocast precision will result in -infs, which will
        #     # cause the SDP attn function to produce NaNs.
        #     attention_bias = self._cast_attn_bias(
        #         attention_bias[:, :, key_len - query_len : key_len, :key_len], dtype
        #     )

        # Get the attention scores.
        # shape: (B, nh, T, hs)
        att = self._scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=attention_bias,
            dropout_p=0.0 if not self.training else self.config.attention_dropout,
            is_causal=False,
        )

        # Re-assemble all head outputs side-by-side.
        att = att.transpose(1, 2).contiguous().view(B, T, C)

        # Apply output projection.
        return self.attn_out(att), present

    @abstractmethod
    def forward(
        self,
        x: torch.Tensor,
        attention_bias: Optional[torch.FloatTensor] = None,
        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
        """Run the block; subclasses implement the attention + MLP wiring."""
        raise NotImplementedError

    @classmethod
    def build(cls, layer_id: int, config: ModelConfig, cache: BufferCache) -> LLaDABlock:
        """Factory: construct the block implementation selected by ``config.block_type``."""
        if config.block_type == BlockType.sequential:
            return LLaDASequentialBlock(layer_id, config, cache)
        elif config.block_type == BlockType.llama:
            return LLaDALlamaBlock(layer_id, config, cache)
        else:
            raise NotImplementedError(f"Unknown block type: '{config.block_type}'")
746
+
747
+
748
class LLaDASequentialBlock(LLaDABlock):
    """
    This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
    (plus another skip connection). Uses a single fused linear for the q/k/v projection.
    """

    def __init__(self, layer_id: int, config: ModelConfig, cache: BufferCache):
        super().__init__(layer_id, config, cache)
        # Layer norms.
        self.attn_norm = LayerNorm.build(config)
        self.ff_norm = LayerNorm.build(config)
        # Attention input projection. Projects x -> (q, k, v) in one matmul.
        head_dim = config.d_model // config.n_heads
        kv_dim = config.effective_n_kv_heads * head_dim
        self.fused_dims = (config.d_model, kv_dim, kv_dim)
        self.att_proj = nn.Linear(
            config.d_model,
            sum(self.fused_dims),
            bias=config.include_bias | config.include_qkv_bias,
            device=config.init_device,
        )
        # Feed-forward input projection.
        self.ff_proj = nn.Linear(
            config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
        )

    def reset_parameters(self):
        super().reset_parameters()
        self.attn_norm.reset_parameters()
        self.ff_norm.reset_parameters()
        # NOTE: the standard deviation for these weights does not depend on the layer.
        for proj in (self.att_proj, self.ff_proj):
            init_weights(
                self.config, proj, d=self.config.d_model, layer_id=None, type_of_module=ModuleType.in_module
            )

    def forward(
        self,
        x: torch.Tensor,
        attention_bias: Optional[torch.Tensor] = None,
        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
        """Attention + MLP with residuals; ``checkpoint`` wraps sub-calls when fine-grained
        activation checkpointing is enabled.

        q/k/v shapes after the fused projection split:
          - regular attn: each (batch_size, seq_len, d_model)
          - multi-query attn: k, v are (batch_size, seq_len, d_model // n_heads)
          - grouped-query attn: k, v are (batch_size, seq_len, d_model // n_kv_heads)
        """
        checkpoint = self._activation_checkpoint_fn

        # Fused q/k/v projection of the attention-normalized input.
        normed = checkpoint(self.attn_norm, x) if checkpoint is not None else self.attn_norm(x)
        q, k, v = self.att_proj(normed).split(self.fused_dims, dim=-1)

        # Self-attention (optionally checkpointed).
        if checkpoint is not None:
            att, cache = checkpoint(  # type: ignore
                self.attention, q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache
            )
        else:
            att, cache = self.attention(q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache)

        # Residual connection around attention.
        # shape: (B, T, C)
        x = x + self.dropout(att)

        # Feed-forward with its own residual connection.
        # shape: (batch_size, seq_len, d_model)
        residual = x
        h = checkpoint(self.ff_norm, x) if checkpoint is not None else self.ff_norm(x)  # type: ignore
        h = self.ff_proj(h)
        h = checkpoint(self.act, h) if checkpoint is not None else self.act(h)  # type: ignore
        h = self.dropout(self.ff_out(h))
        return residual + h, cache
836
+
837
+
838
class LLaDALlamaBlock(LLaDABlock):
    """
    A transformer block computing ``MLP(LN(x + Attention(LN(x))))`` (plus another
    skip connection). Similar to `LLaDASequentialBlock` but with separate q/k/v
    projections and a gated (SwiGLU-style) feed-forward, imitating Llama.
    """

    def __init__(self, layer_id: int, config: ModelConfig, cache: BufferCache):
        super().__init__(layer_id, config, cache)
        # Layer norms.
        self.attn_norm = LayerNorm.build(config)
        self.ff_norm = LayerNorm.build(config)
        self.__cache = cache

        # Attention input projections: x -> q, k, v (separate linears).
        head_dim = config.d_model // config.n_heads
        kv_dim = config.effective_n_kv_heads * head_dim
        qkv_bias = config.include_bias | config.include_qkv_bias
        self.q_proj = nn.Linear(config.d_model, config.d_model, bias=qkv_bias, device=config.init_device)
        self.k_proj = nn.Linear(config.d_model, kv_dim, bias=qkv_bias, device=config.init_device)
        self.v_proj = nn.Linear(config.d_model, kv_dim, bias=qkv_bias, device=config.init_device)

        # Feed-forward input projections (gate and up branches of the gated MLP).
        self.ff_proj = nn.Linear(
            config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
        )
        self.up_proj = nn.Linear(
            config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
        )

    def reset_parameters(self):
        super().reset_parameters()
        self.attn_norm.reset_parameters()
        self.ff_norm.reset_parameters()
        # NOTE: the standard deviation for these weights does not depend on the layer.
        for proj in (self.q_proj, self.k_proj, self.v_proj, self.ff_proj, self.up_proj):
            init_weights(self.config, proj, d=self.config.d_model, layer_id=None)

    def forward(
        self,
        x: torch.Tensor,
        attention_bias: Optional[torch.Tensor] = None,
        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
        """Attention + gated MLP with residuals.

        q/k/v shapes:
          - regular attn: each (batch_size, seq_len, d_model)
          - multi-query attn: k, v are (batch_size, seq_len, d_model // n_heads)
          - grouped-query attn: k, v are (batch_size, seq_len, d_model // n_kv_heads)
        """
        checkpoint = self._activation_checkpoint_fn

        # q/k/v projections of the attention-normalized input.
        normed = self.attn_norm(x)
        q = self.q_proj(normed)
        k = self.k_proj(normed)
        v = self.v_proj(normed)

        # Self-attention (optionally checkpointed).
        if checkpoint is not None:
            att, cache = checkpoint(  # type: ignore
                self.attention, q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache
            )
        else:
            att, cache = self.attention(q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache)

        # Residual connection around attention.
        # shape: (B, T, C)
        x = x + self.dropout(att)

        # Gated feed-forward with its own residual connection.
        # shape: (batch_size, seq_len, d_model)
        residual = x
        h = checkpoint(self.ff_norm, x) if checkpoint is not None else self.ff_norm(x)  # type: ignore
        gate, up = self.ff_proj(h), self.up_proj(h)
        gate = checkpoint(self.act, gate) if checkpoint is not None else self.act(gate)  # type: ignore
        h = self.dropout(self.ff_out(gate * up))
        return residual + h, cache
938
+
939
+
940
class LLaDAOutput(NamedTuple):
    """Structured result of a :class:`LLaDAModel` forward pass."""

    logits: torch.FloatTensor
    """
    A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities
    for the next token *before* normalization via (log) softmax.
    """

    attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]]
    """
    Attention keys and values from each block.
    """

    hidden_states: Optional[Tuple[torch.Tensor]]
    """
    Hidden states from each block.
    """
956
+
957
+
958
class LLaDAGenerateOutput(NamedTuple):
    """Result of beam-search generation."""

    token_ids: torch.LongTensor
    """
    The generated token IDs, a tensor of shape `(batch_size, beam_size, max_steps)`.
    These do *not* include the original input IDs.
    """

    scores: torch.FloatTensor
    """
    The scores of the generated sequences, a tensor of shape `(batch_size, beam_size)`.
    """
969
+
970
+
971
class LLaDABlockGroup(nn.ModuleList):
    """A group of consecutive transformer blocks, executed in sequence.

    ``layer_offset`` is the global index of this group's first block; it is used
    to decide which blocks get activation checkpointing under periodic strategies.
    """

    def __init__(self, config: ModelConfig, layer_offset: int, modules: Optional[Iterable[nn.Module]] = None):
        super().__init__(modules)
        self.config = config
        self.layer_offset = layer_offset
        self.activation_checkpointing_strategy: Optional[ActivationCheckpointingStrategy] = None
        self._activation_checkpoint_fn = activation_checkpoint_function(self.config)

    def _should_checkpoint(self, global_block_idx: int) -> bool:
        # Whether the current strategy checkpoints the block at this global index.
        strategy = self.activation_checkpointing_strategy
        if strategy == ActivationCheckpointingStrategy.whole_layer:
            return True
        if strategy == ActivationCheckpointingStrategy.one_in_two:
            return global_block_idx % 2 == 0
        if strategy == ActivationCheckpointingStrategy.one_in_three:
            return global_block_idx % 3 == 0
        if strategy == ActivationCheckpointingStrategy.one_in_four:
            return global_block_idx % 4 == 0
        return False

    def forward(
        self,
        x: torch.Tensor,
        attention_bias: Optional[torch.FloatTensor] = None,
        layers_past: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[List[Tuple[torch.Tensor, torch.Tensor]]]]:
        """Run every block in order, threading ``x`` through and collecting KV caches."""
        attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None
        for local_idx, block in enumerate(self):
            layer_past = None if layers_past is None else layers_past[local_idx]
            if self._should_checkpoint(local_idx + self.layer_offset):
                # shape: (batch_size, seq_len, d_model)
                x, cache = self._activation_checkpoint_fn(  # type: ignore
                    block, x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache
                )
            else:
                # shape: (batch_size, seq_len, d_model)
                x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)
            if attn_key_values is not None:
                assert cache is not None
                attn_key_values.append(cache)
        return x, attn_key_values

    def reset_parameters(self):
        for block in self:
            block.reset_parameters()

    def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointingStrategy]):
        self.activation_checkpointing_strategy = strategy
        for block in self:
            block.set_activation_checkpointing(strategy)
1025
+
1026
+
1027
+ class LLaDAModel(nn.Module):
1028
    def __init__(self, config: ModelConfig, init_params: bool = True):
        """Build the LLaDA transformer described by ``config``.

        :param config: model hyperparameters.
        :param init_params: when True (and not on the meta device), initialize
            weights eagerly via :meth:`reset_parameters`.
        """
        super().__init__()
        self.config = config
        self.__cache = BufferCache()

        # Validate config.
        if self.config.alibi and self.config.flash_attention:
            raise Exception("ALiBi is currently not supported with FlashAttention")

        if self.config.alibi and self.config.rope:
            raise Exception("ALiBi and RoPE are mutually exclusive")

        if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size:
            if self.config.embedding_size < self.config.vocab_size:
                raise Exception("embedding size should be at least as big as vocab size")
            elif self.config.embedding_size % 128 != 0:
                import warnings

                warnings.warn(
                    "Embedding size is not a multiple of 128! This could hurt throughput performance.", UserWarning
                )

        self.activation_checkpointing_strategy: Optional[ActivationCheckpointingStrategy] = None
        self._activation_checkpoint_fn: Callable = activation_checkpoint_function(self.config)

        if not (
            0 < self.config.block_group_size <= self.config.n_layers
            and self.config.n_layers % self.config.block_group_size == 0
        ):
            raise Exception("n layers must be divisible by block group size")

        torch.backends.cuda.enable_flash_sdp(True)
        torch.backends.cuda.enable_mem_efficient_sdp(False)  # this is super slow so make sure torch won't use it

        # Core modules: token embedding, embedding dropout, final layer norm.
        self.transformer = nn.ModuleDict(
            dict(
                wte=nn.Embedding(
                    config.embedding_size or config.vocab_size, config.d_model, device=config.init_device
                ),
                emb_drop=Dropout(config.embedding_dropout),
                ln_f=LayerNorm.build(config),
            )
        )

        # Transformer blocks, optionally wrapped into fixed-size groups
        # (useful for grouped FSDP wrapping / activation checkpointing).
        blocks = [LLaDABlock.build(i, config, self.__cache) for i in range(config.n_layers)]
        if self.config.block_group_size > 1:
            block_groups = [
                LLaDABlockGroup(config, i, blocks[i : i + config.block_group_size])
                for i in range(0, config.n_layers, config.block_group_size)
            ]
            self.transformer.update({"block_groups": nn.ModuleList(block_groups)})
        else:
            self.transformer.update({"blocks": nn.ModuleList(blocks)})

        # Learned absolute position embeddings only when neither ALiBi nor RoPE
        # supplies positional information.
        if not (self.config.alibi or self.config.rope):
            self.transformer.update(
                {"wpe": nn.Embedding(config.max_sequence_length, config.d_model, device=config.init_device)}
            )
        # Untied output head (with weight tying the wte embedding is reused instead).
        if not config.weight_tying:
            self.transformer.update(
                {
                    "ff_out": nn.Linear(
                        config.d_model,
                        config.embedding_size or config.vocab_size,
                        bias=config.include_bias,
                        device=config.init_device,
                    )
                }
            )
        # When `init_device="meta"` FSDP will call `reset_parameters()` to initialize weights.
        if init_params and self.config.init_device != "meta":
            self.reset_parameters()
        self.__num_fwd_flops: Optional[int] = None

        # Warm up cache.
        if self.config.alibi:
            get_causal_attention_bias(self.__cache, config.max_sequence_length, _non_meta_init_device(config))
            self.get_alibi_attention_bias(config.max_sequence_length, _non_meta_init_device(config))
1106
+
1107
+ def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointingStrategy]):
1108
+ self.activation_checkpointing_strategy = strategy
1109
+ if self.config.block_group_size != 1:
1110
+ for block_group in self.transformer.block_groups:
1111
+ block_group.set_activation_checkpointing(strategy)
1112
+ else:
1113
+ for block in self.transformer.blocks:
1114
+ block.set_activation_checkpointing(strategy)
1115
+
1116
+ @property
1117
+ def device(self) -> torch.device:
1118
+ device: torch.device = self.transformer.wte.weight.device # type: ignore
1119
+ if device.type == "meta":
1120
+ return _non_meta_init_device(self.config)
1121
+ else:
1122
+ return device
1123
+
1124
    def reset_parameters(self):
        """Initialize all model weights: embeddings, final norm, output head, and every block."""
        log.info("Initializing model parameters...")
        # Top-level embeddings / linear layers.
        init_weights(
            self.config,
            self.transformer.wte,  # type: ignore
            # Larger embedding std when logits are scaled at the output.
            std_factor=(0.5 * math.sqrt(self.config.d_model)) if self.config.scale_logits else 1.0,
            type_of_module=ModuleType.emb,
        )
        if hasattr(self.transformer, "wpe"):
            init_weights(self.config, self.transformer.wpe, type_of_module=ModuleType.emb)  # type: ignore

        # Top-level layer norm.
        self.transformer.ln_f.reset_parameters()  # type: ignore

        # Output weights (only present when weight tying is disabled).
        if hasattr(self.transformer, "ff_out"):
            init_weights(self.config, self.transformer.ff_out, type_of_module=ModuleType.final_out)  # type: ignore

        # Let the blocks handle themselves.
        if self.config.block_group_size == 1:
            for block in self.transformer.blocks:
                block.reset_parameters()
        else:
            for block_group in self.transformer.block_groups:
                block_group.reset_parameters()
1150
+
1151
    def get_alibi_attention_bias(self, seq_len: int, device: torch.device) -> torch.Tensor:
        """Return a cached (or freshly built) ALiBi attention bias covering `seq_len` positions.

        The cached tensor may be longer than `seq_len`; callers slice it to size.
        Note: `self.__cache` is name-mangled, so this attribute is private to the
        enclosing class.
        """
        # Reuse the cached bias when it already covers the requested length.
        if (alibi_bias := self.__cache.get("alibi_attention_bias")) is not None and alibi_bias.shape[
            -1
        ] >= seq_len:
            if alibi_bias.device != device:
                # Migrate the cached tensor (e.g. CPU -> GPU) and keep the moved copy.
                alibi_bias = alibi_bias.to(device)
                self.__cache["alibi_attention_bias"] = alibi_bias
            return alibi_bias
        # Build outside autocast so the bias is computed in full precision.
        with torch.autocast(device.type, enabled=False):
            alibi_bias = alibi_attention_bias(seq_len, self.config, device)
        self.__cache["alibi_attention_bias"] = alibi_bias
        return alibi_bias
1163
+
1164
    def forward(
        self,
        input_ids: torch.LongTensor,
        input_embeddings: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        attention_bias: Optional[torch.Tensor] = None,
        past_key_values: Optional[Sequence[Tuple[torch.Tensor, torch.Tensor]]] = None,
        use_cache: bool = False,
        update_kvcache: bool = False,
        last_logits_only: bool = False,
        output_hidden_states: Optional[bool] = None,
    ) -> LLaDAOutput:
        """
        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
        :param input_embeddings: A tensor of shape `(batch_size, seq_len, d_model)` with input
            embeddings. When provided, it is treated as the output of the input embedding layer.
        :param attention_mask: A tensor of shape `(batch_size, seq_len)` that indicates
            which input IDs are masked. A `1` value in the mask means that
            the corresponding input ID should *not* be ignored. A `0` means
            that the corresponding input ID is masked.

            This has the same meaning as the `attention_mask` in HuggingFace's `transformers`
            library.
        :param attention_bias: A tensor of shape `(batch_size, 1, seq_len, seq_len)`,
            `(1, 1, seq_len, seq_len)`, or `(seq_len, seq_len)`. This is used
            to introduce causal or other biases.

            If the tensor is a bool or byte tensor, a `True` or `1` at `attention_bias[:, :, i, j]`
            indicates that the i-th element in the sequence is allowed to attend to the j-th
            element in the sequence.

            If the tensor is a float tensor, it will just be added to the attention
            scores before the softmax.

            The default is causal, which corresponds to a lower-diagonal byte matrix of ones.
        :param past_key_values: Pre-computed keys and values for each attention block.
            Can be used to speed up sequential decoding. The `input_ids` which have
            their past given to this model should not be passed as `input_ids` as they have already been computed.
        :param use_cache: If `True`, return key and value tensors for each block.
        :param update_kvcache: Doubles as an integer prefix length: when truthy, each returned
            layer cache is truncated to the first `update_kvcache` positions (presumably the
            sequence dimension of the cache tensors — confirm against the block's cache layout).
            When falsy while `use_cache` is set, the incoming `past_key_values` are returned
            unchanged instead of the freshly computed caches.
        :param last_logits_only: If `True`, only compute the logits for the last token of each sequence.
            This can speed up decoding when you only care about the next token.
        :param output_hidden_states: If `True`, also collect the input to every block plus the
            post-final-layer-norm output and return them as `hidden_states`.
        """
        # Basic MDM model config check: this encoder is bidirectional, ALiBi is unsupported
        # and RoPE is required.
        # print(input_ids.dtype)
        assert not self.config.alibi, "Alibi length extrapolation is not supported for MDM."
        assert self.config.rope, "Rope must be used in Llama-Encoder for MDM."
        # assert (past_key_values is None and not use_cache), "The kvcache is not supported for MDM."

        output_hidden_states = output_hidden_states if output_hidden_states is not None else False

        if past_key_values:
            assert len(past_key_values) == self.config.n_layers

        batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2]
        if past_key_values is None:
            past_length = 0
        else:
            # Length already covered by the cache (keys are (..., past_len, head_dim)).
            past_length = past_key_values[0][0].size(-2)

        # Get embeddings of input.
        # shape: (batch_size, seq_len, d_model)
        x = self.transformer.wte(input_ids) if input_embeddings is None else input_embeddings  # type: ignore

        if self.config.input_emb_norm:
            # Scale embeddings by sqrt(d_model), as in the original Transformer.
            x = x * (self.config.d_model**0.5)

        if not (self.config.alibi or self.config.rope):
            # Get positional embeddings.
            # shape: (1, seq_len)
            pos = torch.arange(past_length, past_length + seq_len, dtype=torch.long, device=x.device).unsqueeze(0)
            # shape: (1, seq_len, d_model)
            pos_emb = self.transformer.wpe(pos)  # type: ignore
            x = pos_emb + x

        # Add input + positional embeddings and apply dropout.
        # shape: (batch_size, seq_len, d_model)
        x = self.transformer.emb_drop(x)  # type: ignore

        # Transform the attention mask into what the blocks expect.
        if attention_mask is not None and 0.0 in attention_mask:
            # shape: (batch_size, 1, 1, seq_len)
            attention_mask = attention_mask.to(dtype=torch.float).view(batch_size, -1)[:, None, None, :]
            attention_mask = (1.0 - attention_mask) * torch.finfo(attention_mask.dtype).min
        else:
            # An all-ones mask is a no-op; drop it so the bias merging below is skipped.
            attention_mask = None

        # Merge attention mask with attention bias.
        if (
            attention_bias is not None
            or attention_mask is not None
            or self.config.alibi
            # NOTE (epwalsh): we need to initialize the attn bias in order for attn to work properly
            # with key+value cache. Otherwise `F.scaled_dot_product_attention()` doesn't seem to compute
            # scores correctly.
            or past_key_values is not None
        ):
            if attention_bias is None and self.config.alibi:
                attention_bias = get_causal_attention_bias(
                    self.__cache, past_length + seq_len, x.device
                ) + self.get_alibi_attention_bias(past_length + seq_len, x.device)
            elif attention_bias is None:
                attention_bias = get_causal_attention_bias(self.__cache, past_length + seq_len, x.device)
            elif attention_bias.dtype in (torch.int8, torch.bool):
                # Convert boolean "may attend" masks into additive float biases.
                attention_bias = attention_bias.to(dtype=torch.float)
                attention_bias.masked_fill_(attention_bias == 0.0, torch.finfo(attention_bias.dtype).min)

            # Transform to the right shape and data type.
            mask_len = seq_len
            if attention_mask is not None:
                mask_len = attention_mask.shape[-1]
            elif past_key_values is not None:
                mask_len = past_key_values[0][0].shape[-2] + seq_len
            attention_bias = attention_bias[:, :, :mask_len, :mask_len].to(dtype=torch.float)

            # Add in the masking bias.
            if attention_mask is not None:
                attention_bias = attention_bias + attention_mask
                # Might get -infs after adding attention mask, since dtype.min + dtype.min = -inf.
                # `F.scaled_dot_product_attention()` doesn't handle -inf like you'd expect, instead
                # it can produce NaNs.
                ensure_finite_(attention_bias, check_neg_inf=True, check_pos_inf=False)

        attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None

        # decoder layers
        all_hidden_states = []

        # Apply blocks one-by-one.
        if self.config.block_group_size == 1:
            for block_idx, block in enumerate(self.transformer.blocks):
                if output_hidden_states:
                    # add hidden states (input to each block)
                    all_hidden_states.append(x)

                layer_past = None if past_key_values is None else past_key_values[block_idx]
                # Checkpoint every layer, or every 2nd/3rd/4th layer, per the configured strategy.
                if (
                    (self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.whole_layer)
                    or (
                        self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_two
                        and block_idx % 2 == 0
                    )
                    or (
                        self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_three
                        and block_idx % 3 == 0
                    )
                    or (
                        self.activation_checkpointing_strategy == ActivationCheckpointingStrategy.one_in_four
                        and block_idx % 4 == 0
                    )
                ):
                    # shape: (batch_size, seq_len, d_model)
                    x, cache = self._activation_checkpoint_fn(
                        block, x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache
                    )
                else:
                    # shape: (batch_size, seq_len, d_model)
                    x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)
                if attn_key_values is not None:
                    if update_kvcache:
                        # NOTE(review): `update_kvcache` is used as an int prefix length here —
                        # only the first `update_kvcache` positions of the layer cache are kept.
                        cache = (cache[0][:,:,:update_kvcache],cache[1][:,:,:update_kvcache,:])
                        # print("True")
                    attn_key_values.append(cache)
        else:
            for group_idx, block_group in enumerate(self.transformer.block_groups):
                if output_hidden_states:
                    # add hidden states (input to each block group)
                    all_hidden_states.append(x)

                layers_past = (
                    None
                    if past_key_values is None
                    else past_key_values[
                        group_idx * self.config.block_group_size : (group_idx + 1) * self.config.block_group_size
                    ]
                )
                x, cache = block_group(
                    x, attention_bias=attention_bias, layers_past=layers_past, use_cache=use_cache
                )
                if attn_key_values is not None:
                    assert cache is not None
                    attn_key_values.extend(cache)

        if last_logits_only:
            # shape: (batch_size, 1, d_model)
            x = x[:, -1, :].unsqueeze(1)

        # Apply final layer norm.
        # shape: (batch_size, seq_len or 1, d_model)
        x = self.transformer.ln_f(x)  # type: ignore
        if output_hidden_states:
            # add final hidden state post-final-layernorm, following HuggingFace's convention
            all_hidden_states.append(x)

        # Get logits.
        # shape: (batch_size, seq_len or 1, vocab_size)
        if self.config.weight_tying:
            logits = F.linear(x, self.transformer.wte.weight, None)  # type: ignore
        else:
            logits = self.transformer.ff_out(x)  # type: ignore
        if self.config.scale_logits:
            logits.mul_(1 / math.sqrt(self.config.d_model))
        # Caching was requested but no cache update: hand back the caller's caches unchanged.
        if use_cache == True and update_kvcache == False:
            attn_key_values=past_key_values
        return LLaDAOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None)  # type: ignore[arg-type]
1369
+
1370
+
1371
def create_model_config_from_pretrained_config(config: LLaDAConfig):
    """Build a `ModelConfig` by copying every `ModelConfig` field off the HF config.

    Assumes the pretrained `LLaDAConfig` carries an attribute for each
    dataclass field of `ModelConfig`.
    """
    field_values = {f.name: getattr(config, f.name) for f in fields(ModelConfig)}
    return ModelConfig(**field_values)
1382
+
1383
+
1384
class LLaDAModelLM(PreTrainedModel):
    """
    Extremely barebones HF model wrapper.

    Delegates all computation to an inner `LLaDAModel` (`self.model`) and adapts
    its inputs/outputs to the HuggingFace `PreTrainedModel` interface.
    """

    config_class = LLaDAConfig
    base_model_prefix = "model"
    # Module classes that must not be split across devices/shards.
    _no_split_modules = ["LLaDABlock", "LLaDASequentialBlock", "LLaDALlamaBlock"]

    def __init__(self, config: LLaDAConfig, model: Optional[LLaDAModel] = None, init_params: bool = False):
        """Wrap an existing `LLaDAModel`, or build one from `config` when none is given."""
        super().__init__(config)

        if not model:
            model_config = create_model_config_from_pretrained_config(config)
            # Initialize model (always on CPU to start with so we don't run out of GPU memory).
            model_config.init_device = "cpu"
            self.model = LLaDAModel(model_config, init_params=init_params)
        else:
            self.model = model

    def forward(
        self,
        # NOTE(review): default None contradicts the non-Optional annotation; callers
        # are expected to always pass `input_ids` or `inputs_embeds`.
        input_ids: torch.LongTensor = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        attention_bias: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        update_kvcache: Optional[bool] = False,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[Cache] = None,  # This is a hack mitigation of an issue in transformers `4.39.x`
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """HF-style forward that delegates to `LLaDAModel.forward`.

        No loss is computed even when `labels` is given (a warning is emitted);
        `output_attentions` is unsupported and raises.
        """
        if use_cache is None:
            use_cache = self.config.use_cache

        if output_attentions:
            raise ValueError("output_attentions is not yet supported in LLaDA")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model.forward(
            input_ids=input_ids,
            input_embeddings=inputs_embeds,
            attention_mask=attention_mask,
            attention_bias=attention_bias,
            past_key_values=past_key_values,
            use_cache=use_cache,
            update_kvcache=update_kvcache,
            output_hidden_states=output_hidden_states,
        )

        logits = outputs.logits
        hidden_states = outputs.hidden_states

        loss = None
        if labels is not None:
            import warnings
            warnings.warn("Note that for LLaDA, you cannot calculate the loss here.", UserWarning)
        if not return_dict:
            # Legacy tuple return path.
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            logits=logits,
            past_key_values=outputs.attn_key_values,
            hidden_states=hidden_states,
        )

    def can_generate(self) -> bool:
        # Opt in to HuggingFace's `generate()` capability check.
        return True

    def prepare_inputs_for_generation(
        self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple]] = None, **kwargs
    ):
        if past_key_values:
            # This is because we want the model to only process the last generated token.
            input_ids = input_ids[:, -1:]
        model_inputs = {"input_ids": input_ids, "past_key_values": past_key_values}

        model_inputs.update(kwargs)
        # Guarantee `use_cache` is always present, defaulting to the config value.
        model_inputs["use_cache"] = kwargs.pop("use_cache", self.config.use_cache)
        return model_inputs

    # TODO: these are required to make the implementation complete.
    # def resize_position_embeddings(self, new_num_position_embeddings: int):
    #     pass
    #
    # def get_position_embeddings(self) -> Union[nn.Embedding, Tuple[nn.Embedding]]:
    #     pass
    #
    # def _reorder_cache(self, past_key_values, beam_idx):
    #     pass

    def get_input_embeddings(self) -> torch.nn.Module:
        return self.model.transformer.wte

    def set_input_embeddings(self, value: torch.nn.Module):
        self.model.transformer.wte = value

    def get_output_embeddings(self):
        # With tied weights the input embedding doubles as the output projection.
        if self.config.weight_tying:
            return self.model.transformer.wte
        else:
            return self.model.transformer.ff_out

    def set_output_embeddings(self, value: torch.nn.Module):
        if self.config.weight_tying:
            self.model.transformer.wte = value
        else:
            self.model.transformer.ff_out = value

    def tie_weights(self):
        # Under weight tying, point the output head at the input embedding module.
        if self.config.weight_tying:
            self.model.transformer.ff_out = self.model.transformer.wte
1502
+
1503
# Register the model so that it is available for transformer pipelines, auto-loading, etc.
# Maps LLaDAConfig -> LLaDAModelLM for `AutoModel.from_pretrained(...)`.
AutoModel.register(LLaDAConfig, LLaDAModelLM)
postprocess_code.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+ # Modified from Dream repos: https://github.com/HKUNLP/Dream
17
+
18
+ import evaluate as hf_evaluate
19
+ import os
20
+ import sys
21
+ from sanitize import sanitize
22
+
23
# Allow HF's `code_eval` metric to execute the (untrusted) generated code.
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
pass_at_k = hf_evaluate.load("code_eval")

import json


def pass_at_1(references, predictions):
    """pass@1 for one batch of references/candidate lists via HF `code_eval`."""
    scores = pass_at_k.compute(
        references=references,
        predictions=predictions,
        k=[1],
    )
    return scores[0]["pass@1"]


def read_jsonl(file_path):
    """Load a JSONL file into a list of parsed objects."""
    with open(file_path, 'r') as file:
        return [json.loads(line) for line in file]


def write_jsonl(data, file_path):
    """Write a list of objects to `file_path`, one JSON object per line."""
    with open(file_path, 'w') as file:
        file.writelines(json.dumps(item) + '\n' for item in data)


file_path = sys.argv[1]
data = read_jsonl(file_path)

references = [sample['target'] for sample in data]

# Strip the markdown code fence from each response, prepend the task prompt,
# and sanitize down to the entry point's dependency closure.
predictions = [
    [
        sanitize(
            sample['doc']['prompt'] + "\n" + sample['resps'][0][0].split('```python\n', 1)[-1].split('```')[0],
            sample['doc']["entry_point"],
        )
    ]
    for sample in data
]

pass_at_1s = [pass_at_1([ref], [pred]) for ref, pred in zip(references, predictions)]
print(sum(pass_at_1s) / len(pass_at_1s))

cleaned = [
    {"task_id": sample['doc']['task_id'], "completion": pred, "pass_at_1": score}
    for sample, pred, score in zip(data, predictions, pass_at_1s)
]
write_jsonl(cleaned, file_path + '.cleaned')
sanitize.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+ # Modified from Dream repos: https://github.com/HKUNLP/Dream
17
+
18
+ """Post-processing LLM-generated Python code implemented using tree-sitter."""
19
+
20
+ import os
21
+ import sys
22
+ import pathlib
23
+
24
# Make this file's parent and grandparent directories importable so sibling
# modules resolve when the script is run directly.
ROOT = os.path.dirname(os.path.abspath(__file__))
sys.path.extend([os.path.dirname(ROOT), os.path.dirname(os.path.dirname(ROOT))])
26
+
27
+ import ast
28
+ import traceback
29
+
30
+ from typing import Dict, List, Optional, Set, Tuple
31
+
32
def refine_text(text: str) -> str:
    """Normalize whitespace: spaces for tabs, LF endings, single trailing newline."""
    normalized = text.replace("\t", " ").replace("\r\n", "\n").replace("\r", "\n")
    return normalized.strip() + "\n"
36
+
37
def syntax_check(code, verbose = False):
    """Return True iff `code` parses as valid Python source."""
    try:
        ast.parse(code)
    except (SyntaxError, MemoryError):
        # Optionally surface the parse failure for debugging.
        if verbose:
            traceback.print_exc()
        return False
    return True
45
+
46
def extract_longest_valid_code(text: str) -> str:
    """Return the contiguous span of lines that parses as Python and contains
    the most non-blank lines.

    The scan is quadratic in the number of lines, so input is capped at 100
    lines to bound the cost on pathological outputs.
    """
    lines = text.splitlines()

    if len(lines) > 100:
        lines = lines[:100]

    best_count = 0
    best_snippet = ""
    for start in range(len(lines)):
        for end in range(start, len(lines)):
            snippet = "\n".join(lines[start:end + 1])
            if not syntax_check(snippet):
                continue
            non_blank = sum(1 for ln in lines[start:end + 1] if ln.strip())
            if non_blank > best_count:
                best_count = non_blank
                best_snippet = snippet

    return best_snippet
64
+
65
def get_deps(nodes: List[Tuple[str, ast.AST]]) -> Dict[str, Set[str]]:
    """Map each definition name to the identifiers its AST subtree references.

    Traversal deliberately stops at `Name`/`Attribute` nodes: it records the
    identifier (or attribute name) but does not descend into their children.
    """
    name2deps = {}
    for name, node in nodes:
        referenced = set()
        pending = [node]
        while pending:
            current = pending.pop()
            for child in ast.iter_child_nodes(current):
                if isinstance(child, ast.Name):
                    referenced.add(child.id)
                elif isinstance(child, ast.Attribute):
                    referenced.add(child.attr)
                else:
                    pending.append(child)
        name2deps[name] = referenced
    return name2deps
81
+
82
def get_function_dependency(entrypoint: str, call_graph: Dict[str, Set[str]]) -> Set[str]:
    """Collect every name transitively reachable from `entrypoint` in `call_graph`.

    Breadth-first walk; names absent from the graph contribute no successors.
    """
    visited = set()
    frontier = [entrypoint]
    while frontier:
        name = frontier.pop(0)
        if name in visited:
            continue
        visited.add(name)
        frontier.extend(call_graph.get(name, set()) - visited)
    return visited
93
+
94
def get_definition_name(node: ast.AST) -> Optional[str]:
    """Return the name bound by a def/class, or by a simple single-name assignment.

    Tuple targets and other assignment shapes yield None.
    """
    if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
        return node.name
    if isinstance(node, ast.Assign) and node.targets and isinstance(node.targets[0], ast.Name):
        return node.targets[0].id
    return None
102
+
103
def has_return_statement(node: ast.AST) -> bool:
    """True iff the subtree rooted at `node` contains a `return` statement."""
    for sub in ast.walk(node):
        if isinstance(sub, ast.Return):
            return True
    return False
105
+
106
def sanitize(text: str, entrypoint: Optional[str] = None) -> str:
    """Reduce raw LLM output to imports plus top-level definitions.

    The text is whitespace-normalized, the longest syntactically valid line
    span is extracted, and the surviving module body is filtered: imports are
    always kept; classes, functions-with-a-return, and single-name assignments
    are kept — restricted to `entrypoint`'s transitive dependency closure when
    an entrypoint is given.
    """
    code = extract_longest_valid_code(refine_text(text))
    tree = ast.parse(code)

    imports = []
    definitions = {}
    for node in tree.body:
        if isinstance(node, (ast.Import, ast.ImportFrom)):
            imports.append(node)
        elif isinstance(node, ast.ClassDef):
            definitions[node.name] = ('class', node)
        elif isinstance(node, ast.FunctionDef):
            # Functions without a return statement are assumed to be noise.
            if has_return_statement(node):
                definitions[node.name] = ('function', node)
        elif isinstance(node, ast.Assign):
            name = get_definition_name(node)
            if name:
                definitions[name] = ('variable', node)

    reachable = None
    if entrypoint:
        name2deps = get_deps([(name, node) for name, (_, node) in definitions.items()])
        reachable = get_function_dependency(entrypoint, name2deps)

    pieces = [ast.unparse(node) for node in imports]
    for name, (_, node) in definitions.items():
        if not entrypoint or name in reachable:
            pieces.append(ast.unparse(node))

    return "\n".join(pieces)