dream_rcr

@@ -1,5 +1,18 @@
 # coding=utf-8
-# Copyright 2024 ...
 import warnings
 import copy
 from dataclasses import dataclass
@@ -9,106 +22,75 @@ import torch
 import torch.distributions as dists
 from torch.nn import functional as F
 from transformers import __version__
-from transformers.generation.configuration_utils import GenerationConfig
-from transformers.utils import ModelOutput, is_torchdynamo_compiling, logging
 logger = logging.get_logger(__name__)
 def top_p_logits(logits, top_p=None):
-    if top_p is None or top_p >= 1:
-        return logits
     sorted_logits, sorted_indices = torch.sort(logits, descending=True)
     cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
     sorted_indices_to_remove = cumulative_probs > top_p
     sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
     sorted_indices_to_remove[..., 0] = 0
     mask = torch.zeros_like(logits, dtype=torch.bool, device=logits.device)
     mask = mask.scatter_(-1, sorted_indices, sorted_indices_to_remove)
-    return logits.masked_fill(mask, torch.finfo(logits.dtype).min)
 def top_k_logits(logits, top_k=None):
-    if top_k is None:
-        return logits
-    top_k = min(int(top_k), logits.size(-1))
-    thresh = torch.topk(logits, top_k)[0][..., -1, None]
-    indices_to_remove = logits < thresh
-    return logits.masked_fill(indices_to_remove, torch.finfo(logits.dtype).min)
-def sample_tokens(
-    logits,
-    temperature=0.0,
-    top_p=None,
-    top_k=None,
-    margin_confidence=False,
-    neg_entropy=False,
-):
-    # temperature
-    if temperature > 0:
-        logits = logits / temperature
-    # filtering
-    logits = top_p_logits(logits, top_p)
-    logits = top_k_logits(logits, top_k)
-    # probs
     probs = torch.softmax(logits, dim=-1)
-    # sample or argmax
     if temperature > 0:
         try:
             x0 = dists.Categorical(probs=probs).sample()
             confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
-        except Exception:
             confidence, x0 = probs.max(dim=-1)
     else:
         confidence, x0 = probs.max(dim=-1)
-    # confidence variants
     if margin_confidence:
         sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
-        top1_probs = sorted_probs[..., 0]
-        top2_probs = sorted_probs[..., 1]
-        confidence = top1_probs - top2_probs
     if neg_entropy:
         epsilon = 1e-10
         log_probs = torch.log(probs + epsilon)
-        # 注意：neg_entropy 越大代表越“确定”
         confidence = -(probs * log_probs).sum(dim=-1)
     return confidence, x0
-def get_num_transfer_tokens_maskgit(mask_index: torch.Tensor, steps: int, mode: str = "linear") -> torch.Tensor:
-    """
-    LLaDA 风格：预计算每一步要“转移（解码）”的 token 数（逐样本），保证总量等于总 mask 数。
-    mask_index: [B, L] bool
-    return: [B, steps] long
-    """
-    device = mask_index.device
-    num_masked_tokens = mask_index.sum(dim=-1, keepdim=True).float()  # [B,1]
-    t = torch.linspace(0, 1, steps + 1, device=device)[1:]  # (steps,)
-    if mode == "linear":
-        ratio = t
-    elif mode == "cosine":
-        ratio = 1 - torch.cos(t * torch.pi / 2)
-    elif mode == "pow2":
-        ratio = t ** 2
-    elif mode == "sqrt":
-        ratio = torch.sqrt(t)
-    else:
-        raise ValueError(f"Unknown mode: {mode}")
-    # 累积配额（四舍五入），再做差得到每步配额
-    cum = (ratio.unsqueeze(0) * num_masked_tokens).round().long()         # [B, steps]
-    per_step = torch.diff(cum, dim=-1, prepend=torch.zeros_like(cum[:, :1]))
-    return per_step  # [B, steps], 每行之和 ≈ num_masked_tokens（四舍五入引入±1 误差）
 @dataclass
 class DreamModelOutput(ModelOutput):
     sequences: torch.LongTensor = None
@@ -125,20 +107,19 @@ class DreamGenerationConfig(GenerationConfig):
         # diffusion specific params
         self.eps: float = kwargs.pop("eps", 1e-3)
         self.steps: int = kwargs.pop("steps", 512)
-        self.alg: str = kwargs.pop("alg", "origin")
         self.alg_temp: Optional[float] = kwargs.pop("alg_temp", None)
         # RCR specific parameters
         self.rcr: bool = kwargs.pop("rcr", False)
-        self.conf_alg: str = kwargs.pop("conf_alg", "maskgit_plus")
-        self.mode: str = kwargs.pop("mode", "linear")  # LLaDA 调度
-        # Output control
         self.num_return_sequences: int = kwargs.pop("num_return_sequences", 1)
         self.return_dict_in_generate: bool = kwargs.pop("return_dict_in_generate", False)
         self.output_history: bool = kwargs.pop("output_history", False)
-        # Special tokens
         self.mask_token_id = kwargs.pop("mask_token_id", None)
         self.pad_token_id = kwargs.pop("pad_token_id", None)
         self.bos_token_id = kwargs.pop("bos_token_id", None)
@@ -147,12 +128,16 @@ class DreamGenerationConfig(GenerationConfig):
         # Wild card
         self.generation_kwargs = kwargs.pop("generation_kwargs", {})
-        # Hub info
         self._from_model_config = kwargs.pop("_from_model_config", False)
         self._commit_hash = kwargs.pop("_commit_hash", None)
         self.transformers_version = kwargs.pop("transformers_version", __version__)
         if not self._from_model_config:
             for key, value in kwargs.items():
                 try:
                     setattr(self, key, value)
@@ -160,19 +145,22 @@ class DreamGenerationConfig(GenerationConfig):
                     logger.error(f"Can't set {key} with value {value} for {self}")
                     raise err
         self.validate(is_init=True)
     def validate(self, is_init=False):
         pass
 class DreamGenerationMixin:
     @staticmethod
     def _expand_inputs_for_generation(
         expand_size: int = 1,
         input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.LongTensor] = None,
     ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
         if expand_size == 1:
             return input_ids, attention_mask
         if input_ids is not None:
@@ -181,47 +169,129 @@ class DreamGenerationMixin:
             attention_mask = attention_mask.repeat_interleave(expand_size, dim=0)
         return input_ids, attention_mask
     def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
         if is_torchdynamo_compiling():
             return
         if has_default_max_length and generation_config.max_new_tokens is None and generation_config.max_length == 20:
             warnings.warn(
-                f"Using the model-agnostic default `max_length` (={generation_config.max_length}); "
-                f"prefer setting `max_new_tokens`.",
                 UserWarning,
             )
         if input_ids_length >= generation_config.max_length:
             raise ValueError(
-                f"Input length is {input_ids_length}, but `max_length` is set to {generation_config.max_length}. "
-                f"Increase `max_length` or set `max_new_tokens`."
             )
-    def _prepare_generated_length(self, generation_config, has_default_max_length, input_ids_length):
         if generation_config.max_new_tokens is not None:
             if not has_default_max_length and generation_config.max_length is not None:
                 logger.warning(
-                    f"Both `max_new_tokens` and `max_length` set. `max_new_tokens` takes precedence."
                 )
             generation_config.max_length = generation_config.max_new_tokens + input_ids_length
         elif has_default_max_length:
             if generation_config.max_length == DreamGenerationConfig().max_length:
                 generation_config.max_length = generation_config.max_length + input_ids_length
                 max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
                 if max_position_embeddings is not None:
                     generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
         return generation_config
     def _prepare_generation_config(
         self, generation_config: Optional[DreamGenerationConfig], **kwargs: Dict
     ) -> DreamGenerationConfig:
         using_model_generation_config = False
         if generation_config is None:
             generation_config = DreamGenerationConfig.from_model_config(self.config)
             using_model_generation_config = True
         if not is_torchdynamo_compiling():
             generation_config = copy.deepcopy(generation_config)
             _kwargs = generation_config.update(**kwargs)
             if not using_model_generation_config:
                 if generation_config.bos_token_id is None:
                     generation_config.bos_token_id = self.generation_config.bos_token_id
@@ -239,9 +309,20 @@ class DreamGenerationMixin:
         generation_config: DreamGenerationConfig,
         device: Optional[Union[torch.device, str]] = None,
     ):
         def _tensor_or_none(token, device=None):
             if token is None:
                 return token
             device = device if device is not None else self.device
             if isinstance(token, torch.Tensor):
                 return token.to(device)
@@ -252,13 +333,19 @@ class DreamGenerationMixin:
         pad_token_tensor = _tensor_or_none(generation_config.pad_token_id, device=device)
         mask_token_tensor = _tensor_or_none(generation_config.mask_token_id, device=device)
         if eos_token_tensor is not None and eos_token_tensor.ndim == 0:
             eos_token_tensor = eos_token_tensor.unsqueeze(0)
         if pad_token_tensor is None and eos_token_tensor is not None:
             pad_token_tensor = eos_token_tensor[0]
             logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{pad_token_tensor} for open-end generation.")
         generation_config._bos_token_tensor = bos_token_tensor
         generation_config._eos_token_tensor = eos_token_tensor
         generation_config._pad_token_tensor = pad_token_tensor
@@ -271,16 +358,19 @@ class DreamGenerationMixin:
         generation_config: Optional[DreamGenerationConfig] = None,
         **kwargs,
     ) -> Union[DreamModelOutput, torch.LongTensor]:
         generation_config = self._prepare_generation_config(generation_config, **kwargs)
         generation_tokens_hook_func = kwargs.pop("generation_tokens_hook_func", lambda step, x, logits: x)
         generation_logits_hook_func = kwargs.pop("generation_logits_hook_func", lambda step, x, logits: logits)
         assert inputs is not None
         input_ids = inputs
         device = input_ids.device
         attention_mask = kwargs.pop("attention_mask", None)
         self._prepare_special_tokens(generation_config, device=device)
         input_ids_length = input_ids.shape[-1]
         has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
         generation_config = self._prepare_generated_length(
@@ -288,23 +378,35 @@ class DreamGenerationMixin:
             has_default_max_length=has_default_max_length,
             input_ids_length=input_ids_length,
         )
-        self._validate_generated_length(generation_config, input_ids_length, has_default_max_length)
         if not is_torchdynamo_compiling() and self.device.type != input_ids.device.type:
             warnings.warn(
-                "You are calling .generate() with input_ids on a different device than the model.",
                 UserWarning,
             )
-        if hasattr(generation_config, "pad_token_id") and torch.any(input_ids == generation_config.pad_token_id) and attention_mask is None:
             warnings.warn(
-                "Padding detected but no attention_mask is passed. For correct results, pass attention_mask.",
                 UserWarning,
             )
         input_ids, attention_mask = self._expand_inputs_for_generation(
             expand_size=generation_config.num_return_sequences,
             input_ids=input_ids,
-            attention_mask=attention_mask,
         )
         result = self._sample(
@@ -312,7 +414,7 @@ class DreamGenerationMixin:
             attention_mask=attention_mask,
             generation_config=generation_config,
             generation_tokens_hook_func=generation_tokens_hook_func,
-            generation_logits_hook_func=generation_logits_hook_func,
         )
         return result
@@ -322,10 +424,9 @@ class DreamGenerationMixin:
         attention_mask: Optional[torch.LongTensor],
         generation_config: DreamGenerationConfig,
         generation_tokens_hook_func,
-        generation_logits_hook_func,
     ) -> Union[DreamModelOutput, torch.LongTensor]:
-        # --- init values ---
         output_history = generation_config.output_history
         return_dict_in_generate = generation_config.return_dict_in_generate
         max_length = generation_config.max_length
@@ -338,20 +439,22 @@ class DreamGenerationMixin:
         top_p = generation_config.top_p
         top_k = generation_config.top_k
-        # RCR specific
         rcr = generation_config.rcr
         conf_alg = generation_config.conf_alg
-        mode = generation_config.mode
         histories = [] if (return_dict_in_generate and output_history) else None
-        # pad to max_length with [MASK]
         x = F.pad(input_ids, (0, max_length - input_ids.shape[1]), value=mask_token_id)
         if attention_mask is not None and torch.any(attention_mask == 0.0):
             attention_mask = F.pad(attention_mask, (0, max_length - attention_mask.shape[1]), value=1.0)
             tok_idx = attention_mask.long().cumsum(-1) - 1
             tok_idx.masked_fill_(attention_mask == 0, 1)
             attention_mask = torch.logical_and(
                 attention_mask.unsqueeze(1).unsqueeze(-2),
                 attention_mask.unsqueeze(1).unsqueeze(-1),
@@ -360,126 +463,75 @@ class DreamGenerationMixin:
             tok_idx = None
             attention_mask = "full"
-        # global linear schedule 1 -> eps
         timesteps = torch.linspace(1, eps, steps + 1, device=x.device)
-        # 初始 mask（用于预分配 per-step token 预算；与 LLaDA 类似）
-        initial_mask_index = (x == mask_token_id)  # [B, L]
-        per_step_tokens = get_num_transfer_tokens_maskgit(initial_mask_index, steps, mode=mode)  # [B, steps]
-        # RCR tracking
         overtime_confidence = torch.zeros_like(x, dtype=torch.float32) if rcr else None
-        # user-defined token control
         x = generation_tokens_hook_func(None, x, None)
         for i in range(steps):
-            # 当前还未确定的 mask 位置
-            mask_index = (x == mask_token_id)  # [B, L]
-            # 模型 logits（单步预测 + 向右对齐）
             logits = self(x, attention_mask, tok_idx).logits
-            logits = torch.cat([logits[:, :1], logits[:, :-1]], dim=1)
-            # user-defined logits control
             logits = generation_logits_hook_func(i, x, logits)
-            # 只取 mask 位置对应的 logits 参与采样
-            mask_logits = logits[mask_index]  # [M, V] (M=mask 个数)
             t = timesteps[i]
             s = timesteps[i + 1]
-            if alg == "origin":
-                # 原 Dream 迁移（保留）
-                p_transfer = 1 - (s / t).item() if i < steps - 1 else 1.0
-                x0 = torch.zeros_like(x[mask_index], device=x.device, dtype=torch.long) + mask_token_id
-                transfer_index_t_s = (torch.rand(*x0.shape, device=x.device) < p_transfer)
-                _, x0[transfer_index_t_s] = sample_tokens(
-                    mask_logits[transfer_index_t_s],
-                    temperature=temperature,
-                    top_p=top_p,
-                    top_k=top_k,
-                )
                 x[mask_index] = x0.clone()
             else:
-                # 选择置信度算法：RCR 时优先 conf_alg；非 RCR 时用 alg 的同名变体
-                choose = conf_alg if rcr else alg
-                if choose == "maskgit_plus":
                     confidence, x0 = sample_tokens(mask_logits, temperature=temperature, top_p=top_p, top_k=top_k)
-                elif choose == "topk_margin":
                     confidence, x0 = sample_tokens(mask_logits, temperature=temperature, top_p=top_p, top_k=top_k, margin_confidence=True)
-                elif choose == "entropy":
-                    confidence, x0 = sample_tokens(mask_logits, temperature=temperature, top_p=top_p, top_k=top_k, neg_entropy=True)
                 else:
-                    raise RuntimeError(f"Unknown alg/conf_alg: {choose}")
-                # 将预测/置信度写回到全长（非 mask 位置用原 token / -inf）
-                full_conf = torch.full_like(x, -torch.inf, device=x.device, dtype=logits.dtype)  # [B, L]
-                x_temp = torch.zeros_like(x, device=x.device, dtype=torch.long) + mask_token_id  # [B, L]
-                x_temp[mask_index] = x0.clone()
-                full_conf[mask_index] = confidence
-                if not rcr:
-                    # ---------- 非 RCR：逐样本的“当步配额” ----------
-                    k_per_row = per_step_tokens[:, i]  # [B]
-                    B = x.size(0)
-                    for j in range(B):
-                        k_j = int(k_per_row[j].item())
-                        if k_j <= 0:
-                            continue
-                        # clamp：不能超过当前样本剩余 mask 数
-                        masked_count_j = mask_index[j].sum().item()
-                        k_j = min(k_j, int(masked_count_j))
-                        if k_j <= 0:
-                            continue
-                        # 只在 mask 内选 topk（full_conf 的非 mask 处已是 -inf）
-                        _, select_idx = torch.topk(full_conf[j], k_j, largest=True)
-                        x[j, select_idx] = x_temp[j, select_idx]
                 else:
-                    # ---------- RCR：LLaDA 风格的“累积选取 + 下一步反遮盖” ----------
-                    B = x.size(0)
-                    for j in range(B):
-                        # 当步+未来的总剩余配额（从第 i 步到最后一步）
-                        total_remaining_tokens = int(per_step_tokens[j, i:].sum().item())
-                        if total_remaining_tokens <= 0:
-                            continue
-                        masked_count_j = mask_index[j].sum().item()
-                        k_total = min(total_remaining_tokens, int(masked_count_j))
-                        if k_total <= 0:
-                            continue
-                        # 1) 累积选取：一次性选出“当步至结尾”应确定的 token 集合
-                        #    （在 mask 内的 topk）
-                        _, select_indices = torch.topk(full_conf[j], k_total, largest=True)
-                        x[j, select_indices] = x_temp[j, select_indices]
-                        overtime_confidence[j, select_indices] = full_conf[j, select_indices].clone().float()
-                        # 2) 下一步前：把“下一步之后还应保留给未来步数的那部分”按最低置信度反遮盖回去
-                        if i < (steps - 1):
-                            next_to_keep_for_future = int(per_step_tokens[j, i + 1 :].sum().item())
-                            if next_to_keep_for_future > 0:
-                                # 仅在“已选中的位置”（overtime_confidence>0）里，反遮盖最低置信度的那部分
-                                current_conf = overtime_confidence[j]
-                                # 把 0 置信度（未生成）位置临时设成 +inf，避免被误选为“最低”
-                                safe_conf = torch.where(current_conf == 0.0, torch.tensor(float("inf"), device=x.device), current_conf)
-                                # 需要反遮盖的数量不应超过当前已选中的数
-                                gen_count = (safe_conf != float("inf")).sum().item()
-                                k_remask = min(next_to_keep_for_future, int(gen_count))
-                                if k_remask > 0:
-                                    # 选“最不自信”的 k_remask 个
-                                    _, local_mask_indices = torch.topk(safe_conf, k_remask, largest=False)
-                                    x[j, local_mask_indices] = mask_token_id
-                                    overtime_confidence[j, local_mask_indices] = 0.0  # 清零表示撤回
-            # user-defined token control
             x = generation_tokens_hook_func(i, x, logits)
             if histories is not None:
                 histories.append(x.clone())
         if return_dict_in_generate:
-            return DreamModelOutput(sequences=x, history=histories)
         else:
             return x

 # coding=utf-8
+# Copyright 2024 The Dream team, HKUNLP Group and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import warnings
 import copy
 from dataclasses import dataclass
 import torch.distributions as dists
 from torch.nn import functional as F
 from transformers import __version__
+from transformers.generation.configuration_utils import (
+    GenerationConfig
+)
+from transformers.utils import (
+    ModelOutput,
+    is_torchdynamo_compiling,
+    logging,
+)
 logger = logging.get_logger(__name__)
 def top_p_logits(logits, top_p=None):
     sorted_logits, sorted_indices = torch.sort(logits, descending=True)
     cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
     sorted_indices_to_remove = cumulative_probs > top_p
+    # Shift the removal mask one slot to the right so the first token that crosses top_p is kept as well; slot 0 is then cleared so the highest-ranked token is never removed
     sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
     sorted_indices_to_remove[..., 0] = 0
     mask = torch.zeros_like(logits, dtype=torch.bool, device=logits.device)
     mask = mask.scatter_(-1, sorted_indices, sorted_indices_to_remove)
+    logits = logits.masked_fill(mask, torch.finfo(logits.dtype).min)  # removed entries get the most negative finite value of the dtype
+    return logits
 def top_k_logits(logits, top_k=None):
+    top_k = min(top_k, logits.size(-1))  # clamp k to the size of the last dim; NOTE(review): top_k=None makes min() raise TypeError, so the caller must guard against None
+    # Mask every entry strictly below the k-th largest logit of its row (entries tied with the k-th value are kept)
+    indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+    logits = logits.masked_fill(indices_to_remove, torch.finfo(logits.dtype).min)  # removed entries get the most negative finite value of the dtype
+    return logits
+def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confidence=False, neg_entropy=False):  # returns the pair (confidence, x0)
+    if temperature > 0:  # temperature <= 0 leaves the logits unscaled and selects the argmax branch below
+        logits = logits / temperature
+    if top_p is not None and top_p < 1:  # nucleus filtering is skipped for None or top_p >= 1
+        logits = top_p_logits(logits, top_p)
+    if top_k is not None:  # top_k_logits itself does not accept None, so the guard lives here
+        logits = top_k_logits(logits, top_k)
     probs = torch.softmax(logits, dim=-1)
     if temperature > 0:
         try:
             x0 = dists.Categorical(probs=probs).sample()
             confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
+        except:  # falls back to argmax if sampling raises; NOTE(review): a bare except also traps KeyboardInterrupt/SystemExit, `except Exception` would be narrower
             confidence, x0 = probs.max(dim=-1)
     else:
         confidence, x0 = probs.max(dim=-1)
     if margin_confidence:
         sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
+        # Take the largest and second-largest probability of each row
+        top1_probs = sorted_probs[:, 0]  # NOTE(review): [:, k] indexes dim 1, which matches the sort over dim=-1 only when probs is 2-D
+        top2_probs = sorted_probs[:, 1]
+        # Confidence becomes the margin top1 - top2 (overrides the value computed above)
+        confidence = top1_probs - top2_probs
     if neg_entropy:
         epsilon = 1e-10
         log_probs = torch.log(probs + epsilon)
+        # NOTE(review): -(p * log p).sum() is the (epsilon-smoothed) Shannon entropy of probs, which grows as the distribution flattens, so a LARGER value here means LESS certainty; confirm the sign against the "neg_entropy" intent
         confidence = -(probs * log_probs).sum(dim=-1)
     return confidence, x0
 @dataclass
 class DreamModelOutput(ModelOutput):
     sequences: torch.LongTensor = None
         # diffusion specific params
         self.eps: float = kwargs.pop("eps", 1e-3)
         self.steps: int = kwargs.pop("steps", 512)
+        self.alg: str = kwargs.pop("alg", 'origin')
         self.alg_temp: Optional[float] = kwargs.pop("alg_temp", None)
         # RCR specific parameters
         self.rcr: bool = kwargs.pop("rcr", False)
+        self.conf_alg: str = kwargs.pop("conf_alg", 'maskgit_plus')
+        # Parameters that define the output variables of `generate`
         self.num_return_sequences: int = kwargs.pop("num_return_sequences", 1)
         self.return_dict_in_generate: bool = kwargs.pop("return_dict_in_generate", False)
         self.output_history: bool = kwargs.pop("output_history", False)
+        # Special tokens that can be used at generation time
         self.mask_token_id = kwargs.pop("mask_token_id", None)
         self.pad_token_id = kwargs.pop("pad_token_id", None)
         self.bos_token_id = kwargs.pop("bos_token_id", None)
         # Wild card
         self.generation_kwargs = kwargs.pop("generation_kwargs", {})
+        # The remaining attributes do not parametrize `.generate()`, but are informative and/or used by the hub
+        # interface.
         self._from_model_config = kwargs.pop("_from_model_config", False)
         self._commit_hash = kwargs.pop("_commit_hash", None)
         self.transformers_version = kwargs.pop("transformers_version", __version__)
+        # Additional attributes without default values
         if not self._from_model_config:
+            # we don't want to copy values from the model config if we're initializing a `GenerationConfig` from a
+            # model's default configuration file
             for key, value in kwargs.items():
                 try:
                     setattr(self, key, value)
                     logger.error(f"Can't set {key} with value {value} for {self}")
                     raise err
+        # Validate the values of the attributes
         self.validate(is_init=True)
     def validate(self, is_init=False):
         pass
 class DreamGenerationMixin:
     @staticmethod
     def _expand_inputs_for_generation(
         expand_size: int = 1,
         input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None
     ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
+        """Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]"""
+        # Do not call torch.repeat_interleave if expand_size is 1 because it clones
+        # the input tensor and thus requires more memory although no change is applied
         if expand_size == 1:
             return input_ids, attention_mask
         if input_ids is not None:
             attention_mask = attention_mask.repeat_interleave(expand_size, dim=0)
         return input_ids, attention_mask
+    def _apply_rcr_logic(self, x, x0, confidence, mask_index, overtime_confidence,
+                        mask_token_id, step, total_steps, s, t):
+        """
+        RCR step: a minimally invasive change on top of the original Dream logic so that RCR takes effect.
+        - Keeps Dream's schedule: this step's global k = num_mask_token * (1 - s/t)
+        - Clamps k per sample so the batch-mean k cannot exceed a sample's own number of masked positions
+        - Target constraint: target_cum = num_mask_token * (1 - s/t) tokens are allowed to stay generated after this step.
+          If the running total exceeds the target, the lowest-confidence generated positions are re-masked to [MASK].
+
+        Args:
+            x: 2-D token tensor (unpacked as B, L); written in place.
+            x0: candidate tokens, one per True entry of mask_index.
+            confidence: confidence values, one per True entry of mask_index.
+            mask_index: mask over x marking the positions that are currently masked.
+            overtime_confidence: per-position confidence of generated tokens, 0.0 meaning "not generated"; written in place.
+            mask_token_id: id written back when a position is re-masked.
+            step, total_steps: on the last step (step == total_steps - 1) every masked position is filled and nothing is re-masked.
+            s, t: schedule values; `(s / t).item()` is called, so they must be tensors.
+
+        Returns nothing; x and overtime_confidence are updated in place.
+        """
+        device = x.device
+        B, L = x.shape
+        # Same as Dream: num_mask_token is the batch mean of the per-sample mask counts and the schedule is (1 - s/t)
+        num_mask_token = (mask_index.sum() / mask_index.shape[0]).item()
+        k_global = int(num_mask_token * (1 - (s / t).item())) if step < total_steps - 1 else int(num_mask_token)
+        # Build full-length confidence and candidate tensors (non-mask positions hold -inf and mask_token_id respectively)
+        full_conf = torch.full_like(x, -torch.inf, device=device, dtype=confidence.dtype)
+        x_temp = torch.zeros_like(x, device=device, dtype=torch.long) + mask_token_id
+        full_conf[mask_index] = confidence
+        x_temp[mask_index] = x0.clone()
+        for j in range(B):
+            # Per-sample clamp: never ask for more positions than this row has masked
+            masked_count_j = int(mask_index[j].sum().item())
+            k_j = min(k_global, masked_count_j)
+            if k_j > 0:
+                # top-k only lands on masked positions: non-mask entries of full_conf are -inf and k_j never exceeds the mask count
+                _, select_idx = torch.topk(full_conf[j], k_j, largest=True)
+                x[j, select_idx] = x_temp[j, select_idx]
+                overtime_confidence[j, select_idx] = full_conf[j, select_idx].clone().float()
+            # ===== target constraint + re-masking (skipped on the last step) =====
+            if step < total_steps - 1:
+                # Target number of generated tokens after this step. NOTE(review): this is the same expression as k_global and uses the CURRENT batch-mean mask count, so it is a per-step quota rather than a cumulative total; confirm this is intended
+                target_cum = int(num_mask_token * (1 - (s / t).item()))
+                # Number generated so far: positions with overtime_confidence > 0 count as decided. NOTE(review): a stored confidence that is 0 or negative is not counted
+                gen_mask = overtime_confidence[j] > 0
+                current_gen = int(gen_mask.sum().item())
+                # If above the target, re-mask the lowest-confidence surplus so the running total matches the target
+                to_remask = max(0, current_gen - target_cum)
+                if to_remask > 0:
+                    gen_indices = torch.where(gen_mask)[0]
+                    if gen_indices.numel() > 0:
+                        gen_conf = overtime_confidence[j, gen_indices]
+                        to_remask = min(to_remask, int(gen_indices.numel()))
+                        _, local_low = torch.topk(gen_conf, k=to_remask, largest=False)
+                        low_global = gen_indices[local_low]
+                        x[j, low_global] = mask_token_id
+                        overtime_confidence[j, low_global] = 0.0
     def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
+        """Warn when the default `max_length` (20) still controls generation, and raise `ValueError` when the input already reaches `max_length`."""
+        # Can't throw warnings/exceptions during compilation
         if is_torchdynamo_compiling():
             return
+        # Warn when the length is governed by the default `max_length` and no `max_new_tokens` was given
         if has_default_max_length and generation_config.max_new_tokens is None and generation_config.max_length == 20:
+            # 20 is the default max_length of the generation config
             warnings.warn(
+                f"Using the model-agnostic default `max_length` (={generation_config.max_length}) to control the "
+                "generation length. We recommend setting `max_new_tokens` to control the maximum length of the "
+                "generation.",
                 UserWarning,
             )
         if input_ids_length >= generation_config.max_length:
+            input_ids_string = "input_ids"
             raise ValueError(
+                f"Input length of {input_ids_string} is {input_ids_length}, but `max_length` is set to"
+                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
+                " increasing `max_length` or, better yet, setting `max_new_tokens`."
             )
+    def _prepare_generated_length(
+        self,
+        generation_config,
+        has_default_max_length,
+        input_ids_length,
+    ):
+        """Resolve `generation_config.max_length` from `max_new_tokens` (which takes precedence) or from the default `max_length` plus the input length, and return the updated config."""
         if generation_config.max_new_tokens is not None:
             if not has_default_max_length and generation_config.max_length is not None:
                 logger.warning(
+                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
+                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
+                    "Please refer to the documentation for more information. "
+                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
                 )
             generation_config.max_length = generation_config.max_new_tokens + input_ids_length
         elif has_default_max_length:
             if generation_config.max_length == DreamGenerationConfig().max_length:
                 generation_config.max_length = generation_config.max_length + input_ids_length
                 max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
                 if max_position_embeddings is not None:
                     generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
         return generation_config
     def _prepare_generation_config(
         self, generation_config: Optional[DreamGenerationConfig], **kwargs: Dict
     ) -> DreamGenerationConfig:
+        """
+        Prepares the base generation config, then applies any generation configuration options from kwargs. This
+        function handles retrocompatibility with respect to configuration files.
+        """
+        # priority: `generation_config` argument > `model.generation_config` (the default generation config)
         # NOTE(review): the fallback below is built from `self.config` via `from_model_config`,
         # not read from `self.generation_config` as the comment above suggests -- confirm which
         # one is meant.
         # Remembers whether the config came from the model rather than from the caller.
         using_model_generation_config = False
         if generation_config is None:
             generation_config = DreamGenerationConfig.from_model_config(self.config)
             using_model_generation_config = True
+        # `torch.compile` can't compile `copy.deepcopy`, arguments in `kwargs` that are part of `generation_config`
+        # will mutate the object with `.update`. As such, passing these arguments through `kwargs` is disabled -- an
+        # exception will be raised in `_validate_model_kwargs`
         # While compiling, neither the copy nor the kwargs update below is performed.
         if not is_torchdynamo_compiling():
             # Work on a deep copy so that `.update(**kwargs)` does not touch the object passed in.
             generation_config = copy.deepcopy(generation_config)
             # NOTE(review): the value returned by `update` is bound to `_kwargs` but is not read
             # in the statements that follow here.
             _kwargs = generation_config.update(**kwargs)
+            # If `generation_config` is provided, let's fallback ALL special tokens to the default values for the model
             # Applies only to a caller-supplied config: a missing `bos_token_id` is taken from
             # `self.generation_config`.
             if not using_model_generation_config:
                 if generation_config.bos_token_id is None:
                     generation_config.bos_token_id = self.generation_config.bos_token_id
         generation_config: DreamGenerationConfig,
         device: Optional[Union[torch.device, str]] = None,
     ):
+        """
+        Prepares the special tokens for generation, overwriting the generation config with their processed versions
+        converted to tensor.
+        Note that `generation_config` is changed in place and stops being serializable after this method is called.
+        That is no problem if called within `generate` (`generation_config` is a local copy that doesn't leave the
+        function). However, if called outside `generate`, consider creating a copy of `generation_config` first.
+        """
+        # Convert special tokens to tensors
         def _tensor_or_none(token, device=None):
             # None is passed through unchanged, so the `is None` checks further down still work.
             if token is None:
                 return token
             # Fall back to the model's device when the caller gave none.
             device = device if device is not None else self.device
             # A token that is already a tensor is only moved to the target device.
             if isinstance(token, torch.Tensor):
                 return token.to(device)
         pad_token_tensor = _tensor_or_none(generation_config.pad_token_id, device=device)
         mask_token_tensor = _tensor_or_none(generation_config.mask_token_id, device=device)
+        # We can have more than one eos token. Always treat it as a 1D tensor (when it exists).
         # A 0-dim eos tensor gets a leading dimension so it can be indexed with `[0]` below.
         if eos_token_tensor is not None and eos_token_tensor.ndim == 0:
             eos_token_tensor = eos_token_tensor.unsqueeze(0)
+        # Set pad token if unset (and there are conditions to do so)
         if pad_token_tensor is None and eos_token_tensor is not None:
             # The first eos id doubles as the pad id; a warning makes the substitution visible.
             pad_token_tensor = eos_token_tensor[0]
             logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{pad_token_tensor} for open-end generation.")
+        # Update generation config with the updated special tokens tensors
+        # NOTE: this must be written into a different attribute name than the one holding the original special tokens
+        # (in their non-tensor form), in order to enable end-to-end compilation. See
+        # https://pytorch.org/docs/stable/torch.compiler_cudagraph_trees.html#limitations
         generation_config._bos_token_tensor = bos_token_tensor
         generation_config._eos_token_tensor = eos_token_tensor
         generation_config._pad_token_tensor = pad_token_tensor
         generation_config: Optional[DreamGenerationConfig] = None,
         **kwargs,
     ) -> Union[DreamModelOutput, torch.LongTensor]:
+        # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
         # NOTE(review): `kwargs` still holds the hook callables and `attention_mask` when it is
         # forwarded here; they are only popped afterwards. How `update` treats those keys is
         # not visible from this method -- confirm.
         generation_config = self._prepare_generation_config(generation_config, **kwargs)
         # Default hooks are pass-throughs: the token hook returns `x`, the logits hook returns `logits`.
         generation_tokens_hook_func = kwargs.pop("generation_tokens_hook_func", lambda step, x, logits: x)
         generation_logits_hook_func = kwargs.pop("generation_logits_hook_func", lambda step, x, logits: logits)
+        # 2. Define model inputs
         # NOTE(review): `assert` statements are removed under `python -O`; an explicit check
         # with a raised exception would keep this validation in optimized runs.
         assert inputs is not None
         input_ids = inputs
         device = input_ids.device
         attention_mask = kwargs.pop("attention_mask", None)
         # Special-token tensors are placed on the same device as the inputs.
         self._prepare_special_tokens(generation_config, device=device)
+        # 3. Prepare `max_length`.
         input_ids_length = input_ids.shape[-1]
         # True when no `max_length` kwarg was passed and the config carries a non-None `max_length`.
         has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
         generation_config = self._prepare_generated_length(
             has_default_max_length=has_default_max_length,
             input_ids_length=input_ids_length,
         )
+        self._validate_generated_length(generation_config, input_ids_length, has_default_max_length)
+        # 4. Check input_ids
         # Device-type mismatch only produces a warning; the check is skipped while compiling.
         if not is_torchdynamo_compiling() and self.device.type != input_ids.device.type:
             warnings.warn(
+                "You are calling .generate() with the `input_ids` being on a device type different"
+                f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model"
+                f" is on {self.device.type}. You may experience unexpected behaviors or slower generation."
+                " Please make sure that you have put `input_ids` to the"
+                f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before"
+                " running `.generate()`.",
                 UserWarning,
             )
         # Warns when some input id equals `pad_token_id` while no attention mask was supplied.
         # NOTE(review): `hasattr` does not rule out `pad_token_id` being None -- confirm it is
         # always set before this comparison runs.
+        if (
+            hasattr(generation_config, "pad_token_id") and
+            torch.any(input_ids == generation_config.pad_token_id) and
+            attention_mask is None
+        ):
             warnings.warn(
+                "Padding was detected but no attention mask is passed here. For correct "
+                "generation results, please set `attention_mask` when batch-padding inputs.",
                 UserWarning,
             )
         # Inputs (and mask) are expanded according to `num_return_sequences` before sampling.
         input_ids, attention_mask = self._expand_inputs_for_generation(
             expand_size=generation_config.num_return_sequences,
             input_ids=input_ids,
+            attention_mask=attention_mask
         )
         # The denoising loop itself lives in `_sample`; its result is returned unchanged.
         result = self._sample(
             attention_mask=attention_mask,
             generation_config=generation_config,
             generation_tokens_hook_func=generation_tokens_hook_func,
+            generation_logits_hook_func=generation_logits_hook_func
         )
         return result
         attention_mask: Optional[torch.LongTensor],
         generation_config: DreamGenerationConfig,
         generation_tokens_hook_func,
+        generation_logits_hook_func
     ) -> Union[DreamModelOutput, torch.LongTensor]:
+        # init values
         output_history = generation_config.output_history
         return_dict_in_generate = generation_config.return_dict_in_generate
         max_length = generation_config.max_length
         top_p = generation_config.top_p
         top_k = generation_config.top_k
+        # RCR specific values
         rcr = generation_config.rcr
         conf_alg = generation_config.conf_alg
         # Intermediate sequences are recorded only when both flags are set.
         histories = [] if (return_dict_in_generate and output_history) else None
+        # pad input_ids to max_length
         # Padding is added on the right of the last dimension and filled with `mask_token_id`.
         x = F.pad(input_ids, (0, max_length - input_ids.shape[1]), value=mask_token_id)
         if attention_mask is not None and torch.any(attention_mask == 0.0):
+            # we do not mask the [MASK] tokens so value = 1.0
             attention_mask = F.pad(attention_mask, (0, max_length - attention_mask.shape[1]), value=1.0)
             # Running count of attended positions minus one; entries where the mask is 0 are
             # then overwritten with 1.
             tok_idx = attention_mask.long().cumsum(-1) - 1
             tok_idx.masked_fill_(attention_mask == 0, 1)
+            # attention_mask is of shape [B, N]
+            # broadcast to [B, 1, N, N]
             attention_mask = torch.logical_and(
                 attention_mask.unsqueeze(1).unsqueeze(-2),
                 attention_mask.unsqueeze(1).unsqueeze(-1),
             tok_idx = None
             attention_mask = "full"
         # `steps + 1` evenly spaced values from 1 down to `eps`; step i uses entries i and i + 1.
         timesteps = torch.linspace(1, eps, steps + 1, device=x.device)
+        # RCR tracking - initialize overtime confidence tracking
         overtime_confidence = torch.zeros_like(x, dtype=torch.float32) if rcr else None
+        # this allows user-defined token control of the intermediate steps
         x = generation_tokens_hook_func(None, x, None)
         for i in range(steps):
             # Boolean map of positions that still hold the mask token.
+            mask_index = (x == mask_token_id)
             logits = self(x, attention_mask, tok_idx).logits
             # Shift the logits one position to the right along dim 1; the first position is
             # kept as-is and the last one is dropped.
+            logits = torch.cat([logits[:,:1], logits[:, :-1]], dim=1)
+            # this allows user-defined logits control of the intermediate steps
             logits = generation_logits_hook_func(i, x, logits)
+            mask_logits = logits[mask_index]
             t = timesteps[i]
             s = timesteps[i + 1]
+            if alg == 'origin':
                 # Each masked position is sampled with probability `1 - s / t` (always on the
                 # last step); positions not selected keep the mask token.
                 # NOTE(review): the next two tensors are created on `self.device` while `x` may
                 # live on `x.device` -- confirm both are always the same device.
+                p_transfer = 1 - s / t if i < steps - 1 else 1
+                x0 = torch.zeros_like(x[mask_index], device=self.device, dtype=torch.long) + mask_token_id
+                transfer_index_t_s = torch.rand(*x0.shape, device=self.device) < p_transfer
+                _, x0[transfer_index_t_s]= sample_tokens(mask_logits[transfer_index_t_s], temperature=temperature, top_p=top_p, top_k=top_k)
                 x[mask_index] = x0.clone()
             else:
                 # NOTE(review): each branch tests `alg` first and `conf_alg` (RCR only) second,
                 # so when both name supported but different algorithms, whichever appears
                 # earlier in this chain is used -- confirm that precedence is intended.
+                if alg == 'maskgit_plus' or (rcr and conf_alg == 'maskgit_plus'):
                     confidence, x0 = sample_tokens(mask_logits, temperature=temperature, top_p=top_p, top_k=top_k)
+                elif alg == 'topk_margin' or (rcr and conf_alg == 'topk_margin'):
                     confidence, x0 = sample_tokens(mask_logits, temperature=temperature, top_p=top_p, top_k=top_k, margin_confidence=True)
+                elif alg == 'entropy' or (rcr and conf_alg == 'entropy'):
+                    confidence, x0 = sample_tokens(mask_logits, temperature, top_p=top_p, top_k=top_k, neg_entropy=True)
                 else:
+                    raise RuntimeError(f"Unknown alg: {alg}")
+                # Apply RCR logic if enabled
+                if rcr:
                     # NOTE(review): this `print` runs on every step of every call; the file
                     # already has a module-level `logger` that could carry it at debug level.
                     # The return value of `_apply_rcr_logic` is not used here; presumably it
                     # updates `x` / `overtime_confidence` in place -- verify in that method.
+                    print(f"[RCR EXEC] Step {i}: RCR logic executed")
+                    self._apply_rcr_logic(x, x0, confidence, mask_index, overtime_confidence,
+                                        mask_token_id, i, steps, s, t)
                 else:
+                    # Original Dream sampling logic
                     # Masked positions per row, computed as total masked count divided by the
                     # first-dimension size.
                     # NOTE(review): this is an average over the batch; it assumes every row has
                     # the same number of masked positions -- TODO confirm.
+                    num_mask_token = mask_index.sum() / mask_index.shape[0]
+                    number_transfer_tokens = int(num_mask_token * (1 - s / t)) if i < steps - 1 else int(num_mask_token)
+                    # --------- Minor fix here only: use x.device for `device` to avoid cross-device tensors ----------
                     # Confidence is -inf everywhere except at the masked positions.
+                    full_confidence = torch.full_like(x, -torch.inf, device=x.device, dtype=logits.dtype)
+                    full_confidence[mask_index] = confidence
+                    if number_transfer_tokens > 0:
                         # No temperature: take the highest-confidence positions. Otherwise
                         # sample positions from softmax(confidence / alg_temp).
+                        if alg_temp is None or alg_temp == 0:
+                            _, transfer_index = torch.topk(full_confidence, number_transfer_tokens)
+                        else:
+                            full_confidence = full_confidence / alg_temp
+                            full_confidence = F.softmax(full_confidence, dim=-1)
+                            transfer_index = torch.multinomial(full_confidence, num_samples=number_transfer_tokens)
                         # `x_` holds the sampled tokens at masked positions and the mask token
                         # elsewhere; only the selected positions are copied into `x`.
                         # NOTE(review): `x_` and `row_indices` are created on `self.device`,
                         # unlike `full_confidence` above which uses `x.device` -- confirm.
+                        x_ = torch.zeros_like(x, device=self.device, dtype=torch.long) + mask_token_id
+                        x_[mask_index] = x0.clone()
+                        row_indices = torch.arange(x.size(0), device=self.device).unsqueeze(1).expand_as(transfer_index)
+                        x[row_indices,transfer_index] = x_[row_indices,transfer_index]
+            # this allows user-defined token control of the intermediate steps
             x = generation_tokens_hook_func(i, x, logits)
             if histories is not None:
                 # Store a copy so later in-place edits of `x` do not alter recorded steps.
                 histories.append(x.clone())
         # Structured output when requested, otherwise the bare token tensor.
         if return_dict_in_generate:
+            return DreamModelOutput(
+                sequences=x,
+                history=histories,
+            )
         else:
             return x