autoprogrammer committed on
Commit
78f65a4
·
verified ·
1 Parent(s): fd52594

Update generation_utils.py

Browse files
Files changed (1) hide show
  1. generation_utils.py +136 -170
generation_utils.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import warnings
2
  import copy
3
  from dataclasses import dataclass
@@ -17,7 +18,6 @@ def _apply_top_p_k_temp(logits, temperature=0.0, top_p=None, top_k=None):
17
  if temperature and temperature > 0:
18
  logits = logits / temperature
19
  if top_p is not None and top_p < 1:
20
- # top-p
21
  sorted_logits, sorted_indices = torch.sort(logits, descending=True)
22
  cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
23
  sorted_indices_to_remove = cumulative_probs > top_p
@@ -27,7 +27,6 @@ def _apply_top_p_k_temp(logits, temperature=0.0, top_p=None, top_k=None):
27
  mask = mask.scatter_(-1, sorted_indices, sorted_indices_to_remove)
28
  logits = logits.masked_fill(mask, torch.finfo(logits.dtype).min)
29
  if top_k is not None:
30
- # top-k
31
  top_k = int(min(top_k, logits.size(-1)))
32
  if top_k > 0:
33
  indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
@@ -35,6 +34,26 @@ def _apply_top_p_k_temp(logits, temperature=0.0, top_p=None, top_k=None):
35
  return logits
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  @dataclass
39
  class DreamModelOutput(ModelOutput):
40
  sequences: torch.LongTensor = None
@@ -52,16 +71,23 @@ class DreamGenerationConfig(GenerationConfig):
52
  self.max_length = kwargs.pop("max_length", 20)
53
  self.max_new_tokens = kwargs.pop("max_new_tokens", None)
54
 
55
- # diffusion specific params
56
  self.eps: float = kwargs.pop("eps", 1e-3)
57
  self.steps: int = kwargs.pop("steps", 512)
58
- self.alg: str = kwargs.pop("alg", 'origin') # vanilla 使用
 
 
59
  self.alg_temp: Optional[float] = kwargs.pop("alg_temp", None)
60
 
61
- # RCR
62
  self.rcr: bool = kwargs.pop("rcr", False)
63
- # 注意:论文版 RCR 会忽略这里的 conf_alg,并统一用“选中 token 概率”做 running max
64
- self.conf_alg: str = kwargs.pop("conf_alg", 'maskgit_plus')
 
 
 
 
 
65
 
66
  # outputs
67
  self.num_return_sequences: int = kwargs.pop("num_return_sequences", 1)
@@ -93,7 +119,9 @@ class DreamGenerationConfig(GenerationConfig):
93
  self.validate(is_init=True)
94
 
95
  def validate(self, is_init=False):
96
- pass
 
 
97
 
98
 
99
  class DreamGenerationMixin:
@@ -111,70 +139,12 @@ class DreamGenerationMixin:
111
  attention_mask = attention_mask.repeat_interleave(expand_size, dim=0)
112
  return input_ids, attention_mask
113
 
114
- # =============== 论文版 RCR:运行最大置信度 + 直接选 n_t 回遮 ===============
115
- def _apply_rcr_logic_paper(
116
- self,
117
- x: torch.Tensor, # [B, L]
118
- rmax_conf: torch.Tensor, # [B, L], float32, running max of selected-token prob
119
- init_mask_bool: torch.Tensor, # [B, L], 初始生成区域(最开始是 MASK 的位置)
120
- init_mask_count: torch.Tensor, # [B], 初始 MASK 数 M0
121
- mask_token_id: int,
122
- step: int,
123
- total_steps: int,
124
- s: torch.Tensor,
125
- t: torch.Tensor,
126
- ):
127
- """
128
- 目标:在“初始生成区域”(init_mask_bool) 内,让“已确认个数”符合 vanilla 的线性进度;
129
- 但位置选择依据“历史最大置信度 rmax_conf”——每步保留 rmax_conf 高的,回遮 rmax_conf 低的。
130
-
131
- 做法:
132
- target_cum = floor(M0 * (1 - s/t)) # 最后一步 = M0
133
- 在 init_mask_bool[j] 内按 rmax_conf[j] 降序选 target_cum 个 => 保持已确认(不 mask)
134
- 其余位置设为 mask_token_id
135
- """
136
- B, L = x.shape
137
- for j in range(B):
138
- M0 = int(init_mask_count[j].item())
139
- if step < total_steps - 1:
140
- target_cum = int(M0 * (1.0 - (s.item() / t.item())))
141
- else:
142
- target_cum = M0
143
-
144
- # 在初始生成区域内排序
145
- region_idx = torch.where(init_mask_bool[j])[0]
146
- if region_idx.numel() == 0:
147
- continue
148
-
149
- # rmax_conf 越大越稳,保留前 target_cum 个
150
- scores = rmax_conf[j, region_idx] # float32
151
- # 防御:若还没更新过,rmax_conf 初始 0.0,会被优先回遮(符合“历史没自信过”的直觉)
152
- target_cum = min(target_cum, int(region_idx.numel()))
153
- if target_cum <= 0:
154
- # 全部保持 mask
155
- x[j, region_idx] = mask_token_id
156
- continue
157
-
158
- _, keep_local = torch.topk(scores, k=target_cum, largest=True)
159
- keep_global = region_idx[keep_local]
160
-
161
- # 其余回遮
162
- mask_global = torch.ones_like(region_idx, dtype=torch.bool, device=x.device)
163
- mask_global[keep_local] = False
164
- remask_idx = region_idx[mask_global]
165
-
166
- if remask_idx.numel() > 0:
167
- x[j, remask_idx] = mask_token_id
168
- # keep_global 上保持当前写入的 token,不动
169
-
170
  def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
171
  if is_torchdynamo_compiling():
172
  return
173
  if has_default_max_length and generation_config.max_new_tokens is None and generation_config.max_length == 20:
174
  warnings.warn(
175
- f"Using the model-agnostic default `max_length` (={generation_config.max_length}) to control the "
176
- "generation length. We recommend setting `max_new_tokens` to control the maximum length of the "
177
- "generation.",
178
  UserWarning,
179
  )
180
  if input_ids_length >= generation_config.max_length:
@@ -186,9 +156,7 @@ class DreamGenerationMixin:
186
  def _prepare_generated_length(self, generation_config, has_default_max_length, input_ids_length):
187
  if generation_config.max_new_tokens is not None:
188
  if not has_default_max_length and generation_config.max_length is not None:
189
- logger.warning(
190
- f"Both `max_new_tokens` and `max_length` are set. `max_new_tokens` takes precedence."
191
- )
192
  generation_config.max_length = generation_config.max_new_tokens + input_ids_length
193
  elif has_default_max_length:
194
  if generation_config.max_length == DreamGenerationConfig().max_length:
@@ -273,7 +241,7 @@ class DreamGenerationMixin:
273
 
274
  if not is_torchdynamo_compiling() and self.device.type != input_ids.device.type:
275
  warnings.warn(
276
- "You are calling .generate() with `input_ids` on a different device than the model.",
277
  UserWarning,
278
  )
279
  if (
@@ -320,7 +288,15 @@ class DreamGenerationMixin:
320
  top_p = generation_config.top_p
321
  top_k = generation_config.top_k
322
 
323
- rcr = generation_config.rcr # 打开则走论文版 RCR(历史最大 top-1 概率)
 
 
 
 
 
 
 
 
324
  histories = [] if (return_dict_in_generate and output_history) else None
325
 
326
  # pad input_ids to max_length
@@ -340,120 +316,110 @@ class DreamGenerationMixin:
340
 
341
  timesteps = torch.linspace(1, eps, steps + 1, device=x.device)
342
 
 
343
  if rcr:
344
- # 初始生成区域(prompt 之外扩展出来的那一段)
345
- init_mask_bool = (x == mask_token_id) # [B, L]
346
- init_mask_count = init_mask_bool.sum(dim=1) # [B]
347
- # 历史最大“被选 token 概率”(float32)
348
- rmax_conf = torch.zeros_like(x, dtype=torch.float32, device=x.device)
349
- logger.warning(
350
- "[RCR] Using PAPER version: running-max of SELECTED-TOKEN PROB; "
351
- "this overrides `conf_alg` (e.g., entropy) for remasking decisions."
352
- )
353
 
354
  x = generation_tokens_hook_func(None, x, None)
355
 
356
  for i in range(steps):
357
  mask_index = (x == mask_token_id)
358
 
359
- # 前向
360
  logits = self(x, attention_mask, tok_idx).logits
361
  logits = torch.cat([logits[:, :1], logits[:, :-1]], dim=1)
362
  logits = generation_logits_hook_func(i, x, logits)
363
 
 
364
  t = timesteps[i]
365
  s = timesteps[i + 1]
366
 
367
- if not rcr:
368
- # ===== vanilla 路径(保持你原来的实现)=====
369
- mask_logits = logits[mask_index]
370
- if alg == 'origin':
371
- p_transfer = 1 - s / t if i < steps - 1 else 1
372
- x0 = torch.zeros_like(x[mask_index], device=self.device, dtype=torch.long) + mask_token_id
373
- transfer_index_t_s = torch.rand(*x0.shape, device=self.device) < p_transfer
374
- if transfer_index_t_s.any():
375
- logits_sub = mask_logits[transfer_index_t_s]
376
- logits_sub = _apply_top_p_k_temp(logits_sub, temperature, top_p, top_k)
377
- probs_sub = torch.softmax(logits_sub, dim=-1)
378
- try:
379
- x0_sel = dists.Categorical(probs=probs_sub).sample()
380
- except Exception:
381
- x0_sel = probs_sub.argmax(dim=-1)
382
- x0[transfer_index_t_s] = x0_sel
383
- x[mask_index] = x0.clone()
384
- else:
385
- # 按你 vanilla 的 top-k / alg_temp 逻辑
386
- mask_logits = _apply_top_p_k_temp(logits[mask_index], temperature, top_p, top_k)
387
- probs = torch.softmax(mask_logits, dim=-1)
388
- if temperature and temperature > 0:
389
- try:
390
- x0 = dists.Categorical(probs=probs).sample()
391
- confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
392
- except Exception:
393
- confidence, x0 = probs.max(dim=-1)
394
- else:
395
- confidence, x0 = probs.max(dim=-1)
396
-
397
- avg_mask_now = (mask_index.sum().item() / max(1, mask_index.shape[0]))
398
- ratio = (1.0 - (s.item() / t.item())) if i < steps - 1 else 1.0
399
- number_transfer_tokens = int(avg_mask_now * ratio)
400
-
401
- full_confidence = torch.full_like(x, -torch.inf, device=self.device, dtype=logits.dtype)
402
- full_confidence[mask_index] = confidence
403
-
404
- if number_transfer_tokens > 0:
405
- if alg_temp is None or alg_temp == 0:
406
- _, transfer_index = torch.topk(full_confidence, number_transfer_tokens)
407
- else:
408
- full_confidence = full_confidence / alg_temp
409
- full_confidence = F.softmax(full_confidence, dim=-1)
410
- transfer_index = torch.multinomial(full_confidence, num_samples=number_transfer_tokens)
411
- x_ = torch.zeros_like(x, device=self.device, dtype=torch.long) + mask_token_id
412
- x_[mask_index] = x0.clone()
413
- row_indices = torch.arange(x.size(0), device=self.device).unsqueeze(1).expand_as(transfer_index)
414
- x[row_indices, transfer_index] = x_[row_indices, transfer_index]
415
 
416
- else:
417
- # ===== 论文版 RCR =====
418
- # 1) 仅对当前 mask 的位置,做 top_p/top_k/temperature 过滤后采样(或贪心)
419
- mask_logits = logits[mask_index]
420
- mask_logits = _apply_top_p_k_temp(mask_logits, temperature, top_p, top_k)
421
- probs = torch.softmax(mask_logits, dim=-1)
422
-
423
- # 采样 / 贪心
424
- if temperature and temperature > 0:
425
- try:
426
- x0 = dists.Categorical(probs=probs).sample()
427
- except Exception:
428
- x0 = probs.argmax(dim=-1)
429
- else:
430
- x0 = probs.argmax(dim=-1)
431
 
432
- # 被选 token 的概率 p_sel(论文要求用这个做“历史置信度”)
433
- p_sel = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1) # [M], float32
434
-
435
- # 写入选中的 token
436
- x_maskwrite = torch.full_like(x, mask_token_id, dtype=torch.long)
437
- x_maskwrite[mask_index] = x0
438
- x = torch.where(mask_index, x_maskwrite, x)
439
-
440
- # 更新 running-max 置信度(float32)
441
- # 先铺到全长
442
- full_p_sel = torch.zeros_like(x, dtype=torch.float32)
443
- full_p_sel[mask_index] = p_sel.to(torch.float32)
444
- rmax_conf = torch.maximum(rmax_conf, full_p_sel)
445
-
446
- # 2) 基于 rmax_conf 直接确定“下一步要保留的已确认个数”,其余全部回遮
447
- self._apply_rcr_logic_paper(
448
- x=x,
449
- rmax_conf=rmax_conf,
450
- init_mask_bool=init_mask_bool,
451
- init_mask_count=init_mask_count,
452
- mask_token_id=mask_token_id,
453
- step=i,
454
- total_steps=steps,
455
- s=s, t=t,
456
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
 
458
  x = generation_tokens_hook_func(i, x, logits)
459
  if histories is not None:
 
1
+ # coding=utf-8
2
  import warnings
3
  import copy
4
  from dataclasses import dataclass
 
18
  if temperature and temperature > 0:
19
  logits = logits / temperature
20
  if top_p is not None and top_p < 1:
 
21
  sorted_logits, sorted_indices = torch.sort(logits, descending=True)
22
  cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
23
  sorted_indices_to_remove = cumulative_probs > top_p
 
27
  mask = mask.scatter_(-1, sorted_indices, sorted_indices_to_remove)
28
  logits = logits.masked_fill(mask, torch.finfo(logits.dtype).min)
29
  if top_k is not None:
 
30
  top_k = int(min(top_k, logits.size(-1)))
31
  if top_k > 0:
32
  indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
 
34
  return logits
35
 
36
 
37
+ def _confidence_from_probs(
38
+ probs: torch.Tensor, # [..., V]
39
+ chosen_ids: Optional[torch.Tensor], # [...]
40
+ mode: str # 'entropy' | 'maskgit_plus' | 'topk_margin'
41
+ ) -> torch.Tensor:
42
+ """返回“越大越自信”的标量分数,与解码一致。"""
43
+ if mode == "entropy":
44
+ eps = 1e-10
45
+ logp = torch.log(probs + eps)
46
+ return -(probs * logp).sum(dim=-1) # -H(p)
47
+ elif mode == "maskgit_plus":
48
+ assert chosen_ids is not None, "maskgit_plus 需要 chosen_ids"
49
+ return torch.gather(probs, -1, chosen_ids.unsqueeze(-1)).squeeze(-1) # p(x0)
50
+ elif mode == "topk_margin":
51
+ sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
52
+ return sorted_probs[..., 0] - sorted_probs[..., 1] # top1 - top2
53
+ else:
54
+ raise ValueError(f"Unknown conf mode: {mode}")
55
+
56
+
57
  @dataclass
58
  class DreamModelOutput(ModelOutput):
59
  sequences: torch.LongTensor = None
 
71
  self.max_length = kwargs.pop("max_length", 20)
72
  self.max_new_tokens = kwargs.pop("max_new_tokens", None)
73
 
74
+ # diffusion
75
  self.eps: float = kwargs.pop("eps", 1e-3)
76
  self.steps: int = kwargs.pop("steps", 512)
77
+
78
+ # vanilla 的打分算法(rcr=False 时使用)
79
+ self.alg: str = kwargs.pop("alg", 'maskgit_plus') # 'origin' | 'maskgit_plus' | 'topk_margin' | 'entropy'
80
  self.alg_temp: Optional[float] = kwargs.pop("alg_temp", None)
81
 
82
+ # === RCR ===
83
  self.rcr: bool = kwargs.pop("rcr", False)
84
+ # rcr=True 时用于解码 & 历史分一致的置信度定义
85
+ self.conf_alg: str = kwargs.pop("conf_alg", 'maskgit_plus') # 'maskgit_plus' | 'topk_margin' | 'entropy'
86
+ # 注意:下两项会被 _sample 内部“写死”为 1/4 到 3/4,总是覆盖
87
+ self.rcr_start_step: int = kwargs.pop("rcr_start_step", 0)
88
+ self.rcr_end_step: int = kwargs.pop("rcr_end_step", None) or self.steps
89
+ # 是否保护“本步刚写”的 token 不被回遮
90
+ self.rcr_protect_current_step: bool = kwargs.pop("rcr_protect_current_step", False)
91
 
92
  # outputs
93
  self.num_return_sequences: int = kwargs.pop("num_return_sequences", 1)
 
119
  self.validate(is_init=True)
120
 
121
  def validate(self, is_init=False):
122
+ # 简单边界
123
+ self.rcr_start_step = max(0, int(self.rcr_start_step))
124
+ self.rcr_end_step = max(self.rcr_start_step, int(self.rcr_end_step))
125
 
126
 
127
  class DreamGenerationMixin:
 
139
  attention_mask = attention_mask.repeat_interleave(expand_size, dim=0)
140
  return input_ids, attention_mask
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
143
  if is_torchdynamo_compiling():
144
  return
145
  if has_default_max_length and generation_config.max_new_tokens is None and generation_config.max_length == 20:
146
  warnings.warn(
147
+ f"Using default `max_length` (={generation_config.max_length}). Prefer `max_new_tokens`.",
 
 
148
  UserWarning,
149
  )
150
  if input_ids_length >= generation_config.max_length:
 
156
  def _prepare_generated_length(self, generation_config, has_default_max_length, input_ids_length):
157
  if generation_config.max_new_tokens is not None:
158
  if not has_default_max_length and generation_config.max_length is not None:
159
+ logger.warning("Both `max_new_tokens` and `max_length` are set. `max_new_tokens` takes precedence.")
 
 
160
  generation_config.max_length = generation_config.max_new_tokens + input_ids_length
161
  elif has_default_max_length:
162
  if generation_config.max_length == DreamGenerationConfig().max_length:
 
241
 
242
  if not is_torchdynamo_compiling() and self.device.type != input_ids.device.type:
243
  warnings.warn(
244
+ "You are calling .generate() with `input_ids` on a device different from the model.",
245
  UserWarning,
246
  )
247
  if (
 
288
  top_p = generation_config.top_p
289
  top_k = generation_config.top_k
290
 
291
+ rcr = generation_config.rcr
292
+ conf_alg = generation_config.conf_alg if rcr else generation_config.alg
293
+
294
+ # === 写死 RCR 生效窗口:总步数的 1/4 到 3/4(左闭右开 [start, end))===
295
+ rcr_start = max(0, steps // 4)
296
+ rcr_end = max(rcr_start, min(steps, (3 * steps) // 4))
297
+
298
+ protect_cur = bool(generation_config.rcr_protect_current_step)
299
+
300
  histories = [] if (return_dict_in_generate and output_history) else None
301
 
302
  # pad input_ids to max_length
 
316
 
317
  timesteps = torch.linspace(1, eps, steps + 1, device=x.device)
318
 
319
+ # ==== RCR 状态 ====
320
  if rcr:
321
+ init_mask_bool = (x == mask_token_id) # 初始生成区域
322
+ init_mask_count = init_mask_bool.sum(dim=1) # [B]
323
+ hist_conf = torch.zeros_like(x, dtype=torch.float32, device=x.device) # 历史最大置信度
324
+ gen_mask = torch.zeros_like(x, dtype=torch.bool, device=x.device) # 已确认位置
325
+ written_step = torch.full_like(x, -1, dtype=torch.int32, device=x.device)
 
 
 
 
326
 
327
  x = generation_tokens_hook_func(None, x, None)
328
 
329
  for i in range(steps):
330
  mask_index = (x == mask_token_id)
331
 
332
+ # 前向 + Dream 的右移对齐
333
  logits = self(x, attention_mask, tok_idx).logits
334
  logits = torch.cat([logits[:, :1], logits[:, :-1]], dim=1)
335
  logits = generation_logits_hook_func(i, x, logits)
336
 
337
+ # 时间步
338
  t = timesteps[i]
339
  s = timesteps[i + 1]
340
 
341
+ # —— 仅抽出 mask 位置的 logits 并做过滤 ——
342
+ mask_logits = logits[mask_index]
343
+ if mask_logits.numel() == 0:
344
+ x = generation_tokens_hook_func(i, x, logits)
345
+ if histories is not None:
346
+ histories.append(x.clone())
347
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
 
349
+ mask_logits = _apply_top_p_k_temp(mask_logits, temperature, top_p, top_k)
350
+ probs = torch.softmax(mask_logits, dim=-1)
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
+ # 采样 / 贪心拿到 x0
353
+ if temperature and temperature > 0:
354
+ try:
355
+ x0 = dists.Categorical(probs=probs).sample()
356
+ except Exception:
357
+ x0 = probs.argmax(dim=-1)
358
+ else:
359
+ x0 = probs.argmax(dim=-1)
360
+
361
+ # 统一置信度(与解码一致)
362
+ conf_now = _confidence_from_probs(
363
+ probs=probs,
364
+ chosen_ids=x0 if conf_alg == "maskgit_plus" else None,
365
+ mode=conf_alg
366
+ ).to(torch.float32) # [M]
367
+
368
+ # ====== 计算当步写入配额 k_t(与 vanilla 一致)======
369
+ Mt = mask_index.sum().item()
370
+ ratio = (1.0 - (s.item() / t.item())) if i < steps - 1 else 1.0
371
+ k_t = int(Mt * ratio)
372
+
373
+ # —— 写入:top-k_t ——(无论 RCR 窗口与否,先写)
374
+ full_conf_now = torch.full((x.size(0), x.size(1)), -1e9, dtype=torch.float32, device=x.device)
375
+ full_x0 = torch.full_like(x, mask_token_id, dtype=torch.long)
376
+ full_conf_now[mask_index] = conf_now
377
+ full_x0[mask_index] = x0
378
+
379
+ for b in range(x.size(0)):
380
+ masked_b = int(mask_index[b].sum().item())
381
+ if masked_b == 0 or k_t <= 0:
382
+ continue
383
+ k_b = min(k_t, masked_b)
384
+ _, sel_idx = torch.topk(full_conf_now[b], k=k_b, largest=True)
385
+ x[b, sel_idx] = full_x0[b, sel_idx]
386
+ if rcr:
387
+ gen_mask[b, sel_idx] = True
388
+ written_step[b, sel_idx] = i
389
+ # 更新历史最大置信度(与解码同定义)
390
+ hist_conf[b, sel_idx] = torch.maximum(hist_conf[b, sel_idx], full_conf_now[b, sel_idx])
391
+
392
+ # —— RCR 窗口外:不回遮,仅跟踪历史;窗口内:执行回遮到目标累计 ——
393
+ if rcr and (rcr_start <= i < rcr_end):
394
+ for b in range(x.size(0)):
395
+ M0 = int(init_mask_count[b].item())
396
+ target_cum = M0 if i >= steps - 1 else int(M0 * (1.0 - (s.item() / t.item())))
397
+ # 当前累计确认:初始生成区域内的已确认数
398
+ C_t = int((gen_mask[b] & init_mask_bool[b]).sum().item())
399
+ over = max(0, C_t - target_cum)
400
+ if over <= 0:
401
+ continue
402
+
403
+ # 候选:初始区域 ∧ 已确认(可选:排除本步刚写)
404
+ cand = torch.where(gen_mask[b] & init_mask_bool[b])[0]
405
+ if cand.numel() == 0:
406
+ continue
407
+ if protect_cur:
408
+ mask_old = (written_step[b, cand] < i)
409
+ cand = cand[mask_old]
410
+ if cand.numel() == 0:
411
+ # 全是本步写的,且要求保护,则跳过回遮
412
+ continue
413
+
414
+ over = min(over, int(cand.numel()))
415
+ scores = hist_conf[b, cand] # 越大越自信
416
+ _, low_local = torch.topk(scores, k=over, largest=False)
417
+ low_global = cand[low_local]
418
+
419
+ # 回遮
420
+ x[b, low_global] = mask_token_id
421
+ gen_mask[b, low_global] = False
422
+ # 历史分数与 written_step 保留
423
 
424
  x = generation_tokens_hook_func(i, x, logits)
425
  if histories is not None: