kilianhaefeli committed on
Commit ·
389e1f4
1
Parent(s): e5351ca
initial attempt at EBM sampler
Browse files- modeling.py +36 -9
modeling.py
CHANGED
|
@@ -723,6 +723,7 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
|
|
| 723 |
max_new_tokens=20, # Added default value for safety
|
| 724 |
mask_id=151665,
|
| 725 |
threshold=1,
|
|
|
|
| 726 |
small_block_size=8,
|
| 727 |
block_size=32,
|
| 728 |
stop_token=151645,
|
|
@@ -737,6 +738,10 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
|
|
| 737 |
if use_block_cache:
|
| 738 |
raise ValueError("use_block_cache=True is not supported in this generate() implementation.")
|
| 739 |
assert attention_mask is not None, "attention_mask must be provided for this generate() implementation."
|
|
|
|
|
|
|
|
|
|
|
|
|
| 740 |
|
| 741 |
# pad the initial input_ids and attention_mask to be multiple of block_size
|
| 742 |
if False: # input_ids.shape[1] % block_size != 0:
|
|
@@ -862,15 +867,37 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
|
|
| 862 |
logits = logits[:, start:end]
|
| 863 |
|
| 864 |
x_1, p_1t = self.sample_with_top_p(logits, top_p=top_p, temperature=temperature)
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 868 |
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
|
| 875 |
# Add 1 to iterations if the sequence is not stopped AND at least one token is generated in this iteration
|
| 876 |
# aka if not finished and unmask id has some True value
|
|
@@ -965,4 +992,4 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
|
|
| 965 |
samples = torch.multinomial(flat, num_samples=1).squeeze(-1)
|
| 966 |
x_1 = samples.view(*p_1t.shape[:-1])
|
| 967 |
|
| 968 |
-
return x_1, p_1t
|
|
|
|
| 723 |
max_new_tokens=20, # Added default value for safety
|
| 724 |
mask_id=151665,
|
| 725 |
threshold=1,
|
| 726 |
+
sampler="confidence",
|
| 727 |
small_block_size=8,
|
| 728 |
block_size=32,
|
| 729 |
stop_token=151645,
|
|
|
|
| 738 |
if use_block_cache:
|
| 739 |
raise ValueError("use_block_cache=True is not supported in this generate() implementation.")
|
| 740 |
assert attention_mask is not None, "attention_mask must be provided for this generate() implementation."
|
| 741 |
+
if sampler not in {"confidence", "ebm"}:
|
| 742 |
+
raise ValueError(f"Unsupported sampler: {sampler}. Use 'confidence' or 'ebm'.")
|
| 743 |
+
if sampler == "ebm" and small_block_size != block_size:
|
| 744 |
+
raise ValueError("sampler='ebm' currently requires small_block_size == block_size.")
|
| 745 |
|
| 746 |
# pad the initial input_ids and attention_mask to be multiple of block_size
|
| 747 |
if False: # input_ids.shape[1] % block_size != 0:
|
|
|
|
| 867 |
logits = logits[:, start:end]
|
| 868 |
|
| 869 |
x_1, p_1t = self.sample_with_top_p(logits, top_p=top_p, temperature=temperature)
|
| 870 |
+
mask_slice = mask_idx[:, start:end]
|
| 871 |
+
if sampler == "ebm":
|
| 872 |
+
# EBM: unmask lowest-entropy positions with cumulative entropy <= threshold.
|
| 873 |
+
log_p = torch.log(p_1t.clamp_min(1e-12))
|
| 874 |
+
entropy = -(p_1t * log_p).sum(dim=-1)
|
| 875 |
+
entropy = entropy.masked_fill(~mask_slice, float("inf"))
|
| 876 |
+
|
| 877 |
+
sorted_entropy, sorted_idx = torch.sort(entropy, dim=-1, descending=False)
|
| 878 |
+
cumulative_entropy = torch.cumsum(sorted_entropy, dim=-1)
|
| 879 |
+
n_to_unmask = (cumulative_entropy <= threshold).sum(dim=-1)
|
| 880 |
+
n_to_unmask = torch.clamp(n_to_unmask, min=1)
|
| 881 |
+
max_n = mask_slice.sum(dim=-1)
|
| 882 |
+
n_to_unmask = torch.minimum(n_to_unmask, max_n)
|
| 883 |
+
|
| 884 |
+
positions = torch.arange(sorted_idx.size(1), device=sorted_idx.device)
|
| 885 |
+
positions = positions.unsqueeze(0).expand_as(sorted_idx)
|
| 886 |
+
select = positions < n_to_unmask.unsqueeze(1)
|
| 887 |
+
|
| 888 |
+
unmask_idx = torch.zeros_like(mask_slice)
|
| 889 |
+
unmask_idx.scatter_(dim=1, index=sorted_idx, src=select)
|
| 890 |
+
unmask_idx = unmask_idx & mask_slice
|
| 891 |
+
else:
|
| 892 |
+
# Select tokens with probability greater than threshold from p_1t
|
| 893 |
+
x1_p = torch.squeeze(torch.gather(p_1t, dim=-1, index=torch.unsqueeze(x_1, -1)), -1)
|
| 894 |
+
x1_p = torch.where(mask_slice, x1_p, -torch.inf)
|
| 895 |
|
| 896 |
+
unmask_idx = (x1_p > threshold)
|
| 897 |
+
# Ensure at least one token is unmasked in the current small block
|
| 898 |
+
max_prob_idx = x1_p.argmax(dim=-1)
|
| 899 |
+
unmask_idx[torch.arange(x_1.shape[0]), max_prob_idx] = True
|
| 900 |
+
unmask_idx = unmask_idx & mask_slice
|
| 901 |
|
| 902 |
# Add 1 to iterations if the sequence is not stopped AND at least one token is generated in this iteration
|
| 903 |
# aka if not finished and unmask id has some True value
|
|
|
|
| 992 |
samples = torch.multinomial(flat, num_samples=1).squeeze(-1)
|
| 993 |
x_1 = samples.view(*p_1t.shape[:-1])
|
| 994 |
|
| 995 |
+
return x_1, p_1t
|