rp-yu
/

Dimple-7B

@@ -420,6 +420,95 @@ class DimpleGenerationMixin:
         # tokenizer=None, # only for debug, need to be removed !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
         **kwargs,
     ) -> Union[DimpleModelOutput, torch.LongTensor]:
         # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
         generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
         generation_tokens_hook_func = model_kwargs.pop("generation_tokens_hook_func", lambda step, x, logits: x)

         # tokenizer=None, # only for debug, need to be removed !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
         **kwargs,
     ) -> Union[DimpleModelOutput, torch.LongTensor]:
+        """
+        Generates sequences using a diffusion-based masked token denoising algorithm.
+        This method replaces masked tokens in `inputs` through iterative refinement, based on a denoising process
+        inspired by diffusion models. It uses intermediate confidence-based sampling to progressively fill in masked tokens.
+        Args:
+            inputs (torch.Tensor):
+                Input token IDs.
+            generation_config (DimpleGenerationConfig, optional):
+                An instance of `DimpleGenerationConfig` containing generation hyperparameters. If not provided,
+                the default generation config from the model is used.
+            **kwargs:
+                Additional generation parameters that override those in `generation_config`.
+        Returns:
+            DimpleModelOutput if `return_dict_in_generate=True`, else `torch.LongTensor` of generated token IDs.
+        Key Parameters (either in `generation_config` or passed via kwargs):
+        - `max_new_tokens` (int, default=None):
+            The number of new tokens to generate or fill in. This sets the target length of the generated sequence beyond
+            the prompt. It is added to the input length to determine the total sequence length.
+        - `output_history` (bool, default=False):
+            If `True`, returns the full sequence history at each denoising step. This is useful for visualization or debugging
+            purposes. Only returned if `return_dict_in_generate=True`.
+        - `return_dict_in_generate` (bool, default=False):
+            If `True`, returns a `DimpleModelOutput` dictionary containing the final sequences and, optionally, the stepwise history.
+            If `False`, returns a plain tensor of token IDs.
+        - `steps` (int, default=512):
+            The number of denoising steps to perform during generation. Each step progressively refines the sequence by replacing
+            some masked tokens based on a sampling algorithm.
+        - `temperature` (float, default=0.0):
+            Sampling temperature applied to logits before softmax. Lower values make outputs more deterministic,
+            while higher values allow for more randomness in token selection.
+        - `top_p` (float, default=None):
+            Nucleus sampling parameter. If set, sampling is restricted to the smallest set of most probable tokens whose
+            cumulative probability reaches or exceeds `top_p`.
+        - `alg` (str, default="origin"):
+            The denoising algorithm to use for determining which tokens to replace at each step. Options include:
+              - `"origin"`: random token selection based on a probability ratio.
+              - `"origin-ratio"`: like `"origin"` but uses a continuous transfer ratio.
+              - `"autoregressive"`: always fills the left-most masked token.
+              - `"maskgit_plus"`: confidence-based selection similar to Google's MaskGIT.
+              - `"topk_margin"`: token selection based on margin (top1 - top2 probability).
+              - `"entropy"`: prioritizes tokens with high negative entropy (i.e., low uncertainty).
+        - `use_cache` (bool, default=False):
+            Enables prefilling of past key values (past KV) for efficient decoding.
+        - `alg_p_threshold` (float, optional, default=None):
+            A confidence threshold used to determine whether a token is confident enough to be selected. If the token's
+            confidence is above this value, it is unmasked and committed to the sequence. Helps stabilize generation.
+        - `use_original_confidence` (bool, default=True):
+            If `True`, confidence scores are computed using the original (pre-sampled) probability distribution.
+            If `False`, uses the current step's softmaxed logits. Enables more stable token selection in some cases.
+        - `decoding_pipeline` (str, default="dim"):
+            The generation decoding pipeline to use:
+              - `"dim"`: Dimple decoding pipeline.
+              - `"dream"`: Original DREAM token selection pipeline.
+        Example:
+            ```python
+            output = model.diffusion_generate(
+                inputs=input_ids,
+                max_new_tokens=64,
+                output_history=True,
+                return_dict_in_generate=True,
+                steps=64,
+                temperature=0.2,
+                top_p=0.95,
+                alg="origin",
+                use_cache=True,
+                alg_p_threshold=0.95,
+                use_original_confidence=True,
+                decoding_pipeline="dim"
+            )
+            ```
+        """
         # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
         generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
         generation_tokens_hook_func = model_kwargs.pop("generation_tokens_hook_func", lambda step, x, logits: x)