Add diffusion-based generate() method for proper text generation
modeling_dhara.py CHANGED (+152 -0)
@@ -755,6 +755,158 @@ class DharaForMaskedDiffusion(DharaPreTrainedModel, GenerationMixin):
         })
         return model_inputs
 
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        max_length: Optional[int] = None,
+        max_new_tokens: Optional[int] = None,
+        num_diffusion_steps: int = 10,
+        temperature: float = 1.0,
+        top_p: float = 0.9,
+        do_sample: bool = True,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        **kwargs
+    ) -> torch.LongTensor:
+        """
+        Generate text using masked diffusion sampling.
+
+        This method performs iterative denoising: starting from fully masked tokens,
+        it progressively unmasks positions based on model confidence.
+
+        Args:
+            input_ids: Input prompt token IDs [batch_size, prompt_len]
+            max_length: Maximum total sequence length (prompt + generation)
+            max_new_tokens: Number of new tokens to generate (alternative to max_length)
+            num_diffusion_steps: Number of denoising iterations (more = higher quality, slower)
+            temperature: Sampling temperature (higher = more random)
+            top_p: Nucleus sampling threshold
+            do_sample: Whether to sample or take argmax
+            pad_token_id: Token ID for padding
+            eos_token_id: Token ID for end of sequence
+
+        Returns:
+            Generated token IDs including the prompt
+        """
+        # Handle device and dtype
+        device = input_ids.device if input_ids is not None else next(self.parameters()).device
+
+        # Determine generation length
+        if input_ids is not None:
+            batch_size, prompt_len = input_ids.shape
+        else:
+            batch_size, prompt_len = 1, 0
+            input_ids = torch.empty(batch_size, 0, dtype=torch.long, device=device)
+
+        if max_new_tokens is not None:
+            gen_len = max_new_tokens
+        elif max_length is not None:
+            gen_len = max_length - prompt_len
+        else:
+            gen_len = 50  # Default generation length
+
+        if gen_len <= 0:
+            return input_ids
+
+        # Get special token IDs
+        mask_token_id = self.config.mask_token_id
+        if pad_token_id is None:
+            pad_token_id = self.config.pad_token_id if hasattr(self.config, 'pad_token_id') else 0
+        if eos_token_id is None:
+            eos_token_id = self.config.eos_token_id if hasattr(self.config, 'eos_token_id') else 2
+
+        # Initialize: prompt + masked tokens for generation
+        total_len = prompt_len + gen_len
+        tokens = torch.full((batch_size, total_len), mask_token_id, dtype=torch.long, device=device)
+        tokens[:, :prompt_len] = input_ids
+
+        # Track which positions are masked (need generation)
+        is_masked = torch.ones(batch_size, total_len, dtype=torch.bool, device=device)
+        is_masked[:, :prompt_len] = False  # Prompt is not masked
+
+        # Number of tokens to unmask per step
+        tokens_per_step = max(1, gen_len // num_diffusion_steps)
+
+        # Iterative denoising
+        for step in range(num_diffusion_steps):
+            # Forward pass to get logits
+            outputs = self(input_ids=tokens)
+            logits = outputs.logits  # [batch, seq_len, vocab]
+
+            # Only consider masked positions
+            masked_positions = is_masked.clone()
+
+            if not masked_positions.any():
+                break  # All tokens have been generated
+
+            # Apply temperature
+            if temperature != 1.0:
+                logits = logits / temperature
+
+            # Get probabilities
+            probs = F.softmax(logits, dim=-1)
+
+            # Calculate confidence (max prob) for each position
+            confidence, _ = probs.max(dim=-1)  # [batch, seq_len]
+
+            # Mask out already-generated positions from confidence calculation
+            confidence = confidence.masked_fill(~masked_positions, -float('inf'))
+
+            # Determine how many tokens to unmask this step
+            remaining_masked = masked_positions.sum(dim=1)  # [batch]
+
+            # For the last step, unmask everything remaining
+            if step == num_diffusion_steps - 1:
+                num_to_unmask = remaining_masked
+            else:
+                num_to_unmask = torch.minimum(
+                    torch.tensor(tokens_per_step, device=device).expand(batch_size),
+                    remaining_masked
+                )
+
+            # For each batch item, unmask the highest confidence positions
+            for b in range(batch_size):
+                if num_to_unmask[b] == 0:
+                    continue
+
+                # Get confidence scores for this batch item
+                conf_b = confidence[b]  # [seq_len]
+
+                # Get top-k positions with highest confidence
+                k = int(num_to_unmask[b].item())
+                _, top_indices = conf_b.topk(k)
+
+                # Sample or argmax for these positions
+                for idx in top_indices:
+                    pos_logits = logits[b, idx]  # [vocab]
+
+                    if do_sample and temperature > 0:
+                        # Top-p (nucleus) sampling
+                        sorted_logits, sorted_indices = torch.sort(pos_logits, descending=True)
+                        sorted_probs = F.softmax(sorted_logits, dim=-1)
+                        cumsum_probs = torch.cumsum(sorted_probs, dim=-1)
+
+                        # Remove tokens with cumulative probability above top_p
+                        sorted_indices_to_remove = cumsum_probs > top_p
+                        sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone()
+                        sorted_indices_to_remove[0] = False
+
+                        sorted_logits[sorted_indices_to_remove] = float('-inf')
+                        probs_filtered = F.softmax(sorted_logits, dim=-1)
+
+                        # Sample
+                        sampled_idx = torch.multinomial(probs_filtered, 1)
+                        token_id = sorted_indices[sampled_idx]
+                    else:
+                        # Greedy (argmax)
+                        token_id = pos_logits.argmax()
+
+                    tokens[b, idx] = token_id
+                    is_masked[b, idx] = False
+
+        return tokens
+
     def save_pretrained(self, save_directory, **kwargs):
         kwargs['safe_serialization'] = kwargs.get('safe_serialization', True)
        return super().save_pretrained(save_directory, **kwargs)
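
For reference, a minimal usage sketch of the new method (not part of this diff). It assumes the checkpoint is loaded with trust_remote_code=True so that the custom DharaForMaskedDiffusion class and its generate() override are resolved; the repo id, the AutoModel auto class, and the prompt are placeholders, not values taken from this repository.

from transformers import AutoModel, AutoTokenizer

model_id = "path/to/dhara-checkpoint"  # placeholder, replace with the actual repo id
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# AutoModel is an assumption; the repo's auto_map may expose the class under a different auto class
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
model.eval()

prompt = "The history of diffusion language models"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Keyword arguments match the new generate() signature above
output_ids = model.generate(
    input_ids=input_ids,
    max_new_tokens=64,       # 64 masked slots appended after the prompt
    num_diffusion_steps=16,  # max(1, 64 // 16) = 4 positions unmasked per step
    temperature=0.8,
    top_p=0.9,
    do_sample=True,
)

# generate() returns prompt + generated tokens as a single LongTensor
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

With max_new_tokens=64 and num_diffusion_steps=16, the schedule in the diff unmasks max(1, 64 // 16) = 4 positions per denoising step, and the final step unmasks whatever remains. Because this generate() is defined directly on DharaForMaskedDiffusion, it takes precedence over the generate() inherited from GenerationMixin.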