Update modeling_ministu.py
Browse files- modeling_ministu.py +97 -0
modeling_ministu.py
CHANGED
|
@@ -138,3 +138,100 @@ class MiniSTU(PreTrainedModel):
|
|
| 138 |
torch.nn.init.zeros_(module.c_attn.bias)
|
| 139 |
if module.c_proj.bias is not None:
|
| 140 |
torch.nn.init.zeros_(module.c_proj.bias)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
torch.nn.init.zeros_(module.c_attn.bias)
|
| 139 |
if module.c_proj.bias is not None:
|
| 140 |
torch.nn.init.zeros_(module.c_proj.bias)
|
| 141 |
+
|
| 142 |
+
@staticmethod
|
| 143 |
+
def top_k_top_p_filtering(
|
| 144 |
+
logits: torch.Tensor,
|
| 145 |
+
top_k: int = 50,
|
| 146 |
+
top_p: float = 0.95,
|
| 147 |
+
filter_value: float = float("-inf"),
|
| 148 |
+
):
|
| 149 |
+
"""
|
| 150 |
+
Filters a distribution of logits using top-k and/or nucleus (top-p) filtering.
|
| 151 |
+
"""
|
| 152 |
+
# top_k
|
| 153 |
+
if top_k > 0:
|
| 154 |
+
top_k = min(top_k, logits.size(-1))
|
| 155 |
+
# Remove all logits that are not in the top k
|
| 156 |
+
indices_to_remove = logits < torch.topk(logits, top_k, dim=-1).values[:, -1, None]
|
| 157 |
+
logits[indices_to_remove] = filter_value
|
| 158 |
+
|
| 159 |
+
# top_p (nucleus)
|
| 160 |
+
if 0 < top_p < 1.0:
|
| 161 |
+
sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
|
| 162 |
+
cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
|
| 163 |
+
|
| 164 |
+
# Remove tokens with cumulative probability above the threshold
|
| 165 |
+
sorted_indices_to_remove = cumulative_probs > top_p
|
| 166 |
+
# Shift the indices to the right to keep also the first token above the threshold
|
| 167 |
+
sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
|
| 168 |
+
sorted_indices_to_remove[:, 0] = False
|
| 169 |
+
|
| 170 |
+
indices_to_remove = sorted_indices_to_remove.scatter(
|
| 171 |
+
dim=1, index=sorted_indices, src=sorted_indices_to_remove
|
| 172 |
+
)
|
| 173 |
+
logits[indices_to_remove] = filter_value
|
| 174 |
+
|
| 175 |
+
return logits
|
| 176 |
+
|
| 177 |
+
def generate(
    self,
    input_ids: torch.LongTensor,
    max_new_tokens: int = 50,
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 0.95,
    eos_token_id: int = None,
    pad_token_id: int = 0,
    **kwargs
):
    """
    Naive token-by-token generation loop using top-k/top-p filtering and temperature.

    Improvements over the previous revision:
      * the loop runs under ``torch.no_grad()`` so no autograd graph is
        accumulated across sampling steps;
      * ``temperature <= 0`` falls back to greedy (argmax) decoding instead
        of dividing by zero;
      * the last-step logits are cloned before filtering — the slice of
        ``outputs.logits`` is a view and the filter writes in place;
      * ``pad_token_id`` is actually used: once a sequence emits
        ``eos_token_id`` its remaining positions are filled with
        ``pad_token_id`` (per-sequence stopping in a batch).

    Args:
        input_ids (torch.LongTensor): shape (batch_size, sequence_length).
        max_new_tokens (int): max number of tokens to generate (beyond input_ids length).
        temperature (float): sampling temperature; <= 0 selects greedy decoding.
        top_k (int): Top-K sampling cutoff (0 disables).
        top_p (float): Nucleus sampling cutoff.
        eos_token_id (int): If set, stop a sequence when this token is produced.
        pad_token_id (int): Token used to pad sequences that already finished.
        kwargs: Unused arguments (like num_beams) for compatibility.

    Returns:
        torch.LongTensor: shape (batch_size, sequence_length + generated_tokens).
    """
    generated_ids = input_ids.clone()
    # Rows that have not yet produced eos_token_id; shape (batch, 1).
    unfinished = torch.ones(
        input_ids.size(0), 1, dtype=torch.bool, device=input_ids.device
    )

    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Full forward pass each step (no KV cache in this naive loop);
            # __call__ rather than .forward so module hooks still fire.
            outputs = self(generated_ids)
            # Clone: the slice is a view and top_k_top_p_filtering mutates
            # its input in place — don't write through into outputs.logits.
            logits = outputs.logits[:, -1, :].clone()  # (batch_size, vocab_size)

            if temperature <= 0:
                # Degenerate temperature: deterministic greedy decoding.
                next_token = logits.argmax(dim=-1, keepdim=True)  # (batch, 1)
            else:
                if temperature != 1.0:
                    logits = logits / temperature
                logits = self.top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
                probabilities = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probabilities, num_samples=1)  # (batch, 1)

            if eos_token_id is not None:
                # Already-finished rows keep emitting pad_token_id.
                next_token = torch.where(
                    unfinished,
                    next_token,
                    torch.full_like(next_token, pad_token_id),
                )
                unfinished = unfinished & (next_token != eos_token_id)

            generated_ids = torch.cat([generated_ids, next_token], dim=1)

            # Stop early once every sequence in the batch has finished.
            if eos_token_id is not None and not unfinished.any():
                break

    return generated_ids
|