Todokete committed
Commit 055a45c · verified · 1 Parent(s): 1973671

Update custom_generate/generate.py

Files changed (1)
  1. custom_generate/generate.py +214 -92
custom_generate/generate.py CHANGED
@@ -1,101 +1,223 @@
  import torch
  import random

- def generate(model, input_ids, generation_config=None, **kwargs):
-     print("✨ using XTC (Exclude Top Choices) generation ✨")
-
-     generation_config = generation_config or model.generation_config
-
-     # Setup generation parameters
-     cur_length = input_ids.shape[1]
-     max_length = generation_config.max_length or cur_length + generation_config.max_new_tokens
-
-     # Extract XTC parameters from config or kwargs, with defaults
-     # Note: You can pass these in the generate() call or set them in generation_config
-     xtc_threshold = kwargs.get("xtc_threshold", getattr(generation_config, "xtc_threshold", 0.1))
-     xtc_probability = kwargs.get("xtc_probability", getattr(generation_config, "xtc_probability", 0.0))
-     temperature = kwargs.get("temperature", getattr(generation_config, "temperature", 1.0))
-
-     # Identify special tokens to protect (EOS is critical, Newline is preferred if known)
-     eos_token_id = generation_config.eos_token_id
-     pad_token_id = generation_config.pad_token_id
-
-     # Try to find newline token if possible, though hard without direct tokenizer access in this scope.
-     # Users can pass `protected_token_ids` in kwargs to be specific.
-     protected_token_ids = kwargs.get("protected_token_ids", [])
-     if eos_token_id is not None:
-         protected_token_ids.append(eos_token_id)
-
-     # Basic handling for left_padding (from original example)
-     left_padding = kwargs.get("left_padding", None)
-     if left_padding is not None:
-         if not isinstance(left_padding, int) or left_padding < 0:
-             raise ValueError(f"left_padding must be an integer larger than 0, but is {left_padding}")
-
-         pad_token = kwargs.get("pad_token", None) or pad_token_id
-         if pad_token is None:
-             raise ValueError("pad_token is not defined")
-
-         batch_size = input_ids.shape[0]
-         pad_tensor = torch.full(size=(batch_size, left_padding), fill_value=pad_token).to(input_ids.device)
-         input_ids = torch.cat((pad_tensor, input_ids), dim=1)
-         cur_length = input_ids.shape[1]
-
-     # Sampling Loop
-     while cur_length < max_length:
-         with torch.no_grad():
-             outputs = model(input_ids)
-         next_token_logits = outputs.logits[:, -1, :]
-
-         # 1. Apply Temperature
-         if temperature != 1.0 and temperature > 0:
-             next_token_logits = next_token_logits / temperature
-
-         # 2. Apply XTC (Exclude Top Choices)
-         # Logic ported from text-generation-webui sampler_hijack.py
-         if xtc_probability > 0.0 and random.random() < xtc_probability:
-             # Calculate probabilities for sorting
-             # We sort descending to find the "Top" choices
-             sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
-             sorted_probs = sorted_logits.softmax(dim=-1)
-
-             # Identify tokens to remove
-             sorted_indices_to_remove = torch.full_like(sorted_probs, False, dtype=torch.bool)
-
-             # XTC Logic:
-             # If the *next* token in the sorted list is above threshold,
-             # it means the current token is a "Top Choice" that can be excluded,
-             # because we still have good alternatives remaining.
-             sorted_indices_to_remove[..., :-1] = sorted_probs[..., 1:] >= xtc_threshold
-
-             # Scatter back to original indices
-             indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
-
-             # Safety Check: Don't remove if special tokens (EOS, etc) are targeted
-             # If any protected token is in the removal mask, we abort XTC for this step
-             should_abort = False
-             if protected_token_ids:
-                 # Check if any protected token is marked for removal
-                 # We convert list to tensor for indexing
-                 protected_tensor = torch.tensor(protected_token_ids, device=input_ids.device)
-                 # We check if any of the columns corresponding to protected tokens are True
-                 if indices_to_remove[:, protected_tensor].any():
-                     should_abort = True
-
-             if not should_abort:
-                 next_token_logits = next_token_logits.masked_fill(indices_to_remove, -float("Inf"))
-
-         # 3. Sample
-         # Using multinomial sampling (softmax + random selection)
-         probs = torch.softmax(next_token_logits, dim=-1)
-         next_tokens = torch.multinomial(probs, num_samples=1)
-
-         # Update input_ids
-         input_ids = torch.cat((input_ids, next_tokens), dim=-1)
-         cur_length += 1
-
-         # Stop if all batch items have hit EOS (optional optimization)
-         if eos_token_id is not None and (next_tokens == eos_token_id).all():
-             break
-
-     return input_ids

  import torch
+ import torch.nn as nn
  import random
+ import logging
+ from typing import Union, List, Optional
+ from transformers import LogitsProcessor, LogitsProcessorList, StoppingCriteriaList, GenerationConfig
+ from transformers.generation.utils import GenerationMixin, GenerateDecoderOnlyOutput

+ logger = logging.getLogger(__name__)

+ class XTCLogitsWarper(LogitsProcessor):
+     """
+     LogitsWarper that implements Exclude Top Choices (XTC).
+     Based on the implementation from text-generation-webui.
+     """
+     def __init__(self, threshold: float, probability: float, protected_token_ids: Optional[List[int]] = None, filter_value: float = -float("Inf")):
+         self.threshold = threshold
+         self.probability = probability
+         self.filter_value = filter_value
+         self.protected_token_ids = set(protected_token_ids) if protected_token_ids is not None else set()
+
+     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+         # If the probability is 0 or the random roll fails, do nothing
+         if self.probability <= 0.0 or random.random() >= self.probability:
+             return scores
+
+         # Sort scores in descending order and convert to probabilities
+         sorted_logits, sorted_indices = torch.sort(scores, descending=True)
+         probs = sorted_logits.softmax(dim=-1)
+
+         # Create a mask for removal
+         sorted_indices_to_remove = torch.full_like(probs, False, dtype=torch.bool)
+
+         # XTC logic:
+         # If the *next* token in the sorted list is above the threshold,
+         # then the current token is a "top choice" that can be skipped,
+         # because a viable alternative remains.
+         # This keeps the "tail" of the distribution but trims the "head" when the head is redundant.
+         sorted_indices_to_remove[..., :-1] = probs[..., 1:] >= self.threshold
+
+         # Scatter back to the original (unsorted) indices
+         indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+
+         # Safety: if any protected token (EOS, newline) would be removed, skip XTC for this step
+         if self.protected_token_ids:
+             # Check whether any of the columns corresponding to protected IDs are marked True.
+             # A plain Python loop is fine here because the set of protected tokens is small.
+             protected_safe = True
+             for pid in self.protected_token_ids:
+                 if indices_to_remove[:, pid].any():
+                     protected_safe = False
+                     break
+
+             if not protected_safe:
+                 return scores
+
+         # Apply the filter
+         scores = scores.masked_fill(indices_to_remove, self.filter_value)
+         return scores
+
+ def _xtc_decoding(
+     model,
+     input_ids: torch.LongTensor,
+     logits_processor: LogitsProcessorList,
+     stopping_criteria: StoppingCriteriaList,
+     generation_config: GenerationConfig,
+     synced_gpus: bool = False,
+     streamer: Optional["BaseStreamer"] = None,
+     **model_kwargs,
+ ) -> Union[GenerateDecoderOnlyOutput, torch.LongTensor]:
+     """
+     Custom decoding loop that ensures XTC is applied during sampling.
+     """
+     # 1. Set up the XTC configuration
+     xtc_threshold = getattr(generation_config, "xtc_threshold", 0.1)
+     xtc_probability = getattr(generation_config, "xtc_probability", 0.0)
+
+     # Identify tokens to protect (EOS and newlines are standard for XTC)
+     protected_ids = []
+     if generation_config.eos_token_id is not None:
+         if isinstance(generation_config.eos_token_id, list):
+             protected_ids.extend(generation_config.eos_token_id)
+         else:
+             protected_ids.append(generation_config.eos_token_id)
+
+     # Newline detection would require tokenizer access, which this scope does not have,
+     # so we cannot reliably find "\n" on our own. Users who want to protect newlines
+     # (or any other tokens) can pass their IDs via `xtc_protected_tokens` in the generation config.
+     custom_protected = getattr(generation_config, "xtc_protected_tokens", None)
+     if custom_protected:
+         protected_ids.extend(custom_protected)
+
+     # 2. Inject XTC into the LogitsProcessorList.
+     # It is appended after the processors the user requested (repetition penalty, temperature,
+     # top_p, ...), so it warps the already-processed scores, and it runs before the sampling
+     # step inside the loop below.
+     if xtc_probability > 0:
+         xtc_warper = XTCLogitsWarper(
+             threshold=xtc_threshold,
+             probability=xtc_probability,
+             protected_token_ids=protected_ids,
+         )
+         logits_processor.append(xtc_warper)
+
+     # 3. Initialization (standard transformers logic)
+     pad_token_id = generation_config._pad_token_tensor
+     output_attentions = generation_config.output_attentions
+     output_hidden_states = generation_config.output_hidden_states
+     output_scores = generation_config.output_scores
+     return_dict_in_generate = generation_config.return_dict_in_generate
+     has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
+
+     # We always sample, because XTC combined with greedy search (argmax) is contradictory:
+     # XTC removes the top tokens, so greedy would simply pick the next-best one, which is pure degradation.
+     do_sample = True
+
+     # Init output tuples
+     scores = () if (return_dict_in_generate and output_scores) else None
+     decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+     cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+     decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+     # Track finished sequences
+     batch_size, cur_length = input_ids.shape[:2]
+     unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
+     model_kwargs = model._get_initial_cache_position(cur_length, input_ids.device, model_kwargs)
+
+     this_peer_finished = False
+
+     # 4. The decoding loop
+     while model._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
+         # Prepare inputs
+         model_inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+         # Forward pass
+         outputs = model(
+             **model_inputs,
+             return_dict=True,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+         )
+
+         if synced_gpus and this_peer_finished:
+             continue  # don't waste resources on a peer that already finished
+
+         # Logits for the last position only
+         next_token_logits = outputs.logits[:, -1, :]
+
+         # Apply logits processors (this includes RepetitionPenalty, Temperature, TopP, AND XTC)
+         next_token_scores = logits_processor(input_ids, next_token_logits)
+
+         # Store scores / attentions / hidden states if requested
+         if return_dict_in_generate and output_scores:
+             scores += (next_token_scores,)
+         if return_dict_in_generate and output_attentions:
+             decoder_attentions += ((outputs.decoder_attentions,) if model.config.is_encoder_decoder else (outputs.attentions,))
+         if return_dict_in_generate and output_hidden_states:
+             decoder_hidden_states += ((outputs.decoder_hidden_states,) if model.config.is_encoder_decoder else (outputs.hidden_states,))
+
+         # Sample (multinomial)
+         # XTC has already masked out the top tokens where applicable, so we sample from the remainder.
+         probs = nn.functional.softmax(next_token_scores, dim=-1)
+         next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+
+         # Keep finished sequences padded so batching stays consistent
+         if has_eos_stopping_criteria:
+             next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+
+         # Update inputs for the next step
+         input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+
+         # Streamer interaction
+         if streamer is not None:
+             streamer.put(next_tokens.cpu())
+
+         # Update model kwargs (cache, attention mask, etc.)
+         model_kwargs = model._update_model_kwargs_for_generation(
+             outputs, model_kwargs, is_encoder_decoder=model.config.is_encoder_decoder
+         )
+
+         # Update stopping criteria
+         unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
+         this_peer_finished = unfinished_sequences.max() == 0
+
+     if streamer is not None:
+         streamer.end()
+
+     # 5. Return Formatted Output
+     if return_dict_in_generate:
+         return GenerateDecoderOnlyOutput(
+             sequences=input_ids,
+             scores=scores,
+             attentions=decoder_attentions,
+             hidden_states=decoder_hidden_states,
+             past_key_values=model_kwargs.get("past_key_values"),
+         )
+     else:
+         return input_ids
+
+
+ def generate(model, *args, **kwargs):
+     """
+     Custom generate function integrating XTC (Exclude Top Choices).
+
+     Arguments in `kwargs` or `generation_config` to control XTC:
+         xtc_probability (float): Probability to perform the XTC check (0.0 to 1.0). Default 0.0 (disabled).
+         xtc_threshold (float): The threshold for defining a "top choice". Default 0.1.
+         xtc_protected_tokens (List[int]): Optional list of specific token IDs to prevent XTC from removing (e.g., newlines).
+     """
+     # XTC is effectively a sampler, so we should ensure do_sample is True in the config
+     if "generation_config" in kwargs:
+         kwargs["generation_config"].do_sample = True
+     elif "do_sample" not in kwargs:
+         kwargs["do_sample"] = True
+
+     # Delegate to the standard GenerationMixin, injecting our custom decoding loop
+     generation_outputs = GenerationMixin.generate(
+         model, *args, custom_generate=_xtc_decoding, **kwargs
+     )
+     return generation_outputs
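
For intuition on the threshold: if the sorted probabilities at a step are [0.40, 0.30, 0.12, 0.08, ...] and xtc_threshold is 0.1, the first two tokens are masked (each is followed by an alternative with probability >= 0.1), while the 0.12 token survives and leads the remaining distribution that gets sampled.

A minimal usage sketch, not part of the commit: it assumes the file is importable locally as `custom_generate.generate`, uses a placeholder model id, and relies on a transformers version where `GenerationConfig` keeps unknown keyword arguments (such as `xtc_probability`) as attributes and `GenerationMixin.generate` accepts a callable `custom_generate`, which is what this module expects.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

from custom_generate.generate import generate  # the file updated in this commit

model_id = "meta-llama/Llama-3.2-1B"  # placeholder; any causal LM should work
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# Extra attributes (xtc_probability, xtc_threshold, xtc_protected_tokens) are read back
# inside _xtc_decoding via getattr(generation_config, ...).
gen_config = GenerationConfig(
    do_sample=True,
    max_new_tokens=128,
    temperature=1.0,
    xtc_probability=0.5,  # roll XTC on roughly half of the decoding steps
    xtc_threshold=0.1,    # a token is a "top choice" if its next-best alternative has prob >= 0.1
    # Heuristic newline protection; adjust for the tokenizer actually in use
    xtc_protected_tokens=tokenizer.encode("\n", add_special_tokens=False),
)

inputs = tokenizer("Once upon a time", return_tensors="pt")
output_ids = generate(model, inputs.input_ids, generation_config=gen_config)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

When the file lives in a Hub repository instead, the same settings can usually be reached without the local import via `model.generate(..., custom_generate="<repo-id>", trust_remote_code=True)`, with `<repo-id>` replaced by the actual repository name.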