Delete custom_generate/generate.py
custom_generate/generate.py  +0  -217
custom_generate/generate.py
DELETED
@@ -1,217 +0,0 @@
import torch
import torch.nn as nn
import random
import logging
from typing import Union, List, Optional
from transformers import LogitsProcessor, LogitsProcessorList, StoppingCriteriaList, GenerationConfig
from transformers.generation.utils import GenerationMixin, GenerateDecoderOnlyOutput

logger = logging.getLogger(__name__)


class XTCLogitsWarper(LogitsProcessor):
    """
    LogitsWarper that implements Exclude Top Choices (XTC).
    Based on the implementation from text-generation-webui.
    """

    def __init__(self, threshold: float, probability: float, protected_token_ids: Optional[List[int]] = None, filter_value: float = -float("Inf")):
        self.threshold = threshold
        self.probability = probability
        self.filter_value = filter_value
        self.protected_token_ids = set(protected_token_ids) if protected_token_ids is not None else set()

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # If probability is 0 or the random roll fails, do nothing
        if self.probability <= 0.0 or random.random() >= self.probability:
            return scores

        # Sort scores descending
        sorted_logits, sorted_indices = torch.sort(scores, descending=True)
        probs = sorted_logits.softmax(dim=-1)

        # Create a mask for removal
        sorted_indices_to_remove = torch.full_like(probs, False, dtype=torch.bool)

        # XTC Logic:
        # If the *next* token in the sorted list is above the threshold,
        # then the current token is considered a "top choice" that can be skipped.
        # This keeps the "tail" but trims the "head" if the head is redundant.
        sorted_indices_to_remove[..., :-1] = probs[..., 1:] >= self.threshold

        # Scatter back to original indices
        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)

        # Safety: check if any protected tokens (EOS, newline) would be removed.
        if self.protected_token_ids:
            # Check if any of the columns corresponding to protected IDs are marked True.
            # We iterate because constructing a full tensor for boolean indexing can be slow if the list is small.
            protected_safe = True
            for pid in self.protected_token_ids:
                if indices_to_remove[:, pid].any():
                    protected_safe = False
                    break

            if not protected_safe:
                return scores

        # Apply the filter
        scores = scores.masked_fill(indices_to_remove, self.filter_value)
        return scores

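For orientation (this note and snippet are not part of the deleted file): a minimal sketch of the warper's effect on a toy distribution, using the XTCLogitsWarper defined above. The vocabulary size and logit values are invented, and probability=1.0 is used only so the random roll fires on this one call.

import torch

# Toy batch of one sequence over a 5-token vocabulary; token 0 is the clear favourite.
logits = torch.tensor([[4.0, 3.0, 1.0, 0.5, 0.1]])

# threshold matches the default above; probability=1.0 forces XTC to apply here.
warper = XTCLogitsWarper(threshold=0.1, probability=1.0)
masked = warper(input_ids=torch.empty(1, 0, dtype=torch.long), scores=logits.clone())

# Only token 0 is set to -inf: the runner-up (prob ~0.25) is itself above the threshold,
# so the top choice is excluded and sampling falls through to the remaining tokens.
print(masked)
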
def _xtc_decoding(
    model,
    input_ids: torch.LongTensor,
    logits_processor: LogitsProcessorList,
    stopping_criteria: StoppingCriteriaList,
    generation_config: GenerationConfig,
    synced_gpus: bool = False,
    streamer: "BaseStreamer" = None,
    **model_kwargs,
) -> Union[GenerateDecoderOnlyOutput, torch.LongTensor]:
    """
    Custom decoding loop that ensures XTC is applied during sampling.
    """

    # 1. Setup XTC configuration
    xtc_threshold = getattr(generation_config, "xtc_threshold", 0.1)
    xtc_probability = getattr(generation_config, "xtc_probability", 0.0)

    # Identify tokens to protect (EOS and newlines are standard for XTC)
    protected_ids = []
    if generation_config.eos_token_id is not None:
        if isinstance(generation_config.eos_token_id, list):
            protected_ids.extend(generation_config.eos_token_id)
        else:
            protected_ids.append(generation_config.eos_token_id)

    # Try to detect the newline token (assumes a basic ASCII/Llama tokenizer structure if not provided).
    # Users can provide `xtc_protected_tokens` in the config if needed.
    custom_protected = getattr(generation_config, "xtc_protected_tokens", None)
    if custom_protected:
        protected_ids.extend(custom_protected)
    else:
        # Fallback heuristic: try to find \n in the model embeddings if possible,
        # or rely on standard ID 13 (Llama/Mistral).
        # Without tokenizer access, we cannot guarantee correct \n detection.
        # It is safer to rely on the probability check or user input.
        pass

    # 2. Inject XTC into the LogitsProcessorList.
    # We add it *before* the sampling step (which happens inside the loop).
    # Note: if the user passed temperature/top_p, they are already in `logits_processor`.
    if xtc_probability > 0:
        xtc_warper = XTCLogitsWarper(
            threshold=xtc_threshold,
            probability=xtc_probability,
            protected_token_ids=protected_ids,
        )
        logits_processor.append(xtc_warper)

    # 3. Initialization (standard Transformers logic)
    pad_token_id = generation_config._pad_token_tensor
    output_attentions = generation_config.output_attentions
    output_hidden_states = generation_config.output_hidden_states
    output_scores = generation_config.output_scores
    return_dict_in_generate = generation_config.return_dict_in_generate
    has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)

    # We enforce sampling, because XTC with greedy search (argmax) logic is contradictory.
    # (XTC removes top tokens -> greedy picks the next best -> effectively just degradation.)
    do_sample = True

    # Init output tuples
    scores = () if (return_dict_in_generate and output_scores) else None
    decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
    cross_attentions = () if (return_dict_in_generate and output_attentions) else None
    decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None

    # Track finished sequences
    batch_size, cur_length = input_ids.shape[:2]
    unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
    model_kwargs = model._get_initial_cache_position(cur_length, input_ids.device, model_kwargs)

    this_peer_finished = False

    # 4. The decoding loop
    while model._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
        # Prepare inputs
        model_inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs)

        # Forward pass
        outputs = model(
            **model_inputs,
            return_dict=True,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        if synced_gpus and this_peer_finished:
            continue  # don't waste resources

        # Clone logits for return if needed
        next_token_logits = outputs.logits[:, -1, :]

        # Apply logits processors (this includes: RepetitionPenalty, Temperature, TopP, AND XTC)
        next_token_scores = logits_processor(input_ids, next_token_logits)

        # Store scores/hidden states if requested
        if return_dict_in_generate and output_scores:
            scores += (next_token_scores,)
        if return_dict_in_generate and output_attentions:
            decoder_attentions += ((outputs.decoder_attentions,) if model.config.is_encoder_decoder else (outputs.attentions,))
        if return_dict_in_generate and output_hidden_states:
            decoder_hidden_states += ((outputs.decoder_hidden_states,) if model.config.is_encoder_decoder else (outputs.hidden_states,))

        # Sample (multinomial).
        # XTC modifies the distribution (zeros out top tokens), so we sample from the remainder.
        probs = nn.functional.softmax(next_token_scores, dim=-1)
        next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)

        # Handle EOS safety for batching
        if has_eos_stopping_criteria:
            next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)

        # Update inputs for next step
        input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)

        # Streamer interaction
        if streamer is not None:
            streamer.put(next_tokens.cpu())

        # Update model kwargs (cache, attention mask, etc.)
        model_kwargs = model._update_model_kwargs_for_generation(
            outputs, model_kwargs, is_encoder_decoder=model.config.is_encoder_decoder
        )

        # Update stopping criteria
        unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
        this_peer_finished = unfinished_sequences.max() == 0

    if streamer is not None:
        streamer.end()

    # 5. Return formatted output
    if return_dict_in_generate:
        return GenerateDecoderOnlyOutput(
            sequences=input_ids,
            scores=scores,
            attentions=decoder_attentions,
            hidden_states=decoder_hidden_states,
            past_key_values=model_kwargs.get("past_key_values"),
        )
    else:
        return input_ids

def generate(model, *args, **kwargs):
    """
    Custom generate function integrating XTC (Exclude Top Choices).

    Arguments in `kwargs` or `generation_config` to control XTC:
        xtc_probability (float): Probability to perform the XTC check (0.0 to 1.0). Default 0.0 (disabled).
        xtc_threshold (float): The threshold for defining a "top choice". Default 0.1.
        xtc_protected_tokens (List[int]): Optional list of specific token IDs to prevent XTC from removing (e.g., newlines).
    """
    # Delegate to the standard GenerationMixin, injecting our custom decoding loop
    generation_outputs = GenerationMixin.generate(
        model, *args, custom_generate=_xtc_decoding, **kwargs
    )
    return generation_outputs
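For context (also not part of the deleted file): a usage sketch of how a `custom_generate/generate.py` like this is typically invoked through transformers' Hub custom-generate loading. The model and repo ids are placeholders, the sampling values are illustrative, and the `xtc_*` fields are simply stored on the `GenerationConfig`, which is where `_xtc_decoding` reads them back with `getattr`.

from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

# Placeholder ids -- substitute the real checkpoint and the Hub repo that hosted this file.
model = AutoModelForCausalLM.from_pretrained("your-org/your-model", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("your-org/your-model")

inputs = tokenizer("Once upon a time", return_tensors="pt").to(model.device)

# GenerationConfig keeps unknown keyword arguments as attributes, so the xtc_* values
# are visible to getattr(generation_config, "xtc_probability", 0.0) inside _xtc_decoding.
gen_cfg = GenerationConfig(
    max_new_tokens=128,
    do_sample=True,
    temperature=0.8,
    xtc_probability=0.5,  # chance of applying XTC at each decoding step
    xtc_threshold=0.1,    # probability above which a token counts as a "top choice"
    xtc_protected_tokens=tokenizer.encode("\n", add_special_tokens=False),  # keep newlines safe from removal
)

output_ids = model.generate(
    **inputs,
    generation_config=gen_cfg,
    custom_generate="your-org/xtc-custom-generate",  # hypothetical repo id containing custom_generate/generate.py
    trust_remote_code=True,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))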