/**
* @module generation/configuration_utils
*/
import { pick } from "../utils/core.js";
/**
* Class that holds a configuration for a generation task.
*/
export class GenerationConfig {
// Parameters that control the length of the output
/**
* The maximum length the generated tokens can have.
* Corresponds to the length of the input prompt + `max_new_tokens`.
* Its effect is overridden by `max_new_tokens`, if also set.
* @type {number}
* @default 20
*/
max_length = 20;
/**
* The maximum number of tokens to generate, ignoring the number of tokens in the prompt.
* @type {number}
* @default null
*/
max_new_tokens = null;
/**
* The minimum length of the sequence to be generated.
* Corresponds to the length of the input prompt + `min_new_tokens`.
* Its effect is overridden by `min_new_tokens`, if also set.
* @type {number}
* @default 0
*/
min_length = 0;
/**
* The minimum number of tokens to generate, ignoring the number of tokens in the prompt.
* @type {number}
* @default null
*/
min_new_tokens = null;
/**
* Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values:
* - `true`, where the generation stops as soon as there are `num_beams` complete candidates;
* - `false`, where a heuristic is applied and the generation stops when it is very unlikely to find better candidates;
* - `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
* @type {boolean|"never"}
* @default false
*/
early_stopping = false;
/**
* The maximum amount of time you allow the computation to run for in seconds.
* Generation will still finish the current pass after the allocated time has passed.
* @type {number}
* @default null
*/
max_time = null;
// Parameters that control the generation strategy used
/**
* Whether or not to use sampling; use greedy decoding otherwise.
* @type {boolean}
* @default false
*/
do_sample = false;
/**
* Number of beams for beam search. 1 means no beam search.
* @type {number}
* @default 1
*/
num_beams = 1;
/**
* Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
* See [this paper](https://huggingface.co/papers/1610.02424) for more details.
* @type {number}
* @default 1
*/
num_beam_groups = 1;
/**
* The value balances the model confidence and the degeneration penalty in contrastive search decoding.
* @type {number}
* @default null
*/
penalty_alpha = null;
/**
* Whether or not the model should use the past key/values attentions (if applicable to the model) to speed up decoding.
* @type {boolean}
* @default true
*/
use_cache = true;
// Parameters for manipulation of the model output logits
/**
* The value used to modulate the next token probabilities.
* @type {number}
* @default 1.0
*/
temperature = 1.0;
/**
* The number of highest probability vocabulary tokens to keep for top-k-filtering.
* @type {number}
* @default 50
*/
top_k = 50;
/**
* If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.
* @type {number}
* @default 1.0
*/
top_p = 1.0;
/**
* Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated.
* If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to `typical_p` or higher are kept for generation.
* See [this paper](https://huggingface.co/papers/2202.00666) for more details.
* @type {number}
* @default 1.0
*/
typical_p = 1.0;
/**
* If set to float strictly between 0 and 1, only tokens with a conditional probability greater than `epsilon_cutoff` will be sampled.
* In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model.
* See [Truncation Sampling as Language Model Desmoothing](https://huggingface.co/papers/2210.15191) for more details.
* @type {number}
* @default 0.0
*/
epsilon_cutoff = 0.0;
/**
* Eta sampling is a hybrid of locally typical sampling and epsilon sampling.
* If set to float strictly between 0 and 1, a token is only considered if its probability is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`.
* The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
* See [Truncation Sampling as Language Model Desmoothing](https://huggingface.co/papers/2210.15191) for more details.
* @type {number}
* @default 0.0
*/
eta_cutoff = 0.0;
/**
* This value is subtracted from a beam's score if it generates the same token as any beam from another group at a particular time.
* Note that `diversity_penalty` is only effective if group beam search is enabled.
* @type {number}
* @default 0.0
*/
diversity_penalty = 0.0;
/**
* The parameter for repetition penalty. 1.0 means no penalty.
* See [this paper](https://huggingface.co/papers/1909.05858) for more details.
* @type {number}
* @default 1.0
*/
repetition_penalty = 1.0;
/**
* The parameter for encoder repetition penalty.
* An exponential penalty on sequences that are not in the original input.
* 1.0 means no penalty.
* @type {number}
* @default 1.0
*/
encoder_repetition_penalty = 1.0;
/**
* Exponential penalty to the length that is used with beam-based generation.
* It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence.
* Since the score is the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while `length_penalty` < 0.0 encourages shorter sequences.
* @type {number}
* @default 1.0
*/
length_penalty = 1.0;
/**
* If set to int > 0, all ngrams of that size can only occur once.
* @type {number}
* @default 0
*/
no_repeat_ngram_size = 0;
/**
* List of token ids that are not allowed to be generated.
* In order to get the token ids of the words that should not appear in the generated text, use
* `tokenizer(bad_words, { add_prefix_space: true, add_special_tokens: false }).input_ids`.
* @type {number[][]}
* @default null
*/
bad_words_ids = null;
/**
* List of token ids that must be generated.
* If given a `number[][]`, this is treated as a simple list of words that must be included, the opposite to `bad_words_ids`.
* If given `number[][][]`, this triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one can allow different forms of each word.
* @type {number[][]|number[][][]}
* @default null
*/
force_words_ids = null;
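// Shape sketch for the two fields above (hypothetical token ids, for illustration only):
//   bad_words_ids:   [[873], [1829, 250]]    -> two banned sequences
//   force_words_ids: [[[873, 530], [1829]]]  -> disjunctive: any listed form of each word satisfies the constraint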
/**
* Whether to renormalize the logits after applying all the logits processors or warpers (including the custom ones).
* It's highly recommended to set this flag to `true`, as the search algorithms assume the score logits are normalized, but some logit processors or warpers break the normalization.
* @type {boolean}
* @default false
*/
renormalize_logits = false;
/**
* Custom constraints that can be added to the generation to ensure that the output will contain certain tokens, as defined by `Constraint` objects, in the most sensible way possible.
* @type {Object[]}
* @default null
*/
constraints = null;
/**
* The id of the token to force as the first generated token after the `decoder_start_token_id`.
* Useful for multilingual models like mBART where the first generated token needs to be the target language token.
* @type {number}
* @default null
*/
forced_bos_token_id = null;
/**
* The id of the token to force as the last generated token when `max_length` is reached.
* Optionally, use a list to set multiple *end-of-sequence* tokens.
* @type {number|number[]}
* @default null
*/
forced_eos_token_id = null;
/**
* Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method from crashing. Note that using `remove_invalid_values` can slow down generation.
* @type {boolean}
* @default false
remove_invalid_values = false;
/**
* This tuple adds an exponentially increasing length penalty after a certain number of tokens have been generated.
* The tuple shall consist of `(start_index, decay_factor)`, where `start_index` indicates where the penalty starts and `decay_factor` represents the factor of exponential decay.
* @type {[number, number]}
* @default null
*/
exponential_decay_length_penalty = null;
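// Illustrative sketch (values are assumptions, not recommendations):
//   exponential_decay_length_penalty: [128, 1.05]
// would leave the first 128 generated tokens unpenalized, then apply a penalty
// that grows exponentially with a per-token factor of 1.05.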
/**
* A list of tokens that will be suppressed at generation.
* The `SuppressTokens` logit processor will set their log probs to `-inf` so that they are not sampled.
* @type {number[]}
* @default null
*/
suppress_tokens = null;
/**
* A streamer that will be used to stream the generation.
* @type {import('./streamers.js').TextStreamer}
* @default null
*/
streamer = null;
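// A minimal sketch, assuming the `TextStreamer` exported from ./streamers.js
// and an already-loaded `tokenizer`:
//   streamer: new TextStreamer(tokenizer, { skip_prompt: true })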
/**
* A list of tokens that will be suppressed at the beginning of the generation.
* The `SuppressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled.
* @type {number[]}
* @default null
*/
begin_suppress_tokens = null;
/**
* A list of pairs of integers which indicates a mapping from generation indices to token indices that will be forced before sampling.
* For example, `[[1, 123]]` means the second generated token will always be a token of index 123.
* @type {[number, number][]}
* @default null
*/
forced_decoder_ids = null;
/**
* The guidance scale for classifier free guidance (CFG). CFG is enabled by setting `guidance_scale > 1`.
* A higher guidance scale encourages the model to generate samples that are more closely linked to the input
* prompt, usually at the expense of quality.
* @type {number}
* @default null
*/
guidance_scale = null;
// Parameters that define the output variables of `generate`
/**
* The number of independently computed returned sequences for each element in the batch.
* @type {number}
* @default 1
*/
num_return_sequences = 1;
/**
* Whether or not to return the attentions tensors of all attention layers.
* See `attentions` under returned tensors for more details.
* @type {boolean}
* @default false
*/
output_attentions = false;
/**
* Whether or not to return the hidden states of all layers.
* See `hidden_states` under returned tensors for more details.
* @type {boolean}
* @default false
*/
output_hidden_states = false;
/**
* Whether or not to return the prediction scores.
* See `scores` under returned tensors for more details.
* @type {boolean}
* @default false
*/
output_scores = false;
/**
* Whether or not to return a `ModelOutput` instead of a plain tuple.
* @type {boolean}
* @default false
*/
return_dict_in_generate = false;
// Special tokens that can be used at generation time
/**
* The id of the *padding* token.
* @type {number}
* @default null
*/
pad_token_id = null;
/**
* The id of the *beginning-of-sequence* token.
* @type {number}
* @default null
*/
bos_token_id = null;
/**
* The id of the *end-of-sequence* token.
* Optionally, use a list to set multiple *end-of-sequence* tokens.
* @type {number|number[]}
* @default null
*/
eos_token_id = null;
// Generation parameters exclusive to encoder-decoder models
/**
* If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.
* @type {number}
* @default 0
*/
encoder_no_repeat_ngram_size = 0;
/**
* If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
* @type {number}
* @default null
*/
decoder_start_token_id = null;
// Wild card
/**
* Additional generation kwargs will be forwarded to the `generate` function of the model.
* Kwargs that are not present in `generate`'s signature will be used in the model forward pass.
* @type {Object}
* @default {}
*/
generation_kwargs = {};
/**
* Creates a new `GenerationConfig` from the given config object, copying only the generation properties it recognizes.
* @param {GenerationConfig|import('../configs.js').PretrainedConfig} config
*/
constructor(config) {
Object.assign(this, pick(config, Object.getOwnPropertyNames(this)));
}
}
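// A minimal usage sketch (illustrative option values; not part of this module):
//
//   const generation_config = new GenerationConfig({
//     max_new_tokens: 100,
//     do_sample: true,
//     temperature: 0.7,
//     top_k: 50,
//     unknown_option: true, // silently dropped: the constructor `pick`s only known property names
//   });
//   console.log(generation_config.temperature); // 0.7
//   console.log(generation_config.max_length);  // 20 (default retained)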