Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / transformers /main /en /generation_strategies.html

rtrm

about 2 months ago

download

raw

89.9 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Text generation strategies","local":"text-generation-strategies","sections":[{"title":"Default text generation configuration","local":"default-text-generation-configuration","sections":[],"depth":2},{"title":"Customize text generation","local":"customize-text-generation","sections":[],"depth":2},{"title":"Save a custom decoding strategy with your model","local":"save-a-custom-decoding-strategy-with-your-model","sections":[],"depth":2},{"title":"Streaming","local":"streaming","sections":[],"depth":2},{"title":"Watermarking","local":"watermarking","sections":[],"depth":2},{"title":"Decoding strategies","local":"decoding-strategies","sections":[{"title":"Greedy Search","local":"greedy-search","sections":[],"depth":3},{"title":"Contrastive search","local":"contrastive-search","sections":[],"depth":3},{"title":"Multinomial sampling","local":"multinomial-sampling","sections":[],"depth":3},{"title":"Beam-search decoding","local":"beam-search-decoding","sections":[],"depth":3},{"title":"Beam-search multinomial sampling","local":"beam-search-multinomial-sampling","sections":[],"depth":3},{"title":"Diverse beam search decoding","local":"diverse-beam-search-decoding","sections":[],"depth":3},{"title":"Speculative Decoding","local":"speculative-decoding","sections":[],"depth":3},{"title":"DoLa Decoding","local":"dola-decoding","sections":[{"title":"Understanding the dola_layers argument","local":"understanding-the-dolalayers-argument","sections":[],"depth":4}],"depth":3}],"depth":2}],"depth":1}">
	<link href="/docs/transformers/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/entry/start.2135b7e6.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/scheduler.25b97de1.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/singletons.0f2b7d5f.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/index.e188933d.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/paths.3d04d2c6.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/entry/app.24372c84.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/index.d9030fc9.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/nodes/0.026d2fdd.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/nodes/22.88976433.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/Tip.baa67368.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/CodeBlock.e6cd0d95.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/EditOnGithub.91d95064.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Text generation strategies","local":"text-generation-strategies","sections":[{"title":"Default text generation configuration","local":"default-text-generation-configuration","sections":[],"depth":2},{"title":"Customize text generation","local":"customize-text-generation","sections":[],"depth":2},{"title":"Save a custom decoding strategy with your model","local":"save-a-custom-decoding-strategy-with-your-model","sections":[],"depth":2},{"title":"Streaming","local":"streaming","sections":[],"depth":2},{"title":"Watermarking","local":"watermarking","sections":[],"depth":2},{"title":"Decoding strategies","local":"decoding-strategies","sections":[{"title":"Greedy Search","local":"greedy-search","sections":[],"depth":3},{"title":"Contrastive search","local":"contrastive-search","sections":[],"depth":3},{"title":"Multinomial sampling","local":"multinomial-sampling","sections":[],"depth":3},{"title":"Beam-search decoding","local":"beam-search-decoding","sections":[],"depth":3},{"title":"Beam-search multinomial sampling","local":"beam-search-multinomial-sampling","sections":[],"depth":3},{"title":"Diverse beam search decoding","local":"diverse-beam-search-decoding","sections":[],"depth":3},{"title":"Speculative Decoding","local":"speculative-decoding","sections":[],"depth":3},{"title":"DoLa Decoding","local":"dola-decoding","sections":[{"title":"Understanding the dola_layers argument","local":"understanding-the-dolalayers-argument","sections":[],"depth":4}],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="text-generation-strategies" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#text-generation-strategies"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Text generation strategies</span></h1> <p data-svelte-h="svelte-1pq6r4w">Text generation is essential to many NLP tasks, such as open-ended text generation, summarization, translation, and
	more. It also plays a role in a variety of mixed-modality applications that have text as an output like speech-to-text
	and vision-to-text. Some of the models that can generate text include
	GPT2, XLNet, OpenAI GPT, CTRL, TransformerXL, XLM, Bart, T5, GIT, Whisper.</p> <p data-svelte-h="svelte-4vex4m">Check out a few examples that use <a href="/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate">generate()</a> method to produce
	text outputs for different tasks:</p> <ul data-svelte-h="svelte-18jzu0"><li><a href="./tasks/summarization#inference">Text summarization</a></li> <li><a href="./model_doc/git#transformers.GitForCausalLM.forward.example">Image captioning</a></li> <li><a href="./model_doc/whisper#transformers.WhisperForConditionalGeneration.forward.example">Audio transcription</a></li></ul> <p data-svelte-h="svelte-5iqkcx">Note that the inputs to the generate method depend on the model’s modality. They are returned by the model’s preprocessor
	class, such as AutoTokenizer or AutoProcessor. If a model’s preprocessor creates more than one kind of input, pass all
	the inputs to generate(). You can learn more about the individual model’s preprocessor in the corresponding model’s documentation.</p> <p data-svelte-h="svelte-agd87v">The process of selecting output tokens to generate text is known as decoding, and you can customize the decoding strategy
	that the <code>generate()</code> method will use. Modifying a decoding strategy does not change the values of any trainable parameters.
	However, it can have a noticeable impact on the quality of the generated output. It can help reduce repetition in the text
	and make it more coherent.</p> <p data-svelte-h="svelte-1gun7m8">This guide describes:</p> <ul data-svelte-h="svelte-l1azua"><li>default generation configuration</li> <li>common decoding strategies and their main parameters</li> <li>saving and sharing custom generation configurations with your fine-tuned model on 🤗 Hub</li></ul> <h2 class="relative group"><a id="default-text-generation-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#default-text-generation-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Default text generation configuration</span></h2> <p data-svelte-h="svelte-1pysfh4">A decoding strategy for a model is defined in its generation configuration. When using pre-trained models for inference
	within a <a href="/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline">pipeline()</a>, the models call the <code>PreTrainedModel.generate()</code> method that applies a default generation
	configuration under the hood. The default configuration is also used when no custom configuration has been saved with
	the model.</p> <p data-svelte-h="svelte-2o7gdz">When you load a model explicitly, you can inspect the generation configuration that comes with it through
	<code>model.generation_config</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM

	<span class="hljs-meta">>>> </span>model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"distilbert/distilgpt2"</span>)
	<span class="hljs-meta">>>> </span>model.generation_config
	GenerationConfig {
	<span class="hljs-string">"bos_token_id"</span>: <span class="hljs-number">50256</span>,
	<span class="hljs-string">"eos_token_id"</span>: <span class="hljs-number">50256</span>
	}
	<BLANKLINE><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-18srzjs">Printing out the <code>model.generation_config</code> reveals only the values that are different from the default generation
	configuration, and does not list any of the default values.</p> <p data-svelte-h="svelte-32rftl">The default generation configuration limits the size of the output combined with the input prompt to a maximum of 20
	tokens to avoid running into resource limitations. The default decoding strategy is greedy search, which is the simplest decoding strategy that picks a token with the highest probability as the next token. For many tasks
	and small output sizes this works well. However, when used to generate longer outputs, greedy search can start
	producing highly repetitive results.</p> <h2 class="relative group"><a id="customize-text-generation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#customize-text-generation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Customize text generation</span></h2> <p data-svelte-h="svelte-qprijs">You can override any <code>generation_config</code> by passing the parameters and their values directly to the <code>generate</code> method:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>my_model.generate(**inputs, num_beams=<span class="hljs-number">4</span>, do_sample=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-t9z639">Even if the default decoding strategy mostly works for your task, you can still tweak a few things. Some of the
	commonly adjusted parameters include:</p> <ul data-svelte-h="svelte-7xime6"><li><code>max_new_tokens</code>: the maximum number of tokens to generate. In other words, the size of the output sequence, not
	including the tokens in the prompt. As an alternative to using the output’s length as a stopping criteria, you can choose
	to stop generation whenever the full generation exceeds some amount of time. To learn more, check <a href="/docs/transformers/main/en/internal/generation_utils#transformers.StoppingCriteria">StoppingCriteria</a>.</li> <li><code>num_beams</code>: by specifying a number of beams higher than 1, you are effectively switching from greedy search to
	beam search. This strategy evaluates several hypotheses at each time step and eventually chooses the hypothesis that
	has the overall highest probability for the entire sequence. This has the advantage of identifying high-probability
	sequences that start with a lower probability initial tokens and would’ve been ignored by the greedy search. Visualize how it works <a href="https://huggingface.co/spaces/m-ric/beam_search_visualizer" rel="nofollow">here</a>.</li> <li><code>do_sample</code>: if set to <code>True</code>, this parameter enables decoding strategies such as multinomial sampling, beam-search
	multinomial sampling, Top-K sampling and Top-p sampling. All these strategies select the next token from the probability
	distribution over the entire vocabulary with various strategy-specific adjustments.</li> <li><code>num_return_sequences</code>: the number of sequence candidates to return for each input. This option is only available for
	the decoding strategies that support multiple sequence candidates, e.g. variations of beam search and sampling. Decoding
	strategies like greedy search and contrastive search return a single output sequence.</li></ul> <h2 class="relative group"><a id="save-a-custom-decoding-strategy-with-your-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#save-a-custom-decoding-strategy-with-your-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Save a custom decoding strategy with your model</span></h2> <p data-svelte-h="svelte-1l1ri2h">If you would like to share your fine-tuned model with a specific generation configuration, you can:</p> <ul data-svelte-h="svelte-nmlfgm"><li>Create a <a href="/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig">GenerationConfig</a> class instance</li> <li>Specify the decoding strategy parameters</li> <li>Save your generation configuration with <a href="/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig.save_pretrained">GenerationConfig.save_pretrained()</a>, making sure to leave its <code>config_file_name</code> argument empty</li> <li>Set <code>push_to_hub</code> to <code>True</code> to upload your config to the model’s repo</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, GenerationConfig

	<span class="hljs-meta">>>> </span>model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"my_account/my_model"</span>)
	<span class="hljs-meta">>>> </span>generation_config = GenerationConfig(
	<span class="hljs-meta">... </span> max_new_tokens=<span class="hljs-number">50</span>, do_sample=<span class="hljs-literal">True</span>, top_k=<span class="hljs-number">50</span>, eos_token_id=model.config.eos_token_id
	<span class="hljs-meta">... </span>)
	<span class="hljs-meta">>>> </span>generation_config.save_pretrained(<span class="hljs-string">"my_account/my_model"</span>, push_to_hub=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xdh3pt">You can also store several generation configurations in a single directory, making use of the <code>config_file_name</code>
	argument in <a href="/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig.save_pretrained">GenerationConfig.save_pretrained()</a>. You can later instantiate them with <a href="/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig.from_pretrained">GenerationConfig.from_pretrained()</a>. This is useful if you want to
	store several generation configurations for a single model (e.g. one for creative text generation with sampling, and
	one for summarization with beam search). You must have the right Hub permissions to add configuration files to a model.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig

	<span class="hljs-meta">>>> </span>tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"google-t5/t5-small"</span>)
	<span class="hljs-meta">>>> </span>model = AutoModelForSeq2SeqLM.from_pretrained(<span class="hljs-string">"google-t5/t5-small"</span>)

	<span class="hljs-meta">>>> </span>translation_generation_config = GenerationConfig(
	<span class="hljs-meta">... </span> num_beams=<span class="hljs-number">4</span>,
	<span class="hljs-meta">... </span> early_stopping=<span class="hljs-literal">True</span>,
	<span class="hljs-meta">... </span> decoder_start_token_id=<span class="hljs-number">0</span>,
	<span class="hljs-meta">... </span> eos_token_id=model.config.eos_token_id,
	<span class="hljs-meta">... </span> pad_token=model.config.pad_token_id,
	<span class="hljs-meta">... </span>)

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># Tip: add `push_to_hub=True` to push to the Hub</span>
	<span class="hljs-meta">>>> </span>translation_generation_config.save_pretrained(<span class="hljs-string">"/tmp"</span>, <span class="hljs-string">"translation_generation_config.json"</span>)

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># You could then use the named generation config file to parameterize generation</span>
	<span class="hljs-meta">>>> </span>generation_config = GenerationConfig.from_pretrained(<span class="hljs-string">"/tmp"</span>, <span class="hljs-string">"translation_generation_config.json"</span>)
	<span class="hljs-meta">>>> </span>inputs = tokenizer(<span class="hljs-string">"translate English to French: Configuration files are easy to use!"</span>, return_tensors=<span class="hljs-string">"pt"</span>)
	<span class="hljs-meta">>>> </span>outputs = model.generate(**inputs, generation_config=generation_config)
	<span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(tokenizer.batch_decode(outputs, skip_special_tokens=<span class="hljs-literal">True</span>))
	[<span class="hljs-string">'Les fichiers de configuration sont faciles à utiliser!'</span>]<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="streaming" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#streaming"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Streaming</span></h2> <p data-svelte-h="svelte-12rh7l9">The <code>generate()</code> supports streaming, through its <code>streamer</code> input. The <code>streamer</code> input is compatible with any instance
	from a class that has the following methods: <code>put()</code> and <code>end()</code>. Internally, <code>put()</code> is used to push new tokens and
	<code>end()</code> is used to flag the end of text generation.</p> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-gv2g1g">The API for the streamer classes is still under development and may change in the future.</p></div> <p data-svelte-h="svelte-9nc7v7">In practice, you can craft your own streaming class for all sorts of purposes! We also have basic streaming classes
	ready for you to use. For example, you can use the <a href="/docs/transformers/main/en/internal/generation_utils#transformers.TextStreamer">TextStreamer</a> class to stream the output of <code>generate()</code> into
	your screen, one word at a time:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer, TextStreamer

	<span class="hljs-meta">>>> </span>tok = AutoTokenizer.from_pretrained(<span class="hljs-string">"openai-community/gpt2"</span>)
	<span class="hljs-meta">>>> </span>model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"openai-community/gpt2"</span>)
	<span class="hljs-meta">>>> </span>inputs = tok([<span class="hljs-string">"An increasing sequence: one,"</span>], return_tensors=<span class="hljs-string">"pt"</span>)
	<span class="hljs-meta">>>> </span>streamer = TextStreamer(tok)

	<span class="hljs-meta">>>> </span><span class="hljs-comment"># Despite returning the usual output, the streamer will also print the generated text to stdout.</span>
	<span class="hljs-meta">>>> </span>_ = model.generate(**inputs, streamer=streamer, max_new_tokens=<span class="hljs-number">20</span>)
	An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="watermarking" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#watermarking"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Watermarking</span></h2> <p data-svelte-h="svelte-10ws557">The <code>generate()</code> supports watermarking the generated text by randomly marking a portion of tokens as “green”.
	When generating the “green” will have a small ‘bias’ value added to their logits, thus having a higher chance to be generated.
	The watermarked text can be detected by calculating the proportion of “green” tokens in the text and estimating how likely it is
	statistically to obtain that amount of “green” tokens for human-generated text. This watermarking strategy was proposed in the paper
	<a href="https://arxiv.org/abs/2306.04634" rel="nofollow">“On the Reliability of Watermarks for Large Language Models”</a>. For more information on
	the inner functioning of watermarking, it is recommended to refer to the paper.</p> <p data-svelte-h="svelte-5xszzd">The watermarking can be used with any generative model in <code>tranformers</code> and does not require an extra classification model
	to detect watermarked text. To trigger watermarking, pass in a <a href="/docs/transformers/main/en/main_classes/text_generation#transformers.WatermarkingConfig">WatermarkingConfig</a> with needed arguments directly to the
	<code>.generate()</code> method or add it to the <a href="/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig">GenerationConfig</a>. Watermarked text can be later detected with a <a href="/docs/transformers/main/en/internal/generation_utils#transformers.WatermarkDetector">WatermarkDetector</a>.</p> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-ge8sma">The WatermarkDetector internally relies on the proportion of “green” tokens, and whether generated text follows the coloring pattern.
	That is why it is recommended to strip off the prompt text, if it is much longer than the generated text.
	This also can have an effect when one sequence in the batch is a lot longer causing other rows to be padded.
	Additionally, the detector <strong>must</strong> be initiated with identical watermark configuration arguments used when generating.</p></div> <p data-svelte-h="svelte-ba6d9u">Let’s generate some text with watermarking. In the below code snippet, we set the bias to 2.5 which is a value that
	will be added to “green” tokens’ logits. After generating watermarked text, we can pass it directly to the <code>WatermarkDetector</code>
	to check if the text is machine-generated (outputs <code>True</code> for machine-generated and <code>False</code> otherwise).</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModelForCausalLM, WatermarkDetector, WatermarkingConfig

	<span class="hljs-meta">>>> </span>model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"openai-community/gpt2"</span>)
	<span class="hljs-meta">>>> </span>tok = AutoTokenizer.from_pretrained(<span class="hljs-string">"openai-community/gpt2"</span>)
	<span class="hljs-meta">>>> </span>tok.pad_token_id = tok.eos_token_id
	<span class="hljs-meta">>>> </span>tok.padding_side = <span class="hljs-string">"left"</span>

	<span class="hljs-meta">>>> </span>inputs = tok([<span class="hljs-string">"This is the beginning of a long story"</span>, <span class="hljs-string">"Alice and Bob are"</span>], padding=<span class="hljs-literal">True</span>, return_tensors=<span class="hljs-string">"pt"</span>)
	<span class="hljs-meta">>>> </span>input_len = inputs[<span class="hljs-string">"input_ids"</span>].shape[-<span class="hljs-number">1</span>]

	<span class="hljs-meta">>>> </span>watermarking_config = WatermarkingConfig(bias=<span class="hljs-number">2.5</span>, seeding_scheme=<span class="hljs-string">"selfhash"</span>)
	<span class="hljs-meta">>>> </span>out = model.generate(**inputs, watermarking_config=watermarking_config, do_sample=<span class="hljs-literal">False</span>, max_length=<span class="hljs-number">20</span>)

	<span class="hljs-meta">>>> </span>detector = WatermarkDetector(model_config=model.config, device=<span class="hljs-string">"cpu"</span>, watermarking_config=watermarking_config)
	<span class="hljs-meta">>>> </span>detection_out = detector(out, return_dict=<span class="hljs-literal">True</span>)
	<span class="hljs-meta">>>> </span>detection_out.prediction
	array([<span class="hljs-literal">True</span>, <span class="hljs-literal">True</span>])<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="decoding-strategies" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#decoding-strategies"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Decoding strategies</span></h2> <p data-svelte-h="svelte-dhwbam">Certain combinations of the <code>generate()</code> parameters, and ultimately <code>generation_config</code>, can be used to enable specific
	decoding strategies. If you are new to this concept, we recommend reading
	<a href="https://huggingface.co/blog/how-to-generate" rel="nofollow">this blog post that illustrates how common decoding strategies work</a>.</p> <p data-svelte-h="svelte-nugt5b">Here, we’ll show some of the parameters that control the decoding strategies and illustrate how you can use them.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-16lh8dy">Selecting a given decoding strategy is not the only way you can influence the outcome of <code>generate()</code> with your model.
	The decoding strategies act based (mostly) on the logits, the distribution of probabilities for the next token, and
	thus selecting a good logits manipulation strategy can go a long way! In other words, manipulating the logits is another
	dimension you can act upon, in addition to selecting a decoding strategy. Popular logits manipulation strategies include
	<code>top_p</code>, <code>min_p</code>, and <code>repetition_penalty</code> — you can check the full list in the <a href="/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig">GenerationConfig</a> class.</p></div> <h3 class="relative group"><a id="greedy-search" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#greedy-search"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Greedy Search</span></h3> <p data-svelte-h="svelte-1m7rj88"><code>generate</code> uses greedy search decoding by default so you don’t have to pass any parameters to enable it. This means the parameters <code>num_beams</code> is set to 1 and <code>do_sample=False</code>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer

	<span class="hljs-meta">>>> </span>prompt = <span class="hljs-string">"I look forward to"</span>
	<span class="hljs-meta">>>> </span>checkpoint = <span class="hljs-string">"distilbert/distilgpt2"</span>

	<span class="hljs-meta">>>> </span>tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	<span class="hljs-meta">>>> </span>inputs = tokenizer(prompt, return_tensors=<span class="hljs-string">"pt"</span>)

	<span class="hljs-meta">>>> </span>model = AutoModelForCausalLM.from_pretrained(checkpoint)
	<span class="hljs-meta">>>> </span>outputs = model.generate(**inputs)
	<span class="hljs-meta">>>> </span>tokenizer.batch_decode(outputs, skip_special_tokens=<span class="hljs-literal">True</span>)
	[<span class="hljs-string">'I look forward to seeing you all again!\n\n\n\n\n\n\n\n\n\n\n'</span>]<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="contrastive-search" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#contrastive-search"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Contrastive search</span></h3> <p data-svelte-h="svelte-m0y9j6">The contrastive search decoding strategy was proposed in the 2022 paper <a href="https://arxiv.org/abs/2202.06417" rel="nofollow">A Contrastive Framework for Neural Text Generation</a>.
	It demonstrates superior results for generating non-repetitive yet coherent long outputs. To learn how contrastive search
	works, check out <a href="https://huggingface.co/blog/introducing-csearch" rel="nofollow">this blog post</a>.
	The two main parameters that enable and control the behavior of contrastive search are <code>penalty_alpha</code> and <code>top_k</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModelForCausalLM

	<span class="hljs-meta">>>> </span>checkpoint = <span class="hljs-string">"openai-community/gpt2-large"</span>
	<span class="hljs-meta">>>> </span>tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	<span class="hljs-meta">>>> </span>model = AutoModelForCausalLM.from_pretrained(checkpoint)

	<span class="hljs-meta">>>> </span>prompt = <span class="hljs-string">"Hugging Face Company is"</span>
	<span class="hljs-meta">>>> </span>inputs = tokenizer(prompt, return_tensors=<span class="hljs-string">"pt"</span>)

	<span class="hljs-meta">>>> </span>outputs = model.generate(**inputs, penalty_alpha=<span class="hljs-number">0.6</span>, top_k=<span class="hljs-number">4</span>, max_new_tokens=<span class="hljs-number">100</span>)
	<span class="hljs-meta">>>> </span>tokenizer.batch_decode(outputs, skip_special_tokens=<span class="hljs-literal">True</span>)
	[<span class="hljs-string">'Hugging Face Company is a family owned and operated business. We pride ourselves on being the best
	in the business and our customer service is second to none.\n\nIf you have any questions about our
	products or services, feel free to contact us at any time. We look forward to hearing from you!'</span>]<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="multinomial-sampling" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multinomial-sampling"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Multinomial sampling</span></h3> <p data-svelte-h="svelte-vsvvis">As opposed to greedy search that always chooses a token with the highest probability as the
	next token, multinomial sampling (also called ancestral sampling) randomly selects the next token based on the probability distribution over the entire
	vocabulary given by the model. Every token with a non-zero probability has a chance of being selected, thus reducing the
	risk of repetition.</p> <p data-svelte-h="svelte-ldtxsn">To enable multinomial sampling set <code>do_sample=True</code> and <code>num_beams=1</code>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModelForCausalLM, set_seed
	<span class="hljs-meta">>>> </span>set_seed(<span class="hljs-number">0</span>) <span class="hljs-comment"># For reproducibility</span>

	<span class="hljs-meta">>>> </span>checkpoint = <span class="hljs-string">"openai-community/gpt2-large"</span>
	<span class="hljs-meta">>>> </span>tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	<span class="hljs-meta">>>> </span>model = AutoModelForCausalLM.from_pretrained(checkpoint)

	<span class="hljs-meta">>>> </span>prompt = <span class="hljs-string">"Today was an amazing day because"</span>
	<span class="hljs-meta">>>> </span>inputs = tokenizer(prompt, return_tensors=<span class="hljs-string">"pt"</span>)

	<span class="hljs-meta">>>> </span>outputs = model.generate(**inputs, do_sample=<span class="hljs-literal">True</span>, num_beams=<span class="hljs-number">1</span>, max_new_tokens=<span class="hljs-number">100</span>)
	<span class="hljs-meta">>>> </span>tokenizer.batch_decode(outputs, skip_special_tokens=<span class="hljs-literal">True</span>)
	[<span class="hljs-string">"Today was an amazing day because we received these wonderful items by the way of a gift shop. The box arrived on a Thursday and I opened it on Monday afternoon to receive the gifts. Both bags featured pieces from all the previous years!\n\nThe box had lots of surprises in it, including some sweet little mini chocolate chips! I don't think I'd eat all of these. This was definitely one of the most expensive presents I have ever got, I actually got most of them for free!\n\nThe first package came"</span>]<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="beam-search-decoding" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#beam-search-decoding"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Beam-search decoding</span></h3> <p data-svelte-h="svelte-149ek3p">Unlike greedy search, beam-search decoding keeps several hypotheses at each time step and eventually chooses
	the hypothesis that has the overall highest probability for the entire sequence. This has the advantage of identifying high-probability
	sequences that start with lower probability initial tokens and would’ve been ignored by the greedy search.</p> <a href="https://huggingface.co/spaces/m-ric/beam_search_visualizer" class="flex flex-col justify-center" data-svelte-h="svelte-1ck2qcm"><img style="max-width: 90%; margin: auto;" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/beam_search.png"></a> <p data-svelte-h="svelte-1ymxnsd">You can visualize how beam-search decoding works in <a href="https://huggingface.co/spaces/m-ric/beam_search_visualizer" rel="nofollow">this interactive demo</a>: type your input sentence, and play with the parameters to see how the decoding beams change.</p> <p data-svelte-h="svelte-krswod">To enable this decoding strategy, specify the <code>num_beams</code> (aka number of hypotheses to keep track of) that is greater than 1.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer

	<span class="hljs-meta">>>> </span>prompt = <span class="hljs-string">"It is astonishing how one can"</span>
	<span class="hljs-meta">>>> </span>checkpoint = <span class="hljs-string">"openai-community/gpt2-medium"</span>

	<span class="hljs-meta">>>> </span>tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	<span class="hljs-meta">>>> </span>inputs = tokenizer(prompt, return_tensors=<span class="hljs-string">"pt"</span>)

	<span class="hljs-meta">>>> </span>model = AutoModelForCausalLM.from_pretrained(checkpoint)

	<span class="hljs-meta">>>> </span>outputs = model.generate(**inputs, num_beams=<span class="hljs-number">5</span>, max_new_tokens=<span class="hljs-number">50</span>)
	<span class="hljs-meta">>>> </span>tokenizer.batch_decode(outputs, skip_special_tokens=<span class="hljs-literal">True</span>)
	[<span class="hljs-string">'It is astonishing how one can have such a profound impact on the lives of so many people in such a short period of
	time."\n\nHe added: "I am very proud of the work I have been able to do in the last few years.\n\n"I have'</span>]<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="beam-search-multinomial-sampling" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#beam-search-multinomial-sampling"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Beam-search multinomial sampling</span></h3> <p data-svelte-h="svelte-zgjlvh">As the name implies, this decoding strategy combines beam search with multinomial sampling. You need to specify
	the <code>num_beams</code> greater than 1, and set <code>do_sample=True</code> to use this decoding strategy.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModelForSeq2SeqLM, set_seed
	<span class="hljs-meta">>>> </span>set_seed(<span class="hljs-number">0</span>) <span class="hljs-comment"># For reproducibility</span>

	<span class="hljs-meta">>>> </span>prompt = <span class="hljs-string">"translate English to German: The house is wonderful."</span>
	<span class="hljs-meta">>>> </span>checkpoint = <span class="hljs-string">"google-t5/t5-small"</span>

	<span class="hljs-meta">>>> </span>tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	<span class="hljs-meta">>>> </span>inputs = tokenizer(prompt, return_tensors=<span class="hljs-string">"pt"</span>)

	<span class="hljs-meta">>>> </span>model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

	<span class="hljs-meta">>>> </span>outputs = model.generate(**inputs, num_beams=<span class="hljs-number">5</span>, do_sample=<span class="hljs-literal">True</span>)
	<span class="hljs-meta">>>> </span>tokenizer.decode(outputs[<span class="hljs-number">0</span>], skip_special_tokens=<span class="hljs-literal">True</span>)
	<span class="hljs-string">'Das Haus ist wunderbar.'</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="diverse-beam-search-decoding" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#diverse-beam-search-decoding"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Diverse beam search decoding</span></h3> <p data-svelte-h="svelte-zdhf65">The diverse beam search decoding strategy is an extension of the beam search strategy that allows for generating a more diverse
	set of beam sequences to choose from. To learn how it works, refer to <a href="https://arxiv.org/pdf/1610.02424.pdf" rel="nofollow">Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence Models</a>.
	This approach has three main parameters: <code>num_beams</code>, <code>num_beam_groups</code>, and <code>diversity_penalty</code>.
	The diversity penalty ensures the outputs are distinct across groups, and beam search is used within each group.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModelForSeq2SeqLM

	<span class="hljs-meta">>>> </span>checkpoint = <span class="hljs-string">"google/pegasus-xsum"</span>
	<span class="hljs-meta">>>> </span>prompt = (
	<span class="hljs-meta">... </span> <span class="hljs-string">"The Permaculture Design Principles are a set of universal design principles "</span>
	<span class="hljs-meta">... </span> <span class="hljs-string">"that can be applied to any location, climate and culture, and they allow us to design "</span>
	<span class="hljs-meta">... </span> <span class="hljs-string">"the most efficient and sustainable human habitation and food production systems. "</span>
	<span class="hljs-meta">... </span> <span class="hljs-string">"Permaculture is a design system that encompasses a wide variety of disciplines, such "</span>
	<span class="hljs-meta">... </span> <span class="hljs-string">"as ecology, landscape design, environmental science and energy conservation, and the "</span>
	<span class="hljs-meta">... </span> <span class="hljs-string">"Permaculture design principles are drawn from these various disciplines. Each individual "</span>
	<span class="hljs-meta">... </span> <span class="hljs-string">"design principle itself embodies a complete conceptual framework based on sound "</span>
	<span class="hljs-meta">... </span> <span class="hljs-string">"scientific principles. When we bring all these separate principles together, we can "</span>
	<span class="hljs-meta">... </span> <span class="hljs-string">"create a design system that both looks at whole systems, the parts that these systems "</span>
	<span class="hljs-meta">... </span> <span class="hljs-string">"consist of, and how those parts interact with each other to create a complex, dynamic, "</span>
	<span class="hljs-meta">... </span> <span class="hljs-string">"living system. Each design principle serves as a tool that allows us to integrate all "</span>
	<span class="hljs-meta">... </span> <span class="hljs-string">"the separate parts of a design, referred to as elements, into a functional, synergistic, "</span>
	<span class="hljs-meta">... </span> <span class="hljs-string">"whole system, where the elements harmoniously interact and work together in the most "</span>
	<span class="hljs-meta">... </span> <span class="hljs-string">"efficient way possible."</span>
	<span class="hljs-meta">... </span>)

	<span class="hljs-meta">>>> </span>tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	<span class="hljs-meta">>>> </span>inputs = tokenizer(prompt, return_tensors=<span class="hljs-string">"pt"</span>)

	<span class="hljs-meta">>>> </span>model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

	<span class="hljs-meta">>>> </span>outputs = model.generate(**inputs, num_beams=<span class="hljs-number">5</span>, num_beam_groups=<span class="hljs-number">5</span>, max_new_tokens=<span class="hljs-number">30</span>, diversity_penalty=<span class="hljs-number">1.0</span>)
	<span class="hljs-meta">>>> </span>tokenizer.decode(outputs[<span class="hljs-number">0</span>], skip_special_tokens=<span class="hljs-literal">True</span>)
	<span class="hljs-string">'The Design Principles are a set of universal design principles that can be applied to any location, climate and
	culture, and they allow us to design the'</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1e7mt8t">This guide illustrates the main parameters that enable various decoding strategies. More advanced parameters exist for the
	<code>generate</code> method, which gives you even further control over the <code>generate</code> method’s behavior.
	For the complete list of the available parameters, refer to the <a href="./main_classes/text_generation.md">API documentation</a>.</p> <h3 class="relative group"><a id="speculative-decoding" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#speculative-decoding"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Speculative Decoding</span></h3> <p data-svelte-h="svelte-13d2q9j">Speculative decoding (also known as assisted decoding) is a modification of the decoding strategies above, that uses an
	assistant model (ideally a much smaller one) with the same tokenizer, to generate a few candidate tokens. The main
	model then validates the candidate tokens in a single forward pass, which speeds up the decoding process. If
	<code>do_sample=True</code>, then the token validation with resampling introduced in the
	<a href="https://arxiv.org/pdf/2211.17192.pdf" rel="nofollow">speculative decoding paper</a> is used.</p> <p data-svelte-h="svelte-xy91hv">Currently, only greedy search and sampling are supported with assisted decoding, and assisted decoding doesn’t support batched inputs.
	To learn more about assisted decoding, check <a href="https://huggingface.co/blog/assisted-generation" rel="nofollow">this blog post</a>.</p> <p data-svelte-h="svelte-ebd3ly">To enable assisted decoding, set the <code>assistant_model</code> argument with a model.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer

	<span class="hljs-meta">>>> </span>prompt = <span class="hljs-string">"Alice and Bob"</span>
	<span class="hljs-meta">>>> </span>checkpoint = <span class="hljs-string">"EleutherAI/pythia-1.4b-deduped"</span>
	<span class="hljs-meta">>>> </span>assistant_checkpoint = <span class="hljs-string">"EleutherAI/pythia-160m-deduped"</span>

	<span class="hljs-meta">>>> </span>tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	<span class="hljs-meta">>>> </span>inputs = tokenizer(prompt, return_tensors=<span class="hljs-string">"pt"</span>)

	<span class="hljs-meta">>>> </span>model = AutoModelForCausalLM.from_pretrained(checkpoint)
	<span class="hljs-meta">>>> </span>assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
	<span class="hljs-meta">>>> </span>outputs = model.generate(**inputs, assistant_model=assistant_model)
	<span class="hljs-meta">>>> </span>tokenizer.batch_decode(outputs, skip_special_tokens=<span class="hljs-literal">True</span>)
	[<span class="hljs-string">'Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-nm0fxt">When using assisted decoding with sampling methods, you can use the <code>temperature</code> argument to control the randomness,
	just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer, set_seed
	<span class="hljs-meta">>>> </span>set_seed(<span class="hljs-number">42</span>) <span class="hljs-comment"># For reproducibility</span>

	<span class="hljs-meta">>>> </span>prompt = <span class="hljs-string">"Alice and Bob"</span>
	<span class="hljs-meta">>>> </span>checkpoint = <span class="hljs-string">"EleutherAI/pythia-1.4b-deduped"</span>
	<span class="hljs-meta">>>> </span>assistant_checkpoint = <span class="hljs-string">"EleutherAI/pythia-160m-deduped"</span>

	<span class="hljs-meta">>>> </span>tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	<span class="hljs-meta">>>> </span>inputs = tokenizer(prompt, return_tensors=<span class="hljs-string">"pt"</span>)

	<span class="hljs-meta">>>> </span>model = AutoModelForCausalLM.from_pretrained(checkpoint)
	<span class="hljs-meta">>>> </span>assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
	<span class="hljs-meta">>>> </span>outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=<span class="hljs-literal">True</span>, temperature=<span class="hljs-number">0.5</span>)
	<span class="hljs-meta">>>> </span>tokenizer.batch_decode(outputs, skip_special_tokens=<span class="hljs-literal">True</span>)
	[<span class="hljs-string">'Alice and Bob, a couple of friends of mine, who are both in the same office as'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-n7i3t9">Alternatively, you can also set the <code>prompt_lookup_num_tokens</code> to trigger n-gram based assisted decoding, as opposed
	to model based assisted decoding. You can read more about it <a href="https://twitter.com/joao_gante/status/1747322413006643259" rel="nofollow">here</a>.</p> <h3 class="relative group"><a id="dola-decoding" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#dola-decoding"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DoLa Decoding</span></h3> <p data-svelte-h="svelte-c1qhny"><strong>D</strong>ecoding by C<strong>o</strong>ntrasting <strong>La</strong>yers (DoLa) is a contrastive decoding strategy to improve the factuality and reduce the
	hallucinations of LLMs, as described in this paper of ICLR 2024 <a href="https://arxiv.org/abs/2309.03883" rel="nofollow">DoLa: Decoding by Contrasting Layers Improves Factuality in Large Language Models</a>.</p> <p data-svelte-h="svelte-1fok5rt">DoLa is achieved by contrasting the differences in logits obtained from final
	layers versus earlier layers, thus amplify the factual knowledge localized to particular part of transformer layers.</p> <p data-svelte-h="svelte-1xb5ka9">Do the following two steps to activate DoLa decoding when calling the <code>model.generate</code> function:</p> <ol data-svelte-h="svelte-1rdm431"><li>Set the <code>dola_layers</code> argument, which can be either a string or a list of integers.<ul><li>If set to a string, it can be one of <code>low</code>, <code>high</code>.</li> <li>If set to a list of integers, it should be a list of layer indices between 0 and the total number of layers in the model. The 0-th layer is word embedding, and the 1st layer is the first transformer layer, and so on.</li></ul></li> <li>Set <code>repetition_penalty = 1.2</code> is suggested to reduce repetition in DoLa decoding.</li></ol> <p data-svelte-h="svelte-17tp6io">See the following examples for DoLa decoding with the 32-layer LLaMA-7B model.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModelForCausalLM, set_seed
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> torch

	<span class="hljs-meta">>>> </span>tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"huggyllama/llama-7b"</span>)
	<span class="hljs-meta">>>> </span>model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"huggyllama/llama-7b"</span>, torch_dtype=torch.float16)
	<span class="hljs-meta">>>> </span>device = <span class="hljs-string">'cuda'</span> <span class="hljs-keyword">if</span> torch.cuda.is_available() <span class="hljs-keyword">else</span> <span class="hljs-string">'cpu'</span>
	<span class="hljs-meta">>>> </span>model.to(device)
	<span class="hljs-meta">>>> </span>set_seed(<span class="hljs-number">42</span>)

	<span class="hljs-meta">>>> </span>text = <span class="hljs-string">"On what date was the Declaration of Independence officially signed?"</span>
	<span class="hljs-meta">>>> </span>inputs = tokenizer(text, return_tensors=<span class="hljs-string">"pt"</span>).to(device)

	<span class="hljs-comment"># Vanilla greddy decoding</span>
	<span class="hljs-meta">>>> </span>vanilla_output = model.generate(**inputs, do_sample=<span class="hljs-literal">False</span>, max_new_tokens=<span class="hljs-number">50</span>)
	<span class="hljs-meta">>>> </span>tokenizer.batch_decode(vanilla_output[:, inputs.input_ids.shape[-<span class="hljs-number">1</span>]:], skip_special_tokens=<span class="hljs-literal">True</span>)
	[<span class="hljs-string">'\nThe Declaration of Independence was signed on July 4, 1776.\nWhat was the date of the signing of the Declaration of Independence?\nThe Declaration of Independence was signed on July 4,'</span>]

	<span class="hljs-comment"># DoLa decoding with contrasting higher part of layers (layers 16,18,...,30)</span>
	<span class="hljs-meta">>>> </span>dola_high_output = model.generate(**inputs, do_sample=<span class="hljs-literal">False</span>, max_new_tokens=<span class="hljs-number">50</span>, dola_layers=<span class="hljs-string">'high'</span>)
	<span class="hljs-meta">>>> </span>tokenizer.batch_decode(dola_high_output[:, inputs.input_ids.shape[-<span class="hljs-number">1</span>]:], skip_special_tokens=<span class="hljs-literal">True</span>)
	[<span class="hljs-string">'\nJuly 4, 1776, when the Continental Congress voted to separate from Great Britain. The 56 delegates to the Continental Congress signed the Declaration on August 2, 1776.'</span>]

	<span class="hljs-comment"># DoLa decoding with contrasting specific layers (layers 28 and 30)</span>
	<span class="hljs-meta">>>> </span>dola_custom_output = model.generate(**inputs, do_sample=<span class="hljs-literal">False</span>, max_new_tokens=<span class="hljs-number">50</span>, dola_layers=[<span class="hljs-number">28</span>,<span class="hljs-number">30</span>], repetition_penalty=<span class="hljs-number">1.2</span>)
	<span class="hljs-meta">>>> </span>tokenizer.batch_decode(dola_custom_output[:, inputs.input_ids.shape[-<span class="hljs-number">1</span>]:], skip_special_tokens=<span class="hljs-literal">True</span>)
	[<span class="hljs-string">'\nIt was officially signed on 2 August 1776, when 56 members of the Second Continental Congress, representing the original 13 American colonies, voted unanimously for the resolution for independence. The 2'</span>]<!-- HTML_TAG_END --></pre></div> <h4 class="relative group"><a id="understanding-the-dolalayers-argument" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#understanding-the-dolalayers-argument"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Understanding the dola_layers argument</span></h4> <p data-svelte-h="svelte-qgkuwa"><code>dola_layers</code> stands for the candidate layers in premature layer selection, as described in the DoLa paper. The selected premature layer will be contrasted with the final layer.</p> <p data-svelte-h="svelte-13a3h6h">Setting <code>dola_layers</code> to <code>'low'</code> or <code>'high'</code> will select the lower or higher part of the layers to contrast, respectively.</p> <ul data-svelte-h="svelte-bhw0w6"><li>For <code>N</code>-layer models with <code>N <= 40</code> layers, the layers of <code>range(0, N // 2, 2)</code> and <code>range(N // 2, N, 2)</code> are used for <code>'low'</code> and <code>'high'</code> layers, respectively.</li> <li>For models with <code>N > 40</code> layers, the layers of <code>range(0, 20, 2)</code> and <code>range(N - 20, N, 2)</code> are used for <code>'low'</code> and <code>'high'</code> layers, respectively.</li> <li>If the model has tied word embeddings, we skip the word embeddings (0-th) layer and start from the 2nd layer, as the early exit from word embeddings will become identity function.</li> <li>Set the <code>dola_layers</code> to a list of integers for layer indices to contrast manually specified layers. For example, setting <code>dola_layers=[28,30]</code> will contrast the final layer (32-th layer) with the 28-th and 30-th layers.</li></ul> <p data-svelte-h="svelte-1v5aa4g">The paper suggested that contrasting <code>'high'</code> layers to improve short-answer tasks like TruthfulQA, and contrasting <code>'low'</code> layers to improve all the other long-answer reasoning tasks, such as GSM8K, StrategyQA, FACTOR, and VicunaQA. Applying DoLa to smaller models like GPT-2 is not recommended, as the results shown in the Appendix N of the paper.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/generation_strategies.md" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	__sveltekit_1xexzbk = {
	assets: "/docs/transformers/main/en",
	base: "/docs/transformers/main/en",
	env: {}
	};

	const element = document.currentScript.parentElement;

	const data = [null,null];

	Promise.all([
	import("/docs/transformers/main/en/_app/immutable/entry/start.2135b7e6.js"),
	import("/docs/transformers/main/en/_app/immutable/entry/app.24372c84.js")
	]).then(([kit, app]) => {
	kit.start(app, element, {
	node_ids: [0, 22],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 89.9 kB
Xet hash:: 0acb02b035829f7bb92f122cc0cbc98ee78052085703a1043dfb41c0e278f5ac

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.