| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Chatting with Transformers","local":"chatting-with-transformers","sections":[{"title":"Quickstart","local":"quickstart","sections":[],"depth":2},{"title":"Choosing a chat model","local":"choosing-a-chat-model","sections":[{"title":"Size and model naming","local":"size-and-model-naming","sections":[],"depth":3},{"title":"But which chat model is best?","local":"but-which-chat-model-is-best","sections":[],"depth":3},{"title":"Specialist domains","local":"specialist-domains","sections":[],"depth":3}],"depth":2},{"title":"What happens inside the pipeline?","local":"what-happens-inside-the-pipeline","sections":[],"depth":2},{"title":"Performance, memory and hardware","local":"performance-memory-and-hardware","sections":[{"title":"Memory considerations","local":"memory-considerations","sections":[],"depth":3},{"title":"Performance considerations","local":"performance-considerations","sections":[],"depth":3}],"depth":2}],"depth":1}"> | |
| <p></p> <h1 class="relative group"><a id="chatting-with-transformers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#chatting-with-transformers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 
0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Chatting with Transformers</span></h1> <p data-svelte-h="svelte-97mbo7">If you’re reading this article, you’re almost certainly aware of <strong>chat models</strong>. Chat models are conversational | |
| AIs that you can exchange messages with. The most famous of these is the proprietary ChatGPT, but there are | |
| now many open-source chat models which match or even substantially exceed its performance. These models are free to | |
| download and run on a local machine. Although the largest and most capable models require high-powered hardware | |
| and lots of memory to run, there are smaller models that will run perfectly well on a single consumer GPU, or even | |
| an ordinary desktop or notebook CPU.</p> <p data-svelte-h="svelte-11gn30l">This guide will help you get started with chat models. We’ll start with a brief quickstart guide that uses a convenient, | |
| high-level “pipeline”. This is all you need if you just want to start running a chat model | |
| immediately. After the quickstart, we’ll move on to more detailed information about | |
| what exactly chat models are, how to choose an appropriate one, and a low-level breakdown of each of the | |
| steps involved in talking to a chat model. We’ll also give some tips on optimizing the performance and memory usage | |
| of your chat models.</p> <h2 class="relative group"><a id="quickstart" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quickstart"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Quickstart</span></h2> <p data-svelte-h="svelte-ceqsyu">If you have no time for details, here’s the brief summary: Chat models continue chats. This means that you pass them | |
| a conversation history, which can be as short as a single user message, and the model will continue the conversation | |
| by adding its response. Let’s see this in action. First, let’s build a chat:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->chat = [ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Hey, can you tell me any fun things to do in New York?"</span>} | |
| ]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ewqvbr">Notice that in addition to the user’s message, we added a <strong>system</strong> message at the start of the conversation. Not all | |
| chat models support system messages, but when they do, they represent high-level directives about how the model | |
| should behave in the conversation. You can use this to guide the model - whether you want short or long responses, | |
| lighthearted or serious ones, and so on. If you want the model to do useful work instead of | |
| practicing its improv routine, you can either omit the system message or try a terse one such as “You are a helpful and intelligent | |
| AI assistant who responds to user queries.”</p> <p data-svelte-h="svelte-1whouos">Once you have a chat, the quickest way to continue it is using the <a href="/docs/transformers/pr_33913/en/main_classes/pipelines#transformers.TextGenerationPipeline">TextGenerationPipeline</a>. | |
| Let’s see this in action with <code>LLaMA-3</code>. Note that <code>LLaMA-3</code> is a gated model, which means you will need to | |
| <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct" rel="nofollow">apply for access</a> and log in with your Hugging Face | |
| account to use it. We’ll also use <code>device_map="auto"</code>, which will load the model on GPU if there’s enough memory | |
| for it, and set the dtype to <code>torch.bfloat16</code> to save memory:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline | |
| pipe = pipeline(<span class="hljs-string">"text-generation"</span>, <span class="hljs-string">"meta-llama/Meta-Llama-3-8B-Instruct"</span>, torch_dtype=torch.bfloat16, device_map=<span class="hljs-string">"auto"</span>) | |
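| <span class="hljs-comment"># device_map="auto" requires the accelerate package; it places the model on a</span> | |
| <span class="hljs-comment"># GPU when one has enough free memory, and falls back to CPU otherwise</span> | |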
| response = pipe(chat, max_new_tokens=<span class="hljs-number">512</span>) | |
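| <span class="hljs-comment"># response is a list with one entry per input; its "generated_text" field holds the</span> | |
| <span class="hljs-comment"># whole conversation (system, user, and the new assistant message), so [-1] below is the reply</span> | |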
| <span class="hljs-built_in">print</span>(response[<span class="hljs-number">0</span>][<span class="hljs-string">'generated_text'</span>][-<span class="hljs-number">1</span>][<span class="hljs-string">'content'</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15cj4lt">And you’ll get:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright, | |
| alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide! | |
| So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million | |
| things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of | |
| Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for | |
| something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got | |
| some wild stuff, like that Warhol guy's soup cans and all that jazz. | |
| And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for | |
| those pesky pigeons, they're like little feathered thieves! (laughs) Get it? Thieves? Ah, never mind. | |
| Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might | |
| even catch a glimpse of some up-and-coming comedians... or a bunch of wannabes tryin' to make it big. (winks) | |
| And finally, if you're feelin' like a real New Yorker, grab a slice of pizza from one of the many amazing | |
| pizzerias around the city. Just don't try to order a "robot-sized" slice, trust me, it won't end well. (laughs) | |
| So, there you have it, pal! That's my expert advice on what to do in New York. Now, if you'll | |
| excuse me, I've got some oil changes to attend to. (winks)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-143rayi">You can continue the chat by appending your own response to it. The | |
| <code>response</code> object returned by the pipeline actually contains the entire chat so far, so we can simply append | |
| a message and pass it back:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->chat = response[<span class="hljs-number">0</span>][<span class="hljs-string">'generated_text'</span>] | |
| chat.append( | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Wait, what's so wild about soup cans?"</span>} | |
| ) | |
| response = pipe(chat, max_new_tokens=<span class="hljs-number">512</span>) | |
| <span class="hljs-built_in">print</span>(response[<span class="hljs-number">0</span>][<span class="hljs-string">'generated_text'</span>][-<span class="hljs-number">1</span>][<span class="hljs-string">'content'</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15cj4lt">And you’ll get:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man! | |
| It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's | |
| like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!" | |
| (sarcastically) Oh, yeah, real original, Andy. | |
| But, you know, back in the '60s, it was like, a big deal. People were all about challenging the | |
| status quo, and Warhol was like, the king of that. He took the ordinary and made it extraordinary. | |
| And, let me tell you, it was like, a real game-changer. I mean, who would've thought that a can of soup could be art? (laughs) | |
| But, hey, you're not alone, pal. I mean, I'm a robot, and even I don't get it. (winks) | |
| But, hey, that's what makes art, art, right? (laughs)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-167ztte">The remainder of this tutorial will cover specific topics such | |
| as performance and memory, or how to select a chat model for your needs.</p> <h2 class="relative group"><a id="choosing-a-chat-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#choosing-a-chat-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Choosing a chat model</span></h2> <p data-svelte-h="svelte-1tkc0ve">There are an enormous number of different chat models available on the <a href="https://huggingface.co/models?pipeline_tag=text-generation&sort=trending" rel="nofollow">Hugging Face Hub</a>, | |
| and new users often feel overwhelmed by the selection on offer. Don’t be, though! You really just need to focus on | |
| two important considerations:</p> <ul data-svelte-h="svelte-t7jbp2"><li>The model’s size, which will determine if you can fit it in memory and how quickly it will | |
| run.</li> <li>The quality of the model’s chat output.</li></ul> <p data-svelte-h="svelte-43d84i">In general, these are correlated - bigger models tend to be | |
| more capable, but even so there’s a lot of variation at a given size point!</p> <h3 class="relative group"><a id="size-and-model-naming" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#size-and-model-naming"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Size and model naming</span></h3> <p data-svelte-h="svelte-zhw6jz">The size of a model is easy to spot - it’s the number in the model name, like “8B” or “70B”. This is the number of | |
| <strong>parameters</strong> in the model. Without quantization, you should expect to need about 2 bytes of memory per parameter, since models are usually loaded in a 16-bit dtype such as <code>bfloat16</code>. | |
| This means that an “8B” model with 8 billion parameters will need about 16GB of memory just to fit the parameters, | |
| plus a little extra for other overhead. It’s a good fit for a high-end consumer GPU with 24GB of memory, such as a 3090 | |
| or 4090.</p> <p data-svelte-h="svelte-33l25f">Some chat models are “Mixture of Experts” models. These may list their sizes in different ways, such as “8x7B” or | |
| “141B-A35B”. The numbers are a little fuzzier here, but in general you can read this as saying that the model | |
| has approximately 56 (8x7) billion total parameters in the first case (often slightly fewer, because the experts share some parameters), or 141 billion total parameters in the second case, of which roughly 35 billion (the “A35B”) are active for any single token.</p> <p data-svelte-h="svelte-ndw18w">Note that it is very common to use quantization techniques to reduce the memory usage per parameter to 8 bits, 4 bits, | |
| or even less. This topic is discussed in more detail in the <a href="#memory-considerations">Memory considerations</a> section below.</p> <h3 class="relative group"><a id="but-which-chat-model-is-best" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#but-which-chat-model-is-best"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>But which chat model is best?</span></h3> <p data-svelte-h="svelte-x8epta">Even once you know the size of chat model you can run, there’s still a lot of choice out there. One way to sift through | |
| it all is to consult <strong>leaderboards</strong>. Two of the most popular leaderboards are the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" rel="nofollow">Open LLM Leaderboard</a> | |
| and the <a href="https://chat.lmsys.org/?leaderboard" rel="nofollow">LMSys Chatbot Arena Leaderboard</a>. Note that the LMSys leaderboard | |
| also includes proprietary models - look at the <code>license</code> column to identify open-source ones that you can download, then | |
| search for them on the <a href="https://huggingface.co/models?pipeline_tag=text-generation&sort=trending" rel="nofollow">Hugging Face Hub</a>.</p> <h3 class="relative group"><a id="specialist-domains" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#specialist-domains"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Specialist domains</span></h3> <p data-svelte-h="svelte-zflfxi">Some models may be specialized for certain domains, such as medical or legal text, or non-English languages. | |
| If you’re working in these domains, you may find that a specialized model will give you big performance benefits. | |
| Don’t automatically assume that, though! Particularly when specialized models are smaller or older than the current | |
| cutting-edge, a top-end general-purpose model may still outclass them. Thankfully, we are beginning to see | |
| <a href="https://huggingface.co/blog/leaderboard-medicalllm" rel="nofollow">domain-specific leaderboards</a> that should make it easier to locate | |
| the best models for specialized domains.</p> <h2 class="relative group"><a id="what-happens-inside-the-pipeline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-happens-inside-the-pipeline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What happens inside the pipeline?</span></h2> <p data-svelte-h="svelte-1xurd14">The quickstart above used a high-level pipeline to chat with a chat model, which is convenient, but not the | |
| most flexible. Let’s take a lower-level approach to see each of the steps involved in chat. We’ll start with | |
| a code sample, and then break it down:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer | |
| <span class="hljs-keyword">import</span> torch | |
| <span class="hljs-comment"># Prepare the input as before</span> | |
| chat = [ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Hey, can you tell me any fun things to do in New York?"</span>} | |
| ] | |
| <span class="hljs-comment"># 1: Load the model and tokenizer</span> | |
| model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"meta-llama/Meta-Llama-3-8B-Instruct"</span>, device_map=<span class="hljs-string">"auto"</span>, torch_dtype=torch.bfloat16) | |
| tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"meta-llama/Meta-Llama-3-8B-Instruct"</span>) | |
| <span class="hljs-comment"># 2: Apply the chat template</span> | |
| formatted_chat = tokenizer.apply_chat_template(chat, tokenize=<span class="hljs-literal">False</span>, add_generation_prompt=<span class="hljs-literal">True</span>) | |
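| <span class="hljs-comment"># The template wraps each message in the model's control tokens. For Llama-3 the result</span> | |
| <span class="hljs-comment"># looks roughly like: &lt;|begin_of_text|&gt;&lt;|start_header_id|&gt;system&lt;|end_header_id|&gt; ... &lt;|eot_id|&gt;</span> | |
| <span class="hljs-comment"># (the exact format is model-specific and ships with the tokenizer)</span> | |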
| <span class="hljs-built_in">print</span>(<span class="hljs-string">"Formatted chat:\n"</span>, formatted_chat) | |
| <span class="hljs-comment"># 3: Tokenize the chat (This can be combined with the previous step using tokenize=True)</span> | |
| inputs = tokenizer(formatted_chat, return_tensors=<span class="hljs-string">"pt"</span>, add_special_tokens=<span class="hljs-literal">False</span>) | |
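| <span class="hljs-comment"># add_special_tokens=False because the chat template has already inserted any</span> | |
| <span class="hljs-comment"># special tokens (such as the beginning-of-sequence token) the model expects</span> | |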
| <span class="hljs-comment"># Move the tokenized inputs to the same device the model is on (GPU/CPU)</span> | |
| inputs = {key: tensor.to(model.device) <span class="hljs-keyword">for</span> key, tensor <span class="hljs-keyword">in</span> inputs.items()} | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">"Tokenized inputs:\n"</span>, inputs) | |
| <span class="hljs-comment"># 4: Generate text from the model</span> | |
| outputs = model.generate(**inputs, max_new_tokens=<span class="hljs-number">512</span>, do_sample=<span class="hljs-literal">True</span>, temperature=<span class="hljs-number">0.1</span>) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">"Generated tokens:\n"</span>, outputs) | |
| <span class="hljs-comment"># 5: Decode the output back to a string</span> | |
| decoded_output = tokenizer.decode(outputs[<span class="hljs-number">0</span>][inputs[<span class="hljs-string">'input_ids'</span>].size(<span class="hljs-number">1</span>):], skip_special_tokens=<span class="hljs-literal">True</span>) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">"Decoded output:\n"</span>, decoded_output)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1koeyn4">There’s a lot in here, each piece of which could be its own document! Rather than going into too much detail, I’ll cover | |
| the broad ideas, and leave the details for the linked documents. The key steps are:</p> <ol data-svelte-h="svelte-yq7jh9"><li><a href="https://huggingface.co/learn/nlp-course/en/chapter2/3" rel="nofollow">Models</a> and <a href="https://huggingface.co/learn/nlp-course/en/chapter2/4?fw=pt" rel="nofollow">Tokenizers</a> are loaded from the Hugging Face Hub.</li> <li>The chat is formatted using the tokenizer’s <a href="https://huggingface.co/docs/transformers/main/en/chat_templating" rel="nofollow">chat template</a></li> <li>The formatted chat is <a href="https://huggingface.co/learn/nlp-course/en/chapter2/4" rel="nofollow">tokenized</a> using the tokenizer.</li> <li>We <a href="https://huggingface.co/docs/transformers/en/llm_tutorial" rel="nofollow">generate</a> a response from the model.</li> <li>The tokens output by the model are decoded back to a string</li></ol> <h2 class="relative group"><a id="performance-memory-and-hardware" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#performance-memory-and-hardware"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Performance, memory and hardware</span></h2> <p 
data-svelte-h="svelte-rq1c2d">You probably know by now that most machine learning tasks are run on GPUs. However, it is entirely possible | |
| to generate text from a chat model or language model on a CPU, albeit somewhat more slowly. If you can fit | |
| the model in GPU memory, though, this will usually be the preferable option.</p> <h3 class="relative group"><a id="memory-considerations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memory-considerations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Memory considerations</span></h3> <p data-svelte-h="svelte-2ogp4z">By default, Hugging Face classes like <a href="/docs/transformers/pr_33913/en/main_classes/pipelines#transformers.TextGenerationPipeline">TextGenerationPipeline</a> or <a href="/docs/transformers/pr_33913/en/model_doc/auto#transformers.AutoModelForCausalLM">AutoModelForCausalLM</a> will load the model in | |
| <code>float32</code> precision. This means that it will need 4 bytes (32 bits) per parameter, so an “8B” model with 8 billion | |
| parameters will need ~32GB of memory. However, this can be wasteful! Most modern language models are trained in | |
| “bfloat16” precision, which uses only 2 bytes per parameter. If your hardware supports it (Nvidia 30xx/Axxx | |
| or newer), you can load the model in <code>bfloat16</code> precision, using the <code>torch_dtype</code> argument as we did above.</p> <p data-svelte-h="svelte-4pwule">It is possible to go even lower than 16 bits using “quantization”, a method to lossily compress model weights. This | |
| allows each parameter to be squeezed down to 8 bits, 4 bits or even less. Note that, especially at 4 bits, | |
| the model’s outputs may be negatively affected, but often this is a tradeoff worth making to fit a larger and more | |
| capable chat model in memory. Let’s see this in action with <code>bitsandbytes</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, BitsAndBytesConfig | |
| quantization_config = BitsAndBytesConfig(load_in_8bit=<span class="hljs-literal">True</span>) <span class="hljs-comment"># You can also try load_in_4bit</span> | |
| model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"meta-llama/Meta-Llama-3-8B-Instruct"</span>, device_map=<span class="hljs-string">"auto"</span>, quantization_config=quantization_config)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-2v297v">Or we can do the same thing using the <code>pipeline</code> API:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline, BitsAndBytesConfig | |
| quantization_config = BitsAndBytesConfig(load_in_8bit=<span class="hljs-literal">True</span>) <span class="hljs-comment"># You can also try load_in_4bit</span> | |
| pipe = pipeline(<span class="hljs-string">"text-generation"</span>, <span class="hljs-string">"meta-llama/Meta-Llama-3-8B-Instruct"</span>, device_map=<span class="hljs-string">"auto"</span>, model_kwargs={<span class="hljs-string">"quantization_config"</span>: quantization_config})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-13nw946">There are several other options for quantizing models besides <code>bitsandbytes</code> - please see the <a href="./quantization">Quantization guide</a> | |
| for more information.</p> <h3 class="relative group"><a id="performance-considerations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#performance-considerations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Performance considerations</span></h3> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1yowb4t">For a more extensive guide on language model performance and optimization, check out <a href="./llm_optims">LLM Inference Optimization</a> .</p></div> <p data-svelte-h="svelte-7p5h36">As a general rule, larger chat models will be slower in addition to requiring more memory. It’s possible to be | |
| more concrete about this, though: Generating text from a chat model is unusual in that it is bottlenecked by | |
| <strong>memory bandwidth</strong> rather than compute power, because every active parameter must be read from memory for each | |
| token that the model generates. This means that the number of tokens per second you can generate from a chat | |
| model is generally proportional to the total bandwidth of the memory it resides in, divided by the size of the model.</p> <p data-svelte-h="svelte-2b6nmw">In our quickstart example above, our model was ~16GB in size when loaded in <code>bfloat16</code> precision. | |
| This means that 16GB must be read from memory for every token generated by the model. Total memory bandwidth can | |
| vary from 20-100GB/sec for consumer CPUs, to 200-900GB/sec for consumer GPUs and specialized CPUs like | |
| Intel Xeon, AMD Threadripper/Epyc or high-end Apple silicon, and finally up to 2-3TB/sec for data center GPUs like | |
| the Nvidia A100 or H100. This should give you a good idea of the generation speed you can expect from these different | |
| hardware types.</p> <p data-svelte-h="svelte-1phhbcu">Therefore, if you want to improve the speed of text generation, the easiest solution is to either reduce the | |
| size of the model in memory (usually by quantization), or get hardware with higher memory bandwidth. For advanced users, | |
| several other techniques exist to get around this bandwidth bottleneck. The most common are variants on | |
| <a href="https://huggingface.co/blog/assisted-generation" rel="nofollow">assisted generation</a>, also known as “speculative | |
| sampling”. These techniques try to guess multiple future tokens at once, often using a smaller “draft model”, and then | |
| confirm these generations with the chat model. If the guesses are validated by the chat model, more than one token can | |
| be generated per forward pass, which greatly alleviates the bandwidth bottleneck and improves generation speed.</p> <p data-svelte-h="svelte-17sc3he">Finally, we should also note the impact of “Mixture of Experts” (MoE) models here. Several popular chat models, | |
| such as Mixtral, Qwen-MoE and DBRX, are MoE models. In these models, not every parameter is active for every token generated. | |
| As a result, MoE models generally have much lower memory bandwidth requirements, even though their total size | |
| can be quite large. They can therefore be several times faster than a normal “dense” model of the same size. However, | |
| techniques like assisted generation are generally ineffective for these models because more parameters will become | |
| active with each new speculated token, which will negate the bandwidth and speed benefits that the MoE architecture | |
| provides.</p> | |
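<p>As a sanity check on the figures in this section, the memory and bandwidth arithmetic can be sketched in a few lines of plain Python. The parameter count and bandwidth numbers below are illustrative assumptions drawn from the ranges quoted above, not measurements of any particular system:</p>

```python
# Back-of-the-envelope estimates for the memory and speed figures discussed
# above. All inputs are illustrative assumptions, not benchmarks.

def model_memory_gb(num_params: float, bytes_per_param: float) -> float:
    """Approximate memory footprint of the model weights in GB."""
    return num_params * bytes_per_param / 1e9

def max_tokens_per_sec(model_gb: float, bandwidth_gb_per_sec: float) -> float:
    """Rough upper bound on generation speed for a dense model:
    every weight must be read from memory once per generated token."""
    return bandwidth_gb_per_sec / model_gb

params_8b = 8e9
print(model_memory_gb(params_8b, 4))  # float32: 32.0 GB
print(model_memory_gb(params_8b, 2))  # bfloat16: 16.0 GB

# A consumer GPU with ~900 GB/sec of bandwidth reading a ~16GB model:
print(max_tokens_per_sec(16, 900))    # ~56 tokens/sec upper bound
```

<p>For a dense model, this bandwidth-per-token bound is usually a better first-order predictor of generation speed than raw compute; quantization helps precisely because it shrinks the number of bytes that must be streamed from memory for each token.</p>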