256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Transformer Architectures</span></h1> <p data-svelte-h="svelte-uazgoc">In the previous sections, we introduced the general Transformer architecture and explored how these models can solve various tasks. Now, let’s take a closer look at the three main architectural variants of Transformer models and understand when to use each one. Then, we looked at how those architectures are applied to different language tasks.</p> <p data-svelte-h="svelte-d6quv">In this section, we’re going to dive deeper into the three main architectural variants of Transformer models and understand when to use each one.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1xys4uz">Remember that most Transformer models use one of three architectures: encoder-only, decoder-only, or encoder-decoder (sequence-to-sequence). Understanding these differences will help you choose the right model for your specific task.</p></div> <h2 class="relative group"><a id="encoder-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#encoder-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Encoder models</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/MUqNwgPjJvQ" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-rb0n0h">Encoder models use only the encoder of a Transformer model. At each stage, the attention layers can access all the words in the initial sentence. 
These models are often characterized as having "bi-directional" attention, and are often called *auto-encoding models*.

The pretraining of these models usually revolves around somehow corrupting a given sentence (for instance, by masking random words in it) and tasking the model with finding or reconstructing the initial sentence.

Encoder models are best suited for tasks requiring an understanding of the full sentence, such as sentence classification, named entity recognition (and more generally word classification), and extractive question answering.

<Tip>

As we saw in [How 🤗 Transformers solve tasks](/chapter1/5), encoder models like BERT excel at understanding text because they can look at the entire context in both directions. This makes them perfect for tasks where comprehension of the whole input is important.

</Tip>

Representatives of this family of models include:

- [BERT](https://huggingface.co/docs/transformers/model_doc/bert)
- [DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)
- [ModernBERT](https://huggingface.co/docs/transformers/en/model_doc/modernbert)
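To make the "corrupt and reconstruct" objective concrete, here is a minimal sketch using the `fill-mask` pipeline (the checkpoint name is just one common choice; any masked language model from the Hub would work):

```python
from transformers import pipeline

# Masked language models reconstruct corrupted input: we hide one word
# and ask the model for the most likely replacements.
unmasker = pipeline("fill-mask", model="distilbert-base-uncased")

results = unmasker("Transformers are a [MASK] architecture for NLP.")
for result in results:
    print(f"{result['token_str']:>12}  (score: {result['score']:.3f})")
```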
## Decoder models

<Youtube id="d_ixlCubqQw" />

Decoder models use only the decoder of a Transformer model. At each stage, for a given word the attention layers can only access the words positioned before it in the sentence. These models are often called *auto-regressive models*.

The pretraining of decoder models usually revolves around predicting the next word in the sentence.

These models are best suited for tasks involving text generation.

<Tip>

Decoder models like GPT are designed to generate text by predicting one token at a time. As we explored in [How 🤗 Transformers solve tasks](/chapter1/5), they can only see previous tokens, which makes them excellent for creative text generation but less ideal for tasks requiring bidirectional understanding.

</Tip>

Representatives of this family of models include:

- [Hugging Face SmolLM Series](https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct)
- [Meta's Llama Series](https://huggingface.co/docs/transformers/en/model_doc/llama4)
- [Google's Gemma Series](https://huggingface.co/docs/transformers/main/en/model_doc/gemma3)
- [DeepSeek's V3](https://huggingface.co/deepseek-ai/DeepSeek-V3)
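Next-token prediction is easy to see in action with the `text-generation` pipeline; this is a minimal sketch (GPT-2 is used only because it is small and freely available):

```python
from transformers import pipeline

# Auto-regressive generation: the model repeatedly predicts the next
# token given everything generated so far.
generator = pipeline("text-generation", model="gpt2")

output = generator(
    "In this course, we will teach you how to",
    max_new_tokens=20,
    num_return_sequences=2,
    do_sample=True,
)
for sequence in output:
    print(sequence["generated_text"])
```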
### Modern Large Language Models (LLMs)

Most modern Large Language Models (LLMs) use the decoder-only architecture. These models have grown dramatically in size and capabilities over the past few years, with some of the largest models containing hundreds of billions of parameters.

Modern LLMs are typically trained in two phases:

1. **Pretraining**: The model learns to predict the next token on vast amounts of text data
2. **Instruction tuning**: The model is fine-tuned to follow instructions and generate helpful responses

This approach has led to models that can understand and generate human-like text across a wide range of topics and tasks.

#### Key capabilities of modern LLMs

Modern decoder-based LLMs have demonstrated impressive capabilities:

| Capability | Description | Example |
|---|---|---|
| Text generation | Creating coherent and contextually relevant text | Writing essays, stories, or emails |
| Summarization | Condensing long documents into shorter versions | Creating executive summaries of reports |
| Translation | Converting text between languages | Translating English to Spanish |
| Question answering | Providing answers to factual questions | "What is the capital of France?" |
| Code generation | Writing or completing code snippets | Creating a function based on a description |
| Reasoning | Working through problems step by step | Solving math problems or logical puzzles |
| Few-shot learning | Learning from a few examples in the prompt | Classifying text after seeing just 2-3 examples |
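The last capability, few-shot learning, is purely a prompting technique: you show the model a couple of solved examples in the prompt and let it continue the pattern. A minimal sketch (the task, labels, and model choice are all illustrative; any capable LLM can be used):

```python
from transformers import pipeline

generator = pipeline("text-generation", model="HuggingFaceTB/SmolLM2-1.7B-Instruct")

# Few-shot learning: the "training data" lives entirely in the prompt.
prompt = """Classify the sentiment of each review as positive or negative.

Review: The food was amazing!
Sentiment: positive

Review: I waited an hour and the order was wrong.
Sentiment: negative

Review: Great service and friendly staff.
Sentiment:"""

output = generator(prompt, max_new_tokens=3)
print(output[0]["generated_text"])  # the model should continue with "positive"
```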
You can experiment with decoder-based LLMs directly in your browser via model repo pages on the Hub. Here's an example with the classic [GPT-2](https://huggingface.co/openai-community/gpt2) (OpenAI's finest open source model!).

## Sequence-to-sequence models

<Youtube id="0_4KEb08xrE" />

Encoder-decoder models (also called *sequence-to-sequence models*) use both parts of the Transformer architecture. At each stage, the attention layers of the encoder can access all the words in the initial sentence, whereas the attention layers of the decoder can only access the words positioned before a given word in the input.

The pretraining of these models can take different forms, but it often involves reconstructing a sentence for which the input has been somehow corrupted (for instance, by masking random words). The pretraining of the T5 model consists of replacing random spans of text (which can contain several words) with a single special mask token, and the task is then to predict the text that this mask token replaces.

Sequence-to-sequence models are best suited for tasks revolving around generating new sentences depending on a given input, such as summarization, translation, or generative question answering.
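To see the encoder-decoder workflow end to end, here is a minimal sketch using T5 through the `text2text-generation` pipeline; T5 expects a task prefix such as `translate English to German:` in its input:

```python
from transformers import pipeline

# The encoder reads the full input bidirectionally; the decoder then
# generates the output sequence token by token.
translator = pipeline("text2text-generation", model="t5-small")

result = translator("translate English to German: The house is wonderful.")
print(result[0]["generated_text"])  # e.g. "Das Haus ist wunderbar."
```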
<Tip>

As we saw in [How 🤗 Transformers solve tasks](/chapter1/5), encoder-decoder models like BART and T5 combine the strengths of both architectures. The encoder provides deep bidirectional understanding of the input, while the decoder generates appropriate output text. This makes them perfect for tasks that transform one sequence into another, like translation or summarization.

</Tip>

### Practical applications

Sequence-to-sequence models excel at tasks that require transforming one form of text into another while preserving meaning. Some practical applications include:

| Application | Description | Example Model |
|---|---|---|
| Machine translation | Converting text between languages | Marian, T5 |
| Text summarization | Creating concise summaries of longer texts | BART, T5 |
| Data-to-text generation | Converting structured data into natural language | T5 |
| Grammar correction | Fixing grammatical errors in text | T5 |
| Question answering | Generating answers based on context | BART, T5 |

You can also try an interactive demo of a sequence-to-sequence model for speech-to-speech translation at [course-demos/speech-to-speech-translation](https://course-demos-speech-to-speech-translation.hf.space).

Representatives of this family of models include:

- [BART](https://huggingface.co/docs/transformers/model_doc/bart)
- [mBART](https://huggingface.co/docs/transformers/model_doc/mbart)
- [Marian](https://huggingface.co/docs/transformers/model_doc/marian)
- [T5](https://huggingface.co/docs/transformers/model_doc/t5)
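Text summarization from the applications table above follows the same pattern; a minimal sketch with BART (the checkpoint is one common choice, not the only one):

```python
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

article = (
    "The Transformer architecture was introduced in 2017 and quickly became "
    "the dominant approach in NLP. Encoder-only variants excel at understanding, "
    "decoder-only variants at generation, and encoder-decoder variants at "
    "transforming one sequence into another, such as translation or summarization."
)
summary = summarizer(article, max_length=30, min_length=10)
print(summary[0]["summary_text"])
```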
## Choosing the right architecture

When working on a specific NLP task, how do you decide which architecture to use? Here's a quick guide:

| Task | Suggested Architecture | Examples |
|---|---|---|
| Text classification (sentiment, topic) | Encoder | BERT, RoBERTa |
| Text generation (creative writing) | Decoder | GPT, LLaMA |
| Translation | Encoder-Decoder | T5, BART |
| Summarization | Encoder-Decoder | BART, T5 |
| Named entity recognition | Encoder | BERT, RoBERTa |
| Question answering (extractive) | Encoder | BERT, RoBERTa |
| Question answering (generative) | Encoder-Decoder or Decoder | T5, GPT |
| Conversational AI | Decoder | GPT, LLaMA |

<Tip>

When in doubt about which model to use, consider:

1. What kind of understanding does your task need? (Bidirectional or unidirectional)
2. Are you generating new text or analyzing existing text?
3. Do you need to transform one sequence into another?

The answers to these questions will guide you toward the right architecture, as the sketch after this tip illustrates.

</Tip>
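In 🤗 Transformers, the three architecture families map onto different `AutoModelFor...` head classes; a minimal sketch (the checkpoints are illustrative):

```python
from transformers import (
    AutoModelForMaskedLM,   # encoder-only: understanding tasks
    AutoModelForCausalLM,   # decoder-only: auto-regressive generation
    AutoModelForSeq2SeqLM,  # encoder-decoder: sequence transformation
)

# Each class loads a pretrained backbone with a task-appropriate head.
encoder = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
decoder = AutoModelForCausalLM.from_pretrained("gpt2")
seq2seq = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
```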
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Attention mechanisms</span></h2> <p data-svelte-h="svelte-1badtw">Most transformer models use full attention in the sense that the attention matrix is square. It can be a big | |
| computational bottleneck when you have long texts. Longformer and reformer are models that try to be more efficient and | |
| use a sparse version of the attention matrix to speed up training.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1yj2u5z">Standard attention mechanisms have a computational complexity of O(n²), where n is the sequence length. This becomes problematic for very long sequences. The specialized attention mechanisms below help address this limitation.</p></div> <h3 class="relative group"><a id="lsh-attention" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#lsh-attention"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>LSH attention</span></h3> <p data-svelte-h="svelte-o5n8ou"><a href="https://huggingface.co/docs/transformers/model_doc/reformer" rel="nofollow">Reformer</a> uses LSH attention. In the softmax(QK^t), only the biggest elements (in the softmax dimension) of the matrix QK^t are going to give useful contributions. So for each query q in Q, we can consider only | |
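To put numbers on that O(n²) growth, here is a quick back-of-the-envelope sketch (float32 scores, per head, per layer; actual memory use depends on implementation details):

```python
# Size of a single n-by-n attention score matrix in float32 (4 bytes/entry).
for n in (512, 4096, 32768):
    size_bytes = n * n * 4
    print(f"n={n:>6}: {size_bytes / 2**20:8.1f} MiB")

# n=   512:      1.0 MiB
# n=  4096:     64.0 MiB
# n= 32768:   4096.0 MiB  (quadratic blow-up)
```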
### LSH attention

[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer) uses LSH attention. In the softmax(QK^T), only the biggest elements (in the softmax dimension) of the matrix QK^T are going to give useful contributions. So for each query q in Q, we can consider only the keys k in K that are close to q, and a hash function is used to determine whether q and k are close. The attention mask is modified to mask the current token (except at the first position), because it would give a query and a key that are equal (and therefore very similar to each other). Since the hash can be a bit random, several hash functions are used in practice (determined by an `n_rounds` parameter) and their results are then averaged together.
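Here is a toy illustration of the bucketing idea, using random projections as the hash (a simplified sketch of angular LSH, not Reformer's actual implementation):

```python
import numpy as np

rng = np.random.default_rng(0)
n, d, n_buckets = 8, 16, 4

# Shared projections for queries and keys: vectors pointing in similar
# directions tend to land in the same bucket.
vectors = rng.normal(size=(n, d))
projections = rng.normal(size=(d, n_buckets // 2))

rotated = vectors @ projections                       # shape (n, n_buckets // 2)
buckets = np.argmax(np.concatenate([rotated, -rotated], axis=-1), axis=-1)
print(buckets)  # attention is then restricted to positions sharing a bucket
```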
### Local attention

[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer) uses local attention: often, the local context (e.g., what are the two tokens to the left and right?) is enough to take action for a given token. Also, by stacking attention layers that have a small window, the last layer will have a receptive field of more than just the tokens in the window, allowing the model to build a representation of the whole sentence.

Some preselected input tokens are also given global attention: for those few tokens, the attention matrix can access all tokens, and this process is symmetric: all other tokens have access to those specific tokens (on top of the ones in their local window). This is shown in Figure 2d of the paper; see below for a sample attention mask:

<div class="flex justify-center">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/local_attention_mask.png" alt="Sample local attention mask with a sliding window and a few global tokens"/>
</div>

Using those attention matrices with fewer parameters allows the model to handle inputs with a longer sequence length.
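A minimal sketch of building such a mask with NumPy (a window of 2 tokens on each side, plus one global token at position 0; Longformer's real mask also handles padding and dilation):

```python
import numpy as np

n, window = 8, 2  # sequence length, tokens visible on each side

# Local attention: each token sees only a small window around itself.
positions = np.arange(n)
mask = np.abs(positions[:, None] - positions[None, :]) <= window

# Global attention: token 0 attends everywhere, and everything attends to it.
mask[0, :] = True
mask[:, 0] = True

print(mask.astype(int))
```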
### Axial positional encodings

[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer) uses axial positional encodings: in traditional Transformer models, the positional encoding $E$ is a matrix of size $l \times d$, $l$ being the sequence length and $d$ the dimension of the hidden state. If you have very long texts, this matrix can be huge and take way too much space on the GPU. To alleviate that, axial positional encodings consist of factorizing that big matrix $E$ into two smaller matrices $E_1$ and $E_2$, with dimensions $l_{1} \times d_{1}$ and $l_{2} \times d_{2}$, such that $l_{1} \times l_{2} = l$ and $d_{1} + d_{2} = d$ (with the product for the lengths, this ends up being way smaller). The embedding for time step $j$ in $E$ is obtained by concatenating the embeddings for time step $j \% l_{1}$ in $E_1$ and $j // l_{1}$ in $E_2$.
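A minimal numerical sketch of the factorization (shapes only, with random values standing in for learned embeddings):

```python
import numpy as np

l1, l2, d1, d2 = 128, 512, 64, 192   # so l = 65536 and d = 256
E1 = np.random.randn(l1, d1)
E2 = np.random.randn(l2, d2)

def axial_position_embedding(j):
    # Concatenate the two factor embeddings instead of storing a full (l, d) table.
    return np.concatenate([E1[j % l1], E2[j // l1]])

print(axial_position_embedding(12345).shape)  # (256,)
# Parameters: l1*d1 + l2*d2 = 106,496 vs l*d = 16,777,216 for the full matrix.
```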
## Conclusion

In this section, we've explored the three main Transformer architectures and some specialized attention mechanisms. Understanding these architectural differences is crucial for selecting the right model for your specific NLP task.

As we move forward in the course, you'll get hands-on experience with these different architectures and learn how to fine-tune them for your specific needs. In the next section, we'll look at some of the limitations and biases present in these models that you should be aware of when deploying them.