Buckets:

rtrm's picture
download
raw
25.5 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;¿Cómo funcionan los Transformers?&quot;,&quot;local&quot;:&quot;how-do-transformers-work&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Un poco de historia de los Transformers&quot;,&quot;local&quot;:&quot;a-bit-of-transformer-history&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Los Transformers son modelos del lenguaje&quot;,&quot;local&quot;:&quot;transformers-are-language-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Los Transformers son modelos grandes&quot;,&quot;local&quot;:&quot;transformers-are-big-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Transfer Learning&quot;,&quot;local&quot;:&quot;transfer-learning&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Arquitectura Transformer general&quot;,&quot;local&quot;:&quot;general-transformer-architecture&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Capas de atención&quot;,&quot;local&quot;:&quot;attention-layers&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;La arquitectura original&quot;,&quot;local&quot;:&quot;the-original-architecture&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Arquitecturas frente a checkpoints&quot;,&quot;local&quot;:&quot;architecture-vs-checkpoints&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1213/es/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/entry/start.36d27295.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/scheduler.505acc25.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/singletons.6865fa96.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/index.001f95d5.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/paths.ec28c642.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/entry/app.3b43d7f3.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/preload-helper.8c2bab6b.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/index.e22abd30.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/nodes/0.e2c0ea78.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/nodes/8.6f2a1c09.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.a144e953.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/Youtube.7545e4b1.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/CourseFloatingBanner.f0a2dc21.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;¿Cómo funcionan los Transformers?&quot;,&quot;local&quot;:&quot;how-do-transformers-work&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Un poco de historia de los Transformers&quot;,&quot;local&quot;:&quot;a-bit-of-transformer-history&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Los Transformers son modelos del lenguaje&quot;,&quot;local&quot;:&quot;transformers-are-language-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Los Transformers son modelos grandes&quot;,&quot;local&quot;:&quot;transformers-are-big-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Transfer Learning&quot;,&quot;local&quot;:&quot;transfer-learning&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Arquitectura Transformer general&quot;,&quot;local&quot;:&quot;general-transformer-architecture&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Capas de atención&quot;,&quot;local&quot;:&quot;attention-layers&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;La arquitectura original&quot;,&quot;local&quot;:&quot;the-original-architecture&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Arquitecturas frente a checkpoints&quot;,&quot;local&quot;:&quot;architecture-vs-checkpoints&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="how-do-transformers-work" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-do-transformers-work"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>¿Cómo funcionan los Transformers?</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0" style=""><a href="https://discuss.huggingface.co/t/chapter-1-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> </div> <p data-svelte-h="svelte-d3nlmt">En esta sección veremos la arquitectura de los modelos Transformer y profundizaremos en conceptos como la atención, la arquitectura codificador-decodificador y otros elementos clave.</p> <blockquote class="warning" data-svelte-h="svelte-vwt4ox"><p>🚀 Aquí subimos el nivel. Esta sección es técnica y detallada, así que no te preocupes si no entiendes todo a la primera. Volveremos sobre estos conceptos más adelante en el curso.</p></blockquote> <h2 class="relative group"><a id="a-bit-of-transformer-history" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#a-bit-of-transformer-history"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Un poco de historia de los Transformers</span></h2> <p data-svelte-h="svelte-4typw8">Aquí tienes algunos hitos de la historia de los modelos Transformer:</p> <div class="flex justify-center" data-svelte-h="svelte-12ln96f"><img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter1/transformers_chrono.svg" alt="Breve cronología de los modelos Transformer."> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter1/transformers_chrono-dark.svg" alt="Breve cronología de los modelos Transformer."></div> <p data-svelte-h="svelte-4pps8r">La <a href="https://arxiv.org/abs/1706.03762" rel="nofollow">arquitectura Transformer</a> se introdujo en junio de 2017. Después llegaron varios modelos muy influyentes, entre ellos GPT, BERT, GPT-2, T5, GPT-3, InstructGPT, Llama, Mistral, Gemma 2 y SmolLM2.</p> <p data-svelte-h="svelte-qf42we">En términos generales, pueden agruparse en tres categorías:</p> <ul data-svelte-h="svelte-1fzicrs"><li>Modelos tipo GPT, también llamados modelos Transformer <em>auto-regresivos</em>.</li> <li>Modelos tipo BERT, también llamados modelos Transformer <em>auto-codificadores</em>.</li> <li>Modelos tipo T5, también llamados modelos Transformer <em>sequence-to-sequence</em>.</li></ul> <h2 class="relative group"><a id="transformers-are-language-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#transformers-are-language-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Los Transformers son modelos del lenguaje</span></h2> <p data-svelte-h="svelte-zdyjcp">Todos los modelos Transformer mencionados antes han sido entrenados como <em>modelos del lenguaje</em>. Esto significa que se han entrenado sobre grandes cantidades de texto sin procesar de forma auto-supervisada.</p> <p data-svelte-h="svelte-ewd77m">Este tipo de modelo desarrolla una comprensión estadística del lenguaje, pero resulta menos útil para tareas prácticas concretas. Por eso, el modelo preentrenado general se somete después a un proceso llamado <em>transfer learning</em> o <em>fine-tuning</em>.</p> <p data-svelte-h="svelte-pzv9ne">Otro ejemplo es <em>masked language modeling</em>, donde el modelo predice una palabra enmascarada dentro de la oración.</p> <h2 class="relative group"><a id="transformers-are-big-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#transformers-are-big-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Los Transformers son modelos grandes</span></h2> <p data-svelte-h="svelte-qdkedl">Salvo unas pocas excepciones, la estrategia general para lograr mejor rendimiento ha sido aumentar el tamaño de los modelos y la cantidad de datos usados en su preentrenamiento.</p> <div class="flex justify-center" data-svelte-h="svelte-1sm5anm"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter1/model_parameters.png" alt="Número de parámetros de modelos Transformer recientes" width="90%"></div> <p data-svelte-h="svelte-vt82kj">Por eso es tan importante compartir modelos del lenguaje: compartir los pesos ya entrenados y construir sobre ellos reduce el coste total de cómputo y la huella de carbono de la comunidad.</p> <h2 class="relative group"><a id="transfer-learning" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#transfer-learning"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Transfer Learning</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/BqqfQnyjmgg" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-6yrug1"><em>Pretraining</em> es el acto de entrenar un modelo desde cero. <em>Fine-tuning</em>, por el contrario, es el entrenamiento que se hace <strong>después</strong> de que un modelo haya sido preentrenado.</p> <p data-svelte-h="svelte-xfr9ow">El ajuste reduce los costes de tiempo, datos, dinero e impacto ambiental. Además, permite iterar más rápido sobre distintas estrategias de entrenamiento.</p> <h2 class="relative group"><a id="general-transformer-architecture" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#general-transformer-architecture"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Arquitectura Transformer general</span></h2> <p data-svelte-h="svelte-v27ln0">El modelo está compuesto principalmente por dos bloques:</p> <ul data-svelte-h="svelte-1ac1wml"><li><strong>Encoder (izquierda)</strong>: recibe una entrada y construye una representación de ella.</li> <li><strong>Decoder (derecha)</strong>: usa la representación generada por el encoder junto con otras entradas para producir una secuencia objetivo.</li></ul> <div class="flex justify-center" data-svelte-h="svelte-62bjft"><img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter1/transformers_blocks.svg" alt="Arquitectura de un modelo Transformer"> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter1/transformers_blocks-dark.svg" alt="Arquitectura de un modelo Transformer"></div> <h2 class="relative group"><a id="attention-layers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#attention-layers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Capas de atención</span></h2> <p data-svelte-h="svelte-1ng7lgo">Una característica clave de los modelos Transformer es que están construidos con capas especiales llamadas <em>attention layers</em>. Estas capas hacen que el modelo preste atención a ciertas palabras de la oración y menos a otras cuando construye la representación de cada palabra.</p> <h2 class="relative group"><a id="the-original-architecture" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-original-architecture"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>La arquitectura original</span></h2> <p data-svelte-h="svelte-y0k2r1">La arquitectura Transformer se diseñó originalmente para traducción. Durante el entrenamiento, el encoder recibe oraciones en un idioma y el decoder recibe esas mismas oraciones en el idioma objetivo.</p> <div class="flex justify-center" data-svelte-h="svelte-1bxh81z"><img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter1/transformers.svg" alt="Arquitectura de un modelo Transformer"> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter1/transformers-dark.svg" alt="Arquitectura de un modelo Transformer"></div> <h2 class="relative group"><a id="architecture-vs-checkpoints" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#architecture-vs-checkpoints"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Arquitecturas frente a checkpoints</span></h2> <ul data-svelte-h="svelte-17uzpic"><li><strong>Arquitectura</strong>: es el esqueleto del modelo.</li> <li><strong>Checkpoint</strong>: es el conjunto de pesos que se carga sobre una arquitectura dada.</li> <li><strong>Modelo</strong>: es un término paraguas menos preciso, porque puede referirse tanto a la arquitectura como al checkpoint.</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/es/chapter1/4.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1nznq34 = {
assets: "/docs/course/pr_1213/es",
base: "/docs/course/pr_1213/es",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/course/pr_1213/es/_app/immutable/entry/start.36d27295.js"),
import("/docs/course/pr_1213/es/_app/immutable/entry/app.3b43d7f3.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 8],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
25.5 kB
·
Xet hash:
b3e275ca9eee5f7af307b7e64b3e9b8dfb9e8fc0c682d9dc8e66e27c877cda2e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.