Buckets:

rtrm's picture
download
raw
36.2 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Arquitecturas Transformer&quot;,&quot;local&quot;:&quot;transformer-architectures&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Modelos encoder&quot;,&quot;local&quot;:&quot;encoder-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Modelos decoder&quot;,&quot;local&quot;:&quot;decoder-models&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Modern Large Language Models (LLMs)&quot;,&quot;local&quot;:&quot;modern-large-language-models-llms&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Capacidades clave de los LLMs modernos&quot;,&quot;local&quot;:&quot;capacidades-clave-de-los-llms-modernos&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Modelos sequence-to-sequence&quot;,&quot;local&quot;:&quot;sequence-to-sequence-models&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Aplicaciones prácticas&quot;,&quot;local&quot;:&quot;aplicaciones-prácticas&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Elegir la arquitectura adecuada&quot;,&quot;local&quot;:&quot;choosing-the-right-architecture&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;La evolución de los LLMs&quot;,&quot;local&quot;:&quot;la-evolución-de-los-llms&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Mecanismos de atención&quot;,&quot;local&quot;:&quot;attention-mechanisms&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;LSH attention&quot;,&quot;local&quot;:&quot;lsh-attention&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Local attention&quot;,&quot;local&quot;:&quot;local-attention&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Axial positional encodings&quot;,&quot;local&quot;:&quot;axial-positional-encodings&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Conclusión&quot;,&quot;local&quot;:&quot;conclusion&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1213/es/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/entry/start.36d27295.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/scheduler.505acc25.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/singletons.6865fa96.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/index.001f95d5.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/paths.ec28c642.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/entry/app.3b43d7f3.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/preload-helper.8c2bab6b.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/index.e22abd30.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/nodes/0.e2c0ea78.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/nodes/10.cd657f63.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.a144e953.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/Youtube.7545e4b1.js">
<link rel="modulepreload" href="/docs/course/pr_1213/es/_app/immutable/chunks/CourseFloatingBanner.f0a2dc21.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Arquitecturas Transformer&quot;,&quot;local&quot;:&quot;transformer-architectures&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Modelos encoder&quot;,&quot;local&quot;:&quot;encoder-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Modelos decoder&quot;,&quot;local&quot;:&quot;decoder-models&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Modern Large Language Models (LLMs)&quot;,&quot;local&quot;:&quot;modern-large-language-models-llms&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Capacidades clave de los LLMs modernos&quot;,&quot;local&quot;:&quot;capacidades-clave-de-los-llms-modernos&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Modelos sequence-to-sequence&quot;,&quot;local&quot;:&quot;sequence-to-sequence-models&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Aplicaciones prácticas&quot;,&quot;local&quot;:&quot;aplicaciones-prácticas&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Elegir la arquitectura adecuada&quot;,&quot;local&quot;:&quot;choosing-the-right-architecture&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;La evolución de los LLMs&quot;,&quot;local&quot;:&quot;la-evolución-de-los-llms&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Mecanismos de atención&quot;,&quot;local&quot;:&quot;attention-mechanisms&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;LSH attention&quot;,&quot;local&quot;:&quot;lsh-attention&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Local attention&quot;,&quot;local&quot;:&quot;local-attention&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Axial positional encodings&quot;,&quot;local&quot;:&quot;axial-positional-encodings&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Conclusión&quot;,&quot;local&quot;:&quot;conclusion&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="flex space-x-1 absolute z-10 right-0 top-0" style=""><a href="https://discuss.huggingface.co/t/chapter-1-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> </div> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="transformer-architectures" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#transformer-architectures"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Arquitecturas Transformer</span></h1> <p data-svelte-h="svelte-1f033m4">En las secciones anteriores introdujimos la arquitectura general Transformer y vimos cómo estos modelos pueden resolver diversas tareas. Ahora veremos con más detalle las tres variantes arquitectónicas principales y cuándo conviene usar cada una.</p> <blockquote class="tip" data-svelte-h="svelte-1lrdeyq"><p>Recuerda que la mayoría de modelos Transformer usan una de tres arquitecturas: solo encoder, solo decoder o encoder-decoder. Entender estas diferencias te ayudará a elegir el modelo adecuado para tu tarea.</p></blockquote> <h2 class="relative group"><a id="encoder-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#encoder-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Modelos encoder</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/MUqNwgPjJvQ" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-hxdagt">Los modelos encoder usan solo la parte encoder de la arquitectura Transformer. En cada etapa, las capas de atención pueden acceder a todas las palabras de la oración. Por eso suelen considerarse modelos con atención bidireccional y también se les llama <em>auto-encoding models</em>.</p> <p data-svelte-h="svelte-xt1dsu">Su preentrenamiento suele consistir en corromper una oración, por ejemplo enmascarando palabras al azar, y pedirle al modelo que reconstruya la secuencia original.</p> <p data-svelte-h="svelte-1nlsy5x">Son especialmente adecuados para tareas que requieren entender la oración completa, como clasificación de oraciones, reconocimiento de entidades y question answering extractivo.</p> <p data-svelte-h="svelte-15205fq">Representantes de esta familia:</p> <ul data-svelte-h="svelte-vv3q6"><li><a href="https://huggingface.co/docs/transformers/model_doc/bert" rel="nofollow">BERT</a></li> <li><a href="https://huggingface.co/docs/transformers/model_doc/distilbert" rel="nofollow">DistilBERT</a></li> <li><a href="https://huggingface.co/docs/transformers/en/model_doc/modernbert" rel="nofollow">ModernBERT</a></li></ul> <h2 class="relative group"><a id="decoder-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#decoder-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Modelos decoder</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/d_ixlCubqQw" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-1fpzzd0">Los modelos decoder usan solo la parte decoder. Para una palabra dada, las capas de atención solo pueden acceder a las palabras anteriores de la secuencia. Por eso también se llaman <em>auto-regressive models</em>.</p> <p data-svelte-h="svelte-c3d9bh">Su preentrenamiento suele basarse en predecir la siguiente palabra de la oración. Son especialmente adecuados para tareas de generación de texto.</p> <p data-svelte-h="svelte-15205fq">Representantes de esta familia:</p> <ul data-svelte-h="svelte-jhoo0"><li><a href="https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct" rel="nofollow">Hugging Face SmolLM Series</a></li> <li><a href="https://huggingface.co/docs/transformers/en/model_doc/llama4" rel="nofollow">Meta Llama Series</a></li> <li><a href="https://huggingface.co/docs/transformers/main/en/model_doc/gemma3" rel="nofollow">Google Gemma Series</a></li> <li><a href="https://huggingface.co/deepseek-ai/DeepSeek-V3" rel="nofollow">DeepSeek V3</a></li></ul> <h3 class="relative group"><a id="modern-large-language-models-llms" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#modern-large-language-models-llms"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Modern Large Language Models (LLMs)</span></h3> <p data-svelte-h="svelte-13t5h8t">La mayoría de los Large Language Models modernos usan arquitectura solo decoder. Estos modelos han crecido muchísimo en tamaño y capacidades durante los últimos años.</p> <p data-svelte-h="svelte-1xj6fiw">Normalmente se entrenan en dos fases:</p> <ol data-svelte-h="svelte-7c36cz"><li><strong>Pretraining</strong>: el modelo aprende a predecir el siguiente token sobre enormes cantidades de texto.</li> <li><strong>Instruction tuning</strong>: el modelo se ajusta para seguir instrucciones y generar respuestas útiles.</li></ol> <h4 class="relative group"><a id="capacidades-clave-de-los-llms-modernos" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#capacidades-clave-de-los-llms-modernos"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Capacidades clave de los LLMs modernos</span></h4> <table data-svelte-h="svelte-ll413a"><thead><tr><th>Capacidad</th> <th>Descripción</th> <th>Ejemplo</th></tr></thead> <tbody><tr><td>Generación de texto</td> <td>Crear texto coherente y relevante</td> <td>Redactar ensayos, historias o correos</td></tr> <tr><td>Resumen</td> <td>Condensar documentos largos</td> <td>Generar resúmenes ejecutivos</td></tr> <tr><td>Traducción</td> <td>Convertir texto entre idiomas</td> <td>Traducir de inglés a español</td></tr> <tr><td>Question answering</td> <td>Responder preguntas factuales</td> <td>“¿Cuál es la capital de Francia?”</td></tr> <tr><td>Generación de código</td> <td>Escribir o completar código</td> <td>Crear una función a partir de una descripción</td></tr> <tr><td>Razonamiento</td> <td>Resolver problemas paso a paso</td> <td>Problemas matemáticos o lógicos</td></tr> <tr><td>Few-shot learning</td> <td>Aprender a partir de unos pocos ejemplos</td> <td>Clasificar texto tras ver 2 o 3 ejemplos</td></tr></tbody></table> <p data-svelte-h="svelte-16lp98">Puedes experimentar con LLMs de tipo decoder directamente en las páginas de modelos del Hub. Un ejemplo clásico es <a href="https://huggingface.co/openai-community/gpt2" rel="nofollow">GPT-2</a>.</p> <h2 class="relative group"><a id="sequence-to-sequence-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sequence-to-sequence-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Modelos sequence-to-sequence</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/0_4KEb08xrE" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-107e6aq">Los modelos encoder-decoder, también llamados <em>sequence-to-sequence models</em>, usan ambas partes de la arquitectura Transformer. Las capas de atención del encoder pueden acceder a toda la oración de entrada, mientras que las del decoder solo pueden ver las palabras anteriores de la secuencia objetivo.</p> <p data-svelte-h="svelte-146hlo0">Su preentrenamiento puede tomar varias formas, aunque suele implicar reconstruir una oración cuya entrada ha sido corrompida de alguna manera. El preentrenamiento de T5, por ejemplo, sustituye spans aleatorios de texto por un único token de máscara y la tarea consiste en predecir ese texto faltante.</p> <p data-svelte-h="svelte-ompku">Son especialmente adecuados para tareas que consisten en generar nuevas oraciones a partir de una entrada, como resumen, traducción o question answering generativo.</p> <h3 class="relative group"><a id="aplicaciones-prácticas" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#aplicaciones-prácticas"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Aplicaciones prácticas</span></h3> <table data-svelte-h="svelte-pjdsbq"><thead><tr><th>Aplicación</th> <th>Descripción</th> <th>Modelo de ejemplo</th></tr></thead> <tbody><tr><td>Traducción automática</td> <td>Convertir texto entre idiomas</td> <td>Marian, T5</td></tr> <tr><td>Resumen automático</td> <td>Crear resúmenes concisos</td> <td>BART, T5</td></tr> <tr><td>Data-to-text generation</td> <td>Convertir datos estructurados en lenguaje natural</td> <td>T5</td></tr> <tr><td>Corrección gramatical</td> <td>Corregir errores en texto</td> <td>T5</td></tr> <tr><td>Question answering</td> <td>Generar respuestas basadas en contexto</td> <td>BART, T5</td></tr></tbody></table> <iframe src="https://course-demos-speech-to-speech-translation.hf.space" frameborder="0" width="850" height="450"></iframe> <p data-svelte-h="svelte-15205fq">Representantes de esta familia:</p> <ul data-svelte-h="svelte-sn73k4"><li><a href="https://huggingface.co/docs/transformers/model_doc/bart" rel="nofollow">BART</a></li> <li><a href="https://huggingface.co/docs/transformers/model_doc/mbart" rel="nofollow">mBART</a></li> <li><a href="https://huggingface.co/docs/transformers/model_doc/marian" rel="nofollow">Marian</a></li> <li><a href="https://huggingface.co/docs/transformers/model_doc/t5" rel="nofollow">T5</a></li></ul> <h2 class="relative group"><a id="choosing-the-right-architecture" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#choosing-the-right-architecture"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Elegir la arquitectura adecuada</span></h2> <table data-svelte-h="svelte-1t1sa1f"><thead><tr><th>Tarea</th> <th>Arquitectura sugerida</th> <th>Ejemplos</th></tr></thead> <tbody><tr><td>Clasificación de texto</td> <td>Encoder</td> <td>BERT, RoBERTa</td></tr> <tr><td>Generación de texto</td> <td>Decoder</td> <td>GPT, LLaMA</td></tr> <tr><td>Traducción</td> <td>Encoder-Decoder</td> <td>T5, BART</td></tr> <tr><td>Resumen</td> <td>Encoder-Decoder</td> <td>BART, T5</td></tr> <tr><td>Named entity recognition</td> <td>Encoder</td> <td>BERT, RoBERTa</td></tr> <tr><td>Question answering extractivo</td> <td>Encoder</td> <td>BERT, RoBERTa</td></tr> <tr><td>Question answering generativo</td> <td>Encoder-Decoder o Decoder</td> <td>T5, GPT</td></tr> <tr><td>IA conversacional</td> <td>Decoder</td> <td>GPT, LLaMA</td></tr></tbody></table> <h2 class="relative group"><a id="la-evolución-de-los-llms" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#la-evolución-de-los-llms"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>La evolución de los LLMs</span></h2> <p data-svelte-h="svelte-1eza9as">Los Large Language Models han evolucionado muy rápido en los últimos años y cada generación ha traído mejoras importantes en capacidades.</p> <h2 class="relative group"><a id="attention-mechanisms" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#attention-mechanisms"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Mecanismos de atención</span></h2> <p data-svelte-h="svelte-2pxrjt">La mayoría de modelos Transformer usan atención completa, lo que significa que la matriz de atención es cuadrada. Esto puede convertirse en un cuello de botella computacional cuando las secuencias son largas. Modelos como Longformer y Reformer intentan ser más eficientes usando versiones dispersas de esa matriz.</p> <h3 class="relative group"><a id="lsh-attention" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#lsh-attention"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>LSH attention</span></h3> <p data-svelte-h="svelte-1h2vlel"><a href="https://huggingface.co/docs/transformers/model_doc/reformer" rel="nofollow">Reformer</a> usa LSH attention. La idea general es que, en vez de atender a todas las claves posibles, solo merece la pena considerar aquellas que estén cerca de cada consulta. Para decidir qué claves están cerca se emplean funciones hash.</p> <h3 class="relative group"><a id="local-attention" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#local-attention"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Local attention</span></h3> <p data-svelte-h="svelte-nvtsph"><a href="https://huggingface.co/docs/transformers/model_doc/longformer" rel="nofollow">Longformer</a> usa local attention. A menudo, el contexto local es suficiente para tomar una decisión sobre un token concreto. Además, algunos tokens reciben atención global para que toda la secuencia pueda conectarse mejor.</p> <div class="flex justify-center" data-svelte-h="svelte-d4kpls"><img scale="50 %" align="center" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/local_attention_mask.png"></div> <h3 class="relative group"><a id="axial-positional-encodings" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#axial-positional-encodings"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Axial positional encodings</span></h3> <p data-svelte-h="svelte-3y4amz"><a href="https://huggingface.co/docs/transformers/model_doc/reformer" rel="nofollow">Reformer</a> también usa axial positional encodings. En lugar de mantener una gran matriz posicional, la factoriza en matrices más pequeñas para reducir el coste en memoria, lo que facilita trabajar con secuencias muy largas.</p> <h2 class="relative group"><a id="conclusion" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#conclusion"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Conclusión</span></h2> <p data-svelte-h="svelte-ll3toa">En esta sección exploramos las tres arquitecturas principales de Transformer y algunos mecanismos de atención especializados. Entender estas diferencias es clave para elegir el modelo adecuado para cada tarea.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/es/chapter1/6.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1nznq34 = {
assets: "/docs/course/pr_1213/es",
base: "/docs/course/pr_1213/es",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/course/pr_1213/es/_app/immutable/entry/start.36d27295.js"),
import("/docs/course/pr_1213/es/_app/immutable/entry/app.3b43d7f3.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 10],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
36.2 kB
·
Xet hash:
356708edb031222656a8e22917aa29cd9eaa3777b800e67db4b5e850163aa6e8

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.