Buckets:

rtrm's picture
download
raw
107 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Pouvoirs spéciaux des <i> tokenizers </i> rapides&quot;,&quot;local&quot;:&quot;pouvoirs-spéciaux-des-i-tokenizers-i-rapides&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;L’objet <i> BatchEncoding </i>&quot;,&quot;local&quot;:&quot;lobjet-i-batchencoding-i&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;A l’intérieur du pipeline token-classification&quot;,&quot;local&quot;:&quot;a-lintérieur-du-pipeline-token-classification&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Obtenir les résultats de base avec le pipeline&quot;,&quot;local&quot;:&quot;obtenir-les-résultats-de-base-avec-le-pipeline&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Des entrées aux prédictions&quot;,&quot;local&quot;:&quot;des-entrées-aux-prédictions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Regroupement des entités&quot;,&quot;local&quot;:&quot;regroupement-des-entités&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1069/fr/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/entry/start.cea6db46.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/chunks/scheduler.37c15a92.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/chunks/singletons.2b29b91f.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/chunks/index.18351ede.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/chunks/paths.f6fdf97f.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/entry/app.3f6640b1.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/chunks/index.2bf4358c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/nodes/0.b777de11.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/nodes/45.8f0b945d.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/chunks/Tip.363c041f.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/chunks/Youtube.1e50a667.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/chunks/CodeBlock.4e987730.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/chunks/CourseFloatingBanner.6add7356.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/chunks/FrameworkSwitchCourse.8d4d4ab6.js">
<link rel="modulepreload" href="/docs/course/pr_1069/fr/_app/immutable/chunks/getInferenceSnippets.24b50994.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Pouvoirs spéciaux des <i> tokenizers </i> rapides&quot;,&quot;local&quot;:&quot;pouvoirs-spéciaux-des-i-tokenizers-i-rapides&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;L’objet <i> BatchEncoding </i>&quot;,&quot;local&quot;:&quot;lobjet-i-batchencoding-i&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;A l’intérieur du pipeline token-classification&quot;,&quot;local&quot;:&quot;a-lintérieur-du-pipeline-token-classification&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Obtenir les résultats de base avec le pipeline&quot;,&quot;local&quot;:&quot;obtenir-les-résultats-de-base-avec-le-pipeline&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Des entrées aux prédictions&quot;,&quot;local&quot;:&quot;des-entrées-aux-prédictions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Regroupement des entités&quot;,&quot;local&quot;:&quot;regroupement-des-entités&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="bg-white leading-none border border-gray-100 rounded-lg flex p-0.5 w-56 text-sm mb-4"><a class="flex justify-center flex-1 py-1.5 px-2.5 focus:outline-none !no-underline rounded-l bg-red-50 dark:bg-transparent text-red-600" href="?fw=pt"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><defs><clipPath id="a"><rect x="3.05" y="0.5" width="25.73" height="31" fill="none"></rect></clipPath></defs><g clip-path="url(#a)"><path d="M24.94,9.51a12.81,12.81,0,0,1,0,18.16,12.68,12.68,0,0,1-18,0,12.81,12.81,0,0,1,0-18.16l9-9V5l-.84.83-6,6a9.58,9.58,0,1,0,13.55,0ZM20.44,9a1.68,1.68,0,1,1,1.67-1.67A1.68,1.68,0,0,1,20.44,9Z" fill="#ee4c2c"></path></g></svg> Pytorch </a><a class="flex justify-center flex-1 py-1.5 px-2.5 focus:outline-none !no-underline rounded-r text-gray-500 filter grayscale" href="?fw=tf"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="0.94em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 274"><path d="M145.726 42.065v42.07l72.861 42.07v-42.07l-72.86-42.07zM0 84.135v42.07l36.43 21.03V105.17L0 84.135zm109.291 21.035l-36.43 21.034v126.2l36.43 21.035v-84.135l36.435 21.035v-42.07l-36.435-21.034V105.17z" fill="#E55B2D"></path><path d="M145.726 42.065L36.43 105.17v42.065l72.861-42.065v42.065l36.435-21.03v-84.14zM255.022 63.1l-36.435 21.035v42.07l36.435-21.035V63.1zm-72.865 84.135l-36.43 21.035v42.07l36.43-21.036v-42.07zm-36.43 63.104l-36.436-21.035v84.135l36.435-21.035V210.34z" fill="#ED8E24"></path><path d="M145.726 0L0 84.135l36.43 21.035l109.296-63.105l72.861 42.07L255.022 63.1L145.726 0zm0 126.204l-36.435 21.03l36.435 21.036l36.43-21.035l-36.43-21.03z" fill="#F8BF3C"></path></svg> TensorFlow </a></div> <h1 class="relative group"><a id="pouvoirs-spéciaux-des-i-tokenizers-i-rapides" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#pouvoirs-spéciaux-des-i-tokenizers-i-rapides"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Pouvoirs spéciaux des &lt;i> tokenizers &lt;/i> rapides</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-6-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"> </button> </div> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"> </button> </div></div> <p data-svelte-h="svelte-jdj7ex">Dans cette section, nous allons examiner de plus près les capacités des <em>tokenizers</em> dans 🤗 <em>Transformers</em>.
Jusqu’à présent, nous ne les avons utilisés que pour tokeniser les entrées ou décoder les identifiants pour revenir à du texte. Mais les <em>tokenizers</em>, et surtout ceux soutenus par la bibliothèque 🤗 <em>Tokenizers</em>, peuvent faire beaucoup plus. Pour illustrer ces fonctionnalités supplémentaires, nous allons explorer comment reproduire les résultats des pipelines <code>token-classification</code> (que nous avons appelé <code>ner</code>) et <code>question-answering</code> que nous avons rencontrés pour la première fois dans le <a href="/course/fr/chapter1">chapitre 1</a>.</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/g8quOxoqhHQ" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-6ey859">Dans la discussion qui suit, nous ferons souvent la distinction entre les <em>tokenizers</em> « lents » et les « rapides ». Les <em>tokenizers</em> lents sont ceux écrits en Python à l’intérieur de la bibliothèque 🤗 <em>Transformers</em>, tandis que les rapides sont ceux fournis par 🤗 <em>Tokenizers</em> et sont codés en Rust. Si vous vous souvenez du tableau du <a href="/course/fr/chapter5/3">chapitre 5</a> qui indiquait combien de temps il fallait à un <em>tokenizer</em> rapide et à un <em>tokenizer</em> lent pour tokeniser le jeu de données <em>Drug Review</em>, vous devriez avoir une idée de la raison pour laquelle nous les appelons rapides et lents :</p> <table data-svelte-h="svelte-18weugc"><thead><tr><th align="center"><em>Tokenizer</em> rapide</th> <th align="center"><em>Tokenizer</em> lent</th> <th align="center"></th></tr></thead> <tbody><tr><td align="center"><code>batched=True</code></td> <td align="center">10.8s</td> <td align="center">4min41s</td></tr> <tr><td align="center"><code>batched=False</code></td> <td align="center">59.2s</td> <td align="center">5min3s</td></tr></tbody></table> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-1tmecf6">⚠️ Lors de la tokenisation d’une seule phrase, vous ne verrez pas toujours une différence de vitesse entre les versions lente et rapide d’un même <em>tokenizer</em>. En fait, la version rapide peut même être plus lente ! Ce n’est que lorsque vous tokenisez beaucoup de textes en parallèle et en même temps que vous pourrez clairement voir la différence.</p></div> <h2 class="relative group"><a id="lobjet-i-batchencoding-i" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#lobjet-i-batchencoding-i"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>L’objet &lt;i> BatchEncoding &lt;/i></span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/3umI3tm27Vw" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-1nma7ee">La sortie d’un <em>tokenizer</em> n’est pas un simple dictionnaire Python. Ce que nous obtenons est en fait un objet spécial <code>BatchEncoding</code>. C’est une sous-classe d’un dictionnaire (c’est pourquoi nous avons pu indexer ce résultat sans problème auparavant), mais avec des méthodes supplémentaires qui sont principalement utilisées par les <em>tokenizers</em> rapides.</p> <p data-svelte-h="svelte-1taeug1">En plus de leurs capacités de parallélisation, la fonctionnalité clé des <em>tokenizers</em> rapides est qu’ils gardent toujours la trace de l’étendue originale des textes d’où proviennent les <em>tokens</em> finaux, une fonctionnalité que nous appelons <em>mapping offset</em>. Cela permet de débloquer des fonctionnalités telles que le mappage de chaque mot aux <em>tokens</em> qu’il a générés ou le mappage de chaque caractère du texte original au <em>token</em> qu’il contient, et vice versa.</p> <p data-svelte-h="svelte-140ai1o">Prenons un exemple :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;bert-base-cased&quot;</span>)
example = <span class="hljs-string">&quot;My name is Sylvain and I work at Hugging Face in Brooklyn.&quot;</span>
<span class="hljs-comment"># Je m&#x27;appelle Sylvain et je travaille chez Hugging Face à Brooklyn.</span>
encoding = tokenizer(example)
<span class="hljs-built_in">print</span>(<span class="hljs-built_in">type</span>(encoding))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1glbevx">Comme mentionné précédemment, nous obtenons un objet <code>BatchEncoding</code> dans la sortie du <em>tokenizer</em> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&lt;<span class="hljs-keyword">class</span> <span class="hljs-string">&#x27;transformers.tokenization_utils_base.BatchEncoding&#x27;</span>&gt;<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-iwxywi">Puisque la classe <code>AutoTokenizer</code> choisit un <em>tokenizer</em> rapide par défaut, nous pouvons utiliser les méthodes supplémentaires que cet objet <code>BatchEncoding</code> fournit. Nous avons deux façons de vérifier si notre <em>tokenizer</em> est rapide ou lent. Nous pouvons soit vérifier l’attribut <code>is_fast</code> du <em>tokenizer</em> comme suit :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.is_fast<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-literal">True</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ry6gor">soit vérifier le même attribut mais avec notre <code>encoding</code> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding.is_fast<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-literal">True</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-9kawn0">Voyons ce qu’un <em>tokenizer</em> rapide nous permet de faire. Tout d’abord, nous pouvons accéder aux <em>tokens</em> sans avoir à reconvertir les identifiants en <em>tokens</em> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding.tokens()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;[CLS]&#x27;</span>, <span class="hljs-string">&#x27;My&#x27;</span>, <span class="hljs-string">&#x27;name&#x27;</span>, <span class="hljs-string">&#x27;is&#x27;</span>, <span class="hljs-string">&#x27;S&#x27;</span>, <span class="hljs-string">&#x27;##yl&#x27;</span>, <span class="hljs-string">&#x27;##va&#x27;</span>, <span class="hljs-string">&#x27;##in&#x27;</span>, <span class="hljs-string">&#x27;and&#x27;</span>, <span class="hljs-string">&#x27;I&#x27;</span>, <span class="hljs-string">&#x27;work&#x27;</span>, <span class="hljs-string">&#x27;at&#x27;</span>, <span class="hljs-string">&#x27;Hu&#x27;</span>, <span class="hljs-string">&#x27;##gging&#x27;</span>, <span class="hljs-string">&#x27;Face&#x27;</span>, <span class="hljs-string">&#x27;in&#x27;</span>,
<span class="hljs-string">&#x27;Brooklyn&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;[SEP]&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1e2v735">Dans ce cas, le <em>token</em> à l’index 5 est <code>##yl</code> et fait partie du mot « Sylvain » dans la phrase originale. Nous pouvons également utiliser la méthode <code>word_ids()</code> pour obtenir l’index du mot dont provient chaque <em>token</em> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding.word_ids()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-literal">None</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">3</span>, <span class="hljs-number">3</span>, <span class="hljs-number">3</span>, <span class="hljs-number">3</span>, <span class="hljs-number">4</span>, <span class="hljs-number">5</span>, <span class="hljs-number">6</span>, <span class="hljs-number">7</span>, <span class="hljs-number">8</span>, <span class="hljs-number">8</span>, <span class="hljs-number">9</span>, <span class="hljs-number">10</span>, <span class="hljs-number">11</span>, <span class="hljs-number">12</span>, <span class="hljs-literal">None</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1nbwme4">On peut voir que les <em>tokens</em> spéciaux du <em>tokenizer</em>, <code>[CLS]</code> et <code>[SEP]</code>, sont mis en correspondance avec <code>None</code> et que chaque <em>token</em> est mis en correspondance avec le mot dont il provient. Ceci est particulièrement utile pour déterminer si un <em>token</em> est au début d’un mot ou si deux <em>tokens</em> sont dans le même mot. Nous pourrions nous appuyer sur le préfixe <code>##</code> pour cela, mais il ne fonctionne que pour les <em>tokenizers</em> de type BERT. Cette méthode fonctionne pour n’importe quel type de <em>tokenizer</em>, du moment qu’il est rapide. Dans le chapitre suivant, nous verrons comment utiliser cette capacité pour appliquer correctement les étiquettes que nous avons pour chaque mot aux <em>tokens</em> dans des tâches comme la reconnaissance d’entités nommées et le POS (<em>Part-of-speech</em>). Nous pouvons également l’utiliser pour masquer tous les <em>tokens</em> provenant du même mot dans la modélisation du langage masqué (une technique appelée <em>whole word masking</em>).</p> <p data-svelte-h="svelte-1b2vl2j">La notion de ce qu’est un mot est compliquée. Par exemple, est-ce que « I’ll » (contraction de « I will ») compte pour un ou deux mots ? Cela dépend en fait du <em>tokenizer</em> et de l’opération de prétokénisation qu’il applique. Certains <em>tokenizer</em> se contentent de séparer les espaces et considèrent donc qu’il s’agit d’un seul mot. D’autres utilisent la ponctuation en plus des espaces et considèrent donc qu’il s’agit de deux mots.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1yojo5w">✏️ <strong>Essayez !</strong> Créez un <em>tokenizer</em> à partir des <i>checkpoints</i> <code>bert-base-cased</code> et <code>roberta-base</code> et tokenisez « 81s » avec. Qu’observez-vous ? Quels sont les identifiants des mots ?</p></div> <p data-svelte-h="svelte-59g3j8">De même, il existe une méthode <code>sentence_ids()</code> que nous pouvons utiliser pour associer un <em>token</em> à la phrase dont il provient (bien que dans ce cas, le <code>token_type_ids</code> retourné par le <em>tokenizer</em> peut nous donner la même information).</p> <p data-svelte-h="svelte-jtpxih">Enfin, nous pouvons faire correspondre n’importe quel mot ou <em>token</em> aux caractères du texte d’origine (et vice versa) grâce aux méthodes <code>word_to_chars()</code> ou <code>token_to_chars()</code> et <code>char_to_word()</code> ou <code>char_to_token()</code>. Par exemple, la méthode <code>word_ids()</code> nous a dit que <code>##yl</code> fait partie du mot à l’indice 3, mais de quel mot s’agit-il dans la phrase ? Nous pouvons le découvrir comme ceci :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->start, end = encoding.word_to_chars(<span class="hljs-number">3</span>)
example[start:end]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Sylvain<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-fjt0yz">Comme nous l’avons mentionné précédemment, tout ceci est rendu possible par le fait que le <em>tokenizer</em> rapide garde la trace de la partie du texte d’où provient chaque <em>token</em> dans une liste d’<em>offsets</em>. Pour illustrer leur utilisation, nous allons maintenant vous montrer comment reproduire manuellement les résultats du pipeline <code>token-classification</code>.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1017mf4">✏️ <strong>Essayez !</strong> Rédigez votre propre texte et voyez si vous pouvez comprendre quels <em>tokens</em> sont associés à l’identifiant du mot et comment extraire les étendues de caractères pour un seul mot. Pour obtenir des points bonus, essayez d’utiliser deux phrases en entrée et voyez si les identifiants ont un sens pour vous.</p></div> <h2 class="relative group"><a id="a-lintérieur-du-pipeline-token-classification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#a-lintérieur-du-pipeline-token-classification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>A l’intérieur du pipeline token-classification</span></h2> <p data-svelte-h="svelte-1prnaq4">Dans le <a href="/course/fr/chapter1">chapitre 1</a>, nous avons eu un premier aperçu de la NER (où la tâche est d’identifier les parties du texte qui correspondent à des entités telles que des personnes, des lieux ou des organisations) avec la fonction <code>pipeline()</code> de 🤗 <em>Transformers</em>. Puis, dans le <a href="/course/fr/chapter2">chapitre 2</a>, nous avons vu comment un pipeline regroupe les trois étapes nécessaires pour obtenir les prédictions à partir d’un texte brut : la tokenisation, le passage des entrées dans le modèle et le post-traitement. Les deux premières étapes du pipeline de <code>token-classification</code> sont les mêmes que dans tout autre pipeline mais le post-traitement est un peu plus complexe. Voyons comment !</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/0E7ltQB7fM8" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <h3 class="relative group"><a id="obtenir-les-résultats-de-base-avec-le-pipeline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#obtenir-les-résultats-de-base-avec-le-pipeline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Obtenir les résultats de base avec le pipeline</span></h3> <p data-svelte-h="svelte-15tmk01">Tout d’abord, prenons un pipeline de classification de <em>tokens</em> afin d’obtenir des résultats à comparer manuellement. Le modèle utilisé par défaut est <a href="https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english" rel="nofollow"><code>dbmdz/bert-large-cased-finetuned-conll03-english</code></a>. Il effectue une NER sur les phrases :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline
token_classifier = pipeline(<span class="hljs-string">&quot;token-classification&quot;</span>)
token_classifier(<span class="hljs-string">&quot;My name is Sylvain and I work at Hugging Face in Brooklyn.&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-PER&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.9993828</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">4</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;S&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">11</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">12</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-PER&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.99815476</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">5</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;##yl&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">12</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">14</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-PER&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.99590725</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">6</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;##va&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">14</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">16</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-PER&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.9992327</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">7</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;##in&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">16</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">18</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-ORG&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.97389334</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">12</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Hu&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">33</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">35</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-ORG&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.976115</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">13</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;##gging&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">35</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">40</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-ORG&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.98879766</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">14</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Face&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">41</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">45</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-LOC&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.99321055</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">16</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Brooklyn&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">49</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">57</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1j2psou">Le modèle a correctement identifié chaque <em>token</em> généré par « Sylvain » comme une personne, chaque <em>token</em> généré par « Hugging Face » comme une organisation, et le <em>token</em> « Brooklyn » comme un lieu. Nous pouvons également demander au pipeline de regrouper les <em>tokens</em> qui correspondent à la même entité :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline
token_classifier = pipeline(<span class="hljs-string">&quot;token-classification&quot;</span>, aggregation_strategy=<span class="hljs-string">&quot;simple&quot;</span>)
token_classifier(<span class="hljs-string">&quot;My name is Sylvain and I work at Hugging Face in Brooklyn.&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">&#x27;entity_group&#x27;</span>: <span class="hljs-string">&#x27;PER&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.9981694</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Sylvain&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">11</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">18</span>},
{<span class="hljs-string">&#x27;entity_group&#x27;</span>: <span class="hljs-string">&#x27;ORG&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.97960204</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Hugging Face&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">33</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">45</span>},
{<span class="hljs-string">&#x27;entity_group&#x27;</span>: <span class="hljs-string">&#x27;LOC&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.99321055</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Brooklyn&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">49</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">57</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-kv9ndn">La propriété <code>aggregation_strategy</code> choisie va changer les scores calculés pour chaque entité groupée. Avec <code>&quot;simple&quot;</code> le score est juste la moyenne des scores de chaque <em>token</em> dans l’entité donnée. Par exemple, le score de « Sylvain » est la moyenne des scores que nous avons vu dans l’exemple précédent pour les tokens <code>S</code>, <code>##yl</code>, <code>##va</code>, et <code>##in</code>. D’autres stratégies sont disponibles :</p> <ul data-svelte-h="svelte-u5rx5b"><li><code>&quot;first&quot;</code>, où le score de chaque entité est le score du premier <em>token</em> de cette entité (donc pour « Sylvain » ce serait 0.993828, le score du token <code>S</code>)</li> <li><code>&quot;max&quot;</code>, où le score de chaque entité est le score maximal des <em>tokens</em> de cette entité (ainsi, pour « Hugging Face », le score de « Face » serait de 0,98879766).</li> <li><code>&quot;average&quot;</code>, où le score de chaque entité est la moyenne des scores des mots qui composent cette entité (ainsi, pour « Sylvain », il n’y aurait pas de différence avec la stratégie <code>&quot;simple&quot;</code>, mais “Hugging Face” aurait un score de 0,9819, la moyenne des scores de « Hugging », 0,975, et « Face », 0,98879).</li></ul> <p data-svelte-h="svelte-185ymn3">Voyons maintenant comment obtenir ces résultats sans utiliser la fonction <code>pipeline()</code> !</p> <h3 class="relative group"><a id="des-entrées-aux-prédictions" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#des-entrées-aux-prédictions"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Des entrées aux prédictions</span></h3> <p data-svelte-h="svelte-1mfobja">D’abord, nous devons tokeniser notre entrée et la faire passer dans le modèle. Cela se fait exactement comme dans le <a href="/course/fr/chapter2">chapitre 2</a>. Nous instancions le <em>tokenizer</em> et le modèle en utilisant les classes <code>TFAutoXxx</code> et les utilisons ensuite dans notre exemple :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModelForTokenClassification
model_checkpoint = <span class="hljs-string">&quot;dbmdz/bert-large-cased-finetuned-conll03-english&quot;</span>
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
example = <span class="hljs-string">&quot;My name is Sylvain and I work at Hugging Face in Brooklyn.&quot;</span>
inputs = tokenizer(example, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>)
outputs = model(**inputs)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-2nz59t">Puisque nous utilisons <code>AutoModelForTokenClassification</code>, nous obtenons un ensemble de logits pour chaque <em>token</em> dans la séquence d’entrée :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(inputs[<span class="hljs-string">&quot;input_ids&quot;</span>].shape)
<span class="hljs-built_in">print</span>(outputs.logits.shape)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torch.Size([<span class="hljs-number">1</span>, <span class="hljs-number">19</span>])
torch.Size([<span class="hljs-number">1</span>, <span class="hljs-number">19</span>, <span class="hljs-number">9</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15vkd2i">Nous avons un batch avec 1 séquence de 19 <em>tokens</em> et le modèle a 9 étiquettes différentes. Ainsi, la sortie du modèle a une forme de 1 x 19 x 9. Comme pour le pipeline de classification de texte, nous utilisons une fonction softmax pour convertir ces logits en probabilités et nous prenons l’argmax pour obtenir des prédictions (notez que nous pouvons prendre l’argmax sur les logits car la fonction softmax ne change pas l’ordre) :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
probabilities = torch.nn.functional.softmax(outputs.logits, dim=-<span class="hljs-number">1</span>)[<span class="hljs-number">0</span>].tolist()
predictions = outputs.logits.argmax(dim=-<span class="hljs-number">1</span>)[<span class="hljs-number">0</span>].tolist()
<span class="hljs-built_in">print</span>(predictions)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">4</span>, <span class="hljs-number">4</span>, <span class="hljs-number">4</span>, <span class="hljs-number">4</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">6</span>, <span class="hljs-number">6</span>, <span class="hljs-number">6</span>, <span class="hljs-number">0</span>, <span class="hljs-number">8</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-c469zt">L’attribut <code>model.config.id2label</code> contient la correspondance entre les index et les étiquettes que nous pouvons utiliser pour donner un sens aux prédictions :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model.config.id2label<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-number">0</span>: <span class="hljs-string">&#x27;O&#x27;</span>,
<span class="hljs-number">1</span>: <span class="hljs-string">&#x27;B-MISC&#x27;</span>,
<span class="hljs-number">2</span>: <span class="hljs-string">&#x27;I-MISC&#x27;</span>,
<span class="hljs-number">3</span>: <span class="hljs-string">&#x27;B-PER&#x27;</span>,
<span class="hljs-number">4</span>: <span class="hljs-string">&#x27;I-PER&#x27;</span>,
<span class="hljs-number">5</span>: <span class="hljs-string">&#x27;B-ORG&#x27;</span>,
<span class="hljs-number">6</span>: <span class="hljs-string">&#x27;I-ORG&#x27;</span>,
<span class="hljs-number">7</span>: <span class="hljs-string">&#x27;B-LOC&#x27;</span>,
<span class="hljs-number">8</span>: <span class="hljs-string">&#x27;I-LOC&#x27;</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-167f9wi">Comme nous l’avons vu précédemment, il y a 9 étiquettes : <code>O</code> est le label pour les <em>tokens</em> qui ne sont dans aucune entité nommée (il signifie <em>outside</em> (en dehors)) et nous avons ensuite deux labels pour chaque type d’entité (divers, personne, organisation et lieu). L’étiquette <code>B-XXX</code> indique que le <em>token</em> est au début d’une entité <code>XXX</code> et l’étiquette <code>I-XXX</code> indique que le <em>token</em> est à l’intérieur de l’entité <code>XXX</code>. Par exemple, dans l’exemple actuel, nous nous attendons à ce que notre modèle classe le <em>token</em> <code>S</code> comme <code>B-PER</code> (début d’une entité personne) et les <em>tokens</em> <code>##yl</code>, <code>##va</code> et <code>##in</code> comme <code>I-PER</code> (à l’intérieur d’une entité personne).</p> <p data-svelte-h="svelte-13eef10">Vous pourriez penser que le modèle s’est trompé ici car il a attribué l’étiquette <code>I-PER</code> à ces quatre <em>tokens</em> mais ce n’est pas tout à fait vrai. Il existe en fait deux formats pour ces étiquettes <code>B-</code> et <code>I-</code> : <em>IOB1</em> et <em>IOB2</em>. Le format IOB2 (en rose ci-dessous) est celui que nous avons introduit alors que dans le format IOB1 (en bleu), les étiquettes commençant par <code>B-</code> ne sont jamais utilisées que pour séparer deux entités adjacentes du même type. Le modèle que nous utilisons a été <em>finetuné</em> sur un jeu de données utilisant ce format, c’est pourquoi il attribue le label <code>I-PER</code> au <em>token</em> <code>S</code>.</p> <div class="flex justify-center" data-svelte-h="svelte-qf2tof"><img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter6/IOB_versions.svg" alt="IOB1 vs IOB2 format"> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter6/IOB_versions-dark.svg" alt="IOB1 vs IOB2 format"></div> <p data-svelte-h="svelte-1ax17kp">Nous sommes à présent prêts à reproduire (presque entièrement) les résultats du premier pipeline. Nous pouvons simplement récupérer le score et le label de chaque <em>token</em> qui n’a pas été classé comme <code>O</code> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->results = []
tokens = inputs.tokens()
<span class="hljs-keyword">for</span> idx, pred <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(predictions):
label = model.config.id2label[pred]
<span class="hljs-keyword">if</span> label != <span class="hljs-string">&quot;O&quot;</span>:
results.append(
{<span class="hljs-string">&quot;entity&quot;</span>: label, <span class="hljs-string">&quot;score&quot;</span>: probabilities[idx][pred], <span class="hljs-string">&quot;word&quot;</span>: tokens[idx]}
)
<span class="hljs-built_in">print</span>(results)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-PER&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.9993828</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">4</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;S&#x27;</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-PER&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.99815476</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">5</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;##yl&#x27;</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-PER&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.99590725</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">6</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;##va&#x27;</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-PER&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.9992327</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">7</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;##in&#x27;</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-ORG&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.97389334</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">12</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Hu&#x27;</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-ORG&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.976115</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">13</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;##gging&#x27;</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-ORG&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.98879766</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">14</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Face&#x27;</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-LOC&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.99321055</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">16</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Brooklyn&#x27;</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xswjms">C’est très similaire à ce que nous avions avant, à une exception près : le pipeline nous a aussi donné des informations sur le <code>début</code> et la <code>fin</code> de chaque entité dans la phrase originale. C’est là que notre <em>offset mapping</em> va entrer en jeu. Pour obtenir les <em>offsets</em>, il suffit de définir <code>return_offsets_mapping=True</code> lorsque nous appliquons le <em>tokenizer</em> à nos entrées :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->inputs_with_offsets = tokenizer(example, return_offsets_mapping=<span class="hljs-literal">True</span>)
inputs_with_offsets[<span class="hljs-string">&quot;offset_mapping&quot;</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-number">0</span>, <span class="hljs-number">0</span>), (<span class="hljs-number">0</span>, <span class="hljs-number">2</span>), (<span class="hljs-number">3</span>, <span class="hljs-number">7</span>), (<span class="hljs-number">8</span>, <span class="hljs-number">10</span>), (<span class="hljs-number">11</span>, <span class="hljs-number">12</span>), (<span class="hljs-number">12</span>, <span class="hljs-number">14</span>), (<span class="hljs-number">14</span>, <span class="hljs-number">16</span>), (<span class="hljs-number">16</span>, <span class="hljs-number">18</span>), (<span class="hljs-number">19</span>, <span class="hljs-number">22</span>), (<span class="hljs-number">23</span>, <span class="hljs-number">24</span>), (<span class="hljs-number">25</span>, <span class="hljs-number">29</span>), (<span class="hljs-number">30</span>, <span class="hljs-number">32</span>),
(<span class="hljs-number">33</span>, <span class="hljs-number">35</span>), (<span class="hljs-number">35</span>, <span class="hljs-number">40</span>), (<span class="hljs-number">41</span>, <span class="hljs-number">45</span>), (<span class="hljs-number">46</span>, <span class="hljs-number">48</span>), (<span class="hljs-number">49</span>, <span class="hljs-number">57</span>), (<span class="hljs-number">57</span>, <span class="hljs-number">58</span>), (<span class="hljs-number">0</span>, <span class="hljs-number">0</span>)]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-fu0o3u">Chaque <em>tuple</em> est l’étendue de texte correspondant à chaque <em>token</em><code>(0, 0)</code> est réservé aux <em>tokens</em> spéciaux. Nous avons vu précédemment que le <em>token</em> à l’index 5 est <code>##yl</code>, qui a <code>(12, 14)</code> comme <em>offsets</em> ici. Si on prend la tranche correspondante dans notre exemple :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->example[<span class="hljs-number">12</span>:<span class="hljs-number">14</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1k66u8i">nous obtenons le bon espace de texte sans le <code>##</code> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->yl<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1d2qwwo">En utilisant cela, nous pouvons maintenant compléter les résultats précédents :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=<span class="hljs-literal">True</span>)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets[<span class="hljs-string">&quot;offset_mapping&quot;</span>]
<span class="hljs-keyword">for</span> idx, pred <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(predictions):
label = model.config.id2label[pred]
<span class="hljs-keyword">if</span> label != <span class="hljs-string">&quot;O&quot;</span>:
start, end = offsets[idx]
results.append(
{
<span class="hljs-string">&quot;entity&quot;</span>: label,
<span class="hljs-string">&quot;score&quot;</span>: probabilities[idx][pred],
<span class="hljs-string">&quot;word&quot;</span>: tokens[idx],
<span class="hljs-string">&quot;start&quot;</span>: start,
<span class="hljs-string">&quot;end&quot;</span>: end,
}
)
<span class="hljs-built_in">print</span>(results)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-PER&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.9993828</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">4</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;S&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">11</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">12</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-PER&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.99815476</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">5</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;##yl&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">12</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">14</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-PER&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.99590725</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">6</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;##va&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">14</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">16</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-PER&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.9992327</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">7</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;##in&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">16</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">18</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-ORG&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.97389334</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">12</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Hu&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">33</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">35</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-ORG&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.976115</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">13</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;##gging&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">35</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">40</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-ORG&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.98879766</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">14</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Face&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">41</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">45</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-LOC&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.99321055</span>, <span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">16</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Brooklyn&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">49</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">57</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-rmetk">C’est la même chose que ce que nous avons obtenu avec le premier pipeline !</p> <h3 class="relative group"><a id="regroupement-des-entités" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#regroupement-des-entités"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Regroupement des entités</span></h3> <p data-svelte-h="svelte-13sczcn">L’utilisation des <em>offsets</em> pour déterminer les clés de début et de fin pour chaque entité est pratique mais cette information n’est pas strictement nécessaire. Cependant, lorsque nous voulons regrouper les entités, les <em>offsets</em> nous épargnent un batch de code compliqué. Par exemple, si nous voulions regrouper les <em>tokens</em> <code>Hu</code>, <code>##gging</code>, et <code>Face</code>, nous pourrions établir des règles spéciales disant que les deux premiers devraient être attachés tout en enlevant le <code>##</code>, et le <code>Face</code> devrait être ajouté avec un espace puisqu’il ne commence pas par <code>##</code> mais cela ne fonctionnerait que pour ce type particulier de <em>tokenizer</em>. Il faudrait écrire un autre ensemble de règles pour un <em>tokenizer</em> de type SentencePiece ou <em>Byte-Pair-Encoding</em> (voir plus loin dans ce chapitre).</p> <p data-svelte-h="svelte-1b5g9pl">Avec les <em>offsets</em>, tout ce code personnalisé disparaît : il suffit de prendre l’intervalle du texte original qui commence par le premier <em>token</em> et se termine par le dernier <em>token</em>. Ainsi, dans le cas des <em>tokens</em> <code>Hu</code>, <code>##gging</code>, et <code>Face</code>, nous devrions commencer au caractère 33 (le début de <code>Hu</code>) et finir avant le caractère 45 (la fin de <code>Face</code>) :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->example[<span class="hljs-number">33</span>:<span class="hljs-number">45</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Hugging Face<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-c8715b">Pour écrire le code qui post-traite les prédictions tout en regroupant les entités, nous regrouperons les entités qui sont consécutives et étiquetées avec <code>I-XXX</code>, à l’exception de la première, qui peut être étiquetée comme <code>B-XXX</code> ou <code>I-XXX</code> (ainsi, nous arrêtons de regrouper une entité lorsque nous obtenons un <code>O</code>, un nouveau type d’entité, ou un <code>B-XXX</code> qui nous indique qu’une entité du même type commence) :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=<span class="hljs-literal">True</span>)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets[<span class="hljs-string">&quot;offset_mapping&quot;</span>]
idx = <span class="hljs-number">0</span>
<span class="hljs-keyword">while</span> idx &lt; <span class="hljs-built_in">len</span>(predictions):
pred = predictions[idx]
label = model.config.id2label[pred]
<span class="hljs-keyword">if</span> label != <span class="hljs-string">&quot;O&quot;</span>:
<span class="hljs-comment"># Enlever le B- ou le I-</span>
label = label[<span class="hljs-number">2</span>:]
start, _ = offsets[idx]
<span class="hljs-comment"># Récupérer tous les tokens étiquetés avec I-label</span>
all_scores = []
<span class="hljs-keyword">while</span> (
idx &lt; <span class="hljs-built_in">len</span>(predictions)
<span class="hljs-keyword">and</span> model.config.id2label[predictions[idx]] == <span class="hljs-string">f&quot;I-<span class="hljs-subst">{label}</span>&quot;</span>
):
all_scores.append(probabilities[idx][pred])
_, end = offsets[idx]
idx += <span class="hljs-number">1</span>
<span class="hljs-comment"># Le score est la moyenne de tous les scores des tokens dans cette entité groupée</span>
score = np.mean(all_scores).item()
word = example[start:end]
results.append(
{
<span class="hljs-string">&quot;entity_group&quot;</span>: label,
<span class="hljs-string">&quot;score&quot;</span>: score,
<span class="hljs-string">&quot;word&quot;</span>: word,
<span class="hljs-string">&quot;start&quot;</span>: start,
<span class="hljs-string">&quot;end&quot;</span>: end,
}
)
idx += <span class="hljs-number">1</span>
<span class="hljs-built_in">print</span>(results)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1pc8lov">Et nous obtenons les mêmes résultats qu’avec notre deuxième pipeline !</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">&#x27;entity_group&#x27;</span>: <span class="hljs-string">&#x27;PER&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.9981694</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Sylvain&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">11</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">18</span>},
{<span class="hljs-string">&#x27;entity_group&#x27;</span>: <span class="hljs-string">&#x27;ORG&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.97960204</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Hugging Face&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">33</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">45</span>},
{<span class="hljs-string">&#x27;entity_group&#x27;</span>: <span class="hljs-string">&#x27;LOC&#x27;</span>, <span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.99321055</span>, <span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;Brooklyn&#x27;</span>, <span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">49</span>, <span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">57</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-m415vw">Un autre exemple de tâche où ces <em>offsets</em> sont extrêmement utiles est la réponse aux questions. Plonger dans ce pipeline, ce que nous ferons dans la section suivante, nous permettra de jeter un coup d’œil à une dernière caractéristique des <em>tokenizers</em> de la bibliothèque 🤗 <em>Transformers</em> : la gestion des <em>tokens</em> qui débordent lorsque nous tronquons une entrée à une longueur donnée.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/fr/chapter6/3.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1sfisyd = {
assets: "/docs/course/pr_1069/fr",
base: "/docs/course/pr_1069/fr",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/course/pr_1069/fr/_app/immutable/entry/start.cea6db46.js"),
import("/docs/course/pr_1069/fr/_app/immutable/entry/app.3f6640b1.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 45],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
107 kB
·
Xet hash:
9df47c3491c81e864e373f1e2757c40ed2d98b16900e48d5bed3879c1330771d

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.