Buckets:

hf-doc-build/doc / transformers /main /zh /tasks /token_classification.html
HuggingFaceDocBuilder's picture
download
raw
72.6 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;词元分类&quot;,&quot;local&quot;:&quot;词元分类&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;加载 WNUT 17 数据集&quot;,&quot;local&quot;:&quot;加载-wnut-17-数据集&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;预处理&quot;,&quot;local&quot;:&quot;预处理&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;评估&quot;,&quot;local&quot;:&quot;评估&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;训练&quot;,&quot;local&quot;:&quot;训练&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;推断&quot;,&quot;local&quot;:&quot;推断&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/transformers/main/zh/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/entry/start.df08e735.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/chunks/scheduler.70fef3e7.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/chunks/singletons.712e839a.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/chunks/index.048d9726.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/chunks/paths.5531c515.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/entry/app.80028681.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/chunks/preload-helper.77b1b155.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/chunks/index.747f74de.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/nodes/0.1c1bc5e9.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/nodes/68.2f11f7b0.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/chunks/Tip.8a10e646.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/chunks/CopyLLMTxtMenu.25c10d36.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.04bd7ab0.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/chunks/Youtube.f401511c.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/chunks/CodeBlock.15c43204.js">
<link rel="modulepreload" href="/docs/transformers/main/zh/_app/immutable/chunks/DocNotebookDropdown.b1aa0e66.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;词元分类&quot;,&quot;local&quot;:&quot;词元分类&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;加载 WNUT 17 数据集&quot;,&quot;local&quot;:&quot;加载-wnut-17-数据集&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;预处理&quot;,&quot;local&quot;:&quot;预处理&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;评估&quot;,&quot;local&quot;:&quot;评估&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;训练&quot;,&quot;local&quot;:&quot;训练&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;推断&quot;,&quot;local&quot;:&quot;推断&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <div class="flex space-x-1 " style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"> </button> </div> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"> </button> </div></div> <h1 class="relative group"><a id="词元分类" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#词元分类"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>词元分类</span></h1> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/wVHdVlPScxA" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-1hhortx">词元分类为句子中的每个词元分配标签。最常见的词元分类任务之一是命名实体识别(NER)。NER 尝试为句子中的每个实体找到对应标签,例如人名、地名或组织名。</p> <p data-svelte-h="svelte-t113tb">本指南将向您展示如何:</p> <ol data-svelte-h="svelte-11sj114"><li><a href="https://huggingface.co/datasets/wnut_17" rel="nofollow">WNUT 17</a> 数据集上微调 <a href="https://huggingface.co/distilbert/distilbert-base-uncased" rel="nofollow">DistilBERT</a>,以检测新兴实体。</li> <li>使用微调后的模型进行推断。</li></ol> <blockquote class="tip"><p data-svelte-h="svelte-16xtzhr">如果您想查看所有与本任务兼容的架构和检查点,最好查看<a href="https://huggingface.co/tasks/token-classification" rel="nofollow">任务页</a></p></blockquote> <p data-svelte-h="svelte-tatoy7">在开始之前,请确保您已安装所有必要的库:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->pip install transformers datasets evaluate seqeval<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-10w0y68">建议您登录 Hugging Face 账户,以便将模型上传并分享给社区。在提示时,输入您的令牌进行登录:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login
<span class="hljs-meta">&gt;&gt;&gt; </span>notebook_login()<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="加载-wnut-17-数据集" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#加载-wnut-17-数据集"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>加载 WNUT 17 数据集</span></h2> <p data-svelte-h="svelte-12unm6n">首先从 🤗 Datasets 库中加载 WNUT 17 数据集:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span>wnut = load_dataset(<span class="hljs-string">&quot;wnut_17&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-k7l7gd">然后查看一个示例:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>wnut[<span class="hljs-string">&quot;train&quot;</span>][<span class="hljs-number">0</span>]
{<span class="hljs-string">&#x27;id&#x27;</span>: <span class="hljs-string">&#x27;0&#x27;</span>,
<span class="hljs-string">&#x27;ner_tags&#x27;</span>: [<span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">7</span>, <span class="hljs-number">8</span>, <span class="hljs-number">8</span>, <span class="hljs-number">0</span>, <span class="hljs-number">7</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>],
<span class="hljs-string">&#x27;tokens&#x27;</span>: [<span class="hljs-string">&#x27;@paulwalk&#x27;</span>, <span class="hljs-string">&#x27;It&#x27;</span>, <span class="hljs-string">&quot;&#x27;s&quot;</span>, <span class="hljs-string">&#x27;the&#x27;</span>, <span class="hljs-string">&#x27;view&#x27;</span>, <span class="hljs-string">&#x27;from&#x27;</span>, <span class="hljs-string">&#x27;where&#x27;</span>, <span class="hljs-string">&#x27;I&#x27;</span>, <span class="hljs-string">&quot;&#x27;m&quot;</span>, <span class="hljs-string">&#x27;living&#x27;</span>, <span class="hljs-string">&#x27;for&#x27;</span>, <span class="hljs-string">&#x27;two&#x27;</span>, <span class="hljs-string">&#x27;weeks&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;Empire&#x27;</span>, <span class="hljs-string">&#x27;State&#x27;</span>, <span class="hljs-string">&#x27;Building&#x27;</span>, <span class="hljs-string">&#x27;=&#x27;</span>, <span class="hljs-string">&#x27;ESB&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;Pretty&#x27;</span>, <span class="hljs-string">&#x27;bad&#x27;</span>, <span class="hljs-string">&#x27;storm&#x27;</span>, <span class="hljs-string">&#x27;here&#x27;</span>, <span class="hljs-string">&#x27;last&#x27;</span>, <span class="hljs-string">&#x27;evening&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>]
}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-14t1nat"><code>ner_tags</code> 中的每个数字代表一个实体。将数字转换为标签名称,以了解实体类型:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>label_list = wnut[<span class="hljs-string">&quot;train&quot;</span>].features[<span class="hljs-string">f&quot;ner_tags&quot;</span>].feature.names
<span class="hljs-meta">&gt;&gt;&gt; </span>label_list
[
<span class="hljs-string">&quot;O&quot;</span>,
<span class="hljs-string">&quot;B-corporation&quot;</span>,
<span class="hljs-string">&quot;I-corporation&quot;</span>,
<span class="hljs-string">&quot;B-creative-work&quot;</span>,
<span class="hljs-string">&quot;I-creative-work&quot;</span>,
<span class="hljs-string">&quot;B-group&quot;</span>,
<span class="hljs-string">&quot;I-group&quot;</span>,
<span class="hljs-string">&quot;B-location&quot;</span>,
<span class="hljs-string">&quot;I-location&quot;</span>,
<span class="hljs-string">&quot;B-person&quot;</span>,
<span class="hljs-string">&quot;I-person&quot;</span>,
<span class="hljs-string">&quot;B-product&quot;</span>,
<span class="hljs-string">&quot;I-product&quot;</span>,
]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-5ytdri">每个 <code>ner_tag</code> 的前缀字母表示实体中词元的位置:</p> <ul data-svelte-h="svelte-1oyn1pk"><li><code>B-</code> 表示实体的开始。</li> <li><code>I-</code> 表示词元包含在同一实体中(例如,<code>State</code> 词元是 <code>Empire State Building</code> 等实体的一部分)。</li> <li><code>0</code> 表示该词元不对应任何实体。</li></ul> <h2 class="relative group"><a id="预处理" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#预处理"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>预处理</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/iY2AZYdZAr0" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-liem7g">下一步是加载 DistilBERT 分词器,对 <code>tokens</code> 字段进行预处理:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
<span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;distilbert/distilbert-base-uncased&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1uikd3t">如上面示例的 <code>tokens</code> 字段所示,看起来输入已经完成了分词。但实际上输入尚未分词,您需要设置 <code>is_split_into_words=True</code> 将词语分词为子词。例如:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>example = wnut[<span class="hljs-string">&quot;train&quot;</span>][<span class="hljs-number">0</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>tokenized_input = tokenizer(example[<span class="hljs-string">&quot;tokens&quot;</span>], is_split_into_words=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>tokens = tokenizer.convert_ids_to_tokens(tokenized_input[<span class="hljs-string">&quot;input_ids&quot;</span>])
<span class="hljs-meta">&gt;&gt;&gt; </span>tokens
[<span class="hljs-string">&#x27;[CLS]&#x27;</span>, <span class="hljs-string">&#x27;@&#x27;</span>, <span class="hljs-string">&#x27;paul&#x27;</span>, <span class="hljs-string">&#x27;##walk&#x27;</span>, <span class="hljs-string">&#x27;it&#x27;</span>, <span class="hljs-string">&quot;&#x27;&quot;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;the&#x27;</span>, <span class="hljs-string">&#x27;view&#x27;</span>, <span class="hljs-string">&#x27;from&#x27;</span>, <span class="hljs-string">&#x27;where&#x27;</span>, <span class="hljs-string">&#x27;i&#x27;</span>, <span class="hljs-string">&quot;&#x27;&quot;</span>, <span class="hljs-string">&#x27;m&#x27;</span>, <span class="hljs-string">&#x27;living&#x27;</span>, <span class="hljs-string">&#x27;for&#x27;</span>, <span class="hljs-string">&#x27;two&#x27;</span>, <span class="hljs-string">&#x27;weeks&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;empire&#x27;</span>, <span class="hljs-string">&#x27;state&#x27;</span>, <span class="hljs-string">&#x27;building&#x27;</span>, <span class="hljs-string">&#x27;=&#x27;</span>, <span class="hljs-string">&#x27;es&#x27;</span>, <span class="hljs-string">&#x27;##b&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;pretty&#x27;</span>, <span class="hljs-string">&#x27;bad&#x27;</span>, <span class="hljs-string">&#x27;storm&#x27;</span>, <span class="hljs-string">&#x27;here&#x27;</span>, <span class="hljs-string">&#x27;last&#x27;</span>, <span class="hljs-string">&#x27;evening&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;[SEP]&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-2mvx6k">然而,这会添加一些特殊词元 <code>[CLS]</code><code>[SEP]</code>,子词分词会造成输入与标签之间的不匹配——原本对应单个标签的单个词,现在可能被分割为两个子词。您需要通过以下方式重新对齐词元和标签:</p> <ol data-svelte-h="svelte-15dmsqk"><li>使用 <a href="https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.BatchEncoding.word_ids" rel="nofollow"><code>word_ids</code></a> 方法将所有词元映射到对应的词语。</li> <li>对特殊词元 <code>[CLS]</code><code>[SEP]</code> 分配标签 <code>-100</code>,使其被 PyTorch 的损失函数忽略(参见 <a href="https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html" rel="nofollow">CrossEntropyLoss</a>)。</li> <li>仅为给定词语的第一个词元打标签,对同一词语的其他子词元分配 <code>-100</code></li></ol> <p data-svelte-h="svelte-rshgsq">下面是创建一个函数来重新对齐词元和标签、并将序列截断至不超过 DistilBERT 最大输入长度的方法:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">tokenize_and_align_labels</span>(<span class="hljs-params">examples</span>):
<span class="hljs-meta">... </span> tokenized_inputs = tokenizer(examples[<span class="hljs-string">&quot;tokens&quot;</span>], truncation=<span class="hljs-literal">True</span>, is_split_into_words=<span class="hljs-literal">True</span>)
<span class="hljs-meta">... </span> labels = []
<span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> i, label <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(examples[<span class="hljs-string">f&quot;ner_tags&quot;</span>]):
<span class="hljs-meta">... </span> word_ids = tokenized_inputs.word_ids(batch_index=i) <span class="hljs-comment"># 将词元映射到对应词语</span>
<span class="hljs-meta">... </span> previous_word_idx = <span class="hljs-literal">None</span>
<span class="hljs-meta">... </span> label_ids = []
<span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> word_idx <span class="hljs-keyword">in</span> word_ids: <span class="hljs-comment"># 将特殊词元设置为 -100</span>
<span class="hljs-meta">... </span> <span class="hljs-keyword">if</span> word_idx <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span>:
<span class="hljs-meta">... </span> label_ids.append(-<span class="hljs-number">100</span>)
<span class="hljs-meta">... </span> <span class="hljs-keyword">elif</span> word_idx != previous_word_idx: <span class="hljs-comment"># 仅为给定词语的第一个词元打标签</span>
<span class="hljs-meta">... </span> label_ids.append(label[word_idx])
<span class="hljs-meta">... </span> <span class="hljs-keyword">else</span>:
<span class="hljs-meta">... </span> label_ids.append(-<span class="hljs-number">100</span>)
<span class="hljs-meta">... </span> previous_word_idx = word_idx
<span class="hljs-meta">... </span> labels.append(label_ids)
<span class="hljs-meta">... </span> tokenized_inputs[<span class="hljs-string">&quot;labels&quot;</span>] = labels
<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> tokenized_inputs<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-21h35n">使用 🤗 Datasets 的 <code>map</code> 函数将预处理函数应用于整个数据集。通过设置 <code>batched=True</code> 一次处理数据集的多个元素,可以加速 <code>map</code> 函数:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>tokenized_wnut = wnut.<span class="hljs-built_in">map</span>(tokenize_and_align_labels, batched=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-hrzccd">现在使用 <a href="/docs/transformers/main/zh/main_classes/data_collator#transformers.DataCollatorWithPadding">DataCollatorWithPadding</a> 创建一批样本。在整理时将句子<em>动态填充</em>至批次中的最长长度,比将整个数据集填充至最大长度更高效。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> DataCollatorForTokenClassification
<span class="hljs-meta">&gt;&gt;&gt; </span>data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="评估" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#评估"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>评估</span></h2> <p data-svelte-h="svelte-q4dtbk">在训练过程中加入评估指标有助于评估模型的性能。您可以使用 🤗 <a href="https://huggingface.co/docs/evaluate/index" rel="nofollow">Evaluate</a> 库快速加载评估方法。对于此任务,加载 <a href="https://huggingface.co/spaces/evaluate-metric/seqeval" rel="nofollow">seqeval</a> 框架(参阅 🤗 Evaluate <a href="https://huggingface.co/docs/evaluate/a_quick_tour" rel="nofollow">快速教程</a>,了解更多关于加载和计算指标的信息)。seqeval 实际上会产生多个分数:精确率、召回率、F1 和准确率。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> evaluate
<span class="hljs-meta">&gt;&gt;&gt; </span>seqeval = evaluate.load(<span class="hljs-string">&quot;seqeval&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-oqx9r2">首先获取 NER 标签,然后创建一个函数,将真实预测结果和真实标签传递给 <code>compute</code> 来计算分数:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
<span class="hljs-meta">&gt;&gt;&gt; </span>labels = [label_list[i] <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> example[<span class="hljs-string">f&quot;ner_tags&quot;</span>]]
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_metrics</span>(<span class="hljs-params">p</span>):
<span class="hljs-meta">... </span> predictions, labels = p
<span class="hljs-meta">... </span> predictions = np.argmax(predictions, axis=<span class="hljs-number">2</span>)
<span class="hljs-meta">... </span> true_predictions = [
<span class="hljs-meta">... </span> [label_list[p] <span class="hljs-keyword">for</span> (p, l) <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(prediction, label) <span class="hljs-keyword">if</span> l != -<span class="hljs-number">100</span>]
<span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> prediction, label <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(predictions, labels)
<span class="hljs-meta">... </span> ]
<span class="hljs-meta">... </span> true_labels = [
<span class="hljs-meta">... </span> [label_list[l] <span class="hljs-keyword">for</span> (p, l) <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(prediction, label) <span class="hljs-keyword">if</span> l != -<span class="hljs-number">100</span>]
<span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> prediction, label <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(predictions, labels)
<span class="hljs-meta">... </span> ]
<span class="hljs-meta">... </span> results = seqeval.compute(predictions=true_predictions, references=true_labels)
<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> {
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;precision&quot;</span>: results[<span class="hljs-string">&quot;overall_precision&quot;</span>],
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;recall&quot;</span>: results[<span class="hljs-string">&quot;overall_recall&quot;</span>],
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;f1&quot;</span>: results[<span class="hljs-string">&quot;overall_f1&quot;</span>],
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;accuracy&quot;</span>: results[<span class="hljs-string">&quot;overall_accuracy&quot;</span>],
<span class="hljs-meta">... </span> }<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-e4kddi">您的 <code>compute_metrics</code> 函数已准备就绪,在设置训练时会用到它。</p> <h2 class="relative group"><a id="训练" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#训练"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>训练</span></h2> <p data-svelte-h="svelte-1dbxn6a">在开始训练模型之前,使用 <code>id2label</code><code>label2id</code> 创建预期 id 到其标签的映射:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>id2label = {
<span class="hljs-meta">... </span> <span class="hljs-number">0</span>: <span class="hljs-string">&quot;O&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-number">1</span>: <span class="hljs-string">&quot;B-corporation&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-number">2</span>: <span class="hljs-string">&quot;I-corporation&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-number">3</span>: <span class="hljs-string">&quot;B-creative-work&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-number">4</span>: <span class="hljs-string">&quot;I-creative-work&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-number">5</span>: <span class="hljs-string">&quot;B-group&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-number">6</span>: <span class="hljs-string">&quot;I-group&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-number">7</span>: <span class="hljs-string">&quot;B-location&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-number">8</span>: <span class="hljs-string">&quot;I-location&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-number">9</span>: <span class="hljs-string">&quot;B-person&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-number">10</span>: <span class="hljs-string">&quot;I-person&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-number">11</span>: <span class="hljs-string">&quot;B-product&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-number">12</span>: <span class="hljs-string">&quot;I-product&quot;</span>,
<span class="hljs-meta">... </span>}
<span class="hljs-meta">&gt;&gt;&gt; </span>label2id = {
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;O&quot;</span>: <span class="hljs-number">0</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;B-corporation&quot;</span>: <span class="hljs-number">1</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;I-corporation&quot;</span>: <span class="hljs-number">2</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;B-creative-work&quot;</span>: <span class="hljs-number">3</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;I-creative-work&quot;</span>: <span class="hljs-number">4</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;B-group&quot;</span>: <span class="hljs-number">5</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;I-group&quot;</span>: <span class="hljs-number">6</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;B-location&quot;</span>: <span class="hljs-number">7</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;I-location&quot;</span>: <span class="hljs-number">8</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;B-person&quot;</span>: <span class="hljs-number">9</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;I-person&quot;</span>: <span class="hljs-number">10</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;B-product&quot;</span>: <span class="hljs-number">11</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;I-product&quot;</span>: <span class="hljs-number">12</span>,
<span class="hljs-meta">... </span>}<!-- HTML_TAG_END --></pre></div> <blockquote class="tip"><p data-svelte-h="svelte-dcl442">如果您不熟悉使用 <code>Trainer</code> 微调模型,请查看<a href="../training#train-with-pytorch-trainer">这里</a>的基础教程!</p></blockquote> <p data-svelte-h="svelte-1qwiswe">现在可以开始训练模型了!使用 <code>AutoModelForTokenClassification</code> 加载 DistilBERT,并指定预期标签数量和标签映射:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForTokenClassification, TrainingArguments, Trainer
<span class="hljs-meta">&gt;&gt;&gt; </span>model = AutoModelForTokenClassification.from_pretrained(
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;distilbert/distilbert-base-uncased&quot;</span>, num_labels=<span class="hljs-number">13</span>, id2label=id2label, label2id=label2id
<span class="hljs-meta">... </span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-11bnknf">此时,只剩三个步骤:</p> <ol data-svelte-h="svelte-1uaujja"><li><code>TrainingArguments</code> 中定义训练超参数。唯一必需的参数是 <code>output_dir</code>,它指定保存模型的位置。通过设置 <code>push_to_hub=True</code>,将模型推送到 Hub(您需要登录 Hugging Face 才能上传模型)。每个 epoch 结束时,<code>Trainer</code> 将评估 seqeval 分数并保存训练检查点。</li> <li>将训练参数传递给 <code>Trainer</code>,同时传入模型、数据集、分词器、数据整理器和 <code>compute_metrics</code> 函数。</li> <li>调用 <code>train()</code> 微调您的模型。</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>training_args = TrainingArguments(
<span class="hljs-meta">... </span> output_dir=<span class="hljs-string">&quot;my_awesome_wnut_model&quot;</span>,
<span class="hljs-meta">... </span> learning_rate=<span class="hljs-number">2e-5</span>,
<span class="hljs-meta">... </span> per_device_train_batch_size=<span class="hljs-number">16</span>,
<span class="hljs-meta">... </span> per_device_eval_batch_size=<span class="hljs-number">16</span>,
<span class="hljs-meta">... </span> num_train_epochs=<span class="hljs-number">2</span>,
<span class="hljs-meta">... </span> weight_decay=<span class="hljs-number">0.01</span>,
<span class="hljs-meta">... </span> eval_strategy=<span class="hljs-string">&quot;epoch&quot;</span>,
<span class="hljs-meta">... </span> save_strategy=<span class="hljs-string">&quot;epoch&quot;</span>,
<span class="hljs-meta">... </span> load_best_model_at_end=<span class="hljs-literal">True</span>,
<span class="hljs-meta">... </span> push_to_hub=<span class="hljs-literal">True</span>,
<span class="hljs-meta">... </span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>trainer = Trainer(
<span class="hljs-meta">... </span> model=model,
<span class="hljs-meta">... </span> args=training_args,
<span class="hljs-meta">... </span> train_dataset=tokenized_wnut[<span class="hljs-string">&quot;train&quot;</span>],
<span class="hljs-meta">... </span> eval_dataset=tokenized_wnut[<span class="hljs-string">&quot;test&quot;</span>],
<span class="hljs-meta">... </span> processing_class=tokenizer,
<span class="hljs-meta">... </span> data_collator=data_collator,
<span class="hljs-meta">... </span> compute_metrics=compute_metrics,
<span class="hljs-meta">... </span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-16zweuy">训练完成后,使用 <code>push_to_hub()</code> 方法将模型分享到 Hub,让所有人都能使用您的模型:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>trainer.push_to_hub()<!-- HTML_TAG_END --></pre></div> <blockquote class="tip"><p data-svelte-h="svelte-122hef2">如需了解如何微调词元分类模型的更深入示例,请参阅相应的
<a href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb" rel="nofollow">PyTorch notebook</a></p></blockquote> <h2 class="relative group"><a id="推断" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#推断"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>推断</span></h2> <p data-svelte-h="svelte-lhz5h3">很好,现在您已经微调了模型,可以用它进行推断了!</p> <p data-svelte-h="svelte-16ewud">准备一些您想要进行推断的文本:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>text = <span class="hljs-string">&quot;The Golden State Warriors are an American professional basketball team based in San Francisco.&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jrtr3d">使用微调后的模型进行推断最简单的方式是在 <a href="/docs/transformers/main/zh/main_classes/pipelines#transformers.pipeline">pipeline()</a> 中使用它。用您的模型实例化一个 NER <code>pipeline</code>,并将文本传递给它:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline
<span class="hljs-meta">&gt;&gt;&gt; </span>classifier = pipeline(<span class="hljs-string">&quot;ner&quot;</span>, model=<span class="hljs-string">&quot;stevhliu/my_awesome_wnut_model&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>classifier(text)
[{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;B-location&#x27;</span>,
<span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.42658573</span>,
<span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">2</span>,
<span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;golden&#x27;</span>,
<span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">4</span>,
<span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">10</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;I-location&#x27;</span>,
<span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.35856336</span>,
<span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">3</span>,
<span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;state&#x27;</span>,
<span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">11</span>,
<span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">16</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;B-group&#x27;</span>,
<span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.3064001</span>,
<span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">4</span>,
<span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;warriors&#x27;</span>,
<span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">17</span>,
<span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">25</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;B-location&#x27;</span>,
<span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.65523505</span>,
<span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">13</span>,
<span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;san&#x27;</span>,
<span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">80</span>,
<span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">83</span>},
{<span class="hljs-string">&#x27;entity&#x27;</span>: <span class="hljs-string">&#x27;B-location&#x27;</span>,
<span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.4668663</span>,
<span class="hljs-string">&#x27;index&#x27;</span>: <span class="hljs-number">14</span>,
<span class="hljs-string">&#x27;word&#x27;</span>: <span class="hljs-string">&#x27;francisco&#x27;</span>,
<span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">84</span>,
<span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">93</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xtsel9">如果您愿意,也可以手动复现 <code>pipeline</code> 的结果:</p> <p data-svelte-h="svelte-15cnrmw">对文本进行分词并返回 PyTorch 张量:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
<span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;stevhliu/my_awesome_wnut_model&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = tokenizer(text, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-to3p2e">将输入传递给模型并返回 <code>logits</code></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForTokenClassification
<span class="hljs-meta">&gt;&gt;&gt; </span>model = AutoModelForTokenClassification.from_pretrained(<span class="hljs-string">&quot;stevhliu/my_awesome_wnut_model&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">with</span> torch.no_grad():
<span class="hljs-meta">... </span> logits = model(**inputs).logits<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xjkcfz">获取概率最高的类别,并使用模型的 <code>id2label</code> 映射将其转换为文本标签:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>predictions = torch.argmax(logits, dim=<span class="hljs-number">2</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>predicted_token_class = [model.config.id2label[t.item()] <span class="hljs-keyword">for</span> t <span class="hljs-keyword">in</span> predictions[<span class="hljs-number">0</span>]]
<span class="hljs-meta">&gt;&gt;&gt; </span>predicted_token_class
[<span class="hljs-string">&#x27;O&#x27;</span>,
<span class="hljs-string">&#x27;O&#x27;</span>,
<span class="hljs-string">&#x27;B-location&#x27;</span>,
<span class="hljs-string">&#x27;I-location&#x27;</span>,
<span class="hljs-string">&#x27;B-group&#x27;</span>,
<span class="hljs-string">&#x27;O&#x27;</span>,
<span class="hljs-string">&#x27;O&#x27;</span>,
<span class="hljs-string">&#x27;O&#x27;</span>,
<span class="hljs-string">&#x27;O&#x27;</span>,
<span class="hljs-string">&#x27;O&#x27;</span>,
<span class="hljs-string">&#x27;O&#x27;</span>,
<span class="hljs-string">&#x27;O&#x27;</span>,
<span class="hljs-string">&#x27;O&#x27;</span>,
<span class="hljs-string">&#x27;B-location&#x27;</span>,
<span class="hljs-string">&#x27;B-location&#x27;</span>,
<span class="hljs-string">&#x27;O&#x27;</span>,
<span class="hljs-string">&#x27;O&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/zh/tasks/token_classification.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1ea77dr = {
assets: "/docs/transformers/main/zh",
base: "/docs/transformers/main/zh",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/transformers/main/zh/_app/immutable/entry/start.df08e735.js"),
import("/docs/transformers/main/zh/_app/immutable/entry/app.80028681.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 68],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
72.6 kB
·
Xet hash:
f7ad37cb2531a8f10bfda3895275ef1cb6e75ed6e00b83b95ec3299ac962bc9d

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.