Buckets:

hf-doc-build/doc-dev / autotrain /pr_749 /en /token_classification.html
rtrm's picture
download
raw
15.1 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Token Classification&quot;,&quot;local&quot;:&quot;token-classification&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Data Format&quot;,&quot;local&quot;:&quot;data-format&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Columns&quot;,&quot;local&quot;:&quot;columns&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<!-- Page stylesheet, followed by preloads for the ES modules under _app/immutable (entry points, shared chunks, route nodes). -->
<link rel="stylesheet" href="/docs/autotrain/pr_749/en/_app/immutable/assets/0.e3b0c442.css">
<link rel="modulepreload" href="/docs/autotrain/pr_749/en/_app/immutable/entry/start.b4f8a0ef.js">
<link rel="modulepreload" href="/docs/autotrain/pr_749/en/_app/immutable/chunks/scheduler.0219f8bd.js">
<link rel="modulepreload" href="/docs/autotrain/pr_749/en/_app/immutable/chunks/singletons.74a96c49.js">
<link rel="modulepreload" href="/docs/autotrain/pr_749/en/_app/immutable/chunks/paths.5815e531.js">
<link rel="modulepreload" href="/docs/autotrain/pr_749/en/_app/immutable/entry/app.4f18d4a0.js">
<link rel="modulepreload" href="/docs/autotrain/pr_749/en/_app/immutable/chunks/index.f61edf3b.js">
<link rel="modulepreload" href="/docs/autotrain/pr_749/en/_app/immutable/nodes/0.3ba41ccf.js">
<link rel="modulepreload" href="/docs/autotrain/pr_749/en/_app/immutable/nodes/31.8de9f235.js">
<link rel="modulepreload" href="/docs/autotrain/pr_749/en/_app/immutable/chunks/CodeBlock.38e566ae.js">
<link rel="modulepreload" href="/docs/autotrain/pr_749/en/_app/immutable/chunks/EditOnGithub.48fa589f.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Token Classification&quot;,&quot;local&quot;:&quot;token-classification&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Data Format&quot;,&quot;local&quot;:&quot;data-format&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Columns&quot;,&quot;local&quot;:&quot;columns&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="token-classification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#token-classification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Token Classification</span></h1> <p data-svelte-h="svelte-1aeigfr">Token classification is the task of classifying each token in a sequence. This can be used
for Named Entity Recognition (NER), Part-of-Speech (POS) tagging, and more. Get your data ready in
the proper format and then, with just a few clicks, your state-of-the-art model will be ready to
be used in production.</p> <h2 class="relative group"><a id="data-format" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#data-format"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Data Format</span></h2> <p data-svelte-h="svelte-1qrxgxb">The data should be in the following CSV format:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 
leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokens,tags
&quot;<span class="hljs-selector-attr">[<span class="hljs-string">&#x27;I&#x27;</span>, <span class="hljs-string">&#x27;love&#x27;</span>, <span class="hljs-string">&#x27;Paris&#x27;</span>]</span>&quot;, &quot;<span class="hljs-selector-attr">[<span class="hljs-string">&#x27;O&#x27;</span>, <span class="hljs-string">&#x27;O&#x27;</span>, <span class="hljs-string">&#x27;B-LOC&#x27;</span>]</span>&quot;
&quot;<span class="hljs-selector-attr">[<span class="hljs-string">&#x27;I&#x27;</span>, <span class="hljs-string">&#x27;live&#x27;</span>, <span class="hljs-string">&#x27;in&#x27;</span>, <span class="hljs-string">&#x27;New&#x27;</span>, <span class="hljs-string">&#x27;York&#x27;</span>]</span>&quot;, &quot;<span class="hljs-selector-attr">[<span class="hljs-string">&#x27;O&#x27;</span>, <span class="hljs-string">&#x27;O&#x27;</span>, <span class="hljs-string">&#x27;O&#x27;</span>, <span class="hljs-string">&#x27;B-LOC&#x27;</span>, <span class="hljs-string">&#x27;I-LOC&#x27;</span>]</span>&quot;
.
.
.<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ipe22p">Alternatively, you can use the JSONL format:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span><span class="hljs-attr">&quot;tokens&quot;</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">[</span><span class="hljs-string">&quot;I&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-string">&quot;love&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-string">&quot;Paris&quot;</span><span class="hljs-punctuation">]</span><span class="hljs-punctuation">,</span> <span class="hljs-attr">&quot;tags&quot;</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">[</span><span class="hljs-string">&quot;O&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-string">&quot;O&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-string">&quot;B-LOC&quot;</span><span class="hljs-punctuation">]</span><span class="hljs-punctuation">}</span>
<span class="hljs-punctuation">{</span><span class="hljs-attr">&quot;tokens&quot;</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">[</span><span class="hljs-string">&quot;I&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-string">&quot;live&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-string">&quot;in&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-string">&quot;New&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-string">&quot;York&quot;</span><span class="hljs-punctuation">]</span><span class="hljs-punctuation">,</span> <span class="hljs-attr">&quot;tags&quot;</span><span class="hljs-punctuation">:</span> <span class="hljs-punctuation">[</span><span class="hljs-string">&quot;O&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-string">&quot;O&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-string">&quot;O&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-string">&quot;B-LOC&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-string">&quot;I-LOC&quot;</span><span class="hljs-punctuation">]</span><span class="hljs-punctuation">}</span>
.
.
.<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-a9ckqc">As you can see, we have two columns in the CSV file. One column is the tokens and the other
is the tags. In the CSV file, both columns are stringified lists! The tokens column contains the tokens
of the sentence and the tags column contains the tags for each token.</p> <p data-svelte-h="svelte-bmzoma">If your CSV is huge, you can divide it into multiple CSV files and upload them separately.
Please make sure that the column names are the same in all CSV files.</p> <p data-svelte-h="svelte-1pa5x49">One way to divide the CSV file using pandas is as follows:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> pandas <span class="hljs-keyword">as</span> pd
<span class="hljs-comment"># Set the chunk size</span>
chunk_size = <span class="hljs-number">1000</span>
i = <span class="hljs-number">1</span>
<span class="hljs-comment"># Open the CSV file and read it in chunks</span>
<span class="hljs-keyword">for</span> chunk <span class="hljs-keyword">in</span> pd.read_csv(<span class="hljs-string">&#x27;example.csv&#x27;</span>, chunksize=chunk_size):
    <span class="hljs-comment"># Save each chunk to a new file</span>
    chunk.to_csv(<span class="hljs-string">f&#x27;chunk_<span class="hljs-subst">{i}</span>.csv&#x27;</span>, index=<span class="hljs-literal">False</span>)
    i += <span class="hljs-number">1</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="columns" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#columns"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Columns</span></h2> <p data-svelte-h="svelte-10e0bq">Your CSV/JSONL dataset must have two columns: <code>tokens</code> and <code>tags</code>.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/autotrain-advanced/blob/main/docs/source/token_classification.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
  // Bootstrap for this page: publish the path configuration, then load the
  // two entry modules and hand control to kit.start().
  // The braces keep the `const` bindings below scoped to this block.

  // Assigned without a declaration, so in this classic script it lands on
  // the global object. Presumably read by the imported entry modules —
  // confirm against the SvelteKit runtime.
  __sveltekit_ewvlbq = {
    assets: "/docs/autotrain/pr_749/en",
    base: "/docs/autotrain/pr_749/en",
    env: {}
  };

  // Captured before any asynchronous work: document.currentScript refers to
  // this <script> element only while the script is executing.
  const target = document.currentScript.parentElement;

  // Two null entries, passed through unchanged as the `data` option below.
  const payload = [null, null];

  // Both dynamic imports are started before either is awaited.
  const entryModules = [
    import("/docs/autotrain/pr_749/en/_app/immutable/entry/start.b4f8a0ef.js"),
    import("/docs/autotrain/pr_749/en/_app/immutable/entry/app.4f18d4a0.js")
  ];

  Promise.all(entryModules).then((loaded) => {
    // Results arrive in request order: start module first, app module second.
    const kit = loaded[0];
    const app = loaded[1];

    kit.start(app, target, {
      node_ids: [0, 31],
      data: payload,
      form: null,
      error: null
    });
  });
}
</script>

Xet Storage Details

Size:
15.1 kB
·
Xet hash:
05f71f42fe416a11c375d092fa3365ad6e480bfea11ccaddee991fe668547384

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.