Buckets:

rtrm's picture
download
raw
68.5 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;根據已有的tokenizer訓練新的tokenizer&quot;,&quot;local&quot;:&quot;根據已有的tokenizer訓練新的tokenizer&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;準備語料庫&quot;,&quot;local&quot;:&quot;準備語料庫&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;訓練一個新的標記器&quot;,&quot;local&quot;:&quot;訓練一個新的標記器&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;保存標記器&quot;,&quot;local&quot;:&quot;保存標記器&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1021/zh-TW/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/entry/start.41617dd6.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/chunks/scheduler.37c15a92.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/chunks/singletons.75b420cf.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/chunks/index.18351ede.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/chunks/paths.241f8580.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/entry/app.89205ec4.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/chunks/index.2bf4358c.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/nodes/0.72d34023.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/nodes/45.e190ca9d.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/chunks/Tip.363c041f.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/chunks/Youtube.1e50a667.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/chunks/CodeBlock.4e987730.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/chunks/CourseFloatingBanner.9ff4c771.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/chunks/DocNotebookDropdown.efc1fb7c.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-TW/_app/immutable/chunks/getInferenceSnippets.ebf8be91.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;根據已有的tokenizer訓練新的tokenizer&quot;,&quot;local&quot;:&quot;根據已有的tokenizer訓練新的tokenizer&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;準備語料庫&quot;,&quot;local&quot;:&quot;準備語料庫&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;訓練一個新的標記器&quot;,&quot;local&quot;:&quot;訓練一個新的標記器&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;保存標記器&quot;,&quot;local&quot;:&quot;保存標記器&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="根據已有的tokenizer訓練新的tokenizer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#根據已有的tokenizer訓練新的tokenizer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>根據已有的tokenizer訓練新的tokenizer</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-6-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/zh-CN/chapter6/section2.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/zh-CN/chapter6/section2.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-1fmy1u7">如果您感興趣的語言中沒有可用的語言模型,或者如果您的語料庫與您的語言模型所訓練的語料庫有很大不同,您很可能希望從適合您的數據的標記器從頭開始重新訓練模型 . 這將需要在您的數據集上訓練一個新的標記器。 但這究竟是什麼意思? 當我們在 <a href="/course/chapter2">第二章</a> 中第一次查看標記器時,我們看到大多數 Transformer 模型使用<em>子詞分詞算法</em>。 為了識別哪些子詞是感興趣的並且在手頭的語料庫中最常出現,標記器需要仔細查看語料庫中的所有文本——我們稱之為<em>training</em>的過程。 這種訓練的確切規則取決於所使用的標記器的類型,我們將在本章後面討論三種主要算法。</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/DJimQynXZsQ" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-2lq2px">⚠️ 訓練標記器與訓練模型不同!模型訓練使用隨機梯度下降使每個batch的loss小一點。它本質上是隨機的(這意味著在進行兩次相同的訓練時,您必須設置一些隨機數種子才能獲得相同的結果)。訓練標記器是一個統計過程,它試圖確定哪些子詞最適合為給定的語料庫選擇,用於選擇它們的確切規則取決於分詞算法。它是確定性的,這意味著在相同的語料庫上使用相同的算法進行訓練時,您總是會得到相同的結果。</p></div> <h2 class="relative group"><a id="準備語料庫" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#準備語料庫"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>準備語料庫</span></h2> <p data-svelte-h="svelte-1m5288g">🤗 Transformers 中有一個非常簡單的 API,你可以用它來訓練一個新的標記器,使它與現有標記器相同的特徵: <strong>AutoTokenizer.train_new_from_iterator()</strong> .為了復現這一點,假設我們想從頭開始訓練 GPT-2,但使用英語以外的語言。我們的首要任務是在訓練語料庫中收集該語言的大量數據。為了提供每個人都能理解的示例,我們在這裡不會使用俄語或中文之類的語言,而是使用在特定領域的英語語言:Python 代碼。</p> <p data-svelte-h="svelte-za6ohc"><a href="https://github.com/huggingface/datasets" rel="nofollow">🤗 Datasets</a>庫可以幫助我們組裝一個 Python 源代碼語料庫。我們將使用<strong>load_dataset()</strong>功能下載和緩存<a href="https://huggingface.co/datasets/code_search_net" rel="nofollow">CodeSearchNet</a>數據集。該數據集是為<a href="https://wandb.ai/github/CodeSearchNet/benchmark" rel="nofollow">CodeSearchNet 挑戰</a>而創建的幷包含來自 GitHub 上開源庫的數百萬種編程語言的函數。在這裡,我們將加載此數據集的 Python 部分:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-comment"># This can take a few minutes to load, so grab a coffee or tea while you wait!</span>
raw_datasets = load_dataset(<span class="hljs-string">&quot;code_search_net&quot;</span>, <span class="hljs-string">&quot;python&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lxg3ks">我們可以查看訓練集的部分,以查看我們數據集中有哪些列:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->raw_datasets[<span class="hljs-string">&quot;train&quot;</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
features: [<span class="hljs-string">&#x27;repository_name&#x27;</span>, <span class="hljs-string">&#x27;func_path_in_repository&#x27;</span>, <span class="hljs-string">&#x27;func_name&#x27;</span>, <span class="hljs-string">&#x27;whole_func_string&#x27;</span>, <span class="hljs-string">&#x27;language&#x27;</span>,
<span class="hljs-string">&#x27;func_code_string&#x27;</span>, <span class="hljs-string">&#x27;func_code_tokens&#x27;</span>, <span class="hljs-string">&#x27;func_documentation_string&#x27;</span>, <span class="hljs-string">&#x27;func_documentation_tokens&#x27;</span>, <span class="hljs-string">&#x27;split_name&#x27;</span>,
<span class="hljs-string">&#x27;func_code_url&#x27;</span>
],
num_rows: <span class="hljs-number">412178</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jj8nf8">我們可以看到數據集將文檔字符串與代碼分開,並且有他們各自的標記化後的結果。 這裡。 我們將只使用 <code>whole_func_string</code> 列來訓練我們的標記器。 我們可以通過指定到 <code>train</code> 中的一部分來查看這些函數的一個示例:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(raw_datasets[<span class="hljs-string">&quot;train&quot;</span>][<span class="hljs-number">123456</span>][<span class="hljs-string">&quot;whole_func_string&quot;</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1fzdcla">應該打印以下內容:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">handle_simple_responses</span>(<span class="hljs-params">
self, timeout_ms=<span class="hljs-literal">None</span>, info_cb=DEFAULT_MESSAGE_CALLBACK</span>):
<span class="hljs-string">&quot;&quot;&quot;Accepts normal responses from the device.
Args:
timeout_ms: Timeout in milliseconds to wait for each response.
info_cb: Optional callback for text sent from the bootloader.
Returns:
OKAY packet&#x27;s message.
&quot;&quot;&quot;</span>
<span class="hljs-keyword">return</span> self._accept_responses(<span class="hljs-string">&#x27;OKAY&#x27;</span>, info_cb, timeout_ms=timeout_ms)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1twpnet">我們需要做的第一件事是將數據集轉換為迭代器文本列表 - 例如,文本列表。使用文本列表將使我們的標記器運行得更快(訓練成批文本而不是一個接一個地處理單個文本),如果我們想避免一次將所有內容都放在內存中,它應該是一個迭代器。如果你的語料庫很大,你會想要利用這樣一個特性:🤗 Datasets 不會將所有內容都加載到 RAM 中,而是將數據集的元素存儲在磁盤上。</p> <p data-svelte-h="svelte-10k6yh2">執行以下操作將創建一個包含 1,000 個文本的列表的列表,但會將所有內容加載到內存中:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Don&#x27;t uncomment the following line unless your dataset is small!</span>
<span class="hljs-comment"># training_corpus = [raw_datasets[&quot;train&quot;][i: i + 1000][&quot;whole_func_string&quot;] for i in range(0, len(raw_datasets[&quot;train&quot;]), 1000)]</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-77erhv">使用 Python 生成器,我們可以避免 Python 將任何內容加載到內存中,直到真正需要為止。要創建這樣的生成器,您只需要將括號替換為圓括號:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_corpus = (
raw_datasets[<span class="hljs-string">&quot;train&quot;</span>][i : i + <span class="hljs-number">1000</span>][<span class="hljs-string">&quot;whole_func_string&quot;</span>]
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">0</span>, <span class="hljs-built_in">len</span>(raw_datasets[<span class="hljs-string">&quot;train&quot;</span>]), <span class="hljs-number">1000</span>)
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-nya2u6">這行代碼不會獲取數據集的任何元素;它只是創建了一個可以在 Python 中使用的對象 <strong>for</strong> 環形。文本只會在您需要時加載(即,當您處於 <strong>for</strong> 需要它們的循環),並且一次只會加載 1,000 個文本。這樣,即使您正在處理龐大的數據集,也不會耗盡所有內存。</p> <p data-svelte-h="svelte-1vexoip">生成器對象的問題在於它只能使用一次,每次訪問它將給出下一個值。 下面是一個例子:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->gen = (i <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">10</span>))
<span class="hljs-built_in">print</span>(<span class="hljs-built_in">list</span>(gen))
<span class="hljs-built_in">print</span>(<span class="hljs-built_in">list</span>(gen))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1msswjg">我們第一次得到了這個列表,然後是一個空列表:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">3</span>, <span class="hljs-number">4</span>, <span class="hljs-number">5</span>, <span class="hljs-number">6</span>, <span class="hljs-number">7</span>, <span class="hljs-number">8</span>, <span class="hljs-number">9</span>]
[]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-sa4pom">這就是我們定義一個返回生成器的函數的原因:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_training_corpus</span>():
<span class="hljs-keyword">return</span> (
raw_datasets[<span class="hljs-string">&quot;train&quot;</span>][i : i + <span class="hljs-number">1000</span>][<span class="hljs-string">&quot;whole_func_string&quot;</span>]
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">0</span>, <span class="hljs-built_in">len</span>(raw_datasets[<span class="hljs-string">&quot;train&quot;</span>]), <span class="hljs-number">1000</span>)
)
training_corpus = get_training_corpus()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1d0zxdh">您還可以在一個 <strong>for</strong> 循環內部使用 <strong>yield</strong> 關鍵字定義您的生成器:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_training_corpus</span>():
dataset = raw_datasets[<span class="hljs-string">&quot;train&quot;</span>]
<span class="hljs-keyword">for</span> start_idx <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">0</span>, <span class="hljs-built_in">len</span>(dataset), <span class="hljs-number">1000</span>):
samples = dataset[start_idx : start_idx + <span class="hljs-number">1000</span>]
<span class="hljs-keyword">yield</span> samples[<span class="hljs-string">&quot;whole_func_string&quot;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-3v2ez1">這將產生與以前完全相同的生成器,但允許您使用比列表生成式中更復雜的邏輯。</p> <h2 class="relative group"><a id="訓練一個新的標記器" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#訓練一個新的標記器"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>訓練一個新的標記器</span></h2> <p data-svelte-h="svelte-9z454o">現在我們的語料庫是文本批量迭代器的形式,我們準備訓練一個新的標記器。為此,我們首先需要加載要與模型配對的標記器(此處為 GPT-2):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
old_tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;gpt2&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17473lh">即使我們要訓練一個新的標記器,最好還是這樣做以避免完全從頭開始。這樣,我們就不必指定任何關於標記化算法或我們想要使用的特殊標記;我們的新標記器將與 GPT-2 完全相同,唯一會改變的是輸入的數據,這將取決於我們訓練的語料。</p> <p data-svelte-h="svelte-1vm1nkq">首先讓我們看看這個標記器將如何處理示例的數據:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->example = <span class="hljs-string">&#x27;&#x27;&#x27;def add_numbers(a, b):
&quot;&quot;&quot;Add the two numbers `a` and `b`.&quot;&quot;&quot;
return a + b&#x27;&#x27;&#x27;</span>
tokens = old_tokenizer.tokenize(example)
tokens<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;def&#x27;</span>, <span class="hljs-string">&#x27;Ġadd&#x27;</span>, <span class="hljs-string">&#x27;_&#x27;</span>, <span class="hljs-string">&#x27;n&#x27;</span>, <span class="hljs-string">&#x27;umbers&#x27;</span>, <span class="hljs-string">&#x27;(&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;Ġb&#x27;</span>, <span class="hljs-string">&#x27;):&#x27;</span>, <span class="hljs-string">&#x27;Ċ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&quot;&quot;&quot;&#x27;</span>, <span class="hljs-string">&#x27;Add&#x27;</span>, <span class="hljs-string">&#x27;Ġthe&#x27;</span>, <span class="hljs-string">&#x27;Ġtwo&#x27;</span>,
<span class="hljs-string">&#x27;Ġnumbers&#x27;</span>, <span class="hljs-string">&#x27;Ġ`&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;`&#x27;</span>, <span class="hljs-string">&#x27;Ġand&#x27;</span>, <span class="hljs-string">&#x27;Ġ`&#x27;</span>, <span class="hljs-string">&#x27;b&#x27;</span>, <span class="hljs-string">&#x27;`&#x27;</span>, <span class="hljs-string">&#x27;.&quot;&#x27;</span>, <span class="hljs-string">&#x27;&quot;&quot;&#x27;</span>, <span class="hljs-string">&#x27;Ċ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;Ġreturn&#x27;</span>, <span class="hljs-string">&#x27;Ġa&#x27;</span>, <span class="hljs-string">&#x27;Ġ+&#x27;</span>, <span class="hljs-string">&#x27;Ġb&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jz6802">這個標記器有一些特殊的符號,比如 <strong>Ċ</strong><strong>Ġ</strong> ,分別表示空格和換行符。正如我們所看到的,這不是太有效:標記器為每個空格返回單獨的標記,當它可以將縮進級別組合在一起時(因為在代碼中具有四個或八個空格的集合將非常普遍)。它也有點奇怪地拆分了函數名稱,而習慣使用<strong>_</strong>的函數命名的方法。</p> <p data-svelte-h="svelte-11jp004">讓我們訓練一個新的標記器,看看它是否能解決這些問題。為此,我們將使用 <strong>train_new_from_iterator()</strong> 方法:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, <span class="hljs-number">52000</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-13magy7">如果您的語料庫非常大,此命令可能需要一些時間,但對於這個 1.6 GB 文本數據集,它的速度非常快(在具有 12 個內核的 AMD Ryzen 9 3900X CPU 上為 1 分 16 秒)。</p> <p data-svelte-h="svelte-2tx7et">注意 <strong>AutoTokenizer.train_new_from_iterator()</strong> 僅當您使用的標記器是“快速(fast)”標記器時才有效。正如您將在下一節中看到的,🤗 Transformers 庫包含兩種類型的標記器:一些完全用 Python 編寫,而另一些(快速的)由 🤗 Tokenizers 庫支持,該庫用<a href="https://www.rust-lang.org" rel="nofollow">Rust</a>編程語言編寫。 Python 是最常用於數據科學和深度學習應用程序的語言,但是當需要並行化以提高速度時,必須用另一種語言編寫。例如,模型計算核心的矩陣乘法是用 CUDA 編寫的,CUDA 是一個針對 GPU 的優化 C 庫。</p> <p data-svelte-h="svelte-yy1hp8">用純 Python 訓練一個全新的標記器會非常緩慢,這就是我們開發 🤗 Tokenizers庫的原因。請注意,正如您無需學習 CUDA 語言即可在 GPU 上執行您的模型一樣,您也無需學習 Rust 即可使用快速標記器。 🤗 Tokenizers 庫為許多內部調用 Rust 代碼的方法提供 Python 綁定;例如,並行化新標記器的訓練,或者,正如我們在<a href="/course/chapter3">第三章</a>中看到的,對一批輸入進行標記化。</p> <p data-svelte-h="svelte-q6va4u">大多數 Transformer 模型都有可用的快速標記器(您可以<a href="https://huggingface.co/transformers/#supported-frameworks" rel="nofollow">在這裡</a>檢查一些例外情況),如果 <strong>AutoTokenizer</strong> 可用,API 總是為您選擇快速標記器。在下一節中,我們將看看快速標記器具有的其他一些特殊功能,這些功能對於標記分類和問答等任務非常有用。然而,在深入研究之前,讓我們在上一個示例中嘗試我們全新的標記器:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokens = tokenizer.tokenize(example)
tokens<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;def&#x27;</span>, <span class="hljs-string">&#x27;Ġadd&#x27;</span>, <span class="hljs-string">&#x27;_&#x27;</span>, <span class="hljs-string">&#x27;numbers&#x27;</span>, <span class="hljs-string">&#x27;(&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;Ġb&#x27;</span>, <span class="hljs-string">&#x27;):&#x27;</span>, <span class="hljs-string">&#x27;ĊĠĠĠ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&quot;&quot;&quot;&#x27;</span>, <span class="hljs-string">&#x27;Add&#x27;</span>, <span class="hljs-string">&#x27;Ġthe&#x27;</span>, <span class="hljs-string">&#x27;Ġtwo&#x27;</span>, <span class="hljs-string">&#x27;Ġnumbers&#x27;</span>, <span class="hljs-string">&#x27;Ġ`&#x27;</span>,
<span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;`&#x27;</span>, <span class="hljs-string">&#x27;Ġand&#x27;</span>, <span class="hljs-string">&#x27;Ġ`&#x27;</span>, <span class="hljs-string">&#x27;b&#x27;</span>, <span class="hljs-string">&#x27;`.&quot;&quot;&quot;&#x27;</span>, <span class="hljs-string">&#x27;ĊĠĠĠ&#x27;</span>, <span class="hljs-string">&#x27;Ġreturn&#x27;</span>, <span class="hljs-string">&#x27;Ġa&#x27;</span>, <span class="hljs-string">&#x27;Ġ+&#x27;</span>, <span class="hljs-string">&#x27;Ġb&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-konn3u">在這裡我們再次看到特殊符號 <strong>Ċ</strong><strong>Ġ</strong> 表示空格和換行符,但我們也可以看到我們的標記器學習了一些高度特定於 Python 函數語料庫的標記:例如,有一個 <strong>ĊĠĠĠ</strong> 表示縮進的標記,以及 <strong>Ġ</strong> 表示開始文檔字符串的三個引號的標記。標記器還正確使用<strong>_</strong>命名的規範將函數名稱拆分為 .這是一個非常緊湊的表示;相比之下,在同一個例子中使用簡單的英語標記器會給我們一個更長的句子:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(<span class="hljs-built_in">len</span>(tokens))
<span class="hljs-built_in">print</span>(<span class="hljs-built_in">len</span>(old_tokenizer.tokenize(example)))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">27</span>
<span class="hljs-number">36</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-29kecn">讓我們再看一個例子:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->example = <span class="hljs-string">&quot;&quot;&quot;class LinearLayer():
def __init__(self, input_size, output_size):
self.weight = torch.randn(input_size, output_size)
self.bias = torch.zeros(output_size)
def __call__(self, x):
return x @ self.weights + self.bias
&quot;&quot;&quot;</span>
tokenizer.tokenize(example)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;class&#x27;</span>, <span class="hljs-string">&#x27;ĠLinear&#x27;</span>, <span class="hljs-string">&#x27;Layer&#x27;</span>, <span class="hljs-string">&#x27;():&#x27;</span>, <span class="hljs-string">&#x27;ĊĠĠĠ&#x27;</span>, <span class="hljs-string">&#x27;Ġdef&#x27;</span>, <span class="hljs-string">&#x27;Ġ__&#x27;</span>, <span class="hljs-string">&#x27;init&#x27;</span>, <span class="hljs-string">&#x27;__(&#x27;</span>, <span class="hljs-string">&#x27;self&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;Ġinput&#x27;</span>, <span class="hljs-string">&#x27;_&#x27;</span>, <span class="hljs-string">&#x27;size&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>,
<span class="hljs-string">&#x27;Ġoutput&#x27;</span>, <span class="hljs-string">&#x27;_&#x27;</span>, <span class="hljs-string">&#x27;size&#x27;</span>, <span class="hljs-string">&#x27;):&#x27;</span>, <span class="hljs-string">&#x27;ĊĠĠĠĠĠĠĠ&#x27;</span>, <span class="hljs-string">&#x27;Ġself&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;weight&#x27;</span>, <span class="hljs-string">&#x27;Ġ=&#x27;</span>, <span class="hljs-string">&#x27;Ġtorch&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;randn&#x27;</span>, <span class="hljs-string">&#x27;(&#x27;</span>, <span class="hljs-string">&#x27;input&#x27;</span>, <span class="hljs-string">&#x27;_&#x27;</span>,
<span class="hljs-string">&#x27;size&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;Ġoutput&#x27;</span>, <span class="hljs-string">&#x27;_&#x27;</span>, <span class="hljs-string">&#x27;size&#x27;</span>, <span class="hljs-string">&#x27;)&#x27;</span>, <span class="hljs-string">&#x27;ĊĠĠĠĠĠĠĠ&#x27;</span>, <span class="hljs-string">&#x27;Ġself&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;bias&#x27;</span>, <span class="hljs-string">&#x27;Ġ=&#x27;</span>, <span class="hljs-string">&#x27;Ġtorch&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;zeros&#x27;</span>, <span class="hljs-string">&#x27;(&#x27;</span>,
<span class="hljs-string">&#x27;output&#x27;</span>, <span class="hljs-string">&#x27;_&#x27;</span>, <span class="hljs-string">&#x27;size&#x27;</span>, <span class="hljs-string">&#x27;)&#x27;</span>, <span class="hljs-string">&#x27;ĊĊĠĠĠ&#x27;</span>, <span class="hljs-string">&#x27;Ġdef&#x27;</span>, <span class="hljs-string">&#x27;Ġ__&#x27;</span>, <span class="hljs-string">&#x27;call&#x27;</span>, <span class="hljs-string">&#x27;__(&#x27;</span>, <span class="hljs-string">&#x27;self&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;Ġx&#x27;</span>, <span class="hljs-string">&#x27;):&#x27;</span>, <span class="hljs-string">&#x27;ĊĠĠĠĠĠĠĠ&#x27;</span>,
<span class="hljs-string">&#x27;Ġreturn&#x27;</span>, <span class="hljs-string">&#x27;Ġx&#x27;</span>, <span class="hljs-string">&#x27;Ġ@&#x27;</span>, <span class="hljs-string">&#x27;Ġself&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;weights&#x27;</span>, <span class="hljs-string">&#x27;Ġ+&#x27;</span>, <span class="hljs-string">&#x27;Ġself&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;bias&#x27;</span>, <span class="hljs-string">&#x27;ĊĠĠĠĠ&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1tsz3qm">除了一個縮進對應的token,這裡我們還可以看到一個雙縮進的token: <strong>ĊĠĠĠĠĠĠĠ</strong> .特殊的 Python 詞如 <strong>class</strong> , <strong>init</strong> , <strong>call</strong> , <strong>self</strong> , 和 <strong>return</strong> 每個都被標記為一個標記,我們可以看到,以及分裂 <strong>_</strong><strong>.</strong> 標記器甚至可以正確拆分駝峰式名稱: <strong>LinearLayer</strong> 被標記為 <strong>[ĠLinear, Layer]</strong> .</p> <h2 class="relative group"><a id="保存標記器" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#保存標記器"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>保存標記器</span></h2> <p data-svelte-h="svelte-mtr3k9">為了確保我們以後可以使用它,我們需要保存我們的新標記器。就像模型一樣,是通過 <strong>save_pretrained()</strong> 方法:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.save_pretrained(<span class="hljs-string">&quot;code-search-net-tokenizer&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mgbahr">這將創建一個名為的<em>code-search-net-tokenizer</em>的新文件夾,它將包含重新加載標記器所需要的所有文件。如果您想與您的同事和朋友分享這個標記器,您可以通過登錄您的帳戶將其上傳到 Hub。如果您在notebook上工作,有一個方便的功能可以幫助您:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login
notebook_login()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1kp8r28">這將顯示一個小部件,您可以在其中輸入您的 Hugging Face 登錄憑據。如果您不是在notebook上工作,只需在終端中輸入以下行:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->huggingface-cli login<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8g0slh">登錄後,您可以通過執行以下命令來推送您的標記器:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.push_to_hub(<span class="hljs-string">&quot;code-search-net-tokenizer&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1a36nj3">這將在您的命名空間中創建一個名為<strong>code-search-net-tokenizer</strong>的新存儲庫 ,包含標記器文件。然後,您可以使用以下命令從任何地方加載標記器的 <strong>from_pretrained()</strong> 方法:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Replace &quot;huggingface-course&quot; below with your actual namespace to use your own tokenizer</span>
tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;huggingface-course/code-search-net-tokenizer&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1wjo90e">您現在已準備好從頭開始訓練語言模型並根據您手頭的任務對其進行微調!我們將在<a href="/course/chapter7">第七章</a>進行這部分。但首先,在本章的其餘部分,我們將仔細研究快速標記器,並詳細探討調用 <strong>train_new_from_iterator()</strong> 方法時實際發生的情況 .</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/zh-TW/chapter6/2.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_kgz26w = {
assets: "/docs/course/pr_1021/zh-TW",
base: "/docs/course/pr_1021/zh-TW",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/course/pr_1021/zh-TW/_app/immutable/entry/start.41617dd6.js"),
import("/docs/course/pr_1021/zh-TW/_app/immutable/entry/app.89205ec4.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 45],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
68.5 kB
·
Xet hash:
9ac10597f187456d9a561dd3c1da33bbe1b5988bba91c049dd1012c41b8f05c3

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.