Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"نمذجة اللغة المقنعة (Masked language modeling)","local":"نمذجة-اللغة-المقنعة-masked-language-modeling","sections":[{"title":"تحميل مجموعة بيانات ELI5","local":"تحميل-مجموعة-بيانات-eli5","sections":[],"depth":2},{"title":"معالجة مسبقة (Preprocess)","local":"معالجة-مسبقة-preprocess","sections":[],"depth":2},{"title":"التدريب (Train)","local":"التدريب-train","sections":[],"depth":2},{"title":"الاستدلال","local":"الاستدلال","sections":[],"depth":2}],"depth":1}"> | |
| <link href="/docs/transformers/main/ar/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/entry/start.6e5b536e.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/chunks/scheduler.eaf6c8c4.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/chunks/singletons.f976b0a5.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/chunks/index.5e3cad04.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/chunks/paths.cea29c4a.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/entry/app.57820f07.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/chunks/preload-helper.d648f8ef.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/chunks/index.e25dcc83.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/nodes/0.69ea8195.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/nodes/38.4b09ade1.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/chunks/Tip.32737511.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.c5abd470.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/chunks/Youtube.7df060fd.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/chunks/CodeBlock.d3edfc80.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ar/_app/immutable/chunks/DocNotebookDropdown.cd339010.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"نمذجة اللغة المقنعة (Masked language modeling)","local":"نمذجة-اللغة-المقنعة-masked-language-modeling","sections":[{"title":"تحميل مجموعة بيانات ELI5","local":"تحميل-مجموعة-بيانات-eli5","sections":[],"depth":2},{"title":"معالجة مسبقة (Preprocess)","local":"معالجة-مسبقة-preprocess","sections":[],"depth":2},{"title":"التدريب (Train)","local":"التدريب-train","sections":[],"depth":2},{"title":"الاستدلال","local":"الاستدلال","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <div class="flex space-x-1 " style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"> </button> </div> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"> </button> </div></div> <h1 class="relative group"><a id="نمذجة-اللغة-المقنعة-masked-language-modeling" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#نمذجة-اللغة-المقنعة-masked-language-modeling"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>نمذجة اللغة المقنعة (Masked language modeling)</span></h1> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/mqElG5QJWUg" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-197dis2">تتنبأ نمذجة اللغة المقنعة برمز مقنع في تسلسل، ويمكن للنموذج الانتباه إلى الرموز بشكل ثنائي الاتجاه. هذا | |
| يعني أن النموذج لديه إمكانية الوصول الكاملة إلى الرموز الموجودة على اليسار واليمين. تعد نمذجة اللغة المقنعة ممتازة للمهام التي | |
| تتطلب فهمًا سياقيًا جيدًا لتسلسل كامل. BERT هو مثال على نموذج لغة مقنع.</p> <p data-svelte-h="svelte-lp8700">سيوضح لك هذا الدليل كيفية:</p> <ol data-svelte-h="svelte-bqqmbo"><li>تكييف <a href="https://huggingface.co/distilbert/distilroberta-base" rel="nofollow">DistilRoBERTa</a> على مجموعة فرعية <a href="https://www.reddit.com/r/askscience/" rel="nofollow">r/askscience</a> من مجموعة بيانات <a href="https://huggingface.co/datasets/eli5" rel="nofollow">ELI5</a>.</li> <li>استخدام نموذج المدرب الخاص بك للاستدلال.</li></ol> <blockquote class="tip"><p data-svelte-h="svelte-1p9khee">لمعرفة جميع البنى والنسخ المتوافقة مع هذه المهمة، نوصي بالتحقق من <a href="https://huggingface.co/tasks/fill-mask" rel="nofollow">صفحة المهمة</a></p></blockquote> <p data-svelte-h="svelte-93upt2">قبل أن تبدأ، تأكد من تثبيت جميع المكتبات الضرورية:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->pip install transformers datasets evaluate<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-cvukg2">نحن نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل ومشاركة نموذجك مع المجتمع. عندما تتم مطالبتك، أدخل رمزك لتسجيل الدخول:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login | |
| <span class="hljs-meta">>>> </span>notebook_login()<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="تحميل-مجموعة-بيانات-eli5" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#تحميل-مجموعة-بيانات-eli5"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>تحميل مجموعة بيانات ELI5</span></h2> <p data-svelte-h="svelte-lm688b">ابدأ بتحميل أول 5000 مثال من مجموعة بيانات <a href="https://huggingface.co/datasets/eli5_category" rel="nofollow">ELI5-Category</a> باستخدام مكتبة 🤗 Datasets. سيعطيك هذا فرصة للتجربة والتأكد من أن كل شيء يعمل قبل قضاء المزيد من الوقت في التدريب على مجموعة البيانات الكاملة.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-meta">>>> </span>eli5 = load_dataset(<span class="hljs-string">"eli5_category"</span>, split=<span class="hljs-string">"train[:5000]"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xofc0r">قم بتقسيم مجموعة البيانات <code>train</code> إلى مجموعتي تدريب واختبار باستخدام الدالة <code>train_test_split</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>eli5 = eli5.train_test_split(test_size=<span class="hljs-number">0.2</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1e1soci">ثم ألق نظرة على مثال:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>eli5[<span class="hljs-string">"train"</span>][<span class="hljs-number">0</span>] | |
| {<span class="hljs-string">'q_id'</span>: <span class="hljs-string">'7h191n'</span>, | |
| <span class="hljs-string">'title'</span>: <span class="hljs-string">'What does the tax bill that was passed today mean? How will it affect Americans in each tax bracket?'</span>, | |
| <span class="hljs-string">'selftext'</span>: <span class="hljs-string">''</span>, | |
| <span class="hljs-string">'category'</span>: <span class="hljs-string">'Economics'</span>, | |
| <span class="hljs-string">'subreddit'</span>: <span class="hljs-string">'explainlikeimfive'</span>, | |
| <span class="hljs-string">'answers'</span>: {<span class="hljs-string">'a_id'</span>: [<span class="hljs-string">'dqnds8l'</span>, <span class="hljs-string">'dqnd1jl'</span>, <span class="hljs-string">'dqng3i1'</span>, <span class="hljs-string">'dqnku5x'</span>], | |
| <span class="hljs-string">'text'</span>: [<span class="hljs-string">"The tax bill is 500 pages long and there were a lot of changes still going on right to the end. It's not just an adjustment to the income tax brackets, it's a whole bunch of changes. As such there is no good answer to your question. The big take aways are: - Big reduction in corporate income tax rate will make large companies very happy. - Pass through rate change will make certain styles of business (law firms, hedge funds) extremely happy - Income tax changes are moderate, and are set to expire (though it's the kind of thing that might just always get re-applied without being made permanent) - People in high tax states (California, New York) lose out, and many of them will end up with their taxes raised."</span>, | |
| <span class="hljs-string">'None yet. It has to be reconciled with a vastly different house bill and then passed again.'</span>, | |
| <span class="hljs-string">'Also: does this apply to 2017 taxes? Or does it start with 2018 taxes?'</span>, | |
| <span class="hljs-string">'This article explains both the House and senate bills, including the proposed changes to your income taxes based on your income level. URL_0'</span>], | |
| <span class="hljs-string">'score'</span>: [<span class="hljs-number">21</span>, <span class="hljs-number">19</span>, <span class="hljs-number">5</span>, <span class="hljs-number">3</span>], | |
| <span class="hljs-string">'text_urls'</span>: [[], | |
| [], | |
| [], | |
| [<span class="hljs-string">'https://www.investopedia.com/news/trumps-tax-reform-what-can-be-done/'</span>]]}, | |
| <span class="hljs-string">'title_urls'</span>: [<span class="hljs-string">'url'</span>], | |
| <span class="hljs-string">'selftext_urls'</span>: [<span class="hljs-string">'url'</span>]}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ilavd2">على الرغم من أن هذا قد يبدو كثيرًا، إلا أنك مهتم حقًا بحقل <code>text</code>. ما هو رائع حول مهام نمذجة اللغة هو أنك لا تحتاج إلى تسميات (تُعرف أيضًا باسم المهمة غير الخاضعة للإشراف) لأن الكلمة التالية <em>هي</em> التسمية.</p> <h2 class="relative group"><a id="معالجة-مسبقة-preprocess" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#معالجة-مسبقة-preprocess"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>معالجة مسبقة (Preprocess)</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/8PmhEIXhBvI" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-1lnbm1y">بالنسبة لنمذجة اللغة المقنعة، فإن الخطوة التالية هي تحميل معالج DistilRoBERTa لمعالجة حقل <code>text</code> الفرعي:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer | |
| <span class="hljs-meta">>>> </span>tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"distilbert/distilroberta-base"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mesglz">ستلاحظ من المثال أعلاه، أن حقل <code>text</code> موجود بالفعل داخل <code>answers</code>. هذا يعني أنك ستحتاج إلى استخراج حقل <code>text</code> الفرعي من بنيته المضمنة باستخدام الدالة <a href="https://huggingface.co/docs/datasets/process#flatten" rel="nofollow"><code>flatten</code></a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>eli5 = eli5.flatten() | |
| <span class="hljs-meta">>>> </span>eli5[<span class="hljs-string">"train"</span>][<span class="hljs-number">0</span>] | |
| {<span class="hljs-string">'q_id'</span>: <span class="hljs-string">'7h191n'</span>, | |
| <span class="hljs-string">'title'</span>: <span class="hljs-string">'What does the tax bill that was passed today mean? How will it affect Americans in each tax bracket?'</span>, | |
| <span class="hljs-string">'selftext'</span>: <span class="hljs-string">''</span>, | |
| <span class="hljs-string">'category'</span>: <span class="hljs-string">'Economics'</span>, | |
| <span class="hljs-string">'subreddit'</span>: <span class="hljs-string">'explainlikeimfive'</span>, | |
| <span class="hljs-string">'answers.a_id'</span>: [<span class="hljs-string">'dqnds8l'</span>, <span class="hljs-string">'dqnd1jl'</span>, <span class="hljs-string">'dqng3i1'</span>, <span class="hljs-string">'dqnku5x'</span>], | |
| <span class="hljs-string">'answers.text'</span>: [<span class="hljs-string">"The tax bill is 500 pages long and there were a lot of changes still going on right to the end. It's not just an adjustment to the income tax brackets, it's a whole bunch of changes. As such there is no good answer to your question. The big take aways are: - Big reduction in corporate income tax rate will make large companies very happy. - Pass through rate change will make certain styles of business (law firms, hedge funds) extremely happy - Income tax changes are moderate, and are set to expire (though it's the kind of thing that might just always get re-applied without being made permanent) - People in high tax states (California, New York) lose out, and many of them will end up with their taxes raised."</span>, | |
| <span class="hljs-string">'None yet. It has to be reconciled with a vastly different house bill and then passed again.'</span>, | |
| <span class="hljs-string">'Also: does this apply to 2017 taxes? Or does it start with 2018 taxes?'</span>, | |
| <span class="hljs-string">'This article explains both the House and senate bills, including the proposed changes to your income taxes based on your income level. URL_0'</span>], | |
| <span class="hljs-string">'answers.score'</span>: [<span class="hljs-number">21</span>, <span class="hljs-number">19</span>, <span class="hljs-number">5</span>, <span class="hljs-number">3</span>], | |
| <span class="hljs-string">'answers.text_urls'</span>: [[], | |
| [], | |
| [], | |
| [<span class="hljs-string">'https://www.investopedia.com/news/trumps-tax-reform-what-can-be-done/'</span>]], | |
| <span class="hljs-string">'title_urls'</span>: [<span class="hljs-string">'url'</span>], | |
| <span class="hljs-string">'selftext_urls'</span>: [<span class="hljs-string">'url'</span>]}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-zkvir4">كل حقل فرعي هو الآن عمود منفصل كما هو موضح بواسطة بادئة <code>answers</code>، وحقل <code>text</code> هو قائمة الآن. بدلاً من | |
| معالجة كل جملة بشكل منفصل، قم بتحويل القائمة إلى سلسلة حتى تتمكن من معالجتها بشكل مشترك.</p> <p data-svelte-h="svelte-1xqw8re">هنا أول دالة معالجة مسبقة لربط قائمة السلاسل لكل مثال ومعالجة النتيجة:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">preprocess_function</span>(<span class="hljs-params">examples</span>): | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> tokenizer([<span class="hljs-string">" "</span>.join(x) <span class="hljs-keyword">for</span> x <span class="hljs-keyword">in</span> examples[<span class="hljs-string">"answers.text"</span>]])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1q90avs">لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم الدالة 🤗 Datasets <code>map</code>. يمكنك تسريع دالة <code>map</code> عن طريق تعيين <code>batched=True</code> لمعالجة عدة عناصر في وقت واحد، وزيادة عدد العمليات باستخدام <code>num_proc</code>. احذف أي أعمدة غير ضرورية:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>tokenized_eli5 = eli5.<span class="hljs-built_in">map</span>( | |
| <span class="hljs-meta">... </span> preprocess_function, | |
| <span class="hljs-meta">... </span> batched=<span class="hljs-literal">True</span>, | |
| <span class="hljs-meta">... </span> num_proc=<span class="hljs-number">4</span>, | |
| <span class="hljs-meta">... </span> remove_columns=eli5[<span class="hljs-string">"train"</span>].column_names, | |
| <span class="hljs-meta">... </span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1o9vmyd">تحتوي مجموعة البيانات هذه على تسلسلات رمزية، ولكن بعضها أطول من الطول الأقصى للمدخلات للنموذج.</p> <p data-svelte-h="svelte-16ind4">يمكنك الآن استخدام دالة معالجة مسبقة ثانية لـ:</p> <ul data-svelte-h="svelte-1sugnmu"><li>تجميع جميع التسلسلات</li> <li>تقسيم التسلسلات المجمّعة إلى أجزاء أقصر محددة بـ <code>block_size</code>، والتي يجب أن تكون أقصر من الحد الأقصى لطول المدخلات ومناسبة لذاكرة GPU.</li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>block_size = <span class="hljs-number">128</span> | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">group_texts</span>(<span class="hljs-params">examples</span>): | |
| <span class="hljs-meta">... </span> <span class="hljs-comment"># تجميع جميع النصوص.</span> | |
| <span class="hljs-meta">... </span> concatenated_examples = {k: <span class="hljs-built_in">sum</span>(examples[k], []) <span class="hljs-keyword">for</span> k <span class="hljs-keyword">in</span> examples.keys()} | |
| <span class="hljs-meta">... </span> total_length = <span class="hljs-built_in">len</span>(concatenated_examples[<span class="hljs-built_in">list</span>(examples.keys())[<span class="hljs-number">0</span>]]) | |
| <span class="hljs-meta">... </span> <span class="hljs-comment"># نتجاهل الجزء المتبقي الصغير، يمكننا إضافة الحشو إذا كان النموذج يدعمه بدلاً من هذا الإسقاط، يمكنك</span> | |
| <span class="hljs-meta">... </span> <span class="hljs-comment"># تخصيص هذا الجزء حسب احتياجاتك.</span> | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">if</span> total_length >= block_size: | |
| <span class="hljs-meta">... </span> total_length = (total_length // block_size) * block_size | |
| <span class="hljs-meta">... </span> <span class="hljs-comment"># تقسيمها إلى أجزاء بحجم block_size.</span> | |
| <span class="hljs-meta">... </span> result = { | |
| <span class="hljs-meta">... </span> k: [t[i : i + block_size] <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">0</span>, total_length, block_size)] | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> k, t <span class="hljs-keyword">in</span> concatenated_examples.items() | |
| <span class="hljs-meta">... </span> } | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> result<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-pr861r">طبق دالة <code>group_texts</code> على مجموعة البيانات بأكملها:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>lm_dataset = tokenized_eli5.<span class="hljs-built_in">map</span>(group_texts, batched=<span class="hljs-literal">True</span>, num_proc=<span class="hljs-number">4</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1kd6cis">الآن، قم بإنشاء دفعة من الأمثلة باستخدام <code>DataCollatorForLanguageModeling</code>. من الأكثر كفاءة أن تقوم بـ <em>الحشو الديناميكي</em> ليصل طولها إلى أطول جملة في الدفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الطول الأقصى.</p> <p data-svelte-h="svelte-1p8uifc">استخدم رمز نهاية التسلسل كرمز الحشو وحدد <code>mlm_probability</code> لحجب الرموز عشوائياً كل مرة تكرر فيها البيانات:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> DataCollatorForLanguageModeling | |
| <span class="hljs-meta">>>> </span>tokenizer.pad_token = tokenizer.eos_token | |
| <span class="hljs-meta">>>> </span>data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=<span class="hljs-number">0.15</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="التدريب-train" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#التدريب-train"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>التدريب (Train)</span></h2> <blockquote class="tip"><p data-svelte-h="svelte-qu9q7x">إذا لم تكن على دراية بتعديل نموذج باستخدام <code>Trainer</code>, ألق نظرة على الدليل الأساسي <a href="../training#train-with-pytorch-trainer">هنا</a>!</p></blockquote> <p data-svelte-h="svelte-ft3hal">أنت مستعد الآن لبدء تدريب نموذجك! قم بتحميل DistilRoBERTa باستخدام <code>AutoModelForMaskedLM</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForMaskedLM | |
| <span class="hljs-meta">>>> </span>model = AutoModelForMaskedLM.from_pretrained(<span class="hljs-string">"distilbert/distilroberta-base"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1taxt6y">في هذه المرحلة، تبقى ثلاث خطوات فقط:</p> <ol data-svelte-h="svelte-urto3u"><li>حدد معلمات التدريب الخاصة بك في <code>TrainingArguments</code>. المعلمة الوحيدة المطلوبة هي <code>output_dir</code> والتي تحدد مكان حفظ نموذجك. ستقوم بدفع هذا النموذج إلى Hub عن طريق تعيين <code>push_to_hub=True</code> (يجب أن تكون مسجلاً الدخول إلى Hugging Face لتحميل نموذجك).</li> <li>قم بتمرير معلمات التدريب إلى <code>Trainer</code> مع النموذج، ومجموعات البيانات، ومجمّع البيانات.</li> <li>قم باستدعاء <code>train()</code> لتعديل نموذجك.</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>training_args = TrainingArguments( | |
| <span class="hljs-meta">... </span> output_dir=<span class="hljs-string">"my_awesome_eli5_mlm_model"</span>, | |
| <span class="hljs-meta">... </span> eval_strategy=<span class="hljs-string">"epoch"</span>, | |
| <span class="hljs-meta">... </span> learning_rate=<span class="hljs-number">2e-5</span>, | |
| <span class="hljs-meta">... </span> num_train_epochs=<span class="hljs-number">3</span>, | |
| <span class="hljs-meta">... </span> weight_decay=<span class="hljs-number">0.01</span>, | |
| <span class="hljs-meta">... </span> push_to_hub=<span class="hljs-literal">True</span>, | |
| <span class="hljs-meta">... </span>) | |
| <span class="hljs-meta">>>> </span>trainer = Trainer( | |
| <span class="hljs-meta">... </span> model=model, | |
| <span class="hljs-meta">... </span> args=training_args, | |
| <span class="hljs-meta">... </span> train_dataset=lm_dataset[<span class="hljs-string">"train"</span>], | |
| <span class="hljs-meta">... </span> eval_dataset=lm_dataset[<span class="hljs-string">"test"</span>], | |
| <span class="hljs-meta">... </span> data_collator=data_collator, | |
| <span class="hljs-meta">... </span> tokenizer=tokenizer, | |
| <span class="hljs-meta">... </span>) | |
| <span class="hljs-meta">>>> </span>trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-165oe5t">بمجرد اكتمال التدريب، استخدم طريقة <code>evaluate()</code> لتقييم النموذج والحصول على مقياس | |
| الحيرة:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> math | |
| <span class="hljs-meta">>>> </span>eval_results = trainer.evaluate() | |
| <span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(<span class="hljs-string">f"Perplexity: <span class="hljs-subst">{math.exp(eval_results[<span class="hljs-string">'eval_loss'</span>]):<span class="hljs-number">.2</span>f}</span>"</span>) | |
| Perplexity: <span class="hljs-number">8.76</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1g4l0nl">ثم شارك نموذجك على Hub باستخدام طريقة <code>push_to_hub()</code> حتى يتمكن الجميع من استخدام نموذجك:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>trainer.push_to_hub()<!-- HTML_TAG_END --></pre></div> <blockquote class="tip"><p data-svelte-h="svelte-hv9ctw">لمثال أكثر تفصيلاً حول كيفية تعديل نموذج للنمذجة اللغوية المقنعة، ألق نظرة على الدفتر المقابل | |
| <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb" rel="nofollow">دفتر PyTorch</a> | |
| أو <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb" rel="nofollow">دفتر TensorFlow</a>.</p></blockquote> <h2 class="relative group"><a id="الاستدلال" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#الاستدلال"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>الاستدلال</span></h2> <p data-svelte-h="svelte-ltiwph">رائع، الآن بعد أن قمت بتعديل نموذج، يمكنك استخدامه للاستدلال!</p> <p data-svelte-h="svelte-1tcacui">جهّز بعض النصوص التي تريد أن يملأ النموذج الفراغات فيها، واستخدم الرمز الخاص <code><mask></code> للإشارة إلى الفراغ:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>text = <span class="hljs-string">"The Milky Way is a <mask> galaxy."</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1deksa0">أبسط طريقة لتجربة نموذجك المعدل للاستدلال هي استخدامه في <code>pipeline()</code>. قم بإنشاء كائن <code>pipeline</code> لملء الفراغ مع نموذجك، ومرر نصك إليه. إذا أردت، يمكنك استخدام معلمة <code>top_k</code> لتحديد عدد التنبؤات التي تريد إرجاعها:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline | |
| <span class="hljs-meta">>>> </span>mask_filler = pipeline(<span class="hljs-string">"fill-mask"</span>, <span class="hljs-string">"username/my_awesome_eli5_mlm_model"</span>) | |
| <span class="hljs-meta">>>> </span>mask_filler(text, top_k=<span class="hljs-number">3</span>) | |
| [{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.5150994658470154</span>, | |
| <span class="hljs-string">'token'</span>: <span class="hljs-number">21300</span>, | |
| <span class="hljs-string">'token_str'</span>: <span class="hljs-string">' spiral'</span>, | |
| <span class="hljs-string">'sequence'</span>: <span class="hljs-string">'The Milky Way is a spiral galaxy.'</span>}, | |
| {<span class="hljs-string">'score'</span>: <span class="hljs-number">0.07087188959121704</span>, | |
| <span class="hljs-string">'token'</span>: <span class="hljs-number">2232</span>, | |
| <span class="hljs-string">'token_str'</span>: <span class="hljs-string">' massive'</span>, | |
| <span class="hljs-string">'sequence'</span>: <span class="hljs-string">'The Milky Way is a massive galaxy.'</span>}, | |
| {<span class="hljs-string">'score'</span>: <span class="hljs-number">0.06434620916843414</span>, | |
| <span class="hljs-string">'token'</span>: <span class="hljs-number">650</span>, | |
| <span class="hljs-string">'token_str'</span>: <span class="hljs-string">' small'</span>, | |
| <span class="hljs-string">'sequence'</span>: <span class="hljs-string">'The Milky Way is a small galaxy.'</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-2b5lii">قم بتجزئة النص وإرجاع <code>input_ids</code> كمتجهات PyTorch. ستحتاج أيضًا إلى تحديد موضع رمز <code><mask></code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer | |
| <span class="hljs-meta">>>> </span>tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"username/my_awesome_eli5_mlm_model"</span>) | |
| <span class="hljs-meta">>>> </span>inputs = tokenizer(text, return_tensors=<span class="hljs-string">"pt"</span>) | |
| <span class="hljs-meta">>>> </span>mask_token_index = torch.where(inputs[<span class="hljs-string">"input_ids"</span>] == tokenizer.mask_token_id)[<span class="hljs-number">1</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-daaelq">قم بتمرير المدخلات إلى النموذج وإرجاع <code>logits</code> للرمز المقنع:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForMaskedLM | |
| <span class="hljs-meta">>>> </span>model = AutoModelForMaskedLM.from_pretrained(<span class="hljs-string">"username/my_awesome_eli5_mlm_model"</span>) | |
| <span class="hljs-meta">>>> </span>logits = model(**inputs).logits | |
| <span class="hljs-meta">>>> </span>mask_token_logits = logits[<span class="hljs-number">0</span>, mask_token_index, :]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ubzmox">ثم قم بإرجاع الرموز الثلاثة المقنعة ذات الاحتمالية الأعلى وطباعتها:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>top_3_tokens = torch.topk(mask_token_logits, <span class="hljs-number">3</span>, dim=<span class="hljs-number">1</span>).indices[<span class="hljs-number">0</span>].tolist() | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">for</span> token <span class="hljs-keyword">in</span> top_3_tokens: | |
| <span class="hljs-meta">... </span> <span class="hljs-built_in">print</span>(text.replace(tokenizer.mask_token, tokenizer.decode([token]))) | |
| The Milky Way <span class="hljs-keyword">is</span> a spiral galaxy. | |
| The Milky Way <span class="hljs-keyword">is</span> a massive galaxy. | |
| The Milky Way <span class="hljs-keyword">is</span> a small galaxy.<!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/ar/tasks/masked_language_modeling.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_1lnyyvn = { | |
| assets: "/docs/transformers/main/ar", | |
| base: "/docs/transformers/main/ar", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/transformers/main/ar/_app/immutable/entry/start.6e5b536e.js"), | |
| import("/docs/transformers/main/ar/_app/immutable/entry/app.57820f07.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 38], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 66.6 kB
- Xet hash:
- b4f1d5ad9945b5a393cff24b33731f2c4703c8bed85a8f9d554d477555bf8d89
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.