Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Fare il debug di una training pipeline","local":"fare-il-debug-di-una-training-pipeline","sections":[{"title":"Debugging the training pipeline","local":"debugging-the-training-pipeline","sections":[{"title":"Controllare i dati","local":"controllare-i-dati","sections":[],"depth":3},{"title":"Controllare il modello","local":"controllare-il-modello","sections":[],"depth":3},{"title":"Controllare gli iperparametri","local":"controllare-gli-iperparametri","sections":[],"depth":3}],"depth":2},{"title":"Altri potenziali problemi","local":"altri-potenziali-problemi","sections":[{"title":"Gestire gli errori out-of-memory","local":"gestire-gli-errori-out-of-memory","sections":[],"depth":3},{"title":"TensorFlow è molto affamato 🦛","local":"tensorflow-è-molto-affamato-","sections":[],"depth":3},{"title":"Check your data (again!)","local":"check-your-data-again","sections":[],"depth":3},{"title":"Fare overfitting del modello su un batch","local":"fare-overfitting-del-modello-su-un-batch","sections":[],"depth":3},{"title":"Non calibrare niente prima di avere una prima baseline","local":"non-calibrare-niente-prima-di-avere-una-prima-baseline","sections":[],"depth":3},{"title":"Chiedere aiuto","local":"chiedere-aiuto","sections":[],"depth":3}],"depth":2}],"depth":1}"> | |
| <link href="/docs/course/pr_1069/it/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/entry/start.693d748d.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/scheduler.37c15a92.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/singletons.60b4c7a2.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/index.18351ede.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/paths.43b6516c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/entry/app.e9cfd099.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/index.2bf4358c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/nodes/0.bb8a536c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/nodes/45.fb84f5d6.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/Tip.363c041f.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/Youtube.1e50a667.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/CodeBlock.4e987730.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/CourseFloatingBanner.9ff4c771.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/DocNotebookDropdown.efc1fb7c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/FrameworkSwitchCourse.8d4d4ab6.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/getInferenceSnippets.24b50994.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Fare il debug di una training pipeline","local":"fare-il-debug-di-una-training-pipeline","sections":[{"title":"Debugging the training pipeline","local":"debugging-the-training-pipeline","sections":[{"title":"Controllare i dati","local":"controllare-i-dati","sections":[],"depth":3},{"title":"Controllare il modello","local":"controllare-il-modello","sections":[],"depth":3},{"title":"Controllare gli iperparametri","local":"controllare-gli-iperparametri","sections":[],"depth":3}],"depth":2},{"title":"Altri potenziali problemi","local":"altri-potenziali-problemi","sections":[{"title":"Gestire gli errori out-of-memory","local":"gestire-gli-errori-out-of-memory","sections":[],"depth":3},{"title":"TensorFlow è molto affamato 🦛","local":"tensorflow-è-molto-affamato-","sections":[],"depth":3},{"title":"Check your data (again!)","local":"check-your-data-again","sections":[],"depth":3},{"title":"Fare overfitting del modello su un batch","local":"fare-overfitting-del-modello-su-un-batch","sections":[],"depth":3},{"title":"Non calibrare niente prima di avere una prima baseline","local":"non-calibrare-niente-prima-di-avere-una-prima-baseline","sections":[],"depth":3},{"title":"Chiedere aiuto","local":"chiedere-aiuto","sections":[],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="bg-white leading-none border border-gray-100 rounded-lg flex p-0.5 w-56 text-sm mb-4"><a class="flex justify-center flex-1 py-1.5 px-2.5 focus:outline-none !no-underline rounded-l bg-red-50 dark:bg-transparent text-red-600" href="?fw=pt"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><defs><clipPath id="a"><rect x="3.05" y="0.5" width="25.73" height="31" fill="none"></rect></clipPath></defs><g clip-path="url(#a)"><path d="M24.94,9.51a12.81,12.81,0,0,1,0,18.16,12.68,12.68,0,0,1-18,0,12.81,12.81,0,0,1,0-18.16l9-9V5l-.84.83-6,6a9.58,9.58,0,1,0,13.55,0ZM20.44,9a1.68,1.68,0,1,1,1.67-1.67A1.68,1.68,0,0,1,20.44,9Z" fill="#ee4c2c"></path></g></svg> Pytorch </a><a class="flex justify-center flex-1 py-1.5 px-2.5 focus:outline-none !no-underline rounded-r text-gray-500 filter grayscale" href="?fw=tf"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="0.94em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 274"><path d="M145.726 42.065v42.07l72.861 42.07v-42.07l-72.86-42.07zM0 84.135v42.07l36.43 21.03V105.17L0 84.135zm109.291 21.035l-36.43 21.034v126.2l36.43 21.035v-84.135l36.435 21.035v-42.07l-36.435-21.034V105.17z" fill="#E55B2D"></path><path d="M145.726 42.065L36.43 105.17v42.065l72.861-42.065v42.065l36.435-21.03v-84.14zM255.022 63.1l-36.435 21.035v42.07l36.435-21.035V63.1zm-72.865 84.135l-36.43 21.035v42.07l36.43-21.036v-42.07zm-36.43 63.104l-36.436-21.035v84.135l36.435-21.035V210.34z" fill="#ED8E24"></path><path d="M145.726 0L0 84.135l36.43 21.035l109.296-63.105l72.861 42.07L255.022 63.1L145.726 0zm0 126.204l-36.435 21.03l36.435 21.036l36.43-21.035l-36.43-21.03z" fill="#F8BF3C"></path></svg> TensorFlow </a></div> <h1 class="relative group"><a id="fare-il-debug-di-una-training-pipeline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fare-il-debug-di-una-training-pipeline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Fare il debug di una training pipeline</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-8-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/it/chapter8/section4_tf.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/it/chapter8/section4_tf.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-11weten">Hai scritto un bello script per addestrare o affinare un modello su un determinato compito, seguendo scrupolosamente i consigli del <a href="/course/chapter7">Capitolo 7</a>. Ma quando lanci il comando <code>model.fit()</code>, succede qualcosa di orribile: si ottiene un errore 😱! O peggio, tutto sembra andare bene e il training viene eseguito senza errori, ma il modello che ne risulta fa schifo. In questa sezione mostreremo cosa è possibile fare per eseguire il debug di questo tipo di problemi.</p> <h2 class="relative group"><a id="debugging-the-training-pipeline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#debugging-the-training-pipeline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Debugging the training pipeline</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/N9kO52itd0Q" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-60gr4a">Il problema quando si ha un errore da <code>model.fit()</code> è che potrebbe provenire da più fonti, poichè la fase di training di solito mette insieme molte cose su cui si è lavorato fino a quel momento. Il problema potrebbe essere qualcosa di sbagliato nel tuo dataset, o qualche problema nel provare a raggruppare in un batch elementi del dataset. E anche se tutto va bene per il training, qualcosa potrebbe andare storto durante la valutazione se c’è un problema con la metrica selezionata.</p> <p data-svelte-h="svelte-11uchwo">Il modo migliore per eseguire il debug di un errore che si verifica in <code>model.fit()</code> è quello di esaminare manualmente l’intera pipeline per vedere dove le cose sono andate storte. L’errore è spesso molto facile da risolvere.</p> <p data-svelte-h="svelte-n2uzsr">Per dimostrarlo, useremo il seguente script che ha lo scopo di affinare un modello DistilBERT sul <a href="https://huggingface.co/datasets/glue" rel="nofollow">dataset MNLI</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset, load_metric | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> ( | |
| AutoTokenizer, | |
| TFAutoModelForSequenceClassification, | |
| ) | |
| raw_datasets = load_dataset(<span class="hljs-string">"glue"</span>, <span class="hljs-string">"mnli"</span>) | |
| model_checkpoint = <span class="hljs-string">"distilbert-base-uncased"</span> | |
| tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">preprocess_function</span>(<span class="hljs-params">examples</span>): | |
| <span class="hljs-keyword">return</span> tokenizer(examples[<span class="hljs-string">"premise"</span>], examples[<span class="hljs-string">"hypothesis"</span>], truncation=<span class="hljs-literal">True</span>) | |
| tokenized_datasets = raw_datasets.<span class="hljs-built_in">map</span>(preprocess_function, batched=<span class="hljs-literal">True</span>) | |
| train_dataset = tokenized_datasets[<span class="hljs-string">"train"</span>].to_tf_dataset( | |
| columns=[<span class="hljs-string">"input_ids"</span>, <span class="hljs-string">"labels"</span>], batch_size=<span class="hljs-number">16</span>, shuffle=<span class="hljs-literal">True</span> | |
| ) | |
| validation_dataset = tokenized_datasets[<span class="hljs-string">"validation_matched"</span>].to_tf_dataset( | |
| columns=[<span class="hljs-string">"input_ids"</span>, <span class="hljs-string">"labels"</span>], batch_size=<span class="hljs-number">16</span>, shuffle=<span class="hljs-literal">True</span> | |
| ) | |
| model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint) | |
| model.<span class="hljs-built_in">compile</span>(loss=<span class="hljs-string">"sparse_categorical_crossentropy"</span>, optimizer=<span class="hljs-string">"adam"</span>) | |
| model.fit(train_dataset)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-2xy7hg">Se si tenta di eseguirlo, si potrebbero riscontrare alcuni <code>VisibleDeprecationWarning</code> durante la conversione del dataset — si tratta di un problema UX noto, quindi si prega di ignorarlo. Se stai leggendo il corso dopo, diciamo, novembre 2021 e il problema si ripresenta ancora, invia dei tweet di disappunto a @carrigmat finché non lo risolve.</p> <p data-svelte-h="svelte-ecj6vt">Il problema più grave, però, è che riceviamo un vero e proprio errore. Ed è davvero terribilmente lungo:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->ValueError: No gradients provided <span class="hljs-keyword">for</span> <span class="hljs-built_in">any</span> variable: [<span class="hljs-string">'tf_distil_bert_for_sequence_classification/distilbert/embeddings/word_embeddings/weight:0'</span>, <span class="hljs-string">'...'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-4thtzp">Che cosa significa? Abbiamo provato ad dare training sui nostri dati, ma non abbiamo ottenuto alcun gradiente? Questo è piuttosto preoccupante; come possiamo iniziare a fare il debug di una cosa del genere? Quando l’errore che si ottiene non suggerisce immediatamente dove sia il problema, la soluzione migliore è spesso quella di procedere in ordine, assicurandosi in ogni fase che tutto sia corretto. Naturalmente, il punto di partenza è sempre…</p> <h3 class="relative group"><a id="controllare-i-dati" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#controllare-i-dati"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Controllare i dati</span></h3> <p data-svelte-h="svelte-6p0e69">Non c’è bisogno di dirlo, ma se i dati sono danneggiati, Keras non sarà in grado di risolverli per te. Quindi, per prima cosa, è necessario dare un’occhiata a cosa c’è nel training set.</p> <p data-svelte-h="svelte-1ijy5ig">Anche se c’è la tentazione di guardare in <code>raw_datasets</code> e <code>tokenized_datasets</code>, si consiglia vivamente di esaminare i dati proprio nel punto in cui entrano nel modello. Ciò significa leggere un output dal <code>tf.data.Dataset</code> creato con la funzione <code>to_tf_dataset()</code>! Come si fa? Gli oggetti <code>tf.data.Dataset</code> ci forniscono volta per volta interi batch e non supportano l’indicizzazione, quindi non possiamo semplicemente chiedere <code>train_dataset[0]</code>. Possiamo però chiedere gentilmente un batch:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> batch <span class="hljs-keyword">in</span> train_dataset: | |
| <span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-187qrpj"><code>break</code> termina il ciclo dopo un’iterazione, quindi prende il primo batch che esce da <code>train_dataset</code> e lo salva come <code>batch</code>. Ora, diamo un’occhiata a ciò che c’è dentro:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">'attention_mask'</span>: <tf.Tensor: shape=(<span class="hljs-number">16</span>, <span class="hljs-number">76</span>), dtype=int64, numpy= | |
| array([[<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, ..., <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| [<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, ..., <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| [<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, ..., <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| ..., | |
| [<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, ..., <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>], | |
| [<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, ..., <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| [<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, ..., <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>]])>, | |
| <span class="hljs-string">'label'</span>: <tf.Tensor: shape=(<span class="hljs-number">16</span>,), dtype=int64, numpy=array([<span class="hljs-number">0</span>, <span class="hljs-number">2</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">2</span>, <span class="hljs-number">1</span>])>, | |
| <span class="hljs-string">'input_ids'</span>: <tf.Tensor: shape=(<span class="hljs-number">16</span>, <span class="hljs-number">76</span>), dtype=int64, numpy= | |
| array([[ <span class="hljs-number">101</span>, <span class="hljs-number">2174</span>, <span class="hljs-number">1010</span>, ..., <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| [ <span class="hljs-number">101</span>, <span class="hljs-number">3174</span>, <span class="hljs-number">2420</span>, ..., <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| [ <span class="hljs-number">101</span>, <span class="hljs-number">2044</span>, <span class="hljs-number">2048</span>, ..., <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| ..., | |
| [ <span class="hljs-number">101</span>, <span class="hljs-number">3398</span>, <span class="hljs-number">3398</span>, ..., <span class="hljs-number">2051</span>, <span class="hljs-number">2894</span>, <span class="hljs-number">102</span>], | |
| [ <span class="hljs-number">101</span>, <span class="hljs-number">1996</span>, <span class="hljs-number">4124</span>, ..., <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| [ <span class="hljs-number">101</span>, <span class="hljs-number">1999</span>, <span class="hljs-number">2070</span>, ..., <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>]])>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-z1fm88">Sembra corretto, non è vero? Stiamo passando al modello le <code>labels</code>, le <code>attention_mask</code> e gli <code>input_ids</code>, che dovrebbero essere tutto ciò di cui ha bisogno per calcolare gli output e la loss (<em>funzione di perdita</em>). Perché non abbiamo un gradiente? Guarda meglio: stiamo passando un singolo dizionario come input, ma un batch di addestramento è di solito un tensore o un dizionario di input, più un tensore di label. Le label sono solo una chiave del dizionario di input.</p> <p data-svelte-h="svelte-1aa56se">È un problema? Non sempre, in realtà! Ma è uno dei problemi più comuni che si incontrano quando si addestrano modelli Transformer con TensorFlow. Tutti i nostri modelli possono calcolare la loss internamente, ma per farlo è necessario passare le label nel dizionario di input. Questa è la funzione che viene utilizzata quando non si specifica un valore di loss in <code>compile()</code>. Keras, invece, di solito si aspetta che le label siano passate separatamente dal dizionario di input e le computazioni delle loss di solito falliscono se non lo si fa.</p> <p data-svelte-h="svelte-1dy45j8">Il problema è ora più chiaro: abbiamo usato un argomento <code>loss</code>, il che significa che stiamo chiedendo a Keras di calcolare le loss per noi, ma abbiamo passato le nostre label come input al modello, non come label nel posto in cui Keras se le aspetta! Dobbiamo scegliere l’una o l’altra soluzione: o usiamo la funzione interna del modello e manteniamo le label dove sono, oppure continuiamo a usare le loss di Keras, ma spostiamo le label nel punto in cui Keras se le aspetta. Per semplicità, adottiamo il primo approccio. Cambia la chiamata a <code>compile()</code> in:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model.<span class="hljs-built_in">compile</span>(optimizer=<span class="hljs-string">"adam"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-s74rdr">Ora utilizzeremo la funzione di perdita interna del modello e il problema dovrebbe essere risolto!</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-6x73e">✏️ <strong>Prova tu!</strong> Come sfida opzionale, dopo aver risolto gli altri problemi, puoi provare a tornare a questo passaggio e a far funzionare il modello con la loss originale calcolata da Keras invece che con la loss interna. È necessario aggiungere <code>"labels"</code> all’argomento <code>label_cols</code> di <code>to_tf_dataset()</code> per assicurarsi che le label siano fornite correttamente, in modo da ottenere i gradienti, ma c’è un altro problema con la loss che abbiamo specificato. L’addestramento continuerà a funzionare con questo problema, ma l’apprendimento sarà molto lento e si bloccherà a una loss di addestramento elevata. Riesci a capire di cosa si tratta?</p> <p data-svelte-h="svelte-pgt8la">Un suggerimento codificato in ROT13, se sei bloccato/a: Vs lbh ybbx ng gur bhgchgf bs FrdhraprPynffvsvpngvba zbqryf va Genafsbezref, gurve svefg bhgchg vf <code>ybtvgf</code>. Jung ner ybtvgf?</p> <p data-svelte-h="svelte-r9rm6w">E un secondo indizio: Jura lbh fcrpvsl bcgvzvmref, npgvingvbaf be ybffrf jvgu fgevatf, Xrenf frgf nyy gur nethzrag inyhrf gb gurve qrsnhygf. Jung nethzragf qbrf FcnefrPngrtbevpnyPebffragebcl unir, naq jung ner gurve qrsnhygf?</p></div> <p data-svelte-h="svelte-brl45p">Ora proviamo ad avviare l’addestramento. Ora dovremmo ottenere i gradienti, quindi, se tutto va bene (musica minacciosa), possiamo chiamare <code>model.fit()</code> e tutto funzionerà bene!</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --> <span class="hljs-number">246</span>/<span class="hljs-number">24543</span> [..............................] - ETA: <span class="hljs-number">15</span>:<span class="hljs-number">52</span> - loss: nan<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1skddao">Oh no.</p> <p data-svelte-h="svelte-nvfb18"><code>nan</code> non è un valore di loss molto incoraggiante. Tuttavia, abbiamo controllato i nostri dati e sembrano abbastanza buoni. Se il problema non è questo, come possiamo procedere? Il passo successivo più ovvio è…</p> <h3 class="relative group"><a id="controllare-il-modello" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#controllare-il-modello"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Controllare il modello</span></h3> <p data-svelte-h="svelte-rzy1rq"><code>model.fit()</code> è un’ottima funzionionalità di Keras, ma fa un sacco di cose per te e questo può rendere più difficile trovare esattamente dove si è generato un problema. Se stai facendo il debug del modello, una strategia che può essere molto utile è quella di passare un solo batch al modello e di esaminare in dettaglio gli output di quel batch. Un altro suggerimento molto utile se il modello produce errori è quello di <code>compile()</code> il modello con <code>run_eagerly=True</code>. Questo lo renderà molto più lento, ma renderà i messaggi di errore molto più comprensibili, perché indicheranno esattamente in quale punto del codice del modello si è verificato il problema.</p> <p data-svelte-h="svelte-17ghy6r">Per ora, però, non abbiamo bisogno di <code>run_eagerly</code>. Passiamo il <code>batch</code> che abbiamo ottenuto prima attraverso il modello e vediamo come sono gli output:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model(batch)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->TFSequenceClassifierOutput(loss=<tf.Tensor: shape=(<span class="hljs-number">16</span>,), dtype=float32, numpy= | |
| array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, | |
| nan, nan, nan], dtype=float32)>, logits=<tf.Tensor: shape=(<span class="hljs-number">16</span>, <span class="hljs-number">2</span>), dtype=float32, numpy= | |
| array([[nan, nan], | |
| [nan, nan], | |
| [nan, nan], | |
| [nan, nan], | |
| [nan, nan], | |
| [nan, nan], | |
| [nan, nan], | |
| [nan, nan], | |
| [nan, nan], | |
| [nan, nan], | |
| [nan, nan], | |
| [nan, nan], | |
| [nan, nan], | |
| [nan, nan], | |
| [nan, nan], | |
| [nan, nan]], dtype=float32)>, hidden_states=<span class="hljs-literal">None</span>, attentions=<span class="hljs-literal">None</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jmilq2">Beh, questo è insidioso. Tutto è <code>nan</code>! Ma è strano, non è vero? Come farebbero tutti i nostri logit a diventare <code>nan</code>? <code>nan</code> significa “not a number” (<em>“non un numero”</em>). I valori <code>nan</code> si verificano spesso quando si esegue un’operazione vietata, come la divisione per zero. Ma una cosa molto importante da sapere su <code>nan</code> in machine learning è che questo valore tende a <em>propagarsi</em>. Se si moltiplica un numero per <code>nan</code>, anche il risultato sarà <code>nan</code>. E se si ottiene un <code>nan</code> in un punto qualsiasi dell’output, della loss o del gradiente, questo si diffonderà rapidamente in tutto il modello, perché quando quel valore <code>nan</code> si propagherà attraverso la rete, si otterranno gradienti <code>nan</code>, e quando gli aggiornamenti dei pesi saranno calcolati con quei gradienti, si otterranno pesi <code>nan</code>, e quei pesi calcoleranno ancora più output <code>nan</code>! Presto l’intera rete sarà solo un grande blocco di <code>nan</code>. Una volta che ciò accade, è piuttosto difficile capire dove sia iniziato il problema. Come possiamo isolare il punto in cui <code>nan</code> si è insinuato per la prima volta?</p> <p data-svelte-h="svelte-1gqndqo">La risposta è provare a <em>reinizializzare</em> il nostro modello. Una volta iniziato l’addestramento, abbiamo avuto un <code>nan</code> da qualche parte e questo si è rapidamente propagato all’intero modello. Quindi, carichiamo il modello da un checkpoint e non eseguiamo alcun aggiornamento dei pesi, e vediamo dove otteniamo un valore <code>nan</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint) | |
| model(batch)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-6uaa5z">Quando lo si esegue, si ottiene:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->TFSequenceClassifierOutput(loss=<tf.Tensor: shape=(<span class="hljs-number">16</span>,), dtype=float32, numpy= | |
| array([<span class="hljs-number">0.6844486</span> , nan, nan, <span class="hljs-number">0.67127866</span>, <span class="hljs-number">0.7068601</span> , | |
| nan, <span class="hljs-number">0.69309855</span>, nan, <span class="hljs-number">0.65531296</span>, nan, | |
| nan, nan, <span class="hljs-number">0.675402</span> , nan, nan, | |
| <span class="hljs-number">0.69831556</span>], dtype=float32)>, logits=<tf.Tensor: shape=(<span class="hljs-number">16</span>, <span class="hljs-number">2</span>), dtype=float32, numpy= | |
| array([[-<span class="hljs-number">0.04761693</span>, -<span class="hljs-number">0.06509043</span>], | |
| [-<span class="hljs-number">0.0481936</span> , -<span class="hljs-number">0.04556257</span>], | |
| [-<span class="hljs-number">0.0040929</span> , -<span class="hljs-number">0.05848458</span>], | |
| [-<span class="hljs-number">0.02417453</span>, -<span class="hljs-number">0.0684005</span> ], | |
| [-<span class="hljs-number">0.02517801</span>, -<span class="hljs-number">0.05241832</span>], | |
| [-<span class="hljs-number">0.04514256</span>, -<span class="hljs-number">0.0757378</span> ], | |
| [-<span class="hljs-number">0.02656011</span>, -<span class="hljs-number">0.02646275</span>], | |
| [ <span class="hljs-number">0.00766164</span>, -<span class="hljs-number">0.04350497</span>], | |
| [ <span class="hljs-number">0.02060014</span>, -<span class="hljs-number">0.05655622</span>], | |
| [-<span class="hljs-number">0.02615328</span>, -<span class="hljs-number">0.0447021</span> ], | |
| [-<span class="hljs-number">0.05119278</span>, -<span class="hljs-number">0.06928903</span>], | |
| [-<span class="hljs-number">0.02859691</span>, -<span class="hljs-number">0.04879177</span>], | |
| [-<span class="hljs-number">0.02210129</span>, -<span class="hljs-number">0.05791225</span>], | |
| [-<span class="hljs-number">0.02363213</span>, -<span class="hljs-number">0.05962167</span>], | |
| [-<span class="hljs-number">0.05352269</span>, -<span class="hljs-number">0.0481673</span> ], | |
| [-<span class="hljs-number">0.08141848</span>, -<span class="hljs-number">0.07110836</span>]], dtype=float32)>, hidden_states=<span class="hljs-literal">None</span>, attentions=<span class="hljs-literal">None</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-he7wim"><em>Adesso</em> sì che ci capiamo! Non ci sono valori <code>nan</code> nei nostri logit, il che è rassicurante. Ma vediamo alcuni valori <code>nan</code> nella nostra loss! C’è qualcosa in quei campioni in particolare che sta causando questo problema? Vediamo quali sono (nota che se esegui questo codice da solo/a, potresti ottenere indici diversi perché il dataset è stato rimescolato):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np | |
| loss = model(batch).loss.numpy() | |
| indices = np.flatnonzero(np.isnan(loss)) | |
| indices<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->array([ <span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">5</span>, <span class="hljs-number">7</span>, <span class="hljs-number">9</span>, <span class="hljs-number">10</span>, <span class="hljs-number">11</span>, <span class="hljs-number">13</span>, <span class="hljs-number">14</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-16f088a">Visualizziamo i campioni associati a questi indici:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->input_ids = batch[<span class="hljs-string">"input_ids"</span>].numpy() | |
| input_ids[indices]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->array([[ <span class="hljs-number">101</span>, <span class="hljs-number">2007</span>, <span class="hljs-number">2032</span>, <span class="hljs-number">2001</span>, <span class="hljs-number">1037</span>, <span class="hljs-number">16480</span>, <span class="hljs-number">3917</span>, <span class="hljs-number">2594</span>, <span class="hljs-number">4135</span>, | |
| <span class="hljs-number">23212</span>, <span class="hljs-number">3070</span>, <span class="hljs-number">2214</span>, <span class="hljs-number">10170</span>, <span class="hljs-number">1010</span>, <span class="hljs-number">2012</span>, <span class="hljs-number">4356</span>, <span class="hljs-number">1997</span>, <span class="hljs-number">3183</span>, | |
| <span class="hljs-number">6838</span>, <span class="hljs-number">12953</span>, <span class="hljs-number">2039</span>, <span class="hljs-number">2000</span>, <span class="hljs-number">1996</span>, <span class="hljs-number">6147</span>, <span class="hljs-number">1997</span>, <span class="hljs-number">2010</span>, <span class="hljs-number">2606</span>, | |
| <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>, <span class="hljs-number">6838</span>, <span class="hljs-number">2001</span>, <span class="hljs-number">3294</span>, <span class="hljs-number">6625</span>, <span class="hljs-number">3773</span>, <span class="hljs-number">1996</span>, <span class="hljs-number">2214</span>, | |
| <span class="hljs-number">2158</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| [ <span class="hljs-number">101</span>, <span class="hljs-number">1998</span>, <span class="hljs-number">6814</span>, <span class="hljs-number">2016</span>, <span class="hljs-number">2234</span>, <span class="hljs-number">2461</span>, <span class="hljs-number">2153</span>, <span class="hljs-number">1998</span>, <span class="hljs-number">13322</span>, | |
| <span class="hljs-number">2009</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>, <span class="hljs-number">2045</span>, <span class="hljs-number">1005</span>, <span class="hljs-number">1055</span>, <span class="hljs-number">2053</span>, <span class="hljs-number">3382</span>, <span class="hljs-number">2008</span>, | |
| <span class="hljs-number">2016</span>, <span class="hljs-number">1005</span>, <span class="hljs-number">2222</span>, <span class="hljs-number">3046</span>, <span class="hljs-number">8103</span>, <span class="hljs-number">2075</span>, <span class="hljs-number">2009</span>, <span class="hljs-number">2153</span>, <span class="hljs-number">1012</span>, | |
| <span class="hljs-number">102</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| [ <span class="hljs-number">101</span>, <span class="hljs-number">1998</span>, <span class="hljs-number">2007</span>, <span class="hljs-number">1996</span>, <span class="hljs-number">3712</span>, <span class="hljs-number">4634</span>, <span class="hljs-number">1010</span>, <span class="hljs-number">2057</span>, <span class="hljs-number">8108</span>, | |
| <span class="hljs-number">2025</span>, <span class="hljs-number">3404</span>, <span class="hljs-number">2028</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">1996</span>, <span class="hljs-number">2616</span>, <span class="hljs-number">18449</span>, <span class="hljs-number">2125</span>, <span class="hljs-number">1999</span>, | |
| <span class="hljs-number">1037</span>, <span class="hljs-number">9666</span>, <span class="hljs-number">1997</span>, <span class="hljs-number">4100</span>, <span class="hljs-number">8663</span>, <span class="hljs-number">11020</span>, <span class="hljs-number">6313</span>, <span class="hljs-number">2791</span>, <span class="hljs-number">1998</span>, | |
| <span class="hljs-number">2431</span>, <span class="hljs-number">1011</span>, <span class="hljs-number">4301</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>, <span class="hljs-number">2028</span>, <span class="hljs-number">1005</span>, <span class="hljs-number">1055</span>, <span class="hljs-number">5177</span>, | |
| <span class="hljs-number">2110</span>, <span class="hljs-number">1998</span>, <span class="hljs-number">3977</span>, <span class="hljs-number">2000</span>, <span class="hljs-number">2832</span>, <span class="hljs-number">2106</span>, <span class="hljs-number">2025</span>, <span class="hljs-number">2689</span>, <span class="hljs-number">2104</span>, | |
| <span class="hljs-number">2122</span>, <span class="hljs-number">6214</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| [ <span class="hljs-number">101</span>, <span class="hljs-number">1045</span>, <span class="hljs-number">2001</span>, <span class="hljs-number">1999</span>, <span class="hljs-number">1037</span>, <span class="hljs-number">13090</span>, <span class="hljs-number">5948</span>, <span class="hljs-number">2007</span>, <span class="hljs-number">2048</span>, | |
| <span class="hljs-number">2308</span>, <span class="hljs-number">2006</span>, <span class="hljs-number">2026</span>, <span class="hljs-number">5001</span>, <span class="hljs-number">2043</span>, <span class="hljs-number">2026</span>, <span class="hljs-number">2171</span>, <span class="hljs-number">2001</span>, <span class="hljs-number">2170</span>, | |
| <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>, <span class="hljs-number">1045</span>, <span class="hljs-number">2001</span>, <span class="hljs-number">3564</span>, <span class="hljs-number">1999</span>, <span class="hljs-number">2277</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| [ <span class="hljs-number">101</span>, <span class="hljs-number">2195</span>, <span class="hljs-number">4279</span>, <span class="hljs-number">2191</span>, <span class="hljs-number">2039</span>, <span class="hljs-number">1996</span>, <span class="hljs-number">2181</span>, <span class="hljs-number">2124</span>, <span class="hljs-number">2004</span>, | |
| <span class="hljs-number">1996</span>, <span class="hljs-number">2225</span>, <span class="hljs-number">7363</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>, <span class="hljs-number">2045</span>, <span class="hljs-number">2003</span>, <span class="hljs-number">2069</span>, <span class="hljs-number">2028</span>, | |
| <span class="hljs-number">2451</span>, <span class="hljs-number">1999</span>, <span class="hljs-number">1996</span>, <span class="hljs-number">2225</span>, <span class="hljs-number">7363</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| [ <span class="hljs-number">101</span>, <span class="hljs-number">2061</span>, <span class="hljs-number">2008</span>, <span class="hljs-number">1045</span>, <span class="hljs-number">2123</span>, <span class="hljs-number">1005</span>, <span class="hljs-number">1056</span>, <span class="hljs-number">2113</span>, <span class="hljs-number">2065</span>, | |
| <span class="hljs-number">2009</span>, <span class="hljs-number">2428</span>, <span class="hljs-number">10654</span>, <span class="hljs-number">7347</span>, <span class="hljs-number">2030</span>, <span class="hljs-number">2009</span>, <span class="hljs-number">7126</span>, <span class="hljs-number">2256</span>, <span class="hljs-number">2495</span>, | |
| <span class="hljs-number">2291</span>, <span class="hljs-number">102</span>, <span class="hljs-number">2009</span>, <span class="hljs-number">2003</span>, <span class="hljs-number">5094</span>, <span class="hljs-number">2256</span>, <span class="hljs-number">2495</span>, <span class="hljs-number">2291</span>, <span class="hljs-number">2035</span>, | |
| <span class="hljs-number">2105</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| [ <span class="hljs-number">101</span>, <span class="hljs-number">2051</span>, <span class="hljs-number">1010</span>, <span class="hljs-number">2029</span>, <span class="hljs-number">3216</span>, <span class="hljs-number">2019</span>, <span class="hljs-number">2503</span>, <span class="hljs-number">3444</span>, <span class="hljs-number">1010</span>, | |
| <span class="hljs-number">6732</span>, <span class="hljs-number">1996</span>, <span class="hljs-number">2265</span>, <span class="hljs-number">2038</span>, <span class="hljs-number">19840</span>, <span class="hljs-number">2098</span>, <span class="hljs-number">2125</span>, <span class="hljs-number">9906</span>, <span class="hljs-number">1998</span>, | |
| <span class="hljs-number">2003</span>, <span class="hljs-number">2770</span>, <span class="hljs-number">2041</span>, <span class="hljs-number">1997</span>, <span class="hljs-number">4784</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>, <span class="hljs-number">2051</span>, <span class="hljs-number">6732</span>, | |
| <span class="hljs-number">1996</span>, <span class="hljs-number">2265</span>, <span class="hljs-number">2003</span>, <span class="hljs-number">9525</span>, <span class="hljs-number">1998</span>, <span class="hljs-number">4569</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| [ <span class="hljs-number">101</span>, <span class="hljs-number">1996</span>, <span class="hljs-number">10556</span>, <span class="hljs-number">2140</span>, <span class="hljs-number">11515</span>, <span class="hljs-number">2058</span>, <span class="hljs-number">1010</span>, <span class="hljs-number">2010</span>, <span class="hljs-number">2162</span>, | |
| <span class="hljs-number">2252</span>, <span class="hljs-number">5689</span>, <span class="hljs-number">2013</span>, <span class="hljs-number">2010</span>, <span class="hljs-number">7223</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>, <span class="hljs-number">2043</span>, <span class="hljs-number">1996</span>, | |
| <span class="hljs-number">10556</span>, <span class="hljs-number">2140</span>, <span class="hljs-number">11515</span>, <span class="hljs-number">2058</span>, <span class="hljs-number">1010</span>, <span class="hljs-number">2010</span>, <span class="hljs-number">2252</span>, <span class="hljs-number">3062</span>, <span class="hljs-number">2000</span>, | |
| <span class="hljs-number">1996</span>, <span class="hljs-number">2598</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| [ <span class="hljs-number">101</span>, <span class="hljs-number">13543</span>, <span class="hljs-number">1999</span>, <span class="hljs-number">2049</span>, <span class="hljs-number">6143</span>, <span class="hljs-number">2933</span>, <span class="hljs-number">2443</span>, <span class="hljs-number">102</span>, <span class="hljs-number">2025</span>, | |
| <span class="hljs-number">13543</span>, <span class="hljs-number">1999</span>, <span class="hljs-number">6143</span>, <span class="hljs-number">2933</span>, <span class="hljs-number">2003</span>, <span class="hljs-number">2443</span>, <span class="hljs-number">102</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, | |
| <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>]])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jv0hmp">Beh, ci sono tante cose qui dentro, ma non c’è nulla che si distingua come insolito. Diamo un’occhiata alle label:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->labels = batch[<span class="hljs-string">'labels'</span>].numpy() | |
| labels[indices]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->array([<span class="hljs-number">2</span>, <span class="hljs-number">2</span>, <span class="hljs-number">2</span>, <span class="hljs-number">2</span>, <span class="hljs-number">2</span>, <span class="hljs-number">2</span>, <span class="hljs-number">2</span>, <span class="hljs-number">2</span>, <span class="hljs-number">2</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-v38650">I campioni <code>nan</code> hanno tutti la stessa label, ed è la classe 2. Questo è un indizio molto chiaro. Il fatto che si abbia una loss di <code>nan</code> solo quando la label è 2 suggerisce che questo è un ottimo momento per verificare il numero di label nel nostro modello:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model.config.num_labels<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">2</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-u26sdw">Ora vediamo il problema: il modello pensa che ci siano solo due classi, ma le label arrivano a 2, il che significa che in realtà ci sono tre classi (perché anche lo 0 è una classe). Ecco come abbiamo ottenuto un <code>nan</code>: cercando di calcolare la loss per una classe inesistente! Proviamo a cambiare il modello e ad adattarlo di nuovo:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint, <span class="hljs-attribute">num_labels</span>=3) | |
| model.compile(<span class="hljs-attribute">optimizer</span>=<span class="hljs-string">'adam'</span>) | |
| model.fit(train_dataset)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --> <span class="hljs-number">869</span>/<span class="hljs-number">24543</span> [>.............................] - ETA: <span class="hljs-number">15</span>:<span class="hljs-number">29</span> - loss: <span class="hljs-number">1.1032</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-121et73">Staimo addestrando! Non ci sono più <code>nan</code> e la nostra loss sta diminuendo… più o meno. Se la si osserva per un po’, si potrebbe iniziare a spazientirsi, perché il valore della loss rimane ostinatamente alto. Interrompiamo il training e cerchiamo di capire quale potrebbe essere la causa di questo problema. A questo punto, siamo abbastanza sicuri che sia i dati che il modello siano a posto, ma il nostro modello non sta imparando bene. Cos’altro rimane? È ora di…</p> <h3 class="relative group"><a id="controllare-gli-iperparametri" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#controllare-gli-iperparametri"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Controllare gli iperparametri</span></h3> <p data-svelte-h="svelte-1itxdy1">Se si guarda al codice precedente, è possibile che non si riesca a vedere alcun iperparametro, a parte forse il <code>batch_size</code>, e questo non sembra un possibile problema. Non lasciarti ingannare, però: gli iperparametri ci sono sempre e se non li vedi significa che non conosci il valore a cui sono impostati. In particolare, ricorda una cosa fondamentale di Keras: se imposti una loss, un optimizer (<em>ottimizzatore</em>) o una funzione di attivazione con una stringa, <em>tutti i suoi argomenti saranno impostati ai loro valori predefiniti</em>. Ciò significa che, anche se usare le stringhe è molto comodo, bisogna fare molta attenzione, perché questa cosa potrebbe facilmente nascondere alcuni aspetti importanti. (Chiunque si cimenti nella sfida opzionale qui sopra dovrebbe prendere nota di questo fatto).</p> <p data-svelte-h="svelte-j35dye">In questo caso, dove abbiamo impostato un argomento con una stringa? Inizialmente settavamo la loss con una stringa, ma ora non lo facciamo più. Tuttavia, impostiamo l’optimizer usando una stringa. Potrebbe nasconderci qualcosa? Diamo un’occhiata ai <a href="https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adam" rel="nofollow">suoi argomenti</a>.</p> <p data-svelte-h="svelte-24qqev">C’è qualcosa che balza all’occhio? Esatto: il learning rate (<em>tasso di apprendimento</em>)! Quando usiamo semplicemente la stringa <code>'adam'</code>, otterremo il tasso di apprendimento predefinito, che è 0,001, o 1e-3. Questo è decisamente troppo alto per un modello Transformer! In generale, si consiglia di provare learning rate tra 1e-5 e 1e-4 per i modelli; si tratta di un valore tra 10 e 100 volte inferiore a quello che stiamo usando qui. Questo sembra essere un problema importante, quindi proviamo a ridurlo. Per farlo, dobbiamo importare l’oggetto <code>optimizer</code>. Già che ci siamo, reinizializziamo il modello dal checkpoint, nel caso in cui il training con un learning rate elevato abbia compromesso i suoi pesi:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> tensorflow.keras.optimizers <span class="hljs-keyword">import</span> Adam | |
| model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint) | |
| model.<span class="hljs-built_in">compile</span>(optimizer=Adam(<span class="hljs-number">5e-5</span>))<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1u65xwn">💡 È anche possibile importare la funzione <code>create_optimizer()</code> da 🤗 Transformers, che fornirà un optimizer AdamW con un corretto weight decay insieme a un learning rate warmup e decay. Questo ottimizzatore spesso produce risultati leggermente migliori di quelli ottenuti con l’ottimizzatore Adam predefinito.</p></div> <p data-svelte-h="svelte-nvrts">Adess, possiamo tentarde di fare training del modell con il nuovo learning rate migliorato:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model.fit(train_dataset)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">319</span>/<span class="hljs-number">24543</span> [..............................] - ETA: <span class="hljs-number">16</span>:07 - loss: <span class="hljs-number">0.9718</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-c2wwuy">Ora la nostra loss sta davvero andando da qualche parte! L’addestramento sembra finalmente funzionare. C’è una lezione da imparare: quando il modello funziona, ma la loss non diminuisce, e si è sicuri che i dati siano corretti, è una buona idea controllare gli iperparametri come il learning rate e il weight decay. Impostando uno di questi parametri troppo alto, è molto probabile che l’addestramento si “blocchi” a un valore di loss elevato.</p> <h2 class="relative group"><a id="altri-potenziali-problemi" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#altri-potenziali-problemi"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Altri potenziali problemi</span></h2> <p data-svelte-h="svelte-9qwt5n">Abbiamo trattato i problemi dello script di cui sopra, ma ci sono molti altri errori comuni che si possono incontrare. Vediamo un elenco (molto incompleto).</p> <h3 class="relative group"><a id="gestire-gli-errori-out-of-memory" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gestire-gli-errori-out-of-memory"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Gestire gli errori out-of-memory</span></h3> <p data-svelte-h="svelte-344wk2">Il segnale che indica che la memoria è esaurita è un errore del tipo “OOM when allocating tensor” (OOM è l’abbreviazione di “out of memory”). Si tratta di un rischio molto comune quando si ha a che fare con modelli linguistici di grandi dimensioni. In questo caso, una buona strategia è quella di dimezzare le dimensioni del batch e riprovare. Tenete presente, però, che alcuni modelli sono <em>molto</em> grandi. Ad esempio, il modello GPT-2 completo ha 1,5B parametri, il che significa che sono necessari 6 GB di memoria solo per memorizzare il modello e altri 6 GB per i suoi gradienti! L’addestramento del modello GPT-2 completo richiede di solito oltre 20 GB di VRAM, indipendentemente dalla dimensione del batch utilizzato, che solo poche GPU hanno. Modelli più leggeri come <code>distilbert-base-cased</code> sono molto più facili da eseguire e si addestrano molto più rapidamente.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-q2er6j">Nella prossima parte del corso, esamineremo tecniche più avanzate che possono aiutare a ridurre l’impatto sulla memoria e ad affinare i modelli più grandi.</p></div> <h3 class="relative group"><a id="tensorflow-è-molto-affamato-" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tensorflow-è-molto-affamato-"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>TensorFlow è molto affamato 🦛</span></h3> <p data-svelte-h="svelte-182ehil">Una particolarità di TensorFlow di cui bisogna essere consapevoli è che alloca <em>tutta</em> la memoria della GPU su se stesso non appena si carica un modello o si esegue un addestramento, e poi divide la memoria in base alle esigenze. Questo comportamento è diverso da quello di altri framework, come PyTorch, che allocano la memoria come richiesto con CUDA invece di farlo internamente. Un vantaggio dell’approccio di TensorFlow è che spesso può produrre errori utili quando esaurisci la memoria e può recuperare da questo stato senza mandare in crash l’intero kernel CUDA. Ma c’è anche un importante aspetto negativo: se si eseguono due processi TensorFlow contemporaneamente, allora <strong>sarà un bel guaio</strong>.</p> <p data-svelte-h="svelte-1076z6j">Se si esegue su Colab non ci si deve preoccupare di questo, ma se si lavora in locale è sicuramente qualcosa a cui si deve fare attenzione. In particolare, è bene ricordare che la chiusura della scheda di un notebook non comporta necessariamente la chiusura del notebook stesso! Potresti dover selezionare i notebook in esecuzione (quelli con l’icona verde) e chiuderli manualmente nell’elenco della directory. Qualsiasi notebook in esecuzione che utilizzava TensorFlow potrebbe ancora conservare una buona parte della memoria della GPU e ciò significa che qualsiasi nuovo notebook avviato potrebbe presentare problemi molto strani.</p> <p data-svelte-h="svelte-1n4y2a4">Se inizi a ricevere errori relativi a CUDA, BLAS o cuBLAS in un codice che prima funzionava, questa è molto spesso la ragione. Si può usare un comando come <code>nvidia-smi</code> per controllare: quando si spegne o si riavvia il notebook usato, la maggior parte della memoria è libera o è ancora in uso? Se è ancora in uso, c’è qualcos’altro che la sta occupando!</p> <h3 class="relative group"><a id="check-your-data-again" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#check-your-data-again"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Check your data (again!)</span></h3> <p data-svelte-h="svelte-wx6flv">Il tuo modello imparerà qualcosa solo se è effettivamente possibile imparare qualcosa dai tuoi dati. Se c’è un bug che corrompe i dati o le label sono assegnate in modo casuale, è molto probabile che non si riesca ad addestrare il modello sul dataset. In questo caso uno strumento utile è <code>tokenizer.decode()</code>. Questo trasformerà gli <code>input_ids</code> in stringhe, in modo da poter visualizzare i dati e vedere se i dati di training stanno addestrando ciò che si vuole. Per esempio, dopo aver ottenuto un <code>batch</code> dal proprio <code>tf.data.Dataset</code> come abbiamo fatto sopra, si può decodificare il primo elemento in questo modo:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->input_ids = batch[<span class="hljs-string">"input_ids"</span>].numpy() | |
| tokenizer.decode(input_ids[<span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17cqayv">Poi si può confrontare con la prima label, in questo modo:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->labels = batch[<span class="hljs-string">"labels"</span>].numpy() | |
| label = labels[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-74l1nl">Una volta visualizzati i dati in questo modo, puoi porti le seguenti domande:</p> <ul data-svelte-h="svelte-1ygrnog"><li>I dati decodificati sono comprensibili?</li> <li>Sei d’accordo con le label?</li> <li>C’è una label più comune delle altre?</li> <li>Quale dovrebbe essere la funzione di perdita/metrica se il modello predicesse una risposta a caso/sempre la stessa risposta?</li></ul> <p data-svelte-h="svelte-1s0ff9r">Dopo aver osservato i dati, esamina alcune previsioni del modello: se il modello produce dei token, prova a decodificare anche quelli! Se il modello prevede sempre la stessa cosa, potrebbe essere perché il tuo set di dati è influenzato verso una categoria (per i problemi di classificazione); tecniche come fare oversampling (<em>sovra-campionamento</em>) delle classi rare potrebbero aiutare. In alternativa, ciò può essere causato da problemi di addestramento, come ad esempio una scorretta impostazione degli iperparametri.</p> <p data-svelte-h="svelte-1dmtka">Se la funzione di perdita/metrica ottenuta con il tuo modello iniziale è molto diversa da quella che ci si aspetterebbe per le previsioni casuali, ricontrolla il modo in cui viene calcolata la funzione o la metrica, perché probabilmente c’è un bug. Se si utilizzano diverse funzioni che aggiungi alla fine, assicurati che siano della stessa grandezza.</p> <p data-svelte-h="svelte-wddm9m">Quando sei sicuro/a che i dati sono perfetti, puoi verificare se il modello è in grado di addestrarsi su di essi con un semplice test.</p> <h3 class="relative group"><a id="fare-overfitting-del-modello-su-un-batch" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fare-overfitting-del-modello-su-un-batch"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Fare overfitting del modello su un batch</span></h3> <p data-svelte-h="svelte-1e3gyzf">L’overfitting è di solito qualcosa che cerchiamo di evitare durante l’addestramento, poiché significa che il modello non sta imparando a riconoscere le proprietà generali che vogliamo, ma sta invece memorizzando i campioni di addestramento. Tuttavia, provare ad addestrare il modello su un batch più e più volte è un buon test per verificare se il problema così come è stato inquadrato può essere risolto dal modello che si sta cercando di addestrare. Inoltre, ti aiuterà a capire se il learning rate iniziale è troppo alta.</p> <p data-svelte-h="svelte-rej8cx">Una volta definito il <code>Trainer</code>, è molto semplice: basta prendere un batch dal training set, ed eseguire un piccolo ciclo di addestramento manuale utilizzando solo quel <code>batch</code> per qualcosa come 20 step:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> batch <span class="hljs-keyword">in</span> train_dataset: | |
| <span class="hljs-keyword">break</span> | |
| <span class="hljs-comment"># Make sure you have run model.compile() and set your optimizer,</span> | |
| <span class="hljs-comment"># and your loss/metrics if you're using them</span> | |
| model.fit(batch, epochs=<span class="hljs-number">20</span>)<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1ueptnc">💡 Se i dati di addestramento sono sbilanciati, assicurati di creare un batch di dati di addestramento contenente tutte le label.</p></div> <p data-svelte-h="svelte-4jju9q">Il modello risultante dovrebbe avere risultati quasi perfetti sul <code>batch</code>, con una loss che diminuisce rapidamente verso lo 0 (o il valore minimo per la loss che si sta utilizzando).</p> <p data-svelte-h="svelte-1luvbcs">Se non si riesci a far sì che il modello ottenga risultati perfetti come questo, significa che c’è qualcosa di sbagliato nel modo in cui si è impostato il problema o con i dati, e quindi dovresti risolvere questa cosa. Solo quando riesci a superare il test di overfitting puoi essere sicuro/a che il tuo modello possa effettivamente imparare qualcosa.</p> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-3a8vl2">⚠️ Sarà necessario ricreare il modello e ricompilarlo dopo questo test, poiché il modello ottenuto probabilmente non sarà in grado di recuperare e imparare qualcosa di utile sul set di dati completo.</p></div> <h3 class="relative group"><a id="non-calibrare-niente-prima-di-avere-una-prima-baseline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#non-calibrare-niente-prima-di-avere-una-prima-baseline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Non calibrare niente prima di avere una prima baseline</span></h3> <p data-svelte-h="svelte-1ssho8y">Hyperparameter tuning (<em>calibrazione degli iperparametri</em>) è sempre considerato come la parte più difficile di machine learning, ma è solo l’ultimo passo per aiutarti a migliorare un po’ la metrica. Valori <em>molto</em> sbagliati di iperparametri, come l’uso del learning rate predefinito di Adam di 1e-3 con un modello Transformer, faranno sì che l’apprendimento proceda molto lentamente o si blocchi completamente, naturalmente, ma la maggior parte delle volte iperparametri “ragionevoli”, come un learning rate da 1e-5 a 5e-5, funzioneranno bene per darti buoni risultati. Quindi, non ci si deve lanciare in una ricerca di iperparametri dispendiosa in termini di tempo e di costi, finché non si è ottenuto qualcosa che batta la baseline (<em>base di partenza</em>) che si ha sul dataset.</p> <p data-svelte-h="svelte-1ytlvoo">Una volta ottenuto un modello sufficientemente buono, si può iniziare a modificarlo un po’. Non provare a eseguire l’addestramento un migliaio di volte con iperparametri diversi, ma confronta un paio di esecuzioni che hanno valori diversi per un iperparametro così da avere un’idea di quale abbia il maggiore impatto.</p> <p data-svelte-h="svelte-1q3oxbc">Se stai modificando il modello stesso, mantieni le cose semplici e non provare nulla che non possa essere ragionevolmente giustificato. Assicurati sempre di rifare il test di overfitting per verificare che la modifica non abbia avuto conseguenze indesiderate.</p> <h3 class="relative group"><a id="chiedere-aiuto" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#chiedere-aiuto"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Chiedere aiuto</span></h3> <p data-svelte-h="svelte-1824t5s">Speriamo che in questa sezione tu abbia trovato qualche consiglio utile a risolvere il tuo problema, ma se così non fosse, ricordati che puoi sempre chiedere aiuto alla community nei <a href="https://discuss.huggingface.co/" rel="nofollow">forum</a>.</p> <p data-svelte-h="svelte-1guizfr">Qui di seguito sono riportate alcune risorse aggiuntive che potrebbero rivelarsi utili:</p> <ul data-svelte-h="svelte-rloazi"><li><a href="https://docs.google.com/presentation/d/1yHLPvPhUs2KGI5ZWo0sU-PKU3GimAk3iTsI38Z-B5Gw/edit#slide=id.p" rel="nofollow">“Reproducibility as a vehicle for engineering best practices”</a> di Joel Grus</li> <li><a href="https://towardsdatascience.com/checklist-for-debugging-neural-networks-d8b2a9434f21" rel="nofollow">“Checklist for debugging neural networks”</a> di Cecelia Shao</li> <li><a href="https://medium.com/@keeper6928/how-to-unit-test-machine-learning-code-57cf6fd81765" rel="nofollow">“How to unit test machine learning code”</a> di Chase Roberts</li> <li><a href="http://karpathy.github.io/2019/04/25/recipe/" rel="nofollow">“A Recipe for Training Neural Networks”</a> di Andrej Karpathy</li></ul> <p data-svelte-h="svelte-14oh3jw">Naturalmente, non tutti i problemi che incontrerai durante l’addestramento delle reti neurali sono colpa tua! Se si incontra qualcosa nella libreria 🤗 Transformers o 🤗 Datasets che non sembra corretto, è possibile che si sia trovato un bug. Dovresti assolutamente segnalarcelo e nella prossima sezione ti spiegheremo esattamente come fare.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/it/chapter8/4_tf.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_u8ez91 = { | |
| assets: "/docs/course/pr_1069/it", | |
| base: "/docs/course/pr_1069/it", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/course/pr_1069/it/_app/immutable/entry/start.693d748d.js"), | |
| import("/docs/course/pr_1069/it/_app/immutable/entry/app.e9cfd099.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 45], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 128 kB
- Xet hash:
- 5cd25e1092cdad730ef5637738b99f591dcb99012b7295b9749f8adbaecba9bb
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.