Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / course /pr_1069 /en /chapter3 /5.html

rtrm

27 days ago

download

raw

64.9 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Understanding Learning Curves","local":"understanding-learning-curves","sections":[{"title":"What are Learning Curves?","local":"what-are-learning-curves","sections":[{"title":"Loss Curves","local":"loss-curves","sections":[],"depth":3},{"title":"Accuracy Curves","local":"accuracy-curves","sections":[],"depth":3},{"title":"Convergence","local":"convergence","sections":[],"depth":3}],"depth":2},{"title":"Interpreting Learning Curve Patterns","local":"interpreting-learning-curve-patterns","sections":[{"title":"Healthy Learning Curves","local":"healthy-learning-curves","sections":[],"depth":3},{"title":"Practical Examples","local":"practical-examples","sections":[{"title":"During Training","local":"during-training","sections":[],"depth":4},{"title":"After Training","local":"after-training","sections":[],"depth":4},{"title":"Overfitting","local":"overfitting","sections":[],"depth":4},{"title":"2. Underfitting","local":"underfitting","sections":[],"depth":4},{"title":"3. Erratic Learning Curves","local":"erratic-learning-curves","sections":[],"depth":4}],"depth":3}],"depth":2},{"title":"Key Takeaways","local":"key-takeaways","sections":[],"depth":2},{"title":"Section Quiz","local":"section-quiz","sections":[{"title":"1. What does it typically mean when training loss decreases but validation loss starts increasing?","local":"1-what-does-it-typically-mean-when-training-loss-decreases-but-validation-loss-starts-increasing","sections":[],"depth":3},{"title":"2. Why do accuracy curves often show a “steppy” or plateau-like pattern rather than smooth increases?","local":"2-why-do-accuracy-curves-often-show-a-steppy-or-plateau-like-pattern-rather-than-smooth-increases","sections":[],"depth":3},{"title":"3. What is the best approach when you observe erratic, highly fluctuating learning curves?","local":"3-what-is-the-best-approach-when-you-observe-erratic-highly-fluctuating-learning-curves","sections":[],"depth":3},{"title":"4. When should you consider using early stopping?","local":"4-when-should-you-consider-using-early-stopping","sections":[],"depth":3},{"title":"5. What indicates that your model might be underfitting?","local":"5-what-indicates-that-your-model-might-be-underfitting","sections":[],"depth":3}],"depth":2}],"depth":1}">
	<link href="/docs/course/pr_1069/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/entry/start.c5306bb2.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/scheduler.37c15a92.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/singletons.bc78d867.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/index.18351ede.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/paths.76894643.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/entry/app.4264f5f8.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/index.7cb9c9b8.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/nodes/0.f5347c47.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/nodes/49.f55befd0.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/Tip.d10b3fc9.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/CodeBlock.abae2786.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/CourseFloatingBanner.df82c153.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/Question.7e41e492.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/stores.cb4752a8.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/getInferenceSnippets.f9350a3f.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Understanding Learning Curves","local":"understanding-learning-curves","sections":[{"title":"What are Learning Curves?","local":"what-are-learning-curves","sections":[{"title":"Loss Curves","local":"loss-curves","sections":[],"depth":3},{"title":"Accuracy Curves","local":"accuracy-curves","sections":[],"depth":3},{"title":"Convergence","local":"convergence","sections":[],"depth":3}],"depth":2},{"title":"Interpreting Learning Curve Patterns","local":"interpreting-learning-curve-patterns","sections":[{"title":"Healthy Learning Curves","local":"healthy-learning-curves","sections":[],"depth":3},{"title":"Practical Examples","local":"practical-examples","sections":[{"title":"During Training","local":"during-training","sections":[],"depth":4},{"title":"After Training","local":"after-training","sections":[],"depth":4},{"title":"Overfitting","local":"overfitting","sections":[],"depth":4},{"title":"2. Underfitting","local":"underfitting","sections":[],"depth":4},{"title":"3. Erratic Learning Curves","local":"erratic-learning-curves","sections":[],"depth":4}],"depth":3}],"depth":2},{"title":"Key Takeaways","local":"key-takeaways","sections":[],"depth":2},{"title":"Section Quiz","local":"section-quiz","sections":[{"title":"1. What does it typically mean when training loss decreases but validation loss starts increasing?","local":"1-what-does-it-typically-mean-when-training-loss-decreases-but-validation-loss-starts-increasing","sections":[],"depth":3},{"title":"2. Why do accuracy curves often show a “steppy” or plateau-like pattern rather than smooth increases?","local":"2-why-do-accuracy-curves-often-show-a-steppy-or-plateau-like-pattern-rather-than-smooth-increases","sections":[],"depth":3},{"title":"3. What is the best approach when you observe erratic, highly fluctuating learning curves?","local":"3-what-is-the-best-approach-when-you-observe-erratic-highly-fluctuating-learning-curves","sections":[],"depth":3},{"title":"4. When should you consider using early stopping?","local":"4-when-should-you-consider-using-early-stopping","sections":[],"depth":3},{"title":"5. What indicates that your model might be underfitting?","local":"5-what-indicates-that-your-model-might-be-underfitting","sections":[],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="understanding-learning-curves" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#understanding-learning-curves"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Understanding Learning Curves</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-3-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter3/section7.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter3/section7.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-1ws90op">Now that you’ve learned how to implement fine-tuning using both the <code>Trainer</code> API and custom training loops, it’s crucial to understand how to interpret the results. Learning curves are invaluable tools that help you evaluate your model’s performance during training and identify potential issues before they reduce performance.</p> <p data-svelte-h="svelte-eliep2">In this section, we’ll explore how to read and interpret accuracy and loss curves, understand what different curve shapes tell us about our model’s behavior, and learn how to address common training issues.</p> <h2 class="relative group"><a id="what-are-learning-curves" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-are-learning-curves"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What are Learning Curves?</span></h2> <p data-svelte-h="svelte-za3ea7">Learning curves are visual representations of your model’s performance metrics over time during training. The two most important curves to monitor are:</p> <ul data-svelte-h="svelte-1t4wunj"><li><strong>Loss curves</strong>: Show how the model’s error (loss) changes over training steps or epochs</li> <li><strong>Accuracy curves</strong>: Show the percentage of correct predictions over training steps or epochs</li></ul> <p data-svelte-h="svelte-19anp1w">These curves help us understand whether our model is learning effectively and can guide us in making adjustments to improve performance. In Transformers, these metrics are individually computed for each batch and then logged to the disk. We can then use libraries like <a href="https://wandb.ai/" rel="nofollow">Weights & Biases</a> to visualize these curves and track our model’s performance over time.</p> <h3 class="relative group"><a id="loss-curves" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#loss-curves"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Loss Curves</span></h3> <p data-svelte-h="svelte-g4kfd9">The loss curve shows how the model’s error decreases over time. In a typical successful training run, you’ll see a curve similar to the one below:</p> <p data-svelte-h="svelte-pvwhaj"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter3/1.png" alt="Loss Curve"></p> <ul data-svelte-h="svelte-11aptsa"><li><strong>High initial loss</strong>: The model starts without optimization, so predictions are initially poor</li> <li><strong>Decreasing loss</strong>: As training progresses, the loss should generally decrease</li> <li><strong>Convergence</strong>: Eventually, the loss stabilizes at a low value, indicating that the model has learned the patterns in the data</li></ul> <p data-svelte-h="svelte-1pd4vow">As in previous chapters, we can use the <code>Trainer</code> API to track these metrics and visualize them in a dashboard. Below is an example of how to do this with Weights & Biases.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Example of tracking loss during training with the Trainer</span>
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> Trainer, TrainingArguments
	<span class="hljs-keyword">import</span> wandb

	<span class="hljs-comment"># Initialize Weights & Biases for experiment tracking</span>
	wandb.init(project=<span class="hljs-string">"transformer-fine-tuning"</span>, name=<span class="hljs-string">"bert-mrpc-analysis"</span>)

	training_args = TrainingArguments(
	output_dir=<span class="hljs-string">"./results"</span>,
	eval_strategy=<span class="hljs-string">"steps"</span>,
	eval_steps=<span class="hljs-number">50</span>,
	save_steps=<span class="hljs-number">100</span>,
	logging_steps=<span class="hljs-number">10</span>, <span class="hljs-comment"># Log metrics every 10 steps</span>
	num_train_epochs=<span class="hljs-number">3</span>,
	per_device_train_batch_size=<span class="hljs-number">16</span>,
	per_device_eval_batch_size=<span class="hljs-number">16</span>,
	report_to=<span class="hljs-string">"wandb"</span>, <span class="hljs-comment"># Send logs to Weights & Biases</span>
	)

	trainer = Trainer(
	model=model,
	args=training_args,
	train_dataset=tokenized_datasets[<span class="hljs-string">"train"</span>],
	eval_dataset=tokenized_datasets[<span class="hljs-string">"validation"</span>],
	data_collator=data_collator,
	processing_class=tokenizer,
	compute_metrics=compute_metrics,
	)

	<span class="hljs-comment"># Train and automatically log metrics</span>
	trainer.train()<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="accuracy-curves" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#accuracy-curves"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Accuracy Curves</span></h3> <p data-svelte-h="svelte-1ot2vxf">The accuracy curve shows the percentage of correct predictions over time. Unlike loss curves, accuracy curves should generally increase as the model learns and can typically include more steps than the loss curve.</p> <p data-svelte-h="svelte-1jzjr0y"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter3/2.png" alt="Accuracy Curve"></p> <ul data-svelte-h="svelte-ownnm7"><li><strong>Start low</strong>: Initial accuracy should be low, as the model has not yet learned the patterns in the data</li> <li><strong>Increase with training</strong>: Accuracy should generally improve as the model learns if it is able to learn the patterns in the data</li> <li><strong>May show plateaus</strong>: Accuracy often increases in discrete jumps rather than smoothly, as the model makes predictions that are close to the true labels</li></ul> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-18uqxu2">💡 <strong>Why Accuracy Curves Are “Steppy”</strong>: Unlike loss, which is continuous, accuracy is calculated by comparing discrete predictions to true labels. Small improvements in model confidence might not change the final prediction, causing accuracy to remain flat until a threshold is crossed.</p></div> <h3 class="relative group"><a id="convergence" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#convergence"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Convergence</span></h3> <p data-svelte-h="svelte-qbg8tf">Convergence occurs when the model’s performance stabilizes and the loss and accuracy curves level off. This is a sign that the model has learned the patterns in the data and is ready to be used. In simple terms, we are aiming for the model to converge to a stable performance every time we train it.</p> <p data-svelte-h="svelte-lmle25"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter3/4.png" alt="Convergence"></p> <p data-svelte-h="svelte-g0y3o4">Once models have converged, we can use them to make predictions on new data and refer to evaluation metrics to understand how well the model is performing.</p> <h2 class="relative group"><a id="interpreting-learning-curve-patterns" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#interpreting-learning-curve-patterns"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Interpreting Learning Curve Patterns</span></h2> <p data-svelte-h="svelte-9px25k">Different curve shapes reveal different aspects of your model’s training. Let’s examine the most common patterns and what they mean.</p> <h3 class="relative group"><a id="healthy-learning-curves" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#healthy-learning-curves"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Healthy Learning Curves</span></h3> <p data-svelte-h="svelte-l3som6">A well-behaved training run typically shows curve shapes similar to the one below:</p> <p data-svelte-h="svelte-lw8se0"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter3/5.png" alt="Healthy Loss Curve"></p> <p data-svelte-h="svelte-52p4pq">Let’s look at the illustration above. It displays both the loss curve (on the left) and the corresponding accuracy curve (on the right). These curves have distinct characteristics.</p> <p data-svelte-h="svelte-gklle5">The loss curve shows the value of the model’s loss over time. Initially, the loss is high and then it gradually decreases, indicating that the model is improving. A decrease in the loss value suggests that the model is making better predictions, as the loss represents the error between the predicted output and the true output.</p> <p data-svelte-h="svelte-3kw9rm">Now let’s shift our focus to the accuracy curve. It represents the model’s accuracy over time. The accuracy curve begins at a low value and increases as training progresses. Accuracy measures the proportion of correctly classified instances. So, as the accuracy curve rises, it signifies that the model is making more correct predictions.</p> <p data-svelte-h="svelte-5msz0y">One notable difference between the curves is the smoothness and the presence of “plateaus” on the accuracy curve. While the loss decreases smoothly, the plateaus on the accuracy curve indicate discrete jumps in accuracy instead of a continuous increase. This behavior is attributed to how accuracy is measured. The loss can improve if the model’s output gets closer to the target, even if the final prediction is still incorrect. Accuracy, however, only improves when the prediction crosses the threshold to be correct.</p> <p data-svelte-h="svelte-1v47fvi">For example, in a binary classifier distinguishing cats (0) from dogs (1), if the model predicts 0.3 for an image of a dog (true value 1), this is rounded to 0 and is an incorrect classification. If in the next step it predicts 0.4, it’s still incorrect. The loss will have decreased because 0.4 is closer to 1 than 0.3, but the accuracy remains unchanged, creating a plateau. The accuracy will only jump up when the model predicts a value greater than 0.5 that gets rounded to 1.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-q6j789"><strong>Characteristics of healthy curves:</strong></p> <ul data-svelte-h="svelte-vcjo68"><li><strong>Smooth decline in loss</strong>: Both training and validation loss decrease steadily</li> <li><strong>Close training/validation performance</strong>: Small gap between training and validation metrics</li> <li><strong>Convergence</strong>: Curves level off, indicating the model has learned the patterns</li></ul></div> <h3 class="relative group"><a id="practical-examples" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#practical-examples"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Practical Examples</span></h3> <p data-svelte-h="svelte-q6sc50">Let’s work through some practical examples of learning curves. First, we will highlight some approaches to monitor the learning curves during training. Below, we will break down the different patterns that can be observed in the learning curves.</p> <h4 class="relative group"><a id="during-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#during-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>During Training</span></h4> <p data-svelte-h="svelte-1tpl7gk">During the training process (after you’ve hit <code>trainer.train()</code>), you can monitor these key indicators:</p> <ol data-svelte-h="svelte-wb01ww"><li><strong>Loss convergence</strong>: Is the loss still decreasing or has it plateaued?</li> <li><strong>Overfitting signs</strong>: Is validation loss starting to increase while training loss decreases?</li> <li><strong>Learning rate</strong>: Are the curves too erratic (LR too high) or too flat (LR too low)?</li> <li><strong>Stability</strong>: Are there sudden spikes or drops that indicate problems?</li></ol> <h4 class="relative group"><a id="after-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#after-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>After Training</span></h4> <p data-svelte-h="svelte-hadbhx">After the training process is complete, you can analyze the complete curves to understand the model’s performance.</p> <ol data-svelte-h="svelte-1oj7bbp"><li><strong>Final performance</strong>: Did the model reach acceptable performance levels?</li> <li><strong>Efficiency</strong>: Could the same performance be achieved with fewer epochs?</li> <li><strong>Generalization</strong>: How close are training and validation performance?</li> <li><strong>Trends</strong>: Would additional training likely improve performance?</li></ol> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-sx55dj">🔍 <strong>W&B Dashboard Features</strong>: Weights & Biases automatically creates beautiful, interactive plots of your learning curves. You can:</p> <ul data-svelte-h="svelte-1k0ojjk"><li>Compare multiple runs side by side</li> <li>Add custom metrics and visualizations</li> <li>Set up alerts for anomalous behavior</li> <li>Share results with your team</li></ul> <p data-svelte-h="svelte-1cr9t1l">Learn more in the <a href="https://docs.wandb.ai/" rel="nofollow">Weights & Biases documentation</a>.</p></div> <h4 class="relative group"><a id="overfitting" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#overfitting"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Overfitting</span></h4> <p data-svelte-h="svelte-wgqvrb">Overfitting occurs when the model learns too much from the training data and is unable to generalize to different data (represented by the validation set).</p> <p data-svelte-h="svelte-281ynk"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter3/10.png" alt="Overfitting"></p> <p data-svelte-h="svelte-1m2kcc3"><strong>Symptoms:</strong></p> <ul data-svelte-h="svelte-zqaa0r"><li>Training loss continues to decrease while validation loss increases or plateaus</li> <li>Large gap between training and validation accuracy</li> <li>Training accuracy much higher than validation accuracy</li></ul> <p data-svelte-h="svelte-lmb51t"><strong>Solutions for overfitting:</strong></p> <ul data-svelte-h="svelte-1pxlcl3"><li><strong>Regularization</strong>: Add dropout, weight decay, or other regularization techniques</li> <li><strong>Early stopping</strong>: Stop training when validation performance stops improving</li> <li><strong>Data augmentation</strong>: Increase training data diversity</li> <li><strong>Reduce model complexity</strong>: Use a smaller model or fewer parameters</li></ul> <p data-svelte-h="svelte-8x6dgg">In the sample below, we use early stopping to prevent overfitting. We set the <code>early_stopping_patience</code> to 3, which means that if the validation loss does not improve for 3 consecutive epochs, the training will be stopped.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Example of detecting overfitting with early stopping</span>
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> EarlyStoppingCallback

	training_args = TrainingArguments(
	output_dir=<span class="hljs-string">"./results"</span>,
	eval_strategy=<span class="hljs-string">"steps"</span>,
	eval_steps=<span class="hljs-number">100</span>,
	save_strategy=<span class="hljs-string">"steps"</span>,
	save_steps=<span class="hljs-number">100</span>,
	load_best_model_at_end=<span class="hljs-literal">True</span>,
	metric_for_best_model=<span class="hljs-string">"eval_loss"</span>,
	greater_is_better=<span class="hljs-literal">False</span>,
	num_train_epochs=<span class="hljs-number">10</span>, <span class="hljs-comment"># Set high, but we'll stop early</span>
	)

	<span class="hljs-comment"># Add early stopping to prevent overfitting</span>
	trainer = Trainer(
	model=model,
	args=training_args,
	train_dataset=tokenized_datasets[<span class="hljs-string">"train"</span>],
	eval_dataset=tokenized_datasets[<span class="hljs-string">"validation"</span>],
	data_collator=data_collator,
	processing_class=tokenizer,
	compute_metrics=compute_metrics,
	callbacks=[EarlyStoppingCallback(early_stopping_patience=<span class="hljs-number">3</span>)],
	)<!-- HTML_TAG_END --></pre></div> <h4 class="relative group"><a id="underfitting" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#underfitting"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. Underfitting</span></h4> <p data-svelte-h="svelte-qnmlsu">Underfitting occurs when the model is too simple to capture the underlying patterns in the data. This can happen for several reasons:</p> <ul data-svelte-h="svelte-1ag1u4d"><li>The model is too small or lacks capacity to learn the patterns</li> <li>The learning rate is too low, causing slow learning</li> <li>The dataset is too small or not representative of the problem</li> <li>The model is not properly regularized</li></ul> <p data-svelte-h="svelte-clpn9k"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter3/7.png" alt="Underfitting"></p> <p data-svelte-h="svelte-1m2kcc3"><strong>Symptoms:</strong></p> <ul data-svelte-h="svelte-1sedo32"><li>Both training and validation loss remain high</li> <li>Model performance plateaus early in training</li> <li>Training accuracy is lower than expected</li></ul> <p data-svelte-h="svelte-16mw4zr"><strong>Solutions for underfitting:</strong></p> <ul data-svelte-h="svelte-4laiep"><li><strong>Increase model capacity</strong>: Use a larger model or more parameters</li> <li><strong>Train longer</strong>: Increase the number of epochs</li> <li><strong>Adjust learning rate</strong>: Try different learning rates</li> <li><strong>Check data quality</strong>: Ensure your data is properly preprocessed</li></ul> <p data-svelte-h="svelte-1d3m5mc">In the sample below, we train for more epochs to see if the model can learn the patterns in the data.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> TrainingArguments

	training_args = TrainingArguments(
	output_dir=<span class="hljs-string">"./results"</span>,
	-num_train_epochs=<span class="hljs-number">5</span>,
	+num_train_epochs=<span class="hljs-number">10</span>,
	)<!-- HTML_TAG_END --></pre></div> <h4 class="relative group"><a id="erratic-learning-curves" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#erratic-learning-curves"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. Erratic Learning Curves</span></h4> <p data-svelte-h="svelte-171x3d7">Erratic learning curves occur when the model is not learning effectively. This can happen for several reasons:</p> <ul data-svelte-h="svelte-ozmyg2"><li>The learning rate is too high, causing the model to overshoot the optimal parameters</li> <li>The batch size is too small, causing the model to learn slowly</li> <li>The model is not properly regularized, causing it to overfit to the training data</li> <li>The dataset is not properly preprocessed, causing the model to learn from noise</li></ul> <p data-svelte-h="svelte-1nvkivp"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter3/3.png" alt="Erratic Learning Curves"></p> <p data-svelte-h="svelte-1m2kcc3"><strong>Symptoms:</strong></p> <ul data-svelte-h="svelte-3z0f8a"><li>Frequent fluctuations in loss or accuracy</li> <li>Curves show high variance or instability</li> <li>Performance oscillates without clear trend</li></ul> <p data-svelte-h="svelte-1y0awim">Both training and validation curves show erratic behavior.</p> <p data-svelte-h="svelte-19tpia7"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter3/9.png" alt="Erratic Learning Curves"></p> <p data-svelte-h="svelte-huyrsu"><strong>Solutions for erratic curves:</strong></p> <ul data-svelte-h="svelte-1oyey8p"><li><strong>Lower learning rate</strong>: Reduce step size for more stable training</li> <li><strong>Increase batch size</strong>: Larger batches provide more stable gradients</li> <li><strong>Gradient clipping</strong>: Prevent exploding gradients</li> <li><strong>Better data preprocessing</strong>: Ensure consistent data quality</li></ul> <p data-svelte-h="svelte-ykj10u">In the sample below, we lower the learning rate and increase the batch size.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> TrainingArguments

	training_args = TrainingArguments(
	output_dir=<span class="hljs-string">"./results"</span>,
	-learning_rate=<span class="hljs-number">1e-5</span>,
	+learning_rate=<span class="hljs-number">1e-4</span>,
	-per_device_train_batch_size=<span class="hljs-number">16</span>,
	+per_device_train_batch_size=<span class="hljs-number">32</span>,
	)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="key-takeaways" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#key-takeaways"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Key Takeaways</span></h2> <p data-svelte-h="svelte-1cusb5e">Understanding learning curves is crucial for becoming an effective machine learning practitioner. These visual tools provide immediate feedback about your model’s training progress and help you make informed decisions about when to stop training, adjust hyperparameters, or try different approaches. With practice, you’ll develop an intuitive understanding of what healthy learning curves look like and how to address issues when they arise.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-l314m0">💡 <strong>Key Takeaways:</strong></p> <ul data-svelte-h="svelte-yy8x0e"><li>Learning curves are essential tools for understanding model training progress</li> <li>Monitor both loss and accuracy curves, but remember they have different characteristics</li> <li>Overfitting shows as diverging training/validation performance</li> <li>Underfitting shows as poor performance on both training and validation data</li> <li>Tools like Weights & Biases make it easy to track and analyze learning curves</li> <li>Early stopping and proper regularization can address most common training issues</li></ul> <p data-svelte-h="svelte-1qylnyf">🔬 <strong>Next Steps</strong>: Practice analyzing learning curves on your own fine-tuning experiments. Try different hyperparameters and observe how they affect the curve shapes. This hands-on experience is the best way to develop intuition for reading training progress.</p></div> <h2 class="relative group"><a id="section-quiz" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#section-quiz"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Section Quiz</span></h2> <p data-svelte-h="svelte-1yn5q85">Test your understanding of learning curves and training analysis:</p> <h3 class="relative group"><a id="1-what-does-it-typically-mean-when-training-loss-decreases-but-validation-loss-starts-increasing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-what-does-it-typically-mean-when-training-loss-decreases-but-validation-loss-starts-increasing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. What does it typically mean when training loss decreases but validation loss starts increasing?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->The model is learning successfully and will continue to improve.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->The model is overfitting to the training data.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->The learning rate is too low.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->The dataset is too small.<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="2-why-do-accuracy-curves-often-show-a-steppy-or-plateau-like-pattern-rather-than-smooth-increases" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-why-do-accuracy-curves-often-show-a-steppy-or-plateau-like-pattern-rather-than-smooth-increases"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. Why do accuracy curves often show a “steppy” or plateau-like pattern rather than smooth increases?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->There's an error in the accuracy calculation.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->Accuracy is a discrete metric that only changes when predictions cross decision boundaries.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->The model is not learning effectively.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->The batch size is too small.<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="3-what-is-the-best-approach-when-you-observe-erratic-highly-fluctuating-learning-curves" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-what-is-the-best-approach-when-you-observe-erratic-highly-fluctuating-learning-curves"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. What is the best approach when you observe erratic, highly fluctuating learning curves?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->Increase the learning rate to speed up convergence.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->Reduce the learning rate and possibly increase the batch size.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->Stop training immediately as the model won't improve.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->Switch to a completely different model architecture.<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="4-when-should-you-consider-using-early-stopping" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#4-when-should-you-consider-using-early-stopping"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>4. When should you consider using early stopping?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->Always, as it prevents any form of overfitting.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->When validation performance stops improving or starts degrading.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->Only when training loss is still decreasing rapidly.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->Never, as it prevents the model from reaching its full potential.<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="5-what-indicates-that-your-model-might-be-underfitting" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#5-what-indicates-that-your-model-might-be-underfitting"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>5. What indicates that your model might be underfitting?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->Training accuracy is much higher than validation accuracy.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->Both training and validation performance are poor and plateau early.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->The learning curves are very smooth with no fluctuations.<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->Validation loss is decreasing faster than training loss.<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/en/chapter3/5.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	__sveltekit_1y0degu = {
	assets: "/docs/course/pr_1069/en",
	base: "/docs/course/pr_1069/en",
	env: {}
	};

	const element = document.currentScript.parentElement;

	const data = [null,null];

	Promise.all([
	import("/docs/course/pr_1069/en/_app/immutable/entry/start.c5306bb2.js"),
	import("/docs/course/pr_1069/en/_app/immutable/entry/app.4264f5f8.js")
	]).then(([kit, app]) => {
	kit.start(app, element, {
	node_ids: [0, 49],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 64.9 kB
Xet hash:: 439911c39870e091426e76f1f79483b547aafc40716853bfcef03d2dbf1a4aec

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.