Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Supervised Fine-Tuning","local":"supervised-fine-tuning","sections":[{"title":"When to Use SFT","local":"when-to-use-sft","sections":[{"title":"Template Control","local":"template-control","sections":[],"depth":3},{"title":"Domain Adaptation","local":"domain-adaptation","sections":[],"depth":3}],"depth":2},{"title":"Dataset Preparation","local":"dataset-preparation","sections":[],"depth":2},{"title":"Training Configuration","local":"training-configuration","sections":[],"depth":2},{"title":"Implementation with TRL","local":"implementation-with-trl","sections":[],"depth":2},{"title":"Packing the Dataset","local":"packing-the-dataset","sections":[],"depth":2},{"title":"Monitoring Training Progress","local":"monitoring-training-progress","sections":[{"title":"Understanding Loss Patterns","local":"understanding-loss-patterns","sections":[],"depth":3},{"title":"Metrics to Monitor","local":"metrics-to-monitor","sections":[],"depth":3},{"title":"The Path to Convergence","local":"the-path-to-convergence","sections":[],"depth":3},{"title":"Monitoring Training Progress","local":"monitoring-training-progress","sections":[],"depth":3},{"title":"Warning Signs to Watch For","local":"warning-signs-to-watch-for","sections":[],"depth":3}],"depth":2},{"title":"Evaluation after SFT","local":"evaluation-after-sft","sections":[],"depth":2},{"title":"Quiz","local":"quiz","sections":[{"title":"1. What parameters control the training duration in SFT?","local":"1-what-parameters-control-the-training-duration-in-sft","sections":[],"depth":3},{"title":"2. Which pattern in the loss curves indicates potential overfitting?","local":"2-which-pattern-in-the-loss-curves-indicates-potential-overfitting","sections":[],"depth":3},{"title":"3. What is gradient_accumulation_steps used for?","local":"3-what-is-gradientaccumulationsteps-used-for","sections":[],"depth":3},{"title":"4. 
What should you monitor during SFT training?","local":"4-what-should-you-monitor-during-sft-training","sections":[],"depth":3},{"title":"5. What indicates healthy convergence during training?","local":"5-what-indicates-healthy-convergence-during-training","sections":[],"depth":3}],"depth":2},{"title":"💐 Nice work!","local":"-nice-work","sections":[],"depth":2},{"title":"Additional Resources","local":"additional-resources","sections":[],"depth":2}],"depth":1}"> | |
| <link href="/docs/course/pr_1069/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/entry/start.c5306bb2.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/scheduler.37c15a92.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/singletons.bc78d867.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/index.18351ede.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/paths.76894643.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/entry/app.4264f5f8.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/index.7cb9c9b8.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/nodes/0.f5347c47.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/nodes/23.f106269f.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/Tip.d10b3fc9.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/CodeBlock.abae2786.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/CourseFloatingBanner.df82c153.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/Question.7e41e492.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/stores.cb4752a8.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/getInferenceSnippets.f9350a3f.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Supervised Fine-Tuning","local":"supervised-fine-tuning","sections":[{"title":"When to Use SFT","local":"when-to-use-sft","sections":[{"title":"Template Control","local":"template-control","sections":[],"depth":3},{"title":"Domain Adaptation","local":"domain-adaptation","sections":[],"depth":3}],"depth":2},{"title":"Dataset Preparation","local":"dataset-preparation","sections":[],"depth":2},{"title":"Training Configuration","local":"training-configuration","sections":[],"depth":2},{"title":"Implementation with TRL","local":"implementation-with-trl","sections":[],"depth":2},{"title":"Packing the Dataset","local":"packing-the-dataset","sections":[],"depth":2},{"title":"Monitoring Training Progress","local":"monitoring-training-progress","sections":[{"title":"Understanding Loss Patterns","local":"understanding-loss-patterns","sections":[],"depth":3},{"title":"Metrics to Monitor","local":"metrics-to-monitor","sections":[],"depth":3},{"title":"The Path to Convergence","local":"the-path-to-convergence","sections":[],"depth":3},{"title":"Monitoring Training Progress","local":"monitoring-training-progress","sections":[],"depth":3},{"title":"Warning Signs to Watch For","local":"warning-signs-to-watch-for","sections":[],"depth":3}],"depth":2},{"title":"Evaluation after SFT","local":"evaluation-after-sft","sections":[],"depth":2},{"title":"Quiz","local":"quiz","sections":[{"title":"1. What parameters control the training duration in SFT?","local":"1-what-parameters-control-the-training-duration-in-sft","sections":[],"depth":3},{"title":"2. Which pattern in the loss curves indicates potential overfitting?","local":"2-which-pattern-in-the-loss-curves-indicates-potential-overfitting","sections":[],"depth":3},{"title":"3. 
What is gradient_accumulation_steps used for?","local":"3-what-is-gradientaccumulationsteps-used-for","sections":[],"depth":3},{"title":"4. What should you monitor during SFT training?","local":"4-what-should-you-monitor-during-sft-training","sections":[],"depth":3},{"title":"5. What indicates healthy convergence during training?","local":"5-what-indicates-healthy-convergence-during-training","sections":[],"depth":3}],"depth":2},{"title":"💐 Nice work!","local":"-nice-work","sections":[],"depth":2},{"title":"Additional Resources","local":"additional-resources","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-2-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS4
2MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/course/en/chapter11/section3.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> </div> <h1 class="relative group"><a id="supervised-fine-tuning" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#supervised-fine-tuning"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Supervised Fine-Tuning</span></h1> <p data-svelte-h="svelte-1r3e85q">Supervised Fine-Tuning (SFT) is a process primarily used to adapt pre-trained language models to follow instructions, engage in dialogue, and use specific output formats. While pre-trained models have impressive general capabilities, SFT helps transform them into assistant-like models that can better understand and respond to user prompts. 
This is typically done by training on datasets of human-written conversations and instructions.</p> <p data-svelte-h="svelte-1lbymgt">This page provides a step-by-step guide to fine-tuning the <a href="https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" rel="nofollow"><code>deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B</code></a> model using the <a href="https://huggingface.co/docs/trl/en/sft_trainer" rel="nofollow"><code>SFTTrainer</code></a>. By following these steps, you can adapt the model to perform specific tasks more effectively.</p> <h2 class="relative group"><a id="when-to-use-sft" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#when-to-use-sft"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>When to Use SFT</span></h2> <p data-svelte-h="svelte-bzr0f8">Before diving into implementation, it’s important to understand when SFT is the right choice for your project. As a first step, you should consider whether using an existing instruction-tuned model with well-crafted prompts would suffice for your use case. 
SFT involves significant computational resources and engineering effort, so it should only be pursued when prompting existing models proves insufficient.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">Consider SFT only if you: | |
| - Need additional performance beyond what prompting can achieve | |
| - Have a specific use case where the cost of using a large general-purpose model outweighs the cost of fine-tuning a smaller model | |
| - Require specialized output formats or domain-specific knowledge that existing models struggle with</div> <p data-svelte-h="svelte-1vu1858">If you determine that SFT is necessary, the decision to proceed depends on two primary factors:</p> <h3 class="relative group"><a id="template-control" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#template-control"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Template Control</span></h3> <p data-svelte-h="svelte-vqqipn">SFT allows precise control over the model’s output structure. 
This is particularly valuable when you need the model to:</p> <ol data-svelte-h="svelte-f2jeai"><li>Generate responses in a specific chat template format</li> <li>Follow strict output schemas</li> <li>Maintain consistent styling across responses</li></ol> <h3 class="relative group"><a id="domain-adaptation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#domain-adaptation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Domain Adaptation</span></h3> <p data-svelte-h="svelte-63mafi">When working in specialized domains, SFT helps align the model with domain-specific requirements by:</p> <ol data-svelte-h="svelte-14ez77z"><li>Teaching domain terminology and concepts</li> <li>Enforcing professional standards</li> <li>Handling technical queries appropriately</li> <li>Following industry-specific guidelines</li></ol> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">Before starting SFT, evaluate whether your use case requires: | |
| - Precise output formatting | |
| - Domain-specific knowledge | |
| - Consistent response patterns | |
| - Adherence to specific guidelines | |
| <p data-svelte-h="svelte-ktd560">This evaluation will help determine if SFT is the right approach for your needs.</p></div> <h2 class="relative group"><a id="dataset-preparation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#dataset-preparation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Dataset Preparation</span></h2> <p data-svelte-h="svelte-1gnb02p">The supervised fine-tuning process requires a task-specific dataset structured with input-output pairs. Each pair should consist of:</p> <ol data-svelte-h="svelte-1vwaxbl"><li>An input prompt</li> <li>The expected model response</li> <li>Any additional context or metadata</li></ol> <p data-svelte-h="svelte-irg7th">The quality of your training data is crucial for successful fine-tuning. 
Let’s look at how to prepare and validate your dataset:</p> <iframe src="https://huggingface.co/datasets/HuggingFaceTB/smoltalk/embed/viewer/all/train?row=0" frameborder="0" width="100%" height="360px"></iframe> <h2 class="relative group"><a id="training-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#training-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Training Configuration</span></h2> <p data-svelte-h="svelte-ioywgh">The success of your fine-tuning depends heavily on choosing the right training parameters. Let’s explore each important parameter and how to configure them effectively:</p> <p data-svelte-h="svelte-14sqfj">The SFTTrainer configuration requires consideration of several parameters that control the training process. 
Let’s explore each parameter and its purpose:</p> <ol data-svelte-h="svelte-6ztbbs"><li><p><strong>Training Duration Parameters</strong>:</p> <ul><li><code>num_train_epochs</code>: Controls total training duration</li> <li><code>max_steps</code>: Alternative to epochs, sets maximum number of training steps</li> <li>More epochs allow better learning but risk overfitting</li></ul></li> <li><p><strong>Batch Size Parameters</strong>:</p> <ul><li><code>per_device_train_batch_size</code>: Determines memory usage and training stability</li> <li><code>gradient_accumulation_steps</code>: Enables larger effective batch sizes</li> <li>Larger batches provide more stable gradients but require more memory</li></ul></li> <li><p><strong>Learning Rate Parameters</strong>:</p> <ul><li><code>learning_rate</code>: Controls size of weight updates</li> <li><code>warmup_ratio</code>: Portion of training used for learning rate warmup</li> <li>Too high can cause instability, too low results in slow learning</li></ul></li> <li><p><strong>Monitoring Parameters</strong>:</p> <ul><li><code>logging_steps</code>: Frequency of metric logging</li> <li><code>eval_steps</code>: How often to evaluate on validation data</li> <li><code>save_steps</code>: Frequency of model checkpoint saves</li></ul></li></ol> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">Start with conservative values and adjust based on monitoring: | |
| - Begin with 1-3 epochs | |
| - Use smaller batch sizes initially | |
| - Monitor validation metrics closely | |
| - Adjust learning rate if training is unstable</div> <h2 class="relative group"><a id="implementation-with-trl" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#implementation-with-trl"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Implementation with TRL</span></h2> <p data-svelte-h="svelte-grvvks">Now that we understand the key components, let’s implement the training with proper validation and monitoring. We will use the <code>SFTTrainer</code> class from the Transformer Reinforcement Learning (TRL) library, which is built on top of the <code>transformers</code> library. 
Here’s a complete example using the TRL library:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
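| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer | |
| <span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> SFTConfig, SFTTrainer, setup_chat_format | |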
| <span class="hljs-keyword">import</span> torch | |
| <span class="hljs-comment"># Set device</span> | |
| device = <span class="hljs-string">"cuda"</span> <span class="hljs-keyword">if</span> torch.cuda.is_available() <span class="hljs-keyword">else</span> <span class="hljs-string">"cpu"</span> | |
| <span class="hljs-comment"># Load dataset</span> | |
| dataset = load_dataset(<span class="hljs-string">"HuggingFaceTB/smoltalk"</span>, <span class="hljs-string">"all"</span>) | |
| <span class="hljs-comment"># Configure model and tokenizer</span> | |
| model_name = <span class="hljs-string">"HuggingFaceTB/SmolLM2-135M"</span> | |
| model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name).to( | |
| device | |
| ) | |
| tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name) | |
| <span class="hljs-comment"># Setup chat template</span> | |
| model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer) | |
| <span class="hljs-comment"># Configure trainer</span> | |
| training_args = SFTConfig( | |
| output_dir=<span class="hljs-string">"./sft_output"</span>, | |
| max_steps=<span class="hljs-number">1000</span>, | |
| per_device_train_batch_size=<span class="hljs-number">4</span>, | |
| learning_rate=<span class="hljs-number">5e-5</span>, | |
| logging_steps=<span class="hljs-number">10</span>, | |
| save_steps=<span class="hljs-number">100</span>, | |
| eval_strategy=<span class="hljs-string">"steps"</span>, | |
| eval_steps=<span class="hljs-number">50</span>, | |
| ) | |
| <span class="hljs-comment"># Initialize trainer</span> | |
| trainer = SFTTrainer( | |
| model=model, | |
| args=training_args, | |
| train_dataset=dataset[<span class="hljs-string">"train"</span>], | |
| eval_dataset=dataset[<span class="hljs-string">"test"</span>], | |
| processing_class=tokenizer, | |
| ) | |
| <span class="hljs-comment"># Start training</span> | |
| trainer.train()<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">When using a dataset with a "messages" field (like the example above), the SFTTrainer automatically applies the model's chat template, which it retrieves from the hub. This means you don't need any additional configuration to handle chat-style conversations - the trainer will format the messages according to the model's expected template format.</div> <h2 class="relative group"><a id="packing-the-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#packing-the-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Packing the Dataset</span></h2> <p data-svelte-h="svelte-1nxebfc">The SFTTrainer supports example packing to optimize training efficiency. This feature allows multiple short examples to be packed into the same input sequence, maximizing GPU utilization during training. 
To enable packing, simply set <code>packing=True</code> in the SFTConfig constructor. When using packed datasets with <code>max_steps</code>, be aware that you may train for more epochs than expected depending on your packing configuration. You can customize how examples are combined using a formatting function - particularly useful when working with datasets that have multiple fields like question-answer pairs. For evaluation datasets, you can disable packing by setting <code>eval_packing=False</code> in the SFTConfig. Here’s a basic example of customizing the packing configuration:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Configure packing</span> | |
| training_args = SFTConfig(packing=<span class="hljs-literal">True</span>) | |
| trainer = SFTTrainer(model=model, train_dataset=dataset[<span class="hljs-string">"train"</span>], args=training_args) | |
| trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-c7t1n6">When packing the dataset with multiple fields, you can define a custom formatting function to combine the fields into a single input sequence. This function should take a single example and return the formatted text as a string. Here’s an example of a custom formatting function:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">formatting_func</span>(<span class="hljs-params">example</span>): | |
| text = <span class="hljs-string">f"### Question: <span class="hljs-subst">{example[<span class="hljs-string">'question'</span>]}</span>\n ### Answer: <span class="hljs-subst">{example[<span class="hljs-string">'answer'</span>]}</span>"</span> | |
| <span class="hljs-keyword">return</span> text | |
| training_args = SFTConfig(packing=<span class="hljs-literal">True</span>) | |
| trainer = SFTTrainer( | |
| <span class="hljs-string">"facebook/opt-350m"</span>, | |
| train_dataset=dataset, | |
| args=training_args, | |
| formatting_func=formatting_func, | |
| )<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="monitoring-training-progress" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#monitoring-training-progress"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Monitoring Training Progress</span></h2> <p data-svelte-h="svelte-muo025">Effective monitoring is crucial for successful fine-tuning. 
Let’s explore what to watch for during training:</p> <h3 class="relative group"><a id="understanding-loss-patterns" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#understanding-loss-patterns"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Understanding Loss Patterns</span></h3> <p data-svelte-h="svelte-1kxz6c2">Training loss typically follows three distinct phases:</p> <ol data-svelte-h="svelte-12kgh6p"><li>Initial Sharp Drop: Rapid adaptation to new data distribution</li> <li>Gradual Stabilization: The loss decreases more slowly as the model fine-tunes</li> <li>Convergence: Loss values stabilize, indicating training completion</li></ol> <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/nlp_course_sft_loss_graphic.png" alt="Training and validation loss curves that drop sharply at first and then gradually level off"> <h3 class="relative group"><a id="metrics-to-monitor" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#metrics-to-monitor"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" 
aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Metrics to Monitor</span></h3> <p data-svelte-h="svelte-1y9ggj5">Effective monitoring involves tracking quantitative metrics, and evaluating qualitative metrics. Available metrics are:</p> <ul data-svelte-h="svelte-acv7jp"><li>Training loss</li> <li>Validation loss</li> <li>Learning rate progression</li> <li>Gradient norms</li></ul> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400">Watch for these warning signs during training: | |
| 1. Validation loss increasing while training loss decreases (overfitting) | |
| 2. No significant improvement in loss values (underfitting) | |
| 3. Extremely low loss values (potential memorization) | |
| 4. Inconsistent output formatting (template learning issues)</div> <h3 class="relative group"><a id="the-path-to-convergence" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-path-to-convergence"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The Path to Convergence</span></h3> <p data-svelte-h="svelte-ie5i5">As training progresses, the loss curve should gradually stabilize. The key indicator of healthy training is a small gap between training and validation loss, suggesting | |
| the model is learning generalizable patterns rather than memorizing specific examples. The absolute loss values will vary depending on your task and dataset.</p> <h3 class="relative group"><a id="monitoring-training-progress" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#monitoring-training-progress"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Monitoring Training Progress</span></h3> <p data-svelte-h="svelte-1s6faap">The graph above shows a typical training progression. Notice how both training and validation loss decrease sharply at first, then gradually level off. 
This pattern indicates the model is learning effectively while maintaining generalization ability.</p> <h3 class="relative group"><a id="warning-signs-to-watch-for" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#warning-signs-to-watch-for"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Warning Signs to Watch For</span></h3> <p data-svelte-h="svelte-1yge1oh">Several patterns in the loss curves can indicate potential issues. Below we illustrate common warning signs and solutions that we can consider.</p> <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/sft_loss_1.png" alt="SFTTrainer Training"> <p data-svelte-h="svelte-13ggrtv">If the validation loss decreases at a significantly slower rate than training loss, your model is likely overfitting to the training data. 
Consider:</p> <ul data-svelte-h="svelte-1a305az"><li>Reducing the training steps</li> <li>Increasing the dataset size</li> <li>Validating dataset quality and diversity</li></ul> <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/sft_loss_2.png" alt="SFTTrainer Training"> <p data-svelte-h="svelte-176u6kl">If the loss doesn’t show significant improvement, the model might be:</p> <ul data-svelte-h="svelte-1dkaf9f"><li>Learning too slowly (try increasing the learning rate)</li> <li>Struggling with the task (check data quality and task complexity)</li> <li>Hitting architecture limitations (consider a different model)</li></ul> <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/sft_loss_3.png" alt="SFTTrainer Training"> <p data-svelte-h="svelte-ilfaf">Extremely low loss values could suggest memorization rather than learning. This is particularly concerning if:</p> <ul data-svelte-h="svelte-99mvik"><li>The model performs poorly on new, similar examples</li> <li>The outputs lack diversity</li> <li>The responses are too similar to training examples</li></ul> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400">Monitor both the loss values and the model's actual outputs during training. Sometimes the loss can look good while the model develops unwanted behaviors. Regular qualitative evaluation of the model's responses helps catch issues that metrics alone might miss.</div> <p data-svelte-h="svelte-ttgvgc">We should note that the interpretation of the loss values we outline here is aimed at the most common case, and in fact, loss values can behave in various ways depending on the model, the dataset, the training parameters, etc. 
If you are interested in exploring the outlined patterns further, you should check out this blog post by the people at <a href="https://www.fast.ai/posts/2023-09-04-learning-jumps/" rel="nofollow">Fast AI</a>.</p> <h2 class="relative group"><a id="evaluation-after-sft" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#evaluation-after-sft"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Evaluation after SFT</span></h2> <p data-svelte-h="svelte-1rfoj7z">In section <a href="/en/chapter11/4">11.4</a> we will learn how to evaluate the model using benchmark datasets. 
For now, we will focus on the qualitative evaluation of the model.</p> <p data-svelte-h="svelte-1028cbr">After completing SFT, consider these follow-up actions:</p> <ol data-svelte-h="svelte-1irg2q8"><li>Evaluate the model thoroughly on held-out test data</li> <li>Validate template adherence across various inputs</li> <li>Test domain-specific knowledge retention</li> <li>Monitor real-world performance metrics</li></ol> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">Document your training process, including: | |
| - Dataset characteristics | |
| - Training parameters | |
| - Performance metrics | |
| - Known limitations | |
| This documentation will be valuable for future model iterations.</div> <h2 class="relative group"><a id="quiz" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quiz"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Quiz</span></h2> <h3 class="relative group"><a id="1-what-parameters-control-the-training-duration-in-sft" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-what-parameters-control-the-training-duration-in-sft"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 
11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. What parameters control the training duration in SFT?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->num_train_epochs and max_steps<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->batch_size and learning_rate<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->gradient_checkpointing and warmup_ratio<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="2-which-pattern-in-the-loss-curves-indicates-potential-overfitting" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-which-pattern-in-the-loss-curves-indicates-potential-overfitting"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 
56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. Which pattern in the loss curves indicates potential overfitting?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->Validation loss increases while training loss continues to decrease<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->Both training and validation loss decrease steadily<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->Training loss remains constant while validation loss decreases<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="3-what-is-gradientaccumulationsteps-used-for" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-what-is-gradientaccumulationsteps-used-for"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 
56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. What is gradient_accumulation_steps used for?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->To increase effective batch size without using more memory<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->To save checkpoints during training<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->To control the learning rate schedule<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="4-what-should-you-monitor-during-sft-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#4-what-should-you-monitor-during-sft-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 
11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>4. What should you monitor during SFT training?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->Both quantitative metrics and qualitative outputs<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->Only the training loss<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->Only the model's output quality<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="5-what-indicates-healthy-convergence-during-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#5-what-indicates-healthy-convergence-during-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>5. What indicates healthy convergence during training?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->A small gap between training and validation loss<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->Training loss reaching zero<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->Validation loss being lower than training loss<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h2 class="relative group"><a id="-nice-work" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-nice-work"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>💐 Nice work!</span></h2> <p data-svelte-h="svelte-11s8azl">You’ve learned how to 
fine-tune models using SFT! To continue your learning:</p> <ol data-svelte-h="svelte-eabkx7"><li>Try the notebook with different parameters</li> <li>Experiment with other datasets</li> <li>Contribute improvements to the course material</li></ol> <h2 class="relative group"><a id="additional-resources" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#additional-resources"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Additional Resources</span></h2> <ul data-svelte-h="svelte-1qwbi4x"><li><a href="https://huggingface.co/docs/trl" rel="nofollow">TRL Documentation</a></li> <li><a href="https://github.com/huggingface/trl/blob/main/trl/scripts/sft.py" rel="nofollow">SFT Examples Repository</a></li> <li><a href="https://huggingface.co/docs/transformers/training" rel="nofollow">Fine-tuning Best Practices</a></li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/en/chapter11/3.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span 
class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_1y0degu = { | |
| assets: "/docs/course/pr_1069/en", | |
| base: "/docs/course/pr_1069/en", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/course/pr_1069/en/_app/immutable/entry/start.c5306bb2.js"), | |
| import("/docs/course/pr_1069/en/_app/immutable/entry/app.4264f5f8.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 23], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 62.6 kB
- Xet hash:
- 029c5010f7101a8d9a58135e7c4e70a73b69e2fd47a065104dbde7ae55b0c2ec
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.