Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / course /pr_1069 /en /chapter12 /3.html

rtrm

3 months ago

download

raw

59.8 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Understanding the DeepSeek R1 Paper","local":"understanding-the-deepseek-r1-paper","sections":[{"title":"The Breakthrough ‘Aha’ Moment","local":"the-breakthrough-aha-moment","sections":[],"depth":2},{"title":"The Training Process","local":"the-training-process","sections":[{"title":"Cold Start Phase (Quality Foundation)","local":"cold-start-phase-quality-foundation","sections":[],"depth":3},{"title":"Reasoning RL Phase (Capability Building)","local":"reasoning-rl-phase-capability-building","sections":[],"depth":3},{"title":"Rejection Sampling Phase (Quality Control)","local":"rejection-sampling-phase-quality-control","sections":[],"depth":3},{"title":"Diverse RL Phase (Broad Alignment)","local":"diverse-rl-phase-broad-alignment","sections":[],"depth":3}],"depth":2},{"title":"The Algorithm: Group Relative Policy Optimization (GRPO)","local":"the-algorithm-group-relative-policy-optimization-grpo","sections":[{"title":"Group Formation: Creating Multiple Solutions","local":"group-formation-creating-multiple-solutions","sections":[],"depth":3},{"title":"Preference Learning: Understanding What Makes a Good Solution","local":"preference-learning-understanding-what-makes-a-good-solution","sections":[],"depth":3},{"title":"Optimization: Learning from Experience","local":"optimization-learning-from-experience","sections":[],"depth":3},{"title":"GRPO Algorithm in Pseudocode","local":"grpo-algorithm-in-pseudocode","sections":[],"depth":3}],"depth":2},{"title":"Results and Impact","local":"results-and-impact","sections":[],"depth":2},{"title":"Limitations and Challenges of GRPO","local":"limitations-and-challenges-of-grpo","sections":[],"depth":2},{"title":"Conclusion","local":"conclusion","sections":[],"depth":2},{"title":"Quiz","local":"quiz","sections":[{"title":"1. 
What is the main innovation of the DeepSeek R1 paper?","local":"1-what-is-the-main-innovation-of-the-deepseek-r1-paper","sections":[],"depth":3},{"title":"2. What are the four phases of the DeepSeek R1 training process?","local":"2-what-are-the-four-phases-of-the-deepseek-r1-training-process","sections":[],"depth":3},{"title":"3. What is the ‘Aha Moment’ phenomenon in R1-Zero’s training?","local":"3-what-is-the-aha-moment-phenomenon-in-r1-zeros-training","sections":[],"depth":3},{"title":"4. How does GRPO’s group formation work?","local":"4-how-does-grpos-group-formation-work","sections":[],"depth":3},{"title":"5. What is the key difference between DeepSeek-R1-Zero and DeepSeek-R1?","local":"5-what-is-the-key-difference-between-deepseek-r1-zero-and-deepseek-r1","sections":[],"depth":3}],"depth":2}],"depth":1}">
	<link href="/docs/course/pr_1069/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/entry/start.c5306bb2.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/scheduler.37c15a92.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/singletons.bc78d867.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/index.18351ede.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/paths.76894643.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/entry/app.4264f5f8.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/index.7cb9c9b8.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/nodes/0.f5347c47.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/nodes/30.7486345b.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/Tip.d10b3fc9.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/CodeBlock.abae2786.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/Question.7e41e492.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/stores.cb4752a8.js">
	<link rel="modulepreload" href="/docs/course/pr_1069/en/_app/immutable/chunks/getInferenceSnippets.f9350a3f.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Understanding the DeepSeek R1 Paper","local":"understanding-the-deepseek-r1-paper","sections":[{"title":"The Breakthrough ‘Aha’ Moment","local":"the-breakthrough-aha-moment","sections":[],"depth":2},{"title":"The Training Process","local":"the-training-process","sections":[{"title":"Cold Start Phase (Quality Foundation)","local":"cold-start-phase-quality-foundation","sections":[],"depth":3},{"title":"Reasoning RL Phase (Capability Building)","local":"reasoning-rl-phase-capability-building","sections":[],"depth":3},{"title":"Rejection Sampling Phase (Quality Control)","local":"rejection-sampling-phase-quality-control","sections":[],"depth":3},{"title":"Diverse RL Phase (Broad Alignment)","local":"diverse-rl-phase-broad-alignment","sections":[],"depth":3}],"depth":2},{"title":"The Algorithm: Group Relative Policy Optimization (GRPO)","local":"the-algorithm-group-relative-policy-optimization-grpo","sections":[{"title":"Group Formation: Creating Multiple Solutions","local":"group-formation-creating-multiple-solutions","sections":[],"depth":3},{"title":"Preference Learning: Understanding What Makes a Good Solution","local":"preference-learning-understanding-what-makes-a-good-solution","sections":[],"depth":3},{"title":"Optimization: Learning from Experience","local":"optimization-learning-from-experience","sections":[],"depth":3},{"title":"GRPO Algorithm in Pseudocode","local":"grpo-algorithm-in-pseudocode","sections":[],"depth":3}],"depth":2},{"title":"Results and Impact","local":"results-and-impact","sections":[],"depth":2},{"title":"Limitations and Challenges of GRPO","local":"limitations-and-challenges-of-grpo","sections":[],"depth":2},{"title":"Conclusion","local":"conclusion","sections":[],"depth":2},{"title":"Quiz","local":"quiz","sections":[{"title":"1. 
What is the main innovation of the DeepSeek R1 paper?","local":"1-what-is-the-main-innovation-of-the-deepseek-r1-paper","sections":[],"depth":3},{"title":"2. What are the four phases of the DeepSeek R1 training process?","local":"2-what-are-the-four-phases-of-the-deepseek-r1-training-process","sections":[],"depth":3},{"title":"3. What is the ‘Aha Moment’ phenomenon in R1-Zero’s training?","local":"3-what-is-the-aha-moment-phenomenon-in-r1-zeros-training","sections":[],"depth":3},{"title":"4. How does GRPO’s group formation work?","local":"4-how-does-grpos-group-formation-work","sections":[],"depth":3},{"title":"5. What is the key difference between DeepSeek-R1-Zero and DeepSeek-R1?","local":"5-what-is-the-key-difference-between-deepseek-r1-zero-and-deepseek-r1","sections":[],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="understanding-the-deepseek-r1-paper" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#understanding-the-deepseek-r1-paper"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Understanding the DeepSeek R1 Paper</span></h1> <p 
data-svelte-h="svelte-1q1gbg6">This chapter is a crash course paper reading. We will walk through the paper in simple terms, and then we will break down the key concepts and takeaways.</p> <p data-svelte-h="svelte-1k7queo">DeepSeek R1 represents a significant advancement in language model training, particularly in developing reasoning capabilities through reinforcement learning. The paper introduces a new reinforcement learning algorithm called Group Relative Policy Optimization (GRPO).</p> <p data-svelte-h="svelte-twqc21"><img src="https://huggingface.co/reasoning-course/images/resolve/main/grpo/4.png" alt="DeepSeek R1 Overview"></p> <p data-svelte-h="svelte-e7b9nv">In the next chapter, we will build on this knowledge and implement GRPO in practice.</p> <p data-svelte-h="svelte-5ujq1b">The initial goal of the paper was to explore whether pure reinforcement learning could develop reasoning capabilities without supervised fine-tuning.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-187ezr">Up until that point, all the popular LLMs required some supervised fine-tuning, which we explored in <a href="/course/chapter11/1">chapter 11</a>.</p></div> <h2 class="relative group"><a id="the-breakthrough-aha-moment" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-breakthrough-aha-moment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 
84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The Breakthrough ‘Aha’ Moment</span></h2> <p data-svelte-h="svelte-1xpbuvl"><img src="https://huggingface.co/reasoning-course/images/resolve/main/grpo/9.png" alt="The 'Aha Moment'"></p> <p data-svelte-h="svelte-80ibr0">One of the most remarkable discoveries in R1-Zero’s training was the emergence of a phenomenon known as the “Aha Moment.” This phenomenon is somewhat similar to how humans experience sudden realizations while problem-solving. Here’s how it works:</p> <ol data-svelte-h="svelte-1l3luam"><li>Initial Attempt: The model makes an initial attempt at solving a problem</li> <li>Recognition: It recognizes potential errors or inconsistencies</li> <li>Self-Correction: It adjusts its approach based on this recognition</li> <li>Explanation: It can explain why the new approach is better</li></ol> <p data-svelte-h="svelte-niphs">This breakthrough resonates with learners and feels like a “Eureka” moment. 
It demonstrates learning rather than mere memorization, so let’s take a moment to imagine what it feels like to have an “Aha” moment.</p> <p data-svelte-h="svelte-rvuj0k">For example, imagine you’re trying to solve a puzzle:</p> <ul data-svelte-h="svelte-1v8g9eq"><li>First try: “This piece should go here based on the color”</li> <li>Recognition: “But wait, the shape doesn’t quite fit”</li> <li>Correction: “Ah, it actually belongs over there”</li> <li>Explanation: “Because both the color and shape pattern match in this position”</li></ul> <p data-svelte-h="svelte-kqgwp0">This ability emerged naturally from RL training, without being explicitly programmed, demonstrating learning rather than mere memorization of a process from the training data.</p> <p data-svelte-h="svelte-1jq77qm">The easiest way to understand the ‘Aha’ moment is to see it in action. Let’s take a look at an example. In the chat below, we ask the model to solve a problem and the UI shows the model’s thought process as it solves the problem.</p> <iframe src="https://reasoning-course-deepseek-ai-deepseek-r1-distill-0f5fad4.hf.space/" frameborder="0" width="850" height="450"></iframe> <p data-svelte-h="svelte-13tiuz5">If you want to try DeepSeek’s R1, you can also check out <a href="https://huggingface.co/chat/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B" rel="nofollow">HuggingChat</a>.</p> <h2 class="relative group"><a id="the-training-process" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-training-process"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 
84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The Training Process</span></h2> <p data-svelte-h="svelte-1tldo1w">Training R1 was a multi-phase process. Let’s break down the phases and the key innovations in each phase.</p> <p data-svelte-h="svelte-1pxk19b">The final process results in two models:</p> <ul data-svelte-h="svelte-ouyuru"><li>DeepSeek-R1-Zero: A model trained purely using reinforcement learning.</li> <li>DeepSeek-R1: A model that builds on the foundation of DeepSeek-R1-Zero and adds supervised fine-tuning.</li></ul> <table data-svelte-h="svelte-1koosbs"><thead><tr><th>Feature</th> <th>DeepSeek-R1-Zero</th> <th>DeepSeek-R1</th></tr></thead> <tbody><tr><td>Training Approach</td> <td>Pure RL</td> <td>Multi-phase (SFT + RL)</td></tr> <tr><td>Fine-tuning</td> <td>None</td> <td>Supervised fine-tuning</td></tr> <tr><td>Reasoning Capability</td> <td>Emergent</td> <td>Enhanced</td></tr> <tr><td>AIME Performance</td> <td>71.0%</td> <td>79.8%</td></tr> <tr><td>Key Characteristics</td> <td>Strong reasoning but readability issues</td> <td>Better language consistency and readability</td></tr></tbody></table> <p data-svelte-h="svelte-6i75xt">While DeepSeek-R1-Zero demonstrates the potential of pure reinforcement learning for developing reasoning capabilities, DeepSeek-R1 builds upon this foundation with a more balanced approach that prioritizes both reasoning performance and usability.</p> <p data-svelte-h="svelte-vvjgq">The training process involves four phases:</p> <ol data-svelte-h="svelte-1y01w2y"><li>Cold Start Phase</li> <li>Reasoning RL Phase</li> <li>Rejection Sampling Phase</li> <li>Diverse 
RL Phase</li></ol> <p data-svelte-h="svelte-1xbrite">Let’s break down each phase:</p> <h3 class="relative group"><a id="cold-start-phase-quality-foundation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#cold-start-phase-quality-foundation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Cold Start Phase (Quality Foundation)</span></h3> <p data-svelte-h="svelte-6lbybj"><img src="https://huggingface.co/reasoning-course/images/resolve/main/grpo/5.png" alt="Cold Start Phase"></p> <p data-svelte-h="svelte-2jkbw6">This phase is designed to establish a strong foundation for the model’s readability and response quality. It uses a small dataset of high-quality samples from R1-Zero to fine-tune the V3-Base model. Starting with the DeepSeek-V3-Base model, the team used thousands of validated, high-quality samples from R1-Zero for supervised fine-tuning. 
This innovative approach uses a small but high-quality dataset to establish strong baseline readability and response quality.</p> <h3 class="relative group"><a id="reasoning-rl-phase-capability-building" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#reasoning-rl-phase-capability-building"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Reasoning RL Phase (Capability Building)</span></h3> <p data-svelte-h="svelte-1n73fn0"><img src="https://huggingface.co/reasoning-course/images/resolve/main/grpo/6.png" alt="Reasoning RL Phase"></p> <p data-svelte-h="svelte-qsr133">The Reasoning RL Phase focuses on developing core reasoning capabilities across domains including mathematics, coding, science, and logic. This phase employs rule-based reinforcement learning, with rewards directly tied to solution correctness.</p> <p data-svelte-h="svelte-28fufx">Crucially, all the tasks in this phase are ‘verifiable’, so we can check whether the model’s answer is correct. 
For example, in the case of mathematics, we can check if the model’s answer is correct by using a mathematical solver.</p> <p data-svelte-h="svelte-1a23tqi">What makes this phase particularly innovative is its direct optimization approach that eliminates the need for a separate reward model, streamlining the training process.</p> <h3 class="relative group"><a id="rejection-sampling-phase-quality-control" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#rejection-sampling-phase-quality-control"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Rejection Sampling Phase (Quality Control)</span></h3> <p data-svelte-h="svelte-17yoxfz"><img src="https://huggingface.co/reasoning-course/images/resolve/main/grpo/7.png" alt="Rejection Sampling Phase"></p> <p data-svelte-h="svelte-yrmt66">During the Rejection Sampling Phase, the model generates samples which are then filtered through a quality control process. DeepSeek-V3 serves as the quality judge, evaluating outputs across a broad scope that extends beyond pure reasoning tasks. The filtered data is then used for supervised fine-tuning. 
This phase’s innovation lies in its ability to combine multiple quality signals to ensure high-standard outputs.</p> <h3 class="relative group"><a id="diverse-rl-phase-broad-alignment" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#diverse-rl-phase-broad-alignment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Diverse RL Phase (Broad Alignment)</span></h3> <p data-svelte-h="svelte-wb3g6q"><img src="https://huggingface.co/reasoning-course/images/resolve/main/grpo/8.png" alt="Diverse RL Phase"></p> <p data-svelte-h="svelte-du25qh">The final Diverse RL Phase tackles multiple task types using a sophisticated hybrid approach. For deterministic tasks, it employs rule-based rewards, while subjective tasks are evaluated through LLM feedback. 
This phase aims to achieve human preference alignment through its innovative hybrid reward approach, combining the precision of rule-based systems with the flexibility of language model evaluation.</p> <h2 class="relative group"><a id="the-algorithm-group-relative-policy-optimization-grpo" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-algorithm-group-relative-policy-optimization-grpo"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The Algorithm: Group Relative Policy Optimization (GRPO)</span></h2> <p data-svelte-h="svelte-7adhmj">Now that we have a good understanding of the training process, let’s look at the algorithm that was used to train the model.</p> <p data-svelte-h="svelte-90a27r">The authors describe GRPO as a breakthrough in model fine-tuning:</p> <p data-svelte-h="svelte-1w1t5mr"><img src="https://huggingface.co/reasoning-course/images/resolve/main/grpo/10.png" alt="GRPO Process"></p> <p data-svelte-h="svelte-dzgmzx">GRPO’s novelty lies in its capacity to “directly optimize for preference rectification.” This signifies a more direct and efficient route to aligning the model with desired outputs, 
contrasting with traditional Reinforcement Learning algorithms such as PPO. Let’s break down how GRPO works through its three main components.</p> <h3 class="relative group"><a id="group-formation-creating-multiple-solutions" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#group-formation-creating-multiple-solutions"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Group Formation: Creating Multiple Solutions</span></h3> <p data-svelte-h="svelte-1vppveo">The first step in GRPO is remarkably intuitive - it’s similar to how a student might solve a difficult problem by trying multiple approaches. When given a prompt, the model doesn’t just generate one response; instead, it creates multiple attempts at solving the same problem (usually 4, 8, or 16 different attempts).</p> <p data-svelte-h="svelte-1d5f2df">Imagine you’re teaching a model to solve math problems. 
For a question about counting chickens on a farm, the model might generate several different solutions:</p> <ul data-svelte-h="svelte-aefoyw"><li>One solution might break down the problem step by step: first counting total chickens, then subtracting roosters, and finally accounting for non-laying hens</li> <li>Another might use a different but equally valid approach</li> <li>Some attempts might contain mistakes or less efficient solutions</li></ul> <p data-svelte-h="svelte-stbkld">All these attempts are kept together as a group, much like having multiple students’ solutions to compare and learn from.</p> <p data-svelte-h="svelte-149l4cn"><img src="https://huggingface.co/reasoning-course/images/resolve/main/grpo/11.jpg" alt="Group Formation"></p> <h3 class="relative group"><a id="preference-learning-understanding-what-makes-a-good-solution" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#preference-learning-understanding-what-makes-a-good-solution"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Preference Learning: Understanding What Makes a Good Solution</span></h3> <p data-svelte-h="svelte-ayjaop">This is where GRPO really 
shines in its simplicity. Unlike other methods for RLHF that always require a separate reward model to predict how good a solution might be, GRPO can use any function or model to evaluate the quality of a solution. For example, we could use a length function to reward shorter responses or a mathematical solver to reward accurate mathematical solutions.</p> <p data-svelte-h="svelte-1mttff2">The evaluation process looks at various aspects of each solution:</p> <ul data-svelte-h="svelte-s43bsy"><li>Is the final answer correct?</li> <li>Did the solution follow proper formatting (like using the right XML tags)?</li> <li>Does the reasoning match the answer provided?</li></ul> <p data-svelte-h="svelte-1z08xc1">What makes this approach particularly clever is how it handles the scoring. Instead of just giving absolute scores, GRPO normalizes the rewards within each group. It uses a simple but effective formula for group relative advantage estimation:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 
border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-variable">Advantage</span> = (<span class="hljs-variable">reward</span> - <span class="hljs-function"><span class="hljs-title">mean</span>(<span class="hljs-variable">group_rewards</span>)) / <span class="hljs-title">std</span>(<span class="hljs-variable">group_rewards</span>)</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-163dfc7"><img src="https://huggingface.co/reasoning-course/images/resolve/main/grpo/12.jpg" alt="Preference Learning"></p> <p data-svelte-h="svelte-1k0pdwy">This normalization is like grading on a curve, but for AI. It helps the model understand which solutions within the group were better or worse compared to their peers, rather than just looking at absolute scores.</p> <h3 class="relative group"><a id="optimization-learning-from-experience" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#optimization-learning-from-experience"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Optimization: 
Learning from Experience</span></h3> <p data-svelte-h="svelte-sa89ep">The final step is where GRPO teaches the model to improve based on what it learned from evaluating the group of solutions. This process is both powerful and stable, using two main principles:</p> <ol data-svelte-h="svelte-1o8qmwl"><li>It encourages the model to produce more solutions like the successful ones while moving away from less effective approaches</li> <li>It includes a safety mechanism (called KL divergence penalty) that prevents the model from changing too drastically all at once</li></ol> <p data-svelte-h="svelte-1f6l2f7">This approach proves more stable than traditional methods because:</p> <ul data-svelte-h="svelte-1m7p3yo"><li>It looks at multiple solutions together rather than comparing just two at a time</li> <li>The group-based normalization helps prevent issues with reward scaling</li> <li>The KL penalty acts like a safety net, ensuring the model doesn’t forget what it already knows while learning new things</li></ul> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-fcucmp">GRPO’s key innovations are:</p> <ul data-svelte-h="svelte-1udthal"><li>Learning directly from any function or model, eliminating the reliance on a separate reward model.</li> <li>Group-based learning, which is more stable and efficient than traditional methods like pairwise comparisons.</li></ul></div> <p data-svelte-h="svelte-12ai4fi">This breakdown is complex, but the key takeaway is that GRPO is a more efficient and stable way to train a model to reason.</p> <h3 class="relative group"><a id="grpo-algorithm-in-pseudocode" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" 
href="#grpo-algorithm-in-pseudocode"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>GRPO Algorithm in Pseudocode</span></h3> <p data-svelte-h="svelte-z4q586">Now that we understand the key components of GRPO, let’s look at the algorithm in pseudocode. This is a simplified version of the algorithm, but it captures the key ideas.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 
opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Input:
	- initial_policy: Starting <span class="hljs-keyword">model</span> to be trained
	- reward_function: Function that evaluates outputs
	- training_prompts: <span class="hljs-keyword">Set</span> of <span class="hljs-comment">training examples</span>
	- group_size: Number <span class="hljs-comment">of outputs per prompt (typically 4-16)</span>

	Algorithm <span class="hljs-comment">GRPO:</span>
	1. For <span class="hljs-comment">each training iteration:</span>
	a. <span class="hljs-keyword">Set</span> <span class="hljs-comment">reference_policy = initial_policy (snapshot current policy)</span>
	b. For <span class="hljs-comment">each prompt in batch:</span>
	i. Generate <span class="hljs-comment">group_size different outputs using initial_policy</span>
	ii. Compute <span class="hljs-comment">rewards for each output using reward_function</span>
	iii. Normalize <span class="hljs-comment">rewards within group:</span>
	normalized_advantage <span class="hljs-comment">= (reward - mean(rewards))</span> / std(rewards)
	iv. Update policy by <span class="hljs-keyword">maximizing</span> the clipped ratio:
	<span class="hljs-built_in">min</span>(prob_ratio * normalized_advantage,
	clip(prob_ratio, <span class="hljs-number">1</span>-epsilon, <span class="hljs-number">1</span>+epsilon) * normalized_advantage)
	- kl_weight * KL(initial_policy || reference_policy)

	where prob_ratio is current_prob / <span class="hljs-comment">reference_prob</span>

	Output: Optimized <span class="hljs-comment">policy model</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-aepand">This algorithm shows how GRPO combines group-based advantage estimation with policy optimization while maintaining stability through clipping and KL divergence constraints.</p> <h2 class="relative group"><a id="results-and-impact" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#results-and-impact"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Results and Impact</span></h2> <p data-svelte-h="svelte-m9e1ai">Now that we’ve explored the algorithm, let’s look at the results. 
DeepSeek R1 achieves state-of-the-art performance across multiple domains:</p> <table data-svelte-h="svelte-joxk22"><thead><tr><th>Domain</th> <th>Key Results</th></tr></thead> <tbody><tr><td>Mathematics</td> <td>• 79.8% on AIME 2024<br>• 97.3% on MATH-500</td></tr> <tr><td>Coding</td> <td>• Codeforces Rating: 2029<br>• LiveCodeBench: 65.9%</td></tr> <tr><td>General Knowledge</td> <td>• MMLU: 90.8%<br>• GPQA Diamond: 71.5%</td></tr> <tr><td>Language Tasks</td> <td>• AlpacaEval 2.0: 87.6% win rate<br>• FRAMES: 82.5%</td></tr></tbody></table> <p data-svelte-h="svelte-1s11wjp">The model’s practical impact extends beyond benchmarks through its cost-effective API pricing ($0.14 per million input tokens) and successful model distillation across various sizes (1.5B to 70B parameters). Notably, even the 7B model achieves 55.5% on AIME 2024, while the 70B distilled version approaches o1-mini performance on MATH-500 (94.5%), demonstrating effective capability preservation at different scales.</p> <h2 class="relative group"><a id="limitations-and-challenges-of-grpo" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#limitations-and-challenges-of-grpo"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 
0-79.196z" fill="currentColor"></path></svg></span></a> <span>Limitations and Challenges of GRPO</span></h2> <p data-svelte-h="svelte-9kkoxt">While GRPO represents a significant advancement in reinforcement learning for language models, it’s important to understand its limitations and challenges:</p> <ul data-svelte-h="svelte-1dg63f5"><li><strong>Generation Cost</strong>: Generating multiple completions (4-16) for each prompt increases computational requirements compared to methods that generate only one or two completions.</li> <li><strong>Batch Size Constraints</strong>: The need to process groups of completions together can limit effective batch sizes, adding complexity to the training process and potentially slowing down training.</li> <li><strong>Reward Function Design</strong>: The quality of training heavily depends on well-designed reward functions. Poorly designed rewards can lead to unintended behaviors or optimization for the wrong objectives.</li> <li><strong>Group Size Tradeoffs</strong>: Choosing the optimal group size involves balancing diversity of solutions against computational cost. 
Too few samples may not provide enough diversity, while too many increase training time and resource requirements.</li> <li><strong>KL Divergence Tuning</strong>: Finding the right balance for the KL divergence penalty requires careful tuning - too high and the model won’t learn effectively, too low and it may diverge too far from its initial capabilities.</li></ul> <h2 class="relative group"><a id="conclusion" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#conclusion"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Conclusion</span></h2> <p data-svelte-h="svelte-1txuptz">The DeepSeek R1 paper represents a significant milestone in language model development. The Group Relative Policy Optimization (GRPO) algorithm has demonstrated that pure reinforcement learning can indeed develop strong reasoning capabilities, challenging previous assumptions about the necessity of supervised fine-tuning.</p> <p data-svelte-h="svelte-hs6o69">Perhaps most importantly, DeepSeek R1 has shown that it’s possible to balance high performance with practical considerations like cost-effectiveness and accessibility. 
The successful distillation of the model’s capabilities across different sizes, from 1.5B to 70B parameters, demonstrates a path forward for making advanced AI capabilities more widely available.</p> <hr> <p data-svelte-h="svelte-je3zzp">In the next section, we’ll explore practical implementations of these concepts, focusing on how to leverage GRPO and RFTrans in your own language model development projects.</p> <h2 class="relative group"><a id="quiz" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quiz"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Quiz</span></h2> <h3 class="relative group"><a id="1-what-is-the-main-innovation-of-the-deepseek-r1-paper" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-what-is-the-main-innovation-of-the-deepseek-r1-paper"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 
8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. What is the main innovation of the DeepSeek R1 paper?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->The GRPO algorithm that enables learning from preferences with and without a reward model<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->Using more GPUs for training than any previous model<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->Creating a larger language model than existing ones<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="2-what-are-the-four-phases-of-the-deepseek-r1-training-process" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-what-are-the-four-phases-of-the-deepseek-r1-training-process"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 
256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. What are the four phases of the DeepSeek R1 training process?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->Cold Start, Reasoning RL, Rejection Sampling, and Diverse RL<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->Pre-training, Fine-tuning, Testing, and Deployment<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->Data Collection, Model Training, Evaluation, and Optimization<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="3-what-is-the-aha-moment-phenomenon-in-r1-zeros-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-what-is-the-aha-moment-phenomenon-in-r1-zeros-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid 
meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. What is the ‘Aha Moment’ phenomenon in R1-Zero’s training?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->A process where the model recognizes errors, self-corrects, and explains its corrections<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->The point where the model reaches human-level performance<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->When the model completes its training process<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="4-how-does-grpos-group-formation-work" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#4-how-does-grpos-group-formation-work"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" 
viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>4. How does GRPO’s group formation work?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->It generates multiple solutions (4-16) for the same problem and evaluates them together<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->It combines multiple models into one ensemble<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->It splits the training data into different groups<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="5-what-is-the-key-difference-between-deepseek-r1-zero-and-deepseek-r1" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#5-what-is-the-key-difference-between-deepseek-r1-zero-and-deepseek-r1"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>5. What is the key difference between DeepSeek-R1-Zero and DeepSeek-R1?</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->R1-Zero uses pure RL while R1 combines RL with supervised fine-tuning<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->R1-Zero is smaller than R1<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->R1-Zero was trained on less data<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/en/chapter12/3.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Runtime path configuration, published as a page-wide global. The assignment is
		// intentionally undeclared (no const/let/var), which makes it a global in this
		// classic script. Presumably the entry modules imported below read it by this
		// exact name -- confirm before renaming or declaring it.
		__sveltekit_1y0degu = {
			assets: "/docs/course/pr_1069/en",
			base: "/docs/course/pr_1069/en",
			env: {}
		};

		// document.currentScript is only available while this script executes
		// synchronously, so the element that contains this script tag is captured
		// here, before any asynchronous work begins.
		const mountTarget = document.currentScript.parentElement;

		// Options handed unchanged to kit.start() once both modules have loaded.
		const startOptions = {
			node_ids: [0, 30],
			data: [null, null],
			form: null,
			error: null
		};

		// Request both entry modules up front so they download in parallel.
		const pendingModules = [
			import("/docs/course/pr_1069/en/_app/immutable/entry/start.c5306bb2.js"),
			import("/docs/course/pr_1069/en/_app/immutable/entry/app.4264f5f8.js")
		];

		Promise.all(pendingModules).then((loaded) => {
			const kit = loaded[0];
			const app = loaded[1];
			kit.start(app, mountTarget, startOptions);
		});
	}
	</script>

Xet Storage Details

Size: 59.8 kB
Xet hash: eff8eb3d63fd151af652c55d685f6b6ac71d764a25f8609e1e239d6fa22848d2

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.