import marimo

__generated_with = "0.19.2"
app = marimo.App(width="medium")


with app.setup(hide_code=True):
    import marimo as mo
    import numpy as np
    from widget import GrpoGdpoWidget
|
|
@app.cell(hide_code=True)
def _():
    mo.md(r"""
    # GRPO vs GDPO: Why Normalization Order Matters

    When you're training a model with multiple reward signals, you'd think weighting them is straightforward. Set 40% on correctness, 30% on format, 30% on style, and you're good.

    But there's a subtle bug in how GRPO (Group Relative Policy Optimization) normalizes rewards that can completely wash out your smaller-scale signals: **advantage collapse**. GDPO (Group reward-Decoupled Normalization Policy Optimization) addresses this by normalizing each reward signal separately before combining them.

    Let's see this in action with a toy example.
    """)
    return
|
|
@app.cell(hide_code=True)
def _():
    mo.md("""
    ## The Book Ranking Problem

    Imagine you're ranking ML books on three dimensions:

    - **Enjoyment** (1-10): How fun is it to read?
    - **Information** (1-10): How much do you learn?
    - **Readability** (1-5): How easy is it to read? *(note the smaller scale)*

    Adjust the sliders and watch how GRPO and GDPO rank the books differently.
    """)
    return
|
|
@app.cell(hide_code=True)
def _():
    book1_enjoy = mo.ui.slider(1, 10, value=8, label="Enjoy")
    book1_info = mo.ui.slider(1, 10, value=3, label="Info")
    book1_read = mo.ui.slider(1, 5, value=5, label="Read")

    book2_enjoy = mo.ui.slider(1, 10, value=3, label="Enjoy")
    book2_info = mo.ui.slider(1, 10, value=9, label="Info")
    book2_read = mo.ui.slider(1, 5, value=1, label="Read")

    book3_enjoy = mo.ui.slider(1, 10, value=7, label="Enjoy")
    book3_info = mo.ui.slider(1, 10, value=8, label="Info")
    book3_read = mo.ui.slider(1, 5, value=4, label="Read")

    book4_enjoy = mo.ui.slider(1, 10, value=9, label="Enjoy")
    book4_info = mo.ui.slider(1, 10, value=2, label="Info")
    book4_read = mo.ui.slider(1, 5, value=5, label="Read")

    book5_enjoy = mo.ui.slider(1, 10, value=4, label="Enjoy")
    book5_info = mo.ui.slider(1, 10, value=10, label="Info")
    book5_read = mo.ui.slider(1, 5, value=2, label="Read")

    book1_card = mo.vstack([
        mo.md("**🤖 The Singularity is Nigh**<br><small>*pop-sci hype*</small>"),
        book1_enjoy, book1_info, book1_read,
    ], align="center")

    book2_card = mo.vstack([
        mo.md("**🧠 Attention Is All You Need: The Novel**<br><small>*dense math fiction*</small>"),
        book2_enjoy, book2_info, book2_read,
    ], align="center")

    book3_card = mo.vstack([
        mo.md("**📄 Rejected NeurIPS Papers**<br><small>*hidden gems*</small>"),
        book3_enjoy, book3_info, book3_read,
    ], align="center")

    book4_card = mo.vstack([
        mo.md("**🚀 10X Your Startup**<br><small>*one weird trick*</small>"),
        book4_enjoy, book4_info, book4_read,
    ], align="center")

    book5_card = mo.vstack([
        mo.md("**📚 Deep Learning (Goodfellow)**<br><small>*the classic*</small>"),
        book5_enjoy, book5_info, book5_read,
    ], align="center")

    mo.vstack([
        mo.hstack([book1_card, book2_card, book3_card], justify="space-around"),
        mo.hstack([book4_card, book5_card], justify="center"),
    ])
    return (
        book1_enjoy,
        book1_info,
        book1_read,
        book2_enjoy,
        book2_info,
        book2_read,
        book3_enjoy,
        book3_info,
        book3_read,
        book4_enjoy,
        book4_info,
        book4_read,
        book5_enjoy,
        book5_info,
        book5_read,
    )
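

@app.cell(hide_code=True)
def _(book_results):
    # Illustrative aside (not from the paper): surface the spread of each
    # reward dimension for the current slider settings. When one column's
    # standard deviation dwarfs another's, GRPO's post-sum normalization is
    # dominated by the wide column.
    _stds = book_results["rewards"].std(axis=0)
    mo.md(
        f"Per-dimension std of the current rewards: "
        f"Enjoyment `{_stds[0]:.2f}`, Information `{_stds[1]:.2f}`, "
        f"Readability `{_stds[2]:.2f}`"
    )
    return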
|
|
@app.cell(hide_code=True)
def _(book_results):
    import plotly.graph_objects as go

    book_names = [
        "🤖 The Singularity is Nigh",
        "🧠 Attention Is All You Need: The Novel",
        "📄 Rejected NeurIPS Papers",
        "🚀 10X Your Startup",
        "📚 Deep Learning (Goodfellow)",
    ]

    grpo_r = book_results["grpo_ranks"]
    gdpo_r = book_results["gdpo_ranks"]
    grpo_adv = book_results["grpo_adv"]
    gdpo_adv = book_results["gdpo_adv"]
    rank_diff = np.abs(grpo_r - gdpo_r).sum()

    # Sort worst-to-best by GDPO rank so the top-ranked book renders at the
    # top of the horizontal bar chart.
    sort_idx = np.argsort(gdpo_r)[::-1]
    sorted_names = [book_names[i] for i in sort_idx]
    sorted_grpo_adv = grpo_adv[sort_idx]
    sorted_gdpo_adv = gdpo_adv[sort_idx]
    sorted_grpo_r = grpo_r[sort_idx]
    sorted_gdpo_r = gdpo_r[sort_idx]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=sorted_names,
        x=sorted_grpo_adv,
        name="GRPO",
        orientation="h",
        marker=dict(color="#ff6b6b"),
        text=[f"Rank {r}" for r in sorted_grpo_r],
        textposition="auto",
    ))
    fig.add_trace(go.Bar(
        y=sorted_names,
        x=sorted_gdpo_adv,
        name="GDPO",
        orientation="h",
        marker=dict(color="#4ecdc4"),
        text=[f"Rank {r}" for r in sorted_gdpo_r],
        textposition="auto",
    ))
    fig.update_layout(
        title="Book Rankings: GRPO vs GDPO",
        xaxis_title="Advantage Score",
        yaxis_title="",
        barmode="group",
        height=300,
        showlegend=True,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )

    chart = mo.ui.plotly(fig)

    notes = []
    grpo_unique = len(np.unique(np.round(grpo_adv, 6)))
    gdpo_unique = len(np.unique(np.round(gdpo_adv, 6)))
    if grpo_unique < 5 and gdpo_unique == 5:
        notes.append(mo.md(
            f"**Advantage collapse detected!** GRPO has only {grpo_unique} unique "
            f"advantage values for 5 books, while GDPO has {gdpo_unique}. "
            "Different books are getting the same learning signal."
        ))

    if rank_diff > 0:
        notes.append(mo.md(
            "**Rankings differ!** Adjust the sliders and watch how the GRPO and "
            "GDPO advantages respond."
        ))
    else:
        notes.append(mo.md("""
        The rankings match here, but look at the *advantage scores*. Even when the
        ordinal ranking is the same, the magnitude of the advantages differs. How
        would you choose a learning rate or clip range when the scale of your
        advantages depends on which normalization you use?
        """))

    mo.vstack([chart] + notes)
    return
|
|
@app.cell(hide_code=True)
def _():
    mo.md(r"""
    ## So What's Actually Happening?

    The key insight: when one reward has much higher variance than another, the combined variance is dominated by the high-variance reward. After normalization, the low-variance signal contributes almost nothing.

    **GRPO** aggregates rewards first, then normalizes:

    $$r_j = \sum_i w_i \cdot r_j^{(i)}, \quad A_j^{\text{GRPO}} = \frac{r_j - \mu(r)}{\sigma(r)}$$

    **GDPO** normalizes each reward independently, then aggregates:

    $$\tilde{r}_j^{(i)} = \frac{r_j^{(i)} - \mu(r^{(i)})}{\sigma(r^{(i)})}, \quad A_j^{\text{GDPO}} = \sum_i w_i \cdot \tilde{r}_j^{(i)}$$

    The difference is subtle but critical. For independent rewards, $\sigma(r) \approx \sqrt{\sum_i w_i^2 \, \sigma(r^{(i)})^2}$, so when one $\sigma(r^{(i)})$ is much larger than the rest, GRPO's advantage is effectively just that dimension's z-score. If Enjoyment and Information both range 1-10 but Readability only ranges 1-5, the Readability signal gets washed out when combined with the larger-scale rewards.

    GDPO fixes this by normalizing each dimension to the same scale (mean=0, std=1) *before* combining them.
    """)
    return
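

@app.cell(hide_code=True)
def _(normalize):
    # A minimal made-up illustration of the ordering difference (equal weights
    # assumed; the values are hypothetical, not from the paper). Column 0 is a
    # wide-scale reward, column 1 a narrow-scale one. Row A wins on the wide
    # dimension, row C on the narrow one: GRPO ranks A above C because the wide
    # column dominates the summed reward, while GDPO flips that order.
    _toy = np.array([
        [9.0, 1.0],  # A
        [8.0, 4.0],  # B
        [2.0, 5.0],  # C
        [4.0, 3.0],  # D
    ])
    _grpo = normalize(_toy.sum(axis=1))
    _gdpo = normalize(_toy[:, 0]) + normalize(_toy[:, 1])
    mo.md(
        f"GRPO advantages (A-D): `{np.round(_grpo, 2)}`<br>"
        f"GDPO advantages (A-D): `{np.round(_gdpo, 2)}`"
    )
    return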
|
|
@app.cell(hide_code=True)
def _():
    mo.md("""
    ## This Gets Worse with Binary Rewards

    The [GDPO paper](https://arxiv.org/abs/2601.05242) demonstrates this on the [Berkeley Function Calling Leaderboard (BFCL)](https://gorilla.cs.berkeley.edu/leaderboard.html) dataset, where LLM outputs are scored on multiple binary criteria:

    - **Correctness**: Does the function call execute successfully?
    - **Style**: Are the arguments formatted correctly?
    - **Conciseness**: Is the call free of redundant parameters?

    The table below simulates 12 rollouts from such a system. Click the cells to toggle rewards. Notice how GRPO assigns **identical advantages** to rollouts with the same total (e.g., `[1,0,1]` and `[0,1,1]` both sum to 2), while GDPO differentiates them based on *which* rewards were achieved.
    """)
    return
|
|
@app.cell
def _():
    widget = GrpoGdpoWidget()
    widget_view = mo.ui.anywidget(widget)
    widget_view
    return (widget_view,)
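

@app.cell(hide_code=True)
def _(normalize):
    # A fixed, hypothetical mini-batch (not the widget's 12 rollouts) showing
    # the same effect in plain NumPy: three rollouts share the total 2, so GRPO
    # gives them identical advantages. GDPO penalizes the rollout that missed
    # the commonly-achieved correctness reward, so its advantage now differs.
    _batch = np.array([
        [1.0, 0.0, 1.0],  # correctness + conciseness
        [0.0, 1.0, 1.0],  # style + conciseness (same total, different mix)
        [1.0, 1.0, 0.0],  # correctness + style
        [1.0, 0.0, 0.0],  # correctness only
    ])
    _grpo = normalize(_batch.sum(axis=1))
    _gdpo = sum(normalize(_batch[:, _d]) for _d in range(_batch.shape[1]))
    mo.md(
        f"GRPO advantages: `{np.round(_grpo, 2)}`<br>"
        f"GDPO advantages: `{np.round(_gdpo, 2)}`"
    )
    return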
|
|
@app.cell(hide_code=True)
def _():
    mo.md("""
    ## Does This Actually Matter in Practice?

    Let's train a toy policy and see. We have 3 binary rewards and want to maximize all of them. The policy learns an independent probability for each dimension.
    """)
    return
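

@app.cell(hide_code=True)
def _():
    mo.md(r"""
    For reference, the training loop below uses a plain REINFORCE-style update on
    independent Bernoulli dimensions: with $p_i = \sigma(\ell_i)$, the score
    function is $\nabla_{\ell_i} \log p(x_i) = x_i - p_i$, so each logit moves by
    the advantage-weighted batch average of $(x_i - p_i)$.
    """)
    return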
|
|
@app.cell
def _():
    reuse_toggle = mo.ui.switch(label="Train on widget data (instead of fresh samples)", value=False)
    reuse_toggle
    return (reuse_toggle,)
|
|
@app.cell(hide_code=True)
def _(gdpo_history, grpo_history):
    import matplotlib.pyplot as plt

    _fig, _ax = plt.subplots(figsize=(10, 5))

    colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
    labels = ['correctness', 'style', 'conciseness']
    epochs = range(len(grpo_history))

    for _i, (_color, _label) in enumerate(zip(colors, labels)):
        _ax.plot(epochs, gdpo_history[:, _i], '-', color=_color, linewidth=2,
                 label=f'{_label} (GDPO)')
    for _i, (_color, _label) in enumerate(zip(colors, labels)):
        _ax.plot(epochs, grpo_history[:, _i], '--', color=_color, linewidth=2,
                 label=f'{_label} (GRPO)')

    _ax.set_xlabel('Epoch')
    _ax.set_ylabel('Probability')
    _ax.set_title('GRPO vs GDPO: Policy Convergence')
    _ax.set_ylim(0, 1.05)
    _ax.legend(loc='lower right', ncol=2)
    _ax.grid(True, alpha=0.3)

    # Stack the explanation above the figure; previously the mo.md value was
    # discarded and only the figure was output.
    _explanation = mo.md("""
    **What you're seeing**: GDPO learns to maximize each dimension independently (solid lines converge to ~1.0). GRPO collapses all dimensions together (dashed lines follow similar trajectories).

    This is advantage collapse in action. GRPO can't tell which specific rewards to optimize because they all get the same gradient signal.
    """)

    mo.vstack([_explanation, _fig])
    return
|
|
@app.cell(hide_code=True)
def _():
    mo.md(r"""
    ## Takeaways

    **When to use GDPO:**

    - Multiple reward signals at different scales
    - Binary/categorical rewards mixed with continuous ones
    - You care about all signals contributing proportionally to their weights

    **When GRPO is fine:**

    - Single reward signal
    - All rewards at similar scales
    - One dominant reward, with the others acting as regularizers

    **Implementation** (it's a one-line change):

    - TRL: `apply_gdpo: True`
    - VERL: `adv_estimator: 'gdpo'`

    **Learn more:**

    - Paper: [arXiv:2601.05242](https://arxiv.org/abs/2601.05242)
    - Code: [github.com/NVlabs/GDPO](https://github.com/NVlabs/GDPO)

    ---

    *Built with [marimo](https://marimo.io)*
    """)
    return
|
|
@app.cell
def _(
    book1_enjoy,
    book1_info,
    book1_read,
    book2_enjoy,
    book2_info,
    book2_read,
    book3_enjoy,
    book3_info,
    book3_read,
    book4_enjoy,
    book4_info,
    book4_read,
    book5_enjoy,
    book5_info,
    book5_read,
):
    def normalize(arr):
        """Z-score an array; return zeros if the group has no variance."""
        arr = np.array(arr, dtype=np.float64)
        std = arr.std()
        if std == 0:
            return np.zeros_like(arr)
        return (arr - arr.mean()) / std

    rewards = np.array([
        [book1_enjoy.value, book1_info.value, book1_read.value],
        [book2_enjoy.value, book2_info.value, book2_read.value],
        [book3_enjoy.value, book3_info.value, book3_read.value],
        [book4_enjoy.value, book4_info.value, book4_read.value],
        [book5_enjoy.value, book5_info.value, book5_read.value],
    ], dtype=np.float64)

    # GRPO: sum across dimensions first, then normalize the totals.
    combined = rewards.sum(axis=1)
    grpo_advantages = normalize(combined)

    # GDPO: normalize each dimension across the group, then sum.
    gdpo_advantages = np.zeros(5)
    for dim in range(3):
        gdpo_advantages += normalize(rewards[:, dim])

    # Double argsort converts advantages into 1-based ranks (1 = best).
    grpo_ranks = np.argsort(np.argsort(-grpo_advantages)) + 1
    gdpo_ranks = np.argsort(np.argsort(-gdpo_advantages)) + 1

    book_results = {
        "grpo_adv": grpo_advantages,
        "gdpo_adv": gdpo_advantages,
        "grpo_ranks": grpo_ranks,
        "gdpo_ranks": gdpo_ranks,
        "rewards": rewards,
    }
    return book_results, normalize
|
|
@app.cell
def _(normalize):
    def compute_grpo_advantages(rewards):
        # Aggregate first, then normalize: per-dimension structure is lost.
        totals = rewards.sum(axis=1)
        return normalize(totals)

    def compute_gdpo_advantages(rewards):
        # Normalize each dimension first, then aggregate.
        advantages = np.zeros(len(rewards))
        for dim in range(rewards.shape[1]):
            advantages += normalize(rewards[:, dim])
        return advantages

    def train_policy(method, epochs=100, lr=0.1, batch_size=32, seed=41, fixed_rewards=None):
        """Train 3 independent Bernoulli logits with a REINFORCE-style update."""
        rng = np.random.default_rng(seed)
        logits = np.zeros(3)
        history = []

        for _epoch in range(epochs):
            probs = 1 / (1 + np.exp(-logits))
            history.append(probs.copy())

            if fixed_rewards is not None:
                rewards = fixed_rewards
            else:
                # Sample binary "successes" from the current policy.
                rewards = (rng.random((batch_size, 3)) < probs).astype(np.float64)

            if method == 'grpo':
                advantages = compute_grpo_advantages(rewards)
            else:
                advantages = compute_gdpo_advantages(rewards)

            # Score-function gradient for a Bernoulli logit: x - sigmoid(logit).
            for i in range(3):
                grad = ((rewards[:, i] - probs[i]) * advantages).mean()
                logits[i] += lr * grad

        return np.array(history)
    return (train_policy,)
|
|
@app.cell
def _(reuse_toggle, train_policy, widget_view):
    def widget_rewards_to_array(rewards_list):
        return np.array([
            [r["correctness"], r["style"], r["conciseness"]]
            for r in rewards_list
        ], dtype=np.float64)

    if reuse_toggle.value:
        fixed = widget_rewards_to_array(widget_view.widget.rewards)
    else:
        fixed = None

    grpo_history = train_policy('grpo', epochs=150, lr=0.15, fixed_rewards=fixed)
    gdpo_history = train_policy('gdpo', epochs=150, lr=0.15, fixed_rewards=fixed)
    return gdpo_history, grpo_history
|
|
if __name__ == "__main__":
    app.run()