Commit da0b98b · Parent(s): 2b43e82 · update
examples/tutorials/dpo/ultrafeedback-dpo-unsloth/step_2_train_dpo_model_unsloth_ddp_qlora.py
CHANGED
@@ -22,6 +22,49 @@ nohup torchrun --nproc_per_node=4 step_2_train_dpo_model_unsloth_ddp_qlora.py \
 kill -9 `ps -aef | grep 'step_2_train_dpo_model_unsloth_ddp_qlora.py' | grep -v grep | awk '{print $2}'`


+
+--------------
+Notes on the training log.
+
+{'loss': '0.5736', 'grad_norm': '1.373', 'learning_rate': '1.765e-05', 'rewards/chosen': '0.3182', 'rewards/rejected': '-0.2243', 'rewards/accuracies': '0.6938', 'rewards/margins': '0.5425', 'logps/chosen': '-366.9', 'logps/rejected': '-316.8', 'logits/chosen': '-1.945', 'logits/rejected': '-1.889', 'epoch': '0.2434'}
+
+grad_norm: 1.373
+    L2 norm of all parameter gradients; measures the overall gradient magnitude, typically between 0.1 and 10.
+
+rewards/chosen: 0.3182
+    Reward assigned to the chosen response, computed as β * (log π(chosen|x) - log π_ref(chosen|x)).
+    Measures how much the policy has improved on the chosen response relative to the reference model.
+    Ideal value: > 0, the larger the better.
+
+rewards/rejected: -0.2243
+    Reward assigned to the rejected response, computed as β * (log π(rejected|x) - log π_ref(rejected|x)).
+    Measures how much the policy has pushed the rejected response down relative to the reference model.
+    Ideal value: < 0, the smaller the better.
+
+rewards/margins: 0.5425
+    Chosen reward minus rejected reward.
+    Ideal value: > 0.3 is good, > 0.5 is excellent.
+    The larger the margin, the better the model separates chosen from rejected responses.
+
+rewards/accuracies: 0.6938
+    Fraction of pairs for which the chosen reward exceeds the rejected reward.
+    Ideal value: 0.6-0.8 is healthy; > 0.85 may indicate overfitting.
+
+logps/chosen: -366.9
+    Log probability of the full chosen sequence, i.e. log P(chosen|x).
+    Sum of the log probabilities of all tokens; the closer to 0, the higher the probability.
+    Should be greater than logps/rejected (i.e. smaller in absolute value).
+
+logps/rejected: -316.8
+    Log probability of the full rejected sequence.
+    Should be less than logps/chosen (i.e. larger in absolute value).
+    logps/chosen > logps/rejected (smaller in absolute value) is the end goal; the log line above has not reached it yet.
 """
 import argparse
 import os
@@ -85,7 +128,7 @@ def get_args():

 parser.add_argument("--num_train_epochs", default=3, type=int),
 parser.add_argument("--learning_rate", default=2e-5, type=float),
-parser.add_argument("--dpo_beta", default=0.
+parser.add_argument("--dpo_beta", default=0.1, type=float),
 parser.add_argument("--lora_rank", default=32, type=int),

 parser.add_argument(
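To make the reward bookkeeping above concrete, here is a minimal PyTorch sketch of how the rewards/* fields follow from the per-sequence log-probs and --dpo_beta. It is not code from this repo; the reference-model log-probs are hypothetical values chosen so the results land near the logged numbers.

# Minimal sketch of the DPO reward fields described in the docstring above.
# Not code from this repo; ref_* values are hypothetical, picked so the
# outputs roughly reproduce the logged numbers.
import torch
import torch.nn.functional as F

beta = 0.1  # --dpo_beta

# Summed per-token log-probs of each full sequence (the logps/* fields),
# under the trained policy and under the frozen reference model.
policy_logps_chosen = torch.tensor([-366.9])
policy_logps_rejected = torch.tensor([-316.8])
ref_logps_chosen = torch.tensor([-370.1])    # hypothetical reference value
ref_logps_rejected = torch.tensor([-314.6])  # hypothetical reference value

rewards_chosen = beta * (policy_logps_chosen - ref_logps_chosen)        # rewards/chosen   -> 0.32
rewards_rejected = beta * (policy_logps_rejected - ref_logps_rejected)  # rewards/rejected -> -0.22
margins = rewards_chosen - rewards_rejected                             # rewards/margins  -> 0.54
accuracy = (rewards_chosen > rewards_rejected).float().mean()           # rewards/accuracies

# The DPO objective maximizes the margin: loss = -log sigmoid(margin).
loss = -F.logsigmoid(margins).mean()
print(rewards_chosen.item(), rewards_rejected.item(), margins.item(), loss.item())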
examples/tutorials/grpo/step_2_train_grpo_model_unsloth_ddp.py
CHANGED
@@ -9,6 +9,51 @@ python3 -m torch.distributed.run --nproc_per_node=4 step_2_train_grpo_model_unsl
 torchrun --nproc_per_node=4 step_2_train_grpo_model_unsloth_ddp.py


+
+--------------
+Notes on the training log.
+
+{'loss': 0.0394, 'grad_norm': 0.060413047671318054, 'learning_rate': 1.228568308397947e-09, 'num_tokens': 69862.0, 'completions/mean_length': 226.21875762939453, 'completions/min_length': 89.5, 'completions/max_length': 512.0, 'completions/clipped_ratio': 0.05729166744276881, 'completions/mean_terminated_length': 208.7477569580078, 'completions/min_terminated_length': 89.5, 'completions/max_terminated_length': 465.75, 'rewards/format_reward_func/mean': 0.4739583358168602, 'rewards/format_reward_func/std': 0.1097758337855339, 'rewards/answer_reward_func/mean': 0.08333333488553762, 'rewards/answer_reward_func/std': 0.26695219799876213, 'reward': 0.13072916865348816, 'reward_std': 0.1440115850418806, 'frac_reward_zero_std': 0.6041666865348816, 'kl': -5.28276373756853e-08, 'entropy': 0.6871595978736877, 'clip_ratio/low_mean': 0.0, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 0.0, 'epoch': 0.0}
+
+Generation length metrics
+completions/mean_length: 226.22 - mean completion length in tokens
+completions/min_length: 89.5 - minimum completion length
+completions/max_length: 512.0 - maximum completion length (hit the configured cap)
+completions/clipped_ratio: 0.057 - fraction of completions that were truncated; 5.7% were cut off
+
+Generation length
+The number of tokens the model actually generated.
+A completion ends either because an EOS token was produced or because the maximum length limit was reached.
+
+Terminated length metrics
+completions/mean_terminated_length: 208.75 - mean length of completions that terminated naturally
+completions/min_terminated_length: 89.5 - minimum terminated length
+completions/max_terminated_length: 465.75 - maximum terminated length
+
+Terminated length
+The length at which generation stopped because the model produced an EOS token.
+Only completions that ended normally (with an EOS token) are counted;
+truncated completions (those that hit max_length) are excluded.
+
+From completions/clipped_ratio: 0.057:
+share of truncated completions = 5.7%
+share of naturally terminated (EOS) completions = 94.3%
+
+Format reward
+rewards/format_reward_func/mean: 0.474 - mean format reward (out of 1.0),
+i.e. roughly 47.4% of completions satisfy the required format.
+
+Answer correctness reward
+rewards/answer_reward_func/mean: 0.083 - mean answer-correctness reward;
+only 8.3% of completions produced the correct answer.
+
+KL divergence
+kl: -5.28e-08 - KL divergence; a value near 0 means the new policy has not drifted noticeably from the old policy.
+
+Entropy
+entropy: 0.687 - entropy of the policy, a measure of generation diversity.
+
 """
 import argparse
 import os
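The aggregate 'reward' field in this log is consistent with a weighted sum of the per-function means under the reward_weights=[0.1, 1.0] set later in this file's GRPOConfig. A quick plain-Python check, with values copied from the log line above:

# Sanity check: the logged 'reward' matches the weighted sum of the
# per-reward-function means under reward_weights=[0.1, 1.0].
format_mean = 0.4739583358168602   # rewards/format_reward_func/mean
answer_mean = 0.08333333488553762  # rewards/answer_reward_func/mean
weights = (0.1, 1.0)               # reward_weights: (format, answer)

reward = weights[0] * format_mean + weights[1] * answer_mean
print(reward)  # 0.13072916846722364 ~= logged 'reward': 0.13072916865348816

# clipped_ratio partitions completions into truncated vs EOS-terminated:
clipped_ratio = 0.05729166744276881  # completions/clipped_ratio
print(1.0 - clipped_ratio)           # ~0.943: share that ended with an EOS token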
@@ -27,11 +72,8 @@ else:
 temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

 from datasets import load_dataset
-from transformers import (
-    AutoTokenizer, AutoModelForCausalLM,
-)
 from trl import GRPOConfig, GRPOTrainer
-from unsloth import FastLanguageModel
+from unsloth import FastLanguageModel


 def get_args():
@@ -299,10 +341,10 @@ def main():

     grpo_config = GRPOConfig(
         output_dir=args.output_model_dir,
-        num_train_epochs=
-        per_device_train_batch_size=
-        per_device_eval_batch_size=
-        gradient_accumulation_steps=
+        num_train_epochs=1,
+        per_device_train_batch_size=8,
+        per_device_eval_batch_size=8,
+        gradient_accumulation_steps=2,
         learning_rate=5e-6,  # GRPO typically uses a smaller learning rate
         warmup_ratio=0.1,
         logging_steps=10,
@@ -322,11 +364,11 @@
         # GRPO-specific parameters
         num_generations=4,  # number of responses generated per prompt
         max_completion_length=512,  # maximum generation length
-        steps_per_generation=
+        steps_per_generation=4,  # number of mini-batches each generation batch is split into for training
         temperature=0.7,

         beta=0.001,
-        num_iterations=1,
+        num_iterations=1,  # number of training passes over each mini-batch
         epsilon=0.2,
         reward_weights=[0.1, 1.0],
         loss_type="dapo",
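A back-of-the-envelope check of the batch geometry these values imply, assuming the 4-process torchrun launch shown at the top of the file. This mirrors how GRPO groups completions per prompt, but treat it as an illustrative sketch rather than a specification of TRL's internals:

# Rough batch-geometry arithmetic implied by this GRPOConfig, assuming the
# --nproc_per_node=4 launch from the file header. Illustrative only.
num_processes = 4                  # torchrun --nproc_per_node=4
per_device_train_batch_size = 8
gradient_accumulation_steps = 2
num_generations = 4                # completions sampled per prompt

# Completions contributing to one optimizer step, across all ranks:
completions_per_step = (num_processes * per_device_train_batch_size
                        * gradient_accumulation_steps)
print(completions_per_step)  # 64

# Each group of num_generations completions shares a single prompt,
# so one optimizer step covers:
print(completions_per_step // num_generations)  # 16 prompts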