| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.04, |
| "eval_steps": 500, |
| "global_step": 40, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 628.125, |
| "epoch": 0.001, |
| "grad_norm": 0.5495060086250305, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": 0.0, |
| "reward": 0.3375000059604645, |
| "reward_std": 0.13562028110027313, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.08750000596046448, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 1 |
| }, |
| { |
| "completion_length": 756.75, |
| "epoch": 0.002, |
| "grad_norm": 0.37339404225349426, |
| "kl": 0.0, |
| "learning_rate": 2.5e-06, |
| "loss": -0.0, |
| "reward": 0.26249998807907104, |
| "reward_std": 0.0353553406894207, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.012500000186264515, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 2 |
| }, |
| { |
| "completion_length": 893.875, |
| "epoch": 0.003, |
| "grad_norm": 0.5389103889465332, |
| "kl": 0.0010528296697884798, |
| "learning_rate": 5e-06, |
| "loss": 0.0, |
| "reward": 0.38749998807907104, |
| "reward_std": 0.2875387966632843, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.07500000298023224, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.3125, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 3 |
| }, |
| { |
| "completion_length": 654.5, |
| "epoch": 0.004, |
| "grad_norm": 0.5193591713905334, |
| "kl": 0.0012392314383760095, |
| "learning_rate": 4.962019382530521e-06, |
| "loss": 0.0, |
| "reward": 0.32499998807907104, |
| "reward_std": 0.11649646610021591, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.07500000298023224, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 4 |
| }, |
| { |
| "completion_length": 831.25, |
| "epoch": 0.005, |
| "grad_norm": 5.816634178161621, |
| "kl": 0.0014670327072963119, |
| "learning_rate": 4.849231551964771e-06, |
| "loss": 0.0001, |
| "reward": 0.41874998807907104, |
| "reward_std": 0.3058215081691742, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.20000000298023224, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.21875, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 5 |
| }, |
| { |
| "completion_length": 803.25, |
| "epoch": 0.006, |
| "grad_norm": 0.43738511204719543, |
| "kl": 0.0011906560976058245, |
| "learning_rate": 4.665063509461098e-06, |
| "loss": 0.0, |
| "reward": 0.5132201313972473, |
| "reward_std": 0.6117616891860962, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0793749988079071, |
| "rewards/confidence_calibration_reward": 0.0625, |
| "rewards/correctness_reward_func": 0.002595155732706189, |
| "rewards/legal_reference_quality_reward": 0.08750000596046448, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.28125, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 6 |
| }, |
| { |
| "completion_length": 600.875, |
| "epoch": 0.007, |
| "grad_norm": 0.43174880743026733, |
| "kl": 0.0011430989252403378, |
| "learning_rate": 4.415111107797445e-06, |
| "loss": 0.0, |
| "reward": 0.28125, |
| "reward_std": 0.15338443219661713, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.0625, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.21875, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 7 |
| }, |
| { |
| "completion_length": 748.125, |
| "epoch": 0.008, |
| "grad_norm": 23.641027450561523, |
| "kl": 0.0019280135165899992, |
| "learning_rate": 4.106969024216348e-06, |
| "loss": 0.0001, |
| "reward": 0.2750000059604645, |
| "reward_std": 0.04629100486636162, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.02500000037252903, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 8 |
| }, |
| { |
| "completion_length": 706.5, |
| "epoch": 0.009, |
| "grad_norm": 0.4881976246833801, |
| "kl": 0.001386392512358725, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 0.0001, |
| "reward": 0.9960337281227112, |
| "reward_std": 1.3480883836746216, |
| "rewards/answer_practicality_reward": 0.08749999850988388, |
| "rewards/conciseness_reward": 0.2498437464237213, |
| "rewards/confidence_calibration_reward": 0.125, |
| "rewards/correctness_reward_func": 0.046189986169338226, |
| "rewards/legal_reference_quality_reward": 0.11249999701976776, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.375, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 9 |
| }, |
| { |
| "completion_length": 673.5, |
| "epoch": 0.01, |
| "grad_norm": 0.5830779671669006, |
| "kl": 0.0012368967290967703, |
| "learning_rate": 3.3550503583141726e-06, |
| "loss": 0.0, |
| "reward": 0.5310937762260437, |
| "reward_std": 0.5911571383476257, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.04984375089406967, |
| "rewards/confidence_calibration_reward": 0.0625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.07500000298023224, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.34375, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 10 |
| }, |
| { |
| "completion_length": 635.5, |
| "epoch": 0.011, |
| "grad_norm": 0.44329559803009033, |
| "kl": 0.0015630712732672691, |
| "learning_rate": 2.9341204441673267e-06, |
| "loss": 0.0001, |
| "reward": 0.39375001192092896, |
| "reward_std": 0.3178471028804779, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.08749999850988388, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.28125, |
| "rewards/structure_clarity_reward": 0.02500000037252903, |
| "step": 11 |
| }, |
| { |
| "completion_length": 772.5, |
| "epoch": 0.012, |
| "grad_norm": 0.0020675428677350283, |
| "kl": 0.001098034786991775, |
| "learning_rate": 2.5e-06, |
| "loss": 0.0, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.0, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 12 |
| }, |
| { |
| "completion_length": 766.5, |
| "epoch": 0.013, |
| "grad_norm": 0.002076877048239112, |
| "kl": 0.0010669063776731491, |
| "learning_rate": 2.0658795558326745e-06, |
| "loss": 0.0, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.0, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 13 |
| }, |
| { |
| "completion_length": 743.375, |
| "epoch": 0.014, |
| "grad_norm": 0.49824437499046326, |
| "kl": 0.0010957567719742656, |
| "learning_rate": 1.6449496416858285e-06, |
| "loss": 0.0, |
| "reward": 0.3050000071525574, |
| "reward_std": 0.10515295714139938, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.05000000074505806, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.004999999888241291, |
| "step": 14 |
| }, |
| { |
| "completion_length": 647.625, |
| "epoch": 0.015, |
| "grad_norm": 0.562134325504303, |
| "kl": 0.0010952961165457964, |
| "learning_rate": 1.2500000000000007e-06, |
| "loss": 0.0, |
| "reward": 0.26875001192092896, |
| "reward_std": 0.15103808045387268, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.05000000074505806, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.21875, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 15 |
| }, |
| { |
| "completion_length": 858.0, |
| "epoch": 0.016, |
| "grad_norm": 0.5578316450119019, |
| "kl": 0.0011399323120713234, |
| "learning_rate": 8.930309757836517e-07, |
| "loss": 0.0, |
| "reward": 0.23749999701976776, |
| "reward_std": 0.16201850771903992, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.05000000074505806, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.1875, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 16 |
| }, |
| { |
| "completion_length": 780.5, |
| "epoch": 0.017, |
| "grad_norm": 0.3805747628211975, |
| "kl": 0.001326034776866436, |
| "learning_rate": 5.848888922025553e-07, |
| "loss": 0.0001, |
| "reward": 0.3125, |
| "reward_std": 0.1060660183429718, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.0625, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 17 |
| }, |
| { |
| "completion_length": 720.5, |
| "epoch": 0.018, |
| "grad_norm": 0.5874195694923401, |
| "kl": 0.0013367197243496776, |
| "learning_rate": 3.3493649053890325e-07, |
| "loss": 0.0001, |
| "reward": 0.32499998807907104, |
| "reward_std": 0.1752549111843109, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.012500000186264515, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.3125, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 18 |
| }, |
| { |
| "completion_length": 735.0, |
| "epoch": 0.019, |
| "grad_norm": 0.48865845799446106, |
| "kl": 0.0013912161812186241, |
| "learning_rate": 1.507684480352292e-07, |
| "loss": 0.0001, |
| "reward": 0.30000001192092896, |
| "reward_std": 0.10690449923276901, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.05000000074505806, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 19 |
| }, |
| { |
| "completion_length": 734.125, |
| "epoch": 0.02, |
| "grad_norm": 0.0020913290791213512, |
| "kl": 0.0011432117316871881, |
| "learning_rate": 3.798061746947995e-08, |
| "loss": 0.0, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.0, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 20 |
| }, |
| { |
| "completion_length": 899.25, |
| "epoch": 0.021, |
| "grad_norm": 4.05595064163208, |
| "kl": 0.003427362535148859, |
| "learning_rate": 0.0, |
| "loss": 0.0001, |
| "reward": 0.41874998807907104, |
| "reward_std": 0.3150255084037781, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.07500000298023224, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.34375, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 21 |
| }, |
| { |
| "completion_length": 776.5, |
| "epoch": 0.022, |
| "grad_norm": 0.5486103296279907, |
| "kl": 0.0010707724140956998, |
| "learning_rate": 2.717889356869146e-06, |
| "loss": 0.0, |
| "reward": 0.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.0, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.21875, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 22 |
| }, |
| { |
| "completion_length": 679.75, |
| "epoch": 0.023, |
| "grad_norm": 0.4813958704471588, |
| "kl": 0.001209542155265808, |
| "learning_rate": 2.5e-06, |
| "loss": 0.0, |
| "reward": 0.3125, |
| "reward_std": 0.1060660183429718, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.0625, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 23 |
| }, |
| { |
| "completion_length": 768.125, |
| "epoch": 0.024, |
| "grad_norm": 0.41983699798583984, |
| "kl": 0.0012476833071559668, |
| "learning_rate": 2.2821106431308546e-06, |
| "loss": 0.0, |
| "reward": 0.2874999940395355, |
| "reward_std": 0.07440236955881119, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.03750000149011612, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 24 |
| }, |
| { |
| "completion_length": 759.0, |
| "epoch": 0.025, |
| "grad_norm": 0.6893355846405029, |
| "kl": 0.0012454978423193097, |
| "learning_rate": 2.0658795558326745e-06, |
| "loss": 0.0, |
| "reward": 0.4416411817073822, |
| "reward_std": 0.3946375846862793, |
| "rewards/answer_practicality_reward": 0.03125, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.010391198098659515, |
| "rewards/legal_reference_quality_reward": 0.15000000596046448, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 25 |
| }, |
| { |
| "completion_length": 600.25, |
| "epoch": 0.026, |
| "grad_norm": 0.4706794023513794, |
| "kl": 0.0012254236498847604, |
| "learning_rate": 1.852952387243698e-06, |
| "loss": 0.0, |
| "reward": 0.32249999046325684, |
| "reward_std": 0.10081808269023895, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.0625, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.009999999776482582, |
| "step": 26 |
| }, |
| { |
| "completion_length": 711.25, |
| "epoch": 0.027, |
| "grad_norm": 3.535555601119995, |
| "kl": 0.007206466048955917, |
| "learning_rate": 1.6449496416858285e-06, |
| "loss": 0.0003, |
| "reward": 0.29374998807907104, |
| "reward_std": 0.1801537126302719, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.07500000298023224, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.21875, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 27 |
| }, |
| { |
| "completion_length": 762.125, |
| "epoch": 0.028, |
| "grad_norm": 0.3828037679195404, |
| "kl": 0.0010444466024637222, |
| "learning_rate": 1.443454345648252e-06, |
| "loss": 0.0, |
| "reward": 0.36250001192092896, |
| "reward_std": 0.18850918114185333, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.11249999701976776, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 28 |
| }, |
| { |
| "completion_length": 799.125, |
| "epoch": 0.029, |
| "grad_norm": 0.531396210193634, |
| "kl": 0.0014520642580464482, |
| "learning_rate": 1.2500000000000007e-06, |
| "loss": 0.0001, |
| "reward": 0.3812499940395355, |
| "reward_std": 0.3401023745536804, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.0625, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.28125, |
| "rewards/structure_clarity_reward": 0.03750000149011612, |
| "step": 29 |
| }, |
| { |
| "completion_length": 665.875, |
| "epoch": 0.03, |
| "grad_norm": 0.5673316717147827, |
| "kl": 0.001287385355681181, |
| "learning_rate": 1.0660589091223854e-06, |
| "loss": 0.0001, |
| "reward": 0.375, |
| "reward_std": 0.2314550280570984, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.0, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.375, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 30 |
| }, |
| { |
| "completion_length": 584.375, |
| "epoch": 0.031, |
| "grad_norm": 0.5284092426300049, |
| "kl": 0.0015492349630221725, |
| "learning_rate": 8.930309757836517e-07, |
| "loss": 0.0001, |
| "reward": 0.4125000238418579, |
| "reward_std": 0.219983771443367, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.16249999403953552, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 31 |
| }, |
| { |
| "completion_length": 707.875, |
| "epoch": 0.032, |
| "grad_norm": 0.4300897419452667, |
| "kl": 0.0012347075389698148, |
| "learning_rate": 7.322330470336314e-07, |
| "loss": 0.0, |
| "reward": 0.30000001192092896, |
| "reward_std": 0.07559289038181305, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.05000000074505806, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 32 |
| }, |
| { |
| "completion_length": 555.25, |
| "epoch": 0.033, |
| "grad_norm": 0.4357583522796631, |
| "kl": 0.0011535810772329569, |
| "learning_rate": 5.848888922025553e-07, |
| "loss": 0.0, |
| "reward": 0.35999998450279236, |
| "reward_std": 0.14422208070755005, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.07500000298023224, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.03500000014901161, |
| "step": 33 |
| }, |
| { |
| "completion_length": 716.375, |
| "epoch": 0.034, |
| "grad_norm": 0.48784926533699036, |
| "kl": 0.001275419956073165, |
| "learning_rate": 4.5211988927752026e-07, |
| "loss": 0.0001, |
| "reward": 0.4749999940395355, |
| "reward_std": 0.22519832849502563, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.16250000894069672, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.3125, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 34 |
| }, |
| { |
| "completion_length": 820.75, |
| "epoch": 0.035, |
| "grad_norm": 0.3773810863494873, |
| "kl": 0.0011918045347556472, |
| "learning_rate": 3.3493649053890325e-07, |
| "loss": 0.0, |
| "reward": 0.2562499940395355, |
| "reward_std": 0.01767767034471035, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.0, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0062500000931322575, |
| "step": 35 |
| }, |
| { |
| "completion_length": 624.875, |
| "epoch": 0.036, |
| "grad_norm": 0.4946437180042267, |
| "kl": 0.0011849572183564305, |
| "learning_rate": 2.3423053240837518e-07, |
| "loss": 0.0, |
| "reward": 0.30000001192092896, |
| "reward_std": 0.10690449923276901, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.05000000074505806, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 36 |
| }, |
| { |
| "completion_length": 746.375, |
| "epoch": 0.037, |
| "grad_norm": 0.461710125207901, |
| "kl": 0.001234731636941433, |
| "learning_rate": 1.507684480352292e-07, |
| "loss": 0.0, |
| "reward": 0.36250001192092896, |
| "reward_std": 0.22320714592933655, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.11250000447034836, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 37 |
| }, |
| { |
| "completion_length": 771.0, |
| "epoch": 0.038, |
| "grad_norm": 8.74399471282959, |
| "kl": 0.0049226246774196625, |
| "learning_rate": 8.518543427732951e-08, |
| "loss": 0.0002, |
| "reward": 0.36250001192092896, |
| "reward_std": 0.18850919604301453, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.05000000074505806, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.3125, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 38 |
| }, |
| { |
| "completion_length": 700.875, |
| "epoch": 0.039, |
| "grad_norm": 0.5485146641731262, |
| "kl": 0.00130558293312788, |
| "learning_rate": 3.798061746947995e-08, |
| "loss": 0.0001, |
| "reward": 0.2874999940395355, |
| "reward_std": 0.1060660183429718, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.03750000149011612, |
| "rewards/reasoning_depth_reward": 0.0, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0, |
| "step": 39 |
| }, |
| { |
| "completion_length": 587.5, |
| "epoch": 0.04, |
| "grad_norm": 0.42346256971359253, |
| "kl": 0.0010689998744055629, |
| "learning_rate": 9.513254770636138e-09, |
| "loss": 0.0, |
| "reward": 0.46875, |
| "reward_std": 0.2724721431732178, |
| "rewards/answer_practicality_reward": 0.0, |
| "rewards/conciseness_reward": 0.0, |
| "rewards/confidence_calibration_reward": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/legal_reference_quality_reward": 0.125, |
| "rewards/reasoning_depth_reward": 0.03125, |
| "rewards/strict_format_and_completeness_reward": 0.25, |
| "rewards/structure_clarity_reward": 0.0625, |
| "step": 40 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 40, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 10, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|