{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 126, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023827252419955324, "grad_norm": 27.642908096313477, "learning_rate": 0.0, "loss": 3.3263, "step": 1 }, { "epoch": 0.04765450483991065, "grad_norm": 29.03249740600586, "learning_rate": 2.8571428571428573e-06, "loss": 3.393, "step": 2 }, { "epoch": 0.0953090096798213, "grad_norm": 4.636140823364258, "learning_rate": 8.571428571428571e-06, "loss": 2.6817, "step": 4 }, { "epoch": 0.14296351451973194, "grad_norm": 4.222386360168457, "learning_rate": 1.4285714285714287e-05, "loss": 2.5385, "step": 6 }, { "epoch": 0.1906180193596426, "grad_norm": 3.460394859313965, "learning_rate": 2e-05, "loss": 2.352, "step": 8 }, { "epoch": 0.23827252419955325, "grad_norm": 3.7368428707122803, "learning_rate": 1.998606410321534e-05, "loss": 2.1794, "step": 10 }, { "epoch": 0.2859270290394639, "grad_norm": 2.6725502014160156, "learning_rate": 1.9944295254705187e-05, "loss": 2.043, "step": 12 }, { "epoch": 0.33358153387937456, "grad_norm": 2.636101484298706, "learning_rate": 1.9874809871741877e-05, "loss": 1.8958, "step": 14 }, { "epoch": 0.3812360387192852, "grad_norm": 2.869380474090576, "learning_rate": 1.977780162255041e-05, "loss": 1.8137, "step": 16 }, { "epoch": 0.4288905435591958, "grad_norm": 2.8045756816864014, "learning_rate": 1.9653540886520387e-05, "loss": 1.7214, "step": 18 }, { "epoch": 0.4765450483991065, "grad_norm": 1.602889060974121, "learning_rate": 1.9502374000610152e-05, "loss": 1.6312, "step": 20 }, { "epoch": 0.5241995532390171, "grad_norm": 2.921908378601074, "learning_rate": 1.932472229404356e-05, "loss": 1.5755, "step": 22 }, { "epoch": 0.5718540580789278, "grad_norm": 3.2532429695129395, "learning_rate": 1.912108091398988e-05, "loss": 1.5273, "step": 24 }, { "epoch": 0.6195085629188384, "grad_norm": 2.755262613296509, "learning_rate": 1.8892017445499812e-05, "loss": 1.4395, "step": 26 }, { "epoch": 0.6671630677587491, "grad_norm": 2.0402467250823975, "learning_rate": 1.8638170329544164e-05, "loss": 1.4345, "step": 28 }, { "epoch": 0.7148175725986597, "grad_norm": 2.27799129486084, "learning_rate": 1.8360247083564343e-05, "loss": 1.3749, "step": 30 }, { "epoch": 0.7624720774385704, "grad_norm": 1.5731786489486694, "learning_rate": 1.805902232949435e-05, "loss": 1.28, "step": 32 }, { "epoch": 0.810126582278481, "grad_norm": 2.364778757095337, "learning_rate": 1.773533563475053e-05, "loss": 1.2666, "step": 34 }, { "epoch": 0.8577810871183916, "grad_norm": 1.5619275569915771, "learning_rate": 1.7390089172206594e-05, "loss": 1.2227, "step": 36 }, { "epoch": 0.9054355919583023, "grad_norm": 1.949645757675171, "learning_rate": 1.7024245205675986e-05, "loss": 1.1678, "step": 38 }, { "epoch": 0.953090096798213, "grad_norm": 1.5181773900985718, "learning_rate": 1.6638823407910085e-05, "loss": 1.1625, "step": 40 }, { "epoch": 1.0, "grad_norm": 1.2812515497207642, "learning_rate": 1.6234898018587336e-05, "loss": 1.1757, "step": 42 }, { "epoch": 1.0476545048399106, "grad_norm": 1.1844559907913208, "learning_rate": 1.58135948502146e-05, "loss": 1.1938, "step": 44 }, { "epoch": 1.0953090096798213, "grad_norm": 0.7573416233062744, "learning_rate": 1.5376088150285777e-05, "loss": 1.157, "step": 46 }, { "epoch": 1.1429635145197319, "grad_norm": 0.7566890120506287, "learning_rate": 1.4923597328443423e-05, "loss": 1.1033, "step": 48 }, { "epoch": 1.1906180193596425, "grad_norm": 0.7004701495170593, "learning_rate": 1.4457383557765385e-05, "loss": 1.1247, "step": 50 }, { "epoch": 1.2382725241995534, "grad_norm": 0.9414187669754028, "learning_rate": 1.397874625964921e-05, "loss": 1.0881, "step": 52 }, { "epoch": 1.2859270290394638, "grad_norm": 0.6709342002868652, "learning_rate": 1.348901948209167e-05, "loss": 1.0797, "step": 54 }, { "epoch": 1.3335815338793746, "grad_norm": 0.8439044952392578, "learning_rate": 1.2989568181457704e-05, "loss": 1.0723, "step": 56 }, { "epoch": 1.3812360387192852, "grad_norm": 0.4865539073944092, "learning_rate": 1.248178441810224e-05, "loss": 1.0495, "step": 58 }, { "epoch": 1.4288905435591959, "grad_norm": 0.5796261429786682, "learning_rate": 1.1967083476448282e-05, "loss": 1.0648, "step": 60 }, { "epoch": 1.4765450483991065, "grad_norm": 0.6459540128707886, "learning_rate": 1.1446899920335407e-05, "loss": 1.0539, "step": 62 }, { "epoch": 1.5241995532390171, "grad_norm": 0.41356927156448364, "learning_rate": 1.092268359463302e-05, "loss": 1.0292, "step": 64 }, { "epoch": 1.5718540580789278, "grad_norm": 0.45745816826820374, "learning_rate": 1.0395895584262696e-05, "loss": 1.002, "step": 66 }, { "epoch": 1.6195085629188384, "grad_norm": 0.4945538640022278, "learning_rate": 9.868004141892412e-06, "loss": 1.0291, "step": 68 }, { "epoch": 1.6671630677587492, "grad_norm": 0.4044688045978546, "learning_rate": 9.340480595653047e-06, "loss": 1.0213, "step": 70 }, { "epoch": 1.7148175725986596, "grad_norm": 0.40503114461898804, "learning_rate": 8.814795248282974e-06, "loss": 1.0027, "step": 72 }, { "epoch": 1.7624720774385705, "grad_norm": 0.409345805644989, "learning_rate": 8.292413279130625e-06, "loss": 1.0292, "step": 74 }, { "epoch": 1.810126582278481, "grad_norm": 0.3667221963405609, "learning_rate": 7.774790660436857e-06, "loss": 0.9908, "step": 76 }, { "epoch": 1.8577810871183917, "grad_norm": 0.37784790992736816, "learning_rate": 7.263370099279173e-06, "loss": 1.0006, "step": 78 }, { "epoch": 1.9054355919583021, "grad_norm": 0.33597901463508606, "learning_rate": 6.759577016488343e-06, "loss": 0.9708, "step": 80 }, { "epoch": 1.953090096798213, "grad_norm": 0.37733909487724304, "learning_rate": 6.264815573744884e-06, "loss": 0.9695, "step": 82 }, { "epoch": 2.0, "grad_norm": 0.32598474621772766, "learning_rate": 5.780464759928623e-06, "loss": 1.0016, "step": 84 }, { "epoch": 2.047654504839911, "grad_norm": 0.3468402922153473, "learning_rate": 5.307874547629339e-06, "loss": 0.9625, "step": 86 }, { "epoch": 2.0953090096798213, "grad_norm": 0.3330307602882385, "learning_rate": 4.848362130531039e-06, "loss": 0.9447, "step": 88 }, { "epoch": 2.142963514519732, "grad_norm": 0.33839651942253113, "learning_rate": 4.403208252156921e-06, "loss": 0.9482, "step": 90 }, { "epoch": 2.1906180193596425, "grad_norm": 0.3227793276309967, "learning_rate": 3.973653636207437e-06, "loss": 0.979, "step": 92 }, { "epoch": 2.2382725241995534, "grad_norm": 0.33203473687171936, "learning_rate": 3.560895528440844e-06, "loss": 0.9626, "step": 94 }, { "epoch": 2.2859270290394638, "grad_norm": 0.29434484243392944, "learning_rate": 3.1660843597345137e-06, "loss": 0.9814, "step": 96 }, { "epoch": 2.3335815338793746, "grad_norm": 0.2951570451259613, "learning_rate": 2.7903205396277546e-06, "loss": 0.9368, "step": 98 }, { "epoch": 2.381236038719285, "grad_norm": 0.3088631331920624, "learning_rate": 2.4346513892830427e-06, "loss": 0.952, "step": 100 }, { "epoch": 2.428890543559196, "grad_norm": 0.3212229609489441, "learning_rate": 2.100068222414121e-06, "loss": 0.924, "step": 102 }, { "epoch": 2.4765450483991067, "grad_norm": 0.28245487809181213, "learning_rate": 1.7875035823168641e-06, "loss": 0.9435, "step": 104 }, { "epoch": 2.524199553239017, "grad_norm": 0.2676449716091156, "learning_rate": 1.4978286427038602e-06, "loss": 0.92, "step": 106 }, { "epoch": 2.5718540580789275, "grad_norm": 0.26664310693740845, "learning_rate": 1.2318507795870138e-06, "loss": 0.9385, "step": 108 }, { "epoch": 2.6195085629188384, "grad_norm": 0.263298362493515, "learning_rate": 9.903113209758098e-07, "loss": 0.931, "step": 110 }, { "epoch": 2.6671630677587492, "grad_norm": 0.27026382088661194, "learning_rate": 7.738834806631712e-07, "loss": 0.947, "step": 112 }, { "epoch": 2.7148175725986596, "grad_norm": 0.2611420452594757, "learning_rate": 5.831704818578842e-07, "loss": 0.9346, "step": 114 }, { "epoch": 2.7624720774385705, "grad_norm": 0.25799694657325745, "learning_rate": 4.187038758933204e-07, "loss": 0.9363, "step": 116 }, { "epoch": 2.810126582278481, "grad_norm": 0.2518501877784729, "learning_rate": 2.809420606985236e-07, "loss": 0.9359, "step": 118 }, { "epoch": 2.8577810871183917, "grad_norm": 0.2569182813167572, "learning_rate": 1.7026900316098217e-07, "loss": 0.9399, "step": 120 }, { "epoch": 2.905435591958302, "grad_norm": 0.24561701714992523, "learning_rate": 8.699316894203225e-08, "loss": 0.9467, "step": 122 }, { "epoch": 2.953090096798213, "grad_norm": 0.2518835961818695, "learning_rate": 3.134666272774034e-08, "loss": 0.9405, "step": 124 }, { "epoch": 3.0, "grad_norm": 0.25132957100868225, "learning_rate": 3.4845813115114147e-09, "loss": 0.9466, "step": 126 } ], "logging_steps": 2, "max_steps": 126, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 300.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.312585296119595e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }