{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 126,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.023827252419955324,
      "grad_norm": 27.642908096313477,
      "learning_rate": 0.0,
      "loss": 3.3263,
      "step": 1
    },
    {
      "epoch": 0.04765450483991065,
      "grad_norm": 29.03249740600586,
      "learning_rate": 2.8571428571428573e-06,
      "loss": 3.393,
      "step": 2
    },
    {
      "epoch": 0.0953090096798213,
      "grad_norm": 4.636140823364258,
      "learning_rate": 8.571428571428571e-06,
      "loss": 2.6817,
      "step": 4
    },
    {
      "epoch": 0.14296351451973194,
      "grad_norm": 4.222386360168457,
      "learning_rate": 1.4285714285714287e-05,
      "loss": 2.5385,
      "step": 6
    },
    {
      "epoch": 0.1906180193596426,
      "grad_norm": 3.460394859313965,
      "learning_rate": 2e-05,
      "loss": 2.352,
      "step": 8
    },
    {
      "epoch": 0.23827252419955325,
      "grad_norm": 3.7368428707122803,
      "learning_rate": 1.998606410321534e-05,
      "loss": 2.1794,
      "step": 10
    },
    {
      "epoch": 0.2859270290394639,
      "grad_norm": 2.6725502014160156,
      "learning_rate": 1.9944295254705187e-05,
      "loss": 2.043,
      "step": 12
    },
    {
      "epoch": 0.33358153387937456,
      "grad_norm": 2.636101484298706,
      "learning_rate": 1.9874809871741877e-05,
      "loss": 1.8958,
      "step": 14
    },
    {
      "epoch": 0.3812360387192852,
      "grad_norm": 2.869380474090576,
      "learning_rate": 1.977780162255041e-05,
      "loss": 1.8137,
      "step": 16
    },
    {
      "epoch": 0.4288905435591958,
      "grad_norm": 2.8045756816864014,
      "learning_rate": 1.9653540886520387e-05,
      "loss": 1.7214,
      "step": 18
    },
    {
      "epoch": 0.4765450483991065,
      "grad_norm": 1.602889060974121,
      "learning_rate": 1.9502374000610152e-05,
      "loss": 1.6312,
      "step": 20
    },
    {
      "epoch": 0.5241995532390171,
      "grad_norm": 2.921908378601074,
      "learning_rate": 1.932472229404356e-05,
      "loss": 1.5755,
      "step": 22
    },
    {
      "epoch": 0.5718540580789278,
      "grad_norm": 3.2532429695129395,
      "learning_rate": 1.912108091398988e-05,
      "loss": 1.5273,
      "step": 24
    },
    {
      "epoch": 0.6195085629188384,
      "grad_norm": 2.755262613296509,
      "learning_rate": 1.8892017445499812e-05,
      "loss": 1.4395,
      "step": 26
    },
    {
      "epoch": 0.6671630677587491,
      "grad_norm": 2.0402467250823975,
      "learning_rate": 1.8638170329544164e-05,
      "loss": 1.4345,
      "step": 28
    },
    {
      "epoch": 0.7148175725986597,
      "grad_norm": 2.27799129486084,
      "learning_rate": 1.8360247083564343e-05,
      "loss": 1.3749,
      "step": 30
    },
    {
      "epoch": 0.7624720774385704,
      "grad_norm": 1.5731786489486694,
      "learning_rate": 1.805902232949435e-05,
      "loss": 1.28,
      "step": 32
    },
    {
      "epoch": 0.810126582278481,
      "grad_norm": 2.364778757095337,
      "learning_rate": 1.773533563475053e-05,
      "loss": 1.2666,
      "step": 34
    },
    {
      "epoch": 0.8577810871183916,
      "grad_norm": 1.5619275569915771,
      "learning_rate": 1.7390089172206594e-05,
      "loss": 1.2227,
      "step": 36
    },
    {
      "epoch": 0.9054355919583023,
      "grad_norm": 1.949645757675171,
      "learning_rate": 1.7024245205675986e-05,
      "loss": 1.1678,
      "step": 38
    },
    {
      "epoch": 0.953090096798213,
      "grad_norm": 1.5181773900985718,
      "learning_rate": 1.6638823407910085e-05,
      "loss": 1.1625,
      "step": 40
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.2812515497207642,
      "learning_rate": 1.6234898018587336e-05,
      "loss": 1.1757,
      "step": 42
    },
    {
      "epoch": 1.0476545048399106,
      "grad_norm": 1.1844559907913208,
      "learning_rate": 1.58135948502146e-05,
      "loss": 1.1938,
      "step": 44
    },
    {
      "epoch": 1.0953090096798213,
      "grad_norm": 0.7573416233062744,
      "learning_rate": 1.5376088150285777e-05,
      "loss": 1.157,
      "step": 46
    },
    {
      "epoch": 1.1429635145197319,
      "grad_norm": 0.7566890120506287,
      "learning_rate": 1.4923597328443423e-05,
      "loss": 1.1033,
      "step": 48
    },
    {
      "epoch": 1.1906180193596425,
      "grad_norm": 0.7004701495170593,
      "learning_rate": 1.4457383557765385e-05,
      "loss": 1.1247,
      "step": 50
    },
    {
      "epoch": 1.2382725241995534,
      "grad_norm": 0.9414187669754028,
      "learning_rate": 1.397874625964921e-05,
      "loss": 1.0881,
      "step": 52
    },
    {
      "epoch": 1.2859270290394638,
      "grad_norm": 0.6709342002868652,
      "learning_rate": 1.348901948209167e-05,
      "loss": 1.0797,
      "step": 54
    },
    {
      "epoch": 1.3335815338793746,
      "grad_norm": 0.8439044952392578,
      "learning_rate": 1.2989568181457704e-05,
      "loss": 1.0723,
      "step": 56
    },
    {
      "epoch": 1.3812360387192852,
      "grad_norm": 0.4865539073944092,
      "learning_rate": 1.248178441810224e-05,
      "loss": 1.0495,
      "step": 58
    },
    {
      "epoch": 1.4288905435591959,
      "grad_norm": 0.5796261429786682,
      "learning_rate": 1.1967083476448282e-05,
      "loss": 1.0648,
      "step": 60
    },
    {
      "epoch": 1.4765450483991065,
      "grad_norm": 0.6459540128707886,
      "learning_rate": 1.1446899920335407e-05,
      "loss": 1.0539,
      "step": 62
    },
    {
      "epoch": 1.5241995532390171,
      "grad_norm": 0.41356927156448364,
      "learning_rate": 1.092268359463302e-05,
      "loss": 1.0292,
      "step": 64
    },
    {
      "epoch": 1.5718540580789278,
      "grad_norm": 0.45745816826820374,
      "learning_rate": 1.0395895584262696e-05,
      "loss": 1.002,
      "step": 66
    },
    {
      "epoch": 1.6195085629188384,
      "grad_norm": 0.4945538640022278,
      "learning_rate": 9.868004141892412e-06,
      "loss": 1.0291,
      "step": 68
    },
    {
      "epoch": 1.6671630677587492,
      "grad_norm": 0.4044688045978546,
      "learning_rate": 9.340480595653047e-06,
      "loss": 1.0213,
      "step": 70
    },
    {
      "epoch": 1.7148175725986596,
      "grad_norm": 0.40503114461898804,
      "learning_rate": 8.814795248282974e-06,
      "loss": 1.0027,
      "step": 72
    },
    {
      "epoch": 1.7624720774385705,
      "grad_norm": 0.409345805644989,
      "learning_rate": 8.292413279130625e-06,
      "loss": 1.0292,
      "step": 74
    },
    {
      "epoch": 1.810126582278481,
      "grad_norm": 0.3667221963405609,
      "learning_rate": 7.774790660436857e-06,
      "loss": 0.9908,
      "step": 76
    },
    {
      "epoch": 1.8577810871183917,
      "grad_norm": 0.37784790992736816,
      "learning_rate": 7.263370099279173e-06,
      "loss": 1.0006,
      "step": 78
    },
    {
      "epoch": 1.9054355919583021,
      "grad_norm": 0.33597901463508606,
      "learning_rate": 6.759577016488343e-06,
      "loss": 0.9708,
      "step": 80
    },
    {
      "epoch": 1.953090096798213,
      "grad_norm": 0.37733909487724304,
      "learning_rate": 6.264815573744884e-06,
      "loss": 0.9695,
      "step": 82
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.32598474621772766,
      "learning_rate": 5.780464759928623e-06,
      "loss": 1.0016,
      "step": 84
    },
    {
      "epoch": 2.047654504839911,
      "grad_norm": 0.3468402922153473,
      "learning_rate": 5.307874547629339e-06,
      "loss": 0.9625,
      "step": 86
    },
    {
      "epoch": 2.0953090096798213,
      "grad_norm": 0.3330307602882385,
      "learning_rate": 4.848362130531039e-06,
      "loss": 0.9447,
      "step": 88
    },
    {
      "epoch": 2.142963514519732,
      "grad_norm": 0.33839651942253113,
      "learning_rate": 4.403208252156921e-06,
      "loss": 0.9482,
      "step": 90
    },
    {
      "epoch": 2.1906180193596425,
      "grad_norm": 0.3227793276309967,
      "learning_rate": 3.973653636207437e-06,
      "loss": 0.979,
      "step": 92
    },
    {
      "epoch": 2.2382725241995534,
      "grad_norm": 0.33203473687171936,
      "learning_rate": 3.560895528440844e-06,
      "loss": 0.9626,
      "step": 94
    },
    {
      "epoch": 2.2859270290394638,
      "grad_norm": 0.29434484243392944,
      "learning_rate": 3.1660843597345137e-06,
      "loss": 0.9814,
      "step": 96
    },
    {
      "epoch": 2.3335815338793746,
      "grad_norm": 0.2951570451259613,
      "learning_rate": 2.7903205396277546e-06,
      "loss": 0.9368,
      "step": 98
    },
    {
      "epoch": 2.381236038719285,
      "grad_norm": 0.3088631331920624,
      "learning_rate": 2.4346513892830427e-06,
      "loss": 0.952,
      "step": 100
    },
    {
      "epoch": 2.428890543559196,
      "grad_norm": 0.3212229609489441,
      "learning_rate": 2.100068222414121e-06,
      "loss": 0.924,
      "step": 102
    },
    {
      "epoch": 2.4765450483991067,
      "grad_norm": 0.28245487809181213,
      "learning_rate": 1.7875035823168641e-06,
      "loss": 0.9435,
      "step": 104
    },
    {
      "epoch": 2.524199553239017,
      "grad_norm": 0.2676449716091156,
      "learning_rate": 1.4978286427038602e-06,
      "loss": 0.92,
      "step": 106
    },
    {
      "epoch": 2.5718540580789275,
      "grad_norm": 0.26664310693740845,
      "learning_rate": 1.2318507795870138e-06,
      "loss": 0.9385,
      "step": 108
    },
    {
      "epoch": 2.6195085629188384,
      "grad_norm": 0.263298362493515,
      "learning_rate": 9.903113209758098e-07,
      "loss": 0.931,
      "step": 110
    },
    {
      "epoch": 2.6671630677587492,
      "grad_norm": 0.27026382088661194,
      "learning_rate": 7.738834806631712e-07,
      "loss": 0.947,
      "step": 112
    },
    {
      "epoch": 2.7148175725986596,
      "grad_norm": 0.2611420452594757,
      "learning_rate": 5.831704818578842e-07,
      "loss": 0.9346,
      "step": 114
    },
    {
      "epoch": 2.7624720774385705,
      "grad_norm": 0.25799694657325745,
      "learning_rate": 4.187038758933204e-07,
      "loss": 0.9363,
      "step": 116
    },
    {
      "epoch": 2.810126582278481,
      "grad_norm": 0.2518501877784729,
      "learning_rate": 2.809420606985236e-07,
      "loss": 0.9359,
      "step": 118
    },
    {
      "epoch": 2.8577810871183917,
      "grad_norm": 0.2569182813167572,
      "learning_rate": 1.7026900316098217e-07,
      "loss": 0.9399,
      "step": 120
    },
    {
      "epoch": 2.905435591958302,
      "grad_norm": 0.24561701714992523,
      "learning_rate": 8.699316894203225e-08,
      "loss": 0.9467,
      "step": 122
    },
    {
      "epoch": 2.953090096798213,
      "grad_norm": 0.2518835961818695,
      "learning_rate": 3.134666272774034e-08,
      "loss": 0.9405,
      "step": 124
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.25132957100868225,
      "learning_rate": 3.4845813115114147e-09,
      "loss": 0.9466,
      "step": 126
    }
  ],
  "logging_steps": 2,
  "max_steps": 126,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 300.0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.312585296119595e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}