| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 12.0, | |
| "eval_steps": 500, | |
| "global_step": 570, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.08421052631578947, | |
| "grad_norm": 64.44918823242188, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 5.9893, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.16842105263157894, | |
| "grad_norm": 1.1460206508636475, | |
| "learning_rate": 3.888888888888889e-05, | |
| "loss": 1.7662, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.25263157894736843, | |
| "grad_norm": 0.8948838710784912, | |
| "learning_rate": 6.111111111111112e-05, | |
| "loss": 0.8212, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.3368421052631579, | |
| "grad_norm": 0.5042845606803894, | |
| "learning_rate": 8.333333333333334e-05, | |
| "loss": 0.533, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.42105263157894735, | |
| "grad_norm": 0.5063032507896423, | |
| "learning_rate": 0.00010555555555555557, | |
| "loss": 0.425, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.5052631578947369, | |
| "grad_norm": 0.4761255383491516, | |
| "learning_rate": 0.00012777777777777776, | |
| "loss": 0.3875, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.5894736842105263, | |
| "grad_norm": 0.4021145701408386, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 0.3799, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.6736842105263158, | |
| "grad_norm": 0.441383421421051, | |
| "learning_rate": 0.00017222222222222224, | |
| "loss": 0.3438, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.7578947368421053, | |
| "grad_norm": 0.3923978805541992, | |
| "learning_rate": 0.00019444444444444446, | |
| "loss": 0.2952, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.8421052631578947, | |
| "grad_norm": 0.39148256182670593, | |
| "learning_rate": 0.00019999007677495127, | |
| "loss": 0.2766, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.9263157894736842, | |
| "grad_norm": 0.3753437101840973, | |
| "learning_rate": 0.0001999459775237086, | |
| "loss": 0.329, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 1.0105263157894737, | |
| "grad_norm": 0.49228230118751526, | |
| "learning_rate": 0.00019986661520865405, | |
| "loss": 0.2808, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 1.0947368421052632, | |
| "grad_norm": 0.3359147608280182, | |
| "learning_rate": 0.00019975201783049805, | |
| "loss": 0.2385, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 1.1789473684210527, | |
| "grad_norm": 0.33956533670425415, | |
| "learning_rate": 0.00019960222582162976, | |
| "loss": 0.224, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 1.263157894736842, | |
| "grad_norm": 0.29228124022483826, | |
| "learning_rate": 0.00019941729203185165, | |
| "loss": 0.2062, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.3473684210526315, | |
| "grad_norm": 0.3463740944862366, | |
| "learning_rate": 0.00019919728170973296, | |
| "loss": 0.2335, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 1.431578947368421, | |
| "grad_norm": 0.30409345030784607, | |
| "learning_rate": 0.00019894227247958845, | |
| "loss": 0.2242, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 1.5157894736842106, | |
| "grad_norm": 0.3040483593940735, | |
| "learning_rate": 0.00019865235431409123, | |
| "loss": 0.1707, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.3167271614074707, | |
| "learning_rate": 0.00019832762950252813, | |
| "loss": 0.2288, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 1.6842105263157894, | |
| "grad_norm": 0.4126874804496765, | |
| "learning_rate": 0.00019796821261471018, | |
| "loss": 0.202, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.768421052631579, | |
| "grad_norm": 0.3122522830963135, | |
| "learning_rate": 0.00019757423046054968, | |
| "loss": 0.2209, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 1.8526315789473684, | |
| "grad_norm": 0.348960280418396, | |
| "learning_rate": 0.00019714582204531918, | |
| "loss": 0.1551, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 1.936842105263158, | |
| "grad_norm": 0.413482666015625, | |
| "learning_rate": 0.00019668313852060735, | |
| "loss": 0.1818, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 2.0210526315789474, | |
| "grad_norm": 0.2502991259098053, | |
| "learning_rate": 0.00019618634313098952, | |
| "loss": 0.1355, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 2.1052631578947367, | |
| "grad_norm": 0.33105117082595825, | |
| "learning_rate": 0.00019565561115643152, | |
| "loss": 0.1286, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 2.1894736842105265, | |
| "grad_norm": 0.3156004548072815, | |
| "learning_rate": 0.00019509112985044717, | |
| "loss": 0.0978, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 2.2736842105263158, | |
| "grad_norm": 0.39305001497268677, | |
| "learning_rate": 0.00019449309837403137, | |
| "loss": 0.1227, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 2.3578947368421055, | |
| "grad_norm": 0.3099610209465027, | |
| "learning_rate": 0.00019386172772539162, | |
| "loss": 0.1346, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 2.442105263157895, | |
| "grad_norm": 0.2809341847896576, | |
| "learning_rate": 0.00019319724066550373, | |
| "loss": 0.1223, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 2.526315789473684, | |
| "grad_norm": 0.40614527463912964, | |
| "learning_rate": 0.00019249987163951667, | |
| "loss": 0.1353, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 2.610526315789474, | |
| "grad_norm": 0.23737777769565582, | |
| "learning_rate": 0.00019176986669403555, | |
| "loss": 0.112, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 2.694736842105263, | |
| "grad_norm": 0.2852821350097656, | |
| "learning_rate": 0.00019100748339031113, | |
| "loss": 0.1349, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 2.7789473684210524, | |
| "grad_norm": 0.43856674432754517, | |
| "learning_rate": 0.00019021299071336664, | |
| "loss": 0.1179, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 2.863157894736842, | |
| "grad_norm": 0.3141747713088989, | |
| "learning_rate": 0.00018938666897709425, | |
| "loss": 0.1306, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 2.9473684210526314, | |
| "grad_norm": 0.2711346447467804, | |
| "learning_rate": 0.00018852880972535432, | |
| "loss": 0.1551, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 3.031578947368421, | |
| "grad_norm": 0.25603431463241577, | |
| "learning_rate": 0.0001876397156291125, | |
| "loss": 0.0794, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 3.1157894736842104, | |
| "grad_norm": 0.42339861392974854, | |
| "learning_rate": 0.00018671970037965118, | |
| "loss": 0.0744, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 0.2693917751312256, | |
| "learning_rate": 0.0001857690885778923, | |
| "loss": 0.0696, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 3.2842105263157895, | |
| "grad_norm": 0.2737679183483124, | |
| "learning_rate": 0.0001847882156198713, | |
| "loss": 0.0653, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 3.3684210526315788, | |
| "grad_norm": 0.4129018187522888, | |
| "learning_rate": 0.00018377742757840244, | |
| "loss": 0.0806, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 3.4526315789473685, | |
| "grad_norm": 0.3873012065887451, | |
| "learning_rate": 0.00018273708108097677, | |
| "loss": 0.0752, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 3.536842105263158, | |
| "grad_norm": 0.3241969645023346, | |
| "learning_rate": 0.0001816675431839365, | |
| "loss": 0.0772, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 3.6210526315789475, | |
| "grad_norm": 0.3158609867095947, | |
| "learning_rate": 0.0001805691912429696, | |
| "loss": 0.0802, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 3.705263157894737, | |
| "grad_norm": 0.3270516097545624, | |
| "learning_rate": 0.00017944241277997077, | |
| "loss": 0.0712, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 3.7894736842105265, | |
| "grad_norm": 0.30987676978111267, | |
| "learning_rate": 0.00017828760534631565, | |
| "loss": 0.0699, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 3.873684210526316, | |
| "grad_norm": 0.28480827808380127, | |
| "learning_rate": 0.0001771051763825959, | |
| "loss": 0.0858, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 3.957894736842105, | |
| "grad_norm": 0.24550119042396545, | |
| "learning_rate": 0.0001758955430748658, | |
| "loss": 0.0619, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 4.042105263157895, | |
| "grad_norm": 0.3428252041339874, | |
| "learning_rate": 0.00017465913220744998, | |
| "loss": 0.0585, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 4.126315789473685, | |
| "grad_norm": 0.32337552309036255, | |
| "learning_rate": 0.00017339638001236492, | |
| "loss": 0.0518, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 4.2105263157894735, | |
| "grad_norm": 0.2767057716846466, | |
| "learning_rate": 0.00017210773201540707, | |
| "loss": 0.0435, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 4.294736842105263, | |
| "grad_norm": 0.22791585326194763, | |
| "learning_rate": 0.00017079364287896174, | |
| "loss": 0.0471, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 4.378947368421053, | |
| "grad_norm": 0.3095114231109619, | |
| "learning_rate": 0.00016945457624158871, | |
| "loss": 0.0401, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 4.463157894736842, | |
| "grad_norm": 0.2641872465610504, | |
| "learning_rate": 0.0001680910045544406, | |
| "loss": 0.0447, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 4.5473684210526315, | |
| "grad_norm": 0.3095775842666626, | |
| "learning_rate": 0.00016670340891457216, | |
| "loss": 0.052, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 4.631578947368421, | |
| "grad_norm": 0.292422890663147, | |
| "learning_rate": 0.00016529227889519886, | |
| "loss": 0.0439, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 4.715789473684211, | |
| "grad_norm": 0.2715825140476227, | |
| "learning_rate": 0.0001638581123729652, | |
| "loss": 0.039, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 0.17827194929122925, | |
| "learning_rate": 0.00016240141535228323, | |
| "loss": 0.0424, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 4.88421052631579, | |
| "grad_norm": 0.21712446212768555, | |
| "learning_rate": 0.0001609227017868033, | |
| "loss": 0.0502, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 4.968421052631579, | |
| "grad_norm": 0.45244210958480835, | |
| "learning_rate": 0.00015942249339808058, | |
| "loss": 0.0548, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 5.052631578947368, | |
| "grad_norm": 0.20093843340873718, | |
| "learning_rate": 0.00015790131949150035, | |
| "loss": 0.0284, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 5.136842105263158, | |
| "grad_norm": 0.3055751919746399, | |
| "learning_rate": 0.00015635971676952797, | |
| "loss": 0.0304, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 5.221052631578948, | |
| "grad_norm": 0.21160796284675598, | |
| "learning_rate": 0.00015479822914234875, | |
| "loss": 0.0291, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 5.3052631578947365, | |
| "grad_norm": 0.19923457503318787, | |
| "learning_rate": 0.0001532174075359649, | |
| "loss": 0.0291, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 5.389473684210526, | |
| "grad_norm": 0.21759603917598724, | |
| "learning_rate": 0.00015161780969781728, | |
| "loss": 0.0322, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 5.473684210526316, | |
| "grad_norm": 0.1476200670003891, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 0.03, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 5.557894736842105, | |
| "grad_norm": 0.1323111355304718, | |
| "learning_rate": 0.00014836454924013824, | |
| "loss": 0.0297, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 5.6421052631578945, | |
| "grad_norm": 0.1873178780078888, | |
| "learning_rate": 0.00014671203443999845, | |
| "loss": 0.031, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 5.726315789473684, | |
| "grad_norm": 0.18353727459907532, | |
| "learning_rate": 0.00014504303864190307, | |
| "loss": 0.0294, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 5.810526315789474, | |
| "grad_norm": 0.177791565656662, | |
| "learning_rate": 0.00014335815070302054, | |
| "loss": 0.0306, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 5.894736842105263, | |
| "grad_norm": 0.17418071627616882, | |
| "learning_rate": 0.0001416579650876043, | |
| "loss": 0.0307, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 5.978947368421053, | |
| "grad_norm": 0.20067432522773743, | |
| "learning_rate": 0.00013994308165725288, | |
| "loss": 0.0305, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 6.063157894736842, | |
| "grad_norm": 0.1422828584909439, | |
| "learning_rate": 0.00013821410545926613, | |
| "loss": 0.0297, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 6.147368421052631, | |
| "grad_norm": 0.16031205654144287, | |
| "learning_rate": 0.00013647164651317176, | |
| "loss": 0.0233, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 6.231578947368421, | |
| "grad_norm": 0.14059369266033173, | |
| "learning_rate": 0.0001347163195954973, | |
| "loss": 0.0216, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 6.315789473684211, | |
| "grad_norm": 0.15955083072185516, | |
| "learning_rate": 0.00013294874402286402, | |
| "loss": 0.023, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "grad_norm": 0.0965215414762497, | |
| "learning_rate": 0.00013116954343347882, | |
| "loss": 0.0227, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 6.484210526315789, | |
| "grad_norm": 0.11703325808048248, | |
| "learning_rate": 0.00012937934556710143, | |
| "loss": 0.0237, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 6.568421052631579, | |
| "grad_norm": 0.13229094445705414, | |
| "learning_rate": 0.0001275787820435645, | |
| "loss": 0.0233, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 6.652631578947369, | |
| "grad_norm": 0.15599046647548676, | |
| "learning_rate": 0.00012576848813992475, | |
| "loss": 0.024, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 6.7368421052631575, | |
| "grad_norm": 0.14043009281158447, | |
| "learning_rate": 0.00012394910256632356, | |
| "loss": 0.0222, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 6.821052631578947, | |
| "grad_norm": 0.10719907283782959, | |
| "learning_rate": 0.00012212126724063676, | |
| "loss": 0.0234, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 6.905263157894737, | |
| "grad_norm": 0.1136666014790535, | |
| "learning_rate": 0.000120285627061992, | |
| "loss": 0.0217, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 6.989473684210527, | |
| "grad_norm": 0.11610202491283417, | |
| "learning_rate": 0.00011844282968323501, | |
| "loss": 0.0221, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 7.073684210526316, | |
| "grad_norm": 0.04680392146110535, | |
| "learning_rate": 0.00011659352528242366, | |
| "loss": 0.0186, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 7.157894736842105, | |
| "grad_norm": 0.1080232560634613, | |
| "learning_rate": 0.00011473836633343144, | |
| "loss": 0.0181, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 7.242105263157895, | |
| "grad_norm": 0.07934293150901794, | |
| "learning_rate": 0.00011287800737574072, | |
| "loss": 0.0175, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 7.326315789473684, | |
| "grad_norm": 0.11348960548639297, | |
| "learning_rate": 0.00011101310478350754, | |
| "loss": 0.0191, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 7.410526315789474, | |
| "grad_norm": 0.1111045703291893, | |
| "learning_rate": 0.00010914431653397856, | |
| "loss": 0.0183, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 7.494736842105263, | |
| "grad_norm": 0.10023918747901917, | |
| "learning_rate": 0.00010727230197534299, | |
| "loss": 0.0201, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 7.578947368421053, | |
| "grad_norm": 0.1350473314523697, | |
| "learning_rate": 0.00010539772159410036, | |
| "loss": 0.019, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 7.663157894736842, | |
| "grad_norm": 0.09026475250720978, | |
| "learning_rate": 0.00010352123678202685, | |
| "loss": 0.0184, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 7.747368421052632, | |
| "grad_norm": 0.08861108869314194, | |
| "learning_rate": 0.00010164350960282252, | |
| "loss": 0.0187, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 7.831578947368421, | |
| "grad_norm": 0.07832619547843933, | |
| "learning_rate": 9.976520255852065e-05, | |
| "loss": 0.0192, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 7.91578947368421, | |
| "grad_norm": 0.10778514295816422, | |
| "learning_rate": 9.788697835574347e-05, | |
| "loss": 0.0204, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.09410171210765839, | |
| "learning_rate": 9.600949967188484e-05, | |
| "loss": 0.0189, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 8.08421052631579, | |
| "grad_norm": 0.04481295123696327, | |
| "learning_rate": 9.413342892130376e-05, | |
| "loss": 0.0151, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 8.16842105263158, | |
| "grad_norm": 0.055971067398786545, | |
| "learning_rate": 9.225942802161042e-05, | |
| "loss": 0.0158, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 8.25263157894737, | |
| "grad_norm": 0.07519485801458359, | |
| "learning_rate": 9.038815816012767e-05, | |
| "loss": 0.0162, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 8.336842105263157, | |
| "grad_norm": 0.06743883341550827, | |
| "learning_rate": 8.852027956061015e-05, | |
| "loss": 0.0155, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 8.421052631578947, | |
| "grad_norm": 0.05589531734585762, | |
| "learning_rate": 8.665645125030311e-05, | |
| "loss": 0.0166, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 8.505263157894737, | |
| "grad_norm": 0.05887574702501297, | |
| "learning_rate": 8.479733082742384e-05, | |
| "loss": 0.0164, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 8.589473684210526, | |
| "grad_norm": 0.057513728737831116, | |
| "learning_rate": 8.294357422914685e-05, | |
| "loss": 0.0172, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 8.673684210526316, | |
| "grad_norm": 0.05391902104020119, | |
| "learning_rate": 8.10958355001755e-05, | |
| "loss": 0.0167, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 8.757894736842106, | |
| "grad_norm": 0.047253336757421494, | |
| "learning_rate": 7.925476656198095e-05, | |
| "loss": 0.0166, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 8.842105263157894, | |
| "grad_norm": 0.04581145569682121, | |
| "learning_rate": 7.74210169827906e-05, | |
| "loss": 0.0184, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 8.926315789473684, | |
| "grad_norm": 0.05937978997826576, | |
| "learning_rate": 7.55952337484064e-05, | |
| "loss": 0.0162, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 9.010526315789473, | |
| "grad_norm": 0.043958455324172974, | |
| "learning_rate": 7.377806103393473e-05, | |
| "loss": 0.0171, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 9.094736842105263, | |
| "grad_norm": 0.035407859832048416, | |
| "learning_rate": 7.197013997650762e-05, | |
| "loss": 0.0145, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 9.178947368421053, | |
| "grad_norm": 0.0361582450568676, | |
| "learning_rate": 7.017210844907598e-05, | |
| "loss": 0.0148, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 9.263157894736842, | |
| "grad_norm": 0.04210333898663521, | |
| "learning_rate": 6.838460083535445e-05, | |
| "loss": 0.0147, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 9.347368421052632, | |
| "grad_norm": 0.043872371315956116, | |
| "learning_rate": 6.660824780599744e-05, | |
| "loss": 0.0144, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 9.431578947368422, | |
| "grad_norm": 0.04811229929327965, | |
| "learning_rate": 6.484367609608503e-05, | |
| "loss": 0.0148, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 9.51578947368421, | |
| "grad_norm": 0.05032840371131897, | |
| "learning_rate": 6.309150828399754e-05, | |
| "loss": 0.0152, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 9.6, | |
| "grad_norm": 0.04303669556975365, | |
| "learning_rate": 6.135236257175668e-05, | |
| "loss": 0.0172, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 9.68421052631579, | |
| "grad_norm": 0.04116836190223694, | |
| "learning_rate": 5.962685256691071e-05, | |
| "loss": 0.0154, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 9.76842105263158, | |
| "grad_norm": 0.04266192391514778, | |
| "learning_rate": 5.791558706604074e-05, | |
| "loss": 0.0153, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 9.852631578947369, | |
| "grad_norm": 0.04463796690106392, | |
| "learning_rate": 5.621916983996429e-05, | |
| "loss": 0.0156, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 9.936842105263159, | |
| "grad_norm": 0.0458030104637146, | |
| "learning_rate": 5.453819942071211e-05, | |
| "loss": 0.0159, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 10.021052631578947, | |
| "grad_norm": 0.03369244933128357, | |
| "learning_rate": 5.2873268890353424e-05, | |
| "loss": 0.0148, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 10.105263157894736, | |
| "grad_norm": 0.03552337363362312, | |
| "learning_rate": 5.12249656717439e-05, | |
| "loss": 0.0136, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 10.189473684210526, | |
| "grad_norm": 0.04354244843125343, | |
| "learning_rate": 4.959387132127054e-05, | |
| "loss": 0.0144, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 10.273684210526316, | |
| "grad_norm": 0.034174419939517975, | |
| "learning_rate": 4.7980561323666115e-05, | |
| "loss": 0.0132, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 10.357894736842105, | |
| "grad_norm": 0.0367300920188427, | |
| "learning_rate": 4.638560488896589e-05, | |
| "loss": 0.0146, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 10.442105263157895, | |
| "grad_norm": 0.03937629610300064, | |
| "learning_rate": 4.48095647516783e-05, | |
| "loss": 0.0143, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 10.526315789473685, | |
| "grad_norm": 0.045526400208473206, | |
| "learning_rate": 4.3252996972240324e-05, | |
| "loss": 0.0142, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 10.610526315789473, | |
| "grad_norm": 0.04123491421341896, | |
| "learning_rate": 4.171645074082737e-05, | |
| "loss": 0.0148, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 10.694736842105263, | |
| "grad_norm": 0.0411781445145607, | |
| "learning_rate": 4.0200468183587556e-05, | |
| "loss": 0.0147, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 10.778947368421052, | |
| "grad_norm": 0.04118568077683449, | |
| "learning_rate": 3.8705584171367885e-05, | |
| "loss": 0.0152, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 10.863157894736842, | |
| "grad_norm": 0.04070712625980377, | |
| "learning_rate": 3.723232613100046e-05, | |
| "loss": 0.015, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 10.947368421052632, | |
| "grad_norm": 0.037149183452129364, | |
| "learning_rate": 3.578121385921533e-05, | |
| "loss": 0.0147, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 11.031578947368422, | |
| "grad_norm": 0.04417663812637329, | |
| "learning_rate": 3.435275933924487e-05, | |
| "loss": 0.0143, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 11.115789473684211, | |
| "grad_norm": 0.03727949410676956, | |
| "learning_rate": 3.294746656018532e-05, | |
| "loss": 0.0136, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 11.2, | |
| "grad_norm": 0.03517484292387962, | |
| "learning_rate": 3.156583133917884e-05, | |
| "loss": 0.0135, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 11.284210526315789, | |
| "grad_norm": 0.04109984263777733, | |
| "learning_rate": 3.0208341146478602e-05, | |
| "loss": 0.0134, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 11.368421052631579, | |
| "grad_norm": 0.038387149572372437, | |
| "learning_rate": 2.8875474933458847e-05, | |
| "loss": 0.0137, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 11.452631578947368, | |
| "grad_norm": 0.038690097630023956, | |
| "learning_rate": 2.7567702963630803e-05, | |
| "loss": 0.0137, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 11.536842105263158, | |
| "grad_norm": 0.04456111416220665, | |
| "learning_rate": 2.6285486646723634e-05, | |
| "loss": 0.0143, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 11.621052631578948, | |
| "grad_norm": 0.03415576368570328, | |
| "learning_rate": 2.5029278375889387e-05, | |
| "loss": 0.0142, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 11.705263157894738, | |
| "grad_norm": 0.042495112866163254, | |
| "learning_rate": 2.379952136808903e-05, | |
| "loss": 0.014, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 11.789473684210526, | |
| "grad_norm": 0.04094533994793892, | |
| "learning_rate": 2.2596649507716018e-05, | |
| "loss": 0.0138, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 11.873684210526315, | |
| "grad_norm": 0.0460105761885643, | |
| "learning_rate": 2.1421087193512756e-05, | |
| "loss": 0.0146, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 11.957894736842105, | |
| "grad_norm": 0.03978003188967705, | |
| "learning_rate": 2.0273249188833654e-05, | |
| "loss": 0.0144, | |
| "step": 568 | |
| } | |
| ], | |
| "logging_steps": 4, | |
| "max_steps": 705, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 15, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.287747137892516e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |