{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.129651240958348,
  "eval_steps": 500,
  "global_step": 99000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.020319627744419724,
      "grad_norm": 7.625,
      "learning_rate": 1.3531084924827305e-06,
      "loss": 2.0186,
      "step": 1000
    },
    {
      "epoch": 0.04063925548883945,
      "grad_norm": 6.03125,
      "learning_rate": 2.7075714479208997e-06,
      "loss": 1.4792,
      "step": 2000
    },
    {
      "epoch": 0.06095888323325917,
      "grad_norm": 7.4375,
      "learning_rate": 4.062034403359069e-06,
      "loss": 1.3553,
      "step": 3000
    },
    {
      "epoch": 0.0812785109776789,
      "grad_norm": 9.5625,
      "learning_rate": 5.416497358797237e-06,
      "loss": 1.2948,
      "step": 4000
    },
    {
      "epoch": 0.10159813872209861,
      "grad_norm": 4.875,
      "learning_rate": 6.7709603142354064e-06,
      "loss": 1.2579,
      "step": 5000
    },
    {
      "epoch": 0.12901283677725933,
      "grad_norm": 4.5625,
      "learning_rate": 8.598251397448762e-06,
      "loss": 1.2317,
      "step": 6000
    },
    {
      "epoch": 0.1505149762401359,
      "grad_norm": 5.34375,
      "learning_rate": 9.999999320225357e-06,
      "loss": 1.1979,
      "step": 7000
    },
    {
      "epoch": 0.17201711570301245,
      "grad_norm": 4.125,
      "learning_rate": 9.998533101166477e-06,
      "loss": 1.1811,
      "step": 8000
    },
    {
      "epoch": 0.193519255165889,
      "grad_norm": 3.859375,
      "learning_rate": 9.994258851483552e-06,
      "loss": 1.1605,
      "step": 9000
    },
    {
      "epoch": 0.21502139462876557,
      "grad_norm": 3.984375,
      "learning_rate": 9.987178972325833e-06,
      "loss": 1.1434,
      "step": 10000
    },
    {
      "epoch": 0.23652353409164212,
      "grad_norm": 3.84375,
      "learning_rate": 9.977297440963669e-06,
      "loss": 1.1348,
      "step": 11000
    },
    {
      "epoch": 0.25802567355451866,
      "grad_norm": 4.28125,
      "learning_rate": 9.964619808554195e-06,
      "loss": 1.1287,
      "step": 12000
    },
    {
      "epoch": 0.2795278130173952,
      "grad_norm": 3.4375,
      "learning_rate": 9.949153197022848e-06,
      "loss": 1.1108,
      "step": 13000
    },
    {
      "epoch": 0.3010299524802718,
      "grad_norm": 4.15625,
      "learning_rate": 9.930906295062477e-06,
      "loss": 1.1053,
      "step": 14000
    },
    {
      "epoch": 0.32253209194314836,
      "grad_norm": 3.75,
      "learning_rate": 9.909889353252299e-06,
      "loss": 1.0951,
      "step": 15000
    },
    {
      "epoch": 0.3440342314060249,
      "grad_norm": 3.78125,
      "learning_rate": 9.886114178299407e-06,
      "loss": 1.0883,
      "step": 16000
    },
    {
      "epoch": 0.36553637086890145,
      "grad_norm": 5.5,
      "learning_rate": 9.85959412640611e-06,
      "loss": 1.0752,
      "step": 17000
    },
    {
      "epoch": 0.387038510331778,
      "grad_norm": 3.796875,
      "learning_rate": 9.830344095766812e-06,
      "loss": 1.0785,
      "step": 18000
    },
    {
      "epoch": 0.4087230094920541,
      "grad_norm": 4.34375,
      "learning_rate": 9.798108131271342e-06,
      "loss": 1.0434,
      "step": 19000
    },
    {
      "epoch": 0.43023474683374113,
      "grad_norm": 3.828125,
      "learning_rate": 9.763411510439176e-06,
      "loss": 1.0456,
      "step": 20000
    },
    {
      "epoch": 0.45174648417542823,
      "grad_norm": 4.6875,
      "learning_rate": 9.72603664052252e-06,
      "loss": 1.0324,
      "step": 21000
    },
    {
      "epoch": 0.4732582215171153,
      "grad_norm": 3.78125,
      "learning_rate": 9.686004535706463e-06,
      "loss": 1.0406,
      "step": 22000
    },
    {
      "epoch": 0.4947699588588023,
      "grad_norm": 3.65625,
      "learning_rate": 9.64333770421811e-06,
      "loss": 1.0267,
      "step": 23000
    },
    {
      "epoch": 0.5162816962004894,
      "grad_norm": 6.53125,
      "learning_rate": 9.598060135671232e-06,
      "loss": 1.0287,
      "step": 24000
    },
    {
      "epoch": 0.5377934335421765,
      "grad_norm": 3.90625,
      "learning_rate": 9.550197287578003e-06,
      "loss": 1.0162,
      "step": 25000
    },
    {
      "epoch": 0.5593051708838636,
      "grad_norm": 5.78125,
      "learning_rate": 9.499776071035394e-06,
      "loss": 1.0197,
      "step": 26000
    },
    {
      "epoch": 0.5808169082255505,
      "grad_norm": 6.28125,
      "learning_rate": 9.446824835594304e-06,
      "loss": 1.0163,
      "step": 27000
    },
    {
      "epoch": 0.6023286455672376,
      "grad_norm": 3.90625,
      "learning_rate": 9.391373353319884e-06,
      "loss": 1.0169,
      "step": 28000
    },
    {
      "epoch": 0.6238403829089246,
      "grad_norm": 3.734375,
      "learning_rate": 9.333452802052072e-06,
      "loss": 1.0079,
      "step": 29000
    },
    {
      "epoch": 0.6453521202506117,
      "grad_norm": 4.0625,
      "learning_rate": 9.273095747875717e-06,
      "loss": 1.0061,
      "step": 30000
    },
    {
      "epoch": 0.6668638575922988,
      "grad_norm": 3.859375,
      "learning_rate": 9.210336126810147e-06,
      "loss": 1.0068,
      "step": 31000
    },
    {
      "epoch": 0.6883755949339858,
      "grad_norm": 3.859375,
      "learning_rate": 9.145209225728495e-06,
      "loss": 0.9983,
      "step": 32000
    },
    {
      "epoch": 0.7098873322756729,
      "grad_norm": 4.125,
      "learning_rate": 9.077751662517505e-06,
      "loss": 0.9988,
      "step": 33000
    },
    {
      "epoch": 0.73139906961736,
      "grad_norm": 4.0625,
      "learning_rate": 9.00800136548896e-06,
      "loss": 0.9865,
      "step": 34000
    },
    {
      "epoch": 0.752910806959047,
      "grad_norm": 4.0,
      "learning_rate": 8.93599755205432e-06,
      "loss": 0.9917,
      "step": 35000
    },
    {
      "epoch": 0.7744225443007341,
      "grad_norm": 4.5625,
      "learning_rate": 8.861780706674562e-06,
      "loss": 0.9929,
      "step": 36000
    },
    {
      "epoch": 0.7959342816424212,
      "grad_norm": 3.984375,
      "learning_rate": 8.785392558097612e-06,
      "loss": 0.9844,
      "step": 37000
    },
    {
      "epoch": 0.8174460189841082,
      "grad_norm": 4.5,
      "learning_rate": 8.706876055896176e-06,
      "loss": 0.9879,
      "step": 38000
    },
    {
      "epoch": 0.8389577563257953,
      "grad_norm": 3.953125,
      "learning_rate": 8.62627534631915e-06,
      "loss": 0.9858,
      "step": 39000
    },
    {
      "epoch": 0.8604694936674823,
      "grad_norm": 3.96875,
      "learning_rate": 8.5436357474702e-06,
      "loss": 0.9782,
      "step": 40000
    },
    {
      "epoch": 0.8819812310091694,
      "grad_norm": 4.34375,
      "learning_rate": 8.45900372382746e-06,
      "loss": 0.9819,
      "step": 41000
    },
    {
      "epoch": 0.9034929683508565,
      "grad_norm": 4.53125,
      "learning_rate": 8.372426860118667e-06,
      "loss": 0.9706,
      "step": 42000
    },
    {
      "epoch": 0.9250047056925434,
      "grad_norm": 4.375,
      "learning_rate": 8.283953834566449e-06,
      "loss": 0.9792,
      "step": 43000
    },
    {
      "epoch": 0.9465164430342305,
      "grad_norm": 8.875,
      "learning_rate": 8.193634391518774e-06,
      "loss": 0.9709,
      "step": 44000
    },
    {
      "epoch": 0.9680281803759176,
      "grad_norm": 4.90625,
      "learning_rate": 8.101519313479972e-06,
      "loss": 0.9686,
      "step": 45000
    },
    {
      "epoch": 0.9895399177176046,
      "grad_norm": 4.125,
      "learning_rate": 8.00766039255805e-06,
      "loss": 0.9668,
      "step": 46000
    },
    {
      "epoch": 1.011057032993627,
      "grad_norm": 5.03125,
      "learning_rate": 7.912110401344347e-06,
      "loss": 0.9404,
      "step": 47000
    },
    {
      "epoch": 1.0325687703353141,
      "grad_norm": 5.28125,
      "learning_rate": 7.814923063241916e-06,
      "loss": 0.9154,
      "step": 48000
    },
    {
      "epoch": 1.0540805076770012,
      "grad_norm": 5.9375,
      "learning_rate": 7.71615302225931e-06,
      "loss": 0.9131,
      "step": 49000
    },
    {
      "epoch": 1.0755922450186883,
      "grad_norm": 5.71875,
      "learning_rate": 7.615855812286735e-06,
      "loss": 0.9124,
      "step": 50000
    },
    {
      "epoch": 1.0971039823603754,
      "grad_norm": 5.28125,
      "learning_rate": 7.514087825871885e-06,
      "loss": 0.9144,
      "step": 51000
    },
    {
      "epoch": 1.1186157197020625,
      "grad_norm": 5.53125,
      "learning_rate": 7.410906282512981e-06,
      "loss": 0.9054,
      "step": 52000
    },
    {
      "epoch": 1.1401274570437494,
      "grad_norm": 5.875,
      "learning_rate": 7.306369196486855e-06,
      "loss": 0.9162,
      "step": 53000
    },
    {
      "epoch": 1.1616176826480948,
      "grad_norm": 6.15625,
      "learning_rate": 7.20053534423017e-06,
      "loss": 0.9378,
      "step": 54000
    },
    {
      "epoch": 1.183129419989782,
      "grad_norm": 5.375,
      "learning_rate": 7.093464231292111e-06,
      "loss": 0.9335,
      "step": 55000
    },
    {
      "epoch": 1.204641157331469,
      "grad_norm": 5.09375,
      "learning_rate": 6.985216058877125e-06,
      "loss": 0.937,
      "step": 56000
    },
    {
      "epoch": 1.2261528946731561,
      "grad_norm": 5.25,
      "learning_rate": 6.875851689996526e-06,
      "loss": 0.9275,
      "step": 57000
    },
    {
      "epoch": 1.247664632014843,
      "grad_norm": 5.3125,
      "learning_rate": 6.765432615248008e-06,
      "loss": 0.9307,
      "step": 58000
    },
    {
      "epoch": 1.26917636935653,
      "grad_norm": 5.78125,
      "learning_rate": 6.6540209182422785e-06,
      "loss": 0.9338,
      "step": 59000
    },
    {
      "epoch": 1.2906881066982172,
      "grad_norm": 5.90625,
      "learning_rate": 6.5416792406962785e-06,
      "loss": 0.9314,
      "step": 60000
    },
    {
      "epoch": 1.3121998440399043,
      "grad_norm": 5.4375,
      "learning_rate": 6.4284707472126e-06,
      "loss": 0.9287,
      "step": 61000
    },
    {
      "epoch": 1.3337115813815914,
      "grad_norm": 4.875,
      "learning_rate": 6.3144590897649084e-06,
      "loss": 0.9294,
      "step": 62000
    },
    {
      "epoch": 1.3552233187232785,
      "grad_norm": 6.0,
      "learning_rate": 6.199708371909345e-06,
      "loss": 0.9383,
      "step": 63000
    },
    {
      "epoch": 1.3767350560649654,
      "grad_norm": 5.25,
      "learning_rate": 6.0842831127420196e-06,
      "loss": 0.9376,
      "step": 64000
    },
    {
      "epoch": 1.3982467934066525,
      "grad_norm": 5.71875,
      "learning_rate": 5.968248210622858e-06,
      "loss": 0.8902,
      "step": 65000
    },
    {
      "epoch": 1.4197585307483396,
      "grad_norm": 5.5,
      "learning_rate": 5.851668906686223e-06,
      "loss": 0.8611,
      "step": 66000
    },
    {
      "epoch": 1.4412702680900267,
      "grad_norm": 5.15625,
      "learning_rate": 5.734610748158791e-06,
      "loss": 0.8572,
      "step": 67000
    },
    {
      "epoch": 1.4627820054317136,
      "grad_norm": 5.78125,
      "learning_rate": 5.617139551505345e-06,
      "loss": 0.8541,
      "step": 68000
    },
    {
      "epoch": 1.4842937427734006,
      "grad_norm": 6.21875,
      "learning_rate": 5.499321365423167e-06,
      "loss": 0.8559,
      "step": 69000
    },
    {
      "epoch": 1.5058054801150877,
      "grad_norm": 6.28125,
      "learning_rate": 5.381222433705873e-06,
      "loss": 0.858,
      "step": 70000
    },
    {
      "epoch": 1.5273172174567748,
      "grad_norm": 6.3125,
      "learning_rate": 5.262909157997551e-06,
      "loss": 0.8509,
      "step": 71000
    },
    {
      "epoch": 1.548828954798462,
      "grad_norm": 6.21875,
      "learning_rate": 5.144448060458137e-06,
      "loss": 0.859,
      "step": 72000
    },
    {
      "epoch": 1.570340692140149,
      "grad_norm": 5.375,
      "learning_rate": 5.025905746361047e-06,
      "loss": 0.8419,
      "step": 73000
    },
    {
      "epoch": 1.5918524294818361,
      "grad_norm": 6.125,
      "learning_rate": 4.907348866644061e-06,
      "loss": 0.8584,
      "step": 74000
    },
    {
      "epoch": 1.6133641668235232,
      "grad_norm": 5.71875,
      "learning_rate": 4.78884408043454e-06,
      "loss": 0.8502,
      "step": 75000
    },
    {
      "epoch": 1.6348759041652101,
      "grad_norm": 6.875,
      "learning_rate": 4.670458017570048e-06,
      "loss": 0.8572,
      "step": 76000
    },
    {
      "epoch": 1.6563876415068972,
      "grad_norm": 9.1875,
      "learning_rate": 4.552257241135419e-06,
      "loss": 0.8482,
      "step": 77000
    },
    {
      "epoch": 1.6778993788485843,
      "grad_norm": 6.46875,
      "learning_rate": 4.434308210037382e-06,
      "loss": 0.8481,
      "step": 78000
    },
    {
      "epoch": 1.6994111161902712,
      "grad_norm": 6.1875,
      "learning_rate": 4.316677241637737e-06,
      "loss": 0.8472,
      "step": 79000
    },
    {
      "epoch": 1.7209228535319583,
      "grad_norm": 5.75,
      "learning_rate": 4.1994304744661385e-06,
      "loss": 0.8417,
      "step": 80000
    },
    {
      "epoch": 1.7424345908736454,
      "grad_norm": 5.75,
      "learning_rate": 4.082633831033406e-06,
      "loss": 0.8441,
      "step": 81000
    },
    {
      "epoch": 1.7639463282153325,
      "grad_norm": 6.6875,
      "learning_rate": 3.966352980766305e-06,
      "loss": 0.8517,
      "step": 82000
    },
    {
      "epoch": 1.7854580655570196,
      "grad_norm": 6.09375,
      "learning_rate": 3.850653303084625e-06,
      "loss": 0.8474,
      "step": 83000
    },
    {
      "epoch": 1.8069698028987067,
      "grad_norm": 7.25,
      "learning_rate": 3.7355998506413144e-06,
      "loss": 0.8467,
      "step": 84000
    },
    {
      "epoch": 1.8284815402403938,
      "grad_norm": 7.4375,
      "learning_rate": 3.6212573127463314e-06,
      "loss": 0.8484,
      "step": 85000
    },
    {
      "epoch": 1.8499932775820809,
      "grad_norm": 5.8125,
      "learning_rate": 3.507689978994806e-06,
      "loss": 0.8439,
      "step": 86000
    },
    {
      "epoch": 1.8715050149237678,
      "grad_norm": 6.09375,
      "learning_rate": 3.3949617031199265e-06,
      "loss": 0.8488,
      "step": 87000
    },
    {
      "epoch": 1.8930167522654548,
      "grad_norm": 6.3125,
      "learning_rate": 3.283135867090894e-06,
      "loss": 0.8412,
      "step": 88000
    },
    {
      "epoch": 1.914528489607142,
      "grad_norm": 6.5,
      "learning_rate": 3.1722753454761366e-06,
      "loss": 0.8476,
      "step": 89000
    },
    {
      "epoch": 1.9360402269488288,
      "grad_norm": 6.8125,
      "learning_rate": 3.062442470091809e-06,
      "loss": 0.8548,
      "step": 90000
    },
    {
      "epoch": 1.957551964290516,
      "grad_norm": 6.90625,
      "learning_rate": 2.953698994955446e-06,
      "loss": 0.8512,
      "step": 91000
    },
    {
      "epoch": 1.979063701632203,
      "grad_norm": 6.5,
      "learning_rate": 2.8461060615644975e-06,
      "loss": 0.841,
      "step": 92000
    },
    {
      "epoch": 2.0005808169082258,
      "grad_norm": 8.0,
      "learning_rate": 2.7397241645192564e-06,
      "loss": 0.8516,
      "step": 93000
    },
    {
      "epoch": 2.0220925542499124,
      "grad_norm": 7.0,
      "learning_rate": 2.6346131175095015e-06,
      "loss": 0.8362,
      "step": 94000
    },
    {
      "epoch": 2.0436042915915995,
      "grad_norm": 10.0,
      "learning_rate": 2.530832019683983e-06,
      "loss": 0.8358,
      "step": 95000
    },
    {
      "epoch": 2.0651160289332866,
      "grad_norm": 6.4375,
      "learning_rate": 2.4284392224216755e-06,
      "loss": 0.8403,
      "step": 96000
    },
    {
      "epoch": 2.0866277662749737,
      "grad_norm": 5.03125,
      "learning_rate": 2.327492296523444e-06,
      "loss": 0.8289,
      "step": 97000
    },
    {
      "epoch": 2.108139503616661,
      "grad_norm": 6.6875,
      "learning_rate": 2.228047999842622e-06,
      "loss": 0.8394,
      "step": 98000
    },
    {
      "epoch": 2.129651240958348,
      "grad_norm": 9.0,
      "learning_rate": 2.130162245372649e-06,
      "loss": 0.8294,
      "step": 99000
    }
  ],
  "logging_steps": 1000,
  "max_steps": 139461,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.112776494664294e+19,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}