{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.129651240958348, "eval_steps": 500, "global_step": 99000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020319627744419724, "grad_norm": 7.625, "learning_rate": 1.3531084924827305e-06, "loss": 2.0186, "step": 1000 }, { "epoch": 0.04063925548883945, "grad_norm": 6.03125, "learning_rate": 2.7075714479208997e-06, "loss": 1.4792, "step": 2000 }, { "epoch": 0.06095888323325917, "grad_norm": 7.4375, "learning_rate": 4.062034403359069e-06, "loss": 1.3553, "step": 3000 }, { "epoch": 0.0812785109776789, "grad_norm": 9.5625, "learning_rate": 5.416497358797237e-06, "loss": 1.2948, "step": 4000 }, { "epoch": 0.10159813872209861, "grad_norm": 4.875, "learning_rate": 6.7709603142354064e-06, "loss": 1.2579, "step": 5000 }, { "epoch": 0.12901283677725933, "grad_norm": 4.5625, "learning_rate": 8.598251397448762e-06, "loss": 1.2317, "step": 6000 }, { "epoch": 0.1505149762401359, "grad_norm": 5.34375, "learning_rate": 9.999999320225357e-06, "loss": 1.1979, "step": 7000 }, { "epoch": 0.17201711570301245, "grad_norm": 4.125, "learning_rate": 9.998533101166477e-06, "loss": 1.1811, "step": 8000 }, { "epoch": 0.193519255165889, "grad_norm": 3.859375, "learning_rate": 9.994258851483552e-06, "loss": 1.1605, "step": 9000 }, { "epoch": 0.21502139462876557, "grad_norm": 3.984375, "learning_rate": 9.987178972325833e-06, "loss": 1.1434, "step": 10000 }, { "epoch": 0.23652353409164212, "grad_norm": 3.84375, "learning_rate": 9.977297440963669e-06, "loss": 1.1348, "step": 11000 }, { "epoch": 0.25802567355451866, "grad_norm": 4.28125, "learning_rate": 9.964619808554195e-06, "loss": 1.1287, "step": 12000 }, { "epoch": 0.2795278130173952, "grad_norm": 3.4375, "learning_rate": 9.949153197022848e-06, "loss": 1.1108, "step": 13000 }, { "epoch": 0.3010299524802718, "grad_norm": 4.15625, "learning_rate": 9.930906295062477e-06, "loss": 1.1053, "step": 14000 }, { "epoch": 0.32253209194314836, "grad_norm": 3.75, "learning_rate": 9.909889353252299e-06, "loss": 1.0951, "step": 15000 }, { "epoch": 0.3440342314060249, "grad_norm": 3.78125, "learning_rate": 9.886114178299407e-06, "loss": 1.0883, "step": 16000 }, { "epoch": 0.36553637086890145, "grad_norm": 5.5, "learning_rate": 9.85959412640611e-06, "loss": 1.0752, "step": 17000 }, { "epoch": 0.387038510331778, "grad_norm": 3.796875, "learning_rate": 9.830344095766812e-06, "loss": 1.0785, "step": 18000 }, { "epoch": 0.4087230094920541, "grad_norm": 4.34375, "learning_rate": 9.798108131271342e-06, "loss": 1.0434, "step": 19000 }, { "epoch": 0.43023474683374113, "grad_norm": 3.828125, "learning_rate": 9.763411510439176e-06, "loss": 1.0456, "step": 20000 }, { "epoch": 0.45174648417542823, "grad_norm": 4.6875, "learning_rate": 9.72603664052252e-06, "loss": 1.0324, "step": 21000 }, { "epoch": 0.4732582215171153, "grad_norm": 3.78125, "learning_rate": 9.686004535706463e-06, "loss": 1.0406, "step": 22000 }, { "epoch": 0.4947699588588023, "grad_norm": 3.65625, "learning_rate": 9.64333770421811e-06, "loss": 1.0267, "step": 23000 }, { "epoch": 0.5162816962004894, "grad_norm": 6.53125, "learning_rate": 9.598060135671232e-06, "loss": 1.0287, "step": 24000 }, { "epoch": 0.5377934335421765, "grad_norm": 3.90625, "learning_rate": 9.550197287578003e-06, "loss": 1.0162, "step": 25000 }, { "epoch": 0.5593051708838636, "grad_norm": 5.78125, "learning_rate": 9.499776071035394e-06, "loss": 1.0197, "step": 26000 }, { "epoch": 0.5808169082255505, "grad_norm": 6.28125, "learning_rate": 9.446824835594304e-06, "loss": 1.0163, "step": 27000 }, { "epoch": 0.6023286455672376, "grad_norm": 3.90625, "learning_rate": 9.391373353319884e-06, "loss": 1.0169, "step": 28000 }, { "epoch": 0.6238403829089246, "grad_norm": 3.734375, "learning_rate": 9.333452802052072e-06, "loss": 1.0079, "step": 29000 }, { "epoch": 0.6453521202506117, "grad_norm": 4.0625, "learning_rate": 9.273095747875717e-06, "loss": 1.0061, "step": 30000 }, { "epoch": 0.6668638575922988, "grad_norm": 3.859375, "learning_rate": 9.210336126810147e-06, "loss": 1.0068, "step": 31000 }, { "epoch": 0.6883755949339858, "grad_norm": 3.859375, "learning_rate": 9.145209225728495e-06, "loss": 0.9983, "step": 32000 }, { "epoch": 0.7098873322756729, "grad_norm": 4.125, "learning_rate": 9.077751662517505e-06, "loss": 0.9988, "step": 33000 }, { "epoch": 0.73139906961736, "grad_norm": 4.0625, "learning_rate": 9.00800136548896e-06, "loss": 0.9865, "step": 34000 }, { "epoch": 0.752910806959047, "grad_norm": 4.0, "learning_rate": 8.93599755205432e-06, "loss": 0.9917, "step": 35000 }, { "epoch": 0.7744225443007341, "grad_norm": 4.5625, "learning_rate": 8.861780706674562e-06, "loss": 0.9929, "step": 36000 }, { "epoch": 0.7959342816424212, "grad_norm": 3.984375, "learning_rate": 8.785392558097612e-06, "loss": 0.9844, "step": 37000 }, { "epoch": 0.8174460189841082, "grad_norm": 4.5, "learning_rate": 8.706876055896176e-06, "loss": 0.9879, "step": 38000 }, { "epoch": 0.8389577563257953, "grad_norm": 3.953125, "learning_rate": 8.62627534631915e-06, "loss": 0.9858, "step": 39000 }, { "epoch": 0.8604694936674823, "grad_norm": 3.96875, "learning_rate": 8.5436357474702e-06, "loss": 0.9782, "step": 40000 }, { "epoch": 0.8819812310091694, "grad_norm": 4.34375, "learning_rate": 8.45900372382746e-06, "loss": 0.9819, "step": 41000 }, { "epoch": 0.9034929683508565, "grad_norm": 4.53125, "learning_rate": 8.372426860118667e-06, "loss": 0.9706, "step": 42000 }, { "epoch": 0.9250047056925434, "grad_norm": 4.375, "learning_rate": 8.283953834566449e-06, "loss": 0.9792, "step": 43000 }, { "epoch": 0.9465164430342305, "grad_norm": 8.875, "learning_rate": 8.193634391518774e-06, "loss": 0.9709, "step": 44000 }, { "epoch": 0.9680281803759176, "grad_norm": 4.90625, "learning_rate": 8.101519313479972e-06, "loss": 0.9686, "step": 45000 }, { "epoch": 0.9895399177176046, "grad_norm": 4.125, "learning_rate": 8.00766039255805e-06, "loss": 0.9668, "step": 46000 }, { "epoch": 1.011057032993627, "grad_norm": 5.03125, "learning_rate": 7.912110401344347e-06, "loss": 0.9404, "step": 47000 }, { "epoch": 1.0325687703353141, "grad_norm": 5.28125, "learning_rate": 7.814923063241916e-06, "loss": 0.9154, "step": 48000 }, { "epoch": 1.0540805076770012, "grad_norm": 5.9375, "learning_rate": 7.71615302225931e-06, "loss": 0.9131, "step": 49000 }, { "epoch": 1.0755922450186883, "grad_norm": 5.71875, "learning_rate": 7.615855812286735e-06, "loss": 0.9124, "step": 50000 }, { "epoch": 1.0971039823603754, "grad_norm": 5.28125, "learning_rate": 7.514087825871885e-06, "loss": 0.9144, "step": 51000 }, { "epoch": 1.1186157197020625, "grad_norm": 5.53125, "learning_rate": 7.410906282512981e-06, "loss": 0.9054, "step": 52000 }, { "epoch": 1.1401274570437494, "grad_norm": 5.875, "learning_rate": 7.306369196486855e-06, "loss": 0.9162, "step": 53000 }, { "epoch": 1.1616176826480948, "grad_norm": 6.15625, "learning_rate": 7.20053534423017e-06, "loss": 0.9378, "step": 54000 }, { "epoch": 1.183129419989782, "grad_norm": 5.375, "learning_rate": 7.093464231292111e-06, "loss": 0.9335, "step": 55000 }, { "epoch": 1.204641157331469, "grad_norm": 5.09375, "learning_rate": 6.985216058877125e-06, "loss": 0.937, "step": 56000 }, { "epoch": 1.2261528946731561, "grad_norm": 5.25, "learning_rate": 6.875851689996526e-06, "loss": 0.9275, "step": 57000 }, { "epoch": 1.247664632014843, "grad_norm": 5.3125, "learning_rate": 6.765432615248008e-06, "loss": 0.9307, "step": 58000 }, { "epoch": 1.26917636935653, "grad_norm": 5.78125, "learning_rate": 6.6540209182422785e-06, "loss": 0.9338, "step": 59000 }, { "epoch": 1.2906881066982172, "grad_norm": 5.90625, "learning_rate": 6.5416792406962785e-06, "loss": 0.9314, "step": 60000 }, { "epoch": 1.3121998440399043, "grad_norm": 5.4375, "learning_rate": 6.4284707472126e-06, "loss": 0.9287, "step": 61000 }, { "epoch": 1.3337115813815914, "grad_norm": 4.875, "learning_rate": 6.3144590897649084e-06, "loss": 0.9294, "step": 62000 }, { "epoch": 1.3552233187232785, "grad_norm": 6.0, "learning_rate": 6.199708371909345e-06, "loss": 0.9383, "step": 63000 }, { "epoch": 1.3767350560649654, "grad_norm": 5.25, "learning_rate": 6.0842831127420196e-06, "loss": 0.9376, "step": 64000 }, { "epoch": 1.3982467934066525, "grad_norm": 5.71875, "learning_rate": 5.968248210622858e-06, "loss": 0.8902, "step": 65000 }, { "epoch": 1.4197585307483396, "grad_norm": 5.5, "learning_rate": 5.851668906686223e-06, "loss": 0.8611, "step": 66000 }, { "epoch": 1.4412702680900267, "grad_norm": 5.15625, "learning_rate": 5.734610748158791e-06, "loss": 0.8572, "step": 67000 }, { "epoch": 1.4627820054317136, "grad_norm": 5.78125, "learning_rate": 5.617139551505345e-06, "loss": 0.8541, "step": 68000 }, { "epoch": 1.4842937427734006, "grad_norm": 6.21875, "learning_rate": 5.499321365423167e-06, "loss": 0.8559, "step": 69000 }, { "epoch": 1.5058054801150877, "grad_norm": 6.28125, "learning_rate": 5.381222433705873e-06, "loss": 0.858, "step": 70000 }, { "epoch": 1.5273172174567748, "grad_norm": 6.3125, "learning_rate": 5.262909157997551e-06, "loss": 0.8509, "step": 71000 }, { "epoch": 1.548828954798462, "grad_norm": 6.21875, "learning_rate": 5.144448060458137e-06, "loss": 0.859, "step": 72000 }, { "epoch": 1.570340692140149, "grad_norm": 5.375, "learning_rate": 5.025905746361047e-06, "loss": 0.8419, "step": 73000 }, { "epoch": 1.5918524294818361, "grad_norm": 6.125, "learning_rate": 4.907348866644061e-06, "loss": 0.8584, "step": 74000 }, { "epoch": 1.6133641668235232, "grad_norm": 5.71875, "learning_rate": 4.78884408043454e-06, "loss": 0.8502, "step": 75000 }, { "epoch": 1.6348759041652101, "grad_norm": 6.875, "learning_rate": 4.670458017570048e-06, "loss": 0.8572, "step": 76000 }, { "epoch": 1.6563876415068972, "grad_norm": 9.1875, "learning_rate": 4.552257241135419e-06, "loss": 0.8482, "step": 77000 }, { "epoch": 1.6778993788485843, "grad_norm": 6.46875, "learning_rate": 4.434308210037382e-06, "loss": 0.8481, "step": 78000 }, { "epoch": 1.6994111161902712, "grad_norm": 6.1875, "learning_rate": 4.316677241637737e-06, "loss": 0.8472, "step": 79000 }, { "epoch": 1.7209228535319583, "grad_norm": 5.75, "learning_rate": 4.1994304744661385e-06, "loss": 0.8417, "step": 80000 }, { "epoch": 1.7424345908736454, "grad_norm": 5.75, "learning_rate": 4.082633831033406e-06, "loss": 0.8441, "step": 81000 }, { "epoch": 1.7639463282153325, "grad_norm": 6.6875, "learning_rate": 3.966352980766305e-06, "loss": 0.8517, "step": 82000 }, { "epoch": 1.7854580655570196, "grad_norm": 6.09375, "learning_rate": 3.850653303084625e-06, "loss": 0.8474, "step": 83000 }, { "epoch": 1.8069698028987067, "grad_norm": 7.25, "learning_rate": 3.7355998506413144e-06, "loss": 0.8467, "step": 84000 }, { "epoch": 1.8284815402403938, "grad_norm": 7.4375, "learning_rate": 3.6212573127463314e-06, "loss": 0.8484, "step": 85000 }, { "epoch": 1.8499932775820809, "grad_norm": 5.8125, "learning_rate": 3.507689978994806e-06, "loss": 0.8439, "step": 86000 }, { "epoch": 1.8715050149237678, "grad_norm": 6.09375, "learning_rate": 3.3949617031199265e-06, "loss": 0.8488, "step": 87000 }, { "epoch": 1.8930167522654548, "grad_norm": 6.3125, "learning_rate": 3.283135867090894e-06, "loss": 0.8412, "step": 88000 }, { "epoch": 1.914528489607142, "grad_norm": 6.5, "learning_rate": 3.1722753454761366e-06, "loss": 0.8476, "step": 89000 }, { "epoch": 1.9360402269488288, "grad_norm": 6.8125, "learning_rate": 3.062442470091809e-06, "loss": 0.8548, "step": 90000 }, { "epoch": 1.957551964290516, "grad_norm": 6.90625, "learning_rate": 2.953698994955446e-06, "loss": 0.8512, "step": 91000 }, { "epoch": 1.979063701632203, "grad_norm": 6.5, "learning_rate": 2.8461060615644975e-06, "loss": 0.841, "step": 92000 }, { "epoch": 2.0005808169082258, "grad_norm": 8.0, "learning_rate": 2.7397241645192564e-06, "loss": 0.8516, "step": 93000 }, { "epoch": 2.0220925542499124, "grad_norm": 7.0, "learning_rate": 2.6346131175095015e-06, "loss": 0.8362, "step": 94000 }, { "epoch": 2.0436042915915995, "grad_norm": 10.0, "learning_rate": 2.530832019683983e-06, "loss": 0.8358, "step": 95000 }, { "epoch": 2.0651160289332866, "grad_norm": 6.4375, "learning_rate": 2.4284392224216755e-06, "loss": 0.8403, "step": 96000 }, { "epoch": 2.0866277662749737, "grad_norm": 5.03125, "learning_rate": 2.327492296523444e-06, "loss": 0.8289, "step": 97000 }, { "epoch": 2.108139503616661, "grad_norm": 6.6875, "learning_rate": 2.228047999842622e-06, "loss": 0.8394, "step": 98000 }, { "epoch": 2.129651240958348, "grad_norm": 9.0, "learning_rate": 2.130162245372649e-06, "loss": 0.8294, "step": 99000 } ], "logging_steps": 1000, "max_steps": 139461, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.112776494664294e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }