{}
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.129651240958348,
"eval_steps": 500,
"global_step": 99000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.020319627744419724,
"grad_norm": 7.625,
"learning_rate": 1.3531084924827305e-06,
"loss": 2.0186,
"step": 1000
},
{
"epoch": 0.04063925548883945,
"grad_norm": 6.03125,
"learning_rate": 2.7075714479208997e-06,
"loss": 1.4792,
"step": 2000
},
{
"epoch": 0.06095888323325917,
"grad_norm": 7.4375,
"learning_rate": 4.062034403359069e-06,
"loss": 1.3553,
"step": 3000
},
{
"epoch": 0.0812785109776789,
"grad_norm": 9.5625,
"learning_rate": 5.416497358797237e-06,
"loss": 1.2948,
"step": 4000
},
{
"epoch": 0.10159813872209861,
"grad_norm": 4.875,
"learning_rate": 6.7709603142354064e-06,
"loss": 1.2579,
"step": 5000
},
{
"epoch": 0.12901283677725933,
"grad_norm": 4.5625,
"learning_rate": 8.598251397448762e-06,
"loss": 1.2317,
"step": 6000
},
{
"epoch": 0.1505149762401359,
"grad_norm": 5.34375,
"learning_rate": 9.999999320225357e-06,
"loss": 1.1979,
"step": 7000
},
{
"epoch": 0.17201711570301245,
"grad_norm": 4.125,
"learning_rate": 9.998533101166477e-06,
"loss": 1.1811,
"step": 8000
},
{
"epoch": 0.193519255165889,
"grad_norm": 3.859375,
"learning_rate": 9.994258851483552e-06,
"loss": 1.1605,
"step": 9000
},
{
"epoch": 0.21502139462876557,
"grad_norm": 3.984375,
"learning_rate": 9.987178972325833e-06,
"loss": 1.1434,
"step": 10000
},
{
"epoch": 0.23652353409164212,
"grad_norm": 3.84375,
"learning_rate": 9.977297440963669e-06,
"loss": 1.1348,
"step": 11000
},
{
"epoch": 0.25802567355451866,
"grad_norm": 4.28125,
"learning_rate": 9.964619808554195e-06,
"loss": 1.1287,
"step": 12000
},
{
"epoch": 0.2795278130173952,
"grad_norm": 3.4375,
"learning_rate": 9.949153197022848e-06,
"loss": 1.1108,
"step": 13000
},
{
"epoch": 0.3010299524802718,
"grad_norm": 4.15625,
"learning_rate": 9.930906295062477e-06,
"loss": 1.1053,
"step": 14000
},
{
"epoch": 0.32253209194314836,
"grad_norm": 3.75,
"learning_rate": 9.909889353252299e-06,
"loss": 1.0951,
"step": 15000
},
{
"epoch": 0.3440342314060249,
"grad_norm": 3.78125,
"learning_rate": 9.886114178299407e-06,
"loss": 1.0883,
"step": 16000
},
{
"epoch": 0.36553637086890145,
"grad_norm": 5.5,
"learning_rate": 9.85959412640611e-06,
"loss": 1.0752,
"step": 17000
},
{
"epoch": 0.387038510331778,
"grad_norm": 3.796875,
"learning_rate": 9.830344095766812e-06,
"loss": 1.0785,
"step": 18000
},
{
"epoch": 0.4087230094920541,
"grad_norm": 4.34375,
"learning_rate": 9.798108131271342e-06,
"loss": 1.0434,
"step": 19000
},
{
"epoch": 0.43023474683374113,
"grad_norm": 3.828125,
"learning_rate": 9.763411510439176e-06,
"loss": 1.0456,
"step": 20000
},
{
"epoch": 0.45174648417542823,
"grad_norm": 4.6875,
"learning_rate": 9.72603664052252e-06,
"loss": 1.0324,
"step": 21000
},
{
"epoch": 0.4732582215171153,
"grad_norm": 3.78125,
"learning_rate": 9.686004535706463e-06,
"loss": 1.0406,
"step": 22000
},
{
"epoch": 0.4947699588588023,
"grad_norm": 3.65625,
"learning_rate": 9.64333770421811e-06,
"loss": 1.0267,
"step": 23000
},
{
"epoch": 0.5162816962004894,
"grad_norm": 6.53125,
"learning_rate": 9.598060135671232e-06,
"loss": 1.0287,
"step": 24000
},
{
"epoch": 0.5377934335421765,
"grad_norm": 3.90625,
"learning_rate": 9.550197287578003e-06,
"loss": 1.0162,
"step": 25000
},
{
"epoch": 0.5593051708838636,
"grad_norm": 5.78125,
"learning_rate": 9.499776071035394e-06,
"loss": 1.0197,
"step": 26000
},
{
"epoch": 0.5808169082255505,
"grad_norm": 6.28125,
"learning_rate": 9.446824835594304e-06,
"loss": 1.0163,
"step": 27000
},
{
"epoch": 0.6023286455672376,
"grad_norm": 3.90625,
"learning_rate": 9.391373353319884e-06,
"loss": 1.0169,
"step": 28000
},
{
"epoch": 0.6238403829089246,
"grad_norm": 3.734375,
"learning_rate": 9.333452802052072e-06,
"loss": 1.0079,
"step": 29000
},
{
"epoch": 0.6453521202506117,
"grad_norm": 4.0625,
"learning_rate": 9.273095747875717e-06,
"loss": 1.0061,
"step": 30000
},
{
"epoch": 0.6668638575922988,
"grad_norm": 3.859375,
"learning_rate": 9.210336126810147e-06,
"loss": 1.0068,
"step": 31000
},
{
"epoch": 0.6883755949339858,
"grad_norm": 3.859375,
"learning_rate": 9.145209225728495e-06,
"loss": 0.9983,
"step": 32000
},
{
"epoch": 0.7098873322756729,
"grad_norm": 4.125,
"learning_rate": 9.077751662517505e-06,
"loss": 0.9988,
"step": 33000
},
{
"epoch": 0.73139906961736,
"grad_norm": 4.0625,
"learning_rate": 9.00800136548896e-06,
"loss": 0.9865,
"step": 34000
},
{
"epoch": 0.752910806959047,
"grad_norm": 4.0,
"learning_rate": 8.93599755205432e-06,
"loss": 0.9917,
"step": 35000
},
{
"epoch": 0.7744225443007341,
"grad_norm": 4.5625,
"learning_rate": 8.861780706674562e-06,
"loss": 0.9929,
"step": 36000
},
{
"epoch": 0.7959342816424212,
"grad_norm": 3.984375,
"learning_rate": 8.785392558097612e-06,
"loss": 0.9844,
"step": 37000
},
{
"epoch": 0.8174460189841082,
"grad_norm": 4.5,
"learning_rate": 8.706876055896176e-06,
"loss": 0.9879,
"step": 38000
},
{
"epoch": 0.8389577563257953,
"grad_norm": 3.953125,
"learning_rate": 8.62627534631915e-06,
"loss": 0.9858,
"step": 39000
},
{
"epoch": 0.8604694936674823,
"grad_norm": 3.96875,
"learning_rate": 8.5436357474702e-06,
"loss": 0.9782,
"step": 40000
},
{
"epoch": 0.8819812310091694,
"grad_norm": 4.34375,
"learning_rate": 8.45900372382746e-06,
"loss": 0.9819,
"step": 41000
},
{
"epoch": 0.9034929683508565,
"grad_norm": 4.53125,
"learning_rate": 8.372426860118667e-06,
"loss": 0.9706,
"step": 42000
},
{
"epoch": 0.9250047056925434,
"grad_norm": 4.375,
"learning_rate": 8.283953834566449e-06,
"loss": 0.9792,
"step": 43000
},
{
"epoch": 0.9465164430342305,
"grad_norm": 8.875,
"learning_rate": 8.193634391518774e-06,
"loss": 0.9709,
"step": 44000
},
{
"epoch": 0.9680281803759176,
"grad_norm": 4.90625,
"learning_rate": 8.101519313479972e-06,
"loss": 0.9686,
"step": 45000
},
{
"epoch": 0.9895399177176046,
"grad_norm": 4.125,
"learning_rate": 8.00766039255805e-06,
"loss": 0.9668,
"step": 46000
},
{
"epoch": 1.011057032993627,
"grad_norm": 5.03125,
"learning_rate": 7.912110401344347e-06,
"loss": 0.9404,
"step": 47000
},
{
"epoch": 1.0325687703353141,
"grad_norm": 5.28125,
"learning_rate": 7.814923063241916e-06,
"loss": 0.9154,
"step": 48000
},
{
"epoch": 1.0540805076770012,
"grad_norm": 5.9375,
"learning_rate": 7.71615302225931e-06,
"loss": 0.9131,
"step": 49000
},
{
"epoch": 1.0755922450186883,
"grad_norm": 5.71875,
"learning_rate": 7.615855812286735e-06,
"loss": 0.9124,
"step": 50000
},
{
"epoch": 1.0971039823603754,
"grad_norm": 5.28125,
"learning_rate": 7.514087825871885e-06,
"loss": 0.9144,
"step": 51000
},
{
"epoch": 1.1186157197020625,
"grad_norm": 5.53125,
"learning_rate": 7.410906282512981e-06,
"loss": 0.9054,
"step": 52000
},
{
"epoch": 1.1401274570437494,
"grad_norm": 5.875,
"learning_rate": 7.306369196486855e-06,
"loss": 0.9162,
"step": 53000
},
{
"epoch": 1.1616176826480948,
"grad_norm": 6.15625,
"learning_rate": 7.20053534423017e-06,
"loss": 0.9378,
"step": 54000
},
{
"epoch": 1.183129419989782,
"grad_norm": 5.375,
"learning_rate": 7.093464231292111e-06,
"loss": 0.9335,
"step": 55000
},
{
"epoch": 1.204641157331469,
"grad_norm": 5.09375,
"learning_rate": 6.985216058877125e-06,
"loss": 0.937,
"step": 56000
},
{
"epoch": 1.2261528946731561,
"grad_norm": 5.25,
"learning_rate": 6.875851689996526e-06,
"loss": 0.9275,
"step": 57000
},
{
"epoch": 1.247664632014843,
"grad_norm": 5.3125,
"learning_rate": 6.765432615248008e-06,
"loss": 0.9307,
"step": 58000
},
{
"epoch": 1.26917636935653,
"grad_norm": 5.78125,
"learning_rate": 6.6540209182422785e-06,
"loss": 0.9338,
"step": 59000
},
{
"epoch": 1.2906881066982172,
"grad_norm": 5.90625,
"learning_rate": 6.5416792406962785e-06,
"loss": 0.9314,
"step": 60000
},
{
"epoch": 1.3121998440399043,
"grad_norm": 5.4375,
"learning_rate": 6.4284707472126e-06,
"loss": 0.9287,
"step": 61000
},
{
"epoch": 1.3337115813815914,
"grad_norm": 4.875,
"learning_rate": 6.3144590897649084e-06,
"loss": 0.9294,
"step": 62000
},
{
"epoch": 1.3552233187232785,
"grad_norm": 6.0,
"learning_rate": 6.199708371909345e-06,
"loss": 0.9383,
"step": 63000
},
{
"epoch": 1.3767350560649654,
"grad_norm": 5.25,
"learning_rate": 6.0842831127420196e-06,
"loss": 0.9376,
"step": 64000
},
{
"epoch": 1.3982467934066525,
"grad_norm": 5.71875,
"learning_rate": 5.968248210622858e-06,
"loss": 0.8902,
"step": 65000
},
{
"epoch": 1.4197585307483396,
"grad_norm": 5.5,
"learning_rate": 5.851668906686223e-06,
"loss": 0.8611,
"step": 66000
},
{
"epoch": 1.4412702680900267,
"grad_norm": 5.15625,
"learning_rate": 5.734610748158791e-06,
"loss": 0.8572,
"step": 67000
},
{
"epoch": 1.4627820054317136,
"grad_norm": 5.78125,
"learning_rate": 5.617139551505345e-06,
"loss": 0.8541,
"step": 68000
},
{
"epoch": 1.4842937427734006,
"grad_norm": 6.21875,
"learning_rate": 5.499321365423167e-06,
"loss": 0.8559,
"step": 69000
},
{
"epoch": 1.5058054801150877,
"grad_norm": 6.28125,
"learning_rate": 5.381222433705873e-06,
"loss": 0.858,
"step": 70000
},
{
"epoch": 1.5273172174567748,
"grad_norm": 6.3125,
"learning_rate": 5.262909157997551e-06,
"loss": 0.8509,
"step": 71000
},
{
"epoch": 1.548828954798462,
"grad_norm": 6.21875,
"learning_rate": 5.144448060458137e-06,
"loss": 0.859,
"step": 72000
},
{
"epoch": 1.570340692140149,
"grad_norm": 5.375,
"learning_rate": 5.025905746361047e-06,
"loss": 0.8419,
"step": 73000
},
{
"epoch": 1.5918524294818361,
"grad_norm": 6.125,
"learning_rate": 4.907348866644061e-06,
"loss": 0.8584,
"step": 74000
},
{
"epoch": 1.6133641668235232,
"grad_norm": 5.71875,
"learning_rate": 4.78884408043454e-06,
"loss": 0.8502,
"step": 75000
},
{
"epoch": 1.6348759041652101,
"grad_norm": 6.875,
"learning_rate": 4.670458017570048e-06,
"loss": 0.8572,
"step": 76000
},
{
"epoch": 1.6563876415068972,
"grad_norm": 9.1875,
"learning_rate": 4.552257241135419e-06,
"loss": 0.8482,
"step": 77000
},
{
"epoch": 1.6778993788485843,
"grad_norm": 6.46875,
"learning_rate": 4.434308210037382e-06,
"loss": 0.8481,
"step": 78000
},
{
"epoch": 1.6994111161902712,
"grad_norm": 6.1875,
"learning_rate": 4.316677241637737e-06,
"loss": 0.8472,
"step": 79000
},
{
"epoch": 1.7209228535319583,
"grad_norm": 5.75,
"learning_rate": 4.1994304744661385e-06,
"loss": 0.8417,
"step": 80000
},
{
"epoch": 1.7424345908736454,
"grad_norm": 5.75,
"learning_rate": 4.082633831033406e-06,
"loss": 0.8441,
"step": 81000
},
{
"epoch": 1.7639463282153325,
"grad_norm": 6.6875,
"learning_rate": 3.966352980766305e-06,
"loss": 0.8517,
"step": 82000
},
{
"epoch": 1.7854580655570196,
"grad_norm": 6.09375,
"learning_rate": 3.850653303084625e-06,
"loss": 0.8474,
"step": 83000
},
{
"epoch": 1.8069698028987067,
"grad_norm": 7.25,
"learning_rate": 3.7355998506413144e-06,
"loss": 0.8467,
"step": 84000
},
{
"epoch": 1.8284815402403938,
"grad_norm": 7.4375,
"learning_rate": 3.6212573127463314e-06,
"loss": 0.8484,
"step": 85000
},
{
"epoch": 1.8499932775820809,
"grad_norm": 5.8125,
"learning_rate": 3.507689978994806e-06,
"loss": 0.8439,
"step": 86000
},
{
"epoch": 1.8715050149237678,
"grad_norm": 6.09375,
"learning_rate": 3.3949617031199265e-06,
"loss": 0.8488,
"step": 87000
},
{
"epoch": 1.8930167522654548,
"grad_norm": 6.3125,
"learning_rate": 3.283135867090894e-06,
"loss": 0.8412,
"step": 88000
},
{
"epoch": 1.914528489607142,
"grad_norm": 6.5,
"learning_rate": 3.1722753454761366e-06,
"loss": 0.8476,
"step": 89000
},
{
"epoch": 1.9360402269488288,
"grad_norm": 6.8125,
"learning_rate": 3.062442470091809e-06,
"loss": 0.8548,
"step": 90000
},
{
"epoch": 1.957551964290516,
"grad_norm": 6.90625,
"learning_rate": 2.953698994955446e-06,
"loss": 0.8512,
"step": 91000
},
{
"epoch": 1.979063701632203,
"grad_norm": 6.5,
"learning_rate": 2.8461060615644975e-06,
"loss": 0.841,
"step": 92000
},
{
"epoch": 2.0005808169082258,
"grad_norm": 8.0,
"learning_rate": 2.7397241645192564e-06,
"loss": 0.8516,
"step": 93000
},
{
"epoch": 2.0220925542499124,
"grad_norm": 7.0,
"learning_rate": 2.6346131175095015e-06,
"loss": 0.8362,
"step": 94000
},
{
"epoch": 2.0436042915915995,
"grad_norm": 10.0,
"learning_rate": 2.530832019683983e-06,
"loss": 0.8358,
"step": 95000
},
{
"epoch": 2.0651160289332866,
"grad_norm": 6.4375,
"learning_rate": 2.4284392224216755e-06,
"loss": 0.8403,
"step": 96000
},
{
"epoch": 2.0866277662749737,
"grad_norm": 5.03125,
"learning_rate": 2.327492296523444e-06,
"loss": 0.8289,
"step": 97000
},
{
"epoch": 2.108139503616661,
"grad_norm": 6.6875,
"learning_rate": 2.228047999842622e-06,
"loss": 0.8394,
"step": 98000
},
{
"epoch": 2.129651240958348,
"grad_norm": 9.0,
"learning_rate": 2.130162245372649e-06,
"loss": 0.8294,
"step": 99000
}
],
"logging_steps": 1000,
"max_steps": 139461,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.112776494664294e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}