{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 230,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004347826086956522,
"grad_norm": 210.10928344726562,
"learning_rate": 0.0,
"loss": 5.8188,
"step": 1
},
{
"epoch": 0.008695652173913044,
"grad_norm": 216.5006561279297,
"learning_rate": 2.173913043478261e-06,
"loss": 5.9259,
"step": 2
},
{
"epoch": 0.013043478260869565,
"grad_norm": 144.48963928222656,
"learning_rate": 4.347826086956522e-06,
"loss": 5.646,
"step": 3
},
{
"epoch": 0.017391304347826087,
"grad_norm": 45.486934661865234,
"learning_rate": 6.521739130434783e-06,
"loss": 5.3097,
"step": 4
},
{
"epoch": 0.021739130434782608,
"grad_norm": 83.79264831542969,
"learning_rate": 8.695652173913044e-06,
"loss": 5.3505,
"step": 5
},
{
"epoch": 0.02608695652173913,
"grad_norm": 33.744483947753906,
"learning_rate": 1.0869565217391305e-05,
"loss": 5.1314,
"step": 6
},
{
"epoch": 0.030434782608695653,
"grad_norm": 22.175418853759766,
"learning_rate": 1.3043478260869566e-05,
"loss": 4.8346,
"step": 7
},
{
"epoch": 0.034782608695652174,
"grad_norm": 18.40424156188965,
"learning_rate": 1.5217391304347828e-05,
"loss": 4.7562,
"step": 8
},
{
"epoch": 0.0391304347826087,
"grad_norm": 15.772565841674805,
"learning_rate": 1.739130434782609e-05,
"loss": 4.5057,
"step": 9
},
{
"epoch": 0.043478260869565216,
"grad_norm": 11.410517692565918,
"learning_rate": 1.956521739130435e-05,
"loss": 4.3231,
"step": 10
},
{
"epoch": 0.04782608695652174,
"grad_norm": 14.64340877532959,
"learning_rate": 2.173913043478261e-05,
"loss": 4.3797,
"step": 11
},
{
"epoch": 0.05217391304347826,
"grad_norm": 7.4696946144104,
"learning_rate": 2.391304347826087e-05,
"loss": 3.9548,
"step": 12
},
{
"epoch": 0.05652173913043478,
"grad_norm": 3.1422557830810547,
"learning_rate": 2.608695652173913e-05,
"loss": 3.8226,
"step": 13
},
{
"epoch": 0.06086956521739131,
"grad_norm": 2.6594135761260986,
"learning_rate": 2.826086956521739e-05,
"loss": 3.8783,
"step": 14
},
{
"epoch": 0.06521739130434782,
"grad_norm": 2.0335605144500732,
"learning_rate": 3.0434782608695656e-05,
"loss": 3.626,
"step": 15
},
{
"epoch": 0.06956521739130435,
"grad_norm": 2.045989513397217,
"learning_rate": 3.260869565217392e-05,
"loss": 3.4734,
"step": 16
},
{
"epoch": 0.07391304347826087,
"grad_norm": 1.797641396522522,
"learning_rate": 3.478260869565218e-05,
"loss": 3.3667,
"step": 17
},
{
"epoch": 0.0782608695652174,
"grad_norm": 1.7289575338363647,
"learning_rate": 3.695652173913043e-05,
"loss": 3.2171,
"step": 18
},
{
"epoch": 0.08260869565217391,
"grad_norm": 1.6280560493469238,
"learning_rate": 3.91304347826087e-05,
"loss": 3.0697,
"step": 19
},
{
"epoch": 0.08695652173913043,
"grad_norm": 1.5199931859970093,
"learning_rate": 4.130434782608696e-05,
"loss": 2.9537,
"step": 20
},
{
"epoch": 0.09130434782608696,
"grad_norm": 1.4183111190795898,
"learning_rate": 4.347826086956522e-05,
"loss": 2.8091,
"step": 21
},
{
"epoch": 0.09565217391304348,
"grad_norm": 1.453029990196228,
"learning_rate": 4.565217391304348e-05,
"loss": 2.6457,
"step": 22
},
{
"epoch": 0.1,
"grad_norm": 1.135553002357483,
"learning_rate": 4.782608695652174e-05,
"loss": 2.4701,
"step": 23
},
{
"epoch": 0.10434782608695652,
"grad_norm": 0.9866960644721985,
"learning_rate": 5e-05,
"loss": 2.3948,
"step": 24
},
{
"epoch": 0.10869565217391304,
"grad_norm": 0.8710840344429016,
"learning_rate": 5.217391304347826e-05,
"loss": 2.3239,
"step": 25
},
{
"epoch": 0.11304347826086956,
"grad_norm": 0.8170456886291504,
"learning_rate": 5.4347826086956524e-05,
"loss": 2.1285,
"step": 26
},
{
"epoch": 0.11739130434782609,
"grad_norm": 0.790302038192749,
"learning_rate": 5.652173913043478e-05,
"loss": 2.021,
"step": 27
},
{
"epoch": 0.12173913043478261,
"grad_norm": 0.7848089933395386,
"learning_rate": 5.869565217391305e-05,
"loss": 1.9254,
"step": 28
},
{
"epoch": 0.12608695652173912,
"grad_norm": 0.7707406878471375,
"learning_rate": 6.086956521739131e-05,
"loss": 1.8048,
"step": 29
},
{
"epoch": 0.13043478260869565,
"grad_norm": 0.7862960696220398,
"learning_rate": 6.304347826086957e-05,
"loss": 1.6704,
"step": 30
},
{
"epoch": 0.13478260869565217,
"grad_norm": 0.8184984922409058,
"learning_rate": 6.521739130434783e-05,
"loss": 1.5525,
"step": 31
},
{
"epoch": 0.1391304347826087,
"grad_norm": 0.751800537109375,
"learning_rate": 6.73913043478261e-05,
"loss": 1.4305,
"step": 32
},
{
"epoch": 0.14347826086956522,
"grad_norm": 0.6508727073669434,
"learning_rate": 6.956521739130436e-05,
"loss": 1.3082,
"step": 33
},
{
"epoch": 0.14782608695652175,
"grad_norm": 0.5927818417549133,
"learning_rate": 7.17391304347826e-05,
"loss": 1.2962,
"step": 34
},
{
"epoch": 0.15217391304347827,
"grad_norm": 0.48864519596099854,
"learning_rate": 7.391304347826086e-05,
"loss": 1.1943,
"step": 35
},
{
"epoch": 0.1565217391304348,
"grad_norm": 0.43812891840934753,
"learning_rate": 7.608695652173914e-05,
"loss": 1.1367,
"step": 36
},
{
"epoch": 0.1608695652173913,
"grad_norm": 0.3985790014266968,
"learning_rate": 7.82608695652174e-05,
"loss": 1.0961,
"step": 37
},
{
"epoch": 0.16521739130434782,
"grad_norm": 0.3411348760128021,
"learning_rate": 8.043478260869566e-05,
"loss": 1.0314,
"step": 38
},
{
"epoch": 0.16956521739130434,
"grad_norm": 0.32298171520233154,
"learning_rate": 8.260869565217392e-05,
"loss": 0.9771,
"step": 39
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.30958038568496704,
"learning_rate": 8.478260869565218e-05,
"loss": 0.9268,
"step": 40
},
{
"epoch": 0.1782608695652174,
"grad_norm": 0.2889741063117981,
"learning_rate": 8.695652173913044e-05,
"loss": 0.9256,
"step": 41
},
{
"epoch": 0.1826086956521739,
"grad_norm": 0.24591656029224396,
"learning_rate": 8.91304347826087e-05,
"loss": 0.883,
"step": 42
},
{
"epoch": 0.18695652173913044,
"grad_norm": 0.23997186124324799,
"learning_rate": 9.130434782608696e-05,
"loss": 0.8786,
"step": 43
},
{
"epoch": 0.19130434782608696,
"grad_norm": 0.2006598263978958,
"learning_rate": 9.347826086956522e-05,
"loss": 0.8396,
"step": 44
},
{
"epoch": 0.1956521739130435,
"grad_norm": 0.18479709327220917,
"learning_rate": 9.565217391304348e-05,
"loss": 0.8413,
"step": 45
},
{
"epoch": 0.2,
"grad_norm": 0.17641599476337433,
"learning_rate": 9.782608695652174e-05,
"loss": 0.8359,
"step": 46
},
{
"epoch": 0.20434782608695654,
"grad_norm": 0.15423867106437683,
"learning_rate": 0.0001,
"loss": 0.8058,
"step": 47
},
{
"epoch": 0.20869565217391303,
"grad_norm": 0.1461988240480423,
"learning_rate": 9.999856041607731e-05,
"loss": 0.8029,
"step": 48
},
{
"epoch": 0.21304347826086956,
"grad_norm": 0.12839862704277039,
"learning_rate": 9.999424174720531e-05,
"loss": 0.7822,
"step": 49
},
{
"epoch": 0.21739130434782608,
"grad_norm": 0.12158359587192535,
"learning_rate": 9.998704424206746e-05,
"loss": 0.7748,
"step": 50
},
{
"epoch": 0.2217391304347826,
"grad_norm": 0.1291743963956833,
"learning_rate": 9.997696831512027e-05,
"loss": 0.7661,
"step": 51
},
{
"epoch": 0.22608695652173913,
"grad_norm": 0.12144283205270767,
"learning_rate": 9.99640145465694e-05,
"loss": 0.7869,
"step": 52
},
{
"epoch": 0.23043478260869565,
"grad_norm": 0.1100422814488411,
"learning_rate": 9.994818368233639e-05,
"loss": 0.7777,
"step": 53
},
{
"epoch": 0.23478260869565218,
"grad_norm": 0.0993693619966507,
"learning_rate": 9.992947663401548e-05,
"loss": 0.7473,
"step": 54
},
{
"epoch": 0.2391304347826087,
"grad_norm": 0.0941305086016655,
"learning_rate": 9.990789447882137e-05,
"loss": 0.7516,
"step": 55
},
{
"epoch": 0.24347826086956523,
"grad_norm": 0.09400874376296997,
"learning_rate": 9.988343845952697e-05,
"loss": 0.7517,
"step": 56
},
{
"epoch": 0.24782608695652175,
"grad_norm": 0.083980493247509,
"learning_rate": 9.985610998439197e-05,
"loss": 0.749,
"step": 57
},
{
"epoch": 0.25217391304347825,
"grad_norm": 0.08494170755147934,
"learning_rate": 9.98259106270817e-05,
"loss": 0.7332,
"step": 58
},
{
"epoch": 0.2565217391304348,
"grad_norm": 0.08159317076206207,
"learning_rate": 9.979284212657657e-05,
"loss": 0.7343,
"step": 59
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.08313615620136261,
"learning_rate": 9.97569063870718e-05,
"loss": 0.7211,
"step": 60
},
{
"epoch": 0.26521739130434785,
"grad_norm": 0.07852096855640411,
"learning_rate": 9.971810547786793e-05,
"loss": 0.731,
"step": 61
},
{
"epoch": 0.26956521739130435,
"grad_norm": 0.0774468258023262,
"learning_rate": 9.967644163325156e-05,
"loss": 0.7198,
"step": 62
},
{
"epoch": 0.27391304347826084,
"grad_norm": 0.07157547771930695,
"learning_rate": 9.963191725236672e-05,
"loss": 0.6946,
"step": 63
},
{
"epoch": 0.2782608695652174,
"grad_norm": 0.07179877161979675,
"learning_rate": 9.958453489907673e-05,
"loss": 0.6983,
"step": 64
},
{
"epoch": 0.2826086956521739,
"grad_norm": 0.13720852136611938,
"learning_rate": 9.953429730181653e-05,
"loss": 0.7209,
"step": 65
},
{
"epoch": 0.28695652173913044,
"grad_norm": 0.08586138486862183,
"learning_rate": 9.948120735343566e-05,
"loss": 0.7022,
"step": 66
},
{
"epoch": 0.29130434782608694,
"grad_norm": 0.06595543771982193,
"learning_rate": 9.942526811103152e-05,
"loss": 0.6857,
"step": 67
},
{
"epoch": 0.2956521739130435,
"grad_norm": 0.06423239409923553,
"learning_rate": 9.936648279577349e-05,
"loss": 0.6924,
"step": 68
},
{
"epoch": 0.3,
"grad_norm": 0.07080892473459244,
"learning_rate": 9.930485479271735e-05,
"loss": 0.6963,
"step": 69
},
{
"epoch": 0.30434782608695654,
"grad_norm": 0.06481339782476425,
"learning_rate": 9.924038765061042e-05,
"loss": 0.7055,
"step": 70
},
{
"epoch": 0.30869565217391304,
"grad_norm": 0.07143648713827133,
"learning_rate": 9.91730850816871e-05,
"loss": 0.6761,
"step": 71
},
{
"epoch": 0.3130434782608696,
"grad_norm": 0.06885742396116257,
"learning_rate": 9.91029509614553e-05,
"loss": 0.7111,
"step": 72
},
{
"epoch": 0.3173913043478261,
"grad_norm": 0.06406974792480469,
"learning_rate": 9.902998932847307e-05,
"loss": 0.6971,
"step": 73
},
{
"epoch": 0.3217391304347826,
"grad_norm": 0.06285955011844635,
"learning_rate": 9.895420438411616e-05,
"loss": 0.681,
"step": 74
},
{
"epoch": 0.32608695652173914,
"grad_norm": 0.07179131358861923,
"learning_rate": 9.887560049233605e-05,
"loss": 0.7001,
"step": 75
},
{
"epoch": 0.33043478260869563,
"grad_norm": 0.06652161478996277,
"learning_rate": 9.879418217940873e-05,
"loss": 0.6668,
"step": 76
},
{
"epoch": 0.3347826086956522,
"grad_norm": 0.06445639580488205,
"learning_rate": 9.870995413367397e-05,
"loss": 0.6981,
"step": 77
},
{
"epoch": 0.3391304347826087,
"grad_norm": 0.06834300607442856,
"learning_rate": 9.862292120526535e-05,
"loss": 0.6484,
"step": 78
},
{
"epoch": 0.34347826086956523,
"grad_norm": 0.06481563299894333,
"learning_rate": 9.853308840583109e-05,
"loss": 0.6875,
"step": 79
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.062026482075452805,
"learning_rate": 9.844046090824533e-05,
"loss": 0.6889,
"step": 80
},
{
"epoch": 0.3521739130434783,
"grad_norm": 0.07275456190109253,
"learning_rate": 9.834504404631031e-05,
"loss": 0.6879,
"step": 81
},
{
"epoch": 0.3565217391304348,
"grad_norm": 0.06591422110795975,
"learning_rate": 9.824684331444927e-05,
"loss": 0.6554,
"step": 82
},
{
"epoch": 0.36086956521739133,
"grad_norm": 0.06396066397428513,
"learning_rate": 9.814586436738998e-05,
"loss": 0.6925,
"step": 83
},
{
"epoch": 0.3652173913043478,
"grad_norm": 0.08825157582759857,
"learning_rate": 9.804211301983918e-05,
"loss": 0.6629,
"step": 84
},
{
"epoch": 0.3695652173913043,
"grad_norm": 0.06731634587049484,
"learning_rate": 9.793559524614779e-05,
"loss": 0.6745,
"step": 85
},
{
"epoch": 0.3739130434782609,
"grad_norm": 0.06455274671316147,
"learning_rate": 9.782631717996675e-05,
"loss": 0.6851,
"step": 86
},
{
"epoch": 0.3782608695652174,
"grad_norm": 0.07710668444633484,
"learning_rate": 9.771428511389395e-05,
"loss": 0.6929,
"step": 87
},
{
"epoch": 0.3826086956521739,
"grad_norm": 0.0727052241563797,
"learning_rate": 9.759950549911186e-05,
"loss": 0.6798,
"step": 88
},
{
"epoch": 0.3869565217391304,
"grad_norm": 0.07156208157539368,
"learning_rate": 9.748198494501597e-05,
"loss": 0.6807,
"step": 89
},
{
"epoch": 0.391304347826087,
"grad_norm": 0.0921456515789032,
"learning_rate": 9.736173021883432e-05,
"loss": 0.6435,
"step": 90
},
{
"epoch": 0.39565217391304347,
"grad_norm": 0.09094609320163727,
"learning_rate": 9.723874824523771e-05,
"loss": 0.6874,
"step": 91
},
{
"epoch": 0.4,
"grad_norm": 0.09006571024656296,
"learning_rate": 9.711304610594104e-05,
"loss": 0.6778,
"step": 92
},
{
"epoch": 0.4043478260869565,
"grad_norm": 0.13732297718524933,
"learning_rate": 9.698463103929542e-05,
"loss": 0.6561,
"step": 93
},
{
"epoch": 0.40869565217391307,
"grad_norm": 0.09598764777183533,
"learning_rate": 9.685351043987151e-05,
"loss": 0.6624,
"step": 94
},
{
"epoch": 0.41304347826086957,
"grad_norm": 0.09070798009634018,
"learning_rate": 9.671969185803356e-05,
"loss": 0.6684,
"step": 95
},
{
"epoch": 0.41739130434782606,
"grad_norm": 0.0911954715847969,
"learning_rate": 9.658318299950473e-05,
"loss": 0.6568,
"step": 96
},
{
"epoch": 0.4217391304347826,
"grad_norm": 0.08703230321407318,
"learning_rate": 9.644399172492336e-05,
"loss": 0.6442,
"step": 97
},
{
"epoch": 0.4260869565217391,
"grad_norm": 0.0760849341750145,
"learning_rate": 9.630212604939026e-05,
"loss": 0.6551,
"step": 98
},
{
"epoch": 0.43043478260869567,
"grad_norm": 0.10621879249811172,
"learning_rate": 9.615759414200729e-05,
"loss": 0.6665,
"step": 99
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.08248650282621384,
"learning_rate": 9.601040432540684e-05,
"loss": 0.6752,
"step": 100
},
{
"epoch": 0.4391304347826087,
"grad_norm": 0.10147503018379211,
"learning_rate": 9.586056507527266e-05,
"loss": 0.6602,
"step": 101
},
{
"epoch": 0.4434782608695652,
"grad_norm": 0.1442282497882843,
"learning_rate": 9.570808501985175e-05,
"loss": 0.6704,
"step": 102
},
{
"epoch": 0.44782608695652176,
"grad_norm": 0.11339450627565384,
"learning_rate": 9.555297293945759e-05,
"loss": 0.6631,
"step": 103
},
{
"epoch": 0.45217391304347826,
"grad_norm": 0.15643437206745148,
"learning_rate": 9.539523776596445e-05,
"loss": 0.668,
"step": 104
},
{
"epoch": 0.45652173913043476,
"grad_norm": 0.1856074035167694,
"learning_rate": 9.523488858229313e-05,
"loss": 0.6413,
"step": 105
},
{
"epoch": 0.4608695652173913,
"grad_norm": 0.12280824780464172,
"learning_rate": 9.507193462188791e-05,
"loss": 0.6658,
"step": 106
},
{
"epoch": 0.4652173913043478,
"grad_norm": 0.18749414384365082,
"learning_rate": 9.49063852681848e-05,
"loss": 0.6785,
"step": 107
},
{
"epoch": 0.46956521739130436,
"grad_norm": 0.13954943418502808,
"learning_rate": 9.47382500540714e-05,
"loss": 0.652,
"step": 108
},
{
"epoch": 0.47391304347826085,
"grad_norm": 0.15025292336940765,
"learning_rate": 9.45675386613377e-05,
"loss": 0.6622,
"step": 109
},
{
"epoch": 0.4782608695652174,
"grad_norm": 0.11263363063335419,
"learning_rate": 9.439426092011875e-05,
"loss": 0.6573,
"step": 110
},
{
"epoch": 0.4826086956521739,
"grad_norm": 0.12779393792152405,
"learning_rate": 9.421842680832861e-05,
"loss": 0.6535,
"step": 111
},
{
"epoch": 0.48695652173913045,
"grad_norm": 0.11488567292690277,
"learning_rate": 9.404004645108568e-05,
"loss": 0.6438,
"step": 112
},
{
"epoch": 0.49130434782608695,
"grad_norm": 0.1706668585538864,
"learning_rate": 9.385913012012973e-05,
"loss": 0.6427,
"step": 113
},
{
"epoch": 0.4956521739130435,
"grad_norm": 0.13733729720115662,
"learning_rate": 9.367568823323039e-05,
"loss": 0.6555,
"step": 114
},
{
"epoch": 0.5,
"grad_norm": 0.11061578243970871,
"learning_rate": 9.348973135358734e-05,
"loss": 0.6672,
"step": 115
},
{
"epoch": 0.5043478260869565,
"grad_norm": 0.18926067650318146,
"learning_rate": 9.330127018922194e-05,
"loss": 0.6573,
"step": 116
},
{
"epoch": 0.508695652173913,
"grad_norm": 0.15428727865219116,
"learning_rate": 9.311031559236067e-05,
"loss": 0.6708,
"step": 117
},
{
"epoch": 0.5130434782608696,
"grad_norm": 0.16264328360557556,
"learning_rate": 9.291687855881026e-05,
"loss": 0.6446,
"step": 118
},
{
"epoch": 0.5173913043478261,
"grad_norm": 0.11342114955186844,
"learning_rate": 9.272097022732443e-05,
"loss": 0.6571,
"step": 119
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.15034589171409607,
"learning_rate": 9.252260187896256e-05,
"loss": 0.6408,
"step": 120
},
{
"epoch": 0.5260869565217391,
"grad_norm": 0.21747715771198273,
"learning_rate": 9.232178493644006e-05,
"loss": 0.6346,
"step": 121
},
{
"epoch": 0.5304347826086957,
"grad_norm": 0.27781569957733154,
"learning_rate": 9.211853096347058e-05,
"loss": 0.6541,
"step": 122
},
{
"epoch": 0.5347826086956522,
"grad_norm": 0.2587333023548126,
"learning_rate": 9.191285166410022e-05,
"loss": 0.6516,
"step": 123
},
{
"epoch": 0.5391304347826087,
"grad_norm": 0.16397182643413544,
"learning_rate": 9.170475888203347e-05,
"loss": 0.6716,
"step": 124
},
{
"epoch": 0.5434782608695652,
"grad_norm": 0.12862510979175568,
"learning_rate": 9.149426459995126e-05,
"loss": 0.6596,
"step": 125
},
{
"epoch": 0.5478260869565217,
"grad_norm": 0.15427789092063904,
"learning_rate": 9.128138093882098e-05,
"loss": 0.6588,
"step": 126
},
{
"epoch": 0.5521739130434783,
"grad_norm": 0.22064033150672913,
"learning_rate": 9.106612015719845e-05,
"loss": 0.6314,
"step": 127
},
{
"epoch": 0.5565217391304348,
"grad_norm": 0.1941988468170166,
"learning_rate": 9.08484946505221e-05,
"loss": 0.648,
"step": 128
},
{
"epoch": 0.5608695652173913,
"grad_norm": 0.18163767457008362,
"learning_rate": 9.062851695039915e-05,
"loss": 0.6738,
"step": 129
},
{
"epoch": 0.5652173913043478,
"grad_norm": 0.16294820606708527,
"learning_rate": 9.040619972388403e-05,
"loss": 0.6534,
"step": 130
},
{
"epoch": 0.5695652173913044,
"grad_norm": 0.23330819606781006,
"learning_rate": 9.018155577274892e-05,
"loss": 0.6478,
"step": 131
},
{
"epoch": 0.5739130434782609,
"grad_norm": 0.3880465030670166,
"learning_rate": 8.995459803274664e-05,
"loss": 0.6566,
"step": 132
},
{
"epoch": 0.5782608695652174,
"grad_norm": 0.6047540903091431,
"learning_rate": 8.972533957286573e-05,
"loss": 0.6321,
"step": 133
},
{
"epoch": 0.5826086956521739,
"grad_norm": 0.526760995388031,
"learning_rate": 8.949379359457793e-05,
"loss": 0.6501,
"step": 134
},
{
"epoch": 0.5869565217391305,
"grad_norm": 0.26121070981025696,
"learning_rate": 8.925997343107795e-05,
"loss": 0.6462,
"step": 135
},
{
"epoch": 0.591304347826087,
"grad_norm": 0.3640858232975006,
"learning_rate": 8.902389254651569e-05,
"loss": 0.6378,
"step": 136
},
{
"epoch": 0.5956521739130435,
"grad_norm": 0.3413775861263275,
"learning_rate": 8.8785564535221e-05,
"loss": 0.6549,
"step": 137
},
{
"epoch": 0.6,
"grad_norm": 0.25738435983657837,
"learning_rate": 8.854500312092081e-05,
"loss": 0.6266,
"step": 138
},
{
"epoch": 0.6043478260869565,
"grad_norm": 0.3974941670894623,
"learning_rate": 8.83022221559489e-05,
"loss": 0.6207,
"step": 139
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.3541712462902069,
"learning_rate": 8.805723562044824e-05,
"loss": 0.6623,
"step": 140
},
{
"epoch": 0.6130434782608696,
"grad_norm": 0.29466933012008667,
"learning_rate": 8.781005762156593e-05,
"loss": 0.6753,
"step": 141
},
{
"epoch": 0.6173913043478261,
"grad_norm": 0.429376482963562,
"learning_rate": 8.75607023926409e-05,
"loss": 0.6351,
"step": 142
},
{
"epoch": 0.6217391304347826,
"grad_norm": 0.28085529804229736,
"learning_rate": 8.730918429238428e-05,
"loss": 0.6584,
"step": 143
},
{
"epoch": 0.6260869565217392,
"grad_norm": 0.34451988339424133,
"learning_rate": 8.705551780405263e-05,
"loss": 0.6619,
"step": 144
},
{
"epoch": 0.6304347826086957,
"grad_norm": 0.3307543098926544,
"learning_rate": 8.679971753461387e-05,
"loss": 0.6448,
"step": 145
},
{
"epoch": 0.6347826086956522,
"grad_norm": 0.2655896842479706,
"learning_rate": 8.654179821390621e-05,
"loss": 0.6442,
"step": 146
},
{
"epoch": 0.6391304347826087,
"grad_norm": 0.4360576868057251,
"learning_rate": 8.628177469378995e-05,
"loss": 0.6487,
"step": 147
},
{
"epoch": 0.6434782608695652,
"grad_norm": 0.35094520449638367,
"learning_rate": 8.601966194729227e-05,
"loss": 0.6359,
"step": 148
},
{
"epoch": 0.6478260869565218,
"grad_norm": 0.4109646677970886,
"learning_rate": 8.575547506774497e-05,
"loss": 0.6519,
"step": 149
},
{
"epoch": 0.6521739130434783,
"grad_norm": 0.3401927053928375,
"learning_rate": 8.548922926791545e-05,
"loss": 0.6375,
"step": 150
},
{
"epoch": 0.6565217391304348,
"grad_norm": 0.22073158621788025,
"learning_rate": 8.522093987913062e-05,
"loss": 0.6462,
"step": 151
},
{
"epoch": 0.6608695652173913,
"grad_norm": 0.43310844898223877,
"learning_rate": 8.495062235039411e-05,
"loss": 0.6697,
"step": 152
},
{
"epoch": 0.6652173913043479,
"grad_norm": 0.42843684554100037,
"learning_rate": 8.467829224749665e-05,
"loss": 0.6169,
"step": 153
},
{
"epoch": 0.6695652173913044,
"grad_norm": 0.4057531952857971,
"learning_rate": 8.440396525211975e-05,
"loss": 0.6625,
"step": 154
},
{
"epoch": 0.6739130434782609,
"grad_norm": 0.23454974591732025,
"learning_rate": 8.412765716093272e-05,
"loss": 0.616,
"step": 155
},
{
"epoch": 0.6782608695652174,
"grad_norm": 0.3723919987678528,
"learning_rate": 8.384938388468296e-05,
"loss": 0.6576,
"step": 156
},
{
"epoch": 0.6826086956521739,
"grad_norm": 0.44731444120407104,
"learning_rate": 8.356916144727985e-05,
"loss": 0.6408,
"step": 157
},
{
"epoch": 0.6869565217391305,
"grad_norm": 0.4032682180404663,
"learning_rate": 8.328700598487203e-05,
"loss": 0.6541,
"step": 158
},
{
"epoch": 0.691304347826087,
"grad_norm": 0.34927839040756226,
"learning_rate": 8.300293374491821e-05,
"loss": 0.641,
"step": 159
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.37337374687194824,
"learning_rate": 8.271696108525157e-05,
"loss": 0.6409,
"step": 160
},
{
"epoch": 0.7,
"grad_norm": 0.33849138021469116,
"learning_rate": 8.24291044731378e-05,
"loss": 0.6571,
"step": 161
},
{
"epoch": 0.7043478260869566,
"grad_norm": 0.26402008533477783,
"learning_rate": 8.213938048432697e-05,
"loss": 0.6467,
"step": 162
},
{
"epoch": 0.7086956521739131,
"grad_norm": 0.31730157136917114,
"learning_rate": 8.184780580209892e-05,
"loss": 0.6519,
"step": 163
},
{
"epoch": 0.7130434782608696,
"grad_norm": 0.47295334935188293,
"learning_rate": 8.155439721630264e-05,
"loss": 0.629,
"step": 164
},
{
"epoch": 0.717391304347826,
"grad_norm": 0.3847337067127228,
"learning_rate": 8.125917162238945e-05,
"loss": 0.6404,
"step": 165
},
{
"epoch": 0.7217391304347827,
"grad_norm": 0.30035194754600525,
"learning_rate": 8.09621460204401e-05,
"loss": 0.6697,
"step": 166
},
{
"epoch": 0.7260869565217392,
"grad_norm": 0.4391736686229706,
"learning_rate": 8.066333751418583e-05,
"loss": 0.6399,
"step": 167
},
{
"epoch": 0.7304347826086957,
"grad_norm": 0.6057283878326416,
"learning_rate": 8.036276331002348e-05,
"loss": 0.6341,
"step": 168
},
{
"epoch": 0.7347826086956522,
"grad_norm": 0.644005537033081,
"learning_rate": 8.006044071602477e-05,
"loss": 0.662,
"step": 169
},
{
"epoch": 0.7391304347826086,
"grad_norm": 0.6034097671508789,
"learning_rate": 7.975638714093949e-05,
"loss": 0.638,
"step": 170
},
{
"epoch": 0.7434782608695653,
"grad_norm": 0.49918678402900696,
"learning_rate": 7.945062009319319e-05,
"loss": 0.6322,
"step": 171
},
{
"epoch": 0.7478260869565218,
"grad_norm": 0.4087945520877838,
"learning_rate": 7.914315717987892e-05,
"loss": 0.6419,
"step": 172
},
{
"epoch": 0.7521739130434782,
"grad_norm": 0.3623512089252472,
"learning_rate": 7.883401610574336e-05,
"loss": 0.6618,
"step": 173
},
{
"epoch": 0.7565217391304347,
"grad_norm": 0.7307239174842834,
"learning_rate": 7.85232146721673e-05,
"loss": 0.6572,
"step": 174
},
{
"epoch": 0.7608695652173914,
"grad_norm": 0.8763480186462402,
"learning_rate": 7.821077077614061e-05,
"loss": 0.6434,
"step": 175
},
{
"epoch": 0.7652173913043478,
"grad_norm": 0.5741376280784607,
"learning_rate": 7.789670240923168e-05,
"loss": 0.6539,
"step": 176
},
{
"epoch": 0.7695652173913043,
"grad_norm": 0.4742548167705536,
"learning_rate": 7.758102765655137e-05,
"loss": 0.6435,
"step": 177
},
{
"epoch": 0.7739130434782608,
"grad_norm": 0.6679338216781616,
"learning_rate": 7.726376469571164e-05,
"loss": 0.6654,
"step": 178
},
{
"epoch": 0.7782608695652173,
"grad_norm": 0.7236630320549011,
"learning_rate": 7.694493179577879e-05,
"loss": 0.655,
"step": 179
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.3954794704914093,
"learning_rate": 7.662454731622148e-05,
"loss": 0.6733,
"step": 180
},
{
"epoch": 0.7869565217391304,
"grad_norm": 0.41423317790031433,
"learning_rate": 7.630262970585356e-05,
"loss": 0.6466,
"step": 181
},
{
"epoch": 0.7913043478260869,
"grad_norm": 0.5248022675514221,
"learning_rate": 7.597919750177168e-05,
"loss": 0.6343,
"step": 182
},
{
"epoch": 0.7956521739130434,
"grad_norm": 0.4523037075996399,
"learning_rate": 7.56542693282879e-05,
"loss": 0.6494,
"step": 183
},
{
"epoch": 0.8,
"grad_norm": 0.3984985947608948,
"learning_rate": 7.532786389585716e-05,
"loss": 0.6511,
"step": 184
},
{
"epoch": 0.8043478260869565,
"grad_norm": 0.3854583501815796,
"learning_rate": 7.500000000000001e-05,
"loss": 0.6628,
"step": 185
},
{
"epoch": 0.808695652173913,
"grad_norm": 0.43506574630737305,
"learning_rate": 7.467069652022016e-05,
"loss": 0.6603,
"step": 186
},
{
"epoch": 0.8130434782608695,
"grad_norm": 0.41759249567985535,
"learning_rate": 7.433997241891742e-05,
"loss": 0.6674,
"step": 187
},
{
"epoch": 0.8173913043478261,
"grad_norm": 0.2531141936779022,
"learning_rate": 7.400784674029578e-05,
"loss": 0.6395,
"step": 188
},
{
"epoch": 0.8217391304347826,
"grad_norm": 0.4464227259159088,
"learning_rate": 7.36743386092667e-05,
"loss": 0.6576,
"step": 189
},
{
"epoch": 0.8260869565217391,
"grad_norm": 0.47379711270332336,
"learning_rate": 7.333946723034794e-05,
"loss": 0.6423,
"step": 190
},
{
"epoch": 0.8304347826086956,
"grad_norm": 0.3901284635066986,
"learning_rate": 7.300325188655761e-05,
"loss": 0.6594,
"step": 191
},
{
"epoch": 0.8347826086956521,
"grad_norm": 0.3132023811340332,
"learning_rate": 7.266571193830387e-05,
"loss": 0.6611,
"step": 192
},
{
"epoch": 0.8391304347826087,
"grad_norm": 0.4356115460395813,
"learning_rate": 7.232686682227001e-05,
"loss": 0.6376,
"step": 193
},
{
"epoch": 0.8434782608695652,
"grad_norm": 0.5321224331855774,
"learning_rate": 7.198673605029528e-05,
"loss": 0.6643,
"step": 194
},
{
"epoch": 0.8478260869565217,
"grad_norm": 0.3640391230583191,
"learning_rate": 7.164533920825137e-05,
"loss": 0.6476,
"step": 195
},
{
"epoch": 0.8521739130434782,
"grad_norm": 0.2873951494693756,
"learning_rate": 7.130269595491443e-05,
"loss": 0.649,
"step": 196
},
{
"epoch": 0.8565217391304348,
"grad_norm": 0.45937976241111755,
"learning_rate": 7.095882602083322e-05,
"loss": 0.648,
"step": 197
},
{
"epoch": 0.8608695652173913,
"grad_norm": 0.5308820009231567,
"learning_rate": 7.061374920719288e-05,
"loss": 0.6458,
"step": 198
},
{
"epoch": 0.8652173913043478,
"grad_norm": 0.43750235438346863,
"learning_rate": 7.026748538467474e-05,
"loss": 0.6457,
"step": 199
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.27052804827690125,
"learning_rate": 6.992005449231208e-05,
"loss": 0.6452,
"step": 200
},
{
"epoch": 0.8739130434782608,
"grad_norm": 0.37570297718048096,
"learning_rate": 6.957147653634198e-05,
"loss": 0.6566,
"step": 201
},
{
"epoch": 0.8782608695652174,
"grad_norm": 0.32025307416915894,
"learning_rate": 6.922177158905325e-05,
"loss": 0.6655,
"step": 202
},
{
"epoch": 0.8826086956521739,
"grad_norm": 0.2932673990726471,
"learning_rate": 6.887095978763072e-05,
"loss": 0.6749,
"step": 203
},
{
"epoch": 0.8869565217391304,
"grad_norm": 0.23213867843151093,
"learning_rate": 6.851906133299557e-05,
"loss": 0.6631,
"step": 204
},
{
"epoch": 0.8913043478260869,
"grad_norm": 0.3275505602359772,
"learning_rate": 6.816609648864208e-05,
"loss": 0.6758,
"step": 205
},
{
"epoch": 0.8956521739130435,
"grad_norm": 0.32032299041748047,
"learning_rate": 6.781208557947086e-05,
"loss": 0.662,
"step": 206
},
{
"epoch": 0.9,
"grad_norm": 0.26808756589889526,
"learning_rate": 6.745704899061843e-05,
"loss": 0.6464,
"step": 207
},
{
"epoch": 0.9043478260869565,
"grad_norm": 0.25998106598854065,
"learning_rate": 6.710100716628344e-05,
"loss": 0.6556,
"step": 208
},
{
"epoch": 0.908695652173913,
"grad_norm": 0.36953797936439514,
"learning_rate": 6.674398060854931e-05,
"loss": 0.6761,
"step": 209
},
{
"epoch": 0.9130434782608695,
"grad_norm": 0.43774327635765076,
"learning_rate": 6.638598987620375e-05,
"loss": 0.6481,
"step": 210
},
{
"epoch": 0.9173913043478261,
"grad_norm": 0.24901102483272552,
"learning_rate": 6.602705558355486e-05,
"loss": 0.675,
"step": 211
},
{
"epoch": 0.9217391304347826,
"grad_norm": 0.3668375313282013,
"learning_rate": 6.566719839924412e-05,
"loss": 0.6619,
"step": 212
},
{
"epoch": 0.9260869565217391,
"grad_norm": 0.5943741202354431,
"learning_rate": 6.530643904505621e-05,
"loss": 0.6561,
"step": 213
},
{
"epoch": 0.9304347826086956,
"grad_norm": 0.6538096070289612,
"learning_rate": 6.49447982947258e-05,
"loss": 0.6297,
"step": 214
},
{
"epoch": 0.9347826086956522,
"grad_norm": 0.5622021555900574,
"learning_rate": 6.458229697274125e-05,
"loss": 0.6602,
"step": 215
},
{
"epoch": 0.9391304347826087,
"grad_norm": 0.45731329917907715,
"learning_rate": 6.42189559531456e-05,
"loss": 0.6686,
"step": 216
},
{
"epoch": 0.9434782608695652,
"grad_norm": 0.26856303215026855,
"learning_rate": 6.385479615833445e-05,
"loss": 0.6358,
"step": 217
},
{
"epoch": 0.9478260869565217,
"grad_norm": 0.24898113310337067,
"learning_rate": 6.348983855785121e-05,
"loss": 0.6579,
"step": 218
},
{
"epoch": 0.9521739130434783,
"grad_norm": 0.3039465844631195,
"learning_rate": 6.312410416717968e-05,
"loss": 0.6493,
"step": 219
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.36120837926864624,
"learning_rate": 6.27576140465338e-05,
"loss": 0.6524,
"step": 220
},
{
"epoch": 0.9608695652173913,
"grad_norm": 0.35759392380714417,
"learning_rate": 6.2390389299645e-05,
"loss": 0.6247,
"step": 221
},
{
"epoch": 0.9652173913043478,
"grad_norm": 0.29583072662353516,
"learning_rate": 6.202245107254693e-05,
"loss": 0.642,
"step": 222
},
{
"epoch": 0.9695652173913043,
"grad_norm": 0.24323242902755737,
"learning_rate": 6.165382055235783e-05,
"loss": 0.6683,
"step": 223
},
{
"epoch": 0.9739130434782609,
"grad_norm": 0.4522090256214142,
"learning_rate": 6.128451896606053e-05,
"loss": 0.639,
"step": 224
},
{
"epoch": 0.9782608695652174,
"grad_norm": 0.6692441701889038,
"learning_rate": 6.091456757928008e-05,
"loss": 0.6628,
"step": 225
},
{
"epoch": 0.9826086956521739,
"grad_norm": 0.7985122203826904,
"learning_rate": 6.054398769505924e-05,
"loss": 0.6585,
"step": 226
},
{
"epoch": 0.9869565217391304,
"grad_norm": 0.6464029550552368,
"learning_rate": 6.01728006526317e-05,
"loss": 0.6563,
"step": 227
},
{
"epoch": 0.991304347826087,
"grad_norm": 0.36494386196136475,
"learning_rate": 5.980102782619342e-05,
"loss": 0.648,
"step": 228
},
{
"epoch": 0.9956521739130435,
"grad_norm": 0.40735068917274475,
"learning_rate": 5.942869062367179e-05,
"loss": 0.6502,
"step": 229
},
{
"epoch": 1.0,
"grad_norm": 0.6993163228034973,
"learning_rate": 5.905581048549279e-05,
"loss": 0.6682,
"step": 230
}
],
"logging_steps": 1,
"max_steps": 460,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.970864260913562e+18,
"train_batch_size": 24,
"trial_name": null,
"trial_params": null
}