{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 615,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004878048780487805,
"grad_norm": 37.90617752075195,
"learning_rate": 5e-06,
"loss": 5.3731,
"step": 1
},
{
"epoch": 0.00975609756097561,
"grad_norm": 32.92101287841797,
"learning_rate": 4.999997064365715e-06,
"loss": 3.7922,
"step": 2
},
{
"epoch": 0.014634146341463415,
"grad_norm": 28.488866806030273,
"learning_rate": 4.999988257469751e-06,
"loss": 3.3238,
"step": 3
},
{
"epoch": 0.01951219512195122,
"grad_norm": 28.274654388427734,
"learning_rate": 4.999973579332793e-06,
"loss": 3.5405,
"step": 4
},
{
"epoch": 0.024390243902439025,
"grad_norm": 18.761985778808594,
"learning_rate": 4.999953029989312e-06,
"loss": 2.7492,
"step": 5
},
{
"epoch": 0.02926829268292683,
"grad_norm": 9.646541595458984,
"learning_rate": 4.999926609487568e-06,
"loss": 1.7146,
"step": 6
},
{
"epoch": 0.03414634146341464,
"grad_norm": 15.763612747192383,
"learning_rate": 4.9998943178896106e-06,
"loss": 2.8485,
"step": 7
},
{
"epoch": 0.03902439024390244,
"grad_norm": 13.853349685668945,
"learning_rate": 4.999856155271276e-06,
"loss": 3.0975,
"step": 8
},
{
"epoch": 0.04390243902439024,
"grad_norm": 16.893617630004883,
"learning_rate": 4.999812121722191e-06,
"loss": 2.8534,
"step": 9
},
{
"epoch": 0.04878048780487805,
"grad_norm": 15.47554874420166,
"learning_rate": 4.999762217345766e-06,
"loss": 3.4569,
"step": 10
},
{
"epoch": 0.05365853658536585,
"grad_norm": 6.277958393096924,
"learning_rate": 4.999706442259205e-06,
"loss": 1.8088,
"step": 11
},
{
"epoch": 0.05853658536585366,
"grad_norm": 5.451688289642334,
"learning_rate": 4.999644796593492e-06,
"loss": 1.8104,
"step": 12
},
{
"epoch": 0.06341463414634146,
"grad_norm": 4.333822727203369,
"learning_rate": 4.999577280493407e-06,
"loss": 1.7201,
"step": 13
},
{
"epoch": 0.06829268292682927,
"grad_norm": 4.488610744476318,
"learning_rate": 4.99950389411751e-06,
"loss": 1.9804,
"step": 14
},
{
"epoch": 0.07317073170731707,
"grad_norm": 1.8763024806976318,
"learning_rate": 4.999424637638148e-06,
"loss": 0.9227,
"step": 15
},
{
"epoch": 0.07804878048780488,
"grad_norm": 2.98995304107666,
"learning_rate": 4.999339511241458e-06,
"loss": 1.2368,
"step": 16
},
{
"epoch": 0.08292682926829269,
"grad_norm": 5.778397083282471,
"learning_rate": 4.9992485151273584e-06,
"loss": 2.5745,
"step": 17
},
{
"epoch": 0.08780487804878048,
"grad_norm": 5.222808361053467,
"learning_rate": 4.999151649509554e-06,
"loss": 2.0987,
"step": 18
},
{
"epoch": 0.09268292682926829,
"grad_norm": 4.075173377990723,
"learning_rate": 4.9990489146155356e-06,
"loss": 1.7732,
"step": 19
},
{
"epoch": 0.0975609756097561,
"grad_norm": 4.210625171661377,
"learning_rate": 4.9989403106865765e-06,
"loss": 1.8928,
"step": 20
},
{
"epoch": 0.1024390243902439,
"grad_norm": 1.7968504428863525,
"learning_rate": 4.9988258379777334e-06,
"loss": 1.0457,
"step": 21
},
{
"epoch": 0.1073170731707317,
"grad_norm": 2.494248151779175,
"learning_rate": 4.998705496757846e-06,
"loss": 1.3236,
"step": 22
},
{
"epoch": 0.11219512195121951,
"grad_norm": 4.312766075134277,
"learning_rate": 4.998579287309538e-06,
"loss": 2.4289,
"step": 23
},
{
"epoch": 0.11707317073170732,
"grad_norm": 5.361437797546387,
"learning_rate": 4.998447209929211e-06,
"loss": 1.7608,
"step": 24
},
{
"epoch": 0.12195121951219512,
"grad_norm": 1.4642046689987183,
"learning_rate": 4.998309264927053e-06,
"loss": 0.7571,
"step": 25
},
{
"epoch": 0.12682926829268293,
"grad_norm": 3.642005205154419,
"learning_rate": 4.998165452627025e-06,
"loss": 1.0634,
"step": 26
},
{
"epoch": 0.13170731707317074,
"grad_norm": 3.2727863788604736,
"learning_rate": 4.998015773366874e-06,
"loss": 1.4016,
"step": 27
},
{
"epoch": 0.13658536585365855,
"grad_norm": 1.5661338567733765,
"learning_rate": 4.997860227498122e-06,
"loss": 0.8305,
"step": 28
},
{
"epoch": 0.14146341463414633,
"grad_norm": 3.3989508152008057,
"learning_rate": 4.99769881538607e-06,
"loss": 1.8744,
"step": 29
},
{
"epoch": 0.14634146341463414,
"grad_norm": 3.512221574783325,
"learning_rate": 4.997531537409794e-06,
"loss": 1.7701,
"step": 30
},
{
"epoch": 0.15121951219512195,
"grad_norm": 2.261887311935425,
"learning_rate": 4.99735839396215e-06,
"loss": 1.0653,
"step": 31
},
{
"epoch": 0.15609756097560976,
"grad_norm": 2.2495696544647217,
"learning_rate": 4.9971793854497655e-06,
"loss": 1.1914,
"step": 32
},
{
"epoch": 0.16097560975609757,
"grad_norm": 4.262424468994141,
"learning_rate": 4.996994512293042e-06,
"loss": 1.7311,
"step": 33
},
{
"epoch": 0.16585365853658537,
"grad_norm": 3.361311674118042,
"learning_rate": 4.996803774926157e-06,
"loss": 1.4482,
"step": 34
},
{
"epoch": 0.17073170731707318,
"grad_norm": 1.5855786800384521,
"learning_rate": 4.996607173797059e-06,
"loss": 1.3959,
"step": 35
},
{
"epoch": 0.17560975609756097,
"grad_norm": 2.5001468658447266,
"learning_rate": 4.996404709367466e-06,
"loss": 1.3452,
"step": 36
},
{
"epoch": 0.18048780487804877,
"grad_norm": 2.7903096675872803,
"learning_rate": 4.996196382112868e-06,
"loss": 1.018,
"step": 37
},
{
"epoch": 0.18536585365853658,
"grad_norm": 2.7586171627044678,
"learning_rate": 4.9959821925225235e-06,
"loss": 1.291,
"step": 38
},
{
"epoch": 0.1902439024390244,
"grad_norm": 2.873840808868408,
"learning_rate": 4.995762141099456e-06,
"loss": 1.1418,
"step": 39
},
{
"epoch": 0.1951219512195122,
"grad_norm": 1.6896076202392578,
"learning_rate": 4.995536228360461e-06,
"loss": 1.1662,
"step": 40
},
{
"epoch": 0.2,
"grad_norm": 1.4597285985946655,
"learning_rate": 4.995304454836095e-06,
"loss": 1.0373,
"step": 41
},
{
"epoch": 0.2048780487804878,
"grad_norm": 2.4222421646118164,
"learning_rate": 4.9950668210706795e-06,
"loss": 1.1895,
"step": 42
},
{
"epoch": 0.2097560975609756,
"grad_norm": 3.295085906982422,
"learning_rate": 4.994823327622299e-06,
"loss": 1.4166,
"step": 43
},
{
"epoch": 0.2146341463414634,
"grad_norm": 1.8216716051101685,
"learning_rate": 4.9945739750628e-06,
"loss": 1.0009,
"step": 44
},
{
"epoch": 0.21951219512195122,
"grad_norm": 2.2421364784240723,
"learning_rate": 4.994318763977789e-06,
"loss": 1.1999,
"step": 45
},
{
"epoch": 0.22439024390243903,
"grad_norm": 5.6812744140625,
"learning_rate": 4.994057694966632e-06,
"loss": 1.3831,
"step": 46
},
{
"epoch": 0.22926829268292684,
"grad_norm": 1.477816104888916,
"learning_rate": 4.993790768642449e-06,
"loss": 1.1419,
"step": 47
},
{
"epoch": 0.23414634146341465,
"grad_norm": 4.485177993774414,
"learning_rate": 4.99351798563212e-06,
"loss": 1.9434,
"step": 48
},
{
"epoch": 0.23902439024390243,
"grad_norm": 2.770219326019287,
"learning_rate": 4.993239346576278e-06,
"loss": 1.3214,
"step": 49
},
{
"epoch": 0.24390243902439024,
"grad_norm": 2.721611976623535,
"learning_rate": 4.99295485212931e-06,
"loss": 1.4329,
"step": 50
},
{
"epoch": 0.24878048780487805,
"grad_norm": 3.714306592941284,
"learning_rate": 4.992664502959351e-06,
"loss": 1.372,
"step": 51
},
{
"epoch": 0.25365853658536586,
"grad_norm": 1.2679803371429443,
"learning_rate": 4.99236829974829e-06,
"loss": 0.8086,
"step": 52
},
{
"epoch": 0.25853658536585367,
"grad_norm": 1.6986169815063477,
"learning_rate": 4.992066243191762e-06,
"loss": 1.2012,
"step": 53
},
{
"epoch": 0.2634146341463415,
"grad_norm": 1.3443604707717896,
"learning_rate": 4.991758333999148e-06,
"loss": 0.8986,
"step": 54
},
{
"epoch": 0.2682926829268293,
"grad_norm": 2.0896975994110107,
"learning_rate": 4.991444572893575e-06,
"loss": 0.8819,
"step": 55
},
{
"epoch": 0.2731707317073171,
"grad_norm": 1.255516529083252,
"learning_rate": 4.991124960611916e-06,
"loss": 0.6255,
"step": 56
},
{
"epoch": 0.2780487804878049,
"grad_norm": 2.125410318374634,
"learning_rate": 4.99079949790478e-06,
"loss": 0.9983,
"step": 57
},
{
"epoch": 0.28292682926829266,
"grad_norm": 2.0312907695770264,
"learning_rate": 4.99046818553652e-06,
"loss": 1.1137,
"step": 58
},
{
"epoch": 0.28780487804878047,
"grad_norm": 2.904625177383423,
"learning_rate": 4.9901310242852246e-06,
"loss": 1.2009,
"step": 59
},
{
"epoch": 0.2926829268292683,
"grad_norm": 2.512932777404785,
"learning_rate": 4.9897880149427206e-06,
"loss": 1.2234,
"step": 60
},
{
"epoch": 0.2975609756097561,
"grad_norm": 2.5621752738952637,
"learning_rate": 4.989439158314566e-06,
"loss": 1.2354,
"step": 61
},
{
"epoch": 0.3024390243902439,
"grad_norm": 1.0051912069320679,
"learning_rate": 4.989084455220056e-06,
"loss": 0.614,
"step": 62
},
{
"epoch": 0.3073170731707317,
"grad_norm": 1.5008565187454224,
"learning_rate": 4.988723906492212e-06,
"loss": 1.002,
"step": 63
},
{
"epoch": 0.3121951219512195,
"grad_norm": 1.1436376571655273,
"learning_rate": 4.988357512977785e-06,
"loss": 0.6483,
"step": 64
},
{
"epoch": 0.3170731707317073,
"grad_norm": 1.16792893409729,
"learning_rate": 4.987985275537252e-06,
"loss": 0.638,
"step": 65
},
{
"epoch": 0.32195121951219513,
"grad_norm": 1.5670536756515503,
"learning_rate": 4.9876071950448185e-06,
"loss": 1.0274,
"step": 66
},
{
"epoch": 0.32682926829268294,
"grad_norm": 1.357515811920166,
"learning_rate": 4.987223272388407e-06,
"loss": 0.7612,
"step": 67
},
{
"epoch": 0.33170731707317075,
"grad_norm": 1.8383222818374634,
"learning_rate": 4.986833508469663e-06,
"loss": 1.1496,
"step": 68
},
{
"epoch": 0.33658536585365856,
"grad_norm": 2.8976657390594482,
"learning_rate": 4.98643790420395e-06,
"loss": 0.9924,
"step": 69
},
{
"epoch": 0.34146341463414637,
"grad_norm": 1.7679390907287598,
"learning_rate": 4.986036460520348e-06,
"loss": 0.9408,
"step": 70
},
{
"epoch": 0.3463414634146341,
"grad_norm": 2.3815059661865234,
"learning_rate": 4.98562917836165e-06,
"loss": 1.2923,
"step": 71
},
{
"epoch": 0.35121951219512193,
"grad_norm": 1.679026484489441,
"learning_rate": 4.985216058684362e-06,
"loss": 0.8476,
"step": 72
},
{
"epoch": 0.35609756097560974,
"grad_norm": 1.7254586219787598,
"learning_rate": 4.984797102458697e-06,
"loss": 1.1775,
"step": 73
},
{
"epoch": 0.36097560975609755,
"grad_norm": 1.195844292640686,
"learning_rate": 4.984372310668579e-06,
"loss": 0.6664,
"step": 74
},
{
"epoch": 0.36585365853658536,
"grad_norm": 1.754431128501892,
"learning_rate": 4.983941684311633e-06,
"loss": 1.4361,
"step": 75
},
{
"epoch": 0.37073170731707317,
"grad_norm": 1.8255788087844849,
"learning_rate": 4.983505224399188e-06,
"loss": 0.8977,
"step": 76
},
{
"epoch": 0.375609756097561,
"grad_norm": 1.340451717376709,
"learning_rate": 4.983062931956275e-06,
"loss": 0.9432,
"step": 77
},
{
"epoch": 0.3804878048780488,
"grad_norm": 1.7810503244400024,
"learning_rate": 4.9826148080216195e-06,
"loss": 1.0463,
"step": 78
},
{
"epoch": 0.3853658536585366,
"grad_norm": 1.729826807975769,
"learning_rate": 4.9821608536476445e-06,
"loss": 1.4451,
"step": 79
},
{
"epoch": 0.3902439024390244,
"grad_norm": 2.6638681888580322,
"learning_rate": 4.981701069900465e-06,
"loss": 1.0409,
"step": 80
},
{
"epoch": 0.3951219512195122,
"grad_norm": 1.467076301574707,
"learning_rate": 4.9812354578598876e-06,
"loss": 0.8596,
"step": 81
},
{
"epoch": 0.4,
"grad_norm": 1.453657627105713,
"learning_rate": 4.980764018619405e-06,
"loss": 0.9033,
"step": 82
},
{
"epoch": 0.40487804878048783,
"grad_norm": 1.9025623798370361,
"learning_rate": 4.980286753286196e-06,
"loss": 1.0586,
"step": 83
},
{
"epoch": 0.4097560975609756,
"grad_norm": 1.4698103666305542,
"learning_rate": 4.97980366298112e-06,
"loss": 0.8611,
"step": 84
},
{
"epoch": 0.4146341463414634,
"grad_norm": 1.6086381673812866,
"learning_rate": 4.97931474883872e-06,
"loss": 0.8884,
"step": 85
},
{
"epoch": 0.4195121951219512,
"grad_norm": 1.6679224967956543,
"learning_rate": 4.978820012007213e-06,
"loss": 0.969,
"step": 86
},
{
"epoch": 0.424390243902439,
"grad_norm": 1.5308334827423096,
"learning_rate": 4.978319453648495e-06,
"loss": 1.0345,
"step": 87
},
{
"epoch": 0.4292682926829268,
"grad_norm": 1.7788817882537842,
"learning_rate": 4.977813074938128e-06,
"loss": 0.9649,
"step": 88
},
{
"epoch": 0.43414634146341463,
"grad_norm": 1.591383934020996,
"learning_rate": 4.977300877065347e-06,
"loss": 0.9299,
"step": 89
},
{
"epoch": 0.43902439024390244,
"grad_norm": 1.525985836982727,
"learning_rate": 4.976782861233053e-06,
"loss": 0.7272,
"step": 90
},
{
"epoch": 0.44390243902439025,
"grad_norm": 2.7333128452301025,
"learning_rate": 4.976259028657812e-06,
"loss": 0.6694,
"step": 91
},
{
"epoch": 0.44878048780487806,
"grad_norm": 1.2718108892440796,
"learning_rate": 4.975729380569845e-06,
"loss": 0.8037,
"step": 92
},
{
"epoch": 0.45365853658536587,
"grad_norm": 1.157771110534668,
"learning_rate": 4.975193918213035e-06,
"loss": 0.52,
"step": 93
},
{
"epoch": 0.4585365853658537,
"grad_norm": 1.0602248907089233,
"learning_rate": 4.974652642844921e-06,
"loss": 0.7135,
"step": 94
},
{
"epoch": 0.4634146341463415,
"grad_norm": 2.418555974960327,
"learning_rate": 4.974105555736693e-06,
"loss": 1.0996,
"step": 95
},
{
"epoch": 0.4682926829268293,
"grad_norm": 1.1728110313415527,
"learning_rate": 4.973552658173186e-06,
"loss": 0.7268,
"step": 96
},
{
"epoch": 0.47317073170731705,
"grad_norm": 1.943113088607788,
"learning_rate": 4.972993951452887e-06,
"loss": 0.9091,
"step": 97
},
{
"epoch": 0.47804878048780486,
"grad_norm": 5.063096523284912,
"learning_rate": 4.9724294368879214e-06,
"loss": 0.8242,
"step": 98
},
{
"epoch": 0.48292682926829267,
"grad_norm": 1.5232555866241455,
"learning_rate": 4.971859115804055e-06,
"loss": 1.0543,
"step": 99
},
{
"epoch": 0.4878048780487805,
"grad_norm": 1.1453967094421387,
"learning_rate": 4.9712829895406935e-06,
"loss": 0.8209,
"step": 100
},
{
"epoch": 0.4926829268292683,
"grad_norm": 2.12345814704895,
"learning_rate": 4.970701059450872e-06,
"loss": 0.5849,
"step": 101
},
{
"epoch": 0.4975609756097561,
"grad_norm": 1.2201842069625854,
"learning_rate": 4.970113326901258e-06,
"loss": 0.9969,
"step": 102
},
{
"epoch": 0.5024390243902439,
"grad_norm": 1.3047524690628052,
"learning_rate": 4.9695197932721455e-06,
"loss": 0.9339,
"step": 103
},
{
"epoch": 0.5073170731707317,
"grad_norm": 1.6083660125732422,
"learning_rate": 4.968920459957453e-06,
"loss": 0.9702,
"step": 104
},
{
"epoch": 0.5121951219512195,
"grad_norm": 1.5247286558151245,
"learning_rate": 4.968315328364719e-06,
"loss": 1.0449,
"step": 105
},
{
"epoch": 0.5170731707317073,
"grad_norm": 1.354805588722229,
"learning_rate": 4.9677043999151e-06,
"loss": 1.1431,
"step": 106
},
{
"epoch": 0.5219512195121951,
"grad_norm": 1.088321328163147,
"learning_rate": 4.967087676043366e-06,
"loss": 0.519,
"step": 107
},
{
"epoch": 0.526829268292683,
"grad_norm": 1.5086662769317627,
"learning_rate": 4.966465158197897e-06,
"loss": 0.9357,
"step": 108
},
{
"epoch": 0.5317073170731708,
"grad_norm": 1.3161298036575317,
"learning_rate": 4.965836847840681e-06,
"loss": 0.7234,
"step": 109
},
{
"epoch": 0.5365853658536586,
"grad_norm": 1.4465640783309937,
"learning_rate": 4.96520274644731e-06,
"loss": 0.8911,
"step": 110
},
{
"epoch": 0.5414634146341464,
"grad_norm": 1.0576995611190796,
"learning_rate": 4.964562855506976e-06,
"loss": 0.7254,
"step": 111
},
{
"epoch": 0.5463414634146342,
"grad_norm": 1.1018916368484497,
"learning_rate": 4.963917176522466e-06,
"loss": 0.6603,
"step": 112
},
{
"epoch": 0.551219512195122,
"grad_norm": 2.150622606277466,
"learning_rate": 4.963265711010164e-06,
"loss": 1.0451,
"step": 113
},
{
"epoch": 0.5560975609756098,
"grad_norm": 1.7743186950683594,
"learning_rate": 4.9626084605000395e-06,
"loss": 0.8717,
"step": 114
},
{
"epoch": 0.5609756097560976,
"grad_norm": 1.205291509628296,
"learning_rate": 4.961945426535652e-06,
"loss": 0.5802,
"step": 115
},
{
"epoch": 0.5658536585365853,
"grad_norm": 1.3969353437423706,
"learning_rate": 4.961276610674141e-06,
"loss": 0.9158,
"step": 116
},
{
"epoch": 0.5707317073170731,
"grad_norm": 1.2318240404129028,
"learning_rate": 4.960602014486225e-06,
"loss": 1.0086,
"step": 117
},
{
"epoch": 0.5756097560975609,
"grad_norm": 1.2202470302581787,
"learning_rate": 4.959921639556199e-06,
"loss": 0.7888,
"step": 118
},
{
"epoch": 0.5804878048780487,
"grad_norm": 1.1564440727233887,
"learning_rate": 4.959235487481928e-06,
"loss": 1.0053,
"step": 119
},
{
"epoch": 0.5853658536585366,
"grad_norm": 1.2278865575790405,
"learning_rate": 4.958543559874846e-06,
"loss": 0.5486,
"step": 120
},
{
"epoch": 0.5902439024390244,
"grad_norm": 1.5465888977050781,
"learning_rate": 4.9578458583599495e-06,
"loss": 0.8232,
"step": 121
},
{
"epoch": 0.5951219512195122,
"grad_norm": 1.6284047365188599,
"learning_rate": 4.957142384575795e-06,
"loss": 0.7773,
"step": 122
},
{
"epoch": 0.6,
"grad_norm": 1.1794490814208984,
"learning_rate": 4.956433140174498e-06,
"loss": 0.8236,
"step": 123
},
{
"epoch": 0.6048780487804878,
"grad_norm": 1.2404091358184814,
"learning_rate": 4.9557181268217225e-06,
"loss": 0.8611,
"step": 124
},
{
"epoch": 0.6097560975609756,
"grad_norm": 1.2188373804092407,
"learning_rate": 4.954997346196683e-06,
"loss": 1.2922,
"step": 125
},
{
"epoch": 0.6146341463414634,
"grad_norm": 1.1514503955841064,
"learning_rate": 4.954270799992138e-06,
"loss": 0.7073,
"step": 126
},
{
"epoch": 0.6195121951219512,
"grad_norm": 1.285913109779358,
"learning_rate": 4.953538489914387e-06,
"loss": 0.729,
"step": 127
},
{
"epoch": 0.624390243902439,
"grad_norm": 1.8677104711532593,
"learning_rate": 4.9528004176832654e-06,
"loss": 0.6211,
"step": 128
},
{
"epoch": 0.6292682926829268,
"grad_norm": 1.2369698286056519,
"learning_rate": 4.952056585032142e-06,
"loss": 1.0311,
"step": 129
},
{
"epoch": 0.6341463414634146,
"grad_norm": 1.186990737915039,
"learning_rate": 4.951306993707913e-06,
"loss": 0.8207,
"step": 130
},
{
"epoch": 0.6390243902439025,
"grad_norm": 1.189424991607666,
"learning_rate": 4.950551645470998e-06,
"loss": 0.7853,
"step": 131
},
{
"epoch": 0.6439024390243903,
"grad_norm": 1.3477216958999634,
"learning_rate": 4.9497905420953406e-06,
"loss": 0.7271,
"step": 132
},
{
"epoch": 0.6487804878048781,
"grad_norm": 1.368592619895935,
"learning_rate": 4.949023685368395e-06,
"loss": 0.7394,
"step": 133
},
{
"epoch": 0.6536585365853659,
"grad_norm": 1.2769527435302734,
"learning_rate": 4.948251077091131e-06,
"loss": 1.0713,
"step": 134
},
{
"epoch": 0.6585365853658537,
"grad_norm": 1.1477972269058228,
"learning_rate": 4.947472719078025e-06,
"loss": 0.841,
"step": 135
},
{
"epoch": 0.6634146341463415,
"grad_norm": 1.0983368158340454,
"learning_rate": 4.9466886131570565e-06,
"loss": 0.8665,
"step": 136
},
{
"epoch": 0.6682926829268293,
"grad_norm": 1.2526847124099731,
"learning_rate": 4.945898761169704e-06,
"loss": 1.0613,
"step": 137
},
{
"epoch": 0.6731707317073171,
"grad_norm": 1.1426396369934082,
"learning_rate": 4.945103164970941e-06,
"loss": 0.6248,
"step": 138
},
{
"epoch": 0.6780487804878049,
"grad_norm": 1.3275880813598633,
"learning_rate": 4.9443018264292304e-06,
"loss": 0.755,
"step": 139
},
{
"epoch": 0.6829268292682927,
"grad_norm": 1.4942073822021484,
"learning_rate": 4.9434947474265225e-06,
"loss": 0.9451,
"step": 140
},
{
"epoch": 0.6878048780487804,
"grad_norm": 1.1777600049972534,
"learning_rate": 4.942681929858249e-06,
"loss": 1.0505,
"step": 141
},
{
"epoch": 0.6926829268292682,
"grad_norm": 1.2474116086959839,
"learning_rate": 4.941863375633315e-06,
"loss": 0.9174,
"step": 142
},
{
"epoch": 0.697560975609756,
"grad_norm": 1.5067697763442993,
"learning_rate": 4.9410390866741056e-06,
"loss": 0.7737,
"step": 143
},
{
"epoch": 0.7024390243902439,
"grad_norm": 1.3016574382781982,
"learning_rate": 4.9402090649164655e-06,
"loss": 0.7588,
"step": 144
},
{
"epoch": 0.7073170731707317,
"grad_norm": 1.652600884437561,
"learning_rate": 4.9393733123097085e-06,
"loss": 1.0588,
"step": 145
},
{
"epoch": 0.7121951219512195,
"grad_norm": 1.267997145652771,
"learning_rate": 4.9385318308166065e-06,
"loss": 0.8996,
"step": 146
},
{
"epoch": 0.7170731707317073,
"grad_norm": 1.8560484647750854,
"learning_rate": 4.937684622413385e-06,
"loss": 0.6286,
"step": 147
},
{
"epoch": 0.7219512195121951,
"grad_norm": 1.4419782161712646,
"learning_rate": 4.9368316890897185e-06,
"loss": 0.9818,
"step": 148
},
{
"epoch": 0.7268292682926829,
"grad_norm": 1.134084701538086,
"learning_rate": 4.9359730328487264e-06,
"loss": 0.5246,
"step": 149
},
{
"epoch": 0.7317073170731707,
"grad_norm": 1.0102615356445312,
"learning_rate": 4.935108655706972e-06,
"loss": 0.7948,
"step": 150
},
{
"epoch": 0.7365853658536585,
"grad_norm": 1.3480703830718994,
"learning_rate": 4.934238559694448e-06,
"loss": 1.0951,
"step": 151
},
{
"epoch": 0.7414634146341463,
"grad_norm": 1.4133951663970947,
"learning_rate": 4.9333627468545845e-06,
"loss": 0.6936,
"step": 152
},
{
"epoch": 0.7463414634146341,
"grad_norm": 1.3072413206100464,
"learning_rate": 4.932481219244231e-06,
"loss": 0.799,
"step": 153
},
{
"epoch": 0.751219512195122,
"grad_norm": 1.3893049955368042,
"learning_rate": 4.931593978933666e-06,
"loss": 0.7375,
"step": 154
},
{
"epoch": 0.7560975609756098,
"grad_norm": 1.2313531637191772,
"learning_rate": 4.930701028006577e-06,
"loss": 0.9487,
"step": 155
},
{
"epoch": 0.7609756097560976,
"grad_norm": 1.3426295518875122,
"learning_rate": 4.929802368560066e-06,
"loss": 0.7542,
"step": 156
},
{
"epoch": 0.7658536585365854,
"grad_norm": 1.5280004739761353,
"learning_rate": 4.928898002704642e-06,
"loss": 0.8784,
"step": 157
},
{
"epoch": 0.7707317073170732,
"grad_norm": 1.3149527311325073,
"learning_rate": 4.927987932564215e-06,
"loss": 0.7247,
"step": 158
},
{
"epoch": 0.775609756097561,
"grad_norm": 1.073188304901123,
"learning_rate": 4.927072160276092e-06,
"loss": 0.7826,
"step": 159
},
{
"epoch": 0.7804878048780488,
"grad_norm": 1.426188588142395,
"learning_rate": 4.926150687990969e-06,
"loss": 0.6129,
"step": 160
},
{
"epoch": 0.7853658536585366,
"grad_norm": 1.2348767518997192,
"learning_rate": 4.925223517872934e-06,
"loss": 0.9191,
"step": 161
},
{
"epoch": 0.7902439024390244,
"grad_norm": 1.599665641784668,
"learning_rate": 4.9242906520994484e-06,
"loss": 1.0177,
"step": 162
},
{
"epoch": 0.7951219512195122,
"grad_norm": 1.1841332912445068,
"learning_rate": 4.923352092861358e-06,
"loss": 0.8342,
"step": 163
},
{
"epoch": 0.8,
"grad_norm": 1.0213048458099365,
"learning_rate": 4.922407842362875e-06,
"loss": 0.6292,
"step": 164
},
{
"epoch": 0.8048780487804879,
"grad_norm": 1.1878992319107056,
"learning_rate": 4.921457902821578e-06,
"loss": 0.9357,
"step": 165
},
{
"epoch": 0.8097560975609757,
"grad_norm": 1.297462821006775,
"learning_rate": 4.920502276468408e-06,
"loss": 0.8829,
"step": 166
},
{
"epoch": 0.8146341463414634,
"grad_norm": 1.1800835132598877,
"learning_rate": 4.9195409655476605e-06,
"loss": 0.712,
"step": 167
},
{
"epoch": 0.8195121951219512,
"grad_norm": 1.4254546165466309,
"learning_rate": 4.918573972316982e-06,
"loss": 0.9997,
"step": 168
},
{
"epoch": 0.824390243902439,
"grad_norm": 1.3243224620819092,
"learning_rate": 4.917601299047361e-06,
"loss": 0.7944,
"step": 169
},
{
"epoch": 0.8292682926829268,
"grad_norm": 1.3879033327102661,
"learning_rate": 4.916622948023129e-06,
"loss": 0.7778,
"step": 170
},
{
"epoch": 0.8341463414634146,
"grad_norm": 1.245430588722229,
"learning_rate": 4.915638921541952e-06,
"loss": 0.6247,
"step": 171
},
{
"epoch": 0.8390243902439024,
"grad_norm": 1.3728258609771729,
"learning_rate": 4.914649221914822e-06,
"loss": 0.8762,
"step": 172
},
{
"epoch": 0.8439024390243902,
"grad_norm": 1.3080862760543823,
"learning_rate": 4.913653851466057e-06,
"loss": 0.6381,
"step": 173
},
{
"epoch": 0.848780487804878,
"grad_norm": 1.5109484195709229,
"learning_rate": 4.912652812533291e-06,
"loss": 0.8127,
"step": 174
},
{
"epoch": 0.8536585365853658,
"grad_norm": 1.237879753112793,
"learning_rate": 4.911646107467472e-06,
"loss": 0.8257,
"step": 175
},
{
"epoch": 0.8585365853658536,
"grad_norm": 1.2725176811218262,
"learning_rate": 4.9106337386328524e-06,
"loss": 0.9758,
"step": 176
},
{
"epoch": 0.8634146341463415,
"grad_norm": 1.271756887435913,
"learning_rate": 4.909615708406991e-06,
"loss": 0.8436,
"step": 177
},
{
"epoch": 0.8682926829268293,
"grad_norm": 1.0920095443725586,
"learning_rate": 4.908592019180738e-06,
"loss": 0.6657,
"step": 178
},
{
"epoch": 0.8731707317073171,
"grad_norm": 0.9080491065979004,
"learning_rate": 4.907562673358234e-06,
"loss": 0.6322,
"step": 179
},
{
"epoch": 0.8780487804878049,
"grad_norm": 0.9868524074554443,
"learning_rate": 4.906527673356907e-06,
"loss": 0.5918,
"step": 180
},
{
"epoch": 0.8829268292682927,
"grad_norm": 1.49118173122406,
"learning_rate": 4.905487021607462e-06,
"loss": 0.4939,
"step": 181
},
{
"epoch": 0.8878048780487805,
"grad_norm": 1.3323795795440674,
"learning_rate": 4.904440720553876e-06,
"loss": 0.8444,
"step": 182
},
{
"epoch": 0.8926829268292683,
"grad_norm": 1.340374231338501,
"learning_rate": 4.903388772653396e-06,
"loss": 0.765,
"step": 183
},
{
"epoch": 0.8975609756097561,
"grad_norm": 1.2728081941604614,
"learning_rate": 4.902331180376529e-06,
"loss": 0.8126,
"step": 184
},
{
"epoch": 0.9024390243902439,
"grad_norm": 1.154461145401001,
"learning_rate": 4.901267946207038e-06,
"loss": 0.9256,
"step": 185
},
{
"epoch": 0.9073170731707317,
"grad_norm": 1.0781828165054321,
"learning_rate": 4.900199072641937e-06,
"loss": 0.7057,
"step": 186
},
{
"epoch": 0.9121951219512195,
"grad_norm": 1.0474439859390259,
"learning_rate": 4.899124562191484e-06,
"loss": 0.6024,
"step": 187
},
{
"epoch": 0.9170731707317074,
"grad_norm": 1.019089937210083,
"learning_rate": 4.8980444173791735e-06,
"loss": 0.5001,
"step": 188
},
{
"epoch": 0.9219512195121952,
"grad_norm": 1.3556936979293823,
"learning_rate": 4.896958640741735e-06,
"loss": 0.8636,
"step": 189
},
{
"epoch": 0.926829268292683,
"grad_norm": 1.2974662780761719,
"learning_rate": 4.895867234829121e-06,
"loss": 0.9257,
"step": 190
},
{
"epoch": 0.9317073170731708,
"grad_norm": 1.3130805492401123,
"learning_rate": 4.894770202204509e-06,
"loss": 0.7026,
"step": 191
},
{
"epoch": 0.9365853658536586,
"grad_norm": 1.5521293878555298,
"learning_rate": 4.893667545444285e-06,
"loss": 0.8671,
"step": 192
},
{
"epoch": 0.9414634146341463,
"grad_norm": 1.0167640447616577,
"learning_rate": 4.8925592671380495e-06,
"loss": 0.6643,
"step": 193
},
{
"epoch": 0.9463414634146341,
"grad_norm": 0.8992295861244202,
"learning_rate": 4.891445369888601e-06,
"loss": 0.5294,
"step": 194
},
{
"epoch": 0.9512195121951219,
"grad_norm": 1.053918480873108,
"learning_rate": 4.890325856311936e-06,
"loss": 0.8734,
"step": 195
},
{
"epoch": 0.9560975609756097,
"grad_norm": 1.1650863885879517,
"learning_rate": 4.889200729037241e-06,
"loss": 0.7858,
"step": 196
},
{
"epoch": 0.9609756097560975,
"grad_norm": 1.0390702486038208,
"learning_rate": 4.888069990706884e-06,
"loss": 0.6428,
"step": 197
},
{
"epoch": 0.9658536585365853,
"grad_norm": 1.4749332666397095,
"learning_rate": 4.886933643976414e-06,
"loss": 0.55,
"step": 198
},
{
"epoch": 0.9707317073170731,
"grad_norm": 0.9483368992805481,
"learning_rate": 4.885791691514548e-06,
"loss": 0.4786,
"step": 199
},
{
"epoch": 0.975609756097561,
"grad_norm": 1.0259203910827637,
"learning_rate": 4.884644136003172e-06,
"loss": 0.6137,
"step": 200
},
{
"epoch": 0.9804878048780488,
"grad_norm": 1.1431037187576294,
"learning_rate": 4.883490980137327e-06,
"loss": 1.2973,
"step": 201
},
{
"epoch": 0.9853658536585366,
"grad_norm": 1.1985281705856323,
"learning_rate": 4.882332226625208e-06,
"loss": 0.7734,
"step": 202
},
{
"epoch": 0.9902439024390244,
"grad_norm": 0.9396414160728455,
"learning_rate": 4.881167878188158e-06,
"loss": 0.8172,
"step": 203
},
{
"epoch": 0.9951219512195122,
"grad_norm": 1.3293070793151855,
"learning_rate": 4.8799979375606565e-06,
"loss": 0.727,
"step": 204
},
{
"epoch": 1.0,
"grad_norm": 1.0133116245269775,
"learning_rate": 4.878822407490319e-06,
"loss": 0.7099,
"step": 205
},
{
"epoch": 1.0048780487804878,
"grad_norm": 1.0823330879211426,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.6611,
"step": 206
},
{
"epoch": 1.0097560975609756,
"grad_norm": 0.9297066926956177,
"learning_rate": 4.876454590077216e-06,
"loss": 0.5825,
"step": 207
},
{
"epoch": 1.0146341463414634,
"grad_norm": 1.0025253295898438,
"learning_rate": 4.875262308295289e-06,
"loss": 0.8154,
"step": 208
},
{
"epoch": 1.0195121951219512,
"grad_norm": 1.1356369256973267,
"learning_rate": 4.874064448192185e-06,
"loss": 0.7126,
"step": 209
},
{
"epoch": 1.024390243902439,
"grad_norm": 1.4361388683319092,
"learning_rate": 4.872861012581088e-06,
"loss": 0.6323,
"step": 210
},
{
"epoch": 1.0292682926829269,
"grad_norm": 1.2314097881317139,
"learning_rate": 4.871652004288275e-06,
"loss": 0.6683,
"step": 211
},
{
"epoch": 1.0341463414634147,
"grad_norm": 1.1820945739746094,
"learning_rate": 4.870437426153113e-06,
"loss": 0.6659,
"step": 212
},
{
"epoch": 1.0390243902439025,
"grad_norm": 1.1955009698867798,
"learning_rate": 4.869217281028045e-06,
"loss": 0.8964,
"step": 213
},
{
"epoch": 1.0439024390243903,
"grad_norm": 1.2589020729064941,
"learning_rate": 4.867991571778592e-06,
"loss": 0.894,
"step": 214
},
{
"epoch": 1.048780487804878,
"grad_norm": 0.8933411836624146,
"learning_rate": 4.866760301283342e-06,
"loss": 0.5199,
"step": 215
},
{
"epoch": 1.053658536585366,
"grad_norm": 1.126219630241394,
"learning_rate": 4.865523472433942e-06,
"loss": 0.7014,
"step": 216
},
{
"epoch": 1.0585365853658537,
"grad_norm": 1.1173880100250244,
"learning_rate": 4.8642810881350935e-06,
"loss": 0.6085,
"step": 217
},
{
"epoch": 1.0634146341463415,
"grad_norm": 0.9508454203605652,
"learning_rate": 4.863033151304546e-06,
"loss": 0.6333,
"step": 218
},
{
"epoch": 1.0682926829268293,
"grad_norm": 1.8204004764556885,
"learning_rate": 4.861779664873088e-06,
"loss": 0.8899,
"step": 219
},
{
"epoch": 1.0731707317073171,
"grad_norm": 1.5721564292907715,
"learning_rate": 4.8605206317845425e-06,
"loss": 0.8105,
"step": 220
},
{
"epoch": 1.078048780487805,
"grad_norm": 1.3572758436203003,
"learning_rate": 4.859256054995758e-06,
"loss": 0.8634,
"step": 221
},
{
"epoch": 1.0829268292682928,
"grad_norm": 0.8531430959701538,
"learning_rate": 4.8579859374766e-06,
"loss": 0.4826,
"step": 222
},
{
"epoch": 1.0878048780487806,
"grad_norm": 0.771088719367981,
"learning_rate": 4.856710282209952e-06,
"loss": 0.4085,
"step": 223
},
{
"epoch": 1.0926829268292684,
"grad_norm": 1.0056614875793457,
"learning_rate": 4.855429092191698e-06,
"loss": 0.7168,
"step": 224
},
{
"epoch": 1.0975609756097562,
"grad_norm": 0.9825501441955566,
"learning_rate": 4.854142370430725e-06,
"loss": 0.6276,
"step": 225
},
{
"epoch": 1.102439024390244,
"grad_norm": 1.0209708213806152,
"learning_rate": 4.8528501199489045e-06,
"loss": 0.6521,
"step": 226
},
{
"epoch": 1.1073170731707318,
"grad_norm": 1.3162622451782227,
"learning_rate": 4.851552343781099e-06,
"loss": 0.8244,
"step": 227
},
{
"epoch": 1.1121951219512196,
"grad_norm": 1.150299072265625,
"learning_rate": 4.850249044975145e-06,
"loss": 0.7804,
"step": 228
},
{
"epoch": 1.1170731707317074,
"grad_norm": 1.142996907234192,
"learning_rate": 4.848940226591849e-06,
"loss": 0.9935,
"step": 229
},
{
"epoch": 1.1219512195121952,
"grad_norm": 0.9757675528526306,
"learning_rate": 4.847625891704982e-06,
"loss": 0.6041,
"step": 230
},
{
"epoch": 1.126829268292683,
"grad_norm": 1.1686437129974365,
"learning_rate": 4.846306043401268e-06,
"loss": 0.6873,
"step": 231
},
{
"epoch": 1.1317073170731708,
"grad_norm": 1.4826396703720093,
"learning_rate": 4.844980684780381e-06,
"loss": 0.6412,
"step": 232
},
{
"epoch": 1.1365853658536587,
"grad_norm": 1.204198956489563,
"learning_rate": 4.8436498189549345e-06,
"loss": 0.5779,
"step": 233
},
{
"epoch": 1.1414634146341462,
"grad_norm": 0.8927454948425293,
"learning_rate": 4.842313449050477e-06,
"loss": 0.5894,
"step": 234
},
{
"epoch": 1.146341463414634,
"grad_norm": 0.9504485726356506,
"learning_rate": 4.840971578205486e-06,
"loss": 0.5195,
"step": 235
},
{
"epoch": 1.1512195121951219,
"grad_norm": 0.8682957291603088,
"learning_rate": 4.839624209571352e-06,
"loss": 0.3927,
"step": 236
},
{
"epoch": 1.1560975609756097,
"grad_norm": 1.0243151187896729,
"learning_rate": 4.838271346312381e-06,
"loss": 0.8549,
"step": 237
},
{
"epoch": 1.1609756097560975,
"grad_norm": 0.9931624531745911,
"learning_rate": 4.836912991605782e-06,
"loss": 0.8616,
"step": 238
},
{
"epoch": 1.1658536585365853,
"grad_norm": 1.0424671173095703,
"learning_rate": 4.835549148641663e-06,
"loss": 0.5675,
"step": 239
},
{
"epoch": 1.170731707317073,
"grad_norm": 0.9706572890281677,
"learning_rate": 4.834179820623018e-06,
"loss": 0.6444,
"step": 240
},
{
"epoch": 1.175609756097561,
"grad_norm": 0.9557904601097107,
"learning_rate": 4.832805010765724e-06,
"loss": 0.6006,
"step": 241
},
{
"epoch": 1.1804878048780487,
"grad_norm": 1.2297061681747437,
"learning_rate": 4.831424722298531e-06,
"loss": 0.6897,
"step": 242
},
{
"epoch": 1.1853658536585365,
"grad_norm": 1.3953443765640259,
"learning_rate": 4.830038958463061e-06,
"loss": 0.772,
"step": 243
},
{
"epoch": 1.1902439024390243,
"grad_norm": 2.120232105255127,
"learning_rate": 4.828647722513785e-06,
"loss": 0.9361,
"step": 244
},
{
"epoch": 1.1951219512195121,
"grad_norm": 1.4457300901412964,
"learning_rate": 4.827251017718034e-06,
"loss": 0.8036,
"step": 245
},
{
"epoch": 1.2,
"grad_norm": 1.0927690267562866,
"learning_rate": 4.8258488473559794e-06,
"loss": 0.8354,
"step": 246
},
{
"epoch": 1.2048780487804878,
"grad_norm": 1.4953900575637817,
"learning_rate": 4.824441214720629e-06,
"loss": 0.9083,
"step": 247
},
{
"epoch": 1.2097560975609756,
"grad_norm": 0.9229385852813721,
"learning_rate": 4.823028123117818e-06,
"loss": 0.441,
"step": 248
},
{
"epoch": 1.2146341463414634,
"grad_norm": 1.8001827001571655,
"learning_rate": 4.8216095758662015e-06,
"loss": 0.7975,
"step": 249
},
{
"epoch": 1.2195121951219512,
"grad_norm": 1.0904715061187744,
"learning_rate": 4.82018557629725e-06,
"loss": 0.8526,
"step": 250
},
{
"epoch": 1.224390243902439,
"grad_norm": 1.3482294082641602,
"learning_rate": 4.8187561277552376e-06,
"loss": 0.6198,
"step": 251
},
{
"epoch": 1.2292682926829268,
"grad_norm": 0.9257897138595581,
"learning_rate": 4.817321233597232e-06,
"loss": 0.8115,
"step": 252
},
{
"epoch": 1.2341463414634146,
"grad_norm": 1.1533488035202026,
"learning_rate": 4.815880897193095e-06,
"loss": 0.5524,
"step": 253
},
{
"epoch": 1.2390243902439024,
"grad_norm": 1.2778701782226562,
"learning_rate": 4.814435121925466e-06,
"loss": 0.802,
"step": 254
},
{
"epoch": 1.2439024390243902,
"grad_norm": 1.037063717842102,
"learning_rate": 4.812983911189761e-06,
"loss": 0.696,
"step": 255
},
{
"epoch": 1.248780487804878,
"grad_norm": 1.0112067461013794,
"learning_rate": 4.811527268394157e-06,
"loss": 0.5189,
"step": 256
},
{
"epoch": 1.2536585365853659,
"grad_norm": 1.1271675825119019,
"learning_rate": 4.810065196959591e-06,
"loss": 0.6292,
"step": 257
},
{
"epoch": 1.2585365853658537,
"grad_norm": 1.0987378358840942,
"learning_rate": 4.8085977003197496e-06,
"loss": 0.8452,
"step": 258
},
{
"epoch": 1.2634146341463415,
"grad_norm": 1.5386906862258911,
"learning_rate": 4.807124781921059e-06,
"loss": 1.1094,
"step": 259
},
{
"epoch": 1.2682926829268293,
"grad_norm": 0.9946249127388,
"learning_rate": 4.805646445222679e-06,
"loss": 0.6367,
"step": 260
},
{
"epoch": 1.273170731707317,
"grad_norm": 0.9774323105812073,
"learning_rate": 4.804162693696494e-06,
"loss": 0.6157,
"step": 261
},
{
"epoch": 1.278048780487805,
"grad_norm": 1.9447377920150757,
"learning_rate": 4.802673530827105e-06,
"loss": 0.6076,
"step": 262
},
{
"epoch": 1.2829268292682927,
"grad_norm": 0.7815698981285095,
"learning_rate": 4.801178960111823e-06,
"loss": 0.5985,
"step": 263
},
{
"epoch": 1.2878048780487805,
"grad_norm": 1.014426589012146,
"learning_rate": 4.799678985060658e-06,
"loss": 0.851,
"step": 264
},
{
"epoch": 1.2926829268292683,
"grad_norm": 0.9904977083206177,
"learning_rate": 4.798173609196314e-06,
"loss": 0.8511,
"step": 265
},
{
"epoch": 1.2975609756097561,
"grad_norm": 1.0354292392730713,
"learning_rate": 4.796662836054176e-06,
"loss": 0.4885,
"step": 266
},
{
"epoch": 1.302439024390244,
"grad_norm": 0.9485008120536804,
"learning_rate": 4.795146669182304e-06,
"loss": 0.6486,
"step": 267
},
{
"epoch": 1.3073170731707318,
"grad_norm": 0.9609008431434631,
"learning_rate": 4.793625112141431e-06,
"loss": 0.5446,
"step": 268
},
{
"epoch": 1.3121951219512196,
"grad_norm": 0.8880930542945862,
"learning_rate": 4.792098168504943e-06,
"loss": 0.5741,
"step": 269
},
{
"epoch": 1.3170731707317074,
"grad_norm": 0.8475173711776733,
"learning_rate": 4.790565841858879e-06,
"loss": 0.488,
"step": 270
},
{
"epoch": 1.3219512195121952,
"grad_norm": 1.04447603225708,
"learning_rate": 4.789028135801919e-06,
"loss": 0.7792,
"step": 271
},
{
"epoch": 1.326829268292683,
"grad_norm": 1.2021688222885132,
"learning_rate": 4.787485053945377e-06,
"loss": 0.8513,
"step": 272
},
{
"epoch": 1.3317073170731708,
"grad_norm": 0.9609706401824951,
"learning_rate": 4.785936599913193e-06,
"loss": 0.7186,
"step": 273
},
{
"epoch": 1.3365853658536586,
"grad_norm": 0.9922477602958679,
"learning_rate": 4.784382777341922e-06,
"loss": 0.7355,
"step": 274
},
{
"epoch": 1.3414634146341464,
"grad_norm": 1.2870302200317383,
"learning_rate": 4.782823589880729e-06,
"loss": 1.0985,
"step": 275
},
{
"epoch": 1.346341463414634,
"grad_norm": 0.9369707107543945,
"learning_rate": 4.7812590411913755e-06,
"loss": 0.7729,
"step": 276
},
{
"epoch": 1.3512195121951218,
"grad_norm": 1.1406941413879395,
"learning_rate": 4.779689134948217e-06,
"loss": 1.1142,
"step": 277
},
{
"epoch": 1.3560975609756096,
"grad_norm": 1.1078243255615234,
"learning_rate": 4.77811387483819e-06,
"loss": 0.5348,
"step": 278
},
{
"epoch": 1.3609756097560974,
"grad_norm": 1.1744791269302368,
"learning_rate": 4.776533264560804e-06,
"loss": 0.7202,
"step": 279
},
{
"epoch": 1.3658536585365852,
"grad_norm": 1.2643998861312866,
"learning_rate": 4.774947307828134e-06,
"loss": 0.9659,
"step": 280
},
{
"epoch": 1.370731707317073,
"grad_norm": 1.069071888923645,
"learning_rate": 4.773356008364812e-06,
"loss": 0.6257,
"step": 281
},
{
"epoch": 1.3756097560975609,
"grad_norm": 0.8661286234855652,
"learning_rate": 4.771759369908017e-06,
"loss": 0.4138,
"step": 282
},
{
"epoch": 1.3804878048780487,
"grad_norm": 0.9560813903808594,
"learning_rate": 4.7701573962074635e-06,
"loss": 0.6435,
"step": 283
},
{
"epoch": 1.3853658536585365,
"grad_norm": 1.0229038000106812,
"learning_rate": 4.7685500910254015e-06,
"loss": 0.5709,
"step": 284
},
{
"epoch": 1.3902439024390243,
"grad_norm": 1.7641900777816772,
"learning_rate": 4.766937458136598e-06,
"loss": 0.7815,
"step": 285
},
{
"epoch": 1.395121951219512,
"grad_norm": 1.011093258857727,
"learning_rate": 4.765319501328332e-06,
"loss": 0.7428,
"step": 286
},
{
"epoch": 1.4,
"grad_norm": 1.0194127559661865,
"learning_rate": 4.763696224400391e-06,
"loss": 0.542,
"step": 287
},
{
"epoch": 1.4048780487804877,
"grad_norm": 0.9469794034957886,
"learning_rate": 4.762067631165049e-06,
"loss": 0.5758,
"step": 288
},
{
"epoch": 1.4097560975609755,
"grad_norm": 1.1177825927734375,
"learning_rate": 4.760433725447071e-06,
"loss": 0.8141,
"step": 289
},
{
"epoch": 1.4146341463414633,
"grad_norm": 1.388083577156067,
"learning_rate": 4.758794511083697e-06,
"loss": 0.8286,
"step": 290
},
{
"epoch": 1.4195121951219511,
"grad_norm": 1.1623952388763428,
"learning_rate": 4.757149991924633e-06,
"loss": 0.6733,
"step": 291
},
{
"epoch": 1.424390243902439,
"grad_norm": 1.328235149383545,
"learning_rate": 4.755500171832045e-06,
"loss": 0.5397,
"step": 292
},
{
"epoch": 1.4292682926829268,
"grad_norm": 1.0121268033981323,
"learning_rate": 4.753845054680548e-06,
"loss": 0.6813,
"step": 293
},
{
"epoch": 1.4341463414634146,
"grad_norm": 1.0799837112426758,
"learning_rate": 4.752184644357197e-06,
"loss": 0.5136,
"step": 294
},
{
"epoch": 1.4390243902439024,
"grad_norm": 1.010602593421936,
"learning_rate": 4.750518944761477e-06,
"loss": 0.5768,
"step": 295
},
{
"epoch": 1.4439024390243902,
"grad_norm": 1.1127134561538696,
"learning_rate": 4.748847959805297e-06,
"loss": 0.5663,
"step": 296
},
{
"epoch": 1.448780487804878,
"grad_norm": 0.9182597398757935,
"learning_rate": 4.7471716934129774e-06,
"loss": 0.5599,
"step": 297
},
{
"epoch": 1.4536585365853658,
"grad_norm": 1.0173683166503906,
"learning_rate": 4.745490149521242e-06,
"loss": 0.5155,
"step": 298
},
{
"epoch": 1.4585365853658536,
"grad_norm": 0.9683080911636353,
"learning_rate": 4.743803332079209e-06,
"loss": 0.5744,
"step": 299
},
{
"epoch": 1.4634146341463414,
"grad_norm": 0.9615181684494019,
"learning_rate": 4.742111245048382e-06,
"loss": 0.5961,
"step": 300
},
{
"epoch": 1.4682926829268292,
"grad_norm": 1.1113585233688354,
"learning_rate": 4.740413892402639e-06,
"loss": 0.5751,
"step": 301
},
{
"epoch": 1.473170731707317,
"grad_norm": 1.1533280611038208,
"learning_rate": 4.738711278128228e-06,
"loss": 0.6668,
"step": 302
},
{
"epoch": 1.4780487804878049,
"grad_norm": 1.086147665977478,
"learning_rate": 4.7370034062237476e-06,
"loss": 0.4249,
"step": 303
},
{
"epoch": 1.4829268292682927,
"grad_norm": 3.0501999855041504,
"learning_rate": 4.73529028070015e-06,
"loss": 0.6284,
"step": 304
},
{
"epoch": 1.4878048780487805,
"grad_norm": 1.2545751333236694,
"learning_rate": 4.733571905580723e-06,
"loss": 0.8968,
"step": 305
},
{
"epoch": 1.4926829268292683,
"grad_norm": 0.9740838408470154,
"learning_rate": 4.731848284901082e-06,
"loss": 0.7402,
"step": 306
},
{
"epoch": 1.497560975609756,
"grad_norm": 1.0430322885513306,
"learning_rate": 4.730119422709165e-06,
"loss": 0.4697,
"step": 307
},
{
"epoch": 1.502439024390244,
"grad_norm": 1.2707469463348389,
"learning_rate": 4.728385323065215e-06,
"loss": 0.7548,
"step": 308
},
{
"epoch": 1.5073170731707317,
"grad_norm": 0.9956101775169373,
"learning_rate": 4.7266459900417815e-06,
"loss": 0.5444,
"step": 309
},
{
"epoch": 1.5121951219512195,
"grad_norm": 1.0116164684295654,
"learning_rate": 4.724901427723698e-06,
"loss": 0.7939,
"step": 310
},
{
"epoch": 1.5170731707317073,
"grad_norm": 1.0668343305587769,
"learning_rate": 4.723151640208084e-06,
"loss": 0.3966,
"step": 311
},
{
"epoch": 1.5219512195121951,
"grad_norm": 0.9897716045379639,
"learning_rate": 4.721396631604327e-06,
"loss": 0.4675,
"step": 312
},
{
"epoch": 1.526829268292683,
"grad_norm": 0.9885281920433044,
"learning_rate": 4.7196364060340785e-06,
"loss": 0.5411,
"step": 313
},
{
"epoch": 1.5317073170731708,
"grad_norm": 1.1385118961334229,
"learning_rate": 4.7178709676312416e-06,
"loss": 0.8041,
"step": 314
},
{
"epoch": 1.5365853658536586,
"grad_norm": 1.2253623008728027,
"learning_rate": 4.716100320541961e-06,
"loss": 1.0583,
"step": 315
},
{
"epoch": 1.5414634146341464,
"grad_norm": 1.1313822269439697,
"learning_rate": 4.714324468924614e-06,
"loss": 0.7701,
"step": 316
},
{
"epoch": 1.5463414634146342,
"grad_norm": 1.120343804359436,
"learning_rate": 4.712543416949803e-06,
"loss": 0.7407,
"step": 317
},
{
"epoch": 1.551219512195122,
"grad_norm": 1.5084882974624634,
"learning_rate": 4.71075716880034e-06,
"loss": 0.753,
"step": 318
},
{
"epoch": 1.5560975609756098,
"grad_norm": 1.3213189840316772,
"learning_rate": 4.708965728671243e-06,
"loss": 0.8935,
"step": 319
},
{
"epoch": 1.5609756097560976,
"grad_norm": 1.170746922492981,
"learning_rate": 4.7071691007697214e-06,
"loss": 0.6782,
"step": 320
},
{
"epoch": 1.5658536585365854,
"grad_norm": 1.2023199796676636,
"learning_rate": 4.705367289315172e-06,
"loss": 0.8011,
"step": 321
},
{
"epoch": 1.5707317073170732,
"grad_norm": 1.0213698148727417,
"learning_rate": 4.703560298539158e-06,
"loss": 0.5011,
"step": 322
},
{
"epoch": 1.575609756097561,
"grad_norm": 1.045581579208374,
"learning_rate": 4.701748132685415e-06,
"loss": 0.5501,
"step": 323
},
{
"epoch": 1.5804878048780489,
"grad_norm": 0.9141654372215271,
"learning_rate": 4.699930796009825e-06,
"loss": 0.5551,
"step": 324
},
{
"epoch": 1.5853658536585367,
"grad_norm": 1.3005549907684326,
"learning_rate": 4.698108292780418e-06,
"loss": 0.7293,
"step": 325
},
{
"epoch": 1.5902439024390245,
"grad_norm": 0.985907793045044,
"learning_rate": 4.696280627277356e-06,
"loss": 0.5366,
"step": 326
},
{
"epoch": 1.5951219512195123,
"grad_norm": 0.9095384478569031,
"learning_rate": 4.6944478037929255e-06,
"loss": 0.5508,
"step": 327
},
{
"epoch": 1.6,
"grad_norm": 1.346676230430603,
"learning_rate": 4.692609826631525e-06,
"loss": 0.6719,
"step": 328
},
{
"epoch": 1.604878048780488,
"grad_norm": 1.088921308517456,
"learning_rate": 4.690766700109659e-06,
"loss": 0.4088,
"step": 329
},
{
"epoch": 1.6097560975609757,
"grad_norm": 0.8905205726623535,
"learning_rate": 4.6889184285559234e-06,
"loss": 0.4671,
"step": 330
},
{
"epoch": 1.6146341463414635,
"grad_norm": 1.2066144943237305,
"learning_rate": 4.687065016310996e-06,
"loss": 0.7891,
"step": 331
},
{
"epoch": 1.6195121951219513,
"grad_norm": 1.0449296236038208,
"learning_rate": 4.685206467727631e-06,
"loss": 0.6103,
"step": 332
},
{
"epoch": 1.6243902439024391,
"grad_norm": 1.15915048122406,
"learning_rate": 4.683342787170644e-06,
"loss": 0.605,
"step": 333
},
{
"epoch": 1.629268292682927,
"grad_norm": 1.0918726921081543,
"learning_rate": 4.6814739790169006e-06,
"loss": 0.5444,
"step": 334
},
{
"epoch": 1.6341463414634148,
"grad_norm": 1.0298805236816406,
"learning_rate": 4.679600047655313e-06,
"loss": 0.7902,
"step": 335
},
{
"epoch": 1.6390243902439026,
"grad_norm": 1.3017504215240479,
"learning_rate": 4.6777209974868194e-06,
"loss": 1.1195,
"step": 336
},
{
"epoch": 1.6439024390243904,
"grad_norm": 1.45652174949646,
"learning_rate": 4.675836832924387e-06,
"loss": 0.6358,
"step": 337
},
{
"epoch": 1.6487804878048782,
"grad_norm": 0.8610002398490906,
"learning_rate": 4.673947558392989e-06,
"loss": 0.4196,
"step": 338
},
{
"epoch": 1.653658536585366,
"grad_norm": 0.8891443014144897,
"learning_rate": 4.6720531783296e-06,
"loss": 0.5593,
"step": 339
},
{
"epoch": 1.6585365853658538,
"grad_norm": 0.9679135680198669,
"learning_rate": 4.670153697183185e-06,
"loss": 0.6149,
"step": 340
},
{
"epoch": 1.6634146341463416,
"grad_norm": 1.019687294960022,
"learning_rate": 4.668249119414692e-06,
"loss": 0.5855,
"step": 341
},
{
"epoch": 1.6682926829268294,
"grad_norm": 0.9645085334777832,
"learning_rate": 4.666339449497033e-06,
"loss": 0.6321,
"step": 342
},
{
"epoch": 1.6731707317073172,
"grad_norm": 1.1760913133621216,
"learning_rate": 4.664424691915084e-06,
"loss": 0.5839,
"step": 343
},
{
"epoch": 1.678048780487805,
"grad_norm": 1.1706181764602661,
"learning_rate": 4.6625048511656675e-06,
"loss": 0.5806,
"step": 344
},
{
"epoch": 1.6829268292682928,
"grad_norm": 1.1575871706008911,
"learning_rate": 4.660579931757543e-06,
"loss": 0.4945,
"step": 345
},
{
"epoch": 1.6878048780487804,
"grad_norm": 0.9929284453392029,
"learning_rate": 4.6586499382113985e-06,
"loss": 0.5662,
"step": 346
},
{
"epoch": 1.6926829268292682,
"grad_norm": 0.9940921068191528,
"learning_rate": 4.6567148750598375e-06,
"loss": 0.8166,
"step": 347
},
{
"epoch": 1.697560975609756,
"grad_norm": 1.1893478631973267,
"learning_rate": 4.6547747468473705e-06,
"loss": 0.9291,
"step": 348
},
{
"epoch": 1.7024390243902439,
"grad_norm": 0.8776846528053284,
"learning_rate": 4.652829558130404e-06,
"loss": 0.4457,
"step": 349
},
{
"epoch": 1.7073170731707317,
"grad_norm": 1.2133142948150635,
"learning_rate": 4.6508793134772265e-06,
"loss": 0.6364,
"step": 350
},
{
"epoch": 1.7121951219512195,
"grad_norm": 0.8865175247192383,
"learning_rate": 4.648924017468003e-06,
"loss": 0.5514,
"step": 351
},
{
"epoch": 1.7170731707317073,
"grad_norm": 1.3637226819992065,
"learning_rate": 4.646963674694761e-06,
"loss": 0.8656,
"step": 352
},
{
"epoch": 1.721951219512195,
"grad_norm": 0.875629723072052,
"learning_rate": 4.64499828976138e-06,
"loss": 0.3992,
"step": 353
},
{
"epoch": 1.726829268292683,
"grad_norm": 1.0361976623535156,
"learning_rate": 4.64302786728358e-06,
"loss": 0.5056,
"step": 354
},
{
"epoch": 1.7317073170731707,
"grad_norm": 0.9224256873130798,
"learning_rate": 4.641052411888913e-06,
"loss": 0.5352,
"step": 355
},
{
"epoch": 1.7365853658536585,
"grad_norm": 1.2034342288970947,
"learning_rate": 4.6390719282167515e-06,
"loss": 0.4753,
"step": 356
},
{
"epoch": 1.7414634146341463,
"grad_norm": 1.056547999382019,
"learning_rate": 4.637086420918276e-06,
"loss": 0.7975,
"step": 357
},
{
"epoch": 1.7463414634146341,
"grad_norm": 0.9398707151412964,
"learning_rate": 4.635095894656465e-06,
"loss": 0.6944,
"step": 358
},
{
"epoch": 1.751219512195122,
"grad_norm": 1.3796380758285522,
"learning_rate": 4.633100354106085e-06,
"loss": 0.377,
"step": 359
},
{
"epoch": 1.7560975609756098,
"grad_norm": 1.1999366283416748,
"learning_rate": 4.631099803953677e-06,
"loss": 0.8488,
"step": 360
},
{
"epoch": 1.7609756097560976,
"grad_norm": 1.1613068580627441,
"learning_rate": 4.629094248897546e-06,
"loss": 0.4856,
"step": 361
},
{
"epoch": 1.7658536585365854,
"grad_norm": 1.407758116722107,
"learning_rate": 4.627083693647757e-06,
"loss": 0.5932,
"step": 362
},
{
"epoch": 1.7707317073170732,
"grad_norm": 1.1262705326080322,
"learning_rate": 4.625068142926111e-06,
"loss": 0.862,
"step": 363
},
{
"epoch": 1.775609756097561,
"grad_norm": 1.0207730531692505,
"learning_rate": 4.623047601466144e-06,
"loss": 0.7695,
"step": 364
},
{
"epoch": 1.7804878048780488,
"grad_norm": 1.0084431171417236,
"learning_rate": 4.621022074013114e-06,
"loss": 0.6608,
"step": 365
},
{
"epoch": 1.7853658536585366,
"grad_norm": 1.044545292854309,
"learning_rate": 4.618991565323987e-06,
"loss": 0.5231,
"step": 366
},
{
"epoch": 1.7902439024390244,
"grad_norm": 0.9962389469146729,
"learning_rate": 4.616956080167426e-06,
"loss": 0.5736,
"step": 367
},
{
"epoch": 1.7951219512195122,
"grad_norm": 1.3212288618087769,
"learning_rate": 4.614915623323786e-06,
"loss": 0.9586,
"step": 368
},
{
"epoch": 1.8,
"grad_norm": 0.9554499983787537,
"learning_rate": 4.612870199585092e-06,
"loss": 0.4346,
"step": 369
},
{
"epoch": 1.8048780487804879,
"grad_norm": 1.082366943359375,
"learning_rate": 4.610819813755038e-06,
"loss": 0.505,
"step": 370
},
{
"epoch": 1.8097560975609757,
"grad_norm": 1.225993037223816,
"learning_rate": 4.608764470648971e-06,
"loss": 0.4801,
"step": 371
},
{
"epoch": 1.8146341463414632,
"grad_norm": 1.0881706476211548,
"learning_rate": 4.606704175093879e-06,
"loss": 0.4478,
"step": 372
},
{
"epoch": 1.819512195121951,
"grad_norm": 1.114046335220337,
"learning_rate": 4.604638931928383e-06,
"loss": 0.8015,
"step": 373
},
{
"epoch": 1.8243902439024389,
"grad_norm": 0.9838706851005554,
"learning_rate": 4.602568746002718e-06,
"loss": 0.5204,
"step": 374
},
{
"epoch": 1.8292682926829267,
"grad_norm": 1.038713812828064,
"learning_rate": 4.600493622178734e-06,
"loss": 0.8388,
"step": 375
},
{
"epoch": 1.8341463414634145,
"grad_norm": 1.0684245824813843,
"learning_rate": 4.598413565329876e-06,
"loss": 0.5744,
"step": 376
},
{
"epoch": 1.8390243902439023,
"grad_norm": 0.8907456994056702,
"learning_rate": 4.596328580341169e-06,
"loss": 0.5621,
"step": 377
},
{
"epoch": 1.84390243902439,
"grad_norm": 0.9921515583992004,
"learning_rate": 4.5942386721092195e-06,
"loss": 0.6967,
"step": 378
},
{
"epoch": 1.848780487804878,
"grad_norm": 1.1683647632598877,
"learning_rate": 4.592143845542189e-06,
"loss": 0.6489,
"step": 379
},
{
"epoch": 1.8536585365853657,
"grad_norm": 1.0409291982650757,
"learning_rate": 4.590044105559797e-06,
"loss": 0.8945,
"step": 380
},
{
"epoch": 1.8585365853658535,
"grad_norm": 1.0684564113616943,
"learning_rate": 4.587939457093296e-06,
"loss": 0.5476,
"step": 381
},
{
"epoch": 1.8634146341463413,
"grad_norm": 1.3661733865737915,
"learning_rate": 4.585829905085468e-06,
"loss": 0.6763,
"step": 382
},
{
"epoch": 1.8682926829268292,
"grad_norm": 1.1465227603912354,
"learning_rate": 4.5837154544906135e-06,
"loss": 0.7817,
"step": 383
},
{
"epoch": 1.873170731707317,
"grad_norm": 1.0232677459716797,
"learning_rate": 4.581596110274535e-06,
"loss": 0.7276,
"step": 384
},
{
"epoch": 1.8780487804878048,
"grad_norm": 1.0359809398651123,
"learning_rate": 4.579471877414527e-06,
"loss": 0.9635,
"step": 385
},
{
"epoch": 1.8829268292682926,
"grad_norm": 1.2472409009933472,
"learning_rate": 4.577342760899368e-06,
"loss": 0.6782,
"step": 386
},
{
"epoch": 1.8878048780487804,
"grad_norm": 0.9241912961006165,
"learning_rate": 4.575208765729302e-06,
"loss": 0.5327,
"step": 387
},
{
"epoch": 1.8926829268292682,
"grad_norm": 1.3745805025100708,
"learning_rate": 4.573069896916035e-06,
"loss": 0.8436,
"step": 388
},
{
"epoch": 1.897560975609756,
"grad_norm": 1.0620322227478027,
"learning_rate": 4.5709261594827125e-06,
"loss": 0.659,
"step": 389
},
{
"epoch": 1.9024390243902438,
"grad_norm": 1.3068687915802002,
"learning_rate": 4.568777558463922e-06,
"loss": 0.5219,
"step": 390
},
{
"epoch": 1.9073170731707316,
"grad_norm": 1.0368698835372925,
"learning_rate": 4.566624098905665e-06,
"loss": 0.7099,
"step": 391
},
{
"epoch": 1.9121951219512194,
"grad_norm": 1.0876407623291016,
"learning_rate": 4.564465785865359e-06,
"loss": 0.5276,
"step": 392
},
{
"epoch": 1.9170731707317072,
"grad_norm": 0.9230280518531799,
"learning_rate": 4.56230262441182e-06,
"loss": 0.4292,
"step": 393
},
{
"epoch": 1.921951219512195,
"grad_norm": 0.9985240697860718,
"learning_rate": 4.560134619625247e-06,
"loss": 0.6602,
"step": 394
},
{
"epoch": 1.9268292682926829,
"grad_norm": 0.8872730135917664,
"learning_rate": 4.5579617765972155e-06,
"loss": 0.5711,
"step": 395
},
{
"epoch": 1.9317073170731707,
"grad_norm": 0.9474479556083679,
"learning_rate": 4.555784100430662e-06,
"loss": 0.5299,
"step": 396
},
{
"epoch": 1.9365853658536585,
"grad_norm": 1.0377358198165894,
"learning_rate": 4.553601596239877e-06,
"loss": 0.4649,
"step": 397
},
{
"epoch": 1.9414634146341463,
"grad_norm": 1.0765342712402344,
"learning_rate": 4.551414269150489e-06,
"loss": 0.5578,
"step": 398
},
{
"epoch": 1.946341463414634,
"grad_norm": 1.0844486951828003,
"learning_rate": 4.54922212429945e-06,
"loss": 0.5486,
"step": 399
},
{
"epoch": 1.951219512195122,
"grad_norm": 1.1640657186508179,
"learning_rate": 4.547025166835027e-06,
"loss": 0.7462,
"step": 400
},
{
"epoch": 1.9560975609756097,
"grad_norm": 1.2733076810836792,
"learning_rate": 4.544823401916794e-06,
"loss": 0.881,
"step": 401
},
{
"epoch": 1.9609756097560975,
"grad_norm": 1.155869960784912,
"learning_rate": 4.542616834715612e-06,
"loss": 0.6374,
"step": 402
},
{
"epoch": 1.9658536585365853,
"grad_norm": 0.8623374700546265,
"learning_rate": 4.540405470413618e-06,
"loss": 0.4188,
"step": 403
},
{
"epoch": 1.9707317073170731,
"grad_norm": 1.0502699613571167,
"learning_rate": 4.53818931420422e-06,
"loss": 0.7233,
"step": 404
},
{
"epoch": 1.975609756097561,
"grad_norm": 0.8630202412605286,
"learning_rate": 4.535968371292076e-06,
"loss": 0.5896,
"step": 405
},
{
"epoch": 1.9804878048780488,
"grad_norm": 1.1017824411392212,
"learning_rate": 4.533742646893086e-06,
"loss": 0.6971,
"step": 406
},
{
"epoch": 1.9853658536585366,
"grad_norm": 0.9387734532356262,
"learning_rate": 4.531512146234383e-06,
"loss": 0.6718,
"step": 407
},
{
"epoch": 1.9902439024390244,
"grad_norm": 1.0347439050674438,
"learning_rate": 4.529276874554312e-06,
"loss": 0.8829,
"step": 408
},
{
"epoch": 1.9951219512195122,
"grad_norm": 1.0173542499542236,
"learning_rate": 4.527036837102426e-06,
"loss": 0.5154,
"step": 409
},
{
"epoch": 2.0,
"grad_norm": 1.1297523975372314,
"learning_rate": 4.524792039139471e-06,
"loss": 0.7721,
"step": 410
},
{
"epoch": 2.004878048780488,
"grad_norm": 0.9763960242271423,
"learning_rate": 4.522542485937369e-06,
"loss": 0.3978,
"step": 411
},
{
"epoch": 2.0097560975609756,
"grad_norm": 0.9650730490684509,
"learning_rate": 4.520288182779214e-06,
"loss": 0.6826,
"step": 412
},
{
"epoch": 2.0146341463414634,
"grad_norm": 0.7682514190673828,
"learning_rate": 4.518029134959253e-06,
"loss": 0.351,
"step": 413
},
{
"epoch": 2.0195121951219512,
"grad_norm": 1.0681227445602417,
"learning_rate": 4.515765347782878e-06,
"loss": 0.6467,
"step": 414
},
{
"epoch": 2.024390243902439,
"grad_norm": 1.2426350116729736,
"learning_rate": 4.5134968265666085e-06,
"loss": 0.8831,
"step": 415
},
{
"epoch": 2.029268292682927,
"grad_norm": 0.9794759154319763,
"learning_rate": 4.511223576638084e-06,
"loss": 0.6419,
"step": 416
},
{
"epoch": 2.0341463414634147,
"grad_norm": 1.119649887084961,
"learning_rate": 4.508945603336049e-06,
"loss": 0.8023,
"step": 417
},
{
"epoch": 2.0390243902439025,
"grad_norm": 0.858971118927002,
"learning_rate": 4.50666291201034e-06,
"loss": 0.4974,
"step": 418
},
{
"epoch": 2.0439024390243903,
"grad_norm": 1.0508594512939453,
"learning_rate": 4.504375508021876e-06,
"loss": 0.3603,
"step": 419
},
{
"epoch": 2.048780487804878,
"grad_norm": 0.9566763043403625,
"learning_rate": 4.50208339674264e-06,
"loss": 0.7813,
"step": 420
},
{
"epoch": 2.053658536585366,
"grad_norm": 1.074040412902832,
"learning_rate": 4.499786583555675e-06,
"loss": 0.8065,
"step": 421
},
{
"epoch": 2.0585365853658537,
"grad_norm": 0.8816580772399902,
"learning_rate": 4.497485073855061e-06,
"loss": 0.4439,
"step": 422
},
{
"epoch": 2.0634146341463415,
"grad_norm": 1.0733896493911743,
"learning_rate": 4.495178873045913e-06,
"loss": 0.4481,
"step": 423
},
{
"epoch": 2.0682926829268293,
"grad_norm": 0.9010451436042786,
"learning_rate": 4.4928679865443605e-06,
"loss": 0.4407,
"step": 424
},
{
"epoch": 2.073170731707317,
"grad_norm": 0.9359092712402344,
"learning_rate": 4.4905524197775366e-06,
"loss": 0.5847,
"step": 425
},
{
"epoch": 2.078048780487805,
"grad_norm": 0.9389141798019409,
"learning_rate": 4.4882321781835666e-06,
"loss": 0.5377,
"step": 426
},
{
"epoch": 2.0829268292682928,
"grad_norm": 0.89751797914505,
"learning_rate": 4.4859072672115565e-06,
"loss": 0.3879,
"step": 427
},
{
"epoch": 2.0878048780487806,
"grad_norm": 1.0244700908660889,
"learning_rate": 4.483577692321577e-06,
"loss": 0.8237,
"step": 428
},
{
"epoch": 2.0926829268292684,
"grad_norm": 1.0703409910202026,
"learning_rate": 4.481243458984651e-06,
"loss": 0.5826,
"step": 429
},
{
"epoch": 2.097560975609756,
"grad_norm": 0.9995833039283752,
"learning_rate": 4.478904572682743e-06,
"loss": 0.6091,
"step": 430
},
{
"epoch": 2.102439024390244,
"grad_norm": 0.8649471998214722,
"learning_rate": 4.476561038908745e-06,
"loss": 0.4863,
"step": 431
},
{
"epoch": 2.107317073170732,
"grad_norm": 1.1670926809310913,
"learning_rate": 4.474212863166464e-06,
"loss": 0.6584,
"step": 432
},
{
"epoch": 2.1121951219512196,
"grad_norm": 1.2743312120437622,
"learning_rate": 4.471860050970608e-06,
"loss": 0.6777,
"step": 433
},
{
"epoch": 2.1170731707317074,
"grad_norm": 1.2678401470184326,
"learning_rate": 4.469502607846774e-06,
"loss": 0.9609,
"step": 434
},
{
"epoch": 2.1219512195121952,
"grad_norm": 0.9796558618545532,
"learning_rate": 4.467140539331434e-06,
"loss": 0.4574,
"step": 435
},
{
"epoch": 2.126829268292683,
"grad_norm": 1.0830684900283813,
"learning_rate": 4.464773850971924e-06,
"loss": 0.3067,
"step": 436
},
{
"epoch": 2.131707317073171,
"grad_norm": 1.002589464187622,
"learning_rate": 4.46240254832643e-06,
"loss": 0.5383,
"step": 437
},
{
"epoch": 2.1365853658536587,
"grad_norm": 1.1145734786987305,
"learning_rate": 4.460026636963971e-06,
"loss": 0.6173,
"step": 438
},
{
"epoch": 2.1414634146341465,
"grad_norm": 0.94740891456604,
"learning_rate": 4.4576461224643965e-06,
"loss": 0.4991,
"step": 439
},
{
"epoch": 2.1463414634146343,
"grad_norm": 0.8613864183425903,
"learning_rate": 4.455261010418359e-06,
"loss": 0.3956,
"step": 440
},
{
"epoch": 2.151219512195122,
"grad_norm": 0.9509091377258301,
"learning_rate": 4.452871306427314e-06,
"loss": 0.7165,
"step": 441
},
{
"epoch": 2.15609756097561,
"grad_norm": 1.259364128112793,
"learning_rate": 4.450477016103498e-06,
"loss": 0.5682,
"step": 442
},
{
"epoch": 2.1609756097560977,
"grad_norm": 1.1716279983520508,
"learning_rate": 4.4480781450699205e-06,
"loss": 0.4917,
"step": 443
},
{
"epoch": 2.1658536585365855,
"grad_norm": 1.0395866632461548,
"learning_rate": 4.4456746989603464e-06,
"loss": 0.4338,
"step": 444
},
{
"epoch": 2.1707317073170733,
"grad_norm": 1.232602834701538,
"learning_rate": 4.443266683419289e-06,
"loss": 0.6356,
"step": 445
},
{
"epoch": 2.175609756097561,
"grad_norm": 1.254172921180725,
"learning_rate": 4.440854104101988e-06,
"loss": 0.472,
"step": 446
},
{
"epoch": 2.180487804878049,
"grad_norm": 1.2319004535675049,
"learning_rate": 4.438436966674406e-06,
"loss": 0.6408,
"step": 447
},
{
"epoch": 2.1853658536585368,
"grad_norm": 1.0100780725479126,
"learning_rate": 4.436015276813208e-06,
"loss": 0.4524,
"step": 448
},
{
"epoch": 2.1902439024390246,
"grad_norm": 0.9656887650489807,
"learning_rate": 4.4335890402057505e-06,
"loss": 0.5999,
"step": 449
},
{
"epoch": 2.1951219512195124,
"grad_norm": 1.1730879545211792,
"learning_rate": 4.431158262550067e-06,
"loss": 0.5953,
"step": 450
},
{
"epoch": 2.2,
"grad_norm": 0.9255422949790955,
"learning_rate": 4.428722949554858e-06,
"loss": 0.3487,
"step": 451
},
{
"epoch": 2.204878048780488,
"grad_norm": 0.9878072142601013,
"learning_rate": 4.426283106939474e-06,
"loss": 0.4937,
"step": 452
},
{
"epoch": 2.209756097560976,
"grad_norm": 0.982023298740387,
"learning_rate": 4.423838740433903e-06,
"loss": 0.6299,
"step": 453
},
{
"epoch": 2.2146341463414636,
"grad_norm": 0.7727266550064087,
"learning_rate": 4.4213898557787586e-06,
"loss": 0.2789,
"step": 454
},
{
"epoch": 2.2195121951219514,
"grad_norm": 1.5341951847076416,
"learning_rate": 4.4189364587252636e-06,
"loss": 0.8498,
"step": 455
},
{
"epoch": 2.2243902439024392,
"grad_norm": 1.1611250638961792,
"learning_rate": 4.416478555035241e-06,
"loss": 0.4075,
"step": 456
},
{
"epoch": 2.229268292682927,
"grad_norm": 1.0459867715835571,
"learning_rate": 4.4140161504810935e-06,
"loss": 0.4946,
"step": 457
},
{
"epoch": 2.234146341463415,
"grad_norm": 0.9366090297698975,
"learning_rate": 4.4115492508457986e-06,
"loss": 0.3479,
"step": 458
},
{
"epoch": 2.2390243902439027,
"grad_norm": 1.0325448513031006,
"learning_rate": 4.409077861922887e-06,
"loss": 0.5437,
"step": 459
},
{
"epoch": 2.2439024390243905,
"grad_norm": 0.9326527118682861,
"learning_rate": 4.406601989516435e-06,
"loss": 0.4594,
"step": 460
},
{
"epoch": 2.2487804878048783,
"grad_norm": 0.7127180099487305,
"learning_rate": 4.404121639441047e-06,
"loss": 0.3067,
"step": 461
},
{
"epoch": 2.253658536585366,
"grad_norm": 1.0416815280914307,
"learning_rate": 4.401636817521843e-06,
"loss": 0.5402,
"step": 462
},
{
"epoch": 2.258536585365854,
"grad_norm": 1.8258185386657715,
"learning_rate": 4.399147529594447e-06,
"loss": 0.3964,
"step": 463
},
{
"epoch": 2.2634146341463417,
"grad_norm": 0.9795071482658386,
"learning_rate": 4.3966537815049686e-06,
"loss": 0.5118,
"step": 464
},
{
"epoch": 2.2682926829268295,
"grad_norm": 1.1920483112335205,
"learning_rate": 4.394155579109994e-06,
"loss": 0.6511,
"step": 465
},
{
"epoch": 2.2731707317073173,
"grad_norm": 1.361159324645996,
"learning_rate": 4.391652928276572e-06,
"loss": 0.6874,
"step": 466
},
{
"epoch": 2.278048780487805,
"grad_norm": 0.9973228573799133,
"learning_rate": 4.389145834882195e-06,
"loss": 0.6057,
"step": 467
},
{
"epoch": 2.2829268292682925,
"grad_norm": 1.3514574766159058,
"learning_rate": 4.386634304814789e-06,
"loss": 0.4762,
"step": 468
},
{
"epoch": 2.2878048780487803,
"grad_norm": 1.2089687585830688,
"learning_rate": 4.384118343972704e-06,
"loss": 0.689,
"step": 469
},
{
"epoch": 2.292682926829268,
"grad_norm": 0.9414058327674866,
"learning_rate": 4.381597958264692e-06,
"loss": 0.7257,
"step": 470
},
{
"epoch": 2.297560975609756,
"grad_norm": 1.0120850801467896,
"learning_rate": 4.379073153609896e-06,
"loss": 0.7515,
"step": 471
},
{
"epoch": 2.3024390243902437,
"grad_norm": 1.586024522781372,
"learning_rate": 4.37654393593784e-06,
"loss": 0.64,
"step": 472
},
{
"epoch": 2.3073170731707315,
"grad_norm": 0.9766375422477722,
"learning_rate": 4.3740103111884096e-06,
"loss": 0.5162,
"step": 473
},
{
"epoch": 2.3121951219512193,
"grad_norm": 1.001685380935669,
"learning_rate": 4.371472285311842e-06,
"loss": 0.4464,
"step": 474
},
{
"epoch": 2.317073170731707,
"grad_norm": 0.7923868894577026,
"learning_rate": 4.368929864268709e-06,
"loss": 0.365,
"step": 475
},
{
"epoch": 2.321951219512195,
"grad_norm": 1.0744857788085938,
"learning_rate": 4.366383054029907e-06,
"loss": 0.6253,
"step": 476
},
{
"epoch": 2.3268292682926828,
"grad_norm": 0.9156374931335449,
"learning_rate": 4.363831860576638e-06,
"loss": 0.6512,
"step": 477
},
{
"epoch": 2.3317073170731706,
"grad_norm": 1.0944534540176392,
"learning_rate": 4.361276289900396e-06,
"loss": 0.5627,
"step": 478
},
{
"epoch": 2.3365853658536584,
"grad_norm": 1.1438108682632446,
"learning_rate": 4.358716348002962e-06,
"loss": 0.7402,
"step": 479
},
{
"epoch": 2.341463414634146,
"grad_norm": 1.1678388118743896,
"learning_rate": 4.356152040896376e-06,
"loss": 0.512,
"step": 480
},
{
"epoch": 2.346341463414634,
"grad_norm": 0.8130245208740234,
"learning_rate": 4.3535833746029335e-06,
"loss": 0.3934,
"step": 481
},
{
"epoch": 2.351219512195122,
"grad_norm": 1.229127287864685,
"learning_rate": 4.351010355155165e-06,
"loss": 0.4782,
"step": 482
},
{
"epoch": 2.3560975609756096,
"grad_norm": 0.9830904006958008,
"learning_rate": 4.348432988595828e-06,
"loss": 0.3879,
"step": 483
},
{
"epoch": 2.3609756097560974,
"grad_norm": 1.3584911823272705,
"learning_rate": 4.345851280977885e-06,
"loss": 0.7305,
"step": 484
},
{
"epoch": 2.3658536585365852,
"grad_norm": 1.0106158256530762,
"learning_rate": 4.343265238364496e-06,
"loss": 0.4247,
"step": 485
},
{
"epoch": 2.370731707317073,
"grad_norm": 1.1921676397323608,
"learning_rate": 4.340674866829001e-06,
"loss": 0.5446,
"step": 486
},
{
"epoch": 2.375609756097561,
"grad_norm": 1.1733497381210327,
"learning_rate": 4.338080172454908e-06,
"loss": 0.8386,
"step": 487
},
{
"epoch": 2.3804878048780487,
"grad_norm": 0.8914453387260437,
"learning_rate": 4.335481161335875e-06,
"loss": 0.5393,
"step": 488
},
{
"epoch": 2.3853658536585365,
"grad_norm": 1.122434377670288,
"learning_rate": 4.332877839575699e-06,
"loss": 0.4645,
"step": 489
},
{
"epoch": 2.3902439024390243,
"grad_norm": 1.078754186630249,
"learning_rate": 4.330270213288301e-06,
"loss": 0.6284,
"step": 490
},
{
"epoch": 2.395121951219512,
"grad_norm": 1.0400331020355225,
"learning_rate": 4.32765828859771e-06,
"loss": 0.8431,
"step": 491
},
{
"epoch": 2.4,
"grad_norm": 0.9895585775375366,
"learning_rate": 4.325042071638051e-06,
"loss": 0.6601,
"step": 492
},
{
"epoch": 2.4048780487804877,
"grad_norm": 1.1766878366470337,
"learning_rate": 4.322421568553529e-06,
"loss": 0.4766,
"step": 493
},
{
"epoch": 2.4097560975609755,
"grad_norm": 0.9032670855522156,
"learning_rate": 4.319796785498416e-06,
"loss": 0.4363,
"step": 494
},
{
"epoch": 2.4146341463414633,
"grad_norm": 0.9736960530281067,
"learning_rate": 4.317167728637032e-06,
"loss": 0.6109,
"step": 495
},
{
"epoch": 2.419512195121951,
"grad_norm": 0.7923660278320312,
"learning_rate": 4.314534404143738e-06,
"loss": 0.4987,
"step": 496
},
{
"epoch": 2.424390243902439,
"grad_norm": 0.9087777137756348,
"learning_rate": 4.3118968182029155e-06,
"loss": 0.6411,
"step": 497
},
{
"epoch": 2.4292682926829268,
"grad_norm": 0.851117730140686,
"learning_rate": 4.3092549770089566e-06,
"loss": 0.3541,
"step": 498
},
{
"epoch": 2.4341463414634146,
"grad_norm": 0.9581378102302551,
"learning_rate": 4.306608886766243e-06,
"loss": 0.5448,
"step": 499
},
{
"epoch": 2.4390243902439024,
"grad_norm": 1.081851601600647,
"learning_rate": 4.303958553689137e-06,
"loss": 0.5593,
"step": 500
},
{
"epoch": 2.44390243902439,
"grad_norm": 1.1111576557159424,
"learning_rate": 4.3013039840019675e-06,
"loss": 0.7566,
"step": 501
},
{
"epoch": 2.448780487804878,
"grad_norm": 1.0168198347091675,
"learning_rate": 4.2986451839390105e-06,
"loss": 0.3996,
"step": 502
},
{
"epoch": 2.453658536585366,
"grad_norm": 0.9412428140640259,
"learning_rate": 4.295982159744476e-06,
"loss": 0.5602,
"step": 503
},
{
"epoch": 2.4585365853658536,
"grad_norm": 1.0618679523468018,
"learning_rate": 4.293314917672498e-06,
"loss": 0.6466,
"step": 504
},
{
"epoch": 2.4634146341463414,
"grad_norm": 0.8414422273635864,
"learning_rate": 4.290643463987114e-06,
"loss": 0.357,
"step": 505
},
{
"epoch": 2.4682926829268292,
"grad_norm": 0.93071448802948,
"learning_rate": 4.287967804962252e-06,
"loss": 0.4179,
"step": 506
},
{
"epoch": 2.473170731707317,
"grad_norm": 0.9793124794960022,
"learning_rate": 4.285287946881718e-06,
"loss": 0.2698,
"step": 507
},
{
"epoch": 2.478048780487805,
"grad_norm": 0.9545714259147644,
"learning_rate": 4.282603896039178e-06,
"loss": 0.6855,
"step": 508
},
{
"epoch": 2.4829268292682927,
"grad_norm": 1.1140731573104858,
"learning_rate": 4.279915658738145e-06,
"loss": 0.5114,
"step": 509
},
{
"epoch": 2.4878048780487805,
"grad_norm": 1.0547738075256348,
"learning_rate": 4.277223241291966e-06,
"loss": 0.7367,
"step": 510
},
{
"epoch": 2.4926829268292683,
"grad_norm": 0.8616530895233154,
"learning_rate": 4.274526650023801e-06,
"loss": 0.5337,
"step": 511
},
{
"epoch": 2.497560975609756,
"grad_norm": 1.3530237674713135,
"learning_rate": 4.271825891266617e-06,
"loss": 0.5597,
"step": 512
},
{
"epoch": 2.502439024390244,
"grad_norm": 1.4124853610992432,
"learning_rate": 4.269120971363164e-06,
"loss": 0.8461,
"step": 513
},
{
"epoch": 2.5073170731707317,
"grad_norm": 1.0308994054794312,
"learning_rate": 4.266411896665967e-06,
"loss": 0.4304,
"step": 514
},
{
"epoch": 2.5121951219512195,
"grad_norm": 1.0831527709960938,
"learning_rate": 4.263698673537309e-06,
"loss": 0.5428,
"step": 515
},
{
"epoch": 2.5170731707317073,
"grad_norm": 1.2008475065231323,
"learning_rate": 4.260981308349214e-06,
"loss": 0.6922,
"step": 516
},
{
"epoch": 2.521951219512195,
"grad_norm": 1.1929224729537964,
"learning_rate": 4.258259807483434e-06,
"loss": 0.5716,
"step": 517
},
{
"epoch": 2.526829268292683,
"grad_norm": 1.016539216041565,
"learning_rate": 4.255534177331435e-06,
"loss": 0.5806,
"step": 518
},
{
"epoch": 2.5317073170731708,
"grad_norm": 1.018875002861023,
"learning_rate": 4.252804424294378e-06,
"loss": 0.5581,
"step": 519
},
{
"epoch": 2.5365853658536586,
"grad_norm": 0.9992810487747192,
"learning_rate": 4.25007055478311e-06,
"loss": 0.6786,
"step": 520
},
{
"epoch": 2.5414634146341464,
"grad_norm": 1.1207003593444824,
"learning_rate": 4.247332575218144e-06,
"loss": 0.4548,
"step": 521
},
{
"epoch": 2.546341463414634,
"grad_norm": 1.1575409173965454,
"learning_rate": 4.244590492029643e-06,
"loss": 0.6846,
"step": 522
},
{
"epoch": 2.551219512195122,
"grad_norm": 0.9805243015289307,
"learning_rate": 4.241844311657411e-06,
"loss": 0.4301,
"step": 523
},
{
"epoch": 2.55609756097561,
"grad_norm": 0.9760981202125549,
"learning_rate": 4.239094040550875e-06,
"loss": 0.3545,
"step": 524
},
{
"epoch": 2.5609756097560976,
"grad_norm": 0.8702017664909363,
"learning_rate": 4.236339685169065e-06,
"loss": 0.5429,
"step": 525
},
{
"epoch": 2.5658536585365854,
"grad_norm": 1.0681567192077637,
"learning_rate": 4.233581251980604e-06,
"loss": 0.3289,
"step": 526
},
{
"epoch": 2.5707317073170732,
"grad_norm": 1.1807548999786377,
"learning_rate": 4.230818747463696e-06,
"loss": 0.4876,
"step": 527
},
{
"epoch": 2.575609756097561,
"grad_norm": 0.9812930226325989,
"learning_rate": 4.228052178106101e-06,
"loss": 0.5025,
"step": 528
},
{
"epoch": 2.580487804878049,
"grad_norm": 0.8600794672966003,
"learning_rate": 4.2252815504051285e-06,
"loss": 0.3133,
"step": 529
},
{
"epoch": 2.5853658536585367,
"grad_norm": 1.89119553565979,
"learning_rate": 4.222506870867618e-06,
"loss": 0.6036,
"step": 530
},
{
"epoch": 2.5902439024390245,
"grad_norm": 1.0424220561981201,
"learning_rate": 4.2197281460099245e-06,
"loss": 0.6877,
"step": 531
},
{
"epoch": 2.5951219512195123,
"grad_norm": 1.027593731880188,
"learning_rate": 4.216945382357905e-06,
"loss": 0.6352,
"step": 532
},
{
"epoch": 2.6,
"grad_norm": 1.1954094171524048,
"learning_rate": 4.214158586446901e-06,
"loss": 0.966,
"step": 533
},
{
"epoch": 2.604878048780488,
"grad_norm": 1.2490975856781006,
"learning_rate": 4.211367764821722e-06,
"loss": 0.9059,
"step": 534
},
{
"epoch": 2.6097560975609757,
"grad_norm": 1.1297111511230469,
"learning_rate": 4.208572924036634e-06,
"loss": 0.5006,
"step": 535
},
{
"epoch": 2.6146341463414635,
"grad_norm": 0.9446660876274109,
"learning_rate": 4.2057740706553415e-06,
"loss": 0.4905,
"step": 536
},
{
"epoch": 2.6195121951219513,
"grad_norm": 0.916692316532135,
"learning_rate": 4.202971211250971e-06,
"loss": 0.6694,
"step": 537
},
{
"epoch": 2.624390243902439,
"grad_norm": 0.840929388999939,
"learning_rate": 4.200164352406061e-06,
"loss": 0.3739,
"step": 538
},
{
"epoch": 2.629268292682927,
"grad_norm": 1.069427728652954,
"learning_rate": 4.197353500712539e-06,
"loss": 0.6359,
"step": 539
},
{
"epoch": 2.6341463414634148,
"grad_norm": 1.0660371780395508,
"learning_rate": 4.1945386627717115e-06,
"loss": 0.5006,
"step": 540
},
{
"epoch": 2.6390243902439026,
"grad_norm": 1.9662373065948486,
"learning_rate": 4.191719845194246e-06,
"loss": 0.649,
"step": 541
},
{
"epoch": 2.6439024390243904,
"grad_norm": 0.9865717887878418,
"learning_rate": 4.188897054600156e-06,
"loss": 0.6179,
"step": 542
},
{
"epoch": 2.648780487804878,
"grad_norm": 1.0393004417419434,
"learning_rate": 4.186070297618787e-06,
"loss": 0.7156,
"step": 543
},
{
"epoch": 2.653658536585366,
"grad_norm": 0.9797636270523071,
"learning_rate": 4.183239580888799e-06,
"loss": 0.7249,
"step": 544
},
{
"epoch": 2.658536585365854,
"grad_norm": 1.180819034576416,
"learning_rate": 4.18040491105815e-06,
"loss": 0.4961,
"step": 545
},
{
"epoch": 2.6634146341463416,
"grad_norm": 0.9986240863800049,
"learning_rate": 4.177566294784085e-06,
"loss": 0.4397,
"step": 546
},
{
"epoch": 2.6682926829268294,
"grad_norm": 1.2416610717773438,
"learning_rate": 4.174723738733114e-06,
"loss": 0.7625,
"step": 547
},
{
"epoch": 2.6731707317073172,
"grad_norm": 0.9271990656852722,
"learning_rate": 4.171877249581001e-06,
"loss": 0.6626,
"step": 548
},
{
"epoch": 2.678048780487805,
"grad_norm": 0.9085447788238525,
"learning_rate": 4.169026834012748e-06,
"loss": 0.4209,
"step": 549
},
{
"epoch": 2.682926829268293,
"grad_norm": 0.9767999649047852,
"learning_rate": 4.166172498722577e-06,
"loss": 0.4466,
"step": 550
},
{
"epoch": 2.68780487804878,
"grad_norm": 1.405003309249878,
"learning_rate": 4.163314250413913e-06,
"loss": 0.8207,
"step": 551
},
{
"epoch": 2.692682926829268,
"grad_norm": 1.001278042793274,
"learning_rate": 4.160452095799378e-06,
"loss": 0.4949,
"step": 552
},
{
"epoch": 2.697560975609756,
"grad_norm": 0.8813621401786804,
"learning_rate": 4.157586041600759e-06,
"loss": 0.2726,
"step": 553
},
{
"epoch": 2.7024390243902436,
"grad_norm": 1.087396502494812,
"learning_rate": 4.154716094549008e-06,
"loss": 0.6358,
"step": 554
},
{
"epoch": 2.7073170731707314,
"grad_norm": 0.864211916923523,
"learning_rate": 4.151842261384217e-06,
"loss": 0.4146,
"step": 555
},
{
"epoch": 2.7121951219512193,
"grad_norm": 1.580068588256836,
"learning_rate": 4.148964548855603e-06,
"loss": 0.9695,
"step": 556
},
{
"epoch": 2.717073170731707,
"grad_norm": 0.7623794674873352,
"learning_rate": 4.146082963721496e-06,
"loss": 0.3429,
"step": 557
},
{
"epoch": 2.721951219512195,
"grad_norm": 1.2673579454421997,
"learning_rate": 4.143197512749322e-06,
"loss": 1.1916,
"step": 558
},
{
"epoch": 2.7268292682926827,
"grad_norm": 1.0848994255065918,
"learning_rate": 4.140308202715581e-06,
"loss": 0.8112,
"step": 559
},
{
"epoch": 2.7317073170731705,
"grad_norm": 0.9205752015113831,
"learning_rate": 4.13741504040584e-06,
"loss": 0.4364,
"step": 560
},
{
"epoch": 2.7365853658536583,
"grad_norm": 1.0304152965545654,
"learning_rate": 4.134518032614713e-06,
"loss": 0.4841,
"step": 561
},
{
"epoch": 2.741463414634146,
"grad_norm": 0.9854786992073059,
"learning_rate": 4.1316171861458445e-06,
"loss": 0.418,
"step": 562
},
{
"epoch": 2.746341463414634,
"grad_norm": 1.0625019073486328,
"learning_rate": 4.128712507811893e-06,
"loss": 0.6479,
"step": 563
},
{
"epoch": 2.7512195121951217,
"grad_norm": 1.0722914934158325,
"learning_rate": 4.125804004434517e-06,
"loss": 0.6462,
"step": 564
},
{
"epoch": 2.7560975609756095,
"grad_norm": 0.890087366104126,
"learning_rate": 4.12289168284436e-06,
"loss": 0.4462,
"step": 565
},
{
"epoch": 2.7609756097560973,
"grad_norm": 1.0615348815917969,
"learning_rate": 4.119975549881029e-06,
"loss": 0.6229,
"step": 566
},
{
"epoch": 2.765853658536585,
"grad_norm": 0.8919638395309448,
"learning_rate": 4.1170556123930846e-06,
"loss": 0.3847,
"step": 567
},
{
"epoch": 2.770731707317073,
"grad_norm": 0.9881047606468201,
"learning_rate": 4.114131877238021e-06,
"loss": 0.5972,
"step": 568
},
{
"epoch": 2.7756097560975608,
"grad_norm": 0.8665289878845215,
"learning_rate": 4.111204351282254e-06,
"loss": 0.4755,
"step": 569
},
{
"epoch": 2.7804878048780486,
"grad_norm": 0.7870835661888123,
"learning_rate": 4.108273041401098e-06,
"loss": 0.4341,
"step": 570
},
{
"epoch": 2.7853658536585364,
"grad_norm": 1.3610732555389404,
"learning_rate": 4.105337954478756e-06,
"loss": 0.8646,
"step": 571
},
{
"epoch": 2.790243902439024,
"grad_norm": 1.0715464353561401,
"learning_rate": 4.102399097408304e-06,
"loss": 0.7017,
"step": 572
},
{
"epoch": 2.795121951219512,
"grad_norm": 0.952342689037323,
"learning_rate": 4.099456477091667e-06,
"loss": 0.3381,
"step": 573
},
{
"epoch": 2.8,
"grad_norm": 1.151577115058899,
"learning_rate": 4.096510100439611e-06,
"loss": 0.8217,
"step": 574
},
{
"epoch": 2.8048780487804876,
"grad_norm": 0.8553835153579712,
"learning_rate": 4.093559974371725e-06,
"loss": 0.3334,
"step": 575
},
{
"epoch": 2.8097560975609754,
"grad_norm": 1.004453420639038,
"learning_rate": 4.0906061058164e-06,
"loss": 0.8324,
"step": 576
},
{
"epoch": 2.8146341463414632,
"grad_norm": 0.9378971457481384,
"learning_rate": 4.087648501710819e-06,
"loss": 0.3753,
"step": 577
},
{
"epoch": 2.819512195121951,
"grad_norm": 0.9435027241706848,
"learning_rate": 4.084687169000938e-06,
"loss": 0.5675,
"step": 578
},
{
"epoch": 2.824390243902439,
"grad_norm": 0.856401801109314,
"learning_rate": 4.081722114641469e-06,
"loss": 0.5309,
"step": 579
},
{
"epoch": 2.8292682926829267,
"grad_norm": 1.1497118473052979,
"learning_rate": 4.0787533455958626e-06,
"loss": 0.3339,
"step": 580
},
{
"epoch": 2.8341463414634145,
"grad_norm": 1.0275132656097412,
"learning_rate": 4.075780868836296e-06,
"loss": 0.4303,
"step": 581
},
{
"epoch": 2.8390243902439023,
"grad_norm": 0.903195321559906,
"learning_rate": 4.072804691343653e-06,
"loss": 0.495,
"step": 582
},
{
"epoch": 2.84390243902439,
"grad_norm": 1.1491634845733643,
"learning_rate": 4.069824820107507e-06,
"loss": 0.9994,
"step": 583
},
{
"epoch": 2.848780487804878,
"grad_norm": 0.7706964015960693,
"learning_rate": 4.06684126212611e-06,
"loss": 0.3226,
"step": 584
},
{
"epoch": 2.8536585365853657,
"grad_norm": 0.9241564273834229,
"learning_rate": 4.063854024406369e-06,
"loss": 0.5793,
"step": 585
},
{
"epoch": 2.8585365853658535,
"grad_norm": 0.9884312152862549,
"learning_rate": 4.060863113963835e-06,
"loss": 0.4683,
"step": 586
},
{
"epoch": 2.8634146341463413,
"grad_norm": 0.9997614026069641,
"learning_rate": 4.057868537822683e-06,
"loss": 0.519,
"step": 587
},
{
"epoch": 2.868292682926829,
"grad_norm": 0.9449941515922546,
"learning_rate": 4.054870303015695e-06,
"loss": 0.381,
"step": 588
},
{
"epoch": 2.873170731707317,
"grad_norm": 1.0271875858306885,
"learning_rate": 4.05186841658425e-06,
"loss": 0.4554,
"step": 589
},
{
"epoch": 2.8780487804878048,
"grad_norm": 0.9256722331047058,
"learning_rate": 4.048862885578301e-06,
"loss": 0.5297,
"step": 590
},
{
"epoch": 2.8829268292682926,
"grad_norm": 1.0047836303710938,
"learning_rate": 4.045853717056358e-06,
"loss": 0.5968,
"step": 591
},
{
"epoch": 2.8878048780487804,
"grad_norm": 0.9485352635383606,
"learning_rate": 4.0428409180854775e-06,
"loss": 0.5042,
"step": 592
},
{
"epoch": 2.892682926829268,
"grad_norm": 0.9246886372566223,
"learning_rate": 4.039824495741238e-06,
"loss": 0.4622,
"step": 593
},
{
"epoch": 2.897560975609756,
"grad_norm": 0.8539214134216309,
"learning_rate": 4.036804457107733e-06,
"loss": 0.53,
"step": 594
},
{
"epoch": 2.902439024390244,
"grad_norm": 1.2358900308609009,
"learning_rate": 4.0337808092775435e-06,
"loss": 0.701,
"step": 595
},
{
"epoch": 2.9073170731707316,
"grad_norm": 0.8977146744728088,
"learning_rate": 4.030753559351728e-06,
"loss": 0.3942,
"step": 596
},
{
"epoch": 2.9121951219512194,
"grad_norm": 0.8575205206871033,
"learning_rate": 4.027722714439808e-06,
"loss": 0.3713,
"step": 597
},
{
"epoch": 2.9170731707317072,
"grad_norm": 1.2680315971374512,
"learning_rate": 4.024688281659743e-06,
"loss": 0.9398,
"step": 598
},
{
"epoch": 2.921951219512195,
"grad_norm": 1.7810138463974,
"learning_rate": 4.021650268137924e-06,
"loss": 0.6056,
"step": 599
},
{
"epoch": 2.926829268292683,
"grad_norm": 0.8538106083869934,
"learning_rate": 4.018608681009143e-06,
"loss": 0.4939,
"step": 600
},
{
"epoch": 2.9317073170731707,
"grad_norm": 1.0676621198654175,
"learning_rate": 4.015563527416596e-06,
"loss": 0.5676,
"step": 601
},
{
"epoch": 2.9365853658536585,
"grad_norm": 0.987902820110321,
"learning_rate": 4.012514814511844e-06,
"loss": 0.5004,
"step": 602
},
{
"epoch": 2.9414634146341463,
"grad_norm": 0.9196493625640869,
"learning_rate": 4.009462549454816e-06,
"loss": 0.5476,
"step": 603
},
{
"epoch": 2.946341463414634,
"grad_norm": 1.1288981437683105,
"learning_rate": 4.006406739413775e-06,
"loss": 0.5804,
"step": 604
},
{
"epoch": 2.951219512195122,
"grad_norm": 0.8905384540557861,
"learning_rate": 4.003347391565317e-06,
"loss": 0.4979,
"step": 605
},
{
"epoch": 2.9560975609756097,
"grad_norm": 1.000335693359375,
"learning_rate": 4.000284513094342e-06,
"loss": 0.5727,
"step": 606
},
{
"epoch": 2.9609756097560975,
"grad_norm": 0.9347658753395081,
"learning_rate": 3.997218111194042e-06,
"loss": 0.5235,
"step": 607
},
{
"epoch": 2.9658536585365853,
"grad_norm": 0.8476413488388062,
"learning_rate": 3.994148193065886e-06,
"loss": 0.3922,
"step": 608
},
{
"epoch": 2.970731707317073,
"grad_norm": 0.8186416029930115,
"learning_rate": 3.991074765919598e-06,
"loss": 0.3482,
"step": 609
},
{
"epoch": 2.975609756097561,
"grad_norm": 1.2038166522979736,
"learning_rate": 3.987997836973147e-06,
"loss": 0.4684,
"step": 610
},
{
"epoch": 2.9804878048780488,
"grad_norm": 1.037007212638855,
"learning_rate": 3.984917413452721e-06,
"loss": 0.4811,
"step": 611
},
{
"epoch": 2.9853658536585366,
"grad_norm": 1.2915143966674805,
"learning_rate": 3.981833502592717e-06,
"loss": 0.7361,
"step": 612
},
{
"epoch": 2.9902439024390244,
"grad_norm": 1.5253301858901978,
"learning_rate": 3.978746111635725e-06,
"loss": 0.312,
"step": 613
},
{
"epoch": 2.995121951219512,
"grad_norm": 0.8432179093360901,
"learning_rate": 3.9756552478325045e-06,
"loss": 0.4931,
"step": 614
},
{
"epoch": 3.0,
"grad_norm": 0.8148512840270996,
"learning_rate": 3.972560918441972e-06,
"loss": 0.3185,
"step": 615
}
],
"logging_steps": 1,
"max_steps": 2050,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 208,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.8156947889350246e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}