{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.995051138238205,
"eval_steps": 500,
"global_step": 945,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005278785879247773,
"grad_norm": 8.559222684223487,
"learning_rate": 8.421052631578948e-07,
"loss": 1.7821,
"step": 1
},
{
"epoch": 0.010557571758495546,
"grad_norm": 8.542811150657618,
"learning_rate": 1.6842105263157895e-06,
"loss": 1.7757,
"step": 2
},
{
"epoch": 0.01583635763774332,
"grad_norm": 8.625119441824685,
"learning_rate": 2.5263157894736844e-06,
"loss": 1.7844,
"step": 3
},
{
"epoch": 0.02111514351699109,
"grad_norm": 7.911430505391723,
"learning_rate": 3.368421052631579e-06,
"loss": 1.7579,
"step": 4
},
{
"epoch": 0.026393929396238865,
"grad_norm": 6.143963490700802,
"learning_rate": 4.210526315789474e-06,
"loss": 1.7134,
"step": 5
},
{
"epoch": 0.03167271527548664,
"grad_norm": 3.3865160264777097,
"learning_rate": 5.052631578947369e-06,
"loss": 1.6629,
"step": 6
},
{
"epoch": 0.03695150115473441,
"grad_norm": 2.76282727616314,
"learning_rate": 5.8947368421052634e-06,
"loss": 1.6585,
"step": 7
},
{
"epoch": 0.04223028703398218,
"grad_norm": 6.322449790932474,
"learning_rate": 6.736842105263158e-06,
"loss": 1.6616,
"step": 8
},
{
"epoch": 0.047509072913229956,
"grad_norm": 6.279706082200225,
"learning_rate": 7.578947368421054e-06,
"loss": 1.6681,
"step": 9
},
{
"epoch": 0.05278785879247773,
"grad_norm": 6.066649896759186,
"learning_rate": 8.421052631578948e-06,
"loss": 1.6635,
"step": 10
},
{
"epoch": 0.0580666446717255,
"grad_norm": 4.720492660185518,
"learning_rate": 9.263157894736842e-06,
"loss": 1.6048,
"step": 11
},
{
"epoch": 0.06334543055097328,
"grad_norm": 4.038725392859273,
"learning_rate": 1.0105263157894738e-05,
"loss": 1.5823,
"step": 12
},
{
"epoch": 0.06862421643022105,
"grad_norm": 2.675236418624382,
"learning_rate": 1.0947368421052633e-05,
"loss": 1.5534,
"step": 13
},
{
"epoch": 0.07390300230946882,
"grad_norm": 1.9761478647451622,
"learning_rate": 1.1789473684210527e-05,
"loss": 1.5445,
"step": 14
},
{
"epoch": 0.0791817881887166,
"grad_norm": 2.1964197263869054,
"learning_rate": 1.263157894736842e-05,
"loss": 1.5301,
"step": 15
},
{
"epoch": 0.08446057406796437,
"grad_norm": 2.0364335919963903,
"learning_rate": 1.3473684210526316e-05,
"loss": 1.5004,
"step": 16
},
{
"epoch": 0.08973935994721215,
"grad_norm": 1.8037582738302078,
"learning_rate": 1.4315789473684212e-05,
"loss": 1.4781,
"step": 17
},
{
"epoch": 0.09501814582645991,
"grad_norm": 1.4875183659443978,
"learning_rate": 1.5157894736842107e-05,
"loss": 1.4884,
"step": 18
},
{
"epoch": 0.10029693170570769,
"grad_norm": 1.4977012844942934,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.463,
"step": 19
},
{
"epoch": 0.10557571758495546,
"grad_norm": 1.2341554749307428,
"learning_rate": 1.6842105263157896e-05,
"loss": 1.4694,
"step": 20
},
{
"epoch": 0.11085450346420324,
"grad_norm": 1.2543656515326747,
"learning_rate": 1.768421052631579e-05,
"loss": 1.4472,
"step": 21
},
{
"epoch": 0.116133289343451,
"grad_norm": 1.277673973994405,
"learning_rate": 1.8526315789473684e-05,
"loss": 1.4194,
"step": 22
},
{
"epoch": 0.12141207522269878,
"grad_norm": 1.0554281233550042,
"learning_rate": 1.936842105263158e-05,
"loss": 1.4259,
"step": 23
},
{
"epoch": 0.12669086110194655,
"grad_norm": 0.9059715768012179,
"learning_rate": 2.0210526315789475e-05,
"loss": 1.4112,
"step": 24
},
{
"epoch": 0.13196964698119432,
"grad_norm": 1.150687529393131,
"learning_rate": 2.105263157894737e-05,
"loss": 1.4208,
"step": 25
},
{
"epoch": 0.1372484328604421,
"grad_norm": 0.8259556273498903,
"learning_rate": 2.1894736842105266e-05,
"loss": 1.4158,
"step": 26
},
{
"epoch": 0.14252721873968988,
"grad_norm": 1.080990078765424,
"learning_rate": 2.273684210526316e-05,
"loss": 1.4037,
"step": 27
},
{
"epoch": 0.14780600461893764,
"grad_norm": 1.731100113222458,
"learning_rate": 2.3578947368421054e-05,
"loss": 1.4024,
"step": 28
},
{
"epoch": 0.1530847904981854,
"grad_norm": 0.8596609684641758,
"learning_rate": 2.442105263157895e-05,
"loss": 1.3998,
"step": 29
},
{
"epoch": 0.1583635763774332,
"grad_norm": 1.9119682716812114,
"learning_rate": 2.526315789473684e-05,
"loss": 1.402,
"step": 30
},
{
"epoch": 0.16364236225668097,
"grad_norm": 1.2533125345082186,
"learning_rate": 2.610526315789474e-05,
"loss": 1.39,
"step": 31
},
{
"epoch": 0.16892114813592873,
"grad_norm": 1.489553834805727,
"learning_rate": 2.6947368421052632e-05,
"loss": 1.3828,
"step": 32
},
{
"epoch": 0.1741999340151765,
"grad_norm": 1.6513868341059252,
"learning_rate": 2.778947368421053e-05,
"loss": 1.3678,
"step": 33
},
{
"epoch": 0.1794787198944243,
"grad_norm": 1.2742436351081012,
"learning_rate": 2.8631578947368423e-05,
"loss": 1.3535,
"step": 34
},
{
"epoch": 0.18475750577367206,
"grad_norm": 1.9331697766270215,
"learning_rate": 2.9473684210526317e-05,
"loss": 1.3837,
"step": 35
},
{
"epoch": 0.19003629165291983,
"grad_norm": 1.3601512530638553,
"learning_rate": 3.0315789473684214e-05,
"loss": 1.3722,
"step": 36
},
{
"epoch": 0.1953150775321676,
"grad_norm": 1.7311302669544857,
"learning_rate": 3.1157894736842105e-05,
"loss": 1.3686,
"step": 37
},
{
"epoch": 0.20059386341141539,
"grad_norm": 1.48755699155441,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.3671,
"step": 38
},
{
"epoch": 0.20587264929066315,
"grad_norm": 1.8327357288689288,
"learning_rate": 3.28421052631579e-05,
"loss": 1.3735,
"step": 39
},
{
"epoch": 0.21115143516991092,
"grad_norm": 1.5083133000259623,
"learning_rate": 3.368421052631579e-05,
"loss": 1.3674,
"step": 40
},
{
"epoch": 0.21643022104915868,
"grad_norm": 1.993160855595939,
"learning_rate": 3.452631578947369e-05,
"loss": 1.3699,
"step": 41
},
{
"epoch": 0.22170900692840648,
"grad_norm": 1.5856686533566766,
"learning_rate": 3.536842105263158e-05,
"loss": 1.3518,
"step": 42
},
{
"epoch": 0.22698779280765424,
"grad_norm": 1.9065067719811546,
"learning_rate": 3.621052631578948e-05,
"loss": 1.3633,
"step": 43
},
{
"epoch": 0.232266578686902,
"grad_norm": 1.506131578841278,
"learning_rate": 3.705263157894737e-05,
"loss": 1.3497,
"step": 44
},
{
"epoch": 0.23754536456614977,
"grad_norm": 2.121087638106546,
"learning_rate": 3.789473684210526e-05,
"loss": 1.3367,
"step": 45
},
{
"epoch": 0.24282415044539757,
"grad_norm": 4.67404248196046,
"learning_rate": 3.873684210526316e-05,
"loss": 1.3672,
"step": 46
},
{
"epoch": 0.24810293632464533,
"grad_norm": 1.2448680931118623,
"learning_rate": 3.9578947368421056e-05,
"loss": 1.3534,
"step": 47
},
{
"epoch": 0.2533817222038931,
"grad_norm": 2.1014651081767455,
"learning_rate": 4.042105263157895e-05,
"loss": 1.3559,
"step": 48
},
{
"epoch": 0.2586605080831409,
"grad_norm": 2.1083248107539814,
"learning_rate": 4.126315789473685e-05,
"loss": 1.3562,
"step": 49
},
{
"epoch": 0.26393929396238863,
"grad_norm": 1.0081628250168648,
"learning_rate": 4.210526315789474e-05,
"loss": 1.3478,
"step": 50
},
{
"epoch": 0.2692180798416364,
"grad_norm": 2.245552893796406,
"learning_rate": 4.294736842105264e-05,
"loss": 1.3425,
"step": 51
},
{
"epoch": 0.2744968657208842,
"grad_norm": 1.847721270053161,
"learning_rate": 4.378947368421053e-05,
"loss": 1.3484,
"step": 52
},
{
"epoch": 0.27977565160013196,
"grad_norm": 2.07621002532574,
"learning_rate": 4.463157894736842e-05,
"loss": 1.3524,
"step": 53
},
{
"epoch": 0.28505443747937975,
"grad_norm": 2.0573435626549674,
"learning_rate": 4.547368421052632e-05,
"loss": 1.3375,
"step": 54
},
{
"epoch": 0.2903332233586275,
"grad_norm": 2.2804125611624744,
"learning_rate": 4.6315789473684214e-05,
"loss": 1.3537,
"step": 55
},
{
"epoch": 0.2956120092378753,
"grad_norm": 1.614092007514655,
"learning_rate": 4.715789473684211e-05,
"loss": 1.3221,
"step": 56
},
{
"epoch": 0.3008907951171231,
"grad_norm": 1.8239392632361,
"learning_rate": 4.8e-05,
"loss": 1.3551,
"step": 57
},
{
"epoch": 0.3061695809963708,
"grad_norm": 1.892353404743652,
"learning_rate": 4.88421052631579e-05,
"loss": 1.3422,
"step": 58
},
{
"epoch": 0.3114483668756186,
"grad_norm": 1.5626211808165376,
"learning_rate": 4.9684210526315796e-05,
"loss": 1.3388,
"step": 59
},
{
"epoch": 0.3167271527548664,
"grad_norm": 2.2886161978053416,
"learning_rate": 5.052631578947368e-05,
"loss": 1.3519,
"step": 60
},
{
"epoch": 0.32200593863411414,
"grad_norm": 1.2771706641759382,
"learning_rate": 5.136842105263158e-05,
"loss": 1.3285,
"step": 61
},
{
"epoch": 0.32728472451336194,
"grad_norm": 1.8850893921105571,
"learning_rate": 5.221052631578948e-05,
"loss": 1.3285,
"step": 62
},
{
"epoch": 0.3325635103926097,
"grad_norm": 2.836538683465238,
"learning_rate": 5.305263157894737e-05,
"loss": 1.3352,
"step": 63
},
{
"epoch": 0.33784229627185747,
"grad_norm": 1.0161609306010637,
"learning_rate": 5.3894736842105265e-05,
"loss": 1.3303,
"step": 64
},
{
"epoch": 0.34312108215110526,
"grad_norm": 3.418607300631356,
"learning_rate": 5.4736842105263165e-05,
"loss": 1.3568,
"step": 65
},
{
"epoch": 0.348399868030353,
"grad_norm": 2.1951718649280716,
"learning_rate": 5.557894736842106e-05,
"loss": 1.3578,
"step": 66
},
{
"epoch": 0.3536786539096008,
"grad_norm": 2.1129578266236697,
"learning_rate": 5.642105263157895e-05,
"loss": 1.362,
"step": 67
},
{
"epoch": 0.3589574397888486,
"grad_norm": 2.6017181194163013,
"learning_rate": 5.726315789473685e-05,
"loss": 1.3446,
"step": 68
},
{
"epoch": 0.3642362256680963,
"grad_norm": 2.0929961143676095,
"learning_rate": 5.810526315789475e-05,
"loss": 1.3476,
"step": 69
},
{
"epoch": 0.3695150115473441,
"grad_norm": 2.9426854876810236,
"learning_rate": 5.8947368421052634e-05,
"loss": 1.361,
"step": 70
},
{
"epoch": 0.37479379742659186,
"grad_norm": 2.047235372498259,
"learning_rate": 5.978947368421053e-05,
"loss": 1.3295,
"step": 71
},
{
"epoch": 0.38007258330583965,
"grad_norm": 2.3163564645661605,
"learning_rate": 6.063157894736843e-05,
"loss": 1.3236,
"step": 72
},
{
"epoch": 0.38535136918508744,
"grad_norm": 1.7708571795906927,
"learning_rate": 6.147368421052632e-05,
"loss": 1.3282,
"step": 73
},
{
"epoch": 0.3906301550643352,
"grad_norm": 2.873431951917759,
"learning_rate": 6.231578947368421e-05,
"loss": 1.3124,
"step": 74
},
{
"epoch": 0.395908940943583,
"grad_norm": 2.1608485069854138,
"learning_rate": 6.315789473684212e-05,
"loss": 1.3259,
"step": 75
},
{
"epoch": 0.40118772682283077,
"grad_norm": 2.641406113572487,
"learning_rate": 6.400000000000001e-05,
"loss": 1.3073,
"step": 76
},
{
"epoch": 0.4064665127020785,
"grad_norm": 2.134077803951715,
"learning_rate": 6.484210526315789e-05,
"loss": 1.332,
"step": 77
},
{
"epoch": 0.4117452985813263,
"grad_norm": 2.548230794420582,
"learning_rate": 6.56842105263158e-05,
"loss": 1.3264,
"step": 78
},
{
"epoch": 0.41702408446057404,
"grad_norm": 1.8584772908735328,
"learning_rate": 6.652631578947369e-05,
"loss": 1.3206,
"step": 79
},
{
"epoch": 0.42230287033982183,
"grad_norm": 1.7329721515220284,
"learning_rate": 6.736842105263159e-05,
"loss": 1.3233,
"step": 80
},
{
"epoch": 0.42758165621906963,
"grad_norm": 1.3798188484446696,
"learning_rate": 6.821052631578948e-05,
"loss": 1.3215,
"step": 81
},
{
"epoch": 0.43286044209831737,
"grad_norm": 2.3304870562288627,
"learning_rate": 6.905263157894737e-05,
"loss": 1.3271,
"step": 82
},
{
"epoch": 0.43813922797756516,
"grad_norm": 1.9536589639033466,
"learning_rate": 6.989473684210527e-05,
"loss": 1.3262,
"step": 83
},
{
"epoch": 0.44341801385681295,
"grad_norm": 1.9444021198857042,
"learning_rate": 7.073684210526316e-05,
"loss": 1.3339,
"step": 84
},
{
"epoch": 0.4486967997360607,
"grad_norm": 3.192423879350558,
"learning_rate": 7.157894736842105e-05,
"loss": 1.3214,
"step": 85
},
{
"epoch": 0.4539755856153085,
"grad_norm": 1.5397227502548587,
"learning_rate": 7.242105263157896e-05,
"loss": 1.3166,
"step": 86
},
{
"epoch": 0.4592543714945563,
"grad_norm": 4.745719843797305,
"learning_rate": 7.326315789473684e-05,
"loss": 1.3401,
"step": 87
},
{
"epoch": 0.464533157373804,
"grad_norm": 2.9655787260096185,
"learning_rate": 7.410526315789474e-05,
"loss": 1.3378,
"step": 88
},
{
"epoch": 0.4698119432530518,
"grad_norm": 4.46812982024058,
"learning_rate": 7.494736842105264e-05,
"loss": 1.3488,
"step": 89
},
{
"epoch": 0.47509072913229955,
"grad_norm": 3.302230770830571,
"learning_rate": 7.578947368421052e-05,
"loss": 1.3212,
"step": 90
},
{
"epoch": 0.48036951501154734,
"grad_norm": 3.240353930955636,
"learning_rate": 7.663157894736843e-05,
"loss": 1.3339,
"step": 91
},
{
"epoch": 0.48564830089079514,
"grad_norm": 2.6043118638655134,
"learning_rate": 7.747368421052633e-05,
"loss": 1.3326,
"step": 92
},
{
"epoch": 0.4909270867700429,
"grad_norm": 2.7735037196407433,
"learning_rate": 7.831578947368422e-05,
"loss": 1.3094,
"step": 93
},
{
"epoch": 0.49620587264929067,
"grad_norm": 1.869194117660677,
"learning_rate": 7.915789473684211e-05,
"loss": 1.3337,
"step": 94
},
{
"epoch": 0.5014846585285384,
"grad_norm": 1.486935743621699,
"learning_rate": 8e-05,
"loss": 1.3209,
"step": 95
},
{
"epoch": 0.5067634444077862,
"grad_norm": 3.97176574114269,
"learning_rate": 7.999972679326877e-05,
"loss": 1.3097,
"step": 96
},
{
"epoch": 0.512042230287034,
"grad_norm": 2.909020423931811,
"learning_rate": 7.999890717680716e-05,
"loss": 1.3255,
"step": 97
},
{
"epoch": 0.5173210161662818,
"grad_norm": 3.266690230691967,
"learning_rate": 7.999754116181141e-05,
"loss": 1.322,
"step": 98
},
{
"epoch": 0.5225998020455296,
"grad_norm": 2.9736870692736477,
"learning_rate": 7.999562876694173e-05,
"loss": 1.3241,
"step": 99
},
{
"epoch": 0.5278785879247773,
"grad_norm": 2.029766896419903,
"learning_rate": 7.999317001832211e-05,
"loss": 1.2977,
"step": 100
},
{
"epoch": 0.5331573738040251,
"grad_norm": 1.8003805642877855,
"learning_rate": 7.999016494953987e-05,
"loss": 1.3131,
"step": 101
},
{
"epoch": 0.5384361596832729,
"grad_norm": 1.6793075691245238,
"learning_rate": 7.998661360164525e-05,
"loss": 1.3121,
"step": 102
},
{
"epoch": 0.5437149455625206,
"grad_norm": 1.986719183571971,
"learning_rate": 7.998251602315085e-05,
"loss": 1.3249,
"step": 103
},
{
"epoch": 0.5489937314417684,
"grad_norm": 2.195586309730319,
"learning_rate": 7.997787227003101e-05,
"loss": 1.3059,
"step": 104
},
{
"epoch": 0.5542725173210161,
"grad_norm": 1.461828111029453,
"learning_rate": 7.997268240572093e-05,
"loss": 1.3118,
"step": 105
},
{
"epoch": 0.5595513032002639,
"grad_norm": 2.148695235144627,
"learning_rate": 7.99669465011159e-05,
"loss": 1.3273,
"step": 106
},
{
"epoch": 0.5648300890795117,
"grad_norm": 1.6724341025412792,
"learning_rate": 7.996066463457032e-05,
"loss": 1.3228,
"step": 107
},
{
"epoch": 0.5701088749587595,
"grad_norm": 1.705259901076724,
"learning_rate": 7.99538368918966e-05,
"loss": 1.3189,
"step": 108
},
{
"epoch": 0.5753876608380073,
"grad_norm": 1.469080880590235,
"learning_rate": 7.9946463366364e-05,
"loss": 1.316,
"step": 109
},
{
"epoch": 0.580666446717255,
"grad_norm": 1.8098027619124655,
"learning_rate": 7.993854415869737e-05,
"loss": 1.3488,
"step": 110
},
{
"epoch": 0.5859452325965028,
"grad_norm": 2.031226749468162,
"learning_rate": 7.993007937707573e-05,
"loss": 1.3186,
"step": 111
},
{
"epoch": 0.5912240184757506,
"grad_norm": 1.676093917612674,
"learning_rate": 7.992106913713087e-05,
"loss": 1.3093,
"step": 112
},
{
"epoch": 0.5965028043549984,
"grad_norm": 1.6468968483278041,
"learning_rate": 7.991151356194568e-05,
"loss": 1.3087,
"step": 113
},
{
"epoch": 0.6017815902342462,
"grad_norm": 1.7040887005478604,
"learning_rate": 7.990141278205255e-05,
"loss": 1.305,
"step": 114
},
{
"epoch": 0.607060376113494,
"grad_norm": 1.5442416545556625,
"learning_rate": 7.989076693543153e-05,
"loss": 1.3011,
"step": 115
},
{
"epoch": 0.6123391619927416,
"grad_norm": 1.8397675930137192,
"learning_rate": 7.987957616750845e-05,
"loss": 1.3169,
"step": 116
},
{
"epoch": 0.6176179478719894,
"grad_norm": 1.6758088273809235,
"learning_rate": 7.9867840631153e-05,
"loss": 1.3019,
"step": 117
},
{
"epoch": 0.6228967337512372,
"grad_norm": 1.8403825263907043,
"learning_rate": 7.985556048667652e-05,
"loss": 1.3218,
"step": 118
},
{
"epoch": 0.628175519630485,
"grad_norm": 1.6809344973155804,
"learning_rate": 7.984273590182992e-05,
"loss": 1.3122,
"step": 119
},
{
"epoch": 0.6334543055097328,
"grad_norm": 1.6249704090599522,
"learning_rate": 7.982936705180139e-05,
"loss": 1.2886,
"step": 120
},
{
"epoch": 0.6387330913889805,
"grad_norm": 1.2705796545495383,
"learning_rate": 7.981545411921387e-05,
"loss": 1.3048,
"step": 121
},
{
"epoch": 0.6440118772682283,
"grad_norm": 2.30022576461904,
"learning_rate": 7.980099729412272e-05,
"loss": 1.3203,
"step": 122
},
{
"epoch": 0.6492906631474761,
"grad_norm": 1.3889473327754434,
"learning_rate": 7.978599677401304e-05,
"loss": 1.303,
"step": 123
},
{
"epoch": 0.6545694490267239,
"grad_norm": 1.016655411214257,
"learning_rate": 7.977045276379698e-05,
"loss": 1.3,
"step": 124
},
{
"epoch": 0.6598482349059717,
"grad_norm": 1.9260332196274916,
"learning_rate": 7.975436547581096e-05,
"loss": 1.2993,
"step": 125
},
{
"epoch": 0.6651270207852193,
"grad_norm": 1.0922760502085274,
"learning_rate": 7.973773512981272e-05,
"loss": 1.2976,
"step": 126
},
{
"epoch": 0.6704058066644671,
"grad_norm": 1.5276232239033796,
"learning_rate": 7.972056195297842e-05,
"loss": 1.3096,
"step": 127
},
{
"epoch": 0.6756845925437149,
"grad_norm": 1.109799139624589,
"learning_rate": 7.97028461798994e-05,
"loss": 1.2976,
"step": 128
},
{
"epoch": 0.6809633784229627,
"grad_norm": 2.2229397086392124,
"learning_rate": 7.968458805257913e-05,
"loss": 1.3206,
"step": 129
},
{
"epoch": 0.6862421643022105,
"grad_norm": 1.9303859724176848,
"learning_rate": 7.966578782042972e-05,
"loss": 1.3084,
"step": 130
},
{
"epoch": 0.6915209501814583,
"grad_norm": 1.3809636210427785,
"learning_rate": 7.964644574026869e-05,
"loss": 1.3148,
"step": 131
},
{
"epoch": 0.696799736060706,
"grad_norm": 2.744998744916968,
"learning_rate": 7.962656207631538e-05,
"loss": 1.3209,
"step": 132
},
{
"epoch": 0.7020785219399538,
"grad_norm": 2.3382633110913718,
"learning_rate": 7.960613710018733e-05,
"loss": 1.334,
"step": 133
},
{
"epoch": 0.7073573078192016,
"grad_norm": 2.1467199346064763,
"learning_rate": 7.958517109089657e-05,
"loss": 1.3034,
"step": 134
},
{
"epoch": 0.7126360936984494,
"grad_norm": 1.624615452394344,
"learning_rate": 7.956366433484585e-05,
"loss": 1.297,
"step": 135
},
{
"epoch": 0.7179148795776972,
"grad_norm": 1.920698615496753,
"learning_rate": 7.954161712582469e-05,
"loss": 1.3061,
"step": 136
},
{
"epoch": 0.7231936654569449,
"grad_norm": 1.3787767200786702,
"learning_rate": 7.95190297650054e-05,
"loss": 1.3038,
"step": 137
},
{
"epoch": 0.7284724513361927,
"grad_norm": 1.9764036923036765,
"learning_rate": 7.949590256093892e-05,
"loss": 1.3072,
"step": 138
},
{
"epoch": 0.7337512372154404,
"grad_norm": 1.27956585328178,
"learning_rate": 7.947223582955066e-05,
"loss": 1.3112,
"step": 139
},
{
"epoch": 0.7390300230946882,
"grad_norm": 1.8770719697712916,
"learning_rate": 7.94480298941361e-05,
"loss": 1.2998,
"step": 140
},
{
"epoch": 0.744308808973936,
"grad_norm": 1.36222195282238,
"learning_rate": 7.94232850853565e-05,
"loss": 1.3106,
"step": 141
},
{
"epoch": 0.7495875948531837,
"grad_norm": 1.7131665811756942,
"learning_rate": 7.939800174123426e-05,
"loss": 1.2972,
"step": 142
},
{
"epoch": 0.7548663807324315,
"grad_norm": 1.516526698406304,
"learning_rate": 7.937218020714838e-05,
"loss": 1.3063,
"step": 143
},
{
"epoch": 0.7601451666116793,
"grad_norm": 1.559180702735073,
"learning_rate": 7.934582083582968e-05,
"loss": 1.2949,
"step": 144
},
{
"epoch": 0.7654239524909271,
"grad_norm": 1.3965240338452205,
"learning_rate": 7.931892398735608e-05,
"loss": 1.2996,
"step": 145
},
{
"epoch": 0.7707027383701749,
"grad_norm": 1.1876037429072739,
"learning_rate": 7.929149002914756e-05,
"loss": 1.2888,
"step": 146
},
{
"epoch": 0.7759815242494227,
"grad_norm": 1.2523705350271135,
"learning_rate": 7.926351933596123e-05,
"loss": 1.3041,
"step": 147
},
{
"epoch": 0.7812603101286704,
"grad_norm": 1.7160568167139532,
"learning_rate": 7.923501228988616e-05,
"loss": 1.2864,
"step": 148
},
{
"epoch": 0.7865390960079182,
"grad_norm": 1.485339867730922,
"learning_rate": 7.920596928033819e-05,
"loss": 1.3044,
"step": 149
},
{
"epoch": 0.791817881887166,
"grad_norm": 0.7690452839999546,
"learning_rate": 7.917639070405464e-05,
"loss": 1.2923,
"step": 150
},
{
"epoch": 0.7970966677664137,
"grad_norm": 1.1067614665986554,
"learning_rate": 7.91462769650888e-05,
"loss": 1.3,
"step": 151
},
{
"epoch": 0.8023754536456615,
"grad_norm": 1.9038151496223523,
"learning_rate": 7.911562847480446e-05,
"loss": 1.2881,
"step": 152
},
{
"epoch": 0.8076542395249092,
"grad_norm": 1.328802107876604,
"learning_rate": 7.908444565187034e-05,
"loss": 1.2967,
"step": 153
},
{
"epoch": 0.812933025404157,
"grad_norm": 0.8601798964930342,
"learning_rate": 7.905272892225426e-05,
"loss": 1.2922,
"step": 154
},
{
"epoch": 0.8182118112834048,
"grad_norm": 1.2277724236844354,
"learning_rate": 7.902047871921748e-05,
"loss": 1.2904,
"step": 155
},
{
"epoch": 0.8234905971626526,
"grad_norm": 1.7600047771215326,
"learning_rate": 7.898769548330857e-05,
"loss": 1.2952,
"step": 156
},
{
"epoch": 0.8287693830419004,
"grad_norm": 1.147137205196615,
"learning_rate": 7.895437966235759e-05,
"loss": 1.3084,
"step": 157
},
{
"epoch": 0.8340481689211481,
"grad_norm": 1.4328130237637848,
"learning_rate": 7.892053171146988e-05,
"loss": 1.3145,
"step": 158
},
{
"epoch": 0.8393269548003959,
"grad_norm": 1.2401237696521015,
"learning_rate": 7.888615209301981e-05,
"loss": 1.2995,
"step": 159
},
{
"epoch": 0.8446057406796437,
"grad_norm": 1.2705800837028438,
"learning_rate": 7.885124127664456e-05,
"loss": 1.2914,
"step": 160
},
{
"epoch": 0.8498845265588915,
"grad_norm": 0.7892106895753584,
"learning_rate": 7.881579973923763e-05,
"loss": 1.2995,
"step": 161
},
{
"epoch": 0.8551633124381393,
"grad_norm": 1.558474494813178,
"learning_rate": 7.877982796494235e-05,
"loss": 1.3144,
"step": 162
},
{
"epoch": 0.860442098317387,
"grad_norm": 1.2556130173314157,
"learning_rate": 7.874332644514525e-05,
"loss": 1.2899,
"step": 163
},
{
"epoch": 0.8657208841966347,
"grad_norm": 1.3752903536629737,
"learning_rate": 7.87062956784694e-05,
"loss": 1.2922,
"step": 164
},
{
"epoch": 0.8709996700758825,
"grad_norm": 1.1533060282069754,
"learning_rate": 7.86687361707675e-05,
"loss": 1.3035,
"step": 165
},
{
"epoch": 0.8762784559551303,
"grad_norm": 1.547240673853204,
"learning_rate": 7.86306484351151e-05,
"loss": 1.297,
"step": 166
},
{
"epoch": 0.8815572418343781,
"grad_norm": 1.118967928263504,
"learning_rate": 7.859203299180347e-05,
"loss": 1.304,
"step": 167
},
{
"epoch": 0.8868360277136259,
"grad_norm": 1.5660976179186918,
"learning_rate": 7.855289036833259e-05,
"loss": 1.2896,
"step": 168
},
{
"epoch": 0.8921148135928736,
"grad_norm": 1.06197414137172,
"learning_rate": 7.851322109940383e-05,
"loss": 1.296,
"step": 169
},
{
"epoch": 0.8973935994721214,
"grad_norm": 1.3075400468268668,
"learning_rate": 7.847302572691277e-05,
"loss": 1.2761,
"step": 170
},
{
"epoch": 0.9026723853513692,
"grad_norm": 1.179457262497728,
"learning_rate": 7.843230479994173e-05,
"loss": 1.2824,
"step": 171
},
{
"epoch": 0.907951171230617,
"grad_norm": 1.3418331018558847,
"learning_rate": 7.839105887475228e-05,
"loss": 1.2932,
"step": 172
},
{
"epoch": 0.9132299571098648,
"grad_norm": 1.4832264655838705,
"learning_rate": 7.834928851477764e-05,
"loss": 1.2885,
"step": 173
},
{
"epoch": 0.9185087429891126,
"grad_norm": 0.8499346587639997,
"learning_rate": 7.830699429061498e-05,
"loss": 1.2915,
"step": 174
},
{
"epoch": 0.9237875288683602,
"grad_norm": 1.171937958836628,
"learning_rate": 7.826417678001763e-05,
"loss": 1.302,
"step": 175
},
{
"epoch": 0.929066314747608,
"grad_norm": 1.290565505153455,
"learning_rate": 7.822083656788722e-05,
"loss": 1.276,
"step": 176
},
{
"epoch": 0.9343451006268558,
"grad_norm": 1.4411711180611884,
"learning_rate": 7.817697424626562e-05,
"loss": 1.2962,
"step": 177
},
{
"epoch": 0.9396238865061036,
"grad_norm": 0.9520805041627028,
"learning_rate": 7.813259041432689e-05,
"loss": 1.2736,
"step": 178
},
{
"epoch": 0.9449026723853514,
"grad_norm": 1.3339621010189933,
"learning_rate": 7.808768567836913e-05,
"loss": 1.2915,
"step": 179
},
{
"epoch": 0.9501814582645991,
"grad_norm": 1.3347433339166948,
"learning_rate": 7.804226065180615e-05,
"loss": 1.2872,
"step": 180
},
{
"epoch": 0.9554602441438469,
"grad_norm": 0.7733485264737051,
"learning_rate": 7.79963159551591e-05,
"loss": 1.2963,
"step": 181
},
{
"epoch": 0.9607390300230947,
"grad_norm": 1.0314173735628651,
"learning_rate": 7.794985221604798e-05,
"loss": 1.3129,
"step": 182
},
{
"epoch": 0.9660178159023425,
"grad_norm": 1.1663661577084024,
"learning_rate": 7.790287006918311e-05,
"loss": 1.2886,
"step": 183
},
{
"epoch": 0.9712966017815903,
"grad_norm": 1.5581398263647552,
"learning_rate": 7.785537015635646e-05,
"loss": 1.3064,
"step": 184
},
{
"epoch": 0.976575387660838,
"grad_norm": 0.9722670744894553,
"learning_rate": 7.78073531264328e-05,
"loss": 1.2782,
"step": 185
},
{
"epoch": 0.9818541735400858,
"grad_norm": 1.3125002501899055,
"learning_rate": 7.77588196353409e-05,
"loss": 1.2956,
"step": 186
},
{
"epoch": 0.9871329594193335,
"grad_norm": 0.9504483070516759,
"learning_rate": 7.770977034606463e-05,
"loss": 1.2648,
"step": 187
},
{
"epoch": 0.9924117452985813,
"grad_norm": 1.6171235581177341,
"learning_rate": 7.766020592863375e-05,
"loss": 1.2968,
"step": 188
},
{
"epoch": 0.9976905311778291,
"grad_norm": 0.8278118436268492,
"learning_rate": 7.76101270601149e-05,
"loss": 1.2878,
"step": 189
},
{
"epoch": 1.0046189376443417,
"grad_norm": 2.5192799011368443,
"learning_rate": 7.755953442460228e-05,
"loss": 2.3905,
"step": 190
},
{
"epoch": 1.0098977235235895,
"grad_norm": 1.2367786491086867,
"learning_rate": 7.75084287132083e-05,
"loss": 1.2562,
"step": 191
},
{
"epoch": 1.0151765094028373,
"grad_norm": 0.9651190908311478,
"learning_rate": 7.745681062405421e-05,
"loss": 1.2579,
"step": 192
},
{
"epoch": 1.0204552952820851,
"grad_norm": 1.3333896910793919,
"learning_rate": 7.740468086226046e-05,
"loss": 1.2751,
"step": 193
},
{
"epoch": 1.025734081161333,
"grad_norm": 1.0679022617096325,
"learning_rate": 7.735204013993714e-05,
"loss": 1.2645,
"step": 194
},
{
"epoch": 1.0310128670405807,
"grad_norm": 0.9950802797680122,
"learning_rate": 7.729888917617424e-05,
"loss": 1.247,
"step": 195
},
{
"epoch": 1.0362916529198285,
"grad_norm": 1.385909740174782,
"learning_rate": 7.724522869703182e-05,
"loss": 1.2696,
"step": 196
},
{
"epoch": 1.0415704387990763,
"grad_norm": 0.9972863560186646,
"learning_rate": 7.719105943553007e-05,
"loss": 1.2422,
"step": 197
},
{
"epoch": 1.046849224678324,
"grad_norm": 1.4770115211452153,
"learning_rate": 7.713638213163933e-05,
"loss": 1.2769,
"step": 198
},
{
"epoch": 1.0521280105575717,
"grad_norm": 0.9454690125193276,
"learning_rate": 7.708119753226999e-05,
"loss": 1.2483,
"step": 199
},
{
"epoch": 1.0574067964368195,
"grad_norm": 1.0793753061289406,
"learning_rate": 7.702550639126226e-05,
"loss": 1.2523,
"step": 200
},
{
"epoch": 1.0626855823160672,
"grad_norm": 1.0252549645817886,
"learning_rate": 7.696930946937584e-05,
"loss": 1.2709,
"step": 201
},
{
"epoch": 1.067964368195315,
"grad_norm": 1.8767373189017829,
"learning_rate": 7.691260753427962e-05,
"loss": 1.257,
"step": 202
},
{
"epoch": 1.0732431540745628,
"grad_norm": 0.9835935975146896,
"learning_rate": 7.68554013605411e-05,
"loss": 1.2792,
"step": 203
},
{
"epoch": 1.0785219399538106,
"grad_norm": 1.7392219386374146,
"learning_rate": 7.679769172961588e-05,
"loss": 1.2813,
"step": 204
},
{
"epoch": 1.0838007258330584,
"grad_norm": 1.2615250940260356,
"learning_rate": 7.673947942983693e-05,
"loss": 1.2633,
"step": 205
},
{
"epoch": 1.0890795117123062,
"grad_norm": 1.2633538446470585,
"learning_rate": 7.668076525640386e-05,
"loss": 1.3028,
"step": 206
},
{
"epoch": 1.094358297591554,
"grad_norm": 0.9941464449801785,
"learning_rate": 7.662155001137206e-05,
"loss": 1.2603,
"step": 207
},
{
"epoch": 1.0996370834708018,
"grad_norm": 1.0879031561585994,
"learning_rate": 7.656183450364166e-05,
"loss": 1.2731,
"step": 208
},
{
"epoch": 1.1049158693500494,
"grad_norm": 0.8845819323205296,
"learning_rate": 7.650161954894666e-05,
"loss": 1.2779,
"step": 209
},
{
"epoch": 1.1101946552292972,
"grad_norm": 1.154096961423528,
"learning_rate": 7.644090596984355e-05,
"loss": 1.2631,
"step": 210
},
{
"epoch": 1.115473441108545,
"grad_norm": 1.5088917906508794,
"learning_rate": 7.637969459570027e-05,
"loss": 1.2737,
"step": 211
},
{
"epoch": 1.1207522269877928,
"grad_norm": 0.748699753662145,
"learning_rate": 7.63179862626848e-05,
"loss": 1.2541,
"step": 212
},
{
"epoch": 1.1260310128670405,
"grad_norm": 1.2438464709370354,
"learning_rate": 7.625578181375373e-05,
"loss": 1.244,
"step": 213
},
{
"epoch": 1.1313097987462883,
"grad_norm": 1.1961589311155292,
"learning_rate": 7.619308209864079e-05,
"loss": 1.2596,
"step": 214
},
{
"epoch": 1.1365885846255361,
"grad_norm": 1.1828866928066657,
"learning_rate": 7.612988797384516e-05,
"loss": 1.2737,
"step": 215
},
{
"epoch": 1.141867370504784,
"grad_norm": 0.8880753274946044,
"learning_rate": 7.606620030261987e-05,
"loss": 1.2612,
"step": 216
},
{
"epoch": 1.1471461563840317,
"grad_norm": 0.9206877300452608,
"learning_rate": 7.600201995495993e-05,
"loss": 1.2499,
"step": 217
},
{
"epoch": 1.1524249422632795,
"grad_norm": 0.6583338875617647,
"learning_rate": 7.593734780759052e-05,
"loss": 1.2486,
"step": 218
},
{
"epoch": 1.1577037281425273,
"grad_norm": 0.6183246384902424,
"learning_rate": 7.587218474395492e-05,
"loss": 1.2497,
"step": 219
},
{
"epoch": 1.1629825140217749,
"grad_norm": 0.6843337221591282,
"learning_rate": 7.58065316542025e-05,
"loss": 1.2564,
"step": 220
},
{
"epoch": 1.1682612999010227,
"grad_norm": 0.8921369062315051,
"learning_rate": 7.574038943517657e-05,
"loss": 1.2761,
"step": 221
},
{
"epoch": 1.1735400857802705,
"grad_norm": 0.9769312253319868,
"learning_rate": 7.567375899040212e-05,
"loss": 1.2651,
"step": 222
},
{
"epoch": 1.1788188716595183,
"grad_norm": 1.2636950297671263,
"learning_rate": 7.560664123007341e-05,
"loss": 1.2429,
"step": 223
},
{
"epoch": 1.184097657538766,
"grad_norm": 1.12325478849589,
"learning_rate": 7.55390370710417e-05,
"loss": 1.2499,
"step": 224
},
{
"epoch": 1.1893764434180139,
"grad_norm": 0.6688626443177845,
"learning_rate": 7.547094743680248e-05,
"loss": 1.2629,
"step": 225
},
{
"epoch": 1.1946552292972616,
"grad_norm": 0.520812499306808,
"learning_rate": 7.540237325748312e-05,
"loss": 1.2504,
"step": 226
},
{
"epoch": 1.1999340151765094,
"grad_norm": 0.5778902503931598,
"learning_rate": 7.533331546982999e-05,
"loss": 1.2405,
"step": 227
},
{
"epoch": 1.2052128010557572,
"grad_norm": 0.8253860085731144,
"learning_rate": 7.526377501719568e-05,
"loss": 1.2453,
"step": 228
},
{
"epoch": 1.210491586935005,
"grad_norm": 0.9450184881775752,
"learning_rate": 7.51937528495262e-05,
"loss": 1.2668,
"step": 229
},
{
"epoch": 1.2157703728142528,
"grad_norm": 0.8332968398615714,
"learning_rate": 7.512324992334792e-05,
"loss": 1.2492,
"step": 230
},
{
"epoch": 1.2210491586935004,
"grad_norm": 0.9025930411004827,
"learning_rate": 7.505226720175455e-05,
"loss": 1.2535,
"step": 231
},
{
"epoch": 1.2263279445727482,
"grad_norm": 1.4269868470907296,
"learning_rate": 7.498080565439395e-05,
"loss": 1.2497,
"step": 232
},
{
"epoch": 1.231606730451996,
"grad_norm": 0.8001271623337567,
"learning_rate": 7.49088662574549e-05,
"loss": 1.2563,
"step": 233
},
{
"epoch": 1.2368855163312438,
"grad_norm": 0.959351713231221,
"learning_rate": 7.483644999365379e-05,
"loss": 1.2635,
"step": 234
},
{
"epoch": 1.2421643022104916,
"grad_norm": 0.9377746411915463,
"learning_rate": 7.476355785222114e-05,
"loss": 1.2653,
"step": 235
},
{
"epoch": 1.2474430880897394,
"grad_norm": 1.4913928935233076,
"learning_rate": 7.469019082888814e-05,
"loss": 1.2659,
"step": 236
},
{
"epoch": 1.2527218739689872,
"grad_norm": 0.7519497431119264,
"learning_rate": 7.461634992587303e-05,
"loss": 1.2653,
"step": 237
},
{
"epoch": 1.258000659848235,
"grad_norm": 0.9328702237386081,
"learning_rate": 7.45420361518674e-05,
"loss": 1.2585,
"step": 238
},
{
"epoch": 1.2632794457274827,
"grad_norm": 1.4443370753472227,
"learning_rate": 7.446725052202239e-05,
"loss": 1.2674,
"step": 239
},
{
"epoch": 1.2685582316067303,
"grad_norm": 0.6850260445092191,
"learning_rate": 7.43919940579349e-05,
"loss": 1.2468,
"step": 240
},
{
"epoch": 1.2738370174859783,
"grad_norm": 1.12844194486329,
"learning_rate": 7.431626778763355e-05,
"loss": 1.2592,
"step": 241
},
{
"epoch": 1.279115803365226,
"grad_norm": 1.0108743560548688,
"learning_rate": 7.424007274556467e-05,
"loss": 1.2447,
"step": 242
},
{
"epoch": 1.2843945892444737,
"grad_norm": 0.8135944524134671,
"learning_rate": 7.416340997257819e-05,
"loss": 1.2465,
"step": 243
},
{
"epoch": 1.2896733751237215,
"grad_norm": 0.911884059778029,
"learning_rate": 7.408628051591336e-05,
"loss": 1.2481,
"step": 244
},
{
"epoch": 1.2949521610029693,
"grad_norm": 0.6823887927427031,
"learning_rate": 7.400868542918457e-05,
"loss": 1.2413,
"step": 245
},
{
"epoch": 1.300230946882217,
"grad_norm": 0.6456313903186492,
"learning_rate": 7.393062577236679e-05,
"loss": 1.255,
"step": 246
},
{
"epoch": 1.3055097327614649,
"grad_norm": 0.7655605961879589,
"learning_rate": 7.385210261178121e-05,
"loss": 1.2559,
"step": 247
},
{
"epoch": 1.3107885186407127,
"grad_norm": 1.2704761768677957,
"learning_rate": 7.377311702008061e-05,
"loss": 1.275,
"step": 248
},
{
"epoch": 1.3160673045199605,
"grad_norm": 0.7188540018618917,
"learning_rate": 7.369367007623477e-05,
"loss": 1.2622,
"step": 249
},
{
"epoch": 1.3213460903992083,
"grad_norm": 0.7077473160288391,
"learning_rate": 7.361376286551571e-05,
"loss": 1.26,
"step": 250
},
{
"epoch": 1.3266248762784558,
"grad_norm": 0.6818378006965554,
"learning_rate": 7.353339647948279e-05,
"loss": 1.2636,
"step": 251
},
{
"epoch": 1.3319036621577038,
"grad_norm": 0.9321991492837668,
"learning_rate": 7.345257201596789e-05,
"loss": 1.2506,
"step": 252
},
{
"epoch": 1.3371824480369514,
"grad_norm": 3.3053933752949005,
"learning_rate": 7.337129057906042e-05,
"loss": 1.2786,
"step": 253
},
{
"epoch": 1.3424612339161992,
"grad_norm": 2.593380141147791,
"learning_rate": 7.328955327909212e-05,
"loss": 1.2541,
"step": 254
},
{
"epoch": 1.347740019795447,
"grad_norm": 0.6054358194419176,
"learning_rate": 7.320736123262203e-05,
"loss": 1.2582,
"step": 255
},
{
"epoch": 1.3530188056746948,
"grad_norm": 1.25329796192022,
"learning_rate": 7.312471556242118e-05,
"loss": 1.2556,
"step": 256
},
{
"epoch": 1.3582975915539426,
"grad_norm": 1.0368918572602928,
"learning_rate": 7.304161739745724e-05,
"loss": 1.2481,
"step": 257
},
{
"epoch": 1.3635763774331904,
"grad_norm": 1.3001573393775,
"learning_rate": 7.295806787287909e-05,
"loss": 1.2691,
"step": 258
},
{
"epoch": 1.3688551633124382,
"grad_norm": 18.476131898389347,
"learning_rate": 7.287406813000138e-05,
"loss": 1.2768,
"step": 259
},
{
"epoch": 1.374133949191686,
"grad_norm": 1.6192604738085483,
"learning_rate": 7.278961931628886e-05,
"loss": 1.2839,
"step": 260
},
{
"epoch": 1.3794127350709338,
"grad_norm": 1.7679541160592658,
"learning_rate": 7.270472258534072e-05,
"loss": 1.2703,
"step": 261
},
{
"epoch": 1.3846915209501813,
"grad_norm": 1.8000057405797747,
"learning_rate": 7.261937909687494e-05,
"loss": 1.269,
"step": 262
},
{
"epoch": 1.3899703068294293,
"grad_norm": 1.4623812962670848,
"learning_rate": 7.253359001671224e-05,
"loss": 1.2548,
"step": 263
},
{
"epoch": 1.395249092708677,
"grad_norm": 0.9642896538975226,
"learning_rate": 7.244735651676035e-05,
"loss": 1.2513,
"step": 264
},
{
"epoch": 1.4005278785879247,
"grad_norm": 1.8215181671865868,
"learning_rate": 7.236067977499791e-05,
"loss": 1.2683,
"step": 265
},
{
"epoch": 1.4058066644671725,
"grad_norm": 1.8947839088124854,
"learning_rate": 7.227356097545835e-05,
"loss": 1.2688,
"step": 266
},
{
"epoch": 1.4110854503464203,
"grad_norm": 1.257578409586287,
"learning_rate": 7.218600130821385e-05,
"loss": 1.2656,
"step": 267
},
{
"epoch": 1.416364236225668,
"grad_norm": 1.3538243305929665,
"learning_rate": 7.209800196935888e-05,
"loss": 1.2623,
"step": 268
},
{
"epoch": 1.4216430221049159,
"grad_norm": 0.5092846767556427,
"learning_rate": 7.200956416099405e-05,
"loss": 1.2606,
"step": 269
},
{
"epoch": 1.4269218079841637,
"grad_norm": 1.3171858475761182,
"learning_rate": 7.192068909120959e-05,
"loss": 1.246,
"step": 270
},
{
"epoch": 1.4322005938634115,
"grad_norm": 0.9278498418105572,
"learning_rate": 7.183137797406886e-05,
"loss": 1.262,
"step": 271
},
{
"epoch": 1.4374793797426593,
"grad_norm": 0.7095356686741614,
"learning_rate": 7.174163202959178e-05,
"loss": 1.265,
"step": 272
},
{
"epoch": 1.4427581656219068,
"grad_norm": 0.5432578683940454,
"learning_rate": 7.165145248373814e-05,
"loss": 1.2641,
"step": 273
},
{
"epoch": 1.4480369515011549,
"grad_norm": 0.535484849080536,
"learning_rate": 7.15608405683909e-05,
"loss": 1.2645,
"step": 274
},
{
"epoch": 1.4533157373804024,
"grad_norm": 3.517558722980717,
"learning_rate": 7.146979752133934e-05,
"loss": 1.3068,
"step": 275
},
{
"epoch": 1.4585945232596502,
"grad_norm": 1.0961722347293192,
"learning_rate": 7.137832458626209e-05,
"loss": 1.2759,
"step": 276
},
{
"epoch": 1.463873309138898,
"grad_norm": 1.4589790859163712,
"learning_rate": 7.128642301271026e-05,
"loss": 1.2604,
"step": 277
},
{
"epoch": 1.4691520950181458,
"grad_norm": 0.8271599352115797,
"learning_rate": 7.119409405609025e-05,
"loss": 1.2574,
"step": 278
},
{
"epoch": 1.4744308808973936,
"grad_norm": 1.6144482683602592,
"learning_rate": 7.110133897764672e-05,
"loss": 1.2567,
"step": 279
},
{
"epoch": 1.4797096667766414,
"grad_norm": 1.0778238677287892,
"learning_rate": 7.10081590444452e-05,
"loss": 1.2633,
"step": 280
},
{
"epoch": 1.4849884526558892,
"grad_norm": 1.574048217009984,
"learning_rate": 7.091455552935499e-05,
"loss": 1.2721,
"step": 281
},
{
"epoch": 1.490267238535137,
"grad_norm": 1.4504862826022984,
"learning_rate": 7.082052971103158e-05,
"loss": 1.2527,
"step": 282
},
{
"epoch": 1.4955460244143848,
"grad_norm": 0.9050091986245387,
"learning_rate": 7.07260828738993e-05,
"loss": 1.2566,
"step": 283
},
{
"epoch": 1.5008248102936323,
"grad_norm": 1.3808682239204024,
"learning_rate": 7.063121630813374e-05,
"loss": 1.2662,
"step": 284
},
{
"epoch": 1.5061035961728804,
"grad_norm": 0.7200106206887804,
"learning_rate": 7.053593130964412e-05,
"loss": 1.2573,
"step": 285
},
{
"epoch": 1.511382382052128,
"grad_norm": 1.2302845526700892,
"learning_rate": 7.044022918005559e-05,
"loss": 1.2446,
"step": 286
},
{
"epoch": 1.5166611679313757,
"grad_norm": 0.8014756997371825,
"learning_rate": 7.034411122669142e-05,
"loss": 1.2665,
"step": 287
},
{
"epoch": 1.5219399538106235,
"grad_norm": 0.9094547926910241,
"learning_rate": 7.024757876255525e-05,
"loss": 1.2642,
"step": 288
},
{
"epoch": 1.5272187396898713,
"grad_norm": 0.7607544679621796,
"learning_rate": 7.015063310631299e-05,
"loss": 1.2547,
"step": 289
},
{
"epoch": 1.5324975255691191,
"grad_norm": 0.7181185070974516,
"learning_rate": 7.005327558227494e-05,
"loss": 1.2583,
"step": 290
},
{
"epoch": 1.537776311448367,
"grad_norm": 0.5716191527660932,
"learning_rate": 6.995550752037766e-05,
"loss": 1.2634,
"step": 291
},
{
"epoch": 1.5430550973276147,
"grad_norm": 0.5794658962331969,
"learning_rate": 6.985733025616576e-05,
"loss": 1.2594,
"step": 292
},
{
"epoch": 1.5483338832068623,
"grad_norm": 0.5823905292282883,
"learning_rate": 6.975874513077374e-05,
"loss": 1.2478,
"step": 293
},
{
"epoch": 1.5536126690861103,
"grad_norm": 0.5780008289508327,
"learning_rate": 6.965975349090757e-05,
"loss": 1.2501,
"step": 294
},
{
"epoch": 1.5588914549653579,
"grad_norm": 0.37458218109688246,
"learning_rate": 6.956035668882637e-05,
"loss": 1.2515,
"step": 295
},
{
"epoch": 1.5641702408446059,
"grad_norm": 0.4630840089540465,
"learning_rate": 6.946055608232392e-05,
"loss": 1.2555,
"step": 296
},
{
"epoch": 1.5694490267238534,
"grad_norm": 0.46579508102759115,
"learning_rate": 6.936035303471008e-05,
"loss": 1.2302,
"step": 297
},
{
"epoch": 1.5747278126031012,
"grad_norm": 0.440147654088893,
"learning_rate": 6.925974891479222e-05,
"loss": 1.2397,
"step": 298
},
{
"epoch": 1.580006598482349,
"grad_norm": 0.3789595605856227,
"learning_rate": 6.915874509685646e-05,
"loss": 1.2367,
"step": 299
},
{
"epoch": 1.5852853843615968,
"grad_norm": 0.44066314676421686,
"learning_rate": 6.905734296064897e-05,
"loss": 1.2532,
"step": 300
},
{
"epoch": 1.5905641702408446,
"grad_norm": 0.4786748650063442,
"learning_rate": 6.895554389135705e-05,
"loss": 1.2395,
"step": 301
},
{
"epoch": 1.5958429561200924,
"grad_norm": 0.3993789516492546,
"learning_rate": 6.885334927959022e-05,
"loss": 1.2475,
"step": 302
},
{
"epoch": 1.6011217419993402,
"grad_norm": 0.3734818160752607,
"learning_rate": 6.875076052136132e-05,
"loss": 1.2484,
"step": 303
},
{
"epoch": 1.6064005278785878,
"grad_norm": 0.3352937985165155,
"learning_rate": 6.864777901806728e-05,
"loss": 1.2369,
"step": 304
},
{
"epoch": 1.6116793137578358,
"grad_norm": 0.4671437568457166,
"learning_rate": 6.85444061764701e-05,
"loss": 1.2376,
"step": 305
},
{
"epoch": 1.6169580996370834,
"grad_norm": 0.3667230797477068,
"learning_rate": 6.844064340867759e-05,
"loss": 1.2487,
"step": 306
},
{
"epoch": 1.6222368855163314,
"grad_norm": 0.3770360726749058,
"learning_rate": 6.833649213212409e-05,
"loss": 1.2456,
"step": 307
},
{
"epoch": 1.627515671395579,
"grad_norm": 0.35396903252590395,
"learning_rate": 6.823195376955108e-05,
"loss": 1.2528,
"step": 308
},
{
"epoch": 1.6327944572748267,
"grad_norm": 0.29380579788484945,
"learning_rate": 6.812702974898779e-05,
"loss": 1.2441,
"step": 309
},
{
"epoch": 1.6380732431540745,
"grad_norm": 0.43788577617277263,
"learning_rate": 6.802172150373164e-05,
"loss": 1.239,
"step": 310
},
{
"epoch": 1.6433520290333223,
"grad_norm": 0.32377479140969234,
"learning_rate": 6.791603047232871e-05,
"loss": 1.233,
"step": 311
},
{
"epoch": 1.6486308149125701,
"grad_norm": 1.7706277137744126,
"learning_rate": 6.780995809855405e-05,
"loss": 1.2474,
"step": 312
},
{
"epoch": 1.653909600791818,
"grad_norm": 8.656382792450561,
"learning_rate": 6.7703505831392e-05,
"loss": 1.265,
"step": 313
},
{
"epoch": 1.6591883866710657,
"grad_norm": 0.6523846362961528,
"learning_rate": 6.759667512501637e-05,
"loss": 1.2573,
"step": 314
},
{
"epoch": 1.6644671725503133,
"grad_norm": 0.5330875065870518,
"learning_rate": 6.748946743877052e-05,
"loss": 1.2488,
"step": 315
},
{
"epoch": 1.6697459584295613,
"grad_norm": 0.7219628521797609,
"learning_rate": 6.738188423714756e-05,
"loss": 1.2602,
"step": 316
},
{
"epoch": 1.6750247443088089,
"grad_norm": 0.5201448299460263,
"learning_rate": 6.727392698977021e-05,
"loss": 1.2396,
"step": 317
},
{
"epoch": 1.680303530188057,
"grad_norm": 0.44274195631429936,
"learning_rate": 6.716559717137084e-05,
"loss": 1.2406,
"step": 318
},
{
"epoch": 1.6855823160673045,
"grad_norm": 0.5798477856934268,
"learning_rate": 6.70568962617712e-05,
"loss": 1.2414,
"step": 319
},
{
"epoch": 1.6908611019465523,
"grad_norm": 0.7807750714971328,
"learning_rate": 6.69478257458623e-05,
"loss": 1.2413,
"step": 320
},
{
"epoch": 1.6961398878258,
"grad_norm": 1.1559025640488658,
"learning_rate": 6.683838711358411e-05,
"loss": 1.2841,
"step": 321
},
{
"epoch": 1.7014186737050478,
"grad_norm": 0.5759215742814919,
"learning_rate": 6.672858185990516e-05,
"loss": 1.2406,
"step": 322
},
{
"epoch": 1.7066974595842956,
"grad_norm": 0.4726737719351046,
"learning_rate": 6.661841148480218e-05,
"loss": 1.2484,
"step": 323
},
{
"epoch": 1.7119762454635434,
"grad_norm": 0.5006895849458483,
"learning_rate": 6.650787749323959e-05,
"loss": 1.2559,
"step": 324
},
{
"epoch": 1.7172550313427912,
"grad_norm": 0.625628745360177,
"learning_rate": 6.639698139514892e-05,
"loss": 1.2747,
"step": 325
},
{
"epoch": 1.7225338172220388,
"grad_norm": 0.6550942178842886,
"learning_rate": 6.628572470540814e-05,
"loss": 1.242,
"step": 326
},
{
"epoch": 1.7278126031012868,
"grad_norm": 0.5870546197249171,
"learning_rate": 6.617410894382113e-05,
"loss": 1.2453,
"step": 327
},
{
"epoch": 1.7330913889805344,
"grad_norm": 0.41134148223364553,
"learning_rate": 6.606213563509675e-05,
"loss": 1.2478,
"step": 328
},
{
"epoch": 1.7383701748597824,
"grad_norm": 0.46444131340246914,
"learning_rate": 6.594980630882807e-05,
"loss": 1.2425,
"step": 329
},
{
"epoch": 1.74364896073903,
"grad_norm": 0.5911200501832204,
"learning_rate": 6.58371224994715e-05,
"loss": 1.2458,
"step": 330
},
{
"epoch": 1.7489277466182778,
"grad_norm": 0.7085536258057434,
"learning_rate": 6.57240857463258e-05,
"loss": 1.244,
"step": 331
},
{
"epoch": 1.7542065324975256,
"grad_norm": 0.5992624201421093,
"learning_rate": 6.561069759351105e-05,
"loss": 1.2368,
"step": 332
},
{
"epoch": 1.7594853183767734,
"grad_norm": 0.3551137979944097,
"learning_rate": 6.54969595899476e-05,
"loss": 1.2416,
"step": 333
},
{
"epoch": 1.7647641042560211,
"grad_norm": 0.4082908080936244,
"learning_rate": 6.538287328933484e-05,
"loss": 1.253,
"step": 334
},
{
"epoch": 1.770042890135269,
"grad_norm": 0.5365388835965731,
"learning_rate": 6.526844025013004e-05,
"loss": 1.2254,
"step": 335
},
{
"epoch": 1.7753216760145167,
"grad_norm": 0.5024584986563364,
"learning_rate": 6.515366203552704e-05,
"loss": 1.2456,
"step": 336
},
{
"epoch": 1.7806004618937643,
"grad_norm": 0.44456650959553395,
"learning_rate": 6.503854021343487e-05,
"loss": 1.2196,
"step": 337
},
{
"epoch": 1.7858792477730123,
"grad_norm": 0.4783375231088745,
"learning_rate": 6.492307635645637e-05,
"loss": 1.2323,
"step": 338
},
{
"epoch": 1.79115803365226,
"grad_norm": 0.48814392623760583,
"learning_rate": 6.480727204186669e-05,
"loss": 1.2417,
"step": 339
},
{
"epoch": 1.796436819531508,
"grad_norm": 0.44118266288578545,
"learning_rate": 6.469112885159172e-05,
"loss": 1.2516,
"step": 340
},
{
"epoch": 1.8017156054107555,
"grad_norm": 0.9971330600754439,
"learning_rate": 6.457464837218656e-05,
"loss": 1.2524,
"step": 341
},
{
"epoch": 1.8069943912900033,
"grad_norm": 0.3714817216276515,
"learning_rate": 6.445783219481375e-05,
"loss": 1.2477,
"step": 342
},
{
"epoch": 1.812273177169251,
"grad_norm": 0.8549026356260457,
"learning_rate": 6.434068191522158e-05,
"loss": 1.2591,
"step": 343
},
{
"epoch": 1.8175519630484989,
"grad_norm": 0.3340586702181202,
"learning_rate": 6.42231991337223e-05,
"loss": 1.2372,
"step": 344
},
{
"epoch": 1.8228307489277467,
"grad_norm": 0.43304400565838075,
"learning_rate": 6.410538545517026e-05,
"loss": 1.243,
"step": 345
},
{
"epoch": 1.8281095348069942,
"grad_norm": 0.45719292572074793,
"learning_rate": 6.398724248893995e-05,
"loss": 1.2504,
"step": 346
},
{
"epoch": 1.8333883206862422,
"grad_norm": 0.599990078135568,
"learning_rate": 6.386877184890404e-05,
"loss": 1.2438,
"step": 347
},
{
"epoch": 1.8386671065654898,
"grad_norm": 0.6906831942159901,
"learning_rate": 6.374997515341136e-05,
"loss": 1.2477,
"step": 348
},
{
"epoch": 1.8439458924447378,
"grad_norm": 0.8731330067427534,
"learning_rate": 6.363085402526477e-05,
"loss": 1.2674,
"step": 349
},
{
"epoch": 1.8492246783239854,
"grad_norm": 0.9022754372270435,
"learning_rate": 6.351141009169893e-05,
"loss": 1.382,
"step": 350
},
{
"epoch": 1.8545034642032334,
"grad_norm": 1.2486459290282084,
"learning_rate": 6.33916449843582e-05,
"loss": 1.2545,
"step": 351
},
{
"epoch": 1.859782250082481,
"grad_norm": 1.6920822976080476,
"learning_rate": 6.327156033927426e-05,
"loss": 1.2658,
"step": 352
},
{
"epoch": 1.8650610359617288,
"grad_norm": 0.7097169693478338,
"learning_rate": 6.315115779684375e-05,
"loss": 1.236,
"step": 353
},
{
"epoch": 1.8703398218409766,
"grad_norm": 1.6978575452018114,
"learning_rate": 6.303043900180595e-05,
"loss": 1.2585,
"step": 354
},
{
"epoch": 1.8756186077202244,
"grad_norm": 0.7845252429794936,
"learning_rate": 6.290940560322022e-05,
"loss": 1.2517,
"step": 355
},
{
"epoch": 1.8808973935994722,
"grad_norm": 1.3607837437906547,
"learning_rate": 6.278805925444351e-05,
"loss": 1.256,
"step": 356
},
{
"epoch": 1.8861761794787197,
"grad_norm": 0.6792465095925219,
"learning_rate": 6.26664016131078e-05,
"loss": 1.2673,
"step": 357
},
{
"epoch": 1.8914549653579678,
"grad_norm": 1.1323653230234954,
"learning_rate": 6.25444343410974e-05,
"loss": 1.2587,
"step": 358
},
{
"epoch": 1.8967337512372153,
"grad_norm": 0.8336475113673848,
"learning_rate": 6.242215910452631e-05,
"loss": 1.2487,
"step": 359
},
{
"epoch": 1.9020125371164633,
"grad_norm": 1.0432671496997088,
"learning_rate": 6.229957757371542e-05,
"loss": 1.2685,
"step": 360
},
{
"epoch": 1.907291322995711,
"grad_norm": 1.282931842988959,
"learning_rate": 6.217669142316969e-05,
"loss": 1.2437,
"step": 361
},
{
"epoch": 1.9125701088749587,
"grad_norm": 0.5494429533946689,
"learning_rate": 6.205350233155528e-05,
"loss": 1.2385,
"step": 362
},
{
"epoch": 1.9178488947542065,
"grad_norm": 0.8430421321010195,
"learning_rate": 6.193001198167666e-05,
"loss": 1.2516,
"step": 363
},
{
"epoch": 1.9231276806334543,
"grad_norm": 0.6400343107166205,
"learning_rate": 6.180622206045357e-05,
"loss": 1.2514,
"step": 364
},
{
"epoch": 1.928406466512702,
"grad_norm": 0.683698880506509,
"learning_rate": 6.168213425889798e-05,
"loss": 1.2298,
"step": 365
},
{
"epoch": 1.9336852523919499,
"grad_norm": 0.7501472371069945,
"learning_rate": 6.155775027209104e-05,
"loss": 1.2631,
"step": 366
},
{
"epoch": 1.9389640382711977,
"grad_norm": 0.5723685480165793,
"learning_rate": 6.143307179915987e-05,
"loss": 1.2524,
"step": 367
},
{
"epoch": 1.9442428241504452,
"grad_norm": 0.5386840918093146,
"learning_rate": 6.130810054325438e-05,
"loss": 1.2604,
"step": 368
},
{
"epoch": 1.9495216100296933,
"grad_norm": 0.6074943796692782,
"learning_rate": 6.118283821152396e-05,
"loss": 1.2331,
"step": 369
},
{
"epoch": 1.9548003959089408,
"grad_norm": 0.7838692807153693,
"learning_rate": 6.105728651509424e-05,
"loss": 1.2489,
"step": 370
},
{
"epoch": 1.9600791817881889,
"grad_norm": 0.4905147139769546,
"learning_rate": 6.0931447169043645e-05,
"loss": 1.2187,
"step": 371
},
{
"epoch": 1.9653579676674364,
"grad_norm": 0.49373919780087727,
"learning_rate": 6.080532189238e-05,
"loss": 1.25,
"step": 372
},
{
"epoch": 1.9706367535466842,
"grad_norm": 0.5601632476322846,
"learning_rate": 6.067891240801702e-05,
"loss": 1.2433,
"step": 373
},
{
"epoch": 1.975915539425932,
"grad_norm": 0.5235707794626788,
"learning_rate": 6.0552220442750824e-05,
"loss": 1.2384,
"step": 374
},
{
"epoch": 1.9811943253051798,
"grad_norm": 0.5725817706676869,
"learning_rate": 6.042524772723628e-05,
"loss": 1.232,
"step": 375
},
{
"epoch": 1.9864731111844276,
"grad_norm": 0.3333019113840953,
"learning_rate": 6.0297995995963434e-05,
"loss": 1.2347,
"step": 376
},
{
"epoch": 1.9917518970636754,
"grad_norm": 0.4325220309561531,
"learning_rate": 6.017046698723374e-05,
"loss": 1.2351,
"step": 377
},
{
"epoch": 1.9970306829429232,
"grad_norm": 0.5371606229052233,
"learning_rate": 6.0042662443136396e-05,
"loss": 1.2433,
"step": 378
},
{
"epoch": 2.003959089409436,
"grad_norm": 0.9139703688362438,
"learning_rate": 5.991458410952449e-05,
"loss": 2.3033,
"step": 379
},
{
"epoch": 2.0092378752886835,
"grad_norm": 1.3521600980278237,
"learning_rate": 5.978623373599117e-05,
"loss": 1.2172,
"step": 380
},
{
"epoch": 2.0145166611679315,
"grad_norm": 0.7161418602043893,
"learning_rate": 5.965761307584571e-05,
"loss": 1.2157,
"step": 381
},
{
"epoch": 2.019795447047179,
"grad_norm": 0.7509184595632193,
"learning_rate": 5.9528723886089624e-05,
"loss": 1.1977,
"step": 382
},
{
"epoch": 2.025074232926427,
"grad_norm": 1.0599214341732555,
"learning_rate": 5.939956792739264e-05,
"loss": 1.2177,
"step": 383
},
{
"epoch": 2.0303530188056746,
"grad_norm": 0.8980168138332725,
"learning_rate": 5.9270146964068614e-05,
"loss": 1.2153,
"step": 384
},
{
"epoch": 2.0356318046849227,
"grad_norm": 0.7180380231092935,
"learning_rate": 5.9140462764051464e-05,
"loss": 1.2187,
"step": 385
},
{
"epoch": 2.0409105905641702,
"grad_norm": 0.6531462330254099,
"learning_rate": 5.901051709887101e-05,
"loss": 1.2251,
"step": 386
},
{
"epoch": 2.046189376443418,
"grad_norm": 0.7734045217497407,
"learning_rate": 5.888031174362878e-05,
"loss": 1.21,
"step": 387
},
{
"epoch": 2.051468162322666,
"grad_norm": 0.8550587940395448,
"learning_rate": 5.874984847697372e-05,
"loss": 1.2012,
"step": 388
},
{
"epoch": 2.0567469482019134,
"grad_norm": 1.2780965188327706,
"learning_rate": 5.8619129081077996e-05,
"loss": 1.2518,
"step": 389
},
{
"epoch": 2.0620257340811614,
"grad_norm": 0.8321341577982173,
"learning_rate": 5.848815534161254e-05,
"loss": 1.2093,
"step": 390
},
{
"epoch": 2.067304519960409,
"grad_norm": 0.9615664316853234,
"learning_rate": 5.83569290477227e-05,
"loss": 1.2072,
"step": 391
},
{
"epoch": 2.072583305839657,
"grad_norm": 0.5979739722697388,
"learning_rate": 5.822545199200383e-05,
"loss": 1.2336,
"step": 392
},
{
"epoch": 2.0778620917189046,
"grad_norm": 0.8164608679169012,
"learning_rate": 5.8093725970476755e-05,
"loss": 1.1964,
"step": 393
},
{
"epoch": 2.0831408775981526,
"grad_norm": 3.085844123659633,
"learning_rate": 5.796175278256328e-05,
"loss": 1.2152,
"step": 394
},
{
"epoch": 2.0884196634774,
"grad_norm": 2.7108585877216593,
"learning_rate": 5.782953423106154e-05,
"loss": 1.2449,
"step": 395
},
{
"epoch": 2.093698449356648,
"grad_norm": 0.8230346043328058,
"learning_rate": 5.769707212212147e-05,
"loss": 1.2061,
"step": 396
},
{
"epoch": 2.0989772352358957,
"grad_norm": 1.227853136572132,
"learning_rate": 5.756436826522005e-05,
"loss": 1.2376,
"step": 397
},
{
"epoch": 2.1042560211151433,
"grad_norm": 0.753658806600911,
"learning_rate": 5.743142447313664e-05,
"loss": 1.2157,
"step": 398
},
{
"epoch": 2.1095348069943913,
"grad_norm": 1.015129720892302,
"learning_rate": 5.729824256192816e-05,
"loss": 1.2119,
"step": 399
},
{
"epoch": 2.114813592873639,
"grad_norm": 0.5801446237487646,
"learning_rate": 5.716482435090436e-05,
"loss": 1.2186,
"step": 400
},
{
"epoch": 2.120092378752887,
"grad_norm": 1.017251986410329,
"learning_rate": 5.703117166260291e-05,
"loss": 1.2263,
"step": 401
},
{
"epoch": 2.1253711646321345,
"grad_norm": 0.4873010162044414,
"learning_rate": 5.68972863227645e-05,
"loss": 1.2196,
"step": 402
},
{
"epoch": 2.1306499505113825,
"grad_norm": 0.7310559896138776,
"learning_rate": 5.676317016030795e-05,
"loss": 1.2177,
"step": 403
},
{
"epoch": 2.13592873639063,
"grad_norm": 1.1699510054393627,
"learning_rate": 5.662882500730517e-05,
"loss": 1.3042,
"step": 404
},
{
"epoch": 2.141207522269878,
"grad_norm": 0.676699808456523,
"learning_rate": 5.6494252698956146e-05,
"loss": 1.2048,
"step": 405
},
{
"epoch": 2.1464863081491257,
"grad_norm": 0.6473663142899424,
"learning_rate": 5.6359455073563936e-05,
"loss": 1.2161,
"step": 406
},
{
"epoch": 2.1517650940283737,
"grad_norm": 0.4172949088763905,
"learning_rate": 5.6224433972509433e-05,
"loss": 1.2192,
"step": 407
},
{
"epoch": 2.1570438799076213,
"grad_norm": 1.1239469184334865,
"learning_rate": 5.608919124022636e-05,
"loss": 1.2329,
"step": 408
},
{
"epoch": 2.162322665786869,
"grad_norm": 0.4587267505463191,
"learning_rate": 5.595372872417593e-05,
"loss": 1.2217,
"step": 409
},
{
"epoch": 2.167601451666117,
"grad_norm": 5.412203166967258,
"learning_rate": 5.58180482748217e-05,
"loss": 1.2201,
"step": 410
},
{
"epoch": 2.1728802375453644,
"grad_norm": 18.213451676887374,
"learning_rate": 5.568215174560431e-05,
"loss": 1.351,
"step": 411
},
{
"epoch": 2.1781590234246124,
"grad_norm": 13.536878824327069,
"learning_rate": 5.554604099291604e-05,
"loss": 1.2383,
"step": 412
},
{
"epoch": 2.18343780930386,
"grad_norm": 6.282206839205327,
"learning_rate": 5.5409717876075605e-05,
"loss": 1.2588,
"step": 413
},
{
"epoch": 2.188716595183108,
"grad_norm": 1.2831210440571694,
"learning_rate": 5.527318425730268e-05,
"loss": 1.238,
"step": 414
},
{
"epoch": 2.1939953810623556,
"grad_norm": 3.739838095929456,
"learning_rate": 5.513644200169242e-05,
"loss": 1.229,
"step": 415
},
{
"epoch": 2.1992741669416036,
"grad_norm": 1.899326073319162,
"learning_rate": 5.499949297719006e-05,
"loss": 1.2544,
"step": 416
},
{
"epoch": 2.204552952820851,
"grad_norm": 4.148642778578833,
"learning_rate": 5.486233905456538e-05,
"loss": 1.2406,
"step": 417
},
{
"epoch": 2.2098317387000987,
"grad_norm": 14.647605474491863,
"learning_rate": 5.472498210738713e-05,
"loss": 1.2508,
"step": 418
},
{
"epoch": 2.2151105245793468,
"grad_norm": 2.4488321875685157,
"learning_rate": 5.458742401199741e-05,
"loss": 1.2625,
"step": 419
},
{
"epoch": 2.2203893104585943,
"grad_norm": 1.2822438195586405,
"learning_rate": 5.444966664748613e-05,
"loss": 1.2378,
"step": 420
},
{
"epoch": 2.2256680963378423,
"grad_norm": 1.203185114838534,
"learning_rate": 5.431171189566522e-05,
"loss": 1.238,
"step": 421
},
{
"epoch": 2.23094688221709,
"grad_norm": 1.3127305879869517,
"learning_rate": 5.417356164104306e-05,
"loss": 1.2513,
"step": 422
},
{
"epoch": 2.236225668096338,
"grad_norm": 0.8297161506665818,
"learning_rate": 5.40352177707986e-05,
"loss": 1.2286,
"step": 423
},
{
"epoch": 2.2415044539755855,
"grad_norm": 0.7963612920042281,
"learning_rate": 5.389668217475566e-05,
"loss": 1.2333,
"step": 424
},
{
"epoch": 2.2467832398548335,
"grad_norm": 0.8841598541409225,
"learning_rate": 5.3757956745357134e-05,
"loss": 1.2401,
"step": 425
},
{
"epoch": 2.252062025734081,
"grad_norm": 0.6425481585506017,
"learning_rate": 5.3619043377639055e-05,
"loss": 1.2298,
"step": 426
},
{
"epoch": 2.257340811613329,
"grad_norm": 1.4204595652997734,
"learning_rate": 5.347994396920479e-05,
"loss": 1.2209,
"step": 427
},
{
"epoch": 2.2626195974925767,
"grad_norm": 0.6873192257170075,
"learning_rate": 5.334066042019907e-05,
"loss": 1.2254,
"step": 428
},
{
"epoch": 2.2678983833718247,
"grad_norm": 1.0883491912415153,
"learning_rate": 5.320119463328207e-05,
"loss": 1.2169,
"step": 429
},
{
"epoch": 2.2731771692510723,
"grad_norm": 0.6756173808771652,
"learning_rate": 5.306154851360333e-05,
"loss": 1.2371,
"step": 430
},
{
"epoch": 2.27845595513032,
"grad_norm": 0.9877713663217423,
"learning_rate": 5.2921723968775896e-05,
"loss": 1.2081,
"step": 431
},
{
"epoch": 2.283734741009568,
"grad_norm": 0.8015364281608918,
"learning_rate": 5.2781722908850086e-05,
"loss": 1.2086,
"step": 432
},
{
"epoch": 2.2890135268888154,
"grad_norm": 0.6008408657002147,
"learning_rate": 5.264154724628751e-05,
"loss": 1.2218,
"step": 433
},
{
"epoch": 2.2942923127680634,
"grad_norm": 0.6734753765950555,
"learning_rate": 5.250119889593488e-05,
"loss": 1.2171,
"step": 434
},
{
"epoch": 2.299571098647311,
"grad_norm": 0.42516851079297513,
"learning_rate": 5.23606797749979e-05,
"loss": 1.2219,
"step": 435
},
{
"epoch": 2.304849884526559,
"grad_norm": 0.5063759909871318,
"learning_rate": 5.221999180301506e-05,
"loss": 1.2063,
"step": 436
},
{
"epoch": 2.3101286704058066,
"grad_norm": 0.4563996005720369,
"learning_rate": 5.2079136901831425e-05,
"loss": 1.2209,
"step": 437
},
{
"epoch": 2.3154074562850546,
"grad_norm": 0.3910351995919037,
"learning_rate": 5.1938116995572325e-05,
"loss": 1.2179,
"step": 438
},
{
"epoch": 2.320686242164302,
"grad_norm": 0.5608831097319059,
"learning_rate": 5.179693401061714e-05,
"loss": 1.2197,
"step": 439
},
{
"epoch": 2.3259650280435498,
"grad_norm": 0.4703055804548109,
"learning_rate": 5.1655589875572994e-05,
"loss": 1.2136,
"step": 440
},
{
"epoch": 2.331243813922798,
"grad_norm": 0.44399692780721745,
"learning_rate": 5.151408652124831e-05,
"loss": 1.2111,
"step": 441
},
{
"epoch": 2.3365225998020454,
"grad_norm": 0.4792323461061973,
"learning_rate": 5.1372425880626536e-05,
"loss": 1.2166,
"step": 442
},
{
"epoch": 2.3418013856812934,
"grad_norm": 0.38455094020768915,
"learning_rate": 5.1230609888839724e-05,
"loss": 1.2161,
"step": 443
},
{
"epoch": 2.347080171560541,
"grad_norm": 0.4279941829860823,
"learning_rate": 5.108864048314204e-05,
"loss": 1.2203,
"step": 444
},
{
"epoch": 2.352358957439789,
"grad_norm": 0.2777582548813507,
"learning_rate": 5.0946519602883326e-05,
"loss": 1.2144,
"step": 445
},
{
"epoch": 2.3576377433190365,
"grad_norm": 0.5492728533936109,
"learning_rate": 5.0804249189482664e-05,
"loss": 1.2233,
"step": 446
},
{
"epoch": 2.3629165291982845,
"grad_norm": 0.2985781488436604,
"learning_rate": 5.066183118640177e-05,
"loss": 1.2158,
"step": 447
},
{
"epoch": 2.368195315077532,
"grad_norm": 0.3207066404394407,
"learning_rate": 5.0519267539118506e-05,
"loss": 1.2113,
"step": 448
},
{
"epoch": 2.37347410095678,
"grad_norm": 0.2926582089437268,
"learning_rate": 5.037656019510028e-05,
"loss": 1.1993,
"step": 449
},
{
"epoch": 2.3787528868360277,
"grad_norm": 0.2800596258887348,
"learning_rate": 5.023371110377743e-05,
"loss": 1.2074,
"step": 450
},
{
"epoch": 2.3840316727152757,
"grad_norm": 0.3116276776485492,
"learning_rate": 5.009072221651662e-05,
"loss": 1.2239,
"step": 451
},
{
"epoch": 2.3893104585945233,
"grad_norm": 0.22694341264711498,
"learning_rate": 4.9947595486594206e-05,
"loss": 1.2124,
"step": 452
},
{
"epoch": 2.394589244473771,
"grad_norm": 0.2733742917735896,
"learning_rate": 4.9804332869169436e-05,
"loss": 1.2084,
"step": 453
},
{
"epoch": 2.399868030353019,
"grad_norm": 0.34240813380863155,
"learning_rate": 4.966093632125792e-05,
"loss": 1.2208,
"step": 454
},
{
"epoch": 2.4051468162322664,
"grad_norm": 0.3184112467462416,
"learning_rate": 4.951740780170475e-05,
"loss": 1.2177,
"step": 455
},
{
"epoch": 2.4104256021115145,
"grad_norm": 0.21605403579412932,
"learning_rate": 4.937374927115783e-05,
"loss": 1.2026,
"step": 456
},
{
"epoch": 2.415704387990762,
"grad_norm": 0.3270749368088314,
"learning_rate": 4.9229962692041e-05,
"loss": 1.2115,
"step": 457
},
{
"epoch": 2.42098317387001,
"grad_norm": 0.22532926778581241,
"learning_rate": 4.908605002852735e-05,
"loss": 1.2151,
"step": 458
},
{
"epoch": 2.4262619597492576,
"grad_norm": 0.8669157175290935,
"learning_rate": 4.89420132465123e-05,
"loss": 1.2216,
"step": 459
},
{
"epoch": 2.4315407456285056,
"grad_norm": 0.29610996332037504,
"learning_rate": 4.879785431358675e-05,
"loss": 1.2041,
"step": 460
},
{
"epoch": 2.436819531507753,
"grad_norm": 0.2681710573971505,
"learning_rate": 4.865357519901026e-05,
"loss": 1.2184,
"step": 461
},
{
"epoch": 2.442098317387001,
"grad_norm": 0.2717324203460164,
"learning_rate": 4.850917787368409e-05,
"loss": 1.2287,
"step": 462
},
{
"epoch": 2.447377103266249,
"grad_norm": 0.34204178133936813,
"learning_rate": 4.8364664310124305e-05,
"loss": 1.2228,
"step": 463
},
{
"epoch": 2.4526558891454964,
"grad_norm": 0.30685100069464205,
"learning_rate": 4.822003648243481e-05,
"loss": 1.2188,
"step": 464
},
{
"epoch": 2.4579346750247444,
"grad_norm": 0.2732221315582542,
"learning_rate": 4.807529636628041e-05,
"loss": 1.2078,
"step": 465
},
{
"epoch": 2.463213460903992,
"grad_norm": 0.2848276134012873,
"learning_rate": 4.7930445938859824e-05,
"loss": 1.2146,
"step": 466
},
{
"epoch": 2.46849224678324,
"grad_norm": 0.23099153058477162,
"learning_rate": 4.778548717887862e-05,
"loss": 1.2144,
"step": 467
},
{
"epoch": 2.4737710326624875,
"grad_norm": 0.24410298456014018,
"learning_rate": 4.764042206652225e-05,
"loss": 1.2096,
"step": 468
},
{
"epoch": 2.4790498185417356,
"grad_norm": 0.23453508567964462,
"learning_rate": 4.7495252583429e-05,
"loss": 1.2229,
"step": 469
},
{
"epoch": 2.484328604420983,
"grad_norm": 0.2274160056940384,
"learning_rate": 4.734998071266282e-05,
"loss": 1.1957,
"step": 470
},
{
"epoch": 2.4896073903002307,
"grad_norm": 0.7222158250734677,
"learning_rate": 4.720460843868639e-05,
"loss": 1.212,
"step": 471
},
{
"epoch": 2.4948861761794787,
"grad_norm": 0.23319039438700873,
"learning_rate": 4.705913774733389e-05,
"loss": 1.2174,
"step": 472
},
{
"epoch": 2.5001649620587267,
"grad_norm": 0.20581558881463416,
"learning_rate": 4.6913570625783925e-05,
"loss": 1.2127,
"step": 473
},
{
"epoch": 2.5054437479379743,
"grad_norm": 0.580190640636312,
"learning_rate": 4.676790906253238e-05,
"loss": 1.2175,
"step": 474
},
{
"epoch": 2.510722533817222,
"grad_norm": 0.35859386128357623,
"learning_rate": 4.66221550473652e-05,
"loss": 1.2208,
"step": 475
},
{
"epoch": 2.51600131969647,
"grad_norm": 0.21572591452681666,
"learning_rate": 4.647631057133133e-05,
"loss": 1.2139,
"step": 476
},
{
"epoch": 2.5212801055757175,
"grad_norm": 0.3184926568808877,
"learning_rate": 4.633037762671536e-05,
"loss": 1.2147,
"step": 477
},
{
"epoch": 2.5265588914549655,
"grad_norm": 0.23798191993579182,
"learning_rate": 4.618435820701045e-05,
"loss": 1.2218,
"step": 478
},
{
"epoch": 2.531837677334213,
"grad_norm": 0.19775491535362455,
"learning_rate": 4.603825430689101e-05,
"loss": 1.2149,
"step": 479
},
{
"epoch": 2.5371164632134606,
"grad_norm": 0.2527202262533402,
"learning_rate": 4.589206792218551e-05,
"loss": 1.2303,
"step": 480
},
{
"epoch": 2.5423952490927086,
"grad_norm": 0.22950883586170756,
"learning_rate": 4.574580104984914e-05,
"loss": 1.213,
"step": 481
},
{
"epoch": 2.5476740349719567,
"grad_norm": 0.24370406115364615,
"learning_rate": 4.5599455687936605e-05,
"loss": 1.2063,
"step": 482
},
{
"epoch": 2.5529528208512042,
"grad_norm": 0.26261007853169177,
"learning_rate": 4.54530338355748e-05,
"loss": 1.2023,
"step": 483
},
{
"epoch": 2.558231606730452,
"grad_norm": 0.2014430995217622,
"learning_rate": 4.530653749293554e-05,
"loss": 1.2181,
"step": 484
},
{
"epoch": 2.5635103926097,
"grad_norm": 0.23150145338987912,
"learning_rate": 4.515996866120814e-05,
"loss": 1.2001,
"step": 485
},
{
"epoch": 2.5687891784889474,
"grad_norm": 0.24436976922792347,
"learning_rate": 4.501332934257217e-05,
"loss": 1.1965,
"step": 486
},
{
"epoch": 2.5740679643681954,
"grad_norm": 0.20891310725843007,
"learning_rate": 4.48666215401701e-05,
"loss": 1.2095,
"step": 487
},
{
"epoch": 2.579346750247443,
"grad_norm": 0.580497655713609,
"learning_rate": 4.471984725807987e-05,
"loss": 1.2028,
"step": 488
},
{
"epoch": 2.584625536126691,
"grad_norm": 0.2458630083617766,
"learning_rate": 4.457300850128757e-05,
"loss": 1.2314,
"step": 489
},
{
"epoch": 2.5899043220059386,
"grad_norm": 0.20029462571056664,
"learning_rate": 4.442610727566003e-05,
"loss": 1.1999,
"step": 490
},
{
"epoch": 2.5951831078851866,
"grad_norm": 0.1824564421862542,
"learning_rate": 4.427914558791747e-05,
"loss": 1.2128,
"step": 491
},
{
"epoch": 2.600461893764434,
"grad_norm": 0.22834088575096842,
"learning_rate": 4.4132125445605974e-05,
"loss": 1.1976,
"step": 492
},
{
"epoch": 2.6057406796436817,
"grad_norm": 0.19803337517555744,
"learning_rate": 4.3985048857070163e-05,
"loss": 1.2236,
"step": 493
},
{
"epoch": 2.6110194655229297,
"grad_norm": 0.23458909248075385,
"learning_rate": 4.383791783142576e-05,
"loss": 1.204,
"step": 494
},
{
"epoch": 2.6162982514021778,
"grad_norm": 0.21121300683483385,
"learning_rate": 4.369073437853208e-05,
"loss": 1.2117,
"step": 495
},
{
"epoch": 2.6215770372814253,
"grad_norm": 0.20128926160161514,
"learning_rate": 4.3543500508964636e-05,
"loss": 1.2041,
"step": 496
},
{
"epoch": 2.626855823160673,
"grad_norm": 0.21654654847139376,
"learning_rate": 4.339621823398762e-05,
"loss": 1.2228,
"step": 497
},
{
"epoch": 2.632134609039921,
"grad_norm": 0.17324746161391114,
"learning_rate": 4.32488895655265e-05,
"loss": 1.2106,
"step": 498
},
{
"epoch": 2.6374133949191685,
"grad_norm": 0.21573620879427124,
"learning_rate": 4.3101516516140466e-05,
"loss": 1.1987,
"step": 499
},
{
"epoch": 2.6426921807984165,
"grad_norm": 0.2470219196100934,
"learning_rate": 4.295410109899496e-05,
"loss": 1.2103,
"step": 500
},
{
"epoch": 2.647970966677664,
"grad_norm": 0.2350166861687524,
"learning_rate": 4.280664532783421e-05,
"loss": 1.2154,
"step": 501
},
{
"epoch": 2.6532497525569116,
"grad_norm": 0.20325354736823176,
"learning_rate": 4.265915121695368e-05,
"loss": 1.2146,
"step": 502
},
{
"epoch": 2.6585285384361597,
"grad_norm": 0.1651221750574765,
"learning_rate": 4.251162078117254e-05,
"loss": 1.2059,
"step": 503
},
{
"epoch": 2.6638073243154077,
"grad_norm": 0.2323971696245432,
"learning_rate": 4.236405603580622e-05,
"loss": 1.2124,
"step": 504
},
{
"epoch": 2.6690861101946552,
"grad_norm": 0.266884077772536,
"learning_rate": 4.22164589966388e-05,
"loss": 1.2068,
"step": 505
},
{
"epoch": 2.674364896073903,
"grad_norm": 0.17608948591217963,
"learning_rate": 4.206883167989551e-05,
"loss": 1.2183,
"step": 506
},
{
"epoch": 2.679643681953151,
"grad_norm": 0.24861673978443094,
"learning_rate": 4.1921176102215195e-05,
"loss": 1.2106,
"step": 507
},
{
"epoch": 2.6849224678323984,
"grad_norm": 0.20562099943439288,
"learning_rate": 4.1773494280622706e-05,
"loss": 1.2157,
"step": 508
},
{
"epoch": 2.6902012537116464,
"grad_norm": 0.1897513819316434,
"learning_rate": 4.1625788232501475e-05,
"loss": 1.2029,
"step": 509
},
{
"epoch": 2.695480039590894,
"grad_norm": 0.21765120587734574,
"learning_rate": 4.1478059975565806e-05,
"loss": 1.1957,
"step": 510
},
{
"epoch": 2.700758825470142,
"grad_norm": 0.17421718276141962,
"learning_rate": 4.13303115278334e-05,
"loss": 1.2126,
"step": 511
},
{
"epoch": 2.7060376113493896,
"grad_norm": 0.18994412403249641,
"learning_rate": 4.11825449075978e-05,
"loss": 1.2011,
"step": 512
},
{
"epoch": 2.7113163972286376,
"grad_norm": 0.213262900251563,
"learning_rate": 4.103476213340076e-05,
"loss": 1.1988,
"step": 513
},
{
"epoch": 2.716595183107885,
"grad_norm": 0.2193737588424257,
"learning_rate": 4.088696522400472e-05,
"loss": 1.1975,
"step": 514
},
{
"epoch": 2.7218739689871327,
"grad_norm": 0.18264279167008451,
"learning_rate": 4.07391561983652e-05,
"loss": 1.2135,
"step": 515
},
{
"epoch": 2.7271527548663808,
"grad_norm": 0.23335206923435833,
"learning_rate": 4.059133707560325e-05,
"loss": 1.2138,
"step": 516
},
{
"epoch": 2.7324315407456288,
"grad_norm": 0.25900257473153687,
"learning_rate": 4.04435098749778e-05,
"loss": 1.2107,
"step": 517
},
{
"epoch": 2.7377103266248763,
"grad_norm": 0.2103342998522433,
"learning_rate": 4.029567661585821e-05,
"loss": 1.206,
"step": 518
},
{
"epoch": 2.742989112504124,
"grad_norm": 0.17831077097654793,
"learning_rate": 4.014783931769652e-05,
"loss": 1.1986,
"step": 519
},
{
"epoch": 2.748267898383372,
"grad_norm": 0.20867309600721232,
"learning_rate": 4e-05,
"loss": 1.1943,
"step": 520
},
{
"epoch": 2.7535466842626195,
"grad_norm": 0.1537772415999436,
"learning_rate": 3.9852160682303486e-05,
"loss": 1.2091,
"step": 521
},
{
"epoch": 2.7588254701418675,
"grad_norm": 0.20801222179757986,
"learning_rate": 3.970432338414181e-05,
"loss": 1.2116,
"step": 522
},
{
"epoch": 2.764104256021115,
"grad_norm": 0.194411153494734,
"learning_rate": 3.955649012502221e-05,
"loss": 1.2015,
"step": 523
},
{
"epoch": 2.7693830419003627,
"grad_norm": 0.2605114580460575,
"learning_rate": 3.940866292439677e-05,
"loss": 1.1968,
"step": 524
},
{
"epoch": 2.7746618277796107,
"grad_norm": 0.1719623548499034,
"learning_rate": 3.926084380163481e-05,
"loss": 1.2121,
"step": 525
},
{
"epoch": 2.7799406136588587,
"grad_norm": 0.19539588740286956,
"learning_rate": 3.9113034775995285e-05,
"loss": 1.1957,
"step": 526
},
{
"epoch": 2.7852193995381063,
"grad_norm": 0.18647363107594456,
"learning_rate": 3.896523786659926e-05,
"loss": 1.2039,
"step": 527
},
{
"epoch": 2.790498185417354,
"grad_norm": 0.23992352538776165,
"learning_rate": 3.881745509240222e-05,
"loss": 1.2057,
"step": 528
},
{
"epoch": 2.795776971296602,
"grad_norm": 0.19235184099847877,
"learning_rate": 3.8669688472166604e-05,
"loss": 1.2053,
"step": 529
},
{
"epoch": 2.8010557571758494,
"grad_norm": 0.20150647808031436,
"learning_rate": 3.8521940024434214e-05,
"loss": 1.1979,
"step": 530
},
{
"epoch": 2.8063345430550974,
"grad_norm": 0.20818880270673554,
"learning_rate": 3.837421176749854e-05,
"loss": 1.1917,
"step": 531
},
{
"epoch": 2.811613328934345,
"grad_norm": 0.160286642176085,
"learning_rate": 3.822650571937729e-05,
"loss": 1.2062,
"step": 532
},
{
"epoch": 2.816892114813593,
"grad_norm": 0.2244956653582766,
"learning_rate": 3.807882389778483e-05,
"loss": 1.2149,
"step": 533
},
{
"epoch": 2.8221709006928406,
"grad_norm": 0.17765823360734992,
"learning_rate": 3.79311683201045e-05,
"loss": 1.2232,
"step": 534
},
{
"epoch": 2.8274496865720886,
"grad_norm": 0.2142393041003271,
"learning_rate": 3.7783541003361203e-05,
"loss": 1.2059,
"step": 535
},
{
"epoch": 2.832728472451336,
"grad_norm": 0.17229227174221823,
"learning_rate": 3.7635943964193786e-05,
"loss": 1.1881,
"step": 536
},
{
"epoch": 2.8380072583305838,
"grad_norm": 0.20443638338032005,
"learning_rate": 3.7488379218827466e-05,
"loss": 1.2006,
"step": 537
},
{
"epoch": 2.8432860442098318,
"grad_norm": 0.16509634715516444,
"learning_rate": 3.734084878304635e-05,
"loss": 1.2044,
"step": 538
},
{
"epoch": 2.8485648300890793,
"grad_norm": 0.19649714810146393,
"learning_rate": 3.7193354672165804e-05,
"loss": 1.1957,
"step": 539
},
{
"epoch": 2.8538436159683274,
"grad_norm": 0.18562259750029927,
"learning_rate": 3.7045898901005045e-05,
"loss": 1.2009,
"step": 540
},
{
"epoch": 2.859122401847575,
"grad_norm": 0.16283779745383173,
"learning_rate": 3.689848348385955e-05,
"loss": 1.2076,
"step": 541
},
{
"epoch": 2.864401187726823,
"grad_norm": 0.17467802811899544,
"learning_rate": 3.6751110434473504e-05,
"loss": 1.2054,
"step": 542
},
{
"epoch": 2.8696799736060705,
"grad_norm": 0.2124736329055944,
"learning_rate": 3.6603781766012374e-05,
"loss": 1.2064,
"step": 543
},
{
"epoch": 2.8749587594853185,
"grad_norm": 0.15324274860627765,
"learning_rate": 3.645649949103538e-05,
"loss": 1.2184,
"step": 544
},
{
"epoch": 2.880237545364566,
"grad_norm": 0.21566078265805713,
"learning_rate": 3.630926562146792e-05,
"loss": 1.2094,
"step": 545
},
{
"epoch": 2.8855163312438137,
"grad_norm": 0.18547268756739058,
"learning_rate": 3.616208216857424e-05,
"loss": 1.2151,
"step": 546
},
{
"epoch": 2.8907951171230617,
"grad_norm": 0.17370399818819313,
"learning_rate": 3.601495114292984e-05,
"loss": 1.2015,
"step": 547
},
{
"epoch": 2.8960739030023097,
"grad_norm": 0.1763097150756615,
"learning_rate": 3.586787455439403e-05,
"loss": 1.1834,
"step": 548
},
{
"epoch": 2.9013526888815573,
"grad_norm": 0.5161949943347345,
"learning_rate": 3.572085441208255e-05,
"loss": 1.2152,
"step": 549
},
{
"epoch": 2.906631474760805,
"grad_norm": 0.4758257977373414,
"learning_rate": 3.5573892724339974e-05,
"loss": 1.2213,
"step": 550
},
{
"epoch": 2.911910260640053,
"grad_norm": 0.20315764691238297,
"learning_rate": 3.542699149871245e-05,
"loss": 1.1998,
"step": 551
},
{
"epoch": 2.9171890465193004,
"grad_norm": 0.3185401826129705,
"learning_rate": 3.5280152741920146e-05,
"loss": 1.2057,
"step": 552
},
{
"epoch": 2.9224678323985485,
"grad_norm": 0.17732212942108816,
"learning_rate": 3.513337845982991e-05,
"loss": 1.2003,
"step": 553
},
{
"epoch": 2.927746618277796,
"grad_norm": 0.17655839868070522,
"learning_rate": 3.498667065742783e-05,
"loss": 1.2188,
"step": 554
},
{
"epoch": 2.9330254041570436,
"grad_norm": 0.17930609363824526,
"learning_rate": 3.484003133879188e-05,
"loss": 1.2178,
"step": 555
},
{
"epoch": 2.9383041900362916,
"grad_norm": 0.1771053499582412,
"learning_rate": 3.4693462507064475e-05,
"loss": 1.1851,
"step": 556
},
{
"epoch": 2.9435829759155396,
"grad_norm": 0.19233889091342063,
"learning_rate": 3.4546966164425196e-05,
"loss": 1.2282,
"step": 557
},
{
"epoch": 2.948861761794787,
"grad_norm": 0.2019800309492491,
"learning_rate": 3.440054431206341e-05,
"loss": 1.2272,
"step": 558
},
{
"epoch": 2.954140547674035,
"grad_norm": 0.18880582680263816,
"learning_rate": 3.4254198950150876e-05,
"loss": 1.2165,
"step": 559
},
{
"epoch": 2.959419333553283,
"grad_norm": 0.20209531295455146,
"learning_rate": 3.41079320778145e-05,
"loss": 1.1977,
"step": 560
},
{
"epoch": 2.9646981194325304,
"grad_norm": 0.1818502465064666,
"learning_rate": 3.3961745693108995e-05,
"loss": 1.1977,
"step": 561
},
{
"epoch": 2.9699769053117784,
"grad_norm": 0.20013609861537826,
"learning_rate": 3.3815641792989556e-05,
"loss": 1.2009,
"step": 562
},
{
"epoch": 2.975255691191026,
"grad_norm": 0.19474968068343523,
"learning_rate": 3.366962237328465e-05,
"loss": 1.1932,
"step": 563
},
{
"epoch": 2.980534477070274,
"grad_norm": 0.16811423192887717,
"learning_rate": 3.3523689428668686e-05,
"loss": 1.216,
"step": 564
},
{
"epoch": 2.9858132629495215,
"grad_norm": 0.20904143259971103,
"learning_rate": 3.33778449526348e-05,
"loss": 1.2044,
"step": 565
},
{
"epoch": 2.9910920488287696,
"grad_norm": 0.15694602056779824,
"learning_rate": 3.323209093746764e-05,
"loss": 1.2036,
"step": 566
},
{
"epoch": 2.996370834708017,
"grad_norm": 0.19731725001456019,
"learning_rate": 3.308642937421609e-05,
"loss": 1.2145,
"step": 567
},
{
"epoch": 3.00329924117453,
"grad_norm": 0.39640254269304404,
"learning_rate": 3.294086225266612e-05,
"loss": 2.2691,
"step": 568
},
{
"epoch": 3.0085780270537774,
"grad_norm": 0.35149949016057497,
"learning_rate": 3.279539156131362e-05,
"loss": 1.1908,
"step": 569
},
{
"epoch": 3.0138568129330254,
"grad_norm": 0.25800980108361316,
"learning_rate": 3.2650019287337184e-05,
"loss": 1.1863,
"step": 570
},
{
"epoch": 3.019135598812273,
"grad_norm": 0.29271578035617773,
"learning_rate": 3.250474741657101e-05,
"loss": 1.1733,
"step": 571
},
{
"epoch": 3.024414384691521,
"grad_norm": 0.2889155002761209,
"learning_rate": 3.235957793347776e-05,
"loss": 1.195,
"step": 572
},
{
"epoch": 3.0296931705707686,
"grad_norm": 0.2673002288345648,
"learning_rate": 3.221451282112139e-05,
"loss": 1.1857,
"step": 573
},
{
"epoch": 3.0349719564500166,
"grad_norm": 0.2427413851962326,
"learning_rate": 3.2069554061140196e-05,
"loss": 1.1815,
"step": 574
},
{
"epoch": 3.040250742329264,
"grad_norm": 0.27248962898507717,
"learning_rate": 3.19247036337196e-05,
"loss": 1.1735,
"step": 575
},
{
"epoch": 3.045529528208512,
"grad_norm": 0.28142619295211896,
"learning_rate": 3.177996351756521e-05,
"loss": 1.1672,
"step": 576
},
{
"epoch": 3.0508083140877598,
"grad_norm": 0.23654685125778924,
"learning_rate": 3.1635335689875716e-05,
"loss": 1.1947,
"step": 577
},
{
"epoch": 3.056087099967008,
"grad_norm": 0.3060272236601635,
"learning_rate": 3.149082212631592e-05,
"loss": 1.1669,
"step": 578
},
{
"epoch": 3.0613658858462554,
"grad_norm": 0.1990490966508174,
"learning_rate": 3.134642480098975e-05,
"loss": 1.1734,
"step": 579
},
{
"epoch": 3.066644671725503,
"grad_norm": 0.23725195603837385,
"learning_rate": 3.120214568641327e-05,
"loss": 1.1846,
"step": 580
},
{
"epoch": 3.071923457604751,
"grad_norm": 1.7515631467902577,
"learning_rate": 3.105798675348772e-05,
"loss": 1.2022,
"step": 581
},
{
"epoch": 3.0772022434839985,
"grad_norm": 0.23405760507735043,
"learning_rate": 3.0913949971472654e-05,
"loss": 1.1848,
"step": 582
},
{
"epoch": 3.0824810293632465,
"grad_norm": 0.2536321974582141,
"learning_rate": 3.0770037307959014e-05,
"loss": 1.1856,
"step": 583
},
{
"epoch": 3.087759815242494,
"grad_norm": 0.24687742855673658,
"learning_rate": 3.062625072884218e-05,
"loss": 1.182,
"step": 584
},
{
"epoch": 3.093038601121742,
"grad_norm": 0.26309550113541097,
"learning_rate": 3.048259219829526e-05,
"loss": 1.1717,
"step": 585
},
{
"epoch": 3.0983173870009897,
"grad_norm": 0.26335631307371027,
"learning_rate": 3.033906367874209e-05,
"loss": 1.175,
"step": 586
},
{
"epoch": 3.1035961728802377,
"grad_norm": 0.19703497978014536,
"learning_rate": 3.019566713083057e-05,
"loss": 1.1885,
"step": 587
},
{
"epoch": 3.1088749587594853,
"grad_norm": 0.5752260315715897,
"learning_rate": 3.0052404513405817e-05,
"loss": 1.205,
"step": 588
},
{
"epoch": 3.1141537446387333,
"grad_norm": 0.22038192362400044,
"learning_rate": 2.990927778348338e-05,
"loss": 1.1938,
"step": 589
},
{
"epoch": 3.119432530517981,
"grad_norm": 0.21601092261632623,
"learning_rate": 2.9766288896222577e-05,
"loss": 1.1754,
"step": 590
},
{
"epoch": 3.1247113163972284,
"grad_norm": 0.2237994485232262,
"learning_rate": 2.9623439804899738e-05,
"loss": 1.1753,
"step": 591
},
{
"epoch": 3.1299901022764764,
"grad_norm": 0.19731985592374132,
"learning_rate": 2.9480732460881504e-05,
"loss": 1.1695,
"step": 592
},
{
"epoch": 3.135268888155724,
"grad_norm": 0.49723859022777406,
"learning_rate": 2.9338168813598238e-05,
"loss": 1.2003,
"step": 593
},
{
"epoch": 3.140547674034972,
"grad_norm": 0.17077112936798963,
"learning_rate": 2.9195750810517353e-05,
"loss": 1.1538,
"step": 594
},
{
"epoch": 3.1458264599142196,
"grad_norm": 0.31811164602044134,
"learning_rate": 2.905348039711669e-05,
"loss": 1.2078,
"step": 595
},
{
"epoch": 3.1511052457934676,
"grad_norm": 0.18318775715519817,
"learning_rate": 2.891135951685799e-05,
"loss": 1.1747,
"step": 596
},
{
"epoch": 3.156384031672715,
"grad_norm": 0.19307423783474406,
"learning_rate": 2.8769390111160293e-05,
"loss": 1.1872,
"step": 597
},
{
"epoch": 3.161662817551963,
"grad_norm": 0.19217084719934985,
"learning_rate": 2.862757411937347e-05,
"loss": 1.1904,
"step": 598
},
{
"epoch": 3.166941603431211,
"grad_norm": 0.2033421815267501,
"learning_rate": 2.8485913478751706e-05,
"loss": 1.1795,
"step": 599
},
{
"epoch": 3.172220389310459,
"grad_norm": 0.1883185538491291,
"learning_rate": 2.834441012442702e-05,
"loss": 1.1803,
"step": 600
},
{
"epoch": 3.1774991751897064,
"grad_norm": 0.19166460775924493,
"learning_rate": 2.8203065989382853e-05,
"loss": 1.1751,
"step": 601
},
{
"epoch": 3.182777961068954,
"grad_norm": 0.6048917980927118,
"learning_rate": 2.8061883004427692e-05,
"loss": 1.1751,
"step": 602
},
{
"epoch": 3.188056746948202,
"grad_norm": 0.21052892689634514,
"learning_rate": 2.792086309816859e-05,
"loss": 1.197,
"step": 603
},
{
"epoch": 3.1933355328274495,
"grad_norm": 0.17247927939199434,
"learning_rate": 2.778000819698494e-05,
"loss": 1.1852,
"step": 604
},
{
"epoch": 3.1986143187066975,
"grad_norm": 0.21702246989530297,
"learning_rate": 2.7639320225002108e-05,
"loss": 1.1838,
"step": 605
},
{
"epoch": 3.203893104585945,
"grad_norm": 0.18519967189771228,
"learning_rate": 2.7498801104065127e-05,
"loss": 1.18,
"step": 606
},
{
"epoch": 3.209171890465193,
"grad_norm": 0.2180595120663347,
"learning_rate": 2.7358452753712506e-05,
"loss": 1.1763,
"step": 607
},
{
"epoch": 3.2144506763444407,
"grad_norm": 0.19356701655435976,
"learning_rate": 2.721827709114992e-05,
"loss": 1.1718,
"step": 608
},
{
"epoch": 3.2197294622236887,
"grad_norm": 0.2263137728095583,
"learning_rate": 2.707827603122411e-05,
"loss": 1.188,
"step": 609
},
{
"epoch": 3.2250082481029363,
"grad_norm": 0.1875965654825016,
"learning_rate": 2.6938451486396675e-05,
"loss": 1.1736,
"step": 610
},
{
"epoch": 3.230287033982184,
"grad_norm": 0.19885738913391388,
"learning_rate": 2.679880536671795e-05,
"loss": 1.1787,
"step": 611
},
{
"epoch": 3.235565819861432,
"grad_norm": 0.16754129647748367,
"learning_rate": 2.6659339579800928e-05,
"loss": 1.1866,
"step": 612
},
{
"epoch": 3.2408446057406795,
"grad_norm": 0.18478285151592688,
"learning_rate": 2.6520056030795225e-05,
"loss": 1.1925,
"step": 613
},
{
"epoch": 3.2461233916199275,
"grad_norm": 0.1741366266971174,
"learning_rate": 2.6380956622360955e-05,
"loss": 1.1761,
"step": 614
},
{
"epoch": 3.251402177499175,
"grad_norm": 0.205724397210717,
"learning_rate": 2.6242043254642876e-05,
"loss": 1.1781,
"step": 615
},
{
"epoch": 3.256680963378423,
"grad_norm": 0.1635965859364289,
"learning_rate": 2.6103317825244347e-05,
"loss": 1.1787,
"step": 616
},
{
"epoch": 3.2619597492576706,
"grad_norm": 0.18322293170555132,
"learning_rate": 2.596478222920141e-05,
"loss": 1.1707,
"step": 617
},
{
"epoch": 3.2672385351369186,
"grad_norm": 0.21024379892510617,
"learning_rate": 2.582643835895696e-05,
"loss": 1.1849,
"step": 618
},
{
"epoch": 3.272517321016166,
"grad_norm": 0.16828870708008303,
"learning_rate": 2.5688288104334787e-05,
"loss": 1.1757,
"step": 619
},
{
"epoch": 3.2777961068954142,
"grad_norm": 0.1846842052479709,
"learning_rate": 2.5550333352513885e-05,
"loss": 1.1879,
"step": 620
},
{
"epoch": 3.283074892774662,
"grad_norm": 0.19687585348678469,
"learning_rate": 2.54125759880026e-05,
"loss": 1.1816,
"step": 621
},
{
"epoch": 3.28835367865391,
"grad_norm": 0.1758961182832862,
"learning_rate": 2.5275017892612885e-05,
"loss": 1.1911,
"step": 622
},
{
"epoch": 3.2936324645331574,
"grad_norm": 0.18792489795821068,
"learning_rate": 2.5137660945434617e-05,
"loss": 1.188,
"step": 623
},
{
"epoch": 3.298911250412405,
"grad_norm": 0.16028174516582183,
"learning_rate": 2.500050702280995e-05,
"loss": 1.1861,
"step": 624
},
{
"epoch": 3.304190036291653,
"grad_norm": 0.25493998515543115,
"learning_rate": 2.4863557998307593e-05,
"loss": 1.2039,
"step": 625
},
{
"epoch": 3.3094688221709005,
"grad_norm": 0.1702317701841697,
"learning_rate": 2.4726815742697326e-05,
"loss": 1.1653,
"step": 626
},
{
"epoch": 3.3147476080501486,
"grad_norm": 0.1811300435479535,
"learning_rate": 2.4590282123924398e-05,
"loss": 1.1911,
"step": 627
},
{
"epoch": 3.320026393929396,
"grad_norm": 0.16020792712846268,
"learning_rate": 2.4453959007083968e-05,
"loss": 1.1786,
"step": 628
},
{
"epoch": 3.325305179808644,
"grad_norm": 0.17703804764351366,
"learning_rate": 2.4317848254395698e-05,
"loss": 1.1874,
"step": 629
},
{
"epoch": 3.3305839656878917,
"grad_norm": 0.17309284980916728,
"learning_rate": 2.4181951725178302e-05,
"loss": 1.1695,
"step": 630
},
{
"epoch": 3.3358627515671397,
"grad_norm": 0.1620501421501924,
"learning_rate": 2.4046271275824083e-05,
"loss": 1.2024,
"step": 631
},
{
"epoch": 3.3411415374463873,
"grad_norm": 0.28467017075140105,
"learning_rate": 2.3910808759773666e-05,
"loss": 1.173,
"step": 632
},
{
"epoch": 3.346420323325635,
"grad_norm": 0.16148158903613652,
"learning_rate": 2.3775566027490583e-05,
"loss": 1.1731,
"step": 633
},
{
"epoch": 3.351699109204883,
"grad_norm": 0.20024202970675797,
"learning_rate": 2.364054492643608e-05,
"loss": 1.1826,
"step": 634
},
{
"epoch": 3.3569778950841305,
"grad_norm": 0.15280584791983493,
"learning_rate": 2.3505747301043867e-05,
"loss": 1.1849,
"step": 635
},
{
"epoch": 3.3622566809633785,
"grad_norm": 0.20221936488778078,
"learning_rate": 2.3371174992694848e-05,
"loss": 1.1965,
"step": 636
},
{
"epoch": 3.367535466842626,
"grad_norm": 0.4791617718328374,
"learning_rate": 2.3236829839692065e-05,
"loss": 1.1957,
"step": 637
},
{
"epoch": 3.372814252721874,
"grad_norm": 0.18149628325520292,
"learning_rate": 2.310271367723551e-05,
"loss": 1.1731,
"step": 638
},
{
"epoch": 3.3780930386011216,
"grad_norm": 0.18733774960030386,
"learning_rate": 2.2968828337397095e-05,
"loss": 1.1705,
"step": 639
},
{
"epoch": 3.3833718244803697,
"grad_norm": 0.16505883561842968,
"learning_rate": 2.2835175649095645e-05,
"loss": 1.1827,
"step": 640
},
{
"epoch": 3.3886506103596172,
"grad_norm": 0.16387587478009516,
"learning_rate": 2.270175743807185e-05,
"loss": 1.1842,
"step": 641
},
{
"epoch": 3.3939293962388652,
"grad_norm": 0.6763913842218626,
"learning_rate": 2.2568575526863385e-05,
"loss": 1.1761,
"step": 642
},
{
"epoch": 3.399208182118113,
"grad_norm": 0.20445761019330047,
"learning_rate": 2.2435631734779974e-05,
"loss": 1.1868,
"step": 643
},
{
"epoch": 3.404486967997361,
"grad_norm": 0.15713802524158743,
"learning_rate": 2.2302927877878543e-05,
"loss": 1.193,
"step": 644
},
{
"epoch": 3.4097657538766084,
"grad_norm": 0.1626598876968337,
"learning_rate": 2.2170465768938473e-05,
"loss": 1.1629,
"step": 645
},
{
"epoch": 3.415044539755856,
"grad_norm": 0.15630065852416064,
"learning_rate": 2.203824721743674e-05,
"loss": 1.1835,
"step": 646
},
{
"epoch": 3.420323325635104,
"grad_norm": 0.16627965905634337,
"learning_rate": 2.1906274029523262e-05,
"loss": 1.1932,
"step": 647
},
{
"epoch": 3.4256021115143516,
"grad_norm": 0.3853688887851438,
"learning_rate": 2.177454800799618e-05,
"loss": 1.1788,
"step": 648
},
{
"epoch": 3.4308808973935996,
"grad_norm": 0.1710965211235484,
"learning_rate": 2.1643070952277314e-05,
"loss": 1.1837,
"step": 649
},
{
"epoch": 3.436159683272847,
"grad_norm": 0.16418437492334867,
"learning_rate": 2.1511844658387478e-05,
"loss": 1.1825,
"step": 650
},
{
"epoch": 3.441438469152095,
"grad_norm": 0.16951121532514787,
"learning_rate": 2.1380870918922004e-05,
"loss": 1.1799,
"step": 651
},
{
"epoch": 3.4467172550313427,
"grad_norm": 0.1693028978568346,
"learning_rate": 2.1250151523026295e-05,
"loss": 1.1651,
"step": 652
},
{
"epoch": 3.4519960409105908,
"grad_norm": 0.16242147126253437,
"learning_rate": 2.1119688256371233e-05,
"loss": 1.183,
"step": 653
},
{
"epoch": 3.4572748267898383,
"grad_norm": 0.29611972498303013,
"learning_rate": 2.0989482901128998e-05,
"loss": 1.1888,
"step": 654
},
{
"epoch": 3.462553612669086,
"grad_norm": 0.16236945816797332,
"learning_rate": 2.0859537235948543e-05,
"loss": 1.1779,
"step": 655
},
{
"epoch": 3.467832398548334,
"grad_norm": 0.1586147334948237,
"learning_rate": 2.0729853035931386e-05,
"loss": 1.1779,
"step": 656
},
{
"epoch": 3.4731111844275815,
"grad_norm": 0.18364011968994423,
"learning_rate": 2.060043207260738e-05,
"loss": 1.1814,
"step": 657
},
{
"epoch": 3.4783899703068295,
"grad_norm": 0.18650200564424077,
"learning_rate": 2.0471276113910383e-05,
"loss": 1.1796,
"step": 658
},
{
"epoch": 3.483668756186077,
"grad_norm": 0.19826495336352215,
"learning_rate": 2.0342386924154313e-05,
"loss": 1.1842,
"step": 659
},
{
"epoch": 3.488947542065325,
"grad_norm": 0.17236160815821988,
"learning_rate": 2.0213766264008857e-05,
"loss": 1.1809,
"step": 660
},
{
"epoch": 3.4942263279445727,
"grad_norm": 0.23601297199302493,
"learning_rate": 2.008541589047551e-05,
"loss": 1.1898,
"step": 661
},
{
"epoch": 3.4995051138238207,
"grad_norm": 0.17149615755966166,
"learning_rate": 1.9957337556863604e-05,
"loss": 1.1809,
"step": 662
},
{
"epoch": 3.5047838997030683,
"grad_norm": 0.20446720126414816,
"learning_rate": 1.9829533012766268e-05,
"loss": 1.1849,
"step": 663
},
{
"epoch": 3.510062685582316,
"grad_norm": 0.1883725541194937,
"learning_rate": 1.9702004004036583e-05,
"loss": 1.1724,
"step": 664
},
{
"epoch": 3.515341471461564,
"grad_norm": 0.1616721588144235,
"learning_rate": 1.957475227276373e-05,
"loss": 1.1779,
"step": 665
},
{
"epoch": 3.520620257340812,
"grad_norm": 0.16929488246463403,
"learning_rate": 1.9447779557249183e-05,
"loss": 1.1854,
"step": 666
},
{
"epoch": 3.5258990432200594,
"grad_norm": 0.16292016568868656,
"learning_rate": 1.9321087591982987e-05,
"loss": 1.1861,
"step": 667
},
{
"epoch": 3.531177829099307,
"grad_norm": 0.17403946253846359,
"learning_rate": 1.9194678107620013e-05,
"loss": 1.1805,
"step": 668
},
{
"epoch": 3.536456614978555,
"grad_norm": 0.1694106848076632,
"learning_rate": 1.906855283095637e-05,
"loss": 1.1856,
"step": 669
},
{
"epoch": 3.5417354008578026,
"grad_norm": 0.1820904220829124,
"learning_rate": 1.8942713484905762e-05,
"loss": 1.1727,
"step": 670
},
{
"epoch": 3.5470141867370506,
"grad_norm": 0.16909152503020708,
"learning_rate": 1.8817161788476052e-05,
"loss": 1.1884,
"step": 671
},
{
"epoch": 3.552292972616298,
"grad_norm": 0.15357737056906967,
"learning_rate": 1.869189945674564e-05,
"loss": 1.1894,
"step": 672
},
{
"epoch": 3.557571758495546,
"grad_norm": 0.15273867858520462,
"learning_rate": 1.8566928200840128e-05,
"loss": 1.1723,
"step": 673
},
{
"epoch": 3.5628505443747938,
"grad_norm": 0.30895988367256033,
"learning_rate": 1.8442249727908973e-05,
"loss": 1.1982,
"step": 674
},
{
"epoch": 3.5681293302540418,
"grad_norm": 0.14637005365137004,
"learning_rate": 1.8317865741102025e-05,
"loss": 1.1769,
"step": 675
},
{
"epoch": 3.5734081161332893,
"grad_norm": 0.15688995507200096,
"learning_rate": 1.819377793954646e-05,
"loss": 1.1877,
"step": 676
},
{
"epoch": 3.578686902012537,
"grad_norm": 0.15514659519283094,
"learning_rate": 1.806998801832335e-05,
"loss": 1.1745,
"step": 677
},
{
"epoch": 3.583965687891785,
"grad_norm": 0.15960023980743218,
"learning_rate": 1.7946497668444717e-05,
"loss": 1.1918,
"step": 678
},
{
"epoch": 3.5892444737710325,
"grad_norm": 0.16074105620670257,
"learning_rate": 1.7823308576830326e-05,
"loss": 1.1827,
"step": 679
},
{
"epoch": 3.5945232596502805,
"grad_norm": 0.17341000855364072,
"learning_rate": 1.770042242628458e-05,
"loss": 1.1928,
"step": 680
},
{
"epoch": 3.599802045529528,
"grad_norm": 0.19117031283641508,
"learning_rate": 1.7577840895473687e-05,
"loss": 1.1847,
"step": 681
},
{
"epoch": 3.605080831408776,
"grad_norm": 0.14631791720740261,
"learning_rate": 1.7455565658902603e-05,
"loss": 1.1919,
"step": 682
},
{
"epoch": 3.6103596172880237,
"grad_norm": 0.19178297155978408,
"learning_rate": 1.733359838689222e-05,
"loss": 1.1764,
"step": 683
},
{
"epoch": 3.6156384031672717,
"grad_norm": 0.1736295590236363,
"learning_rate": 1.7211940745556496e-05,
"loss": 1.1768,
"step": 684
},
{
"epoch": 3.6209171890465193,
"grad_norm": 0.1841012098697114,
"learning_rate": 1.7090594396779793e-05,
"loss": 1.1762,
"step": 685
},
{
"epoch": 3.626195974925767,
"grad_norm": 0.17136791598501644,
"learning_rate": 1.6969560998194062e-05,
"loss": 1.1852,
"step": 686
},
{
"epoch": 3.631474760805015,
"grad_norm": 0.17325004058389215,
"learning_rate": 1.6848842203156267e-05,
"loss": 1.1794,
"step": 687
},
{
"epoch": 3.636753546684263,
"grad_norm": 0.19438978043428873,
"learning_rate": 1.6728439660725768e-05,
"loss": 1.1836,
"step": 688
},
{
"epoch": 3.6420323325635104,
"grad_norm": 0.15890915637153022,
"learning_rate": 1.6608355015641807e-05,
"loss": 1.1777,
"step": 689
},
{
"epoch": 3.647311118442758,
"grad_norm": 0.23571007165732896,
"learning_rate": 1.648858990830108e-05,
"loss": 1.1803,
"step": 690
},
{
"epoch": 3.652589904322006,
"grad_norm": 0.18886233467603014,
"learning_rate": 1.636914597473525e-05,
"loss": 1.1921,
"step": 691
},
{
"epoch": 3.6578686902012536,
"grad_norm": 0.15020457052587974,
"learning_rate": 1.6250024846588632e-05,
"loss": 1.1768,
"step": 692
},
{
"epoch": 3.6631474760805016,
"grad_norm": 0.19794031747921945,
"learning_rate": 1.6131228151095976e-05,
"loss": 1.1975,
"step": 693
},
{
"epoch": 3.668426261959749,
"grad_norm": 0.17700601859911672,
"learning_rate": 1.6012757511060062e-05,
"loss": 1.1746,
"step": 694
},
{
"epoch": 3.6737050478389968,
"grad_norm": 0.15666164343925118,
"learning_rate": 1.589461454482975e-05,
"loss": 1.1679,
"step": 695
},
{
"epoch": 3.678983833718245,
"grad_norm": 0.1799147716866367,
"learning_rate": 1.577680086627771e-05,
"loss": 1.1829,
"step": 696
},
{
"epoch": 3.684262619597493,
"grad_norm": 0.15622510538635312,
"learning_rate": 1.5659318084778427e-05,
"loss": 1.1712,
"step": 697
},
{
"epoch": 3.6895414054767404,
"grad_norm": 0.1369739991909485,
"learning_rate": 1.5542167805186262e-05,
"loss": 1.1855,
"step": 698
},
{
"epoch": 3.694820191355988,
"grad_norm": 0.16493354365350582,
"learning_rate": 1.542535162781345e-05,
"loss": 1.1756,
"step": 699
},
{
"epoch": 3.700098977235236,
"grad_norm": 0.16258714311932046,
"learning_rate": 1.530887114840829e-05,
"loss": 1.1836,
"step": 700
},
{
"epoch": 3.7053777631144835,
"grad_norm": 0.14824689112607964,
"learning_rate": 1.5192727958133336e-05,
"loss": 1.1858,
"step": 701
},
{
"epoch": 3.7106565489937315,
"grad_norm": 0.17024560483432752,
"learning_rate": 1.5076923643543637e-05,
"loss": 1.191,
"step": 702
},
{
"epoch": 3.715935334872979,
"grad_norm": 0.2937433238239512,
"learning_rate": 1.4961459786565136e-05,
"loss": 1.2079,
"step": 703
},
{
"epoch": 3.721214120752227,
"grad_norm": 0.14683060389697267,
"learning_rate": 1.4846337964472973e-05,
"loss": 1.1549,
"step": 704
},
{
"epoch": 3.7264929066314747,
"grad_norm": 0.1736075539679898,
"learning_rate": 1.4731559749869973e-05,
"loss": 1.1772,
"step": 705
},
{
"epoch": 3.7317716925107227,
"grad_norm": 0.15781844683554935,
"learning_rate": 1.4617126710665166e-05,
"loss": 1.1967,
"step": 706
},
{
"epoch": 3.7370504783899703,
"grad_norm": 0.18024181822406263,
"learning_rate": 1.4503040410052412e-05,
"loss": 1.1763,
"step": 707
},
{
"epoch": 3.742329264269218,
"grad_norm": 0.18126802231132821,
"learning_rate": 1.438930240648896e-05,
"loss": 1.1799,
"step": 708
},
{
"epoch": 3.747608050148466,
"grad_norm": 1.030705083825917,
"learning_rate": 1.4275914253674206e-05,
"loss": 1.1889,
"step": 709
},
{
"epoch": 3.752886836027714,
"grad_norm": 0.16620977435780646,
"learning_rate": 1.4162877500528516e-05,
"loss": 1.1748,
"step": 710
},
{
"epoch": 3.7581656219069615,
"grad_norm": 0.2798778927760511,
"learning_rate": 1.4050193691171931e-05,
"loss": 1.1849,
"step": 711
},
{
"epoch": 3.763444407786209,
"grad_norm": 0.14293138521061613,
"learning_rate": 1.3937864364903253e-05,
"loss": 1.1765,
"step": 712
},
{
"epoch": 3.768723193665457,
"grad_norm": 0.16417271654966648,
"learning_rate": 1.3825891056178874e-05,
"loss": 1.1722,
"step": 713
},
{
"epoch": 3.7740019795447046,
"grad_norm": 0.1311111754910553,
"learning_rate": 1.3714275294591852e-05,
"loss": 1.1888,
"step": 714
},
{
"epoch": 3.7792807654239526,
"grad_norm": 0.14306901078476733,
"learning_rate": 1.3603018604851106e-05,
"loss": 1.1645,
"step": 715
},
{
"epoch": 3.7845595513032,
"grad_norm": 0.15620863897436546,
"learning_rate": 1.349212250676041e-05,
"loss": 1.1753,
"step": 716
},
{
"epoch": 3.789838337182448,
"grad_norm": 0.14152193009692798,
"learning_rate": 1.3381588515197818e-05,
"loss": 1.1826,
"step": 717
},
{
"epoch": 3.795117123061696,
"grad_norm": 0.19538067948218324,
"learning_rate": 1.3271418140094854e-05,
"loss": 1.1796,
"step": 718
},
{
"epoch": 3.800395908940944,
"grad_norm": 0.1442409305088691,
"learning_rate": 1.3161612886415904e-05,
"loss": 1.1869,
"step": 719
},
{
"epoch": 3.8056746948201914,
"grad_norm": 0.13512286814960517,
"learning_rate": 1.3052174254137713e-05,
"loss": 1.1915,
"step": 720
},
{
"epoch": 3.810953480699439,
"grad_norm": 0.14311317112625468,
"learning_rate": 1.2943103738228815e-05,
"loss": 1.1924,
"step": 721
},
{
"epoch": 3.816232266578687,
"grad_norm": 0.1307906860768968,
"learning_rate": 1.2834402828629177e-05,
"loss": 1.1834,
"step": 722
},
{
"epoch": 3.8215110524579345,
"grad_norm": 0.13454071601034975,
"learning_rate": 1.2726073010229798e-05,
"loss": 1.1678,
"step": 723
},
{
"epoch": 3.8267898383371826,
"grad_norm": 0.14555974766872148,
"learning_rate": 1.2618115762852451e-05,
"loss": 1.1943,
"step": 724
},
{
"epoch": 3.83206862421643,
"grad_norm": 0.18463072722535032,
"learning_rate": 1.2510532561229493e-05,
"loss": 1.1805,
"step": 725
},
{
"epoch": 3.837347410095678,
"grad_norm": 0.17705683325025137,
"learning_rate": 1.2403324874983653e-05,
"loss": 1.1795,
"step": 726
},
{
"epoch": 3.8426261959749257,
"grad_norm": 0.14977271526880837,
"learning_rate": 1.2296494168608004e-05,
"loss": 1.1861,
"step": 727
},
{
"epoch": 3.8479049818541737,
"grad_norm": 0.17767233652373313,
"learning_rate": 1.2190041901445948e-05,
"loss": 1.1804,
"step": 728
},
{
"epoch": 3.8531837677334213,
"grad_norm": 0.14698313922467643,
"learning_rate": 1.2083969527671294e-05,
"loss": 1.1863,
"step": 729
},
{
"epoch": 3.858462553612669,
"grad_norm": 0.15375271230113852,
"learning_rate": 1.1978278496268362e-05,
"loss": 1.1936,
"step": 730
},
{
"epoch": 3.863741339491917,
"grad_norm": 0.15069392504511053,
"learning_rate": 1.1872970251012204e-05,
"loss": 1.1794,
"step": 731
},
{
"epoch": 3.869020125371165,
"grad_norm": 0.15663922512201614,
"learning_rate": 1.1768046230448924e-05,
"loss": 1.1761,
"step": 732
},
{
"epoch": 3.8742989112504125,
"grad_norm": 0.1398727690424049,
"learning_rate": 1.1663507867875911e-05,
"loss": 1.1925,
"step": 733
},
{
"epoch": 3.87957769712966,
"grad_norm": 0.129891178182677,
"learning_rate": 1.1559356591322426e-05,
"loss": 1.1689,
"step": 734
},
{
"epoch": 3.884856483008908,
"grad_norm": 0.16126609799676608,
"learning_rate": 1.145559382352991e-05,
"loss": 1.1663,
"step": 735
},
{
"epoch": 3.8901352688881556,
"grad_norm": 0.72353456639229,
"learning_rate": 1.1352220981932738e-05,
"loss": 1.1898,
"step": 736
},
{
"epoch": 3.8954140547674037,
"grad_norm": 0.15606017767964075,
"learning_rate": 1.12492394786387e-05,
"loss": 1.1899,
"step": 737
},
{
"epoch": 3.9006928406466512,
"grad_norm": 0.1416594211440069,
"learning_rate": 1.1146650720409781e-05,
"loss": 1.1853,
"step": 738
},
{
"epoch": 3.905971626525899,
"grad_norm": 0.13915107172987345,
"learning_rate": 1.1044456108642967e-05,
"loss": 1.1917,
"step": 739
},
{
"epoch": 3.911250412405147,
"grad_norm": 0.12781574255133024,
"learning_rate": 1.0942657039351042e-05,
"loss": 1.1725,
"step": 740
},
{
"epoch": 3.916529198284395,
"grad_norm": 0.14766652542576716,
"learning_rate": 1.0841254903143547e-05,
"loss": 1.1967,
"step": 741
},
{
"epoch": 3.9218079841636424,
"grad_norm": 0.26120047464098534,
"learning_rate": 1.0740251085207785e-05,
"loss": 1.1785,
"step": 742
},
{
"epoch": 3.92708677004289,
"grad_norm": 0.14335573482650907,
"learning_rate": 1.0639646965289927e-05,
"loss": 1.1817,
"step": 743
},
{
"epoch": 3.932365555922138,
"grad_norm": 0.14772540528742029,
"learning_rate": 1.0539443917676092e-05,
"loss": 1.1754,
"step": 744
},
{
"epoch": 3.9376443418013856,
"grad_norm": 0.1424170141242611,
"learning_rate": 1.0439643311173642e-05,
"loss": 1.173,
"step": 745
},
{
"epoch": 3.9429231276806336,
"grad_norm": 0.7072515085588025,
"learning_rate": 1.0340246509092448e-05,
"loss": 1.21,
"step": 746
},
{
"epoch": 3.948201913559881,
"grad_norm": 0.18752526913379627,
"learning_rate": 1.024125486922627e-05,
"loss": 1.1796,
"step": 747
},
{
"epoch": 3.953480699439129,
"grad_norm": 0.14084279259329482,
"learning_rate": 1.0142669743834243e-05,
"loss": 1.1747,
"step": 748
},
{
"epoch": 3.9587594853183767,
"grad_norm": 0.18116846121283872,
"learning_rate": 1.0044492479622359e-05,
"loss": 1.1626,
"step": 749
},
{
"epoch": 3.9640382711976248,
"grad_norm": 0.1362638764334788,
"learning_rate": 9.946724417725067e-06,
"loss": 1.1958,
"step": 750
},
{
"epoch": 3.9693170570768723,
"grad_norm": 0.14138256699607013,
"learning_rate": 9.849366893687034e-06,
"loss": 1.1644,
"step": 751
},
{
"epoch": 3.97459584295612,
"grad_norm": 0.12221201901085602,
"learning_rate": 9.752421237444768e-06,
"loss": 1.174,
"step": 752
},
{
"epoch": 3.979874628835368,
"grad_norm": 0.1420340161710947,
"learning_rate": 9.655888773308586e-06,
"loss": 1.1703,
"step": 753
},
{
"epoch": 3.9851534147146155,
"grad_norm": 0.13068888106532828,
"learning_rate": 9.559770819944428e-06,
"loss": 1.1772,
"step": 754
},
{
"epoch": 3.9904322005938635,
"grad_norm": 0.15702714311609547,
"learning_rate": 9.464068690355881e-06,
"loss": 1.1776,
"step": 755
},
{
"epoch": 3.995710986473111,
"grad_norm": 0.1322693957523336,
"learning_rate": 9.368783691866272e-06,
"loss": 1.1811,
"step": 756
},
{
"epoch": 4.002639392939624,
"grad_norm": 0.32862922272344564,
"learning_rate": 9.273917126100706e-06,
"loss": 2.2105,
"step": 757
},
{
"epoch": 4.007918178818872,
"grad_norm": 0.15972455045077913,
"learning_rate": 9.179470288968435e-06,
"loss": 1.1521,
"step": 758
},
{
"epoch": 4.013196964698119,
"grad_norm": 0.16085099316633536,
"learning_rate": 9.085444470645033e-06,
"loss": 1.1649,
"step": 759
},
{
"epoch": 4.018475750577367,
"grad_norm": 0.15307676096371256,
"learning_rate": 8.991840955554805e-06,
"loss": 1.1606,
"step": 760
},
{
"epoch": 4.023754536456615,
"grad_norm": 0.1584835111545146,
"learning_rate": 8.898661022353301e-06,
"loss": 1.179,
"step": 761
},
{
"epoch": 4.029033322335863,
"grad_norm": 0.1490011216344127,
"learning_rate": 8.805905943909754e-06,
"loss": 1.1734,
"step": 762
},
{
"epoch": 4.0343121082151105,
"grad_norm": 0.171667967945809,
"learning_rate": 8.713576987289753e-06,
"loss": 1.149,
"step": 763
},
{
"epoch": 4.039590894094358,
"grad_norm": 0.14542174260694177,
"learning_rate": 8.621675413737911e-06,
"loss": 1.1773,
"step": 764
},
{
"epoch": 4.044869679973606,
"grad_norm": 0.14302481831921374,
"learning_rate": 8.530202478660672e-06,
"loss": 1.1694,
"step": 765
},
{
"epoch": 4.050148465852854,
"grad_norm": 0.15754932582955972,
"learning_rate": 8.439159431609108e-06,
"loss": 1.1604,
"step": 766
},
{
"epoch": 4.055427251732102,
"grad_norm": 0.1428839940000341,
"learning_rate": 8.34854751626188e-06,
"loss": 1.1603,
"step": 767
},
{
"epoch": 4.060706037611349,
"grad_norm": 0.145295522617084,
"learning_rate": 8.258367970408248e-06,
"loss": 1.1655,
"step": 768
},
{
"epoch": 4.065984823490597,
"grad_norm": 0.6586015632358717,
"learning_rate": 8.168622025931152e-06,
"loss": 1.1775,
"step": 769
},
{
"epoch": 4.071263609369845,
"grad_norm": 0.13235823309712424,
"learning_rate": 8.07931090879042e-06,
"loss": 1.1821,
"step": 770
},
{
"epoch": 4.076542395249093,
"grad_norm": 0.15766183056424551,
"learning_rate": 7.990435839005961e-06,
"loss": 1.1501,
"step": 771
},
{
"epoch": 4.0818211811283405,
"grad_norm": 0.16128467997458518,
"learning_rate": 7.901998030641125e-06,
"loss": 1.1647,
"step": 772
},
{
"epoch": 4.087099967007588,
"grad_norm": 0.175684873680538,
"learning_rate": 7.813998691786172e-06,
"loss": 1.1526,
"step": 773
},
{
"epoch": 4.092378752886836,
"grad_norm": 0.1483548124292395,
"learning_rate": 7.726439024541647e-06,
"loss": 1.1565,
"step": 774
},
{
"epoch": 4.097657538766084,
"grad_norm": 0.14495321888236465,
"learning_rate": 7.639320225002106e-06,
"loss": 1.1671,
"step": 775
},
{
"epoch": 4.102936324645332,
"grad_norm": 0.15968741317488708,
"learning_rate": 7.552643483239661e-06,
"loss": 1.1731,
"step": 776
},
{
"epoch": 4.108215110524579,
"grad_norm": 0.1711293844095317,
"learning_rate": 7.466409983287763e-06,
"loss": 1.1862,
"step": 777
},
{
"epoch": 4.113493896403827,
"grad_norm": 0.15163396329916032,
"learning_rate": 7.380620903125071e-06,
"loss": 1.1497,
"step": 778
},
{
"epoch": 4.118772682283075,
"grad_norm": 0.137743428037284,
"learning_rate": 7.295277414659279e-06,
"loss": 1.1513,
"step": 779
},
{
"epoch": 4.124051468162323,
"grad_norm": 0.1703498274940864,
"learning_rate": 7.21038068371116e-06,
"loss": 1.1868,
"step": 780
},
{
"epoch": 4.12933025404157,
"grad_norm": 0.14516466392073746,
"learning_rate": 7.125931869998637e-06,
"loss": 1.1722,
"step": 781
},
{
"epoch": 4.134609039920818,
"grad_norm": 0.8612828120069205,
"learning_rate": 7.041932127120916e-06,
"loss": 1.1966,
"step": 782
},
{
"epoch": 4.139887825800066,
"grad_norm": 0.16416849793012017,
"learning_rate": 6.958382602542775e-06,
"loss": 1.1575,
"step": 783
},
{
"epoch": 4.145166611679314,
"grad_norm": 0.13648389237144803,
"learning_rate": 6.875284437578829e-06,
"loss": 1.1652,
"step": 784
},
{
"epoch": 4.150445397558562,
"grad_norm": 0.12915478715404072,
"learning_rate": 6.792638767377981e-06,
"loss": 1.1623,
"step": 785
},
{
"epoch": 4.155724183437809,
"grad_norm": 0.1271714835707355,
"learning_rate": 6.710446720907886e-06,
"loss": 1.1556,
"step": 786
},
{
"epoch": 4.161002969317057,
"grad_norm": 0.1374695700589436,
"learning_rate": 6.6287094209396005e-06,
"loss": 1.1703,
"step": 787
},
{
"epoch": 4.166281755196305,
"grad_norm": 0.13643108166742873,
"learning_rate": 6.547427984032104e-06,
"loss": 1.175,
"step": 788
},
{
"epoch": 4.171560541075553,
"grad_norm": 0.12993636667214994,
"learning_rate": 6.466603520517205e-06,
"loss": 1.186,
"step": 789
},
{
"epoch": 4.1768393269548,
"grad_norm": 0.1241730758098539,
"learning_rate": 6.386237134484296e-06,
"loss": 1.1608,
"step": 790
},
{
"epoch": 4.182118112834048,
"grad_norm": 0.12139659159425875,
"learning_rate": 6.306329923765222e-06,
"loss": 1.1656,
"step": 791
},
{
"epoch": 4.187396898713296,
"grad_norm": 0.138372121781133,
"learning_rate": 6.226882979919407e-06,
"loss": 1.1464,
"step": 792
},
{
"epoch": 4.192675684592544,
"grad_norm": 0.12939648961836608,
"learning_rate": 6.147897388218811e-06,
"loss": 1.159,
"step": 793
},
{
"epoch": 4.1979544704717915,
"grad_norm": 0.1230027109136608,
"learning_rate": 6.0693742276332245e-06,
"loss": 1.1541,
"step": 794
},
{
"epoch": 4.203233256351039,
"grad_norm": 0.14916519840606735,
"learning_rate": 5.991314570815441e-06,
"loss": 1.1772,
"step": 795
},
{
"epoch": 4.208512042230287,
"grad_norm": 0.1292233287870313,
"learning_rate": 5.913719484086638e-06,
"loss": 1.1546,
"step": 796
},
{
"epoch": 4.213790828109535,
"grad_norm": 0.12382165284530203,
"learning_rate": 5.836590027421829e-06,
"loss": 1.1461,
"step": 797
},
{
"epoch": 4.219069613988783,
"grad_norm": 0.12426017833944712,
"learning_rate": 5.759927254435345e-06,
"loss": 1.1575,
"step": 798
},
{
"epoch": 4.22434839986803,
"grad_norm": 0.12511292326231407,
"learning_rate": 5.683732212366466e-06,
"loss": 1.1667,
"step": 799
},
{
"epoch": 4.229627185747278,
"grad_norm": 0.1225763097437516,
"learning_rate": 5.608005942065102e-06,
"loss": 1.1627,
"step": 800
},
{
"epoch": 4.234905971626526,
"grad_norm": 0.12366047962888504,
"learning_rate": 5.532749477977613e-06,
"loss": 1.1704,
"step": 801
},
{
"epoch": 4.240184757505774,
"grad_norm": 0.11826441744257116,
"learning_rate": 5.45796384813261e-06,
"loss": 1.1778,
"step": 802
},
{
"epoch": 4.245463543385021,
"grad_norm": 0.12376814311025498,
"learning_rate": 5.383650074126973e-06,
"loss": 1.1523,
"step": 803
},
{
"epoch": 4.250742329264269,
"grad_norm": 0.12958820147684294,
"learning_rate": 5.309809171111866e-06,
"loss": 1.1479,
"step": 804
},
{
"epoch": 4.2560211151435166,
"grad_norm": 0.13474325077163982,
"learning_rate": 5.236442147778866e-06,
"loss": 1.1672,
"step": 805
},
{
"epoch": 4.261299901022765,
"grad_norm": 0.11930301746988618,
"learning_rate": 5.163550006346225e-06,
"loss": 1.1766,
"step": 806
},
{
"epoch": 4.266578686902013,
"grad_norm": 0.13565747630020547,
"learning_rate": 5.0911337425451115e-06,
"loss": 1.1669,
"step": 807
},
{
"epoch": 4.27185747278126,
"grad_norm": 0.14084681033309193,
"learning_rate": 5.0191943456060574e-06,
"loss": 1.1572,
"step": 808
},
{
"epoch": 4.277136258660508,
"grad_norm": 0.11838154220172313,
"learning_rate": 4.947732798245466e-06,
"loss": 1.1527,
"step": 809
},
{
"epoch": 4.282415044539756,
"grad_norm": 0.11759251011433089,
"learning_rate": 4.8767500766520834e-06,
"loss": 1.1658,
"step": 810
},
{
"epoch": 4.287693830419004,
"grad_norm": 0.1083263135518414,
"learning_rate": 4.806247150473811e-06,
"loss": 1.1576,
"step": 811
},
{
"epoch": 4.292972616298251,
"grad_norm": 0.12075133264162158,
"learning_rate": 4.736224982804331e-06,
"loss": 1.1649,
"step": 812
},
{
"epoch": 4.298251402177499,
"grad_norm": 0.12348266353073907,
"learning_rate": 4.66668453017002e-06,
"loss": 1.1523,
"step": 813
},
{
"epoch": 4.303530188056747,
"grad_norm": 0.1259621809705678,
"learning_rate": 4.597626742516892e-06,
"loss": 1.1555,
"step": 814
},
{
"epoch": 4.308808973935995,
"grad_norm": 0.1162939281963056,
"learning_rate": 4.529052563197524e-06,
"loss": 1.1657,
"step": 815
},
{
"epoch": 4.3140877598152425,
"grad_norm": 0.12082024531451342,
"learning_rate": 4.460962928958323e-06,
"loss": 1.1697,
"step": 816
},
{
"epoch": 4.31936654569449,
"grad_norm": 0.1143945972992921,
"learning_rate": 4.393358769926592e-06,
"loss": 1.1664,
"step": 817
},
{
"epoch": 4.324645331573738,
"grad_norm": 0.1184039155349698,
"learning_rate": 4.326241009597891e-06,
"loss": 1.1607,
"step": 818
},
{
"epoch": 4.329924117452986,
"grad_norm": 0.11885449737609986,
"learning_rate": 4.259610564823433e-06,
"loss": 1.1711,
"step": 819
},
{
"epoch": 4.335202903332234,
"grad_norm": 0.2993015113097636,
"learning_rate": 4.193468345797511e-06,
"loss": 1.1736,
"step": 820
},
{
"epoch": 4.340481689211481,
"grad_norm": 0.11015741307567276,
"learning_rate": 4.127815256045091e-06,
"loss": 1.1809,
"step": 821
},
{
"epoch": 4.345760475090729,
"grad_norm": 0.11568779303705128,
"learning_rate": 4.06265219240948e-06,
"loss": 1.1686,
"step": 822
},
{
"epoch": 4.351039260969977,
"grad_norm": 0.12406719177604637,
"learning_rate": 3.997980045040062e-06,
"loss": 1.1626,
"step": 823
},
{
"epoch": 4.356318046849225,
"grad_norm": 0.11523454788820603,
"learning_rate": 3.933799697380134e-06,
"loss": 1.1691,
"step": 824
},
{
"epoch": 4.361596832728472,
"grad_norm": 0.11048268846604357,
"learning_rate": 3.870112026154847e-06,
"loss": 1.169,
"step": 825
},
{
"epoch": 4.36687561860772,
"grad_norm": 0.11346757591013902,
"learning_rate": 3.806917901359226e-06,
"loss": 1.1694,
"step": 826
},
{
"epoch": 4.3721544044869685,
"grad_norm": 0.13471289129675465,
"learning_rate": 3.7442181862462666e-06,
"loss": 1.1543,
"step": 827
},
{
"epoch": 4.377433190366216,
"grad_norm": 0.299919959697272,
"learning_rate": 3.6820137373152087e-06,
"loss": 1.1736,
"step": 828
},
{
"epoch": 4.382711976245464,
"grad_norm": 0.11380219584729494,
"learning_rate": 3.620305404299744e-06,
"loss": 1.1778,
"step": 829
},
{
"epoch": 4.387990762124711,
"grad_norm": 0.11818765407385347,
"learning_rate": 3.5590940301564623e-06,
"loss": 1.1567,
"step": 830
},
{
"epoch": 4.393269548003959,
"grad_norm": 0.11642391366774718,
"learning_rate": 3.498380451053365e-06,
"loss": 1.1599,
"step": 831
},
{
"epoch": 4.398548333883207,
"grad_norm": 0.11490246362522913,
"learning_rate": 3.4381654963583413e-06,
"loss": 1.168,
"step": 832
},
{
"epoch": 4.403827119762455,
"grad_norm": 0.13726613621953002,
"learning_rate": 3.3784499886279565e-06,
"loss": 1.1477,
"step": 833
},
{
"epoch": 4.409105905641702,
"grad_norm": 0.11259159480179101,
"learning_rate": 3.3192347435961493e-06,
"loss": 1.1704,
"step": 834
},
{
"epoch": 4.41438469152095,
"grad_norm": 0.11867399217960718,
"learning_rate": 3.2605205701630795e-06,
"loss": 1.1642,
"step": 835
},
{
"epoch": 4.4196634774001975,
"grad_norm": 0.11376592541343825,
"learning_rate": 3.202308270384138e-06,
"loss": 1.1593,
"step": 836
},
{
"epoch": 4.424942263279446,
"grad_norm": 0.11774644117599645,
"learning_rate": 3.144598639458911e-06,
"loss": 1.1549,
"step": 837
},
{
"epoch": 4.4302210491586935,
"grad_norm": 0.11501940477051867,
"learning_rate": 3.0873924657203934e-06,
"loss": 1.1724,
"step": 838
},
{
"epoch": 4.435499835037941,
"grad_norm": 0.12802614276387342,
"learning_rate": 3.0306905306241695e-06,
"loss": 1.1875,
"step": 839
},
{
"epoch": 4.440778620917189,
"grad_norm": 0.11853661247701457,
"learning_rate": 2.974493608737752e-06,
"loss": 1.1564,
"step": 840
},
{
"epoch": 4.446057406796437,
"grad_norm": 0.11905841171224281,
"learning_rate": 2.9188024677300065e-06,
"loss": 1.1693,
"step": 841
},
{
"epoch": 4.451336192675685,
"grad_norm": 0.10964650477920777,
"learning_rate": 2.863617868360673e-06,
"loss": 1.1605,
"step": 842
},
{
"epoch": 4.456614978554932,
"grad_norm": 0.1295527964301152,
"learning_rate": 2.8089405644699463e-06,
"loss": 1.1702,
"step": 843
},
{
"epoch": 4.46189376443418,
"grad_norm": 0.12355557618863641,
"learning_rate": 2.754771302968191e-06,
"loss": 1.1712,
"step": 844
},
{
"epoch": 4.467172550313428,
"grad_norm": 0.11165463039337062,
"learning_rate": 2.7011108238257723e-06,
"loss": 1.1838,
"step": 845
},
{
"epoch": 4.472451336192676,
"grad_norm": 0.11075858795536124,
"learning_rate": 2.647959860062872e-06,
"loss": 1.1611,
"step": 846
},
{
"epoch": 4.4777301220719234,
"grad_norm": 0.12178769651136817,
"learning_rate": 2.5953191377395557e-06,
"loss": 1.164,
"step": 847
},
{
"epoch": 4.483008907951171,
"grad_norm": 0.13608459292258282,
"learning_rate": 2.5431893759458027e-06,
"loss": 1.1749,
"step": 848
},
{
"epoch": 4.488287693830419,
"grad_norm": 0.11221373624505879,
"learning_rate": 2.491571286791703e-06,
"loss": 1.1545,
"step": 849
},
{
"epoch": 4.493566479709667,
"grad_norm": 0.10508342975727109,
"learning_rate": 2.4404655753977437e-06,
"loss": 1.1671,
"step": 850
},
{
"epoch": 4.498845265588915,
"grad_norm": 0.10995666327973941,
"learning_rate": 2.3898729398851164e-06,
"loss": 1.1576,
"step": 851
},
{
"epoch": 4.504124051468162,
"grad_norm": 0.1035869276890292,
"learning_rate": 2.3397940713662683e-06,
"loss": 1.1744,
"step": 852
},
{
"epoch": 4.50940283734741,
"grad_norm": 0.11927558398750628,
"learning_rate": 2.2902296539353895e-06,
"loss": 1.1457,
"step": 853
},
{
"epoch": 4.514681623226658,
"grad_norm": 0.11724384351038623,
"learning_rate": 2.2411803646591057e-06,
"loss": 1.1793,
"step": 854
},
{
"epoch": 4.519960409105906,
"grad_norm": 0.10784941076305816,
"learning_rate": 2.192646873567221e-06,
"loss": 1.1585,
"step": 855
},
{
"epoch": 4.525239194985153,
"grad_norm": 0.10037816732827384,
"learning_rate": 2.1446298436435508e-06,
"loss": 1.151,
"step": 856
},
{
"epoch": 4.530517980864401,
"grad_norm": 0.11365281735149528,
"learning_rate": 2.097129930816895e-06,
"loss": 1.1722,
"step": 857
},
{
"epoch": 4.535796766743649,
"grad_norm": 0.11231091970978371,
"learning_rate": 2.0501477839520323e-06,
"loss": 1.1668,
"step": 858
},
{
"epoch": 4.541075552622897,
"grad_norm": 0.10707625359180917,
"learning_rate": 2.0036840448409166e-06,
"loss": 1.1656,
"step": 859
},
{
"epoch": 4.5463543385021445,
"grad_norm": 0.103343369882393,
"learning_rate": 1.957739348193859e-06,
"loss": 1.159,
"step": 860
},
{
"epoch": 4.551633124381392,
"grad_norm": 0.10643619236223816,
"learning_rate": 1.912314321630877e-06,
"loss": 1.1653,
"step": 861
},
{
"epoch": 4.55691191026064,
"grad_norm": 0.11321998012840938,
"learning_rate": 1.867409585673121e-06,
"loss": 1.1683,
"step": 862
},
{
"epoch": 4.562190696139888,
"grad_norm": 0.11702443692773487,
"learning_rate": 1.823025753734391e-06,
"loss": 1.1491,
"step": 863
},
{
"epoch": 4.567469482019136,
"grad_norm": 0.11336390689648039,
"learning_rate": 1.779163432112787e-06,
"loss": 1.1499,
"step": 864
},
{
"epoch": 4.572748267898383,
"grad_norm": 0.1079563192946037,
"learning_rate": 1.735823219982371e-06,
"loss": 1.1492,
"step": 865
},
{
"epoch": 4.578027053777631,
"grad_norm": 0.11346949871890852,
"learning_rate": 1.693005709385025e-06,
"loss": 1.1675,
"step": 866
},
{
"epoch": 4.583305839656878,
"grad_norm": 0.10606397010769701,
"learning_rate": 1.6507114852223694e-06,
"loss": 1.1638,
"step": 867
},
{
"epoch": 4.588584625536127,
"grad_norm": 0.11224848857280989,
"learning_rate": 1.608941125247725e-06,
"loss": 1.17,
"step": 868
},
{
"epoch": 4.5938634114153745,
"grad_norm": 0.11525328958527273,
"learning_rate": 1.5676952000582746e-06,
"loss": 1.1689,
"step": 869
},
{
"epoch": 4.599142197294622,
"grad_norm": 0.10941397776251549,
"learning_rate": 1.5269742730872384e-06,
"loss": 1.1552,
"step": 870
},
{
"epoch": 4.6044209831738705,
"grad_norm": 0.1053956618070631,
"learning_rate": 1.4867789005961818e-06,
"loss": 1.1509,
"step": 871
},
{
"epoch": 4.609699769053118,
"grad_norm": 0.10586295213178988,
"learning_rate": 1.4471096316674272e-06,
"loss": 1.1679,
"step": 872
},
{
"epoch": 4.614978554932366,
"grad_norm": 0.11592453657256088,
"learning_rate": 1.4079670081965246e-06,
"loss": 1.1731,
"step": 873
},
{
"epoch": 4.620257340811613,
"grad_norm": 0.09907380207012419,
"learning_rate": 1.3693515648849042e-06,
"loss": 1.1687,
"step": 874
},
{
"epoch": 4.625536126690861,
"grad_norm": 0.12561558309571264,
"learning_rate": 1.3312638292325032e-06,
"loss": 1.1549,
"step": 875
},
{
"epoch": 4.630814912570109,
"grad_norm": 0.10690353972188854,
"learning_rate": 1.2937043215306156e-06,
"loss": 1.1537,
"step": 876
},
{
"epoch": 4.636093698449357,
"grad_norm": 0.10264572047936735,
"learning_rate": 1.256673554854757e-06,
"loss": 1.1574,
"step": 877
},
{
"epoch": 4.641372484328604,
"grad_norm": 0.12384527063181164,
"learning_rate": 1.2201720350576608e-06,
"loss": 1.1717,
"step": 878
},
{
"epoch": 4.646651270207852,
"grad_norm": 0.12292766986287774,
"learning_rate": 1.1842002607623804e-06,
"loss": 1.1351,
"step": 879
},
{
"epoch": 4.6519300560870995,
"grad_norm": 0.09960668601841101,
"learning_rate": 1.1487587233554432e-06,
"loss": 1.1635,
"step": 880
},
{
"epoch": 4.657208841966348,
"grad_norm": 0.10157889253610197,
"learning_rate": 1.1138479069801967e-06,
"loss": 1.1694,
"step": 881
},
{
"epoch": 4.662487627845596,
"grad_norm": 0.10654619175522417,
"learning_rate": 1.0794682885301344e-06,
"loss": 1.1577,
"step": 882
},
{
"epoch": 4.667766413724843,
"grad_norm": 0.11376564016272218,
"learning_rate": 1.0456203376424169e-06,
"loss": 1.1711,
"step": 883
},
{
"epoch": 4.673045199604091,
"grad_norm": 0.09940409901353127,
"learning_rate": 1.0123045166914403e-06,
"loss": 1.1718,
"step": 884
},
{
"epoch": 4.678323985483339,
"grad_norm": 0.10209569506859624,
"learning_rate": 9.79521280782536e-07,
"loss": 1.1646,
"step": 885
},
{
"epoch": 4.683602771362587,
"grad_norm": 0.09649947128746558,
"learning_rate": 9.472710777457395e-07,
"loss": 1.1662,
"step": 886
},
{
"epoch": 4.688881557241834,
"grad_norm": 0.10006450320603427,
"learning_rate": 9.155543481296747e-07,
"loss": 1.1615,
"step": 887
},
{
"epoch": 4.694160343121082,
"grad_norm": 0.10739407381371646,
"learning_rate": 8.843715251955464e-07,
"loss": 1.1644,
"step": 888
},
{
"epoch": 4.69943912900033,
"grad_norm": 0.10063647783868902,
"learning_rate": 8.537230349112158e-07,
"loss": 1.1716,
"step": 889
},
{
"epoch": 4.704717914879578,
"grad_norm": 0.10085797467259582,
"learning_rate": 8.236092959453646e-07,
"loss": 1.1613,
"step": 890
},
{
"epoch": 4.7099967007588255,
"grad_norm": 1.7837683707358607,
"learning_rate": 7.940307196618113e-07,
"loss": 1.1772,
"step": 891
},
{
"epoch": 4.715275486638073,
"grad_norm": 0.10090146467799602,
"learning_rate": 7.64987710113858e-07,
"loss": 1.1473,
"step": 892
},
{
"epoch": 4.720554272517321,
"grad_norm": 0.10306276432506471,
"learning_rate": 7.364806640387878e-07,
"loss": 1.167,
"step": 893
},
{
"epoch": 4.725833058396569,
"grad_norm": 0.10423139318723618,
"learning_rate": 7.085099708524557e-07,
"loss": 1.1724,
"step": 894
},
{
"epoch": 4.731111844275817,
"grad_norm": 0.10587170611709364,
"learning_rate": 6.810760126439287e-07,
"loss": 1.1635,
"step": 895
},
{
"epoch": 4.736390630155064,
"grad_norm": 0.10713450024011799,
"learning_rate": 6.54179164170321e-07,
"loss": 1.1799,
"step": 896
},
{
"epoch": 4.741669416034312,
"grad_norm": 0.1033918776840394,
"learning_rate": 6.278197928516294e-07,
"loss": 1.1574,
"step": 897
},
{
"epoch": 4.74694820191356,
"grad_norm": 0.10277089392474535,
"learning_rate": 6.019982587657413e-07,
"loss": 1.1625,
"step": 898
},
{
"epoch": 4.752226987792808,
"grad_norm": 0.09433906599785452,
"learning_rate": 5.767149146435014e-07,
"loss": 1.1728,
"step": 899
},
{
"epoch": 4.757505773672055,
"grad_norm": 0.10104242957996779,
"learning_rate": 5.519701058638971e-07,
"loss": 1.1532,
"step": 900
},
{
"epoch": 4.762784559551303,
"grad_norm": 0.10949650094260902,
"learning_rate": 5.277641704493519e-07,
"loss": 1.1686,
"step": 901
},
{
"epoch": 4.768063345430551,
"grad_norm": 0.10199310812595654,
"learning_rate": 5.040974390610753e-07,
"loss": 1.1635,
"step": 902
},
{
"epoch": 4.773342131309799,
"grad_norm": 0.10007983394238783,
"learning_rate": 4.809702349946044e-07,
"loss": 1.1673,
"step": 903
},
{
"epoch": 4.778620917189047,
"grad_norm": 0.11016988989364565,
"learning_rate": 4.583828741753138e-07,
"loss": 1.1543,
"step": 904
},
{
"epoch": 4.783899703068294,
"grad_norm": 0.10118174124280127,
"learning_rate": 4.363356651541617e-07,
"loss": 1.1625,
"step": 905
},
{
"epoch": 4.789178488947542,
"grad_norm": 8.168639065238155,
"learning_rate": 4.148289091034441e-07,
"loss": 1.1561,
"step": 906
},
{
"epoch": 4.79445727482679,
"grad_norm": 0.13568083810035608,
"learning_rate": 3.9386289981267813e-07,
"loss": 1.1541,
"step": 907
},
{
"epoch": 4.799736060706038,
"grad_norm": 0.09556938886884957,
"learning_rate": 3.734379236846231e-07,
"loss": 1.1589,
"step": 908
},
{
"epoch": 4.805014846585285,
"grad_norm": 0.10242778222635852,
"learning_rate": 3.5355425973131017e-07,
"loss": 1.1748,
"step": 909
},
{
"epoch": 4.810293632464533,
"grad_norm": 0.09666773860438002,
"learning_rate": 3.3421217957029017e-07,
"loss": 1.1598,
"step": 910
},
{
"epoch": 4.8155724183437805,
"grad_norm": 0.09978976902696876,
"learning_rate": 3.154119474208894e-07,
"loss": 1.1752,
"step": 911
},
{
"epoch": 4.820851204223029,
"grad_norm": 0.09710058280846443,
"learning_rate": 2.971538201005997e-07,
"loss": 1.1718,
"step": 912
},
{
"epoch": 4.8261299901022765,
"grad_norm": 0.10166616238664858,
"learning_rate": 2.7943804702159185e-07,
"loss": 1.1674,
"step": 913
},
{
"epoch": 4.831408775981524,
"grad_norm": 0.12196992584814861,
"learning_rate": 2.6226487018728317e-07,
"loss": 1.1652,
"step": 914
},
{
"epoch": 4.836687561860772,
"grad_norm": 0.10117286603737477,
"learning_rate": 2.4563452418905565e-07,
"loss": 1.1708,
"step": 915
},
{
"epoch": 4.84196634774002,
"grad_norm": 0.10639803241716381,
"learning_rate": 2.2954723620302267e-07,
"loss": 1.1717,
"step": 916
},
{
"epoch": 4.847245133619268,
"grad_norm": 0.10365034376993183,
"learning_rate": 2.140032259869651e-07,
"loss": 1.1586,
"step": 917
},
{
"epoch": 4.852523919498515,
"grad_norm": 0.09739690810779615,
"learning_rate": 1.9900270587728477e-07,
"loss": 1.1449,
"step": 918
},
{
"epoch": 4.857802705377763,
"grad_norm": 0.10136332633557722,
"learning_rate": 1.8454588078613555e-07,
"loss": 1.1616,
"step": 919
},
{
"epoch": 4.863081491257011,
"grad_norm": 0.10227583958211155,
"learning_rate": 1.706329481986213e-07,
"loss": 1.1617,
"step": 920
},
{
"epoch": 4.868360277136259,
"grad_norm": 0.1038745119866778,
"learning_rate": 1.5726409817007348e-07,
"loss": 1.1701,
"step": 921
},
{
"epoch": 4.873639063015506,
"grad_norm": 0.10774776382247592,
"learning_rate": 1.444395133234888e-07,
"loss": 1.1665,
"step": 922
},
{
"epoch": 4.878917848894754,
"grad_norm": 0.09792018389942055,
"learning_rate": 1.321593688470113e-07,
"loss": 1.1809,
"step": 923
},
{
"epoch": 4.884196634774002,
"grad_norm": 0.10169455342635682,
"learning_rate": 1.2042383249154743e-07,
"loss": 1.1703,
"step": 924
},
{
"epoch": 4.88947542065325,
"grad_norm": 0.0970982445176629,
"learning_rate": 1.0923306456847915e-07,
"loss": 1.1724,
"step": 925
},
{
"epoch": 4.894754206532498,
"grad_norm": 0.09775306631079801,
"learning_rate": 9.858721794745229e-08,
"loss": 1.1597,
"step": 926
},
{
"epoch": 4.900032992411745,
"grad_norm": 0.09765431840575976,
"learning_rate": 8.848643805432045e-08,
"loss": 1.162,
"step": 927
},
{
"epoch": 4.905311778290993,
"grad_norm": 0.09631829632067684,
"learning_rate": 7.893086286913764e-08,
"loss": 1.1572,
"step": 928
},
{
"epoch": 4.910590564170241,
"grad_norm": 0.0941345070066087,
"learning_rate": 6.992062292427548e-08,
"loss": 1.1515,
"step": 929
},
{
"epoch": 4.915869350049489,
"grad_norm": 0.09830810649690867,
"learning_rate": 6.145584130264226e-08,
"loss": 1.1534,
"step": 930
},
{
"epoch": 4.921148135928736,
"grad_norm": 0.3185034298194423,
"learning_rate": 5.353663363600437e-08,
"loss": 1.1921,
"step": 931
},
{
"epoch": 4.926426921807984,
"grad_norm": 0.10174557358686656,
"learning_rate": 4.6163108103405295e-08,
"loss": 1.1714,
"step": 932
},
{
"epoch": 4.931705707687232,
"grad_norm": 0.0955458770336446,
"learning_rate": 3.933536542968686e-08,
"loss": 1.1645,
"step": 933
},
{
"epoch": 4.93698449356648,
"grad_norm": 0.09414779046743281,
"learning_rate": 3.305349888410803e-08,
"loss": 1.1724,
"step": 934
},
{
"epoch": 4.9422632794457275,
"grad_norm": 0.10775863469508211,
"learning_rate": 2.731759427908376e-08,
"loss": 1.1738,
"step": 935
},
{
"epoch": 4.947542065324975,
"grad_norm": 0.10257711289400813,
"learning_rate": 2.2127729968999257e-08,
"loss": 1.16,
"step": 936
},
{
"epoch": 4.952820851204223,
"grad_norm": 0.11208184061076697,
"learning_rate": 1.74839768491486e-08,
"loss": 1.1694,
"step": 937
},
{
"epoch": 4.958099637083471,
"grad_norm": 0.12886345058516566,
"learning_rate": 1.3386398354762187e-08,
"loss": 1.1722,
"step": 938
},
{
"epoch": 4.963378422962719,
"grad_norm": 0.10028905949188494,
"learning_rate": 9.835050460140772e-09,
"loss": 1.1624,
"step": 939
},
{
"epoch": 4.968657208841966,
"grad_norm": 0.10904642824875087,
"learning_rate": 6.829981677891618e-09,
"loss": 1.1632,
"step": 940
},
{
"epoch": 4.973935994721214,
"grad_norm": 0.10094212240836907,
"learning_rate": 4.3712330582668105e-09,
"loss": 1.1605,
"step": 941
},
{
"epoch": 4.979214780600461,
"grad_norm": 0.11163546063862742,
"learning_rate": 2.458838188599266e-09,
"loss": 1.1583,
"step": 942
},
{
"epoch": 4.98449356647971,
"grad_norm": 0.09739350820326295,
"learning_rate": 1.0928231928497568e-09,
"loss": 1.1585,
"step": 943
},
{
"epoch": 4.989772352358957,
"grad_norm": 0.09419504954849381,
"learning_rate": 2.7320673123831796e-10,
"loss": 1.1736,
"step": 944
},
{
"epoch": 4.995051138238205,
"grad_norm": 0.12697849655419563,
"learning_rate": 0.0,
"loss": 1.1708,
"step": 945
},
{
"epoch": 4.995051138238205,
"step": 945,
"total_flos": 2.4317663849553592e+19,
"train_loss": 1.2392805136070049,
"train_runtime": 143310.648,
"train_samples_per_second": 3.383,
"train_steps_per_second": 0.007
}
],
"logging_steps": 1.0,
"max_steps": 945,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4317663849553592e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}