gpt2-dropout / trainer_state.json
ao9000's picture
Upload 2 files
1d1176d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.22326645805206993,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00011163322902603496,
"grad_norm": 0.44215860962867737,
"learning_rate": 4.9975e-05,
"loss": 4.2341,
"step": 1
},
{
"epoch": 0.00022326645805206992,
"grad_norm": 0.43593406677246094,
"learning_rate": 4.995e-05,
"loss": 4.1995,
"step": 2
},
{
"epoch": 0.0003348996870781049,
"grad_norm": 0.4163583517074585,
"learning_rate": 4.992500000000001e-05,
"loss": 4.1353,
"step": 3
},
{
"epoch": 0.00044653291610413984,
"grad_norm": 0.42954307794570923,
"learning_rate": 4.99e-05,
"loss": 4.2196,
"step": 4
},
{
"epoch": 0.0005581661451301748,
"grad_norm": 0.412328839302063,
"learning_rate": 4.9875000000000006e-05,
"loss": 4.0622,
"step": 5
},
{
"epoch": 0.0006697993741562098,
"grad_norm": 0.4354637861251831,
"learning_rate": 4.9850000000000006e-05,
"loss": 4.2064,
"step": 6
},
{
"epoch": 0.0007814326031822447,
"grad_norm": 0.4455626308917999,
"learning_rate": 4.9825000000000005e-05,
"loss": 4.0721,
"step": 7
},
{
"epoch": 0.0008930658322082797,
"grad_norm": 0.43596258759498596,
"learning_rate": 4.9800000000000004e-05,
"loss": 4.0665,
"step": 8
},
{
"epoch": 0.0010046990612343147,
"grad_norm": 0.4392620027065277,
"learning_rate": 4.9775000000000004e-05,
"loss": 4.1342,
"step": 9
},
{
"epoch": 0.0011163322902603497,
"grad_norm": 0.4544425308704376,
"learning_rate": 4.975e-05,
"loss": 4.1405,
"step": 10
},
{
"epoch": 0.0012279655192863846,
"grad_norm": 0.4113636314868927,
"learning_rate": 4.9725e-05,
"loss": 4.0856,
"step": 11
},
{
"epoch": 0.0013395987483124196,
"grad_norm": 0.43815362453460693,
"learning_rate": 4.97e-05,
"loss": 4.0425,
"step": 12
},
{
"epoch": 0.0014512319773384544,
"grad_norm": 0.4221861958503723,
"learning_rate": 4.967500000000001e-05,
"loss": 3.9899,
"step": 13
},
{
"epoch": 0.0015628652063644894,
"grad_norm": 0.46096450090408325,
"learning_rate": 4.965e-05,
"loss": 4.0892,
"step": 14
},
{
"epoch": 0.0016744984353905244,
"grad_norm": 0.46079084277153015,
"learning_rate": 4.962500000000001e-05,
"loss": 4.0699,
"step": 15
},
{
"epoch": 0.0017861316644165594,
"grad_norm": 0.4496927261352539,
"learning_rate": 4.96e-05,
"loss": 4.1612,
"step": 16
},
{
"epoch": 0.0018977648934425943,
"grad_norm": 0.4900566339492798,
"learning_rate": 4.9575000000000006e-05,
"loss": 4.0794,
"step": 17
},
{
"epoch": 0.0020093981224686293,
"grad_norm": 0.4467006027698517,
"learning_rate": 4.9550000000000005e-05,
"loss": 4.0253,
"step": 18
},
{
"epoch": 0.0021210313514946643,
"grad_norm": 0.4582521915435791,
"learning_rate": 4.9525000000000004e-05,
"loss": 4.0763,
"step": 19
},
{
"epoch": 0.0022326645805206993,
"grad_norm": 0.42390087246894836,
"learning_rate": 4.9500000000000004e-05,
"loss": 4.1114,
"step": 20
},
{
"epoch": 0.0023442978095467343,
"grad_norm": 0.4285309314727783,
"learning_rate": 4.9475e-05,
"loss": 3.9917,
"step": 21
},
{
"epoch": 0.0024559310385727693,
"grad_norm": 0.4030017554759979,
"learning_rate": 4.945e-05,
"loss": 3.9292,
"step": 22
},
{
"epoch": 0.0025675642675988043,
"grad_norm": 0.41721802949905396,
"learning_rate": 4.9425e-05,
"loss": 4.0111,
"step": 23
},
{
"epoch": 0.0026791974966248393,
"grad_norm": 0.39811229705810547,
"learning_rate": 4.94e-05,
"loss": 3.894,
"step": 24
},
{
"epoch": 0.0027908307256508742,
"grad_norm": 0.4145883321762085,
"learning_rate": 4.937500000000001e-05,
"loss": 3.9218,
"step": 25
},
{
"epoch": 0.002902463954676909,
"grad_norm": 0.38777053356170654,
"learning_rate": 4.935e-05,
"loss": 4.003,
"step": 26
},
{
"epoch": 0.003014097183702944,
"grad_norm": 0.37414127588272095,
"learning_rate": 4.9325000000000006e-05,
"loss": 3.9657,
"step": 27
},
{
"epoch": 0.0031257304127289788,
"grad_norm": 0.3585963547229767,
"learning_rate": 4.93e-05,
"loss": 3.9656,
"step": 28
},
{
"epoch": 0.0032373636417550138,
"grad_norm": 0.34030988812446594,
"learning_rate": 4.9275000000000005e-05,
"loss": 3.8919,
"step": 29
},
{
"epoch": 0.0033489968707810487,
"grad_norm": 0.34275656938552856,
"learning_rate": 4.9250000000000004e-05,
"loss": 3.8943,
"step": 30
},
{
"epoch": 0.0034606300998070837,
"grad_norm": 0.33121684193611145,
"learning_rate": 4.9225000000000004e-05,
"loss": 3.9409,
"step": 31
},
{
"epoch": 0.0035722633288331187,
"grad_norm": 0.3226243257522583,
"learning_rate": 4.92e-05,
"loss": 3.9615,
"step": 32
},
{
"epoch": 0.0036838965578591537,
"grad_norm": 0.3060978353023529,
"learning_rate": 4.9175e-05,
"loss": 3.9521,
"step": 33
},
{
"epoch": 0.0037955297868851887,
"grad_norm": 0.309896320104599,
"learning_rate": 4.915e-05,
"loss": 3.9552,
"step": 34
},
{
"epoch": 0.003907163015911224,
"grad_norm": 0.3025077283382416,
"learning_rate": 4.9125e-05,
"loss": 3.8825,
"step": 35
},
{
"epoch": 0.004018796244937259,
"grad_norm": 0.2822646498680115,
"learning_rate": 4.91e-05,
"loss": 3.9761,
"step": 36
},
{
"epoch": 0.004130429473963294,
"grad_norm": 0.2942345440387726,
"learning_rate": 4.907500000000001e-05,
"loss": 3.9771,
"step": 37
},
{
"epoch": 0.004242062702989329,
"grad_norm": 0.3008975386619568,
"learning_rate": 4.905e-05,
"loss": 3.9041,
"step": 38
},
{
"epoch": 0.004353695932015364,
"grad_norm": 0.29803982377052307,
"learning_rate": 4.9025000000000006e-05,
"loss": 3.8995,
"step": 39
},
{
"epoch": 0.004465329161041399,
"grad_norm": 0.2962414622306824,
"learning_rate": 4.9e-05,
"loss": 3.8843,
"step": 40
},
{
"epoch": 0.004576962390067434,
"grad_norm": 0.2958971858024597,
"learning_rate": 4.8975000000000005e-05,
"loss": 3.8987,
"step": 41
},
{
"epoch": 0.004688595619093469,
"grad_norm": 0.28974649310112,
"learning_rate": 4.8950000000000004e-05,
"loss": 3.9809,
"step": 42
},
{
"epoch": 0.004800228848119504,
"grad_norm": 0.28950035572052,
"learning_rate": 4.8925e-05,
"loss": 3.8246,
"step": 43
},
{
"epoch": 0.004911862077145539,
"grad_norm": 0.29236239194869995,
"learning_rate": 4.89e-05,
"loss": 3.8756,
"step": 44
},
{
"epoch": 0.0050234953061715736,
"grad_norm": 0.29517626762390137,
"learning_rate": 4.8875e-05,
"loss": 3.8802,
"step": 45
},
{
"epoch": 0.0051351285351976085,
"grad_norm": 0.27704691886901855,
"learning_rate": 4.885e-05,
"loss": 3.8847,
"step": 46
},
{
"epoch": 0.0052467617642236435,
"grad_norm": 0.2924402058124542,
"learning_rate": 4.8825e-05,
"loss": 3.9734,
"step": 47
},
{
"epoch": 0.0053583949932496785,
"grad_norm": 0.2823637127876282,
"learning_rate": 4.88e-05,
"loss": 3.809,
"step": 48
},
{
"epoch": 0.0054700282222757135,
"grad_norm": 0.2842734754085541,
"learning_rate": 4.8775000000000007e-05,
"loss": 3.8888,
"step": 49
},
{
"epoch": 0.0055816614513017485,
"grad_norm": 0.28633418679237366,
"learning_rate": 4.875e-05,
"loss": 3.9063,
"step": 50
},
{
"epoch": 0.0056932946803277835,
"grad_norm": 0.2624303102493286,
"learning_rate": 4.8725000000000005e-05,
"loss": 4.0074,
"step": 51
},
{
"epoch": 0.005804927909353818,
"grad_norm": 0.2477407604455948,
"learning_rate": 4.87e-05,
"loss": 3.876,
"step": 52
},
{
"epoch": 0.005916561138379853,
"grad_norm": 0.25738608837127686,
"learning_rate": 4.8675000000000004e-05,
"loss": 3.9139,
"step": 53
},
{
"epoch": 0.006028194367405888,
"grad_norm": 0.2670241892337799,
"learning_rate": 4.8650000000000003e-05,
"loss": 3.7636,
"step": 54
},
{
"epoch": 0.0061398275964319226,
"grad_norm": 0.2741681635379791,
"learning_rate": 4.8625e-05,
"loss": 3.6865,
"step": 55
},
{
"epoch": 0.0062514608254579575,
"grad_norm": 0.2551226317882538,
"learning_rate": 4.86e-05,
"loss": 3.9293,
"step": 56
},
{
"epoch": 0.0063630940544839925,
"grad_norm": 0.26813098788261414,
"learning_rate": 4.8575e-05,
"loss": 3.738,
"step": 57
},
{
"epoch": 0.0064747272835100275,
"grad_norm": 0.25975507497787476,
"learning_rate": 4.855e-05,
"loss": 3.9278,
"step": 58
},
{
"epoch": 0.0065863605125360625,
"grad_norm": 0.2518691122531891,
"learning_rate": 4.8525e-05,
"loss": 3.7746,
"step": 59
},
{
"epoch": 0.0066979937415620975,
"grad_norm": 0.26091113686561584,
"learning_rate": 4.85e-05,
"loss": 3.8529,
"step": 60
},
{
"epoch": 0.0068096269705881325,
"grad_norm": 0.23660063743591309,
"learning_rate": 4.8475000000000006e-05,
"loss": 3.8632,
"step": 61
},
{
"epoch": 0.0069212601996141675,
"grad_norm": 0.25954556465148926,
"learning_rate": 4.845e-05,
"loss": 3.9002,
"step": 62
},
{
"epoch": 0.0070328934286402025,
"grad_norm": 0.23866277933120728,
"learning_rate": 4.8425000000000005e-05,
"loss": 3.7924,
"step": 63
},
{
"epoch": 0.0071445266576662374,
"grad_norm": 0.24464142322540283,
"learning_rate": 4.8400000000000004e-05,
"loss": 3.8178,
"step": 64
},
{
"epoch": 0.007256159886692272,
"grad_norm": 0.2351391762495041,
"learning_rate": 4.8375000000000004e-05,
"loss": 3.9608,
"step": 65
},
{
"epoch": 0.007367793115718307,
"grad_norm": 0.23395702242851257,
"learning_rate": 4.835e-05,
"loss": 3.808,
"step": 66
},
{
"epoch": 0.007479426344744342,
"grad_norm": 0.21845871210098267,
"learning_rate": 4.8325e-05,
"loss": 3.734,
"step": 67
},
{
"epoch": 0.007591059573770377,
"grad_norm": 0.21968621015548706,
"learning_rate": 4.83e-05,
"loss": 3.8032,
"step": 68
},
{
"epoch": 0.007702692802796412,
"grad_norm": 0.21778298914432526,
"learning_rate": 4.8275e-05,
"loss": 3.774,
"step": 69
},
{
"epoch": 0.007814326031822447,
"grad_norm": 0.23595227301120758,
"learning_rate": 4.825e-05,
"loss": 3.7063,
"step": 70
},
{
"epoch": 0.007925959260848481,
"grad_norm": 0.25157737731933594,
"learning_rate": 4.822500000000001e-05,
"loss": 3.7496,
"step": 71
},
{
"epoch": 0.008037592489874517,
"grad_norm": 0.20785056054592133,
"learning_rate": 4.82e-05,
"loss": 3.8136,
"step": 72
},
{
"epoch": 0.008149225718900551,
"grad_norm": 0.22036397457122803,
"learning_rate": 4.8175000000000005e-05,
"loss": 3.8406,
"step": 73
},
{
"epoch": 0.008260858947926587,
"grad_norm": 0.23183584213256836,
"learning_rate": 4.815e-05,
"loss": 3.6851,
"step": 74
},
{
"epoch": 0.008372492176952621,
"grad_norm": 0.2212534248828888,
"learning_rate": 4.8125000000000004e-05,
"loss": 3.8537,
"step": 75
},
{
"epoch": 0.008484125405978657,
"grad_norm": 0.21307982504367828,
"learning_rate": 4.8100000000000004e-05,
"loss": 3.7184,
"step": 76
},
{
"epoch": 0.008595758635004691,
"grad_norm": 0.2153688371181488,
"learning_rate": 4.8075e-05,
"loss": 3.8932,
"step": 77
},
{
"epoch": 0.008707391864030727,
"grad_norm": 0.2363620400428772,
"learning_rate": 4.805e-05,
"loss": 3.8352,
"step": 78
},
{
"epoch": 0.008819025093056761,
"grad_norm": 0.22032879292964935,
"learning_rate": 4.8025e-05,
"loss": 3.7329,
"step": 79
},
{
"epoch": 0.008930658322082797,
"grad_norm": 0.20803982019424438,
"learning_rate": 4.8e-05,
"loss": 3.816,
"step": 80
},
{
"epoch": 0.009042291551108831,
"grad_norm": 0.21293315291404724,
"learning_rate": 4.7975e-05,
"loss": 3.7736,
"step": 81
},
{
"epoch": 0.009153924780134867,
"grad_norm": 0.21846282482147217,
"learning_rate": 4.795e-05,
"loss": 3.8622,
"step": 82
},
{
"epoch": 0.009265558009160901,
"grad_norm": 0.21493828296661377,
"learning_rate": 4.7925000000000006e-05,
"loss": 3.7606,
"step": 83
},
{
"epoch": 0.009377191238186937,
"grad_norm": 0.19198723137378693,
"learning_rate": 4.79e-05,
"loss": 3.7661,
"step": 84
},
{
"epoch": 0.009488824467212971,
"grad_norm": 0.20541100203990936,
"learning_rate": 4.7875000000000005e-05,
"loss": 3.8284,
"step": 85
},
{
"epoch": 0.009600457696239007,
"grad_norm": 0.2089376449584961,
"learning_rate": 4.785e-05,
"loss": 3.733,
"step": 86
},
{
"epoch": 0.009712090925265041,
"grad_norm": 0.20992936193943024,
"learning_rate": 4.7825000000000004e-05,
"loss": 3.7951,
"step": 87
},
{
"epoch": 0.009823724154291077,
"grad_norm": 0.2133840173482895,
"learning_rate": 4.78e-05,
"loss": 3.9205,
"step": 88
},
{
"epoch": 0.009935357383317111,
"grad_norm": 0.21071945130825043,
"learning_rate": 4.7775e-05,
"loss": 3.8632,
"step": 89
},
{
"epoch": 0.010046990612343147,
"grad_norm": 0.21284504234790802,
"learning_rate": 4.775e-05,
"loss": 3.882,
"step": 90
},
{
"epoch": 0.010158623841369181,
"grad_norm": 0.22976839542388916,
"learning_rate": 4.7725e-05,
"loss": 3.7439,
"step": 91
},
{
"epoch": 0.010270257070395217,
"grad_norm": 0.20580005645751953,
"learning_rate": 4.77e-05,
"loss": 3.8641,
"step": 92
},
{
"epoch": 0.010381890299421251,
"grad_norm": 0.20517392456531525,
"learning_rate": 4.7675e-05,
"loss": 3.7004,
"step": 93
},
{
"epoch": 0.010493523528447287,
"grad_norm": 0.2080390453338623,
"learning_rate": 4.765e-05,
"loss": 3.8157,
"step": 94
},
{
"epoch": 0.010605156757473321,
"grad_norm": 0.218965083360672,
"learning_rate": 4.7625000000000006e-05,
"loss": 3.8501,
"step": 95
},
{
"epoch": 0.010716789986499357,
"grad_norm": 0.1980111002922058,
"learning_rate": 4.76e-05,
"loss": 3.8317,
"step": 96
},
{
"epoch": 0.010828423215525391,
"grad_norm": 0.20574727654457092,
"learning_rate": 4.7575000000000004e-05,
"loss": 3.8553,
"step": 97
},
{
"epoch": 0.010940056444551427,
"grad_norm": 0.20836076140403748,
"learning_rate": 4.755e-05,
"loss": 3.7301,
"step": 98
},
{
"epoch": 0.011051689673577461,
"grad_norm": 0.20732244849205017,
"learning_rate": 4.7525e-05,
"loss": 3.7967,
"step": 99
},
{
"epoch": 0.011163322902603497,
"grad_norm": 0.19933991134166718,
"learning_rate": 4.75e-05,
"loss": 3.7539,
"step": 100
},
{
"epoch": 0.011274956131629531,
"grad_norm": 0.20174570381641388,
"learning_rate": 4.7475e-05,
"loss": 3.7273,
"step": 101
},
{
"epoch": 0.011386589360655567,
"grad_norm": 0.20205800235271454,
"learning_rate": 4.745e-05,
"loss": 3.7442,
"step": 102
},
{
"epoch": 0.011498222589681601,
"grad_norm": 0.20022137463092804,
"learning_rate": 4.7425e-05,
"loss": 3.8821,
"step": 103
},
{
"epoch": 0.011609855818707635,
"grad_norm": 0.19893613457679749,
"learning_rate": 4.74e-05,
"loss": 3.7901,
"step": 104
},
{
"epoch": 0.011721489047733671,
"grad_norm": 0.20531974732875824,
"learning_rate": 4.7375e-05,
"loss": 3.8307,
"step": 105
},
{
"epoch": 0.011833122276759705,
"grad_norm": 0.19706977903842926,
"learning_rate": 4.735e-05,
"loss": 3.7118,
"step": 106
},
{
"epoch": 0.011944755505785741,
"grad_norm": 0.1949852705001831,
"learning_rate": 4.7325000000000005e-05,
"loss": 3.8565,
"step": 107
},
{
"epoch": 0.012056388734811775,
"grad_norm": 0.19829995930194855,
"learning_rate": 4.73e-05,
"loss": 3.7163,
"step": 108
},
{
"epoch": 0.012168021963837811,
"grad_norm": 0.19221463799476624,
"learning_rate": 4.7275000000000004e-05,
"loss": 3.7295,
"step": 109
},
{
"epoch": 0.012279655192863845,
"grad_norm": 0.20135809481143951,
"learning_rate": 4.7249999999999997e-05,
"loss": 3.8531,
"step": 110
},
{
"epoch": 0.012391288421889881,
"grad_norm": 0.1908191293478012,
"learning_rate": 4.7225e-05,
"loss": 3.7745,
"step": 111
},
{
"epoch": 0.012502921650915915,
"grad_norm": 0.18213479220867157,
"learning_rate": 4.72e-05,
"loss": 3.7475,
"step": 112
},
{
"epoch": 0.012614554879941951,
"grad_norm": 0.19967062771320343,
"learning_rate": 4.7175e-05,
"loss": 3.6219,
"step": 113
},
{
"epoch": 0.012726188108967985,
"grad_norm": 0.1982780545949936,
"learning_rate": 4.715e-05,
"loss": 3.7373,
"step": 114
},
{
"epoch": 0.012837821337994021,
"grad_norm": 0.19917789101600647,
"learning_rate": 4.7125e-05,
"loss": 3.7308,
"step": 115
},
{
"epoch": 0.012949454567020055,
"grad_norm": 0.19216570258140564,
"learning_rate": 4.71e-05,
"loss": 3.7488,
"step": 116
},
{
"epoch": 0.013061087796046091,
"grad_norm": 0.18333494663238525,
"learning_rate": 4.7075e-05,
"loss": 3.7555,
"step": 117
},
{
"epoch": 0.013172721025072125,
"grad_norm": 0.19792726635932922,
"learning_rate": 4.705e-05,
"loss": 3.7793,
"step": 118
},
{
"epoch": 0.013284354254098161,
"grad_norm": 0.19866353273391724,
"learning_rate": 4.7025000000000005e-05,
"loss": 3.7207,
"step": 119
},
{
"epoch": 0.013395987483124195,
"grad_norm": 0.19925542175769806,
"learning_rate": 4.7e-05,
"loss": 3.8039,
"step": 120
},
{
"epoch": 0.01350762071215023,
"grad_norm": 0.19152797758579254,
"learning_rate": 4.6975000000000003e-05,
"loss": 3.8374,
"step": 121
},
{
"epoch": 0.013619253941176265,
"grad_norm": 0.20094329118728638,
"learning_rate": 4.695e-05,
"loss": 3.7317,
"step": 122
},
{
"epoch": 0.0137308871702023,
"grad_norm": 0.18719394505023956,
"learning_rate": 4.6925e-05,
"loss": 3.9153,
"step": 123
},
{
"epoch": 0.013842520399228335,
"grad_norm": 0.19800803065299988,
"learning_rate": 4.69e-05,
"loss": 3.716,
"step": 124
},
{
"epoch": 0.01395415362825437,
"grad_norm": 0.1891317069530487,
"learning_rate": 4.6875e-05,
"loss": 3.8018,
"step": 125
},
{
"epoch": 0.014065786857280405,
"grad_norm": 0.20324207842350006,
"learning_rate": 4.685000000000001e-05,
"loss": 3.8073,
"step": 126
},
{
"epoch": 0.01417742008630644,
"grad_norm": 0.18200170993804932,
"learning_rate": 4.6825e-05,
"loss": 3.8118,
"step": 127
},
{
"epoch": 0.014289053315332475,
"grad_norm": 0.18437935411930084,
"learning_rate": 4.6800000000000006e-05,
"loss": 3.7065,
"step": 128
},
{
"epoch": 0.01440068654435851,
"grad_norm": 0.1906784325838089,
"learning_rate": 4.6775000000000005e-05,
"loss": 3.6978,
"step": 129
},
{
"epoch": 0.014512319773384545,
"grad_norm": 0.19428980350494385,
"learning_rate": 4.6750000000000005e-05,
"loss": 3.8249,
"step": 130
},
{
"epoch": 0.01462395300241058,
"grad_norm": 0.1876995712518692,
"learning_rate": 4.6725000000000004e-05,
"loss": 3.6724,
"step": 131
},
{
"epoch": 0.014735586231436615,
"grad_norm": 0.19242452085018158,
"learning_rate": 4.6700000000000003e-05,
"loss": 3.7775,
"step": 132
},
{
"epoch": 0.01484721946046265,
"grad_norm": 0.21177713572978973,
"learning_rate": 4.6675e-05,
"loss": 3.4649,
"step": 133
},
{
"epoch": 0.014958852689488685,
"grad_norm": 0.19784331321716309,
"learning_rate": 4.665e-05,
"loss": 3.7323,
"step": 134
},
{
"epoch": 0.01507048591851472,
"grad_norm": 0.1861720085144043,
"learning_rate": 4.6625e-05,
"loss": 3.7735,
"step": 135
},
{
"epoch": 0.015182119147540755,
"grad_norm": 0.18700018525123596,
"learning_rate": 4.660000000000001e-05,
"loss": 3.7867,
"step": 136
},
{
"epoch": 0.01529375237656679,
"grad_norm": 0.18450582027435303,
"learning_rate": 4.6575e-05,
"loss": 3.6345,
"step": 137
},
{
"epoch": 0.015405385605592825,
"grad_norm": 0.18607810139656067,
"learning_rate": 4.655000000000001e-05,
"loss": 3.8005,
"step": 138
},
{
"epoch": 0.015517018834618859,
"grad_norm": 0.18932712078094482,
"learning_rate": 4.6525e-05,
"loss": 3.7813,
"step": 139
},
{
"epoch": 0.015628652063644895,
"grad_norm": 0.19080808758735657,
"learning_rate": 4.6500000000000005e-05,
"loss": 3.7997,
"step": 140
},
{
"epoch": 0.01574028529267093,
"grad_norm": 0.1940724104642868,
"learning_rate": 4.6475000000000005e-05,
"loss": 3.7511,
"step": 141
},
{
"epoch": 0.015851918521696963,
"grad_norm": 0.19110755622386932,
"learning_rate": 4.6450000000000004e-05,
"loss": 3.7172,
"step": 142
},
{
"epoch": 0.015963551750723,
"grad_norm": 0.19346538186073303,
"learning_rate": 4.6425000000000004e-05,
"loss": 3.6824,
"step": 143
},
{
"epoch": 0.016075184979749035,
"grad_norm": 0.20193922519683838,
"learning_rate": 4.64e-05,
"loss": 3.7527,
"step": 144
},
{
"epoch": 0.01618681820877507,
"grad_norm": 0.19049830734729767,
"learning_rate": 4.6375e-05,
"loss": 3.7024,
"step": 145
},
{
"epoch": 0.016298451437801103,
"grad_norm": 0.20719942450523376,
"learning_rate": 4.635e-05,
"loss": 3.6926,
"step": 146
},
{
"epoch": 0.01641008466682714,
"grad_norm": 0.1987176239490509,
"learning_rate": 4.6325e-05,
"loss": 3.7588,
"step": 147
},
{
"epoch": 0.016521717895853175,
"grad_norm": 0.19539745151996613,
"learning_rate": 4.630000000000001e-05,
"loss": 3.7433,
"step": 148
},
{
"epoch": 0.01663335112487921,
"grad_norm": 0.1892002671957016,
"learning_rate": 4.6275e-05,
"loss": 3.7502,
"step": 149
},
{
"epoch": 0.016744984353905243,
"grad_norm": 0.17843297123908997,
"learning_rate": 4.6250000000000006e-05,
"loss": 3.7547,
"step": 150
},
{
"epoch": 0.01685661758293128,
"grad_norm": 0.1873432993888855,
"learning_rate": 4.6225e-05,
"loss": 3.6625,
"step": 151
},
{
"epoch": 0.016968250811957315,
"grad_norm": 0.19480037689208984,
"learning_rate": 4.6200000000000005e-05,
"loss": 3.6946,
"step": 152
},
{
"epoch": 0.01707988404098335,
"grad_norm": 0.18758131563663483,
"learning_rate": 4.6175000000000004e-05,
"loss": 3.6578,
"step": 153
},
{
"epoch": 0.017191517270009383,
"grad_norm": 0.2024543583393097,
"learning_rate": 4.6150000000000004e-05,
"loss": 3.7395,
"step": 154
},
{
"epoch": 0.01730315049903542,
"grad_norm": 0.2026447355747223,
"learning_rate": 4.6125e-05,
"loss": 3.6039,
"step": 155
},
{
"epoch": 0.017414783728061455,
"grad_norm": 0.18943609297275543,
"learning_rate": 4.61e-05,
"loss": 3.7418,
"step": 156
},
{
"epoch": 0.01752641695708749,
"grad_norm": 0.19075363874435425,
"learning_rate": 4.6075e-05,
"loss": 3.6281,
"step": 157
},
{
"epoch": 0.017638050186113523,
"grad_norm": 0.19054122269153595,
"learning_rate": 4.605e-05,
"loss": 3.745,
"step": 158
},
{
"epoch": 0.01774968341513956,
"grad_norm": 0.2247437834739685,
"learning_rate": 4.6025e-05,
"loss": 3.6314,
"step": 159
},
{
"epoch": 0.017861316644165594,
"grad_norm": 0.20335987210273743,
"learning_rate": 4.600000000000001e-05,
"loss": 3.7077,
"step": 160
},
{
"epoch": 0.01797294987319163,
"grad_norm": 0.18995296955108643,
"learning_rate": 4.5975e-05,
"loss": 3.7102,
"step": 161
},
{
"epoch": 0.018084583102217663,
"grad_norm": 0.19453871250152588,
"learning_rate": 4.5950000000000006e-05,
"loss": 3.8033,
"step": 162
},
{
"epoch": 0.0181962163312437,
"grad_norm": 0.1937822848558426,
"learning_rate": 4.5925e-05,
"loss": 3.7512,
"step": 163
},
{
"epoch": 0.018307849560269734,
"grad_norm": 0.20775523781776428,
"learning_rate": 4.5900000000000004e-05,
"loss": 3.7665,
"step": 164
},
{
"epoch": 0.01841948278929577,
"grad_norm": 0.20299625396728516,
"learning_rate": 4.5875000000000004e-05,
"loss": 3.8539,
"step": 165
},
{
"epoch": 0.018531116018321803,
"grad_norm": 0.21265819668769836,
"learning_rate": 4.585e-05,
"loss": 3.735,
"step": 166
},
{
"epoch": 0.01864274924734784,
"grad_norm": 0.19450101256370544,
"learning_rate": 4.5825e-05,
"loss": 3.7591,
"step": 167
},
{
"epoch": 0.018754382476373874,
"grad_norm": 0.18429763615131378,
"learning_rate": 4.58e-05,
"loss": 3.7293,
"step": 168
},
{
"epoch": 0.01886601570539991,
"grad_norm": 0.19988296926021576,
"learning_rate": 4.5775e-05,
"loss": 3.7649,
"step": 169
},
{
"epoch": 0.018977648934425943,
"grad_norm": 0.19402478635311127,
"learning_rate": 4.575e-05,
"loss": 3.6106,
"step": 170
},
{
"epoch": 0.01908928216345198,
"grad_norm": 0.18561923503875732,
"learning_rate": 4.5725e-05,
"loss": 3.7321,
"step": 171
},
{
"epoch": 0.019200915392478014,
"grad_norm": 0.19031959772109985,
"learning_rate": 4.5700000000000006e-05,
"loss": 3.7145,
"step": 172
},
{
"epoch": 0.01931254862150405,
"grad_norm": 0.19055528938770294,
"learning_rate": 4.5675e-05,
"loss": 3.7795,
"step": 173
},
{
"epoch": 0.019424181850530083,
"grad_norm": 0.22101019322872162,
"learning_rate": 4.5650000000000005e-05,
"loss": 3.7475,
"step": 174
},
{
"epoch": 0.019535815079556117,
"grad_norm": 0.18215657770633698,
"learning_rate": 4.5625e-05,
"loss": 3.766,
"step": 175
},
{
"epoch": 0.019647448308582154,
"grad_norm": 0.19932851195335388,
"learning_rate": 4.5600000000000004e-05,
"loss": 3.6191,
"step": 176
},
{
"epoch": 0.01975908153760819,
"grad_norm": 0.18989135324954987,
"learning_rate": 4.5575e-05,
"loss": 3.6882,
"step": 177
},
{
"epoch": 0.019870714766634223,
"grad_norm": 0.18184684216976166,
"learning_rate": 4.555e-05,
"loss": 3.7387,
"step": 178
},
{
"epoch": 0.019982347995660257,
"grad_norm": 0.1986314058303833,
"learning_rate": 4.5525e-05,
"loss": 3.6494,
"step": 179
},
{
"epoch": 0.020093981224686294,
"grad_norm": 0.19898182153701782,
"learning_rate": 4.55e-05,
"loss": 3.5578,
"step": 180
},
{
"epoch": 0.02020561445371233,
"grad_norm": 0.1983109712600708,
"learning_rate": 4.5475e-05,
"loss": 3.7309,
"step": 181
},
{
"epoch": 0.020317247682738362,
"grad_norm": 0.19380560517311096,
"learning_rate": 4.545000000000001e-05,
"loss": 3.7312,
"step": 182
},
{
"epoch": 0.020428880911764397,
"grad_norm": 0.21668657660484314,
"learning_rate": 4.5425e-05,
"loss": 3.7747,
"step": 183
},
{
"epoch": 0.020540514140790434,
"grad_norm": 0.19565781950950623,
"learning_rate": 4.5400000000000006e-05,
"loss": 3.6473,
"step": 184
},
{
"epoch": 0.02065214736981647,
"grad_norm": 0.19188393652439117,
"learning_rate": 4.5375e-05,
"loss": 3.6869,
"step": 185
},
{
"epoch": 0.020763780598842502,
"grad_norm": 0.19892022013664246,
"learning_rate": 4.5350000000000005e-05,
"loss": 3.6785,
"step": 186
},
{
"epoch": 0.020875413827868537,
"grad_norm": 0.1916678249835968,
"learning_rate": 4.5325000000000004e-05,
"loss": 3.7164,
"step": 187
},
{
"epoch": 0.020987047056894574,
"grad_norm": 0.2172665148973465,
"learning_rate": 4.53e-05,
"loss": 3.6778,
"step": 188
},
{
"epoch": 0.021098680285920608,
"grad_norm": 0.19170907139778137,
"learning_rate": 4.5275e-05,
"loss": 3.7764,
"step": 189
},
{
"epoch": 0.021210313514946642,
"grad_norm": 0.1953432410955429,
"learning_rate": 4.525e-05,
"loss": 3.7746,
"step": 190
},
{
"epoch": 0.021321946743972676,
"grad_norm": 0.1913440227508545,
"learning_rate": 4.5225e-05,
"loss": 3.6643,
"step": 191
},
{
"epoch": 0.021433579972998714,
"grad_norm": 0.2013910412788391,
"learning_rate": 4.52e-05,
"loss": 3.7012,
"step": 192
},
{
"epoch": 0.021545213202024748,
"grad_norm": 0.19252702593803406,
"learning_rate": 4.5175e-05,
"loss": 3.6495,
"step": 193
},
{
"epoch": 0.021656846431050782,
"grad_norm": 0.19323377311229706,
"learning_rate": 4.5150000000000006e-05,
"loss": 3.6549,
"step": 194
},
{
"epoch": 0.021768479660076816,
"grad_norm": 0.1997387707233429,
"learning_rate": 4.5125e-05,
"loss": 3.7394,
"step": 195
},
{
"epoch": 0.021880112889102854,
"grad_norm": 0.19296503067016602,
"learning_rate": 4.5100000000000005e-05,
"loss": 3.6254,
"step": 196
},
{
"epoch": 0.021991746118128888,
"grad_norm": 0.18545635044574738,
"learning_rate": 4.5075e-05,
"loss": 3.6521,
"step": 197
},
{
"epoch": 0.022103379347154922,
"grad_norm": 0.18814872205257416,
"learning_rate": 4.5050000000000004e-05,
"loss": 3.7014,
"step": 198
},
{
"epoch": 0.022215012576180956,
"grad_norm": 0.1876133382320404,
"learning_rate": 4.5025000000000003e-05,
"loss": 3.7492,
"step": 199
},
{
"epoch": 0.022326645805206994,
"grad_norm": 0.207890123128891,
"learning_rate": 4.5e-05,
"loss": 3.7278,
"step": 200
},
{
"epoch": 0.022438279034233028,
"grad_norm": 0.19480448961257935,
"learning_rate": 4.4975e-05,
"loss": 3.6504,
"step": 201
},
{
"epoch": 0.022549912263259062,
"grad_norm": 0.191660076379776,
"learning_rate": 4.495e-05,
"loss": 3.7587,
"step": 202
},
{
"epoch": 0.022661545492285096,
"grad_norm": 0.19026733934879303,
"learning_rate": 4.4925e-05,
"loss": 3.6108,
"step": 203
},
{
"epoch": 0.022773178721311134,
"grad_norm": 0.1875009387731552,
"learning_rate": 4.49e-05,
"loss": 3.6958,
"step": 204
},
{
"epoch": 0.022884811950337168,
"grad_norm": 0.1946217566728592,
"learning_rate": 4.4875e-05,
"loss": 3.6406,
"step": 205
},
{
"epoch": 0.022996445179363202,
"grad_norm": 0.19868050515651703,
"learning_rate": 4.4850000000000006e-05,
"loss": 3.6862,
"step": 206
},
{
"epoch": 0.023108078408389236,
"grad_norm": 0.1906341016292572,
"learning_rate": 4.4825e-05,
"loss": 3.6233,
"step": 207
},
{
"epoch": 0.02321971163741527,
"grad_norm": 0.2086857259273529,
"learning_rate": 4.4800000000000005e-05,
"loss": 3.7119,
"step": 208
},
{
"epoch": 0.023331344866441308,
"grad_norm": 0.17429529130458832,
"learning_rate": 4.4775e-05,
"loss": 3.6995,
"step": 209
},
{
"epoch": 0.023442978095467342,
"grad_norm": 0.18736615777015686,
"learning_rate": 4.4750000000000004e-05,
"loss": 3.7669,
"step": 210
},
{
"epoch": 0.023554611324493376,
"grad_norm": 0.21085159480571747,
"learning_rate": 4.4725e-05,
"loss": 3.7207,
"step": 211
},
{
"epoch": 0.02366624455351941,
"grad_norm": 0.20404388010501862,
"learning_rate": 4.47e-05,
"loss": 3.7233,
"step": 212
},
{
"epoch": 0.023777877782545448,
"grad_norm": 0.19512401521205902,
"learning_rate": 4.4675e-05,
"loss": 3.7256,
"step": 213
},
{
"epoch": 0.023889511011571482,
"grad_norm": 0.20508058369159698,
"learning_rate": 4.465e-05,
"loss": 3.684,
"step": 214
},
{
"epoch": 0.024001144240597516,
"grad_norm": 0.18363450467586517,
"learning_rate": 4.4625e-05,
"loss": 3.7576,
"step": 215
},
{
"epoch": 0.02411277746962355,
"grad_norm": 0.21645157039165497,
"learning_rate": 4.46e-05,
"loss": 3.6316,
"step": 216
},
{
"epoch": 0.024224410698649588,
"grad_norm": 0.19466425478458405,
"learning_rate": 4.4575e-05,
"loss": 3.6131,
"step": 217
},
{
"epoch": 0.024336043927675622,
"grad_norm": 0.1953246146440506,
"learning_rate": 4.4550000000000005e-05,
"loss": 3.7037,
"step": 218
},
{
"epoch": 0.024447677156701656,
"grad_norm": 0.1961262971162796,
"learning_rate": 4.4525e-05,
"loss": 3.5995,
"step": 219
},
{
"epoch": 0.02455931038572769,
"grad_norm": 0.1917540282011032,
"learning_rate": 4.4500000000000004e-05,
"loss": 3.6955,
"step": 220
},
{
"epoch": 0.024670943614753728,
"grad_norm": 0.2062792181968689,
"learning_rate": 4.4475e-05,
"loss": 3.7629,
"step": 221
},
{
"epoch": 0.024782576843779762,
"grad_norm": 0.2323463410139084,
"learning_rate": 4.445e-05,
"loss": 3.7542,
"step": 222
},
{
"epoch": 0.024894210072805796,
"grad_norm": 0.24410179257392883,
"learning_rate": 4.4425e-05,
"loss": 3.7321,
"step": 223
},
{
"epoch": 0.02500584330183183,
"grad_norm": 0.19220338761806488,
"learning_rate": 4.44e-05,
"loss": 3.6547,
"step": 224
},
{
"epoch": 0.025117476530857868,
"grad_norm": 0.2187253087759018,
"learning_rate": 4.4375e-05,
"loss": 3.7394,
"step": 225
},
{
"epoch": 0.025229109759883902,
"grad_norm": 0.19334666430950165,
"learning_rate": 4.435e-05,
"loss": 3.7209,
"step": 226
},
{
"epoch": 0.025340742988909936,
"grad_norm": 0.185744971036911,
"learning_rate": 4.4325e-05,
"loss": 3.7263,
"step": 227
},
{
"epoch": 0.02545237621793597,
"grad_norm": 0.19216570258140564,
"learning_rate": 4.43e-05,
"loss": 3.7028,
"step": 228
},
{
"epoch": 0.025564009446962008,
"grad_norm": 0.19409343600273132,
"learning_rate": 4.4275e-05,
"loss": 3.6517,
"step": 229
},
{
"epoch": 0.025675642675988042,
"grad_norm": 0.19789241254329681,
"learning_rate": 4.4250000000000005e-05,
"loss": 3.6823,
"step": 230
},
{
"epoch": 0.025787275905014076,
"grad_norm": 0.19082917273044586,
"learning_rate": 4.4225e-05,
"loss": 3.6572,
"step": 231
},
{
"epoch": 0.02589890913404011,
"grad_norm": 0.19395306706428528,
"learning_rate": 4.4200000000000004e-05,
"loss": 3.6305,
"step": 232
},
{
"epoch": 0.026010542363066148,
"grad_norm": 0.19945797324180603,
"learning_rate": 4.4174999999999996e-05,
"loss": 3.5742,
"step": 233
},
{
"epoch": 0.026122175592092182,
"grad_norm": 0.2004850208759308,
"learning_rate": 4.415e-05,
"loss": 3.5701,
"step": 234
},
{
"epoch": 0.026233808821118216,
"grad_norm": 0.19983965158462524,
"learning_rate": 4.4125e-05,
"loss": 3.6424,
"step": 235
},
{
"epoch": 0.02634544205014425,
"grad_norm": 0.18810872733592987,
"learning_rate": 4.41e-05,
"loss": 3.6743,
"step": 236
},
{
"epoch": 0.026457075279170288,
"grad_norm": 0.1970721185207367,
"learning_rate": 4.4075e-05,
"loss": 3.8525,
"step": 237
},
{
"epoch": 0.026568708508196322,
"grad_norm": 0.1864660233259201,
"learning_rate": 4.405e-05,
"loss": 3.6585,
"step": 238
},
{
"epoch": 0.026680341737222356,
"grad_norm": 0.19277943670749664,
"learning_rate": 4.4025e-05,
"loss": 3.6185,
"step": 239
},
{
"epoch": 0.02679197496624839,
"grad_norm": 0.205641508102417,
"learning_rate": 4.4000000000000006e-05,
"loss": 3.6228,
"step": 240
},
{
"epoch": 0.026903608195274428,
"grad_norm": 0.18816497921943665,
"learning_rate": 4.3975e-05,
"loss": 3.6762,
"step": 241
},
{
"epoch": 0.02701524142430046,
"grad_norm": 0.19171147048473358,
"learning_rate": 4.3950000000000004e-05,
"loss": 3.6941,
"step": 242
},
{
"epoch": 0.027126874653326496,
"grad_norm": 0.21295049786567688,
"learning_rate": 4.3925e-05,
"loss": 3.6384,
"step": 243
},
{
"epoch": 0.02723850788235253,
"grad_norm": 0.19490571320056915,
"learning_rate": 4.39e-05,
"loss": 3.7484,
"step": 244
},
{
"epoch": 0.027350141111378564,
"grad_norm": 0.1877412497997284,
"learning_rate": 4.3875e-05,
"loss": 3.6416,
"step": 245
},
{
"epoch": 0.0274617743404046,
"grad_norm": 0.18955086171627045,
"learning_rate": 4.385e-05,
"loss": 3.6988,
"step": 246
},
{
"epoch": 0.027573407569430636,
"grad_norm": 0.19837862253189087,
"learning_rate": 4.3825e-05,
"loss": 3.6603,
"step": 247
},
{
"epoch": 0.02768504079845667,
"grad_norm": 0.19094811379909515,
"learning_rate": 4.38e-05,
"loss": 3.5806,
"step": 248
},
{
"epoch": 0.027796674027482704,
"grad_norm": 0.19856730103492737,
"learning_rate": 4.3775e-05,
"loss": 3.6971,
"step": 249
},
{
"epoch": 0.02790830725650874,
"grad_norm": 0.19249659776687622,
"learning_rate": 4.375e-05,
"loss": 3.7003,
"step": 250
},
{
"epoch": 0.028019940485534776,
"grad_norm": 0.20369058847427368,
"learning_rate": 4.3725000000000006e-05,
"loss": 3.5656,
"step": 251
},
{
"epoch": 0.02813157371456081,
"grad_norm": 0.17702902853488922,
"learning_rate": 4.3700000000000005e-05,
"loss": 3.6978,
"step": 252
},
{
"epoch": 0.028243206943586844,
"grad_norm": 0.19231531023979187,
"learning_rate": 4.3675000000000005e-05,
"loss": 3.7294,
"step": 253
},
{
"epoch": 0.02835484017261288,
"grad_norm": 0.20242364704608917,
"learning_rate": 4.3650000000000004e-05,
"loss": 3.6087,
"step": 254
},
{
"epoch": 0.028466473401638916,
"grad_norm": 0.19615139067173004,
"learning_rate": 4.3625e-05,
"loss": 3.7132,
"step": 255
},
{
"epoch": 0.02857810663066495,
"grad_norm": 0.21341873705387115,
"learning_rate": 4.36e-05,
"loss": 3.5695,
"step": 256
},
{
"epoch": 0.028689739859690984,
"grad_norm": 0.20128357410430908,
"learning_rate": 4.3575e-05,
"loss": 3.5831,
"step": 257
},
{
"epoch": 0.02880137308871702,
"grad_norm": 0.19834469258785248,
"learning_rate": 4.355e-05,
"loss": 3.6423,
"step": 258
},
{
"epoch": 0.028913006317743056,
"grad_norm": 0.20700770616531372,
"learning_rate": 4.352500000000001e-05,
"loss": 3.739,
"step": 259
},
{
"epoch": 0.02902463954676909,
"grad_norm": 0.1813947558403015,
"learning_rate": 4.35e-05,
"loss": 3.5878,
"step": 260
},
{
"epoch": 0.029136272775795124,
"grad_norm": 0.19542041420936584,
"learning_rate": 4.3475000000000006e-05,
"loss": 3.6467,
"step": 261
},
{
"epoch": 0.02924790600482116,
"grad_norm": 0.1923271268606186,
"learning_rate": 4.345e-05,
"loss": 3.7348,
"step": 262
},
{
"epoch": 0.029359539233847196,
"grad_norm": 0.1988571584224701,
"learning_rate": 4.3425000000000005e-05,
"loss": 3.6156,
"step": 263
},
{
"epoch": 0.02947117246287323,
"grad_norm": 0.21727347373962402,
"learning_rate": 4.3400000000000005e-05,
"loss": 3.7439,
"step": 264
},
{
"epoch": 0.029582805691899264,
"grad_norm": 0.1942213624715805,
"learning_rate": 4.3375000000000004e-05,
"loss": 3.7865,
"step": 265
},
{
"epoch": 0.0296944389209253,
"grad_norm": 0.18657447397708893,
"learning_rate": 4.335e-05,
"loss": 3.7074,
"step": 266
},
{
"epoch": 0.029806072149951335,
"grad_norm": 0.19644992053508759,
"learning_rate": 4.3325e-05,
"loss": 3.7003,
"step": 267
},
{
"epoch": 0.02991770537897737,
"grad_norm": 0.1929520219564438,
"learning_rate": 4.33e-05,
"loss": 3.7275,
"step": 268
},
{
"epoch": 0.030029338608003404,
"grad_norm": 0.20509083569049835,
"learning_rate": 4.3275e-05,
"loss": 3.683,
"step": 269
},
{
"epoch": 0.03014097183702944,
"grad_norm": 0.18939244747161865,
"learning_rate": 4.325e-05,
"loss": 3.7679,
"step": 270
},
{
"epoch": 0.030252605066055475,
"grad_norm": 0.1947099268436432,
"learning_rate": 4.322500000000001e-05,
"loss": 3.7265,
"step": 271
},
{
"epoch": 0.03036423829508151,
"grad_norm": 0.19855187833309174,
"learning_rate": 4.32e-05,
"loss": 3.5712,
"step": 272
},
{
"epoch": 0.030475871524107544,
"grad_norm": 0.19451723992824554,
"learning_rate": 4.3175000000000006e-05,
"loss": 3.7446,
"step": 273
},
{
"epoch": 0.03058750475313358,
"grad_norm": 0.1808876097202301,
"learning_rate": 4.315e-05,
"loss": 3.6798,
"step": 274
},
{
"epoch": 0.030699137982159615,
"grad_norm": 0.21147924661636353,
"learning_rate": 4.3125000000000005e-05,
"loss": 3.6293,
"step": 275
},
{
"epoch": 0.03081077121118565,
"grad_norm": 0.20331715047359467,
"learning_rate": 4.3100000000000004e-05,
"loss": 3.7861,
"step": 276
},
{
"epoch": 0.030922404440211684,
"grad_norm": 0.19753040373325348,
"learning_rate": 4.3075000000000003e-05,
"loss": 3.6638,
"step": 277
},
{
"epoch": 0.031034037669237718,
"grad_norm": 0.2017693817615509,
"learning_rate": 4.305e-05,
"loss": 3.5948,
"step": 278
},
{
"epoch": 0.031145670898263755,
"grad_norm": 0.19134145975112915,
"learning_rate": 4.3025e-05,
"loss": 3.6196,
"step": 279
},
{
"epoch": 0.03125730412728979,
"grad_norm": 0.1985182762145996,
"learning_rate": 4.3e-05,
"loss": 3.6017,
"step": 280
},
{
"epoch": 0.031368937356315824,
"grad_norm": 0.2032870203256607,
"learning_rate": 4.2975e-05,
"loss": 3.611,
"step": 281
},
{
"epoch": 0.03148057058534186,
"grad_norm": 0.21434105932712555,
"learning_rate": 4.295e-05,
"loss": 3.6062,
"step": 282
},
{
"epoch": 0.03159220381436789,
"grad_norm": 0.1890399158000946,
"learning_rate": 4.2925000000000007e-05,
"loss": 3.773,
"step": 283
},
{
"epoch": 0.031703837043393926,
"grad_norm": 0.19301185011863708,
"learning_rate": 4.29e-05,
"loss": 3.6239,
"step": 284
},
{
"epoch": 0.03181547027241997,
"grad_norm": 0.1981291025876999,
"learning_rate": 4.2875000000000005e-05,
"loss": 3.6724,
"step": 285
},
{
"epoch": 0.031927103501446,
"grad_norm": 0.1951553225517273,
"learning_rate": 4.285e-05,
"loss": 3.6256,
"step": 286
},
{
"epoch": 0.032038736730472035,
"grad_norm": 0.2018650472164154,
"learning_rate": 4.2825000000000004e-05,
"loss": 3.6787,
"step": 287
},
{
"epoch": 0.03215036995949807,
"grad_norm": 0.2058183252811432,
"learning_rate": 4.2800000000000004e-05,
"loss": 3.7439,
"step": 288
},
{
"epoch": 0.032262003188524103,
"grad_norm": 0.20262151956558228,
"learning_rate": 4.2775e-05,
"loss": 3.641,
"step": 289
},
{
"epoch": 0.03237363641755014,
"grad_norm": 0.19902455806732178,
"learning_rate": 4.275e-05,
"loss": 3.6999,
"step": 290
},
{
"epoch": 0.03248526964657617,
"grad_norm": 0.18562380969524384,
"learning_rate": 4.2725e-05,
"loss": 3.5406,
"step": 291
},
{
"epoch": 0.032596902875602206,
"grad_norm": 0.20395490527153015,
"learning_rate": 4.27e-05,
"loss": 3.6466,
"step": 292
},
{
"epoch": 0.03270853610462825,
"grad_norm": 0.2005227506160736,
"learning_rate": 4.2675e-05,
"loss": 3.6424,
"step": 293
},
{
"epoch": 0.03282016933365428,
"grad_norm": 0.20853383839130402,
"learning_rate": 4.265e-05,
"loss": 3.5586,
"step": 294
},
{
"epoch": 0.032931802562680315,
"grad_norm": 0.2118186354637146,
"learning_rate": 4.2625000000000006e-05,
"loss": 3.7243,
"step": 295
},
{
"epoch": 0.03304343579170635,
"grad_norm": 0.19089064002037048,
"learning_rate": 4.26e-05,
"loss": 3.6396,
"step": 296
},
{
"epoch": 0.03315506902073238,
"grad_norm": 0.20549426972866058,
"learning_rate": 4.2575000000000005e-05,
"loss": 3.5532,
"step": 297
},
{
"epoch": 0.03326670224975842,
"grad_norm": 0.20758430659770966,
"learning_rate": 4.2550000000000004e-05,
"loss": 3.7309,
"step": 298
},
{
"epoch": 0.03337833547878445,
"grad_norm": 0.2008824646472931,
"learning_rate": 4.2525000000000004e-05,
"loss": 3.682,
"step": 299
},
{
"epoch": 0.033489968707810486,
"grad_norm": 0.18740810453891754,
"learning_rate": 4.25e-05,
"loss": 3.6559,
"step": 300
},
{
"epoch": 0.03360160193683653,
"grad_norm": 0.19573578238487244,
"learning_rate": 4.2475e-05,
"loss": 3.6254,
"step": 301
},
{
"epoch": 0.03371323516586256,
"grad_norm": 0.2099771499633789,
"learning_rate": 4.245e-05,
"loss": 3.6685,
"step": 302
},
{
"epoch": 0.033824868394888595,
"grad_norm": 0.19080573320388794,
"learning_rate": 4.2425e-05,
"loss": 3.64,
"step": 303
},
{
"epoch": 0.03393650162391463,
"grad_norm": 0.18675819039344788,
"learning_rate": 4.24e-05,
"loss": 3.7349,
"step": 304
},
{
"epoch": 0.03404813485294066,
"grad_norm": 0.22048161923885345,
"learning_rate": 4.237500000000001e-05,
"loss": 3.6219,
"step": 305
},
{
"epoch": 0.0341597680819667,
"grad_norm": 0.21092446148395538,
"learning_rate": 4.235e-05,
"loss": 3.5843,
"step": 306
},
{
"epoch": 0.03427140131099273,
"grad_norm": 0.20214509963989258,
"learning_rate": 4.2325000000000006e-05,
"loss": 3.7247,
"step": 307
},
{
"epoch": 0.034383034540018766,
"grad_norm": 0.19769619405269623,
"learning_rate": 4.23e-05,
"loss": 3.6119,
"step": 308
},
{
"epoch": 0.03449466776904481,
"grad_norm": 0.20578259229660034,
"learning_rate": 4.2275000000000004e-05,
"loss": 3.6434,
"step": 309
},
{
"epoch": 0.03460630099807084,
"grad_norm": 0.21981282532215118,
"learning_rate": 4.2250000000000004e-05,
"loss": 3.6881,
"step": 310
},
{
"epoch": 0.034717934227096875,
"grad_norm": 0.2038237452507019,
"learning_rate": 4.2225e-05,
"loss": 3.6154,
"step": 311
},
{
"epoch": 0.03482956745612291,
"grad_norm": 0.190349742770195,
"learning_rate": 4.22e-05,
"loss": 3.7292,
"step": 312
},
{
"epoch": 0.03494120068514894,
"grad_norm": 0.22228002548217773,
"learning_rate": 4.2175e-05,
"loss": 3.6014,
"step": 313
},
{
"epoch": 0.03505283391417498,
"grad_norm": 0.21092984080314636,
"learning_rate": 4.215e-05,
"loss": 3.6744,
"step": 314
},
{
"epoch": 0.03516446714320101,
"grad_norm": 0.19800858199596405,
"learning_rate": 4.2125e-05,
"loss": 3.6443,
"step": 315
},
{
"epoch": 0.035276100372227046,
"grad_norm": 0.20065619051456451,
"learning_rate": 4.21e-05,
"loss": 3.5728,
"step": 316
},
{
"epoch": 0.03538773360125308,
"grad_norm": 0.19413913786411285,
"learning_rate": 4.2075000000000006e-05,
"loss": 3.6795,
"step": 317
},
{
"epoch": 0.03549936683027912,
"grad_norm": 0.20133784413337708,
"learning_rate": 4.205e-05,
"loss": 3.6743,
"step": 318
},
{
"epoch": 0.035611000059305155,
"grad_norm": 0.20432250201702118,
"learning_rate": 4.2025000000000005e-05,
"loss": 3.6817,
"step": 319
},
{
"epoch": 0.03572263328833119,
"grad_norm": 0.18469202518463135,
"learning_rate": 4.2e-05,
"loss": 3.6151,
"step": 320
},
{
"epoch": 0.03583426651735722,
"grad_norm": 0.22478121519088745,
"learning_rate": 4.1975000000000004e-05,
"loss": 3.5854,
"step": 321
},
{
"epoch": 0.03594589974638326,
"grad_norm": 0.22018073499202728,
"learning_rate": 4.195e-05,
"loss": 3.5832,
"step": 322
},
{
"epoch": 0.03605753297540929,
"grad_norm": 0.21157695353031158,
"learning_rate": 4.1925e-05,
"loss": 3.6343,
"step": 323
},
{
"epoch": 0.036169166204435325,
"grad_norm": 0.2132994532585144,
"learning_rate": 4.19e-05,
"loss": 3.6503,
"step": 324
},
{
"epoch": 0.03628079943346136,
"grad_norm": 0.21192209422588348,
"learning_rate": 4.1875e-05,
"loss": 3.6667,
"step": 325
},
{
"epoch": 0.0363924326624874,
"grad_norm": 0.21237416565418243,
"learning_rate": 4.185e-05,
"loss": 3.6533,
"step": 326
},
{
"epoch": 0.036504065891513435,
"grad_norm": 0.19881536066532135,
"learning_rate": 4.1825e-05,
"loss": 3.5839,
"step": 327
},
{
"epoch": 0.03661569912053947,
"grad_norm": 0.2148139625787735,
"learning_rate": 4.18e-05,
"loss": 3.5074,
"step": 328
},
{
"epoch": 0.0367273323495655,
"grad_norm": 0.20267094671726227,
"learning_rate": 4.1775000000000006e-05,
"loss": 3.6637,
"step": 329
},
{
"epoch": 0.03683896557859154,
"grad_norm": 0.21256805956363678,
"learning_rate": 4.175e-05,
"loss": 3.5736,
"step": 330
},
{
"epoch": 0.03695059880761757,
"grad_norm": 0.2187931090593338,
"learning_rate": 4.1725000000000005e-05,
"loss": 3.6869,
"step": 331
},
{
"epoch": 0.037062232036643605,
"grad_norm": 0.189712792634964,
"learning_rate": 4.17e-05,
"loss": 3.6789,
"step": 332
},
{
"epoch": 0.03717386526566964,
"grad_norm": 0.2027687430381775,
"learning_rate": 4.1675e-05,
"loss": 3.6633,
"step": 333
},
{
"epoch": 0.03728549849469568,
"grad_norm": 0.1924295276403427,
"learning_rate": 4.165e-05,
"loss": 3.6169,
"step": 334
},
{
"epoch": 0.037397131723721715,
"grad_norm": 0.2029048353433609,
"learning_rate": 4.1625e-05,
"loss": 3.6223,
"step": 335
},
{
"epoch": 0.03750876495274775,
"grad_norm": 0.19263207912445068,
"learning_rate": 4.16e-05,
"loss": 3.718,
"step": 336
},
{
"epoch": 0.03762039818177378,
"grad_norm": 0.2766994833946228,
"learning_rate": 4.1575e-05,
"loss": 3.698,
"step": 337
},
{
"epoch": 0.03773203141079982,
"grad_norm": 0.19502456486225128,
"learning_rate": 4.155e-05,
"loss": 3.5847,
"step": 338
},
{
"epoch": 0.03784366463982585,
"grad_norm": 0.20540866255760193,
"learning_rate": 4.1525e-05,
"loss": 3.6341,
"step": 339
},
{
"epoch": 0.037955297868851885,
"grad_norm": 0.1927708089351654,
"learning_rate": 4.15e-05,
"loss": 3.5672,
"step": 340
},
{
"epoch": 0.03806693109787792,
"grad_norm": 0.19946596026420593,
"learning_rate": 4.1475000000000005e-05,
"loss": 3.6532,
"step": 341
},
{
"epoch": 0.03817856432690396,
"grad_norm": 0.196825310587883,
"learning_rate": 4.145e-05,
"loss": 3.6103,
"step": 342
},
{
"epoch": 0.038290197555929995,
"grad_norm": 0.19220396876335144,
"learning_rate": 4.1425000000000004e-05,
"loss": 3.586,
"step": 343
},
{
"epoch": 0.03840183078495603,
"grad_norm": 0.20811744034290314,
"learning_rate": 4.14e-05,
"loss": 3.6879,
"step": 344
},
{
"epoch": 0.03851346401398206,
"grad_norm": 0.20219473540782928,
"learning_rate": 4.1375e-05,
"loss": 3.5855,
"step": 345
},
{
"epoch": 0.0386250972430081,
"grad_norm": 0.19466525316238403,
"learning_rate": 4.135e-05,
"loss": 3.6326,
"step": 346
},
{
"epoch": 0.03873673047203413,
"grad_norm": 0.2158079296350479,
"learning_rate": 4.1325e-05,
"loss": 3.5414,
"step": 347
},
{
"epoch": 0.038848363701060165,
"grad_norm": 0.1987825483083725,
"learning_rate": 4.13e-05,
"loss": 3.4823,
"step": 348
},
{
"epoch": 0.0389599969300862,
"grad_norm": 0.1963677555322647,
"learning_rate": 4.1275e-05,
"loss": 3.5963,
"step": 349
},
{
"epoch": 0.03907163015911223,
"grad_norm": 0.19855019450187683,
"learning_rate": 4.125e-05,
"loss": 3.6206,
"step": 350
},
{
"epoch": 0.039183263388138274,
"grad_norm": 0.22052060067653656,
"learning_rate": 4.1225e-05,
"loss": 3.7545,
"step": 351
},
{
"epoch": 0.03929489661716431,
"grad_norm": 0.20555360615253448,
"learning_rate": 4.12e-05,
"loss": 3.656,
"step": 352
},
{
"epoch": 0.03940652984619034,
"grad_norm": 0.19108745455741882,
"learning_rate": 4.1175000000000005e-05,
"loss": 3.6913,
"step": 353
},
{
"epoch": 0.03951816307521638,
"grad_norm": 0.20099329948425293,
"learning_rate": 4.115e-05,
"loss": 3.6426,
"step": 354
},
{
"epoch": 0.03962979630424241,
"grad_norm": 0.19872133433818817,
"learning_rate": 4.1125000000000004e-05,
"loss": 3.6809,
"step": 355
},
{
"epoch": 0.039741429533268445,
"grad_norm": 0.1866777539253235,
"learning_rate": 4.11e-05,
"loss": 3.6121,
"step": 356
},
{
"epoch": 0.03985306276229448,
"grad_norm": 0.19584119319915771,
"learning_rate": 4.1075e-05,
"loss": 3.61,
"step": 357
},
{
"epoch": 0.03996469599132051,
"grad_norm": 0.19828204810619354,
"learning_rate": 4.105e-05,
"loss": 3.5095,
"step": 358
},
{
"epoch": 0.040076329220346554,
"grad_norm": 0.20203830301761627,
"learning_rate": 4.1025e-05,
"loss": 3.643,
"step": 359
},
{
"epoch": 0.04018796244937259,
"grad_norm": 0.19631658494472504,
"learning_rate": 4.1e-05,
"loss": 3.5947,
"step": 360
},
{
"epoch": 0.04029959567839862,
"grad_norm": 0.19795821607112885,
"learning_rate": 4.0975e-05,
"loss": 3.6314,
"step": 361
},
{
"epoch": 0.04041122890742466,
"grad_norm": 0.21024641394615173,
"learning_rate": 4.095e-05,
"loss": 3.5205,
"step": 362
},
{
"epoch": 0.04052286213645069,
"grad_norm": 0.200261190533638,
"learning_rate": 4.0925000000000005e-05,
"loss": 3.6623,
"step": 363
},
{
"epoch": 0.040634495365476725,
"grad_norm": 0.20525649189949036,
"learning_rate": 4.09e-05,
"loss": 3.6531,
"step": 364
},
{
"epoch": 0.04074612859450276,
"grad_norm": 0.2618565559387207,
"learning_rate": 4.0875000000000004e-05,
"loss": 3.6337,
"step": 365
},
{
"epoch": 0.04085776182352879,
"grad_norm": 0.18808555603027344,
"learning_rate": 4.085e-05,
"loss": 3.5751,
"step": 366
},
{
"epoch": 0.040969395052554834,
"grad_norm": 0.18894913792610168,
"learning_rate": 4.0825e-05,
"loss": 3.6602,
"step": 367
},
{
"epoch": 0.04108102828158087,
"grad_norm": 0.19801251590251923,
"learning_rate": 4.08e-05,
"loss": 3.6223,
"step": 368
},
{
"epoch": 0.0411926615106069,
"grad_norm": 0.2184888869524002,
"learning_rate": 4.0775e-05,
"loss": 3.6262,
"step": 369
},
{
"epoch": 0.04130429473963294,
"grad_norm": 0.2098560780286789,
"learning_rate": 4.075e-05,
"loss": 3.6877,
"step": 370
},
{
"epoch": 0.04141592796865897,
"grad_norm": 0.18118880689144135,
"learning_rate": 4.0725e-05,
"loss": 3.6821,
"step": 371
},
{
"epoch": 0.041527561197685005,
"grad_norm": 0.2155236452817917,
"learning_rate": 4.07e-05,
"loss": 3.5479,
"step": 372
},
{
"epoch": 0.04163919442671104,
"grad_norm": 0.19501011073589325,
"learning_rate": 4.0675e-05,
"loss": 3.6122,
"step": 373
},
{
"epoch": 0.04175082765573707,
"grad_norm": 0.21303986012935638,
"learning_rate": 4.065e-05,
"loss": 3.6121,
"step": 374
},
{
"epoch": 0.041862460884763114,
"grad_norm": 0.20115773379802704,
"learning_rate": 4.0625000000000005e-05,
"loss": 3.6661,
"step": 375
},
{
"epoch": 0.04197409411378915,
"grad_norm": 0.22303688526153564,
"learning_rate": 4.0600000000000004e-05,
"loss": 3.6614,
"step": 376
},
{
"epoch": 0.04208572734281518,
"grad_norm": 0.20577269792556763,
"learning_rate": 4.0575000000000004e-05,
"loss": 3.604,
"step": 377
},
{
"epoch": 0.042197360571841216,
"grad_norm": 0.2117035835981369,
"learning_rate": 4.055e-05,
"loss": 3.65,
"step": 378
},
{
"epoch": 0.04230899380086725,
"grad_norm": 0.2067400962114334,
"learning_rate": 4.0525e-05,
"loss": 3.6391,
"step": 379
},
{
"epoch": 0.042420627029893285,
"grad_norm": 0.21353572607040405,
"learning_rate": 4.05e-05,
"loss": 3.5458,
"step": 380
},
{
"epoch": 0.04253226025891932,
"grad_norm": 0.21309249103069305,
"learning_rate": 4.0475e-05,
"loss": 3.6584,
"step": 381
},
{
"epoch": 0.04264389348794535,
"grad_norm": 0.21452483534812927,
"learning_rate": 4.045000000000001e-05,
"loss": 3.4598,
"step": 382
},
{
"epoch": 0.04275552671697139,
"grad_norm": 0.20974920690059662,
"learning_rate": 4.0425e-05,
"loss": 3.6451,
"step": 383
},
{
"epoch": 0.04286715994599743,
"grad_norm": 0.2117718905210495,
"learning_rate": 4.0400000000000006e-05,
"loss": 3.6736,
"step": 384
},
{
"epoch": 0.04297879317502346,
"grad_norm": 0.20450986921787262,
"learning_rate": 4.0375e-05,
"loss": 3.5988,
"step": 385
},
{
"epoch": 0.043090426404049496,
"grad_norm": 0.24159705638885498,
"learning_rate": 4.0350000000000005e-05,
"loss": 3.6933,
"step": 386
},
{
"epoch": 0.04320205963307553,
"grad_norm": 0.20752617716789246,
"learning_rate": 4.0325000000000004e-05,
"loss": 3.592,
"step": 387
},
{
"epoch": 0.043313692862101565,
"grad_norm": 0.19693586230278015,
"learning_rate": 4.0300000000000004e-05,
"loss": 3.6746,
"step": 388
},
{
"epoch": 0.0434253260911276,
"grad_norm": 0.201860710978508,
"learning_rate": 4.0275e-05,
"loss": 3.5991,
"step": 389
},
{
"epoch": 0.04353695932015363,
"grad_norm": 0.19570831954479218,
"learning_rate": 4.025e-05,
"loss": 3.565,
"step": 390
},
{
"epoch": 0.04364859254917967,
"grad_norm": 0.19644080102443695,
"learning_rate": 4.0225e-05,
"loss": 3.6053,
"step": 391
},
{
"epoch": 0.04376022577820571,
"grad_norm": 0.19764544069766998,
"learning_rate": 4.02e-05,
"loss": 3.5852,
"step": 392
},
{
"epoch": 0.04387185900723174,
"grad_norm": 0.198360413312912,
"learning_rate": 4.0175e-05,
"loss": 3.5493,
"step": 393
},
{
"epoch": 0.043983492236257776,
"grad_norm": 0.2067759782075882,
"learning_rate": 4.015000000000001e-05,
"loss": 3.6692,
"step": 394
},
{
"epoch": 0.04409512546528381,
"grad_norm": 0.1964801847934723,
"learning_rate": 4.0125e-05,
"loss": 3.6442,
"step": 395
},
{
"epoch": 0.044206758694309845,
"grad_norm": 0.22487924993038177,
"learning_rate": 4.0100000000000006e-05,
"loss": 3.676,
"step": 396
},
{
"epoch": 0.04431839192333588,
"grad_norm": 0.2106131762266159,
"learning_rate": 4.0075e-05,
"loss": 3.6206,
"step": 397
},
{
"epoch": 0.04443002515236191,
"grad_norm": 0.19338656961917877,
"learning_rate": 4.0050000000000004e-05,
"loss": 3.6776,
"step": 398
},
{
"epoch": 0.04454165838138795,
"grad_norm": 0.19505569338798523,
"learning_rate": 4.0025000000000004e-05,
"loss": 3.6583,
"step": 399
},
{
"epoch": 0.04465329161041399,
"grad_norm": 0.2099572867155075,
"learning_rate": 4e-05,
"loss": 3.6318,
"step": 400
},
{
"epoch": 0.04476492483944002,
"grad_norm": 0.20380336046218872,
"learning_rate": 3.9975e-05,
"loss": 3.7099,
"step": 401
},
{
"epoch": 0.044876558068466056,
"grad_norm": 0.19614072144031525,
"learning_rate": 3.995e-05,
"loss": 3.6275,
"step": 402
},
{
"epoch": 0.04498819129749209,
"grad_norm": 0.2012193500995636,
"learning_rate": 3.9925e-05,
"loss": 3.5702,
"step": 403
},
{
"epoch": 0.045099824526518124,
"grad_norm": 0.20583601295948029,
"learning_rate": 3.99e-05,
"loss": 3.6023,
"step": 404
},
{
"epoch": 0.04521145775554416,
"grad_norm": 0.20272774994373322,
"learning_rate": 3.9875e-05,
"loss": 3.6711,
"step": 405
},
{
"epoch": 0.04532309098457019,
"grad_norm": 0.2030143141746521,
"learning_rate": 3.9850000000000006e-05,
"loss": 3.5761,
"step": 406
},
{
"epoch": 0.04543472421359623,
"grad_norm": 0.21138407289981842,
"learning_rate": 3.9825e-05,
"loss": 3.6326,
"step": 407
},
{
"epoch": 0.04554635744262227,
"grad_norm": 0.20957213640213013,
"learning_rate": 3.9800000000000005e-05,
"loss": 3.6996,
"step": 408
},
{
"epoch": 0.0456579906716483,
"grad_norm": 0.2377130389213562,
"learning_rate": 3.9775e-05,
"loss": 3.628,
"step": 409
},
{
"epoch": 0.045769623900674336,
"grad_norm": 0.19922372698783875,
"learning_rate": 3.9750000000000004e-05,
"loss": 3.6587,
"step": 410
},
{
"epoch": 0.04588125712970037,
"grad_norm": 0.19007784128189087,
"learning_rate": 3.9725e-05,
"loss": 3.5397,
"step": 411
},
{
"epoch": 0.045992890358726404,
"grad_norm": 0.2059575319290161,
"learning_rate": 3.97e-05,
"loss": 3.6504,
"step": 412
},
{
"epoch": 0.04610452358775244,
"grad_norm": 0.2094673365354538,
"learning_rate": 3.9675e-05,
"loss": 3.6077,
"step": 413
},
{
"epoch": 0.04621615681677847,
"grad_norm": 0.21034319698810577,
"learning_rate": 3.965e-05,
"loss": 3.4834,
"step": 414
},
{
"epoch": 0.04632779004580451,
"grad_norm": 0.21362099051475525,
"learning_rate": 3.9625e-05,
"loss": 3.6526,
"step": 415
},
{
"epoch": 0.04643942327483054,
"grad_norm": 0.1988418996334076,
"learning_rate": 3.960000000000001e-05,
"loss": 3.6042,
"step": 416
},
{
"epoch": 0.04655105650385658,
"grad_norm": 0.20169630646705627,
"learning_rate": 3.9575e-05,
"loss": 3.6411,
"step": 417
},
{
"epoch": 0.046662689732882616,
"grad_norm": 0.20410498976707458,
"learning_rate": 3.9550000000000006e-05,
"loss": 3.6753,
"step": 418
},
{
"epoch": 0.04677432296190865,
"grad_norm": 0.20884354412555695,
"learning_rate": 3.9525e-05,
"loss": 3.6647,
"step": 419
},
{
"epoch": 0.046885956190934684,
"grad_norm": 0.20347416400909424,
"learning_rate": 3.9500000000000005e-05,
"loss": 3.5434,
"step": 420
},
{
"epoch": 0.04699758941996072,
"grad_norm": 0.22228151559829712,
"learning_rate": 3.9475000000000004e-05,
"loss": 3.6574,
"step": 421
},
{
"epoch": 0.04710922264898675,
"grad_norm": 0.20534701645374298,
"learning_rate": 3.9450000000000003e-05,
"loss": 3.6411,
"step": 422
},
{
"epoch": 0.04722085587801279,
"grad_norm": 0.21961861848831177,
"learning_rate": 3.9425e-05,
"loss": 3.6012,
"step": 423
},
{
"epoch": 0.04733248910703882,
"grad_norm": 0.21343372762203217,
"learning_rate": 3.94e-05,
"loss": 3.5772,
"step": 424
},
{
"epoch": 0.04744412233606486,
"grad_norm": 0.19163194298744202,
"learning_rate": 3.9375e-05,
"loss": 3.5523,
"step": 425
},
{
"epoch": 0.047555755565090896,
"grad_norm": 0.21674080193042755,
"learning_rate": 3.935e-05,
"loss": 3.5712,
"step": 426
},
{
"epoch": 0.04766738879411693,
"grad_norm": 0.20363913476467133,
"learning_rate": 3.9325e-05,
"loss": 3.487,
"step": 427
},
{
"epoch": 0.047779022023142964,
"grad_norm": 0.21637770533561707,
"learning_rate": 3.9300000000000007e-05,
"loss": 3.6345,
"step": 428
},
{
"epoch": 0.047890655252169,
"grad_norm": 0.21086527407169342,
"learning_rate": 3.9275e-05,
"loss": 3.6082,
"step": 429
},
{
"epoch": 0.04800228848119503,
"grad_norm": 0.2062910795211792,
"learning_rate": 3.9250000000000005e-05,
"loss": 3.5865,
"step": 430
},
{
"epoch": 0.048113921710221066,
"grad_norm": 0.2561924457550049,
"learning_rate": 3.9225e-05,
"loss": 3.6394,
"step": 431
},
{
"epoch": 0.0482255549392471,
"grad_norm": 0.20014211535453796,
"learning_rate": 3.9200000000000004e-05,
"loss": 3.5843,
"step": 432
},
{
"epoch": 0.04833718816827314,
"grad_norm": 0.21369154751300812,
"learning_rate": 3.9175000000000004e-05,
"loss": 3.6582,
"step": 433
},
{
"epoch": 0.048448821397299176,
"grad_norm": 0.20110559463500977,
"learning_rate": 3.915e-05,
"loss": 3.6805,
"step": 434
},
{
"epoch": 0.04856045462632521,
"grad_norm": 0.19506920874118805,
"learning_rate": 3.9125e-05,
"loss": 3.5921,
"step": 435
},
{
"epoch": 0.048672087855351244,
"grad_norm": 0.2067500650882721,
"learning_rate": 3.91e-05,
"loss": 3.6332,
"step": 436
},
{
"epoch": 0.04878372108437728,
"grad_norm": 0.21102051436901093,
"learning_rate": 3.9075e-05,
"loss": 3.6332,
"step": 437
},
{
"epoch": 0.04889535431340331,
"grad_norm": 0.21857093274593353,
"learning_rate": 3.905e-05,
"loss": 3.6837,
"step": 438
},
{
"epoch": 0.049006987542429346,
"grad_norm": 0.21166151762008667,
"learning_rate": 3.9025e-05,
"loss": 3.6378,
"step": 439
},
{
"epoch": 0.04911862077145538,
"grad_norm": 0.218805193901062,
"learning_rate": 3.9000000000000006e-05,
"loss": 3.5928,
"step": 440
},
{
"epoch": 0.04923025400048142,
"grad_norm": 0.19528433680534363,
"learning_rate": 3.8975e-05,
"loss": 3.6415,
"step": 441
},
{
"epoch": 0.049341887229507456,
"grad_norm": 0.1972483992576599,
"learning_rate": 3.8950000000000005e-05,
"loss": 3.5092,
"step": 442
},
{
"epoch": 0.04945352045853349,
"grad_norm": 0.19940587878227234,
"learning_rate": 3.8925e-05,
"loss": 3.6985,
"step": 443
},
{
"epoch": 0.049565153687559524,
"grad_norm": 0.1984112411737442,
"learning_rate": 3.8900000000000004e-05,
"loss": 3.6895,
"step": 444
},
{
"epoch": 0.04967678691658556,
"grad_norm": 0.22806307673454285,
"learning_rate": 3.8875e-05,
"loss": 3.5371,
"step": 445
},
{
"epoch": 0.04978842014561159,
"grad_norm": 0.21049273014068604,
"learning_rate": 3.885e-05,
"loss": 3.6103,
"step": 446
},
{
"epoch": 0.049900053374637626,
"grad_norm": 0.20258364081382751,
"learning_rate": 3.8825e-05,
"loss": 3.546,
"step": 447
},
{
"epoch": 0.05001168660366366,
"grad_norm": 0.21317580342292786,
"learning_rate": 3.88e-05,
"loss": 3.6167,
"step": 448
},
{
"epoch": 0.0501233198326897,
"grad_norm": 0.20881395041942596,
"learning_rate": 3.8775e-05,
"loss": 3.5394,
"step": 449
},
{
"epoch": 0.050234953061715736,
"grad_norm": 0.19215163588523865,
"learning_rate": 3.875e-05,
"loss": 3.471,
"step": 450
},
{
"epoch": 0.05034658629074177,
"grad_norm": 0.2127693146467209,
"learning_rate": 3.8725e-05,
"loss": 3.5345,
"step": 451
},
{
"epoch": 0.050458219519767804,
"grad_norm": 0.21150276064872742,
"learning_rate": 3.8700000000000006e-05,
"loss": 3.6626,
"step": 452
},
{
"epoch": 0.05056985274879384,
"grad_norm": 0.21794983744621277,
"learning_rate": 3.8675e-05,
"loss": 3.6768,
"step": 453
},
{
"epoch": 0.05068148597781987,
"grad_norm": 0.2117374986410141,
"learning_rate": 3.8650000000000004e-05,
"loss": 3.5617,
"step": 454
},
{
"epoch": 0.050793119206845906,
"grad_norm": 0.19826945662498474,
"learning_rate": 3.8625e-05,
"loss": 3.6631,
"step": 455
},
{
"epoch": 0.05090475243587194,
"grad_norm": 0.20101666450500488,
"learning_rate": 3.86e-05,
"loss": 3.6238,
"step": 456
},
{
"epoch": 0.051016385664897974,
"grad_norm": 0.21595193445682526,
"learning_rate": 3.8575e-05,
"loss": 3.5917,
"step": 457
},
{
"epoch": 0.051128018893924015,
"grad_norm": 0.21451033651828766,
"learning_rate": 3.855e-05,
"loss": 3.6808,
"step": 458
},
{
"epoch": 0.05123965212295005,
"grad_norm": 0.20762884616851807,
"learning_rate": 3.8525e-05,
"loss": 3.6714,
"step": 459
},
{
"epoch": 0.051351285351976084,
"grad_norm": 0.20596258342266083,
"learning_rate": 3.85e-05,
"loss": 3.4612,
"step": 460
},
{
"epoch": 0.05146291858100212,
"grad_norm": 0.21628469228744507,
"learning_rate": 3.8475e-05,
"loss": 3.6385,
"step": 461
},
{
"epoch": 0.05157455181002815,
"grad_norm": 0.2363012433052063,
"learning_rate": 3.845e-05,
"loss": 3.6088,
"step": 462
},
{
"epoch": 0.051686185039054186,
"grad_norm": 0.2120707780122757,
"learning_rate": 3.8425e-05,
"loss": 3.5697,
"step": 463
},
{
"epoch": 0.05179781826808022,
"grad_norm": 0.21287605166435242,
"learning_rate": 3.8400000000000005e-05,
"loss": 3.6306,
"step": 464
},
{
"epoch": 0.051909451497106254,
"grad_norm": 0.2011529505252838,
"learning_rate": 3.8375e-05,
"loss": 3.6259,
"step": 465
},
{
"epoch": 0.052021084726132295,
"grad_norm": 0.21314974129199982,
"learning_rate": 3.8350000000000004e-05,
"loss": 3.6741,
"step": 466
},
{
"epoch": 0.05213271795515833,
"grad_norm": 0.23234759271144867,
"learning_rate": 3.8324999999999996e-05,
"loss": 3.6147,
"step": 467
},
{
"epoch": 0.052244351184184364,
"grad_norm": 0.19671300053596497,
"learning_rate": 3.83e-05,
"loss": 3.5434,
"step": 468
},
{
"epoch": 0.0523559844132104,
"grad_norm": 0.1999201476573944,
"learning_rate": 3.8275e-05,
"loss": 3.6855,
"step": 469
},
{
"epoch": 0.05246761764223643,
"grad_norm": 0.20142202079296112,
"learning_rate": 3.825e-05,
"loss": 3.6708,
"step": 470
},
{
"epoch": 0.052579250871262466,
"grad_norm": 0.19664877653121948,
"learning_rate": 3.8225e-05,
"loss": 3.7052,
"step": 471
},
{
"epoch": 0.0526908841002885,
"grad_norm": 0.21721257269382477,
"learning_rate": 3.82e-05,
"loss": 3.6069,
"step": 472
},
{
"epoch": 0.052802517329314534,
"grad_norm": 0.21580630540847778,
"learning_rate": 3.8175e-05,
"loss": 3.502,
"step": 473
},
{
"epoch": 0.052914150558340575,
"grad_norm": 0.2157697081565857,
"learning_rate": 3.8150000000000006e-05,
"loss": 3.6706,
"step": 474
},
{
"epoch": 0.05302578378736661,
"grad_norm": 0.20716632902622223,
"learning_rate": 3.8125e-05,
"loss": 3.6108,
"step": 475
},
{
"epoch": 0.053137417016392643,
"grad_norm": 0.20868180692195892,
"learning_rate": 3.8100000000000005e-05,
"loss": 3.6299,
"step": 476
},
{
"epoch": 0.05324905024541868,
"grad_norm": 0.21104998886585236,
"learning_rate": 3.8075e-05,
"loss": 3.5584,
"step": 477
},
{
"epoch": 0.05336068347444471,
"grad_norm": 0.215906023979187,
"learning_rate": 3.805e-05,
"loss": 3.6846,
"step": 478
},
{
"epoch": 0.053472316703470746,
"grad_norm": 0.2212597280740738,
"learning_rate": 3.8025e-05,
"loss": 3.6923,
"step": 479
},
{
"epoch": 0.05358394993249678,
"grad_norm": 0.24599696695804596,
"learning_rate": 3.8e-05,
"loss": 3.5999,
"step": 480
},
{
"epoch": 0.053695583161522814,
"grad_norm": 0.19291022419929504,
"learning_rate": 3.7975e-05,
"loss": 3.6881,
"step": 481
},
{
"epoch": 0.053807216390548855,
"grad_norm": 0.1976030468940735,
"learning_rate": 3.795e-05,
"loss": 3.516,
"step": 482
},
{
"epoch": 0.05391884961957489,
"grad_norm": 0.20248141884803772,
"learning_rate": 3.7925e-05,
"loss": 3.6767,
"step": 483
},
{
"epoch": 0.05403048284860092,
"grad_norm": 0.213862344622612,
"learning_rate": 3.79e-05,
"loss": 3.5824,
"step": 484
},
{
"epoch": 0.05414211607762696,
"grad_norm": 0.2002500742673874,
"learning_rate": 3.7875e-05,
"loss": 3.5507,
"step": 485
},
{
"epoch": 0.05425374930665299,
"grad_norm": 0.20893022418022156,
"learning_rate": 3.7850000000000005e-05,
"loss": 3.667,
"step": 486
},
{
"epoch": 0.054365382535679026,
"grad_norm": 0.21401585638523102,
"learning_rate": 3.7825e-05,
"loss": 3.6107,
"step": 487
},
{
"epoch": 0.05447701576470506,
"grad_norm": 0.20484302937984467,
"learning_rate": 3.7800000000000004e-05,
"loss": 3.5919,
"step": 488
},
{
"epoch": 0.054588648993731094,
"grad_norm": 0.2011970728635788,
"learning_rate": 3.7775e-05,
"loss": 3.6105,
"step": 489
},
{
"epoch": 0.05470028222275713,
"grad_norm": 0.20599226653575897,
"learning_rate": 3.775e-05,
"loss": 3.5781,
"step": 490
},
{
"epoch": 0.05481191545178317,
"grad_norm": 0.21728824079036713,
"learning_rate": 3.7725e-05,
"loss": 3.6049,
"step": 491
},
{
"epoch": 0.0549235486808092,
"grad_norm": 0.19060693681240082,
"learning_rate": 3.77e-05,
"loss": 3.6425,
"step": 492
},
{
"epoch": 0.05503518190983524,
"grad_norm": 0.20058387517929077,
"learning_rate": 3.7675e-05,
"loss": 3.5215,
"step": 493
},
{
"epoch": 0.05514681513886127,
"grad_norm": 0.21866890788078308,
"learning_rate": 3.765e-05,
"loss": 3.6304,
"step": 494
},
{
"epoch": 0.055258448367887306,
"grad_norm": 0.2062961310148239,
"learning_rate": 3.7625e-05,
"loss": 3.657,
"step": 495
},
{
"epoch": 0.05537008159691334,
"grad_norm": 0.209868922829628,
"learning_rate": 3.76e-05,
"loss": 3.469,
"step": 496
},
{
"epoch": 0.055481714825939374,
"grad_norm": 0.20485499501228333,
"learning_rate": 3.7575e-05,
"loss": 3.5908,
"step": 497
},
{
"epoch": 0.05559334805496541,
"grad_norm": 0.19796526432037354,
"learning_rate": 3.7550000000000005e-05,
"loss": 3.5933,
"step": 498
},
{
"epoch": 0.05570498128399145,
"grad_norm": 0.2258112132549286,
"learning_rate": 3.7525e-05,
"loss": 3.529,
"step": 499
},
{
"epoch": 0.05581661451301748,
"grad_norm": 0.20565062761306763,
"learning_rate": 3.7500000000000003e-05,
"loss": 3.66,
"step": 500
},
{
"epoch": 0.05592824774204352,
"grad_norm": 0.19696688652038574,
"learning_rate": 3.7475e-05,
"loss": 3.6562,
"step": 501
},
{
"epoch": 0.05603988097106955,
"grad_norm": 0.2181757539510727,
"learning_rate": 3.745e-05,
"loss": 3.568,
"step": 502
},
{
"epoch": 0.056151514200095586,
"grad_norm": 0.22674915194511414,
"learning_rate": 3.7425e-05,
"loss": 3.7254,
"step": 503
},
{
"epoch": 0.05626314742912162,
"grad_norm": 0.20146335661411285,
"learning_rate": 3.74e-05,
"loss": 3.5644,
"step": 504
},
{
"epoch": 0.056374780658147654,
"grad_norm": 0.20804086327552795,
"learning_rate": 3.737500000000001e-05,
"loss": 3.6564,
"step": 505
},
{
"epoch": 0.05648641388717369,
"grad_norm": 0.2047882080078125,
"learning_rate": 3.735e-05,
"loss": 3.7228,
"step": 506
},
{
"epoch": 0.05659804711619973,
"grad_norm": 0.2154848873615265,
"learning_rate": 3.7325000000000006e-05,
"loss": 3.6843,
"step": 507
},
{
"epoch": 0.05670968034522576,
"grad_norm": 0.20942941308021545,
"learning_rate": 3.73e-05,
"loss": 3.5236,
"step": 508
},
{
"epoch": 0.0568213135742518,
"grad_norm": 0.21687182784080505,
"learning_rate": 3.7275000000000005e-05,
"loss": 3.6912,
"step": 509
},
{
"epoch": 0.05693294680327783,
"grad_norm": 0.20924438536167145,
"learning_rate": 3.7250000000000004e-05,
"loss": 3.6083,
"step": 510
},
{
"epoch": 0.057044580032303865,
"grad_norm": 0.20899319648742676,
"learning_rate": 3.7225000000000004e-05,
"loss": 3.6191,
"step": 511
},
{
"epoch": 0.0571562132613299,
"grad_norm": 0.23186710476875305,
"learning_rate": 3.72e-05,
"loss": 3.6003,
"step": 512
},
{
"epoch": 0.057267846490355934,
"grad_norm": 0.19647559523582458,
"learning_rate": 3.7175e-05,
"loss": 3.6489,
"step": 513
},
{
"epoch": 0.05737947971938197,
"grad_norm": 0.2111947238445282,
"learning_rate": 3.715e-05,
"loss": 3.6333,
"step": 514
},
{
"epoch": 0.05749111294840801,
"grad_norm": 0.20235547423362732,
"learning_rate": 3.7125e-05,
"loss": 3.5843,
"step": 515
},
{
"epoch": 0.05760274617743404,
"grad_norm": 0.19469691812992096,
"learning_rate": 3.71e-05,
"loss": 3.6388,
"step": 516
},
{
"epoch": 0.05771437940646008,
"grad_norm": 0.210061714053154,
"learning_rate": 3.707500000000001e-05,
"loss": 3.6171,
"step": 517
},
{
"epoch": 0.05782601263548611,
"grad_norm": 0.20284529030323029,
"learning_rate": 3.705e-05,
"loss": 3.5404,
"step": 518
},
{
"epoch": 0.057937645864512145,
"grad_norm": 0.20453263819217682,
"learning_rate": 3.7025000000000005e-05,
"loss": 3.658,
"step": 519
},
{
"epoch": 0.05804927909353818,
"grad_norm": 0.2044084370136261,
"learning_rate": 3.7e-05,
"loss": 3.439,
"step": 520
},
{
"epoch": 0.058160912322564214,
"grad_norm": 0.19979850947856903,
"learning_rate": 3.6975000000000004e-05,
"loss": 3.551,
"step": 521
},
{
"epoch": 0.05827254555159025,
"grad_norm": 0.1926315724849701,
"learning_rate": 3.6950000000000004e-05,
"loss": 3.7069,
"step": 522
},
{
"epoch": 0.05838417878061628,
"grad_norm": 0.199242502450943,
"learning_rate": 3.6925e-05,
"loss": 3.6647,
"step": 523
},
{
"epoch": 0.05849581200964232,
"grad_norm": 0.23368626832962036,
"learning_rate": 3.69e-05,
"loss": 3.5607,
"step": 524
},
{
"epoch": 0.05860744523866836,
"grad_norm": 0.1949475258588791,
"learning_rate": 3.6875e-05,
"loss": 3.4272,
"step": 525
},
{
"epoch": 0.05871907846769439,
"grad_norm": 0.19842061400413513,
"learning_rate": 3.685e-05,
"loss": 3.6106,
"step": 526
},
{
"epoch": 0.058830711696720425,
"grad_norm": 0.21880899369716644,
"learning_rate": 3.6825e-05,
"loss": 3.6308,
"step": 527
},
{
"epoch": 0.05894234492574646,
"grad_norm": 0.20493078231811523,
"learning_rate": 3.68e-05,
"loss": 3.672,
"step": 528
},
{
"epoch": 0.05905397815477249,
"grad_norm": 0.20863597095012665,
"learning_rate": 3.6775000000000006e-05,
"loss": 3.605,
"step": 529
},
{
"epoch": 0.05916561138379853,
"grad_norm": 0.2061459869146347,
"learning_rate": 3.675e-05,
"loss": 3.515,
"step": 530
},
{
"epoch": 0.05927724461282456,
"grad_norm": 0.2080865502357483,
"learning_rate": 3.6725000000000005e-05,
"loss": 3.5959,
"step": 531
},
{
"epoch": 0.0593888778418506,
"grad_norm": 0.21818754076957703,
"learning_rate": 3.6700000000000004e-05,
"loss": 3.6593,
"step": 532
},
{
"epoch": 0.05950051107087664,
"grad_norm": 0.21977297961711884,
"learning_rate": 3.6675000000000004e-05,
"loss": 3.6572,
"step": 533
},
{
"epoch": 0.05961214429990267,
"grad_norm": 0.20654290914535522,
"learning_rate": 3.665e-05,
"loss": 3.5068,
"step": 534
},
{
"epoch": 0.059723777528928705,
"grad_norm": 0.20002619922161102,
"learning_rate": 3.6625e-05,
"loss": 3.6025,
"step": 535
},
{
"epoch": 0.05983541075795474,
"grad_norm": 0.2031226009130478,
"learning_rate": 3.66e-05,
"loss": 3.4866,
"step": 536
},
{
"epoch": 0.05994704398698077,
"grad_norm": 0.22946283221244812,
"learning_rate": 3.6575e-05,
"loss": 3.5478,
"step": 537
},
{
"epoch": 0.06005867721600681,
"grad_norm": 0.19767338037490845,
"learning_rate": 3.655e-05,
"loss": 3.6015,
"step": 538
},
{
"epoch": 0.06017031044503284,
"grad_norm": 0.1975858509540558,
"learning_rate": 3.652500000000001e-05,
"loss": 3.5822,
"step": 539
},
{
"epoch": 0.06028194367405888,
"grad_norm": 0.19805032014846802,
"learning_rate": 3.65e-05,
"loss": 3.5211,
"step": 540
},
{
"epoch": 0.06039357690308492,
"grad_norm": 0.2010287046432495,
"learning_rate": 3.6475000000000006e-05,
"loss": 3.5165,
"step": 541
},
{
"epoch": 0.06050521013211095,
"grad_norm": 0.2080509066581726,
"learning_rate": 3.645e-05,
"loss": 3.5437,
"step": 542
},
{
"epoch": 0.060616843361136985,
"grad_norm": 0.1941136121749878,
"learning_rate": 3.6425000000000004e-05,
"loss": 3.5466,
"step": 543
},
{
"epoch": 0.06072847659016302,
"grad_norm": 0.211980402469635,
"learning_rate": 3.6400000000000004e-05,
"loss": 3.6911,
"step": 544
},
{
"epoch": 0.06084010981918905,
"grad_norm": 0.20555490255355835,
"learning_rate": 3.6375e-05,
"loss": 3.6553,
"step": 545
},
{
"epoch": 0.06095174304821509,
"grad_norm": 0.21043545007705688,
"learning_rate": 3.635e-05,
"loss": 3.6069,
"step": 546
},
{
"epoch": 0.06106337627724112,
"grad_norm": 0.19112235307693481,
"learning_rate": 3.6325e-05,
"loss": 3.651,
"step": 547
},
{
"epoch": 0.06117500950626716,
"grad_norm": 0.21341146528720856,
"learning_rate": 3.63e-05,
"loss": 3.5975,
"step": 548
},
{
"epoch": 0.0612866427352932,
"grad_norm": 0.20642784237861633,
"learning_rate": 3.6275e-05,
"loss": 3.6572,
"step": 549
},
{
"epoch": 0.06139827596431923,
"grad_norm": 0.22406286001205444,
"learning_rate": 3.625e-05,
"loss": 3.5697,
"step": 550
},
{
"epoch": 0.061509909193345265,
"grad_norm": 0.2295171171426773,
"learning_rate": 3.6225000000000006e-05,
"loss": 3.4785,
"step": 551
},
{
"epoch": 0.0616215424223713,
"grad_norm": 0.21359552443027496,
"learning_rate": 3.62e-05,
"loss": 3.6032,
"step": 552
},
{
"epoch": 0.06173317565139733,
"grad_norm": 0.22213253378868103,
"learning_rate": 3.6175000000000005e-05,
"loss": 3.612,
"step": 553
},
{
"epoch": 0.06184480888042337,
"grad_norm": 0.2014327049255371,
"learning_rate": 3.615e-05,
"loss": 3.6558,
"step": 554
},
{
"epoch": 0.0619564421094494,
"grad_norm": 0.2081911265850067,
"learning_rate": 3.6125000000000004e-05,
"loss": 3.5923,
"step": 555
},
{
"epoch": 0.062068075338475436,
"grad_norm": 0.19592246413230896,
"learning_rate": 3.61e-05,
"loss": 3.695,
"step": 556
},
{
"epoch": 0.06217970856750148,
"grad_norm": 0.20268042385578156,
"learning_rate": 3.6075e-05,
"loss": 3.4698,
"step": 557
},
{
"epoch": 0.06229134179652751,
"grad_norm": 0.228108748793602,
"learning_rate": 3.605e-05,
"loss": 3.686,
"step": 558
},
{
"epoch": 0.062402975025553545,
"grad_norm": 0.2089182585477829,
"learning_rate": 3.6025e-05,
"loss": 3.4894,
"step": 559
},
{
"epoch": 0.06251460825457958,
"grad_norm": 0.20214137434959412,
"learning_rate": 3.6e-05,
"loss": 3.5976,
"step": 560
},
{
"epoch": 0.06262624148360561,
"grad_norm": 0.19966624677181244,
"learning_rate": 3.5975e-05,
"loss": 3.6826,
"step": 561
},
{
"epoch": 0.06273787471263165,
"grad_norm": 0.2026381641626358,
"learning_rate": 3.595e-05,
"loss": 3.5893,
"step": 562
},
{
"epoch": 0.06284950794165768,
"grad_norm": 0.20752030611038208,
"learning_rate": 3.5925000000000006e-05,
"loss": 3.6498,
"step": 563
},
{
"epoch": 0.06296114117068372,
"grad_norm": 0.20754122734069824,
"learning_rate": 3.59e-05,
"loss": 3.4749,
"step": 564
},
{
"epoch": 0.06307277439970975,
"grad_norm": 0.22250573337078094,
"learning_rate": 3.5875000000000005e-05,
"loss": 3.519,
"step": 565
},
{
"epoch": 0.06318440762873578,
"grad_norm": 0.2104233354330063,
"learning_rate": 3.585e-05,
"loss": 3.5996,
"step": 566
},
{
"epoch": 0.06329604085776182,
"grad_norm": 0.20424549281597137,
"learning_rate": 3.5825000000000003e-05,
"loss": 3.5705,
"step": 567
},
{
"epoch": 0.06340767408678785,
"grad_norm": 0.2098836600780487,
"learning_rate": 3.58e-05,
"loss": 3.5736,
"step": 568
},
{
"epoch": 0.0635193073158139,
"grad_norm": 0.21191683411598206,
"learning_rate": 3.5775e-05,
"loss": 3.6211,
"step": 569
},
{
"epoch": 0.06363094054483993,
"grad_norm": 0.2122860699892044,
"learning_rate": 3.575e-05,
"loss": 3.595,
"step": 570
},
{
"epoch": 0.06374257377386597,
"grad_norm": 0.22052264213562012,
"learning_rate": 3.5725e-05,
"loss": 3.5398,
"step": 571
},
{
"epoch": 0.063854207002892,
"grad_norm": 0.21279598772525787,
"learning_rate": 3.57e-05,
"loss": 3.598,
"step": 572
},
{
"epoch": 0.06396584023191804,
"grad_norm": 0.21606601774692535,
"learning_rate": 3.5675e-05,
"loss": 3.6978,
"step": 573
},
{
"epoch": 0.06407747346094407,
"grad_norm": 0.20927487313747406,
"learning_rate": 3.565e-05,
"loss": 3.5732,
"step": 574
},
{
"epoch": 0.0641891066899701,
"grad_norm": 0.21585509181022644,
"learning_rate": 3.5625000000000005e-05,
"loss": 3.6545,
"step": 575
},
{
"epoch": 0.06430073991899614,
"grad_norm": 0.20341403782367706,
"learning_rate": 3.56e-05,
"loss": 3.6114,
"step": 576
},
{
"epoch": 0.06441237314802217,
"grad_norm": 0.21158598363399506,
"learning_rate": 3.5575000000000004e-05,
"loss": 3.6014,
"step": 577
},
{
"epoch": 0.06452400637704821,
"grad_norm": 0.2080516368150711,
"learning_rate": 3.555e-05,
"loss": 3.6085,
"step": 578
},
{
"epoch": 0.06463563960607424,
"grad_norm": 0.20859673619270325,
"learning_rate": 3.5525e-05,
"loss": 3.6317,
"step": 579
},
{
"epoch": 0.06474727283510028,
"grad_norm": 0.2400979846715927,
"learning_rate": 3.55e-05,
"loss": 3.6465,
"step": 580
},
{
"epoch": 0.06485890606412631,
"grad_norm": 0.19518429040908813,
"learning_rate": 3.5475e-05,
"loss": 3.6244,
"step": 581
},
{
"epoch": 0.06497053929315234,
"grad_norm": 0.21000368893146515,
"learning_rate": 3.545e-05,
"loss": 3.5597,
"step": 582
},
{
"epoch": 0.06508217252217838,
"grad_norm": 0.19964025914669037,
"learning_rate": 3.5425e-05,
"loss": 3.6984,
"step": 583
},
{
"epoch": 0.06519380575120441,
"grad_norm": 0.20453962683677673,
"learning_rate": 3.54e-05,
"loss": 3.6038,
"step": 584
},
{
"epoch": 0.06530543898023046,
"grad_norm": 0.2083796262741089,
"learning_rate": 3.5375e-05,
"loss": 3.4987,
"step": 585
},
{
"epoch": 0.0654170722092565,
"grad_norm": 0.20932497084140778,
"learning_rate": 3.535e-05,
"loss": 3.62,
"step": 586
},
{
"epoch": 0.06552870543828253,
"grad_norm": 0.2235613614320755,
"learning_rate": 3.5325000000000005e-05,
"loss": 3.6054,
"step": 587
},
{
"epoch": 0.06564033866730856,
"grad_norm": 0.21258306503295898,
"learning_rate": 3.53e-05,
"loss": 3.6473,
"step": 588
},
{
"epoch": 0.0657519718963346,
"grad_norm": 0.21014408767223358,
"learning_rate": 3.5275000000000004e-05,
"loss": 3.6692,
"step": 589
},
{
"epoch": 0.06586360512536063,
"grad_norm": 0.2153196632862091,
"learning_rate": 3.525e-05,
"loss": 3.5699,
"step": 590
},
{
"epoch": 0.06597523835438666,
"grad_norm": 0.20447254180908203,
"learning_rate": 3.5225e-05,
"loss": 3.6798,
"step": 591
},
{
"epoch": 0.0660868715834127,
"grad_norm": 0.20993854105472565,
"learning_rate": 3.52e-05,
"loss": 3.4706,
"step": 592
},
{
"epoch": 0.06619850481243873,
"grad_norm": 0.20533594489097595,
"learning_rate": 3.5175e-05,
"loss": 3.5942,
"step": 593
},
{
"epoch": 0.06631013804146477,
"grad_norm": 0.21875996887683868,
"learning_rate": 3.515e-05,
"loss": 3.6488,
"step": 594
},
{
"epoch": 0.0664217712704908,
"grad_norm": 0.21062344312667847,
"learning_rate": 3.5125e-05,
"loss": 3.5885,
"step": 595
},
{
"epoch": 0.06653340449951683,
"grad_norm": 0.20061716437339783,
"learning_rate": 3.51e-05,
"loss": 3.6171,
"step": 596
},
{
"epoch": 0.06664503772854287,
"grad_norm": 0.20265276730060577,
"learning_rate": 3.5075000000000006e-05,
"loss": 3.5871,
"step": 597
},
{
"epoch": 0.0667566709575689,
"grad_norm": 0.20613998174667358,
"learning_rate": 3.505e-05,
"loss": 3.5887,
"step": 598
},
{
"epoch": 0.06686830418659494,
"grad_norm": 0.2191297560930252,
"learning_rate": 3.5025000000000004e-05,
"loss": 3.613,
"step": 599
},
{
"epoch": 0.06697993741562097,
"grad_norm": 0.22701820731163025,
"learning_rate": 3.5e-05,
"loss": 3.6706,
"step": 600
},
{
"epoch": 0.067091570644647,
"grad_norm": 0.2038762867450714,
"learning_rate": 3.4975e-05,
"loss": 3.6295,
"step": 601
},
{
"epoch": 0.06720320387367305,
"grad_norm": 0.24341601133346558,
"learning_rate": 3.495e-05,
"loss": 3.543,
"step": 602
},
{
"epoch": 0.06731483710269909,
"grad_norm": 0.21053524315357208,
"learning_rate": 3.4925e-05,
"loss": 3.4894,
"step": 603
},
{
"epoch": 0.06742647033172512,
"grad_norm": 0.3276633322238922,
"learning_rate": 3.49e-05,
"loss": 3.5902,
"step": 604
},
{
"epoch": 0.06753810356075116,
"grad_norm": 0.22165906429290771,
"learning_rate": 3.4875e-05,
"loss": 3.6678,
"step": 605
},
{
"epoch": 0.06764973678977719,
"grad_norm": 0.20696178078651428,
"learning_rate": 3.485e-05,
"loss": 3.4853,
"step": 606
},
{
"epoch": 0.06776137001880322,
"grad_norm": 0.2037273496389389,
"learning_rate": 3.4825e-05,
"loss": 3.6664,
"step": 607
},
{
"epoch": 0.06787300324782926,
"grad_norm": 0.2176288217306137,
"learning_rate": 3.48e-05,
"loss": 3.6401,
"step": 608
},
{
"epoch": 0.06798463647685529,
"grad_norm": 0.20030933618545532,
"learning_rate": 3.4775000000000005e-05,
"loss": 3.5133,
"step": 609
},
{
"epoch": 0.06809626970588133,
"grad_norm": 0.21036197245121002,
"learning_rate": 3.475e-05,
"loss": 3.6818,
"step": 610
},
{
"epoch": 0.06820790293490736,
"grad_norm": 0.20761357247829437,
"learning_rate": 3.4725000000000004e-05,
"loss": 3.6665,
"step": 611
},
{
"epoch": 0.0683195361639334,
"grad_norm": 0.21138760447502136,
"learning_rate": 3.4699999999999996e-05,
"loss": 3.5247,
"step": 612
},
{
"epoch": 0.06843116939295943,
"grad_norm": 0.20724289119243622,
"learning_rate": 3.4675e-05,
"loss": 3.5568,
"step": 613
},
{
"epoch": 0.06854280262198546,
"grad_norm": 0.21234413981437683,
"learning_rate": 3.465e-05,
"loss": 3.6402,
"step": 614
},
{
"epoch": 0.0686544358510115,
"grad_norm": 0.2101334035396576,
"learning_rate": 3.4625e-05,
"loss": 3.6388,
"step": 615
},
{
"epoch": 0.06876606908003753,
"grad_norm": 0.2128676176071167,
"learning_rate": 3.46e-05,
"loss": 3.5939,
"step": 616
},
{
"epoch": 0.06887770230906357,
"grad_norm": 0.20992402732372284,
"learning_rate": 3.4575e-05,
"loss": 3.6203,
"step": 617
},
{
"epoch": 0.06898933553808961,
"grad_norm": 0.20483505725860596,
"learning_rate": 3.455e-05,
"loss": 3.5842,
"step": 618
},
{
"epoch": 0.06910096876711565,
"grad_norm": 0.20162242650985718,
"learning_rate": 3.4525e-05,
"loss": 3.6394,
"step": 619
},
{
"epoch": 0.06921260199614168,
"grad_norm": 0.2180623710155487,
"learning_rate": 3.45e-05,
"loss": 3.4964,
"step": 620
},
{
"epoch": 0.06932423522516772,
"grad_norm": 0.19638143479824066,
"learning_rate": 3.4475000000000005e-05,
"loss": 3.583,
"step": 621
},
{
"epoch": 0.06943586845419375,
"grad_norm": 0.23122988641262054,
"learning_rate": 3.445e-05,
"loss": 3.6265,
"step": 622
},
{
"epoch": 0.06954750168321978,
"grad_norm": 0.19868123531341553,
"learning_rate": 3.4425e-05,
"loss": 3.5372,
"step": 623
},
{
"epoch": 0.06965913491224582,
"grad_norm": 0.2077895551919937,
"learning_rate": 3.4399999999999996e-05,
"loss": 3.5409,
"step": 624
},
{
"epoch": 0.06977076814127185,
"grad_norm": 0.20021426677703857,
"learning_rate": 3.4375e-05,
"loss": 3.5935,
"step": 625
},
{
"epoch": 0.06988240137029789,
"grad_norm": 0.21118071675300598,
"learning_rate": 3.435e-05,
"loss": 3.4979,
"step": 626
},
{
"epoch": 0.06999403459932392,
"grad_norm": 0.2101222574710846,
"learning_rate": 3.4325e-05,
"loss": 3.4673,
"step": 627
},
{
"epoch": 0.07010566782834995,
"grad_norm": 0.2205255776643753,
"learning_rate": 3.430000000000001e-05,
"loss": 3.4988,
"step": 628
},
{
"epoch": 0.07021730105737599,
"grad_norm": 0.2231379598379135,
"learning_rate": 3.4275e-05,
"loss": 3.5456,
"step": 629
},
{
"epoch": 0.07032893428640202,
"grad_norm": 0.2156384438276291,
"learning_rate": 3.4250000000000006e-05,
"loss": 3.6333,
"step": 630
},
{
"epoch": 0.07044056751542806,
"grad_norm": 0.21369129419326782,
"learning_rate": 3.4225e-05,
"loss": 3.5574,
"step": 631
},
{
"epoch": 0.07055220074445409,
"grad_norm": 0.2289404571056366,
"learning_rate": 3.4200000000000005e-05,
"loss": 3.3606,
"step": 632
},
{
"epoch": 0.07066383397348013,
"grad_norm": 0.20150862634181976,
"learning_rate": 3.4175000000000004e-05,
"loss": 3.6017,
"step": 633
},
{
"epoch": 0.07077546720250616,
"grad_norm": 0.222214013338089,
"learning_rate": 3.415e-05,
"loss": 3.6549,
"step": 634
},
{
"epoch": 0.07088710043153221,
"grad_norm": 0.21610718965530396,
"learning_rate": 3.4125e-05,
"loss": 3.5501,
"step": 635
},
{
"epoch": 0.07099873366055824,
"grad_norm": 0.1953548789024353,
"learning_rate": 3.41e-05,
"loss": 3.5144,
"step": 636
},
{
"epoch": 0.07111036688958428,
"grad_norm": 0.20244839787483215,
"learning_rate": 3.4075e-05,
"loss": 3.5845,
"step": 637
},
{
"epoch": 0.07122200011861031,
"grad_norm": 0.2028990238904953,
"learning_rate": 3.405e-05,
"loss": 3.5516,
"step": 638
},
{
"epoch": 0.07133363334763634,
"grad_norm": 0.21585910022258759,
"learning_rate": 3.4025e-05,
"loss": 3.5117,
"step": 639
},
{
"epoch": 0.07144526657666238,
"grad_norm": 0.2236899733543396,
"learning_rate": 3.4000000000000007e-05,
"loss": 3.6208,
"step": 640
},
{
"epoch": 0.07155689980568841,
"grad_norm": 0.1983974277973175,
"learning_rate": 3.3975e-05,
"loss": 3.4299,
"step": 641
},
{
"epoch": 0.07166853303471445,
"grad_norm": 0.2042345553636551,
"learning_rate": 3.3950000000000005e-05,
"loss": 3.6792,
"step": 642
},
{
"epoch": 0.07178016626374048,
"grad_norm": 0.2312481850385666,
"learning_rate": 3.3925e-05,
"loss": 3.6323,
"step": 643
},
{
"epoch": 0.07189179949276651,
"grad_norm": 0.20952174067497253,
"learning_rate": 3.3900000000000004e-05,
"loss": 3.5728,
"step": 644
},
{
"epoch": 0.07200343272179255,
"grad_norm": 0.21245324611663818,
"learning_rate": 3.3875000000000003e-05,
"loss": 3.7041,
"step": 645
},
{
"epoch": 0.07211506595081858,
"grad_norm": 0.21495112776756287,
"learning_rate": 3.385e-05,
"loss": 3.6101,
"step": 646
},
{
"epoch": 0.07222669917984462,
"grad_norm": 0.21500621736049652,
"learning_rate": 3.3825e-05,
"loss": 3.6662,
"step": 647
},
{
"epoch": 0.07233833240887065,
"grad_norm": 0.20064933598041534,
"learning_rate": 3.38e-05,
"loss": 3.5911,
"step": 648
},
{
"epoch": 0.07244996563789668,
"grad_norm": 0.22472144663333893,
"learning_rate": 3.3775e-05,
"loss": 3.6901,
"step": 649
},
{
"epoch": 0.07256159886692272,
"grad_norm": 0.21234481036663055,
"learning_rate": 3.375000000000001e-05,
"loss": 3.6124,
"step": 650
},
{
"epoch": 0.07267323209594877,
"grad_norm": 0.21081648766994476,
"learning_rate": 3.3725e-05,
"loss": 3.619,
"step": 651
},
{
"epoch": 0.0727848653249748,
"grad_norm": 0.21900911629199982,
"learning_rate": 3.3700000000000006e-05,
"loss": 3.4626,
"step": 652
},
{
"epoch": 0.07289649855400084,
"grad_norm": 0.197637677192688,
"learning_rate": 3.3675e-05,
"loss": 3.4884,
"step": 653
},
{
"epoch": 0.07300813178302687,
"grad_norm": 0.20702286064624786,
"learning_rate": 3.3650000000000005e-05,
"loss": 3.6801,
"step": 654
},
{
"epoch": 0.0731197650120529,
"grad_norm": 0.21678529679775238,
"learning_rate": 3.3625000000000004e-05,
"loss": 3.6823,
"step": 655
},
{
"epoch": 0.07323139824107894,
"grad_norm": 0.21352356672286987,
"learning_rate": 3.3600000000000004e-05,
"loss": 3.6928,
"step": 656
},
{
"epoch": 0.07334303147010497,
"grad_norm": 0.20690806210041046,
"learning_rate": 3.3575e-05,
"loss": 3.5821,
"step": 657
},
{
"epoch": 0.073454664699131,
"grad_norm": 0.2118692845106125,
"learning_rate": 3.355e-05,
"loss": 3.6084,
"step": 658
},
{
"epoch": 0.07356629792815704,
"grad_norm": 0.19922851026058197,
"learning_rate": 3.3525e-05,
"loss": 3.6401,
"step": 659
},
{
"epoch": 0.07367793115718307,
"grad_norm": 0.23980186879634857,
"learning_rate": 3.35e-05,
"loss": 3.4706,
"step": 660
},
{
"epoch": 0.07378956438620911,
"grad_norm": 0.22764219343662262,
"learning_rate": 3.3475e-05,
"loss": 3.5348,
"step": 661
},
{
"epoch": 0.07390119761523514,
"grad_norm": 0.22749237716197968,
"learning_rate": 3.345000000000001e-05,
"loss": 3.6037,
"step": 662
},
{
"epoch": 0.07401283084426118,
"grad_norm": 0.22787408530712128,
"learning_rate": 3.3425e-05,
"loss": 3.6074,
"step": 663
},
{
"epoch": 0.07412446407328721,
"grad_norm": 0.2036660611629486,
"learning_rate": 3.3400000000000005e-05,
"loss": 3.5924,
"step": 664
},
{
"epoch": 0.07423609730231324,
"grad_norm": 0.20867742598056793,
"learning_rate": 3.3375e-05,
"loss": 3.5671,
"step": 665
},
{
"epoch": 0.07434773053133928,
"grad_norm": 0.20624525845050812,
"learning_rate": 3.3350000000000004e-05,
"loss": 3.5458,
"step": 666
},
{
"epoch": 0.07445936376036531,
"grad_norm": 0.21608272194862366,
"learning_rate": 3.3325000000000004e-05,
"loss": 3.4783,
"step": 667
},
{
"epoch": 0.07457099698939136,
"grad_norm": 0.21738409996032715,
"learning_rate": 3.33e-05,
"loss": 3.5275,
"step": 668
},
{
"epoch": 0.0746826302184174,
"grad_norm": 0.2182583063840866,
"learning_rate": 3.3275e-05,
"loss": 3.6068,
"step": 669
},
{
"epoch": 0.07479426344744343,
"grad_norm": 0.24012039601802826,
"learning_rate": 3.325e-05,
"loss": 3.5988,
"step": 670
},
{
"epoch": 0.07490589667646946,
"grad_norm": 0.21245433390140533,
"learning_rate": 3.3225e-05,
"loss": 3.6144,
"step": 671
},
{
"epoch": 0.0750175299054955,
"grad_norm": 0.21457639336585999,
"learning_rate": 3.32e-05,
"loss": 3.599,
"step": 672
},
{
"epoch": 0.07512916313452153,
"grad_norm": 0.21469654142856598,
"learning_rate": 3.3175e-05,
"loss": 3.6553,
"step": 673
},
{
"epoch": 0.07524079636354757,
"grad_norm": 0.21156013011932373,
"learning_rate": 3.3150000000000006e-05,
"loss": 3.5895,
"step": 674
},
{
"epoch": 0.0753524295925736,
"grad_norm": 0.20618505775928497,
"learning_rate": 3.3125e-05,
"loss": 3.5192,
"step": 675
},
{
"epoch": 0.07546406282159963,
"grad_norm": 0.2190624475479126,
"learning_rate": 3.3100000000000005e-05,
"loss": 3.6059,
"step": 676
},
{
"epoch": 0.07557569605062567,
"grad_norm": 0.20832453668117523,
"learning_rate": 3.3075e-05,
"loss": 3.7065,
"step": 677
},
{
"epoch": 0.0756873292796517,
"grad_norm": 0.20501980185508728,
"learning_rate": 3.3050000000000004e-05,
"loss": 3.6885,
"step": 678
},
{
"epoch": 0.07579896250867774,
"grad_norm": 0.21864673495292664,
"learning_rate": 3.3025e-05,
"loss": 3.5877,
"step": 679
},
{
"epoch": 0.07591059573770377,
"grad_norm": 0.19896148145198822,
"learning_rate": 3.3e-05,
"loss": 3.6484,
"step": 680
},
{
"epoch": 0.0760222289667298,
"grad_norm": 0.1950829178094864,
"learning_rate": 3.2975e-05,
"loss": 3.5761,
"step": 681
},
{
"epoch": 0.07613386219575584,
"grad_norm": 0.19603589177131653,
"learning_rate": 3.295e-05,
"loss": 3.5584,
"step": 682
},
{
"epoch": 0.07624549542478187,
"grad_norm": 0.22633026540279388,
"learning_rate": 3.2925e-05,
"loss": 3.5569,
"step": 683
},
{
"epoch": 0.07635712865380792,
"grad_norm": 0.20851753652095795,
"learning_rate": 3.29e-05,
"loss": 3.5679,
"step": 684
},
{
"epoch": 0.07646876188283395,
"grad_norm": 0.21804654598236084,
"learning_rate": 3.2875e-05,
"loss": 3.5892,
"step": 685
},
{
"epoch": 0.07658039511185999,
"grad_norm": 0.19973644614219666,
"learning_rate": 3.2850000000000006e-05,
"loss": 3.6339,
"step": 686
},
{
"epoch": 0.07669202834088602,
"grad_norm": 0.23653185367584229,
"learning_rate": 3.2825e-05,
"loss": 3.4296,
"step": 687
},
{
"epoch": 0.07680366156991206,
"grad_norm": 0.20258909463882446,
"learning_rate": 3.2800000000000004e-05,
"loss": 3.7052,
"step": 688
},
{
"epoch": 0.07691529479893809,
"grad_norm": 0.20199154317378998,
"learning_rate": 3.2775e-05,
"loss": 3.5907,
"step": 689
},
{
"epoch": 0.07702692802796413,
"grad_norm": 0.2128395289182663,
"learning_rate": 3.275e-05,
"loss": 3.5611,
"step": 690
},
{
"epoch": 0.07713856125699016,
"grad_norm": 0.2032470554113388,
"learning_rate": 3.2725e-05,
"loss": 3.5238,
"step": 691
},
{
"epoch": 0.0772501944860162,
"grad_norm": 0.21081288158893585,
"learning_rate": 3.27e-05,
"loss": 3.5186,
"step": 692
},
{
"epoch": 0.07736182771504223,
"grad_norm": 0.20675109326839447,
"learning_rate": 3.2675e-05,
"loss": 3.5917,
"step": 693
},
{
"epoch": 0.07747346094406826,
"grad_norm": 0.20470421016216278,
"learning_rate": 3.265e-05,
"loss": 3.5196,
"step": 694
},
{
"epoch": 0.0775850941730943,
"grad_norm": 0.19875124096870422,
"learning_rate": 3.2625e-05,
"loss": 3.5417,
"step": 695
},
{
"epoch": 0.07769672740212033,
"grad_norm": 0.22509066760540009,
"learning_rate": 3.26e-05,
"loss": 3.7016,
"step": 696
},
{
"epoch": 0.07780836063114636,
"grad_norm": 0.19841545820236206,
"learning_rate": 3.2575e-05,
"loss": 3.556,
"step": 697
},
{
"epoch": 0.0779199938601724,
"grad_norm": 0.23103176057338715,
"learning_rate": 3.2550000000000005e-05,
"loss": 3.6544,
"step": 698
},
{
"epoch": 0.07803162708919843,
"grad_norm": 0.20713350176811218,
"learning_rate": 3.2525e-05,
"loss": 3.48,
"step": 699
},
{
"epoch": 0.07814326031822447,
"grad_norm": 0.20921729505062103,
"learning_rate": 3.2500000000000004e-05,
"loss": 3.7033,
"step": 700
},
{
"epoch": 0.07825489354725051,
"grad_norm": 0.20115943253040314,
"learning_rate": 3.2474999999999997e-05,
"loss": 3.6208,
"step": 701
},
{
"epoch": 0.07836652677627655,
"grad_norm": 0.23367123305797577,
"learning_rate": 3.245e-05,
"loss": 3.5486,
"step": 702
},
{
"epoch": 0.07847816000530258,
"grad_norm": 0.21118339896202087,
"learning_rate": 3.2425e-05,
"loss": 3.5262,
"step": 703
},
{
"epoch": 0.07858979323432862,
"grad_norm": 0.22563502192497253,
"learning_rate": 3.24e-05,
"loss": 3.5908,
"step": 704
},
{
"epoch": 0.07870142646335465,
"grad_norm": 0.20896336436271667,
"learning_rate": 3.2375e-05,
"loss": 3.4899,
"step": 705
},
{
"epoch": 0.07881305969238069,
"grad_norm": 0.20777581632137299,
"learning_rate": 3.235e-05,
"loss": 3.4798,
"step": 706
},
{
"epoch": 0.07892469292140672,
"grad_norm": 0.21695426106452942,
"learning_rate": 3.2325e-05,
"loss": 3.5235,
"step": 707
},
{
"epoch": 0.07903632615043275,
"grad_norm": 0.20708265900611877,
"learning_rate": 3.2300000000000006e-05,
"loss": 3.5823,
"step": 708
},
{
"epoch": 0.07914795937945879,
"grad_norm": 0.1974812150001526,
"learning_rate": 3.2275e-05,
"loss": 3.6126,
"step": 709
},
{
"epoch": 0.07925959260848482,
"grad_norm": 0.20645280182361603,
"learning_rate": 3.2250000000000005e-05,
"loss": 3.637,
"step": 710
},
{
"epoch": 0.07937122583751086,
"grad_norm": 0.21989686787128448,
"learning_rate": 3.2225e-05,
"loss": 3.5479,
"step": 711
},
{
"epoch": 0.07948285906653689,
"grad_norm": 0.20603938400745392,
"learning_rate": 3.2200000000000003e-05,
"loss": 3.5319,
"step": 712
},
{
"epoch": 0.07959449229556292,
"grad_norm": 0.19978053867816925,
"learning_rate": 3.2175e-05,
"loss": 3.5255,
"step": 713
},
{
"epoch": 0.07970612552458896,
"grad_norm": 0.23431240022182465,
"learning_rate": 3.215e-05,
"loss": 3.5456,
"step": 714
},
{
"epoch": 0.07981775875361499,
"grad_norm": 0.21745674312114716,
"learning_rate": 3.2125e-05,
"loss": 3.562,
"step": 715
},
{
"epoch": 0.07992939198264103,
"grad_norm": 0.20296964049339294,
"learning_rate": 3.21e-05,
"loss": 3.5871,
"step": 716
},
{
"epoch": 0.08004102521166707,
"grad_norm": 0.20567430555820465,
"learning_rate": 3.2075e-05,
"loss": 3.5492,
"step": 717
},
{
"epoch": 0.08015265844069311,
"grad_norm": 0.22609549760818481,
"learning_rate": 3.205e-05,
"loss": 3.5224,
"step": 718
},
{
"epoch": 0.08026429166971914,
"grad_norm": 0.20620097219944,
"learning_rate": 3.2025e-05,
"loss": 3.5665,
"step": 719
},
{
"epoch": 0.08037592489874518,
"grad_norm": 0.20428533852100372,
"learning_rate": 3.2000000000000005e-05,
"loss": 3.6188,
"step": 720
},
{
"epoch": 0.08048755812777121,
"grad_norm": 0.24304233491420746,
"learning_rate": 3.1975e-05,
"loss": 3.5818,
"step": 721
},
{
"epoch": 0.08059919135679725,
"grad_norm": 0.2195928692817688,
"learning_rate": 3.1950000000000004e-05,
"loss": 3.5927,
"step": 722
},
{
"epoch": 0.08071082458582328,
"grad_norm": 0.1901378035545349,
"learning_rate": 3.1925e-05,
"loss": 3.5895,
"step": 723
},
{
"epoch": 0.08082245781484931,
"grad_norm": 0.22191864252090454,
"learning_rate": 3.19e-05,
"loss": 3.6793,
"step": 724
},
{
"epoch": 0.08093409104387535,
"grad_norm": 0.2285040020942688,
"learning_rate": 3.1875e-05,
"loss": 3.5262,
"step": 725
},
{
"epoch": 0.08104572427290138,
"grad_norm": 0.2254706621170044,
"learning_rate": 3.185e-05,
"loss": 3.6232,
"step": 726
},
{
"epoch": 0.08115735750192742,
"grad_norm": 0.2088705450296402,
"learning_rate": 3.1825e-05,
"loss": 3.6469,
"step": 727
},
{
"epoch": 0.08126899073095345,
"grad_norm": 0.21970973908901215,
"learning_rate": 3.18e-05,
"loss": 3.5826,
"step": 728
},
{
"epoch": 0.08138062395997948,
"grad_norm": 0.1970076858997345,
"learning_rate": 3.1775e-05,
"loss": 3.5018,
"step": 729
},
{
"epoch": 0.08149225718900552,
"grad_norm": 0.19966204464435577,
"learning_rate": 3.175e-05,
"loss": 3.5034,
"step": 730
},
{
"epoch": 0.08160389041803155,
"grad_norm": 0.2079876959323883,
"learning_rate": 3.1725e-05,
"loss": 3.5967,
"step": 731
},
{
"epoch": 0.08171552364705759,
"grad_norm": 0.20858098566532135,
"learning_rate": 3.1700000000000005e-05,
"loss": 3.4784,
"step": 732
},
{
"epoch": 0.08182715687608362,
"grad_norm": 0.21335047483444214,
"learning_rate": 3.1675e-05,
"loss": 3.6192,
"step": 733
},
{
"epoch": 0.08193879010510967,
"grad_norm": 0.2188566029071808,
"learning_rate": 3.1650000000000004e-05,
"loss": 3.577,
"step": 734
},
{
"epoch": 0.0820504233341357,
"grad_norm": 0.22537541389465332,
"learning_rate": 3.1624999999999996e-05,
"loss": 3.705,
"step": 735
},
{
"epoch": 0.08216205656316174,
"grad_norm": 0.20943774282932281,
"learning_rate": 3.16e-05,
"loss": 3.5179,
"step": 736
},
{
"epoch": 0.08227368979218777,
"grad_norm": 0.21664151549339294,
"learning_rate": 3.1575e-05,
"loss": 3.5898,
"step": 737
},
{
"epoch": 0.0823853230212138,
"grad_norm": 0.20324793457984924,
"learning_rate": 3.155e-05,
"loss": 3.6199,
"step": 738
},
{
"epoch": 0.08249695625023984,
"grad_norm": 0.22451044619083405,
"learning_rate": 3.1525e-05,
"loss": 3.6476,
"step": 739
},
{
"epoch": 0.08260858947926587,
"grad_norm": 0.2397347092628479,
"learning_rate": 3.15e-05,
"loss": 3.5332,
"step": 740
},
{
"epoch": 0.08272022270829191,
"grad_norm": 0.21358029544353485,
"learning_rate": 3.1475e-05,
"loss": 3.5676,
"step": 741
},
{
"epoch": 0.08283185593731794,
"grad_norm": 0.20642797648906708,
"learning_rate": 3.145e-05,
"loss": 3.5602,
"step": 742
},
{
"epoch": 0.08294348916634398,
"grad_norm": 0.19995102286338806,
"learning_rate": 3.1425e-05,
"loss": 3.583,
"step": 743
},
{
"epoch": 0.08305512239537001,
"grad_norm": 0.20533202588558197,
"learning_rate": 3.1400000000000004e-05,
"loss": 3.6937,
"step": 744
},
{
"epoch": 0.08316675562439604,
"grad_norm": 0.20621930062770844,
"learning_rate": 3.1375e-05,
"loss": 3.5962,
"step": 745
},
{
"epoch": 0.08327838885342208,
"grad_norm": 0.203664168715477,
"learning_rate": 3.135e-05,
"loss": 3.613,
"step": 746
},
{
"epoch": 0.08339002208244811,
"grad_norm": 0.2092573195695877,
"learning_rate": 3.1324999999999996e-05,
"loss": 3.579,
"step": 747
},
{
"epoch": 0.08350165531147415,
"grad_norm": 0.22122147679328918,
"learning_rate": 3.13e-05,
"loss": 3.5755,
"step": 748
},
{
"epoch": 0.08361328854050018,
"grad_norm": 0.2089209109544754,
"learning_rate": 3.1275e-05,
"loss": 3.677,
"step": 749
},
{
"epoch": 0.08372492176952623,
"grad_norm": 0.20402583479881287,
"learning_rate": 3.125e-05,
"loss": 3.534,
"step": 750
},
{
"epoch": 0.08383655499855226,
"grad_norm": 0.22248327732086182,
"learning_rate": 3.122500000000001e-05,
"loss": 3.5395,
"step": 751
},
{
"epoch": 0.0839481882275783,
"grad_norm": 0.20626935362815857,
"learning_rate": 3.12e-05,
"loss": 3.6733,
"step": 752
},
{
"epoch": 0.08405982145660433,
"grad_norm": 0.22046342492103577,
"learning_rate": 3.1175000000000006e-05,
"loss": 3.7321,
"step": 753
},
{
"epoch": 0.08417145468563036,
"grad_norm": 0.1966770589351654,
"learning_rate": 3.115e-05,
"loss": 3.5111,
"step": 754
},
{
"epoch": 0.0842830879146564,
"grad_norm": 0.20537209510803223,
"learning_rate": 3.1125000000000004e-05,
"loss": 3.6667,
"step": 755
},
{
"epoch": 0.08439472114368243,
"grad_norm": 0.1903800368309021,
"learning_rate": 3.1100000000000004e-05,
"loss": 3.539,
"step": 756
},
{
"epoch": 0.08450635437270847,
"grad_norm": 0.21043941378593445,
"learning_rate": 3.1075e-05,
"loss": 3.6112,
"step": 757
},
{
"epoch": 0.0846179876017345,
"grad_norm": 0.21503373980522156,
"learning_rate": 3.105e-05,
"loss": 3.5448,
"step": 758
},
{
"epoch": 0.08472962083076054,
"grad_norm": 0.20530228316783905,
"learning_rate": 3.1025e-05,
"loss": 3.613,
"step": 759
},
{
"epoch": 0.08484125405978657,
"grad_norm": 0.19645661115646362,
"learning_rate": 3.1e-05,
"loss": 3.6007,
"step": 760
},
{
"epoch": 0.0849528872888126,
"grad_norm": 0.21577034890651703,
"learning_rate": 3.0975e-05,
"loss": 3.505,
"step": 761
},
{
"epoch": 0.08506452051783864,
"grad_norm": 0.20609325170516968,
"learning_rate": 3.095e-05,
"loss": 3.422,
"step": 762
},
{
"epoch": 0.08517615374686467,
"grad_norm": 0.23299914598464966,
"learning_rate": 3.0925000000000006e-05,
"loss": 3.6075,
"step": 763
},
{
"epoch": 0.0852877869758907,
"grad_norm": 0.20689697563648224,
"learning_rate": 3.09e-05,
"loss": 3.5514,
"step": 764
},
{
"epoch": 0.08539942020491674,
"grad_norm": 0.20687976479530334,
"learning_rate": 3.0875000000000005e-05,
"loss": 3.5863,
"step": 765
},
{
"epoch": 0.08551105343394277,
"grad_norm": 0.23674167692661285,
"learning_rate": 3.0850000000000004e-05,
"loss": 3.5432,
"step": 766
},
{
"epoch": 0.08562268666296882,
"grad_norm": 0.21824003756046295,
"learning_rate": 3.0825000000000004e-05,
"loss": 3.5684,
"step": 767
},
{
"epoch": 0.08573431989199486,
"grad_norm": 0.19605115056037903,
"learning_rate": 3.08e-05,
"loss": 3.5413,
"step": 768
},
{
"epoch": 0.08584595312102089,
"grad_norm": 0.20351500809192657,
"learning_rate": 3.0775e-05,
"loss": 3.5437,
"step": 769
},
{
"epoch": 0.08595758635004692,
"grad_norm": 0.21889986097812653,
"learning_rate": 3.075e-05,
"loss": 3.5174,
"step": 770
},
{
"epoch": 0.08606921957907296,
"grad_norm": 0.21644869446754456,
"learning_rate": 3.0725e-05,
"loss": 3.5423,
"step": 771
},
{
"epoch": 0.08618085280809899,
"grad_norm": 0.22180013358592987,
"learning_rate": 3.07e-05,
"loss": 3.7087,
"step": 772
},
{
"epoch": 0.08629248603712503,
"grad_norm": 0.21358925104141235,
"learning_rate": 3.067500000000001e-05,
"loss": 3.5255,
"step": 773
},
{
"epoch": 0.08640411926615106,
"grad_norm": 0.21618421375751495,
"learning_rate": 3.065e-05,
"loss": 3.6475,
"step": 774
},
{
"epoch": 0.0865157524951771,
"grad_norm": 0.22747644782066345,
"learning_rate": 3.0625000000000006e-05,
"loss": 3.5514,
"step": 775
},
{
"epoch": 0.08662738572420313,
"grad_norm": 0.2220214605331421,
"learning_rate": 3.06e-05,
"loss": 3.6294,
"step": 776
},
{
"epoch": 0.08673901895322916,
"grad_norm": 0.21085909008979797,
"learning_rate": 3.0575000000000005e-05,
"loss": 3.5649,
"step": 777
},
{
"epoch": 0.0868506521822552,
"grad_norm": 0.21917814016342163,
"learning_rate": 3.0550000000000004e-05,
"loss": 3.4499,
"step": 778
},
{
"epoch": 0.08696228541128123,
"grad_norm": 0.20632682740688324,
"learning_rate": 3.0525e-05,
"loss": 3.5209,
"step": 779
},
{
"epoch": 0.08707391864030727,
"grad_norm": 0.21019919216632843,
"learning_rate": 3.05e-05,
"loss": 3.5406,
"step": 780
},
{
"epoch": 0.0871855518693333,
"grad_norm": 0.2109745293855667,
"learning_rate": 3.0475000000000002e-05,
"loss": 3.6177,
"step": 781
},
{
"epoch": 0.08729718509835933,
"grad_norm": 0.21107935905456543,
"learning_rate": 3.045e-05,
"loss": 3.5587,
"step": 782
},
{
"epoch": 0.08740881832738538,
"grad_norm": 0.22432756423950195,
"learning_rate": 3.0425000000000004e-05,
"loss": 3.5861,
"step": 783
},
{
"epoch": 0.08752045155641142,
"grad_norm": 0.24212072789669037,
"learning_rate": 3.04e-05,
"loss": 3.5443,
"step": 784
},
{
"epoch": 0.08763208478543745,
"grad_norm": 0.21636484563350677,
"learning_rate": 3.0375000000000003e-05,
"loss": 3.6316,
"step": 785
},
{
"epoch": 0.08774371801446348,
"grad_norm": 0.21452303230762482,
"learning_rate": 3.035e-05,
"loss": 3.5652,
"step": 786
},
{
"epoch": 0.08785535124348952,
"grad_norm": 0.21871964633464813,
"learning_rate": 3.0325000000000002e-05,
"loss": 3.5378,
"step": 787
},
{
"epoch": 0.08796698447251555,
"grad_norm": 0.20544585585594177,
"learning_rate": 3.03e-05,
"loss": 3.5809,
"step": 788
},
{
"epoch": 0.08807861770154159,
"grad_norm": 0.22098152339458466,
"learning_rate": 3.0275000000000004e-05,
"loss": 3.5506,
"step": 789
},
{
"epoch": 0.08819025093056762,
"grad_norm": 0.20020943880081177,
"learning_rate": 3.025e-05,
"loss": 3.5057,
"step": 790
},
{
"epoch": 0.08830188415959365,
"grad_norm": 0.2010350227355957,
"learning_rate": 3.0225000000000003e-05,
"loss": 3.5634,
"step": 791
},
{
"epoch": 0.08841351738861969,
"grad_norm": 0.20014971494674683,
"learning_rate": 3.02e-05,
"loss": 3.636,
"step": 792
},
{
"epoch": 0.08852515061764572,
"grad_norm": 0.2196456640958786,
"learning_rate": 3.0175e-05,
"loss": 3.6196,
"step": 793
},
{
"epoch": 0.08863678384667176,
"grad_norm": 0.2147689312696457,
"learning_rate": 3.015e-05,
"loss": 3.5438,
"step": 794
},
{
"epoch": 0.08874841707569779,
"grad_norm": 0.20580841600894928,
"learning_rate": 3.0125000000000004e-05,
"loss": 3.6649,
"step": 795
},
{
"epoch": 0.08886005030472383,
"grad_norm": 0.1942083239555359,
"learning_rate": 3.01e-05,
"loss": 3.5551,
"step": 796
},
{
"epoch": 0.08897168353374986,
"grad_norm": 0.29285910725593567,
"learning_rate": 3.0075000000000003e-05,
"loss": 3.619,
"step": 797
},
{
"epoch": 0.0890833167627759,
"grad_norm": 0.22100916504859924,
"learning_rate": 3.0050000000000002e-05,
"loss": 3.6067,
"step": 798
},
{
"epoch": 0.08919494999180193,
"grad_norm": 0.22875243425369263,
"learning_rate": 3.0025000000000005e-05,
"loss": 3.5509,
"step": 799
},
{
"epoch": 0.08930658322082798,
"grad_norm": 0.2442617416381836,
"learning_rate": 3e-05,
"loss": 3.5261,
"step": 800
},
{
"epoch": 0.08941821644985401,
"grad_norm": 0.1970231831073761,
"learning_rate": 2.9975000000000004e-05,
"loss": 3.6594,
"step": 801
},
{
"epoch": 0.08952984967888004,
"grad_norm": 0.2086515575647354,
"learning_rate": 2.995e-05,
"loss": 3.5652,
"step": 802
},
{
"epoch": 0.08964148290790608,
"grad_norm": 0.2005903422832489,
"learning_rate": 2.9925000000000002e-05,
"loss": 3.6021,
"step": 803
},
{
"epoch": 0.08975311613693211,
"grad_norm": 0.22656391561031342,
"learning_rate": 2.9900000000000002e-05,
"loss": 3.5465,
"step": 804
},
{
"epoch": 0.08986474936595815,
"grad_norm": 0.2097763866186142,
"learning_rate": 2.9875000000000004e-05,
"loss": 3.6744,
"step": 805
},
{
"epoch": 0.08997638259498418,
"grad_norm": 0.21133223176002502,
"learning_rate": 2.985e-05,
"loss": 3.5463,
"step": 806
},
{
"epoch": 0.09008801582401021,
"grad_norm": 0.20539484918117523,
"learning_rate": 2.9825000000000003e-05,
"loss": 3.5978,
"step": 807
},
{
"epoch": 0.09019964905303625,
"grad_norm": 0.20438183844089508,
"learning_rate": 2.98e-05,
"loss": 3.6513,
"step": 808
},
{
"epoch": 0.09031128228206228,
"grad_norm": 0.21557852625846863,
"learning_rate": 2.9775000000000002e-05,
"loss": 3.6008,
"step": 809
},
{
"epoch": 0.09042291551108832,
"grad_norm": 0.1979275941848755,
"learning_rate": 2.975e-05,
"loss": 3.6066,
"step": 810
},
{
"epoch": 0.09053454874011435,
"grad_norm": 0.2179822027683258,
"learning_rate": 2.9725000000000004e-05,
"loss": 3.5869,
"step": 811
},
{
"epoch": 0.09064618196914039,
"grad_norm": 0.20712365210056305,
"learning_rate": 2.97e-05,
"loss": 3.4057,
"step": 812
},
{
"epoch": 0.09075781519816642,
"grad_norm": 0.21529102325439453,
"learning_rate": 2.9675000000000003e-05,
"loss": 3.6645,
"step": 813
},
{
"epoch": 0.09086944842719245,
"grad_norm": 0.20569564402103424,
"learning_rate": 2.965e-05,
"loss": 3.4961,
"step": 814
},
{
"epoch": 0.09098108165621849,
"grad_norm": 0.2132355272769928,
"learning_rate": 2.9625000000000002e-05,
"loss": 3.5372,
"step": 815
},
{
"epoch": 0.09109271488524454,
"grad_norm": 0.19764548540115356,
"learning_rate": 2.96e-05,
"loss": 3.6608,
"step": 816
},
{
"epoch": 0.09120434811427057,
"grad_norm": 0.24116063117980957,
"learning_rate": 2.9575000000000004e-05,
"loss": 3.62,
"step": 817
},
{
"epoch": 0.0913159813432966,
"grad_norm": 0.23692041635513306,
"learning_rate": 2.955e-05,
"loss": 3.5035,
"step": 818
},
{
"epoch": 0.09142761457232264,
"grad_norm": 0.20293064415454865,
"learning_rate": 2.9525000000000003e-05,
"loss": 3.5916,
"step": 819
},
{
"epoch": 0.09153924780134867,
"grad_norm": 0.21626363694667816,
"learning_rate": 2.95e-05,
"loss": 3.5288,
"step": 820
},
{
"epoch": 0.0916508810303747,
"grad_norm": 0.2050466388463974,
"learning_rate": 2.9475e-05,
"loss": 3.5985,
"step": 821
},
{
"epoch": 0.09176251425940074,
"grad_norm": 0.21667271852493286,
"learning_rate": 2.945e-05,
"loss": 3.6129,
"step": 822
},
{
"epoch": 0.09187414748842677,
"grad_norm": 0.1966468244791031,
"learning_rate": 2.9425000000000004e-05,
"loss": 3.5927,
"step": 823
},
{
"epoch": 0.09198578071745281,
"grad_norm": 0.22131453454494476,
"learning_rate": 2.94e-05,
"loss": 3.5661,
"step": 824
},
{
"epoch": 0.09209741394647884,
"grad_norm": 0.22024045884609222,
"learning_rate": 2.9375000000000003e-05,
"loss": 3.5848,
"step": 825
},
{
"epoch": 0.09220904717550488,
"grad_norm": 0.22097060084342957,
"learning_rate": 2.935e-05,
"loss": 3.6131,
"step": 826
},
{
"epoch": 0.09232068040453091,
"grad_norm": 0.2158297598361969,
"learning_rate": 2.9325e-05,
"loss": 3.5683,
"step": 827
},
{
"epoch": 0.09243231363355695,
"grad_norm": 0.2036168873310089,
"learning_rate": 2.93e-05,
"loss": 3.6495,
"step": 828
},
{
"epoch": 0.09254394686258298,
"grad_norm": 0.20106995105743408,
"learning_rate": 2.9275000000000003e-05,
"loss": 3.5471,
"step": 829
},
{
"epoch": 0.09265558009160901,
"grad_norm": 0.20637951791286469,
"learning_rate": 2.925e-05,
"loss": 3.5929,
"step": 830
},
{
"epoch": 0.09276721332063505,
"grad_norm": 0.19301024079322815,
"learning_rate": 2.9225000000000002e-05,
"loss": 3.5408,
"step": 831
},
{
"epoch": 0.09287884654966108,
"grad_norm": 0.21553462743759155,
"learning_rate": 2.9199999999999998e-05,
"loss": 3.5882,
"step": 832
},
{
"epoch": 0.09299047977868713,
"grad_norm": 0.2115125209093094,
"learning_rate": 2.9175e-05,
"loss": 3.5019,
"step": 833
},
{
"epoch": 0.09310211300771316,
"grad_norm": 0.218712717294693,
"learning_rate": 2.915e-05,
"loss": 3.6261,
"step": 834
},
{
"epoch": 0.0932137462367392,
"grad_norm": 0.20960000157356262,
"learning_rate": 2.9125000000000003e-05,
"loss": 3.4769,
"step": 835
},
{
"epoch": 0.09332537946576523,
"grad_norm": 0.19260746240615845,
"learning_rate": 2.91e-05,
"loss": 3.5662,
"step": 836
},
{
"epoch": 0.09343701269479127,
"grad_norm": 0.2303057610988617,
"learning_rate": 2.9075000000000002e-05,
"loss": 3.5079,
"step": 837
},
{
"epoch": 0.0935486459238173,
"grad_norm": 0.2179318219423294,
"learning_rate": 2.9049999999999998e-05,
"loss": 3.54,
"step": 838
},
{
"epoch": 0.09366027915284333,
"grad_norm": 0.20790190994739532,
"learning_rate": 2.9025e-05,
"loss": 3.589,
"step": 839
},
{
"epoch": 0.09377191238186937,
"grad_norm": 0.2073325216770172,
"learning_rate": 2.9e-05,
"loss": 3.6382,
"step": 840
},
{
"epoch": 0.0938835456108954,
"grad_norm": 0.2231975644826889,
"learning_rate": 2.8975000000000003e-05,
"loss": 3.7686,
"step": 841
},
{
"epoch": 0.09399517883992144,
"grad_norm": 0.21642033755779266,
"learning_rate": 2.895e-05,
"loss": 3.5536,
"step": 842
},
{
"epoch": 0.09410681206894747,
"grad_norm": 0.22024497389793396,
"learning_rate": 2.8925000000000002e-05,
"loss": 3.4644,
"step": 843
},
{
"epoch": 0.0942184452979735,
"grad_norm": 0.2281215786933899,
"learning_rate": 2.8899999999999998e-05,
"loss": 3.5195,
"step": 844
},
{
"epoch": 0.09433007852699954,
"grad_norm": 0.2216193825006485,
"learning_rate": 2.8875e-05,
"loss": 3.4843,
"step": 845
},
{
"epoch": 0.09444171175602557,
"grad_norm": 0.2158605009317398,
"learning_rate": 2.885e-05,
"loss": 3.5457,
"step": 846
},
{
"epoch": 0.09455334498505161,
"grad_norm": 0.20994016528129578,
"learning_rate": 2.8825000000000003e-05,
"loss": 3.5348,
"step": 847
},
{
"epoch": 0.09466497821407764,
"grad_norm": 0.19853565096855164,
"learning_rate": 2.88e-05,
"loss": 3.5193,
"step": 848
},
{
"epoch": 0.09477661144310369,
"grad_norm": 0.20257002115249634,
"learning_rate": 2.8775e-05,
"loss": 3.5974,
"step": 849
},
{
"epoch": 0.09488824467212972,
"grad_norm": 0.2186659574508667,
"learning_rate": 2.8749999999999997e-05,
"loss": 3.5502,
"step": 850
},
{
"epoch": 0.09499987790115576,
"grad_norm": 0.20825856924057007,
"learning_rate": 2.8725e-05,
"loss": 3.5458,
"step": 851
},
{
"epoch": 0.09511151113018179,
"grad_norm": 0.20816844701766968,
"learning_rate": 2.87e-05,
"loss": 3.6357,
"step": 852
},
{
"epoch": 0.09522314435920783,
"grad_norm": 0.1968451589345932,
"learning_rate": 2.8675000000000002e-05,
"loss": 3.5206,
"step": 853
},
{
"epoch": 0.09533477758823386,
"grad_norm": 0.21720953285694122,
"learning_rate": 2.865e-05,
"loss": 3.5595,
"step": 854
},
{
"epoch": 0.0954464108172599,
"grad_norm": 0.21327778697013855,
"learning_rate": 2.8625e-05,
"loss": 3.5749,
"step": 855
},
{
"epoch": 0.09555804404628593,
"grad_norm": 0.20327028632164001,
"learning_rate": 2.86e-05,
"loss": 3.4917,
"step": 856
},
{
"epoch": 0.09566967727531196,
"grad_norm": 0.2028653770685196,
"learning_rate": 2.8575000000000003e-05,
"loss": 3.5359,
"step": 857
},
{
"epoch": 0.095781310504338,
"grad_norm": 0.19929373264312744,
"learning_rate": 2.855e-05,
"loss": 3.5643,
"step": 858
},
{
"epoch": 0.09589294373336403,
"grad_norm": 0.2242179661989212,
"learning_rate": 2.8525000000000002e-05,
"loss": 3.5903,
"step": 859
},
{
"epoch": 0.09600457696239006,
"grad_norm": 0.2157723605632782,
"learning_rate": 2.8499999999999998e-05,
"loss": 3.597,
"step": 860
},
{
"epoch": 0.0961162101914161,
"grad_norm": 0.20756587386131287,
"learning_rate": 2.8475e-05,
"loss": 3.5556,
"step": 861
},
{
"epoch": 0.09622784342044213,
"grad_norm": 0.2240949124097824,
"learning_rate": 2.845e-05,
"loss": 3.5269,
"step": 862
},
{
"epoch": 0.09633947664946817,
"grad_norm": 0.21741090714931488,
"learning_rate": 2.8425000000000003e-05,
"loss": 3.5865,
"step": 863
},
{
"epoch": 0.0964511098784942,
"grad_norm": 0.2230018675327301,
"learning_rate": 2.84e-05,
"loss": 3.6416,
"step": 864
},
{
"epoch": 0.09656274310752025,
"grad_norm": 0.19828388094902039,
"learning_rate": 2.8375000000000002e-05,
"loss": 3.5352,
"step": 865
},
{
"epoch": 0.09667437633654628,
"grad_norm": 0.19198822975158691,
"learning_rate": 2.8349999999999998e-05,
"loss": 3.4927,
"step": 866
},
{
"epoch": 0.09678600956557232,
"grad_norm": 0.20700480043888092,
"learning_rate": 2.8325e-05,
"loss": 3.6027,
"step": 867
},
{
"epoch": 0.09689764279459835,
"grad_norm": 0.20969006419181824,
"learning_rate": 2.83e-05,
"loss": 3.5348,
"step": 868
},
{
"epoch": 0.09700927602362439,
"grad_norm": 0.21899078786373138,
"learning_rate": 2.8275000000000003e-05,
"loss": 3.6044,
"step": 869
},
{
"epoch": 0.09712090925265042,
"grad_norm": 0.2017839103937149,
"learning_rate": 2.825e-05,
"loss": 3.6093,
"step": 870
},
{
"epoch": 0.09723254248167645,
"grad_norm": 0.20474469661712646,
"learning_rate": 2.8225e-05,
"loss": 3.5682,
"step": 871
},
{
"epoch": 0.09734417571070249,
"grad_norm": 0.20501159131526947,
"learning_rate": 2.8199999999999998e-05,
"loss": 3.5023,
"step": 872
},
{
"epoch": 0.09745580893972852,
"grad_norm": 0.20929984748363495,
"learning_rate": 2.8175e-05,
"loss": 3.5169,
"step": 873
},
{
"epoch": 0.09756744216875456,
"grad_norm": 0.21647411584854126,
"learning_rate": 2.815e-05,
"loss": 3.6719,
"step": 874
},
{
"epoch": 0.09767907539778059,
"grad_norm": 0.2262195199728012,
"learning_rate": 2.8125000000000003e-05,
"loss": 3.6392,
"step": 875
},
{
"epoch": 0.09779070862680662,
"grad_norm": 0.2187419831752777,
"learning_rate": 2.8100000000000005e-05,
"loss": 3.5012,
"step": 876
},
{
"epoch": 0.09790234185583266,
"grad_norm": 0.20536285638809204,
"learning_rate": 2.8075e-05,
"loss": 3.5809,
"step": 877
},
{
"epoch": 0.09801397508485869,
"grad_norm": 0.20504502952098846,
"learning_rate": 2.8050000000000004e-05,
"loss": 3.5897,
"step": 878
},
{
"epoch": 0.09812560831388473,
"grad_norm": 0.2219771295785904,
"learning_rate": 2.8025e-05,
"loss": 3.6262,
"step": 879
},
{
"epoch": 0.09823724154291076,
"grad_norm": 0.2204463630914688,
"learning_rate": 2.8000000000000003e-05,
"loss": 3.5202,
"step": 880
},
{
"epoch": 0.0983488747719368,
"grad_norm": 0.20425459742546082,
"learning_rate": 2.7975000000000002e-05,
"loss": 3.63,
"step": 881
},
{
"epoch": 0.09846050800096284,
"grad_norm": 0.20711715519428253,
"learning_rate": 2.7950000000000005e-05,
"loss": 3.4147,
"step": 882
},
{
"epoch": 0.09857214122998888,
"grad_norm": 0.21784266829490662,
"learning_rate": 2.7925e-05,
"loss": 3.5869,
"step": 883
},
{
"epoch": 0.09868377445901491,
"grad_norm": 0.2048199325799942,
"learning_rate": 2.7900000000000004e-05,
"loss": 3.5756,
"step": 884
},
{
"epoch": 0.09879540768804095,
"grad_norm": 0.20878691971302032,
"learning_rate": 2.7875e-05,
"loss": 3.6429,
"step": 885
},
{
"epoch": 0.09890704091706698,
"grad_norm": 0.21570119261741638,
"learning_rate": 2.7850000000000003e-05,
"loss": 3.5949,
"step": 886
},
{
"epoch": 0.09901867414609301,
"grad_norm": 0.21512436866760254,
"learning_rate": 2.7825000000000002e-05,
"loss": 3.6499,
"step": 887
},
{
"epoch": 0.09913030737511905,
"grad_norm": 0.21218284964561462,
"learning_rate": 2.7800000000000005e-05,
"loss": 3.5337,
"step": 888
},
{
"epoch": 0.09924194060414508,
"grad_norm": 0.21699771285057068,
"learning_rate": 2.7775e-05,
"loss": 3.4743,
"step": 889
},
{
"epoch": 0.09935357383317112,
"grad_norm": 0.20775876939296722,
"learning_rate": 2.7750000000000004e-05,
"loss": 3.6219,
"step": 890
},
{
"epoch": 0.09946520706219715,
"grad_norm": 0.22210370004177094,
"learning_rate": 2.7725e-05,
"loss": 3.5626,
"step": 891
},
{
"epoch": 0.09957684029122318,
"grad_norm": 0.21089830994606018,
"learning_rate": 2.7700000000000002e-05,
"loss": 3.6357,
"step": 892
},
{
"epoch": 0.09968847352024922,
"grad_norm": 0.22170381247997284,
"learning_rate": 2.7675000000000002e-05,
"loss": 3.6639,
"step": 893
},
{
"epoch": 0.09980010674927525,
"grad_norm": 0.20654913783073425,
"learning_rate": 2.7650000000000005e-05,
"loss": 3.6904,
"step": 894
},
{
"epoch": 0.09991173997830129,
"grad_norm": 0.2104388326406479,
"learning_rate": 2.7625e-05,
"loss": 3.644,
"step": 895
},
{
"epoch": 0.10002337320732732,
"grad_norm": 0.2188648134469986,
"learning_rate": 2.7600000000000003e-05,
"loss": 3.5624,
"step": 896
},
{
"epoch": 0.10013500643635335,
"grad_norm": 0.2162526249885559,
"learning_rate": 2.7575e-05,
"loss": 3.5387,
"step": 897
},
{
"epoch": 0.1002466396653794,
"grad_norm": 0.20775236189365387,
"learning_rate": 2.7550000000000002e-05,
"loss": 3.5958,
"step": 898
},
{
"epoch": 0.10035827289440544,
"grad_norm": 0.21890074014663696,
"learning_rate": 2.7525e-05,
"loss": 3.455,
"step": 899
},
{
"epoch": 0.10046990612343147,
"grad_norm": 0.21309438347816467,
"learning_rate": 2.7500000000000004e-05,
"loss": 3.578,
"step": 900
},
{
"epoch": 0.1005815393524575,
"grad_norm": 0.21553517878055573,
"learning_rate": 2.7475e-05,
"loss": 3.4719,
"step": 901
},
{
"epoch": 0.10069317258148354,
"grad_norm": 0.22065530717372894,
"learning_rate": 2.7450000000000003e-05,
"loss": 3.6469,
"step": 902
},
{
"epoch": 0.10080480581050957,
"grad_norm": 0.2120080441236496,
"learning_rate": 2.7425e-05,
"loss": 3.6787,
"step": 903
},
{
"epoch": 0.10091643903953561,
"grad_norm": 0.20645517110824585,
"learning_rate": 2.7400000000000002e-05,
"loss": 3.6039,
"step": 904
},
{
"epoch": 0.10102807226856164,
"grad_norm": 0.2203800231218338,
"learning_rate": 2.7375e-05,
"loss": 3.5199,
"step": 905
},
{
"epoch": 0.10113970549758768,
"grad_norm": 0.20567728579044342,
"learning_rate": 2.7350000000000004e-05,
"loss": 3.5771,
"step": 906
},
{
"epoch": 0.10125133872661371,
"grad_norm": 0.22025975584983826,
"learning_rate": 2.7325e-05,
"loss": 3.5693,
"step": 907
},
{
"epoch": 0.10136297195563974,
"grad_norm": 0.22443410754203796,
"learning_rate": 2.7300000000000003e-05,
"loss": 3.499,
"step": 908
},
{
"epoch": 0.10147460518466578,
"grad_norm": 0.22537918388843536,
"learning_rate": 2.7275e-05,
"loss": 3.5203,
"step": 909
},
{
"epoch": 0.10158623841369181,
"grad_norm": 0.22454096376895905,
"learning_rate": 2.725e-05,
"loss": 3.4862,
"step": 910
},
{
"epoch": 0.10169787164271785,
"grad_norm": 0.2018805742263794,
"learning_rate": 2.7225e-05,
"loss": 3.5734,
"step": 911
},
{
"epoch": 0.10180950487174388,
"grad_norm": 0.21323262155056,
"learning_rate": 2.7200000000000004e-05,
"loss": 3.586,
"step": 912
},
{
"epoch": 0.10192113810076991,
"grad_norm": 0.2109421044588089,
"learning_rate": 2.7175e-05,
"loss": 3.5609,
"step": 913
},
{
"epoch": 0.10203277132979595,
"grad_norm": 0.21333380043506622,
"learning_rate": 2.7150000000000003e-05,
"loss": 3.5167,
"step": 914
},
{
"epoch": 0.102144404558822,
"grad_norm": 0.21649768948554993,
"learning_rate": 2.7125000000000002e-05,
"loss": 3.5431,
"step": 915
},
{
"epoch": 0.10225603778784803,
"grad_norm": 0.22201687097549438,
"learning_rate": 2.7100000000000005e-05,
"loss": 3.6262,
"step": 916
},
{
"epoch": 0.10236767101687407,
"grad_norm": 0.2086552381515503,
"learning_rate": 2.7075e-05,
"loss": 3.6113,
"step": 917
},
{
"epoch": 0.1024793042459001,
"grad_norm": 0.19474974274635315,
"learning_rate": 2.7050000000000004e-05,
"loss": 3.5955,
"step": 918
},
{
"epoch": 0.10259093747492613,
"grad_norm": 0.24013975262641907,
"learning_rate": 2.7025e-05,
"loss": 3.4309,
"step": 919
},
{
"epoch": 0.10270257070395217,
"grad_norm": 0.211077019572258,
"learning_rate": 2.7000000000000002e-05,
"loss": 3.5708,
"step": 920
},
{
"epoch": 0.1028142039329782,
"grad_norm": 0.21628805994987488,
"learning_rate": 2.6975000000000002e-05,
"loss": 3.5718,
"step": 921
},
{
"epoch": 0.10292583716200424,
"grad_norm": 0.22247810661792755,
"learning_rate": 2.6950000000000005e-05,
"loss": 3.517,
"step": 922
},
{
"epoch": 0.10303747039103027,
"grad_norm": 0.2166246920824051,
"learning_rate": 2.6925e-05,
"loss": 3.5201,
"step": 923
},
{
"epoch": 0.1031491036200563,
"grad_norm": 0.22399188578128815,
"learning_rate": 2.6900000000000003e-05,
"loss": 3.5311,
"step": 924
},
{
"epoch": 0.10326073684908234,
"grad_norm": 0.20243695378303528,
"learning_rate": 2.6875e-05,
"loss": 3.5067,
"step": 925
},
{
"epoch": 0.10337237007810837,
"grad_norm": 0.2083643674850464,
"learning_rate": 2.6850000000000002e-05,
"loss": 3.6032,
"step": 926
},
{
"epoch": 0.1034840033071344,
"grad_norm": 0.20954583585262299,
"learning_rate": 2.6825e-05,
"loss": 3.6164,
"step": 927
},
{
"epoch": 0.10359563653616044,
"grad_norm": 0.19354793429374695,
"learning_rate": 2.6800000000000004e-05,
"loss": 3.5979,
"step": 928
},
{
"epoch": 0.10370726976518647,
"grad_norm": 0.20499737560749054,
"learning_rate": 2.6775e-05,
"loss": 3.5708,
"step": 929
},
{
"epoch": 0.10381890299421251,
"grad_norm": 0.2208271622657776,
"learning_rate": 2.6750000000000003e-05,
"loss": 3.5369,
"step": 930
},
{
"epoch": 0.10393053622323856,
"grad_norm": 0.23384785652160645,
"learning_rate": 2.6725e-05,
"loss": 3.6815,
"step": 931
},
{
"epoch": 0.10404216945226459,
"grad_norm": 0.22724102437496185,
"learning_rate": 2.6700000000000002e-05,
"loss": 3.455,
"step": 932
},
{
"epoch": 0.10415380268129062,
"grad_norm": 0.20938707888126373,
"learning_rate": 2.6675e-05,
"loss": 3.5502,
"step": 933
},
{
"epoch": 0.10426543591031666,
"grad_norm": 0.21530798077583313,
"learning_rate": 2.6650000000000004e-05,
"loss": 3.548,
"step": 934
},
{
"epoch": 0.10437706913934269,
"grad_norm": 0.19547294080257416,
"learning_rate": 2.6625e-05,
"loss": 3.6098,
"step": 935
},
{
"epoch": 0.10448870236836873,
"grad_norm": 0.21658943593502045,
"learning_rate": 2.6600000000000003e-05,
"loss": 3.5294,
"step": 936
},
{
"epoch": 0.10460033559739476,
"grad_norm": 0.21133162081241608,
"learning_rate": 2.6575e-05,
"loss": 3.4385,
"step": 937
},
{
"epoch": 0.1047119688264208,
"grad_norm": 0.21791619062423706,
"learning_rate": 2.655e-05,
"loss": 3.6282,
"step": 938
},
{
"epoch": 0.10482360205544683,
"grad_norm": 0.20954184234142303,
"learning_rate": 2.6525e-05,
"loss": 3.588,
"step": 939
},
{
"epoch": 0.10493523528447286,
"grad_norm": 0.22659572958946228,
"learning_rate": 2.6500000000000004e-05,
"loss": 3.6033,
"step": 940
},
{
"epoch": 0.1050468685134989,
"grad_norm": 0.19709984958171844,
"learning_rate": 2.6475e-05,
"loss": 3.4595,
"step": 941
},
{
"epoch": 0.10515850174252493,
"grad_norm": 0.2094334065914154,
"learning_rate": 2.6450000000000003e-05,
"loss": 3.5436,
"step": 942
},
{
"epoch": 0.10527013497155097,
"grad_norm": 0.2036394476890564,
"learning_rate": 2.6425e-05,
"loss": 3.6691,
"step": 943
},
{
"epoch": 0.105381768200577,
"grad_norm": 0.23410657048225403,
"learning_rate": 2.64e-05,
"loss": 3.5435,
"step": 944
},
{
"epoch": 0.10549340142960303,
"grad_norm": 0.20020778477191925,
"learning_rate": 2.6375e-05,
"loss": 3.4263,
"step": 945
},
{
"epoch": 0.10560503465862907,
"grad_norm": 0.21765069663524628,
"learning_rate": 2.6350000000000004e-05,
"loss": 3.6565,
"step": 946
},
{
"epoch": 0.1057166678876551,
"grad_norm": 0.2083187699317932,
"learning_rate": 2.6325e-05,
"loss": 3.6342,
"step": 947
},
{
"epoch": 0.10582830111668115,
"grad_norm": 0.22899410128593445,
"learning_rate": 2.6300000000000002e-05,
"loss": 3.6186,
"step": 948
},
{
"epoch": 0.10593993434570718,
"grad_norm": 0.21531718969345093,
"learning_rate": 2.6275e-05,
"loss": 3.649,
"step": 949
},
{
"epoch": 0.10605156757473322,
"grad_norm": 0.20011349022388458,
"learning_rate": 2.625e-05,
"loss": 3.6019,
"step": 950
},
{
"epoch": 0.10616320080375925,
"grad_norm": 0.2076261192560196,
"learning_rate": 2.6225e-05,
"loss": 3.5969,
"step": 951
},
{
"epoch": 0.10627483403278529,
"grad_norm": 0.21286866068840027,
"learning_rate": 2.6200000000000003e-05,
"loss": 3.5662,
"step": 952
},
{
"epoch": 0.10638646726181132,
"grad_norm": 0.21730615198612213,
"learning_rate": 2.6175e-05,
"loss": 3.6115,
"step": 953
},
{
"epoch": 0.10649810049083736,
"grad_norm": 0.2095257192850113,
"learning_rate": 2.6150000000000002e-05,
"loss": 3.5406,
"step": 954
},
{
"epoch": 0.10660973371986339,
"grad_norm": 0.22268979251384735,
"learning_rate": 2.6124999999999998e-05,
"loss": 3.6423,
"step": 955
},
{
"epoch": 0.10672136694888942,
"grad_norm": 0.19767190515995026,
"learning_rate": 2.61e-05,
"loss": 3.4876,
"step": 956
},
{
"epoch": 0.10683300017791546,
"grad_norm": 0.21027278900146484,
"learning_rate": 2.6075e-05,
"loss": 3.6007,
"step": 957
},
{
"epoch": 0.10694463340694149,
"grad_norm": 0.21084555983543396,
"learning_rate": 2.6050000000000003e-05,
"loss": 3.6006,
"step": 958
},
{
"epoch": 0.10705626663596753,
"grad_norm": 0.22369611263275146,
"learning_rate": 2.6025e-05,
"loss": 3.5232,
"step": 959
},
{
"epoch": 0.10716789986499356,
"grad_norm": 0.21574999392032623,
"learning_rate": 2.6000000000000002e-05,
"loss": 3.5445,
"step": 960
},
{
"epoch": 0.1072795330940196,
"grad_norm": 0.21067297458648682,
"learning_rate": 2.5974999999999998e-05,
"loss": 3.4656,
"step": 961
},
{
"epoch": 0.10739116632304563,
"grad_norm": 0.21689800918102264,
"learning_rate": 2.595e-05,
"loss": 3.4898,
"step": 962
},
{
"epoch": 0.10750279955207166,
"grad_norm": 0.2273510992527008,
"learning_rate": 2.5925e-05,
"loss": 3.4481,
"step": 963
},
{
"epoch": 0.10761443278109771,
"grad_norm": 0.23203350603580475,
"learning_rate": 2.5900000000000003e-05,
"loss": 3.6151,
"step": 964
},
{
"epoch": 0.10772606601012374,
"grad_norm": 0.23211504518985748,
"learning_rate": 2.5875e-05,
"loss": 3.514,
"step": 965
},
{
"epoch": 0.10783769923914978,
"grad_norm": 0.2149866223335266,
"learning_rate": 2.585e-05,
"loss": 3.5996,
"step": 966
},
{
"epoch": 0.10794933246817581,
"grad_norm": 0.20356595516204834,
"learning_rate": 2.5824999999999998e-05,
"loss": 3.5729,
"step": 967
},
{
"epoch": 0.10806096569720185,
"grad_norm": 0.21406051516532898,
"learning_rate": 2.58e-05,
"loss": 3.5708,
"step": 968
},
{
"epoch": 0.10817259892622788,
"grad_norm": 0.2657632827758789,
"learning_rate": 2.5775e-05,
"loss": 3.5776,
"step": 969
},
{
"epoch": 0.10828423215525391,
"grad_norm": 0.22186456620693207,
"learning_rate": 2.5750000000000002e-05,
"loss": 3.5773,
"step": 970
},
{
"epoch": 0.10839586538427995,
"grad_norm": 0.20678916573524475,
"learning_rate": 2.5725e-05,
"loss": 3.5758,
"step": 971
},
{
"epoch": 0.10850749861330598,
"grad_norm": 0.21337588131427765,
"learning_rate": 2.57e-05,
"loss": 3.6108,
"step": 972
},
{
"epoch": 0.10861913184233202,
"grad_norm": 0.22611363232135773,
"learning_rate": 2.5675e-05,
"loss": 3.5212,
"step": 973
},
{
"epoch": 0.10873076507135805,
"grad_norm": 0.18545237183570862,
"learning_rate": 2.5650000000000003e-05,
"loss": 3.5526,
"step": 974
},
{
"epoch": 0.10884239830038409,
"grad_norm": 0.21344155073165894,
"learning_rate": 2.5625e-05,
"loss": 3.5118,
"step": 975
},
{
"epoch": 0.10895403152941012,
"grad_norm": 0.2155769020318985,
"learning_rate": 2.5600000000000002e-05,
"loss": 3.5866,
"step": 976
},
{
"epoch": 0.10906566475843615,
"grad_norm": 0.21503998339176178,
"learning_rate": 2.5574999999999998e-05,
"loss": 3.6141,
"step": 977
},
{
"epoch": 0.10917729798746219,
"grad_norm": 0.20480018854141235,
"learning_rate": 2.555e-05,
"loss": 3.6353,
"step": 978
},
{
"epoch": 0.10928893121648822,
"grad_norm": 0.22992366552352905,
"learning_rate": 2.5525e-05,
"loss": 3.4979,
"step": 979
},
{
"epoch": 0.10940056444551426,
"grad_norm": 0.20455677807331085,
"learning_rate": 2.5500000000000003e-05,
"loss": 3.5722,
"step": 980
},
{
"epoch": 0.1095121976745403,
"grad_norm": 0.22036480903625488,
"learning_rate": 2.5475e-05,
"loss": 3.4976,
"step": 981
},
{
"epoch": 0.10962383090356634,
"grad_norm": 0.21743951737880707,
"learning_rate": 2.5450000000000002e-05,
"loss": 3.6067,
"step": 982
},
{
"epoch": 0.10973546413259237,
"grad_norm": 0.2102222740650177,
"learning_rate": 2.5424999999999998e-05,
"loss": 3.5195,
"step": 983
},
{
"epoch": 0.1098470973616184,
"grad_norm": 0.22581855952739716,
"learning_rate": 2.54e-05,
"loss": 3.6253,
"step": 984
},
{
"epoch": 0.10995873059064444,
"grad_norm": 0.22696761786937714,
"learning_rate": 2.5375e-05,
"loss": 3.5024,
"step": 985
},
{
"epoch": 0.11007036381967047,
"grad_norm": 0.2530415952205658,
"learning_rate": 2.5350000000000003e-05,
"loss": 3.5781,
"step": 986
},
{
"epoch": 0.11018199704869651,
"grad_norm": 0.20334866642951965,
"learning_rate": 2.5325e-05,
"loss": 3.5655,
"step": 987
},
{
"epoch": 0.11029363027772254,
"grad_norm": 0.21357102692127228,
"learning_rate": 2.5300000000000002e-05,
"loss": 3.5622,
"step": 988
},
{
"epoch": 0.11040526350674858,
"grad_norm": 0.2151983678340912,
"learning_rate": 2.5274999999999998e-05,
"loss": 3.5925,
"step": 989
},
{
"epoch": 0.11051689673577461,
"grad_norm": 0.22665520012378693,
"learning_rate": 2.525e-05,
"loss": 3.6549,
"step": 990
},
{
"epoch": 0.11062852996480065,
"grad_norm": 0.2117994874715805,
"learning_rate": 2.5225e-05,
"loss": 3.573,
"step": 991
},
{
"epoch": 0.11074016319382668,
"grad_norm": 0.21009349822998047,
"learning_rate": 2.5200000000000003e-05,
"loss": 3.6013,
"step": 992
},
{
"epoch": 0.11085179642285271,
"grad_norm": 0.19128143787384033,
"learning_rate": 2.5175e-05,
"loss": 3.4833,
"step": 993
},
{
"epoch": 0.11096342965187875,
"grad_norm": 0.22493302822113037,
"learning_rate": 2.515e-05,
"loss": 3.559,
"step": 994
},
{
"epoch": 0.11107506288090478,
"grad_norm": 0.227183535695076,
"learning_rate": 2.5124999999999997e-05,
"loss": 3.5789,
"step": 995
},
{
"epoch": 0.11118669610993082,
"grad_norm": 0.20829260349273682,
"learning_rate": 2.51e-05,
"loss": 3.5505,
"step": 996
},
{
"epoch": 0.11129832933895686,
"grad_norm": 0.2089599072933197,
"learning_rate": 2.5075e-05,
"loss": 3.5832,
"step": 997
},
{
"epoch": 0.1114099625679829,
"grad_norm": 0.2007664442062378,
"learning_rate": 2.5050000000000002e-05,
"loss": 3.6101,
"step": 998
},
{
"epoch": 0.11152159579700893,
"grad_norm": 0.2141019105911255,
"learning_rate": 2.5025e-05,
"loss": 3.5275,
"step": 999
},
{
"epoch": 0.11163322902603497,
"grad_norm": 0.22629062831401825,
"learning_rate": 2.5e-05,
"loss": 3.6387,
"step": 1000
},
{
"epoch": 0.111744862255061,
"grad_norm": 0.20573370158672333,
"learning_rate": 2.4975e-05,
"loss": 3.43,
"step": 1001
},
{
"epoch": 0.11185649548408703,
"grad_norm": 0.2253374457359314,
"learning_rate": 2.495e-05,
"loss": 3.4178,
"step": 1002
},
{
"epoch": 0.11196812871311307,
"grad_norm": 0.22604350745677948,
"learning_rate": 2.4925000000000003e-05,
"loss": 3.593,
"step": 1003
},
{
"epoch": 0.1120797619421391,
"grad_norm": 0.21653111279010773,
"learning_rate": 2.4900000000000002e-05,
"loss": 3.5323,
"step": 1004
},
{
"epoch": 0.11219139517116514,
"grad_norm": 0.21783775091171265,
"learning_rate": 2.4875e-05,
"loss": 3.4612,
"step": 1005
},
{
"epoch": 0.11230302840019117,
"grad_norm": 0.20930244028568268,
"learning_rate": 2.485e-05,
"loss": 3.5483,
"step": 1006
},
{
"epoch": 0.1124146616292172,
"grad_norm": 0.22645394504070282,
"learning_rate": 2.4825e-05,
"loss": 3.5777,
"step": 1007
},
{
"epoch": 0.11252629485824324,
"grad_norm": 0.2336995005607605,
"learning_rate": 2.48e-05,
"loss": 3.484,
"step": 1008
},
{
"epoch": 0.11263792808726927,
"grad_norm": 0.23106829822063446,
"learning_rate": 2.4775000000000003e-05,
"loss": 3.6442,
"step": 1009
},
{
"epoch": 0.11274956131629531,
"grad_norm": 0.21401670575141907,
"learning_rate": 2.4750000000000002e-05,
"loss": 3.5101,
"step": 1010
},
{
"epoch": 0.11286119454532134,
"grad_norm": 0.21309833228588104,
"learning_rate": 2.4725e-05,
"loss": 3.5388,
"step": 1011
},
{
"epoch": 0.11297282777434738,
"grad_norm": 0.2092665135860443,
"learning_rate": 2.47e-05,
"loss": 3.5636,
"step": 1012
},
{
"epoch": 0.11308446100337341,
"grad_norm": 0.22628679871559143,
"learning_rate": 2.4675e-05,
"loss": 3.5913,
"step": 1013
},
{
"epoch": 0.11319609423239946,
"grad_norm": 0.22277000546455383,
"learning_rate": 2.465e-05,
"loss": 3.623,
"step": 1014
},
{
"epoch": 0.11330772746142549,
"grad_norm": 0.20142893493175507,
"learning_rate": 2.4625000000000002e-05,
"loss": 3.5986,
"step": 1015
},
{
"epoch": 0.11341936069045153,
"grad_norm": 0.21489818394184113,
"learning_rate": 2.46e-05,
"loss": 3.4893,
"step": 1016
},
{
"epoch": 0.11353099391947756,
"grad_norm": 0.22191816568374634,
"learning_rate": 2.4575e-05,
"loss": 3.5334,
"step": 1017
},
{
"epoch": 0.1136426271485036,
"grad_norm": 0.24484211206436157,
"learning_rate": 2.455e-05,
"loss": 3.6105,
"step": 1018
},
{
"epoch": 0.11375426037752963,
"grad_norm": 0.21178849041461945,
"learning_rate": 2.4525e-05,
"loss": 3.6572,
"step": 1019
},
{
"epoch": 0.11386589360655566,
"grad_norm": 0.20839335024356842,
"learning_rate": 2.45e-05,
"loss": 3.5095,
"step": 1020
},
{
"epoch": 0.1139775268355817,
"grad_norm": 0.21311017870903015,
"learning_rate": 2.4475000000000002e-05,
"loss": 3.5771,
"step": 1021
},
{
"epoch": 0.11408916006460773,
"grad_norm": 0.22781410813331604,
"learning_rate": 2.445e-05,
"loss": 3.6721,
"step": 1022
},
{
"epoch": 0.11420079329363376,
"grad_norm": 0.2103354036808014,
"learning_rate": 2.4425e-05,
"loss": 3.5926,
"step": 1023
},
{
"epoch": 0.1143124265226598,
"grad_norm": 0.21094770729541779,
"learning_rate": 2.44e-05,
"loss": 3.5744,
"step": 1024
},
{
"epoch": 0.11442405975168583,
"grad_norm": 0.22316160798072815,
"learning_rate": 2.4375e-05,
"loss": 3.6067,
"step": 1025
},
{
"epoch": 0.11453569298071187,
"grad_norm": 0.23794502019882202,
"learning_rate": 2.435e-05,
"loss": 3.5447,
"step": 1026
},
{
"epoch": 0.1146473262097379,
"grad_norm": 0.20586435496807098,
"learning_rate": 2.4325000000000002e-05,
"loss": 3.5765,
"step": 1027
},
{
"epoch": 0.11475895943876394,
"grad_norm": 0.22575588524341583,
"learning_rate": 2.43e-05,
"loss": 3.5735,
"step": 1028
},
{
"epoch": 0.11487059266778997,
"grad_norm": 0.2228063941001892,
"learning_rate": 2.4275e-05,
"loss": 3.6316,
"step": 1029
},
{
"epoch": 0.11498222589681602,
"grad_norm": 0.21498019993305206,
"learning_rate": 2.425e-05,
"loss": 3.6333,
"step": 1030
},
{
"epoch": 0.11509385912584205,
"grad_norm": 0.23310241103172302,
"learning_rate": 2.4225e-05,
"loss": 3.6192,
"step": 1031
},
{
"epoch": 0.11520549235486809,
"grad_norm": 0.2210279405117035,
"learning_rate": 2.4200000000000002e-05,
"loss": 3.6494,
"step": 1032
},
{
"epoch": 0.11531712558389412,
"grad_norm": 0.20151478052139282,
"learning_rate": 2.4175e-05,
"loss": 3.5992,
"step": 1033
},
{
"epoch": 0.11542875881292015,
"grad_norm": 0.20288068056106567,
"learning_rate": 2.415e-05,
"loss": 3.5792,
"step": 1034
},
{
"epoch": 0.11554039204194619,
"grad_norm": 0.20331978797912598,
"learning_rate": 2.4125e-05,
"loss": 3.5536,
"step": 1035
},
{
"epoch": 0.11565202527097222,
"grad_norm": 0.20420598983764648,
"learning_rate": 2.41e-05,
"loss": 3.6383,
"step": 1036
},
{
"epoch": 0.11576365849999826,
"grad_norm": 0.21970772743225098,
"learning_rate": 2.4075e-05,
"loss": 3.5477,
"step": 1037
},
{
"epoch": 0.11587529172902429,
"grad_norm": 0.2087366133928299,
"learning_rate": 2.4050000000000002e-05,
"loss": 3.5247,
"step": 1038
},
{
"epoch": 0.11598692495805032,
"grad_norm": 0.21893273293972015,
"learning_rate": 2.4025e-05,
"loss": 3.5824,
"step": 1039
},
{
"epoch": 0.11609855818707636,
"grad_norm": 0.20187819004058838,
"learning_rate": 2.4e-05,
"loss": 3.5173,
"step": 1040
},
{
"epoch": 0.11621019141610239,
"grad_norm": 0.2175942361354828,
"learning_rate": 2.3975e-05,
"loss": 3.6047,
"step": 1041
},
{
"epoch": 0.11632182464512843,
"grad_norm": 0.21350717544555664,
"learning_rate": 2.395e-05,
"loss": 3.5274,
"step": 1042
},
{
"epoch": 0.11643345787415446,
"grad_norm": 0.20837292075157166,
"learning_rate": 2.3925e-05,
"loss": 3.4366,
"step": 1043
},
{
"epoch": 0.1165450911031805,
"grad_norm": 0.20703855156898499,
"learning_rate": 2.39e-05,
"loss": 3.545,
"step": 1044
},
{
"epoch": 0.11665672433220653,
"grad_norm": 0.21580536663532257,
"learning_rate": 2.3875e-05,
"loss": 3.4814,
"step": 1045
},
{
"epoch": 0.11676835756123256,
"grad_norm": 0.21015796065330505,
"learning_rate": 2.385e-05,
"loss": 3.5995,
"step": 1046
},
{
"epoch": 0.11687999079025861,
"grad_norm": 0.21610529720783234,
"learning_rate": 2.3825e-05,
"loss": 3.5723,
"step": 1047
},
{
"epoch": 0.11699162401928465,
"grad_norm": 0.20380514860153198,
"learning_rate": 2.38e-05,
"loss": 3.6053,
"step": 1048
},
{
"epoch": 0.11710325724831068,
"grad_norm": 0.23667509853839874,
"learning_rate": 2.3775e-05,
"loss": 3.4631,
"step": 1049
},
{
"epoch": 0.11721489047733671,
"grad_norm": 0.20652474462985992,
"learning_rate": 2.375e-05,
"loss": 3.4647,
"step": 1050
},
{
"epoch": 0.11732652370636275,
"grad_norm": 0.2219213992357254,
"learning_rate": 2.3725e-05,
"loss": 3.6314,
"step": 1051
},
{
"epoch": 0.11743815693538878,
"grad_norm": 0.21442031860351562,
"learning_rate": 2.37e-05,
"loss": 3.5025,
"step": 1052
},
{
"epoch": 0.11754979016441482,
"grad_norm": 0.2127211093902588,
"learning_rate": 2.3675e-05,
"loss": 3.5665,
"step": 1053
},
{
"epoch": 0.11766142339344085,
"grad_norm": 0.2181536704301834,
"learning_rate": 2.365e-05,
"loss": 3.5881,
"step": 1054
},
{
"epoch": 0.11777305662246688,
"grad_norm": 0.20248019695281982,
"learning_rate": 2.3624999999999998e-05,
"loss": 3.5307,
"step": 1055
},
{
"epoch": 0.11788468985149292,
"grad_norm": 0.20492227375507355,
"learning_rate": 2.36e-05,
"loss": 3.567,
"step": 1056
},
{
"epoch": 0.11799632308051895,
"grad_norm": 0.21641506254673004,
"learning_rate": 2.3575e-05,
"loss": 3.5616,
"step": 1057
},
{
"epoch": 0.11810795630954499,
"grad_norm": 0.22132819890975952,
"learning_rate": 2.355e-05,
"loss": 3.5973,
"step": 1058
},
{
"epoch": 0.11821958953857102,
"grad_norm": 0.2025281935930252,
"learning_rate": 2.3525e-05,
"loss": 3.6129,
"step": 1059
},
{
"epoch": 0.11833122276759706,
"grad_norm": 0.22255505621433258,
"learning_rate": 2.35e-05,
"loss": 3.5781,
"step": 1060
},
{
"epoch": 0.11844285599662309,
"grad_norm": 0.2115355283021927,
"learning_rate": 2.3475e-05,
"loss": 3.6349,
"step": 1061
},
{
"epoch": 0.11855448922564912,
"grad_norm": 0.19691035151481628,
"learning_rate": 2.345e-05,
"loss": 3.5141,
"step": 1062
},
{
"epoch": 0.11866612245467517,
"grad_norm": 0.21706706285476685,
"learning_rate": 2.3425000000000004e-05,
"loss": 3.6435,
"step": 1063
},
{
"epoch": 0.1187777556837012,
"grad_norm": 0.22850151360034943,
"learning_rate": 2.3400000000000003e-05,
"loss": 3.518,
"step": 1064
},
{
"epoch": 0.11888938891272724,
"grad_norm": 0.20553866028785706,
"learning_rate": 2.3375000000000002e-05,
"loss": 3.4698,
"step": 1065
},
{
"epoch": 0.11900102214175327,
"grad_norm": 0.20245562493801117,
"learning_rate": 2.3350000000000002e-05,
"loss": 3.5196,
"step": 1066
},
{
"epoch": 0.11911265537077931,
"grad_norm": 0.21691791713237762,
"learning_rate": 2.3325e-05,
"loss": 3.596,
"step": 1067
},
{
"epoch": 0.11922428859980534,
"grad_norm": 0.22258713841438293,
"learning_rate": 2.3300000000000004e-05,
"loss": 3.5375,
"step": 1068
},
{
"epoch": 0.11933592182883138,
"grad_norm": 0.21868087351322174,
"learning_rate": 2.3275000000000003e-05,
"loss": 3.602,
"step": 1069
},
{
"epoch": 0.11944755505785741,
"grad_norm": 0.22205567359924316,
"learning_rate": 2.3250000000000003e-05,
"loss": 3.5399,
"step": 1070
},
{
"epoch": 0.11955918828688344,
"grad_norm": 0.20832020044326782,
"learning_rate": 2.3225000000000002e-05,
"loss": 3.5286,
"step": 1071
},
{
"epoch": 0.11967082151590948,
"grad_norm": 0.2094217836856842,
"learning_rate": 2.32e-05,
"loss": 3.6278,
"step": 1072
},
{
"epoch": 0.11978245474493551,
"grad_norm": 0.21546511352062225,
"learning_rate": 2.3175e-05,
"loss": 3.4909,
"step": 1073
},
{
"epoch": 0.11989408797396155,
"grad_norm": 0.1934429258108139,
"learning_rate": 2.3150000000000004e-05,
"loss": 3.471,
"step": 1074
},
{
"epoch": 0.12000572120298758,
"grad_norm": 0.2332877367734909,
"learning_rate": 2.3125000000000003e-05,
"loss": 3.5833,
"step": 1075
},
{
"epoch": 0.12011735443201361,
"grad_norm": 0.2151862233877182,
"learning_rate": 2.3100000000000002e-05,
"loss": 3.5801,
"step": 1076
},
{
"epoch": 0.12022898766103965,
"grad_norm": 0.21085871756076813,
"learning_rate": 2.3075000000000002e-05,
"loss": 3.5103,
"step": 1077
},
{
"epoch": 0.12034062089006568,
"grad_norm": 0.20378586649894714,
"learning_rate": 2.305e-05,
"loss": 3.6359,
"step": 1078
},
{
"epoch": 0.12045225411909172,
"grad_norm": 0.20275592803955078,
"learning_rate": 2.3025e-05,
"loss": 3.5791,
"step": 1079
},
{
"epoch": 0.12056388734811777,
"grad_norm": 0.23284991085529327,
"learning_rate": 2.3000000000000003e-05,
"loss": 3.4422,
"step": 1080
},
{
"epoch": 0.1206755205771438,
"grad_norm": 0.20677290856838226,
"learning_rate": 2.2975000000000003e-05,
"loss": 3.6419,
"step": 1081
},
{
"epoch": 0.12078715380616983,
"grad_norm": 0.21440984308719635,
"learning_rate": 2.2950000000000002e-05,
"loss": 3.5431,
"step": 1082
},
{
"epoch": 0.12089878703519587,
"grad_norm": 0.22608491778373718,
"learning_rate": 2.2925e-05,
"loss": 3.5521,
"step": 1083
},
{
"epoch": 0.1210104202642219,
"grad_norm": 0.2001263052225113,
"learning_rate": 2.29e-05,
"loss": 3.4811,
"step": 1084
},
{
"epoch": 0.12112205349324794,
"grad_norm": 0.21206040680408478,
"learning_rate": 2.2875e-05,
"loss": 3.6102,
"step": 1085
},
{
"epoch": 0.12123368672227397,
"grad_norm": 0.2126099318265915,
"learning_rate": 2.2850000000000003e-05,
"loss": 3.5218,
"step": 1086
},
{
"epoch": 0.1213453199513,
"grad_norm": 0.22794200479984283,
"learning_rate": 2.2825000000000003e-05,
"loss": 3.6669,
"step": 1087
},
{
"epoch": 0.12145695318032604,
"grad_norm": 0.21312151849269867,
"learning_rate": 2.2800000000000002e-05,
"loss": 3.5217,
"step": 1088
},
{
"epoch": 0.12156858640935207,
"grad_norm": 0.21940995752811432,
"learning_rate": 2.2775e-05,
"loss": 3.5297,
"step": 1089
},
{
"epoch": 0.1216802196383781,
"grad_norm": 0.22202187776565552,
"learning_rate": 2.275e-05,
"loss": 3.6379,
"step": 1090
},
{
"epoch": 0.12179185286740414,
"grad_norm": 0.2047797590494156,
"learning_rate": 2.2725000000000003e-05,
"loss": 3.584,
"step": 1091
},
{
"epoch": 0.12190348609643017,
"grad_norm": 0.19574470818042755,
"learning_rate": 2.2700000000000003e-05,
"loss": 3.5189,
"step": 1092
},
{
"epoch": 0.12201511932545621,
"grad_norm": 0.21649527549743652,
"learning_rate": 2.2675000000000002e-05,
"loss": 3.5355,
"step": 1093
},
{
"epoch": 0.12212675255448224,
"grad_norm": 0.2137257158756256,
"learning_rate": 2.265e-05,
"loss": 3.5381,
"step": 1094
},
{
"epoch": 0.12223838578350828,
"grad_norm": 0.21022838354110718,
"learning_rate": 2.2625e-05,
"loss": 3.4563,
"step": 1095
},
{
"epoch": 0.12235001901253433,
"grad_norm": 0.24833272397518158,
"learning_rate": 2.26e-05,
"loss": 3.7083,
"step": 1096
},
{
"epoch": 0.12246165224156036,
"grad_norm": 0.22184813022613525,
"learning_rate": 2.2575000000000003e-05,
"loss": 3.5903,
"step": 1097
},
{
"epoch": 0.1225732854705864,
"grad_norm": 0.21476183831691742,
"learning_rate": 2.2550000000000003e-05,
"loss": 3.4829,
"step": 1098
},
{
"epoch": 0.12268491869961243,
"grad_norm": 0.21774616837501526,
"learning_rate": 2.2525000000000002e-05,
"loss": 3.5285,
"step": 1099
},
{
"epoch": 0.12279655192863846,
"grad_norm": 0.20590829849243164,
"learning_rate": 2.25e-05,
"loss": 3.5381,
"step": 1100
},
{
"epoch": 0.1229081851576645,
"grad_norm": 0.20462672412395477,
"learning_rate": 2.2475e-05,
"loss": 3.5634,
"step": 1101
},
{
"epoch": 0.12301981838669053,
"grad_norm": 0.22843170166015625,
"learning_rate": 2.245e-05,
"loss": 3.6618,
"step": 1102
},
{
"epoch": 0.12313145161571656,
"grad_norm": 0.20621445775032043,
"learning_rate": 2.2425000000000003e-05,
"loss": 3.6289,
"step": 1103
},
{
"epoch": 0.1232430848447426,
"grad_norm": 0.21088461577892303,
"learning_rate": 2.2400000000000002e-05,
"loss": 3.5852,
"step": 1104
},
{
"epoch": 0.12335471807376863,
"grad_norm": 0.24154846370220184,
"learning_rate": 2.2375000000000002e-05,
"loss": 3.602,
"step": 1105
},
{
"epoch": 0.12346635130279467,
"grad_norm": 0.2082412987947464,
"learning_rate": 2.235e-05,
"loss": 3.5066,
"step": 1106
},
{
"epoch": 0.1235779845318207,
"grad_norm": 0.22379964590072632,
"learning_rate": 2.2325e-05,
"loss": 3.56,
"step": 1107
},
{
"epoch": 0.12368961776084673,
"grad_norm": 0.2274475395679474,
"learning_rate": 2.23e-05,
"loss": 3.5029,
"step": 1108
},
{
"epoch": 0.12380125098987277,
"grad_norm": 0.20525825023651123,
"learning_rate": 2.2275000000000003e-05,
"loss": 3.5858,
"step": 1109
},
{
"epoch": 0.1239128842188988,
"grad_norm": 0.20631299912929535,
"learning_rate": 2.2250000000000002e-05,
"loss": 3.5092,
"step": 1110
},
{
"epoch": 0.12402451744792484,
"grad_norm": 0.21575914323329926,
"learning_rate": 2.2225e-05,
"loss": 3.4236,
"step": 1111
},
{
"epoch": 0.12413615067695087,
"grad_norm": 0.20700016617774963,
"learning_rate": 2.22e-05,
"loss": 3.5913,
"step": 1112
},
{
"epoch": 0.12424778390597692,
"grad_norm": 0.21574947237968445,
"learning_rate": 2.2175e-05,
"loss": 3.5339,
"step": 1113
},
{
"epoch": 0.12435941713500295,
"grad_norm": 0.23032042384147644,
"learning_rate": 2.215e-05,
"loss": 3.4201,
"step": 1114
},
{
"epoch": 0.12447105036402899,
"grad_norm": 0.21576251089572906,
"learning_rate": 2.2125000000000002e-05,
"loss": 3.4075,
"step": 1115
},
{
"epoch": 0.12458268359305502,
"grad_norm": 0.2106648087501526,
"learning_rate": 2.2100000000000002e-05,
"loss": 3.5918,
"step": 1116
},
{
"epoch": 0.12469431682208106,
"grad_norm": 0.2069109082221985,
"learning_rate": 2.2075e-05,
"loss": 3.4508,
"step": 1117
},
{
"epoch": 0.12480595005110709,
"grad_norm": 0.22045724093914032,
"learning_rate": 2.205e-05,
"loss": 3.621,
"step": 1118
},
{
"epoch": 0.12491758328013312,
"grad_norm": 0.20719186961650848,
"learning_rate": 2.2025e-05,
"loss": 3.4853,
"step": 1119
},
{
"epoch": 0.12502921650915916,
"grad_norm": 0.21642549335956573,
"learning_rate": 2.2000000000000003e-05,
"loss": 3.5141,
"step": 1120
},
{
"epoch": 0.1251408497381852,
"grad_norm": 0.2098337560892105,
"learning_rate": 2.1975000000000002e-05,
"loss": 3.4767,
"step": 1121
},
{
"epoch": 0.12525248296721123,
"grad_norm": 0.22343602776527405,
"learning_rate": 2.195e-05,
"loss": 3.5406,
"step": 1122
},
{
"epoch": 0.12536411619623727,
"grad_norm": 0.2159809023141861,
"learning_rate": 2.1925e-05,
"loss": 3.5063,
"step": 1123
},
{
"epoch": 0.1254757494252633,
"grad_norm": 0.2167646586894989,
"learning_rate": 2.19e-05,
"loss": 3.5774,
"step": 1124
},
{
"epoch": 0.12558738265428934,
"grad_norm": 0.21842193603515625,
"learning_rate": 2.1875e-05,
"loss": 3.6195,
"step": 1125
},
{
"epoch": 0.12569901588331536,
"grad_norm": 0.2263418734073639,
"learning_rate": 2.1850000000000003e-05,
"loss": 3.4394,
"step": 1126
},
{
"epoch": 0.1258106491123414,
"grad_norm": 0.22126170992851257,
"learning_rate": 2.1825000000000002e-05,
"loss": 3.63,
"step": 1127
},
{
"epoch": 0.12592228234136743,
"grad_norm": 0.21873587369918823,
"learning_rate": 2.18e-05,
"loss": 3.4907,
"step": 1128
},
{
"epoch": 0.12603391557039348,
"grad_norm": 0.20537471771240234,
"learning_rate": 2.1775e-05,
"loss": 3.5101,
"step": 1129
},
{
"epoch": 0.1261455487994195,
"grad_norm": 0.21918705105781555,
"learning_rate": 2.175e-05,
"loss": 3.5732,
"step": 1130
},
{
"epoch": 0.12625718202844555,
"grad_norm": 0.19748879969120026,
"learning_rate": 2.1725e-05,
"loss": 3.5652,
"step": 1131
},
{
"epoch": 0.12636881525747157,
"grad_norm": 0.20400671660900116,
"learning_rate": 2.1700000000000002e-05,
"loss": 3.5722,
"step": 1132
},
{
"epoch": 0.12648044848649762,
"grad_norm": 0.2150668501853943,
"learning_rate": 2.1675e-05,
"loss": 3.5738,
"step": 1133
},
{
"epoch": 0.12659208171552364,
"grad_norm": 0.22378993034362793,
"learning_rate": 2.165e-05,
"loss": 3.51,
"step": 1134
},
{
"epoch": 0.12670371494454968,
"grad_norm": 0.20679309964179993,
"learning_rate": 2.1625e-05,
"loss": 3.5392,
"step": 1135
},
{
"epoch": 0.1268153481735757,
"grad_norm": 0.21865659952163696,
"learning_rate": 2.16e-05,
"loss": 3.414,
"step": 1136
},
{
"epoch": 0.12692698140260175,
"grad_norm": 0.23066933453083038,
"learning_rate": 2.1575e-05,
"loss": 3.5871,
"step": 1137
},
{
"epoch": 0.1270386146316278,
"grad_norm": 0.21353909373283386,
"learning_rate": 2.1550000000000002e-05,
"loss": 3.6255,
"step": 1138
},
{
"epoch": 0.12715024786065382,
"grad_norm": 0.21113964915275574,
"learning_rate": 2.1525e-05,
"loss": 3.5693,
"step": 1139
},
{
"epoch": 0.12726188108967987,
"grad_norm": 0.2107081115245819,
"learning_rate": 2.15e-05,
"loss": 3.5284,
"step": 1140
},
{
"epoch": 0.1273735143187059,
"grad_norm": 0.21763132512569427,
"learning_rate": 2.1475e-05,
"loss": 3.5754,
"step": 1141
},
{
"epoch": 0.12748514754773194,
"grad_norm": 0.21894939243793488,
"learning_rate": 2.145e-05,
"loss": 3.5288,
"step": 1142
},
{
"epoch": 0.12759678077675796,
"grad_norm": 0.21666420996189117,
"learning_rate": 2.1425e-05,
"loss": 3.5725,
"step": 1143
},
{
"epoch": 0.127708414005784,
"grad_norm": 0.2001209706068039,
"learning_rate": 2.1400000000000002e-05,
"loss": 3.484,
"step": 1144
},
{
"epoch": 0.12782004723481002,
"grad_norm": 0.21310877799987793,
"learning_rate": 2.1375e-05,
"loss": 3.5372,
"step": 1145
},
{
"epoch": 0.12793168046383607,
"grad_norm": 0.2128131240606308,
"learning_rate": 2.135e-05,
"loss": 3.5728,
"step": 1146
},
{
"epoch": 0.1280433136928621,
"grad_norm": 0.20548781752586365,
"learning_rate": 2.1325e-05,
"loss": 3.562,
"step": 1147
},
{
"epoch": 0.12815494692188814,
"grad_norm": 0.22280630469322205,
"learning_rate": 2.13e-05,
"loss": 3.5497,
"step": 1148
},
{
"epoch": 0.12826658015091416,
"grad_norm": 0.20951642096042633,
"learning_rate": 2.1275000000000002e-05,
"loss": 3.532,
"step": 1149
},
{
"epoch": 0.1283782133799402,
"grad_norm": 0.2074415534734726,
"learning_rate": 2.125e-05,
"loss": 3.5628,
"step": 1150
},
{
"epoch": 0.12848984660896623,
"grad_norm": 0.2202114462852478,
"learning_rate": 2.1225e-05,
"loss": 3.5687,
"step": 1151
},
{
"epoch": 0.12860147983799228,
"grad_norm": 0.21095822751522064,
"learning_rate": 2.12e-05,
"loss": 3.5973,
"step": 1152
},
{
"epoch": 0.1287131130670183,
"grad_norm": 0.22076840698719025,
"learning_rate": 2.1175e-05,
"loss": 3.5953,
"step": 1153
},
{
"epoch": 0.12882474629604435,
"grad_norm": 0.2007976472377777,
"learning_rate": 2.115e-05,
"loss": 3.5082,
"step": 1154
},
{
"epoch": 0.1289363795250704,
"grad_norm": 0.2047666758298874,
"learning_rate": 2.1125000000000002e-05,
"loss": 3.5165,
"step": 1155
},
{
"epoch": 0.12904801275409641,
"grad_norm": 0.21572445333003998,
"learning_rate": 2.11e-05,
"loss": 3.5957,
"step": 1156
},
{
"epoch": 0.12915964598312246,
"grad_norm": 0.2182898223400116,
"learning_rate": 2.1075e-05,
"loss": 3.6007,
"step": 1157
},
{
"epoch": 0.12927127921214848,
"grad_norm": 0.21526333689689636,
"learning_rate": 2.105e-05,
"loss": 3.5694,
"step": 1158
},
{
"epoch": 0.12938291244117453,
"grad_norm": 0.20486585795879364,
"learning_rate": 2.1025e-05,
"loss": 3.5158,
"step": 1159
},
{
"epoch": 0.12949454567020055,
"grad_norm": 0.20395594835281372,
"learning_rate": 2.1e-05,
"loss": 3.4897,
"step": 1160
},
{
"epoch": 0.1296061788992266,
"grad_norm": 0.20245909690856934,
"learning_rate": 2.0975e-05,
"loss": 3.416,
"step": 1161
},
{
"epoch": 0.12971781212825262,
"grad_norm": 0.20281971991062164,
"learning_rate": 2.095e-05,
"loss": 3.6577,
"step": 1162
},
{
"epoch": 0.12982944535727867,
"grad_norm": 0.20469930768013,
"learning_rate": 2.0925e-05,
"loss": 3.5925,
"step": 1163
},
{
"epoch": 0.1299410785863047,
"grad_norm": 0.21145623922348022,
"learning_rate": 2.09e-05,
"loss": 3.6639,
"step": 1164
},
{
"epoch": 0.13005271181533073,
"grad_norm": 0.2256203591823578,
"learning_rate": 2.0875e-05,
"loss": 3.6121,
"step": 1165
},
{
"epoch": 0.13016434504435676,
"grad_norm": 0.21971777081489563,
"learning_rate": 2.085e-05,
"loss": 3.6576,
"step": 1166
},
{
"epoch": 0.1302759782733828,
"grad_norm": 0.21563662588596344,
"learning_rate": 2.0825e-05,
"loss": 3.6168,
"step": 1167
},
{
"epoch": 0.13038761150240882,
"grad_norm": 0.2200465053319931,
"learning_rate": 2.08e-05,
"loss": 3.5932,
"step": 1168
},
{
"epoch": 0.13049924473143487,
"grad_norm": 0.22271549701690674,
"learning_rate": 2.0775e-05,
"loss": 3.6098,
"step": 1169
},
{
"epoch": 0.13061087796046092,
"grad_norm": 0.19784262776374817,
"learning_rate": 2.075e-05,
"loss": 3.6427,
"step": 1170
},
{
"epoch": 0.13072251118948694,
"grad_norm": 0.2114451676607132,
"learning_rate": 2.0725e-05,
"loss": 3.5497,
"step": 1171
},
{
"epoch": 0.130834144418513,
"grad_norm": 0.2250889390707016,
"learning_rate": 2.07e-05,
"loss": 3.4859,
"step": 1172
},
{
"epoch": 0.130945777647539,
"grad_norm": 0.20784199237823486,
"learning_rate": 2.0675e-05,
"loss": 3.5053,
"step": 1173
},
{
"epoch": 0.13105741087656506,
"grad_norm": 0.20545898377895355,
"learning_rate": 2.065e-05,
"loss": 3.5911,
"step": 1174
},
{
"epoch": 0.13116904410559108,
"grad_norm": 0.20443479716777802,
"learning_rate": 2.0625e-05,
"loss": 3.5173,
"step": 1175
},
{
"epoch": 0.13128067733461712,
"grad_norm": 0.2182706594467163,
"learning_rate": 2.06e-05,
"loss": 3.5701,
"step": 1176
},
{
"epoch": 0.13139231056364314,
"grad_norm": 0.20864829421043396,
"learning_rate": 2.0575e-05,
"loss": 3.4998,
"step": 1177
},
{
"epoch": 0.1315039437926692,
"grad_norm": 0.2056749314069748,
"learning_rate": 2.055e-05,
"loss": 3.5519,
"step": 1178
},
{
"epoch": 0.1316155770216952,
"grad_norm": 0.2386610209941864,
"learning_rate": 2.0525e-05,
"loss": 3.5182,
"step": 1179
},
{
"epoch": 0.13172721025072126,
"grad_norm": 0.2098884880542755,
"learning_rate": 2.05e-05,
"loss": 3.6639,
"step": 1180
},
{
"epoch": 0.13183884347974728,
"grad_norm": 0.20269910991191864,
"learning_rate": 2.0475e-05,
"loss": 3.5699,
"step": 1181
},
{
"epoch": 0.13195047670877333,
"grad_norm": 0.2196349799633026,
"learning_rate": 2.045e-05,
"loss": 3.4996,
"step": 1182
},
{
"epoch": 0.13206210993779935,
"grad_norm": 0.20694006979465485,
"learning_rate": 2.0425e-05,
"loss": 3.4347,
"step": 1183
},
{
"epoch": 0.1321737431668254,
"grad_norm": 0.21867570281028748,
"learning_rate": 2.04e-05,
"loss": 3.5738,
"step": 1184
},
{
"epoch": 0.13228537639585142,
"grad_norm": 0.20155374705791473,
"learning_rate": 2.0375e-05,
"loss": 3.4647,
"step": 1185
},
{
"epoch": 0.13239700962487747,
"grad_norm": 0.22435399889945984,
"learning_rate": 2.035e-05,
"loss": 3.5772,
"step": 1186
},
{
"epoch": 0.1325086428539035,
"grad_norm": 0.25098690390586853,
"learning_rate": 2.0325e-05,
"loss": 3.6616,
"step": 1187
},
{
"epoch": 0.13262027608292953,
"grad_norm": 0.2307552546262741,
"learning_rate": 2.0300000000000002e-05,
"loss": 3.5784,
"step": 1188
},
{
"epoch": 0.13273190931195558,
"grad_norm": 0.25000953674316406,
"learning_rate": 2.0275e-05,
"loss": 3.5027,
"step": 1189
},
{
"epoch": 0.1328435425409816,
"grad_norm": 0.19841337203979492,
"learning_rate": 2.025e-05,
"loss": 3.4868,
"step": 1190
},
{
"epoch": 0.13295517577000765,
"grad_norm": 0.22916224598884583,
"learning_rate": 2.0225000000000004e-05,
"loss": 3.5208,
"step": 1191
},
{
"epoch": 0.13306680899903367,
"grad_norm": 0.19958780705928802,
"learning_rate": 2.0200000000000003e-05,
"loss": 3.4185,
"step": 1192
},
{
"epoch": 0.13317844222805972,
"grad_norm": 0.22007910907268524,
"learning_rate": 2.0175000000000003e-05,
"loss": 3.4641,
"step": 1193
},
{
"epoch": 0.13329007545708574,
"grad_norm": 0.21056948602199554,
"learning_rate": 2.0150000000000002e-05,
"loss": 3.5459,
"step": 1194
},
{
"epoch": 0.1334017086861118,
"grad_norm": 0.23522870242595673,
"learning_rate": 2.0125e-05,
"loss": 3.594,
"step": 1195
},
{
"epoch": 0.1335133419151378,
"grad_norm": 0.23401057720184326,
"learning_rate": 2.01e-05,
"loss": 3.4671,
"step": 1196
},
{
"epoch": 0.13362497514416385,
"grad_norm": 0.218308225274086,
"learning_rate": 2.0075000000000003e-05,
"loss": 3.5739,
"step": 1197
},
{
"epoch": 0.13373660837318987,
"grad_norm": 0.2071041613817215,
"learning_rate": 2.0050000000000003e-05,
"loss": 3.5082,
"step": 1198
},
{
"epoch": 0.13384824160221592,
"grad_norm": 0.21211779117584229,
"learning_rate": 2.0025000000000002e-05,
"loss": 3.5992,
"step": 1199
},
{
"epoch": 0.13395987483124194,
"grad_norm": 0.20415015518665314,
"learning_rate": 2e-05,
"loss": 3.4991,
"step": 1200
},
{
"epoch": 0.134071508060268,
"grad_norm": 0.23036159574985504,
"learning_rate": 1.9975e-05,
"loss": 3.6017,
"step": 1201
},
{
"epoch": 0.134183141289294,
"grad_norm": 0.2087535858154297,
"learning_rate": 1.995e-05,
"loss": 3.4676,
"step": 1202
},
{
"epoch": 0.13429477451832006,
"grad_norm": 0.2105867713689804,
"learning_rate": 1.9925000000000003e-05,
"loss": 3.5076,
"step": 1203
},
{
"epoch": 0.1344064077473461,
"grad_norm": 0.20705103874206543,
"learning_rate": 1.9900000000000003e-05,
"loss": 3.501,
"step": 1204
},
{
"epoch": 0.13451804097637213,
"grad_norm": 0.21231180429458618,
"learning_rate": 1.9875000000000002e-05,
"loss": 3.6563,
"step": 1205
},
{
"epoch": 0.13462967420539818,
"grad_norm": 0.21027925610542297,
"learning_rate": 1.985e-05,
"loss": 3.4541,
"step": 1206
},
{
"epoch": 0.1347413074344242,
"grad_norm": 0.2149907499551773,
"learning_rate": 1.9825e-05,
"loss": 3.5745,
"step": 1207
},
{
"epoch": 0.13485294066345024,
"grad_norm": 0.20292307436466217,
"learning_rate": 1.9800000000000004e-05,
"loss": 3.5716,
"step": 1208
},
{
"epoch": 0.13496457389247626,
"grad_norm": 0.21953509747982025,
"learning_rate": 1.9775000000000003e-05,
"loss": 3.52,
"step": 1209
},
{
"epoch": 0.1350762071215023,
"grad_norm": 0.20980951189994812,
"learning_rate": 1.9750000000000002e-05,
"loss": 3.5848,
"step": 1210
},
{
"epoch": 0.13518784035052833,
"grad_norm": 0.200588658452034,
"learning_rate": 1.9725000000000002e-05,
"loss": 3.5595,
"step": 1211
},
{
"epoch": 0.13529947357955438,
"grad_norm": 0.23342964053153992,
"learning_rate": 1.97e-05,
"loss": 3.5737,
"step": 1212
},
{
"epoch": 0.1354111068085804,
"grad_norm": 0.21018610894680023,
"learning_rate": 1.9675e-05,
"loss": 3.5323,
"step": 1213
},
{
"epoch": 0.13552274003760645,
"grad_norm": 0.2235204577445984,
"learning_rate": 1.9650000000000003e-05,
"loss": 3.6266,
"step": 1214
},
{
"epoch": 0.13563437326663247,
"grad_norm": 0.20950132608413696,
"learning_rate": 1.9625000000000003e-05,
"loss": 3.588,
"step": 1215
},
{
"epoch": 0.13574600649565852,
"grad_norm": 0.21354885399341583,
"learning_rate": 1.9600000000000002e-05,
"loss": 3.5136,
"step": 1216
},
{
"epoch": 0.13585763972468454,
"grad_norm": 0.22791220247745514,
"learning_rate": 1.9575e-05,
"loss": 3.5571,
"step": 1217
},
{
"epoch": 0.13596927295371058,
"grad_norm": 0.2134818136692047,
"learning_rate": 1.955e-05,
"loss": 3.5628,
"step": 1218
},
{
"epoch": 0.1360809061827366,
"grad_norm": 0.22793889045715332,
"learning_rate": 1.9525e-05,
"loss": 3.6111,
"step": 1219
},
{
"epoch": 0.13619253941176265,
"grad_norm": 0.2034095823764801,
"learning_rate": 1.9500000000000003e-05,
"loss": 3.5964,
"step": 1220
},
{
"epoch": 0.1363041726407887,
"grad_norm": 0.22055798768997192,
"learning_rate": 1.9475000000000002e-05,
"loss": 3.5659,
"step": 1221
},
{
"epoch": 0.13641580586981472,
"grad_norm": 0.20950908958911896,
"learning_rate": 1.9450000000000002e-05,
"loss": 3.5998,
"step": 1222
},
{
"epoch": 0.13652743909884077,
"grad_norm": 0.2164158672094345,
"learning_rate": 1.9425e-05,
"loss": 3.5998,
"step": 1223
},
{
"epoch": 0.1366390723278668,
"grad_norm": 0.208068385720253,
"learning_rate": 1.94e-05,
"loss": 3.5735,
"step": 1224
},
{
"epoch": 0.13675070555689284,
"grad_norm": 0.21069864928722382,
"learning_rate": 1.9375e-05,
"loss": 3.5624,
"step": 1225
},
{
"epoch": 0.13686233878591886,
"grad_norm": 0.20937590301036835,
"learning_rate": 1.9350000000000003e-05,
"loss": 3.5785,
"step": 1226
},
{
"epoch": 0.1369739720149449,
"grad_norm": 0.21422581374645233,
"learning_rate": 1.9325000000000002e-05,
"loss": 3.5665,
"step": 1227
},
{
"epoch": 0.13708560524397093,
"grad_norm": 0.20492665469646454,
"learning_rate": 1.93e-05,
"loss": 3.5021,
"step": 1228
},
{
"epoch": 0.13719723847299697,
"grad_norm": 0.21013765037059784,
"learning_rate": 1.9275e-05,
"loss": 3.4796,
"step": 1229
},
{
"epoch": 0.137308871702023,
"grad_norm": 0.21842752397060394,
"learning_rate": 1.925e-05,
"loss": 3.6837,
"step": 1230
},
{
"epoch": 0.13742050493104904,
"grad_norm": 0.20804911851882935,
"learning_rate": 1.9225e-05,
"loss": 3.585,
"step": 1231
},
{
"epoch": 0.13753213816007506,
"grad_norm": 0.2158387452363968,
"learning_rate": 1.9200000000000003e-05,
"loss": 3.4812,
"step": 1232
},
{
"epoch": 0.1376437713891011,
"grad_norm": 0.20360736548900604,
"learning_rate": 1.9175000000000002e-05,
"loss": 3.5355,
"step": 1233
},
{
"epoch": 0.13775540461812713,
"grad_norm": 0.2207733541727066,
"learning_rate": 1.915e-05,
"loss": 3.6129,
"step": 1234
},
{
"epoch": 0.13786703784715318,
"grad_norm": 0.22414511442184448,
"learning_rate": 1.9125e-05,
"loss": 3.5076,
"step": 1235
},
{
"epoch": 0.13797867107617923,
"grad_norm": 0.21356096863746643,
"learning_rate": 1.91e-05,
"loss": 3.5397,
"step": 1236
},
{
"epoch": 0.13809030430520525,
"grad_norm": 0.2204858362674713,
"learning_rate": 1.9075000000000003e-05,
"loss": 3.5354,
"step": 1237
},
{
"epoch": 0.1382019375342313,
"grad_norm": 0.21018990874290466,
"learning_rate": 1.9050000000000002e-05,
"loss": 3.6482,
"step": 1238
},
{
"epoch": 0.13831357076325732,
"grad_norm": 0.2107504904270172,
"learning_rate": 1.9025e-05,
"loss": 3.5916,
"step": 1239
},
{
"epoch": 0.13842520399228336,
"grad_norm": 0.21185946464538574,
"learning_rate": 1.9e-05,
"loss": 3.4799,
"step": 1240
},
{
"epoch": 0.13853683722130938,
"grad_norm": 0.2139020413160324,
"learning_rate": 1.8975e-05,
"loss": 3.5557,
"step": 1241
},
{
"epoch": 0.13864847045033543,
"grad_norm": 0.21282795071601868,
"learning_rate": 1.895e-05,
"loss": 3.4584,
"step": 1242
},
{
"epoch": 0.13876010367936145,
"grad_norm": 0.21034365892410278,
"learning_rate": 1.8925000000000003e-05,
"loss": 3.4701,
"step": 1243
},
{
"epoch": 0.1388717369083875,
"grad_norm": 0.21892791986465454,
"learning_rate": 1.8900000000000002e-05,
"loss": 3.499,
"step": 1244
},
{
"epoch": 0.13898337013741352,
"grad_norm": 0.22241051495075226,
"learning_rate": 1.8875e-05,
"loss": 3.5793,
"step": 1245
},
{
"epoch": 0.13909500336643957,
"grad_norm": 0.2173536866903305,
"learning_rate": 1.885e-05,
"loss": 3.545,
"step": 1246
},
{
"epoch": 0.1392066365954656,
"grad_norm": 0.20194391906261444,
"learning_rate": 1.8825e-05,
"loss": 3.5761,
"step": 1247
},
{
"epoch": 0.13931826982449164,
"grad_norm": 0.2071259468793869,
"learning_rate": 1.88e-05,
"loss": 3.4426,
"step": 1248
},
{
"epoch": 0.13942990305351766,
"grad_norm": 0.2104252129793167,
"learning_rate": 1.8775000000000002e-05,
"loss": 3.5148,
"step": 1249
},
{
"epoch": 0.1395415362825437,
"grad_norm": 0.25029781460762024,
"learning_rate": 1.8750000000000002e-05,
"loss": 3.4991,
"step": 1250
},
{
"epoch": 0.13965316951156972,
"grad_norm": 0.21814922988414764,
"learning_rate": 1.8725e-05,
"loss": 3.5417,
"step": 1251
},
{
"epoch": 0.13976480274059577,
"grad_norm": 0.21379521489143372,
"learning_rate": 1.87e-05,
"loss": 3.5883,
"step": 1252
},
{
"epoch": 0.13987643596962182,
"grad_norm": 0.21376143395900726,
"learning_rate": 1.8675e-05,
"loss": 3.541,
"step": 1253
},
{
"epoch": 0.13998806919864784,
"grad_norm": 0.2076859325170517,
"learning_rate": 1.865e-05,
"loss": 3.5329,
"step": 1254
},
{
"epoch": 0.1400997024276739,
"grad_norm": 0.20759625732898712,
"learning_rate": 1.8625000000000002e-05,
"loss": 3.5954,
"step": 1255
},
{
"epoch": 0.1402113356566999,
"grad_norm": 0.19671057164669037,
"learning_rate": 1.86e-05,
"loss": 3.5775,
"step": 1256
},
{
"epoch": 0.14032296888572596,
"grad_norm": 0.2108294665813446,
"learning_rate": 1.8575e-05,
"loss": 3.4886,
"step": 1257
},
{
"epoch": 0.14043460211475198,
"grad_norm": 0.24132932722568512,
"learning_rate": 1.855e-05,
"loss": 3.6032,
"step": 1258
},
{
"epoch": 0.14054623534377803,
"grad_norm": 0.20363110303878784,
"learning_rate": 1.8525e-05,
"loss": 3.6652,
"step": 1259
},
{
"epoch": 0.14065786857280405,
"grad_norm": 0.23135706782341003,
"learning_rate": 1.85e-05,
"loss": 3.5993,
"step": 1260
},
{
"epoch": 0.1407695018018301,
"grad_norm": 0.20346996188163757,
"learning_rate": 1.8475000000000002e-05,
"loss": 3.5945,
"step": 1261
},
{
"epoch": 0.14088113503085611,
"grad_norm": 0.2167162448167801,
"learning_rate": 1.845e-05,
"loss": 3.5206,
"step": 1262
},
{
"epoch": 0.14099276825988216,
"grad_norm": 0.2166433036327362,
"learning_rate": 1.8425e-05,
"loss": 3.5885,
"step": 1263
},
{
"epoch": 0.14110440148890818,
"grad_norm": 0.20988233387470245,
"learning_rate": 1.84e-05,
"loss": 3.5372,
"step": 1264
},
{
"epoch": 0.14121603471793423,
"grad_norm": 0.23411618173122406,
"learning_rate": 1.8375e-05,
"loss": 3.6102,
"step": 1265
},
{
"epoch": 0.14132766794696025,
"grad_norm": 0.22386209666728973,
"learning_rate": 1.8350000000000002e-05,
"loss": 3.4237,
"step": 1266
},
{
"epoch": 0.1414393011759863,
"grad_norm": 0.20570573210716248,
"learning_rate": 1.8325e-05,
"loss": 3.5774,
"step": 1267
},
{
"epoch": 0.14155093440501232,
"grad_norm": 0.20662033557891846,
"learning_rate": 1.83e-05,
"loss": 3.5471,
"step": 1268
},
{
"epoch": 0.14166256763403837,
"grad_norm": 0.21239370107650757,
"learning_rate": 1.8275e-05,
"loss": 3.5708,
"step": 1269
},
{
"epoch": 0.14177420086306441,
"grad_norm": 0.21178027987480164,
"learning_rate": 1.825e-05,
"loss": 3.6264,
"step": 1270
},
{
"epoch": 0.14188583409209043,
"grad_norm": 0.21225592494010925,
"learning_rate": 1.8225e-05,
"loss": 3.5279,
"step": 1271
},
{
"epoch": 0.14199746732111648,
"grad_norm": 0.22514958679676056,
"learning_rate": 1.8200000000000002e-05,
"loss": 3.5705,
"step": 1272
},
{
"epoch": 0.1421091005501425,
"grad_norm": 0.2146787941455841,
"learning_rate": 1.8175e-05,
"loss": 3.4781,
"step": 1273
},
{
"epoch": 0.14222073377916855,
"grad_norm": 0.20119592547416687,
"learning_rate": 1.815e-05,
"loss": 3.4134,
"step": 1274
},
{
"epoch": 0.14233236700819457,
"grad_norm": 0.2040000855922699,
"learning_rate": 1.8125e-05,
"loss": 3.5428,
"step": 1275
},
{
"epoch": 0.14244400023722062,
"grad_norm": 0.22360073029994965,
"learning_rate": 1.81e-05,
"loss": 3.5801,
"step": 1276
},
{
"epoch": 0.14255563346624664,
"grad_norm": 0.20249921083450317,
"learning_rate": 1.8075e-05,
"loss": 3.5855,
"step": 1277
},
{
"epoch": 0.1426672666952727,
"grad_norm": 0.22937600314617157,
"learning_rate": 1.805e-05,
"loss": 3.5728,
"step": 1278
},
{
"epoch": 0.1427788999242987,
"grad_norm": 0.22563280165195465,
"learning_rate": 1.8025e-05,
"loss": 3.6048,
"step": 1279
},
{
"epoch": 0.14289053315332476,
"grad_norm": 0.2215108871459961,
"learning_rate": 1.8e-05,
"loss": 3.598,
"step": 1280
},
{
"epoch": 0.14300216638235078,
"grad_norm": 0.24647381901741028,
"learning_rate": 1.7975e-05,
"loss": 3.4733,
"step": 1281
},
{
"epoch": 0.14311379961137682,
"grad_norm": 0.22133450210094452,
"learning_rate": 1.795e-05,
"loss": 3.5061,
"step": 1282
},
{
"epoch": 0.14322543284040284,
"grad_norm": 0.20675964653491974,
"learning_rate": 1.7925e-05,
"loss": 3.5898,
"step": 1283
},
{
"epoch": 0.1433370660694289,
"grad_norm": 0.2168865203857422,
"learning_rate": 1.79e-05,
"loss": 3.5999,
"step": 1284
},
{
"epoch": 0.1434486992984549,
"grad_norm": 0.21378791332244873,
"learning_rate": 1.7875e-05,
"loss": 3.5835,
"step": 1285
},
{
"epoch": 0.14356033252748096,
"grad_norm": 0.2131820023059845,
"learning_rate": 1.785e-05,
"loss": 3.5167,
"step": 1286
},
{
"epoch": 0.143671965756507,
"grad_norm": 0.21868109703063965,
"learning_rate": 1.7825e-05,
"loss": 3.5859,
"step": 1287
},
{
"epoch": 0.14378359898553303,
"grad_norm": 0.2127954065799713,
"learning_rate": 1.78e-05,
"loss": 3.607,
"step": 1288
},
{
"epoch": 0.14389523221455908,
"grad_norm": 0.20421262085437775,
"learning_rate": 1.7775e-05,
"loss": 3.5493,
"step": 1289
},
{
"epoch": 0.1440068654435851,
"grad_norm": 0.2145199179649353,
"learning_rate": 1.775e-05,
"loss": 3.5346,
"step": 1290
},
{
"epoch": 0.14411849867261114,
"grad_norm": 0.2110096961259842,
"learning_rate": 1.7725e-05,
"loss": 3.5686,
"step": 1291
},
{
"epoch": 0.14423013190163717,
"grad_norm": 0.22473768889904022,
"learning_rate": 1.77e-05,
"loss": 3.511,
"step": 1292
},
{
"epoch": 0.1443417651306632,
"grad_norm": 0.23084314167499542,
"learning_rate": 1.7675e-05,
"loss": 3.5603,
"step": 1293
},
{
"epoch": 0.14445339835968923,
"grad_norm": 0.21070967614650726,
"learning_rate": 1.765e-05,
"loss": 3.6489,
"step": 1294
},
{
"epoch": 0.14456503158871528,
"grad_norm": 0.22051683068275452,
"learning_rate": 1.7625e-05,
"loss": 3.5091,
"step": 1295
},
{
"epoch": 0.1446766648177413,
"grad_norm": 0.19961531460285187,
"learning_rate": 1.76e-05,
"loss": 3.5403,
"step": 1296
},
{
"epoch": 0.14478829804676735,
"grad_norm": 0.22710789740085602,
"learning_rate": 1.7575e-05,
"loss": 3.4781,
"step": 1297
},
{
"epoch": 0.14489993127579337,
"grad_norm": 0.2289581596851349,
"learning_rate": 1.755e-05,
"loss": 3.5291,
"step": 1298
},
{
"epoch": 0.14501156450481942,
"grad_norm": 0.204254150390625,
"learning_rate": 1.7525e-05,
"loss": 3.6457,
"step": 1299
},
{
"epoch": 0.14512319773384544,
"grad_norm": 0.20505326986312866,
"learning_rate": 1.75e-05,
"loss": 3.5872,
"step": 1300
},
{
"epoch": 0.1452348309628715,
"grad_norm": 0.22890357673168182,
"learning_rate": 1.7475e-05,
"loss": 3.4962,
"step": 1301
},
{
"epoch": 0.14534646419189753,
"grad_norm": 0.20769526064395905,
"learning_rate": 1.745e-05,
"loss": 3.4875,
"step": 1302
},
{
"epoch": 0.14545809742092355,
"grad_norm": 0.22655627131462097,
"learning_rate": 1.7425e-05,
"loss": 3.6089,
"step": 1303
},
{
"epoch": 0.1455697306499496,
"grad_norm": 0.2158648818731308,
"learning_rate": 1.74e-05,
"loss": 3.5794,
"step": 1304
},
{
"epoch": 0.14568136387897562,
"grad_norm": 0.22210943698883057,
"learning_rate": 1.7375e-05,
"loss": 3.584,
"step": 1305
},
{
"epoch": 0.14579299710800167,
"grad_norm": 0.21348996460437775,
"learning_rate": 1.7349999999999998e-05,
"loss": 3.4943,
"step": 1306
},
{
"epoch": 0.1459046303370277,
"grad_norm": 0.21592015027999878,
"learning_rate": 1.7325e-05,
"loss": 3.5682,
"step": 1307
},
{
"epoch": 0.14601626356605374,
"grad_norm": 0.20042981207370758,
"learning_rate": 1.73e-05,
"loss": 3.6089,
"step": 1308
},
{
"epoch": 0.14612789679507976,
"grad_norm": 0.23309749364852905,
"learning_rate": 1.7275e-05,
"loss": 3.5649,
"step": 1309
},
{
"epoch": 0.1462395300241058,
"grad_norm": 0.2152320146560669,
"learning_rate": 1.725e-05,
"loss": 3.5387,
"step": 1310
},
{
"epoch": 0.14635116325313183,
"grad_norm": 0.22288793325424194,
"learning_rate": 1.7225e-05,
"loss": 3.669,
"step": 1311
},
{
"epoch": 0.14646279648215788,
"grad_norm": 0.22235427796840668,
"learning_rate": 1.7199999999999998e-05,
"loss": 3.5105,
"step": 1312
},
{
"epoch": 0.1465744297111839,
"grad_norm": 0.21448449790477753,
"learning_rate": 1.7175e-05,
"loss": 3.6209,
"step": 1313
},
{
"epoch": 0.14668606294020994,
"grad_norm": 0.21014919877052307,
"learning_rate": 1.7150000000000004e-05,
"loss": 3.6002,
"step": 1314
},
{
"epoch": 0.14679769616923596,
"grad_norm": 0.20971186459064484,
"learning_rate": 1.7125000000000003e-05,
"loss": 3.5084,
"step": 1315
},
{
"epoch": 0.146909329398262,
"grad_norm": 0.20794574916362762,
"learning_rate": 1.7100000000000002e-05,
"loss": 3.5632,
"step": 1316
},
{
"epoch": 0.14702096262728803,
"grad_norm": 0.20937994122505188,
"learning_rate": 1.7075e-05,
"loss": 3.5138,
"step": 1317
},
{
"epoch": 0.14713259585631408,
"grad_norm": 0.21143656969070435,
"learning_rate": 1.705e-05,
"loss": 3.4616,
"step": 1318
},
{
"epoch": 0.14724422908534013,
"grad_norm": 0.22292785346508026,
"learning_rate": 1.7025e-05,
"loss": 3.4165,
"step": 1319
},
{
"epoch": 0.14735586231436615,
"grad_norm": 0.21518686413764954,
"learning_rate": 1.7000000000000003e-05,
"loss": 3.6225,
"step": 1320
},
{
"epoch": 0.1474674955433922,
"grad_norm": 0.20869877934455872,
"learning_rate": 1.6975000000000003e-05,
"loss": 3.5646,
"step": 1321
},
{
"epoch": 0.14757912877241822,
"grad_norm": 0.2118578851222992,
"learning_rate": 1.6950000000000002e-05,
"loss": 3.4983,
"step": 1322
},
{
"epoch": 0.14769076200144426,
"grad_norm": 0.22146885097026825,
"learning_rate": 1.6925e-05,
"loss": 3.5643,
"step": 1323
},
{
"epoch": 0.14780239523047028,
"grad_norm": 0.23182646930217743,
"learning_rate": 1.69e-05,
"loss": 3.4286,
"step": 1324
},
{
"epoch": 0.14791402845949633,
"grad_norm": 0.2240353375673294,
"learning_rate": 1.6875000000000004e-05,
"loss": 3.6328,
"step": 1325
},
{
"epoch": 0.14802566168852235,
"grad_norm": 0.1978340595960617,
"learning_rate": 1.6850000000000003e-05,
"loss": 3.5807,
"step": 1326
},
{
"epoch": 0.1481372949175484,
"grad_norm": 0.2036842554807663,
"learning_rate": 1.6825000000000002e-05,
"loss": 3.6243,
"step": 1327
},
{
"epoch": 0.14824892814657442,
"grad_norm": 0.20744021236896515,
"learning_rate": 1.6800000000000002e-05,
"loss": 3.5193,
"step": 1328
},
{
"epoch": 0.14836056137560047,
"grad_norm": 0.2049788534641266,
"learning_rate": 1.6775e-05,
"loss": 3.595,
"step": 1329
},
{
"epoch": 0.1484721946046265,
"grad_norm": 0.20628666877746582,
"learning_rate": 1.675e-05,
"loss": 3.4855,
"step": 1330
},
{
"epoch": 0.14858382783365254,
"grad_norm": 0.21975022554397583,
"learning_rate": 1.6725000000000003e-05,
"loss": 3.5284,
"step": 1331
},
{
"epoch": 0.14869546106267856,
"grad_norm": 0.19901278614997864,
"learning_rate": 1.6700000000000003e-05,
"loss": 3.5468,
"step": 1332
},
{
"epoch": 0.1488070942917046,
"grad_norm": 0.19940173625946045,
"learning_rate": 1.6675000000000002e-05,
"loss": 3.5352,
"step": 1333
},
{
"epoch": 0.14891872752073063,
"grad_norm": 0.218553826212883,
"learning_rate": 1.665e-05,
"loss": 3.525,
"step": 1334
},
{
"epoch": 0.14903036074975667,
"grad_norm": 0.22972969710826874,
"learning_rate": 1.6625e-05,
"loss": 3.5349,
"step": 1335
},
{
"epoch": 0.14914199397878272,
"grad_norm": 0.22111742198467255,
"learning_rate": 1.66e-05,
"loss": 3.4897,
"step": 1336
},
{
"epoch": 0.14925362720780874,
"grad_norm": 0.2089938074350357,
"learning_rate": 1.6575000000000003e-05,
"loss": 3.5503,
"step": 1337
},
{
"epoch": 0.1493652604368348,
"grad_norm": 0.210914745926857,
"learning_rate": 1.6550000000000002e-05,
"loss": 3.4767,
"step": 1338
},
{
"epoch": 0.1494768936658608,
"grad_norm": 0.23048734664916992,
"learning_rate": 1.6525000000000002e-05,
"loss": 3.6128,
"step": 1339
},
{
"epoch": 0.14958852689488686,
"grad_norm": 0.21404390037059784,
"learning_rate": 1.65e-05,
"loss": 3.4126,
"step": 1340
},
{
"epoch": 0.14970016012391288,
"grad_norm": 0.21800267696380615,
"learning_rate": 1.6475e-05,
"loss": 3.5079,
"step": 1341
},
{
"epoch": 0.14981179335293893,
"grad_norm": 0.21693755686283112,
"learning_rate": 1.645e-05,
"loss": 3.6194,
"step": 1342
},
{
"epoch": 0.14992342658196495,
"grad_norm": 0.20285376906394958,
"learning_rate": 1.6425000000000003e-05,
"loss": 3.6108,
"step": 1343
},
{
"epoch": 0.150035059810991,
"grad_norm": 0.22221077978610992,
"learning_rate": 1.6400000000000002e-05,
"loss": 3.6083,
"step": 1344
},
{
"epoch": 0.15014669304001702,
"grad_norm": 0.2023969292640686,
"learning_rate": 1.6375e-05,
"loss": 3.4888,
"step": 1345
},
{
"epoch": 0.15025832626904306,
"grad_norm": 0.23052248358726501,
"learning_rate": 1.635e-05,
"loss": 3.4719,
"step": 1346
},
{
"epoch": 0.15036995949806908,
"grad_norm": 0.21861739456653595,
"learning_rate": 1.6325e-05,
"loss": 3.5636,
"step": 1347
},
{
"epoch": 0.15048159272709513,
"grad_norm": 0.20246142148971558,
"learning_rate": 1.63e-05,
"loss": 3.6445,
"step": 1348
},
{
"epoch": 0.15059322595612115,
"grad_norm": 0.22579650580883026,
"learning_rate": 1.6275000000000003e-05,
"loss": 3.5358,
"step": 1349
},
{
"epoch": 0.1507048591851472,
"grad_norm": 0.23292294144630432,
"learning_rate": 1.6250000000000002e-05,
"loss": 3.6751,
"step": 1350
},
{
"epoch": 0.15081649241417325,
"grad_norm": 0.21318607032299042,
"learning_rate": 1.6225e-05,
"loss": 3.5914,
"step": 1351
},
{
"epoch": 0.15092812564319927,
"grad_norm": 0.22807946801185608,
"learning_rate": 1.62e-05,
"loss": 3.5981,
"step": 1352
},
{
"epoch": 0.15103975887222532,
"grad_norm": 0.20141063630580902,
"learning_rate": 1.6175e-05,
"loss": 3.5295,
"step": 1353
},
{
"epoch": 0.15115139210125134,
"grad_norm": 0.2140730619430542,
"learning_rate": 1.6150000000000003e-05,
"loss": 3.6889,
"step": 1354
},
{
"epoch": 0.15126302533027738,
"grad_norm": 0.20323944091796875,
"learning_rate": 1.6125000000000002e-05,
"loss": 3.5814,
"step": 1355
},
{
"epoch": 0.1513746585593034,
"grad_norm": 0.21241354942321777,
"learning_rate": 1.6100000000000002e-05,
"loss": 3.6089,
"step": 1356
},
{
"epoch": 0.15148629178832945,
"grad_norm": 0.21149618923664093,
"learning_rate": 1.6075e-05,
"loss": 3.6046,
"step": 1357
},
{
"epoch": 0.15159792501735547,
"grad_norm": 0.2123936265707016,
"learning_rate": 1.605e-05,
"loss": 3.6159,
"step": 1358
},
{
"epoch": 0.15170955824638152,
"grad_norm": 0.2110690176486969,
"learning_rate": 1.6025e-05,
"loss": 3.566,
"step": 1359
},
{
"epoch": 0.15182119147540754,
"grad_norm": 0.2124713510274887,
"learning_rate": 1.6000000000000003e-05,
"loss": 3.5233,
"step": 1360
},
{
"epoch": 0.1519328247044336,
"grad_norm": 0.22298456728458405,
"learning_rate": 1.5975000000000002e-05,
"loss": 3.5865,
"step": 1361
},
{
"epoch": 0.1520444579334596,
"grad_norm": 0.21143953502178192,
"learning_rate": 1.595e-05,
"loss": 3.4987,
"step": 1362
},
{
"epoch": 0.15215609116248566,
"grad_norm": 0.20869436860084534,
"learning_rate": 1.5925e-05,
"loss": 3.5331,
"step": 1363
},
{
"epoch": 0.15226772439151168,
"grad_norm": 0.2056984305381775,
"learning_rate": 1.59e-05,
"loss": 3.4991,
"step": 1364
},
{
"epoch": 0.15237935762053773,
"grad_norm": 0.195772185921669,
"learning_rate": 1.5875e-05,
"loss": 3.5712,
"step": 1365
},
{
"epoch": 0.15249099084956375,
"grad_norm": 0.21393226087093353,
"learning_rate": 1.5850000000000002e-05,
"loss": 3.5251,
"step": 1366
},
{
"epoch": 0.1526026240785898,
"grad_norm": 0.22534306347370148,
"learning_rate": 1.5825000000000002e-05,
"loss": 3.5171,
"step": 1367
},
{
"epoch": 0.15271425730761584,
"grad_norm": 0.21686024963855743,
"learning_rate": 1.58e-05,
"loss": 3.5745,
"step": 1368
},
{
"epoch": 0.15282589053664186,
"grad_norm": 0.21589086949825287,
"learning_rate": 1.5775e-05,
"loss": 3.5186,
"step": 1369
},
{
"epoch": 0.1529375237656679,
"grad_norm": 0.21077436208724976,
"learning_rate": 1.575e-05,
"loss": 3.4983,
"step": 1370
},
{
"epoch": 0.15304915699469393,
"grad_norm": 0.200812429189682,
"learning_rate": 1.5725e-05,
"loss": 3.5693,
"step": 1371
},
{
"epoch": 0.15316079022371998,
"grad_norm": 0.22226884961128235,
"learning_rate": 1.5700000000000002e-05,
"loss": 3.6007,
"step": 1372
},
{
"epoch": 0.153272423452746,
"grad_norm": 0.2146586924791336,
"learning_rate": 1.5675e-05,
"loss": 3.5932,
"step": 1373
},
{
"epoch": 0.15338405668177205,
"grad_norm": 0.2117159515619278,
"learning_rate": 1.565e-05,
"loss": 3.5523,
"step": 1374
},
{
"epoch": 0.15349568991079807,
"grad_norm": 0.20955950021743774,
"learning_rate": 1.5625e-05,
"loss": 3.456,
"step": 1375
},
{
"epoch": 0.15360732313982411,
"grad_norm": 0.20618252456188202,
"learning_rate": 1.56e-05,
"loss": 3.6145,
"step": 1376
},
{
"epoch": 0.15371895636885013,
"grad_norm": 0.22932898998260498,
"learning_rate": 1.5575e-05,
"loss": 3.5814,
"step": 1377
},
{
"epoch": 0.15383058959787618,
"grad_norm": 0.18903784453868866,
"learning_rate": 1.5550000000000002e-05,
"loss": 3.5468,
"step": 1378
},
{
"epoch": 0.1539422228269022,
"grad_norm": 0.22605137526988983,
"learning_rate": 1.5525e-05,
"loss": 3.5694,
"step": 1379
},
{
"epoch": 0.15405385605592825,
"grad_norm": 0.2134653925895691,
"learning_rate": 1.55e-05,
"loss": 3.5371,
"step": 1380
},
{
"epoch": 0.15416548928495427,
"grad_norm": 0.22066886723041534,
"learning_rate": 1.5475e-05,
"loss": 3.645,
"step": 1381
},
{
"epoch": 0.15427712251398032,
"grad_norm": 0.2152653932571411,
"learning_rate": 1.545e-05,
"loss": 3.6038,
"step": 1382
},
{
"epoch": 0.15438875574300634,
"grad_norm": 0.20610612630844116,
"learning_rate": 1.5425000000000002e-05,
"loss": 3.5485,
"step": 1383
},
{
"epoch": 0.1545003889720324,
"grad_norm": 0.24205341935157776,
"learning_rate": 1.54e-05,
"loss": 3.5409,
"step": 1384
},
{
"epoch": 0.15461202220105844,
"grad_norm": 0.20902207493782043,
"learning_rate": 1.5375e-05,
"loss": 3.5187,
"step": 1385
},
{
"epoch": 0.15472365543008446,
"grad_norm": 0.21475106477737427,
"learning_rate": 1.535e-05,
"loss": 3.4662,
"step": 1386
},
{
"epoch": 0.1548352886591105,
"grad_norm": 0.21286524832248688,
"learning_rate": 1.5325e-05,
"loss": 3.4786,
"step": 1387
},
{
"epoch": 0.15494692188813652,
"grad_norm": 0.20601478219032288,
"learning_rate": 1.53e-05,
"loss": 3.5084,
"step": 1388
},
{
"epoch": 0.15505855511716257,
"grad_norm": 0.22020862996578217,
"learning_rate": 1.5275000000000002e-05,
"loss": 3.5496,
"step": 1389
},
{
"epoch": 0.1551701883461886,
"grad_norm": 0.21930280327796936,
"learning_rate": 1.525e-05,
"loss": 3.5062,
"step": 1390
},
{
"epoch": 0.15528182157521464,
"grad_norm": 0.21371957659721375,
"learning_rate": 1.5225e-05,
"loss": 3.6362,
"step": 1391
},
{
"epoch": 0.15539345480424066,
"grad_norm": 0.2155320644378662,
"learning_rate": 1.52e-05,
"loss": 3.5661,
"step": 1392
},
{
"epoch": 0.1555050880332667,
"grad_norm": 0.20452113449573517,
"learning_rate": 1.5175e-05,
"loss": 3.5367,
"step": 1393
},
{
"epoch": 0.15561672126229273,
"grad_norm": 0.22195492684841156,
"learning_rate": 1.515e-05,
"loss": 3.5974,
"step": 1394
},
{
"epoch": 0.15572835449131878,
"grad_norm": 0.2118270993232727,
"learning_rate": 1.5125e-05,
"loss": 3.6211,
"step": 1395
},
{
"epoch": 0.1558399877203448,
"grad_norm": 0.21555618941783905,
"learning_rate": 1.51e-05,
"loss": 3.4184,
"step": 1396
},
{
"epoch": 0.15595162094937084,
"grad_norm": 0.21868236362934113,
"learning_rate": 1.5075e-05,
"loss": 3.5928,
"step": 1397
},
{
"epoch": 0.15606325417839687,
"grad_norm": 0.2279786914587021,
"learning_rate": 1.505e-05,
"loss": 3.423,
"step": 1398
},
{
"epoch": 0.1561748874074229,
"grad_norm": 0.20806945860385895,
"learning_rate": 1.5025000000000001e-05,
"loss": 3.5181,
"step": 1399
},
{
"epoch": 0.15628652063644893,
"grad_norm": 0.21479915082454681,
"learning_rate": 1.5e-05,
"loss": 3.5659,
"step": 1400
},
{
"epoch": 0.15639815386547498,
"grad_norm": 0.2213413268327713,
"learning_rate": 1.4975e-05,
"loss": 3.4806,
"step": 1401
},
{
"epoch": 0.15650978709450103,
"grad_norm": 0.21492530405521393,
"learning_rate": 1.4950000000000001e-05,
"loss": 3.5851,
"step": 1402
},
{
"epoch": 0.15662142032352705,
"grad_norm": 0.24160236120224,
"learning_rate": 1.4925e-05,
"loss": 3.5838,
"step": 1403
},
{
"epoch": 0.1567330535525531,
"grad_norm": 0.22599904239177704,
"learning_rate": 1.49e-05,
"loss": 3.6036,
"step": 1404
},
{
"epoch": 0.15684468678157912,
"grad_norm": 0.20814429223537445,
"learning_rate": 1.4875e-05,
"loss": 3.5942,
"step": 1405
},
{
"epoch": 0.15695632001060517,
"grad_norm": 0.2230771780014038,
"learning_rate": 1.485e-05,
"loss": 3.4972,
"step": 1406
},
{
"epoch": 0.1570679532396312,
"grad_norm": 0.20796571671962738,
"learning_rate": 1.4825e-05,
"loss": 3.4894,
"step": 1407
},
{
"epoch": 0.15717958646865723,
"grad_norm": 0.21681338548660278,
"learning_rate": 1.48e-05,
"loss": 3.4848,
"step": 1408
},
{
"epoch": 0.15729121969768325,
"grad_norm": 0.20095278322696686,
"learning_rate": 1.4775e-05,
"loss": 3.5127,
"step": 1409
},
{
"epoch": 0.1574028529267093,
"grad_norm": 0.21350175142288208,
"learning_rate": 1.475e-05,
"loss": 3.5527,
"step": 1410
},
{
"epoch": 0.15751448615573532,
"grad_norm": 0.2261526733636856,
"learning_rate": 1.4725e-05,
"loss": 3.5848,
"step": 1411
},
{
"epoch": 0.15762611938476137,
"grad_norm": 0.21552251279354095,
"learning_rate": 1.47e-05,
"loss": 3.533,
"step": 1412
},
{
"epoch": 0.1577377526137874,
"grad_norm": 0.22712787985801697,
"learning_rate": 1.4675e-05,
"loss": 3.459,
"step": 1413
},
{
"epoch": 0.15784938584281344,
"grad_norm": 0.20938892662525177,
"learning_rate": 1.465e-05,
"loss": 3.4798,
"step": 1414
},
{
"epoch": 0.15796101907183946,
"grad_norm": 0.2373296320438385,
"learning_rate": 1.4625e-05,
"loss": 3.6355,
"step": 1415
},
{
"epoch": 0.1580726523008655,
"grad_norm": 0.22512052953243256,
"learning_rate": 1.4599999999999999e-05,
"loss": 3.4992,
"step": 1416
},
{
"epoch": 0.15818428552989156,
"grad_norm": 0.20521408319473267,
"learning_rate": 1.4575e-05,
"loss": 3.4781,
"step": 1417
},
{
"epoch": 0.15829591875891758,
"grad_norm": 0.20856474339962006,
"learning_rate": 1.455e-05,
"loss": 3.5781,
"step": 1418
},
{
"epoch": 0.15840755198794362,
"grad_norm": 0.2121255099773407,
"learning_rate": 1.4524999999999999e-05,
"loss": 3.4772,
"step": 1419
},
{
"epoch": 0.15851918521696964,
"grad_norm": 0.2173224836587906,
"learning_rate": 1.45e-05,
"loss": 3.6131,
"step": 1420
},
{
"epoch": 0.1586308184459957,
"grad_norm": 0.21213147044181824,
"learning_rate": 1.4475e-05,
"loss": 3.6118,
"step": 1421
},
{
"epoch": 0.1587424516750217,
"grad_norm": 0.21709036827087402,
"learning_rate": 1.4449999999999999e-05,
"loss": 3.5946,
"step": 1422
},
{
"epoch": 0.15885408490404776,
"grad_norm": 0.2092135101556778,
"learning_rate": 1.4425e-05,
"loss": 3.5075,
"step": 1423
},
{
"epoch": 0.15896571813307378,
"grad_norm": 0.2020912766456604,
"learning_rate": 1.44e-05,
"loss": 3.545,
"step": 1424
},
{
"epoch": 0.15907735136209983,
"grad_norm": 0.2089933305978775,
"learning_rate": 1.4374999999999999e-05,
"loss": 3.4883,
"step": 1425
},
{
"epoch": 0.15918898459112585,
"grad_norm": 0.21972651779651642,
"learning_rate": 1.435e-05,
"loss": 3.5493,
"step": 1426
},
{
"epoch": 0.1593006178201519,
"grad_norm": 0.21938163042068481,
"learning_rate": 1.4325e-05,
"loss": 3.5339,
"step": 1427
},
{
"epoch": 0.15941225104917792,
"grad_norm": 0.21089443564414978,
"learning_rate": 1.43e-05,
"loss": 3.6018,
"step": 1428
},
{
"epoch": 0.15952388427820396,
"grad_norm": 0.19909141957759857,
"learning_rate": 1.4275e-05,
"loss": 3.4656,
"step": 1429
},
{
"epoch": 0.15963551750722998,
"grad_norm": 0.22186774015426636,
"learning_rate": 1.4249999999999999e-05,
"loss": 3.6036,
"step": 1430
},
{
"epoch": 0.15974715073625603,
"grad_norm": 0.21138955652713776,
"learning_rate": 1.4225e-05,
"loss": 3.4847,
"step": 1431
},
{
"epoch": 0.15985878396528205,
"grad_norm": 0.20794129371643066,
"learning_rate": 1.42e-05,
"loss": 3.539,
"step": 1432
},
{
"epoch": 0.1599704171943081,
"grad_norm": 0.2123575657606125,
"learning_rate": 1.4174999999999999e-05,
"loss": 3.574,
"step": 1433
},
{
"epoch": 0.16008205042333415,
"grad_norm": 0.20771554112434387,
"learning_rate": 1.415e-05,
"loss": 3.4436,
"step": 1434
},
{
"epoch": 0.16019368365236017,
"grad_norm": 0.2056964933872223,
"learning_rate": 1.4125e-05,
"loss": 3.5954,
"step": 1435
},
{
"epoch": 0.16030531688138622,
"grad_norm": 0.204432874917984,
"learning_rate": 1.4099999999999999e-05,
"loss": 3.4707,
"step": 1436
},
{
"epoch": 0.16041695011041224,
"grad_norm": 0.22269847989082336,
"learning_rate": 1.4075e-05,
"loss": 3.5813,
"step": 1437
},
{
"epoch": 0.16052858333943829,
"grad_norm": 0.22461041808128357,
"learning_rate": 1.4050000000000003e-05,
"loss": 3.6105,
"step": 1438
},
{
"epoch": 0.1606402165684643,
"grad_norm": 0.21629801392555237,
"learning_rate": 1.4025000000000002e-05,
"loss": 3.3747,
"step": 1439
},
{
"epoch": 0.16075184979749035,
"grad_norm": 0.2119080126285553,
"learning_rate": 1.4000000000000001e-05,
"loss": 3.5527,
"step": 1440
},
{
"epoch": 0.16086348302651637,
"grad_norm": 0.2112884819507599,
"learning_rate": 1.3975000000000003e-05,
"loss": 3.4673,
"step": 1441
},
{
"epoch": 0.16097511625554242,
"grad_norm": 0.2316942662000656,
"learning_rate": 1.3950000000000002e-05,
"loss": 3.6066,
"step": 1442
},
{
"epoch": 0.16108674948456844,
"grad_norm": 0.23155200481414795,
"learning_rate": 1.3925000000000001e-05,
"loss": 3.4735,
"step": 1443
},
{
"epoch": 0.1611983827135945,
"grad_norm": 0.21494297683238983,
"learning_rate": 1.3900000000000002e-05,
"loss": 3.4838,
"step": 1444
},
{
"epoch": 0.1613100159426205,
"grad_norm": 0.22902114689350128,
"learning_rate": 1.3875000000000002e-05,
"loss": 3.5036,
"step": 1445
},
{
"epoch": 0.16142164917164656,
"grad_norm": 0.21502353250980377,
"learning_rate": 1.3850000000000001e-05,
"loss": 3.6018,
"step": 1446
},
{
"epoch": 0.16153328240067258,
"grad_norm": 0.20738163590431213,
"learning_rate": 1.3825000000000002e-05,
"loss": 3.5585,
"step": 1447
},
{
"epoch": 0.16164491562969863,
"grad_norm": 0.2262074500322342,
"learning_rate": 1.3800000000000002e-05,
"loss": 3.6609,
"step": 1448
},
{
"epoch": 0.16175654885872465,
"grad_norm": 0.23943816125392914,
"learning_rate": 1.3775000000000001e-05,
"loss": 3.5755,
"step": 1449
},
{
"epoch": 0.1618681820877507,
"grad_norm": 0.2126341611146927,
"learning_rate": 1.3750000000000002e-05,
"loss": 3.6235,
"step": 1450
},
{
"epoch": 0.16197981531677674,
"grad_norm": 0.22666750848293304,
"learning_rate": 1.3725000000000002e-05,
"loss": 3.5032,
"step": 1451
},
{
"epoch": 0.16209144854580276,
"grad_norm": 0.2192983776330948,
"learning_rate": 1.3700000000000001e-05,
"loss": 3.5321,
"step": 1452
},
{
"epoch": 0.1622030817748288,
"grad_norm": 0.22649292647838593,
"learning_rate": 1.3675000000000002e-05,
"loss": 3.6066,
"step": 1453
},
{
"epoch": 0.16231471500385483,
"grad_norm": 0.20492173731327057,
"learning_rate": 1.3650000000000001e-05,
"loss": 3.5261,
"step": 1454
},
{
"epoch": 0.16242634823288088,
"grad_norm": 0.2134513258934021,
"learning_rate": 1.3625e-05,
"loss": 3.5639,
"step": 1455
},
{
"epoch": 0.1625379814619069,
"grad_norm": 0.21607226133346558,
"learning_rate": 1.3600000000000002e-05,
"loss": 3.659,
"step": 1456
},
{
"epoch": 0.16264961469093295,
"grad_norm": 0.2083975374698639,
"learning_rate": 1.3575000000000001e-05,
"loss": 3.5156,
"step": 1457
},
{
"epoch": 0.16276124791995897,
"grad_norm": 0.2102838009595871,
"learning_rate": 1.3550000000000002e-05,
"loss": 3.5657,
"step": 1458
},
{
"epoch": 0.16287288114898502,
"grad_norm": 0.21993212401866913,
"learning_rate": 1.3525000000000002e-05,
"loss": 3.4882,
"step": 1459
},
{
"epoch": 0.16298451437801104,
"grad_norm": 0.21374772489070892,
"learning_rate": 1.3500000000000001e-05,
"loss": 3.5411,
"step": 1460
},
{
"epoch": 0.16309614760703708,
"grad_norm": 0.20911991596221924,
"learning_rate": 1.3475000000000002e-05,
"loss": 3.5503,
"step": 1461
},
{
"epoch": 0.1632077808360631,
"grad_norm": 0.21545152366161346,
"learning_rate": 1.3450000000000002e-05,
"loss": 3.5762,
"step": 1462
},
{
"epoch": 0.16331941406508915,
"grad_norm": 0.21005657315254211,
"learning_rate": 1.3425000000000001e-05,
"loss": 3.5041,
"step": 1463
},
{
"epoch": 0.16343104729411517,
"grad_norm": 0.21614089608192444,
"learning_rate": 1.3400000000000002e-05,
"loss": 3.6132,
"step": 1464
},
{
"epoch": 0.16354268052314122,
"grad_norm": 0.21868258714675903,
"learning_rate": 1.3375000000000002e-05,
"loss": 3.5561,
"step": 1465
},
{
"epoch": 0.16365431375216724,
"grad_norm": 0.21252040565013885,
"learning_rate": 1.3350000000000001e-05,
"loss": 3.4272,
"step": 1466
},
{
"epoch": 0.1637659469811933,
"grad_norm": 0.21259881556034088,
"learning_rate": 1.3325000000000002e-05,
"loss": 3.5324,
"step": 1467
},
{
"epoch": 0.16387758021021934,
"grad_norm": 0.22414828836917877,
"learning_rate": 1.3300000000000001e-05,
"loss": 3.5275,
"step": 1468
},
{
"epoch": 0.16398921343924536,
"grad_norm": 0.2094557285308838,
"learning_rate": 1.3275e-05,
"loss": 3.5668,
"step": 1469
},
{
"epoch": 0.1641008466682714,
"grad_norm": 0.2149396389722824,
"learning_rate": 1.3250000000000002e-05,
"loss": 3.687,
"step": 1470
},
{
"epoch": 0.16421247989729743,
"grad_norm": 0.19812384247779846,
"learning_rate": 1.3225000000000001e-05,
"loss": 3.6539,
"step": 1471
},
{
"epoch": 0.16432411312632347,
"grad_norm": 0.21302932500839233,
"learning_rate": 1.32e-05,
"loss": 3.4727,
"step": 1472
},
{
"epoch": 0.1644357463553495,
"grad_norm": 0.233284130692482,
"learning_rate": 1.3175000000000002e-05,
"loss": 3.5392,
"step": 1473
},
{
"epoch": 0.16454737958437554,
"grad_norm": 0.21683721244335175,
"learning_rate": 1.3150000000000001e-05,
"loss": 3.576,
"step": 1474
},
{
"epoch": 0.16465901281340156,
"grad_norm": 0.23188064992427826,
"learning_rate": 1.3125e-05,
"loss": 3.5392,
"step": 1475
},
{
"epoch": 0.1647706460424276,
"grad_norm": 0.21342453360557556,
"learning_rate": 1.3100000000000002e-05,
"loss": 3.5261,
"step": 1476
},
{
"epoch": 0.16488227927145363,
"grad_norm": 0.21851873397827148,
"learning_rate": 1.3075000000000001e-05,
"loss": 3.5029,
"step": 1477
},
{
"epoch": 0.16499391250047968,
"grad_norm": 0.2827218174934387,
"learning_rate": 1.305e-05,
"loss": 3.4919,
"step": 1478
},
{
"epoch": 0.1651055457295057,
"grad_norm": 0.21115051209926605,
"learning_rate": 1.3025000000000002e-05,
"loss": 3.615,
"step": 1479
},
{
"epoch": 0.16521717895853175,
"grad_norm": 0.2067495584487915,
"learning_rate": 1.3000000000000001e-05,
"loss": 3.5447,
"step": 1480
},
{
"epoch": 0.16532881218755777,
"grad_norm": 0.2529817223548889,
"learning_rate": 1.2975e-05,
"loss": 3.5876,
"step": 1481
},
{
"epoch": 0.16544044541658381,
"grad_norm": 0.21947415173053741,
"learning_rate": 1.2950000000000001e-05,
"loss": 3.5314,
"step": 1482
},
{
"epoch": 0.16555207864560986,
"grad_norm": 0.21117177605628967,
"learning_rate": 1.2925e-05,
"loss": 3.5831,
"step": 1483
},
{
"epoch": 0.16566371187463588,
"grad_norm": 0.21180784702301025,
"learning_rate": 1.29e-05,
"loss": 3.5916,
"step": 1484
},
{
"epoch": 0.16577534510366193,
"grad_norm": 0.22672826051712036,
"learning_rate": 1.2875000000000001e-05,
"loss": 3.5531,
"step": 1485
},
{
"epoch": 0.16588697833268795,
"grad_norm": 0.21674370765686035,
"learning_rate": 1.285e-05,
"loss": 3.5221,
"step": 1486
},
{
"epoch": 0.165998611561714,
"grad_norm": 0.21089033782482147,
"learning_rate": 1.2825000000000002e-05,
"loss": 3.5448,
"step": 1487
},
{
"epoch": 0.16611024479074002,
"grad_norm": 0.22855010628700256,
"learning_rate": 1.2800000000000001e-05,
"loss": 3.5717,
"step": 1488
},
{
"epoch": 0.16622187801976607,
"grad_norm": 0.21975110471248627,
"learning_rate": 1.2775e-05,
"loss": 3.5829,
"step": 1489
},
{
"epoch": 0.1663335112487921,
"grad_norm": 0.22855724394321442,
"learning_rate": 1.2750000000000002e-05,
"loss": 3.464,
"step": 1490
},
{
"epoch": 0.16644514447781814,
"grad_norm": 0.2015756219625473,
"learning_rate": 1.2725000000000001e-05,
"loss": 3.6538,
"step": 1491
},
{
"epoch": 0.16655677770684416,
"grad_norm": 0.2162073254585266,
"learning_rate": 1.27e-05,
"loss": 3.482,
"step": 1492
},
{
"epoch": 0.1666684109358702,
"grad_norm": 0.20621392130851746,
"learning_rate": 1.2675000000000001e-05,
"loss": 3.5012,
"step": 1493
},
{
"epoch": 0.16678004416489622,
"grad_norm": 0.22770124673843384,
"learning_rate": 1.2650000000000001e-05,
"loss": 3.5976,
"step": 1494
},
{
"epoch": 0.16689167739392227,
"grad_norm": 0.21707509458065033,
"learning_rate": 1.2625e-05,
"loss": 3.495,
"step": 1495
},
{
"epoch": 0.1670033106229483,
"grad_norm": 0.22156280279159546,
"learning_rate": 1.2600000000000001e-05,
"loss": 3.617,
"step": 1496
},
{
"epoch": 0.16711494385197434,
"grad_norm": 0.2090180665254593,
"learning_rate": 1.2575e-05,
"loss": 3.5106,
"step": 1497
},
{
"epoch": 0.16722657708100036,
"grad_norm": 0.22272536158561707,
"learning_rate": 1.255e-05,
"loss": 3.6574,
"step": 1498
},
{
"epoch": 0.1673382103100264,
"grad_norm": 0.24037303030490875,
"learning_rate": 1.2525000000000001e-05,
"loss": 3.411,
"step": 1499
},
{
"epoch": 0.16744984353905246,
"grad_norm": 0.21370282769203186,
"learning_rate": 1.25e-05,
"loss": 3.6125,
"step": 1500
},
{
"epoch": 0.16756147676807848,
"grad_norm": 0.2080262452363968,
"learning_rate": 1.2475e-05,
"loss": 3.5046,
"step": 1501
},
{
"epoch": 0.16767310999710452,
"grad_norm": 0.21222364902496338,
"learning_rate": 1.2450000000000001e-05,
"loss": 3.5178,
"step": 1502
},
{
"epoch": 0.16778474322613054,
"grad_norm": 0.21181917190551758,
"learning_rate": 1.2425e-05,
"loss": 3.5196,
"step": 1503
},
{
"epoch": 0.1678963764551566,
"grad_norm": 0.21544714272022247,
"learning_rate": 1.24e-05,
"loss": 3.53,
"step": 1504
},
{
"epoch": 0.1680080096841826,
"grad_norm": 0.20736469328403473,
"learning_rate": 1.2375000000000001e-05,
"loss": 3.5103,
"step": 1505
},
{
"epoch": 0.16811964291320866,
"grad_norm": 0.21963639557361603,
"learning_rate": 1.235e-05,
"loss": 3.4397,
"step": 1506
},
{
"epoch": 0.16823127614223468,
"grad_norm": 0.2241033911705017,
"learning_rate": 1.2325e-05,
"loss": 3.4924,
"step": 1507
},
{
"epoch": 0.16834290937126073,
"grad_norm": 0.2196267992258072,
"learning_rate": 1.23e-05,
"loss": 3.4072,
"step": 1508
},
{
"epoch": 0.16845454260028675,
"grad_norm": 0.2034514844417572,
"learning_rate": 1.2275e-05,
"loss": 3.6212,
"step": 1509
},
{
"epoch": 0.1685661758293128,
"grad_norm": 0.21471145749092102,
"learning_rate": 1.225e-05,
"loss": 3.5626,
"step": 1510
},
{
"epoch": 0.16867780905833882,
"grad_norm": 0.23062168061733246,
"learning_rate": 1.2225e-05,
"loss": 3.5535,
"step": 1511
},
{
"epoch": 0.16878944228736487,
"grad_norm": 0.21895597875118256,
"learning_rate": 1.22e-05,
"loss": 3.6344,
"step": 1512
},
{
"epoch": 0.16890107551639089,
"grad_norm": 0.21283774077892303,
"learning_rate": 1.2175e-05,
"loss": 3.6335,
"step": 1513
},
{
"epoch": 0.16901270874541693,
"grad_norm": 0.21924851834774017,
"learning_rate": 1.215e-05,
"loss": 3.528,
"step": 1514
},
{
"epoch": 0.16912434197444295,
"grad_norm": 0.22328750789165497,
"learning_rate": 1.2125e-05,
"loss": 3.6072,
"step": 1515
},
{
"epoch": 0.169235975203469,
"grad_norm": 0.22236225008964539,
"learning_rate": 1.2100000000000001e-05,
"loss": 3.4442,
"step": 1516
},
{
"epoch": 0.16934760843249505,
"grad_norm": 0.21451590955257416,
"learning_rate": 1.2075e-05,
"loss": 3.6533,
"step": 1517
},
{
"epoch": 0.16945924166152107,
"grad_norm": 0.22100859880447388,
"learning_rate": 1.205e-05,
"loss": 3.4443,
"step": 1518
},
{
"epoch": 0.16957087489054712,
"grad_norm": 0.2172645926475525,
"learning_rate": 1.2025000000000001e-05,
"loss": 3.4917,
"step": 1519
},
{
"epoch": 0.16968250811957314,
"grad_norm": 0.21748663485050201,
"learning_rate": 1.2e-05,
"loss": 3.5854,
"step": 1520
},
{
"epoch": 0.1697941413485992,
"grad_norm": 0.2013814002275467,
"learning_rate": 1.1975e-05,
"loss": 3.4533,
"step": 1521
},
{
"epoch": 0.1699057745776252,
"grad_norm": 0.2037298083305359,
"learning_rate": 1.195e-05,
"loss": 3.5438,
"step": 1522
},
{
"epoch": 0.17001740780665126,
"grad_norm": 0.2236228585243225,
"learning_rate": 1.1925e-05,
"loss": 3.3935,
"step": 1523
},
{
"epoch": 0.17012904103567728,
"grad_norm": 0.20873163640499115,
"learning_rate": 1.19e-05,
"loss": 3.506,
"step": 1524
},
{
"epoch": 0.17024067426470332,
"grad_norm": 0.21777422726154327,
"learning_rate": 1.1875e-05,
"loss": 3.5229,
"step": 1525
},
{
"epoch": 0.17035230749372934,
"grad_norm": 0.20728673040866852,
"learning_rate": 1.185e-05,
"loss": 3.5541,
"step": 1526
},
{
"epoch": 0.1704639407227554,
"grad_norm": 0.20998942852020264,
"learning_rate": 1.1825e-05,
"loss": 3.5376,
"step": 1527
},
{
"epoch": 0.1705755739517814,
"grad_norm": 0.21379025280475616,
"learning_rate": 1.18e-05,
"loss": 3.5639,
"step": 1528
},
{
"epoch": 0.17068720718080746,
"grad_norm": 0.21432730555534363,
"learning_rate": 1.1775e-05,
"loss": 3.5917,
"step": 1529
},
{
"epoch": 0.17079884040983348,
"grad_norm": 0.21745604276657104,
"learning_rate": 1.175e-05,
"loss": 3.5292,
"step": 1530
},
{
"epoch": 0.17091047363885953,
"grad_norm": 0.2096647471189499,
"learning_rate": 1.1725e-05,
"loss": 3.4218,
"step": 1531
},
{
"epoch": 0.17102210686788555,
"grad_norm": 0.20660053193569183,
"learning_rate": 1.1700000000000001e-05,
"loss": 3.5688,
"step": 1532
},
{
"epoch": 0.1711337400969116,
"grad_norm": 0.21090877056121826,
"learning_rate": 1.1675000000000001e-05,
"loss": 3.5688,
"step": 1533
},
{
"epoch": 0.17124537332593764,
"grad_norm": 0.20636992156505585,
"learning_rate": 1.1650000000000002e-05,
"loss": 3.5253,
"step": 1534
},
{
"epoch": 0.17135700655496366,
"grad_norm": 0.2143183797597885,
"learning_rate": 1.1625000000000001e-05,
"loss": 3.5414,
"step": 1535
},
{
"epoch": 0.1714686397839897,
"grad_norm": 0.21636615693569183,
"learning_rate": 1.16e-05,
"loss": 3.6101,
"step": 1536
},
{
"epoch": 0.17158027301301573,
"grad_norm": 0.21370527148246765,
"learning_rate": 1.1575000000000002e-05,
"loss": 3.4445,
"step": 1537
},
{
"epoch": 0.17169190624204178,
"grad_norm": 0.21299275755882263,
"learning_rate": 1.1550000000000001e-05,
"loss": 3.4939,
"step": 1538
},
{
"epoch": 0.1718035394710678,
"grad_norm": 0.20779496431350708,
"learning_rate": 1.1525e-05,
"loss": 3.5206,
"step": 1539
},
{
"epoch": 0.17191517270009385,
"grad_norm": 0.2223391830921173,
"learning_rate": 1.1500000000000002e-05,
"loss": 3.5862,
"step": 1540
},
{
"epoch": 0.17202680592911987,
"grad_norm": 0.2126963585615158,
"learning_rate": 1.1475000000000001e-05,
"loss": 3.4409,
"step": 1541
},
{
"epoch": 0.17213843915814592,
"grad_norm": 0.22384919226169586,
"learning_rate": 1.145e-05,
"loss": 3.5958,
"step": 1542
},
{
"epoch": 0.17225007238717194,
"grad_norm": 0.4821729362010956,
"learning_rate": 1.1425000000000002e-05,
"loss": 3.5489,
"step": 1543
},
{
"epoch": 0.17236170561619799,
"grad_norm": 0.21050678193569183,
"learning_rate": 1.1400000000000001e-05,
"loss": 3.4991,
"step": 1544
},
{
"epoch": 0.172473338845224,
"grad_norm": 0.22060689330101013,
"learning_rate": 1.1375e-05,
"loss": 3.5374,
"step": 1545
},
{
"epoch": 0.17258497207425005,
"grad_norm": 0.2169683873653412,
"learning_rate": 1.1350000000000001e-05,
"loss": 3.4706,
"step": 1546
},
{
"epoch": 0.17269660530327607,
"grad_norm": 0.20885589718818665,
"learning_rate": 1.1325e-05,
"loss": 3.575,
"step": 1547
},
{
"epoch": 0.17280823853230212,
"grad_norm": 0.2167971283197403,
"learning_rate": 1.13e-05,
"loss": 3.6424,
"step": 1548
},
{
"epoch": 0.17291987176132817,
"grad_norm": 0.2178073525428772,
"learning_rate": 1.1275000000000001e-05,
"loss": 3.4981,
"step": 1549
},
{
"epoch": 0.1730315049903542,
"grad_norm": 0.2138373851776123,
"learning_rate": 1.125e-05,
"loss": 3.5385,
"step": 1550
},
{
"epoch": 0.17314313821938024,
"grad_norm": 0.216523215174675,
"learning_rate": 1.1225e-05,
"loss": 3.587,
"step": 1551
},
{
"epoch": 0.17325477144840626,
"grad_norm": 0.20889335870742798,
"learning_rate": 1.1200000000000001e-05,
"loss": 3.5526,
"step": 1552
},
{
"epoch": 0.1733664046774323,
"grad_norm": 0.27265238761901855,
"learning_rate": 1.1175e-05,
"loss": 3.5769,
"step": 1553
},
{
"epoch": 0.17347803790645833,
"grad_norm": 0.21519778668880463,
"learning_rate": 1.115e-05,
"loss": 3.637,
"step": 1554
},
{
"epoch": 0.17358967113548437,
"grad_norm": 0.2167736291885376,
"learning_rate": 1.1125000000000001e-05,
"loss": 3.6629,
"step": 1555
},
{
"epoch": 0.1737013043645104,
"grad_norm": 0.22681158781051636,
"learning_rate": 1.11e-05,
"loss": 3.5012,
"step": 1556
},
{
"epoch": 0.17381293759353644,
"grad_norm": 0.21524807810783386,
"learning_rate": 1.1075e-05,
"loss": 3.5489,
"step": 1557
},
{
"epoch": 0.17392457082256246,
"grad_norm": 0.21959412097930908,
"learning_rate": 1.1050000000000001e-05,
"loss": 3.5919,
"step": 1558
},
{
"epoch": 0.1740362040515885,
"grad_norm": 0.21743929386138916,
"learning_rate": 1.1025e-05,
"loss": 3.5424,
"step": 1559
},
{
"epoch": 0.17414783728061453,
"grad_norm": 0.21246570348739624,
"learning_rate": 1.1000000000000001e-05,
"loss": 3.5134,
"step": 1560
},
{
"epoch": 0.17425947050964058,
"grad_norm": 0.2095751166343689,
"learning_rate": 1.0975e-05,
"loss": 3.5248,
"step": 1561
},
{
"epoch": 0.1743711037386666,
"grad_norm": 0.21749746799468994,
"learning_rate": 1.095e-05,
"loss": 3.492,
"step": 1562
},
{
"epoch": 0.17448273696769265,
"grad_norm": 0.20110465586185455,
"learning_rate": 1.0925000000000001e-05,
"loss": 3.5485,
"step": 1563
},
{
"epoch": 0.17459437019671867,
"grad_norm": 0.21883323788642883,
"learning_rate": 1.09e-05,
"loss": 3.5844,
"step": 1564
},
{
"epoch": 0.17470600342574472,
"grad_norm": 0.23392991721630096,
"learning_rate": 1.0875e-05,
"loss": 3.5022,
"step": 1565
},
{
"epoch": 0.17481763665477076,
"grad_norm": 0.2050667405128479,
"learning_rate": 1.0850000000000001e-05,
"loss": 3.3671,
"step": 1566
},
{
"epoch": 0.17492926988379678,
"grad_norm": 0.21827714145183563,
"learning_rate": 1.0825e-05,
"loss": 3.5177,
"step": 1567
},
{
"epoch": 0.17504090311282283,
"grad_norm": 0.19960032403469086,
"learning_rate": 1.08e-05,
"loss": 3.517,
"step": 1568
},
{
"epoch": 0.17515253634184885,
"grad_norm": 0.2874765694141388,
"learning_rate": 1.0775000000000001e-05,
"loss": 3.5364,
"step": 1569
},
{
"epoch": 0.1752641695708749,
"grad_norm": 0.2284320592880249,
"learning_rate": 1.075e-05,
"loss": 3.5305,
"step": 1570
},
{
"epoch": 0.17537580279990092,
"grad_norm": 0.21969011425971985,
"learning_rate": 1.0725e-05,
"loss": 3.5458,
"step": 1571
},
{
"epoch": 0.17548743602892697,
"grad_norm": 0.21821846067905426,
"learning_rate": 1.0700000000000001e-05,
"loss": 3.5432,
"step": 1572
},
{
"epoch": 0.175599069257953,
"grad_norm": 0.2403402477502823,
"learning_rate": 1.0675e-05,
"loss": 3.6681,
"step": 1573
},
{
"epoch": 0.17571070248697904,
"grad_norm": 0.22653374075889587,
"learning_rate": 1.065e-05,
"loss": 3.6269,
"step": 1574
},
{
"epoch": 0.17582233571600506,
"grad_norm": 0.22169093787670135,
"learning_rate": 1.0625e-05,
"loss": 3.5753,
"step": 1575
},
{
"epoch": 0.1759339689450311,
"grad_norm": 0.21036727726459503,
"learning_rate": 1.06e-05,
"loss": 3.5751,
"step": 1576
},
{
"epoch": 0.17604560217405713,
"grad_norm": 0.20712779462337494,
"learning_rate": 1.0575e-05,
"loss": 3.5511,
"step": 1577
},
{
"epoch": 0.17615723540308317,
"grad_norm": 0.22018960118293762,
"learning_rate": 1.055e-05,
"loss": 3.6056,
"step": 1578
},
{
"epoch": 0.1762688686321092,
"grad_norm": 0.25769972801208496,
"learning_rate": 1.0525e-05,
"loss": 3.6141,
"step": 1579
},
{
"epoch": 0.17638050186113524,
"grad_norm": 0.21188807487487793,
"learning_rate": 1.05e-05,
"loss": 3.6399,
"step": 1580
},
{
"epoch": 0.17649213509016126,
"grad_norm": 0.22229023277759552,
"learning_rate": 1.0475e-05,
"loss": 3.5115,
"step": 1581
},
{
"epoch": 0.1766037683191873,
"grad_norm": 0.21499265730381012,
"learning_rate": 1.045e-05,
"loss": 3.4834,
"step": 1582
},
{
"epoch": 0.17671540154821336,
"grad_norm": 0.24194024503231049,
"learning_rate": 1.0425e-05,
"loss": 3.4907,
"step": 1583
},
{
"epoch": 0.17682703477723938,
"grad_norm": 0.23081763088703156,
"learning_rate": 1.04e-05,
"loss": 3.5244,
"step": 1584
},
{
"epoch": 0.17693866800626543,
"grad_norm": 0.2430485486984253,
"learning_rate": 1.0375e-05,
"loss": 3.5799,
"step": 1585
},
{
"epoch": 0.17705030123529145,
"grad_norm": 0.21328388154506683,
"learning_rate": 1.035e-05,
"loss": 3.4979,
"step": 1586
},
{
"epoch": 0.1771619344643175,
"grad_norm": 0.20552369952201843,
"learning_rate": 1.0325e-05,
"loss": 3.5274,
"step": 1587
},
{
"epoch": 0.17727356769334351,
"grad_norm": 0.2030210793018341,
"learning_rate": 1.03e-05,
"loss": 3.6013,
"step": 1588
},
{
"epoch": 0.17738520092236956,
"grad_norm": 0.22777998447418213,
"learning_rate": 1.0275e-05,
"loss": 3.5272,
"step": 1589
},
{
"epoch": 0.17749683415139558,
"grad_norm": 0.21183083951473236,
"learning_rate": 1.025e-05,
"loss": 3.5106,
"step": 1590
},
{
"epoch": 0.17760846738042163,
"grad_norm": 0.21270430088043213,
"learning_rate": 1.0225e-05,
"loss": 3.5142,
"step": 1591
},
{
"epoch": 0.17772010060944765,
"grad_norm": 0.21856921911239624,
"learning_rate": 1.02e-05,
"loss": 3.5786,
"step": 1592
},
{
"epoch": 0.1778317338384737,
"grad_norm": 0.2257380336523056,
"learning_rate": 1.0175e-05,
"loss": 3.4983,
"step": 1593
},
{
"epoch": 0.17794336706749972,
"grad_norm": 0.21744483709335327,
"learning_rate": 1.0150000000000001e-05,
"loss": 3.5805,
"step": 1594
},
{
"epoch": 0.17805500029652577,
"grad_norm": 0.21964868903160095,
"learning_rate": 1.0125e-05,
"loss": 3.559,
"step": 1595
},
{
"epoch": 0.1781666335255518,
"grad_norm": 0.24103224277496338,
"learning_rate": 1.0100000000000002e-05,
"loss": 3.6182,
"step": 1596
},
{
"epoch": 0.17827826675457784,
"grad_norm": 0.2221798449754715,
"learning_rate": 1.0075000000000001e-05,
"loss": 3.507,
"step": 1597
},
{
"epoch": 0.17838989998360386,
"grad_norm": 0.22181838750839233,
"learning_rate": 1.005e-05,
"loss": 3.4702,
"step": 1598
},
{
"epoch": 0.1785015332126299,
"grad_norm": 0.2252049446105957,
"learning_rate": 1.0025000000000001e-05,
"loss": 3.5435,
"step": 1599
},
{
"epoch": 0.17861316644165595,
"grad_norm": 0.20981422066688538,
"learning_rate": 1e-05,
"loss": 3.5726,
"step": 1600
},
{
"epoch": 0.17872479967068197,
"grad_norm": 0.20881150662899017,
"learning_rate": 9.975e-06,
"loss": 3.5144,
"step": 1601
},
{
"epoch": 0.17883643289970802,
"grad_norm": 0.20763754844665527,
"learning_rate": 9.950000000000001e-06,
"loss": 3.5704,
"step": 1602
},
{
"epoch": 0.17894806612873404,
"grad_norm": 0.21722842752933502,
"learning_rate": 9.925e-06,
"loss": 3.5227,
"step": 1603
},
{
"epoch": 0.1790596993577601,
"grad_norm": 0.21515443921089172,
"learning_rate": 9.900000000000002e-06,
"loss": 3.5175,
"step": 1604
},
{
"epoch": 0.1791713325867861,
"grad_norm": 0.20165440440177917,
"learning_rate": 9.875000000000001e-06,
"loss": 3.5318,
"step": 1605
},
{
"epoch": 0.17928296581581216,
"grad_norm": 0.2161119282245636,
"learning_rate": 9.85e-06,
"loss": 3.6028,
"step": 1606
},
{
"epoch": 0.17939459904483818,
"grad_norm": 0.21552051603794098,
"learning_rate": 9.825000000000002e-06,
"loss": 3.5124,
"step": 1607
},
{
"epoch": 0.17950623227386422,
"grad_norm": 0.21852916479110718,
"learning_rate": 9.800000000000001e-06,
"loss": 3.6083,
"step": 1608
},
{
"epoch": 0.17961786550289024,
"grad_norm": 0.2451242059469223,
"learning_rate": 9.775e-06,
"loss": 3.5618,
"step": 1609
},
{
"epoch": 0.1797294987319163,
"grad_norm": 0.2138652354478836,
"learning_rate": 9.750000000000002e-06,
"loss": 3.551,
"step": 1610
},
{
"epoch": 0.1798411319609423,
"grad_norm": 0.21602340042591095,
"learning_rate": 9.725000000000001e-06,
"loss": 3.5948,
"step": 1611
},
{
"epoch": 0.17995276518996836,
"grad_norm": 0.2034982591867447,
"learning_rate": 9.7e-06,
"loss": 3.5985,
"step": 1612
},
{
"epoch": 0.18006439841899438,
"grad_norm": 0.20241326093673706,
"learning_rate": 9.675000000000001e-06,
"loss": 3.5132,
"step": 1613
},
{
"epoch": 0.18017603164802043,
"grad_norm": 0.20858971774578094,
"learning_rate": 9.65e-06,
"loss": 3.533,
"step": 1614
},
{
"epoch": 0.18028766487704648,
"grad_norm": 0.21889185905456543,
"learning_rate": 9.625e-06,
"loss": 3.6809,
"step": 1615
},
{
"epoch": 0.1803992981060725,
"grad_norm": 0.2096552848815918,
"learning_rate": 9.600000000000001e-06,
"loss": 3.4821,
"step": 1616
},
{
"epoch": 0.18051093133509855,
"grad_norm": 0.23852674663066864,
"learning_rate": 9.575e-06,
"loss": 3.4374,
"step": 1617
},
{
"epoch": 0.18062256456412457,
"grad_norm": 0.211916983127594,
"learning_rate": 9.55e-06,
"loss": 3.5486,
"step": 1618
},
{
"epoch": 0.1807341977931506,
"grad_norm": 0.2230437695980072,
"learning_rate": 9.525000000000001e-06,
"loss": 3.608,
"step": 1619
},
{
"epoch": 0.18084583102217663,
"grad_norm": 0.21562589704990387,
"learning_rate": 9.5e-06,
"loss": 3.537,
"step": 1620
},
{
"epoch": 0.18095746425120268,
"grad_norm": 0.22619010508060455,
"learning_rate": 9.475e-06,
"loss": 3.5595,
"step": 1621
},
{
"epoch": 0.1810690974802287,
"grad_norm": 0.1970967799425125,
"learning_rate": 9.450000000000001e-06,
"loss": 3.4868,
"step": 1622
},
{
"epoch": 0.18118073070925475,
"grad_norm": 0.21510455012321472,
"learning_rate": 9.425e-06,
"loss": 3.4862,
"step": 1623
},
{
"epoch": 0.18129236393828077,
"grad_norm": 0.21073304116725922,
"learning_rate": 9.4e-06,
"loss": 3.5623,
"step": 1624
},
{
"epoch": 0.18140399716730682,
"grad_norm": 0.20508894324302673,
"learning_rate": 9.375000000000001e-06,
"loss": 3.4956,
"step": 1625
},
{
"epoch": 0.18151563039633284,
"grad_norm": 0.2108650505542755,
"learning_rate": 9.35e-06,
"loss": 3.624,
"step": 1626
},
{
"epoch": 0.1816272636253589,
"grad_norm": 0.22114230692386627,
"learning_rate": 9.325e-06,
"loss": 3.5948,
"step": 1627
},
{
"epoch": 0.1817388968543849,
"grad_norm": 0.2232550084590912,
"learning_rate": 9.3e-06,
"loss": 3.6198,
"step": 1628
},
{
"epoch": 0.18185053008341096,
"grad_norm": 0.2209545373916626,
"learning_rate": 9.275e-06,
"loss": 3.5504,
"step": 1629
},
{
"epoch": 0.18196216331243698,
"grad_norm": 0.21971401572227478,
"learning_rate": 9.25e-06,
"loss": 3.5116,
"step": 1630
},
{
"epoch": 0.18207379654146302,
"grad_norm": 0.22807852923870087,
"learning_rate": 9.225e-06,
"loss": 3.4833,
"step": 1631
},
{
"epoch": 0.18218542977048907,
"grad_norm": 0.20887424051761627,
"learning_rate": 9.2e-06,
"loss": 3.5277,
"step": 1632
},
{
"epoch": 0.1822970629995151,
"grad_norm": 0.22335150837898254,
"learning_rate": 9.175000000000001e-06,
"loss": 3.5907,
"step": 1633
},
{
"epoch": 0.18240869622854114,
"grad_norm": 0.20540666580200195,
"learning_rate": 9.15e-06,
"loss": 3.5086,
"step": 1634
},
{
"epoch": 0.18252032945756716,
"grad_norm": 0.2120242565870285,
"learning_rate": 9.125e-06,
"loss": 3.5217,
"step": 1635
},
{
"epoch": 0.1826319626865932,
"grad_norm": 0.22870121896266937,
"learning_rate": 9.100000000000001e-06,
"loss": 3.6372,
"step": 1636
},
{
"epoch": 0.18274359591561923,
"grad_norm": 0.2195448875427246,
"learning_rate": 9.075e-06,
"loss": 3.5235,
"step": 1637
},
{
"epoch": 0.18285522914464528,
"grad_norm": 0.2103642225265503,
"learning_rate": 9.05e-06,
"loss": 3.5271,
"step": 1638
},
{
"epoch": 0.1829668623736713,
"grad_norm": 0.23172304034233093,
"learning_rate": 9.025e-06,
"loss": 3.6327,
"step": 1639
},
{
"epoch": 0.18307849560269734,
"grad_norm": 0.21956613659858704,
"learning_rate": 9e-06,
"loss": 3.5466,
"step": 1640
},
{
"epoch": 0.18319012883172336,
"grad_norm": 0.23706240952014923,
"learning_rate": 8.975e-06,
"loss": 3.5896,
"step": 1641
},
{
"epoch": 0.1833017620607494,
"grad_norm": 0.21631015837192535,
"learning_rate": 8.95e-06,
"loss": 3.5894,
"step": 1642
},
{
"epoch": 0.18341339528977543,
"grad_norm": 0.2186046689748764,
"learning_rate": 8.925e-06,
"loss": 3.5039,
"step": 1643
},
{
"epoch": 0.18352502851880148,
"grad_norm": 0.20839321613311768,
"learning_rate": 8.9e-06,
"loss": 3.5918,
"step": 1644
},
{
"epoch": 0.1836366617478275,
"grad_norm": 0.22128106653690338,
"learning_rate": 8.875e-06,
"loss": 3.6208,
"step": 1645
},
{
"epoch": 0.18374829497685355,
"grad_norm": 0.20387986302375793,
"learning_rate": 8.85e-06,
"loss": 3.5432,
"step": 1646
},
{
"epoch": 0.18385992820587957,
"grad_norm": 0.20732374489307404,
"learning_rate": 8.825e-06,
"loss": 3.5408,
"step": 1647
},
{
"epoch": 0.18397156143490562,
"grad_norm": 0.21314465999603271,
"learning_rate": 8.8e-06,
"loss": 3.4921,
"step": 1648
},
{
"epoch": 0.18408319466393167,
"grad_norm": 0.20982185006141663,
"learning_rate": 8.775e-06,
"loss": 3.5423,
"step": 1649
},
{
"epoch": 0.18419482789295769,
"grad_norm": 0.2033572494983673,
"learning_rate": 8.75e-06,
"loss": 3.5086,
"step": 1650
},
{
"epoch": 0.18430646112198373,
"grad_norm": 0.2375604808330536,
"learning_rate": 8.725e-06,
"loss": 3.5427,
"step": 1651
},
{
"epoch": 0.18441809435100975,
"grad_norm": 0.21518820524215698,
"learning_rate": 8.7e-06,
"loss": 3.5797,
"step": 1652
},
{
"epoch": 0.1845297275800358,
"grad_norm": 0.21988040208816528,
"learning_rate": 8.674999999999999e-06,
"loss": 3.6418,
"step": 1653
},
{
"epoch": 0.18464136080906182,
"grad_norm": 0.22092029452323914,
"learning_rate": 8.65e-06,
"loss": 3.4977,
"step": 1654
},
{
"epoch": 0.18475299403808787,
"grad_norm": 0.2101849913597107,
"learning_rate": 8.625e-06,
"loss": 3.5365,
"step": 1655
},
{
"epoch": 0.1848646272671139,
"grad_norm": 0.21999023854732513,
"learning_rate": 8.599999999999999e-06,
"loss": 3.5518,
"step": 1656
},
{
"epoch": 0.18497626049613994,
"grad_norm": 0.21221406757831573,
"learning_rate": 8.575000000000002e-06,
"loss": 3.5265,
"step": 1657
},
{
"epoch": 0.18508789372516596,
"grad_norm": 0.23556159436702728,
"learning_rate": 8.550000000000001e-06,
"loss": 3.4086,
"step": 1658
},
{
"epoch": 0.185199526954192,
"grad_norm": 0.21537427604198456,
"learning_rate": 8.525e-06,
"loss": 3.5649,
"step": 1659
},
{
"epoch": 0.18531116018321803,
"grad_norm": 0.22734335064888,
"learning_rate": 8.500000000000002e-06,
"loss": 3.5569,
"step": 1660
},
{
"epoch": 0.18542279341224407,
"grad_norm": 0.23715220391750336,
"learning_rate": 8.475000000000001e-06,
"loss": 3.5523,
"step": 1661
},
{
"epoch": 0.1855344266412701,
"grad_norm": 0.2167229950428009,
"learning_rate": 8.45e-06,
"loss": 3.5503,
"step": 1662
},
{
"epoch": 0.18564605987029614,
"grad_norm": 0.23782390356063843,
"learning_rate": 8.425000000000001e-06,
"loss": 3.4942,
"step": 1663
},
{
"epoch": 0.18575769309932216,
"grad_norm": 0.21744339168071747,
"learning_rate": 8.400000000000001e-06,
"loss": 3.6044,
"step": 1664
},
{
"epoch": 0.1858693263283482,
"grad_norm": 0.21382561326026917,
"learning_rate": 8.375e-06,
"loss": 3.5715,
"step": 1665
},
{
"epoch": 0.18598095955737426,
"grad_norm": 0.22757397592067719,
"learning_rate": 8.350000000000001e-06,
"loss": 3.5058,
"step": 1666
},
{
"epoch": 0.18609259278640028,
"grad_norm": 0.2172391563653946,
"learning_rate": 8.325e-06,
"loss": 3.4757,
"step": 1667
},
{
"epoch": 0.18620422601542633,
"grad_norm": 0.21588599681854248,
"learning_rate": 8.3e-06,
"loss": 3.5328,
"step": 1668
},
{
"epoch": 0.18631585924445235,
"grad_norm": 0.2147780954837799,
"learning_rate": 8.275000000000001e-06,
"loss": 3.5743,
"step": 1669
},
{
"epoch": 0.1864274924734784,
"grad_norm": 0.21497298777103424,
"learning_rate": 8.25e-06,
"loss": 3.5867,
"step": 1670
},
{
"epoch": 0.18653912570250442,
"grad_norm": 0.22152438759803772,
"learning_rate": 8.225e-06,
"loss": 3.567,
"step": 1671
},
{
"epoch": 0.18665075893153046,
"grad_norm": 0.21230459213256836,
"learning_rate": 8.200000000000001e-06,
"loss": 3.5939,
"step": 1672
},
{
"epoch": 0.18676239216055648,
"grad_norm": 0.20990978181362152,
"learning_rate": 8.175e-06,
"loss": 3.6556,
"step": 1673
},
{
"epoch": 0.18687402538958253,
"grad_norm": 0.21551361680030823,
"learning_rate": 8.15e-06,
"loss": 3.5663,
"step": 1674
},
{
"epoch": 0.18698565861860855,
"grad_norm": 0.21378657221794128,
"learning_rate": 8.125000000000001e-06,
"loss": 3.5193,
"step": 1675
},
{
"epoch": 0.1870972918476346,
"grad_norm": 0.2161632627248764,
"learning_rate": 8.1e-06,
"loss": 3.5594,
"step": 1676
},
{
"epoch": 0.18720892507666062,
"grad_norm": 0.2246815413236618,
"learning_rate": 8.075000000000001e-06,
"loss": 3.6398,
"step": 1677
},
{
"epoch": 0.18732055830568667,
"grad_norm": 0.22244827449321747,
"learning_rate": 8.050000000000001e-06,
"loss": 3.528,
"step": 1678
},
{
"epoch": 0.1874321915347127,
"grad_norm": 0.2109561711549759,
"learning_rate": 8.025e-06,
"loss": 3.5443,
"step": 1679
},
{
"epoch": 0.18754382476373874,
"grad_norm": 0.232377827167511,
"learning_rate": 8.000000000000001e-06,
"loss": 3.5274,
"step": 1680
},
{
"epoch": 0.18765545799276478,
"grad_norm": 0.21060387790203094,
"learning_rate": 7.975e-06,
"loss": 3.4706,
"step": 1681
},
{
"epoch": 0.1877670912217908,
"grad_norm": 0.2102738320827484,
"learning_rate": 7.95e-06,
"loss": 3.577,
"step": 1682
},
{
"epoch": 0.18787872445081685,
"grad_norm": 0.20153282582759857,
"learning_rate": 7.925000000000001e-06,
"loss": 3.4756,
"step": 1683
},
{
"epoch": 0.18799035767984287,
"grad_norm": 0.21588648855686188,
"learning_rate": 7.9e-06,
"loss": 3.5611,
"step": 1684
},
{
"epoch": 0.18810199090886892,
"grad_norm": 0.2133258581161499,
"learning_rate": 7.875e-06,
"loss": 3.5545,
"step": 1685
},
{
"epoch": 0.18821362413789494,
"grad_norm": 0.21955335140228271,
"learning_rate": 7.850000000000001e-06,
"loss": 3.5164,
"step": 1686
},
{
"epoch": 0.188325257366921,
"grad_norm": 0.20732825994491577,
"learning_rate": 7.825e-06,
"loss": 3.5436,
"step": 1687
},
{
"epoch": 0.188436890595947,
"grad_norm": 0.2761525809764862,
"learning_rate": 7.8e-06,
"loss": 3.4323,
"step": 1688
},
{
"epoch": 0.18854852382497306,
"grad_norm": 0.20956240594387054,
"learning_rate": 7.775000000000001e-06,
"loss": 3.553,
"step": 1689
},
{
"epoch": 0.18866015705399908,
"grad_norm": 0.2258850336074829,
"learning_rate": 7.75e-06,
"loss": 3.6367,
"step": 1690
},
{
"epoch": 0.18877179028302513,
"grad_norm": 0.2164323776960373,
"learning_rate": 7.725e-06,
"loss": 3.5386,
"step": 1691
},
{
"epoch": 0.18888342351205115,
"grad_norm": 0.20159319043159485,
"learning_rate": 7.7e-06,
"loss": 3.5318,
"step": 1692
},
{
"epoch": 0.1889950567410772,
"grad_norm": 0.21403788030147552,
"learning_rate": 7.675e-06,
"loss": 3.5573,
"step": 1693
},
{
"epoch": 0.18910668997010321,
"grad_norm": 0.21474437415599823,
"learning_rate": 7.65e-06,
"loss": 3.3878,
"step": 1694
},
{
"epoch": 0.18921832319912926,
"grad_norm": 0.20819175243377686,
"learning_rate": 7.625e-06,
"loss": 3.5155,
"step": 1695
},
{
"epoch": 0.18932995642815528,
"grad_norm": 0.2366546243429184,
"learning_rate": 7.6e-06,
"loss": 3.6345,
"step": 1696
},
{
"epoch": 0.18944158965718133,
"grad_norm": 0.21544064581394196,
"learning_rate": 7.575e-06,
"loss": 3.5887,
"step": 1697
},
{
"epoch": 0.18955322288620738,
"grad_norm": 0.21254029870033264,
"learning_rate": 7.55e-06,
"loss": 3.5919,
"step": 1698
},
{
"epoch": 0.1896648561152334,
"grad_norm": 0.20380495488643646,
"learning_rate": 7.525e-06,
"loss": 3.5529,
"step": 1699
},
{
"epoch": 0.18977648934425945,
"grad_norm": 0.20975427329540253,
"learning_rate": 7.5e-06,
"loss": 3.551,
"step": 1700
},
{
"epoch": 0.18988812257328547,
"grad_norm": 0.2120400220155716,
"learning_rate": 7.4750000000000004e-06,
"loss": 3.5268,
"step": 1701
},
{
"epoch": 0.18999975580231152,
"grad_norm": 0.20939916372299194,
"learning_rate": 7.45e-06,
"loss": 3.514,
"step": 1702
},
{
"epoch": 0.19011138903133754,
"grad_norm": 0.2030288428068161,
"learning_rate": 7.425e-06,
"loss": 3.4275,
"step": 1703
},
{
"epoch": 0.19022302226036358,
"grad_norm": 0.208505317568779,
"learning_rate": 7.4e-06,
"loss": 3.4469,
"step": 1704
},
{
"epoch": 0.1903346554893896,
"grad_norm": 0.23165282607078552,
"learning_rate": 7.375e-06,
"loss": 3.6005,
"step": 1705
},
{
"epoch": 0.19044628871841565,
"grad_norm": 0.21855324506759644,
"learning_rate": 7.35e-06,
"loss": 3.6461,
"step": 1706
},
{
"epoch": 0.19055792194744167,
"grad_norm": 0.22873982787132263,
"learning_rate": 7.325e-06,
"loss": 3.5015,
"step": 1707
},
{
"epoch": 0.19066955517646772,
"grad_norm": 0.22183842957019806,
"learning_rate": 7.2999999999999996e-06,
"loss": 3.4261,
"step": 1708
},
{
"epoch": 0.19078118840549374,
"grad_norm": 0.20959420502185822,
"learning_rate": 7.275e-06,
"loss": 3.5273,
"step": 1709
},
{
"epoch": 0.1908928216345198,
"grad_norm": 0.20506909489631653,
"learning_rate": 7.25e-06,
"loss": 3.5316,
"step": 1710
},
{
"epoch": 0.1910044548635458,
"grad_norm": 0.21426881849765778,
"learning_rate": 7.2249999999999994e-06,
"loss": 3.5625,
"step": 1711
},
{
"epoch": 0.19111608809257186,
"grad_norm": 0.21400906145572662,
"learning_rate": 7.2e-06,
"loss": 3.5668,
"step": 1712
},
{
"epoch": 0.19122772132159788,
"grad_norm": 0.21649572253227234,
"learning_rate": 7.175e-06,
"loss": 3.4373,
"step": 1713
},
{
"epoch": 0.19133935455062392,
"grad_norm": 0.21563754975795746,
"learning_rate": 7.15e-06,
"loss": 3.6013,
"step": 1714
},
{
"epoch": 0.19145098777964997,
"grad_norm": 0.19784951210021973,
"learning_rate": 7.1249999999999995e-06,
"loss": 3.529,
"step": 1715
},
{
"epoch": 0.191562621008676,
"grad_norm": 0.2245485633611679,
"learning_rate": 7.1e-06,
"loss": 3.4646,
"step": 1716
},
{
"epoch": 0.19167425423770204,
"grad_norm": 0.21543480455875397,
"learning_rate": 7.075e-06,
"loss": 3.5116,
"step": 1717
},
{
"epoch": 0.19178588746672806,
"grad_norm": 0.21941086649894714,
"learning_rate": 7.049999999999999e-06,
"loss": 3.591,
"step": 1718
},
{
"epoch": 0.1918975206957541,
"grad_norm": 0.2070179581642151,
"learning_rate": 7.025000000000001e-06,
"loss": 3.5775,
"step": 1719
},
{
"epoch": 0.19200915392478013,
"grad_norm": 0.2146470993757248,
"learning_rate": 7.000000000000001e-06,
"loss": 3.4814,
"step": 1720
},
{
"epoch": 0.19212078715380618,
"grad_norm": 0.21325471997261047,
"learning_rate": 6.975000000000001e-06,
"loss": 3.5154,
"step": 1721
},
{
"epoch": 0.1922324203828322,
"grad_norm": 0.21501004695892334,
"learning_rate": 6.950000000000001e-06,
"loss": 3.519,
"step": 1722
},
{
"epoch": 0.19234405361185825,
"grad_norm": 0.23035281896591187,
"learning_rate": 6.925000000000001e-06,
"loss": 3.4918,
"step": 1723
},
{
"epoch": 0.19245568684088427,
"grad_norm": 0.2104371339082718,
"learning_rate": 6.900000000000001e-06,
"loss": 3.6112,
"step": 1724
},
{
"epoch": 0.1925673200699103,
"grad_norm": 0.22019049525260925,
"learning_rate": 6.875000000000001e-06,
"loss": 3.4574,
"step": 1725
},
{
"epoch": 0.19267895329893633,
"grad_norm": 0.21455317735671997,
"learning_rate": 6.8500000000000005e-06,
"loss": 3.6248,
"step": 1726
},
{
"epoch": 0.19279058652796238,
"grad_norm": 0.2298976182937622,
"learning_rate": 6.825000000000001e-06,
"loss": 3.6386,
"step": 1727
},
{
"epoch": 0.1929022197569884,
"grad_norm": 0.21778175234794617,
"learning_rate": 6.800000000000001e-06,
"loss": 3.5837,
"step": 1728
},
{
"epoch": 0.19301385298601445,
"grad_norm": 0.22063672542572021,
"learning_rate": 6.775000000000001e-06,
"loss": 3.4086,
"step": 1729
},
{
"epoch": 0.1931254862150405,
"grad_norm": 0.21533387899398804,
"learning_rate": 6.750000000000001e-06,
"loss": 3.5653,
"step": 1730
},
{
"epoch": 0.19323711944406652,
"grad_norm": 0.21561191976070404,
"learning_rate": 6.725000000000001e-06,
"loss": 3.532,
"step": 1731
},
{
"epoch": 0.19334875267309257,
"grad_norm": 0.21011056005954742,
"learning_rate": 6.700000000000001e-06,
"loss": 3.4606,
"step": 1732
},
{
"epoch": 0.1934603859021186,
"grad_norm": 0.24638298153877258,
"learning_rate": 6.6750000000000005e-06,
"loss": 3.5425,
"step": 1733
},
{
"epoch": 0.19357201913114463,
"grad_norm": 0.19980867207050323,
"learning_rate": 6.650000000000001e-06,
"loss": 3.5724,
"step": 1734
},
{
"epoch": 0.19368365236017066,
"grad_norm": 0.22461222112178802,
"learning_rate": 6.625000000000001e-06,
"loss": 3.6093,
"step": 1735
},
{
"epoch": 0.1937952855891967,
"grad_norm": 0.2222217172384262,
"learning_rate": 6.6e-06,
"loss": 3.417,
"step": 1736
},
{
"epoch": 0.19390691881822272,
"grad_norm": 0.21770079433918,
"learning_rate": 6.5750000000000006e-06,
"loss": 3.6089,
"step": 1737
},
{
"epoch": 0.19401855204724877,
"grad_norm": 0.21207939088344574,
"learning_rate": 6.550000000000001e-06,
"loss": 3.58,
"step": 1738
},
{
"epoch": 0.1941301852762748,
"grad_norm": 0.2103612869977951,
"learning_rate": 6.525e-06,
"loss": 3.4986,
"step": 1739
},
{
"epoch": 0.19424181850530084,
"grad_norm": 0.25747326016426086,
"learning_rate": 6.5000000000000004e-06,
"loss": 3.6467,
"step": 1740
},
{
"epoch": 0.19435345173432686,
"grad_norm": 0.2058345228433609,
"learning_rate": 6.475000000000001e-06,
"loss": 3.603,
"step": 1741
},
{
"epoch": 0.1944650849633529,
"grad_norm": 0.21543510258197784,
"learning_rate": 6.45e-06,
"loss": 3.5405,
"step": 1742
},
{
"epoch": 0.19457671819237893,
"grad_norm": 0.21140018105506897,
"learning_rate": 6.425e-06,
"loss": 3.641,
"step": 1743
},
{
"epoch": 0.19468835142140498,
"grad_norm": 0.20616336166858673,
"learning_rate": 6.4000000000000006e-06,
"loss": 3.524,
"step": 1744
},
{
"epoch": 0.194799984650431,
"grad_norm": 0.20036578178405762,
"learning_rate": 6.375000000000001e-06,
"loss": 3.4803,
"step": 1745
},
{
"epoch": 0.19491161787945704,
"grad_norm": 0.2050037682056427,
"learning_rate": 6.35e-06,
"loss": 3.5761,
"step": 1746
},
{
"epoch": 0.1950232511084831,
"grad_norm": 0.2095308005809784,
"learning_rate": 6.3250000000000004e-06,
"loss": 3.5951,
"step": 1747
},
{
"epoch": 0.1951348843375091,
"grad_norm": 0.20952288806438446,
"learning_rate": 6.300000000000001e-06,
"loss": 3.5,
"step": 1748
},
{
"epoch": 0.19524651756653516,
"grad_norm": 0.2102053463459015,
"learning_rate": 6.275e-06,
"loss": 3.5638,
"step": 1749
},
{
"epoch": 0.19535815079556118,
"grad_norm": 0.20355603098869324,
"learning_rate": 6.25e-06,
"loss": 3.5573,
"step": 1750
},
{
"epoch": 0.19546978402458723,
"grad_norm": 0.20803166925907135,
"learning_rate": 6.2250000000000005e-06,
"loss": 3.47,
"step": 1751
},
{
"epoch": 0.19558141725361325,
"grad_norm": 0.2218266874551773,
"learning_rate": 6.2e-06,
"loss": 3.5435,
"step": 1752
},
{
"epoch": 0.1956930504826393,
"grad_norm": 0.2191057652235031,
"learning_rate": 6.175e-06,
"loss": 3.5344,
"step": 1753
},
{
"epoch": 0.19580468371166532,
"grad_norm": 0.210830420255661,
"learning_rate": 6.15e-06,
"loss": 3.5448,
"step": 1754
},
{
"epoch": 0.19591631694069137,
"grad_norm": 0.22419650852680206,
"learning_rate": 6.125e-06,
"loss": 3.367,
"step": 1755
},
{
"epoch": 0.19602795016971739,
"grad_norm": 0.21330904960632324,
"learning_rate": 6.1e-06,
"loss": 3.5257,
"step": 1756
},
{
"epoch": 0.19613958339874343,
"grad_norm": 0.21912828087806702,
"learning_rate": 6.075e-06,
"loss": 3.5257,
"step": 1757
},
{
"epoch": 0.19625121662776945,
"grad_norm": 0.21435360610485077,
"learning_rate": 6.0500000000000005e-06,
"loss": 3.598,
"step": 1758
},
{
"epoch": 0.1963628498567955,
"grad_norm": 0.207380473613739,
"learning_rate": 6.025e-06,
"loss": 3.5569,
"step": 1759
},
{
"epoch": 0.19647448308582152,
"grad_norm": 0.1989145129919052,
"learning_rate": 6e-06,
"loss": 3.5359,
"step": 1760
},
{
"epoch": 0.19658611631484757,
"grad_norm": 0.21557660400867462,
"learning_rate": 5.975e-06,
"loss": 3.5078,
"step": 1761
},
{
"epoch": 0.1966977495438736,
"grad_norm": 0.2107868790626526,
"learning_rate": 5.95e-06,
"loss": 3.476,
"step": 1762
},
{
"epoch": 0.19680938277289964,
"grad_norm": 0.21909202635288239,
"learning_rate": 5.925e-06,
"loss": 3.5745,
"step": 1763
},
{
"epoch": 0.19692101600192569,
"grad_norm": 0.2006104439496994,
"learning_rate": 5.9e-06,
"loss": 3.6226,
"step": 1764
},
{
"epoch": 0.1970326492309517,
"grad_norm": 0.20176716148853302,
"learning_rate": 5.875e-06,
"loss": 3.4707,
"step": 1765
},
{
"epoch": 0.19714428245997775,
"grad_norm": 0.20947526395320892,
"learning_rate": 5.850000000000001e-06,
"loss": 3.3932,
"step": 1766
},
{
"epoch": 0.19725591568900377,
"grad_norm": 0.20574991405010223,
"learning_rate": 5.825000000000001e-06,
"loss": 3.4643,
"step": 1767
},
{
"epoch": 0.19736754891802982,
"grad_norm": 0.20592179894447327,
"learning_rate": 5.8e-06,
"loss": 3.5466,
"step": 1768
},
{
"epoch": 0.19747918214705584,
"grad_norm": 0.20153668522834778,
"learning_rate": 5.775000000000001e-06,
"loss": 3.5184,
"step": 1769
},
{
"epoch": 0.1975908153760819,
"grad_norm": 0.22168035805225372,
"learning_rate": 5.750000000000001e-06,
"loss": 3.6203,
"step": 1770
},
{
"epoch": 0.1977024486051079,
"grad_norm": 0.21859316527843475,
"learning_rate": 5.725e-06,
"loss": 3.5636,
"step": 1771
},
{
"epoch": 0.19781408183413396,
"grad_norm": 0.21583330631256104,
"learning_rate": 5.7000000000000005e-06,
"loss": 3.4887,
"step": 1772
},
{
"epoch": 0.19792571506315998,
"grad_norm": 0.24064335227012634,
"learning_rate": 5.675000000000001e-06,
"loss": 3.5956,
"step": 1773
},
{
"epoch": 0.19803734829218603,
"grad_norm": 0.2163335084915161,
"learning_rate": 5.65e-06,
"loss": 3.5449,
"step": 1774
},
{
"epoch": 0.19814898152121205,
"grad_norm": 0.23865336179733276,
"learning_rate": 5.625e-06,
"loss": 3.4284,
"step": 1775
},
{
"epoch": 0.1982606147502381,
"grad_norm": 0.21515367925167084,
"learning_rate": 5.600000000000001e-06,
"loss": 3.4509,
"step": 1776
},
{
"epoch": 0.19837224797926412,
"grad_norm": 0.202334463596344,
"learning_rate": 5.575e-06,
"loss": 3.4719,
"step": 1777
},
{
"epoch": 0.19848388120829016,
"grad_norm": 0.22086431086063385,
"learning_rate": 5.55e-06,
"loss": 3.5625,
"step": 1778
},
{
"epoch": 0.19859551443731618,
"grad_norm": 0.2051386833190918,
"learning_rate": 5.5250000000000005e-06,
"loss": 3.4851,
"step": 1779
},
{
"epoch": 0.19870714766634223,
"grad_norm": 0.21226409077644348,
"learning_rate": 5.500000000000001e-06,
"loss": 3.5417,
"step": 1780
},
{
"epoch": 0.19881878089536828,
"grad_norm": 0.22610831260681152,
"learning_rate": 5.475e-06,
"loss": 3.6455,
"step": 1781
},
{
"epoch": 0.1989304141243943,
"grad_norm": 0.2170376479625702,
"learning_rate": 5.45e-06,
"loss": 3.6092,
"step": 1782
},
{
"epoch": 0.19904204735342035,
"grad_norm": 0.2663693428039551,
"learning_rate": 5.4250000000000006e-06,
"loss": 3.4673,
"step": 1783
},
{
"epoch": 0.19915368058244637,
"grad_norm": 0.21275226771831512,
"learning_rate": 5.4e-06,
"loss": 3.5114,
"step": 1784
},
{
"epoch": 0.19926531381147242,
"grad_norm": 0.22679205238819122,
"learning_rate": 5.375e-06,
"loss": 3.6242,
"step": 1785
},
{
"epoch": 0.19937694704049844,
"grad_norm": 0.21246251463890076,
"learning_rate": 5.3500000000000004e-06,
"loss": 3.454,
"step": 1786
},
{
"epoch": 0.19948858026952448,
"grad_norm": 0.21031992137432098,
"learning_rate": 5.325e-06,
"loss": 3.5477,
"step": 1787
},
{
"epoch": 0.1996002134985505,
"grad_norm": 0.21701624989509583,
"learning_rate": 5.3e-06,
"loss": 3.4562,
"step": 1788
},
{
"epoch": 0.19971184672757655,
"grad_norm": 0.20907016098499298,
"learning_rate": 5.275e-06,
"loss": 3.5097,
"step": 1789
},
{
"epoch": 0.19982347995660257,
"grad_norm": 0.21945489943027496,
"learning_rate": 5.25e-06,
"loss": 3.5454,
"step": 1790
},
{
"epoch": 0.19993511318562862,
"grad_norm": 0.22950352728366852,
"learning_rate": 5.225e-06,
"loss": 3.4832,
"step": 1791
},
{
"epoch": 0.20004674641465464,
"grad_norm": 0.21792039275169373,
"learning_rate": 5.2e-06,
"loss": 3.5609,
"step": 1792
},
{
"epoch": 0.2001583796436807,
"grad_norm": 0.2099238485097885,
"learning_rate": 5.175e-06,
"loss": 3.4514,
"step": 1793
},
{
"epoch": 0.2002700128727067,
"grad_norm": 0.21807892620563507,
"learning_rate": 5.15e-06,
"loss": 3.4391,
"step": 1794
},
{
"epoch": 0.20038164610173276,
"grad_norm": 0.216991126537323,
"learning_rate": 5.125e-06,
"loss": 3.6034,
"step": 1795
},
{
"epoch": 0.2004932793307588,
"grad_norm": 0.21016691625118256,
"learning_rate": 5.1e-06,
"loss": 3.5621,
"step": 1796
},
{
"epoch": 0.20060491255978483,
"grad_norm": 0.211182102560997,
"learning_rate": 5.0750000000000005e-06,
"loss": 3.6104,
"step": 1797
},
{
"epoch": 0.20071654578881087,
"grad_norm": 0.19964830577373505,
"learning_rate": 5.050000000000001e-06,
"loss": 3.5154,
"step": 1798
},
{
"epoch": 0.2008281790178369,
"grad_norm": 0.21706761419773102,
"learning_rate": 5.025e-06,
"loss": 3.4314,
"step": 1799
},
{
"epoch": 0.20093981224686294,
"grad_norm": 0.21016335487365723,
"learning_rate": 5e-06,
"loss": 3.4975,
"step": 1800
},
{
"epoch": 0.20105144547588896,
"grad_norm": 0.23697106540203094,
"learning_rate": 4.975000000000001e-06,
"loss": 3.6012,
"step": 1801
},
{
"epoch": 0.201163078704915,
"grad_norm": 0.2100965976715088,
"learning_rate": 4.950000000000001e-06,
"loss": 3.4225,
"step": 1802
},
{
"epoch": 0.20127471193394103,
"grad_norm": 0.2075640708208084,
"learning_rate": 4.925e-06,
"loss": 3.5397,
"step": 1803
},
{
"epoch": 0.20138634516296708,
"grad_norm": 0.24496634304523468,
"learning_rate": 4.9000000000000005e-06,
"loss": 3.5449,
"step": 1804
},
{
"epoch": 0.2014979783919931,
"grad_norm": 0.20510834455490112,
"learning_rate": 4.875000000000001e-06,
"loss": 3.6166,
"step": 1805
},
{
"epoch": 0.20160961162101915,
"grad_norm": 0.2140471637248993,
"learning_rate": 4.85e-06,
"loss": 3.482,
"step": 1806
},
{
"epoch": 0.20172124485004517,
"grad_norm": 0.20598606765270233,
"learning_rate": 4.825e-06,
"loss": 3.4585,
"step": 1807
},
{
"epoch": 0.20183287807907122,
"grad_norm": 0.20537462830543518,
"learning_rate": 4.800000000000001e-06,
"loss": 3.5279,
"step": 1808
},
{
"epoch": 0.20194451130809724,
"grad_norm": 0.22577260434627533,
"learning_rate": 4.775e-06,
"loss": 3.5,
"step": 1809
},
{
"epoch": 0.20205614453712328,
"grad_norm": 0.2183264046907425,
"learning_rate": 4.75e-06,
"loss": 3.5869,
"step": 1810
},
{
"epoch": 0.2021677777661493,
"grad_norm": 0.20449145138263702,
"learning_rate": 4.7250000000000005e-06,
"loss": 3.5822,
"step": 1811
},
{
"epoch": 0.20227941099517535,
"grad_norm": 0.21103350818157196,
"learning_rate": 4.7e-06,
"loss": 3.5096,
"step": 1812
},
{
"epoch": 0.2023910442242014,
"grad_norm": 0.2327439785003662,
"learning_rate": 4.675e-06,
"loss": 3.4611,
"step": 1813
},
{
"epoch": 0.20250267745322742,
"grad_norm": 0.22530223429203033,
"learning_rate": 4.65e-06,
"loss": 3.5714,
"step": 1814
},
{
"epoch": 0.20261431068225347,
"grad_norm": 0.21586278080940247,
"learning_rate": 4.625e-06,
"loss": 3.4991,
"step": 1815
},
{
"epoch": 0.2027259439112795,
"grad_norm": 0.21747596561908722,
"learning_rate": 4.6e-06,
"loss": 3.4844,
"step": 1816
},
{
"epoch": 0.20283757714030554,
"grad_norm": 0.2151140719652176,
"learning_rate": 4.575e-06,
"loss": 3.5387,
"step": 1817
},
{
"epoch": 0.20294921036933156,
"grad_norm": 0.2119089663028717,
"learning_rate": 4.5500000000000005e-06,
"loss": 3.505,
"step": 1818
},
{
"epoch": 0.2030608435983576,
"grad_norm": 0.21960803866386414,
"learning_rate": 4.525e-06,
"loss": 3.633,
"step": 1819
},
{
"epoch": 0.20317247682738362,
"grad_norm": 0.23329858481884003,
"learning_rate": 4.5e-06,
"loss": 3.5623,
"step": 1820
},
{
"epoch": 0.20328411005640967,
"grad_norm": 0.21105192601680756,
"learning_rate": 4.475e-06,
"loss": 3.4557,
"step": 1821
},
{
"epoch": 0.2033957432854357,
"grad_norm": 0.23179160058498383,
"learning_rate": 4.45e-06,
"loss": 3.4755,
"step": 1822
},
{
"epoch": 0.20350737651446174,
"grad_norm": 0.2121267467737198,
"learning_rate": 4.425e-06,
"loss": 3.5484,
"step": 1823
},
{
"epoch": 0.20361900974348776,
"grad_norm": 0.21276584267616272,
"learning_rate": 4.4e-06,
"loss": 3.4167,
"step": 1824
},
{
"epoch": 0.2037306429725138,
"grad_norm": 0.22784695029258728,
"learning_rate": 4.375e-06,
"loss": 3.5111,
"step": 1825
},
{
"epoch": 0.20384227620153983,
"grad_norm": 0.2126598209142685,
"learning_rate": 4.35e-06,
"loss": 3.5127,
"step": 1826
},
{
"epoch": 0.20395390943056588,
"grad_norm": 0.2082507312297821,
"learning_rate": 4.325e-06,
"loss": 3.5663,
"step": 1827
},
{
"epoch": 0.2040655426595919,
"grad_norm": 0.21978691220283508,
"learning_rate": 4.2999999999999995e-06,
"loss": 3.5124,
"step": 1828
},
{
"epoch": 0.20417717588861795,
"grad_norm": 0.21531830728054047,
"learning_rate": 4.2750000000000006e-06,
"loss": 3.5518,
"step": 1829
},
{
"epoch": 0.204288809117644,
"grad_norm": 0.2193949818611145,
"learning_rate": 4.250000000000001e-06,
"loss": 3.5591,
"step": 1830
},
{
"epoch": 0.20440044234667,
"grad_norm": 0.24037496745586395,
"learning_rate": 4.225e-06,
"loss": 3.5244,
"step": 1831
},
{
"epoch": 0.20451207557569606,
"grad_norm": 0.2125871181488037,
"learning_rate": 4.2000000000000004e-06,
"loss": 3.538,
"step": 1832
},
{
"epoch": 0.20462370880472208,
"grad_norm": 0.2190350890159607,
"learning_rate": 4.175000000000001e-06,
"loss": 3.6318,
"step": 1833
},
{
"epoch": 0.20473534203374813,
"grad_norm": 0.20892810821533203,
"learning_rate": 4.15e-06,
"loss": 3.5271,
"step": 1834
},
{
"epoch": 0.20484697526277415,
"grad_norm": 0.20510607957839966,
"learning_rate": 4.125e-06,
"loss": 3.4348,
"step": 1835
},
{
"epoch": 0.2049586084918002,
"grad_norm": 0.21379521489143372,
"learning_rate": 4.1000000000000006e-06,
"loss": 3.5479,
"step": 1836
},
{
"epoch": 0.20507024172082622,
"grad_norm": 0.2153952717781067,
"learning_rate": 4.075e-06,
"loss": 3.5224,
"step": 1837
},
{
"epoch": 0.20518187494985227,
"grad_norm": 0.20247319340705872,
"learning_rate": 4.05e-06,
"loss": 3.5739,
"step": 1838
},
{
"epoch": 0.2052935081788783,
"grad_norm": 0.2167060524225235,
"learning_rate": 4.0250000000000004e-06,
"loss": 3.542,
"step": 1839
},
{
"epoch": 0.20540514140790433,
"grad_norm": 0.20425930619239807,
"learning_rate": 4.000000000000001e-06,
"loss": 3.4803,
"step": 1840
},
{
"epoch": 0.20551677463693035,
"grad_norm": 0.2459854632616043,
"learning_rate": 3.975e-06,
"loss": 3.5605,
"step": 1841
},
{
"epoch": 0.2056284078659564,
"grad_norm": 0.21814677119255066,
"learning_rate": 3.95e-06,
"loss": 3.5506,
"step": 1842
},
{
"epoch": 0.20574004109498242,
"grad_norm": 0.23344196379184723,
"learning_rate": 3.9250000000000005e-06,
"loss": 3.5186,
"step": 1843
},
{
"epoch": 0.20585167432400847,
"grad_norm": 0.22359785437583923,
"learning_rate": 3.9e-06,
"loss": 3.5471,
"step": 1844
},
{
"epoch": 0.2059633075530345,
"grad_norm": 0.24664321541786194,
"learning_rate": 3.875e-06,
"loss": 3.5544,
"step": 1845
},
{
"epoch": 0.20607494078206054,
"grad_norm": 0.2479398250579834,
"learning_rate": 3.85e-06,
"loss": 3.5187,
"step": 1846
},
{
"epoch": 0.2061865740110866,
"grad_norm": 0.2162202149629593,
"learning_rate": 3.825e-06,
"loss": 3.5604,
"step": 1847
},
{
"epoch": 0.2062982072401126,
"grad_norm": 0.21028858423233032,
"learning_rate": 3.8e-06,
"loss": 3.4007,
"step": 1848
},
{
"epoch": 0.20640984046913866,
"grad_norm": 0.22720041871070862,
"learning_rate": 3.775e-06,
"loss": 3.5528,
"step": 1849
},
{
"epoch": 0.20652147369816468,
"grad_norm": 0.20484808087348938,
"learning_rate": 3.75e-06,
"loss": 3.4657,
"step": 1850
},
{
"epoch": 0.20663310692719072,
"grad_norm": 0.2123991698026657,
"learning_rate": 3.725e-06,
"loss": 3.5653,
"step": 1851
},
{
"epoch": 0.20674474015621674,
"grad_norm": 0.21905416250228882,
"learning_rate": 3.7e-06,
"loss": 3.5497,
"step": 1852
},
{
"epoch": 0.2068563733852428,
"grad_norm": 0.20679041743278503,
"learning_rate": 3.675e-06,
"loss": 3.5367,
"step": 1853
},
{
"epoch": 0.2069680066142688,
"grad_norm": 0.22440306842327118,
"learning_rate": 3.6499999999999998e-06,
"loss": 3.4365,
"step": 1854
},
{
"epoch": 0.20707963984329486,
"grad_norm": 0.23062323033809662,
"learning_rate": 3.625e-06,
"loss": 3.5663,
"step": 1855
},
{
"epoch": 0.20719127307232088,
"grad_norm": 0.22723813354969025,
"learning_rate": 3.6e-06,
"loss": 3.4967,
"step": 1856
},
{
"epoch": 0.20730290630134693,
"grad_norm": 0.2153361588716507,
"learning_rate": 3.575e-06,
"loss": 3.5003,
"step": 1857
},
{
"epoch": 0.20741453953037295,
"grad_norm": 0.2120848000049591,
"learning_rate": 3.55e-06,
"loss": 3.4241,
"step": 1858
},
{
"epoch": 0.207526172759399,
"grad_norm": 0.2151784747838974,
"learning_rate": 3.5249999999999997e-06,
"loss": 3.5259,
"step": 1859
},
{
"epoch": 0.20763780598842502,
"grad_norm": 0.20985038578510284,
"learning_rate": 3.5000000000000004e-06,
"loss": 3.4011,
"step": 1860
},
{
"epoch": 0.20774943921745107,
"grad_norm": 0.20849952101707458,
"learning_rate": 3.4750000000000006e-06,
"loss": 3.5246,
"step": 1861
},
{
"epoch": 0.2078610724464771,
"grad_norm": 0.22794748842716217,
"learning_rate": 3.4500000000000004e-06,
"loss": 3.4456,
"step": 1862
},
{
"epoch": 0.20797270567550313,
"grad_norm": 0.26037687063217163,
"learning_rate": 3.4250000000000002e-06,
"loss": 3.5013,
"step": 1863
},
{
"epoch": 0.20808433890452918,
"grad_norm": 0.20757165551185608,
"learning_rate": 3.4000000000000005e-06,
"loss": 3.5984,
"step": 1864
},
{
"epoch": 0.2081959721335552,
"grad_norm": 0.19982293248176575,
"learning_rate": 3.3750000000000003e-06,
"loss": 3.591,
"step": 1865
},
{
"epoch": 0.20830760536258125,
"grad_norm": 0.21423710882663727,
"learning_rate": 3.3500000000000005e-06,
"loss": 3.5852,
"step": 1866
},
{
"epoch": 0.20841923859160727,
"grad_norm": 0.21068517863750458,
"learning_rate": 3.3250000000000004e-06,
"loss": 3.5056,
"step": 1867
},
{
"epoch": 0.20853087182063332,
"grad_norm": 0.2205931842327118,
"learning_rate": 3.3e-06,
"loss": 3.5279,
"step": 1868
},
{
"epoch": 0.20864250504965934,
"grad_norm": 0.24292927980422974,
"learning_rate": 3.2750000000000004e-06,
"loss": 3.3487,
"step": 1869
},
{
"epoch": 0.20875413827868539,
"grad_norm": 0.20406247675418854,
"learning_rate": 3.2500000000000002e-06,
"loss": 3.5239,
"step": 1870
},
{
"epoch": 0.2088657715077114,
"grad_norm": 0.2234838604927063,
"learning_rate": 3.225e-06,
"loss": 3.411,
"step": 1871
},
{
"epoch": 0.20897740473673745,
"grad_norm": 0.21245650947093964,
"learning_rate": 3.2000000000000003e-06,
"loss": 3.5195,
"step": 1872
},
{
"epoch": 0.20908903796576347,
"grad_norm": 0.21902744472026825,
"learning_rate": 3.175e-06,
"loss": 3.4734,
"step": 1873
},
{
"epoch": 0.20920067119478952,
"grad_norm": 0.2130463868379593,
"learning_rate": 3.1500000000000003e-06,
"loss": 3.6257,
"step": 1874
},
{
"epoch": 0.20931230442381554,
"grad_norm": 0.21090976893901825,
"learning_rate": 3.125e-06,
"loss": 3.5525,
"step": 1875
},
{
"epoch": 0.2094239376528416,
"grad_norm": 0.1988898515701294,
"learning_rate": 3.1e-06,
"loss": 3.3792,
"step": 1876
},
{
"epoch": 0.2095355708818676,
"grad_norm": 0.21217332780361176,
"learning_rate": 3.075e-06,
"loss": 3.5956,
"step": 1877
},
{
"epoch": 0.20964720411089366,
"grad_norm": 0.215387225151062,
"learning_rate": 3.05e-06,
"loss": 3.502,
"step": 1878
},
{
"epoch": 0.2097588373399197,
"grad_norm": 0.20931552350521088,
"learning_rate": 3.0250000000000003e-06,
"loss": 3.5398,
"step": 1879
},
{
"epoch": 0.20987047056894573,
"grad_norm": 0.2231699824333191,
"learning_rate": 3e-06,
"loss": 3.5844,
"step": 1880
},
{
"epoch": 0.20998210379797178,
"grad_norm": 0.21826721727848053,
"learning_rate": 2.975e-06,
"loss": 3.5849,
"step": 1881
},
{
"epoch": 0.2100937370269978,
"grad_norm": 0.21915479004383087,
"learning_rate": 2.95e-06,
"loss": 3.5511,
"step": 1882
},
{
"epoch": 0.21020537025602384,
"grad_norm": 0.21437153220176697,
"learning_rate": 2.9250000000000004e-06,
"loss": 3.5835,
"step": 1883
},
{
"epoch": 0.21031700348504986,
"grad_norm": 0.22190143167972565,
"learning_rate": 2.9e-06,
"loss": 3.5348,
"step": 1884
},
{
"epoch": 0.2104286367140759,
"grad_norm": 0.20274153351783752,
"learning_rate": 2.8750000000000004e-06,
"loss": 3.5397,
"step": 1885
},
{
"epoch": 0.21054026994310193,
"grad_norm": 0.20769768953323364,
"learning_rate": 2.8500000000000002e-06,
"loss": 3.6509,
"step": 1886
},
{
"epoch": 0.21065190317212798,
"grad_norm": 0.20080943405628204,
"learning_rate": 2.825e-06,
"loss": 3.4959,
"step": 1887
},
{
"epoch": 0.210763536401154,
"grad_norm": 0.21280959248542786,
"learning_rate": 2.8000000000000003e-06,
"loss": 3.5626,
"step": 1888
},
{
"epoch": 0.21087516963018005,
"grad_norm": 0.20168128609657288,
"learning_rate": 2.775e-06,
"loss": 3.4148,
"step": 1889
},
{
"epoch": 0.21098680285920607,
"grad_norm": 0.2141074240207672,
"learning_rate": 2.7500000000000004e-06,
"loss": 3.588,
"step": 1890
},
{
"epoch": 0.21109843608823212,
"grad_norm": 0.21445541083812714,
"learning_rate": 2.725e-06,
"loss": 3.5719,
"step": 1891
},
{
"epoch": 0.21121006931725814,
"grad_norm": 0.21445779502391815,
"learning_rate": 2.7e-06,
"loss": 3.4997,
"step": 1892
},
{
"epoch": 0.21132170254628418,
"grad_norm": 0.21173401176929474,
"learning_rate": 2.6750000000000002e-06,
"loss": 3.5909,
"step": 1893
},
{
"epoch": 0.2114333357753102,
"grad_norm": 0.20298442244529724,
"learning_rate": 2.65e-06,
"loss": 3.5549,
"step": 1894
},
{
"epoch": 0.21154496900433625,
"grad_norm": 0.22621174156665802,
"learning_rate": 2.625e-06,
"loss": 3.5773,
"step": 1895
},
{
"epoch": 0.2116566022333623,
"grad_norm": 0.19721810519695282,
"learning_rate": 2.6e-06,
"loss": 3.4917,
"step": 1896
},
{
"epoch": 0.21176823546238832,
"grad_norm": 0.2206929326057434,
"learning_rate": 2.575e-06,
"loss": 3.4502,
"step": 1897
},
{
"epoch": 0.21187986869141437,
"grad_norm": 0.22209490835666656,
"learning_rate": 2.55e-06,
"loss": 3.5138,
"step": 1898
},
{
"epoch": 0.2119915019204404,
"grad_norm": 0.21448233723640442,
"learning_rate": 2.5250000000000004e-06,
"loss": 3.5413,
"step": 1899
},
{
"epoch": 0.21210313514946644,
"grad_norm": 0.22643551230430603,
"learning_rate": 2.5e-06,
"loss": 3.6266,
"step": 1900
},
{
"epoch": 0.21221476837849246,
"grad_norm": 0.21020323038101196,
"learning_rate": 2.4750000000000004e-06,
"loss": 3.553,
"step": 1901
},
{
"epoch": 0.2123264016075185,
"grad_norm": 0.21794013679027557,
"learning_rate": 2.4500000000000003e-06,
"loss": 3.5648,
"step": 1902
},
{
"epoch": 0.21243803483654453,
"grad_norm": 0.20865659415721893,
"learning_rate": 2.425e-06,
"loss": 3.4656,
"step": 1903
},
{
"epoch": 0.21254966806557057,
"grad_norm": 0.21266360580921173,
"learning_rate": 2.4000000000000003e-06,
"loss": 3.5308,
"step": 1904
},
{
"epoch": 0.2126613012945966,
"grad_norm": 0.21949303150177002,
"learning_rate": 2.375e-06,
"loss": 3.5859,
"step": 1905
},
{
"epoch": 0.21277293452362264,
"grad_norm": 0.2185942530632019,
"learning_rate": 2.35e-06,
"loss": 3.4909,
"step": 1906
},
{
"epoch": 0.21288456775264866,
"grad_norm": 0.20778022706508636,
"learning_rate": 2.325e-06,
"loss": 3.5625,
"step": 1907
},
{
"epoch": 0.2129962009816747,
"grad_norm": 0.20998512208461761,
"learning_rate": 2.3e-06,
"loss": 3.6109,
"step": 1908
},
{
"epoch": 0.21310783421070073,
"grad_norm": 0.22301526367664337,
"learning_rate": 2.2750000000000002e-06,
"loss": 3.5196,
"step": 1909
},
{
"epoch": 0.21321946743972678,
"grad_norm": 0.21039019525051117,
"learning_rate": 2.25e-06,
"loss": 3.485,
"step": 1910
},
{
"epoch": 0.2133311006687528,
"grad_norm": 0.23018528521060944,
"learning_rate": 2.225e-06,
"loss": 3.5085,
"step": 1911
},
{
"epoch": 0.21344273389777885,
"grad_norm": 0.20879410207271576,
"learning_rate": 2.2e-06,
"loss": 3.6065,
"step": 1912
},
{
"epoch": 0.2135543671268049,
"grad_norm": 0.20855922996997833,
"learning_rate": 2.175e-06,
"loss": 3.4406,
"step": 1913
},
{
"epoch": 0.21366600035583092,
"grad_norm": 0.20183274149894714,
"learning_rate": 2.1499999999999997e-06,
"loss": 3.3975,
"step": 1914
},
{
"epoch": 0.21377763358485696,
"grad_norm": 0.21912531554698944,
"learning_rate": 2.1250000000000004e-06,
"loss": 3.5158,
"step": 1915
},
{
"epoch": 0.21388926681388298,
"grad_norm": 0.21876703202724457,
"learning_rate": 2.1000000000000002e-06,
"loss": 3.5644,
"step": 1916
},
{
"epoch": 0.21400090004290903,
"grad_norm": 0.21577627956867218,
"learning_rate": 2.075e-06,
"loss": 3.617,
"step": 1917
},
{
"epoch": 0.21411253327193505,
"grad_norm": 0.21058258414268494,
"learning_rate": 2.0500000000000003e-06,
"loss": 3.4918,
"step": 1918
},
{
"epoch": 0.2142241665009611,
"grad_norm": 0.211268350481987,
"learning_rate": 2.025e-06,
"loss": 3.5054,
"step": 1919
},
{
"epoch": 0.21433579972998712,
"grad_norm": 0.21503891050815582,
"learning_rate": 2.0000000000000003e-06,
"loss": 3.4201,
"step": 1920
},
{
"epoch": 0.21444743295901317,
"grad_norm": 0.21528396010398865,
"learning_rate": 1.975e-06,
"loss": 3.564,
"step": 1921
},
{
"epoch": 0.2145590661880392,
"grad_norm": 0.2132415920495987,
"learning_rate": 1.95e-06,
"loss": 3.5607,
"step": 1922
},
{
"epoch": 0.21467069941706524,
"grad_norm": 0.23113086819648743,
"learning_rate": 1.925e-06,
"loss": 3.533,
"step": 1923
},
{
"epoch": 0.21478233264609126,
"grad_norm": 0.2480856329202652,
"learning_rate": 1.9e-06,
"loss": 3.4923,
"step": 1924
},
{
"epoch": 0.2148939658751173,
"grad_norm": 0.20031027495861053,
"learning_rate": 1.875e-06,
"loss": 3.5392,
"step": 1925
},
{
"epoch": 0.21500559910414332,
"grad_norm": 0.22247815132141113,
"learning_rate": 1.85e-06,
"loss": 3.5365,
"step": 1926
},
{
"epoch": 0.21511723233316937,
"grad_norm": 0.22210949659347534,
"learning_rate": 1.8249999999999999e-06,
"loss": 3.5415,
"step": 1927
},
{
"epoch": 0.21522886556219542,
"grad_norm": 0.20896992087364197,
"learning_rate": 1.8e-06,
"loss": 3.5881,
"step": 1928
},
{
"epoch": 0.21534049879122144,
"grad_norm": 0.21389928460121155,
"learning_rate": 1.775e-06,
"loss": 3.4641,
"step": 1929
},
{
"epoch": 0.2154521320202475,
"grad_norm": 0.2228117287158966,
"learning_rate": 1.7500000000000002e-06,
"loss": 3.6,
"step": 1930
},
{
"epoch": 0.2155637652492735,
"grad_norm": 0.21629740297794342,
"learning_rate": 1.7250000000000002e-06,
"loss": 3.5749,
"step": 1931
},
{
"epoch": 0.21567539847829956,
"grad_norm": 0.19869105517864227,
"learning_rate": 1.7000000000000002e-06,
"loss": 3.5656,
"step": 1932
},
{
"epoch": 0.21578703170732558,
"grad_norm": 0.21316036581993103,
"learning_rate": 1.6750000000000003e-06,
"loss": 3.5997,
"step": 1933
},
{
"epoch": 0.21589866493635163,
"grad_norm": 0.20703160762786865,
"learning_rate": 1.65e-06,
"loss": 3.4679,
"step": 1934
},
{
"epoch": 0.21601029816537765,
"grad_norm": 0.22159571945667267,
"learning_rate": 1.6250000000000001e-06,
"loss": 3.6049,
"step": 1935
},
{
"epoch": 0.2161219313944037,
"grad_norm": 0.23291505873203278,
"learning_rate": 1.6000000000000001e-06,
"loss": 3.4441,
"step": 1936
},
{
"epoch": 0.2162335646234297,
"grad_norm": 0.23334316909313202,
"learning_rate": 1.5750000000000002e-06,
"loss": 3.4708,
"step": 1937
},
{
"epoch": 0.21634519785245576,
"grad_norm": 0.22799208760261536,
"learning_rate": 1.55e-06,
"loss": 3.6898,
"step": 1938
},
{
"epoch": 0.21645683108148178,
"grad_norm": 0.21187621355056763,
"learning_rate": 1.525e-06,
"loss": 3.528,
"step": 1939
},
{
"epoch": 0.21656846431050783,
"grad_norm": 0.21387922763824463,
"learning_rate": 1.5e-06,
"loss": 3.5061,
"step": 1940
},
{
"epoch": 0.21668009753953385,
"grad_norm": 0.23416286706924438,
"learning_rate": 1.475e-06,
"loss": 3.5368,
"step": 1941
},
{
"epoch": 0.2167917307685599,
"grad_norm": 0.21654552221298218,
"learning_rate": 1.45e-06,
"loss": 3.5881,
"step": 1942
},
{
"epoch": 0.21690336399758592,
"grad_norm": 0.20907607674598694,
"learning_rate": 1.4250000000000001e-06,
"loss": 3.5333,
"step": 1943
},
{
"epoch": 0.21701499722661197,
"grad_norm": 0.2072449028491974,
"learning_rate": 1.4000000000000001e-06,
"loss": 3.4783,
"step": 1944
},
{
"epoch": 0.21712663045563801,
"grad_norm": 0.21768568456172943,
"learning_rate": 1.3750000000000002e-06,
"loss": 3.4982,
"step": 1945
},
{
"epoch": 0.21723826368466403,
"grad_norm": 0.2058151662349701,
"learning_rate": 1.35e-06,
"loss": 3.5512,
"step": 1946
},
{
"epoch": 0.21734989691369008,
"grad_norm": 0.22220179438591003,
"learning_rate": 1.325e-06,
"loss": 3.5102,
"step": 1947
},
{
"epoch": 0.2174615301427161,
"grad_norm": 0.20181167125701904,
"learning_rate": 1.3e-06,
"loss": 3.5571,
"step": 1948
},
{
"epoch": 0.21757316337174215,
"grad_norm": 0.20426467061042786,
"learning_rate": 1.275e-06,
"loss": 3.5973,
"step": 1949
},
{
"epoch": 0.21768479660076817,
"grad_norm": 0.22129550576210022,
"learning_rate": 1.25e-06,
"loss": 3.546,
"step": 1950
},
{
"epoch": 0.21779642982979422,
"grad_norm": 0.21661601960659027,
"learning_rate": 1.2250000000000001e-06,
"loss": 3.6242,
"step": 1951
},
{
"epoch": 0.21790806305882024,
"grad_norm": 0.2206379920244217,
"learning_rate": 1.2000000000000002e-06,
"loss": 3.5149,
"step": 1952
},
{
"epoch": 0.2180196962878463,
"grad_norm": 0.207596093416214,
"learning_rate": 1.175e-06,
"loss": 3.5572,
"step": 1953
},
{
"epoch": 0.2181313295168723,
"grad_norm": 0.21597479283809662,
"learning_rate": 1.15e-06,
"loss": 3.5172,
"step": 1954
},
{
"epoch": 0.21824296274589836,
"grad_norm": 0.2263294905424118,
"learning_rate": 1.125e-06,
"loss": 3.555,
"step": 1955
},
{
"epoch": 0.21835459597492438,
"grad_norm": 0.21802575886249542,
"learning_rate": 1.1e-06,
"loss": 3.6233,
"step": 1956
},
{
"epoch": 0.21846622920395042,
"grad_norm": 0.21086303889751434,
"learning_rate": 1.0749999999999999e-06,
"loss": 3.6221,
"step": 1957
},
{
"epoch": 0.21857786243297644,
"grad_norm": 0.23955845832824707,
"learning_rate": 1.0500000000000001e-06,
"loss": 3.5772,
"step": 1958
},
{
"epoch": 0.2186894956620025,
"grad_norm": 0.2189156860113144,
"learning_rate": 1.0250000000000001e-06,
"loss": 3.528,
"step": 1959
},
{
"epoch": 0.2188011288910285,
"grad_norm": 0.21303071081638336,
"learning_rate": 1.0000000000000002e-06,
"loss": 3.4882,
"step": 1960
},
{
"epoch": 0.21891276212005456,
"grad_norm": 0.23032133281230927,
"learning_rate": 9.75e-07,
"loss": 3.5733,
"step": 1961
},
{
"epoch": 0.2190243953490806,
"grad_norm": 0.2242199033498764,
"learning_rate": 9.5e-07,
"loss": 3.6099,
"step": 1962
},
{
"epoch": 0.21913602857810663,
"grad_norm": 0.22498571872711182,
"learning_rate": 9.25e-07,
"loss": 3.6028,
"step": 1963
},
{
"epoch": 0.21924766180713268,
"grad_norm": 0.21430239081382751,
"learning_rate": 9e-07,
"loss": 3.5309,
"step": 1964
},
{
"epoch": 0.2193592950361587,
"grad_norm": 0.2114960253238678,
"learning_rate": 8.750000000000001e-07,
"loss": 3.4717,
"step": 1965
},
{
"epoch": 0.21947092826518474,
"grad_norm": 0.2020798921585083,
"learning_rate": 8.500000000000001e-07,
"loss": 3.345,
"step": 1966
},
{
"epoch": 0.21958256149421077,
"grad_norm": 0.22436527907848358,
"learning_rate": 8.25e-07,
"loss": 3.5741,
"step": 1967
},
{
"epoch": 0.2196941947232368,
"grad_norm": 0.22517482936382294,
"learning_rate": 8.000000000000001e-07,
"loss": 3.5461,
"step": 1968
},
{
"epoch": 0.21980582795226283,
"grad_norm": 0.2143029272556305,
"learning_rate": 7.75e-07,
"loss": 3.523,
"step": 1969
},
{
"epoch": 0.21991746118128888,
"grad_norm": 0.21153564751148224,
"learning_rate": 7.5e-07,
"loss": 3.5754,
"step": 1970
},
{
"epoch": 0.2200290944103149,
"grad_norm": 0.20260325074195862,
"learning_rate": 7.25e-07,
"loss": 3.5259,
"step": 1971
},
{
"epoch": 0.22014072763934095,
"grad_norm": 0.2162707895040512,
"learning_rate": 7.000000000000001e-07,
"loss": 3.4889,
"step": 1972
},
{
"epoch": 0.22025236086836697,
"grad_norm": 0.2090853750705719,
"learning_rate": 6.75e-07,
"loss": 3.5414,
"step": 1973
},
{
"epoch": 0.22036399409739302,
"grad_norm": 0.2316162884235382,
"learning_rate": 6.5e-07,
"loss": 3.656,
"step": 1974
},
{
"epoch": 0.22047562732641904,
"grad_norm": 0.21441136300563812,
"learning_rate": 6.25e-07,
"loss": 3.6367,
"step": 1975
},
{
"epoch": 0.22058726055544509,
"grad_norm": 0.22844521701335907,
"learning_rate": 6.000000000000001e-07,
"loss": 3.4845,
"step": 1976
},
{
"epoch": 0.2206988937844711,
"grad_norm": 0.20617970824241638,
"learning_rate": 5.75e-07,
"loss": 3.4261,
"step": 1977
},
{
"epoch": 0.22081052701349715,
"grad_norm": 0.20670214295387268,
"learning_rate": 5.5e-07,
"loss": 3.4602,
"step": 1978
},
{
"epoch": 0.2209221602425232,
"grad_norm": 0.2365548461675644,
"learning_rate": 5.250000000000001e-07,
"loss": 3.5162,
"step": 1979
},
{
"epoch": 0.22103379347154922,
"grad_norm": 0.21691879630088806,
"learning_rate": 5.000000000000001e-07,
"loss": 3.5123,
"step": 1980
},
{
"epoch": 0.22114542670057527,
"grad_norm": 0.2166820913553238,
"learning_rate": 4.75e-07,
"loss": 3.4972,
"step": 1981
},
{
"epoch": 0.2212570599296013,
"grad_norm": 0.2209530472755432,
"learning_rate": 4.5e-07,
"loss": 3.5259,
"step": 1982
},
{
"epoch": 0.22136869315862734,
"grad_norm": 0.21106043457984924,
"learning_rate": 4.2500000000000006e-07,
"loss": 3.5868,
"step": 1983
},
{
"epoch": 0.22148032638765336,
"grad_norm": 0.2273637056350708,
"learning_rate": 4.0000000000000003e-07,
"loss": 3.5926,
"step": 1984
},
{
"epoch": 0.2215919596166794,
"grad_norm": 0.2141508013010025,
"learning_rate": 3.75e-07,
"loss": 3.5556,
"step": 1985
},
{
"epoch": 0.22170359284570543,
"grad_norm": 0.21325929462909698,
"learning_rate": 3.5000000000000004e-07,
"loss": 3.5389,
"step": 1986
},
{
"epoch": 0.22181522607473148,
"grad_norm": 0.2140723019838333,
"learning_rate": 3.25e-07,
"loss": 3.5753,
"step": 1987
},
{
"epoch": 0.2219268593037575,
"grad_norm": 0.20672175288200378,
"learning_rate": 3.0000000000000004e-07,
"loss": 3.53,
"step": 1988
},
{
"epoch": 0.22203849253278354,
"grad_norm": 0.20724664628505707,
"learning_rate": 2.75e-07,
"loss": 3.6788,
"step": 1989
},
{
"epoch": 0.22215012576180956,
"grad_norm": 0.20857301354408264,
"learning_rate": 2.5000000000000004e-07,
"loss": 3.5271,
"step": 1990
},
{
"epoch": 0.2222617589908356,
"grad_norm": 0.21427108347415924,
"learning_rate": 2.25e-07,
"loss": 3.4925,
"step": 1991
},
{
"epoch": 0.22237339221986163,
"grad_norm": 0.2282572239637375,
"learning_rate": 2.0000000000000002e-07,
"loss": 3.6178,
"step": 1992
},
{
"epoch": 0.22248502544888768,
"grad_norm": 0.23287388682365417,
"learning_rate": 1.7500000000000002e-07,
"loss": 3.5572,
"step": 1993
},
{
"epoch": 0.22259665867791373,
"grad_norm": 0.2408231645822525,
"learning_rate": 1.5000000000000002e-07,
"loss": 3.4855,
"step": 1994
},
{
"epoch": 0.22270829190693975,
"grad_norm": 0.22038929164409637,
"learning_rate": 1.2500000000000002e-07,
"loss": 3.5893,
"step": 1995
},
{
"epoch": 0.2228199251359658,
"grad_norm": 0.23969551920890808,
"learning_rate": 1.0000000000000001e-07,
"loss": 3.6311,
"step": 1996
},
{
"epoch": 0.22293155836499182,
"grad_norm": 0.23214609920978546,
"learning_rate": 7.500000000000001e-08,
"loss": 3.485,
"step": 1997
},
{
"epoch": 0.22304319159401786,
"grad_norm": 0.2179361879825592,
"learning_rate": 5.0000000000000004e-08,
"loss": 3.5429,
"step": 1998
},
{
"epoch": 0.22315482482304388,
"grad_norm": 0.21428325772285461,
"learning_rate": 2.5000000000000002e-08,
"loss": 3.5054,
"step": 1999
},
{
"epoch": 0.22326645805206993,
"grad_norm": 0.24234139919281006,
"learning_rate": 0.0,
"loss": 3.4922,
"step": 2000
},
{
"epoch": 0.22326645805206993,
"step": 2000,
"total_flos": 3.3909236563968e+16,
"train_loss": 3.594551098823547,
"train_runtime": 5469.927,
"train_samples_per_second": 11.7,
"train_steps_per_second": 0.366
}
],
"logging_steps": 1.0,
"max_steps": 2000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.3909236563968e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}