{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9990344383649823,
"eval_steps": 49,
"global_step": 194,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005149662053427744,
"grad_norm": 9.3125,
"learning_rate": 5.555555555555555e-07,
"loss": 1.6733,
"step": 1
},
{
"epoch": 0.005149662053427744,
"eval_loss": 1.6590306758880615,
"eval_runtime": 1698.6705,
"eval_samples_per_second": 0.555,
"eval_steps_per_second": 0.555,
"step": 1
},
{
"epoch": 0.010299324106855488,
"grad_norm": 9.5625,
"learning_rate": 1.111111111111111e-06,
"loss": 1.6598,
"step": 2
},
{
"epoch": 0.01544898616028323,
"grad_norm": 13.9375,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.6732,
"step": 3
},
{
"epoch": 0.020598648213710977,
"grad_norm": 14.125,
"learning_rate": 2.222222222222222e-06,
"loss": 1.6195,
"step": 4
},
{
"epoch": 0.025748310267138717,
"grad_norm": 17.125,
"learning_rate": 2.7777777777777783e-06,
"loss": 1.6886,
"step": 5
},
{
"epoch": 0.03089797232056646,
"grad_norm": 14.9375,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.6817,
"step": 6
},
{
"epoch": 0.036047634373994206,
"grad_norm": 13.125,
"learning_rate": 3.88888888888889e-06,
"loss": 1.6891,
"step": 7
},
{
"epoch": 0.04119729642742195,
"grad_norm": 9.875,
"learning_rate": 4.444444444444444e-06,
"loss": 1.6443,
"step": 8
},
{
"epoch": 0.046346958480849694,
"grad_norm": 9.25,
"learning_rate": 5e-06,
"loss": 1.6221,
"step": 9
},
{
"epoch": 0.051496620534277435,
"grad_norm": 7.875,
"learning_rate": 5.555555555555557e-06,
"loss": 1.6435,
"step": 10
},
{
"epoch": 0.05664628258770518,
"grad_norm": 10.0625,
"learning_rate": 6.111111111111112e-06,
"loss": 1.6459,
"step": 11
},
{
"epoch": 0.06179594464113292,
"grad_norm": 7.0,
"learning_rate": 6.666666666666667e-06,
"loss": 1.6,
"step": 12
},
{
"epoch": 0.06694560669456066,
"grad_norm": 6.78125,
"learning_rate": 7.222222222222223e-06,
"loss": 1.6128,
"step": 13
},
{
"epoch": 0.07209526874798841,
"grad_norm": 4.1875,
"learning_rate": 7.77777777777778e-06,
"loss": 1.5681,
"step": 14
},
{
"epoch": 0.07724493080141616,
"grad_norm": 2.875,
"learning_rate": 8.333333333333334e-06,
"loss": 1.5742,
"step": 15
},
{
"epoch": 0.0823945928548439,
"grad_norm": 2.734375,
"learning_rate": 8.888888888888888e-06,
"loss": 1.5442,
"step": 16
},
{
"epoch": 0.08754425490827164,
"grad_norm": 2.234375,
"learning_rate": 9.444444444444445e-06,
"loss": 1.5707,
"step": 17
},
{
"epoch": 0.09269391696169939,
"grad_norm": 1.8125,
"learning_rate": 1e-05,
"loss": 1.5419,
"step": 18
},
{
"epoch": 0.09784357901512714,
"grad_norm": 1.890625,
"learning_rate": 1.0555555555555557e-05,
"loss": 1.5595,
"step": 19
},
{
"epoch": 0.10299324106855487,
"grad_norm": 1.578125,
"learning_rate": 1.1111111111111113e-05,
"loss": 1.5199,
"step": 20
},
{
"epoch": 0.10814290312198262,
"grad_norm": 1.4296875,
"learning_rate": 1.1666666666666668e-05,
"loss": 1.5539,
"step": 21
},
{
"epoch": 0.11329256517541036,
"grad_norm": 1.3125,
"learning_rate": 1.2222222222222224e-05,
"loss": 1.5596,
"step": 22
},
{
"epoch": 0.11844222722883811,
"grad_norm": 1.03125,
"learning_rate": 1.2777777777777777e-05,
"loss": 1.5245,
"step": 23
},
{
"epoch": 0.12359188928226585,
"grad_norm": 0.99609375,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.5023,
"step": 24
},
{
"epoch": 0.1287415513356936,
"grad_norm": 1.0546875,
"learning_rate": 1.388888888888889e-05,
"loss": 1.5238,
"step": 25
},
{
"epoch": 0.13389121338912133,
"grad_norm": 0.9765625,
"learning_rate": 1.4444444444444446e-05,
"loss": 1.5156,
"step": 26
},
{
"epoch": 0.1390408754425491,
"grad_norm": 0.9921875,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.4935,
"step": 27
},
{
"epoch": 0.14419053749597682,
"grad_norm": 0.890625,
"learning_rate": 1.555555555555556e-05,
"loss": 1.4596,
"step": 28
},
{
"epoch": 0.14934019954940458,
"grad_norm": 0.9375,
"learning_rate": 1.6111111111111115e-05,
"loss": 1.4786,
"step": 29
},
{
"epoch": 0.15448986160283232,
"grad_norm": 0.9140625,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.5071,
"step": 30
},
{
"epoch": 0.15963952365626005,
"grad_norm": 0.91015625,
"learning_rate": 1.7222222222222224e-05,
"loss": 1.483,
"step": 31
},
{
"epoch": 0.1647891857096878,
"grad_norm": 0.83984375,
"learning_rate": 1.7777777777777777e-05,
"loss": 1.4428,
"step": 32
},
{
"epoch": 0.16993884776311555,
"grad_norm": 0.91015625,
"learning_rate": 1.8333333333333333e-05,
"loss": 1.4917,
"step": 33
},
{
"epoch": 0.17508850981654328,
"grad_norm": 1.03125,
"learning_rate": 1.888888888888889e-05,
"loss": 1.4407,
"step": 34
},
{
"epoch": 0.18023817186997104,
"grad_norm": 1.09375,
"learning_rate": 1.9444444444444445e-05,
"loss": 1.4626,
"step": 35
},
{
"epoch": 0.18538783392339878,
"grad_norm": 0.8671875,
"learning_rate": 2e-05,
"loss": 1.4678,
"step": 36
},
{
"epoch": 0.1905374959768265,
"grad_norm": 0.9296875,
"learning_rate": 1.9999601726381415e-05,
"loss": 1.446,
"step": 37
},
{
"epoch": 0.19568715803025427,
"grad_norm": 1.046875,
"learning_rate": 1.9998406937250035e-05,
"loss": 1.4688,
"step": 38
},
{
"epoch": 0.200836820083682,
"grad_norm": 0.9140625,
"learning_rate": 1.9996415727776456e-05,
"loss": 1.4704,
"step": 39
},
{
"epoch": 0.20598648213710974,
"grad_norm": 0.984375,
"learning_rate": 1.999362825656992e-05,
"loss": 1.4307,
"step": 40
},
{
"epoch": 0.2111361441905375,
"grad_norm": 1.015625,
"learning_rate": 1.9990044745665672e-05,
"loss": 1.4086,
"step": 41
},
{
"epoch": 0.21628580624396523,
"grad_norm": 0.8828125,
"learning_rate": 1.998566548050729e-05,
"loss": 1.4272,
"step": 42
},
{
"epoch": 0.221435468297393,
"grad_norm": 0.890625,
"learning_rate": 1.9980490809923928e-05,
"loss": 1.4274,
"step": 43
},
{
"epoch": 0.22658513035082073,
"grad_norm": 0.828125,
"learning_rate": 1.9974521146102535e-05,
"loss": 1.3833,
"step": 44
},
{
"epoch": 0.23173479240424846,
"grad_norm": 0.9453125,
"learning_rate": 1.9967756964555044e-05,
"loss": 1.4207,
"step": 45
},
{
"epoch": 0.23688445445767622,
"grad_norm": 0.8515625,
"learning_rate": 1.9960198804080462e-05,
"loss": 1.4168,
"step": 46
},
{
"epoch": 0.24203411651110396,
"grad_norm": 0.85546875,
"learning_rate": 1.995184726672197e-05,
"loss": 1.3842,
"step": 47
},
{
"epoch": 0.2471837785645317,
"grad_norm": 0.89453125,
"learning_rate": 1.9942703017718977e-05,
"loss": 1.4107,
"step": 48
},
{
"epoch": 0.2523334406179594,
"grad_norm": 0.8671875,
"learning_rate": 1.99327667854541e-05,
"loss": 1.4425,
"step": 49
},
{
"epoch": 0.2523334406179594,
"eval_loss": 1.304018259048462,
"eval_runtime": 1700.3742,
"eval_samples_per_second": 0.554,
"eval_steps_per_second": 0.554,
"step": 49
},
{
"epoch": 0.2574831026713872,
"grad_norm": 0.8359375,
"learning_rate": 1.9922039361395186e-05,
"loss": 1.3989,
"step": 50
},
{
"epoch": 0.26263276472481495,
"grad_norm": 0.86328125,
"learning_rate": 1.991052160003223e-05,
"loss": 1.373,
"step": 51
},
{
"epoch": 0.26778242677824265,
"grad_norm": 0.83203125,
"learning_rate": 1.989821441880933e-05,
"loss": 1.392,
"step": 52
},
{
"epoch": 0.2729320888316704,
"grad_norm": 0.82421875,
"learning_rate": 1.9885118798051607e-05,
"loss": 1.403,
"step": 53
},
{
"epoch": 0.2780817508850982,
"grad_norm": 0.80859375,
"learning_rate": 1.9871235780887114e-05,
"loss": 1.4111,
"step": 54
},
{
"epoch": 0.2832314129385259,
"grad_norm": 0.77734375,
"learning_rate": 1.9856566473163747e-05,
"loss": 1.3713,
"step": 55
},
{
"epoch": 0.28838107499195365,
"grad_norm": 0.81640625,
"learning_rate": 1.984111204336116e-05,
"loss": 1.3748,
"step": 56
},
{
"epoch": 0.2935307370453814,
"grad_norm": 0.83984375,
"learning_rate": 1.9824873722497694e-05,
"loss": 1.3923,
"step": 57
},
{
"epoch": 0.29868039909880917,
"grad_norm": 0.80078125,
"learning_rate": 1.9807852804032306e-05,
"loss": 1.36,
"step": 58
},
{
"epoch": 0.3038300611522369,
"grad_norm": 0.83203125,
"learning_rate": 1.9790050643761552e-05,
"loss": 1.3809,
"step": 59
},
{
"epoch": 0.30897972320566464,
"grad_norm": 0.7578125,
"learning_rate": 1.9771468659711595e-05,
"loss": 1.4097,
"step": 60
},
{
"epoch": 0.3141293852590924,
"grad_norm": 0.828125,
"learning_rate": 1.975210833202524e-05,
"loss": 1.4105,
"step": 61
},
{
"epoch": 0.3192790473125201,
"grad_norm": 0.74609375,
"learning_rate": 1.9731971202844036e-05,
"loss": 1.3961,
"step": 62
},
{
"epoch": 0.32442870936594786,
"grad_norm": 0.83984375,
"learning_rate": 1.9711058876185446e-05,
"loss": 1.4012,
"step": 63
},
{
"epoch": 0.3295783714193756,
"grad_norm": 0.76953125,
"learning_rate": 1.9689373017815076e-05,
"loss": 1.3832,
"step": 64
},
{
"epoch": 0.33472803347280333,
"grad_norm": 0.91796875,
"learning_rate": 1.9666915355113976e-05,
"loss": 1.3909,
"step": 65
},
{
"epoch": 0.3398776955262311,
"grad_norm": 0.73828125,
"learning_rate": 1.964368767694107e-05,
"loss": 1.382,
"step": 66
},
{
"epoch": 0.34502735757965886,
"grad_norm": 0.84375,
"learning_rate": 1.9619691833490645e-05,
"loss": 1.3805,
"step": 67
},
{
"epoch": 0.35017701963308656,
"grad_norm": 0.7578125,
"learning_rate": 1.9594929736144978e-05,
"loss": 1.3804,
"step": 68
},
{
"epoch": 0.3553266816865143,
"grad_norm": 0.76953125,
"learning_rate": 1.956940335732209e-05,
"loss": 1.3655,
"step": 69
},
{
"epoch": 0.3604763437399421,
"grad_norm": 0.7265625,
"learning_rate": 1.954311473031864e-05,
"loss": 1.3518,
"step": 70
},
{
"epoch": 0.3656260057933698,
"grad_norm": 0.8125,
"learning_rate": 1.9516065949147945e-05,
"loss": 1.3908,
"step": 71
},
{
"epoch": 0.37077566784679755,
"grad_norm": 0.73828125,
"learning_rate": 1.9488259168373198e-05,
"loss": 1.359,
"step": 72
},
{
"epoch": 0.3759253299002253,
"grad_norm": 0.7890625,
"learning_rate": 1.9459696602935838e-05,
"loss": 1.3655,
"step": 73
},
{
"epoch": 0.381074991953653,
"grad_norm": 0.76953125,
"learning_rate": 1.9430380527979124e-05,
"loss": 1.3783,
"step": 74
},
{
"epoch": 0.3862246540070808,
"grad_norm": 0.77734375,
"learning_rate": 1.94003132786669e-05,
"loss": 1.3515,
"step": 75
},
{
"epoch": 0.39137431606050854,
"grad_norm": 0.7109375,
"learning_rate": 1.936949724999762e-05,
"loss": 1.3505,
"step": 76
},
{
"epoch": 0.39652397811393625,
"grad_norm": 0.76953125,
"learning_rate": 1.9337934896613516e-05,
"loss": 1.3355,
"step": 77
},
{
"epoch": 0.401673640167364,
"grad_norm": 0.796875,
"learning_rate": 1.930562873260514e-05,
"loss": 1.3004,
"step": 78
},
{
"epoch": 0.40682330222079177,
"grad_norm": 0.765625,
"learning_rate": 1.927258133131105e-05,
"loss": 1.3993,
"step": 79
},
{
"epoch": 0.4119729642742195,
"grad_norm": 0.80078125,
"learning_rate": 1.9238795325112867e-05,
"loss": 1.362,
"step": 80
},
{
"epoch": 0.41712262632764724,
"grad_norm": 0.77734375,
"learning_rate": 1.9204273405225588e-05,
"loss": 1.402,
"step": 81
},
{
"epoch": 0.422272288381075,
"grad_norm": 0.734375,
"learning_rate": 1.9169018321483198e-05,
"loss": 1.344,
"step": 82
},
{
"epoch": 0.42742195043450276,
"grad_norm": 0.74609375,
"learning_rate": 1.9133032882119656e-05,
"loss": 1.3598,
"step": 83
},
{
"epoch": 0.43257161248793047,
"grad_norm": 0.77734375,
"learning_rate": 1.9096319953545186e-05,
"loss": 1.3534,
"step": 84
},
{
"epoch": 0.43772127454135823,
"grad_norm": 0.75390625,
"learning_rate": 1.9058882460117972e-05,
"loss": 1.3772,
"step": 85
},
{
"epoch": 0.442870936594786,
"grad_norm": 0.7421875,
"learning_rate": 1.9020723383911214e-05,
"loss": 1.377,
"step": 86
},
{
"epoch": 0.4480205986482137,
"grad_norm": 0.765625,
"learning_rate": 1.8981845764475585e-05,
"loss": 1.3455,
"step": 87
},
{
"epoch": 0.45317026070164146,
"grad_norm": 0.75,
"learning_rate": 1.8942252698597113e-05,
"loss": 1.3543,
"step": 88
},
{
"epoch": 0.4583199227550692,
"grad_norm": 0.76953125,
"learning_rate": 1.890194734005053e-05,
"loss": 1.348,
"step": 89
},
{
"epoch": 0.4634695848084969,
"grad_norm": 0.77734375,
"learning_rate": 1.8860932899348028e-05,
"loss": 1.3453,
"step": 90
},
{
"epoch": 0.4686192468619247,
"grad_norm": 0.74609375,
"learning_rate": 1.881921264348355e-05,
"loss": 1.3391,
"step": 91
},
{
"epoch": 0.47376890891535245,
"grad_norm": 0.73828125,
"learning_rate": 1.8776789895672557e-05,
"loss": 1.3562,
"step": 92
},
{
"epoch": 0.47891857096878016,
"grad_norm": 0.7734375,
"learning_rate": 1.8733668035087302e-05,
"loss": 1.3366,
"step": 93
},
{
"epoch": 0.4840682330222079,
"grad_norm": 0.72265625,
"learning_rate": 1.8689850496587674e-05,
"loss": 1.3733,
"step": 94
},
{
"epoch": 0.4892178950756357,
"grad_norm": 0.79296875,
"learning_rate": 1.8645340770447595e-05,
"loss": 1.3383,
"step": 95
},
{
"epoch": 0.4943675571290634,
"grad_norm": 0.77734375,
"learning_rate": 1.8600142402077006e-05,
"loss": 1.3333,
"step": 96
},
{
"epoch": 0.49951721918249115,
"grad_norm": 0.875,
"learning_rate": 1.8554258991739454e-05,
"loss": 1.3422,
"step": 97
},
{
"epoch": 0.5046668812359189,
"grad_norm": 0.80078125,
"learning_rate": 1.850769419426531e-05,
"loss": 1.3564,
"step": 98
},
{
"epoch": 0.5046668812359189,
"eval_loss": 1.2450780868530273,
"eval_runtime": 1700.3802,
"eval_samples_per_second": 0.554,
"eval_steps_per_second": 0.554,
"step": 98
},
{
"epoch": 0.5098165432893467,
"grad_norm": 0.875,
"learning_rate": 1.8460451718760653e-05,
"loss": 1.3322,
"step": 99
},
{
"epoch": 0.5149662053427744,
"grad_norm": 0.75390625,
"learning_rate": 1.8412535328311813e-05,
"loss": 1.3379,
"step": 100
},
{
"epoch": 0.5201158673962021,
"grad_norm": 0.88671875,
"learning_rate": 1.8363948839685638e-05,
"loss": 1.3512,
"step": 101
},
{
"epoch": 0.5252655294496299,
"grad_norm": 0.7734375,
"learning_rate": 1.8314696123025456e-05,
"loss": 1.3655,
"step": 102
},
{
"epoch": 0.5304151915030576,
"grad_norm": 0.84765625,
"learning_rate": 1.8264781101542797e-05,
"loss": 1.3363,
"step": 103
},
{
"epoch": 0.5355648535564853,
"grad_norm": 0.83203125,
"learning_rate": 1.8214207751204917e-05,
"loss": 1.3398,
"step": 104
},
{
"epoch": 0.5407145156099131,
"grad_norm": 0.85546875,
"learning_rate": 1.816298010041806e-05,
"loss": 1.3327,
"step": 105
},
{
"epoch": 0.5458641776633408,
"grad_norm": 0.82421875,
"learning_rate": 1.8111102229706593e-05,
"loss": 1.343,
"step": 106
},
{
"epoch": 0.5510138397167685,
"grad_norm": 0.78515625,
"learning_rate": 1.805857827138798e-05,
"loss": 1.3181,
"step": 107
},
{
"epoch": 0.5561635017701964,
"grad_norm": 0.78515625,
"learning_rate": 1.8005412409243604e-05,
"loss": 1.3375,
"step": 108
},
{
"epoch": 0.5613131638236241,
"grad_norm": 0.74609375,
"learning_rate": 1.7951608878185533e-05,
"loss": 1.2939,
"step": 109
},
{
"epoch": 0.5664628258770518,
"grad_norm": 0.91796875,
"learning_rate": 1.789717196391916e-05,
"loss": 1.3739,
"step": 110
},
{
"epoch": 0.5716124879304796,
"grad_norm": 0.74609375,
"learning_rate": 1.7842106002601854e-05,
"loss": 1.3492,
"step": 111
},
{
"epoch": 0.5767621499839073,
"grad_norm": 0.828125,
"learning_rate": 1.778641538049755e-05,
"loss": 1.3026,
"step": 112
},
{
"epoch": 0.581911812037335,
"grad_norm": 0.84765625,
"learning_rate": 1.773010453362737e-05,
"loss": 1.2874,
"step": 113
},
{
"epoch": 0.5870614740907628,
"grad_norm": 0.82421875,
"learning_rate": 1.7673177947416258e-05,
"loss": 1.328,
"step": 114
},
{
"epoch": 0.5922111361441905,
"grad_norm": 0.84375,
"learning_rate": 1.7615640156335713e-05,
"loss": 1.3258,
"step": 115
},
{
"epoch": 0.5973607981976183,
"grad_norm": 0.78515625,
"learning_rate": 1.7557495743542586e-05,
"loss": 1.3586,
"step": 116
},
{
"epoch": 0.602510460251046,
"grad_norm": 0.7890625,
"learning_rate": 1.749874934051401e-05,
"loss": 1.2967,
"step": 117
},
{
"epoch": 0.6076601223044737,
"grad_norm": 0.75390625,
"learning_rate": 1.7439405626678496e-05,
"loss": 1.3443,
"step": 118
},
{
"epoch": 0.6128097843579016,
"grad_norm": 0.875,
"learning_rate": 1.7379469329043166e-05,
"loss": 1.3578,
"step": 119
},
{
"epoch": 0.6179594464113293,
"grad_norm": 0.82421875,
"learning_rate": 1.7318945221817255e-05,
"loss": 1.353,
"step": 120
},
{
"epoch": 0.623109108464757,
"grad_norm": 0.79296875,
"learning_rate": 1.7257838126031797e-05,
"loss": 1.3388,
"step": 121
},
{
"epoch": 0.6282587705181848,
"grad_norm": 0.8125,
"learning_rate": 1.719615290915563e-05,
"loss": 1.3432,
"step": 122
},
{
"epoch": 0.6334084325716125,
"grad_norm": 0.7890625,
"learning_rate": 1.7133894484707657e-05,
"loss": 1.3497,
"step": 123
},
{
"epoch": 0.6385580946250402,
"grad_norm": 0.82421875,
"learning_rate": 1.7071067811865477e-05,
"loss": 1.3239,
"step": 124
},
{
"epoch": 0.643707756678468,
"grad_norm": 0.765625,
"learning_rate": 1.7007677895070358e-05,
"loss": 1.3457,
"step": 125
},
{
"epoch": 0.6488574187318957,
"grad_norm": 0.765625,
"learning_rate": 1.694372978362861e-05,
"loss": 1.3505,
"step": 126
},
{
"epoch": 0.6540070807853234,
"grad_norm": 0.890625,
"learning_rate": 1.6879228571309377e-05,
"loss": 1.3564,
"step": 127
},
{
"epoch": 0.6591567428387513,
"grad_norm": 0.71484375,
"learning_rate": 1.6814179395938915e-05,
"loss": 1.315,
"step": 128
},
{
"epoch": 0.664306404892179,
"grad_norm": 0.9140625,
"learning_rate": 1.6748587438991303e-05,
"loss": 1.3245,
"step": 129
},
{
"epoch": 0.6694560669456067,
"grad_norm": 0.8125,
"learning_rate": 1.6682457925175762e-05,
"loss": 1.3176,
"step": 130
},
{
"epoch": 0.6746057289990345,
"grad_norm": 0.72265625,
"learning_rate": 1.6615796122020443e-05,
"loss": 1.335,
"step": 131
},
{
"epoch": 0.6797553910524622,
"grad_norm": 0.84375,
"learning_rate": 1.6548607339452853e-05,
"loss": 1.3302,
"step": 132
},
{
"epoch": 0.6849050531058899,
"grad_norm": 0.80859375,
"learning_rate": 1.6480896929376905e-05,
"loss": 1.3532,
"step": 133
},
{
"epoch": 0.6900547151593177,
"grad_norm": 0.93359375,
"learning_rate": 1.641267028524661e-05,
"loss": 1.2829,
"step": 134
},
{
"epoch": 0.6952043772127454,
"grad_norm": 0.79296875,
"learning_rate": 1.6343932841636455e-05,
"loss": 1.2986,
"step": 135
},
{
"epoch": 0.7003540392661731,
"grad_norm": 0.765625,
"learning_rate": 1.627469007380852e-05,
"loss": 1.3589,
"step": 136
},
{
"epoch": 0.7055037013196009,
"grad_norm": 0.79296875,
"learning_rate": 1.6204947497276346e-05,
"loss": 1.3704,
"step": 137
},
{
"epoch": 0.7106533633730286,
"grad_norm": 0.765625,
"learning_rate": 1.6134710667365598e-05,
"loss": 1.3371,
"step": 138
},
{
"epoch": 0.7158030254264564,
"grad_norm": 0.87109375,
"learning_rate": 1.6063985178771555e-05,
"loss": 1.3284,
"step": 139
},
{
"epoch": 0.7209526874798842,
"grad_norm": 0.75,
"learning_rate": 1.599277666511347e-05,
"loss": 1.3307,
"step": 140
},
{
"epoch": 0.7261023495333119,
"grad_norm": 0.75,
"learning_rate": 1.592109079848583e-05,
"loss": 1.3265,
"step": 141
},
{
"epoch": 0.7312520115867396,
"grad_norm": 0.7421875,
"learning_rate": 1.584893328900653e-05,
"loss": 1.3313,
"step": 142
},
{
"epoch": 0.7364016736401674,
"grad_norm": 0.7421875,
"learning_rate": 1.577630988436206e-05,
"loss": 1.345,
"step": 143
},
{
"epoch": 0.7415513356935951,
"grad_norm": 0.71484375,
"learning_rate": 1.5703226369349642e-05,
"loss": 1.2963,
"step": 144
},
{
"epoch": 0.7467009977470228,
"grad_norm": 0.77734375,
"learning_rate": 1.562968856541648e-05,
"loss": 1.3287,
"step": 145
},
{
"epoch": 0.7518506598004506,
"grad_norm": 0.77734375,
"learning_rate": 1.5555702330196024e-05,
"loss": 1.3291,
"step": 146
},
{
"epoch": 0.7570003218538783,
"grad_norm": 0.75390625,
"learning_rate": 1.5481273557041402e-05,
"loss": 1.333,
"step": 147
},
{
"epoch": 0.7570003218538783,
"eval_loss": 1.2201098203659058,
"eval_runtime": 2166.0893,
"eval_samples_per_second": 0.435,
"eval_steps_per_second": 0.435,
"step": 147
},
{
"epoch": 0.762149983907306,
"grad_norm": 0.734375,
"learning_rate": 1.5406408174555978e-05,
"loss": 1.3466,
"step": 148
},
{
"epoch": 0.7672996459607339,
"grad_norm": 0.6953125,
"learning_rate": 1.5331112146121104e-05,
"loss": 1.3547,
"step": 149
},
{
"epoch": 0.7724493080141616,
"grad_norm": 0.71484375,
"learning_rate": 1.525539146942113e-05,
"loss": 1.3049,
"step": 150
},
{
"epoch": 0.7775989700675893,
"grad_norm": 0.71875,
"learning_rate": 1.5179252175965632e-05,
"loss": 1.2915,
"step": 151
},
{
"epoch": 0.7827486321210171,
"grad_norm": 0.6953125,
"learning_rate": 1.5102700330609e-05,
"loss": 1.329,
"step": 152
},
{
"epoch": 0.7878982941744448,
"grad_norm": 0.73046875,
"learning_rate": 1.5025742031067316e-05,
"loss": 1.2951,
"step": 153
},
{
"epoch": 0.7930479562278725,
"grad_norm": 0.7421875,
"learning_rate": 1.4948383407432678e-05,
"loss": 1.2913,
"step": 154
},
{
"epoch": 0.7981976182813003,
"grad_norm": 0.7265625,
"learning_rate": 1.4870630621684873e-05,
"loss": 1.2915,
"step": 155
},
{
"epoch": 0.803347280334728,
"grad_norm": 0.72265625,
"learning_rate": 1.479248986720057e-05,
"loss": 1.3233,
"step": 156
},
{
"epoch": 0.8084969423881557,
"grad_norm": 0.75390625,
"learning_rate": 1.4713967368259981e-05,
"loss": 1.2759,
"step": 157
},
{
"epoch": 0.8136466044415835,
"grad_norm": 0.7421875,
"learning_rate": 1.4635069379551054e-05,
"loss": 1.3446,
"step": 158
},
{
"epoch": 0.8187962664950112,
"grad_norm": 0.7109375,
"learning_rate": 1.4555802185671297e-05,
"loss": 1.3373,
"step": 159
},
{
"epoch": 0.823945928548439,
"grad_norm": 0.76171875,
"learning_rate": 1.4476172100627127e-05,
"loss": 1.3016,
"step": 160
},
{
"epoch": 0.8290955906018668,
"grad_norm": 0.73046875,
"learning_rate": 1.4396185467330974e-05,
"loss": 1.3983,
"step": 161
},
{
"epoch": 0.8342452526552945,
"grad_norm": 0.77734375,
"learning_rate": 1.4315848657096006e-05,
"loss": 1.3133,
"step": 162
},
{
"epoch": 0.8393949147087223,
"grad_norm": 0.7421875,
"learning_rate": 1.4235168069128657e-05,
"loss": 1.3356,
"step": 163
},
{
"epoch": 0.84454457676215,
"grad_norm": 0.96875,
"learning_rate": 1.4154150130018867e-05,
"loss": 1.319,
"step": 164
},
{
"epoch": 0.8496942388155777,
"grad_norm": 0.71875,
"learning_rate": 1.407280129322819e-05,
"loss": 1.3293,
"step": 165
},
{
"epoch": 0.8548439008690055,
"grad_norm": 0.71875,
"learning_rate": 1.3991128038575741e-05,
"loss": 1.3331,
"step": 166
},
{
"epoch": 0.8599935629224332,
"grad_norm": 0.74609375,
"learning_rate": 1.3909136871722066e-05,
"loss": 1.3043,
"step": 167
},
{
"epoch": 0.8651432249758609,
"grad_norm": 0.7265625,
"learning_rate": 1.3826834323650899e-05,
"loss": 1.3491,
"step": 168
},
{
"epoch": 0.8702928870292888,
"grad_norm": 0.72265625,
"learning_rate": 1.374422695014897e-05,
"loss": 1.3379,
"step": 169
},
{
"epoch": 0.8754425490827165,
"grad_norm": 0.71484375,
"learning_rate": 1.3661321331283796e-05,
"loss": 1.3071,
"step": 170
},
{
"epoch": 0.8805922111361442,
"grad_norm": 0.6796875,
"learning_rate": 1.3578124070879534e-05,
"loss": 1.3342,
"step": 171
},
{
"epoch": 0.885741873189572,
"grad_norm": 0.71484375,
"learning_rate": 1.3494641795990986e-05,
"loss": 1.3486,
"step": 172
},
{
"epoch": 0.8908915352429997,
"grad_norm": 0.73828125,
"learning_rate": 1.3410881156375684e-05,
"loss": 1.2531,
"step": 173
},
{
"epoch": 0.8960411972964274,
"grad_norm": 0.73046875,
"learning_rate": 1.3326848823964243e-05,
"loss": 1.3467,
"step": 174
},
{
"epoch": 0.9011908593498552,
"grad_norm": 0.68359375,
"learning_rate": 1.3242551492328875e-05,
"loss": 1.2883,
"step": 175
},
{
"epoch": 0.9063405214032829,
"grad_norm": 0.69921875,
"learning_rate": 1.3157995876150252e-05,
"loss": 1.3573,
"step": 176
},
{
"epoch": 0.9114901834567106,
"grad_norm": 0.703125,
"learning_rate": 1.3073188710682612e-05,
"loss": 1.3087,
"step": 177
},
{
"epoch": 0.9166398455101384,
"grad_norm": 0.7421875,
"learning_rate": 1.2988136751217292e-05,
"loss": 1.2934,
"step": 178
},
{
"epoch": 0.9217895075635661,
"grad_norm": 0.73828125,
"learning_rate": 1.2902846772544625e-05,
"loss": 1.3317,
"step": 179
},
{
"epoch": 0.9269391696169939,
"grad_norm": 0.796875,
"learning_rate": 1.2817325568414299e-05,
"loss": 1.2936,
"step": 180
},
{
"epoch": 0.9320888316704217,
"grad_norm": 0.703125,
"learning_rate": 1.27315799509942e-05,
"loss": 1.3156,
"step": 181
},
{
"epoch": 0.9372384937238494,
"grad_norm": 0.7109375,
"learning_rate": 1.2645616750327792e-05,
"loss": 1.3287,
"step": 182
},
{
"epoch": 0.9423881557772771,
"grad_norm": 0.73828125,
"learning_rate": 1.2559442813790077e-05,
"loss": 1.3513,
"step": 183
},
{
"epoch": 0.9475378178307049,
"grad_norm": 0.671875,
"learning_rate": 1.2473065005542155e-05,
"loss": 1.2912,
"step": 184
},
{
"epoch": 0.9526874798841326,
"grad_norm": 0.72265625,
"learning_rate": 1.2386490205984488e-05,
"loss": 1.3411,
"step": 185
},
{
"epoch": 0.9578371419375603,
"grad_norm": 0.77734375,
"learning_rate": 1.2299725311208807e-05,
"loss": 1.3789,
"step": 186
},
{
"epoch": 0.9629868039909881,
"grad_norm": 0.67578125,
"learning_rate": 1.2212777232448837e-05,
"loss": 1.307,
"step": 187
},
{
"epoch": 0.9681364660444158,
"grad_norm": 0.69921875,
"learning_rate": 1.2125652895529766e-05,
"loss": 1.303,
"step": 188
},
{
"epoch": 0.9732861280978435,
"grad_norm": 0.6796875,
"learning_rate": 1.2038359240316589e-05,
"loss": 1.2732,
"step": 189
},
{
"epoch": 0.9784357901512714,
"grad_norm": 0.68359375,
"learning_rate": 1.1950903220161286e-05,
"loss": 1.3369,
"step": 190
},
{
"epoch": 0.9835854522046991,
"grad_norm": 0.73046875,
"learning_rate": 1.186329180134898e-05,
"loss": 1.3079,
"step": 191
},
{
"epoch": 0.9887351142581268,
"grad_norm": 0.67578125,
"learning_rate": 1.1775531962543036e-05,
"loss": 1.3106,
"step": 192
},
{
"epoch": 0.9938847763115546,
"grad_norm": 0.67578125,
"learning_rate": 1.1687630694229159e-05,
"loss": 1.3163,
"step": 193
},
{
"epoch": 0.9990344383649823,
"grad_norm": 0.69921875,
"learning_rate": 1.1599594998158602e-05,
"loss": 1.3379,
"step": 194
}
],
"logging_steps": 1,
"max_steps": 388,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 194,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.513551014229967e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}