Phi-4-bifrost-sol-3.8B / trainer_state.json
BifrostTitan's picture
Upload 17 files
dcb6f1e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005,
"grad_norm": 13.5625,
"learning_rate": 1.3333333333333336e-07,
"loss": 2.1142,
"mean_token_accuracy": 0.5323337733745575,
"num_tokens": 1733.0,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 84.0,
"learning_rate": 3.0000000000000004e-07,
"loss": 1.7051,
"mean_token_accuracy": 0.6366191267967224,
"num_tokens": 3780.0,
"step": 10
},
{
"epoch": 0.015,
"grad_norm": 110.5,
"learning_rate": 4.666666666666667e-07,
"loss": 1.9183,
"mean_token_accuracy": 0.6092176318168641,
"num_tokens": 6241.0,
"step": 15
},
{
"epoch": 0.02,
"grad_norm": 9.6875,
"learning_rate": 6.333333333333334e-07,
"loss": 1.6002,
"mean_token_accuracy": 0.6509096503257752,
"num_tokens": 8399.0,
"step": 20
},
{
"epoch": 0.025,
"grad_norm": 71.0,
"learning_rate": 8.000000000000001e-07,
"loss": 1.5134,
"mean_token_accuracy": 0.6915769815444947,
"num_tokens": 11215.0,
"step": 25
},
{
"epoch": 0.03,
"grad_norm": 60.25,
"learning_rate": 9.666666666666668e-07,
"loss": 1.3989,
"mean_token_accuracy": 0.7019677758216858,
"num_tokens": 13673.0,
"step": 30
},
{
"epoch": 0.035,
"grad_norm": 16.0,
"learning_rate": 1.1333333333333334e-06,
"loss": 2.3128,
"mean_token_accuracy": 0.5302097499370575,
"num_tokens": 14834.0,
"step": 35
},
{
"epoch": 0.04,
"grad_norm": 12.0625,
"learning_rate": 1.3e-06,
"loss": 1.0989,
"mean_token_accuracy": 0.7303242325782776,
"num_tokens": 18087.0,
"step": 40
},
{
"epoch": 0.045,
"grad_norm": 5.75,
"learning_rate": 1.4666666666666669e-06,
"loss": 1.4854,
"mean_token_accuracy": 0.6774025321006775,
"num_tokens": 21156.0,
"step": 45
},
{
"epoch": 0.05,
"grad_norm": 11.625,
"learning_rate": 1.6333333333333335e-06,
"loss": 1.2891,
"mean_token_accuracy": 0.6656664133071899,
"num_tokens": 24379.0,
"step": 50
},
{
"epoch": 0.055,
"grad_norm": 85.0,
"learning_rate": 1.8000000000000001e-06,
"loss": 2.3012,
"mean_token_accuracy": 0.5090380042791367,
"num_tokens": 26962.0,
"step": 55
},
{
"epoch": 0.06,
"grad_norm": 8.75,
"learning_rate": 1.9666666666666668e-06,
"loss": 1.6577,
"mean_token_accuracy": 0.6504465699195862,
"num_tokens": 29127.0,
"step": 60
},
{
"epoch": 0.065,
"grad_norm": 8.875,
"learning_rate": 2.133333333333334e-06,
"loss": 1.0609,
"mean_token_accuracy": 0.7339364051818847,
"num_tokens": 32645.0,
"step": 65
},
{
"epoch": 0.07,
"grad_norm": 6.34375,
"learning_rate": 2.3000000000000004e-06,
"loss": 1.271,
"mean_token_accuracy": 0.7069566965103149,
"num_tokens": 35749.0,
"step": 70
},
{
"epoch": 0.075,
"grad_norm": 7.125,
"learning_rate": 2.466666666666667e-06,
"loss": 1.1678,
"mean_token_accuracy": 0.7368309020996093,
"num_tokens": 38324.0,
"step": 75
},
{
"epoch": 0.08,
"grad_norm": 60.0,
"learning_rate": 2.6333333333333332e-06,
"loss": 1.3974,
"mean_token_accuracy": 0.664735347032547,
"num_tokens": 41203.0,
"step": 80
},
{
"epoch": 0.085,
"grad_norm": 50.25,
"learning_rate": 2.8000000000000003e-06,
"loss": 2.3673,
"mean_token_accuracy": 0.5113712131977082,
"num_tokens": 42375.0,
"step": 85
},
{
"epoch": 0.09,
"grad_norm": 53.5,
"learning_rate": 2.9666666666666673e-06,
"loss": 1.8348,
"mean_token_accuracy": 0.6246361255645752,
"num_tokens": 44196.0,
"step": 90
},
{
"epoch": 0.095,
"grad_norm": 26.125,
"learning_rate": 3.133333333333334e-06,
"loss": 2.1825,
"mean_token_accuracy": 0.553260189294815,
"num_tokens": 45435.0,
"step": 95
},
{
"epoch": 0.1,
"grad_norm": 20.875,
"learning_rate": 3.3000000000000006e-06,
"loss": 1.6649,
"mean_token_accuracy": 0.653439199924469,
"num_tokens": 46787.0,
"step": 100
},
{
"epoch": 0.105,
"grad_norm": 6.15625,
"learning_rate": 3.4666666666666672e-06,
"loss": 2.0613,
"mean_token_accuracy": 0.589026153087616,
"num_tokens": 48337.0,
"step": 105
},
{
"epoch": 0.11,
"grad_norm": 6.21875,
"learning_rate": 3.633333333333334e-06,
"loss": 1.3082,
"mean_token_accuracy": 0.6807661652565002,
"num_tokens": 51045.0,
"step": 110
},
{
"epoch": 0.115,
"grad_norm": 7.09375,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.7279,
"mean_token_accuracy": 0.7977130174636841,
"num_tokens": 55698.0,
"step": 115
},
{
"epoch": 0.12,
"grad_norm": 46.75,
"learning_rate": 3.966666666666667e-06,
"loss": 1.6527,
"mean_token_accuracy": 0.5857374429702759,
"num_tokens": 57880.0,
"step": 120
},
{
"epoch": 0.125,
"grad_norm": 8.5,
"learning_rate": 4.133333333333333e-06,
"loss": 1.1536,
"mean_token_accuracy": 0.6969289302825927,
"num_tokens": 60652.0,
"step": 125
},
{
"epoch": 0.13,
"grad_norm": 23.25,
"learning_rate": 4.3e-06,
"loss": 1.4197,
"mean_token_accuracy": 0.6533516108989715,
"num_tokens": 62909.0,
"step": 130
},
{
"epoch": 0.135,
"grad_norm": 33.0,
"learning_rate": 4.4666666666666665e-06,
"loss": 1.6691,
"mean_token_accuracy": 0.610290253162384,
"num_tokens": 64644.0,
"step": 135
},
{
"epoch": 0.14,
"grad_norm": 4.6875,
"learning_rate": 4.633333333333334e-06,
"loss": 1.0595,
"mean_token_accuracy": 0.7040035367012024,
"num_tokens": 67286.0,
"step": 140
},
{
"epoch": 0.145,
"grad_norm": 6.09375,
"learning_rate": 4.800000000000001e-06,
"loss": 1.6933,
"mean_token_accuracy": 0.6413110613822937,
"num_tokens": 68571.0,
"step": 145
},
{
"epoch": 0.15,
"grad_norm": 15.6875,
"learning_rate": 4.966666666666667e-06,
"loss": 1.5301,
"mean_token_accuracy": 0.6430149674415588,
"num_tokens": 69810.0,
"step": 150
},
{
"epoch": 0.155,
"grad_norm": 32.25,
"learning_rate": 5.133333333333334e-06,
"loss": 1.6792,
"mean_token_accuracy": 0.6632359743118286,
"num_tokens": 70724.0,
"step": 155
},
{
"epoch": 0.16,
"grad_norm": 7.90625,
"learning_rate": 5.300000000000001e-06,
"loss": 1.2571,
"mean_token_accuracy": 0.7145259499549865,
"num_tokens": 73380.0,
"step": 160
},
{
"epoch": 0.165,
"grad_norm": 26.375,
"learning_rate": 5.466666666666667e-06,
"loss": 1.3008,
"mean_token_accuracy": 0.6873301148414612,
"num_tokens": 75374.0,
"step": 165
},
{
"epoch": 0.17,
"grad_norm": 27.5,
"learning_rate": 5.633333333333334e-06,
"loss": 0.9376,
"mean_token_accuracy": 0.7572417616844177,
"num_tokens": 78638.0,
"step": 170
},
{
"epoch": 0.175,
"grad_norm": 25.75,
"learning_rate": 5.8e-06,
"loss": 1.2469,
"mean_token_accuracy": 0.7168016552925109,
"num_tokens": 81253.0,
"step": 175
},
{
"epoch": 0.18,
"grad_norm": 8.9375,
"learning_rate": 5.966666666666667e-06,
"loss": 1.6604,
"mean_token_accuracy": 0.6027044415473938,
"num_tokens": 82921.0,
"step": 180
},
{
"epoch": 0.185,
"grad_norm": 34.25,
"learning_rate": 6.133333333333334e-06,
"loss": 1.3829,
"mean_token_accuracy": 0.6629180788993836,
"num_tokens": 84865.0,
"step": 185
},
{
"epoch": 0.19,
"grad_norm": 11.25,
"learning_rate": 6.300000000000001e-06,
"loss": 1.1711,
"mean_token_accuracy": 0.7101378679275513,
"num_tokens": 87233.0,
"step": 190
},
{
"epoch": 0.195,
"grad_norm": 10.3125,
"learning_rate": 6.466666666666667e-06,
"loss": 1.6846,
"mean_token_accuracy": 0.6098420560359955,
"num_tokens": 89523.0,
"step": 195
},
{
"epoch": 0.2,
"grad_norm": 32.5,
"learning_rate": 6.633333333333334e-06,
"loss": 1.3466,
"mean_token_accuracy": 0.6577545762062073,
"num_tokens": 91844.0,
"step": 200
},
{
"epoch": 0.205,
"grad_norm": 14.0,
"learning_rate": 6.800000000000001e-06,
"loss": 1.2093,
"mean_token_accuracy": 0.7090275406837463,
"num_tokens": 94168.0,
"step": 205
},
{
"epoch": 0.21,
"grad_norm": 7.15625,
"learning_rate": 6.966666666666667e-06,
"loss": 1.3387,
"mean_token_accuracy": 0.664970874786377,
"num_tokens": 96323.0,
"step": 210
},
{
"epoch": 0.215,
"grad_norm": 32.75,
"learning_rate": 7.133333333333334e-06,
"loss": 1.1935,
"mean_token_accuracy": 0.6921853065490723,
"num_tokens": 98444.0,
"step": 215
},
{
"epoch": 0.22,
"grad_norm": 8.6875,
"learning_rate": 7.3e-06,
"loss": 1.1413,
"mean_token_accuracy": 0.7070739269256592,
"num_tokens": 101442.0,
"step": 220
},
{
"epoch": 0.225,
"grad_norm": 23.75,
"learning_rate": 7.4666666666666675e-06,
"loss": 1.0761,
"mean_token_accuracy": 0.7386624693870545,
"num_tokens": 103604.0,
"step": 225
},
{
"epoch": 0.23,
"grad_norm": 8.3125,
"learning_rate": 7.633333333333334e-06,
"loss": 1.1318,
"mean_token_accuracy": 0.7022495746612549,
"num_tokens": 105540.0,
"step": 230
},
{
"epoch": 0.235,
"grad_norm": 34.5,
"learning_rate": 7.800000000000002e-06,
"loss": 1.4643,
"mean_token_accuracy": 0.6770303070545196,
"num_tokens": 107522.0,
"step": 235
},
{
"epoch": 0.24,
"grad_norm": 8.25,
"learning_rate": 7.966666666666668e-06,
"loss": 1.1224,
"mean_token_accuracy": 0.7211631774902344,
"num_tokens": 110450.0,
"step": 240
},
{
"epoch": 0.245,
"grad_norm": 5.53125,
"learning_rate": 8.133333333333334e-06,
"loss": 1.1466,
"mean_token_accuracy": 0.7181923449039459,
"num_tokens": 113134.0,
"step": 245
},
{
"epoch": 0.25,
"grad_norm": 11.9375,
"learning_rate": 8.3e-06,
"loss": 1.1408,
"mean_token_accuracy": 0.7148800849914551,
"num_tokens": 115554.0,
"step": 250
},
{
"epoch": 0.255,
"grad_norm": 30.25,
"learning_rate": 8.466666666666668e-06,
"loss": 1.1868,
"mean_token_accuracy": 0.6954805672168731,
"num_tokens": 118247.0,
"step": 255
},
{
"epoch": 0.26,
"grad_norm": 21.75,
"learning_rate": 8.633333333333334e-06,
"loss": 1.2475,
"mean_token_accuracy": 0.6629476428031922,
"num_tokens": 120275.0,
"step": 260
},
{
"epoch": 0.265,
"grad_norm": 5.75,
"learning_rate": 8.8e-06,
"loss": 1.1846,
"mean_token_accuracy": 0.687124228477478,
"num_tokens": 122539.0,
"step": 265
},
{
"epoch": 0.27,
"grad_norm": 29.625,
"learning_rate": 8.966666666666667e-06,
"loss": 1.4612,
"mean_token_accuracy": 0.6541434586048126,
"num_tokens": 123637.0,
"step": 270
},
{
"epoch": 0.275,
"grad_norm": 5.3125,
"learning_rate": 9.133333333333335e-06,
"loss": 0.8047,
"mean_token_accuracy": 0.7711950898170471,
"num_tokens": 127360.0,
"step": 275
},
{
"epoch": 0.28,
"grad_norm": 8.0,
"learning_rate": 9.3e-06,
"loss": 1.1051,
"mean_token_accuracy": 0.703935158252716,
"num_tokens": 130138.0,
"step": 280
},
{
"epoch": 0.285,
"grad_norm": 7.65625,
"learning_rate": 9.466666666666667e-06,
"loss": 1.2678,
"mean_token_accuracy": 0.7222112536430358,
"num_tokens": 131621.0,
"step": 285
},
{
"epoch": 0.29,
"grad_norm": 9.0,
"learning_rate": 9.633333333333335e-06,
"loss": 1.1627,
"mean_token_accuracy": 0.6856825113296509,
"num_tokens": 134152.0,
"step": 290
},
{
"epoch": 0.295,
"grad_norm": 26.0,
"learning_rate": 9.800000000000001e-06,
"loss": 1.0517,
"mean_token_accuracy": 0.728261661529541,
"num_tokens": 136289.0,
"step": 295
},
{
"epoch": 0.3,
"grad_norm": 40.5,
"learning_rate": 9.966666666666667e-06,
"loss": 1.1132,
"mean_token_accuracy": 0.7170758843421936,
"num_tokens": 138790.0,
"step": 300
},
{
"epoch": 0.305,
"grad_norm": 8.0,
"learning_rate": 1.0133333333333335e-05,
"loss": 1.4069,
"mean_token_accuracy": 0.6523711323738098,
"num_tokens": 141292.0,
"step": 305
},
{
"epoch": 0.31,
"grad_norm": 8.3125,
"learning_rate": 1.0300000000000001e-05,
"loss": 0.72,
"mean_token_accuracy": 0.7980146527290344,
"num_tokens": 145289.0,
"step": 310
},
{
"epoch": 0.315,
"grad_norm": 29.875,
"learning_rate": 1.0466666666666668e-05,
"loss": 1.1226,
"mean_token_accuracy": 0.7180093169212342,
"num_tokens": 147253.0,
"step": 315
},
{
"epoch": 0.32,
"grad_norm": 13.4375,
"learning_rate": 1.0633333333333334e-05,
"loss": 0.7705,
"mean_token_accuracy": 0.7848826766014099,
"num_tokens": 150324.0,
"step": 320
},
{
"epoch": 0.325,
"grad_norm": 12.6875,
"learning_rate": 1.0800000000000002e-05,
"loss": 0.8428,
"mean_token_accuracy": 0.7722244143486023,
"num_tokens": 154155.0,
"step": 325
},
{
"epoch": 0.33,
"grad_norm": 4.5,
"learning_rate": 1.0966666666666668e-05,
"loss": 1.1001,
"mean_token_accuracy": 0.7051229119300843,
"num_tokens": 157400.0,
"step": 330
},
{
"epoch": 0.335,
"grad_norm": 35.25,
"learning_rate": 1.1133333333333334e-05,
"loss": 1.0369,
"mean_token_accuracy": 0.7228875994682312,
"num_tokens": 160018.0,
"step": 335
},
{
"epoch": 0.34,
"grad_norm": 9.5625,
"learning_rate": 1.13e-05,
"loss": 1.0404,
"mean_token_accuracy": 0.7233722567558288,
"num_tokens": 163034.0,
"step": 340
},
{
"epoch": 0.345,
"grad_norm": 41.75,
"learning_rate": 1.1466666666666668e-05,
"loss": 1.536,
"mean_token_accuracy": 0.6506137490272522,
"num_tokens": 163880.0,
"step": 345
},
{
"epoch": 0.35,
"grad_norm": 30.75,
"learning_rate": 1.1633333333333334e-05,
"loss": 1.5428,
"mean_token_accuracy": 0.607353800535202,
"num_tokens": 164994.0,
"step": 350
},
{
"epoch": 0.355,
"grad_norm": 4.78125,
"learning_rate": 1.18e-05,
"loss": 0.7706,
"mean_token_accuracy": 0.786465299129486,
"num_tokens": 168825.0,
"step": 355
},
{
"epoch": 0.36,
"grad_norm": 31.75,
"learning_rate": 1.1966666666666668e-05,
"loss": 1.0517,
"mean_token_accuracy": 0.7385401725769043,
"num_tokens": 172234.0,
"step": 360
},
{
"epoch": 0.365,
"grad_norm": 6.6875,
"learning_rate": 1.2133333333333335e-05,
"loss": 1.4293,
"mean_token_accuracy": 0.6525760054588318,
"num_tokens": 174168.0,
"step": 365
},
{
"epoch": 0.37,
"grad_norm": 25.625,
"learning_rate": 1.23e-05,
"loss": 1.5997,
"mean_token_accuracy": 0.6229295372962952,
"num_tokens": 175506.0,
"step": 370
},
{
"epoch": 0.375,
"grad_norm": 9.4375,
"learning_rate": 1.2466666666666667e-05,
"loss": 1.0306,
"mean_token_accuracy": 0.7189781785011291,
"num_tokens": 177941.0,
"step": 375
},
{
"epoch": 0.38,
"grad_norm": 7.53125,
"learning_rate": 1.2633333333333335e-05,
"loss": 0.9724,
"mean_token_accuracy": 0.7246036767959595,
"num_tokens": 180506.0,
"step": 380
},
{
"epoch": 0.385,
"grad_norm": 7.40625,
"learning_rate": 1.2800000000000001e-05,
"loss": 1.1149,
"mean_token_accuracy": 0.7029499173164367,
"num_tokens": 182002.0,
"step": 385
},
{
"epoch": 0.39,
"grad_norm": 26.375,
"learning_rate": 1.2966666666666667e-05,
"loss": 1.5045,
"mean_token_accuracy": 0.6098679423332214,
"num_tokens": 182656.0,
"step": 390
},
{
"epoch": 0.395,
"grad_norm": 15.75,
"learning_rate": 1.3133333333333334e-05,
"loss": 0.9356,
"mean_token_accuracy": 0.7362943291664124,
"num_tokens": 185089.0,
"step": 395
},
{
"epoch": 0.4,
"grad_norm": 8.4375,
"learning_rate": 1.3300000000000001e-05,
"loss": 0.9694,
"mean_token_accuracy": 0.7282199025154114,
"num_tokens": 188035.0,
"step": 400
},
{
"epoch": 0.405,
"grad_norm": 7.28125,
"learning_rate": 1.3466666666666668e-05,
"loss": 1.2325,
"mean_token_accuracy": 0.6913779973983765,
"num_tokens": 190462.0,
"step": 405
},
{
"epoch": 0.41,
"grad_norm": 29.125,
"learning_rate": 1.3633333333333334e-05,
"loss": 1.2702,
"mean_token_accuracy": 0.663710606098175,
"num_tokens": 192716.0,
"step": 410
},
{
"epoch": 0.415,
"grad_norm": 8.0625,
"learning_rate": 1.38e-05,
"loss": 1.1096,
"mean_token_accuracy": 0.7282086968421936,
"num_tokens": 195440.0,
"step": 415
},
{
"epoch": 0.42,
"grad_norm": 7.9375,
"learning_rate": 1.3966666666666668e-05,
"loss": 0.8709,
"mean_token_accuracy": 0.757840347290039,
"num_tokens": 199110.0,
"step": 420
},
{
"epoch": 0.425,
"grad_norm": 29.375,
"learning_rate": 1.4133333333333334e-05,
"loss": 1.1679,
"mean_token_accuracy": 0.7137652635574341,
"num_tokens": 201034.0,
"step": 425
},
{
"epoch": 0.43,
"grad_norm": 6.84375,
"learning_rate": 1.43e-05,
"loss": 0.8863,
"mean_token_accuracy": 0.7540697813034057,
"num_tokens": 204163.0,
"step": 430
},
{
"epoch": 0.435,
"grad_norm": 7.65625,
"learning_rate": 1.4466666666666668e-05,
"loss": 0.9015,
"mean_token_accuracy": 0.7539669752120972,
"num_tokens": 207778.0,
"step": 435
},
{
"epoch": 0.44,
"grad_norm": 24.0,
"learning_rate": 1.4633333333333334e-05,
"loss": 1.0044,
"mean_token_accuracy": 0.7438582420349121,
"num_tokens": 211196.0,
"step": 440
},
{
"epoch": 0.445,
"grad_norm": 16.125,
"learning_rate": 1.48e-05,
"loss": 1.003,
"mean_token_accuracy": 0.722856342792511,
"num_tokens": 213420.0,
"step": 445
},
{
"epoch": 0.45,
"grad_norm": 10.0,
"learning_rate": 1.4966666666666667e-05,
"loss": 1.3499,
"mean_token_accuracy": 0.64574693441391,
"num_tokens": 215785.0,
"step": 450
},
{
"epoch": 0.455,
"grad_norm": 19.25,
"learning_rate": 1.5133333333333335e-05,
"loss": 0.8628,
"mean_token_accuracy": 0.7608179092407227,
"num_tokens": 218181.0,
"step": 455
},
{
"epoch": 0.46,
"grad_norm": 7.4375,
"learning_rate": 1.5300000000000003e-05,
"loss": 0.9714,
"mean_token_accuracy": 0.737330162525177,
"num_tokens": 221215.0,
"step": 460
},
{
"epoch": 0.465,
"grad_norm": 7.40625,
"learning_rate": 1.546666666666667e-05,
"loss": 1.2211,
"mean_token_accuracy": 0.6857584714889526,
"num_tokens": 223524.0,
"step": 465
},
{
"epoch": 0.47,
"grad_norm": 8.75,
"learning_rate": 1.5633333333333335e-05,
"loss": 0.9083,
"mean_token_accuracy": 0.7348023653030396,
"num_tokens": 226999.0,
"step": 470
},
{
"epoch": 0.475,
"grad_norm": 21.375,
"learning_rate": 1.58e-05,
"loss": 1.2031,
"mean_token_accuracy": 0.6628332495689392,
"num_tokens": 229176.0,
"step": 475
},
{
"epoch": 0.48,
"grad_norm": 38.5,
"learning_rate": 1.5966666666666667e-05,
"loss": 1.0468,
"mean_token_accuracy": 0.7459447026252747,
"num_tokens": 232128.0,
"step": 480
},
{
"epoch": 0.485,
"grad_norm": 26.875,
"learning_rate": 1.6133333333333334e-05,
"loss": 1.376,
"mean_token_accuracy": 0.6213439345359802,
"num_tokens": 233115.0,
"step": 485
},
{
"epoch": 0.49,
"grad_norm": 6.75,
"learning_rate": 1.63e-05,
"loss": 1.3646,
"mean_token_accuracy": 0.6519601762294769,
"num_tokens": 234739.0,
"step": 490
},
{
"epoch": 0.495,
"grad_norm": 4.0,
"learning_rate": 1.646666666666667e-05,
"loss": 0.7117,
"mean_token_accuracy": 0.7921866178512573,
"num_tokens": 238072.0,
"step": 495
},
{
"epoch": 0.5,
"grad_norm": 7.90625,
"learning_rate": 1.6633333333333336e-05,
"loss": 1.241,
"mean_token_accuracy": 0.6775242328643799,
"num_tokens": 239427.0,
"step": 500
},
{
"epoch": 0.505,
"grad_norm": 5.09375,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.9237,
"mean_token_accuracy": 0.7540452361106873,
"num_tokens": 241443.0,
"step": 505
},
{
"epoch": 0.51,
"grad_norm": 8.875,
"learning_rate": 1.6966666666666668e-05,
"loss": 0.9896,
"mean_token_accuracy": 0.7450487017631531,
"num_tokens": 244266.0,
"step": 510
},
{
"epoch": 0.515,
"grad_norm": 9.25,
"learning_rate": 1.7133333333333334e-05,
"loss": 1.0898,
"mean_token_accuracy": 0.6930008172988892,
"num_tokens": 247220.0,
"step": 515
},
{
"epoch": 0.52,
"grad_norm": 6.1875,
"learning_rate": 1.73e-05,
"loss": 0.7898,
"mean_token_accuracy": 0.7681538939476014,
"num_tokens": 251322.0,
"step": 520
},
{
"epoch": 0.525,
"grad_norm": 25.625,
"learning_rate": 1.7466666666666667e-05,
"loss": 1.1583,
"mean_token_accuracy": 0.7108910560607911,
"num_tokens": 253357.0,
"step": 525
},
{
"epoch": 0.53,
"grad_norm": 30.0,
"learning_rate": 1.7633333333333336e-05,
"loss": 0.9222,
"mean_token_accuracy": 0.7563987374305725,
"num_tokens": 256122.0,
"step": 530
},
{
"epoch": 0.535,
"grad_norm": 30.75,
"learning_rate": 1.7800000000000002e-05,
"loss": 0.9896,
"mean_token_accuracy": 0.722571051120758,
"num_tokens": 258447.0,
"step": 535
},
{
"epoch": 0.54,
"grad_norm": 8.5625,
"learning_rate": 1.796666666666667e-05,
"loss": 0.7903,
"mean_token_accuracy": 0.7722970724105835,
"num_tokens": 261917.0,
"step": 540
},
{
"epoch": 0.545,
"grad_norm": 7.25,
"learning_rate": 1.8133333333333335e-05,
"loss": 0.6733,
"mean_token_accuracy": 0.798851752281189,
"num_tokens": 265658.0,
"step": 545
},
{
"epoch": 0.55,
"grad_norm": 23.125,
"learning_rate": 1.83e-05,
"loss": 1.2436,
"mean_token_accuracy": 0.67679682970047,
"num_tokens": 267665.0,
"step": 550
},
{
"epoch": 0.555,
"grad_norm": 26.75,
"learning_rate": 1.8466666666666667e-05,
"loss": 1.1331,
"mean_token_accuracy": 0.686410254240036,
"num_tokens": 269966.0,
"step": 555
},
{
"epoch": 0.56,
"grad_norm": 27.375,
"learning_rate": 1.8633333333333333e-05,
"loss": 0.9328,
"mean_token_accuracy": 0.7283676505088806,
"num_tokens": 273033.0,
"step": 560
},
{
"epoch": 0.565,
"grad_norm": 4.375,
"learning_rate": 1.88e-05,
"loss": 1.0776,
"mean_token_accuracy": 0.7249770760536194,
"num_tokens": 275754.0,
"step": 565
},
{
"epoch": 0.57,
"grad_norm": 25.25,
"learning_rate": 1.896666666666667e-05,
"loss": 0.9722,
"mean_token_accuracy": 0.7202863574028016,
"num_tokens": 279072.0,
"step": 570
},
{
"epoch": 0.575,
"grad_norm": 6.5625,
"learning_rate": 1.9133333333333335e-05,
"loss": 1.0787,
"mean_token_accuracy": 0.7136101365089417,
"num_tokens": 281884.0,
"step": 575
},
{
"epoch": 0.58,
"grad_norm": 8.0625,
"learning_rate": 1.93e-05,
"loss": 0.8845,
"mean_token_accuracy": 0.7490099549293519,
"num_tokens": 285077.0,
"step": 580
},
{
"epoch": 0.585,
"grad_norm": 8.4375,
"learning_rate": 1.9466666666666668e-05,
"loss": 1.1224,
"mean_token_accuracy": 0.7014864623546601,
"num_tokens": 287813.0,
"step": 585
},
{
"epoch": 0.59,
"grad_norm": 37.0,
"learning_rate": 1.9633333333333334e-05,
"loss": 1.1778,
"mean_token_accuracy": 0.6806494235992432,
"num_tokens": 288638.0,
"step": 590
},
{
"epoch": 0.595,
"grad_norm": 10.125,
"learning_rate": 1.98e-05,
"loss": 0.9242,
"mean_token_accuracy": 0.760206151008606,
"num_tokens": 291339.0,
"step": 595
},
{
"epoch": 0.6,
"grad_norm": 14.0625,
"learning_rate": 1.9966666666666666e-05,
"loss": 0.9388,
"mean_token_accuracy": 0.7776451945304871,
"num_tokens": 292680.0,
"step": 600
},
{
"epoch": 0.605,
"grad_norm": 30.625,
"learning_rate": 1.999986292247427e-05,
"loss": 0.8811,
"mean_token_accuracy": 0.7725589990615844,
"num_tokens": 295802.0,
"step": 605
},
{
"epoch": 0.61,
"grad_norm": 7.25,
"learning_rate": 1.9999306051466772e-05,
"loss": 0.9805,
"mean_token_accuracy": 0.7329005718231201,
"num_tokens": 298109.0,
"step": 610
},
{
"epoch": 0.615,
"grad_norm": 7.5625,
"learning_rate": 1.999832084346831e-05,
"loss": 0.8401,
"mean_token_accuracy": 0.7744085669517518,
"num_tokens": 299060.0,
"step": 615
},
{
"epoch": 0.62,
"grad_norm": 4.28125,
"learning_rate": 1.9996907340681907e-05,
"loss": 0.8956,
"mean_token_accuracy": 0.7751296997070313,
"num_tokens": 301210.0,
"step": 620
},
{
"epoch": 0.625,
"grad_norm": 21.125,
"learning_rate": 1.9995065603657317e-05,
"loss": 1.0128,
"mean_token_accuracy": 0.7251328110694886,
"num_tokens": 303234.0,
"step": 625
},
{
"epoch": 0.63,
"grad_norm": 8.0625,
"learning_rate": 1.9992795711288432e-05,
"loss": 1.1435,
"mean_token_accuracy": 0.6821866631507874,
"num_tokens": 305773.0,
"step": 630
},
{
"epoch": 0.635,
"grad_norm": 5.03125,
"learning_rate": 1.9990097760809878e-05,
"loss": 0.9333,
"mean_token_accuracy": 0.7206153392791748,
"num_tokens": 307991.0,
"step": 635
},
{
"epoch": 0.64,
"grad_norm": 7.375,
"learning_rate": 1.998697186779288e-05,
"loss": 1.3283,
"mean_token_accuracy": 0.6572122693061828,
"num_tokens": 309669.0,
"step": 640
},
{
"epoch": 0.645,
"grad_norm": 4.96875,
"learning_rate": 1.9983418166140286e-05,
"loss": 0.7746,
"mean_token_accuracy": 0.7766924381256104,
"num_tokens": 312663.0,
"step": 645
},
{
"epoch": 0.65,
"grad_norm": 28.125,
"learning_rate": 1.997943680808085e-05,
"loss": 1.1281,
"mean_token_accuracy": 0.6910940647125244,
"num_tokens": 314698.0,
"step": 650
},
{
"epoch": 0.655,
"grad_norm": 9.6875,
"learning_rate": 1.9975027964162704e-05,
"loss": 0.8849,
"mean_token_accuracy": 0.7341102123260498,
"num_tokens": 317519.0,
"step": 655
},
{
"epoch": 0.66,
"grad_norm": 5.84375,
"learning_rate": 1.997019182324604e-05,
"loss": 0.5954,
"mean_token_accuracy": 0.8246450662612915,
"num_tokens": 321897.0,
"step": 660
},
{
"epoch": 0.665,
"grad_norm": 24.625,
"learning_rate": 1.9964928592495046e-05,
"loss": 1.0302,
"mean_token_accuracy": 0.6987462162971496,
"num_tokens": 323882.0,
"step": 665
},
{
"epoch": 0.67,
"grad_norm": 6.40625,
"learning_rate": 1.9959238497369006e-05,
"loss": 0.9251,
"mean_token_accuracy": 0.7502574563026428,
"num_tokens": 325803.0,
"step": 670
},
{
"epoch": 0.675,
"grad_norm": 7.3125,
"learning_rate": 1.9953121781612657e-05,
"loss": 0.9518,
"mean_token_accuracy": 0.7389394760131835,
"num_tokens": 329701.0,
"step": 675
},
{
"epoch": 0.68,
"grad_norm": 5.71875,
"learning_rate": 1.9946578707245744e-05,
"loss": 1.0196,
"mean_token_accuracy": 0.7160016298294067,
"num_tokens": 333538.0,
"step": 680
},
{
"epoch": 0.685,
"grad_norm": 5.40625,
"learning_rate": 1.99396095545518e-05,
"loss": 1.145,
"mean_token_accuracy": 0.71680166721344,
"num_tokens": 335053.0,
"step": 685
},
{
"epoch": 0.69,
"grad_norm": 30.25,
"learning_rate": 1.9932214622066123e-05,
"loss": 1.2193,
"mean_token_accuracy": 0.6923537135124207,
"num_tokens": 336284.0,
"step": 690
},
{
"epoch": 0.695,
"grad_norm": 6.53125,
"learning_rate": 1.9924394226563016e-05,
"loss": 0.7077,
"mean_token_accuracy": 0.8013210415840148,
"num_tokens": 340181.0,
"step": 695
},
{
"epoch": 0.7,
"grad_norm": 27.125,
"learning_rate": 1.9916148703042193e-05,
"loss": 1.2146,
"mean_token_accuracy": 0.6644866585731506,
"num_tokens": 342323.0,
"step": 700
},
{
"epoch": 0.705,
"grad_norm": 17.375,
"learning_rate": 1.9907478404714438e-05,
"loss": 1.0603,
"mean_token_accuracy": 0.7222308039665222,
"num_tokens": 343975.0,
"step": 705
},
{
"epoch": 0.71,
"grad_norm": 29.25,
"learning_rate": 1.9898383702986473e-05,
"loss": 1.0548,
"mean_token_accuracy": 0.7053543448448181,
"num_tokens": 344944.0,
"step": 710
},
{
"epoch": 0.715,
"grad_norm": 5.0625,
"learning_rate": 1.988886498744505e-05,
"loss": 0.703,
"mean_token_accuracy": 0.8008648157119751,
"num_tokens": 348397.0,
"step": 715
},
{
"epoch": 0.72,
"grad_norm": 10.25,
"learning_rate": 1.987892266584026e-05,
"loss": 1.1011,
"mean_token_accuracy": 0.7074811816215515,
"num_tokens": 351294.0,
"step": 720
},
{
"epoch": 0.725,
"grad_norm": 33.75,
"learning_rate": 1.9868557164068073e-05,
"loss": 0.9834,
"mean_token_accuracy": 0.706681752204895,
"num_tokens": 353157.0,
"step": 725
},
{
"epoch": 0.73,
"grad_norm": 34.0,
"learning_rate": 1.985776892615209e-05,
"loss": 0.858,
"mean_token_accuracy": 0.7515957832336426,
"num_tokens": 355800.0,
"step": 730
},
{
"epoch": 0.735,
"grad_norm": 7.6875,
"learning_rate": 1.984655841422451e-05,
"loss": 0.9033,
"mean_token_accuracy": 0.7307195067405701,
"num_tokens": 358533.0,
"step": 735
},
{
"epoch": 0.74,
"grad_norm": 10.1875,
"learning_rate": 1.9834926108506357e-05,
"loss": 0.9196,
"mean_token_accuracy": 0.740574061870575,
"num_tokens": 361710.0,
"step": 740
},
{
"epoch": 0.745,
"grad_norm": 27.125,
"learning_rate": 1.982287250728689e-05,
"loss": 1.2976,
"mean_token_accuracy": 0.6771107912063599,
"num_tokens": 363820.0,
"step": 745
},
{
"epoch": 0.75,
"grad_norm": 6.40625,
"learning_rate": 1.981039812690227e-05,
"loss": 0.9639,
"mean_token_accuracy": 0.7433285117149353,
"num_tokens": 366953.0,
"step": 750
},
{
"epoch": 0.755,
"grad_norm": 26.0,
"learning_rate": 1.979750350171343e-05,
"loss": 0.9865,
"mean_token_accuracy": 0.7108042955398559,
"num_tokens": 369294.0,
"step": 755
},
{
"epoch": 0.76,
"grad_norm": 20.375,
"learning_rate": 1.9784189184083203e-05,
"loss": 1.2112,
"mean_token_accuracy": 0.6762296676635742,
"num_tokens": 370620.0,
"step": 760
},
{
"epoch": 0.765,
"grad_norm": 7.40625,
"learning_rate": 1.977045574435264e-05,
"loss": 0.8026,
"mean_token_accuracy": 0.7732225179672241,
"num_tokens": 373947.0,
"step": 765
},
{
"epoch": 0.77,
"grad_norm": 27.625,
"learning_rate": 1.9756303770816588e-05,
"loss": 0.9244,
"mean_token_accuracy": 0.7461455583572387,
"num_tokens": 376618.0,
"step": 770
},
{
"epoch": 0.775,
"grad_norm": 23.625,
"learning_rate": 1.9741733869698497e-05,
"loss": 1.0857,
"mean_token_accuracy": 0.7027423620223999,
"num_tokens": 378773.0,
"step": 775
},
{
"epoch": 0.78,
"grad_norm": 28.375,
"learning_rate": 1.972674666512443e-05,
"loss": 1.1611,
"mean_token_accuracy": 0.687284791469574,
"num_tokens": 380436.0,
"step": 780
},
{
"epoch": 0.785,
"grad_norm": 5.5,
"learning_rate": 1.971134279909636e-05,
"loss": 0.9853,
"mean_token_accuracy": 0.7149757027626038,
"num_tokens": 383360.0,
"step": 785
},
{
"epoch": 0.79,
"grad_norm": 29.625,
"learning_rate": 1.9695522931464637e-05,
"loss": 1.0704,
"mean_token_accuracy": 0.7092410445213317,
"num_tokens": 384901.0,
"step": 790
},
{
"epoch": 0.795,
"grad_norm": 4.90625,
"learning_rate": 1.9679287739899733e-05,
"loss": 0.9147,
"mean_token_accuracy": 0.7358805298805237,
"num_tokens": 388295.0,
"step": 795
},
{
"epoch": 0.8,
"grad_norm": 24.0,
"learning_rate": 1.9662637919863224e-05,
"loss": 0.9703,
"mean_token_accuracy": 0.7364350974559783,
"num_tokens": 391062.0,
"step": 800
},
{
"epoch": 0.805,
"grad_norm": 19.875,
"learning_rate": 1.9645574184577982e-05,
"loss": 1.314,
"mean_token_accuracy": 0.6746392011642456,
"num_tokens": 392268.0,
"step": 805
},
{
"epoch": 0.81,
"grad_norm": 6.53125,
"learning_rate": 1.9628097264997637e-05,
"loss": 0.7936,
"mean_token_accuracy": 0.7952145218849183,
"num_tokens": 395200.0,
"step": 810
},
{
"epoch": 0.815,
"grad_norm": 5.40625,
"learning_rate": 1.9610207909775252e-05,
"loss": 0.7545,
"mean_token_accuracy": 0.7765708088874816,
"num_tokens": 398196.0,
"step": 815
},
{
"epoch": 0.82,
"grad_norm": 8.0625,
"learning_rate": 1.9591906885231275e-05,
"loss": 0.9946,
"mean_token_accuracy": 0.7330313444137573,
"num_tokens": 400393.0,
"step": 820
},
{
"epoch": 0.825,
"grad_norm": 6.53125,
"learning_rate": 1.9573194975320672e-05,
"loss": 0.9535,
"mean_token_accuracy": 0.7550879120826721,
"num_tokens": 403335.0,
"step": 825
},
{
"epoch": 0.83,
"grad_norm": 5.21875,
"learning_rate": 1.9554072981599398e-05,
"loss": 0.7711,
"mean_token_accuracy": 0.7814031720161438,
"num_tokens": 406793.0,
"step": 830
},
{
"epoch": 0.835,
"grad_norm": 7.5625,
"learning_rate": 1.953454172319001e-05,
"loss": 0.835,
"mean_token_accuracy": 0.7673190832138062,
"num_tokens": 410253.0,
"step": 835
},
{
"epoch": 0.84,
"grad_norm": 10.4375,
"learning_rate": 1.9514602036746627e-05,
"loss": 0.8274,
"mean_token_accuracy": 0.7780193209648132,
"num_tokens": 412426.0,
"step": 840
},
{
"epoch": 0.845,
"grad_norm": 16.0,
"learning_rate": 1.949425477641904e-05,
"loss": 1.0199,
"mean_token_accuracy": 0.7647771120071412,
"num_tokens": 414613.0,
"step": 845
},
{
"epoch": 0.85,
"grad_norm": 5.375,
"learning_rate": 1.9473500813816163e-05,
"loss": 0.7706,
"mean_token_accuracy": 0.7762330651283265,
"num_tokens": 417980.0,
"step": 850
},
{
"epoch": 0.855,
"grad_norm": 24.875,
"learning_rate": 1.9452341037968684e-05,
"loss": 1.1174,
"mean_token_accuracy": 0.6667572498321533,
"num_tokens": 419696.0,
"step": 855
},
{
"epoch": 0.86,
"grad_norm": 9.0625,
"learning_rate": 1.943077635529097e-05,
"loss": 0.869,
"mean_token_accuracy": 0.7522995114326477,
"num_tokens": 423247.0,
"step": 860
},
{
"epoch": 0.865,
"grad_norm": 8.375,
"learning_rate": 1.9408807689542257e-05,
"loss": 1.0374,
"mean_token_accuracy": 0.7039300322532653,
"num_tokens": 425586.0,
"step": 865
},
{
"epoch": 0.87,
"grad_norm": 23.875,
"learning_rate": 1.9386435981787067e-05,
"loss": 1.1326,
"mean_token_accuracy": 0.6725295066833497,
"num_tokens": 427221.0,
"step": 870
},
{
"epoch": 0.875,
"grad_norm": 7.53125,
"learning_rate": 1.93636621903549e-05,
"loss": 0.9602,
"mean_token_accuracy": 0.7483055353164673,
"num_tokens": 429868.0,
"step": 875
},
{
"epoch": 0.88,
"grad_norm": 5.28125,
"learning_rate": 1.9340487290799187e-05,
"loss": 0.8656,
"mean_token_accuracy": 0.762619799375534,
"num_tokens": 433334.0,
"step": 880
},
{
"epoch": 0.885,
"grad_norm": 5.5,
"learning_rate": 1.931691227585549e-05,
"loss": 0.9443,
"mean_token_accuracy": 0.742293655872345,
"num_tokens": 435710.0,
"step": 885
},
{
"epoch": 0.89,
"grad_norm": 8.6875,
"learning_rate": 1.929293815539899e-05,
"loss": 1.1683,
"mean_token_accuracy": 0.6682364106178283,
"num_tokens": 437697.0,
"step": 890
},
{
"epoch": 0.895,
"grad_norm": 9.125,
"learning_rate": 1.926856595640121e-05,
"loss": 1.2909,
"mean_token_accuracy": 0.6599106311798095,
"num_tokens": 439418.0,
"step": 895
},
{
"epoch": 0.9,
"grad_norm": 5.0,
"learning_rate": 1.924379672288604e-05,
"loss": 0.7213,
"mean_token_accuracy": 0.7874080419540406,
"num_tokens": 443108.0,
"step": 900
},
{
"epoch": 0.905,
"grad_norm": 5.59375,
"learning_rate": 1.9218631515885007e-05,
"loss": 1.0733,
"mean_token_accuracy": 0.7128746628761291,
"num_tokens": 446547.0,
"step": 905
},
{
"epoch": 0.91,
"grad_norm": 5.65625,
"learning_rate": 1.9193071413391823e-05,
"loss": 1.1654,
"mean_token_accuracy": 0.6913678884506226,
"num_tokens": 448651.0,
"step": 910
},
{
"epoch": 0.915,
"grad_norm": 6.90625,
"learning_rate": 1.9167117510316203e-05,
"loss": 0.8036,
"mean_token_accuracy": 0.7887425661087036,
"num_tokens": 452294.0,
"step": 915
},
{
"epoch": 0.92,
"grad_norm": 6.78125,
"learning_rate": 1.9140770918436977e-05,
"loss": 0.8051,
"mean_token_accuracy": 0.7639839768409729,
"num_tokens": 454833.0,
"step": 920
},
{
"epoch": 0.925,
"grad_norm": 25.375,
"learning_rate": 1.9114032766354453e-05,
"loss": 1.4186,
"mean_token_accuracy": 0.6357974767684936,
"num_tokens": 455450.0,
"step": 925
},
{
"epoch": 0.93,
"grad_norm": 30.0,
"learning_rate": 1.9086904199442076e-05,
"loss": 1.249,
"mean_token_accuracy": 0.6428504943847656,
"num_tokens": 456891.0,
"step": 930
},
{
"epoch": 0.935,
"grad_norm": 8.5,
"learning_rate": 1.905938637979736e-05,
"loss": 1.0072,
"mean_token_accuracy": 0.707493394613266,
"num_tokens": 459398.0,
"step": 935
},
{
"epoch": 0.94,
"grad_norm": 5.34375,
"learning_rate": 1.9031480486192112e-05,
"loss": 0.8838,
"mean_token_accuracy": 0.747218382358551,
"num_tokens": 462454.0,
"step": 940
},
{
"epoch": 0.945,
"grad_norm": 20.25,
"learning_rate": 1.9003187714021936e-05,
"loss": 1.2989,
"mean_token_accuracy": 0.6610628962516785,
"num_tokens": 464222.0,
"step": 945
},
{
"epoch": 0.95,
"grad_norm": 5.28125,
"learning_rate": 1.897450927525503e-05,
"loss": 0.8568,
"mean_token_accuracy": 0.7829653263092041,
"num_tokens": 466434.0,
"step": 950
},
{
"epoch": 0.955,
"grad_norm": 6.0625,
"learning_rate": 1.894544639838025e-05,
"loss": 1.0098,
"mean_token_accuracy": 0.726330041885376,
"num_tokens": 470554.0,
"step": 955
},
{
"epoch": 0.96,
"grad_norm": 30.125,
"learning_rate": 1.8916000328354527e-05,
"loss": 1.0486,
"mean_token_accuracy": 0.6986983299255372,
"num_tokens": 473548.0,
"step": 960
},
{
"epoch": 0.965,
"grad_norm": 5.4375,
"learning_rate": 1.888617232654949e-05,
"loss": 1.003,
"mean_token_accuracy": 0.7258612275123596,
"num_tokens": 476545.0,
"step": 965
},
{
"epoch": 0.97,
"grad_norm": 6.0,
"learning_rate": 1.8855963670697458e-05,
"loss": 0.9224,
"mean_token_accuracy": 0.7539906024932861,
"num_tokens": 478267.0,
"step": 970
},
{
"epoch": 0.975,
"grad_norm": 8.0625,
"learning_rate": 1.8825375654836712e-05,
"loss": 0.7836,
"mean_token_accuracy": 0.7753713011741639,
"num_tokens": 482619.0,
"step": 975
},
{
"epoch": 0.98,
"grad_norm": 5.375,
"learning_rate": 1.8794409589256043e-05,
"loss": 0.7844,
"mean_token_accuracy": 0.7756492972373963,
"num_tokens": 485731.0,
"step": 980
},
{
"epoch": 0.985,
"grad_norm": 26.625,
"learning_rate": 1.8763066800438638e-05,
"loss": 1.1647,
"mean_token_accuracy": 0.7253339409828186,
"num_tokens": 487225.0,
"step": 985
},
{
"epoch": 0.99,
"grad_norm": 4.53125,
"learning_rate": 1.8731348631005254e-05,
"loss": 0.8562,
"mean_token_accuracy": 0.7538999199867249,
"num_tokens": 490666.0,
"step": 990
},
{
"epoch": 0.995,
"grad_norm": 6.53125,
"learning_rate": 1.8699256439656695e-05,
"loss": 0.8333,
"mean_token_accuracy": 0.776654314994812,
"num_tokens": 493700.0,
"step": 995
},
{
"epoch": 1.0,
"grad_norm": 6.53125,
"learning_rate": 1.866679160111564e-05,
"loss": 0.8648,
"mean_token_accuracy": 0.7611707806587219,
"num_tokens": 495684.0,
"step": 1000
},
{
"epoch": 1.005,
"grad_norm": 10.125,
"learning_rate": 1.8633955506067717e-05,
"loss": 0.7187,
"mean_token_accuracy": 0.8037701487541199,
"num_tokens": 497750.0,
"step": 1005
},
{
"epoch": 1.01,
"grad_norm": 12.9375,
"learning_rate": 1.8600749561101947e-05,
"loss": 0.8252,
"mean_token_accuracy": 0.7683161616325378,
"num_tokens": 500015.0,
"step": 1010
},
{
"epoch": 1.015,
"grad_norm": 5.71875,
"learning_rate": 1.85671751886505e-05,
"loss": 0.7824,
"mean_token_accuracy": 0.7723958611488342,
"num_tokens": 502191.0,
"step": 1015
},
{
"epoch": 1.02,
"grad_norm": 11.5,
"learning_rate": 1.853323382692774e-05,
"loss": 0.8636,
"mean_token_accuracy": 0.7786948680877686,
"num_tokens": 504669.0,
"step": 1020
},
{
"epoch": 1.025,
"grad_norm": 6.84375,
"learning_rate": 1.849892692986864e-05,
"loss": 1.0297,
"mean_token_accuracy": 0.7207900285720825,
"num_tokens": 505672.0,
"step": 1025
},
{
"epoch": 1.03,
"grad_norm": 9.5625,
"learning_rate": 1.8464255967066493e-05,
"loss": 0.7061,
"mean_token_accuracy": 0.8118877649307251,
"num_tokens": 508170.0,
"step": 1030
},
{
"epoch": 1.035,
"grad_norm": 6.53125,
"learning_rate": 1.8429222423709946e-05,
"loss": 0.7749,
"mean_token_accuracy": 0.7793939590454102,
"num_tokens": 511561.0,
"step": 1035
},
{
"epoch": 1.04,
"grad_norm": 6.375,
"learning_rate": 1.8393827800519397e-05,
"loss": 0.7721,
"mean_token_accuracy": 0.7689852476119995,
"num_tokens": 514986.0,
"step": 1040
},
{
"epoch": 1.045,
"grad_norm": 25.25,
"learning_rate": 1.8358073613682705e-05,
"loss": 0.8118,
"mean_token_accuracy": 0.7597795128822327,
"num_tokens": 517050.0,
"step": 1045
},
{
"epoch": 1.05,
"grad_norm": 29.125,
"learning_rate": 1.8321961394790227e-05,
"loss": 0.8591,
"mean_token_accuracy": 0.7560444116592407,
"num_tokens": 520299.0,
"step": 1050
},
{
"epoch": 1.055,
"grad_norm": 8.0,
"learning_rate": 1.8285492690769237e-05,
"loss": 0.6946,
"mean_token_accuracy": 0.7862708926200866,
"num_tokens": 523512.0,
"step": 1055
},
{
"epoch": 1.06,
"grad_norm": 5.09375,
"learning_rate": 1.8248669063817636e-05,
"loss": 0.9663,
"mean_token_accuracy": 0.7233885884284973,
"num_tokens": 526506.0,
"step": 1060
},
{
"epoch": 1.065,
"grad_norm": 8.625,
"learning_rate": 1.821149209133704e-05,
"loss": 0.7843,
"mean_token_accuracy": 0.7711453795433044,
"num_tokens": 529422.0,
"step": 1065
},
{
"epoch": 1.07,
"grad_norm": 6.25,
"learning_rate": 1.8173963365865224e-05,
"loss": 0.7622,
"mean_token_accuracy": 0.7758561253547669,
"num_tokens": 532773.0,
"step": 1070
},
{
"epoch": 1.075,
"grad_norm": 7.8125,
"learning_rate": 1.8136084495007874e-05,
"loss": 1.0229,
"mean_token_accuracy": 0.7247954487800599,
"num_tokens": 535103.0,
"step": 1075
},
{
"epoch": 1.08,
"grad_norm": 25.625,
"learning_rate": 1.8097857101369746e-05,
"loss": 0.5869,
"mean_token_accuracy": 0.8586034655570984,
"num_tokens": 537072.0,
"step": 1080
},
{
"epoch": 1.085,
"grad_norm": 17.75,
"learning_rate": 1.805928282248516e-05,
"loss": 0.8882,
"mean_token_accuracy": 0.7501534223556519,
"num_tokens": 538321.0,
"step": 1085
},
{
"epoch": 1.09,
"grad_norm": 30.25,
"learning_rate": 1.8020363310747836e-05,
"loss": 0.9023,
"mean_token_accuracy": 0.7529425621032715,
"num_tokens": 540248.0,
"step": 1090
},
{
"epoch": 1.095,
"grad_norm": 8.4375,
"learning_rate": 1.7981100233340118e-05,
"loss": 1.0241,
"mean_token_accuracy": 0.7149657249450684,
"num_tokens": 542034.0,
"step": 1095
},
{
"epoch": 1.1,
"grad_norm": 9.6875,
"learning_rate": 1.7941495272161566e-05,
"loss": 0.5642,
"mean_token_accuracy": 0.837715458869934,
"num_tokens": 545512.0,
"step": 1100
},
{
"epoch": 1.105,
"grad_norm": 5.09375,
"learning_rate": 1.7901550123756906e-05,
"loss": 0.5522,
"mean_token_accuracy": 0.8279439449310303,
"num_tokens": 549092.0,
"step": 1105
},
{
"epoch": 1.11,
"grad_norm": 5.59375,
"learning_rate": 1.7861266499243345e-05,
"loss": 0.673,
"mean_token_accuracy": 0.809853708744049,
"num_tokens": 553001.0,
"step": 1110
},
{
"epoch": 1.115,
"grad_norm": 4.78125,
"learning_rate": 1.782064612423728e-05,
"loss": 0.6577,
"mean_token_accuracy": 0.8298884153366088,
"num_tokens": 554999.0,
"step": 1115
},
{
"epoch": 1.12,
"grad_norm": 5.59375,
"learning_rate": 1.7779690738780386e-05,
"loss": 0.6141,
"mean_token_accuracy": 0.8177601218223571,
"num_tokens": 558170.0,
"step": 1120
},
{
"epoch": 1.125,
"grad_norm": 9.3125,
"learning_rate": 1.7738402097265063e-05,
"loss": 0.8862,
"mean_token_accuracy": 0.753680431842804,
"num_tokens": 560695.0,
"step": 1125
},
{
"epoch": 1.13,
"grad_norm": 4.09375,
"learning_rate": 1.7696781968359295e-05,
"loss": 0.6387,
"mean_token_accuracy": 0.8356854557991028,
"num_tokens": 563429.0,
"step": 1130
},
{
"epoch": 1.135,
"grad_norm": 6.96875,
"learning_rate": 1.7654832134930885e-05,
"loss": 0.7735,
"mean_token_accuracy": 0.7737359762191772,
"num_tokens": 566588.0,
"step": 1135
},
{
"epoch": 1.1400000000000001,
"grad_norm": 6.5625,
"learning_rate": 1.7612554393971072e-05,
"loss": 0.6943,
"mean_token_accuracy": 0.7929964780807495,
"num_tokens": 567913.0,
"step": 1140
},
{
"epoch": 1.145,
"grad_norm": 8.4375,
"learning_rate": 1.7569950556517566e-05,
"loss": 0.7843,
"mean_token_accuracy": 0.7716325044631958,
"num_tokens": 570225.0,
"step": 1145
},
{
"epoch": 1.15,
"grad_norm": 6.40625,
"learning_rate": 1.752702244757697e-05,
"loss": 0.9425,
"mean_token_accuracy": 0.7387582778930664,
"num_tokens": 572694.0,
"step": 1150
},
{
"epoch": 1.155,
"grad_norm": 7.8125,
"learning_rate": 1.7483771906046604e-05,
"loss": 0.7913,
"mean_token_accuracy": 0.768131959438324,
"num_tokens": 575852.0,
"step": 1155
},
{
"epoch": 1.16,
"grad_norm": 19.625,
"learning_rate": 1.7440200784635702e-05,
"loss": 0.8175,
"mean_token_accuracy": 0.7646706700325012,
"num_tokens": 577552.0,
"step": 1160
},
{
"epoch": 1.165,
"grad_norm": 5.46875,
"learning_rate": 1.73963109497861e-05,
"loss": 0.8025,
"mean_token_accuracy": 0.7692142486572265,
"num_tokens": 580416.0,
"step": 1165
},
{
"epoch": 1.17,
"grad_norm": 11.0625,
"learning_rate": 1.735210428159224e-05,
"loss": 0.6878,
"mean_token_accuracy": 0.7846928119659424,
"num_tokens": 580808.0,
"step": 1170
},
{
"epoch": 1.175,
"grad_norm": 5.46875,
"learning_rate": 1.7307582673720663e-05,
"loss": 0.55,
"mean_token_accuracy": 0.8443691372871399,
"num_tokens": 584652.0,
"step": 1175
},
{
"epoch": 1.18,
"grad_norm": 9.5,
"learning_rate": 1.7262748033328867e-05,
"loss": 0.8979,
"mean_token_accuracy": 0.7402187824249268,
"num_tokens": 586182.0,
"step": 1180
},
{
"epoch": 1.185,
"grad_norm": 9.6875,
"learning_rate": 1.7217602280983622e-05,
"loss": 0.6855,
"mean_token_accuracy": 0.7872085213661194,
"num_tokens": 588297.0,
"step": 1185
},
{
"epoch": 1.19,
"grad_norm": 4.46875,
"learning_rate": 1.717214735057871e-05,
"loss": 0.4546,
"mean_token_accuracy": 0.8626020073890686,
"num_tokens": 591413.0,
"step": 1190
},
{
"epoch": 1.195,
"grad_norm": 9.375,
"learning_rate": 1.7126385189252055e-05,
"loss": 0.8309,
"mean_token_accuracy": 0.7628986001014709,
"num_tokens": 593383.0,
"step": 1195
},
{
"epoch": 1.2,
"grad_norm": 11.375,
"learning_rate": 1.7080317757302346e-05,
"loss": 0.9022,
"mean_token_accuracy": 0.7245154261589051,
"num_tokens": 595420.0,
"step": 1200
},
{
"epoch": 1.205,
"grad_norm": 7.53125,
"learning_rate": 1.703394702810504e-05,
"loss": 1.0703,
"mean_token_accuracy": 0.7057002305984497,
"num_tokens": 597288.0,
"step": 1205
},
{
"epoch": 1.21,
"grad_norm": 5.59375,
"learning_rate": 1.6987274988027844e-05,
"loss": 0.7194,
"mean_token_accuracy": 0.8033593416213989,
"num_tokens": 601071.0,
"step": 1210
},
{
"epoch": 1.215,
"grad_norm": 8.8125,
"learning_rate": 1.694030363634562e-05,
"loss": 0.9448,
"mean_token_accuracy": 0.7666464924812317,
"num_tokens": 602400.0,
"step": 1215
},
{
"epoch": 1.22,
"grad_norm": 5.96875,
"learning_rate": 1.6893034985154736e-05,
"loss": 0.5014,
"mean_token_accuracy": 0.8425140500068664,
"num_tokens": 606273.0,
"step": 1220
},
{
"epoch": 1.225,
"grad_norm": 5.71875,
"learning_rate": 1.684547105928689e-05,
"loss": 0.8279,
"mean_token_accuracy": 0.7773316502571106,
"num_tokens": 607875.0,
"step": 1225
},
{
"epoch": 1.23,
"grad_norm": 6.125,
"learning_rate": 1.6797613896222362e-05,
"loss": 0.7825,
"mean_token_accuracy": 0.7865223050117492,
"num_tokens": 610323.0,
"step": 1230
},
{
"epoch": 1.2349999999999999,
"grad_norm": 8.5,
"learning_rate": 1.6749465546002734e-05,
"loss": 0.6528,
"mean_token_accuracy": 0.7994057536125183,
"num_tokens": 613556.0,
"step": 1235
},
{
"epoch": 1.24,
"grad_norm": 5.6875,
"learning_rate": 1.6701028071143078e-05,
"loss": 0.6212,
"mean_token_accuracy": 0.8158914804458618,
"num_tokens": 616500.0,
"step": 1240
},
{
"epoch": 1.245,
"grad_norm": 22.5,
"learning_rate": 1.665230354654361e-05,
"loss": 0.6869,
"mean_token_accuracy": 0.8139790058135986,
"num_tokens": 618250.0,
"step": 1245
},
{
"epoch": 1.25,
"grad_norm": 21.75,
"learning_rate": 1.6603294059400792e-05,
"loss": 0.6546,
"mean_token_accuracy": 0.8378836750984192,
"num_tokens": 619652.0,
"step": 1250
},
{
"epoch": 1.255,
"grad_norm": 7.65625,
"learning_rate": 1.655400170911794e-05,
"loss": 0.9698,
"mean_token_accuracy": 0.7234090209007263,
"num_tokens": 621675.0,
"step": 1255
},
{
"epoch": 1.26,
"grad_norm": 27.875,
"learning_rate": 1.6504428607215278e-05,
"loss": 0.6869,
"mean_token_accuracy": 0.7933684349060058,
"num_tokens": 624935.0,
"step": 1260
},
{
"epoch": 1.2650000000000001,
"grad_norm": 4.6875,
"learning_rate": 1.645457687723951e-05,
"loss": 0.7998,
"mean_token_accuracy": 0.7805917382240295,
"num_tokens": 627211.0,
"step": 1265
},
{
"epoch": 1.27,
"grad_norm": 47.25,
"learning_rate": 1.640444865467281e-05,
"loss": 0.914,
"mean_token_accuracy": 0.7634903073310852,
"num_tokens": 630413.0,
"step": 1270
},
{
"epoch": 1.275,
"grad_norm": 7.40625,
"learning_rate": 1.635404608684141e-05,
"loss": 1.1762,
"mean_token_accuracy": 0.7183934211730957,
"num_tokens": 632050.0,
"step": 1275
},
{
"epoch": 1.28,
"grad_norm": 23.375,
"learning_rate": 1.630337133282356e-05,
"loss": 0.9415,
"mean_token_accuracy": 0.7469957709312439,
"num_tokens": 633415.0,
"step": 1280
},
{
"epoch": 1.285,
"grad_norm": 25.25,
"learning_rate": 1.6252426563357054e-05,
"loss": 0.8475,
"mean_token_accuracy": 0.7693663716316224,
"num_tokens": 635732.0,
"step": 1285
},
{
"epoch": 1.29,
"grad_norm": 5.78125,
"learning_rate": 1.6201213960746265e-05,
"loss": 0.6118,
"mean_token_accuracy": 0.8120120167732239,
"num_tokens": 638295.0,
"step": 1290
},
{
"epoch": 1.295,
"grad_norm": 7.84375,
"learning_rate": 1.6149735718768643e-05,
"loss": 0.8979,
"mean_token_accuracy": 0.7418479681015014,
"num_tokens": 640676.0,
"step": 1295
},
{
"epoch": 1.3,
"grad_norm": 24.25,
"learning_rate": 1.609799404258074e-05,
"loss": 1.347,
"mean_token_accuracy": 0.645957636833191,
"num_tokens": 642328.0,
"step": 1300
},
{
"epoch": 1.305,
"grad_norm": 5.28125,
"learning_rate": 1.6045991148623752e-05,
"loss": 0.7518,
"mean_token_accuracy": 0.785522711277008,
"num_tokens": 645217.0,
"step": 1305
},
{
"epoch": 1.31,
"grad_norm": 8.6875,
"learning_rate": 1.5993729264528574e-05,
"loss": 0.7075,
"mean_token_accuracy": 0.7937332510948181,
"num_tokens": 647392.0,
"step": 1310
},
{
"epoch": 1.315,
"grad_norm": 8.375,
"learning_rate": 1.594121062902039e-05,
"loss": 0.8125,
"mean_token_accuracy": 0.7603193163871765,
"num_tokens": 649437.0,
"step": 1315
},
{
"epoch": 1.32,
"grad_norm": 7.34375,
"learning_rate": 1.5888437491822735e-05,
"loss": 0.7205,
"mean_token_accuracy": 0.7862048745155334,
"num_tokens": 651137.0,
"step": 1320
},
{
"epoch": 1.325,
"grad_norm": 8.9375,
"learning_rate": 1.5835412113561176e-05,
"loss": 0.809,
"mean_token_accuracy": 0.777035117149353,
"num_tokens": 654612.0,
"step": 1325
},
{
"epoch": 1.33,
"grad_norm": 9.0,
"learning_rate": 1.578213676566643e-05,
"loss": 0.6462,
"mean_token_accuracy": 0.8325076103210449,
"num_tokens": 656903.0,
"step": 1330
},
{
"epoch": 1.335,
"grad_norm": 7.75,
"learning_rate": 1.572861373027709e-05,
"loss": 0.8659,
"mean_token_accuracy": 0.7543999433517456,
"num_tokens": 659625.0,
"step": 1335
},
{
"epoch": 1.34,
"grad_norm": 7.90625,
"learning_rate": 1.5674845300141853e-05,
"loss": 0.7714,
"mean_token_accuracy": 0.7812225937843322,
"num_tokens": 661962.0,
"step": 1340
},
{
"epoch": 1.345,
"grad_norm": 20.125,
"learning_rate": 1.5620833778521306e-05,
"loss": 0.7595,
"mean_token_accuracy": 0.7850270628929138,
"num_tokens": 664595.0,
"step": 1345
},
{
"epoch": 1.35,
"grad_norm": 8.3125,
"learning_rate": 1.5566581479089278e-05,
"loss": 0.5983,
"mean_token_accuracy": 0.8208136677742004,
"num_tokens": 668641.0,
"step": 1350
},
{
"epoch": 1.355,
"grad_norm": 20.0,
"learning_rate": 1.5512090725833706e-05,
"loss": 0.5589,
"mean_token_accuracy": 0.8399159669876098,
"num_tokens": 670717.0,
"step": 1355
},
{
"epoch": 1.3599999999999999,
"grad_norm": 4.90625,
"learning_rate": 1.54573638529571e-05,
"loss": 1.1796,
"mean_token_accuracy": 0.6812988758087158,
"num_tokens": 673121.0,
"step": 1360
},
{
"epoch": 1.365,
"grad_norm": 4.28125,
"learning_rate": 1.5402403204776552e-05,
"loss": 0.7475,
"mean_token_accuracy": 0.7796531915664673,
"num_tokens": 676221.0,
"step": 1365
},
{
"epoch": 1.37,
"grad_norm": 41.0,
"learning_rate": 1.5347211135623305e-05,
"loss": 1.0734,
"mean_token_accuracy": 0.7397149443626404,
"num_tokens": 679178.0,
"step": 1370
},
{
"epoch": 1.375,
"grad_norm": 4.6875,
"learning_rate": 1.5291790009741906e-05,
"loss": 0.9777,
"mean_token_accuracy": 0.7575628995895386,
"num_tokens": 681713.0,
"step": 1375
},
{
"epoch": 1.38,
"grad_norm": 7.75,
"learning_rate": 1.5236142201188937e-05,
"loss": 0.9092,
"mean_token_accuracy": 0.7369059562683106,
"num_tokens": 684554.0,
"step": 1380
},
{
"epoch": 1.385,
"grad_norm": 26.125,
"learning_rate": 1.5180270093731305e-05,
"loss": 0.6692,
"mean_token_accuracy": 0.816079044342041,
"num_tokens": 687204.0,
"step": 1385
},
{
"epoch": 1.3900000000000001,
"grad_norm": 31.5,
"learning_rate": 1.5124176080744133e-05,
"loss": 0.8812,
"mean_token_accuracy": 0.7615653157234192,
"num_tokens": 688872.0,
"step": 1390
},
{
"epoch": 1.395,
"grad_norm": 8.0,
"learning_rate": 1.5067862565108242e-05,
"loss": 0.6465,
"mean_token_accuracy": 0.8142944216728211,
"num_tokens": 691813.0,
"step": 1395
},
{
"epoch": 1.4,
"grad_norm": 33.0,
"learning_rate": 1.5011331959107218e-05,
"loss": 0.9764,
"mean_token_accuracy": 0.7363247156143189,
"num_tokens": 693259.0,
"step": 1400
},
{
"epoch": 1.405,
"grad_norm": 8.8125,
"learning_rate": 1.4954586684324077e-05,
"loss": 0.6891,
"mean_token_accuracy": 0.7893139243125915,
"num_tokens": 695056.0,
"step": 1405
},
{
"epoch": 1.41,
"grad_norm": 8.6875,
"learning_rate": 1.4897629171537522e-05,
"loss": 1.0266,
"mean_token_accuracy": 0.718493127822876,
"num_tokens": 696392.0,
"step": 1410
},
{
"epoch": 1.415,
"grad_norm": 8.375,
"learning_rate": 1.4840461860617834e-05,
"loss": 0.7586,
"mean_token_accuracy": 0.7843063712120056,
"num_tokens": 699830.0,
"step": 1415
},
{
"epoch": 1.42,
"grad_norm": 6.0625,
"learning_rate": 1.4783087200422346e-05,
"loss": 0.6742,
"mean_token_accuracy": 0.7995662927627564,
"num_tokens": 703132.0,
"step": 1420
},
{
"epoch": 1.425,
"grad_norm": 4.84375,
"learning_rate": 1.4725507648690542e-05,
"loss": 0.832,
"mean_token_accuracy": 0.7495120525360107,
"num_tokens": 705829.0,
"step": 1425
},
{
"epoch": 1.43,
"grad_norm": 36.75,
"learning_rate": 1.4667725671938777e-05,
"loss": 0.8103,
"mean_token_accuracy": 0.7845470070838928,
"num_tokens": 708529.0,
"step": 1430
},
{
"epoch": 1.435,
"grad_norm": 7.9375,
"learning_rate": 1.4609743745354625e-05,
"loss": 0.6037,
"mean_token_accuracy": 0.8083573698997497,
"num_tokens": 711859.0,
"step": 1435
},
{
"epoch": 1.44,
"grad_norm": 4.59375,
"learning_rate": 1.455156435269084e-05,
"loss": 0.4976,
"mean_token_accuracy": 0.8688451528549195,
"num_tokens": 715456.0,
"step": 1440
},
{
"epoch": 1.445,
"grad_norm": 6.1875,
"learning_rate": 1.4493189986158966e-05,
"loss": 0.7662,
"mean_token_accuracy": 0.7678531765937805,
"num_tokens": 718603.0,
"step": 1445
},
{
"epoch": 1.45,
"grad_norm": 4.90625,
"learning_rate": 1.4434623146322585e-05,
"loss": 0.7864,
"mean_token_accuracy": 0.7857390880584717,
"num_tokens": 721618.0,
"step": 1450
},
{
"epoch": 1.455,
"grad_norm": 4.75,
"learning_rate": 1.4375866341990187e-05,
"loss": 0.8562,
"mean_token_accuracy": 0.7766207218170166,
"num_tokens": 723473.0,
"step": 1455
},
{
"epoch": 1.46,
"grad_norm": 8.625,
"learning_rate": 1.4316922090107712e-05,
"loss": 0.8474,
"mean_token_accuracy": 0.7638363718986512,
"num_tokens": 726150.0,
"step": 1460
},
{
"epoch": 1.465,
"grad_norm": 18.125,
"learning_rate": 1.4257792915650728e-05,
"loss": 0.8561,
"mean_token_accuracy": 0.7815182447433472,
"num_tokens": 727068.0,
"step": 1465
},
{
"epoch": 1.47,
"grad_norm": 3.78125,
"learning_rate": 1.4198481351516274e-05,
"loss": 0.6429,
"mean_token_accuracy": 0.8231713652610779,
"num_tokens": 729654.0,
"step": 1470
},
{
"epoch": 1.475,
"grad_norm": 7.1875,
"learning_rate": 1.413898993841435e-05,
"loss": 1.1566,
"mean_token_accuracy": 0.6975414037704468,
"num_tokens": 731546.0,
"step": 1475
},
{
"epoch": 1.48,
"grad_norm": 7.6875,
"learning_rate": 1.4079321224759093e-05,
"loss": 0.6556,
"mean_token_accuracy": 0.8056102156639099,
"num_tokens": 734532.0,
"step": 1480
},
{
"epoch": 1.4849999999999999,
"grad_norm": 24.0,
"learning_rate": 1.4019477766559604e-05,
"loss": 0.6731,
"mean_token_accuracy": 0.7824649691581727,
"num_tokens": 736766.0,
"step": 1485
},
{
"epoch": 1.49,
"grad_norm": 9.1875,
"learning_rate": 1.3959462127310455e-05,
"loss": 0.863,
"mean_token_accuracy": 0.7568913578987122,
"num_tokens": 738963.0,
"step": 1490
},
{
"epoch": 1.495,
"grad_norm": 18.5,
"learning_rate": 1.3899276877881884e-05,
"loss": 0.6798,
"mean_token_accuracy": 0.8145083069801331,
"num_tokens": 740735.0,
"step": 1495
},
{
"epoch": 1.5,
"grad_norm": 21.625,
"learning_rate": 1.3838924596409669e-05,
"loss": 0.7044,
"mean_token_accuracy": 0.788611114025116,
"num_tokens": 743437.0,
"step": 1500
},
{
"epoch": 1.505,
"grad_norm": 7.3125,
"learning_rate": 1.3778407868184674e-05,
"loss": 0.6833,
"mean_token_accuracy": 0.7906283736228943,
"num_tokens": 746683.0,
"step": 1505
},
{
"epoch": 1.51,
"grad_norm": 5.9375,
"learning_rate": 1.3717729285542123e-05,
"loss": 0.8181,
"mean_token_accuracy": 0.8152880191802978,
"num_tokens": 748460.0,
"step": 1510
},
{
"epoch": 1.5150000000000001,
"grad_norm": 7.59375,
"learning_rate": 1.3656891447750544e-05,
"loss": 0.6501,
"mean_token_accuracy": 0.8229759573936463,
"num_tokens": 750909.0,
"step": 1515
},
{
"epoch": 1.52,
"grad_norm": 23.25,
"learning_rate": 1.3595896960900424e-05,
"loss": 0.6641,
"mean_token_accuracy": 0.7920986771583557,
"num_tokens": 753135.0,
"step": 1520
},
{
"epoch": 1.525,
"grad_norm": 56.5,
"learning_rate": 1.3534748437792573e-05,
"loss": 0.9268,
"mean_token_accuracy": 0.7407203435897827,
"num_tokens": 754912.0,
"step": 1525
},
{
"epoch": 1.53,
"grad_norm": 5.21875,
"learning_rate": 1.3473448497826203e-05,
"loss": 0.7019,
"mean_token_accuracy": 0.8039157390594482,
"num_tokens": 757369.0,
"step": 1530
},
{
"epoch": 1.5350000000000001,
"grad_norm": 7.375,
"learning_rate": 1.341199976688672e-05,
"loss": 0.9554,
"mean_token_accuracy": 0.7373032093048095,
"num_tokens": 761380.0,
"step": 1535
},
{
"epoch": 1.54,
"grad_norm": 26.75,
"learning_rate": 1.335040487723324e-05,
"loss": 0.7944,
"mean_token_accuracy": 0.7538552761077881,
"num_tokens": 763096.0,
"step": 1540
},
{
"epoch": 1.545,
"grad_norm": 25.75,
"learning_rate": 1.3288666467385834e-05,
"loss": 0.7346,
"mean_token_accuracy": 0.7893040895462036,
"num_tokens": 766281.0,
"step": 1545
},
{
"epoch": 1.55,
"grad_norm": 3.734375,
"learning_rate": 1.3226787182012494e-05,
"loss": 0.5305,
"mean_token_accuracy": 0.8514875173568726,
"num_tokens": 768598.0,
"step": 1550
},
{
"epoch": 1.5550000000000002,
"grad_norm": 14.0625,
"learning_rate": 1.3164769671815862e-05,
"loss": 0.707,
"mean_token_accuracy": 0.8167065143585205,
"num_tokens": 770795.0,
"step": 1555
},
{
"epoch": 1.56,
"grad_norm": 7.28125,
"learning_rate": 1.310261659341966e-05,
"loss": 0.6754,
"mean_token_accuracy": 0.8203782916069031,
"num_tokens": 774051.0,
"step": 1560
},
{
"epoch": 1.565,
"grad_norm": 7.34375,
"learning_rate": 1.3040330609254903e-05,
"loss": 0.5884,
"mean_token_accuracy": 0.8256639838218689,
"num_tokens": 776466.0,
"step": 1565
},
{
"epoch": 1.5699999999999998,
"grad_norm": 8.0,
"learning_rate": 1.2977914387445855e-05,
"loss": 0.8225,
"mean_token_accuracy": 0.7328911781311035,
"num_tokens": 778001.0,
"step": 1570
},
{
"epoch": 1.575,
"grad_norm": 6.90625,
"learning_rate": 1.2915370601695715e-05,
"loss": 0.7298,
"mean_token_accuracy": 0.7988103628158569,
"num_tokens": 781046.0,
"step": 1575
},
{
"epoch": 1.58,
"grad_norm": 6.40625,
"learning_rate": 1.2852701931172105e-05,
"loss": 0.934,
"mean_token_accuracy": 0.7509458780288696,
"num_tokens": 782879.0,
"step": 1580
},
{
"epoch": 1.585,
"grad_norm": 6.8125,
"learning_rate": 1.2789911060392295e-05,
"loss": 0.5838,
"mean_token_accuracy": 0.8262917876243592,
"num_tokens": 786507.0,
"step": 1585
},
{
"epoch": 1.5899999999999999,
"grad_norm": 10.25,
"learning_rate": 1.2727000679108198e-05,
"loss": 0.6244,
"mean_token_accuracy": 0.8335184097290039,
"num_tokens": 788829.0,
"step": 1590
},
{
"epoch": 1.595,
"grad_norm": 4.96875,
"learning_rate": 1.2663973482191177e-05,
"loss": 0.5816,
"mean_token_accuracy": 0.8350133657455444,
"num_tokens": 792924.0,
"step": 1595
},
{
"epoch": 1.6,
"grad_norm": 6.4375,
"learning_rate": 1.2600832169516569e-05,
"loss": 0.9038,
"mean_token_accuracy": 0.7410524249076843,
"num_tokens": 795525.0,
"step": 1600
},
{
"epoch": 1.605,
"grad_norm": 8.0,
"learning_rate": 1.2537579445848058e-05,
"loss": 0.7561,
"mean_token_accuracy": 0.7771415114402771,
"num_tokens": 797845.0,
"step": 1605
},
{
"epoch": 1.6099999999999999,
"grad_norm": 5.34375,
"learning_rate": 1.2474218020721808e-05,
"loss": 0.8331,
"mean_token_accuracy": 0.7366581082344055,
"num_tokens": 799764.0,
"step": 1610
},
{
"epoch": 1.615,
"grad_norm": 8.25,
"learning_rate": 1.2410750608330389e-05,
"loss": 0.7238,
"mean_token_accuracy": 0.805954110622406,
"num_tokens": 801318.0,
"step": 1615
},
{
"epoch": 1.62,
"grad_norm": 5.34375,
"learning_rate": 1.234717992740651e-05,
"loss": 0.5966,
"mean_token_accuracy": 0.846554183959961,
"num_tokens": 803034.0,
"step": 1620
},
{
"epoch": 1.625,
"grad_norm": 21.75,
"learning_rate": 1.2283508701106559e-05,
"loss": 0.9917,
"mean_token_accuracy": 0.734014344215393,
"num_tokens": 804647.0,
"step": 1625
},
{
"epoch": 1.63,
"grad_norm": 7.71875,
"learning_rate": 1.221973965689396e-05,
"loss": 0.6353,
"mean_token_accuracy": 0.8209498286247253,
"num_tokens": 807462.0,
"step": 1630
},
{
"epoch": 1.635,
"grad_norm": 17.875,
"learning_rate": 1.2155875526422332e-05,
"loss": 0.6549,
"mean_token_accuracy": 0.8138946652412414,
"num_tokens": 810426.0,
"step": 1635
},
{
"epoch": 1.6400000000000001,
"grad_norm": 8.125,
"learning_rate": 1.2091919045418456e-05,
"loss": 1.0867,
"mean_token_accuracy": 0.7125813007354737,
"num_tokens": 813337.0,
"step": 1640
},
{
"epoch": 1.645,
"grad_norm": 4.78125,
"learning_rate": 1.2027872953565125e-05,
"loss": 0.8961,
"mean_token_accuracy": 0.7431066751480102,
"num_tokens": 815299.0,
"step": 1645
},
{
"epoch": 1.65,
"grad_norm": 7.46875,
"learning_rate": 1.1963739994383751e-05,
"loss": 0.8284,
"mean_token_accuracy": 0.7688675642013549,
"num_tokens": 818263.0,
"step": 1650
},
{
"epoch": 1.655,
"grad_norm": 7.84375,
"learning_rate": 1.1899522915116848e-05,
"loss": 0.5926,
"mean_token_accuracy": 0.8393247485160827,
"num_tokens": 821340.0,
"step": 1655
},
{
"epoch": 1.6600000000000001,
"grad_norm": 6.375,
"learning_rate": 1.1835224466610366e-05,
"loss": 0.6788,
"mean_token_accuracy": 0.8024531245231629,
"num_tokens": 823905.0,
"step": 1660
},
{
"epoch": 1.665,
"grad_norm": 5.03125,
"learning_rate": 1.1770847403195836e-05,
"loss": 0.7675,
"mean_token_accuracy": 0.7753681302070617,
"num_tokens": 825154.0,
"step": 1665
},
{
"epoch": 1.67,
"grad_norm": 31.5,
"learning_rate": 1.1706394482572389e-05,
"loss": 0.9249,
"mean_token_accuracy": 0.7288736701011658,
"num_tokens": 827410.0,
"step": 1670
},
{
"epoch": 1.675,
"grad_norm": 6.90625,
"learning_rate": 1.164186846568863e-05,
"loss": 0.7804,
"mean_token_accuracy": 0.7748052835464477,
"num_tokens": 829865.0,
"step": 1675
},
{
"epoch": 1.6800000000000002,
"grad_norm": 16.375,
"learning_rate": 1.1577272116624365e-05,
"loss": 0.6965,
"mean_token_accuracy": 0.7901236653327942,
"num_tokens": 831718.0,
"step": 1680
},
{
"epoch": 1.685,
"grad_norm": 8.4375,
"learning_rate": 1.1512608202472195e-05,
"loss": 0.6891,
"mean_token_accuracy": 0.8099049091339111,
"num_tokens": 834931.0,
"step": 1685
},
{
"epoch": 1.69,
"grad_norm": 12.8125,
"learning_rate": 1.144787949321898e-05,
"loss": 0.6237,
"mean_token_accuracy": 0.8211381077766419,
"num_tokens": 835848.0,
"step": 1690
},
{
"epoch": 1.6949999999999998,
"grad_norm": 7.25,
"learning_rate": 1.1383088761627193e-05,
"loss": 0.9123,
"mean_token_accuracy": 0.7527759313583374,
"num_tokens": 839009.0,
"step": 1695
},
{
"epoch": 1.7,
"grad_norm": 8.0,
"learning_rate": 1.131823878311613e-05,
"loss": 0.8947,
"mean_token_accuracy": 0.7173206686973572,
"num_tokens": 840767.0,
"step": 1700
},
{
"epoch": 1.705,
"grad_norm": 8.6875,
"learning_rate": 1.1253332335643043e-05,
"loss": 0.8703,
"mean_token_accuracy": 0.7573009014129639,
"num_tokens": 842188.0,
"step": 1705
},
{
"epoch": 1.71,
"grad_norm": 7.53125,
"learning_rate": 1.118837219958411e-05,
"loss": 0.8249,
"mean_token_accuracy": 0.7549278020858765,
"num_tokens": 845037.0,
"step": 1710
},
{
"epoch": 1.7149999999999999,
"grad_norm": 7.875,
"learning_rate": 1.1123361157615355e-05,
"loss": 0.5855,
"mean_token_accuracy": 0.8285978078842163,
"num_tokens": 847057.0,
"step": 1715
},
{
"epoch": 1.72,
"grad_norm": 6.75,
"learning_rate": 1.1058301994593447e-05,
"loss": 0.9877,
"mean_token_accuracy": 0.7238339900970459,
"num_tokens": 850125.0,
"step": 1720
},
{
"epoch": 1.725,
"grad_norm": 5.53125,
"learning_rate": 1.0993197497436392e-05,
"loss": 0.7891,
"mean_token_accuracy": 0.7663105010986329,
"num_tokens": 852644.0,
"step": 1725
},
{
"epoch": 1.73,
"grad_norm": 21.0,
"learning_rate": 1.0928050455004164e-05,
"loss": 0.6967,
"mean_token_accuracy": 0.8234347224235534,
"num_tokens": 853429.0,
"step": 1730
},
{
"epoch": 1.7349999999999999,
"grad_norm": 8.6875,
"learning_rate": 1.0862863657979237e-05,
"loss": 0.9785,
"mean_token_accuracy": 0.7190654397010803,
"num_tokens": 855823.0,
"step": 1735
},
{
"epoch": 1.74,
"grad_norm": 31.75,
"learning_rate": 1.0797639898747033e-05,
"loss": 0.7938,
"mean_token_accuracy": 0.7673383355140686,
"num_tokens": 857885.0,
"step": 1740
},
{
"epoch": 1.745,
"grad_norm": 7.03125,
"learning_rate": 1.0732381971276318e-05,
"loss": 0.7509,
"mean_token_accuracy": 0.7781156420707702,
"num_tokens": 860114.0,
"step": 1745
},
{
"epoch": 1.75,
"grad_norm": 4.75,
"learning_rate": 1.0667092670999512e-05,
"loss": 0.8932,
"mean_token_accuracy": 0.7806082010269165,
"num_tokens": 863726.0,
"step": 1750
},
{
"epoch": 1.755,
"grad_norm": 8.8125,
"learning_rate": 1.0601774794692936e-05,
"loss": 0.5887,
"mean_token_accuracy": 0.8234007000923157,
"num_tokens": 867574.0,
"step": 1755
},
{
"epoch": 1.76,
"grad_norm": 13.125,
"learning_rate": 1.0536431140357018e-05,
"loss": 0.8601,
"mean_token_accuracy": 0.7545054912567138,
"num_tokens": 869234.0,
"step": 1760
},
{
"epoch": 1.7650000000000001,
"grad_norm": 23.0,
"learning_rate": 1.0471064507096427e-05,
"loss": 0.8858,
"mean_token_accuracy": 0.7431819677352905,
"num_tokens": 871713.0,
"step": 1765
},
{
"epoch": 1.77,
"grad_norm": 5.53125,
"learning_rate": 1.040567769500018e-05,
"loss": 0.6333,
"mean_token_accuracy": 0.8191001772880554,
"num_tokens": 875192.0,
"step": 1770
},
{
"epoch": 1.775,
"grad_norm": 28.125,
"learning_rate": 1.0340273505021675e-05,
"loss": 0.7526,
"mean_token_accuracy": 0.7924058437347412,
"num_tokens": 878090.0,
"step": 1775
},
{
"epoch": 1.78,
"grad_norm": 20.625,
"learning_rate": 1.0274854738858735e-05,
"loss": 0.8469,
"mean_token_accuracy": 0.7690641760826111,
"num_tokens": 880115.0,
"step": 1780
},
{
"epoch": 1.7850000000000001,
"grad_norm": 20.5,
"learning_rate": 1.0209424198833571e-05,
"loss": 0.9089,
"mean_token_accuracy": 0.7421629071235657,
"num_tokens": 881569.0,
"step": 1785
},
{
"epoch": 1.79,
"grad_norm": 30.25,
"learning_rate": 1.0143984687772746e-05,
"loss": 0.9561,
"mean_token_accuracy": 0.7445757389068604,
"num_tokens": 883754.0,
"step": 1790
},
{
"epoch": 1.795,
"grad_norm": 5.09375,
"learning_rate": 1.0078539008887114e-05,
"loss": 0.6389,
"mean_token_accuracy": 0.81376873254776,
"num_tokens": 887475.0,
"step": 1795
},
{
"epoch": 1.8,
"grad_norm": 7.21875,
"learning_rate": 1.0013089965651743e-05,
"loss": 0.7537,
"mean_token_accuracy": 0.7766026139259339,
"num_tokens": 890091.0,
"step": 1800
},
{
"epoch": 1.8050000000000002,
"grad_norm": 17.75,
"learning_rate": 9.947640361685805e-06,
"loss": 0.6551,
"mean_token_accuracy": 0.827604067325592,
"num_tokens": 892332.0,
"step": 1805
},
{
"epoch": 1.81,
"grad_norm": 5.625,
"learning_rate": 9.882193000632507e-06,
"loss": 0.5868,
"mean_token_accuracy": 0.830898129940033,
"num_tokens": 895800.0,
"step": 1810
},
{
"epoch": 1.815,
"grad_norm": 8.0625,
"learning_rate": 9.816750686038974e-06,
"loss": 0.8278,
"mean_token_accuracy": 0.7838187098503113,
"num_tokens": 897036.0,
"step": 1815
},
{
"epoch": 1.8199999999999998,
"grad_norm": 5.09375,
"learning_rate": 9.751316221236147e-06,
"loss": 0.6766,
"mean_token_accuracy": 0.7844752073287964,
"num_tokens": 900013.0,
"step": 1820
},
{
"epoch": 1.825,
"grad_norm": 5.15625,
"learning_rate": 9.685892409218718e-06,
"loss": 0.5658,
"mean_token_accuracy": 0.8363893985748291,
"num_tokens": 903991.0,
"step": 1825
},
{
"epoch": 1.83,
"grad_norm": 11.625,
"learning_rate": 9.620482052525048e-06,
"loss": 0.7384,
"mean_token_accuracy": 0.7787490963935852,
"num_tokens": 905860.0,
"step": 1830
},
{
"epoch": 1.835,
"grad_norm": 5.53125,
"learning_rate": 9.555087953117114e-06,
"loss": 0.8144,
"mean_token_accuracy": 0.762047529220581,
"num_tokens": 909342.0,
"step": 1835
},
{
"epoch": 1.8399999999999999,
"grad_norm": 6.53125,
"learning_rate": 9.489712912260486e-06,
"loss": 0.894,
"mean_token_accuracy": 0.7327398538589478,
"num_tokens": 911671.0,
"step": 1840
},
{
"epoch": 1.845,
"grad_norm": 32.75,
"learning_rate": 9.424359730404329e-06,
"loss": 1.1982,
"mean_token_accuracy": 0.6791368842124939,
"num_tokens": 912586.0,
"step": 1845
},
{
"epoch": 1.85,
"grad_norm": 7.84375,
"learning_rate": 9.359031207061442e-06,
"loss": 0.651,
"mean_token_accuracy": 0.8372863888740539,
"num_tokens": 915696.0,
"step": 1850
},
{
"epoch": 1.855,
"grad_norm": 6.25,
"learning_rate": 9.293730140688336e-06,
"loss": 0.9395,
"mean_token_accuracy": 0.7412417650222778,
"num_tokens": 917503.0,
"step": 1855
},
{
"epoch": 1.8599999999999999,
"grad_norm": 6.875,
"learning_rate": 9.228459328565354e-06,
"loss": 1.136,
"mean_token_accuracy": 0.7111948132514954,
"num_tokens": 918864.0,
"step": 1860
},
{
"epoch": 1.865,
"grad_norm": 4.65625,
"learning_rate": 9.163221566676847e-06,
"loss": 0.8892,
"mean_token_accuracy": 0.7210251212120056,
"num_tokens": 920527.0,
"step": 1865
},
{
"epoch": 1.87,
"grad_norm": 8.0625,
"learning_rate": 9.098019649591409e-06,
"loss": 0.7635,
"mean_token_accuracy": 0.7711951851844787,
"num_tokens": 923776.0,
"step": 1870
},
{
"epoch": 1.875,
"grad_norm": 4.4375,
"learning_rate": 9.032856370342158e-06,
"loss": 0.5589,
"mean_token_accuracy": 0.8322081685066223,
"num_tokens": 927187.0,
"step": 1875
},
{
"epoch": 1.88,
"grad_norm": 27.0,
"learning_rate": 8.967734520307102e-06,
"loss": 0.6424,
"mean_token_accuracy": 0.8095461368560791,
"num_tokens": 929400.0,
"step": 1880
},
{
"epoch": 1.885,
"grad_norm": 7.875,
"learning_rate": 8.902656889089548e-06,
"loss": 0.7556,
"mean_token_accuracy": 0.7700052261352539,
"num_tokens": 932708.0,
"step": 1885
},
{
"epoch": 1.8900000000000001,
"grad_norm": 18.75,
"learning_rate": 8.837626264398623e-06,
"loss": 0.8495,
"mean_token_accuracy": 0.7746875047683716,
"num_tokens": 934905.0,
"step": 1890
},
{
"epoch": 1.895,
"grad_norm": 8.0,
"learning_rate": 8.772645431929851e-06,
"loss": 0.8468,
"mean_token_accuracy": 0.7573032855987549,
"num_tokens": 936953.0,
"step": 1895
},
{
"epoch": 1.9,
"grad_norm": 6.15625,
"learning_rate": 8.707717175245826e-06,
"loss": 0.735,
"mean_token_accuracy": 0.7652702808380127,
"num_tokens": 939323.0,
"step": 1900
},
{
"epoch": 1.905,
"grad_norm": 7.90625,
"learning_rate": 8.642844275656957e-06,
"loss": 0.8166,
"mean_token_accuracy": 0.7702357769012451,
"num_tokens": 941802.0,
"step": 1905
},
{
"epoch": 1.9100000000000001,
"grad_norm": 24.375,
"learning_rate": 8.578029512102357e-06,
"loss": 0.6977,
"mean_token_accuracy": 0.7843759894371033,
"num_tokens": 944128.0,
"step": 1910
},
{
"epoch": 1.915,
"grad_norm": 6.625,
"learning_rate": 8.51327566103077e-06,
"loss": 0.7186,
"mean_token_accuracy": 0.7871297121047973,
"num_tokens": 947149.0,
"step": 1915
},
{
"epoch": 1.92,
"grad_norm": 7.9375,
"learning_rate": 8.448585496281667e-06,
"loss": 0.8985,
"mean_token_accuracy": 0.7248257637023926,
"num_tokens": 950716.0,
"step": 1920
},
{
"epoch": 1.925,
"grad_norm": 13.8125,
"learning_rate": 8.38396178896639e-06,
"loss": 0.8082,
"mean_token_accuracy": 0.7705564975738526,
"num_tokens": 952299.0,
"step": 1925
},
{
"epoch": 1.9300000000000002,
"grad_norm": 13.4375,
"learning_rate": 8.319407307349482e-06,
"loss": 0.7754,
"mean_token_accuracy": 0.7596949458122253,
"num_tokens": 954737.0,
"step": 1930
},
{
"epoch": 1.935,
"grad_norm": 5.15625,
"learning_rate": 8.254924816730083e-06,
"loss": 0.7946,
"mean_token_accuracy": 0.7884817004203797,
"num_tokens": 956701.0,
"step": 1935
},
{
"epoch": 1.94,
"grad_norm": 38.25,
"learning_rate": 8.190517079323472e-06,
"loss": 0.9175,
"mean_token_accuracy": 0.7576337218284607,
"num_tokens": 959914.0,
"step": 1940
},
{
"epoch": 1.9449999999999998,
"grad_norm": 27.125,
"learning_rate": 8.126186854142752e-06,
"loss": 0.5564,
"mean_token_accuracy": 0.8395720958709717,
"num_tokens": 962779.0,
"step": 1945
},
{
"epoch": 1.95,
"grad_norm": 17.375,
"learning_rate": 8.061936896880674e-06,
"loss": 0.6506,
"mean_token_accuracy": 0.8393109798431396,
"num_tokens": 964649.0,
"step": 1950
},
{
"epoch": 1.955,
"grad_norm": 13.75,
"learning_rate": 7.997769959791554e-06,
"loss": 0.8628,
"mean_token_accuracy": 0.7746016979217529,
"num_tokens": 966716.0,
"step": 1955
},
{
"epoch": 1.96,
"grad_norm": 6.4375,
"learning_rate": 7.933688791573418e-06,
"loss": 0.6913,
"mean_token_accuracy": 0.7971544742584229,
"num_tokens": 969874.0,
"step": 1960
},
{
"epoch": 1.9649999999999999,
"grad_norm": 5.1875,
"learning_rate": 7.869696137250235e-06,
"loss": 0.5946,
"mean_token_accuracy": 0.8192303895950317,
"num_tokens": 972170.0,
"step": 1965
},
{
"epoch": 1.97,
"grad_norm": 18.875,
"learning_rate": 7.80579473805433e-06,
"loss": 0.632,
"mean_token_accuracy": 0.8178293108940125,
"num_tokens": 974308.0,
"step": 1970
},
{
"epoch": 1.975,
"grad_norm": 7.78125,
"learning_rate": 7.741987331308964e-06,
"loss": 0.6573,
"mean_token_accuracy": 0.8085449814796448,
"num_tokens": 977606.0,
"step": 1975
},
{
"epoch": 1.98,
"grad_norm": 5.1875,
"learning_rate": 7.678276650311075e-06,
"loss": 0.8132,
"mean_token_accuracy": 0.7615883231163025,
"num_tokens": 980558.0,
"step": 1980
},
{
"epoch": 1.9849999999999999,
"grad_norm": 4.65625,
"learning_rate": 7.6146654242141935e-06,
"loss": 0.7852,
"mean_token_accuracy": 0.7767473936080933,
"num_tokens": 983519.0,
"step": 1985
},
{
"epoch": 1.99,
"grad_norm": 24.25,
"learning_rate": 7.551156377911526e-06,
"loss": 0.7591,
"mean_token_accuracy": 0.7873912572860717,
"num_tokens": 985872.0,
"step": 1990
},
{
"epoch": 1.995,
"grad_norm": 31.0,
"learning_rate": 7.487752231919246e-06,
"loss": 0.9672,
"mean_token_accuracy": 0.7186308145523072,
"num_tokens": 987339.0,
"step": 1995
},
{
"epoch": 2.0,
"grad_norm": 17.125,
"learning_rate": 7.4244557022599394e-06,
"loss": 0.55,
"mean_token_accuracy": 0.8468565583229065,
"num_tokens": 991368.0,
"step": 2000
},
{
"epoch": 2.005,
"grad_norm": 5.15625,
"learning_rate": 7.361269500346274e-06,
"loss": 0.4919,
"mean_token_accuracy": 0.8586863517761231,
"num_tokens": 994329.0,
"step": 2005
},
{
"epoch": 2.01,
"grad_norm": 24.625,
"learning_rate": 7.298196332864834e-06,
"loss": 0.7438,
"mean_token_accuracy": 0.827777373790741,
"num_tokens": 995585.0,
"step": 2010
},
{
"epoch": 2.015,
"grad_norm": 6.78125,
"learning_rate": 7.235238901660195e-06,
"loss": 0.6072,
"mean_token_accuracy": 0.8268356323242188,
"num_tokens": 998236.0,
"step": 2015
},
{
"epoch": 2.02,
"grad_norm": 22.75,
"learning_rate": 7.172399903619165e-06,
"loss": 0.8657,
"mean_token_accuracy": 0.7580430507659912,
"num_tokens": 999149.0,
"step": 2020
},
{
"epoch": 2.025,
"grad_norm": 7.78125,
"learning_rate": 7.109682030555283e-06,
"loss": 0.4715,
"mean_token_accuracy": 0.8574858069419861,
"num_tokens": 1003020.0,
"step": 2025
},
{
"epoch": 2.03,
"grad_norm": 6.4375,
"learning_rate": 7.047087969093488e-06,
"loss": 0.6394,
"mean_token_accuracy": 0.8059547662734985,
"num_tokens": 1006359.0,
"step": 2030
},
{
"epoch": 2.035,
"grad_norm": 25.0,
"learning_rate": 6.984620400555044e-06,
"loss": 0.7012,
"mean_token_accuracy": 0.7960086703300476,
"num_tokens": 1008642.0,
"step": 2035
},
{
"epoch": 2.04,
"grad_norm": 7.84375,
"learning_rate": 6.922282000842675e-06,
"loss": 0.6689,
"mean_token_accuracy": 0.8068014264106751,
"num_tokens": 1011422.0,
"step": 2040
},
{
"epoch": 2.045,
"grad_norm": 17.5,
"learning_rate": 6.860075440325951e-06,
"loss": 0.5843,
"mean_token_accuracy": 0.8451680421829224,
"num_tokens": 1013590.0,
"step": 2045
},
{
"epoch": 2.05,
"grad_norm": 6.90625,
"learning_rate": 6.798003383726883e-06,
"loss": 0.6298,
"mean_token_accuracy": 0.8257234930992127,
"num_tokens": 1015307.0,
"step": 2050
},
{
"epoch": 2.055,
"grad_norm": 9.9375,
"learning_rate": 6.736068490005784e-06,
"loss": 0.791,
"mean_token_accuracy": 0.7678486227989196,
"num_tokens": 1017765.0,
"step": 2055
},
{
"epoch": 2.06,
"grad_norm": 10.75,
"learning_rate": 6.674273412247365e-06,
"loss": 0.5853,
"mean_token_accuracy": 0.8227458953857422,
"num_tokens": 1020785.0,
"step": 2060
},
{
"epoch": 2.065,
"grad_norm": 22.625,
"learning_rate": 6.612620797547087e-06,
"loss": 0.6217,
"mean_token_accuracy": 0.8210171341896058,
"num_tokens": 1023175.0,
"step": 2065
},
{
"epoch": 2.07,
"grad_norm": 5.96875,
"learning_rate": 6.551113286897771e-06,
"loss": 0.4628,
"mean_token_accuracy": 0.8576245903968811,
"num_tokens": 1024847.0,
"step": 2070
},
{
"epoch": 2.075,
"grad_norm": 8.8125,
"learning_rate": 6.489753515076457e-06,
"loss": 0.9042,
"mean_token_accuracy": 0.753120231628418,
"num_tokens": 1028443.0,
"step": 2075
},
{
"epoch": 2.08,
"grad_norm": 27.75,
"learning_rate": 6.428544110531549e-06,
"loss": 0.8369,
"mean_token_accuracy": 0.7524871230125427,
"num_tokens": 1030302.0,
"step": 2080
},
{
"epoch": 2.085,
"grad_norm": 8.125,
"learning_rate": 6.367487695270218e-06,
"loss": 0.5785,
"mean_token_accuracy": 0.8259750604629517,
"num_tokens": 1033853.0,
"step": 2085
},
{
"epoch": 2.09,
"grad_norm": 7.5625,
"learning_rate": 6.306586884746085e-06,
"loss": 0.5235,
"mean_token_accuracy": 0.8460973501205444,
"num_tokens": 1036706.0,
"step": 2090
},
{
"epoch": 2.095,
"grad_norm": 25.25,
"learning_rate": 6.245844287747168e-06,
"loss": 0.8981,
"mean_token_accuracy": 0.7195080876350403,
"num_tokens": 1038871.0,
"step": 2095
},
{
"epoch": 2.1,
"grad_norm": 8.625,
"learning_rate": 6.185262506284171e-06,
"loss": 0.7093,
"mean_token_accuracy": 0.7909031867980957,
"num_tokens": 1042005.0,
"step": 2100
},
{
"epoch": 2.105,
"grad_norm": 23.125,
"learning_rate": 6.124844135478971e-06,
"loss": 0.7969,
"mean_token_accuracy": 0.7591824889183044,
"num_tokens": 1044314.0,
"step": 2105
},
{
"epoch": 2.11,
"grad_norm": 23.125,
"learning_rate": 6.0645917634534856e-06,
"loss": 0.6442,
"mean_token_accuracy": 0.8363109707832337,
"num_tokens": 1046326.0,
"step": 2110
},
{
"epoch": 2.115,
"grad_norm": 6.625,
"learning_rate": 6.004507971218801e-06,
"loss": 0.5353,
"mean_token_accuracy": 0.8375053167343139,
"num_tokens": 1049452.0,
"step": 2115
},
{
"epoch": 2.12,
"grad_norm": 10.5625,
"learning_rate": 5.944595332564598e-06,
"loss": 0.4135,
"mean_token_accuracy": 0.8807172417640686,
"num_tokens": 1051626.0,
"step": 2120
},
{
"epoch": 2.125,
"grad_norm": 15.8125,
"learning_rate": 5.884856413948913e-06,
"loss": 0.4851,
"mean_token_accuracy": 0.8450929164886475,
"num_tokens": 1055101.0,
"step": 2125
},
{
"epoch": 2.13,
"grad_norm": 8.375,
"learning_rate": 5.825293774388196e-06,
"loss": 0.6181,
"mean_token_accuracy": 0.8257151484489441,
"num_tokens": 1058263.0,
"step": 2130
},
{
"epoch": 2.135,
"grad_norm": 8.3125,
"learning_rate": 5.7659099653476805e-06,
"loss": 0.6252,
"mean_token_accuracy": 0.8017407178878784,
"num_tokens": 1060704.0,
"step": 2135
},
{
"epoch": 2.14,
"grad_norm": 8.125,
"learning_rate": 5.7067075306321025e-06,
"loss": 0.7446,
"mean_token_accuracy": 0.8047372102737427,
"num_tokens": 1063001.0,
"step": 2140
},
{
"epoch": 2.145,
"grad_norm": 8.0625,
"learning_rate": 5.647689006276727e-06,
"loss": 0.8342,
"mean_token_accuracy": 0.7604440450668335,
"num_tokens": 1064965.0,
"step": 2145
},
{
"epoch": 2.15,
"grad_norm": 16.375,
"learning_rate": 5.588856920438706e-06,
"loss": 0.4514,
"mean_token_accuracy": 0.8737953782081604,
"num_tokens": 1066941.0,
"step": 2150
},
{
"epoch": 2.155,
"grad_norm": 7.21875,
"learning_rate": 5.53021379328879e-06,
"loss": 0.6148,
"mean_token_accuracy": 0.8180341601371766,
"num_tokens": 1069106.0,
"step": 2155
},
{
"epoch": 2.16,
"grad_norm": 7.21875,
"learning_rate": 5.4717621369033654e-06,
"loss": 0.3871,
"mean_token_accuracy": 0.8827916264533997,
"num_tokens": 1071843.0,
"step": 2160
},
{
"epoch": 2.165,
"grad_norm": 12.25,
"learning_rate": 5.413504455156855e-06,
"loss": 0.6309,
"mean_token_accuracy": 0.8097968459129333,
"num_tokens": 1073821.0,
"step": 2165
},
{
"epoch": 2.17,
"grad_norm": 21.875,
"learning_rate": 5.355443243614434e-06,
"loss": 0.6844,
"mean_token_accuracy": 0.8111440539360046,
"num_tokens": 1075459.0,
"step": 2170
},
{
"epoch": 2.175,
"grad_norm": 9.0625,
"learning_rate": 5.297580989425177e-06,
"loss": 0.6962,
"mean_token_accuracy": 0.7998672127723694,
"num_tokens": 1077136.0,
"step": 2175
},
{
"epoch": 2.18,
"grad_norm": 5.5,
"learning_rate": 5.2399201712154666e-06,
"loss": 0.6331,
"mean_token_accuracy": 0.8542533636093139,
"num_tokens": 1079897.0,
"step": 2180
},
{
"epoch": 2.185,
"grad_norm": 7.0625,
"learning_rate": 5.1824632589828465e-06,
"loss": 0.4966,
"mean_token_accuracy": 0.8605088949203491,
"num_tokens": 1081679.0,
"step": 2185
},
{
"epoch": 2.19,
"grad_norm": 18.0,
"learning_rate": 5.125212713990207e-06,
"loss": 0.5673,
"mean_token_accuracy": 0.8518822908401489,
"num_tokens": 1083383.0,
"step": 2190
},
{
"epoch": 2.195,
"grad_norm": 4.40625,
"learning_rate": 5.0681709886603454e-06,
"loss": 0.72,
"mean_token_accuracy": 0.789927351474762,
"num_tokens": 1085659.0,
"step": 2195
},
{
"epoch": 2.2,
"grad_norm": 6.09375,
"learning_rate": 5.011340526470928e-06,
"loss": 0.6352,
"mean_token_accuracy": 0.8001546740531922,
"num_tokens": 1088395.0,
"step": 2200
},
{
"epoch": 2.205,
"grad_norm": 10.125,
"learning_rate": 4.954723761849809e-06,
"loss": 0.7239,
"mean_token_accuracy": 0.7876050591468811,
"num_tokens": 1090913.0,
"step": 2205
},
{
"epoch": 2.21,
"grad_norm": 29.875,
"learning_rate": 4.8983231200707495e-06,
"loss": 0.6724,
"mean_token_accuracy": 0.7935240149497986,
"num_tokens": 1093568.0,
"step": 2210
},
{
"epoch": 2.215,
"grad_norm": 7.28125,
"learning_rate": 4.8421410171495265e-06,
"loss": 0.7759,
"mean_token_accuracy": 0.7690300822257996,
"num_tokens": 1096199.0,
"step": 2215
},
{
"epoch": 2.22,
"grad_norm": 8.5625,
"learning_rate": 4.786179859740442e-06,
"loss": 0.5158,
"mean_token_accuracy": 0.8456796884536744,
"num_tokens": 1098664.0,
"step": 2220
},
{
"epoch": 2.225,
"grad_norm": 5.375,
"learning_rate": 4.7304420450332244e-06,
"loss": 0.6302,
"mean_token_accuracy": 0.8168246984481812,
"num_tokens": 1102032.0,
"step": 2225
},
{
"epoch": 2.23,
"grad_norm": 4.65625,
"learning_rate": 4.674929960650352e-06,
"loss": 0.6012,
"mean_token_accuracy": 0.8399333238601685,
"num_tokens": 1104825.0,
"step": 2230
},
{
"epoch": 2.235,
"grad_norm": 4.5,
"learning_rate": 4.619645984544752e-06,
"loss": 0.3681,
"mean_token_accuracy": 0.8887263059616088,
"num_tokens": 1106158.0,
"step": 2235
},
{
"epoch": 2.24,
"grad_norm": 9.25,
"learning_rate": 4.564592484897965e-06,
"loss": 0.5975,
"mean_token_accuracy": 0.8158373713493348,
"num_tokens": 1109858.0,
"step": 2240
},
{
"epoch": 2.245,
"grad_norm": 25.25,
"learning_rate": 4.509771820018682e-06,
"loss": 0.7861,
"mean_token_accuracy": 0.7793623328208923,
"num_tokens": 1111179.0,
"step": 2245
},
{
"epoch": 2.25,
"grad_norm": 8.5,
"learning_rate": 4.455186338241733e-06,
"loss": 0.7591,
"mean_token_accuracy": 0.7832494258880616,
"num_tokens": 1114073.0,
"step": 2250
},
{
"epoch": 2.255,
"grad_norm": 18.75,
"learning_rate": 4.4008383778274835e-06,
"loss": 0.7995,
"mean_token_accuracy": 0.7816387176513672,
"num_tokens": 1114839.0,
"step": 2255
},
{
"epoch": 2.26,
"grad_norm": 7.3125,
"learning_rate": 4.346730266861673e-06,
"loss": 0.6332,
"mean_token_accuracy": 0.8161526322364807,
"num_tokens": 1116483.0,
"step": 2260
},
{
"epoch": 2.265,
"grad_norm": 7.59375,
"learning_rate": 4.292864323155684e-06,
"loss": 0.7247,
"mean_token_accuracy": 0.7976547837257385,
"num_tokens": 1118654.0,
"step": 2265
},
{
"epoch": 2.27,
"grad_norm": 6.375,
"learning_rate": 4.23924285414727e-06,
"loss": 0.6007,
"mean_token_accuracy": 0.8198469638824463,
"num_tokens": 1122216.0,
"step": 2270
},
{
"epoch": 2.275,
"grad_norm": 4.15625,
"learning_rate": 4.185868156801695e-06,
"loss": 0.5729,
"mean_token_accuracy": 0.8336104035377503,
"num_tokens": 1125041.0,
"step": 2275
},
{
"epoch": 2.2800000000000002,
"grad_norm": 22.25,
"learning_rate": 4.13274251751335e-06,
"loss": 0.6672,
"mean_token_accuracy": 0.7996611833572388,
"num_tokens": 1127366.0,
"step": 2280
},
{
"epoch": 2.285,
"grad_norm": 4.21875,
"learning_rate": 4.0798682120078046e-06,
"loss": 0.4389,
"mean_token_accuracy": 0.8652261853218078,
"num_tokens": 1131524.0,
"step": 2285
},
{
"epoch": 2.29,
"grad_norm": 18.25,
"learning_rate": 4.027247505244329e-06,
"loss": 0.6732,
"mean_token_accuracy": 0.8089190244674682,
"num_tokens": 1133566.0,
"step": 2290
},
{
"epoch": 2.295,
"grad_norm": 24.125,
"learning_rate": 3.974882651318869e-06,
"loss": 0.5543,
"mean_token_accuracy": 0.8400264263153077,
"num_tokens": 1136223.0,
"step": 2295
},
{
"epoch": 2.3,
"grad_norm": 6.5625,
"learning_rate": 3.9227758933674756e-06,
"loss": 0.7666,
"mean_token_accuracy": 0.7888131022453309,
"num_tokens": 1139524.0,
"step": 2300
},
{
"epoch": 2.305,
"grad_norm": 14.5,
"learning_rate": 3.8709294634702374e-06,
"loss": 0.9221,
"mean_token_accuracy": 0.7322679758071899,
"num_tokens": 1141630.0,
"step": 2305
},
{
"epoch": 2.31,
"grad_norm": 6.78125,
"learning_rate": 3.819345582555653e-06,
"loss": 0.5426,
"mean_token_accuracy": 0.8481068849563599,
"num_tokens": 1143585.0,
"step": 2310
},
{
"epoch": 2.315,
"grad_norm": 7.9375,
"learning_rate": 3.7680264603054985e-06,
"loss": 0.6749,
"mean_token_accuracy": 0.8034712195396423,
"num_tokens": 1146169.0,
"step": 2315
},
{
"epoch": 2.32,
"grad_norm": 6.6875,
"learning_rate": 3.716974295060163e-06,
"loss": 0.4901,
"mean_token_accuracy": 0.8497132182121276,
"num_tokens": 1149812.0,
"step": 2320
},
{
"epoch": 2.325,
"grad_norm": 13.0,
"learning_rate": 3.6661912737244996e-06,
"loss": 0.5275,
"mean_token_accuracy": 0.8501026272773743,
"num_tokens": 1153572.0,
"step": 2325
},
{
"epoch": 2.33,
"grad_norm": 6.46875,
"learning_rate": 3.6156795716741146e-06,
"loss": 0.536,
"mean_token_accuracy": 0.8431202411651612,
"num_tokens": 1156510.0,
"step": 2330
},
{
"epoch": 2.335,
"grad_norm": 4.84375,
"learning_rate": 3.565441352662211e-06,
"loss": 0.6569,
"mean_token_accuracy": 0.8402642846107483,
"num_tokens": 1158458.0,
"step": 2335
},
{
"epoch": 2.34,
"grad_norm": 19.5,
"learning_rate": 3.5154787687268852e-06,
"loss": 0.6224,
"mean_token_accuracy": 0.8333223938941956,
"num_tokens": 1159689.0,
"step": 2340
},
{
"epoch": 2.3449999999999998,
"grad_norm": 26.125,
"learning_rate": 3.4657939600989453e-06,
"loss": 0.4519,
"mean_token_accuracy": 0.8754086375236512,
"num_tokens": 1162518.0,
"step": 2345
},
{
"epoch": 2.35,
"grad_norm": 5.9375,
"learning_rate": 3.4163890551102284e-06,
"loss": 0.6395,
"mean_token_accuracy": 0.7997304558753967,
"num_tokens": 1165678.0,
"step": 2350
},
{
"epoch": 2.355,
"grad_norm": 9.875,
"learning_rate": 3.3672661701024324e-06,
"loss": 0.9325,
"mean_token_accuracy": 0.7247589707374573,
"num_tokens": 1167423.0,
"step": 2355
},
{
"epoch": 2.36,
"grad_norm": 6.875,
"learning_rate": 3.318427409336461e-06,
"loss": 0.5473,
"mean_token_accuracy": 0.8480026125907898,
"num_tokens": 1169693.0,
"step": 2360
},
{
"epoch": 2.365,
"grad_norm": 26.375,
"learning_rate": 3.2698748649022693e-06,
"loss": 0.4592,
"mean_token_accuracy": 0.8708458781242371,
"num_tokens": 1171977.0,
"step": 2365
},
{
"epoch": 2.37,
"grad_norm": 7.6875,
"learning_rate": 3.2216106166292694e-06,
"loss": 0.5238,
"mean_token_accuracy": 0.8693408727645874,
"num_tokens": 1173552.0,
"step": 2370
},
{
"epoch": 2.375,
"grad_norm": 8.75,
"learning_rate": 3.1736367319972216e-06,
"loss": 0.7585,
"mean_token_accuracy": 0.7678687095642089,
"num_tokens": 1174533.0,
"step": 2375
},
{
"epoch": 2.38,
"grad_norm": 17.875,
"learning_rate": 3.1259552660476744e-06,
"loss": 0.609,
"mean_token_accuracy": 0.853853178024292,
"num_tokens": 1176567.0,
"step": 2380
},
{
"epoch": 2.385,
"grad_norm": 27.0,
"learning_rate": 3.0785682612959334e-06,
"loss": 0.6462,
"mean_token_accuracy": 0.8025004386901855,
"num_tokens": 1178840.0,
"step": 2385
},
{
"epoch": 2.39,
"grad_norm": 8.0625,
"learning_rate": 3.031477747643564e-06,
"loss": 0.6865,
"mean_token_accuracy": 0.8314264178276062,
"num_tokens": 1181017.0,
"step": 2390
},
{
"epoch": 2.395,
"grad_norm": 35.75,
"learning_rate": 2.9846857422914434e-06,
"loss": 0.6985,
"mean_token_accuracy": 0.8023651003837585,
"num_tokens": 1184083.0,
"step": 2395
},
{
"epoch": 2.4,
"grad_norm": 9.375,
"learning_rate": 2.9381942496533443e-06,
"loss": 0.6431,
"mean_token_accuracy": 0.8262930393218995,
"num_tokens": 1185646.0,
"step": 2400
},
{
"epoch": 2.4050000000000002,
"grad_norm": 9.1875,
"learning_rate": 2.8920052612700755e-06,
"loss": 0.5356,
"mean_token_accuracy": 0.8470789790153503,
"num_tokens": 1187614.0,
"step": 2405
},
{
"epoch": 2.41,
"grad_norm": 10.5,
"learning_rate": 2.846120755724171e-06,
"loss": 0.5002,
"mean_token_accuracy": 0.8470618605613709,
"num_tokens": 1190017.0,
"step": 2410
},
{
"epoch": 2.415,
"grad_norm": 22.5,
"learning_rate": 2.800542698555132e-06,
"loss": 0.4556,
"mean_token_accuracy": 0.8644039750099182,
"num_tokens": 1193266.0,
"step": 2415
},
{
"epoch": 2.42,
"grad_norm": 5.28125,
"learning_rate": 2.755273042175233e-06,
"loss": 0.4292,
"mean_token_accuracy": 0.8651724457740784,
"num_tokens": 1197242.0,
"step": 2420
},
{
"epoch": 2.425,
"grad_norm": 24.875,
"learning_rate": 2.7103137257858867e-06,
"loss": 0.5487,
"mean_token_accuracy": 0.8282508969306945,
"num_tokens": 1199347.0,
"step": 2425
},
{
"epoch": 2.43,
"grad_norm": 28.375,
"learning_rate": 2.6656666752945647e-06,
"loss": 0.7351,
"mean_token_accuracy": 0.7949961185455322,
"num_tokens": 1201102.0,
"step": 2430
},
{
"epoch": 2.435,
"grad_norm": 13.25,
"learning_rate": 2.6213338032323175e-06,
"loss": 0.6013,
"mean_token_accuracy": 0.8310695767402649,
"num_tokens": 1203274.0,
"step": 2435
},
{
"epoch": 2.44,
"grad_norm": 13.25,
"learning_rate": 2.5773170086718324e-06,
"loss": 0.5486,
"mean_token_accuracy": 0.8236384749412536,
"num_tokens": 1204799.0,
"step": 2440
},
{
"epoch": 2.445,
"grad_norm": 5.5,
"learning_rate": 2.5336181771460865e-06,
"loss": 0.6963,
"mean_token_accuracy": 0.8013479828834533,
"num_tokens": 1207929.0,
"step": 2445
},
{
"epoch": 2.45,
"grad_norm": 9.1875,
"learning_rate": 2.490239180567585e-06,
"loss": 0.7415,
"mean_token_accuracy": 0.792452335357666,
"num_tokens": 1209506.0,
"step": 2450
},
{
"epoch": 2.455,
"grad_norm": 8.0625,
"learning_rate": 2.447181877148165e-06,
"loss": 0.7063,
"mean_token_accuracy": 0.7985927581787109,
"num_tokens": 1212741.0,
"step": 2455
},
{
"epoch": 2.46,
"grad_norm": 6.65625,
"learning_rate": 2.4044481113193953e-06,
"loss": 0.5619,
"mean_token_accuracy": 0.837166678905487,
"num_tokens": 1214624.0,
"step": 2460
},
{
"epoch": 2.465,
"grad_norm": 29.0,
"learning_rate": 2.362039713653581e-06,
"loss": 0.6907,
"mean_token_accuracy": 0.7909873247146606,
"num_tokens": 1217176.0,
"step": 2465
},
{
"epoch": 2.4699999999999998,
"grad_norm": 7.9375,
"learning_rate": 2.3199585007853233e-06,
"loss": 0.7487,
"mean_token_accuracy": 0.7831075429916382,
"num_tokens": 1218962.0,
"step": 2470
},
{
"epoch": 2.475,
"grad_norm": 8.25,
"learning_rate": 2.278206275333731e-06,
"loss": 0.694,
"mean_token_accuracy": 0.8036178708076477,
"num_tokens": 1220856.0,
"step": 2475
},
{
"epoch": 2.48,
"grad_norm": 7.0,
"learning_rate": 2.236784825825179e-06,
"loss": 0.6465,
"mean_token_accuracy": 0.8076552510261535,
"num_tokens": 1222310.0,
"step": 2480
},
{
"epoch": 2.485,
"grad_norm": 6.28125,
"learning_rate": 2.195695926616702e-06,
"loss": 0.7398,
"mean_token_accuracy": 0.7894096493721008,
"num_tokens": 1224812.0,
"step": 2485
},
{
"epoch": 2.49,
"grad_norm": 19.75,
"learning_rate": 2.1549413378199912e-06,
"loss": 0.7505,
"mean_token_accuracy": 0.7837583899497986,
"num_tokens": 1226946.0,
"step": 2490
},
{
"epoch": 2.495,
"grad_norm": 8.5,
"learning_rate": 2.1145228052259823e-06,
"loss": 0.5331,
"mean_token_accuracy": 0.8338425397872925,
"num_tokens": 1229666.0,
"step": 2495
},
{
"epoch": 2.5,
"grad_norm": 8.9375,
"learning_rate": 2.074442060230093e-06,
"loss": 0.3655,
"mean_token_accuracy": 0.8997077226638794,
"num_tokens": 1232184.0,
"step": 2500
},
{
"epoch": 2.505,
"grad_norm": 8.125,
"learning_rate": 2.034700819758039e-06,
"loss": 0.5048,
"mean_token_accuracy": 0.8454057693481445,
"num_tokens": 1235829.0,
"step": 2505
},
{
"epoch": 2.51,
"grad_norm": 3.59375,
"learning_rate": 1.995300786192291e-06,
"loss": 0.5504,
"mean_token_accuracy": 0.8341328144073487,
"num_tokens": 1239156.0,
"step": 2510
},
{
"epoch": 2.515,
"grad_norm": 8.5625,
"learning_rate": 1.956243647299155e-06,
"loss": 0.6985,
"mean_token_accuracy": 0.8047675132751465,
"num_tokens": 1241694.0,
"step": 2515
},
{
"epoch": 2.52,
"grad_norm": 19.75,
"learning_rate": 1.91753107615647e-06,
"loss": 0.6351,
"mean_token_accuracy": 0.8266091227531434,
"num_tokens": 1242546.0,
"step": 2520
},
{
"epoch": 2.525,
"grad_norm": 7.5,
"learning_rate": 1.8791647310819371e-06,
"loss": 0.6763,
"mean_token_accuracy": 0.8041231989860534,
"num_tokens": 1245217.0,
"step": 2525
},
{
"epoch": 2.5300000000000002,
"grad_norm": 26.25,
"learning_rate": 1.8411462555620896e-06,
"loss": 0.5798,
"mean_token_accuracy": 0.8182765483856201,
"num_tokens": 1247501.0,
"step": 2530
},
{
"epoch": 2.535,
"grad_norm": 3.796875,
"learning_rate": 1.8034772781818776e-06,
"loss": 0.6065,
"mean_token_accuracy": 0.8119491338729858,
"num_tokens": 1249229.0,
"step": 2535
},
{
"epoch": 2.54,
"grad_norm": 6.15625,
"learning_rate": 1.766159412554922e-06,
"loss": 0.4853,
"mean_token_accuracy": 0.8598642110824585,
"num_tokens": 1251011.0,
"step": 2540
},
{
"epoch": 2.545,
"grad_norm": 6.375,
"learning_rate": 1.7291942572543806e-06,
"loss": 0.3356,
"mean_token_accuracy": 0.904644763469696,
"num_tokens": 1254128.0,
"step": 2545
},
{
"epoch": 2.55,
"grad_norm": 6.84375,
"learning_rate": 1.6925833957444826e-06,
"loss": 0.4384,
"mean_token_accuracy": 0.8696613907814026,
"num_tokens": 1257246.0,
"step": 2550
},
{
"epoch": 2.555,
"grad_norm": 5.25,
"learning_rate": 1.656328396312682e-06,
"loss": 0.5397,
"mean_token_accuracy": 0.8403537273406982,
"num_tokens": 1260577.0,
"step": 2555
},
{
"epoch": 2.56,
"grad_norm": 4.0,
"learning_rate": 1.6204308120024915e-06,
"loss": 0.5642,
"mean_token_accuracy": 0.831739854812622,
"num_tokens": 1262707.0,
"step": 2560
},
{
"epoch": 2.565,
"grad_norm": 5.71875,
"learning_rate": 1.5848921805469396e-06,
"loss": 0.782,
"mean_token_accuracy": 0.7676750183105469,
"num_tokens": 1266882.0,
"step": 2565
},
{
"epoch": 2.57,
"grad_norm": 10.625,
"learning_rate": 1.5497140243027198e-06,
"loss": 0.6529,
"mean_token_accuracy": 0.8205275893211365,
"num_tokens": 1268811.0,
"step": 2570
},
{
"epoch": 2.575,
"grad_norm": 5.46875,
"learning_rate": 1.5148978501849642e-06,
"loss": 0.492,
"mean_token_accuracy": 0.8443620800971985,
"num_tokens": 1272499.0,
"step": 2575
},
{
"epoch": 2.58,
"grad_norm": 9.4375,
"learning_rate": 1.4804451496026928e-06,
"loss": 0.5225,
"mean_token_accuracy": 0.8471122026443482,
"num_tokens": 1275704.0,
"step": 2580
},
{
"epoch": 2.585,
"grad_norm": 5.40625,
"learning_rate": 1.446357398394934e-06,
"loss": 0.4996,
"mean_token_accuracy": 0.8760560512542724,
"num_tokens": 1277875.0,
"step": 2585
},
{
"epoch": 2.59,
"grad_norm": 12.1875,
"learning_rate": 1.4126360567674946e-06,
"loss": 0.7877,
"mean_token_accuracy": 0.7944910049438476,
"num_tokens": 1280557.0,
"step": 2590
},
{
"epoch": 2.5949999999999998,
"grad_norm": 10.125,
"learning_rate": 1.379282569230419e-06,
"loss": 0.5377,
"mean_token_accuracy": 0.8431946516036988,
"num_tokens": 1283379.0,
"step": 2595
},
{
"epoch": 2.6,
"grad_norm": 7.09375,
"learning_rate": 1.3462983645360994e-06,
"loss": 0.6152,
"mean_token_accuracy": 0.7838948607444763,
"num_tokens": 1286317.0,
"step": 2600
},
{
"epoch": 2.605,
"grad_norm": 5.75,
"learning_rate": 1.3136848556180893e-06,
"loss": 0.6216,
"mean_token_accuracy": 0.8219085574150086,
"num_tokens": 1289902.0,
"step": 2605
},
{
"epoch": 2.61,
"grad_norm": 10.0,
"learning_rate": 1.281443439530562e-06,
"loss": 0.792,
"mean_token_accuracy": 0.7688913822174073,
"num_tokens": 1292298.0,
"step": 2610
},
{
"epoch": 2.615,
"grad_norm": 9.625,
"learning_rate": 1.2495754973884766e-06,
"loss": 0.7977,
"mean_token_accuracy": 0.7754817247390747,
"num_tokens": 1295435.0,
"step": 2615
},
{
"epoch": 2.62,
"grad_norm": 14.625,
"learning_rate": 1.2180823943084076e-06,
"loss": 0.7828,
"mean_token_accuracy": 0.7664639234542847,
"num_tokens": 1296353.0,
"step": 2620
},
{
"epoch": 2.625,
"grad_norm": 7.34375,
"learning_rate": 1.1869654793500784e-06,
"loss": 0.7631,
"mean_token_accuracy": 0.7716354012489319,
"num_tokens": 1298031.0,
"step": 2625
},
{
"epoch": 2.63,
"grad_norm": 5.5,
"learning_rate": 1.156226085458556e-06,
"loss": 0.4382,
"mean_token_accuracy": 0.8708669185638428,
"num_tokens": 1301255.0,
"step": 2630
},
{
"epoch": 2.635,
"grad_norm": 24.0,
"learning_rate": 1.1258655294071686e-06,
"loss": 0.5946,
"mean_token_accuracy": 0.8110897779464722,
"num_tokens": 1303391.0,
"step": 2635
},
{
"epoch": 2.64,
"grad_norm": 8.375,
"learning_rate": 1.0958851117410874e-06,
"loss": 0.5818,
"mean_token_accuracy": 0.8356985807418823,
"num_tokens": 1304908.0,
"step": 2640
},
{
"epoch": 2.645,
"grad_norm": 16.5,
"learning_rate": 1.0662861167216243e-06,
"loss": 0.9185,
"mean_token_accuracy": 0.7712666034698487,
"num_tokens": 1306457.0,
"step": 2645
},
{
"epoch": 2.65,
"grad_norm": 5.3125,
"learning_rate": 1.0370698122712131e-06,
"loss": 0.4105,
"mean_token_accuracy": 0.8678780198097229,
"num_tokens": 1310791.0,
"step": 2650
},
{
"epoch": 2.6550000000000002,
"grad_norm": 18.625,
"learning_rate": 1.0082374499190961e-06,
"loss": 0.6534,
"mean_token_accuracy": 0.826017415523529,
"num_tokens": 1312521.0,
"step": 2655
},
{
"epoch": 2.66,
"grad_norm": 18.125,
"learning_rate": 9.797902647477175e-07,
"loss": 0.5011,
"mean_token_accuracy": 0.8516492247581482,
"num_tokens": 1314019.0,
"step": 2660
},
{
"epoch": 2.665,
"grad_norm": 19.5,
"learning_rate": 9.517294753398066e-07,
"loss": 0.3417,
"mean_token_accuracy": 0.9054901719093322,
"num_tokens": 1315720.0,
"step": 2665
},
{
"epoch": 2.67,
"grad_norm": 17.875,
"learning_rate": 9.240562837261891e-07,
"loss": 0.6058,
"mean_token_accuracy": 0.8199878454208374,
"num_tokens": 1316657.0,
"step": 2670
},
{
"epoch": 2.675,
"grad_norm": 7.46875,
"learning_rate": 8.967718753342902e-07,
"loss": 0.7952,
"mean_token_accuracy": 0.7606085181236267,
"num_tokens": 1318961.0,
"step": 2675
},
{
"epoch": 2.68,
"grad_norm": 7.375,
"learning_rate": 8.698774189373571e-07,
"loss": 0.4273,
"mean_token_accuracy": 0.8727764368057251,
"num_tokens": 1321286.0,
"step": 2680
},
{
"epoch": 2.685,
"grad_norm": 4.875,
"learning_rate": 8.433740666043899e-07,
"loss": 0.5555,
"mean_token_accuracy": 0.8267341732978821,
"num_tokens": 1325500.0,
"step": 2685
},
{
"epoch": 2.69,
"grad_norm": 7.75,
"learning_rate": 8.172629536507915e-07,
"loss": 0.459,
"mean_token_accuracy": 0.8654767513275147,
"num_tokens": 1327274.0,
"step": 2690
},
{
"epoch": 2.695,
"grad_norm": 27.0,
"learning_rate": 7.915451985897382e-07,
"loss": 1.0706,
"mean_token_accuracy": 0.6949961304664611,
"num_tokens": 1328978.0,
"step": 2695
},
{
"epoch": 2.7,
"grad_norm": 6.875,
"learning_rate": 7.662219030842588e-07,
"loss": 0.5883,
"mean_token_accuracy": 0.8402994990348815,
"num_tokens": 1331301.0,
"step": 2700
},
{
"epoch": 2.705,
"grad_norm": 7.4375,
"learning_rate": 7.412941519000527e-07,
"loss": 0.7718,
"mean_token_accuracy": 0.7737884640693664,
"num_tokens": 1334311.0,
"step": 2705
},
{
"epoch": 2.71,
"grad_norm": 20.375,
"learning_rate": 7.167630128590131e-07,
"loss": 0.6655,
"mean_token_accuracy": 0.8034143686294556,
"num_tokens": 1335913.0,
"step": 2710
},
{
"epoch": 2.715,
"grad_norm": 5.875,
"learning_rate": 6.92629536793491e-07,
"loss": 0.6347,
"mean_token_accuracy": 0.8110605478286743,
"num_tokens": 1338621.0,
"step": 2715
},
{
"epoch": 2.7199999999999998,
"grad_norm": 22.375,
"learning_rate": 6.688947575012794e-07,
"loss": 0.5813,
"mean_token_accuracy": 0.8388025999069214,
"num_tokens": 1341731.0,
"step": 2720
},
{
"epoch": 2.725,
"grad_norm": 25.0,
"learning_rate": 6.455596917013274e-07,
"loss": 0.6901,
"mean_token_accuracy": 0.8210200190544128,
"num_tokens": 1343282.0,
"step": 2725
},
{
"epoch": 2.73,
"grad_norm": 8.3125,
"learning_rate": 6.226253389901882e-07,
"loss": 0.7357,
"mean_token_accuracy": 0.7805762529373169,
"num_tokens": 1346733.0,
"step": 2730
},
{
"epoch": 2.735,
"grad_norm": 7.53125,
"learning_rate": 6.000926817991992e-07,
"loss": 0.715,
"mean_token_accuracy": 0.7933832883834839,
"num_tokens": 1349395.0,
"step": 2735
},
{
"epoch": 2.74,
"grad_norm": 8.1875,
"learning_rate": 5.779626853524023e-07,
"loss": 0.4749,
"mean_token_accuracy": 0.8691505193710327,
"num_tokens": 1351191.0,
"step": 2740
},
{
"epoch": 2.745,
"grad_norm": 8.9375,
"learning_rate": 5.562362976251901e-07,
"loss": 0.556,
"mean_token_accuracy": 0.8356801629066467,
"num_tokens": 1353846.0,
"step": 2745
},
{
"epoch": 2.75,
"grad_norm": 11.1875,
"learning_rate": 5.349144493037017e-07,
"loss": 0.5487,
"mean_token_accuracy": 0.8493734836578369,
"num_tokens": 1355697.0,
"step": 2750
},
{
"epoch": 2.755,
"grad_norm": 23.75,
"learning_rate": 5.13998053744954e-07,
"loss": 0.7255,
"mean_token_accuracy": 0.8080898642539978,
"num_tokens": 1357081.0,
"step": 2755
},
{
"epoch": 2.76,
"grad_norm": 5.65625,
"learning_rate": 4.934880069377179e-07,
"loss": 0.6731,
"mean_token_accuracy": 0.8096045136451722,
"num_tokens": 1360537.0,
"step": 2760
},
{
"epoch": 2.765,
"grad_norm": 7.5,
"learning_rate": 4.733851874641382e-07,
"loss": 0.7519,
"mean_token_accuracy": 0.7882884621620179,
"num_tokens": 1363841.0,
"step": 2765
},
{
"epoch": 2.77,
"grad_norm": 3.71875,
"learning_rate": 4.536904564620892e-07,
"loss": 0.5781,
"mean_token_accuracy": 0.8222225427627563,
"num_tokens": 1367406.0,
"step": 2770
},
{
"epoch": 2.775,
"grad_norm": 15.0625,
"learning_rate": 4.344046575883021e-07,
"loss": 0.6944,
"mean_token_accuracy": 0.7947175741195679,
"num_tokens": 1369306.0,
"step": 2775
},
{
"epoch": 2.7800000000000002,
"grad_norm": 9.4375,
"learning_rate": 4.1552861698220927e-07,
"loss": 0.5703,
"mean_token_accuracy": 0.8330890297889709,
"num_tokens": 1372042.0,
"step": 2780
},
{
"epoch": 2.785,
"grad_norm": 14.625,
"learning_rate": 3.9706314323056936e-07,
"loss": 0.5702,
"mean_token_accuracy": 0.8367876529693603,
"num_tokens": 1373975.0,
"step": 2785
},
{
"epoch": 2.79,
"grad_norm": 26.875,
"learning_rate": 3.7900902733281843e-07,
"loss": 0.7193,
"mean_token_accuracy": 0.8044403076171875,
"num_tokens": 1376187.0,
"step": 2790
},
{
"epoch": 2.795,
"grad_norm": 17.5,
"learning_rate": 3.6136704266719115e-07,
"loss": 0.3683,
"mean_token_accuracy": 0.8847111463546753,
"num_tokens": 1378803.0,
"step": 2795
},
{
"epoch": 2.8,
"grad_norm": 5.78125,
"learning_rate": 3.441379449575943e-07,
"loss": 0.5358,
"mean_token_accuracy": 0.8456088662147522,
"num_tokens": 1381997.0,
"step": 2800
},
{
"epoch": 2.805,
"grad_norm": 5.96875,
"learning_rate": 3.273224722412327e-07,
"loss": 0.6415,
"mean_token_accuracy": 0.8274478316307068,
"num_tokens": 1384359.0,
"step": 2805
},
{
"epoch": 2.81,
"grad_norm": 7.40625,
"learning_rate": 3.1092134483698966e-07,
"loss": 0.6165,
"mean_token_accuracy": 0.8140155911445618,
"num_tokens": 1385707.0,
"step": 2810
},
{
"epoch": 2.815,
"grad_norm": 22.875,
"learning_rate": 2.949352653145754e-07,
"loss": 0.4145,
"mean_token_accuracy": 0.8786633729934692,
"num_tokens": 1388446.0,
"step": 2815
},
{
"epoch": 2.82,
"grad_norm": 4.78125,
"learning_rate": 2.793649184644331e-07,
"loss": 0.5485,
"mean_token_accuracy": 0.8479639649391174,
"num_tokens": 1390738.0,
"step": 2820
},
{
"epoch": 2.825,
"grad_norm": 6.65625,
"learning_rate": 2.6421097126839714e-07,
"loss": 0.6989,
"mean_token_accuracy": 0.7954578876495362,
"num_tokens": 1394232.0,
"step": 2825
},
{
"epoch": 2.83,
"grad_norm": 5.3125,
"learning_rate": 2.49474072871132e-07,
"loss": 0.6715,
"mean_token_accuracy": 0.8026228666305542,
"num_tokens": 1396421.0,
"step": 2830
},
{
"epoch": 2.835,
"grad_norm": 40.25,
"learning_rate": 2.3515485455231412e-07,
"loss": 0.6224,
"mean_token_accuracy": 0.8251531958580017,
"num_tokens": 1399233.0,
"step": 2835
},
{
"epoch": 2.84,
"grad_norm": 39.0,
"learning_rate": 2.212539296995997e-07,
"loss": 0.6011,
"mean_token_accuracy": 0.8397654533386231,
"num_tokens": 1401387.0,
"step": 2840
},
{
"epoch": 2.8449999999999998,
"grad_norm": 8.125,
"learning_rate": 2.077718937823414e-07,
"loss": 0.4666,
"mean_token_accuracy": 0.8580923438072204,
"num_tokens": 1404363.0,
"step": 2845
},
{
"epoch": 2.85,
"grad_norm": 26.25,
"learning_rate": 1.9470932432609001e-07,
"loss": 0.5946,
"mean_token_accuracy": 0.8188249826431274,
"num_tokens": 1406182.0,
"step": 2850
},
{
"epoch": 2.855,
"grad_norm": 13.625,
"learning_rate": 1.820667808878429e-07,
"loss": 0.5073,
"mean_token_accuracy": 0.8545669674873352,
"num_tokens": 1409661.0,
"step": 2855
},
{
"epoch": 2.86,
"grad_norm": 22.625,
"learning_rate": 1.6984480503208445e-07,
"loss": 0.5162,
"mean_token_accuracy": 0.8625428080558777,
"num_tokens": 1412766.0,
"step": 2860
},
{
"epoch": 2.865,
"grad_norm": 23.375,
"learning_rate": 1.580439203075812e-07,
"loss": 0.6426,
"mean_token_accuracy": 0.8121456384658814,
"num_tokens": 1414551.0,
"step": 2865
},
{
"epoch": 2.87,
"grad_norm": 14.625,
"learning_rate": 1.4666463222495875e-07,
"loss": 0.5375,
"mean_token_accuracy": 0.8589481472969055,
"num_tokens": 1417753.0,
"step": 2870
},
{
"epoch": 2.875,
"grad_norm": 6.71875,
"learning_rate": 1.357074282350457e-07,
"loss": 0.7809,
"mean_token_accuracy": 0.7673912167549133,
"num_tokens": 1420667.0,
"step": 2875
},
{
"epoch": 2.88,
"grad_norm": 7.375,
"learning_rate": 1.2517277770799142e-07,
"loss": 0.6226,
"mean_token_accuracy": 0.8142449855804443,
"num_tokens": 1423042.0,
"step": 2880
},
{
"epoch": 2.885,
"grad_norm": 102.5,
"learning_rate": 1.1506113191316447e-07,
"loss": 0.5121,
"mean_token_accuracy": 0.8586719989776611,
"num_tokens": 1425381.0,
"step": 2885
},
{
"epoch": 2.89,
"grad_norm": 16.125,
"learning_rate": 1.053729239998158e-07,
"loss": 0.4504,
"mean_token_accuracy": 0.887079381942749,
"num_tokens": 1427853.0,
"step": 2890
},
{
"epoch": 2.895,
"grad_norm": 8.0625,
"learning_rate": 9.610856897852683e-08,
"loss": 0.6264,
"mean_token_accuracy": 0.832252037525177,
"num_tokens": 1429454.0,
"step": 2895
},
{
"epoch": 2.9,
"grad_norm": 18.375,
"learning_rate": 8.726846370343267e-08,
"loss": 0.7333,
"mean_token_accuracy": 0.8016393065452576,
"num_tokens": 1431278.0,
"step": 2900
},
{
"epoch": 2.9050000000000002,
"grad_norm": 4.1875,
"learning_rate": 7.885298685522235e-08,
"loss": 0.634,
"mean_token_accuracy": 0.8164662957191468,
"num_tokens": 1434290.0,
"step": 2905
},
{
"epoch": 2.91,
"grad_norm": 4.5,
"learning_rate": 7.086249892491292e-08,
"loss": 0.6649,
"mean_token_accuracy": 0.8010025978088379,
"num_tokens": 1437234.0,
"step": 2910
},
{
"epoch": 2.915,
"grad_norm": 23.625,
"learning_rate": 6.32973421984151e-08,
"loss": 0.4797,
"mean_token_accuracy": 0.8689961910247803,
"num_tokens": 1440016.0,
"step": 2915
},
{
"epoch": 2.92,
"grad_norm": 7.59375,
"learning_rate": 5.615784074186281e-08,
"loss": 0.6124,
"mean_token_accuracy": 0.8135050177574158,
"num_tokens": 1444074.0,
"step": 2920
},
{
"epoch": 2.925,
"grad_norm": 18.5,
"learning_rate": 4.944430038773762e-08,
"loss": 0.4905,
"mean_token_accuracy": 0.8594370007514953,
"num_tokens": 1446622.0,
"step": 2925
},
{
"epoch": 2.93,
"grad_norm": 34.75,
"learning_rate": 4.315700872176254e-08,
"loss": 0.5769,
"mean_token_accuracy": 0.8277747988700866,
"num_tokens": 1449915.0,
"step": 2930
},
{
"epoch": 2.935,
"grad_norm": 7.6875,
"learning_rate": 3.729623507058744e-08,
"loss": 0.4319,
"mean_token_accuracy": 0.8636555314064026,
"num_tokens": 1453832.0,
"step": 2935
},
{
"epoch": 2.94,
"grad_norm": 16.25,
"learning_rate": 3.1862230490249394e-08,
"loss": 0.4257,
"mean_token_accuracy": 0.8717685580253601,
"num_tokens": 1456670.0,
"step": 2940
},
{
"epoch": 2.945,
"grad_norm": 6.90625,
"learning_rate": 2.6855227755419046e-08,
"loss": 0.6175,
"mean_token_accuracy": 0.8333387851715088,
"num_tokens": 1459182.0,
"step": 2945
},
{
"epoch": 2.95,
"grad_norm": 16.375,
"learning_rate": 2.2275441349428607e-08,
"loss": 0.6274,
"mean_token_accuracy": 0.8260146617889405,
"num_tokens": 1460880.0,
"step": 2950
},
{
"epoch": 2.955,
"grad_norm": 4.90625,
"learning_rate": 1.812306745508474e-08,
"loss": 0.4596,
"mean_token_accuracy": 0.8644664764404297,
"num_tokens": 1464261.0,
"step": 2955
},
{
"epoch": 2.96,
"grad_norm": 6.84375,
"learning_rate": 1.439828394626641e-08,
"loss": 0.8734,
"mean_token_accuracy": 0.7530173778533935,
"num_tokens": 1465974.0,
"step": 2960
},
{
"epoch": 2.965,
"grad_norm": 25.25,
"learning_rate": 1.1101250380300965e-08,
"loss": 0.6501,
"mean_token_accuracy": 0.8143182754516601,
"num_tokens": 1468379.0,
"step": 2965
},
{
"epoch": 2.9699999999999998,
"grad_norm": 6.03125,
"learning_rate": 8.232107991131833e-09,
"loss": 0.565,
"mean_token_accuracy": 0.831293773651123,
"num_tokens": 1471515.0,
"step": 2970
},
{
"epoch": 2.975,
"grad_norm": 9.1875,
"learning_rate": 5.790979683271136e-09,
"loss": 0.6732,
"mean_token_accuracy": 0.8122606992721557,
"num_tokens": 1473375.0,
"step": 2975
},
{
"epoch": 2.98,
"grad_norm": 20.125,
"learning_rate": 3.777970026531685e-09,
"loss": 0.4793,
"mean_token_accuracy": 0.8602708101272583,
"num_tokens": 1476487.0,
"step": 2980
},
{
"epoch": 2.985,
"grad_norm": 32.25,
"learning_rate": 2.193165251545004e-09,
"loss": 0.7171,
"mean_token_accuracy": 0.790663480758667,
"num_tokens": 1478477.0,
"step": 2985
},
{
"epoch": 2.99,
"grad_norm": 4.75,
"learning_rate": 1.0366332460753913e-09,
"loss": 0.6874,
"mean_token_accuracy": 0.8349518656730652,
"num_tokens": 1480618.0,
"step": 2990
},
{
"epoch": 2.995,
"grad_norm": 8.625,
"learning_rate": 3.0842355210336515e-10,
"loss": 0.9018,
"mean_token_accuracy": 0.7254812955856323,
"num_tokens": 1483656.0,
"step": 2995
},
{
"epoch": 3.0,
"grad_norm": 9.125,
"learning_rate": 8.567363708467114e-12,
"loss": 0.5487,
"mean_token_accuracy": 0.8382766008377075,
"num_tokens": 1487052.0,
"step": 3000
},
{
"epoch": 3.0,
"step": 3000,
"total_flos": 2.8742560290791424e+16,
"train_loss": 0.8452948161760966,
"train_runtime": 60165.202,
"train_samples_per_second": 0.05,
"train_steps_per_second": 0.05
}
],
"logging_steps": 5,
"max_steps": 3000,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.8742560290791424e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}