{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0011848341232228,
"eval_steps": 50,
"global_step": 845,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001184834123222749,
"grad_norm": 4.842912681173325,
"learning_rate": 5e-09,
"loss": 0.6931,
"step": 1
},
{
"epoch": 0.001184834123222749,
"eval_loss": 0.6931471228599548,
"eval_runtime": 62.005,
"eval_samples_per_second": 13.273,
"eval_steps_per_second": 0.839,
"step": 1
},
{
"epoch": 0.002369668246445498,
"grad_norm": 13.013466540861657,
"learning_rate": 1e-08,
"loss": 0.6931,
"step": 2
},
{
"epoch": 0.0035545023696682463,
"grad_norm": 25.207549721573656,
"learning_rate": 1.5e-08,
"loss": 0.6931,
"step": 3
},
{
"epoch": 0.004739336492890996,
"grad_norm": 11.273510432292214,
"learning_rate": 2e-08,
"loss": 0.6931,
"step": 4
},
{
"epoch": 0.005924170616113744,
"grad_norm": 24.11230272682227,
"learning_rate": 2.5e-08,
"loss": 0.6931,
"step": 5
},
{
"epoch": 0.0071090047393364926,
"grad_norm": 5.18401838103531,
"learning_rate": 3e-08,
"loss": 0.6931,
"step": 6
},
{
"epoch": 0.008293838862559242,
"grad_norm": 7.3509565781013135,
"learning_rate": 3.5e-08,
"loss": 0.6931,
"step": 7
},
{
"epoch": 0.009478672985781991,
"grad_norm": 6.56880629718354,
"learning_rate": 4e-08,
"loss": 0.6931,
"step": 8
},
{
"epoch": 0.01066350710900474,
"grad_norm": 5.592557790447292,
"learning_rate": 4.5e-08,
"loss": 0.6931,
"step": 9
},
{
"epoch": 0.011848341232227487,
"grad_norm": 20.527050875976098,
"learning_rate": 5e-08,
"loss": 0.6932,
"step": 10
},
{
"epoch": 0.013033175355450236,
"grad_norm": 6.269601088107528,
"learning_rate": 5.4999999999999996e-08,
"loss": 0.6931,
"step": 11
},
{
"epoch": 0.014218009478672985,
"grad_norm": 9.294134534941529,
"learning_rate": 6e-08,
"loss": 0.6931,
"step": 12
},
{
"epoch": 0.015402843601895734,
"grad_norm": 5.855434213215903,
"learning_rate": 6.5e-08,
"loss": 0.6931,
"step": 13
},
{
"epoch": 0.016587677725118485,
"grad_norm": 10.474986444115153,
"learning_rate": 7e-08,
"loss": 0.6931,
"step": 14
},
{
"epoch": 0.017772511848341232,
"grad_norm": 6.6840985514065325,
"learning_rate": 7.5e-08,
"loss": 0.6931,
"step": 15
},
{
"epoch": 0.018957345971563982,
"grad_norm": 7.4437101801933645,
"learning_rate": 8e-08,
"loss": 0.6931,
"step": 16
},
{
"epoch": 0.02014218009478673,
"grad_norm": 6.13165834073612,
"learning_rate": 8.500000000000001e-08,
"loss": 0.6931,
"step": 17
},
{
"epoch": 0.02132701421800948,
"grad_norm": 8.753680910080762,
"learning_rate": 9e-08,
"loss": 0.6931,
"step": 18
},
{
"epoch": 0.022511848341232227,
"grad_norm": 4.494857956282118,
"learning_rate": 9.499999999999999e-08,
"loss": 0.6931,
"step": 19
},
{
"epoch": 0.023696682464454975,
"grad_norm": 5.518409034456923,
"learning_rate": 1e-07,
"loss": 0.6931,
"step": 20
},
{
"epoch": 0.024881516587677725,
"grad_norm": 22.99472715304125,
"learning_rate": 1.0499999999999999e-07,
"loss": 0.6932,
"step": 21
},
{
"epoch": 0.026066350710900472,
"grad_norm": 5.44008866558376,
"learning_rate": 1.0999999999999999e-07,
"loss": 0.6931,
"step": 22
},
{
"epoch": 0.027251184834123223,
"grad_norm": 8.315427696253538,
"learning_rate": 1.15e-07,
"loss": 0.693,
"step": 23
},
{
"epoch": 0.02843601895734597,
"grad_norm": 9.819572604936798,
"learning_rate": 1.2e-07,
"loss": 0.693,
"step": 24
},
{
"epoch": 0.02962085308056872,
"grad_norm": 19.108539667598766,
"learning_rate": 1.25e-07,
"loss": 0.6929,
"step": 25
},
{
"epoch": 0.030805687203791468,
"grad_norm": 14.585034902914206,
"learning_rate": 1.3e-07,
"loss": 0.6932,
"step": 26
},
{
"epoch": 0.031990521327014215,
"grad_norm": 9.101944616686147,
"learning_rate": 1.35e-07,
"loss": 0.693,
"step": 27
},
{
"epoch": 0.03317535545023697,
"grad_norm": 5.776790369186141,
"learning_rate": 1.4e-07,
"loss": 0.693,
"step": 28
},
{
"epoch": 0.034360189573459717,
"grad_norm": 12.420816953833807,
"learning_rate": 1.45e-07,
"loss": 0.6931,
"step": 29
},
{
"epoch": 0.035545023696682464,
"grad_norm": 12.016877135801524,
"learning_rate": 1.5e-07,
"loss": 0.6929,
"step": 30
},
{
"epoch": 0.03672985781990521,
"grad_norm": 17.865737609497657,
"learning_rate": 1.55e-07,
"loss": 0.6931,
"step": 31
},
{
"epoch": 0.037914691943127965,
"grad_norm": 10.946159830311144,
"learning_rate": 1.6e-07,
"loss": 0.6929,
"step": 32
},
{
"epoch": 0.03909952606635071,
"grad_norm": 7.26222881952521,
"learning_rate": 1.65e-07,
"loss": 0.693,
"step": 33
},
{
"epoch": 0.04028436018957346,
"grad_norm": 16.017675753924713,
"learning_rate": 1.7000000000000001e-07,
"loss": 0.6927,
"step": 34
},
{
"epoch": 0.041469194312796206,
"grad_norm": 14.865936997931808,
"learning_rate": 1.75e-07,
"loss": 0.6931,
"step": 35
},
{
"epoch": 0.04265402843601896,
"grad_norm": 6.989152070363955,
"learning_rate": 1.8e-07,
"loss": 0.6929,
"step": 36
},
{
"epoch": 0.04383886255924171,
"grad_norm": 5.830142644690912,
"learning_rate": 1.85e-07,
"loss": 0.693,
"step": 37
},
{
"epoch": 0.045023696682464455,
"grad_norm": 18.298834268327568,
"learning_rate": 1.8999999999999998e-07,
"loss": 0.6931,
"step": 38
},
{
"epoch": 0.0462085308056872,
"grad_norm": 8.595226932799667,
"learning_rate": 1.9499999999999999e-07,
"loss": 0.6929,
"step": 39
},
{
"epoch": 0.04739336492890995,
"grad_norm": 5.6255428173780055,
"learning_rate": 2e-07,
"loss": 0.6929,
"step": 40
},
{
"epoch": 0.0485781990521327,
"grad_norm": 10.273539930083835,
"learning_rate": 2.0499999999999997e-07,
"loss": 0.6927,
"step": 41
},
{
"epoch": 0.04976303317535545,
"grad_norm": 5.959521496263993,
"learning_rate": 2.0999999999999997e-07,
"loss": 0.6928,
"step": 42
},
{
"epoch": 0.0509478672985782,
"grad_norm": 5.398338620565399,
"learning_rate": 2.1499999999999998e-07,
"loss": 0.6929,
"step": 43
},
{
"epoch": 0.052132701421800945,
"grad_norm": 18.51701133821435,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.6928,
"step": 44
},
{
"epoch": 0.0533175355450237,
"grad_norm": 10.120166425396867,
"learning_rate": 2.25e-07,
"loss": 0.6928,
"step": 45
},
{
"epoch": 0.054502369668246446,
"grad_norm": 9.406423565733586,
"learning_rate": 2.3e-07,
"loss": 0.6927,
"step": 46
},
{
"epoch": 0.05568720379146919,
"grad_norm": 13.864654741893007,
"learning_rate": 2.3499999999999997e-07,
"loss": 0.6927,
"step": 47
},
{
"epoch": 0.05687203791469194,
"grad_norm": 4.845925029634522,
"learning_rate": 2.4e-07,
"loss": 0.6928,
"step": 48
},
{
"epoch": 0.058056872037914695,
"grad_norm": 5.37379618929059,
"learning_rate": 2.45e-07,
"loss": 0.6928,
"step": 49
},
{
"epoch": 0.05924170616113744,
"grad_norm": 6.6369233721322605,
"learning_rate": 2.5e-07,
"loss": 0.6927,
"step": 50
},
{
"epoch": 0.05924170616113744,
"eval_loss": 0.6926634311676025,
"eval_runtime": 58.7485,
"eval_samples_per_second": 14.009,
"eval_steps_per_second": 0.885,
"step": 50
},
{
"epoch": 0.06042654028436019,
"grad_norm": 10.966673821105905,
"learning_rate": 2.55e-07,
"loss": 0.6929,
"step": 51
},
{
"epoch": 0.061611374407582936,
"grad_norm": 11.582337862061097,
"learning_rate": 2.6e-07,
"loss": 0.6926,
"step": 52
},
{
"epoch": 0.06279620853080568,
"grad_norm": 8.941972889362651,
"learning_rate": 2.65e-07,
"loss": 0.6927,
"step": 53
},
{
"epoch": 0.06398104265402843,
"grad_norm": 14.997343470976562,
"learning_rate": 2.7e-07,
"loss": 0.6927,
"step": 54
},
{
"epoch": 0.06516587677725119,
"grad_norm": 5.677663010655966,
"learning_rate": 2.75e-07,
"loss": 0.6926,
"step": 55
},
{
"epoch": 0.06635071090047394,
"grad_norm": 5.020981860702543,
"learning_rate": 2.8e-07,
"loss": 0.6925,
"step": 56
},
{
"epoch": 0.06753554502369669,
"grad_norm": 7.474033040046244,
"learning_rate": 2.8499999999999997e-07,
"loss": 0.6925,
"step": 57
},
{
"epoch": 0.06872037914691943,
"grad_norm": 17.180875525483216,
"learning_rate": 2.9e-07,
"loss": 0.6929,
"step": 58
},
{
"epoch": 0.06990521327014218,
"grad_norm": 13.25617168356445,
"learning_rate": 2.95e-07,
"loss": 0.6925,
"step": 59
},
{
"epoch": 0.07109004739336493,
"grad_norm": 30.116258413787573,
"learning_rate": 3e-07,
"loss": 0.693,
"step": 60
},
{
"epoch": 0.07227488151658767,
"grad_norm": 10.64426253518926,
"learning_rate": 3.05e-07,
"loss": 0.6922,
"step": 61
},
{
"epoch": 0.07345971563981042,
"grad_norm": 11.453245118504901,
"learning_rate": 3.1e-07,
"loss": 0.6925,
"step": 62
},
{
"epoch": 0.07464454976303317,
"grad_norm": 5.907026386237989,
"learning_rate": 3.15e-07,
"loss": 0.6924,
"step": 63
},
{
"epoch": 0.07582938388625593,
"grad_norm": 19.406661340061074,
"learning_rate": 3.2e-07,
"loss": 0.6925,
"step": 64
},
{
"epoch": 0.07701421800947868,
"grad_norm": 18.18198520017815,
"learning_rate": 3.25e-07,
"loss": 0.6927,
"step": 65
},
{
"epoch": 0.07819905213270142,
"grad_norm": 4.7357931743672035,
"learning_rate": 3.3e-07,
"loss": 0.6924,
"step": 66
},
{
"epoch": 0.07938388625592417,
"grad_norm": 10.532802042903194,
"learning_rate": 3.35e-07,
"loss": 0.6924,
"step": 67
},
{
"epoch": 0.08056872037914692,
"grad_norm": 11.915177348864768,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.6924,
"step": 68
},
{
"epoch": 0.08175355450236967,
"grad_norm": 18.00153138952007,
"learning_rate": 3.45e-07,
"loss": 0.6924,
"step": 69
},
{
"epoch": 0.08293838862559241,
"grad_norm": 10.234716621499297,
"learning_rate": 3.5e-07,
"loss": 0.6922,
"step": 70
},
{
"epoch": 0.08412322274881516,
"grad_norm": 11.147351790721068,
"learning_rate": 3.55e-07,
"loss": 0.6923,
"step": 71
},
{
"epoch": 0.08530805687203792,
"grad_norm": 5.9427941542555365,
"learning_rate": 3.6e-07,
"loss": 0.692,
"step": 72
},
{
"epoch": 0.08649289099526067,
"grad_norm": 9.650743281341448,
"learning_rate": 3.65e-07,
"loss": 0.6921,
"step": 73
},
{
"epoch": 0.08767772511848342,
"grad_norm": 14.437098875051285,
"learning_rate": 3.7e-07,
"loss": 0.6923,
"step": 74
},
{
"epoch": 0.08886255924170616,
"grad_norm": 5.378048791391647,
"learning_rate": 3.75e-07,
"loss": 0.6921,
"step": 75
},
{
"epoch": 0.09004739336492891,
"grad_norm": 23.76267009040223,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.6921,
"step": 76
},
{
"epoch": 0.09123222748815166,
"grad_norm": 26.84411773978452,
"learning_rate": 3.8499999999999997e-07,
"loss": 0.6926,
"step": 77
},
{
"epoch": 0.0924170616113744,
"grad_norm": 6.313251132925155,
"learning_rate": 3.8999999999999997e-07,
"loss": 0.6918,
"step": 78
},
{
"epoch": 0.09360189573459715,
"grad_norm": 5.002486541981642,
"learning_rate": 3.95e-07,
"loss": 0.6922,
"step": 79
},
{
"epoch": 0.0947867298578199,
"grad_norm": 5.125567908053878,
"learning_rate": 4e-07,
"loss": 0.6921,
"step": 80
},
{
"epoch": 0.09597156398104266,
"grad_norm": 5.76391233867192,
"learning_rate": 4.05e-07,
"loss": 0.6926,
"step": 81
},
{
"epoch": 0.0971563981042654,
"grad_norm": 9.172761657475997,
"learning_rate": 4.0999999999999994e-07,
"loss": 0.6923,
"step": 82
},
{
"epoch": 0.09834123222748815,
"grad_norm": 15.521052808202064,
"learning_rate": 4.1499999999999994e-07,
"loss": 0.6922,
"step": 83
},
{
"epoch": 0.0995260663507109,
"grad_norm": 6.256832828679843,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.6917,
"step": 84
},
{
"epoch": 0.10071090047393365,
"grad_norm": 6.119025109284402,
"learning_rate": 4.2499999999999995e-07,
"loss": 0.6923,
"step": 85
},
{
"epoch": 0.1018957345971564,
"grad_norm": 15.199284387604703,
"learning_rate": 4.2999999999999996e-07,
"loss": 0.6919,
"step": 86
},
{
"epoch": 0.10308056872037914,
"grad_norm": 15.339972724148373,
"learning_rate": 4.3499999999999996e-07,
"loss": 0.6921,
"step": 87
},
{
"epoch": 0.10426540284360189,
"grad_norm": 11.496730664780461,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.692,
"step": 88
},
{
"epoch": 0.10545023696682465,
"grad_norm": 4.773306731969947,
"learning_rate": 4.45e-07,
"loss": 0.6919,
"step": 89
},
{
"epoch": 0.1066350710900474,
"grad_norm": 8.260323351824246,
"learning_rate": 4.5e-07,
"loss": 0.6913,
"step": 90
},
{
"epoch": 0.10781990521327015,
"grad_norm": 5.659723672215949,
"learning_rate": 4.55e-07,
"loss": 0.6915,
"step": 91
},
{
"epoch": 0.10900473933649289,
"grad_norm": 5.107079958535661,
"learning_rate": 4.6e-07,
"loss": 0.6915,
"step": 92
},
{
"epoch": 0.11018957345971564,
"grad_norm": 22.056688920378733,
"learning_rate": 4.65e-07,
"loss": 0.6917,
"step": 93
},
{
"epoch": 0.11137440758293839,
"grad_norm": 4.86312181195146,
"learning_rate": 4.6999999999999995e-07,
"loss": 0.6916,
"step": 94
},
{
"epoch": 0.11255924170616113,
"grad_norm": 4.682425298642347,
"learning_rate": 4.7499999999999995e-07,
"loss": 0.6921,
"step": 95
},
{
"epoch": 0.11374407582938388,
"grad_norm": 7.5909421969744315,
"learning_rate": 4.8e-07,
"loss": 0.6917,
"step": 96
},
{
"epoch": 0.11492890995260663,
"grad_norm": 9.52244613318277,
"learning_rate": 4.85e-07,
"loss": 0.6918,
"step": 97
},
{
"epoch": 0.11611374407582939,
"grad_norm": 20.463831503545507,
"learning_rate": 4.9e-07,
"loss": 0.6922,
"step": 98
},
{
"epoch": 0.11729857819905214,
"grad_norm": 5.85073635301077,
"learning_rate": 4.95e-07,
"loss": 0.6919,
"step": 99
},
{
"epoch": 0.11848341232227488,
"grad_norm": 9.713864024698502,
"learning_rate": 5e-07,
"loss": 0.691,
"step": 100
},
{
"epoch": 0.11848341232227488,
"eval_loss": 0.6912016868591309,
"eval_runtime": 55.0989,
"eval_samples_per_second": 14.937,
"eval_steps_per_second": 0.944,
"step": 100
},
{
"epoch": 0.11966824644549763,
"grad_norm": 8.735757832881882,
"learning_rate": 5.049999999999999e-07,
"loss": 0.6913,
"step": 101
},
{
"epoch": 0.12085308056872038,
"grad_norm": 8.536276252611597,
"learning_rate": 5.1e-07,
"loss": 0.6916,
"step": 102
},
{
"epoch": 0.12203791469194313,
"grad_norm": 12.307955522799803,
"learning_rate": 5.149999999999999e-07,
"loss": 0.6919,
"step": 103
},
{
"epoch": 0.12322274881516587,
"grad_norm": 11.5436391524331,
"learning_rate": 5.2e-07,
"loss": 0.6908,
"step": 104
},
{
"epoch": 0.12440758293838862,
"grad_norm": 10.060636220616002,
"learning_rate": 5.25e-07,
"loss": 0.6907,
"step": 105
},
{
"epoch": 0.12559241706161137,
"grad_norm": 5.041585370529337,
"learning_rate": 5.3e-07,
"loss": 0.691,
"step": 106
},
{
"epoch": 0.12677725118483413,
"grad_norm": 11.64846080453421,
"learning_rate": 5.35e-07,
"loss": 0.6914,
"step": 107
},
{
"epoch": 0.12796208530805686,
"grad_norm": 19.76068719894559,
"learning_rate": 5.4e-07,
"loss": 0.6919,
"step": 108
},
{
"epoch": 0.12914691943127962,
"grad_norm": 14.136074416348508,
"learning_rate": 5.45e-07,
"loss": 0.6909,
"step": 109
},
{
"epoch": 0.13033175355450238,
"grad_norm": 10.28440428895041,
"learning_rate": 5.5e-07,
"loss": 0.6911,
"step": 110
},
{
"epoch": 0.13151658767772512,
"grad_norm": 7.391244110222167,
"learning_rate": 5.55e-07,
"loss": 0.6909,
"step": 111
},
{
"epoch": 0.13270142180094788,
"grad_norm": 4.823429256379869,
"learning_rate": 5.6e-07,
"loss": 0.6911,
"step": 112
},
{
"epoch": 0.1338862559241706,
"grad_norm": 23.383696969681193,
"learning_rate": 5.649999999999999e-07,
"loss": 0.6906,
"step": 113
},
{
"epoch": 0.13507109004739337,
"grad_norm": 19.49459565572995,
"learning_rate": 5.699999999999999e-07,
"loss": 0.6896,
"step": 114
},
{
"epoch": 0.1362559241706161,
"grad_norm": 9.86021667122269,
"learning_rate": 5.749999999999999e-07,
"loss": 0.6908,
"step": 115
},
{
"epoch": 0.13744075829383887,
"grad_norm": 9.176213186401592,
"learning_rate": 5.8e-07,
"loss": 0.69,
"step": 116
},
{
"epoch": 0.1386255924170616,
"grad_norm": 10.965194330031808,
"learning_rate": 5.849999999999999e-07,
"loss": 0.6907,
"step": 117
},
{
"epoch": 0.13981042654028436,
"grad_norm": 5.324309937382266,
"learning_rate": 5.9e-07,
"loss": 0.6901,
"step": 118
},
{
"epoch": 0.14099526066350712,
"grad_norm": 6.295743777248256,
"learning_rate": 5.949999999999999e-07,
"loss": 0.6904,
"step": 119
},
{
"epoch": 0.14218009478672985,
"grad_norm": 15.67891975538989,
"learning_rate": 6e-07,
"loss": 0.692,
"step": 120
},
{
"epoch": 0.14336492890995262,
"grad_norm": 9.127359508013708,
"learning_rate": 6.049999999999999e-07,
"loss": 0.6906,
"step": 121
},
{
"epoch": 0.14454976303317535,
"grad_norm": 7.5044121458383595,
"learning_rate": 6.1e-07,
"loss": 0.6891,
"step": 122
},
{
"epoch": 0.1457345971563981,
"grad_norm": 12.683289947181704,
"learning_rate": 6.149999999999999e-07,
"loss": 0.6906,
"step": 123
},
{
"epoch": 0.14691943127962084,
"grad_norm": 13.110766398513006,
"learning_rate": 6.2e-07,
"loss": 0.6904,
"step": 124
},
{
"epoch": 0.1481042654028436,
"grad_norm": 11.995325494799939,
"learning_rate": 6.249999999999999e-07,
"loss": 0.6903,
"step": 125
},
{
"epoch": 0.14928909952606634,
"grad_norm": 6.422118572134765,
"learning_rate": 6.3e-07,
"loss": 0.6904,
"step": 126
},
{
"epoch": 0.1504739336492891,
"grad_norm": 10.691214133018695,
"learning_rate": 6.35e-07,
"loss": 0.6905,
"step": 127
},
{
"epoch": 0.15165876777251186,
"grad_norm": 5.404413946055909,
"learning_rate": 6.4e-07,
"loss": 0.691,
"step": 128
},
{
"epoch": 0.1528436018957346,
"grad_norm": 4.447046938476608,
"learning_rate": 6.45e-07,
"loss": 0.6911,
"step": 129
},
{
"epoch": 0.15402843601895735,
"grad_norm": 5.0248072341394865,
"learning_rate": 6.5e-07,
"loss": 0.6914,
"step": 130
},
{
"epoch": 0.1552132701421801,
"grad_norm": 5.056679953391262,
"learning_rate": 6.55e-07,
"loss": 0.6905,
"step": 131
},
{
"epoch": 0.15639810426540285,
"grad_norm": 16.537683924677737,
"learning_rate": 6.6e-07,
"loss": 0.6895,
"step": 132
},
{
"epoch": 0.15758293838862558,
"grad_norm": 21.091493660710064,
"learning_rate": 6.65e-07,
"loss": 0.6897,
"step": 133
},
{
"epoch": 0.15876777251184834,
"grad_norm": 8.432912706940217,
"learning_rate": 6.7e-07,
"loss": 0.689,
"step": 134
},
{
"epoch": 0.15995260663507108,
"grad_norm": 5.270172573046601,
"learning_rate": 6.75e-07,
"loss": 0.6894,
"step": 135
},
{
"epoch": 0.16113744075829384,
"grad_norm": 16.85436413901504,
"learning_rate": 6.800000000000001e-07,
"loss": 0.6906,
"step": 136
},
{
"epoch": 0.1623222748815166,
"grad_norm": 6.916851112907155,
"learning_rate": 6.85e-07,
"loss": 0.6889,
"step": 137
},
{
"epoch": 0.16350710900473933,
"grad_norm": 7.1221016172033655,
"learning_rate": 6.9e-07,
"loss": 0.6897,
"step": 138
},
{
"epoch": 0.1646919431279621,
"grad_norm": 10.163791082368434,
"learning_rate": 6.949999999999999e-07,
"loss": 0.6896,
"step": 139
},
{
"epoch": 0.16587677725118483,
"grad_norm": 14.381198582029823,
"learning_rate": 7e-07,
"loss": 0.6892,
"step": 140
},
{
"epoch": 0.1670616113744076,
"grad_norm": 27.46453698599659,
"learning_rate": 7.049999999999999e-07,
"loss": 0.69,
"step": 141
},
{
"epoch": 0.16824644549763032,
"grad_norm": 6.604176868933808,
"learning_rate": 7.1e-07,
"loss": 0.6875,
"step": 142
},
{
"epoch": 0.16943127962085308,
"grad_norm": 9.080958816193693,
"learning_rate": 7.149999999999999e-07,
"loss": 0.6882,
"step": 143
},
{
"epoch": 0.17061611374407584,
"grad_norm": 8.606545021512053,
"learning_rate": 7.2e-07,
"loss": 0.688,
"step": 144
},
{
"epoch": 0.17180094786729858,
"grad_norm": 8.818101912012192,
"learning_rate": 7.249999999999999e-07,
"loss": 0.6888,
"step": 145
},
{
"epoch": 0.17298578199052134,
"grad_norm": 7.023229633046101,
"learning_rate": 7.3e-07,
"loss": 0.6883,
"step": 146
},
{
"epoch": 0.17417061611374407,
"grad_norm": 13.202458096332483,
"learning_rate": 7.35e-07,
"loss": 0.6911,
"step": 147
},
{
"epoch": 0.17535545023696683,
"grad_norm": 15.969335522541915,
"learning_rate": 7.4e-07,
"loss": 0.6898,
"step": 148
},
{
"epoch": 0.17654028436018956,
"grad_norm": 21.252078515205625,
"learning_rate": 7.45e-07,
"loss": 0.6911,
"step": 149
},
{
"epoch": 0.17772511848341233,
"grad_norm": 10.052620681652504,
"learning_rate": 7.5e-07,
"loss": 0.6878,
"step": 150
},
{
"epoch": 0.17772511848341233,
"eval_loss": 0.6874876618385315,
"eval_runtime": 54.043,
"eval_samples_per_second": 15.229,
"eval_steps_per_second": 0.962,
"step": 150
},
{
"epoch": 0.17890995260663506,
"grad_norm": 7.158864715478862,
"learning_rate": 7.55e-07,
"loss": 0.6871,
"step": 151
},
{
"epoch": 0.18009478672985782,
"grad_norm": 9.1358043468594,
"learning_rate": 7.599999999999999e-07,
"loss": 0.6875,
"step": 152
},
{
"epoch": 0.18127962085308058,
"grad_norm": 5.457525606930803,
"learning_rate": 7.65e-07,
"loss": 0.6882,
"step": 153
},
{
"epoch": 0.18246445497630331,
"grad_norm": 8.695681816363182,
"learning_rate": 7.699999999999999e-07,
"loss": 0.6875,
"step": 154
},
{
"epoch": 0.18364928909952608,
"grad_norm": 6.992833901092717,
"learning_rate": 7.75e-07,
"loss": 0.69,
"step": 155
},
{
"epoch": 0.1848341232227488,
"grad_norm": 11.327106053516577,
"learning_rate": 7.799999999999999e-07,
"loss": 0.6861,
"step": 156
},
{
"epoch": 0.18601895734597157,
"grad_norm": 12.206795478765468,
"learning_rate": 7.85e-07,
"loss": 0.6872,
"step": 157
},
{
"epoch": 0.1872037914691943,
"grad_norm": 11.670452718361528,
"learning_rate": 7.9e-07,
"loss": 0.6873,
"step": 158
},
{
"epoch": 0.18838862559241706,
"grad_norm": 10.394255318275,
"learning_rate": 7.95e-07,
"loss": 0.6883,
"step": 159
},
{
"epoch": 0.1895734597156398,
"grad_norm": 29.830224779254273,
"learning_rate": 8e-07,
"loss": 0.6873,
"step": 160
},
{
"epoch": 0.19075829383886256,
"grad_norm": 8.19405638351032,
"learning_rate": 8.05e-07,
"loss": 0.6849,
"step": 161
},
{
"epoch": 0.19194312796208532,
"grad_norm": 6.4691692398191165,
"learning_rate": 8.1e-07,
"loss": 0.6854,
"step": 162
},
{
"epoch": 0.19312796208530805,
"grad_norm": 7.639628649694936,
"learning_rate": 8.149999999999999e-07,
"loss": 0.6887,
"step": 163
},
{
"epoch": 0.1943127962085308,
"grad_norm": 17.413499986090972,
"learning_rate": 8.199999999999999e-07,
"loss": 0.6846,
"step": 164
},
{
"epoch": 0.19549763033175355,
"grad_norm": 5.309092918246534,
"learning_rate": 8.249999999999999e-07,
"loss": 0.6891,
"step": 165
},
{
"epoch": 0.1966824644549763,
"grad_norm": 7.446590034492426,
"learning_rate": 8.299999999999999e-07,
"loss": 0.6855,
"step": 166
},
{
"epoch": 0.19786729857819904,
"grad_norm": 9.96015478678009,
"learning_rate": 8.349999999999999e-07,
"loss": 0.6843,
"step": 167
},
{
"epoch": 0.1990521327014218,
"grad_norm": 12.633638991242824,
"learning_rate": 8.399999999999999e-07,
"loss": 0.6844,
"step": 168
},
{
"epoch": 0.20023696682464456,
"grad_norm": 7.533772786798129,
"learning_rate": 8.45e-07,
"loss": 0.685,
"step": 169
},
{
"epoch": 0.2014218009478673,
"grad_norm": 6.594946642463255,
"learning_rate": 8.499999999999999e-07,
"loss": 0.6851,
"step": 170
},
{
"epoch": 0.20260663507109006,
"grad_norm": 10.234001728156082,
"learning_rate": 8.55e-07,
"loss": 0.6845,
"step": 171
},
{
"epoch": 0.2037914691943128,
"grad_norm": 14.063644054463136,
"learning_rate": 8.599999999999999e-07,
"loss": 0.683,
"step": 172
},
{
"epoch": 0.20497630331753555,
"grad_norm": 7.492696336567383,
"learning_rate": 8.65e-07,
"loss": 0.6849,
"step": 173
},
{
"epoch": 0.20616113744075829,
"grad_norm": 9.371933323337045,
"learning_rate": 8.699999999999999e-07,
"loss": 0.6859,
"step": 174
},
{
"epoch": 0.20734597156398105,
"grad_norm": 14.68266196345255,
"learning_rate": 8.75e-07,
"loss": 0.685,
"step": 175
},
{
"epoch": 0.20853080568720378,
"grad_norm": 8.595805104024867,
"learning_rate": 8.799999999999999e-07,
"loss": 0.682,
"step": 176
},
{
"epoch": 0.20971563981042654,
"grad_norm": 8.176461365837259,
"learning_rate": 8.85e-07,
"loss": 0.6834,
"step": 177
},
{
"epoch": 0.2109004739336493,
"grad_norm": 24.75404438632194,
"learning_rate": 8.9e-07,
"loss": 0.6823,
"step": 178
},
{
"epoch": 0.21208530805687204,
"grad_norm": 14.936592212079153,
"learning_rate": 8.95e-07,
"loss": 0.6804,
"step": 179
},
{
"epoch": 0.2132701421800948,
"grad_norm": 8.958711285200756,
"learning_rate": 9e-07,
"loss": 0.6812,
"step": 180
},
{
"epoch": 0.21445497630331753,
"grad_norm": 19.370653934060627,
"learning_rate": 9.05e-07,
"loss": 0.6846,
"step": 181
},
{
"epoch": 0.2156398104265403,
"grad_norm": 11.292808912187253,
"learning_rate": 9.1e-07,
"loss": 0.6793,
"step": 182
},
{
"epoch": 0.21682464454976302,
"grad_norm": 20.1962448672265,
"learning_rate": 9.15e-07,
"loss": 0.6837,
"step": 183
},
{
"epoch": 0.21800947867298578,
"grad_norm": 7.866346608383013,
"learning_rate": 9.2e-07,
"loss": 0.6791,
"step": 184
},
{
"epoch": 0.21919431279620852,
"grad_norm": 8.265265530197812,
"learning_rate": 9.25e-07,
"loss": 0.6805,
"step": 185
},
{
"epoch": 0.22037914691943128,
"grad_norm": 8.27541943064013,
"learning_rate": 9.3e-07,
"loss": 0.6818,
"step": 186
},
{
"epoch": 0.22156398104265404,
"grad_norm": 9.175840706959432,
"learning_rate": 9.35e-07,
"loss": 0.6782,
"step": 187
},
{
"epoch": 0.22274881516587677,
"grad_norm": 8.750697832461059,
"learning_rate": 9.399999999999999e-07,
"loss": 0.6815,
"step": 188
},
{
"epoch": 0.22393364928909953,
"grad_norm": 19.807879890918883,
"learning_rate": 9.45e-07,
"loss": 0.6794,
"step": 189
},
{
"epoch": 0.22511848341232227,
"grad_norm": 9.84886237169711,
"learning_rate": 9.499999999999999e-07,
"loss": 0.6761,
"step": 190
},
{
"epoch": 0.22630331753554503,
"grad_norm": 8.847377027851778,
"learning_rate": 9.55e-07,
"loss": 0.674,
"step": 191
},
{
"epoch": 0.22748815165876776,
"grad_norm": 10.078019239583814,
"learning_rate": 9.6e-07,
"loss": 0.6785,
"step": 192
},
{
"epoch": 0.22867298578199052,
"grad_norm": 10.221043516188288,
"learning_rate": 9.649999999999999e-07,
"loss": 0.6731,
"step": 193
},
{
"epoch": 0.22985781990521326,
"grad_norm": 8.647282299307719,
"learning_rate": 9.7e-07,
"loss": 0.6738,
"step": 194
},
{
"epoch": 0.23104265402843602,
"grad_norm": 19.36746818547541,
"learning_rate": 9.75e-07,
"loss": 0.6771,
"step": 195
},
{
"epoch": 0.23222748815165878,
"grad_norm": 11.068473794452629,
"learning_rate": 9.8e-07,
"loss": 0.6738,
"step": 196
},
{
"epoch": 0.2334123222748815,
"grad_norm": 19.420363183493418,
"learning_rate": 9.849999999999999e-07,
"loss": 0.6715,
"step": 197
},
{
"epoch": 0.23459715639810427,
"grad_norm": 24.42503330707244,
"learning_rate": 9.9e-07,
"loss": 0.6701,
"step": 198
},
{
"epoch": 0.235781990521327,
"grad_norm": 16.175882257620756,
"learning_rate": 9.95e-07,
"loss": 0.6737,
"step": 199
},
{
"epoch": 0.23696682464454977,
"grad_norm": 20.652990783070912,
"learning_rate": 1e-06,
"loss": 0.6723,
"step": 200
},
{
"epoch": 0.23696682464454977,
"eval_loss": 0.6700453162193298,
"eval_runtime": 58.1638,
"eval_samples_per_second": 14.15,
"eval_steps_per_second": 0.894,
"step": 200
},
{
"epoch": 0.2381516587677725,
"grad_norm": 15.879396676885444,
"learning_rate": 9.999988856189192e-07,
"loss": 0.6766,
"step": 201
},
{
"epoch": 0.23933649289099526,
"grad_norm": 12.333108739189962,
"learning_rate": 9.999955424806443e-07,
"loss": 0.6731,
"step": 202
},
{
"epoch": 0.24052132701421802,
"grad_norm": 7.242095379151133,
"learning_rate": 9.99989970600077e-07,
"loss": 0.6751,
"step": 203
},
{
"epoch": 0.24170616113744076,
"grad_norm": 8.415435271063192,
"learning_rate": 9.999821700020548e-07,
"loss": 0.6734,
"step": 204
},
{
"epoch": 0.24289099526066352,
"grad_norm": 9.671774375618897,
"learning_rate": 9.999721407213486e-07,
"loss": 0.6702,
"step": 205
},
{
"epoch": 0.24407582938388625,
"grad_norm": 10.14187864614814,
"learning_rate": 9.999598828026642e-07,
"loss": 0.6681,
"step": 206
},
{
"epoch": 0.245260663507109,
"grad_norm": 15.681885174284957,
"learning_rate": 9.999453963006417e-07,
"loss": 0.6659,
"step": 207
},
{
"epoch": 0.24644549763033174,
"grad_norm": 10.7172396576495,
"learning_rate": 9.99928681279855e-07,
"loss": 0.6622,
"step": 208
},
{
"epoch": 0.2476303317535545,
"grad_norm": 11.41841503900657,
"learning_rate": 9.999097378148114e-07,
"loss": 0.6723,
"step": 209
},
{
"epoch": 0.24881516587677724,
"grad_norm": 17.238661996134446,
"learning_rate": 9.998885659899523e-07,
"loss": 0.6675,
"step": 210
},
{
"epoch": 0.25,
"grad_norm": 19.98765993245952,
"learning_rate": 9.998651658996514e-07,
"loss": 0.6611,
"step": 211
},
{
"epoch": 0.25118483412322273,
"grad_norm": 10.75284639990253,
"learning_rate": 9.998395376482152e-07,
"loss": 0.6593,
"step": 212
},
{
"epoch": 0.2523696682464455,
"grad_norm": 18.467338204557816,
"learning_rate": 9.998116813498823e-07,
"loss": 0.6517,
"step": 213
},
{
"epoch": 0.25355450236966826,
"grad_norm": 9.031428370949925,
"learning_rate": 9.99781597128823e-07,
"loss": 0.669,
"step": 214
},
{
"epoch": 0.254739336492891,
"grad_norm": 20.37162505190203,
"learning_rate": 9.997492851191378e-07,
"loss": 0.645,
"step": 215
},
{
"epoch": 0.2559241706161137,
"grad_norm": 14.505828957564063,
"learning_rate": 9.997147454648588e-07,
"loss": 0.6427,
"step": 216
},
{
"epoch": 0.2571090047393365,
"grad_norm": 29.49184766373109,
"learning_rate": 9.996779783199475e-07,
"loss": 0.6579,
"step": 217
},
{
"epoch": 0.25829383886255924,
"grad_norm": 13.33521595334444,
"learning_rate": 9.996389838482942e-07,
"loss": 0.6493,
"step": 218
},
{
"epoch": 0.259478672985782,
"grad_norm": 13.69883838689083,
"learning_rate": 9.995977622237173e-07,
"loss": 0.6475,
"step": 219
},
{
"epoch": 0.26066350710900477,
"grad_norm": 22.69929227147676,
"learning_rate": 9.995543136299635e-07,
"loss": 0.6572,
"step": 220
},
{
"epoch": 0.2618483412322275,
"grad_norm": 15.389124321700107,
"learning_rate": 9.995086382607063e-07,
"loss": 0.646,
"step": 221
},
{
"epoch": 0.26303317535545023,
"grad_norm": 12.968325719512725,
"learning_rate": 9.994607363195442e-07,
"loss": 0.6469,
"step": 222
},
{
"epoch": 0.26421800947867297,
"grad_norm": 16.79050316553183,
"learning_rate": 9.994106080200015e-07,
"loss": 0.6383,
"step": 223
},
{
"epoch": 0.26540284360189575,
"grad_norm": 17.7107351566139,
"learning_rate": 9.993582535855263e-07,
"loss": 0.6317,
"step": 224
},
{
"epoch": 0.2665876777251185,
"grad_norm": 13.001311901357276,
"learning_rate": 9.9930367324949e-07,
"loss": 0.6389,
"step": 225
},
{
"epoch": 0.2677725118483412,
"grad_norm": 12.560158599901298,
"learning_rate": 9.992468672551852e-07,
"loss": 0.6465,
"step": 226
},
{
"epoch": 0.26895734597156395,
"grad_norm": 17.194433268804165,
"learning_rate": 9.991878358558268e-07,
"loss": 0.628,
"step": 227
},
{
"epoch": 0.27014218009478674,
"grad_norm": 14.21824731123265,
"learning_rate": 9.991265793145479e-07,
"loss": 0.6286,
"step": 228
},
{
"epoch": 0.2713270142180095,
"grad_norm": 11.717975256273306,
"learning_rate": 9.990630979044014e-07,
"loss": 0.6417,
"step": 229
},
{
"epoch": 0.2725118483412322,
"grad_norm": 22.73985386534345,
"learning_rate": 9.989973919083573e-07,
"loss": 0.6294,
"step": 230
},
{
"epoch": 0.273696682464455,
"grad_norm": 17.287735794213862,
"learning_rate": 9.989294616193017e-07,
"loss": 0.6158,
"step": 231
},
{
"epoch": 0.27488151658767773,
"grad_norm": 23.899199553299088,
"learning_rate": 9.988593073400354e-07,
"loss": 0.6181,
"step": 232
},
{
"epoch": 0.27606635071090047,
"grad_norm": 19.875731229760778,
"learning_rate": 9.987869293832727e-07,
"loss": 0.609,
"step": 233
},
{
"epoch": 0.2772511848341232,
"grad_norm": 16.046553980851552,
"learning_rate": 9.987123280716402e-07,
"loss": 0.6294,
"step": 234
},
{
"epoch": 0.278436018957346,
"grad_norm": 12.483201348763467,
"learning_rate": 9.98635503737675e-07,
"loss": 0.6294,
"step": 235
},
{
"epoch": 0.2796208530805687,
"grad_norm": 17.59142339660279,
"learning_rate": 9.985564567238236e-07,
"loss": 0.6094,
"step": 236
},
{
"epoch": 0.28080568720379145,
"grad_norm": 16.71315419070765,
"learning_rate": 9.9847518738244e-07,
"loss": 0.6036,
"step": 237
},
{
"epoch": 0.28199052132701424,
"grad_norm": 24.971593610223145,
"learning_rate": 9.98391696075784e-07,
"loss": 0.6105,
"step": 238
},
{
"epoch": 0.283175355450237,
"grad_norm": 14.983923687572714,
"learning_rate": 9.983059831760205e-07,
"loss": 0.611,
"step": 239
},
{
"epoch": 0.2843601895734597,
"grad_norm": 20.998049091911874,
"learning_rate": 9.982180490652164e-07,
"loss": 0.5744,
"step": 240
},
{
"epoch": 0.28554502369668244,
"grad_norm": 26.833381930400048,
"learning_rate": 9.981278941353406e-07,
"loss": 0.6077,
"step": 241
},
{
"epoch": 0.28672985781990523,
"grad_norm": 18.431818850347863,
"learning_rate": 9.980355187882604e-07,
"loss": 0.6052,
"step": 242
},
{
"epoch": 0.28791469194312796,
"grad_norm": 13.590781471879904,
"learning_rate": 9.979409234357416e-07,
"loss": 0.6044,
"step": 243
},
{
"epoch": 0.2890995260663507,
"grad_norm": 16.197540843506868,
"learning_rate": 9.97844108499445e-07,
"loss": 0.5904,
"step": 244
},
{
"epoch": 0.2902843601895735,
"grad_norm": 14.326350645052589,
"learning_rate": 9.977450744109258e-07,
"loss": 0.6138,
"step": 245
},
{
"epoch": 0.2914691943127962,
"grad_norm": 16.000841952080826,
"learning_rate": 9.976438216116304e-07,
"loss": 0.5841,
"step": 246
},
{
"epoch": 0.29265402843601895,
"grad_norm": 18.889358277976672,
"learning_rate": 9.975403505528961e-07,
"loss": 0.575,
"step": 247
},
{
"epoch": 0.2938388625592417,
"grad_norm": 22.108322730446503,
"learning_rate": 9.974346616959475e-07,
"loss": 0.5583,
"step": 248
},
{
"epoch": 0.2950236966824645,
"grad_norm": 18.209727273646372,
"learning_rate": 9.973267555118952e-07,
"loss": 0.5823,
"step": 249
},
{
"epoch": 0.2962085308056872,
"grad_norm": 13.709315722628926,
"learning_rate": 9.972166324817336e-07,
"loss": 0.5913,
"step": 250
},
{
"epoch": 0.2962085308056872,
"eval_loss": 0.5701844692230225,
"eval_runtime": 55.0664,
"eval_samples_per_second": 14.946,
"eval_steps_per_second": 0.944,
"step": 250
},
{
"epoch": 0.29739336492890994,
"grad_norm": 18.07345575992755,
"learning_rate": 9.97104293096339e-07,
"loss": 0.5743,
"step": 251
},
{
"epoch": 0.2985781990521327,
"grad_norm": 20.622897768235585,
"learning_rate": 9.969897378564667e-07,
"loss": 0.5779,
"step": 252
},
{
"epoch": 0.29976303317535546,
"grad_norm": 13.016776898697422,
"learning_rate": 9.968729672727493e-07,
"loss": 0.5739,
"step": 253
},
{
"epoch": 0.3009478672985782,
"grad_norm": 17.91120105322002,
"learning_rate": 9.967539818656952e-07,
"loss": 0.5588,
"step": 254
},
{
"epoch": 0.30213270142180093,
"grad_norm": 19.232325042748638,
"learning_rate": 9.966327821656841e-07,
"loss": 0.5633,
"step": 255
},
{
"epoch": 0.3033175355450237,
"grad_norm": 23.47321995589726,
"learning_rate": 9.965093687129667e-07,
"loss": 0.5704,
"step": 256
},
{
"epoch": 0.30450236966824645,
"grad_norm": 12.62217881537126,
"learning_rate": 9.963837420576618e-07,
"loss": 0.5795,
"step": 257
},
{
"epoch": 0.3056872037914692,
"grad_norm": 16.57740235377143,
"learning_rate": 9.96255902759753e-07,
"loss": 0.5857,
"step": 258
},
{
"epoch": 0.3068720379146919,
"grad_norm": 14.94128961102294,
"learning_rate": 9.961258513890873e-07,
"loss": 0.5666,
"step": 259
},
{
"epoch": 0.3080568720379147,
"grad_norm": 11.83770792082476,
"learning_rate": 9.959935885253715e-07,
"loss": 0.5701,
"step": 260
},
{
"epoch": 0.30924170616113744,
"grad_norm": 16.858957675590407,
"learning_rate": 9.958591147581707e-07,
"loss": 0.5513,
"step": 261
},
{
"epoch": 0.3104265402843602,
"grad_norm": 8.613890836880318,
"learning_rate": 9.957224306869053e-07,
"loss": 0.5861,
"step": 262
},
{
"epoch": 0.31161137440758296,
"grad_norm": 24.30412660533916,
"learning_rate": 9.955835369208473e-07,
"loss": 0.5253,
"step": 263
},
{
"epoch": 0.3127962085308057,
"grad_norm": 13.509894066187833,
"learning_rate": 9.954424340791195e-07,
"loss": 0.5404,
"step": 264
},
{
"epoch": 0.31398104265402843,
"grad_norm": 19.97859230272331,
"learning_rate": 9.952991227906909e-07,
"loss": 0.4929,
"step": 265
},
{
"epoch": 0.31516587677725116,
"grad_norm": 14.659876223619262,
"learning_rate": 9.951536036943753e-07,
"loss": 0.5207,
"step": 266
},
{
"epoch": 0.31635071090047395,
"grad_norm": 16.82648724963916,
"learning_rate": 9.950058774388277e-07,
"loss": 0.5042,
"step": 267
},
{
"epoch": 0.3175355450236967,
"grad_norm": 16.48505871135186,
"learning_rate": 9.948559446825411e-07,
"loss": 0.5068,
"step": 268
},
{
"epoch": 0.3187203791469194,
"grad_norm": 8.986957964532992,
"learning_rate": 9.94703806093845e-07,
"loss": 0.5482,
"step": 269
},
{
"epoch": 0.31990521327014215,
"grad_norm": 15.557609628645618,
"learning_rate": 9.945494623509002e-07,
"loss": 0.5149,
"step": 270
},
{
"epoch": 0.32109004739336494,
"grad_norm": 8.220937327819602,
"learning_rate": 9.943929141416977e-07,
"loss": 0.5604,
"step": 271
},
{
"epoch": 0.3222748815165877,
"grad_norm": 8.409373218590941,
"learning_rate": 9.942341621640557e-07,
"loss": 0.5693,
"step": 272
},
{
"epoch": 0.3234597156398104,
"grad_norm": 16.43820378461214,
"learning_rate": 9.940732071256144e-07,
"loss": 0.4995,
"step": 273
},
{
"epoch": 0.3246445497630332,
"grad_norm": 13.252808150885926,
"learning_rate": 9.93910049743835e-07,
"loss": 0.5654,
"step": 274
},
{
"epoch": 0.32582938388625593,
"grad_norm": 13.986032395862484,
"learning_rate": 9.937446907459953e-07,
"loss": 0.4991,
"step": 275
},
{
"epoch": 0.32701421800947866,
"grad_norm": 17.89107100963545,
"learning_rate": 9.93577130869187e-07,
"loss": 0.5458,
"step": 276
},
{
"epoch": 0.3281990521327014,
"grad_norm": 11.837076385727045,
"learning_rate": 9.934073708603129e-07,
"loss": 0.5505,
"step": 277
},
{
"epoch": 0.3293838862559242,
"grad_norm": 24.996997176345527,
"learning_rate": 9.932354114760817e-07,
"loss": 0.4814,
"step": 278
},
{
"epoch": 0.3305687203791469,
"grad_norm": 18.253469210284003,
"learning_rate": 9.930612534830068e-07,
"loss": 0.4836,
"step": 279
},
{
"epoch": 0.33175355450236965,
"grad_norm": 18.194976720930658,
"learning_rate": 9.928848976574018e-07,
"loss": 0.5073,
"step": 280
},
{
"epoch": 0.33293838862559244,
"grad_norm": 12.062798161873806,
"learning_rate": 9.92706344785377e-07,
"loss": 0.5107,
"step": 281
},
{
"epoch": 0.3341232227488152,
"grad_norm": 8.94049601499799,
"learning_rate": 9.925255956628361e-07,
"loss": 0.518,
"step": 282
},
{
"epoch": 0.3353080568720379,
"grad_norm": 10.188890972987107,
"learning_rate": 9.92342651095473e-07,
"loss": 0.5235,
"step": 283
},
{
"epoch": 0.33649289099526064,
"grad_norm": 12.800443292579475,
"learning_rate": 9.921575118987671e-07,
"loss": 0.5658,
"step": 284
},
{
"epoch": 0.33767772511848343,
"grad_norm": 31.455015869085003,
"learning_rate": 9.919701788979812e-07,
"loss": 0.5875,
"step": 285
},
{
"epoch": 0.33886255924170616,
"grad_norm": 15.744017529825106,
"learning_rate": 9.917806529281566e-07,
"loss": 0.49,
"step": 286
},
{
"epoch": 0.3400473933649289,
"grad_norm": 19.703316838489233,
"learning_rate": 9.915889348341096e-07,
"loss": 0.5247,
"step": 287
},
{
"epoch": 0.3412322274881517,
"grad_norm": 11.492986218008904,
"learning_rate": 9.91395025470429e-07,
"loss": 0.5366,
"step": 288
},
{
"epoch": 0.3424170616113744,
"grad_norm": 11.964355876969535,
"learning_rate": 9.911989257014699e-07,
"loss": 0.4738,
"step": 289
},
{
"epoch": 0.34360189573459715,
"grad_norm": 10.275962280696506,
"learning_rate": 9.91000636401352e-07,
"loss": 0.5351,
"step": 290
},
{
"epoch": 0.3447867298578199,
"grad_norm": 10.213809707901728,
"learning_rate": 9.908001584539547e-07,
"loss": 0.5354,
"step": 291
},
{
"epoch": 0.3459715639810427,
"grad_norm": 17.95448112723858,
"learning_rate": 9.905974927529133e-07,
"loss": 0.5051,
"step": 292
},
{
"epoch": 0.3471563981042654,
"grad_norm": 26.40149829400645,
"learning_rate": 9.90392640201615e-07,
"loss": 0.4529,
"step": 293
},
{
"epoch": 0.34834123222748814,
"grad_norm": 13.875172918400288,
"learning_rate": 9.901856017131954e-07,
"loss": 0.4558,
"step": 294
},
{
"epoch": 0.3495260663507109,
"grad_norm": 11.52864219662223,
"learning_rate": 9.899763782105331e-07,
"loss": 0.4818,
"step": 295
},
{
"epoch": 0.35071090047393366,
"grad_norm": 21.328486936498273,
"learning_rate": 9.897649706262473e-07,
"loss": 0.4784,
"step": 296
},
{
"epoch": 0.3518957345971564,
"grad_norm": 17.693095134232415,
"learning_rate": 9.89551379902692e-07,
"loss": 0.4655,
"step": 297
},
{
"epoch": 0.35308056872037913,
"grad_norm": 25.968174875217105,
"learning_rate": 9.893356069919537e-07,
"loss": 0.5219,
"step": 298
},
{
"epoch": 0.3542654028436019,
"grad_norm": 19.931738567555474,
"learning_rate": 9.89117652855845e-07,
"loss": 0.5079,
"step": 299
},
{
"epoch": 0.35545023696682465,
"grad_norm": 12.373031819577077,
"learning_rate": 9.888975184659016e-07,
"loss": 0.4765,
"step": 300
},
{
"epoch": 0.35545023696682465,
"eval_loss": 0.4756671190261841,
"eval_runtime": 53.9082,
"eval_samples_per_second": 15.267,
"eval_steps_per_second": 0.965,
"step": 300
},
{
"epoch": 0.3566350710900474,
"grad_norm": 11.901393690452515,
"learning_rate": 9.886752048033784e-07,
"loss": 0.5016,
"step": 301
},
{
"epoch": 0.3578199052132701,
"grad_norm": 13.650779933985442,
"learning_rate": 9.884507128592434e-07,
"loss": 0.4753,
"step": 302
},
{
"epoch": 0.3590047393364929,
"grad_norm": 20.815524417557032,
"learning_rate": 9.882240436341753e-07,
"loss": 0.5277,
"step": 303
},
{
"epoch": 0.36018957345971564,
"grad_norm": 12.74609352334382,
"learning_rate": 9.879951981385577e-07,
"loss": 0.4511,
"step": 304
},
{
"epoch": 0.3613744075829384,
"grad_norm": 11.556206853163904,
"learning_rate": 9.877641773924747e-07,
"loss": 0.4753,
"step": 305
},
{
"epoch": 0.36255924170616116,
"grad_norm": 15.56091408911645,
"learning_rate": 9.87530982425707e-07,
"loss": 0.4923,
"step": 306
},
{
"epoch": 0.3637440758293839,
"grad_norm": 26.248499034472612,
"learning_rate": 9.872956142777269e-07,
"loss": 0.4912,
"step": 307
},
{
"epoch": 0.36492890995260663,
"grad_norm": 21.677314925110117,
"learning_rate": 9.870580739976935e-07,
"loss": 0.4263,
"step": 308
},
{
"epoch": 0.36611374407582936,
"grad_norm": 16.062837045314517,
"learning_rate": 9.868183626444486e-07,
"loss": 0.423,
"step": 309
},
{
"epoch": 0.36729857819905215,
"grad_norm": 22.884567704263773,
"learning_rate": 9.865764812865111e-07,
"loss": 0.486,
"step": 310
},
{
"epoch": 0.3684834123222749,
"grad_norm": 7.861313978764723,
"learning_rate": 9.863324310020733e-07,
"loss": 0.4869,
"step": 311
},
{
"epoch": 0.3696682464454976,
"grad_norm": 17.102991536445742,
"learning_rate": 9.860862128789952e-07,
"loss": 0.4391,
"step": 312
},
{
"epoch": 0.3708530805687204,
"grad_norm": 11.567087012823887,
"learning_rate": 9.858378280148002e-07,
"loss": 0.4084,
"step": 313
},
{
"epoch": 0.37203791469194314,
"grad_norm": 11.587792701499762,
"learning_rate": 9.855872775166694e-07,
"loss": 0.4433,
"step": 314
},
{
"epoch": 0.3732227488151659,
"grad_norm": 23.622410585506035,
"learning_rate": 9.853345625014383e-07,
"loss": 0.4305,
"step": 315
},
{
"epoch": 0.3744075829383886,
"grad_norm": 13.658748759519776,
"learning_rate": 9.850796840955899e-07,
"loss": 0.3982,
"step": 316
},
{
"epoch": 0.3755924170616114,
"grad_norm": 10.548121395453487,
"learning_rate": 9.848226434352512e-07,
"loss": 0.441,
"step": 317
},
{
"epoch": 0.3767772511848341,
"grad_norm": 8.899859469015052,
"learning_rate": 9.845634416661867e-07,
"loss": 0.4485,
"step": 318
},
{
"epoch": 0.37796208530805686,
"grad_norm": 13.742583201602377,
"learning_rate": 9.843020799437949e-07,
"loss": 0.3761,
"step": 319
},
{
"epoch": 0.3791469194312796,
"grad_norm": 13.610551297016709,
"learning_rate": 9.840385594331022e-07,
"loss": 0.4778,
"step": 320
},
{
"epoch": 0.3803317535545024,
"grad_norm": 9.005064796206403,
"learning_rate": 9.837728813087573e-07,
"loss": 0.4638,
"step": 321
},
{
"epoch": 0.3815165876777251,
"grad_norm": 14.791810107030555,
"learning_rate": 9.835050467550272e-07,
"loss": 0.4583,
"step": 322
},
{
"epoch": 0.38270142180094785,
"grad_norm": 10.65855589614688,
"learning_rate": 9.832350569657909e-07,
"loss": 0.4448,
"step": 323
},
{
"epoch": 0.38388625592417064,
"grad_norm": 9.546803884411506,
"learning_rate": 9.82962913144534e-07,
"loss": 0.4436,
"step": 324
},
{
"epoch": 0.38507109004739337,
"grad_norm": 10.287431160752277,
"learning_rate": 9.82688616504345e-07,
"loss": 0.4474,
"step": 325
},
{
"epoch": 0.3862559241706161,
"grad_norm": 17.228603164183177,
"learning_rate": 9.824121682679072e-07,
"loss": 0.3995,
"step": 326
},
{
"epoch": 0.38744075829383884,
"grad_norm": 13.097102197908033,
"learning_rate": 9.821335696674956e-07,
"loss": 0.4524,
"step": 327
},
{
"epoch": 0.3886255924170616,
"grad_norm": 8.529650091081256,
"learning_rate": 9.818528219449704e-07,
"loss": 0.4443,
"step": 328
},
{
"epoch": 0.38981042654028436,
"grad_norm": 8.07159063167578,
"learning_rate": 9.81569926351771e-07,
"loss": 0.4785,
"step": 329
},
{
"epoch": 0.3909952606635071,
"grad_norm": 12.444263995999188,
"learning_rate": 9.812848841489118e-07,
"loss": 0.3992,
"step": 330
},
{
"epoch": 0.3921800947867299,
"grad_norm": 13.809272050407982,
"learning_rate": 9.80997696606975e-07,
"loss": 0.4228,
"step": 331
},
{
"epoch": 0.3933649289099526,
"grad_norm": 12.96433701549732,
"learning_rate": 9.807083650061062e-07,
"loss": 0.4429,
"step": 332
},
{
"epoch": 0.39454976303317535,
"grad_norm": 25.039415619632724,
"learning_rate": 9.80416890636008e-07,
"loss": 0.4477,
"step": 333
},
{
"epoch": 0.3957345971563981,
"grad_norm": 11.801141489512451,
"learning_rate": 9.801232747959347e-07,
"loss": 0.4057,
"step": 334
},
{
"epoch": 0.39691943127962087,
"grad_norm": 10.40250553997614,
"learning_rate": 9.798275187946859e-07,
"loss": 0.4461,
"step": 335
},
{
"epoch": 0.3981042654028436,
"grad_norm": 15.30568042611723,
"learning_rate": 9.79529623950601e-07,
"loss": 0.3718,
"step": 336
},
{
"epoch": 0.39928909952606634,
"grad_norm": 14.314286641368666,
"learning_rate": 9.792295915915538e-07,
"loss": 0.4786,
"step": 337
},
{
"epoch": 0.4004739336492891,
"grad_norm": 25.85018234396281,
"learning_rate": 9.789274230549457e-07,
"loss": 0.3853,
"step": 338
},
{
"epoch": 0.40165876777251186,
"grad_norm": 11.907701908243332,
"learning_rate": 9.786231196877003e-07,
"loss": 0.4767,
"step": 339
},
{
"epoch": 0.4028436018957346,
"grad_norm": 15.77066249877433,
"learning_rate": 9.783166828462572e-07,
"loss": 0.416,
"step": 340
},
{
"epoch": 0.4040284360189573,
"grad_norm": 12.04950477801811,
"learning_rate": 9.780081138965663e-07,
"loss": 0.3606,
"step": 341
},
{
"epoch": 0.4052132701421801,
"grad_norm": 15.477847258339152,
"learning_rate": 9.77697414214081e-07,
"loss": 0.4484,
"step": 342
},
{
"epoch": 0.40639810426540285,
"grad_norm": 14.577869548106914,
"learning_rate": 9.773845851837526e-07,
"loss": 0.3849,
"step": 343
},
{
"epoch": 0.4075829383886256,
"grad_norm": 12.396572387524897,
"learning_rate": 9.770696282000244e-07,
"loss": 0.4499,
"step": 344
},
{
"epoch": 0.4087677725118483,
"grad_norm": 26.07143660678688,
"learning_rate": 9.767525446668245e-07,
"loss": 0.4213,
"step": 345
},
{
"epoch": 0.4099526066350711,
"grad_norm": 11.257790868634928,
"learning_rate": 9.764333359975609e-07,
"loss": 0.3746,
"step": 346
},
{
"epoch": 0.41113744075829384,
"grad_norm": 12.642981496078672,
"learning_rate": 9.761120036151135e-07,
"loss": 0.3848,
"step": 347
},
{
"epoch": 0.41232227488151657,
"grad_norm": 10.063976006804062,
"learning_rate": 9.757885489518296e-07,
"loss": 0.4124,
"step": 348
},
{
"epoch": 0.41350710900473936,
"grad_norm": 17.188706479555265,
"learning_rate": 9.754629734495162e-07,
"loss": 0.4039,
"step": 349
},
{
"epoch": 0.4146919431279621,
"grad_norm": 9.497211967243476,
"learning_rate": 9.751352785594336e-07,
"loss": 0.382,
"step": 350
},
{
"epoch": 0.4146919431279621,
"eval_loss": 0.39488136768341064,
"eval_runtime": 52.1468,
"eval_samples_per_second": 15.782,
"eval_steps_per_second": 0.997,
"step": 350
},
{
"epoch": 0.4158767772511848,
"grad_norm": 12.756224679112309,
"learning_rate": 9.748054657422901e-07,
"loss": 0.4015,
"step": 351
},
{
"epoch": 0.41706161137440756,
"grad_norm": 9.07206117193569,
"learning_rate": 9.744735364682344e-07,
"loss": 0.3581,
"step": 352
},
{
"epoch": 0.41824644549763035,
"grad_norm": 23.106705695654462,
"learning_rate": 9.741394922168494e-07,
"loss": 0.3788,
"step": 353
},
{
"epoch": 0.4194312796208531,
"grad_norm": 10.974238261984121,
"learning_rate": 9.73803334477145e-07,
"loss": 0.4242,
"step": 354
},
{
"epoch": 0.4206161137440758,
"grad_norm": 12.994216146112727,
"learning_rate": 9.73465064747553e-07,
"loss": 0.403,
"step": 355
},
{
"epoch": 0.4218009478672986,
"grad_norm": 18.925007359784697,
"learning_rate": 9.731246845359184e-07,
"loss": 0.3239,
"step": 356
},
{
"epoch": 0.42298578199052134,
"grad_norm": 8.90716567987188,
"learning_rate": 9.727821953594949e-07,
"loss": 0.4393,
"step": 357
},
{
"epoch": 0.42417061611374407,
"grad_norm": 9.875417282979411,
"learning_rate": 9.724375987449358e-07,
"loss": 0.431,
"step": 358
},
{
"epoch": 0.4253554502369668,
"grad_norm": 9.978683204562415,
"learning_rate": 9.720908962282891e-07,
"loss": 0.3797,
"step": 359
},
{
"epoch": 0.4265402843601896,
"grad_norm": 12.328565183998132,
"learning_rate": 9.7174208935499e-07,
"loss": 0.3872,
"step": 360
},
{
"epoch": 0.4277251184834123,
"grad_norm": 16.8038180357482,
"learning_rate": 9.713911796798532e-07,
"loss": 0.326,
"step": 361
},
{
"epoch": 0.42890995260663506,
"grad_norm": 13.56237675029536,
"learning_rate": 9.710381687670674e-07,
"loss": 0.3193,
"step": 362
},
{
"epoch": 0.43009478672985785,
"grad_norm": 9.779248850664269,
"learning_rate": 9.70683058190187e-07,
"loss": 0.4309,
"step": 363
},
{
"epoch": 0.4312796208530806,
"grad_norm": 25.506229124536276,
"learning_rate": 9.703258495321265e-07,
"loss": 0.3538,
"step": 364
},
{
"epoch": 0.4324644549763033,
"grad_norm": 20.45504872820827,
"learning_rate": 9.699665443851516e-07,
"loss": 0.3492,
"step": 365
},
{
"epoch": 0.43364928909952605,
"grad_norm": 13.89447348939714,
"learning_rate": 9.696051443508743e-07,
"loss": 0.3398,
"step": 366
},
{
"epoch": 0.43483412322274884,
"grad_norm": 8.113018348581734,
"learning_rate": 9.692416510402438e-07,
"loss": 0.3549,
"step": 367
},
{
"epoch": 0.43601895734597157,
"grad_norm": 9.72301680291045,
"learning_rate": 9.688760660735402e-07,
"loss": 0.3721,
"step": 368
},
{
"epoch": 0.4372037914691943,
"grad_norm": 33.8473636494632,
"learning_rate": 9.685083910803675e-07,
"loss": 0.3994,
"step": 369
},
{
"epoch": 0.43838862559241704,
"grad_norm": 13.79742948772135,
"learning_rate": 9.681386276996462e-07,
"loss": 0.2895,
"step": 370
},
{
"epoch": 0.4395734597156398,
"grad_norm": 20.772884497594852,
"learning_rate": 9.677667775796051e-07,
"loss": 0.3482,
"step": 371
},
{
"epoch": 0.44075829383886256,
"grad_norm": 8.179600721905086,
"learning_rate": 9.673928423777756e-07,
"loss": 0.3725,
"step": 372
},
{
"epoch": 0.4419431279620853,
"grad_norm": 13.178159783502442,
"learning_rate": 9.670168237609826e-07,
"loss": 0.2856,
"step": 373
},
{
"epoch": 0.4431279620853081,
"grad_norm": 8.912426226468748,
"learning_rate": 9.666387234053385e-07,
"loss": 0.3303,
"step": 374
},
{
"epoch": 0.4443127962085308,
"grad_norm": 21.400582293310897,
"learning_rate": 9.662585429962343e-07,
"loss": 0.2905,
"step": 375
},
{
"epoch": 0.44549763033175355,
"grad_norm": 32.80855736267371,
"learning_rate": 9.658762842283341e-07,
"loss": 0.3911,
"step": 376
},
{
"epoch": 0.4466824644549763,
"grad_norm": 22.395443998215274,
"learning_rate": 9.654919488055655e-07,
"loss": 0.4053,
"step": 377
},
{
"epoch": 0.44786729857819907,
"grad_norm": 23.236385222445783,
"learning_rate": 9.651055384411128e-07,
"loss": 0.3227,
"step": 378
},
{
"epoch": 0.4490521327014218,
"grad_norm": 18.495951608692828,
"learning_rate": 9.647170548574096e-07,
"loss": 0.3531,
"step": 379
},
{
"epoch": 0.45023696682464454,
"grad_norm": 10.010304118535766,
"learning_rate": 9.643264997861312e-07,
"loss": 0.3041,
"step": 380
},
{
"epoch": 0.4514218009478673,
"grad_norm": 16.208569360943383,
"learning_rate": 9.639338749681858e-07,
"loss": 0.3441,
"step": 381
},
{
"epoch": 0.45260663507109006,
"grad_norm": 20.365615006060256,
"learning_rate": 9.635391821537087e-07,
"loss": 0.2808,
"step": 382
},
{
"epoch": 0.4537914691943128,
"grad_norm": 10.662406472341873,
"learning_rate": 9.631424231020522e-07,
"loss": 0.3038,
"step": 383
},
{
"epoch": 0.4549763033175355,
"grad_norm": 18.323625372296732,
"learning_rate": 9.627435995817797e-07,
"loss": 0.3919,
"step": 384
},
{
"epoch": 0.4561611374407583,
"grad_norm": 44.08391965239445,
"learning_rate": 9.623427133706567e-07,
"loss": 0.393,
"step": 385
},
{
"epoch": 0.45734597156398105,
"grad_norm": 27.040918313360542,
"learning_rate": 9.619397662556433e-07,
"loss": 0.3563,
"step": 386
},
{
"epoch": 0.4585308056872038,
"grad_norm": 25.347089501942985,
"learning_rate": 9.61534760032886e-07,
"loss": 0.3483,
"step": 387
},
{
"epoch": 0.4597156398104265,
"grad_norm": 17.035195231859507,
"learning_rate": 9.611276965077097e-07,
"loss": 0.3364,
"step": 388
},
{
"epoch": 0.4609004739336493,
"grad_norm": 21.246750011503874,
"learning_rate": 9.607185774946104e-07,
"loss": 0.3101,
"step": 389
},
{
"epoch": 0.46208530805687204,
"grad_norm": 17.210843425254826,
"learning_rate": 9.603074048172457e-07,
"loss": 0.3319,
"step": 390
},
{
"epoch": 0.46327014218009477,
"grad_norm": 24.98240664073611,
"learning_rate": 9.59894180308428e-07,
"loss": 0.3649,
"step": 391
},
{
"epoch": 0.46445497630331756,
"grad_norm": 24.468616634471285,
"learning_rate": 9.594789058101153e-07,
"loss": 0.3073,
"step": 392
},
{
"epoch": 0.4656398104265403,
"grad_norm": 15.180353369606431,
"learning_rate": 9.59061583173404e-07,
"loss": 0.3272,
"step": 393
},
{
"epoch": 0.466824644549763,
"grad_norm": 11.417227295893692,
"learning_rate": 9.5864221425852e-07,
"loss": 0.2914,
"step": 394
},
{
"epoch": 0.46800947867298576,
"grad_norm": 28.69801888783048,
"learning_rate": 9.582208009348102e-07,
"loss": 0.3353,
"step": 395
},
{
"epoch": 0.46919431279620855,
"grad_norm": 14.918763851473592,
"learning_rate": 9.577973450807351e-07,
"loss": 0.3012,
"step": 396
},
{
"epoch": 0.4703791469194313,
"grad_norm": 9.293352558696206,
"learning_rate": 9.57371848583859e-07,
"loss": 0.3748,
"step": 397
},
{
"epoch": 0.471563981042654,
"grad_norm": 15.565028469986906,
"learning_rate": 9.569443133408433e-07,
"loss": 0.2178,
"step": 398
},
{
"epoch": 0.4727488151658768,
"grad_norm": 10.490221123541932,
"learning_rate": 9.565147412574365e-07,
"loss": 0.2922,
"step": 399
},
{
"epoch": 0.47393364928909953,
"grad_norm": 11.285895674191094,
"learning_rate": 9.560831342484666e-07,
"loss": 0.3337,
"step": 400
},
{
"epoch": 0.47393364928909953,
"eval_loss": 0.330232173204422,
"eval_runtime": 53.3543,
"eval_samples_per_second": 15.425,
"eval_steps_per_second": 0.975,
"step": 400
},
{
"epoch": 0.47511848341232227,
"grad_norm": 28.038068999167958,
"learning_rate": 9.556494942378326e-07,
"loss": 0.3151,
"step": 401
},
{
"epoch": 0.476303317535545,
"grad_norm": 11.577255213698676,
"learning_rate": 9.55213823158495e-07,
"loss": 0.3169,
"step": 402
},
{
"epoch": 0.4774881516587678,
"grad_norm": 28.394190964261917,
"learning_rate": 9.547761229524686e-07,
"loss": 0.3333,
"step": 403
},
{
"epoch": 0.4786729857819905,
"grad_norm": 15.348322403958202,
"learning_rate": 9.543363955708124e-07,
"loss": 0.2914,
"step": 404
},
{
"epoch": 0.47985781990521326,
"grad_norm": 18.702903061519496,
"learning_rate": 9.538946429736222e-07,
"loss": 0.2772,
"step": 405
},
{
"epoch": 0.48104265402843605,
"grad_norm": 16.755380600846234,
"learning_rate": 9.534508671300207e-07,
"loss": 0.2896,
"step": 406
},
{
"epoch": 0.4822274881516588,
"grad_norm": 20.912613748172376,
"learning_rate": 9.530050700181498e-07,
"loss": 0.2833,
"step": 407
},
{
"epoch": 0.4834123222748815,
"grad_norm": 20.828932167424576,
"learning_rate": 9.525572536251605e-07,
"loss": 0.3388,
"step": 408
},
{
"epoch": 0.48459715639810425,
"grad_norm": 10.1105886949867,
"learning_rate": 9.521074199472058e-07,
"loss": 0.304,
"step": 409
},
{
"epoch": 0.48578199052132703,
"grad_norm": 13.465306721197914,
"learning_rate": 9.516555709894298e-07,
"loss": 0.3369,
"step": 410
},
{
"epoch": 0.48696682464454977,
"grad_norm": 13.601780436154742,
"learning_rate": 9.512017087659607e-07,
"loss": 0.2554,
"step": 411
},
{
"epoch": 0.4881516587677725,
"grad_norm": 33.58961567504914,
"learning_rate": 9.507458352999001e-07,
"loss": 0.2462,
"step": 412
},
{
"epoch": 0.48933649289099523,
"grad_norm": 24.341590716742427,
"learning_rate": 9.50287952623315e-07,
"loss": 0.3197,
"step": 413
},
{
"epoch": 0.490521327014218,
"grad_norm": 30.667743812966282,
"learning_rate": 9.498280627772286e-07,
"loss": 0.3444,
"step": 414
},
{
"epoch": 0.49170616113744076,
"grad_norm": 11.036970679678214,
"learning_rate": 9.493661678116111e-07,
"loss": 0.2676,
"step": 415
},
{
"epoch": 0.4928909952606635,
"grad_norm": 13.416233053816551,
"learning_rate": 9.489022697853708e-07,
"loss": 0.2569,
"step": 416
},
{
"epoch": 0.4940758293838863,
"grad_norm": 18.498918047484647,
"learning_rate": 9.484363707663441e-07,
"loss": 0.2786,
"step": 417
},
{
"epoch": 0.495260663507109,
"grad_norm": 13.091240724568658,
"learning_rate": 9.479684728312873e-07,
"loss": 0.3145,
"step": 418
},
{
"epoch": 0.49644549763033174,
"grad_norm": 15.68575581642405,
"learning_rate": 9.474985780658669e-07,
"loss": 0.3368,
"step": 419
},
{
"epoch": 0.4976303317535545,
"grad_norm": 10.607199617999157,
"learning_rate": 9.470266885646503e-07,
"loss": 0.3069,
"step": 420
},
{
"epoch": 0.49881516587677727,
"grad_norm": 11.646651108047797,
"learning_rate": 9.465528064310962e-07,
"loss": 0.2546,
"step": 421
},
{
"epoch": 0.5,
"grad_norm": 33.53142097878634,
"learning_rate": 9.46076933777546e-07,
"loss": 0.2844,
"step": 422
},
{
"epoch": 0.5011848341232228,
"grad_norm": 49.38351100638877,
"learning_rate": 9.455990727252134e-07,
"loss": 0.3986,
"step": 423
},
{
"epoch": 0.5023696682464455,
"grad_norm": 76.11068369059677,
"learning_rate": 9.451192254041758e-07,
"loss": 0.4806,
"step": 424
},
{
"epoch": 0.5035545023696683,
"grad_norm": 20.921249429852224,
"learning_rate": 9.446373939533642e-07,
"loss": 0.2513,
"step": 425
},
{
"epoch": 0.504739336492891,
"grad_norm": 10.556489076929935,
"learning_rate": 9.44153580520554e-07,
"loss": 0.2736,
"step": 426
},
{
"epoch": 0.5059241706161137,
"grad_norm": 11.161679055753279,
"learning_rate": 9.436677872623556e-07,
"loss": 0.3311,
"step": 427
},
{
"epoch": 0.5071090047393365,
"grad_norm": 29.411384894701257,
"learning_rate": 9.431800163442041e-07,
"loss": 0.2743,
"step": 428
},
{
"epoch": 0.5082938388625592,
"grad_norm": 24.09078330704124,
"learning_rate": 9.426902699403501e-07,
"loss": 0.2984,
"step": 429
},
{
"epoch": 0.509478672985782,
"grad_norm": 41.60195448463097,
"learning_rate": 9.421985502338503e-07,
"loss": 0.3429,
"step": 430
},
{
"epoch": 0.5106635071090048,
"grad_norm": 20.506598663359913,
"learning_rate": 9.417048594165571e-07,
"loss": 0.2812,
"step": 431
},
{
"epoch": 0.5118483412322274,
"grad_norm": 9.64959188004726,
"learning_rate": 9.412091996891095e-07,
"loss": 0.3062,
"step": 432
},
{
"epoch": 0.5130331753554502,
"grad_norm": 9.836481752429766,
"learning_rate": 9.407115732609227e-07,
"loss": 0.2302,
"step": 433
},
{
"epoch": 0.514218009478673,
"grad_norm": 17.30655443858829,
"learning_rate": 9.402119823501785e-07,
"loss": 0.2861,
"step": 434
},
{
"epoch": 0.5154028436018957,
"grad_norm": 9.843572391305129,
"learning_rate": 9.397104291838157e-07,
"loss": 0.2392,
"step": 435
},
{
"epoch": 0.5165876777251185,
"grad_norm": 29.15515296698138,
"learning_rate": 9.392069159975198e-07,
"loss": 0.2655,
"step": 436
},
{
"epoch": 0.5177725118483413,
"grad_norm": 12.813628762652185,
"learning_rate": 9.387014450357127e-07,
"loss": 0.2816,
"step": 437
},
{
"epoch": 0.518957345971564,
"grad_norm": 17.614534795911805,
"learning_rate": 9.381940185515439e-07,
"loss": 0.3352,
"step": 438
},
{
"epoch": 0.5201421800947867,
"grad_norm": 11.6468061201244,
"learning_rate": 9.376846388068791e-07,
"loss": 0.2779,
"step": 439
},
{
"epoch": 0.5213270142180095,
"grad_norm": 8.711937405694224,
"learning_rate": 9.37173308072291e-07,
"loss": 0.2976,
"step": 440
},
{
"epoch": 0.5225118483412322,
"grad_norm": 12.218827336023685,
"learning_rate": 9.366600286270488e-07,
"loss": 0.2385,
"step": 441
},
{
"epoch": 0.523696682464455,
"grad_norm": 8.929215350730324,
"learning_rate": 9.361448027591079e-07,
"loss": 0.2481,
"step": 442
},
{
"epoch": 0.5248815165876777,
"grad_norm": 19.502457511436713,
"learning_rate": 9.356276327651005e-07,
"loss": 0.3382,
"step": 443
},
{
"epoch": 0.5260663507109005,
"grad_norm": 9.4443581912619,
"learning_rate": 9.35108520950324e-07,
"loss": 0.2613,
"step": 444
},
{
"epoch": 0.5272511848341233,
"grad_norm": 14.850874350089374,
"learning_rate": 9.345874696287323e-07,
"loss": 0.2541,
"step": 445
},
{
"epoch": 0.5284360189573459,
"grad_norm": 9.118502911961933,
"learning_rate": 9.340644811229242e-07,
"loss": 0.276,
"step": 446
},
{
"epoch": 0.5296208530805687,
"grad_norm": 23.523085623060844,
"learning_rate": 9.335395577641336e-07,
"loss": 0.2921,
"step": 447
},
{
"epoch": 0.5308056872037915,
"grad_norm": 18.972413118807122,
"learning_rate": 9.330127018922193e-07,
"loss": 0.3997,
"step": 448
},
{
"epoch": 0.5319905213270142,
"grad_norm": 17.752099885602984,
"learning_rate": 9.324839158556541e-07,
"loss": 0.2269,
"step": 449
},
{
"epoch": 0.533175355450237,
"grad_norm": 16.130571939179283,
"learning_rate": 9.319532020115146e-07,
"loss": 0.2667,
"step": 450
},
{
"epoch": 0.533175355450237,
"eval_loss": 0.2882239818572998,
"eval_runtime": 57.9598,
"eval_samples_per_second": 14.199,
"eval_steps_per_second": 0.897,
"step": 450
},
{
"epoch": 0.5343601895734598,
"grad_norm": 10.798615084456,
"learning_rate": 9.314205627254705e-07,
"loss": 0.3282,
"step": 451
},
{
"epoch": 0.5355450236966824,
"grad_norm": 9.774035887071843,
"learning_rate": 9.308860003717748e-07,
"loss": 0.281,
"step": 452
},
{
"epoch": 0.5367298578199052,
"grad_norm": 16.038788995378557,
"learning_rate": 9.303495173332518e-07,
"loss": 0.2562,
"step": 453
},
{
"epoch": 0.5379146919431279,
"grad_norm": 11.384133516677016,
"learning_rate": 9.298111160012879e-07,
"loss": 0.3327,
"step": 454
},
{
"epoch": 0.5390995260663507,
"grad_norm": 8.79727604270708,
"learning_rate": 9.2927079877582e-07,
"loss": 0.2762,
"step": 455
},
{
"epoch": 0.5402843601895735,
"grad_norm": 18.163749435039747,
"learning_rate": 9.287285680653254e-07,
"loss": 0.3339,
"step": 456
},
{
"epoch": 0.5414691943127962,
"grad_norm": 14.60130218396388,
"learning_rate": 9.281844262868107e-07,
"loss": 0.2554,
"step": 457
},
{
"epoch": 0.542654028436019,
"grad_norm": 13.71276669752716,
"learning_rate": 9.27638375865801e-07,
"loss": 0.2719,
"step": 458
},
{
"epoch": 0.5438388625592417,
"grad_norm": 9.152330320021967,
"learning_rate": 9.270904192363293e-07,
"loss": 0.271,
"step": 459
},
{
"epoch": 0.5450236966824644,
"grad_norm": 8.622874280639174,
"learning_rate": 9.265405588409256e-07,
"loss": 0.2657,
"step": 460
},
{
"epoch": 0.5462085308056872,
"grad_norm": 26.89194137073647,
"learning_rate": 9.259887971306063e-07,
"loss": 0.31,
"step": 461
},
{
"epoch": 0.54739336492891,
"grad_norm": 20.12125062032546,
"learning_rate": 9.254351365648623e-07,
"loss": 0.2562,
"step": 462
},
{
"epoch": 0.5485781990521327,
"grad_norm": 12.317200130554701,
"learning_rate": 9.248795796116491e-07,
"loss": 0.2975,
"step": 463
},
{
"epoch": 0.5497630331753555,
"grad_norm": 11.025734474055628,
"learning_rate": 9.243221287473755e-07,
"loss": 0.2336,
"step": 464
},
{
"epoch": 0.5509478672985783,
"grad_norm": 27.10300511896542,
"learning_rate": 9.23762786456892e-07,
"loss": 0.3397,
"step": 465
},
{
"epoch": 0.5521327014218009,
"grad_norm": 8.033107110139403,
"learning_rate": 9.232015552334806e-07,
"loss": 0.2816,
"step": 466
},
{
"epoch": 0.5533175355450237,
"grad_norm": 8.737091390744437,
"learning_rate": 9.226384375788434e-07,
"loss": 0.2823,
"step": 467
},
{
"epoch": 0.5545023696682464,
"grad_norm": 15.494483524008073,
"learning_rate": 9.220734360030906e-07,
"loss": 0.2121,
"step": 468
},
{
"epoch": 0.5556872037914692,
"grad_norm": 18.735896230365537,
"learning_rate": 9.215065530247308e-07,
"loss": 0.2533,
"step": 469
},
{
"epoch": 0.556872037914692,
"grad_norm": 23.81669178709177,
"learning_rate": 9.209377911706584e-07,
"loss": 0.2746,
"step": 470
},
{
"epoch": 0.5580568720379147,
"grad_norm": 21.159454245219628,
"learning_rate": 9.203671529761434e-07,
"loss": 0.2383,
"step": 471
},
{
"epoch": 0.5592417061611374,
"grad_norm": 9.301848540598128,
"learning_rate": 9.197946409848194e-07,
"loss": 0.2104,
"step": 472
},
{
"epoch": 0.5604265402843602,
"grad_norm": 23.482486410314653,
"learning_rate": 9.192202577486724e-07,
"loss": 0.2782,
"step": 473
},
{
"epoch": 0.5616113744075829,
"grad_norm": 17.27462356394257,
"learning_rate": 9.186440058280298e-07,
"loss": 0.2889,
"step": 474
},
{
"epoch": 0.5627962085308057,
"grad_norm": 15.545463980531496,
"learning_rate": 9.180658877915484e-07,
"loss": 0.2527,
"step": 475
},
{
"epoch": 0.5639810426540285,
"grad_norm": 11.567535127758402,
"learning_rate": 9.174859062162037e-07,
"loss": 0.1832,
"step": 476
},
{
"epoch": 0.5651658767772512,
"grad_norm": 9.388250647196948,
"learning_rate": 9.169040636872773e-07,
"loss": 0.2731,
"step": 477
},
{
"epoch": 0.566350710900474,
"grad_norm": 23.509643846600934,
"learning_rate": 9.163203627983466e-07,
"loss": 0.3582,
"step": 478
},
{
"epoch": 0.5675355450236966,
"grad_norm": 12.082207751549461,
"learning_rate": 9.157348061512726e-07,
"loss": 0.2983,
"step": 479
},
{
"epoch": 0.5687203791469194,
"grad_norm": 16.662630945663924,
"learning_rate": 9.151473963561882e-07,
"loss": 0.2527,
"step": 480
},
{
"epoch": 0.5699052132701422,
"grad_norm": 40.485858820321404,
"learning_rate": 9.145581360314867e-07,
"loss": 0.2963,
"step": 481
},
{
"epoch": 0.5710900473933649,
"grad_norm": 9.050228610199474,
"learning_rate": 9.139670278038107e-07,
"loss": 0.2482,
"step": 482
},
{
"epoch": 0.5722748815165877,
"grad_norm": 9.662434463280741,
"learning_rate": 9.133740743080392e-07,
"loss": 0.2478,
"step": 483
},
{
"epoch": 0.5734597156398105,
"grad_norm": 21.678208323314635,
"learning_rate": 9.127792781872768e-07,
"loss": 0.239,
"step": 484
},
{
"epoch": 0.5746445497630331,
"grad_norm": 11.468515771661801,
"learning_rate": 9.12182642092842e-07,
"loss": 0.2798,
"step": 485
},
{
"epoch": 0.5758293838862559,
"grad_norm": 14.951189679567591,
"learning_rate": 9.115841686842543e-07,
"loss": 0.2098,
"step": 486
},
{
"epoch": 0.5770142180094787,
"grad_norm": 19.92647185342026,
"learning_rate": 9.109838606292239e-07,
"loss": 0.2159,
"step": 487
},
{
"epoch": 0.5781990521327014,
"grad_norm": 13.36729958783011,
"learning_rate": 9.103817206036382e-07,
"loss": 0.2629,
"step": 488
},
{
"epoch": 0.5793838862559242,
"grad_norm": 9.313353223167388,
"learning_rate": 9.09777751291551e-07,
"loss": 0.2911,
"step": 489
},
{
"epoch": 0.580568720379147,
"grad_norm": 9.262594651674652,
"learning_rate": 9.091719553851706e-07,
"loss": 0.2717,
"step": 490
},
{
"epoch": 0.5817535545023697,
"grad_norm": 29.4435625345033,
"learning_rate": 9.085643355848466e-07,
"loss": 0.2651,
"step": 491
},
{
"epoch": 0.5829383886255924,
"grad_norm": 18.54225369010181,
"learning_rate": 9.079548945990592e-07,
"loss": 0.2638,
"step": 492
},
{
"epoch": 0.5841232227488151,
"grad_norm": 19.484158097176785,
"learning_rate": 9.073436351444064e-07,
"loss": 0.2397,
"step": 493
},
{
"epoch": 0.5853080568720379,
"grad_norm": 13.867282359289979,
"learning_rate": 9.067305599455919e-07,
"loss": 0.2688,
"step": 494
},
{
"epoch": 0.5864928909952607,
"grad_norm": 22.877893811260783,
"learning_rate": 9.061156717354137e-07,
"loss": 0.298,
"step": 495
},
{
"epoch": 0.5876777251184834,
"grad_norm": 15.47565263048955,
"learning_rate": 9.054989732547506e-07,
"loss": 0.1763,
"step": 496
},
{
"epoch": 0.5888625592417062,
"grad_norm": 22.335682115478164,
"learning_rate": 9.048804672525512e-07,
"loss": 0.3208,
"step": 497
},
{
"epoch": 0.590047393364929,
"grad_norm": 28.746533629080943,
"learning_rate": 9.042601564858212e-07,
"loss": 0.2347,
"step": 498
},
{
"epoch": 0.5912322274881516,
"grad_norm": 13.201402592912574,
"learning_rate": 9.036380437196108e-07,
"loss": 0.2278,
"step": 499
},
{
"epoch": 0.5924170616113744,
"grad_norm": 9.51664794041542,
"learning_rate": 9.030141317270025e-07,
"loss": 0.2287,
"step": 500
},
{
"epoch": 0.5924170616113744,
"eval_loss": 0.2544219195842743,
"eval_runtime": 60.3512,
"eval_samples_per_second": 13.637,
"eval_steps_per_second": 0.862,
"step": 500
},
{
"epoch": 0.5936018957345972,
"grad_norm": 9.947570386307305,
"learning_rate": 9.023884232890997e-07,
"loss": 0.2918,
"step": 501
},
{
"epoch": 0.5947867298578199,
"grad_norm": 14.937306114287768,
"learning_rate": 9.017609211950126e-07,
"loss": 0.2268,
"step": 502
},
{
"epoch": 0.5959715639810427,
"grad_norm": 20.03266579033559,
"learning_rate": 9.011316282418473e-07,
"loss": 0.2355,
"step": 503
},
{
"epoch": 0.5971563981042654,
"grad_norm": 10.688118849978588,
"learning_rate": 9.005005472346923e-07,
"loss": 0.2649,
"step": 504
},
{
"epoch": 0.5983412322274881,
"grad_norm": 21.412875220411518,
"learning_rate": 8.998676809866066e-07,
"loss": 0.2503,
"step": 505
},
{
"epoch": 0.5995260663507109,
"grad_norm": 11.743872196709809,
"learning_rate": 8.992330323186068e-07,
"loss": 0.2682,
"step": 506
},
{
"epoch": 0.6007109004739336,
"grad_norm": 11.771441009882276,
"learning_rate": 8.985966040596549e-07,
"loss": 0.2756,
"step": 507
},
{
"epoch": 0.6018957345971564,
"grad_norm": 17.976263194907734,
"learning_rate": 8.979583990466452e-07,
"loss": 0.202,
"step": 508
},
{
"epoch": 0.6030805687203792,
"grad_norm": 25.90047189441292,
"learning_rate": 8.973184201243922e-07,
"loss": 0.2376,
"step": 509
},
{
"epoch": 0.6042654028436019,
"grad_norm": 20.62266389659553,
"learning_rate": 8.966766701456176e-07,
"loss": 0.3115,
"step": 510
},
{
"epoch": 0.6054502369668247,
"grad_norm": 23.091302678759135,
"learning_rate": 8.960331519709372e-07,
"loss": 0.2091,
"step": 511
},
{
"epoch": 0.6066350710900474,
"grad_norm": 16.47382278548211,
"learning_rate": 8.953878684688492e-07,
"loss": 0.2174,
"step": 512
},
{
"epoch": 0.6078199052132701,
"grad_norm": 14.906355441019993,
"learning_rate": 8.947408225157205e-07,
"loss": 0.2446,
"step": 513
},
{
"epoch": 0.6090047393364929,
"grad_norm": 8.216540291358418,
"learning_rate": 8.940920169957739e-07,
"loss": 0.2575,
"step": 514
},
{
"epoch": 0.6101895734597157,
"grad_norm": 13.034643278399566,
"learning_rate": 8.934414548010762e-07,
"loss": 0.197,
"step": 515
},
{
"epoch": 0.6113744075829384,
"grad_norm": 13.048988852473453,
"learning_rate": 8.92789138831524e-07,
"loss": 0.1853,
"step": 516
},
{
"epoch": 0.6125592417061612,
"grad_norm": 17.38819922422907,
"learning_rate": 8.921350719948315e-07,
"loss": 0.2427,
"step": 517
},
{
"epoch": 0.6137440758293838,
"grad_norm": 24.37628041900463,
"learning_rate": 8.914792572065177e-07,
"loss": 0.2592,
"step": 518
},
{
"epoch": 0.6149289099526066,
"grad_norm": 20.993815161831648,
"learning_rate": 8.908216973898928e-07,
"loss": 0.2072,
"step": 519
},
{
"epoch": 0.6161137440758294,
"grad_norm": 28.61411000218645,
"learning_rate": 8.901623954760459e-07,
"loss": 0.2746,
"step": 520
},
{
"epoch": 0.6172985781990521,
"grad_norm": 11.09636600039551,
"learning_rate": 8.89501354403831e-07,
"loss": 0.2656,
"step": 521
},
{
"epoch": 0.6184834123222749,
"grad_norm": 16.998967485383094,
"learning_rate": 8.888385771198552e-07,
"loss": 0.2235,
"step": 522
},
{
"epoch": 0.6196682464454977,
"grad_norm": 28.861443273678475,
"learning_rate": 8.88174066578464e-07,
"loss": 0.2978,
"step": 523
},
{
"epoch": 0.6208530805687204,
"grad_norm": 14.714955209304444,
"learning_rate": 8.875078257417294e-07,
"loss": 0.2732,
"step": 524
},
{
"epoch": 0.6220379146919431,
"grad_norm": 9.735548367963192,
"learning_rate": 8.868398575794362e-07,
"loss": 0.1986,
"step": 525
},
{
"epoch": 0.6232227488151659,
"grad_norm": 17.493573015109547,
"learning_rate": 8.861701650690685e-07,
"loss": 0.2646,
"step": 526
},
{
"epoch": 0.6244075829383886,
"grad_norm": 25.375815141472067,
"learning_rate": 8.854987511957973e-07,
"loss": 0.1653,
"step": 527
},
{
"epoch": 0.6255924170616114,
"grad_norm": 35.646843725332126,
"learning_rate": 8.84825618952466e-07,
"loss": 0.3044,
"step": 528
},
{
"epoch": 0.6267772511848341,
"grad_norm": 16.38434340194561,
"learning_rate": 8.841507713395782e-07,
"loss": 0.2138,
"step": 529
},
{
"epoch": 0.6279620853080569,
"grad_norm": 9.828589104598672,
"learning_rate": 8.834742113652833e-07,
"loss": 0.2025,
"step": 530
},
{
"epoch": 0.6291469194312796,
"grad_norm": 14.966605589812715,
"learning_rate": 8.827959420453642e-07,
"loss": 0.2194,
"step": 531
},
{
"epoch": 0.6303317535545023,
"grad_norm": 9.043281744519222,
"learning_rate": 8.821159664032223e-07,
"loss": 0.2106,
"step": 532
},
{
"epoch": 0.6315165876777251,
"grad_norm": 25.156545576852338,
"learning_rate": 8.814342874698659e-07,
"loss": 0.296,
"step": 533
},
{
"epoch": 0.6327014218009479,
"grad_norm": 30.207865323923034,
"learning_rate": 8.807509082838956e-07,
"loss": 0.3279,
"step": 534
},
{
"epoch": 0.6338862559241706,
"grad_norm": 21.8597722447571,
"learning_rate": 8.800658318914905e-07,
"loss": 0.22,
"step": 535
},
{
"epoch": 0.6350710900473934,
"grad_norm": 18.83124332148458,
"learning_rate": 8.793790613463954e-07,
"loss": 0.2945,
"step": 536
},
{
"epoch": 0.6362559241706162,
"grad_norm": 11.621272923164774,
"learning_rate": 8.786905997099066e-07,
"loss": 0.1921,
"step": 537
},
{
"epoch": 0.6374407582938388,
"grad_norm": 15.981620422788177,
"learning_rate": 8.780004500508587e-07,
"loss": 0.2368,
"step": 538
},
{
"epoch": 0.6386255924170616,
"grad_norm": 30.452192715434165,
"learning_rate": 8.773086154456106e-07,
"loss": 0.2448,
"step": 539
},
{
"epoch": 0.6398104265402843,
"grad_norm": 9.559309492322154,
"learning_rate": 8.766150989780317e-07,
"loss": 0.2027,
"step": 540
},
{
"epoch": 0.6409952606635071,
"grad_norm": 27.607639874576503,
"learning_rate": 8.759199037394886e-07,
"loss": 0.2556,
"step": 541
},
{
"epoch": 0.6421800947867299,
"grad_norm": 12.26608497304879,
"learning_rate": 8.752230328288313e-07,
"loss": 0.1891,
"step": 542
},
{
"epoch": 0.6433649289099526,
"grad_norm": 33.570472082251705,
"learning_rate": 8.745244893523783e-07,
"loss": 0.3039,
"step": 543
},
{
"epoch": 0.6445497630331753,
"grad_norm": 14.538932588131711,
"learning_rate": 8.738242764239046e-07,
"loss": 0.1903,
"step": 544
},
{
"epoch": 0.6457345971563981,
"grad_norm": 8.885681558345862,
"learning_rate": 8.73122397164626e-07,
"loss": 0.25,
"step": 545
},
{
"epoch": 0.6469194312796208,
"grad_norm": 9.724842455204085,
"learning_rate": 8.724188547031865e-07,
"loss": 0.2286,
"step": 546
},
{
"epoch": 0.6481042654028436,
"grad_norm": 11.641123423029827,
"learning_rate": 8.717136521756439e-07,
"loss": 0.2298,
"step": 547
},
{
"epoch": 0.6492890995260664,
"grad_norm": 35.64156668351365,
"learning_rate": 8.710067927254554e-07,
"loss": 0.2487,
"step": 548
},
{
"epoch": 0.6504739336492891,
"grad_norm": 55.07441844223735,
"learning_rate": 8.702982795034644e-07,
"loss": 0.2974,
"step": 549
},
{
"epoch": 0.6516587677725119,
"grad_norm": 30.67359922217043,
"learning_rate": 8.695881156678855e-07,
"loss": 0.2409,
"step": 550
},
{
"epoch": 0.6516587677725119,
"eval_loss": 0.26306506991386414,
"eval_runtime": 55.58,
"eval_samples_per_second": 14.807,
"eval_steps_per_second": 0.936,
"step": 550
},
{
"epoch": 0.6528436018957346,
"grad_norm": 34.111707302892505,
"learning_rate": 8.688763043842915e-07,
"loss": 0.3007,
"step": 551
},
{
"epoch": 0.6540284360189573,
"grad_norm": 24.72916737680904,
"learning_rate": 8.681628488255986e-07,
"loss": 0.2861,
"step": 552
},
{
"epoch": 0.6552132701421801,
"grad_norm": 15.195799871782196,
"learning_rate": 8.674477521720521e-07,
"loss": 0.2379,
"step": 553
},
{
"epoch": 0.6563981042654028,
"grad_norm": 35.31766339327681,
"learning_rate": 8.667310176112129e-07,
"loss": 0.3087,
"step": 554
},
{
"epoch": 0.6575829383886256,
"grad_norm": 29.669297495826285,
"learning_rate": 8.660126483379426e-07,
"loss": 0.2826,
"step": 555
},
{
"epoch": 0.6587677725118484,
"grad_norm": 20.188142404979462,
"learning_rate": 8.652926475543898e-07,
"loss": 0.2353,
"step": 556
},
{
"epoch": 0.659952606635071,
"grad_norm": 9.878030902564287,
"learning_rate": 8.645710184699754e-07,
"loss": 0.2091,
"step": 557
},
{
"epoch": 0.6611374407582938,
"grad_norm": 8.326879683208155,
"learning_rate": 8.638477643013787e-07,
"loss": 0.2097,
"step": 558
},
{
"epoch": 0.6623222748815166,
"grad_norm": 17.04100715275265,
"learning_rate": 8.631228882725227e-07,
"loss": 0.2862,
"step": 559
},
{
"epoch": 0.6635071090047393,
"grad_norm": 21.154307793886165,
"learning_rate": 8.623963936145599e-07,
"loss": 0.3105,
"step": 560
},
{
"epoch": 0.6646919431279621,
"grad_norm": 20.578330200177938,
"learning_rate": 8.61668283565858e-07,
"loss": 0.2362,
"step": 561
},
{
"epoch": 0.6658767772511849,
"grad_norm": 23.48493045218934,
"learning_rate": 8.609385613719853e-07,
"loss": 0.2178,
"step": 562
},
{
"epoch": 0.6670616113744076,
"grad_norm": 9.91650775543966,
"learning_rate": 8.60207230285696e-07,
"loss": 0.3152,
"step": 563
},
{
"epoch": 0.6682464454976303,
"grad_norm": 14.993510418458422,
"learning_rate": 8.594742935669164e-07,
"loss": 0.2369,
"step": 564
},
{
"epoch": 0.669431279620853,
"grad_norm": 36.79636092939016,
"learning_rate": 8.587397544827295e-07,
"loss": 0.2666,
"step": 565
},
{
"epoch": 0.6706161137440758,
"grad_norm": 13.316556858524384,
"learning_rate": 8.580036163073614e-07,
"loss": 0.2829,
"step": 566
},
{
"epoch": 0.6718009478672986,
"grad_norm": 14.159800939343656,
"learning_rate": 8.572658823221658e-07,
"loss": 0.198,
"step": 567
},
{
"epoch": 0.6729857819905213,
"grad_norm": 9.745584395603467,
"learning_rate": 8.565265558156101e-07,
"loss": 0.2444,
"step": 568
},
{
"epoch": 0.6741706161137441,
"grad_norm": 12.150823832817574,
"learning_rate": 8.5578564008326e-07,
"loss": 0.2192,
"step": 569
},
{
"epoch": 0.6753554502369669,
"grad_norm": 24.362220333619582,
"learning_rate": 8.550431384277653e-07,
"loss": 0.2476,
"step": 570
},
{
"epoch": 0.6765402843601895,
"grad_norm": 8.952930844365163,
"learning_rate": 8.542990541588453e-07,
"loss": 0.2264,
"step": 571
},
{
"epoch": 0.6777251184834123,
"grad_norm": 9.560301949012237,
"learning_rate": 8.535533905932737e-07,
"loss": 0.2736,
"step": 572
},
{
"epoch": 0.6789099526066351,
"grad_norm": 9.139618148174796,
"learning_rate": 8.528061510548641e-07,
"loss": 0.22,
"step": 573
},
{
"epoch": 0.6800947867298578,
"grad_norm": 18.201403006271022,
"learning_rate": 8.520573388744548e-07,
"loss": 0.2366,
"step": 574
},
{
"epoch": 0.6812796208530806,
"grad_norm": 8.042669272668338,
"learning_rate": 8.513069573898943e-07,
"loss": 0.2055,
"step": 575
},
{
"epoch": 0.6824644549763034,
"grad_norm": 11.84351957271319,
"learning_rate": 8.505550099460263e-07,
"loss": 0.2745,
"step": 576
},
{
"epoch": 0.683649289099526,
"grad_norm": 16.427699085907832,
"learning_rate": 8.49801499894675e-07,
"loss": 0.187,
"step": 577
},
{
"epoch": 0.6848341232227488,
"grad_norm": 18.357805574878622,
"learning_rate": 8.490464305946294e-07,
"loss": 0.2259,
"step": 578
},
{
"epoch": 0.6860189573459715,
"grad_norm": 21.137167128440737,
"learning_rate": 8.482898054116299e-07,
"loss": 0.2139,
"step": 579
},
{
"epoch": 0.6872037914691943,
"grad_norm": 17.717042156687203,
"learning_rate": 8.475316277183508e-07,
"loss": 0.2515,
"step": 580
},
{
"epoch": 0.6883886255924171,
"grad_norm": 23.793425039907586,
"learning_rate": 8.467719008943886e-07,
"loss": 0.2664,
"step": 581
},
{
"epoch": 0.6895734597156398,
"grad_norm": 11.919196893599274,
"learning_rate": 8.460106283262431e-07,
"loss": 0.1886,
"step": 582
},
{
"epoch": 0.6907582938388626,
"grad_norm": 22.28808906798008,
"learning_rate": 8.452478134073062e-07,
"loss": 0.2738,
"step": 583
},
{
"epoch": 0.6919431279620853,
"grad_norm": 23.537434314232886,
"learning_rate": 8.444834595378433e-07,
"loss": 0.2796,
"step": 584
},
{
"epoch": 0.693127962085308,
"grad_norm": 8.778711331006974,
"learning_rate": 8.437175701249805e-07,
"loss": 0.2371,
"step": 585
},
{
"epoch": 0.6943127962085308,
"grad_norm": 8.478947761255274,
"learning_rate": 8.429501485826889e-07,
"loss": 0.183,
"step": 586
},
{
"epoch": 0.6954976303317536,
"grad_norm": 24.182074930968447,
"learning_rate": 8.421811983317681e-07,
"loss": 0.2003,
"step": 587
},
{
"epoch": 0.6966824644549763,
"grad_norm": 9.861012071727558,
"learning_rate": 8.414107227998328e-07,
"loss": 0.2774,
"step": 588
},
{
"epoch": 0.6978672985781991,
"grad_norm": 11.632857692746178,
"learning_rate": 8.406387254212965e-07,
"loss": 0.2403,
"step": 589
},
{
"epoch": 0.6990521327014217,
"grad_norm": 20.917402317103377,
"learning_rate": 8.398652096373564e-07,
"loss": 0.2078,
"step": 590
},
{
"epoch": 0.7002369668246445,
"grad_norm": 11.067641557659394,
"learning_rate": 8.390901788959777e-07,
"loss": 0.2554,
"step": 591
},
{
"epoch": 0.7014218009478673,
"grad_norm": 11.776999680097008,
"learning_rate": 8.383136366518787e-07,
"loss": 0.2309,
"step": 592
},
{
"epoch": 0.70260663507109,
"grad_norm": 25.259853705212183,
"learning_rate": 8.375355863665154e-07,
"loss": 0.2359,
"step": 593
},
{
"epoch": 0.7037914691943128,
"grad_norm": 19.40581177984766,
"learning_rate": 8.367560315080662e-07,
"loss": 0.1904,
"step": 594
},
{
"epoch": 0.7049763033175356,
"grad_norm": 15.1929515678031,
"learning_rate": 8.359749755514154e-07,
"loss": 0.2692,
"step": 595
},
{
"epoch": 0.7061611374407583,
"grad_norm": 15.147667421338586,
"learning_rate": 8.351924219781392e-07,
"loss": 0.254,
"step": 596
},
{
"epoch": 0.707345971563981,
"grad_norm": 7.901829661318031,
"learning_rate": 8.344083742764891e-07,
"loss": 0.1868,
"step": 597
},
{
"epoch": 0.7085308056872038,
"grad_norm": 14.381280613101293,
"learning_rate": 8.336228359413768e-07,
"loss": 0.1958,
"step": 598
},
{
"epoch": 0.7097156398104265,
"grad_norm": 36.71771015436861,
"learning_rate": 8.328358104743585e-07,
"loss": 0.3001,
"step": 599
},
{
"epoch": 0.7109004739336493,
"grad_norm": 33.7833690585172,
"learning_rate": 8.320473013836195e-07,
"loss": 0.259,
"step": 600
},
{
"epoch": 0.7109004739336493,
"eval_loss": 0.251609206199646,
"eval_runtime": 59.1343,
"eval_samples_per_second": 13.917,
"eval_steps_per_second": 0.879,
"step": 600
},
{
"epoch": 0.7120853080568721,
"grad_norm": 27.43991881068027,
"learning_rate": 8.312573121839581e-07,
"loss": 0.2528,
"step": 601
},
{
"epoch": 0.7132701421800948,
"grad_norm": 11.54084932255205,
"learning_rate": 8.304658463967705e-07,
"loss": 0.2258,
"step": 602
},
{
"epoch": 0.7144549763033176,
"grad_norm": 9.38181972163327,
"learning_rate": 8.296729075500343e-07,
"loss": 0.1483,
"step": 603
},
{
"epoch": 0.7156398104265402,
"grad_norm": 9.63021263503591,
"learning_rate": 8.288784991782945e-07,
"loss": 0.2134,
"step": 604
},
{
"epoch": 0.716824644549763,
"grad_norm": 25.241305745406983,
"learning_rate": 8.280826248226449e-07,
"loss": 0.2241,
"step": 605
},
{
"epoch": 0.7180094786729858,
"grad_norm": 19.58953870780054,
"learning_rate": 8.272852880307153e-07,
"loss": 0.2973,
"step": 606
},
{
"epoch": 0.7191943127962085,
"grad_norm": 20.908601756019916,
"learning_rate": 8.264864923566537e-07,
"loss": 0.3026,
"step": 607
},
{
"epoch": 0.7203791469194313,
"grad_norm": 18.641795222590094,
"learning_rate": 8.256862413611112e-07,
"loss": 0.2092,
"step": 608
},
{
"epoch": 0.7215639810426541,
"grad_norm": 9.078402132315118,
"learning_rate": 8.24884538611226e-07,
"loss": 0.2293,
"step": 609
},
{
"epoch": 0.7227488151658767,
"grad_norm": 9.870990754662362,
"learning_rate": 8.240813876806078e-07,
"loss": 0.2364,
"step": 610
},
{
"epoch": 0.7239336492890995,
"grad_norm": 32.61024025487414,
"learning_rate": 8.232767921493215e-07,
"loss": 0.2391,
"step": 611
},
{
"epoch": 0.7251184834123223,
"grad_norm": 24.836388708454873,
"learning_rate": 8.22470755603871e-07,
"loss": 0.2667,
"step": 612
},
{
"epoch": 0.726303317535545,
"grad_norm": 41.315609395728075,
"learning_rate": 8.216632816371838e-07,
"loss": 0.2302,
"step": 613
},
{
"epoch": 0.7274881516587678,
"grad_norm": 17.579148648614137,
"learning_rate": 8.208543738485949e-07,
"loss": 0.2644,
"step": 614
},
{
"epoch": 0.7286729857819905,
"grad_norm": 7.685937476575705,
"learning_rate": 8.200440358438305e-07,
"loss": 0.2128,
"step": 615
},
{
"epoch": 0.7298578199052133,
"grad_norm": 18.987732881829164,
"learning_rate": 8.192322712349917e-07,
"loss": 0.1535,
"step": 616
},
{
"epoch": 0.731042654028436,
"grad_norm": 25.078374459237278,
"learning_rate": 8.184190836405393e-07,
"loss": 0.2313,
"step": 617
},
{
"epoch": 0.7322274881516587,
"grad_norm": 19.93693990442207,
"learning_rate": 8.176044766852765e-07,
"loss": 0.1826,
"step": 618
},
{
"epoch": 0.7334123222748815,
"grad_norm": 9.174225763509883,
"learning_rate": 8.167884540003337e-07,
"loss": 0.2183,
"step": 619
},
{
"epoch": 0.7345971563981043,
"grad_norm": 22.712219561256028,
"learning_rate": 8.159710192231519e-07,
"loss": 0.2233,
"step": 620
},
{
"epoch": 0.735781990521327,
"grad_norm": 12.602248166695643,
"learning_rate": 8.151521759974666e-07,
"loss": 0.2168,
"step": 621
},
{
"epoch": 0.7369668246445498,
"grad_norm": 11.648901495290106,
"learning_rate": 8.143319279732913e-07,
"loss": 0.3022,
"step": 622
},
{
"epoch": 0.7381516587677726,
"grad_norm": 25.848339933826537,
"learning_rate": 8.135102788069015e-07,
"loss": 0.2619,
"step": 623
},
{
"epoch": 0.7393364928909952,
"grad_norm": 26.446445165860816,
"learning_rate": 8.126872321608183e-07,
"loss": 0.1956,
"step": 624
},
{
"epoch": 0.740521327014218,
"grad_norm": 11.078507030053842,
"learning_rate": 8.118627917037924e-07,
"loss": 0.2531,
"step": 625
},
{
"epoch": 0.7417061611374408,
"grad_norm": 16.144215184908756,
"learning_rate": 8.110369611107868e-07,
"loss": 0.2048,
"step": 626
},
{
"epoch": 0.7428909952606635,
"grad_norm": 9.763738781518256,
"learning_rate": 8.102097440629618e-07,
"loss": 0.2536,
"step": 627
},
{
"epoch": 0.7440758293838863,
"grad_norm": 15.658750679197453,
"learning_rate": 8.093811442476572e-07,
"loss": 0.2143,
"step": 628
},
{
"epoch": 0.745260663507109,
"grad_norm": 14.261166875712147,
"learning_rate": 8.085511653583772e-07,
"loss": 0.2228,
"step": 629
},
{
"epoch": 0.7464454976303317,
"grad_norm": 18.41294125408353,
"learning_rate": 8.077198110947725e-07,
"loss": 0.2047,
"step": 630
},
{
"epoch": 0.7476303317535545,
"grad_norm": 7.188949902488354,
"learning_rate": 8.068870851626253e-07,
"loss": 0.2506,
"step": 631
},
{
"epoch": 0.7488151658767772,
"grad_norm": 21.1672326000127,
"learning_rate": 8.060529912738314e-07,
"loss": 0.2493,
"step": 632
},
{
"epoch": 0.75,
"grad_norm": 11.771338026161125,
"learning_rate": 8.052175331463848e-07,
"loss": 0.1622,
"step": 633
},
{
"epoch": 0.7511848341232228,
"grad_norm": 21.486215050391507,
"learning_rate": 8.043807145043603e-07,
"loss": 0.2102,
"step": 634
},
{
"epoch": 0.7523696682464455,
"grad_norm": 22.130536750069474,
"learning_rate": 8.035425390778973e-07,
"loss": 0.2048,
"step": 635
},
{
"epoch": 0.7535545023696683,
"grad_norm": 15.296795850248081,
"learning_rate": 8.027030106031835e-07,
"loss": 0.199,
"step": 636
},
{
"epoch": 0.754739336492891,
"grad_norm": 12.519470076894477,
"learning_rate": 8.018621328224371e-07,
"loss": 0.2273,
"step": 637
},
{
"epoch": 0.7559241706161137,
"grad_norm": 22.060617497410405,
"learning_rate": 8.010199094838914e-07,
"loss": 0.194,
"step": 638
},
{
"epoch": 0.7571090047393365,
"grad_norm": 9.606082768273398,
"learning_rate": 8.001763443417775e-07,
"loss": 0.1982,
"step": 639
},
{
"epoch": 0.7582938388625592,
"grad_norm": 14.494565608281063,
"learning_rate": 7.993314411563075e-07,
"loss": 0.162,
"step": 640
},
{
"epoch": 0.759478672985782,
"grad_norm": 16.589300163864532,
"learning_rate": 7.984852036936578e-07,
"loss": 0.2806,
"step": 641
},
{
"epoch": 0.7606635071090048,
"grad_norm": 11.576925049645048,
"learning_rate": 7.976376357259526e-07,
"loss": 0.1665,
"step": 642
},
{
"epoch": 0.7618483412322274,
"grad_norm": 14.58868190468772,
"learning_rate": 7.967887410312466e-07,
"loss": 0.1948,
"step": 643
},
{
"epoch": 0.7630331753554502,
"grad_norm": 21.320856814477253,
"learning_rate": 7.959385233935085e-07,
"loss": 0.2362,
"step": 644
},
{
"epoch": 0.764218009478673,
"grad_norm": 15.04617262190784,
"learning_rate": 7.950869866026045e-07,
"loss": 0.2164,
"step": 645
},
{
"epoch": 0.7654028436018957,
"grad_norm": 9.80317001840058,
"learning_rate": 7.9423413445428e-07,
"loss": 0.1788,
"step": 646
},
{
"epoch": 0.7665876777251185,
"grad_norm": 19.844028210298966,
"learning_rate": 7.933799707501447e-07,
"loss": 0.2209,
"step": 647
},
{
"epoch": 0.7677725118483413,
"grad_norm": 13.751437660235108,
"learning_rate": 7.925244992976537e-07,
"loss": 0.1899,
"step": 648
},
{
"epoch": 0.768957345971564,
"grad_norm": 9.41285729970307,
"learning_rate": 7.916677239100922e-07,
"loss": 0.2487,
"step": 649
},
{
"epoch": 0.7701421800947867,
"grad_norm": 18.895758189790847,
"learning_rate": 7.908096484065568e-07,
"loss": 0.2111,
"step": 650
},
{
"epoch": 0.7701421800947867,
"eval_loss": 0.2293587177991867,
"eval_runtime": 57.8447,
"eval_samples_per_second": 14.228,
"eval_steps_per_second": 0.899,
"step": 650
},
{
"epoch": 0.7713270142180095,
"grad_norm": 13.862666177134349,
"learning_rate": 7.899502766119403e-07,
"loss": 0.2696,
"step": 651
},
{
"epoch": 0.7725118483412322,
"grad_norm": 16.120955468412227,
"learning_rate": 7.890896123569135e-07,
"loss": 0.214,
"step": 652
},
{
"epoch": 0.773696682464455,
"grad_norm": 9.011365770979943,
"learning_rate": 7.882276594779079e-07,
"loss": 0.1631,
"step": 653
},
{
"epoch": 0.7748815165876777,
"grad_norm": 9.222611641594378,
"learning_rate": 7.873644218170996e-07,
"loss": 0.2073,
"step": 654
},
{
"epoch": 0.7760663507109005,
"grad_norm": 27.175342390723387,
"learning_rate": 7.864999032223914e-07,
"loss": 0.1848,
"step": 655
},
{
"epoch": 0.7772511848341233,
"grad_norm": 30.41172358480833,
"learning_rate": 7.856341075473961e-07,
"loss": 0.2405,
"step": 656
},
{
"epoch": 0.7784360189573459,
"grad_norm": 9.377963440893206,
"learning_rate": 7.847670386514189e-07,
"loss": 0.2181,
"step": 657
},
{
"epoch": 0.7796208530805687,
"grad_norm": 9.330139452266417,
"learning_rate": 7.838987003994404e-07,
"loss": 0.2582,
"step": 658
},
{
"epoch": 0.7808056872037915,
"grad_norm": 21.896919162817507,
"learning_rate": 7.830290966620995e-07,
"loss": 0.1798,
"step": 659
},
{
"epoch": 0.7819905213270142,
"grad_norm": 9.358252051552995,
"learning_rate": 7.821582313156763e-07,
"loss": 0.2502,
"step": 660
},
{
"epoch": 0.783175355450237,
"grad_norm": 10.845451720550335,
"learning_rate": 7.812861082420739e-07,
"loss": 0.2301,
"step": 661
},
{
"epoch": 0.7843601895734598,
"grad_norm": 9.9684188287499,
"learning_rate": 7.804127313288023e-07,
"loss": 0.2687,
"step": 662
},
{
"epoch": 0.7855450236966824,
"grad_norm": 25.687262257581693,
"learning_rate": 7.795381044689602e-07,
"loss": 0.1958,
"step": 663
},
{
"epoch": 0.7867298578199052,
"grad_norm": 13.55499223559753,
"learning_rate": 7.786622315612181e-07,
"loss": 0.2485,
"step": 664
},
{
"epoch": 0.7879146919431279,
"grad_norm": 12.218617521005212,
"learning_rate": 7.777851165098011e-07,
"loss": 0.1479,
"step": 665
},
{
"epoch": 0.7890995260663507,
"grad_norm": 7.721800269638889,
"learning_rate": 7.769067632244706e-07,
"loss": 0.1626,
"step": 666
},
{
"epoch": 0.7902843601895735,
"grad_norm": 24.314109763668625,
"learning_rate": 7.760271756205077e-07,
"loss": 0.2109,
"step": 667
},
{
"epoch": 0.7914691943127962,
"grad_norm": 7.701242021010707,
"learning_rate": 7.751463576186957e-07,
"loss": 0.1472,
"step": 668
},
{
"epoch": 0.792654028436019,
"grad_norm": 13.37997398622404,
"learning_rate": 7.742643131453021e-07,
"loss": 0.2217,
"step": 669
},
{
"epoch": 0.7938388625592417,
"grad_norm": 8.745184371153652,
"learning_rate": 7.733810461320618e-07,
"loss": 0.1434,
"step": 670
},
{
"epoch": 0.7950236966824644,
"grad_norm": 11.16885275222009,
"learning_rate": 7.724965605161588e-07,
"loss": 0.2009,
"step": 671
},
{
"epoch": 0.7962085308056872,
"grad_norm": 9.806142647457065,
"learning_rate": 7.716108602402094e-07,
"loss": 0.18,
"step": 672
},
{
"epoch": 0.79739336492891,
"grad_norm": 16.335655824559705,
"learning_rate": 7.707239492522439e-07,
"loss": 0.2102,
"step": 673
},
{
"epoch": 0.7985781990521327,
"grad_norm": 17.375625902322422,
"learning_rate": 7.6983583150569e-07,
"loss": 0.2452,
"step": 674
},
{
"epoch": 0.7997630331753555,
"grad_norm": 13.425423648864005,
"learning_rate": 7.689465109593539e-07,
"loss": 0.222,
"step": 675
},
{
"epoch": 0.8009478672985783,
"grad_norm": 16.905635944612605,
"learning_rate": 7.680559915774033e-07,
"loss": 0.2264,
"step": 676
},
{
"epoch": 0.8021327014218009,
"grad_norm": 16.781958411958737,
"learning_rate": 7.671642773293505e-07,
"loss": 0.2078,
"step": 677
},
{
"epoch": 0.8033175355450237,
"grad_norm": 17.58937112986415,
"learning_rate": 7.662713721900331e-07,
"loss": 0.2382,
"step": 678
},
{
"epoch": 0.8045023696682464,
"grad_norm": 17.576419370888175,
"learning_rate": 7.653772801395977e-07,
"loss": 0.219,
"step": 679
},
{
"epoch": 0.8056872037914692,
"grad_norm": 42.78433100503283,
"learning_rate": 7.644820051634812e-07,
"loss": 0.3042,
"step": 680
},
{
"epoch": 0.806872037914692,
"grad_norm": 12.172358570342093,
"learning_rate": 7.635855512523937e-07,
"loss": 0.1905,
"step": 681
},
{
"epoch": 0.8080568720379147,
"grad_norm": 7.694538953431653,
"learning_rate": 7.626879224023001e-07,
"loss": 0.209,
"step": 682
},
{
"epoch": 0.8092417061611374,
"grad_norm": 12.703516850184618,
"learning_rate": 7.617891226144033e-07,
"loss": 0.1979,
"step": 683
},
{
"epoch": 0.8104265402843602,
"grad_norm": 9.11601457295805,
"learning_rate": 7.608891558951248e-07,
"loss": 0.1748,
"step": 684
},
{
"epoch": 0.8116113744075829,
"grad_norm": 9.014204421884562,
"learning_rate": 7.599880262560882e-07,
"loss": 0.2629,
"step": 685
},
{
"epoch": 0.8127962085308057,
"grad_norm": 14.044829378680554,
"learning_rate": 7.590857377141009e-07,
"loss": 0.3174,
"step": 686
},
{
"epoch": 0.8139810426540285,
"grad_norm": 14.2323739084943,
"learning_rate": 7.58182294291136e-07,
"loss": 0.214,
"step": 687
},
{
"epoch": 0.8151658767772512,
"grad_norm": 23.66550864483779,
"learning_rate": 7.572777000143145e-07,
"loss": 0.2742,
"step": 688
},
{
"epoch": 0.816350710900474,
"grad_norm": 16.58222384020301,
"learning_rate": 7.563719589158872e-07,
"loss": 0.2081,
"step": 689
},
{
"epoch": 0.8175355450236966,
"grad_norm": 14.390980642614025,
"learning_rate": 7.554650750332174e-07,
"loss": 0.1946,
"step": 690
},
{
"epoch": 0.8187203791469194,
"grad_norm": 10.122145213378829,
"learning_rate": 7.545570524087619e-07,
"loss": 0.1758,
"step": 691
},
{
"epoch": 0.8199052132701422,
"grad_norm": 11.308324582282285,
"learning_rate": 7.536478950900536e-07,
"loss": 0.1705,
"step": 692
},
{
"epoch": 0.8210900473933649,
"grad_norm": 23.02921863609514,
"learning_rate": 7.527376071296836e-07,
"loss": 0.1841,
"step": 693
},
{
"epoch": 0.8222748815165877,
"grad_norm": 14.409456371586236,
"learning_rate": 7.518261925852823e-07,
"loss": 0.2406,
"step": 694
},
{
"epoch": 0.8234597156398105,
"grad_norm": 32.09834915802655,
"learning_rate": 7.509136555195023e-07,
"loss": 0.2367,
"step": 695
},
{
"epoch": 0.8246445497630331,
"grad_norm": 12.356727122169731,
"learning_rate": 7.5e-07,
"loss": 0.2417,
"step": 696
},
{
"epoch": 0.8258293838862559,
"grad_norm": 13.126563741604407,
"learning_rate": 7.490852300994168e-07,
"loss": 0.2063,
"step": 697
},
{
"epoch": 0.8270142180094787,
"grad_norm": 8.032225849806373,
"learning_rate": 7.48169349895362e-07,
"loss": 0.1786,
"step": 698
},
{
"epoch": 0.8281990521327014,
"grad_norm": 20.962662296925956,
"learning_rate": 7.472523634703936e-07,
"loss": 0.1698,
"step": 699
},
{
"epoch": 0.8293838862559242,
"grad_norm": 12.75110057617325,
"learning_rate": 7.463342749120013e-07,
"loss": 0.1812,
"step": 700
},
{
"epoch": 0.8293838862559242,
"eval_loss": 0.23034565150737762,
"eval_runtime": 55.8882,
"eval_samples_per_second": 14.726,
"eval_steps_per_second": 0.93,
"step": 700
},
{
"epoch": 0.830568720379147,
"grad_norm": 9.913055744102358,
"learning_rate": 7.454150883125868e-07,
"loss": 0.2133,
"step": 701
},
{
"epoch": 0.8317535545023697,
"grad_norm": 25.833382992108902,
"learning_rate": 7.44494807769447e-07,
"loss": 0.2969,
"step": 702
},
{
"epoch": 0.8329383886255924,
"grad_norm": 11.281775033559253,
"learning_rate": 7.435734373847545e-07,
"loss": 0.1416,
"step": 703
},
{
"epoch": 0.8341232227488151,
"grad_norm": 19.594414240296107,
"learning_rate": 7.426509812655405e-07,
"loss": 0.2314,
"step": 704
},
{
"epoch": 0.8353080568720379,
"grad_norm": 17.70712288867108,
"learning_rate": 7.417274435236755e-07,
"loss": 0.2538,
"step": 705
},
{
"epoch": 0.8364928909952607,
"grad_norm": 10.607970236644762,
"learning_rate": 7.408028282758514e-07,
"loss": 0.1711,
"step": 706
},
{
"epoch": 0.8376777251184834,
"grad_norm": 11.57185625276237,
"learning_rate": 7.398771396435632e-07,
"loss": 0.1872,
"step": 707
},
{
"epoch": 0.8388625592417062,
"grad_norm": 11.182287748396945,
"learning_rate": 7.389503817530905e-07,
"loss": 0.2349,
"step": 708
},
{
"epoch": 0.840047393364929,
"grad_norm": 16.684749926205544,
"learning_rate": 7.380225587354789e-07,
"loss": 0.2506,
"step": 709
},
{
"epoch": 0.8412322274881516,
"grad_norm": 11.69863271084872,
"learning_rate": 7.370936747265225e-07,
"loss": 0.2208,
"step": 710
},
{
"epoch": 0.8424170616113744,
"grad_norm": 10.788997392898397,
"learning_rate": 7.361637338667441e-07,
"loss": 0.2114,
"step": 711
},
{
"epoch": 0.8436018957345972,
"grad_norm": 7.602485659984748,
"learning_rate": 7.352327403013779e-07,
"loss": 0.1821,
"step": 712
},
{
"epoch": 0.8447867298578199,
"grad_norm": 7.26614734509637,
"learning_rate": 7.343006981803499e-07,
"loss": 0.2309,
"step": 713
},
{
"epoch": 0.8459715639810427,
"grad_norm": 8.364511901273751,
"learning_rate": 7.33367611658261e-07,
"loss": 0.2118,
"step": 714
},
{
"epoch": 0.8471563981042654,
"grad_norm": 7.5672261947917985,
"learning_rate": 7.324334848943668e-07,
"loss": 0.2214,
"step": 715
},
{
"epoch": 0.8483412322274881,
"grad_norm": 15.227230664760391,
"learning_rate": 7.314983220525604e-07,
"loss": 0.1782,
"step": 716
},
{
"epoch": 0.8495260663507109,
"grad_norm": 10.217799076813009,
"learning_rate": 7.305621273013525e-07,
"loss": 0.2302,
"step": 717
},
{
"epoch": 0.8507109004739336,
"grad_norm": 18.691186404113676,
"learning_rate": 7.296249048138542e-07,
"loss": 0.1682,
"step": 718
},
{
"epoch": 0.8518957345971564,
"grad_norm": 10.452172850788102,
"learning_rate": 7.286866587677574e-07,
"loss": 0.2355,
"step": 719
},
{
"epoch": 0.8530805687203792,
"grad_norm": 10.604574552375054,
"learning_rate": 7.277473933453169e-07,
"loss": 0.1133,
"step": 720
},
{
"epoch": 0.8542654028436019,
"grad_norm": 9.891539825214483,
"learning_rate": 7.268071127333311e-07,
"loss": 0.2195,
"step": 721
},
{
"epoch": 0.8554502369668247,
"grad_norm": 22.53671982979302,
"learning_rate": 7.258658211231234e-07,
"loss": 0.2269,
"step": 722
},
{
"epoch": 0.8566350710900474,
"grad_norm": 23.83975858014513,
"learning_rate": 7.249235227105245e-07,
"loss": 0.2294,
"step": 723
},
{
"epoch": 0.8578199052132701,
"grad_norm": 19.583412313581313,
"learning_rate": 7.239802216958522e-07,
"loss": 0.2309,
"step": 724
},
{
"epoch": 0.8590047393364929,
"grad_norm": 9.156760472446708,
"learning_rate": 7.230359222838938e-07,
"loss": 0.1893,
"step": 725
},
{
"epoch": 0.8601895734597157,
"grad_norm": 14.337897781365237,
"learning_rate": 7.220906286838868e-07,
"loss": 0.1709,
"step": 726
},
{
"epoch": 0.8613744075829384,
"grad_norm": 17.90848323335686,
"learning_rate": 7.211443451095006e-07,
"loss": 0.2353,
"step": 727
},
{
"epoch": 0.8625592417061612,
"grad_norm": 8.380769804085316,
"learning_rate": 7.201970757788171e-07,
"loss": 0.21,
"step": 728
},
{
"epoch": 0.8637440758293838,
"grad_norm": 11.322111166952071,
"learning_rate": 7.192488249143125e-07,
"loss": 0.2225,
"step": 729
},
{
"epoch": 0.8649289099526066,
"grad_norm": 10.045093401838667,
"learning_rate": 7.182995967428379e-07,
"loss": 0.1583,
"step": 730
},
{
"epoch": 0.8661137440758294,
"grad_norm": 17.370985243300748,
"learning_rate": 7.173493954956011e-07,
"loss": 0.2028,
"step": 731
},
{
"epoch": 0.8672985781990521,
"grad_norm": 10.458119006433506,
"learning_rate": 7.163982254081474e-07,
"loss": 0.1776,
"step": 732
},
{
"epoch": 0.8684834123222749,
"grad_norm": 8.754265017532378,
"learning_rate": 7.154460907203405e-07,
"loss": 0.1875,
"step": 733
},
{
"epoch": 0.8696682464454977,
"grad_norm": 12.4225760590751,
"learning_rate": 7.144929956763437e-07,
"loss": 0.2778,
"step": 734
},
{
"epoch": 0.8708530805687204,
"grad_norm": 11.608393438177549,
"learning_rate": 7.135389445246017e-07,
"loss": 0.192,
"step": 735
},
{
"epoch": 0.8720379146919431,
"grad_norm": 8.633498509424786,
"learning_rate": 7.125839415178203e-07,
"loss": 0.1541,
"step": 736
},
{
"epoch": 0.8732227488151659,
"grad_norm": 16.670199329768597,
"learning_rate": 7.116279909129491e-07,
"loss": 0.1941,
"step": 737
},
{
"epoch": 0.8744075829383886,
"grad_norm": 12.74769327926982,
"learning_rate": 7.106710969711609e-07,
"loss": 0.2348,
"step": 738
},
{
"epoch": 0.8755924170616114,
"grad_norm": 10.452681837710877,
"learning_rate": 7.097132639578337e-07,
"loss": 0.1702,
"step": 739
},
{
"epoch": 0.8767772511848341,
"grad_norm": 9.914095739792367,
"learning_rate": 7.087544961425316e-07,
"loss": 0.1876,
"step": 740
},
{
"epoch": 0.8779620853080569,
"grad_norm": 9.459241899301656,
"learning_rate": 7.077947977989853e-07,
"loss": 0.2389,
"step": 741
},
{
"epoch": 0.8791469194312796,
"grad_norm": 10.31379047778746,
"learning_rate": 7.068341732050737e-07,
"loss": 0.2262,
"step": 742
},
{
"epoch": 0.8803317535545023,
"grad_norm": 14.015863725214746,
"learning_rate": 7.058726266428041e-07,
"loss": 0.2054,
"step": 743
},
{
"epoch": 0.8815165876777251,
"grad_norm": 10.095904056771356,
"learning_rate": 7.049101623982937e-07,
"loss": 0.2564,
"step": 744
},
{
"epoch": 0.8827014218009479,
"grad_norm": 14.900120087150492,
"learning_rate": 7.039467847617504e-07,
"loss": 0.2516,
"step": 745
},
{
"epoch": 0.8838862559241706,
"grad_norm": 14.989089635791842,
"learning_rate": 7.029824980274534e-07,
"loss": 0.186,
"step": 746
},
{
"epoch": 0.8850710900473934,
"grad_norm": 21.089551499728422,
"learning_rate": 7.020173064937344e-07,
"loss": 0.1767,
"step": 747
},
{
"epoch": 0.8862559241706162,
"grad_norm": 8.4313495925763,
"learning_rate": 7.010512144629579e-07,
"loss": 0.227,
"step": 748
},
{
"epoch": 0.8874407582938388,
"grad_norm": 8.70162638226272,
"learning_rate": 7.000842262415028e-07,
"loss": 0.1797,
"step": 749
},
{
"epoch": 0.8886255924170616,
"grad_norm": 10.07719445107742,
"learning_rate": 6.991163461397424e-07,
"loss": 0.198,
"step": 750
},
{
"epoch": 0.8886255924170616,
"eval_loss": 0.21252204477787018,
"eval_runtime": 55.1978,
"eval_samples_per_second": 14.91,
"eval_steps_per_second": 0.942,
"step": 750
},
{
"epoch": 0.8898104265402843,
"grad_norm": 8.952461064774775,
"learning_rate": 6.981475784720262e-07,
"loss": 0.258,
"step": 751
},
{
"epoch": 0.8909952606635071,
"grad_norm": 20.719939479561397,
"learning_rate": 6.971779275566593e-07,
"loss": 0.2014,
"step": 752
},
{
"epoch": 0.8921800947867299,
"grad_norm": 7.936070074068664,
"learning_rate": 6.962073977158842e-07,
"loss": 0.1962,
"step": 753
},
{
"epoch": 0.8933649289099526,
"grad_norm": 15.527186441037935,
"learning_rate": 6.952359932758615e-07,
"loss": 0.1838,
"step": 754
},
{
"epoch": 0.8945497630331753,
"grad_norm": 11.785111972126451,
"learning_rate": 6.9426371856665e-07,
"loss": 0.2201,
"step": 755
},
{
"epoch": 0.8957345971563981,
"grad_norm": 20.18101839777207,
"learning_rate": 6.93290577922188e-07,
"loss": 0.2597,
"step": 756
},
{
"epoch": 0.8969194312796208,
"grad_norm": 9.9152594543863,
"learning_rate": 6.923165756802733e-07,
"loss": 0.1701,
"step": 757
},
{
"epoch": 0.8981042654028436,
"grad_norm": 20.043984042966567,
"learning_rate": 6.913417161825449e-07,
"loss": 0.2574,
"step": 758
},
{
"epoch": 0.8992890995260664,
"grad_norm": 21.123597700553983,
"learning_rate": 6.903660037744626e-07,
"loss": 0.2426,
"step": 759
},
{
"epoch": 0.9004739336492891,
"grad_norm": 17.3636230361985,
"learning_rate": 6.89389442805288e-07,
"loss": 0.2198,
"step": 760
},
{
"epoch": 0.9016587677725119,
"grad_norm": 7.896724380954798,
"learning_rate": 6.884120376280657e-07,
"loss": 0.2368,
"step": 761
},
{
"epoch": 0.9028436018957346,
"grad_norm": 10.744383746333114,
"learning_rate": 6.874337925996028e-07,
"loss": 0.2166,
"step": 762
},
{
"epoch": 0.9040284360189573,
"grad_norm": 11.973048617938426,
"learning_rate": 6.864547120804505e-07,
"loss": 0.2149,
"step": 763
},
{
"epoch": 0.9052132701421801,
"grad_norm": 15.374227350719309,
"learning_rate": 6.85474800434884e-07,
"loss": 0.1623,
"step": 764
},
{
"epoch": 0.9063981042654028,
"grad_norm": 9.460007923138491,
"learning_rate": 6.84494062030883e-07,
"loss": 0.1601,
"step": 765
},
{
"epoch": 0.9075829383886256,
"grad_norm": 8.169757295616728,
"learning_rate": 6.835125012401131e-07,
"loss": 0.1693,
"step": 766
},
{
"epoch": 0.9087677725118484,
"grad_norm": 16.061474109942935,
"learning_rate": 6.825301224379056e-07,
"loss": 0.2211,
"step": 767
},
{
"epoch": 0.909952606635071,
"grad_norm": 10.477258738350997,
"learning_rate": 6.815469300032373e-07,
"loss": 0.2279,
"step": 768
},
{
"epoch": 0.9111374407582938,
"grad_norm": 10.65538754992105,
"learning_rate": 6.805629283187129e-07,
"loss": 0.1925,
"step": 769
},
{
"epoch": 0.9123222748815166,
"grad_norm": 9.123234339056102,
"learning_rate": 6.795781217705435e-07,
"loss": 0.2253,
"step": 770
},
{
"epoch": 0.9135071090047393,
"grad_norm": 10.377714671981227,
"learning_rate": 6.785925147485285e-07,
"loss": 0.1754,
"step": 771
},
{
"epoch": 0.9146919431279621,
"grad_norm": 10.393576077978427,
"learning_rate": 6.776061116460352e-07,
"loss": 0.2009,
"step": 772
},
{
"epoch": 0.9158767772511849,
"grad_norm": 16.02077338864102,
"learning_rate": 6.766189168599789e-07,
"loss": 0.1636,
"step": 773
},
{
"epoch": 0.9170616113744076,
"grad_norm": 14.519560151061562,
"learning_rate": 6.756309347908051e-07,
"loss": 0.2455,
"step": 774
},
{
"epoch": 0.9182464454976303,
"grad_norm": 21.53296240222359,
"learning_rate": 6.746421698424676e-07,
"loss": 0.1942,
"step": 775
},
{
"epoch": 0.919431279620853,
"grad_norm": 10.224973261982292,
"learning_rate": 6.7365262642241e-07,
"loss": 0.1958,
"step": 776
},
{
"epoch": 0.9206161137440758,
"grad_norm": 12.82882389122881,
"learning_rate": 6.726623089415467e-07,
"loss": 0.211,
"step": 777
},
{
"epoch": 0.9218009478672986,
"grad_norm": 15.696609544363108,
"learning_rate": 6.716712218142413e-07,
"loss": 0.281,
"step": 778
},
{
"epoch": 0.9229857819905213,
"grad_norm": 22.27881823197468,
"learning_rate": 6.706793694582891e-07,
"loss": 0.2251,
"step": 779
},
{
"epoch": 0.9241706161137441,
"grad_norm": 15.024010449291305,
"learning_rate": 6.696867562948962e-07,
"loss": 0.2681,
"step": 780
},
{
"epoch": 0.9253554502369669,
"grad_norm": 20.247007205993885,
"learning_rate": 6.686933867486596e-07,
"loss": 0.2191,
"step": 781
},
{
"epoch": 0.9265402843601895,
"grad_norm": 7.998974278453153,
"learning_rate": 6.676992652475486e-07,
"loss": 0.1871,
"step": 782
},
{
"epoch": 0.9277251184834123,
"grad_norm": 10.708704166774769,
"learning_rate": 6.667043962228838e-07,
"loss": 0.182,
"step": 783
},
{
"epoch": 0.9289099526066351,
"grad_norm": 8.78684615317003,
"learning_rate": 6.657087841093179e-07,
"loss": 0.1568,
"step": 784
},
{
"epoch": 0.9300947867298578,
"grad_norm": 9.778799963563257,
"learning_rate": 6.647124333448164e-07,
"loss": 0.2085,
"step": 785
},
{
"epoch": 0.9312796208530806,
"grad_norm": 12.864160262988083,
"learning_rate": 6.637153483706368e-07,
"loss": 0.2463,
"step": 786
},
{
"epoch": 0.9324644549763034,
"grad_norm": 24.981518255133043,
"learning_rate": 6.6271753363131e-07,
"loss": 0.1694,
"step": 787
},
{
"epoch": 0.933649289099526,
"grad_norm": 32.80984613365954,
"learning_rate": 6.61718993574619e-07,
"loss": 0.2152,
"step": 788
},
{
"epoch": 0.9348341232227488,
"grad_norm": 14.086295795520133,
"learning_rate": 6.607197326515807e-07,
"loss": 0.1499,
"step": 789
},
{
"epoch": 0.9360189573459715,
"grad_norm": 13.406991924570532,
"learning_rate": 6.597197553164251e-07,
"loss": 0.3099,
"step": 790
},
{
"epoch": 0.9372037914691943,
"grad_norm": 13.492701196428424,
"learning_rate": 6.587190660265751e-07,
"loss": 0.1985,
"step": 791
},
{
"epoch": 0.9383886255924171,
"grad_norm": 12.689739842206874,
"learning_rate": 6.577176692426278e-07,
"loss": 0.2184,
"step": 792
},
{
"epoch": 0.9395734597156398,
"grad_norm": 25.733609867586548,
"learning_rate": 6.567155694283336e-07,
"loss": 0.1801,
"step": 793
},
{
"epoch": 0.9407582938388626,
"grad_norm": 30.12391831939558,
"learning_rate": 6.55712771050577e-07,
"loss": 0.2542,
"step": 794
},
{
"epoch": 0.9419431279620853,
"grad_norm": 29.270469329492194,
"learning_rate": 6.547092785793559e-07,
"loss": 0.2172,
"step": 795
},
{
"epoch": 0.943127962085308,
"grad_norm": 40.42865798160653,
"learning_rate": 6.537050964877625e-07,
"loss": 0.2777,
"step": 796
},
{
"epoch": 0.9443127962085308,
"grad_norm": 16.58432837538837,
"learning_rate": 6.527002292519629e-07,
"loss": 0.227,
"step": 797
},
{
"epoch": 0.9454976303317536,
"grad_norm": 12.821494884458682,
"learning_rate": 6.516946813511773e-07,
"loss": 0.1727,
"step": 798
},
{
"epoch": 0.9466824644549763,
"grad_norm": 22.825107619536553,
"learning_rate": 6.5068845726766e-07,
"loss": 0.207,
"step": 799
},
{
"epoch": 0.9478672985781991,
"grad_norm": 31.69359524459485,
"learning_rate": 6.496815614866791e-07,
"loss": 0.2378,
"step": 800
},
{
"epoch": 0.9478672985781991,
"eval_loss": 0.27354177832603455,
"eval_runtime": 62.6028,
"eval_samples_per_second": 13.146,
"eval_steps_per_second": 0.831,
"step": 800
},
{
"epoch": 0.9490521327014217,
"grad_norm": 37.52743059970339,
"learning_rate": 6.486739984964971e-07,
"loss": 0.2762,
"step": 801
},
{
"epoch": 0.9502369668246445,
"grad_norm": 38.872276944043726,
"learning_rate": 6.476657727883506e-07,
"loss": 0.1925,
"step": 802
},
{
"epoch": 0.9514218009478673,
"grad_norm": 15.289062834224044,
"learning_rate": 6.466568888564302e-07,
"loss": 0.276,
"step": 803
},
{
"epoch": 0.95260663507109,
"grad_norm": 15.911245985870275,
"learning_rate": 6.456473511978606e-07,
"loss": 0.1977,
"step": 804
},
{
"epoch": 0.9537914691943128,
"grad_norm": 8.593962442854156,
"learning_rate": 6.446371643126805e-07,
"loss": 0.2118,
"step": 805
},
{
"epoch": 0.9549763033175356,
"grad_norm": 11.503080592385206,
"learning_rate": 6.436263327038224e-07,
"loss": 0.2003,
"step": 806
},
{
"epoch": 0.9561611374407583,
"grad_norm": 16.784875802136042,
"learning_rate": 6.426148608770928e-07,
"loss": 0.211,
"step": 807
},
{
"epoch": 0.957345971563981,
"grad_norm": 8.039773337530425,
"learning_rate": 6.416027533411519e-07,
"loss": 0.1776,
"step": 808
},
{
"epoch": 0.9585308056872038,
"grad_norm": 35.99706053391443,
"learning_rate": 6.40590014607494e-07,
"loss": 0.2358,
"step": 809
},
{
"epoch": 0.9597156398104265,
"grad_norm": 40.99321152622947,
"learning_rate": 6.395766491904262e-07,
"loss": 0.3066,
"step": 810
},
{
"epoch": 0.9609004739336493,
"grad_norm": 18.164194558552783,
"learning_rate": 6.385626616070498e-07,
"loss": 0.1893,
"step": 811
},
{
"epoch": 0.9620853080568721,
"grad_norm": 7.741779460691234,
"learning_rate": 6.375480563772389e-07,
"loss": 0.1874,
"step": 812
},
{
"epoch": 0.9632701421800948,
"grad_norm": 8.638620885624041,
"learning_rate": 6.365328380236213e-07,
"loss": 0.2293,
"step": 813
},
{
"epoch": 0.9644549763033176,
"grad_norm": 8.26453063301268,
"learning_rate": 6.355170110715571e-07,
"loss": 0.2436,
"step": 814
},
{
"epoch": 0.9656398104265402,
"grad_norm": 16.74712475701176,
"learning_rate": 6.3450058004912e-07,
"loss": 0.185,
"step": 815
},
{
"epoch": 0.966824644549763,
"grad_norm": 11.579496290026954,
"learning_rate": 6.334835494870758e-07,
"loss": 0.2624,
"step": 816
},
{
"epoch": 0.9680094786729858,
"grad_norm": 23.226048329711308,
"learning_rate": 6.32465923918863e-07,
"loss": 0.225,
"step": 817
},
{
"epoch": 0.9691943127962085,
"grad_norm": 17.655075691563827,
"learning_rate": 6.314477078805723e-07,
"loss": 0.215,
"step": 818
},
{
"epoch": 0.9703791469194313,
"grad_norm": 13.202128531234838,
"learning_rate": 6.304289059109267e-07,
"loss": 0.2403,
"step": 819
},
{
"epoch": 0.9715639810426541,
"grad_norm": 13.076408764926182,
"learning_rate": 6.294095225512604e-07,
"loss": 0.1663,
"step": 820
},
{
"epoch": 0.9727488151658767,
"grad_norm": 14.171619384960582,
"learning_rate": 6.283895623454997e-07,
"loss": 0.2774,
"step": 821
},
{
"epoch": 0.9739336492890995,
"grad_norm": 19.8877660661784,
"learning_rate": 6.273690298401419e-07,
"loss": 0.2723,
"step": 822
},
{
"epoch": 0.9751184834123223,
"grad_norm": 6.900387582314849,
"learning_rate": 6.263479295842357e-07,
"loss": 0.1971,
"step": 823
},
{
"epoch": 0.976303317535545,
"grad_norm": 18.414912712274486,
"learning_rate": 6.253262661293602e-07,
"loss": 0.2433,
"step": 824
},
{
"epoch": 0.9774881516587678,
"grad_norm": 15.35186217689985,
"learning_rate": 6.243040440296051e-07,
"loss": 0.1789,
"step": 825
},
{
"epoch": 0.9786729857819905,
"grad_norm": 10.207560328723083,
"learning_rate": 6.232812678415504e-07,
"loss": 0.2385,
"step": 826
},
{
"epoch": 0.9798578199052133,
"grad_norm": 9.098282908972523,
"learning_rate": 6.222579421242455e-07,
"loss": 0.2396,
"step": 827
},
{
"epoch": 0.981042654028436,
"grad_norm": 8.545110006193829,
"learning_rate": 6.2123407143919e-07,
"loss": 0.2131,
"step": 828
},
{
"epoch": 0.9822274881516587,
"grad_norm": 7.862000410634824,
"learning_rate": 6.202096603503122e-07,
"loss": 0.1932,
"step": 829
},
{
"epoch": 0.9834123222748815,
"grad_norm": 16.06519254165904,
"learning_rate": 6.191847134239495e-07,
"loss": 0.1942,
"step": 830
},
{
"epoch": 0.9845971563981043,
"grad_norm": 8.103880149384155,
"learning_rate": 6.181592352288279e-07,
"loss": 0.1843,
"step": 831
},
{
"epoch": 0.985781990521327,
"grad_norm": 9.399017520546249,
"learning_rate": 6.17133230336041e-07,
"loss": 0.2042,
"step": 832
},
{
"epoch": 0.9869668246445498,
"grad_norm": 11.17143516636379,
"learning_rate": 6.16106703319031e-07,
"loss": 0.2414,
"step": 833
},
{
"epoch": 0.9881516587677726,
"grad_norm": 16.347223668954097,
"learning_rate": 6.150796587535669e-07,
"loss": 0.2388,
"step": 834
},
{
"epoch": 0.9893364928909952,
"grad_norm": 24.954899681915954,
"learning_rate": 6.140521012177249e-07,
"loss": 0.1575,
"step": 835
},
{
"epoch": 0.990521327014218,
"grad_norm": 8.054339443033578,
"learning_rate": 6.130240352918674e-07,
"loss": 0.1829,
"step": 836
},
{
"epoch": 0.9917061611374408,
"grad_norm": 11.535190625870067,
"learning_rate": 6.119954655586236e-07,
"loss": 0.2162,
"step": 837
},
{
"epoch": 0.9928909952606635,
"grad_norm": 24.1831071367917,
"learning_rate": 6.10966396602868e-07,
"loss": 0.2608,
"step": 838
},
{
"epoch": 0.9940758293838863,
"grad_norm": 8.55670621920293,
"learning_rate": 6.099368330117004e-07,
"loss": 0.1847,
"step": 839
},
{
"epoch": 0.995260663507109,
"grad_norm": 12.190546843323471,
"learning_rate": 6.089067793744257e-07,
"loss": 0.1573,
"step": 840
},
{
"epoch": 0.9964454976303317,
"grad_norm": 24.359810277866014,
"learning_rate": 6.078762402825331e-07,
"loss": 0.2092,
"step": 841
},
{
"epoch": 0.9976303317535545,
"grad_norm": 9.263467659494774,
"learning_rate": 6.068452203296754e-07,
"loss": 0.1749,
"step": 842
},
{
"epoch": 0.9988151658767772,
"grad_norm": 10.629198605852931,
"learning_rate": 6.058137241116493e-07,
"loss": 0.1846,
"step": 843
},
{
"epoch": 1.0,
"grad_norm": 15.124988874606263,
"learning_rate": 6.047817562263743e-07,
"loss": 0.2534,
"step": 844
},
{
"epoch": 1.0011848341232228,
"grad_norm": 12.707802633067423,
"learning_rate": 6.037493212738722e-07,
"loss": 0.2384,
"step": 845
}
],
"logging_steps": 1,
"max_steps": 1688,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 169,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 164206322122752.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}