{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 97,
"global_step": 772,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025906735751295338,
"grad_norm": 758.2562349755826,
"learning_rate": 0.0,
"loss": 1.3719,
"step": 1
},
{
"epoch": 0.0025906735751295338,
"eval_loss": 1.3159157037734985,
"eval_runtime": 36.907,
"eval_samples_per_second": 20.159,
"eval_steps_per_second": 1.273,
"step": 1
},
{
"epoch": 0.0051813471502590676,
"grad_norm": 666.308184823038,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.36,
"step": 2
},
{
"epoch": 0.007772020725388601,
"grad_norm": 211.0771195353068,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.3746,
"step": 3
},
{
"epoch": 0.010362694300518135,
"grad_norm": 431.5114709683218,
"learning_rate": 3e-06,
"loss": 1.3412,
"step": 4
},
{
"epoch": 0.012953367875647668,
"grad_norm": 230.87468433791625,
"learning_rate": 4.000000000000001e-06,
"loss": 1.3837,
"step": 5
},
{
"epoch": 0.015544041450777202,
"grad_norm": 635.1636587738542,
"learning_rate": 5e-06,
"loss": 1.3761,
"step": 6
},
{
"epoch": 0.018134715025906734,
"grad_norm": 791.5536958334704,
"learning_rate": 6e-06,
"loss": 1.2855,
"step": 7
},
{
"epoch": 0.02072538860103627,
"grad_norm": 667.7197994216477,
"learning_rate": 7e-06,
"loss": 1.3267,
"step": 8
},
{
"epoch": 0.023316062176165803,
"grad_norm": 254.3855973692125,
"learning_rate": 8.000000000000001e-06,
"loss": 1.2977,
"step": 9
},
{
"epoch": 0.025906735751295335,
"grad_norm": 162.29347257682093,
"learning_rate": 9e-06,
"loss": 1.3522,
"step": 10
},
{
"epoch": 0.02849740932642487,
"grad_norm": 352.6352930651456,
"learning_rate": 1e-05,
"loss": 1.2688,
"step": 11
},
{
"epoch": 0.031088082901554404,
"grad_norm": 148.2629265526552,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.3342,
"step": 12
},
{
"epoch": 0.03367875647668394,
"grad_norm": 249.88753789723657,
"learning_rate": 1.2e-05,
"loss": 1.2983,
"step": 13
},
{
"epoch": 0.03626943005181347,
"grad_norm": 184.03358422636597,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.3291,
"step": 14
},
{
"epoch": 0.038860103626943004,
"grad_norm": 198.4491469860763,
"learning_rate": 1.4e-05,
"loss": 1.4014,
"step": 15
},
{
"epoch": 0.04145077720207254,
"grad_norm": 680.9537058769038,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.3775,
"step": 16
},
{
"epoch": 0.04404145077720207,
"grad_norm": 563.0247638614801,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.3228,
"step": 17
},
{
"epoch": 0.046632124352331605,
"grad_norm": 271.985463813746,
"learning_rate": 1.7e-05,
"loss": 1.3695,
"step": 18
},
{
"epoch": 0.04922279792746114,
"grad_norm": 399.51218452223316,
"learning_rate": 1.8e-05,
"loss": 1.2556,
"step": 19
},
{
"epoch": 0.05181347150259067,
"grad_norm": 160.70697055826656,
"learning_rate": 1.9e-05,
"loss": 1.2982,
"step": 20
},
{
"epoch": 0.054404145077720206,
"grad_norm": 227.8927504687491,
"learning_rate": 2e-05,
"loss": 1.3532,
"step": 21
},
{
"epoch": 0.05699481865284974,
"grad_norm": 550.1538868076032,
"learning_rate": 2.1000000000000002e-05,
"loss": 1.2603,
"step": 22
},
{
"epoch": 0.05958549222797927,
"grad_norm": 291.8994359919024,
"learning_rate": 2.2000000000000003e-05,
"loss": 1.3663,
"step": 23
},
{
"epoch": 0.06217616580310881,
"grad_norm": 120.60677833129643,
"learning_rate": 2.3e-05,
"loss": 1.3129,
"step": 24
},
{
"epoch": 0.06476683937823834,
"grad_norm": 414.4006662101242,
"learning_rate": 2.4e-05,
"loss": 1.3037,
"step": 25
},
{
"epoch": 0.06735751295336788,
"grad_norm": 141.48324465317884,
"learning_rate": 2.5e-05,
"loss": 1.3095,
"step": 26
},
{
"epoch": 0.06994818652849741,
"grad_norm": 147.86066819937994,
"learning_rate": 2.6000000000000002e-05,
"loss": 1.2372,
"step": 27
},
{
"epoch": 0.07253886010362694,
"grad_norm": 214.47337614964576,
"learning_rate": 2.7000000000000002e-05,
"loss": 1.3384,
"step": 28
},
{
"epoch": 0.07512953367875648,
"grad_norm": 898.4324889241673,
"learning_rate": 2.8e-05,
"loss": 1.2003,
"step": 29
},
{
"epoch": 0.07772020725388601,
"grad_norm": 128.83026557596128,
"learning_rate": 2.9e-05,
"loss": 1.2172,
"step": 30
},
{
"epoch": 0.08031088082901554,
"grad_norm": 183.0777862405529,
"learning_rate": 3.0000000000000004e-05,
"loss": 1.2674,
"step": 31
},
{
"epoch": 0.08290155440414508,
"grad_norm": 119.01841833358732,
"learning_rate": 3.1e-05,
"loss": 1.2554,
"step": 32
},
{
"epoch": 0.08549222797927461,
"grad_norm": 117.65980267542858,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.2716,
"step": 33
},
{
"epoch": 0.08808290155440414,
"grad_norm": 82.40151099433953,
"learning_rate": 3.3e-05,
"loss": 1.2019,
"step": 34
},
{
"epoch": 0.09067357512953368,
"grad_norm": 82.61816783653785,
"learning_rate": 3.4e-05,
"loss": 1.2424,
"step": 35
},
{
"epoch": 0.09326424870466321,
"grad_norm": 136.42743433868276,
"learning_rate": 3.5000000000000004e-05,
"loss": 1.2066,
"step": 36
},
{
"epoch": 0.09585492227979274,
"grad_norm": 36.775911657584444,
"learning_rate": 3.6e-05,
"loss": 1.2485,
"step": 37
},
{
"epoch": 0.09844559585492228,
"grad_norm": 56.55022603284064,
"learning_rate": 3.7000000000000005e-05,
"loss": 1.2112,
"step": 38
},
{
"epoch": 0.10103626943005181,
"grad_norm": 50.09896932886107,
"learning_rate": 3.8e-05,
"loss": 1.2027,
"step": 39
},
{
"epoch": 0.10362694300518134,
"grad_norm": 54.2661481198025,
"learning_rate": 3.9e-05,
"loss": 1.2673,
"step": 40
},
{
"epoch": 0.10621761658031088,
"grad_norm": 60.04145981731815,
"learning_rate": 4e-05,
"loss": 1.1648,
"step": 41
},
{
"epoch": 0.10880829015544041,
"grad_norm": 169.47741055545822,
"learning_rate": 3.999981580539036e-05,
"loss": 1.2393,
"step": 42
},
{
"epoch": 0.11139896373056994,
"grad_norm": 43.64716987307323,
"learning_rate": 3.9999263224954204e-05,
"loss": 1.2906,
"step": 43
},
{
"epoch": 0.11398963730569948,
"grad_norm": 51.3206609767585,
"learning_rate": 3.999834226886976e-05,
"loss": 1.1807,
"step": 44
},
{
"epoch": 0.11658031088082901,
"grad_norm": 38.95055887413869,
"learning_rate": 3.999705295410054e-05,
"loss": 1.1825,
"step": 45
},
{
"epoch": 0.11917098445595854,
"grad_norm": 40.59968974426338,
"learning_rate": 3.999539530439504e-05,
"loss": 1.193,
"step": 46
},
{
"epoch": 0.12176165803108809,
"grad_norm": 34.5796571445333,
"learning_rate": 3.9993369350286265e-05,
"loss": 1.2127,
"step": 47
},
{
"epoch": 0.12435233160621761,
"grad_norm": 37.97693356149241,
"learning_rate": 3.99909751290912e-05,
"loss": 1.1543,
"step": 48
},
{
"epoch": 0.12694300518134716,
"grad_norm": 82.9217015858092,
"learning_rate": 3.9988212684910107e-05,
"loss": 1.2329,
"step": 49
},
{
"epoch": 0.12953367875647667,
"grad_norm": 49.256542144400214,
"learning_rate": 3.9985082068625724e-05,
"loss": 1.212,
"step": 50
},
{
"epoch": 0.13212435233160622,
"grad_norm": 45.025980435259484,
"learning_rate": 3.998158333790231e-05,
"loss": 1.2129,
"step": 51
},
{
"epoch": 0.13471502590673576,
"grad_norm": 45.98465689592428,
"learning_rate": 3.99777165571846e-05,
"loss": 1.1709,
"step": 52
},
{
"epoch": 0.13730569948186527,
"grad_norm": 43.481241408477906,
"learning_rate": 3.997348179769661e-05,
"loss": 1.1614,
"step": 53
},
{
"epoch": 0.13989637305699482,
"grad_norm": 82.17633750834132,
"learning_rate": 3.996887913744033e-05,
"loss": 1.2205,
"step": 54
},
{
"epoch": 0.14248704663212436,
"grad_norm": 53.0176514970764,
"learning_rate": 3.9963908661194285e-05,
"loss": 1.1204,
"step": 55
},
{
"epoch": 0.14507772020725387,
"grad_norm": 67.86382426995611,
"learning_rate": 3.995857046051196e-05,
"loss": 1.1839,
"step": 56
},
{
"epoch": 0.14766839378238342,
"grad_norm": 31.282407703790597,
"learning_rate": 3.995286463372013e-05,
"loss": 1.2126,
"step": 57
},
{
"epoch": 0.15025906735751296,
"grad_norm": 52.200764429265604,
"learning_rate": 3.994679128591706e-05,
"loss": 1.2036,
"step": 58
},
{
"epoch": 0.15284974093264247,
"grad_norm": 60.706608653531895,
"learning_rate": 3.9940350528970535e-05,
"loss": 1.1848,
"step": 59
},
{
"epoch": 0.15544041450777202,
"grad_norm": 47.31754062899529,
"learning_rate": 3.993354248151583e-05,
"loss": 1.0869,
"step": 60
},
{
"epoch": 0.15803108808290156,
"grad_norm": 49.42450836392811,
"learning_rate": 3.9926367268953514e-05,
"loss": 1.2651,
"step": 61
},
{
"epoch": 0.16062176165803108,
"grad_norm": 38.791167030088886,
"learning_rate": 3.991882502344712e-05,
"loss": 1.1881,
"step": 62
},
{
"epoch": 0.16321243523316062,
"grad_norm": 56.16339499737216,
"learning_rate": 3.991091588392077e-05,
"loss": 1.1518,
"step": 63
},
{
"epoch": 0.16580310880829016,
"grad_norm": 861.8559063020828,
"learning_rate": 3.990263999605652e-05,
"loss": 1.1614,
"step": 64
},
{
"epoch": 0.16839378238341968,
"grad_norm": 50.92822786500888,
"learning_rate": 3.989399751229179e-05,
"loss": 1.1998,
"step": 65
},
{
"epoch": 0.17098445595854922,
"grad_norm": 31.04121324055666,
"learning_rate": 3.988498859181645e-05,
"loss": 1.1795,
"step": 66
},
{
"epoch": 0.17357512953367876,
"grad_norm": 50.33061983380845,
"learning_rate": 3.9875613400569975e-05,
"loss": 1.1742,
"step": 67
},
{
"epoch": 0.17616580310880828,
"grad_norm": 75.20462514003519,
"learning_rate": 3.986587211123833e-05,
"loss": 1.1856,
"step": 68
},
{
"epoch": 0.17875647668393782,
"grad_norm": 38.82139317052205,
"learning_rate": 3.98557649032508e-05,
"loss": 1.1529,
"step": 69
},
{
"epoch": 0.18134715025906736,
"grad_norm": 36.55988806615175,
"learning_rate": 3.984529196277674e-05,
"loss": 1.1884,
"step": 70
},
{
"epoch": 0.18393782383419688,
"grad_norm": 104.8931793971097,
"learning_rate": 3.983445348272203e-05,
"loss": 1.2182,
"step": 71
},
{
"epoch": 0.18652849740932642,
"grad_norm": 36.50395409234617,
"learning_rate": 3.982324966272566e-05,
"loss": 1.1609,
"step": 72
},
{
"epoch": 0.18911917098445596,
"grad_norm": 35.019191693448626,
"learning_rate": 3.981168070915594e-05,
"loss": 1.173,
"step": 73
},
{
"epoch": 0.19170984455958548,
"grad_norm": 33.378390048053596,
"learning_rate": 3.979974683510677e-05,
"loss": 1.173,
"step": 74
},
{
"epoch": 0.19430051813471502,
"grad_norm": 43.356840136984154,
"learning_rate": 3.978744826039366e-05,
"loss": 1.2032,
"step": 75
},
{
"epoch": 0.19689119170984457,
"grad_norm": 31.285725922510768,
"learning_rate": 3.977478521154974e-05,
"loss": 1.1569,
"step": 76
},
{
"epoch": 0.19948186528497408,
"grad_norm": 35.19264482867074,
"learning_rate": 3.9761757921821544e-05,
"loss": 1.1365,
"step": 77
},
{
"epoch": 0.20207253886010362,
"grad_norm": 44.66037256551279,
"learning_rate": 3.974836663116472e-05,
"loss": 1.164,
"step": 78
},
{
"epoch": 0.20466321243523317,
"grad_norm": 68.91101457952654,
"learning_rate": 3.973461158623963e-05,
"loss": 1.2256,
"step": 79
},
{
"epoch": 0.20725388601036268,
"grad_norm": 45.866521854583,
"learning_rate": 3.9720493040406786e-05,
"loss": 1.1697,
"step": 80
},
{
"epoch": 0.20984455958549222,
"grad_norm": 59.63095169617338,
"learning_rate": 3.970601125372218e-05,
"loss": 1.2094,
"step": 81
},
{
"epoch": 0.21243523316062177,
"grad_norm": 39.085597271064216,
"learning_rate": 3.9691166492932535e-05,
"loss": 1.1048,
"step": 82
},
{
"epoch": 0.21502590673575128,
"grad_norm": 36.40256073477861,
"learning_rate": 3.9675959031470336e-05,
"loss": 1.248,
"step": 83
},
{
"epoch": 0.21761658031088082,
"grad_norm": 29.846921716586085,
"learning_rate": 3.966038914944881e-05,
"loss": 1.1718,
"step": 84
},
{
"epoch": 0.22020725388601037,
"grad_norm": 50.87052190327881,
"learning_rate": 3.964445713365682e-05,
"loss": 1.1529,
"step": 85
},
{
"epoch": 0.22279792746113988,
"grad_norm": 35.32915760431302,
"learning_rate": 3.9628163277553486e-05,
"loss": 1.1767,
"step": 86
},
{
"epoch": 0.22538860103626943,
"grad_norm": 157.5587514654703,
"learning_rate": 3.961150788126286e-05,
"loss": 1.2194,
"step": 87
},
{
"epoch": 0.22797927461139897,
"grad_norm": 25.03485489120971,
"learning_rate": 3.9594491251568376e-05,
"loss": 1.1392,
"step": 88
},
{
"epoch": 0.23056994818652848,
"grad_norm": 80.55933867045263,
"learning_rate": 3.957711370190716e-05,
"loss": 1.1819,
"step": 89
},
{
"epoch": 0.23316062176165803,
"grad_norm": 272.22874004071406,
"learning_rate": 3.9559375552364325e-05,
"loss": 1.0998,
"step": 90
},
{
"epoch": 0.23575129533678757,
"grad_norm": 91.94671663482514,
"learning_rate": 3.954127712966702e-05,
"loss": 1.2494,
"step": 91
},
{
"epoch": 0.23834196891191708,
"grad_norm": 54.31533598131098,
"learning_rate": 3.952281876717843e-05,
"loss": 1.1385,
"step": 92
},
{
"epoch": 0.24093264248704663,
"grad_norm": 103.20789745908105,
"learning_rate": 3.950400080489165e-05,
"loss": 1.1398,
"step": 93
},
{
"epoch": 0.24352331606217617,
"grad_norm": 45.14746362545893,
"learning_rate": 3.94848235894234e-05,
"loss": 1.2697,
"step": 94
},
{
"epoch": 0.24611398963730569,
"grad_norm": 21.271923336142002,
"learning_rate": 3.9465287474007654e-05,
"loss": 1.1397,
"step": 95
},
{
"epoch": 0.24870466321243523,
"grad_norm": 93.89786795431422,
"learning_rate": 3.944539281848912e-05,
"loss": 1.1542,
"step": 96
},
{
"epoch": 0.25129533678756477,
"grad_norm": 32.38768349342839,
"learning_rate": 3.942513998931663e-05,
"loss": 1.1693,
"step": 97
},
{
"epoch": 0.25129533678756477,
"eval_loss": 1.1344976425170898,
"eval_runtime": 37.8807,
"eval_samples_per_second": 19.641,
"eval_steps_per_second": 1.241,
"step": 97
},
{
"epoch": 0.2538860103626943,
"grad_norm": 91.41293468177638,
"learning_rate": 3.940452935953639e-05,
"loss": 1.1724,
"step": 98
},
{
"epoch": 0.25647668393782386,
"grad_norm": 39.20645478419229,
"learning_rate": 3.9383561308785075e-05,
"loss": 1.1583,
"step": 99
},
{
"epoch": 0.25906735751295334,
"grad_norm": 35.32804513153546,
"learning_rate": 3.9362236223282885e-05,
"loss": 1.158,
"step": 100
},
{
"epoch": 0.2616580310880829,
"grad_norm": 35.24783762804842,
"learning_rate": 3.934055449582641e-05,
"loss": 1.1552,
"step": 101
},
{
"epoch": 0.26424870466321243,
"grad_norm": 33.743808031979775,
"learning_rate": 3.931851652578137e-05,
"loss": 1.264,
"step": 102
},
{
"epoch": 0.266839378238342,
"grad_norm": 113.49798793226394,
"learning_rate": 3.92961227190753e-05,
"loss": 1.2361,
"step": 103
},
{
"epoch": 0.2694300518134715,
"grad_norm": 31.813807349410364,
"learning_rate": 3.9273373488190036e-05,
"loss": 1.1246,
"step": 104
},
{
"epoch": 0.27202072538860106,
"grad_norm": 29.391695486306187,
"learning_rate": 3.925026925215417e-05,
"loss": 1.1142,
"step": 105
},
{
"epoch": 0.27461139896373055,
"grad_norm": 33.79933331839905,
"learning_rate": 3.922681043653526e-05,
"loss": 1.1401,
"step": 106
},
{
"epoch": 0.2772020725388601,
"grad_norm": 39.09509012730907,
"learning_rate": 3.920299747343204e-05,
"loss": 1.1822,
"step": 107
},
{
"epoch": 0.27979274611398963,
"grad_norm": 37.81471938433609,
"learning_rate": 3.9178830801466465e-05,
"loss": 1.1592,
"step": 108
},
{
"epoch": 0.2823834196891192,
"grad_norm": 69.07753778460207,
"learning_rate": 3.915431086577561e-05,
"loss": 1.1683,
"step": 109
},
{
"epoch": 0.2849740932642487,
"grad_norm": 28.864787246081605,
"learning_rate": 3.912943811800347e-05,
"loss": 1.1179,
"step": 110
},
{
"epoch": 0.28756476683937826,
"grad_norm": 28.842042951717836,
"learning_rate": 3.910421301629264e-05,
"loss": 1.1317,
"step": 111
},
{
"epoch": 0.29015544041450775,
"grad_norm": 51.475482074695506,
"learning_rate": 3.9078636025275904e-05,
"loss": 1.1451,
"step": 112
},
{
"epoch": 0.2927461139896373,
"grad_norm": 33.48279556713943,
"learning_rate": 3.9052707616067654e-05,
"loss": 1.1554,
"step": 113
},
{
"epoch": 0.29533678756476683,
"grad_norm": 21.279603575929844,
"learning_rate": 3.9026428266255205e-05,
"loss": 1.1636,
"step": 114
},
{
"epoch": 0.2979274611398964,
"grad_norm": 36.226178034876675,
"learning_rate": 3.899979845989003e-05,
"loss": 1.1966,
"step": 115
},
{
"epoch": 0.3005181347150259,
"grad_norm": 29.90506353145981,
"learning_rate": 3.897281868747878e-05,
"loss": 1.1888,
"step": 116
},
{
"epoch": 0.30310880829015546,
"grad_norm": 36.04602777809767,
"learning_rate": 3.894548944597434e-05,
"loss": 1.2066,
"step": 117
},
{
"epoch": 0.30569948186528495,
"grad_norm": 36.42793844948301,
"learning_rate": 3.8917811238766606e-05,
"loss": 1.1712,
"step": 118
},
{
"epoch": 0.3082901554404145,
"grad_norm": 58.788967662325696,
"learning_rate": 3.888978457567323e-05,
"loss": 1.1225,
"step": 119
},
{
"epoch": 0.31088082901554404,
"grad_norm": 29.357299816022326,
"learning_rate": 3.886140997293024e-05,
"loss": 1.1315,
"step": 120
},
{
"epoch": 0.3134715025906736,
"grad_norm": 95.08345317107502,
"learning_rate": 3.883268795318252e-05,
"loss": 1.1852,
"step": 121
},
{
"epoch": 0.3160621761658031,
"grad_norm": 33.6623824593179,
"learning_rate": 3.88036190454742e-05,
"loss": 1.16,
"step": 122
},
{
"epoch": 0.31865284974093266,
"grad_norm": 42.587546987131105,
"learning_rate": 3.8774203785238886e-05,
"loss": 1.1374,
"step": 123
},
{
"epoch": 0.32124352331606215,
"grad_norm": 33.360649853064245,
"learning_rate": 3.8744442714289816e-05,
"loss": 1.1757,
"step": 124
},
{
"epoch": 0.3238341968911917,
"grad_norm": 49.09256643961471,
"learning_rate": 3.8714336380809874e-05,
"loss": 1.1782,
"step": 125
},
{
"epoch": 0.32642487046632124,
"grad_norm": 31.505007051172793,
"learning_rate": 3.86838853393415e-05,
"loss": 1.195,
"step": 126
},
{
"epoch": 0.3290155440414508,
"grad_norm": 34.36735417254799,
"learning_rate": 3.865309015077645e-05,
"loss": 1.1078,
"step": 127
},
{
"epoch": 0.3316062176165803,
"grad_norm": 36.63220606142181,
"learning_rate": 3.862195138234551e-05,
"loss": 1.1319,
"step": 128
},
{
"epoch": 0.33419689119170987,
"grad_norm": 53.324986862513676,
"learning_rate": 3.859046960760801e-05,
"loss": 1.2301,
"step": 129
},
{
"epoch": 0.33678756476683935,
"grad_norm": 47.41445409144979,
"learning_rate": 3.855864540644126e-05,
"loss": 1.2366,
"step": 130
},
{
"epoch": 0.3393782383419689,
"grad_norm": 32.57355122427366,
"learning_rate": 3.8526479365029906e-05,
"loss": 1.142,
"step": 131
},
{
"epoch": 0.34196891191709844,
"grad_norm": 28.445824333644715,
"learning_rate": 3.849397207585508e-05,
"loss": 1.0847,
"step": 132
},
{
"epoch": 0.344559585492228,
"grad_norm": 49.23062726715889,
"learning_rate": 3.846112413768353e-05,
"loss": 1.2241,
"step": 133
},
{
"epoch": 0.3471502590673575,
"grad_norm": 53.424206543788074,
"learning_rate": 3.842793615555657e-05,
"loss": 1.2392,
"step": 134
},
{
"epoch": 0.34974093264248707,
"grad_norm": 38.19316140175426,
"learning_rate": 3.8394408740778934e-05,
"loss": 1.1208,
"step": 135
},
{
"epoch": 0.35233160621761656,
"grad_norm": 32.35931252369273,
"learning_rate": 3.836054251090755e-05,
"loss": 1.1604,
"step": 136
},
{
"epoch": 0.3549222797927461,
"grad_norm": 37.90085344799495,
"learning_rate": 3.83263380897401e-05,
"loss": 1.1134,
"step": 137
},
{
"epoch": 0.35751295336787564,
"grad_norm": 44.49191588319939,
"learning_rate": 3.829179610730359e-05,
"loss": 1.1281,
"step": 138
},
{
"epoch": 0.3601036269430052,
"grad_norm": 141.98524430756757,
"learning_rate": 3.8256917199842715e-05,
"loss": 1.0928,
"step": 139
},
{
"epoch": 0.3626943005181347,
"grad_norm": 30.887093976524472,
"learning_rate": 3.822170200980815e-05,
"loss": 1.0936,
"step": 140
},
{
"epoch": 0.36528497409326427,
"grad_norm": 21.980521878837745,
"learning_rate": 3.818615118584472e-05,
"loss": 1.1368,
"step": 141
},
{
"epoch": 0.36787564766839376,
"grad_norm": 538.6650762618656,
"learning_rate": 3.815026538277943e-05,
"loss": 1.0918,
"step": 142
},
{
"epoch": 0.3704663212435233,
"grad_norm": 40.842881572203,
"learning_rate": 3.811404526160943e-05,
"loss": 1.1705,
"step": 143
},
{
"epoch": 0.37305699481865284,
"grad_norm": 26.891553492377298,
"learning_rate": 3.8077491489489835e-05,
"loss": 1.1468,
"step": 144
},
{
"epoch": 0.3756476683937824,
"grad_norm": 45.138483181178074,
"learning_rate": 3.8040604739721415e-05,
"loss": 1.1679,
"step": 145
},
{
"epoch": 0.37823834196891193,
"grad_norm": 35.133763086168244,
"learning_rate": 3.8003385691738227e-05,
"loss": 1.1029,
"step": 146
},
{
"epoch": 0.38082901554404147,
"grad_norm": 36.941250802707344,
"learning_rate": 3.7965835031095065e-05,
"loss": 1.1491,
"step": 147
},
{
"epoch": 0.38341968911917096,
"grad_norm": 90.1080256703095,
"learning_rate": 3.792795344945485e-05,
"loss": 1.1212,
"step": 148
},
{
"epoch": 0.3860103626943005,
"grad_norm": 39.70360899750413,
"learning_rate": 3.7889741644575914e-05,
"loss": 1.15,
"step": 149
},
{
"epoch": 0.38860103626943004,
"grad_norm": 28.229369877304094,
"learning_rate": 3.78512003202991e-05,
"loss": 1.1111,
"step": 150
},
{
"epoch": 0.3911917098445596,
"grad_norm": 31.611752191925987,
"learning_rate": 3.7812330186534815e-05,
"loss": 1.1366,
"step": 151
},
{
"epoch": 0.39378238341968913,
"grad_norm": 38.196015586772425,
"learning_rate": 3.777313195924998e-05,
"loss": 1.1433,
"step": 152
},
{
"epoch": 0.3963730569948187,
"grad_norm": 22.732638044547453,
"learning_rate": 3.773360636045481e-05,
"loss": 1.1125,
"step": 153
},
{
"epoch": 0.39896373056994816,
"grad_norm": 90.19158665385014,
"learning_rate": 3.7693754118189525e-05,
"loss": 1.1242,
"step": 154
},
{
"epoch": 0.4015544041450777,
"grad_norm": 42.43479974993017,
"learning_rate": 3.765357596651095e-05,
"loss": 1.1191,
"step": 155
},
{
"epoch": 0.40414507772020725,
"grad_norm": 88.0076735720364,
"learning_rate": 3.761307264547899e-05,
"loss": 1.1718,
"step": 156
},
{
"epoch": 0.4067357512953368,
"grad_norm": 30.782507703935767,
"learning_rate": 3.757224490114297e-05,
"loss": 1.109,
"step": 157
},
{
"epoch": 0.40932642487046633,
"grad_norm": 69.89871106113397,
"learning_rate": 3.7531093485527943e-05,
"loss": 1.1018,
"step": 158
},
{
"epoch": 0.4119170984455959,
"grad_norm": 37.339006645717305,
"learning_rate": 3.7489619156620796e-05,
"loss": 1.1358,
"step": 159
},
{
"epoch": 0.41450777202072536,
"grad_norm": 28.06388054378899,
"learning_rate": 3.744782267835632e-05,
"loss": 1.0847,
"step": 160
},
{
"epoch": 0.4170984455958549,
"grad_norm": 54.05874281297702,
"learning_rate": 3.740570482060311e-05,
"loss": 1.1682,
"step": 161
},
{
"epoch": 0.41968911917098445,
"grad_norm": 32.299093265328835,
"learning_rate": 3.73632663591494e-05,
"loss": 1.1413,
"step": 162
},
{
"epoch": 0.422279792746114,
"grad_norm": 31.213652090157694,
"learning_rate": 3.732050807568878e-05,
"loss": 1.1313,
"step": 163
},
{
"epoch": 0.42487046632124353,
"grad_norm": 40.01090035937505,
"learning_rate": 3.727743075780578e-05,
"loss": 1.1513,
"step": 164
},
{
"epoch": 0.4274611398963731,
"grad_norm": 47.11352577964853,
"learning_rate": 3.723403519896136e-05,
"loss": 1.2192,
"step": 165
},
{
"epoch": 0.43005181347150256,
"grad_norm": 28.645086506093037,
"learning_rate": 3.7190322198478355e-05,
"loss": 1.1097,
"step": 166
},
{
"epoch": 0.4326424870466321,
"grad_norm": 35.28541113925116,
"learning_rate": 3.7146292561526654e-05,
"loss": 1.1557,
"step": 167
},
{
"epoch": 0.43523316062176165,
"grad_norm": 58.30281063037669,
"learning_rate": 3.7101947099108425e-05,
"loss": 1.1829,
"step": 168
},
{
"epoch": 0.4378238341968912,
"grad_norm": 26.33563548968379,
"learning_rate": 3.70572866280432e-05,
"loss": 1.147,
"step": 169
},
{
"epoch": 0.44041450777202074,
"grad_norm": 57.00052875402651,
"learning_rate": 3.701231197095277e-05,
"loss": 1.1212,
"step": 170
},
{
"epoch": 0.4430051813471503,
"grad_norm": 23.672828037237174,
"learning_rate": 3.696702395624608e-05,
"loss": 1.1152,
"step": 171
},
{
"epoch": 0.44559585492227977,
"grad_norm": 41.1264174112964,
"learning_rate": 3.692142341810395e-05,
"loss": 1.1154,
"step": 172
},
{
"epoch": 0.4481865284974093,
"grad_norm": 26.72177706144361,
"learning_rate": 3.6875511196463715e-05,
"loss": 1.1725,
"step": 173
},
{
"epoch": 0.45077720207253885,
"grad_norm": 95.4088800585977,
"learning_rate": 3.682928813700375e-05,
"loss": 1.1339,
"step": 174
},
{
"epoch": 0.4533678756476684,
"grad_norm": 34.33666578349465,
"learning_rate": 3.678275509112788e-05,
"loss": 1.1867,
"step": 175
},
{
"epoch": 0.45595854922279794,
"grad_norm": 31.032304531003014,
"learning_rate": 3.6735912915949745e-05,
"loss": 1.1386,
"step": 176
},
{
"epoch": 0.4585492227979275,
"grad_norm": 55.22043313188224,
"learning_rate": 3.6688762474276945e-05,
"loss": 1.1102,
"step": 177
},
{
"epoch": 0.46113989637305697,
"grad_norm": 29.82713377876857,
"learning_rate": 3.6641304634595216e-05,
"loss": 1.1564,
"step": 178
},
{
"epoch": 0.4637305699481865,
"grad_norm": 35.71025459541737,
"learning_rate": 3.659354027105238e-05,
"loss": 1.0939,
"step": 179
},
{
"epoch": 0.46632124352331605,
"grad_norm": 52.41175655642653,
"learning_rate": 3.6545470263442265e-05,
"loss": 1.1578,
"step": 180
},
{
"epoch": 0.4689119170984456,
"grad_norm": 27.682485766528306,
"learning_rate": 3.649709549718849e-05,
"loss": 1.1875,
"step": 181
},
{
"epoch": 0.47150259067357514,
"grad_norm": 36.53293663303487,
"learning_rate": 3.6448416863328186e-05,
"loss": 1.1111,
"step": 182
},
{
"epoch": 0.4740932642487047,
"grad_norm": 31.45177998538027,
"learning_rate": 3.639943525849555e-05,
"loss": 1.113,
"step": 183
},
{
"epoch": 0.47668393782383417,
"grad_norm": 28.323097072885673,
"learning_rate": 3.635015158490533e-05,
"loss": 1.1159,
"step": 184
},
{
"epoch": 0.4792746113989637,
"grad_norm": 47.75573754341213,
"learning_rate": 3.6300566750336225e-05,
"loss": 1.1305,
"step": 185
},
{
"epoch": 0.48186528497409326,
"grad_norm": 21.384095061494357,
"learning_rate": 3.625068166811418e-05,
"loss": 1.1369,
"step": 186
},
{
"epoch": 0.4844559585492228,
"grad_norm": 30.714645036809546,
"learning_rate": 3.6200497257095504e-05,
"loss": 1.1858,
"step": 187
},
{
"epoch": 0.48704663212435234,
"grad_norm": 35.12161426399798,
"learning_rate": 3.615001444165001e-05,
"loss": 1.1293,
"step": 188
},
{
"epoch": 0.4896373056994819,
"grad_norm": 116.83443661381396,
"learning_rate": 3.6099234151643924e-05,
"loss": 1.1515,
"step": 189
},
{
"epoch": 0.49222797927461137,
"grad_norm": 55.47885243409044,
"learning_rate": 3.604815732242283e-05,
"loss": 1.112,
"step": 190
},
{
"epoch": 0.4948186528497409,
"grad_norm": 32.332747429034285,
"learning_rate": 3.5996784894794394e-05,
"loss": 1.1661,
"step": 191
},
{
"epoch": 0.49740932642487046,
"grad_norm": 33.039210183180046,
"learning_rate": 3.594511781501103e-05,
"loss": 1.1244,
"step": 192
},
{
"epoch": 0.5,
"grad_norm": 21.325687337182504,
"learning_rate": 3.58931570347525e-05,
"loss": 1.1634,
"step": 193
},
{
"epoch": 0.5025906735751295,
"grad_norm": 51.37599478469561,
"learning_rate": 3.584090351110838e-05,
"loss": 1.2106,
"step": 194
},
{
"epoch": 0.5025906735751295,
"eval_loss": 1.1119717359542847,
"eval_runtime": 49.6027,
"eval_samples_per_second": 14.999,
"eval_steps_per_second": 0.948,
"step": 194
},
{
"epoch": 0.5051813471502591,
"grad_norm": 42.105169991612456,
"learning_rate": 3.57883582065604e-05,
"loss": 1.1303,
"step": 195
},
{
"epoch": 0.5077720207253886,
"grad_norm": 37.14457014578168,
"learning_rate": 3.573552208896474e-05,
"loss": 1.1483,
"step": 196
},
{
"epoch": 0.5103626943005182,
"grad_norm": 28.56241612018119,
"learning_rate": 3.568239613153421e-05,
"loss": 1.0843,
"step": 197
},
{
"epoch": 0.5129533678756477,
"grad_norm": 35.399304035761865,
"learning_rate": 3.5628981312820315e-05,
"loss": 1.1177,
"step": 198
},
{
"epoch": 0.5155440414507773,
"grad_norm": 25.91156850470446,
"learning_rate": 3.557527861669522e-05,
"loss": 1.1215,
"step": 199
},
{
"epoch": 0.5181347150259067,
"grad_norm": 43.509516777992324,
"learning_rate": 3.552128903233363e-05,
"loss": 1.1532,
"step": 200
},
{
"epoch": 0.5207253886010362,
"grad_norm": 38.18164449834795,
"learning_rate": 3.54670135541946e-05,
"loss": 1.1142,
"step": 201
},
{
"epoch": 0.5233160621761658,
"grad_norm": 48.576743289054534,
"learning_rate": 3.541245318200318e-05,
"loss": 1.1152,
"step": 202
},
{
"epoch": 0.5259067357512953,
"grad_norm": 38.65411737007163,
"learning_rate": 3.5357608920732e-05,
"loss": 1.1607,
"step": 203
},
{
"epoch": 0.5284974093264249,
"grad_norm": 35.663493907396834,
"learning_rate": 3.530248178058282e-05,
"loss": 1.1273,
"step": 204
},
{
"epoch": 0.5310880829015544,
"grad_norm": 26.829817821665976,
"learning_rate": 3.5247072776967805e-05,
"loss": 1.1174,
"step": 205
},
{
"epoch": 0.533678756476684,
"grad_norm": 39.79604912152638,
"learning_rate": 3.519138293049097e-05,
"loss": 1.1811,
"step": 206
},
{
"epoch": 0.5362694300518135,
"grad_norm": 32.26179097390416,
"learning_rate": 3.513541326692925e-05,
"loss": 1.1346,
"step": 207
},
{
"epoch": 0.538860103626943,
"grad_norm": 24.35769329902787,
"learning_rate": 3.5079164817213684e-05,
"loss": 1.1061,
"step": 208
},
{
"epoch": 0.5414507772020726,
"grad_norm": 26.645546258363844,
"learning_rate": 3.5022638617410396e-05,
"loss": 1.0514,
"step": 209
},
{
"epoch": 0.5440414507772021,
"grad_norm": 105.19676603444857,
"learning_rate": 3.496583570870152e-05,
"loss": 1.1474,
"step": 210
},
{
"epoch": 0.5466321243523317,
"grad_norm": 61.600623030405885,
"learning_rate": 3.4908757137366006e-05,
"loss": 1.104,
"step": 211
},
{
"epoch": 0.5492227979274611,
"grad_norm": 31.65460129853052,
"learning_rate": 3.485140395476038e-05,
"loss": 1.0737,
"step": 212
},
{
"epoch": 0.5518134715025906,
"grad_norm": 26.860379117211497,
"learning_rate": 3.4793777217299346e-05,
"loss": 1.1119,
"step": 213
},
{
"epoch": 0.5544041450777202,
"grad_norm": 39.89324262309783,
"learning_rate": 3.473587798643633e-05,
"loss": 1.1626,
"step": 214
},
{
"epoch": 0.5569948186528497,
"grad_norm": 39.77638257731599,
"learning_rate": 3.467770732864399e-05,
"loss": 1.1545,
"step": 215
},
{
"epoch": 0.5595854922279793,
"grad_norm": 30.994657564291458,
"learning_rate": 3.461926631539445e-05,
"loss": 1.1646,
"step": 216
},
{
"epoch": 0.5621761658031088,
"grad_norm": 51.99674092516571,
"learning_rate": 3.4560556023139695e-05,
"loss": 1.1638,
"step": 217
},
{
"epoch": 0.5647668393782384,
"grad_norm": 58.5132713002146,
"learning_rate": 3.450157753329166e-05,
"loss": 1.1461,
"step": 218
},
{
"epoch": 0.5673575129533679,
"grad_norm": 30.712469030418482,
"learning_rate": 3.4442331932202326e-05,
"loss": 1.1583,
"step": 219
},
{
"epoch": 0.5699481865284974,
"grad_norm": 47.00217426642832,
"learning_rate": 3.438282031114374e-05,
"loss": 1.1154,
"step": 220
},
{
"epoch": 0.572538860103627,
"grad_norm": 37.33927961163222,
"learning_rate": 3.432304376628787e-05,
"loss": 1.1372,
"step": 221
},
{
"epoch": 0.5751295336787565,
"grad_norm": 28.858636933974392,
"learning_rate": 3.4263003398686464e-05,
"loss": 1.0488,
"step": 222
},
{
"epoch": 0.5777202072538861,
"grad_norm": 37.842230890171486,
"learning_rate": 3.420270031425072e-05,
"loss": 1.1892,
"step": 223
},
{
"epoch": 0.5803108808290155,
"grad_norm": 32.65394945357516,
"learning_rate": 3.4142135623730954e-05,
"loss": 1.1218,
"step": 224
},
{
"epoch": 0.582901554404145,
"grad_norm": 115.22040829465772,
"learning_rate": 3.4081310442696114e-05,
"loss": 1.1546,
"step": 225
},
{
"epoch": 0.5854922279792746,
"grad_norm": 31.20514468446119,
"learning_rate": 3.402022589151325e-05,
"loss": 1.0969,
"step": 226
},
{
"epoch": 0.5880829015544041,
"grad_norm": 52.8397361926395,
"learning_rate": 3.395888309532687e-05,
"loss": 1.1218,
"step": 227
},
{
"epoch": 0.5906735751295337,
"grad_norm": 51.7991692917308,
"learning_rate": 3.3897283184038215e-05,
"loss": 1.1395,
"step": 228
},
{
"epoch": 0.5932642487046632,
"grad_norm": 33.56775233970504,
"learning_rate": 3.3835427292284445e-05,
"loss": 1.1107,
"step": 229
},
{
"epoch": 0.5958549222797928,
"grad_norm": 46.081120788214314,
"learning_rate": 3.3773316559417734e-05,
"loss": 1.1472,
"step": 230
},
{
"epoch": 0.5984455958549223,
"grad_norm": 41.72558170492288,
"learning_rate": 3.371095212948431e-05,
"loss": 1.1871,
"step": 231
},
{
"epoch": 0.6010362694300518,
"grad_norm": 34.27957927587091,
"learning_rate": 3.364833515120336e-05,
"loss": 1.1376,
"step": 232
},
{
"epoch": 0.6036269430051814,
"grad_norm": 36.58452602010953,
"learning_rate": 3.358546677794586e-05,
"loss": 1.1885,
"step": 233
},
{
"epoch": 0.6062176165803109,
"grad_norm": 28.010809914189192,
"learning_rate": 3.352234816771337e-05,
"loss": 1.102,
"step": 234
},
{
"epoch": 0.6088082901554405,
"grad_norm": 24.78419558611963,
"learning_rate": 3.3458980483116664e-05,
"loss": 1.0818,
"step": 235
},
{
"epoch": 0.6113989637305699,
"grad_norm": 28.12830040081226,
"learning_rate": 3.3395364891354316e-05,
"loss": 1.1862,
"step": 236
},
{
"epoch": 0.6139896373056994,
"grad_norm": 37.94181651161551,
"learning_rate": 3.333150256419127e-05,
"loss": 1.147,
"step": 237
},
{
"epoch": 0.616580310880829,
"grad_norm": 21.809518482701854,
"learning_rate": 3.3267394677937134e-05,
"loss": 1.0994,
"step": 238
},
{
"epoch": 0.6191709844559585,
"grad_norm": 32.12135773753589,
"learning_rate": 3.320304241342464e-05,
"loss": 1.1531,
"step": 239
},
{
"epoch": 0.6217616580310881,
"grad_norm": 51.959731073524054,
"learning_rate": 3.31384469559878e-05,
"loss": 1.1717,
"step": 240
},
{
"epoch": 0.6243523316062176,
"grad_norm": 28.045815836372345,
"learning_rate": 3.307360949544012e-05,
"loss": 1.1814,
"step": 241
},
{
"epoch": 0.6269430051813472,
"grad_norm": 39.55208384578746,
"learning_rate": 3.300853122605268e-05,
"loss": 1.1483,
"step": 242
},
{
"epoch": 0.6295336787564767,
"grad_norm": 29.799974205160808,
"learning_rate": 3.294321334653213e-05,
"loss": 1.1838,
"step": 243
},
{
"epoch": 0.6321243523316062,
"grad_norm": 124.31035254102245,
"learning_rate": 3.2877657059998584e-05,
"loss": 1.0698,
"step": 244
},
{
"epoch": 0.6347150259067358,
"grad_norm": 37.989925180187655,
"learning_rate": 3.281186357396351e-05,
"loss": 1.0984,
"step": 245
},
{
"epoch": 0.6373056994818653,
"grad_norm": 55.72599333657572,
"learning_rate": 3.274583410030745e-05,
"loss": 1.2333,
"step": 246
},
{
"epoch": 0.6398963730569949,
"grad_norm": 46.77079456439719,
"learning_rate": 3.267956985525774e-05,
"loss": 1.2157,
"step": 247
},
{
"epoch": 0.6424870466321243,
"grad_norm": 33.62329915252562,
"learning_rate": 3.261307205936603e-05,
"loss": 1.1752,
"step": 248
},
{
"epoch": 0.6450777202072538,
"grad_norm": 34.11794183225494,
"learning_rate": 3.2546341937485884e-05,
"loss": 1.1265,
"step": 249
},
{
"epoch": 0.6476683937823834,
"grad_norm": 36.027636323913896,
"learning_rate": 3.247938071875017e-05,
"loss": 1.103,
"step": 250
},
{
"epoch": 0.6502590673575129,
"grad_norm": 35.393219337329946,
"learning_rate": 3.2412189636548456e-05,
"loss": 1.1148,
"step": 251
},
{
"epoch": 0.6528497409326425,
"grad_norm": 31.578919022569924,
"learning_rate": 3.234476992850425e-05,
"loss": 1.1149,
"step": 252
},
{
"epoch": 0.655440414507772,
"grad_norm": 28.93717647736964,
"learning_rate": 3.227712283645224e-05,
"loss": 1.1425,
"step": 253
},
{
"epoch": 0.6580310880829016,
"grad_norm": 34.170026750703684,
"learning_rate": 3.2209249606415394e-05,
"loss": 1.1591,
"step": 254
},
{
"epoch": 0.6606217616580311,
"grad_norm": 27.52194954061608,
"learning_rate": 3.214115148858201e-05,
"loss": 1.1704,
"step": 255
},
{
"epoch": 0.6632124352331606,
"grad_norm": 81.65404753769732,
"learning_rate": 3.207282973728273e-05,
"loss": 1.161,
"step": 256
},
{
"epoch": 0.6658031088082902,
"grad_norm": 57.45351536522683,
"learning_rate": 3.200428561096737e-05,
"loss": 1.116,
"step": 257
},
{
"epoch": 0.6683937823834197,
"grad_norm": 30.968529074463714,
"learning_rate": 3.193552037218179e-05,
"loss": 1.1265,
"step": 258
},
{
"epoch": 0.6709844559585493,
"grad_norm": 37.8817748068655,
"learning_rate": 3.186653528754464e-05,
"loss": 1.1287,
"step": 259
},
{
"epoch": 0.6735751295336787,
"grad_norm": 29.197031189172545,
"learning_rate": 3.179733162772398e-05,
"loss": 1.1045,
"step": 260
},
{
"epoch": 0.6761658031088082,
"grad_norm": 36.56253841299107,
"learning_rate": 3.172791066741392e-05,
"loss": 1.1539,
"step": 261
},
{
"epoch": 0.6787564766839378,
"grad_norm": 25.799921116950998,
"learning_rate": 3.165827368531113e-05,
"loss": 1.0796,
"step": 262
},
{
"epoch": 0.6813471502590673,
"grad_norm": 82.81825216532526,
"learning_rate": 3.1588421964091276e-05,
"loss": 1.142,
"step": 263
},
{
"epoch": 0.6839378238341969,
"grad_norm": 31.100074747569124,
"learning_rate": 3.151835679038542e-05,
"loss": 1.0908,
"step": 264
},
{
"epoch": 0.6865284974093264,
"grad_norm": 25.57297200703221,
"learning_rate": 3.14480794547563e-05,
"loss": 1.1436,
"step": 265
},
{
"epoch": 0.689119170984456,
"grad_norm": 23.92492773149328,
"learning_rate": 3.137759125167455e-05,
"loss": 1.1202,
"step": 266
},
{
"epoch": 0.6917098445595855,
"grad_norm": 22.14274360766396,
"learning_rate": 3.130689347949486e-05,
"loss": 1.1113,
"step": 267
},
{
"epoch": 0.694300518134715,
"grad_norm": 26.68725288649902,
"learning_rate": 3.123598744043211e-05,
"loss": 1.1517,
"step": 268
},
{
"epoch": 0.6968911917098446,
"grad_norm": 25.559817524659362,
"learning_rate": 3.1164874440537295e-05,
"loss": 1.0976,
"step": 269
},
{
"epoch": 0.6994818652849741,
"grad_norm": 28.89996834100355,
"learning_rate": 3.109355578967356e-05,
"loss": 1.1932,
"step": 270
},
{
"epoch": 0.7020725388601037,
"grad_norm": 32.09658045195569,
"learning_rate": 3.1022032801492e-05,
"loss": 1.1161,
"step": 271
},
{
"epoch": 0.7046632124352331,
"grad_norm": 30.623705646213768,
"learning_rate": 3.095030679340751e-05,
"loss": 1.1993,
"step": 272
},
{
"epoch": 0.7072538860103627,
"grad_norm": 41.71263710932429,
"learning_rate": 3.0878379086574494e-05,
"loss": 1.1624,
"step": 273
},
{
"epoch": 0.7098445595854922,
"grad_norm": 34.68352639470226,
"learning_rate": 3.0806251005862535e-05,
"loss": 1.1156,
"step": 274
},
{
"epoch": 0.7124352331606217,
"grad_norm": 23.52580702428812,
"learning_rate": 3.073392387983202e-05,
"loss": 1.0963,
"step": 275
},
{
"epoch": 0.7150259067357513,
"grad_norm": 28.10687988214902,
"learning_rate": 3.0661399040709584e-05,
"loss": 1.1095,
"step": 276
},
{
"epoch": 0.7176165803108808,
"grad_norm": 66.72288729975841,
"learning_rate": 3.05886778243637e-05,
"loss": 1.0865,
"step": 277
},
{
"epoch": 0.7202072538860104,
"grad_norm": 25.775217430321934,
"learning_rate": 3.051576157027998e-05,
"loss": 1.1058,
"step": 278
},
{
"epoch": 0.7227979274611399,
"grad_norm": 36.82942099016794,
"learning_rate": 3.0442651621536502e-05,
"loss": 1.1211,
"step": 279
},
{
"epoch": 0.7253886010362695,
"grad_norm": 27.878820856521013,
"learning_rate": 3.0369349324779115e-05,
"loss": 1.1471,
"step": 280
},
{
"epoch": 0.727979274611399,
"grad_norm": 31.293156717285573,
"learning_rate": 3.0295856030196618e-05,
"loss": 1.0748,
"step": 281
},
{
"epoch": 0.7305699481865285,
"grad_norm": 39.315952115194435,
"learning_rate": 3.022217309149588e-05,
"loss": 1.0993,
"step": 282
},
{
"epoch": 0.7331606217616581,
"grad_norm": 36.79954071435495,
"learning_rate": 3.0148301865876913e-05,
"loss": 1.1045,
"step": 283
},
{
"epoch": 0.7357512953367875,
"grad_norm": 26.127389502147167,
"learning_rate": 3.0074243714007875e-05,
"loss": 1.1424,
"step": 284
},
{
"epoch": 0.7383419689119171,
"grad_norm": 25.608778060317068,
"learning_rate": 3.0000000000000004e-05,
"loss": 1.1055,
"step": 285
},
{
"epoch": 0.7409326424870466,
"grad_norm": 36.22629669671894,
"learning_rate": 2.992557209138249e-05,
"loss": 1.0845,
"step": 286
},
{
"epoch": 0.7435233160621761,
"grad_norm": 35.30642111132886,
"learning_rate": 2.9850961359077293e-05,
"loss": 1.204,
"step": 287
},
{
"epoch": 0.7461139896373057,
"grad_norm": 29.765894622087952,
"learning_rate": 2.977616917737388e-05,
"loss": 1.168,
"step": 288
},
{
"epoch": 0.7487046632124352,
"grad_norm": 27.194683587397567,
"learning_rate": 2.9701196923903927e-05,
"loss": 1.1236,
"step": 289
},
{
"epoch": 0.7512953367875648,
"grad_norm": 63.09779240191165,
"learning_rate": 2.9626045979615928e-05,
"loss": 1.1395,
"step": 290
},
{
"epoch": 0.7538860103626943,
"grad_norm": 25.014233377763066,
"learning_rate": 2.9550717728749768e-05,
"loss": 1.1054,
"step": 291
},
{
"epoch": 0.7538860103626943,
"eval_loss": 1.0996382236480713,
"eval_runtime": 37.9545,
"eval_samples_per_second": 19.602,
"eval_steps_per_second": 1.238,
"step": 291
},
{
"epoch": 0.7564766839378239,
"grad_norm": 27.481891737318097,
"learning_rate": 2.947521355881122e-05,
"loss": 1.1252,
"step": 292
},
{
"epoch": 0.7590673575129534,
"grad_norm": 67.57807413949878,
"learning_rate": 2.9399534860546404e-05,
"loss": 1.1761,
"step": 293
},
{
"epoch": 0.7616580310880829,
"grad_norm": 65.66834495909988,
"learning_rate": 2.932368302791614e-05,
"loss": 1.0551,
"step": 294
},
{
"epoch": 0.7642487046632125,
"grad_norm": 30.051210942517116,
"learning_rate": 2.92476594580703e-05,
"loss": 1.138,
"step": 295
},
{
"epoch": 0.7668393782383419,
"grad_norm": 22.693089678510507,
"learning_rate": 2.917146555132206e-05,
"loss": 1.1495,
"step": 296
},
{
"epoch": 0.7694300518134715,
"grad_norm": 53.84166280540606,
"learning_rate": 2.909510271112212e-05,
"loss": 1.1409,
"step": 297
},
{
"epoch": 0.772020725388601,
"grad_norm": 32.69106061524578,
"learning_rate": 2.9018572344032823e-05,
"loss": 1.1709,
"step": 298
},
{
"epoch": 0.7746113989637305,
"grad_norm": 39.44484991312582,
"learning_rate": 2.8941875859702283e-05,
"loss": 1.1138,
"step": 299
},
{
"epoch": 0.7772020725388601,
"grad_norm": 31.51857596969122,
"learning_rate": 2.88650146708384e-05,
"loss": 1.1931,
"step": 300
},
{
"epoch": 0.7797927461139896,
"grad_norm": 70.51218412614058,
"learning_rate": 2.878799019318283e-05,
"loss": 1.155,
"step": 301
},
{
"epoch": 0.7823834196891192,
"grad_norm": 80.27969224752457,
"learning_rate": 2.8710803845484955e-05,
"loss": 1.1425,
"step": 302
},
{
"epoch": 0.7849740932642487,
"grad_norm": 28.16560857981767,
"learning_rate": 2.8633457049475678e-05,
"loss": 1.1072,
"step": 303
},
{
"epoch": 0.7875647668393783,
"grad_norm": 41.15138307552231,
"learning_rate": 2.855595122984129e-05,
"loss": 1.1492,
"step": 304
},
{
"epoch": 0.7901554404145078,
"grad_norm": 23.894217282116276,
"learning_rate": 2.847828781419722e-05,
"loss": 1.1136,
"step": 305
},
{
"epoch": 0.7927461139896373,
"grad_norm": 25.005501120810248,
"learning_rate": 2.8400468233061708e-05,
"loss": 1.0921,
"step": 306
},
{
"epoch": 0.7953367875647669,
"grad_norm": 30.91791938195468,
"learning_rate": 2.832249391982949e-05,
"loss": 1.1098,
"step": 307
},
{
"epoch": 0.7979274611398963,
"grad_norm": 44.776563922922726,
"learning_rate": 2.8244366310745398e-05,
"loss": 1.1845,
"step": 308
},
{
"epoch": 0.8005181347150259,
"grad_norm": 19.059329544784376,
"learning_rate": 2.816608684487787e-05,
"loss": 1.169,
"step": 309
},
{
"epoch": 0.8031088082901554,
"grad_norm": 63.97334641962602,
"learning_rate": 2.8087656964092472e-05,
"loss": 1.124,
"step": 310
},
{
"epoch": 0.805699481865285,
"grad_norm": 30.878848859015882,
"learning_rate": 2.8009078113025335e-05,
"loss": 1.2087,
"step": 311
},
{
"epoch": 0.8082901554404145,
"grad_norm": 34.63835471543836,
"learning_rate": 2.7930351739056533e-05,
"loss": 1.1338,
"step": 312
},
{
"epoch": 0.810880829015544,
"grad_norm": 30.03178182445718,
"learning_rate": 2.7851479292283442e-05,
"loss": 1.1321,
"step": 313
},
{
"epoch": 0.8134715025906736,
"grad_norm": 38.42236523356876,
"learning_rate": 2.7772462225494013e-05,
"loss": 1.1557,
"step": 314
},
{
"epoch": 0.8160621761658031,
"grad_norm": 39.179683790956744,
"learning_rate": 2.7693301994140026e-05,
"loss": 1.1201,
"step": 315
},
{
"epoch": 0.8186528497409327,
"grad_norm": 38.32243159447327,
"learning_rate": 2.761400005631028e-05,
"loss": 1.1105,
"step": 316
},
{
"epoch": 0.8212435233160622,
"grad_norm": 39.913808227411835,
"learning_rate": 2.7534557872703705e-05,
"loss": 1.1598,
"step": 317
},
{
"epoch": 0.8238341968911918,
"grad_norm": 69.73521867812421,
"learning_rate": 2.7454976906602513e-05,
"loss": 1.1145,
"step": 318
},
{
"epoch": 0.8264248704663213,
"grad_norm": 65.55887588207746,
"learning_rate": 2.7375258623845207e-05,
"loss": 1.1255,
"step": 319
},
{
"epoch": 0.8290155440414507,
"grad_norm": 30.980111545641563,
"learning_rate": 2.7295404492799575e-05,
"loss": 1.122,
"step": 320
},
{
"epoch": 0.8316062176165803,
"grad_norm": 30.12179911444832,
"learning_rate": 2.721541598433567e-05,
"loss": 1.113,
"step": 321
},
{
"epoch": 0.8341968911917098,
"grad_norm": 28.329434659508582,
"learning_rate": 2.7135294571798706e-05,
"loss": 1.0498,
"step": 322
},
{
"epoch": 0.8367875647668394,
"grad_norm": 25.114787597049578,
"learning_rate": 2.70550417309819e-05,
"loss": 1.0633,
"step": 323
},
{
"epoch": 0.8393782383419689,
"grad_norm": 27.754037709590385,
"learning_rate": 2.6974658940099337e-05,
"loss": 1.1585,
"step": 324
},
{
"epoch": 0.8419689119170984,
"grad_norm": 29.489888159179444,
"learning_rate": 2.6894147679758678e-05,
"loss": 1.1259,
"step": 325
},
{
"epoch": 0.844559585492228,
"grad_norm": 24.426102194202898,
"learning_rate": 2.6813509432933957e-05,
"loss": 1.1515,
"step": 326
},
{
"epoch": 0.8471502590673575,
"grad_norm": 24.75197483331429,
"learning_rate": 2.673274568493821e-05,
"loss": 1.15,
"step": 327
},
{
"epoch": 0.8497409326424871,
"grad_norm": 40.604864626683366,
"learning_rate": 2.6651857923396132e-05,
"loss": 1.1219,
"step": 328
},
{
"epoch": 0.8523316062176166,
"grad_norm": 34.694568404196026,
"learning_rate": 2.6570847638216698e-05,
"loss": 1.103,
"step": 329
},
{
"epoch": 0.8549222797927462,
"grad_norm": 48.715136403425035,
"learning_rate": 2.648971632156569e-05,
"loss": 1.1675,
"step": 330
},
{
"epoch": 0.8575129533678757,
"grad_norm": 97.77526410121799,
"learning_rate": 2.6408465467838225e-05,
"loss": 1.1502,
"step": 331
},
{
"epoch": 0.8601036269430051,
"grad_norm": 54.697215318949276,
"learning_rate": 2.632709657363124e-05,
"loss": 1.1446,
"step": 332
},
{
"epoch": 0.8626943005181347,
"grad_norm": 38.09192002041798,
"learning_rate": 2.6245611137715897e-05,
"loss": 1.1333,
"step": 333
},
{
"epoch": 0.8652849740932642,
"grad_norm": 46.713623556984956,
"learning_rate": 2.6164010661010007e-05,
"loss": 1.1252,
"step": 334
},
{
"epoch": 0.8678756476683938,
"grad_norm": 46.40552686286593,
"learning_rate": 2.6082296646550364e-05,
"loss": 1.121,
"step": 335
},
{
"epoch": 0.8704663212435233,
"grad_norm": 37.57424454065957,
"learning_rate": 2.6000470599465065e-05,
"loss": 1.1671,
"step": 336
},
{
"epoch": 0.8730569948186528,
"grad_norm": 38.580777053099204,
"learning_rate": 2.5918534026945787e-05,
"loss": 1.0849,
"step": 337
},
{
"epoch": 0.8756476683937824,
"grad_norm": 154.3106712010981,
"learning_rate": 2.5836488438220044e-05,
"loss": 1.0663,
"step": 338
},
{
"epoch": 0.8782383419689119,
"grad_norm": 34.21394067951015,
"learning_rate": 2.575433534452334e-05,
"loss": 1.0895,
"step": 339
},
{
"epoch": 0.8808290155440415,
"grad_norm": 36.291611242733886,
"learning_rate": 2.5672076259071385e-05,
"loss": 1.1242,
"step": 340
},
{
"epoch": 0.883419689119171,
"grad_norm": 29.411623389655112,
"learning_rate": 2.558971269703219e-05,
"loss": 1.1005,
"step": 341
},
{
"epoch": 0.8860103626943006,
"grad_norm": 30.24903086761753,
"learning_rate": 2.5507246175498174e-05,
"loss": 1.1134,
"step": 342
},
{
"epoch": 0.8886010362694301,
"grad_norm": 22.032293114161938,
"learning_rate": 2.5424678213458202e-05,
"loss": 1.1121,
"step": 343
},
{
"epoch": 0.8911917098445595,
"grad_norm": 34.997361528376956,
"learning_rate": 2.5342010331769635e-05,
"loss": 1.1341,
"step": 344
},
{
"epoch": 0.8937823834196891,
"grad_norm": 28.212824875732352,
"learning_rate": 2.5259244053130295e-05,
"loss": 1.0748,
"step": 345
},
{
"epoch": 0.8963730569948186,
"grad_norm": 23.870011592985897,
"learning_rate": 2.5176380902050418e-05,
"loss": 1.0643,
"step": 346
},
{
"epoch": 0.8989637305699482,
"grad_norm": 26.10018699309748,
"learning_rate": 2.5093422404824574e-05,
"loss": 1.1662,
"step": 347
},
{
"epoch": 0.9015544041450777,
"grad_norm": 30.191468778559166,
"learning_rate": 2.5010370089503578e-05,
"loss": 1.1023,
"step": 348
},
{
"epoch": 0.9041450777202072,
"grad_norm": 55.799581973427415,
"learning_rate": 2.4927225485866297e-05,
"loss": 1.1538,
"step": 349
},
{
"epoch": 0.9067357512953368,
"grad_norm": 35.7030284720465,
"learning_rate": 2.4843990125391516e-05,
"loss": 1.1,
"step": 350
},
{
"epoch": 0.9093264248704663,
"grad_norm": 28.61763302791738,
"learning_rate": 2.4760665541229712e-05,
"loss": 1.0914,
"step": 351
},
{
"epoch": 0.9119170984455959,
"grad_norm": 33.34233685155311,
"learning_rate": 2.467725326817481e-05,
"loss": 1.0862,
"step": 352
},
{
"epoch": 0.9145077720207254,
"grad_norm": 25.441052078480084,
"learning_rate": 2.4593754842635917e-05,
"loss": 1.1422,
"step": 353
},
{
"epoch": 0.917098445595855,
"grad_norm": 24.217974454985058,
"learning_rate": 2.451017180260902e-05,
"loss": 1.132,
"step": 354
},
{
"epoch": 0.9196891191709845,
"grad_norm": 57.986011465793155,
"learning_rate": 2.4426505687648653e-05,
"loss": 1.2082,
"step": 355
},
{
"epoch": 0.9222797927461139,
"grad_norm": 34.058264716876195,
"learning_rate": 2.4342758038839573e-05,
"loss": 1.1679,
"step": 356
},
{
"epoch": 0.9248704663212435,
"grad_norm": 28.621514922275253,
"learning_rate": 2.4258930398768317e-05,
"loss": 1.1319,
"step": 357
},
{
"epoch": 0.927461139896373,
"grad_norm": 35.33355417283227,
"learning_rate": 2.4175024311494835e-05,
"loss": 1.0705,
"step": 358
},
{
"epoch": 0.9300518134715026,
"grad_norm": 46.579572933583265,
"learning_rate": 2.4091041322524023e-05,
"loss": 1.0842,
"step": 359
},
{
"epoch": 0.9326424870466321,
"grad_norm": 35.494740787672974,
"learning_rate": 2.4006982978777263e-05,
"loss": 1.1072,
"step": 360
},
{
"epoch": 0.9352331606217616,
"grad_norm": 44.56606839509262,
"learning_rate": 2.392285082856394e-05,
"loss": 1.1125,
"step": 361
},
{
"epoch": 0.9378238341968912,
"grad_norm": 46.26363869084929,
"learning_rate": 2.3838646421552917e-05,
"loss": 1.1268,
"step": 362
},
{
"epoch": 0.9404145077720207,
"grad_norm": 89.17676267680146,
"learning_rate": 2.3754371308743975e-05,
"loss": 1.0893,
"step": 363
},
{
"epoch": 0.9430051813471503,
"grad_norm": 34.87700187494181,
"learning_rate": 2.367002704243927e-05,
"loss": 1.1203,
"step": 364
},
{
"epoch": 0.9455958549222798,
"grad_norm": 32.92806939217504,
"learning_rate": 2.3585615176214716e-05,
"loss": 1.1488,
"step": 365
},
{
"epoch": 0.9481865284974094,
"grad_norm": 27.27458755248548,
"learning_rate": 2.3501137264891396e-05,
"loss": 1.0874,
"step": 366
},
{
"epoch": 0.9507772020725389,
"grad_norm": 24.959123789739834,
"learning_rate": 2.3416594864506887e-05,
"loss": 1.1783,
"step": 367
},
{
"epoch": 0.9533678756476683,
"grad_norm": 31.838670988369724,
"learning_rate": 2.333198953228664e-05,
"loss": 1.0759,
"step": 368
},
{
"epoch": 0.9559585492227979,
"grad_norm": 28.112870222863155,
"learning_rate": 2.3247322826615276e-05,
"loss": 1.1481,
"step": 369
},
{
"epoch": 0.9585492227979274,
"grad_norm": 35.08461098450067,
"learning_rate": 2.316259630700787e-05,
"loss": 1.0953,
"step": 370
},
{
"epoch": 0.961139896373057,
"grad_norm": 37.80899503618479,
"learning_rate": 2.307781153408124e-05,
"loss": 1.1224,
"step": 371
},
{
"epoch": 0.9637305699481865,
"grad_norm": 31.644978122007387,
"learning_rate": 2.2992970069525202e-05,
"loss": 1.1608,
"step": 372
},
{
"epoch": 0.966321243523316,
"grad_norm": 23.51029318210938,
"learning_rate": 2.29080734760738e-05,
"loss": 1.0914,
"step": 373
},
{
"epoch": 0.9689119170984456,
"grad_norm": 28.97240481418573,
"learning_rate": 2.2823123317476522e-05,
"loss": 1.1117,
"step": 374
},
{
"epoch": 0.9715025906735751,
"grad_norm": 36.613893678320395,
"learning_rate": 2.273812115846951e-05,
"loss": 1.1118,
"step": 375
},
{
"epoch": 0.9740932642487047,
"grad_norm": 26.402979304578093,
"learning_rate": 2.2653068564746692e-05,
"loss": 1.13,
"step": 376
},
{
"epoch": 0.9766839378238342,
"grad_norm": 114.3000444613392,
"learning_rate": 2.2567967102931025e-05,
"loss": 1.1539,
"step": 377
},
{
"epoch": 0.9792746113989638,
"grad_norm": 26.861359932396834,
"learning_rate": 2.2482818340545534e-05,
"loss": 1.0566,
"step": 378
},
{
"epoch": 0.9818652849740933,
"grad_norm": 32.75509374223994,
"learning_rate": 2.2397623845984548e-05,
"loss": 1.1746,
"step": 379
},
{
"epoch": 0.9844559585492227,
"grad_norm": 34.11964206838379,
"learning_rate": 2.2312385188484718e-05,
"loss": 1.0834,
"step": 380
},
{
"epoch": 0.9870466321243523,
"grad_norm": 38.019564122226434,
"learning_rate": 2.2227103938096176e-05,
"loss": 1.1074,
"step": 381
},
{
"epoch": 0.9896373056994818,
"grad_norm": 39.5073811375391,
"learning_rate": 2.2141781665653584e-05,
"loss": 1.1082,
"step": 382
},
{
"epoch": 0.9922279792746114,
"grad_norm": 298.4258332795163,
"learning_rate": 2.205641994274721e-05,
"loss": 1.125,
"step": 383
},
{
"epoch": 0.9948186528497409,
"grad_norm": 36.444415670935506,
"learning_rate": 2.1971020341693973e-05,
"loss": 1.0935,
"step": 384
},
{
"epoch": 0.9974093264248705,
"grad_norm": 28.96533429210575,
"learning_rate": 2.188558443550849e-05,
"loss": 1.0957,
"step": 385
},
{
"epoch": 1.0,
"grad_norm": 66.41241684127401,
"learning_rate": 2.180011379787411e-05,
"loss": 1.1335,
"step": 386
},
{
"epoch": 1.0025906735751295,
"grad_norm": 28.75549619538953,
"learning_rate": 2.1714610003113887e-05,
"loss": 1.1316,
"step": 387
},
{
"epoch": 1.005181347150259,
"grad_norm": 26.911837500852275,
"learning_rate": 2.1629074626161647e-05,
"loss": 1.1026,
"step": 388
},
{
"epoch": 1.005181347150259,
"eval_loss": 1.0908173322677612,
"eval_runtime": 37.7642,
"eval_samples_per_second": 19.701,
"eval_steps_per_second": 1.245,
"step": 388
},
{
"epoch": 1.0077720207253886,
"grad_norm": 34.28722746775385,
"learning_rate": 2.1543509242532932e-05,
"loss": 1.1104,
"step": 389
},
{
"epoch": 1.0103626943005182,
"grad_norm": 37.97709310694863,
"learning_rate": 2.145791542829597e-05,
"loss": 1.0663,
"step": 390
},
{
"epoch": 1.0129533678756477,
"grad_norm": 39.379668162327384,
"learning_rate": 2.1372294760042686e-05,
"loss": 1.1405,
"step": 391
},
{
"epoch": 1.0155440414507773,
"grad_norm": 27.136201219298698,
"learning_rate": 2.1286648814859636e-05,
"loss": 1.0963,
"step": 392
},
{
"epoch": 1.0181347150259068,
"grad_norm": 39.34261641469313,
"learning_rate": 2.120097917029897e-05,
"loss": 1.1276,
"step": 393
},
{
"epoch": 1.0207253886010363,
"grad_norm": 46.77583801285328,
"learning_rate": 2.1115287404349357e-05,
"loss": 1.1171,
"step": 394
},
{
"epoch": 1.0233160621761659,
"grad_norm": 55.10335066695868,
"learning_rate": 2.1029575095406933e-05,
"loss": 1.0831,
"step": 395
},
{
"epoch": 1.0259067357512954,
"grad_norm": 76.88533851789373,
"learning_rate": 2.0943843822246234e-05,
"loss": 1.0925,
"step": 396
},
{
"epoch": 1.028497409326425,
"grad_norm": 29.604569209708462,
"learning_rate": 2.0858095163991094e-05,
"loss": 1.1259,
"step": 397
},
{
"epoch": 1.0310880829015545,
"grad_norm": 37.71348366628868,
"learning_rate": 2.077233070008557e-05,
"loss": 1.0792,
"step": 398
},
{
"epoch": 1.0336787564766838,
"grad_norm": 26.866133194031644,
"learning_rate": 2.0686552010264872e-05,
"loss": 1.1649,
"step": 399
},
{
"epoch": 1.0362694300518134,
"grad_norm": 35.739274800620635,
"learning_rate": 2.060076067452622e-05,
"loss": 1.0837,
"step": 400
},
{
"epoch": 1.038860103626943,
"grad_norm": 24.479129391259896,
"learning_rate": 2.0514958273099778e-05,
"loss": 1.073,
"step": 401
},
{
"epoch": 1.0414507772020725,
"grad_norm": 50.49963650108008,
"learning_rate": 2.042914638641952e-05,
"loss": 1.0912,
"step": 402
},
{
"epoch": 1.044041450777202,
"grad_norm": 35.6875451072032,
"learning_rate": 2.0343326595094154e-05,
"loss": 1.0936,
"step": 403
},
{
"epoch": 1.0466321243523315,
"grad_norm": 30.212298193414487,
"learning_rate": 2.0257500479877965e-05,
"loss": 1.089,
"step": 404
},
{
"epoch": 1.049222797927461,
"grad_norm": 28.65828720015124,
"learning_rate": 2.0171669621641743e-05,
"loss": 1.1727,
"step": 405
},
{
"epoch": 1.0518134715025906,
"grad_norm": 39.2199058392425,
"learning_rate": 2.0085835601343627e-05,
"loss": 1.1493,
"step": 406
},
{
"epoch": 1.0544041450777202,
"grad_norm": 110.01204177059546,
"learning_rate": 2e-05,
"loss": 1.1245,
"step": 407
},
{
"epoch": 1.0569948186528497,
"grad_norm": 43.427381349600374,
"learning_rate": 1.9914164398656383e-05,
"loss": 1.1183,
"step": 408
},
{
"epoch": 1.0595854922279793,
"grad_norm": 64.78768909817894,
"learning_rate": 1.9828330378358264e-05,
"loss": 1.1528,
"step": 409
},
{
"epoch": 1.0621761658031088,
"grad_norm": 26.50257915912425,
"learning_rate": 1.974249952012204e-05,
"loss": 1.1568,
"step": 410
},
{
"epoch": 1.0647668393782384,
"grad_norm": 27.63159204178893,
"learning_rate": 1.9656673404905852e-05,
"loss": 1.1071,
"step": 411
},
{
"epoch": 1.067357512953368,
"grad_norm": 27.0795355533723,
"learning_rate": 1.957085361358049e-05,
"loss": 1.0809,
"step": 412
},
{
"epoch": 1.0699481865284974,
"grad_norm": 41.84795332660821,
"learning_rate": 1.9485041726900232e-05,
"loss": 1.0744,
"step": 413
},
{
"epoch": 1.072538860103627,
"grad_norm": 143.2109134427192,
"learning_rate": 1.939923932547379e-05,
"loss": 1.0905,
"step": 414
},
{
"epoch": 1.0751295336787565,
"grad_norm": 89.55384065946154,
"learning_rate": 1.931344798973513e-05,
"loss": 1.1012,
"step": 415
},
{
"epoch": 1.077720207253886,
"grad_norm": 31.072074793068015,
"learning_rate": 1.922766929991443e-05,
"loss": 1.1141,
"step": 416
},
{
"epoch": 1.0803108808290156,
"grad_norm": 29.82683189045969,
"learning_rate": 1.914190483600891e-05,
"loss": 1.0842,
"step": 417
},
{
"epoch": 1.0829015544041452,
"grad_norm": 30.09708662586305,
"learning_rate": 1.9056156177753776e-05,
"loss": 1.1088,
"step": 418
},
{
"epoch": 1.0854922279792747,
"grad_norm": 27.637437518920503,
"learning_rate": 1.897042490459307e-05,
"loss": 1.058,
"step": 419
},
{
"epoch": 1.0880829015544042,
"grad_norm": 69.34285700381683,
"learning_rate": 1.8884712595650653e-05,
"loss": 1.0314,
"step": 420
},
{
"epoch": 1.0906735751295338,
"grad_norm": 25.644927284592956,
"learning_rate": 1.8799020829701036e-05,
"loss": 1.0916,
"step": 421
},
{
"epoch": 1.093264248704663,
"grad_norm": 30.3898986852319,
"learning_rate": 1.871335118514037e-05,
"loss": 1.0797,
"step": 422
},
{
"epoch": 1.0958549222797926,
"grad_norm": 22.271334693423444,
"learning_rate": 1.862770523995732e-05,
"loss": 1.1134,
"step": 423
},
{
"epoch": 1.0984455958549222,
"grad_norm": 35.85874616678876,
"learning_rate": 1.854208457170404e-05,
"loss": 1.0927,
"step": 424
},
{
"epoch": 1.1010362694300517,
"grad_norm": 43.06832041948097,
"learning_rate": 1.8456490757467075e-05,
"loss": 1.093,
"step": 425
},
{
"epoch": 1.1036269430051813,
"grad_norm": 37.83777637993467,
"learning_rate": 1.8370925373838356e-05,
"loss": 1.1268,
"step": 426
},
{
"epoch": 1.1062176165803108,
"grad_norm": 23.798059023605177,
"learning_rate": 1.8285389996886113e-05,
"loss": 1.0989,
"step": 427
},
{
"epoch": 1.1088082901554404,
"grad_norm": 25.443104465500795,
"learning_rate": 1.8199886202125897e-05,
"loss": 1.0581,
"step": 428
},
{
"epoch": 1.11139896373057,
"grad_norm": 23.76241444847441,
"learning_rate": 1.8114415564491513e-05,
"loss": 1.0908,
"step": 429
},
{
"epoch": 1.1139896373056994,
"grad_norm": 26.5600693044426,
"learning_rate": 1.8028979658306033e-05,
"loss": 1.1321,
"step": 430
},
{
"epoch": 1.116580310880829,
"grad_norm": 44.854375199828986,
"learning_rate": 1.794358005725279e-05,
"loss": 1.0762,
"step": 431
},
{
"epoch": 1.1191709844559585,
"grad_norm": 28.05797777410846,
"learning_rate": 1.785821833434642e-05,
"loss": 1.0698,
"step": 432
},
{
"epoch": 1.121761658031088,
"grad_norm": 26.488479630212364,
"learning_rate": 1.7772896061903824e-05,
"loss": 1.1223,
"step": 433
},
{
"epoch": 1.1243523316062176,
"grad_norm": 32.77084542157883,
"learning_rate": 1.768761481151529e-05,
"loss": 1.0984,
"step": 434
},
{
"epoch": 1.1269430051813472,
"grad_norm": 39.13198413130026,
"learning_rate": 1.7602376154015456e-05,
"loss": 1.1551,
"step": 435
},
{
"epoch": 1.1295336787564767,
"grad_norm": 23.878966995283953,
"learning_rate": 1.751718165945447e-05,
"loss": 1.1133,
"step": 436
},
{
"epoch": 1.1321243523316062,
"grad_norm": 33.90472985566232,
"learning_rate": 1.743203289706898e-05,
"loss": 1.1219,
"step": 437
},
{
"epoch": 1.1347150259067358,
"grad_norm": 23.340369938533712,
"learning_rate": 1.734693143525331e-05,
"loss": 1.1244,
"step": 438
},
{
"epoch": 1.1373056994818653,
"grad_norm": 105.6885206147852,
"learning_rate": 1.7261878841530494e-05,
"loss": 1.0788,
"step": 439
},
{
"epoch": 1.1398963730569949,
"grad_norm": 28.453526076458317,
"learning_rate": 1.717687668252348e-05,
"loss": 1.1576,
"step": 440
},
{
"epoch": 1.1424870466321244,
"grad_norm": 36.1473991485961,
"learning_rate": 1.7091926523926205e-05,
"loss": 1.0859,
"step": 441
},
{
"epoch": 1.145077720207254,
"grad_norm": 27.043461146902448,
"learning_rate": 1.7007029930474804e-05,
"loss": 1.1072,
"step": 442
},
{
"epoch": 1.1476683937823835,
"grad_norm": 28.066170619981435,
"learning_rate": 1.6922188465918763e-05,
"loss": 1.1279,
"step": 443
},
{
"epoch": 1.150259067357513,
"grad_norm": 38.62445822837212,
"learning_rate": 1.6837403692992136e-05,
"loss": 1.1275,
"step": 444
},
{
"epoch": 1.1528497409326426,
"grad_norm": 28.077258963587767,
"learning_rate": 1.6752677173384734e-05,
"loss": 1.1004,
"step": 445
},
{
"epoch": 1.1554404145077721,
"grad_norm": 42.1405744301338,
"learning_rate": 1.6668010467713363e-05,
"loss": 1.1141,
"step": 446
},
{
"epoch": 1.1580310880829017,
"grad_norm": 26.827291684301034,
"learning_rate": 1.658340513549312e-05,
"loss": 1.1216,
"step": 447
},
{
"epoch": 1.160621761658031,
"grad_norm": 30.863489441619983,
"learning_rate": 1.649886273510861e-05,
"loss": 1.1898,
"step": 448
},
{
"epoch": 1.1632124352331605,
"grad_norm": 27.73579733476068,
"learning_rate": 1.641438482378529e-05,
"loss": 1.0971,
"step": 449
},
{
"epoch": 1.16580310880829,
"grad_norm": 32.84347174567353,
"learning_rate": 1.6329972957560736e-05,
"loss": 1.0579,
"step": 450
},
{
"epoch": 1.1683937823834196,
"grad_norm": 30.06456192962641,
"learning_rate": 1.6245628691256032e-05,
"loss": 1.1057,
"step": 451
},
{
"epoch": 1.1709844559585492,
"grad_norm": 36.554506394377846,
"learning_rate": 1.616135357844709e-05,
"loss": 1.1008,
"step": 452
},
{
"epoch": 1.1735751295336787,
"grad_norm": 27.358643056184114,
"learning_rate": 1.6077149171436063e-05,
"loss": 1.101,
"step": 453
},
{
"epoch": 1.1761658031088082,
"grad_norm": 111.13373813893604,
"learning_rate": 1.599301702122274e-05,
"loss": 1.0688,
"step": 454
},
{
"epoch": 1.1787564766839378,
"grad_norm": 33.94168250727336,
"learning_rate": 1.590895867747599e-05,
"loss": 1.0721,
"step": 455
},
{
"epoch": 1.1813471502590673,
"grad_norm": 53.93978395349692,
"learning_rate": 1.582497568850517e-05,
"loss": 1.0584,
"step": 456
},
{
"epoch": 1.1839378238341969,
"grad_norm": 29.19245794937285,
"learning_rate": 1.574106960123169e-05,
"loss": 1.067,
"step": 457
},
{
"epoch": 1.1865284974093264,
"grad_norm": 28.06897801999048,
"learning_rate": 1.5657241961160434e-05,
"loss": 1.0899,
"step": 458
},
{
"epoch": 1.189119170984456,
"grad_norm": 52.31256652964293,
"learning_rate": 1.557349431235135e-05,
"loss": 1.0925,
"step": 459
},
{
"epoch": 1.1917098445595855,
"grad_norm": 65.39771110845307,
"learning_rate": 1.5489828197390988e-05,
"loss": 1.1448,
"step": 460
},
{
"epoch": 1.194300518134715,
"grad_norm": 27.062780348557254,
"learning_rate": 1.5406245157364093e-05,
"loss": 1.0871,
"step": 461
},
{
"epoch": 1.1968911917098446,
"grad_norm": 41.667025056250424,
"learning_rate": 1.5322746731825195e-05,
"loss": 1.048,
"step": 462
},
{
"epoch": 1.1994818652849741,
"grad_norm": 24.936669803360665,
"learning_rate": 1.5239334458770291e-05,
"loss": 1.1243,
"step": 463
},
{
"epoch": 1.2020725388601037,
"grad_norm": 26.65392149600558,
"learning_rate": 1.5156009874608484e-05,
"loss": 1.0919,
"step": 464
},
{
"epoch": 1.2046632124352332,
"grad_norm": 48.57730651937978,
"learning_rate": 1.5072774514133708e-05,
"loss": 1.1259,
"step": 465
},
{
"epoch": 1.2072538860103628,
"grad_norm": 31.34891257114439,
"learning_rate": 1.4989629910496424e-05,
"loss": 1.0733,
"step": 466
},
{
"epoch": 1.2098445595854923,
"grad_norm": 24.541559850584985,
"learning_rate": 1.4906577595175428e-05,
"loss": 1.1166,
"step": 467
},
{
"epoch": 1.2124352331606219,
"grad_norm": 20.4345832961354,
"learning_rate": 1.4823619097949584e-05,
"loss": 1.0916,
"step": 468
},
{
"epoch": 1.2150259067357512,
"grad_norm": 28.860712194727487,
"learning_rate": 1.4740755946869708e-05,
"loss": 1.1043,
"step": 469
},
{
"epoch": 1.2176165803108807,
"grad_norm": 25.71820242946282,
"learning_rate": 1.4657989668230363e-05,
"loss": 1.0949,
"step": 470
},
{
"epoch": 1.2202072538860103,
"grad_norm": 51.16994773097077,
"learning_rate": 1.4575321786541801e-05,
"loss": 1.141,
"step": 471
},
{
"epoch": 1.2227979274611398,
"grad_norm": 32.70442309640389,
"learning_rate": 1.4492753824501833e-05,
"loss": 1.1127,
"step": 472
},
{
"epoch": 1.2253886010362693,
"grad_norm": 21.913285172411495,
"learning_rate": 1.4410287302967813e-05,
"loss": 1.084,
"step": 473
},
{
"epoch": 1.2279792746113989,
"grad_norm": 34.45727214001296,
"learning_rate": 1.4327923740928613e-05,
"loss": 1.0836,
"step": 474
},
{
"epoch": 1.2305699481865284,
"grad_norm": 26.768013926034776,
"learning_rate": 1.4245664655476663e-05,
"loss": 1.1264,
"step": 475
},
{
"epoch": 1.233160621761658,
"grad_norm": 28.401965255935572,
"learning_rate": 1.4163511561779956e-05,
"loss": 1.0805,
"step": 476
},
{
"epoch": 1.2357512953367875,
"grad_norm": 29.19935757288793,
"learning_rate": 1.4081465973054216e-05,
"loss": 1.0825,
"step": 477
},
{
"epoch": 1.238341968911917,
"grad_norm": 24.55918541541201,
"learning_rate": 1.3999529400534941e-05,
"loss": 1.1164,
"step": 478
},
{
"epoch": 1.2409326424870466,
"grad_norm": 25.35635406268312,
"learning_rate": 1.3917703353449646e-05,
"loss": 1.1334,
"step": 479
},
{
"epoch": 1.2435233160621761,
"grad_norm": 45.453901005004184,
"learning_rate": 1.3835989338989996e-05,
"loss": 1.1387,
"step": 480
},
{
"epoch": 1.2461139896373057,
"grad_norm": 21.67852694202104,
"learning_rate": 1.375438886228411e-05,
"loss": 1.0846,
"step": 481
},
{
"epoch": 1.2487046632124352,
"grad_norm": 171.2474074894732,
"learning_rate": 1.3672903426368773e-05,
"loss": 1.1388,
"step": 482
},
{
"epoch": 1.2512953367875648,
"grad_norm": 43.18223835070906,
"learning_rate": 1.3591534532161781e-05,
"loss": 1.1483,
"step": 483
},
{
"epoch": 1.2538860103626943,
"grad_norm": 29.447332565856644,
"learning_rate": 1.3510283678434317e-05,
"loss": 1.07,
"step": 484
},
{
"epoch": 1.2564766839378239,
"grad_norm": 28.600251051615228,
"learning_rate": 1.3429152361783307e-05,
"loss": 1.0798,
"step": 485
},
{
"epoch": 1.2564766839378239,
"eval_loss": 1.085669755935669,
"eval_runtime": 38.1134,
"eval_samples_per_second": 19.521,
"eval_steps_per_second": 1.233,
"step": 485
},
{
"epoch": 1.2590673575129534,
"grad_norm": 47.124643074410464,
"learning_rate": 1.3348142076603876e-05,
"loss": 1.0875,
"step": 486
},
{
"epoch": 1.261658031088083,
"grad_norm": 42.06019726307143,
"learning_rate": 1.3267254315061797e-05,
"loss": 1.1429,
"step": 487
},
{
"epoch": 1.2642487046632125,
"grad_norm": 18.950734630756962,
"learning_rate": 1.318649056706605e-05,
"loss": 1.0747,
"step": 488
},
{
"epoch": 1.266839378238342,
"grad_norm": 31.903949502516806,
"learning_rate": 1.3105852320241326e-05,
"loss": 1.1041,
"step": 489
},
{
"epoch": 1.2694300518134716,
"grad_norm": 22.957473008085927,
"learning_rate": 1.3025341059900675e-05,
"loss": 1.1046,
"step": 490
},
{
"epoch": 1.2720207253886011,
"grad_norm": 22.325983256563678,
"learning_rate": 1.2944958269018103e-05,
"loss": 1.0643,
"step": 491
},
{
"epoch": 1.2746113989637307,
"grad_norm": 29.689383331974955,
"learning_rate": 1.2864705428201307e-05,
"loss": 1.0949,
"step": 492
},
{
"epoch": 1.2772020725388602,
"grad_norm": 25.338298442945575,
"learning_rate": 1.2784584015664337e-05,
"loss": 1.0725,
"step": 493
},
{
"epoch": 1.2797927461139897,
"grad_norm": 31.591732488078588,
"learning_rate": 1.2704595507200435e-05,
"loss": 1.0347,
"step": 494
},
{
"epoch": 1.2823834196891193,
"grad_norm": 42.96243570696118,
"learning_rate": 1.26247413761548e-05,
"loss": 1.1196,
"step": 495
},
{
"epoch": 1.2849740932642488,
"grad_norm": 26.559546676266024,
"learning_rate": 1.254502309339749e-05,
"loss": 1.0187,
"step": 496
},
{
"epoch": 1.2875647668393784,
"grad_norm": 27.58444017584016,
"learning_rate": 1.2465442127296297e-05,
"loss": 1.0985,
"step": 497
},
{
"epoch": 1.2901554404145077,
"grad_norm": 36.53028730423797,
"learning_rate": 1.2385999943689732e-05,
"loss": 1.068,
"step": 498
},
{
"epoch": 1.2927461139896372,
"grad_norm": 38.94837307599113,
"learning_rate": 1.2306698005859975e-05,
"loss": 1.0736,
"step": 499
},
{
"epoch": 1.2953367875647668,
"grad_norm": 36.67208266195125,
"learning_rate": 1.2227537774505996e-05,
"loss": 1.119,
"step": 500
},
{
"epoch": 1.2979274611398963,
"grad_norm": 31.086410648635283,
"learning_rate": 1.2148520707716567e-05,
"loss": 1.1094,
"step": 501
},
{
"epoch": 1.3005181347150259,
"grad_norm": 27.96977481605826,
"learning_rate": 1.2069648260943473e-05,
"loss": 1.1345,
"step": 502
},
{
"epoch": 1.3031088082901554,
"grad_norm": 22.89450502840197,
"learning_rate": 1.1990921886974669e-05,
"loss": 1.12,
"step": 503
},
{
"epoch": 1.305699481865285,
"grad_norm": 18.54206032224653,
"learning_rate": 1.1912343035907535e-05,
"loss": 1.0929,
"step": 504
},
{
"epoch": 1.3082901554404145,
"grad_norm": 38.9386007237313,
"learning_rate": 1.1833913155122132e-05,
"loss": 1.1381,
"step": 505
},
{
"epoch": 1.310880829015544,
"grad_norm": 37.05899458809635,
"learning_rate": 1.1755633689254609e-05,
"loss": 1.0535,
"step": 506
},
{
"epoch": 1.3134715025906736,
"grad_norm": 27.716372794195156,
"learning_rate": 1.1677506080170512e-05,
"loss": 1.1342,
"step": 507
},
{
"epoch": 1.3160621761658031,
"grad_norm": 40.42306246079416,
"learning_rate": 1.1599531766938306e-05,
"loss": 1.0887,
"step": 508
},
{
"epoch": 1.3186528497409327,
"grad_norm": 98.56681767405578,
"learning_rate": 1.1521712185802789e-05,
"loss": 1.0954,
"step": 509
},
{
"epoch": 1.3212435233160622,
"grad_norm": 34.42816933350743,
"learning_rate": 1.1444048770158718e-05,
"loss": 1.0512,
"step": 510
},
{
"epoch": 1.3238341968911918,
"grad_norm": 52.457523653614096,
"learning_rate": 1.136654295052433e-05,
"loss": 1.1599,
"step": 511
},
{
"epoch": 1.3264248704663213,
"grad_norm": 26.832339531661276,
"learning_rate": 1.1289196154515048e-05,
"loss": 1.0602,
"step": 512
},
{
"epoch": 1.3290155440414508,
"grad_norm": 32.746047673769816,
"learning_rate": 1.1212009806817163e-05,
"loss": 1.1544,
"step": 513
},
{
"epoch": 1.3316062176165804,
"grad_norm": 37.44483451702055,
"learning_rate": 1.1134985329161608e-05,
"loss": 1.1421,
"step": 514
},
{
"epoch": 1.33419689119171,
"grad_norm": 28.625976525737606,
"learning_rate": 1.1058124140297718e-05,
"loss": 1.0858,
"step": 515
},
{
"epoch": 1.3367875647668392,
"grad_norm": 38.64141195246213,
"learning_rate": 1.0981427655967183e-05,
"loss": 1.0983,
"step": 516
},
{
"epoch": 1.3393782383419688,
"grad_norm": 29.989753893533425,
"learning_rate": 1.0904897288877891e-05,
"loss": 1.1269,
"step": 517
},
{
"epoch": 1.3419689119170983,
"grad_norm": 48.63990665515511,
"learning_rate": 1.0828534448677942e-05,
"loss": 1.0844,
"step": 518
},
{
"epoch": 1.3445595854922279,
"grad_norm": 25.477227318250847,
"learning_rate": 1.0752340541929711e-05,
"loss": 1.0742,
"step": 519
},
{
"epoch": 1.3471502590673574,
"grad_norm": 26.363588814537763,
"learning_rate": 1.0676316972083867e-05,
"loss": 1.0533,
"step": 520
},
{
"epoch": 1.349740932642487,
"grad_norm": 34.59968737708606,
"learning_rate": 1.060046513945361e-05,
"loss": 1.0983,
"step": 521
},
{
"epoch": 1.3523316062176165,
"grad_norm": 52.51652561846762,
"learning_rate": 1.0524786441188786e-05,
"loss": 1.1319,
"step": 522
},
{
"epoch": 1.354922279792746,
"grad_norm": 21.360221214301127,
"learning_rate": 1.0449282271250239e-05,
"loss": 1.0627,
"step": 523
},
{
"epoch": 1.3575129533678756,
"grad_norm": 37.00053933682603,
"learning_rate": 1.0373954020384073e-05,
"loss": 1.096,
"step": 524
},
{
"epoch": 1.3601036269430051,
"grad_norm": 39.212240822687484,
"learning_rate": 1.029880307609608e-05,
"loss": 1.0512,
"step": 525
},
{
"epoch": 1.3626943005181347,
"grad_norm": 24.89842378385804,
"learning_rate": 1.0223830822626124e-05,
"loss": 1.0538,
"step": 526
},
{
"epoch": 1.3652849740932642,
"grad_norm": 29.14416894424653,
"learning_rate": 1.0149038640922715e-05,
"loss": 1.1538,
"step": 527
},
{
"epoch": 1.3678756476683938,
"grad_norm": 31.688722122648855,
"learning_rate": 1.0074427908617515e-05,
"loss": 1.171,
"step": 528
},
{
"epoch": 1.3704663212435233,
"grad_norm": 41.918909004413734,
"learning_rate": 1.0000000000000006e-05,
"loss": 1.1203,
"step": 529
},
{
"epoch": 1.3730569948186528,
"grad_norm": 26.70963454516576,
"learning_rate": 9.92575628599213e-06,
"loss": 1.0855,
"step": 530
},
{
"epoch": 1.3756476683937824,
"grad_norm": 24.819351173466824,
"learning_rate": 9.851698134123095e-06,
"loss": 1.0972,
"step": 531
},
{
"epoch": 1.378238341968912,
"grad_norm": 22.100465399566815,
"learning_rate": 9.777826908504126e-06,
"loss": 1.08,
"step": 532
},
{
"epoch": 1.3808290155440415,
"grad_norm": 29.31574709406259,
"learning_rate": 9.704143969803392e-06,
"loss": 1.0835,
"step": 533
},
{
"epoch": 1.383419689119171,
"grad_norm": 25.551326748473052,
"learning_rate": 9.630650675220892e-06,
"loss": 1.0396,
"step": 534
},
{
"epoch": 1.3860103626943006,
"grad_norm": 59.07595627892596,
"learning_rate": 9.557348378463503e-06,
"loss": 1.0814,
"step": 535
},
{
"epoch": 1.38860103626943,
"grad_norm": 24.96501978981908,
"learning_rate": 9.484238429720018e-06,
"loss": 1.0187,
"step": 536
},
{
"epoch": 1.3911917098445596,
"grad_norm": 42.530604702279234,
"learning_rate": 9.411322175636298e-06,
"loss": 1.074,
"step": 537
},
{
"epoch": 1.3937823834196892,
"grad_norm": 34.91129065632851,
"learning_rate": 9.338600959290414e-06,
"loss": 1.0878,
"step": 538
},
{
"epoch": 1.3963730569948187,
"grad_norm": 32.07525956876426,
"learning_rate": 9.266076120167992e-06,
"loss": 1.0962,
"step": 539
},
{
"epoch": 1.3989637305699483,
"grad_norm": 40.18387743296675,
"learning_rate": 9.193748994137462e-06,
"loss": 1.1033,
"step": 540
},
{
"epoch": 1.4015544041450778,
"grad_norm": 66.68031460980451,
"learning_rate": 9.121620913425508e-06,
"loss": 1.1466,
"step": 541
},
{
"epoch": 1.4041450777202074,
"grad_norm": 34.07506059584738,
"learning_rate": 9.04969320659249e-06,
"loss": 1.1184,
"step": 542
},
{
"epoch": 1.406735751295337,
"grad_norm": 17.130845779169075,
"learning_rate": 8.977967198508001e-06,
"loss": 1.0803,
"step": 543
},
{
"epoch": 1.4093264248704664,
"grad_norm": 22.4457025132615,
"learning_rate": 8.906444210326441e-06,
"loss": 1.0745,
"step": 544
},
{
"epoch": 1.411917098445596,
"grad_norm": 73.43971735356851,
"learning_rate": 8.83512555946271e-06,
"loss": 1.0717,
"step": 545
},
{
"epoch": 1.4145077720207253,
"grad_norm": 38.16321297719761,
"learning_rate": 8.764012559567899e-06,
"loss": 1.1371,
"step": 546
},
{
"epoch": 1.4170984455958548,
"grad_norm": 56.14718024907725,
"learning_rate": 8.693106520505147e-06,
"loss": 1.0185,
"step": 547
},
{
"epoch": 1.4196891191709844,
"grad_norm": 53.3812598790062,
"learning_rate": 8.622408748325461e-06,
"loss": 1.0859,
"step": 548
},
{
"epoch": 1.422279792746114,
"grad_norm": 39.69041631433326,
"learning_rate": 8.551920545243704e-06,
"loss": 1.1146,
"step": 549
},
{
"epoch": 1.4248704663212435,
"grad_norm": 24.099260758984773,
"learning_rate": 8.481643209614576e-06,
"loss": 1.0968,
"step": 550
},
{
"epoch": 1.427461139896373,
"grad_norm": 22.623850373369237,
"learning_rate": 8.411578035908728e-06,
"loss": 1.0642,
"step": 551
},
{
"epoch": 1.4300518134715026,
"grad_norm": 25.343746374404027,
"learning_rate": 8.341726314688875e-06,
"loss": 1.0815,
"step": 552
},
{
"epoch": 1.432642487046632,
"grad_norm": 35.82641011588973,
"learning_rate": 8.272089332586089e-06,
"loss": 1.1012,
"step": 553
},
{
"epoch": 1.4352331606217616,
"grad_norm": 24.81161215784662,
"learning_rate": 8.20266837227603e-06,
"loss": 1.1086,
"step": 554
},
{
"epoch": 1.4378238341968912,
"grad_norm": 54.18243481591251,
"learning_rate": 8.133464712455364e-06,
"loss": 1.0704,
"step": 555
},
{
"epoch": 1.4404145077720207,
"grad_norm": 23.602598217141395,
"learning_rate": 8.064479627818213e-06,
"loss": 1.1519,
"step": 556
},
{
"epoch": 1.4430051813471503,
"grad_norm": 31.124404868409982,
"learning_rate": 7.995714389032638e-06,
"loss": 1.0705,
"step": 557
},
{
"epoch": 1.4455958549222798,
"grad_norm": 24.14171016995626,
"learning_rate": 7.927170262717284e-06,
"loss": 1.1083,
"step": 558
},
{
"epoch": 1.4481865284974094,
"grad_norm": 47.987203109917175,
"learning_rate": 7.858848511417998e-06,
"loss": 1.0836,
"step": 559
},
{
"epoch": 1.450777202072539,
"grad_norm": 25.871447098066056,
"learning_rate": 7.790750393584616e-06,
"loss": 1.0787,
"step": 560
},
{
"epoch": 1.4533678756476685,
"grad_norm": 23.820249113937482,
"learning_rate": 7.72287716354776e-06,
"loss": 1.1165,
"step": 561
},
{
"epoch": 1.455958549222798,
"grad_norm": 48.04131308947624,
"learning_rate": 7.65523007149575e-06,
"loss": 1.0819,
"step": 562
},
{
"epoch": 1.4585492227979275,
"grad_norm": 29.273494083692352,
"learning_rate": 7.587810363451544e-06,
"loss": 1.0302,
"step": 563
},
{
"epoch": 1.4611398963730569,
"grad_norm": 120.01571222366722,
"learning_rate": 7.5206192812498345e-06,
"loss": 1.1291,
"step": 564
},
{
"epoch": 1.4637305699481864,
"grad_norm": 33.16947662083338,
"learning_rate": 7.4536580625141244e-06,
"loss": 1.0842,
"step": 565
},
{
"epoch": 1.466321243523316,
"grad_norm": 29.979556378166713,
"learning_rate": 7.386927940633981e-06,
"loss": 1.1116,
"step": 566
},
{
"epoch": 1.4689119170984455,
"grad_norm": 27.172344859281896,
"learning_rate": 7.32043014474227e-06,
"loss": 1.0676,
"step": 567
},
{
"epoch": 1.471502590673575,
"grad_norm": 30.208548637757318,
"learning_rate": 7.254165899692554e-06,
"loss": 1.1104,
"step": 568
},
{
"epoch": 1.4740932642487046,
"grad_norm": 19.385421184583773,
"learning_rate": 7.188136426036498e-06,
"loss": 1.0085,
"step": 569
},
{
"epoch": 1.4766839378238341,
"grad_norm": 30.350787749309685,
"learning_rate": 7.12234294000143e-06,
"loss": 1.0584,
"step": 570
},
{
"epoch": 1.4792746113989637,
"grad_norm": 31.520305600900198,
"learning_rate": 7.056786653467882e-06,
"loss": 1.0831,
"step": 571
},
{
"epoch": 1.4818652849740932,
"grad_norm": 46.13006972574487,
"learning_rate": 6.991468773947321e-06,
"loss": 1.1761,
"step": 572
},
{
"epoch": 1.4844559585492227,
"grad_norm": 26.72340868362835,
"learning_rate": 6.926390504559879e-06,
"loss": 1.0605,
"step": 573
},
{
"epoch": 1.4870466321243523,
"grad_norm": 25.992965411102556,
"learning_rate": 6.861553044012206e-06,
"loss": 1.1015,
"step": 574
},
{
"epoch": 1.4896373056994818,
"grad_norm": 38.60187420279626,
"learning_rate": 6.796957586575364e-06,
"loss": 1.1232,
"step": 575
},
{
"epoch": 1.4922279792746114,
"grad_norm": 21.7618591565717,
"learning_rate": 6.732605322062869e-06,
"loss": 1.1196,
"step": 576
},
{
"epoch": 1.494818652849741,
"grad_norm": 28.233093007170996,
"learning_rate": 6.668497435808736e-06,
"loss": 1.1451,
"step": 577
},
{
"epoch": 1.4974093264248705,
"grad_norm": 28.061514297823816,
"learning_rate": 6.604635108645683e-06,
"loss": 1.0832,
"step": 578
},
{
"epoch": 1.5,
"grad_norm": 35.34503147975386,
"learning_rate": 6.5410195168833425e-06,
"loss": 1.118,
"step": 579
},
{
"epoch": 1.5025906735751295,
"grad_norm": 31.940516004139344,
"learning_rate": 6.477651832286633e-06,
"loss": 1.1052,
"step": 580
},
{
"epoch": 1.505181347150259,
"grad_norm": 25.647504733675635,
"learning_rate": 6.414533222054138e-06,
"loss": 1.1055,
"step": 581
},
{
"epoch": 1.5077720207253886,
"grad_norm": 68.16422579698298,
"learning_rate": 6.3516648487966456e-06,
"loss": 1.0784,
"step": 582
},
{
"epoch": 1.5077720207253886,
"eval_loss": 1.0824710130691528,
"eval_runtime": 37.4923,
"eval_samples_per_second": 19.844,
"eval_steps_per_second": 1.254,
"step": 582
},
{
"epoch": 1.5103626943005182,
"grad_norm": 46.95363643283118,
"learning_rate": 6.289047870515692e-06,
"loss": 1.1271,
"step": 583
},
{
"epoch": 1.5129533678756477,
"grad_norm": 37.80701104174098,
"learning_rate": 6.226683440582268e-06,
"loss": 1.126,
"step": 584
},
{
"epoch": 1.5155440414507773,
"grad_norm": 32.03225059321182,
"learning_rate": 6.164572707715564e-06,
"loss": 1.0152,
"step": 585
},
{
"epoch": 1.5181347150259068,
"grad_norm": 31.21438627768379,
"learning_rate": 6.102716815961787e-06,
"loss": 1.1595,
"step": 586
},
{
"epoch": 1.5207253886010363,
"grad_norm": 23.55515793723355,
"learning_rate": 6.041116904673125e-06,
"loss": 1.0943,
"step": 587
},
{
"epoch": 1.5233160621761659,
"grad_norm": 26.92022994571063,
"learning_rate": 5.979774108486751e-06,
"loss": 1.0554,
"step": 588
},
{
"epoch": 1.5259067357512954,
"grad_norm": 24.957086694295352,
"learning_rate": 5.918689557303885e-06,
"loss": 1.0711,
"step": 589
},
{
"epoch": 1.528497409326425,
"grad_norm": 87.48440577770464,
"learning_rate": 5.857864376269051e-06,
"loss": 1.1679,
"step": 590
},
{
"epoch": 1.5310880829015545,
"grad_norm": 21.756969247026838,
"learning_rate": 5.7972996857492896e-06,
"loss": 1.0716,
"step": 591
},
{
"epoch": 1.533678756476684,
"grad_norm": 33.92695136944769,
"learning_rate": 5.736996601313545e-06,
"loss": 1.0376,
"step": 592
},
{
"epoch": 1.5362694300518136,
"grad_norm": 32.738888590276794,
"learning_rate": 5.676956233712139e-06,
"loss": 1.0245,
"step": 593
},
{
"epoch": 1.5388601036269431,
"grad_norm": 22.38597679049821,
"learning_rate": 5.617179688856271e-06,
"loss": 1.1103,
"step": 594
},
{
"epoch": 1.5414507772020727,
"grad_norm": 30.168619654124416,
"learning_rate": 5.557668067797677e-06,
"loss": 1.2007,
"step": 595
},
{
"epoch": 1.5440414507772022,
"grad_norm": 24.460334668593116,
"learning_rate": 5.498422466708349e-06,
"loss": 1.0842,
"step": 596
},
{
"epoch": 1.5466321243523318,
"grad_norm": 25.877463433966412,
"learning_rate": 5.439443976860306e-06,
"loss": 1.0537,
"step": 597
},
{
"epoch": 1.549222797927461,
"grad_norm": 27.67111694532404,
"learning_rate": 5.38073368460555e-06,
"loss": 1.0863,
"step": 598
},
{
"epoch": 1.5518134715025906,
"grad_norm": 43.112045139256026,
"learning_rate": 5.32229267135602e-06,
"loss": 1.1168,
"step": 599
},
{
"epoch": 1.5544041450777202,
"grad_norm": 31.60344278763487,
"learning_rate": 5.2641220135636685e-06,
"loss": 1.0939,
"step": 600
},
{
"epoch": 1.5569948186528497,
"grad_norm": 37.795536334167195,
"learning_rate": 5.206222782700667e-06,
"loss": 1.1084,
"step": 601
},
{
"epoch": 1.5595854922279793,
"grad_norm": 27.529824319458413,
"learning_rate": 5.1485960452396266e-06,
"loss": 1.0755,
"step": 602
},
{
"epoch": 1.5621761658031088,
"grad_norm": 29.172376961452496,
"learning_rate": 5.091242862634e-06,
"loss": 1.0231,
"step": 603
},
{
"epoch": 1.5647668393782384,
"grad_norm": 24.94560254083931,
"learning_rate": 5.0341642912984844e-06,
"loss": 1.0782,
"step": 604
},
{
"epoch": 1.567357512953368,
"grad_norm": 31.79546143794924,
"learning_rate": 4.977361382589607e-06,
"loss": 1.1202,
"step": 605
},
{
"epoch": 1.5699481865284974,
"grad_norm": 39.3795372477718,
"learning_rate": 4.920835182786316e-06,
"loss": 1.0349,
"step": 606
},
{
"epoch": 1.572538860103627,
"grad_norm": 31.308429467189708,
"learning_rate": 4.864586733070755e-06,
"loss": 1.0582,
"step": 607
},
{
"epoch": 1.5751295336787565,
"grad_norm": 32.82748366949945,
"learning_rate": 4.808617069509034e-06,
"loss": 1.1246,
"step": 608
},
{
"epoch": 1.577720207253886,
"grad_norm": 24.281936328515055,
"learning_rate": 4.752927223032196e-06,
"loss": 1.0679,
"step": 609
},
{
"epoch": 1.5803108808290154,
"grad_norm": 111.23884469313498,
"learning_rate": 4.697518219417188e-06,
"loss": 1.1319,
"step": 610
},
{
"epoch": 1.582901554404145,
"grad_norm": 35.484299416160596,
"learning_rate": 4.6423910792680005e-06,
"loss": 1.1348,
"step": 611
},
{
"epoch": 1.5854922279792745,
"grad_norm": 27.135342529418295,
"learning_rate": 4.587546817996826e-06,
"loss": 1.0948,
"step": 612
},
{
"epoch": 1.588082901554404,
"grad_norm": 81.98158494527004,
"learning_rate": 4.532986445805405e-06,
"loss": 1.0864,
"step": 613
},
{
"epoch": 1.5906735751295336,
"grad_norm": 61.490418707157346,
"learning_rate": 4.478710967666371e-06,
"loss": 1.0693,
"step": 614
},
{
"epoch": 1.593264248704663,
"grad_norm": 25.633018846282962,
"learning_rate": 4.424721383304791e-06,
"loss": 1.1084,
"step": 615
},
{
"epoch": 1.5958549222797926,
"grad_norm": 28.194280804517373,
"learning_rate": 4.371018687179689e-06,
"loss": 1.1722,
"step": 616
},
{
"epoch": 1.5984455958549222,
"grad_norm": 27.8080566828581,
"learning_rate": 4.317603868465794e-06,
"loss": 1.1171,
"step": 617
},
{
"epoch": 1.6010362694300517,
"grad_norm": 42.959036729178806,
"learning_rate": 4.264477911035265e-06,
"loss": 1.074,
"step": 618
},
{
"epoch": 1.6036269430051813,
"grad_norm": 23.937218136554392,
"learning_rate": 4.211641793439609e-06,
"loss": 1.13,
"step": 619
},
{
"epoch": 1.6062176165803108,
"grad_norm": 43.913677975121566,
"learning_rate": 4.159096488891623e-06,
"loss": 1.1671,
"step": 620
},
{
"epoch": 1.6088082901554404,
"grad_norm": 48.107566289352114,
"learning_rate": 4.106842965247497e-06,
"loss": 1.1071,
"step": 621
},
{
"epoch": 1.61139896373057,
"grad_norm": 28.25790913819402,
"learning_rate": 4.054882184988971e-06,
"loss": 1.0716,
"step": 622
},
{
"epoch": 1.6139896373056994,
"grad_norm": 26.59960827233381,
"learning_rate": 4.003215105205613e-06,
"loss": 1.146,
"step": 623
},
{
"epoch": 1.616580310880829,
"grad_norm": 22.79614250574067,
"learning_rate": 3.951842677577171e-06,
"loss": 1.0761,
"step": 624
},
{
"epoch": 1.6191709844559585,
"grad_norm": 24.24036779343114,
"learning_rate": 3.900765848356083e-06,
"loss": 1.1037,
"step": 625
},
{
"epoch": 1.621761658031088,
"grad_norm": 27.295669679621373,
"learning_rate": 3.849985558349998e-06,
"loss": 1.1015,
"step": 626
},
{
"epoch": 1.6243523316062176,
"grad_norm": 54.413225233914176,
"learning_rate": 3.799502742904497e-06,
"loss": 1.0318,
"step": 627
},
{
"epoch": 1.6269430051813472,
"grad_norm": 38.84848713400369,
"learning_rate": 3.749318331885825e-06,
"loss": 1.1147,
"step": 628
},
{
"epoch": 1.6295336787564767,
"grad_norm": 23.912199342429506,
"learning_rate": 3.699433249663775e-06,
"loss": 1.1439,
"step": 629
},
{
"epoch": 1.6321243523316062,
"grad_norm": 48.95526983090661,
"learning_rate": 3.649848415094681e-06,
"loss": 1.0229,
"step": 630
},
{
"epoch": 1.6347150259067358,
"grad_norm": 32.099897123524585,
"learning_rate": 3.60056474150446e-06,
"loss": 1.0589,
"step": 631
},
{
"epoch": 1.6373056994818653,
"grad_norm": 31.802660850585973,
"learning_rate": 3.551583136671817e-06,
"loss": 1.1137,
"step": 632
},
{
"epoch": 1.6398963730569949,
"grad_norm": 34.2655686599537,
"learning_rate": 3.5029045028115105e-06,
"loss": 1.1318,
"step": 633
},
{
"epoch": 1.6424870466321244,
"grad_norm": 191.48847051006786,
"learning_rate": 3.4545297365577437e-06,
"loss": 1.0921,
"step": 634
},
{
"epoch": 1.645077720207254,
"grad_norm": 24.236450154622357,
"learning_rate": 3.406459728947622e-06,
"loss": 1.0851,
"step": 635
},
{
"epoch": 1.6476683937823835,
"grad_norm": 38.819342476228876,
"learning_rate": 3.358695365404785e-06,
"loss": 1.0962,
"step": 636
},
{
"epoch": 1.650259067357513,
"grad_norm": 31.53545103406636,
"learning_rate": 3.3112375257230547e-06,
"loss": 1.0994,
"step": 637
},
{
"epoch": 1.6528497409326426,
"grad_norm": 71.55299438562814,
"learning_rate": 3.2640870840502646e-06,
"loss": 1.08,
"step": 638
},
{
"epoch": 1.6554404145077721,
"grad_norm": 57.94234006640972,
"learning_rate": 3.2172449088721235e-06,
"loss": 1.0921,
"step": 639
},
{
"epoch": 1.6580310880829017,
"grad_norm": 58.15229256885828,
"learning_rate": 3.1707118629962607e-06,
"loss": 1.0981,
"step": 640
},
{
"epoch": 1.6606217616580312,
"grad_norm": 25.105795165561457,
"learning_rate": 3.1244888035362875e-06,
"loss": 1.101,
"step": 641
},
{
"epoch": 1.6632124352331608,
"grad_norm": 33.15366058006866,
"learning_rate": 3.0785765818960534e-06,
"loss": 1.0517,
"step": 642
},
{
"epoch": 1.6658031088082903,
"grad_norm": 35.79893709161297,
"learning_rate": 3.0329760437539233e-06,
"loss": 1.0886,
"step": 643
},
{
"epoch": 1.6683937823834198,
"grad_norm": 49.59918009099835,
"learning_rate": 2.9876880290472376e-06,
"loss": 1.0756,
"step": 644
},
{
"epoch": 1.6709844559585494,
"grad_norm": 21.485142494367135,
"learning_rate": 2.942713371956809e-06,
"loss": 1.1017,
"step": 645
},
{
"epoch": 1.6735751295336787,
"grad_norm": 29.23169287520316,
"learning_rate": 2.8980529008915793e-06,
"loss": 1.1241,
"step": 646
},
{
"epoch": 1.6761658031088082,
"grad_norm": 27.913868608886553,
"learning_rate": 2.853707438473352e-06,
"loss": 1.0841,
"step": 647
},
{
"epoch": 1.6787564766839378,
"grad_norm": 18.438597602055644,
"learning_rate": 2.8096778015216484e-06,
"loss": 1.0891,
"step": 648
},
{
"epoch": 1.6813471502590673,
"grad_norm": 54.0556941620233,
"learning_rate": 2.7659648010386365e-06,
"loss": 1.0589,
"step": 649
},
{
"epoch": 1.6839378238341969,
"grad_norm": 108.10101848740734,
"learning_rate": 2.7225692421942306e-06,
"loss": 1.0766,
"step": 650
},
{
"epoch": 1.6865284974093264,
"grad_norm": 106.58835736628185,
"learning_rate": 2.679491924311226e-06,
"loss": 1.1144,
"step": 651
},
{
"epoch": 1.689119170984456,
"grad_norm": 31.53371570516213,
"learning_rate": 2.6367336408506063e-06,
"loss": 1.02,
"step": 652
},
{
"epoch": 1.6917098445595855,
"grad_norm": 36.263088086669775,
"learning_rate": 2.594295179396895e-06,
"loss": 1.0679,
"step": 653
},
{
"epoch": 1.694300518134715,
"grad_norm": 24.47507184337666,
"learning_rate": 2.5521773216436875e-06,
"loss": 1.1092,
"step": 654
},
{
"epoch": 1.6968911917098446,
"grad_norm": 33.05899532106974,
"learning_rate": 2.5103808433792075e-06,
"loss": 1.053,
"step": 655
},
{
"epoch": 1.6994818652849741,
"grad_norm": 29.132344102799873,
"learning_rate": 2.468906514472065e-06,
"loss": 1.0518,
"step": 656
},
{
"epoch": 1.7020725388601037,
"grad_norm": 43.48960854254409,
"learning_rate": 2.4277550988570362e-06,
"loss": 1.0537,
"step": 657
},
{
"epoch": 1.704663212435233,
"grad_norm": 28.13627467897817,
"learning_rate": 2.3869273545210158e-06,
"loss": 1.0558,
"step": 658
},
{
"epoch": 1.7072538860103625,
"grad_norm": 33.18164212520423,
"learning_rate": 2.3464240334890496e-06,
"loss": 1.054,
"step": 659
},
{
"epoch": 1.709844559585492,
"grad_norm": 41.884394437273144,
"learning_rate": 2.3062458818104804e-06,
"loss": 1.0871,
"step": 660
},
{
"epoch": 1.7124352331606216,
"grad_norm": 27.119840736470916,
"learning_rate": 2.266393639545197e-06,
"loss": 1.0743,
"step": 661
},
{
"epoch": 1.7150259067357512,
"grad_norm": 20.70474999023591,
"learning_rate": 2.22686804075003e-06,
"loss": 1.0718,
"step": 662
},
{
"epoch": 1.7176165803108807,
"grad_norm": 21.469651089617198,
"learning_rate": 2.187669813465192e-06,
"loss": 1.0584,
"step": 663
},
{
"epoch": 1.7202072538860103,
"grad_norm": 29.901704269591495,
"learning_rate": 2.1487996797009103e-06,
"loss": 1.1175,
"step": 664
},
{
"epoch": 1.7227979274611398,
"grad_norm": 75.06310533674302,
"learning_rate": 2.110258355424093e-06,
"loss": 1.124,
"step": 665
},
{
"epoch": 1.7253886010362693,
"grad_norm": 34.13349153293387,
"learning_rate": 2.0720465505451524e-06,
"loss": 1.1395,
"step": 666
},
{
"epoch": 1.7279792746113989,
"grad_norm": 26.83922350447555,
"learning_rate": 2.0341649689049458e-06,
"loss": 1.0449,
"step": 667
},
{
"epoch": 1.7305699481865284,
"grad_norm": 37.284339589086024,
"learning_rate": 1.9966143082617797e-06,
"loss": 1.0332,
"step": 668
},
{
"epoch": 1.733160621761658,
"grad_norm": 46.453238969399074,
"learning_rate": 1.959395260278587e-06,
"loss": 1.1303,
"step": 669
},
{
"epoch": 1.7357512953367875,
"grad_norm": 22.743791018223284,
"learning_rate": 1.922508510510166e-06,
"loss": 1.0993,
"step": 670
},
{
"epoch": 1.738341968911917,
"grad_norm": 27.788137087891727,
"learning_rate": 1.885954738390572e-06,
"loss": 1.1234,
"step": 671
},
{
"epoch": 1.7409326424870466,
"grad_norm": 34.03637743502625,
"learning_rate": 1.8497346172205733e-06,
"loss": 1.085,
"step": 672
},
{
"epoch": 1.7435233160621761,
"grad_norm": 30.308363072599853,
"learning_rate": 1.8138488141552856e-06,
"loss": 1.0348,
"step": 673
},
{
"epoch": 1.7461139896373057,
"grad_norm": 26.81612464278571,
"learning_rate": 1.7782979901918507e-06,
"loss": 1.0672,
"step": 674
},
{
"epoch": 1.7487046632124352,
"grad_norm": 46.96340147563577,
"learning_rate": 1.7430828001572897e-06,
"loss": 1.0807,
"step": 675
},
{
"epoch": 1.7512953367875648,
"grad_norm": 30.87064631308438,
"learning_rate": 1.7082038926964162e-06,
"loss": 1.1411,
"step": 676
},
{
"epoch": 1.7538860103626943,
"grad_norm": 79.59411718865987,
"learning_rate": 1.6736619102599073e-06,
"loss": 1.0234,
"step": 677
},
{
"epoch": 1.7564766839378239,
"grad_norm": 30.875792565440594,
"learning_rate": 1.6394574890924574e-06,
"loss": 1.1506,
"step": 678
},
{
"epoch": 1.7590673575129534,
"grad_norm": 34.227935587917464,
"learning_rate": 1.605591259221071e-06,
"loss": 1.0981,
"step": 679
},
{
"epoch": 1.7590673575129534,
"eval_loss": 1.0809757709503174,
"eval_runtime": 37.9729,
"eval_samples_per_second": 19.593,
"eval_steps_per_second": 1.238,
"step": 679
},
{
"epoch": 1.761658031088083,
"grad_norm": 31.849171622198522,
"learning_rate": 1.572063844443441e-06,
"loss": 1.1227,
"step": 680
},
{
"epoch": 1.7642487046632125,
"grad_norm": 32.75765881856165,
"learning_rate": 1.5388758623164802e-06,
"loss": 1.0842,
"step": 681
},
{
"epoch": 1.766839378238342,
"grad_norm": 27.83779558188967,
"learning_rate": 1.5060279241449304e-06,
"loss": 1.0419,
"step": 682
},
{
"epoch": 1.7694300518134716,
"grad_norm": 30.646833576522408,
"learning_rate": 1.4735206349701003e-06,
"loss": 1.0983,
"step": 683
},
{
"epoch": 1.7720207253886011,
"grad_norm": 29.748071428344947,
"learning_rate": 1.4413545935587415e-06,
"loss": 1.1276,
"step": 684
},
{
"epoch": 1.7746113989637307,
"grad_norm": 32.57104117085742,
"learning_rate": 1.4095303923919956e-06,
"loss": 1.0728,
"step": 685
},
{
"epoch": 1.7772020725388602,
"grad_norm": 32.02209671450587,
"learning_rate": 1.3780486176544905e-06,
"loss": 1.1148,
"step": 686
},
{
"epoch": 1.7797927461139897,
"grad_norm": 31.902388050458736,
"learning_rate": 1.3469098492235521e-06,
"loss": 1.0873,
"step": 687
},
{
"epoch": 1.7823834196891193,
"grad_norm": 33.159581668201604,
"learning_rate": 1.316114660658505e-06,
"loss": 1.0308,
"step": 688
},
{
"epoch": 1.7849740932642488,
"grad_norm": 25.531240947030152,
"learning_rate": 1.2856636191901296e-06,
"loss": 1.0893,
"step": 689
},
{
"epoch": 1.7875647668393784,
"grad_norm": 25.382870674663973,
"learning_rate": 1.255557285710185e-06,
"loss": 1.1089,
"step": 690
},
{
"epoch": 1.790155440414508,
"grad_norm": 26.184606368046406,
"learning_rate": 1.225796214761117e-06,
"loss": 1.1515,
"step": 691
},
{
"epoch": 1.7927461139896375,
"grad_norm": 27.78595815725415,
"learning_rate": 1.196380954525802e-06,
"loss": 1.0871,
"step": 692
},
{
"epoch": 1.795336787564767,
"grad_norm": 32.137607036645285,
"learning_rate": 1.1673120468174837e-06,
"loss": 1.1396,
"step": 693
},
{
"epoch": 1.7979274611398963,
"grad_norm": 31.931928767500203,
"learning_rate": 1.1385900270697658e-06,
"loss": 1.1175,
"step": 694
},
{
"epoch": 1.8005181347150259,
"grad_norm": 36.61199052966244,
"learning_rate": 1.110215424326775e-06,
"loss": 1.1867,
"step": 695
},
{
"epoch": 1.8031088082901554,
"grad_norm": 49.9081839820131,
"learning_rate": 1.0821887612333959e-06,
"loss": 1.1266,
"step": 696
},
{
"epoch": 1.805699481865285,
"grad_norm": 25.346034138603734,
"learning_rate": 1.0545105540256628e-06,
"loss": 1.0614,
"step": 697
},
{
"epoch": 1.8082901554404145,
"grad_norm": 47.53838459679947,
"learning_rate": 1.0271813125212237e-06,
"loss": 1.1314,
"step": 698
},
{
"epoch": 1.810880829015544,
"grad_norm": 30.496460286583815,
"learning_rate": 1.0002015401099797e-06,
"loss": 1.1067,
"step": 699
},
{
"epoch": 1.8134715025906736,
"grad_norm": 29.929097539381686,
"learning_rate": 9.735717337447981e-07,
"loss": 1.0424,
"step": 700
},
{
"epoch": 1.8160621761658031,
"grad_norm": 30.887132457194266,
"learning_rate": 9.4729238393235e-07,
"loss": 1.1248,
"step": 701
},
{
"epoch": 1.8186528497409327,
"grad_norm": 24.26916275448189,
"learning_rate": 9.21363974724101e-07,
"loss": 1.0577,
"step": 702
},
{
"epoch": 1.8212435233160622,
"grad_norm": 40.34641617989283,
"learning_rate": 8.957869837073673e-07,
"loss": 1.1639,
"step": 703
},
{
"epoch": 1.8238341968911918,
"grad_norm": 34.3133374466777,
"learning_rate": 8.705618819965411e-07,
"loss": 1.0866,
"step": 704
},
{
"epoch": 1.8264248704663213,
"grad_norm": 25.164299615685284,
"learning_rate": 8.456891342243945e-07,
"loss": 1.1232,
"step": 705
},
{
"epoch": 1.8290155440414506,
"grad_norm": 129.91297199628124,
"learning_rate": 8.211691985335357e-07,
"loss": 1.1542,
"step": 706
},
{
"epoch": 1.8316062176165802,
"grad_norm": 23.928927141144797,
"learning_rate": 7.970025265679648e-07,
"loss": 1.0813,
"step": 707
},
{
"epoch": 1.8341968911917097,
"grad_norm": 22.631504479886225,
"learning_rate": 7.731895634647513e-07,
"loss": 1.1164,
"step": 708
},
{
"epoch": 1.8367875647668392,
"grad_norm": 84.2359250723018,
"learning_rate": 7.497307478458382e-07,
"loss": 1.1081,
"step": 709
},
{
"epoch": 1.8393782383419688,
"grad_norm": 51.39142883893451,
"learning_rate": 7.266265118099669e-07,
"loss": 1.105,
"step": 710
},
{
"epoch": 1.8419689119170983,
"grad_norm": 41.18280727079993,
"learning_rate": 7.038772809247075e-07,
"loss": 1.1211,
"step": 711
},
{
"epoch": 1.8445595854922279,
"grad_norm": 34.330855277813534,
"learning_rate": 6.814834742186361e-07,
"loss": 1.0783,
"step": 712
},
{
"epoch": 1.8471502590673574,
"grad_norm": 46.858780552576334,
"learning_rate": 6.594455041735925e-07,
"loss": 1.0214,
"step": 713
},
{
"epoch": 1.849740932642487,
"grad_norm": 94.2712798319484,
"learning_rate": 6.377637767171152e-07,
"loss": 1.098,
"step": 714
},
{
"epoch": 1.8523316062176165,
"grad_norm": 33.00073975184253,
"learning_rate": 6.164386912149289e-07,
"loss": 1.0906,
"step": 715
},
{
"epoch": 1.854922279792746,
"grad_norm": 30.030119862133272,
"learning_rate": 5.954706404636179e-07,
"loss": 1.1073,
"step": 716
},
{
"epoch": 1.8575129533678756,
"grad_norm": 46.42282973245658,
"learning_rate": 5.748600106833735e-07,
"loss": 1.1553,
"step": 717
},
{
"epoch": 1.8601036269430051,
"grad_norm": 26.48910946182044,
"learning_rate": 5.546071815108845e-07,
"loss": 1.0704,
"step": 718
},
{
"epoch": 1.8626943005181347,
"grad_norm": 29.34093197155635,
"learning_rate": 5.347125259923491e-07,
"loss": 1.1,
"step": 719
},
{
"epoch": 1.8652849740932642,
"grad_norm": 24.689130499541356,
"learning_rate": 5.151764105766011e-07,
"loss": 1.067,
"step": 720
},
{
"epoch": 1.8678756476683938,
"grad_norm": 21.25619644617847,
"learning_rate": 4.959991951083498e-07,
"loss": 1.1125,
"step": 721
},
{
"epoch": 1.8704663212435233,
"grad_norm": 23.946272802272112,
"learning_rate": 4.771812328215708e-07,
"loss": 1.0798,
"step": 722
},
{
"epoch": 1.8730569948186528,
"grad_norm": 33.286030816378954,
"learning_rate": 4.587228703329838e-07,
"loss": 1.0756,
"step": 723
},
{
"epoch": 1.8756476683937824,
"grad_norm": 109.02542545414109,
"learning_rate": 4.40624447635678e-07,
"loss": 1.073,
"step": 724
},
{
"epoch": 1.878238341968912,
"grad_norm": 133.80505789447585,
"learning_rate": 4.228862980928439e-07,
"loss": 1.1218,
"step": 725
},
{
"epoch": 1.8808290155440415,
"grad_norm": 28.671374209715793,
"learning_rate": 4.0550874843163337e-07,
"loss": 1.1546,
"step": 726
},
{
"epoch": 1.883419689119171,
"grad_norm": 20.092775273550536,
"learning_rate": 3.8849211873714266e-07,
"loss": 1.0608,
"step": 727
},
{
"epoch": 1.8860103626943006,
"grad_norm": 18.87195408427635,
"learning_rate": 3.7183672244652135e-07,
"loss": 1.0437,
"step": 728
},
{
"epoch": 1.88860103626943,
"grad_norm": 24.985644120932864,
"learning_rate": 3.5554286634318814e-07,
"loss": 1.0989,
"step": 729
},
{
"epoch": 1.8911917098445596,
"grad_norm": 24.09887960702925,
"learning_rate": 3.3961085055119083e-07,
"loss": 1.0347,
"step": 730
},
{
"epoch": 1.8937823834196892,
"grad_norm": 98.50926523613283,
"learning_rate": 3.2404096852967305e-07,
"loss": 1.1163,
"step": 731
},
{
"epoch": 1.8963730569948187,
"grad_norm": 42.45357973111845,
"learning_rate": 3.0883350706746973e-07,
"loss": 1.1497,
"step": 732
},
{
"epoch": 1.8989637305699483,
"grad_norm": 25.430184794482617,
"learning_rate": 2.9398874627782014e-07,
"loss": 1.0154,
"step": 733
},
{
"epoch": 1.9015544041450778,
"grad_norm": 32.56552224066898,
"learning_rate": 2.7950695959322093e-07,
"loss": 1.0976,
"step": 734
},
{
"epoch": 1.9041450777202074,
"grad_norm": 25.518391980867197,
"learning_rate": 2.653884137603702e-07,
"loss": 1.1122,
"step": 735
},
{
"epoch": 1.906735751295337,
"grad_norm": 20.537146853099735,
"learning_rate": 2.516333688352801e-07,
"loss": 1.0592,
"step": 736
},
{
"epoch": 1.9093264248704664,
"grad_norm": 25.28898033119641,
"learning_rate": 2.382420781784589e-07,
"loss": 1.0706,
"step": 737
},
{
"epoch": 1.911917098445596,
"grad_norm": 55.74230904177274,
"learning_rate": 2.2521478845025867e-07,
"loss": 1.1706,
"step": 738
},
{
"epoch": 1.9145077720207255,
"grad_norm": 42.768439146141375,
"learning_rate": 2.1255173960634146e-07,
"loss": 1.0917,
"step": 739
},
{
"epoch": 1.917098445595855,
"grad_norm": 31.627146067352545,
"learning_rate": 2.0025316489323597e-07,
"loss": 1.0842,
"step": 740
},
{
"epoch": 1.9196891191709846,
"grad_norm": 67.01614151937272,
"learning_rate": 1.8831929084406119e-07,
"loss": 1.1287,
"step": 741
},
{
"epoch": 1.922279792746114,
"grad_norm": 56.931018082229045,
"learning_rate": 1.7675033727434288e-07,
"loss": 1.148,
"step": 742
},
{
"epoch": 1.9248704663212435,
"grad_norm": 35.24107640275113,
"learning_rate": 1.655465172779702e-07,
"loss": 1.0814,
"step": 743
},
{
"epoch": 1.927461139896373,
"grad_norm": 28.45308969334642,
"learning_rate": 1.547080372232679e-07,
"loss": 1.1092,
"step": 744
},
{
"epoch": 1.9300518134715026,
"grad_norm": 67.36918357149847,
"learning_rate": 1.44235096749199e-07,
"loss": 1.1332,
"step": 745
},
{
"epoch": 1.932642487046632,
"grad_norm": 33.50866269131509,
"learning_rate": 1.3412788876167925e-07,
"loss": 1.0884,
"step": 746
},
{
"epoch": 1.9352331606217616,
"grad_norm": 34.359505767271465,
"learning_rate": 1.2438659943003306e-07,
"loss": 0.9982,
"step": 747
},
{
"epoch": 1.9378238341968912,
"grad_norm": 44.805290236152125,
"learning_rate": 1.1501140818355627e-07,
"loss": 1.065,
"step": 748
},
{
"epoch": 1.9404145077720207,
"grad_norm": 35.70322964853727,
"learning_rate": 1.0600248770821886e-07,
"loss": 1.1435,
"step": 749
},
{
"epoch": 1.9430051813471503,
"grad_norm": 37.7037381444634,
"learning_rate": 9.736000394348299e-08,
"loss": 1.1085,
"step": 750
},
{
"epoch": 1.9455958549222798,
"grad_norm": 19.88028370873119,
"learning_rate": 8.908411607923884e-08,
"loss": 1.0903,
"step": 751
},
{
"epoch": 1.9481865284974094,
"grad_norm": 22.037441897095253,
"learning_rate": 8.117497655287798e-08,
"loss": 1.0621,
"step": 752
},
{
"epoch": 1.950777202072539,
"grad_norm": 36.597366625713235,
"learning_rate": 7.363273104648904e-08,
"loss": 1.134,
"step": 753
},
{
"epoch": 1.9533678756476682,
"grad_norm": 36.91544331752125,
"learning_rate": 6.645751848417093e-08,
"loss": 1.0894,
"step": 754
},
{
"epoch": 1.9559585492227978,
"grad_norm": 30.791496804716704,
"learning_rate": 5.964947102946594e-08,
"loss": 1.0774,
"step": 755
},
{
"epoch": 1.9585492227979273,
"grad_norm": 24.76204564200231,
"learning_rate": 5.320871408294403e-08,
"loss": 1.1167,
"step": 756
},
{
"epoch": 1.9611398963730569,
"grad_norm": 31.78111531944549,
"learning_rate": 4.713536627987347e-08,
"loss": 1.0709,
"step": 757
},
{
"epoch": 1.9637305699481864,
"grad_norm": 36.388018093644106,
"learning_rate": 4.1429539488047066e-08,
"loss": 1.0492,
"step": 758
},
{
"epoch": 1.966321243523316,
"grad_norm": 27.235358627643226,
"learning_rate": 3.6091338805719356e-08,
"loss": 1.1128,
"step": 759
},
{
"epoch": 1.9689119170984455,
"grad_norm": 26.526882273916378,
"learning_rate": 3.1120862559670396e-08,
"loss": 1.1129,
"step": 760
},
{
"epoch": 1.971502590673575,
"grad_norm": 28.962449597773997,
"learning_rate": 2.651820230338942e-08,
"loss": 1.1286,
"step": 761
},
{
"epoch": 1.9740932642487046,
"grad_norm": 104.33848533313731,
"learning_rate": 2.2283442815402845e-08,
"loss": 1.117,
"step": 762
},
{
"epoch": 1.9766839378238341,
"grad_norm": 179.66099272542536,
"learning_rate": 1.8416662097693326e-08,
"loss": 1.0788,
"step": 763
},
{
"epoch": 1.9792746113989637,
"grad_norm": 28.438877123785307,
"learning_rate": 1.491793137427866e-08,
"loss": 1.1436,
"step": 764
},
{
"epoch": 1.9818652849740932,
"grad_norm": 44.454308819411644,
"learning_rate": 1.1787315089895057e-08,
"loss": 1.1108,
"step": 765
},
{
"epoch": 1.9844559585492227,
"grad_norm": 53.23249975862293,
"learning_rate": 9.024870908802552e-09,
"loss": 0.9971,
"step": 766
},
{
"epoch": 1.9870466321243523,
"grad_norm": 35.2043549019015,
"learning_rate": 6.630649713739168e-09,
"loss": 1.1205,
"step": 767
},
{
"epoch": 1.9896373056994818,
"grad_norm": 22.286284343829376,
"learning_rate": 4.6046956049639045e-09,
"loss": 1.0848,
"step": 768
},
{
"epoch": 1.9922279792746114,
"grad_norm": 24.94719200433733,
"learning_rate": 2.94704589946182e-09,
"loss": 1.1308,
"step": 769
},
{
"epoch": 1.994818652849741,
"grad_norm": 41.684623957583106,
"learning_rate": 1.657731130246809e-09,
"loss": 1.1555,
"step": 770
},
{
"epoch": 1.9974093264248705,
"grad_norm": 55.480495348949425,
"learning_rate": 7.367750458020518e-10,
"loss": 1.129,
"step": 771
},
{
"epoch": 2.0,
"grad_norm": 43.2148652279276,
"learning_rate": 1.8419460964258505e-10,
"loss": 1.0835,
"step": 772
}
],
"logging_steps": 1,
"max_steps": 772,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 193,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3363988309999616e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}