PEFT
Safetensors
glm4
axolotl
Generated from Trainer
GLM-v2-lora / checkpoint-822 /trainer_state.json
Delta-Vector's picture
Upload folder using huggingface_hub
6c23bbb verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9963547995139734,
"eval_steps": 103,
"global_step": 822,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002430133657351154,
"grad_norm": 715.4923219036787,
"learning_rate": 0.0,
"loss": 1.3541,
"step": 1
},
{
"epoch": 0.002430133657351154,
"eval_loss": 1.3335719108581543,
"eval_runtime": 53.4883,
"eval_samples_per_second": 13.91,
"eval_steps_per_second": 1.739,
"step": 1
},
{
"epoch": 0.004860267314702308,
"grad_norm": 614.6970578314867,
"learning_rate": 5e-06,
"loss": 1.3775,
"step": 2
},
{
"epoch": 0.007290400972053463,
"grad_norm": 471.59017991123795,
"learning_rate": 1e-05,
"loss": 1.339,
"step": 3
},
{
"epoch": 0.009720534629404616,
"grad_norm": 238.72216262259653,
"learning_rate": 1.5e-05,
"loss": 1.3829,
"step": 4
},
{
"epoch": 0.012150668286755772,
"grad_norm": 355.68955726709873,
"learning_rate": 2e-05,
"loss": 1.3597,
"step": 5
},
{
"epoch": 0.014580801944106925,
"grad_norm": 414.5627284272111,
"learning_rate": 2.5e-05,
"loss": 1.3862,
"step": 6
},
{
"epoch": 0.01701093560145808,
"grad_norm": 534.9877222052693,
"learning_rate": 3e-05,
"loss": 1.2784,
"step": 7
},
{
"epoch": 0.019441069258809233,
"grad_norm": 153.38895635666677,
"learning_rate": 3.5e-05,
"loss": 1.3521,
"step": 8
},
{
"epoch": 0.02187120291616039,
"grad_norm": 858.293734138087,
"learning_rate": 4e-05,
"loss": 1.2461,
"step": 9
},
{
"epoch": 0.024301336573511544,
"grad_norm": 255.81989388533376,
"learning_rate": 4.5e-05,
"loss": 1.2778,
"step": 10
},
{
"epoch": 0.026731470230862697,
"grad_norm": 368.91949003479226,
"learning_rate": 5e-05,
"loss": 1.3412,
"step": 11
},
{
"epoch": 0.02916160388821385,
"grad_norm": 176.49481799555898,
"learning_rate": 5.500000000000001e-05,
"loss": 1.3437,
"step": 12
},
{
"epoch": 0.031591737545565005,
"grad_norm": 208.57742104974147,
"learning_rate": 6e-05,
"loss": 1.2859,
"step": 13
},
{
"epoch": 0.03402187120291616,
"grad_norm": 93.26742036471734,
"learning_rate": 6.500000000000001e-05,
"loss": 1.1843,
"step": 14
},
{
"epoch": 0.03645200486026731,
"grad_norm": 145.53380444622215,
"learning_rate": 7e-05,
"loss": 1.4281,
"step": 15
},
{
"epoch": 0.038882138517618466,
"grad_norm": 126.56724937430516,
"learning_rate": 7.500000000000001e-05,
"loss": 1.3908,
"step": 16
},
{
"epoch": 0.041312272174969626,
"grad_norm": 106.19246390662754,
"learning_rate": 8e-05,
"loss": 1.344,
"step": 17
},
{
"epoch": 0.04374240583232078,
"grad_norm": 289.348178084847,
"learning_rate": 8.5e-05,
"loss": 1.2708,
"step": 18
},
{
"epoch": 0.046172539489671933,
"grad_norm": 286.63676887065634,
"learning_rate": 9e-05,
"loss": 1.3564,
"step": 19
},
{
"epoch": 0.04860267314702309,
"grad_norm": 269.6096299101413,
"learning_rate": 9.5e-05,
"loss": 1.2184,
"step": 20
},
{
"epoch": 0.05103280680437424,
"grad_norm": 151.28678796160915,
"learning_rate": 0.0001,
"loss": 1.2974,
"step": 21
},
{
"epoch": 0.053462940461725394,
"grad_norm": 265.5625538646362,
"learning_rate": 0.000105,
"loss": 1.2703,
"step": 22
},
{
"epoch": 0.05589307411907655,
"grad_norm": 724.7157187586193,
"learning_rate": 0.00011000000000000002,
"loss": 1.2691,
"step": 23
},
{
"epoch": 0.0583232077764277,
"grad_norm": 425.3768239347252,
"learning_rate": 0.00011499999999999999,
"loss": 1.375,
"step": 24
},
{
"epoch": 0.060753341433778855,
"grad_norm": 314.5119318308783,
"learning_rate": 0.00012,
"loss": 1.2952,
"step": 25
},
{
"epoch": 0.06318347509113001,
"grad_norm": 557.519173033834,
"learning_rate": 0.000125,
"loss": 1.2923,
"step": 26
},
{
"epoch": 0.06561360874848117,
"grad_norm": 211.4069356529637,
"learning_rate": 0.00013000000000000002,
"loss": 1.2629,
"step": 27
},
{
"epoch": 0.06804374240583232,
"grad_norm": 299.7742653722713,
"learning_rate": 0.00013500000000000003,
"loss": 1.3099,
"step": 28
},
{
"epoch": 0.07047387606318348,
"grad_norm": 182.18551965886013,
"learning_rate": 0.00014,
"loss": 1.2215,
"step": 29
},
{
"epoch": 0.07290400972053462,
"grad_norm": 153.38300520125887,
"learning_rate": 0.000145,
"loss": 1.2799,
"step": 30
},
{
"epoch": 0.07533414337788578,
"grad_norm": 849.4472853252786,
"learning_rate": 0.00015000000000000001,
"loss": 1.2012,
"step": 31
},
{
"epoch": 0.07776427703523693,
"grad_norm": 179.94814586965418,
"learning_rate": 0.000155,
"loss": 1.2103,
"step": 32
},
{
"epoch": 0.08019441069258809,
"grad_norm": 180.36681057956048,
"learning_rate": 0.00016,
"loss": 1.2414,
"step": 33
},
{
"epoch": 0.08262454434993925,
"grad_norm": 113.72852454032189,
"learning_rate": 0.000165,
"loss": 1.2508,
"step": 34
},
{
"epoch": 0.0850546780072904,
"grad_norm": 150.53415363213057,
"learning_rate": 0.00017,
"loss": 1.2528,
"step": 35
},
{
"epoch": 0.08748481166464156,
"grad_norm": 156.19567878683574,
"learning_rate": 0.000175,
"loss": 1.2016,
"step": 36
},
{
"epoch": 0.0899149453219927,
"grad_norm": 416.34884765145057,
"learning_rate": 0.00018,
"loss": 1.254,
"step": 37
},
{
"epoch": 0.09234507897934387,
"grad_norm": 269.7105025581372,
"learning_rate": 0.00018500000000000002,
"loss": 1.2215,
"step": 38
},
{
"epoch": 0.09477521263669501,
"grad_norm": 249.35069047655023,
"learning_rate": 0.00019,
"loss": 1.2078,
"step": 39
},
{
"epoch": 0.09720534629404617,
"grad_norm": 167.16896045613478,
"learning_rate": 0.000195,
"loss": 1.1866,
"step": 40
},
{
"epoch": 0.09963547995139732,
"grad_norm": 248.22240554128427,
"learning_rate": 0.0002,
"loss": 1.252,
"step": 41
},
{
"epoch": 0.10206561360874848,
"grad_norm": 180.89520841022969,
"learning_rate": 0.0001999991930332148,
"loss": 1.2251,
"step": 42
},
{
"epoch": 0.10449574726609964,
"grad_norm": 614.4291375430485,
"learning_rate": 0.00019999677214588312,
"loss": 1.2563,
"step": 43
},
{
"epoch": 0.10692588092345079,
"grad_norm": 211.7523427355369,
"learning_rate": 0.00019999273737707646,
"loss": 1.193,
"step": 44
},
{
"epoch": 0.10935601458080195,
"grad_norm": 181.56788458769344,
"learning_rate": 0.00019998708879191335,
"loss": 1.2598,
"step": 45
},
{
"epoch": 0.1117861482381531,
"grad_norm": 157.5783414916277,
"learning_rate": 0.00019997982648155814,
"loss": 1.2663,
"step": 46
},
{
"epoch": 0.11421628189550426,
"grad_norm": 155.78006251192625,
"learning_rate": 0.00019997095056321971,
"loss": 1.1637,
"step": 47
},
{
"epoch": 0.1166464155528554,
"grad_norm": 202.0253360488958,
"learning_rate": 0.00019996046118014955,
"loss": 1.2508,
"step": 48
},
{
"epoch": 0.11907654921020656,
"grad_norm": 192.7576297264874,
"learning_rate": 0.00019994835850163924,
"loss": 1.2014,
"step": 49
},
{
"epoch": 0.12150668286755771,
"grad_norm": 132.5484871621418,
"learning_rate": 0.00019993464272301804,
"loss": 1.2279,
"step": 50
},
{
"epoch": 0.12393681652490887,
"grad_norm": 128.32285438248965,
"learning_rate": 0.00019991931406564944,
"loss": 1.2179,
"step": 51
},
{
"epoch": 0.12636695018226002,
"grad_norm": 552.3669463716512,
"learning_rate": 0.00019990237277692788,
"loss": 1.1498,
"step": 52
},
{
"epoch": 0.12879708383961117,
"grad_norm": 86.17911790260192,
"learning_rate": 0.00019988381913027442,
"loss": 1.2784,
"step": 53
},
{
"epoch": 0.13122721749696234,
"grad_norm": 70.83294605515782,
"learning_rate": 0.00019986365342513265,
"loss": 1.2224,
"step": 54
},
{
"epoch": 0.1336573511543135,
"grad_norm": 45.23624563299466,
"learning_rate": 0.00019984187598696363,
"loss": 1.1746,
"step": 55
},
{
"epoch": 0.13608748481166463,
"grad_norm": 57.67645735585192,
"learning_rate": 0.00019981848716724073,
"loss": 1.2154,
"step": 56
},
{
"epoch": 0.1385176184690158,
"grad_norm": 45.661268047129674,
"learning_rate": 0.00019979348734344398,
"loss": 1.1411,
"step": 57
},
{
"epoch": 0.14094775212636695,
"grad_norm": 53.10628399970359,
"learning_rate": 0.00019976687691905393,
"loss": 1.2029,
"step": 58
},
{
"epoch": 0.1433778857837181,
"grad_norm": 38.71353325803162,
"learning_rate": 0.00019973865632354516,
"loss": 1.1976,
"step": 59
},
{
"epoch": 0.14580801944106925,
"grad_norm": 42.789208063581114,
"learning_rate": 0.0001997088260123793,
"loss": 1.1477,
"step": 60
},
{
"epoch": 0.14823815309842042,
"grad_norm": 37.613194740192164,
"learning_rate": 0.0001996773864669978,
"loss": 1.2529,
"step": 61
},
{
"epoch": 0.15066828675577157,
"grad_norm": 47.96813084127655,
"learning_rate": 0.00019964433819481405,
"loss": 1.2328,
"step": 62
},
{
"epoch": 0.15309842041312272,
"grad_norm": 55.30483872428545,
"learning_rate": 0.00019960968172920516,
"loss": 1.1996,
"step": 63
},
{
"epoch": 0.15552855407047386,
"grad_norm": 35.58995799070749,
"learning_rate": 0.00019957341762950344,
"loss": 1.1248,
"step": 64
},
{
"epoch": 0.15795868772782504,
"grad_norm": 58.86131222300149,
"learning_rate": 0.00019953554648098748,
"loss": 1.3017,
"step": 65
},
{
"epoch": 0.16038882138517618,
"grad_norm": 32.12091331878439,
"learning_rate": 0.00019949606889487233,
"loss": 1.1961,
"step": 66
},
{
"epoch": 0.16281895504252733,
"grad_norm": 167.27433996357928,
"learning_rate": 0.0001994549855083001,
"loss": 1.1768,
"step": 67
},
{
"epoch": 0.1652490886998785,
"grad_norm": 32.3328494297432,
"learning_rate": 0.0001994122969843293,
"loss": 1.1802,
"step": 68
},
{
"epoch": 0.16767922235722965,
"grad_norm": 39.92530074438497,
"learning_rate": 0.0001993680040119244,
"loss": 1.2098,
"step": 69
},
{
"epoch": 0.1701093560145808,
"grad_norm": 45.60830517129956,
"learning_rate": 0.0001993221073059445,
"loss": 1.2159,
"step": 70
},
{
"epoch": 0.17253948967193194,
"grad_norm": 35.462695032736335,
"learning_rate": 0.00019927460760713197,
"loss": 1.1818,
"step": 71
},
{
"epoch": 0.17496962332928312,
"grad_norm": 43.05751624597826,
"learning_rate": 0.0001992255056821004,
"loss": 1.2011,
"step": 72
},
{
"epoch": 0.17739975698663427,
"grad_norm": 47.13143404969894,
"learning_rate": 0.00019917480232332224,
"loss": 1.1669,
"step": 73
},
{
"epoch": 0.1798298906439854,
"grad_norm": 72.07146401418987,
"learning_rate": 0.000199122498349116,
"loss": 1.181,
"step": 74
},
{
"epoch": 0.1822600243013366,
"grad_norm": 36.289202348834955,
"learning_rate": 0.00019906859460363307,
"loss": 1.1787,
"step": 75
},
{
"epoch": 0.18469015795868773,
"grad_norm": 46.92636167228936,
"learning_rate": 0.00019901309195684416,
"loss": 1.2316,
"step": 76
},
{
"epoch": 0.18712029161603888,
"grad_norm": 31.71425340357504,
"learning_rate": 0.00019895599130452505,
"loss": 1.1607,
"step": 77
},
{
"epoch": 0.18955042527339003,
"grad_norm": 43.94199928621344,
"learning_rate": 0.00019889729356824235,
"loss": 1.1919,
"step": 78
},
{
"epoch": 0.1919805589307412,
"grad_norm": 45.33073791860179,
"learning_rate": 0.0001988369996953386,
"loss": 1.2237,
"step": 79
},
{
"epoch": 0.19441069258809235,
"grad_norm": 135.89980489661897,
"learning_rate": 0.00019877511065891673,
"loss": 1.1822,
"step": 80
},
{
"epoch": 0.1968408262454435,
"grad_norm": 439.6770852212966,
"learning_rate": 0.00019871162745782478,
"loss": 1.1441,
"step": 81
},
{
"epoch": 0.19927095990279464,
"grad_norm": 80.73319798776026,
"learning_rate": 0.0001986465511166394,
"loss": 1.1709,
"step": 82
},
{
"epoch": 0.20170109356014582,
"grad_norm": 87.76515297497458,
"learning_rate": 0.00019857988268564953,
"loss": 1.1549,
"step": 83
},
{
"epoch": 0.20413122721749696,
"grad_norm": 70.08754986406095,
"learning_rate": 0.00019851162324083932,
"loss": 1.1771,
"step": 84
},
{
"epoch": 0.2065613608748481,
"grad_norm": 187.8198997057664,
"learning_rate": 0.0001984417738838709,
"loss": 1.2068,
"step": 85
},
{
"epoch": 0.20899149453219928,
"grad_norm": 127.78818684755072,
"learning_rate": 0.00019837033574206646,
"loss": 1.1974,
"step": 86
},
{
"epoch": 0.21142162818955043,
"grad_norm": 127.82979216871074,
"learning_rate": 0.0001982973099683902,
"loss": 1.185,
"step": 87
},
{
"epoch": 0.21385176184690158,
"grad_norm": 142.35425084857746,
"learning_rate": 0.00019822269774142954,
"loss": 1.2225,
"step": 88
},
{
"epoch": 0.21628189550425272,
"grad_norm": 246.64019353564817,
"learning_rate": 0.0001981465002653763,
"loss": 1.2574,
"step": 89
},
{
"epoch": 0.2187120291616039,
"grad_norm": 189.88471076285524,
"learning_rate": 0.0001980687187700071,
"loss": 1.1635,
"step": 90
},
{
"epoch": 0.22114216281895505,
"grad_norm": 116.65693373141701,
"learning_rate": 0.00019798935451066361,
"loss": 1.1457,
"step": 91
},
{
"epoch": 0.2235722964763062,
"grad_norm": 71.76422539970217,
"learning_rate": 0.00019790840876823232,
"loss": 1.2354,
"step": 92
},
{
"epoch": 0.22600243013365734,
"grad_norm": 139.42330509386431,
"learning_rate": 0.0001978258828491236,
"loss": 1.18,
"step": 93
},
{
"epoch": 0.2284325637910085,
"grad_norm": 131.88308820601443,
"learning_rate": 0.00019774177808525113,
"loss": 1.1868,
"step": 94
},
{
"epoch": 0.23086269744835966,
"grad_norm": 85.81071125615291,
"learning_rate": 0.00019765609583400977,
"loss": 1.1814,
"step": 95
},
{
"epoch": 0.2332928311057108,
"grad_norm": 84.43756298541064,
"learning_rate": 0.00019756883747825424,
"loss": 1.1658,
"step": 96
},
{
"epoch": 0.23572296476306198,
"grad_norm": 114.24245545143974,
"learning_rate": 0.0001974800044262764,
"loss": 1.2497,
"step": 97
},
{
"epoch": 0.23815309842041313,
"grad_norm": 76.577511222722,
"learning_rate": 0.00019738959811178272,
"loss": 1.1414,
"step": 98
},
{
"epoch": 0.24058323207776428,
"grad_norm": 171.8084830895381,
"learning_rate": 0.00019729761999387103,
"loss": 1.1619,
"step": 99
},
{
"epoch": 0.24301336573511542,
"grad_norm": 221.87752250936416,
"learning_rate": 0.00019720407155700707,
"loss": 1.2718,
"step": 100
},
{
"epoch": 0.2454434993924666,
"grad_norm": 205.64943975370608,
"learning_rate": 0.00019710895431100046,
"loss": 1.1786,
"step": 101
},
{
"epoch": 0.24787363304981774,
"grad_norm": 160.16582903260615,
"learning_rate": 0.00019701226979098037,
"loss": 1.1426,
"step": 102
},
{
"epoch": 0.2503037667071689,
"grad_norm": 82.85031394537334,
"learning_rate": 0.00019691401955737072,
"loss": 1.1718,
"step": 103
},
{
"epoch": 0.2503037667071689,
"eval_loss": 1.1633374691009521,
"eval_runtime": 52.6182,
"eval_samples_per_second": 14.14,
"eval_steps_per_second": 1.767,
"step": 103
},
{
"epoch": 0.25273390036452004,
"grad_norm": 94.74469296109082,
"learning_rate": 0.000196814205195865,
"loss": 1.2255,
"step": 104
},
{
"epoch": 0.2551640340218712,
"grad_norm": 126.15797466756656,
"learning_rate": 0.00019671282831740076,
"loss": 1.1623,
"step": 105
},
{
"epoch": 0.25759416767922233,
"grad_norm": 79.41156434272008,
"learning_rate": 0.0001966098905581334,
"loss": 1.1606,
"step": 106
},
{
"epoch": 0.2600243013365735,
"grad_norm": 70.33104031058372,
"learning_rate": 0.00019650539357941003,
"loss": 1.196,
"step": 107
},
{
"epoch": 0.2624544349939247,
"grad_norm": 69.57260733822498,
"learning_rate": 0.0001963993390677424,
"loss": 1.1939,
"step": 108
},
{
"epoch": 0.2648845686512758,
"grad_norm": 81.78820691772725,
"learning_rate": 0.00019629172873477995,
"loss": 1.2553,
"step": 109
},
{
"epoch": 0.267314702308627,
"grad_norm": 117.06324110268656,
"learning_rate": 0.00019618256431728194,
"loss": 1.2535,
"step": 110
},
{
"epoch": 0.26974483596597815,
"grad_norm": 83.26993317104247,
"learning_rate": 0.00019607184757708951,
"loss": 1.157,
"step": 111
},
{
"epoch": 0.27217496962332927,
"grad_norm": 51.990829456422375,
"learning_rate": 0.00019595958030109735,
"loss": 1.1274,
"step": 112
},
{
"epoch": 0.27460510328068044,
"grad_norm": 119.7487160875729,
"learning_rate": 0.00019584576430122473,
"loss": 1.1422,
"step": 113
},
{
"epoch": 0.2770352369380316,
"grad_norm": 88.15636932272304,
"learning_rate": 0.00019573040141438624,
"loss": 1.1599,
"step": 114
},
{
"epoch": 0.27946537059538273,
"grad_norm": 62.346402225534774,
"learning_rate": 0.00019561349350246226,
"loss": 1.1909,
"step": 115
},
{
"epoch": 0.2818955042527339,
"grad_norm": 76.40612150653034,
"learning_rate": 0.0001954950424522688,
"loss": 1.1646,
"step": 116
},
{
"epoch": 0.284325637910085,
"grad_norm": 94.8711613055073,
"learning_rate": 0.00019537505017552716,
"loss": 1.1547,
"step": 117
},
{
"epoch": 0.2867557715674362,
"grad_norm": 63.86961661796314,
"learning_rate": 0.00019525351860883293,
"loss": 1.1841,
"step": 118
},
{
"epoch": 0.2891859052247874,
"grad_norm": 133.2417924150684,
"learning_rate": 0.00019513044971362494,
"loss": 1.1365,
"step": 119
},
{
"epoch": 0.2916160388821385,
"grad_norm": 133.44891510996445,
"learning_rate": 0.00019500584547615333,
"loss": 1.1696,
"step": 120
},
{
"epoch": 0.29404617253948967,
"grad_norm": 58.51701768739601,
"learning_rate": 0.00019487970790744774,
"loss": 1.1874,
"step": 121
},
{
"epoch": 0.29647630619684084,
"grad_norm": 49.536158238056196,
"learning_rate": 0.00019475203904328474,
"loss": 1.1798,
"step": 122
},
{
"epoch": 0.29890643985419196,
"grad_norm": 94.27608706983857,
"learning_rate": 0.000194622840944155,
"loss": 1.2443,
"step": 123
},
{
"epoch": 0.30133657351154314,
"grad_norm": 103.868243202843,
"learning_rate": 0.00019449211569523,
"loss": 1.1759,
"step": 124
},
{
"epoch": 0.3037667071688943,
"grad_norm": 73.31536435980003,
"learning_rate": 0.00019435986540632843,
"loss": 1.1885,
"step": 125
},
{
"epoch": 0.30619684082624543,
"grad_norm": 64.91149114745738,
"learning_rate": 0.00019422609221188207,
"loss": 1.1864,
"step": 126
},
{
"epoch": 0.3086269744835966,
"grad_norm": 95.34449184763653,
"learning_rate": 0.00019409079827090145,
"loss": 1.1339,
"step": 127
},
{
"epoch": 0.3110571081409477,
"grad_norm": 67.36156159754226,
"learning_rate": 0.00019395398576694086,
"loss": 1.1845,
"step": 128
},
{
"epoch": 0.3134872417982989,
"grad_norm": 36.94913176821407,
"learning_rate": 0.00019381565690806328,
"loss": 1.2154,
"step": 129
},
{
"epoch": 0.3159173754556501,
"grad_norm": 69.05265214547647,
"learning_rate": 0.00019367581392680457,
"loss": 1.1642,
"step": 130
},
{
"epoch": 0.3183475091130012,
"grad_norm": 38.974761165559855,
"learning_rate": 0.00019353445908013755,
"loss": 1.1508,
"step": 131
},
{
"epoch": 0.32077764277035237,
"grad_norm": 48.47215142199794,
"learning_rate": 0.00019339159464943557,
"loss": 1.2011,
"step": 132
},
{
"epoch": 0.32320777642770354,
"grad_norm": 41.88512063342574,
"learning_rate": 0.00019324722294043558,
"loss": 1.1643,
"step": 133
},
{
"epoch": 0.32563791008505466,
"grad_norm": 25.59403215229145,
"learning_rate": 0.00019310134628320114,
"loss": 1.1954,
"step": 134
},
{
"epoch": 0.32806804374240583,
"grad_norm": 58.02634988046396,
"learning_rate": 0.00019295396703208453,
"loss": 1.1544,
"step": 135
},
{
"epoch": 0.330498177399757,
"grad_norm": 31.26218977398251,
"learning_rate": 0.00019280508756568896,
"loss": 1.1613,
"step": 136
},
{
"epoch": 0.33292831105710813,
"grad_norm": 31.81234539284103,
"learning_rate": 0.00019265471028683014,
"loss": 1.1892,
"step": 137
},
{
"epoch": 0.3353584447144593,
"grad_norm": 54.44930114675527,
"learning_rate": 0.00019250283762249748,
"loss": 1.2801,
"step": 138
},
{
"epoch": 0.3377885783718105,
"grad_norm": 30.320486287732734,
"learning_rate": 0.00019234947202381486,
"loss": 1.1934,
"step": 139
},
{
"epoch": 0.3402187120291616,
"grad_norm": 32.76175001943503,
"learning_rate": 0.00019219461596600113,
"loss": 1.1436,
"step": 140
},
{
"epoch": 0.34264884568651277,
"grad_norm": 36.802264122697316,
"learning_rate": 0.00019203827194833026,
"loss": 1.1418,
"step": 141
},
{
"epoch": 0.3450789793438639,
"grad_norm": 35.03898729580271,
"learning_rate": 0.0001918804424940908,
"loss": 1.2479,
"step": 142
},
{
"epoch": 0.34750911300121506,
"grad_norm": 89.58068030461165,
"learning_rate": 0.00019172113015054532,
"loss": 1.2504,
"step": 143
},
{
"epoch": 0.34993924665856624,
"grad_norm": 30.05799668441019,
"learning_rate": 0.00019156033748888917,
"loss": 1.1662,
"step": 144
},
{
"epoch": 0.35236938031591736,
"grad_norm": 33.80121199203598,
"learning_rate": 0.00019139806710420914,
"loss": 1.1862,
"step": 145
},
{
"epoch": 0.35479951397326853,
"grad_norm": 31.510896023067872,
"learning_rate": 0.00019123432161544142,
"loss": 1.147,
"step": 146
},
{
"epoch": 0.3572296476306197,
"grad_norm": 32.92613286618093,
"learning_rate": 0.00019106910366532942,
"loss": 1.1421,
"step": 147
},
{
"epoch": 0.3596597812879708,
"grad_norm": 245.36013493823395,
"learning_rate": 0.00019090241592038113,
"loss": 1.1306,
"step": 148
},
{
"epoch": 0.362089914945322,
"grad_norm": 72.3061625644275,
"learning_rate": 0.000190734261070826,
"loss": 1.1144,
"step": 149
},
{
"epoch": 0.3645200486026732,
"grad_norm": 63.77748866336388,
"learning_rate": 0.00019056464183057157,
"loss": 1.1249,
"step": 150
},
{
"epoch": 0.3669501822600243,
"grad_norm": 633.2421324308109,
"learning_rate": 0.00019039356093715975,
"loss": 1.1359,
"step": 151
},
{
"epoch": 0.36938031591737547,
"grad_norm": 34.456657555313704,
"learning_rate": 0.00019022102115172248,
"loss": 1.1397,
"step": 152
},
{
"epoch": 0.3718104495747266,
"grad_norm": 35.21328820959324,
"learning_rate": 0.00019004702525893732,
"loss": 1.1741,
"step": 153
},
{
"epoch": 0.37424058323207776,
"grad_norm": 90.32405227187036,
"learning_rate": 0.00018987157606698235,
"loss": 1.1844,
"step": 154
},
{
"epoch": 0.37667071688942894,
"grad_norm": 39.348755664527914,
"learning_rate": 0.000189694676407491,
"loss": 1.1216,
"step": 155
},
{
"epoch": 0.37910085054678005,
"grad_norm": 58.85540744859834,
"learning_rate": 0.00018951632913550626,
"loss": 1.115,
"step": 156
},
{
"epoch": 0.38153098420413123,
"grad_norm": 39.849945227365325,
"learning_rate": 0.0001893365371294346,
"loss": 1.1705,
"step": 157
},
{
"epoch": 0.3839611178614824,
"grad_norm": 40.300954908722304,
"learning_rate": 0.0001891553032909996,
"loss": 1.1831,
"step": 158
},
{
"epoch": 0.3863912515188335,
"grad_norm": 53.72009888405355,
"learning_rate": 0.00018897263054519498,
"loss": 1.1613,
"step": 159
},
{
"epoch": 0.3888213851761847,
"grad_norm": 142.22686975859034,
"learning_rate": 0.0001887885218402375,
"loss": 1.1639,
"step": 160
},
{
"epoch": 0.39125151883353587,
"grad_norm": 50.141889086717356,
"learning_rate": 0.00018860298014751944,
"loss": 1.1659,
"step": 161
},
{
"epoch": 0.393681652490887,
"grad_norm": 63.25519968311113,
"learning_rate": 0.0001884160084615604,
"loss": 1.168,
"step": 162
},
{
"epoch": 0.39611178614823817,
"grad_norm": 50.59325246324073,
"learning_rate": 0.0001882276097999592,
"loss": 1.1202,
"step": 163
},
{
"epoch": 0.3985419198055893,
"grad_norm": 58.32587879810431,
"learning_rate": 0.0001880377872033451,
"loss": 1.1587,
"step": 164
},
{
"epoch": 0.40097205346294046,
"grad_norm": 211.50882688314653,
"learning_rate": 0.00018784654373532866,
"loss": 1.1551,
"step": 165
},
{
"epoch": 0.40340218712029163,
"grad_norm": 47.82888424614203,
"learning_rate": 0.00018765388248245246,
"loss": 1.2274,
"step": 166
},
{
"epoch": 0.40583232077764275,
"grad_norm": 97.94922685274778,
"learning_rate": 0.00018745980655414114,
"loss": 1.0872,
"step": 167
},
{
"epoch": 0.4082624544349939,
"grad_norm": 44.74994721544976,
"learning_rate": 0.0001872643190826512,
"loss": 1.1244,
"step": 168
},
{
"epoch": 0.4106925880923451,
"grad_norm": 53.84692426866845,
"learning_rate": 0.00018706742322302064,
"loss": 1.1576,
"step": 169
},
{
"epoch": 0.4131227217496962,
"grad_norm": 54.43599132185614,
"learning_rate": 0.0001868691221530178,
"loss": 1.0957,
"step": 170
},
{
"epoch": 0.4155528554070474,
"grad_norm": 39.21766518089018,
"learning_rate": 0.00018666941907309026,
"loss": 1.1625,
"step": 171
},
{
"epoch": 0.41798298906439857,
"grad_norm": 49.40030697752548,
"learning_rate": 0.000186468317206313,
"loss": 1.1556,
"step": 172
},
{
"epoch": 0.4204131227217497,
"grad_norm": 101.50309647820374,
"learning_rate": 0.0001862658197983366,
"loss": 1.1687,
"step": 173
},
{
"epoch": 0.42284325637910086,
"grad_norm": 105.41233861946563,
"learning_rate": 0.0001860619301173347,
"loss": 1.1687,
"step": 174
},
{
"epoch": 0.425273390036452,
"grad_norm": 103.99749987770305,
"learning_rate": 0.0001858566514539513,
"loss": 1.144,
"step": 175
},
{
"epoch": 0.42770352369380316,
"grad_norm": 78.83490301242213,
"learning_rate": 0.0001856499871212477,
"loss": 1.2318,
"step": 176
},
{
"epoch": 0.43013365735115433,
"grad_norm": 62.325757489859335,
"learning_rate": 0.00018544194045464886,
"loss": 1.1092,
"step": 177
},
{
"epoch": 0.43256379100850545,
"grad_norm": 81.32804926878099,
"learning_rate": 0.00018523251481188986,
"loss": 1.2233,
"step": 178
},
{
"epoch": 0.4349939246658566,
"grad_norm": 38.97928032166606,
"learning_rate": 0.00018502171357296144,
"loss": 1.2371,
"step": 179
},
{
"epoch": 0.4374240583232078,
"grad_norm": 82.62345361244209,
"learning_rate": 0.0001848095401400555,
"loss": 1.1562,
"step": 180
},
{
"epoch": 0.4398541919805589,
"grad_norm": 47.793381366401626,
"learning_rate": 0.0001845959979375104,
"loss": 1.1249,
"step": 181
},
{
"epoch": 0.4422843256379101,
"grad_norm": 53.6022948471739,
"learning_rate": 0.00018438109041175532,
"loss": 1.1415,
"step": 182
},
{
"epoch": 0.44471445929526127,
"grad_norm": 65.92717051568573,
"learning_rate": 0.00018416482103125506,
"loss": 1.1748,
"step": 183
},
{
"epoch": 0.4471445929526124,
"grad_norm": 59.410481167619494,
"learning_rate": 0.0001839471932864537,
"loss": 1.1399,
"step": 184
},
{
"epoch": 0.44957472660996356,
"grad_norm": 64.22740395872977,
"learning_rate": 0.0001837282106897185,
"loss": 1.2193,
"step": 185
},
{
"epoch": 0.4520048602673147,
"grad_norm": 54.63497168787729,
"learning_rate": 0.00018350787677528306,
"loss": 1.153,
"step": 186
},
{
"epoch": 0.45443499392466585,
"grad_norm": 49.60676029637355,
"learning_rate": 0.00018328619509919044,
"loss": 1.1509,
"step": 187
},
{
"epoch": 0.456865127582017,
"grad_norm": 32.29074835877607,
"learning_rate": 0.00018306316923923563,
"loss": 1.1851,
"step": 188
},
{
"epoch": 0.45929526123936815,
"grad_norm": 61.13632454163589,
"learning_rate": 0.0001828388027949078,
"loss": 1.1323,
"step": 189
},
{
"epoch": 0.4617253948967193,
"grad_norm": 67.48617660835801,
"learning_rate": 0.00018261309938733238,
"loss": 1.1956,
"step": 190
},
{
"epoch": 0.4641555285540705,
"grad_norm": 38.31182257784929,
"learning_rate": 0.00018238606265921238,
"loss": 1.1379,
"step": 191
},
{
"epoch": 0.4665856622114216,
"grad_norm": 47.30995766708629,
"learning_rate": 0.00018215769627476984,
"loss": 1.1462,
"step": 192
},
{
"epoch": 0.4690157958687728,
"grad_norm": 34.57093925891121,
"learning_rate": 0.00018192800391968642,
"loss": 1.1979,
"step": 193
},
{
"epoch": 0.47144592952612396,
"grad_norm": 34.45645740457662,
"learning_rate": 0.0001816969893010442,
"loss": 1.1763,
"step": 194
},
{
"epoch": 0.4738760631834751,
"grad_norm": 39.21862152859671,
"learning_rate": 0.00018146465614726567,
"loss": 1.1514,
"step": 195
},
{
"epoch": 0.47630619684082626,
"grad_norm": 34.765347344568106,
"learning_rate": 0.00018123100820805355,
"loss": 1.1426,
"step": 196
},
{
"epoch": 0.4787363304981774,
"grad_norm": 35.04245362239315,
"learning_rate": 0.00018099604925433043,
"loss": 1.143,
"step": 197
},
{
"epoch": 0.48116646415552855,
"grad_norm": 103.45636476066032,
"learning_rate": 0.00018075978307817764,
"loss": 1.1713,
"step": 198
},
{
"epoch": 0.4835965978128797,
"grad_norm": 43.0297373660821,
"learning_rate": 0.00018052221349277442,
"loss": 1.2226,
"step": 199
},
{
"epoch": 0.48602673147023084,
"grad_norm": 32.80474372048966,
"learning_rate": 0.000180283344332336,
"loss": 1.1556,
"step": 200
},
{
"epoch": 0.488456865127582,
"grad_norm": 59.42688731224296,
"learning_rate": 0.00018004317945205197,
"loss": 1.1411,
"step": 201
},
{
"epoch": 0.4908869987849332,
"grad_norm": 102.0917822407188,
"learning_rate": 0.000179801722728024,
"loss": 1.1309,
"step": 202
},
{
"epoch": 0.4933171324422843,
"grad_norm": 309.9346821950787,
"learning_rate": 0.0001795589780572031,
"loss": 1.1953,
"step": 203
},
{
"epoch": 0.4957472660996355,
"grad_norm": 344.5019267346993,
"learning_rate": 0.0001793149493573271,
"loss": 1.1524,
"step": 204
},
{
"epoch": 0.49817739975698666,
"grad_norm": 50.075205946207085,
"learning_rate": 0.00017906964056685706,
"loss": 1.1495,
"step": 205
},
{
"epoch": 0.5006075334143378,
"grad_norm": 132.32227258331488,
"learning_rate": 0.00017882305564491396,
"loss": 1.1976,
"step": 206
},
{
"epoch": 0.5006075334143378,
"eval_loss": 1.146019458770752,
"eval_runtime": 52.7816,
"eval_samples_per_second": 14.096,
"eval_steps_per_second": 1.762,
"step": 206
},
{
"epoch": 0.503037667071689,
"grad_norm": 138.57200377669218,
"learning_rate": 0.00017857519857121458,
"loss": 1.2159,
"step": 207
},
{
"epoch": 0.5054678007290401,
"grad_norm": 268.41109734161546,
"learning_rate": 0.00017832607334600746,
"loss": 1.1748,
"step": 208
},
{
"epoch": 0.5078979343863913,
"grad_norm": 72.44153953442401,
"learning_rate": 0.00017807568399000822,
"loss": 1.1758,
"step": 209
},
{
"epoch": 0.5103280680437424,
"grad_norm": 97.75400124096738,
"learning_rate": 0.00017782403454433477,
"loss": 1.1004,
"step": 210
},
{
"epoch": 0.5127582017010935,
"grad_norm": 84.19522802756285,
"learning_rate": 0.000177571129070442,
"loss": 1.1397,
"step": 211
},
{
"epoch": 0.5151883353584447,
"grad_norm": 132.95081835535706,
"learning_rate": 0.00017731697165005618,
"loss": 1.146,
"step": 212
},
{
"epoch": 0.5176184690157959,
"grad_norm": 560.3351292126325,
"learning_rate": 0.0001770615663851093,
"loss": 1.1937,
"step": 213
},
{
"epoch": 0.520048602673147,
"grad_norm": 252.72862614645885,
"learning_rate": 0.0001768049173976727,
"loss": 1.1213,
"step": 214
},
{
"epoch": 0.5224787363304981,
"grad_norm": 356.2985211032981,
"learning_rate": 0.0001765470288298905,
"loss": 1.22,
"step": 215
},
{
"epoch": 0.5249088699878494,
"grad_norm": 952.600672502031,
"learning_rate": 0.00017628790484391284,
"loss": 1.1321,
"step": 216
},
{
"epoch": 0.5273390036452005,
"grad_norm": 289.9357041930161,
"learning_rate": 0.0001760275496218288,
"loss": 1.1688,
"step": 217
},
{
"epoch": 0.5297691373025516,
"grad_norm": 48.69445264741508,
"learning_rate": 0.0001757659673655986,
"loss": 1.1551,
"step": 218
},
{
"epoch": 0.5321992709599028,
"grad_norm": 40.15160247154335,
"learning_rate": 0.0001755031622969862,
"loss": 1.1459,
"step": 219
},
{
"epoch": 0.534629404617254,
"grad_norm": 44.59390817019205,
"learning_rate": 0.00017523913865749078,
"loss": 1.2012,
"step": 220
},
{
"epoch": 0.5370595382746051,
"grad_norm": 30.189717624412484,
"learning_rate": 0.00017497390070827848,
"loss": 1.15,
"step": 221
},
{
"epoch": 0.5394896719319563,
"grad_norm": 27.185608574176108,
"learning_rate": 0.00017470745273011362,
"loss": 1.0763,
"step": 222
},
{
"epoch": 0.5419198055893074,
"grad_norm": 99.44121390806423,
"learning_rate": 0.00017443979902328956,
"loss": 1.1478,
"step": 223
},
{
"epoch": 0.5443499392466585,
"grad_norm": 29.684499344634585,
"learning_rate": 0.00017417094390755934,
"loss": 1.1123,
"step": 224
},
{
"epoch": 0.5467800729040098,
"grad_norm": 26.788847114635054,
"learning_rate": 0.00017390089172206592,
"loss": 1.1169,
"step": 225
},
{
"epoch": 0.5492102065613609,
"grad_norm": 31.84817878214798,
"learning_rate": 0.00017362964682527218,
"loss": 1.1524,
"step": 226
},
{
"epoch": 0.551640340218712,
"grad_norm": 34.834632993822424,
"learning_rate": 0.00017335721359489057,
"loss": 1.1761,
"step": 227
},
{
"epoch": 0.5540704738760632,
"grad_norm": 66.6084234453716,
"learning_rate": 0.00017308359642781242,
"loss": 1.1175,
"step": 228
},
{
"epoch": 0.5565006075334143,
"grad_norm": 35.15720180142773,
"learning_rate": 0.00017280879974003707,
"loss": 1.2012,
"step": 229
},
{
"epoch": 0.5589307411907655,
"grad_norm": 35.975450782756226,
"learning_rate": 0.00017253282796660056,
"loss": 1.1801,
"step": 230
},
{
"epoch": 0.5613608748481167,
"grad_norm": 83.49050230764925,
"learning_rate": 0.0001722556855615039,
"loss": 1.1576,
"step": 231
},
{
"epoch": 0.5637910085054678,
"grad_norm": 150.44630441002784,
"learning_rate": 0.00017197737699764146,
"loss": 1.1826,
"step": 232
},
{
"epoch": 0.5662211421628189,
"grad_norm": 31.322382197739042,
"learning_rate": 0.00017169790676672858,
"loss": 1.1784,
"step": 233
},
{
"epoch": 0.56865127582017,
"grad_norm": 33.15983653687515,
"learning_rate": 0.0001714172793792291,
"loss": 1.1411,
"step": 234
},
{
"epoch": 0.5710814094775213,
"grad_norm": 22.206850165103052,
"learning_rate": 0.0001711354993642827,
"loss": 1.1772,
"step": 235
},
{
"epoch": 0.5735115431348724,
"grad_norm": 43.35721272668955,
"learning_rate": 0.00017085257126963152,
"loss": 1.0915,
"step": 236
},
{
"epoch": 0.5759416767922235,
"grad_norm": 29.57234737116712,
"learning_rate": 0.0001705684996615472,
"loss": 1.0977,
"step": 237
},
{
"epoch": 0.5783718104495748,
"grad_norm": 42.929644875053214,
"learning_rate": 0.00017028328912475668,
"loss": 1.1782,
"step": 238
},
{
"epoch": 0.5808019441069259,
"grad_norm": 32.15711272871687,
"learning_rate": 0.0001699969442623686,
"loss": 1.1855,
"step": 239
},
{
"epoch": 0.583232077764277,
"grad_norm": 43.64453730184205,
"learning_rate": 0.00016970946969579887,
"loss": 1.1171,
"step": 240
},
{
"epoch": 0.5856622114216282,
"grad_norm": 26.145541544112593,
"learning_rate": 0.00016942087006469592,
"loss": 1.1656,
"step": 241
},
{
"epoch": 0.5880923450789793,
"grad_norm": 53.98173886095731,
"learning_rate": 0.00016913115002686616,
"loss": 1.1378,
"step": 242
},
{
"epoch": 0.5905224787363305,
"grad_norm": 50.851193586801195,
"learning_rate": 0.00016884031425819853,
"loss": 1.1338,
"step": 243
},
{
"epoch": 0.5929526123936817,
"grad_norm": 30.166674036386443,
"learning_rate": 0.0001685483674525891,
"loss": 1.1732,
"step": 244
},
{
"epoch": 0.5953827460510328,
"grad_norm": 32.580505176392656,
"learning_rate": 0.00016825531432186543,
"loss": 1.143,
"step": 245
},
{
"epoch": 0.5978128797083839,
"grad_norm": 35.087231952662634,
"learning_rate": 0.0001679611595957103,
"loss": 1.212,
"step": 246
},
{
"epoch": 0.6002430133657352,
"grad_norm": 44.69578306542608,
"learning_rate": 0.00016766590802158566,
"loss": 1.1527,
"step": 247
},
{
"epoch": 0.6026731470230863,
"grad_norm": 39.8378839133733,
"learning_rate": 0.00016736956436465573,
"loss": 1.2174,
"step": 248
},
{
"epoch": 0.6051032806804374,
"grad_norm": 25.571860004032857,
"learning_rate": 0.0001670721334077103,
"loss": 1.1031,
"step": 249
},
{
"epoch": 0.6075334143377886,
"grad_norm": 27.626061413643438,
"learning_rate": 0.00016677361995108743,
"loss": 1.107,
"step": 250
},
{
"epoch": 0.6099635479951397,
"grad_norm": 47.405627339857176,
"learning_rate": 0.00016647402881259598,
"loss": 1.1521,
"step": 251
},
{
"epoch": 0.6123936816524909,
"grad_norm": 31.951762409660272,
"learning_rate": 0.00016617336482743794,
"loss": 1.174,
"step": 252
},
{
"epoch": 0.6148238153098421,
"grad_norm": 44.304437144236104,
"learning_rate": 0.00016587163284813032,
"loss": 1.1286,
"step": 253
},
{
"epoch": 0.6172539489671932,
"grad_norm": 21.990501251879344,
"learning_rate": 0.00016556883774442675,
"loss": 1.1927,
"step": 254
},
{
"epoch": 0.6196840826245443,
"grad_norm": 43.91119350789936,
"learning_rate": 0.00016526498440323914,
"loss": 1.1399,
"step": 255
},
{
"epoch": 0.6221142162818954,
"grad_norm": 28.064569132249982,
"learning_rate": 0.00016496007772855853,
"loss": 1.1913,
"step": 256
},
{
"epoch": 0.6245443499392467,
"grad_norm": 99.97142272243896,
"learning_rate": 0.0001646541226413761,
"loss": 1.1694,
"step": 257
},
{
"epoch": 0.6269744835965978,
"grad_norm": 27.12524206817854,
"learning_rate": 0.00016434712407960373,
"loss": 1.2398,
"step": 258
},
{
"epoch": 0.6294046172539489,
"grad_norm": 42.99171796479219,
"learning_rate": 0.00016403908699799425,
"loss": 1.145,
"step": 259
},
{
"epoch": 0.6318347509113001,
"grad_norm": 24.064938768293658,
"learning_rate": 0.00016373001636806153,
"loss": 1.098,
"step": 260
},
{
"epoch": 0.6342648845686513,
"grad_norm": 31.72232981247621,
"learning_rate": 0.00016341991717800023,
"loss": 1.1779,
"step": 261
},
{
"epoch": 0.6366950182260024,
"grad_norm": 39.97326887390835,
"learning_rate": 0.00016310879443260528,
"loss": 1.3142,
"step": 262
},
{
"epoch": 0.6391251518833536,
"grad_norm": 27.519208072826963,
"learning_rate": 0.00016279665315319114,
"loss": 1.2039,
"step": 263
},
{
"epoch": 0.6415552855407047,
"grad_norm": 52.94895557810481,
"learning_rate": 0.00016248349837751062,
"loss": 1.1718,
"step": 264
},
{
"epoch": 0.6439854191980559,
"grad_norm": 23.603047222747566,
"learning_rate": 0.0001621693351596739,
"loss": 1.1155,
"step": 265
},
{
"epoch": 0.6464155528554071,
"grad_norm": 21.400341520569807,
"learning_rate": 0.00016185416857006647,
"loss": 1.1242,
"step": 266
},
{
"epoch": 0.6488456865127582,
"grad_norm": 51.167335508822276,
"learning_rate": 0.00016153800369526788,
"loss": 1.1746,
"step": 267
},
{
"epoch": 0.6512758201701093,
"grad_norm": 26.219581065473573,
"learning_rate": 0.00016122084563796905,
"loss": 1.0836,
"step": 268
},
{
"epoch": 0.6537059538274606,
"grad_norm": 56.820249886600706,
"learning_rate": 0.0001609026995168904,
"loss": 1.1625,
"step": 269
},
{
"epoch": 0.6561360874848117,
"grad_norm": 37.43384869992443,
"learning_rate": 0.00016058357046669898,
"loss": 1.2143,
"step": 270
},
{
"epoch": 0.6585662211421628,
"grad_norm": 31.885237168871473,
"learning_rate": 0.00016026346363792567,
"loss": 1.1536,
"step": 271
},
{
"epoch": 0.660996354799514,
"grad_norm": 34.66147983279251,
"learning_rate": 0.00015994238419688199,
"loss": 1.2095,
"step": 272
},
{
"epoch": 0.6634264884568651,
"grad_norm": 86.90365354594917,
"learning_rate": 0.00015962033732557686,
"loss": 1.1149,
"step": 273
},
{
"epoch": 0.6658566221142163,
"grad_norm": 52.21177462889067,
"learning_rate": 0.00015929732822163287,
"loss": 1.1861,
"step": 274
},
{
"epoch": 0.6682867557715675,
"grad_norm": 92.11184701145604,
"learning_rate": 0.00015897336209820239,
"loss": 1.1853,
"step": 275
},
{
"epoch": 0.6707168894289186,
"grad_norm": 30.662475573811115,
"learning_rate": 0.00015864844418388342,
"loss": 1.0912,
"step": 276
},
{
"epoch": 0.6731470230862697,
"grad_norm": 26.15855468837027,
"learning_rate": 0.00015832257972263523,
"loss": 1.1618,
"step": 277
},
{
"epoch": 0.675577156743621,
"grad_norm": 41.14250673970726,
"learning_rate": 0.00015799577397369375,
"loss": 1.1499,
"step": 278
},
{
"epoch": 0.6780072904009721,
"grad_norm": 31.93253644773631,
"learning_rate": 0.00015766803221148673,
"loss": 1.1229,
"step": 279
},
{
"epoch": 0.6804374240583232,
"grad_norm": 39.87120131585165,
"learning_rate": 0.00015733935972554844,
"loss": 1.1647,
"step": 280
},
{
"epoch": 0.6828675577156743,
"grad_norm": 52.741654062271124,
"learning_rate": 0.0001570097618204345,
"loss": 1.1362,
"step": 281
},
{
"epoch": 0.6852976913730255,
"grad_norm": 33.13137686002526,
"learning_rate": 0.0001566792438156362,
"loss": 1.1825,
"step": 282
},
{
"epoch": 0.6877278250303767,
"grad_norm": 20.284041564566042,
"learning_rate": 0.00015634781104549442,
"loss": 1.1439,
"step": 283
},
{
"epoch": 0.6901579586877278,
"grad_norm": 164.9222932471453,
"learning_rate": 0.00015601546885911404,
"loss": 1.122,
"step": 284
},
{
"epoch": 0.692588092345079,
"grad_norm": 27.092346730158148,
"learning_rate": 0.00015568222262027717,
"loss": 1.157,
"step": 285
},
{
"epoch": 0.6950182260024301,
"grad_norm": 39.46898996008012,
"learning_rate": 0.00015534807770735664,
"loss": 1.1092,
"step": 286
},
{
"epoch": 0.6974483596597812,
"grad_norm": 30.00942949300714,
"learning_rate": 0.00015501303951322943,
"loss": 1.243,
"step": 287
},
{
"epoch": 0.6998784933171325,
"grad_norm": 31.435817418038887,
"learning_rate": 0.00015467711344518942,
"loss": 1.1034,
"step": 288
},
{
"epoch": 0.7023086269744836,
"grad_norm": 54.53572773177548,
"learning_rate": 0.00015434030492486023,
"loss": 1.2216,
"step": 289
},
{
"epoch": 0.7047387606318347,
"grad_norm": 24.51082708234768,
"learning_rate": 0.00015400261938810757,
"loss": 1.1532,
"step": 290
},
{
"epoch": 0.707168894289186,
"grad_norm": 104.85480514443172,
"learning_rate": 0.00015366406228495172,
"loss": 1.1156,
"step": 291
},
{
"epoch": 0.7095990279465371,
"grad_norm": 26.398830117870997,
"learning_rate": 0.0001533246390794794,
"loss": 1.0934,
"step": 292
},
{
"epoch": 0.7120291616038882,
"grad_norm": 25.062392373037707,
"learning_rate": 0.00015298435524975572,
"loss": 1.1453,
"step": 293
},
{
"epoch": 0.7144592952612394,
"grad_norm": 25.385505352027444,
"learning_rate": 0.0001526432162877356,
"loss": 1.1359,
"step": 294
},
{
"epoch": 0.7168894289185905,
"grad_norm": 18.00146943000571,
"learning_rate": 0.00015230122769917527,
"loss": 1.1129,
"step": 295
},
{
"epoch": 0.7193195625759417,
"grad_norm": 22.55383473288135,
"learning_rate": 0.00015195839500354335,
"loss": 1.142,
"step": 296
},
{
"epoch": 0.7217496962332929,
"grad_norm": 30.013723395820165,
"learning_rate": 0.00015161472373393186,
"loss": 1.1379,
"step": 297
},
{
"epoch": 0.724179829890644,
"grad_norm": 40.566201545240425,
"learning_rate": 0.0001512702194369668,
"loss": 1.1326,
"step": 298
},
{
"epoch": 0.7266099635479951,
"grad_norm": 27.34716639907029,
"learning_rate": 0.00015092488767271857,
"loss": 1.0782,
"step": 299
},
{
"epoch": 0.7290400972053463,
"grad_norm": 45.0837594669075,
"learning_rate": 0.00015057873401461253,
"loss": 1.2054,
"step": 300
},
{
"epoch": 0.7314702308626975,
"grad_norm": 22.39794101270309,
"learning_rate": 0.00015023176404933874,
"loss": 1.1052,
"step": 301
},
{
"epoch": 0.7339003645200486,
"grad_norm": 21.818512025585306,
"learning_rate": 0.00014988398337676198,
"loss": 1.1664,
"step": 302
},
{
"epoch": 0.7363304981773997,
"grad_norm": 33.09386163968815,
"learning_rate": 0.00014953539760983122,
"loss": 1.1364,
"step": 303
},
{
"epoch": 0.7387606318347509,
"grad_norm": 26.3253592215911,
"learning_rate": 0.00014918601237448923,
"loss": 1.1093,
"step": 304
},
{
"epoch": 0.741190765492102,
"grad_norm": 32.54878723405212,
"learning_rate": 0.0001488358333095816,
"loss": 1.182,
"step": 305
},
{
"epoch": 0.7436208991494532,
"grad_norm": 28.645473311846015,
"learning_rate": 0.0001484848660667658,
"loss": 1.2064,
"step": 306
},
{
"epoch": 0.7460510328068044,
"grad_norm": 29.02693042820854,
"learning_rate": 0.00014813311631041995,
"loss": 1.1545,
"step": 307
},
{
"epoch": 0.7484811664641555,
"grad_norm": 20.28193033099828,
"learning_rate": 0.00014778058971755154,
"loss": 1.1885,
"step": 308
},
{
"epoch": 0.7509113001215066,
"grad_norm": 121.86121371804961,
"learning_rate": 0.00014742729197770552,
"loss": 1.095,
"step": 309
},
{
"epoch": 0.7509113001215066,
"eval_loss": 1.133868932723999,
"eval_runtime": 52.6711,
"eval_samples_per_second": 14.125,
"eval_steps_per_second": 1.766,
"step": 309
},
{
"epoch": 0.7533414337788579,
"grad_norm": 50.1793074315811,
"learning_rate": 0.00014707322879287276,
"loss": 1.1679,
"step": 310
},
{
"epoch": 0.755771567436209,
"grad_norm": 31.791309498678103,
"learning_rate": 0.00014671840587739783,
"loss": 1.1277,
"step": 311
},
{
"epoch": 0.7582017010935601,
"grad_norm": 56.88911226488106,
"learning_rate": 0.00014636282895788688,
"loss": 1.1492,
"step": 312
},
{
"epoch": 0.7606318347509113,
"grad_norm": 117.29437608667352,
"learning_rate": 0.00014600650377311522,
"loss": 1.1123,
"step": 313
},
{
"epoch": 0.7630619684082625,
"grad_norm": 107.56728772749254,
"learning_rate": 0.00014564943607393459,
"loss": 1.171,
"step": 314
},
{
"epoch": 0.7654921020656136,
"grad_norm": 34.085830256919685,
"learning_rate": 0.0001452916316231805,
"loss": 1.1854,
"step": 315
},
{
"epoch": 0.7679222357229648,
"grad_norm": 23.625747202851176,
"learning_rate": 0.000144933096195579,
"loss": 1.1622,
"step": 316
},
{
"epoch": 0.7703523693803159,
"grad_norm": 56.9917185309248,
"learning_rate": 0.00014457383557765386,
"loss": 1.2037,
"step": 317
},
{
"epoch": 0.772782503037667,
"grad_norm": 34.55554043725056,
"learning_rate": 0.00014421385556763266,
"loss": 1.1273,
"step": 318
},
{
"epoch": 0.7752126366950183,
"grad_norm": 34.205286759913115,
"learning_rate": 0.00014385316197535372,
"loss": 1.2039,
"step": 319
},
{
"epoch": 0.7776427703523694,
"grad_norm": 27.30015395778206,
"learning_rate": 0.00014349176062217195,
"loss": 1.1903,
"step": 320
},
{
"epoch": 0.7800729040097205,
"grad_norm": 23.077745147127867,
"learning_rate": 0.00014312965734086518,
"loss": 1.1539,
"step": 321
},
{
"epoch": 0.7825030376670717,
"grad_norm": 26.22112568156326,
"learning_rate": 0.00014276685797553977,
"loss": 1.1807,
"step": 322
},
{
"epoch": 0.7849331713244229,
"grad_norm": 34.813719314948514,
"learning_rate": 0.0001424033683815365,
"loss": 1.1247,
"step": 323
},
{
"epoch": 0.787363304981774,
"grad_norm": 27.109609629038324,
"learning_rate": 0.00014203919442533597,
"loss": 1.1735,
"step": 324
},
{
"epoch": 0.7897934386391251,
"grad_norm": 144.91672798575476,
"learning_rate": 0.00014167434198446383,
"loss": 1.1007,
"step": 325
},
{
"epoch": 0.7922235722964763,
"grad_norm": 42.19042828736382,
"learning_rate": 0.00014130881694739616,
"loss": 1.1398,
"step": 326
},
{
"epoch": 0.7946537059538274,
"grad_norm": 43.00144921766715,
"learning_rate": 0.00014094262521346427,
"loss": 1.1712,
"step": 327
},
{
"epoch": 0.7970838396111786,
"grad_norm": 26.343159670729925,
"learning_rate": 0.0001405757726927595,
"loss": 1.2103,
"step": 328
},
{
"epoch": 0.7995139732685298,
"grad_norm": 31.68271222195729,
"learning_rate": 0.00014020826530603776,
"loss": 1.1578,
"step": 329
},
{
"epoch": 0.8019441069258809,
"grad_norm": 39.08920292536896,
"learning_rate": 0.00013984010898462416,
"loss": 1.1377,
"step": 330
},
{
"epoch": 0.804374240583232,
"grad_norm": 34.56898084569197,
"learning_rate": 0.00013947130967031717,
"loss": 1.1886,
"step": 331
},
{
"epoch": 0.8068043742405833,
"grad_norm": 42.016356369933895,
"learning_rate": 0.00013910187331529276,
"loss": 1.1577,
"step": 332
},
{
"epoch": 0.8092345078979344,
"grad_norm": 21.25953597879822,
"learning_rate": 0.00013873180588200827,
"loss": 1.1259,
"step": 333
},
{
"epoch": 0.8116646415552855,
"grad_norm": 39.49634140985428,
"learning_rate": 0.0001383611133431062,
"loss": 1.173,
"step": 334
},
{
"epoch": 0.8140947752126367,
"grad_norm": 29.837690582268863,
"learning_rate": 0.00013798980168131794,
"loss": 1.1322,
"step": 335
},
{
"epoch": 0.8165249088699879,
"grad_norm": 23.510451396240928,
"learning_rate": 0.000137617876889367,
"loss": 1.1392,
"step": 336
},
{
"epoch": 0.818955042527339,
"grad_norm": 19.183017199526635,
"learning_rate": 0.00013724534496987247,
"loss": 1.157,
"step": 337
},
{
"epoch": 0.8213851761846902,
"grad_norm": 51.85037647612581,
"learning_rate": 0.0001368722119352521,
"loss": 1.1255,
"step": 338
},
{
"epoch": 0.8238153098420413,
"grad_norm": 31.635699477838273,
"learning_rate": 0.00013649848380762513,
"loss": 1.1429,
"step": 339
},
{
"epoch": 0.8262454434993924,
"grad_norm": 39.6479124739029,
"learning_rate": 0.00013612416661871533,
"loss": 1.1609,
"step": 340
},
{
"epoch": 0.8286755771567437,
"grad_norm": 21.453228401011238,
"learning_rate": 0.0001357492664097534,
"loss": 1.1247,
"step": 341
},
{
"epoch": 0.8311057108140948,
"grad_norm": 28.514958428145494,
"learning_rate": 0.00013537378923137973,
"loss": 1.0845,
"step": 342
},
{
"epoch": 0.8335358444714459,
"grad_norm": 26.98663985253516,
"learning_rate": 0.00013499774114354655,
"loss": 1.1092,
"step": 343
},
{
"epoch": 0.8359659781287971,
"grad_norm": 30.76143424141064,
"learning_rate": 0.00013462112821542016,
"loss": 1.1759,
"step": 344
},
{
"epoch": 0.8383961117861483,
"grad_norm": 39.023771167108656,
"learning_rate": 0.0001342439565252831,
"loss": 1.1024,
"step": 345
},
{
"epoch": 0.8408262454434994,
"grad_norm": 29.787639099820225,
"learning_rate": 0.0001338662321604358,
"loss": 1.2141,
"step": 346
},
{
"epoch": 0.8432563791008505,
"grad_norm": 25.60634301240642,
"learning_rate": 0.00013348796121709862,
"loss": 1.1244,
"step": 347
},
{
"epoch": 0.8456865127582017,
"grad_norm": 76.98542857181108,
"learning_rate": 0.00013310914980031334,
"loss": 1.19,
"step": 348
},
{
"epoch": 0.8481166464155528,
"grad_norm": 110.28982985071892,
"learning_rate": 0.0001327298040238446,
"loss": 1.1295,
"step": 349
},
{
"epoch": 0.850546780072904,
"grad_norm": 22.610631125609732,
"learning_rate": 0.0001323499300100811,
"loss": 1.1445,
"step": 350
},
{
"epoch": 0.8529769137302552,
"grad_norm": 29.958515973723888,
"learning_rate": 0.00013196953388993726,
"loss": 1.2048,
"step": 351
},
{
"epoch": 0.8554070473876063,
"grad_norm": 30.691798031468103,
"learning_rate": 0.00013158862180275363,
"loss": 1.1628,
"step": 352
},
{
"epoch": 0.8578371810449574,
"grad_norm": 28.568576369680258,
"learning_rate": 0.00013120719989619833,
"loss": 1.0899,
"step": 353
},
{
"epoch": 0.8602673147023087,
"grad_norm": 42.12623456189728,
"learning_rate": 0.0001308252743261675,
"loss": 1.1451,
"step": 354
},
{
"epoch": 0.8626974483596598,
"grad_norm": 112.39248005736448,
"learning_rate": 0.00013044285125668614,
"loss": 1.154,
"step": 355
},
{
"epoch": 0.8651275820170109,
"grad_norm": 28.013602355549782,
"learning_rate": 0.0001300599368598086,
"loss": 1.1937,
"step": 356
},
{
"epoch": 0.8675577156743621,
"grad_norm": 27.763517972300694,
"learning_rate": 0.0001296765373155188,
"loss": 1.1243,
"step": 357
},
{
"epoch": 0.8699878493317132,
"grad_norm": 112.85815824767063,
"learning_rate": 0.0001292926588116308,
"loss": 1.1595,
"step": 358
},
{
"epoch": 0.8724179829890644,
"grad_norm": 27.085127886556087,
"learning_rate": 0.00012890830754368855,
"loss": 1.1196,
"step": 359
},
{
"epoch": 0.8748481166464156,
"grad_norm": 31.56336829128541,
"learning_rate": 0.00012852348971486617,
"loss": 1.1231,
"step": 360
},
{
"epoch": 0.8772782503037667,
"grad_norm": 31.904393738907178,
"learning_rate": 0.0001281382115358679,
"loss": 1.097,
"step": 361
},
{
"epoch": 0.8797083839611178,
"grad_norm": 25.034453894065827,
"learning_rate": 0.00012775247922482748,
"loss": 1.1246,
"step": 362
},
{
"epoch": 0.8821385176184691,
"grad_norm": 33.221958266501474,
"learning_rate": 0.0001273662990072083,
"loss": 1.1189,
"step": 363
},
{
"epoch": 0.8845686512758202,
"grad_norm": 26.638980136773224,
"learning_rate": 0.00012697967711570242,
"loss": 1.1315,
"step": 364
},
{
"epoch": 0.8869987849331713,
"grad_norm": 27.231479341362885,
"learning_rate": 0.00012659261979013043,
"loss": 1.1464,
"step": 365
},
{
"epoch": 0.8894289185905225,
"grad_norm": 19.654091006710207,
"learning_rate": 0.0001262051332773404,
"loss": 1.1271,
"step": 366
},
{
"epoch": 0.8918590522478737,
"grad_norm": 50.3934263865559,
"learning_rate": 0.00012581722383110718,
"loss": 1.1002,
"step": 367
},
{
"epoch": 0.8942891859052248,
"grad_norm": 20.25952031318632,
"learning_rate": 0.00012542889771203166,
"loss": 1.0629,
"step": 368
},
{
"epoch": 0.8967193195625759,
"grad_norm": 19.16914945262315,
"learning_rate": 0.00012504016118743935,
"loss": 1.1597,
"step": 369
},
{
"epoch": 0.8991494532199271,
"grad_norm": 35.65941460173898,
"learning_rate": 0.00012465102053127957,
"loss": 1.1501,
"step": 370
},
{
"epoch": 0.9015795868772782,
"grad_norm": 26.093269180565315,
"learning_rate": 0.00012426148202402404,
"loss": 1.1455,
"step": 371
},
{
"epoch": 0.9040097205346294,
"grad_norm": 30.928987547424892,
"learning_rate": 0.00012387155195256537,
"loss": 1.1392,
"step": 372
},
{
"epoch": 0.9064398541919806,
"grad_norm": 20.17512596846915,
"learning_rate": 0.00012348123661011601,
"loss": 1.1196,
"step": 373
},
{
"epoch": 0.9088699878493317,
"grad_norm": 24.380789157356805,
"learning_rate": 0.00012309054229610623,
"loss": 1.1,
"step": 374
},
{
"epoch": 0.9113001215066828,
"grad_norm": 95.49408387682203,
"learning_rate": 0.00012269947531608276,
"loss": 1.1825,
"step": 375
},
{
"epoch": 0.913730255164034,
"grad_norm": 23.635286340368726,
"learning_rate": 0.0001223080419816069,
"loss": 1.1717,
"step": 376
},
{
"epoch": 0.9161603888213852,
"grad_norm": 21.942478063568313,
"learning_rate": 0.00012191624861015254,
"loss": 1.1661,
"step": 377
},
{
"epoch": 0.9185905224787363,
"grad_norm": 74.12601397150299,
"learning_rate": 0.00012152410152500453,
"loss": 1.1967,
"step": 378
},
{
"epoch": 0.9210206561360875,
"grad_norm": 37.26720386499629,
"learning_rate": 0.00012113160705515625,
"loss": 1.1566,
"step": 379
},
{
"epoch": 0.9234507897934386,
"grad_norm": 34.080854733427635,
"learning_rate": 0.00012073877153520776,
"loss": 1.0847,
"step": 380
},
{
"epoch": 0.9258809234507898,
"grad_norm": 26.50842916877183,
"learning_rate": 0.0001203456013052634,
"loss": 1.0824,
"step": 381
},
{
"epoch": 0.928311057108141,
"grad_norm": 37.92039651416441,
"learning_rate": 0.00011995210271082944,
"loss": 1.1485,
"step": 382
},
{
"epoch": 0.9307411907654921,
"grad_norm": 38.56931832374284,
"learning_rate": 0.00011955828210271187,
"loss": 1.0737,
"step": 383
},
{
"epoch": 0.9331713244228432,
"grad_norm": 24.419015296791592,
"learning_rate": 0.0001191641458369136,
"loss": 1.1208,
"step": 384
},
{
"epoch": 0.9356014580801945,
"grad_norm": 28.75379656643836,
"learning_rate": 0.00011876970027453222,
"loss": 1.1071,
"step": 385
},
{
"epoch": 0.9380315917375456,
"grad_norm": 138.39305133994282,
"learning_rate": 0.00011837495178165706,
"loss": 1.1405,
"step": 386
},
{
"epoch": 0.9404617253948967,
"grad_norm": 22.200435229928654,
"learning_rate": 0.00011797990672926652,
"loss": 1.124,
"step": 387
},
{
"epoch": 0.9428918590522479,
"grad_norm": 40.21978055156661,
"learning_rate": 0.00011758457149312538,
"loss": 1.1875,
"step": 388
},
{
"epoch": 0.945321992709599,
"grad_norm": 23.592672098002485,
"learning_rate": 0.00011718895245368167,
"loss": 1.1748,
"step": 389
},
{
"epoch": 0.9477521263669502,
"grad_norm": 17.463183827323444,
"learning_rate": 0.00011679305599596393,
"loss": 1.1794,
"step": 390
},
{
"epoch": 0.9501822600243013,
"grad_norm": 36.219441964332646,
"learning_rate": 0.00011639688850947799,
"loss": 1.1459,
"step": 391
},
{
"epoch": 0.9526123936816525,
"grad_norm": 23.727472560980413,
"learning_rate": 0.00011600045638810386,
"loss": 1.076,
"step": 392
},
{
"epoch": 0.9550425273390036,
"grad_norm": 57.63284414960702,
"learning_rate": 0.00011560376602999272,
"loss": 1.1919,
"step": 393
},
{
"epoch": 0.9574726609963548,
"grad_norm": 40.23829998466358,
"learning_rate": 0.00011520682383746333,
"loss": 1.0701,
"step": 394
},
{
"epoch": 0.959902794653706,
"grad_norm": 58.2018640218209,
"learning_rate": 0.00011480963621689905,
"loss": 1.1745,
"step": 395
},
{
"epoch": 0.9623329283110571,
"grad_norm": 27.693448904288406,
"learning_rate": 0.00011441220957864421,
"loss": 1.1323,
"step": 396
},
{
"epoch": 0.9647630619684082,
"grad_norm": 34.94430005820724,
"learning_rate": 0.00011401455033690076,
"loss": 1.1497,
"step": 397
},
{
"epoch": 0.9671931956257594,
"grad_norm": 17.521922247865188,
"learning_rate": 0.00011361666490962468,
"loss": 1.1319,
"step": 398
},
{
"epoch": 0.9696233292831106,
"grad_norm": 25.886687159935246,
"learning_rate": 0.00011321855971842243,
"loss": 1.1418,
"step": 399
},
{
"epoch": 0.9720534629404617,
"grad_norm": 31.388154506614836,
"learning_rate": 0.00011282024118844738,
"loss": 1.1282,
"step": 400
},
{
"epoch": 0.9744835965978129,
"grad_norm": 27.458601253675347,
"learning_rate": 0.00011242171574829599,
"loss": 1.1647,
"step": 401
},
{
"epoch": 0.976913730255164,
"grad_norm": 25.922873022924257,
"learning_rate": 0.00011202298982990411,
"loss": 1.091,
"step": 402
},
{
"epoch": 0.9793438639125152,
"grad_norm": 20.129467589894766,
"learning_rate": 0.00011162406986844323,
"loss": 1.2,
"step": 403
},
{
"epoch": 0.9817739975698664,
"grad_norm": 25.11892123906363,
"learning_rate": 0.00011122496230221645,
"loss": 1.0731,
"step": 404
},
{
"epoch": 0.9842041312272175,
"grad_norm": 26.416884392453543,
"learning_rate": 0.00011082567357255484,
"loss": 1.1836,
"step": 405
},
{
"epoch": 0.9866342648845686,
"grad_norm": 18.768078773975784,
"learning_rate": 0.00011042621012371322,
"loss": 1.1275,
"step": 406
},
{
"epoch": 0.9890643985419199,
"grad_norm": 22.275756523796257,
"learning_rate": 0.00011002657840276627,
"loss": 1.1228,
"step": 407
},
{
"epoch": 0.991494532199271,
"grad_norm": 29.605335344828575,
"learning_rate": 0.00010962678485950455,
"loss": 1.0255,
"step": 408
},
{
"epoch": 0.9939246658566221,
"grad_norm": 41.1718200727633,
"learning_rate": 0.00010922683594633021,
"loss": 1.1876,
"step": 409
},
{
"epoch": 0.9963547995139733,
"grad_norm": 20.46397475257922,
"learning_rate": 0.00010882673811815304,
"loss": 1.1168,
"step": 410
},
{
"epoch": 0.9987849331713244,
"grad_norm": 21.084924025016928,
"learning_rate": 0.00010842649783228624,
"loss": 1.1948,
"step": 411
},
{
"epoch": 1.0,
"grad_norm": 21.084924025016928,
"learning_rate": 0.00010802612154834211,
"loss": 1.1076,
"step": 412
},
{
"epoch": 1.0,
"eval_loss": 1.121336579322815,
"eval_runtime": 52.7043,
"eval_samples_per_second": 14.116,
"eval_steps_per_second": 1.765,
"step": 412
},
{
"epoch": 1.0024301336573511,
"grad_norm": 35.25758968935371,
"learning_rate": 0.00010762561572812788,
"loss": 1.1335,
"step": 413
},
{
"epoch": 1.0048602673147022,
"grad_norm": 20.78715726366623,
"learning_rate": 0.0001072249868355415,
"loss": 1.1003,
"step": 414
},
{
"epoch": 1.0072904009720534,
"grad_norm": 31.01116633763719,
"learning_rate": 0.0001068242413364671,
"loss": 1.1225,
"step": 415
},
{
"epoch": 1.0097205346294047,
"grad_norm": 19.050638172672897,
"learning_rate": 0.00010642338569867086,
"loss": 1.0595,
"step": 416
},
{
"epoch": 1.0121506682867558,
"grad_norm": 41.54235389574412,
"learning_rate": 0.00010602242639169648,
"loss": 1.1719,
"step": 417
},
{
"epoch": 1.014580801944107,
"grad_norm": 41.34218206464363,
"learning_rate": 0.00010562136988676078,
"loss": 1.1292,
"step": 418
},
{
"epoch": 1.017010935601458,
"grad_norm": 32.436985934581934,
"learning_rate": 0.0001052202226566494,
"loss": 1.1244,
"step": 419
},
{
"epoch": 1.0194410692588092,
"grad_norm": 19.631825450596665,
"learning_rate": 0.0001048189911756121,
"loss": 1.1323,
"step": 420
},
{
"epoch": 1.0218712029161603,
"grad_norm": 23.275029440216805,
"learning_rate": 0.00010441768191925847,
"loss": 1.1605,
"step": 421
},
{
"epoch": 1.0243013365735116,
"grad_norm": 21.44161988455765,
"learning_rate": 0.0001040163013644533,
"loss": 1.0886,
"step": 422
},
{
"epoch": 1.0267314702308628,
"grad_norm": 31.9765167465431,
"learning_rate": 0.00010361485598921212,
"loss": 1.1378,
"step": 423
},
{
"epoch": 1.0291616038882139,
"grad_norm": 22.340741556027833,
"learning_rate": 0.00010321335227259661,
"loss": 1.1278,
"step": 424
},
{
"epoch": 1.031591737545565,
"grad_norm": 29.27286563037163,
"learning_rate": 0.00010281179669461005,
"loss": 1.1186,
"step": 425
},
{
"epoch": 1.034021871202916,
"grad_norm": 65.85877610734141,
"learning_rate": 0.00010241019573609269,
"loss": 1.1673,
"step": 426
},
{
"epoch": 1.0364520048602672,
"grad_norm": 35.173784527846884,
"learning_rate": 0.00010200855587861724,
"loss": 1.0903,
"step": 427
},
{
"epoch": 1.0388821385176186,
"grad_norm": 29.91546238299385,
"learning_rate": 0.00010160688360438419,
"loss": 1.0884,
"step": 428
},
{
"epoch": 1.0413122721749697,
"grad_norm": 26.873308685100223,
"learning_rate": 0.0001012051853961172,
"loss": 1.1296,
"step": 429
},
{
"epoch": 1.0437424058323208,
"grad_norm": 25.90622275527891,
"learning_rate": 0.00010080346773695853,
"loss": 1.1349,
"step": 430
},
{
"epoch": 1.046172539489672,
"grad_norm": 21.388851321680434,
"learning_rate": 0.00010040173711036431,
"loss": 1.0947,
"step": 431
},
{
"epoch": 1.048602673147023,
"grad_norm": 31.206506843880053,
"learning_rate": 0.0001,
"loss": 1.1541,
"step": 432
},
{
"epoch": 1.0510328068043742,
"grad_norm": 19.486767323523555,
"learning_rate": 9.959826288963571e-05,
"loss": 1.1574,
"step": 433
},
{
"epoch": 1.0534629404617255,
"grad_norm": 102.81325604770561,
"learning_rate": 9.919653226304148e-05,
"loss": 1.1762,
"step": 434
},
{
"epoch": 1.0558930741190766,
"grad_norm": 17.18170280255333,
"learning_rate": 9.879481460388282e-05,
"loss": 1.1208,
"step": 435
},
{
"epoch": 1.0583232077764277,
"grad_norm": 29.88292309614927,
"learning_rate": 9.839311639561583e-05,
"loss": 1.1114,
"step": 436
},
{
"epoch": 1.0607533414337789,
"grad_norm": 23.50392429976475,
"learning_rate": 9.799144412138275e-05,
"loss": 1.2026,
"step": 437
},
{
"epoch": 1.06318347509113,
"grad_norm": 24.794408487434744,
"learning_rate": 9.758980426390732e-05,
"loss": 1.1587,
"step": 438
},
{
"epoch": 1.065613608748481,
"grad_norm": 38.726295800289655,
"learning_rate": 9.718820330538998e-05,
"loss": 1.14,
"step": 439
},
{
"epoch": 1.0680437424058322,
"grad_norm": 31.152256057732977,
"learning_rate": 9.678664772740343e-05,
"loss": 1.0882,
"step": 440
},
{
"epoch": 1.0704738760631836,
"grad_norm": 65.73380095432839,
"learning_rate": 9.638514401078788e-05,
"loss": 1.1213,
"step": 441
},
{
"epoch": 1.0729040097205347,
"grad_norm": 69.07317297910537,
"learning_rate": 9.598369863554673e-05,
"loss": 1.1285,
"step": 442
},
{
"epoch": 1.0753341433778858,
"grad_norm": 62.55969576940585,
"learning_rate": 9.558231808074156e-05,
"loss": 1.1252,
"step": 443
},
{
"epoch": 1.077764277035237,
"grad_norm": 26.35106444530265,
"learning_rate": 9.51810088243879e-05,
"loss": 1.108,
"step": 444
},
{
"epoch": 1.080194410692588,
"grad_norm": 76.70006955440516,
"learning_rate": 9.477977734335061e-05,
"loss": 1.1144,
"step": 445
},
{
"epoch": 1.0826245443499392,
"grad_norm": 22.376983523395264,
"learning_rate": 9.437863011323922e-05,
"loss": 1.173,
"step": 446
},
{
"epoch": 1.0850546780072905,
"grad_norm": 33.51322062360491,
"learning_rate": 9.397757360830353e-05,
"loss": 1.089,
"step": 447
},
{
"epoch": 1.0874848116646416,
"grad_norm": 24.87252097324779,
"learning_rate": 9.357661430132915e-05,
"loss": 1.098,
"step": 448
},
{
"epoch": 1.0899149453219927,
"grad_norm": 48.95371674408058,
"learning_rate": 9.317575866353292e-05,
"loss": 1.0491,
"step": 449
},
{
"epoch": 1.0923450789793439,
"grad_norm": 25.50740340531524,
"learning_rate": 9.277501316445854e-05,
"loss": 1.0939,
"step": 450
},
{
"epoch": 1.094775212636695,
"grad_norm": 27.60998778610316,
"learning_rate": 9.23743842718721e-05,
"loss": 1.1564,
"step": 451
},
{
"epoch": 1.097205346294046,
"grad_norm": 63.99226186124907,
"learning_rate": 9.197387845165793e-05,
"loss": 1.1088,
"step": 452
},
{
"epoch": 1.0996354799513974,
"grad_norm": 36.441157466567596,
"learning_rate": 9.157350216771378e-05,
"loss": 1.0897,
"step": 453
},
{
"epoch": 1.1020656136087486,
"grad_norm": 32.32587774153429,
"learning_rate": 9.117326188184695e-05,
"loss": 1.1285,
"step": 454
},
{
"epoch": 1.1044957472660997,
"grad_norm": 33.39257750037465,
"learning_rate": 9.077316405366981e-05,
"loss": 1.1568,
"step": 455
},
{
"epoch": 1.1069258809234508,
"grad_norm": 45.03485873480868,
"learning_rate": 9.037321514049548e-05,
"loss": 1.0791,
"step": 456
},
{
"epoch": 1.109356014580802,
"grad_norm": 35.1451377482015,
"learning_rate": 8.997342159723371e-05,
"loss": 1.1243,
"step": 457
},
{
"epoch": 1.111786148238153,
"grad_norm": 67.01465976966,
"learning_rate": 8.957378987628682e-05,
"loss": 1.0978,
"step": 458
},
{
"epoch": 1.1142162818955041,
"grad_norm": 33.057859846207634,
"learning_rate": 8.917432642744518e-05,
"loss": 1.1431,
"step": 459
},
{
"epoch": 1.1166464155528555,
"grad_norm": 30.602840863536635,
"learning_rate": 8.877503769778356e-05,
"loss": 1.1157,
"step": 460
},
{
"epoch": 1.1190765492102066,
"grad_norm": 38.088467248288964,
"learning_rate": 8.83759301315568e-05,
"loss": 1.0776,
"step": 461
},
{
"epoch": 1.1215066828675577,
"grad_norm": 66.03671829863266,
"learning_rate": 8.797701017009591e-05,
"loss": 1.1468,
"step": 462
},
{
"epoch": 1.1239368165249088,
"grad_norm": 32.293691874682686,
"learning_rate": 8.757828425170404e-05,
"loss": 1.1115,
"step": 463
},
{
"epoch": 1.12636695018226,
"grad_norm": 32.70707175332633,
"learning_rate": 8.717975881155261e-05,
"loss": 1.1677,
"step": 464
},
{
"epoch": 1.128797083839611,
"grad_norm": 48.79069594971439,
"learning_rate": 8.678144028157759e-05,
"loss": 1.1341,
"step": 465
},
{
"epoch": 1.1312272174969624,
"grad_norm": 37.52808559072613,
"learning_rate": 8.638333509037536e-05,
"loss": 1.1414,
"step": 466
},
{
"epoch": 1.1336573511543135,
"grad_norm": 27.096068124970536,
"learning_rate": 8.598544966309925e-05,
"loss": 1.1719,
"step": 467
},
{
"epoch": 1.1360874848116647,
"grad_norm": 16.019227077248434,
"learning_rate": 8.55877904213558e-05,
"loss": 1.1148,
"step": 468
},
{
"epoch": 1.1385176184690158,
"grad_norm": 29.861941956913498,
"learning_rate": 8.519036378310096e-05,
"loss": 1.1486,
"step": 469
},
{
"epoch": 1.140947752126367,
"grad_norm": 23.058998452019107,
"learning_rate": 8.47931761625367e-05,
"loss": 1.0745,
"step": 470
},
{
"epoch": 1.143377885783718,
"grad_norm": 24.486692418227875,
"learning_rate": 8.43962339700073e-05,
"loss": 1.1333,
"step": 471
},
{
"epoch": 1.1458080194410694,
"grad_norm": 31.632544516924323,
"learning_rate": 8.399954361189615e-05,
"loss": 1.1565,
"step": 472
},
{
"epoch": 1.1482381530984205,
"grad_norm": 21.67735267443374,
"learning_rate": 8.360311149052205e-05,
"loss": 1.109,
"step": 473
},
{
"epoch": 1.1506682867557716,
"grad_norm": 29.096918560226527,
"learning_rate": 8.320694400403606e-05,
"loss": 1.1517,
"step": 474
},
{
"epoch": 1.1530984204131227,
"grad_norm": 46.067313216206955,
"learning_rate": 8.281104754631835e-05,
"loss": 1.1043,
"step": 475
},
{
"epoch": 1.1555285540704738,
"grad_norm": 30.84953769166141,
"learning_rate": 8.241542850687465e-05,
"loss": 1.1081,
"step": 476
},
{
"epoch": 1.157958687727825,
"grad_norm": 39.34158523904847,
"learning_rate": 8.20200932707335e-05,
"loss": 1.1787,
"step": 477
},
{
"epoch": 1.160388821385176,
"grad_norm": 39.14663302484904,
"learning_rate": 8.162504821834295e-05,
"loss": 1.202,
"step": 478
},
{
"epoch": 1.1628189550425274,
"grad_norm": 49.7279004249915,
"learning_rate": 8.123029972546781e-05,
"loss": 1.1439,
"step": 479
},
{
"epoch": 1.1652490886998785,
"grad_norm": 35.49897960878779,
"learning_rate": 8.083585416308642e-05,
"loss": 1.0741,
"step": 480
},
{
"epoch": 1.1676792223572297,
"grad_norm": 31.306252618855535,
"learning_rate": 8.044171789728816e-05,
"loss": 1.0697,
"step": 481
},
{
"epoch": 1.1701093560145808,
"grad_norm": 22.40745672651249,
"learning_rate": 8.004789728917059e-05,
"loss": 1.1498,
"step": 482
},
{
"epoch": 1.172539489671932,
"grad_norm": 32.19326746671122,
"learning_rate": 7.965439869473664e-05,
"loss": 1.1392,
"step": 483
},
{
"epoch": 1.1749696233292832,
"grad_norm": 33.66876390791385,
"learning_rate": 7.926122846479224e-05,
"loss": 1.1049,
"step": 484
},
{
"epoch": 1.1773997569866343,
"grad_norm": 35.43357233261174,
"learning_rate": 7.886839294484377e-05,
"loss": 1.0467,
"step": 485
},
{
"epoch": 1.1798298906439855,
"grad_norm": 50.660998166256256,
"learning_rate": 7.84758984749955e-05,
"loss": 1.1244,
"step": 486
},
{
"epoch": 1.1822600243013366,
"grad_norm": 41.356845334605936,
"learning_rate": 7.808375138984745e-05,
"loss": 1.1279,
"step": 487
},
{
"epoch": 1.1846901579586877,
"grad_norm": 22.947663723281487,
"learning_rate": 7.769195801839313e-05,
"loss": 1.0787,
"step": 488
},
{
"epoch": 1.1871202916160388,
"grad_norm": 36.434647074399905,
"learning_rate": 7.730052468391725e-05,
"loss": 1.1148,
"step": 489
},
{
"epoch": 1.18955042527339,
"grad_norm": 75.94549877059467,
"learning_rate": 7.690945770389377e-05,
"loss": 1.1127,
"step": 490
},
{
"epoch": 1.1919805589307413,
"grad_norm": 68.03126664734435,
"learning_rate": 7.6518763389884e-05,
"loss": 1.1672,
"step": 491
},
{
"epoch": 1.1944106925880924,
"grad_norm": 40.15361719091623,
"learning_rate": 7.612844804743466e-05,
"loss": 1.0962,
"step": 492
},
{
"epoch": 1.1968408262454435,
"grad_norm": 105.80023571763755,
"learning_rate": 7.573851797597602e-05,
"loss": 1.1091,
"step": 493
},
{
"epoch": 1.1992709599027946,
"grad_norm": 41.84401502420881,
"learning_rate": 7.534897946872042e-05,
"loss": 1.1359,
"step": 494
},
{
"epoch": 1.2017010935601458,
"grad_norm": 21.985533615468846,
"learning_rate": 7.495983881256067e-05,
"loss": 1.1024,
"step": 495
},
{
"epoch": 1.2041312272174969,
"grad_norm": 23.02649898605792,
"learning_rate": 7.457110228796838e-05,
"loss": 1.1089,
"step": 496
},
{
"epoch": 1.206561360874848,
"grad_norm": 74.4950498938832,
"learning_rate": 7.418277616889282e-05,
"loss": 1.0439,
"step": 497
},
{
"epoch": 1.2089914945321993,
"grad_norm": 27.637660484960865,
"learning_rate": 7.379486672265964e-05,
"loss": 1.1453,
"step": 498
},
{
"epoch": 1.2114216281895505,
"grad_norm": 34.98561655821008,
"learning_rate": 7.340738020986961e-05,
"loss": 1.139,
"step": 499
},
{
"epoch": 1.2138517618469016,
"grad_norm": 28.47627677351389,
"learning_rate": 7.302032288429756e-05,
"loss": 1.0623,
"step": 500
},
{
"epoch": 1.2162818955042527,
"grad_norm": 39.551486186427596,
"learning_rate": 7.263370099279172e-05,
"loss": 1.1277,
"step": 501
},
{
"epoch": 1.2187120291616038,
"grad_norm": 44.12973085459368,
"learning_rate": 7.224752077517253e-05,
"loss": 1.1768,
"step": 502
},
{
"epoch": 1.2211421628189552,
"grad_norm": 84.84836585196132,
"learning_rate": 7.186178846413214e-05,
"loss": 1.1892,
"step": 503
},
{
"epoch": 1.2235722964763063,
"grad_norm": 34.94807915131505,
"learning_rate": 7.147651028513383e-05,
"loss": 1.1108,
"step": 504
},
{
"epoch": 1.2260024301336574,
"grad_norm": 46.19847384406232,
"learning_rate": 7.109169245631149e-05,
"loss": 1.0956,
"step": 505
},
{
"epoch": 1.2284325637910085,
"grad_norm": 38.58484473058957,
"learning_rate": 7.070734118836925e-05,
"loss": 1.1175,
"step": 506
},
{
"epoch": 1.2308626974483596,
"grad_norm": 37.84739298111386,
"learning_rate": 7.032346268448118e-05,
"loss": 1.1411,
"step": 507
},
{
"epoch": 1.2332928311057108,
"grad_norm": 53.5471335398439,
"learning_rate": 6.994006314019141e-05,
"loss": 1.1332,
"step": 508
},
{
"epoch": 1.2357229647630619,
"grad_norm": 91.55067777365485,
"learning_rate": 6.955714874331387e-05,
"loss": 1.1205,
"step": 509
},
{
"epoch": 1.2381530984204132,
"grad_norm": 27.05333642785952,
"learning_rate": 6.917472567383252e-05,
"loss": 1.099,
"step": 510
},
{
"epoch": 1.2405832320777643,
"grad_norm": 24.519879042487336,
"learning_rate": 6.87928001038017e-05,
"loss": 1.1401,
"step": 511
},
{
"epoch": 1.2430133657351154,
"grad_norm": 33.763495598365786,
"learning_rate": 6.84113781972464e-05,
"loss": 1.2058,
"step": 512
},
{
"epoch": 1.2454434993924666,
"grad_norm": 34.49114206138826,
"learning_rate": 6.803046611006278e-05,
"loss": 1.1044,
"step": 513
},
{
"epoch": 1.2478736330498177,
"grad_norm": 74.20211157975073,
"learning_rate": 6.765006998991888e-05,
"loss": 1.111,
"step": 514
},
{
"epoch": 1.250303766707169,
"grad_norm": 32.30436806042553,
"learning_rate": 6.727019597615545e-05,
"loss": 1.1063,
"step": 515
},
{
"epoch": 1.250303766707169,
"eval_loss": 1.1128273010253906,
"eval_runtime": 53.4998,
"eval_samples_per_second": 13.907,
"eval_steps_per_second": 1.738,
"step": 515
},
{
"epoch": 1.25273390036452,
"grad_norm": 42.104054612880084,
"learning_rate": 6.689085019968669e-05,
"loss": 1.1315,
"step": 516
},
{
"epoch": 1.2551640340218713,
"grad_norm": 25.66097714624212,
"learning_rate": 6.651203878290139e-05,
"loss": 1.0916,
"step": 517
},
{
"epoch": 1.2575941676792224,
"grad_norm": 35.12310576456352,
"learning_rate": 6.613376783956423e-05,
"loss": 1.0699,
"step": 518
},
{
"epoch": 1.2600243013365735,
"grad_norm": 34.172951559594566,
"learning_rate": 6.575604347471695e-05,
"loss": 1.1412,
"step": 519
},
{
"epoch": 1.2624544349939246,
"grad_norm": 54.373563773275116,
"learning_rate": 6.537887178457984e-05,
"loss": 1.1255,
"step": 520
},
{
"epoch": 1.2648845686512757,
"grad_norm": 33.806385046788755,
"learning_rate": 6.500225885645346e-05,
"loss": 1.101,
"step": 521
},
{
"epoch": 1.267314702308627,
"grad_norm": 34.17813695957543,
"learning_rate": 6.46262107686203e-05,
"loss": 1.1226,
"step": 522
},
{
"epoch": 1.2697448359659782,
"grad_norm": 24.68048087106548,
"learning_rate": 6.425073359024663e-05,
"loss": 1.1787,
"step": 523
},
{
"epoch": 1.2721749696233293,
"grad_norm": 32.78749757697808,
"learning_rate": 6.387583338128471e-05,
"loss": 1.0541,
"step": 524
},
{
"epoch": 1.2746051032806804,
"grad_norm": 30.906673844090044,
"learning_rate": 6.350151619237488e-05,
"loss": 1.0964,
"step": 525
},
{
"epoch": 1.2770352369380316,
"grad_norm": 32.571858392892736,
"learning_rate": 6.312778806474795e-05,
"loss": 1.1251,
"step": 526
},
{
"epoch": 1.2794653705953827,
"grad_norm": 43.02428916532565,
"learning_rate": 6.275465503012751e-05,
"loss": 1.0473,
"step": 527
},
{
"epoch": 1.2818955042527338,
"grad_norm": 60.93587506764561,
"learning_rate": 6.2382123110633e-05,
"loss": 1.078,
"step": 528
},
{
"epoch": 1.2843256379100851,
"grad_norm": 64.6934775930251,
"learning_rate": 6.201019831868208e-05,
"loss": 1.0904,
"step": 529
},
{
"epoch": 1.2867557715674363,
"grad_norm": 32.977077613035426,
"learning_rate": 6.16388866568938e-05,
"loss": 1.0705,
"step": 530
},
{
"epoch": 1.2891859052247874,
"grad_norm": 28.27407310492513,
"learning_rate": 6.126819411799175e-05,
"loss": 1.1252,
"step": 531
},
{
"epoch": 1.2916160388821385,
"grad_norm": 33.73515826089828,
"learning_rate": 6.0898126684707265e-05,
"loss": 1.1262,
"step": 532
},
{
"epoch": 1.2940461725394896,
"grad_norm": 25.370361818959903,
"learning_rate": 6.052869032968285e-05,
"loss": 1.0845,
"step": 533
},
{
"epoch": 1.296476306196841,
"grad_norm": 37.389287060597105,
"learning_rate": 6.015989101537586e-05,
"loss": 1.1352,
"step": 534
},
{
"epoch": 1.2989064398541919,
"grad_norm": 39.04755104008223,
"learning_rate": 5.979173469396227e-05,
"loss": 1.1538,
"step": 535
},
{
"epoch": 1.3013365735115432,
"grad_norm": 34.33676719612293,
"learning_rate": 5.9424227307240554e-05,
"loss": 1.1725,
"step": 536
},
{
"epoch": 1.3037667071688943,
"grad_norm": 64.66076997769457,
"learning_rate": 5.905737478653572e-05,
"loss": 1.1146,
"step": 537
},
{
"epoch": 1.3061968408262454,
"grad_norm": 48.043289790386325,
"learning_rate": 5.8691183052603834e-05,
"loss": 1.1035,
"step": 538
},
{
"epoch": 1.3086269744835966,
"grad_norm": 49.08397341659928,
"learning_rate": 5.83256580155362e-05,
"loss": 1.1653,
"step": 539
},
{
"epoch": 1.3110571081409477,
"grad_norm": 46.688886812303515,
"learning_rate": 5.796080557466406e-05,
"loss": 1.1328,
"step": 540
},
{
"epoch": 1.313487241798299,
"grad_norm": 27.503882325413493,
"learning_rate": 5.7596631618463514e-05,
"loss": 1.1019,
"step": 541
},
{
"epoch": 1.3159173754556501,
"grad_norm": 48.88974129574653,
"learning_rate": 5.723314202446026e-05,
"loss": 1.121,
"step": 542
},
{
"epoch": 1.3183475091130012,
"grad_norm": 28.105881157995345,
"learning_rate": 5.687034265913485e-05,
"loss": 1.0898,
"step": 543
},
{
"epoch": 1.3207776427703524,
"grad_norm": 30.410731278414804,
"learning_rate": 5.6508239377828034e-05,
"loss": 1.07,
"step": 544
},
{
"epoch": 1.3232077764277035,
"grad_norm": 38.08324176765882,
"learning_rate": 5.614683802464631e-05,
"loss": 1.1503,
"step": 545
},
{
"epoch": 1.3256379100850546,
"grad_norm": 46.28952293745534,
"learning_rate": 5.578614443236738e-05,
"loss": 1.1282,
"step": 546
},
{
"epoch": 1.3280680437424057,
"grad_norm": 68.2597453597135,
"learning_rate": 5.542616442234618e-05,
"loss": 1.1373,
"step": 547
},
{
"epoch": 1.330498177399757,
"grad_norm": 30.351663825014143,
"learning_rate": 5.5066903804421025e-05,
"loss": 1.1633,
"step": 548
},
{
"epoch": 1.3329283110571082,
"grad_norm": 38.2711285636887,
"learning_rate": 5.470836837681954e-05,
"loss": 1.1604,
"step": 549
},
{
"epoch": 1.3353584447144593,
"grad_norm": 35.64230091531108,
"learning_rate": 5.4350563926065404e-05,
"loss": 1.0564,
"step": 550
},
{
"epoch": 1.3377885783718104,
"grad_norm": 44.869816046925564,
"learning_rate": 5.399349622688479e-05,
"loss": 1.1376,
"step": 551
},
{
"epoch": 1.3402187120291615,
"grad_norm": 26.681037126315633,
"learning_rate": 5.3637171042113146e-05,
"loss": 1.0867,
"step": 552
},
{
"epoch": 1.3426488456865129,
"grad_norm": 34.6124686262535,
"learning_rate": 5.32815941226022e-05,
"loss": 1.0474,
"step": 553
},
{
"epoch": 1.3450789793438638,
"grad_norm": 35.92639009060983,
"learning_rate": 5.2926771207127254e-05,
"loss": 1.0958,
"step": 554
},
{
"epoch": 1.3475091130012151,
"grad_norm": 39.08938922562224,
"learning_rate": 5.2572708022294504e-05,
"loss": 1.074,
"step": 555
},
{
"epoch": 1.3499392466585662,
"grad_norm": 76.06708166273745,
"learning_rate": 5.2219410282448514e-05,
"loss": 1.0865,
"step": 556
},
{
"epoch": 1.3523693803159174,
"grad_norm": 74.14222265654887,
"learning_rate": 5.1866883689580056e-05,
"loss": 1.1567,
"step": 557
},
{
"epoch": 1.3547995139732685,
"grad_norm": 34.82441678662901,
"learning_rate": 5.151513393323426e-05,
"loss": 1.0802,
"step": 558
},
{
"epoch": 1.3572296476306196,
"grad_norm": 75.53504846566143,
"learning_rate": 5.116416669041843e-05,
"loss": 1.0623,
"step": 559
},
{
"epoch": 1.359659781287971,
"grad_norm": 29.423475817434785,
"learning_rate": 5.0813987625510775e-05,
"loss": 1.077,
"step": 560
},
{
"epoch": 1.362089914945322,
"grad_norm": 44.607486168434534,
"learning_rate": 5.046460239016879e-05,
"loss": 1.096,
"step": 561
},
{
"epoch": 1.3645200486026732,
"grad_norm": 40.684125033315404,
"learning_rate": 5.011601662323807e-05,
"loss": 1.148,
"step": 562
},
{
"epoch": 1.3669501822600243,
"grad_norm": 47.33103026318705,
"learning_rate": 4.976823595066128e-05,
"loss": 1.1712,
"step": 563
},
{
"epoch": 1.3693803159173754,
"grad_norm": 51.17017845058186,
"learning_rate": 4.9421265985387476e-05,
"loss": 1.1287,
"step": 564
},
{
"epoch": 1.3718104495747265,
"grad_norm": 50.76665552103517,
"learning_rate": 4.907511232728145e-05,
"loss": 1.1156,
"step": 565
},
{
"epoch": 1.3742405832320777,
"grad_norm": 32.6007633025874,
"learning_rate": 4.872978056303327e-05,
"loss": 1.1477,
"step": 566
},
{
"epoch": 1.376670716889429,
"grad_norm": 29.696241441710107,
"learning_rate": 4.8385276266068146e-05,
"loss": 1.0874,
"step": 567
},
{
"epoch": 1.37910085054678,
"grad_norm": 58.96613500379004,
"learning_rate": 4.804160499645667e-05,
"loss": 1.0616,
"step": 568
},
{
"epoch": 1.3815309842041312,
"grad_norm": 37.104100020310334,
"learning_rate": 4.7698772300824756e-05,
"loss": 1.0878,
"step": 569
},
{
"epoch": 1.3839611178614823,
"grad_norm": 51.735902941979305,
"learning_rate": 4.735678371226441e-05,
"loss": 1.0836,
"step": 570
},
{
"epoch": 1.3863912515188335,
"grad_norm": 55.49190976804079,
"learning_rate": 4.7015644750244306e-05,
"loss": 1.0473,
"step": 571
},
{
"epoch": 1.3888213851761848,
"grad_norm": 34.27972449829039,
"learning_rate": 4.6675360920520625e-05,
"loss": 1.0723,
"step": 572
},
{
"epoch": 1.391251518833536,
"grad_norm": 28.508157856527724,
"learning_rate": 4.6335937715048306e-05,
"loss": 1.0723,
"step": 573
},
{
"epoch": 1.393681652490887,
"grad_norm": 106.84009565003795,
"learning_rate": 4.599738061189244e-05,
"loss": 1.149,
"step": 574
},
{
"epoch": 1.3961117861482382,
"grad_norm": 50.543394606036294,
"learning_rate": 4.565969507513981e-05,
"loss": 1.0991,
"step": 575
},
{
"epoch": 1.3985419198055893,
"grad_norm": 30.409124335052745,
"learning_rate": 4.532288655481062e-05,
"loss": 1.1157,
"step": 576
},
{
"epoch": 1.4009720534629404,
"grad_norm": 89.92061876679301,
"learning_rate": 4.498696048677059e-05,
"loss": 1.1526,
"step": 577
},
{
"epoch": 1.4034021871202915,
"grad_norm": 84.27775422110602,
"learning_rate": 4.465192229264337e-05,
"loss": 1.1418,
"step": 578
},
{
"epoch": 1.4058323207776429,
"grad_norm": 40.7815489623743,
"learning_rate": 4.4317777379722866e-05,
"loss": 1.0831,
"step": 579
},
{
"epoch": 1.408262454434994,
"grad_norm": 66.6911504313278,
"learning_rate": 4.3984531140885943e-05,
"loss": 1.1088,
"step": 580
},
{
"epoch": 1.410692588092345,
"grad_norm": 137.00882181835217,
"learning_rate": 4.365218895450558e-05,
"loss": 1.1089,
"step": 581
},
{
"epoch": 1.4131227217496962,
"grad_norm": 41.139168895296855,
"learning_rate": 4.332075618436386e-05,
"loss": 1.1603,
"step": 582
},
{
"epoch": 1.4155528554070473,
"grad_norm": 35.443969765428506,
"learning_rate": 4.29902381795655e-05,
"loss": 1.0301,
"step": 583
},
{
"epoch": 1.4179829890643987,
"grad_norm": 32.931514576694674,
"learning_rate": 4.266064027445155e-05,
"loss": 1.1016,
"step": 584
},
{
"epoch": 1.4204131227217496,
"grad_norm": 64.21015694858382,
"learning_rate": 4.2331967788513295e-05,
"loss": 1.0789,
"step": 585
},
{
"epoch": 1.422843256379101,
"grad_norm": 84.13251752827094,
"learning_rate": 4.200422602630629e-05,
"loss": 1.1573,
"step": 586
},
{
"epoch": 1.425273390036452,
"grad_norm": 53.61636603108024,
"learning_rate": 4.167742027736482e-05,
"loss": 1.0942,
"step": 587
},
{
"epoch": 1.4277035236938032,
"grad_norm": 133.20877569415256,
"learning_rate": 4.135155581611661e-05,
"loss": 1.0877,
"step": 588
},
{
"epoch": 1.4301336573511543,
"grad_norm": 49.85736467319357,
"learning_rate": 4.102663790179764e-05,
"loss": 1.0619,
"step": 589
},
{
"epoch": 1.4325637910085054,
"grad_norm": 91.13217639524017,
"learning_rate": 4.070267177836712e-05,
"loss": 1.1093,
"step": 590
},
{
"epoch": 1.4349939246658567,
"grad_norm": 49.25558128250457,
"learning_rate": 4.037966267442315e-05,
"loss": 1.1344,
"step": 591
},
{
"epoch": 1.4374240583232079,
"grad_norm": 95.87244356130316,
"learning_rate": 4.005761580311805e-05,
"loss": 1.0929,
"step": 592
},
{
"epoch": 1.439854191980559,
"grad_norm": 74.28903671045653,
"learning_rate": 3.973653636207437e-05,
"loss": 1.1263,
"step": 593
},
{
"epoch": 1.44228432563791,
"grad_norm": 53.99454529785116,
"learning_rate": 3.941642953330103e-05,
"loss": 1.0916,
"step": 594
},
{
"epoch": 1.4447144592952612,
"grad_norm": 113.26015597338959,
"learning_rate": 3.909730048310962e-05,
"loss": 1.1009,
"step": 595
},
{
"epoch": 1.4471445929526123,
"grad_norm": 134.4015550981493,
"learning_rate": 3.8779154362030986e-05,
"loss": 1.1351,
"step": 596
},
{
"epoch": 1.4495747266099634,
"grad_norm": 90.61611981238187,
"learning_rate": 3.846199630473216e-05,
"loss": 1.0827,
"step": 597
},
{
"epoch": 1.4520048602673148,
"grad_norm": 56.55050791518521,
"learning_rate": 3.814583142993352e-05,
"loss": 1.1145,
"step": 598
},
{
"epoch": 1.454434993924666,
"grad_norm": 265.6916535243014,
"learning_rate": 3.7830664840326145e-05,
"loss": 1.1459,
"step": 599
},
{
"epoch": 1.456865127582017,
"grad_norm": 72.81191101030372,
"learning_rate": 3.7516501622489367e-05,
"loss": 1.0903,
"step": 600
},
{
"epoch": 1.4592952612393681,
"grad_norm": 58.309143549086556,
"learning_rate": 3.720334684680889e-05,
"loss": 1.1041,
"step": 601
},
{
"epoch": 1.4617253948967193,
"grad_norm": 35.19205741792398,
"learning_rate": 3.689120556739475e-05,
"loss": 1.1523,
"step": 602
},
{
"epoch": 1.4641555285540706,
"grad_norm": 88.97226951757321,
"learning_rate": 3.6580082821999786e-05,
"loss": 1.1117,
"step": 603
},
{
"epoch": 1.4665856622114215,
"grad_norm": 64.50873879301322,
"learning_rate": 3.6269983631938475e-05,
"loss": 1.1256,
"step": 604
},
{
"epoch": 1.4690157958687728,
"grad_norm": 78.10556611104111,
"learning_rate": 3.596091300200578e-05,
"loss": 1.0834,
"step": 605
},
{
"epoch": 1.471445929526124,
"grad_norm": 69.38449946362529,
"learning_rate": 3.565287592039628e-05,
"loss": 1.1026,
"step": 606
},
{
"epoch": 1.473876063183475,
"grad_norm": 79.60241521456905,
"learning_rate": 3.534587735862391e-05,
"loss": 1.0456,
"step": 607
},
{
"epoch": 1.4763061968408262,
"grad_norm": 89.68581306071424,
"learning_rate": 3.503992227144147e-05,
"loss": 1.0809,
"step": 608
},
{
"epoch": 1.4787363304981773,
"grad_norm": 68.570527237558,
"learning_rate": 3.473501559676088e-05,
"loss": 1.0754,
"step": 609
},
{
"epoch": 1.4811664641555287,
"grad_norm": 54.94762317625427,
"learning_rate": 3.4431162255573245e-05,
"loss": 1.1751,
"step": 610
},
{
"epoch": 1.4835965978128798,
"grad_norm": 109.12821602719706,
"learning_rate": 3.4128367151869714e-05,
"loss": 1.1055,
"step": 611
},
{
"epoch": 1.486026731470231,
"grad_norm": 198.79030469542352,
"learning_rate": 3.3826635172562094e-05,
"loss": 1.1369,
"step": 612
},
{
"epoch": 1.488456865127582,
"grad_norm": 62.002866716809,
"learning_rate": 3.352597118740404e-05,
"loss": 1.1611,
"step": 613
},
{
"epoch": 1.4908869987849331,
"grad_norm": 79.21193137029579,
"learning_rate": 3.3226380048912585e-05,
"loss": 1.1688,
"step": 614
},
{
"epoch": 1.4933171324422843,
"grad_norm": 68.6722934326242,
"learning_rate": 3.292786659228973e-05,
"loss": 1.1248,
"step": 615
},
{
"epoch": 1.4957472660996354,
"grad_norm": 104.34122241838278,
"learning_rate": 3.263043563534428e-05,
"loss": 1.1425,
"step": 616
},
{
"epoch": 1.4981773997569867,
"grad_norm": 86.43862038340298,
"learning_rate": 3.233409197841437e-05,
"loss": 1.0562,
"step": 617
},
{
"epoch": 1.5006075334143378,
"grad_norm": 79.74137751394451,
"learning_rate": 3.2038840404289705e-05,
"loss": 1.1214,
"step": 618
},
{
"epoch": 1.5006075334143378,
"eval_loss": 1.1088899374008179,
"eval_runtime": 53.0545,
"eval_samples_per_second": 14.023,
"eval_steps_per_second": 1.753,
"step": 618
},
{
"epoch": 1.503037667071689,
"grad_norm": 126.19650708566132,
"learning_rate": 3.174468567813461e-05,
"loss": 1.181,
"step": 619
},
{
"epoch": 1.50546780072904,
"grad_norm": 64.86293986153461,
"learning_rate": 3.14516325474109e-05,
"loss": 1.0607,
"step": 620
},
{
"epoch": 1.5078979343863912,
"grad_norm": 62.06308896160908,
"learning_rate": 3.115968574180149e-05,
"loss": 1.0914,
"step": 621
},
{
"epoch": 1.5103280680437425,
"grad_norm": 168.27548636755165,
"learning_rate": 3.086884997313387e-05,
"loss": 1.1595,
"step": 622
},
{
"epoch": 1.5127582017010934,
"grad_norm": 156.46495738513647,
"learning_rate": 3.0579129935304066e-05,
"loss": 1.1263,
"step": 623
},
{
"epoch": 1.5151883353584448,
"grad_norm": 71.761765760571,
"learning_rate": 3.029053030420115e-05,
"loss": 1.049,
"step": 624
},
{
"epoch": 1.517618469015796,
"grad_norm": 87.26870047585324,
"learning_rate": 3.0003055737631403e-05,
"loss": 1.1917,
"step": 625
},
{
"epoch": 1.520048602673147,
"grad_norm": 142.01139847883954,
"learning_rate": 2.9716710875243326e-05,
"loss": 1.1038,
"step": 626
},
{
"epoch": 1.5224787363304981,
"grad_norm": 81.15254185021365,
"learning_rate": 2.9431500338452832e-05,
"loss": 1.0824,
"step": 627
},
{
"epoch": 1.5249088699878492,
"grad_norm": 68.21138775878333,
"learning_rate": 2.9147428730368475e-05,
"loss": 1.0676,
"step": 628
},
{
"epoch": 1.5273390036452006,
"grad_norm": 61.929977077152344,
"learning_rate": 2.886450063571735e-05,
"loss": 1.1928,
"step": 629
},
{
"epoch": 1.5297691373025515,
"grad_norm": 76.19248167649229,
"learning_rate": 2.858272062077091e-05,
"loss": 1.0737,
"step": 630
},
{
"epoch": 1.5321992709599028,
"grad_norm": 67.40817795826194,
"learning_rate": 2.8302093233271453e-05,
"loss": 1.0734,
"step": 631
},
{
"epoch": 1.534629404617254,
"grad_norm": 35.17352084915858,
"learning_rate": 2.802262300235857e-05,
"loss": 1.0062,
"step": 632
},
{
"epoch": 1.537059538274605,
"grad_norm": 97.0705094618675,
"learning_rate": 2.7744314438496088e-05,
"loss": 1.121,
"step": 633
},
{
"epoch": 1.5394896719319564,
"grad_norm": 52.21457659022329,
"learning_rate": 2.7467172033399458e-05,
"loss": 1.1864,
"step": 634
},
{
"epoch": 1.5419198055893073,
"grad_norm": 260.1057846866782,
"learning_rate": 2.7191200259962934e-05,
"loss": 1.1549,
"step": 635
},
{
"epoch": 1.5443499392466586,
"grad_norm": 66.65086231184844,
"learning_rate": 2.691640357218759e-05,
"loss": 1.1023,
"step": 636
},
{
"epoch": 1.5467800729040098,
"grad_norm": 680.8791021196618,
"learning_rate": 2.6642786405109475e-05,
"loss": 1.0943,
"step": 637
},
{
"epoch": 1.5492102065613609,
"grad_norm": 36.199872792671414,
"learning_rate": 2.6370353174727836e-05,
"loss": 1.0924,
"step": 638
},
{
"epoch": 1.551640340218712,
"grad_norm": 84.1148767833362,
"learning_rate": 2.6099108277934103e-05,
"loss": 1.1361,
"step": 639
},
{
"epoch": 1.5540704738760631,
"grad_norm": 81.84432345021693,
"learning_rate": 2.5829056092440662e-05,
"loss": 1.0868,
"step": 640
},
{
"epoch": 1.5565006075334145,
"grad_norm": 39.42683610456025,
"learning_rate": 2.556020097671046e-05,
"loss": 1.1506,
"step": 641
},
{
"epoch": 1.5589307411907654,
"grad_norm": 54.33249421192736,
"learning_rate": 2.5292547269886392e-05,
"loss": 1.0517,
"step": 642
},
{
"epoch": 1.5613608748481167,
"grad_norm": 410.5903072488164,
"learning_rate": 2.5026099291721516e-05,
"loss": 1.0995,
"step": 643
},
{
"epoch": 1.5637910085054678,
"grad_norm": 83.574545998207,
"learning_rate": 2.4760861342509233e-05,
"loss": 1.0792,
"step": 644
},
{
"epoch": 1.566221142162819,
"grad_norm": 399.66181496308434,
"learning_rate": 2.449683770301382e-05,
"loss": 1.2167,
"step": 645
},
{
"epoch": 1.56865127582017,
"grad_norm": 55.12309263364805,
"learning_rate": 2.4234032634401406e-05,
"loss": 1.0332,
"step": 646
},
{
"epoch": 1.5710814094775212,
"grad_norm": 61.30588953316776,
"learning_rate": 2.397245037817125e-05,
"loss": 1.0659,
"step": 647
},
{
"epoch": 1.5735115431348725,
"grad_norm": 75.74467195338701,
"learning_rate": 2.371209515608718e-05,
"loss": 1.1254,
"step": 648
},
{
"epoch": 1.5759416767922234,
"grad_norm": 67.98309962901806,
"learning_rate": 2.345297117010954e-05,
"loss": 1.1119,
"step": 649
},
{
"epoch": 1.5783718104495748,
"grad_norm": 59.08178521357814,
"learning_rate": 2.3195082602327312e-05,
"loss": 1.0866,
"step": 650
},
{
"epoch": 1.5808019441069259,
"grad_norm": 94.26571313695092,
"learning_rate": 2.2938433614890697e-05,
"loss": 1.1742,
"step": 651
},
{
"epoch": 1.583232077764277,
"grad_norm": 92.74387959878898,
"learning_rate": 2.2683028349943815e-05,
"loss": 1.1765,
"step": 652
},
{
"epoch": 1.5856622114216283,
"grad_norm": 54.0790750014235,
"learning_rate": 2.242887092955801e-05,
"loss": 1.0979,
"step": 653
},
{
"epoch": 1.5880923450789792,
"grad_norm": 55.72195824432094,
"learning_rate": 2.2175965455665226e-05,
"loss": 1.0826,
"step": 654
},
{
"epoch": 1.5905224787363306,
"grad_norm": 60.8162820416134,
"learning_rate": 2.1924316009991787e-05,
"loss": 1.0884,
"step": 655
},
{
"epoch": 1.5929526123936817,
"grad_norm": 67.20621804796278,
"learning_rate": 2.167392665399256e-05,
"loss": 1.1426,
"step": 656
},
{
"epoch": 1.5953827460510328,
"grad_norm": 63.50889552696206,
"learning_rate": 2.1424801428785447e-05,
"loss": 1.1819,
"step": 657
},
{
"epoch": 1.597812879708384,
"grad_norm": 60.34121097929382,
"learning_rate": 2.1176944355086058e-05,
"loss": 1.1051,
"step": 658
},
{
"epoch": 1.600243013365735,
"grad_norm": 91.95807405182529,
"learning_rate": 2.0930359433142932e-05,
"loss": 1.0768,
"step": 659
},
{
"epoch": 1.6026731470230864,
"grad_norm": 33.84817514299781,
"learning_rate": 2.068505064267292e-05,
"loss": 1.1556,
"step": 660
},
{
"epoch": 1.6051032806804373,
"grad_norm": 44.846129252871364,
"learning_rate": 2.0441021942796944e-05,
"loss": 1.192,
"step": 661
},
{
"epoch": 1.6075334143377886,
"grad_norm": 104.85494442468764,
"learning_rate": 2.0198277271976052e-05,
"loss": 1.1912,
"step": 662
},
{
"epoch": 1.6099635479951397,
"grad_norm": 59.541562510020924,
"learning_rate": 1.995682054794803e-05,
"loss": 1.0932,
"step": 663
},
{
"epoch": 1.6123936816524909,
"grad_norm": 57.73876590809742,
"learning_rate": 1.9716655667664008e-05,
"loss": 1.1691,
"step": 664
},
{
"epoch": 1.6148238153098422,
"grad_norm": 37.00550106127363,
"learning_rate": 1.9477786507225616e-05,
"loss": 1.0974,
"step": 665
},
{
"epoch": 1.617253948967193,
"grad_norm": 271.6238263663105,
"learning_rate": 1.924021692182236e-05,
"loss": 1.1196,
"step": 666
},
{
"epoch": 1.6196840826245444,
"grad_norm": 69.94535819115217,
"learning_rate": 1.900395074566962e-05,
"loss": 1.1219,
"step": 667
},
{
"epoch": 1.6221142162818953,
"grad_norm": 64.77937566314249,
"learning_rate": 1.8768991791946456e-05,
"loss": 1.0457,
"step": 668
},
{
"epoch": 1.6245443499392467,
"grad_norm": 91.1799572658908,
"learning_rate": 1.8535343852734332e-05,
"loss": 1.1082,
"step": 669
},
{
"epoch": 1.6269744835965978,
"grad_norm": 140.3320781032681,
"learning_rate": 1.8303010698955804e-05,
"loss": 1.1587,
"step": 670
},
{
"epoch": 1.629404617253949,
"grad_norm": 129.9206563142473,
"learning_rate": 1.8071996080313602e-05,
"loss": 1.0436,
"step": 671
},
{
"epoch": 1.6318347509113003,
"grad_norm": 57.52355335064491,
"learning_rate": 1.784230372523018e-05,
"loss": 1.0777,
"step": 672
},
{
"epoch": 1.6342648845686512,
"grad_norm": 45.59691137086442,
"learning_rate": 1.76139373407876e-05,
"loss": 1.1133,
"step": 673
},
{
"epoch": 1.6366950182260025,
"grad_norm": 174.9829716096277,
"learning_rate": 1.7386900612667633e-05,
"loss": 1.1704,
"step": 674
},
{
"epoch": 1.6391251518833536,
"grad_norm": 106.67575565748977,
"learning_rate": 1.7161197205092216e-05,
"loss": 1.108,
"step": 675
},
{
"epoch": 1.6415552855407047,
"grad_norm": 80.2118578939736,
"learning_rate": 1.69368307607644e-05,
"loss": 1.1134,
"step": 676
},
{
"epoch": 1.6439854191980559,
"grad_norm": 50.075694613199865,
"learning_rate": 1.6713804900809582e-05,
"loss": 1.103,
"step": 677
},
{
"epoch": 1.646415552855407,
"grad_norm": 69.23038320811604,
"learning_rate": 1.649212322471695e-05,
"loss": 1.1189,
"step": 678
},
{
"epoch": 1.6488456865127583,
"grad_norm": 33.2935221457007,
"learning_rate": 1.6271789310281517e-05,
"loss": 1.0763,
"step": 679
},
{
"epoch": 1.6512758201701092,
"grad_norm": 74.75507124872362,
"learning_rate": 1.605280671354632e-05,
"loss": 1.0983,
"step": 680
},
{
"epoch": 1.6537059538274606,
"grad_norm": 72.6880045095337,
"learning_rate": 1.583517896874498e-05,
"loss": 1.1151,
"step": 681
},
{
"epoch": 1.6561360874848117,
"grad_norm": 59.70666181469054,
"learning_rate": 1.561890958824469e-05,
"loss": 1.1202,
"step": 682
},
{
"epoch": 1.6585662211421628,
"grad_norm": 136.06883726877848,
"learning_rate": 1.540400206248963e-05,
"loss": 1.114,
"step": 683
},
{
"epoch": 1.6609963547995141,
"grad_norm": 48.25877797639542,
"learning_rate": 1.5190459859944505e-05,
"loss": 1.0926,
"step": 684
},
{
"epoch": 1.663426488456865,
"grad_norm": 99.27065031977625,
"learning_rate": 1.4978286427038601e-05,
"loss": 1.0938,
"step": 685
},
{
"epoch": 1.6658566221142164,
"grad_norm": 73.70604863380417,
"learning_rate": 1.4767485188110152e-05,
"loss": 1.0955,
"step": 686
},
{
"epoch": 1.6682867557715675,
"grad_norm": 97.29634642853938,
"learning_rate": 1.4558059545351143e-05,
"loss": 1.0993,
"step": 687
},
{
"epoch": 1.6707168894289186,
"grad_norm": 169.33237029052367,
"learning_rate": 1.435001287875234e-05,
"loss": 1.1484,
"step": 688
},
{
"epoch": 1.6731470230862697,
"grad_norm": 51.080335246500006,
"learning_rate": 1.4143348546048707e-05,
"loss": 1.1279,
"step": 689
},
{
"epoch": 1.6755771567436208,
"grad_norm": 123.74332262351422,
"learning_rate": 1.3938069882665328e-05,
"loss": 1.144,
"step": 690
},
{
"epoch": 1.6780072904009722,
"grad_norm": 150.6264388349919,
"learning_rate": 1.3734180201663439e-05,
"loss": 1.048,
"step": 691
},
{
"epoch": 1.680437424058323,
"grad_norm": 45.78978589208615,
"learning_rate": 1.3531682793687028e-05,
"loss": 1.0943,
"step": 692
},
{
"epoch": 1.6828675577156744,
"grad_norm": 59.23541668296553,
"learning_rate": 1.3330580926909763e-05,
"loss": 1.1422,
"step": 693
},
{
"epoch": 1.6852976913730255,
"grad_norm": 83.37564839198684,
"learning_rate": 1.3130877846982204e-05,
"loss": 1.1167,
"step": 694
},
{
"epoch": 1.6877278250303767,
"grad_norm": 169.89181363126755,
"learning_rate": 1.2932576776979377e-05,
"loss": 1.0153,
"step": 695
},
{
"epoch": 1.6901579586877278,
"grad_norm": 41.65359342112402,
"learning_rate": 1.2735680917348802e-05,
"loss": 1.0842,
"step": 696
},
{
"epoch": 1.692588092345079,
"grad_norm": 91.76072613046553,
"learning_rate": 1.2540193445858883e-05,
"loss": 1.1274,
"step": 697
},
{
"epoch": 1.6950182260024302,
"grad_norm": 86.16989165645253,
"learning_rate": 1.2346117517547551e-05,
"loss": 1.106,
"step": 698
},
{
"epoch": 1.6974483596597811,
"grad_norm": 75.86627467070798,
"learning_rate": 1.2153456264671337e-05,
"loss": 1.0642,
"step": 699
},
{
"epoch": 1.6998784933171325,
"grad_norm": 78.47579727138226,
"learning_rate": 1.1962212796654926e-05,
"loss": 1.053,
"step": 700
},
{
"epoch": 1.7023086269744836,
"grad_norm": 81.45952046323904,
"learning_rate": 1.1772390200040817e-05,
"loss": 1.1003,
"step": 701
},
{
"epoch": 1.7047387606318347,
"grad_norm": 81.5215081559605,
"learning_rate": 1.1583991538439598e-05,
"loss": 1.0789,
"step": 702
},
{
"epoch": 1.707168894289186,
"grad_norm": 123.8954411953181,
"learning_rate": 1.139701985248055e-05,
"loss": 1.0574,
"step": 703
},
{
"epoch": 1.709599027946537,
"grad_norm": 66.51876171521589,
"learning_rate": 1.1211478159762478e-05,
"loss": 1.0866,
"step": 704
},
{
"epoch": 1.7120291616038883,
"grad_norm": 88.7505135509034,
"learning_rate": 1.1027369454805058e-05,
"loss": 1.1039,
"step": 705
},
{
"epoch": 1.7144592952612394,
"grad_norm": 51.948320911337355,
"learning_rate": 1.0844696709000435e-05,
"loss": 1.0891,
"step": 706
},
{
"epoch": 1.7168894289185905,
"grad_norm": 116.12502404263041,
"learning_rate": 1.0663462870565411e-05,
"loss": 1.1284,
"step": 707
},
{
"epoch": 1.7193195625759417,
"grad_norm": 49.752442053177056,
"learning_rate": 1.0483670864493778e-05,
"loss": 1.11,
"step": 708
},
{
"epoch": 1.7217496962332928,
"grad_norm": 89.67691421405478,
"learning_rate": 1.0305323592509009e-05,
"loss": 1.1504,
"step": 709
},
{
"epoch": 1.7241798298906441,
"grad_norm": 84.9951363796106,
"learning_rate": 1.0128423933017671e-05,
"loss": 1.1163,
"step": 710
},
{
"epoch": 1.726609963547995,
"grad_norm": 53.83015858877197,
"learning_rate": 9.952974741062703e-06,
"loss": 1.0768,
"step": 711
},
{
"epoch": 1.7290400972053463,
"grad_norm": 87.01137462153444,
"learning_rate": 9.77897884827752e-06,
"loss": 1.0505,
"step": 712
},
{
"epoch": 1.7314702308626975,
"grad_norm": 119.85348125427905,
"learning_rate": 9.606439062840256e-06,
"loss": 1.1866,
"step": 713
},
{
"epoch": 1.7339003645200486,
"grad_norm": 38.86482306830089,
"learning_rate": 9.435358169428442e-06,
"loss": 1.1203,
"step": 714
},
{
"epoch": 1.7363304981773997,
"grad_norm": 105.47836599222568,
"learning_rate": 9.265738929174051e-06,
"loss": 1.1219,
"step": 715
},
{
"epoch": 1.7387606318347508,
"grad_norm": 97.01504953945435,
"learning_rate": 9.097584079618893e-06,
"loss": 1.0897,
"step": 716
},
{
"epoch": 1.7411907654921022,
"grad_norm": 55.37203351389315,
"learning_rate": 8.93089633467058e-06,
"loss": 1.0747,
"step": 717
},
{
"epoch": 1.743620899149453,
"grad_norm": 53.68546468478919,
"learning_rate": 8.765678384558607e-06,
"loss": 1.0636,
"step": 718
},
{
"epoch": 1.7460510328068044,
"grad_norm": 93.22850661983693,
"learning_rate": 8.601932895790877e-06,
"loss": 1.0801,
"step": 719
},
{
"epoch": 1.7484811664641555,
"grad_norm": 75.10018201630282,
"learning_rate": 8.439662511110847e-06,
"loss": 1.1608,
"step": 720
},
{
"epoch": 1.7509113001215066,
"grad_norm": 75.88601313663253,
"learning_rate": 8.278869849454718e-06,
"loss": 1.0286,
"step": 721
},
{
"epoch": 1.7509113001215066,
"eval_loss": 1.1075224876403809,
"eval_runtime": 53.2869,
"eval_samples_per_second": 13.962,
"eval_steps_per_second": 1.745,
"step": 721
},
{
"epoch": 1.753341433778858,
"grad_norm": 75.94291636970333,
"learning_rate": 8.119557505909215e-06,
"loss": 1.1615,
"step": 722
},
{
"epoch": 1.7557715674362089,
"grad_norm": 85.61204534745477,
"learning_rate": 7.961728051669737e-06,
"loss": 1.1312,
"step": 723
},
{
"epoch": 1.7582017010935602,
"grad_norm": 42.0496338509614,
"learning_rate": 7.805384033998875e-06,
"loss": 1.1068,
"step": 724
},
{
"epoch": 1.7606318347509113,
"grad_norm": 67.9823900081791,
"learning_rate": 7.650527976185173e-06,
"loss": 1.134,
"step": 725
},
{
"epoch": 1.7630619684082625,
"grad_norm": 50.797982181202315,
"learning_rate": 7.497162377502542e-06,
"loss": 1.0903,
"step": 726
},
{
"epoch": 1.7654921020656136,
"grad_norm": 66.34495889496102,
"learning_rate": 7.3452897131698564e-06,
"loss": 1.0895,
"step": 727
},
{
"epoch": 1.7679222357229647,
"grad_norm": 97.21072984563654,
"learning_rate": 7.194912434311052e-06,
"loss": 1.0891,
"step": 728
},
{
"epoch": 1.770352369380316,
"grad_norm": 153.67433901334545,
"learning_rate": 7.046032967915483e-06,
"loss": 1.1057,
"step": 729
},
{
"epoch": 1.772782503037667,
"grad_norm": 65.34101790074203,
"learning_rate": 6.898653716798887e-06,
"loss": 1.1252,
"step": 730
},
{
"epoch": 1.7752126366950183,
"grad_norm": 60.35832905175029,
"learning_rate": 6.75277705956443e-06,
"loss": 1.1177,
"step": 731
},
{
"epoch": 1.7776427703523694,
"grad_norm": 47.338317641259096,
"learning_rate": 6.60840535056445e-06,
"loss": 1.0986,
"step": 732
},
{
"epoch": 1.7800729040097205,
"grad_norm": 50.2479403169235,
"learning_rate": 6.465540919862456e-06,
"loss": 1.0675,
"step": 733
},
{
"epoch": 1.7825030376670719,
"grad_norm": 76.05847584461722,
"learning_rate": 6.32418607319546e-06,
"loss": 1.0962,
"step": 734
},
{
"epoch": 1.7849331713244228,
"grad_norm": 5776.25484119808,
"learning_rate": 6.184343091936751e-06,
"loss": 1.1224,
"step": 735
},
{
"epoch": 1.787363304981774,
"grad_norm": 60.26557281969165,
"learning_rate": 6.046014233059161e-06,
"loss": 1.1682,
"step": 736
},
{
"epoch": 1.789793438639125,
"grad_norm": 173.76865709745172,
"learning_rate": 5.909201729098579e-06,
"loss": 1.1463,
"step": 737
},
{
"epoch": 1.7922235722964763,
"grad_norm": 44.51475254326123,
"learning_rate": 5.77390778811796e-06,
"loss": 1.1127,
"step": 738
},
{
"epoch": 1.7946537059538274,
"grad_norm": 62.21753016508825,
"learning_rate": 5.640134593671598e-06,
"loss": 1.1897,
"step": 739
},
{
"epoch": 1.7970838396111786,
"grad_norm": 57.213736643350934,
"learning_rate": 5.5078843047700275e-06,
"loss": 1.1004,
"step": 740
},
{
"epoch": 1.79951397326853,
"grad_norm": 73.79879091710353,
"learning_rate": 5.3771590558450265e-06,
"loss": 1.2378,
"step": 741
},
{
"epoch": 1.8019441069258808,
"grad_norm": 69.33802717306622,
"learning_rate": 5.247960956715259e-06,
"loss": 1.078,
"step": 742
},
{
"epoch": 1.8043742405832321,
"grad_norm": 70.1118673770208,
"learning_rate": 5.12029209255227e-06,
"loss": 1.1082,
"step": 743
},
{
"epoch": 1.8068043742405833,
"grad_norm": 60.49720666164233,
"learning_rate": 4.994154523846695e-06,
"loss": 1.1694,
"step": 744
},
{
"epoch": 1.8092345078979344,
"grad_norm": 62.67576959014564,
"learning_rate": 4.869550286375091e-06,
"loss": 1.1017,
"step": 745
},
{
"epoch": 1.8116646415552855,
"grad_norm": 52.797557858037294,
"learning_rate": 4.746481391167068e-06,
"loss": 1.0547,
"step": 746
},
{
"epoch": 1.8140947752126366,
"grad_norm": 78.39250293351613,
"learning_rate": 4.624949824472858e-06,
"loss": 1.1395,
"step": 747
},
{
"epoch": 1.816524908869988,
"grad_norm": 345.5438304913529,
"learning_rate": 4.504957547731214e-06,
"loss": 1.1248,
"step": 748
},
{
"epoch": 1.8189550425273389,
"grad_norm": 128.04277202807285,
"learning_rate": 4.386506497537757e-06,
"loss": 1.2115,
"step": 749
},
{
"epoch": 1.8213851761846902,
"grad_norm": 81.9882481842496,
"learning_rate": 4.269598585613776e-06,
"loss": 1.071,
"step": 750
},
{
"epoch": 1.8238153098420413,
"grad_norm": 99.80236227862193,
"learning_rate": 4.154235698775277e-06,
"loss": 1.1591,
"step": 751
},
{
"epoch": 1.8262454434993924,
"grad_norm": 152.61066223998088,
"learning_rate": 4.040419698902631e-06,
"loss": 1.1322,
"step": 752
},
{
"epoch": 1.8286755771567438,
"grad_norm": 32.12973237305346,
"learning_rate": 3.928152422910491e-06,
"loss": 1.0985,
"step": 753
},
{
"epoch": 1.8311057108140947,
"grad_norm": 42.81358112745556,
"learning_rate": 3.817435682718096e-06,
"loss": 1.1252,
"step": 754
},
{
"epoch": 1.833535844471446,
"grad_norm": 54.37793958217706,
"learning_rate": 3.7082712652200867e-06,
"loss": 1.1263,
"step": 755
},
{
"epoch": 1.8359659781287971,
"grad_norm": 76.10189962024336,
"learning_rate": 3.6006609322576156e-06,
"loss": 1.2002,
"step": 756
},
{
"epoch": 1.8383961117861483,
"grad_norm": 65.3082792846633,
"learning_rate": 3.4946064205899965e-06,
"loss": 1.074,
"step": 757
},
{
"epoch": 1.8408262454434994,
"grad_norm": 51.04243707129297,
"learning_rate": 3.390109441866618e-06,
"loss": 1.1253,
"step": 758
},
{
"epoch": 1.8432563791008505,
"grad_norm": 162.25838422994087,
"learning_rate": 3.287171682599255e-06,
"loss": 1.0746,
"step": 759
},
{
"epoch": 1.8456865127582018,
"grad_norm": 53.42288615863692,
"learning_rate": 3.1857948041349894e-06,
"loss": 1.0434,
"step": 760
},
{
"epoch": 1.8481166464155527,
"grad_norm": 47.70669365600897,
"learning_rate": 3.085980442629288e-06,
"loss": 1.0694,
"step": 761
},
{
"epoch": 1.850546780072904,
"grad_norm": 35.5865078619899,
"learning_rate": 2.9877302090196346e-06,
"loss": 1.1292,
"step": 762
},
{
"epoch": 1.8529769137302552,
"grad_norm": 259.96541089150685,
"learning_rate": 2.8910456889995498e-06,
"loss": 1.1138,
"step": 763
},
{
"epoch": 1.8554070473876063,
"grad_norm": 100.9818148806922,
"learning_rate": 2.7959284429929456e-06,
"loss": 1.1414,
"step": 764
},
{
"epoch": 1.8578371810449574,
"grad_norm": 58.16414187512478,
"learning_rate": 2.7023800061289907e-06,
"loss": 1.1076,
"step": 765
},
{
"epoch": 1.8602673147023085,
"grad_norm": 48.78094545883972,
"learning_rate": 2.6104018882173064e-06,
"loss": 1.1061,
"step": 766
},
{
"epoch": 1.86269744835966,
"grad_norm": 32.1696048844329,
"learning_rate": 2.5199955737236104e-06,
"loss": 1.0771,
"step": 767
},
{
"epoch": 1.8651275820170108,
"grad_norm": 67.39153628388367,
"learning_rate": 2.4311625217457778e-06,
"loss": 1.1179,
"step": 768
},
{
"epoch": 1.8675577156743621,
"grad_norm": 57.3659391222766,
"learning_rate": 2.3439041659902407e-06,
"loss": 1.1348,
"step": 769
},
{
"epoch": 1.8699878493317132,
"grad_norm": 48.86332839622862,
"learning_rate": 2.2582219147489147e-06,
"loss": 1.067,
"step": 770
},
{
"epoch": 1.8724179829890644,
"grad_norm": 59.25100384913438,
"learning_rate": 2.174117150876398e-06,
"loss": 1.0814,
"step": 771
},
{
"epoch": 1.8748481166464157,
"grad_norm": 63.612150673079476,
"learning_rate": 2.091591231767709e-06,
"loss": 1.149,
"step": 772
},
{
"epoch": 1.8772782503037666,
"grad_norm": 76.64767939109268,
"learning_rate": 2.010645489336382e-06,
"loss": 1.1809,
"step": 773
},
{
"epoch": 1.879708383961118,
"grad_norm": 73.74931792569282,
"learning_rate": 1.9312812299929094e-06,
"loss": 1.1327,
"step": 774
},
{
"epoch": 1.882138517618469,
"grad_norm": 58.80672287733774,
"learning_rate": 1.8534997346237093e-06,
"loss": 1.0941,
"step": 775
},
{
"epoch": 1.8845686512758202,
"grad_norm": 38.760707938386155,
"learning_rate": 1.777302258570479e-06,
"loss": 1.0785,
"step": 776
},
{
"epoch": 1.8869987849331713,
"grad_norm": 78.58894037195526,
"learning_rate": 1.7026900316098215e-06,
"loss": 1.1067,
"step": 777
},
{
"epoch": 1.8894289185905224,
"grad_norm": 34.985018454660455,
"learning_rate": 1.6296642579335496e-06,
"loss": 1.0454,
"step": 778
},
{
"epoch": 1.8918590522478738,
"grad_norm": 56.94816404212683,
"learning_rate": 1.5582261161291245e-06,
"loss": 1.1402,
"step": 779
},
{
"epoch": 1.8942891859052247,
"grad_norm": 88.47683213840897,
"learning_rate": 1.4883767591606924e-06,
"loss": 1.1847,
"step": 780
},
{
"epoch": 1.896719319562576,
"grad_norm": 164.4313185857989,
"learning_rate": 1.4201173143504888e-06,
"loss": 1.0246,
"step": 781
},
{
"epoch": 1.8991494532199271,
"grad_norm": 44.826186028268175,
"learning_rate": 1.3534488833605974e-06,
"loss": 1.1285,
"step": 782
},
{
"epoch": 1.9015795868772782,
"grad_norm": 75.6977790059232,
"learning_rate": 1.2883725421752201e-06,
"loss": 1.111,
"step": 783
},
{
"epoch": 1.9040097205346294,
"grad_norm": 74.26685984847175,
"learning_rate": 1.2248893410832685e-06,
"loss": 1.0741,
"step": 784
},
{
"epoch": 1.9064398541919805,
"grad_norm": 54.63352143603637,
"learning_rate": 1.1630003046614323e-06,
"loss": 1.0816,
"step": 785
},
{
"epoch": 1.9088699878493318,
"grad_norm": 55.31777597928668,
"learning_rate": 1.1027064317576385e-06,
"loss": 1.1589,
"step": 786
},
{
"epoch": 1.9113001215066827,
"grad_norm": 81.61311763219298,
"learning_rate": 1.0440086954749517e-06,
"loss": 1.1165,
"step": 787
},
{
"epoch": 1.913730255164034,
"grad_norm": 49.503370916119344,
"learning_rate": 9.869080431558542e-07,
"loss": 1.1239,
"step": 788
},
{
"epoch": 1.9161603888213852,
"grad_norm": 47.648410246233475,
"learning_rate": 9.314053963669245e-07,
"loss": 1.1121,
"step": 789
},
{
"epoch": 1.9185905224787363,
"grad_norm": 66.83906080469764,
"learning_rate": 8.775016508840272e-07,
"loss": 1.1461,
"step": 790
},
{
"epoch": 1.9210206561360876,
"grad_norm": 148.92765988977712,
"learning_rate": 8.251976766777913e-07,
"loss": 1.1627,
"step": 791
},
{
"epoch": 1.9234507897934385,
"grad_norm": 70.09639733538229,
"learning_rate": 7.744943178996101e-07,
"loss": 1.0935,
"step": 792
},
{
"epoch": 1.9258809234507899,
"grad_norm": 85.9264341638202,
"learning_rate": 7.253923928680406e-07,
"loss": 1.1071,
"step": 793
},
{
"epoch": 1.928311057108141,
"grad_norm": 76.17401772146403,
"learning_rate": 6.778926940555152e-07,
"loss": 1.1448,
"step": 794
},
{
"epoch": 1.930741190765492,
"grad_norm": 91.15259575797278,
"learning_rate": 6.319959880756177e-07,
"loss": 1.1101,
"step": 795
},
{
"epoch": 1.9331713244228432,
"grad_norm": 31.655747702448462,
"learning_rate": 5.877030156707042e-07,
"loss": 1.039,
"step": 796
},
{
"epoch": 1.9356014580801943,
"grad_norm": 50.84824944272368,
"learning_rate": 5.450144916999134e-07,
"loss": 1.0511,
"step": 797
},
{
"epoch": 1.9380315917375457,
"grad_norm": 65.68864220454576,
"learning_rate": 5.039311051276752e-07,
"loss": 1.1926,
"step": 798
},
{
"epoch": 1.9404617253948966,
"grad_norm": 87.84909056776107,
"learning_rate": 4.644535190125421e-07,
"loss": 1.1022,
"step": 799
},
{
"epoch": 1.942891859052248,
"grad_norm": 49.60648808282262,
"learning_rate": 4.2658237049655323e-07,
"loss": 1.1283,
"step": 800
},
{
"epoch": 1.945321992709599,
"grad_norm": 60.29916671699956,
"learning_rate": 3.903182707948649e-07,
"loss": 1.0659,
"step": 801
},
{
"epoch": 1.9477521263669502,
"grad_norm": 85.60352537490172,
"learning_rate": 3.556618051859584e-07,
"loss": 1.1473,
"step": 802
},
{
"epoch": 1.9501822600243013,
"grad_norm": 63.26392365183893,
"learning_rate": 3.2261353300219176e-07,
"loss": 1.1018,
"step": 803
},
{
"epoch": 1.9526123936816524,
"grad_norm": 58.902160704029676,
"learning_rate": 2.9117398762069647e-07,
"loss": 1.1158,
"step": 804
},
{
"epoch": 1.9550425273390037,
"grad_norm": 95.75043109531097,
"learning_rate": 2.613436764548505e-07,
"loss": 1.1034,
"step": 805
},
{
"epoch": 1.9574726609963546,
"grad_norm": 61.028532599630765,
"learning_rate": 2.3312308094607382e-07,
"loss": 1.1239,
"step": 806
},
{
"epoch": 1.959902794653706,
"grad_norm": 42.40393099941929,
"learning_rate": 2.0651265655603492e-07,
"loss": 1.0899,
"step": 807
},
{
"epoch": 1.962332928311057,
"grad_norm": 72.5324497099459,
"learning_rate": 1.8151283275928964e-07,
"loss": 1.0923,
"step": 808
},
{
"epoch": 1.9647630619684082,
"grad_norm": 70.09629889999877,
"learning_rate": 1.5812401303639813e-07,
"loss": 1.122,
"step": 809
},
{
"epoch": 1.9671931956257596,
"grad_norm": 51.82649450794088,
"learning_rate": 1.3634657486737424e-07,
"loss": 1.1976,
"step": 810
},
{
"epoch": 1.9696233292831105,
"grad_norm": 38.14749887357619,
"learning_rate": 1.1618086972559062e-07,
"loss": 1.1402,
"step": 811
},
{
"epoch": 1.9720534629404618,
"grad_norm": 63.952260610433626,
"learning_rate": 9.762722307213868e-08,
"loss": 1.1099,
"step": 812
},
{
"epoch": 1.974483596597813,
"grad_norm": 67.73936930746979,
"learning_rate": 8.068593435055505e-08,
"loss": 1.0666,
"step": 813
},
{
"epoch": 1.976913730255164,
"grad_norm": 199.80849466800598,
"learning_rate": 6.535727698199213e-08,
"loss": 1.1676,
"step": 814
},
{
"epoch": 1.9793438639125152,
"grad_norm": 128.36184695966296,
"learning_rate": 5.164149836077714e-08,
"loss": 1.1347,
"step": 815
},
{
"epoch": 1.9817739975698663,
"grad_norm": 117.93650898577977,
"learning_rate": 3.953881985047092e-08,
"loss": 1.0093,
"step": 816
},
{
"epoch": 1.9842041312272176,
"grad_norm": 133.66858791525644,
"learning_rate": 2.9049436780281825e-08,
"loss": 1.132,
"step": 817
},
{
"epoch": 1.9866342648845685,
"grad_norm": 45.0102112528463,
"learning_rate": 2.0173518441868324e-08,
"loss": 1.1243,
"step": 818
},
{
"epoch": 1.9890643985419199,
"grad_norm": 99.86626236714106,
"learning_rate": 1.2911208086663351e-08,
"loss": 1.1503,
"step": 819
},
{
"epoch": 1.991494532199271,
"grad_norm": 76.9095216170618,
"learning_rate": 7.262622923531747e-09,
"loss": 1.1219,
"step": 820
},
{
"epoch": 1.993924665856622,
"grad_norm": 53.6297519446309,
"learning_rate": 3.2278541168717646e-09,
"loss": 1.1142,
"step": 821
},
{
"epoch": 1.9963547995139734,
"grad_norm": 113.49011128083137,
"learning_rate": 8.069667851939855e-10,
"loss": 1.1222,
"step": 822
}
],
"logging_steps": 1,
"max_steps": 822,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 206,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.218071900027093e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}