PEFT
Safetensors
glm4
axolotl
Generated from Trainer
GLM-v2-lora / checkpoint-618 /trainer_state.json
Delta-Vector's picture
Upload folder using huggingface_hub
6c23bbb verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.5006075334143378,
"eval_steps": 103,
"global_step": 618,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002430133657351154,
"grad_norm": 715.4923219036787,
"learning_rate": 0.0,
"loss": 1.3541,
"step": 1
},
{
"epoch": 0.002430133657351154,
"eval_loss": 1.3335719108581543,
"eval_runtime": 53.4883,
"eval_samples_per_second": 13.91,
"eval_steps_per_second": 1.739,
"step": 1
},
{
"epoch": 0.004860267314702308,
"grad_norm": 614.6970578314867,
"learning_rate": 5e-06,
"loss": 1.3775,
"step": 2
},
{
"epoch": 0.007290400972053463,
"grad_norm": 471.59017991123795,
"learning_rate": 1e-05,
"loss": 1.339,
"step": 3
},
{
"epoch": 0.009720534629404616,
"grad_norm": 238.72216262259653,
"learning_rate": 1.5e-05,
"loss": 1.3829,
"step": 4
},
{
"epoch": 0.012150668286755772,
"grad_norm": 355.68955726709873,
"learning_rate": 2e-05,
"loss": 1.3597,
"step": 5
},
{
"epoch": 0.014580801944106925,
"grad_norm": 414.5627284272111,
"learning_rate": 2.5e-05,
"loss": 1.3862,
"step": 6
},
{
"epoch": 0.01701093560145808,
"grad_norm": 534.9877222052693,
"learning_rate": 3e-05,
"loss": 1.2784,
"step": 7
},
{
"epoch": 0.019441069258809233,
"grad_norm": 153.38895635666677,
"learning_rate": 3.5e-05,
"loss": 1.3521,
"step": 8
},
{
"epoch": 0.02187120291616039,
"grad_norm": 858.293734138087,
"learning_rate": 4e-05,
"loss": 1.2461,
"step": 9
},
{
"epoch": 0.024301336573511544,
"grad_norm": 255.81989388533376,
"learning_rate": 4.5e-05,
"loss": 1.2778,
"step": 10
},
{
"epoch": 0.026731470230862697,
"grad_norm": 368.91949003479226,
"learning_rate": 5e-05,
"loss": 1.3412,
"step": 11
},
{
"epoch": 0.02916160388821385,
"grad_norm": 176.49481799555898,
"learning_rate": 5.500000000000001e-05,
"loss": 1.3437,
"step": 12
},
{
"epoch": 0.031591737545565005,
"grad_norm": 208.57742104974147,
"learning_rate": 6e-05,
"loss": 1.2859,
"step": 13
},
{
"epoch": 0.03402187120291616,
"grad_norm": 93.26742036471734,
"learning_rate": 6.500000000000001e-05,
"loss": 1.1843,
"step": 14
},
{
"epoch": 0.03645200486026731,
"grad_norm": 145.53380444622215,
"learning_rate": 7e-05,
"loss": 1.4281,
"step": 15
},
{
"epoch": 0.038882138517618466,
"grad_norm": 126.56724937430516,
"learning_rate": 7.500000000000001e-05,
"loss": 1.3908,
"step": 16
},
{
"epoch": 0.041312272174969626,
"grad_norm": 106.19246390662754,
"learning_rate": 8e-05,
"loss": 1.344,
"step": 17
},
{
"epoch": 0.04374240583232078,
"grad_norm": 289.348178084847,
"learning_rate": 8.5e-05,
"loss": 1.2708,
"step": 18
},
{
"epoch": 0.046172539489671933,
"grad_norm": 286.63676887065634,
"learning_rate": 9e-05,
"loss": 1.3564,
"step": 19
},
{
"epoch": 0.04860267314702309,
"grad_norm": 269.6096299101413,
"learning_rate": 9.5e-05,
"loss": 1.2184,
"step": 20
},
{
"epoch": 0.05103280680437424,
"grad_norm": 151.28678796160915,
"learning_rate": 0.0001,
"loss": 1.2974,
"step": 21
},
{
"epoch": 0.053462940461725394,
"grad_norm": 265.5625538646362,
"learning_rate": 0.000105,
"loss": 1.2703,
"step": 22
},
{
"epoch": 0.05589307411907655,
"grad_norm": 724.7157187586193,
"learning_rate": 0.00011000000000000002,
"loss": 1.2691,
"step": 23
},
{
"epoch": 0.0583232077764277,
"grad_norm": 425.3768239347252,
"learning_rate": 0.00011499999999999999,
"loss": 1.375,
"step": 24
},
{
"epoch": 0.060753341433778855,
"grad_norm": 314.5119318308783,
"learning_rate": 0.00012,
"loss": 1.2952,
"step": 25
},
{
"epoch": 0.06318347509113001,
"grad_norm": 557.519173033834,
"learning_rate": 0.000125,
"loss": 1.2923,
"step": 26
},
{
"epoch": 0.06561360874848117,
"grad_norm": 211.4069356529637,
"learning_rate": 0.00013000000000000002,
"loss": 1.2629,
"step": 27
},
{
"epoch": 0.06804374240583232,
"grad_norm": 299.7742653722713,
"learning_rate": 0.00013500000000000003,
"loss": 1.3099,
"step": 28
},
{
"epoch": 0.07047387606318348,
"grad_norm": 182.18551965886013,
"learning_rate": 0.00014,
"loss": 1.2215,
"step": 29
},
{
"epoch": 0.07290400972053462,
"grad_norm": 153.38300520125887,
"learning_rate": 0.000145,
"loss": 1.2799,
"step": 30
},
{
"epoch": 0.07533414337788578,
"grad_norm": 849.4472853252786,
"learning_rate": 0.00015000000000000001,
"loss": 1.2012,
"step": 31
},
{
"epoch": 0.07776427703523693,
"grad_norm": 179.94814586965418,
"learning_rate": 0.000155,
"loss": 1.2103,
"step": 32
},
{
"epoch": 0.08019441069258809,
"grad_norm": 180.36681057956048,
"learning_rate": 0.00016,
"loss": 1.2414,
"step": 33
},
{
"epoch": 0.08262454434993925,
"grad_norm": 113.72852454032189,
"learning_rate": 0.000165,
"loss": 1.2508,
"step": 34
},
{
"epoch": 0.0850546780072904,
"grad_norm": 150.53415363213057,
"learning_rate": 0.00017,
"loss": 1.2528,
"step": 35
},
{
"epoch": 0.08748481166464156,
"grad_norm": 156.19567878683574,
"learning_rate": 0.000175,
"loss": 1.2016,
"step": 36
},
{
"epoch": 0.0899149453219927,
"grad_norm": 416.34884765145057,
"learning_rate": 0.00018,
"loss": 1.254,
"step": 37
},
{
"epoch": 0.09234507897934387,
"grad_norm": 269.7105025581372,
"learning_rate": 0.00018500000000000002,
"loss": 1.2215,
"step": 38
},
{
"epoch": 0.09477521263669501,
"grad_norm": 249.35069047655023,
"learning_rate": 0.00019,
"loss": 1.2078,
"step": 39
},
{
"epoch": 0.09720534629404617,
"grad_norm": 167.16896045613478,
"learning_rate": 0.000195,
"loss": 1.1866,
"step": 40
},
{
"epoch": 0.09963547995139732,
"grad_norm": 248.22240554128427,
"learning_rate": 0.0002,
"loss": 1.252,
"step": 41
},
{
"epoch": 0.10206561360874848,
"grad_norm": 180.89520841022969,
"learning_rate": 0.0001999991930332148,
"loss": 1.2251,
"step": 42
},
{
"epoch": 0.10449574726609964,
"grad_norm": 614.4291375430485,
"learning_rate": 0.00019999677214588312,
"loss": 1.2563,
"step": 43
},
{
"epoch": 0.10692588092345079,
"grad_norm": 211.7523427355369,
"learning_rate": 0.00019999273737707646,
"loss": 1.193,
"step": 44
},
{
"epoch": 0.10935601458080195,
"grad_norm": 181.56788458769344,
"learning_rate": 0.00019998708879191335,
"loss": 1.2598,
"step": 45
},
{
"epoch": 0.1117861482381531,
"grad_norm": 157.5783414916277,
"learning_rate": 0.00019997982648155814,
"loss": 1.2663,
"step": 46
},
{
"epoch": 0.11421628189550426,
"grad_norm": 155.78006251192625,
"learning_rate": 0.00019997095056321971,
"loss": 1.1637,
"step": 47
},
{
"epoch": 0.1166464155528554,
"grad_norm": 202.0253360488958,
"learning_rate": 0.00019996046118014955,
"loss": 1.2508,
"step": 48
},
{
"epoch": 0.11907654921020656,
"grad_norm": 192.7576297264874,
"learning_rate": 0.00019994835850163924,
"loss": 1.2014,
"step": 49
},
{
"epoch": 0.12150668286755771,
"grad_norm": 132.5484871621418,
"learning_rate": 0.00019993464272301804,
"loss": 1.2279,
"step": 50
},
{
"epoch": 0.12393681652490887,
"grad_norm": 128.32285438248965,
"learning_rate": 0.00019991931406564944,
"loss": 1.2179,
"step": 51
},
{
"epoch": 0.12636695018226002,
"grad_norm": 552.3669463716512,
"learning_rate": 0.00019990237277692788,
"loss": 1.1498,
"step": 52
},
{
"epoch": 0.12879708383961117,
"grad_norm": 86.17911790260192,
"learning_rate": 0.00019988381913027442,
"loss": 1.2784,
"step": 53
},
{
"epoch": 0.13122721749696234,
"grad_norm": 70.83294605515782,
"learning_rate": 0.00019986365342513265,
"loss": 1.2224,
"step": 54
},
{
"epoch": 0.1336573511543135,
"grad_norm": 45.23624563299466,
"learning_rate": 0.00019984187598696363,
"loss": 1.1746,
"step": 55
},
{
"epoch": 0.13608748481166463,
"grad_norm": 57.67645735585192,
"learning_rate": 0.00019981848716724073,
"loss": 1.2154,
"step": 56
},
{
"epoch": 0.1385176184690158,
"grad_norm": 45.661268047129674,
"learning_rate": 0.00019979348734344398,
"loss": 1.1411,
"step": 57
},
{
"epoch": 0.14094775212636695,
"grad_norm": 53.10628399970359,
"learning_rate": 0.00019976687691905393,
"loss": 1.2029,
"step": 58
},
{
"epoch": 0.1433778857837181,
"grad_norm": 38.71353325803162,
"learning_rate": 0.00019973865632354516,
"loss": 1.1976,
"step": 59
},
{
"epoch": 0.14580801944106925,
"grad_norm": 42.789208063581114,
"learning_rate": 0.0001997088260123793,
"loss": 1.1477,
"step": 60
},
{
"epoch": 0.14823815309842042,
"grad_norm": 37.613194740192164,
"learning_rate": 0.0001996773864669978,
"loss": 1.2529,
"step": 61
},
{
"epoch": 0.15066828675577157,
"grad_norm": 47.96813084127655,
"learning_rate": 0.00019964433819481405,
"loss": 1.2328,
"step": 62
},
{
"epoch": 0.15309842041312272,
"grad_norm": 55.30483872428545,
"learning_rate": 0.00019960968172920516,
"loss": 1.1996,
"step": 63
},
{
"epoch": 0.15552855407047386,
"grad_norm": 35.58995799070749,
"learning_rate": 0.00019957341762950344,
"loss": 1.1248,
"step": 64
},
{
"epoch": 0.15795868772782504,
"grad_norm": 58.86131222300149,
"learning_rate": 0.00019953554648098748,
"loss": 1.3017,
"step": 65
},
{
"epoch": 0.16038882138517618,
"grad_norm": 32.12091331878439,
"learning_rate": 0.00019949606889487233,
"loss": 1.1961,
"step": 66
},
{
"epoch": 0.16281895504252733,
"grad_norm": 167.27433996357928,
"learning_rate": 0.0001994549855083001,
"loss": 1.1768,
"step": 67
},
{
"epoch": 0.1652490886998785,
"grad_norm": 32.3328494297432,
"learning_rate": 0.0001994122969843293,
"loss": 1.1802,
"step": 68
},
{
"epoch": 0.16767922235722965,
"grad_norm": 39.92530074438497,
"learning_rate": 0.0001993680040119244,
"loss": 1.2098,
"step": 69
},
{
"epoch": 0.1701093560145808,
"grad_norm": 45.60830517129956,
"learning_rate": 0.0001993221073059445,
"loss": 1.2159,
"step": 70
},
{
"epoch": 0.17253948967193194,
"grad_norm": 35.462695032736335,
"learning_rate": 0.00019927460760713197,
"loss": 1.1818,
"step": 71
},
{
"epoch": 0.17496962332928312,
"grad_norm": 43.05751624597826,
"learning_rate": 0.0001992255056821004,
"loss": 1.2011,
"step": 72
},
{
"epoch": 0.17739975698663427,
"grad_norm": 47.13143404969894,
"learning_rate": 0.00019917480232332224,
"loss": 1.1669,
"step": 73
},
{
"epoch": 0.1798298906439854,
"grad_norm": 72.07146401418987,
"learning_rate": 0.000199122498349116,
"loss": 1.181,
"step": 74
},
{
"epoch": 0.1822600243013366,
"grad_norm": 36.289202348834955,
"learning_rate": 0.00019906859460363307,
"loss": 1.1787,
"step": 75
},
{
"epoch": 0.18469015795868773,
"grad_norm": 46.92636167228936,
"learning_rate": 0.00019901309195684416,
"loss": 1.2316,
"step": 76
},
{
"epoch": 0.18712029161603888,
"grad_norm": 31.71425340357504,
"learning_rate": 0.00019895599130452505,
"loss": 1.1607,
"step": 77
},
{
"epoch": 0.18955042527339003,
"grad_norm": 43.94199928621344,
"learning_rate": 0.00019889729356824235,
"loss": 1.1919,
"step": 78
},
{
"epoch": 0.1919805589307412,
"grad_norm": 45.33073791860179,
"learning_rate": 0.0001988369996953386,
"loss": 1.2237,
"step": 79
},
{
"epoch": 0.19441069258809235,
"grad_norm": 135.89980489661897,
"learning_rate": 0.00019877511065891673,
"loss": 1.1822,
"step": 80
},
{
"epoch": 0.1968408262454435,
"grad_norm": 439.6770852212966,
"learning_rate": 0.00019871162745782478,
"loss": 1.1441,
"step": 81
},
{
"epoch": 0.19927095990279464,
"grad_norm": 80.73319798776026,
"learning_rate": 0.0001986465511166394,
"loss": 1.1709,
"step": 82
},
{
"epoch": 0.20170109356014582,
"grad_norm": 87.76515297497458,
"learning_rate": 0.00019857988268564953,
"loss": 1.1549,
"step": 83
},
{
"epoch": 0.20413122721749696,
"grad_norm": 70.08754986406095,
"learning_rate": 0.00019851162324083932,
"loss": 1.1771,
"step": 84
},
{
"epoch": 0.2065613608748481,
"grad_norm": 187.8198997057664,
"learning_rate": 0.0001984417738838709,
"loss": 1.2068,
"step": 85
},
{
"epoch": 0.20899149453219928,
"grad_norm": 127.78818684755072,
"learning_rate": 0.00019837033574206646,
"loss": 1.1974,
"step": 86
},
{
"epoch": 0.21142162818955043,
"grad_norm": 127.82979216871074,
"learning_rate": 0.0001982973099683902,
"loss": 1.185,
"step": 87
},
{
"epoch": 0.21385176184690158,
"grad_norm": 142.35425084857746,
"learning_rate": 0.00019822269774142954,
"loss": 1.2225,
"step": 88
},
{
"epoch": 0.21628189550425272,
"grad_norm": 246.64019353564817,
"learning_rate": 0.0001981465002653763,
"loss": 1.2574,
"step": 89
},
{
"epoch": 0.2187120291616039,
"grad_norm": 189.88471076285524,
"learning_rate": 0.0001980687187700071,
"loss": 1.1635,
"step": 90
},
{
"epoch": 0.22114216281895505,
"grad_norm": 116.65693373141701,
"learning_rate": 0.00019798935451066361,
"loss": 1.1457,
"step": 91
},
{
"epoch": 0.2235722964763062,
"grad_norm": 71.76422539970217,
"learning_rate": 0.00019790840876823232,
"loss": 1.2354,
"step": 92
},
{
"epoch": 0.22600243013365734,
"grad_norm": 139.42330509386431,
"learning_rate": 0.0001978258828491236,
"loss": 1.18,
"step": 93
},
{
"epoch": 0.2284325637910085,
"grad_norm": 131.88308820601443,
"learning_rate": 0.00019774177808525113,
"loss": 1.1868,
"step": 94
},
{
"epoch": 0.23086269744835966,
"grad_norm": 85.81071125615291,
"learning_rate": 0.00019765609583400977,
"loss": 1.1814,
"step": 95
},
{
"epoch": 0.2332928311057108,
"grad_norm": 84.43756298541064,
"learning_rate": 0.00019756883747825424,
"loss": 1.1658,
"step": 96
},
{
"epoch": 0.23572296476306198,
"grad_norm": 114.24245545143974,
"learning_rate": 0.0001974800044262764,
"loss": 1.2497,
"step": 97
},
{
"epoch": 0.23815309842041313,
"grad_norm": 76.577511222722,
"learning_rate": 0.00019738959811178272,
"loss": 1.1414,
"step": 98
},
{
"epoch": 0.24058323207776428,
"grad_norm": 171.8084830895381,
"learning_rate": 0.00019729761999387103,
"loss": 1.1619,
"step": 99
},
{
"epoch": 0.24301336573511542,
"grad_norm": 221.87752250936416,
"learning_rate": 0.00019720407155700707,
"loss": 1.2718,
"step": 100
},
{
"epoch": 0.2454434993924666,
"grad_norm": 205.64943975370608,
"learning_rate": 0.00019710895431100046,
"loss": 1.1786,
"step": 101
},
{
"epoch": 0.24787363304981774,
"grad_norm": 160.16582903260615,
"learning_rate": 0.00019701226979098037,
"loss": 1.1426,
"step": 102
},
{
"epoch": 0.2503037667071689,
"grad_norm": 82.85031394537334,
"learning_rate": 0.00019691401955737072,
"loss": 1.1718,
"step": 103
},
{
"epoch": 0.2503037667071689,
"eval_loss": 1.1633374691009521,
"eval_runtime": 52.6182,
"eval_samples_per_second": 14.14,
"eval_steps_per_second": 1.767,
"step": 103
},
{
"epoch": 0.25273390036452004,
"grad_norm": 94.74469296109082,
"learning_rate": 0.000196814205195865,
"loss": 1.2255,
"step": 104
},
{
"epoch": 0.2551640340218712,
"grad_norm": 126.15797466756656,
"learning_rate": 0.00019671282831740076,
"loss": 1.1623,
"step": 105
},
{
"epoch": 0.25759416767922233,
"grad_norm": 79.41156434272008,
"learning_rate": 0.0001966098905581334,
"loss": 1.1606,
"step": 106
},
{
"epoch": 0.2600243013365735,
"grad_norm": 70.33104031058372,
"learning_rate": 0.00019650539357941003,
"loss": 1.196,
"step": 107
},
{
"epoch": 0.2624544349939247,
"grad_norm": 69.57260733822498,
"learning_rate": 0.0001963993390677424,
"loss": 1.1939,
"step": 108
},
{
"epoch": 0.2648845686512758,
"grad_norm": 81.78820691772725,
"learning_rate": 0.00019629172873477995,
"loss": 1.2553,
"step": 109
},
{
"epoch": 0.267314702308627,
"grad_norm": 117.06324110268656,
"learning_rate": 0.00019618256431728194,
"loss": 1.2535,
"step": 110
},
{
"epoch": 0.26974483596597815,
"grad_norm": 83.26993317104247,
"learning_rate": 0.00019607184757708951,
"loss": 1.157,
"step": 111
},
{
"epoch": 0.27217496962332927,
"grad_norm": 51.990829456422375,
"learning_rate": 0.00019595958030109735,
"loss": 1.1274,
"step": 112
},
{
"epoch": 0.27460510328068044,
"grad_norm": 119.7487160875729,
"learning_rate": 0.00019584576430122473,
"loss": 1.1422,
"step": 113
},
{
"epoch": 0.2770352369380316,
"grad_norm": 88.15636932272304,
"learning_rate": 0.00019573040141438624,
"loss": 1.1599,
"step": 114
},
{
"epoch": 0.27946537059538273,
"grad_norm": 62.346402225534774,
"learning_rate": 0.00019561349350246226,
"loss": 1.1909,
"step": 115
},
{
"epoch": 0.2818955042527339,
"grad_norm": 76.40612150653034,
"learning_rate": 0.0001954950424522688,
"loss": 1.1646,
"step": 116
},
{
"epoch": 0.284325637910085,
"grad_norm": 94.8711613055073,
"learning_rate": 0.00019537505017552716,
"loss": 1.1547,
"step": 117
},
{
"epoch": 0.2867557715674362,
"grad_norm": 63.86961661796314,
"learning_rate": 0.00019525351860883293,
"loss": 1.1841,
"step": 118
},
{
"epoch": 0.2891859052247874,
"grad_norm": 133.2417924150684,
"learning_rate": 0.00019513044971362494,
"loss": 1.1365,
"step": 119
},
{
"epoch": 0.2916160388821385,
"grad_norm": 133.44891510996445,
"learning_rate": 0.00019500584547615333,
"loss": 1.1696,
"step": 120
},
{
"epoch": 0.29404617253948967,
"grad_norm": 58.51701768739601,
"learning_rate": 0.00019487970790744774,
"loss": 1.1874,
"step": 121
},
{
"epoch": 0.29647630619684084,
"grad_norm": 49.536158238056196,
"learning_rate": 0.00019475203904328474,
"loss": 1.1798,
"step": 122
},
{
"epoch": 0.29890643985419196,
"grad_norm": 94.27608706983857,
"learning_rate": 0.000194622840944155,
"loss": 1.2443,
"step": 123
},
{
"epoch": 0.30133657351154314,
"grad_norm": 103.868243202843,
"learning_rate": 0.00019449211569523,
"loss": 1.1759,
"step": 124
},
{
"epoch": 0.3037667071688943,
"grad_norm": 73.31536435980003,
"learning_rate": 0.00019435986540632843,
"loss": 1.1885,
"step": 125
},
{
"epoch": 0.30619684082624543,
"grad_norm": 64.91149114745738,
"learning_rate": 0.00019422609221188207,
"loss": 1.1864,
"step": 126
},
{
"epoch": 0.3086269744835966,
"grad_norm": 95.34449184763653,
"learning_rate": 0.00019409079827090145,
"loss": 1.1339,
"step": 127
},
{
"epoch": 0.3110571081409477,
"grad_norm": 67.36156159754226,
"learning_rate": 0.00019395398576694086,
"loss": 1.1845,
"step": 128
},
{
"epoch": 0.3134872417982989,
"grad_norm": 36.94913176821407,
"learning_rate": 0.00019381565690806328,
"loss": 1.2154,
"step": 129
},
{
"epoch": 0.3159173754556501,
"grad_norm": 69.05265214547647,
"learning_rate": 0.00019367581392680457,
"loss": 1.1642,
"step": 130
},
{
"epoch": 0.3183475091130012,
"grad_norm": 38.974761165559855,
"learning_rate": 0.00019353445908013755,
"loss": 1.1508,
"step": 131
},
{
"epoch": 0.32077764277035237,
"grad_norm": 48.47215142199794,
"learning_rate": 0.00019339159464943557,
"loss": 1.2011,
"step": 132
},
{
"epoch": 0.32320777642770354,
"grad_norm": 41.88512063342574,
"learning_rate": 0.00019324722294043558,
"loss": 1.1643,
"step": 133
},
{
"epoch": 0.32563791008505466,
"grad_norm": 25.59403215229145,
"learning_rate": 0.00019310134628320114,
"loss": 1.1954,
"step": 134
},
{
"epoch": 0.32806804374240583,
"grad_norm": 58.02634988046396,
"learning_rate": 0.00019295396703208453,
"loss": 1.1544,
"step": 135
},
{
"epoch": 0.330498177399757,
"grad_norm": 31.26218977398251,
"learning_rate": 0.00019280508756568896,
"loss": 1.1613,
"step": 136
},
{
"epoch": 0.33292831105710813,
"grad_norm": 31.81234539284103,
"learning_rate": 0.00019265471028683014,
"loss": 1.1892,
"step": 137
},
{
"epoch": 0.3353584447144593,
"grad_norm": 54.44930114675527,
"learning_rate": 0.00019250283762249748,
"loss": 1.2801,
"step": 138
},
{
"epoch": 0.3377885783718105,
"grad_norm": 30.320486287732734,
"learning_rate": 0.00019234947202381486,
"loss": 1.1934,
"step": 139
},
{
"epoch": 0.3402187120291616,
"grad_norm": 32.76175001943503,
"learning_rate": 0.00019219461596600113,
"loss": 1.1436,
"step": 140
},
{
"epoch": 0.34264884568651277,
"grad_norm": 36.802264122697316,
"learning_rate": 0.00019203827194833026,
"loss": 1.1418,
"step": 141
},
{
"epoch": 0.3450789793438639,
"grad_norm": 35.03898729580271,
"learning_rate": 0.0001918804424940908,
"loss": 1.2479,
"step": 142
},
{
"epoch": 0.34750911300121506,
"grad_norm": 89.58068030461165,
"learning_rate": 0.00019172113015054532,
"loss": 1.2504,
"step": 143
},
{
"epoch": 0.34993924665856624,
"grad_norm": 30.05799668441019,
"learning_rate": 0.00019156033748888917,
"loss": 1.1662,
"step": 144
},
{
"epoch": 0.35236938031591736,
"grad_norm": 33.80121199203598,
"learning_rate": 0.00019139806710420914,
"loss": 1.1862,
"step": 145
},
{
"epoch": 0.35479951397326853,
"grad_norm": 31.510896023067872,
"learning_rate": 0.00019123432161544142,
"loss": 1.147,
"step": 146
},
{
"epoch": 0.3572296476306197,
"grad_norm": 32.92613286618093,
"learning_rate": 0.00019106910366532942,
"loss": 1.1421,
"step": 147
},
{
"epoch": 0.3596597812879708,
"grad_norm": 245.36013493823395,
"learning_rate": 0.00019090241592038113,
"loss": 1.1306,
"step": 148
},
{
"epoch": 0.362089914945322,
"grad_norm": 72.3061625644275,
"learning_rate": 0.000190734261070826,
"loss": 1.1144,
"step": 149
},
{
"epoch": 0.3645200486026732,
"grad_norm": 63.77748866336388,
"learning_rate": 0.00019056464183057157,
"loss": 1.1249,
"step": 150
},
{
"epoch": 0.3669501822600243,
"grad_norm": 633.2421324308109,
"learning_rate": 0.00019039356093715975,
"loss": 1.1359,
"step": 151
},
{
"epoch": 0.36938031591737547,
"grad_norm": 34.456657555313704,
"learning_rate": 0.00019022102115172248,
"loss": 1.1397,
"step": 152
},
{
"epoch": 0.3718104495747266,
"grad_norm": 35.21328820959324,
"learning_rate": 0.00019004702525893732,
"loss": 1.1741,
"step": 153
},
{
"epoch": 0.37424058323207776,
"grad_norm": 90.32405227187036,
"learning_rate": 0.00018987157606698235,
"loss": 1.1844,
"step": 154
},
{
"epoch": 0.37667071688942894,
"grad_norm": 39.348755664527914,
"learning_rate": 0.000189694676407491,
"loss": 1.1216,
"step": 155
},
{
"epoch": 0.37910085054678005,
"grad_norm": 58.85540744859834,
"learning_rate": 0.00018951632913550626,
"loss": 1.115,
"step": 156
},
{
"epoch": 0.38153098420413123,
"grad_norm": 39.849945227365325,
"learning_rate": 0.0001893365371294346,
"loss": 1.1705,
"step": 157
},
{
"epoch": 0.3839611178614824,
"grad_norm": 40.300954908722304,
"learning_rate": 0.0001891553032909996,
"loss": 1.1831,
"step": 158
},
{
"epoch": 0.3863912515188335,
"grad_norm": 53.72009888405355,
"learning_rate": 0.00018897263054519498,
"loss": 1.1613,
"step": 159
},
{
"epoch": 0.3888213851761847,
"grad_norm": 142.22686975859034,
"learning_rate": 0.0001887885218402375,
"loss": 1.1639,
"step": 160
},
{
"epoch": 0.39125151883353587,
"grad_norm": 50.141889086717356,
"learning_rate": 0.00018860298014751944,
"loss": 1.1659,
"step": 161
},
{
"epoch": 0.393681652490887,
"grad_norm": 63.25519968311113,
"learning_rate": 0.0001884160084615604,
"loss": 1.168,
"step": 162
},
{
"epoch": 0.39611178614823817,
"grad_norm": 50.59325246324073,
"learning_rate": 0.0001882276097999592,
"loss": 1.1202,
"step": 163
},
{
"epoch": 0.3985419198055893,
"grad_norm": 58.32587879810431,
"learning_rate": 0.0001880377872033451,
"loss": 1.1587,
"step": 164
},
{
"epoch": 0.40097205346294046,
"grad_norm": 211.50882688314653,
"learning_rate": 0.00018784654373532866,
"loss": 1.1551,
"step": 165
},
{
"epoch": 0.40340218712029163,
"grad_norm": 47.82888424614203,
"learning_rate": 0.00018765388248245246,
"loss": 1.2274,
"step": 166
},
{
"epoch": 0.40583232077764275,
"grad_norm": 97.94922685274778,
"learning_rate": 0.00018745980655414114,
"loss": 1.0872,
"step": 167
},
{
"epoch": 0.4082624544349939,
"grad_norm": 44.74994721544976,
"learning_rate": 0.0001872643190826512,
"loss": 1.1244,
"step": 168
},
{
"epoch": 0.4106925880923451,
"grad_norm": 53.84692426866845,
"learning_rate": 0.00018706742322302064,
"loss": 1.1576,
"step": 169
},
{
"epoch": 0.4131227217496962,
"grad_norm": 54.43599132185614,
"learning_rate": 0.0001868691221530178,
"loss": 1.0957,
"step": 170
},
{
"epoch": 0.4155528554070474,
"grad_norm": 39.21766518089018,
"learning_rate": 0.00018666941907309026,
"loss": 1.1625,
"step": 171
},
{
"epoch": 0.41798298906439857,
"grad_norm": 49.40030697752548,
"learning_rate": 0.000186468317206313,
"loss": 1.1556,
"step": 172
},
{
"epoch": 0.4204131227217497,
"grad_norm": 101.50309647820374,
"learning_rate": 0.0001862658197983366,
"loss": 1.1687,
"step": 173
},
{
"epoch": 0.42284325637910086,
"grad_norm": 105.41233861946563,
"learning_rate": 0.0001860619301173347,
"loss": 1.1687,
"step": 174
},
{
"epoch": 0.425273390036452,
"grad_norm": 103.99749987770305,
"learning_rate": 0.0001858566514539513,
"loss": 1.144,
"step": 175
},
{
"epoch": 0.42770352369380316,
"grad_norm": 78.83490301242213,
"learning_rate": 0.0001856499871212477,
"loss": 1.2318,
"step": 176
},
{
"epoch": 0.43013365735115433,
"grad_norm": 62.325757489859335,
"learning_rate": 0.00018544194045464886,
"loss": 1.1092,
"step": 177
},
{
"epoch": 0.43256379100850545,
"grad_norm": 81.32804926878099,
"learning_rate": 0.00018523251481188986,
"loss": 1.2233,
"step": 178
},
{
"epoch": 0.4349939246658566,
"grad_norm": 38.97928032166606,
"learning_rate": 0.00018502171357296144,
"loss": 1.2371,
"step": 179
},
{
"epoch": 0.4374240583232078,
"grad_norm": 82.62345361244209,
"learning_rate": 0.0001848095401400555,
"loss": 1.1562,
"step": 180
},
{
"epoch": 0.4398541919805589,
"grad_norm": 47.793381366401626,
"learning_rate": 0.0001845959979375104,
"loss": 1.1249,
"step": 181
},
{
"epoch": 0.4422843256379101,
"grad_norm": 53.6022948471739,
"learning_rate": 0.00018438109041175532,
"loss": 1.1415,
"step": 182
},
{
"epoch": 0.44471445929526127,
"grad_norm": 65.92717051568573,
"learning_rate": 0.00018416482103125506,
"loss": 1.1748,
"step": 183
},
{
"epoch": 0.4471445929526124,
"grad_norm": 59.410481167619494,
"learning_rate": 0.0001839471932864537,
"loss": 1.1399,
"step": 184
},
{
"epoch": 0.44957472660996356,
"grad_norm": 64.22740395872977,
"learning_rate": 0.0001837282106897185,
"loss": 1.2193,
"step": 185
},
{
"epoch": 0.4520048602673147,
"grad_norm": 54.63497168787729,
"learning_rate": 0.00018350787677528306,
"loss": 1.153,
"step": 186
},
{
"epoch": 0.45443499392466585,
"grad_norm": 49.60676029637355,
"learning_rate": 0.00018328619509919044,
"loss": 1.1509,
"step": 187
},
{
"epoch": 0.456865127582017,
"grad_norm": 32.29074835877607,
"learning_rate": 0.00018306316923923563,
"loss": 1.1851,
"step": 188
},
{
"epoch": 0.45929526123936815,
"grad_norm": 61.13632454163589,
"learning_rate": 0.0001828388027949078,
"loss": 1.1323,
"step": 189
},
{
"epoch": 0.4617253948967193,
"grad_norm": 67.48617660835801,
"learning_rate": 0.00018261309938733238,
"loss": 1.1956,
"step": 190
},
{
"epoch": 0.4641555285540705,
"grad_norm": 38.31182257784929,
"learning_rate": 0.00018238606265921238,
"loss": 1.1379,
"step": 191
},
{
"epoch": 0.4665856622114216,
"grad_norm": 47.30995766708629,
"learning_rate": 0.00018215769627476984,
"loss": 1.1462,
"step": 192
},
{
"epoch": 0.4690157958687728,
"grad_norm": 34.57093925891121,
"learning_rate": 0.00018192800391968642,
"loss": 1.1979,
"step": 193
},
{
"epoch": 0.47144592952612396,
"grad_norm": 34.45645740457662,
"learning_rate": 0.0001816969893010442,
"loss": 1.1763,
"step": 194
},
{
"epoch": 0.4738760631834751,
"grad_norm": 39.21862152859671,
"learning_rate": 0.00018146465614726567,
"loss": 1.1514,
"step": 195
},
{
"epoch": 0.47630619684082626,
"grad_norm": 34.765347344568106,
"learning_rate": 0.00018123100820805355,
"loss": 1.1426,
"step": 196
},
{
"epoch": 0.4787363304981774,
"grad_norm": 35.04245362239315,
"learning_rate": 0.00018099604925433043,
"loss": 1.143,
"step": 197
},
{
"epoch": 0.48116646415552855,
"grad_norm": 103.45636476066032,
"learning_rate": 0.00018075978307817764,
"loss": 1.1713,
"step": 198
},
{
"epoch": 0.4835965978128797,
"grad_norm": 43.0297373660821,
"learning_rate": 0.00018052221349277442,
"loss": 1.2226,
"step": 199
},
{
"epoch": 0.48602673147023084,
"grad_norm": 32.80474372048966,
"learning_rate": 0.000180283344332336,
"loss": 1.1556,
"step": 200
},
{
"epoch": 0.488456865127582,
"grad_norm": 59.42688731224296,
"learning_rate": 0.00018004317945205197,
"loss": 1.1411,
"step": 201
},
{
"epoch": 0.4908869987849332,
"grad_norm": 102.0917822407188,
"learning_rate": 0.000179801722728024,
"loss": 1.1309,
"step": 202
},
{
"epoch": 0.4933171324422843,
"grad_norm": 309.9346821950787,
"learning_rate": 0.0001795589780572031,
"loss": 1.1953,
"step": 203
},
{
"epoch": 0.4957472660996355,
"grad_norm": 344.5019267346993,
"learning_rate": 0.0001793149493573271,
"loss": 1.1524,
"step": 204
},
{
"epoch": 0.49817739975698666,
"grad_norm": 50.075205946207085,
"learning_rate": 0.00017906964056685706,
"loss": 1.1495,
"step": 205
},
{
"epoch": 0.5006075334143378,
"grad_norm": 132.32227258331488,
"learning_rate": 0.00017882305564491396,
"loss": 1.1976,
"step": 206
},
{
"epoch": 0.5006075334143378,
"eval_loss": 1.146019458770752,
"eval_runtime": 52.7816,
"eval_samples_per_second": 14.096,
"eval_steps_per_second": 1.762,
"step": 206
},
{
"epoch": 0.503037667071689,
"grad_norm": 138.57200377669218,
"learning_rate": 0.00017857519857121458,
"loss": 1.2159,
"step": 207
},
{
"epoch": 0.5054678007290401,
"grad_norm": 268.41109734161546,
"learning_rate": 0.00017832607334600746,
"loss": 1.1748,
"step": 208
},
{
"epoch": 0.5078979343863913,
"grad_norm": 72.44153953442401,
"learning_rate": 0.00017807568399000822,
"loss": 1.1758,
"step": 209
},
{
"epoch": 0.5103280680437424,
"grad_norm": 97.75400124096738,
"learning_rate": 0.00017782403454433477,
"loss": 1.1004,
"step": 210
},
{
"epoch": 0.5127582017010935,
"grad_norm": 84.19522802756285,
"learning_rate": 0.000177571129070442,
"loss": 1.1397,
"step": 211
},
{
"epoch": 0.5151883353584447,
"grad_norm": 132.95081835535706,
"learning_rate": 0.00017731697165005618,
"loss": 1.146,
"step": 212
},
{
"epoch": 0.5176184690157959,
"grad_norm": 560.3351292126325,
"learning_rate": 0.0001770615663851093,
"loss": 1.1937,
"step": 213
},
{
"epoch": 0.520048602673147,
"grad_norm": 252.72862614645885,
"learning_rate": 0.0001768049173976727,
"loss": 1.1213,
"step": 214
},
{
"epoch": 0.5224787363304981,
"grad_norm": 356.2985211032981,
"learning_rate": 0.0001765470288298905,
"loss": 1.22,
"step": 215
},
{
"epoch": 0.5249088699878494,
"grad_norm": 952.600672502031,
"learning_rate": 0.00017628790484391284,
"loss": 1.1321,
"step": 216
},
{
"epoch": 0.5273390036452005,
"grad_norm": 289.9357041930161,
"learning_rate": 0.0001760275496218288,
"loss": 1.1688,
"step": 217
},
{
"epoch": 0.5297691373025516,
"grad_norm": 48.69445264741508,
"learning_rate": 0.0001757659673655986,
"loss": 1.1551,
"step": 218
},
{
"epoch": 0.5321992709599028,
"grad_norm": 40.15160247154335,
"learning_rate": 0.0001755031622969862,
"loss": 1.1459,
"step": 219
},
{
"epoch": 0.534629404617254,
"grad_norm": 44.59390817019205,
"learning_rate": 0.00017523913865749078,
"loss": 1.2012,
"step": 220
},
{
"epoch": 0.5370595382746051,
"grad_norm": 30.189717624412484,
"learning_rate": 0.00017497390070827848,
"loss": 1.15,
"step": 221
},
{
"epoch": 0.5394896719319563,
"grad_norm": 27.185608574176108,
"learning_rate": 0.00017470745273011362,
"loss": 1.0763,
"step": 222
},
{
"epoch": 0.5419198055893074,
"grad_norm": 99.44121390806423,
"learning_rate": 0.00017443979902328956,
"loss": 1.1478,
"step": 223
},
{
"epoch": 0.5443499392466585,
"grad_norm": 29.684499344634585,
"learning_rate": 0.00017417094390755934,
"loss": 1.1123,
"step": 224
},
{
"epoch": 0.5467800729040098,
"grad_norm": 26.788847114635054,
"learning_rate": 0.00017390089172206592,
"loss": 1.1169,
"step": 225
},
{
"epoch": 0.5492102065613609,
"grad_norm": 31.84817878214798,
"learning_rate": 0.00017362964682527218,
"loss": 1.1524,
"step": 226
},
{
"epoch": 0.551640340218712,
"grad_norm": 34.834632993822424,
"learning_rate": 0.00017335721359489057,
"loss": 1.1761,
"step": 227
},
{
"epoch": 0.5540704738760632,
"grad_norm": 66.6084234453716,
"learning_rate": 0.00017308359642781242,
"loss": 1.1175,
"step": 228
},
{
"epoch": 0.5565006075334143,
"grad_norm": 35.15720180142773,
"learning_rate": 0.00017280879974003707,
"loss": 1.2012,
"step": 229
},
{
"epoch": 0.5589307411907655,
"grad_norm": 35.975450782756226,
"learning_rate": 0.00017253282796660056,
"loss": 1.1801,
"step": 230
},
{
"epoch": 0.5613608748481167,
"grad_norm": 83.49050230764925,
"learning_rate": 0.0001722556855615039,
"loss": 1.1576,
"step": 231
},
{
"epoch": 0.5637910085054678,
"grad_norm": 150.44630441002784,
"learning_rate": 0.00017197737699764146,
"loss": 1.1826,
"step": 232
},
{
"epoch": 0.5662211421628189,
"grad_norm": 31.322382197739042,
"learning_rate": 0.00017169790676672858,
"loss": 1.1784,
"step": 233
},
{
"epoch": 0.56865127582017,
"grad_norm": 33.15983653687515,
"learning_rate": 0.0001714172793792291,
"loss": 1.1411,
"step": 234
},
{
"epoch": 0.5710814094775213,
"grad_norm": 22.206850165103052,
"learning_rate": 0.0001711354993642827,
"loss": 1.1772,
"step": 235
},
{
"epoch": 0.5735115431348724,
"grad_norm": 43.35721272668955,
"learning_rate": 0.00017085257126963152,
"loss": 1.0915,
"step": 236
},
{
"epoch": 0.5759416767922235,
"grad_norm": 29.57234737116712,
"learning_rate": 0.0001705684996615472,
"loss": 1.0977,
"step": 237
},
{
"epoch": 0.5783718104495748,
"grad_norm": 42.929644875053214,
"learning_rate": 0.00017028328912475668,
"loss": 1.1782,
"step": 238
},
{
"epoch": 0.5808019441069259,
"grad_norm": 32.15711272871687,
"learning_rate": 0.0001699969442623686,
"loss": 1.1855,
"step": 239
},
{
"epoch": 0.583232077764277,
"grad_norm": 43.64453730184205,
"learning_rate": 0.00016970946969579887,
"loss": 1.1171,
"step": 240
},
{
"epoch": 0.5856622114216282,
"grad_norm": 26.145541544112593,
"learning_rate": 0.00016942087006469592,
"loss": 1.1656,
"step": 241
},
{
"epoch": 0.5880923450789793,
"grad_norm": 53.98173886095731,
"learning_rate": 0.00016913115002686616,
"loss": 1.1378,
"step": 242
},
{
"epoch": 0.5905224787363305,
"grad_norm": 50.851193586801195,
"learning_rate": 0.00016884031425819853,
"loss": 1.1338,
"step": 243
},
{
"epoch": 0.5929526123936817,
"grad_norm": 30.166674036386443,
"learning_rate": 0.0001685483674525891,
"loss": 1.1732,
"step": 244
},
{
"epoch": 0.5953827460510328,
"grad_norm": 32.580505176392656,
"learning_rate": 0.00016825531432186543,
"loss": 1.143,
"step": 245
},
{
"epoch": 0.5978128797083839,
"grad_norm": 35.087231952662634,
"learning_rate": 0.0001679611595957103,
"loss": 1.212,
"step": 246
},
{
"epoch": 0.6002430133657352,
"grad_norm": 44.69578306542608,
"learning_rate": 0.00016766590802158566,
"loss": 1.1527,
"step": 247
},
{
"epoch": 0.6026731470230863,
"grad_norm": 39.8378839133733,
"learning_rate": 0.00016736956436465573,
"loss": 1.2174,
"step": 248
},
{
"epoch": 0.6051032806804374,
"grad_norm": 25.571860004032857,
"learning_rate": 0.0001670721334077103,
"loss": 1.1031,
"step": 249
},
{
"epoch": 0.6075334143377886,
"grad_norm": 27.626061413643438,
"learning_rate": 0.00016677361995108743,
"loss": 1.107,
"step": 250
},
{
"epoch": 0.6099635479951397,
"grad_norm": 47.405627339857176,
"learning_rate": 0.00016647402881259598,
"loss": 1.1521,
"step": 251
},
{
"epoch": 0.6123936816524909,
"grad_norm": 31.951762409660272,
"learning_rate": 0.00016617336482743794,
"loss": 1.174,
"step": 252
},
{
"epoch": 0.6148238153098421,
"grad_norm": 44.304437144236104,
"learning_rate": 0.00016587163284813032,
"loss": 1.1286,
"step": 253
},
{
"epoch": 0.6172539489671932,
"grad_norm": 21.990501251879344,
"learning_rate": 0.00016556883774442675,
"loss": 1.1927,
"step": 254
},
{
"epoch": 0.6196840826245443,
"grad_norm": 43.91119350789936,
"learning_rate": 0.00016526498440323914,
"loss": 1.1399,
"step": 255
},
{
"epoch": 0.6221142162818954,
"grad_norm": 28.064569132249982,
"learning_rate": 0.00016496007772855853,
"loss": 1.1913,
"step": 256
},
{
"epoch": 0.6245443499392467,
"grad_norm": 99.97142272243896,
"learning_rate": 0.0001646541226413761,
"loss": 1.1694,
"step": 257
},
{
"epoch": 0.6269744835965978,
"grad_norm": 27.12524206817854,
"learning_rate": 0.00016434712407960373,
"loss": 1.2398,
"step": 258
},
{
"epoch": 0.6294046172539489,
"grad_norm": 42.99171796479219,
"learning_rate": 0.00016403908699799425,
"loss": 1.145,
"step": 259
},
{
"epoch": 0.6318347509113001,
"grad_norm": 24.064938768293658,
"learning_rate": 0.00016373001636806153,
"loss": 1.098,
"step": 260
},
{
"epoch": 0.6342648845686513,
"grad_norm": 31.72232981247621,
"learning_rate": 0.00016341991717800023,
"loss": 1.1779,
"step": 261
},
{
"epoch": 0.6366950182260024,
"grad_norm": 39.97326887390835,
"learning_rate": 0.00016310879443260528,
"loss": 1.3142,
"step": 262
},
{
"epoch": 0.6391251518833536,
"grad_norm": 27.519208072826963,
"learning_rate": 0.00016279665315319114,
"loss": 1.2039,
"step": 263
},
{
"epoch": 0.6415552855407047,
"grad_norm": 52.94895557810481,
"learning_rate": 0.00016248349837751062,
"loss": 1.1718,
"step": 264
},
{
"epoch": 0.6439854191980559,
"grad_norm": 23.603047222747566,
"learning_rate": 0.0001621693351596739,
"loss": 1.1155,
"step": 265
},
{
"epoch": 0.6464155528554071,
"grad_norm": 21.400341520569807,
"learning_rate": 0.00016185416857006647,
"loss": 1.1242,
"step": 266
},
{
"epoch": 0.6488456865127582,
"grad_norm": 51.167335508822276,
"learning_rate": 0.00016153800369526788,
"loss": 1.1746,
"step": 267
},
{
"epoch": 0.6512758201701093,
"grad_norm": 26.219581065473573,
"learning_rate": 0.00016122084563796905,
"loss": 1.0836,
"step": 268
},
{
"epoch": 0.6537059538274606,
"grad_norm": 56.820249886600706,
"learning_rate": 0.0001609026995168904,
"loss": 1.1625,
"step": 269
},
{
"epoch": 0.6561360874848117,
"grad_norm": 37.43384869992443,
"learning_rate": 0.00016058357046669898,
"loss": 1.2143,
"step": 270
},
{
"epoch": 0.6585662211421628,
"grad_norm": 31.885237168871473,
"learning_rate": 0.00016026346363792567,
"loss": 1.1536,
"step": 271
},
{
"epoch": 0.660996354799514,
"grad_norm": 34.66147983279251,
"learning_rate": 0.00015994238419688199,
"loss": 1.2095,
"step": 272
},
{
"epoch": 0.6634264884568651,
"grad_norm": 86.90365354594917,
"learning_rate": 0.00015962033732557686,
"loss": 1.1149,
"step": 273
},
{
"epoch": 0.6658566221142163,
"grad_norm": 52.21177462889067,
"learning_rate": 0.00015929732822163287,
"loss": 1.1861,
"step": 274
},
{
"epoch": 0.6682867557715675,
"grad_norm": 92.11184701145604,
"learning_rate": 0.00015897336209820239,
"loss": 1.1853,
"step": 275
},
{
"epoch": 0.6707168894289186,
"grad_norm": 30.662475573811115,
"learning_rate": 0.00015864844418388342,
"loss": 1.0912,
"step": 276
},
{
"epoch": 0.6731470230862697,
"grad_norm": 26.15855468837027,
"learning_rate": 0.00015832257972263523,
"loss": 1.1618,
"step": 277
},
{
"epoch": 0.675577156743621,
"grad_norm": 41.14250673970726,
"learning_rate": 0.00015799577397369375,
"loss": 1.1499,
"step": 278
},
{
"epoch": 0.6780072904009721,
"grad_norm": 31.93253644773631,
"learning_rate": 0.00015766803221148673,
"loss": 1.1229,
"step": 279
},
{
"epoch": 0.6804374240583232,
"grad_norm": 39.87120131585165,
"learning_rate": 0.00015733935972554844,
"loss": 1.1647,
"step": 280
},
{
"epoch": 0.6828675577156743,
"grad_norm": 52.741654062271124,
"learning_rate": 0.0001570097618204345,
"loss": 1.1362,
"step": 281
},
{
"epoch": 0.6852976913730255,
"grad_norm": 33.13137686002526,
"learning_rate": 0.0001566792438156362,
"loss": 1.1825,
"step": 282
},
{
"epoch": 0.6877278250303767,
"grad_norm": 20.284041564566042,
"learning_rate": 0.00015634781104549442,
"loss": 1.1439,
"step": 283
},
{
"epoch": 0.6901579586877278,
"grad_norm": 164.9222932471453,
"learning_rate": 0.00015601546885911404,
"loss": 1.122,
"step": 284
},
{
"epoch": 0.692588092345079,
"grad_norm": 27.092346730158148,
"learning_rate": 0.00015568222262027717,
"loss": 1.157,
"step": 285
},
{
"epoch": 0.6950182260024301,
"grad_norm": 39.46898996008012,
"learning_rate": 0.00015534807770735664,
"loss": 1.1092,
"step": 286
},
{
"epoch": 0.6974483596597812,
"grad_norm": 30.00942949300714,
"learning_rate": 0.00015501303951322943,
"loss": 1.243,
"step": 287
},
{
"epoch": 0.6998784933171325,
"grad_norm": 31.435817418038887,
"learning_rate": 0.00015467711344518942,
"loss": 1.1034,
"step": 288
},
{
"epoch": 0.7023086269744836,
"grad_norm": 54.53572773177548,
"learning_rate": 0.00015434030492486023,
"loss": 1.2216,
"step": 289
},
{
"epoch": 0.7047387606318347,
"grad_norm": 24.51082708234768,
"learning_rate": 0.00015400261938810757,
"loss": 1.1532,
"step": 290
},
{
"epoch": 0.707168894289186,
"grad_norm": 104.85480514443172,
"learning_rate": 0.00015366406228495172,
"loss": 1.1156,
"step": 291
},
{
"epoch": 0.7095990279465371,
"grad_norm": 26.398830117870997,
"learning_rate": 0.0001533246390794794,
"loss": 1.0934,
"step": 292
},
{
"epoch": 0.7120291616038882,
"grad_norm": 25.062392373037707,
"learning_rate": 0.00015298435524975572,
"loss": 1.1453,
"step": 293
},
{
"epoch": 0.7144592952612394,
"grad_norm": 25.385505352027444,
"learning_rate": 0.0001526432162877356,
"loss": 1.1359,
"step": 294
},
{
"epoch": 0.7168894289185905,
"grad_norm": 18.00146943000571,
"learning_rate": 0.00015230122769917527,
"loss": 1.1129,
"step": 295
},
{
"epoch": 0.7193195625759417,
"grad_norm": 22.55383473288135,
"learning_rate": 0.00015195839500354335,
"loss": 1.142,
"step": 296
},
{
"epoch": 0.7217496962332929,
"grad_norm": 30.013723395820165,
"learning_rate": 0.00015161472373393186,
"loss": 1.1379,
"step": 297
},
{
"epoch": 0.724179829890644,
"grad_norm": 40.566201545240425,
"learning_rate": 0.0001512702194369668,
"loss": 1.1326,
"step": 298
},
{
"epoch": 0.7266099635479951,
"grad_norm": 27.34716639907029,
"learning_rate": 0.00015092488767271857,
"loss": 1.0782,
"step": 299
},
{
"epoch": 0.7290400972053463,
"grad_norm": 45.0837594669075,
"learning_rate": 0.00015057873401461253,
"loss": 1.2054,
"step": 300
},
{
"epoch": 0.7314702308626975,
"grad_norm": 22.39794101270309,
"learning_rate": 0.00015023176404933874,
"loss": 1.1052,
"step": 301
},
{
"epoch": 0.7339003645200486,
"grad_norm": 21.818512025585306,
"learning_rate": 0.00014988398337676198,
"loss": 1.1664,
"step": 302
},
{
"epoch": 0.7363304981773997,
"grad_norm": 33.09386163968815,
"learning_rate": 0.00014953539760983122,
"loss": 1.1364,
"step": 303
},
{
"epoch": 0.7387606318347509,
"grad_norm": 26.3253592215911,
"learning_rate": 0.00014918601237448923,
"loss": 1.1093,
"step": 304
},
{
"epoch": 0.741190765492102,
"grad_norm": 32.54878723405212,
"learning_rate": 0.0001488358333095816,
"loss": 1.182,
"step": 305
},
{
"epoch": 0.7436208991494532,
"grad_norm": 28.645473311846015,
"learning_rate": 0.0001484848660667658,
"loss": 1.2064,
"step": 306
},
{
"epoch": 0.7460510328068044,
"grad_norm": 29.02693042820854,
"learning_rate": 0.00014813311631041995,
"loss": 1.1545,
"step": 307
},
{
"epoch": 0.7484811664641555,
"grad_norm": 20.28193033099828,
"learning_rate": 0.00014778058971755154,
"loss": 1.1885,
"step": 308
},
{
"epoch": 0.7509113001215066,
"grad_norm": 121.86121371804961,
"learning_rate": 0.00014742729197770552,
"loss": 1.095,
"step": 309
},
{
"epoch": 0.7509113001215066,
"eval_loss": 1.133868932723999,
"eval_runtime": 52.6711,
"eval_samples_per_second": 14.125,
"eval_steps_per_second": 1.766,
"step": 309
},
{
"epoch": 0.7533414337788579,
"grad_norm": 50.1793074315811,
"learning_rate": 0.00014707322879287276,
"loss": 1.1679,
"step": 310
},
{
"epoch": 0.755771567436209,
"grad_norm": 31.791309498678103,
"learning_rate": 0.00014671840587739783,
"loss": 1.1277,
"step": 311
},
{
"epoch": 0.7582017010935601,
"grad_norm": 56.88911226488106,
"learning_rate": 0.00014636282895788688,
"loss": 1.1492,
"step": 312
},
{
"epoch": 0.7606318347509113,
"grad_norm": 117.29437608667352,
"learning_rate": 0.00014600650377311522,
"loss": 1.1123,
"step": 313
},
{
"epoch": 0.7630619684082625,
"grad_norm": 107.56728772749254,
"learning_rate": 0.00014564943607393459,
"loss": 1.171,
"step": 314
},
{
"epoch": 0.7654921020656136,
"grad_norm": 34.085830256919685,
"learning_rate": 0.0001452916316231805,
"loss": 1.1854,
"step": 315
},
{
"epoch": 0.7679222357229648,
"grad_norm": 23.625747202851176,
"learning_rate": 0.000144933096195579,
"loss": 1.1622,
"step": 316
},
{
"epoch": 0.7703523693803159,
"grad_norm": 56.9917185309248,
"learning_rate": 0.00014457383557765386,
"loss": 1.2037,
"step": 317
},
{
"epoch": 0.772782503037667,
"grad_norm": 34.55554043725056,
"learning_rate": 0.00014421385556763266,
"loss": 1.1273,
"step": 318
},
{
"epoch": 0.7752126366950183,
"grad_norm": 34.205286759913115,
"learning_rate": 0.00014385316197535372,
"loss": 1.2039,
"step": 319
},
{
"epoch": 0.7776427703523694,
"grad_norm": 27.30015395778206,
"learning_rate": 0.00014349176062217195,
"loss": 1.1903,
"step": 320
},
{
"epoch": 0.7800729040097205,
"grad_norm": 23.077745147127867,
"learning_rate": 0.00014312965734086518,
"loss": 1.1539,
"step": 321
},
{
"epoch": 0.7825030376670717,
"grad_norm": 26.22112568156326,
"learning_rate": 0.00014276685797553977,
"loss": 1.1807,
"step": 322
},
{
"epoch": 0.7849331713244229,
"grad_norm": 34.813719314948514,
"learning_rate": 0.0001424033683815365,
"loss": 1.1247,
"step": 323
},
{
"epoch": 0.787363304981774,
"grad_norm": 27.109609629038324,
"learning_rate": 0.00014203919442533597,
"loss": 1.1735,
"step": 324
},
{
"epoch": 0.7897934386391251,
"grad_norm": 144.91672798575476,
"learning_rate": 0.00014167434198446383,
"loss": 1.1007,
"step": 325
},
{
"epoch": 0.7922235722964763,
"grad_norm": 42.19042828736382,
"learning_rate": 0.00014130881694739616,
"loss": 1.1398,
"step": 326
},
{
"epoch": 0.7946537059538274,
"grad_norm": 43.00144921766715,
"learning_rate": 0.00014094262521346427,
"loss": 1.1712,
"step": 327
},
{
"epoch": 0.7970838396111786,
"grad_norm": 26.343159670729925,
"learning_rate": 0.0001405757726927595,
"loss": 1.2103,
"step": 328
},
{
"epoch": 0.7995139732685298,
"grad_norm": 31.68271222195729,
"learning_rate": 0.00014020826530603776,
"loss": 1.1578,
"step": 329
},
{
"epoch": 0.8019441069258809,
"grad_norm": 39.08920292536896,
"learning_rate": 0.00013984010898462416,
"loss": 1.1377,
"step": 330
},
{
"epoch": 0.804374240583232,
"grad_norm": 34.56898084569197,
"learning_rate": 0.00013947130967031717,
"loss": 1.1886,
"step": 331
},
{
"epoch": 0.8068043742405833,
"grad_norm": 42.016356369933895,
"learning_rate": 0.00013910187331529276,
"loss": 1.1577,
"step": 332
},
{
"epoch": 0.8092345078979344,
"grad_norm": 21.25953597879822,
"learning_rate": 0.00013873180588200827,
"loss": 1.1259,
"step": 333
},
{
"epoch": 0.8116646415552855,
"grad_norm": 39.49634140985428,
"learning_rate": 0.0001383611133431062,
"loss": 1.173,
"step": 334
},
{
"epoch": 0.8140947752126367,
"grad_norm": 29.837690582268863,
"learning_rate": 0.00013798980168131794,
"loss": 1.1322,
"step": 335
},
{
"epoch": 0.8165249088699879,
"grad_norm": 23.510451396240928,
"learning_rate": 0.000137617876889367,
"loss": 1.1392,
"step": 336
},
{
"epoch": 0.818955042527339,
"grad_norm": 19.183017199526635,
"learning_rate": 0.00013724534496987247,
"loss": 1.157,
"step": 337
},
{
"epoch": 0.8213851761846902,
"grad_norm": 51.85037647612581,
"learning_rate": 0.0001368722119352521,
"loss": 1.1255,
"step": 338
},
{
"epoch": 0.8238153098420413,
"grad_norm": 31.635699477838273,
"learning_rate": 0.00013649848380762513,
"loss": 1.1429,
"step": 339
},
{
"epoch": 0.8262454434993924,
"grad_norm": 39.6479124739029,
"learning_rate": 0.00013612416661871533,
"loss": 1.1609,
"step": 340
},
{
"epoch": 0.8286755771567437,
"grad_norm": 21.453228401011238,
"learning_rate": 0.0001357492664097534,
"loss": 1.1247,
"step": 341
},
{
"epoch": 0.8311057108140948,
"grad_norm": 28.514958428145494,
"learning_rate": 0.00013537378923137973,
"loss": 1.0845,
"step": 342
},
{
"epoch": 0.8335358444714459,
"grad_norm": 26.98663985253516,
"learning_rate": 0.00013499774114354655,
"loss": 1.1092,
"step": 343
},
{
"epoch": 0.8359659781287971,
"grad_norm": 30.76143424141064,
"learning_rate": 0.00013462112821542016,
"loss": 1.1759,
"step": 344
},
{
"epoch": 0.8383961117861483,
"grad_norm": 39.023771167108656,
"learning_rate": 0.0001342439565252831,
"loss": 1.1024,
"step": 345
},
{
"epoch": 0.8408262454434994,
"grad_norm": 29.787639099820225,
"learning_rate": 0.0001338662321604358,
"loss": 1.2141,
"step": 346
},
{
"epoch": 0.8432563791008505,
"grad_norm": 25.60634301240642,
"learning_rate": 0.00013348796121709862,
"loss": 1.1244,
"step": 347
},
{
"epoch": 0.8456865127582017,
"grad_norm": 76.98542857181108,
"learning_rate": 0.00013310914980031334,
"loss": 1.19,
"step": 348
},
{
"epoch": 0.8481166464155528,
"grad_norm": 110.28982985071892,
"learning_rate": 0.0001327298040238446,
"loss": 1.1295,
"step": 349
},
{
"epoch": 0.850546780072904,
"grad_norm": 22.610631125609732,
"learning_rate": 0.0001323499300100811,
"loss": 1.1445,
"step": 350
},
{
"epoch": 0.8529769137302552,
"grad_norm": 29.958515973723888,
"learning_rate": 0.00013196953388993726,
"loss": 1.2048,
"step": 351
},
{
"epoch": 0.8554070473876063,
"grad_norm": 30.691798031468103,
"learning_rate": 0.00013158862180275363,
"loss": 1.1628,
"step": 352
},
{
"epoch": 0.8578371810449574,
"grad_norm": 28.568576369680258,
"learning_rate": 0.00013120719989619833,
"loss": 1.0899,
"step": 353
},
{
"epoch": 0.8602673147023087,
"grad_norm": 42.12623456189728,
"learning_rate": 0.0001308252743261675,
"loss": 1.1451,
"step": 354
},
{
"epoch": 0.8626974483596598,
"grad_norm": 112.39248005736448,
"learning_rate": 0.00013044285125668614,
"loss": 1.154,
"step": 355
},
{
"epoch": 0.8651275820170109,
"grad_norm": 28.013602355549782,
"learning_rate": 0.0001300599368598086,
"loss": 1.1937,
"step": 356
},
{
"epoch": 0.8675577156743621,
"grad_norm": 27.763517972300694,
"learning_rate": 0.0001296765373155188,
"loss": 1.1243,
"step": 357
},
{
"epoch": 0.8699878493317132,
"grad_norm": 112.85815824767063,
"learning_rate": 0.0001292926588116308,
"loss": 1.1595,
"step": 358
},
{
"epoch": 0.8724179829890644,
"grad_norm": 27.085127886556087,
"learning_rate": 0.00012890830754368855,
"loss": 1.1196,
"step": 359
},
{
"epoch": 0.8748481166464156,
"grad_norm": 31.56336829128541,
"learning_rate": 0.00012852348971486617,
"loss": 1.1231,
"step": 360
},
{
"epoch": 0.8772782503037667,
"grad_norm": 31.904393738907178,
"learning_rate": 0.0001281382115358679,
"loss": 1.097,
"step": 361
},
{
"epoch": 0.8797083839611178,
"grad_norm": 25.034453894065827,
"learning_rate": 0.00012775247922482748,
"loss": 1.1246,
"step": 362
},
{
"epoch": 0.8821385176184691,
"grad_norm": 33.221958266501474,
"learning_rate": 0.0001273662990072083,
"loss": 1.1189,
"step": 363
},
{
"epoch": 0.8845686512758202,
"grad_norm": 26.638980136773224,
"learning_rate": 0.00012697967711570242,
"loss": 1.1315,
"step": 364
},
{
"epoch": 0.8869987849331713,
"grad_norm": 27.231479341362885,
"learning_rate": 0.00012659261979013043,
"loss": 1.1464,
"step": 365
},
{
"epoch": 0.8894289185905225,
"grad_norm": 19.654091006710207,
"learning_rate": 0.0001262051332773404,
"loss": 1.1271,
"step": 366
},
{
"epoch": 0.8918590522478737,
"grad_norm": 50.3934263865559,
"learning_rate": 0.00012581722383110718,
"loss": 1.1002,
"step": 367
},
{
"epoch": 0.8942891859052248,
"grad_norm": 20.25952031318632,
"learning_rate": 0.00012542889771203166,
"loss": 1.0629,
"step": 368
},
{
"epoch": 0.8967193195625759,
"grad_norm": 19.16914945262315,
"learning_rate": 0.00012504016118743935,
"loss": 1.1597,
"step": 369
},
{
"epoch": 0.8991494532199271,
"grad_norm": 35.65941460173898,
"learning_rate": 0.00012465102053127957,
"loss": 1.1501,
"step": 370
},
{
"epoch": 0.9015795868772782,
"grad_norm": 26.093269180565315,
"learning_rate": 0.00012426148202402404,
"loss": 1.1455,
"step": 371
},
{
"epoch": 0.9040097205346294,
"grad_norm": 30.928987547424892,
"learning_rate": 0.00012387155195256537,
"loss": 1.1392,
"step": 372
},
{
"epoch": 0.9064398541919806,
"grad_norm": 20.17512596846915,
"learning_rate": 0.00012348123661011601,
"loss": 1.1196,
"step": 373
},
{
"epoch": 0.9088699878493317,
"grad_norm": 24.380789157356805,
"learning_rate": 0.00012309054229610623,
"loss": 1.1,
"step": 374
},
{
"epoch": 0.9113001215066828,
"grad_norm": 95.49408387682203,
"learning_rate": 0.00012269947531608276,
"loss": 1.1825,
"step": 375
},
{
"epoch": 0.913730255164034,
"grad_norm": 23.635286340368726,
"learning_rate": 0.0001223080419816069,
"loss": 1.1717,
"step": 376
},
{
"epoch": 0.9161603888213852,
"grad_norm": 21.942478063568313,
"learning_rate": 0.00012191624861015254,
"loss": 1.1661,
"step": 377
},
{
"epoch": 0.9185905224787363,
"grad_norm": 74.12601397150299,
"learning_rate": 0.00012152410152500453,
"loss": 1.1967,
"step": 378
},
{
"epoch": 0.9210206561360875,
"grad_norm": 37.26720386499629,
"learning_rate": 0.00012113160705515625,
"loss": 1.1566,
"step": 379
},
{
"epoch": 0.9234507897934386,
"grad_norm": 34.080854733427635,
"learning_rate": 0.00012073877153520776,
"loss": 1.0847,
"step": 380
},
{
"epoch": 0.9258809234507898,
"grad_norm": 26.50842916877183,
"learning_rate": 0.0001203456013052634,
"loss": 1.0824,
"step": 381
},
{
"epoch": 0.928311057108141,
"grad_norm": 37.92039651416441,
"learning_rate": 0.00011995210271082944,
"loss": 1.1485,
"step": 382
},
{
"epoch": 0.9307411907654921,
"grad_norm": 38.56931832374284,
"learning_rate": 0.00011955828210271187,
"loss": 1.0737,
"step": 383
},
{
"epoch": 0.9331713244228432,
"grad_norm": 24.419015296791592,
"learning_rate": 0.0001191641458369136,
"loss": 1.1208,
"step": 384
},
{
"epoch": 0.9356014580801945,
"grad_norm": 28.75379656643836,
"learning_rate": 0.00011876970027453222,
"loss": 1.1071,
"step": 385
},
{
"epoch": 0.9380315917375456,
"grad_norm": 138.39305133994282,
"learning_rate": 0.00011837495178165706,
"loss": 1.1405,
"step": 386
},
{
"epoch": 0.9404617253948967,
"grad_norm": 22.200435229928654,
"learning_rate": 0.00011797990672926652,
"loss": 1.124,
"step": 387
},
{
"epoch": 0.9428918590522479,
"grad_norm": 40.21978055156661,
"learning_rate": 0.00011758457149312538,
"loss": 1.1875,
"step": 388
},
{
"epoch": 0.945321992709599,
"grad_norm": 23.592672098002485,
"learning_rate": 0.00011718895245368167,
"loss": 1.1748,
"step": 389
},
{
"epoch": 0.9477521263669502,
"grad_norm": 17.463183827323444,
"learning_rate": 0.00011679305599596393,
"loss": 1.1794,
"step": 390
},
{
"epoch": 0.9501822600243013,
"grad_norm": 36.219441964332646,
"learning_rate": 0.00011639688850947799,
"loss": 1.1459,
"step": 391
},
{
"epoch": 0.9526123936816525,
"grad_norm": 23.727472560980413,
"learning_rate": 0.00011600045638810386,
"loss": 1.076,
"step": 392
},
{
"epoch": 0.9550425273390036,
"grad_norm": 57.63284414960702,
"learning_rate": 0.00011560376602999272,
"loss": 1.1919,
"step": 393
},
{
"epoch": 0.9574726609963548,
"grad_norm": 40.23829998466358,
"learning_rate": 0.00011520682383746333,
"loss": 1.0701,
"step": 394
},
{
"epoch": 0.959902794653706,
"grad_norm": 58.2018640218209,
"learning_rate": 0.00011480963621689905,
"loss": 1.1745,
"step": 395
},
{
"epoch": 0.9623329283110571,
"grad_norm": 27.693448904288406,
"learning_rate": 0.00011441220957864421,
"loss": 1.1323,
"step": 396
},
{
"epoch": 0.9647630619684082,
"grad_norm": 34.94430005820724,
"learning_rate": 0.00011401455033690076,
"loss": 1.1497,
"step": 397
},
{
"epoch": 0.9671931956257594,
"grad_norm": 17.521922247865188,
"learning_rate": 0.00011361666490962468,
"loss": 1.1319,
"step": 398
},
{
"epoch": 0.9696233292831106,
"grad_norm": 25.886687159935246,
"learning_rate": 0.00011321855971842243,
"loss": 1.1418,
"step": 399
},
{
"epoch": 0.9720534629404617,
"grad_norm": 31.388154506614836,
"learning_rate": 0.00011282024118844738,
"loss": 1.1282,
"step": 400
},
{
"epoch": 0.9744835965978129,
"grad_norm": 27.458601253675347,
"learning_rate": 0.00011242171574829599,
"loss": 1.1647,
"step": 401
},
{
"epoch": 0.976913730255164,
"grad_norm": 25.922873022924257,
"learning_rate": 0.00011202298982990411,
"loss": 1.091,
"step": 402
},
{
"epoch": 0.9793438639125152,
"grad_norm": 20.129467589894766,
"learning_rate": 0.00011162406986844323,
"loss": 1.2,
"step": 403
},
{
"epoch": 0.9817739975698664,
"grad_norm": 25.11892123906363,
"learning_rate": 0.00011122496230221645,
"loss": 1.0731,
"step": 404
},
{
"epoch": 0.9842041312272175,
"grad_norm": 26.416884392453543,
"learning_rate": 0.00011082567357255484,
"loss": 1.1836,
"step": 405
},
{
"epoch": 0.9866342648845686,
"grad_norm": 18.768078773975784,
"learning_rate": 0.00011042621012371322,
"loss": 1.1275,
"step": 406
},
{
"epoch": 0.9890643985419199,
"grad_norm": 22.275756523796257,
"learning_rate": 0.00011002657840276627,
"loss": 1.1228,
"step": 407
},
{
"epoch": 0.991494532199271,
"grad_norm": 29.605335344828575,
"learning_rate": 0.00010962678485950455,
"loss": 1.0255,
"step": 408
},
{
"epoch": 0.9939246658566221,
"grad_norm": 41.1718200727633,
"learning_rate": 0.00010922683594633021,
"loss": 1.1876,
"step": 409
},
{
"epoch": 0.9963547995139733,
"grad_norm": 20.46397475257922,
"learning_rate": 0.00010882673811815304,
"loss": 1.1168,
"step": 410
},
{
"epoch": 0.9987849331713244,
"grad_norm": 21.084924025016928,
"learning_rate": 0.00010842649783228624,
"loss": 1.1948,
"step": 411
},
{
"epoch": 1.0,
"grad_norm": 21.084924025016928,
"learning_rate": 0.00010802612154834211,
"loss": 1.1076,
"step": 412
},
{
"epoch": 1.0,
"eval_loss": 1.121336579322815,
"eval_runtime": 52.7043,
"eval_samples_per_second": 14.116,
"eval_steps_per_second": 1.765,
"step": 412
},
{
"epoch": 1.0024301336573511,
"grad_norm": 35.25758968935371,
"learning_rate": 0.00010762561572812788,
"loss": 1.1335,
"step": 413
},
{
"epoch": 1.0048602673147022,
"grad_norm": 20.78715726366623,
"learning_rate": 0.0001072249868355415,
"loss": 1.1003,
"step": 414
},
{
"epoch": 1.0072904009720534,
"grad_norm": 31.01116633763719,
"learning_rate": 0.0001068242413364671,
"loss": 1.1225,
"step": 415
},
{
"epoch": 1.0097205346294047,
"grad_norm": 19.050638172672897,
"learning_rate": 0.00010642338569867086,
"loss": 1.0595,
"step": 416
},
{
"epoch": 1.0121506682867558,
"grad_norm": 41.54235389574412,
"learning_rate": 0.00010602242639169648,
"loss": 1.1719,
"step": 417
},
{
"epoch": 1.014580801944107,
"grad_norm": 41.34218206464363,
"learning_rate": 0.00010562136988676078,
"loss": 1.1292,
"step": 418
},
{
"epoch": 1.017010935601458,
"grad_norm": 32.436985934581934,
"learning_rate": 0.0001052202226566494,
"loss": 1.1244,
"step": 419
},
{
"epoch": 1.0194410692588092,
"grad_norm": 19.631825450596665,
"learning_rate": 0.0001048189911756121,
"loss": 1.1323,
"step": 420
},
{
"epoch": 1.0218712029161603,
"grad_norm": 23.275029440216805,
"learning_rate": 0.00010441768191925847,
"loss": 1.1605,
"step": 421
},
{
"epoch": 1.0243013365735116,
"grad_norm": 21.44161988455765,
"learning_rate": 0.0001040163013644533,
"loss": 1.0886,
"step": 422
},
{
"epoch": 1.0267314702308628,
"grad_norm": 31.9765167465431,
"learning_rate": 0.00010361485598921212,
"loss": 1.1378,
"step": 423
},
{
"epoch": 1.0291616038882139,
"grad_norm": 22.340741556027833,
"learning_rate": 0.00010321335227259661,
"loss": 1.1278,
"step": 424
},
{
"epoch": 1.031591737545565,
"grad_norm": 29.27286563037163,
"learning_rate": 0.00010281179669461005,
"loss": 1.1186,
"step": 425
},
{
"epoch": 1.034021871202916,
"grad_norm": 65.85877610734141,
"learning_rate": 0.00010241019573609269,
"loss": 1.1673,
"step": 426
},
{
"epoch": 1.0364520048602672,
"grad_norm": 35.173784527846884,
"learning_rate": 0.00010200855587861724,
"loss": 1.0903,
"step": 427
},
{
"epoch": 1.0388821385176186,
"grad_norm": 29.91546238299385,
"learning_rate": 0.00010160688360438419,
"loss": 1.0884,
"step": 428
},
{
"epoch": 1.0413122721749697,
"grad_norm": 26.873308685100223,
"learning_rate": 0.0001012051853961172,
"loss": 1.1296,
"step": 429
},
{
"epoch": 1.0437424058323208,
"grad_norm": 25.90622275527891,
"learning_rate": 0.00010080346773695853,
"loss": 1.1349,
"step": 430
},
{
"epoch": 1.046172539489672,
"grad_norm": 21.388851321680434,
"learning_rate": 0.00010040173711036431,
"loss": 1.0947,
"step": 431
},
{
"epoch": 1.048602673147023,
"grad_norm": 31.206506843880053,
"learning_rate": 0.0001,
"loss": 1.1541,
"step": 432
},
{
"epoch": 1.0510328068043742,
"grad_norm": 19.486767323523555,
"learning_rate": 9.959826288963571e-05,
"loss": 1.1574,
"step": 433
},
{
"epoch": 1.0534629404617255,
"grad_norm": 102.81325604770561,
"learning_rate": 9.919653226304148e-05,
"loss": 1.1762,
"step": 434
},
{
"epoch": 1.0558930741190766,
"grad_norm": 17.18170280255333,
"learning_rate": 9.879481460388282e-05,
"loss": 1.1208,
"step": 435
},
{
"epoch": 1.0583232077764277,
"grad_norm": 29.88292309614927,
"learning_rate": 9.839311639561583e-05,
"loss": 1.1114,
"step": 436
},
{
"epoch": 1.0607533414337789,
"grad_norm": 23.50392429976475,
"learning_rate": 9.799144412138275e-05,
"loss": 1.2026,
"step": 437
},
{
"epoch": 1.06318347509113,
"grad_norm": 24.794408487434744,
"learning_rate": 9.758980426390732e-05,
"loss": 1.1587,
"step": 438
},
{
"epoch": 1.065613608748481,
"grad_norm": 38.726295800289655,
"learning_rate": 9.718820330538998e-05,
"loss": 1.14,
"step": 439
},
{
"epoch": 1.0680437424058322,
"grad_norm": 31.152256057732977,
"learning_rate": 9.678664772740343e-05,
"loss": 1.0882,
"step": 440
},
{
"epoch": 1.0704738760631836,
"grad_norm": 65.73380095432839,
"learning_rate": 9.638514401078788e-05,
"loss": 1.1213,
"step": 441
},
{
"epoch": 1.0729040097205347,
"grad_norm": 69.07317297910537,
"learning_rate": 9.598369863554673e-05,
"loss": 1.1285,
"step": 442
},
{
"epoch": 1.0753341433778858,
"grad_norm": 62.55969576940585,
"learning_rate": 9.558231808074156e-05,
"loss": 1.1252,
"step": 443
},
{
"epoch": 1.077764277035237,
"grad_norm": 26.35106444530265,
"learning_rate": 9.51810088243879e-05,
"loss": 1.108,
"step": 444
},
{
"epoch": 1.080194410692588,
"grad_norm": 76.70006955440516,
"learning_rate": 9.477977734335061e-05,
"loss": 1.1144,
"step": 445
},
{
"epoch": 1.0826245443499392,
"grad_norm": 22.376983523395264,
"learning_rate": 9.437863011323922e-05,
"loss": 1.173,
"step": 446
},
{
"epoch": 1.0850546780072905,
"grad_norm": 33.51322062360491,
"learning_rate": 9.397757360830353e-05,
"loss": 1.089,
"step": 447
},
{
"epoch": 1.0874848116646416,
"grad_norm": 24.87252097324779,
"learning_rate": 9.357661430132915e-05,
"loss": 1.098,
"step": 448
},
{
"epoch": 1.0899149453219927,
"grad_norm": 48.95371674408058,
"learning_rate": 9.317575866353292e-05,
"loss": 1.0491,
"step": 449
},
{
"epoch": 1.0923450789793439,
"grad_norm": 25.50740340531524,
"learning_rate": 9.277501316445854e-05,
"loss": 1.0939,
"step": 450
},
{
"epoch": 1.094775212636695,
"grad_norm": 27.60998778610316,
"learning_rate": 9.23743842718721e-05,
"loss": 1.1564,
"step": 451
},
{
"epoch": 1.097205346294046,
"grad_norm": 63.99226186124907,
"learning_rate": 9.197387845165793e-05,
"loss": 1.1088,
"step": 452
},
{
"epoch": 1.0996354799513974,
"grad_norm": 36.441157466567596,
"learning_rate": 9.157350216771378e-05,
"loss": 1.0897,
"step": 453
},
{
"epoch": 1.1020656136087486,
"grad_norm": 32.32587774153429,
"learning_rate": 9.117326188184695e-05,
"loss": 1.1285,
"step": 454
},
{
"epoch": 1.1044957472660997,
"grad_norm": 33.39257750037465,
"learning_rate": 9.077316405366981e-05,
"loss": 1.1568,
"step": 455
},
{
"epoch": 1.1069258809234508,
"grad_norm": 45.03485873480868,
"learning_rate": 9.037321514049548e-05,
"loss": 1.0791,
"step": 456
},
{
"epoch": 1.109356014580802,
"grad_norm": 35.1451377482015,
"learning_rate": 8.997342159723371e-05,
"loss": 1.1243,
"step": 457
},
{
"epoch": 1.111786148238153,
"grad_norm": 67.01465976966,
"learning_rate": 8.957378987628682e-05,
"loss": 1.0978,
"step": 458
},
{
"epoch": 1.1142162818955041,
"grad_norm": 33.057859846207634,
"learning_rate": 8.917432642744518e-05,
"loss": 1.1431,
"step": 459
},
{
"epoch": 1.1166464155528555,
"grad_norm": 30.602840863536635,
"learning_rate": 8.877503769778356e-05,
"loss": 1.1157,
"step": 460
},
{
"epoch": 1.1190765492102066,
"grad_norm": 38.088467248288964,
"learning_rate": 8.83759301315568e-05,
"loss": 1.0776,
"step": 461
},
{
"epoch": 1.1215066828675577,
"grad_norm": 66.03671829863266,
"learning_rate": 8.797701017009591e-05,
"loss": 1.1468,
"step": 462
},
{
"epoch": 1.1239368165249088,
"grad_norm": 32.293691874682686,
"learning_rate": 8.757828425170404e-05,
"loss": 1.1115,
"step": 463
},
{
"epoch": 1.12636695018226,
"grad_norm": 32.70707175332633,
"learning_rate": 8.717975881155261e-05,
"loss": 1.1677,
"step": 464
},
{
"epoch": 1.128797083839611,
"grad_norm": 48.79069594971439,
"learning_rate": 8.678144028157759e-05,
"loss": 1.1341,
"step": 465
},
{
"epoch": 1.1312272174969624,
"grad_norm": 37.52808559072613,
"learning_rate": 8.638333509037536e-05,
"loss": 1.1414,
"step": 466
},
{
"epoch": 1.1336573511543135,
"grad_norm": 27.096068124970536,
"learning_rate": 8.598544966309925e-05,
"loss": 1.1719,
"step": 467
},
{
"epoch": 1.1360874848116647,
"grad_norm": 16.019227077248434,
"learning_rate": 8.55877904213558e-05,
"loss": 1.1148,
"step": 468
},
{
"epoch": 1.1385176184690158,
"grad_norm": 29.861941956913498,
"learning_rate": 8.519036378310096e-05,
"loss": 1.1486,
"step": 469
},
{
"epoch": 1.140947752126367,
"grad_norm": 23.058998452019107,
"learning_rate": 8.47931761625367e-05,
"loss": 1.0745,
"step": 470
},
{
"epoch": 1.143377885783718,
"grad_norm": 24.486692418227875,
"learning_rate": 8.43962339700073e-05,
"loss": 1.1333,
"step": 471
},
{
"epoch": 1.1458080194410694,
"grad_norm": 31.632544516924323,
"learning_rate": 8.399954361189615e-05,
"loss": 1.1565,
"step": 472
},
{
"epoch": 1.1482381530984205,
"grad_norm": 21.67735267443374,
"learning_rate": 8.360311149052205e-05,
"loss": 1.109,
"step": 473
},
{
"epoch": 1.1506682867557716,
"grad_norm": 29.096918560226527,
"learning_rate": 8.320694400403606e-05,
"loss": 1.1517,
"step": 474
},
{
"epoch": 1.1530984204131227,
"grad_norm": 46.067313216206955,
"learning_rate": 8.281104754631835e-05,
"loss": 1.1043,
"step": 475
},
{
"epoch": 1.1555285540704738,
"grad_norm": 30.84953769166141,
"learning_rate": 8.241542850687465e-05,
"loss": 1.1081,
"step": 476
},
{
"epoch": 1.157958687727825,
"grad_norm": 39.34158523904847,
"learning_rate": 8.20200932707335e-05,
"loss": 1.1787,
"step": 477
},
{
"epoch": 1.160388821385176,
"grad_norm": 39.14663302484904,
"learning_rate": 8.162504821834295e-05,
"loss": 1.202,
"step": 478
},
{
"epoch": 1.1628189550425274,
"grad_norm": 49.7279004249915,
"learning_rate": 8.123029972546781e-05,
"loss": 1.1439,
"step": 479
},
{
"epoch": 1.1652490886998785,
"grad_norm": 35.49897960878779,
"learning_rate": 8.083585416308642e-05,
"loss": 1.0741,
"step": 480
},
{
"epoch": 1.1676792223572297,
"grad_norm": 31.306252618855535,
"learning_rate": 8.044171789728816e-05,
"loss": 1.0697,
"step": 481
},
{
"epoch": 1.1701093560145808,
"grad_norm": 22.40745672651249,
"learning_rate": 8.004789728917059e-05,
"loss": 1.1498,
"step": 482
},
{
"epoch": 1.172539489671932,
"grad_norm": 32.19326746671122,
"learning_rate": 7.965439869473664e-05,
"loss": 1.1392,
"step": 483
},
{
"epoch": 1.1749696233292832,
"grad_norm": 33.66876390791385,
"learning_rate": 7.926122846479224e-05,
"loss": 1.1049,
"step": 484
},
{
"epoch": 1.1773997569866343,
"grad_norm": 35.43357233261174,
"learning_rate": 7.886839294484377e-05,
"loss": 1.0467,
"step": 485
},
{
"epoch": 1.1798298906439855,
"grad_norm": 50.660998166256256,
"learning_rate": 7.84758984749955e-05,
"loss": 1.1244,
"step": 486
},
{
"epoch": 1.1822600243013366,
"grad_norm": 41.356845334605936,
"learning_rate": 7.808375138984745e-05,
"loss": 1.1279,
"step": 487
},
{
"epoch": 1.1846901579586877,
"grad_norm": 22.947663723281487,
"learning_rate": 7.769195801839313e-05,
"loss": 1.0787,
"step": 488
},
{
"epoch": 1.1871202916160388,
"grad_norm": 36.434647074399905,
"learning_rate": 7.730052468391725e-05,
"loss": 1.1148,
"step": 489
},
{
"epoch": 1.18955042527339,
"grad_norm": 75.94549877059467,
"learning_rate": 7.690945770389377e-05,
"loss": 1.1127,
"step": 490
},
{
"epoch": 1.1919805589307413,
"grad_norm": 68.03126664734435,
"learning_rate": 7.6518763389884e-05,
"loss": 1.1672,
"step": 491
},
{
"epoch": 1.1944106925880924,
"grad_norm": 40.15361719091623,
"learning_rate": 7.612844804743466e-05,
"loss": 1.0962,
"step": 492
},
{
"epoch": 1.1968408262454435,
"grad_norm": 105.80023571763755,
"learning_rate": 7.573851797597602e-05,
"loss": 1.1091,
"step": 493
},
{
"epoch": 1.1992709599027946,
"grad_norm": 41.84401502420881,
"learning_rate": 7.534897946872042e-05,
"loss": 1.1359,
"step": 494
},
{
"epoch": 1.2017010935601458,
"grad_norm": 21.985533615468846,
"learning_rate": 7.495983881256067e-05,
"loss": 1.1024,
"step": 495
},
{
"epoch": 1.2041312272174969,
"grad_norm": 23.02649898605792,
"learning_rate": 7.457110228796838e-05,
"loss": 1.1089,
"step": 496
},
{
"epoch": 1.206561360874848,
"grad_norm": 74.4950498938832,
"learning_rate": 7.418277616889282e-05,
"loss": 1.0439,
"step": 497
},
{
"epoch": 1.2089914945321993,
"grad_norm": 27.637660484960865,
"learning_rate": 7.379486672265964e-05,
"loss": 1.1453,
"step": 498
},
{
"epoch": 1.2114216281895505,
"grad_norm": 34.98561655821008,
"learning_rate": 7.340738020986961e-05,
"loss": 1.139,
"step": 499
},
{
"epoch": 1.2138517618469016,
"grad_norm": 28.47627677351389,
"learning_rate": 7.302032288429756e-05,
"loss": 1.0623,
"step": 500
},
{
"epoch": 1.2162818955042527,
"grad_norm": 39.551486186427596,
"learning_rate": 7.263370099279172e-05,
"loss": 1.1277,
"step": 501
},
{
"epoch": 1.2187120291616038,
"grad_norm": 44.12973085459368,
"learning_rate": 7.224752077517253e-05,
"loss": 1.1768,
"step": 502
},
{
"epoch": 1.2211421628189552,
"grad_norm": 84.84836585196132,
"learning_rate": 7.186178846413214e-05,
"loss": 1.1892,
"step": 503
},
{
"epoch": 1.2235722964763063,
"grad_norm": 34.94807915131505,
"learning_rate": 7.147651028513383e-05,
"loss": 1.1108,
"step": 504
},
{
"epoch": 1.2260024301336574,
"grad_norm": 46.19847384406232,
"learning_rate": 7.109169245631149e-05,
"loss": 1.0956,
"step": 505
},
{
"epoch": 1.2284325637910085,
"grad_norm": 38.58484473058957,
"learning_rate": 7.070734118836925e-05,
"loss": 1.1175,
"step": 506
},
{
"epoch": 1.2308626974483596,
"grad_norm": 37.84739298111386,
"learning_rate": 7.032346268448118e-05,
"loss": 1.1411,
"step": 507
},
{
"epoch": 1.2332928311057108,
"grad_norm": 53.5471335398439,
"learning_rate": 6.994006314019141e-05,
"loss": 1.1332,
"step": 508
},
{
"epoch": 1.2357229647630619,
"grad_norm": 91.55067777365485,
"learning_rate": 6.955714874331387e-05,
"loss": 1.1205,
"step": 509
},
{
"epoch": 1.2381530984204132,
"grad_norm": 27.05333642785952,
"learning_rate": 6.917472567383252e-05,
"loss": 1.099,
"step": 510
},
{
"epoch": 1.2405832320777643,
"grad_norm": 24.519879042487336,
"learning_rate": 6.87928001038017e-05,
"loss": 1.1401,
"step": 511
},
{
"epoch": 1.2430133657351154,
"grad_norm": 33.763495598365786,
"learning_rate": 6.84113781972464e-05,
"loss": 1.2058,
"step": 512
},
{
"epoch": 1.2454434993924666,
"grad_norm": 34.49114206138826,
"learning_rate": 6.803046611006278e-05,
"loss": 1.1044,
"step": 513
},
{
"epoch": 1.2478736330498177,
"grad_norm": 74.20211157975073,
"learning_rate": 6.765006998991888e-05,
"loss": 1.111,
"step": 514
},
{
"epoch": 1.250303766707169,
"grad_norm": 32.30436806042553,
"learning_rate": 6.727019597615545e-05,
"loss": 1.1063,
"step": 515
},
{
"epoch": 1.250303766707169,
"eval_loss": 1.1128273010253906,
"eval_runtime": 53.4998,
"eval_samples_per_second": 13.907,
"eval_steps_per_second": 1.738,
"step": 515
},
{
"epoch": 1.25273390036452,
"grad_norm": 42.104054612880084,
"learning_rate": 6.689085019968669e-05,
"loss": 1.1315,
"step": 516
},
{
"epoch": 1.2551640340218713,
"grad_norm": 25.66097714624212,
"learning_rate": 6.651203878290139e-05,
"loss": 1.0916,
"step": 517
},
{
"epoch": 1.2575941676792224,
"grad_norm": 35.12310576456352,
"learning_rate": 6.613376783956423e-05,
"loss": 1.0699,
"step": 518
},
{
"epoch": 1.2600243013365735,
"grad_norm": 34.172951559594566,
"learning_rate": 6.575604347471695e-05,
"loss": 1.1412,
"step": 519
},
{
"epoch": 1.2624544349939246,
"grad_norm": 54.373563773275116,
"learning_rate": 6.537887178457984e-05,
"loss": 1.1255,
"step": 520
},
{
"epoch": 1.2648845686512757,
"grad_norm": 33.806385046788755,
"learning_rate": 6.500225885645346e-05,
"loss": 1.101,
"step": 521
},
{
"epoch": 1.267314702308627,
"grad_norm": 34.17813695957543,
"learning_rate": 6.46262107686203e-05,
"loss": 1.1226,
"step": 522
},
{
"epoch": 1.2697448359659782,
"grad_norm": 24.68048087106548,
"learning_rate": 6.425073359024663e-05,
"loss": 1.1787,
"step": 523
},
{
"epoch": 1.2721749696233293,
"grad_norm": 32.78749757697808,
"learning_rate": 6.387583338128471e-05,
"loss": 1.0541,
"step": 524
},
{
"epoch": 1.2746051032806804,
"grad_norm": 30.906673844090044,
"learning_rate": 6.350151619237488e-05,
"loss": 1.0964,
"step": 525
},
{
"epoch": 1.2770352369380316,
"grad_norm": 32.571858392892736,
"learning_rate": 6.312778806474795e-05,
"loss": 1.1251,
"step": 526
},
{
"epoch": 1.2794653705953827,
"grad_norm": 43.02428916532565,
"learning_rate": 6.275465503012751e-05,
"loss": 1.0473,
"step": 527
},
{
"epoch": 1.2818955042527338,
"grad_norm": 60.93587506764561,
"learning_rate": 6.2382123110633e-05,
"loss": 1.078,
"step": 528
},
{
"epoch": 1.2843256379100851,
"grad_norm": 64.6934775930251,
"learning_rate": 6.201019831868208e-05,
"loss": 1.0904,
"step": 529
},
{
"epoch": 1.2867557715674363,
"grad_norm": 32.977077613035426,
"learning_rate": 6.16388866568938e-05,
"loss": 1.0705,
"step": 530
},
{
"epoch": 1.2891859052247874,
"grad_norm": 28.27407310492513,
"learning_rate": 6.126819411799175e-05,
"loss": 1.1252,
"step": 531
},
{
"epoch": 1.2916160388821385,
"grad_norm": 33.73515826089828,
"learning_rate": 6.0898126684707265e-05,
"loss": 1.1262,
"step": 532
},
{
"epoch": 1.2940461725394896,
"grad_norm": 25.370361818959903,
"learning_rate": 6.052869032968285e-05,
"loss": 1.0845,
"step": 533
},
{
"epoch": 1.296476306196841,
"grad_norm": 37.389287060597105,
"learning_rate": 6.015989101537586e-05,
"loss": 1.1352,
"step": 534
},
{
"epoch": 1.2989064398541919,
"grad_norm": 39.04755104008223,
"learning_rate": 5.979173469396227e-05,
"loss": 1.1538,
"step": 535
},
{
"epoch": 1.3013365735115432,
"grad_norm": 34.33676719612293,
"learning_rate": 5.9424227307240554e-05,
"loss": 1.1725,
"step": 536
},
{
"epoch": 1.3037667071688943,
"grad_norm": 64.66076997769457,
"learning_rate": 5.905737478653572e-05,
"loss": 1.1146,
"step": 537
},
{
"epoch": 1.3061968408262454,
"grad_norm": 48.043289790386325,
"learning_rate": 5.8691183052603834e-05,
"loss": 1.1035,
"step": 538
},
{
"epoch": 1.3086269744835966,
"grad_norm": 49.08397341659928,
"learning_rate": 5.83256580155362e-05,
"loss": 1.1653,
"step": 539
},
{
"epoch": 1.3110571081409477,
"grad_norm": 46.688886812303515,
"learning_rate": 5.796080557466406e-05,
"loss": 1.1328,
"step": 540
},
{
"epoch": 1.313487241798299,
"grad_norm": 27.503882325413493,
"learning_rate": 5.7596631618463514e-05,
"loss": 1.1019,
"step": 541
},
{
"epoch": 1.3159173754556501,
"grad_norm": 48.88974129574653,
"learning_rate": 5.723314202446026e-05,
"loss": 1.121,
"step": 542
},
{
"epoch": 1.3183475091130012,
"grad_norm": 28.105881157995345,
"learning_rate": 5.687034265913485e-05,
"loss": 1.0898,
"step": 543
},
{
"epoch": 1.3207776427703524,
"grad_norm": 30.410731278414804,
"learning_rate": 5.6508239377828034e-05,
"loss": 1.07,
"step": 544
},
{
"epoch": 1.3232077764277035,
"grad_norm": 38.08324176765882,
"learning_rate": 5.614683802464631e-05,
"loss": 1.1503,
"step": 545
},
{
"epoch": 1.3256379100850546,
"grad_norm": 46.28952293745534,
"learning_rate": 5.578614443236738e-05,
"loss": 1.1282,
"step": 546
},
{
"epoch": 1.3280680437424057,
"grad_norm": 68.2597453597135,
"learning_rate": 5.542616442234618e-05,
"loss": 1.1373,
"step": 547
},
{
"epoch": 1.330498177399757,
"grad_norm": 30.351663825014143,
"learning_rate": 5.5066903804421025e-05,
"loss": 1.1633,
"step": 548
},
{
"epoch": 1.3329283110571082,
"grad_norm": 38.2711285636887,
"learning_rate": 5.470836837681954e-05,
"loss": 1.1604,
"step": 549
},
{
"epoch": 1.3353584447144593,
"grad_norm": 35.64230091531108,
"learning_rate": 5.4350563926065404e-05,
"loss": 1.0564,
"step": 550
},
{
"epoch": 1.3377885783718104,
"grad_norm": 44.869816046925564,
"learning_rate": 5.399349622688479e-05,
"loss": 1.1376,
"step": 551
},
{
"epoch": 1.3402187120291615,
"grad_norm": 26.681037126315633,
"learning_rate": 5.3637171042113146e-05,
"loss": 1.0867,
"step": 552
},
{
"epoch": 1.3426488456865129,
"grad_norm": 34.6124686262535,
"learning_rate": 5.32815941226022e-05,
"loss": 1.0474,
"step": 553
},
{
"epoch": 1.3450789793438638,
"grad_norm": 35.92639009060983,
"learning_rate": 5.2926771207127254e-05,
"loss": 1.0958,
"step": 554
},
{
"epoch": 1.3475091130012151,
"grad_norm": 39.08938922562224,
"learning_rate": 5.2572708022294504e-05,
"loss": 1.074,
"step": 555
},
{
"epoch": 1.3499392466585662,
"grad_norm": 76.06708166273745,
"learning_rate": 5.2219410282448514e-05,
"loss": 1.0865,
"step": 556
},
{
"epoch": 1.3523693803159174,
"grad_norm": 74.14222265654887,
"learning_rate": 5.1866883689580056e-05,
"loss": 1.1567,
"step": 557
},
{
"epoch": 1.3547995139732685,
"grad_norm": 34.82441678662901,
"learning_rate": 5.151513393323426e-05,
"loss": 1.0802,
"step": 558
},
{
"epoch": 1.3572296476306196,
"grad_norm": 75.53504846566143,
"learning_rate": 5.116416669041843e-05,
"loss": 1.0623,
"step": 559
},
{
"epoch": 1.359659781287971,
"grad_norm": 29.423475817434785,
"learning_rate": 5.0813987625510775e-05,
"loss": 1.077,
"step": 560
},
{
"epoch": 1.362089914945322,
"grad_norm": 44.607486168434534,
"learning_rate": 5.046460239016879e-05,
"loss": 1.096,
"step": 561
},
{
"epoch": 1.3645200486026732,
"grad_norm": 40.684125033315404,
"learning_rate": 5.011601662323807e-05,
"loss": 1.148,
"step": 562
},
{
"epoch": 1.3669501822600243,
"grad_norm": 47.33103026318705,
"learning_rate": 4.976823595066128e-05,
"loss": 1.1712,
"step": 563
},
{
"epoch": 1.3693803159173754,
"grad_norm": 51.17017845058186,
"learning_rate": 4.9421265985387476e-05,
"loss": 1.1287,
"step": 564
},
{
"epoch": 1.3718104495747265,
"grad_norm": 50.76665552103517,
"learning_rate": 4.907511232728145e-05,
"loss": 1.1156,
"step": 565
},
{
"epoch": 1.3742405832320777,
"grad_norm": 32.6007633025874,
"learning_rate": 4.872978056303327e-05,
"loss": 1.1477,
"step": 566
},
{
"epoch": 1.376670716889429,
"grad_norm": 29.696241441710107,
"learning_rate": 4.8385276266068146e-05,
"loss": 1.0874,
"step": 567
},
{
"epoch": 1.37910085054678,
"grad_norm": 58.96613500379004,
"learning_rate": 4.804160499645667e-05,
"loss": 1.0616,
"step": 568
},
{
"epoch": 1.3815309842041312,
"grad_norm": 37.104100020310334,
"learning_rate": 4.7698772300824756e-05,
"loss": 1.0878,
"step": 569
},
{
"epoch": 1.3839611178614823,
"grad_norm": 51.735902941979305,
"learning_rate": 4.735678371226441e-05,
"loss": 1.0836,
"step": 570
},
{
"epoch": 1.3863912515188335,
"grad_norm": 55.49190976804079,
"learning_rate": 4.7015644750244306e-05,
"loss": 1.0473,
"step": 571
},
{
"epoch": 1.3888213851761848,
"grad_norm": 34.27972449829039,
"learning_rate": 4.6675360920520625e-05,
"loss": 1.0723,
"step": 572
},
{
"epoch": 1.391251518833536,
"grad_norm": 28.508157856527724,
"learning_rate": 4.6335937715048306e-05,
"loss": 1.0723,
"step": 573
},
{
"epoch": 1.393681652490887,
"grad_norm": 106.84009565003795,
"learning_rate": 4.599738061189244e-05,
"loss": 1.149,
"step": 574
},
{
"epoch": 1.3961117861482382,
"grad_norm": 50.543394606036294,
"learning_rate": 4.565969507513981e-05,
"loss": 1.0991,
"step": 575
},
{
"epoch": 1.3985419198055893,
"grad_norm": 30.409124335052745,
"learning_rate": 4.532288655481062e-05,
"loss": 1.1157,
"step": 576
},
{
"epoch": 1.4009720534629404,
"grad_norm": 89.92061876679301,
"learning_rate": 4.498696048677059e-05,
"loss": 1.1526,
"step": 577
},
{
"epoch": 1.4034021871202915,
"grad_norm": 84.27775422110602,
"learning_rate": 4.465192229264337e-05,
"loss": 1.1418,
"step": 578
},
{
"epoch": 1.4058323207776429,
"grad_norm": 40.7815489623743,
"learning_rate": 4.4317777379722866e-05,
"loss": 1.0831,
"step": 579
},
{
"epoch": 1.408262454434994,
"grad_norm": 66.6911504313278,
"learning_rate": 4.3984531140885943e-05,
"loss": 1.1088,
"step": 580
},
{
"epoch": 1.410692588092345,
"grad_norm": 137.00882181835217,
"learning_rate": 4.365218895450558e-05,
"loss": 1.1089,
"step": 581
},
{
"epoch": 1.4131227217496962,
"grad_norm": 41.139168895296855,
"learning_rate": 4.332075618436386e-05,
"loss": 1.1603,
"step": 582
},
{
"epoch": 1.4155528554070473,
"grad_norm": 35.443969765428506,
"learning_rate": 4.29902381795655e-05,
"loss": 1.0301,
"step": 583
},
{
"epoch": 1.4179829890643987,
"grad_norm": 32.931514576694674,
"learning_rate": 4.266064027445155e-05,
"loss": 1.1016,
"step": 584
},
{
"epoch": 1.4204131227217496,
"grad_norm": 64.21015694858382,
"learning_rate": 4.2331967788513295e-05,
"loss": 1.0789,
"step": 585
},
{
"epoch": 1.422843256379101,
"grad_norm": 84.13251752827094,
"learning_rate": 4.200422602630629e-05,
"loss": 1.1573,
"step": 586
},
{
"epoch": 1.425273390036452,
"grad_norm": 53.61636603108024,
"learning_rate": 4.167742027736482e-05,
"loss": 1.0942,
"step": 587
},
{
"epoch": 1.4277035236938032,
"grad_norm": 133.20877569415256,
"learning_rate": 4.135155581611661e-05,
"loss": 1.0877,
"step": 588
},
{
"epoch": 1.4301336573511543,
"grad_norm": 49.85736467319357,
"learning_rate": 4.102663790179764e-05,
"loss": 1.0619,
"step": 589
},
{
"epoch": 1.4325637910085054,
"grad_norm": 91.13217639524017,
"learning_rate": 4.070267177836712e-05,
"loss": 1.1093,
"step": 590
},
{
"epoch": 1.4349939246658567,
"grad_norm": 49.25558128250457,
"learning_rate": 4.037966267442315e-05,
"loss": 1.1344,
"step": 591
},
{
"epoch": 1.4374240583232079,
"grad_norm": 95.87244356130316,
"learning_rate": 4.005761580311805e-05,
"loss": 1.0929,
"step": 592
},
{
"epoch": 1.439854191980559,
"grad_norm": 74.28903671045653,
"learning_rate": 3.973653636207437e-05,
"loss": 1.1263,
"step": 593
},
{
"epoch": 1.44228432563791,
"grad_norm": 53.99454529785116,
"learning_rate": 3.941642953330103e-05,
"loss": 1.0916,
"step": 594
},
{
"epoch": 1.4447144592952612,
"grad_norm": 113.26015597338959,
"learning_rate": 3.909730048310962e-05,
"loss": 1.1009,
"step": 595
},
{
"epoch": 1.4471445929526123,
"grad_norm": 134.4015550981493,
"learning_rate": 3.8779154362030986e-05,
"loss": 1.1351,
"step": 596
},
{
"epoch": 1.4495747266099634,
"grad_norm": 90.61611981238187,
"learning_rate": 3.846199630473216e-05,
"loss": 1.0827,
"step": 597
},
{
"epoch": 1.4520048602673148,
"grad_norm": 56.55050791518521,
"learning_rate": 3.814583142993352e-05,
"loss": 1.1145,
"step": 598
},
{
"epoch": 1.454434993924666,
"grad_norm": 265.6916535243014,
"learning_rate": 3.7830664840326145e-05,
"loss": 1.1459,
"step": 599
},
{
"epoch": 1.456865127582017,
"grad_norm": 72.81191101030372,
"learning_rate": 3.7516501622489367e-05,
"loss": 1.0903,
"step": 600
},
{
"epoch": 1.4592952612393681,
"grad_norm": 58.309143549086556,
"learning_rate": 3.720334684680889e-05,
"loss": 1.1041,
"step": 601
},
{
"epoch": 1.4617253948967193,
"grad_norm": 35.19205741792398,
"learning_rate": 3.689120556739475e-05,
"loss": 1.1523,
"step": 602
},
{
"epoch": 1.4641555285540706,
"grad_norm": 88.97226951757321,
"learning_rate": 3.6580082821999786e-05,
"loss": 1.1117,
"step": 603
},
{
"epoch": 1.4665856622114215,
"grad_norm": 64.50873879301322,
"learning_rate": 3.6269983631938475e-05,
"loss": 1.1256,
"step": 604
},
{
"epoch": 1.4690157958687728,
"grad_norm": 78.10556611104111,
"learning_rate": 3.596091300200578e-05,
"loss": 1.0834,
"step": 605
},
{
"epoch": 1.471445929526124,
"grad_norm": 69.38449946362529,
"learning_rate": 3.565287592039628e-05,
"loss": 1.1026,
"step": 606
},
{
"epoch": 1.473876063183475,
"grad_norm": 79.60241521456905,
"learning_rate": 3.534587735862391e-05,
"loss": 1.0456,
"step": 607
},
{
"epoch": 1.4763061968408262,
"grad_norm": 89.68581306071424,
"learning_rate": 3.503992227144147e-05,
"loss": 1.0809,
"step": 608
},
{
"epoch": 1.4787363304981773,
"grad_norm": 68.570527237558,
"learning_rate": 3.473501559676088e-05,
"loss": 1.0754,
"step": 609
},
{
"epoch": 1.4811664641555287,
"grad_norm": 54.94762317625427,
"learning_rate": 3.4431162255573245e-05,
"loss": 1.1751,
"step": 610
},
{
"epoch": 1.4835965978128798,
"grad_norm": 109.12821602719706,
"learning_rate": 3.4128367151869714e-05,
"loss": 1.1055,
"step": 611
},
{
"epoch": 1.486026731470231,
"grad_norm": 198.79030469542352,
"learning_rate": 3.3826635172562094e-05,
"loss": 1.1369,
"step": 612
},
{
"epoch": 1.488456865127582,
"grad_norm": 62.002866716809,
"learning_rate": 3.352597118740404e-05,
"loss": 1.1611,
"step": 613
},
{
"epoch": 1.4908869987849331,
"grad_norm": 79.21193137029579,
"learning_rate": 3.3226380048912585e-05,
"loss": 1.1688,
"step": 614
},
{
"epoch": 1.4933171324422843,
"grad_norm": 68.6722934326242,
"learning_rate": 3.292786659228973e-05,
"loss": 1.1248,
"step": 615
},
{
"epoch": 1.4957472660996354,
"grad_norm": 104.34122241838278,
"learning_rate": 3.263043563534428e-05,
"loss": 1.1425,
"step": 616
},
{
"epoch": 1.4981773997569867,
"grad_norm": 86.43862038340298,
"learning_rate": 3.233409197841437e-05,
"loss": 1.0562,
"step": 617
},
{
"epoch": 1.5006075334143378,
"grad_norm": 79.74137751394451,
"learning_rate": 3.2038840404289705e-05,
"loss": 1.1214,
"step": 618
},
{
"epoch": 1.5006075334143378,
"eval_loss": 1.1088899374008179,
"eval_runtime": 53.0545,
"eval_samples_per_second": 14.023,
"eval_steps_per_second": 1.753,
"step": 618
}
],
"logging_steps": 1,
"max_steps": 822,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 206,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.157723878347244e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}