{
"best_global_step": 7600,
"best_metric": 0.7774137258529663,
"best_model_checkpoint": "models/MNLP_M3_rag_model_test/checkpoint-7600",
"epoch": 3.97036166601384,
"eval_steps": 200,
"global_step": 7600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005222613918266093,
"grad_norm": 11.394719123840332,
"learning_rate": 9.000000000000001e-07,
"loss": 3.5515,
"step": 10
},
{
"epoch": 0.010445227836532185,
"grad_norm": 6.919948577880859,
"learning_rate": 1.9000000000000002e-06,
"loss": 3.5008,
"step": 20
},
{
"epoch": 0.015667841754798278,
"grad_norm": 7.034717559814453,
"learning_rate": 2.9e-06,
"loss": 3.29,
"step": 30
},
{
"epoch": 0.02089045567306437,
"grad_norm": 5.738417625427246,
"learning_rate": 3.900000000000001e-06,
"loss": 3.1144,
"step": 40
},
{
"epoch": 0.02611306959133046,
"grad_norm": 6.416561603546143,
"learning_rate": 4.9000000000000005e-06,
"loss": 3.0745,
"step": 50
},
{
"epoch": 0.031335683509596556,
"grad_norm": 5.485833168029785,
"learning_rate": 5.9e-06,
"loss": 2.8625,
"step": 60
},
{
"epoch": 0.036558297427862645,
"grad_norm": 5.88957405090332,
"learning_rate": 6.9e-06,
"loss": 2.8214,
"step": 70
},
{
"epoch": 0.04178091134612874,
"grad_norm": 6.4762654304504395,
"learning_rate": 7.9e-06,
"loss": 2.8076,
"step": 80
},
{
"epoch": 0.04700352526439483,
"grad_norm": 6.038456916809082,
"learning_rate": 8.900000000000001e-06,
"loss": 2.7299,
"step": 90
},
{
"epoch": 0.05222613918266092,
"grad_norm": 7.74363899230957,
"learning_rate": 9.9e-06,
"loss": 2.5875,
"step": 100
},
{
"epoch": 0.057448753100927015,
"grad_norm": 11.824464797973633,
"learning_rate": 9.990496304118268e-06,
"loss": 2.5328,
"step": 110
},
{
"epoch": 0.06267136701919311,
"grad_norm": 6.736820697784424,
"learning_rate": 9.979936642027456e-06,
"loss": 2.414,
"step": 120
},
{
"epoch": 0.0678939809374592,
"grad_norm": 6.303720474243164,
"learning_rate": 9.969376979936643e-06,
"loss": 2.2947,
"step": 130
},
{
"epoch": 0.07311659485572529,
"grad_norm": 6.727591037750244,
"learning_rate": 9.95881731784583e-06,
"loss": 2.2003,
"step": 140
},
{
"epoch": 0.07833920877399138,
"grad_norm": 8.046416282653809,
"learning_rate": 9.948257655755017e-06,
"loss": 2.0726,
"step": 150
},
{
"epoch": 0.08356182269225748,
"grad_norm": 8.67299747467041,
"learning_rate": 9.937697993664203e-06,
"loss": 2.2524,
"step": 160
},
{
"epoch": 0.08878443661052357,
"grad_norm": 7.629809856414795,
"learning_rate": 9.927138331573391e-06,
"loss": 2.0773,
"step": 170
},
{
"epoch": 0.09400705052878966,
"grad_norm": 10.00472640991211,
"learning_rate": 9.916578669482577e-06,
"loss": 1.9697,
"step": 180
},
{
"epoch": 0.09922966444705575,
"grad_norm": 7.71968412399292,
"learning_rate": 9.907074973600845e-06,
"loss": 2.1101,
"step": 190
},
{
"epoch": 0.10445227836532184,
"grad_norm": 6.909250736236572,
"learning_rate": 9.896515311510033e-06,
"loss": 1.866,
"step": 200
},
{
"epoch": 0.10445227836532184,
"eval_loss": 2.0226237773895264,
"eval_runtime": 46.6876,
"eval_samples_per_second": 36.455,
"eval_steps_per_second": 4.562,
"step": 200
},
{
"epoch": 0.10967489228358794,
"grad_norm": 6.597925186157227,
"learning_rate": 9.88595564941922e-06,
"loss": 1.9556,
"step": 210
},
{
"epoch": 0.11489750620185403,
"grad_norm": 9.504620552062988,
"learning_rate": 9.875395987328407e-06,
"loss": 1.9974,
"step": 220
},
{
"epoch": 0.12012012012012012,
"grad_norm": 6.907344818115234,
"learning_rate": 9.864836325237593e-06,
"loss": 1.8866,
"step": 230
},
{
"epoch": 0.12534273403838622,
"grad_norm": 9.235527038574219,
"learning_rate": 9.85427666314678e-06,
"loss": 1.8387,
"step": 240
},
{
"epoch": 0.1305653479566523,
"grad_norm": 7.033239841461182,
"learning_rate": 9.843717001055967e-06,
"loss": 1.7348,
"step": 250
},
{
"epoch": 0.1357879618749184,
"grad_norm": 10.7998628616333,
"learning_rate": 9.833157338965154e-06,
"loss": 1.7569,
"step": 260
},
{
"epoch": 0.1410105757931845,
"grad_norm": 7.843267917633057,
"learning_rate": 9.82259767687434e-06,
"loss": 1.7569,
"step": 270
},
{
"epoch": 0.14623318971145058,
"grad_norm": 6.2468953132629395,
"learning_rate": 9.812038014783528e-06,
"loss": 1.6301,
"step": 280
},
{
"epoch": 0.15145580362971667,
"grad_norm": 7.654909133911133,
"learning_rate": 9.801478352692714e-06,
"loss": 1.7127,
"step": 290
},
{
"epoch": 0.15667841754798276,
"grad_norm": 7.152418613433838,
"learning_rate": 9.790918690601902e-06,
"loss": 1.8976,
"step": 300
},
{
"epoch": 0.16190103146624885,
"grad_norm": 7.338048458099365,
"learning_rate": 9.780359028511088e-06,
"loss": 1.7498,
"step": 310
},
{
"epoch": 0.16712364538451496,
"grad_norm": 7.256304740905762,
"learning_rate": 9.769799366420275e-06,
"loss": 1.7501,
"step": 320
},
{
"epoch": 0.17234625930278105,
"grad_norm": 10.67475700378418,
"learning_rate": 9.759239704329462e-06,
"loss": 1.6759,
"step": 330
},
{
"epoch": 0.17756887322104714,
"grad_norm": 7.884083271026611,
"learning_rate": 9.74868004223865e-06,
"loss": 1.707,
"step": 340
},
{
"epoch": 0.18279148713931323,
"grad_norm": 8.517298698425293,
"learning_rate": 9.738120380147837e-06,
"loss": 1.5422,
"step": 350
},
{
"epoch": 0.18801410105757932,
"grad_norm": 6.652080059051514,
"learning_rate": 9.727560718057023e-06,
"loss": 1.6762,
"step": 360
},
{
"epoch": 0.1932367149758454,
"grad_norm": 6.86594820022583,
"learning_rate": 9.71700105596621e-06,
"loss": 1.5937,
"step": 370
},
{
"epoch": 0.1984593288941115,
"grad_norm": 7.43917989730835,
"learning_rate": 9.707497360084478e-06,
"loss": 1.4299,
"step": 380
},
{
"epoch": 0.2036819428123776,
"grad_norm": 10.837226867675781,
"learning_rate": 9.696937697993665e-06,
"loss": 1.5797,
"step": 390
},
{
"epoch": 0.20890455673064368,
"grad_norm": 10.075883865356445,
"learning_rate": 9.686378035902851e-06,
"loss": 1.5084,
"step": 400
},
{
"epoch": 0.20890455673064368,
"eval_loss": 1.625764012336731,
"eval_runtime": 46.2554,
"eval_samples_per_second": 36.796,
"eval_steps_per_second": 4.605,
"step": 400
},
{
"epoch": 0.2141271706489098,
"grad_norm": 6.131842613220215,
"learning_rate": 9.675818373812039e-06,
"loss": 1.5666,
"step": 410
},
{
"epoch": 0.21934978456717588,
"grad_norm": 8.409153938293457,
"learning_rate": 9.665258711721227e-06,
"loss": 1.6822,
"step": 420
},
{
"epoch": 0.22457239848544197,
"grad_norm": 8.761375427246094,
"learning_rate": 9.654699049630413e-06,
"loss": 1.3924,
"step": 430
},
{
"epoch": 0.22979501240370806,
"grad_norm": 6.627100944519043,
"learning_rate": 9.6441393875396e-06,
"loss": 1.6737,
"step": 440
},
{
"epoch": 0.23501762632197415,
"grad_norm": 9.165101051330566,
"learning_rate": 9.633579725448786e-06,
"loss": 1.4854,
"step": 450
},
{
"epoch": 0.24024024024024024,
"grad_norm": 8.033590316772461,
"learning_rate": 9.623020063357974e-06,
"loss": 1.4066,
"step": 460
},
{
"epoch": 0.24546285415850633,
"grad_norm": 7.320120811462402,
"learning_rate": 9.612460401267162e-06,
"loss": 1.4489,
"step": 470
},
{
"epoch": 0.25068546807677244,
"grad_norm": 6.342758655548096,
"learning_rate": 9.601900739176348e-06,
"loss": 1.4078,
"step": 480
},
{
"epoch": 0.2559080819950385,
"grad_norm": 7.489528179168701,
"learning_rate": 9.591341077085534e-06,
"loss": 1.3413,
"step": 490
},
{
"epoch": 0.2611306959133046,
"grad_norm": 6.255088806152344,
"learning_rate": 9.58078141499472e-06,
"loss": 1.4824,
"step": 500
},
{
"epoch": 0.2663533098315707,
"grad_norm": 6.387566089630127,
"learning_rate": 9.570221752903908e-06,
"loss": 1.2526,
"step": 510
},
{
"epoch": 0.2715759237498368,
"grad_norm": 7.210233688354492,
"learning_rate": 9.559662090813095e-06,
"loss": 1.4269,
"step": 520
},
{
"epoch": 0.27679853766810286,
"grad_norm": 6.717288017272949,
"learning_rate": 9.549102428722282e-06,
"loss": 1.4725,
"step": 530
},
{
"epoch": 0.282021151586369,
"grad_norm": 6.161440372467041,
"learning_rate": 9.538542766631469e-06,
"loss": 1.4743,
"step": 540
},
{
"epoch": 0.2872437655046351,
"grad_norm": 6.798609733581543,
"learning_rate": 9.527983104540655e-06,
"loss": 1.4601,
"step": 550
},
{
"epoch": 0.29246637942290116,
"grad_norm": 7.112600326538086,
"learning_rate": 9.517423442449843e-06,
"loss": 1.3427,
"step": 560
},
{
"epoch": 0.2976889933411673,
"grad_norm": 5.958874225616455,
"learning_rate": 9.50686378035903e-06,
"loss": 1.5848,
"step": 570
},
{
"epoch": 0.30291160725943334,
"grad_norm": 8.950939178466797,
"learning_rate": 9.496304118268215e-06,
"loss": 1.3731,
"step": 580
},
{
"epoch": 0.30813422117769945,
"grad_norm": 7.173642635345459,
"learning_rate": 9.485744456177403e-06,
"loss": 1.4672,
"step": 590
},
{
"epoch": 0.3133568350959655,
"grad_norm": 5.97654390335083,
"learning_rate": 9.47518479408659e-06,
"loss": 1.3207,
"step": 600
},
{
"epoch": 0.3133568350959655,
"eval_loss": 1.425838589668274,
"eval_runtime": 46.3712,
"eval_samples_per_second": 36.704,
"eval_steps_per_second": 4.593,
"step": 600
},
{
"epoch": 0.31857944901423163,
"grad_norm": 6.612730503082275,
"learning_rate": 9.464625131995778e-06,
"loss": 1.3925,
"step": 610
},
{
"epoch": 0.3238020629324977,
"grad_norm": 7.3144049644470215,
"learning_rate": 9.454065469904964e-06,
"loss": 1.494,
"step": 620
},
{
"epoch": 0.3290246768507638,
"grad_norm": 5.725366592407227,
"learning_rate": 9.44350580781415e-06,
"loss": 1.4134,
"step": 630
},
{
"epoch": 0.3342472907690299,
"grad_norm": 8.623462677001953,
"learning_rate": 9.432946145723338e-06,
"loss": 1.144,
"step": 640
},
{
"epoch": 0.339469904687296,
"grad_norm": 6.790163040161133,
"learning_rate": 9.422386483632524e-06,
"loss": 1.4894,
"step": 650
},
{
"epoch": 0.3446925186055621,
"grad_norm": 7.366871356964111,
"learning_rate": 9.411826821541712e-06,
"loss": 1.3748,
"step": 660
},
{
"epoch": 0.34991513252382817,
"grad_norm": 5.874607086181641,
"learning_rate": 9.401267159450899e-06,
"loss": 1.0795,
"step": 670
},
{
"epoch": 0.3551377464420943,
"grad_norm": 6.789367198944092,
"learning_rate": 9.390707497360085e-06,
"loss": 1.2088,
"step": 680
},
{
"epoch": 0.36036036036036034,
"grad_norm": 6.885139465332031,
"learning_rate": 9.380147835269273e-06,
"loss": 1.1882,
"step": 690
},
{
"epoch": 0.36558297427862646,
"grad_norm": 7.129133224487305,
"learning_rate": 9.369588173178459e-06,
"loss": 1.2315,
"step": 700
},
{
"epoch": 0.3708055881968925,
"grad_norm": 6.09841775894165,
"learning_rate": 9.359028511087645e-06,
"loss": 1.1916,
"step": 710
},
{
"epoch": 0.37602820211515864,
"grad_norm": 6.911228179931641,
"learning_rate": 9.348468848996833e-06,
"loss": 1.44,
"step": 720
},
{
"epoch": 0.38125081603342476,
"grad_norm": 8.852502822875977,
"learning_rate": 9.33790918690602e-06,
"loss": 1.2999,
"step": 730
},
{
"epoch": 0.3864734299516908,
"grad_norm": 7.887015342712402,
"learning_rate": 9.327349524815207e-06,
"loss": 1.3509,
"step": 740
},
{
"epoch": 0.39169604386995693,
"grad_norm": 5.902195930480957,
"learning_rate": 9.316789862724394e-06,
"loss": 1.2506,
"step": 750
},
{
"epoch": 0.396918657788223,
"grad_norm": 5.92559814453125,
"learning_rate": 9.30623020063358e-06,
"loss": 1.1291,
"step": 760
},
{
"epoch": 0.4021412717064891,
"grad_norm": 7.447033405303955,
"learning_rate": 9.295670538542766e-06,
"loss": 1.1785,
"step": 770
},
{
"epoch": 0.4073638856247552,
"grad_norm": 5.407535552978516,
"learning_rate": 9.285110876451954e-06,
"loss": 1.1236,
"step": 780
},
{
"epoch": 0.4125864995430213,
"grad_norm": 5.1882219314575195,
"learning_rate": 9.274551214361142e-06,
"loss": 1.2353,
"step": 790
},
{
"epoch": 0.41780911346128735,
"grad_norm": 7.093064785003662,
"learning_rate": 9.263991552270328e-06,
"loss": 1.459,
"step": 800
},
{
"epoch": 0.41780911346128735,
"eval_loss": 1.3080272674560547,
"eval_runtime": 46.2478,
"eval_samples_per_second": 36.802,
"eval_steps_per_second": 4.606,
"step": 800
},
{
"epoch": 0.42303172737955347,
"grad_norm": 6.416601657867432,
"learning_rate": 9.253431890179515e-06,
"loss": 1.3287,
"step": 810
},
{
"epoch": 0.4282543412978196,
"grad_norm": 6.671374797821045,
"learning_rate": 9.242872228088701e-06,
"loss": 1.2637,
"step": 820
},
{
"epoch": 0.43347695521608565,
"grad_norm": 6.5349931716918945,
"learning_rate": 9.232312565997889e-06,
"loss": 1.2804,
"step": 830
},
{
"epoch": 0.43869956913435176,
"grad_norm": 5.837822437286377,
"learning_rate": 9.221752903907075e-06,
"loss": 1.4397,
"step": 840
},
{
"epoch": 0.4439221830526178,
"grad_norm": 6.69824743270874,
"learning_rate": 9.211193241816263e-06,
"loss": 1.2325,
"step": 850
},
{
"epoch": 0.44914479697088394,
"grad_norm": 5.331833362579346,
"learning_rate": 9.20063357972545e-06,
"loss": 1.3133,
"step": 860
},
{
"epoch": 0.45436741088915,
"grad_norm": 6.8653950691223145,
"learning_rate": 9.190073917634637e-06,
"loss": 1.0374,
"step": 870
},
{
"epoch": 0.4595900248074161,
"grad_norm": 6.36031436920166,
"learning_rate": 9.179514255543824e-06,
"loss": 1.0597,
"step": 880
},
{
"epoch": 0.4648126387256822,
"grad_norm": 3.890155553817749,
"learning_rate": 9.16895459345301e-06,
"loss": 1.2453,
"step": 890
},
{
"epoch": 0.4700352526439483,
"grad_norm": 5.179451942443848,
"learning_rate": 9.158394931362196e-06,
"loss": 1.0745,
"step": 900
},
{
"epoch": 0.4752578665622144,
"grad_norm": 7.032919406890869,
"learning_rate": 9.147835269271384e-06,
"loss": 1.1987,
"step": 910
},
{
"epoch": 0.4804804804804805,
"grad_norm": 6.223219394683838,
"learning_rate": 9.137275607180572e-06,
"loss": 1.2441,
"step": 920
},
{
"epoch": 0.4857030943987466,
"grad_norm": 8.567842483520508,
"learning_rate": 9.126715945089758e-06,
"loss": 1.1818,
"step": 930
},
{
"epoch": 0.49092570831701265,
"grad_norm": 5.338006019592285,
"learning_rate": 9.116156282998945e-06,
"loss": 1.1527,
"step": 940
},
{
"epoch": 0.49614832223527877,
"grad_norm": 6.287044048309326,
"learning_rate": 9.10559662090813e-06,
"loss": 1.3234,
"step": 950
},
{
"epoch": 0.5013709361535449,
"grad_norm": 6.575079441070557,
"learning_rate": 9.095036958817319e-06,
"loss": 1.1989,
"step": 960
},
{
"epoch": 0.506593550071811,
"grad_norm": 7.368027687072754,
"learning_rate": 9.084477296726507e-06,
"loss": 1.0985,
"step": 970
},
{
"epoch": 0.511816163990077,
"grad_norm": 5.3375020027160645,
"learning_rate": 9.073917634635693e-06,
"loss": 1.1171,
"step": 980
},
{
"epoch": 0.5170387779083431,
"grad_norm": 7.050992965698242,
"learning_rate": 9.06335797254488e-06,
"loss": 1.2531,
"step": 990
},
{
"epoch": 0.5222613918266092,
"grad_norm": 6.947799205780029,
"learning_rate": 9.052798310454066e-06,
"loss": 1.0675,
"step": 1000
},
{
"epoch": 0.5222613918266092,
"eval_loss": 1.2219752073287964,
"eval_runtime": 46.2981,
"eval_samples_per_second": 36.762,
"eval_steps_per_second": 4.601,
"step": 1000
},
{
"epoch": 0.5274840057448753,
"grad_norm": 10.056715965270996,
"learning_rate": 9.042238648363253e-06,
"loss": 1.1444,
"step": 1010
},
{
"epoch": 0.5327066196631414,
"grad_norm": 6.277551651000977,
"learning_rate": 9.03167898627244e-06,
"loss": 1.118,
"step": 1020
},
{
"epoch": 0.5379292335814075,
"grad_norm": 5.285930633544922,
"learning_rate": 9.021119324181626e-06,
"loss": 1.0377,
"step": 1030
},
{
"epoch": 0.5431518474996736,
"grad_norm": 5.600802421569824,
"learning_rate": 9.010559662090814e-06,
"loss": 1.0265,
"step": 1040
},
{
"epoch": 0.5483744614179397,
"grad_norm": 6.1389007568359375,
"learning_rate": 9e-06,
"loss": 1.1026,
"step": 1050
},
{
"epoch": 0.5535970753362057,
"grad_norm": 7.223113536834717,
"learning_rate": 8.989440337909188e-06,
"loss": 1.1998,
"step": 1060
},
{
"epoch": 0.5588196892544719,
"grad_norm": 8.13656997680664,
"learning_rate": 8.978880675818374e-06,
"loss": 1.0726,
"step": 1070
},
{
"epoch": 0.564042303172738,
"grad_norm": 7.210083484649658,
"learning_rate": 8.96832101372756e-06,
"loss": 1.1418,
"step": 1080
},
{
"epoch": 0.569264917091004,
"grad_norm": 7.832534313201904,
"learning_rate": 8.957761351636749e-06,
"loss": 1.1464,
"step": 1090
},
{
"epoch": 0.5744875310092702,
"grad_norm": 5.135114669799805,
"learning_rate": 8.947201689545935e-06,
"loss": 1.0915,
"step": 1100
},
{
"epoch": 0.5797101449275363,
"grad_norm": 8.231823921203613,
"learning_rate": 8.936642027455123e-06,
"loss": 1.1763,
"step": 1110
},
{
"epoch": 0.5849327588458023,
"grad_norm": 5.530185699462891,
"learning_rate": 8.926082365364309e-06,
"loss": 1.1993,
"step": 1120
},
{
"epoch": 0.5901553727640684,
"grad_norm": 5.953641414642334,
"learning_rate": 8.915522703273495e-06,
"loss": 1.0549,
"step": 1130
},
{
"epoch": 0.5953779866823345,
"grad_norm": 5.919338226318359,
"learning_rate": 8.904963041182683e-06,
"loss": 1.0351,
"step": 1140
},
{
"epoch": 0.6006006006006006,
"grad_norm": 6.962036609649658,
"learning_rate": 8.89440337909187e-06,
"loss": 1.046,
"step": 1150
},
{
"epoch": 0.6058232145188667,
"grad_norm": 5.828774929046631,
"learning_rate": 8.883843717001058e-06,
"loss": 1.087,
"step": 1160
},
{
"epoch": 0.6110458284371327,
"grad_norm": 6.581724643707275,
"learning_rate": 8.873284054910244e-06,
"loss": 1.0683,
"step": 1170
},
{
"epoch": 0.6162684423553989,
"grad_norm": 7.396463394165039,
"learning_rate": 8.86272439281943e-06,
"loss": 1.1508,
"step": 1180
},
{
"epoch": 0.621491056273665,
"grad_norm": 5.524245262145996,
"learning_rate": 8.852164730728618e-06,
"loss": 1.155,
"step": 1190
},
{
"epoch": 0.626713670191931,
"grad_norm": 8.499662399291992,
"learning_rate": 8.841605068637804e-06,
"loss": 1.2246,
"step": 1200
},
{
"epoch": 0.626713670191931,
"eval_loss": 1.154821753501892,
"eval_runtime": 46.2456,
"eval_samples_per_second": 36.803,
"eval_steps_per_second": 4.606,
"step": 1200
},
{
"epoch": 0.6319362841101972,
"grad_norm": 4.054498195648193,
"learning_rate": 8.83104540654699e-06,
"loss": 1.1299,
"step": 1210
},
{
"epoch": 0.6371588980284633,
"grad_norm": 6.266629695892334,
"learning_rate": 8.820485744456179e-06,
"loss": 1.0569,
"step": 1220
},
{
"epoch": 0.6423815119467293,
"grad_norm": 7.285578727722168,
"learning_rate": 8.809926082365365e-06,
"loss": 1.1132,
"step": 1230
},
{
"epoch": 0.6476041258649954,
"grad_norm": 7.280442237854004,
"learning_rate": 8.799366420274553e-06,
"loss": 1.1091,
"step": 1240
},
{
"epoch": 0.6528267397832616,
"grad_norm": 6.459787368774414,
"learning_rate": 8.788806758183739e-06,
"loss": 1.1433,
"step": 1250
},
{
"epoch": 0.6580493537015276,
"grad_norm": 6.095096588134766,
"learning_rate": 8.778247096092925e-06,
"loss": 1.1457,
"step": 1260
},
{
"epoch": 0.6632719676197937,
"grad_norm": 6.624663352966309,
"learning_rate": 8.767687434002112e-06,
"loss": 1.1755,
"step": 1270
},
{
"epoch": 0.6684945815380599,
"grad_norm": 5.858925819396973,
"learning_rate": 8.7571277719113e-06,
"loss": 0.9852,
"step": 1280
},
{
"epoch": 0.6737171954563259,
"grad_norm": 6.378047943115234,
"learning_rate": 8.746568109820487e-06,
"loss": 0.989,
"step": 1290
},
{
"epoch": 0.678939809374592,
"grad_norm": 6.781316757202148,
"learning_rate": 8.736008447729674e-06,
"loss": 0.8984,
"step": 1300
},
{
"epoch": 0.684162423292858,
"grad_norm": 5.8333940505981445,
"learning_rate": 8.72544878563886e-06,
"loss": 1.0745,
"step": 1310
},
{
"epoch": 0.6893850372111242,
"grad_norm": 5.035146236419678,
"learning_rate": 8.714889123548046e-06,
"loss": 0.9417,
"step": 1320
},
{
"epoch": 0.6946076511293903,
"grad_norm": 5.607509613037109,
"learning_rate": 8.704329461457234e-06,
"loss": 1.0106,
"step": 1330
},
{
"epoch": 0.6998302650476563,
"grad_norm": 8.404295921325684,
"learning_rate": 8.69376979936642e-06,
"loss": 1.063,
"step": 1340
},
{
"epoch": 0.7050528789659224,
"grad_norm": 6.693871021270752,
"learning_rate": 8.683210137275608e-06,
"loss": 0.9467,
"step": 1350
},
{
"epoch": 0.7102754928841886,
"grad_norm": 8.556498527526855,
"learning_rate": 8.672650475184795e-06,
"loss": 0.9936,
"step": 1360
},
{
"epoch": 0.7154981068024546,
"grad_norm": 6.516254425048828,
"learning_rate": 8.662090813093983e-06,
"loss": 0.958,
"step": 1370
},
{
"epoch": 0.7207207207207207,
"grad_norm": 6.926424026489258,
"learning_rate": 8.651531151003169e-06,
"loss": 1.0758,
"step": 1380
},
{
"epoch": 0.7259433346389869,
"grad_norm": 5.722439765930176,
"learning_rate": 8.640971488912355e-06,
"loss": 0.987,
"step": 1390
},
{
"epoch": 0.7311659485572529,
"grad_norm": 5.813411712646484,
"learning_rate": 8.630411826821541e-06,
"loss": 1.0847,
"step": 1400
},
{
"epoch": 0.7311659485572529,
"eval_loss": 1.0959590673446655,
"eval_runtime": 46.2353,
"eval_samples_per_second": 36.812,
"eval_steps_per_second": 4.607,
"step": 1400
},
{
"epoch": 0.736388562475519,
"grad_norm": 9.68322467803955,
"learning_rate": 8.61985216473073e-06,
"loss": 1.0947,
"step": 1410
},
{
"epoch": 0.741611176393785,
"grad_norm": 5.5112762451171875,
"learning_rate": 8.609292502639917e-06,
"loss": 1.0718,
"step": 1420
},
{
"epoch": 0.7468337903120512,
"grad_norm": 6.487595558166504,
"learning_rate": 8.598732840549104e-06,
"loss": 1.008,
"step": 1430
},
{
"epoch": 0.7520564042303173,
"grad_norm": 6.72760534286499,
"learning_rate": 8.58817317845829e-06,
"loss": 1.0198,
"step": 1440
},
{
"epoch": 0.7572790181485833,
"grad_norm": 6.073751449584961,
"learning_rate": 8.577613516367476e-06,
"loss": 1.0065,
"step": 1450
},
{
"epoch": 0.7625016320668495,
"grad_norm": 6.201911449432373,
"learning_rate": 8.567053854276664e-06,
"loss": 1.2082,
"step": 1460
},
{
"epoch": 0.7677242459851156,
"grad_norm": 7.030183792114258,
"learning_rate": 8.55649419218585e-06,
"loss": 1.0877,
"step": 1470
},
{
"epoch": 0.7729468599033816,
"grad_norm": 6.393901348114014,
"learning_rate": 8.545934530095038e-06,
"loss": 0.9444,
"step": 1480
},
{
"epoch": 0.7781694738216477,
"grad_norm": 7.0087571144104,
"learning_rate": 8.535374868004225e-06,
"loss": 0.9064,
"step": 1490
},
{
"epoch": 0.7833920877399139,
"grad_norm": 7.8241777420043945,
"learning_rate": 8.52481520591341e-06,
"loss": 1.0552,
"step": 1500
},
{
"epoch": 0.7886147016581799,
"grad_norm": 6.263652801513672,
"learning_rate": 8.514255543822599e-06,
"loss": 1.1847,
"step": 1510
},
{
"epoch": 0.793837315576446,
"grad_norm": 7.5798797607421875,
"learning_rate": 8.503695881731785e-06,
"loss": 1.076,
"step": 1520
},
{
"epoch": 0.799059929494712,
"grad_norm": 5.389642238616943,
"learning_rate": 8.493136219640971e-06,
"loss": 1.0102,
"step": 1530
},
{
"epoch": 0.8042825434129782,
"grad_norm": 5.936399459838867,
"learning_rate": 8.48257655755016e-06,
"loss": 1.0314,
"step": 1540
},
{
"epoch": 0.8095051573312443,
"grad_norm": 8.437224388122559,
"learning_rate": 8.472016895459345e-06,
"loss": 1.0985,
"step": 1550
},
{
"epoch": 0.8147277712495103,
"grad_norm": 5.470308303833008,
"learning_rate": 8.461457233368533e-06,
"loss": 1.0003,
"step": 1560
},
{
"epoch": 0.8199503851677765,
"grad_norm": 10.188332557678223,
"learning_rate": 8.45089757127772e-06,
"loss": 1.0165,
"step": 1570
},
{
"epoch": 0.8251729990860426,
"grad_norm": 8.477367401123047,
"learning_rate": 8.440337909186906e-06,
"loss": 1.0532,
"step": 1580
},
{
"epoch": 0.8303956130043086,
"grad_norm": 4.078097820281982,
"learning_rate": 8.429778247096094e-06,
"loss": 1.0828,
"step": 1590
},
{
"epoch": 0.8356182269225747,
"grad_norm": 5.285001277923584,
"learning_rate": 8.41921858500528e-06,
"loss": 0.9931,
"step": 1600
},
{
"epoch": 0.8356182269225747,
"eval_loss": 1.0581225156784058,
"eval_runtime": 46.2517,
"eval_samples_per_second": 36.799,
"eval_steps_per_second": 4.605,
"step": 1600
},
{
"epoch": 0.8408408408408409,
"grad_norm": 4.965864181518555,
"learning_rate": 8.408658922914468e-06,
"loss": 1.1802,
"step": 1610
},
{
"epoch": 0.8460634547591069,
"grad_norm": 6.969324588775635,
"learning_rate": 8.398099260823654e-06,
"loss": 1.0296,
"step": 1620
},
{
"epoch": 0.851286068677373,
"grad_norm": 8.806953430175781,
"learning_rate": 8.38753959873284e-06,
"loss": 0.9611,
"step": 1630
},
{
"epoch": 0.8565086825956392,
"grad_norm": 7.901791572570801,
"learning_rate": 8.376979936642029e-06,
"loss": 0.9345,
"step": 1640
},
{
"epoch": 0.8617312965139052,
"grad_norm": 6.055725574493408,
"learning_rate": 8.366420274551215e-06,
"loss": 1.0747,
"step": 1650
},
{
"epoch": 0.8669539104321713,
"grad_norm": 7.076270580291748,
"learning_rate": 8.355860612460403e-06,
"loss": 0.9801,
"step": 1660
},
{
"epoch": 0.8721765243504374,
"grad_norm": 5.258996963500977,
"learning_rate": 8.345300950369589e-06,
"loss": 1.0273,
"step": 1670
},
{
"epoch": 0.8773991382687035,
"grad_norm": 6.378342151641846,
"learning_rate": 8.334741288278775e-06,
"loss": 0.8198,
"step": 1680
},
{
"epoch": 0.8826217521869696,
"grad_norm": 6.257429599761963,
"learning_rate": 8.324181626187963e-06,
"loss": 0.9199,
"step": 1690
},
{
"epoch": 0.8878443661052356,
"grad_norm": 8.555800437927246,
"learning_rate": 8.31362196409715e-06,
"loss": 0.9679,
"step": 1700
},
{
"epoch": 0.8930669800235017,
"grad_norm": 5.934536933898926,
"learning_rate": 8.303062302006336e-06,
"loss": 0.9922,
"step": 1710
},
{
"epoch": 0.8982895939417679,
"grad_norm": 5.284457206726074,
"learning_rate": 8.292502639915522e-06,
"loss": 1.0682,
"step": 1720
},
{
"epoch": 0.903512207860034,
"grad_norm": 6.454044342041016,
"learning_rate": 8.28194297782471e-06,
"loss": 1.1901,
"step": 1730
},
{
"epoch": 0.9087348217783,
"grad_norm": 6.973818778991699,
"learning_rate": 8.271383315733898e-06,
"loss": 1.01,
"step": 1740
},
{
"epoch": 0.9139574356965662,
"grad_norm": 7.165948390960693,
"learning_rate": 8.260823653643084e-06,
"loss": 0.926,
"step": 1750
},
{
"epoch": 0.9191800496148322,
"grad_norm": 5.891210556030273,
"learning_rate": 8.25026399155227e-06,
"loss": 1.0527,
"step": 1760
},
{
"epoch": 0.9244026635330983,
"grad_norm": 6.440408229827881,
"learning_rate": 8.239704329461457e-06,
"loss": 1.0271,
"step": 1770
},
{
"epoch": 0.9296252774513644,
"grad_norm": 6.762996673583984,
"learning_rate": 8.229144667370645e-06,
"loss": 1.1016,
"step": 1780
},
{
"epoch": 0.9348478913696305,
"grad_norm": 7.777276515960693,
"learning_rate": 8.218585005279833e-06,
"loss": 1.2933,
"step": 1790
},
{
"epoch": 0.9400705052878966,
"grad_norm": 6.9960713386535645,
"learning_rate": 8.208025343189019e-06,
"loss": 1.1484,
"step": 1800
},
{
"epoch": 0.9400705052878966,
"eval_loss": 1.0149155855178833,
"eval_runtime": 46.3161,
"eval_samples_per_second": 36.748,
"eval_steps_per_second": 4.599,
"step": 1800
},
{
"epoch": 0.9452931192061627,
"grad_norm": 5.822863578796387,
"learning_rate": 8.197465681098205e-06,
"loss": 0.952,
"step": 1810
},
{
"epoch": 0.9505157331244288,
"grad_norm": 5.783324718475342,
"learning_rate": 8.186906019007393e-06,
"loss": 0.9129,
"step": 1820
},
{
"epoch": 0.9557383470426949,
"grad_norm": 7.200591564178467,
"learning_rate": 8.17634635691658e-06,
"loss": 1.0411,
"step": 1830
},
{
"epoch": 0.960960960960961,
"grad_norm": 6.707890033721924,
"learning_rate": 8.165786694825766e-06,
"loss": 1.0755,
"step": 1840
},
{
"epoch": 0.966183574879227,
"grad_norm": 4.4951372146606445,
"learning_rate": 8.155227032734954e-06,
"loss": 1.1139,
"step": 1850
},
{
"epoch": 0.9714061887974932,
"grad_norm": 7.841273307800293,
"learning_rate": 8.14466737064414e-06,
"loss": 0.9171,
"step": 1860
},
{
"epoch": 0.9766288027157592,
"grad_norm": 8.396512985229492,
"learning_rate": 8.134107708553328e-06,
"loss": 0.8624,
"step": 1870
},
{
"epoch": 0.9818514166340253,
"grad_norm": 7.903951168060303,
"learning_rate": 8.123548046462514e-06,
"loss": 0.9832,
"step": 1880
},
{
"epoch": 0.9870740305522914,
"grad_norm": 5.722747325897217,
"learning_rate": 8.1129883843717e-06,
"loss": 0.9931,
"step": 1890
},
{
"epoch": 0.9922966444705575,
"grad_norm": 6.809545993804932,
"learning_rate": 8.102428722280887e-06,
"loss": 0.987,
"step": 1900
},
{
"epoch": 0.9975192583888236,
"grad_norm": 5.796718120574951,
"learning_rate": 8.091869060190075e-06,
"loss": 0.9521,
"step": 1910
},
{
"epoch": 1.0031335683509597,
"grad_norm": 4.707700252532959,
"learning_rate": 8.081309398099263e-06,
"loss": 1.0939,
"step": 1920
},
{
"epoch": 1.0083561822692257,
"grad_norm": 5.458223342895508,
"learning_rate": 8.070749736008449e-06,
"loss": 0.8501,
"step": 1930
},
{
"epoch": 1.0135787961874918,
"grad_norm": 7.022110939025879,
"learning_rate": 8.060190073917635e-06,
"loss": 0.8575,
"step": 1940
},
{
"epoch": 1.0188014101057579,
"grad_norm": 6.051275730133057,
"learning_rate": 8.049630411826821e-06,
"loss": 0.7803,
"step": 1950
},
{
"epoch": 1.024024024024024,
"grad_norm": 13.546333312988281,
"learning_rate": 8.03907074973601e-06,
"loss": 0.691,
"step": 1960
},
{
"epoch": 1.0292466379422902,
"grad_norm": 6.829512596130371,
"learning_rate": 8.028511087645196e-06,
"loss": 0.7195,
"step": 1970
},
{
"epoch": 1.0344692518605563,
"grad_norm": 6.821556091308594,
"learning_rate": 8.017951425554384e-06,
"loss": 0.7773,
"step": 1980
},
{
"epoch": 1.0396918657788223,
"grad_norm": 4.730713844299316,
"learning_rate": 8.00739176346357e-06,
"loss": 0.6783,
"step": 1990
},
{
"epoch": 1.0449144796970884,
"grad_norm": 6.354138374328613,
"learning_rate": 7.996832101372756e-06,
"loss": 0.9788,
"step": 2000
},
{
"epoch": 1.0449144796970884,
"eval_loss": 0.9896802306175232,
"eval_runtime": 46.2765,
"eval_samples_per_second": 36.779,
"eval_steps_per_second": 4.603,
"step": 2000
},
{
"epoch": 1.0501370936153545,
"grad_norm": 6.299434185028076,
"learning_rate": 7.986272439281944e-06,
"loss": 0.7521,
"step": 2010
},
{
"epoch": 1.0553597075336205,
"grad_norm": 8.378788948059082,
"learning_rate": 7.97571277719113e-06,
"loss": 0.7983,
"step": 2020
},
{
"epoch": 1.0605823214518866,
"grad_norm": 5.674183368682861,
"learning_rate": 7.965153115100317e-06,
"loss": 0.8165,
"step": 2030
},
{
"epoch": 1.0658049353701529,
"grad_norm": 5.855607032775879,
"learning_rate": 7.954593453009504e-06,
"loss": 1.0197,
"step": 2040
},
{
"epoch": 1.071027549288419,
"grad_norm": 5.06273078918457,
"learning_rate": 7.94403379091869e-06,
"loss": 0.799,
"step": 2050
},
{
"epoch": 1.076250163206685,
"grad_norm": 4.809935092926025,
"learning_rate": 7.933474128827879e-06,
"loss": 0.8601,
"step": 2060
},
{
"epoch": 1.081472777124951,
"grad_norm": 4.642035007476807,
"learning_rate": 7.922914466737065e-06,
"loss": 0.8228,
"step": 2070
},
{
"epoch": 1.086695391043217,
"grad_norm": 6.76859188079834,
"learning_rate": 7.912354804646251e-06,
"loss": 0.8457,
"step": 2080
},
{
"epoch": 1.0919180049614832,
"grad_norm": 7.555065155029297,
"learning_rate": 7.901795142555439e-06,
"loss": 0.8369,
"step": 2090
},
{
"epoch": 1.0971406188797492,
"grad_norm": 5.144375324249268,
"learning_rate": 7.891235480464627e-06,
"loss": 0.6828,
"step": 2100
},
{
"epoch": 1.1023632327980155,
"grad_norm": 6.584686756134033,
"learning_rate": 7.880675818373813e-06,
"loss": 0.7198,
"step": 2110
},
{
"epoch": 1.1075858467162816,
"grad_norm": 5.92726469039917,
"learning_rate": 7.870116156283e-06,
"loss": 0.8571,
"step": 2120
},
{
"epoch": 1.1128084606345476,
"grad_norm": 5.866957187652588,
"learning_rate": 7.859556494192186e-06,
"loss": 0.8838,
"step": 2130
},
{
"epoch": 1.1180310745528137,
"grad_norm": 6.889613151550293,
"learning_rate": 7.848996832101374e-06,
"loss": 0.7764,
"step": 2140
},
{
"epoch": 1.1232536884710798,
"grad_norm": 7.770586013793945,
"learning_rate": 7.83843717001056e-06,
"loss": 0.7697,
"step": 2150
},
{
"epoch": 1.1284763023893458,
"grad_norm": 6.084799766540527,
"learning_rate": 7.827877507919746e-06,
"loss": 0.8748,
"step": 2160
},
{
"epoch": 1.1336989163076119,
"grad_norm": 8.996906280517578,
"learning_rate": 7.817317845828934e-06,
"loss": 0.6519,
"step": 2170
},
{
"epoch": 1.1389215302258782,
"grad_norm": 4.936269283294678,
"learning_rate": 7.80675818373812e-06,
"loss": 0.9594,
"step": 2180
},
{
"epoch": 1.1441441441441442,
"grad_norm": 5.769779205322266,
"learning_rate": 7.796198521647309e-06,
"loss": 0.7806,
"step": 2190
},
{
"epoch": 1.1493667580624103,
"grad_norm": 7.1322808265686035,
"learning_rate": 7.785638859556495e-06,
"loss": 0.8086,
"step": 2200
},
{
"epoch": 1.1493667580624103,
"eval_loss": 0.969113826751709,
"eval_runtime": 46.2762,
"eval_samples_per_second": 36.779,
"eval_steps_per_second": 4.603,
"step": 2200
},
{
"epoch": 1.1545893719806763,
"grad_norm": 6.716241359710693,
"learning_rate": 7.775079197465681e-06,
"loss": 0.9122,
"step": 2210
},
{
"epoch": 1.1598119858989424,
"grad_norm": 5.767160892486572,
"learning_rate": 7.764519535374867e-06,
"loss": 0.7598,
"step": 2220
},
{
"epoch": 1.1650345998172085,
"grad_norm": 7.989006519317627,
"learning_rate": 7.753959873284055e-06,
"loss": 0.6704,
"step": 2230
},
{
"epoch": 1.1702572137354745,
"grad_norm": 5.272390365600586,
"learning_rate": 7.743400211193243e-06,
"loss": 0.8521,
"step": 2240
},
{
"epoch": 1.1754798276537408,
"grad_norm": 6.191717147827148,
"learning_rate": 7.73284054910243e-06,
"loss": 0.8905,
"step": 2250
},
{
"epoch": 1.1807024415720069,
"grad_norm": 5.682114124298096,
"learning_rate": 7.722280887011616e-06,
"loss": 0.8386,
"step": 2260
},
{
"epoch": 1.185925055490273,
"grad_norm": 6.549655914306641,
"learning_rate": 7.711721224920804e-06,
"loss": 0.691,
"step": 2270
},
{
"epoch": 1.191147669408539,
"grad_norm": 6.487022876739502,
"learning_rate": 7.70116156282999e-06,
"loss": 0.7769,
"step": 2280
},
{
"epoch": 1.196370283326805,
"grad_norm": 7.281522274017334,
"learning_rate": 7.690601900739178e-06,
"loss": 0.7235,
"step": 2290
},
{
"epoch": 1.2015928972450711,
"grad_norm": 7.294795513153076,
"learning_rate": 7.680042238648364e-06,
"loss": 0.7813,
"step": 2300
},
{
"epoch": 1.2068155111633372,
"grad_norm": 6.67874813079834,
"learning_rate": 7.66948257655755e-06,
"loss": 0.7375,
"step": 2310
},
{
"epoch": 1.2120381250816032,
"grad_norm": 3.83427357673645,
"learning_rate": 7.658922914466738e-06,
"loss": 0.7533,
"step": 2320
},
{
"epoch": 1.2172607389998695,
"grad_norm": 5.654359340667725,
"learning_rate": 7.648363252375925e-06,
"loss": 0.8276,
"step": 2330
},
{
"epoch": 1.2224833529181356,
"grad_norm": 5.315032482147217,
"learning_rate": 7.637803590285111e-06,
"loss": 0.7584,
"step": 2340
},
{
"epoch": 1.2277059668364017,
"grad_norm": 6.630548477172852,
"learning_rate": 7.627243928194299e-06,
"loss": 0.6388,
"step": 2350
},
{
"epoch": 1.2329285807546677,
"grad_norm": 5.981212615966797,
"learning_rate": 7.616684266103486e-06,
"loss": 0.8596,
"step": 2360
},
{
"epoch": 1.2381511946729338,
"grad_norm": 5.186179161071777,
"learning_rate": 7.606124604012672e-06,
"loss": 0.7127,
"step": 2370
},
{
"epoch": 1.2433738085911998,
"grad_norm": 6.0210747718811035,
"learning_rate": 7.595564941921859e-06,
"loss": 0.8277,
"step": 2380
},
{
"epoch": 1.248596422509466,
"grad_norm": 6.917499542236328,
"learning_rate": 7.585005279831046e-06,
"loss": 0.6321,
"step": 2390
},
{
"epoch": 1.253819036427732,
"grad_norm": 6.423802852630615,
"learning_rate": 7.574445617740233e-06,
"loss": 0.8517,
"step": 2400
},
{
"epoch": 1.253819036427732,
"eval_loss": 0.9506328701972961,
"eval_runtime": 46.3505,
"eval_samples_per_second": 36.72,
"eval_steps_per_second": 4.595,
"step": 2400
},
{
"epoch": 1.2590416503459982,
"grad_norm": 6.889662742614746,
"learning_rate": 7.563885955649419e-06,
"loss": 0.7603,
"step": 2410
},
{
"epoch": 1.2642642642642643,
"grad_norm": 4.960092544555664,
"learning_rate": 7.553326293558607e-06,
"loss": 0.7804,
"step": 2420
},
{
"epoch": 1.2694868781825304,
"grad_norm": 5.164410591125488,
"learning_rate": 7.542766631467794e-06,
"loss": 0.6938,
"step": 2430
},
{
"epoch": 1.2747094921007964,
"grad_norm": 5.916507720947266,
"learning_rate": 7.53220696937698e-06,
"loss": 0.6964,
"step": 2440
},
{
"epoch": 1.2799321060190625,
"grad_norm": 6.940438747406006,
"learning_rate": 7.521647307286167e-06,
"loss": 0.7605,
"step": 2450
},
{
"epoch": 1.2851547199373288,
"grad_norm": 6.659502983093262,
"learning_rate": 7.511087645195354e-06,
"loss": 0.8735,
"step": 2460
},
{
"epoch": 1.2903773338555946,
"grad_norm": 5.659145355224609,
"learning_rate": 7.500527983104541e-06,
"loss": 0.726,
"step": 2470
},
{
"epoch": 1.295599947773861,
"grad_norm": 3.484576463699341,
"learning_rate": 7.489968321013729e-06,
"loss": 0.6992,
"step": 2480
},
{
"epoch": 1.300822561692127,
"grad_norm": 6.005791664123535,
"learning_rate": 7.479408658922915e-06,
"loss": 0.8257,
"step": 2490
},
{
"epoch": 1.306045175610393,
"grad_norm": 4.005056381225586,
"learning_rate": 7.468848996832102e-06,
"loss": 0.8418,
"step": 2500
},
{
"epoch": 1.311267789528659,
"grad_norm": 6.585374355316162,
"learning_rate": 7.458289334741288e-06,
"loss": 0.7168,
"step": 2510
},
{
"epoch": 1.3164904034469251,
"grad_norm": 6.849618434906006,
"learning_rate": 7.4477296726504755e-06,
"loss": 0.8348,
"step": 2520
},
{
"epoch": 1.3217130173651912,
"grad_norm": 4.997506618499756,
"learning_rate": 7.437170010559663e-06,
"loss": 0.7155,
"step": 2530
},
{
"epoch": 1.3269356312834573,
"grad_norm": 6.247817516326904,
"learning_rate": 7.42661034846885e-06,
"loss": 0.7999,
"step": 2540
},
{
"epoch": 1.3321582452017235,
"grad_norm": 6.865342617034912,
"learning_rate": 7.416050686378037e-06,
"loss": 0.8178,
"step": 2550
},
{
"epoch": 1.3373808591199896,
"grad_norm": 7.5182695388793945,
"learning_rate": 7.405491024287224e-06,
"loss": 0.8535,
"step": 2560
},
{
"epoch": 1.3426034730382557,
"grad_norm": 5.786922454833984,
"learning_rate": 7.39493136219641e-06,
"loss": 0.8538,
"step": 2570
},
{
"epoch": 1.3478260869565217,
"grad_norm": 5.576653480529785,
"learning_rate": 7.384371700105597e-06,
"loss": 0.6142,
"step": 2580
},
{
"epoch": 1.3530487008747878,
"grad_norm": 6.3509135246276855,
"learning_rate": 7.3738120380147835e-06,
"loss": 0.872,
"step": 2590
},
{
"epoch": 1.3582713147930539,
"grad_norm": 6.3292131423950195,
"learning_rate": 7.3632523759239715e-06,
"loss": 0.7452,
"step": 2600
},
{
"epoch": 1.3582713147930539,
"eval_loss": 0.928360104560852,
"eval_runtime": 46.2783,
"eval_samples_per_second": 36.778,
"eval_steps_per_second": 4.603,
"step": 2600
},
{
"epoch": 1.36349392871132,
"grad_norm": 5.916106700897217,
"learning_rate": 7.352692713833159e-06,
"loss": 0.7897,
"step": 2610
},
{
"epoch": 1.3687165426295862,
"grad_norm": 6.225005149841309,
"learning_rate": 7.342133051742345e-06,
"loss": 0.7989,
"step": 2620
},
{
"epoch": 1.3739391565478523,
"grad_norm": 7.300755500793457,
"learning_rate": 7.331573389651532e-06,
"loss": 0.8093,
"step": 2630
},
{
"epoch": 1.3791617704661183,
"grad_norm": 6.355301380157471,
"learning_rate": 7.321013727560718e-06,
"loss": 0.6979,
"step": 2640
},
{
"epoch": 1.3843843843843844,
"grad_norm": 6.439295291900635,
"learning_rate": 7.310454065469905e-06,
"loss": 0.6481,
"step": 2650
},
{
"epoch": 1.3896069983026504,
"grad_norm": 5.19166374206543,
"learning_rate": 7.299894403379092e-06,
"loss": 0.6853,
"step": 2660
},
{
"epoch": 1.3948296122209165,
"grad_norm": 7.574211120605469,
"learning_rate": 7.2893347412882796e-06,
"loss": 0.6786,
"step": 2670
},
{
"epoch": 1.4000522261391826,
"grad_norm": 6.01971435546875,
"learning_rate": 7.278775079197467e-06,
"loss": 0.7968,
"step": 2680
},
{
"epoch": 1.4052748400574488,
"grad_norm": 4.888395309448242,
"learning_rate": 7.268215417106653e-06,
"loss": 0.8298,
"step": 2690
},
{
"epoch": 1.410497453975715,
"grad_norm": 4.738596439361572,
"learning_rate": 7.25765575501584e-06,
"loss": 0.6968,
"step": 2700
},
{
"epoch": 1.415720067893981,
"grad_norm": 6.128376483917236,
"learning_rate": 7.247096092925026e-06,
"loss": 0.7189,
"step": 2710
},
{
"epoch": 1.420942681812247,
"grad_norm": 6.777405738830566,
"learning_rate": 7.236536430834213e-06,
"loss": 0.7324,
"step": 2720
},
{
"epoch": 1.426165295730513,
"grad_norm": 5.1269402503967285,
"learning_rate": 7.225976768743401e-06,
"loss": 0.6326,
"step": 2730
},
{
"epoch": 1.4313879096487792,
"grad_norm": 7.080173492431641,
"learning_rate": 7.215417106652588e-06,
"loss": 0.7193,
"step": 2740
},
{
"epoch": 1.4366105235670452,
"grad_norm": 6.149571895599365,
"learning_rate": 7.204857444561775e-06,
"loss": 0.815,
"step": 2750
},
{
"epoch": 1.4418331374853115,
"grad_norm": 4.2188849449157715,
"learning_rate": 7.194297782470962e-06,
"loss": 0.8526,
"step": 2760
},
{
"epoch": 1.4470557514035776,
"grad_norm": 6.189548492431641,
"learning_rate": 7.183738120380148e-06,
"loss": 0.7041,
"step": 2770
},
{
"epoch": 1.4522783653218436,
"grad_norm": 8.304208755493164,
"learning_rate": 7.173178458289335e-06,
"loss": 0.8734,
"step": 2780
},
{
"epoch": 1.4575009792401097,
"grad_norm": 8.095356941223145,
"learning_rate": 7.162618796198522e-06,
"loss": 0.733,
"step": 2790
},
{
"epoch": 1.4627235931583757,
"grad_norm": 5.834177017211914,
"learning_rate": 7.1520591341077094e-06,
"loss": 0.7102,
"step": 2800
},
{
"epoch": 1.4627235931583757,
"eval_loss": 0.9070786237716675,
"eval_runtime": 46.2298,
"eval_samples_per_second": 36.816,
"eval_steps_per_second": 4.607,
"step": 2800
},
{
"epoch": 1.4679462070766418,
"grad_norm": 7.127483367919922,
"learning_rate": 7.1414994720168965e-06,
"loss": 0.8149,
"step": 2810
},
{
"epoch": 1.4731688209949079,
"grad_norm": 5.216626167297363,
"learning_rate": 7.131995776135164e-06,
"loss": 0.7184,
"step": 2820
},
{
"epoch": 1.4783914349131742,
"grad_norm": 5.421391487121582,
"learning_rate": 7.121436114044351e-06,
"loss": 0.757,
"step": 2830
},
{
"epoch": 1.4836140488314402,
"grad_norm": 5.557046413421631,
"learning_rate": 7.110876451953538e-06,
"loss": 0.7887,
"step": 2840
},
{
"epoch": 1.4888366627497063,
"grad_norm": 4.0539870262146,
"learning_rate": 7.1003167898627245e-06,
"loss": 0.7473,
"step": 2850
},
{
"epoch": 1.4940592766679723,
"grad_norm": 5.317719459533691,
"learning_rate": 7.0897571277719124e-06,
"loss": 0.7603,
"step": 2860
},
{
"epoch": 1.4992818905862384,
"grad_norm": 7.20483922958374,
"learning_rate": 7.079197465681099e-06,
"loss": 0.618,
"step": 2870
},
{
"epoch": 1.5045045045045045,
"grad_norm": 5.992430686950684,
"learning_rate": 7.068637803590286e-06,
"loss": 0.7101,
"step": 2880
},
{
"epoch": 1.5097271184227705,
"grad_norm": 5.599936008453369,
"learning_rate": 7.058078141499473e-06,
"loss": 0.7421,
"step": 2890
},
{
"epoch": 1.5149497323410368,
"grad_norm": 3.7422847747802734,
"learning_rate": 7.047518479408659e-06,
"loss": 0.57,
"step": 2900
},
{
"epoch": 1.5201723462593026,
"grad_norm": 6.516021251678467,
"learning_rate": 7.036958817317846e-06,
"loss": 0.7308,
"step": 2910
},
{
"epoch": 1.525394960177569,
"grad_norm": 6.263866901397705,
"learning_rate": 7.026399155227034e-06,
"loss": 0.8289,
"step": 2920
},
{
"epoch": 1.530617574095835,
"grad_norm": 4.577157974243164,
"learning_rate": 7.0158394931362205e-06,
"loss": 0.8547,
"step": 2930
},
{
"epoch": 1.535840188014101,
"grad_norm": 4.392026424407959,
"learning_rate": 7.006335797254489e-06,
"loss": 0.7407,
"step": 2940
},
{
"epoch": 1.541062801932367,
"grad_norm": 4.726680755615234,
"learning_rate": 6.995776135163675e-06,
"loss": 0.7082,
"step": 2950
},
{
"epoch": 1.5462854158506332,
"grad_norm": 6.287652492523193,
"learning_rate": 6.985216473072862e-06,
"loss": 0.7867,
"step": 2960
},
{
"epoch": 1.5515080297688995,
"grad_norm": 4.373517990112305,
"learning_rate": 6.974656810982049e-06,
"loss": 0.6765,
"step": 2970
},
{
"epoch": 1.5567306436871653,
"grad_norm": 6.31062126159668,
"learning_rate": 6.9640971488912356e-06,
"loss": 0.6577,
"step": 2980
},
{
"epoch": 1.5619532576054316,
"grad_norm": 4.2869415283203125,
"learning_rate": 6.9535374868004235e-06,
"loss": 0.6042,
"step": 2990
},
{
"epoch": 1.5671758715236976,
"grad_norm": 4.132930278778076,
"learning_rate": 6.942977824709611e-06,
"loss": 0.7663,
"step": 3000
},
{
"epoch": 1.5671758715236976,
"eval_loss": 0.8875888586044312,
"eval_runtime": 46.2332,
"eval_samples_per_second": 36.813,
"eval_steps_per_second": 4.607,
"step": 3000
},
{
"epoch": 1.5723984854419637,
"grad_norm": 4.136049270629883,
"learning_rate": 6.932418162618797e-06,
"loss": 0.6862,
"step": 3010
},
{
"epoch": 1.5776210993602298,
"grad_norm": 4.138570308685303,
"learning_rate": 6.921858500527984e-06,
"loss": 0.6622,
"step": 3020
},
{
"epoch": 1.5828437132784958,
"grad_norm": 6.920501708984375,
"learning_rate": 6.91129883843717e-06,
"loss": 0.6052,
"step": 3030
},
{
"epoch": 1.588066327196762,
"grad_norm": 5.639624118804932,
"learning_rate": 6.900739176346357e-06,
"loss": 0.6701,
"step": 3040
},
{
"epoch": 1.593288941115028,
"grad_norm": 5.700570106506348,
"learning_rate": 6.890179514255544e-06,
"loss": 0.8079,
"step": 3050
},
{
"epoch": 1.5985115550332942,
"grad_norm": 4.964538097381592,
"learning_rate": 6.8796198521647316e-06,
"loss": 0.6963,
"step": 3060
},
{
"epoch": 1.6037341689515603,
"grad_norm": 4.319785118103027,
"learning_rate": 6.869060190073919e-06,
"loss": 0.6664,
"step": 3070
},
{
"epoch": 1.6089567828698264,
"grad_norm": 6.524580478668213,
"learning_rate": 6.858500527983105e-06,
"loss": 0.7526,
"step": 3080
},
{
"epoch": 1.6141793967880924,
"grad_norm": 5.287715911865234,
"learning_rate": 6.847940865892292e-06,
"loss": 0.6971,
"step": 3090
},
{
"epoch": 1.6194020107063585,
"grad_norm": 6.37127161026001,
"learning_rate": 6.837381203801478e-06,
"loss": 0.6264,
"step": 3100
},
{
"epoch": 1.6246246246246248,
"grad_norm": 5.0084123611450195,
"learning_rate": 6.826821541710665e-06,
"loss": 0.7568,
"step": 3110
},
{
"epoch": 1.6298472385428906,
"grad_norm": 4.985651969909668,
"learning_rate": 6.816261879619853e-06,
"loss": 0.7554,
"step": 3120
},
{
"epoch": 1.6350698524611569,
"grad_norm": 6.405339241027832,
"learning_rate": 6.80570221752904e-06,
"loss": 0.6628,
"step": 3130
},
{
"epoch": 1.640292466379423,
"grad_norm": 5.617925643920898,
"learning_rate": 6.795142555438227e-06,
"loss": 0.6515,
"step": 3140
},
{
"epoch": 1.645515080297689,
"grad_norm": 2.850438117980957,
"learning_rate": 6.784582893347413e-06,
"loss": 0.6283,
"step": 3150
},
{
"epoch": 1.650737694215955,
"grad_norm": 7.605482578277588,
"learning_rate": 6.7740232312566e-06,
"loss": 0.6881,
"step": 3160
},
{
"epoch": 1.6559603081342211,
"grad_norm": 4.773893356323242,
"learning_rate": 6.763463569165787e-06,
"loss": 0.6669,
"step": 3170
},
{
"epoch": 1.6611829220524874,
"grad_norm": 4.882478713989258,
"learning_rate": 6.752903907074974e-06,
"loss": 0.6374,
"step": 3180
},
{
"epoch": 1.6664055359707532,
"grad_norm": 6.456390380859375,
"learning_rate": 6.7423442449841614e-06,
"loss": 0.7759,
"step": 3190
},
{
"epoch": 1.6716281498890195,
"grad_norm": 7.474002361297607,
"learning_rate": 6.7317845828933485e-06,
"loss": 0.6657,
"step": 3200
},
{
"epoch": 1.6716281498890195,
"eval_loss": 0.8759788870811462,
"eval_runtime": 46.2024,
"eval_samples_per_second": 36.838,
"eval_steps_per_second": 4.61,
"step": 3200
},
{
"epoch": 1.6768507638072856,
"grad_norm": 4.953747272491455,
"learning_rate": 6.721224920802535e-06,
"loss": 0.7443,
"step": 3210
},
{
"epoch": 1.6820733777255517,
"grad_norm": 5.401075839996338,
"learning_rate": 6.710665258711722e-06,
"loss": 0.7067,
"step": 3220
},
{
"epoch": 1.6872959916438177,
"grad_norm": 5.775487899780273,
"learning_rate": 6.700105596620908e-06,
"loss": 0.78,
"step": 3230
},
{
"epoch": 1.6925186055620838,
"grad_norm": 6.578312397003174,
"learning_rate": 6.689545934530095e-06,
"loss": 0.6206,
"step": 3240
},
{
"epoch": 1.69774121948035,
"grad_norm": 7.354413986206055,
"learning_rate": 6.678986272439283e-06,
"loss": 0.7265,
"step": 3250
},
{
"epoch": 1.702963833398616,
"grad_norm": 6.80817985534668,
"learning_rate": 6.6684266103484695e-06,
"loss": 0.8696,
"step": 3260
},
{
"epoch": 1.7081864473168822,
"grad_norm": 6.331092357635498,
"learning_rate": 6.657866948257657e-06,
"loss": 0.7226,
"step": 3270
},
{
"epoch": 1.7134090612351482,
"grad_norm": 6.063718795776367,
"learning_rate": 6.647307286166843e-06,
"loss": 0.6339,
"step": 3280
},
{
"epoch": 1.7186316751534143,
"grad_norm": 4.693406105041504,
"learning_rate": 6.63674762407603e-06,
"loss": 0.7316,
"step": 3290
},
{
"epoch": 1.7238542890716804,
"grad_norm": 6.732961654663086,
"learning_rate": 6.626187961985216e-06,
"loss": 0.7185,
"step": 3300
},
{
"epoch": 1.7290769029899464,
"grad_norm": 4.482574939727783,
"learning_rate": 6.615628299894404e-06,
"loss": 0.7814,
"step": 3310
},
{
"epoch": 1.7342995169082127,
"grad_norm": 7.299856662750244,
"learning_rate": 6.605068637803591e-06,
"loss": 0.7275,
"step": 3320
},
{
"epoch": 1.7395221308264786,
"grad_norm": 4.190903186798096,
"learning_rate": 6.5945089757127776e-06,
"loss": 0.6967,
"step": 3330
},
{
"epoch": 1.7447447447447448,
"grad_norm": 5.144697666168213,
"learning_rate": 6.583949313621965e-06,
"loss": 0.7279,
"step": 3340
},
{
"epoch": 1.7499673586630107,
"grad_norm": 8.06899642944336,
"learning_rate": 6.573389651531151e-06,
"loss": 0.7133,
"step": 3350
},
{
"epoch": 1.755189972581277,
"grad_norm": 5.388707637786865,
"learning_rate": 6.562829989440338e-06,
"loss": 0.7695,
"step": 3360
},
{
"epoch": 1.760412586499543,
"grad_norm": 5.485361576080322,
"learning_rate": 6.552270327349526e-06,
"loss": 0.6475,
"step": 3370
},
{
"epoch": 1.765635200417809,
"grad_norm": 5.023000717163086,
"learning_rate": 6.541710665258712e-06,
"loss": 0.6575,
"step": 3380
},
{
"epoch": 1.7708578143360754,
"grad_norm": 5.406675815582275,
"learning_rate": 6.531151003167899e-06,
"loss": 0.7263,
"step": 3390
},
{
"epoch": 1.7760804282543412,
"grad_norm": 3.564267873764038,
"learning_rate": 6.520591341077086e-06,
"loss": 0.598,
"step": 3400
},
{
"epoch": 1.7760804282543412,
"eval_loss": 0.8584678769111633,
"eval_runtime": 46.2248,
"eval_samples_per_second": 36.82,
"eval_steps_per_second": 4.608,
"step": 3400
},
{
"epoch": 1.7813030421726075,
"grad_norm": 4.055863380432129,
"learning_rate": 6.510031678986273e-06,
"loss": 0.6674,
"step": 3410
},
{
"epoch": 1.7865256560908733,
"grad_norm": 5.625813007354736,
"learning_rate": 6.49947201689546e-06,
"loss": 0.8424,
"step": 3420
},
{
"epoch": 1.7917482700091396,
"grad_norm": 6.47999906539917,
"learning_rate": 6.488912354804647e-06,
"loss": 0.6039,
"step": 3430
},
{
"epoch": 1.7969708839274057,
"grad_norm": 5.702643871307373,
"learning_rate": 6.478352692713834e-06,
"loss": 0.588,
"step": 3440
},
{
"epoch": 1.8021934978456717,
"grad_norm": 6.600216388702393,
"learning_rate": 6.467793030623021e-06,
"loss": 0.7704,
"step": 3450
},
{
"epoch": 1.807416111763938,
"grad_norm": 5.743258476257324,
"learning_rate": 6.4572333685322074e-06,
"loss": 0.6381,
"step": 3460
},
{
"epoch": 1.8126387256822039,
"grad_norm": 7.323511123657227,
"learning_rate": 6.4466737064413945e-06,
"loss": 0.8124,
"step": 3470
},
{
"epoch": 1.8178613396004701,
"grad_norm": 4.996503829956055,
"learning_rate": 6.436114044350581e-06,
"loss": 0.6816,
"step": 3480
},
{
"epoch": 1.823083953518736,
"grad_norm": 6.126676559448242,
"learning_rate": 6.425554382259768e-06,
"loss": 0.6773,
"step": 3490
},
{
"epoch": 1.8283065674370023,
"grad_norm": 4.893184185028076,
"learning_rate": 6.414994720168956e-06,
"loss": 0.684,
"step": 3500
},
{
"epoch": 1.8335291813552683,
"grad_norm": 6.5332841873168945,
"learning_rate": 6.404435058078142e-06,
"loss": 0.771,
"step": 3510
},
{
"epoch": 1.8387517952735344,
"grad_norm": 8.364972114562988,
"learning_rate": 6.393875395987329e-06,
"loss": 0.7224,
"step": 3520
},
{
"epoch": 1.8439744091918004,
"grad_norm": 5.508388042449951,
"learning_rate": 6.3833157338965155e-06,
"loss": 0.7867,
"step": 3530
},
{
"epoch": 1.8491970231100665,
"grad_norm": 6.582828044891357,
"learning_rate": 6.372756071805703e-06,
"loss": 0.5697,
"step": 3540
},
{
"epoch": 1.8544196370283328,
"grad_norm": 6.311943531036377,
"learning_rate": 6.362196409714889e-06,
"loss": 0.6919,
"step": 3550
},
{
"epoch": 1.8596422509465986,
"grad_norm": 8.718938827514648,
"learning_rate": 6.351636747624077e-06,
"loss": 0.685,
"step": 3560
},
{
"epoch": 1.864864864864865,
"grad_norm": 7.9847846031188965,
"learning_rate": 6.341077085533264e-06,
"loss": 0.5817,
"step": 3570
},
{
"epoch": 1.870087478783131,
"grad_norm": 5.257252216339111,
"learning_rate": 6.33051742344245e-06,
"loss": 0.7489,
"step": 3580
},
{
"epoch": 1.875310092701397,
"grad_norm": 6.155455589294434,
"learning_rate": 6.319957761351637e-06,
"loss": 0.6976,
"step": 3590
},
{
"epoch": 1.880532706619663,
"grad_norm": 5.958306789398193,
"learning_rate": 6.3093980992608236e-06,
"loss": 0.6385,
"step": 3600
},
{
"epoch": 1.880532706619663,
"eval_loss": 0.8434246778488159,
"eval_runtime": 46.2141,
"eval_samples_per_second": 36.829,
"eval_steps_per_second": 4.609,
"step": 3600
},
{
"epoch": 1.8857553205379292,
"grad_norm": 4.219689846038818,
"learning_rate": 6.298838437170011e-06,
"loss": 0.6891,
"step": 3610
},
{
"epoch": 1.8909779344561954,
"grad_norm": 6.840215682983398,
"learning_rate": 6.288278775079199e-06,
"loss": 0.5082,
"step": 3620
},
{
"epoch": 1.8962005483744613,
"grad_norm": 4.201242923736572,
"learning_rate": 6.277719112988385e-06,
"loss": 0.6311,
"step": 3630
},
{
"epoch": 1.9014231622927276,
"grad_norm": 4.635916709899902,
"learning_rate": 6.267159450897572e-06,
"loss": 0.5849,
"step": 3640
},
{
"epoch": 1.9066457762109936,
"grad_norm": 9.082616806030273,
"learning_rate": 6.256599788806758e-06,
"loss": 0.7785,
"step": 3650
},
{
"epoch": 1.9118683901292597,
"grad_norm": 7.005586624145508,
"learning_rate": 6.246040126715945e-06,
"loss": 0.5691,
"step": 3660
},
{
"epoch": 1.9170910040475257,
"grad_norm": 6.064583778381348,
"learning_rate": 6.2354804646251325e-06,
"loss": 0.7137,
"step": 3670
},
{
"epoch": 1.9223136179657918,
"grad_norm": 8.23308277130127,
"learning_rate": 6.2249208025343196e-06,
"loss": 0.7679,
"step": 3680
},
{
"epoch": 1.927536231884058,
"grad_norm": 3.684671401977539,
"learning_rate": 6.214361140443507e-06,
"loss": 0.652,
"step": 3690
},
{
"epoch": 1.932758845802324,
"grad_norm": 3.7538697719573975,
"learning_rate": 6.203801478352694e-06,
"loss": 0.7469,
"step": 3700
},
{
"epoch": 1.9379814597205902,
"grad_norm": 6.125692844390869,
"learning_rate": 6.19324181626188e-06,
"loss": 0.775,
"step": 3710
},
{
"epoch": 1.9432040736388563,
"grad_norm": 6.9215989112854,
"learning_rate": 6.182682154171067e-06,
"loss": 0.7417,
"step": 3720
},
{
"epoch": 1.9484266875571223,
"grad_norm": 5.37251615524292,
"learning_rate": 6.172122492080253e-06,
"loss": 0.6806,
"step": 3730
},
{
"epoch": 1.9536493014753884,
"grad_norm": 6.892059803009033,
"learning_rate": 6.1615628299894405e-06,
"loss": 0.72,
"step": 3740
},
{
"epoch": 1.9588719153936545,
"grad_norm": 4.601454257965088,
"learning_rate": 6.1510031678986285e-06,
"loss": 0.5358,
"step": 3750
},
{
"epoch": 1.9640945293119207,
"grad_norm": 6.319756984710693,
"learning_rate": 6.140443505807815e-06,
"loss": 0.6178,
"step": 3760
},
{
"epoch": 1.9693171432301866,
"grad_norm": 5.619534015655518,
"learning_rate": 6.129883843717002e-06,
"loss": 0.6324,
"step": 3770
},
{
"epoch": 1.9745397571484529,
"grad_norm": 6.824273109436035,
"learning_rate": 6.119324181626188e-06,
"loss": 0.6543,
"step": 3780
},
{
"epoch": 1.979762371066719,
"grad_norm": 4.03241491317749,
"learning_rate": 6.108764519535375e-06,
"loss": 0.6577,
"step": 3790
},
{
"epoch": 1.984984984984985,
"grad_norm": 8.365711212158203,
"learning_rate": 6.0982048574445615e-06,
"loss": 0.7096,
"step": 3800
},
{
"epoch": 1.984984984984985,
"eval_loss": 0.8318305611610413,
"eval_runtime": 46.2366,
"eval_samples_per_second": 36.811,
"eval_steps_per_second": 4.607,
"step": 3800
},
{
"epoch": 1.990207598903251,
"grad_norm": 4.457459926605225,
"learning_rate": 6.0876451953537494e-06,
"loss": 0.7665,
"step": 3810
},
{
"epoch": 1.9954302128215171,
"grad_norm": 5.687203407287598,
"learning_rate": 6.0770855332629365e-06,
"loss": 0.6722,
"step": 3820
},
{
"epoch": 2.001044522783653,
"grad_norm": 6.511070251464844,
"learning_rate": 6.066525871172123e-06,
"loss": 0.7669,
"step": 3830
},
{
"epoch": 2.0062671367019194,
"grad_norm": 5.137298583984375,
"learning_rate": 6.05596620908131e-06,
"loss": 0.5114,
"step": 3840
},
{
"epoch": 2.011489750620185,
"grad_norm": 7.063135623931885,
"learning_rate": 6.045406546990496e-06,
"loss": 0.5532,
"step": 3850
},
{
"epoch": 2.0167123645384515,
"grad_norm": 4.833804607391357,
"learning_rate": 6.034846884899683e-06,
"loss": 0.4304,
"step": 3860
},
{
"epoch": 2.0219349784567178,
"grad_norm": 6.820064544677734,
"learning_rate": 6.024287222808871e-06,
"loss": 0.5066,
"step": 3870
},
{
"epoch": 2.0271575923749836,
"grad_norm": 6.833749771118164,
"learning_rate": 6.0137275607180575e-06,
"loss": 0.5043,
"step": 3880
},
{
"epoch": 2.03238020629325,
"grad_norm": 4.371280670166016,
"learning_rate": 6.003167898627245e-06,
"loss": 0.6015,
"step": 3890
},
{
"epoch": 2.0376028202115157,
"grad_norm": 4.963273048400879,
"learning_rate": 5.992608236536432e-06,
"loss": 0.5881,
"step": 3900
},
{
"epoch": 2.042825434129782,
"grad_norm": 5.74879264831543,
"learning_rate": 5.982048574445618e-06,
"loss": 0.5776,
"step": 3910
},
{
"epoch": 2.048048048048048,
"grad_norm": 3.3885014057159424,
"learning_rate": 5.971488912354805e-06,
"loss": 0.6508,
"step": 3920
},
{
"epoch": 2.053270661966314,
"grad_norm": 3.875781774520874,
"learning_rate": 5.960929250263991e-06,
"loss": 0.5135,
"step": 3930
},
{
"epoch": 2.0584932758845804,
"grad_norm": 5.869786739349365,
"learning_rate": 5.950369588173179e-06,
"loss": 0.5343,
"step": 3940
},
{
"epoch": 2.0637158898028463,
"grad_norm": 5.666866779327393,
"learning_rate": 5.939809926082366e-06,
"loss": 0.5258,
"step": 3950
},
{
"epoch": 2.0689385037211125,
"grad_norm": 5.5713276863098145,
"learning_rate": 5.929250263991553e-06,
"loss": 0.6209,
"step": 3960
},
{
"epoch": 2.0741611176393784,
"grad_norm": 5.73265266418457,
"learning_rate": 5.91869060190074e-06,
"loss": 0.4707,
"step": 3970
},
{
"epoch": 2.0793837315576447,
"grad_norm": 5.312356948852539,
"learning_rate": 5.908130939809926e-06,
"loss": 0.5429,
"step": 3980
},
{
"epoch": 2.0846063454759105,
"grad_norm": 5.636459827423096,
"learning_rate": 5.897571277719113e-06,
"loss": 0.5524,
"step": 3990
},
{
"epoch": 2.089828959394177,
"grad_norm": 5.541628360748291,
"learning_rate": 5.887011615628301e-06,
"loss": 0.6338,
"step": 4000
},
{
"epoch": 2.089828959394177,
"eval_loss": 0.8418287038803101,
"eval_runtime": 46.2756,
"eval_samples_per_second": 36.78,
"eval_steps_per_second": 4.603,
"step": 4000
},
{
"epoch": 2.095051573312443,
"grad_norm": 7.190380096435547,
"learning_rate": 5.876451953537487e-06,
"loss": 0.5818,
"step": 4010
},
{
"epoch": 2.100274187230709,
"grad_norm": 6.190309047698975,
"learning_rate": 5.8658922914466745e-06,
"loss": 0.5268,
"step": 4020
},
{
"epoch": 2.105496801148975,
"grad_norm": 5.186282157897949,
"learning_rate": 5.855332629355861e-06,
"loss": 0.4396,
"step": 4030
},
{
"epoch": 2.110719415067241,
"grad_norm": 3.6927499771118164,
"learning_rate": 5.844772967265048e-06,
"loss": 0.6601,
"step": 4040
},
{
"epoch": 2.1159420289855073,
"grad_norm": 5.906070232391357,
"learning_rate": 5.834213305174234e-06,
"loss": 0.5902,
"step": 4050
},
{
"epoch": 2.121164642903773,
"grad_norm": 5.765960216522217,
"learning_rate": 5.823653643083422e-06,
"loss": 0.6127,
"step": 4060
},
{
"epoch": 2.1263872568220394,
"grad_norm": 6.663849830627441,
"learning_rate": 5.813093980992609e-06,
"loss": 0.5044,
"step": 4070
},
{
"epoch": 2.1316098707403057,
"grad_norm": 5.963075637817383,
"learning_rate": 5.8025343189017954e-06,
"loss": 0.4473,
"step": 4080
},
{
"epoch": 2.1368324846585716,
"grad_norm": 5.396392345428467,
"learning_rate": 5.7919746568109825e-06,
"loss": 0.4896,
"step": 4090
},
{
"epoch": 2.142055098576838,
"grad_norm": 6.087408542633057,
"learning_rate": 5.781414994720169e-06,
"loss": 0.5368,
"step": 4100
},
{
"epoch": 2.1472777124951037,
"grad_norm": 4.697368144989014,
"learning_rate": 5.770855332629356e-06,
"loss": 0.6034,
"step": 4110
},
{
"epoch": 2.15250032641337,
"grad_norm": 5.101240634918213,
"learning_rate": 5.760295670538544e-06,
"loss": 0.4699,
"step": 4120
},
{
"epoch": 2.157722940331636,
"grad_norm": 6.5133891105651855,
"learning_rate": 5.74973600844773e-06,
"loss": 0.4589,
"step": 4130
},
{
"epoch": 2.162945554249902,
"grad_norm": 7.932409763336182,
"learning_rate": 5.739176346356917e-06,
"loss": 0.5443,
"step": 4140
},
{
"epoch": 2.1681681681681684,
"grad_norm": 4.897655010223389,
"learning_rate": 5.728616684266104e-06,
"loss": 0.5367,
"step": 4150
},
{
"epoch": 2.173390782086434,
"grad_norm": 6.237987041473389,
"learning_rate": 5.718057022175291e-06,
"loss": 0.5477,
"step": 4160
},
{
"epoch": 2.1786133960047005,
"grad_norm": 5.690924167633057,
"learning_rate": 5.707497360084478e-06,
"loss": 0.6216,
"step": 4170
},
{
"epoch": 2.1838360099229663,
"grad_norm": 5.274245738983154,
"learning_rate": 5.696937697993664e-06,
"loss": 0.5441,
"step": 4180
},
{
"epoch": 2.1890586238412326,
"grad_norm": 6.222249984741211,
"learning_rate": 5.686378035902852e-06,
"loss": 0.5579,
"step": 4190
},
{
"epoch": 2.1942812377594985,
"grad_norm": 6.638361930847168,
"learning_rate": 5.675818373812039e-06,
"loss": 0.6108,
"step": 4200
},
{
"epoch": 2.1942812377594985,
"eval_loss": 0.8388937711715698,
"eval_runtime": 46.2559,
"eval_samples_per_second": 36.795,
"eval_steps_per_second": 4.605,
"step": 4200
},
{
"epoch": 2.1995038516777647,
"grad_norm": 5.303590297698975,
"learning_rate": 5.665258711721225e-06,
"loss": 0.5602,
"step": 4210
},
{
"epoch": 2.204726465596031,
"grad_norm": 4.8176727294921875,
"learning_rate": 5.654699049630412e-06,
"loss": 0.5329,
"step": 4220
},
{
"epoch": 2.209949079514297,
"grad_norm": 7.120988368988037,
"learning_rate": 5.644139387539599e-06,
"loss": 0.4963,
"step": 4230
},
{
"epoch": 2.215171693432563,
"grad_norm": 5.514588832855225,
"learning_rate": 5.633579725448786e-06,
"loss": 0.6124,
"step": 4240
},
{
"epoch": 2.220394307350829,
"grad_norm": 5.315512657165527,
"learning_rate": 5.623020063357974e-06,
"loss": 0.5925,
"step": 4250
},
{
"epoch": 2.2256169212690953,
"grad_norm": 5.721431732177734,
"learning_rate": 5.61246040126716e-06,
"loss": 0.6081,
"step": 4260
},
{
"epoch": 2.230839535187361,
"grad_norm": 7.8369832038879395,
"learning_rate": 5.601900739176347e-06,
"loss": 0.4506,
"step": 4270
},
{
"epoch": 2.2360621491056274,
"grad_norm": 5.328855514526367,
"learning_rate": 5.591341077085533e-06,
"loss": 0.61,
"step": 4280
},
{
"epoch": 2.2412847630238932,
"grad_norm": 4.6545891761779785,
"learning_rate": 5.5807814149947205e-06,
"loss": 0.5102,
"step": 4290
},
{
"epoch": 2.2465073769421595,
"grad_norm": 4.399157524108887,
"learning_rate": 5.570221752903907e-06,
"loss": 0.5495,
"step": 4300
},
{
"epoch": 2.251729990860426,
"grad_norm": 8.38592529296875,
"learning_rate": 5.559662090813095e-06,
"loss": 0.5832,
"step": 4310
},
{
"epoch": 2.2569526047786916,
"grad_norm": 3.9857163429260254,
"learning_rate": 5.549102428722282e-06,
"loss": 0.4725,
"step": 4320
},
{
"epoch": 2.262175218696958,
"grad_norm": 5.648230075836182,
"learning_rate": 5.538542766631468e-06,
"loss": 0.5457,
"step": 4330
},
{
"epoch": 2.2673978326152238,
"grad_norm": 3.6229002475738525,
"learning_rate": 5.527983104540655e-06,
"loss": 0.5997,
"step": 4340
},
{
"epoch": 2.27262044653349,
"grad_norm": 6.094500541687012,
"learning_rate": 5.517423442449842e-06,
"loss": 0.5271,
"step": 4350
},
{
"epoch": 2.2778430604517563,
"grad_norm": 7.447030544281006,
"learning_rate": 5.5068637803590285e-06,
"loss": 0.6592,
"step": 4360
},
{
"epoch": 2.283065674370022,
"grad_norm": 6.060546398162842,
"learning_rate": 5.4963041182682165e-06,
"loss": 0.5148,
"step": 4370
},
{
"epoch": 2.2882882882882885,
"grad_norm": 6.3843092918396,
"learning_rate": 5.485744456177403e-06,
"loss": 0.5469,
"step": 4380
},
{
"epoch": 2.2935109022065543,
"grad_norm": 5.431898593902588,
"learning_rate": 5.47518479408659e-06,
"loss": 0.5922,
"step": 4390
},
{
"epoch": 2.2987335161248206,
"grad_norm": 7.113710403442383,
"learning_rate": 5.464625131995777e-06,
"loss": 0.6052,
"step": 4400
},
{
"epoch": 2.2987335161248206,
"eval_loss": 0.8255796432495117,
"eval_runtime": 46.2225,
"eval_samples_per_second": 36.822,
"eval_steps_per_second": 4.608,
"step": 4400
},
{
"epoch": 2.3039561300430864,
"grad_norm": 5.431846618652344,
"learning_rate": 5.454065469904963e-06,
"loss": 0.5565,
"step": 4410
},
{
"epoch": 2.3091787439613527,
"grad_norm": 4.409207344055176,
"learning_rate": 5.44350580781415e-06,
"loss": 0.6042,
"step": 4420
},
{
"epoch": 2.3144013578796185,
"grad_norm": 6.498474597930908,
"learning_rate": 5.432946145723337e-06,
"loss": 0.526,
"step": 4430
},
{
"epoch": 2.319623971797885,
"grad_norm": 5.55870246887207,
"learning_rate": 5.4223864836325246e-06,
"loss": 0.6467,
"step": 4440
},
{
"epoch": 2.324846585716151,
"grad_norm": 5.727528095245361,
"learning_rate": 5.411826821541712e-06,
"loss": 0.5701,
"step": 4450
},
{
"epoch": 2.330069199634417,
"grad_norm": 5.781177520751953,
"learning_rate": 5.401267159450898e-06,
"loss": 0.5244,
"step": 4460
},
{
"epoch": 2.3352918135526832,
"grad_norm": 3.1030118465423584,
"learning_rate": 5.390707497360085e-06,
"loss": 0.4523,
"step": 4470
},
{
"epoch": 2.340514427470949,
"grad_norm": 3.780679941177368,
"learning_rate": 5.380147835269271e-06,
"loss": 0.5043,
"step": 4480
},
{
"epoch": 2.3457370413892153,
"grad_norm": 7.008651256561279,
"learning_rate": 5.369588173178458e-06,
"loss": 0.4744,
"step": 4490
},
{
"epoch": 2.3509596553074816,
"grad_norm": 5.0416693687438965,
"learning_rate": 5.359028511087646e-06,
"loss": 0.485,
"step": 4500
},
{
"epoch": 2.3561822692257475,
"grad_norm": 6.809351921081543,
"learning_rate": 5.348468848996833e-06,
"loss": 0.6127,
"step": 4510
},
{
"epoch": 2.3614048831440138,
"grad_norm": 5.149819374084473,
"learning_rate": 5.33790918690602e-06,
"loss": 0.5398,
"step": 4520
},
{
"epoch": 2.3666274970622796,
"grad_norm": 6.1692962646484375,
"learning_rate": 5.327349524815206e-06,
"loss": 0.5371,
"step": 4530
},
{
"epoch": 2.371850110980546,
"grad_norm": 7.224261283874512,
"learning_rate": 5.316789862724393e-06,
"loss": 0.5493,
"step": 4540
},
{
"epoch": 2.3770727248988117,
"grad_norm": 6.257209777832031,
"learning_rate": 5.306230200633579e-06,
"loss": 0.5177,
"step": 4550
},
{
"epoch": 2.382295338817078,
"grad_norm": 5.789205074310303,
"learning_rate": 5.295670538542767e-06,
"loss": 0.5016,
"step": 4560
},
{
"epoch": 2.387517952735344,
"grad_norm": 6.116456508636475,
"learning_rate": 5.285110876451954e-06,
"loss": 0.6088,
"step": 4570
},
{
"epoch": 2.39274056665361,
"grad_norm": 5.818353176116943,
"learning_rate": 5.274551214361141e-06,
"loss": 0.5105,
"step": 4580
},
{
"epoch": 2.3979631805718764,
"grad_norm": 5.000683784484863,
"learning_rate": 5.263991552270328e-06,
"loss": 0.5899,
"step": 4590
},
{
"epoch": 2.4031857944901422,
"grad_norm": 6.238855838775635,
"learning_rate": 5.253431890179515e-06,
"loss": 0.6326,
"step": 4600
},
{
"epoch": 2.4031857944901422,
"eval_loss": 0.8141046166419983,
"eval_runtime": 46.2243,
"eval_samples_per_second": 36.82,
"eval_steps_per_second": 4.608,
"step": 4600
},
{
"epoch": 2.4084084084084085,
"grad_norm": 4.040828704833984,
"learning_rate": 5.242872228088701e-06,
"loss": 0.4504,
"step": 4610
},
{
"epoch": 2.4136310223266744,
"grad_norm": 4.401372909545898,
"learning_rate": 5.232312565997888e-06,
"loss": 0.4497,
"step": 4620
},
{
"epoch": 2.4188536362449407,
"grad_norm": 5.584476947784424,
"learning_rate": 5.221752903907075e-06,
"loss": 0.6214,
"step": 4630
},
{
"epoch": 2.4240762501632065,
"grad_norm": 3.0335025787353516,
"learning_rate": 5.2111932418162625e-06,
"loss": 0.5114,
"step": 4640
},
{
"epoch": 2.4292988640814728,
"grad_norm": 6.4493727684021,
"learning_rate": 5.20063357972545e-06,
"loss": 0.6065,
"step": 4650
},
{
"epoch": 2.434521477999739,
"grad_norm": 4.674168109893799,
"learning_rate": 5.190073917634636e-06,
"loss": 0.4778,
"step": 4660
},
{
"epoch": 2.439744091918005,
"grad_norm": 6.156538963317871,
"learning_rate": 5.179514255543823e-06,
"loss": 0.489,
"step": 4670
},
{
"epoch": 2.444966705836271,
"grad_norm": 5.803397178649902,
"learning_rate": 5.168954593453009e-06,
"loss": 0.5948,
"step": 4680
},
{
"epoch": 2.450189319754537,
"grad_norm": 6.375555992126465,
"learning_rate": 5.158394931362197e-06,
"loss": 0.4768,
"step": 4690
},
{
"epoch": 2.4554119336728033,
"grad_norm": 6.442558288574219,
"learning_rate": 5.147835269271384e-06,
"loss": 0.4996,
"step": 4700
},
{
"epoch": 2.460634547591069,
"grad_norm": 8.586763381958008,
"learning_rate": 5.1372756071805705e-06,
"loss": 0.7869,
"step": 4710
},
{
"epoch": 2.4658571615093354,
"grad_norm": 6.680276393890381,
"learning_rate": 5.126715945089758e-06,
"loss": 0.5526,
"step": 4720
},
{
"epoch": 2.4710797754276017,
"grad_norm": 3.914189100265503,
"learning_rate": 5.116156282998944e-06,
"loss": 0.6078,
"step": 4730
},
{
"epoch": 2.4763023893458675,
"grad_norm": 7.448416233062744,
"learning_rate": 5.105596620908131e-06,
"loss": 0.4982,
"step": 4740
},
{
"epoch": 2.481525003264134,
"grad_norm": 7.687001705169678,
"learning_rate": 5.095036958817319e-06,
"loss": 0.6684,
"step": 4750
},
{
"epoch": 2.4867476171823997,
"grad_norm": 6.067854404449463,
"learning_rate": 5.084477296726505e-06,
"loss": 0.5189,
"step": 4760
},
{
"epoch": 2.491970231100666,
"grad_norm": 5.405977725982666,
"learning_rate": 5.073917634635692e-06,
"loss": 0.5212,
"step": 4770
},
{
"epoch": 2.497192845018932,
"grad_norm": 7.773893356323242,
"learning_rate": 5.063357972544879e-06,
"loss": 0.6681,
"step": 4780
},
{
"epoch": 2.502415458937198,
"grad_norm": 9.031746864318848,
"learning_rate": 5.052798310454066e-06,
"loss": 0.6579,
"step": 4790
},
{
"epoch": 2.507638072855464,
"grad_norm": 5.8720245361328125,
"learning_rate": 5.042238648363252e-06,
"loss": 0.5891,
"step": 4800
},
{
"epoch": 2.507638072855464,
"eval_loss": 0.8107092380523682,
"eval_runtime": 46.2852,
"eval_samples_per_second": 36.772,
"eval_steps_per_second": 4.602,
"step": 4800
},
{
"epoch": 2.51286068677373,
"grad_norm": 4.402786731719971,
"learning_rate": 5.03167898627244e-06,
"loss": 0.4713,
"step": 4810
},
{
"epoch": 2.5180833006919965,
"grad_norm": 5.443326473236084,
"learning_rate": 5.021119324181627e-06,
"loss": 0.6362,
"step": 4820
},
{
"epoch": 2.5233059146102623,
"grad_norm": 6.188055515289307,
"learning_rate": 5.010559662090813e-06,
"loss": 0.6042,
"step": 4830
},
{
"epoch": 2.5285285285285286,
"grad_norm": 5.5944600105285645,
"learning_rate": 5e-06,
"loss": 0.5093,
"step": 4840
},
{
"epoch": 2.5337511424467944,
"grad_norm": 6.214510917663574,
"learning_rate": 4.9894403379091875e-06,
"loss": 0.5405,
"step": 4850
},
{
"epoch": 2.5389737563650607,
"grad_norm": 4.829537868499756,
"learning_rate": 4.978880675818375e-06,
"loss": 0.52,
"step": 4860
},
{
"epoch": 2.544196370283327,
"grad_norm": 5.498637676239014,
"learning_rate": 4.968321013727561e-06,
"loss": 0.4998,
"step": 4870
},
{
"epoch": 2.549418984201593,
"grad_norm": 6.1551361083984375,
"learning_rate": 4.957761351636748e-06,
"loss": 0.5842,
"step": 4880
},
{
"epoch": 2.554641598119859,
"grad_norm": 5.512228488922119,
"learning_rate": 4.947201689545935e-06,
"loss": 0.5877,
"step": 4890
},
{
"epoch": 2.559864212038125,
"grad_norm": 6.113735675811768,
"learning_rate": 4.936642027455122e-06,
"loss": 0.4979,
"step": 4900
},
{
"epoch": 2.5650868259563913,
"grad_norm": 9.69180965423584,
"learning_rate": 4.9260823653643085e-06,
"loss": 0.5595,
"step": 4910
},
{
"epoch": 2.5703094398746575,
"grad_norm": 6.716381072998047,
"learning_rate": 4.915522703273496e-06,
"loss": 0.577,
"step": 4920
},
{
"epoch": 2.5755320537929234,
"grad_norm": 6.1616926193237305,
"learning_rate": 4.904963041182683e-06,
"loss": 0.4457,
"step": 4930
},
{
"epoch": 2.580754667711189,
"grad_norm": 5.395188331604004,
"learning_rate": 4.894403379091869e-06,
"loss": 0.4934,
"step": 4940
},
{
"epoch": 2.5859772816294555,
"grad_norm": 5.993736743927002,
"learning_rate": 4.883843717001057e-06,
"loss": 0.5133,
"step": 4950
},
{
"epoch": 2.591199895547722,
"grad_norm": 4.6023054122924805,
"learning_rate": 4.873284054910243e-06,
"loss": 0.5084,
"step": 4960
},
{
"epoch": 2.5964225094659876,
"grad_norm": 4.444733619689941,
"learning_rate": 4.86272439281943e-06,
"loss": 0.5717,
"step": 4970
},
{
"epoch": 2.601645123384254,
"grad_norm": 6.304750442504883,
"learning_rate": 4.8521647307286165e-06,
"loss": 0.5583,
"step": 4980
},
{
"epoch": 2.6068677373025197,
"grad_norm": 5.222369194030762,
"learning_rate": 4.8416050686378045e-06,
"loss": 0.5425,
"step": 4990
},
{
"epoch": 2.612090351220786,
"grad_norm": 5.956515789031982,
"learning_rate": 4.831045406546991e-06,
"loss": 0.5626,
"step": 5000
},
{
"epoch": 2.612090351220786,
"eval_loss": 0.8068883419036865,
"eval_runtime": 46.2364,
"eval_samples_per_second": 36.811,
"eval_steps_per_second": 4.607,
"step": 5000
},
{
"epoch": 2.6173129651390523,
"grad_norm": 6.04465913772583,
"learning_rate": 4.820485744456178e-06,
"loss": 0.6118,
"step": 5010
},
{
"epoch": 2.622535579057318,
"grad_norm": 4.643867015838623,
"learning_rate": 4.809926082365365e-06,
"loss": 0.6346,
"step": 5020
},
{
"epoch": 2.6277581929755844,
"grad_norm": 5.192962646484375,
"learning_rate": 4.799366420274551e-06,
"loss": 0.539,
"step": 5030
},
{
"epoch": 2.6329808068938503,
"grad_norm": 5.151241779327393,
"learning_rate": 4.788806758183738e-06,
"loss": 0.552,
"step": 5040
},
{
"epoch": 2.6382034208121166,
"grad_norm": 4.308994293212891,
"learning_rate": 4.7782470960929254e-06,
"loss": 0.5292,
"step": 5050
},
{
"epoch": 2.6434260347303824,
"grad_norm": 5.595186233520508,
"learning_rate": 4.7676874340021126e-06,
"loss": 0.4777,
"step": 5060
},
{
"epoch": 2.6486486486486487,
"grad_norm": 6.7816643714904785,
"learning_rate": 4.757127771911299e-06,
"loss": 0.4999,
"step": 5070
},
{
"epoch": 2.6538712625669145,
"grad_norm": 7.1444315910339355,
"learning_rate": 4.746568109820486e-06,
"loss": 0.5028,
"step": 5080
},
{
"epoch": 2.659093876485181,
"grad_norm": 5.04287576675415,
"learning_rate": 4.736008447729673e-06,
"loss": 0.5156,
"step": 5090
},
{
"epoch": 2.664316490403447,
"grad_norm": 4.410764694213867,
"learning_rate": 4.72544878563886e-06,
"loss": 0.5241,
"step": 5100
},
{
"epoch": 2.669539104321713,
"grad_norm": 4.788335800170898,
"learning_rate": 4.714889123548047e-06,
"loss": 0.5099,
"step": 5110
},
{
"epoch": 2.674761718239979,
"grad_norm": 5.737407207489014,
"learning_rate": 4.7043294614572335e-06,
"loss": 0.4395,
"step": 5120
},
{
"epoch": 2.679984332158245,
"grad_norm": 6.255344867706299,
"learning_rate": 4.693769799366421e-06,
"loss": 0.5994,
"step": 5130
},
{
"epoch": 2.6852069460765113,
"grad_norm": 5.028295516967773,
"learning_rate": 4.683210137275608e-06,
"loss": 0.5518,
"step": 5140
},
{
"epoch": 2.6904295599947776,
"grad_norm": 4.391537189483643,
"learning_rate": 4.672650475184795e-06,
"loss": 0.4991,
"step": 5150
},
{
"epoch": 2.6956521739130435,
"grad_norm": 5.823568820953369,
"learning_rate": 4.662090813093981e-06,
"loss": 0.5885,
"step": 5160
},
{
"epoch": 2.7008747878313093,
"grad_norm": 2.928971529006958,
"learning_rate": 4.651531151003168e-06,
"loss": 0.6942,
"step": 5170
},
{
"epoch": 2.7060974017495756,
"grad_norm": 9.696499824523926,
"learning_rate": 4.640971488912355e-06,
"loss": 0.5699,
"step": 5180
},
{
"epoch": 2.711320015667842,
"grad_norm": 4.2604217529296875,
"learning_rate": 4.630411826821542e-06,
"loss": 0.4676,
"step": 5190
},
{
"epoch": 2.7165426295861077,
"grad_norm": 5.672421932220459,
"learning_rate": 4.619852164730729e-06,
"loss": 0.4576,
"step": 5200
},
{
"epoch": 2.7165426295861077,
"eval_loss": 0.7933436632156372,
"eval_runtime": 46.2281,
"eval_samples_per_second": 36.817,
"eval_steps_per_second": 4.608,
"step": 5200
},
{
"epoch": 2.721765243504374,
"grad_norm": 4.114403247833252,
"learning_rate": 4.609292502639916e-06,
"loss": 0.4742,
"step": 5210
},
{
"epoch": 2.72698785742264,
"grad_norm": 5.642977237701416,
"learning_rate": 4.598732840549103e-06,
"loss": 0.5453,
"step": 5220
},
{
"epoch": 2.732210471340906,
"grad_norm": 5.690392017364502,
"learning_rate": 4.588173178458289e-06,
"loss": 0.5248,
"step": 5230
},
{
"epoch": 2.7374330852591724,
"grad_norm": 7.28243350982666,
"learning_rate": 4.577613516367477e-06,
"loss": 0.5059,
"step": 5240
},
{
"epoch": 2.7426556991774382,
"grad_norm": 5.291462421417236,
"learning_rate": 4.567053854276663e-06,
"loss": 0.4494,
"step": 5250
},
{
"epoch": 2.7478783130957045,
"grad_norm": 2.6974287033081055,
"learning_rate": 4.5564941921858505e-06,
"loss": 0.5439,
"step": 5260
},
{
"epoch": 2.7531009270139704,
"grad_norm": 4.0036940574646,
"learning_rate": 4.545934530095038e-06,
"loss": 0.4797,
"step": 5270
},
{
"epoch": 2.7583235409322366,
"grad_norm": 5.800724506378174,
"learning_rate": 4.535374868004224e-06,
"loss": 0.5122,
"step": 5280
},
{
"epoch": 2.763546154850503,
"grad_norm": 6.420878887176514,
"learning_rate": 4.524815205913411e-06,
"loss": 0.4956,
"step": 5290
},
{
"epoch": 2.7687687687687688,
"grad_norm": 6.129545211791992,
"learning_rate": 4.514255543822598e-06,
"loss": 0.5465,
"step": 5300
},
{
"epoch": 2.7739913826870346,
"grad_norm": 5.964089870452881,
"learning_rate": 4.503695881731785e-06,
"loss": 0.5012,
"step": 5310
},
{
"epoch": 2.779213996605301,
"grad_norm": 5.476171493530273,
"learning_rate": 4.4931362196409714e-06,
"loss": 0.4842,
"step": 5320
},
{
"epoch": 2.784436610523567,
"grad_norm": 3.6184587478637695,
"learning_rate": 4.4825765575501585e-06,
"loss": 0.464,
"step": 5330
},
{
"epoch": 2.789659224441833,
"grad_norm": 6.052497863769531,
"learning_rate": 4.472016895459346e-06,
"loss": 0.5625,
"step": 5340
},
{
"epoch": 2.7948818383600993,
"grad_norm": 8.540828704833984,
"learning_rate": 4.461457233368533e-06,
"loss": 0.581,
"step": 5350
},
{
"epoch": 2.800104452278365,
"grad_norm": 6.093939781188965,
"learning_rate": 4.45089757127772e-06,
"loss": 0.5907,
"step": 5360
},
{
"epoch": 2.8053270661966314,
"grad_norm": 4.6257405281066895,
"learning_rate": 4.440337909186906e-06,
"loss": 0.4679,
"step": 5370
},
{
"epoch": 2.8105496801148977,
"grad_norm": 4.155122756958008,
"learning_rate": 4.429778247096093e-06,
"loss": 0.4799,
"step": 5380
},
{
"epoch": 2.8157722940331635,
"grad_norm": 6.194579601287842,
"learning_rate": 4.41921858500528e-06,
"loss": 0.5603,
"step": 5390
},
{
"epoch": 2.82099490795143,
"grad_norm": 7.508232593536377,
"learning_rate": 4.4086589229144675e-06,
"loss": 0.6672,
"step": 5400
},
{
"epoch": 2.82099490795143,
"eval_loss": 0.7919074296951294,
"eval_runtime": 46.2409,
"eval_samples_per_second": 36.807,
"eval_steps_per_second": 4.606,
"step": 5400
},
{
"epoch": 2.8262175218696957,
"grad_norm": 6.648965358734131,
"learning_rate": 4.398099260823654e-06,
"loss": 0.6096,
"step": 5410
},
{
"epoch": 2.831440135787962,
"grad_norm": 6.033213138580322,
"learning_rate": 4.387539598732841e-06,
"loss": 0.5375,
"step": 5420
},
{
"epoch": 2.8366627497062282,
"grad_norm": 5.509268760681152,
"learning_rate": 4.376979936642028e-06,
"loss": 0.4289,
"step": 5430
},
{
"epoch": 2.841885363624494,
"grad_norm": 4.335843563079834,
"learning_rate": 4.366420274551215e-06,
"loss": 0.443,
"step": 5440
},
{
"epoch": 2.84710797754276,
"grad_norm": 3.9269707202911377,
"learning_rate": 4.355860612460401e-06,
"loss": 0.4666,
"step": 5450
},
{
"epoch": 2.852330591461026,
"grad_norm": 7.474977970123291,
"learning_rate": 4.345300950369588e-06,
"loss": 0.5793,
"step": 5460
},
{
"epoch": 2.8575532053792925,
"grad_norm": 3.6517860889434814,
"learning_rate": 4.3347412882787755e-06,
"loss": 0.4276,
"step": 5470
},
{
"epoch": 2.8627758192975583,
"grad_norm": 4.662909030914307,
"learning_rate": 4.324181626187962e-06,
"loss": 0.5647,
"step": 5480
},
{
"epoch": 2.8679984332158246,
"grad_norm": 6.30706787109375,
"learning_rate": 4.31362196409715e-06,
"loss": 0.5985,
"step": 5490
},
{
"epoch": 2.8732210471340904,
"grad_norm": 3.8878538608551025,
"learning_rate": 4.303062302006336e-06,
"loss": 0.4561,
"step": 5500
},
{
"epoch": 2.8784436610523567,
"grad_norm": 4.7928667068481445,
"learning_rate": 4.292502639915523e-06,
"loss": 0.5941,
"step": 5510
},
{
"epoch": 2.883666274970623,
"grad_norm": 7.024383544921875,
"learning_rate": 4.28194297782471e-06,
"loss": 0.51,
"step": 5520
},
{
"epoch": 2.888888888888889,
"grad_norm": 4.680624961853027,
"learning_rate": 4.2713833157338965e-06,
"loss": 0.5751,
"step": 5530
},
{
"epoch": 2.894111502807155,
"grad_norm": 5.982644081115723,
"learning_rate": 4.260823653643084e-06,
"loss": 0.5139,
"step": 5540
},
{
"epoch": 2.899334116725421,
"grad_norm": 5.7597784996032715,
"learning_rate": 4.250263991552271e-06,
"loss": 0.5367,
"step": 5550
},
{
"epoch": 2.9045567306436872,
"grad_norm": 5.401406764984131,
"learning_rate": 4.239704329461458e-06,
"loss": 0.5542,
"step": 5560
},
{
"epoch": 2.9097793445619535,
"grad_norm": 8.459036827087402,
"learning_rate": 4.229144667370644e-06,
"loss": 0.5354,
"step": 5570
},
{
"epoch": 2.9150019584802194,
"grad_norm": 6.203250885009766,
"learning_rate": 4.218585005279832e-06,
"loss": 0.5376,
"step": 5580
},
{
"epoch": 2.920224572398485,
"grad_norm": 6.801321983337402,
"learning_rate": 4.208025343189018e-06,
"loss": 0.4566,
"step": 5590
},
{
"epoch": 2.9254471863167515,
"grad_norm": 2.9718496799468994,
"learning_rate": 4.197465681098205e-06,
"loss": 0.483,
"step": 5600
},
{
"epoch": 2.9254471863167515,
"eval_loss": 0.7785268425941467,
"eval_runtime": 46.2314,
"eval_samples_per_second": 36.815,
"eval_steps_per_second": 4.607,
"step": 5600
},
{
"epoch": 2.9306698002350178,
"grad_norm": 3.2528350353240967,
"learning_rate": 4.1869060190073925e-06,
"loss": 0.5526,
"step": 5610
},
{
"epoch": 2.9358924141532836,
"grad_norm": 3.9217379093170166,
"learning_rate": 4.176346356916579e-06,
"loss": 0.558,
"step": 5620
},
{
"epoch": 2.94111502807155,
"grad_norm": 4.15424919128418,
"learning_rate": 4.165786694825766e-06,
"loss": 0.4264,
"step": 5630
},
{
"epoch": 2.9463376419898157,
"grad_norm": 5.303351402282715,
"learning_rate": 4.155227032734953e-06,
"loss": 0.5739,
"step": 5640
},
{
"epoch": 2.951560255908082,
"grad_norm": 7.237427234649658,
"learning_rate": 4.14466737064414e-06,
"loss": 0.5442,
"step": 5650
},
{
"epoch": 2.9567828698263483,
"grad_norm": 4.967709541320801,
"learning_rate": 4.134107708553326e-06,
"loss": 0.4926,
"step": 5660
},
{
"epoch": 2.962005483744614,
"grad_norm": 5.920149326324463,
"learning_rate": 4.1235480464625134e-06,
"loss": 0.5655,
"step": 5670
},
{
"epoch": 2.9672280976628804,
"grad_norm": 5.823659896850586,
"learning_rate": 4.1129883843717006e-06,
"loss": 0.5124,
"step": 5680
},
{
"epoch": 2.9724507115811463,
"grad_norm": 3.851020336151123,
"learning_rate": 4.102428722280888e-06,
"loss": 0.423,
"step": 5690
},
{
"epoch": 2.9776733254994125,
"grad_norm": 4.632102012634277,
"learning_rate": 4.091869060190074e-06,
"loss": 0.42,
"step": 5700
},
{
"epoch": 2.9828959394176784,
"grad_norm": 6.207057476043701,
"learning_rate": 4.081309398099261e-06,
"loss": 0.4569,
"step": 5710
},
{
"epoch": 2.9881185533359447,
"grad_norm": 4.414632797241211,
"learning_rate": 4.070749736008448e-06,
"loss": 0.4628,
"step": 5720
},
{
"epoch": 2.9933411672542105,
"grad_norm": 5.721477508544922,
"learning_rate": 4.060190073917634e-06,
"loss": 0.4518,
"step": 5730
},
{
"epoch": 2.998563781172477,
"grad_norm": 5.436526298522949,
"learning_rate": 4.049630411826822e-06,
"loss": 0.5313,
"step": 5740
},
{
"epoch": 3.0041780911346128,
"grad_norm": 7.654147148132324,
"learning_rate": 4.039070749736009e-06,
"loss": 0.494,
"step": 5750
},
{
"epoch": 3.009400705052879,
"grad_norm": 5.68324089050293,
"learning_rate": 4.028511087645196e-06,
"loss": 0.3699,
"step": 5760
},
{
"epoch": 3.014623318971145,
"grad_norm": 5.219386577606201,
"learning_rate": 4.017951425554383e-06,
"loss": 0.4355,
"step": 5770
},
{
"epoch": 3.019845932889411,
"grad_norm": 6.570154190063477,
"learning_rate": 4.007391763463569e-06,
"loss": 0.4075,
"step": 5780
},
{
"epoch": 3.0250685468076774,
"grad_norm": 7.014920234680176,
"learning_rate": 3.996832101372756e-06,
"loss": 0.3904,
"step": 5790
},
{
"epoch": 3.0302911607259433,
"grad_norm": 4.148968696594238,
"learning_rate": 3.986272439281943e-06,
"loss": 0.3938,
"step": 5800
},
{
"epoch": 3.0302911607259433,
"eval_loss": 0.8061103224754333,
"eval_runtime": 46.2431,
"eval_samples_per_second": 36.806,
"eval_steps_per_second": 4.606,
"step": 5800
},
{
"epoch": 3.0355137746442096,
"grad_norm": 8.123299598693848,
"learning_rate": 3.97571277719113e-06,
"loss": 0.4201,
"step": 5810
},
{
"epoch": 3.0407363885624754,
"grad_norm": 8.127564430236816,
"learning_rate": 3.965153115100317e-06,
"loss": 0.416,
"step": 5820
},
{
"epoch": 3.0459590024807417,
"grad_norm": 3.643094539642334,
"learning_rate": 3.954593453009505e-06,
"loss": 0.36,
"step": 5830
},
{
"epoch": 3.0511816163990075,
"grad_norm": 3.6215567588806152,
"learning_rate": 3.944033790918691e-06,
"loss": 0.354,
"step": 5840
},
{
"epoch": 3.056404230317274,
"grad_norm": 6.820983409881592,
"learning_rate": 3.933474128827878e-06,
"loss": 0.4113,
"step": 5850
},
{
"epoch": 3.06162684423554,
"grad_norm": 5.715891361236572,
"learning_rate": 3.922914466737065e-06,
"loss": 0.3628,
"step": 5860
},
{
"epoch": 3.066849458153806,
"grad_norm": 7.616763114929199,
"learning_rate": 3.912354804646251e-06,
"loss": 0.4049,
"step": 5870
},
{
"epoch": 3.0720720720720722,
"grad_norm": 4.177463531494141,
"learning_rate": 3.9017951425554385e-06,
"loss": 0.4452,
"step": 5880
},
{
"epoch": 3.077294685990338,
"grad_norm": 4.512898921966553,
"learning_rate": 3.891235480464626e-06,
"loss": 0.384,
"step": 5890
},
{
"epoch": 3.0825172999086043,
"grad_norm": 3.9176089763641357,
"learning_rate": 3.880675818373813e-06,
"loss": 0.4524,
"step": 5900
},
{
"epoch": 3.08773991382687,
"grad_norm": 5.587650299072266,
"learning_rate": 3.870116156282999e-06,
"loss": 0.3638,
"step": 5910
},
{
"epoch": 3.0929625277451365,
"grad_norm": 5.886160373687744,
"learning_rate": 3.859556494192186e-06,
"loss": 0.3664,
"step": 5920
},
{
"epoch": 3.0981851416634028,
"grad_norm": 3.332893133163452,
"learning_rate": 3.848996832101373e-06,
"loss": 0.4352,
"step": 5930
},
{
"epoch": 3.1034077555816686,
"grad_norm": 4.1508097648620605,
"learning_rate": 3.83843717001056e-06,
"loss": 0.485,
"step": 5940
},
{
"epoch": 3.108630369499935,
"grad_norm": 6.934416770935059,
"learning_rate": 3.8278775079197465e-06,
"loss": 0.4333,
"step": 5950
},
{
"epoch": 3.1138529834182007,
"grad_norm": 5.893505573272705,
"learning_rate": 3.817317845828934e-06,
"loss": 0.4209,
"step": 5960
},
{
"epoch": 3.119075597336467,
"grad_norm": 6.001057147979736,
"learning_rate": 3.8067581837381208e-06,
"loss": 0.4274,
"step": 5970
},
{
"epoch": 3.124298211254733,
"grad_norm": 4.873240947723389,
"learning_rate": 3.7961985216473074e-06,
"loss": 0.3278,
"step": 5980
},
{
"epoch": 3.129520825172999,
"grad_norm": 6.454697132110596,
"learning_rate": 3.7856388595564946e-06,
"loss": 0.4673,
"step": 5990
},
{
"epoch": 3.134743439091265,
"grad_norm": 5.828022003173828,
"learning_rate": 3.7750791974656812e-06,
"loss": 0.4288,
"step": 6000
},
{
"epoch": 3.134743439091265,
"eval_loss": 0.8229681849479675,
"eval_runtime": 46.2578,
"eval_samples_per_second": 36.794,
"eval_steps_per_second": 4.605,
"step": 6000
},
{
"epoch": 3.1399660530095312,
"grad_norm": 6.981528282165527,
"learning_rate": 3.7645195353748684e-06,
"loss": 0.3955,
"step": 6010
},
{
"epoch": 3.1451886669277975,
"grad_norm": 4.903995990753174,
"learning_rate": 3.7539598732840555e-06,
"loss": 0.3907,
"step": 6020
},
{
"epoch": 3.1504112808460634,
"grad_norm": 4.399137496948242,
"learning_rate": 3.743400211193242e-06,
"loss": 0.4785,
"step": 6030
},
{
"epoch": 3.1556338947643296,
"grad_norm": 3.4194514751434326,
"learning_rate": 3.732840549102429e-06,
"loss": 0.4086,
"step": 6040
},
{
"epoch": 3.1608565086825955,
"grad_norm": 6.683743476867676,
"learning_rate": 3.7222808870116164e-06,
"loss": 0.4418,
"step": 6050
},
{
"epoch": 3.1660791226008618,
"grad_norm": 4.924780368804932,
"learning_rate": 3.711721224920803e-06,
"loss": 0.4035,
"step": 6060
},
{
"epoch": 3.171301736519128,
"grad_norm": 6.23117733001709,
"learning_rate": 3.7011615628299897e-06,
"loss": 0.3554,
"step": 6070
},
{
"epoch": 3.176524350437394,
"grad_norm": 7.044112682342529,
"learning_rate": 3.690601900739177e-06,
"loss": 0.4143,
"step": 6080
},
{
"epoch": 3.18174696435566,
"grad_norm": 6.9131059646606445,
"learning_rate": 3.6800422386483635e-06,
"loss": 0.4045,
"step": 6090
},
{
"epoch": 3.186969578273926,
"grad_norm": 5.362022876739502,
"learning_rate": 3.66948257655755e-06,
"loss": 0.4318,
"step": 6100
},
{
"epoch": 3.1921921921921923,
"grad_norm": 5.799452781677246,
"learning_rate": 3.6589229144667377e-06,
"loss": 0.4649,
"step": 6110
},
{
"epoch": 3.197414806110458,
"grad_norm": 7.595244884490967,
"learning_rate": 3.6483632523759244e-06,
"loss": 0.4466,
"step": 6120
},
{
"epoch": 3.2026374200287244,
"grad_norm": 2.4312336444854736,
"learning_rate": 3.637803590285111e-06,
"loss": 0.4557,
"step": 6130
},
{
"epoch": 3.2078600339469903,
"grad_norm": 5.092735767364502,
"learning_rate": 3.627243928194298e-06,
"loss": 0.4255,
"step": 6140
},
{
"epoch": 3.2130826478652565,
"grad_norm": 5.3324151039123535,
"learning_rate": 3.616684266103485e-06,
"loss": 0.432,
"step": 6150
},
{
"epoch": 3.218305261783523,
"grad_norm": 4.059586524963379,
"learning_rate": 3.606124604012672e-06,
"loss": 0.4766,
"step": 6160
},
{
"epoch": 3.2235278757017887,
"grad_norm": 6.850623607635498,
"learning_rate": 3.5955649419218587e-06,
"loss": 0.4387,
"step": 6170
},
{
"epoch": 3.228750489620055,
"grad_norm": 5.995065212249756,
"learning_rate": 3.585005279831046e-06,
"loss": 0.4257,
"step": 6180
},
{
"epoch": 3.233973103538321,
"grad_norm": 3.885401487350464,
"learning_rate": 3.5744456177402325e-06,
"loss": 0.4423,
"step": 6190
},
{
"epoch": 3.239195717456587,
"grad_norm": 4.709335803985596,
"learning_rate": 3.563885955649419e-06,
"loss": 0.3812,
"step": 6200
},
{
"epoch": 3.239195717456587,
"eval_loss": 0.810655415058136,
"eval_runtime": 46.276,
"eval_samples_per_second": 36.779,
"eval_steps_per_second": 4.603,
"step": 6200
},
{
"epoch": 3.244418331374853,
"grad_norm": 7.61454963684082,
"learning_rate": 3.5533262935586067e-06,
"loss": 0.4508,
"step": 6210
},
{
"epoch": 3.249640945293119,
"grad_norm": 7.8873066902160645,
"learning_rate": 3.5427666314677934e-06,
"loss": 0.4208,
"step": 6220
},
{
"epoch": 3.2548635592113855,
"grad_norm": 5.000669956207275,
"learning_rate": 3.53220696937698e-06,
"loss": 0.4103,
"step": 6230
},
{
"epoch": 3.2600861731296513,
"grad_norm": 4.964175701141357,
"learning_rate": 3.521647307286167e-06,
"loss": 0.4562,
"step": 6240
},
{
"epoch": 3.2653087870479176,
"grad_norm": 4.287696838378906,
"learning_rate": 3.511087645195354e-06,
"loss": 0.4661,
"step": 6250
},
{
"epoch": 3.2705314009661834,
"grad_norm": 5.519683837890625,
"learning_rate": 3.500527983104541e-06,
"loss": 0.4882,
"step": 6260
},
{
"epoch": 3.2757540148844497,
"grad_norm": 5.700749397277832,
"learning_rate": 3.489968321013728e-06,
"loss": 0.4463,
"step": 6270
},
{
"epoch": 3.2809766288027156,
"grad_norm": 5.7745466232299805,
"learning_rate": 3.4794086589229148e-06,
"loss": 0.4626,
"step": 6280
},
{
"epoch": 3.286199242720982,
"grad_norm": 8.0064058303833,
"learning_rate": 3.4688489968321015e-06,
"loss": 0.3982,
"step": 6290
},
{
"epoch": 3.291421856639248,
"grad_norm": 5.860507488250732,
"learning_rate": 3.458289334741289e-06,
"loss": 0.3835,
"step": 6300
},
{
"epoch": 3.296644470557514,
"grad_norm": 7.413349628448486,
"learning_rate": 3.4477296726504757e-06,
"loss": 0.4338,
"step": 6310
},
{
"epoch": 3.3018670844757803,
"grad_norm": 4.818141937255859,
"learning_rate": 3.4371700105596624e-06,
"loss": 0.497,
"step": 6320
},
{
"epoch": 3.307089698394046,
"grad_norm": 3.987377405166626,
"learning_rate": 3.4266103484688495e-06,
"loss": 0.4099,
"step": 6330
},
{
"epoch": 3.3123123123123124,
"grad_norm": 7.0202860832214355,
"learning_rate": 3.416050686378036e-06,
"loss": 0.4791,
"step": 6340
},
{
"epoch": 3.317534926230578,
"grad_norm": 6.688587665557861,
"learning_rate": 3.405491024287223e-06,
"loss": 0.427,
"step": 6350
},
{
"epoch": 3.3227575401488445,
"grad_norm": 6.508810997009277,
"learning_rate": 3.39493136219641e-06,
"loss": 0.431,
"step": 6360
},
{
"epoch": 3.3279801540671103,
"grad_norm": 7.402127265930176,
"learning_rate": 3.384371700105597e-06,
"loss": 0.4145,
"step": 6370
},
{
"epoch": 3.3332027679853766,
"grad_norm": 4.331240177154541,
"learning_rate": 3.3738120380147837e-06,
"loss": 0.3799,
"step": 6380
},
{
"epoch": 3.338425381903643,
"grad_norm": 6.24545431137085,
"learning_rate": 3.3632523759239704e-06,
"loss": 0.4339,
"step": 6390
},
{
"epoch": 3.3436479958219087,
"grad_norm": 6.327270030975342,
"learning_rate": 3.352692713833158e-06,
"loss": 0.4506,
"step": 6400
},
{
"epoch": 3.3436479958219087,
"eval_loss": 0.8060568571090698,
"eval_runtime": 46.2458,
"eval_samples_per_second": 36.803,
"eval_steps_per_second": 4.606,
"step": 6400
},
{
"epoch": 3.348870609740175,
"grad_norm": 2.5744376182556152,
"learning_rate": 3.3421330517423446e-06,
"loss": 0.484,
"step": 6410
},
{
"epoch": 3.354093223658441,
"grad_norm": 3.4706344604492188,
"learning_rate": 3.3315733896515313e-06,
"loss": 0.3897,
"step": 6420
},
{
"epoch": 3.359315837576707,
"grad_norm": 6.175302028656006,
"learning_rate": 3.3210137275607184e-06,
"loss": 0.4054,
"step": 6430
},
{
"epoch": 3.3645384514949734,
"grad_norm": 5.064645767211914,
"learning_rate": 3.310454065469905e-06,
"loss": 0.4458,
"step": 6440
},
{
"epoch": 3.3697610654132393,
"grad_norm": 8.018420219421387,
"learning_rate": 3.299894403379092e-06,
"loss": 0.5056,
"step": 6450
},
{
"epoch": 3.3749836793315056,
"grad_norm": 6.1567301750183105,
"learning_rate": 3.2893347412882793e-06,
"loss": 0.3735,
"step": 6460
},
{
"epoch": 3.3802062932497714,
"grad_norm": 5.155027866363525,
"learning_rate": 3.278775079197466e-06,
"loss": 0.3715,
"step": 6470
},
{
"epoch": 3.3854289071680377,
"grad_norm": 5.396885395050049,
"learning_rate": 3.2682154171066527e-06,
"loss": 0.3597,
"step": 6480
},
{
"epoch": 3.3906515210863035,
"grad_norm": 3.0646581649780273,
"learning_rate": 3.25765575501584e-06,
"loss": 0.4777,
"step": 6490
},
{
"epoch": 3.39587413500457,
"grad_norm": 5.380611419677734,
"learning_rate": 3.2470960929250265e-06,
"loss": 0.4137,
"step": 6500
},
{
"epoch": 3.4010967489228356,
"grad_norm": 6.225546360015869,
"learning_rate": 3.2365364308342136e-06,
"loss": 0.313,
"step": 6510
},
{
"epoch": 3.406319362841102,
"grad_norm": 4.760247707366943,
"learning_rate": 3.2259767687434007e-06,
"loss": 0.3674,
"step": 6520
},
{
"epoch": 3.411541976759368,
"grad_norm": 7.30977725982666,
"learning_rate": 3.2154171066525874e-06,
"loss": 0.3591,
"step": 6530
},
{
"epoch": 3.416764590677634,
"grad_norm": 7.618262767791748,
"learning_rate": 3.204857444561774e-06,
"loss": 0.4384,
"step": 6540
},
{
"epoch": 3.4219872045959003,
"grad_norm": 3.0289359092712402,
"learning_rate": 3.1942977824709616e-06,
"loss": 0.3818,
"step": 6550
},
{
"epoch": 3.427209818514166,
"grad_norm": 3.5998988151550293,
"learning_rate": 3.1837381203801483e-06,
"loss": 0.4318,
"step": 6560
},
{
"epoch": 3.4324324324324325,
"grad_norm": 4.631134033203125,
"learning_rate": 3.173178458289335e-06,
"loss": 0.3956,
"step": 6570
},
{
"epoch": 3.4376550463506987,
"grad_norm": 8.129390716552734,
"learning_rate": 3.1626187961985217e-06,
"loss": 0.3928,
"step": 6580
},
{
"epoch": 3.4428776602689646,
"grad_norm": 7.39198637008667,
"learning_rate": 3.1520591341077088e-06,
"loss": 0.4827,
"step": 6590
},
{
"epoch": 3.448100274187231,
"grad_norm": 4.935920238494873,
"learning_rate": 3.1414994720168955e-06,
"loss": 0.4126,
"step": 6600
},
{
"epoch": 3.448100274187231,
"eval_loss": 0.8010614514350891,
"eval_runtime": 46.3186,
"eval_samples_per_second": 36.746,
"eval_steps_per_second": 4.599,
"step": 6600
},
{
"epoch": 3.4533228881054967,
"grad_norm": 4.4114460945129395,
"learning_rate": 3.1309398099260826e-06,
"loss": 0.4439,
"step": 6610
},
{
"epoch": 3.458545502023763,
"grad_norm": 6.824039459228516,
"learning_rate": 3.1203801478352697e-06,
"loss": 0.3552,
"step": 6620
},
{
"epoch": 3.463768115942029,
"grad_norm": 4.671921253204346,
"learning_rate": 3.1098204857444564e-06,
"loss": 0.3289,
"step": 6630
},
{
"epoch": 3.468990729860295,
"grad_norm": 3.557352304458618,
"learning_rate": 3.099260823653643e-06,
"loss": 0.4342,
"step": 6640
},
{
"epoch": 3.474213343778561,
"grad_norm": 5.833057403564453,
"learning_rate": 3.0887011615628306e-06,
"loss": 0.4985,
"step": 6650
},
{
"epoch": 3.4794359576968272,
"grad_norm": 3.8494341373443604,
"learning_rate": 3.0781414994720173e-06,
"loss": 0.4175,
"step": 6660
},
{
"epoch": 3.4846585716150935,
"grad_norm": 5.017399311065674,
"learning_rate": 3.067581837381204e-06,
"loss": 0.3396,
"step": 6670
},
{
"epoch": 3.4898811855333594,
"grad_norm": 8.080399513244629,
"learning_rate": 3.057022175290391e-06,
"loss": 0.412,
"step": 6680
},
{
"epoch": 3.4951037994516256,
"grad_norm": 4.017096996307373,
"learning_rate": 3.0464625131995777e-06,
"loss": 0.4273,
"step": 6690
},
{
"epoch": 3.5003264133698915,
"grad_norm": 5.441287517547607,
"learning_rate": 3.0359028511087644e-06,
"loss": 0.3789,
"step": 6700
},
{
"epoch": 3.5055490272881578,
"grad_norm": 6.666896343231201,
"learning_rate": 3.025343189017952e-06,
"loss": 0.4841,
"step": 6710
},
{
"epoch": 3.510771641206424,
"grad_norm": 6.075514316558838,
"learning_rate": 3.0147835269271386e-06,
"loss": 0.4189,
"step": 6720
},
{
"epoch": 3.51599425512469,
"grad_norm": 4.0529303550720215,
"learning_rate": 3.0042238648363253e-06,
"loss": 0.442,
"step": 6730
},
{
"epoch": 3.521216869042956,
"grad_norm": 4.397035598754883,
"learning_rate": 2.9936642027455124e-06,
"loss": 0.3511,
"step": 6740
},
{
"epoch": 3.526439482961222,
"grad_norm": 4.687301158905029,
"learning_rate": 2.9831045406546995e-06,
"loss": 0.4178,
"step": 6750
},
{
"epoch": 3.5316620968794883,
"grad_norm": 7.033337593078613,
"learning_rate": 2.9725448785638862e-06,
"loss": 0.4109,
"step": 6760
},
{
"epoch": 3.536884710797754,
"grad_norm": 5.542331218719482,
"learning_rate": 2.9619852164730733e-06,
"loss": 0.4457,
"step": 6770
},
{
"epoch": 3.5421073247160204,
"grad_norm": 2.987297534942627,
"learning_rate": 2.95142555438226e-06,
"loss": 0.4106,
"step": 6780
},
{
"epoch": 3.5473299386342863,
"grad_norm": 4.037609100341797,
"learning_rate": 2.9408658922914467e-06,
"loss": 0.3515,
"step": 6790
},
{
"epoch": 3.5525525525525525,
"grad_norm": 6.087532997131348,
"learning_rate": 2.9303062302006342e-06,
"loss": 0.4127,
"step": 6800
},
{
"epoch": 3.5525525525525525,
"eval_loss": 0.799282431602478,
"eval_runtime": 46.2878,
"eval_samples_per_second": 36.77,
"eval_steps_per_second": 4.602,
"step": 6800
},
{
"epoch": 3.557775166470819,
"grad_norm": 5.302961826324463,
"learning_rate": 2.919746568109821e-06,
"loss": 0.3994,
"step": 6810
},
{
"epoch": 3.5629977803890847,
"grad_norm": 6.701052188873291,
"learning_rate": 2.9091869060190076e-06,
"loss": 0.3903,
"step": 6820
},
{
"epoch": 3.568220394307351,
"grad_norm": 5.228213310241699,
"learning_rate": 2.8986272439281943e-06,
"loss": 0.4006,
"step": 6830
},
{
"epoch": 3.573443008225617,
"grad_norm": 5.282093524932861,
"learning_rate": 2.8880675818373814e-06,
"loss": 0.4515,
"step": 6840
},
{
"epoch": 3.578665622143883,
"grad_norm": 3.786198616027832,
"learning_rate": 2.8775079197465685e-06,
"loss": 0.3696,
"step": 6850
},
{
"epoch": 3.5838882360621493,
"grad_norm": 5.512637138366699,
"learning_rate": 2.866948257655755e-06,
"loss": 0.4546,
"step": 6860
},
{
"epoch": 3.589110849980415,
"grad_norm": 7.117464542388916,
"learning_rate": 2.8563885955649423e-06,
"loss": 0.4562,
"step": 6870
},
{
"epoch": 3.594333463898681,
"grad_norm": 4.943199634552002,
"learning_rate": 2.845828933474129e-06,
"loss": 0.4894,
"step": 6880
},
{
"epoch": 3.5995560778169473,
"grad_norm": 8.774984359741211,
"learning_rate": 2.8352692713833157e-06,
"loss": 0.3925,
"step": 6890
},
{
"epoch": 3.6047786917352136,
"grad_norm": 7.126657009124756,
"learning_rate": 2.824709609292503e-06,
"loss": 0.4375,
"step": 6900
},
{
"epoch": 3.6100013056534794,
"grad_norm": 5.460080146789551,
"learning_rate": 2.81414994720169e-06,
"loss": 0.3847,
"step": 6910
},
{
"epoch": 3.6152239195717457,
"grad_norm": 5.7454833984375,
"learning_rate": 2.8035902851108766e-06,
"loss": 0.3669,
"step": 6920
},
{
"epoch": 3.6204465334900116,
"grad_norm": 7.132731914520264,
"learning_rate": 2.7930306230200637e-06,
"loss": 0.4013,
"step": 6930
},
{
"epoch": 3.625669147408278,
"grad_norm": 4.874327659606934,
"learning_rate": 2.7824709609292504e-06,
"loss": 0.3866,
"step": 6940
},
{
"epoch": 3.630891761326544,
"grad_norm": 6.133016586303711,
"learning_rate": 2.771911298838437e-06,
"loss": 0.3879,
"step": 6950
},
{
"epoch": 3.63611437524481,
"grad_norm": 7.170290470123291,
"learning_rate": 2.7613516367476246e-06,
"loss": 0.374,
"step": 6960
},
{
"epoch": 3.6413369891630762,
"grad_norm": 4.124912738800049,
"learning_rate": 2.7507919746568113e-06,
"loss": 0.5092,
"step": 6970
},
{
"epoch": 3.646559603081342,
"grad_norm": 6.091069221496582,
"learning_rate": 2.740232312565998e-06,
"loss": 0.4028,
"step": 6980
},
{
"epoch": 3.6517822169996084,
"grad_norm": 3.907172203063965,
"learning_rate": 2.729672650475185e-06,
"loss": 0.4105,
"step": 6990
},
{
"epoch": 3.6570048309178746,
"grad_norm": 4.004384517669678,
"learning_rate": 2.719112988384372e-06,
"loss": 0.3923,
"step": 7000
},
{
"epoch": 3.6570048309178746,
"eval_loss": 0.7925397157669067,
"eval_runtime": 46.2922,
"eval_samples_per_second": 36.766,
"eval_steps_per_second": 4.601,
"step": 7000
},
{
"epoch": 3.6622274448361405,
"grad_norm": 4.95105504989624,
"learning_rate": 2.708553326293559e-06,
"loss": 0.4407,
"step": 7010
},
{
"epoch": 3.6674500587544063,
"grad_norm": 5.319108963012695,
"learning_rate": 2.697993664202746e-06,
"loss": 0.4652,
"step": 7020
},
{
"epoch": 3.6726726726726726,
"grad_norm": 7.128709316253662,
"learning_rate": 2.6874340021119326e-06,
"loss": 0.4353,
"step": 7030
},
{
"epoch": 3.677895286590939,
"grad_norm": 4.808097839355469,
"learning_rate": 2.6768743400211193e-06,
"loss": 0.345,
"step": 7040
},
{
"epoch": 3.6831179005092047,
"grad_norm": 6.002725601196289,
"learning_rate": 2.666314677930306e-06,
"loss": 0.425,
"step": 7050
},
{
"epoch": 3.688340514427471,
"grad_norm": 4.452878952026367,
"learning_rate": 2.6557550158394935e-06,
"loss": 0.4487,
"step": 7060
},
{
"epoch": 3.693563128345737,
"grad_norm": 5.096455097198486,
"learning_rate": 2.6451953537486802e-06,
"loss": 0.4158,
"step": 7070
},
{
"epoch": 3.698785742264003,
"grad_norm": 6.426013946533203,
"learning_rate": 2.634635691657867e-06,
"loss": 0.3701,
"step": 7080
},
{
"epoch": 3.7040083561822694,
"grad_norm": 7.101649284362793,
"learning_rate": 2.624076029567054e-06,
"loss": 0.4757,
"step": 7090
},
{
"epoch": 3.7092309701005353,
"grad_norm": 5.96986198425293,
"learning_rate": 2.613516367476241e-06,
"loss": 0.4475,
"step": 7100
},
{
"epoch": 3.7144535840188015,
"grad_norm": 6.221879482269287,
"learning_rate": 2.602956705385428e-06,
"loss": 0.3911,
"step": 7110
},
{
"epoch": 3.7196761979370674,
"grad_norm": 6.827433109283447,
"learning_rate": 2.592397043294615e-06,
"loss": 0.4751,
"step": 7120
},
{
"epoch": 3.7248988118553337,
"grad_norm": 5.734457015991211,
"learning_rate": 2.5818373812038016e-06,
"loss": 0.4343,
"step": 7130
},
{
"epoch": 3.7301214257736,
"grad_norm": 3.825587034225464,
"learning_rate": 2.5712777191129883e-06,
"loss": 0.3712,
"step": 7140
},
{
"epoch": 3.735344039691866,
"grad_norm": 7.958340644836426,
"learning_rate": 2.560718057022176e-06,
"loss": 0.4076,
"step": 7150
},
{
"epoch": 3.7405666536101316,
"grad_norm": 6.706486701965332,
"learning_rate": 2.5501583949313625e-06,
"loss": 0.4379,
"step": 7160
},
{
"epoch": 3.745789267528398,
"grad_norm": 3.6787171363830566,
"learning_rate": 2.539598732840549e-06,
"loss": 0.3937,
"step": 7170
},
{
"epoch": 3.751011881446664,
"grad_norm": 3.7710745334625244,
"learning_rate": 2.5290390707497363e-06,
"loss": 0.4023,
"step": 7180
},
{
"epoch": 3.75623449536493,
"grad_norm": 9.986141204833984,
"learning_rate": 2.518479408658923e-06,
"loss": 0.4278,
"step": 7190
},
{
"epoch": 3.7614571092831963,
"grad_norm": 5.851425647735596,
"learning_rate": 2.50791974656811e-06,
"loss": 0.3561,
"step": 7200
},
{
"epoch": 3.7614571092831963,
"eval_loss": 0.7960723042488098,
"eval_runtime": 46.2236,
"eval_samples_per_second": 36.821,
"eval_steps_per_second": 4.608,
"step": 7200
},
{
"epoch": 3.766679723201462,
"grad_norm": 5.080770969390869,
"learning_rate": 2.4973600844772968e-06,
"loss": 0.4494,
"step": 7210
},
{
"epoch": 3.7719023371197284,
"grad_norm": 6.7447190284729,
"learning_rate": 2.486800422386484e-06,
"loss": 0.4839,
"step": 7220
},
{
"epoch": 3.7771249510379947,
"grad_norm": 5.529577255249023,
"learning_rate": 2.4762407602956706e-06,
"loss": 0.3799,
"step": 7230
},
{
"epoch": 3.7823475649562606,
"grad_norm": 3.1499006748199463,
"learning_rate": 2.4656810982048577e-06,
"loss": 0.3959,
"step": 7240
},
{
"epoch": 3.787570178874527,
"grad_norm": 7.214032173156738,
"learning_rate": 2.4551214361140448e-06,
"loss": 0.4165,
"step": 7250
},
{
"epoch": 3.7927927927927927,
"grad_norm": 3.6615257263183594,
"learning_rate": 2.4445617740232315e-06,
"loss": 0.3899,
"step": 7260
},
{
"epoch": 3.798015406711059,
"grad_norm": 8.117351531982422,
"learning_rate": 2.4340021119324186e-06,
"loss": 0.3904,
"step": 7270
},
{
"epoch": 3.803238020629325,
"grad_norm": 6.586986064910889,
"learning_rate": 2.4234424498416053e-06,
"loss": 0.4566,
"step": 7280
},
{
"epoch": 3.808460634547591,
"grad_norm": 2.746188163757324,
"learning_rate": 2.412882787750792e-06,
"loss": 0.3717,
"step": 7290
},
{
"epoch": 3.813683248465857,
"grad_norm": 3.323824167251587,
"learning_rate": 2.402323125659979e-06,
"loss": 0.4007,
"step": 7300
},
{
"epoch": 3.818905862384123,
"grad_norm": 4.206129550933838,
"learning_rate": 2.391763463569166e-06,
"loss": 0.3699,
"step": 7310
},
{
"epoch": 3.8241284763023895,
"grad_norm": 4.980790615081787,
"learning_rate": 2.381203801478353e-06,
"loss": 0.4213,
"step": 7320
},
{
"epoch": 3.8293510902206553,
"grad_norm": 4.453920841217041,
"learning_rate": 2.37064413938754e-06,
"loss": 0.3974,
"step": 7330
},
{
"epoch": 3.8345737041389216,
"grad_norm": 4.445418834686279,
"learning_rate": 2.3600844772967266e-06,
"loss": 0.3761,
"step": 7340
},
{
"epoch": 3.8397963180571875,
"grad_norm": 5.032138347625732,
"learning_rate": 2.3495248152059137e-06,
"loss": 0.3759,
"step": 7350
},
{
"epoch": 3.8450189319754537,
"grad_norm": 6.470358371734619,
"learning_rate": 2.3389651531151004e-06,
"loss": 0.4194,
"step": 7360
},
{
"epoch": 3.85024154589372,
"grad_norm": 3.676422119140625,
"learning_rate": 2.328405491024287e-06,
"loss": 0.4073,
"step": 7370
},
{
"epoch": 3.855464159811986,
"grad_norm": 2.7682676315307617,
"learning_rate": 2.3178458289334742e-06,
"loss": 0.4572,
"step": 7380
},
{
"epoch": 3.860686773730252,
"grad_norm": 2.9539079666137695,
"learning_rate": 2.3072861668426613e-06,
"loss": 0.4329,
"step": 7390
},
{
"epoch": 3.865909387648518,
"grad_norm": 3.254023551940918,
"learning_rate": 2.296726504751848e-06,
"loss": 0.3963,
"step": 7400
},
{
"epoch": 3.865909387648518,
"eval_loss": 0.7841590046882629,
"eval_runtime": 46.2618,
"eval_samples_per_second": 36.791,
"eval_steps_per_second": 4.604,
"step": 7400
},
{
"epoch": 3.8711320015667843,
"grad_norm": 3.7601821422576904,
"learning_rate": 2.286166842661035e-06,
"loss": 0.4022,
"step": 7410
},
{
"epoch": 3.87635461548505,
"grad_norm": 6.364165782928467,
"learning_rate": 2.2756071805702222e-06,
"loss": 0.5135,
"step": 7420
},
{
"epoch": 3.8815772294033164,
"grad_norm": 4.010589599609375,
"learning_rate": 2.265047518479409e-06,
"loss": 0.433,
"step": 7430
},
{
"epoch": 3.8867998433215822,
"grad_norm": 9.321678161621094,
"learning_rate": 2.2544878563885956e-06,
"loss": 0.4228,
"step": 7440
},
{
"epoch": 3.8920224572398485,
"grad_norm": 7.4074273109436035,
"learning_rate": 2.2439281942977827e-06,
"loss": 0.4568,
"step": 7450
},
{
"epoch": 3.897245071158115,
"grad_norm": 6.130796432495117,
"learning_rate": 2.2333685322069694e-06,
"loss": 0.3895,
"step": 7460
},
{
"epoch": 3.9024676850763806,
"grad_norm": 6.908585071563721,
"learning_rate": 2.2228088701161565e-06,
"loss": 0.35,
"step": 7470
},
{
"epoch": 3.907690298994647,
"grad_norm": 2.8046581745147705,
"learning_rate": 2.212249208025343e-06,
"loss": 0.4573,
"step": 7480
},
{
"epoch": 3.9129129129129128,
"grad_norm": 4.019318580627441,
"learning_rate": 2.2016895459345303e-06,
"loss": 0.4703,
"step": 7490
},
{
"epoch": 3.918135526831179,
"grad_norm": 5.776391983032227,
"learning_rate": 2.1911298838437174e-06,
"loss": 0.3865,
"step": 7500
},
{
"epoch": 3.9233581407494453,
"grad_norm": 5.176472187042236,
"learning_rate": 2.180570221752904e-06,
"loss": 0.3992,
"step": 7510
},
{
"epoch": 3.928580754667711,
"grad_norm": 5.863769054412842,
"learning_rate": 2.170010559662091e-06,
"loss": 0.4149,
"step": 7520
},
{
"epoch": 3.933803368585977,
"grad_norm": 4.269286632537842,
"learning_rate": 2.159450897571278e-06,
"loss": 0.3281,
"step": 7530
},
{
"epoch": 3.9390259825042433,
"grad_norm": 5.141351699829102,
"learning_rate": 2.1488912354804646e-06,
"loss": 0.3818,
"step": 7540
},
{
"epoch": 3.9442485964225096,
"grad_norm": 7.267117977142334,
"learning_rate": 2.1383315733896517e-06,
"loss": 0.4357,
"step": 7550
},
{
"epoch": 3.9494712103407754,
"grad_norm": 2.134504556655884,
"learning_rate": 2.1277719112988384e-06,
"loss": 0.408,
"step": 7560
},
{
"epoch": 3.9546938242590417,
"grad_norm": 2.806506872177124,
"learning_rate": 2.1172122492080255e-06,
"loss": 0.4074,
"step": 7570
},
{
"epoch": 3.9599164381773075,
"grad_norm": 7.443352699279785,
"learning_rate": 2.1066525871172126e-06,
"loss": 0.4055,
"step": 7580
},
{
"epoch": 3.965139052095574,
"grad_norm": 4.180816650390625,
"learning_rate": 2.0960929250263993e-06,
"loss": 0.4754,
"step": 7590
},
{
"epoch": 3.97036166601384,
"grad_norm": 7.595555305480957,
"learning_rate": 2.0855332629355864e-06,
"loss": 0.4441,
"step": 7600
},
{
"epoch": 3.97036166601384,
"eval_loss": 0.7774137258529663,
"eval_runtime": 46.2875,
"eval_samples_per_second": 36.77,
"eval_steps_per_second": 4.602,
"step": 7600
}
],
"logging_steps": 10,
"max_steps": 9570,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.228569170208358e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}