{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9997595575859581,
"eval_steps": 500,
"global_step": 12476,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016029494269455798,
"grad_norm": 5.699583530426025,
"learning_rate": 1.6025641025641025e-07,
"loss": 0.5515,
"step": 10
},
{
"epoch": 0.0032058988538911596,
"grad_norm": 5.793712615966797,
"learning_rate": 3.205128205128205e-07,
"loss": 0.545,
"step": 20
},
{
"epoch": 0.004808848280836739,
"grad_norm": 4.68794059753418,
"learning_rate": 4.807692307692308e-07,
"loss": 0.525,
"step": 30
},
{
"epoch": 0.006411797707782319,
"grad_norm": 3.254547595977783,
"learning_rate": 6.41025641025641e-07,
"loss": 0.4268,
"step": 40
},
{
"epoch": 0.0080147471347279,
"grad_norm": 2.2941272258758545,
"learning_rate": 8.012820512820515e-07,
"loss": 0.3266,
"step": 50
},
{
"epoch": 0.009617696561673479,
"grad_norm": 1.5437036752700806,
"learning_rate": 9.615384615384617e-07,
"loss": 0.2447,
"step": 60
},
{
"epoch": 0.01122064598861906,
"grad_norm": 0.6866864562034607,
"learning_rate": 1.121794871794872e-06,
"loss": 0.1701,
"step": 70
},
{
"epoch": 0.012823595415564638,
"grad_norm": 0.5046991109848022,
"learning_rate": 1.282051282051282e-06,
"loss": 0.1366,
"step": 80
},
{
"epoch": 0.014426544842510219,
"grad_norm": 0.4317735433578491,
"learning_rate": 1.4423076923076922e-06,
"loss": 0.1178,
"step": 90
},
{
"epoch": 0.0160294942694558,
"grad_norm": 0.39152026176452637,
"learning_rate": 1.602564102564103e-06,
"loss": 0.1008,
"step": 100
},
{
"epoch": 0.01763244369640138,
"grad_norm": 0.3579396605491638,
"learning_rate": 1.7628205128205131e-06,
"loss": 0.0927,
"step": 110
},
{
"epoch": 0.019235393123346958,
"grad_norm": 0.3451220393180847,
"learning_rate": 1.9230769230769234e-06,
"loss": 0.0885,
"step": 120
},
{
"epoch": 0.02083834255029254,
"grad_norm": 0.32997310161590576,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.0787,
"step": 130
},
{
"epoch": 0.02244129197723812,
"grad_norm": 0.31072285771369934,
"learning_rate": 2.243589743589744e-06,
"loss": 0.0736,
"step": 140
},
{
"epoch": 0.024044241404183698,
"grad_norm": 0.3469359874725342,
"learning_rate": 2.403846153846154e-06,
"loss": 0.0731,
"step": 150
},
{
"epoch": 0.025647190831129277,
"grad_norm": 0.29716140031814575,
"learning_rate": 2.564102564102564e-06,
"loss": 0.067,
"step": 160
},
{
"epoch": 0.02725014025807486,
"grad_norm": 0.3381027281284332,
"learning_rate": 2.7243589743589744e-06,
"loss": 0.0628,
"step": 170
},
{
"epoch": 0.028853089685020438,
"grad_norm": 0.31842488050460815,
"learning_rate": 2.8846153846153845e-06,
"loss": 0.0608,
"step": 180
},
{
"epoch": 0.030456039111966017,
"grad_norm": 0.31226950883865356,
"learning_rate": 3.044871794871795e-06,
"loss": 0.0642,
"step": 190
},
{
"epoch": 0.0320589885389116,
"grad_norm": 0.29508596658706665,
"learning_rate": 3.205128205128206e-06,
"loss": 0.0556,
"step": 200
},
{
"epoch": 0.033661937965857175,
"grad_norm": 0.33342409133911133,
"learning_rate": 3.365384615384616e-06,
"loss": 0.0542,
"step": 210
},
{
"epoch": 0.03526488739280276,
"grad_norm": 0.32582512497901917,
"learning_rate": 3.5256410256410263e-06,
"loss": 0.0513,
"step": 220
},
{
"epoch": 0.03686783681974834,
"grad_norm": 0.30706045031547546,
"learning_rate": 3.6858974358974363e-06,
"loss": 0.0533,
"step": 230
},
{
"epoch": 0.038470786246693915,
"grad_norm": 0.3309493064880371,
"learning_rate": 3.846153846153847e-06,
"loss": 0.0494,
"step": 240
},
{
"epoch": 0.0400737356736395,
"grad_norm": 0.2860076129436493,
"learning_rate": 4.006410256410257e-06,
"loss": 0.0489,
"step": 250
},
{
"epoch": 0.04167668510058508,
"grad_norm": 0.26025334000587463,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0456,
"step": 260
},
{
"epoch": 0.043279634527530655,
"grad_norm": 1.0466290712356567,
"learning_rate": 4.326923076923077e-06,
"loss": 0.0475,
"step": 270
},
{
"epoch": 0.04488258395447624,
"grad_norm": 0.31393590569496155,
"learning_rate": 4.487179487179488e-06,
"loss": 0.0484,
"step": 280
},
{
"epoch": 0.04648553338142181,
"grad_norm": 0.2957313060760498,
"learning_rate": 4.647435897435898e-06,
"loss": 0.0441,
"step": 290
},
{
"epoch": 0.048088482808367396,
"grad_norm": 0.2924838066101074,
"learning_rate": 4.807692307692308e-06,
"loss": 0.0433,
"step": 300
},
{
"epoch": 0.04969143223531298,
"grad_norm": 0.2568668723106384,
"learning_rate": 4.967948717948718e-06,
"loss": 0.0457,
"step": 310
},
{
"epoch": 0.051294381662258554,
"grad_norm": 0.3006618320941925,
"learning_rate": 5.128205128205128e-06,
"loss": 0.0436,
"step": 320
},
{
"epoch": 0.052897331089204136,
"grad_norm": 0.23940995335578918,
"learning_rate": 5.288461538461539e-06,
"loss": 0.0408,
"step": 330
},
{
"epoch": 0.05450028051614972,
"grad_norm": 0.30063626170158386,
"learning_rate": 5.448717948717949e-06,
"loss": 0.0409,
"step": 340
},
{
"epoch": 0.056103229943095294,
"grad_norm": 0.2716442048549652,
"learning_rate": 5.608974358974359e-06,
"loss": 0.0392,
"step": 350
},
{
"epoch": 0.057706179370040876,
"grad_norm": 0.27399641275405884,
"learning_rate": 5.769230769230769e-06,
"loss": 0.0375,
"step": 360
},
{
"epoch": 0.05930912879698645,
"grad_norm": 0.252614825963974,
"learning_rate": 5.92948717948718e-06,
"loss": 0.0378,
"step": 370
},
{
"epoch": 0.060912078223932034,
"grad_norm": 0.2915278971195221,
"learning_rate": 6.08974358974359e-06,
"loss": 0.0382,
"step": 380
},
{
"epoch": 0.06251502765087762,
"grad_norm": 0.2674607038497925,
"learning_rate": 6.25e-06,
"loss": 0.0376,
"step": 390
},
{
"epoch": 0.0641179770778232,
"grad_norm": 0.23759065568447113,
"learning_rate": 6.410256410256412e-06,
"loss": 0.0376,
"step": 400
},
{
"epoch": 0.06572092650476878,
"grad_norm": 0.28716912865638733,
"learning_rate": 6.570512820512821e-06,
"loss": 0.0368,
"step": 410
},
{
"epoch": 0.06732387593171435,
"grad_norm": 0.23370076715946198,
"learning_rate": 6.730769230769232e-06,
"loss": 0.0353,
"step": 420
},
{
"epoch": 0.06892682535865993,
"grad_norm": 0.24973611533641815,
"learning_rate": 6.891025641025641e-06,
"loss": 0.0345,
"step": 430
},
{
"epoch": 0.07052977478560551,
"grad_norm": 0.2501871585845947,
"learning_rate": 7.051282051282053e-06,
"loss": 0.0353,
"step": 440
},
{
"epoch": 0.0721327242125511,
"grad_norm": 0.25735384225845337,
"learning_rate": 7.211538461538462e-06,
"loss": 0.0343,
"step": 450
},
{
"epoch": 0.07373567363949668,
"grad_norm": 0.21811848878860474,
"learning_rate": 7.371794871794873e-06,
"loss": 0.0347,
"step": 460
},
{
"epoch": 0.07533862306644225,
"grad_norm": 0.25927454233169556,
"learning_rate": 7.532051282051282e-06,
"loss": 0.0334,
"step": 470
},
{
"epoch": 0.07694157249338783,
"grad_norm": 0.20881901681423187,
"learning_rate": 7.692307692307694e-06,
"loss": 0.0315,
"step": 480
},
{
"epoch": 0.07854452192033341,
"grad_norm": 0.22877703607082367,
"learning_rate": 7.852564102564102e-06,
"loss": 0.0316,
"step": 490
},
{
"epoch": 0.080147471347279,
"grad_norm": 0.23380960524082184,
"learning_rate": 8.012820512820515e-06,
"loss": 0.0319,
"step": 500
},
{
"epoch": 0.08175042077422458,
"grad_norm": 0.2707521915435791,
"learning_rate": 8.173076923076923e-06,
"loss": 0.0329,
"step": 510
},
{
"epoch": 0.08335337020117016,
"grad_norm": 0.19819176197052002,
"learning_rate": 8.333333333333334e-06,
"loss": 0.0306,
"step": 520
},
{
"epoch": 0.08495631962811573,
"grad_norm": 0.2094683200120926,
"learning_rate": 8.493589743589744e-06,
"loss": 0.0314,
"step": 530
},
{
"epoch": 0.08655926905506131,
"grad_norm": 0.21663790941238403,
"learning_rate": 8.653846153846155e-06,
"loss": 0.033,
"step": 540
},
{
"epoch": 0.0881622184820069,
"grad_norm": 0.2188635766506195,
"learning_rate": 8.814102564102565e-06,
"loss": 0.0302,
"step": 550
},
{
"epoch": 0.08976516790895248,
"grad_norm": 0.2224225401878357,
"learning_rate": 8.974358974358976e-06,
"loss": 0.0305,
"step": 560
},
{
"epoch": 0.09136811733589806,
"grad_norm": 0.2301451414823532,
"learning_rate": 9.134615384615384e-06,
"loss": 0.0282,
"step": 570
},
{
"epoch": 0.09297106676284363,
"grad_norm": 0.21547645330429077,
"learning_rate": 9.294871794871796e-06,
"loss": 0.0314,
"step": 580
},
{
"epoch": 0.09457401618978921,
"grad_norm": 0.23693452775478363,
"learning_rate": 9.455128205128205e-06,
"loss": 0.0301,
"step": 590
},
{
"epoch": 0.09617696561673479,
"grad_norm": 0.21754775941371918,
"learning_rate": 9.615384615384616e-06,
"loss": 0.0315,
"step": 600
},
{
"epoch": 0.09777991504368037,
"grad_norm": 0.17971888184547424,
"learning_rate": 9.775641025641026e-06,
"loss": 0.0276,
"step": 610
},
{
"epoch": 0.09938286447062596,
"grad_norm": 0.2265903502702713,
"learning_rate": 9.935897435897437e-06,
"loss": 0.0287,
"step": 620
},
{
"epoch": 0.10098581389757152,
"grad_norm": 0.2187529057264328,
"learning_rate": 1.0096153846153847e-05,
"loss": 0.0271,
"step": 630
},
{
"epoch": 0.10258876332451711,
"grad_norm": 0.20468100905418396,
"learning_rate": 1.0256410256410256e-05,
"loss": 0.0279,
"step": 640
},
{
"epoch": 0.10419171275146269,
"grad_norm": 0.2146177440881729,
"learning_rate": 1.0416666666666668e-05,
"loss": 0.0265,
"step": 650
},
{
"epoch": 0.10579466217840827,
"grad_norm": 0.21035543084144592,
"learning_rate": 1.0576923076923078e-05,
"loss": 0.0284,
"step": 660
},
{
"epoch": 0.10739761160535385,
"grad_norm": 0.20882540941238403,
"learning_rate": 1.0737179487179487e-05,
"loss": 0.0287,
"step": 670
},
{
"epoch": 0.10900056103229944,
"grad_norm": 0.18467697501182556,
"learning_rate": 1.0897435897435898e-05,
"loss": 0.0267,
"step": 680
},
{
"epoch": 0.110603510459245,
"grad_norm": 0.7568922638893127,
"learning_rate": 1.105769230769231e-05,
"loss": 0.0376,
"step": 690
},
{
"epoch": 0.11220645988619059,
"grad_norm": 0.23849329352378845,
"learning_rate": 1.1217948717948719e-05,
"loss": 0.0317,
"step": 700
},
{
"epoch": 0.11380940931313617,
"grad_norm": 0.20443181693553925,
"learning_rate": 1.1378205128205129e-05,
"loss": 0.0307,
"step": 710
},
{
"epoch": 0.11541235874008175,
"grad_norm": 0.18542100489139557,
"learning_rate": 1.1538461538461538e-05,
"loss": 0.026,
"step": 720
},
{
"epoch": 0.11701530816702733,
"grad_norm": 0.241206094622612,
"learning_rate": 1.169871794871795e-05,
"loss": 0.0308,
"step": 730
},
{
"epoch": 0.1186182575939729,
"grad_norm": 0.17955875396728516,
"learning_rate": 1.185897435897436e-05,
"loss": 0.0251,
"step": 740
},
{
"epoch": 0.12022120702091849,
"grad_norm": 0.1853230744600296,
"learning_rate": 1.201923076923077e-05,
"loss": 0.0258,
"step": 750
},
{
"epoch": 0.12182415644786407,
"grad_norm": 0.16425946354866028,
"learning_rate": 1.217948717948718e-05,
"loss": 0.0252,
"step": 760
},
{
"epoch": 0.12342710587480965,
"grad_norm": 0.18707670271396637,
"learning_rate": 1.2339743589743592e-05,
"loss": 0.0245,
"step": 770
},
{
"epoch": 0.12503005530175523,
"grad_norm": 0.2019442468881607,
"learning_rate": 1.25e-05,
"loss": 0.0268,
"step": 780
},
{
"epoch": 0.12663300472870082,
"grad_norm": 0.16195464134216309,
"learning_rate": 1.2660256410256411e-05,
"loss": 0.0254,
"step": 790
},
{
"epoch": 0.1282359541556464,
"grad_norm": 0.16929590702056885,
"learning_rate": 1.2820512820512823e-05,
"loss": 0.025,
"step": 800
},
{
"epoch": 0.12983890358259198,
"grad_norm": 0.1814528852701187,
"learning_rate": 1.2980769230769232e-05,
"loss": 0.0254,
"step": 810
},
{
"epoch": 0.13144185300953756,
"grad_norm": 0.20797356963157654,
"learning_rate": 1.3141025641025642e-05,
"loss": 0.0253,
"step": 820
},
{
"epoch": 0.13304480243648312,
"grad_norm": 0.18418587744235992,
"learning_rate": 1.3301282051282051e-05,
"loss": 0.0267,
"step": 830
},
{
"epoch": 0.1346477518634287,
"grad_norm": 0.16782532632350922,
"learning_rate": 1.3461538461538463e-05,
"loss": 0.0244,
"step": 840
},
{
"epoch": 0.13625070129037428,
"grad_norm": 0.16734585165977478,
"learning_rate": 1.3621794871794874e-05,
"loss": 0.0238,
"step": 850
},
{
"epoch": 0.13785365071731986,
"grad_norm": 0.15977711975574493,
"learning_rate": 1.3782051282051283e-05,
"loss": 0.0241,
"step": 860
},
{
"epoch": 0.13945660014426545,
"grad_norm": 0.20185428857803345,
"learning_rate": 1.3942307692307693e-05,
"loss": 0.0232,
"step": 870
},
{
"epoch": 0.14105954957121103,
"grad_norm": 0.1522301733493805,
"learning_rate": 1.4102564102564105e-05,
"loss": 0.0247,
"step": 880
},
{
"epoch": 0.1426624989981566,
"grad_norm": 0.15999549627304077,
"learning_rate": 1.4262820512820514e-05,
"loss": 0.0253,
"step": 890
},
{
"epoch": 0.1442654484251022,
"grad_norm": 0.16568179428577423,
"learning_rate": 1.4423076923076924e-05,
"loss": 0.025,
"step": 900
},
{
"epoch": 0.14586839785204778,
"grad_norm": 0.18805263936519623,
"learning_rate": 1.4583333333333333e-05,
"loss": 0.0231,
"step": 910
},
{
"epoch": 0.14747134727899336,
"grad_norm": 0.17499662935733795,
"learning_rate": 1.4743589743589745e-05,
"loss": 0.0236,
"step": 920
},
{
"epoch": 0.14907429670593894,
"grad_norm": 0.16887474060058594,
"learning_rate": 1.4903846153846156e-05,
"loss": 0.0239,
"step": 930
},
{
"epoch": 0.1506772461328845,
"grad_norm": 0.1816807985305786,
"learning_rate": 1.5064102564102565e-05,
"loss": 0.0236,
"step": 940
},
{
"epoch": 0.15228019555983008,
"grad_norm": 0.16164837777614594,
"learning_rate": 1.5224358974358975e-05,
"loss": 0.0245,
"step": 950
},
{
"epoch": 0.15388314498677566,
"grad_norm": 0.16409705579280853,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.0228,
"step": 960
},
{
"epoch": 0.15548609441372124,
"grad_norm": 0.18589679896831512,
"learning_rate": 1.5544871794871796e-05,
"loss": 0.025,
"step": 970
},
{
"epoch": 0.15708904384066683,
"grad_norm": 0.14719834923744202,
"learning_rate": 1.5705128205128205e-05,
"loss": 0.0243,
"step": 980
},
{
"epoch": 0.1586919932676124,
"grad_norm": 0.1489681452512741,
"learning_rate": 1.5865384615384617e-05,
"loss": 0.0241,
"step": 990
},
{
"epoch": 0.160294942694558,
"grad_norm": 0.14481011033058167,
"learning_rate": 1.602564102564103e-05,
"loss": 0.0235,
"step": 1000
},
{
"epoch": 0.16189789212150357,
"grad_norm": 0.19330425560474396,
"learning_rate": 1.6185897435897438e-05,
"loss": 0.0243,
"step": 1010
},
{
"epoch": 0.16350084154844916,
"grad_norm": 0.14850366115570068,
"learning_rate": 1.6346153846153847e-05,
"loss": 0.0218,
"step": 1020
},
{
"epoch": 0.16510379097539474,
"grad_norm": 0.2127213478088379,
"learning_rate": 1.6506410256410255e-05,
"loss": 0.0219,
"step": 1030
},
{
"epoch": 0.16670674040234032,
"grad_norm": 0.15298931300640106,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.0206,
"step": 1040
},
{
"epoch": 0.16830968982928587,
"grad_norm": 0.22085171937942505,
"learning_rate": 1.682692307692308e-05,
"loss": 0.0213,
"step": 1050
},
{
"epoch": 0.16991263925623146,
"grad_norm": 0.17538835108280182,
"learning_rate": 1.698717948717949e-05,
"loss": 0.0215,
"step": 1060
},
{
"epoch": 0.17151558868317704,
"grad_norm": 0.15932999551296234,
"learning_rate": 1.7147435897435897e-05,
"loss": 0.021,
"step": 1070
},
{
"epoch": 0.17311853811012262,
"grad_norm": 0.163809135556221,
"learning_rate": 1.730769230769231e-05,
"loss": 0.0208,
"step": 1080
},
{
"epoch": 0.1747214875370682,
"grad_norm": 0.13327869772911072,
"learning_rate": 1.7467948717948718e-05,
"loss": 0.0213,
"step": 1090
},
{
"epoch": 0.1763244369640138,
"grad_norm": 0.15784206986427307,
"learning_rate": 1.762820512820513e-05,
"loss": 0.0224,
"step": 1100
},
{
"epoch": 0.17792738639095937,
"grad_norm": 0.13589483499526978,
"learning_rate": 1.778846153846154e-05,
"loss": 0.021,
"step": 1110
},
{
"epoch": 0.17953033581790495,
"grad_norm": 0.14331687986850739,
"learning_rate": 1.794871794871795e-05,
"loss": 0.0202,
"step": 1120
},
{
"epoch": 0.18113328524485053,
"grad_norm": 0.1520327925682068,
"learning_rate": 1.810897435897436e-05,
"loss": 0.0201,
"step": 1130
},
{
"epoch": 0.18273623467179612,
"grad_norm": 0.154808908700943,
"learning_rate": 1.826923076923077e-05,
"loss": 0.0209,
"step": 1140
},
{
"epoch": 0.1843391840987417,
"grad_norm": 0.13862183690071106,
"learning_rate": 1.842948717948718e-05,
"loss": 0.0213,
"step": 1150
},
{
"epoch": 0.18594213352568725,
"grad_norm": 0.1722225844860077,
"learning_rate": 1.8589743589743593e-05,
"loss": 0.0186,
"step": 1160
},
{
"epoch": 0.18754508295263284,
"grad_norm": 0.1655365228652954,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.0209,
"step": 1170
},
{
"epoch": 0.18914803237957842,
"grad_norm": 0.20939995348453522,
"learning_rate": 1.891025641025641e-05,
"loss": 0.0211,
"step": 1180
},
{
"epoch": 0.190750981806524,
"grad_norm": 0.17024146020412445,
"learning_rate": 1.9070512820512823e-05,
"loss": 0.0213,
"step": 1190
},
{
"epoch": 0.19235393123346958,
"grad_norm": 0.14322948455810547,
"learning_rate": 1.923076923076923e-05,
"loss": 0.0202,
"step": 1200
},
{
"epoch": 0.19395688066041517,
"grad_norm": 0.15687131881713867,
"learning_rate": 1.9391025641025644e-05,
"loss": 0.0191,
"step": 1210
},
{
"epoch": 0.19555983008736075,
"grad_norm": 0.169046089053154,
"learning_rate": 1.9551282051282052e-05,
"loss": 0.0221,
"step": 1220
},
{
"epoch": 0.19716277951430633,
"grad_norm": 0.15385688841342926,
"learning_rate": 1.9711538461538465e-05,
"loss": 0.0201,
"step": 1230
},
{
"epoch": 0.1987657289412519,
"grad_norm": 0.15540941059589386,
"learning_rate": 1.9871794871794873e-05,
"loss": 0.02,
"step": 1240
},
{
"epoch": 0.2003686783681975,
"grad_norm": 0.15357881784439087,
"learning_rate": 1.9999998434240984e-05,
"loss": 0.0201,
"step": 1250
},
{
"epoch": 0.20197162779514305,
"grad_norm": 0.7511164546012878,
"learning_rate": 1.9999943632726828e-05,
"loss": 0.021,
"step": 1260
},
{
"epoch": 0.20357457722208863,
"grad_norm": 0.1908695548772812,
"learning_rate": 1.999981054375207e-05,
"loss": 0.0261,
"step": 1270
},
{
"epoch": 0.20517752664903421,
"grad_norm": 0.17630814015865326,
"learning_rate": 1.999959916835864e-05,
"loss": 0.0211,
"step": 1280
},
{
"epoch": 0.2067804760759798,
"grad_norm": 0.16517992317676544,
"learning_rate": 1.9999309508201362e-05,
"loss": 0.023,
"step": 1290
},
{
"epoch": 0.20838342550292538,
"grad_norm": 0.14057657122612,
"learning_rate": 1.999894156554791e-05,
"loss": 0.0205,
"step": 1300
},
{
"epoch": 0.20998637492987096,
"grad_norm": 0.17232050001621246,
"learning_rate": 1.9998495343278833e-05,
"loss": 0.0201,
"step": 1310
},
{
"epoch": 0.21158932435681654,
"grad_norm": 0.1483970731496811,
"learning_rate": 1.9997970844887513e-05,
"loss": 0.0193,
"step": 1320
},
{
"epoch": 0.21319227378376213,
"grad_norm": 0.11647937446832657,
"learning_rate": 1.9997368074480137e-05,
"loss": 0.0192,
"step": 1330
},
{
"epoch": 0.2147952232107077,
"grad_norm": 0.12476928532123566,
"learning_rate": 1.9996687036775672e-05,
"loss": 0.0199,
"step": 1340
},
{
"epoch": 0.2163981726376533,
"grad_norm": 0.13054953515529633,
"learning_rate": 1.9995927737105818e-05,
"loss": 0.0189,
"step": 1350
},
{
"epoch": 0.21800112206459887,
"grad_norm": 0.11871975660324097,
"learning_rate": 1.9995090181414973e-05,
"loss": 0.0202,
"step": 1360
},
{
"epoch": 0.21960407149154443,
"grad_norm": 0.13187278807163239,
"learning_rate": 1.9994174376260175e-05,
"loss": 0.0184,
"step": 1370
},
{
"epoch": 0.22120702091849,
"grad_norm": 0.12306849658489227,
"learning_rate": 1.9993180328811084e-05,
"loss": 0.019,
"step": 1380
},
{
"epoch": 0.2228099703454356,
"grad_norm": 0.12421499937772751,
"learning_rate": 1.9992108046849883e-05,
"loss": 0.0196,
"step": 1390
},
{
"epoch": 0.22441291977238118,
"grad_norm": 0.11667071282863617,
"learning_rate": 1.9990957538771242e-05,
"loss": 0.0197,
"step": 1400
},
{
"epoch": 0.22601586919932676,
"grad_norm": 0.1250181496143341,
"learning_rate": 1.998972881358225e-05,
"loss": 0.0196,
"step": 1410
},
{
"epoch": 0.22761881862627234,
"grad_norm": 0.11087560653686523,
"learning_rate": 1.9988421880902336e-05,
"loss": 0.0188,
"step": 1420
},
{
"epoch": 0.22922176805321792,
"grad_norm": 0.13298539817333221,
"learning_rate": 1.99870367509632e-05,
"loss": 0.0193,
"step": 1430
},
{
"epoch": 0.2308247174801635,
"grad_norm": 0.15178672969341278,
"learning_rate": 1.998557343460874e-05,
"loss": 0.0195,
"step": 1440
},
{
"epoch": 0.2324276669071091,
"grad_norm": 0.13834117352962494,
"learning_rate": 1.9984031943294947e-05,
"loss": 0.0183,
"step": 1450
},
{
"epoch": 0.23403061633405467,
"grad_norm": 0.13149769604206085,
"learning_rate": 1.9982412289089837e-05,
"loss": 0.0184,
"step": 1460
},
{
"epoch": 0.23563356576100025,
"grad_norm": 0.1371048241853714,
"learning_rate": 1.998071448467334e-05,
"loss": 0.0197,
"step": 1470
},
{
"epoch": 0.2372365151879458,
"grad_norm": 0.10099935531616211,
"learning_rate": 1.9978938543337212e-05,
"loss": 0.0195,
"step": 1480
},
{
"epoch": 0.2388394646148914,
"grad_norm": 0.1115645244717598,
"learning_rate": 1.9977084478984926e-05,
"loss": 0.017,
"step": 1490
},
{
"epoch": 0.24044241404183697,
"grad_norm": 0.147051602602005,
"learning_rate": 1.997515230613156e-05,
"loss": 0.018,
"step": 1500
},
{
"epoch": 0.24204536346878255,
"grad_norm": 0.1126634031534195,
"learning_rate": 1.997314203990369e-05,
"loss": 0.0182,
"step": 1510
},
{
"epoch": 0.24364831289572814,
"grad_norm": 0.11680582165718079,
"learning_rate": 1.9971053696039273e-05,
"loss": 0.0171,
"step": 1520
},
{
"epoch": 0.24525126232267372,
"grad_norm": 0.12330590188503265,
"learning_rate": 1.996888729088751e-05,
"loss": 0.0178,
"step": 1530
},
{
"epoch": 0.2468542117496193,
"grad_norm": 0.11050969362258911,
"learning_rate": 1.9966642841408738e-05,
"loss": 0.0169,
"step": 1540
},
{
"epoch": 0.24845716117656488,
"grad_norm": 0.12690754234790802,
"learning_rate": 1.9964320365174273e-05,
"loss": 0.0178,
"step": 1550
},
{
"epoch": 0.25006011060351047,
"grad_norm": 0.11407110840082169,
"learning_rate": 1.99619198803663e-05,
"loss": 0.0184,
"step": 1560
},
{
"epoch": 0.251663060030456,
"grad_norm": 0.13558299839496613,
"learning_rate": 1.995944140577771e-05,
"loss": 0.0195,
"step": 1570
},
{
"epoch": 0.25326600945740163,
"grad_norm": 0.11716917902231216,
"learning_rate": 1.995688496081196e-05,
"loss": 0.0164,
"step": 1580
},
{
"epoch": 0.2548689588843472,
"grad_norm": 0.112436443567276,
"learning_rate": 1.995425056548292e-05,
"loss": 0.0162,
"step": 1590
},
{
"epoch": 0.2564719083112928,
"grad_norm": 0.10250984877347946,
"learning_rate": 1.9951538240414724e-05,
"loss": 0.0171,
"step": 1600
},
{
"epoch": 0.25807485773823835,
"grad_norm": 0.12778599560260773,
"learning_rate": 1.9948748006841586e-05,
"loss": 0.0172,
"step": 1610
},
{
"epoch": 0.25967780716518396,
"grad_norm": 0.0987766906619072,
"learning_rate": 1.9945879886607666e-05,
"loss": 0.0165,
"step": 1620
},
{
"epoch": 0.2612807565921295,
"grad_norm": 0.14052483439445496,
"learning_rate": 1.9942933902166873e-05,
"loss": 0.0175,
"step": 1630
},
{
"epoch": 0.2628837060190751,
"grad_norm": 0.10828305035829544,
"learning_rate": 1.9939910076582708e-05,
"loss": 0.017,
"step": 1640
},
{
"epoch": 0.2644866554460207,
"grad_norm": 0.12008311599493027,
"learning_rate": 1.9936808433528058e-05,
"loss": 0.0173,
"step": 1650
},
{
"epoch": 0.26608960487296623,
"grad_norm": 0.11578180640935898,
"learning_rate": 1.9933628997285037e-05,
"loss": 0.0175,
"step": 1660
},
{
"epoch": 0.26769255429991184,
"grad_norm": 0.1305118203163147,
"learning_rate": 1.993037179274479e-05,
"loss": 0.0168,
"step": 1670
},
{
"epoch": 0.2692955037268574,
"grad_norm": 0.12475095689296722,
"learning_rate": 1.992703684540728e-05,
"loss": 0.0164,
"step": 1680
},
{
"epoch": 0.270898453153803,
"grad_norm": 0.13382022082805634,
"learning_rate": 1.9923624181381117e-05,
"loss": 0.0172,
"step": 1690
},
{
"epoch": 0.27250140258074856,
"grad_norm": 0.1182899996638298,
"learning_rate": 1.992013382738333e-05,
"loss": 0.0165,
"step": 1700
},
{
"epoch": 0.2741043520076942,
"grad_norm": 0.11200874298810959,
"learning_rate": 1.9916565810739167e-05,
"loss": 0.0178,
"step": 1710
},
{
"epoch": 0.27570730143463973,
"grad_norm": 0.11989603191614151,
"learning_rate": 1.9912920159381882e-05,
"loss": 0.0174,
"step": 1720
},
{
"epoch": 0.27731025086158534,
"grad_norm": 0.11210440844297409,
"learning_rate": 1.990919690185251e-05,
"loss": 0.0175,
"step": 1730
},
{
"epoch": 0.2789132002885309,
"grad_norm": 0.12546950578689575,
"learning_rate": 1.990539606729966e-05,
"loss": 0.0173,
"step": 1740
},
{
"epoch": 0.2805161497154765,
"grad_norm": 0.12153290957212448,
"learning_rate": 1.9901517685479267e-05,
"loss": 0.0177,
"step": 1750
},
{
"epoch": 0.28211909914242206,
"grad_norm": 0.09732785820960999,
"learning_rate": 1.989756178675437e-05,
"loss": 0.0165,
"step": 1760
},
{
"epoch": 0.2837220485693676,
"grad_norm": 0.09644783288240433,
"learning_rate": 1.9893528402094863e-05,
"loss": 0.0163,
"step": 1770
},
{
"epoch": 0.2853249979963132,
"grad_norm": 0.12492503225803375,
"learning_rate": 1.9889417563077274e-05,
"loss": 0.0166,
"step": 1780
},
{
"epoch": 0.2869279474232588,
"grad_norm": 0.1265823394060135,
"learning_rate": 1.9885229301884497e-05,
"loss": 0.0166,
"step": 1790
},
{
"epoch": 0.2885308968502044,
"grad_norm": 0.10268606245517731,
"learning_rate": 1.9880963651305548e-05,
"loss": 0.0178,
"step": 1800
},
{
"epoch": 0.29013384627714994,
"grad_norm": 0.09787417948246002,
"learning_rate": 1.987662064473532e-05,
"loss": 0.017,
"step": 1810
},
{
"epoch": 0.29173679570409555,
"grad_norm": 0.11527131497859955,
"learning_rate": 1.9872200316174285e-05,
"loss": 0.0164,
"step": 1820
},
{
"epoch": 0.2933397451310411,
"grad_norm": 0.1187480166554451,
"learning_rate": 1.9867702700228282e-05,
"loss": 0.0166,
"step": 1830
},
{
"epoch": 0.2949426945579867,
"grad_norm": 0.10902020335197449,
"learning_rate": 1.9863127832108196e-05,
"loss": 0.0157,
"step": 1840
},
{
"epoch": 0.2965456439849323,
"grad_norm": 0.11473922431468964,
"learning_rate": 1.9858475747629712e-05,
"loss": 0.0172,
"step": 1850
},
{
"epoch": 0.2981485934118779,
"grad_norm": 0.13333828747272491,
"learning_rate": 1.985374648321302e-05,
"loss": 0.0169,
"step": 1860
},
{
"epoch": 0.29975154283882344,
"grad_norm": 0.11504275351762772,
"learning_rate": 1.9848940075882543e-05,
"loss": 0.0161,
"step": 1870
},
{
"epoch": 0.301354492265769,
"grad_norm": 0.10406219214200974,
"learning_rate": 1.9844056563266632e-05,
"loss": 0.017,
"step": 1880
},
{
"epoch": 0.3029574416927146,
"grad_norm": 0.12233356386423111,
"learning_rate": 1.9839095983597282e-05,
"loss": 0.0146,
"step": 1890
},
{
"epoch": 0.30456039111966016,
"grad_norm": 0.10993051528930664,
"learning_rate": 1.983405837570983e-05,
"loss": 0.0161,
"step": 1900
},
{
"epoch": 0.30616334054660577,
"grad_norm": 0.1132737472653389,
"learning_rate": 1.9828943779042663e-05,
"loss": 0.0153,
"step": 1910
},
{
"epoch": 0.3077662899735513,
"grad_norm": 0.10923943668603897,
"learning_rate": 1.9823752233636868e-05,
"loss": 0.0158,
"step": 1920
},
{
"epoch": 0.30936923940049693,
"grad_norm": 0.121485136449337,
"learning_rate": 1.9818483780135976e-05,
"loss": 0.0151,
"step": 1930
},
{
"epoch": 0.3109721888274425,
"grad_norm": 0.1390533298254013,
"learning_rate": 1.98131384597856e-05,
"loss": 0.0165,
"step": 1940
},
{
"epoch": 0.3125751382543881,
"grad_norm": 0.11324126273393631,
"learning_rate": 1.9807716314433132e-05,
"loss": 0.0169,
"step": 1950
},
{
"epoch": 0.31417808768133365,
"grad_norm": 0.10308068245649338,
"learning_rate": 1.980221738652741e-05,
"loss": 0.0155,
"step": 1960
},
{
"epoch": 0.31578103710827926,
"grad_norm": 0.08773821592330933,
"learning_rate": 1.9796641719118387e-05,
"loss": 0.014,
"step": 1970
},
{
"epoch": 0.3173839865352248,
"grad_norm": 0.10015236586332321,
"learning_rate": 1.9790989355856794e-05,
"loss": 0.0158,
"step": 1980
},
{
"epoch": 0.31898693596217037,
"grad_norm": 0.10804478079080582,
"learning_rate": 1.9785260340993796e-05,
"loss": 0.0147,
"step": 1990
},
{
"epoch": 0.320589885389116,
"grad_norm": 0.10824614763259888,
"learning_rate": 1.977945471938065e-05,
"loss": 0.0159,
"step": 2000
},
{
"epoch": 0.32219283481606154,
"grad_norm": 0.10421755164861679,
"learning_rate": 1.9773572536468348e-05,
"loss": 0.0141,
"step": 2010
},
{
"epoch": 0.32379578424300715,
"grad_norm": 0.10749202966690063,
"learning_rate": 1.9767613838307267e-05,
"loss": 0.0168,
"step": 2020
},
{
"epoch": 0.3253987336699527,
"grad_norm": 0.11449505388736725,
"learning_rate": 1.9761578671546803e-05,
"loss": 0.0153,
"step": 2030
},
{
"epoch": 0.3270016830968983,
"grad_norm": 0.09397918730974197,
"learning_rate": 1.9755467083435013e-05,
"loss": 0.0147,
"step": 2040
},
{
"epoch": 0.32860463252384386,
"grad_norm": 0.11224810034036636,
"learning_rate": 1.9749279121818235e-05,
"loss": 0.0151,
"step": 2050
},
{
"epoch": 0.3302075819507895,
"grad_norm": 0.11822306364774704,
"learning_rate": 1.9743014835140725e-05,
"loss": 0.0164,
"step": 2060
},
{
"epoch": 0.33181053137773503,
"grad_norm": 0.10186577588319778,
"learning_rate": 1.973667427244427e-05,
"loss": 0.0179,
"step": 2070
},
{
"epoch": 0.33341348080468064,
"grad_norm": 0.1012149527668953,
"learning_rate": 1.97302574833678e-05,
"loss": 0.0155,
"step": 2080
},
{
"epoch": 0.3350164302316262,
"grad_norm": 0.11670338362455368,
"learning_rate": 1.9723764518147012e-05,
"loss": 0.0158,
"step": 2090
},
{
"epoch": 0.33661937965857175,
"grad_norm": 0.1039760485291481,
"learning_rate": 1.971719542761397e-05,
"loss": 0.0152,
"step": 2100
},
{
"epoch": 0.33822232908551736,
"grad_norm": 0.10772741585969925,
"learning_rate": 1.971055026319671e-05,
"loss": 0.0158,
"step": 2110
},
{
"epoch": 0.3398252785124629,
"grad_norm": 0.10215826332569122,
"learning_rate": 1.970382907691882e-05,
"loss": 0.015,
"step": 2120
},
{
"epoch": 0.3414282279394085,
"grad_norm": 0.10524086654186249,
"learning_rate": 1.9697031921399065e-05,
"loss": 0.0146,
"step": 2130
},
{
"epoch": 0.3430311773663541,
"grad_norm": 0.10919707268476486,
"learning_rate": 1.9690158849850943e-05,
"loss": 0.0141,
"step": 2140
},
{
"epoch": 0.3446341267932997,
"grad_norm": 0.10801272839307785,
"learning_rate": 1.9683209916082293e-05,
"loss": 0.0158,
"step": 2150
},
{
"epoch": 0.34623707622024524,
"grad_norm": 0.10727003216743469,
"learning_rate": 1.967618517449486e-05,
"loss": 0.014,
"step": 2160
},
{
"epoch": 0.34784002564719085,
"grad_norm": 0.11416902393102646,
"learning_rate": 1.9669084680083876e-05,
"loss": 0.0134,
"step": 2170
},
{
"epoch": 0.3494429750741364,
"grad_norm": 0.12395931780338287,
"learning_rate": 1.9661908488437613e-05,
"loss": 0.0149,
"step": 2180
},
{
"epoch": 0.351045924501082,
"grad_norm": 0.09420310705900192,
"learning_rate": 1.9654656655736973e-05,
"loss": 0.014,
"step": 2190
},
{
"epoch": 0.3526488739280276,
"grad_norm": 0.10008088499307632,
"learning_rate": 1.9647329238755034e-05,
"loss": 0.0145,
"step": 2200
},
{
"epoch": 0.3542518233549731,
"grad_norm": 0.09090422093868256,
"learning_rate": 1.9639926294856607e-05,
"loss": 0.0158,
"step": 2210
},
{
"epoch": 0.35585477278191874,
"grad_norm": 0.09520357847213745,
"learning_rate": 1.963244788199779e-05,
"loss": 0.0144,
"step": 2220
},
{
"epoch": 0.3574577222088643,
"grad_norm": 0.09746406227350235,
"learning_rate": 1.9624894058725495e-05,
"loss": 0.0156,
"step": 2230
},
{
"epoch": 0.3590606716358099,
"grad_norm": 0.11929647624492645,
"learning_rate": 1.9617264884177037e-05,
"loss": 0.0161,
"step": 2240
},
{
"epoch": 0.36066362106275546,
"grad_norm": 0.127578005194664,
"learning_rate": 1.9609560418079606e-05,
"loss": 0.0145,
"step": 2250
},
{
"epoch": 0.36226657048970107,
"grad_norm": 0.08713559806346893,
"learning_rate": 1.9601780720749867e-05,
"loss": 0.0156,
"step": 2260
},
{
"epoch": 0.3638695199166466,
"grad_norm": 0.1008586436510086,
"learning_rate": 1.9593925853093425e-05,
"loss": 0.0141,
"step": 2270
},
{
"epoch": 0.36547246934359223,
"grad_norm": 0.08782845735549927,
"learning_rate": 1.9585995876604397e-05,
"loss": 0.0146,
"step": 2280
},
{
"epoch": 0.3670754187705378,
"grad_norm": 0.08425669372081757,
"learning_rate": 1.9577990853364902e-05,
"loss": 0.0145,
"step": 2290
},
{
"epoch": 0.3686783681974834,
"grad_norm": 0.09964483976364136,
"learning_rate": 1.9569910846044586e-05,
"loss": 0.0139,
"step": 2300
},
{
"epoch": 0.37028131762442895,
"grad_norm": 0.08887302130460739,
"learning_rate": 1.956175591790014e-05,
"loss": 0.0142,
"step": 2310
},
{
"epoch": 0.3718842670513745,
"grad_norm": 0.12365594506263733,
"learning_rate": 1.955352613277478e-05,
"loss": 0.0145,
"step": 2320
},
{
"epoch": 0.3734872164783201,
"grad_norm": 0.10407640784978867,
"learning_rate": 1.954522155509776e-05,
"loss": 0.0146,
"step": 2330
},
{
"epoch": 0.37509016590526567,
"grad_norm": 0.09630418568849564,
"learning_rate": 1.953684224988389e-05,
"loss": 0.0139,
"step": 2340
},
{
"epoch": 0.3766931153322113,
"grad_norm": 0.11108113825321198,
"learning_rate": 1.952838828273298e-05,
"loss": 0.0147,
"step": 2350
},
{
"epoch": 0.37829606475915684,
"grad_norm": 0.10452122241258621,
"learning_rate": 1.9519859719829375e-05,
"loss": 0.0147,
"step": 2360
},
{
"epoch": 0.37989901418610245,
"grad_norm": 0.08710601180791855,
"learning_rate": 1.9511256627941394e-05,
"loss": 0.0143,
"step": 2370
},
{
"epoch": 0.381501963613048,
"grad_norm": 0.1234845295548439,
"learning_rate": 1.950257907442085e-05,
"loss": 0.0145,
"step": 2380
},
{
"epoch": 0.3831049130399936,
"grad_norm": 0.0989808440208435,
"learning_rate": 1.9493827127202482e-05,
"loss": 0.0143,
"step": 2390
},
{
"epoch": 0.38470786246693917,
"grad_norm": 0.09222126007080078,
"learning_rate": 1.948500085480345e-05,
"loss": 0.0143,
"step": 2400
},
{
"epoch": 0.3863108118938847,
"grad_norm": 0.10298562049865723,
"learning_rate": 1.9476100326322785e-05,
"loss": 0.0139,
"step": 2410
},
{
"epoch": 0.38791376132083033,
"grad_norm": 0.10555426776409149,
"learning_rate": 1.9467125611440864e-05,
"loss": 0.015,
"step": 2420
},
{
"epoch": 0.3895167107477759,
"grad_norm": 0.10084094852209091,
"learning_rate": 1.9458076780418844e-05,
"loss": 0.0145,
"step": 2430
},
{
"epoch": 0.3911196601747215,
"grad_norm": 0.1143961027264595,
"learning_rate": 1.9448953904098124e-05,
"loss": 0.0143,
"step": 2440
},
{
"epoch": 0.39272260960166705,
"grad_norm": 0.11982633918523788,
"learning_rate": 1.9439757053899785e-05,
"loss": 0.0155,
"step": 2450
},
{
"epoch": 0.39432555902861266,
"grad_norm": 0.09620746970176697,
"learning_rate": 1.9430486301824044e-05,
"loss": 0.0139,
"step": 2460
},
{
"epoch": 0.3959285084555582,
"grad_norm": 0.09390459209680557,
"learning_rate": 1.942114172044967e-05,
"loss": 0.0144,
"step": 2470
},
{
"epoch": 0.3975314578825038,
"grad_norm": 0.08205546438694,
"learning_rate": 1.9411723382933433e-05,
"loss": 0.0143,
"step": 2480
},
{
"epoch": 0.3991344073094494,
"grad_norm": 0.10015802830457687,
"learning_rate": 1.9402231363009515e-05,
"loss": 0.014,
"step": 2490
},
{
"epoch": 0.400737356736395,
"grad_norm": 0.10022858530282974,
"learning_rate": 1.9392665734988956e-05,
"loss": 0.0146,
"step": 2500
},
{
"epoch": 0.40234030616334054,
"grad_norm": 0.09830185770988464,
"learning_rate": 1.9383026573759046e-05,
"loss": 0.0141,
"step": 2510
},
{
"epoch": 0.4039432555902861,
"grad_norm": 0.10277236253023148,
"learning_rate": 1.9373313954782757e-05,
"loss": 0.0147,
"step": 2520
},
{
"epoch": 0.4055462050172317,
"grad_norm": 0.09011970460414886,
"learning_rate": 1.9363527954098148e-05,
"loss": 0.0138,
"step": 2530
},
{
"epoch": 0.40714915444417726,
"grad_norm": 0.08783067762851715,
"learning_rate": 1.935366864831776e-05,
"loss": 0.0153,
"step": 2540
},
{
"epoch": 0.4087521038711229,
"grad_norm": 0.11457304656505585,
"learning_rate": 1.9343736114628035e-05,
"loss": 0.0145,
"step": 2550
},
{
"epoch": 0.41035505329806843,
"grad_norm": 0.09166496247053146,
"learning_rate": 1.933373043078869e-05,
"loss": 0.015,
"step": 2560
},
{
"epoch": 0.41195800272501404,
"grad_norm": 0.10220997035503387,
"learning_rate": 1.9323651675132126e-05,
"loss": 0.0136,
"step": 2570
},
{
"epoch": 0.4135609521519596,
"grad_norm": 0.08962884545326233,
"learning_rate": 1.931349992656281e-05,
"loss": 0.0146,
"step": 2580
},
{
"epoch": 0.4151639015789052,
"grad_norm": 0.09034290909767151,
"learning_rate": 1.930327526455665e-05,
"loss": 0.0136,
"step": 2590
},
{
"epoch": 0.41676685100585076,
"grad_norm": 0.07524847984313965,
"learning_rate": 1.9292977769160374e-05,
"loss": 0.0137,
"step": 2600
},
{
"epoch": 0.41836980043279637,
"grad_norm": 0.09737731516361237,
"learning_rate": 1.9282607520990918e-05,
"loss": 0.0138,
"step": 2610
},
{
"epoch": 0.4199727498597419,
"grad_norm": 0.10110239684581757,
"learning_rate": 1.927216460123478e-05,
"loss": 0.0141,
"step": 2620
},
{
"epoch": 0.4215756992866875,
"grad_norm": 0.12663525342941284,
"learning_rate": 1.926164909164739e-05,
"loss": 0.0132,
"step": 2630
},
{
"epoch": 0.4231786487136331,
"grad_norm": 0.10225925594568253,
"learning_rate": 1.9251061074552458e-05,
"loss": 0.0142,
"step": 2640
},
{
"epoch": 0.42478159814057864,
"grad_norm": 0.11328284442424774,
"learning_rate": 1.924040063284135e-05,
"loss": 0.0147,
"step": 2650
},
{
"epoch": 0.42638454756752425,
"grad_norm": 0.10168527811765671,
"learning_rate": 1.9229667849972436e-05,
"loss": 0.0135,
"step": 2660
},
{
"epoch": 0.4279874969944698,
"grad_norm": 0.10044345259666443,
"learning_rate": 1.9218862809970413e-05,
"loss": 0.0152,
"step": 2670
},
{
"epoch": 0.4295904464214154,
"grad_norm": 0.09153233468532562,
"learning_rate": 1.9207985597425675e-05,
"loss": 0.0131,
"step": 2680
},
{
"epoch": 0.43119339584836097,
"grad_norm": 0.09851006418466568,
"learning_rate": 1.9197036297493636e-05,
"loss": 0.0139,
"step": 2690
},
{
"epoch": 0.4327963452753066,
"grad_norm": 0.10922195017337799,
"learning_rate": 1.918601499589407e-05,
"loss": 0.0133,
"step": 2700
},
{
"epoch": 0.43439929470225214,
"grad_norm": 0.0905163437128067,
"learning_rate": 1.917492177891043e-05,
"loss": 0.0132,
"step": 2710
},
{
"epoch": 0.43600224412919775,
"grad_norm": 0.09973873198032379,
"learning_rate": 1.916375673338919e-05,
"loss": 0.014,
"step": 2720
},
{
"epoch": 0.4376051935561433,
"grad_norm": 0.09208554029464722,
"learning_rate": 1.9152519946739146e-05,
"loss": 0.0132,
"step": 2730
},
{
"epoch": 0.43920814298308886,
"grad_norm": 0.09508796036243439,
"learning_rate": 1.9141211506930742e-05,
"loss": 0.0146,
"step": 2740
},
{
"epoch": 0.44081109241003447,
"grad_norm": 0.07578465342521667,
"learning_rate": 1.9129831502495383e-05,
"loss": 0.0131,
"step": 2750
},
{
"epoch": 0.44241404183698,
"grad_norm": 0.09476418793201447,
"learning_rate": 1.911838002252474e-05,
"loss": 0.0115,
"step": 2760
},
{
"epoch": 0.44401699126392563,
"grad_norm": 0.08848860114812851,
"learning_rate": 1.9106857156670037e-05,
"loss": 0.0138,
"step": 2770
},
{
"epoch": 0.4456199406908712,
"grad_norm": 0.09684263169765472,
"learning_rate": 1.9095262995141377e-05,
"loss": 0.0133,
"step": 2780
},
{
"epoch": 0.4472228901178168,
"grad_norm": 0.08769190311431885,
"learning_rate": 1.908359762870702e-05,
"loss": 0.0129,
"step": 2790
},
{
"epoch": 0.44882583954476235,
"grad_norm": 0.07661417126655579,
"learning_rate": 1.9071861148692673e-05,
"loss": 0.0149,
"step": 2800
},
{
"epoch": 0.45042878897170796,
"grad_norm": 0.1025426983833313,
"learning_rate": 1.9060053646980772e-05,
"loss": 0.0147,
"step": 2810
},
{
"epoch": 0.4520317383986535,
"grad_norm": 0.09976097196340561,
"learning_rate": 1.9048175216009776e-05,
"loss": 0.0126,
"step": 2820
},
{
"epoch": 0.4536346878255991,
"grad_norm": 0.09328175336122513,
"learning_rate": 1.9036225948773423e-05,
"loss": 0.0134,
"step": 2830
},
{
"epoch": 0.4552376372525447,
"grad_norm": 0.0779719203710556,
"learning_rate": 1.9024205938820023e-05,
"loss": 0.0137,
"step": 2840
},
{
"epoch": 0.45684058667949023,
"grad_norm": 0.08576709777116776,
"learning_rate": 1.901211528025171e-05,
"loss": 0.0138,
"step": 2850
},
{
"epoch": 0.45844353610643584,
"grad_norm": 0.08608071506023407,
"learning_rate": 1.8999954067723715e-05,
"loss": 0.0129,
"step": 2860
},
{
"epoch": 0.4600464855333814,
"grad_norm": 0.09790827333927155,
"learning_rate": 1.8987722396443618e-05,
"loss": 0.0123,
"step": 2870
},
{
"epoch": 0.461649434960327,
"grad_norm": 0.08433697372674942,
"learning_rate": 1.8975420362170606e-05,
"loss": 0.0124,
"step": 2880
},
{
"epoch": 0.46325238438727256,
"grad_norm": 0.10495218634605408,
"learning_rate": 1.8963048061214725e-05,
"loss": 0.0138,
"step": 2890
},
{
"epoch": 0.4648553338142182,
"grad_norm": 0.09212182462215424,
"learning_rate": 1.8950605590436125e-05,
"loss": 0.0119,
"step": 2900
},
{
"epoch": 0.46645828324116373,
"grad_norm": 0.10127340257167816,
"learning_rate": 1.8938093047244298e-05,
"loss": 0.0147,
"step": 2910
},
{
"epoch": 0.46806123266810934,
"grad_norm": 0.09314699470996857,
"learning_rate": 1.892551052959732e-05,
"loss": 0.0121,
"step": 2920
},
{
"epoch": 0.4696641820950549,
"grad_norm": 0.09155376255512238,
"learning_rate": 1.891285813600108e-05,
"loss": 0.0132,
"step": 2930
},
{
"epoch": 0.4712671315220005,
"grad_norm": 0.09479997307062149,
"learning_rate": 1.8900135965508514e-05,
"loss": 0.0127,
"step": 2940
},
{
"epoch": 0.47287008094894606,
"grad_norm": 0.0826965719461441,
"learning_rate": 1.8887344117718825e-05,
"loss": 0.0125,
"step": 2950
},
{
"epoch": 0.4744730303758916,
"grad_norm": 0.0781230479478836,
"learning_rate": 1.8874482692776705e-05,
"loss": 0.0131,
"step": 2960
},
{
"epoch": 0.4760759798028372,
"grad_norm": 0.08591257035732269,
"learning_rate": 1.8861551791371554e-05,
"loss": 0.0122,
"step": 2970
},
{
"epoch": 0.4776789292297828,
"grad_norm": 0.09087405353784561,
"learning_rate": 1.8848551514736684e-05,
"loss": 0.0125,
"step": 2980
},
{
"epoch": 0.4792818786567284,
"grad_norm": 0.11277468502521515,
"learning_rate": 1.883548196464853e-05,
"loss": 0.0134,
"step": 2990
},
{
"epoch": 0.48088482808367394,
"grad_norm": 0.07540776580572128,
"learning_rate": 1.8822343243425867e-05,
"loss": 0.0122,
"step": 3000
},
{
"epoch": 0.48248777751061955,
"grad_norm": 0.08707955479621887,
"learning_rate": 1.8809135453928976e-05,
"loss": 0.0132,
"step": 3010
},
{
"epoch": 0.4840907269375651,
"grad_norm": 0.08592630922794342,
"learning_rate": 1.8795858699558876e-05,
"loss": 0.0129,
"step": 3020
},
{
"epoch": 0.4856936763645107,
"grad_norm": 0.09010718762874603,
"learning_rate": 1.8782513084256492e-05,
"loss": 0.014,
"step": 3030
},
{
"epoch": 0.4872966257914563,
"grad_norm": 0.08629398792982101,
"learning_rate": 1.8769098712501842e-05,
"loss": 0.0131,
"step": 3040
},
{
"epoch": 0.4888995752184019,
"grad_norm": 0.08849960565567017,
"learning_rate": 1.875561568931323e-05,
"loss": 0.0125,
"step": 3050
},
{
"epoch": 0.49050252464534744,
"grad_norm": 0.11940028518438339,
"learning_rate": 1.8742064120246416e-05,
"loss": 0.0143,
"step": 3060
},
{
"epoch": 0.492105474072293,
"grad_norm": 0.0845445841550827,
"learning_rate": 1.872844411139379e-05,
"loss": 0.013,
"step": 3070
},
{
"epoch": 0.4937084234992386,
"grad_norm": 0.0929800420999527,
"learning_rate": 1.8714755769383546e-05,
"loss": 0.013,
"step": 3080
},
{
"epoch": 0.49531137292618416,
"grad_norm": 0.1016814187169075,
"learning_rate": 1.870099920137884e-05,
"loss": 0.0139,
"step": 3090
},
{
"epoch": 0.49691432235312977,
"grad_norm": 0.09055141359567642,
"learning_rate": 1.8687174515076956e-05,
"loss": 0.0121,
"step": 3100
},
{
"epoch": 0.4985172717800753,
"grad_norm": 0.09261985868215561,
"learning_rate": 1.867328181870846e-05,
"loss": 0.0133,
"step": 3110
},
{
"epoch": 0.5001202212070209,
"grad_norm": 0.08901810646057129,
"learning_rate": 1.8659321221036365e-05,
"loss": 0.013,
"step": 3120
},
{
"epoch": 0.5017231706339665,
"grad_norm": 0.08935806900262833,
"learning_rate": 1.8645292831355252e-05,
"loss": 0.0147,
"step": 3130
},
{
"epoch": 0.503326120060912,
"grad_norm": 0.1315586119890213,
"learning_rate": 1.8631196759490447e-05,
"loss": 0.0125,
"step": 3140
},
{
"epoch": 0.5049290694878577,
"grad_norm": 0.08496637642383575,
"learning_rate": 1.8617033115797137e-05,
"loss": 0.0112,
"step": 3150
},
{
"epoch": 0.5065320189148033,
"grad_norm": 0.0730431079864502,
"learning_rate": 1.8602802011159516e-05,
"loss": 0.0116,
"step": 3160
},
{
"epoch": 0.5081349683417489,
"grad_norm": 0.10121534019708633,
"learning_rate": 1.8588503556989918e-05,
"loss": 0.0132,
"step": 3170
},
{
"epoch": 0.5097379177686944,
"grad_norm": 0.10644973069429398,
"learning_rate": 1.8574137865227933e-05,
"loss": 0.0119,
"step": 3180
},
{
"epoch": 0.51134086719564,
"grad_norm": 0.10578668117523193,
"learning_rate": 1.8559705048339562e-05,
"loss": 0.0126,
"step": 3190
},
{
"epoch": 0.5129438166225856,
"grad_norm": 0.08165573328733444,
"learning_rate": 1.8545205219316292e-05,
"loss": 0.0131,
"step": 3200
},
{
"epoch": 0.5145467660495311,
"grad_norm": 0.08247827738523483,
"learning_rate": 1.853063849167424e-05,
"loss": 0.0133,
"step": 3210
},
{
"epoch": 0.5161497154764767,
"grad_norm": 0.07236569374799728,
"learning_rate": 1.8516004979453265e-05,
"loss": 0.0116,
"step": 3220
},
{
"epoch": 0.5177526649034223,
"grad_norm": 0.08705353736877441,
"learning_rate": 1.850130479721606e-05,
"loss": 0.0126,
"step": 3230
},
{
"epoch": 0.5193556143303679,
"grad_norm": 0.07517849653959274,
"learning_rate": 1.8486538060047267e-05,
"loss": 0.0109,
"step": 3240
},
{
"epoch": 0.5209585637573134,
"grad_norm": 0.09321631491184235,
"learning_rate": 1.8471704883552582e-05,
"loss": 0.0118,
"step": 3250
},
{
"epoch": 0.522561513184259,
"grad_norm": 0.09053128957748413,
"learning_rate": 1.845680538385782e-05,
"loss": 0.0134,
"step": 3260
},
{
"epoch": 0.5241644626112046,
"grad_norm": 0.08485197275876999,
"learning_rate": 1.8441839677608045e-05,
"loss": 0.0129,
"step": 3270
},
{
"epoch": 0.5257674120381503,
"grad_norm": 0.0714045837521553,
"learning_rate": 1.8426807881966633e-05,
"loss": 0.0111,
"step": 3280
},
{
"epoch": 0.5273703614650957,
"grad_norm": 0.0777626484632492,
"learning_rate": 1.841171011461435e-05,
"loss": 0.0122,
"step": 3290
},
{
"epoch": 0.5289733108920414,
"grad_norm": 0.07993612438440323,
"learning_rate": 1.8396546493748456e-05,
"loss": 0.0123,
"step": 3300
},
{
"epoch": 0.530576260318987,
"grad_norm": 0.09510450810194016,
"learning_rate": 1.8381317138081755e-05,
"loss": 0.0126,
"step": 3310
},
{
"epoch": 0.5321792097459325,
"grad_norm": 0.07836094498634338,
"learning_rate": 1.8366022166841676e-05,
"loss": 0.0111,
"step": 3320
},
{
"epoch": 0.5337821591728781,
"grad_norm": 0.09115055948495865,
"learning_rate": 1.8350661699769344e-05,
"loss": 0.0127,
"step": 3330
},
{
"epoch": 0.5353851085998237,
"grad_norm": 0.08571304380893707,
"learning_rate": 1.833523585711863e-05,
"loss": 0.0126,
"step": 3340
},
{
"epoch": 0.5369880580267693,
"grad_norm": 0.06526945531368256,
"learning_rate": 1.831974475965521e-05,
"loss": 0.0119,
"step": 3350
},
{
"epoch": 0.5385910074537148,
"grad_norm": 0.08085188269615173,
"learning_rate": 1.830418852865565e-05,
"loss": 0.0117,
"step": 3360
},
{
"epoch": 0.5401939568806604,
"grad_norm": 0.0840122401714325,
"learning_rate": 1.828856728590642e-05,
"loss": 0.0138,
"step": 3370
},
{
"epoch": 0.541796906307606,
"grad_norm": 0.09077087044715881,
"learning_rate": 1.827288115370294e-05,
"loss": 0.0157,
"step": 3380
},
{
"epoch": 0.5433998557345516,
"grad_norm": 0.14829397201538086,
"learning_rate": 1.825713025484866e-05,
"loss": 0.013,
"step": 3390
},
{
"epoch": 0.5450028051614971,
"grad_norm": 0.08603893965482712,
"learning_rate": 1.824131471265405e-05,
"loss": 0.014,
"step": 3400
},
{
"epoch": 0.5466057545884427,
"grad_norm": 0.0720130130648613,
"learning_rate": 1.822543465093568e-05,
"loss": 0.012,
"step": 3410
},
{
"epoch": 0.5482087040153883,
"grad_norm": 0.08433578908443451,
"learning_rate": 1.8209490194015216e-05,
"loss": 0.0128,
"step": 3420
},
{
"epoch": 0.5498116534423338,
"grad_norm": 0.36162707209587097,
"learning_rate": 1.819348146671847e-05,
"loss": 0.0138,
"step": 3430
},
{
"epoch": 0.5514146028692795,
"grad_norm": 0.08648835122585297,
"learning_rate": 1.8177408594374412e-05,
"loss": 0.0126,
"step": 3440
},
{
"epoch": 0.5530175522962251,
"grad_norm": 0.0839293822646141,
"learning_rate": 1.816127170281418e-05,
"loss": 0.012,
"step": 3450
},
{
"epoch": 0.5546205017231707,
"grad_norm": 0.10502775758504868,
"learning_rate": 1.8145070918370114e-05,
"loss": 0.0133,
"step": 3460
},
{
"epoch": 0.5562234511501162,
"grad_norm": 0.08115836977958679,
"learning_rate": 1.8128806367874762e-05,
"loss": 0.0125,
"step": 3470
},
{
"epoch": 0.5578264005770618,
"grad_norm": 0.07933896780014038,
"learning_rate": 1.8112478178659872e-05,
"loss": 0.0124,
"step": 3480
},
{
"epoch": 0.5594293500040074,
"grad_norm": 0.07423722743988037,
"learning_rate": 1.8096086478555414e-05,
"loss": 0.0114,
"step": 3490
},
{
"epoch": 0.561032299430953,
"grad_norm": 0.08609019964933395,
"learning_rate": 1.8079631395888567e-05,
"loss": 0.0121,
"step": 3500
},
{
"epoch": 0.5626352488578985,
"grad_norm": 0.09131062030792236,
"learning_rate": 1.8063113059482718e-05,
"loss": 0.0122,
"step": 3510
},
{
"epoch": 0.5642381982848441,
"grad_norm": 0.07950209826231003,
"learning_rate": 1.8046531598656465e-05,
"loss": 0.0119,
"step": 3520
},
{
"epoch": 0.5658411477117897,
"grad_norm": 0.10808353126049042,
"learning_rate": 1.802988714322258e-05,
"loss": 0.0127,
"step": 3530
},
{
"epoch": 0.5674440971387352,
"grad_norm": 0.08181063830852509,
"learning_rate": 1.801317982348701e-05,
"loss": 0.0119,
"step": 3540
},
{
"epoch": 0.5690470465656808,
"grad_norm": 0.09325892478227615,
"learning_rate": 1.7996409770247866e-05,
"loss": 0.0113,
"step": 3550
},
{
"epoch": 0.5706499959926264,
"grad_norm": 0.09701074659824371,
"learning_rate": 1.7979577114794367e-05,
"loss": 0.0116,
"step": 3560
},
{
"epoch": 0.5722529454195721,
"grad_norm": 0.08736202120780945,
"learning_rate": 1.7962681988905844e-05,
"loss": 0.0118,
"step": 3570
},
{
"epoch": 0.5738558948465176,
"grad_norm": 0.09056610614061356,
"learning_rate": 1.7945724524850697e-05,
"loss": 0.0118,
"step": 3580
},
{
"epoch": 0.5754588442734632,
"grad_norm": 0.06541703641414642,
"learning_rate": 1.7928704855385344e-05,
"loss": 0.0118,
"step": 3590
},
{
"epoch": 0.5770617937004088,
"grad_norm": 0.07547177374362946,
"learning_rate": 1.791162311375321e-05,
"loss": 0.0126,
"step": 3600
},
{
"epoch": 0.5786647431273544,
"grad_norm": 0.07265637814998627,
"learning_rate": 1.7894479433683676e-05,
"loss": 0.0122,
"step": 3610
},
{
"epoch": 0.5802676925542999,
"grad_norm": 0.07592958211898804,
"learning_rate": 1.7877273949391006e-05,
"loss": 0.0135,
"step": 3620
},
{
"epoch": 0.5818706419812455,
"grad_norm": 0.08901036530733109,
"learning_rate": 1.7860006795573326e-05,
"loss": 0.012,
"step": 3630
},
{
"epoch": 0.5834735914081911,
"grad_norm": 0.07132358849048615,
"learning_rate": 1.7842678107411565e-05,
"loss": 0.0114,
"step": 3640
},
{
"epoch": 0.5850765408351366,
"grad_norm": 0.07480525970458984,
"learning_rate": 1.7825288020568387e-05,
"loss": 0.0118,
"step": 3650
},
{
"epoch": 0.5866794902620822,
"grad_norm": 0.07714416086673737,
"learning_rate": 1.780783667118713e-05,
"loss": 0.0117,
"step": 3660
},
{
"epoch": 0.5882824396890278,
"grad_norm": 0.08674295246601105,
"learning_rate": 1.7790324195890752e-05,
"loss": 0.0108,
"step": 3670
},
{
"epoch": 0.5898853891159734,
"grad_norm": 0.08064709603786469,
"learning_rate": 1.777275073178074e-05,
"loss": 0.0112,
"step": 3680
},
{
"epoch": 0.5914883385429189,
"grad_norm": 0.07559552043676376,
"learning_rate": 1.7755116416436063e-05,
"loss": 0.0111,
"step": 3690
},
{
"epoch": 0.5930912879698645,
"grad_norm": 0.08388907462358475,
"learning_rate": 1.7737421387912075e-05,
"loss": 0.0121,
"step": 3700
},
{
"epoch": 0.5946942373968102,
"grad_norm": 0.08252954483032227,
"learning_rate": 1.7719665784739444e-05,
"loss": 0.0127,
"step": 3710
},
{
"epoch": 0.5962971868237558,
"grad_norm": 0.0813174620270729,
"learning_rate": 1.7701849745923056e-05,
"loss": 0.012,
"step": 3720
},
{
"epoch": 0.5979001362507013,
"grad_norm": 0.07678083330392838,
"learning_rate": 1.7683973410940946e-05,
"loss": 0.0123,
"step": 3730
},
{
"epoch": 0.5995030856776469,
"grad_norm": 0.09870904684066772,
"learning_rate": 1.766603691974319e-05,
"loss": 0.0117,
"step": 3740
},
{
"epoch": 0.6011060351045925,
"grad_norm": 0.07422671467065811,
"learning_rate": 1.7648040412750807e-05,
"loss": 0.012,
"step": 3750
},
{
"epoch": 0.602708984531538,
"grad_norm": 0.09045329689979553,
"learning_rate": 1.7629984030854685e-05,
"loss": 0.0118,
"step": 3760
},
{
"epoch": 0.6043119339584836,
"grad_norm": 0.10855185240507126,
"learning_rate": 1.761186791541444e-05,
"loss": 0.018,
"step": 3770
},
{
"epoch": 0.6059148833854292,
"grad_norm": 0.08267343789339066,
"learning_rate": 1.7593692208257347e-05,
"loss": 0.014,
"step": 3780
},
{
"epoch": 0.6075178328123748,
"grad_norm": 0.09279263019561768,
"learning_rate": 1.75754570516772e-05,
"loss": 0.0136,
"step": 3790
},
{
"epoch": 0.6091207822393203,
"grad_norm": 0.09911685436964035,
"learning_rate": 1.7557162588433207e-05,
"loss": 0.012,
"step": 3800
},
{
"epoch": 0.6107237316662659,
"grad_norm": 0.07959414273500443,
"learning_rate": 1.7538808961748897e-05,
"loss": 0.012,
"step": 3810
},
{
"epoch": 0.6123266810932115,
"grad_norm": 0.09886189550161362,
"learning_rate": 1.752039631531095e-05,
"loss": 0.0118,
"step": 3820
},
{
"epoch": 0.6139296305201571,
"grad_norm": 0.08571209758520126,
"learning_rate": 1.750192479326812e-05,
"loss": 0.0125,
"step": 3830
},
{
"epoch": 0.6155325799471026,
"grad_norm": 0.07818809151649475,
"learning_rate": 1.748339454023007e-05,
"loss": 0.0121,
"step": 3840
},
{
"epoch": 0.6171355293740483,
"grad_norm": 0.09316529333591461,
"learning_rate": 1.746480570126627e-05,
"loss": 0.0123,
"step": 3850
},
{
"epoch": 0.6187384788009939,
"grad_norm": 0.07923634350299835,
"learning_rate": 1.744615842190484e-05,
"loss": 0.0118,
"step": 3860
},
{
"epoch": 0.6203414282279394,
"grad_norm": 0.06345493346452713,
"learning_rate": 1.742745284813141e-05,
"loss": 0.011,
"step": 3870
},
{
"epoch": 0.621944377654885,
"grad_norm": 0.092954121530056,
"learning_rate": 1.7408689126387997e-05,
"loss": 0.0111,
"step": 3880
},
{
"epoch": 0.6235473270818306,
"grad_norm": 0.0818738043308258,
"learning_rate": 1.7389867403571844e-05,
"loss": 0.0107,
"step": 3890
},
{
"epoch": 0.6251502765087762,
"grad_norm": 0.07621193677186966,
"learning_rate": 1.737098782703427e-05,
"loss": 0.011,
"step": 3900
},
{
"epoch": 0.6267532259357217,
"grad_norm": 0.08616837859153748,
"learning_rate": 1.7352050544579514e-05,
"loss": 0.0116,
"step": 3910
},
{
"epoch": 0.6283561753626673,
"grad_norm": 0.08493662625551224,
"learning_rate": 1.733305570446359e-05,
"loss": 0.012,
"step": 3920
},
{
"epoch": 0.6299591247896129,
"grad_norm": 0.07540535926818848,
"learning_rate": 1.7314003455393117e-05,
"loss": 0.0109,
"step": 3930
},
{
"epoch": 0.6315620742165585,
"grad_norm": 0.0753653421998024,
"learning_rate": 1.729489394652415e-05,
"loss": 0.0107,
"step": 3940
},
{
"epoch": 0.633165023643504,
"grad_norm": 0.11599840223789215,
"learning_rate": 1.7275727327461035e-05,
"loss": 0.0119,
"step": 3950
},
{
"epoch": 0.6347679730704496,
"grad_norm": 0.08844250440597534,
"learning_rate": 1.72565037482552e-05,
"loss": 0.0114,
"step": 3960
},
{
"epoch": 0.6363709224973952,
"grad_norm": 0.13949395716190338,
"learning_rate": 1.723722335940402e-05,
"loss": 0.0125,
"step": 3970
},
{
"epoch": 0.6379738719243407,
"grad_norm": 0.07286416739225388,
"learning_rate": 1.721788631184961e-05,
"loss": 0.0118,
"step": 3980
},
{
"epoch": 0.6395768213512864,
"grad_norm": 0.07762700319290161,
"learning_rate": 1.7198492756977664e-05,
"loss": 0.012,
"step": 3990
},
{
"epoch": 0.641179770778232,
"grad_norm": 0.08513263612985611,
"learning_rate": 1.717904284661625e-05,
"loss": 0.0111,
"step": 4000
},
{
"epoch": 0.6427827202051776,
"grad_norm": 0.07539255172014236,
"learning_rate": 1.7159536733034638e-05,
"loss": 0.0122,
"step": 4010
},
{
"epoch": 0.6443856696321231,
"grad_norm": 0.08053237944841385,
"learning_rate": 1.7139974568942094e-05,
"loss": 0.0102,
"step": 4020
},
{
"epoch": 0.6459886190590687,
"grad_norm": 0.07483901083469391,
"learning_rate": 1.7120356507486694e-05,
"loss": 0.0117,
"step": 4030
},
{
"epoch": 0.6475915684860143,
"grad_norm": 0.08027271181344986,
"learning_rate": 1.7100682702254133e-05,
"loss": 0.0117,
"step": 4040
},
{
"epoch": 0.6491945179129599,
"grad_norm": 0.0767173245549202,
"learning_rate": 1.7080953307266507e-05,
"loss": 0.012,
"step": 4050
},
{
"epoch": 0.6507974673399054,
"grad_norm": 0.07106052339076996,
"learning_rate": 1.70611684769811e-05,
"loss": 0.0109,
"step": 4060
},
{
"epoch": 0.652400416766851,
"grad_norm": 0.0712939128279686,
"learning_rate": 1.7041328366289202e-05,
"loss": 0.01,
"step": 4070
},
{
"epoch": 0.6540033661937966,
"grad_norm": 0.08332879841327667,
"learning_rate": 1.7021433130514875e-05,
"loss": 0.0114,
"step": 4080
},
{
"epoch": 0.6556063156207421,
"grad_norm": 0.07251271605491638,
"learning_rate": 1.7001482925413748e-05,
"loss": 0.0113,
"step": 4090
},
{
"epoch": 0.6572092650476877,
"grad_norm": 0.09120030701160431,
"learning_rate": 1.6981477907171792e-05,
"loss": 0.0103,
"step": 4100
},
{
"epoch": 0.6588122144746333,
"grad_norm": 0.0815647691488266,
"learning_rate": 1.696141823240409e-05,
"loss": 0.0109,
"step": 4110
},
{
"epoch": 0.660415163901579,
"grad_norm": 0.08549433201551437,
"learning_rate": 1.6941304058153637e-05,
"loss": 0.0113,
"step": 4120
},
{
"epoch": 0.6620181133285244,
"grad_norm": 0.06725161522626877,
"learning_rate": 1.6921135541890075e-05,
"loss": 0.0114,
"step": 4130
},
{
"epoch": 0.6636210627554701,
"grad_norm": 0.06733765453100204,
"learning_rate": 1.6900912841508496e-05,
"loss": 0.0116,
"step": 4140
},
{
"epoch": 0.6652240121824157,
"grad_norm": 0.10023114830255508,
"learning_rate": 1.6880636115328165e-05,
"loss": 0.0103,
"step": 4150
},
{
"epoch": 0.6668269616093613,
"grad_norm": 0.07057762891054153,
"learning_rate": 1.6860305522091328e-05,
"loss": 0.0113,
"step": 4160
},
{
"epoch": 0.6684299110363068,
"grad_norm": 0.08346732705831528,
"learning_rate": 1.683992122096193e-05,
"loss": 0.0111,
"step": 4170
},
{
"epoch": 0.6700328604632524,
"grad_norm": 0.07453355193138123,
"learning_rate": 1.681948337152439e-05,
"loss": 0.0103,
"step": 4180
},
{
"epoch": 0.671635809890198,
"grad_norm": 0.07577425241470337,
"learning_rate": 1.6798992133782333e-05,
"loss": 0.0111,
"step": 4190
},
{
"epoch": 0.6732387593171435,
"grad_norm": 0.07895900309085846,
"learning_rate": 1.677844766815737e-05,
"loss": 0.01,
"step": 4200
},
{
"epoch": 0.6748417087440891,
"grad_norm": 0.10164612531661987,
"learning_rate": 1.6757850135487813e-05,
"loss": 0.0099,
"step": 4210
},
{
"epoch": 0.6764446581710347,
"grad_norm": 0.08837499469518661,
"learning_rate": 1.673719969702742e-05,
"loss": 0.0114,
"step": 4220
},
{
"epoch": 0.6780476075979803,
"grad_norm": 0.07328519970178604,
"learning_rate": 1.671649651444414e-05,
"loss": 0.011,
"step": 4230
},
{
"epoch": 0.6796505570249258,
"grad_norm": 0.07811128348112106,
"learning_rate": 1.6695740749818847e-05,
"loss": 0.0105,
"step": 4240
},
{
"epoch": 0.6812535064518714,
"grad_norm": 0.08153773844242096,
"learning_rate": 1.6674932565644068e-05,
"loss": 0.0116,
"step": 4250
},
{
"epoch": 0.682856455878817,
"grad_norm": 0.07350067049264908,
"learning_rate": 1.6654072124822713e-05,
"loss": 0.0112,
"step": 4260
},
{
"epoch": 0.6844594053057627,
"grad_norm": 0.08609039336442947,
"learning_rate": 1.663315959066679e-05,
"loss": 0.0117,
"step": 4270
},
{
"epoch": 0.6860623547327082,
"grad_norm": 0.0826505571603775,
"learning_rate": 1.6612195126896143e-05,
"loss": 0.0104,
"step": 4280
},
{
"epoch": 0.6876653041596538,
"grad_norm": 0.08540302515029907,
"learning_rate": 1.6591178897637167e-05,
"loss": 0.0101,
"step": 4290
},
{
"epoch": 0.6892682535865994,
"grad_norm": 0.07264818996191025,
"learning_rate": 1.6570111067421504e-05,
"loss": 0.0112,
"step": 4300
},
{
"epoch": 0.6908712030135449,
"grad_norm": 0.07823798805475235,
"learning_rate": 1.6548991801184784e-05,
"loss": 0.0108,
"step": 4310
},
{
"epoch": 0.6924741524404905,
"grad_norm": 0.14571355283260345,
"learning_rate": 1.6527821264265303e-05,
"loss": 0.0109,
"step": 4320
},
{
"epoch": 0.6940771018674361,
"grad_norm": 0.06896299868822098,
"learning_rate": 1.6506599622402757e-05,
"loss": 0.0108,
"step": 4330
},
{
"epoch": 0.6956800512943817,
"grad_norm": 0.08016230165958405,
"learning_rate": 1.648532704173693e-05,
"loss": 0.0117,
"step": 4340
},
{
"epoch": 0.6972830007213272,
"grad_norm": 0.07234030216932297,
"learning_rate": 1.6464003688806385e-05,
"loss": 0.0112,
"step": 4350
},
{
"epoch": 0.6988859501482728,
"grad_norm": 0.08299129456281662,
"learning_rate": 1.6442629730547187e-05,
"loss": 0.011,
"step": 4360
},
{
"epoch": 0.7004888995752184,
"grad_norm": 0.07566344738006592,
"learning_rate": 1.6421205334291563e-05,
"loss": 0.0111,
"step": 4370
},
{
"epoch": 0.702091849002164,
"grad_norm": 0.0738309845328331,
"learning_rate": 1.639973066776662e-05,
"loss": 0.0123,
"step": 4380
},
{
"epoch": 0.7036947984291095,
"grad_norm": 0.06730606406927109,
"learning_rate": 1.637820589909302e-05,
"loss": 0.0107,
"step": 4390
},
{
"epoch": 0.7052977478560551,
"grad_norm": 0.07160209119319916,
"learning_rate": 1.6356631196783657e-05,
"loss": 0.0106,
"step": 4400
},
{
"epoch": 0.7069006972830008,
"grad_norm": 0.06399673223495483,
"learning_rate": 1.6335006729742345e-05,
"loss": 0.0114,
"step": 4410
},
{
"epoch": 0.7085036467099463,
"grad_norm": 0.06448632478713989,
"learning_rate": 1.6313332667262506e-05,
"loss": 0.0102,
"step": 4420
},
{
"epoch": 0.7101065961368919,
"grad_norm": 0.06451641768217087,
"learning_rate": 1.6291609179025827e-05,
"loss": 0.0101,
"step": 4430
},
{
"epoch": 0.7117095455638375,
"grad_norm": 0.10104962438344955,
"learning_rate": 1.6269836435100933e-05,
"loss": 0.0115,
"step": 4440
},
{
"epoch": 0.7133124949907831,
"grad_norm": 0.09099545329809189,
"learning_rate": 1.6248014605942074e-05,
"loss": 0.0115,
"step": 4450
},
{
"epoch": 0.7149154444177286,
"grad_norm": 0.06802688539028168,
"learning_rate": 1.6226143862387776e-05,
"loss": 0.0106,
"step": 4460
},
{
"epoch": 0.7165183938446742,
"grad_norm": 0.08041039854288101,
"learning_rate": 1.620422437565949e-05,
"loss": 0.0107,
"step": 4470
},
{
"epoch": 0.7181213432716198,
"grad_norm": 0.07889063656330109,
"learning_rate": 1.6182256317360286e-05,
"loss": 0.0106,
"step": 4480
},
{
"epoch": 0.7197242926985654,
"grad_norm": 0.08090974390506744,
"learning_rate": 1.6160239859473484e-05,
"loss": 0.0115,
"step": 4490
},
{
"epoch": 0.7213272421255109,
"grad_norm": 0.07090416550636292,
"learning_rate": 1.613817517436131e-05,
"loss": 0.011,
"step": 4500
},
{
"epoch": 0.7229301915524565,
"grad_norm": 0.07482471317052841,
"learning_rate": 1.611606243476356e-05,
"loss": 0.0113,
"step": 4510
},
{
"epoch": 0.7245331409794021,
"grad_norm": 0.07659480720758438,
"learning_rate": 1.6093901813796223e-05,
"loss": 0.0098,
"step": 4520
},
{
"epoch": 0.7261360904063476,
"grad_norm": 0.07723913341760635,
"learning_rate": 1.6071693484950165e-05,
"loss": 0.0105,
"step": 4530
},
{
"epoch": 0.7277390398332932,
"grad_norm": 0.08152413368225098,
"learning_rate": 1.6049437622089715e-05,
"loss": 0.0122,
"step": 4540
},
{
"epoch": 0.7293419892602389,
"grad_norm": 0.07603183388710022,
"learning_rate": 1.602713439945137e-05,
"loss": 0.0125,
"step": 4550
},
{
"epoch": 0.7309449386871845,
"grad_norm": 0.07525479793548584,
"learning_rate": 1.6004783991642373e-05,
"loss": 0.011,
"step": 4560
},
{
"epoch": 0.73254788811413,
"grad_norm": 0.0917053148150444,
"learning_rate": 1.5982386573639375e-05,
"loss": 0.0108,
"step": 4570
},
{
"epoch": 0.7341508375410756,
"grad_norm": 0.0688992515206337,
"learning_rate": 1.595994232078707e-05,
"loss": 0.011,
"step": 4580
},
{
"epoch": 0.7357537869680212,
"grad_norm": 0.08237089961767197,
"learning_rate": 1.5937451408796796e-05,
"loss": 0.0105,
"step": 4590
},
{
"epoch": 0.7373567363949668,
"grad_norm": 0.07928130775690079,
"learning_rate": 1.5914914013745194e-05,
"loss": 0.0114,
"step": 4600
},
{
"epoch": 0.7389596858219123,
"grad_norm": 0.07128145545721054,
"learning_rate": 1.589233031207279e-05,
"loss": 0.0105,
"step": 4610
},
{
"epoch": 0.7405626352488579,
"grad_norm": 0.07204887270927429,
"learning_rate": 1.586970048058266e-05,
"loss": 0.0098,
"step": 4620
},
{
"epoch": 0.7421655846758035,
"grad_norm": 0.08074098080396652,
"learning_rate": 1.584702469643899e-05,
"loss": 0.0123,
"step": 4630
},
{
"epoch": 0.743768534102749,
"grad_norm": 0.07323694974184036,
"learning_rate": 1.582430313716575e-05,
"loss": 0.0095,
"step": 4640
},
{
"epoch": 0.7453714835296946,
"grad_norm": 0.0781148299574852,
"learning_rate": 1.5801535980645253e-05,
"loss": 0.0098,
"step": 4650
},
{
"epoch": 0.7469744329566402,
"grad_norm": 0.07220373302698135,
"learning_rate": 1.5778723405116792e-05,
"loss": 0.011,
"step": 4660
},
{
"epoch": 0.7485773823835858,
"grad_norm": 0.05822448804974556,
"learning_rate": 1.5755865589175237e-05,
"loss": 0.0108,
"step": 4670
},
{
"epoch": 0.7501803318105313,
"grad_norm": 0.0715513601899147,
"learning_rate": 1.573296271176963e-05,
"loss": 0.0103,
"step": 4680
},
{
"epoch": 0.751783281237477,
"grad_norm": 0.3894812762737274,
"learning_rate": 1.5710014952201797e-05,
"loss": 0.0103,
"step": 4690
},
{
"epoch": 0.7533862306644226,
"grad_norm": 0.07824983447790146,
"learning_rate": 1.5687022490124926e-05,
"loss": 0.01,
"step": 4700
},
{
"epoch": 0.7549891800913682,
"grad_norm": 0.0841994360089302,
"learning_rate": 1.5663985505542185e-05,
"loss": 0.012,
"step": 4710
},
{
"epoch": 0.7565921295183137,
"grad_norm": 0.13080202043056488,
"learning_rate": 1.5640904178805293e-05,
"loss": 0.0119,
"step": 4720
},
{
"epoch": 0.7581950789452593,
"grad_norm": 0.10361091792583466,
"learning_rate": 1.561777869061311e-05,
"loss": 0.0131,
"step": 4730
},
{
"epoch": 0.7597980283722049,
"grad_norm": 0.07729562371969223,
"learning_rate": 1.559460922201023e-05,
"loss": 0.0128,
"step": 4740
},
{
"epoch": 0.7614009777991504,
"grad_norm": 0.07854557782411575,
"learning_rate": 1.5571395954385565e-05,
"loss": 0.0117,
"step": 4750
},
{
"epoch": 0.763003927226096,
"grad_norm": 0.07294869422912598,
"learning_rate": 1.5548139069470923e-05,
"loss": 0.0107,
"step": 4760
},
{
"epoch": 0.7646068766530416,
"grad_norm": 0.09968176484107971,
"learning_rate": 1.5524838749339567e-05,
"loss": 0.0113,
"step": 4770
},
{
"epoch": 0.7662098260799872,
"grad_norm": 0.09212654083967209,
"learning_rate": 1.5501495176404817e-05,
"loss": 0.0115,
"step": 4780
},
{
"epoch": 0.7678127755069327,
"grad_norm": 0.07619819790124893,
"learning_rate": 1.5478108533418604e-05,
"loss": 0.0122,
"step": 4790
},
{
"epoch": 0.7694157249338783,
"grad_norm": 0.07858500629663467,
"learning_rate": 1.5454679003470056e-05,
"loss": 0.0106,
"step": 4800
},
{
"epoch": 0.7710186743608239,
"grad_norm": 0.0700116902589798,
"learning_rate": 1.5431206769984047e-05,
"loss": 0.0121,
"step": 4810
},
{
"epoch": 0.7726216237877694,
"grad_norm": 0.0738898515701294,
"learning_rate": 1.5407692016719763e-05,
"loss": 0.01,
"step": 4820
},
{
"epoch": 0.774224573214715,
"grad_norm": 0.06155667454004288,
"learning_rate": 1.538413492776928e-05,
"loss": 0.0103,
"step": 4830
},
{
"epoch": 0.7758275226416607,
"grad_norm": 0.08188408613204956,
"learning_rate": 1.5360535687556097e-05,
"loss": 0.0102,
"step": 4840
},
{
"epoch": 0.7774304720686063,
"grad_norm": 0.06689083576202393,
"learning_rate": 1.533689448083372e-05,
"loss": 0.0104,
"step": 4850
},
{
"epoch": 0.7790334214955518,
"grad_norm": 0.09438969194889069,
"learning_rate": 1.5313211492684193e-05,
"loss": 0.0095,
"step": 4860
},
{
"epoch": 0.7806363709224974,
"grad_norm": 0.05713031813502312,
"learning_rate": 1.5289486908516665e-05,
"loss": 0.0097,
"step": 4870
},
{
"epoch": 0.782239320349443,
"grad_norm": 0.0832870751619339,
"learning_rate": 1.5265720914065925e-05,
"loss": 0.0114,
"step": 4880
},
{
"epoch": 0.7838422697763886,
"grad_norm": 0.06851907074451447,
"learning_rate": 1.5241913695390957e-05,
"loss": 0.0103,
"step": 4890
},
{
"epoch": 0.7854452192033341,
"grad_norm": 0.07114174216985703,
"learning_rate": 1.5218065438873484e-05,
"loss": 0.0105,
"step": 4900
},
{
"epoch": 0.7870481686302797,
"grad_norm": 0.06810145080089569,
"learning_rate": 1.5194176331216496e-05,
"loss": 0.0098,
"step": 4910
},
{
"epoch": 0.7886511180572253,
"grad_norm": 0.08911102265119553,
"learning_rate": 1.5170246559442815e-05,
"loss": 0.0104,
"step": 4920
},
{
"epoch": 0.7902540674841708,
"grad_norm": 0.07687364518642426,
"learning_rate": 1.5146276310893594e-05,
"loss": 0.0106,
"step": 4930
},
{
"epoch": 0.7918570169111164,
"grad_norm": 0.07641691714525223,
"learning_rate": 1.5122265773226886e-05,
"loss": 0.0094,
"step": 4940
},
{
"epoch": 0.793459966338062,
"grad_norm": 0.07674935460090637,
"learning_rate": 1.5098215134416148e-05,
"loss": 0.0098,
"step": 4950
},
{
"epoch": 0.7950629157650076,
"grad_norm": 0.0895194262266159,
"learning_rate": 1.5074124582748785e-05,
"loss": 0.0107,
"step": 4960
},
{
"epoch": 0.7966658651919531,
"grad_norm": 0.07801928371191025,
"learning_rate": 1.5049994306824678e-05,
"loss": 0.0095,
"step": 4970
},
{
"epoch": 0.7982688146188988,
"grad_norm": 0.08184653520584106,
"learning_rate": 1.5025824495554688e-05,
"loss": 0.0096,
"step": 4980
},
{
"epoch": 0.7998717640458444,
"grad_norm": 0.07062246650457382,
"learning_rate": 1.5001615338159198e-05,
"loss": 0.0104,
"step": 4990
},
{
"epoch": 0.80147471347279,
"grad_norm": 0.06757838279008865,
"learning_rate": 1.497736702416662e-05,
"loss": 0.0099,
"step": 5000
},
{
"epoch": 0.8030776628997355,
"grad_norm": 0.08436329662799835,
"learning_rate": 1.4953079743411922e-05,
"loss": 0.0102,
"step": 5010
},
{
"epoch": 0.8046806123266811,
"grad_norm": 0.0803229808807373,
"learning_rate": 1.4928753686035128e-05,
"loss": 0.01,
"step": 5020
},
{
"epoch": 0.8062835617536267,
"grad_norm": 0.08634204417467117,
"learning_rate": 1.4904389042479831e-05,
"loss": 0.0094,
"step": 5030
},
{
"epoch": 0.8078865111805722,
"grad_norm": 0.07308026403188705,
"learning_rate": 1.4879986003491722e-05,
"loss": 0.0103,
"step": 5040
},
{
"epoch": 0.8094894606075178,
"grad_norm": 0.07339402288198471,
"learning_rate": 1.4855544760117064e-05,
"loss": 0.0102,
"step": 5050
},
{
"epoch": 0.8110924100344634,
"grad_norm": 0.06620003283023834,
"learning_rate": 1.4831065503701234e-05,
"loss": 0.0093,
"step": 5060
},
{
"epoch": 0.812695359461409,
"grad_norm": 0.0636444240808487,
"learning_rate": 1.4806548425887186e-05,
"loss": 0.0105,
"step": 5070
},
{
"epoch": 0.8142983088883545,
"grad_norm": 0.06956276297569275,
"learning_rate": 1.4781993718613983e-05,
"loss": 0.0099,
"step": 5080
},
{
"epoch": 0.8159012583153001,
"grad_norm": 0.06907407194375992,
"learning_rate": 1.475740157411527e-05,
"loss": 0.0121,
"step": 5090
},
{
"epoch": 0.8175042077422457,
"grad_norm": 0.06344885379076004,
"learning_rate": 1.4732772184917795e-05,
"loss": 0.0097,
"step": 5100
},
{
"epoch": 0.8191071571691914,
"grad_norm": 0.08058324456214905,
"learning_rate": 1.4708105743839876e-05,
"loss": 0.0101,
"step": 5110
},
{
"epoch": 0.8207101065961369,
"grad_norm": 0.07553695142269135,
"learning_rate": 1.46834024439899e-05,
"loss": 0.0096,
"step": 5120
},
{
"epoch": 0.8223130560230825,
"grad_norm": 0.06311500817537308,
"learning_rate": 1.4658662478764823e-05,
"loss": 0.0108,
"step": 5130
},
{
"epoch": 0.8239160054500281,
"grad_norm": 0.07880297303199768,
"learning_rate": 1.463388604184864e-05,
"loss": 0.0096,
"step": 5140
},
{
"epoch": 0.8255189548769736,
"grad_norm": 0.07937072962522507,
"learning_rate": 1.4609073327210879e-05,
"loss": 0.0105,
"step": 5150
},
{
"epoch": 0.8271219043039192,
"grad_norm": 0.07063055038452148,
"learning_rate": 1.4584224529105077e-05,
"loss": 0.0108,
"step": 5160
},
{
"epoch": 0.8287248537308648,
"grad_norm": 0.07742191851139069,
"learning_rate": 1.4559339842067259e-05,
"loss": 0.0108,
"step": 5170
},
{
"epoch": 0.8303278031578104,
"grad_norm": 0.07162066549062729,
"learning_rate": 1.453441946091442e-05,
"loss": 0.0095,
"step": 5180
},
{
"epoch": 0.8319307525847559,
"grad_norm": 0.0674009919166565,
"learning_rate": 1.4509463580742993e-05,
"loss": 0.0102,
"step": 5190
},
{
"epoch": 0.8335337020117015,
"grad_norm": 0.07282152771949768,
"learning_rate": 1.4484472396927334e-05,
"loss": 0.0093,
"step": 5200
},
{
"epoch": 0.8351366514386471,
"grad_norm": 0.07375837862491608,
"learning_rate": 1.4459446105118171e-05,
"loss": 0.0099,
"step": 5210
},
{
"epoch": 0.8367396008655927,
"grad_norm": 0.0662304237484932,
"learning_rate": 1.4434384901241096e-05,
"loss": 0.0099,
"step": 5220
},
{
"epoch": 0.8383425502925382,
"grad_norm": 0.0746363177895546,
"learning_rate": 1.4409288981495011e-05,
"loss": 0.0094,
"step": 5230
},
{
"epoch": 0.8399454997194838,
"grad_norm": 0.09581029415130615,
"learning_rate": 1.438415854235061e-05,
"loss": 0.0102,
"step": 5240
},
{
"epoch": 0.8415484491464295,
"grad_norm": 0.08328843116760254,
"learning_rate": 1.4358993780548832e-05,
"loss": 0.0095,
"step": 5250
},
{
"epoch": 0.843151398573375,
"grad_norm": 0.06462030857801437,
"learning_rate": 1.433379489309931e-05,
"loss": 0.0097,
"step": 5260
},
{
"epoch": 0.8447543480003206,
"grad_norm": 0.09732311964035034,
"learning_rate": 1.4308562077278854e-05,
"loss": 0.0109,
"step": 5270
},
{
"epoch": 0.8463572974272662,
"grad_norm": 0.06178658828139305,
"learning_rate": 1.4283295530629877e-05,
"loss": 0.0094,
"step": 5280
},
{
"epoch": 0.8479602468542118,
"grad_norm": 0.07857873290777206,
"learning_rate": 1.4257995450958877e-05,
"loss": 0.0105,
"step": 5290
},
{
"epoch": 0.8495631962811573,
"grad_norm": 0.0908777043223381,
"learning_rate": 1.423266203633487e-05,
"loss": 0.0102,
"step": 5300
},
{
"epoch": 0.8511661457081029,
"grad_norm": 0.0654769092798233,
"learning_rate": 1.4207295485087837e-05,
"loss": 0.0096,
"step": 5310
},
{
"epoch": 0.8527690951350485,
"grad_norm": 0.06554923206567764,
"learning_rate": 1.4181895995807193e-05,
"loss": 0.0089,
"step": 5320
},
{
"epoch": 0.8543720445619941,
"grad_norm": 0.07311050593852997,
"learning_rate": 1.415646376734021e-05,
"loss": 0.0091,
"step": 5330
},
{
"epoch": 0.8559749939889396,
"grad_norm": 0.07530997693538666,
"learning_rate": 1.4130998998790464e-05,
"loss": 0.0104,
"step": 5340
},
{
"epoch": 0.8575779434158852,
"grad_norm": 0.06178179755806923,
"learning_rate": 1.4105501889516288e-05,
"loss": 0.0095,
"step": 5350
},
{
"epoch": 0.8591808928428308,
"grad_norm": 0.07013700902462006,
"learning_rate": 1.4079972639129204e-05,
"loss": 0.0095,
"step": 5360
},
{
"epoch": 0.8607838422697763,
"grad_norm": 0.06522087752819061,
"learning_rate": 1.4054411447492352e-05,
"loss": 0.0092,
"step": 5370
},
{
"epoch": 0.8623867916967219,
"grad_norm": 0.06943648308515549,
"learning_rate": 1.4028818514718936e-05,
"loss": 0.0093,
"step": 5380
},
{
"epoch": 0.8639897411236676,
"grad_norm": 0.06666680425405502,
"learning_rate": 1.4003194041170665e-05,
"loss": 0.01,
"step": 5390
},
{
"epoch": 0.8655926905506132,
"grad_norm": 0.06522834300994873,
"learning_rate": 1.397753822745616e-05,
"loss": 0.0095,
"step": 5400
},
{
"epoch": 0.8671956399775587,
"grad_norm": 0.05904649198055267,
"learning_rate": 1.3951851274429409e-05,
"loss": 0.0096,
"step": 5410
},
{
"epoch": 0.8687985894045043,
"grad_norm": 0.06921979039907455,
"learning_rate": 1.392613338318817e-05,
"loss": 0.0102,
"step": 5420
},
{
"epoch": 0.8704015388314499,
"grad_norm": 0.06601817160844803,
"learning_rate": 1.3900384755072424e-05,
"loss": 0.0102,
"step": 5430
},
{
"epoch": 0.8720044882583955,
"grad_norm": 0.07253236323595047,
"learning_rate": 1.3874605591662778e-05,
"loss": 0.0095,
"step": 5440
},
{
"epoch": 0.873607437685341,
"grad_norm": 0.07604202628135681,
"learning_rate": 1.384879609477889e-05,
"loss": 0.0095,
"step": 5450
},
{
"epoch": 0.8752103871122866,
"grad_norm": 0.0852079764008522,
"learning_rate": 1.38229564664779e-05,
"loss": 0.0101,
"step": 5460
},
{
"epoch": 0.8768133365392322,
"grad_norm": 0.06676860898733139,
"learning_rate": 1.379708690905283e-05,
"loss": 0.0092,
"step": 5470
},
{
"epoch": 0.8784162859661777,
"grad_norm": 0.06362950801849365,
"learning_rate": 1.3771187625031027e-05,
"loss": 0.0098,
"step": 5480
},
{
"epoch": 0.8800192353931233,
"grad_norm": 0.06741517037153244,
"learning_rate": 1.3745258817172544e-05,
"loss": 0.0101,
"step": 5490
},
{
"epoch": 0.8816221848200689,
"grad_norm": 0.0978483185172081,
"learning_rate": 1.371930068846858e-05,
"loss": 0.01,
"step": 5500
},
{
"epoch": 0.8832251342470145,
"grad_norm": 0.05989569425582886,
"learning_rate": 1.3693313442139877e-05,
"loss": 0.0091,
"step": 5510
},
{
"epoch": 0.88482808367396,
"grad_norm": 0.07173648476600647,
"learning_rate": 1.3667297281635135e-05,
"loss": 0.0102,
"step": 5520
},
{
"epoch": 0.8864310331009057,
"grad_norm": 0.06889471411705017,
"learning_rate": 1.364125241062942e-05,
"loss": 0.0095,
"step": 5530
},
{
"epoch": 0.8880339825278513,
"grad_norm": 0.05711568892002106,
"learning_rate": 1.3615179033022556e-05,
"loss": 0.0099,
"step": 5540
},
{
"epoch": 0.8896369319547969,
"grad_norm": 0.07180725783109665,
"learning_rate": 1.3589077352937552e-05,
"loss": 0.0103,
"step": 5550
},
{
"epoch": 0.8912398813817424,
"grad_norm": 0.06540070474147797,
"learning_rate": 1.3562947574718977e-05,
"loss": 0.0083,
"step": 5560
},
{
"epoch": 0.892842830808688,
"grad_norm": 0.07605359703302383,
"learning_rate": 1.3536789902931391e-05,
"loss": 0.0106,
"step": 5570
},
{
"epoch": 0.8944457802356336,
"grad_norm": 0.06887410581111908,
"learning_rate": 1.351060454235771e-05,
"loss": 0.0093,
"step": 5580
},
{
"epoch": 0.8960487296625791,
"grad_norm": 0.06430304050445557,
"learning_rate": 1.3484391697997637e-05,
"loss": 0.009,
"step": 5590
},
{
"epoch": 0.8976516790895247,
"grad_norm": 0.08205439895391464,
"learning_rate": 1.3458151575066025e-05,
"loss": 0.0095,
"step": 5600
},
{
"epoch": 0.8992546285164703,
"grad_norm": 0.06184590607881546,
"learning_rate": 1.343188437899129e-05,
"loss": 0.0093,
"step": 5610
},
{
"epoch": 0.9008575779434159,
"grad_norm": 0.06461317837238312,
"learning_rate": 1.34055903154138e-05,
"loss": 0.0093,
"step": 5620
},
{
"epoch": 0.9024605273703614,
"grad_norm": 0.0797036737203598,
"learning_rate": 1.3379269590184264e-05,
"loss": 0.0092,
"step": 5630
},
{
"epoch": 0.904063476797307,
"grad_norm": 0.07450596988201141,
"learning_rate": 1.3352922409362122e-05,
"loss": 0.0098,
"step": 5640
},
{
"epoch": 0.9056664262242526,
"grad_norm": 0.06656588613986969,
"learning_rate": 1.332654897921391e-05,
"loss": 0.0097,
"step": 5650
},
{
"epoch": 0.9072693756511983,
"grad_norm": 0.08003687113523483,
"learning_rate": 1.3300149506211693e-05,
"loss": 0.0097,
"step": 5660
},
{
"epoch": 0.9088723250781438,
"grad_norm": 0.0784405767917633,
"learning_rate": 1.32737241970314e-05,
"loss": 0.0104,
"step": 5670
},
{
"epoch": 0.9104752745050894,
"grad_norm": 0.07136031985282898,
"learning_rate": 1.3247273258551236e-05,
"loss": 0.0086,
"step": 5680
},
{
"epoch": 0.912078223932035,
"grad_norm": 0.06181903928518295,
"learning_rate": 1.3220796897850045e-05,
"loss": 0.0085,
"step": 5690
},
{
"epoch": 0.9136811733589805,
"grad_norm": 0.06976137310266495,
"learning_rate": 1.3194295322205698e-05,
"loss": 0.0099,
"step": 5700
},
{
"epoch": 0.9152841227859261,
"grad_norm": 0.06435976177453995,
"learning_rate": 1.3167768739093479e-05,
"loss": 0.0095,
"step": 5710
},
{
"epoch": 0.9168870722128717,
"grad_norm": 0.06821715831756592,
"learning_rate": 1.314121735618443e-05,
"loss": 0.0093,
"step": 5720
},
{
"epoch": 0.9184900216398173,
"grad_norm": 0.07428358495235443,
"learning_rate": 1.3114641381343767e-05,
"loss": 0.0099,
"step": 5730
},
{
"epoch": 0.9200929710667628,
"grad_norm": 0.06527336686849594,
"learning_rate": 1.3088041022629217e-05,
"loss": 0.0089,
"step": 5740
},
{
"epoch": 0.9216959204937084,
"grad_norm": 0.0690665915608406,
"learning_rate": 1.3061416488289407e-05,
"loss": 0.0089,
"step": 5750
},
{
"epoch": 0.923298869920654,
"grad_norm": 0.06167382001876831,
"learning_rate": 1.3034767986762229e-05,
"loss": 0.0094,
"step": 5760
},
{
"epoch": 0.9249018193475996,
"grad_norm": 0.08395292609930038,
"learning_rate": 1.3008095726673214e-05,
"loss": 0.0097,
"step": 5770
},
{
"epoch": 0.9265047687745451,
"grad_norm": 0.07569417357444763,
"learning_rate": 1.2981399916833888e-05,
"loss": 0.0096,
"step": 5780
},
{
"epoch": 0.9281077182014907,
"grad_norm": 0.07300177216529846,
"learning_rate": 1.2954680766240146e-05,
"loss": 0.0093,
"step": 5790
},
{
"epoch": 0.9297106676284363,
"grad_norm": 0.05430857837200165,
"learning_rate": 1.2927938484070608e-05,
"loss": 0.0084,
"step": 5800
},
{
"epoch": 0.9313136170553818,
"grad_norm": 0.06711985915899277,
"learning_rate": 1.2901173279684998e-05,
"loss": 0.0098,
"step": 5810
},
{
"epoch": 0.9329165664823275,
"grad_norm": 0.05374586954712868,
"learning_rate": 1.2874385362622476e-05,
"loss": 0.0086,
"step": 5820
},
{
"epoch": 0.9345195159092731,
"grad_norm": 0.062482576817274094,
"learning_rate": 1.2847574942600037e-05,
"loss": 0.0099,
"step": 5830
},
{
"epoch": 0.9361224653362187,
"grad_norm": 0.06739748269319534,
"learning_rate": 1.2820742229510818e-05,
"loss": 0.0094,
"step": 5840
},
{
"epoch": 0.9377254147631642,
"grad_norm": 0.0729333832859993,
"learning_rate": 1.2793887433422515e-05,
"loss": 0.0105,
"step": 5850
},
{
"epoch": 0.9393283641901098,
"grad_norm": 0.061774469912052155,
"learning_rate": 1.276701076457568e-05,
"loss": 0.009,
"step": 5860
},
{
"epoch": 0.9409313136170554,
"grad_norm": 0.06991413980722427,
"learning_rate": 1.2740112433382124e-05,
"loss": 0.009,
"step": 5870
},
{
"epoch": 0.942534263044001,
"grad_norm": 0.06478750705718994,
"learning_rate": 1.2713192650423234e-05,
"loss": 0.0093,
"step": 5880
},
{
"epoch": 0.9441372124709465,
"grad_norm": 0.05868879333138466,
"learning_rate": 1.2686251626448341e-05,
"loss": 0.0091,
"step": 5890
},
{
"epoch": 0.9457401618978921,
"grad_norm": 0.06809740513563156,
"learning_rate": 1.2659289572373072e-05,
"loss": 0.0099,
"step": 5900
},
{
"epoch": 0.9473431113248377,
"grad_norm": 0.06697957962751389,
"learning_rate": 1.263230669927769e-05,
"loss": 0.0094,
"step": 5910
},
{
"epoch": 0.9489460607517832,
"grad_norm": 0.07428871095180511,
"learning_rate": 1.2605303218405449e-05,
"loss": 0.0101,
"step": 5920
},
{
"epoch": 0.9505490101787288,
"grad_norm": 0.06630957126617432,
"learning_rate": 1.2578279341160933e-05,
"loss": 0.0091,
"step": 5930
},
{
"epoch": 0.9521519596056744,
"grad_norm": 0.07631520181894302,
"learning_rate": 1.2551235279108407e-05,
"loss": 0.009,
"step": 5940
},
{
"epoch": 0.9537549090326201,
"grad_norm": 0.059855490922927856,
"learning_rate": 1.2524171243970163e-05,
"loss": 0.0095,
"step": 5950
},
{
"epoch": 0.9553578584595656,
"grad_norm": 0.0653349757194519,
"learning_rate": 1.2497087447624844e-05,
"loss": 0.0105,
"step": 5960
},
{
"epoch": 0.9569608078865112,
"grad_norm": 0.06262225657701492,
"learning_rate": 1.2469984102105821e-05,
"loss": 0.0083,
"step": 5970
},
{
"epoch": 0.9585637573134568,
"grad_norm": 0.06356295198202133,
"learning_rate": 1.2442861419599492e-05,
"loss": 0.0105,
"step": 5980
},
{
"epoch": 0.9601667067404024,
"grad_norm": 0.06270481646060944,
"learning_rate": 1.2415719612443651e-05,
"loss": 0.0092,
"step": 5990
},
{
"epoch": 0.9617696561673479,
"grad_norm": 0.06081055477261543,
"learning_rate": 1.2388558893125806e-05,
"loss": 0.0083,
"step": 6000
},
{
"epoch": 0.9633726055942935,
"grad_norm": 0.06876713782548904,
"learning_rate": 1.2361379474281536e-05,
"loss": 0.0097,
"step": 6010
},
{
"epoch": 0.9649755550212391,
"grad_norm": 0.06346289068460464,
"learning_rate": 1.233418156869281e-05,
"loss": 0.0099,
"step": 6020
},
{
"epoch": 0.9665785044481846,
"grad_norm": 1.9258140325546265,
"learning_rate": 1.2306965389286316e-05,
"loss": 0.0118,
"step": 6030
},
{
"epoch": 0.9681814538751302,
"grad_norm": 0.0953875482082367,
"learning_rate": 1.2279731149131821e-05,
"loss": 0.0196,
"step": 6040
},
{
"epoch": 0.9697844033020758,
"grad_norm": 0.09784775227308273,
"learning_rate": 1.225247906144047e-05,
"loss": 0.0116,
"step": 6050
},
{
"epoch": 0.9713873527290214,
"grad_norm": 0.12490657716989517,
"learning_rate": 1.2225209339563144e-05,
"loss": 0.0114,
"step": 6060
},
{
"epoch": 0.9729903021559669,
"grad_norm": 0.0873861089348793,
"learning_rate": 1.2197922196988776e-05,
"loss": 0.0104,
"step": 6070
},
{
"epoch": 0.9745932515829125,
"grad_norm": 0.06704119592905045,
"learning_rate": 1.2170617847342673e-05,
"loss": 0.0094,
"step": 6080
},
{
"epoch": 0.9761962010098582,
"grad_norm": 0.0751587525010109,
"learning_rate": 1.2143296504384868e-05,
"loss": 0.0094,
"step": 6090
},
{
"epoch": 0.9777991504368038,
"grad_norm": 0.07341291010379791,
"learning_rate": 1.2115958382008414e-05,
"loss": 0.0099,
"step": 6100
},
{
"epoch": 0.9794020998637493,
"grad_norm": 0.06904918700456619,
"learning_rate": 1.2088603694237744e-05,
"loss": 0.0099,
"step": 6110
},
{
"epoch": 0.9810050492906949,
"grad_norm": 0.06313113123178482,
"learning_rate": 1.2061232655226964e-05,
"loss": 0.0089,
"step": 6120
},
{
"epoch": 0.9826079987176405,
"grad_norm": 0.08169377595186234,
"learning_rate": 1.2033845479258197e-05,
"loss": 0.0086,
"step": 6130
},
{
"epoch": 0.984210948144586,
"grad_norm": 0.06288773566484451,
"learning_rate": 1.2006442380739896e-05,
"loss": 0.0089,
"step": 6140
},
{
"epoch": 0.9858138975715316,
"grad_norm": 0.07561258971691132,
"learning_rate": 1.197902357420517e-05,
"loss": 0.0102,
"step": 6150
},
{
"epoch": 0.9874168469984772,
"grad_norm": 0.06604144722223282,
"learning_rate": 1.1951589274310105e-05,
"loss": 0.0092,
"step": 6160
},
{
"epoch": 0.9890197964254228,
"grad_norm": 0.0782618299126625,
"learning_rate": 1.1924139695832077e-05,
"loss": 0.0094,
"step": 6170
},
{
"epoch": 0.9906227458523683,
"grad_norm": 0.08666810393333435,
"learning_rate": 1.189667505366808e-05,
"loss": 0.0095,
"step": 6180
},
{
"epoch": 0.9922256952793139,
"grad_norm": 0.08879975974559784,
"learning_rate": 1.1869195562833027e-05,
"loss": 0.009,
"step": 6190
},
{
"epoch": 0.9938286447062595,
"grad_norm": 0.08351059257984161,
"learning_rate": 1.1841701438458092e-05,
"loss": 0.0095,
"step": 6200
},
{
"epoch": 0.9954315941332051,
"grad_norm": 0.07370386272668839,
"learning_rate": 1.181419289578901e-05,
"loss": 0.0099,
"step": 6210
},
{
"epoch": 0.9970345435601506,
"grad_norm": 0.07470313459634781,
"learning_rate": 1.1786670150184381e-05,
"loss": 0.0093,
"step": 6220
},
{
"epoch": 0.9986374929870963,
"grad_norm": 0.06986988335847855,
"learning_rate": 1.1759133417114013e-05,
"loss": 0.0094,
"step": 6230
},
{
"epoch": 1.0001602949426946,
"grad_norm": 0.032465722411870956,
"learning_rate": 1.1731582912157206e-05,
"loss": 0.0086,
"step": 6240
},
{
"epoch": 1.00176324436964,
"grad_norm": 0.03515005484223366,
"learning_rate": 1.170401885100109e-05,
"loss": 0.0083,
"step": 6250
},
{
"epoch": 1.0033661937965856,
"grad_norm": 0.05306578055024147,
"learning_rate": 1.1676441449438908e-05,
"loss": 0.0082,
"step": 6260
},
{
"epoch": 1.0049691432235313,
"grad_norm": 0.047804247587919235,
"learning_rate": 1.164885092336836e-05,
"loss": 0.0079,
"step": 6270
},
{
"epoch": 1.0065720926504769,
"grad_norm": 0.052819378674030304,
"learning_rate": 1.1621247488789878e-05,
"loss": 0.0077,
"step": 6280
},
{
"epoch": 1.0081750420774225,
"grad_norm": 0.04436744377017021,
"learning_rate": 1.159363136180496e-05,
"loss": 0.0078,
"step": 6290
},
{
"epoch": 1.009777991504368,
"grad_norm": 0.0423140823841095,
"learning_rate": 1.1566002758614476e-05,
"loss": 0.0081,
"step": 6300
},
{
"epoch": 1.0113809409313137,
"grad_norm": 0.0411105714738369,
"learning_rate": 1.153836189551696e-05,
"loss": 0.0076,
"step": 6310
},
{
"epoch": 1.012983890358259,
"grad_norm": 0.048951249569654465,
"learning_rate": 1.151070898890693e-05,
"loss": 0.0087,
"step": 6320
},
{
"epoch": 1.0145868397852047,
"grad_norm": 0.04871077463030815,
"learning_rate": 1.148304425527319e-05,
"loss": 0.0083,
"step": 6330
},
{
"epoch": 1.0161897892121503,
"grad_norm": 0.04017919674515724,
"learning_rate": 1.1455367911197137e-05,
"loss": 0.0092,
"step": 6340
},
{
"epoch": 1.017792738639096,
"grad_norm": 0.04326304793357849,
"learning_rate": 1.1427680173351057e-05,
"loss": 0.0082,
"step": 6350
},
{
"epoch": 1.0193956880660415,
"grad_norm": 0.04374143108725548,
"learning_rate": 1.1399981258496447e-05,
"loss": 0.007,
"step": 6360
},
{
"epoch": 1.0209986374929871,
"grad_norm": 0.04934844747185707,
"learning_rate": 1.1372271383482293e-05,
"loss": 0.0085,
"step": 6370
},
{
"epoch": 1.0226015869199327,
"grad_norm": 0.05297398567199707,
"learning_rate": 1.1344550765243398e-05,
"loss": 0.0088,
"step": 6380
},
{
"epoch": 1.0242045363468784,
"grad_norm": 0.048744361847639084,
"learning_rate": 1.1316819620798665e-05,
"loss": 0.0079,
"step": 6390
},
{
"epoch": 1.0258074857738237,
"grad_norm": 0.04246707633137703,
"learning_rate": 1.1289078167249403e-05,
"loss": 0.0088,
"step": 6400
},
{
"epoch": 1.0274104352007694,
"grad_norm": 0.060280464589595795,
"learning_rate": 1.1261326621777635e-05,
"loss": 0.0073,
"step": 6410
},
{
"epoch": 1.029013384627715,
"grad_norm": 0.09698043763637543,
"learning_rate": 1.1233565201644383e-05,
"loss": 0.0083,
"step": 6420
},
{
"epoch": 1.0306163340546606,
"grad_norm": 0.045573972165584564,
"learning_rate": 1.1205794124187985e-05,
"loss": 0.0081,
"step": 6430
},
{
"epoch": 1.0322192834816062,
"grad_norm": 0.04904637858271599,
"learning_rate": 1.117801360682238e-05,
"loss": 0.0093,
"step": 6440
},
{
"epoch": 1.0338222329085518,
"grad_norm": 0.06047618016600609,
"learning_rate": 1.1150223867035405e-05,
"loss": 0.0091,
"step": 6450
},
{
"epoch": 1.0354251823354974,
"grad_norm": 0.04516315832734108,
"learning_rate": 1.112242512238711e-05,
"loss": 0.0082,
"step": 6460
},
{
"epoch": 1.0370281317624428,
"grad_norm": 0.0490105003118515,
"learning_rate": 1.1094617590508025e-05,
"loss": 0.0082,
"step": 6470
},
{
"epoch": 1.0386310811893884,
"grad_norm": 0.0697263553738594,
"learning_rate": 1.106680148909749e-05,
"loss": 0.0085,
"step": 6480
},
{
"epoch": 1.040234030616334,
"grad_norm": 0.04336274787783623,
"learning_rate": 1.1038977035921921e-05,
"loss": 0.0077,
"step": 6490
},
{
"epoch": 1.0418369800432796,
"grad_norm": 0.043528806418180466,
"learning_rate": 1.1011144448813129e-05,
"loss": 0.0082,
"step": 6500
},
{
"epoch": 1.0434399294702252,
"grad_norm": 0.036150723695755005,
"learning_rate": 1.0983303945666599e-05,
"loss": 0.0079,
"step": 6510
},
{
"epoch": 1.0450428788971708,
"grad_norm": 0.057397518306970596,
"learning_rate": 1.0955455744439782e-05,
"loss": 0.0076,
"step": 6520
},
{
"epoch": 1.0466458283241165,
"grad_norm": 0.04161163419485092,
"learning_rate": 1.0927600063150413e-05,
"loss": 0.007,
"step": 6530
},
{
"epoch": 1.0482487777510618,
"grad_norm": 0.03657936677336693,
"learning_rate": 1.0899737119874769e-05,
"loss": 0.0078,
"step": 6540
},
{
"epoch": 1.0498517271780075,
"grad_norm": 0.04080647602677345,
"learning_rate": 1.0871867132745989e-05,
"loss": 0.0075,
"step": 6550
},
{
"epoch": 1.051454676604953,
"grad_norm": 0.04533609747886658,
"learning_rate": 1.0843990319952351e-05,
"loss": 0.0082,
"step": 6560
},
{
"epoch": 1.0530576260318987,
"grad_norm": 0.05578101426362991,
"learning_rate": 1.0816106899735579e-05,
"loss": 0.0082,
"step": 6570
},
{
"epoch": 1.0546605754588443,
"grad_norm": 0.04739479348063469,
"learning_rate": 1.078821709038912e-05,
"loss": 0.0087,
"step": 6580
},
{
"epoch": 1.05626352488579,
"grad_norm": 0.04615991190075874,
"learning_rate": 1.0760321110256436e-05,
"loss": 0.0091,
"step": 6590
},
{
"epoch": 1.0578664743127355,
"grad_norm": 0.05354125425219536,
"learning_rate": 1.0732419177729303e-05,
"loss": 0.0071,
"step": 6600
},
{
"epoch": 1.0594694237396811,
"grad_norm": 0.040618497878313065,
"learning_rate": 1.0704511511246096e-05,
"loss": 0.0079,
"step": 6610
},
{
"epoch": 1.0610723731666265,
"grad_norm": 0.043357282876968384,
"learning_rate": 1.0676598329290087e-05,
"loss": 0.0079,
"step": 6620
},
{
"epoch": 1.0626753225935721,
"grad_norm": 0.033633410930633545,
"learning_rate": 1.064867985038771e-05,
"loss": 0.0078,
"step": 6630
},
{
"epoch": 1.0642782720205177,
"grad_norm": 0.05955130606889725,
"learning_rate": 1.0620756293106891e-05,
"loss": 0.008,
"step": 6640
},
{
"epoch": 1.0658812214474633,
"grad_norm": 0.04183583706617355,
"learning_rate": 1.0592827876055291e-05,
"loss": 0.0077,
"step": 6650
},
{
"epoch": 1.067484170874409,
"grad_norm": 0.05350350961089134,
"learning_rate": 1.0564894817878632e-05,
"loss": 0.0077,
"step": 6660
},
{
"epoch": 1.0690871203013546,
"grad_norm": 0.05788803473114967,
"learning_rate": 1.0536957337258968e-05,
"loss": 0.0077,
"step": 6670
},
{
"epoch": 1.0706900697283002,
"grad_norm": 0.046094585210084915,
"learning_rate": 1.0509015652912965e-05,
"loss": 0.008,
"step": 6680
},
{
"epoch": 1.0722930191552456,
"grad_norm": 0.05263133347034454,
"learning_rate": 1.0481069983590222e-05,
"loss": 0.0077,
"step": 6690
},
{
"epoch": 1.0738959685821912,
"grad_norm": 0.04569438099861145,
"learning_rate": 1.0453120548071503e-05,
"loss": 0.0077,
"step": 6700
},
{
"epoch": 1.0754989180091368,
"grad_norm": 0.054995764046907425,
"learning_rate": 1.0425167565167085e-05,
"loss": 0.008,
"step": 6710
},
{
"epoch": 1.0771018674360824,
"grad_norm": 0.05096138268709183,
"learning_rate": 1.0397211253715005e-05,
"loss": 0.0075,
"step": 6720
},
{
"epoch": 1.078704816863028,
"grad_norm": 0.04507224261760712,
"learning_rate": 1.0369251832579362e-05,
"loss": 0.0081,
"step": 6730
},
{
"epoch": 1.0803077662899736,
"grad_norm": 0.046497710049152374,
"learning_rate": 1.0341289520648591e-05,
"loss": 0.0083,
"step": 6740
},
{
"epoch": 1.0819107157169192,
"grad_norm": 0.04293447360396385,
"learning_rate": 1.031332453683377e-05,
"loss": 0.0081,
"step": 6750
},
{
"epoch": 1.0835136651438648,
"grad_norm": 0.042333897203207016,
"learning_rate": 1.028535710006689e-05,
"loss": 0.0081,
"step": 6760
},
{
"epoch": 1.0851166145708102,
"grad_norm": 0.036750372499227524,
"learning_rate": 1.0257387429299144e-05,
"loss": 0.0088,
"step": 6770
},
{
"epoch": 1.0867195639977558,
"grad_norm": 0.05808829143643379,
"learning_rate": 1.0229415743499217e-05,
"loss": 0.0081,
"step": 6780
},
{
"epoch": 1.0883225134247014,
"grad_norm": 0.04959068074822426,
"learning_rate": 1.0201442261651571e-05,
"loss": 0.008,
"step": 6790
},
{
"epoch": 1.089925462851647,
"grad_norm": 0.04482626914978027,
"learning_rate": 1.017346720275472e-05,
"loss": 0.0072,
"step": 6800
},
{
"epoch": 1.0915284122785927,
"grad_norm": 0.03439100831747055,
"learning_rate": 1.0145490785819537e-05,
"loss": 0.0079,
"step": 6810
},
{
"epoch": 1.0931313617055383,
"grad_norm": 0.053014714270830154,
"learning_rate": 1.0117513229867515e-05,
"loss": 0.0083,
"step": 6820
},
{
"epoch": 1.0947343111324839,
"grad_norm": 0.043408554047346115,
"learning_rate": 1.0089534753929073e-05,
"loss": 0.0085,
"step": 6830
},
{
"epoch": 1.0963372605594293,
"grad_norm": 0.044179074466228485,
"learning_rate": 1.0061555577041828e-05,
"loss": 0.008,
"step": 6840
},
{
"epoch": 1.0979402099863749,
"grad_norm": 0.05316644906997681,
"learning_rate": 1.0033575918248884e-05,
"loss": 0.0084,
"step": 6850
},
{
"epoch": 1.0995431594133205,
"grad_norm": 0.06040149927139282,
"learning_rate": 1.0005595996597122e-05,
"loss": 0.0081,
"step": 6860
},
{
"epoch": 1.101146108840266,
"grad_norm": 0.040795937180519104,
"learning_rate": 9.977616031135476e-06,
"loss": 0.0083,
"step": 6870
},
{
"epoch": 1.1027490582672117,
"grad_norm": 0.060764264315366745,
"learning_rate": 9.949636240913228e-06,
"loss": 0.0086,
"step": 6880
},
{
"epoch": 1.1043520076941573,
"grad_norm": 0.04060014709830284,
"learning_rate": 9.921656844978284e-06,
"loss": 0.0073,
"step": 6890
},
{
"epoch": 1.105954957121103,
"grad_norm": 0.04240868240594864,
"learning_rate": 9.893678062375455e-06,
"loss": 0.0085,
"step": 6900
},
{
"epoch": 1.1075579065480483,
"grad_norm": 0.04362311586737633,
"learning_rate": 9.865700112144776e-06,
"loss": 0.0081,
"step": 6910
},
{
"epoch": 1.109160855974994,
"grad_norm": 0.04985825717449188,
"learning_rate": 9.83772321331974e-06,
"loss": 0.0086,
"step": 6920
},
{
"epoch": 1.1107638054019395,
"grad_norm": 0.04281558841466904,
"learning_rate": 9.809747584925617e-06,
"loss": 0.0079,
"step": 6930
},
{
"epoch": 1.1123667548288851,
"grad_norm": 0.04530913755297661,
"learning_rate": 9.781773445977737e-06,
"loss": 0.0079,
"step": 6940
},
{
"epoch": 1.1139697042558308,
"grad_norm": 0.05035943537950516,
"learning_rate": 9.753801015479762e-06,
"loss": 0.0076,
"step": 6950
},
{
"epoch": 1.1155726536827764,
"grad_norm": 0.03706235811114311,
"learning_rate": 9.725830512421981e-06,
"loss": 0.0077,
"step": 6960
},
{
"epoch": 1.117175603109722,
"grad_norm": 0.04904279112815857,
"learning_rate": 9.697862155779593e-06,
"loss": 0.0083,
"step": 6970
},
{
"epoch": 1.1187785525366674,
"grad_norm": 0.03876826539635658,
"learning_rate": 9.669896164510996e-06,
"loss": 0.0083,
"step": 6980
},
{
"epoch": 1.120381501963613,
"grad_norm": 0.04224033281207085,
"learning_rate": 9.641932757556069e-06,
"loss": 0.0086,
"step": 6990
},
{
"epoch": 1.1219844513905586,
"grad_norm": 0.032640572637319565,
"learning_rate": 9.613972153834451e-06,
"loss": 0.0075,
"step": 7000
},
{
"epoch": 1.1235874008175042,
"grad_norm": 0.054974623024463654,
"learning_rate": 9.586014572243852e-06,
"loss": 0.0081,
"step": 7010
},
{
"epoch": 1.1251903502444498,
"grad_norm": 0.04727548360824585,
"learning_rate": 9.558060231658308e-06,
"loss": 0.0076,
"step": 7020
},
{
"epoch": 1.1267932996713954,
"grad_norm": 0.049345288425683975,
"learning_rate": 9.53010935092649e-06,
"loss": 0.0072,
"step": 7030
},
{
"epoch": 1.128396249098341,
"grad_norm": 0.045910853892564774,
"learning_rate": 9.502162148869967e-06,
"loss": 0.0078,
"step": 7040
},
{
"epoch": 1.1299991985252866,
"grad_norm": 0.037328120321035385,
"learning_rate": 9.474218844281533e-06,
"loss": 0.0072,
"step": 7050
},
{
"epoch": 1.131602147952232,
"grad_norm": 0.04577163606882095,
"learning_rate": 9.446279655923451e-06,
"loss": 0.0081,
"step": 7060
},
{
"epoch": 1.1332050973791776,
"grad_norm": 0.0425073467195034,
"learning_rate": 9.418344802525767e-06,
"loss": 0.0081,
"step": 7070
},
{
"epoch": 1.1348080468061232,
"grad_norm": 0.05351976305246353,
"learning_rate": 9.390414502784586e-06,
"loss": 0.0081,
"step": 7080
},
{
"epoch": 1.1364109962330688,
"grad_norm": 0.052148912101984024,
"learning_rate": 9.362488975360364e-06,
"loss": 0.0083,
"step": 7090
},
{
"epoch": 1.1380139456600145,
"grad_norm": 0.04858115687966347,
"learning_rate": 9.334568438876198e-06,
"loss": 0.0078,
"step": 7100
},
{
"epoch": 1.13961689508696,
"grad_norm": 0.05193081498146057,
"learning_rate": 9.306653111916105e-06,
"loss": 0.0076,
"step": 7110
},
{
"epoch": 1.1412198445139057,
"grad_norm": 0.041127193719148636,
"learning_rate": 9.27874321302333e-06,
"loss": 0.0078,
"step": 7120
},
{
"epoch": 1.142822793940851,
"grad_norm": 0.0410025380551815,
"learning_rate": 9.250838960698613e-06,
"loss": 0.0071,
"step": 7130
},
{
"epoch": 1.1444257433677967,
"grad_norm": 0.04048438370227814,
"learning_rate": 9.222940573398485e-06,
"loss": 0.0073,
"step": 7140
},
{
"epoch": 1.1460286927947423,
"grad_norm": 0.039375003427267075,
"learning_rate": 9.195048269533575e-06,
"loss": 0.0074,
"step": 7150
},
{
"epoch": 1.147631642221688,
"grad_norm": 0.04218687117099762,
"learning_rate": 9.167162267466876e-06,
"loss": 0.0077,
"step": 7160
},
{
"epoch": 1.1492345916486335,
"grad_norm": 0.045502275228500366,
"learning_rate": 9.139282785512046e-06,
"loss": 0.0079,
"step": 7170
},
{
"epoch": 1.1508375410755791,
"grad_norm": 0.03404016047716141,
"learning_rate": 9.111410041931696e-06,
"loss": 0.0078,
"step": 7180
},
{
"epoch": 1.1524404905025247,
"grad_norm": 0.032071568071842194,
"learning_rate": 9.083544254935696e-06,
"loss": 0.0086,
"step": 7190
},
{
"epoch": 1.1540434399294703,
"grad_norm": 0.059753019362688065,
"learning_rate": 9.05568564267944e-06,
"loss": 0.007,
"step": 7200
},
{
"epoch": 1.1556463893564157,
"grad_norm": 0.03715592995285988,
"learning_rate": 9.027834423262157e-06,
"loss": 0.0078,
"step": 7210
},
{
"epoch": 1.1572493387833613,
"grad_norm": 0.0512300506234169,
"learning_rate": 8.999990814725204e-06,
"loss": 0.0081,
"step": 7220
},
{
"epoch": 1.158852288210307,
"grad_norm": 0.04520628973841667,
"learning_rate": 8.972155035050351e-06,
"loss": 0.0082,
"step": 7230
},
{
"epoch": 1.1604552376372526,
"grad_norm": 0.0374721996486187,
"learning_rate": 8.944327302158073e-06,
"loss": 0.0075,
"step": 7240
},
{
"epoch": 1.1620581870641982,
"grad_norm": 0.0465792752802372,
"learning_rate": 8.91650783390585e-06,
"loss": 0.0074,
"step": 7250
},
{
"epoch": 1.1636611364911438,
"grad_norm": 0.06805742532014847,
"learning_rate": 8.888696848086474e-06,
"loss": 0.0077,
"step": 7260
},
{
"epoch": 1.1652640859180892,
"grad_norm": 0.055558763444423676,
"learning_rate": 8.860894562426308e-06,
"loss": 0.0073,
"step": 7270
},
{
"epoch": 1.1668670353450348,
"grad_norm": 0.057639673352241516,
"learning_rate": 8.83310119458361e-06,
"loss": 0.0076,
"step": 7280
},
{
"epoch": 1.1684699847719804,
"grad_norm": 0.04337713494896889,
"learning_rate": 8.805316962146835e-06,
"loss": 0.0079,
"step": 7290
},
{
"epoch": 1.170072934198926,
"grad_norm": 0.05388013273477554,
"learning_rate": 8.777542082632906e-06,
"loss": 0.0083,
"step": 7300
},
{
"epoch": 1.1716758836258716,
"grad_norm": 0.04648788273334503,
"learning_rate": 8.749776773485525e-06,
"loss": 0.0082,
"step": 7310
},
{
"epoch": 1.1732788330528172,
"grad_norm": 0.03778034448623657,
"learning_rate": 8.722021252073471e-06,
"loss": 0.0077,
"step": 7320
},
{
"epoch": 1.1748817824797628,
"grad_norm": 0.049436796456575394,
"learning_rate": 8.694275735688903e-06,
"loss": 0.0077,
"step": 7330
},
{
"epoch": 1.1764847319067084,
"grad_norm": 0.05889998748898506,
"learning_rate": 8.666540441545643e-06,
"loss": 0.0083,
"step": 7340
},
{
"epoch": 1.178087681333654,
"grad_norm": 0.05097229406237602,
"learning_rate": 8.63881558677749e-06,
"loss": 0.0073,
"step": 7350
},
{
"epoch": 1.1796906307605994,
"grad_norm": 0.04770096018910408,
"learning_rate": 8.611101388436518e-06,
"loss": 0.0077,
"step": 7360
},
{
"epoch": 1.181293580187545,
"grad_norm": 0.04946606978774071,
"learning_rate": 8.583398063491368e-06,
"loss": 0.0074,
"step": 7370
},
{
"epoch": 1.1828965296144907,
"grad_norm": 0.039538607001304626,
"learning_rate": 8.55570582882556e-06,
"loss": 0.0073,
"step": 7380
},
{
"epoch": 1.1844994790414363,
"grad_norm": 0.04183432087302208,
"learning_rate": 8.528024901235784e-06,
"loss": 0.0082,
"step": 7390
},
{
"epoch": 1.1861024284683819,
"grad_norm": 0.04200197011232376,
"learning_rate": 8.500355497430223e-06,
"loss": 0.0076,
"step": 7400
},
{
"epoch": 1.1877053778953275,
"grad_norm": 0.043875742703676224,
"learning_rate": 8.472697834026832e-06,
"loss": 0.0074,
"step": 7410
},
{
"epoch": 1.1893083273222729,
"grad_norm": 0.04134812578558922,
"learning_rate": 8.445052127551647e-06,
"loss": 0.0075,
"step": 7420
},
{
"epoch": 1.1909112767492185,
"grad_norm": 0.07085922360420227,
"learning_rate": 8.417418594437115e-06,
"loss": 0.0079,
"step": 7430
},
{
"epoch": 1.192514226176164,
"grad_norm": 0.037032246589660645,
"learning_rate": 8.389797451020361e-06,
"loss": 0.007,
"step": 7440
},
{
"epoch": 1.1941171756031097,
"grad_norm": 0.0462561696767807,
"learning_rate": 8.362188913541525e-06,
"loss": 0.0076,
"step": 7450
},
{
"epoch": 1.1957201250300553,
"grad_norm": 0.048092614859342575,
"learning_rate": 8.334593198142049e-06,
"loss": 0.0077,
"step": 7460
},
{
"epoch": 1.197323074457001,
"grad_norm": 0.043921004980802536,
"learning_rate": 8.307010520863008e-06,
"loss": 0.0081,
"step": 7470
},
{
"epoch": 1.1989260238839465,
"grad_norm": 0.048510029911994934,
"learning_rate": 8.27944109764339e-06,
"loss": 0.0076,
"step": 7480
},
{
"epoch": 1.2005289733108921,
"grad_norm": 0.060406643897295,
"learning_rate": 8.251885144318421e-06,
"loss": 0.0082,
"step": 7490
},
{
"epoch": 1.2021319227378375,
"grad_norm": 0.03776608407497406,
"learning_rate": 8.224342876617887e-06,
"loss": 0.0077,
"step": 7500
},
{
"epoch": 1.2037348721647831,
"grad_norm": 0.04678969085216522,
"learning_rate": 8.196814510164416e-06,
"loss": 0.008,
"step": 7510
},
{
"epoch": 1.2053378215917288,
"grad_norm": 0.04679039865732193,
"learning_rate": 8.169300260471818e-06,
"loss": 0.0081,
"step": 7520
},
{
"epoch": 1.2069407710186744,
"grad_norm": 0.05291286110877991,
"learning_rate": 8.141800342943375e-06,
"loss": 0.008,
"step": 7530
},
{
"epoch": 1.20854372044562,
"grad_norm": 0.0398259200155735,
"learning_rate": 8.114314972870179e-06,
"loss": 0.0081,
"step": 7540
},
{
"epoch": 1.2101466698725656,
"grad_norm": 0.046976324170827866,
"learning_rate": 8.086844365429421e-06,
"loss": 0.0085,
"step": 7550
},
{
"epoch": 1.2117496192995112,
"grad_norm": 0.032477136701345444,
"learning_rate": 8.059388735682723e-06,
"loss": 0.0071,
"step": 7560
},
{
"epoch": 1.2133525687264566,
"grad_norm": 0.04444463923573494,
"learning_rate": 8.031948298574452e-06,
"loss": 0.0077,
"step": 7570
},
{
"epoch": 1.2149555181534022,
"grad_norm": 0.055053021758794785,
"learning_rate": 8.00452326893003e-06,
"loss": 0.0076,
"step": 7580
},
{
"epoch": 1.2165584675803478,
"grad_norm": 0.04440735653042793,
"learning_rate": 7.977113861454265e-06,
"loss": 0.0077,
"step": 7590
},
{
"epoch": 1.2181614170072934,
"grad_norm": 0.038958437740802765,
"learning_rate": 7.949720290729649e-06,
"loss": 0.0077,
"step": 7600
},
{
"epoch": 1.219764366434239,
"grad_norm": 0.04088424891233444,
"learning_rate": 7.922342771214707e-06,
"loss": 0.0073,
"step": 7610
},
{
"epoch": 1.2213673158611846,
"grad_norm": 0.048896338790655136,
"learning_rate": 7.894981517242293e-06,
"loss": 0.0073,
"step": 7620
},
{
"epoch": 1.2229702652881302,
"grad_norm": 0.03298410400748253,
"learning_rate": 7.867636743017919e-06,
"loss": 0.0081,
"step": 7630
},
{
"epoch": 1.2245732147150759,
"grad_norm": 0.05103585496544838,
"learning_rate": 7.840308662618096e-06,
"loss": 0.0075,
"step": 7640
},
{
"epoch": 1.2261761641420212,
"grad_norm": 0.0347796194255352,
"learning_rate": 7.812997489988622e-06,
"loss": 0.0079,
"step": 7650
},
{
"epoch": 1.2277791135689669,
"grad_norm": 0.04016980156302452,
"learning_rate": 7.785703438942941e-06,
"loss": 0.0077,
"step": 7660
},
{
"epoch": 1.2293820629959125,
"grad_norm": 0.049017369747161865,
"learning_rate": 7.75842672316045e-06,
"loss": 0.0066,
"step": 7670
},
{
"epoch": 1.230985012422858,
"grad_norm": 0.050889965146780014,
"learning_rate": 7.731167556184836e-06,
"loss": 0.0074,
"step": 7680
},
{
"epoch": 1.2325879618498037,
"grad_norm": 0.04382390156388283,
"learning_rate": 7.7039261514224e-06,
"loss": 0.0072,
"step": 7690
},
{
"epoch": 1.2341909112767493,
"grad_norm": 0.042560938745737076,
"learning_rate": 7.676702722140378e-06,
"loss": 0.007,
"step": 7700
},
{
"epoch": 1.2357938607036947,
"grad_norm": 0.047447387129068375,
"learning_rate": 7.649497481465291e-06,
"loss": 0.0077,
"step": 7710
},
{
"epoch": 1.2373968101306403,
"grad_norm": 0.03900102153420448,
"learning_rate": 7.622310642381261e-06,
"loss": 0.0072,
"step": 7720
},
{
"epoch": 1.238999759557586,
"grad_norm": 0.05226941406726837,
"learning_rate": 7.595142417728344e-06,
"loss": 0.0082,
"step": 7730
},
{
"epoch": 1.2406027089845315,
"grad_norm": 0.045353468507528305,
"learning_rate": 7.56799302020087e-06,
"loss": 0.0079,
"step": 7740
},
{
"epoch": 1.2422056584114771,
"grad_norm": 0.0446421317756176,
"learning_rate": 7.54086266234578e-06,
"loss": 0.0083,
"step": 7750
},
{
"epoch": 1.2438086078384227,
"grad_norm": 0.037011098116636276,
"learning_rate": 7.513751556560951e-06,
"loss": 0.0078,
"step": 7760
},
{
"epoch": 1.2454115572653683,
"grad_norm": 0.04144198074936867,
"learning_rate": 7.486659915093537e-06,
"loss": 0.008,
"step": 7770
},
{
"epoch": 1.247014506692314,
"grad_norm": 0.03886372596025467,
"learning_rate": 7.459587950038325e-06,
"loss": 0.007,
"step": 7780
},
{
"epoch": 1.2486174561192596,
"grad_norm": 0.045625023543834686,
"learning_rate": 7.432535873336046e-06,
"loss": 0.0074,
"step": 7790
},
{
"epoch": 1.250220405546205,
"grad_norm": 0.04141170531511307,
"learning_rate": 7.4055038967717286e-06,
"loss": 0.0083,
"step": 7800
},
{
"epoch": 1.2518233549731506,
"grad_norm": 0.0399341844022274,
"learning_rate": 7.378492231973044e-06,
"loss": 0.0073,
"step": 7810
},
{
"epoch": 1.2534263044000962,
"grad_norm": 0.055795952677726746,
"learning_rate": 7.351501090408658e-06,
"loss": 0.0079,
"step": 7820
},
{
"epoch": 1.2550292538270418,
"grad_norm": 0.059044260531663895,
"learning_rate": 7.324530683386549e-06,
"loss": 0.0078,
"step": 7830
},
{
"epoch": 1.2566322032539874,
"grad_norm": 0.03669803962111473,
"learning_rate": 7.297581222052373e-06,
"loss": 0.0078,
"step": 7840
},
{
"epoch": 1.2582351526809328,
"grad_norm": 0.036892443895339966,
"learning_rate": 7.270652917387812e-06,
"loss": 0.007,
"step": 7850
},
{
"epoch": 1.2598381021078784,
"grad_norm": 0.0762961283326149,
"learning_rate": 7.243745980208915e-06,
"loss": 0.0074,
"step": 7860
},
{
"epoch": 1.261441051534824,
"grad_norm": 0.04703805595636368,
"learning_rate": 7.2168606211644435e-06,
"loss": 0.0081,
"step": 7870
},
{
"epoch": 1.2630440009617696,
"grad_norm": 0.07492291182279587,
"learning_rate": 7.189997050734232e-06,
"loss": 0.0076,
"step": 7880
},
{
"epoch": 1.2646469503887152,
"grad_norm": 0.045667752623558044,
"learning_rate": 7.16315547922754e-06,
"loss": 0.0076,
"step": 7890
},
{
"epoch": 1.2662498998156608,
"grad_norm": 0.0335981622338295,
"learning_rate": 7.1363361167814e-06,
"loss": 0.0088,
"step": 7900
},
{
"epoch": 1.2678528492426064,
"grad_norm": 0.038948290050029755,
"learning_rate": 7.109539173358968e-06,
"loss": 0.0071,
"step": 7910
},
{
"epoch": 1.269455798669552,
"grad_norm": 0.0461229644715786,
"learning_rate": 7.082764858747899e-06,
"loss": 0.0075,
"step": 7920
},
{
"epoch": 1.2710587480964977,
"grad_norm": 0.037394460290670395,
"learning_rate": 7.056013382558683e-06,
"loss": 0.0076,
"step": 7930
},
{
"epoch": 1.2726616975234433,
"grad_norm": 0.03177995607256889,
"learning_rate": 7.02928495422301e-06,
"loss": 0.007,
"step": 7940
},
{
"epoch": 1.2742646469503887,
"grad_norm": 0.037877414375543594,
"learning_rate": 7.002579782992138e-06,
"loss": 0.0075,
"step": 7950
},
{
"epoch": 1.2758675963773343,
"grad_norm": 0.04539757966995239,
"learning_rate": 6.975898077935255e-06,
"loss": 0.0076,
"step": 7960
},
{
"epoch": 1.2774705458042799,
"grad_norm": 0.04658060148358345,
"learning_rate": 6.949240047937828e-06,
"loss": 0.0075,
"step": 7970
},
{
"epoch": 1.2790734952312255,
"grad_norm": 0.051648322492837906,
"learning_rate": 6.922605901699978e-06,
"loss": 0.0073,
"step": 7980
},
{
"epoch": 1.280676444658171,
"grad_norm": 0.05647370219230652,
"learning_rate": 6.895995847734853e-06,
"loss": 0.008,
"step": 7990
},
{
"epoch": 1.2822793940851165,
"grad_norm": 0.03840125352144241,
"learning_rate": 6.8694100943669815e-06,
"loss": 0.0077,
"step": 8000
},
{
"epoch": 1.283882343512062,
"grad_norm": 0.03707459568977356,
"learning_rate": 6.842848849730647e-06,
"loss": 0.0076,
"step": 8010
},
{
"epoch": 1.2854852929390077,
"grad_norm": 0.04414204880595207,
"learning_rate": 6.8163123217682584e-06,
"loss": 0.0072,
"step": 8020
},
{
"epoch": 1.2870882423659533,
"grad_norm": 0.051417384296655655,
"learning_rate": 6.7898007182287294e-06,
"loss": 0.0074,
"step": 8030
},
{
"epoch": 1.288691191792899,
"grad_norm": 0.050453051924705505,
"learning_rate": 6.763314246665842e-06,
"loss": 0.0074,
"step": 8040
},
{
"epoch": 1.2902941412198445,
"grad_norm": 0.05048830434679985,
"learning_rate": 6.736853114436619e-06,
"loss": 0.0075,
"step": 8050
},
{
"epoch": 1.2918970906467901,
"grad_norm": 0.04227893427014351,
"learning_rate": 6.710417528699722e-06,
"loss": 0.0068,
"step": 8060
},
{
"epoch": 1.2935000400737358,
"grad_norm": 0.043056413531303406,
"learning_rate": 6.684007696413799e-06,
"loss": 0.0078,
"step": 8070
},
{
"epoch": 1.2951029895006814,
"grad_norm": 0.08075862377882004,
"learning_rate": 6.6576238243358905e-06,
"loss": 0.0073,
"step": 8080
},
{
"epoch": 1.2967059389276268,
"grad_norm": 0.04923049733042717,
"learning_rate": 6.631266119019786e-06,
"loss": 0.0068,
"step": 8090
},
{
"epoch": 1.2983088883545724,
"grad_norm": 0.04691435396671295,
"learning_rate": 6.604934786814439e-06,
"loss": 0.0076,
"step": 8100
},
{
"epoch": 1.299911837781518,
"grad_norm": 0.053735796362161636,
"learning_rate": 6.578630033862324e-06,
"loss": 0.0079,
"step": 8110
},
{
"epoch": 1.3015147872084636,
"grad_norm": 0.04722294583916664,
"learning_rate": 6.552352066097829e-06,
"loss": 0.008,
"step": 8120
},
{
"epoch": 1.3031177366354092,
"grad_norm": 0.037270933389663696,
"learning_rate": 6.5261010892456515e-06,
"loss": 0.0075,
"step": 8130
},
{
"epoch": 1.3047206860623548,
"grad_norm": 0.04054490104317665,
"learning_rate": 6.499877308819184e-06,
"loss": 0.0082,
"step": 8140
},
{
"epoch": 1.3063236354893002,
"grad_norm": 0.04223480075597763,
"learning_rate": 6.473680930118899e-06,
"loss": 0.0081,
"step": 8150
},
{
"epoch": 1.3079265849162458,
"grad_norm": 0.05101209506392479,
"learning_rate": 6.447512158230746e-06,
"loss": 0.0069,
"step": 8160
},
{
"epoch": 1.3095295343431914,
"grad_norm": 0.05035392940044403,
"learning_rate": 6.42137119802456e-06,
"loss": 0.0072,
"step": 8170
},
{
"epoch": 1.311132483770137,
"grad_norm": 0.04680299758911133,
"learning_rate": 6.3952582541524235e-06,
"loss": 0.0075,
"step": 8180
},
{
"epoch": 1.3127354331970826,
"grad_norm": 0.044884469360113144,
"learning_rate": 6.369173531047099e-06,
"loss": 0.0071,
"step": 8190
},
{
"epoch": 1.3143383826240282,
"grad_norm": 0.03105269744992256,
"learning_rate": 6.343117232920407e-06,
"loss": 0.0072,
"step": 8200
},
{
"epoch": 1.3159413320509739,
"grad_norm": 0.041606560349464417,
"learning_rate": 6.317089563761647e-06,
"loss": 0.0076,
"step": 8210
},
{
"epoch": 1.3175442814779195,
"grad_norm": 0.03596537187695503,
"learning_rate": 6.291090727335974e-06,
"loss": 0.0079,
"step": 8220
},
{
"epoch": 1.319147230904865,
"grad_norm": 0.05035543814301491,
"learning_rate": 6.265120927182824e-06,
"loss": 0.0076,
"step": 8230
},
{
"epoch": 1.3207501803318105,
"grad_norm": 0.036603864282369614,
"learning_rate": 6.2391803666143145e-06,
"loss": 0.0081,
"step": 8240
},
{
"epoch": 1.322353129758756,
"grad_norm": 0.04323893412947655,
"learning_rate": 6.213269248713653e-06,
"loss": 0.0078,
"step": 8250
},
{
"epoch": 1.3239560791857017,
"grad_norm": 0.03787427023053169,
"learning_rate": 6.187387776333542e-06,
"loss": 0.0073,
"step": 8260
},
{
"epoch": 1.3255590286126473,
"grad_norm": 0.04349483549594879,
"learning_rate": 6.161536152094598e-06,
"loss": 0.0076,
"step": 8270
},
{
"epoch": 1.327161978039593,
"grad_norm": 0.043585531413555145,
"learning_rate": 6.135714578383769e-06,
"loss": 0.0072,
"step": 8280
},
{
"epoch": 1.3287649274665383,
"grad_norm": 0.03276698291301727,
"learning_rate": 6.109923257352732e-06,
"loss": 0.0077,
"step": 8290
},
{
"epoch": 1.330367876893484,
"grad_norm": 0.04165283590555191,
"learning_rate": 6.084162390916328e-06,
"loss": 0.008,
"step": 8300
},
{
"epoch": 1.3319708263204295,
"grad_norm": 0.05153461545705795,
"learning_rate": 6.0584321807509825e-06,
"loss": 0.0073,
"step": 8310
},
{
"epoch": 1.3335737757473751,
"grad_norm": 0.049016524106264114,
"learning_rate": 6.032732828293106e-06,
"loss": 0.0071,
"step": 8320
},
{
"epoch": 1.3351767251743207,
"grad_norm": 0.042667657136917114,
"learning_rate": 6.007064534737538e-06,
"loss": 0.007,
"step": 8330
},
{
"epoch": 1.3367796746012663,
"grad_norm": 0.053493741899728775,
"learning_rate": 5.981427501035959e-06,
"loss": 0.008,
"step": 8340
},
{
"epoch": 1.338382624028212,
"grad_norm": 0.04648403078317642,
"learning_rate": 5.955821927895337e-06,
"loss": 0.0072,
"step": 8350
},
{
"epoch": 1.3399855734551576,
"grad_norm": 0.039376430213451385,
"learning_rate": 5.930248015776325e-06,
"loss": 0.0072,
"step": 8360
},
{
"epoch": 1.3415885228821032,
"grad_norm": 0.043529435992240906,
"learning_rate": 5.904705964891715e-06,
"loss": 0.0073,
"step": 8370
},
{
"epoch": 1.3431914723090488,
"grad_norm": 0.05400298163294792,
"learning_rate": 5.8791959752048675e-06,
"loss": 0.0073,
"step": 8380
},
{
"epoch": 1.3447944217359942,
"grad_norm": 0.04469398036599159,
"learning_rate": 5.853718246428137e-06,
"loss": 0.0072,
"step": 8390
},
{
"epoch": 1.3463973711629398,
"grad_norm": 0.053977783769369125,
"learning_rate": 5.828272978021319e-06,
"loss": 0.0078,
"step": 8400
},
{
"epoch": 1.3480003205898854,
"grad_norm": 0.0440487302839756,
"learning_rate": 5.802860369190076e-06,
"loss": 0.0072,
"step": 8410
},
{
"epoch": 1.349603270016831,
"grad_norm": 0.03750430420041084,
"learning_rate": 5.7774806188843955e-06,
"loss": 0.0078,
"step": 8420
},
{
"epoch": 1.3512062194437766,
"grad_norm": 0.03685884550213814,
"learning_rate": 5.7521339257970196e-06,
"loss": 0.0076,
"step": 8430
},
{
"epoch": 1.352809168870722,
"grad_norm": 0.04150310531258583,
"learning_rate": 5.7268204883618836e-06,
"loss": 0.0077,
"step": 8440
},
{
"epoch": 1.3544121182976676,
"grad_norm": 0.03886445239186287,
"learning_rate": 5.701540504752583e-06,
"loss": 0.0086,
"step": 8450
},
{
"epoch": 1.3560150677246132,
"grad_norm": 0.043425098061561584,
"learning_rate": 5.6762941728808065e-06,
"loss": 0.0074,
"step": 8460
},
{
"epoch": 1.3576180171515588,
"grad_norm": 0.03988328576087952,
"learning_rate": 5.651081690394784e-06,
"loss": 0.0069,
"step": 8470
},
{
"epoch": 1.3592209665785044,
"grad_norm": 0.04670213907957077,
"learning_rate": 5.625903254677753e-06,
"loss": 0.0086,
"step": 8480
},
{
"epoch": 1.36082391600545,
"grad_norm": 0.04091748967766762,
"learning_rate": 5.60075906284641e-06,
"loss": 0.0074,
"step": 8490
},
{
"epoch": 1.3624268654323957,
"grad_norm": 0.046860676258802414,
"learning_rate": 5.575649311749348e-06,
"loss": 0.0072,
"step": 8500
},
{
"epoch": 1.3640298148593413,
"grad_norm": 0.036616772413253784,
"learning_rate": 5.550574197965545e-06,
"loss": 0.0067,
"step": 8510
},
{
"epoch": 1.3656327642862869,
"grad_norm": 0.05461053550243378,
"learning_rate": 5.525533917802806e-06,
"loss": 0.0074,
"step": 8520
},
{
"epoch": 1.3672357137132323,
"grad_norm": 0.03603074327111244,
"learning_rate": 5.500528667296232e-06,
"loss": 0.0076,
"step": 8530
},
{
"epoch": 1.3688386631401779,
"grad_norm": 0.046030059456825256,
"learning_rate": 5.4755586422066805e-06,
"loss": 0.0084,
"step": 8540
},
{
"epoch": 1.3704416125671235,
"grad_norm": 0.0493701696395874,
"learning_rate": 5.450624038019232e-06,
"loss": 0.0071,
"step": 8550
},
{
"epoch": 1.372044561994069,
"grad_norm": 0.04193172603845596,
"learning_rate": 5.425725049941686e-06,
"loss": 0.0078,
"step": 8560
},
{
"epoch": 1.3736475114210147,
"grad_norm": 0.03369879722595215,
"learning_rate": 5.4008618729029846e-06,
"loss": 0.0074,
"step": 8570
},
{
"epoch": 1.3752504608479603,
"grad_norm": 0.05988788977265358,
"learning_rate": 5.376034701551729e-06,
"loss": 0.007,
"step": 8580
},
{
"epoch": 1.3768534102749057,
"grad_norm": 0.058902859687805176,
"learning_rate": 5.3512437302546365e-06,
"loss": 0.0077,
"step": 8590
},
{
"epoch": 1.3784563597018513,
"grad_norm": 0.05228348448872566,
"learning_rate": 5.326489153095011e-06,
"loss": 0.0074,
"step": 8600
},
{
"epoch": 1.380059309128797,
"grad_norm": 0.03920851647853851,
"learning_rate": 5.301771163871257e-06,
"loss": 0.0073,
"step": 8610
},
{
"epoch": 1.3816622585557425,
"grad_norm": 0.040967486798763275,
"learning_rate": 5.277089956095312e-06,
"loss": 0.0076,
"step": 8620
},
{
"epoch": 1.3832652079826881,
"grad_norm": 0.04195632413029671,
"learning_rate": 5.25244572299118e-06,
"loss": 0.0073,
"step": 8630
},
{
"epoch": 1.3848681574096338,
"grad_norm": 0.04842723160982132,
"learning_rate": 5.227838657493396e-06,
"loss": 0.0072,
"step": 8640
},
{
"epoch": 1.3864711068365794,
"grad_norm": 0.035918042063713074,
"learning_rate": 5.2032689522455e-06,
"loss": 0.007,
"step": 8650
},
{
"epoch": 1.388074056263525,
"grad_norm": 0.04247179999947548,
"learning_rate": 5.178736799598574e-06,
"loss": 0.0076,
"step": 8660
},
{
"epoch": 1.3896770056904706,
"grad_norm": 0.03688638284802437,
"learning_rate": 5.154242391609683e-06,
"loss": 0.0074,
"step": 8670
},
{
"epoch": 1.391279955117416,
"grad_norm": 0.04736698791384697,
"learning_rate": 5.129785920040416e-06,
"loss": 0.0085,
"step": 8680
},
{
"epoch": 1.3928829045443616,
"grad_norm": 0.041601404547691345,
"learning_rate": 5.105367576355351e-06,
"loss": 0.0072,
"step": 8690
},
{
"epoch": 1.3944858539713072,
"grad_norm": 0.03648248687386513,
"learning_rate": 5.08098755172058e-06,
"loss": 0.0064,
"step": 8700
},
{
"epoch": 1.3960888033982528,
"grad_norm": 0.04284751042723656,
"learning_rate": 5.056646037002205e-06,
"loss": 0.0068,
"step": 8710
},
{
"epoch": 1.3976917528251984,
"grad_norm": 0.047767430543899536,
"learning_rate": 5.03234322276483e-06,
"loss": 0.0081,
"step": 8720
},
{
"epoch": 1.3992947022521438,
"grad_norm": 0.06886753439903259,
"learning_rate": 5.008079299270091e-06,
"loss": 0.0076,
"step": 8730
},
{
"epoch": 1.4008976516790894,
"grad_norm": 0.03830573335289955,
"learning_rate": 4.98385445647516e-06,
"loss": 0.0072,
"step": 8740
},
{
"epoch": 1.402500601106035,
"grad_norm": 0.04586140811443329,
"learning_rate": 4.95966888403124e-06,
"loss": 0.0071,
"step": 8750
},
{
"epoch": 1.4041035505329806,
"grad_norm": 0.04326622560620308,
"learning_rate": 4.935522771282108e-06,
"loss": 0.0076,
"step": 8760
},
{
"epoch": 1.4057064999599262,
"grad_norm": 0.0408218689262867,
"learning_rate": 4.911416307262617e-06,
"loss": 0.0077,
"step": 8770
},
{
"epoch": 1.4073094493868719,
"grad_norm": 0.037510719150304794,
"learning_rate": 4.887349680697208e-06,
"loss": 0.0069,
"step": 8780
},
{
"epoch": 1.4089123988138175,
"grad_norm": 0.04716578498482704,
"learning_rate": 4.863323079998456e-06,
"loss": 0.0079,
"step": 8790
},
{
"epoch": 1.410515348240763,
"grad_norm": 0.043064896017313004,
"learning_rate": 4.8393366932655774e-06,
"loss": 0.0081,
"step": 8800
},
{
"epoch": 1.4121182976677087,
"grad_norm": 0.03494073450565338,
"learning_rate": 4.815390708282964e-06,
"loss": 0.0075,
"step": 8810
},
{
"epoch": 1.4137212470946543,
"grad_norm": 0.04706864804029465,
"learning_rate": 4.791485312518701e-06,
"loss": 0.0074,
"step": 8820
},
{
"epoch": 1.4153241965215997,
"grad_norm": 0.0434531532227993,
"learning_rate": 4.767620693123119e-06,
"loss": 0.0073,
"step": 8830
},
{
"epoch": 1.4169271459485453,
"grad_norm": 0.03783673048019409,
"learning_rate": 4.7437970369273216e-06,
"loss": 0.0077,
"step": 8840
},
{
"epoch": 1.418530095375491,
"grad_norm": 0.03731882572174072,
"learning_rate": 4.720014530441705e-06,
"loss": 0.0081,
"step": 8850
},
{
"epoch": 1.4201330448024365,
"grad_norm": 0.04272478446364403,
"learning_rate": 4.696273359854528e-06,
"loss": 0.0072,
"step": 8860
},
{
"epoch": 1.4217359942293821,
"grad_norm": 0.0567503459751606,
"learning_rate": 4.672573711030438e-06,
"loss": 0.0071,
"step": 8870
},
{
"epoch": 1.4233389436563275,
"grad_norm": 0.04293932020664215,
"learning_rate": 4.6489157695090045e-06,
"loss": 0.007,
"step": 8880
},
{
"epoch": 1.4249418930832731,
"grad_norm": 0.037547074258327484,
"learning_rate": 4.625299720503297e-06,
"loss": 0.0068,
"step": 8890
},
{
"epoch": 1.4265448425102187,
"grad_norm": 0.039693210273981094,
"learning_rate": 4.601725748898395e-06,
"loss": 0.007,
"step": 8900
},
{
"epoch": 1.4281477919371643,
"grad_norm": 0.048341087996959686,
"learning_rate": 4.578194039249992e-06,
"loss": 0.0068,
"step": 8910
},
{
"epoch": 1.42975074136411,
"grad_norm": 0.046204425394535065,
"learning_rate": 4.554704775782899e-06,
"loss": 0.0075,
"step": 8920
},
{
"epoch": 1.4313536907910556,
"grad_norm": 0.03958141803741455,
"learning_rate": 4.531258142389622e-06,
"loss": 0.0075,
"step": 8930
},
{
"epoch": 1.4329566402180012,
"grad_norm": 0.039660900831222534,
"learning_rate": 4.5078543226289505e-06,
"loss": 0.0072,
"step": 8940
},
{
"epoch": 1.4345595896449468,
"grad_norm": 0.04643959179520607,
"learning_rate": 4.484493499724468e-06,
"loss": 0.007,
"step": 8950
},
{
"epoch": 1.4361625390718924,
"grad_norm": 0.03679412230849266,
"learning_rate": 4.461175856563164e-06,
"loss": 0.0068,
"step": 8960
},
{
"epoch": 1.4377654884988378,
"grad_norm": 0.03700454905629158,
"learning_rate": 4.4379015756939646e-06,
"loss": 0.0064,
"step": 8970
},
{
"epoch": 1.4393684379257834,
"grad_norm": 0.03768878057599068,
"learning_rate": 4.414670839326337e-06,
"loss": 0.0067,
"step": 8980
},
{
"epoch": 1.440971387352729,
"grad_norm": 0.056058432906866074,
"learning_rate": 4.391483829328845e-06,
"loss": 0.0069,
"step": 8990
},
{
"epoch": 1.4425743367796746,
"grad_norm": 0.053892601281404495,
"learning_rate": 4.368340727227719e-06,
"loss": 0.0071,
"step": 9000
},
{
"epoch": 1.4441772862066202,
"grad_norm": 0.053902119398117065,
"learning_rate": 4.345241714205452e-06,
"loss": 0.0078,
"step": 9010
},
{
"epoch": 1.4457802356335658,
"grad_norm": 0.044995274394750595,
"learning_rate": 4.322186971099373e-06,
"loss": 0.007,
"step": 9020
},
{
"epoch": 1.4473831850605112,
"grad_norm": 0.04146160930395126,
"learning_rate": 4.299176678400225e-06,
"loss": 0.0072,
"step": 9030
},
{
"epoch": 1.4489861344874568,
"grad_norm": 0.03394712135195732,
"learning_rate": 4.276211016250763e-06,
"loss": 0.0078,
"step": 9040
},
{
"epoch": 1.4505890839144024,
"grad_norm": 0.044879477471113205,
"learning_rate": 4.253290164444337e-06,
"loss": 0.0068,
"step": 9050
},
{
"epoch": 1.452192033341348,
"grad_norm": 0.047636158764362335,
"learning_rate": 4.230414302423491e-06,
"loss": 0.0067,
"step": 9060
},
{
"epoch": 1.4537949827682937,
"grad_norm": 0.05139080807566643,
"learning_rate": 4.207583609278543e-06,
"loss": 0.0065,
"step": 9070
},
{
"epoch": 1.4553979321952393,
"grad_norm": 0.03766312450170517,
"learning_rate": 4.184798263746201e-06,
"loss": 0.0075,
"step": 9080
},
{
"epoch": 1.4570008816221849,
"grad_norm": 0.0382857508957386,
"learning_rate": 4.162058444208159e-06,
"loss": 0.0067,
"step": 9090
},
{
"epoch": 1.4586038310491305,
"grad_norm": 0.057315412908792496,
"learning_rate": 4.139364328689687e-06,
"loss": 0.007,
"step": 9100
},
{
"epoch": 1.460206780476076,
"grad_norm": 0.056077856570482254,
"learning_rate": 4.116716094858255e-06,
"loss": 0.0072,
"step": 9110
},
{
"epoch": 1.4618097299030215,
"grad_norm": 0.03596881777048111,
"learning_rate": 4.0941139200221414e-06,
"loss": 0.007,
"step": 9120
},
{
"epoch": 1.463412679329967,
"grad_norm": 0.04506804421544075,
"learning_rate": 4.071557981129019e-06,
"loss": 0.0069,
"step": 9130
},
{
"epoch": 1.4650156287569127,
"grad_norm": 0.04154985398054123,
"learning_rate": 4.049048454764608e-06,
"loss": 0.0072,
"step": 9140
},
{
"epoch": 1.4666185781838583,
"grad_norm": 0.052859582006931305,
"learning_rate": 4.02658551715127e-06,
"loss": 0.0069,
"step": 9150
},
{
"epoch": 1.468221527610804,
"grad_norm": 0.033961955457925797,
"learning_rate": 4.004169344146623e-06,
"loss": 0.0077,
"step": 9160
},
{
"epoch": 1.4698244770377493,
"grad_norm": 0.0423002727329731,
"learning_rate": 3.98180011124219e-06,
"loss": 0.0074,
"step": 9170
},
{
"epoch": 1.471427426464695,
"grad_norm": 0.053169313818216324,
"learning_rate": 3.9594779935619895e-06,
"loss": 0.0083,
"step": 9180
},
{
"epoch": 1.4730303758916405,
"grad_norm": 0.036727722734212875,
"learning_rate": 3.937203165861215e-06,
"loss": 0.007,
"step": 9190
},
{
"epoch": 1.4746333253185862,
"grad_norm": 0.039678290486335754,
"learning_rate": 3.914975802524806e-06,
"loss": 0.0066,
"step": 9200
},
{
"epoch": 1.4762362747455318,
"grad_norm": 0.040238771587610245,
"learning_rate": 3.892796077566131e-06,
"loss": 0.0069,
"step": 9210
},
{
"epoch": 1.4778392241724774,
"grad_norm": 0.03908219188451767,
"learning_rate": 3.870664164625606e-06,
"loss": 0.0075,
"step": 9220
},
{
"epoch": 1.479442173599423,
"grad_norm": 0.056258101016283035,
"learning_rate": 3.848580236969327e-06,
"loss": 0.0069,
"step": 9230
},
{
"epoch": 1.4810451230263686,
"grad_norm": 0.05250425264239311,
"learning_rate": 3.826544467487737e-06,
"loss": 0.0066,
"step": 9240
},
{
"epoch": 1.4826480724533142,
"grad_norm": 0.043162960559129715,
"learning_rate": 3.8045570286942455e-06,
"loss": 0.007,
"step": 9250
},
{
"epoch": 1.4842510218802598,
"grad_norm": 0.05024382844567299,
"learning_rate": 3.782618092723902e-06,
"loss": 0.0068,
"step": 9260
},
{
"epoch": 1.4858539713072052,
"grad_norm": 0.0474337600171566,
"learning_rate": 3.760727831332034e-06,
"loss": 0.0075,
"step": 9270
},
{
"epoch": 1.4874569207341508,
"grad_norm": 0.051519252359867096,
"learning_rate": 3.738886415892897e-06,
"loss": 0.0072,
"step": 9280
},
{
"epoch": 1.4890598701610964,
"grad_norm": 0.04442744702100754,
"learning_rate": 3.7170940173983627e-06,
"loss": 0.0067,
"step": 9290
},
{
"epoch": 1.490662819588042,
"grad_norm": 0.04417699947953224,
"learning_rate": 3.69535080645654e-06,
"loss": 0.0078,
"step": 9300
},
{
"epoch": 1.4922657690149876,
"grad_norm": 0.030911916866898537,
"learning_rate": 3.673656953290462e-06,
"loss": 0.0067,
"step": 9310
},
{
"epoch": 1.493868718441933,
"grad_norm": 0.041102729737758636,
"learning_rate": 3.652012627736756e-06,
"loss": 0.0064,
"step": 9320
},
{
"epoch": 1.4954716678688786,
"grad_norm": 0.051470912992954254,
"learning_rate": 3.6304179992443065e-06,
"loss": 0.0071,
"step": 9330
},
{
"epoch": 1.4970746172958243,
"grad_norm": 0.04753391444683075,
"learning_rate": 3.608873236872934e-06,
"loss": 0.0068,
"step": 9340
},
{
"epoch": 1.4986775667227699,
"grad_norm": 0.04698888957500458,
"learning_rate": 3.587378509292053e-06,
"loss": 0.0061,
"step": 9350
},
{
"epoch": 1.5002805161497155,
"grad_norm": 0.04583842307329178,
"learning_rate": 3.5659339847793805e-06,
"loss": 0.0064,
"step": 9360
},
{
"epoch": 1.501883465576661,
"grad_norm": 0.03985747694969177,
"learning_rate": 3.5445398312196046e-06,
"loss": 0.0065,
"step": 9370
},
{
"epoch": 1.5034864150036067,
"grad_norm": 0.039111629128456116,
"learning_rate": 3.5231962161030554e-06,
"loss": 0.0074,
"step": 9380
},
{
"epoch": 1.5050893644305523,
"grad_norm": 0.03803228959441185,
"learning_rate": 3.5019033065244225e-06,
"loss": 0.007,
"step": 9390
},
{
"epoch": 1.506692313857498,
"grad_norm": 0.05284256488084793,
"learning_rate": 3.48066126918143e-06,
"loss": 0.007,
"step": 9400
},
{
"epoch": 1.5082952632844435,
"grad_norm": 0.04431462287902832,
"learning_rate": 3.459470270373525e-06,
"loss": 0.0072,
"step": 9410
},
{
"epoch": 1.509898212711389,
"grad_norm": 0.041640907526016235,
"learning_rate": 3.4383304760005952e-06,
"loss": 0.0067,
"step": 9420
},
{
"epoch": 1.5115011621383345,
"grad_norm": 0.0580732598900795,
"learning_rate": 3.4172420515616543e-06,
"loss": 0.0071,
"step": 9430
},
{
"epoch": 1.5131041115652801,
"grad_norm": 0.042141567915678024,
"learning_rate": 3.396205162153556e-06,
"loss": 0.0066,
"step": 9440
},
{
"epoch": 1.5147070609922257,
"grad_norm": 0.037412770092487335,
"learning_rate": 3.375219972469692e-06,
"loss": 0.0068,
"step": 9450
},
{
"epoch": 1.5163100104191711,
"grad_norm": 0.04163951054215431,
"learning_rate": 3.3542866467987003e-06,
"loss": 0.0066,
"step": 9460
},
{
"epoch": 1.5179129598461167,
"grad_norm": 0.04390928894281387,
"learning_rate": 3.333405349023211e-06,
"loss": 0.0057,
"step": 9470
},
{
"epoch": 1.5195159092730623,
"grad_norm": 0.052741654217243195,
"learning_rate": 3.3125762426185114e-06,
"loss": 0.007,
"step": 9480
},
{
"epoch": 1.521118858700008,
"grad_norm": 0.041966211050748825,
"learning_rate": 3.2917994906513095e-06,
"loss": 0.0058,
"step": 9490
},
{
"epoch": 1.5227218081269536,
"grad_norm": 0.04255704954266548,
"learning_rate": 3.271075255778442e-06,
"loss": 0.0073,
"step": 9500
},
{
"epoch": 1.5243247575538992,
"grad_norm": 0.04274579510092735,
"learning_rate": 3.250403700245586e-06,
"loss": 0.0065,
"step": 9510
},
{
"epoch": 1.5259277069808448,
"grad_norm": 0.06175905093550682,
"learning_rate": 3.229784985886022e-06,
"loss": 0.0063,
"step": 9520
},
{
"epoch": 1.5275306564077904,
"grad_norm": 0.042121000587940216,
"learning_rate": 3.2092192741193295e-06,
"loss": 0.0063,
"step": 9530
},
{
"epoch": 1.529133605834736,
"grad_norm": 0.042963907122612,
"learning_rate": 3.188706725950157e-06,
"loss": 0.0071,
"step": 9540
},
{
"epoch": 1.5307365552616816,
"grad_norm": 0.04113069176673889,
"learning_rate": 3.1682475019669413e-06,
"loss": 0.0068,
"step": 9550
},
{
"epoch": 1.5323395046886272,
"grad_norm": 0.03678800165653229,
"learning_rate": 3.1478417623406464e-06,
"loss": 0.0066,
"step": 9560
},
{
"epoch": 1.5339424541155726,
"grad_norm": 0.04855935275554657,
"learning_rate": 3.127489666823539e-06,
"loss": 0.0067,
"step": 9570
},
{
"epoch": 1.5355454035425182,
"grad_norm": 0.04056788608431816,
"learning_rate": 3.107191374747893e-06,
"loss": 0.0076,
"step": 9580
},
{
"epoch": 1.5371483529694638,
"grad_norm": 0.03992756828665733,
"learning_rate": 3.0869470450247875e-06,
"loss": 0.0064,
"step": 9590
},
{
"epoch": 1.5387513023964094,
"grad_norm": 0.047048088163137436,
"learning_rate": 3.0667568361428256e-06,
"loss": 0.0067,
"step": 9600
},
{
"epoch": 1.5403542518233548,
"grad_norm": 0.04409756511449814,
"learning_rate": 3.0466209061669184e-06,
"loss": 0.0068,
"step": 9610
},
{
"epoch": 1.5419572012503004,
"grad_norm": 0.07111341506242752,
"learning_rate": 3.0265394127370406e-06,
"loss": 0.0067,
"step": 9620
},
{
"epoch": 1.543560150677246,
"grad_norm": 0.04027624428272247,
"learning_rate": 3.006512513066985e-06,
"loss": 0.0072,
"step": 9630
},
{
"epoch": 1.5451631001041917,
"grad_norm": 0.03571724519133568,
"learning_rate": 2.986540363943149e-06,
"loss": 0.0064,
"step": 9640
},
{
"epoch": 1.5467660495311373,
"grad_norm": 0.04187304526567459,
"learning_rate": 2.966623121723303e-06,
"loss": 0.0075,
"step": 9650
},
{
"epoch": 1.5483689989580829,
"grad_norm": 0.042146991938352585,
"learning_rate": 2.9467609423353504e-06,
"loss": 0.0064,
"step": 9660
},
{
"epoch": 1.5499719483850285,
"grad_norm": 0.04556239768862724,
"learning_rate": 2.9269539812761293e-06,
"loss": 0.0063,
"step": 9670
},
{
"epoch": 1.551574897811974,
"grad_norm": 0.03327028825879097,
"learning_rate": 2.9072023936101847e-06,
"loss": 0.0069,
"step": 9680
},
{
"epoch": 1.5531778472389197,
"grad_norm": 0.0430419035255909,
"learning_rate": 2.887506333968546e-06,
"loss": 0.0067,
"step": 9690
},
{
"epoch": 1.5547807966658653,
"grad_norm": 0.05209411308169365,
"learning_rate": 2.8678659565475363e-06,
"loss": 0.0079,
"step": 9700
},
{
"epoch": 1.556383746092811,
"grad_norm": 0.0382845476269722,
"learning_rate": 2.8482814151075477e-06,
"loss": 0.0068,
"step": 9710
},
{
"epoch": 1.5579866955197563,
"grad_norm": 0.03739907592535019,
"learning_rate": 2.8287528629718507e-06,
"loss": 0.0065,
"step": 9720
},
{
"epoch": 1.559589644946702,
"grad_norm": 0.04491214081645012,
"learning_rate": 2.809280453025376e-06,
"loss": 0.0069,
"step": 9730
},
{
"epoch": 1.5611925943736475,
"grad_norm": 0.047207217663526535,
"learning_rate": 2.7898643377135383e-06,
"loss": 0.0068,
"step": 9740
},
{
"epoch": 1.562795543800593,
"grad_norm": 0.03475257754325867,
"learning_rate": 2.7705046690410344e-06,
"loss": 0.0067,
"step": 9750
},
{
"epoch": 1.5643984932275385,
"grad_norm": 0.05968464910984039,
"learning_rate": 2.751201598570642e-06,
"loss": 0.0072,
"step": 9760
},
{
"epoch": 1.5660014426544842,
"grad_norm": 0.03330584615468979,
"learning_rate": 2.7319552774220517e-06,
"loss": 0.0066,
"step": 9770
},
{
"epoch": 1.5676043920814298,
"grad_norm": 0.03819479048252106,
"learning_rate": 2.712765856270678e-06,
"loss": 0.0069,
"step": 9780
},
{
"epoch": 1.5692073415083754,
"grad_norm": 0.05412070453166962,
"learning_rate": 2.6936334853464676e-06,
"loss": 0.0068,
"step": 9790
},
{
"epoch": 1.570810290935321,
"grad_norm": 0.04212406277656555,
"learning_rate": 2.6745583144327423e-06,
"loss": 0.0068,
"step": 9800
},
{
"epoch": 1.5724132403622666,
"grad_norm": 0.03889830783009529,
"learning_rate": 2.6555404928650055e-06,
"loss": 0.0073,
"step": 9810
},
{
"epoch": 1.5740161897892122,
"grad_norm": 0.03061777725815773,
"learning_rate": 2.6365801695298033e-06,
"loss": 0.0073,
"step": 9820
},
{
"epoch": 1.5756191392161578,
"grad_norm": 0.046813514083623886,
"learning_rate": 2.617677492863524e-06,
"loss": 0.0073,
"step": 9830
},
{
"epoch": 1.5772220886431034,
"grad_norm": 0.04139237850904465,
"learning_rate": 2.5988326108512494e-06,
"loss": 0.0066,
"step": 9840
},
{
"epoch": 1.578825038070049,
"grad_norm": 0.03712620958685875,
"learning_rate": 2.5800456710256207e-06,
"loss": 0.0074,
"step": 9850
},
{
"epoch": 1.5804279874969944,
"grad_norm": 0.04341958463191986,
"learning_rate": 2.561316820465638e-06,
"loss": 0.007,
"step": 9860
},
{
"epoch": 1.58203093692394,
"grad_norm": 0.06013938784599304,
"learning_rate": 2.5426462057955505e-06,
"loss": 0.0064,
"step": 9870
},
{
"epoch": 1.5836338863508856,
"grad_norm": 0.026838280260562897,
"learning_rate": 2.524033973183675e-06,
"loss": 0.0061,
"step": 9880
},
{
"epoch": 1.5852368357778313,
"grad_norm": 0.04112298786640167,
"learning_rate": 2.505480268341278e-06,
"loss": 0.0063,
"step": 9890
},
{
"epoch": 1.5868397852047766,
"grad_norm": 0.03727724403142929,
"learning_rate": 2.4869852365214287e-06,
"loss": 0.006,
"step": 9900
},
{
"epoch": 1.5884427346317223,
"grad_norm": 0.037251487374305725,
"learning_rate": 2.468549022517841e-06,
"loss": 0.0077,
"step": 9910
},
{
"epoch": 1.5900456840586679,
"grad_norm": 0.04826606065034866,
"learning_rate": 2.4501717706637707e-06,
"loss": 0.0068,
"step": 9920
},
{
"epoch": 1.5916486334856135,
"grad_norm": 0.04270663857460022,
"learning_rate": 2.43185362483087e-06,
"loss": 0.0066,
"step": 9930
},
{
"epoch": 1.593251582912559,
"grad_norm": 0.03491262346506119,
"learning_rate": 2.4135947284280523e-06,
"loss": 0.0068,
"step": 9940
},
{
"epoch": 1.5948545323395047,
"grad_norm": 0.03372110426425934,
"learning_rate": 2.395395224400391e-06,
"loss": 0.0066,
"step": 9950
},
{
"epoch": 1.5964574817664503,
"grad_norm": 0.03577246144413948,
"learning_rate": 2.3772552552279837e-06,
"loss": 0.0066,
"step": 9960
},
{
"epoch": 1.598060431193396,
"grad_norm": 0.04551481455564499,
"learning_rate": 2.3591749629248463e-06,
"loss": 0.0066,
"step": 9970
},
{
"epoch": 1.5996633806203415,
"grad_norm": 0.044537000358104706,
"learning_rate": 2.341154489037788e-06,
"loss": 0.0067,
"step": 9980
},
{
"epoch": 1.6012663300472871,
"grad_norm": 0.03701915591955185,
"learning_rate": 2.3231939746453214e-06,
"loss": 0.0068,
"step": 9990
},
{
"epoch": 1.6028692794742327,
"grad_norm": 0.03451233729720116,
"learning_rate": 2.3052935603565464e-06,
"loss": 0.0061,
"step": 10000
},
{
"epoch": 1.6044722289011781,
"grad_norm": 0.0396900400519371,
"learning_rate": 2.287453386310047e-06,
"loss": 0.0068,
"step": 10010
},
{
"epoch": 1.6060751783281237,
"grad_norm": 0.03384820371866226,
"learning_rate": 2.269673592172804e-06,
"loss": 0.0064,
"step": 10020
},
{
"epoch": 1.6076781277550694,
"grad_norm": 0.05970784276723862,
"learning_rate": 2.251954317139099e-06,
"loss": 0.0067,
"step": 10030
},
{
"epoch": 1.609281077182015,
"grad_norm": 0.03140771761536598,
"learning_rate": 2.234295699929413e-06,
"loss": 0.0069,
"step": 10040
},
{
"epoch": 1.6108840266089604,
"grad_norm": 0.04010142758488655,
"learning_rate": 2.2166978787893576e-06,
"loss": 0.0071,
"step": 10050
},
{
"epoch": 1.612486976035906,
"grad_norm": 0.037328608334064484,
"learning_rate": 2.1991609914885857e-06,
"loss": 0.0065,
"step": 10060
},
{
"epoch": 1.6140899254628516,
"grad_norm": 0.04343570023775101,
"learning_rate": 2.1816851753197023e-06,
"loss": 0.0066,
"step": 10070
},
{
"epoch": 1.6156928748897972,
"grad_norm": 0.05846039205789566,
"learning_rate": 2.164270567097212e-06,
"loss": 0.0059,
"step": 10080
},
{
"epoch": 1.6172958243167428,
"grad_norm": 0.0358646959066391,
"learning_rate": 2.1469173031564194e-06,
"loss": 0.0065,
"step": 10090
},
{
"epoch": 1.6188987737436884,
"grad_norm": 0.04640796035528183,
"learning_rate": 2.1296255193523973e-06,
"loss": 0.007,
"step": 10100
},
{
"epoch": 1.620501723170634,
"grad_norm": 0.026891401037573814,
"learning_rate": 2.11239535105889e-06,
"loss": 0.0061,
"step": 10110
},
{
"epoch": 1.6221046725975796,
"grad_norm": 0.045271456241607666,
"learning_rate": 2.0952269331672624e-06,
"loss": 0.007,
"step": 10120
},
{
"epoch": 1.6237076220245252,
"grad_norm": 0.03902266547083855,
"learning_rate": 2.078120400085468e-06,
"loss": 0.0066,
"step": 10130
},
{
"epoch": 1.6253105714514708,
"grad_norm": 0.03035142458975315,
"learning_rate": 2.0610758857369573e-06,
"loss": 0.0072,
"step": 10140
},
{
"epoch": 1.6269135208784162,
"grad_norm": 0.03005078062415123,
"learning_rate": 2.0440935235596613e-06,
"loss": 0.0072,
"step": 10150
},
{
"epoch": 1.6285164703053618,
"grad_norm": 0.04039452224969864,
"learning_rate": 2.0271734465049264e-06,
"loss": 0.0066,
"step": 10160
},
{
"epoch": 1.6301194197323075,
"grad_norm": 0.04712294414639473,
"learning_rate": 2.0103157870364866e-06,
"loss": 0.0067,
"step": 10170
},
{
"epoch": 1.631722369159253,
"grad_norm": 0.034529995173215866,
"learning_rate": 1.9935206771294258e-06,
"loss": 0.0064,
"step": 10180
},
{
"epoch": 1.6333253185861984,
"grad_norm": 0.05219118669629097,
"learning_rate": 1.9767882482691257e-06,
"loss": 0.007,
"step": 10190
},
{
"epoch": 1.634928268013144,
"grad_norm": 0.04649091139435768,
"learning_rate": 1.960118631450273e-06,
"loss": 0.006,
"step": 10200
},
{
"epoch": 1.6365312174400897,
"grad_norm": 0.03332320600748062,
"learning_rate": 1.9435119571757942e-06,
"loss": 0.0072,
"step": 10210
},
{
"epoch": 1.6381341668670353,
"grad_norm": 0.04636770859360695,
"learning_rate": 1.926968355455856e-06,
"loss": 0.0068,
"step": 10220
},
{
"epoch": 1.639737116293981,
"grad_norm": 0.033311877399683,
"learning_rate": 1.910487955806848e-06,
"loss": 0.0068,
"step": 10230
},
{
"epoch": 1.6413400657209265,
"grad_norm": 0.044362135231494904,
"learning_rate": 1.894070887250361e-06,
"loss": 0.0059,
"step": 10240
},
{
"epoch": 1.642943015147872,
"grad_norm": 0.049247484654188156,
"learning_rate": 1.8777172783121823e-06,
"loss": 0.0062,
"step": 10250
},
{
"epoch": 1.6445459645748177,
"grad_norm": 0.03714997321367264,
"learning_rate": 1.86142725702128e-06,
"loss": 0.0069,
"step": 10260
},
{
"epoch": 1.6461489140017633,
"grad_norm": 0.04958584904670715,
"learning_rate": 1.8452009509088164e-06,
"loss": 0.0066,
"step": 10270
},
{
"epoch": 1.647751863428709,
"grad_norm": 0.04805106669664383,
"learning_rate": 1.8290384870071398e-06,
"loss": 0.0064,
"step": 10280
},
{
"epoch": 1.6493548128556546,
"grad_norm": 0.04615769535303116,
"learning_rate": 1.8129399918487833e-06,
"loss": 0.0067,
"step": 10290
},
{
"epoch": 1.6509577622826,
"grad_norm": 0.04581748694181442,
"learning_rate": 1.796905591465492e-06,
"loss": 0.0067,
"step": 10300
},
{
"epoch": 1.6525607117095455,
"grad_norm": 0.039267655462026596,
"learning_rate": 1.7809354113872224e-06,
"loss": 0.0058,
"step": 10310
},
{
"epoch": 1.6541636611364912,
"grad_norm": 0.03654203936457634,
"learning_rate": 1.7650295766411607e-06,
"loss": 0.0069,
"step": 10320
},
{
"epoch": 1.6557666105634368,
"grad_norm": 0.05323672294616699,
"learning_rate": 1.7491882117507507e-06,
"loss": 0.0069,
"step": 10330
},
{
"epoch": 1.6573695599903822,
"grad_norm": 0.047599758952856064,
"learning_rate": 1.7334114407347157e-06,
"loss": 0.0068,
"step": 10340
},
{
"epoch": 1.6589725094173278,
"grad_norm": 0.048799458891153336,
"learning_rate": 1.7176993871060876e-06,
"loss": 0.0065,
"step": 10350
},
{
"epoch": 1.6605754588442734,
"grad_norm": 0.0284061748534441,
"learning_rate": 1.7020521738712359e-06,
"loss": 0.0066,
"step": 10360
},
{
"epoch": 1.662178408271219,
"grad_norm": 0.039750777184963226,
"learning_rate": 1.686469923528905e-06,
"loss": 0.007,
"step": 10370
},
{
"epoch": 1.6637813576981646,
"grad_norm": 0.03770313411951065,
"learning_rate": 1.670952758069272e-06,
"loss": 0.0065,
"step": 10380
},
{
"epoch": 1.6653843071251102,
"grad_norm": 0.04947682470083237,
"learning_rate": 1.6555007989729643e-06,
"loss": 0.0067,
"step": 10390
},
{
"epoch": 1.6669872565520558,
"grad_norm": 0.048967983573675156,
"learning_rate": 1.6401141672101283e-06,
"loss": 0.0061,
"step": 10400
},
{
"epoch": 1.6685902059790014,
"grad_norm": 0.042331352829933167,
"learning_rate": 1.6247929832394792e-06,
"loss": 0.0068,
"step": 10410
},
{
"epoch": 1.670193155405947,
"grad_norm": 0.0504949614405632,
"learning_rate": 1.6095373670073467e-06,
"loss": 0.0063,
"step": 10420
},
{
"epoch": 1.6717961048328926,
"grad_norm": 0.03911704197525978,
"learning_rate": 1.594347437946755e-06,
"loss": 0.0065,
"step": 10430
},
{
"epoch": 1.6733990542598383,
"grad_norm": 0.03691579028964043,
"learning_rate": 1.5792233149764656e-06,
"loss": 0.0069,
"step": 10440
},
{
"epoch": 1.6750020036867836,
"grad_norm": 0.03881106898188591,
"learning_rate": 1.5641651165000672e-06,
"loss": 0.0063,
"step": 10450
},
{
"epoch": 1.6766049531137293,
"grad_norm": 0.039052508771419525,
"learning_rate": 1.5491729604050388e-06,
"loss": 0.0069,
"step": 10460
},
{
"epoch": 1.6782079025406749,
"grad_norm": 0.03730938211083412,
"learning_rate": 1.5342469640618162e-06,
"loss": 0.0066,
"step": 10470
},
{
"epoch": 1.6798108519676203,
"grad_norm": 0.05100889876484871,
"learning_rate": 1.5193872443229052e-06,
"loss": 0.0068,
"step": 10480
},
{
"epoch": 1.6814138013945659,
"grad_norm": 0.03976810351014137,
"learning_rate": 1.5045939175219271e-06,
"loss": 0.0064,
"step": 10490
},
{
"epoch": 1.6830167508215115,
"grad_norm": 0.04142378270626068,
"learning_rate": 1.4898670994727326e-06,
"loss": 0.0066,
"step": 10500
},
{
"epoch": 1.684619700248457,
"grad_norm": 0.03331238403916359,
"learning_rate": 1.4752069054684925e-06,
"loss": 0.0062,
"step": 10510
},
{
"epoch": 1.6862226496754027,
"grad_norm": 0.042981959879398346,
"learning_rate": 1.460613450280789e-06,
"loss": 0.0066,
"step": 10520
},
{
"epoch": 1.6878255991023483,
"grad_norm": 0.04343404620885849,
"learning_rate": 1.4460868481587231e-06,
"loss": 0.006,
"step": 10530
},
{
"epoch": 1.689428548529294,
"grad_norm": 0.038565054535865784,
"learning_rate": 1.4316272128280107e-06,
"loss": 0.0065,
"step": 10540
},
{
"epoch": 1.6910314979562395,
"grad_norm": 0.049151334911584854,
"learning_rate": 1.4172346574901064e-06,
"loss": 0.0072,
"step": 10550
},
{
"epoch": 1.6926344473831851,
"grad_norm": 0.03570883348584175,
"learning_rate": 1.4029092948213075e-06,
"loss": 0.0062,
"step": 10560
},
{
"epoch": 1.6942373968101307,
"grad_norm": 0.04620914161205292,
"learning_rate": 1.3886512369718675e-06,
"loss": 0.0071,
"step": 10570
},
{
"epoch": 1.6958403462370764,
"grad_norm": 0.03542487695813179,
"learning_rate": 1.3744605955651336e-06,
"loss": 0.0065,
"step": 10580
},
{
"epoch": 1.6974432956640217,
"grad_norm": 0.03881022706627846,
"learning_rate": 1.3603374816966607e-06,
"loss": 0.0062,
"step": 10590
},
{
"epoch": 1.6990462450909674,
"grad_norm": 0.06067880243062973,
"learning_rate": 1.3462820059333403e-06,
"loss": 0.0068,
"step": 10600
},
{
"epoch": 1.700649194517913,
"grad_norm": 0.04585850238800049,
"learning_rate": 1.332294278312546e-06,
"loss": 0.0067,
"step": 10610
},
{
"epoch": 1.7022521439448586,
"grad_norm": 0.040219008922576904,
"learning_rate": 1.318374408341262e-06,
"loss": 0.0064,
"step": 10620
},
{
"epoch": 1.703855093371804,
"grad_norm": 0.04510616883635521,
"learning_rate": 1.3045225049952314e-06,
"loss": 0.007,
"step": 10630
},
{
"epoch": 1.7054580427987496,
"grad_norm": 0.029775921255350113,
"learning_rate": 1.2907386767180985e-06,
"loss": 0.0059,
"step": 10640
},
{
"epoch": 1.7070609922256952,
"grad_norm": 0.04115507751703262,
"learning_rate": 1.2770230314205567e-06,
"loss": 0.0066,
"step": 10650
},
{
"epoch": 1.7086639416526408,
"grad_norm": 0.03847046568989754,
"learning_rate": 1.2633756764795247e-06,
"loss": 0.0063,
"step": 10660
},
{
"epoch": 1.7102668910795864,
"grad_norm": 0.04787128418684006,
"learning_rate": 1.249796718737275e-06,
"loss": 0.0064,
"step": 10670
},
{
"epoch": 1.711869840506532,
"grad_norm": 0.04256165400147438,
"learning_rate": 1.2362862645006213e-06,
"loss": 0.0065,
"step": 10680
},
{
"epoch": 1.7134727899334776,
"grad_norm": 0.047069843858480453,
"learning_rate": 1.2228444195400757e-06,
"loss": 0.007,
"step": 10690
},
{
"epoch": 1.7150757393604232,
"grad_norm": 0.025805678218603134,
"learning_rate": 1.2094712890890193e-06,
"loss": 0.0073,
"step": 10700
},
{
"epoch": 1.7166786887873688,
"grad_norm": 0.03637049347162247,
"learning_rate": 1.1961669778428874e-06,
"loss": 0.0063,
"step": 10710
},
{
"epoch": 1.7182816382143145,
"grad_norm": 0.0403585359454155,
"learning_rate": 1.1829315899583393e-06,
"loss": 0.0068,
"step": 10720
},
{
"epoch": 1.71988458764126,
"grad_norm": 0.04056670516729355,
"learning_rate": 1.1697652290524497e-06,
"loss": 0.0061,
"step": 10730
},
{
"epoch": 1.7214875370682055,
"grad_norm": 0.02951175719499588,
"learning_rate": 1.156667998201899e-06,
"loss": 0.0065,
"step": 10740
},
{
"epoch": 1.723090486495151,
"grad_norm": 0.045906949788331985,
"learning_rate": 1.143639999942152e-06,
"loss": 0.0065,
"step": 10750
},
{
"epoch": 1.7246934359220967,
"grad_norm": 0.04483688622713089,
"learning_rate": 1.1306813362666846e-06,
"loss": 0.0063,
"step": 10760
},
{
"epoch": 1.7262963853490423,
"grad_norm": 0.04201997444033623,
"learning_rate": 1.1177921086261467e-06,
"loss": 0.0064,
"step": 10770
},
{
"epoch": 1.7278993347759877,
"grad_norm": 0.03533167019486427,
"learning_rate": 1.1049724179276034e-06,
"loss": 0.0068,
"step": 10780
},
{
"epoch": 1.7295022842029333,
"grad_norm": 0.04161592572927475,
"learning_rate": 1.0922223645337181e-06,
"loss": 0.0066,
"step": 10790
},
{
"epoch": 1.731105233629879,
"grad_norm": 0.053074780851602554,
"learning_rate": 1.0795420482619867e-06,
"loss": 0.0071,
"step": 10800
},
{
"epoch": 1.7327081830568245,
"grad_norm": 0.04212959110736847,
"learning_rate": 1.0669315683839455e-06,
"loss": 0.0058,
"step": 10810
},
{
"epoch": 1.7343111324837701,
"grad_norm": 0.05835643783211708,
"learning_rate": 1.0543910236243926e-06,
"loss": 0.0066,
"step": 10820
},
{
"epoch": 1.7359140819107157,
"grad_norm": 0.03991185128688812,
"learning_rate": 1.0419205121606246e-06,
"loss": 0.0068,
"step": 10830
},
{
"epoch": 1.7375170313376613,
"grad_norm": 0.03626156225800514,
"learning_rate": 1.0295201316216596e-06,
"loss": 0.0066,
"step": 10840
},
{
"epoch": 1.739119980764607,
"grad_norm": 0.04507741332054138,
"learning_rate": 1.0171899790874718e-06,
"loss": 0.0065,
"step": 10850
},
{
"epoch": 1.7407229301915526,
"grad_norm": 0.05982038006186485,
"learning_rate": 1.0049301510882404e-06,
"loss": 0.0067,
"step": 10860
},
{
"epoch": 1.7423258796184982,
"grad_norm": 0.043249331414699554,
"learning_rate": 9.927407436035886e-07,
"loss": 0.0065,
"step": 10870
},
{
"epoch": 1.7439288290454438,
"grad_norm": 0.03685468062758446,
"learning_rate": 9.80621852061826e-07,
"loss": 0.006,
"step": 10880
},
{
"epoch": 1.7455317784723892,
"grad_norm": 0.03677702322602272,
"learning_rate": 9.685735713392141e-07,
"loss": 0.0059,
"step": 10890
},
{
"epoch": 1.7471347278993348,
"grad_norm": 0.034399621188640594,
"learning_rate": 9.565959957592141e-07,
"loss": 0.0076,
"step": 10900
},
{
"epoch": 1.7487376773262804,
"grad_norm": 0.0476701557636261,
"learning_rate": 9.446892190917556e-07,
"loss": 0.0068,
"step": 10910
},
{
"epoch": 1.7503406267532258,
"grad_norm": 0.04046626016497612,
"learning_rate": 9.328533345524893e-07,
"loss": 0.0064,
"step": 10920
},
{
"epoch": 1.7519435761801714,
"grad_norm": 0.04283340275287628,
"learning_rate": 9.210884348020744e-07,
"loss": 0.0065,
"step": 10930
},
{
"epoch": 1.753546525607117,
"grad_norm": 0.04990899935364723,
"learning_rate": 9.093946119454455e-07,
"loss": 0.0063,
"step": 10940
},
{
"epoch": 1.7551494750340626,
"grad_norm": 0.061496201902627945,
"learning_rate": 8.97771957531084e-07,
"loss": 0.0063,
"step": 10950
},
{
"epoch": 1.7567524244610082,
"grad_norm": 0.08075609058141708,
"learning_rate": 8.86220562550314e-07,
"loss": 0.0066,
"step": 10960
},
{
"epoch": 1.7583553738879538,
"grad_norm": 0.04579548165202141,
"learning_rate": 8.747405174365853e-07,
"loss": 0.0066,
"step": 10970
},
{
"epoch": 1.7599583233148994,
"grad_norm": 0.058112818747758865,
"learning_rate": 8.633319120647587e-07,
"loss": 0.0067,
"step": 10980
},
{
"epoch": 1.761561272741845,
"grad_norm": 0.042892564088106155,
"learning_rate": 8.51994835750416e-07,
"loss": 0.0068,
"step": 10990
},
{
"epoch": 1.7631642221687907,
"grad_norm": 0.039829425513744354,
"learning_rate": 8.407293772491432e-07,
"loss": 0.0064,
"step": 11000
},
{
"epoch": 1.7647671715957363,
"grad_norm": 0.04083314165472984,
"learning_rate": 8.295356247558595e-07,
"loss": 0.0063,
"step": 11010
},
{
"epoch": 1.7663701210226819,
"grad_norm": 0.05451719090342522,
"learning_rate": 8.184136659040986e-07,
"loss": 0.0067,
"step": 11020
},
{
"epoch": 1.7679730704496273,
"grad_norm": 0.03532470762729645,
"learning_rate": 8.07363587765343e-07,
"loss": 0.0061,
"step": 11030
},
{
"epoch": 1.7695760198765729,
"grad_norm": 0.04727141186594963,
"learning_rate": 7.963854768483392e-07,
"loss": 0.0061,
"step": 11040
},
{
"epoch": 1.7711789693035185,
"grad_norm": 0.04995537921786308,
"learning_rate": 7.854794190984116e-07,
"loss": 0.0068,
"step": 11050
},
{
"epoch": 1.772781918730464,
"grad_norm": 0.05020635575056076,
"learning_rate": 7.746454998968012e-07,
"loss": 0.006,
"step": 11060
},
{
"epoch": 1.7743848681574095,
"grad_norm": 0.0372854508459568,
"learning_rate": 7.638838040599838e-07,
"loss": 0.007,
"step": 11070
},
{
"epoch": 1.775987817584355,
"grad_norm": 0.04687948524951935,
"learning_rate": 7.531944158390203e-07,
"loss": 0.0059,
"step": 11080
},
{
"epoch": 1.7775907670113007,
"grad_norm": 0.09241676330566406,
"learning_rate": 7.425774189188906e-07,
"loss": 0.0063,
"step": 11090
},
{
"epoch": 1.7791937164382463,
"grad_norm": 0.05238804966211319,
"learning_rate": 7.320328964178325e-07,
"loss": 0.0072,
"step": 11100
},
{
"epoch": 1.780796665865192,
"grad_norm": 0.033452149480581284,
"learning_rate": 7.215609308867022e-07,
"loss": 0.0055,
"step": 11110
},
{
"epoch": 1.7823996152921375,
"grad_norm": 0.05519595742225647,
"learning_rate": 7.111616043083202e-07,
"loss": 0.0067,
"step": 11120
},
{
"epoch": 1.7840025647190831,
"grad_norm": 0.04810706898570061,
"learning_rate": 7.008349980968321e-07,
"loss": 0.0059,
"step": 11130
},
{
"epoch": 1.7856055141460287,
"grad_norm": 0.0363197885453701,
"learning_rate": 6.905811930970718e-07,
"loss": 0.0062,
"step": 11140
},
{
"epoch": 1.7872084635729744,
"grad_norm": 0.03364889323711395,
"learning_rate": 6.804002695839274e-07,
"loss": 0.0059,
"step": 11150
},
{
"epoch": 1.78881141299992,
"grad_norm": 0.03774208575487137,
"learning_rate": 6.702923072617129e-07,
"loss": 0.0057,
"step": 11160
},
{
"epoch": 1.7904143624268656,
"grad_norm": 0.05517459660768509,
"learning_rate": 6.602573852635441e-07,
"loss": 0.0068,
"step": 11170
},
{
"epoch": 1.792017311853811,
"grad_norm": 0.04096028581261635,
"learning_rate": 6.502955821507196e-07,
"loss": 0.007,
"step": 11180
},
{
"epoch": 1.7936202612807566,
"grad_norm": 0.03794392570853233,
"learning_rate": 6.404069759121079e-07,
"loss": 0.0065,
"step": 11190
},
{
"epoch": 1.7952232107077022,
"grad_norm": 0.03523090481758118,
"learning_rate": 6.305916439635295e-07,
"loss": 0.0063,
"step": 11200
},
{
"epoch": 1.7968261601346478,
"grad_norm": 0.04677393287420273,
"learning_rate": 6.208496631471605e-07,
"loss": 0.0062,
"step": 11210
},
{
"epoch": 1.7984291095615932,
"grad_norm": 0.0406043566763401,
"learning_rate": 6.111811097309262e-07,
"loss": 0.0066,
"step": 11220
},
{
"epoch": 1.8000320589885388,
"grad_norm": 0.03882203623652458,
"learning_rate": 6.015860594079004e-07,
"loss": 0.0058,
"step": 11230
},
{
"epoch": 1.8016350084154844,
"grad_norm": 0.03886554762721062,
"learning_rate": 5.920645872957187e-07,
"loss": 0.0065,
"step": 11240
},
{
"epoch": 1.80323795784243,
"grad_norm": 0.039025116711854935,
"learning_rate": 5.826167679359917e-07,
"loss": 0.0067,
"step": 11250
},
{
"epoch": 1.8048409072693756,
"grad_norm": 0.05819341167807579,
"learning_rate": 5.732426752937103e-07,
"loss": 0.0077,
"step": 11260
},
{
"epoch": 1.8064438566963212,
"grad_norm": 0.05688609555363655,
"learning_rate": 5.639423827566837e-07,
"loss": 0.0068,
"step": 11270
},
{
"epoch": 1.8080468061232668,
"grad_norm": 0.04290845990180969,
"learning_rate": 5.547159631349452e-07,
"loss": 0.0069,
"step": 11280
},
{
"epoch": 1.8096497555502125,
"grad_norm": 0.049413323402404785,
"learning_rate": 5.455634886602046e-07,
"loss": 0.0062,
"step": 11290
},
{
"epoch": 1.811252704977158,
"grad_norm": 0.045171111822128296,
"learning_rate": 5.364850309852598e-07,
"loss": 0.0065,
"step": 11300
},
{
"epoch": 1.8128556544041037,
"grad_norm": 0.038502875715494156,
"learning_rate": 5.274806611834527e-07,
"loss": 0.0063,
"step": 11310
},
{
"epoch": 1.8144586038310493,
"grad_norm": 0.037532929331064224,
"learning_rate": 5.185504497481064e-07,
"loss": 0.0059,
"step": 11320
},
{
"epoch": 1.8160615532579947,
"grad_norm": 0.03429022803902626,
"learning_rate": 5.096944665919712e-07,
"loss": 0.0056,
"step": 11330
},
{
"epoch": 1.8176645026849403,
"grad_norm": 0.03987206146121025,
"learning_rate": 5.009127810466808e-07,
"loss": 0.0065,
"step": 11340
},
{
"epoch": 1.819267452111886,
"grad_norm": 0.032589782029390335,
"learning_rate": 4.922054618622096e-07,
"loss": 0.0062,
"step": 11350
},
{
"epoch": 1.8208704015388313,
"grad_norm": 0.04659878835082054,
"learning_rate": 4.835725772063316e-07,
"loss": 0.0061,
"step": 11360
},
{
"epoch": 1.822473350965777,
"grad_norm": 0.03782256692647934,
"learning_rate": 4.750141946640918e-07,
"loss": 0.0063,
"step": 11370
},
{
"epoch": 1.8240763003927225,
"grad_norm": 0.03973528742790222,
"learning_rate": 4.665303812372668e-07,
"loss": 0.0062,
"step": 11380
},
{
"epoch": 1.8256792498196681,
"grad_norm": 0.06990045309066772,
"learning_rate": 4.581212033438576e-07,
"loss": 0.0064,
"step": 11390
},
{
"epoch": 1.8272821992466137,
"grad_norm": 0.055125899612903595,
"learning_rate": 4.4978672681755153e-07,
"loss": 0.0069,
"step": 11400
},
{
"epoch": 1.8288851486735593,
"grad_norm": 0.038944311439991,
"learning_rate": 4.41527016907215e-07,
"loss": 0.0066,
"step": 11410
},
{
"epoch": 1.830488098100505,
"grad_norm": 0.04602019861340523,
"learning_rate": 4.333421382763847e-07,
"loss": 0.006,
"step": 11420
},
{
"epoch": 1.8320910475274506,
"grad_norm": 0.055467262864112854,
"learning_rate": 4.252321550027583e-07,
"loss": 0.0068,
"step": 11430
},
{
"epoch": 1.8336939969543962,
"grad_norm": 0.029419003054499626,
"learning_rate": 4.171971305776945e-07,
"loss": 0.0065,
"step": 11440
},
{
"epoch": 1.8352969463813418,
"grad_norm": 0.039543673396110535,
"learning_rate": 4.0923712790571167e-07,
"loss": 0.0059,
"step": 11450
},
{
"epoch": 1.8368998958082874,
"grad_norm": 0.04183452948927879,
"learning_rate": 4.013522093040023e-07,
"loss": 0.0062,
"step": 11460
},
{
"epoch": 1.8385028452352328,
"grad_norm": 0.04477937892079353,
"learning_rate": 3.9354243650194025e-07,
"loss": 0.0059,
"step": 11470
},
{
"epoch": 1.8401057946621784,
"grad_norm": 0.04892928525805473,
"learning_rate": 3.8580787064059544e-07,
"loss": 0.0056,
"step": 11480
},
{
"epoch": 1.841708744089124,
"grad_norm": 0.0421488918364048,
"learning_rate": 3.781485722722622e-07,
"loss": 0.006,
"step": 11490
},
{
"epoch": 1.8433116935160696,
"grad_norm": 0.04015891253948212,
"learning_rate": 3.7056460135998283e-07,
"loss": 0.0066,
"step": 11500
},
{
"epoch": 1.844914642943015,
"grad_norm": 0.04604795202612877,
"learning_rate": 3.630560172770714e-07,
"loss": 0.0059,
"step": 11510
},
{
"epoch": 1.8465175923699606,
"grad_norm": 0.05634016543626785,
"learning_rate": 3.5562287880665845e-07,
"loss": 0.0072,
"step": 11520
},
{
"epoch": 1.8481205417969062,
"grad_norm": 0.04716205969452858,
"learning_rate": 3.48265244141226e-07,
"loss": 0.0066,
"step": 11530
},
{
"epoch": 1.8497234912238518,
"grad_norm": 0.04330425336956978,
"learning_rate": 3.4098317088215203e-07,
"loss": 0.0062,
"step": 11540
},
{
"epoch": 1.8513264406507974,
"grad_norm": 0.044012319296598434,
"learning_rate": 3.337767160392602e-07,
"loss": 0.0064,
"step": 11550
},
{
"epoch": 1.852929390077743,
"grad_norm": 0.03510000556707382,
"learning_rate": 3.2664593603037196e-07,
"loss": 0.0065,
"step": 11560
},
{
"epoch": 1.8545323395046887,
"grad_norm": 0.03645529970526695,
"learning_rate": 3.1959088668087055e-07,
"loss": 0.0063,
"step": 11570
},
{
"epoch": 1.8561352889316343,
"grad_norm": 0.03181441128253937,
"learning_rate": 3.1261162322325343e-07,
"loss": 0.0063,
"step": 11580
},
{
"epoch": 1.8577382383585799,
"grad_norm": 0.0412897914648056,
"learning_rate": 3.0570820029671377e-07,
"loss": 0.0064,
"step": 11590
},
{
"epoch": 1.8593411877855255,
"grad_norm": 0.03842850774526596,
"learning_rate": 2.988806719466997e-07,
"loss": 0.006,
"step": 11600
},
{
"epoch": 1.860944137212471,
"grad_norm": 0.0427357517182827,
"learning_rate": 2.9212909162449785e-07,
"loss": 0.006,
"step": 11610
},
{
"epoch": 1.8625470866394165,
"grad_norm": 0.055357471108436584,
"learning_rate": 2.8545351218681406e-07,
"loss": 0.0064,
"step": 11620
},
{
"epoch": 1.864150036066362,
"grad_norm": 0.0415298193693161,
"learning_rate": 2.788539858953587e-07,
"loss": 0.0063,
"step": 11630
},
{
"epoch": 1.8657529854933077,
"grad_norm": 0.038512542843818665,
"learning_rate": 2.723305644164398e-07,
"loss": 0.0064,
"step": 11640
},
{
"epoch": 1.8673559349202533,
"grad_norm": 0.04085739329457283,
"learning_rate": 2.6588329882055506e-07,
"loss": 0.006,
"step": 11650
},
{
"epoch": 1.8689588843471987,
"grad_norm": 0.036621492356061935,
"learning_rate": 2.5951223958199157e-07,
"loss": 0.0064,
"step": 11660
},
{
"epoch": 1.8705618337741443,
"grad_norm": 0.05602623149752617,
"learning_rate": 2.5321743657844013e-07,
"loss": 0.0063,
"step": 11670
},
{
"epoch": 1.87216478320109,
"grad_norm": 0.040210746228694916,
"learning_rate": 2.4699893909058805e-07,
"loss": 0.0057,
"step": 11680
},
{
"epoch": 1.8737677326280355,
"grad_norm": 0.03419404849410057,
"learning_rate": 2.408567958017516e-07,
"loss": 0.0065,
"step": 11690
},
{
"epoch": 1.8753706820549811,
"grad_norm": 0.03755233436822891,
"learning_rate": 2.3479105479747854e-07,
"loss": 0.0058,
"step": 11700
},
{
"epoch": 1.8769736314819268,
"grad_norm": 0.035565607249736786,
"learning_rate": 2.2880176356518292e-07,
"loss": 0.0066,
"step": 11710
},
{
"epoch": 1.8785765809088724,
"grad_norm": 0.0364888571202755,
"learning_rate": 2.2288896899377187e-07,
"loss": 0.0065,
"step": 11720
},
{
"epoch": 1.880179530335818,
"grad_norm": 0.03588583692908287,
"learning_rate": 2.170527173732706e-07,
"loss": 0.0064,
"step": 11730
},
{
"epoch": 1.8817824797627636,
"grad_norm": 0.03791727498173714,
"learning_rate": 2.1129305439447023e-07,
"loss": 0.0072,
"step": 11740
},
{
"epoch": 1.8833854291897092,
"grad_norm": 0.07219801843166351,
"learning_rate": 2.0561002514856377e-07,
"loss": 0.0064,
"step": 11750
},
{
"epoch": 1.8849883786166548,
"grad_norm": 0.035773083567619324,
"learning_rate": 2.00003674126793e-07,
"loss": 0.0065,
"step": 11760
},
{
"epoch": 1.8865913280436002,
"grad_norm": 0.033491574227809906,
"learning_rate": 1.9447404522010548e-07,
"loss": 0.0065,
"step": 11770
},
{
"epoch": 1.8881942774705458,
"grad_norm": 0.03553665056824684,
"learning_rate": 1.890211817188059e-07,
"loss": 0.0061,
"step": 11780
},
{
"epoch": 1.8897972268974914,
"grad_norm": 0.04234781861305237,
"learning_rate": 1.8364512631221633e-07,
"loss": 0.0068,
"step": 11790
},
{
"epoch": 1.8914001763244368,
"grad_norm": 0.045198485255241394,
"learning_rate": 1.783459210883498e-07,
"loss": 0.0061,
"step": 11800
},
{
"epoch": 1.8930031257513824,
"grad_norm": 0.034584879875183105,
"learning_rate": 1.731236075335696e-07,
"loss": 0.0063,
"step": 11810
},
{
"epoch": 1.894606075178328,
"grad_norm": 0.03774869441986084,
"learning_rate": 1.6797822653227492e-07,
"loss": 0.0062,
"step": 11820
},
{
"epoch": 1.8962090246052736,
"grad_norm": 0.03940942883491516,
"learning_rate": 1.6290981836657116e-07,
"loss": 0.0058,
"step": 11830
},
{
"epoch": 1.8978119740322192,
"grad_norm": 0.04277713969349861,
"learning_rate": 1.579184227159658e-07,
"loss": 0.0062,
"step": 11840
},
{
"epoch": 1.8994149234591649,
"grad_norm": 0.04427545145153999,
"learning_rate": 1.5300407865704637e-07,
"loss": 0.0068,
"step": 11850
},
{
"epoch": 1.9010178728861105,
"grad_norm": 0.048489801585674286,
"learning_rate": 1.4816682466318178e-07,
"loss": 0.0057,
"step": 11860
},
{
"epoch": 1.902620822313056,
"grad_norm": 0.04006476700305939,
"learning_rate": 1.4340669860421708e-07,
"loss": 0.0065,
"step": 11870
},
{
"epoch": 1.9042237717400017,
"grad_norm": 0.051166266202926636,
"learning_rate": 1.3872373774618363e-07,
"loss": 0.0056,
"step": 11880
},
{
"epoch": 1.9058267211669473,
"grad_norm": 0.03423753380775452,
"learning_rate": 1.3411797875099718e-07,
"loss": 0.0061,
"step": 11890
},
{
"epoch": 1.907429670593893,
"grad_norm": 0.03915632143616676,
"learning_rate": 1.2958945767617915e-07,
"loss": 0.0063,
"step": 11900
},
{
"epoch": 1.9090326200208383,
"grad_norm": 0.037460289895534515,
"learning_rate": 1.2513820997456904e-07,
"loss": 0.0064,
"step": 11910
},
{
"epoch": 1.910635569447784,
"grad_norm": 0.0423584058880806,
"learning_rate": 1.2076427049405482e-07,
"loss": 0.0063,
"step": 11920
},
{
"epoch": 1.9122385188747295,
"grad_norm": 0.04563596844673157,
"learning_rate": 1.164676734772896e-07,
"loss": 0.0065,
"step": 11930
},
{
"epoch": 1.9138414683016751,
"grad_norm": 0.036899786442518234,
"learning_rate": 1.1224845256142758e-07,
"loss": 0.0061,
"step": 11940
},
{
"epoch": 1.9154444177286205,
"grad_norm": 0.03818698972463608,
"learning_rate": 1.0810664077786747e-07,
"loss": 0.0061,
"step": 11950
},
{
"epoch": 1.9170473671555661,
"grad_norm": 0.04241606220602989,
"learning_rate": 1.040422705519828e-07,
"loss": 0.0072,
"step": 11960
},
{
"epoch": 1.9186503165825117,
"grad_norm": 0.039895787835121155,
"learning_rate": 1.0005537370287532e-07,
"loss": 0.0059,
"step": 11970
},
{
"epoch": 1.9202532660094573,
"grad_norm": 0.03996073827147484,
"learning_rate": 9.614598144312426e-08,
"loss": 0.0065,
"step": 11980
},
{
"epoch": 1.921856215436403,
"grad_norm": 0.040646884590387344,
"learning_rate": 9.231412437854192e-08,
"loss": 0.0066,
"step": 11990
},
{
"epoch": 1.9234591648633486,
"grad_norm": 0.037054888904094696,
"learning_rate": 8.855983250793287e-08,
"loss": 0.007,
"step": 12000
},
{
"epoch": 1.9250621142902942,
"grad_norm": 0.03988795354962349,
"learning_rate": 8.488313522286074e-08,
"loss": 0.0059,
"step": 12010
},
{
"epoch": 1.9266650637172398,
"grad_norm": 0.045791856944561005,
"learning_rate": 8.128406130741617e-08,
"loss": 0.0057,
"step": 12020
},
{
"epoch": 1.9282680131441854,
"grad_norm": 0.03719751164317131,
"learning_rate": 7.776263893799485e-08,
"loss": 0.0063,
"step": 12030
},
{
"epoch": 1.929870962571131,
"grad_norm": 0.05731379985809326,
"learning_rate": 7.431889568307316e-08,
"loss": 0.0072,
"step": 12040
},
{
"epoch": 1.9314739119980766,
"grad_norm": 0.03646623343229294,
"learning_rate": 7.095285850299505e-08,
"loss": 0.0068,
"step": 12050
},
{
"epoch": 1.933076861425022,
"grad_norm": 0.032672230154275894,
"learning_rate": 6.766455374975777e-08,
"loss": 0.0061,
"step": 12060
},
{
"epoch": 1.9346798108519676,
"grad_norm": 0.09227609634399414,
"learning_rate": 6.445400716681205e-08,
"loss": 0.0069,
"step": 12070
},
{
"epoch": 1.9362827602789132,
"grad_norm": 0.04252972453832626,
"learning_rate": 6.132124388885107e-08,
"loss": 0.0065,
"step": 12080
},
{
"epoch": 1.9378857097058588,
"grad_norm": 0.047904375940561295,
"learning_rate": 5.8266288441621855e-08,
"loss": 0.0065,
"step": 12090
},
{
"epoch": 1.9394886591328042,
"grad_norm": 0.04473813623189926,
"learning_rate": 5.528916474172974e-08,
"loss": 0.0071,
"step": 12100
},
{
"epoch": 1.9410916085597498,
"grad_norm": 0.04604315385222435,
"learning_rate": 5.2389896096451954e-08,
"loss": 0.0069,
"step": 12110
},
{
"epoch": 1.9426945579866954,
"grad_norm": 0.04812907055020332,
"learning_rate": 4.9568505203553277e-08,
"loss": 0.0061,
"step": 12120
},
{
"epoch": 1.944297507413641,
"grad_norm": 0.04590696468949318,
"learning_rate": 4.6825014151113955e-08,
"loss": 0.0064,
"step": 12130
},
{
"epoch": 1.9459004568405867,
"grad_norm": 0.03498758748173714,
"learning_rate": 4.415944441734543e-08,
"loss": 0.0062,
"step": 12140
},
{
"epoch": 1.9475034062675323,
"grad_norm": 0.04751746729016304,
"learning_rate": 4.1571816870438206e-08,
"loss": 0.0065,
"step": 12150
},
{
"epoch": 1.9491063556944779,
"grad_norm": 0.038238272070884705,
"learning_rate": 3.9062151768382015e-08,
"loss": 0.0062,
"step": 12160
},
{
"epoch": 1.9507093051214235,
"grad_norm": 0.04823305085301399,
"learning_rate": 3.663046875882037e-08,
"loss": 0.007,
"step": 12170
},
{
"epoch": 1.952312254548369,
"grad_norm": 0.0401928685605526,
"learning_rate": 3.427678687888847e-08,
"loss": 0.0068,
"step": 12180
},
{
"epoch": 1.9539152039753147,
"grad_norm": 0.03627694770693779,
"learning_rate": 3.200112455506777e-08,
"loss": 0.006,
"step": 12190
},
{
"epoch": 1.9555181534022603,
"grad_norm": 0.04345129802823067,
"learning_rate": 2.980349960304274e-08,
"loss": 0.0063,
"step": 12200
},
{
"epoch": 1.9571211028292057,
"grad_norm": 0.049445588141679764,
"learning_rate": 2.7683929227556585e-08,
"loss": 0.0058,
"step": 12210
},
{
"epoch": 1.9587240522561513,
"grad_norm": 0.05957973003387451,
"learning_rate": 2.5642430022281285e-08,
"loss": 0.0065,
"step": 12220
},
{
"epoch": 1.960327001683097,
"grad_norm": 0.032285600900650024,
"learning_rate": 2.3679017969685524e-08,
"loss": 0.0059,
"step": 12230
},
{
"epoch": 1.9619299511100423,
"grad_norm": 0.03355714678764343,
"learning_rate": 2.1793708440910334e-08,
"loss": 0.0061,
"step": 12240
},
{
"epoch": 1.963532900536988,
"grad_norm": 0.053335681557655334,
"learning_rate": 1.9986516195650284e-08,
"loss": 0.0063,
"step": 12250
},
{
"epoch": 1.9651358499639335,
"grad_norm": 0.031918782740831375,
"learning_rate": 1.8257455382031386e-08,
"loss": 0.0071,
"step": 12260
},
{
"epoch": 1.9667387993908791,
"grad_norm": 0.038328561931848526,
"learning_rate": 1.6606539536510037e-08,
"loss": 0.0072,
"step": 12270
},
{
"epoch": 1.9683417488178248,
"grad_norm": 0.045656658709049225,
"learning_rate": 1.5033781583758677e-08,
"loss": 0.006,
"step": 12280
},
{
"epoch": 1.9699446982447704,
"grad_norm": 0.03573407977819443,
"learning_rate": 1.3539193836571429e-08,
"loss": 0.0068,
"step": 12290
},
{
"epoch": 1.971547647671716,
"grad_norm": 0.03196869418025017,
"learning_rate": 1.2122787995759722e-08,
"loss": 0.0063,
"step": 12300
},
{
"epoch": 1.9731505970986616,
"grad_norm": 0.04788234829902649,
"learning_rate": 1.0784575150069033e-08,
"loss": 0.006,
"step": 12310
},
{
"epoch": 1.9747535465256072,
"grad_norm": 0.03255213052034378,
"learning_rate": 9.524565776086736e-09,
"loss": 0.0067,
"step": 12320
},
{
"epoch": 1.9763564959525528,
"grad_norm": 0.036573149263858795,
"learning_rate": 8.342769738161061e-09,
"loss": 0.0062,
"step": 12330
},
{
"epoch": 1.9779594453794984,
"grad_norm": 0.04543542489409447,
"learning_rate": 7.2391962883267e-09,
"loss": 0.007,
"step": 12340
},
{
"epoch": 1.9795623948064438,
"grad_norm": 0.03676731884479523,
"learning_rate": 6.213854066228208e-09,
"loss": 0.0063,
"step": 12350
},
{
"epoch": 1.9811653442333894,
"grad_norm": 0.04279434308409691,
"learning_rate": 5.266751099054501e-09,
"loss": 0.0062,
"step": 12360
},
{
"epoch": 1.982768293660335,
"grad_norm": 0.04709082096815109,
"learning_rate": 4.3978948014755664e-09,
"loss": 0.006,
"step": 12370
},
{
"epoch": 1.9843712430872806,
"grad_norm": 0.0320071280002594,
"learning_rate": 3.607291975584737e-09,
"loss": 0.0068,
"step": 12380
},
{
"epoch": 1.985974192514226,
"grad_norm": 0.03497837111353874,
"learning_rate": 2.89494881084762e-09,
"loss": 0.0061,
"step": 12390
},
{
"epoch": 1.9875771419411716,
"grad_norm": 0.042159345000982285,
"learning_rate": 2.2608708840476947e-09,
"loss": 0.0063,
"step": 12400
},
{
"epoch": 1.9891800913681172,
"grad_norm": 0.031165743246674538,
"learning_rate": 1.7050631592485657e-09,
"loss": 0.0053,
"step": 12410
},
{
"epoch": 1.9907830407950629,
"grad_norm": 0.040396977216005325,
"learning_rate": 1.2275299877517743e-09,
"loss": 0.0065,
"step": 12420
},
{
"epoch": 1.9923859902220085,
"grad_norm": 0.05201442912220955,
"learning_rate": 8.282751080646023e-10,
"loss": 0.0064,
"step": 12430
},
{
"epoch": 1.993988939648954,
"grad_norm": 0.043205760419368744,
"learning_rate": 5.073016458700952e-10,
"loss": 0.0058,
"step": 12440
},
{
"epoch": 1.9955918890758997,
"grad_norm": 0.03582298383116722,
"learning_rate": 2.6461211400152785e-10,
"loss": 0.0065,
"step": 12450
},
{
"epoch": 1.9971948385028453,
"grad_norm": 0.032651137560606,
"learning_rate": 1.0020841242575075e-10,
"loss": 0.0065,
"step": 12460
},
{
"epoch": 1.998797787929791,
"grad_norm": 0.03591388836503029,
"learning_rate": 1.4091828223206094e-11,
"loss": 0.0059,
"step": 12470
},
{
"epoch": 1.9997595575859581,
"step": 12476,
"total_flos": 7.626990245493342e+19,
"train_loss": 0.014145435631614382,
"train_runtime": 240833.2709,
"train_samples_per_second": 9.947,
"train_steps_per_second": 0.052
}
],
"logging_steps": 10,
"max_steps": 12476,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.626990245493342e+19,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}