gr00t-6 / checkpoint-20000 /trainer_state.json
willnorris's picture
Add trained GR00T policy — 2025-05-19T12:16:13.221425
3f1878a
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 11.19194180190263,
"eval_steps": 500,
"global_step": 20000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005595970900951315,
"grad_norm": 7.419506072998047,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.9689,
"step": 10
},
{
"epoch": 0.01119194180190263,
"grad_norm": 8.035171508789062,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.8977,
"step": 20
},
{
"epoch": 0.016787912702853944,
"grad_norm": 7.580524444580078,
"learning_rate": 3e-06,
"loss": 0.9942,
"step": 30
},
{
"epoch": 0.02238388360380526,
"grad_norm": 5.7520976066589355,
"learning_rate": 4.000000000000001e-06,
"loss": 0.8421,
"step": 40
},
{
"epoch": 0.027979854504756575,
"grad_norm": 4.714428901672363,
"learning_rate": 5e-06,
"loss": 0.6063,
"step": 50
},
{
"epoch": 0.03357582540570789,
"grad_norm": 4.136861801147461,
"learning_rate": 6e-06,
"loss": 0.4259,
"step": 60
},
{
"epoch": 0.03917179630665921,
"grad_norm": 2.1667540073394775,
"learning_rate": 7.000000000000001e-06,
"loss": 0.3447,
"step": 70
},
{
"epoch": 0.04476776720761052,
"grad_norm": 2.3095765113830566,
"learning_rate": 8.000000000000001e-06,
"loss": 0.284,
"step": 80
},
{
"epoch": 0.05036373810856184,
"grad_norm": 1.2860591411590576,
"learning_rate": 9e-06,
"loss": 0.2067,
"step": 90
},
{
"epoch": 0.05595970900951315,
"grad_norm": 2.0302886962890625,
"learning_rate": 1e-05,
"loss": 0.1943,
"step": 100
},
{
"epoch": 0.06155567991046446,
"grad_norm": 1.2757196426391602,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.1442,
"step": 110
},
{
"epoch": 0.06715165081141578,
"grad_norm": 1.5842756032943726,
"learning_rate": 1.2e-05,
"loss": 0.132,
"step": 120
},
{
"epoch": 0.0727476217123671,
"grad_norm": 1.0327903032302856,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.097,
"step": 130
},
{
"epoch": 0.07834359261331841,
"grad_norm": 0.733019232749939,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.0807,
"step": 140
},
{
"epoch": 0.08393956351426973,
"grad_norm": 0.9548436999320984,
"learning_rate": 1.5e-05,
"loss": 0.0922,
"step": 150
},
{
"epoch": 0.08953553441522104,
"grad_norm": 0.44906941056251526,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.0841,
"step": 160
},
{
"epoch": 0.09513150531617236,
"grad_norm": 0.9586009979248047,
"learning_rate": 1.7000000000000003e-05,
"loss": 0.0726,
"step": 170
},
{
"epoch": 0.10072747621712368,
"grad_norm": 0.6236313581466675,
"learning_rate": 1.8e-05,
"loss": 0.0631,
"step": 180
},
{
"epoch": 0.10632344711807498,
"grad_norm": 1.1688262224197388,
"learning_rate": 1.9e-05,
"loss": 0.0717,
"step": 190
},
{
"epoch": 0.1119194180190263,
"grad_norm": 1.5576119422912598,
"learning_rate": 2e-05,
"loss": 0.0718,
"step": 200
},
{
"epoch": 0.11751538891997762,
"grad_norm": 1.0707802772521973,
"learning_rate": 2.1e-05,
"loss": 0.0591,
"step": 210
},
{
"epoch": 0.12311135982092893,
"grad_norm": 0.8612272143363953,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.0623,
"step": 220
},
{
"epoch": 0.12870733072188026,
"grad_norm": 0.796205997467041,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.0563,
"step": 230
},
{
"epoch": 0.13430330162283155,
"grad_norm": 1.127061367034912,
"learning_rate": 2.4e-05,
"loss": 0.0545,
"step": 240
},
{
"epoch": 0.13989927252378287,
"grad_norm": 0.9559623003005981,
"learning_rate": 2.5e-05,
"loss": 0.0543,
"step": 250
},
{
"epoch": 0.1454952434247342,
"grad_norm": 0.7295358777046204,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.0554,
"step": 260
},
{
"epoch": 0.1510912143256855,
"grad_norm": 0.8386074900627136,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.0488,
"step": 270
},
{
"epoch": 0.15668718522663683,
"grad_norm": 0.9443495869636536,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.0639,
"step": 280
},
{
"epoch": 0.16228315612758815,
"grad_norm": 0.8754186630249023,
"learning_rate": 2.9e-05,
"loss": 0.0477,
"step": 290
},
{
"epoch": 0.16787912702853947,
"grad_norm": 0.5491052269935608,
"learning_rate": 3e-05,
"loss": 0.0509,
"step": 300
},
{
"epoch": 0.17347509792949076,
"grad_norm": 0.7870469093322754,
"learning_rate": 3.1e-05,
"loss": 0.0478,
"step": 310
},
{
"epoch": 0.17907106883044208,
"grad_norm": 0.9322296380996704,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.0514,
"step": 320
},
{
"epoch": 0.1846670397313934,
"grad_norm": 1.236414909362793,
"learning_rate": 3.3e-05,
"loss": 0.0504,
"step": 330
},
{
"epoch": 0.19026301063234471,
"grad_norm": 1.2571903467178345,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.0374,
"step": 340
},
{
"epoch": 0.19585898153329603,
"grad_norm": 1.1705288887023926,
"learning_rate": 3.5e-05,
"loss": 0.0514,
"step": 350
},
{
"epoch": 0.20145495243424735,
"grad_norm": 1.0005333423614502,
"learning_rate": 3.6e-05,
"loss": 0.0459,
"step": 360
},
{
"epoch": 0.20705092333519864,
"grad_norm": 0.5335679054260254,
"learning_rate": 3.7e-05,
"loss": 0.0444,
"step": 370
},
{
"epoch": 0.21264689423614996,
"grad_norm": 1.052669882774353,
"learning_rate": 3.8e-05,
"loss": 0.0409,
"step": 380
},
{
"epoch": 0.21824286513710128,
"grad_norm": 0.44473376870155334,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.0505,
"step": 390
},
{
"epoch": 0.2238388360380526,
"grad_norm": 0.6711838841438293,
"learning_rate": 4e-05,
"loss": 0.0388,
"step": 400
},
{
"epoch": 0.22943480693900392,
"grad_norm": 0.55412358045578,
"learning_rate": 4.1e-05,
"loss": 0.0416,
"step": 410
},
{
"epoch": 0.23503077783995524,
"grad_norm": 1.0375343561172485,
"learning_rate": 4.2e-05,
"loss": 0.0501,
"step": 420
},
{
"epoch": 0.24062674874090656,
"grad_norm": 0.7955525517463684,
"learning_rate": 4.3e-05,
"loss": 0.0461,
"step": 430
},
{
"epoch": 0.24622271964185785,
"grad_norm": 0.8107234239578247,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.0448,
"step": 440
},
{
"epoch": 0.2518186905428092,
"grad_norm": 0.8368202447891235,
"learning_rate": 4.5e-05,
"loss": 0.0459,
"step": 450
},
{
"epoch": 0.2574146614437605,
"grad_norm": 0.6938339471817017,
"learning_rate": 4.600000000000001e-05,
"loss": 0.034,
"step": 460
},
{
"epoch": 0.2630106323447118,
"grad_norm": 0.8612020611763,
"learning_rate": 4.7e-05,
"loss": 0.0454,
"step": 470
},
{
"epoch": 0.2686066032456631,
"grad_norm": 0.777197539806366,
"learning_rate": 4.8e-05,
"loss": 0.0381,
"step": 480
},
{
"epoch": 0.2742025741466144,
"grad_norm": 0.6520339250564575,
"learning_rate": 4.9e-05,
"loss": 0.0381,
"step": 490
},
{
"epoch": 0.27979854504756574,
"grad_norm": 0.5808746814727783,
"learning_rate": 5e-05,
"loss": 0.0285,
"step": 500
},
{
"epoch": 0.28539451594851706,
"grad_norm": 0.9482337832450867,
"learning_rate": 5.1000000000000006e-05,
"loss": 0.0362,
"step": 510
},
{
"epoch": 0.2909904868494684,
"grad_norm": 0.5615134239196777,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.0322,
"step": 520
},
{
"epoch": 0.2965864577504197,
"grad_norm": 1.2695409059524536,
"learning_rate": 5.300000000000001e-05,
"loss": 0.0411,
"step": 530
},
{
"epoch": 0.302182428651371,
"grad_norm": 0.7221632599830627,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.0422,
"step": 540
},
{
"epoch": 0.30777839955232233,
"grad_norm": 1.1144938468933105,
"learning_rate": 5.500000000000001e-05,
"loss": 0.0334,
"step": 550
},
{
"epoch": 0.31337437045327365,
"grad_norm": 0.6722885966300964,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.0436,
"step": 560
},
{
"epoch": 0.318970341354225,
"grad_norm": 1.0043433904647827,
"learning_rate": 5.6999999999999996e-05,
"loss": 0.0452,
"step": 570
},
{
"epoch": 0.3245663122551763,
"grad_norm": 0.9483539462089539,
"learning_rate": 5.8e-05,
"loss": 0.0492,
"step": 580
},
{
"epoch": 0.3301622831561276,
"grad_norm": 0.7825531363487244,
"learning_rate": 5.9e-05,
"loss": 0.0381,
"step": 590
},
{
"epoch": 0.33575825405707893,
"grad_norm": 0.7982919216156006,
"learning_rate": 6e-05,
"loss": 0.0447,
"step": 600
},
{
"epoch": 0.3413542249580302,
"grad_norm": 0.9162524342536926,
"learning_rate": 6.1e-05,
"loss": 0.0453,
"step": 610
},
{
"epoch": 0.3469501958589815,
"grad_norm": 0.5597997903823853,
"learning_rate": 6.2e-05,
"loss": 0.0393,
"step": 620
},
{
"epoch": 0.35254616675993283,
"grad_norm": 0.713256299495697,
"learning_rate": 6.3e-05,
"loss": 0.0394,
"step": 630
},
{
"epoch": 0.35814213766088415,
"grad_norm": 0.7356066703796387,
"learning_rate": 6.400000000000001e-05,
"loss": 0.0339,
"step": 640
},
{
"epoch": 0.36373810856183547,
"grad_norm": 0.5933259129524231,
"learning_rate": 6.500000000000001e-05,
"loss": 0.038,
"step": 650
},
{
"epoch": 0.3693340794627868,
"grad_norm": 0.5277016162872314,
"learning_rate": 6.6e-05,
"loss": 0.0383,
"step": 660
},
{
"epoch": 0.3749300503637381,
"grad_norm": 0.9106026887893677,
"learning_rate": 6.7e-05,
"loss": 0.0268,
"step": 670
},
{
"epoch": 0.38052602126468943,
"grad_norm": 0.5941755771636963,
"learning_rate": 6.800000000000001e-05,
"loss": 0.0399,
"step": 680
},
{
"epoch": 0.38612199216564075,
"grad_norm": 0.7207239270210266,
"learning_rate": 6.9e-05,
"loss": 0.0304,
"step": 690
},
{
"epoch": 0.39171796306659207,
"grad_norm": 0.5808258652687073,
"learning_rate": 7e-05,
"loss": 0.0317,
"step": 700
},
{
"epoch": 0.3973139339675434,
"grad_norm": 0.6304859519004822,
"learning_rate": 7.1e-05,
"loss": 0.0417,
"step": 710
},
{
"epoch": 0.4029099048684947,
"grad_norm": 0.6625694036483765,
"learning_rate": 7.2e-05,
"loss": 0.0301,
"step": 720
},
{
"epoch": 0.408505875769446,
"grad_norm": 0.6456591486930847,
"learning_rate": 7.3e-05,
"loss": 0.0416,
"step": 730
},
{
"epoch": 0.4141018466703973,
"grad_norm": 0.8103715181350708,
"learning_rate": 7.4e-05,
"loss": 0.0398,
"step": 740
},
{
"epoch": 0.4196978175713486,
"grad_norm": 0.592147707939148,
"learning_rate": 7.500000000000001e-05,
"loss": 0.0317,
"step": 750
},
{
"epoch": 0.4252937884722999,
"grad_norm": 0.6823825836181641,
"learning_rate": 7.6e-05,
"loss": 0.031,
"step": 760
},
{
"epoch": 0.43088975937325125,
"grad_norm": 0.3274383544921875,
"learning_rate": 7.7e-05,
"loss": 0.0305,
"step": 770
},
{
"epoch": 0.43648573027420257,
"grad_norm": 0.3436225950717926,
"learning_rate": 7.800000000000001e-05,
"loss": 0.0338,
"step": 780
},
{
"epoch": 0.4420817011751539,
"grad_norm": 0.8361327052116394,
"learning_rate": 7.900000000000001e-05,
"loss": 0.0264,
"step": 790
},
{
"epoch": 0.4476776720761052,
"grad_norm": 0.5449605584144592,
"learning_rate": 8e-05,
"loss": 0.0321,
"step": 800
},
{
"epoch": 0.4532736429770565,
"grad_norm": 0.31227922439575195,
"learning_rate": 8.1e-05,
"loss": 0.0272,
"step": 810
},
{
"epoch": 0.45886961387800784,
"grad_norm": 0.6099038124084473,
"learning_rate": 8.2e-05,
"loss": 0.0504,
"step": 820
},
{
"epoch": 0.46446558477895916,
"grad_norm": 0.6343345642089844,
"learning_rate": 8.3e-05,
"loss": 0.0343,
"step": 830
},
{
"epoch": 0.4700615556799105,
"grad_norm": 0.7962288856506348,
"learning_rate": 8.4e-05,
"loss": 0.0292,
"step": 840
},
{
"epoch": 0.4756575265808618,
"grad_norm": 0.3960738182067871,
"learning_rate": 8.5e-05,
"loss": 0.033,
"step": 850
},
{
"epoch": 0.4812534974818131,
"grad_norm": 0.9380257725715637,
"learning_rate": 8.6e-05,
"loss": 0.0404,
"step": 860
},
{
"epoch": 0.4868494683827644,
"grad_norm": 0.7713156342506409,
"learning_rate": 8.7e-05,
"loss": 0.0387,
"step": 870
},
{
"epoch": 0.4924454392837157,
"grad_norm": 1.137207269668579,
"learning_rate": 8.800000000000001e-05,
"loss": 0.039,
"step": 880
},
{
"epoch": 0.498041410184667,
"grad_norm": 0.7128203511238098,
"learning_rate": 8.900000000000001e-05,
"loss": 0.0354,
"step": 890
},
{
"epoch": 0.5036373810856184,
"grad_norm": 0.6396750211715698,
"learning_rate": 9e-05,
"loss": 0.0367,
"step": 900
},
{
"epoch": 0.5092333519865697,
"grad_norm": 0.6838144659996033,
"learning_rate": 9.1e-05,
"loss": 0.0369,
"step": 910
},
{
"epoch": 0.514829322887521,
"grad_norm": 0.6156594157218933,
"learning_rate": 9.200000000000001e-05,
"loss": 0.0402,
"step": 920
},
{
"epoch": 0.5204252937884724,
"grad_norm": 0.5517926812171936,
"learning_rate": 9.300000000000001e-05,
"loss": 0.0497,
"step": 930
},
{
"epoch": 0.5260212646894236,
"grad_norm": 0.6177653670310974,
"learning_rate": 9.4e-05,
"loss": 0.0322,
"step": 940
},
{
"epoch": 0.5316172355903749,
"grad_norm": 0.5705161094665527,
"learning_rate": 9.5e-05,
"loss": 0.0365,
"step": 950
},
{
"epoch": 0.5372132064913262,
"grad_norm": 0.7966452836990356,
"learning_rate": 9.6e-05,
"loss": 0.0377,
"step": 960
},
{
"epoch": 0.5428091773922775,
"grad_norm": 0.7984173893928528,
"learning_rate": 9.7e-05,
"loss": 0.0335,
"step": 970
},
{
"epoch": 0.5484051482932288,
"grad_norm": 0.6380477547645569,
"learning_rate": 9.8e-05,
"loss": 0.0329,
"step": 980
},
{
"epoch": 0.5540011191941802,
"grad_norm": 0.7180393934249878,
"learning_rate": 9.900000000000001e-05,
"loss": 0.0302,
"step": 990
},
{
"epoch": 0.5595970900951315,
"grad_norm": 0.8885056972503662,
"learning_rate": 0.0001,
"loss": 0.0345,
"step": 1000
},
{
"epoch": 0.5651930609960828,
"grad_norm": 0.41542354226112366,
"learning_rate": 9.999993165095463e-05,
"loss": 0.0445,
"step": 1010
},
{
"epoch": 0.5707890318970341,
"grad_norm": 0.4343472421169281,
"learning_rate": 9.999972660400536e-05,
"loss": 0.0263,
"step": 1020
},
{
"epoch": 0.5763850027979854,
"grad_norm": 0.7970145344734192,
"learning_rate": 9.999938485971279e-05,
"loss": 0.0322,
"step": 1030
},
{
"epoch": 0.5819809736989368,
"grad_norm": 0.6129629015922546,
"learning_rate": 9.999890641901125e-05,
"loss": 0.0262,
"step": 1040
},
{
"epoch": 0.5875769445998881,
"grad_norm": 0.5661425590515137,
"learning_rate": 9.999829128320874e-05,
"loss": 0.0317,
"step": 1050
},
{
"epoch": 0.5931729155008394,
"grad_norm": 0.7532817721366882,
"learning_rate": 9.999753945398704e-05,
"loss": 0.0359,
"step": 1060
},
{
"epoch": 0.5987688864017907,
"grad_norm": 0.42677804827690125,
"learning_rate": 9.999665093340165e-05,
"loss": 0.0273,
"step": 1070
},
{
"epoch": 0.604364857302742,
"grad_norm": 0.6325145363807678,
"learning_rate": 9.99956257238817e-05,
"loss": 0.0377,
"step": 1080
},
{
"epoch": 0.6099608282036934,
"grad_norm": 0.6003039479255676,
"learning_rate": 9.999446382823013e-05,
"loss": 0.0327,
"step": 1090
},
{
"epoch": 0.6155567991046447,
"grad_norm": 0.36753129959106445,
"learning_rate": 9.999316524962345e-05,
"loss": 0.0285,
"step": 1100
},
{
"epoch": 0.621152770005596,
"grad_norm": 0.43158769607543945,
"learning_rate": 9.999172999161198e-05,
"loss": 0.0275,
"step": 1110
},
{
"epoch": 0.6267487409065473,
"grad_norm": 0.33566170930862427,
"learning_rate": 9.999015805811965e-05,
"loss": 0.0278,
"step": 1120
},
{
"epoch": 0.6323447118074986,
"grad_norm": 0.671672523021698,
"learning_rate": 9.998844945344405e-05,
"loss": 0.0344,
"step": 1130
},
{
"epoch": 0.63794068270845,
"grad_norm": 1.1190325021743774,
"learning_rate": 9.998660418225645e-05,
"loss": 0.0304,
"step": 1140
},
{
"epoch": 0.6435366536094013,
"grad_norm": 0.6546229124069214,
"learning_rate": 9.998462224960175e-05,
"loss": 0.0343,
"step": 1150
},
{
"epoch": 0.6491326245103526,
"grad_norm": 0.7560105323791504,
"learning_rate": 9.998250366089848e-05,
"loss": 0.0259,
"step": 1160
},
{
"epoch": 0.6547285954113039,
"grad_norm": 0.6937676072120667,
"learning_rate": 9.998024842193876e-05,
"loss": 0.0308,
"step": 1170
},
{
"epoch": 0.6603245663122552,
"grad_norm": 0.4479691684246063,
"learning_rate": 9.997785653888835e-05,
"loss": 0.0272,
"step": 1180
},
{
"epoch": 0.6659205372132065,
"grad_norm": 0.38218632340431213,
"learning_rate": 9.997532801828658e-05,
"loss": 0.0313,
"step": 1190
},
{
"epoch": 0.6715165081141579,
"grad_norm": 0.3345787525177002,
"learning_rate": 9.997266286704631e-05,
"loss": 0.0328,
"step": 1200
},
{
"epoch": 0.6771124790151091,
"grad_norm": 0.3578011989593506,
"learning_rate": 9.996986109245395e-05,
"loss": 0.0373,
"step": 1210
},
{
"epoch": 0.6827084499160604,
"grad_norm": 0.6602341532707214,
"learning_rate": 9.996692270216947e-05,
"loss": 0.0346,
"step": 1220
},
{
"epoch": 0.6883044208170117,
"grad_norm": 0.4503819942474365,
"learning_rate": 9.996384770422629e-05,
"loss": 0.0243,
"step": 1230
},
{
"epoch": 0.693900391717963,
"grad_norm": 0.753041684627533,
"learning_rate": 9.996063610703137e-05,
"loss": 0.0277,
"step": 1240
},
{
"epoch": 0.6994963626189143,
"grad_norm": 0.3396258056163788,
"learning_rate": 9.995728791936504e-05,
"loss": 0.0219,
"step": 1250
},
{
"epoch": 0.7050923335198657,
"grad_norm": 0.6529501676559448,
"learning_rate": 9.995380315038119e-05,
"loss": 0.0242,
"step": 1260
},
{
"epoch": 0.710688304420817,
"grad_norm": 0.2462773472070694,
"learning_rate": 9.9950181809607e-05,
"loss": 0.021,
"step": 1270
},
{
"epoch": 0.7162842753217683,
"grad_norm": 0.4511205554008484,
"learning_rate": 9.994642390694308e-05,
"loss": 0.0267,
"step": 1280
},
{
"epoch": 0.7218802462227196,
"grad_norm": 0.5708833336830139,
"learning_rate": 9.99425294526634e-05,
"loss": 0.0288,
"step": 1290
},
{
"epoch": 0.7274762171236709,
"grad_norm": 0.4378319978713989,
"learning_rate": 9.993849845741524e-05,
"loss": 0.0308,
"step": 1300
},
{
"epoch": 0.7330721880246223,
"grad_norm": 0.44127964973449707,
"learning_rate": 9.99343309322192e-05,
"loss": 0.0282,
"step": 1310
},
{
"epoch": 0.7386681589255736,
"grad_norm": 0.35624831914901733,
"learning_rate": 9.993002688846913e-05,
"loss": 0.0298,
"step": 1320
},
{
"epoch": 0.7442641298265249,
"grad_norm": 0.45579585433006287,
"learning_rate": 9.992558633793212e-05,
"loss": 0.0325,
"step": 1330
},
{
"epoch": 0.7498601007274762,
"grad_norm": 0.6297839283943176,
"learning_rate": 9.992100929274846e-05,
"loss": 0.0369,
"step": 1340
},
{
"epoch": 0.7554560716284275,
"grad_norm": 0.29105043411254883,
"learning_rate": 9.991629576543163e-05,
"loss": 0.0253,
"step": 1350
},
{
"epoch": 0.7610520425293789,
"grad_norm": 0.501181960105896,
"learning_rate": 9.991144576886823e-05,
"loss": 0.0355,
"step": 1360
},
{
"epoch": 0.7666480134303302,
"grad_norm": 0.4630679488182068,
"learning_rate": 9.990645931631796e-05,
"loss": 0.0264,
"step": 1370
},
{
"epoch": 0.7722439843312815,
"grad_norm": 0.6088075637817383,
"learning_rate": 9.990133642141359e-05,
"loss": 0.0282,
"step": 1380
},
{
"epoch": 0.7778399552322328,
"grad_norm": 0.5682616233825684,
"learning_rate": 9.989607709816091e-05,
"loss": 0.0331,
"step": 1390
},
{
"epoch": 0.7834359261331841,
"grad_norm": 0.4457339644432068,
"learning_rate": 9.989068136093873e-05,
"loss": 0.0309,
"step": 1400
},
{
"epoch": 0.7890318970341355,
"grad_norm": 0.566882848739624,
"learning_rate": 9.988514922449879e-05,
"loss": 0.0436,
"step": 1410
},
{
"epoch": 0.7946278679350868,
"grad_norm": 0.4208590090274811,
"learning_rate": 9.987948070396571e-05,
"loss": 0.0293,
"step": 1420
},
{
"epoch": 0.8002238388360381,
"grad_norm": 0.5373462438583374,
"learning_rate": 9.987367581483705e-05,
"loss": 0.0333,
"step": 1430
},
{
"epoch": 0.8058198097369894,
"grad_norm": 0.4833603799343109,
"learning_rate": 9.986773457298311e-05,
"loss": 0.0238,
"step": 1440
},
{
"epoch": 0.8114157806379407,
"grad_norm": 0.3185485303401947,
"learning_rate": 9.986165699464705e-05,
"loss": 0.0279,
"step": 1450
},
{
"epoch": 0.817011751538892,
"grad_norm": 0.32943880558013916,
"learning_rate": 9.985544309644475e-05,
"loss": 0.0259,
"step": 1460
},
{
"epoch": 0.8226077224398433,
"grad_norm": 0.4028552174568176,
"learning_rate": 9.984909289536473e-05,
"loss": 0.0183,
"step": 1470
},
{
"epoch": 0.8282036933407946,
"grad_norm": 0.3354315459728241,
"learning_rate": 9.984260640876821e-05,
"loss": 0.0279,
"step": 1480
},
{
"epoch": 0.8337996642417459,
"grad_norm": 0.581444263458252,
"learning_rate": 9.983598365438902e-05,
"loss": 0.0231,
"step": 1490
},
{
"epoch": 0.8393956351426972,
"grad_norm": 0.3263351321220398,
"learning_rate": 9.98292246503335e-05,
"loss": 0.0257,
"step": 1500
},
{
"epoch": 0.8449916060436485,
"grad_norm": 0.4574286639690399,
"learning_rate": 9.98223294150805e-05,
"loss": 0.0172,
"step": 1510
},
{
"epoch": 0.8505875769445999,
"grad_norm": 0.6482700705528259,
"learning_rate": 9.981529796748134e-05,
"loss": 0.0252,
"step": 1520
},
{
"epoch": 0.8561835478455512,
"grad_norm": 0.22327029705047607,
"learning_rate": 9.980813032675974e-05,
"loss": 0.0296,
"step": 1530
},
{
"epoch": 0.8617795187465025,
"grad_norm": 0.39261817932128906,
"learning_rate": 9.980082651251175e-05,
"loss": 0.0226,
"step": 1540
},
{
"epoch": 0.8673754896474538,
"grad_norm": 0.3742023706436157,
"learning_rate": 9.979338654470569e-05,
"loss": 0.0283,
"step": 1550
},
{
"epoch": 0.8729714605484051,
"grad_norm": 0.240834578871727,
"learning_rate": 9.97858104436822e-05,
"loss": 0.0176,
"step": 1560
},
{
"epoch": 0.8785674314493565,
"grad_norm": 0.39040738344192505,
"learning_rate": 9.977809823015401e-05,
"loss": 0.0225,
"step": 1570
},
{
"epoch": 0.8841634023503078,
"grad_norm": 0.3102349042892456,
"learning_rate": 9.977024992520602e-05,
"loss": 0.0229,
"step": 1580
},
{
"epoch": 0.8897593732512591,
"grad_norm": 0.32893484830856323,
"learning_rate": 9.976226555029522e-05,
"loss": 0.0286,
"step": 1590
},
{
"epoch": 0.8953553441522104,
"grad_norm": 0.3821198046207428,
"learning_rate": 9.975414512725057e-05,
"loss": 0.0278,
"step": 1600
},
{
"epoch": 0.9009513150531617,
"grad_norm": 0.3672045171260834,
"learning_rate": 9.974588867827301e-05,
"loss": 0.0275,
"step": 1610
},
{
"epoch": 0.906547285954113,
"grad_norm": 0.36223965883255005,
"learning_rate": 9.973749622593534e-05,
"loss": 0.028,
"step": 1620
},
{
"epoch": 0.9121432568550644,
"grad_norm": 0.5474312901496887,
"learning_rate": 9.972896779318219e-05,
"loss": 0.0307,
"step": 1630
},
{
"epoch": 0.9177392277560157,
"grad_norm": 0.7324241399765015,
"learning_rate": 9.972030340333001e-05,
"loss": 0.0246,
"step": 1640
},
{
"epoch": 0.923335198656967,
"grad_norm": 0.44370922446250916,
"learning_rate": 9.97115030800669e-05,
"loss": 0.0229,
"step": 1650
},
{
"epoch": 0.9289311695579183,
"grad_norm": 0.40400007367134094,
"learning_rate": 9.970256684745258e-05,
"loss": 0.0368,
"step": 1660
},
{
"epoch": 0.9345271404588696,
"grad_norm": 0.4597970247268677,
"learning_rate": 9.969349472991838e-05,
"loss": 0.0215,
"step": 1670
},
{
"epoch": 0.940123111359821,
"grad_norm": 0.41508862376213074,
"learning_rate": 9.968428675226714e-05,
"loss": 0.0251,
"step": 1680
},
{
"epoch": 0.9457190822607723,
"grad_norm": 0.5726234316825867,
"learning_rate": 9.967494293967312e-05,
"loss": 0.0385,
"step": 1690
},
{
"epoch": 0.9513150531617236,
"grad_norm": 0.47390761971473694,
"learning_rate": 9.966546331768191e-05,
"loss": 0.0269,
"step": 1700
},
{
"epoch": 0.9569110240626749,
"grad_norm": 0.3252114951610565,
"learning_rate": 9.965584791221048e-05,
"loss": 0.023,
"step": 1710
},
{
"epoch": 0.9625069949636262,
"grad_norm": 0.4773138761520386,
"learning_rate": 9.964609674954696e-05,
"loss": 0.0322,
"step": 1720
},
{
"epoch": 0.9681029658645776,
"grad_norm": 0.45844170451164246,
"learning_rate": 9.963620985635065e-05,
"loss": 0.0233,
"step": 1730
},
{
"epoch": 0.9736989367655288,
"grad_norm": 0.40978696942329407,
"learning_rate": 9.962618725965196e-05,
"loss": 0.0337,
"step": 1740
},
{
"epoch": 0.9792949076664801,
"grad_norm": 0.43942537903785706,
"learning_rate": 9.961602898685226e-05,
"loss": 0.0225,
"step": 1750
},
{
"epoch": 0.9848908785674314,
"grad_norm": 0.7744397521018982,
"learning_rate": 9.96057350657239e-05,
"loss": 0.0302,
"step": 1760
},
{
"epoch": 0.9904868494683827,
"grad_norm": 0.3644595444202423,
"learning_rate": 9.959530552441005e-05,
"loss": 0.0252,
"step": 1770
},
{
"epoch": 0.996082820369334,
"grad_norm": 0.29574769735336304,
"learning_rate": 9.95847403914247e-05,
"loss": 0.0222,
"step": 1780
},
{
"epoch": 1.0016787912702854,
"grad_norm": 0.5153500437736511,
"learning_rate": 9.95740396956525e-05,
"loss": 0.0291,
"step": 1790
},
{
"epoch": 1.0072747621712368,
"grad_norm": 0.5961137413978577,
"learning_rate": 9.956320346634876e-05,
"loss": 0.0266,
"step": 1800
},
{
"epoch": 1.012870733072188,
"grad_norm": 0.48836737871170044,
"learning_rate": 9.955223173313931e-05,
"loss": 0.0213,
"step": 1810
},
{
"epoch": 1.0184667039731394,
"grad_norm": 0.5610430240631104,
"learning_rate": 9.954112452602045e-05,
"loss": 0.0205,
"step": 1820
},
{
"epoch": 1.0240626748740906,
"grad_norm": 0.4025803804397583,
"learning_rate": 9.952988187535886e-05,
"loss": 0.0224,
"step": 1830
},
{
"epoch": 1.029658645775042,
"grad_norm": 0.605367124080658,
"learning_rate": 9.95185038118915e-05,
"loss": 0.0303,
"step": 1840
},
{
"epoch": 1.0352546166759933,
"grad_norm": 0.3206970989704132,
"learning_rate": 9.950699036672559e-05,
"loss": 0.0231,
"step": 1850
},
{
"epoch": 1.0408505875769447,
"grad_norm": 0.3495715260505676,
"learning_rate": 9.949534157133844e-05,
"loss": 0.024,
"step": 1860
},
{
"epoch": 1.046446558477896,
"grad_norm": 0.3895197808742523,
"learning_rate": 9.948355745757741e-05,
"loss": 0.0203,
"step": 1870
},
{
"epoch": 1.0520425293788471,
"grad_norm": 0.40038052201271057,
"learning_rate": 9.94716380576598e-05,
"loss": 0.0221,
"step": 1880
},
{
"epoch": 1.0576385002797986,
"grad_norm": 0.479744553565979,
"learning_rate": 9.945958340417283e-05,
"loss": 0.028,
"step": 1890
},
{
"epoch": 1.0632344711807498,
"grad_norm": 0.3020111322402954,
"learning_rate": 9.944739353007344e-05,
"loss": 0.0265,
"step": 1900
},
{
"epoch": 1.0688304420817012,
"grad_norm": 0.3391585648059845,
"learning_rate": 9.943506846868826e-05,
"loss": 0.0233,
"step": 1910
},
{
"epoch": 1.0744264129826524,
"grad_norm": 0.3941816985607147,
"learning_rate": 9.942260825371358e-05,
"loss": 0.0184,
"step": 1920
},
{
"epoch": 1.0800223838836038,
"grad_norm": 0.31161707639694214,
"learning_rate": 9.941001291921512e-05,
"loss": 0.0229,
"step": 1930
},
{
"epoch": 1.085618354784555,
"grad_norm": 0.33263275027275085,
"learning_rate": 9.939728249962807e-05,
"loss": 0.0227,
"step": 1940
},
{
"epoch": 1.0912143256855065,
"grad_norm": 0.35178300738334656,
"learning_rate": 9.938441702975689e-05,
"loss": 0.0224,
"step": 1950
},
{
"epoch": 1.0968102965864577,
"grad_norm": 0.374667227268219,
"learning_rate": 9.937141654477528e-05,
"loss": 0.0196,
"step": 1960
},
{
"epoch": 1.102406267487409,
"grad_norm": 0.2080841362476349,
"learning_rate": 9.93582810802261e-05,
"loss": 0.0274,
"step": 1970
},
{
"epoch": 1.1080022383883603,
"grad_norm": 0.29197070002555847,
"learning_rate": 9.934501067202117e-05,
"loss": 0.0242,
"step": 1980
},
{
"epoch": 1.1135982092893117,
"grad_norm": 0.32980409264564514,
"learning_rate": 9.93316053564413e-05,
"loss": 0.0189,
"step": 1990
},
{
"epoch": 1.119194180190263,
"grad_norm": 0.4776092767715454,
"learning_rate": 9.931806517013612e-05,
"loss": 0.022,
"step": 2000
},
{
"epoch": 1.1247901510912144,
"grad_norm": 0.37389442324638367,
"learning_rate": 9.930439015012396e-05,
"loss": 0.0216,
"step": 2010
},
{
"epoch": 1.1303861219921656,
"grad_norm": 0.22275716066360474,
"learning_rate": 9.929058033379181e-05,
"loss": 0.0192,
"step": 2020
},
{
"epoch": 1.135982092893117,
"grad_norm": 0.5097452402114868,
"learning_rate": 9.927663575889521e-05,
"loss": 0.0198,
"step": 2030
},
{
"epoch": 1.1415780637940682,
"grad_norm": 0.3198114037513733,
"learning_rate": 9.926255646355804e-05,
"loss": 0.0218,
"step": 2040
},
{
"epoch": 1.1471740346950197,
"grad_norm": 0.1620880514383316,
"learning_rate": 9.92483424862726e-05,
"loss": 0.0227,
"step": 2050
},
{
"epoch": 1.1527700055959709,
"grad_norm": 0.2927526831626892,
"learning_rate": 9.923399386589933e-05,
"loss": 0.0195,
"step": 2060
},
{
"epoch": 1.1583659764969223,
"grad_norm": 0.2967079281806946,
"learning_rate": 9.921951064166684e-05,
"loss": 0.024,
"step": 2070
},
{
"epoch": 1.1639619473978735,
"grad_norm": 0.19401852786540985,
"learning_rate": 9.92048928531717e-05,
"loss": 0.0223,
"step": 2080
},
{
"epoch": 1.169557918298825,
"grad_norm": 0.28363627195358276,
"learning_rate": 9.919014054037836e-05,
"loss": 0.0188,
"step": 2090
},
{
"epoch": 1.1751538891997761,
"grad_norm": 0.3623961806297302,
"learning_rate": 9.917525374361912e-05,
"loss": 0.0206,
"step": 2100
},
{
"epoch": 1.1807498601007276,
"grad_norm": 0.503246545791626,
"learning_rate": 9.91602325035939e-05,
"loss": 0.0253,
"step": 2110
},
{
"epoch": 1.1863458310016788,
"grad_norm": 0.7744673490524292,
"learning_rate": 9.914507686137019e-05,
"loss": 0.0337,
"step": 2120
},
{
"epoch": 1.19194180190263,
"grad_norm": 0.48357081413269043,
"learning_rate": 9.912978685838294e-05,
"loss": 0.0309,
"step": 2130
},
{
"epoch": 1.1975377728035814,
"grad_norm": 0.22658684849739075,
"learning_rate": 9.911436253643445e-05,
"loss": 0.0208,
"step": 2140
},
{
"epoch": 1.2031337437045329,
"grad_norm": 0.40776172280311584,
"learning_rate": 9.90988039376942e-05,
"loss": 0.0232,
"step": 2150
},
{
"epoch": 1.208729714605484,
"grad_norm": 0.48974546790122986,
"learning_rate": 9.90831111046988e-05,
"loss": 0.0278,
"step": 2160
},
{
"epoch": 1.2143256855064353,
"grad_norm": 0.3066832423210144,
"learning_rate": 9.90672840803519e-05,
"loss": 0.018,
"step": 2170
},
{
"epoch": 1.2199216564073867,
"grad_norm": 0.22434163093566895,
"learning_rate": 9.905132290792394e-05,
"loss": 0.0141,
"step": 2180
},
{
"epoch": 1.225517627308338,
"grad_norm": 0.3365159034729004,
"learning_rate": 9.903522763105218e-05,
"loss": 0.0205,
"step": 2190
},
{
"epoch": 1.2311135982092893,
"grad_norm": 0.3467719256877899,
"learning_rate": 9.901899829374047e-05,
"loss": 0.0206,
"step": 2200
},
{
"epoch": 1.2367095691102405,
"grad_norm": 0.31818097829818726,
"learning_rate": 9.900263494035921e-05,
"loss": 0.0255,
"step": 2210
},
{
"epoch": 1.242305540011192,
"grad_norm": 0.3118780851364136,
"learning_rate": 9.89861376156452e-05,
"loss": 0.0211,
"step": 2220
},
{
"epoch": 1.2479015109121432,
"grad_norm": 0.2563456594944,
"learning_rate": 9.896950636470147e-05,
"loss": 0.0249,
"step": 2230
},
{
"epoch": 1.2534974818130946,
"grad_norm": 0.4434971213340759,
"learning_rate": 9.895274123299723e-05,
"loss": 0.0214,
"step": 2240
},
{
"epoch": 1.2590934527140458,
"grad_norm": 0.36243245005607605,
"learning_rate": 9.893584226636772e-05,
"loss": 0.0239,
"step": 2250
},
{
"epoch": 1.2646894236149973,
"grad_norm": 0.4027983546257019,
"learning_rate": 9.891880951101407e-05,
"loss": 0.0328,
"step": 2260
},
{
"epoch": 1.2702853945159485,
"grad_norm": 0.4992479383945465,
"learning_rate": 9.890164301350318e-05,
"loss": 0.0247,
"step": 2270
},
{
"epoch": 1.2758813654169,
"grad_norm": 0.5188339948654175,
"learning_rate": 9.888434282076758e-05,
"loss": 0.0252,
"step": 2280
},
{
"epoch": 1.281477336317851,
"grad_norm": 0.2691977620124817,
"learning_rate": 9.886690898010535e-05,
"loss": 0.0238,
"step": 2290
},
{
"epoch": 1.2870733072188025,
"grad_norm": 0.42759424448013306,
"learning_rate": 9.884934153917997e-05,
"loss": 0.0252,
"step": 2300
},
{
"epoch": 1.2926692781197537,
"grad_norm": 0.315560519695282,
"learning_rate": 9.883164054602012e-05,
"loss": 0.0184,
"step": 2310
},
{
"epoch": 1.2982652490207052,
"grad_norm": 0.34518998861312866,
"learning_rate": 9.881380604901964e-05,
"loss": 0.026,
"step": 2320
},
{
"epoch": 1.3038612199216564,
"grad_norm": 0.322465717792511,
"learning_rate": 9.879583809693738e-05,
"loss": 0.0217,
"step": 2330
},
{
"epoch": 1.3094571908226076,
"grad_norm": 0.31809547543525696,
"learning_rate": 9.877773673889701e-05,
"loss": 0.0219,
"step": 2340
},
{
"epoch": 1.315053161723559,
"grad_norm": 0.4411179721355438,
"learning_rate": 9.8759502024387e-05,
"loss": 0.0221,
"step": 2350
},
{
"epoch": 1.3206491326245104,
"grad_norm": 0.44775789976119995,
"learning_rate": 9.87411340032603e-05,
"loss": 0.0234,
"step": 2360
},
{
"epoch": 1.3262451035254617,
"grad_norm": 0.5176445245742798,
"learning_rate": 9.872263272573443e-05,
"loss": 0.0255,
"step": 2370
},
{
"epoch": 1.3318410744264129,
"grad_norm": 0.36430883407592773,
"learning_rate": 9.870399824239117e-05,
"loss": 0.0205,
"step": 2380
},
{
"epoch": 1.3374370453273643,
"grad_norm": 0.5294170379638672,
"learning_rate": 9.868523060417646e-05,
"loss": 0.0266,
"step": 2390
},
{
"epoch": 1.3430330162283157,
"grad_norm": 0.3633783459663391,
"learning_rate": 9.86663298624003e-05,
"loss": 0.0208,
"step": 2400
},
{
"epoch": 1.348628987129267,
"grad_norm": 0.5161033272743225,
"learning_rate": 9.864729606873663e-05,
"loss": 0.0201,
"step": 2410
},
{
"epoch": 1.3542249580302181,
"grad_norm": 0.6746691465377808,
"learning_rate": 9.862812927522309e-05,
"loss": 0.0243,
"step": 2420
},
{
"epoch": 1.3598209289311696,
"grad_norm": 0.2213054746389389,
"learning_rate": 9.860882953426099e-05,
"loss": 0.0209,
"step": 2430
},
{
"epoch": 1.365416899832121,
"grad_norm": 0.6545590162277222,
"learning_rate": 9.858939689861506e-05,
"loss": 0.0225,
"step": 2440
},
{
"epoch": 1.3710128707330722,
"grad_norm": 0.46804091334342957,
"learning_rate": 9.856983142141339e-05,
"loss": 0.0271,
"step": 2450
},
{
"epoch": 1.3766088416340234,
"grad_norm": 0.38381436467170715,
"learning_rate": 9.855013315614725e-05,
"loss": 0.0233,
"step": 2460
},
{
"epoch": 1.3822048125349748,
"grad_norm": 0.41659992933273315,
"learning_rate": 9.853030215667093e-05,
"loss": 0.0229,
"step": 2470
},
{
"epoch": 1.387800783435926,
"grad_norm": 0.4473920464515686,
"learning_rate": 9.851033847720166e-05,
"loss": 0.0278,
"step": 2480
},
{
"epoch": 1.3933967543368775,
"grad_norm": 0.3903592824935913,
"learning_rate": 9.849024217231935e-05,
"loss": 0.0222,
"step": 2490
},
{
"epoch": 1.3989927252378287,
"grad_norm": 0.296999454498291,
"learning_rate": 9.847001329696653e-05,
"loss": 0.0287,
"step": 2500
},
{
"epoch": 1.4045886961387801,
"grad_norm": 0.45139339566230774,
"learning_rate": 9.844965190644817e-05,
"loss": 0.0253,
"step": 2510
},
{
"epoch": 1.4101846670397313,
"grad_norm": 0.29245492815971375,
"learning_rate": 9.842915805643155e-05,
"loss": 0.0149,
"step": 2520
},
{
"epoch": 1.4157806379406828,
"grad_norm": 0.2889615595340729,
"learning_rate": 9.840853180294608e-05,
"loss": 0.0224,
"step": 2530
},
{
"epoch": 1.421376608841634,
"grad_norm": 0.4102277457714081,
"learning_rate": 9.838777320238312e-05,
"loss": 0.0268,
"step": 2540
},
{
"epoch": 1.4269725797425854,
"grad_norm": 0.5045889616012573,
"learning_rate": 9.836688231149592e-05,
"loss": 0.0195,
"step": 2550
},
{
"epoch": 1.4325685506435366,
"grad_norm": 0.5412267446517944,
"learning_rate": 9.834585918739936e-05,
"loss": 0.0262,
"step": 2560
},
{
"epoch": 1.438164521544488,
"grad_norm": 0.5022779703140259,
"learning_rate": 9.832470388756987e-05,
"loss": 0.0268,
"step": 2570
},
{
"epoch": 1.4437604924454392,
"grad_norm": 0.5818321108818054,
"learning_rate": 9.830341646984521e-05,
"loss": 0.0262,
"step": 2580
},
{
"epoch": 1.4493564633463907,
"grad_norm": 0.3627963066101074,
"learning_rate": 9.82819969924244e-05,
"loss": 0.0161,
"step": 2590
},
{
"epoch": 1.4549524342473419,
"grad_norm": 0.35047340393066406,
"learning_rate": 9.826044551386744e-05,
"loss": 0.0245,
"step": 2600
},
{
"epoch": 1.4605484051482933,
"grad_norm": 0.2970013916492462,
"learning_rate": 9.823876209309527e-05,
"loss": 0.0206,
"step": 2610
},
{
"epoch": 1.4661443760492445,
"grad_norm": 0.39108118414878845,
"learning_rate": 9.821694678938953e-05,
"loss": 0.0229,
"step": 2620
},
{
"epoch": 1.4717403469501957,
"grad_norm": 0.30723538994789124,
"learning_rate": 9.819499966239243e-05,
"loss": 0.0239,
"step": 2630
},
{
"epoch": 1.4773363178511472,
"grad_norm": 0.316388338804245,
"learning_rate": 9.817292077210659e-05,
"loss": 0.0232,
"step": 2640
},
{
"epoch": 1.4829322887520986,
"grad_norm": 0.2693226635456085,
"learning_rate": 9.815071017889482e-05,
"loss": 0.0201,
"step": 2650
},
{
"epoch": 1.4885282596530498,
"grad_norm": 0.2165406197309494,
"learning_rate": 9.812836794348004e-05,
"loss": 0.0178,
"step": 2660
},
{
"epoch": 1.494124230554001,
"grad_norm": 0.33953240513801575,
"learning_rate": 9.81058941269451e-05,
"loss": 0.0247,
"step": 2670
},
{
"epoch": 1.4997202014549524,
"grad_norm": 0.37577569484710693,
"learning_rate": 9.808328879073251e-05,
"loss": 0.0188,
"step": 2680
},
{
"epoch": 1.5053161723559039,
"grad_norm": 0.3397989273071289,
"learning_rate": 9.806055199664446e-05,
"loss": 0.0174,
"step": 2690
},
{
"epoch": 1.510912143256855,
"grad_norm": 0.11495699733495712,
"learning_rate": 9.803768380684242e-05,
"loss": 0.0193,
"step": 2700
},
{
"epoch": 1.5165081141578063,
"grad_norm": 0.3947618305683136,
"learning_rate": 9.801468428384716e-05,
"loss": 0.0195,
"step": 2710
},
{
"epoch": 1.5221040850587577,
"grad_norm": 0.3024958670139313,
"learning_rate": 9.799155349053851e-05,
"loss": 0.021,
"step": 2720
},
{
"epoch": 1.5277000559597091,
"grad_norm": 0.3651089072227478,
"learning_rate": 9.796829149015517e-05,
"loss": 0.0148,
"step": 2730
},
{
"epoch": 1.5332960268606604,
"grad_norm": 0.6126254796981812,
"learning_rate": 9.794489834629455e-05,
"loss": 0.0187,
"step": 2740
},
{
"epoch": 1.5388919977616116,
"grad_norm": 0.35577818751335144,
"learning_rate": 9.792137412291265e-05,
"loss": 0.0183,
"step": 2750
},
{
"epoch": 1.544487968662563,
"grad_norm": 0.26784461736679077,
"learning_rate": 9.789771888432375e-05,
"loss": 0.0239,
"step": 2760
},
{
"epoch": 1.5500839395635144,
"grad_norm": 0.3259308338165283,
"learning_rate": 9.787393269520039e-05,
"loss": 0.0174,
"step": 2770
},
{
"epoch": 1.5556799104644656,
"grad_norm": 0.3289090394973755,
"learning_rate": 9.785001562057309e-05,
"loss": 0.0185,
"step": 2780
},
{
"epoch": 1.5612758813654168,
"grad_norm": 0.41667595505714417,
"learning_rate": 9.782596772583026e-05,
"loss": 0.0264,
"step": 2790
},
{
"epoch": 1.5668718522663683,
"grad_norm": 0.4217163324356079,
"learning_rate": 9.780178907671789e-05,
"loss": 0.0221,
"step": 2800
},
{
"epoch": 1.5724678231673195,
"grad_norm": 0.3442951440811157,
"learning_rate": 9.777747973933948e-05,
"loss": 0.0195,
"step": 2810
},
{
"epoch": 1.578063794068271,
"grad_norm": 0.38543257117271423,
"learning_rate": 9.775303978015585e-05,
"loss": 0.0189,
"step": 2820
},
{
"epoch": 1.5836597649692221,
"grad_norm": 0.6017774939537048,
"learning_rate": 9.772846926598491e-05,
"loss": 0.0254,
"step": 2830
},
{
"epoch": 1.5892557358701733,
"grad_norm": 0.5754305720329285,
"learning_rate": 9.77037682640015e-05,
"loss": 0.0224,
"step": 2840
},
{
"epoch": 1.5948517067711248,
"grad_norm": 0.2952113747596741,
"learning_rate": 9.767893684173721e-05,
"loss": 0.0209,
"step": 2850
},
{
"epoch": 1.6004476776720762,
"grad_norm": 0.3667709231376648,
"learning_rate": 9.765397506708023e-05,
"loss": 0.0221,
"step": 2860
},
{
"epoch": 1.6060436485730274,
"grad_norm": 0.543677031993866,
"learning_rate": 9.762888300827507e-05,
"loss": 0.0216,
"step": 2870
},
{
"epoch": 1.6116396194739786,
"grad_norm": 0.3521057069301605,
"learning_rate": 9.760366073392246e-05,
"loss": 0.02,
"step": 2880
},
{
"epoch": 1.61723559037493,
"grad_norm": 0.35763946175575256,
"learning_rate": 9.757830831297914e-05,
"loss": 0.0244,
"step": 2890
},
{
"epoch": 1.6228315612758815,
"grad_norm": 0.25549840927124023,
"learning_rate": 9.755282581475769e-05,
"loss": 0.0224,
"step": 2900
},
{
"epoch": 1.6284275321768327,
"grad_norm": 0.22006206214427948,
"learning_rate": 9.752721330892624e-05,
"loss": 0.0178,
"step": 2910
},
{
"epoch": 1.6340235030777839,
"grad_norm": 0.2791355550289154,
"learning_rate": 9.750147086550844e-05,
"loss": 0.0204,
"step": 2920
},
{
"epoch": 1.6396194739787353,
"grad_norm": 0.34600383043289185,
"learning_rate": 9.747559855488313e-05,
"loss": 0.0206,
"step": 2930
},
{
"epoch": 1.6452154448796867,
"grad_norm": 0.40189531445503235,
"learning_rate": 9.744959644778422e-05,
"loss": 0.0213,
"step": 2940
},
{
"epoch": 1.650811415780638,
"grad_norm": 0.21385939419269562,
"learning_rate": 9.742346461530048e-05,
"loss": 0.0287,
"step": 2950
},
{
"epoch": 1.6564073866815892,
"grad_norm": 0.4269281327724457,
"learning_rate": 9.739720312887535e-05,
"loss": 0.0226,
"step": 2960
},
{
"epoch": 1.6620033575825406,
"grad_norm": 0.46277040243148804,
"learning_rate": 9.73708120603067e-05,
"loss": 0.0206,
"step": 2970
},
{
"epoch": 1.667599328483492,
"grad_norm": 0.340044230222702,
"learning_rate": 9.734429148174675e-05,
"loss": 0.016,
"step": 2980
},
{
"epoch": 1.6731952993844432,
"grad_norm": 0.33839765191078186,
"learning_rate": 9.731764146570173e-05,
"loss": 0.0208,
"step": 2990
},
{
"epoch": 1.6787912702853944,
"grad_norm": 0.4214085042476654,
"learning_rate": 9.729086208503174e-05,
"loss": 0.0291,
"step": 3000
},
{
"epoch": 1.6843872411863459,
"grad_norm": 0.29594293236732483,
"learning_rate": 9.726395341295062e-05,
"loss": 0.0194,
"step": 3010
},
{
"epoch": 1.6899832120872973,
"grad_norm": 0.43080446124076843,
"learning_rate": 9.723691552302562e-05,
"loss": 0.0204,
"step": 3020
},
{
"epoch": 1.6955791829882485,
"grad_norm": 0.3255208134651184,
"learning_rate": 9.720974848917735e-05,
"loss": 0.0219,
"step": 3030
},
{
"epoch": 1.7011751538891997,
"grad_norm": 0.30094242095947266,
"learning_rate": 9.718245238567939e-05,
"loss": 0.0207,
"step": 3040
},
{
"epoch": 1.7067711247901511,
"grad_norm": 0.27606436610221863,
"learning_rate": 9.715502728715826e-05,
"loss": 0.025,
"step": 3050
},
{
"epoch": 1.7123670956911026,
"grad_norm": 0.21307139098644257,
"learning_rate": 9.712747326859315e-05,
"loss": 0.0202,
"step": 3060
},
{
"epoch": 1.7179630665920538,
"grad_norm": 0.4076824188232422,
"learning_rate": 9.709979040531569e-05,
"loss": 0.0181,
"step": 3070
},
{
"epoch": 1.723559037493005,
"grad_norm": 0.3973149359226227,
"learning_rate": 9.707197877300974e-05,
"loss": 0.0278,
"step": 3080
},
{
"epoch": 1.7291550083939562,
"grad_norm": 0.3367111086845398,
"learning_rate": 9.704403844771128e-05,
"loss": 0.0284,
"step": 3090
},
{
"epoch": 1.7347509792949076,
"grad_norm": 0.4137897193431854,
"learning_rate": 9.701596950580806e-05,
"loss": 0.0251,
"step": 3100
},
{
"epoch": 1.740346950195859,
"grad_norm": 0.28888463973999023,
"learning_rate": 9.698777202403953e-05,
"loss": 0.0185,
"step": 3110
},
{
"epoch": 1.7459429210968103,
"grad_norm": 0.2732876241207123,
"learning_rate": 9.695944607949649e-05,
"loss": 0.0206,
"step": 3120
},
{
"epoch": 1.7515388919977615,
"grad_norm": 0.5475505590438843,
"learning_rate": 9.693099174962103e-05,
"loss": 0.0239,
"step": 3130
},
{
"epoch": 1.757134862898713,
"grad_norm": 0.3212341070175171,
"learning_rate": 9.690240911220618e-05,
"loss": 0.0193,
"step": 3140
},
{
"epoch": 1.7627308337996643,
"grad_norm": 0.38309773802757263,
"learning_rate": 9.687369824539577e-05,
"loss": 0.0228,
"step": 3150
},
{
"epoch": 1.7683268047006155,
"grad_norm": 0.22085356712341309,
"learning_rate": 9.684485922768422e-05,
"loss": 0.0167,
"step": 3160
},
{
"epoch": 1.7739227756015667,
"grad_norm": 0.32358717918395996,
"learning_rate": 9.681589213791633e-05,
"loss": 0.0216,
"step": 3170
},
{
"epoch": 1.7795187465025182,
"grad_norm": 0.30354073643684387,
"learning_rate": 9.6786797055287e-05,
"loss": 0.0202,
"step": 3180
},
{
"epoch": 1.7851147174034696,
"grad_norm": 0.3479655981063843,
"learning_rate": 9.675757405934103e-05,
"loss": 0.0167,
"step": 3190
},
{
"epoch": 1.7907106883044208,
"grad_norm": 0.3674020767211914,
"learning_rate": 9.672822322997305e-05,
"loss": 0.0216,
"step": 3200
},
{
"epoch": 1.796306659205372,
"grad_norm": 0.2632925808429718,
"learning_rate": 9.669874464742705e-05,
"loss": 0.0166,
"step": 3210
},
{
"epoch": 1.8019026301063235,
"grad_norm": 0.22815559804439545,
"learning_rate": 9.66691383922964e-05,
"loss": 0.0182,
"step": 3220
},
{
"epoch": 1.8074986010072749,
"grad_norm": 0.2246052771806717,
"learning_rate": 9.663940454552342e-05,
"loss": 0.0186,
"step": 3230
},
{
"epoch": 1.813094571908226,
"grad_norm": 0.28712260723114014,
"learning_rate": 9.660954318839933e-05,
"loss": 0.0157,
"step": 3240
},
{
"epoch": 1.8186905428091773,
"grad_norm": 0.2282487452030182,
"learning_rate": 9.657955440256395e-05,
"loss": 0.0201,
"step": 3250
},
{
"epoch": 1.8242865137101287,
"grad_norm": 0.3279257118701935,
"learning_rate": 9.654943827000548e-05,
"loss": 0.0153,
"step": 3260
},
{
"epoch": 1.8298824846110802,
"grad_norm": 0.3519797623157501,
"learning_rate": 9.651919487306025e-05,
"loss": 0.0217,
"step": 3270
},
{
"epoch": 1.8354784555120314,
"grad_norm": 0.29638567566871643,
"learning_rate": 9.648882429441257e-05,
"loss": 0.0165,
"step": 3280
},
{
"epoch": 1.8410744264129826,
"grad_norm": 0.3102523982524872,
"learning_rate": 9.645832661709444e-05,
"loss": 0.02,
"step": 3290
},
{
"epoch": 1.846670397313934,
"grad_norm": 0.31784892082214355,
"learning_rate": 9.642770192448536e-05,
"loss": 0.0259,
"step": 3300
},
{
"epoch": 1.8522663682148854,
"grad_norm": 0.31783589720726013,
"learning_rate": 9.639695030031204e-05,
"loss": 0.0154,
"step": 3310
},
{
"epoch": 1.8578623391158366,
"grad_norm": 0.4002092778682709,
"learning_rate": 9.636607182864827e-05,
"loss": 0.0128,
"step": 3320
},
{
"epoch": 1.8634583100167879,
"grad_norm": 0.3656691610813141,
"learning_rate": 9.63350665939146e-05,
"loss": 0.0167,
"step": 3330
},
{
"epoch": 1.869054280917739,
"grad_norm": 0.34003934264183044,
"learning_rate": 9.630393468087818e-05,
"loss": 0.018,
"step": 3340
},
{
"epoch": 1.8746502518186905,
"grad_norm": 0.3051067888736725,
"learning_rate": 9.627267617465243e-05,
"loss": 0.0192,
"step": 3350
},
{
"epoch": 1.880246222719642,
"grad_norm": 0.32361093163490295,
"learning_rate": 9.624129116069694e-05,
"loss": 0.0262,
"step": 3360
},
{
"epoch": 1.8858421936205931,
"grad_norm": 0.20856234431266785,
"learning_rate": 9.620977972481716e-05,
"loss": 0.0259,
"step": 3370
},
{
"epoch": 1.8914381645215443,
"grad_norm": 0.3916553258895874,
"learning_rate": 9.617814195316411e-05,
"loss": 0.0184,
"step": 3380
},
{
"epoch": 1.8970341354224958,
"grad_norm": 0.461211621761322,
"learning_rate": 9.614637793223425e-05,
"loss": 0.018,
"step": 3390
},
{
"epoch": 1.9026301063234472,
"grad_norm": 0.4060401916503906,
"learning_rate": 9.611448774886924e-05,
"loss": 0.0196,
"step": 3400
},
{
"epoch": 1.9082260772243984,
"grad_norm": 0.362894207239151,
"learning_rate": 9.60824714902556e-05,
"loss": 0.0149,
"step": 3410
},
{
"epoch": 1.9138220481253496,
"grad_norm": 0.2224276214838028,
"learning_rate": 9.605032924392457e-05,
"loss": 0.0214,
"step": 3420
},
{
"epoch": 1.919418019026301,
"grad_norm": 0.36570799350738525,
"learning_rate": 9.601806109775179e-05,
"loss": 0.019,
"step": 3430
},
{
"epoch": 1.9250139899272525,
"grad_norm": 0.37845227122306824,
"learning_rate": 9.598566713995718e-05,
"loss": 0.0283,
"step": 3440
},
{
"epoch": 1.9306099608282037,
"grad_norm": 0.2989262044429779,
"learning_rate": 9.595314745910456e-05,
"loss": 0.0195,
"step": 3450
},
{
"epoch": 1.936205931729155,
"grad_norm": 0.4651845097541809,
"learning_rate": 9.59205021441015e-05,
"loss": 0.0221,
"step": 3460
},
{
"epoch": 1.9418019026301063,
"grad_norm": 0.16341492533683777,
"learning_rate": 9.588773128419906e-05,
"loss": 0.0189,
"step": 3470
},
{
"epoch": 1.9473978735310578,
"grad_norm": 0.3499149978160858,
"learning_rate": 9.58548349689915e-05,
"loss": 0.0163,
"step": 3480
},
{
"epoch": 1.952993844432009,
"grad_norm": 0.5015300512313843,
"learning_rate": 9.582181328841611e-05,
"loss": 0.0287,
"step": 3490
},
{
"epoch": 1.9585898153329602,
"grad_norm": 0.3239698112010956,
"learning_rate": 9.578866633275288e-05,
"loss": 0.0168,
"step": 3500
},
{
"epoch": 1.9641857862339116,
"grad_norm": 0.29603099822998047,
"learning_rate": 9.575539419262434e-05,
"loss": 0.0204,
"step": 3510
},
{
"epoch": 1.969781757134863,
"grad_norm": 0.4523886740207672,
"learning_rate": 9.572199695899522e-05,
"loss": 0.0247,
"step": 3520
},
{
"epoch": 1.9753777280358142,
"grad_norm": 0.2664707899093628,
"learning_rate": 9.568847472317232e-05,
"loss": 0.0155,
"step": 3530
},
{
"epoch": 1.9809736989367654,
"grad_norm": 0.3717735707759857,
"learning_rate": 9.565482757680415e-05,
"loss": 0.0279,
"step": 3540
},
{
"epoch": 1.9865696698377169,
"grad_norm": 0.4721260070800781,
"learning_rate": 9.562105561188069e-05,
"loss": 0.017,
"step": 3550
},
{
"epoch": 1.9921656407386683,
"grad_norm": 0.19504283368587494,
"learning_rate": 9.558715892073323e-05,
"loss": 0.0251,
"step": 3560
},
{
"epoch": 1.9977616116396195,
"grad_norm": 0.3900291919708252,
"learning_rate": 9.555313759603402e-05,
"loss": 0.028,
"step": 3570
},
{
"epoch": 2.0033575825405707,
"grad_norm": 0.3327538073062897,
"learning_rate": 9.551899173079607e-05,
"loss": 0.0214,
"step": 3580
},
{
"epoch": 2.008953553441522,
"grad_norm": 0.5092990398406982,
"learning_rate": 9.548472141837286e-05,
"loss": 0.0204,
"step": 3590
},
{
"epoch": 2.0145495243424736,
"grad_norm": 0.2563795745372772,
"learning_rate": 9.545032675245813e-05,
"loss": 0.0242,
"step": 3600
},
{
"epoch": 2.020145495243425,
"grad_norm": 0.1788598746061325,
"learning_rate": 9.541580782708557e-05,
"loss": 0.0189,
"step": 3610
},
{
"epoch": 2.025741466144376,
"grad_norm": 0.2857683598995209,
"learning_rate": 9.538116473662861e-05,
"loss": 0.0187,
"step": 3620
},
{
"epoch": 2.031337437045327,
"grad_norm": 0.25776809453964233,
"learning_rate": 9.534639757580013e-05,
"loss": 0.0176,
"step": 3630
},
{
"epoch": 2.036933407946279,
"grad_norm": 0.37827619910240173,
"learning_rate": 9.531150643965223e-05,
"loss": 0.0133,
"step": 3640
},
{
"epoch": 2.04252937884723,
"grad_norm": 0.36484652757644653,
"learning_rate": 9.527649142357596e-05,
"loss": 0.021,
"step": 3650
},
{
"epoch": 2.0481253497481813,
"grad_norm": 0.41479215025901794,
"learning_rate": 9.524135262330098e-05,
"loss": 0.0159,
"step": 3660
},
{
"epoch": 2.0537213206491325,
"grad_norm": 0.261192262172699,
"learning_rate": 9.520609013489547e-05,
"loss": 0.0169,
"step": 3670
},
{
"epoch": 2.059317291550084,
"grad_norm": 0.3758920431137085,
"learning_rate": 9.517070405476575e-05,
"loss": 0.02,
"step": 3680
},
{
"epoch": 2.0649132624510353,
"grad_norm": 0.33406686782836914,
"learning_rate": 9.513519447965595e-05,
"loss": 0.0176,
"step": 3690
},
{
"epoch": 2.0705092333519866,
"grad_norm": 0.18889296054840088,
"learning_rate": 9.509956150664796e-05,
"loss": 0.0167,
"step": 3700
},
{
"epoch": 2.0761052042529378,
"grad_norm": 0.231406569480896,
"learning_rate": 9.50638052331609e-05,
"loss": 0.0232,
"step": 3710
},
{
"epoch": 2.0817011751538894,
"grad_norm": 0.31842225790023804,
"learning_rate": 9.502792575695112e-05,
"loss": 0.0219,
"step": 3720
},
{
"epoch": 2.0872971460548406,
"grad_norm": 0.2191598266363144,
"learning_rate": 9.499192317611167e-05,
"loss": 0.0207,
"step": 3730
},
{
"epoch": 2.092893116955792,
"grad_norm": 0.3848901391029358,
"learning_rate": 9.49557975890723e-05,
"loss": 0.0205,
"step": 3740
},
{
"epoch": 2.098489087856743,
"grad_norm": 0.3654007017612457,
"learning_rate": 9.491954909459895e-05,
"loss": 0.0202,
"step": 3750
},
{
"epoch": 2.1040850587576942,
"grad_norm": 0.3708373010158539,
"learning_rate": 9.488317779179361e-05,
"loss": 0.0186,
"step": 3760
},
{
"epoch": 2.109681029658646,
"grad_norm": 0.29888278245925903,
"learning_rate": 9.484668378009408e-05,
"loss": 0.0179,
"step": 3770
},
{
"epoch": 2.115277000559597,
"grad_norm": 0.3273047208786011,
"learning_rate": 9.481006715927351e-05,
"loss": 0.0194,
"step": 3780
},
{
"epoch": 2.1208729714605483,
"grad_norm": 0.30253902077674866,
"learning_rate": 9.477332802944044e-05,
"loss": 0.0172,
"step": 3790
},
{
"epoch": 2.1264689423614995,
"grad_norm": 0.3017847537994385,
"learning_rate": 9.473646649103818e-05,
"loss": 0.0239,
"step": 3800
},
{
"epoch": 2.132064913262451,
"grad_norm": 0.2024342566728592,
"learning_rate": 9.46994826448448e-05,
"loss": 0.0229,
"step": 3810
},
{
"epoch": 2.1376608841634024,
"grad_norm": 0.25708290934562683,
"learning_rate": 9.46623765919727e-05,
"loss": 0.017,
"step": 3820
},
{
"epoch": 2.1432568550643536,
"grad_norm": 0.38740572333335876,
"learning_rate": 9.462514843386845e-05,
"loss": 0.0186,
"step": 3830
},
{
"epoch": 2.148852825965305,
"grad_norm": 0.41047894954681396,
"learning_rate": 9.458779827231237e-05,
"loss": 0.0197,
"step": 3840
},
{
"epoch": 2.1544487968662565,
"grad_norm": 0.26995226740837097,
"learning_rate": 9.45503262094184e-05,
"loss": 0.0183,
"step": 3850
},
{
"epoch": 2.1600447677672077,
"grad_norm": 0.3127893805503845,
"learning_rate": 9.451273234763371e-05,
"loss": 0.0206,
"step": 3860
},
{
"epoch": 2.165640738668159,
"grad_norm": 0.33325016498565674,
"learning_rate": 9.447501678973852e-05,
"loss": 0.0208,
"step": 3870
},
{
"epoch": 2.17123670956911,
"grad_norm": 0.2265041172504425,
"learning_rate": 9.443717963884569e-05,
"loss": 0.0177,
"step": 3880
},
{
"epoch": 2.1768326804700617,
"grad_norm": 0.44378480315208435,
"learning_rate": 9.439922099840054e-05,
"loss": 0.0232,
"step": 3890
},
{
"epoch": 2.182428651371013,
"grad_norm": 0.2953107953071594,
"learning_rate": 9.43611409721806e-05,
"loss": 0.0201,
"step": 3900
},
{
"epoch": 2.188024622271964,
"grad_norm": 0.3876049518585205,
"learning_rate": 9.432293966429514e-05,
"loss": 0.0164,
"step": 3910
},
{
"epoch": 2.1936205931729154,
"grad_norm": 0.2669300138950348,
"learning_rate": 9.428461717918511e-05,
"loss": 0.0153,
"step": 3920
},
{
"epoch": 2.199216564073867,
"grad_norm": 0.6801855564117432,
"learning_rate": 9.424617362162271e-05,
"loss": 0.0185,
"step": 3930
},
{
"epoch": 2.204812534974818,
"grad_norm": 0.3502347469329834,
"learning_rate": 9.420760909671118e-05,
"loss": 0.0253,
"step": 3940
},
{
"epoch": 2.2104085058757694,
"grad_norm": 0.3213407099246979,
"learning_rate": 9.416892370988444e-05,
"loss": 0.0221,
"step": 3950
},
{
"epoch": 2.2160044767767206,
"grad_norm": 0.45591723918914795,
"learning_rate": 9.413011756690685e-05,
"loss": 0.0303,
"step": 3960
},
{
"epoch": 2.2216004476776723,
"grad_norm": 0.5190838575363159,
"learning_rate": 9.409119077387294e-05,
"loss": 0.0214,
"step": 3970
},
{
"epoch": 2.2271964185786235,
"grad_norm": 0.24658669531345367,
"learning_rate": 9.405214343720707e-05,
"loss": 0.0169,
"step": 3980
},
{
"epoch": 2.2327923894795747,
"grad_norm": 0.26745668053627014,
"learning_rate": 9.401297566366318e-05,
"loss": 0.0174,
"step": 3990
},
{
"epoch": 2.238388360380526,
"grad_norm": 0.23573242127895355,
"learning_rate": 9.397368756032445e-05,
"loss": 0.0166,
"step": 4000
},
{
"epoch": 2.243984331281477,
"grad_norm": 0.38697415590286255,
"learning_rate": 9.393427923460308e-05,
"loss": 0.0175,
"step": 4010
},
{
"epoch": 2.2495803021824288,
"grad_norm": 0.26302671432495117,
"learning_rate": 9.389475079423988e-05,
"loss": 0.016,
"step": 4020
},
{
"epoch": 2.25517627308338,
"grad_norm": 0.520627498626709,
"learning_rate": 9.385510234730415e-05,
"loss": 0.0196,
"step": 4030
},
{
"epoch": 2.260772243984331,
"grad_norm": 0.3094232976436615,
"learning_rate": 9.381533400219318e-05,
"loss": 0.0197,
"step": 4040
},
{
"epoch": 2.266368214885283,
"grad_norm": 0.3238268196582794,
"learning_rate": 9.377544586763215e-05,
"loss": 0.0242,
"step": 4050
},
{
"epoch": 2.271964185786234,
"grad_norm": 0.37398698925971985,
"learning_rate": 9.373543805267368e-05,
"loss": 0.0225,
"step": 4060
},
{
"epoch": 2.2775601566871853,
"grad_norm": 0.22411245107650757,
"learning_rate": 9.369531066669758e-05,
"loss": 0.0259,
"step": 4070
},
{
"epoch": 2.2831561275881365,
"grad_norm": 0.2310367226600647,
"learning_rate": 9.365506381941066e-05,
"loss": 0.0198,
"step": 4080
},
{
"epoch": 2.2887520984890877,
"grad_norm": 0.4910151958465576,
"learning_rate": 9.36146976208462e-05,
"loss": 0.0234,
"step": 4090
},
{
"epoch": 2.2943480693900393,
"grad_norm": 0.2820461392402649,
"learning_rate": 9.357421218136386e-05,
"loss": 0.0176,
"step": 4100
},
{
"epoch": 2.2999440402909905,
"grad_norm": 0.22990214824676514,
"learning_rate": 9.353360761164931e-05,
"loss": 0.0185,
"step": 4110
},
{
"epoch": 2.3055400111919417,
"grad_norm": 0.33790138363838196,
"learning_rate": 9.349288402271388e-05,
"loss": 0.0178,
"step": 4120
},
{
"epoch": 2.311135982092893,
"grad_norm": 0.3388676345348358,
"learning_rate": 9.345204152589428e-05,
"loss": 0.0147,
"step": 4130
},
{
"epoch": 2.3167319529938446,
"grad_norm": 0.36007586121559143,
"learning_rate": 9.341108023285238e-05,
"loss": 0.0185,
"step": 4140
},
{
"epoch": 2.322327923894796,
"grad_norm": 0.41096752882003784,
"learning_rate": 9.337000025557476e-05,
"loss": 0.0219,
"step": 4150
},
{
"epoch": 2.327923894795747,
"grad_norm": 0.2878301441669464,
"learning_rate": 9.332880170637252e-05,
"loss": 0.0159,
"step": 4160
},
{
"epoch": 2.3335198656966982,
"grad_norm": 0.32061803340911865,
"learning_rate": 9.328748469788093e-05,
"loss": 0.0216,
"step": 4170
},
{
"epoch": 2.33911583659765,
"grad_norm": 0.29178762435913086,
"learning_rate": 9.32460493430591e-05,
"loss": 0.0178,
"step": 4180
},
{
"epoch": 2.344711807498601,
"grad_norm": 0.32889455556869507,
"learning_rate": 9.320449575518972e-05,
"loss": 0.0194,
"step": 4190
},
{
"epoch": 2.3503077783995523,
"grad_norm": 0.2980196475982666,
"learning_rate": 9.316282404787871e-05,
"loss": 0.015,
"step": 4200
},
{
"epoch": 2.3559037493005035,
"grad_norm": 0.21256855130195618,
"learning_rate": 9.31210343350549e-05,
"loss": 0.0151,
"step": 4210
},
{
"epoch": 2.361499720201455,
"grad_norm": 0.2378161996603012,
"learning_rate": 9.30791267309698e-05,
"loss": 0.0179,
"step": 4220
},
{
"epoch": 2.3670956911024064,
"grad_norm": 0.211124449968338,
"learning_rate": 9.30371013501972e-05,
"loss": 0.0147,
"step": 4230
},
{
"epoch": 2.3726916620033576,
"grad_norm": 0.3496321439743042,
"learning_rate": 9.299495830763286e-05,
"loss": 0.0144,
"step": 4240
},
{
"epoch": 2.378287632904309,
"grad_norm": 0.2865016758441925,
"learning_rate": 9.295269771849427e-05,
"loss": 0.0209,
"step": 4250
},
{
"epoch": 2.38388360380526,
"grad_norm": 0.22519885003566742,
"learning_rate": 9.291031969832026e-05,
"loss": 0.0177,
"step": 4260
},
{
"epoch": 2.3894795747062116,
"grad_norm": 0.41060182452201843,
"learning_rate": 9.286782436297073e-05,
"loss": 0.0169,
"step": 4270
},
{
"epoch": 2.395075545607163,
"grad_norm": 0.6265867352485657,
"learning_rate": 9.282521182862629e-05,
"loss": 0.0189,
"step": 4280
},
{
"epoch": 2.400671516508114,
"grad_norm": 0.3811153173446655,
"learning_rate": 9.278248221178798e-05,
"loss": 0.0274,
"step": 4290
},
{
"epoch": 2.4062674874090657,
"grad_norm": 0.2686716318130493,
"learning_rate": 9.273963562927695e-05,
"loss": 0.0198,
"step": 4300
},
{
"epoch": 2.411863458310017,
"grad_norm": 0.31025633215904236,
"learning_rate": 9.269667219823412e-05,
"loss": 0.0159,
"step": 4310
},
{
"epoch": 2.417459429210968,
"grad_norm": 0.23998180031776428,
"learning_rate": 9.265359203611987e-05,
"loss": 0.018,
"step": 4320
},
{
"epoch": 2.4230554001119193,
"grad_norm": 0.45635882019996643,
"learning_rate": 9.261039526071374e-05,
"loss": 0.0199,
"step": 4330
},
{
"epoch": 2.4286513710128705,
"grad_norm": 0.34626588225364685,
"learning_rate": 9.256708199011401e-05,
"loss": 0.0169,
"step": 4340
},
{
"epoch": 2.434247341913822,
"grad_norm": 0.27278828620910645,
"learning_rate": 9.252365234273755e-05,
"loss": 0.0173,
"step": 4350
},
{
"epoch": 2.4398433128147734,
"grad_norm": 0.5236303806304932,
"learning_rate": 9.248010643731935e-05,
"loss": 0.0226,
"step": 4360
},
{
"epoch": 2.4454392837157246,
"grad_norm": 0.27782773971557617,
"learning_rate": 9.243644439291223e-05,
"loss": 0.0194,
"step": 4370
},
{
"epoch": 2.451035254616676,
"grad_norm": 0.280048131942749,
"learning_rate": 9.239266632888659e-05,
"loss": 0.0174,
"step": 4380
},
{
"epoch": 2.4566312255176275,
"grad_norm": 0.3045734763145447,
"learning_rate": 9.234877236492997e-05,
"loss": 0.0148,
"step": 4390
},
{
"epoch": 2.4622271964185787,
"grad_norm": 0.1700965315103531,
"learning_rate": 9.230476262104677e-05,
"loss": 0.0155,
"step": 4400
},
{
"epoch": 2.46782316731953,
"grad_norm": 0.3037347197532654,
"learning_rate": 9.226063721755799e-05,
"loss": 0.0132,
"step": 4410
},
{
"epoch": 2.473419138220481,
"grad_norm": 0.29750266671180725,
"learning_rate": 9.221639627510076e-05,
"loss": 0.0149,
"step": 4420
},
{
"epoch": 2.4790151091214327,
"grad_norm": 0.1919635832309723,
"learning_rate": 9.217203991462815e-05,
"loss": 0.015,
"step": 4430
},
{
"epoch": 2.484611080022384,
"grad_norm": 0.2919257879257202,
"learning_rate": 9.212756825740873e-05,
"loss": 0.0177,
"step": 4440
},
{
"epoch": 2.490207050923335,
"grad_norm": 0.17676684260368347,
"learning_rate": 9.208298142502636e-05,
"loss": 0.0175,
"step": 4450
},
{
"epoch": 2.4958030218242864,
"grad_norm": 0.24397723376750946,
"learning_rate": 9.20382795393797e-05,
"loss": 0.0179,
"step": 4460
},
{
"epoch": 2.501398992725238,
"grad_norm": 0.32645362615585327,
"learning_rate": 9.199346272268199e-05,
"loss": 0.0179,
"step": 4470
},
{
"epoch": 2.5069949636261892,
"grad_norm": 0.35162001848220825,
"learning_rate": 9.194853109746074e-05,
"loss": 0.0174,
"step": 4480
},
{
"epoch": 2.5125909345271404,
"grad_norm": 0.4019016623497009,
"learning_rate": 9.190348478655724e-05,
"loss": 0.015,
"step": 4490
},
{
"epoch": 2.5181869054280916,
"grad_norm": 0.4017965495586395,
"learning_rate": 9.185832391312644e-05,
"loss": 0.0238,
"step": 4500
},
{
"epoch": 2.523782876329043,
"grad_norm": 0.41645774245262146,
"learning_rate": 9.18130486006364e-05,
"loss": 0.0143,
"step": 4510
},
{
"epoch": 2.5293788472299945,
"grad_norm": 0.28400033712387085,
"learning_rate": 9.176765897286813e-05,
"loss": 0.0196,
"step": 4520
},
{
"epoch": 2.5349748181309457,
"grad_norm": 0.4045359492301941,
"learning_rate": 9.17221551539151e-05,
"loss": 0.0191,
"step": 4530
},
{
"epoch": 2.540570789031897,
"grad_norm": 0.37660202383995056,
"learning_rate": 9.167653726818305e-05,
"loss": 0.0138,
"step": 4540
},
{
"epoch": 2.5461667599328486,
"grad_norm": 0.35835906863212585,
"learning_rate": 9.163080544038952e-05,
"loss": 0.0213,
"step": 4550
},
{
"epoch": 2.5517627308338,
"grad_norm": 0.3906223177909851,
"learning_rate": 9.158495979556358e-05,
"loss": 0.0204,
"step": 4560
},
{
"epoch": 2.557358701734751,
"grad_norm": 0.23904386162757874,
"learning_rate": 9.153900045904549e-05,
"loss": 0.0193,
"step": 4570
},
{
"epoch": 2.562954672635702,
"grad_norm": 0.3690219521522522,
"learning_rate": 9.14929275564863e-05,
"loss": 0.0218,
"step": 4580
},
{
"epoch": 2.5685506435366534,
"grad_norm": 0.3098298907279968,
"learning_rate": 9.144674121384757e-05,
"loss": 0.0142,
"step": 4590
},
{
"epoch": 2.574146614437605,
"grad_norm": 0.5726227164268494,
"learning_rate": 9.140044155740101e-05,
"loss": 0.0168,
"step": 4600
},
{
"epoch": 2.5797425853385563,
"grad_norm": 0.32549935579299927,
"learning_rate": 9.135402871372808e-05,
"loss": 0.0228,
"step": 4610
},
{
"epoch": 2.5853385562395075,
"grad_norm": 0.35607558488845825,
"learning_rate": 9.130750280971978e-05,
"loss": 0.0234,
"step": 4620
},
{
"epoch": 2.590934527140459,
"grad_norm": 0.31833362579345703,
"learning_rate": 9.126086397257612e-05,
"loss": 0.0134,
"step": 4630
},
{
"epoch": 2.5965304980414103,
"grad_norm": 0.5075991749763489,
"learning_rate": 9.121411232980588e-05,
"loss": 0.0181,
"step": 4640
},
{
"epoch": 2.6021264689423615,
"grad_norm": 0.2868656814098358,
"learning_rate": 9.116724800922629e-05,
"loss": 0.0216,
"step": 4650
},
{
"epoch": 2.6077224398433128,
"grad_norm": 0.38551998138427734,
"learning_rate": 9.112027113896262e-05,
"loss": 0.0218,
"step": 4660
},
{
"epoch": 2.613318410744264,
"grad_norm": 0.3080727756023407,
"learning_rate": 9.107318184744781e-05,
"loss": 0.0263,
"step": 4670
},
{
"epoch": 2.618914381645215,
"grad_norm": 0.2743169665336609,
"learning_rate": 9.102598026342222e-05,
"loss": 0.0143,
"step": 4680
},
{
"epoch": 2.624510352546167,
"grad_norm": 0.286101758480072,
"learning_rate": 9.097866651593317e-05,
"loss": 0.0219,
"step": 4690
},
{
"epoch": 2.630106323447118,
"grad_norm": 0.1881791204214096,
"learning_rate": 9.093124073433463e-05,
"loss": 0.015,
"step": 4700
},
{
"epoch": 2.6357022943480692,
"grad_norm": 0.3556104004383087,
"learning_rate": 9.088370304828685e-05,
"loss": 0.0207,
"step": 4710
},
{
"epoch": 2.641298265249021,
"grad_norm": 0.2784225344657898,
"learning_rate": 9.083605358775612e-05,
"loss": 0.0159,
"step": 4720
},
{
"epoch": 2.646894236149972,
"grad_norm": 0.22262175381183624,
"learning_rate": 9.078829248301417e-05,
"loss": 0.0162,
"step": 4730
},
{
"epoch": 2.6524902070509233,
"grad_norm": 0.16783557832241058,
"learning_rate": 9.074041986463808e-05,
"loss": 0.018,
"step": 4740
},
{
"epoch": 2.6580861779518745,
"grad_norm": 0.31983381509780884,
"learning_rate": 9.069243586350975e-05,
"loss": 0.0168,
"step": 4750
},
{
"epoch": 2.6636821488528257,
"grad_norm": 0.2954675555229187,
"learning_rate": 9.064434061081562e-05,
"loss": 0.0157,
"step": 4760
},
{
"epoch": 2.6692781197537774,
"grad_norm": 0.37835440039634705,
"learning_rate": 9.059613423804623e-05,
"loss": 0.016,
"step": 4770
},
{
"epoch": 2.6748740906547286,
"grad_norm": 0.30182933807373047,
"learning_rate": 9.0547816876996e-05,
"loss": 0.0223,
"step": 4780
},
{
"epoch": 2.68047006155568,
"grad_norm": 0.3329738974571228,
"learning_rate": 9.049938865976275e-05,
"loss": 0.0232,
"step": 4790
},
{
"epoch": 2.6860660324566314,
"grad_norm": 0.2866031527519226,
"learning_rate": 9.045084971874738e-05,
"loss": 0.0193,
"step": 4800
},
{
"epoch": 2.6916620033575827,
"grad_norm": 0.3558676540851593,
"learning_rate": 9.040220018665347e-05,
"loss": 0.0181,
"step": 4810
},
{
"epoch": 2.697257974258534,
"grad_norm": 0.22001361846923828,
"learning_rate": 9.035344019648702e-05,
"loss": 0.0124,
"step": 4820
},
{
"epoch": 2.702853945159485,
"grad_norm": 0.28986766934394836,
"learning_rate": 9.030456988155596e-05,
"loss": 0.0179,
"step": 4830
},
{
"epoch": 2.7084499160604363,
"grad_norm": 0.3889327347278595,
"learning_rate": 9.025558937546988e-05,
"loss": 0.0186,
"step": 4840
},
{
"epoch": 2.714045886961388,
"grad_norm": 0.33833345770835876,
"learning_rate": 9.020649881213958e-05,
"loss": 0.0161,
"step": 4850
},
{
"epoch": 2.719641857862339,
"grad_norm": 0.23896977305412292,
"learning_rate": 9.015729832577681e-05,
"loss": 0.0149,
"step": 4860
},
{
"epoch": 2.7252378287632903,
"grad_norm": 0.44981443881988525,
"learning_rate": 9.010798805089384e-05,
"loss": 0.0221,
"step": 4870
},
{
"epoch": 2.730833799664242,
"grad_norm": 0.4389462471008301,
"learning_rate": 9.005856812230304e-05,
"loss": 0.0175,
"step": 4880
},
{
"epoch": 2.736429770565193,
"grad_norm": 0.2757073640823364,
"learning_rate": 9.000903867511666e-05,
"loss": 0.0176,
"step": 4890
},
{
"epoch": 2.7420257414661444,
"grad_norm": 0.2381424754858017,
"learning_rate": 8.995939984474624e-05,
"loss": 0.0145,
"step": 4900
},
{
"epoch": 2.7476217123670956,
"grad_norm": 0.25083616375923157,
"learning_rate": 8.990965176690252e-05,
"loss": 0.0184,
"step": 4910
},
{
"epoch": 2.753217683268047,
"grad_norm": 0.3651309013366699,
"learning_rate": 8.98597945775948e-05,
"loss": 0.0201,
"step": 4920
},
{
"epoch": 2.7588136541689985,
"grad_norm": 0.19562850892543793,
"learning_rate": 8.980982841313074e-05,
"loss": 0.0158,
"step": 4930
},
{
"epoch": 2.7644096250699497,
"grad_norm": 0.646306037902832,
"learning_rate": 8.975975341011596e-05,
"loss": 0.0172,
"step": 4940
},
{
"epoch": 2.770005595970901,
"grad_norm": 0.5771059393882751,
"learning_rate": 8.970956970545355e-05,
"loss": 0.0181,
"step": 4950
},
{
"epoch": 2.775601566871852,
"grad_norm": 0.2918018400669098,
"learning_rate": 8.965927743634391e-05,
"loss": 0.0199,
"step": 4960
},
{
"epoch": 2.7811975377728038,
"grad_norm": 0.5034765601158142,
"learning_rate": 8.96088767402841e-05,
"loss": 0.0172,
"step": 4970
},
{
"epoch": 2.786793508673755,
"grad_norm": 0.29646632075309753,
"learning_rate": 8.955836775506776e-05,
"loss": 0.0147,
"step": 4980
},
{
"epoch": 2.792389479574706,
"grad_norm": 0.2613969147205353,
"learning_rate": 8.950775061878453e-05,
"loss": 0.0164,
"step": 4990
},
{
"epoch": 2.7979854504756574,
"grad_norm": 0.27573442459106445,
"learning_rate": 8.945702546981969e-05,
"loss": 0.018,
"step": 5000
},
{
"epoch": 2.8035814213766086,
"grad_norm": 0.33170339465141296,
"learning_rate": 8.940619244685388e-05,
"loss": 0.019,
"step": 5010
},
{
"epoch": 2.8091773922775602,
"grad_norm": 0.2994827628135681,
"learning_rate": 8.935525168886262e-05,
"loss": 0.019,
"step": 5020
},
{
"epoch": 2.8147733631785115,
"grad_norm": 0.3199397921562195,
"learning_rate": 8.930420333511606e-05,
"loss": 0.0172,
"step": 5030
},
{
"epoch": 2.8203693340794627,
"grad_norm": 0.24537423253059387,
"learning_rate": 8.92530475251784e-05,
"loss": 0.0146,
"step": 5040
},
{
"epoch": 2.8259653049804143,
"grad_norm": 0.24761222302913666,
"learning_rate": 8.920178439890765e-05,
"loss": 0.0194,
"step": 5050
},
{
"epoch": 2.8315612758813655,
"grad_norm": 0.2208421230316162,
"learning_rate": 8.91504140964553e-05,
"loss": 0.0123,
"step": 5060
},
{
"epoch": 2.8371572467823167,
"grad_norm": 0.3568471074104309,
"learning_rate": 8.909893675826574e-05,
"loss": 0.0147,
"step": 5070
},
{
"epoch": 2.842753217683268,
"grad_norm": 0.24207855761051178,
"learning_rate": 8.90473525250761e-05,
"loss": 0.0166,
"step": 5080
},
{
"epoch": 2.848349188584219,
"grad_norm": 0.47056907415390015,
"learning_rate": 8.899566153791566e-05,
"loss": 0.0234,
"step": 5090
},
{
"epoch": 2.853945159485171,
"grad_norm": 0.26351991295814514,
"learning_rate": 8.894386393810563e-05,
"loss": 0.0212,
"step": 5100
},
{
"epoch": 2.859541130386122,
"grad_norm": 0.2002822756767273,
"learning_rate": 8.889195986725865e-05,
"loss": 0.0191,
"step": 5110
},
{
"epoch": 2.865137101287073,
"grad_norm": 0.28489527106285095,
"learning_rate": 8.883994946727849e-05,
"loss": 0.0155,
"step": 5120
},
{
"epoch": 2.870733072188025,
"grad_norm": 0.30861204862594604,
"learning_rate": 8.878783288035957e-05,
"loss": 0.0158,
"step": 5130
},
{
"epoch": 2.876329043088976,
"grad_norm": 0.2856840193271637,
"learning_rate": 8.873561024898668e-05,
"loss": 0.0201,
"step": 5140
},
{
"epoch": 2.8819250139899273,
"grad_norm": 0.3461334705352783,
"learning_rate": 8.868328171593448e-05,
"loss": 0.0184,
"step": 5150
},
{
"epoch": 2.8875209848908785,
"grad_norm": 0.22160184383392334,
"learning_rate": 8.863084742426719e-05,
"loss": 0.0171,
"step": 5160
},
{
"epoch": 2.8931169557918297,
"grad_norm": 0.2488642781972885,
"learning_rate": 8.857830751733815e-05,
"loss": 0.0153,
"step": 5170
},
{
"epoch": 2.8987129266927814,
"grad_norm": 0.33482569456100464,
"learning_rate": 8.852566213878947e-05,
"loss": 0.0189,
"step": 5180
},
{
"epoch": 2.9043088975937326,
"grad_norm": 0.2865656316280365,
"learning_rate": 8.84729114325516e-05,
"loss": 0.0168,
"step": 5190
},
{
"epoch": 2.9099048684946838,
"grad_norm": 0.3801150321960449,
"learning_rate": 8.842005554284296e-05,
"loss": 0.0149,
"step": 5200
},
{
"epoch": 2.915500839395635,
"grad_norm": 0.24389003217220306,
"learning_rate": 8.836709461416952e-05,
"loss": 0.0176,
"step": 5210
},
{
"epoch": 2.9210968102965866,
"grad_norm": 0.4815085828304291,
"learning_rate": 8.831402879132446e-05,
"loss": 0.014,
"step": 5220
},
{
"epoch": 2.926692781197538,
"grad_norm": 0.2196839153766632,
"learning_rate": 8.82608582193877e-05,
"loss": 0.0174,
"step": 5230
},
{
"epoch": 2.932288752098489,
"grad_norm": 0.30073830485343933,
"learning_rate": 8.820758304372557e-05,
"loss": 0.0168,
"step": 5240
},
{
"epoch": 2.9378847229994403,
"grad_norm": 0.21486796438694,
"learning_rate": 8.815420340999033e-05,
"loss": 0.0128,
"step": 5250
},
{
"epoch": 2.9434806939003915,
"grad_norm": 0.31880220770835876,
"learning_rate": 8.810071946411989e-05,
"loss": 0.0209,
"step": 5260
},
{
"epoch": 2.949076664801343,
"grad_norm": 0.20475736260414124,
"learning_rate": 8.804713135233731e-05,
"loss": 0.0152,
"step": 5270
},
{
"epoch": 2.9546726357022943,
"grad_norm": 0.19735224545001984,
"learning_rate": 8.799343922115044e-05,
"loss": 0.0104,
"step": 5280
},
{
"epoch": 2.9602686066032455,
"grad_norm": 0.17013341188430786,
"learning_rate": 8.79396432173515e-05,
"loss": 0.0129,
"step": 5290
},
{
"epoch": 2.965864577504197,
"grad_norm": 0.38702845573425293,
"learning_rate": 8.788574348801675e-05,
"loss": 0.0239,
"step": 5300
},
{
"epoch": 2.9714605484051484,
"grad_norm": 0.34306514263153076,
"learning_rate": 8.783174018050594e-05,
"loss": 0.03,
"step": 5310
},
{
"epoch": 2.9770565193060996,
"grad_norm": 0.26854732632637024,
"learning_rate": 8.77776334424621e-05,
"loss": 0.019,
"step": 5320
},
{
"epoch": 2.982652490207051,
"grad_norm": 0.28458869457244873,
"learning_rate": 8.772342342181095e-05,
"loss": 0.0213,
"step": 5330
},
{
"epoch": 2.988248461108002,
"grad_norm": 0.28708454966545105,
"learning_rate": 8.766911026676064e-05,
"loss": 0.0173,
"step": 5340
},
{
"epoch": 2.9938444320089537,
"grad_norm": 0.35600361227989197,
"learning_rate": 8.761469412580125e-05,
"loss": 0.0179,
"step": 5350
},
{
"epoch": 2.999440402909905,
"grad_norm": 0.29637375473976135,
"learning_rate": 8.756017514770443e-05,
"loss": 0.0223,
"step": 5360
},
{
"epoch": 3.005036373810856,
"grad_norm": 0.39075925946235657,
"learning_rate": 8.750555348152298e-05,
"loss": 0.0148,
"step": 5370
},
{
"epoch": 3.0106323447118073,
"grad_norm": 0.3552566468715668,
"learning_rate": 8.745082927659047e-05,
"loss": 0.0187,
"step": 5380
},
{
"epoch": 3.016228315612759,
"grad_norm": 0.2608230710029602,
"learning_rate": 8.739600268252078e-05,
"loss": 0.0205,
"step": 5390
},
{
"epoch": 3.02182428651371,
"grad_norm": 0.2771034240722656,
"learning_rate": 8.73410738492077e-05,
"loss": 0.0187,
"step": 5400
},
{
"epoch": 3.0274202574146614,
"grad_norm": 0.2750489413738251,
"learning_rate": 8.728604292682459e-05,
"loss": 0.0161,
"step": 5410
},
{
"epoch": 3.0330162283156126,
"grad_norm": 0.3373420834541321,
"learning_rate": 8.723091006582389e-05,
"loss": 0.0193,
"step": 5420
},
{
"epoch": 3.0386121992165642,
"grad_norm": 0.27592456340789795,
"learning_rate": 8.717567541693673e-05,
"loss": 0.0171,
"step": 5430
},
{
"epoch": 3.0442081701175154,
"grad_norm": 0.3381069004535675,
"learning_rate": 8.71203391311725e-05,
"loss": 0.0185,
"step": 5440
},
{
"epoch": 3.0498041410184666,
"grad_norm": 0.342650830745697,
"learning_rate": 8.706490135981855e-05,
"loss": 0.0223,
"step": 5450
},
{
"epoch": 3.055400111919418,
"grad_norm": 0.2777611017227173,
"learning_rate": 8.700936225443959e-05,
"loss": 0.0135,
"step": 5460
},
{
"epoch": 3.0609960828203695,
"grad_norm": 0.26987946033477783,
"learning_rate": 8.695372196687743e-05,
"loss": 0.0182,
"step": 5470
},
{
"epoch": 3.0665920537213207,
"grad_norm": 0.24877256155014038,
"learning_rate": 8.689798064925049e-05,
"loss": 0.015,
"step": 5480
},
{
"epoch": 3.072188024622272,
"grad_norm": 0.31654706597328186,
"learning_rate": 8.684213845395339e-05,
"loss": 0.0142,
"step": 5490
},
{
"epoch": 3.077783995523223,
"grad_norm": 0.22976505756378174,
"learning_rate": 8.678619553365659e-05,
"loss": 0.0119,
"step": 5500
},
{
"epoch": 3.083379966424175,
"grad_norm": 0.3443313241004944,
"learning_rate": 8.673015204130586e-05,
"loss": 0.0138,
"step": 5510
},
{
"epoch": 3.088975937325126,
"grad_norm": 0.34815511107444763,
"learning_rate": 8.6674008130122e-05,
"loss": 0.0127,
"step": 5520
},
{
"epoch": 3.094571908226077,
"grad_norm": 0.392868310213089,
"learning_rate": 8.661776395360029e-05,
"loss": 0.0148,
"step": 5530
},
{
"epoch": 3.1001678791270284,
"grad_norm": 0.15690505504608154,
"learning_rate": 8.656141966551019e-05,
"loss": 0.0158,
"step": 5540
},
{
"epoch": 3.10576385002798,
"grad_norm": 0.2958482503890991,
"learning_rate": 8.650497541989482e-05,
"loss": 0.015,
"step": 5550
},
{
"epoch": 3.1113598209289313,
"grad_norm": 0.34652698040008545,
"learning_rate": 8.644843137107059e-05,
"loss": 0.0186,
"step": 5560
},
{
"epoch": 3.1169557918298825,
"grad_norm": 0.2787473201751709,
"learning_rate": 8.639178767362676e-05,
"loss": 0.0171,
"step": 5570
},
{
"epoch": 3.1225517627308337,
"grad_norm": 0.28770115971565247,
"learning_rate": 8.633504448242505e-05,
"loss": 0.0088,
"step": 5580
},
{
"epoch": 3.128147733631785,
"grad_norm": 0.16269604861736298,
"learning_rate": 8.627820195259918e-05,
"loss": 0.0144,
"step": 5590
},
{
"epoch": 3.1337437045327365,
"grad_norm": 0.2170538753271103,
"learning_rate": 8.622126023955446e-05,
"loss": 0.0145,
"step": 5600
},
{
"epoch": 3.1393396754336877,
"grad_norm": 0.1933916211128235,
"learning_rate": 8.616421949896734e-05,
"loss": 0.0145,
"step": 5610
},
{
"epoch": 3.144935646334639,
"grad_norm": 0.28321388363838196,
"learning_rate": 8.610707988678503e-05,
"loss": 0.0171,
"step": 5620
},
{
"epoch": 3.1505316172355906,
"grad_norm": 0.1729007363319397,
"learning_rate": 8.604984155922506e-05,
"loss": 0.0103,
"step": 5630
},
{
"epoch": 3.156127588136542,
"grad_norm": 0.41079893708229065,
"learning_rate": 8.599250467277483e-05,
"loss": 0.0159,
"step": 5640
},
{
"epoch": 3.161723559037493,
"grad_norm": 0.4628431797027588,
"learning_rate": 8.59350693841912e-05,
"loss": 0.0184,
"step": 5650
},
{
"epoch": 3.1673195299384442,
"grad_norm": 0.30907726287841797,
"learning_rate": 8.587753585050004e-05,
"loss": 0.0183,
"step": 5660
},
{
"epoch": 3.1729155008393954,
"grad_norm": 0.19282157719135284,
"learning_rate": 8.581990422899585e-05,
"loss": 0.0127,
"step": 5670
},
{
"epoch": 3.178511471740347,
"grad_norm": 0.27166658639907837,
"learning_rate": 8.576217467724128e-05,
"loss": 0.023,
"step": 5680
},
{
"epoch": 3.1841074426412983,
"grad_norm": 0.3486577272415161,
"learning_rate": 8.570434735306671e-05,
"loss": 0.0108,
"step": 5690
},
{
"epoch": 3.1897034135422495,
"grad_norm": 0.295238733291626,
"learning_rate": 8.564642241456986e-05,
"loss": 0.0181,
"step": 5700
},
{
"epoch": 3.1952993844432007,
"grad_norm": 0.20616333186626434,
"learning_rate": 8.558840002011528e-05,
"loss": 0.0202,
"step": 5710
},
{
"epoch": 3.2008953553441524,
"grad_norm": 0.12979304790496826,
"learning_rate": 8.553028032833397e-05,
"loss": 0.0125,
"step": 5720
},
{
"epoch": 3.2064913262451036,
"grad_norm": 0.23997394740581512,
"learning_rate": 8.547206349812298e-05,
"loss": 0.0159,
"step": 5730
},
{
"epoch": 3.212087297146055,
"grad_norm": 0.2359701246023178,
"learning_rate": 8.541374968864487e-05,
"loss": 0.0136,
"step": 5740
},
{
"epoch": 3.217683268047006,
"grad_norm": 0.25309842824935913,
"learning_rate": 8.535533905932738e-05,
"loss": 0.0154,
"step": 5750
},
{
"epoch": 3.2232792389479576,
"grad_norm": 0.26648661494255066,
"learning_rate": 8.529683176986295e-05,
"loss": 0.0132,
"step": 5760
},
{
"epoch": 3.228875209848909,
"grad_norm": 0.32268235087394714,
"learning_rate": 8.523822798020827e-05,
"loss": 0.0133,
"step": 5770
},
{
"epoch": 3.23447118074986,
"grad_norm": 0.2632688283920288,
"learning_rate": 8.517952785058385e-05,
"loss": 0.017,
"step": 5780
},
{
"epoch": 3.2400671516508113,
"grad_norm": 0.16985219717025757,
"learning_rate": 8.512073154147362e-05,
"loss": 0.0143,
"step": 5790
},
{
"epoch": 3.245663122551763,
"grad_norm": 0.23951981961727142,
"learning_rate": 8.506183921362443e-05,
"loss": 0.0157,
"step": 5800
},
{
"epoch": 3.251259093452714,
"grad_norm": 0.36843812465667725,
"learning_rate": 8.500285102804568e-05,
"loss": 0.0198,
"step": 5810
},
{
"epoch": 3.2568550643536653,
"grad_norm": 0.27591267228126526,
"learning_rate": 8.494376714600878e-05,
"loss": 0.0246,
"step": 5820
},
{
"epoch": 3.2624510352546165,
"grad_norm": 0.3020281195640564,
"learning_rate": 8.488458772904684e-05,
"loss": 0.018,
"step": 5830
},
{
"epoch": 3.2680470061555678,
"grad_norm": 0.20429036021232605,
"learning_rate": 8.482531293895412e-05,
"loss": 0.0154,
"step": 5840
},
{
"epoch": 3.2736429770565194,
"grad_norm": 0.3011918067932129,
"learning_rate": 8.476594293778561e-05,
"loss": 0.0181,
"step": 5850
},
{
"epoch": 3.2792389479574706,
"grad_norm": 0.20082388818264008,
"learning_rate": 8.470647788785665e-05,
"loss": 0.0118,
"step": 5860
},
{
"epoch": 3.284834918858422,
"grad_norm": 0.25404563546180725,
"learning_rate": 8.46469179517424e-05,
"loss": 0.0122,
"step": 5870
},
{
"epoch": 3.2904308897593735,
"grad_norm": 0.17162342369556427,
"learning_rate": 8.458726329227747e-05,
"loss": 0.0178,
"step": 5880
},
{
"epoch": 3.2960268606603247,
"grad_norm": 0.2713855803012848,
"learning_rate": 8.452751407255541e-05,
"loss": 0.0127,
"step": 5890
},
{
"epoch": 3.301622831561276,
"grad_norm": 0.25792196393013,
"learning_rate": 8.44676704559283e-05,
"loss": 0.0151,
"step": 5900
},
{
"epoch": 3.307218802462227,
"grad_norm": 0.24708054959774017,
"learning_rate": 8.44077326060063e-05,
"loss": 0.0205,
"step": 5910
},
{
"epoch": 3.3128147733631783,
"grad_norm": 0.22907878458499908,
"learning_rate": 8.434770068665723e-05,
"loss": 0.0196,
"step": 5920
},
{
"epoch": 3.31841074426413,
"grad_norm": 0.42451682686805725,
"learning_rate": 8.428757486200603e-05,
"loss": 0.0181,
"step": 5930
},
{
"epoch": 3.324006715165081,
"grad_norm": 0.2787477970123291,
"learning_rate": 8.422735529643444e-05,
"loss": 0.0163,
"step": 5940
},
{
"epoch": 3.3296026860660324,
"grad_norm": 0.2536604404449463,
"learning_rate": 8.416704215458043e-05,
"loss": 0.0153,
"step": 5950
},
{
"epoch": 3.3351986569669836,
"grad_norm": 0.27685803174972534,
"learning_rate": 8.410663560133784e-05,
"loss": 0.0171,
"step": 5960
},
{
"epoch": 3.3407946278679352,
"grad_norm": 0.21129871904850006,
"learning_rate": 8.404613580185585e-05,
"loss": 0.0146,
"step": 5970
},
{
"epoch": 3.3463905987688864,
"grad_norm": 0.2712884247303009,
"learning_rate": 8.398554292153866e-05,
"loss": 0.0124,
"step": 5980
},
{
"epoch": 3.3519865696698377,
"grad_norm": 0.28807780146598816,
"learning_rate": 8.392485712604483e-05,
"loss": 0.0151,
"step": 5990
},
{
"epoch": 3.357582540570789,
"grad_norm": 0.24215184152126312,
"learning_rate": 8.386407858128706e-05,
"loss": 0.0201,
"step": 6000
},
{
"epoch": 3.3631785114717405,
"grad_norm": 0.3111182451248169,
"learning_rate": 8.380320745343153e-05,
"loss": 0.0148,
"step": 6010
},
{
"epoch": 3.3687744823726917,
"grad_norm": 0.3122502267360687,
"learning_rate": 8.37422439088976e-05,
"loss": 0.0138,
"step": 6020
},
{
"epoch": 3.374370453273643,
"grad_norm": 0.23829977214336395,
"learning_rate": 8.368118811435726e-05,
"loss": 0.0172,
"step": 6030
},
{
"epoch": 3.379966424174594,
"grad_norm": 0.22568489611148834,
"learning_rate": 8.362004023673474e-05,
"loss": 0.0191,
"step": 6040
},
{
"epoch": 3.385562395075546,
"grad_norm": 0.37260109186172485,
"learning_rate": 8.355880044320598e-05,
"loss": 0.0146,
"step": 6050
},
{
"epoch": 3.391158365976497,
"grad_norm": 0.36467012763023376,
"learning_rate": 8.349746890119826e-05,
"loss": 0.0144,
"step": 6060
},
{
"epoch": 3.396754336877448,
"grad_norm": 0.28992265462875366,
"learning_rate": 8.343604577838964e-05,
"loss": 0.014,
"step": 6070
},
{
"epoch": 3.4023503077783994,
"grad_norm": 0.3018409311771393,
"learning_rate": 8.337453124270863e-05,
"loss": 0.0126,
"step": 6080
},
{
"epoch": 3.4079462786793506,
"grad_norm": 0.31771036982536316,
"learning_rate": 8.331292546233362e-05,
"loss": 0.0124,
"step": 6090
},
{
"epoch": 3.4135422495803023,
"grad_norm": 0.2008838802576065,
"learning_rate": 8.32512286056924e-05,
"loss": 0.0181,
"step": 6100
},
{
"epoch": 3.4191382204812535,
"grad_norm": 0.3000880777835846,
"learning_rate": 8.318944084146192e-05,
"loss": 0.0178,
"step": 6110
},
{
"epoch": 3.4247341913822047,
"grad_norm": 0.201462984085083,
"learning_rate": 8.31275623385675e-05,
"loss": 0.0121,
"step": 6120
},
{
"epoch": 3.4303301622831563,
"grad_norm": 0.29394298791885376,
"learning_rate": 8.306559326618259e-05,
"loss": 0.019,
"step": 6130
},
{
"epoch": 3.4359261331841076,
"grad_norm": 0.20683641731739044,
"learning_rate": 8.300353379372834e-05,
"loss": 0.0157,
"step": 6140
},
{
"epoch": 3.4415221040850588,
"grad_norm": 0.2323373705148697,
"learning_rate": 8.29413840908729e-05,
"loss": 0.0132,
"step": 6150
},
{
"epoch": 3.44711807498601,
"grad_norm": 0.28800690174102783,
"learning_rate": 8.287914432753123e-05,
"loss": 0.0149,
"step": 6160
},
{
"epoch": 3.452714045886961,
"grad_norm": 0.24825571477413177,
"learning_rate": 8.281681467386446e-05,
"loss": 0.0143,
"step": 6170
},
{
"epoch": 3.458310016787913,
"grad_norm": 0.26586174964904785,
"learning_rate": 8.275439530027948e-05,
"loss": 0.0193,
"step": 6180
},
{
"epoch": 3.463905987688864,
"grad_norm": 0.384670615196228,
"learning_rate": 8.269188637742846e-05,
"loss": 0.0135,
"step": 6190
},
{
"epoch": 3.4695019585898152,
"grad_norm": 0.2598379850387573,
"learning_rate": 8.262928807620843e-05,
"loss": 0.0192,
"step": 6200
},
{
"epoch": 3.4750979294907665,
"grad_norm": 0.26824334263801575,
"learning_rate": 8.256660056776076e-05,
"loss": 0.017,
"step": 6210
},
{
"epoch": 3.480693900391718,
"grad_norm": 0.29601970314979553,
"learning_rate": 8.250382402347065e-05,
"loss": 0.0236,
"step": 6220
},
{
"epoch": 3.4862898712926693,
"grad_norm": 0.2569962739944458,
"learning_rate": 8.244095861496686e-05,
"loss": 0.0148,
"step": 6230
},
{
"epoch": 3.4918858421936205,
"grad_norm": 0.18870459496974945,
"learning_rate": 8.237800451412095e-05,
"loss": 0.0166,
"step": 6240
},
{
"epoch": 3.4974818130945717,
"grad_norm": 0.20874905586242676,
"learning_rate": 8.231496189304704e-05,
"loss": 0.012,
"step": 6250
},
{
"epoch": 3.5030777839955234,
"grad_norm": 0.456989586353302,
"learning_rate": 8.225183092410128e-05,
"loss": 0.0174,
"step": 6260
},
{
"epoch": 3.5086737548964746,
"grad_norm": 0.3724716305732727,
"learning_rate": 8.218861177988129e-05,
"loss": 0.0164,
"step": 6270
},
{
"epoch": 3.514269725797426,
"grad_norm": 0.2510260343551636,
"learning_rate": 8.212530463322583e-05,
"loss": 0.014,
"step": 6280
},
{
"epoch": 3.519865696698377,
"grad_norm": 0.17292679846286774,
"learning_rate": 8.206190965721419e-05,
"loss": 0.0135,
"step": 6290
},
{
"epoch": 3.5254616675993287,
"grad_norm": 0.25856831669807434,
"learning_rate": 8.199842702516583e-05,
"loss": 0.0159,
"step": 6300
},
{
"epoch": 3.53105763850028,
"grad_norm": 0.26525381207466125,
"learning_rate": 8.193485691063985e-05,
"loss": 0.0132,
"step": 6310
},
{
"epoch": 3.536653609401231,
"grad_norm": 0.319915235042572,
"learning_rate": 8.18711994874345e-05,
"loss": 0.0113,
"step": 6320
},
{
"epoch": 3.5422495803021823,
"grad_norm": 0.23749981820583344,
"learning_rate": 8.180745492958674e-05,
"loss": 0.0145,
"step": 6330
},
{
"epoch": 3.5478455512031335,
"grad_norm": 0.25086531043052673,
"learning_rate": 8.174362341137177e-05,
"loss": 0.0165,
"step": 6340
},
{
"epoch": 3.553441522104085,
"grad_norm": 0.19675312936306,
"learning_rate": 8.167970510730253e-05,
"loss": 0.0155,
"step": 6350
},
{
"epoch": 3.5590374930050364,
"grad_norm": 0.2085702270269394,
"learning_rate": 8.161570019212921e-05,
"loss": 0.0155,
"step": 6360
},
{
"epoch": 3.5646334639059876,
"grad_norm": 0.4404468536376953,
"learning_rate": 8.155160884083881e-05,
"loss": 0.0208,
"step": 6370
},
{
"epoch": 3.570229434806939,
"grad_norm": 0.10625205188989639,
"learning_rate": 8.148743122865463e-05,
"loss": 0.015,
"step": 6380
},
{
"epoch": 3.5758254057078904,
"grad_norm": 0.34253987669944763,
"learning_rate": 8.14231675310358e-05,
"loss": 0.0229,
"step": 6390
},
{
"epoch": 3.5814213766088416,
"grad_norm": 0.43956324458122253,
"learning_rate": 8.135881792367686e-05,
"loss": 0.0181,
"step": 6400
},
{
"epoch": 3.587017347509793,
"grad_norm": 0.45199209451675415,
"learning_rate": 8.129438258250712e-05,
"loss": 0.0198,
"step": 6410
},
{
"epoch": 3.592613318410744,
"grad_norm": 0.2245771586894989,
"learning_rate": 8.12298616836904e-05,
"loss": 0.0141,
"step": 6420
},
{
"epoch": 3.5982092893116957,
"grad_norm": 0.3338348865509033,
"learning_rate": 8.116525540362434e-05,
"loss": 0.0168,
"step": 6430
},
{
"epoch": 3.603805260212647,
"grad_norm": 0.21632985770702362,
"learning_rate": 8.110056391894005e-05,
"loss": 0.0117,
"step": 6440
},
{
"epoch": 3.609401231113598,
"grad_norm": 0.2893829643726349,
"learning_rate": 8.103578740650156e-05,
"loss": 0.0166,
"step": 6450
},
{
"epoch": 3.6149972020145498,
"grad_norm": 0.24873918294906616,
"learning_rate": 8.097092604340542e-05,
"loss": 0.0139,
"step": 6460
},
{
"epoch": 3.620593172915501,
"grad_norm": 0.31232985854148865,
"learning_rate": 8.090598000698009e-05,
"loss": 0.0122,
"step": 6470
},
{
"epoch": 3.626189143816452,
"grad_norm": 0.20202654600143433,
"learning_rate": 8.084094947478556e-05,
"loss": 0.0126,
"step": 6480
},
{
"epoch": 3.6317851147174034,
"grad_norm": 0.339890718460083,
"learning_rate": 8.077583462461283e-05,
"loss": 0.0107,
"step": 6490
},
{
"epoch": 3.6373810856183546,
"grad_norm": 0.17959007620811462,
"learning_rate": 8.07106356344834e-05,
"loss": 0.0125,
"step": 6500
},
{
"epoch": 3.6429770565193063,
"grad_norm": 0.21795189380645752,
"learning_rate": 8.064535268264883e-05,
"loss": 0.0202,
"step": 6510
},
{
"epoch": 3.6485730274202575,
"grad_norm": 0.17131085693836212,
"learning_rate": 8.057998594759022e-05,
"loss": 0.0197,
"step": 6520
},
{
"epoch": 3.6541689983212087,
"grad_norm": 0.180596724152565,
"learning_rate": 8.051453560801772e-05,
"loss": 0.0128,
"step": 6530
},
{
"epoch": 3.65976496922216,
"grad_norm": 0.23086079955101013,
"learning_rate": 8.044900184287007e-05,
"loss": 0.0171,
"step": 6540
},
{
"epoch": 3.6653609401231115,
"grad_norm": 0.40819284319877625,
"learning_rate": 8.038338483131407e-05,
"loss": 0.0162,
"step": 6550
},
{
"epoch": 3.6709569110240627,
"grad_norm": 0.20544512569904327,
"learning_rate": 8.031768475274413e-05,
"loss": 0.01,
"step": 6560
},
{
"epoch": 3.676552881925014,
"grad_norm": 0.3116811513900757,
"learning_rate": 8.025190178678175e-05,
"loss": 0.0183,
"step": 6570
},
{
"epoch": 3.682148852825965,
"grad_norm": 0.3111719787120819,
"learning_rate": 8.018603611327504e-05,
"loss": 0.015,
"step": 6580
},
{
"epoch": 3.6877448237269164,
"grad_norm": 0.20265722274780273,
"learning_rate": 8.012008791229826e-05,
"loss": 0.0136,
"step": 6590
},
{
"epoch": 3.693340794627868,
"grad_norm": 0.35717812180519104,
"learning_rate": 8.005405736415126e-05,
"loss": 0.0098,
"step": 6600
},
{
"epoch": 3.6989367655288192,
"grad_norm": 0.45737767219543457,
"learning_rate": 7.998794464935904e-05,
"loss": 0.0115,
"step": 6610
},
{
"epoch": 3.7045327364297704,
"grad_norm": 0.3025696873664856,
"learning_rate": 7.992174994867123e-05,
"loss": 0.0159,
"step": 6620
},
{
"epoch": 3.710128707330722,
"grad_norm": 0.3852231502532959,
"learning_rate": 7.985547344306161e-05,
"loss": 0.0116,
"step": 6630
},
{
"epoch": 3.7157246782316733,
"grad_norm": 0.23505637049674988,
"learning_rate": 7.978911531372765e-05,
"loss": 0.012,
"step": 6640
},
{
"epoch": 3.7213206491326245,
"grad_norm": 0.16072528064250946,
"learning_rate": 7.972267574208991e-05,
"loss": 0.0101,
"step": 6650
},
{
"epoch": 3.7269166200335757,
"grad_norm": 0.2579629719257355,
"learning_rate": 7.965615490979163e-05,
"loss": 0.0172,
"step": 6660
},
{
"epoch": 3.732512590934527,
"grad_norm": 0.170463427901268,
"learning_rate": 7.958955299869825e-05,
"loss": 0.0164,
"step": 6670
},
{
"epoch": 3.7381085618354786,
"grad_norm": 0.2048628181219101,
"learning_rate": 7.952287019089685e-05,
"loss": 0.0095,
"step": 6680
},
{
"epoch": 3.74370453273643,
"grad_norm": 0.1665850281715393,
"learning_rate": 7.945610666869568e-05,
"loss": 0.0131,
"step": 6690
},
{
"epoch": 3.749300503637381,
"grad_norm": 0.184804305434227,
"learning_rate": 7.938926261462366e-05,
"loss": 0.0161,
"step": 6700
},
{
"epoch": 3.7548964745383326,
"grad_norm": 0.17109259963035583,
"learning_rate": 7.932233821142987e-05,
"loss": 0.014,
"step": 6710
},
{
"epoch": 3.760492445439284,
"grad_norm": 0.23285003006458282,
"learning_rate": 7.925533364208309e-05,
"loss": 0.0106,
"step": 6720
},
{
"epoch": 3.766088416340235,
"grad_norm": 0.21361905336380005,
"learning_rate": 7.918824908977123e-05,
"loss": 0.0218,
"step": 6730
},
{
"epoch": 3.7716843872411863,
"grad_norm": 0.22354750335216522,
"learning_rate": 7.912108473790092e-05,
"loss": 0.0203,
"step": 6740
},
{
"epoch": 3.7772803581421375,
"grad_norm": 0.24767528474330902,
"learning_rate": 7.905384077009693e-05,
"loss": 0.0193,
"step": 6750
},
{
"epoch": 3.782876329043089,
"grad_norm": 0.18995364010334015,
"learning_rate": 7.898651737020166e-05,
"loss": 0.0162,
"step": 6760
},
{
"epoch": 3.7884722999440403,
"grad_norm": 0.13995826244354248,
"learning_rate": 7.891911472227478e-05,
"loss": 0.0187,
"step": 6770
},
{
"epoch": 3.7940682708449915,
"grad_norm": 0.2525804340839386,
"learning_rate": 7.88516330105925e-05,
"loss": 0.0136,
"step": 6780
},
{
"epoch": 3.799664241745943,
"grad_norm": 0.17206352949142456,
"learning_rate": 7.878407241964729e-05,
"loss": 0.0133,
"step": 6790
},
{
"epoch": 3.8052602126468944,
"grad_norm": 0.17433176934719086,
"learning_rate": 7.871643313414718e-05,
"loss": 0.0257,
"step": 6800
},
{
"epoch": 3.8108561835478456,
"grad_norm": 0.2698834240436554,
"learning_rate": 7.864871533901544e-05,
"loss": 0.0141,
"step": 6810
},
{
"epoch": 3.816452154448797,
"grad_norm": 0.2874978482723236,
"learning_rate": 7.858091921938988e-05,
"loss": 0.0175,
"step": 6820
},
{
"epoch": 3.822048125349748,
"grad_norm": 0.267092227935791,
"learning_rate": 7.851304496062254e-05,
"loss": 0.0169,
"step": 6830
},
{
"epoch": 3.8276440962506992,
"grad_norm": 0.31751275062561035,
"learning_rate": 7.844509274827907e-05,
"loss": 0.0175,
"step": 6840
},
{
"epoch": 3.833240067151651,
"grad_norm": 0.30981171131134033,
"learning_rate": 7.837706276813819e-05,
"loss": 0.0145,
"step": 6850
},
{
"epoch": 3.838836038052602,
"grad_norm": 0.31560707092285156,
"learning_rate": 7.830895520619128e-05,
"loss": 0.0157,
"step": 6860
},
{
"epoch": 3.8444320089535533,
"grad_norm": 0.22295020520687103,
"learning_rate": 7.824077024864179e-05,
"loss": 0.0108,
"step": 6870
},
{
"epoch": 3.850027979854505,
"grad_norm": 0.25469842553138733,
"learning_rate": 7.817250808190483e-05,
"loss": 0.015,
"step": 6880
},
{
"epoch": 3.855623950755456,
"grad_norm": 0.3890667259693146,
"learning_rate": 7.810416889260653e-05,
"loss": 0.0179,
"step": 6890
},
{
"epoch": 3.8612199216564074,
"grad_norm": 0.1923862248659134,
"learning_rate": 7.803575286758364e-05,
"loss": 0.013,
"step": 6900
},
{
"epoch": 3.8668158925573586,
"grad_norm": 0.17686985433101654,
"learning_rate": 7.796726019388295e-05,
"loss": 0.0143,
"step": 6910
},
{
"epoch": 3.87241186345831,
"grad_norm": 0.1899517923593521,
"learning_rate": 7.789869105876083e-05,
"loss": 0.0178,
"step": 6920
},
{
"epoch": 3.8780078343592614,
"grad_norm": 0.3056480586528778,
"learning_rate": 7.783004564968263e-05,
"loss": 0.0129,
"step": 6930
},
{
"epoch": 3.8836038052602126,
"grad_norm": 0.27795109152793884,
"learning_rate": 7.776132415432234e-05,
"loss": 0.0151,
"step": 6940
},
{
"epoch": 3.889199776161164,
"grad_norm": 0.22460781037807465,
"learning_rate": 7.769252676056187e-05,
"loss": 0.0145,
"step": 6950
},
{
"epoch": 3.8947957470621155,
"grad_norm": 0.29980891942977905,
"learning_rate": 7.762365365649067e-05,
"loss": 0.015,
"step": 6960
},
{
"epoch": 3.9003917179630667,
"grad_norm": 0.2440609186887741,
"learning_rate": 7.755470503040516e-05,
"loss": 0.0137,
"step": 6970
},
{
"epoch": 3.905987688864018,
"grad_norm": 0.2510973811149597,
"learning_rate": 7.748568107080832e-05,
"loss": 0.0118,
"step": 6980
},
{
"epoch": 3.911583659764969,
"grad_norm": 0.4981507956981659,
"learning_rate": 7.741658196640892e-05,
"loss": 0.0217,
"step": 6990
},
{
"epoch": 3.9171796306659203,
"grad_norm": 0.28161290287971497,
"learning_rate": 7.734740790612136e-05,
"loss": 0.0154,
"step": 7000
},
{
"epoch": 3.922775601566872,
"grad_norm": 0.40513697266578674,
"learning_rate": 7.727815907906481e-05,
"loss": 0.0169,
"step": 7010
},
{
"epoch": 3.928371572467823,
"grad_norm": 0.31741997599601746,
"learning_rate": 7.720883567456298e-05,
"loss": 0.0156,
"step": 7020
},
{
"epoch": 3.9339675433687744,
"grad_norm": 0.2534908652305603,
"learning_rate": 7.713943788214337e-05,
"loss": 0.0142,
"step": 7030
},
{
"epoch": 3.939563514269726,
"grad_norm": 0.2655825912952423,
"learning_rate": 7.70699658915369e-05,
"loss": 0.0154,
"step": 7040
},
{
"epoch": 3.9451594851706773,
"grad_norm": 0.32799914479255676,
"learning_rate": 7.700041989267736e-05,
"loss": 0.0137,
"step": 7050
},
{
"epoch": 3.9507554560716285,
"grad_norm": 0.184087872505188,
"learning_rate": 7.693080007570084e-05,
"loss": 0.013,
"step": 7060
},
{
"epoch": 3.9563514269725797,
"grad_norm": 0.31337958574295044,
"learning_rate": 7.686110663094525e-05,
"loss": 0.0203,
"step": 7070
},
{
"epoch": 3.961947397873531,
"grad_norm": 0.44696512818336487,
"learning_rate": 7.679133974894983e-05,
"loss": 0.0136,
"step": 7080
},
{
"epoch": 3.967543368774482,
"grad_norm": 0.2737766206264496,
"learning_rate": 7.672149962045457e-05,
"loss": 0.0157,
"step": 7090
},
{
"epoch": 3.9731393396754338,
"grad_norm": 0.4152137339115143,
"learning_rate": 7.66515864363997e-05,
"loss": 0.0151,
"step": 7100
},
{
"epoch": 3.978735310576385,
"grad_norm": 0.25766709446907043,
"learning_rate": 7.658160038792518e-05,
"loss": 0.0185,
"step": 7110
},
{
"epoch": 3.984331281477336,
"grad_norm": 0.2175714522600174,
"learning_rate": 7.651154166637025e-05,
"loss": 0.013,
"step": 7120
},
{
"epoch": 3.989927252378288,
"grad_norm": 0.2838795483112335,
"learning_rate": 7.644141046327271e-05,
"loss": 0.0152,
"step": 7130
},
{
"epoch": 3.995523223279239,
"grad_norm": 0.17076176404953003,
"learning_rate": 7.637120697036866e-05,
"loss": 0.0161,
"step": 7140
},
{
"epoch": 4.00111919418019,
"grad_norm": 0.34454286098480225,
"learning_rate": 7.630093137959171e-05,
"loss": 0.0155,
"step": 7150
},
{
"epoch": 4.0067151650811414,
"grad_norm": 0.2543468773365021,
"learning_rate": 7.623058388307269e-05,
"loss": 0.0224,
"step": 7160
},
{
"epoch": 4.012311135982093,
"grad_norm": 0.26474493741989136,
"learning_rate": 7.616016467313891e-05,
"loss": 0.0121,
"step": 7170
},
{
"epoch": 4.017907106883044,
"grad_norm": 0.2469242513179779,
"learning_rate": 7.608967394231387e-05,
"loss": 0.0168,
"step": 7180
},
{
"epoch": 4.023503077783996,
"grad_norm": 0.2605207562446594,
"learning_rate": 7.60191118833165e-05,
"loss": 0.0142,
"step": 7190
},
{
"epoch": 4.029099048684947,
"grad_norm": 0.1799083948135376,
"learning_rate": 7.594847868906076e-05,
"loss": 0.02,
"step": 7200
},
{
"epoch": 4.034695019585898,
"grad_norm": 0.179059699177742,
"learning_rate": 7.587777455265515e-05,
"loss": 0.0115,
"step": 7210
},
{
"epoch": 4.04029099048685,
"grad_norm": 0.2233004868030548,
"learning_rate": 7.580699966740201e-05,
"loss": 0.0128,
"step": 7220
},
{
"epoch": 4.045886961387801,
"grad_norm": 0.253635436296463,
"learning_rate": 7.573615422679726e-05,
"loss": 0.0149,
"step": 7230
},
{
"epoch": 4.051482932288752,
"grad_norm": 0.3416047692298889,
"learning_rate": 7.566523842452958e-05,
"loss": 0.0125,
"step": 7240
},
{
"epoch": 4.057078903189703,
"grad_norm": 0.27430468797683716,
"learning_rate": 7.559425245448006e-05,
"loss": 0.0153,
"step": 7250
},
{
"epoch": 4.062674874090654,
"grad_norm": 0.26396802067756653,
"learning_rate": 7.552319651072164e-05,
"loss": 0.0128,
"step": 7260
},
{
"epoch": 4.068270844991606,
"grad_norm": 0.1688843071460724,
"learning_rate": 7.545207078751857e-05,
"loss": 0.017,
"step": 7270
},
{
"epoch": 4.073866815892558,
"grad_norm": 0.25092509388923645,
"learning_rate": 7.538087547932585e-05,
"loss": 0.0119,
"step": 7280
},
{
"epoch": 4.079462786793509,
"grad_norm": 0.12876421213150024,
"learning_rate": 7.530961078078873e-05,
"loss": 0.0099,
"step": 7290
},
{
"epoch": 4.08505875769446,
"grad_norm": 0.13818064332008362,
"learning_rate": 7.52382768867422e-05,
"loss": 0.0156,
"step": 7300
},
{
"epoch": 4.090654728595411,
"grad_norm": 0.23580847680568695,
"learning_rate": 7.516687399221037e-05,
"loss": 0.0122,
"step": 7310
},
{
"epoch": 4.096250699496363,
"grad_norm": 0.22529348731040955,
"learning_rate": 7.509540229240601e-05,
"loss": 0.0115,
"step": 7320
},
{
"epoch": 4.101846670397314,
"grad_norm": 0.29066744446754456,
"learning_rate": 7.50238619827301e-05,
"loss": 0.0125,
"step": 7330
},
{
"epoch": 4.107442641298265,
"grad_norm": 0.30195966362953186,
"learning_rate": 7.495225325877103e-05,
"loss": 0.0136,
"step": 7340
},
{
"epoch": 4.113038612199216,
"grad_norm": 0.2478567361831665,
"learning_rate": 7.488057631630437e-05,
"loss": 0.0138,
"step": 7350
},
{
"epoch": 4.118634583100168,
"grad_norm": 0.23493291437625885,
"learning_rate": 7.480883135129211e-05,
"loss": 0.0171,
"step": 7360
},
{
"epoch": 4.1242305540011195,
"grad_norm": 0.28376439213752747,
"learning_rate": 7.473701855988227e-05,
"loss": 0.0161,
"step": 7370
},
{
"epoch": 4.129826524902071,
"grad_norm": 0.183238685131073,
"learning_rate": 7.466513813840825e-05,
"loss": 0.0159,
"step": 7380
},
{
"epoch": 4.135422495803022,
"grad_norm": 0.26259323954582214,
"learning_rate": 7.45931902833884e-05,
"loss": 0.0139,
"step": 7390
},
{
"epoch": 4.141018466703973,
"grad_norm": 0.31283116340637207,
"learning_rate": 7.452117519152542e-05,
"loss": 0.0103,
"step": 7400
},
{
"epoch": 4.146614437604924,
"grad_norm": 0.3131321370601654,
"learning_rate": 7.444909305970578e-05,
"loss": 0.0147,
"step": 7410
},
{
"epoch": 4.1522104085058755,
"grad_norm": 0.22739440202713013,
"learning_rate": 7.437694408499933e-05,
"loss": 0.0199,
"step": 7420
},
{
"epoch": 4.157806379406827,
"grad_norm": 0.22918283939361572,
"learning_rate": 7.430472846465856e-05,
"loss": 0.0152,
"step": 7430
},
{
"epoch": 4.163402350307779,
"grad_norm": 0.3530014455318451,
"learning_rate": 7.423244639611826e-05,
"loss": 0.0123,
"step": 7440
},
{
"epoch": 4.16899832120873,
"grad_norm": 0.32133522629737854,
"learning_rate": 7.416009807699482e-05,
"loss": 0.0151,
"step": 7450
},
{
"epoch": 4.174594292109681,
"grad_norm": 0.13515067100524902,
"learning_rate": 7.408768370508576e-05,
"loss": 0.0123,
"step": 7460
},
{
"epoch": 4.1801902630106325,
"grad_norm": 0.39963120222091675,
"learning_rate": 7.401520347836926e-05,
"loss": 0.0132,
"step": 7470
},
{
"epoch": 4.185786233911584,
"grad_norm": 0.16310429573059082,
"learning_rate": 7.394265759500348e-05,
"loss": 0.0211,
"step": 7480
},
{
"epoch": 4.191382204812535,
"grad_norm": 0.23062337934970856,
"learning_rate": 7.387004625332608e-05,
"loss": 0.0155,
"step": 7490
},
{
"epoch": 4.196978175713486,
"grad_norm": 0.3456437289714813,
"learning_rate": 7.379736965185368e-05,
"loss": 0.0149,
"step": 7500
},
{
"epoch": 4.202574146614437,
"grad_norm": 0.30712154507637024,
"learning_rate": 7.372462798928137e-05,
"loss": 0.0142,
"step": 7510
},
{
"epoch": 4.2081701175153885,
"grad_norm": 0.40980008244514465,
"learning_rate": 7.365182146448205e-05,
"loss": 0.0185,
"step": 7520
},
{
"epoch": 4.213766088416341,
"grad_norm": 0.3277069330215454,
"learning_rate": 7.357895027650598e-05,
"loss": 0.0202,
"step": 7530
},
{
"epoch": 4.219362059317292,
"grad_norm": 0.2991955280303955,
"learning_rate": 7.350601462458024e-05,
"loss": 0.0129,
"step": 7540
},
{
"epoch": 4.224958030218243,
"grad_norm": 0.3370542526245117,
"learning_rate": 7.343301470810808e-05,
"loss": 0.0186,
"step": 7550
},
{
"epoch": 4.230554001119194,
"grad_norm": 0.31613653898239136,
"learning_rate": 7.335995072666848e-05,
"loss": 0.0123,
"step": 7560
},
{
"epoch": 4.236149972020145,
"grad_norm": 0.21174335479736328,
"learning_rate": 7.328682288001561e-05,
"loss": 0.0088,
"step": 7570
},
{
"epoch": 4.241745942921097,
"grad_norm": 0.18430404365062714,
"learning_rate": 7.32136313680782e-05,
"loss": 0.0136,
"step": 7580
},
{
"epoch": 4.247341913822048,
"grad_norm": 0.161945641040802,
"learning_rate": 7.3140376390959e-05,
"loss": 0.0146,
"step": 7590
},
{
"epoch": 4.252937884722999,
"grad_norm": 0.3349175453186035,
"learning_rate": 7.30670581489344e-05,
"loss": 0.0151,
"step": 7600
},
{
"epoch": 4.258533855623951,
"grad_norm": 0.22331948578357697,
"learning_rate": 7.299367684245362e-05,
"loss": 0.0116,
"step": 7610
},
{
"epoch": 4.264129826524902,
"grad_norm": 0.32214659452438354,
"learning_rate": 7.292023267213835e-05,
"loss": 0.0125,
"step": 7620
},
{
"epoch": 4.269725797425854,
"grad_norm": 0.2628123164176941,
"learning_rate": 7.284672583878219e-05,
"loss": 0.021,
"step": 7630
},
{
"epoch": 4.275321768326805,
"grad_norm": 0.17666281759738922,
"learning_rate": 7.277315654334997e-05,
"loss": 0.0129,
"step": 7640
},
{
"epoch": 4.280917739227756,
"grad_norm": 0.13651759922504425,
"learning_rate": 7.269952498697734e-05,
"loss": 0.0136,
"step": 7650
},
{
"epoch": 4.286513710128707,
"grad_norm": 0.19819198548793793,
"learning_rate": 7.262583137097018e-05,
"loss": 0.0178,
"step": 7660
},
{
"epoch": 4.292109681029658,
"grad_norm": 0.30227622389793396,
"learning_rate": 7.255207589680402e-05,
"loss": 0.0099,
"step": 7670
},
{
"epoch": 4.29770565193061,
"grad_norm": 0.1803039014339447,
"learning_rate": 7.247825876612353e-05,
"loss": 0.0125,
"step": 7680
},
{
"epoch": 4.303301622831562,
"grad_norm": 0.2602524757385254,
"learning_rate": 7.240438018074189e-05,
"loss": 0.0128,
"step": 7690
},
{
"epoch": 4.308897593732513,
"grad_norm": 0.22282052040100098,
"learning_rate": 7.233044034264034e-05,
"loss": 0.0105,
"step": 7700
},
{
"epoch": 4.314493564633464,
"grad_norm": 0.3194449841976166,
"learning_rate": 7.225643945396757e-05,
"loss": 0.0133,
"step": 7710
},
{
"epoch": 4.320089535534415,
"grad_norm": 0.31051668524742126,
"learning_rate": 7.218237771703921e-05,
"loss": 0.021,
"step": 7720
},
{
"epoch": 4.3256855064353665,
"grad_norm": 0.23389574885368347,
"learning_rate": 7.210825533433719e-05,
"loss": 0.0151,
"step": 7730
},
{
"epoch": 4.331281477336318,
"grad_norm": 0.16604237258434296,
"learning_rate": 7.203407250850928e-05,
"loss": 0.0101,
"step": 7740
},
{
"epoch": 4.336877448237269,
"grad_norm": 0.26793259382247925,
"learning_rate": 7.195982944236851e-05,
"loss": 0.0177,
"step": 7750
},
{
"epoch": 4.34247341913822,
"grad_norm": 0.21598176658153534,
"learning_rate": 7.188552633889259e-05,
"loss": 0.0168,
"step": 7760
},
{
"epoch": 4.348069390039171,
"grad_norm": 0.30887526273727417,
"learning_rate": 7.181116340122336e-05,
"loss": 0.0122,
"step": 7770
},
{
"epoch": 4.3536653609401235,
"grad_norm": 0.3463345468044281,
"learning_rate": 7.173674083266624e-05,
"loss": 0.0143,
"step": 7780
},
{
"epoch": 4.359261331841075,
"grad_norm": 0.26217085123062134,
"learning_rate": 7.166225883668969e-05,
"loss": 0.0151,
"step": 7790
},
{
"epoch": 4.364857302742026,
"grad_norm": 0.28720608353614807,
"learning_rate": 7.158771761692464e-05,
"loss": 0.0139,
"step": 7800
},
{
"epoch": 4.370453273642977,
"grad_norm": 0.35230302810668945,
"learning_rate": 7.151311737716397e-05,
"loss": 0.0146,
"step": 7810
},
{
"epoch": 4.376049244543928,
"grad_norm": 0.2841963469982147,
"learning_rate": 7.143845832136188e-05,
"loss": 0.0153,
"step": 7820
},
{
"epoch": 4.3816452154448795,
"grad_norm": 0.3889724016189575,
"learning_rate": 7.136374065363334e-05,
"loss": 0.0147,
"step": 7830
},
{
"epoch": 4.387241186345831,
"grad_norm": 0.2717784345149994,
"learning_rate": 7.128896457825364e-05,
"loss": 0.0161,
"step": 7840
},
{
"epoch": 4.392837157246782,
"grad_norm": 0.27939334511756897,
"learning_rate": 7.121413029965769e-05,
"loss": 0.0127,
"step": 7850
},
{
"epoch": 4.398433128147734,
"grad_norm": 0.24780631065368652,
"learning_rate": 7.113923802243957e-05,
"loss": 0.0134,
"step": 7860
},
{
"epoch": 4.404029099048685,
"grad_norm": 0.2736693024635315,
"learning_rate": 7.10642879513519e-05,
"loss": 0.0157,
"step": 7870
},
{
"epoch": 4.409625069949636,
"grad_norm": 0.2332269549369812,
"learning_rate": 7.09892802913053e-05,
"loss": 0.0155,
"step": 7880
},
{
"epoch": 4.415221040850588,
"grad_norm": 0.3542332947254181,
"learning_rate": 7.091421524736784e-05,
"loss": 0.0161,
"step": 7890
},
{
"epoch": 4.420817011751539,
"grad_norm": 0.29242730140686035,
"learning_rate": 7.083909302476453e-05,
"loss": 0.0137,
"step": 7900
},
{
"epoch": 4.42641298265249,
"grad_norm": 0.33528995513916016,
"learning_rate": 7.076391382887661e-05,
"loss": 0.0146,
"step": 7910
},
{
"epoch": 4.432008953553441,
"grad_norm": 0.34565469622612,
"learning_rate": 7.068867786524116e-05,
"loss": 0.0128,
"step": 7920
},
{
"epoch": 4.4376049244543925,
"grad_norm": 0.29550039768218994,
"learning_rate": 7.061338533955043e-05,
"loss": 0.0143,
"step": 7930
},
{
"epoch": 4.443200895355345,
"grad_norm": 0.18918676674365997,
"learning_rate": 7.053803645765128e-05,
"loss": 0.017,
"step": 7940
},
{
"epoch": 4.448796866256296,
"grad_norm": 0.24842104315757751,
"learning_rate": 7.04626314255447e-05,
"loss": 0.0115,
"step": 7950
},
{
"epoch": 4.454392837157247,
"grad_norm": 0.25395554304122925,
"learning_rate": 7.038717044938519e-05,
"loss": 0.0136,
"step": 7960
},
{
"epoch": 4.459988808058198,
"grad_norm": 0.223357155919075,
"learning_rate": 7.031165373548014e-05,
"loss": 0.0159,
"step": 7970
},
{
"epoch": 4.465584778959149,
"grad_norm": 0.2434312105178833,
"learning_rate": 7.023608149028937e-05,
"loss": 0.0113,
"step": 7980
},
{
"epoch": 4.471180749860101,
"grad_norm": 0.27500098943710327,
"learning_rate": 7.016045392042452e-05,
"loss": 0.0127,
"step": 7990
},
{
"epoch": 4.476776720761052,
"grad_norm": 0.1670360416173935,
"learning_rate": 7.008477123264848e-05,
"loss": 0.0151,
"step": 8000
},
{
"epoch": 4.482372691662003,
"grad_norm": 0.3035995662212372,
"learning_rate": 7.000903363387482e-05,
"loss": 0.0143,
"step": 8010
},
{
"epoch": 4.487968662562954,
"grad_norm": 0.25943461060523987,
"learning_rate": 6.993324133116726e-05,
"loss": 0.0099,
"step": 8020
},
{
"epoch": 4.493564633463906,
"grad_norm": 0.20338699221611023,
"learning_rate": 6.985739453173903e-05,
"loss": 0.0127,
"step": 8030
},
{
"epoch": 4.4991606043648575,
"grad_norm": 0.18308840692043304,
"learning_rate": 6.978149344295242e-05,
"loss": 0.012,
"step": 8040
},
{
"epoch": 4.504756575265809,
"grad_norm": 0.142523393034935,
"learning_rate": 6.97055382723181e-05,
"loss": 0.0117,
"step": 8050
},
{
"epoch": 4.51035254616676,
"grad_norm": 0.26383474469184875,
"learning_rate": 6.962952922749457e-05,
"loss": 0.0171,
"step": 8060
},
{
"epoch": 4.515948517067711,
"grad_norm": 0.1817890852689743,
"learning_rate": 6.955346651628771e-05,
"loss": 0.0147,
"step": 8070
},
{
"epoch": 4.521544487968662,
"grad_norm": 0.20679673552513123,
"learning_rate": 6.947735034665002e-05,
"loss": 0.0161,
"step": 8080
},
{
"epoch": 4.527140458869614,
"grad_norm": 0.2073245346546173,
"learning_rate": 6.940118092668022e-05,
"loss": 0.0104,
"step": 8090
},
{
"epoch": 4.532736429770566,
"grad_norm": 0.45759397745132446,
"learning_rate": 6.932495846462261e-05,
"loss": 0.0141,
"step": 8100
},
{
"epoch": 4.538332400671517,
"grad_norm": 0.2275332510471344,
"learning_rate": 6.924868316886649e-05,
"loss": 0.0144,
"step": 8110
},
{
"epoch": 4.543928371572468,
"grad_norm": 0.24839594960212708,
"learning_rate": 6.917235524794558e-05,
"loss": 0.0153,
"step": 8120
},
{
"epoch": 4.549524342473419,
"grad_norm": 0.13045403361320496,
"learning_rate": 6.909597491053751e-05,
"loss": 0.0148,
"step": 8130
},
{
"epoch": 4.5551203133743705,
"grad_norm": 0.298033207654953,
"learning_rate": 6.901954236546323e-05,
"loss": 0.0148,
"step": 8140
},
{
"epoch": 4.560716284275322,
"grad_norm": 0.3102302849292755,
"learning_rate": 6.894305782168638e-05,
"loss": 0.0104,
"step": 8150
},
{
"epoch": 4.566312255176273,
"grad_norm": 0.3511497378349304,
"learning_rate": 6.886652148831279e-05,
"loss": 0.0114,
"step": 8160
},
{
"epoch": 4.571908226077224,
"grad_norm": 0.19204401969909668,
"learning_rate": 6.878993357458986e-05,
"loss": 0.0144,
"step": 8170
},
{
"epoch": 4.577504196978175,
"grad_norm": 0.27601921558380127,
"learning_rate": 6.871329428990602e-05,
"loss": 0.0121,
"step": 8180
},
{
"epoch": 4.583100167879127,
"grad_norm": 0.15351536870002747,
"learning_rate": 6.863660384379017e-05,
"loss": 0.017,
"step": 8190
},
{
"epoch": 4.588696138780079,
"grad_norm": 0.34269094467163086,
"learning_rate": 6.855986244591104e-05,
"loss": 0.0164,
"step": 8200
},
{
"epoch": 4.59429210968103,
"grad_norm": 0.20768719911575317,
"learning_rate": 6.84830703060767e-05,
"loss": 0.0186,
"step": 8210
},
{
"epoch": 4.599888080581981,
"grad_norm": 0.29763510823249817,
"learning_rate": 6.840622763423391e-05,
"loss": 0.0134,
"step": 8220
},
{
"epoch": 4.605484051482932,
"grad_norm": 0.29871609807014465,
"learning_rate": 6.83293346404676e-05,
"loss": 0.0118,
"step": 8230
},
{
"epoch": 4.6110800223838835,
"grad_norm": 0.24642953276634216,
"learning_rate": 6.825239153500029e-05,
"loss": 0.015,
"step": 8240
},
{
"epoch": 4.616675993284835,
"grad_norm": 0.20664198696613312,
"learning_rate": 6.817539852819149e-05,
"loss": 0.0165,
"step": 8250
},
{
"epoch": 4.622271964185786,
"grad_norm": 0.1941448450088501,
"learning_rate": 6.809835583053715e-05,
"loss": 0.0129,
"step": 8260
},
{
"epoch": 4.627867935086737,
"grad_norm": 0.21355387568473816,
"learning_rate": 6.802126365266905e-05,
"loss": 0.013,
"step": 8270
},
{
"epoch": 4.633463905987689,
"grad_norm": 0.2642342746257782,
"learning_rate": 6.794412220535426e-05,
"loss": 0.0176,
"step": 8280
},
{
"epoch": 4.63905987688864,
"grad_norm": 0.31280654668807983,
"learning_rate": 6.786693169949455e-05,
"loss": 0.017,
"step": 8290
},
{
"epoch": 4.644655847789592,
"grad_norm": 0.2257363200187683,
"learning_rate": 6.778969234612584e-05,
"loss": 0.0099,
"step": 8300
},
{
"epoch": 4.650251818690543,
"grad_norm": 0.16536390781402588,
"learning_rate": 6.771240435641754e-05,
"loss": 0.012,
"step": 8310
},
{
"epoch": 4.655847789591494,
"grad_norm": 0.16031181812286377,
"learning_rate": 6.763506794167208e-05,
"loss": 0.0094,
"step": 8320
},
{
"epoch": 4.661443760492445,
"grad_norm": 0.2519717514514923,
"learning_rate": 6.755768331332424e-05,
"loss": 0.0153,
"step": 8330
},
{
"epoch": 4.6670397313933965,
"grad_norm": 0.11290234327316284,
"learning_rate": 6.748025068294067e-05,
"loss": 0.0187,
"step": 8340
},
{
"epoch": 4.6726357022943485,
"grad_norm": 0.18607747554779053,
"learning_rate": 6.740277026221923e-05,
"loss": 0.0123,
"step": 8350
},
{
"epoch": 4.6782316731953,
"grad_norm": 0.20653483271598816,
"learning_rate": 6.732524226298841e-05,
"loss": 0.0128,
"step": 8360
},
{
"epoch": 4.683827644096251,
"grad_norm": 0.20888541638851166,
"learning_rate": 6.72476668972068e-05,
"loss": 0.0235,
"step": 8370
},
{
"epoch": 4.689423614997202,
"grad_norm": 0.23816397786140442,
"learning_rate": 6.71700443769625e-05,
"loss": 0.0125,
"step": 8380
},
{
"epoch": 4.695019585898153,
"grad_norm": 0.3250564932823181,
"learning_rate": 6.709237491447249e-05,
"loss": 0.011,
"step": 8390
},
{
"epoch": 4.700615556799105,
"grad_norm": 0.3211959898471832,
"learning_rate": 6.701465872208216e-05,
"loss": 0.0124,
"step": 8400
},
{
"epoch": 4.706211527700056,
"grad_norm": 0.3432743549346924,
"learning_rate": 6.693689601226458e-05,
"loss": 0.0119,
"step": 8410
},
{
"epoch": 4.711807498601007,
"grad_norm": 0.2595174014568329,
"learning_rate": 6.685908699762002e-05,
"loss": 0.0111,
"step": 8420
},
{
"epoch": 4.717403469501958,
"grad_norm": 0.283252090215683,
"learning_rate": 6.67812318908754e-05,
"loss": 0.0119,
"step": 8430
},
{
"epoch": 4.72299944040291,
"grad_norm": 0.20471790432929993,
"learning_rate": 6.670333090488356e-05,
"loss": 0.013,
"step": 8440
},
{
"epoch": 4.7285954113038615,
"grad_norm": 0.1850796490907669,
"learning_rate": 6.662538425262285e-05,
"loss": 0.0112,
"step": 8450
},
{
"epoch": 4.734191382204813,
"grad_norm": 0.2515677213668823,
"learning_rate": 6.654739214719641e-05,
"loss": 0.0084,
"step": 8460
},
{
"epoch": 4.739787353105764,
"grad_norm": 0.25231802463531494,
"learning_rate": 6.646935480183173e-05,
"loss": 0.0149,
"step": 8470
},
{
"epoch": 4.745383324006715,
"grad_norm": 0.24691557884216309,
"learning_rate": 6.639127242987988e-05,
"loss": 0.0144,
"step": 8480
},
{
"epoch": 4.750979294907666,
"grad_norm": 0.3806649446487427,
"learning_rate": 6.631314524481513e-05,
"loss": 0.0136,
"step": 8490
},
{
"epoch": 4.756575265808618,
"grad_norm": 0.233370840549469,
"learning_rate": 6.623497346023418e-05,
"loss": 0.0119,
"step": 8500
},
{
"epoch": 4.762171236709569,
"grad_norm": 0.16195163130760193,
"learning_rate": 6.615675728985572e-05,
"loss": 0.0178,
"step": 8510
},
{
"epoch": 4.76776720761052,
"grad_norm": 0.25800469517707825,
"learning_rate": 6.607849694751977e-05,
"loss": 0.012,
"step": 8520
},
{
"epoch": 4.773363178511472,
"grad_norm": 0.17752796411514282,
"learning_rate": 6.600019264718713e-05,
"loss": 0.0084,
"step": 8530
},
{
"epoch": 4.778959149412423,
"grad_norm": 0.2168557047843933,
"learning_rate": 6.592184460293877e-05,
"loss": 0.0163,
"step": 8540
},
{
"epoch": 4.7845551203133745,
"grad_norm": 0.2908076345920563,
"learning_rate": 6.584345302897523e-05,
"loss": 0.0091,
"step": 8550
},
{
"epoch": 4.790151091214326,
"grad_norm": 0.16817107796669006,
"learning_rate": 6.576501813961609e-05,
"loss": 0.012,
"step": 8560
},
{
"epoch": 4.795747062115277,
"grad_norm": 0.17607803642749786,
"learning_rate": 6.568654014929932e-05,
"loss": 0.0095,
"step": 8570
},
{
"epoch": 4.801343033016228,
"grad_norm": 0.1395525336265564,
"learning_rate": 6.56080192725808e-05,
"loss": 0.0127,
"step": 8580
},
{
"epoch": 4.806939003917179,
"grad_norm": 0.12721598148345947,
"learning_rate": 6.552945572413358e-05,
"loss": 0.0127,
"step": 8590
},
{
"epoch": 4.812534974818131,
"grad_norm": 0.220106303691864,
"learning_rate": 6.545084971874738e-05,
"loss": 0.0124,
"step": 8600
},
{
"epoch": 4.818130945719083,
"grad_norm": 0.1850575953722,
"learning_rate": 6.537220147132805e-05,
"loss": 0.0133,
"step": 8610
},
{
"epoch": 4.823726916620034,
"grad_norm": 0.14641323685646057,
"learning_rate": 6.529351119689688e-05,
"loss": 0.0083,
"step": 8620
},
{
"epoch": 4.829322887520985,
"grad_norm": 0.2565167546272278,
"learning_rate": 6.521477911059008e-05,
"loss": 0.0146,
"step": 8630
},
{
"epoch": 4.834918858421936,
"grad_norm": 0.1807018518447876,
"learning_rate": 6.513600542765817e-05,
"loss": 0.0093,
"step": 8640
},
{
"epoch": 4.8405148293228875,
"grad_norm": 0.22783279418945312,
"learning_rate": 6.505719036346539e-05,
"loss": 0.0105,
"step": 8650
},
{
"epoch": 4.846110800223839,
"grad_norm": 0.18857407569885254,
"learning_rate": 6.497833413348909e-05,
"loss": 0.012,
"step": 8660
},
{
"epoch": 4.85170677112479,
"grad_norm": 0.31593799591064453,
"learning_rate": 6.489943695331923e-05,
"loss": 0.013,
"step": 8670
},
{
"epoch": 4.857302742025741,
"grad_norm": 0.3053518533706665,
"learning_rate": 6.48204990386577e-05,
"loss": 0.0106,
"step": 8680
},
{
"epoch": 4.862898712926693,
"grad_norm": 0.2662791311740875,
"learning_rate": 6.474152060531768e-05,
"loss": 0.0151,
"step": 8690
},
{
"epoch": 4.868494683827644,
"grad_norm": 0.13093920052051544,
"learning_rate": 6.466250186922325e-05,
"loss": 0.0108,
"step": 8700
},
{
"epoch": 4.874090654728596,
"grad_norm": 0.17706599831581116,
"learning_rate": 6.458344304640858e-05,
"loss": 0.0118,
"step": 8710
},
{
"epoch": 4.879686625629547,
"grad_norm": 0.19158832728862762,
"learning_rate": 6.450434435301751e-05,
"loss": 0.0116,
"step": 8720
},
{
"epoch": 4.885282596530498,
"grad_norm": 0.12095298618078232,
"learning_rate": 6.44252060053028e-05,
"loss": 0.0134,
"step": 8730
},
{
"epoch": 4.890878567431449,
"grad_norm": 0.2882150411605835,
"learning_rate": 6.43460282196257e-05,
"loss": 0.0112,
"step": 8740
},
{
"epoch": 4.8964745383324,
"grad_norm": 0.34821435809135437,
"learning_rate": 6.426681121245527e-05,
"loss": 0.0111,
"step": 8750
},
{
"epoch": 4.902070509233352,
"grad_norm": 0.28680020570755005,
"learning_rate": 6.418755520036775e-05,
"loss": 0.011,
"step": 8760
},
{
"epoch": 4.907666480134303,
"grad_norm": 0.15372464060783386,
"learning_rate": 6.410826040004607e-05,
"loss": 0.0138,
"step": 8770
},
{
"epoch": 4.913262451035255,
"grad_norm": 0.24093207716941833,
"learning_rate": 6.402892702827916e-05,
"loss": 0.0152,
"step": 8780
},
{
"epoch": 4.918858421936206,
"grad_norm": 0.3779686689376831,
"learning_rate": 6.394955530196147e-05,
"loss": 0.0173,
"step": 8790
},
{
"epoch": 4.924454392837157,
"grad_norm": 0.19445843994617462,
"learning_rate": 6.387014543809223e-05,
"loss": 0.0142,
"step": 8800
},
{
"epoch": 4.930050363738109,
"grad_norm": 0.32286763191223145,
"learning_rate": 6.3790697653775e-05,
"loss": 0.0217,
"step": 8810
},
{
"epoch": 4.93564633463906,
"grad_norm": 0.27731436491012573,
"learning_rate": 6.371121216621698e-05,
"loss": 0.0103,
"step": 8820
},
{
"epoch": 4.941242305540011,
"grad_norm": 0.2174469232559204,
"learning_rate": 6.363168919272846e-05,
"loss": 0.0112,
"step": 8830
},
{
"epoch": 4.946838276440962,
"grad_norm": 0.20424802601337433,
"learning_rate": 6.355212895072223e-05,
"loss": 0.0179,
"step": 8840
},
{
"epoch": 4.952434247341914,
"grad_norm": 0.14288559556007385,
"learning_rate": 6.34725316577129e-05,
"loss": 0.0116,
"step": 8850
},
{
"epoch": 4.9580302182428655,
"grad_norm": 0.21734347939491272,
"learning_rate": 6.339289753131649e-05,
"loss": 0.012,
"step": 8860
},
{
"epoch": 4.963626189143817,
"grad_norm": 0.29445502161979675,
"learning_rate": 6.331322678924962e-05,
"loss": 0.0116,
"step": 8870
},
{
"epoch": 4.969222160044768,
"grad_norm": 0.2319229543209076,
"learning_rate": 6.323351964932908e-05,
"loss": 0.0194,
"step": 8880
},
{
"epoch": 4.974818130945719,
"grad_norm": 0.13166509568691254,
"learning_rate": 6.315377632947115e-05,
"loss": 0.0127,
"step": 8890
},
{
"epoch": 4.98041410184667,
"grad_norm": 0.2546875774860382,
"learning_rate": 6.307399704769099e-05,
"loss": 0.0115,
"step": 8900
},
{
"epoch": 4.9860100727476215,
"grad_norm": 0.2343253493309021,
"learning_rate": 6.299418202210214e-05,
"loss": 0.0123,
"step": 8910
},
{
"epoch": 4.991606043648573,
"grad_norm": 0.12813247740268707,
"learning_rate": 6.291433147091583e-05,
"loss": 0.0121,
"step": 8920
},
{
"epoch": 4.997202014549524,
"grad_norm": 0.11860624700784683,
"learning_rate": 6.283444561244042e-05,
"loss": 0.0125,
"step": 8930
},
{
"epoch": 5.002797985450476,
"grad_norm": 0.1995118260383606,
"learning_rate": 6.275452466508077e-05,
"loss": 0.0112,
"step": 8940
},
{
"epoch": 5.008393956351427,
"grad_norm": 0.2113560289144516,
"learning_rate": 6.26745688473377e-05,
"loss": 0.0118,
"step": 8950
},
{
"epoch": 5.0139899272523785,
"grad_norm": 0.321319580078125,
"learning_rate": 6.259457837780742e-05,
"loss": 0.0145,
"step": 8960
},
{
"epoch": 5.01958589815333,
"grad_norm": 0.15436704456806183,
"learning_rate": 6.251455347518073e-05,
"loss": 0.011,
"step": 8970
},
{
"epoch": 5.025181869054281,
"grad_norm": 0.2929522693157196,
"learning_rate": 6.243449435824276e-05,
"loss": 0.0145,
"step": 8980
},
{
"epoch": 5.030777839955232,
"grad_norm": 0.2311781346797943,
"learning_rate": 6.235440124587198e-05,
"loss": 0.0121,
"step": 8990
},
{
"epoch": 5.036373810856183,
"grad_norm": 0.16461458802223206,
"learning_rate": 6.227427435703997e-05,
"loss": 0.016,
"step": 9000
},
{
"epoch": 5.0419697817571345,
"grad_norm": 0.23925089836120605,
"learning_rate": 6.219411391081055e-05,
"loss": 0.0125,
"step": 9010
},
{
"epoch": 5.047565752658087,
"grad_norm": 0.3376557230949402,
"learning_rate": 6.211392012633932e-05,
"loss": 0.0147,
"step": 9020
},
{
"epoch": 5.053161723559038,
"grad_norm": 0.20988136529922485,
"learning_rate": 6.203369322287306e-05,
"loss": 0.0139,
"step": 9030
},
{
"epoch": 5.058757694459989,
"grad_norm": 0.17247657477855682,
"learning_rate": 6.195343341974899e-05,
"loss": 0.0133,
"step": 9040
},
{
"epoch": 5.06435366536094,
"grad_norm": 0.24936120212078094,
"learning_rate": 6.187314093639444e-05,
"loss": 0.0112,
"step": 9050
},
{
"epoch": 5.069949636261891,
"grad_norm": 0.1587497889995575,
"learning_rate": 6.179281599232591e-05,
"loss": 0.0127,
"step": 9060
},
{
"epoch": 5.075545607162843,
"grad_norm": 0.12296043336391449,
"learning_rate": 6.17124588071488e-05,
"loss": 0.0132,
"step": 9070
},
{
"epoch": 5.081141578063794,
"grad_norm": 0.2310076504945755,
"learning_rate": 6.163206960055651e-05,
"loss": 0.013,
"step": 9080
},
{
"epoch": 5.086737548964745,
"grad_norm": 0.1278199851512909,
"learning_rate": 6.155164859233012e-05,
"loss": 0.0127,
"step": 9090
},
{
"epoch": 5.092333519865696,
"grad_norm": 0.225848987698555,
"learning_rate": 6.147119600233758e-05,
"loss": 0.0125,
"step": 9100
},
{
"epoch": 5.097929490766648,
"grad_norm": 0.12778952717781067,
"learning_rate": 6.13907120505332e-05,
"loss": 0.0102,
"step": 9110
},
{
"epoch": 5.1035254616676,
"grad_norm": 0.2868061065673828,
"learning_rate": 6.131019695695702e-05,
"loss": 0.0102,
"step": 9120
},
{
"epoch": 5.109121432568551,
"grad_norm": 0.35349947214126587,
"learning_rate": 6.122965094173424e-05,
"loss": 0.0151,
"step": 9130
},
{
"epoch": 5.114717403469502,
"grad_norm": 0.24252165853977203,
"learning_rate": 6.11490742250746e-05,
"loss": 0.0111,
"step": 9140
},
{
"epoch": 5.120313374370453,
"grad_norm": 0.17868760228157043,
"learning_rate": 6.106846702727172e-05,
"loss": 0.0102,
"step": 9150
},
{
"epoch": 5.125909345271404,
"grad_norm": 0.21379156410694122,
"learning_rate": 6.0987829568702656e-05,
"loss": 0.0137,
"step": 9160
},
{
"epoch": 5.131505316172356,
"grad_norm": 0.29363685846328735,
"learning_rate": 6.090716206982714e-05,
"loss": 0.0131,
"step": 9170
},
{
"epoch": 5.137101287073307,
"grad_norm": 0.330162912607193,
"learning_rate": 6.0826464751186994e-05,
"loss": 0.0129,
"step": 9180
},
{
"epoch": 5.142697257974259,
"grad_norm": 0.2052110731601715,
"learning_rate": 6.074573783340562e-05,
"loss": 0.0108,
"step": 9190
},
{
"epoch": 5.14829322887521,
"grad_norm": 0.17011559009552002,
"learning_rate": 6.066498153718735e-05,
"loss": 0.0125,
"step": 9200
},
{
"epoch": 5.153889199776161,
"grad_norm": 0.3137349486351013,
"learning_rate": 6.0584196083316794e-05,
"loss": 0.0192,
"step": 9210
},
{
"epoch": 5.1594851706771125,
"grad_norm": 0.3046635389328003,
"learning_rate": 6.05033816926583e-05,
"loss": 0.0119,
"step": 9220
},
{
"epoch": 5.165081141578064,
"grad_norm": 0.1919318437576294,
"learning_rate": 6.042253858615532e-05,
"loss": 0.0139,
"step": 9230
},
{
"epoch": 5.170677112479015,
"grad_norm": 0.3815397322177887,
"learning_rate": 6.034166698482984e-05,
"loss": 0.0176,
"step": 9240
},
{
"epoch": 5.176273083379966,
"grad_norm": 0.23484662175178528,
"learning_rate": 6.026076710978171e-05,
"loss": 0.0137,
"step": 9250
},
{
"epoch": 5.181869054280917,
"grad_norm": 0.1737549602985382,
"learning_rate": 6.017983918218812e-05,
"loss": 0.0112,
"step": 9260
},
{
"epoch": 5.1874650251818695,
"grad_norm": 0.28736233711242676,
"learning_rate": 6.009888342330292e-05,
"loss": 0.0112,
"step": 9270
},
{
"epoch": 5.193060996082821,
"grad_norm": 0.21343185007572174,
"learning_rate": 6.001790005445607e-05,
"loss": 0.0089,
"step": 9280
},
{
"epoch": 5.198656966983772,
"grad_norm": 0.15162508189678192,
"learning_rate": 5.9936889297052986e-05,
"loss": 0.0156,
"step": 9290
},
{
"epoch": 5.204252937884723,
"grad_norm": 0.2816758155822754,
"learning_rate": 5.985585137257401e-05,
"loss": 0.0093,
"step": 9300
},
{
"epoch": 5.209848908785674,
"grad_norm": 0.1730954796075821,
"learning_rate": 5.977478650257374e-05,
"loss": 0.016,
"step": 9310
},
{
"epoch": 5.2154448796866255,
"grad_norm": 0.18365302681922913,
"learning_rate": 5.969369490868042e-05,
"loss": 0.0259,
"step": 9320
},
{
"epoch": 5.221040850587577,
"grad_norm": 0.12864327430725098,
"learning_rate": 5.961257681259535e-05,
"loss": 0.0119,
"step": 9330
},
{
"epoch": 5.226636821488528,
"grad_norm": 0.16363385319709778,
"learning_rate": 5.953143243609235e-05,
"loss": 0.0129,
"step": 9340
},
{
"epoch": 5.23223279238948,
"grad_norm": 0.15773551166057587,
"learning_rate": 5.945026200101702e-05,
"loss": 0.0083,
"step": 9350
},
{
"epoch": 5.237828763290431,
"grad_norm": 0.22605851292610168,
"learning_rate": 5.9369065729286245e-05,
"loss": 0.0096,
"step": 9360
},
{
"epoch": 5.243424734191382,
"grad_norm": 0.13637419044971466,
"learning_rate": 5.92878438428875e-05,
"loss": 0.0185,
"step": 9370
},
{
"epoch": 5.249020705092334,
"grad_norm": 0.12795643508434296,
"learning_rate": 5.9206596563878357e-05,
"loss": 0.008,
"step": 9380
},
{
"epoch": 5.254616675993285,
"grad_norm": 0.2635105550289154,
"learning_rate": 5.912532411438576e-05,
"loss": 0.0162,
"step": 9390
},
{
"epoch": 5.260212646894236,
"grad_norm": 0.18397080898284912,
"learning_rate": 5.90440267166055e-05,
"loss": 0.013,
"step": 9400
},
{
"epoch": 5.265808617795187,
"grad_norm": 0.23337115347385406,
"learning_rate": 5.896270459280153e-05,
"loss": 0.0105,
"step": 9410
},
{
"epoch": 5.2714045886961385,
"grad_norm": 0.24963605403900146,
"learning_rate": 5.888135796530544e-05,
"loss": 0.0098,
"step": 9420
},
{
"epoch": 5.27700055959709,
"grad_norm": 0.372761070728302,
"learning_rate": 5.8799987056515804e-05,
"loss": 0.0125,
"step": 9430
},
{
"epoch": 5.282596530498042,
"grad_norm": 0.2931661009788513,
"learning_rate": 5.871859208889759e-05,
"loss": 0.012,
"step": 9440
},
{
"epoch": 5.288192501398993,
"grad_norm": 0.2341478168964386,
"learning_rate": 5.8637173284981526e-05,
"loss": 0.0113,
"step": 9450
},
{
"epoch": 5.293788472299944,
"grad_norm": 0.2445063441991806,
"learning_rate": 5.85557308673635e-05,
"loss": 0.0157,
"step": 9460
},
{
"epoch": 5.299384443200895,
"grad_norm": 0.22766774892807007,
"learning_rate": 5.847426505870399e-05,
"loss": 0.011,
"step": 9470
},
{
"epoch": 5.304980414101847,
"grad_norm": 0.25397437810897827,
"learning_rate": 5.8392776081727385e-05,
"loss": 0.0088,
"step": 9480
},
{
"epoch": 5.310576385002798,
"grad_norm": 0.2036605179309845,
"learning_rate": 5.831126415922148e-05,
"loss": 0.0138,
"step": 9490
},
{
"epoch": 5.316172355903749,
"grad_norm": 0.17595243453979492,
"learning_rate": 5.8229729514036705e-05,
"loss": 0.0102,
"step": 9500
},
{
"epoch": 5.3217683268047,
"grad_norm": 0.14046894013881683,
"learning_rate": 5.8148172369085686e-05,
"loss": 0.0148,
"step": 9510
},
{
"epoch": 5.327364297705652,
"grad_norm": 0.2699585556983948,
"learning_rate": 5.8066592947342555e-05,
"loss": 0.0107,
"step": 9520
},
{
"epoch": 5.3329602686066035,
"grad_norm": 0.15614166855812073,
"learning_rate": 5.798499147184233e-05,
"loss": 0.0118,
"step": 9530
},
{
"epoch": 5.338556239507555,
"grad_norm": 0.3686412572860718,
"learning_rate": 5.7903368165680327e-05,
"loss": 0.0122,
"step": 9540
},
{
"epoch": 5.344152210408506,
"grad_norm": 0.2578679323196411,
"learning_rate": 5.782172325201155e-05,
"loss": 0.0152,
"step": 9550
},
{
"epoch": 5.349748181309457,
"grad_norm": 0.24605675041675568,
"learning_rate": 5.7740056954050084e-05,
"loss": 0.0106,
"step": 9560
},
{
"epoch": 5.355344152210408,
"grad_norm": 0.19138172268867493,
"learning_rate": 5.765836949506843e-05,
"loss": 0.0134,
"step": 9570
},
{
"epoch": 5.36094012311136,
"grad_norm": 0.23657287657260895,
"learning_rate": 5.757666109839702e-05,
"loss": 0.0076,
"step": 9580
},
{
"epoch": 5.366536094012311,
"grad_norm": 0.13402613997459412,
"learning_rate": 5.74949319874235e-05,
"loss": 0.0092,
"step": 9590
},
{
"epoch": 5.372132064913263,
"grad_norm": 0.16487988829612732,
"learning_rate": 5.74131823855921e-05,
"loss": 0.0165,
"step": 9600
},
{
"epoch": 5.377728035814214,
"grad_norm": 0.1842515617609024,
"learning_rate": 5.733141251640315e-05,
"loss": 0.0101,
"step": 9610
},
{
"epoch": 5.383324006715165,
"grad_norm": 0.17961528897285461,
"learning_rate": 5.72496226034123e-05,
"loss": 0.012,
"step": 9620
},
{
"epoch": 5.3889199776161165,
"grad_norm": 0.2516380548477173,
"learning_rate": 5.7167812870230094e-05,
"loss": 0.011,
"step": 9630
},
{
"epoch": 5.394515948517068,
"grad_norm": 0.1506935954093933,
"learning_rate": 5.7085983540521216e-05,
"loss": 0.0075,
"step": 9640
},
{
"epoch": 5.400111919418019,
"grad_norm": 0.3415573835372925,
"learning_rate": 5.70041348380039e-05,
"loss": 0.0142,
"step": 9650
},
{
"epoch": 5.40570789031897,
"grad_norm": 0.2501567006111145,
"learning_rate": 5.692226698644938e-05,
"loss": 0.0126,
"step": 9660
},
{
"epoch": 5.411303861219921,
"grad_norm": 0.15769636631011963,
"learning_rate": 5.6840380209681255e-05,
"loss": 0.0206,
"step": 9670
},
{
"epoch": 5.416899832120873,
"grad_norm": 0.17793142795562744,
"learning_rate": 5.675847473157485e-05,
"loss": 0.0198,
"step": 9680
},
{
"epoch": 5.422495803021825,
"grad_norm": 0.19135138392448425,
"learning_rate": 5.667655077605659e-05,
"loss": 0.0089,
"step": 9690
},
{
"epoch": 5.428091773922776,
"grad_norm": 0.1910410374403,
"learning_rate": 5.6594608567103456e-05,
"loss": 0.0178,
"step": 9700
},
{
"epoch": 5.433687744823727,
"grad_norm": 0.18896977603435516,
"learning_rate": 5.65126483287423e-05,
"loss": 0.0102,
"step": 9710
},
{
"epoch": 5.439283715724678,
"grad_norm": 0.12857311964035034,
"learning_rate": 5.6430670285049314e-05,
"loss": 0.0147,
"step": 9720
},
{
"epoch": 5.4448796866256295,
"grad_norm": 0.20521825551986694,
"learning_rate": 5.634867466014932e-05,
"loss": 0.0101,
"step": 9730
},
{
"epoch": 5.450475657526581,
"grad_norm": 0.16037105023860931,
"learning_rate": 5.6266661678215216e-05,
"loss": 0.0114,
"step": 9740
},
{
"epoch": 5.456071628427532,
"grad_norm": 0.15576882660388947,
"learning_rate": 5.618463156346739e-05,
"loss": 0.0138,
"step": 9750
},
{
"epoch": 5.461667599328483,
"grad_norm": 0.24249835312366486,
"learning_rate": 5.6102584540173006e-05,
"loss": 0.0131,
"step": 9760
},
{
"epoch": 5.467263570229435,
"grad_norm": 0.27811625599861145,
"learning_rate": 5.602052083264555e-05,
"loss": 0.0098,
"step": 9770
},
{
"epoch": 5.472859541130386,
"grad_norm": 0.3673328459262848,
"learning_rate": 5.5938440665244006e-05,
"loss": 0.0131,
"step": 9780
},
{
"epoch": 5.478455512031338,
"grad_norm": 0.2886298596858978,
"learning_rate": 5.585634426237246e-05,
"loss": 0.0141,
"step": 9790
},
{
"epoch": 5.484051482932289,
"grad_norm": 0.2564665973186493,
"learning_rate": 5.577423184847932e-05,
"loss": 0.0104,
"step": 9800
},
{
"epoch": 5.48964745383324,
"grad_norm": 0.22507299482822418,
"learning_rate": 5.569210364805677e-05,
"loss": 0.0116,
"step": 9810
},
{
"epoch": 5.495243424734191,
"grad_norm": 0.09582646191120148,
"learning_rate": 5.560995988564023e-05,
"loss": 0.0107,
"step": 9820
},
{
"epoch": 5.5008393956351425,
"grad_norm": 0.25511208176612854,
"learning_rate": 5.552780078580756e-05,
"loss": 0.0111,
"step": 9830
},
{
"epoch": 5.506435366536094,
"grad_norm": 0.14793109893798828,
"learning_rate": 5.544562657317863e-05,
"loss": 0.0088,
"step": 9840
},
{
"epoch": 5.512031337437046,
"grad_norm": 0.3215508759021759,
"learning_rate": 5.5363437472414595e-05,
"loss": 0.0132,
"step": 9850
},
{
"epoch": 5.517627308337997,
"grad_norm": 0.357731431722641,
"learning_rate": 5.52812337082173e-05,
"loss": 0.0119,
"step": 9860
},
{
"epoch": 5.523223279238948,
"grad_norm": 0.2520214915275574,
"learning_rate": 5.519901550532871e-05,
"loss": 0.0121,
"step": 9870
},
{
"epoch": 5.528819250139899,
"grad_norm": 0.28353017568588257,
"learning_rate": 5.511678308853026e-05,
"loss": 0.0077,
"step": 9880
},
{
"epoch": 5.534415221040851,
"grad_norm": 0.34384286403656006,
"learning_rate": 5.5034536682642224e-05,
"loss": 0.0125,
"step": 9890
},
{
"epoch": 5.540011191941802,
"grad_norm": 0.21323193609714508,
"learning_rate": 5.495227651252315e-05,
"loss": 0.0121,
"step": 9900
},
{
"epoch": 5.545607162842753,
"grad_norm": 0.3126833736896515,
"learning_rate": 5.487000280306917e-05,
"loss": 0.0125,
"step": 9910
},
{
"epoch": 5.551203133743704,
"grad_norm": 0.29106199741363525,
"learning_rate": 5.478771577921351e-05,
"loss": 0.0098,
"step": 9920
},
{
"epoch": 5.556799104644655,
"grad_norm": 0.2740892469882965,
"learning_rate": 5.470541566592573e-05,
"loss": 0.0135,
"step": 9930
},
{
"epoch": 5.5623950755456075,
"grad_norm": 0.19003938138484955,
"learning_rate": 5.462310268821118e-05,
"loss": 0.0146,
"step": 9940
},
{
"epoch": 5.567991046446559,
"grad_norm": 0.2251635491847992,
"learning_rate": 5.454077707111042e-05,
"loss": 0.0153,
"step": 9950
},
{
"epoch": 5.57358701734751,
"grad_norm": 0.16961322724819183,
"learning_rate": 5.445843903969854e-05,
"loss": 0.0154,
"step": 9960
},
{
"epoch": 5.579182988248461,
"grad_norm": 0.2752644419670105,
"learning_rate": 5.4376088819084556e-05,
"loss": 0.0102,
"step": 9970
},
{
"epoch": 5.584778959149412,
"grad_norm": 0.24675792455673218,
"learning_rate": 5.4293726634410855e-05,
"loss": 0.0123,
"step": 9980
},
{
"epoch": 5.590374930050364,
"grad_norm": 0.2074369490146637,
"learning_rate": 5.4211352710852495e-05,
"loss": 0.0095,
"step": 9990
},
{
"epoch": 5.595970900951315,
"grad_norm": 0.22929449379444122,
"learning_rate": 5.4128967273616625e-05,
"loss": 0.0123,
"step": 10000
},
{
"epoch": 5.601566871852266,
"grad_norm": 0.21107512712478638,
"learning_rate": 5.404657054794189e-05,
"loss": 0.01,
"step": 10010
},
{
"epoch": 5.607162842753217,
"grad_norm": 0.3743564188480377,
"learning_rate": 5.396416275909779e-05,
"loss": 0.0173,
"step": 10020
},
{
"epoch": 5.612758813654169,
"grad_norm": 0.19637951254844666,
"learning_rate": 5.3881744132384104e-05,
"loss": 0.0114,
"step": 10030
},
{
"epoch": 5.6183547845551205,
"grad_norm": 0.2417994886636734,
"learning_rate": 5.379931489313016e-05,
"loss": 0.0117,
"step": 10040
},
{
"epoch": 5.623950755456072,
"grad_norm": 0.18541017174720764,
"learning_rate": 5.371687526669439e-05,
"loss": 0.0139,
"step": 10050
},
{
"epoch": 5.629546726357023,
"grad_norm": 0.26478803157806396,
"learning_rate": 5.363442547846356e-05,
"loss": 0.0108,
"step": 10060
},
{
"epoch": 5.635142697257974,
"grad_norm": 0.23468017578125,
"learning_rate": 5.355196575385225e-05,
"loss": 0.0107,
"step": 10070
},
{
"epoch": 5.640738668158925,
"grad_norm": 0.2251582145690918,
"learning_rate": 5.3469496318302204e-05,
"loss": 0.0105,
"step": 10080
},
{
"epoch": 5.6463346390598765,
"grad_norm": 0.18580631911754608,
"learning_rate": 5.3387017397281704e-05,
"loss": 0.0107,
"step": 10090
},
{
"epoch": 5.651930609960829,
"grad_norm": 0.14670825004577637,
"learning_rate": 5.330452921628497e-05,
"loss": 0.0103,
"step": 10100
},
{
"epoch": 5.65752658086178,
"grad_norm": 0.22916555404663086,
"learning_rate": 5.322203200083154e-05,
"loss": 0.0113,
"step": 10110
},
{
"epoch": 5.663122551762731,
"grad_norm": 0.1360463947057724,
"learning_rate": 5.313952597646568e-05,
"loss": 0.0121,
"step": 10120
},
{
"epoch": 5.668718522663682,
"grad_norm": 0.24525059759616852,
"learning_rate": 5.305701136875566e-05,
"loss": 0.0092,
"step": 10130
},
{
"epoch": 5.6743144935646335,
"grad_norm": 0.1451522707939148,
"learning_rate": 5.297448840329329e-05,
"loss": 0.0081,
"step": 10140
},
{
"epoch": 5.679910464465585,
"grad_norm": 0.1923244744539261,
"learning_rate": 5.2891957305693205e-05,
"loss": 0.0117,
"step": 10150
},
{
"epoch": 5.685506435366536,
"grad_norm": 0.18804806470870972,
"learning_rate": 5.280941830159227e-05,
"loss": 0.0095,
"step": 10160
},
{
"epoch": 5.691102406267487,
"grad_norm": 0.1880972534418106,
"learning_rate": 5.2726871616649e-05,
"loss": 0.0111,
"step": 10170
},
{
"epoch": 5.696698377168438,
"grad_norm": 0.18024373054504395,
"learning_rate": 5.264431747654284e-05,
"loss": 0.0119,
"step": 10180
},
{
"epoch": 5.70229434806939,
"grad_norm": 0.16494502127170563,
"learning_rate": 5.2561756106973656e-05,
"loss": 0.0131,
"step": 10190
},
{
"epoch": 5.707890318970342,
"grad_norm": 0.2051820605993271,
"learning_rate": 5.247918773366112e-05,
"loss": 0.0136,
"step": 10200
},
{
"epoch": 5.713486289871293,
"grad_norm": 0.21385324001312256,
"learning_rate": 5.2396612582343986e-05,
"loss": 0.0101,
"step": 10210
},
{
"epoch": 5.719082260772244,
"grad_norm": 0.2170487344264984,
"learning_rate": 5.231403087877955e-05,
"loss": 0.0107,
"step": 10220
},
{
"epoch": 5.724678231673195,
"grad_norm": 0.23433655500411987,
"learning_rate": 5.2231442848743064e-05,
"loss": 0.0139,
"step": 10230
},
{
"epoch": 5.730274202574146,
"grad_norm": 0.2549709379673004,
"learning_rate": 5.214884871802703e-05,
"loss": 0.0178,
"step": 10240
},
{
"epoch": 5.735870173475098,
"grad_norm": 0.11975869536399841,
"learning_rate": 5.2066248712440656e-05,
"loss": 0.0101,
"step": 10250
},
{
"epoch": 5.74146614437605,
"grad_norm": 0.39216071367263794,
"learning_rate": 5.198364305780922e-05,
"loss": 0.0131,
"step": 10260
},
{
"epoch": 5.747062115277,
"grad_norm": 0.2390432357788086,
"learning_rate": 5.1901031979973394e-05,
"loss": 0.0097,
"step": 10270
},
{
"epoch": 5.752658086177952,
"grad_norm": 0.1686331033706665,
"learning_rate": 5.1818415704788725e-05,
"loss": 0.0104,
"step": 10280
},
{
"epoch": 5.758254057078903,
"grad_norm": 0.28812578320503235,
"learning_rate": 5.1735794458124956e-05,
"loss": 0.01,
"step": 10290
},
{
"epoch": 5.763850027979855,
"grad_norm": 0.4722854197025299,
"learning_rate": 5.165316846586541e-05,
"loss": 0.0125,
"step": 10300
},
{
"epoch": 5.769445998880806,
"grad_norm": 0.19151827692985535,
"learning_rate": 5.157053795390642e-05,
"loss": 0.0134,
"step": 10310
},
{
"epoch": 5.775041969781757,
"grad_norm": 0.2533670961856842,
"learning_rate": 5.148790314815663e-05,
"loss": 0.011,
"step": 10320
},
{
"epoch": 5.780637940682708,
"grad_norm": 0.1756027489900589,
"learning_rate": 5.1405264274536445e-05,
"loss": 0.0092,
"step": 10330
},
{
"epoch": 5.786233911583659,
"grad_norm": 0.2753913700580597,
"learning_rate": 5.132262155897739e-05,
"loss": 0.0118,
"step": 10340
},
{
"epoch": 5.7918298824846115,
"grad_norm": 0.17530974745750427,
"learning_rate": 5.123997522742151e-05,
"loss": 0.0092,
"step": 10350
},
{
"epoch": 5.797425853385563,
"grad_norm": 0.3250185251235962,
"learning_rate": 5.1157325505820694e-05,
"loss": 0.0135,
"step": 10360
},
{
"epoch": 5.803021824286514,
"grad_norm": 0.2266574651002884,
"learning_rate": 5.107467262013614e-05,
"loss": 0.0174,
"step": 10370
},
{
"epoch": 5.808617795187465,
"grad_norm": 0.15442338585853577,
"learning_rate": 5.0992016796337686e-05,
"loss": 0.0112,
"step": 10380
},
{
"epoch": 5.814213766088416,
"grad_norm": 0.16227369010448456,
"learning_rate": 5.0909358260403186e-05,
"loss": 0.0141,
"step": 10390
},
{
"epoch": 5.8198097369893675,
"grad_norm": 0.288241982460022,
"learning_rate": 5.0826697238317935e-05,
"loss": 0.0142,
"step": 10400
},
{
"epoch": 5.825405707890319,
"grad_norm": 0.17878948152065277,
"learning_rate": 5.074403395607399e-05,
"loss": 0.0115,
"step": 10410
},
{
"epoch": 5.83100167879127,
"grad_norm": 0.2224341630935669,
"learning_rate": 5.066136863966963e-05,
"loss": 0.0106,
"step": 10420
},
{
"epoch": 5.836597649692221,
"grad_norm": 0.1762062907218933,
"learning_rate": 5.057870151510864e-05,
"loss": 0.0115,
"step": 10430
},
{
"epoch": 5.842193620593173,
"grad_norm": 0.15165816247463226,
"learning_rate": 5.0496032808399815e-05,
"loss": 0.0116,
"step": 10440
},
{
"epoch": 5.8477895914941245,
"grad_norm": 0.23350821435451508,
"learning_rate": 5.041336274555625e-05,
"loss": 0.0124,
"step": 10450
},
{
"epoch": 5.853385562395076,
"grad_norm": 0.3131781816482544,
"learning_rate": 5.033069155259471e-05,
"loss": 0.0136,
"step": 10460
},
{
"epoch": 5.858981533296027,
"grad_norm": 0.25165101885795593,
"learning_rate": 5.02480194555351e-05,
"loss": 0.0081,
"step": 10470
},
{
"epoch": 5.864577504196978,
"grad_norm": 0.17109723389148712,
"learning_rate": 5.016534668039976e-05,
"loss": 0.0104,
"step": 10480
},
{
"epoch": 5.870173475097929,
"grad_norm": 0.14172928035259247,
"learning_rate": 5.0082673453212914e-05,
"loss": 0.0096,
"step": 10490
},
{
"epoch": 5.8757694459988805,
"grad_norm": 0.15533624589443207,
"learning_rate": 5e-05,
"loss": 0.0075,
"step": 10500
},
{
"epoch": 5.881365416899833,
"grad_norm": 0.12869463860988617,
"learning_rate": 4.991732654678709e-05,
"loss": 0.0114,
"step": 10510
},
{
"epoch": 5.886961387800784,
"grad_norm": 0.3376826345920563,
"learning_rate": 4.9834653319600246e-05,
"loss": 0.0135,
"step": 10520
},
{
"epoch": 5.892557358701735,
"grad_norm": 0.20675431191921234,
"learning_rate": 4.975198054446492e-05,
"loss": 0.0106,
"step": 10530
},
{
"epoch": 5.898153329602686,
"grad_norm": 0.14309728145599365,
"learning_rate": 4.96693084474053e-05,
"loss": 0.0122,
"step": 10540
},
{
"epoch": 5.903749300503637,
"grad_norm": 0.13042593002319336,
"learning_rate": 4.9586637254443756e-05,
"loss": 0.0114,
"step": 10550
},
{
"epoch": 5.909345271404589,
"grad_norm": 0.14101748168468475,
"learning_rate": 4.950396719160018e-05,
"loss": 0.0104,
"step": 10560
},
{
"epoch": 5.91494124230554,
"grad_norm": 0.22409436106681824,
"learning_rate": 4.942129848489137e-05,
"loss": 0.0109,
"step": 10570
},
{
"epoch": 5.920537213206491,
"grad_norm": 0.22155794501304626,
"learning_rate": 4.93386313603304e-05,
"loss": 0.0091,
"step": 10580
},
{
"epoch": 5.926133184107442,
"grad_norm": 0.1839323341846466,
"learning_rate": 4.925596604392603e-05,
"loss": 0.0086,
"step": 10590
},
{
"epoch": 5.931729155008394,
"grad_norm": 0.1160067617893219,
"learning_rate": 4.917330276168208e-05,
"loss": 0.0103,
"step": 10600
},
{
"epoch": 5.937325125909346,
"grad_norm": 0.2413625419139862,
"learning_rate": 4.909064173959681e-05,
"loss": 0.0117,
"step": 10610
},
{
"epoch": 5.942921096810297,
"grad_norm": 0.19037237763404846,
"learning_rate": 4.9007983203662326e-05,
"loss": 0.011,
"step": 10620
},
{
"epoch": 5.948517067711248,
"grad_norm": 0.17303366959095,
"learning_rate": 4.892532737986387e-05,
"loss": 0.0094,
"step": 10630
},
{
"epoch": 5.954113038612199,
"grad_norm": 0.2476578801870346,
"learning_rate": 4.884267449417931e-05,
"loss": 0.0118,
"step": 10640
},
{
"epoch": 5.95970900951315,
"grad_norm": 0.29616495966911316,
"learning_rate": 4.87600247725785e-05,
"loss": 0.0118,
"step": 10650
},
{
"epoch": 5.965304980414102,
"grad_norm": 0.1653703898191452,
"learning_rate": 4.867737844102261e-05,
"loss": 0.0093,
"step": 10660
},
{
"epoch": 5.970900951315053,
"grad_norm": 0.2089630663394928,
"learning_rate": 4.8594735725463567e-05,
"loss": 0.0113,
"step": 10670
},
{
"epoch": 5.976496922216004,
"grad_norm": 0.14042207598686218,
"learning_rate": 4.851209685184338e-05,
"loss": 0.0091,
"step": 10680
},
{
"epoch": 5.982092893116956,
"grad_norm": 0.17145408689975739,
"learning_rate": 4.8429462046093585e-05,
"loss": 0.0103,
"step": 10690
},
{
"epoch": 5.987688864017907,
"grad_norm": 0.2082109898328781,
"learning_rate": 4.834683153413459e-05,
"loss": 0.0109,
"step": 10700
},
{
"epoch": 5.9932848349188586,
"grad_norm": 0.3018309473991394,
"learning_rate": 4.826420554187506e-05,
"loss": 0.0125,
"step": 10710
},
{
"epoch": 5.99888080581981,
"grad_norm": 0.1233690157532692,
"learning_rate": 4.818158429521129e-05,
"loss": 0.0093,
"step": 10720
},
{
"epoch": 6.004476776720761,
"grad_norm": 0.226378932595253,
"learning_rate": 4.809896802002662e-05,
"loss": 0.0124,
"step": 10730
},
{
"epoch": 6.010072747621712,
"grad_norm": 0.149214506149292,
"learning_rate": 4.801635694219079e-05,
"loss": 0.0105,
"step": 10740
},
{
"epoch": 6.015668718522663,
"grad_norm": 0.35911405086517334,
"learning_rate": 4.7933751287559335e-05,
"loss": 0.0097,
"step": 10750
},
{
"epoch": 6.021264689423615,
"grad_norm": 0.3472690284252167,
"learning_rate": 4.785115128197298e-05,
"loss": 0.0115,
"step": 10760
},
{
"epoch": 6.026860660324567,
"grad_norm": 0.1740999072790146,
"learning_rate": 4.776855715125694e-05,
"loss": 0.0088,
"step": 10770
},
{
"epoch": 6.032456631225518,
"grad_norm": 0.22089268267154694,
"learning_rate": 4.7685969121220456e-05,
"loss": 0.0087,
"step": 10780
},
{
"epoch": 6.038052602126469,
"grad_norm": 0.17993643879890442,
"learning_rate": 4.7603387417656026e-05,
"loss": 0.0086,
"step": 10790
},
{
"epoch": 6.04364857302742,
"grad_norm": 0.3000619113445282,
"learning_rate": 4.7520812266338885e-05,
"loss": 0.0117,
"step": 10800
},
{
"epoch": 6.0492445439283715,
"grad_norm": 0.16510385274887085,
"learning_rate": 4.743824389302635e-05,
"loss": 0.0098,
"step": 10810
},
{
"epoch": 6.054840514829323,
"grad_norm": 0.17736104130744934,
"learning_rate": 4.735568252345718e-05,
"loss": 0.0111,
"step": 10820
},
{
"epoch": 6.060436485730274,
"grad_norm": 0.17262353003025055,
"learning_rate": 4.7273128383351015e-05,
"loss": 0.0075,
"step": 10830
},
{
"epoch": 6.066032456631225,
"grad_norm": 0.15096010267734528,
"learning_rate": 4.7190581698407725e-05,
"loss": 0.0086,
"step": 10840
},
{
"epoch": 6.071628427532177,
"grad_norm": 0.16276976466178894,
"learning_rate": 4.710804269430681e-05,
"loss": 0.0102,
"step": 10850
},
{
"epoch": 6.0772243984331284,
"grad_norm": 0.42808446288108826,
"learning_rate": 4.702551159670672e-05,
"loss": 0.0094,
"step": 10860
},
{
"epoch": 6.08282036933408,
"grad_norm": 0.17846183478832245,
"learning_rate": 4.694298863124435e-05,
"loss": 0.0092,
"step": 10870
},
{
"epoch": 6.088416340235031,
"grad_norm": 0.2053506076335907,
"learning_rate": 4.6860474023534335e-05,
"loss": 0.0086,
"step": 10880
},
{
"epoch": 6.094012311135982,
"grad_norm": 0.2614595592021942,
"learning_rate": 4.677796799916845e-05,
"loss": 0.017,
"step": 10890
},
{
"epoch": 6.099608282036933,
"grad_norm": 0.2127176970243454,
"learning_rate": 4.669547078371504e-05,
"loss": 0.014,
"step": 10900
},
{
"epoch": 6.1052042529378845,
"grad_norm": 0.2204008847475052,
"learning_rate": 4.66129826027183e-05,
"loss": 0.0116,
"step": 10910
},
{
"epoch": 6.110800223838836,
"grad_norm": 0.3794216215610504,
"learning_rate": 4.65305036816978e-05,
"loss": 0.0112,
"step": 10920
},
{
"epoch": 6.116396194739787,
"grad_norm": 0.22125349938869476,
"learning_rate": 4.6448034246147754e-05,
"loss": 0.0086,
"step": 10930
},
{
"epoch": 6.121992165640739,
"grad_norm": 0.21079552173614502,
"learning_rate": 4.6365574521536445e-05,
"loss": 0.0118,
"step": 10940
},
{
"epoch": 6.12758813654169,
"grad_norm": 0.17766894400119781,
"learning_rate": 4.6283124733305624e-05,
"loss": 0.007,
"step": 10950
},
{
"epoch": 6.133184107442641,
"grad_norm": 0.23495835065841675,
"learning_rate": 4.620068510686985e-05,
"loss": 0.0092,
"step": 10960
},
{
"epoch": 6.138780078343593,
"grad_norm": 0.25509214401245117,
"learning_rate": 4.611825586761591e-05,
"loss": 0.0098,
"step": 10970
},
{
"epoch": 6.144376049244544,
"grad_norm": 0.2415831834077835,
"learning_rate": 4.60358372409022e-05,
"loss": 0.0105,
"step": 10980
},
{
"epoch": 6.149972020145495,
"grad_norm": 0.1638316661119461,
"learning_rate": 4.5953429452058135e-05,
"loss": 0.0092,
"step": 10990
},
{
"epoch": 6.155567991046446,
"grad_norm": 0.17809127271175385,
"learning_rate": 4.5871032726383386e-05,
"loss": 0.0089,
"step": 11000
},
{
"epoch": 6.1611639619473975,
"grad_norm": 0.22080188989639282,
"learning_rate": 4.5788647289147516e-05,
"loss": 0.008,
"step": 11010
},
{
"epoch": 6.16675993284835,
"grad_norm": 0.19198036193847656,
"learning_rate": 4.570627336558915e-05,
"loss": 0.0099,
"step": 11020
},
{
"epoch": 6.172355903749301,
"grad_norm": 0.1567138433456421,
"learning_rate": 4.562391118091544e-05,
"loss": 0.0081,
"step": 11030
},
{
"epoch": 6.177951874650252,
"grad_norm": 0.10507390648126602,
"learning_rate": 4.554156096030149e-05,
"loss": 0.0068,
"step": 11040
},
{
"epoch": 6.183547845551203,
"grad_norm": 0.2201065570116043,
"learning_rate": 4.545922292888959e-05,
"loss": 0.0111,
"step": 11050
},
{
"epoch": 6.189143816452154,
"grad_norm": 0.2924385666847229,
"learning_rate": 4.537689731178883e-05,
"loss": 0.0198,
"step": 11060
},
{
"epoch": 6.194739787353106,
"grad_norm": 0.18973895907402039,
"learning_rate": 4.529458433407429e-05,
"loss": 0.0113,
"step": 11070
},
{
"epoch": 6.200335758254057,
"grad_norm": 0.2131788432598114,
"learning_rate": 4.5212284220786494e-05,
"loss": 0.0093,
"step": 11080
},
{
"epoch": 6.205931729155008,
"grad_norm": 0.17389249801635742,
"learning_rate": 4.5129997196930845e-05,
"loss": 0.0066,
"step": 11090
},
{
"epoch": 6.21152770005596,
"grad_norm": 0.21684075891971588,
"learning_rate": 4.504772348747687e-05,
"loss": 0.0071,
"step": 11100
},
{
"epoch": 6.217123670956911,
"grad_norm": 0.19866231083869934,
"learning_rate": 4.496546331735778e-05,
"loss": 0.0096,
"step": 11110
},
{
"epoch": 6.2227196418578625,
"grad_norm": 0.19832220673561096,
"learning_rate": 4.488321691146975e-05,
"loss": 0.0068,
"step": 11120
},
{
"epoch": 6.228315612758814,
"grad_norm": 0.12977780401706696,
"learning_rate": 4.480098449467132e-05,
"loss": 0.0089,
"step": 11130
},
{
"epoch": 6.233911583659765,
"grad_norm": 0.32740047574043274,
"learning_rate": 4.471876629178273e-05,
"loss": 0.0092,
"step": 11140
},
{
"epoch": 6.239507554560716,
"grad_norm": 0.12163751572370529,
"learning_rate": 4.463656252758542e-05,
"loss": 0.0089,
"step": 11150
},
{
"epoch": 6.245103525461667,
"grad_norm": 0.21914434432983398,
"learning_rate": 4.4554373426821374e-05,
"loss": 0.0084,
"step": 11160
},
{
"epoch": 6.250699496362619,
"grad_norm": 0.23196600377559662,
"learning_rate": 4.447219921419244e-05,
"loss": 0.0095,
"step": 11170
},
{
"epoch": 6.25629546726357,
"grad_norm": 0.19451774656772614,
"learning_rate": 4.439004011435979e-05,
"loss": 0.01,
"step": 11180
},
{
"epoch": 6.261891438164522,
"grad_norm": 0.20714877545833588,
"learning_rate": 4.430789635194324e-05,
"loss": 0.0124,
"step": 11190
},
{
"epoch": 6.267487409065473,
"grad_norm": 0.1735510528087616,
"learning_rate": 4.4225768151520694e-05,
"loss": 0.0089,
"step": 11200
},
{
"epoch": 6.273083379966424,
"grad_norm": 0.2282591164112091,
"learning_rate": 4.414365573762755e-05,
"loss": 0.0166,
"step": 11210
},
{
"epoch": 6.2786793508673755,
"grad_norm": 0.2207183688879013,
"learning_rate": 4.406155933475599e-05,
"loss": 0.0089,
"step": 11220
},
{
"epoch": 6.284275321768327,
"grad_norm": 0.252380907535553,
"learning_rate": 4.3979479167354477e-05,
"loss": 0.0111,
"step": 11230
},
{
"epoch": 6.289871292669278,
"grad_norm": 0.18762193620204926,
"learning_rate": 4.3897415459827e-05,
"loss": 0.0099,
"step": 11240
},
{
"epoch": 6.295467263570229,
"grad_norm": 0.15788224339485168,
"learning_rate": 4.381536843653262e-05,
"loss": 0.0086,
"step": 11250
},
{
"epoch": 6.301063234471181,
"grad_norm": 0.22205393016338348,
"learning_rate": 4.373333832178478e-05,
"loss": 0.0081,
"step": 11260
},
{
"epoch": 6.306659205372132,
"grad_norm": 0.2042773962020874,
"learning_rate": 4.365132533985071e-05,
"loss": 0.0112,
"step": 11270
},
{
"epoch": 6.312255176273084,
"grad_norm": 0.15884517133235931,
"learning_rate": 4.3569329714950704e-05,
"loss": 0.011,
"step": 11280
},
{
"epoch": 6.317851147174035,
"grad_norm": 0.1604417860507965,
"learning_rate": 4.348735167125771e-05,
"loss": 0.0126,
"step": 11290
},
{
"epoch": 6.323447118074986,
"grad_norm": 0.1566859632730484,
"learning_rate": 4.3405391432896555e-05,
"loss": 0.0078,
"step": 11300
},
{
"epoch": 6.329043088975937,
"grad_norm": 0.2835988700389862,
"learning_rate": 4.3323449223943416e-05,
"loss": 0.0096,
"step": 11310
},
{
"epoch": 6.3346390598768885,
"grad_norm": 0.2758636772632599,
"learning_rate": 4.324152526842517e-05,
"loss": 0.0118,
"step": 11320
},
{
"epoch": 6.34023503077784,
"grad_norm": 0.09336747974157333,
"learning_rate": 4.315961979031875e-05,
"loss": 0.0111,
"step": 11330
},
{
"epoch": 6.345831001678791,
"grad_norm": 0.16241887211799622,
"learning_rate": 4.307773301355062e-05,
"loss": 0.0106,
"step": 11340
},
{
"epoch": 6.351426972579743,
"grad_norm": 0.20391559600830078,
"learning_rate": 4.2995865161996105e-05,
"loss": 0.0081,
"step": 11350
},
{
"epoch": 6.357022943480694,
"grad_norm": 0.12543804943561554,
"learning_rate": 4.291401645947879e-05,
"loss": 0.0137,
"step": 11360
},
{
"epoch": 6.362618914381645,
"grad_norm": 0.24983376264572144,
"learning_rate": 4.283218712976992e-05,
"loss": 0.0095,
"step": 11370
},
{
"epoch": 6.368214885282597,
"grad_norm": 0.2291889637708664,
"learning_rate": 4.275037739658771e-05,
"loss": 0.0113,
"step": 11380
},
{
"epoch": 6.373810856183548,
"grad_norm": 0.1601787656545639,
"learning_rate": 4.2668587483596864e-05,
"loss": 0.0128,
"step": 11390
},
{
"epoch": 6.379406827084499,
"grad_norm": 0.14628605544567108,
"learning_rate": 4.2586817614407895e-05,
"loss": 0.0076,
"step": 11400
},
{
"epoch": 6.38500279798545,
"grad_norm": 0.16742217540740967,
"learning_rate": 4.250506801257653e-05,
"loss": 0.0104,
"step": 11410
},
{
"epoch": 6.390598768886401,
"grad_norm": 0.20203527808189392,
"learning_rate": 4.2423338901602985e-05,
"loss": 0.0112,
"step": 11420
},
{
"epoch": 6.396194739787353,
"grad_norm": 0.2605644762516022,
"learning_rate": 4.234163050493158e-05,
"loss": 0.0166,
"step": 11430
},
{
"epoch": 6.401790710688305,
"grad_norm": 0.22104188799858093,
"learning_rate": 4.2259943045949934e-05,
"loss": 0.0069,
"step": 11440
},
{
"epoch": 6.407386681589256,
"grad_norm": 0.2080865204334259,
"learning_rate": 4.2178276747988446e-05,
"loss": 0.0136,
"step": 11450
},
{
"epoch": 6.412982652490207,
"grad_norm": 0.22961939871311188,
"learning_rate": 4.209663183431969e-05,
"loss": 0.0184,
"step": 11460
},
{
"epoch": 6.418578623391158,
"grad_norm": 0.3134923577308655,
"learning_rate": 4.201500852815768e-05,
"loss": 0.0108,
"step": 11470
},
{
"epoch": 6.42417459429211,
"grad_norm": 0.11267667263746262,
"learning_rate": 4.1933407052657456e-05,
"loss": 0.0113,
"step": 11480
},
{
"epoch": 6.429770565193061,
"grad_norm": 0.11718063056468964,
"learning_rate": 4.1851827630914305e-05,
"loss": 0.0069,
"step": 11490
},
{
"epoch": 6.435366536094012,
"grad_norm": 0.15294240415096283,
"learning_rate": 4.17702704859633e-05,
"loss": 0.0087,
"step": 11500
},
{
"epoch": 6.440962506994964,
"grad_norm": 0.16003765165805817,
"learning_rate": 4.1688735840778546e-05,
"loss": 0.0087,
"step": 11510
},
{
"epoch": 6.446558477895915,
"grad_norm": 0.28345319628715515,
"learning_rate": 4.160722391827262e-05,
"loss": 0.0119,
"step": 11520
},
{
"epoch": 6.4521544487968665,
"grad_norm": 0.18619926273822784,
"learning_rate": 4.1525734941296026e-05,
"loss": 0.01,
"step": 11530
},
{
"epoch": 6.457750419697818,
"grad_norm": 0.1567833423614502,
"learning_rate": 4.14442691326365e-05,
"loss": 0.0089,
"step": 11540
},
{
"epoch": 6.463346390598769,
"grad_norm": 0.16688846051692963,
"learning_rate": 4.13628267150185e-05,
"loss": 0.0078,
"step": 11550
},
{
"epoch": 6.46894236149972,
"grad_norm": 0.19638372957706451,
"learning_rate": 4.1281407911102425e-05,
"loss": 0.0119,
"step": 11560
},
{
"epoch": 6.474538332400671,
"grad_norm": 0.13919275999069214,
"learning_rate": 4.120001294348421e-05,
"loss": 0.0105,
"step": 11570
},
{
"epoch": 6.4801343033016225,
"grad_norm": 0.17611968517303467,
"learning_rate": 4.111864203469457e-05,
"loss": 0.0145,
"step": 11580
},
{
"epoch": 6.485730274202574,
"grad_norm": 0.15707933902740479,
"learning_rate": 4.103729540719847e-05,
"loss": 0.0088,
"step": 11590
},
{
"epoch": 6.491326245103526,
"grad_norm": 0.16832014918327332,
"learning_rate": 4.095597328339452e-05,
"loss": 0.0087,
"step": 11600
},
{
"epoch": 6.496922216004477,
"grad_norm": 0.16573460400104523,
"learning_rate": 4.087467588561424e-05,
"loss": 0.0085,
"step": 11610
},
{
"epoch": 6.502518186905428,
"grad_norm": 0.16878801584243774,
"learning_rate": 4.079340343612165e-05,
"loss": 0.0081,
"step": 11620
},
{
"epoch": 6.5081141578063795,
"grad_norm": 0.10650831460952759,
"learning_rate": 4.07121561571125e-05,
"loss": 0.0088,
"step": 11630
},
{
"epoch": 6.513710128707331,
"grad_norm": 0.15549488365650177,
"learning_rate": 4.063093427071376e-05,
"loss": 0.008,
"step": 11640
},
{
"epoch": 6.519306099608282,
"grad_norm": 0.17358443140983582,
"learning_rate": 4.0549737998983e-05,
"loss": 0.0133,
"step": 11650
},
{
"epoch": 6.524902070509233,
"grad_norm": 0.24347983300685883,
"learning_rate": 4.046856756390767e-05,
"loss": 0.0123,
"step": 11660
},
{
"epoch": 6.530498041410184,
"grad_norm": 0.31662797927856445,
"learning_rate": 4.038742318740465e-05,
"loss": 0.0108,
"step": 11670
},
{
"epoch": 6.5360940123111355,
"grad_norm": 0.21490415930747986,
"learning_rate": 4.0306305091319595e-05,
"loss": 0.0116,
"step": 11680
},
{
"epoch": 6.541689983212088,
"grad_norm": 0.10896732658147812,
"learning_rate": 4.0225213497426276e-05,
"loss": 0.0088,
"step": 11690
},
{
"epoch": 6.547285954113039,
"grad_norm": 0.22287431359291077,
"learning_rate": 4.0144148627425993e-05,
"loss": 0.0157,
"step": 11700
},
{
"epoch": 6.55288192501399,
"grad_norm": 0.2492447942495346,
"learning_rate": 4.006311070294702e-05,
"loss": 0.0155,
"step": 11710
},
{
"epoch": 6.558477895914941,
"grad_norm": 0.09591550379991531,
"learning_rate": 3.9982099945543945e-05,
"loss": 0.0076,
"step": 11720
},
{
"epoch": 6.564073866815892,
"grad_norm": 0.21364928781986237,
"learning_rate": 3.9901116576697083e-05,
"loss": 0.0109,
"step": 11730
},
{
"epoch": 6.569669837716844,
"grad_norm": 0.2347889095544815,
"learning_rate": 3.982016081781189e-05,
"loss": 0.009,
"step": 11740
},
{
"epoch": 6.575265808617795,
"grad_norm": 0.07959645986557007,
"learning_rate": 3.973923289021829e-05,
"loss": 0.007,
"step": 11750
},
{
"epoch": 6.580861779518747,
"grad_norm": 0.18356555700302124,
"learning_rate": 3.965833301517017e-05,
"loss": 0.014,
"step": 11760
},
{
"epoch": 6.586457750419698,
"grad_norm": 0.16104575991630554,
"learning_rate": 3.9577461413844684e-05,
"loss": 0.0159,
"step": 11770
},
{
"epoch": 6.592053721320649,
"grad_norm": 0.2652454972267151,
"learning_rate": 3.949661830734172e-05,
"loss": 0.0103,
"step": 11780
},
{
"epoch": 6.597649692221601,
"grad_norm": 0.29040461778640747,
"learning_rate": 3.9415803916683224e-05,
"loss": 0.0077,
"step": 11790
},
{
"epoch": 6.603245663122552,
"grad_norm": 0.3047587275505066,
"learning_rate": 3.933501846281267e-05,
"loss": 0.0137,
"step": 11800
},
{
"epoch": 6.608841634023503,
"grad_norm": 0.15864235162734985,
"learning_rate": 3.925426216659438e-05,
"loss": 0.0097,
"step": 11810
},
{
"epoch": 6.614437604924454,
"grad_norm": 0.20918135344982147,
"learning_rate": 3.917353524881302e-05,
"loss": 0.008,
"step": 11820
},
{
"epoch": 6.620033575825405,
"grad_norm": 0.17880207300186157,
"learning_rate": 3.9092837930172884e-05,
"loss": 0.0119,
"step": 11830
},
{
"epoch": 6.625629546726357,
"grad_norm": 0.16844668984413147,
"learning_rate": 3.901217043129735e-05,
"loss": 0.0092,
"step": 11840
},
{
"epoch": 6.631225517627309,
"grad_norm": 0.2069406360387802,
"learning_rate": 3.8931532972728285e-05,
"loss": 0.0116,
"step": 11850
},
{
"epoch": 6.63682148852826,
"grad_norm": 0.2709522843360901,
"learning_rate": 3.8850925774925425e-05,
"loss": 0.0076,
"step": 11860
},
{
"epoch": 6.642417459429211,
"grad_norm": 0.16224393248558044,
"learning_rate": 3.877034905826577e-05,
"loss": 0.0099,
"step": 11870
},
{
"epoch": 6.648013430330162,
"grad_norm": 0.238708034157753,
"learning_rate": 3.8689803043043e-05,
"loss": 0.0073,
"step": 11880
},
{
"epoch": 6.6536094012311136,
"grad_norm": 0.12267536669969559,
"learning_rate": 3.860928794946682e-05,
"loss": 0.0086,
"step": 11890
},
{
"epoch": 6.659205372132065,
"grad_norm": 0.1931445449590683,
"learning_rate": 3.852880399766243e-05,
"loss": 0.0098,
"step": 11900
},
{
"epoch": 6.664801343033016,
"grad_norm": 0.23762571811676025,
"learning_rate": 3.844835140766988e-05,
"loss": 0.0091,
"step": 11910
},
{
"epoch": 6.670397313933967,
"grad_norm": 0.1977052241563797,
"learning_rate": 3.836793039944349e-05,
"loss": 0.0079,
"step": 11920
},
{
"epoch": 6.675993284834918,
"grad_norm": 0.10921810567378998,
"learning_rate": 3.828754119285123e-05,
"loss": 0.0072,
"step": 11930
},
{
"epoch": 6.6815892557358705,
"grad_norm": 0.2423611879348755,
"learning_rate": 3.820718400767409e-05,
"loss": 0.0119,
"step": 11940
},
{
"epoch": 6.687185226636822,
"grad_norm": 0.19429948925971985,
"learning_rate": 3.812685906360557e-05,
"loss": 0.0081,
"step": 11950
},
{
"epoch": 6.692781197537773,
"grad_norm": 0.104859858751297,
"learning_rate": 3.8046566580251e-05,
"loss": 0.0064,
"step": 11960
},
{
"epoch": 6.698377168438724,
"grad_norm": 0.11694277077913284,
"learning_rate": 3.796630677712697e-05,
"loss": 0.0086,
"step": 11970
},
{
"epoch": 6.703973139339675,
"grad_norm": 0.2368919551372528,
"learning_rate": 3.788607987366069e-05,
"loss": 0.0059,
"step": 11980
},
{
"epoch": 6.7095691102406265,
"grad_norm": 0.20411504805088043,
"learning_rate": 3.780588608918947e-05,
"loss": 0.0133,
"step": 11990
},
{
"epoch": 6.715165081141578,
"grad_norm": 0.11036452651023865,
"learning_rate": 3.772572564296005e-05,
"loss": 0.0085,
"step": 12000
},
{
"epoch": 6.72076105204253,
"grad_norm": 0.09863012284040451,
"learning_rate": 3.764559875412803e-05,
"loss": 0.0064,
"step": 12010
},
{
"epoch": 6.726357022943481,
"grad_norm": 0.12064427882432938,
"learning_rate": 3.756550564175727e-05,
"loss": 0.009,
"step": 12020
},
{
"epoch": 6.731952993844432,
"grad_norm": 0.11138517409563065,
"learning_rate": 3.748544652481927e-05,
"loss": 0.0082,
"step": 12030
},
{
"epoch": 6.7375489647453835,
"grad_norm": 0.1209891140460968,
"learning_rate": 3.74054216221926e-05,
"loss": 0.0074,
"step": 12040
},
{
"epoch": 6.743144935646335,
"grad_norm": 0.22739742696285248,
"learning_rate": 3.73254311526623e-05,
"loss": 0.0082,
"step": 12050
},
{
"epoch": 6.748740906547286,
"grad_norm": 0.19938482344150543,
"learning_rate": 3.7245475334919246e-05,
"loss": 0.0087,
"step": 12060
},
{
"epoch": 6.754336877448237,
"grad_norm": 0.18825367093086243,
"learning_rate": 3.716555438755961e-05,
"loss": 0.0091,
"step": 12070
},
{
"epoch": 6.759932848349188,
"grad_norm": 0.18540059030056,
"learning_rate": 3.7085668529084184e-05,
"loss": 0.0096,
"step": 12080
},
{
"epoch": 6.7655288192501395,
"grad_norm": 0.11188949644565582,
"learning_rate": 3.700581797789786e-05,
"loss": 0.0081,
"step": 12090
},
{
"epoch": 6.771124790151092,
"grad_norm": 0.09911153465509415,
"learning_rate": 3.6926002952309016e-05,
"loss": 0.0065,
"step": 12100
},
{
"epoch": 6.776720761052043,
"grad_norm": 0.2001970112323761,
"learning_rate": 3.684622367052887e-05,
"loss": 0.007,
"step": 12110
},
{
"epoch": 6.782316731952994,
"grad_norm": 0.256001740694046,
"learning_rate": 3.676648035067093e-05,
"loss": 0.0101,
"step": 12120
},
{
"epoch": 6.787912702853945,
"grad_norm": 0.16810284554958344,
"learning_rate": 3.6686773210750385e-05,
"loss": 0.0084,
"step": 12130
},
{
"epoch": 6.793508673754896,
"grad_norm": 0.21629579365253448,
"learning_rate": 3.6607102468683526e-05,
"loss": 0.0066,
"step": 12140
},
{
"epoch": 6.799104644655848,
"grad_norm": 0.2616669237613678,
"learning_rate": 3.65274683422871e-05,
"loss": 0.0111,
"step": 12150
},
{
"epoch": 6.804700615556799,
"grad_norm": 0.18898139894008636,
"learning_rate": 3.6447871049277796e-05,
"loss": 0.0103,
"step": 12160
},
{
"epoch": 6.81029658645775,
"grad_norm": 0.20177505910396576,
"learning_rate": 3.636831080727154e-05,
"loss": 0.0064,
"step": 12170
},
{
"epoch": 6.815892557358701,
"grad_norm": 0.18514911830425262,
"learning_rate": 3.628878783378302e-05,
"loss": 0.0118,
"step": 12180
},
{
"epoch": 6.821488528259653,
"grad_norm": 0.25894469022750854,
"learning_rate": 3.6209302346225006e-05,
"loss": 0.0083,
"step": 12190
},
{
"epoch": 6.827084499160605,
"grad_norm": 0.16605038940906525,
"learning_rate": 3.612985456190778e-05,
"loss": 0.0049,
"step": 12200
},
{
"epoch": 6.832680470061556,
"grad_norm": 0.17524683475494385,
"learning_rate": 3.605044469803854e-05,
"loss": 0.0066,
"step": 12210
},
{
"epoch": 6.838276440962507,
"grad_norm": 0.10738332569599152,
"learning_rate": 3.597107297172084e-05,
"loss": 0.0087,
"step": 12220
},
{
"epoch": 6.843872411863458,
"grad_norm": 0.19934684038162231,
"learning_rate": 3.5891739599953945e-05,
"loss": 0.009,
"step": 12230
},
{
"epoch": 6.849468382764409,
"grad_norm": 0.12639135122299194,
"learning_rate": 3.581244479963225e-05,
"loss": 0.0092,
"step": 12240
},
{
"epoch": 6.855064353665361,
"grad_norm": 0.1152096539735794,
"learning_rate": 3.5733188787544745e-05,
"loss": 0.007,
"step": 12250
},
{
"epoch": 6.860660324566313,
"grad_norm": 0.2878243625164032,
"learning_rate": 3.5653971780374295e-05,
"loss": 0.0096,
"step": 12260
},
{
"epoch": 6.866256295467264,
"grad_norm": 0.2725951075553894,
"learning_rate": 3.557479399469721e-05,
"loss": 0.0081,
"step": 12270
},
{
"epoch": 6.871852266368215,
"grad_norm": 0.16931770741939545,
"learning_rate": 3.5495655646982505e-05,
"loss": 0.0085,
"step": 12280
},
{
"epoch": 6.877448237269166,
"grad_norm": 0.11503436416387558,
"learning_rate": 3.541655695359142e-05,
"loss": 0.0062,
"step": 12290
},
{
"epoch": 6.8830442081701175,
"grad_norm": 0.18025194108486176,
"learning_rate": 3.533749813077677e-05,
"loss": 0.0082,
"step": 12300
},
{
"epoch": 6.888640179071069,
"grad_norm": 0.1392613649368286,
"learning_rate": 3.525847939468233e-05,
"loss": 0.0086,
"step": 12310
},
{
"epoch": 6.89423614997202,
"grad_norm": 0.2620909512042999,
"learning_rate": 3.517950096134232e-05,
"loss": 0.0108,
"step": 12320
},
{
"epoch": 6.899832120872971,
"grad_norm": 0.12296637147665024,
"learning_rate": 3.5100563046680764e-05,
"loss": 0.008,
"step": 12330
},
{
"epoch": 6.905428091773922,
"grad_norm": 0.13329119980335236,
"learning_rate": 3.5021665866510925e-05,
"loss": 0.0104,
"step": 12340
},
{
"epoch": 6.9110240626748745,
"grad_norm": 0.18710525333881378,
"learning_rate": 3.494280963653463e-05,
"loss": 0.0096,
"step": 12350
},
{
"epoch": 6.916620033575826,
"grad_norm": 0.199269637465477,
"learning_rate": 3.4863994572341843e-05,
"loss": 0.0098,
"step": 12360
},
{
"epoch": 6.922216004476777,
"grad_norm": 0.24953125417232513,
"learning_rate": 3.478522088940993e-05,
"loss": 0.01,
"step": 12370
},
{
"epoch": 6.927811975377728,
"grad_norm": 0.1573137789964676,
"learning_rate": 3.470648880310313e-05,
"loss": 0.0119,
"step": 12380
},
{
"epoch": 6.933407946278679,
"grad_norm": 0.24244867265224457,
"learning_rate": 3.462779852867197e-05,
"loss": 0.0129,
"step": 12390
},
{
"epoch": 6.9390039171796305,
"grad_norm": 0.12841010093688965,
"learning_rate": 3.4549150281252636e-05,
"loss": 0.0074,
"step": 12400
},
{
"epoch": 6.944599888080582,
"grad_norm": 0.17973212897777557,
"learning_rate": 3.447054427586644e-05,
"loss": 0.0084,
"step": 12410
},
{
"epoch": 6.950195858981533,
"grad_norm": 0.2083815336227417,
"learning_rate": 3.439198072741921e-05,
"loss": 0.0096,
"step": 12420
},
{
"epoch": 6.955791829882484,
"grad_norm": 0.21580283343791962,
"learning_rate": 3.431345985070067e-05,
"loss": 0.009,
"step": 12430
},
{
"epoch": 6.961387800783436,
"grad_norm": 0.22562581300735474,
"learning_rate": 3.423498186038393e-05,
"loss": 0.0105,
"step": 12440
},
{
"epoch": 6.966983771684387,
"grad_norm": 0.19070309400558472,
"learning_rate": 3.4156546971024784e-05,
"loss": 0.0074,
"step": 12450
},
{
"epoch": 6.972579742585339,
"grad_norm": 0.2400059998035431,
"learning_rate": 3.407815539706124e-05,
"loss": 0.0102,
"step": 12460
},
{
"epoch": 6.97817571348629,
"grad_norm": 0.13252539932727814,
"learning_rate": 3.399980735281286e-05,
"loss": 0.0066,
"step": 12470
},
{
"epoch": 6.983771684387241,
"grad_norm": 0.2826622426509857,
"learning_rate": 3.392150305248024e-05,
"loss": 0.0103,
"step": 12480
},
{
"epoch": 6.989367655288192,
"grad_norm": 0.2674136757850647,
"learning_rate": 3.384324271014429e-05,
"loss": 0.0089,
"step": 12490
},
{
"epoch": 6.9949636261891435,
"grad_norm": 0.09753147512674332,
"learning_rate": 3.3765026539765834e-05,
"loss": 0.0126,
"step": 12500
},
{
"epoch": 7.000559597090096,
"grad_norm": 0.13642564415931702,
"learning_rate": 3.368685475518488e-05,
"loss": 0.01,
"step": 12510
},
{
"epoch": 7.006155567991047,
"grad_norm": 0.2658902704715729,
"learning_rate": 3.360872757012011e-05,
"loss": 0.0168,
"step": 12520
},
{
"epoch": 7.011751538891998,
"grad_norm": 0.12951083481311798,
"learning_rate": 3.3530645198168295e-05,
"loss": 0.0081,
"step": 12530
},
{
"epoch": 7.017347509792949,
"grad_norm": 0.23773689568042755,
"learning_rate": 3.3452607852803584e-05,
"loss": 0.0082,
"step": 12540
},
{
"epoch": 7.0229434806939,
"grad_norm": 0.21580462157726288,
"learning_rate": 3.337461574737716e-05,
"loss": 0.0106,
"step": 12550
},
{
"epoch": 7.028539451594852,
"grad_norm": 0.15399706363677979,
"learning_rate": 3.329666909511645e-05,
"loss": 0.0103,
"step": 12560
},
{
"epoch": 7.034135422495803,
"grad_norm": 0.21200086176395416,
"learning_rate": 3.321876810912461e-05,
"loss": 0.0141,
"step": 12570
},
{
"epoch": 7.039731393396754,
"grad_norm": 0.2530173063278198,
"learning_rate": 3.3140913002379995e-05,
"loss": 0.0101,
"step": 12580
},
{
"epoch": 7.045327364297705,
"grad_norm": 0.16888059675693512,
"learning_rate": 3.3063103987735433e-05,
"loss": 0.0068,
"step": 12590
},
{
"epoch": 7.050923335198657,
"grad_norm": 0.213544562458992,
"learning_rate": 3.298534127791785e-05,
"loss": 0.0099,
"step": 12600
},
{
"epoch": 7.0565193060996085,
"grad_norm": 0.2427508383989334,
"learning_rate": 3.2907625085527503e-05,
"loss": 0.0078,
"step": 12610
},
{
"epoch": 7.06211527700056,
"grad_norm": 0.3301132023334503,
"learning_rate": 3.282995562303754e-05,
"loss": 0.0091,
"step": 12620
},
{
"epoch": 7.067711247901511,
"grad_norm": 0.15243375301361084,
"learning_rate": 3.275233310279321e-05,
"loss": 0.0058,
"step": 12630
},
{
"epoch": 7.073307218802462,
"grad_norm": 0.14671820402145386,
"learning_rate": 3.267475773701161e-05,
"loss": 0.0062,
"step": 12640
},
{
"epoch": 7.078903189703413,
"grad_norm": 0.22168104350566864,
"learning_rate": 3.2597229737780774e-05,
"loss": 0.0079,
"step": 12650
},
{
"epoch": 7.084499160604365,
"grad_norm": 0.25640955567359924,
"learning_rate": 3.251974931705933e-05,
"loss": 0.0085,
"step": 12660
},
{
"epoch": 7.090095131505316,
"grad_norm": 0.2436077892780304,
"learning_rate": 3.244231668667578e-05,
"loss": 0.0078,
"step": 12670
},
{
"epoch": 7.095691102406268,
"grad_norm": 0.19463610649108887,
"learning_rate": 3.236493205832795e-05,
"loss": 0.0066,
"step": 12680
},
{
"epoch": 7.101287073307219,
"grad_norm": 0.22004422545433044,
"learning_rate": 3.228759564358248e-05,
"loss": 0.0078,
"step": 12690
},
{
"epoch": 7.10688304420817,
"grad_norm": 0.1793327033519745,
"learning_rate": 3.221030765387417e-05,
"loss": 0.0059,
"step": 12700
},
{
"epoch": 7.1124790151091215,
"grad_norm": 0.2823750376701355,
"learning_rate": 3.2133068300505455e-05,
"loss": 0.0072,
"step": 12710
},
{
"epoch": 7.118074986010073,
"grad_norm": 0.3006185293197632,
"learning_rate": 3.205587779464576e-05,
"loss": 0.0099,
"step": 12720
},
{
"epoch": 7.123670956911024,
"grad_norm": 0.15955254435539246,
"learning_rate": 3.197873634733096e-05,
"loss": 0.01,
"step": 12730
},
{
"epoch": 7.129266927811975,
"grad_norm": 0.3392355442047119,
"learning_rate": 3.190164416946285e-05,
"loss": 0.0096,
"step": 12740
},
{
"epoch": 7.134862898712926,
"grad_norm": 0.209779292345047,
"learning_rate": 3.18246014718085e-05,
"loss": 0.0083,
"step": 12750
},
{
"epoch": 7.140458869613878,
"grad_norm": 0.13492996990680695,
"learning_rate": 3.1747608464999725e-05,
"loss": 0.0085,
"step": 12760
},
{
"epoch": 7.14605484051483,
"grad_norm": 0.20543181896209717,
"learning_rate": 3.167066535953242e-05,
"loss": 0.0099,
"step": 12770
},
{
"epoch": 7.151650811415781,
"grad_norm": 0.24595800042152405,
"learning_rate": 3.1593772365766105e-05,
"loss": 0.0089,
"step": 12780
},
{
"epoch": 7.157246782316732,
"grad_norm": 0.24962860345840454,
"learning_rate": 3.1516929693923315e-05,
"loss": 0.0111,
"step": 12790
},
{
"epoch": 7.162842753217683,
"grad_norm": 0.236158549785614,
"learning_rate": 3.144013755408895e-05,
"loss": 0.0092,
"step": 12800
},
{
"epoch": 7.1684387241186345,
"grad_norm": 0.09373817592859268,
"learning_rate": 3.136339615620985e-05,
"loss": 0.0073,
"step": 12810
},
{
"epoch": 7.174034695019586,
"grad_norm": 0.3018852770328522,
"learning_rate": 3.128670571009399e-05,
"loss": 0.0109,
"step": 12820
},
{
"epoch": 7.179630665920537,
"grad_norm": 0.22144253551959991,
"learning_rate": 3.121006642541014e-05,
"loss": 0.008,
"step": 12830
},
{
"epoch": 7.185226636821488,
"grad_norm": 0.14473740756511688,
"learning_rate": 3.113347851168721e-05,
"loss": 0.0095,
"step": 12840
},
{
"epoch": 7.19082260772244,
"grad_norm": 0.14747409522533417,
"learning_rate": 3.105694217831361e-05,
"loss": 0.0062,
"step": 12850
},
{
"epoch": 7.196418578623391,
"grad_norm": 0.2111588716506958,
"learning_rate": 3.098045763453678e-05,
"loss": 0.0074,
"step": 12860
},
{
"epoch": 7.202014549524343,
"grad_norm": 0.2098371833562851,
"learning_rate": 3.090402508946249e-05,
"loss": 0.0084,
"step": 12870
},
{
"epoch": 7.207610520425294,
"grad_norm": 0.1614372432231903,
"learning_rate": 3.082764475205442e-05,
"loss": 0.007,
"step": 12880
},
{
"epoch": 7.213206491326245,
"grad_norm": 0.0742206946015358,
"learning_rate": 3.075131683113352e-05,
"loss": 0.006,
"step": 12890
},
{
"epoch": 7.218802462227196,
"grad_norm": 0.07135152816772461,
"learning_rate": 3.0675041535377405e-05,
"loss": 0.0057,
"step": 12900
},
{
"epoch": 7.2243984331281474,
"grad_norm": 0.20988823473453522,
"learning_rate": 3.059881907331979e-05,
"loss": 0.0071,
"step": 12910
},
{
"epoch": 7.229994404029099,
"grad_norm": 0.10817866027355194,
"learning_rate": 3.052264965335e-05,
"loss": 0.0049,
"step": 12920
},
{
"epoch": 7.235590374930051,
"grad_norm": 0.13764233887195587,
"learning_rate": 3.0446533483712304e-05,
"loss": 0.0088,
"step": 12930
},
{
"epoch": 7.241186345831002,
"grad_norm": 0.17063380777835846,
"learning_rate": 3.0370470772505433e-05,
"loss": 0.0073,
"step": 12940
},
{
"epoch": 7.246782316731953,
"grad_norm": 0.11198591440916061,
"learning_rate": 3.0294461727681932e-05,
"loss": 0.0112,
"step": 12950
},
{
"epoch": 7.252378287632904,
"grad_norm": 0.1855844408273697,
"learning_rate": 3.0218506557047598e-05,
"loss": 0.0069,
"step": 12960
},
{
"epoch": 7.257974258533856,
"grad_norm": 0.10013962537050247,
"learning_rate": 3.0142605468260978e-05,
"loss": 0.0063,
"step": 12970
},
{
"epoch": 7.263570229434807,
"grad_norm": 0.16480940580368042,
"learning_rate": 3.006675866883275e-05,
"loss": 0.0062,
"step": 12980
},
{
"epoch": 7.269166200335758,
"grad_norm": 0.2087039351463318,
"learning_rate": 2.999096636612518e-05,
"loss": 0.0085,
"step": 12990
},
{
"epoch": 7.274762171236709,
"grad_norm": 0.15215320885181427,
"learning_rate": 2.991522876735154e-05,
"loss": 0.0077,
"step": 13000
},
{
"epoch": 7.280358142137661,
"grad_norm": 0.2687567472457886,
"learning_rate": 2.9839546079575497e-05,
"loss": 0.0105,
"step": 13010
},
{
"epoch": 7.2859541130386125,
"grad_norm": 0.23126524686813354,
"learning_rate": 2.976391850971065e-05,
"loss": 0.0076,
"step": 13020
},
{
"epoch": 7.291550083939564,
"grad_norm": 0.10021013021469116,
"learning_rate": 2.9688346264519866e-05,
"loss": 0.01,
"step": 13030
},
{
"epoch": 7.297146054840515,
"grad_norm": 0.16525714099407196,
"learning_rate": 2.9612829550614836e-05,
"loss": 0.0082,
"step": 13040
},
{
"epoch": 7.302742025741466,
"grad_norm": 0.16742092370986938,
"learning_rate": 2.9537368574455304e-05,
"loss": 0.0141,
"step": 13050
},
{
"epoch": 7.308337996642417,
"grad_norm": 0.07409677654504776,
"learning_rate": 2.9461963542348737e-05,
"loss": 0.0083,
"step": 13060
},
{
"epoch": 7.3139339675433686,
"grad_norm": 0.2794577181339264,
"learning_rate": 2.9386614660449596e-05,
"loss": 0.0091,
"step": 13070
},
{
"epoch": 7.31952993844432,
"grad_norm": 0.16768626868724823,
"learning_rate": 2.931132213475884e-05,
"loss": 0.0128,
"step": 13080
},
{
"epoch": 7.325125909345271,
"grad_norm": 0.19670413434505463,
"learning_rate": 2.9236086171123404e-05,
"loss": 0.0058,
"step": 13090
},
{
"epoch": 7.330721880246223,
"grad_norm": 0.1663038730621338,
"learning_rate": 2.916090697523549e-05,
"loss": 0.0081,
"step": 13100
},
{
"epoch": 7.336317851147174,
"grad_norm": 0.2468092292547226,
"learning_rate": 2.9085784752632157e-05,
"loss": 0.0094,
"step": 13110
},
{
"epoch": 7.3419138220481255,
"grad_norm": 0.20476868748664856,
"learning_rate": 2.9010719708694722e-05,
"loss": 0.0095,
"step": 13120
},
{
"epoch": 7.347509792949077,
"grad_norm": 0.19373807311058044,
"learning_rate": 2.8935712048648112e-05,
"loss": 0.0077,
"step": 13130
},
{
"epoch": 7.353105763850028,
"grad_norm": 0.16226400434970856,
"learning_rate": 2.8860761977560436e-05,
"loss": 0.0105,
"step": 13140
},
{
"epoch": 7.358701734750979,
"grad_norm": 0.2760455906391144,
"learning_rate": 2.878586970034232e-05,
"loss": 0.017,
"step": 13150
},
{
"epoch": 7.36429770565193,
"grad_norm": 0.269136518239975,
"learning_rate": 2.8711035421746367e-05,
"loss": 0.0127,
"step": 13160
},
{
"epoch": 7.3698936765528815,
"grad_norm": 0.2237207144498825,
"learning_rate": 2.8636259346366666e-05,
"loss": 0.007,
"step": 13170
},
{
"epoch": 7.375489647453834,
"grad_norm": 0.1836055964231491,
"learning_rate": 2.8561541678638142e-05,
"loss": 0.0077,
"step": 13180
},
{
"epoch": 7.381085618354785,
"grad_norm": 0.1962578445672989,
"learning_rate": 2.8486882622836026e-05,
"loss": 0.0078,
"step": 13190
},
{
"epoch": 7.386681589255736,
"grad_norm": 0.16476459801197052,
"learning_rate": 2.8412282383075363e-05,
"loss": 0.0093,
"step": 13200
},
{
"epoch": 7.392277560156687,
"grad_norm": 0.17988111078739166,
"learning_rate": 2.8337741163310317e-05,
"loss": 0.0081,
"step": 13210
},
{
"epoch": 7.3978735310576385,
"grad_norm": 0.21751411259174347,
"learning_rate": 2.8263259167333777e-05,
"loss": 0.0092,
"step": 13220
},
{
"epoch": 7.40346950195859,
"grad_norm": 0.150657057762146,
"learning_rate": 2.8188836598776662e-05,
"loss": 0.0094,
"step": 13230
},
{
"epoch": 7.409065472859541,
"grad_norm": 0.16722621023654938,
"learning_rate": 2.811447366110741e-05,
"loss": 0.0074,
"step": 13240
},
{
"epoch": 7.414661443760492,
"grad_norm": 0.16167713701725006,
"learning_rate": 2.804017055763149e-05,
"loss": 0.0063,
"step": 13250
},
{
"epoch": 7.420257414661444,
"grad_norm": 0.07585649192333221,
"learning_rate": 2.7965927491490705e-05,
"loss": 0.0112,
"step": 13260
},
{
"epoch": 7.425853385562395,
"grad_norm": 0.19306915998458862,
"learning_rate": 2.7891744665662823e-05,
"loss": 0.0069,
"step": 13270
},
{
"epoch": 7.431449356463347,
"grad_norm": 0.23972170054912567,
"learning_rate": 2.7817622282960815e-05,
"loss": 0.0062,
"step": 13280
},
{
"epoch": 7.437045327364298,
"grad_norm": 0.15592247247695923,
"learning_rate": 2.774356054603243e-05,
"loss": 0.0055,
"step": 13290
},
{
"epoch": 7.442641298265249,
"grad_norm": 0.20682460069656372,
"learning_rate": 2.766955965735968e-05,
"loss": 0.0052,
"step": 13300
},
{
"epoch": 7.4482372691662,
"grad_norm": 0.09251468628644943,
"learning_rate": 2.7595619819258116e-05,
"loss": 0.0077,
"step": 13310
},
{
"epoch": 7.453833240067151,
"grad_norm": 0.1358599066734314,
"learning_rate": 2.7521741233876496e-05,
"loss": 0.0098,
"step": 13320
},
{
"epoch": 7.459429210968103,
"grad_norm": 0.10552109777927399,
"learning_rate": 2.7447924103195976e-05,
"loss": 0.0045,
"step": 13330
},
{
"epoch": 7.465025181869054,
"grad_norm": 0.22331656515598297,
"learning_rate": 2.7374168629029813e-05,
"loss": 0.0075,
"step": 13340
},
{
"epoch": 7.470621152770006,
"grad_norm": 0.25520750880241394,
"learning_rate": 2.7300475013022663e-05,
"loss": 0.0079,
"step": 13350
},
{
"epoch": 7.476217123670957,
"grad_norm": 0.3160042464733124,
"learning_rate": 2.7226843456650037e-05,
"loss": 0.0123,
"step": 13360
},
{
"epoch": 7.481813094571908,
"grad_norm": 0.1619534194469452,
"learning_rate": 2.7153274161217846e-05,
"loss": 0.0049,
"step": 13370
},
{
"epoch": 7.48740906547286,
"grad_norm": 0.3031173646450043,
"learning_rate": 2.707976732786166e-05,
"loss": 0.0098,
"step": 13380
},
{
"epoch": 7.493005036373811,
"grad_norm": 0.1819227635860443,
"learning_rate": 2.7006323157546386e-05,
"loss": 0.0065,
"step": 13390
},
{
"epoch": 7.498601007274762,
"grad_norm": 0.17307765781879425,
"learning_rate": 2.693294185106562e-05,
"loss": 0.0087,
"step": 13400
},
{
"epoch": 7.504196978175713,
"grad_norm": 0.1600845456123352,
"learning_rate": 2.6859623609040984e-05,
"loss": 0.0061,
"step": 13410
},
{
"epoch": 7.509792949076665,
"grad_norm": 0.21853172779083252,
"learning_rate": 2.6786368631921836e-05,
"loss": 0.0054,
"step": 13420
},
{
"epoch": 7.5153889199776165,
"grad_norm": 0.16434265673160553,
"learning_rate": 2.67131771199844e-05,
"loss": 0.0104,
"step": 13430
},
{
"epoch": 7.520984890878568,
"grad_norm": 0.1688595563173294,
"learning_rate": 2.6640049273331515e-05,
"loss": 0.0068,
"step": 13440
},
{
"epoch": 7.526580861779519,
"grad_norm": 0.10968342423439026,
"learning_rate": 2.656698529189193e-05,
"loss": 0.0072,
"step": 13450
},
{
"epoch": 7.53217683268047,
"grad_norm": 0.12489527463912964,
"learning_rate": 2.6493985375419778e-05,
"loss": 0.0067,
"step": 13460
},
{
"epoch": 7.537772803581421,
"grad_norm": 0.3275364935398102,
"learning_rate": 2.642104972349403e-05,
"loss": 0.0066,
"step": 13470
},
{
"epoch": 7.5433687744823725,
"grad_norm": 0.10653702169656754,
"learning_rate": 2.6348178535517966e-05,
"loss": 0.0133,
"step": 13480
},
{
"epoch": 7.548964745383324,
"grad_norm": 0.16446645557880402,
"learning_rate": 2.6275372010718635e-05,
"loss": 0.0075,
"step": 13490
},
{
"epoch": 7.554560716284275,
"grad_norm": 0.17610448598861694,
"learning_rate": 2.6202630348146324e-05,
"loss": 0.0077,
"step": 13500
},
{
"epoch": 7.560156687185227,
"grad_norm": 0.1589246541261673,
"learning_rate": 2.612995374667394e-05,
"loss": 0.0044,
"step": 13510
},
{
"epoch": 7.565752658086178,
"grad_norm": 0.3019932806491852,
"learning_rate": 2.6057342404996522e-05,
"loss": 0.0067,
"step": 13520
},
{
"epoch": 7.5713486289871295,
"grad_norm": 0.19549022614955902,
"learning_rate": 2.5984796521630737e-05,
"loss": 0.0083,
"step": 13530
},
{
"epoch": 7.576944599888081,
"grad_norm": 0.1532057523727417,
"learning_rate": 2.591231629491423e-05,
"loss": 0.0043,
"step": 13540
},
{
"epoch": 7.582540570789032,
"grad_norm": 0.1547580510377884,
"learning_rate": 2.5839901923005205e-05,
"loss": 0.0083,
"step": 13550
},
{
"epoch": 7.588136541689983,
"grad_norm": 0.30122992396354675,
"learning_rate": 2.5767553603881767e-05,
"loss": 0.0064,
"step": 13560
},
{
"epoch": 7.593732512590934,
"grad_norm": 0.12354984134435654,
"learning_rate": 2.5695271535341443e-05,
"loss": 0.0059,
"step": 13570
},
{
"epoch": 7.5993284834918855,
"grad_norm": 0.14805443584918976,
"learning_rate": 2.562305591500069e-05,
"loss": 0.0072,
"step": 13580
},
{
"epoch": 7.604924454392837,
"grad_norm": 0.15644380450248718,
"learning_rate": 2.555090694029421e-05,
"loss": 0.0076,
"step": 13590
},
{
"epoch": 7.610520425293789,
"grad_norm": 0.22504927217960358,
"learning_rate": 2.547882480847461e-05,
"loss": 0.0114,
"step": 13600
},
{
"epoch": 7.61611639619474,
"grad_norm": 0.10872774571180344,
"learning_rate": 2.540680971661161e-05,
"loss": 0.0098,
"step": 13610
},
{
"epoch": 7.621712367095691,
"grad_norm": 0.1415761411190033,
"learning_rate": 2.5334861861591753e-05,
"loss": 0.0059,
"step": 13620
},
{
"epoch": 7.627308337996642,
"grad_norm": 0.18380744755268097,
"learning_rate": 2.526298144011775e-05,
"loss": 0.0074,
"step": 13630
},
{
"epoch": 7.632904308897594,
"grad_norm": 0.13029605150222778,
"learning_rate": 2.5191168648707887e-05,
"loss": 0.0046,
"step": 13640
},
{
"epoch": 7.638500279798545,
"grad_norm": 0.11022605746984482,
"learning_rate": 2.511942368369566e-05,
"loss": 0.0052,
"step": 13650
},
{
"epoch": 7.644096250699496,
"grad_norm": 0.1933964192867279,
"learning_rate": 2.5047746741228978e-05,
"loss": 0.0062,
"step": 13660
},
{
"epoch": 7.649692221600448,
"grad_norm": 0.10140606015920639,
"learning_rate": 2.4976138017269908e-05,
"loss": 0.005,
"step": 13670
},
{
"epoch": 7.655288192501399,
"grad_norm": 0.1074545681476593,
"learning_rate": 2.490459770759398e-05,
"loss": 0.0081,
"step": 13680
},
{
"epoch": 7.660884163402351,
"grad_norm": 0.11866219341754913,
"learning_rate": 2.4833126007789653e-05,
"loss": 0.0063,
"step": 13690
},
{
"epoch": 7.666480134303302,
"grad_norm": 0.14528554677963257,
"learning_rate": 2.476172311325783e-05,
"loss": 0.0075,
"step": 13700
},
{
"epoch": 7.672076105204253,
"grad_norm": 0.12533891201019287,
"learning_rate": 2.4690389219211273e-05,
"loss": 0.0056,
"step": 13710
},
{
"epoch": 7.677672076105204,
"grad_norm": 0.2228127419948578,
"learning_rate": 2.4619124520674146e-05,
"loss": 0.007,
"step": 13720
},
{
"epoch": 7.683268047006155,
"grad_norm": 0.167043074965477,
"learning_rate": 2.4547929212481435e-05,
"loss": 0.0092,
"step": 13730
},
{
"epoch": 7.688864017907107,
"grad_norm": 0.1956396847963333,
"learning_rate": 2.447680348927837e-05,
"loss": 0.0104,
"step": 13740
},
{
"epoch": 7.694459988808058,
"grad_norm": 0.3440028429031372,
"learning_rate": 2.4405747545519963e-05,
"loss": 0.0101,
"step": 13750
},
{
"epoch": 7.70005595970901,
"grad_norm": 0.19462288916110992,
"learning_rate": 2.433476157547044e-05,
"loss": 0.0123,
"step": 13760
},
{
"epoch": 7.705651930609961,
"grad_norm": 0.2774219512939453,
"learning_rate": 2.4263845773202736e-05,
"loss": 0.012,
"step": 13770
},
{
"epoch": 7.711247901510912,
"grad_norm": 0.15917648375034332,
"learning_rate": 2.419300033259798e-05,
"loss": 0.0072,
"step": 13780
},
{
"epoch": 7.7168438724118635,
"grad_norm": 0.17087779939174652,
"learning_rate": 2.4122225447344875e-05,
"loss": 0.0051,
"step": 13790
},
{
"epoch": 7.722439843312815,
"grad_norm": 0.3049764931201935,
"learning_rate": 2.405152131093926e-05,
"loss": 0.0068,
"step": 13800
},
{
"epoch": 7.728035814213766,
"grad_norm": 0.23013077676296234,
"learning_rate": 2.3980888116683515e-05,
"loss": 0.0093,
"step": 13810
},
{
"epoch": 7.733631785114717,
"grad_norm": 0.25196191668510437,
"learning_rate": 2.3910326057686127e-05,
"loss": 0.0063,
"step": 13820
},
{
"epoch": 7.739227756015668,
"grad_norm": 0.13192011415958405,
"learning_rate": 2.3839835326861104e-05,
"loss": 0.0077,
"step": 13830
},
{
"epoch": 7.74482372691662,
"grad_norm": 0.14442972838878632,
"learning_rate": 2.3769416116927335e-05,
"loss": 0.0131,
"step": 13840
},
{
"epoch": 7.750419697817572,
"grad_norm": 0.1425463706254959,
"learning_rate": 2.3699068620408304e-05,
"loss": 0.0066,
"step": 13850
},
{
"epoch": 7.756015668718523,
"grad_norm": 0.1162482276558876,
"learning_rate": 2.362879302963135e-05,
"loss": 0.007,
"step": 13860
},
{
"epoch": 7.761611639619474,
"grad_norm": 0.21869398653507233,
"learning_rate": 2.3558589536727277e-05,
"loss": 0.0045,
"step": 13870
},
{
"epoch": 7.767207610520425,
"grad_norm": 0.1804109364748001,
"learning_rate": 2.3488458333629777e-05,
"loss": 0.0064,
"step": 13880
},
{
"epoch": 7.7728035814213765,
"grad_norm": 0.18711616098880768,
"learning_rate": 2.341839961207482e-05,
"loss": 0.0082,
"step": 13890
},
{
"epoch": 7.778399552322328,
"grad_norm": 0.17115071415901184,
"learning_rate": 2.3348413563600325e-05,
"loss": 0.008,
"step": 13900
},
{
"epoch": 7.783995523223279,
"grad_norm": 0.3199642300605774,
"learning_rate": 2.3278500379545436e-05,
"loss": 0.008,
"step": 13910
},
{
"epoch": 7.789591494124231,
"grad_norm": 0.16800075769424438,
"learning_rate": 2.3208660251050158e-05,
"loss": 0.0054,
"step": 13920
},
{
"epoch": 7.795187465025182,
"grad_norm": 0.11445470154285431,
"learning_rate": 2.3138893369054766e-05,
"loss": 0.0067,
"step": 13930
},
{
"epoch": 7.800783435926133,
"grad_norm": 0.1465342938899994,
"learning_rate": 2.3069199924299174e-05,
"loss": 0.0046,
"step": 13940
},
{
"epoch": 7.806379406827085,
"grad_norm": 0.10726216435432434,
"learning_rate": 2.2999580107322653e-05,
"loss": 0.013,
"step": 13950
},
{
"epoch": 7.811975377728036,
"grad_norm": 0.2467944324016571,
"learning_rate": 2.29300341084631e-05,
"loss": 0.006,
"step": 13960
},
{
"epoch": 7.817571348628987,
"grad_norm": 0.18158167600631714,
"learning_rate": 2.2860562117856647e-05,
"loss": 0.0065,
"step": 13970
},
{
"epoch": 7.823167319529938,
"grad_norm": 0.1618615835905075,
"learning_rate": 2.279116432543705e-05,
"loss": 0.0065,
"step": 13980
},
{
"epoch": 7.8287632904308895,
"grad_norm": 0.1069146990776062,
"learning_rate": 2.2721840920935196e-05,
"loss": 0.0105,
"step": 13990
},
{
"epoch": 7.834359261331841,
"grad_norm": 0.12003065645694733,
"learning_rate": 2.2652592093878666e-05,
"loss": 0.0049,
"step": 14000
},
{
"epoch": 7.839955232232793,
"grad_norm": 0.09423186630010605,
"learning_rate": 2.258341803359108e-05,
"loss": 0.0061,
"step": 14010
},
{
"epoch": 7.845551203133744,
"grad_norm": 0.35245028138160706,
"learning_rate": 2.251431892919171e-05,
"loss": 0.0091,
"step": 14020
},
{
"epoch": 7.851147174034695,
"grad_norm": 0.11108125001192093,
"learning_rate": 2.2445294969594844e-05,
"loss": 0.007,
"step": 14030
},
{
"epoch": 7.856743144935646,
"grad_norm": 0.10527674853801727,
"learning_rate": 2.237634634350934e-05,
"loss": 0.0042,
"step": 14040
},
{
"epoch": 7.862339115836598,
"grad_norm": 0.2263229489326477,
"learning_rate": 2.2307473239438154e-05,
"loss": 0.0056,
"step": 14050
},
{
"epoch": 7.867935086737549,
"grad_norm": 0.13221915066242218,
"learning_rate": 2.2238675845677663e-05,
"loss": 0.0068,
"step": 14060
},
{
"epoch": 7.8735310576385,
"grad_norm": 0.17508424818515778,
"learning_rate": 2.2169954350317374e-05,
"loss": 0.007,
"step": 14070
},
{
"epoch": 7.879127028539451,
"grad_norm": 0.24999241530895233,
"learning_rate": 2.2101308941239203e-05,
"loss": 0.0085,
"step": 14080
},
{
"epoch": 7.8847229994404024,
"grad_norm": 0.12810635566711426,
"learning_rate": 2.2032739806117058e-05,
"loss": 0.0084,
"step": 14090
},
{
"epoch": 7.8903189703413545,
"grad_norm": 0.22745615243911743,
"learning_rate": 2.196424713241637e-05,
"loss": 0.0145,
"step": 14100
},
{
"epoch": 7.895914941242306,
"grad_norm": 0.0886574536561966,
"learning_rate": 2.1895831107393484e-05,
"loss": 0.0071,
"step": 14110
},
{
"epoch": 7.901510912143257,
"grad_norm": 0.18623238801956177,
"learning_rate": 2.182749191809518e-05,
"loss": 0.0077,
"step": 14120
},
{
"epoch": 7.907106883044208,
"grad_norm": 0.20176784694194794,
"learning_rate": 2.1759229751358217e-05,
"loss": 0.008,
"step": 14130
},
{
"epoch": 7.912702853945159,
"grad_norm": 0.18935443460941315,
"learning_rate": 2.1691044793808734e-05,
"loss": 0.0069,
"step": 14140
},
{
"epoch": 7.918298824846111,
"grad_norm": 0.18812550604343414,
"learning_rate": 2.1622937231861822e-05,
"loss": 0.0051,
"step": 14150
},
{
"epoch": 7.923894795747062,
"grad_norm": 0.12224578857421875,
"learning_rate": 2.1554907251720945e-05,
"loss": 0.0053,
"step": 14160
},
{
"epoch": 7.929490766648014,
"grad_norm": 0.12175440043210983,
"learning_rate": 2.148695503937745e-05,
"loss": 0.0075,
"step": 14170
},
{
"epoch": 7.935086737548965,
"grad_norm": 0.11878049373626709,
"learning_rate": 2.1419080780610123e-05,
"loss": 0.0062,
"step": 14180
},
{
"epoch": 7.940682708449916,
"grad_norm": 0.19284716248512268,
"learning_rate": 2.1351284660984572e-05,
"loss": 0.0063,
"step": 14190
},
{
"epoch": 7.9462786793508675,
"grad_norm": 0.159319207072258,
"learning_rate": 2.128356686585282e-05,
"loss": 0.0064,
"step": 14200
},
{
"epoch": 7.951874650251819,
"grad_norm": 0.16800148785114288,
"learning_rate": 2.121592758035273e-05,
"loss": 0.0054,
"step": 14210
},
{
"epoch": 7.95747062115277,
"grad_norm": 0.23277972638607025,
"learning_rate": 2.1148366989407496e-05,
"loss": 0.0056,
"step": 14220
},
{
"epoch": 7.963066592053721,
"grad_norm": 0.08594591915607452,
"learning_rate": 2.1080885277725236e-05,
"loss": 0.0054,
"step": 14230
},
{
"epoch": 7.968662562954672,
"grad_norm": 0.21676327288150787,
"learning_rate": 2.1013482629798333e-05,
"loss": 0.0071,
"step": 14240
},
{
"epoch": 7.9742585338556236,
"grad_norm": 0.1778232604265213,
"learning_rate": 2.094615922990309e-05,
"loss": 0.0067,
"step": 14250
},
{
"epoch": 7.979854504756576,
"grad_norm": 0.2177736759185791,
"learning_rate": 2.0878915262099098e-05,
"loss": 0.0068,
"step": 14260
},
{
"epoch": 7.985450475657527,
"grad_norm": 0.25127291679382324,
"learning_rate": 2.0811750910228774e-05,
"loss": 0.0104,
"step": 14270
},
{
"epoch": 7.991046446558478,
"grad_norm": 0.08792544901371002,
"learning_rate": 2.0744666357916925e-05,
"loss": 0.0064,
"step": 14280
},
{
"epoch": 7.996642417459429,
"grad_norm": 0.1125119999051094,
"learning_rate": 2.067766178857013e-05,
"loss": 0.0099,
"step": 14290
},
{
"epoch": 8.00223838836038,
"grad_norm": 0.18561410903930664,
"learning_rate": 2.061073738537635e-05,
"loss": 0.0089,
"step": 14300
},
{
"epoch": 8.007834359261333,
"grad_norm": 0.10987678915262222,
"learning_rate": 2.0543893331304333e-05,
"loss": 0.0071,
"step": 14310
},
{
"epoch": 8.013430330162283,
"grad_norm": 0.10636857897043228,
"learning_rate": 2.0477129809103147e-05,
"loss": 0.007,
"step": 14320
},
{
"epoch": 8.019026301063235,
"grad_norm": 0.16379332542419434,
"learning_rate": 2.0410447001301753e-05,
"loss": 0.006,
"step": 14330
},
{
"epoch": 8.024622271964185,
"grad_norm": 0.09951362758874893,
"learning_rate": 2.0343845090208368e-05,
"loss": 0.0052,
"step": 14340
},
{
"epoch": 8.030218242865137,
"grad_norm": 0.1974375694990158,
"learning_rate": 2.0277324257910106e-05,
"loss": 0.0061,
"step": 14350
},
{
"epoch": 8.035814213766088,
"grad_norm": 0.16213521361351013,
"learning_rate": 2.0210884686272368e-05,
"loss": 0.0056,
"step": 14360
},
{
"epoch": 8.04141018466704,
"grad_norm": 0.32907333970069885,
"learning_rate": 2.0144526556938387e-05,
"loss": 0.011,
"step": 14370
},
{
"epoch": 8.047006155567992,
"grad_norm": 0.24763990938663483,
"learning_rate": 2.0078250051328784e-05,
"loss": 0.0059,
"step": 14380
},
{
"epoch": 8.052602126468942,
"grad_norm": 0.06522991508245468,
"learning_rate": 2.0012055350640986e-05,
"loss": 0.0075,
"step": 14390
},
{
"epoch": 8.058198097369894,
"grad_norm": 0.1594466120004654,
"learning_rate": 1.9945942635848748e-05,
"loss": 0.0107,
"step": 14400
},
{
"epoch": 8.063794068270845,
"grad_norm": 0.11248297244310379,
"learning_rate": 1.9879912087701753e-05,
"loss": 0.0043,
"step": 14410
},
{
"epoch": 8.069390039171797,
"grad_norm": 0.11491246521472931,
"learning_rate": 1.981396388672496e-05,
"loss": 0.0043,
"step": 14420
},
{
"epoch": 8.074986010072747,
"grad_norm": 0.22106263041496277,
"learning_rate": 1.974809821321827e-05,
"loss": 0.0055,
"step": 14430
},
{
"epoch": 8.0805819809737,
"grad_norm": 0.16226910054683685,
"learning_rate": 1.9682315247255894e-05,
"loss": 0.0085,
"step": 14440
},
{
"epoch": 8.08617795187465,
"grad_norm": 0.09066546708345413,
"learning_rate": 1.9616615168685943e-05,
"loss": 0.0083,
"step": 14450
},
{
"epoch": 8.091773922775602,
"grad_norm": 0.11933751404285431,
"learning_rate": 1.9550998157129946e-05,
"loss": 0.0057,
"step": 14460
},
{
"epoch": 8.097369893676554,
"grad_norm": 0.1404096931219101,
"learning_rate": 1.9485464391982284e-05,
"loss": 0.0047,
"step": 14470
},
{
"epoch": 8.102965864577504,
"grad_norm": 0.2508150339126587,
"learning_rate": 1.942001405240979e-05,
"loss": 0.0076,
"step": 14480
},
{
"epoch": 8.108561835478456,
"grad_norm": 0.17527352273464203,
"learning_rate": 1.9354647317351188e-05,
"loss": 0.0077,
"step": 14490
},
{
"epoch": 8.114157806379406,
"grad_norm": 0.11819542944431305,
"learning_rate": 1.928936436551661e-05,
"loss": 0.0048,
"step": 14500
},
{
"epoch": 8.119753777280359,
"grad_norm": 0.17159508168697357,
"learning_rate": 1.9224165375387193e-05,
"loss": 0.0072,
"step": 14510
},
{
"epoch": 8.125349748181309,
"grad_norm": 0.1392519325017929,
"learning_rate": 1.9159050525214452e-05,
"loss": 0.0058,
"step": 14520
},
{
"epoch": 8.130945719082261,
"grad_norm": 0.2096053957939148,
"learning_rate": 1.909401999301993e-05,
"loss": 0.007,
"step": 14530
},
{
"epoch": 8.136541689983211,
"grad_norm": 0.2075774371623993,
"learning_rate": 1.9029073956594606e-05,
"loss": 0.0063,
"step": 14540
},
{
"epoch": 8.142137660884163,
"grad_norm": 0.0607825368642807,
"learning_rate": 1.8964212593498442e-05,
"loss": 0.0046,
"step": 14550
},
{
"epoch": 8.147733631785115,
"grad_norm": 0.20028991997241974,
"learning_rate": 1.8899436081059975e-05,
"loss": 0.0067,
"step": 14560
},
{
"epoch": 8.153329602686066,
"grad_norm": 0.12437421083450317,
"learning_rate": 1.8834744596375666e-05,
"loss": 0.0045,
"step": 14570
},
{
"epoch": 8.158925573587018,
"grad_norm": 0.09412521868944168,
"learning_rate": 1.877013831630961e-05,
"loss": 0.0053,
"step": 14580
},
{
"epoch": 8.164521544487968,
"grad_norm": 0.30078214406967163,
"learning_rate": 1.8705617417492883e-05,
"loss": 0.0088,
"step": 14590
},
{
"epoch": 8.17011751538892,
"grad_norm": 0.2020367681980133,
"learning_rate": 1.8641182076323148e-05,
"loss": 0.0074,
"step": 14600
},
{
"epoch": 8.17571348628987,
"grad_norm": 0.12557435035705566,
"learning_rate": 1.85768324689642e-05,
"loss": 0.0054,
"step": 14610
},
{
"epoch": 8.181309457190823,
"grad_norm": 0.11945895105600357,
"learning_rate": 1.851256877134538e-05,
"loss": 0.0078,
"step": 14620
},
{
"epoch": 8.186905428091775,
"grad_norm": 0.33773839473724365,
"learning_rate": 1.8448391159161204e-05,
"loss": 0.0101,
"step": 14630
},
{
"epoch": 8.192501398992725,
"grad_norm": 0.2184380739927292,
"learning_rate": 1.838429980787081e-05,
"loss": 0.0059,
"step": 14640
},
{
"epoch": 8.198097369893677,
"grad_norm": 0.06359529495239258,
"learning_rate": 1.8320294892697478e-05,
"loss": 0.006,
"step": 14650
},
{
"epoch": 8.203693340794628,
"grad_norm": 0.1690957248210907,
"learning_rate": 1.8256376588628238e-05,
"loss": 0.008,
"step": 14660
},
{
"epoch": 8.20928931169558,
"grad_norm": 0.296812504529953,
"learning_rate": 1.8192545070413282e-05,
"loss": 0.0069,
"step": 14670
},
{
"epoch": 8.21488528259653,
"grad_norm": 0.08360179513692856,
"learning_rate": 1.8128800512565513e-05,
"loss": 0.007,
"step": 14680
},
{
"epoch": 8.220481253497482,
"grad_norm": 0.12985661625862122,
"learning_rate": 1.8065143089360172e-05,
"loss": 0.0079,
"step": 14690
},
{
"epoch": 8.226077224398432,
"grad_norm": 0.10445982962846756,
"learning_rate": 1.800157297483417e-05,
"loss": 0.0036,
"step": 14700
},
{
"epoch": 8.231673195299384,
"grad_norm": 0.1876983791589737,
"learning_rate": 1.7938090342785817e-05,
"loss": 0.0058,
"step": 14710
},
{
"epoch": 8.237269166200337,
"grad_norm": 0.07933235913515091,
"learning_rate": 1.787469536677419e-05,
"loss": 0.0048,
"step": 14720
},
{
"epoch": 8.242865137101287,
"grad_norm": 0.2597578465938568,
"learning_rate": 1.7811388220118707e-05,
"loss": 0.0077,
"step": 14730
},
{
"epoch": 8.248461108002239,
"grad_norm": 0.1318414807319641,
"learning_rate": 1.774816907589873e-05,
"loss": 0.0038,
"step": 14740
},
{
"epoch": 8.25405707890319,
"grad_norm": 0.23657891154289246,
"learning_rate": 1.768503810695295e-05,
"loss": 0.0074,
"step": 14750
},
{
"epoch": 8.259653049804141,
"grad_norm": 0.12084835767745972,
"learning_rate": 1.7621995485879062e-05,
"loss": 0.0086,
"step": 14760
},
{
"epoch": 8.265249020705092,
"grad_norm": 0.2077346295118332,
"learning_rate": 1.755904138503316e-05,
"loss": 0.0066,
"step": 14770
},
{
"epoch": 8.270844991606044,
"grad_norm": 0.26253417134284973,
"learning_rate": 1.749617597652934e-05,
"loss": 0.0107,
"step": 14780
},
{
"epoch": 8.276440962506994,
"grad_norm": 0.25481829047203064,
"learning_rate": 1.743339943223926e-05,
"loss": 0.0044,
"step": 14790
},
{
"epoch": 8.282036933407946,
"grad_norm": 0.23157408833503723,
"learning_rate": 1.7370711923791567e-05,
"loss": 0.0069,
"step": 14800
},
{
"epoch": 8.287632904308898,
"grad_norm": 0.10085418075323105,
"learning_rate": 1.7308113622571544e-05,
"loss": 0.0036,
"step": 14810
},
{
"epoch": 8.293228875209849,
"grad_norm": 0.10876370966434479,
"learning_rate": 1.7245604699720535e-05,
"loss": 0.007,
"step": 14820
},
{
"epoch": 8.2988248461108,
"grad_norm": 0.20935757458209991,
"learning_rate": 1.7183185326135543e-05,
"loss": 0.0055,
"step": 14830
},
{
"epoch": 8.304420817011751,
"grad_norm": 0.13824748992919922,
"learning_rate": 1.712085567246878e-05,
"loss": 0.0072,
"step": 14840
},
{
"epoch": 8.310016787912703,
"grad_norm": 0.3369564414024353,
"learning_rate": 1.70586159091271e-05,
"loss": 0.0069,
"step": 14850
},
{
"epoch": 8.315612758813653,
"grad_norm": 0.2684394419193268,
"learning_rate": 1.699646620627168e-05,
"loss": 0.0061,
"step": 14860
},
{
"epoch": 8.321208729714606,
"grad_norm": 0.23020261526107788,
"learning_rate": 1.6934406733817414e-05,
"loss": 0.0126,
"step": 14870
},
{
"epoch": 8.326804700615558,
"grad_norm": 0.23905567824840546,
"learning_rate": 1.6872437661432517e-05,
"loss": 0.0057,
"step": 14880
},
{
"epoch": 8.332400671516508,
"grad_norm": 0.11183072626590729,
"learning_rate": 1.6810559158538092e-05,
"loss": 0.0061,
"step": 14890
},
{
"epoch": 8.33799664241746,
"grad_norm": 0.11450804024934769,
"learning_rate": 1.6748771394307585e-05,
"loss": 0.0041,
"step": 14900
},
{
"epoch": 8.34359261331841,
"grad_norm": 0.14276103675365448,
"learning_rate": 1.6687074537666398e-05,
"loss": 0.0046,
"step": 14910
},
{
"epoch": 8.349188584219362,
"grad_norm": 0.1129729300737381,
"learning_rate": 1.662546875729138e-05,
"loss": 0.0063,
"step": 14920
},
{
"epoch": 8.354784555120313,
"grad_norm": 0.18285100162029266,
"learning_rate": 1.6563954221610355e-05,
"loss": 0.0106,
"step": 14930
},
{
"epoch": 8.360380526021265,
"grad_norm": 0.10539596527814865,
"learning_rate": 1.6502531098801753e-05,
"loss": 0.0043,
"step": 14940
},
{
"epoch": 8.365976496922215,
"grad_norm": 0.13819168508052826,
"learning_rate": 1.6441199556794033e-05,
"loss": 0.0065,
"step": 14950
},
{
"epoch": 8.371572467823167,
"grad_norm": 0.19076746702194214,
"learning_rate": 1.637995976326527e-05,
"loss": 0.01,
"step": 14960
},
{
"epoch": 8.37716843872412,
"grad_norm": 0.24138867855072021,
"learning_rate": 1.631881188564275e-05,
"loss": 0.0082,
"step": 14970
},
{
"epoch": 8.38276440962507,
"grad_norm": 0.1397552490234375,
"learning_rate": 1.62577560911024e-05,
"loss": 0.0047,
"step": 14980
},
{
"epoch": 8.388360380526022,
"grad_norm": 0.08066073060035706,
"learning_rate": 1.6196792546568472e-05,
"loss": 0.0076,
"step": 14990
},
{
"epoch": 8.393956351426972,
"grad_norm": 0.2772653102874756,
"learning_rate": 1.6135921418712956e-05,
"loss": 0.008,
"step": 15000
},
{
"epoch": 8.399552322327924,
"grad_norm": 0.1933654099702835,
"learning_rate": 1.6075142873955164e-05,
"loss": 0.0049,
"step": 15010
},
{
"epoch": 8.405148293228875,
"grad_norm": 0.09738892316818237,
"learning_rate": 1.6014457078461353e-05,
"loss": 0.0046,
"step": 15020
},
{
"epoch": 8.410744264129827,
"grad_norm": 0.11632133275270462,
"learning_rate": 1.5953864198144135e-05,
"loss": 0.0079,
"step": 15030
},
{
"epoch": 8.416340235030777,
"grad_norm": 0.10637476295232773,
"learning_rate": 1.5893364398662176e-05,
"loss": 0.0052,
"step": 15040
},
{
"epoch": 8.421936205931729,
"grad_norm": 0.22587163746356964,
"learning_rate": 1.583295784541958e-05,
"loss": 0.0064,
"step": 15050
},
{
"epoch": 8.427532176832681,
"grad_norm": 0.15165762603282928,
"learning_rate": 1.5772644703565565e-05,
"loss": 0.0068,
"step": 15060
},
{
"epoch": 8.433128147733632,
"grad_norm": 0.13497453927993774,
"learning_rate": 1.5712425137993973e-05,
"loss": 0.0076,
"step": 15070
},
{
"epoch": 8.438724118634584,
"grad_norm": 0.1444980800151825,
"learning_rate": 1.5652299313342773e-05,
"loss": 0.0066,
"step": 15080
},
{
"epoch": 8.444320089535534,
"grad_norm": 0.32101383805274963,
"learning_rate": 1.5592267393993716e-05,
"loss": 0.0054,
"step": 15090
},
{
"epoch": 8.449916060436486,
"grad_norm": 0.26894599199295044,
"learning_rate": 1.553232954407171e-05,
"loss": 0.0039,
"step": 15100
},
{
"epoch": 8.455512031337436,
"grad_norm": 0.26109951734542847,
"learning_rate": 1.5472485927444597e-05,
"loss": 0.0057,
"step": 15110
},
{
"epoch": 8.461108002238388,
"grad_norm": 0.09691357612609863,
"learning_rate": 1.5412736707722537e-05,
"loss": 0.0036,
"step": 15120
},
{
"epoch": 8.46670397313934,
"grad_norm": 0.08756586909294128,
"learning_rate": 1.5353082048257596e-05,
"loss": 0.0059,
"step": 15130
},
{
"epoch": 8.47229994404029,
"grad_norm": 0.0936000794172287,
"learning_rate": 1.5293522112143373e-05,
"loss": 0.0042,
"step": 15140
},
{
"epoch": 8.477895914941243,
"grad_norm": 0.20747262239456177,
"learning_rate": 1.5234057062214402e-05,
"loss": 0.0118,
"step": 15150
},
{
"epoch": 8.483491885842193,
"grad_norm": 0.11843043565750122,
"learning_rate": 1.517468706104589e-05,
"loss": 0.0072,
"step": 15160
},
{
"epoch": 8.489087856743145,
"grad_norm": 0.23854964971542358,
"learning_rate": 1.5115412270953167e-05,
"loss": 0.0066,
"step": 15170
},
{
"epoch": 8.494683827644096,
"grad_norm": 0.1770446002483368,
"learning_rate": 1.5056232853991209e-05,
"loss": 0.0062,
"step": 15180
},
{
"epoch": 8.500279798545048,
"grad_norm": 0.23799461126327515,
"learning_rate": 1.4997148971954344e-05,
"loss": 0.0075,
"step": 15190
},
{
"epoch": 8.505875769445998,
"grad_norm": 0.3780512511730194,
"learning_rate": 1.4938160786375572e-05,
"loss": 0.0081,
"step": 15200
},
{
"epoch": 8.51147174034695,
"grad_norm": 0.11119966208934784,
"learning_rate": 1.4879268458526379e-05,
"loss": 0.0046,
"step": 15210
},
{
"epoch": 8.517067711247902,
"grad_norm": 0.09658356010913849,
"learning_rate": 1.4820472149416154e-05,
"loss": 0.007,
"step": 15220
},
{
"epoch": 8.522663682148853,
"grad_norm": 0.17144611477851868,
"learning_rate": 1.4761772019791748e-05,
"loss": 0.0056,
"step": 15230
},
{
"epoch": 8.528259653049805,
"grad_norm": 0.14623138308525085,
"learning_rate": 1.470316823013707e-05,
"loss": 0.0051,
"step": 15240
},
{
"epoch": 8.533855623950755,
"grad_norm": 0.1579722911119461,
"learning_rate": 1.4644660940672627e-05,
"loss": 0.0049,
"step": 15250
},
{
"epoch": 8.539451594851707,
"grad_norm": 0.14990709722042084,
"learning_rate": 1.4586250311355132e-05,
"loss": 0.006,
"step": 15260
},
{
"epoch": 8.545047565752657,
"grad_norm": 0.24695487320423126,
"learning_rate": 1.4527936501877032e-05,
"loss": 0.0072,
"step": 15270
},
{
"epoch": 8.55064353665361,
"grad_norm": 0.2550105154514313,
"learning_rate": 1.4469719671666043e-05,
"loss": 0.0058,
"step": 15280
},
{
"epoch": 8.556239507554562,
"grad_norm": 0.17998188734054565,
"learning_rate": 1.4411599979884744e-05,
"loss": 0.0089,
"step": 15290
},
{
"epoch": 8.561835478455512,
"grad_norm": 0.3639971613883972,
"learning_rate": 1.435357758543015e-05,
"loss": 0.0085,
"step": 15300
},
{
"epoch": 8.567431449356464,
"grad_norm": 0.12687824666500092,
"learning_rate": 1.4295652646933277e-05,
"loss": 0.0061,
"step": 15310
},
{
"epoch": 8.573027420257414,
"grad_norm": 0.1352899670600891,
"learning_rate": 1.4237825322758736e-05,
"loss": 0.0066,
"step": 15320
},
{
"epoch": 8.578623391158366,
"grad_norm": 0.2139214277267456,
"learning_rate": 1.4180095771004154e-05,
"loss": 0.006,
"step": 15330
},
{
"epoch": 8.584219362059317,
"grad_norm": 0.13526403903961182,
"learning_rate": 1.412246414949997e-05,
"loss": 0.0061,
"step": 15340
},
{
"epoch": 8.589815332960269,
"grad_norm": 0.10206010937690735,
"learning_rate": 1.4064930615808808e-05,
"loss": 0.0042,
"step": 15350
},
{
"epoch": 8.59541130386122,
"grad_norm": 0.1680195927619934,
"learning_rate": 1.4007495327225162e-05,
"loss": 0.0063,
"step": 15360
},
{
"epoch": 8.601007274762171,
"grad_norm": 0.2092961072921753,
"learning_rate": 1.3950158440774957e-05,
"loss": 0.0089,
"step": 15370
},
{
"epoch": 8.606603245663123,
"grad_norm": 0.24639266729354858,
"learning_rate": 1.389292011321498e-05,
"loss": 0.0037,
"step": 15380
},
{
"epoch": 8.612199216564074,
"grad_norm": 0.20889121294021606,
"learning_rate": 1.383578050103268e-05,
"loss": 0.0036,
"step": 15390
},
{
"epoch": 8.617795187465026,
"grad_norm": 0.1731806993484497,
"learning_rate": 1.3778739760445552e-05,
"loss": 0.0049,
"step": 15400
},
{
"epoch": 8.623391158365976,
"grad_norm": 0.15791241824626923,
"learning_rate": 1.3721798047400813e-05,
"loss": 0.0064,
"step": 15410
},
{
"epoch": 8.628987129266928,
"grad_norm": 0.2612980604171753,
"learning_rate": 1.3664955517574968e-05,
"loss": 0.0056,
"step": 15420
},
{
"epoch": 8.634583100167879,
"grad_norm": 0.12942969799041748,
"learning_rate": 1.3608212326373249e-05,
"loss": 0.0044,
"step": 15430
},
{
"epoch": 8.64017907106883,
"grad_norm": 0.224086731672287,
"learning_rate": 1.3551568628929434e-05,
"loss": 0.0065,
"step": 15440
},
{
"epoch": 8.645775041969781,
"grad_norm": 0.234924778342247,
"learning_rate": 1.3495024580105192e-05,
"loss": 0.0055,
"step": 15450
},
{
"epoch": 8.651371012870733,
"grad_norm": 0.14701171219348907,
"learning_rate": 1.343858033448982e-05,
"loss": 0.0078,
"step": 15460
},
{
"epoch": 8.656966983771685,
"grad_norm": 0.06672263145446777,
"learning_rate": 1.3382236046399722e-05,
"loss": 0.0057,
"step": 15470
},
{
"epoch": 8.662562954672635,
"grad_norm": 0.11234284192323685,
"learning_rate": 1.3325991869878013e-05,
"loss": 0.0053,
"step": 15480
},
{
"epoch": 8.668158925573588,
"grad_norm": 0.2150266021490097,
"learning_rate": 1.3269847958694148e-05,
"loss": 0.0045,
"step": 15490
},
{
"epoch": 8.673754896474538,
"grad_norm": 0.37493982911109924,
"learning_rate": 1.3213804466343421e-05,
"loss": 0.0058,
"step": 15500
},
{
"epoch": 8.67935086737549,
"grad_norm": 0.054848652333021164,
"learning_rate": 1.3157861546046613e-05,
"loss": 0.0062,
"step": 15510
},
{
"epoch": 8.68494683827644,
"grad_norm": 0.30526259541511536,
"learning_rate": 1.3102019350749528e-05,
"loss": 0.005,
"step": 15520
},
{
"epoch": 8.690542809177392,
"grad_norm": 0.11414709687232971,
"learning_rate": 1.3046278033122577e-05,
"loss": 0.0055,
"step": 15530
},
{
"epoch": 8.696138780078343,
"grad_norm": 0.19409357011318207,
"learning_rate": 1.299063774556042e-05,
"loss": 0.0048,
"step": 15540
},
{
"epoch": 8.701734750979295,
"grad_norm": 0.0840323343873024,
"learning_rate": 1.293509864018146e-05,
"loss": 0.0062,
"step": 15550
},
{
"epoch": 8.707330721880247,
"grad_norm": 0.2921426594257355,
"learning_rate": 1.2879660868827508e-05,
"loss": 0.0055,
"step": 15560
},
{
"epoch": 8.712926692781197,
"grad_norm": 0.18921242654323578,
"learning_rate": 1.2824324583063302e-05,
"loss": 0.0065,
"step": 15570
},
{
"epoch": 8.71852266368215,
"grad_norm": 0.2043517678976059,
"learning_rate": 1.2769089934176126e-05,
"loss": 0.0048,
"step": 15580
},
{
"epoch": 8.7241186345831,
"grad_norm": 0.14090007543563843,
"learning_rate": 1.2713957073175425e-05,
"loss": 0.0043,
"step": 15590
},
{
"epoch": 8.729714605484052,
"grad_norm": 0.13512486219406128,
"learning_rate": 1.2658926150792322e-05,
"loss": 0.009,
"step": 15600
},
{
"epoch": 8.735310576385002,
"grad_norm": 0.16850633919239044,
"learning_rate": 1.2603997317479238e-05,
"loss": 0.0043,
"step": 15610
},
{
"epoch": 8.740906547285954,
"grad_norm": 0.0671689510345459,
"learning_rate": 1.2549170723409549e-05,
"loss": 0.0047,
"step": 15620
},
{
"epoch": 8.746502518186904,
"grad_norm": 0.17265447974205017,
"learning_rate": 1.2494446518477022e-05,
"loss": 0.0078,
"step": 15630
},
{
"epoch": 8.752098489087857,
"grad_norm": 0.09633443504571915,
"learning_rate": 1.243982485229559e-05,
"loss": 0.01,
"step": 15640
},
{
"epoch": 8.757694459988809,
"grad_norm": 0.07608158886432648,
"learning_rate": 1.2385305874198776e-05,
"loss": 0.008,
"step": 15650
},
{
"epoch": 8.763290430889759,
"grad_norm": 0.1386493295431137,
"learning_rate": 1.233088973323937e-05,
"loss": 0.0141,
"step": 15660
},
{
"epoch": 8.768886401790711,
"grad_norm": 0.22368523478507996,
"learning_rate": 1.2276576578189064e-05,
"loss": 0.0046,
"step": 15670
},
{
"epoch": 8.774482372691661,
"grad_norm": 0.1423027664422989,
"learning_rate": 1.2222366557537911e-05,
"loss": 0.0059,
"step": 15680
},
{
"epoch": 8.780078343592614,
"grad_norm": 0.09472924470901489,
"learning_rate": 1.2168259819494066e-05,
"loss": 0.0078,
"step": 15690
},
{
"epoch": 8.785674314493564,
"grad_norm": 0.1385987550020218,
"learning_rate": 1.2114256511983274e-05,
"loss": 0.0044,
"step": 15700
},
{
"epoch": 8.791270285394516,
"grad_norm": 0.1465826779603958,
"learning_rate": 1.2060356782648503e-05,
"loss": 0.0035,
"step": 15710
},
{
"epoch": 8.796866256295468,
"grad_norm": 0.3275586664676666,
"learning_rate": 1.2006560778849578e-05,
"loss": 0.0057,
"step": 15720
},
{
"epoch": 8.802462227196418,
"grad_norm": 0.09989197552204132,
"learning_rate": 1.1952868647662696e-05,
"loss": 0.006,
"step": 15730
},
{
"epoch": 8.80805819809737,
"grad_norm": 0.12719599902629852,
"learning_rate": 1.1899280535880119e-05,
"loss": 0.0042,
"step": 15740
},
{
"epoch": 8.81365416899832,
"grad_norm": 0.3480566740036011,
"learning_rate": 1.1845796590009683e-05,
"loss": 0.0073,
"step": 15750
},
{
"epoch": 8.819250139899273,
"grad_norm": 0.1562948226928711,
"learning_rate": 1.1792416956274444e-05,
"loss": 0.0066,
"step": 15760
},
{
"epoch": 8.824846110800223,
"grad_norm": 0.23169738054275513,
"learning_rate": 1.1739141780612306e-05,
"loss": 0.0067,
"step": 15770
},
{
"epoch": 8.830442081701175,
"grad_norm": 0.1328081339597702,
"learning_rate": 1.1685971208675539e-05,
"loss": 0.0051,
"step": 15780
},
{
"epoch": 8.836038052602127,
"grad_norm": 0.10535513609647751,
"learning_rate": 1.1632905385830484e-05,
"loss": 0.0061,
"step": 15790
},
{
"epoch": 8.841634023503078,
"grad_norm": 0.08534829318523407,
"learning_rate": 1.157994445715706e-05,
"loss": 0.0052,
"step": 15800
},
{
"epoch": 8.84722999440403,
"grad_norm": 0.21224470436573029,
"learning_rate": 1.1527088567448407e-05,
"loss": 0.0066,
"step": 15810
},
{
"epoch": 8.85282596530498,
"grad_norm": 0.20451109111309052,
"learning_rate": 1.1474337861210543e-05,
"loss": 0.0067,
"step": 15820
},
{
"epoch": 8.858421936205932,
"grad_norm": 0.21763543784618378,
"learning_rate": 1.1421692482661856e-05,
"loss": 0.0089,
"step": 15830
},
{
"epoch": 8.864017907106883,
"grad_norm": 0.14212079346179962,
"learning_rate": 1.1369152575732822e-05,
"loss": 0.0048,
"step": 15840
},
{
"epoch": 8.869613878007835,
"grad_norm": 0.1489504873752594,
"learning_rate": 1.1316718284065537e-05,
"loss": 0.0046,
"step": 15850
},
{
"epoch": 8.875209848908785,
"grad_norm": 0.09450363367795944,
"learning_rate": 1.1264389751013326e-05,
"loss": 0.0053,
"step": 15860
},
{
"epoch": 8.880805819809737,
"grad_norm": 0.2034289836883545,
"learning_rate": 1.1212167119640438e-05,
"loss": 0.0081,
"step": 15870
},
{
"epoch": 8.88640179071069,
"grad_norm": 0.13935258984565735,
"learning_rate": 1.1160050532721528e-05,
"loss": 0.0064,
"step": 15880
},
{
"epoch": 8.89199776161164,
"grad_norm": 0.08578619360923767,
"learning_rate": 1.1108040132741354e-05,
"loss": 0.0111,
"step": 15890
},
{
"epoch": 8.897593732512592,
"grad_norm": 0.0884697362780571,
"learning_rate": 1.1056136061894384e-05,
"loss": 0.0108,
"step": 15900
},
{
"epoch": 8.903189703413542,
"grad_norm": 0.28323593735694885,
"learning_rate": 1.100433846208434e-05,
"loss": 0.0116,
"step": 15910
},
{
"epoch": 8.908785674314494,
"grad_norm": 0.14971330761909485,
"learning_rate": 1.095264747492391e-05,
"loss": 0.0079,
"step": 15920
},
{
"epoch": 8.914381645215444,
"grad_norm": 0.18808728456497192,
"learning_rate": 1.090106324173426e-05,
"loss": 0.0082,
"step": 15930
},
{
"epoch": 8.919977616116396,
"grad_norm": 0.16924549639225006,
"learning_rate": 1.0849585903544706e-05,
"loss": 0.0064,
"step": 15940
},
{
"epoch": 8.925573587017347,
"grad_norm": 0.1466728150844574,
"learning_rate": 1.0798215601092354e-05,
"loss": 0.0106,
"step": 15950
},
{
"epoch": 8.931169557918299,
"grad_norm": 0.18614622950553894,
"learning_rate": 1.0746952474821614e-05,
"loss": 0.0089,
"step": 15960
},
{
"epoch": 8.936765528819251,
"grad_norm": 0.04307910427451134,
"learning_rate": 1.069579666488395e-05,
"loss": 0.0092,
"step": 15970
},
{
"epoch": 8.942361499720201,
"grad_norm": 0.20207299292087555,
"learning_rate": 1.0644748311137376e-05,
"loss": 0.0077,
"step": 15980
},
{
"epoch": 8.947957470621153,
"grad_norm": 0.12527382373809814,
"learning_rate": 1.059380755314613e-05,
"loss": 0.008,
"step": 15990
},
{
"epoch": 8.953553441522104,
"grad_norm": 0.3143978416919708,
"learning_rate": 1.0542974530180327e-05,
"loss": 0.0061,
"step": 16000
},
{
"epoch": 8.959149412423056,
"grad_norm": 0.0894945040345192,
"learning_rate": 1.049224938121548e-05,
"loss": 0.0041,
"step": 16010
},
{
"epoch": 8.964745383324006,
"grad_norm": 0.23625624179840088,
"learning_rate": 1.0441632244932237e-05,
"loss": 0.0067,
"step": 16020
},
{
"epoch": 8.970341354224958,
"grad_norm": 0.14668506383895874,
"learning_rate": 1.0391123259715906e-05,
"loss": 0.0056,
"step": 16030
},
{
"epoch": 8.975937325125908,
"grad_norm": 0.17659400403499603,
"learning_rate": 1.0340722563656107e-05,
"loss": 0.0066,
"step": 16040
},
{
"epoch": 8.98153329602686,
"grad_norm": 0.2076718956232071,
"learning_rate": 1.0290430294546449e-05,
"loss": 0.0074,
"step": 16050
},
{
"epoch": 8.987129266927813,
"grad_norm": 0.1386403888463974,
"learning_rate": 1.0240246589884044e-05,
"loss": 0.0052,
"step": 16060
},
{
"epoch": 8.992725237828763,
"grad_norm": 0.12247960269451141,
"learning_rate": 1.0190171586869258e-05,
"loss": 0.0059,
"step": 16070
},
{
"epoch": 8.998321208729715,
"grad_norm": 0.08335962146520615,
"learning_rate": 1.0140205422405214e-05,
"loss": 0.0045,
"step": 16080
},
{
"epoch": 9.003917179630665,
"grad_norm": 0.13206073641777039,
"learning_rate": 1.009034823309749e-05,
"loss": 0.0049,
"step": 16090
},
{
"epoch": 9.009513150531617,
"grad_norm": 0.1473735272884369,
"learning_rate": 1.0040600155253765e-05,
"loss": 0.0035,
"step": 16100
},
{
"epoch": 9.015109121432568,
"grad_norm": 0.07891960442066193,
"learning_rate": 9.990961324883358e-06,
"loss": 0.0064,
"step": 16110
},
{
"epoch": 9.02070509233352,
"grad_norm": 0.16706879436969757,
"learning_rate": 9.941431877696955e-06,
"loss": 0.0039,
"step": 16120
},
{
"epoch": 9.026301063234472,
"grad_norm": 0.0876656025648117,
"learning_rate": 9.892011949106172e-06,
"loss": 0.008,
"step": 16130
},
{
"epoch": 9.031897034135422,
"grad_norm": 0.10205890983343124,
"learning_rate": 9.842701674223187e-06,
"loss": 0.0071,
"step": 16140
},
{
"epoch": 9.037493005036374,
"grad_norm": 0.16774903237819672,
"learning_rate": 9.793501187860432e-06,
"loss": 0.0037,
"step": 16150
},
{
"epoch": 9.043088975937325,
"grad_norm": 0.2676295340061188,
"learning_rate": 9.744410624530148e-06,
"loss": 0.0062,
"step": 16160
},
{
"epoch": 9.048684946838277,
"grad_norm": 0.2096317857503891,
"learning_rate": 9.695430118444048e-06,
"loss": 0.0036,
"step": 16170
},
{
"epoch": 9.054280917739227,
"grad_norm": 0.09436144679784775,
"learning_rate": 9.646559803512994e-06,
"loss": 0.0045,
"step": 16180
},
{
"epoch": 9.05987688864018,
"grad_norm": 0.17315761744976044,
"learning_rate": 9.597799813346525e-06,
"loss": 0.0064,
"step": 16190
},
{
"epoch": 9.06547285954113,
"grad_norm": 0.07326121628284454,
"learning_rate": 9.549150281252633e-06,
"loss": 0.0035,
"step": 16200
},
{
"epoch": 9.071068830442082,
"grad_norm": 0.14720216393470764,
"learning_rate": 9.500611340237258e-06,
"loss": 0.0055,
"step": 16210
},
{
"epoch": 9.076664801343034,
"grad_norm": 0.0691135823726654,
"learning_rate": 9.452183123004e-06,
"loss": 0.0077,
"step": 16220
},
{
"epoch": 9.082260772243984,
"grad_norm": 0.13588427007198334,
"learning_rate": 9.403865761953779e-06,
"loss": 0.0046,
"step": 16230
},
{
"epoch": 9.087856743144936,
"grad_norm": 0.13852879405021667,
"learning_rate": 9.355659389184396e-06,
"loss": 0.0046,
"step": 16240
},
{
"epoch": 9.093452714045887,
"grad_norm": 0.0626252144575119,
"learning_rate": 9.307564136490254e-06,
"loss": 0.0069,
"step": 16250
},
{
"epoch": 9.099048684946839,
"grad_norm": 0.25919991731643677,
"learning_rate": 9.259580135361929e-06,
"loss": 0.0046,
"step": 16260
},
{
"epoch": 9.104644655847789,
"grad_norm": 0.0894588977098465,
"learning_rate": 9.211707516985829e-06,
"loss": 0.0046,
"step": 16270
},
{
"epoch": 9.110240626748741,
"grad_norm": 0.45610806345939636,
"learning_rate": 9.163946412243896e-06,
"loss": 0.0069,
"step": 16280
},
{
"epoch": 9.115836597649691,
"grad_norm": 0.1714649349451065,
"learning_rate": 9.116296951713133e-06,
"loss": 0.0058,
"step": 16290
},
{
"epoch": 9.121432568550643,
"grad_norm": 0.20788055658340454,
"learning_rate": 9.068759265665384e-06,
"loss": 0.0046,
"step": 16300
},
{
"epoch": 9.127028539451596,
"grad_norm": 0.13281454145908356,
"learning_rate": 9.02133348406684e-06,
"loss": 0.0073,
"step": 16310
},
{
"epoch": 9.132624510352546,
"grad_norm": 0.20327745378017426,
"learning_rate": 8.974019736577777e-06,
"loss": 0.0061,
"step": 16320
},
{
"epoch": 9.138220481253498,
"grad_norm": 0.1418776661157608,
"learning_rate": 8.92681815255219e-06,
"loss": 0.0054,
"step": 16330
},
{
"epoch": 9.143816452154448,
"grad_norm": 0.08617481589317322,
"learning_rate": 8.879728861037384e-06,
"loss": 0.0057,
"step": 16340
},
{
"epoch": 9.1494124230554,
"grad_norm": 0.14362642168998718,
"learning_rate": 8.832751990773714e-06,
"loss": 0.0059,
"step": 16350
},
{
"epoch": 9.15500839395635,
"grad_norm": 0.05195459723472595,
"learning_rate": 8.785887670194138e-06,
"loss": 0.0063,
"step": 16360
},
{
"epoch": 9.160604364857303,
"grad_norm": 0.1765775829553604,
"learning_rate": 8.739136027423894e-06,
"loss": 0.0075,
"step": 16370
},
{
"epoch": 9.166200335758255,
"grad_norm": 0.1646648496389389,
"learning_rate": 8.692497190280224e-06,
"loss": 0.0065,
"step": 16380
},
{
"epoch": 9.171796306659205,
"grad_norm": 0.16203129291534424,
"learning_rate": 8.645971286271904e-06,
"loss": 0.0049,
"step": 16390
},
{
"epoch": 9.177392277560157,
"grad_norm": 0.07584717124700546,
"learning_rate": 8.599558442598998e-06,
"loss": 0.0071,
"step": 16400
},
{
"epoch": 9.182988248461108,
"grad_norm": 0.14030073583126068,
"learning_rate": 8.55325878615244e-06,
"loss": 0.0033,
"step": 16410
},
{
"epoch": 9.18858421936206,
"grad_norm": 0.09595508873462677,
"learning_rate": 8.507072443513702e-06,
"loss": 0.0034,
"step": 16420
},
{
"epoch": 9.19418019026301,
"grad_norm": 0.2346934825181961,
"learning_rate": 8.460999540954517e-06,
"loss": 0.0091,
"step": 16430
},
{
"epoch": 9.199776161163962,
"grad_norm": 0.11720654368400574,
"learning_rate": 8.415040204436426e-06,
"loss": 0.0056,
"step": 16440
},
{
"epoch": 9.205372132064912,
"grad_norm": 0.18266266584396362,
"learning_rate": 8.369194559610482e-06,
"loss": 0.0044,
"step": 16450
},
{
"epoch": 9.210968102965865,
"grad_norm": 0.11530566215515137,
"learning_rate": 8.323462731816961e-06,
"loss": 0.0091,
"step": 16460
},
{
"epoch": 9.216564073866817,
"grad_norm": 0.15264108777046204,
"learning_rate": 8.277844846084898e-06,
"loss": 0.0056,
"step": 16470
},
{
"epoch": 9.222160044767767,
"grad_norm": 0.12221037596464157,
"learning_rate": 8.232341027131885e-06,
"loss": 0.0046,
"step": 16480
},
{
"epoch": 9.227756015668719,
"grad_norm": 0.18118728697299957,
"learning_rate": 8.186951399363613e-06,
"loss": 0.0048,
"step": 16490
},
{
"epoch": 9.23335198656967,
"grad_norm": 0.11156457662582397,
"learning_rate": 8.141676086873572e-06,
"loss": 0.0038,
"step": 16500
},
{
"epoch": 9.238947957470621,
"grad_norm": 0.24215921759605408,
"learning_rate": 8.096515213442762e-06,
"loss": 0.0053,
"step": 16510
},
{
"epoch": 9.244543928371572,
"grad_norm": 0.1042838767170906,
"learning_rate": 8.051468902539272e-06,
"loss": 0.0038,
"step": 16520
},
{
"epoch": 9.250139899272524,
"grad_norm": 0.15312840044498444,
"learning_rate": 8.00653727731801e-06,
"loss": 0.0056,
"step": 16530
},
{
"epoch": 9.255735870173474,
"grad_norm": 0.12216275930404663,
"learning_rate": 7.96172046062032e-06,
"loss": 0.009,
"step": 16540
},
{
"epoch": 9.261331841074426,
"grad_norm": 0.14912450313568115,
"learning_rate": 7.917018574973645e-06,
"loss": 0.0104,
"step": 16550
},
{
"epoch": 9.266927811975378,
"grad_norm": 0.2108585089445114,
"learning_rate": 7.872431742591268e-06,
"loss": 0.0068,
"step": 16560
},
{
"epoch": 9.272523782876329,
"grad_norm": 0.0906781554222107,
"learning_rate": 7.827960085371855e-06,
"loss": 0.0044,
"step": 16570
},
{
"epoch": 9.27811975377728,
"grad_norm": 0.13947215676307678,
"learning_rate": 7.783603724899257e-06,
"loss": 0.0057,
"step": 16580
},
{
"epoch": 9.283715724678231,
"grad_norm": 0.11844757199287415,
"learning_rate": 7.739362782442021e-06,
"loss": 0.0044,
"step": 16590
},
{
"epoch": 9.289311695579183,
"grad_norm": 0.13809189200401306,
"learning_rate": 7.695237378953223e-06,
"loss": 0.0064,
"step": 16600
},
{
"epoch": 9.294907666480134,
"grad_norm": 0.33429670333862305,
"learning_rate": 7.651227635070041e-06,
"loss": 0.0033,
"step": 16610
},
{
"epoch": 9.300503637381086,
"grad_norm": 0.15949353575706482,
"learning_rate": 7.607333671113409e-06,
"loss": 0.0142,
"step": 16620
},
{
"epoch": 9.306099608282038,
"grad_norm": 0.30085355043411255,
"learning_rate": 7.56355560708778e-06,
"loss": 0.0064,
"step": 16630
},
{
"epoch": 9.311695579182988,
"grad_norm": 0.09114662557840347,
"learning_rate": 7.519893562680663e-06,
"loss": 0.0062,
"step": 16640
},
{
"epoch": 9.31729155008394,
"grad_norm": 0.3248306214809418,
"learning_rate": 7.476347657262456e-06,
"loss": 0.0063,
"step": 16650
},
{
"epoch": 9.32288752098489,
"grad_norm": 0.15951383113861084,
"learning_rate": 7.432918009885997e-06,
"loss": 0.0069,
"step": 16660
},
{
"epoch": 9.328483491885843,
"grad_norm": 0.1393985003232956,
"learning_rate": 7.389604739286271e-06,
"loss": 0.0046,
"step": 16670
},
{
"epoch": 9.334079462786793,
"grad_norm": 0.14699183404445648,
"learning_rate": 7.3464079638801365e-06,
"loss": 0.0047,
"step": 16680
},
{
"epoch": 9.339675433687745,
"grad_norm": 0.14034835994243622,
"learning_rate": 7.30332780176588e-06,
"loss": 0.0068,
"step": 16690
},
{
"epoch": 9.345271404588695,
"grad_norm": 0.202976793050766,
"learning_rate": 7.260364370723044e-06,
"loss": 0.007,
"step": 16700
},
{
"epoch": 9.350867375489647,
"grad_norm": 0.1574084311723709,
"learning_rate": 7.217517788212025e-06,
"loss": 0.0037,
"step": 16710
},
{
"epoch": 9.3564633463906,
"grad_norm": 0.23007866740226746,
"learning_rate": 7.174788171373731e-06,
"loss": 0.006,
"step": 16720
},
{
"epoch": 9.36205931729155,
"grad_norm": 0.06488067656755447,
"learning_rate": 7.132175637029293e-06,
"loss": 0.0038,
"step": 16730
},
{
"epoch": 9.367655288192502,
"grad_norm": 0.08520302921533585,
"learning_rate": 7.089680301679752e-06,
"loss": 0.0035,
"step": 16740
},
{
"epoch": 9.373251259093452,
"grad_norm": 0.1132565289735794,
"learning_rate": 7.047302281505736e-06,
"loss": 0.0033,
"step": 16750
},
{
"epoch": 9.378847229994404,
"grad_norm": 0.29900556802749634,
"learning_rate": 7.005041692367154e-06,
"loss": 0.0083,
"step": 16760
},
{
"epoch": 9.384443200895355,
"grad_norm": 0.21089625358581543,
"learning_rate": 6.962898649802823e-06,
"loss": 0.004,
"step": 16770
},
{
"epoch": 9.390039171796307,
"grad_norm": 0.1411179006099701,
"learning_rate": 6.92087326903022e-06,
"loss": 0.0051,
"step": 16780
},
{
"epoch": 9.395635142697259,
"grad_norm": 0.20569784939289093,
"learning_rate": 6.878965664945108e-06,
"loss": 0.0057,
"step": 16790
},
{
"epoch": 9.40123111359821,
"grad_norm": 0.13673344254493713,
"learning_rate": 6.837175952121306e-06,
"loss": 0.0029,
"step": 16800
},
{
"epoch": 9.406827084499161,
"grad_norm": 0.07221835851669312,
"learning_rate": 6.795504244810285e-06,
"loss": 0.0028,
"step": 16810
},
{
"epoch": 9.412423055400112,
"grad_norm": 0.15173490345478058,
"learning_rate": 6.753950656940905e-06,
"loss": 0.0055,
"step": 16820
},
{
"epoch": 9.418019026301064,
"grad_norm": 0.12818996608257294,
"learning_rate": 6.712515302119077e-06,
"loss": 0.0047,
"step": 16830
},
{
"epoch": 9.423614997202014,
"grad_norm": 0.2607164978981018,
"learning_rate": 6.671198293627479e-06,
"loss": 0.0062,
"step": 16840
},
{
"epoch": 9.429210968102966,
"grad_norm": 0.1782405823469162,
"learning_rate": 6.629999744425236e-06,
"loss": 0.0038,
"step": 16850
},
{
"epoch": 9.434806939003916,
"grad_norm": 0.1047229990363121,
"learning_rate": 6.588919767147639e-06,
"loss": 0.0038,
"step": 16860
},
{
"epoch": 9.440402909904869,
"grad_norm": 0.21528460085391998,
"learning_rate": 6.5479584741057255e-06,
"loss": 0.0044,
"step": 16870
},
{
"epoch": 9.44599888080582,
"grad_norm": 0.033052559942007065,
"learning_rate": 6.5071159772861436e-06,
"loss": 0.0043,
"step": 16880
},
{
"epoch": 9.451594851706771,
"grad_norm": 0.08729052543640137,
"learning_rate": 6.466392388350695e-06,
"loss": 0.0067,
"step": 16890
},
{
"epoch": 9.457190822607723,
"grad_norm": 0.1754913330078125,
"learning_rate": 6.425787818636131e-06,
"loss": 0.0038,
"step": 16900
},
{
"epoch": 9.462786793508673,
"grad_norm": 0.13821344077587128,
"learning_rate": 6.385302379153818e-06,
"loss": 0.0046,
"step": 16910
},
{
"epoch": 9.468382764409625,
"grad_norm": 0.1275906264781952,
"learning_rate": 6.344936180589351e-06,
"loss": 0.0036,
"step": 16920
},
{
"epoch": 9.473978735310576,
"grad_norm": 0.14954271912574768,
"learning_rate": 6.304689333302416e-06,
"loss": 0.0034,
"step": 16930
},
{
"epoch": 9.479574706211528,
"grad_norm": 0.12982557713985443,
"learning_rate": 6.264561947326331e-06,
"loss": 0.0043,
"step": 16940
},
{
"epoch": 9.485170677112478,
"grad_norm": 0.06912703812122345,
"learning_rate": 6.22455413236786e-06,
"loss": 0.0055,
"step": 16950
},
{
"epoch": 9.49076664801343,
"grad_norm": 0.19244985282421112,
"learning_rate": 6.184665997806832e-06,
"loss": 0.0043,
"step": 16960
},
{
"epoch": 9.496362618914382,
"grad_norm": 0.08739597350358963,
"learning_rate": 6.144897652695864e-06,
"loss": 0.0151,
"step": 16970
},
{
"epoch": 9.501958589815333,
"grad_norm": 0.11885930597782135,
"learning_rate": 6.1052492057601275e-06,
"loss": 0.0073,
"step": 16980
},
{
"epoch": 9.507554560716285,
"grad_norm": 0.07571222633123398,
"learning_rate": 6.0657207653969315e-06,
"loss": 0.0032,
"step": 16990
},
{
"epoch": 9.513150531617235,
"grad_norm": 0.07605729252099991,
"learning_rate": 6.026312439675552e-06,
"loss": 0.0036,
"step": 17000
},
{
"epoch": 9.518746502518187,
"grad_norm": 0.20224310457706451,
"learning_rate": 5.9870243363368275e-06,
"loss": 0.0055,
"step": 17010
},
{
"epoch": 9.524342473419138,
"grad_norm": 0.09693833440542221,
"learning_rate": 5.947856562792925e-06,
"loss": 0.0048,
"step": 17020
},
{
"epoch": 9.52993844432009,
"grad_norm": 0.13180632889270782,
"learning_rate": 5.908809226127054e-06,
"loss": 0.0052,
"step": 17030
},
{
"epoch": 9.53553441522104,
"grad_norm": 0.18198780715465546,
"learning_rate": 5.869882433093155e-06,
"loss": 0.0053,
"step": 17040
},
{
"epoch": 9.541130386121992,
"grad_norm": 0.08620735257863998,
"learning_rate": 5.831076290115573e-06,
"loss": 0.0047,
"step": 17050
},
{
"epoch": 9.546726357022944,
"grad_norm": 0.18070462346076965,
"learning_rate": 5.79239090328883e-06,
"loss": 0.005,
"step": 17060
},
{
"epoch": 9.552322327923894,
"grad_norm": 0.13954901695251465,
"learning_rate": 5.753826378377286e-06,
"loss": 0.0037,
"step": 17070
},
{
"epoch": 9.557918298824847,
"grad_norm": 0.08338068425655365,
"learning_rate": 5.715382820814885e-06,
"loss": 0.0035,
"step": 17080
},
{
"epoch": 9.563514269725797,
"grad_norm": 0.1206720620393753,
"learning_rate": 5.67706033570487e-06,
"loss": 0.0071,
"step": 17090
},
{
"epoch": 9.569110240626749,
"grad_norm": 0.1978680044412613,
"learning_rate": 5.6388590278194096e-06,
"loss": 0.0048,
"step": 17100
},
{
"epoch": 9.5747062115277,
"grad_norm": 0.2190864086151123,
"learning_rate": 5.600779001599455e-06,
"loss": 0.0043,
"step": 17110
},
{
"epoch": 9.580302182428651,
"grad_norm": 0.0734127014875412,
"learning_rate": 5.562820361154314e-06,
"loss": 0.0049,
"step": 17120
},
{
"epoch": 9.585898153329603,
"grad_norm": 0.14367960393428802,
"learning_rate": 5.524983210261481e-06,
"loss": 0.0035,
"step": 17130
},
{
"epoch": 9.591494124230554,
"grad_norm": 0.26178881525993347,
"learning_rate": 5.48726765236629e-06,
"loss": 0.005,
"step": 17140
},
{
"epoch": 9.597090095131506,
"grad_norm": 0.10900067538022995,
"learning_rate": 5.449673790581611e-06,
"loss": 0.0065,
"step": 17150
},
{
"epoch": 9.602686066032456,
"grad_norm": 0.16984951496124268,
"learning_rate": 5.412201727687644e-06,
"loss": 0.0051,
"step": 17160
},
{
"epoch": 9.608282036933408,
"grad_norm": 0.0894961804151535,
"learning_rate": 5.374851566131561e-06,
"loss": 0.0038,
"step": 17170
},
{
"epoch": 9.613878007834359,
"grad_norm": 0.25771039724349976,
"learning_rate": 5.337623408027293e-06,
"loss": 0.0073,
"step": 17180
},
{
"epoch": 9.61947397873531,
"grad_norm": 0.14566998183727264,
"learning_rate": 5.300517355155215e-06,
"loss": 0.0046,
"step": 17190
},
{
"epoch": 9.625069949636263,
"grad_norm": 0.17133091390132904,
"learning_rate": 5.263533508961827e-06,
"loss": 0.0073,
"step": 17200
},
{
"epoch": 9.630665920537213,
"grad_norm": 0.16593864560127258,
"learning_rate": 5.226671970559577e-06,
"loss": 0.0053,
"step": 17210
},
{
"epoch": 9.636261891438165,
"grad_norm": 0.11243371665477753,
"learning_rate": 5.1899328407264855e-06,
"loss": 0.0043,
"step": 17220
},
{
"epoch": 9.641857862339116,
"grad_norm": 0.15767988562583923,
"learning_rate": 5.153316219905946e-06,
"loss": 0.0072,
"step": 17230
},
{
"epoch": 9.647453833240068,
"grad_norm": 0.2645623981952667,
"learning_rate": 5.116822208206396e-06,
"loss": 0.0052,
"step": 17240
},
{
"epoch": 9.653049804141018,
"grad_norm": 0.08610297739505768,
"learning_rate": 5.080450905401057e-06,
"loss": 0.0056,
"step": 17250
},
{
"epoch": 9.65864577504197,
"grad_norm": 0.08036172389984131,
"learning_rate": 5.044202410927706e-06,
"loss": 0.0036,
"step": 17260
},
{
"epoch": 9.66424174594292,
"grad_norm": 0.18519535660743713,
"learning_rate": 5.008076823888319e-06,
"loss": 0.0057,
"step": 17270
},
{
"epoch": 9.669837716843872,
"grad_norm": 0.19542230665683746,
"learning_rate": 4.972074243048897e-06,
"loss": 0.0036,
"step": 17280
},
{
"epoch": 9.675433687744825,
"grad_norm": 0.21911007165908813,
"learning_rate": 4.936194766839103e-06,
"loss": 0.0039,
"step": 17290
},
{
"epoch": 9.681029658645775,
"grad_norm": 0.14355053007602692,
"learning_rate": 4.900438493352055e-06,
"loss": 0.0052,
"step": 17300
},
{
"epoch": 9.686625629546727,
"grad_norm": 0.34103378653526306,
"learning_rate": 4.864805520344051e-06,
"loss": 0.0063,
"step": 17310
},
{
"epoch": 9.692221600447677,
"grad_norm": 0.18420292437076569,
"learning_rate": 4.829295945234258e-06,
"loss": 0.0046,
"step": 17320
},
{
"epoch": 9.69781757134863,
"grad_norm": 0.11074794083833694,
"learning_rate": 4.7939098651045235e-06,
"loss": 0.0056,
"step": 17330
},
{
"epoch": 9.70341354224958,
"grad_norm": 0.1706562340259552,
"learning_rate": 4.758647376699032e-06,
"loss": 0.0038,
"step": 17340
},
{
"epoch": 9.709009513150532,
"grad_norm": 0.16499456763267517,
"learning_rate": 4.723508576424062e-06,
"loss": 0.0046,
"step": 17350
},
{
"epoch": 9.714605484051482,
"grad_norm": 0.08222458511590958,
"learning_rate": 4.688493560347773e-06,
"loss": 0.0062,
"step": 17360
},
{
"epoch": 9.720201454952434,
"grad_norm": 0.13518883287906647,
"learning_rate": 4.653602424199876e-06,
"loss": 0.0086,
"step": 17370
},
{
"epoch": 9.725797425853386,
"grad_norm": 0.16546756029129028,
"learning_rate": 4.618835263371396e-06,
"loss": 0.0051,
"step": 17380
},
{
"epoch": 9.731393396754337,
"grad_norm": 0.31760314106941223,
"learning_rate": 4.5841921729144424e-06,
"loss": 0.0056,
"step": 17390
},
{
"epoch": 9.736989367655289,
"grad_norm": 0.11362655460834503,
"learning_rate": 4.549673247541875e-06,
"loss": 0.0085,
"step": 17400
},
{
"epoch": 9.742585338556239,
"grad_norm": 0.12480427324771881,
"learning_rate": 4.515278581627141e-06,
"loss": 0.003,
"step": 17410
},
{
"epoch": 9.748181309457191,
"grad_norm": 0.09458563476800919,
"learning_rate": 4.48100826920394e-06,
"loss": 0.0043,
"step": 17420
},
{
"epoch": 9.753777280358142,
"grad_norm": 0.15045048296451569,
"learning_rate": 4.446862403965984e-06,
"loss": 0.0035,
"step": 17430
},
{
"epoch": 9.759373251259094,
"grad_norm": 0.10754050314426422,
"learning_rate": 4.412841079266777e-06,
"loss": 0.0059,
"step": 17440
},
{
"epoch": 9.764969222160044,
"grad_norm": 0.09626353532075882,
"learning_rate": 4.378944388119311e-06,
"loss": 0.0064,
"step": 17450
},
{
"epoch": 9.770565193060996,
"grad_norm": 0.0682365670800209,
"learning_rate": 4.3451724231958644e-06,
"loss": 0.0039,
"step": 17460
},
{
"epoch": 9.776161163961948,
"grad_norm": 0.0859832614660263,
"learning_rate": 4.311525276827682e-06,
"loss": 0.0038,
"step": 17470
},
{
"epoch": 9.781757134862898,
"grad_norm": 0.057302311062812805,
"learning_rate": 4.27800304100478e-06,
"loss": 0.0061,
"step": 17480
},
{
"epoch": 9.78735310576385,
"grad_norm": 0.30939188599586487,
"learning_rate": 4.244605807375679e-06,
"loss": 0.0072,
"step": 17490
},
{
"epoch": 9.7929490766648,
"grad_norm": 0.06655000895261765,
"learning_rate": 4.2113336672471245e-06,
"loss": 0.006,
"step": 17500
},
{
"epoch": 9.798545047565753,
"grad_norm": 0.07795148342847824,
"learning_rate": 4.178186711583904e-06,
"loss": 0.0064,
"step": 17510
},
{
"epoch": 9.804141018466703,
"grad_norm": 0.06218419224023819,
"learning_rate": 4.145165031008508e-06,
"loss": 0.0041,
"step": 17520
},
{
"epoch": 9.809736989367655,
"grad_norm": 0.064509816467762,
"learning_rate": 4.112268715800943e-06,
"loss": 0.0048,
"step": 17530
},
{
"epoch": 9.815332960268606,
"grad_norm": 0.2096703052520752,
"learning_rate": 4.079497855898501e-06,
"loss": 0.0049,
"step": 17540
},
{
"epoch": 9.820928931169558,
"grad_norm": 0.15621553361415863,
"learning_rate": 4.046852540895446e-06,
"loss": 0.0046,
"step": 17550
},
{
"epoch": 9.82652490207051,
"grad_norm": 0.089202381670475,
"learning_rate": 4.01433286004283e-06,
"loss": 0.0078,
"step": 17560
},
{
"epoch": 9.83212087297146,
"grad_norm": 0.11227259039878845,
"learning_rate": 3.981938902248222e-06,
"loss": 0.0046,
"step": 17570
},
{
"epoch": 9.837716843872412,
"grad_norm": 0.038788773119449615,
"learning_rate": 3.949670756075447e-06,
"loss": 0.0093,
"step": 17580
},
{
"epoch": 9.843312814773363,
"grad_norm": 0.1287786364555359,
"learning_rate": 3.917528509744412e-06,
"loss": 0.0041,
"step": 17590
},
{
"epoch": 9.848908785674315,
"grad_norm": 0.04712485149502754,
"learning_rate": 3.885512251130763e-06,
"loss": 0.0046,
"step": 17600
},
{
"epoch": 9.854504756575265,
"grad_norm": 0.24810890853405,
"learning_rate": 3.8536220677657495e-06,
"loss": 0.0112,
"step": 17610
},
{
"epoch": 9.860100727476217,
"grad_norm": 0.16745951771736145,
"learning_rate": 3.821858046835913e-06,
"loss": 0.0038,
"step": 17620
},
{
"epoch": 9.86569669837717,
"grad_norm": 0.10218873620033264,
"learning_rate": 3.790220275182854e-06,
"loss": 0.0037,
"step": 17630
},
{
"epoch": 9.87129266927812,
"grad_norm": 0.19612161815166473,
"learning_rate": 3.75870883930306e-06,
"loss": 0.004,
"step": 17640
},
{
"epoch": 9.876888640179072,
"grad_norm": 0.20635591447353363,
"learning_rate": 3.7273238253475785e-06,
"loss": 0.0081,
"step": 17650
},
{
"epoch": 9.882484611080022,
"grad_norm": 0.154740571975708,
"learning_rate": 3.696065319121833e-06,
"loss": 0.0049,
"step": 17660
},
{
"epoch": 9.888080581980974,
"grad_norm": 0.046477749943733215,
"learning_rate": 3.664933406085402e-06,
"loss": 0.0055,
"step": 17670
},
{
"epoch": 9.893676552881924,
"grad_norm": 0.20742470026016235,
"learning_rate": 3.6339281713517303e-06,
"loss": 0.0027,
"step": 17680
},
{
"epoch": 9.899272523782876,
"grad_norm": 0.07390665262937546,
"learning_rate": 3.60304969968796e-06,
"loss": 0.0035,
"step": 17690
},
{
"epoch": 9.904868494683829,
"grad_norm": 0.12964075803756714,
"learning_rate": 3.5722980755146517e-06,
"loss": 0.0066,
"step": 17700
},
{
"epoch": 9.910464465584779,
"grad_norm": 0.05571340024471283,
"learning_rate": 3.541673382905558e-06,
"loss": 0.008,
"step": 17710
},
{
"epoch": 9.916060436485731,
"grad_norm": 0.12276771664619446,
"learning_rate": 3.511175705587433e-06,
"loss": 0.0069,
"step": 17720
},
{
"epoch": 9.921656407386681,
"grad_norm": 0.09888763725757599,
"learning_rate": 3.4808051269397512e-06,
"loss": 0.0036,
"step": 17730
},
{
"epoch": 9.927252378287633,
"grad_norm": 0.08338962495326996,
"learning_rate": 3.4505617299945336e-06,
"loss": 0.004,
"step": 17740
},
{
"epoch": 9.932848349188584,
"grad_norm": 0.06845631450414658,
"learning_rate": 3.420445597436056e-06,
"loss": 0.0037,
"step": 17750
},
{
"epoch": 9.938444320089536,
"grad_norm": 0.072002112865448,
"learning_rate": 3.390456811600673e-06,
"loss": 0.0049,
"step": 17760
},
{
"epoch": 9.944040290990486,
"grad_norm": 0.13706427812576294,
"learning_rate": 3.360595454476595e-06,
"loss": 0.0067,
"step": 17770
},
{
"epoch": 9.949636261891438,
"grad_norm": 0.14595244824886322,
"learning_rate": 3.3308616077036115e-06,
"loss": 0.0047,
"step": 17780
},
{
"epoch": 9.95523223279239,
"grad_norm": 0.07961612939834595,
"learning_rate": 3.301255352572946e-06,
"loss": 0.0035,
"step": 17790
},
{
"epoch": 9.96082820369334,
"grad_norm": 0.10814230144023895,
"learning_rate": 3.271776770026963e-06,
"loss": 0.0048,
"step": 17800
},
{
"epoch": 9.966424174594293,
"grad_norm": 0.11842755228281021,
"learning_rate": 3.2424259406589664e-06,
"loss": 0.0095,
"step": 17810
},
{
"epoch": 9.972020145495243,
"grad_norm": 0.21332372725009918,
"learning_rate": 3.213202944713023e-06,
"loss": 0.003,
"step": 17820
},
{
"epoch": 9.977616116396195,
"grad_norm": 0.06386691331863403,
"learning_rate": 3.1841078620836683e-06,
"loss": 0.0036,
"step": 17830
},
{
"epoch": 9.983212087297145,
"grad_norm": 0.08316194266080856,
"learning_rate": 3.155140772315773e-06,
"loss": 0.0042,
"step": 17840
},
{
"epoch": 9.988808058198098,
"grad_norm": 0.16622905433177948,
"learning_rate": 3.126301754604233e-06,
"loss": 0.0039,
"step": 17850
},
{
"epoch": 9.994404029099048,
"grad_norm": 0.11861821264028549,
"learning_rate": 3.0975908877938277e-06,
"loss": 0.0048,
"step": 17860
},
{
"epoch": 10.0,
"grad_norm": 0.1722375601530075,
"learning_rate": 3.0690082503789742e-06,
"loss": 0.0026,
"step": 17870
},
{
"epoch": 10.005595970900952,
"grad_norm": 0.06653541326522827,
"learning_rate": 3.040553920503503e-06,
"loss": 0.0048,
"step": 17880
},
{
"epoch": 10.011191941801902,
"grad_norm": 0.16646505892276764,
"learning_rate": 3.0122279759604745e-06,
"loss": 0.004,
"step": 17890
},
{
"epoch": 10.016787912702855,
"grad_norm": 0.07118295133113861,
"learning_rate": 2.9840304941919415e-06,
"loss": 0.0066,
"step": 17900
},
{
"epoch": 10.022383883603805,
"grad_norm": 0.15453752875328064,
"learning_rate": 2.9559615522887273e-06,
"loss": 0.0052,
"step": 17910
},
{
"epoch": 10.027979854504757,
"grad_norm": 0.23914295434951782,
"learning_rate": 2.928021226990263e-06,
"loss": 0.0042,
"step": 17920
},
{
"epoch": 10.033575825405707,
"grad_norm": 0.09927842766046524,
"learning_rate": 2.9002095946843277e-06,
"loss": 0.0053,
"step": 17930
},
{
"epoch": 10.03917179630666,
"grad_norm": 0.039526671171188354,
"learning_rate": 2.8725267314068495e-06,
"loss": 0.0029,
"step": 17940
},
{
"epoch": 10.04476776720761,
"grad_norm": 0.1683174967765808,
"learning_rate": 2.844972712841737e-06,
"loss": 0.0042,
"step": 17950
},
{
"epoch": 10.050363738108562,
"grad_norm": 0.10315953940153122,
"learning_rate": 2.817547614320615e-06,
"loss": 0.0096,
"step": 17960
},
{
"epoch": 10.055959709009514,
"grad_norm": 0.17959141731262207,
"learning_rate": 2.790251510822661e-06,
"loss": 0.0048,
"step": 17970
},
{
"epoch": 10.061555679910464,
"grad_norm": 0.18458683788776398,
"learning_rate": 2.7630844769743757e-06,
"loss": 0.0051,
"step": 17980
},
{
"epoch": 10.067151650811416,
"grad_norm": 0.19159017503261566,
"learning_rate": 2.73604658704939e-06,
"loss": 0.0054,
"step": 17990
},
{
"epoch": 10.072747621712367,
"grad_norm": 0.08318327367305756,
"learning_rate": 2.7091379149682685e-06,
"loss": 0.0053,
"step": 18000
},
{
"epoch": 10.078343592613319,
"grad_norm": 0.07472005486488342,
"learning_rate": 2.682358534298285e-06,
"loss": 0.006,
"step": 18010
},
{
"epoch": 10.083939563514269,
"grad_norm": 0.09040942043066025,
"learning_rate": 2.6557085182532582e-06,
"loss": 0.004,
"step": 18020
},
{
"epoch": 10.089535534415221,
"grad_norm": 0.037220001220703125,
"learning_rate": 2.6291879396933004e-06,
"loss": 0.0038,
"step": 18030
},
{
"epoch": 10.095131505316173,
"grad_norm": 0.11240635067224503,
"learning_rate": 2.602796871124663e-06,
"loss": 0.0031,
"step": 18040
},
{
"epoch": 10.100727476217124,
"grad_norm": 0.12259605526924133,
"learning_rate": 2.57653538469953e-06,
"loss": 0.0049,
"step": 18050
},
{
"epoch": 10.106323447118076,
"grad_norm": 0.16758129000663757,
"learning_rate": 2.5504035522157854e-06,
"loss": 0.0066,
"step": 18060
},
{
"epoch": 10.111919418019026,
"grad_norm": 0.10704974085092545,
"learning_rate": 2.5244014451168863e-06,
"loss": 0.0021,
"step": 18070
},
{
"epoch": 10.117515388919978,
"grad_norm": 0.19684171676635742,
"learning_rate": 2.4985291344915674e-06,
"loss": 0.0035,
"step": 18080
},
{
"epoch": 10.123111359820928,
"grad_norm": 0.25069093704223633,
"learning_rate": 2.4727866910737583e-06,
"loss": 0.0038,
"step": 18090
},
{
"epoch": 10.12870733072188,
"grad_norm": 0.15888355672359467,
"learning_rate": 2.4471741852423237e-06,
"loss": 0.0055,
"step": 18100
},
{
"epoch": 10.13430330162283,
"grad_norm": 0.1355513483285904,
"learning_rate": 2.421691687020855e-06,
"loss": 0.0032,
"step": 18110
},
{
"epoch": 10.139899272523783,
"grad_norm": 0.09521888941526413,
"learning_rate": 2.3963392660775575e-06,
"loss": 0.0072,
"step": 18120
},
{
"epoch": 10.145495243424735,
"grad_norm": 0.18774038553237915,
"learning_rate": 2.371116991724953e-06,
"loss": 0.0028,
"step": 18130
},
{
"epoch": 10.151091214325685,
"grad_norm": 0.06293562054634094,
"learning_rate": 2.3460249329197824e-06,
"loss": 0.0032,
"step": 18140
},
{
"epoch": 10.156687185226637,
"grad_norm": 0.25169095396995544,
"learning_rate": 2.321063158262793e-06,
"loss": 0.0092,
"step": 18150
},
{
"epoch": 10.162283156127588,
"grad_norm": 0.08376752585172653,
"learning_rate": 2.296231735998511e-06,
"loss": 0.0021,
"step": 18160
},
{
"epoch": 10.16787912702854,
"grad_norm": 0.06758670508861542,
"learning_rate": 2.271530734015104e-06,
"loss": 0.0036,
"step": 18170
},
{
"epoch": 10.17347509792949,
"grad_norm": 0.06193256378173828,
"learning_rate": 2.2469602198441573e-06,
"loss": 0.0036,
"step": 18180
},
{
"epoch": 10.179071068830442,
"grad_norm": 0.21087805926799774,
"learning_rate": 2.222520260660521e-06,
"loss": 0.0043,
"step": 18190
},
{
"epoch": 10.184667039731393,
"grad_norm": 0.09581877291202545,
"learning_rate": 2.1982109232821178e-06,
"loss": 0.0048,
"step": 18200
},
{
"epoch": 10.190263010632345,
"grad_norm": 0.23187117278575897,
"learning_rate": 2.174032274169746e-06,
"loss": 0.0068,
"step": 18210
},
{
"epoch": 10.195858981533297,
"grad_norm": 0.1904383897781372,
"learning_rate": 2.149984379426906e-06,
"loss": 0.0036,
"step": 18220
},
{
"epoch": 10.201454952434247,
"grad_norm": 0.04588289558887482,
"learning_rate": 2.1260673047996227e-06,
"loss": 0.0075,
"step": 18230
},
{
"epoch": 10.2070509233352,
"grad_norm": 0.05446457862854004,
"learning_rate": 2.102281115676258e-06,
"loss": 0.0036,
"step": 18240
},
{
"epoch": 10.21264689423615,
"grad_norm": 0.12907229363918304,
"learning_rate": 2.0786258770873647e-06,
"loss": 0.0043,
"step": 18250
},
{
"epoch": 10.218242865137102,
"grad_norm": 0.0724627822637558,
"learning_rate": 2.0551016537054493e-06,
"loss": 0.0024,
"step": 18260
},
{
"epoch": 10.223838836038052,
"grad_norm": 0.11797565221786499,
"learning_rate": 2.0317085098448372e-06,
"loss": 0.0032,
"step": 18270
},
{
"epoch": 10.229434806939004,
"grad_norm": 0.1239556148648262,
"learning_rate": 2.008446509461498e-06,
"loss": 0.0038,
"step": 18280
},
{
"epoch": 10.235030777839956,
"grad_norm": 0.05614084377884865,
"learning_rate": 1.985315716152847e-06,
"loss": 0.0041,
"step": 18290
},
{
"epoch": 10.240626748740906,
"grad_norm": 0.2968387007713318,
"learning_rate": 1.962316193157593e-06,
"loss": 0.0092,
"step": 18300
},
{
"epoch": 10.246222719641858,
"grad_norm": 0.11529407650232315,
"learning_rate": 1.939448003355554e-06,
"loss": 0.0059,
"step": 18310
},
{
"epoch": 10.251818690542809,
"grad_norm": 0.24037353694438934,
"learning_rate": 1.91671120926748e-06,
"loss": 0.0045,
"step": 18320
},
{
"epoch": 10.257414661443761,
"grad_norm": 0.20346900820732117,
"learning_rate": 1.8941058730549132e-06,
"loss": 0.0047,
"step": 18330
},
{
"epoch": 10.263010632344711,
"grad_norm": 0.27883380651474,
"learning_rate": 1.8716320565199618e-06,
"loss": 0.0049,
"step": 18340
},
{
"epoch": 10.268606603245663,
"grad_norm": 0.12232355028390884,
"learning_rate": 1.849289821105199e-06,
"loss": 0.0077,
"step": 18350
},
{
"epoch": 10.274202574146614,
"grad_norm": 0.09397400170564651,
"learning_rate": 1.8270792278934302e-06,
"loss": 0.0039,
"step": 18360
},
{
"epoch": 10.279798545047566,
"grad_norm": 0.13843244314193726,
"learning_rate": 1.8050003376075707e-06,
"loss": 0.0059,
"step": 18370
},
{
"epoch": 10.285394515948518,
"grad_norm": 0.04927824065089226,
"learning_rate": 1.7830532106104747e-06,
"loss": 0.003,
"step": 18380
},
{
"epoch": 10.290990486849468,
"grad_norm": 0.2848436236381531,
"learning_rate": 1.7612379069047335e-06,
"loss": 0.004,
"step": 18390
},
{
"epoch": 10.29658645775042,
"grad_norm": 0.10808296501636505,
"learning_rate": 1.7395544861325718e-06,
"loss": 0.0072,
"step": 18400
},
{
"epoch": 10.30218242865137,
"grad_norm": 0.08363109827041626,
"learning_rate": 1.7180030075756136e-06,
"loss": 0.0029,
"step": 18410
},
{
"epoch": 10.307778399552323,
"grad_norm": 0.07970738410949707,
"learning_rate": 1.696583530154794e-06,
"loss": 0.0058,
"step": 18420
},
{
"epoch": 10.313374370453273,
"grad_norm": 0.06155739724636078,
"learning_rate": 1.6752961124301415e-06,
"loss": 0.0042,
"step": 18430
},
{
"epoch": 10.318970341354225,
"grad_norm": 0.15518154203891754,
"learning_rate": 1.6541408126006463e-06,
"loss": 0.006,
"step": 18440
},
{
"epoch": 10.324566312255175,
"grad_norm": 0.06478218734264374,
"learning_rate": 1.6331176885040878e-06,
"loss": 0.0083,
"step": 18450
},
{
"epoch": 10.330162283156128,
"grad_norm": 0.11871203780174255,
"learning_rate": 1.6122267976168781e-06,
"loss": 0.0046,
"step": 18460
},
{
"epoch": 10.33575825405708,
"grad_norm": 0.13164940476417542,
"learning_rate": 1.5914681970539192e-06,
"loss": 0.0055,
"step": 18470
},
{
"epoch": 10.34135422495803,
"grad_norm": 0.08165992051362991,
"learning_rate": 1.5708419435684462e-06,
"loss": 0.0065,
"step": 18480
},
{
"epoch": 10.346950195858982,
"grad_norm": 0.06479761004447937,
"learning_rate": 1.550348093551829e-06,
"loss": 0.0044,
"step": 18490
},
{
"epoch": 10.352546166759932,
"grad_norm": 0.24080127477645874,
"learning_rate": 1.5299867030334814e-06,
"loss": 0.0085,
"step": 18500
},
{
"epoch": 10.358142137660884,
"grad_norm": 0.1411421000957489,
"learning_rate": 1.5097578276806633e-06,
"loss": 0.0045,
"step": 18510
},
{
"epoch": 10.363738108561835,
"grad_norm": 0.058580052107572556,
"learning_rate": 1.4896615227983468e-06,
"loss": 0.0041,
"step": 18520
},
{
"epoch": 10.369334079462787,
"grad_norm": 0.1638147383928299,
"learning_rate": 1.4696978433290653e-06,
"loss": 0.0054,
"step": 18530
},
{
"epoch": 10.374930050363739,
"grad_norm": 0.05566524341702461,
"learning_rate": 1.4498668438527597e-06,
"loss": 0.004,
"step": 18540
},
{
"epoch": 10.38052602126469,
"grad_norm": 0.07601140439510345,
"learning_rate": 1.4301685785866214e-06,
"loss": 0.0034,
"step": 18550
},
{
"epoch": 10.386121992165641,
"grad_norm": 0.10449633747339249,
"learning_rate": 1.4106031013849496e-06,
"loss": 0.0041,
"step": 18560
},
{
"epoch": 10.391717963066592,
"grad_norm": 0.15937356650829315,
"learning_rate": 1.3911704657390113e-06,
"loss": 0.0039,
"step": 18570
},
{
"epoch": 10.397313933967544,
"grad_norm": 0.059475306421518326,
"learning_rate": 1.3718707247769135e-06,
"loss": 0.006,
"step": 18580
},
{
"epoch": 10.402909904868494,
"grad_norm": 0.24354378879070282,
"learning_rate": 1.3527039312633827e-06,
"loss": 0.0042,
"step": 18590
},
{
"epoch": 10.408505875769446,
"grad_norm": 0.20878778398036957,
"learning_rate": 1.333670137599713e-06,
"loss": 0.0107,
"step": 18600
},
{
"epoch": 10.414101846670397,
"grad_norm": 0.1909496784210205,
"learning_rate": 1.3147693958235618e-06,
"loss": 0.0034,
"step": 18610
},
{
"epoch": 10.419697817571349,
"grad_norm": 0.13632823526859283,
"learning_rate": 1.2960017576088446e-06,
"loss": 0.0066,
"step": 18620
},
{
"epoch": 10.4252937884723,
"grad_norm": 0.10793755203485489,
"learning_rate": 1.2773672742655784e-06,
"loss": 0.0037,
"step": 18630
},
{
"epoch": 10.430889759373251,
"grad_norm": 0.10346037149429321,
"learning_rate": 1.2588659967397e-06,
"loss": 0.0044,
"step": 18640
},
{
"epoch": 10.436485730274203,
"grad_norm": 0.08834080398082733,
"learning_rate": 1.2404979756130142e-06,
"loss": 0.0037,
"step": 18650
},
{
"epoch": 10.442081701175153,
"grad_norm": 0.09045784175395966,
"learning_rate": 1.222263261102985e-06,
"loss": 0.0052,
"step": 18660
},
{
"epoch": 10.447677672076106,
"grad_norm": 0.07731129229068756,
"learning_rate": 1.2041619030626284e-06,
"loss": 0.0071,
"step": 18670
},
{
"epoch": 10.453273642977056,
"grad_norm": 0.08769071102142334,
"learning_rate": 1.1861939509803687e-06,
"loss": 0.0044,
"step": 18680
},
{
"epoch": 10.458869613878008,
"grad_norm": 0.15766629576683044,
"learning_rate": 1.1683594539798893e-06,
"loss": 0.0063,
"step": 18690
},
{
"epoch": 10.46446558477896,
"grad_norm": 0.11048921942710876,
"learning_rate": 1.1506584608200367e-06,
"loss": 0.0033,
"step": 18700
},
{
"epoch": 10.47006155567991,
"grad_norm": 0.25674813985824585,
"learning_rate": 1.1330910198946442e-06,
"loss": 0.0047,
"step": 18710
},
{
"epoch": 10.475657526580862,
"grad_norm": 0.09696432203054428,
"learning_rate": 1.1156571792324211e-06,
"loss": 0.0038,
"step": 18720
},
{
"epoch": 10.481253497481813,
"grad_norm": 0.17716100811958313,
"learning_rate": 1.0983569864968346e-06,
"loss": 0.0085,
"step": 18730
},
{
"epoch": 10.486849468382765,
"grad_norm": 0.18763263523578644,
"learning_rate": 1.0811904889859336e-06,
"loss": 0.009,
"step": 18740
},
{
"epoch": 10.492445439283715,
"grad_norm": 0.047968145459890366,
"learning_rate": 1.064157733632276e-06,
"loss": 0.0051,
"step": 18750
},
{
"epoch": 10.498041410184667,
"grad_norm": 0.1565999537706375,
"learning_rate": 1.0472587670027678e-06,
"loss": 0.0062,
"step": 18760
},
{
"epoch": 10.503637381085618,
"grad_norm": 0.06519567221403122,
"learning_rate": 1.030493635298535e-06,
"loss": 0.0073,
"step": 18770
},
{
"epoch": 10.50923335198657,
"grad_norm": 0.10364692658185959,
"learning_rate": 1.0138623843548078e-06,
"loss": 0.0051,
"step": 18780
},
{
"epoch": 10.514829322887522,
"grad_norm": 0.036633651703596115,
"learning_rate": 9.97365059640787e-07,
"loss": 0.0062,
"step": 18790
},
{
"epoch": 10.520425293788472,
"grad_norm": 0.2015930861234665,
"learning_rate": 9.810017062595322e-07,
"loss": 0.0037,
"step": 18800
},
{
"epoch": 10.526021264689424,
"grad_norm": 0.1180974468588829,
"learning_rate": 9.647723689478305e-07,
"loss": 0.0039,
"step": 18810
},
{
"epoch": 10.531617235590375,
"grad_norm": 0.07416771352291107,
"learning_rate": 9.486770920760668e-07,
"loss": 0.0041,
"step": 18820
},
{
"epoch": 10.537213206491327,
"grad_norm": 0.05668334290385246,
"learning_rate": 9.327159196481138e-07,
"loss": 0.0059,
"step": 18830
},
{
"epoch": 10.542809177392277,
"grad_norm": 0.07584750652313232,
"learning_rate": 9.168888953011989e-07,
"loss": 0.0054,
"step": 18840
},
{
"epoch": 10.548405148293229,
"grad_norm": 0.06703902035951614,
"learning_rate": 9.011960623058202e-07,
"loss": 0.0039,
"step": 18850
},
{
"epoch": 10.55400111919418,
"grad_norm": 0.06538796424865723,
"learning_rate": 8.856374635655695e-07,
"loss": 0.0035,
"step": 18860
},
{
"epoch": 10.559597090095131,
"grad_norm": 0.09234767407178879,
"learning_rate": 8.702131416170656e-07,
"loss": 0.0047,
"step": 18870
},
{
"epoch": 10.565193060996084,
"grad_norm": 0.09068552404642105,
"learning_rate": 8.549231386298151e-07,
"loss": 0.0032,
"step": 18880
},
{
"epoch": 10.570789031897034,
"grad_norm": 0.2574044466018677,
"learning_rate": 8.397674964061075e-07,
"loss": 0.0123,
"step": 18890
},
{
"epoch": 10.576385002797986,
"grad_norm": 0.1742398738861084,
"learning_rate": 8.247462563808817e-07,
"loss": 0.005,
"step": 18900
},
{
"epoch": 10.581980973698936,
"grad_norm": 0.19498533010482788,
"learning_rate": 8.098594596216424e-07,
"loss": 0.0051,
"step": 18910
},
{
"epoch": 10.587576944599888,
"grad_norm": 0.1093849390745163,
"learning_rate": 7.951071468283167e-07,
"loss": 0.0062,
"step": 18920
},
{
"epoch": 10.593172915500839,
"grad_norm": 0.05242215842008591,
"learning_rate": 7.804893583331696e-07,
"loss": 0.0049,
"step": 18930
},
{
"epoch": 10.59876888640179,
"grad_norm": 0.06830724328756332,
"learning_rate": 7.66006134100672e-07,
"loss": 0.0031,
"step": 18940
},
{
"epoch": 10.604364857302741,
"grad_norm": 0.08541436493396759,
"learning_rate": 7.516575137274162e-07,
"loss": 0.0044,
"step": 18950
},
{
"epoch": 10.609960828203693,
"grad_norm": 0.042029768228530884,
"learning_rate": 7.374435364419674e-07,
"loss": 0.0043,
"step": 18960
},
{
"epoch": 10.615556799104645,
"grad_norm": 0.12100391089916229,
"learning_rate": 7.233642411048014e-07,
"loss": 0.0032,
"step": 18970
},
{
"epoch": 10.621152770005596,
"grad_norm": 0.04842936620116234,
"learning_rate": 7.094196662081831e-07,
"loss": 0.0052,
"step": 18980
},
{
"epoch": 10.626748740906548,
"grad_norm": 0.13397961854934692,
"learning_rate": 6.956098498760389e-07,
"loss": 0.0056,
"step": 18990
},
{
"epoch": 10.632344711807498,
"grad_norm": 0.19486455619335175,
"learning_rate": 6.819348298638839e-07,
"loss": 0.0029,
"step": 19000
},
{
"epoch": 10.63794068270845,
"grad_norm": 0.1525876224040985,
"learning_rate": 6.683946435586952e-07,
"loss": 0.0142,
"step": 19010
},
{
"epoch": 10.6435366536094,
"grad_norm": 0.09059377759695053,
"learning_rate": 6.549893279788277e-07,
"loss": 0.0057,
"step": 19020
},
{
"epoch": 10.649132624510353,
"grad_norm": 0.08628048002719879,
"learning_rate": 6.417189197739093e-07,
"loss": 0.0059,
"step": 19030
},
{
"epoch": 10.654728595411305,
"grad_norm": 0.34853503108024597,
"learning_rate": 6.285834552247128e-07,
"loss": 0.0041,
"step": 19040
},
{
"epoch": 10.660324566312255,
"grad_norm": 0.1580825001001358,
"learning_rate": 6.15582970243117e-07,
"loss": 0.0059,
"step": 19050
},
{
"epoch": 10.665920537213207,
"grad_norm": 0.2064519226551056,
"learning_rate": 6.027175003719354e-07,
"loss": 0.0065,
"step": 19060
},
{
"epoch": 10.671516508114157,
"grad_norm": 0.1656566709280014,
"learning_rate": 5.899870807848762e-07,
"loss": 0.0045,
"step": 19070
},
{
"epoch": 10.67711247901511,
"grad_norm": 0.06346923857927322,
"learning_rate": 5.773917462864264e-07,
"loss": 0.0108,
"step": 19080
},
{
"epoch": 10.68270844991606,
"grad_norm": 0.0746588408946991,
"learning_rate": 5.64931531311741e-07,
"loss": 0.0038,
"step": 19090
},
{
"epoch": 10.688304420817012,
"grad_norm": 0.10566951334476471,
"learning_rate": 5.526064699265753e-07,
"loss": 0.0084,
"step": 19100
},
{
"epoch": 10.693900391717962,
"grad_norm": 0.061587151139974594,
"learning_rate": 5.404165958271811e-07,
"loss": 0.0042,
"step": 19110
},
{
"epoch": 10.699496362618914,
"grad_norm": 0.27593472599983215,
"learning_rate": 5.283619423401998e-07,
"loss": 0.005,
"step": 19120
},
{
"epoch": 10.705092333519866,
"grad_norm": 0.37827596068382263,
"learning_rate": 5.164425424226016e-07,
"loss": 0.0068,
"step": 19130
},
{
"epoch": 10.710688304420817,
"grad_norm": 0.2789309322834015,
"learning_rate": 5.046584286615697e-07,
"loss": 0.0054,
"step": 19140
},
{
"epoch": 10.716284275321769,
"grad_norm": 0.08417310565710068,
"learning_rate": 4.930096332744105e-07,
"loss": 0.0043,
"step": 19150
},
{
"epoch": 10.72188024622272,
"grad_norm": 0.13277283310890198,
"learning_rate": 4.814961881085045e-07,
"loss": 0.007,
"step": 19160
},
{
"epoch": 10.727476217123671,
"grad_norm": 0.029057292267680168,
"learning_rate": 4.701181246411501e-07,
"loss": 0.0077,
"step": 19170
},
{
"epoch": 10.733072188024622,
"grad_norm": 0.07132174074649811,
"learning_rate": 4.5887547397955864e-07,
"loss": 0.0044,
"step": 19180
},
{
"epoch": 10.738668158925574,
"grad_norm": 0.05213991925120354,
"learning_rate": 4.4776826686069305e-07,
"loss": 0.0022,
"step": 19190
},
{
"epoch": 10.744264129826526,
"grad_norm": 0.092039555311203,
"learning_rate": 4.367965336512403e-07,
"loss": 0.0032,
"step": 19200
},
{
"epoch": 10.749860100727476,
"grad_norm": 0.17352578043937683,
"learning_rate": 4.259603043475002e-07,
"loss": 0.0064,
"step": 19210
},
{
"epoch": 10.755456071628428,
"grad_norm": 0.15915948152542114,
"learning_rate": 4.1525960857530243e-07,
"loss": 0.0075,
"step": 19220
},
{
"epoch": 10.761052042529379,
"grad_norm": 0.21297423541545868,
"learning_rate": 4.0469447558995065e-07,
"loss": 0.0057,
"step": 19230
},
{
"epoch": 10.76664801343033,
"grad_norm": 0.17462663352489471,
"learning_rate": 3.9426493427611177e-07,
"loss": 0.0056,
"step": 19240
},
{
"epoch": 10.772243984331281,
"grad_norm": 0.10657753050327301,
"learning_rate": 3.839710131477492e-07,
"loss": 0.0089,
"step": 19250
},
{
"epoch": 10.777839955232233,
"grad_norm": 0.07254552841186523,
"learning_rate": 3.738127403480507e-07,
"loss": 0.003,
"step": 19260
},
{
"epoch": 10.783435926133183,
"grad_norm": 0.27843359112739563,
"learning_rate": 3.637901436493507e-07,
"loss": 0.0067,
"step": 19270
},
{
"epoch": 10.789031897034135,
"grad_norm": 0.17431190609931946,
"learning_rate": 3.5390325045304706e-07,
"loss": 0.0042,
"step": 19280
},
{
"epoch": 10.794627867935088,
"grad_norm": 0.11761761456727982,
"learning_rate": 3.441520877895288e-07,
"loss": 0.0036,
"step": 19290
},
{
"epoch": 10.800223838836038,
"grad_norm": 0.1055087074637413,
"learning_rate": 3.3453668231809286e-07,
"loss": 0.0049,
"step": 19300
},
{
"epoch": 10.80581980973699,
"grad_norm": 0.05716053023934364,
"learning_rate": 3.250570603268943e-07,
"loss": 0.0057,
"step": 19310
},
{
"epoch": 10.81141578063794,
"grad_norm": 0.06227661669254303,
"learning_rate": 3.157132477328628e-07,
"loss": 0.0047,
"step": 19320
},
{
"epoch": 10.817011751538892,
"grad_norm": 0.07587496936321259,
"learning_rate": 3.0650527008162513e-07,
"loss": 0.0058,
"step": 19330
},
{
"epoch": 10.822607722439843,
"grad_norm": 0.12384708225727081,
"learning_rate": 2.9743315254743833e-07,
"loss": 0.0044,
"step": 19340
},
{
"epoch": 10.828203693340795,
"grad_norm": 0.130027636885643,
"learning_rate": 2.8849691993311777e-07,
"loss": 0.0048,
"step": 19350
},
{
"epoch": 10.833799664241745,
"grad_norm": 0.03498604893684387,
"learning_rate": 2.796965966699927e-07,
"loss": 0.0076,
"step": 19360
},
{
"epoch": 10.839395635142697,
"grad_norm": 0.06795532256364822,
"learning_rate": 2.7103220681780615e-07,
"loss": 0.0046,
"step": 19370
},
{
"epoch": 10.84499160604365,
"grad_norm": 0.15649089217185974,
"learning_rate": 2.625037740646763e-07,
"loss": 0.0041,
"step": 19380
},
{
"epoch": 10.8505875769446,
"grad_norm": 0.19872230291366577,
"learning_rate": 2.5411132172700194e-07,
"loss": 0.0045,
"step": 19390
},
{
"epoch": 10.856183547845552,
"grad_norm": 0.1986837238073349,
"learning_rate": 2.458548727494292e-07,
"loss": 0.0034,
"step": 19400
},
{
"epoch": 10.861779518746502,
"grad_norm": 0.34645870327949524,
"learning_rate": 2.3773444970477955e-07,
"loss": 0.0059,
"step": 19410
},
{
"epoch": 10.867375489647454,
"grad_norm": 0.043271441012620926,
"learning_rate": 2.2975007479397738e-07,
"loss": 0.0042,
"step": 19420
},
{
"epoch": 10.872971460548404,
"grad_norm": 0.10621374845504761,
"learning_rate": 2.219017698460002e-07,
"loss": 0.0107,
"step": 19430
},
{
"epoch": 10.878567431449357,
"grad_norm": 0.038412097841501236,
"learning_rate": 2.1418955631781202e-07,
"loss": 0.0025,
"step": 19440
},
{
"epoch": 10.884163402350307,
"grad_norm": 0.14375977218151093,
"learning_rate": 2.0661345529430775e-07,
"loss": 0.0063,
"step": 19450
},
{
"epoch": 10.889759373251259,
"grad_norm": 0.28644490242004395,
"learning_rate": 1.9917348748826335e-07,
"loss": 0.0037,
"step": 19460
},
{
"epoch": 10.895355344152211,
"grad_norm": 0.19371145963668823,
"learning_rate": 1.918696732402636e-07,
"loss": 0.0071,
"step": 19470
},
{
"epoch": 10.900951315053161,
"grad_norm": 0.11907006055116653,
"learning_rate": 1.847020325186577e-07,
"loss": 0.0049,
"step": 19480
},
{
"epoch": 10.906547285954113,
"grad_norm": 0.10020023584365845,
"learning_rate": 1.776705849195037e-07,
"loss": 0.0036,
"step": 19490
},
{
"epoch": 10.912143256855064,
"grad_norm": 0.12778791785240173,
"learning_rate": 1.7077534966650766e-07,
"loss": 0.0057,
"step": 19500
},
{
"epoch": 10.917739227756016,
"grad_norm": 0.06359223276376724,
"learning_rate": 1.6401634561098444e-07,
"loss": 0.0036,
"step": 19510
},
{
"epoch": 10.923335198656966,
"grad_norm": 0.07983513921499252,
"learning_rate": 1.5739359123178587e-07,
"loss": 0.0037,
"step": 19520
},
{
"epoch": 10.928931169557918,
"grad_norm": 0.12060696631669998,
"learning_rate": 1.5090710463527836e-07,
"loss": 0.0031,
"step": 19530
},
{
"epoch": 10.93452714045887,
"grad_norm": 0.10252276062965393,
"learning_rate": 1.4455690355525964e-07,
"loss": 0.0052,
"step": 19540
},
{
"epoch": 10.94012311135982,
"grad_norm": 0.10586907714605331,
"learning_rate": 1.383430053529422e-07,
"loss": 0.0025,
"step": 19550
},
{
"epoch": 10.945719082260773,
"grad_norm": 0.05571618303656578,
"learning_rate": 1.3226542701689215e-07,
"loss": 0.0045,
"step": 19560
},
{
"epoch": 10.951315053161723,
"grad_norm": 0.07698628306388855,
"learning_rate": 1.2632418516296262e-07,
"loss": 0.0039,
"step": 19570
},
{
"epoch": 10.956911024062675,
"grad_norm": 0.3049318790435791,
"learning_rate": 1.2051929603428825e-07,
"loss": 0.0036,
"step": 19580
},
{
"epoch": 10.962506994963626,
"grad_norm": 0.04247491434216499,
"learning_rate": 1.1485077550122402e-07,
"loss": 0.0086,
"step": 19590
},
{
"epoch": 10.968102965864578,
"grad_norm": 0.13998843729496002,
"learning_rate": 1.0931863906127327e-07,
"loss": 0.0032,
"step": 19600
},
{
"epoch": 10.973698936765528,
"grad_norm": 0.18532228469848633,
"learning_rate": 1.0392290183909304e-07,
"loss": 0.0053,
"step": 19610
},
{
"epoch": 10.97929490766648,
"grad_norm": 0.24849370121955872,
"learning_rate": 9.866357858642205e-08,
"loss": 0.0031,
"step": 19620
},
{
"epoch": 10.984890878567432,
"grad_norm": 0.04739070311188698,
"learning_rate": 9.354068368204739e-08,
"loss": 0.0055,
"step": 19630
},
{
"epoch": 10.990486849468383,
"grad_norm": 0.13325341045856476,
"learning_rate": 8.855423113177664e-08,
"loss": 0.0027,
"step": 19640
},
{
"epoch": 10.996082820369335,
"grad_norm": 0.15442515909671783,
"learning_rate": 8.37042345683714e-08,
"loss": 0.009,
"step": 19650
},
{
"epoch": 11.001678791270285,
"grad_norm": 0.20657239854335785,
"learning_rate": 7.899070725153613e-08,
"loss": 0.0063,
"step": 19660
},
{
"epoch": 11.007274762171237,
"grad_norm": 0.16029535233974457,
"learning_rate": 7.44136620678848e-08,
"loss": 0.0044,
"step": 19670
},
{
"epoch": 11.012870733072187,
"grad_norm": 0.16476546227931976,
"learning_rate": 6.997311153086883e-08,
"loss": 0.0066,
"step": 19680
},
{
"epoch": 11.01846670397314,
"grad_norm": 0.12683425843715668,
"learning_rate": 6.566906778079917e-08,
"loss": 0.0052,
"step": 19690
},
{
"epoch": 11.024062674874092,
"grad_norm": 0.23135153949260712,
"learning_rate": 6.150154258476315e-08,
"loss": 0.0043,
"step": 19700
},
{
"epoch": 11.029658645775042,
"grad_norm": 0.1939716786146164,
"learning_rate": 5.747054733660773e-08,
"loss": 0.0077,
"step": 19710
},
{
"epoch": 11.035254616675994,
"grad_norm": 0.11450741440057755,
"learning_rate": 5.3576093056922906e-08,
"loss": 0.0079,
"step": 19720
},
{
"epoch": 11.040850587576944,
"grad_norm": 0.06929726153612137,
"learning_rate": 4.981819039300284e-08,
"loss": 0.0039,
"step": 19730
},
{
"epoch": 11.046446558477896,
"grad_norm": 0.11268885433673859,
"learning_rate": 4.619684961881254e-08,
"loss": 0.0047,
"step": 19740
},
{
"epoch": 11.052042529378847,
"grad_norm": 0.07555661350488663,
"learning_rate": 4.2712080634949024e-08,
"loss": 0.0038,
"step": 19750
},
{
"epoch": 11.057638500279799,
"grad_norm": 0.07180225849151611,
"learning_rate": 3.936389296864129e-08,
"loss": 0.0066,
"step": 19760
},
{
"epoch": 11.063234471180749,
"grad_norm": 0.2635197937488556,
"learning_rate": 3.615229577371149e-08,
"loss": 0.0047,
"step": 19770
},
{
"epoch": 11.068830442081701,
"grad_norm": 0.03527739644050598,
"learning_rate": 3.3077297830541584e-08,
"loss": 0.0047,
"step": 19780
},
{
"epoch": 11.074426412982653,
"grad_norm": 0.061606280505657196,
"learning_rate": 3.01389075460512e-08,
"loss": 0.0069,
"step": 19790
},
{
"epoch": 11.080022383883604,
"grad_norm": 0.14764872193336487,
"learning_rate": 2.7337132953697554e-08,
"loss": 0.0063,
"step": 19800
},
{
"epoch": 11.085618354784556,
"grad_norm": 0.13825170695781708,
"learning_rate": 2.467198171342e-08,
"loss": 0.0047,
"step": 19810
},
{
"epoch": 11.091214325685506,
"grad_norm": 0.40132373571395874,
"learning_rate": 2.214346111164556e-08,
"loss": 0.0058,
"step": 19820
},
{
"epoch": 11.096810296586458,
"grad_norm": 0.06293044239282608,
"learning_rate": 1.9751578061244504e-08,
"loss": 0.0093,
"step": 19830
},
{
"epoch": 11.102406267487408,
"grad_norm": 0.08641501516103745,
"learning_rate": 1.749633910153592e-08,
"loss": 0.0061,
"step": 19840
},
{
"epoch": 11.10800223838836,
"grad_norm": 0.06543342024087906,
"learning_rate": 1.5377750398265502e-08,
"loss": 0.0034,
"step": 19850
},
{
"epoch": 11.11359820928931,
"grad_norm": 0.0463268905878067,
"learning_rate": 1.3395817743561134e-08,
"loss": 0.0031,
"step": 19860
},
{
"epoch": 11.119194180190263,
"grad_norm": 0.18889687955379486,
"learning_rate": 1.1550546555960662e-08,
"loss": 0.0049,
"step": 19870
},
{
"epoch": 11.124790151091215,
"grad_norm": 0.33526870608329773,
"learning_rate": 9.841941880361916e-09,
"loss": 0.0068,
"step": 19880
},
{
"epoch": 11.130386121992165,
"grad_norm": 0.17259934544563293,
"learning_rate": 8.270008388022721e-09,
"loss": 0.0047,
"step": 19890
},
{
"epoch": 11.135982092893117,
"grad_norm": 0.24882031977176666,
"learning_rate": 6.834750376549792e-09,
"loss": 0.0061,
"step": 19900
},
{
"epoch": 11.141578063794068,
"grad_norm": 0.05286456272006035,
"learning_rate": 5.536171769887632e-09,
"loss": 0.0059,
"step": 19910
},
{
"epoch": 11.14717403469502,
"grad_norm": 0.08882560580968857,
"learning_rate": 4.3742761183018784e-09,
"loss": 0.0063,
"step": 19920
},
{
"epoch": 11.15277000559597,
"grad_norm": 0.09571769833564758,
"learning_rate": 3.349066598362649e-09,
"loss": 0.0034,
"step": 19930
},
{
"epoch": 11.158365976496922,
"grad_norm": 0.07795775681734085,
"learning_rate": 2.4605460129556445e-09,
"loss": 0.0029,
"step": 19940
},
{
"epoch": 11.163961947397874,
"grad_norm": 0.07696644216775894,
"learning_rate": 1.7087167912710478e-09,
"loss": 0.0083,
"step": 19950
},
{
"epoch": 11.169557918298825,
"grad_norm": 0.26498469710350037,
"learning_rate": 1.0935809887702154e-09,
"loss": 0.0033,
"step": 19960
},
{
"epoch": 11.175153889199777,
"grad_norm": 0.165630042552948,
"learning_rate": 6.151402872134337e-10,
"loss": 0.007,
"step": 19970
},
{
"epoch": 11.180749860100727,
"grad_norm": 0.07009857147932053,
"learning_rate": 2.7339599464326627e-10,
"loss": 0.0037,
"step": 19980
},
{
"epoch": 11.18634583100168,
"grad_norm": 0.1754114180803299,
"learning_rate": 6.834904537900144e-11,
"loss": 0.0069,
"step": 19990
},
{
"epoch": 11.19194180190263,
"grad_norm": 0.18103741109371185,
"learning_rate": 0.0,
"loss": 0.0044,
"step": 20000
}
],
"logging_steps": 10,
"max_steps": 20000,
"num_input_tokens_seen": 0,
"num_train_epochs": 12,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.091345386565736e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}