{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9946007388462631,
"eval_steps": 500,
"global_step": 3500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002841716396703609,
"grad_norm": 0.20229442417621613,
"learning_rate": 0.00019948849104859336,
"loss": 1.523,
"step": 10
},
{
"epoch": 0.005683432793407218,
"grad_norm": 0.24781420826911926,
"learning_rate": 0.00019892014776925262,
"loss": 1.3539,
"step": 20
},
{
"epoch": 0.008525149190110827,
"grad_norm": 0.21505358815193176,
"learning_rate": 0.0001983518044899119,
"loss": 1.3182,
"step": 30
},
{
"epoch": 0.011366865586814436,
"grad_norm": 0.2120037078857422,
"learning_rate": 0.00019778346121057119,
"loss": 1.3647,
"step": 40
},
{
"epoch": 0.014208581983518044,
"grad_norm": 0.19157598912715912,
"learning_rate": 0.00019721511793123047,
"loss": 1.4083,
"step": 50
},
{
"epoch": 0.017050298380221655,
"grad_norm": 0.2424042969942093,
"learning_rate": 0.00019664677465188975,
"loss": 1.2882,
"step": 60
},
{
"epoch": 0.019892014776925263,
"grad_norm": 0.25576573610305786,
"learning_rate": 0.000196078431372549,
"loss": 1.2469,
"step": 70
},
{
"epoch": 0.022733731173628872,
"grad_norm": 0.3163057267665863,
"learning_rate": 0.0001955100880932083,
"loss": 1.3234,
"step": 80
},
{
"epoch": 0.02557544757033248,
"grad_norm": 0.20805688202381134,
"learning_rate": 0.00019494174481386758,
"loss": 1.2203,
"step": 90
},
{
"epoch": 0.02841716396703609,
"grad_norm": 0.26067784428596497,
"learning_rate": 0.00019437340153452686,
"loss": 1.235,
"step": 100
},
{
"epoch": 0.0312588803637397,
"grad_norm": 0.25863921642303467,
"learning_rate": 0.00019380505825518612,
"loss": 1.2209,
"step": 110
},
{
"epoch": 0.03410059676044331,
"grad_norm": 0.2780992090702057,
"learning_rate": 0.0001932367149758454,
"loss": 1.2806,
"step": 120
},
{
"epoch": 0.03694231315714692,
"grad_norm": 0.2662423253059387,
"learning_rate": 0.0001926683716965047,
"loss": 1.2075,
"step": 130
},
{
"epoch": 0.039784029553850526,
"grad_norm": 0.35407036542892456,
"learning_rate": 0.00019210002841716397,
"loss": 1.2721,
"step": 140
},
{
"epoch": 0.042625745950554135,
"grad_norm": 0.2910842001438141,
"learning_rate": 0.00019153168513782326,
"loss": 1.2716,
"step": 150
},
{
"epoch": 0.045467462347257744,
"grad_norm": 0.2619645893573761,
"learning_rate": 0.00019096334185848252,
"loss": 1.186,
"step": 160
},
{
"epoch": 0.04830917874396135,
"grad_norm": 0.2996160686016083,
"learning_rate": 0.0001903949985791418,
"loss": 1.1886,
"step": 170
},
{
"epoch": 0.05115089514066496,
"grad_norm": 0.227691650390625,
"learning_rate": 0.00018982665529980108,
"loss": 1.2613,
"step": 180
},
{
"epoch": 0.05399261153736857,
"grad_norm": 0.2601442039012909,
"learning_rate": 0.00018925831202046037,
"loss": 1.1767,
"step": 190
},
{
"epoch": 0.05683432793407218,
"grad_norm": 0.28762301802635193,
"learning_rate": 0.00018868996874111963,
"loss": 1.2628,
"step": 200
},
{
"epoch": 0.059676044330775786,
"grad_norm": 0.23256859183311462,
"learning_rate": 0.0001881216254617789,
"loss": 1.1098,
"step": 210
},
{
"epoch": 0.0625177607274794,
"grad_norm": 0.2880021333694458,
"learning_rate": 0.0001875532821824382,
"loss": 1.1575,
"step": 220
},
{
"epoch": 0.06535947712418301,
"grad_norm": 0.25147515535354614,
"learning_rate": 0.00018698493890309748,
"loss": 1.1868,
"step": 230
},
{
"epoch": 0.06820119352088662,
"grad_norm": 0.29358601570129395,
"learning_rate": 0.00018641659562375676,
"loss": 1.241,
"step": 240
},
{
"epoch": 0.07104290991759023,
"grad_norm": 0.24878720939159393,
"learning_rate": 0.00018584825234441602,
"loss": 1.1583,
"step": 250
},
{
"epoch": 0.07388462631429384,
"grad_norm": 0.23219600319862366,
"learning_rate": 0.0001852799090650753,
"loss": 1.155,
"step": 260
},
{
"epoch": 0.07672634271099744,
"grad_norm": 0.2404685616493225,
"learning_rate": 0.0001847115657857346,
"loss": 1.1896,
"step": 270
},
{
"epoch": 0.07956805910770105,
"grad_norm": 0.21366341412067413,
"learning_rate": 0.00018414322250639387,
"loss": 1.1522,
"step": 280
},
{
"epoch": 0.08240977550440466,
"grad_norm": 0.30190715193748474,
"learning_rate": 0.00018357487922705313,
"loss": 1.2053,
"step": 290
},
{
"epoch": 0.08525149190110827,
"grad_norm": 0.253252238035202,
"learning_rate": 0.00018300653594771241,
"loss": 1.1812,
"step": 300
},
{
"epoch": 0.08809320829781188,
"grad_norm": 0.2292664349079132,
"learning_rate": 0.0001824381926683717,
"loss": 1.112,
"step": 310
},
{
"epoch": 0.09093492469451549,
"grad_norm": 0.28798526525497437,
"learning_rate": 0.00018186984938903098,
"loss": 1.2313,
"step": 320
},
{
"epoch": 0.0937766410912191,
"grad_norm": 0.28377199172973633,
"learning_rate": 0.00018130150610969027,
"loss": 1.108,
"step": 330
},
{
"epoch": 0.0966183574879227,
"grad_norm": 0.25983279943466187,
"learning_rate": 0.00018073316283034952,
"loss": 1.1511,
"step": 340
},
{
"epoch": 0.09946007388462631,
"grad_norm": 0.25927045941352844,
"learning_rate": 0.0001801648195510088,
"loss": 1.2269,
"step": 350
},
{
"epoch": 0.10230179028132992,
"grad_norm": 0.2704865634441376,
"learning_rate": 0.0001795964762716681,
"loss": 1.1827,
"step": 360
},
{
"epoch": 0.10514350667803353,
"grad_norm": 0.30205655097961426,
"learning_rate": 0.00017902813299232738,
"loss": 1.2028,
"step": 370
},
{
"epoch": 0.10798522307473714,
"grad_norm": 0.3334643244743347,
"learning_rate": 0.00017845978971298663,
"loss": 1.1631,
"step": 380
},
{
"epoch": 0.11082693947144075,
"grad_norm": 0.25340893864631653,
"learning_rate": 0.00017789144643364592,
"loss": 1.1823,
"step": 390
},
{
"epoch": 0.11366865586814436,
"grad_norm": 0.2417430877685547,
"learning_rate": 0.0001773231031543052,
"loss": 1.1956,
"step": 400
},
{
"epoch": 0.11651037226484796,
"grad_norm": 0.238485187292099,
"learning_rate": 0.0001767547598749645,
"loss": 1.2436,
"step": 410
},
{
"epoch": 0.11935208866155157,
"grad_norm": 0.2162630409002304,
"learning_rate": 0.00017618641659562377,
"loss": 1.1831,
"step": 420
},
{
"epoch": 0.12219380505825518,
"grad_norm": 0.25849658250808716,
"learning_rate": 0.00017561807331628303,
"loss": 1.2077,
"step": 430
},
{
"epoch": 0.1250355214549588,
"grad_norm": 0.3160068392753601,
"learning_rate": 0.0001750497300369423,
"loss": 1.2443,
"step": 440
},
{
"epoch": 0.1278772378516624,
"grad_norm": 0.25949159264564514,
"learning_rate": 0.0001744813867576016,
"loss": 1.2738,
"step": 450
},
{
"epoch": 0.13071895424836602,
"grad_norm": 0.2856585383415222,
"learning_rate": 0.00017391304347826088,
"loss": 1.1611,
"step": 460
},
{
"epoch": 0.13356067064506963,
"grad_norm": 0.27770936489105225,
"learning_rate": 0.00017334470019892014,
"loss": 1.1796,
"step": 470
},
{
"epoch": 0.13640238704177324,
"grad_norm": 0.2558460831642151,
"learning_rate": 0.00017277635691957942,
"loss": 1.1277,
"step": 480
},
{
"epoch": 0.13924410343847685,
"grad_norm": 0.23918330669403076,
"learning_rate": 0.0001722080136402387,
"loss": 1.1651,
"step": 490
},
{
"epoch": 0.14208581983518045,
"grad_norm": 0.24087974429130554,
"learning_rate": 0.000171639670360898,
"loss": 1.2795,
"step": 500
},
{
"epoch": 0.14492753623188406,
"grad_norm": 0.22798433899879456,
"learning_rate": 0.00017107132708155728,
"loss": 1.1636,
"step": 510
},
{
"epoch": 0.14776925262858767,
"grad_norm": 0.25570231676101685,
"learning_rate": 0.00017050298380221653,
"loss": 1.0991,
"step": 520
},
{
"epoch": 0.15061096902529128,
"grad_norm": 0.3161047101020813,
"learning_rate": 0.00016993464052287582,
"loss": 1.1638,
"step": 530
},
{
"epoch": 0.1534526854219949,
"grad_norm": 0.34027767181396484,
"learning_rate": 0.0001693662972435351,
"loss": 1.1913,
"step": 540
},
{
"epoch": 0.1562944018186985,
"grad_norm": 0.5431545972824097,
"learning_rate": 0.00016879795396419439,
"loss": 1.2059,
"step": 550
},
{
"epoch": 0.1591361182154021,
"grad_norm": 0.3261612057685852,
"learning_rate": 0.00016822961068485364,
"loss": 1.1779,
"step": 560
},
{
"epoch": 0.16197783461210571,
"grad_norm": 0.25675147771835327,
"learning_rate": 0.00016766126740551293,
"loss": 1.2524,
"step": 570
},
{
"epoch": 0.16481955100880932,
"grad_norm": 0.2560219168663025,
"learning_rate": 0.0001670929241261722,
"loss": 1.0579,
"step": 580
},
{
"epoch": 0.16766126740551293,
"grad_norm": 0.29484283924102783,
"learning_rate": 0.0001665245808468315,
"loss": 1.0802,
"step": 590
},
{
"epoch": 0.17050298380221654,
"grad_norm": 0.22986841201782227,
"learning_rate": 0.00016595623756749078,
"loss": 1.1268,
"step": 600
},
{
"epoch": 0.17334470019892015,
"grad_norm": 0.23908346891403198,
"learning_rate": 0.00016538789428815004,
"loss": 1.1326,
"step": 610
},
{
"epoch": 0.17618641659562376,
"grad_norm": 0.30050793290138245,
"learning_rate": 0.00016481955100880932,
"loss": 1.0501,
"step": 620
},
{
"epoch": 0.17902813299232737,
"grad_norm": 0.24418674409389496,
"learning_rate": 0.0001642512077294686,
"loss": 1.1718,
"step": 630
},
{
"epoch": 0.18186984938903097,
"grad_norm": 0.4306769073009491,
"learning_rate": 0.0001636828644501279,
"loss": 1.1343,
"step": 640
},
{
"epoch": 0.18471156578573458,
"grad_norm": 0.2366916537284851,
"learning_rate": 0.00016311452117078715,
"loss": 1.1766,
"step": 650
},
{
"epoch": 0.1875532821824382,
"grad_norm": 0.281982958316803,
"learning_rate": 0.00016254617789144643,
"loss": 1.2057,
"step": 660
},
{
"epoch": 0.1903949985791418,
"grad_norm": 0.2688102424144745,
"learning_rate": 0.00016197783461210572,
"loss": 1.1958,
"step": 670
},
{
"epoch": 0.1932367149758454,
"grad_norm": 0.28181755542755127,
"learning_rate": 0.000161409491332765,
"loss": 1.1662,
"step": 680
},
{
"epoch": 0.19607843137254902,
"grad_norm": 0.23462365567684174,
"learning_rate": 0.00016084114805342428,
"loss": 1.1193,
"step": 690
},
{
"epoch": 0.19892014776925263,
"grad_norm": 0.27968090772628784,
"learning_rate": 0.00016027280477408354,
"loss": 1.1678,
"step": 700
},
{
"epoch": 0.20176186416595623,
"grad_norm": 0.2571905851364136,
"learning_rate": 0.00015970446149474283,
"loss": 1.1775,
"step": 710
},
{
"epoch": 0.20460358056265984,
"grad_norm": 0.2821357250213623,
"learning_rate": 0.0001591361182154021,
"loss": 1.1649,
"step": 720
},
{
"epoch": 0.20744529695936345,
"grad_norm": 0.2606565058231354,
"learning_rate": 0.0001585677749360614,
"loss": 1.0562,
"step": 730
},
{
"epoch": 0.21028701335606706,
"grad_norm": 0.27794864773750305,
"learning_rate": 0.00015799943165672065,
"loss": 1.1366,
"step": 740
},
{
"epoch": 0.21312872975277067,
"grad_norm": 0.2107602059841156,
"learning_rate": 0.00015743108837737994,
"loss": 1.1106,
"step": 750
},
{
"epoch": 0.21597044614947428,
"grad_norm": 0.2524462640285492,
"learning_rate": 0.00015686274509803922,
"loss": 1.1777,
"step": 760
},
{
"epoch": 0.21881216254617789,
"grad_norm": 0.22253374755382538,
"learning_rate": 0.0001562944018186985,
"loss": 1.0856,
"step": 770
},
{
"epoch": 0.2216538789428815,
"grad_norm": 0.2423143982887268,
"learning_rate": 0.0001557260585393578,
"loss": 1.1836,
"step": 780
},
{
"epoch": 0.2244955953395851,
"grad_norm": 0.2595592737197876,
"learning_rate": 0.00015515771526001705,
"loss": 1.1698,
"step": 790
},
{
"epoch": 0.2273373117362887,
"grad_norm": 0.2568744421005249,
"learning_rate": 0.00015458937198067633,
"loss": 1.1707,
"step": 800
},
{
"epoch": 0.23017902813299232,
"grad_norm": 0.30398836731910706,
"learning_rate": 0.00015402102870133561,
"loss": 1.1547,
"step": 810
},
{
"epoch": 0.23302074452969593,
"grad_norm": 0.300106406211853,
"learning_rate": 0.0001534526854219949,
"loss": 1.1925,
"step": 820
},
{
"epoch": 0.23586246092639954,
"grad_norm": 0.2774117588996887,
"learning_rate": 0.00015288434214265416,
"loss": 1.1404,
"step": 830
},
{
"epoch": 0.23870417732310314,
"grad_norm": 0.28219330310821533,
"learning_rate": 0.00015231599886331344,
"loss": 1.122,
"step": 840
},
{
"epoch": 0.24154589371980675,
"grad_norm": 0.3004832863807678,
"learning_rate": 0.00015174765558397272,
"loss": 1.2315,
"step": 850
},
{
"epoch": 0.24438761011651036,
"grad_norm": 0.24674588441848755,
"learning_rate": 0.000151179312304632,
"loss": 1.1663,
"step": 860
},
{
"epoch": 0.24722932651321397,
"grad_norm": 0.21483612060546875,
"learning_rate": 0.0001506109690252913,
"loss": 1.173,
"step": 870
},
{
"epoch": 0.2500710429099176,
"grad_norm": 0.3079366683959961,
"learning_rate": 0.00015004262574595055,
"loss": 1.1695,
"step": 880
},
{
"epoch": 0.2529127593066212,
"grad_norm": 0.2626102566719055,
"learning_rate": 0.00014947428246660983,
"loss": 1.1552,
"step": 890
},
{
"epoch": 0.2557544757033248,
"grad_norm": 0.3606242537498474,
"learning_rate": 0.00014890593918726912,
"loss": 1.0921,
"step": 900
},
{
"epoch": 0.25859619210002843,
"grad_norm": 0.3001089096069336,
"learning_rate": 0.0001483375959079284,
"loss": 1.1911,
"step": 910
},
{
"epoch": 0.26143790849673204,
"grad_norm": 0.2784072458744049,
"learning_rate": 0.00014776925262858766,
"loss": 1.1979,
"step": 920
},
{
"epoch": 0.26427962489343565,
"grad_norm": 0.21513643860816956,
"learning_rate": 0.00014720090934924694,
"loss": 1.0723,
"step": 930
},
{
"epoch": 0.26712134129013926,
"grad_norm": 0.253627210855484,
"learning_rate": 0.00014663256606990623,
"loss": 1.1883,
"step": 940
},
{
"epoch": 0.26996305768684287,
"grad_norm": 0.2411937564611435,
"learning_rate": 0.0001460642227905655,
"loss": 1.1786,
"step": 950
},
{
"epoch": 0.2728047740835465,
"grad_norm": 0.2327292114496231,
"learning_rate": 0.0001454958795112248,
"loss": 1.2151,
"step": 960
},
{
"epoch": 0.2756464904802501,
"grad_norm": 0.26323702931404114,
"learning_rate": 0.00014492753623188405,
"loss": 1.1002,
"step": 970
},
{
"epoch": 0.2784882068769537,
"grad_norm": 0.2239420861005783,
"learning_rate": 0.00014435919295254334,
"loss": 1.0453,
"step": 980
},
{
"epoch": 0.2813299232736573,
"grad_norm": 0.2786525785923004,
"learning_rate": 0.00014379084967320262,
"loss": 1.2195,
"step": 990
},
{
"epoch": 0.2841716396703609,
"grad_norm": 0.23764382302761078,
"learning_rate": 0.0001432225063938619,
"loss": 1.1263,
"step": 1000
},
{
"epoch": 0.2870133560670645,
"grad_norm": 0.2542721927165985,
"learning_rate": 0.00014265416311452116,
"loss": 1.1603,
"step": 1010
},
{
"epoch": 0.2898550724637681,
"grad_norm": 0.26884064078330994,
"learning_rate": 0.00014208581983518045,
"loss": 1.1759,
"step": 1020
},
{
"epoch": 0.29269678886047173,
"grad_norm": 0.2652948498725891,
"learning_rate": 0.00014151747655583973,
"loss": 1.0826,
"step": 1030
},
{
"epoch": 0.29553850525717534,
"grad_norm": 0.3024510443210602,
"learning_rate": 0.00014094913327649902,
"loss": 1.1451,
"step": 1040
},
{
"epoch": 0.29838022165387895,
"grad_norm": 0.22471967339515686,
"learning_rate": 0.0001403807899971583,
"loss": 1.1412,
"step": 1050
},
{
"epoch": 0.30122193805058256,
"grad_norm": 0.32953527569770813,
"learning_rate": 0.00013981244671781756,
"loss": 1.1233,
"step": 1060
},
{
"epoch": 0.30406365444728617,
"grad_norm": 0.3743230700492859,
"learning_rate": 0.00013924410343847684,
"loss": 1.1403,
"step": 1070
},
{
"epoch": 0.3069053708439898,
"grad_norm": 0.36825114488601685,
"learning_rate": 0.00013867576015913613,
"loss": 1.2296,
"step": 1080
},
{
"epoch": 0.3097470872406934,
"grad_norm": 0.3129768669605255,
"learning_rate": 0.0001381074168797954,
"loss": 1.1697,
"step": 1090
},
{
"epoch": 0.312588803637397,
"grad_norm": 0.31028246879577637,
"learning_rate": 0.00013753907360045467,
"loss": 1.096,
"step": 1100
},
{
"epoch": 0.3154305200341006,
"grad_norm": 0.2524968981742859,
"learning_rate": 0.00013697073032111395,
"loss": 1.1114,
"step": 1110
},
{
"epoch": 0.3182722364308042,
"grad_norm": 0.23146887123584747,
"learning_rate": 0.00013640238704177324,
"loss": 1.0634,
"step": 1120
},
{
"epoch": 0.3211139528275078,
"grad_norm": 0.26541128754615784,
"learning_rate": 0.00013583404376243252,
"loss": 1.0631,
"step": 1130
},
{
"epoch": 0.32395566922421143,
"grad_norm": 0.24775725603103638,
"learning_rate": 0.0001352657004830918,
"loss": 1.1456,
"step": 1140
},
{
"epoch": 0.32679738562091504,
"grad_norm": 0.25310489535331726,
"learning_rate": 0.00013469735720375106,
"loss": 1.2405,
"step": 1150
},
{
"epoch": 0.32963910201761865,
"grad_norm": 0.2598433792591095,
"learning_rate": 0.00013412901392441035,
"loss": 1.2455,
"step": 1160
},
{
"epoch": 0.33248081841432225,
"grad_norm": 0.2735394835472107,
"learning_rate": 0.00013356067064506963,
"loss": 1.1014,
"step": 1170
},
{
"epoch": 0.33532253481102586,
"grad_norm": 0.26913198828697205,
"learning_rate": 0.00013299232736572892,
"loss": 1.1476,
"step": 1180
},
{
"epoch": 0.33816425120772947,
"grad_norm": 0.22991891205310822,
"learning_rate": 0.00013242398408638817,
"loss": 1.0964,
"step": 1190
},
{
"epoch": 0.3410059676044331,
"grad_norm": 0.2543002963066101,
"learning_rate": 0.00013185564080704746,
"loss": 1.1409,
"step": 1200
},
{
"epoch": 0.3438476840011367,
"grad_norm": 0.2660631537437439,
"learning_rate": 0.00013128729752770674,
"loss": 1.0375,
"step": 1210
},
{
"epoch": 0.3466894003978403,
"grad_norm": 0.25068119168281555,
"learning_rate": 0.00013071895424836603,
"loss": 1.1708,
"step": 1220
},
{
"epoch": 0.3495311167945439,
"grad_norm": 0.27296605706214905,
"learning_rate": 0.0001301506109690253,
"loss": 1.1449,
"step": 1230
},
{
"epoch": 0.3523728331912475,
"grad_norm": 0.23561522364616394,
"learning_rate": 0.00012958226768968457,
"loss": 1.1954,
"step": 1240
},
{
"epoch": 0.3552145495879511,
"grad_norm": 0.2912009358406067,
"learning_rate": 0.00012901392441034385,
"loss": 1.1254,
"step": 1250
},
{
"epoch": 0.35805626598465473,
"grad_norm": 0.24392102658748627,
"learning_rate": 0.00012844558113100314,
"loss": 1.1263,
"step": 1260
},
{
"epoch": 0.36089798238135834,
"grad_norm": 0.30842769145965576,
"learning_rate": 0.00012787723785166242,
"loss": 1.0731,
"step": 1270
},
{
"epoch": 0.36373969877806195,
"grad_norm": 0.2747196853160858,
"learning_rate": 0.0001273088945723217,
"loss": 1.1913,
"step": 1280
},
{
"epoch": 0.36658141517476556,
"grad_norm": 0.31838688254356384,
"learning_rate": 0.00012674055129298096,
"loss": 1.222,
"step": 1290
},
{
"epoch": 0.36942313157146917,
"grad_norm": 0.3127056062221527,
"learning_rate": 0.00012617220801364025,
"loss": 1.0872,
"step": 1300
},
{
"epoch": 0.3722648479681728,
"grad_norm": 0.2799491882324219,
"learning_rate": 0.00012560386473429953,
"loss": 1.1277,
"step": 1310
},
{
"epoch": 0.3751065643648764,
"grad_norm": 0.2875117063522339,
"learning_rate": 0.00012503552145495881,
"loss": 1.153,
"step": 1320
},
{
"epoch": 0.37794828076158,
"grad_norm": 0.3062177300453186,
"learning_rate": 0.00012446717817561807,
"loss": 1.1142,
"step": 1330
},
{
"epoch": 0.3807899971582836,
"grad_norm": 0.23725247383117676,
"learning_rate": 0.00012389883489627736,
"loss": 1.0559,
"step": 1340
},
{
"epoch": 0.3836317135549872,
"grad_norm": 0.28051912784576416,
"learning_rate": 0.00012333049161693664,
"loss": 1.0642,
"step": 1350
},
{
"epoch": 0.3864734299516908,
"grad_norm": 0.2455301284790039,
"learning_rate": 0.00012276214833759592,
"loss": 1.175,
"step": 1360
},
{
"epoch": 0.3893151463483944,
"grad_norm": 0.26085713505744934,
"learning_rate": 0.0001221938050582552,
"loss": 1.1368,
"step": 1370
},
{
"epoch": 0.39215686274509803,
"grad_norm": 0.4084455370903015,
"learning_rate": 0.00012162546177891448,
"loss": 1.1346,
"step": 1380
},
{
"epoch": 0.39499857914180164,
"grad_norm": 0.28224310278892517,
"learning_rate": 0.00012105711849957375,
"loss": 1.0693,
"step": 1390
},
{
"epoch": 0.39784029553850525,
"grad_norm": 0.3237653970718384,
"learning_rate": 0.00012048877522023303,
"loss": 1.0585,
"step": 1400
},
{
"epoch": 0.40068201193520886,
"grad_norm": 0.26249366998672485,
"learning_rate": 0.0001199204319408923,
"loss": 1.1227,
"step": 1410
},
{
"epoch": 0.40352372833191247,
"grad_norm": 0.29983624815940857,
"learning_rate": 0.00011935208866155159,
"loss": 1.1473,
"step": 1420
},
{
"epoch": 0.4063654447286161,
"grad_norm": 0.26302003860473633,
"learning_rate": 0.00011878374538221086,
"loss": 1.2398,
"step": 1430
},
{
"epoch": 0.4092071611253197,
"grad_norm": 0.2707955539226532,
"learning_rate": 0.00011821540210287014,
"loss": 1.1299,
"step": 1440
},
{
"epoch": 0.4120488775220233,
"grad_norm": 0.47745946049690247,
"learning_rate": 0.00011764705882352942,
"loss": 1.138,
"step": 1450
},
{
"epoch": 0.4148905939187269,
"grad_norm": 0.24607343971729279,
"learning_rate": 0.0001170787155441887,
"loss": 1.1444,
"step": 1460
},
{
"epoch": 0.4177323103154305,
"grad_norm": 0.2819903492927551,
"learning_rate": 0.00011651037226484798,
"loss": 1.0861,
"step": 1470
},
{
"epoch": 0.4205740267121341,
"grad_norm": 0.2718110978603363,
"learning_rate": 0.00011594202898550725,
"loss": 1.1225,
"step": 1480
},
{
"epoch": 0.4234157431088377,
"grad_norm": 0.2966226041316986,
"learning_rate": 0.00011537368570616654,
"loss": 1.1901,
"step": 1490
},
{
"epoch": 0.42625745950554134,
"grad_norm": 0.38320621848106384,
"learning_rate": 0.00011480534242682581,
"loss": 1.134,
"step": 1500
},
{
"epoch": 0.42909917590224494,
"grad_norm": 0.2895069718360901,
"learning_rate": 0.0001142369991474851,
"loss": 1.0505,
"step": 1510
},
{
"epoch": 0.43194089229894855,
"grad_norm": 0.32522544264793396,
"learning_rate": 0.00011366865586814436,
"loss": 1.119,
"step": 1520
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.3680785298347473,
"learning_rate": 0.00011310031258880365,
"loss": 1.154,
"step": 1530
},
{
"epoch": 0.43762432509235577,
"grad_norm": 0.24093805253505707,
"learning_rate": 0.00011253196930946292,
"loss": 1.1294,
"step": 1540
},
{
"epoch": 0.4404660414890594,
"grad_norm": 0.3424024283885956,
"learning_rate": 0.0001119636260301222,
"loss": 1.1875,
"step": 1550
},
{
"epoch": 0.443307757885763,
"grad_norm": 0.24656616151332855,
"learning_rate": 0.00011139528275078149,
"loss": 1.111,
"step": 1560
},
{
"epoch": 0.4461494742824666,
"grad_norm": 0.289628803730011,
"learning_rate": 0.00011082693947144076,
"loss": 1.1301,
"step": 1570
},
{
"epoch": 0.4489911906791702,
"grad_norm": 0.33433884382247925,
"learning_rate": 0.00011025859619210004,
"loss": 1.1408,
"step": 1580
},
{
"epoch": 0.4518329070758738,
"grad_norm": 0.32477903366088867,
"learning_rate": 0.00010969025291275931,
"loss": 1.1735,
"step": 1590
},
{
"epoch": 0.4546746234725774,
"grad_norm": 0.2586929202079773,
"learning_rate": 0.0001091219096334186,
"loss": 1.1209,
"step": 1600
},
{
"epoch": 0.45751633986928103,
"grad_norm": 0.3958762586116791,
"learning_rate": 0.00010855356635407787,
"loss": 1.1538,
"step": 1610
},
{
"epoch": 0.46035805626598464,
"grad_norm": 0.2704383134841919,
"learning_rate": 0.00010798522307473715,
"loss": 1.1352,
"step": 1620
},
{
"epoch": 0.46319977266268825,
"grad_norm": 0.3805047869682312,
"learning_rate": 0.00010741687979539642,
"loss": 1.0875,
"step": 1630
},
{
"epoch": 0.46604148905939186,
"grad_norm": 0.2563640773296356,
"learning_rate": 0.00010684853651605571,
"loss": 1.0743,
"step": 1640
},
{
"epoch": 0.46888320545609546,
"grad_norm": 0.2729780972003937,
"learning_rate": 0.00010628019323671499,
"loss": 1.082,
"step": 1650
},
{
"epoch": 0.4717249218527991,
"grad_norm": 0.31282857060432434,
"learning_rate": 0.00010571184995737426,
"loss": 1.1308,
"step": 1660
},
{
"epoch": 0.4745666382495027,
"grad_norm": 0.30954697728157043,
"learning_rate": 0.00010514350667803355,
"loss": 1.1608,
"step": 1670
},
{
"epoch": 0.4774083546462063,
"grad_norm": 0.2565356194972992,
"learning_rate": 0.00010457516339869282,
"loss": 1.0761,
"step": 1680
},
{
"epoch": 0.4802500710429099,
"grad_norm": 0.25005489587783813,
"learning_rate": 0.0001040068201193521,
"loss": 1.129,
"step": 1690
},
{
"epoch": 0.4830917874396135,
"grad_norm": 0.2812393605709076,
"learning_rate": 0.00010343847684001137,
"loss": 1.1722,
"step": 1700
},
{
"epoch": 0.4859335038363171,
"grad_norm": 0.2592753469944,
"learning_rate": 0.00010287013356067066,
"loss": 1.0974,
"step": 1710
},
{
"epoch": 0.4887752202330207,
"grad_norm": 0.4299195408821106,
"learning_rate": 0.00010230179028132993,
"loss": 1.1636,
"step": 1720
},
{
"epoch": 0.49161693662972433,
"grad_norm": 0.2507430613040924,
"learning_rate": 0.00010173344700198921,
"loss": 1.1259,
"step": 1730
},
{
"epoch": 0.49445865302642794,
"grad_norm": 0.25915494561195374,
"learning_rate": 0.0001011651037226485,
"loss": 1.1513,
"step": 1740
},
{
"epoch": 0.49730036942313155,
"grad_norm": 0.4046742022037506,
"learning_rate": 0.00010059676044330777,
"loss": 1.1431,
"step": 1750
},
{
"epoch": 0.5001420858198352,
"grad_norm": 0.269300639629364,
"learning_rate": 0.00010002841716396705,
"loss": 1.1176,
"step": 1760
},
{
"epoch": 0.5029838022165388,
"grad_norm": 0.27633029222488403,
"learning_rate": 9.946007388462631e-05,
"loss": 1.1394,
"step": 1770
},
{
"epoch": 0.5058255186132424,
"grad_norm": 0.2624037563800812,
"learning_rate": 9.889173060528559e-05,
"loss": 1.1789,
"step": 1780
},
{
"epoch": 0.508667235009946,
"grad_norm": 0.2688696086406708,
"learning_rate": 9.832338732594488e-05,
"loss": 1.0863,
"step": 1790
},
{
"epoch": 0.5115089514066496,
"grad_norm": 0.41462844610214233,
"learning_rate": 9.775504404660415e-05,
"loss": 1.1191,
"step": 1800
},
{
"epoch": 0.5143506678033533,
"grad_norm": 0.26823553442955017,
"learning_rate": 9.718670076726343e-05,
"loss": 1.0279,
"step": 1810
},
{
"epoch": 0.5171923842000569,
"grad_norm": 0.37709948420524597,
"learning_rate": 9.66183574879227e-05,
"loss": 1.1478,
"step": 1820
},
{
"epoch": 0.5200341005967605,
"grad_norm": 0.3214150369167328,
"learning_rate": 9.605001420858199e-05,
"loss": 1.0844,
"step": 1830
},
{
"epoch": 0.5228758169934641,
"grad_norm": 0.27517786622047424,
"learning_rate": 9.548167092924126e-05,
"loss": 1.1664,
"step": 1840
},
{
"epoch": 0.5257175333901677,
"grad_norm": 0.2494751513004303,
"learning_rate": 9.491332764990054e-05,
"loss": 1.1159,
"step": 1850
},
{
"epoch": 0.5285592497868713,
"grad_norm": 0.3077758252620697,
"learning_rate": 9.434498437055981e-05,
"loss": 1.1658,
"step": 1860
},
{
"epoch": 0.5314009661835749,
"grad_norm": 0.34368380904197693,
"learning_rate": 9.37766410912191e-05,
"loss": 1.1377,
"step": 1870
},
{
"epoch": 0.5342426825802785,
"grad_norm": 0.2457958608865738,
"learning_rate": 9.320829781187838e-05,
"loss": 1.1323,
"step": 1880
},
{
"epoch": 0.5370843989769821,
"grad_norm": 0.2676566243171692,
"learning_rate": 9.263995453253765e-05,
"loss": 1.1968,
"step": 1890
},
{
"epoch": 0.5399261153736857,
"grad_norm": 0.23688159883022308,
"learning_rate": 9.207161125319694e-05,
"loss": 1.093,
"step": 1900
},
{
"epoch": 0.5427678317703893,
"grad_norm": 0.3742455542087555,
"learning_rate": 9.150326797385621e-05,
"loss": 1.1396,
"step": 1910
},
{
"epoch": 0.545609548167093,
"grad_norm": 0.31831708550453186,
"learning_rate": 9.093492469451549e-05,
"loss": 1.169,
"step": 1920
},
{
"epoch": 0.5484512645637966,
"grad_norm": 0.2743741273880005,
"learning_rate": 9.036658141517476e-05,
"loss": 1.0958,
"step": 1930
},
{
"epoch": 0.5512929809605002,
"grad_norm": 0.3475467562675476,
"learning_rate": 8.979823813583405e-05,
"loss": 1.0955,
"step": 1940
},
{
"epoch": 0.5541346973572038,
"grad_norm": 0.245005264878273,
"learning_rate": 8.922989485649332e-05,
"loss": 1.1146,
"step": 1950
},
{
"epoch": 0.5569764137539074,
"grad_norm": 0.2580576241016388,
"learning_rate": 8.86615515771526e-05,
"loss": 1.1223,
"step": 1960
},
{
"epoch": 0.559818130150611,
"grad_norm": 0.3004942536354065,
"learning_rate": 8.809320829781189e-05,
"loss": 1.0554,
"step": 1970
},
{
"epoch": 0.5626598465473146,
"grad_norm": 0.27015358209609985,
"learning_rate": 8.752486501847116e-05,
"loss": 1.1463,
"step": 1980
},
{
"epoch": 0.5655015629440182,
"grad_norm": 0.26687344908714294,
"learning_rate": 8.695652173913044e-05,
"loss": 1.1709,
"step": 1990
},
{
"epoch": 0.5683432793407218,
"grad_norm": 0.2721405625343323,
"learning_rate": 8.638817845978971e-05,
"loss": 1.1327,
"step": 2000
},
{
"epoch": 0.5711849957374254,
"grad_norm": 0.287565678358078,
"learning_rate": 8.5819835180449e-05,
"loss": 1.1123,
"step": 2010
},
{
"epoch": 0.574026712134129,
"grad_norm": 0.3454604148864746,
"learning_rate": 8.525149190110827e-05,
"loss": 1.1508,
"step": 2020
},
{
"epoch": 0.5768684285308326,
"grad_norm": 0.26024872064590454,
"learning_rate": 8.468314862176755e-05,
"loss": 1.0406,
"step": 2030
},
{
"epoch": 0.5797101449275363,
"grad_norm": 0.2535635828971863,
"learning_rate": 8.411480534242682e-05,
"loss": 1.1191,
"step": 2040
},
{
"epoch": 0.5825518613242399,
"grad_norm": 0.334521621465683,
"learning_rate": 8.35464620630861e-05,
"loss": 1.1982,
"step": 2050
},
{
"epoch": 0.5853935777209435,
"grad_norm": 0.27677032351493835,
"learning_rate": 8.297811878374539e-05,
"loss": 1.0914,
"step": 2060
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.360412061214447,
"learning_rate": 8.240977550440466e-05,
"loss": 1.1377,
"step": 2070
},
{
"epoch": 0.5910770105143507,
"grad_norm": 0.28759074211120605,
"learning_rate": 8.184143222506395e-05,
"loss": 1.053,
"step": 2080
},
{
"epoch": 0.5939187269110543,
"grad_norm": 0.32831844687461853,
"learning_rate": 8.127308894572322e-05,
"loss": 1.1326,
"step": 2090
},
{
"epoch": 0.5967604433077579,
"grad_norm": 0.25032633543014526,
"learning_rate": 8.07047456663825e-05,
"loss": 1.1457,
"step": 2100
},
{
"epoch": 0.5996021597044615,
"grad_norm": 0.28084343671798706,
"learning_rate": 8.013640238704177e-05,
"loss": 1.1014,
"step": 2110
},
{
"epoch": 0.6024438761011651,
"grad_norm": 0.26768869161605835,
"learning_rate": 7.956805910770106e-05,
"loss": 1.1442,
"step": 2120
},
{
"epoch": 0.6052855924978687,
"grad_norm": 0.35225123167037964,
"learning_rate": 7.899971582836033e-05,
"loss": 1.1275,
"step": 2130
},
{
"epoch": 0.6081273088945723,
"grad_norm": 0.25125908851623535,
"learning_rate": 7.843137254901961e-05,
"loss": 1.1318,
"step": 2140
},
{
"epoch": 0.610969025291276,
"grad_norm": 0.2658576965332031,
"learning_rate": 7.78630292696789e-05,
"loss": 1.127,
"step": 2150
},
{
"epoch": 0.6138107416879796,
"grad_norm": 0.27074316143989563,
"learning_rate": 7.729468599033817e-05,
"loss": 1.1609,
"step": 2160
},
{
"epoch": 0.6166524580846832,
"grad_norm": 0.4368594288825989,
"learning_rate": 7.672634271099745e-05,
"loss": 1.1488,
"step": 2170
},
{
"epoch": 0.6194941744813868,
"grad_norm": 0.3108392059803009,
"learning_rate": 7.615799943165672e-05,
"loss": 1.1158,
"step": 2180
},
{
"epoch": 0.6223358908780904,
"grad_norm": 0.3383192718029022,
"learning_rate": 7.5589656152316e-05,
"loss": 1.123,
"step": 2190
},
{
"epoch": 0.625177607274794,
"grad_norm": 0.35067611932754517,
"learning_rate": 7.502131287297528e-05,
"loss": 1.1087,
"step": 2200
},
{
"epoch": 0.6280193236714976,
"grad_norm": 0.28962525725364685,
"learning_rate": 7.445296959363456e-05,
"loss": 1.0414,
"step": 2210
},
{
"epoch": 0.6308610400682012,
"grad_norm": 0.2729092538356781,
"learning_rate": 7.388462631429383e-05,
"loss": 1.1024,
"step": 2220
},
{
"epoch": 0.6337027564649048,
"grad_norm": 0.26230186223983765,
"learning_rate": 7.331628303495311e-05,
"loss": 1.1446,
"step": 2230
},
{
"epoch": 0.6365444728616084,
"grad_norm": 0.30937954783439636,
"learning_rate": 7.27479397556124e-05,
"loss": 1.1018,
"step": 2240
},
{
"epoch": 0.639386189258312,
"grad_norm": 0.2578141987323761,
"learning_rate": 7.217959647627167e-05,
"loss": 1.124,
"step": 2250
},
{
"epoch": 0.6422279056550156,
"grad_norm": 0.2398582547903061,
"learning_rate": 7.161125319693095e-05,
"loss": 1.0647,
"step": 2260
},
{
"epoch": 0.6450696220517192,
"grad_norm": 0.2563438415527344,
"learning_rate": 7.104290991759022e-05,
"loss": 1.1878,
"step": 2270
},
{
"epoch": 0.6479113384484229,
"grad_norm": 0.3550574779510498,
"learning_rate": 7.047456663824951e-05,
"loss": 1.051,
"step": 2280
},
{
"epoch": 0.6507530548451265,
"grad_norm": 0.2969961166381836,
"learning_rate": 6.990622335890878e-05,
"loss": 1.1507,
"step": 2290
},
{
"epoch": 0.6535947712418301,
"grad_norm": 0.2654373049736023,
"learning_rate": 6.933788007956806e-05,
"loss": 1.0709,
"step": 2300
},
{
"epoch": 0.6564364876385337,
"grad_norm": 0.2643349766731262,
"learning_rate": 6.876953680022733e-05,
"loss": 1.0412,
"step": 2310
},
{
"epoch": 0.6592782040352373,
"grad_norm": 0.24034832417964935,
"learning_rate": 6.820119352088662e-05,
"loss": 1.1291,
"step": 2320
},
{
"epoch": 0.6621199204319409,
"grad_norm": 0.23572514951229095,
"learning_rate": 6.76328502415459e-05,
"loss": 1.0813,
"step": 2330
},
{
"epoch": 0.6649616368286445,
"grad_norm": 0.24992486834526062,
"learning_rate": 6.706450696220517e-05,
"loss": 1.0856,
"step": 2340
},
{
"epoch": 0.6678033532253481,
"grad_norm": 0.319242924451828,
"learning_rate": 6.649616368286446e-05,
"loss": 1.0326,
"step": 2350
},
{
"epoch": 0.6706450696220517,
"grad_norm": 0.2844800353050232,
"learning_rate": 6.592782040352373e-05,
"loss": 1.0593,
"step": 2360
},
{
"epoch": 0.6734867860187553,
"grad_norm": 0.3302006423473358,
"learning_rate": 6.535947712418301e-05,
"loss": 1.0814,
"step": 2370
},
{
"epoch": 0.6763285024154589,
"grad_norm": 0.25190767645835876,
"learning_rate": 6.479113384484228e-05,
"loss": 1.1088,
"step": 2380
},
{
"epoch": 0.6791702188121626,
"grad_norm": 0.35067909955978394,
"learning_rate": 6.422279056550157e-05,
"loss": 1.0752,
"step": 2390
},
{
"epoch": 0.6820119352088662,
"grad_norm": 0.3033742904663086,
"learning_rate": 6.365444728616085e-05,
"loss": 1.0715,
"step": 2400
},
{
"epoch": 0.6848536516055698,
"grad_norm": 0.2741170823574066,
"learning_rate": 6.308610400682012e-05,
"loss": 1.1067,
"step": 2410
},
{
"epoch": 0.6876953680022734,
"grad_norm": 0.32321301102638245,
"learning_rate": 6.251776072747941e-05,
"loss": 1.124,
"step": 2420
},
{
"epoch": 0.690537084398977,
"grad_norm": 0.3299727737903595,
"learning_rate": 6.194941744813868e-05,
"loss": 1.0749,
"step": 2430
},
{
"epoch": 0.6933788007956806,
"grad_norm": 0.2597588002681732,
"learning_rate": 6.138107416879796e-05,
"loss": 1.056,
"step": 2440
},
{
"epoch": 0.6962205171923842,
"grad_norm": 0.23553162813186646,
"learning_rate": 6.081273088945724e-05,
"loss": 1.0891,
"step": 2450
},
{
"epoch": 0.6990622335890878,
"grad_norm": 0.2980464696884155,
"learning_rate": 6.024438761011652e-05,
"loss": 1.1324,
"step": 2460
},
{
"epoch": 0.7019039499857914,
"grad_norm": 0.2644715905189514,
"learning_rate": 5.9676044330775795e-05,
"loss": 1.0958,
"step": 2470
},
{
"epoch": 0.704745666382495,
"grad_norm": 0.3175826668739319,
"learning_rate": 5.910770105143507e-05,
"loss": 1.1356,
"step": 2480
},
{
"epoch": 0.7075873827791986,
"grad_norm": 0.2976154685020447,
"learning_rate": 5.853935777209435e-05,
"loss": 1.0735,
"step": 2490
},
{
"epoch": 0.7104290991759022,
"grad_norm": 0.31366729736328125,
"learning_rate": 5.797101449275363e-05,
"loss": 1.1751,
"step": 2500
},
{
"epoch": 0.7132708155726059,
"grad_norm": 0.31203290820121765,
"learning_rate": 5.7402671213412905e-05,
"loss": 1.1212,
"step": 2510
},
{
"epoch": 0.7161125319693095,
"grad_norm": 0.2864065170288086,
"learning_rate": 5.683432793407218e-05,
"loss": 1.146,
"step": 2520
},
{
"epoch": 0.7189542483660131,
"grad_norm": 0.2840626835823059,
"learning_rate": 5.626598465473146e-05,
"loss": 1.0261,
"step": 2530
},
{
"epoch": 0.7217959647627167,
"grad_norm": 0.3476808965206146,
"learning_rate": 5.5697641375390744e-05,
"loss": 1.1344,
"step": 2540
},
{
"epoch": 0.7246376811594203,
"grad_norm": 0.3733687996864319,
"learning_rate": 5.512929809605002e-05,
"loss": 1.1298,
"step": 2550
},
{
"epoch": 0.7274793975561239,
"grad_norm": 0.30094149708747864,
"learning_rate": 5.45609548167093e-05,
"loss": 1.1255,
"step": 2560
},
{
"epoch": 0.7303211139528275,
"grad_norm": 0.32213765382766724,
"learning_rate": 5.3992611537368576e-05,
"loss": 1.1319,
"step": 2570
},
{
"epoch": 0.7331628303495311,
"grad_norm": 0.2626177668571472,
"learning_rate": 5.3424268258027854e-05,
"loss": 1.141,
"step": 2580
},
{
"epoch": 0.7360045467462347,
"grad_norm": 0.30769652128219604,
"learning_rate": 5.285592497868713e-05,
"loss": 1.0559,
"step": 2590
},
{
"epoch": 0.7388462631429383,
"grad_norm": 0.24354040622711182,
"learning_rate": 5.228758169934641e-05,
"loss": 1.1385,
"step": 2600
},
{
"epoch": 0.7416879795396419,
"grad_norm": 0.24648752808570862,
"learning_rate": 5.1719238420005686e-05,
"loss": 1.1231,
"step": 2610
},
{
"epoch": 0.7445296959363455,
"grad_norm": 0.30956050753593445,
"learning_rate": 5.1150895140664964e-05,
"loss": 1.104,
"step": 2620
},
{
"epoch": 0.7473714123330492,
"grad_norm": 0.27976328134536743,
"learning_rate": 5.058255186132425e-05,
"loss": 1.1321,
"step": 2630
},
{
"epoch": 0.7502131287297528,
"grad_norm": 0.2535441517829895,
"learning_rate": 5.0014208581983526e-05,
"loss": 1.1174,
"step": 2640
},
{
"epoch": 0.7530548451264564,
"grad_norm": 0.3341921865940094,
"learning_rate": 4.9445865302642796e-05,
"loss": 1.1038,
"step": 2650
},
{
"epoch": 0.75589656152316,
"grad_norm": 0.4902798533439636,
"learning_rate": 4.8877522023302074e-05,
"loss": 1.1245,
"step": 2660
},
{
"epoch": 0.7587382779198636,
"grad_norm": 0.2470143884420395,
"learning_rate": 4.830917874396135e-05,
"loss": 1.1574,
"step": 2670
},
{
"epoch": 0.7615799943165672,
"grad_norm": 0.2521553635597229,
"learning_rate": 4.774083546462063e-05,
"loss": 1.1444,
"step": 2680
},
{
"epoch": 0.7644217107132708,
"grad_norm": 0.3512313961982727,
"learning_rate": 4.7172492185279906e-05,
"loss": 1.1339,
"step": 2690
},
{
"epoch": 0.7672634271099744,
"grad_norm": 0.25479793548583984,
"learning_rate": 4.660414890593919e-05,
"loss": 1.1517,
"step": 2700
},
{
"epoch": 0.770105143506678,
"grad_norm": 0.2850602865219116,
"learning_rate": 4.603580562659847e-05,
"loss": 1.1212,
"step": 2710
},
{
"epoch": 0.7729468599033816,
"grad_norm": 0.3531084358692169,
"learning_rate": 4.5467462347257746e-05,
"loss": 1.0433,
"step": 2720
},
{
"epoch": 0.7757885763000852,
"grad_norm": 0.2699624001979828,
"learning_rate": 4.489911906791702e-05,
"loss": 1.0769,
"step": 2730
},
{
"epoch": 0.7786302926967889,
"grad_norm": 0.3828187584877014,
"learning_rate": 4.43307757885763e-05,
"loss": 1.0826,
"step": 2740
},
{
"epoch": 0.7814720090934925,
"grad_norm": 0.36253124475479126,
"learning_rate": 4.376243250923558e-05,
"loss": 1.1098,
"step": 2750
},
{
"epoch": 0.7843137254901961,
"grad_norm": 0.26642584800720215,
"learning_rate": 4.3194089229894856e-05,
"loss": 1.084,
"step": 2760
},
{
"epoch": 0.7871554418868997,
"grad_norm": 0.33443573117256165,
"learning_rate": 4.262574595055413e-05,
"loss": 1.1113,
"step": 2770
},
{
"epoch": 0.7899971582836033,
"grad_norm": 0.3628551661968231,
"learning_rate": 4.205740267121341e-05,
"loss": 1.107,
"step": 2780
},
{
"epoch": 0.7928388746803069,
"grad_norm": 0.4214700758457184,
"learning_rate": 4.1489059391872695e-05,
"loss": 1.1348,
"step": 2790
},
{
"epoch": 0.7956805910770105,
"grad_norm": 0.2711296081542969,
"learning_rate": 4.092071611253197e-05,
"loss": 1.0151,
"step": 2800
},
{
"epoch": 0.7985223074737141,
"grad_norm": 0.25555798411369324,
"learning_rate": 4.035237283319125e-05,
"loss": 1.1388,
"step": 2810
},
{
"epoch": 0.8013640238704177,
"grad_norm": 0.2785557806491852,
"learning_rate": 3.978402955385053e-05,
"loss": 1.1212,
"step": 2820
},
{
"epoch": 0.8042057402671213,
"grad_norm": 0.2974455654621124,
"learning_rate": 3.9215686274509805e-05,
"loss": 1.1068,
"step": 2830
},
{
"epoch": 0.8070474566638249,
"grad_norm": 0.2941993176937103,
"learning_rate": 3.864734299516908e-05,
"loss": 1.1138,
"step": 2840
},
{
"epoch": 0.8098891730605285,
"grad_norm": 0.26110532879829407,
"learning_rate": 3.807899971582836e-05,
"loss": 1.1223,
"step": 2850
},
{
"epoch": 0.8127308894572322,
"grad_norm": 0.3765209913253784,
"learning_rate": 3.751065643648764e-05,
"loss": 1.1082,
"step": 2860
},
{
"epoch": 0.8155726058539358,
"grad_norm": 0.25102654099464417,
"learning_rate": 3.6942313157146915e-05,
"loss": 1.0929,
"step": 2870
},
{
"epoch": 0.8184143222506394,
"grad_norm": 0.28592199087142944,
"learning_rate": 3.63739698778062e-05,
"loss": 1.1606,
"step": 2880
},
{
"epoch": 0.821256038647343,
"grad_norm": 0.28945067524909973,
"learning_rate": 3.580562659846548e-05,
"loss": 1.0372,
"step": 2890
},
{
"epoch": 0.8240977550440466,
"grad_norm": 0.25189608335494995,
"learning_rate": 3.5237283319124754e-05,
"loss": 1.1106,
"step": 2900
},
{
"epoch": 0.8269394714407502,
"grad_norm": 0.25932416319847107,
"learning_rate": 3.466894003978403e-05,
"loss": 1.0905,
"step": 2910
},
{
"epoch": 0.8297811878374538,
"grad_norm": 0.384107768535614,
"learning_rate": 3.410059676044331e-05,
"loss": 1.0509,
"step": 2920
},
{
"epoch": 0.8326229042341574,
"grad_norm": 0.2776072919368744,
"learning_rate": 3.353225348110259e-05,
"loss": 1.0733,
"step": 2930
},
{
"epoch": 0.835464620630861,
"grad_norm": 0.3497239351272583,
"learning_rate": 3.2963910201761864e-05,
"loss": 1.0315,
"step": 2940
},
{
"epoch": 0.8383063370275646,
"grad_norm": 0.3055514693260193,
"learning_rate": 3.239556692242114e-05,
"loss": 1.1525,
"step": 2950
},
{
"epoch": 0.8411480534242682,
"grad_norm": 0.34542274475097656,
"learning_rate": 3.1827223643080426e-05,
"loss": 1.1209,
"step": 2960
},
{
"epoch": 0.8439897698209718,
"grad_norm": 0.28153786063194275,
"learning_rate": 3.1258880363739704e-05,
"loss": 1.1896,
"step": 2970
},
{
"epoch": 0.8468314862176755,
"grad_norm": 0.3401312232017517,
"learning_rate": 3.069053708439898e-05,
"loss": 1.1267,
"step": 2980
},
{
"epoch": 0.8496732026143791,
"grad_norm": 0.275859534740448,
"learning_rate": 3.012219380505826e-05,
"loss": 1.1269,
"step": 2990
},
{
"epoch": 0.8525149190110827,
"grad_norm": 0.2636275589466095,
"learning_rate": 2.9553850525717536e-05,
"loss": 1.0916,
"step": 3000
},
{
"epoch": 0.8553566354077863,
"grad_norm": 0.2864493727684021,
"learning_rate": 2.8985507246376814e-05,
"loss": 1.1135,
"step": 3010
},
{
"epoch": 0.8581983518044899,
"grad_norm": 0.32858121395111084,
"learning_rate": 2.841716396703609e-05,
"loss": 1.138,
"step": 3020
},
{
"epoch": 0.8610400682011935,
"grad_norm": 0.2940399944782257,
"learning_rate": 2.7848820687695372e-05,
"loss": 1.1357,
"step": 3030
},
{
"epoch": 0.8638817845978971,
"grad_norm": 0.3530689477920532,
"learning_rate": 2.728047740835465e-05,
"loss": 1.0974,
"step": 3040
},
{
"epoch": 0.8667235009946007,
"grad_norm": 0.2814668118953705,
"learning_rate": 2.6712134129013927e-05,
"loss": 1.1401,
"step": 3050
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.31256726384162903,
"learning_rate": 2.6143790849673204e-05,
"loss": 1.0997,
"step": 3060
},
{
"epoch": 0.8724069337880079,
"grad_norm": 0.24463021755218506,
"learning_rate": 2.5575447570332482e-05,
"loss": 1.044,
"step": 3070
},
{
"epoch": 0.8752486501847115,
"grad_norm": 0.2700346112251282,
"learning_rate": 2.5007104290991763e-05,
"loss": 1.0897,
"step": 3080
},
{
"epoch": 0.8780903665814151,
"grad_norm": 0.36381930112838745,
"learning_rate": 2.4438761011651037e-05,
"loss": 1.0861,
"step": 3090
},
{
"epoch": 0.8809320829781188,
"grad_norm": 0.37403604388237,
"learning_rate": 2.3870417732310314e-05,
"loss": 1.1199,
"step": 3100
},
{
"epoch": 0.8837737993748224,
"grad_norm": 0.3271077275276184,
"learning_rate": 2.3302074452969595e-05,
"loss": 1.0453,
"step": 3110
},
{
"epoch": 0.886615515771526,
"grad_norm": 0.29940828680992126,
"learning_rate": 2.2733731173628873e-05,
"loss": 1.1525,
"step": 3120
},
{
"epoch": 0.8894572321682296,
"grad_norm": 0.26956799626350403,
"learning_rate": 2.216538789428815e-05,
"loss": 1.1288,
"step": 3130
},
{
"epoch": 0.8922989485649332,
"grad_norm": 0.28132757544517517,
"learning_rate": 2.1597044614947428e-05,
"loss": 1.137,
"step": 3140
},
{
"epoch": 0.8951406649616368,
"grad_norm": 0.3393004834651947,
"learning_rate": 2.1028701335606705e-05,
"loss": 1.0603,
"step": 3150
},
{
"epoch": 0.8979823813583404,
"grad_norm": 0.29636818170547485,
"learning_rate": 2.0460358056265986e-05,
"loss": 1.0911,
"step": 3160
},
{
"epoch": 0.900824097755044,
"grad_norm": 0.30555668473243713,
"learning_rate": 1.9892014776925264e-05,
"loss": 1.2047,
"step": 3170
},
{
"epoch": 0.9036658141517476,
"grad_norm": 0.31181901693344116,
"learning_rate": 1.932367149758454e-05,
"loss": 1.1506,
"step": 3180
},
{
"epoch": 0.9065075305484512,
"grad_norm": 0.3718467652797699,
"learning_rate": 1.875532821824382e-05,
"loss": 1.1278,
"step": 3190
},
{
"epoch": 0.9093492469451548,
"grad_norm": 0.2683012783527374,
"learning_rate": 1.81869849389031e-05,
"loss": 1.0927,
"step": 3200
},
{
"epoch": 0.9121909633418585,
"grad_norm": 0.3459053933620453,
"learning_rate": 1.7618641659562377e-05,
"loss": 0.9968,
"step": 3210
},
{
"epoch": 0.9150326797385621,
"grad_norm": 0.37453094124794006,
"learning_rate": 1.7050298380221655e-05,
"loss": 1.0649,
"step": 3220
},
{
"epoch": 0.9178743961352657,
"grad_norm": 0.2843706011772156,
"learning_rate": 1.6481955100880932e-05,
"loss": 1.0884,
"step": 3230
},
{
"epoch": 0.9207161125319693,
"grad_norm": 0.2847299575805664,
"learning_rate": 1.5913611821540213e-05,
"loss": 1.124,
"step": 3240
},
{
"epoch": 0.9235578289286729,
"grad_norm": 0.2724878191947937,
"learning_rate": 1.534526854219949e-05,
"loss": 1.1162,
"step": 3250
},
{
"epoch": 0.9263995453253765,
"grad_norm": 0.3032269775867462,
"learning_rate": 1.4776925262858768e-05,
"loss": 1.152,
"step": 3260
},
{
"epoch": 0.9292412617220801,
"grad_norm": 0.2314031720161438,
"learning_rate": 1.4208581983518046e-05,
"loss": 1.083,
"step": 3270
},
{
"epoch": 0.9320829781187837,
"grad_norm": 0.3560166656970978,
"learning_rate": 1.3640238704177325e-05,
"loss": 1.0657,
"step": 3280
},
{
"epoch": 0.9349246945154873,
"grad_norm": 0.3593125343322754,
"learning_rate": 1.3071895424836602e-05,
"loss": 1.1199,
"step": 3290
},
{
"epoch": 0.9377664109121909,
"grad_norm": 0.2630976438522339,
"learning_rate": 1.2503552145495881e-05,
"loss": 1.0206,
"step": 3300
},
{
"epoch": 0.9406081273088945,
"grad_norm": 0.2689420282840729,
"learning_rate": 1.1935208866155157e-05,
"loss": 1.1464,
"step": 3310
},
{
"epoch": 0.9434498437055981,
"grad_norm": 0.33892905712127686,
"learning_rate": 1.1366865586814436e-05,
"loss": 1.1148,
"step": 3320
},
{
"epoch": 0.9462915601023018,
"grad_norm": 0.2977355420589447,
"learning_rate": 1.0798522307473714e-05,
"loss": 1.0744,
"step": 3330
},
{
"epoch": 0.9491332764990054,
"grad_norm": 0.24483497440814972,
"learning_rate": 1.0230179028132993e-05,
"loss": 1.0821,
"step": 3340
},
{
"epoch": 0.951974992895709,
"grad_norm": 0.31132972240448,
"learning_rate": 9.66183574879227e-06,
"loss": 1.1173,
"step": 3350
},
{
"epoch": 0.9548167092924126,
"grad_norm": 0.38017576932907104,
"learning_rate": 9.09349246945155e-06,
"loss": 1.0849,
"step": 3360
},
{
"epoch": 0.9576584256891162,
"grad_norm": 0.3665505051612854,
"learning_rate": 8.525149190110827e-06,
"loss": 1.1455,
"step": 3370
},
{
"epoch": 0.9605001420858198,
"grad_norm": 0.29008227586746216,
"learning_rate": 7.956805910770107e-06,
"loss": 1.073,
"step": 3380
},
{
"epoch": 0.9633418584825234,
"grad_norm": 0.3025209903717041,
"learning_rate": 7.388462631429384e-06,
"loss": 1.1217,
"step": 3390
},
{
"epoch": 0.966183574879227,
"grad_norm": 0.28936612606048584,
"learning_rate": 6.820119352088662e-06,
"loss": 1.0274,
"step": 3400
},
{
"epoch": 0.9690252912759306,
"grad_norm": 0.3080444633960724,
"learning_rate": 6.251776072747941e-06,
"loss": 1.1762,
"step": 3410
},
{
"epoch": 0.9718670076726342,
"grad_norm": 0.3216501772403717,
"learning_rate": 5.683432793407218e-06,
"loss": 1.1382,
"step": 3420
},
{
"epoch": 0.9747087240693378,
"grad_norm": 0.2817239761352539,
"learning_rate": 5.1150895140664966e-06,
"loss": 1.0852,
"step": 3430
},
{
"epoch": 0.9775504404660414,
"grad_norm": 0.33580419421195984,
"learning_rate": 4.546746234725775e-06,
"loss": 1.1122,
"step": 3440
},
{
"epoch": 0.9803921568627451,
"grad_norm": 0.27000322937965393,
"learning_rate": 3.978402955385053e-06,
"loss": 1.1576,
"step": 3450
},
{
"epoch": 0.9832338732594487,
"grad_norm": 0.3857513666152954,
"learning_rate": 3.410059676044331e-06,
"loss": 1.0738,
"step": 3460
},
{
"epoch": 0.9860755896561523,
"grad_norm": 0.2714364528656006,
"learning_rate": 2.841716396703609e-06,
"loss": 1.0978,
"step": 3470
},
{
"epoch": 0.9889173060528559,
"grad_norm": 0.314001202583313,
"learning_rate": 2.2733731173628875e-06,
"loss": 1.0158,
"step": 3480
},
{
"epoch": 0.9917590224495595,
"grad_norm": 0.3561016023159027,
"learning_rate": 1.7050298380221656e-06,
"loss": 1.1506,
"step": 3490
},
{
"epoch": 0.9946007388462631,
"grad_norm": 0.33506929874420166,
"learning_rate": 1.1366865586814437e-06,
"loss": 1.0156,
"step": 3500
}
],
"logging_steps": 10,
"max_steps": 3519,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.6906446012416e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}