DeepseekProverV2Finetuned01 / trainer_state.json
raduv98's picture
Upload folder using huggingface_hub
ade6557 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 13705,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001459321415541773,
"grad_norm": 2.6745453947710165,
"learning_rate": 4.611650485436894e-06,
"loss": 1.0637,
"step": 20
},
{
"epoch": 0.002918642831083546,
"grad_norm": 0.8636258229268736,
"learning_rate": 9.466019417475729e-06,
"loss": 0.3278,
"step": 40
},
{
"epoch": 0.004377964246625319,
"grad_norm": 0.6429542529341992,
"learning_rate": 1.4320388349514562e-05,
"loss": 0.1701,
"step": 60
},
{
"epoch": 0.005837285662167092,
"grad_norm": 0.4870228029221957,
"learning_rate": 1.91747572815534e-05,
"loss": 0.1413,
"step": 80
},
{
"epoch": 0.0072966070777088655,
"grad_norm": 0.4961345702002848,
"learning_rate": 2.4029126213592234e-05,
"loss": 0.1396,
"step": 100
},
{
"epoch": 0.008755928493250638,
"grad_norm": 0.40834540818199705,
"learning_rate": 2.8883495145631068e-05,
"loss": 0.1266,
"step": 120
},
{
"epoch": 0.010215249908792412,
"grad_norm": 0.6018679639107668,
"learning_rate": 3.373786407766991e-05,
"loss": 0.1103,
"step": 140
},
{
"epoch": 0.011674571324334184,
"grad_norm": 0.5960255000220145,
"learning_rate": 3.859223300970874e-05,
"loss": 0.1325,
"step": 160
},
{
"epoch": 0.013133892739875957,
"grad_norm": 0.4212109164544222,
"learning_rate": 4.344660194174757e-05,
"loss": 0.1237,
"step": 180
},
{
"epoch": 0.014593214155417731,
"grad_norm": 0.6565890138369451,
"learning_rate": 4.830097087378641e-05,
"loss": 0.1233,
"step": 200
},
{
"epoch": 0.016052535570959505,
"grad_norm": 0.3857563740550121,
"learning_rate": 5.3155339805825244e-05,
"loss": 0.1185,
"step": 220
},
{
"epoch": 0.017511856986501276,
"grad_norm": 0.35890344101071686,
"learning_rate": 5.800970873786408e-05,
"loss": 0.1415,
"step": 240
},
{
"epoch": 0.01897117840204305,
"grad_norm": 0.2979671831060535,
"learning_rate": 6.286407766990293e-05,
"loss": 0.1362,
"step": 260
},
{
"epoch": 0.020430499817584824,
"grad_norm": 0.38285277593806,
"learning_rate": 6.771844660194175e-05,
"loss": 0.1302,
"step": 280
},
{
"epoch": 0.021889821233126595,
"grad_norm": 0.5932557337111825,
"learning_rate": 7.25728155339806e-05,
"loss": 0.1286,
"step": 300
},
{
"epoch": 0.02334914264866837,
"grad_norm": 0.39643282763459103,
"learning_rate": 7.742718446601942e-05,
"loss": 0.1494,
"step": 320
},
{
"epoch": 0.024808464064210143,
"grad_norm": 0.4853895916428549,
"learning_rate": 8.228155339805825e-05,
"loss": 0.1304,
"step": 340
},
{
"epoch": 0.026267785479751914,
"grad_norm": 0.48381251386974544,
"learning_rate": 8.713592233009709e-05,
"loss": 0.1394,
"step": 360
},
{
"epoch": 0.027727106895293688,
"grad_norm": 0.4904529766502894,
"learning_rate": 9.199029126213593e-05,
"loss": 0.1509,
"step": 380
},
{
"epoch": 0.029186428310835462,
"grad_norm": 0.7958367491650168,
"learning_rate": 9.684466019417477e-05,
"loss": 0.1395,
"step": 400
},
{
"epoch": 0.030645749726377236,
"grad_norm": 0.624215442246557,
"learning_rate": 9.999993157895145e-05,
"loss": 0.1745,
"step": 420
},
{
"epoch": 0.03210507114191901,
"grad_norm": 0.6815596480986827,
"learning_rate": 9.999898206558094e-05,
"loss": 0.1433,
"step": 440
},
{
"epoch": 0.03356439255746078,
"grad_norm": 0.7939990598700336,
"learning_rate": 9.999691549843376e-05,
"loss": 0.1609,
"step": 460
},
{
"epoch": 0.03502371397300255,
"grad_norm": 0.4965525216601077,
"learning_rate": 9.999373192368015e-05,
"loss": 0.1523,
"step": 480
},
{
"epoch": 0.036483035388544326,
"grad_norm": 1.8640551410419721,
"learning_rate": 9.998943141244607e-05,
"loss": 0.1809,
"step": 500
},
{
"epoch": 0.0379423568040861,
"grad_norm": 0.6022579601783719,
"learning_rate": 9.99840140608115e-05,
"loss": 0.1956,
"step": 520
},
{
"epoch": 0.039401678219627874,
"grad_norm": 0.40549268604230454,
"learning_rate": 9.997747998980835e-05,
"loss": 0.1648,
"step": 540
},
{
"epoch": 0.04086099963516965,
"grad_norm": 0.49940827687014067,
"learning_rate": 9.996982934541781e-05,
"loss": 0.1475,
"step": 560
},
{
"epoch": 0.04232032105071142,
"grad_norm": 0.5149466244422736,
"learning_rate": 9.996106229856695e-05,
"loss": 0.1518,
"step": 580
},
{
"epoch": 0.04377964246625319,
"grad_norm": 0.29222877668330227,
"learning_rate": 9.995117904512503e-05,
"loss": 0.1682,
"step": 600
},
{
"epoch": 0.045238963881794964,
"grad_norm": 0.5286646588687645,
"learning_rate": 9.994017980589906e-05,
"loss": 0.1421,
"step": 620
},
{
"epoch": 0.04669828529733674,
"grad_norm": 1.1957202323680398,
"learning_rate": 9.992806482662887e-05,
"loss": 0.1699,
"step": 640
},
{
"epoch": 0.04815760671287851,
"grad_norm": 0.5883304358400666,
"learning_rate": 9.991483437798162e-05,
"loss": 0.1338,
"step": 660
},
{
"epoch": 0.049616928128420286,
"grad_norm": 0.6301249373045459,
"learning_rate": 9.99004887555458e-05,
"loss": 0.1547,
"step": 680
},
{
"epoch": 0.05107624954396206,
"grad_norm": 0.4012564183658102,
"learning_rate": 9.988502827982458e-05,
"loss": 0.1367,
"step": 700
},
{
"epoch": 0.05253557095950383,
"grad_norm": 0.3168597769245346,
"learning_rate": 9.986845329622862e-05,
"loss": 0.1475,
"step": 720
},
{
"epoch": 0.0539948923750456,
"grad_norm": 0.38246057665835786,
"learning_rate": 9.985076417506844e-05,
"loss": 0.1397,
"step": 740
},
{
"epoch": 0.055454213790587376,
"grad_norm": 0.3864484630630238,
"learning_rate": 9.983196131154607e-05,
"loss": 0.1368,
"step": 760
},
{
"epoch": 0.05691353520612915,
"grad_norm": 0.34381534736001396,
"learning_rate": 9.981204512574626e-05,
"loss": 0.1202,
"step": 780
},
{
"epoch": 0.058372856621670924,
"grad_norm": 0.5385327614437169,
"learning_rate": 9.979101606262708e-05,
"loss": 0.1444,
"step": 800
},
{
"epoch": 0.0598321780372127,
"grad_norm": 0.3291873594168373,
"learning_rate": 9.976887459200999e-05,
"loss": 0.1344,
"step": 820
},
{
"epoch": 0.06129149945275447,
"grad_norm": 0.35530404400780635,
"learning_rate": 9.97456212085693e-05,
"loss": 0.1455,
"step": 840
},
{
"epoch": 0.06275082086829624,
"grad_norm": 0.25357381945588353,
"learning_rate": 9.972125643182121e-05,
"loss": 0.1405,
"step": 860
},
{
"epoch": 0.06421014228383802,
"grad_norm": 0.33587236054655184,
"learning_rate": 9.969578080611211e-05,
"loss": 0.1273,
"step": 880
},
{
"epoch": 0.06566946369937979,
"grad_norm": 0.2636319125685315,
"learning_rate": 9.966919490060646e-05,
"loss": 0.1155,
"step": 900
},
{
"epoch": 0.06712878511492155,
"grad_norm": 0.4749750626433427,
"learning_rate": 9.96414993092741e-05,
"loss": 0.1577,
"step": 920
},
{
"epoch": 0.06858810653046334,
"grad_norm": 0.26401350478445823,
"learning_rate": 9.961269465087691e-05,
"loss": 0.1313,
"step": 940
},
{
"epoch": 0.0700474279460051,
"grad_norm": 0.3330162979528586,
"learning_rate": 9.958278156895502e-05,
"loss": 0.135,
"step": 960
},
{
"epoch": 0.07150674936154688,
"grad_norm": 0.5511486073921424,
"learning_rate": 9.955176073181249e-05,
"loss": 0.1274,
"step": 980
},
{
"epoch": 0.07296607077708865,
"grad_norm": 0.7818317017171568,
"learning_rate": 9.951963283250227e-05,
"loss": 0.1565,
"step": 1000
},
{
"epoch": 0.07442539219263043,
"grad_norm": 0.42949055730605057,
"learning_rate": 9.948639858881083e-05,
"loss": 0.1487,
"step": 1020
},
{
"epoch": 0.0758847136081722,
"grad_norm": 0.41682813706150323,
"learning_rate": 9.945205874324201e-05,
"loss": 0.143,
"step": 1040
},
{
"epoch": 0.07734403502371397,
"grad_norm": 1.059389264065714,
"learning_rate": 9.941661406300052e-05,
"loss": 0.1312,
"step": 1060
},
{
"epoch": 0.07880335643925575,
"grad_norm": 0.3764528008769406,
"learning_rate": 9.938006533997475e-05,
"loss": 0.1475,
"step": 1080
},
{
"epoch": 0.08026267785479752,
"grad_norm": 0.3433293556468226,
"learning_rate": 9.934241339071912e-05,
"loss": 0.1379,
"step": 1100
},
{
"epoch": 0.0817219992703393,
"grad_norm": 0.4669143421528186,
"learning_rate": 9.930365905643578e-05,
"loss": 0.1528,
"step": 1120
},
{
"epoch": 0.08318132068588106,
"grad_norm": 0.3165847851959828,
"learning_rate": 9.92638032029559e-05,
"loss": 0.1424,
"step": 1140
},
{
"epoch": 0.08464064210142284,
"grad_norm": 0.4112176255009246,
"learning_rate": 9.922284672072021e-05,
"loss": 0.1466,
"step": 1160
},
{
"epoch": 0.08609996351696461,
"grad_norm": 0.24922324308271643,
"learning_rate": 9.918079052475922e-05,
"loss": 0.1151,
"step": 1180
},
{
"epoch": 0.08755928493250638,
"grad_norm": 0.29440735283548447,
"learning_rate": 9.913763555467269e-05,
"loss": 0.1502,
"step": 1200
},
{
"epoch": 0.08901860634804816,
"grad_norm": 0.31114703090098295,
"learning_rate": 9.909338277460872e-05,
"loss": 0.1163,
"step": 1220
},
{
"epoch": 0.09047792776358993,
"grad_norm": 0.26314581527572667,
"learning_rate": 9.904803317324211e-05,
"loss": 0.1124,
"step": 1240
},
{
"epoch": 0.09193724917913171,
"grad_norm": 0.37910467700641326,
"learning_rate": 9.90015877637524e-05,
"loss": 0.1624,
"step": 1260
},
{
"epoch": 0.09339657059467348,
"grad_norm": 0.32917497252554034,
"learning_rate": 9.895404758380109e-05,
"loss": 0.1417,
"step": 1280
},
{
"epoch": 0.09485589201021526,
"grad_norm": 0.4253161624681656,
"learning_rate": 9.890541369550854e-05,
"loss": 0.1243,
"step": 1300
},
{
"epoch": 0.09631521342575702,
"grad_norm": 0.2579086839658108,
"learning_rate": 9.885568718543025e-05,
"loss": 0.1386,
"step": 1320
},
{
"epoch": 0.09777453484129879,
"grad_norm": 0.22942493216872414,
"learning_rate": 9.88048691645326e-05,
"loss": 0.13,
"step": 1340
},
{
"epoch": 0.09923385625684057,
"grad_norm": 0.45710318102883435,
"learning_rate": 9.87529607681679e-05,
"loss": 0.1777,
"step": 1360
},
{
"epoch": 0.10069317767238234,
"grad_norm": 0.27950949145967136,
"learning_rate": 9.869996315604915e-05,
"loss": 0.1397,
"step": 1380
},
{
"epoch": 0.10215249908792412,
"grad_norm": 0.35374191781286285,
"learning_rate": 9.864587751222415e-05,
"loss": 0.1269,
"step": 1400
},
{
"epoch": 0.10361182050346589,
"grad_norm": 0.45122040063810864,
"learning_rate": 9.859070504504894e-05,
"loss": 0.1479,
"step": 1420
},
{
"epoch": 0.10507114191900765,
"grad_norm": 0.3679594319404153,
"learning_rate": 9.85344469871609e-05,
"loss": 0.1333,
"step": 1440
},
{
"epoch": 0.10653046333454944,
"grad_norm": 0.2773979819671957,
"learning_rate": 9.847710459545109e-05,
"loss": 0.1293,
"step": 1460
},
{
"epoch": 0.1079897847500912,
"grad_norm": 0.18675042963035182,
"learning_rate": 9.841867915103632e-05,
"loss": 0.1262,
"step": 1480
},
{
"epoch": 0.10944910616563298,
"grad_norm": 0.30565129208355624,
"learning_rate": 9.835917195923044e-05,
"loss": 0.1197,
"step": 1500
},
{
"epoch": 0.11090842758117475,
"grad_norm": 0.2626135808770724,
"learning_rate": 9.829858434951516e-05,
"loss": 0.1132,
"step": 1520
},
{
"epoch": 0.11236774899671653,
"grad_norm": 0.36381732806040473,
"learning_rate": 9.823691767551042e-05,
"loss": 0.1397,
"step": 1540
},
{
"epoch": 0.1138270704122583,
"grad_norm": 0.2531250566483984,
"learning_rate": 9.817417331494409e-05,
"loss": 0.0946,
"step": 1560
},
{
"epoch": 0.11528639182780007,
"grad_norm": 0.31305747632958897,
"learning_rate": 9.81103526696212e-05,
"loss": 0.1154,
"step": 1580
},
{
"epoch": 0.11674571324334185,
"grad_norm": 0.3203482106895159,
"learning_rate": 9.804545716539265e-05,
"loss": 0.1263,
"step": 1600
},
{
"epoch": 0.11820503465888361,
"grad_norm": 0.21193993774401784,
"learning_rate": 9.797948825212331e-05,
"loss": 0.1282,
"step": 1620
},
{
"epoch": 0.1196643560744254,
"grad_norm": 0.328003998882712,
"learning_rate": 9.791244740365965e-05,
"loss": 0.1217,
"step": 1640
},
{
"epoch": 0.12112367748996716,
"grad_norm": 0.25049879501157474,
"learning_rate": 9.784433611779684e-05,
"loss": 0.1395,
"step": 1660
},
{
"epoch": 0.12258299890550894,
"grad_norm": 0.2889080458455597,
"learning_rate": 9.777515591624522e-05,
"loss": 0.1281,
"step": 1680
},
{
"epoch": 0.12404232032105071,
"grad_norm": 0.32303024299802585,
"learning_rate": 9.77049083445964e-05,
"loss": 0.1279,
"step": 1700
},
{
"epoch": 0.12550164173659248,
"grad_norm": 0.4839798792492212,
"learning_rate": 9.76335949722886e-05,
"loss": 0.1077,
"step": 1720
},
{
"epoch": 0.12696096315213426,
"grad_norm": 0.3053946282511456,
"learning_rate": 9.756121739257173e-05,
"loss": 0.1306,
"step": 1740
},
{
"epoch": 0.12842028456767604,
"grad_norm": 0.37752644019410203,
"learning_rate": 9.748777722247164e-05,
"loss": 0.1219,
"step": 1760
},
{
"epoch": 0.1298796059832178,
"grad_norm": 0.3582486729875512,
"learning_rate": 9.741327610275417e-05,
"loss": 0.1098,
"step": 1780
},
{
"epoch": 0.13133892739875958,
"grad_norm": 0.32665426597686364,
"learning_rate": 9.73377156978883e-05,
"loss": 0.1131,
"step": 1800
},
{
"epoch": 0.13279824881430136,
"grad_norm": 0.29447834111629645,
"learning_rate": 9.726109769600915e-05,
"loss": 0.1408,
"step": 1820
},
{
"epoch": 0.1342575702298431,
"grad_norm": 0.3757371639048124,
"learning_rate": 9.718342380888013e-05,
"loss": 0.1181,
"step": 1840
},
{
"epoch": 0.1357168916453849,
"grad_norm": 0.2721970485578736,
"learning_rate": 9.710469577185473e-05,
"loss": 0.1397,
"step": 1860
},
{
"epoch": 0.13717621306092667,
"grad_norm": 0.25541942141740964,
"learning_rate": 9.702491534383779e-05,
"loss": 0.123,
"step": 1880
},
{
"epoch": 0.13863553447646845,
"grad_norm": 0.28833999806064703,
"learning_rate": 9.69440843072462e-05,
"loss": 0.117,
"step": 1900
},
{
"epoch": 0.1400948558920102,
"grad_norm": 0.269673320995455,
"learning_rate": 9.686220446796896e-05,
"loss": 0.1137,
"step": 1920
},
{
"epoch": 0.141554177307552,
"grad_norm": 0.48330990807963714,
"learning_rate": 9.677927765532701e-05,
"loss": 0.1528,
"step": 1940
},
{
"epoch": 0.14301349872309377,
"grad_norm": 0.3363262147438343,
"learning_rate": 9.669530572203227e-05,
"loss": 0.1104,
"step": 1960
},
{
"epoch": 0.14447282013863552,
"grad_norm": 0.25580892613717404,
"learning_rate": 9.661029054414622e-05,
"loss": 0.1193,
"step": 1980
},
{
"epoch": 0.1459321415541773,
"grad_norm": 0.45388106676996987,
"learning_rate": 9.652423402103805e-05,
"loss": 0.1592,
"step": 2000
},
{
"epoch": 0.14739146296971908,
"grad_norm": 0.3709494593344652,
"learning_rate": 9.643713807534219e-05,
"loss": 0.1073,
"step": 2020
},
{
"epoch": 0.14885078438526086,
"grad_norm": 0.6788345381577189,
"learning_rate": 9.634900465291534e-05,
"loss": 0.1315,
"step": 2040
},
{
"epoch": 0.15031010580080262,
"grad_norm": 0.2869088910525234,
"learning_rate": 9.625983572279304e-05,
"loss": 0.1184,
"step": 2060
},
{
"epoch": 0.1517694272163444,
"grad_norm": 0.2964111500353762,
"learning_rate": 9.616963327714566e-05,
"loss": 0.1115,
"step": 2080
},
{
"epoch": 0.15322874863188618,
"grad_norm": 0.3152701415407213,
"learning_rate": 9.607839933123386e-05,
"loss": 0.117,
"step": 2100
},
{
"epoch": 0.15468807004742793,
"grad_norm": 0.30792427867208466,
"learning_rate": 9.598613592336364e-05,
"loss": 0.1219,
"step": 2120
},
{
"epoch": 0.15614739146296971,
"grad_norm": 0.28660782708863575,
"learning_rate": 9.589284511484071e-05,
"loss": 0.1436,
"step": 2140
},
{
"epoch": 0.1576067128785115,
"grad_norm": 0.32617953316288606,
"learning_rate": 9.579852898992452e-05,
"loss": 0.1287,
"step": 2160
},
{
"epoch": 0.15906603429405328,
"grad_norm": 0.41379725314108085,
"learning_rate": 9.570318965578163e-05,
"loss": 0.1097,
"step": 2180
},
{
"epoch": 0.16052535570959503,
"grad_norm": 0.22219925652341765,
"learning_rate": 9.560682924243866e-05,
"loss": 0.1171,
"step": 2200
},
{
"epoch": 0.1619846771251368,
"grad_norm": 0.2835465449542872,
"learning_rate": 9.550944990273473e-05,
"loss": 0.1275,
"step": 2220
},
{
"epoch": 0.1634439985406786,
"grad_norm": 0.3971492169748388,
"learning_rate": 9.54110538122733e-05,
"loss": 0.1029,
"step": 2240
},
{
"epoch": 0.16490331995622035,
"grad_norm": 0.27416529849431764,
"learning_rate": 9.531164316937362e-05,
"loss": 0.1209,
"step": 2260
},
{
"epoch": 0.16636264137176213,
"grad_norm": 0.2653777493304375,
"learning_rate": 9.52112201950216e-05,
"loss": 0.1132,
"step": 2280
},
{
"epoch": 0.1678219627873039,
"grad_norm": 0.5642093807940471,
"learning_rate": 9.510978713282017e-05,
"loss": 0.1299,
"step": 2300
},
{
"epoch": 0.1692812842028457,
"grad_norm": 0.24584617694789176,
"learning_rate": 9.500734624893914e-05,
"loss": 0.1251,
"step": 2320
},
{
"epoch": 0.17074060561838744,
"grad_norm": 0.272116000365995,
"learning_rate": 9.490389983206466e-05,
"loss": 0.1281,
"step": 2340
},
{
"epoch": 0.17219992703392922,
"grad_norm": 0.3358227142033562,
"learning_rate": 9.4799450193348e-05,
"loss": 0.1296,
"step": 2360
},
{
"epoch": 0.173659248449471,
"grad_norm": 0.2051814222925398,
"learning_rate": 9.469399966635391e-05,
"loss": 0.1191,
"step": 2380
},
{
"epoch": 0.17511856986501276,
"grad_norm": 0.20408049257855926,
"learning_rate": 9.458755060700856e-05,
"loss": 0.1141,
"step": 2400
},
{
"epoch": 0.17657789128055454,
"grad_norm": 0.1849829651088512,
"learning_rate": 9.448010539354685e-05,
"loss": 0.1127,
"step": 2420
},
{
"epoch": 0.17803721269609632,
"grad_norm": 0.18080713942428248,
"learning_rate": 9.437166642645926e-05,
"loss": 0.1394,
"step": 2440
},
{
"epoch": 0.1794965341116381,
"grad_norm": 0.27541852629660835,
"learning_rate": 9.426223612843828e-05,
"loss": 0.1214,
"step": 2460
},
{
"epoch": 0.18095585552717985,
"grad_norm": 0.37092708297153004,
"learning_rate": 9.415181694432423e-05,
"loss": 0.146,
"step": 2480
},
{
"epoch": 0.18241517694272164,
"grad_norm": 0.36083890038936484,
"learning_rate": 9.404041134105066e-05,
"loss": 0.1248,
"step": 2500
},
{
"epoch": 0.18387449835826342,
"grad_norm": 0.2776438139983535,
"learning_rate": 9.392802180758926e-05,
"loss": 0.1368,
"step": 2520
},
{
"epoch": 0.18533381977380517,
"grad_norm": 0.34820547586785217,
"learning_rate": 9.38146508548942e-05,
"loss": 0.1155,
"step": 2540
},
{
"epoch": 0.18679314118934695,
"grad_norm": 0.294238763398288,
"learning_rate": 9.370030101584605e-05,
"loss": 0.1172,
"step": 2560
},
{
"epoch": 0.18825246260488873,
"grad_norm": 0.22876093865750505,
"learning_rate": 9.358497484519524e-05,
"loss": 0.1241,
"step": 2580
},
{
"epoch": 0.1897117840204305,
"grad_norm": 0.2856249418184077,
"learning_rate": 9.34686749195049e-05,
"loss": 0.1251,
"step": 2600
},
{
"epoch": 0.19117110543597227,
"grad_norm": 0.2919772530638231,
"learning_rate": 9.335140383709333e-05,
"loss": 0.12,
"step": 2620
},
{
"epoch": 0.19263042685151405,
"grad_norm": 0.3294893864056268,
"learning_rate": 9.323316421797602e-05,
"loss": 0.1097,
"step": 2640
},
{
"epoch": 0.19408974826705583,
"grad_norm": 0.3920752736984575,
"learning_rate": 9.311395870380698e-05,
"loss": 0.1151,
"step": 2660
},
{
"epoch": 0.19554906968259758,
"grad_norm": 0.1668745397084369,
"learning_rate": 9.299378995781984e-05,
"loss": 0.1191,
"step": 2680
},
{
"epoch": 0.19700839109813936,
"grad_norm": 0.19167495599752757,
"learning_rate": 9.28726606647683e-05,
"loss": 0.1413,
"step": 2700
},
{
"epoch": 0.19846771251368114,
"grad_norm": 0.39783090766324053,
"learning_rate": 9.275057353086611e-05,
"loss": 0.149,
"step": 2720
},
{
"epoch": 0.19992703392922292,
"grad_norm": 0.2810998132604279,
"learning_rate": 9.262753128372672e-05,
"loss": 0.1194,
"step": 2740
},
{
"epoch": 0.20138635534476468,
"grad_norm": 0.2235431056863839,
"learning_rate": 9.25035366723022e-05,
"loss": 0.1339,
"step": 2760
},
{
"epoch": 0.20284567676030646,
"grad_norm": 0.21528060380233013,
"learning_rate": 9.237859246682193e-05,
"loss": 0.1254,
"step": 2780
},
{
"epoch": 0.20430499817584824,
"grad_norm": 0.5942022277992831,
"learning_rate": 9.22527014587307e-05,
"loss": 0.1279,
"step": 2800
},
{
"epoch": 0.20576431959139,
"grad_norm": 0.22522986172233075,
"learning_rate": 9.212586646062626e-05,
"loss": 0.1016,
"step": 2820
},
{
"epoch": 0.20722364100693177,
"grad_norm": 0.38913350777355465,
"learning_rate": 9.19980903061966e-05,
"loss": 0.1321,
"step": 2840
},
{
"epoch": 0.20868296242247356,
"grad_norm": 0.2761445042724322,
"learning_rate": 9.186937585015654e-05,
"loss": 0.1006,
"step": 2860
},
{
"epoch": 0.2101422838380153,
"grad_norm": 0.2904489675134637,
"learning_rate": 9.173972596818399e-05,
"loss": 0.1391,
"step": 2880
},
{
"epoch": 0.2116016052535571,
"grad_norm": 0.3863861717225745,
"learning_rate": 9.160914355685577e-05,
"loss": 0.1338,
"step": 2900
},
{
"epoch": 0.21306092666909887,
"grad_norm": 0.23375929184834016,
"learning_rate": 9.147763153358276e-05,
"loss": 0.1271,
"step": 2920
},
{
"epoch": 0.21452024808464065,
"grad_norm": 0.2103193670388651,
"learning_rate": 9.134519283654483e-05,
"loss": 0.1115,
"step": 2940
},
{
"epoch": 0.2159795695001824,
"grad_norm": 0.253417546073443,
"learning_rate": 9.121183042462517e-05,
"loss": 0.0965,
"step": 2960
},
{
"epoch": 0.2174388909157242,
"grad_norm": 0.36352688754174645,
"learning_rate": 9.107754727734414e-05,
"loss": 0.1257,
"step": 2980
},
{
"epoch": 0.21889821233126597,
"grad_norm": 0.33374450352895074,
"learning_rate": 9.094234639479273e-05,
"loss": 0.12,
"step": 3000
},
{
"epoch": 0.22035753374680772,
"grad_norm": 0.24900650250532352,
"learning_rate": 9.080623079756561e-05,
"loss": 0.1071,
"step": 3020
},
{
"epoch": 0.2218168551623495,
"grad_norm": 0.21315684544430627,
"learning_rate": 9.066920352669353e-05,
"loss": 0.1382,
"step": 3040
},
{
"epoch": 0.22327617657789128,
"grad_norm": 0.2263034759808627,
"learning_rate": 9.053126764357537e-05,
"loss": 0.145,
"step": 3060
},
{
"epoch": 0.22473549799343306,
"grad_norm": 0.17408999521123467,
"learning_rate": 9.03924262299099e-05,
"loss": 0.1125,
"step": 3080
},
{
"epoch": 0.22619481940897482,
"grad_norm": 0.26382605132819903,
"learning_rate": 9.025268238762678e-05,
"loss": 0.1345,
"step": 3100
},
{
"epoch": 0.2276541408245166,
"grad_norm": 0.428881304462081,
"learning_rate": 9.011203923881728e-05,
"loss": 0.1223,
"step": 3120
},
{
"epoch": 0.22911346224005838,
"grad_norm": 0.23176671690985062,
"learning_rate": 8.997049992566462e-05,
"loss": 0.1259,
"step": 3140
},
{
"epoch": 0.23057278365560013,
"grad_norm": 0.27711123572562557,
"learning_rate": 8.982806761037363e-05,
"loss": 0.1228,
"step": 3160
},
{
"epoch": 0.23203210507114191,
"grad_norm": 0.28590819940098056,
"learning_rate": 8.968474547510022e-05,
"loss": 0.1312,
"step": 3180
},
{
"epoch": 0.2334914264866837,
"grad_norm": 0.29608561342126305,
"learning_rate": 8.954053672188022e-05,
"loss": 0.1123,
"step": 3200
},
{
"epoch": 0.23495074790222548,
"grad_norm": 0.22352582010864266,
"learning_rate": 8.93954445725579e-05,
"loss": 0.102,
"step": 3220
},
{
"epoch": 0.23641006931776723,
"grad_norm": 0.17310790477532703,
"learning_rate": 8.924947226871392e-05,
"loss": 0.1326,
"step": 3240
},
{
"epoch": 0.237869390733309,
"grad_norm": 0.21667547494734962,
"learning_rate": 8.91026230715929e-05,
"loss": 0.1367,
"step": 3260
},
{
"epoch": 0.2393287121488508,
"grad_norm": 0.20281398075174428,
"learning_rate": 8.895490026203067e-05,
"loss": 0.1289,
"step": 3280
},
{
"epoch": 0.24078803356439255,
"grad_norm": 0.2869894200024577,
"learning_rate": 8.880630714038087e-05,
"loss": 0.1356,
"step": 3300
},
{
"epoch": 0.24224735497993433,
"grad_norm": 0.25752530118391737,
"learning_rate": 8.865684702644121e-05,
"loss": 0.1265,
"step": 3320
},
{
"epoch": 0.2437066763954761,
"grad_norm": 0.3193768272298343,
"learning_rate": 8.85065232593794e-05,
"loss": 0.112,
"step": 3340
},
{
"epoch": 0.2451659978110179,
"grad_norm": 0.36091443188401096,
"learning_rate": 8.835533919765844e-05,
"loss": 0.095,
"step": 3360
},
{
"epoch": 0.24662531922655964,
"grad_norm": 1.3698974484227704,
"learning_rate": 8.820329821896163e-05,
"loss": 0.1168,
"step": 3380
},
{
"epoch": 0.24808464064210142,
"grad_norm": 0.2487563159790042,
"learning_rate": 8.805040372011712e-05,
"loss": 0.113,
"step": 3400
},
{
"epoch": 0.2495439620576432,
"grad_norm": 0.3698970509982561,
"learning_rate": 8.789665911702199e-05,
"loss": 0.1215,
"step": 3420
},
{
"epoch": 0.25100328347318496,
"grad_norm": 0.1876341055377898,
"learning_rate": 8.774206784456597e-05,
"loss": 0.1209,
"step": 3440
},
{
"epoch": 0.25246260488872674,
"grad_norm": 0.24295665772880776,
"learning_rate": 8.758663335655469e-05,
"loss": 0.1229,
"step": 3460
},
{
"epoch": 0.2539219263042685,
"grad_norm": 0.2689776977679599,
"learning_rate": 8.743035912563244e-05,
"loss": 0.0832,
"step": 3480
},
{
"epoch": 0.2553812477198103,
"grad_norm": 0.379412893920958,
"learning_rate": 8.727324864320472e-05,
"loss": 0.1106,
"step": 3500
},
{
"epoch": 0.2568405691353521,
"grad_norm": 0.31695262624414844,
"learning_rate": 8.711530541936017e-05,
"loss": 0.1023,
"step": 3520
},
{
"epoch": 0.2582998905508938,
"grad_norm": 0.21783049074759805,
"learning_rate": 8.695653298279208e-05,
"loss": 0.1009,
"step": 3540
},
{
"epoch": 0.2597592119664356,
"grad_norm": 0.2703353850483838,
"learning_rate": 8.67969348807197e-05,
"loss": 0.1211,
"step": 3560
},
{
"epoch": 0.26121853338197737,
"grad_norm": 0.1932384187079349,
"learning_rate": 8.663651467880885e-05,
"loss": 0.1039,
"step": 3580
},
{
"epoch": 0.26267785479751915,
"grad_norm": 0.28285556612386037,
"learning_rate": 8.647527596109237e-05,
"loss": 0.1158,
"step": 3600
},
{
"epoch": 0.26413717621306093,
"grad_norm": 0.32813759107134893,
"learning_rate": 8.631322232988994e-05,
"loss": 0.1311,
"step": 3620
},
{
"epoch": 0.2655964976286027,
"grad_norm": 0.2312850774172843,
"learning_rate": 8.615035740572773e-05,
"loss": 0.1129,
"step": 3640
},
{
"epoch": 0.2670558190441445,
"grad_norm": 0.3303464994517715,
"learning_rate": 8.598668482725732e-05,
"loss": 0.1278,
"step": 3660
},
{
"epoch": 0.2685151404596862,
"grad_norm": 0.24992627997941425,
"learning_rate": 8.582220825117467e-05,
"loss": 0.0928,
"step": 3680
},
{
"epoch": 0.269974461875228,
"grad_norm": 0.4135400436284091,
"learning_rate": 8.565693135213815e-05,
"loss": 0.1032,
"step": 3700
},
{
"epoch": 0.2714337832907698,
"grad_norm": 0.16234055427123434,
"learning_rate": 8.549085782268663e-05,
"loss": 0.1187,
"step": 3720
},
{
"epoch": 0.27289310470631156,
"grad_norm": 0.21509892954974083,
"learning_rate": 8.532399137315693e-05,
"loss": 0.1312,
"step": 3740
},
{
"epoch": 0.27435242612185334,
"grad_norm": 0.484277155110313,
"learning_rate": 8.51563357316009e-05,
"loss": 0.0971,
"step": 3760
},
{
"epoch": 0.2758117475373951,
"grad_norm": 0.4137856953789829,
"learning_rate": 8.498789464370212e-05,
"loss": 0.1153,
"step": 3780
},
{
"epoch": 0.2772710689529369,
"grad_norm": 0.5336023621351729,
"learning_rate": 8.48186718726923e-05,
"loss": 0.1133,
"step": 3800
},
{
"epoch": 0.27873039036847863,
"grad_norm": 0.4245926967265952,
"learning_rate": 8.464867119926711e-05,
"loss": 0.1188,
"step": 3820
},
{
"epoch": 0.2801897117840204,
"grad_norm": 0.37902596075543216,
"learning_rate": 8.447789642150176e-05,
"loss": 0.1054,
"step": 3840
},
{
"epoch": 0.2816490331995622,
"grad_norm": 0.31818209527759106,
"learning_rate": 8.430635135476615e-05,
"loss": 0.1362,
"step": 3860
},
{
"epoch": 0.283108354615104,
"grad_norm": 0.22795920895858368,
"learning_rate": 8.413403983163958e-05,
"loss": 0.111,
"step": 3880
},
{
"epoch": 0.28456767603064576,
"grad_norm": 0.37896391196946616,
"learning_rate": 8.396096570182519e-05,
"loss": 0.1027,
"step": 3900
},
{
"epoch": 0.28602699744618754,
"grad_norm": 0.3346173686783235,
"learning_rate": 8.378713283206389e-05,
"loss": 0.1245,
"step": 3920
},
{
"epoch": 0.2874863188617293,
"grad_norm": 0.1912037431397104,
"learning_rate": 8.361254510604804e-05,
"loss": 0.1106,
"step": 3940
},
{
"epoch": 0.28894564027727104,
"grad_norm": 0.2345929112818003,
"learning_rate": 8.343720642433462e-05,
"loss": 0.0864,
"step": 3960
},
{
"epoch": 0.2904049616928128,
"grad_norm": 0.2699123096192621,
"learning_rate": 8.326112070425811e-05,
"loss": 0.1085,
"step": 3980
},
{
"epoch": 0.2918642831083546,
"grad_norm": 0.23429437678203588,
"learning_rate": 8.308429187984297e-05,
"loss": 0.1356,
"step": 4000
},
{
"epoch": 0.2933236045238964,
"grad_norm": 0.28130939764083146,
"learning_rate": 8.290672390171576e-05,
"loss": 0.1102,
"step": 4020
},
{
"epoch": 0.29478292593943817,
"grad_norm": 0.31157607453363195,
"learning_rate": 8.272842073701688e-05,
"loss": 0.1004,
"step": 4040
},
{
"epoch": 0.29624224735497995,
"grad_norm": 0.34115856403247896,
"learning_rate": 8.254938636931184e-05,
"loss": 0.0911,
"step": 4060
},
{
"epoch": 0.29770156877052173,
"grad_norm": 0.23348388763532596,
"learning_rate": 8.236962479850247e-05,
"loss": 0.0934,
"step": 4080
},
{
"epoch": 0.29916089018606346,
"grad_norm": 0.25310795043145723,
"learning_rate": 8.218914004073734e-05,
"loss": 0.107,
"step": 4100
},
{
"epoch": 0.30062021160160524,
"grad_norm": 0.2859341390716333,
"learning_rate": 8.200793612832213e-05,
"loss": 0.1034,
"step": 4120
},
{
"epoch": 0.302079533017147,
"grad_norm": 0.1589307537613677,
"learning_rate": 8.182601710962958e-05,
"loss": 0.1024,
"step": 4140
},
{
"epoch": 0.3035388544326888,
"grad_norm": 0.21346275027988693,
"learning_rate": 8.164338704900894e-05,
"loss": 0.1008,
"step": 4160
},
{
"epoch": 0.3049981758482306,
"grad_norm": 0.23819056236182787,
"learning_rate": 8.14600500266953e-05,
"loss": 0.1106,
"step": 4180
},
{
"epoch": 0.30645749726377236,
"grad_norm": 0.28549063766432503,
"learning_rate": 8.127601013871829e-05,
"loss": 0.1127,
"step": 4200
},
{
"epoch": 0.30791681867931414,
"grad_norm": 0.42956976973150385,
"learning_rate": 8.109127149681066e-05,
"loss": 0.119,
"step": 4220
},
{
"epoch": 0.30937614009485587,
"grad_norm": 0.4318716465775294,
"learning_rate": 8.090583822831637e-05,
"loss": 0.1213,
"step": 4240
},
{
"epoch": 0.31083546151039765,
"grad_norm": 0.3083984081079968,
"learning_rate": 8.071971447609847e-05,
"loss": 0.1161,
"step": 4260
},
{
"epoch": 0.31229478292593943,
"grad_norm": 0.38276763388234036,
"learning_rate": 8.053290439844639e-05,
"loss": 0.1277,
"step": 4280
},
{
"epoch": 0.3137541043414812,
"grad_norm": 0.1908229423823102,
"learning_rate": 8.034541216898315e-05,
"loss": 0.0972,
"step": 4300
},
{
"epoch": 0.315213425757023,
"grad_norm": 0.1542948433528475,
"learning_rate": 8.01572419765721e-05,
"loss": 0.0921,
"step": 4320
},
{
"epoch": 0.3166727471725648,
"grad_norm": 0.21419210109077533,
"learning_rate": 7.996839802522331e-05,
"loss": 0.1182,
"step": 4340
},
{
"epoch": 0.31813206858810655,
"grad_norm": 0.3984295641541688,
"learning_rate": 7.977888453399967e-05,
"loss": 0.1277,
"step": 4360
},
{
"epoch": 0.3195913900036483,
"grad_norm": 0.23812270948662653,
"learning_rate": 7.958870573692258e-05,
"loss": 0.1189,
"step": 4380
},
{
"epoch": 0.32105071141919006,
"grad_norm": 0.2698116268855808,
"learning_rate": 7.939786588287743e-05,
"loss": 0.1028,
"step": 4400
},
{
"epoch": 0.32251003283473184,
"grad_norm": 0.23000489317809045,
"learning_rate": 7.92063692355186e-05,
"loss": 0.1179,
"step": 4420
},
{
"epoch": 0.3239693542502736,
"grad_norm": 0.2804136973700269,
"learning_rate": 7.901422007317426e-05,
"loss": 0.1039,
"step": 4440
},
{
"epoch": 0.3254286756658154,
"grad_norm": 0.24908274716197615,
"learning_rate": 7.882142268875075e-05,
"loss": 0.1189,
"step": 4460
},
{
"epoch": 0.3268879970813572,
"grad_norm": 0.23252680742999735,
"learning_rate": 7.862798138963672e-05,
"loss": 0.1131,
"step": 4480
},
{
"epoch": 0.32834731849689897,
"grad_norm": 0.1786470217408672,
"learning_rate": 7.843390049760679e-05,
"loss": 0.1073,
"step": 4500
},
{
"epoch": 0.3298066399124407,
"grad_norm": 0.26097955557070734,
"learning_rate": 7.823918434872515e-05,
"loss": 0.1315,
"step": 4520
},
{
"epoch": 0.33126596132798247,
"grad_norm": 0.32002806565738656,
"learning_rate": 7.80438372932485e-05,
"loss": 0.1045,
"step": 4540
},
{
"epoch": 0.33272528274352425,
"grad_norm": 0.2131844778500901,
"learning_rate": 7.784786369552905e-05,
"loss": 0.0941,
"step": 4560
},
{
"epoch": 0.33418460415906603,
"grad_norm": 0.1798301571622198,
"learning_rate": 7.765126793391691e-05,
"loss": 0.1088,
"step": 4580
},
{
"epoch": 0.3356439255746078,
"grad_norm": 0.17051341313737456,
"learning_rate": 7.74540544006622e-05,
"loss": 0.1042,
"step": 4600
},
{
"epoch": 0.3371032469901496,
"grad_norm": 0.22572869425970987,
"learning_rate": 7.725622750181712e-05,
"loss": 0.1007,
"step": 4620
},
{
"epoch": 0.3385625684056914,
"grad_norm": 0.1968135681484681,
"learning_rate": 7.70577916571373e-05,
"loss": 0.0884,
"step": 4640
},
{
"epoch": 0.3400218898212331,
"grad_norm": 0.2868939162865515,
"learning_rate": 7.68587512999832e-05,
"loss": 0.0973,
"step": 4660
},
{
"epoch": 0.3414812112367749,
"grad_norm": 0.1585501010853745,
"learning_rate": 7.665911087722103e-05,
"loss": 0.1008,
"step": 4680
},
{
"epoch": 0.34294053265231667,
"grad_norm": 0.4516898386815449,
"learning_rate": 7.645887484912334e-05,
"loss": 0.1146,
"step": 4700
},
{
"epoch": 0.34439985406785845,
"grad_norm": 0.3086424574036419,
"learning_rate": 7.625804768926944e-05,
"loss": 0.1184,
"step": 4720
},
{
"epoch": 0.3458591754834002,
"grad_norm": 0.20584040402606343,
"learning_rate": 7.605663388444541e-05,
"loss": 0.1147,
"step": 4740
},
{
"epoch": 0.347318496898942,
"grad_norm": 0.1966429839833748,
"learning_rate": 7.585463793454393e-05,
"loss": 0.0878,
"step": 4760
},
{
"epoch": 0.3487778183144838,
"grad_norm": 0.2445766881016643,
"learning_rate": 7.56520643524636e-05,
"loss": 0.13,
"step": 4780
},
{
"epoch": 0.3502371397300255,
"grad_norm": 0.21158701765416985,
"learning_rate": 7.544891766400827e-05,
"loss": 0.0956,
"step": 4800
},
{
"epoch": 0.3516964611455673,
"grad_norm": 0.4238803079261724,
"learning_rate": 7.524520240778587e-05,
"loss": 0.1174,
"step": 4820
},
{
"epoch": 0.3531557825611091,
"grad_norm": 0.3095728288400587,
"learning_rate": 7.504092313510697e-05,
"loss": 0.1155,
"step": 4840
},
{
"epoch": 0.35461510397665086,
"grad_norm": 0.4293980639357039,
"learning_rate": 7.483608440988316e-05,
"loss": 0.1079,
"step": 4860
},
{
"epoch": 0.35607442539219264,
"grad_norm": 0.16320882425277555,
"learning_rate": 7.463069080852503e-05,
"loss": 0.1044,
"step": 4880
},
{
"epoch": 0.3575337468077344,
"grad_norm": 0.1566741416723225,
"learning_rate": 7.442474691983996e-05,
"loss": 0.1043,
"step": 4900
},
{
"epoch": 0.3589930682232762,
"grad_norm": 0.20253896848392447,
"learning_rate": 7.421825734492963e-05,
"loss": 0.1061,
"step": 4920
},
{
"epoch": 0.3604523896388179,
"grad_norm": 0.23451042679193784,
"learning_rate": 7.40112266970871e-05,
"loss": 0.0984,
"step": 4940
},
{
"epoch": 0.3619117110543597,
"grad_norm": 0.31761969552147234,
"learning_rate": 7.380365960169391e-05,
"loss": 0.0982,
"step": 4960
},
{
"epoch": 0.3633710324699015,
"grad_norm": 0.19087054811891263,
"learning_rate": 7.35955606961166e-05,
"loss": 0.0834,
"step": 4980
},
{
"epoch": 0.36483035388544327,
"grad_norm": 0.26333137912323057,
"learning_rate": 7.338693462960324e-05,
"loss": 0.115,
"step": 5000
},
{
"epoch": 0.36628967530098505,
"grad_norm": 0.35946333892955823,
"learning_rate": 7.317778606317937e-05,
"loss": 0.109,
"step": 5020
},
{
"epoch": 0.36774899671652683,
"grad_norm": 0.29217079786710687,
"learning_rate": 7.296811966954411e-05,
"loss": 0.1061,
"step": 5040
},
{
"epoch": 0.3692083181320686,
"grad_norm": 0.16099974509802562,
"learning_rate": 7.27579401329655e-05,
"loss": 0.1023,
"step": 5060
},
{
"epoch": 0.37066763954761034,
"grad_norm": 0.14270002573644924,
"learning_rate": 7.254725214917607e-05,
"loss": 0.1363,
"step": 5080
},
{
"epoch": 0.3721269609631521,
"grad_norm": 0.2765615965792803,
"learning_rate": 7.233606042526781e-05,
"loss": 0.1226,
"step": 5100
},
{
"epoch": 0.3735862823786939,
"grad_norm": 0.352531222497615,
"learning_rate": 7.212436967958703e-05,
"loss": 0.116,
"step": 5120
},
{
"epoch": 0.3750456037942357,
"grad_norm": 0.3179852115996289,
"learning_rate": 7.191218464162897e-05,
"loss": 0.1106,
"step": 5140
},
{
"epoch": 0.37650492520977746,
"grad_norm": 0.21044459839378338,
"learning_rate": 7.169951005193207e-05,
"loss": 0.0794,
"step": 5160
},
{
"epoch": 0.37796424662531924,
"grad_norm": 0.29435428319089024,
"learning_rate": 7.148635066197216e-05,
"loss": 0.1098,
"step": 5180
},
{
"epoch": 0.379423568040861,
"grad_norm": 0.3678808726276575,
"learning_rate": 7.127271123405622e-05,
"loss": 0.1201,
"step": 5200
},
{
"epoch": 0.38088288945640275,
"grad_norm": 0.3444852709706006,
"learning_rate": 7.105859654121602e-05,
"loss": 0.1186,
"step": 5220
},
{
"epoch": 0.38234221087194453,
"grad_norm": 0.1407926086621674,
"learning_rate": 7.084401136710149e-05,
"loss": 0.1187,
"step": 5240
},
{
"epoch": 0.3838015322874863,
"grad_norm": 0.1677280774460152,
"learning_rate": 7.062896050587377e-05,
"loss": 0.1027,
"step": 5260
},
{
"epoch": 0.3852608537030281,
"grad_norm": 0.2039504177031652,
"learning_rate": 7.041344876209827e-05,
"loss": 0.0913,
"step": 5280
},
{
"epoch": 0.3867201751185699,
"grad_norm": 0.23188710135621454,
"learning_rate": 7.019748095063712e-05,
"loss": 0.1141,
"step": 5300
},
{
"epoch": 0.38817949653411166,
"grad_norm": 0.24724368926384838,
"learning_rate": 6.998106189654176e-05,
"loss": 0.1037,
"step": 5320
},
{
"epoch": 0.38963881794965344,
"grad_norm": 0.32541077271508595,
"learning_rate": 6.976419643494504e-05,
"loss": 0.1199,
"step": 5340
},
{
"epoch": 0.39109813936519516,
"grad_norm": 0.2385801144209165,
"learning_rate": 6.954688941095327e-05,
"loss": 0.0933,
"step": 5360
},
{
"epoch": 0.39255746078073694,
"grad_norm": 0.20684187377778415,
"learning_rate": 6.932914567953792e-05,
"loss": 0.1046,
"step": 5380
},
{
"epoch": 0.3940167821962787,
"grad_norm": 0.22717043667652123,
"learning_rate": 6.91109701054272e-05,
"loss": 0.0973,
"step": 5400
},
{
"epoch": 0.3954761036118205,
"grad_norm": 0.1784316153067281,
"learning_rate": 6.889236756299732e-05,
"loss": 0.122,
"step": 5420
},
{
"epoch": 0.3969354250273623,
"grad_norm": 0.2992073258265761,
"learning_rate": 6.867334293616361e-05,
"loss": 0.1064,
"step": 5440
},
{
"epoch": 0.39839474644290407,
"grad_norm": 0.35809119189138666,
"learning_rate": 6.845390111827142e-05,
"loss": 0.1215,
"step": 5460
},
{
"epoch": 0.39985406785844585,
"grad_norm": 0.24770942776501734,
"learning_rate": 6.823404701198683e-05,
"loss": 0.1015,
"step": 5480
},
{
"epoch": 0.4013133892739876,
"grad_norm": 0.2676532625134847,
"learning_rate": 6.801378552918697e-05,
"loss": 0.1024,
"step": 5500
},
{
"epoch": 0.40277271068952936,
"grad_norm": 0.17695706700427993,
"learning_rate": 6.779312159085051e-05,
"loss": 0.0866,
"step": 5520
},
{
"epoch": 0.40423203210507114,
"grad_norm": 0.22513044418817704,
"learning_rate": 6.757206012694751e-05,
"loss": 0.0898,
"step": 5540
},
{
"epoch": 0.4056913535206129,
"grad_norm": 0.24221056958067821,
"learning_rate": 6.735060607632937e-05,
"loss": 0.0923,
"step": 5560
},
{
"epoch": 0.4071506749361547,
"grad_norm": 0.2516010085453918,
"learning_rate": 6.71287643866185e-05,
"loss": 0.0821,
"step": 5580
},
{
"epoch": 0.4086099963516965,
"grad_norm": 0.16792435621352286,
"learning_rate": 6.690654001409773e-05,
"loss": 0.1064,
"step": 5600
},
{
"epoch": 0.41006931776723826,
"grad_norm": 0.24148115931630454,
"learning_rate": 6.668393792359967e-05,
"loss": 0.1002,
"step": 5620
},
{
"epoch": 0.41152863918278,
"grad_norm": 0.20454443340824457,
"learning_rate": 6.646096308839564e-05,
"loss": 0.0955,
"step": 5640
},
{
"epoch": 0.41298796059832177,
"grad_norm": 0.22632163953996756,
"learning_rate": 6.623762049008475e-05,
"loss": 0.1067,
"step": 5660
},
{
"epoch": 0.41444728201386355,
"grad_norm": 0.18189039381318844,
"learning_rate": 6.60139151184824e-05,
"loss": 0.0999,
"step": 5680
},
{
"epoch": 0.41590660342940533,
"grad_norm": 0.2179009200010762,
"learning_rate": 6.578985197150893e-05,
"loss": 0.0897,
"step": 5700
},
{
"epoch": 0.4173659248449471,
"grad_norm": 0.25728882549010346,
"learning_rate": 6.5565436055078e-05,
"loss": 0.092,
"step": 5720
},
{
"epoch": 0.4188252462604889,
"grad_norm": 0.21132545096135683,
"learning_rate": 6.53406723829846e-05,
"loss": 0.1093,
"step": 5740
},
{
"epoch": 0.4202845676760306,
"grad_norm": 0.3259354513604496,
"learning_rate": 6.511556597679313e-05,
"loss": 0.0939,
"step": 5760
},
{
"epoch": 0.4217438890915724,
"grad_norm": 0.14428503360326553,
"learning_rate": 6.48901218657252e-05,
"loss": 0.113,
"step": 5780
},
{
"epoch": 0.4232032105071142,
"grad_norm": 0.20596362998777773,
"learning_rate": 6.466434508654729e-05,
"loss": 0.1221,
"step": 5800
},
{
"epoch": 0.42466253192265596,
"grad_norm": 0.2076732092116106,
"learning_rate": 6.443824068345814e-05,
"loss": 0.0981,
"step": 5820
},
{
"epoch": 0.42612185333819774,
"grad_norm": 0.23210274197312583,
"learning_rate": 6.421181370797616e-05,
"loss": 0.1091,
"step": 5840
},
{
"epoch": 0.4275811747537395,
"grad_norm": 0.34773891076479957,
"learning_rate": 6.39850692188265e-05,
"loss": 0.1152,
"step": 5860
},
{
"epoch": 0.4290404961692813,
"grad_norm": 0.25768381087645986,
"learning_rate": 6.375801228182804e-05,
"loss": 0.0833,
"step": 5880
},
{
"epoch": 0.43049981758482303,
"grad_norm": 0.20272718271445447,
"learning_rate": 6.353064796978025e-05,
"loss": 0.0821,
"step": 5900
},
{
"epoch": 0.4319591390003648,
"grad_norm": 0.3769192263755196,
"learning_rate": 6.330298136234981e-05,
"loss": 0.1047,
"step": 5920
},
{
"epoch": 0.4334184604159066,
"grad_norm": 0.18218184597043124,
"learning_rate": 6.307501754595712e-05,
"loss": 0.1114,
"step": 5940
},
{
"epoch": 0.4348777818314484,
"grad_norm": 0.20670653864614483,
"learning_rate": 6.284676161366276e-05,
"loss": 0.0885,
"step": 5960
},
{
"epoch": 0.43633710324699015,
"grad_norm": 0.21541995858015078,
"learning_rate": 6.261821866505353e-05,
"loss": 0.1153,
"step": 5980
},
{
"epoch": 0.43779642466253194,
"grad_norm": 0.2073422866021902,
"learning_rate": 6.23893938061287e-05,
"loss": 0.0958,
"step": 6000
},
{
"epoch": 0.4392557460780737,
"grad_norm": 0.3614983325797712,
"learning_rate": 6.216029214918576e-05,
"loss": 0.1039,
"step": 6020
},
{
"epoch": 0.44071506749361544,
"grad_norm": 0.25937791265177973,
"learning_rate": 6.193091881270639e-05,
"loss": 0.0884,
"step": 6040
},
{
"epoch": 0.4421743889091572,
"grad_norm": 0.27236848714431067,
"learning_rate": 6.17012789212419e-05,
"loss": 0.104,
"step": 6060
},
{
"epoch": 0.443633710324699,
"grad_norm": 0.2193972961735453,
"learning_rate": 6.147137760529893e-05,
"loss": 0.1051,
"step": 6080
},
{
"epoch": 0.4450930317402408,
"grad_norm": 0.3288936349913114,
"learning_rate": 6.124122000122474e-05,
"loss": 0.1194,
"step": 6100
},
{
"epoch": 0.44655235315578257,
"grad_norm": 0.28030638179884143,
"learning_rate": 6.101081125109238e-05,
"loss": 0.0987,
"step": 6120
},
{
"epoch": 0.44801167457132435,
"grad_norm": 0.28963057861922986,
"learning_rate": 6.0780156502585974e-05,
"loss": 0.104,
"step": 6140
},
{
"epoch": 0.44947099598686613,
"grad_norm": 0.2107381404732794,
"learning_rate": 6.054926090888559e-05,
"loss": 0.0944,
"step": 6160
},
{
"epoch": 0.45093031740240785,
"grad_norm": 0.22426758617758438,
"learning_rate": 6.031812962855212e-05,
"loss": 0.1088,
"step": 6180
},
{
"epoch": 0.45238963881794964,
"grad_norm": 0.19433617911544113,
"learning_rate": 6.008676782541214e-05,
"loss": 0.0934,
"step": 6200
},
{
"epoch": 0.4538489602334914,
"grad_norm": 0.16786745480860363,
"learning_rate": 5.985518066844235e-05,
"loss": 0.1065,
"step": 6220
},
{
"epoch": 0.4553082816490332,
"grad_norm": 0.28967266828058946,
"learning_rate": 5.9623373331654296e-05,
"loss": 0.1104,
"step": 6240
},
{
"epoch": 0.456767603064575,
"grad_norm": 0.2857631028326485,
"learning_rate": 5.9391350993978586e-05,
"loss": 0.1059,
"step": 6260
},
{
"epoch": 0.45822692448011676,
"grad_norm": 0.16887743736632202,
"learning_rate": 5.915911883914937e-05,
"loss": 0.0921,
"step": 6280
},
{
"epoch": 0.45968624589565854,
"grad_norm": 0.45462588569274826,
"learning_rate": 5.892668205558838e-05,
"loss": 0.1062,
"step": 6300
},
{
"epoch": 0.46114556731120027,
"grad_norm": 0.1874149476062009,
"learning_rate": 5.869404583628906e-05,
"loss": 0.0877,
"step": 6320
},
{
"epoch": 0.46260488872674205,
"grad_norm": 0.2035453150986139,
"learning_rate": 5.846121537870059e-05,
"loss": 0.0826,
"step": 6340
},
{
"epoch": 0.46406421014228383,
"grad_norm": 0.2009650369528326,
"learning_rate": 5.822819588461167e-05,
"loss": 0.0988,
"step": 6360
},
{
"epoch": 0.4655235315578256,
"grad_norm": 0.33321536680847463,
"learning_rate": 5.799499256003447e-05,
"loss": 0.0827,
"step": 6380
},
{
"epoch": 0.4669828529733674,
"grad_norm": 0.20073386151251446,
"learning_rate": 5.77616106150881e-05,
"loss": 0.0831,
"step": 6400
},
{
"epoch": 0.46844217438890917,
"grad_norm": 0.32718978675294735,
"learning_rate": 5.7528055263882394e-05,
"loss": 0.1012,
"step": 6420
},
{
"epoch": 0.46990149580445095,
"grad_norm": 0.21478649269135933,
"learning_rate": 5.729433172440133e-05,
"loss": 0.1003,
"step": 6440
},
{
"epoch": 0.4713608172199927,
"grad_norm": 0.35055435747379776,
"learning_rate": 5.706044521838645e-05,
"loss": 0.1186,
"step": 6460
},
{
"epoch": 0.47282013863553446,
"grad_norm": 0.17583973159572328,
"learning_rate": 5.682640097122024e-05,
"loss": 0.1025,
"step": 6480
},
{
"epoch": 0.47427946005107624,
"grad_norm": 0.19007944644812688,
"learning_rate": 5.659220421180935e-05,
"loss": 0.0897,
"step": 6500
},
{
"epoch": 0.475738781466618,
"grad_norm": 0.20795427294782404,
"learning_rate": 5.635786017246782e-05,
"loss": 0.1066,
"step": 6520
},
{
"epoch": 0.4771981028821598,
"grad_norm": 0.18309256793902484,
"learning_rate": 5.612337408880011e-05,
"loss": 0.0845,
"step": 6540
},
{
"epoch": 0.4786574242977016,
"grad_norm": 0.3124216891183694,
"learning_rate": 5.5888751199584156e-05,
"loss": 0.1096,
"step": 6560
},
{
"epoch": 0.48011674571324336,
"grad_norm": 0.18822513299788007,
"learning_rate": 5.56539967466544e-05,
"loss": 0.0957,
"step": 6580
},
{
"epoch": 0.4815760671287851,
"grad_norm": 0.26561985090282353,
"learning_rate": 5.541911597478458e-05,
"loss": 0.0862,
"step": 6600
},
{
"epoch": 0.48303538854432687,
"grad_norm": 0.2720185239193176,
"learning_rate": 5.5184114131570574e-05,
"loss": 0.0968,
"step": 6620
},
{
"epoch": 0.48449470995986865,
"grad_norm": 0.23325174950351915,
"learning_rate": 5.494899646731322e-05,
"loss": 0.0987,
"step": 6640
},
{
"epoch": 0.48595403137541043,
"grad_norm": 0.5924963942808815,
"learning_rate": 5.4713768234900956e-05,
"loss": 0.0865,
"step": 6660
},
{
"epoch": 0.4874133527909522,
"grad_norm": 0.2626819591515115,
"learning_rate": 5.447843468969247e-05,
"loss": 0.0933,
"step": 6680
},
{
"epoch": 0.488872674206494,
"grad_norm": 0.30178159697329016,
"learning_rate": 5.4243001089399305e-05,
"loss": 0.094,
"step": 6700
},
{
"epoch": 0.4903319956220358,
"grad_norm": 0.32298125922899934,
"learning_rate": 5.400747269396842e-05,
"loss": 0.0892,
"step": 6720
},
{
"epoch": 0.4917913170375775,
"grad_norm": 0.29452417657729046,
"learning_rate": 5.37718547654646e-05,
"loss": 0.1021,
"step": 6740
},
{
"epoch": 0.4932506384531193,
"grad_norm": 0.15177550207237628,
"learning_rate": 5.353615256795297e-05,
"loss": 0.0992,
"step": 6760
},
{
"epoch": 0.49470995986866106,
"grad_norm": 0.3145374189570284,
"learning_rate": 5.3300371367381306e-05,
"loss": 0.0978,
"step": 6780
},
{
"epoch": 0.49616928128420285,
"grad_norm": 0.28863235717217767,
"learning_rate": 5.306451643146247e-05,
"loss": 0.0908,
"step": 6800
},
{
"epoch": 0.4976286026997446,
"grad_norm": 0.20954696658671135,
"learning_rate": 5.2828593029556705e-05,
"loss": 0.1084,
"step": 6820
},
{
"epoch": 0.4990879241152864,
"grad_norm": 0.2199618710390789,
"learning_rate": 5.2592606432553846e-05,
"loss": 0.0972,
"step": 6840
},
{
"epoch": 0.5005472455308282,
"grad_norm": 0.20271409907110083,
"learning_rate": 5.235656191275561e-05,
"loss": 0.0999,
"step": 6860
},
{
"epoch": 0.5020065669463699,
"grad_norm": 0.29349205157450786,
"learning_rate": 5.21204647437578e-05,
"loss": 0.0931,
"step": 6880
},
{
"epoch": 0.5034658883619118,
"grad_norm": 0.33489812333335206,
"learning_rate": 5.1884320200332517e-05,
"loss": 0.0996,
"step": 6900
},
{
"epoch": 0.5049252097774535,
"grad_norm": 0.19799483926382305,
"learning_rate": 5.164813355831023e-05,
"loss": 0.1108,
"step": 6920
},
{
"epoch": 0.5063845311929952,
"grad_norm": 0.3720539958012567,
"learning_rate": 5.141191009446198e-05,
"loss": 0.1104,
"step": 6940
},
{
"epoch": 0.507843852608537,
"grad_norm": 0.2954464123920451,
"learning_rate": 5.1175655086381466e-05,
"loss": 0.11,
"step": 6960
},
{
"epoch": 0.5093031740240788,
"grad_norm": 0.1642741168702313,
"learning_rate": 5.093937381236712e-05,
"loss": 0.1031,
"step": 6980
},
{
"epoch": 0.5107624954396206,
"grad_norm": 0.20935330232427551,
"learning_rate": 5.0703071551304214e-05,
"loss": 0.0978,
"step": 7000
},
{
"epoch": 0.5122218168551623,
"grad_norm": 0.1993779996730668,
"learning_rate": 5.04667535825469e-05,
"loss": 0.0972,
"step": 7020
},
{
"epoch": 0.5136811382707042,
"grad_norm": 0.19115582036939682,
"learning_rate": 5.023042518580022e-05,
"loss": 0.0847,
"step": 7040
},
{
"epoch": 0.5151404596862459,
"grad_norm": 0.27120517340424255,
"learning_rate": 4.999409164100226e-05,
"loss": 0.1042,
"step": 7060
},
{
"epoch": 0.5165997811017876,
"grad_norm": 0.10934691309913587,
"learning_rate": 4.9757758228206084e-05,
"loss": 0.081,
"step": 7080
},
{
"epoch": 0.5180591025173295,
"grad_norm": 0.24394002533972173,
"learning_rate": 4.952143022746181e-05,
"loss": 0.094,
"step": 7100
},
{
"epoch": 0.5195184239328712,
"grad_norm": 0.16029024193022853,
"learning_rate": 4.928511291869865e-05,
"loss": 0.0826,
"step": 7120
},
{
"epoch": 0.520977745348413,
"grad_norm": 0.23900126446494024,
"learning_rate": 4.9048811581606934e-05,
"loss": 0.0961,
"step": 7140
},
{
"epoch": 0.5224370667639547,
"grad_norm": 0.19889558694631473,
"learning_rate": 4.8812531495520155e-05,
"loss": 0.1087,
"step": 7160
},
{
"epoch": 0.5238963881794966,
"grad_norm": 0.23105375198475658,
"learning_rate": 4.857627793929705e-05,
"loss": 0.0869,
"step": 7180
},
{
"epoch": 0.5253557095950383,
"grad_norm": 0.22834103771419306,
"learning_rate": 4.8340056191203615e-05,
"loss": 0.0899,
"step": 7200
},
{
"epoch": 0.52681503101058,
"grad_norm": 0.14832148410977408,
"learning_rate": 4.810387152879521e-05,
"loss": 0.0824,
"step": 7220
},
{
"epoch": 0.5282743524261219,
"grad_norm": 0.14992555758737747,
"learning_rate": 4.786772922879863e-05,
"loss": 0.0887,
"step": 7240
},
{
"epoch": 0.5297336738416636,
"grad_norm": 0.2316964809656849,
"learning_rate": 4.763163456699427e-05,
"loss": 0.1093,
"step": 7260
},
{
"epoch": 0.5311929952572054,
"grad_norm": 0.1330630758475358,
"learning_rate": 4.739559281809818e-05,
"loss": 0.1009,
"step": 7280
},
{
"epoch": 0.5326523166727472,
"grad_norm": 0.18333429681957206,
"learning_rate": 4.715960925564427e-05,
"loss": 0.1004,
"step": 7300
},
{
"epoch": 0.534111638088289,
"grad_norm": 0.270128715362079,
"learning_rate": 4.6923689151866444e-05,
"loss": 0.1018,
"step": 7320
},
{
"epoch": 0.5355709595038307,
"grad_norm": 0.23984397395024673,
"learning_rate": 4.6687837777580886e-05,
"loss": 0.0887,
"step": 7340
},
{
"epoch": 0.5370302809193724,
"grad_norm": 0.4475338081181222,
"learning_rate": 4.645206040206824e-05,
"loss": 0.1036,
"step": 7360
},
{
"epoch": 0.5384896023349143,
"grad_norm": 0.18351540680366502,
"learning_rate": 4.621636229295591e-05,
"loss": 0.0868,
"step": 7380
},
{
"epoch": 0.539948923750456,
"grad_norm": 0.32433205707718027,
"learning_rate": 4.5980748716100346e-05,
"loss": 0.112,
"step": 7400
},
{
"epoch": 0.5414082451659978,
"grad_norm": 0.26936021946381955,
"learning_rate": 4.574522493546944e-05,
"loss": 0.0752,
"step": 7420
},
{
"epoch": 0.5428675665815396,
"grad_norm": 0.23965825882528494,
"learning_rate": 4.550979621302488e-05,
"loss": 0.0987,
"step": 7440
},
{
"epoch": 0.5443268879970814,
"grad_norm": 0.15591543995056015,
"learning_rate": 4.527446780860464e-05,
"loss": 0.1019,
"step": 7460
},
{
"epoch": 0.5457862094126231,
"grad_norm": 0.1448561831202914,
"learning_rate": 4.5039244979805403e-05,
"loss": 0.0764,
"step": 7480
},
{
"epoch": 0.5472455308281649,
"grad_norm": 0.13626905980903883,
"learning_rate": 4.480413298186516e-05,
"loss": 0.0774,
"step": 7500
},
{
"epoch": 0.5487048522437067,
"grad_norm": 0.28395775526753664,
"learning_rate": 4.456913706754573e-05,
"loss": 0.0746,
"step": 7520
},
{
"epoch": 0.5501641736592484,
"grad_norm": 0.3285254794237905,
"learning_rate": 4.4334262487015474e-05,
"loss": 0.0835,
"step": 7540
},
{
"epoch": 0.5516234950747902,
"grad_norm": 0.2496952670799407,
"learning_rate": 4.4099514487732e-05,
"loss": 0.0935,
"step": 7560
},
{
"epoch": 0.553082816490332,
"grad_norm": 0.19932665975830546,
"learning_rate": 4.386489831432483e-05,
"loss": 0.0921,
"step": 7580
},
{
"epoch": 0.5545421379058738,
"grad_norm": 0.18400201820649104,
"learning_rate": 4.3630419208478356e-05,
"loss": 0.0919,
"step": 7600
},
{
"epoch": 0.5560014593214155,
"grad_norm": 0.22155855058445367,
"learning_rate": 4.339608240881462e-05,
"loss": 0.0764,
"step": 7620
},
{
"epoch": 0.5574607807369573,
"grad_norm": 0.19580138253597132,
"learning_rate": 4.316189315077636e-05,
"loss": 0.0897,
"step": 7640
},
{
"epoch": 0.5589201021524991,
"grad_norm": 0.19401483076890932,
"learning_rate": 4.2927856666510005e-05,
"loss": 0.0757,
"step": 7660
},
{
"epoch": 0.5603794235680408,
"grad_norm": 0.20987182850074507,
"learning_rate": 4.269397818474878e-05,
"loss": 0.0882,
"step": 7680
},
{
"epoch": 0.5618387449835827,
"grad_norm": 0.35798764065337424,
"learning_rate": 4.246026293069588e-05,
"loss": 0.0966,
"step": 7700
},
{
"epoch": 0.5632980663991244,
"grad_norm": 0.257029410117375,
"learning_rate": 4.222671612590775e-05,
"loss": 0.0947,
"step": 7720
},
{
"epoch": 0.5647573878146662,
"grad_norm": 0.1521057062552542,
"learning_rate": 4.1993342988177434e-05,
"loss": 0.0804,
"step": 7740
},
{
"epoch": 0.566216709230208,
"grad_norm": 0.1742108319695617,
"learning_rate": 4.176014873141798e-05,
"loss": 0.0866,
"step": 7760
},
{
"epoch": 0.5676760306457497,
"grad_norm": 0.13073108248641252,
"learning_rate": 4.152713856554595e-05,
"loss": 0.0878,
"step": 7780
},
{
"epoch": 0.5691353520612915,
"grad_norm": 0.24385879986625963,
"learning_rate": 4.129431769636505e-05,
"loss": 0.0854,
"step": 7800
},
{
"epoch": 0.5705946734768332,
"grad_norm": 0.22983093362986062,
"learning_rate": 4.106169132544979e-05,
"loss": 0.0882,
"step": 7820
},
{
"epoch": 0.5720539948923751,
"grad_norm": 0.2362534084443868,
"learning_rate": 4.082926465002932e-05,
"loss": 0.0841,
"step": 7840
},
{
"epoch": 0.5735133163079168,
"grad_norm": 0.19860411744987813,
"learning_rate": 4.0597042862871257e-05,
"loss": 0.0911,
"step": 7860
},
{
"epoch": 0.5749726377234586,
"grad_norm": 0.14469692761467015,
"learning_rate": 4.0365031152165724e-05,
"loss": 0.0705,
"step": 7880
},
{
"epoch": 0.5764319591390004,
"grad_norm": 0.3026702131043989,
"learning_rate": 4.0133234701409386e-05,
"loss": 0.1141,
"step": 7900
},
{
"epoch": 0.5778912805545421,
"grad_norm": 0.20135466725204315,
"learning_rate": 3.99016586892897e-05,
"loss": 0.0981,
"step": 7920
},
{
"epoch": 0.5793506019700839,
"grad_norm": 0.2627174484653081,
"learning_rate": 3.967030828956918e-05,
"loss": 0.0886,
"step": 7940
},
{
"epoch": 0.5808099233856256,
"grad_norm": 0.13106325907585273,
"learning_rate": 3.943918867096981e-05,
"loss": 0.098,
"step": 7960
},
{
"epoch": 0.5822692448011675,
"grad_norm": 0.17654659766268968,
"learning_rate": 3.9208304997057566e-05,
"loss": 0.093,
"step": 7980
},
{
"epoch": 0.5837285662167092,
"grad_norm": 0.20015436956986157,
"learning_rate": 3.897766242612706e-05,
"loss": 0.0874,
"step": 8000
},
{
"epoch": 0.585187887632251,
"grad_norm": 0.17037302015456898,
"learning_rate": 3.874726611108628e-05,
"loss": 0.0913,
"step": 8020
},
{
"epoch": 0.5866472090477928,
"grad_norm": 0.1687960234761524,
"learning_rate": 3.8517121199341535e-05,
"loss": 0.0786,
"step": 8040
},
{
"epoch": 0.5881065304633345,
"grad_norm": 0.29620459255170106,
"learning_rate": 3.8287232832682335e-05,
"loss": 0.0905,
"step": 8060
},
{
"epoch": 0.5895658518788763,
"grad_norm": 0.1947016220691186,
"learning_rate": 3.805760614716662e-05,
"loss": 0.0852,
"step": 8080
},
{
"epoch": 0.5910251732944181,
"grad_norm": 0.14437887795334717,
"learning_rate": 3.782824627300593e-05,
"loss": 0.0931,
"step": 8100
},
{
"epoch": 0.5924844947099599,
"grad_norm": 0.21909261220655682,
"learning_rate": 3.759915833445092e-05,
"loss": 0.0878,
"step": 8120
},
{
"epoch": 0.5939438161255016,
"grad_norm": 0.14839231943289036,
"learning_rate": 3.737034744967669e-05,
"loss": 0.0962,
"step": 8140
},
{
"epoch": 0.5954031375410435,
"grad_norm": 0.23104939520938056,
"learning_rate": 3.714181873066857e-05,
"loss": 0.0912,
"step": 8160
},
{
"epoch": 0.5968624589565852,
"grad_norm": 0.25933402155627416,
"learning_rate": 3.691357728310789e-05,
"loss": 0.081,
"step": 8180
},
{
"epoch": 0.5983217803721269,
"grad_norm": 0.24053747298935832,
"learning_rate": 3.668562820625785e-05,
"loss": 0.0855,
"step": 8200
},
{
"epoch": 0.5997811017876687,
"grad_norm": 0.23989767806744336,
"learning_rate": 3.6457976592849754e-05,
"loss": 0.0983,
"step": 8220
},
{
"epoch": 0.6012404232032105,
"grad_norm": 0.3322203458314159,
"learning_rate": 3.6230627528968964e-05,
"loss": 0.1073,
"step": 8240
},
{
"epoch": 0.6026997446187523,
"grad_norm": 0.2388704578332255,
"learning_rate": 3.6003586093941534e-05,
"loss": 0.0839,
"step": 8260
},
{
"epoch": 0.604159066034294,
"grad_norm": 0.29568333505671857,
"learning_rate": 3.577685736022056e-05,
"loss": 0.0986,
"step": 8280
},
{
"epoch": 0.6056183874498359,
"grad_norm": 0.2764304009893813,
"learning_rate": 3.555044639327293e-05,
"loss": 0.0914,
"step": 8300
},
{
"epoch": 0.6070777088653776,
"grad_norm": 0.1994542234940755,
"learning_rate": 3.532435825146618e-05,
"loss": 0.0722,
"step": 8320
},
{
"epoch": 0.6085370302809193,
"grad_norm": 0.1850674654066651,
"learning_rate": 3.509859798595537e-05,
"loss": 0.1007,
"step": 8340
},
{
"epoch": 0.6099963516964612,
"grad_norm": 0.2602392735140372,
"learning_rate": 3.487317064057033e-05,
"loss": 0.0795,
"step": 8360
},
{
"epoch": 0.6114556731120029,
"grad_norm": 0.236171327726822,
"learning_rate": 3.464808125170295e-05,
"loss": 0.0868,
"step": 8380
},
{
"epoch": 0.6129149945275447,
"grad_norm": 0.24511982789375833,
"learning_rate": 3.442333484819462e-05,
"loss": 0.1099,
"step": 8400
},
{
"epoch": 0.6143743159430864,
"grad_norm": 0.18929289550803868,
"learning_rate": 3.4198936451224006e-05,
"loss": 0.0639,
"step": 8420
},
{
"epoch": 0.6158336373586283,
"grad_norm": 0.29573231890882484,
"learning_rate": 3.397489107419466e-05,
"loss": 0.086,
"step": 8440
},
{
"epoch": 0.61729295877417,
"grad_norm": 0.15110985532433188,
"learning_rate": 3.3751203722623185e-05,
"loss": 0.0826,
"step": 8460
},
{
"epoch": 0.6187522801897117,
"grad_norm": 0.2240730141612783,
"learning_rate": 3.352787939402734e-05,
"loss": 0.1002,
"step": 8480
},
{
"epoch": 0.6202116016052536,
"grad_norm": 0.17617940816910643,
"learning_rate": 3.330492307781442e-05,
"loss": 0.0814,
"step": 8500
},
{
"epoch": 0.6216709230207953,
"grad_norm": 0.20532201236991618,
"learning_rate": 3.3082339755169724e-05,
"loss": 0.0866,
"step": 8520
},
{
"epoch": 0.6231302444363371,
"grad_norm": 0.2625908408503322,
"learning_rate": 3.286013439894532e-05,
"loss": 0.0824,
"step": 8540
},
{
"epoch": 0.6245895658518789,
"grad_norm": 0.1370998701397795,
"learning_rate": 3.2638311973548904e-05,
"loss": 0.0775,
"step": 8560
},
{
"epoch": 0.6260488872674207,
"grad_norm": 0.16525999384073028,
"learning_rate": 3.241687743483293e-05,
"loss": 0.0859,
"step": 8580
},
{
"epoch": 0.6275082086829624,
"grad_norm": 0.1019124816557733,
"learning_rate": 3.2195835729983914e-05,
"loss": 0.0758,
"step": 8600
},
{
"epoch": 0.6289675300985041,
"grad_norm": 0.16025292416956477,
"learning_rate": 3.1975191797411786e-05,
"loss": 0.0768,
"step": 8620
},
{
"epoch": 0.630426851514046,
"grad_norm": 0.16545554798031012,
"learning_rate": 3.1754950566639685e-05,
"loss": 0.0736,
"step": 8640
},
{
"epoch": 0.6318861729295877,
"grad_norm": 0.17355211388140368,
"learning_rate": 3.153511695819374e-05,
"loss": 0.0735,
"step": 8660
},
{
"epoch": 0.6333454943451295,
"grad_norm": 0.4006051455854273,
"learning_rate": 3.131569588349319e-05,
"loss": 0.0765,
"step": 8680
},
{
"epoch": 0.6348048157606713,
"grad_norm": 0.24052078851822786,
"learning_rate": 3.1096692244740664e-05,
"loss": 0.1022,
"step": 8700
},
{
"epoch": 0.6362641371762131,
"grad_norm": 0.1533414015506768,
"learning_rate": 3.08781109348126e-05,
"loss": 0.0809,
"step": 8720
},
{
"epoch": 0.6377234585917548,
"grad_norm": 0.1340242757509518,
"learning_rate": 3.0659956837149985e-05,
"loss": 0.0781,
"step": 8740
},
{
"epoch": 0.6391827800072966,
"grad_norm": 0.1691744273675545,
"learning_rate": 3.0442234825649185e-05,
"loss": 0.0905,
"step": 8760
},
{
"epoch": 0.6406421014228384,
"grad_norm": 0.23981672728497094,
"learning_rate": 3.0224949764553144e-05,
"loss": 0.0892,
"step": 8780
},
{
"epoch": 0.6421014228383801,
"grad_norm": 0.24286106288672432,
"learning_rate": 3.000810650834269e-05,
"loss": 0.0817,
"step": 8800
},
{
"epoch": 0.643560744253922,
"grad_norm": 0.1455899222141873,
"learning_rate": 2.979170990162799e-05,
"loss": 0.0836,
"step": 8820
},
{
"epoch": 0.6450200656694637,
"grad_norm": 0.22817756184035107,
"learning_rate": 2.9575764779040427e-05,
"loss": 0.0789,
"step": 8840
},
{
"epoch": 0.6464793870850055,
"grad_norm": 0.24718132089424502,
"learning_rate": 2.9360275965124484e-05,
"loss": 0.0966,
"step": 8860
},
{
"epoch": 0.6479387085005472,
"grad_norm": 0.2830882130796971,
"learning_rate": 2.914524827423006e-05,
"loss": 0.0844,
"step": 8880
},
{
"epoch": 0.649398029916089,
"grad_norm": 0.13154886405230762,
"learning_rate": 2.8930686510404848e-05,
"loss": 0.0882,
"step": 8900
},
{
"epoch": 0.6508573513316308,
"grad_norm": 0.1628509299026824,
"learning_rate": 2.871659546728701e-05,
"loss": 0.1051,
"step": 8920
},
{
"epoch": 0.6523166727471725,
"grad_norm": 0.2870676908057582,
"learning_rate": 2.8502979927998096e-05,
"loss": 0.0856,
"step": 8940
},
{
"epoch": 0.6537759941627144,
"grad_norm": 0.25963714561124496,
"learning_rate": 2.8289844665036136e-05,
"loss": 0.0961,
"step": 8960
},
{
"epoch": 0.6552353155782561,
"grad_norm": 0.2751758771997854,
"learning_rate": 2.8077194440169117e-05,
"loss": 0.0788,
"step": 8980
},
{
"epoch": 0.6566946369937979,
"grad_norm": 0.30269194089469403,
"learning_rate": 2.7865034004328496e-05,
"loss": 0.0832,
"step": 9000
},
{
"epoch": 0.6581539584093397,
"grad_norm": 0.21562618811033948,
"learning_rate": 2.7653368097503085e-05,
"loss": 0.0885,
"step": 9020
},
{
"epoch": 0.6596132798248814,
"grad_norm": 0.2128702249563008,
"learning_rate": 2.7442201448633165e-05,
"loss": 0.0847,
"step": 9040
},
{
"epoch": 0.6610726012404232,
"grad_norm": 0.19016118127632922,
"learning_rate": 2.7231538775504846e-05,
"loss": 0.0836,
"step": 9060
},
{
"epoch": 0.6625319226559649,
"grad_norm": 0.1497778616340849,
"learning_rate": 2.7021384784644632e-05,
"loss": 0.0754,
"step": 9080
},
{
"epoch": 0.6639912440715068,
"grad_norm": 0.23437427731049273,
"learning_rate": 2.6811744171214303e-05,
"loss": 0.0734,
"step": 9100
},
{
"epoch": 0.6654505654870485,
"grad_norm": 0.3171739171707465,
"learning_rate": 2.6602621618905988e-05,
"loss": 0.0907,
"step": 9120
},
{
"epoch": 0.6669098869025903,
"grad_norm": 0.34300590939562675,
"learning_rate": 2.639402179983754e-05,
"loss": 0.0913,
"step": 9140
},
{
"epoch": 0.6683692083181321,
"grad_norm": 0.2917703835582748,
"learning_rate": 2.6185949374448136e-05,
"loss": 0.0789,
"step": 9160
},
{
"epoch": 0.6698285297336738,
"grad_norm": 0.22378572737202726,
"learning_rate": 2.5978408991394233e-05,
"loss": 0.0815,
"step": 9180
},
{
"epoch": 0.6712878511492156,
"grad_norm": 0.12173835938546967,
"learning_rate": 2.5771405287445576e-05,
"loss": 0.0758,
"step": 9200
},
{
"epoch": 0.6727471725647574,
"grad_norm": 0.14662571413630113,
"learning_rate": 2.5564942887381705e-05,
"loss": 0.0714,
"step": 9220
},
{
"epoch": 0.6742064939802992,
"grad_norm": 0.27764203523675757,
"learning_rate": 2.535902640388861e-05,
"loss": 0.089,
"step": 9240
},
{
"epoch": 0.6756658153958409,
"grad_norm": 0.19946437768799996,
"learning_rate": 2.5153660437455634e-05,
"loss": 0.0703,
"step": 9260
},
{
"epoch": 0.6771251368113828,
"grad_norm": 0.20290178502096412,
"learning_rate": 2.494884957627282e-05,
"loss": 0.0821,
"step": 9280
},
{
"epoch": 0.6785844582269245,
"grad_norm": 0.2154294699026174,
"learning_rate": 2.4744598396128183e-05,
"loss": 0.0974,
"step": 9300
},
{
"epoch": 0.6800437796424662,
"grad_norm": 0.2511874205987635,
"learning_rate": 2.4540911460305694e-05,
"loss": 0.0825,
"step": 9320
},
{
"epoch": 0.681503101058008,
"grad_norm": 0.1810086992559282,
"learning_rate": 2.4337793319483186e-05,
"loss": 0.0874,
"step": 9340
},
{
"epoch": 0.6829624224735498,
"grad_norm": 0.16642882654866892,
"learning_rate": 2.4135248511630824e-05,
"loss": 0.0736,
"step": 9360
},
{
"epoch": 0.6844217438890916,
"grad_norm": 0.18730698333216458,
"learning_rate": 2.3933281561909566e-05,
"loss": 0.0682,
"step": 9380
},
{
"epoch": 0.6858810653046333,
"grad_norm": 0.1991805058045345,
"learning_rate": 2.373189698257014e-05,
"loss": 0.0763,
"step": 9400
},
{
"epoch": 0.6873403867201752,
"grad_norm": 0.1692253705188508,
"learning_rate": 2.353109927285226e-05,
"loss": 0.0825,
"step": 9420
},
{
"epoch": 0.6887997081357169,
"grad_norm": 0.24581712588330082,
"learning_rate": 2.333089291888403e-05,
"loss": 0.072,
"step": 9440
},
{
"epoch": 0.6902590295512586,
"grad_norm": 0.15001533830747885,
"learning_rate": 2.3131282393581822e-05,
"loss": 0.0835,
"step": 9460
},
{
"epoch": 0.6917183509668005,
"grad_norm": 0.2572418200751706,
"learning_rate": 2.293227215655026e-05,
"loss": 0.0777,
"step": 9480
},
{
"epoch": 0.6931776723823422,
"grad_norm": 0.23284739021368367,
"learning_rate": 2.273386665398256e-05,
"loss": 0.0909,
"step": 9500
},
{
"epoch": 0.694636993797884,
"grad_norm": 0.18808290689449877,
"learning_rate": 2.253607031856131e-05,
"loss": 0.0806,
"step": 9520
},
{
"epoch": 0.6960963152134257,
"grad_norm": 0.21599851137275142,
"learning_rate": 2.2338887569359313e-05,
"loss": 0.0818,
"step": 9540
},
{
"epoch": 0.6975556366289676,
"grad_norm": 0.16231435724106183,
"learning_rate": 2.2142322811740994e-05,
"loss": 0.0651,
"step": 9560
},
{
"epoch": 0.6990149580445093,
"grad_norm": 0.16358563863117298,
"learning_rate": 2.194638043726384e-05,
"loss": 0.0941,
"step": 9580
},
{
"epoch": 0.700474279460051,
"grad_norm": 0.18696897549337352,
"learning_rate": 2.175106482358037e-05,
"loss": 0.077,
"step": 9600
},
{
"epoch": 0.7019336008755929,
"grad_norm": 0.27938821625111243,
"learning_rate": 2.1556380334340287e-05,
"loss": 0.0691,
"step": 9620
},
{
"epoch": 0.7033929222911346,
"grad_norm": 0.276170709188811,
"learning_rate": 2.136233131909301e-05,
"loss": 0.0891,
"step": 9640
},
{
"epoch": 0.7048522437066764,
"grad_norm": 0.2066704076900185,
"learning_rate": 2.116892211319054e-05,
"loss": 0.08,
"step": 9660
},
{
"epoch": 0.7063115651222182,
"grad_norm": 0.34640850265903106,
"learning_rate": 2.0976157037690537e-05,
"loss": 0.097,
"step": 9680
},
{
"epoch": 0.70777088653776,
"grad_norm": 0.19562017264957848,
"learning_rate": 2.078404039925974e-05,
"loss": 0.077,
"step": 9700
},
{
"epoch": 0.7092302079533017,
"grad_norm": 0.15661479322837638,
"learning_rate": 2.0592576490077886e-05,
"loss": 0.0709,
"step": 9720
},
{
"epoch": 0.7106895293688434,
"grad_norm": 0.2717282580952972,
"learning_rate": 2.040176958774171e-05,
"loss": 0.0787,
"step": 9740
},
{
"epoch": 0.7121488507843853,
"grad_norm": 0.27436935761606934,
"learning_rate": 2.021162395516944e-05,
"loss": 0.0742,
"step": 9760
},
{
"epoch": 0.713608172199927,
"grad_norm": 0.14755232060666437,
"learning_rate": 2.002214384050549e-05,
"loss": 0.0805,
"step": 9780
},
{
"epoch": 0.7150674936154688,
"grad_norm": 0.24785156225144722,
"learning_rate": 1.98333334770256e-05,
"loss": 0.0774,
"step": 9800
},
{
"epoch": 0.7165268150310106,
"grad_norm": 0.276700390822216,
"learning_rate": 1.9645197083042217e-05,
"loss": 0.081,
"step": 9820
},
{
"epoch": 0.7179861364465524,
"grad_norm": 0.18486086426882578,
"learning_rate": 1.9457738861810344e-05,
"loss": 0.0663,
"step": 9840
},
{
"epoch": 0.7194454578620941,
"grad_norm": 0.20602502291890026,
"learning_rate": 1.9270963001433506e-05,
"loss": 0.0826,
"step": 9860
},
{
"epoch": 0.7209047792776359,
"grad_norm": 0.17095661359091108,
"learning_rate": 1.9084873674770258e-05,
"loss": 0.0764,
"step": 9880
},
{
"epoch": 0.7223641006931777,
"grad_norm": 0.1780004922244762,
"learning_rate": 1.889947503934097e-05,
"loss": 0.0849,
"step": 9900
},
{
"epoch": 0.7238234221087194,
"grad_norm": 0.17930810952796022,
"learning_rate": 1.871477123723483e-05,
"loss": 0.0848,
"step": 9920
},
{
"epoch": 0.7252827435242613,
"grad_norm": 0.11631516474736948,
"learning_rate": 1.853076639501749e-05,
"loss": 0.0726,
"step": 9940
},
{
"epoch": 0.726742064939803,
"grad_norm": 0.25877277599531223,
"learning_rate": 1.8347464623638716e-05,
"loss": 0.0799,
"step": 9960
},
{
"epoch": 0.7282013863553448,
"grad_norm": 0.2888977002420134,
"learning_rate": 1.8164870018340595e-05,
"loss": 0.0808,
"step": 9980
},
{
"epoch": 0.7296607077708865,
"grad_norm": 0.24770955121408422,
"learning_rate": 1.798298665856605e-05,
"loss": 0.0933,
"step": 10000
},
{
"epoch": 0.7311200291864283,
"grad_norm": 0.17145855088561357,
"learning_rate": 1.780181860786767e-05,
"loss": 0.0666,
"step": 10020
},
{
"epoch": 0.7325793506019701,
"grad_norm": 0.1888311975799085,
"learning_rate": 1.7621369913816998e-05,
"loss": 0.0688,
"step": 10040
},
{
"epoch": 0.7340386720175118,
"grad_norm": 0.21899758481758985,
"learning_rate": 1.7441644607913997e-05,
"loss": 0.0819,
"step": 10060
},
{
"epoch": 0.7354979934330537,
"grad_norm": 0.15837265511931634,
"learning_rate": 1.7262646705497054e-05,
"loss": 0.0773,
"step": 10080
},
{
"epoch": 0.7369573148485954,
"grad_norm": 0.16230722759639937,
"learning_rate": 1.708438020565325e-05,
"loss": 0.083,
"step": 10100
},
{
"epoch": 0.7384166362641372,
"grad_norm": 0.1889470693308101,
"learning_rate": 1.690684909112896e-05,
"loss": 0.0648,
"step": 10120
},
{
"epoch": 0.739875957679679,
"grad_norm": 0.35825558366398547,
"learning_rate": 1.6730057328241032e-05,
"loss": 0.0914,
"step": 10140
},
{
"epoch": 0.7413352790952207,
"grad_norm": 0.17595763950131643,
"learning_rate": 1.6554008866787978e-05,
"loss": 0.0626,
"step": 10160
},
{
"epoch": 0.7427946005107625,
"grad_norm": 0.2191406674992623,
"learning_rate": 1.6378707639961847e-05,
"loss": 0.118,
"step": 10180
},
{
"epoch": 0.7442539219263042,
"grad_norm": 0.1977690967160498,
"learning_rate": 1.620415756426032e-05,
"loss": 0.0825,
"step": 10200
},
{
"epoch": 0.7457132433418461,
"grad_norm": 0.16945524182505178,
"learning_rate": 1.6030362539399235e-05,
"loss": 0.0721,
"step": 10220
},
{
"epoch": 0.7471725647573878,
"grad_norm": 0.2137107271036752,
"learning_rate": 1.5857326448225413e-05,
"loss": 0.0933,
"step": 10240
},
{
"epoch": 0.7486318861729296,
"grad_norm": 0.14388503061137067,
"learning_rate": 1.5685053156629936e-05,
"loss": 0.0697,
"step": 10260
},
{
"epoch": 0.7500912075884714,
"grad_norm": 0.16396133345953567,
"learning_rate": 1.551354651346178e-05,
"loss": 0.072,
"step": 10280
},
{
"epoch": 0.7515505290040131,
"grad_norm": 0.20758612199072954,
"learning_rate": 1.534281035044183e-05,
"loss": 0.0782,
"step": 10300
},
{
"epoch": 0.7530098504195549,
"grad_norm": 0.2677007162576603,
"learning_rate": 1.5172848482077251e-05,
"loss": 0.088,
"step": 10320
},
{
"epoch": 0.7544691718350967,
"grad_norm": 0.22981080747403948,
"learning_rate": 1.5003664705576292e-05,
"loss": 0.0675,
"step": 10340
},
{
"epoch": 0.7559284932506385,
"grad_norm": 0.14595975992209873,
"learning_rate": 1.4835262800763433e-05,
"loss": 0.0598,
"step": 10360
},
{
"epoch": 0.7573878146661802,
"grad_norm": 0.2442624180440446,
"learning_rate": 1.4667646529994955e-05,
"loss": 0.0803,
"step": 10380
},
{
"epoch": 0.758847136081722,
"grad_norm": 0.1769769305897374,
"learning_rate": 1.4500819638074836e-05,
"loss": 0.0717,
"step": 10400
},
{
"epoch": 0.7603064574972638,
"grad_norm": 0.22752965532855682,
"learning_rate": 1.4334785852171189e-05,
"loss": 0.0773,
"step": 10420
},
{
"epoch": 0.7617657789128055,
"grad_norm": 0.17198904935541695,
"learning_rate": 1.4169548881732863e-05,
"loss": 0.0679,
"step": 10440
},
{
"epoch": 0.7632251003283473,
"grad_norm": 0.18227879189264046,
"learning_rate": 1.4005112418406658e-05,
"loss": 0.0779,
"step": 10460
},
{
"epoch": 0.7646844217438891,
"grad_norm": 0.17918974376239177,
"learning_rate": 1.3841480135954815e-05,
"loss": 0.0755,
"step": 10480
},
{
"epoch": 0.7661437431594309,
"grad_norm": 0.1808913214191542,
"learning_rate": 1.3678655690172937e-05,
"loss": 0.073,
"step": 10500
},
{
"epoch": 0.7676030645749726,
"grad_norm": 0.19182604410775567,
"learning_rate": 1.351664271880833e-05,
"loss": 0.076,
"step": 10520
},
{
"epoch": 0.7690623859905145,
"grad_norm": 0.2258827316579577,
"learning_rate": 1.335544484147872e-05,
"loss": 0.0736,
"step": 10540
},
{
"epoch": 0.7705217074060562,
"grad_norm": 0.18920980773130505,
"learning_rate": 1.3195065659591377e-05,
"loss": 0.0979,
"step": 10560
},
{
"epoch": 0.7719810288215979,
"grad_norm": 0.16526779110459391,
"learning_rate": 1.303550875626266e-05,
"loss": 0.0822,
"step": 10580
},
{
"epoch": 0.7734403502371398,
"grad_norm": 0.11874076858796079,
"learning_rate": 1.2876777696237957e-05,
"loss": 0.0784,
"step": 10600
},
{
"epoch": 0.7748996716526815,
"grad_norm": 0.13406374720218356,
"learning_rate": 1.271887602581211e-05,
"loss": 0.0607,
"step": 10620
},
{
"epoch": 0.7763589930682233,
"grad_norm": 0.16665764155851395,
"learning_rate": 1.2561807272750053e-05,
"loss": 0.0775,
"step": 10640
},
{
"epoch": 0.777818314483765,
"grad_norm": 0.1644277062999908,
"learning_rate": 1.2405574946208116e-05,
"loss": 0.0778,
"step": 10660
},
{
"epoch": 0.7792776358993069,
"grad_norm": 0.36519522425925205,
"learning_rate": 1.2250182536655563e-05,
"loss": 0.0693,
"step": 10680
},
{
"epoch": 0.7807369573148486,
"grad_norm": 0.16841701778065343,
"learning_rate": 1.2095633515796639e-05,
"loss": 0.0789,
"step": 10700
},
{
"epoch": 0.7821962787303903,
"grad_norm": 0.23811108549083682,
"learning_rate": 1.1941931336492984e-05,
"loss": 0.07,
"step": 10720
},
{
"epoch": 0.7836556001459322,
"grad_norm": 0.18785323479769722,
"learning_rate": 1.1789079432686501e-05,
"loss": 0.0679,
"step": 10740
},
{
"epoch": 0.7851149215614739,
"grad_norm": 0.2187740182681998,
"learning_rate": 1.1637081219322648e-05,
"loss": 0.0783,
"step": 10760
},
{
"epoch": 0.7865742429770157,
"grad_norm": 0.2083550955615201,
"learning_rate": 1.1485940092274117e-05,
"loss": 0.0847,
"step": 10780
},
{
"epoch": 0.7880335643925575,
"grad_norm": 0.22098387990816473,
"learning_rate": 1.1335659428265012e-05,
"loss": 0.0741,
"step": 10800
},
{
"epoch": 0.7894928858080993,
"grad_norm": 0.25484065655198496,
"learning_rate": 1.1186242584795331e-05,
"loss": 0.0743,
"step": 10820
},
{
"epoch": 0.790952207223641,
"grad_norm": 0.3490731944764208,
"learning_rate": 1.1037692900066038e-05,
"loss": 0.0847,
"step": 10840
},
{
"epoch": 0.7924115286391827,
"grad_norm": 0.16585089936476097,
"learning_rate": 1.0890013692904411e-05,
"loss": 0.0615,
"step": 10860
},
{
"epoch": 0.7938708500547246,
"grad_norm": 0.11879388400453102,
"learning_rate": 1.0743208262689958e-05,
"loss": 0.0866,
"step": 10880
},
{
"epoch": 0.7953301714702663,
"grad_norm": 0.16974280932555014,
"learning_rate": 1.0597279889280649e-05,
"loss": 0.0711,
"step": 10900
},
{
"epoch": 0.7967894928858081,
"grad_norm": 0.2879906296131935,
"learning_rate": 1.0452231832939669e-05,
"loss": 0.087,
"step": 10920
},
{
"epoch": 0.7982488143013499,
"grad_norm": 0.30448023595487583,
"learning_rate": 1.0308067334262578e-05,
"loss": 0.079,
"step": 10940
},
{
"epoch": 0.7997081357168917,
"grad_norm": 0.1346240174177682,
"learning_rate": 1.0164789614104909e-05,
"loss": 0.0663,
"step": 10960
},
{
"epoch": 0.8011674571324334,
"grad_norm": 0.11374165016723602,
"learning_rate": 1.002240187351018e-05,
"loss": 0.0716,
"step": 10980
},
{
"epoch": 0.8026267785479752,
"grad_norm": 0.1995760830889922,
"learning_rate": 9.880907293638447e-06,
"loss": 0.0779,
"step": 11000
},
{
"epoch": 0.804086099963517,
"grad_norm": 0.16050456710211813,
"learning_rate": 9.740309035695156e-06,
"loss": 0.0754,
"step": 11020
},
{
"epoch": 0.8055454213790587,
"grad_norm": 0.20346144604531705,
"learning_rate": 9.600610240860557e-06,
"loss": 0.0744,
"step": 11040
},
{
"epoch": 0.8070047427946005,
"grad_norm": 0.16473468373629052,
"learning_rate": 9.461814030219518e-06,
"loss": 0.066,
"step": 11060
},
{
"epoch": 0.8084640642101423,
"grad_norm": 0.2408642242586706,
"learning_rate": 9.323923504691795e-06,
"loss": 0.0873,
"step": 11080
},
{
"epoch": 0.8099233856256841,
"grad_norm": 0.18983244827766566,
"learning_rate": 9.186941744962752e-06,
"loss": 0.0727,
"step": 11100
},
{
"epoch": 0.8113827070412258,
"grad_norm": 0.13201659836858892,
"learning_rate": 9.050871811414535e-06,
"loss": 0.0771,
"step": 11120
},
{
"epoch": 0.8128420284567676,
"grad_norm": 0.28031452690082004,
"learning_rate": 8.915716744057706e-06,
"loss": 0.0854,
"step": 11140
},
{
"epoch": 0.8143013498723094,
"grad_norm": 0.32592701500229143,
"learning_rate": 8.781479562463285e-06,
"loss": 0.0929,
"step": 11160
},
{
"epoch": 0.8157606712878511,
"grad_norm": 0.2502727337909842,
"learning_rate": 8.648163265695369e-06,
"loss": 0.0823,
"step": 11180
},
{
"epoch": 0.817219992703393,
"grad_norm": 0.15230810954172375,
"learning_rate": 8.515770832244047e-06,
"loss": 0.0713,
"step": 11200
},
{
"epoch": 0.8186793141189347,
"grad_norm": 0.16309233407580462,
"learning_rate": 8.384305219958889e-06,
"loss": 0.0596,
"step": 11220
},
{
"epoch": 0.8201386355344765,
"grad_norm": 0.23482748758834446,
"learning_rate": 8.25376936598286e-06,
"loss": 0.0655,
"step": 11240
},
{
"epoch": 0.8215979569500182,
"grad_norm": 0.16819291222882218,
"learning_rate": 8.1241661866867e-06,
"loss": 0.0767,
"step": 11260
},
{
"epoch": 0.82305727836556,
"grad_norm": 0.155955940914946,
"learning_rate": 7.995498577603816e-06,
"loss": 0.07,
"step": 11280
},
{
"epoch": 0.8245165997811018,
"grad_norm": 0.16190605473805825,
"learning_rate": 7.867769413365461e-06,
"loss": 0.0695,
"step": 11300
},
{
"epoch": 0.8259759211966435,
"grad_norm": 0.20401149032189506,
"learning_rate": 7.740981547636656e-06,
"loss": 0.0725,
"step": 11320
},
{
"epoch": 0.8274352426121854,
"grad_norm": 0.18866486867252952,
"learning_rate": 7.615137813052353e-06,
"loss": 0.0765,
"step": 11340
},
{
"epoch": 0.8288945640277271,
"grad_norm": 0.22917668312380834,
"learning_rate": 7.490241021154154e-06,
"loss": 0.0731,
"step": 11360
},
{
"epoch": 0.8303538854432689,
"grad_norm": 0.15573965400489234,
"learning_rate": 7.366293962327564e-06,
"loss": 0.078,
"step": 11380
},
{
"epoch": 0.8318132068588107,
"grad_norm": 0.20293931138222282,
"learning_rate": 7.243299405739539e-06,
"loss": 0.0653,
"step": 11400
},
{
"epoch": 0.8332725282743524,
"grad_norm": 0.3734354194730838,
"learning_rate": 7.1212600992767165e-06,
"loss": 0.0729,
"step": 11420
},
{
"epoch": 0.8347318496898942,
"grad_norm": 0.18655222782248948,
"learning_rate": 7.0001787694839504e-06,
"loss": 0.0697,
"step": 11440
},
{
"epoch": 0.836191171105436,
"grad_norm": 0.20224114343218869,
"learning_rate": 6.880058121503452e-06,
"loss": 0.0672,
"step": 11460
},
{
"epoch": 0.8376504925209778,
"grad_norm": 0.23796394228026443,
"learning_rate": 6.760900839014356e-06,
"loss": 0.0822,
"step": 11480
},
{
"epoch": 0.8391098139365195,
"grad_norm": 0.20702446741177674,
"learning_rate": 6.642709584172674e-06,
"loss": 0.0709,
"step": 11500
},
{
"epoch": 0.8405691353520612,
"grad_norm": 0.21404136231074256,
"learning_rate": 6.525486997551933e-06,
"loss": 0.0647,
"step": 11520
},
{
"epoch": 0.8420284567676031,
"grad_norm": 0.19107414563165565,
"learning_rate": 6.409235698084093e-06,
"loss": 0.0704,
"step": 11540
},
{
"epoch": 0.8434877781831448,
"grad_norm": 0.14301208487157718,
"learning_rate": 6.293958283001122e-06,
"loss": 0.0638,
"step": 11560
},
{
"epoch": 0.8449470995986866,
"grad_norm": 0.16690480624670154,
"learning_rate": 6.179657327776872e-06,
"loss": 0.0718,
"step": 11580
},
{
"epoch": 0.8464064210142284,
"grad_norm": 0.14535518790707497,
"learning_rate": 6.066335386069616e-06,
"loss": 0.064,
"step": 11600
},
{
"epoch": 0.8478657424297702,
"grad_norm": 0.25689609440186906,
"learning_rate": 5.953994989664952e-06,
"loss": 0.0739,
"step": 11620
},
{
"epoch": 0.8493250638453119,
"grad_norm": 0.2874364243397915,
"learning_rate": 5.842638648419252e-06,
"loss": 0.0798,
"step": 11640
},
{
"epoch": 0.8507843852608536,
"grad_norm": 0.32559338716955627,
"learning_rate": 5.7322688502036145e-06,
"loss": 0.0795,
"step": 11660
},
{
"epoch": 0.8522437066763955,
"grad_norm": 0.26451174523619075,
"learning_rate": 5.622888060848225e-06,
"loss": 0.0638,
"step": 11680
},
{
"epoch": 0.8537030280919372,
"grad_norm": 0.15011807077671308,
"learning_rate": 5.51449872408733e-06,
"loss": 0.0799,
"step": 11700
},
{
"epoch": 0.855162349507479,
"grad_norm": 0.15800610605357415,
"learning_rate": 5.407103261504565e-06,
"loss": 0.0633,
"step": 11720
},
{
"epoch": 0.8566216709230208,
"grad_norm": 0.2885871145585813,
"learning_rate": 5.300704072478918e-06,
"loss": 0.0814,
"step": 11740
},
{
"epoch": 0.8580809923385626,
"grad_norm": 0.19478064630131137,
"learning_rate": 5.195303534131124e-06,
"loss": 0.0708,
"step": 11760
},
{
"epoch": 0.8595403137541043,
"grad_norm": 0.1669746056578264,
"learning_rate": 5.090904001270502e-06,
"loss": 0.0662,
"step": 11780
},
{
"epoch": 0.8609996351696461,
"grad_norm": 0.19509662064642885,
"learning_rate": 4.987507806342395e-06,
"loss": 0.0604,
"step": 11800
},
{
"epoch": 0.8624589565851879,
"grad_norm": 0.13302036796781258,
"learning_rate": 4.885117259376021e-06,
"loss": 0.0665,
"step": 11820
},
{
"epoch": 0.8639182780007296,
"grad_norm": 0.2705229793863414,
"learning_rate": 4.783734647932891e-06,
"loss": 0.0812,
"step": 11840
},
{
"epoch": 0.8653775994162715,
"grad_norm": 0.21217950595702748,
"learning_rate": 4.683362237055716e-06,
"loss": 0.0851,
"step": 11860
},
{
"epoch": 0.8668369208318132,
"grad_norm": 0.10430228014392347,
"learning_rate": 4.584002269217758e-06,
"loss": 0.0797,
"step": 11880
},
{
"epoch": 0.868296242247355,
"grad_norm": 0.18383288154386884,
"learning_rate": 4.485656964272761e-06,
"loss": 0.0687,
"step": 11900
},
{
"epoch": 0.8697555636628967,
"grad_norm": 0.15062393283221726,
"learning_rate": 4.388328519405321e-06,
"loss": 0.0726,
"step": 11920
},
{
"epoch": 0.8712148850784385,
"grad_norm": 0.18392906928261096,
"learning_rate": 4.292019109081863e-06,
"loss": 0.0728,
"step": 11940
},
{
"epoch": 0.8726742064939803,
"grad_norm": 0.19301702379177835,
"learning_rate": 4.196730885002003e-06,
"loss": 0.0743,
"step": 11960
},
{
"epoch": 0.874133527909522,
"grad_norm": 0.28144373325731764,
"learning_rate": 4.102465976050495e-06,
"loss": 0.0765,
"step": 11980
},
{
"epoch": 0.8755928493250639,
"grad_norm": 0.33870680329228003,
"learning_rate": 4.009226488249656e-06,
"loss": 0.0741,
"step": 12000
},
{
"epoch": 0.8770521707406056,
"grad_norm": 0.14028718131513995,
"learning_rate": 3.917014504712341e-06,
"loss": 0.0826,
"step": 12020
},
{
"epoch": 0.8785114921561474,
"grad_norm": 0.16242741860000037,
"learning_rate": 3.825832085595382e-06,
"loss": 0.0827,
"step": 12040
},
{
"epoch": 0.8799708135716892,
"grad_norm": 0.2236973430421186,
"learning_rate": 3.73568126805357e-06,
"loss": 0.0738,
"step": 12060
},
{
"epoch": 0.8814301349872309,
"grad_norm": 0.18121546816554587,
"learning_rate": 3.6465640661941305e-06,
"loss": 0.0759,
"step": 12080
},
{
"epoch": 0.8828894564027727,
"grad_norm": 0.1671473845979495,
"learning_rate": 3.5584824710317433e-06,
"loss": 0.0707,
"step": 12100
},
{
"epoch": 0.8843487778183144,
"grad_norm": 0.287452266941361,
"learning_rate": 3.4714384504440145e-06,
"loss": 0.0702,
"step": 12120
},
{
"epoch": 0.8858080992338563,
"grad_norm": 0.2603122182092065,
"learning_rate": 3.3854339491276034e-06,
"loss": 0.0763,
"step": 12140
},
{
"epoch": 0.887267420649398,
"grad_norm": 0.23937881451583296,
"learning_rate": 3.30047088855468e-06,
"loss": 0.0688,
"step": 12160
},
{
"epoch": 0.8887267420649398,
"grad_norm": 0.3034105099595713,
"learning_rate": 3.2165511669300374e-06,
"loss": 0.07,
"step": 12180
},
{
"epoch": 0.8901860634804816,
"grad_norm": 0.23546891436663464,
"learning_rate": 3.1336766591486986e-06,
"loss": 0.0764,
"step": 12200
},
{
"epoch": 0.8916453848960233,
"grad_norm": 0.22874430917476343,
"learning_rate": 3.051849216753977e-06,
"loss": 0.0813,
"step": 12220
},
{
"epoch": 0.8931047063115651,
"grad_norm": 0.18731436997461934,
"learning_rate": 2.971070667896181e-06,
"loss": 0.0748,
"step": 12240
},
{
"epoch": 0.8945640277271069,
"grad_norm": 0.28697155444258127,
"learning_rate": 2.8913428172917088e-06,
"loss": 0.0714,
"step": 12260
},
{
"epoch": 0.8960233491426487,
"grad_norm": 0.1665356610579465,
"learning_rate": 2.812667446182754e-06,
"loss": 0.0619,
"step": 12280
},
{
"epoch": 0.8974826705581904,
"grad_norm": 0.3780123004550475,
"learning_rate": 2.735046312297512e-06,
"loss": 0.0897,
"step": 12300
},
{
"epoch": 0.8989419919737323,
"grad_norm": 0.17553586444633024,
"learning_rate": 2.658481149810904e-06,
"loss": 0.0795,
"step": 12320
},
{
"epoch": 0.900401313389274,
"grad_norm": 0.22562570429434525,
"learning_rate": 2.5829736693058324e-06,
"loss": 0.0791,
"step": 12340
},
{
"epoch": 0.9018606348048157,
"grad_norm": 0.18563882080258232,
"learning_rate": 2.508525557734964e-06,
"loss": 0.0844,
"step": 12360
},
{
"epoch": 0.9033199562203575,
"grad_norm": 0.16316948443096166,
"learning_rate": 2.4351384783830476e-06,
"loss": 0.078,
"step": 12380
},
{
"epoch": 0.9047792776358993,
"grad_norm": 0.33268252514205654,
"learning_rate": 2.3628140708297387e-06,
"loss": 0.0804,
"step": 12400
},
{
"epoch": 0.9062385990514411,
"grad_norm": 0.22218951884610635,
"learning_rate": 2.2915539509130056e-06,
"loss": 0.0743,
"step": 12420
},
{
"epoch": 0.9076979204669828,
"grad_norm": 0.2798071798612865,
"learning_rate": 2.221359710692961e-06,
"loss": 0.0797,
"step": 12440
},
{
"epoch": 0.9091572418825247,
"grad_norm": 0.18122090589025527,
"learning_rate": 2.1522329184163693e-06,
"loss": 0.0829,
"step": 12460
},
{
"epoch": 0.9106165632980664,
"grad_norm": 0.16423755657097971,
"learning_rate": 2.084175118481552e-06,
"loss": 0.0711,
"step": 12480
},
{
"epoch": 0.9120758847136081,
"grad_norm": 0.26153582906662814,
"learning_rate": 2.0171878314039216e-06,
"loss": 0.1026,
"step": 12500
},
{
"epoch": 0.91353520612915,
"grad_norm": 0.14423016740050856,
"learning_rate": 1.951272553781974e-06,
"loss": 0.0568,
"step": 12520
},
{
"epoch": 0.9149945275446917,
"grad_norm": 0.1767728721541835,
"learning_rate": 1.8864307582639018e-06,
"loss": 0.0709,
"step": 12540
},
{
"epoch": 0.9164538489602335,
"grad_norm": 0.2557552892882785,
"learning_rate": 1.8226638935146368e-06,
"loss": 0.0655,
"step": 12560
},
{
"epoch": 0.9179131703757752,
"grad_norm": 0.16552993833577492,
"learning_rate": 1.759973384183533e-06,
"loss": 0.0778,
"step": 12580
},
{
"epoch": 0.9193724917913171,
"grad_norm": 0.20666952535042302,
"learning_rate": 1.6983606308724975e-06,
"loss": 0.0594,
"step": 12600
},
{
"epoch": 0.9208318132068588,
"grad_norm": 0.17169473621194706,
"learning_rate": 1.6378270101047476e-06,
"loss": 0.0615,
"step": 12620
},
{
"epoch": 0.9222911346224005,
"grad_norm": 0.19249775498672067,
"learning_rate": 1.5783738742940035e-06,
"loss": 0.0768,
"step": 12640
},
{
"epoch": 0.9237504560379424,
"grad_norm": 0.23939912325403057,
"learning_rate": 1.5200025517143002e-06,
"loss": 0.073,
"step": 12660
},
{
"epoch": 0.9252097774534841,
"grad_norm": 0.1470790842962273,
"learning_rate": 1.4627143464703175e-06,
"loss": 0.0643,
"step": 12680
},
{
"epoch": 0.9266690988690259,
"grad_norm": 0.14616263478217778,
"learning_rate": 1.4065105384682365e-06,
"loss": 0.0748,
"step": 12700
},
{
"epoch": 0.9281284202845677,
"grad_norm": 0.19447868042741956,
"learning_rate": 1.3513923833871344e-06,
"loss": 0.0785,
"step": 12720
},
{
"epoch": 0.9295877417001095,
"grad_norm": 0.2699774519408524,
"learning_rate": 1.2973611126509465e-06,
"loss": 0.0573,
"step": 12740
},
{
"epoch": 0.9310470631156512,
"grad_norm": 0.2363625074829158,
"learning_rate": 1.2444179334009598e-06,
"loss": 0.0774,
"step": 12760
},
{
"epoch": 0.9325063845311929,
"grad_norm": 0.21819478694246727,
"learning_rate": 1.1925640284688067e-06,
"loss": 0.0646,
"step": 12780
},
{
"epoch": 0.9339657059467348,
"grad_norm": 0.21027646446608875,
"learning_rate": 1.1418005563500977e-06,
"loss": 0.0831,
"step": 12800
},
{
"epoch": 0.9354250273622765,
"grad_norm": 0.16671790982554727,
"learning_rate": 1.0921286511784757e-06,
"loss": 0.0604,
"step": 12820
},
{
"epoch": 0.9368843487778183,
"grad_norm": 0.21424601887889774,
"learning_rate": 1.0435494227003183e-06,
"loss": 0.0794,
"step": 12840
},
{
"epoch": 0.9383436701933601,
"grad_norm": 0.2758342830425722,
"learning_rate": 9.960639562499374e-07,
"loss": 0.0558,
"step": 12860
},
{
"epoch": 0.9398029916089019,
"grad_norm": 0.2067849866378926,
"learning_rate": 9.496733127253243e-07,
"loss": 0.0708,
"step": 12880
},
{
"epoch": 0.9412623130244436,
"grad_norm": 0.25852268360753444,
"learning_rate": 9.043785285644534e-07,
"loss": 0.0658,
"step": 12900
},
{
"epoch": 0.9427216344399854,
"grad_norm": 0.13562651531215247,
"learning_rate": 8.601806157221171e-07,
"loss": 0.0543,
"step": 12920
},
{
"epoch": 0.9441809558555272,
"grad_norm": 0.1276801534608157,
"learning_rate": 8.170805616473265e-07,
"loss": 0.0589,
"step": 12940
},
{
"epoch": 0.9456402772710689,
"grad_norm": 0.21656400304795456,
"learning_rate": 7.750793292612469e-07,
"loss": 0.0653,
"step": 12960
},
{
"epoch": 0.9470995986866108,
"grad_norm": 0.15236754016229437,
"learning_rate": 7.341778569356916e-07,
"loss": 0.0861,
"step": 12980
},
{
"epoch": 0.9485589201021525,
"grad_norm": 0.13042448104911,
"learning_rate": 6.943770584721565e-07,
"loss": 0.0558,
"step": 13000
},
{
"epoch": 0.9500182415176943,
"grad_norm": 0.16814342944607893,
"learning_rate": 6.556778230813743e-07,
"loss": 0.0693,
"step": 13020
},
{
"epoch": 0.951477562933236,
"grad_norm": 0.24541824676137683,
"learning_rate": 6.180810153634919e-07,
"loss": 0.0654,
"step": 13040
},
{
"epoch": 0.9529368843487778,
"grad_norm": 0.23375974468426802,
"learning_rate": 5.815874752887362e-07,
"loss": 0.0774,
"step": 13060
},
{
"epoch": 0.9543962057643196,
"grad_norm": 0.19559616963339815,
"learning_rate": 5.461980181786397e-07,
"loss": 0.079,
"step": 13080
},
{
"epoch": 0.9558555271798613,
"grad_norm": 0.27441559379098057,
"learning_rate": 5.119134346878273e-07,
"loss": 0.0865,
"step": 13100
},
{
"epoch": 0.9573148485954032,
"grad_norm": 0.27156969288522603,
"learning_rate": 4.7873449078637e-07,
"loss": 0.0629,
"step": 13120
},
{
"epoch": 0.9587741700109449,
"grad_norm": 0.28696180329193466,
"learning_rate": 4.466619277426476e-07,
"loss": 0.0631,
"step": 13140
},
{
"epoch": 0.9602334914264867,
"grad_norm": 0.15271798864372269,
"learning_rate": 4.1569646210680156e-07,
"loss": 0.063,
"step": 13160
},
{
"epoch": 0.9616928128420285,
"grad_norm": 0.23490129701109025,
"learning_rate": 3.858387856947254e-07,
"loss": 0.0731,
"step": 13180
},
{
"epoch": 0.9631521342575702,
"grad_norm": 0.22618295012189688,
"learning_rate": 3.570895655725992e-07,
"loss": 0.0702,
"step": 13200
},
{
"epoch": 0.964611455673112,
"grad_norm": 0.20700633424973702,
"learning_rate": 3.2944944404200153e-07,
"loss": 0.0818,
"step": 13220
},
{
"epoch": 0.9660707770886537,
"grad_norm": 0.15624758185553145,
"learning_rate": 3.0291903862554873e-07,
"loss": 0.0711,
"step": 13240
},
{
"epoch": 0.9675300985041956,
"grad_norm": 0.16367727608879512,
"learning_rate": 2.774989420530949e-07,
"loss": 0.0682,
"step": 13260
},
{
"epoch": 0.9689894199197373,
"grad_norm": 0.152753645721427,
"learning_rate": 2.531897222485036e-07,
"loss": 0.0678,
"step": 13280
},
{
"epoch": 0.9704487413352791,
"grad_norm": 0.12746459313433045,
"learning_rate": 2.2999192231694667e-07,
"loss": 0.0673,
"step": 13300
},
{
"epoch": 0.9719080627508209,
"grad_norm": 0.18022389515283288,
"learning_rate": 2.0790606053276984e-07,
"loss": 0.0733,
"step": 13320
},
{
"epoch": 0.9733673841663626,
"grad_norm": 0.2043683161630788,
"learning_rate": 1.8693263032793506e-07,
"loss": 0.0664,
"step": 13340
},
{
"epoch": 0.9748267055819044,
"grad_norm": 0.21315387218220452,
"learning_rate": 1.6707210028095722e-07,
"loss": 0.0766,
"step": 13360
},
{
"epoch": 0.9762860269974462,
"grad_norm": 0.10838765239735793,
"learning_rate": 1.4832491410649018e-07,
"loss": 0.0668,
"step": 13380
},
{
"epoch": 0.977745348412988,
"grad_norm": 0.20493249133697297,
"learning_rate": 1.3069149064534603e-07,
"loss": 0.0668,
"step": 13400
},
{
"epoch": 0.9792046698285297,
"grad_norm": 0.12533859768480493,
"learning_rate": 1.1417222385520232e-07,
"loss": 0.0624,
"step": 13420
},
{
"epoch": 0.9806639912440716,
"grad_norm": 0.13640407728401674,
"learning_rate": 9.876748280175374e-08,
"loss": 0.0648,
"step": 13440
},
{
"epoch": 0.9821233126596133,
"grad_norm": 0.26255640137890773,
"learning_rate": 8.447761165049084e-08,
"loss": 0.0748,
"step": 13460
},
{
"epoch": 0.983582634075155,
"grad_norm": 0.2045065387998203,
"learning_rate": 7.130292965901176e-08,
"loss": 0.0736,
"step": 13480
},
{
"epoch": 0.9850419554906968,
"grad_norm": 0.2589541202073395,
"learning_rate": 5.924373116986126e-08,
"loss": 0.0804,
"step": 13500
},
{
"epoch": 0.9865012769062386,
"grad_norm": 0.23813447695351278,
"learning_rate": 4.830028560399713e-08,
"loss": 0.0717,
"step": 13520
},
{
"epoch": 0.9879605983217804,
"grad_norm": 0.22908649033900605,
"learning_rate": 3.84728374547394e-08,
"loss": 0.0615,
"step": 13540
},
{
"epoch": 0.9894199197373221,
"grad_norm": 0.134381505549619,
"learning_rate": 2.9761606282319164e-08,
"loss": 0.0696,
"step": 13560
},
{
"epoch": 0.990879241152864,
"grad_norm": 0.20686107675298593,
"learning_rate": 2.2166786708976983e-08,
"loss": 0.0608,
"step": 13580
},
{
"epoch": 0.9923385625684057,
"grad_norm": 0.1470958235212741,
"learning_rate": 1.5688548414594107e-08,
"loss": 0.0722,
"step": 13600
},
{
"epoch": 0.9937978839839474,
"grad_norm": 0.28992636074658135,
"learning_rate": 1.0327036132939949e-08,
"loss": 0.079,
"step": 13620
},
{
"epoch": 0.9952572053994893,
"grad_norm": 0.1518264410524196,
"learning_rate": 6.082369648396924e-09,
"loss": 0.0716,
"step": 13640
},
{
"epoch": 0.996716526815031,
"grad_norm": 0.1355086227621071,
"learning_rate": 2.9546437933070104e-09,
"loss": 0.0631,
"step": 13660
},
{
"epoch": 0.9981758482305728,
"grad_norm": 0.1903490129846117,
"learning_rate": 9.439284458623299e-10,
"loss": 0.0774,
"step": 13680
},
{
"epoch": 0.9996351696461145,
"grad_norm": 0.1922983216458204,
"learning_rate": 5.02685285175275e-11,
"loss": 0.0681,
"step": 13700
},
{
"epoch": 1.0,
"step": 13705,
"total_flos": 336607794167808.0,
"train_loss": 0.10042130719436776,
"train_runtime": 24357.1331,
"train_samples_per_second": 4.501,
"train_steps_per_second": 0.563
}
],
"logging_steps": 20,
"max_steps": 13705,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 336607794167808.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}