diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,55 +1,8910 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.7142857142857144, + "epoch": 1.9995261786306564, "eval_steps": 500, - "global_step": 6, + "global_step": 6330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.2857142857142857, - "grad_norm": 153.4065134934994, - "learning_rate": 0.001, - "loss": 21.8182, + "epoch": 0.0003158809128958383, + "grad_norm": 0.2789897719448856, + "learning_rate": 1.5797788309636651e-06, + "loss": 1.5618, "step": 1 }, { - "epoch": 0.8571428571428571, - "eval_loss": 17.570310592651367, - "eval_runtime": 0.3894, - "eval_samples_per_second": 5.136, - "eval_steps_per_second": 2.568, - "step": 3 + "epoch": 0.0015794045644791914, + "grad_norm": 0.2892532097035974, + "learning_rate": 7.898894154818326e-06, + "loss": 1.491, + "step": 5 + }, + { + "epoch": 0.003158809128958383, + "grad_norm": 0.2677271963629727, + "learning_rate": 1.579778830963665e-05, + "loss": 1.5398, + "step": 10 + }, + { + "epoch": 0.004738213693437574, + "grad_norm": 0.17664334604863555, + "learning_rate": 2.3696682464454976e-05, + "loss": 1.4923, + "step": 15 + }, + { + "epoch": 0.006317618257916766, + "grad_norm": 0.1837074587639523, + "learning_rate": 3.15955766192733e-05, + "loss": 1.4707, + "step": 20 + }, + { + "epoch": 0.007897022822395957, + "grad_norm": 0.10674506928787326, + "learning_rate": 3.949447077409163e-05, + "loss": 1.4078, + "step": 25 + }, + { + "epoch": 0.009476427386875147, + "grad_norm": 0.09158554885809744, + "learning_rate": 4.739336492890995e-05, + "loss": 1.3651, + "step": 30 + }, + { + "epoch": 0.01105583195135434, + "grad_norm": 0.08454382107096206, + "learning_rate": 5.529225908372828e-05, + "loss": 1.3675, + "step": 35 + }, + { + "epoch": 0.012635236515833531, + "grad_norm": 0.06493854926352545, + "learning_rate": 6.31911532385466e-05, + "loss": 1.3226, + "step": 40 + }, + { + "epoch": 0.014214641080312722, + "grad_norm": 0.05673072207914187, + "learning_rate": 7.109004739336493e-05, + "loss": 1.3366, + "step": 45 + }, + { + "epoch": 0.015794045644791914, + "grad_norm": 0.05491667593602107, + "learning_rate": 7.898894154818326e-05, + "loss": 1.3201, + "step": 50 + }, + { + "epoch": 0.017373450209271106, + "grad_norm": 0.04964918512060703, + "learning_rate": 8.688783570300159e-05, + "loss": 1.2821, + "step": 55 + }, + { + "epoch": 0.018952854773750295, + "grad_norm": 0.05508256002416552, + "learning_rate": 9.47867298578199e-05, + "loss": 1.3296, + "step": 60 + }, + { + "epoch": 0.020532259338229487, + "grad_norm": 0.05145117328690555, + "learning_rate": 0.00010268562401263823, + "loss": 1.2242, + "step": 65 + }, + { + "epoch": 0.02211166390270868, + "grad_norm": 0.04856402203442387, + "learning_rate": 0.00011058451816745656, + "loss": 1.3047, + "step": 70 + }, + { + "epoch": 0.02369106846718787, + "grad_norm": 0.04791827380722405, + "learning_rate": 0.00011848341232227489, + "loss": 1.2547, + "step": 75 + }, + { + "epoch": 0.025270473031667063, + "grad_norm": 0.04950986336105823, + "learning_rate": 0.0001263823064770932, + "loss": 1.2478, + "step": 80 + }, + { + "epoch": 0.02684987759614625, + "grad_norm": 0.05115323866433944, + "learning_rate": 0.00013428120063191154, + "loss": 1.252, + "step": 85 + }, + { + "epoch": 0.028429282160625444, + "grad_norm": 0.050403331262978614, + "learning_rate": 0.00014218009478672987, + "loss": 1.2436, + "step": 90 + }, + { + "epoch": 0.030008686725104636, + "grad_norm": 0.047802558286327325, + "learning_rate": 0.0001500789889415482, + "loss": 1.2154, + "step": 95 + }, + { + "epoch": 0.03158809128958383, + "grad_norm": 0.058289368434715695, + "learning_rate": 0.00015797788309636652, + "loss": 1.2263, + "step": 100 + }, + { + "epoch": 0.03316749585406302, + "grad_norm": 0.052893240483671625, + "learning_rate": 0.00016587677725118482, + "loss": 1.2554, + "step": 105 + }, + { + "epoch": 0.03474690041854221, + "grad_norm": 0.05755557590352308, + "learning_rate": 0.00017377567140600318, + "loss": 1.2039, + "step": 110 + }, + { + "epoch": 0.036326304983021404, + "grad_norm": 0.05223463762568313, + "learning_rate": 0.00018167456556082148, + "loss": 1.1944, + "step": 115 + }, + { + "epoch": 0.03790570954750059, + "grad_norm": 0.05491452017858886, + "learning_rate": 0.0001895734597156398, + "loss": 1.2906, + "step": 120 + }, + { + "epoch": 0.03948511411197978, + "grad_norm": 0.05543633621414441, + "learning_rate": 0.00019747235387045813, + "loss": 1.2448, + "step": 125 + }, + { + "epoch": 0.04106451867645897, + "grad_norm": 0.05358848718368862, + "learning_rate": 0.00020537124802527646, + "loss": 1.2237, + "step": 130 + }, + { + "epoch": 0.042643923240938165, + "grad_norm": 0.06179885136266294, + "learning_rate": 0.0002132701421800948, + "loss": 1.2079, + "step": 135 + }, + { + "epoch": 0.04422332780541736, + "grad_norm": 0.05100931620495029, + "learning_rate": 0.00022116903633491312, + "loss": 1.3063, + "step": 140 + }, + { + "epoch": 0.04580273236989655, + "grad_norm": 0.0530018665651464, + "learning_rate": 0.00022906793048973144, + "loss": 1.1266, + "step": 145 + }, + { + "epoch": 0.04738213693437574, + "grad_norm": 0.06146301369787031, + "learning_rate": 0.00023696682464454977, + "loss": 1.2175, + "step": 150 + }, + { + "epoch": 0.048961541498854934, + "grad_norm": 0.058591714657941774, + "learning_rate": 0.0002448657187993681, + "loss": 1.201, + "step": 155 + }, + { + "epoch": 0.050540946063334126, + "grad_norm": 0.05380313755008262, + "learning_rate": 0.0002527646129541864, + "loss": 1.1665, + "step": 160 + }, + { + "epoch": 0.05212035062781331, + "grad_norm": 0.058558871257419466, + "learning_rate": 0.00026066350710900475, + "loss": 1.1877, + "step": 165 + }, + { + "epoch": 0.0536997551922925, + "grad_norm": 0.06380402461857115, + "learning_rate": 0.0002685624012638231, + "loss": 1.1482, + "step": 170 + }, + { + "epoch": 0.055279159756771695, + "grad_norm": 0.05479882567291893, + "learning_rate": 0.0002764612954186414, + "loss": 1.1207, + "step": 175 + }, + { + "epoch": 0.05685856432125089, + "grad_norm": 0.05855467690650077, + "learning_rate": 0.00028436018957345974, + "loss": 1.1685, + "step": 180 + }, + { + "epoch": 0.05843796888573008, + "grad_norm": 0.05564570468690808, + "learning_rate": 0.000292259083728278, + "loss": 1.1683, + "step": 185 + }, + { + "epoch": 0.06001737345020927, + "grad_norm": 0.0494359899772851, + "learning_rate": 0.0003001579778830964, + "loss": 1.2459, + "step": 190 + }, + { + "epoch": 0.061596778014688464, + "grad_norm": 0.05502377946893621, + "learning_rate": 0.0003080568720379147, + "loss": 1.1759, + "step": 195 + }, + { + "epoch": 0.06317618257916766, + "grad_norm": 0.04826733876181895, + "learning_rate": 0.00031595576619273305, + "loss": 1.2097, + "step": 200 + }, + { + "epoch": 0.06475558714364685, + "grad_norm": 0.06130531868163813, + "learning_rate": 0.0003238546603475513, + "loss": 1.2117, + "step": 205 + }, + { + "epoch": 0.06633499170812604, + "grad_norm": 0.05579359766498949, + "learning_rate": 0.00033175355450236965, + "loss": 1.1871, + "step": 210 + }, + { + "epoch": 0.06791439627260523, + "grad_norm": 0.056126237444499225, + "learning_rate": 0.000339652448657188, + "loss": 1.2226, + "step": 215 + }, + { + "epoch": 0.06949380083708442, + "grad_norm": 0.057607712153461374, + "learning_rate": 0.00034755134281200636, + "loss": 1.1417, + "step": 220 + }, + { + "epoch": 0.07107320540156362, + "grad_norm": 0.059680902683061025, + "learning_rate": 0.0003554502369668247, + "loss": 1.2532, + "step": 225 + }, + { + "epoch": 0.07265260996604281, + "grad_norm": 0.059660465306138444, + "learning_rate": 0.00036334913112164296, + "loss": 1.2279, + "step": 230 + }, + { + "epoch": 0.07423201453052199, + "grad_norm": 0.07025494414796972, + "learning_rate": 0.0003712480252764613, + "loss": 1.1505, + "step": 235 + }, + { + "epoch": 0.07581141909500118, + "grad_norm": 0.0700230881662308, + "learning_rate": 0.0003791469194312796, + "loss": 1.1622, + "step": 240 + }, + { + "epoch": 0.07739082365948037, + "grad_norm": 0.06259280855025934, + "learning_rate": 0.000387045813586098, + "loss": 1.1703, + "step": 245 + }, + { + "epoch": 0.07897022822395956, + "grad_norm": 0.05846912427798551, + "learning_rate": 0.00039494470774091627, + "loss": 1.1557, + "step": 250 + }, + { + "epoch": 0.08054963278843875, + "grad_norm": 0.059126444692307836, + "learning_rate": 0.0004028436018957346, + "loss": 1.1398, + "step": 255 + }, + { + "epoch": 0.08212903735291795, + "grad_norm": 0.0636582968368329, + "learning_rate": 0.0004107424960505529, + "loss": 1.1896, + "step": 260 + }, + { + "epoch": 0.08370844191739714, + "grad_norm": 0.054771718653728006, + "learning_rate": 0.00041864139020537125, + "loss": 1.1395, + "step": 265 + }, + { + "epoch": 0.08528784648187633, + "grad_norm": 0.052046726233918324, + "learning_rate": 0.0004265402843601896, + "loss": 1.1524, + "step": 270 + }, + { + "epoch": 0.08686725104635552, + "grad_norm": 0.05505440033261828, + "learning_rate": 0.0004344391785150079, + "loss": 1.2445, + "step": 275 + }, + { + "epoch": 0.08844665561083472, + "grad_norm": 0.05229126324293584, + "learning_rate": 0.00044233807266982623, + "loss": 1.1923, + "step": 280 + }, + { + "epoch": 0.09002606017531391, + "grad_norm": 0.056018163896207424, + "learning_rate": 0.00045023696682464456, + "loss": 1.2435, + "step": 285 + }, + { + "epoch": 0.0916054647397931, + "grad_norm": 0.05939226225728438, + "learning_rate": 0.0004581358609794629, + "loss": 1.1854, + "step": 290 + }, + { + "epoch": 0.09318486930427229, + "grad_norm": 0.060756291168264476, + "learning_rate": 0.0004660347551342812, + "loss": 1.1893, + "step": 295 + }, + { + "epoch": 0.09476427386875148, + "grad_norm": 0.05904003385364645, + "learning_rate": 0.00047393364928909954, + "loss": 1.2337, + "step": 300 + }, + { + "epoch": 0.09634367843323068, + "grad_norm": 0.056832912998354725, + "learning_rate": 0.00048183254344391787, + "loss": 1.184, + "step": 305 + }, + { + "epoch": 0.09792308299770987, + "grad_norm": 0.06534146054108758, + "learning_rate": 0.0004897314375987362, + "loss": 1.2271, + "step": 310 + }, + { + "epoch": 0.09950248756218906, + "grad_norm": 0.049751901789561674, + "learning_rate": 0.0004976303317535545, + "loss": 1.1458, + "step": 315 + }, + { + "epoch": 0.10108189212666825, + "grad_norm": 0.04997659125094788, + "learning_rate": 0.0005055292259083729, + "loss": 1.109, + "step": 320 + }, + { + "epoch": 0.10266129669114744, + "grad_norm": 0.061972227867431645, + "learning_rate": 0.0005134281200631912, + "loss": 1.1755, + "step": 325 + }, + { + "epoch": 0.10424070125562662, + "grad_norm": 0.07995162193950708, + "learning_rate": 0.0005213270142180095, + "loss": 1.1901, + "step": 330 + }, + { + "epoch": 0.10582010582010581, + "grad_norm": 0.05838498126795504, + "learning_rate": 0.0005292259083728278, + "loss": 1.1629, + "step": 335 + }, + { + "epoch": 0.107399510384585, + "grad_norm": 0.12200420477732127, + "learning_rate": 0.0005371248025276462, + "loss": 1.231, + "step": 340 + }, + { + "epoch": 0.1089789149490642, + "grad_norm": 0.05707978807468486, + "learning_rate": 0.0005450236966824644, + "loss": 1.1957, + "step": 345 + }, + { + "epoch": 0.11055831951354339, + "grad_norm": 0.06161334754095597, + "learning_rate": 0.0005529225908372828, + "loss": 1.2422, + "step": 350 + }, + { + "epoch": 0.11213772407802258, + "grad_norm": 0.05656475357013885, + "learning_rate": 0.0005608214849921011, + "loss": 1.2008, + "step": 355 + }, + { + "epoch": 0.11371712864250177, + "grad_norm": 0.056239595168653025, + "learning_rate": 0.0005687203791469195, + "loss": 1.2125, + "step": 360 + }, + { + "epoch": 0.11529653320698097, + "grad_norm": 0.058071575217863304, + "learning_rate": 0.0005766192733017378, + "loss": 1.2465, + "step": 365 + }, + { + "epoch": 0.11687593777146016, + "grad_norm": 0.06278972665091191, + "learning_rate": 0.000584518167456556, + "loss": 1.2111, + "step": 370 + }, + { + "epoch": 0.11845534233593935, + "grad_norm": 0.05703645077122024, + "learning_rate": 0.0005924170616113745, + "loss": 1.1096, + "step": 375 + }, + { + "epoch": 0.12003474690041854, + "grad_norm": 0.06994936365804813, + "learning_rate": 0.0006003159557661928, + "loss": 1.1155, + "step": 380 + }, + { + "epoch": 0.12161415146489774, + "grad_norm": 0.052372697841875995, + "learning_rate": 0.0006082148499210111, + "loss": 1.2188, + "step": 385 + }, + { + "epoch": 0.12319355602937693, + "grad_norm": 0.06645079677500605, + "learning_rate": 0.0006161137440758294, + "loss": 1.1516, + "step": 390 + }, + { + "epoch": 0.12477296059385612, + "grad_norm": 0.06386365975552465, + "learning_rate": 0.0006240126382306477, + "loss": 1.1409, + "step": 395 + }, + { + "epoch": 0.1263523651583353, + "grad_norm": 0.051198724511214086, + "learning_rate": 0.0006319115323854661, + "loss": 1.1699, + "step": 400 + }, + { + "epoch": 0.1279317697228145, + "grad_norm": 0.047789704621754345, + "learning_rate": 0.0006398104265402843, + "loss": 1.2111, + "step": 405 + }, + { + "epoch": 0.1295111742872937, + "grad_norm": 0.08292897928400877, + "learning_rate": 0.0006477093206951026, + "loss": 1.2084, + "step": 410 + }, + { + "epoch": 0.1310905788517729, + "grad_norm": 0.053440634299855995, + "learning_rate": 0.0006556082148499211, + "loss": 1.1919, + "step": 415 + }, + { + "epoch": 0.13266998341625208, + "grad_norm": 0.05318482784273273, + "learning_rate": 0.0006635071090047393, + "loss": 1.1451, + "step": 420 + }, + { + "epoch": 0.13424938798073127, + "grad_norm": 0.05665917212538103, + "learning_rate": 0.0006714060031595577, + "loss": 1.1411, + "step": 425 + }, + { + "epoch": 0.13582879254521046, + "grad_norm": 0.05574161677632433, + "learning_rate": 0.000679304897314376, + "loss": 1.1071, + "step": 430 + }, + { + "epoch": 0.13740819710968966, + "grad_norm": 0.05821755155831752, + "learning_rate": 0.0006872037914691943, + "loss": 1.1786, + "step": 435 + }, + { + "epoch": 0.13898760167416885, + "grad_norm": 0.06093075040564606, + "learning_rate": 0.0006951026856240127, + "loss": 1.092, + "step": 440 + }, + { + "epoch": 0.14056700623864804, + "grad_norm": 0.05825340962494296, + "learning_rate": 0.0007030015797788309, + "loss": 1.1626, + "step": 445 + }, + { + "epoch": 0.14214641080312723, + "grad_norm": 0.05020938068401297, + "learning_rate": 0.0007109004739336494, + "loss": 1.1822, + "step": 450 + }, + { + "epoch": 0.14372581536760642, + "grad_norm": 0.0557961806261948, + "learning_rate": 0.0007187993680884676, + "loss": 1.2147, + "step": 455 + }, + { + "epoch": 0.14530521993208562, + "grad_norm": 0.05437488653301554, + "learning_rate": 0.0007266982622432859, + "loss": 1.1479, + "step": 460 + }, + { + "epoch": 0.1468846244965648, + "grad_norm": 0.05268833934110884, + "learning_rate": 0.0007345971563981043, + "loss": 1.1473, + "step": 465 + }, + { + "epoch": 0.14846402906104397, + "grad_norm": 0.04493291882173135, + "learning_rate": 0.0007424960505529226, + "loss": 1.1499, + "step": 470 + }, + { + "epoch": 0.15004343362552316, + "grad_norm": 0.04452601287960959, + "learning_rate": 0.0007503949447077409, + "loss": 1.1688, + "step": 475 + }, + { + "epoch": 0.15162283819000236, + "grad_norm": 0.050748020091030625, + "learning_rate": 0.0007582938388625592, + "loss": 1.2168, + "step": 480 + }, + { + "epoch": 0.15320224275448155, + "grad_norm": 0.16891299036710675, + "learning_rate": 0.0007661927330173775, + "loss": 1.1953, + "step": 485 + }, + { + "epoch": 0.15478164731896074, + "grad_norm": 0.0683474674929215, + "learning_rate": 0.000774091627172196, + "loss": 1.2254, + "step": 490 + }, + { + "epoch": 0.15636105188343993, + "grad_norm": 0.06256971833638193, + "learning_rate": 0.0007819905213270142, + "loss": 1.2087, + "step": 495 + }, + { + "epoch": 0.15794045644791913, + "grad_norm": 0.05436563080376856, + "learning_rate": 0.0007898894154818325, + "loss": 1.1562, + "step": 500 + }, + { + "epoch": 0.15951986101239832, + "grad_norm": 0.055684137284571354, + "learning_rate": 0.0007977883096366509, + "loss": 1.1949, + "step": 505 + }, + { + "epoch": 0.1610992655768775, + "grad_norm": 0.050335288455669695, + "learning_rate": 0.0008056872037914692, + "loss": 1.1596, + "step": 510 + }, + { + "epoch": 0.1626786701413567, + "grad_norm": 0.05206965203978437, + "learning_rate": 0.0008135860979462876, + "loss": 1.2096, + "step": 515 + }, + { + "epoch": 0.1642580747058359, + "grad_norm": 0.06839415660825547, + "learning_rate": 0.0008214849921011058, + "loss": 1.2405, + "step": 520 + }, + { + "epoch": 0.16583747927031509, + "grad_norm": 0.05934969682125384, + "learning_rate": 0.0008293838862559242, + "loss": 1.1575, + "step": 525 + }, + { + "epoch": 0.16741688383479428, + "grad_norm": 0.048248270161178164, + "learning_rate": 0.0008372827804107425, + "loss": 1.1297, + "step": 530 + }, + { + "epoch": 0.16899628839927347, + "grad_norm": 0.04841154135491026, + "learning_rate": 0.0008451816745655608, + "loss": 1.1778, + "step": 535 + }, + { + "epoch": 0.17057569296375266, + "grad_norm": 0.04839285015392914, + "learning_rate": 0.0008530805687203792, + "loss": 1.1367, + "step": 540 + }, + { + "epoch": 0.17215509752823185, + "grad_norm": 0.05085171971926083, + "learning_rate": 0.0008609794628751975, + "loss": 1.1527, + "step": 545 + }, + { + "epoch": 0.17373450209271105, + "grad_norm": 0.048045360550468874, + "learning_rate": 0.0008688783570300158, + "loss": 1.2203, + "step": 550 + }, + { + "epoch": 0.17531390665719024, + "grad_norm": 0.05360566007829095, + "learning_rate": 0.0008767772511848341, + "loss": 1.191, + "step": 555 + }, + { + "epoch": 0.17689331122166943, + "grad_norm": 0.05153608252672655, + "learning_rate": 0.0008846761453396525, + "loss": 1.2965, + "step": 560 + }, + { + "epoch": 0.17847271578614862, + "grad_norm": 0.0537492984979501, + "learning_rate": 0.0008925750394944708, + "loss": 1.253, + "step": 565 + }, + { + "epoch": 0.18005212035062781, + "grad_norm": 0.059570203946716534, + "learning_rate": 0.0009004739336492891, + "loss": 1.1733, + "step": 570 + }, + { + "epoch": 0.181631524915107, + "grad_norm": 0.07310330867652265, + "learning_rate": 0.0009083728278041074, + "loss": 1.2127, + "step": 575 + }, + { + "epoch": 0.1832109294795862, + "grad_norm": 0.048429065249010646, + "learning_rate": 0.0009162717219589258, + "loss": 1.24, + "step": 580 + }, + { + "epoch": 0.1847903340440654, + "grad_norm": 0.05274469740017941, + "learning_rate": 0.0009241706161137441, + "loss": 1.1546, + "step": 585 + }, + { + "epoch": 0.18636973860854458, + "grad_norm": 0.04378705257118556, + "learning_rate": 0.0009320695102685624, + "loss": 1.1437, + "step": 590 + }, + { + "epoch": 0.18794914317302377, + "grad_norm": 0.05384819336054559, + "learning_rate": 0.0009399684044233808, + "loss": 1.1763, + "step": 595 + }, + { + "epoch": 0.18952854773750297, + "grad_norm": 0.07965381715501382, + "learning_rate": 0.0009478672985781991, + "loss": 1.2411, + "step": 600 + }, + { + "epoch": 0.19110795230198216, + "grad_norm": 0.059004614843427866, + "learning_rate": 0.0009557661927330173, + "loss": 1.2445, + "step": 605 + }, + { + "epoch": 0.19268735686646135, + "grad_norm": 0.05280984994326182, + "learning_rate": 0.0009636650868878357, + "loss": 1.3118, + "step": 610 + }, + { + "epoch": 0.19426676143094054, + "grad_norm": 0.04254342942038169, + "learning_rate": 0.0009715639810426541, + "loss": 1.306, + "step": 615 + }, + { + "epoch": 0.19584616599541974, + "grad_norm": 0.056142331071331146, + "learning_rate": 0.0009794628751974724, + "loss": 1.1764, + "step": 620 + }, + { + "epoch": 0.19742557055989893, + "grad_norm": 0.055000939660971034, + "learning_rate": 0.0009873617693522906, + "loss": 1.2869, + "step": 625 + }, + { + "epoch": 0.19900497512437812, + "grad_norm": 0.05848265110756856, + "learning_rate": 0.000995260663507109, + "loss": 1.2242, + "step": 630 + }, + { + "epoch": 0.2005843796888573, + "grad_norm": 0.1289533179840951, + "learning_rate": 0.0009999996959064125, + "loss": 1.2347, + "step": 635 + }, + { + "epoch": 0.2021637842533365, + "grad_norm": 0.0634015519760411, + "learning_rate": 0.0009999962748577986, + "loss": 1.296, + "step": 640 + }, + { + "epoch": 0.2037431888178157, + "grad_norm": 0.07254139591414524, + "learning_rate": 0.0009999890526696813, + "loss": 1.1969, + "step": 645 + }, + { + "epoch": 0.2053225933822949, + "grad_norm": 0.06309255152166017, + "learning_rate": 0.0009999780293969657, + "loss": 1.2262, + "step": 650 + }, + { + "epoch": 0.20690199794677408, + "grad_norm": 0.06898019039350595, + "learning_rate": 0.0009999632051234547, + "loss": 1.2251, + "step": 655 + }, + { + "epoch": 0.20848140251125324, + "grad_norm": 0.06259386096143452, + "learning_rate": 0.000999944579961847, + "loss": 1.3119, + "step": 660 + }, + { + "epoch": 0.21006080707573244, + "grad_norm": 0.06872950124265154, + "learning_rate": 0.0009999221540537377, + "loss": 1.1715, + "step": 665 + }, + { + "epoch": 0.21164021164021163, + "grad_norm": 0.9937019497079367, + "learning_rate": 0.000999895927569616, + "loss": 1.3005, + "step": 670 + }, + { + "epoch": 0.21321961620469082, + "grad_norm": 0.1354849534169185, + "learning_rate": 0.0009998659007088642, + "loss": 1.2387, + "step": 675 + }, + { + "epoch": 0.21479902076917, + "grad_norm": 0.07498188430417127, + "learning_rate": 0.0009998320736997568, + "loss": 1.2051, + "step": 680 + }, + { + "epoch": 0.2163784253336492, + "grad_norm": 2.499051766047346, + "learning_rate": 0.0009997944467994581, + "loss": 1.259, + "step": 685 + }, + { + "epoch": 0.2179578298981284, + "grad_norm": 2.048237623936503, + "learning_rate": 0.0009997530202940205, + "loss": 1.4247, + "step": 690 + }, + { + "epoch": 0.2195372344626076, + "grad_norm": 0.9568708158453652, + "learning_rate": 0.0009997077944983819, + "loss": 1.6289, + "step": 695 + }, + { + "epoch": 0.22111663902708678, + "grad_norm": 0.17185653456476527, + "learning_rate": 0.0009996587697563642, + "loss": 1.5628, + "step": 700 + }, + { + "epoch": 0.22269604359156597, + "grad_norm": 0.17039026137566396, + "learning_rate": 0.00099960594644067, + "loss": 1.4037, + "step": 705 + }, + { + "epoch": 0.22427544815604517, + "grad_norm": 5.215297156818115, + "learning_rate": 0.0009995493249528795, + "loss": 1.4015, + "step": 710 + }, + { + "epoch": 0.22585485272052436, + "grad_norm": 0.8050682876390816, + "learning_rate": 0.0009994889057234487, + "loss": 1.4124, + "step": 715 + }, + { + "epoch": 0.22743425728500355, + "grad_norm": 0.16011890487013522, + "learning_rate": 0.0009994246892117045, + "loss": 1.6748, + "step": 720 + }, + { + "epoch": 0.22901366184948274, + "grad_norm": 0.11806743445365343, + "learning_rate": 0.0009993566759058429, + "loss": 1.6867, + "step": 725 + }, + { + "epoch": 0.23059306641396193, + "grad_norm": 0.1422914641138834, + "learning_rate": 0.0009992848663229231, + "loss": 1.5684, + "step": 730 + }, + { + "epoch": 0.23217247097844113, + "grad_norm": 0.10039336502710226, + "learning_rate": 0.0009992092610088662, + "loss": 1.3594, + "step": 735 + }, + { + "epoch": 0.23375187554292032, + "grad_norm": 0.08353358714541645, + "learning_rate": 0.0009991298605384492, + "loss": 1.3553, + "step": 740 + }, + { + "epoch": 0.2353312801073995, + "grad_norm": 0.06028998203574107, + "learning_rate": 0.000999046665515301, + "loss": 1.2517, + "step": 745 + }, + { + "epoch": 0.2369106846718787, + "grad_norm": 0.1069757369931618, + "learning_rate": 0.0009989596765718981, + "loss": 1.3012, + "step": 750 + }, + { + "epoch": 0.2384900892363579, + "grad_norm": 0.057695063797185045, + "learning_rate": 0.0009988688943695595, + "loss": 1.3095, + "step": 755 + }, + { + "epoch": 0.24006949380083709, + "grad_norm": 0.05321345064994074, + "learning_rate": 0.000998774319598442, + "loss": 1.2603, + "step": 760 + }, + { + "epoch": 0.24164889836531628, + "grad_norm": 0.06181569765694586, + "learning_rate": 0.0009986759529775349, + "loss": 1.2347, + "step": 765 + }, + { + "epoch": 0.24322830292979547, + "grad_norm": 0.04799581558822299, + "learning_rate": 0.0009985737952546542, + "loss": 1.2514, + "step": 770 + }, + { + "epoch": 0.24480770749427466, + "grad_norm": 0.05659151468826247, + "learning_rate": 0.0009984678472064374, + "loss": 1.2609, + "step": 775 + }, + { + "epoch": 0.24638711205875385, + "grad_norm": 0.05127796227154854, + "learning_rate": 0.0009983581096383368, + "loss": 1.2511, + "step": 780 + }, + { + "epoch": 0.24796651662323305, + "grad_norm": 0.049337974506391276, + "learning_rate": 0.0009982445833846146, + "loss": 1.2434, + "step": 785 + }, + { + "epoch": 0.24954592118771224, + "grad_norm": 0.06282817628594659, + "learning_rate": 0.0009981272693083349, + "loss": 1.2281, + "step": 790 + }, + { + "epoch": 0.25112532575219143, + "grad_norm": 0.05360363073011154, + "learning_rate": 0.0009980061683013592, + "loss": 1.261, + "step": 795 + }, + { + "epoch": 0.2527047303166706, + "grad_norm": 0.0435497395031424, + "learning_rate": 0.0009978812812843378, + "loss": 1.2305, + "step": 800 + }, + { + "epoch": 0.2542841348811498, + "grad_norm": 0.0701758491964102, + "learning_rate": 0.0009977526092067037, + "loss": 1.2389, + "step": 805 + }, + { + "epoch": 0.255863539445629, + "grad_norm": 0.04528535565880187, + "learning_rate": 0.0009976201530466655, + "loss": 1.1771, + "step": 810 + }, + { + "epoch": 0.2574429440101082, + "grad_norm": 0.04304696595003719, + "learning_rate": 0.0009974839138111988, + "loss": 1.1608, + "step": 815 + }, + { + "epoch": 0.2590223485745874, + "grad_norm": 0.05501271003472879, + "learning_rate": 0.0009973438925360407, + "loss": 1.2104, + "step": 820 + }, + { + "epoch": 0.2606017531390666, + "grad_norm": 0.06777008626565485, + "learning_rate": 0.0009972000902856795, + "loss": 1.2131, + "step": 825 + }, + { + "epoch": 0.2621811577035458, + "grad_norm": 0.03968699801552576, + "learning_rate": 0.0009970525081533482, + "loss": 1.1724, + "step": 830 + }, + { + "epoch": 0.26376056226802497, + "grad_norm": 0.07383457025829056, + "learning_rate": 0.0009969011472610158, + "loss": 1.1637, + "step": 835 + }, + { + "epoch": 0.26533996683250416, + "grad_norm": 0.05318282155905444, + "learning_rate": 0.0009967460087593786, + "loss": 1.1844, + "step": 840 + }, + { + "epoch": 0.26691937139698335, + "grad_norm": 0.0438038229881027, + "learning_rate": 0.0009965870938278517, + "loss": 1.3114, + "step": 845 + }, + { + "epoch": 0.26849877596146254, + "grad_norm": 0.044682976599825244, + "learning_rate": 0.0009964244036745594, + "loss": 1.1801, + "step": 850 + }, + { + "epoch": 0.27007818052594174, + "grad_norm": 0.04931452257775692, + "learning_rate": 0.000996257939536327, + "loss": 1.2087, + "step": 855 + }, + { + "epoch": 0.2716575850904209, + "grad_norm": 0.08794796118059663, + "learning_rate": 0.0009960877026786708, + "loss": 1.2521, + "step": 860 + }, + { + "epoch": 0.2732369896549001, + "grad_norm": 0.06090026057096657, + "learning_rate": 0.0009959136943957887, + "loss": 1.2126, + "step": 865 + }, + { + "epoch": 0.2748163942193793, + "grad_norm": 0.046466185921548456, + "learning_rate": 0.0009957359160105497, + "loss": 1.2087, + "step": 870 + }, + { + "epoch": 0.2763957987838585, + "grad_norm": 0.22759376527429676, + "learning_rate": 0.000995554368874485, + "loss": 1.2451, + "step": 875 + }, + { + "epoch": 0.2779752033483377, + "grad_norm": 0.046606113089797695, + "learning_rate": 0.0009953690543677768, + "loss": 1.2507, + "step": 880 + }, + { + "epoch": 0.2795546079128169, + "grad_norm": 0.05471794231886829, + "learning_rate": 0.0009951799738992485, + "loss": 1.2073, + "step": 885 + }, + { + "epoch": 0.2811340124772961, + "grad_norm": 2.3469505748015114, + "learning_rate": 0.0009949871289063525, + "loss": 1.1663, + "step": 890 + }, + { + "epoch": 0.2827134170417753, + "grad_norm": 0.04849558180772164, + "learning_rate": 0.000994790520855162, + "loss": 1.2141, + "step": 895 + }, + { + "epoch": 0.28429282160625446, + "grad_norm": 0.06306917133411176, + "learning_rate": 0.0009945901512403569, + "loss": 1.1744, + "step": 900 + }, + { + "epoch": 0.28587222617073366, + "grad_norm": 0.16182143067555266, + "learning_rate": 0.0009943860215852144, + "loss": 1.1608, + "step": 905 + }, + { + "epoch": 0.28745163073521285, + "grad_norm": 0.0875751596694063, + "learning_rate": 0.0009941781334415966, + "loss": 1.2676, + "step": 910 + }, + { + "epoch": 0.28903103529969204, + "grad_norm": 0.05190083485865879, + "learning_rate": 0.0009939664883899394, + "loss": 1.1948, + "step": 915 + }, + { + "epoch": 0.29061043986417123, + "grad_norm": 0.0446450044232843, + "learning_rate": 0.0009937510880392386, + "loss": 1.1432, + "step": 920 + }, + { + "epoch": 0.2921898444286504, + "grad_norm": 0.03769202980158123, + "learning_rate": 0.0009935319340270408, + "loss": 1.2313, + "step": 925 + }, + { + "epoch": 0.2937692489931296, + "grad_norm": 0.06718904210860335, + "learning_rate": 0.0009933090280194279, + "loss": 1.1912, + "step": 930 + }, + { + "epoch": 0.2953486535576088, + "grad_norm": 0.045744113522662125, + "learning_rate": 0.0009930823717110065, + "loss": 1.1365, + "step": 935 + }, + { + "epoch": 0.29692805812208795, + "grad_norm": 0.043786588552808724, + "learning_rate": 0.0009928519668248937, + "loss": 1.1888, + "step": 940 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.039005039119245764, + "learning_rate": 0.0009926178151127049, + "loss": 1.2473, + "step": 945 + }, + { + "epoch": 0.30008686725104633, + "grad_norm": 0.050876769890127016, + "learning_rate": 0.0009923799183545398, + "loss": 1.1889, + "step": 950 + }, + { + "epoch": 0.3016662718155255, + "grad_norm": 0.04575948024145222, + "learning_rate": 0.0009921382783589696, + "loss": 1.171, + "step": 955 + }, + { + "epoch": 0.3032456763800047, + "grad_norm": 0.05017713187941563, + "learning_rate": 0.0009918928969630228, + "loss": 1.1756, + "step": 960 + }, + { + "epoch": 0.3048250809444839, + "grad_norm": 0.04345272263747149, + "learning_rate": 0.0009916437760321708, + "loss": 1.1517, + "step": 965 + }, + { + "epoch": 0.3064044855089631, + "grad_norm": 0.05131866420472819, + "learning_rate": 0.0009913909174603147, + "loss": 1.2095, + "step": 970 + }, + { + "epoch": 0.3079838900734423, + "grad_norm": 0.05715628036510762, + "learning_rate": 0.0009911343231697703, + "loss": 1.2204, + "step": 975 + }, + { + "epoch": 0.3095632946379215, + "grad_norm": 0.04788259122222203, + "learning_rate": 0.0009908739951112534, + "loss": 1.2256, + "step": 980 + }, + { + "epoch": 0.3111426992024007, + "grad_norm": 0.04502265853173252, + "learning_rate": 0.0009906099352638652, + "loss": 1.2243, + "step": 985 + }, + { + "epoch": 0.31272210376687987, + "grad_norm": 0.040922281342619146, + "learning_rate": 0.0009903421456350775, + "loss": 1.1412, + "step": 990 + }, + { + "epoch": 0.31430150833135906, + "grad_norm": 0.03777821636386108, + "learning_rate": 0.000990070628260717, + "loss": 1.1867, + "step": 995 + }, + { + "epoch": 0.31588091289583825, + "grad_norm": 0.041838269964072416, + "learning_rate": 0.0009897953852049494, + "loss": 1.1813, + "step": 1000 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.09066613740501203, + "learning_rate": 0.0009895164185602654, + "loss": 1.2916, + "step": 1005 + }, + { + "epoch": 0.31903972202479663, + "grad_norm": 0.05545106093802797, + "learning_rate": 0.0009892337304474629, + "loss": 1.2022, + "step": 1010 + }, + { + "epoch": 0.3206191265892758, + "grad_norm": 0.05823267457176772, + "learning_rate": 0.0009889473230156316, + "loss": 1.1614, + "step": 1015 + }, + { + "epoch": 0.322198531153755, + "grad_norm": 0.045599148101737805, + "learning_rate": 0.000988657198442137, + "loss": 1.1038, + "step": 1020 + }, + { + "epoch": 0.3237779357182342, + "grad_norm": 0.0438141101705124, + "learning_rate": 0.0009883633589326038, + "loss": 1.122, + "step": 1025 + }, + { + "epoch": 0.3253573402827134, + "grad_norm": 0.036910474407238554, + "learning_rate": 0.000988065806720898, + "loss": 1.2285, + "step": 1030 + }, + { + "epoch": 0.3269367448471926, + "grad_norm": 0.04459568322437855, + "learning_rate": 0.0009877645440691122, + "loss": 1.2004, + "step": 1035 + }, + { + "epoch": 0.3285161494116718, + "grad_norm": 0.06088171196269682, + "learning_rate": 0.0009874595732675454, + "loss": 1.2027, + "step": 1040 + }, + { + "epoch": 0.330095553976151, + "grad_norm": 0.1301305299184432, + "learning_rate": 0.0009871508966346882, + "loss": 1.1417, + "step": 1045 + }, + { + "epoch": 0.33167495854063017, + "grad_norm": 0.0650018665905558, + "learning_rate": 0.0009868385165172043, + "loss": 1.1807, + "step": 1050 + }, + { + "epoch": 0.33325436310510936, + "grad_norm": 0.05039289090482615, + "learning_rate": 0.0009865224352899118, + "loss": 1.2863, + "step": 1055 + }, + { + "epoch": 0.33483376766958856, + "grad_norm": 0.04252576093135151, + "learning_rate": 0.0009862026553557669, + "loss": 1.2244, + "step": 1060 + }, + { + "epoch": 0.33641317223406775, + "grad_norm": 0.040040635989773796, + "learning_rate": 0.000985879179145843, + "loss": 1.1958, + "step": 1065 + }, + { + "epoch": 0.33799257679854694, + "grad_norm": 0.05510903071317426, + "learning_rate": 0.0009855520091193158, + "loss": 1.1589, + "step": 1070 + }, + { + "epoch": 0.33957198136302613, + "grad_norm": 0.06148111145590916, + "learning_rate": 0.000985221147763441, + "loss": 1.199, + "step": 1075 + }, + { + "epoch": 0.3411513859275053, + "grad_norm": 0.06385524089857669, + "learning_rate": 0.000984886597593538, + "loss": 1.1694, + "step": 1080 + }, + { + "epoch": 0.3427307904919845, + "grad_norm": 0.06039272181936096, + "learning_rate": 0.0009845483611529693, + "loss": 1.153, + "step": 1085 + }, + { + "epoch": 0.3443101950564637, + "grad_norm": 0.057409878329639244, + "learning_rate": 0.0009842064410131221, + "loss": 1.2322, + "step": 1090 + }, + { + "epoch": 0.3458895996209429, + "grad_norm": 0.0385176344260921, + "learning_rate": 0.000983860839773388, + "loss": 1.1711, + "step": 1095 + }, + { + "epoch": 0.3474690041854221, + "grad_norm": 0.046177122204221564, + "learning_rate": 0.0009835115600611434, + "loss": 1.1927, + "step": 1100 + }, + { + "epoch": 0.3490484087499013, + "grad_norm": 0.03923161192034386, + "learning_rate": 0.00098315860453173, + "loss": 1.1267, + "step": 1105 + }, + { + "epoch": 0.3506278133143805, + "grad_norm": 0.0711060541707956, + "learning_rate": 0.0009828019758684342, + "loss": 1.188, + "step": 1110 + }, + { + "epoch": 0.35220721787885967, + "grad_norm": 0.043558270735732964, + "learning_rate": 0.000982441676782467, + "loss": 1.1585, + "step": 1115 + }, + { + "epoch": 0.35378662244333886, + "grad_norm": 0.050395482724650706, + "learning_rate": 0.0009820777100129428, + "loss": 1.1803, + "step": 1120 + }, + { + "epoch": 0.35536602700781805, + "grad_norm": 0.04592194649018605, + "learning_rate": 0.0009817100783268591, + "loss": 1.1806, + "step": 1125 + }, + { + "epoch": 0.35694543157229724, + "grad_norm": 0.03797561593571615, + "learning_rate": 0.0009813387845190756, + "loss": 1.1736, + "step": 1130 + }, + { + "epoch": 0.35852483613677644, + "grad_norm": 0.05497891905537719, + "learning_rate": 0.0009809638314122922, + "loss": 1.1786, + "step": 1135 + }, + { + "epoch": 0.36010424070125563, + "grad_norm": 0.06429227680603777, + "learning_rate": 0.0009805852218570284, + "loss": 1.1582, + "step": 1140 + }, + { + "epoch": 0.3616836452657348, + "grad_norm": 0.07553160414558219, + "learning_rate": 0.000980202958731601, + "loss": 1.1873, + "step": 1145 + }, + { + "epoch": 0.363263049830214, + "grad_norm": 0.044060112680621384, + "learning_rate": 0.0009798170449421028, + "loss": 1.0936, + "step": 1150 + }, + { + "epoch": 0.3648424543946932, + "grad_norm": 0.03854689953399917, + "learning_rate": 0.0009794274834223798, + "loss": 1.1089, + "step": 1155 + }, + { + "epoch": 0.3664218589591724, + "grad_norm": 0.057582233846948015, + "learning_rate": 0.0009790342771340095, + "loss": 1.2381, + "step": 1160 + }, + { + "epoch": 0.3680012635236516, + "grad_norm": 0.042972058986148115, + "learning_rate": 0.000978637429066278, + "loss": 1.1241, + "step": 1165 + }, + { + "epoch": 0.3695806680881308, + "grad_norm": 0.04384959742998597, + "learning_rate": 0.0009782369422361575, + "loss": 1.1868, + "step": 1170 + }, + { + "epoch": 0.37116007265261, + "grad_norm": 0.03743363050153524, + "learning_rate": 0.0009778328196882835, + "loss": 1.1248, + "step": 1175 + }, + { + "epoch": 0.37273947721708917, + "grad_norm": 0.05852710309772355, + "learning_rate": 0.000977425064494931, + "loss": 1.2375, + "step": 1180 + }, + { + "epoch": 0.37431888178156836, + "grad_norm": 0.04176855553749399, + "learning_rate": 0.000977013679755992, + "loss": 1.1759, + "step": 1185 + }, + { + "epoch": 0.37589828634604755, + "grad_norm": 0.03504530283243545, + "learning_rate": 0.0009765986685989513, + "loss": 1.1413, + "step": 1190 + }, + { + "epoch": 0.37747769091052674, + "grad_norm": 0.044377355127390584, + "learning_rate": 0.0009761800341788632, + "loss": 1.1988, + "step": 1195 + }, + { + "epoch": 0.37905709547500593, + "grad_norm": 0.07278323758231763, + "learning_rate": 0.0009757577796783267, + "loss": 1.2166, + "step": 1200 + }, + { + "epoch": 0.3806365000394851, + "grad_norm": 0.03741504859286431, + "learning_rate": 0.0009753319083074625, + "loss": 1.16, + "step": 1205 + }, + { + "epoch": 0.3822159046039643, + "grad_norm": 0.043790677416908085, + "learning_rate": 0.0009749024233038876, + "loss": 1.1204, + "step": 1210 + }, + { + "epoch": 0.3837953091684435, + "grad_norm": 0.04965181579324521, + "learning_rate": 0.0009744693279326914, + "loss": 1.2009, + "step": 1215 + }, + { + "epoch": 0.3853747137329227, + "grad_norm": 0.03844420659862734, + "learning_rate": 0.00097403262548641, + "loss": 1.1129, + "step": 1220 + }, + { + "epoch": 0.3869541182974019, + "grad_norm": 0.04020927397828586, + "learning_rate": 0.000973592319285002, + "loss": 1.2122, + "step": 1225 + }, + { + "epoch": 0.3885335228618811, + "grad_norm": 0.03880487766984948, + "learning_rate": 0.0009731484126758229, + "loss": 1.1671, + "step": 1230 + }, + { + "epoch": 0.3901129274263603, + "grad_norm": 12.282611028819801, + "learning_rate": 0.0009727009090336001, + "loss": 1.2351, + "step": 1235 + }, + { + "epoch": 0.39169233199083947, + "grad_norm": 0.09915021744614969, + "learning_rate": 0.000972249811760406, + "loss": 1.1555, + "step": 1240 + }, + { + "epoch": 0.39327173655531866, + "grad_norm": 0.0788287042616388, + "learning_rate": 0.0009717951242856338, + "loss": 1.1723, + "step": 1245 + }, + { + "epoch": 0.39485114111979785, + "grad_norm": 0.05548474879341339, + "learning_rate": 0.00097133685006597, + "loss": 1.2439, + "step": 1250 + }, + { + "epoch": 0.39643054568427705, + "grad_norm": 0.05120713365037505, + "learning_rate": 0.0009708749925853695, + "loss": 1.1581, + "step": 1255 + }, + { + "epoch": 0.39800995024875624, + "grad_norm": 0.06404293801939767, + "learning_rate": 0.0009704095553550276, + "loss": 1.1447, + "step": 1260 + }, + { + "epoch": 0.39958935481323543, + "grad_norm": 0.06312319298754332, + "learning_rate": 0.0009699405419133542, + "loss": 1.1659, + "step": 1265 + }, + { + "epoch": 0.4011687593777146, + "grad_norm": 0.13502437347863908, + "learning_rate": 0.0009694679558259472, + "loss": 1.2976, + "step": 1270 + }, + { + "epoch": 0.4027481639421938, + "grad_norm": 0.05679455947202985, + "learning_rate": 0.0009689918006855645, + "loss": 1.1992, + "step": 1275 + }, + { + "epoch": 0.404327568506673, + "grad_norm": 0.042285556377311476, + "learning_rate": 0.0009685120801120974, + "loss": 1.1559, + "step": 1280 + }, + { + "epoch": 0.4059069730711522, + "grad_norm": 0.039616964902070693, + "learning_rate": 0.0009680287977525426, + "loss": 1.2237, + "step": 1285 + }, + { + "epoch": 0.4074863776356314, + "grad_norm": 0.04579132630925091, + "learning_rate": 0.0009675419572809748, + "loss": 1.1573, + "step": 1290 + }, + { + "epoch": 0.4090657822001106, + "grad_norm": 0.04404265464861563, + "learning_rate": 0.0009670515623985187, + "loss": 1.139, + "step": 1295 + }, + { + "epoch": 0.4106451867645898, + "grad_norm": 0.04072321352559835, + "learning_rate": 0.000966557616833321, + "loss": 1.1837, + "step": 1300 + }, + { + "epoch": 0.41222459132906897, + "grad_norm": 0.050235769680070874, + "learning_rate": 0.0009660601243405214, + "loss": 1.1782, + "step": 1305 + }, + { + "epoch": 0.41380399589354816, + "grad_norm": 0.033559753256380775, + "learning_rate": 0.000965559088702225, + "loss": 1.1306, + "step": 1310 + }, + { + "epoch": 0.4153834004580273, + "grad_norm": 0.04229103700252006, + "learning_rate": 0.0009650545137274727, + "loss": 1.1629, + "step": 1315 + }, + { + "epoch": 0.4169628050225065, + "grad_norm": 0.035322329411059644, + "learning_rate": 0.000964546403252213, + "loss": 1.1241, + "step": 1320 + }, + { + "epoch": 0.4185422095869857, + "grad_norm": 0.03465845928810666, + "learning_rate": 0.0009640347611392722, + "loss": 1.1461, + "step": 1325 + }, + { + "epoch": 0.4201216141514649, + "grad_norm": 0.04057442742798969, + "learning_rate": 0.0009635195912783254, + "loss": 1.1491, + "step": 1330 + }, + { + "epoch": 0.42170101871594406, + "grad_norm": 0.03684991337983024, + "learning_rate": 0.0009630008975858666, + "loss": 1.1161, + "step": 1335 + }, + { + "epoch": 0.42328042328042326, + "grad_norm": 0.03575593243336909, + "learning_rate": 0.0009624786840051798, + "loss": 1.1021, + "step": 1340 + }, + { + "epoch": 0.42485982784490245, + "grad_norm": 0.038199337694147205, + "learning_rate": 0.0009619529545063075, + "loss": 1.1164, + "step": 1345 + }, + { + "epoch": 0.42643923240938164, + "grad_norm": 0.041815107832999215, + "learning_rate": 0.000961423713086022, + "loss": 1.194, + "step": 1350 + }, + { + "epoch": 0.42801863697386083, + "grad_norm": 0.04498779180491189, + "learning_rate": 0.000960890963767794, + "loss": 1.1468, + "step": 1355 + }, + { + "epoch": 0.42959804153834, + "grad_norm": 0.036156965395200406, + "learning_rate": 0.0009603547106017629, + "loss": 1.2044, + "step": 1360 + }, + { + "epoch": 0.4311774461028192, + "grad_norm": 0.0432898856802058, + "learning_rate": 0.0009598149576647053, + "loss": 1.1368, + "step": 1365 + }, + { + "epoch": 0.4327568506672984, + "grad_norm": 0.034197240048107265, + "learning_rate": 0.0009592717090600039, + "loss": 1.1465, + "step": 1370 + }, + { + "epoch": 0.4343362552317776, + "grad_norm": 0.06043967223719665, + "learning_rate": 0.0009587249689176171, + "loss": 1.2052, + "step": 1375 + }, + { + "epoch": 0.4359156597962568, + "grad_norm": 0.06952056078208815, + "learning_rate": 0.0009581747413940472, + "loss": 1.1896, + "step": 1380 + }, + { + "epoch": 0.437495064360736, + "grad_norm": 0.05316147215022715, + "learning_rate": 0.000957621030672308, + "loss": 1.1591, + "step": 1385 + }, + { + "epoch": 0.4390744689252152, + "grad_norm": 0.04617490936576988, + "learning_rate": 0.0009570638409618946, + "loss": 1.18, + "step": 1390 + }, + { + "epoch": 0.44065387348969437, + "grad_norm": 0.04953043812653844, + "learning_rate": 0.0009565031764987502, + "loss": 1.1999, + "step": 1395 + }, + { + "epoch": 0.44223327805417356, + "grad_norm": 0.06773718629645895, + "learning_rate": 0.000955939041545234, + "loss": 1.1981, + "step": 1400 + }, + { + "epoch": 0.44381268261865275, + "grad_norm": 0.05915998407509928, + "learning_rate": 0.0009553714403900897, + "loss": 1.1665, + "step": 1405 + }, + { + "epoch": 0.44539208718313195, + "grad_norm": 0.05814230542199274, + "learning_rate": 0.0009548003773484114, + "loss": 1.1592, + "step": 1410 + }, + { + "epoch": 0.44697149174761114, + "grad_norm": 0.050530877930300165, + "learning_rate": 0.0009542258567616122, + "loss": 1.1392, + "step": 1415 + }, + { + "epoch": 0.44855089631209033, + "grad_norm": 0.0347879234970523, + "learning_rate": 0.0009536478829973902, + "loss": 1.1932, + "step": 1420 + }, + { + "epoch": 0.4501303008765695, + "grad_norm": 0.03401528298444466, + "learning_rate": 0.0009530664604496964, + "loss": 1.2068, + "step": 1425 + }, + { + "epoch": 0.4517097054410487, + "grad_norm": 0.03592298599245248, + "learning_rate": 0.0009524815935386997, + "loss": 1.1095, + "step": 1430 + }, + { + "epoch": 0.4532891100055279, + "grad_norm": 0.03974587205442548, + "learning_rate": 0.0009518932867107551, + "loss": 1.1696, + "step": 1435 + }, + { + "epoch": 0.4548685145700071, + "grad_norm": 0.03632280838981324, + "learning_rate": 0.0009513015444383682, + "loss": 1.0858, + "step": 1440 + }, + { + "epoch": 0.4564479191344863, + "grad_norm": 0.06649418475772793, + "learning_rate": 0.0009507063712201623, + "loss": 1.2024, + "step": 1445 + }, + { + "epoch": 0.4580273236989655, + "grad_norm": 0.03796203835081221, + "learning_rate": 0.0009501077715808444, + "loss": 1.1621, + "step": 1450 + }, + { + "epoch": 0.4596067282634447, + "grad_norm": 0.03697987143090938, + "learning_rate": 0.0009495057500711697, + "loss": 1.1507, + "step": 1455 + }, + { + "epoch": 0.46118613282792387, + "grad_norm": 0.060201011824411864, + "learning_rate": 0.0009489003112679075, + "loss": 1.1274, + "step": 1460 + }, + { + "epoch": 0.46276553739240306, + "grad_norm": 0.06116106558440598, + "learning_rate": 0.0009482914597738072, + "loss": 1.1842, + "step": 1465 + }, + { + "epoch": 0.46434494195688225, + "grad_norm": 0.036769642166062266, + "learning_rate": 0.000947679200217562, + "loss": 1.2194, + "step": 1470 + }, + { + "epoch": 0.46592434652136144, + "grad_norm": 0.04080679654037661, + "learning_rate": 0.0009470635372537748, + "loss": 1.1075, + "step": 1475 + }, + { + "epoch": 0.46750375108584064, + "grad_norm": 0.04134620986690867, + "learning_rate": 0.0009464444755629216, + "loss": 1.1599, + "step": 1480 + }, + { + "epoch": 0.4690831556503198, + "grad_norm": 0.04951151349439519, + "learning_rate": 0.0009458220198513177, + "loss": 1.0983, + "step": 1485 + }, + { + "epoch": 0.470662560214799, + "grad_norm": 0.036583714485988426, + "learning_rate": 0.00094519617485108, + "loss": 1.1358, + "step": 1490 + }, + { + "epoch": 0.4722419647792782, + "grad_norm": 0.03826169128595341, + "learning_rate": 0.0009445669453200923, + "loss": 1.1244, + "step": 1495 + }, + { + "epoch": 0.4738213693437574, + "grad_norm": 0.038232781887304446, + "learning_rate": 0.0009439343360419688, + "loss": 1.1603, + "step": 1500 + }, + { + "epoch": 0.4754007739082366, + "grad_norm": 0.039105164779814766, + "learning_rate": 0.0009432983518260174, + "loss": 1.1746, + "step": 1505 + }, + { + "epoch": 0.4769801784727158, + "grad_norm": 0.042248369457699976, + "learning_rate": 0.0009426589975072039, + "loss": 1.0923, + "step": 1510 + }, + { + "epoch": 0.478559583037195, + "grad_norm": 0.034606195108217934, + "learning_rate": 0.0009420162779461141, + "loss": 1.1434, + "step": 1515 + }, + { + "epoch": 0.48013898760167417, + "grad_norm": 0.05210811886077802, + "learning_rate": 0.000941370198028918, + "loss": 1.2034, + "step": 1520 + }, + { + "epoch": 0.48171839216615336, + "grad_norm": 0.042216154872195315, + "learning_rate": 0.0009407207626673319, + "loss": 1.2095, + "step": 1525 + }, + { + "epoch": 0.48329779673063256, + "grad_norm": 0.04064731158462736, + "learning_rate": 0.0009400679767985813, + "loss": 1.1016, + "step": 1530 + }, + { + "epoch": 0.48487720129511175, + "grad_norm": 0.0600499327895473, + "learning_rate": 0.000939411845385364, + "loss": 1.158, + "step": 1535 + }, + { + "epoch": 0.48645660585959094, + "grad_norm": 0.03423119648920456, + "learning_rate": 0.0009387523734158106, + "loss": 1.1099, + "step": 1540 + }, + { + "epoch": 0.48803601042407013, + "grad_norm": 0.050478652855600346, + "learning_rate": 0.0009380895659034485, + "loss": 1.1357, + "step": 1545 + }, + { + "epoch": 0.4896154149885493, + "grad_norm": 0.04697965402996422, + "learning_rate": 0.0009374234278871631, + "loss": 1.1607, + "step": 1550 + }, + { + "epoch": 0.4911948195530285, + "grad_norm": 0.03542010437099392, + "learning_rate": 0.0009367539644311591, + "loss": 1.0903, + "step": 1555 + }, + { + "epoch": 0.4927742241175077, + "grad_norm": 0.03747261262586468, + "learning_rate": 0.0009360811806249223, + "loss": 1.1855, + "step": 1560 + }, + { + "epoch": 0.4943536286819869, + "grad_norm": 0.0342672802591948, + "learning_rate": 0.0009354050815831811, + "loss": 1.1237, + "step": 1565 + }, + { + "epoch": 0.4959330332464661, + "grad_norm": 0.04341908110834359, + "learning_rate": 0.0009347256724458674, + "loss": 1.1838, + "step": 1570 + }, + { + "epoch": 0.4975124378109453, + "grad_norm": 0.035326570663710144, + "learning_rate": 0.0009340429583780774, + "loss": 1.1988, + "step": 1575 + }, + { + "epoch": 0.4990918423754245, + "grad_norm": 0.039931525021498104, + "learning_rate": 0.0009333569445700326, + "loss": 1.1413, + "step": 1580 + }, + { + "epoch": 0.5006712469399036, + "grad_norm": 0.04648089686611222, + "learning_rate": 0.0009326676362370404, + "loss": 1.1775, + "step": 1585 + }, + { + "epoch": 0.5022506515043829, + "grad_norm": 0.03358345656864849, + "learning_rate": 0.0009319750386194537, + "loss": 1.0934, + "step": 1590 + }, + { + "epoch": 0.503830056068862, + "grad_norm": 0.04937018263420283, + "learning_rate": 0.0009312791569826324, + "loss": 1.2194, + "step": 1595 + }, + { + "epoch": 0.5054094606333412, + "grad_norm": 0.03677684834011018, + "learning_rate": 0.0009305799966169022, + "loss": 1.0755, + "step": 1600 + }, + { + "epoch": 0.5069888651978204, + "grad_norm": 0.0485094805968303, + "learning_rate": 0.000929877562837515, + "loss": 1.158, + "step": 1605 + }, + { + "epoch": 0.5085682697622996, + "grad_norm": 0.0444791474985611, + "learning_rate": 0.0009291718609846081, + "loss": 1.1302, + "step": 1610 + }, + { + "epoch": 0.5101476743267788, + "grad_norm": 0.04117121605197338, + "learning_rate": 0.0009284628964231635, + "loss": 1.1024, + "step": 1615 + }, + { + "epoch": 0.511727078891258, + "grad_norm": 0.04900515775546535, + "learning_rate": 0.0009277506745429682, + "loss": 1.1087, + "step": 1620 + }, + { + "epoch": 0.5133064834557372, + "grad_norm": 0.05633245648361274, + "learning_rate": 0.0009270352007585719, + "loss": 1.2387, + "step": 1625 + }, + { + "epoch": 0.5148858880202164, + "grad_norm": 0.059059357799405465, + "learning_rate": 0.000926316480509246, + "loss": 1.223, + "step": 1630 + }, + { + "epoch": 0.5164652925846955, + "grad_norm": 0.042565468524679115, + "learning_rate": 0.0009255945192589439, + "loss": 1.1179, + "step": 1635 + }, + { + "epoch": 0.5180446971491748, + "grad_norm": 0.041144265259162496, + "learning_rate": 0.0009248693224962567, + "loss": 1.239, + "step": 1640 + }, + { + "epoch": 0.5196241017136539, + "grad_norm": 0.03525706812621692, + "learning_rate": 0.0009241408957343739, + "loss": 1.2348, + "step": 1645 + }, + { + "epoch": 0.5212035062781332, + "grad_norm": 0.046457052327021985, + "learning_rate": 0.00092340924451104, + "loss": 1.1048, + "step": 1650 + }, + { + "epoch": 0.5227829108426123, + "grad_norm": 0.047403099964377, + "learning_rate": 0.0009226743743885134, + "loss": 1.1228, + "step": 1655 + }, + { + "epoch": 0.5243623154070916, + "grad_norm": 0.03485309471044975, + "learning_rate": 0.0009219362909535234, + "loss": 1.1687, + "step": 1660 + }, + { + "epoch": 0.5259417199715707, + "grad_norm": 0.03368534111902494, + "learning_rate": 0.0009211949998172279, + "loss": 1.1499, + "step": 1665 + }, + { + "epoch": 0.5275211245360499, + "grad_norm": 0.03876304390267727, + "learning_rate": 0.0009204505066151709, + "loss": 1.1442, + "step": 1670 + }, + { + "epoch": 0.5291005291005291, + "grad_norm": 0.03718587036595309, + "learning_rate": 0.0009197028170072397, + "loss": 1.0995, + "step": 1675 + }, + { + "epoch": 0.5306799336650083, + "grad_norm": 0.04248209908652153, + "learning_rate": 0.0009189519366776217, + "loss": 1.218, + "step": 1680 + }, + { + "epoch": 0.5322593382294875, + "grad_norm": 0.040832723625780394, + "learning_rate": 0.0009181978713347613, + "loss": 1.1446, + "step": 1685 + }, + { + "epoch": 0.5338387427939667, + "grad_norm": 0.04706059530054778, + "learning_rate": 0.000917440626711316, + "loss": 1.1481, + "step": 1690 + }, + { + "epoch": 0.5354181473584458, + "grad_norm": 0.039309572380869684, + "learning_rate": 0.0009166802085641139, + "loss": 1.1471, + "step": 1695 + }, + { + "epoch": 0.5369975519229251, + "grad_norm": 0.03645680498086478, + "learning_rate": 0.0009159166226741088, + "loss": 1.2631, + "step": 1700 + }, + { + "epoch": 0.5385769564874042, + "grad_norm": 0.03650813201144362, + "learning_rate": 0.000915149874846337, + "loss": 1.0914, + "step": 1705 + }, + { + "epoch": 0.5401563610518835, + "grad_norm": 0.1694935189698132, + "learning_rate": 0.0009143799709098728, + "loss": 1.161, + "step": 1710 + }, + { + "epoch": 0.5417357656163626, + "grad_norm": 0.033172052345546904, + "learning_rate": 0.0009136069167177844, + "loss": 1.0754, + "step": 1715 + }, + { + "epoch": 0.5433151701808419, + "grad_norm": 0.04336773903840129, + "learning_rate": 0.0009128307181470893, + "loss": 1.1658, + "step": 1720 + }, + { + "epoch": 0.544894574745321, + "grad_norm": 0.04377461369596686, + "learning_rate": 0.0009120513810987094, + "loss": 1.1052, + "step": 1725 + }, + { + "epoch": 0.5464739793098002, + "grad_norm": 0.041066160659134626, + "learning_rate": 0.0009112689114974266, + "loss": 1.2059, + "step": 1730 + }, + { + "epoch": 0.5480533838742794, + "grad_norm": 0.04178208265398977, + "learning_rate": 0.0009104833152918375, + "loss": 1.1699, + "step": 1735 + }, + { + "epoch": 0.5496327884387586, + "grad_norm": 0.11328835205617309, + "learning_rate": 0.0009096945984543081, + "loss": 1.1136, + "step": 1740 + }, + { + "epoch": 0.5512121930032378, + "grad_norm": 0.19182622324811044, + "learning_rate": 0.0009089027669809285, + "loss": 1.2703, + "step": 1745 + }, + { + "epoch": 0.552791597567717, + "grad_norm": 0.059773779341214854, + "learning_rate": 0.0009081078268914673, + "loss": 1.2062, + "step": 1750 + }, + { + "epoch": 0.5543710021321961, + "grad_norm": 0.15234917380906907, + "learning_rate": 0.000907309784229326, + "loss": 1.2287, + "step": 1755 + }, + { + "epoch": 0.5559504066966754, + "grad_norm": 0.06586639991112524, + "learning_rate": 0.0009065086450614928, + "loss": 1.1379, + "step": 1760 + }, + { + "epoch": 0.5575298112611545, + "grad_norm": 0.17904872602554633, + "learning_rate": 0.0009057044154784963, + "loss": 1.1576, + "step": 1765 + }, + { + "epoch": 0.5591092158256338, + "grad_norm": 0.03758826384872291, + "learning_rate": 0.0009048971015943599, + "loss": 1.1779, + "step": 1770 + }, + { + "epoch": 0.5606886203901129, + "grad_norm": 0.08338270784298804, + "learning_rate": 0.0009040867095465548, + "loss": 1.2273, + "step": 1775 + }, + { + "epoch": 0.5622680249545922, + "grad_norm": 0.03954640652450269, + "learning_rate": 0.0009032732454959533, + "loss": 1.1036, + "step": 1780 + }, + { + "epoch": 0.5638474295190713, + "grad_norm": 0.0438072209614571, + "learning_rate": 0.000902456715626782, + "loss": 1.2786, + "step": 1785 + }, + { + "epoch": 0.5654268340835505, + "grad_norm": 0.047019840479581396, + "learning_rate": 0.0009016371261465752, + "loss": 1.1721, + "step": 1790 + }, + { + "epoch": 0.5670062386480297, + "grad_norm": 0.06010554299082062, + "learning_rate": 0.0009008144832861272, + "loss": 1.1543, + "step": 1795 + }, + { + "epoch": 0.5685856432125089, + "grad_norm": 0.053420243571307535, + "learning_rate": 0.000899988793299445, + "loss": 1.1894, + "step": 1800 + }, + { + "epoch": 0.5701650477769881, + "grad_norm": 0.03523402052330801, + "learning_rate": 0.0008991600624637013, + "loss": 1.2492, + "step": 1805 + }, + { + "epoch": 0.5717444523414673, + "grad_norm": 0.03576456253696137, + "learning_rate": 0.0008983282970791858, + "loss": 1.2006, + "step": 1810 + }, + { + "epoch": 0.5733238569059464, + "grad_norm": 0.04271916038016165, + "learning_rate": 0.0008974935034692583, + "loss": 1.1345, + "step": 1815 + }, + { + "epoch": 0.5749032614704257, + "grad_norm": 0.06186038459636685, + "learning_rate": 0.0008966556879802998, + "loss": 1.1715, + "step": 1820 + }, + { + "epoch": 0.5764826660349048, + "grad_norm": 0.04007885821089377, + "learning_rate": 0.0008958148569816652, + "loss": 1.1471, + "step": 1825 + }, + { + "epoch": 0.5780620705993841, + "grad_norm": 0.06690675689465636, + "learning_rate": 0.0008949710168656337, + "loss": 1.1694, + "step": 1830 + }, + { + "epoch": 0.5796414751638632, + "grad_norm": 0.054483680948141516, + "learning_rate": 0.0008941241740473612, + "loss": 1.1302, + "step": 1835 + }, + { + "epoch": 0.5812208797283425, + "grad_norm": 0.05283051263253464, + "learning_rate": 0.0008932743349648312, + "loss": 1.1547, + "step": 1840 + }, + { + "epoch": 0.5828002842928216, + "grad_norm": 0.0351874645524128, + "learning_rate": 0.0008924215060788051, + "loss": 1.1842, + "step": 1845 + }, + { + "epoch": 0.5843796888573008, + "grad_norm": 0.03531292908646924, + "learning_rate": 0.000891565693872775, + "loss": 1.1414, + "step": 1850 + }, + { + "epoch": 0.58595909342178, + "grad_norm": 0.03252007276812236, + "learning_rate": 0.0008907069048529122, + "loss": 1.1998, + "step": 1855 + }, + { + "epoch": 0.5875384979862592, + "grad_norm": 0.048953905031448496, + "learning_rate": 0.000889845145548019, + "loss": 1.1882, + "step": 1860 + }, + { + "epoch": 0.5891179025507384, + "grad_norm": 0.03679662565463424, + "learning_rate": 0.000888980422509479, + "loss": 1.2016, + "step": 1865 + }, + { + "epoch": 0.5906973071152176, + "grad_norm": 0.0429312944873228, + "learning_rate": 0.0008881127423112072, + "loss": 1.231, + "step": 1870 + }, + { + "epoch": 0.5922767116796968, + "grad_norm": 0.05085200300369583, + "learning_rate": 0.0008872421115495995, + "loss": 1.1525, + "step": 1875 + }, + { + "epoch": 0.5938561162441759, + "grad_norm": 0.04143984691504309, + "learning_rate": 0.0008863685368434831, + "loss": 1.1573, + "step": 1880 + }, + { + "epoch": 0.5954355208086551, + "grad_norm": 0.054741188359976926, + "learning_rate": 0.0008854920248340662, + "loss": 1.1595, + "step": 1885 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.04157592670579889, + "learning_rate": 0.0008846125821848873, + "loss": 1.1485, + "step": 1890 + }, + { + "epoch": 0.5985943299376135, + "grad_norm": 0.05597913806097826, + "learning_rate": 0.0008837302155817647, + "loss": 1.1551, + "step": 1895 + }, + { + "epoch": 0.6001737345020927, + "grad_norm": 0.04003622147381468, + "learning_rate": 0.0008828449317327452, + "loss": 1.114, + "step": 1900 + }, + { + "epoch": 0.6017531390665719, + "grad_norm": 0.04430881613109523, + "learning_rate": 0.0008819567373680541, + "loss": 1.2302, + "step": 1905 + }, + { + "epoch": 0.603332543631051, + "grad_norm": 0.044159161743787353, + "learning_rate": 0.000881065639240043, + "loss": 1.1455, + "step": 1910 + }, + { + "epoch": 0.6049119481955303, + "grad_norm": 0.040333438068730916, + "learning_rate": 0.0008801716441231386, + "loss": 1.1751, + "step": 1915 + }, + { + "epoch": 0.6064913527600094, + "grad_norm": 0.037659445323191335, + "learning_rate": 0.0008792747588137924, + "loss": 1.1294, + "step": 1920 + }, + { + "epoch": 0.6080707573244887, + "grad_norm": 0.057367139158678554, + "learning_rate": 0.0008783749901304271, + "loss": 1.185, + "step": 1925 + }, + { + "epoch": 0.6096501618889678, + "grad_norm": 0.039046189159429465, + "learning_rate": 0.0008774723449133866, + "loss": 1.1998, + "step": 1930 + }, + { + "epoch": 0.6112295664534471, + "grad_norm": 0.038520766106106066, + "learning_rate": 0.0008765668300248823, + "loss": 1.1858, + "step": 1935 + }, + { + "epoch": 0.6128089710179262, + "grad_norm": 0.06371115248464544, + "learning_rate": 0.000875658452348943, + "loss": 1.2866, + "step": 1940 + }, + { + "epoch": 0.6143883755824054, + "grad_norm": 0.05805583054296386, + "learning_rate": 0.0008747472187913603, + "loss": 1.1246, + "step": 1945 + }, + { + "epoch": 0.6159677801468846, + "grad_norm": 0.0882727755464156, + "learning_rate": 0.0008738331362796375, + "loss": 1.1861, + "step": 1950 + }, + { + "epoch": 0.6175471847113638, + "grad_norm": 0.1291074294139289, + "learning_rate": 0.0008729162117629368, + "loss": 1.172, + "step": 1955 + }, + { + "epoch": 0.619126589275843, + "grad_norm": 0.06752168123783803, + "learning_rate": 0.0008719964522120261, + "loss": 1.1289, + "step": 1960 + }, + { + "epoch": 0.6207059938403222, + "grad_norm": 0.05230307801436827, + "learning_rate": 0.0008710738646192262, + "loss": 1.1345, + "step": 1965 + }, + { + "epoch": 0.6222853984048013, + "grad_norm": 0.05307584281041897, + "learning_rate": 0.0008701484559983577, + "loss": 1.1963, + "step": 1970 + }, + { + "epoch": 0.6238648029692806, + "grad_norm": 0.062313428830658914, + "learning_rate": 0.0008692202333846875, + "loss": 1.1784, + "step": 1975 + }, + { + "epoch": 0.6254442075337597, + "grad_norm": 0.07318456946963307, + "learning_rate": 0.0008682892038348756, + "loss": 1.1756, + "step": 1980 + }, + { + "epoch": 0.627023612098239, + "grad_norm": 0.05035910214366681, + "learning_rate": 0.0008673553744269207, + "loss": 1.1644, + "step": 1985 + }, + { + "epoch": 0.6286030166627181, + "grad_norm": 0.03930070288487179, + "learning_rate": 0.0008664187522601079, + "loss": 1.1885, + "step": 1990 + }, + { + "epoch": 0.6301824212271974, + "grad_norm": 0.04028703538445593, + "learning_rate": 0.0008654793444549531, + "loss": 1.1463, + "step": 1995 + }, + { + "epoch": 0.6317618257916765, + "grad_norm": 0.04340467125837745, + "learning_rate": 0.0008645371581531497, + "loss": 1.1571, + "step": 2000 + }, + { + "epoch": 0.6333412303561557, + "grad_norm": 0.03579329388775005, + "learning_rate": 0.0008635922005175143, + "loss": 1.1222, + "step": 2005 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.039633418005999524, + "learning_rate": 0.0008626444787319319, + "loss": 1.126, + "step": 2010 + }, + { + "epoch": 0.6365000394851141, + "grad_norm": 0.04656715388051395, + "learning_rate": 0.0008616940000013016, + "loss": 1.1351, + "step": 2015 + }, + { + "epoch": 0.6380794440495933, + "grad_norm": 0.04241567297447217, + "learning_rate": 0.0008607407715514819, + "loss": 1.1736, + "step": 2020 + }, + { + "epoch": 0.6396588486140725, + "grad_norm": 0.04272664455329046, + "learning_rate": 0.0008597848006292354, + "loss": 1.16, + "step": 2025 + }, + { + "epoch": 0.6412382531785517, + "grad_norm": 0.050634173141446026, + "learning_rate": 0.0008588260945021737, + "loss": 1.1535, + "step": 2030 + }, + { + "epoch": 0.6428176577430309, + "grad_norm": 0.043629941793955164, + "learning_rate": 0.0008578646604587028, + "loss": 1.1172, + "step": 2035 + }, + { + "epoch": 0.64439706230751, + "grad_norm": 0.037913467140448166, + "learning_rate": 0.0008569005058079671, + "loss": 1.1938, + "step": 2040 + }, + { + "epoch": 0.6459764668719893, + "grad_norm": 0.07347441864594863, + "learning_rate": 0.0008559336378797935, + "loss": 1.2555, + "step": 2045 + }, + { + "epoch": 0.6475558714364684, + "grad_norm": 0.04452704637473851, + "learning_rate": 0.0008549640640246367, + "loss": 1.2069, + "step": 2050 + }, + { + "epoch": 0.6491352760009477, + "grad_norm": 0.04051185517620845, + "learning_rate": 0.0008539917916135227, + "loss": 1.0992, + "step": 2055 + }, + { + "epoch": 0.6507146805654268, + "grad_norm": 0.03344259315550966, + "learning_rate": 0.0008530168280379924, + "loss": 1.2363, + "step": 2060 + }, + { + "epoch": 0.652294085129906, + "grad_norm": 0.051496549798025794, + "learning_rate": 0.0008520391807100465, + "loss": 1.1066, + "step": 2065 + }, + { + "epoch": 0.6538734896943852, + "grad_norm": 0.049313008139448104, + "learning_rate": 0.0008510588570620879, + "loss": 1.1435, + "step": 2070 + }, + { + "epoch": 0.6554528942588644, + "grad_norm": 0.036890992849123165, + "learning_rate": 0.0008500758645468661, + "loss": 1.178, + "step": 2075 + }, + { + "epoch": 0.6570322988233436, + "grad_norm": 0.05000485789172623, + "learning_rate": 0.0008490902106374202, + "loss": 1.1358, + "step": 2080 + }, + { + "epoch": 0.6586117033878228, + "grad_norm": 0.058476338194899814, + "learning_rate": 0.0008481019028270219, + "loss": 1.1535, + "step": 2085 + }, + { + "epoch": 0.660191107952302, + "grad_norm": 0.17384762993524389, + "learning_rate": 0.000847110948629119, + "loss": 1.1709, + "step": 2090 + }, + { + "epoch": 0.6617705125167812, + "grad_norm": 0.05756662582337642, + "learning_rate": 0.0008461173555772779, + "loss": 1.2323, + "step": 2095 + }, + { + "epoch": 0.6633499170812603, + "grad_norm": 0.04628423823864862, + "learning_rate": 0.0008451211312251266, + "loss": 1.1244, + "step": 2100 + }, + { + "epoch": 0.6649293216457396, + "grad_norm": 0.05753687558314109, + "learning_rate": 0.0008441222831462967, + "loss": 1.185, + "step": 2105 + }, + { + "epoch": 0.6665087262102187, + "grad_norm": 0.03541434435667187, + "learning_rate": 0.0008431208189343669, + "loss": 1.1652, + "step": 2110 + }, + { + "epoch": 0.668088130774698, + "grad_norm": 0.06365027743645021, + "learning_rate": 0.0008421167462028039, + "loss": 1.0968, + "step": 2115 + }, + { + "epoch": 0.6696675353391771, + "grad_norm": 0.15796614326365377, + "learning_rate": 0.000841110072584906, + "loss": 1.2558, + "step": 2120 + }, + { + "epoch": 0.6712469399036564, + "grad_norm": 0.04426269547365523, + "learning_rate": 0.0008401008057337437, + "loss": 1.236, + "step": 2125 + }, + { + "epoch": 0.6728263444681355, + "grad_norm": 0.0481957665679776, + "learning_rate": 0.0008390889533221025, + "loss": 1.221, + "step": 2130 + }, + { + "epoch": 0.6744057490326147, + "grad_norm": 0.0742711968724926, + "learning_rate": 0.0008380745230424238, + "loss": 1.1596, + "step": 2135 + }, + { + "epoch": 0.6759851535970939, + "grad_norm": 0.10209227348253318, + "learning_rate": 0.0008370575226067474, + "loss": 1.1409, + "step": 2140 + }, + { + "epoch": 0.6775645581615731, + "grad_norm": 0.03233639963159502, + "learning_rate": 0.0008360379597466518, + "loss": 1.0974, + "step": 2145 + }, + { + "epoch": 0.6791439627260523, + "grad_norm": 0.04112561163083512, + "learning_rate": 0.0008350158422131961, + "loss": 1.1423, + "step": 2150 + }, + { + "epoch": 0.6807233672905315, + "grad_norm": 0.04219217327869605, + "learning_rate": 0.0008339911777768609, + "loss": 1.0994, + "step": 2155 + }, + { + "epoch": 0.6823027718550106, + "grad_norm": 0.037513381823338576, + "learning_rate": 0.0008329639742274892, + "loss": 1.1342, + "step": 2160 + }, + { + "epoch": 0.6838821764194899, + "grad_norm": 0.03657974348037566, + "learning_rate": 0.0008319342393742268, + "loss": 1.1531, + "step": 2165 + }, + { + "epoch": 0.685461580983969, + "grad_norm": 0.05047343123602675, + "learning_rate": 0.0008309019810454643, + "loss": 1.0958, + "step": 2170 + }, + { + "epoch": 0.6870409855484483, + "grad_norm": 0.03243744614986179, + "learning_rate": 0.0008298672070887754, + "loss": 1.1256, + "step": 2175 + }, + { + "epoch": 0.6886203901129274, + "grad_norm": 0.044261034687072306, + "learning_rate": 0.0008288299253708595, + "loss": 1.0999, + "step": 2180 + }, + { + "epoch": 0.6901997946774067, + "grad_norm": 0.03619490329735276, + "learning_rate": 0.00082779014377748, + "loss": 1.1721, + "step": 2185 + }, + { + "epoch": 0.6917791992418858, + "grad_norm": 0.04146064866463831, + "learning_rate": 0.000826747870213406, + "loss": 1.1727, + "step": 2190 + }, + { + "epoch": 0.693358603806365, + "grad_norm": 0.04849711422712515, + "learning_rate": 0.0008257031126023504, + "loss": 1.1482, + "step": 2195 + }, + { + "epoch": 0.6949380083708442, + "grad_norm": 0.040296226435196425, + "learning_rate": 0.0008246558788869116, + "loss": 1.1527, + "step": 2200 + }, + { + "epoch": 0.6965174129353234, + "grad_norm": 0.03293749678341955, + "learning_rate": 0.0008236061770285119, + "loss": 1.1148, + "step": 2205 + }, + { + "epoch": 0.6980968174998026, + "grad_norm": 0.042233291320546755, + "learning_rate": 0.0008225540150073371, + "loss": 1.2206, + "step": 2210 + }, + { + "epoch": 0.6996762220642818, + "grad_norm": 0.03827298591812442, + "learning_rate": 0.0008214994008222758, + "loss": 1.1384, + "step": 2215 + }, + { + "epoch": 0.701255626628761, + "grad_norm": 0.045374923911993686, + "learning_rate": 0.0008204423424908591, + "loss": 1.1324, + "step": 2220 + }, + { + "epoch": 0.7028350311932402, + "grad_norm": 0.03238064122889688, + "learning_rate": 0.0008193828480491995, + "loss": 1.1544, + "step": 2225 + }, + { + "epoch": 0.7044144357577193, + "grad_norm": 0.04380634028979496, + "learning_rate": 0.000818320925551929, + "loss": 1.0635, + "step": 2230 + }, + { + "epoch": 0.7059938403221986, + "grad_norm": 0.0370632104302981, + "learning_rate": 0.000817256583072139, + "loss": 1.0649, + "step": 2235 + }, + { + "epoch": 0.7075732448866777, + "grad_norm": 0.031093840596031517, + "learning_rate": 0.0008161898287013184, + "loss": 1.1048, + "step": 2240 + }, + { + "epoch": 0.709152649451157, + "grad_norm": 0.03207987681813833, + "learning_rate": 0.0008151206705492919, + "loss": 1.1286, + "step": 2245 + }, + { + "epoch": 0.7107320540156361, + "grad_norm": 0.04164109172225732, + "learning_rate": 0.0008140491167441584, + "loss": 1.1099, + "step": 2250 + }, + { + "epoch": 0.7123114585801152, + "grad_norm": 0.03400416322772637, + "learning_rate": 0.0008129751754322299, + "loss": 1.1361, + "step": 2255 + }, + { + "epoch": 0.7138908631445945, + "grad_norm": 0.04592151794322222, + "learning_rate": 0.0008118988547779687, + "loss": 1.1323, + "step": 2260 + }, + { + "epoch": 0.7154702677090736, + "grad_norm": 0.036229621957410515, + "learning_rate": 0.0008108201629639256, + "loss": 1.1368, + "step": 2265 + }, + { + "epoch": 0.7170496722735529, + "grad_norm": 0.04151662660096531, + "learning_rate": 0.0008097391081906777, + "loss": 1.1346, + "step": 2270 + }, + { + "epoch": 0.718629076838032, + "grad_norm": 0.04873978029799004, + "learning_rate": 0.0008086556986767663, + "loss": 1.134, + "step": 2275 + }, + { + "epoch": 0.7202084814025113, + "grad_norm": 0.03333780638415451, + "learning_rate": 0.0008075699426586344, + "loss": 1.1413, + "step": 2280 + }, + { + "epoch": 0.7217878859669904, + "grad_norm": 0.034640199910440705, + "learning_rate": 0.0008064818483905634, + "loss": 1.1857, + "step": 2285 + }, + { + "epoch": 0.7233672905314696, + "grad_norm": 0.04818114443150759, + "learning_rate": 0.0008053914241446112, + "loss": 1.1447, + "step": 2290 + }, + { + "epoch": 0.7249466950959488, + "grad_norm": 0.036283227351149444, + "learning_rate": 0.000804298678210549, + "loss": 1.0211, + "step": 2295 + }, + { + "epoch": 0.726526099660428, + "grad_norm": 0.03357453215492695, + "learning_rate": 0.0008032036188957982, + "loss": 1.1105, + "step": 2300 + }, + { + "epoch": 0.7281055042249072, + "grad_norm": 0.034911998236058436, + "learning_rate": 0.0008021062545253672, + "loss": 1.1321, + "step": 2305 + }, + { + "epoch": 0.7296849087893864, + "grad_norm": 0.03072089247830198, + "learning_rate": 0.0008010065934417881, + "loss": 1.118, + "step": 2310 + }, + { + "epoch": 0.7312643133538655, + "grad_norm": 0.033945552237615126, + "learning_rate": 0.0007999046440050538, + "loss": 1.1327, + "step": 2315 + }, + { + "epoch": 0.7328437179183448, + "grad_norm": 0.036623240485144815, + "learning_rate": 0.0007988004145925538, + "loss": 1.1336, + "step": 2320 + }, + { + "epoch": 0.7344231224828239, + "grad_norm": 0.034765981851321515, + "learning_rate": 0.0007976939135990106, + "loss": 1.109, + "step": 2325 + }, + { + "epoch": 0.7360025270473032, + "grad_norm": 0.03112055116207592, + "learning_rate": 0.000796585149436416, + "loss": 1.0639, + "step": 2330 + }, + { + "epoch": 0.7375819316117823, + "grad_norm": 0.03787672321570991, + "learning_rate": 0.0007954741305339676, + "loss": 1.1191, + "step": 2335 + }, + { + "epoch": 0.7391613361762616, + "grad_norm": 0.05596768906351211, + "learning_rate": 0.000794360865338004, + "loss": 1.1129, + "step": 2340 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.04646968935035901, + "learning_rate": 0.0007932453623119407, + "loss": 1.1535, + "step": 2345 + }, + { + "epoch": 0.74232014530522, + "grad_norm": 0.03287872636040462, + "learning_rate": 0.0007921276299362062, + "loss": 1.0685, + "step": 2350 + }, + { + "epoch": 0.7438995498696991, + "grad_norm": 0.03761813831687904, + "learning_rate": 0.0007910076767081772, + "loss": 1.1851, + "step": 2355 + }, + { + "epoch": 0.7454789544341783, + "grad_norm": 0.04154317590568981, + "learning_rate": 0.0007898855111421139, + "loss": 1.0641, + "step": 2360 + }, + { + "epoch": 0.7470583589986575, + "grad_norm": 0.03225872527047046, + "learning_rate": 0.0007887611417690958, + "loss": 1.0459, + "step": 2365 + }, + { + "epoch": 0.7486377635631367, + "grad_norm": 0.07034127382923797, + "learning_rate": 0.0007876345771369564, + "loss": 1.301, + "step": 2370 + }, + { + "epoch": 0.7502171681276159, + "grad_norm": 0.04449126769031474, + "learning_rate": 0.0007865058258102177, + "loss": 1.1004, + "step": 2375 + }, + { + "epoch": 0.7517965726920951, + "grad_norm": 0.0350145043034834, + "learning_rate": 0.0007853748963700264, + "loss": 1.1672, + "step": 2380 + }, + { + "epoch": 0.7533759772565742, + "grad_norm": 0.03748553636786378, + "learning_rate": 0.0007842417974140879, + "loss": 1.1193, + "step": 2385 + }, + { + "epoch": 0.7549553818210535, + "grad_norm": 0.030937914052851834, + "learning_rate": 0.0007831065375566004, + "loss": 1.0449, + "step": 2390 + }, + { + "epoch": 0.7565347863855326, + "grad_norm": 0.034083331472644675, + "learning_rate": 0.0007819691254281905, + "loss": 1.0966, + "step": 2395 + }, + { + "epoch": 0.7581141909500119, + "grad_norm": 0.03047192172455573, + "learning_rate": 0.0007808295696758472, + "loss": 1.0874, + "step": 2400 + }, + { + "epoch": 0.759693595514491, + "grad_norm": 0.03554221631207444, + "learning_rate": 0.0007796878789628555, + "loss": 1.0696, + "step": 2405 + }, + { + "epoch": 0.7612730000789703, + "grad_norm": 0.03538164542716929, + "learning_rate": 0.0007785440619687316, + "loss": 1.1272, + "step": 2410 + }, + { + "epoch": 0.7628524046434494, + "grad_norm": 0.036853427979659625, + "learning_rate": 0.0007773981273891562, + "loss": 1.1094, + "step": 2415 + }, + { + "epoch": 0.7644318092079286, + "grad_norm": 0.04257817482887054, + "learning_rate": 0.0007762500839359084, + "loss": 1.1402, + "step": 2420 + }, + { + "epoch": 0.7660112137724078, + "grad_norm": 0.044417776153948235, + "learning_rate": 0.0007750999403368001, + "loss": 1.1064, + "step": 2425 + }, + { + "epoch": 0.767590618336887, + "grad_norm": 0.04767857151182902, + "learning_rate": 0.000773947705335609, + "loss": 1.0984, + "step": 2430 + }, + { + "epoch": 0.7691700229013662, + "grad_norm": 0.04717634798528141, + "learning_rate": 0.0007727933876920121, + "loss": 1.1129, + "step": 2435 + }, + { + "epoch": 0.7707494274658454, + "grad_norm": 0.03626272630855877, + "learning_rate": 0.0007716369961815199, + "loss": 1.1507, + "step": 2440 + }, + { + "epoch": 0.7723288320303245, + "grad_norm": 0.03522532598234683, + "learning_rate": 0.0007704785395954085, + "loss": 1.0645, + "step": 2445 + }, + { + "epoch": 0.7739082365948038, + "grad_norm": 0.03559706129851639, + "learning_rate": 0.0007693180267406539, + "loss": 1.1182, + "step": 2450 + }, + { + "epoch": 0.7754876411592829, + "grad_norm": 0.06678349957404967, + "learning_rate": 0.000768155466439864, + "loss": 1.1167, + "step": 2455 + }, + { + "epoch": 0.7770670457237622, + "grad_norm": 0.04029787951031313, + "learning_rate": 0.0007669908675312128, + "loss": 1.1288, + "step": 2460 + }, + { + "epoch": 0.7786464502882413, + "grad_norm": 0.060731887918175714, + "learning_rate": 0.000765824238868372, + "loss": 1.1212, + "step": 2465 + }, + { + "epoch": 0.7802258548527206, + "grad_norm": 0.029437468762583706, + "learning_rate": 0.0007646555893204442, + "loss": 1.0918, + "step": 2470 + }, + { + "epoch": 0.7818052594171997, + "grad_norm": 0.03426412759745619, + "learning_rate": 0.0007634849277718956, + "loss": 1.1042, + "step": 2475 + }, + { + "epoch": 0.7833846639816789, + "grad_norm": 0.043626338906299925, + "learning_rate": 0.0007623122631224881, + "loss": 1.0942, + "step": 2480 + }, + { + "epoch": 0.7849640685461581, + "grad_norm": 0.06053739130525586, + "learning_rate": 0.0007611376042872121, + "loss": 1.1401, + "step": 2485 + }, + { + "epoch": 0.7865434731106373, + "grad_norm": 0.04934003230834547, + "learning_rate": 0.0007599609601962183, + "loss": 1.1509, + "step": 2490 + }, + { + "epoch": 0.7881228776751165, + "grad_norm": 0.032994806646679455, + "learning_rate": 0.00075878233979475, + "loss": 1.1306, + "step": 2495 + }, + { + "epoch": 0.7897022822395957, + "grad_norm": 0.057874354577367164, + "learning_rate": 0.000757601752043075, + "loss": 1.1152, + "step": 2500 + }, + { + "epoch": 0.7912816868040748, + "grad_norm": 0.036916846930438035, + "learning_rate": 0.0007564192059164176, + "loss": 1.1254, + "step": 2505 + }, + { + "epoch": 0.7928610913685541, + "grad_norm": 0.03622601094242121, + "learning_rate": 0.0007552347104048908, + "loss": 1.1797, + "step": 2510 + }, + { + "epoch": 0.7944404959330332, + "grad_norm": 0.04034014908393007, + "learning_rate": 0.0007540482745134266, + "loss": 1.0887, + "step": 2515 + }, + { + "epoch": 0.7960199004975125, + "grad_norm": 0.04086451098985416, + "learning_rate": 0.000752859907261709, + "loss": 1.1606, + "step": 2520 + }, + { + "epoch": 0.7975993050619916, + "grad_norm": 0.04230873404823367, + "learning_rate": 0.0007516696176841048, + "loss": 1.1413, + "step": 2525 + }, + { + "epoch": 0.7991787096264709, + "grad_norm": 0.053041815848766695, + "learning_rate": 0.000750477414829595, + "loss": 1.2506, + "step": 2530 + }, + { + "epoch": 0.80075811419095, + "grad_norm": 0.038500571689324685, + "learning_rate": 0.0007492833077617053, + "loss": 1.2014, + "step": 2535 + }, + { + "epoch": 0.8023375187554292, + "grad_norm": 0.04095622602441375, + "learning_rate": 0.0007480873055584392, + "loss": 1.1081, + "step": 2540 + }, + { + "epoch": 0.8039169233199084, + "grad_norm": 0.06242247746111767, + "learning_rate": 0.0007468894173122063, + "loss": 1.1474, + "step": 2545 + }, + { + "epoch": 0.8054963278843876, + "grad_norm": 0.052696786458390654, + "learning_rate": 0.0007456896521297554, + "loss": 1.1867, + "step": 2550 + }, + { + "epoch": 0.8070757324488668, + "grad_norm": 0.03907948162129907, + "learning_rate": 0.000744488019132104, + "loss": 1.1588, + "step": 2555 + }, + { + "epoch": 0.808655137013346, + "grad_norm": 0.03537195704084729, + "learning_rate": 0.0007432845274544695, + "loss": 1.0816, + "step": 2560 + }, + { + "epoch": 0.8102345415778252, + "grad_norm": 0.0492539507126839, + "learning_rate": 0.0007420791862461997, + "loss": 1.0672, + "step": 2565 + }, + { + "epoch": 0.8118139461423044, + "grad_norm": 0.041180842720502576, + "learning_rate": 0.0007408720046707027, + "loss": 1.0945, + "step": 2570 + }, + { + "epoch": 0.8133933507067835, + "grad_norm": 0.03595581716482617, + "learning_rate": 0.0007396629919053785, + "loss": 1.1398, + "step": 2575 + }, + { + "epoch": 0.8149727552712628, + "grad_norm": 0.06860020808618335, + "learning_rate": 0.0007384521571415475, + "loss": 1.1856, + "step": 2580 + }, + { + "epoch": 0.8165521598357419, + "grad_norm": 0.034288016394964434, + "learning_rate": 0.0007372395095843823, + "loss": 1.0849, + "step": 2585 + }, + { + "epoch": 0.8181315644002212, + "grad_norm": 0.049671673626027694, + "learning_rate": 0.0007360250584528363, + "loss": 1.2389, + "step": 2590 + }, + { + "epoch": 0.8197109689647003, + "grad_norm": 0.04438153524293192, + "learning_rate": 0.000734808812979575, + "loss": 1.1411, + "step": 2595 + }, + { + "epoch": 0.8212903735291796, + "grad_norm": 0.030735268017338425, + "learning_rate": 0.0007335907824109046, + "loss": 1.1131, + "step": 2600 + }, + { + "epoch": 0.8228697780936587, + "grad_norm": 0.03405757193751862, + "learning_rate": 0.0007323709760067023, + "loss": 1.0823, + "step": 2605 + }, + { + "epoch": 0.8244491826581379, + "grad_norm": 0.030663575416957994, + "learning_rate": 0.0007311494030403458, + "loss": 1.1149, + "step": 2610 + }, + { + "epoch": 0.8260285872226171, + "grad_norm": 0.03208737020004201, + "learning_rate": 0.0007299260727986428, + "loss": 1.1078, + "step": 2615 + }, + { + "epoch": 0.8276079917870963, + "grad_norm": 0.040596884547164494, + "learning_rate": 0.0007287009945817605, + "loss": 1.072, + "step": 2620 + }, + { + "epoch": 0.8291873963515755, + "grad_norm": 0.037091725728933614, + "learning_rate": 0.0007274741777031544, + "loss": 1.1415, + "step": 2625 + }, + { + "epoch": 0.8307668009160546, + "grad_norm": 0.036465623744331946, + "learning_rate": 0.0007262456314894985, + "loss": 1.0951, + "step": 2630 + }, + { + "epoch": 0.8323462054805338, + "grad_norm": 0.0342176930658027, + "learning_rate": 0.0007250153652806133, + "loss": 1.119, + "step": 2635 + }, + { + "epoch": 0.833925610045013, + "grad_norm": 0.03140323676971682, + "learning_rate": 0.0007237833884293955, + "loss": 1.0438, + "step": 2640 + }, + { + "epoch": 0.8355050146094922, + "grad_norm": 0.03890983036796517, + "learning_rate": 0.0007225497103017467, + "loss": 1.0783, + "step": 2645 + }, + { + "epoch": 0.8370844191739714, + "grad_norm": 0.04169345402666985, + "learning_rate": 0.0007213143402765021, + "loss": 1.1149, + "step": 2650 + }, + { + "epoch": 0.8386638237384506, + "grad_norm": 0.05149900937684697, + "learning_rate": 0.0007200772877453593, + "loss": 1.1519, + "step": 2655 + }, + { + "epoch": 0.8402432283029297, + "grad_norm": 0.030798303288645443, + "learning_rate": 0.0007188385621128067, + "loss": 1.0422, + "step": 2660 + }, + { + "epoch": 0.841822632867409, + "grad_norm": 0.029135350330774433, + "learning_rate": 0.0007175981727960526, + "loss": 1.039, + "step": 2665 + }, + { + "epoch": 0.8434020374318881, + "grad_norm": 0.03493548525906682, + "learning_rate": 0.0007163561292249525, + "loss": 1.1766, + "step": 2670 + }, + { + "epoch": 0.8449814419963674, + "grad_norm": 0.03229186110696721, + "learning_rate": 0.0007151124408419389, + "loss": 1.0602, + "step": 2675 + }, + { + "epoch": 0.8465608465608465, + "grad_norm": 0.0763692511304746, + "learning_rate": 0.0007138671171019481, + "loss": 1.1464, + "step": 2680 + }, + { + "epoch": 0.8481402511253258, + "grad_norm": 0.046494324492253956, + "learning_rate": 0.0007126201674723492, + "loss": 1.1064, + "step": 2685 + }, + { + "epoch": 0.8497196556898049, + "grad_norm": 0.04244692069264259, + "learning_rate": 0.000711371601432872, + "loss": 1.1527, + "step": 2690 + }, + { + "epoch": 0.8512990602542841, + "grad_norm": 0.05873271313913313, + "learning_rate": 0.0007101214284755344, + "loss": 1.1121, + "step": 2695 + }, + { + "epoch": 0.8528784648187633, + "grad_norm": 0.038375326877799966, + "learning_rate": 0.000708869658104571, + "loss": 1.1012, + "step": 2700 + }, + { + "epoch": 0.8544578693832425, + "grad_norm": 0.04665919943954757, + "learning_rate": 0.0007076162998363603, + "loss": 1.0935, + "step": 2705 + }, + { + "epoch": 0.8560372739477217, + "grad_norm": 0.03946285449664685, + "learning_rate": 0.0007063613631993523, + "loss": 1.1081, + "step": 2710 + }, + { + "epoch": 0.8576166785122009, + "grad_norm": 0.04828063508924996, + "learning_rate": 0.0007051048577339968, + "loss": 1.1164, + "step": 2715 + }, + { + "epoch": 0.85919608307668, + "grad_norm": 0.06428297700542004, + "learning_rate": 0.00070384679299267, + "loss": 1.0972, + "step": 2720 + }, + { + "epoch": 0.8607754876411593, + "grad_norm": 0.03538152451128835, + "learning_rate": 0.0007025871785396023, + "loss": 1.1695, + "step": 2725 + }, + { + "epoch": 0.8623548922056384, + "grad_norm": 0.03802128698750148, + "learning_rate": 0.0007013260239508055, + "loss": 1.1753, + "step": 2730 + }, + { + "epoch": 0.8639342967701177, + "grad_norm": 0.048080492082236435, + "learning_rate": 0.0007000633388140002, + "loss": 1.177, + "step": 2735 + }, + { + "epoch": 0.8655137013345968, + "grad_norm": 0.028063478917697556, + "learning_rate": 0.0006987991327285425, + "loss": 1.0636, + "step": 2740 + }, + { + "epoch": 0.8670931058990761, + "grad_norm": 0.03558860357027592, + "learning_rate": 0.0006975334153053517, + "loss": 1.065, + "step": 2745 + }, + { + "epoch": 0.8686725104635552, + "grad_norm": 0.03215642075882598, + "learning_rate": 0.0006962661961668362, + "loss": 1.0859, + "step": 2750 + }, + { + "epoch": 0.8702519150280345, + "grad_norm": 0.032309742975668077, + "learning_rate": 0.0006949974849468212, + "loss": 1.1089, + "step": 2755 + }, + { + "epoch": 0.8718313195925136, + "grad_norm": 0.02951782677048253, + "learning_rate": 0.0006937272912904755, + "loss": 1.0983, + "step": 2760 + }, + { + "epoch": 0.8734107241569928, + "grad_norm": 0.036426668611823296, + "learning_rate": 0.0006924556248542373, + "loss": 1.129, + "step": 2765 + }, + { + "epoch": 0.874990128721472, + "grad_norm": 0.03957474298496717, + "learning_rate": 0.0006911824953057419, + "loss": 1.0902, + "step": 2770 + }, + { + "epoch": 0.8765695332859512, + "grad_norm": 0.03354407883805116, + "learning_rate": 0.0006899079123237473, + "loss": 1.124, + "step": 2775 + }, + { + "epoch": 0.8781489378504304, + "grad_norm": 0.041213643772183596, + "learning_rate": 0.0006886318855980611, + "loss": 1.1636, + "step": 2780 + }, + { + "epoch": 0.8797283424149096, + "grad_norm": 0.030287657014638987, + "learning_rate": 0.0006873544248294671, + "loss": 1.0797, + "step": 2785 + }, + { + "epoch": 0.8813077469793887, + "grad_norm": 0.03247509895879481, + "learning_rate": 0.0006860755397296505, + "loss": 1.0849, + "step": 2790 + }, + { + "epoch": 0.882887151543868, + "grad_norm": 0.04023725812499706, + "learning_rate": 0.0006847952400211252, + "loss": 1.1162, + "step": 2795 + }, + { + "epoch": 0.8844665561083471, + "grad_norm": 0.03890541651074166, + "learning_rate": 0.0006835135354371593, + "loss": 1.1291, + "step": 2800 + }, + { + "epoch": 0.8860459606728264, + "grad_norm": 0.03597174755417176, + "learning_rate": 0.0006822304357217013, + "loss": 1.1038, + "step": 2805 + }, + { + "epoch": 0.8876253652373055, + "grad_norm": 0.044296988823722266, + "learning_rate": 0.0006809459506293057, + "loss": 1.1106, + "step": 2810 + }, + { + "epoch": 0.8892047698017848, + "grad_norm": 0.031888241864467884, + "learning_rate": 0.0006796600899250596, + "loss": 1.0465, + "step": 2815 + }, + { + "epoch": 0.8907841743662639, + "grad_norm": 0.03375688432946613, + "learning_rate": 0.0006783728633845076, + "loss": 1.1087, + "step": 2820 + }, + { + "epoch": 0.8923635789307431, + "grad_norm": 0.03141469193813306, + "learning_rate": 0.0006770842807935777, + "loss": 1.111, + "step": 2825 + }, + { + "epoch": 0.8939429834952223, + "grad_norm": 0.029725912599760738, + "learning_rate": 0.0006757943519485075, + "loss": 1.1024, + "step": 2830 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.06555190731683326, + "learning_rate": 0.0006745030866557691, + "loss": 1.092, + "step": 2835 + }, + { + "epoch": 0.8971017926241807, + "grad_norm": 0.04010032166975462, + "learning_rate": 0.0006732104947319942, + "loss": 1.0763, + "step": 2840 + }, + { + "epoch": 0.8986811971886599, + "grad_norm": 0.04456482558579905, + "learning_rate": 0.0006719165860039009, + "loss": 1.121, + "step": 2845 + }, + { + "epoch": 0.900260601753139, + "grad_norm": 0.03629358875250488, + "learning_rate": 0.0006706213703082176, + "loss": 1.1179, + "step": 2850 + }, + { + "epoch": 0.9018400063176183, + "grad_norm": 0.03263092062169253, + "learning_rate": 0.0006693248574916086, + "loss": 1.0806, + "step": 2855 + }, + { + "epoch": 0.9034194108820974, + "grad_norm": 0.040114231565220586, + "learning_rate": 0.0006680270574105997, + "loss": 1.1045, + "step": 2860 + }, + { + "epoch": 0.9049988154465767, + "grad_norm": 0.032824962630205234, + "learning_rate": 0.0006667279799315025, + "loss": 1.0758, + "step": 2865 + }, + { + "epoch": 0.9065782200110558, + "grad_norm": 0.04457230563249612, + "learning_rate": 0.0006654276349303402, + "loss": 1.0575, + "step": 2870 + }, + { + "epoch": 0.9081576245755351, + "grad_norm": 0.07752050734868508, + "learning_rate": 0.0006641260322927718, + "loss": 1.0644, + "step": 2875 + }, + { + "epoch": 0.9097370291400142, + "grad_norm": 0.035445999376309226, + "learning_rate": 0.0006628231819140175, + "loss": 1.1192, + "step": 2880 + }, + { + "epoch": 0.9113164337044934, + "grad_norm": 0.05929887171190351, + "learning_rate": 0.0006615190936987833, + "loss": 1.0944, + "step": 2885 + }, + { + "epoch": 0.9128958382689726, + "grad_norm": 0.03267363437553562, + "learning_rate": 0.0006602137775611853, + "loss": 1.0551, + "step": 2890 + }, + { + "epoch": 0.9144752428334518, + "grad_norm": 0.029073524110190204, + "learning_rate": 0.000658907243424675, + "loss": 1.0458, + "step": 2895 + }, + { + "epoch": 0.916054647397931, + "grad_norm": 0.046992289036764744, + "learning_rate": 0.0006575995012219636, + "loss": 1.1098, + "step": 2900 + }, + { + "epoch": 0.9176340519624102, + "grad_norm": 0.037215881233934045, + "learning_rate": 0.000656290560894946, + "loss": 1.1957, + "step": 2905 + }, + { + "epoch": 0.9192134565268893, + "grad_norm": 0.037314969757336794, + "learning_rate": 0.0006549804323946261, + "loss": 1.067, + "step": 2910 + }, + { + "epoch": 0.9207928610913686, + "grad_norm": 0.05935064237729512, + "learning_rate": 0.0006536691256810404, + "loss": 1.0819, + "step": 2915 + }, + { + "epoch": 0.9223722656558477, + "grad_norm": 0.039382890006385236, + "learning_rate": 0.0006523566507231827, + "loss": 1.0803, + "step": 2920 + }, + { + "epoch": 0.923951670220327, + "grad_norm": 0.032551625760060814, + "learning_rate": 0.0006510430174989281, + "loss": 1.0521, + "step": 2925 + }, + { + "epoch": 0.9255310747848061, + "grad_norm": 0.03084938699937599, + "learning_rate": 0.0006497282359949574, + "loss": 1.0464, + "step": 2930 + }, + { + "epoch": 0.9271104793492854, + "grad_norm": 0.03540679422108104, + "learning_rate": 0.000648412316206681, + "loss": 1.0569, + "step": 2935 + }, + { + "epoch": 0.9286898839137645, + "grad_norm": 0.02845818505979114, + "learning_rate": 0.0006470952681381626, + "loss": 1.093, + "step": 2940 + }, + { + "epoch": 0.9302692884782437, + "grad_norm": 0.04412753782252594, + "learning_rate": 0.0006457771018020435, + "loss": 1.0981, + "step": 2945 + }, + { + "epoch": 0.9318486930427229, + "grad_norm": 0.032380564517949934, + "learning_rate": 0.0006444578272194672, + "loss": 1.1071, + "step": 2950 + }, + { + "epoch": 0.9334280976072021, + "grad_norm": 0.02960305717253739, + "learning_rate": 0.0006431374544200013, + "loss": 1.0815, + "step": 2955 + }, + { + "epoch": 0.9350075021716813, + "grad_norm": 0.0385572133108294, + "learning_rate": 0.0006418159934415634, + "loss": 1.0636, + "step": 2960 + }, + { + "epoch": 0.9365869067361605, + "grad_norm": 0.03940359113486163, + "learning_rate": 0.0006404934543303431, + "loss": 1.115, + "step": 2965 + }, + { + "epoch": 0.9381663113006397, + "grad_norm": 0.047952221587856184, + "learning_rate": 0.0006391698471407269, + "loss": 1.0677, + "step": 2970 + }, + { + "epoch": 0.9397457158651189, + "grad_norm": 0.042843188079932856, + "learning_rate": 0.0006378451819352206, + "loss": 1.1122, + "step": 2975 + }, + { + "epoch": 0.941325120429598, + "grad_norm": 0.03449156963379797, + "learning_rate": 0.0006365194687843743, + "loss": 1.1886, + "step": 2980 + }, + { + "epoch": 0.9429045249940773, + "grad_norm": 0.028173240496093405, + "learning_rate": 0.0006351927177667036, + "loss": 1.0572, + "step": 2985 + }, + { + "epoch": 0.9444839295585564, + "grad_norm": 0.030807817303202586, + "learning_rate": 0.0006338649389686157, + "loss": 1.1408, + "step": 2990 + }, + { + "epoch": 0.9460633341230357, + "grad_norm": 0.03588108022175335, + "learning_rate": 0.0006325361424843304, + "loss": 1.1679, + "step": 2995 + }, + { + "epoch": 0.9476427386875148, + "grad_norm": 0.030002262330937383, + "learning_rate": 0.0006312063384158043, + "loss": 1.067, + "step": 3000 + }, + { + "epoch": 0.9492221432519939, + "grad_norm": 0.04075260536403215, + "learning_rate": 0.0006298755368726548, + "loss": 1.1673, + "step": 3005 + }, + { + "epoch": 0.9508015478164732, + "grad_norm": 0.04006476615687794, + "learning_rate": 0.0006285437479720817, + "loss": 1.1876, + "step": 3010 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.02917383969273514, + "learning_rate": 0.0006272109818387909, + "loss": 1.1357, + "step": 3015 + }, + { + "epoch": 0.9539603569454316, + "grad_norm": 0.03406796068036105, + "learning_rate": 0.0006258772486049185, + "loss": 1.1703, + "step": 3020 + }, + { + "epoch": 0.9555397615099107, + "grad_norm": 0.05189254290140413, + "learning_rate": 0.0006245425584099518, + "loss": 1.1679, + "step": 3025 + }, + { + "epoch": 0.95711916607439, + "grad_norm": 0.03255022955779678, + "learning_rate": 0.0006232069214006536, + "loss": 1.0853, + "step": 3030 + }, + { + "epoch": 0.9586985706388691, + "grad_norm": 0.032704936361272335, + "learning_rate": 0.000621870347730985, + "loss": 1.116, + "step": 3035 + }, + { + "epoch": 0.9602779752033483, + "grad_norm": 0.03606712436906053, + "learning_rate": 0.0006205328475620275, + "loss": 1.1576, + "step": 3040 + }, + { + "epoch": 0.9618573797678275, + "grad_norm": 0.03527900701015813, + "learning_rate": 0.0006191944310619065, + "loss": 1.1359, + "step": 3045 + }, + { + "epoch": 0.9634367843323067, + "grad_norm": 0.04489452824337388, + "learning_rate": 0.0006178551084057134, + "loss": 1.1915, + "step": 3050 + }, + { + "epoch": 0.9650161888967859, + "grad_norm": 0.029752863180517432, + "learning_rate": 0.0006165148897754282, + "loss": 1.0507, + "step": 3055 + }, + { + "epoch": 0.9665955934612651, + "grad_norm": 0.03416207058808192, + "learning_rate": 0.0006151737853598432, + "loss": 1.1158, + "step": 3060 + }, + { + "epoch": 0.9681749980257442, + "grad_norm": 0.029662235175984018, + "learning_rate": 0.0006138318053544842, + "loss": 1.0316, + "step": 3065 + }, + { + "epoch": 0.9697544025902235, + "grad_norm": 0.05275392483944112, + "learning_rate": 0.0006124889599615336, + "loss": 1.0265, + "step": 3070 + }, + { + "epoch": 0.9713338071547026, + "grad_norm": 0.03356895671978958, + "learning_rate": 0.0006111452593897526, + "loss": 1.038, + "step": 3075 + }, + { + "epoch": 0.9729132117191819, + "grad_norm": 0.04726474414038403, + "learning_rate": 0.0006098007138544044, + "loss": 1.1369, + "step": 3080 + }, + { + "epoch": 0.974492616283661, + "grad_norm": 0.112571769030443, + "learning_rate": 0.0006084553335771749, + "loss": 1.0957, + "step": 3085 + }, + { + "epoch": 0.9760720208481403, + "grad_norm": 0.05682154704218681, + "learning_rate": 0.0006071091287860972, + "loss": 1.0644, + "step": 3090 + }, + { + "epoch": 0.9776514254126194, + "grad_norm": 0.045062530416325526, + "learning_rate": 0.0006057621097154715, + "loss": 1.1487, + "step": 3095 + }, + { + "epoch": 0.9792308299770986, + "grad_norm": 0.04500300458485399, + "learning_rate": 0.000604414286605789, + "loss": 1.148, + "step": 3100 + }, + { + "epoch": 0.9808102345415778, + "grad_norm": 0.03222115450318305, + "learning_rate": 0.0006030656697036534, + "loss": 1.0628, + "step": 3105 + }, + { + "epoch": 0.982389639106057, + "grad_norm": 0.03405499215516374, + "learning_rate": 0.0006017162692617031, + "loss": 1.0925, + "step": 3110 + }, + { + "epoch": 0.9839690436705362, + "grad_norm": 0.028050905956532823, + "learning_rate": 0.0006003660955385331, + "loss": 1.1066, + "step": 3115 + }, + { + "epoch": 0.9855484482350154, + "grad_norm": 0.029921219590913206, + "learning_rate": 0.0005990151587986171, + "loss": 1.1475, + "step": 3120 + }, + { + "epoch": 0.9871278527994946, + "grad_norm": 0.035043774118588174, + "learning_rate": 0.0005976634693122298, + "loss": 1.0828, + "step": 3125 + }, + { + "epoch": 0.9887072573639738, + "grad_norm": 0.03437726161120859, + "learning_rate": 0.0005963110373553686, + "loss": 1.1099, + "step": 3130 + }, + { + "epoch": 0.9902866619284529, + "grad_norm": 0.03221755496940865, + "learning_rate": 0.0005949578732096746, + "loss": 1.0652, + "step": 3135 + }, + { + "epoch": 0.9918660664929322, + "grad_norm": 0.036914998622386334, + "learning_rate": 0.0005936039871623563, + "loss": 1.0891, + "step": 3140 + }, + { + "epoch": 0.9934454710574113, + "grad_norm": 0.03178618200706658, + "learning_rate": 0.0005922493895061098, + "loss": 1.0234, + "step": 3145 + }, + { + "epoch": 0.9950248756218906, + "grad_norm": 0.028528350246715833, + "learning_rate": 0.0005908940905390408, + "loss": 1.0395, + "step": 3150 + }, + { + "epoch": 0.9966042801863697, + "grad_norm": 0.04501200819230332, + "learning_rate": 0.0005895381005645874, + "loss": 1.1871, + "step": 3155 + }, + { + "epoch": 0.998183684750849, + "grad_norm": 0.03144475794822077, + "learning_rate": 0.0005881814298914402, + "loss": 1.094, + "step": 3160 + }, + { + "epoch": 0.9997630893153281, + "grad_norm": 0.04082120452582063, + "learning_rate": 0.0005868240888334653, + "loss": 1.0823, + "step": 3165 + }, + { + "epoch": 0.9997630893153281, + "eval_loss": 1.0791449546813965, + "eval_runtime": 510.0567, + "eval_samples_per_second": 5.194, + "eval_steps_per_second": 1.3, + "step": 3165 + }, + { + "epoch": 1.0013424938798072, + "grad_norm": 0.07150112023496676, + "learning_rate": 0.0005854660877096246, + "loss": 1.13, + "step": 3170 + }, + { + "epoch": 1.0029218984442865, + "grad_norm": 0.0798487285566982, + "learning_rate": 0.000584107436843899, + "loss": 1.0078, + "step": 3175 + }, + { + "epoch": 1.0045013030087657, + "grad_norm": 0.06433145982933967, + "learning_rate": 0.0005827481465652079, + "loss": 0.9826, + "step": 3180 + }, + { + "epoch": 1.006080707573245, + "grad_norm": 0.04494583173932524, + "learning_rate": 0.0005813882272073325, + "loss": 0.9551, + "step": 3185 + }, + { + "epoch": 1.007660112137724, + "grad_norm": 0.03549329472007132, + "learning_rate": 0.0005800276891088362, + "loss": 0.9463, + "step": 3190 + }, + { + "epoch": 1.0092395167022032, + "grad_norm": 0.034186176588898505, + "learning_rate": 0.0005786665426129862, + "loss": 0.977, + "step": 3195 + }, + { + "epoch": 1.0108189212666825, + "grad_norm": 0.029817268432022398, + "learning_rate": 0.000577304798067675, + "loss": 0.9908, + "step": 3200 + }, + { + "epoch": 1.0123983258311617, + "grad_norm": 0.03394331617147919, + "learning_rate": 0.0005759424658253418, + "loss": 1.0151, + "step": 3205 + }, + { + "epoch": 1.0139777303956408, + "grad_norm": 0.13271163042849446, + "learning_rate": 0.0005745795562428936, + "loss": 1.0172, + "step": 3210 + }, + { + "epoch": 1.01555713496012, + "grad_norm": 0.04312287259535113, + "learning_rate": 0.0005732160796816266, + "loss": 0.968, + "step": 3215 + }, + { + "epoch": 1.0171365395245993, + "grad_norm": 0.06020683361964271, + "learning_rate": 0.000571852046507147, + "loss": 1.0422, + "step": 3220 + }, + { + "epoch": 1.0187159440890785, + "grad_norm": 0.030095876466864336, + "learning_rate": 0.0005704874670892929, + "loss": 1.0082, + "step": 3225 + }, + { + "epoch": 1.0202953486535575, + "grad_norm": 0.041989546074888764, + "learning_rate": 0.000569122351802055, + "loss": 1.0718, + "step": 3230 + }, + { + "epoch": 1.0218747532180368, + "grad_norm": 0.03248769015541394, + "learning_rate": 0.000567756711023498, + "loss": 1.0164, + "step": 3235 + }, + { + "epoch": 1.023454157782516, + "grad_norm": 0.033241056105033265, + "learning_rate": 0.0005663905551356816, + "loss": 1.0325, + "step": 3240 + }, + { + "epoch": 1.0250335623469953, + "grad_norm": 0.04113820390530121, + "learning_rate": 0.0005650238945245811, + "loss": 0.9916, + "step": 3245 + }, + { + "epoch": 1.0266129669114743, + "grad_norm": 0.047457711642047797, + "learning_rate": 0.0005636567395800095, + "loss": 1.0278, + "step": 3250 + }, + { + "epoch": 1.0281923714759535, + "grad_norm": 0.03425503116269514, + "learning_rate": 0.0005622891006955374, + "loss": 1.0648, + "step": 3255 + }, + { + "epoch": 1.0297717760404328, + "grad_norm": 0.032910706194540146, + "learning_rate": 0.0005609209882684147, + "loss": 0.9476, + "step": 3260 + }, + { + "epoch": 1.031351180604912, + "grad_norm": 0.031202607233272173, + "learning_rate": 0.0005595524126994912, + "loss": 1.0214, + "step": 3265 + }, + { + "epoch": 1.032930585169391, + "grad_norm": 0.034687369210111244, + "learning_rate": 0.0005581833843931377, + "loss": 0.9938, + "step": 3270 + }, + { + "epoch": 1.0345099897338703, + "grad_norm": 0.13437002311376092, + "learning_rate": 0.0005568139137571671, + "loss": 0.9909, + "step": 3275 + }, + { + "epoch": 1.0360893942983496, + "grad_norm": 0.029955677741687923, + "learning_rate": 0.0005554440112027546, + "loss": 0.9762, + "step": 3280 + }, + { + "epoch": 1.0376687988628288, + "grad_norm": 0.034097784104351415, + "learning_rate": 0.0005540736871443595, + "loss": 0.9952, + "step": 3285 + }, + { + "epoch": 1.0392482034273078, + "grad_norm": 0.03790878076303209, + "learning_rate": 0.0005527029519996448, + "loss": 1.0071, + "step": 3290 + }, + { + "epoch": 1.040827607991787, + "grad_norm": 0.02948940417484095, + "learning_rate": 0.0005513318161893996, + "loss": 0.9836, + "step": 3295 + }, + { + "epoch": 1.0424070125562663, + "grad_norm": 0.03107381970615086, + "learning_rate": 0.0005499602901374582, + "loss": 0.9885, + "step": 3300 + }, + { + "epoch": 1.0439864171207456, + "grad_norm": 0.03660851842140771, + "learning_rate": 0.0005485883842706224, + "loss": 0.9412, + "step": 3305 + }, + { + "epoch": 1.0455658216852246, + "grad_norm": 0.03214082310663574, + "learning_rate": 0.0005472161090185806, + "loss": 1.0371, + "step": 3310 + }, + { + "epoch": 1.0471452262497039, + "grad_norm": 0.03340724348849699, + "learning_rate": 0.0005458434748138302, + "loss": 0.9617, + "step": 3315 + }, + { + "epoch": 1.048724630814183, + "grad_norm": 0.056978614456297746, + "learning_rate": 0.0005444704920915971, + "loss": 1.0408, + "step": 3320 + }, + { + "epoch": 1.0503040353786623, + "grad_norm": 0.032025722101213884, + "learning_rate": 0.0005430971712897566, + "loss": 1.0889, + "step": 3325 + }, + { + "epoch": 1.0518834399431414, + "grad_norm": 0.03384344024540512, + "learning_rate": 0.0005417235228487546, + "loss": 1.0263, + "step": 3330 + }, + { + "epoch": 1.0534628445076206, + "grad_norm": 0.04349779981432125, + "learning_rate": 0.0005403495572115275, + "loss": 0.9939, + "step": 3335 + }, + { + "epoch": 1.0550422490720999, + "grad_norm": 0.0316311317750681, + "learning_rate": 0.0005389752848234234, + "loss": 0.9824, + "step": 3340 + }, + { + "epoch": 1.0566216536365791, + "grad_norm": 0.032659388007359264, + "learning_rate": 0.000537600716132122, + "loss": 0.9473, + "step": 3345 + }, + { + "epoch": 1.0582010582010581, + "grad_norm": 0.031611742536276584, + "learning_rate": 0.0005362258615875562, + "loss": 1.0733, + "step": 3350 + }, + { + "epoch": 1.0597804627655374, + "grad_norm": 0.03702335622451369, + "learning_rate": 0.0005348507316418313, + "loss": 1.0077, + "step": 3355 + }, + { + "epoch": 1.0613598673300166, + "grad_norm": 0.03582357424394027, + "learning_rate": 0.000533475336749147, + "loss": 0.9825, + "step": 3360 + }, + { + "epoch": 1.0629392718944959, + "grad_norm": 0.029384256667206336, + "learning_rate": 0.0005320996873657167, + "loss": 0.9317, + "step": 3365 + }, + { + "epoch": 1.064518676458975, + "grad_norm": 0.02909246543706195, + "learning_rate": 0.000530723793949689, + "loss": 1.0118, + "step": 3370 + }, + { + "epoch": 1.0660980810234542, + "grad_norm": 0.03610373021969205, + "learning_rate": 0.0005293476669610673, + "loss": 0.9777, + "step": 3375 + }, + { + "epoch": 1.0676774855879334, + "grad_norm": 0.036352202064281774, + "learning_rate": 0.0005279713168616309, + "loss": 0.9223, + "step": 3380 + }, + { + "epoch": 1.0692568901524124, + "grad_norm": 0.04168876104566738, + "learning_rate": 0.0005265947541148553, + "loss": 0.9932, + "step": 3385 + }, + { + "epoch": 1.0708362947168917, + "grad_norm": 0.03730727894381184, + "learning_rate": 0.0005252179891858326, + "loss": 0.955, + "step": 3390 + }, + { + "epoch": 1.072415699281371, + "grad_norm": 0.0313568608389757, + "learning_rate": 0.0005238410325411917, + "loss": 1.0055, + "step": 3395 + }, + { + "epoch": 1.0739951038458502, + "grad_norm": 0.0377620352643827, + "learning_rate": 0.0005224638946490191, + "loss": 0.9968, + "step": 3400 + }, + { + "epoch": 1.0755745084103294, + "grad_norm": 0.034285990273277014, + "learning_rate": 0.0005210865859787794, + "loss": 1.05, + "step": 3405 + }, + { + "epoch": 1.0771539129748084, + "grad_norm": 0.033053408056772364, + "learning_rate": 0.0005197091170012356, + "loss": 0.9527, + "step": 3410 + }, + { + "epoch": 1.0787333175392877, + "grad_norm": 0.03391521396574238, + "learning_rate": 0.000518331498188369, + "loss": 1.031, + "step": 3415 + }, + { + "epoch": 1.080312722103767, + "grad_norm": 0.03474538664274266, + "learning_rate": 0.0005169537400133002, + "loss": 1.0192, + "step": 3420 + }, + { + "epoch": 1.081892126668246, + "grad_norm": 0.030783132539478026, + "learning_rate": 0.0005155758529502095, + "loss": 1.0036, + "step": 3425 + }, + { + "epoch": 1.0834715312327252, + "grad_norm": 0.03057246653231956, + "learning_rate": 0.0005141978474742566, + "loss": 1.0117, + "step": 3430 + }, + { + "epoch": 1.0850509357972045, + "grad_norm": 0.02968806381266762, + "learning_rate": 0.0005128197340615018, + "loss": 0.9851, + "step": 3435 + }, + { + "epoch": 1.0866303403616837, + "grad_norm": 0.03520879529669726, + "learning_rate": 0.0005114415231888257, + "loss": 0.9676, + "step": 3440 + }, + { + "epoch": 1.0882097449261627, + "grad_norm": 0.03173846889255153, + "learning_rate": 0.0005100632253338499, + "loss": 1.0356, + "step": 3445 + }, + { + "epoch": 1.089789149490642, + "grad_norm": 0.031706825455682444, + "learning_rate": 0.0005086848509748577, + "loss": 1.1036, + "step": 3450 + }, + { + "epoch": 1.0913685540551212, + "grad_norm": 0.03537078439015193, + "learning_rate": 0.000507306410590713, + "loss": 1.0021, + "step": 3455 + }, + { + "epoch": 1.0929479586196005, + "grad_norm": 0.030282214060137556, + "learning_rate": 0.0005059279146607829, + "loss": 1.03, + "step": 3460 + }, + { + "epoch": 1.0945273631840795, + "grad_norm": 0.034400453962651394, + "learning_rate": 0.0005045493736648556, + "loss": 0.9934, + "step": 3465 + }, + { + "epoch": 1.0961067677485588, + "grad_norm": 10.606479686832095, + "learning_rate": 0.0005031707980830629, + "loss": 1.0075, + "step": 3470 + }, + { + "epoch": 1.097686172313038, + "grad_norm": 0.04574495818963683, + "learning_rate": 0.000501792198395799, + "loss": 0.9429, + "step": 3475 + }, + { + "epoch": 1.0992655768775172, + "grad_norm": 0.041434716901667105, + "learning_rate": 0.0005004135850836412, + "loss": 0.9915, + "step": 3480 + }, + { + "epoch": 1.1008449814419963, + "grad_norm": 0.036560297041780127, + "learning_rate": 0.0004990349686272709, + "loss": 1.0123, + "step": 3485 + }, + { + "epoch": 1.1024243860064755, + "grad_norm": 0.03740903677106789, + "learning_rate": 0.0004976563595073929, + "loss": 1.0287, + "step": 3490 + }, + { + "epoch": 1.1040037905709548, + "grad_norm": 0.03868760581227752, + "learning_rate": 0.0004962777682046565, + "loss": 0.9872, + "step": 3495 + }, + { + "epoch": 1.105583195135434, + "grad_norm": 0.04634060161494692, + "learning_rate": 0.0004948992051995756, + "loss": 1.0251, + "step": 3500 + }, + { + "epoch": 1.107162599699913, + "grad_norm": 0.02993297136622178, + "learning_rate": 0.0004935206809724488, + "loss": 0.9731, + "step": 3505 + }, + { + "epoch": 1.1087420042643923, + "grad_norm": 0.04079476364273126, + "learning_rate": 0.0004921422060032801, + "loss": 1.0498, + "step": 3510 + }, + { + "epoch": 1.1103214088288715, + "grad_norm": 0.04673875982677158, + "learning_rate": 0.0004907637907716987, + "loss": 1.0366, + "step": 3515 + }, + { + "epoch": 1.1119008133933508, + "grad_norm": 0.03821099301219827, + "learning_rate": 0.0004893854457568801, + "loss": 1.0495, + "step": 3520 + }, + { + "epoch": 1.1134802179578298, + "grad_norm": 0.03919150081513503, + "learning_rate": 0.0004880071814374656, + "loss": 0.9499, + "step": 3525 + }, + { + "epoch": 1.115059622522309, + "grad_norm": 0.03890374315742622, + "learning_rate": 0.0004866290082914831, + "loss": 1.0682, + "step": 3530 + }, + { + "epoch": 1.1166390270867883, + "grad_norm": 0.03362373378100303, + "learning_rate": 0.00048525093679626746, + "loss": 1.0049, + "step": 3535 + }, + { + "epoch": 1.1182184316512676, + "grad_norm": 0.0363331476945115, + "learning_rate": 0.00048387297742838085, + "loss": 1.0039, + "step": 3540 + }, + { + "epoch": 1.1197978362157466, + "grad_norm": 0.030750365373425098, + "learning_rate": 0.00048249514066353274, + "loss": 0.9417, + "step": 3545 + }, + { + "epoch": 1.1213772407802258, + "grad_norm": 0.038839666921607556, + "learning_rate": 0.0004811174369765008, + "loss": 0.9521, + "step": 3550 + }, + { + "epoch": 1.122956645344705, + "grad_norm": 0.03068903343661717, + "learning_rate": 0.0004797398768410509, + "loss": 1.0124, + "step": 3555 + }, + { + "epoch": 1.1245360499091843, + "grad_norm": 0.02862806963588536, + "learning_rate": 0.0004783624707298574, + "loss": 1.0724, + "step": 3560 + }, + { + "epoch": 1.1261154544736633, + "grad_norm": 0.03115930123636453, + "learning_rate": 0.00047698522911442397, + "loss": 1.0118, + "step": 3565 + }, + { + "epoch": 1.1276948590381426, + "grad_norm": 0.03220348520336603, + "learning_rate": 0.0004756081624650037, + "loss": 1.0294, + "step": 3570 + }, + { + "epoch": 1.1292742636026218, + "grad_norm": 0.029980214681574024, + "learning_rate": 0.0004742312812505194, + "loss": 0.9844, + "step": 3575 + }, + { + "epoch": 1.130853668167101, + "grad_norm": 0.03910674749748984, + "learning_rate": 0.00047285459593848425, + "loss": 1.0132, + "step": 3580 + }, + { + "epoch": 1.1324330727315801, + "grad_norm": 0.03391440952365017, + "learning_rate": 0.00047147811699492227, + "loss": 1.0222, + "step": 3585 + }, + { + "epoch": 1.1340124772960594, + "grad_norm": 0.05878162258843076, + "learning_rate": 0.00047010185488428793, + "loss": 1.0384, + "step": 3590 + }, + { + "epoch": 1.1355918818605386, + "grad_norm": 0.0327519856427672, + "learning_rate": 0.00046872582006938796, + "loss": 0.9858, + "step": 3595 + }, + { + "epoch": 1.1371712864250179, + "grad_norm": 0.03680010655471814, + "learning_rate": 0.00046735002301130093, + "loss": 0.9303, + "step": 3600 + }, + { + "epoch": 1.1387506909894969, + "grad_norm": 0.031895585784546855, + "learning_rate": 0.00046597447416929776, + "loss": 1.0007, + "step": 3605 + }, + { + "epoch": 1.1403300955539761, + "grad_norm": 0.03199689960710086, + "learning_rate": 0.0004645991840007627, + "loss": 1.004, + "step": 3610 + }, + { + "epoch": 1.1419095001184554, + "grad_norm": 0.03009576478041764, + "learning_rate": 0.00046322416296111296, + "loss": 1.039, + "step": 3615 + }, + { + "epoch": 1.1434889046829346, + "grad_norm": 0.032847682758630106, + "learning_rate": 0.00046184942150372007, + "loss": 1.0054, + "step": 3620 + }, + { + "epoch": 1.1450683092474137, + "grad_norm": 0.06698387106228133, + "learning_rate": 0.00046047497007983, + "loss": 1.0097, + "step": 3625 + }, + { + "epoch": 1.146647713811893, + "grad_norm": 0.035296251330652055, + "learning_rate": 0.0004591008191384838, + "loss": 0.9193, + "step": 3630 + }, + { + "epoch": 1.1482271183763721, + "grad_norm": 0.03264524280421541, + "learning_rate": 0.0004577269791264383, + "loss": 1.025, + "step": 3635 + }, + { + "epoch": 1.1498065229408514, + "grad_norm": 0.038864179284595034, + "learning_rate": 0.00045635346048808625, + "loss": 0.9855, + "step": 3640 + }, + { + "epoch": 1.1513859275053304, + "grad_norm": 0.03172597744457259, + "learning_rate": 0.0004549802736653775, + "loss": 1.0073, + "step": 3645 + }, + { + "epoch": 1.1529653320698097, + "grad_norm": 0.03272896511499134, + "learning_rate": 0.00045360742909773886, + "loss": 1.0158, + "step": 3650 + }, + { + "epoch": 1.154544736634289, + "grad_norm": 0.03181076711304137, + "learning_rate": 0.0004522349372219959, + "loss": 1.049, + "step": 3655 + }, + { + "epoch": 1.156124141198768, + "grad_norm": 0.029570420554063247, + "learning_rate": 0.0004508628084722923, + "loss": 0.9824, + "step": 3660 + }, + { + "epoch": 1.1577035457632472, + "grad_norm": 0.031518736372667645, + "learning_rate": 0.0004494910532800115, + "loss": 0.965, + "step": 3665 + }, + { + "epoch": 1.1592829503277264, + "grad_norm": 0.03120462181627199, + "learning_rate": 0.00044811968207369675, + "loss": 1.0007, + "step": 3670 + }, + { + "epoch": 1.1608623548922057, + "grad_norm": 0.030615282637634297, + "learning_rate": 0.0004467487052789724, + "loss": 0.9676, + "step": 3675 + }, + { + "epoch": 1.162441759456685, + "grad_norm": 0.03218785070440956, + "learning_rate": 0.00044537813331846414, + "loss": 1.0053, + "step": 3680 + }, + { + "epoch": 1.164021164021164, + "grad_norm": 0.037644209561606065, + "learning_rate": 0.00044400797661172016, + "loss": 1.0194, + "step": 3685 + }, + { + "epoch": 1.1656005685856432, + "grad_norm": 0.036131652266939686, + "learning_rate": 0.00044263824557513144, + "loss": 0.9678, + "step": 3690 + }, + { + "epoch": 1.1671799731501225, + "grad_norm": 0.0345485607793738, + "learning_rate": 0.00044126895062185324, + "loss": 1.0913, + "step": 3695 + }, + { + "epoch": 1.1687593777146015, + "grad_norm": 0.10534071784088957, + "learning_rate": 0.00043990010216172533, + "loss": 0.9556, + "step": 3700 + }, + { + "epoch": 1.1703387822790807, + "grad_norm": 0.031917678600092154, + "learning_rate": 0.000438531710601193, + "loss": 0.9676, + "step": 3705 + }, + { + "epoch": 1.17191818684356, + "grad_norm": 0.03528790633782125, + "learning_rate": 0.00043716378634322834, + "loss": 1.0752, + "step": 3710 + }, + { + "epoch": 1.1734975914080392, + "grad_norm": 0.06247356214049855, + "learning_rate": 0.00043579633978725065, + "loss": 0.986, + "step": 3715 + }, + { + "epoch": 1.1750769959725185, + "grad_norm": 0.03594855583679661, + "learning_rate": 0.00043442938132904767, + "loss": 0.9764, + "step": 3720 + }, + { + "epoch": 1.1766564005369975, + "grad_norm": 0.03314206038497171, + "learning_rate": 0.00043306292136069646, + "loss": 1.0098, + "step": 3725 + }, + { + "epoch": 1.1782358051014767, + "grad_norm": 0.031324091896376624, + "learning_rate": 0.0004316969702704842, + "loss": 0.9796, + "step": 3730 + }, + { + "epoch": 1.179815209665956, + "grad_norm": 0.035907780461936295, + "learning_rate": 0.0004303315384428298, + "loss": 1.0284, + "step": 3735 + }, + { + "epoch": 1.181394614230435, + "grad_norm": 0.04235651655577545, + "learning_rate": 0.0004289666362582041, + "loss": 1.0072, + "step": 3740 + }, + { + "epoch": 1.1829740187949143, + "grad_norm": 0.02825475220475871, + "learning_rate": 0.00042760227409305166, + "loss": 0.954, + "step": 3745 + }, + { + "epoch": 1.1845534233593935, + "grad_norm": 0.03422905250774198, + "learning_rate": 0.0004262384623197116, + "loss": 0.9824, + "step": 3750 + }, + { + "epoch": 1.1861328279238728, + "grad_norm": 0.03097530803468814, + "learning_rate": 0.0004248752113063388, + "loss": 1.0269, + "step": 3755 + }, + { + "epoch": 1.187712232488352, + "grad_norm": 0.03167783403621823, + "learning_rate": 0.0004235125314168251, + "loss": 1.04, + "step": 3760 + }, + { + "epoch": 1.189291637052831, + "grad_norm": 0.03228159018489407, + "learning_rate": 0.00042215043301072037, + "loss": 0.9895, + "step": 3765 + }, + { + "epoch": 1.1908710416173103, + "grad_norm": 0.028456877325231795, + "learning_rate": 0.00042078892644315387, + "loss": 0.9235, + "step": 3770 + }, + { + "epoch": 1.1924504461817895, + "grad_norm": 0.04042505241231098, + "learning_rate": 0.0004194280220647556, + "loss": 1.0244, + "step": 3775 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.03214462728655419, + "learning_rate": 0.00041806773022157716, + "loss": 1.0068, + "step": 3780 + }, + { + "epoch": 1.1956092553107478, + "grad_norm": 0.028459469654203682, + "learning_rate": 0.00041670806125501393, + "loss": 1.0081, + "step": 3785 + }, + { + "epoch": 1.197188659875227, + "grad_norm": 0.03278427499854533, + "learning_rate": 0.0004153490255017257, + "loss": 0.9328, + "step": 3790 + }, + { + "epoch": 1.1987680644397063, + "grad_norm": 0.03807036409466499, + "learning_rate": 0.00041399063329355853, + "loss": 1.0445, + "step": 3795 + }, + { + "epoch": 1.2003474690041855, + "grad_norm": 0.04133519873285916, + "learning_rate": 0.00041263289495746574, + "loss": 1.0262, + "step": 3800 + }, + { + "epoch": 1.2019268735686646, + "grad_norm": 0.039450935145532044, + "learning_rate": 0.00041127582081543, + "loss": 1.1415, + "step": 3805 + }, + { + "epoch": 1.2035062781331438, + "grad_norm": 0.03637001529973416, + "learning_rate": 0.0004099194211843847, + "loss": 0.9529, + "step": 3810 + }, + { + "epoch": 1.205085682697623, + "grad_norm": 0.034497184596985475, + "learning_rate": 0.0004085637063761346, + "loss": 1.0775, + "step": 3815 + }, + { + "epoch": 1.206665087262102, + "grad_norm": 0.02959202086144998, + "learning_rate": 0.0004072086866972789, + "loss": 1.0161, + "step": 3820 + }, + { + "epoch": 1.2082444918265813, + "grad_norm": 0.0311951928316788, + "learning_rate": 0.00040585437244913217, + "loss": 0.9326, + "step": 3825 + }, + { + "epoch": 1.2098238963910606, + "grad_norm": 0.03179771229016227, + "learning_rate": 0.0004045007739276456, + "loss": 0.9844, + "step": 3830 + }, + { + "epoch": 1.2114033009555398, + "grad_norm": 0.031644743069673686, + "learning_rate": 0.0004031479014233297, + "loss": 0.9812, + "step": 3835 + }, + { + "epoch": 1.212982705520019, + "grad_norm": 0.03323986149903773, + "learning_rate": 0.0004017957652211753, + "loss": 0.9487, + "step": 3840 + }, + { + "epoch": 1.214562110084498, + "grad_norm": 0.05569080062496465, + "learning_rate": 0.00040044437560057567, + "loss": 1.0618, + "step": 3845 + }, + { + "epoch": 1.2161415146489774, + "grad_norm": 0.04109910399076241, + "learning_rate": 0.0003990937428352482, + "loss": 1.0304, + "step": 3850 + }, + { + "epoch": 1.2177209192134566, + "grad_norm": 0.03505379674466537, + "learning_rate": 0.00039774387719315664, + "loss": 1.0664, + "step": 3855 + }, + { + "epoch": 1.2193003237779356, + "grad_norm": 0.03418242604516485, + "learning_rate": 0.00039639478893643257, + "loss": 1.018, + "step": 3860 + }, + { + "epoch": 1.2208797283424149, + "grad_norm": 0.05819456896356674, + "learning_rate": 0.00039504648832129787, + "loss": 0.9534, + "step": 3865 + }, + { + "epoch": 1.2224591329068941, + "grad_norm": 0.03845387810065682, + "learning_rate": 0.00039369898559798614, + "loss": 1.0085, + "step": 3870 + }, + { + "epoch": 1.2240385374713734, + "grad_norm": 0.03002188707299774, + "learning_rate": 0.0003923522910106656, + "loss": 1.0685, + "step": 3875 + }, + { + "epoch": 1.2256179420358524, + "grad_norm": 0.03350032348038046, + "learning_rate": 0.0003910064147973603, + "loss": 1.0133, + "step": 3880 + }, + { + "epoch": 1.2271973466003316, + "grad_norm": 0.032399311229717676, + "learning_rate": 0.0003896613671898732, + "loss": 1.011, + "step": 3885 + }, + { + "epoch": 1.2287767511648109, + "grad_norm": 0.03717438638700494, + "learning_rate": 0.00038831715841370745, + "loss": 0.9171, + "step": 3890 + }, + { + "epoch": 1.2303561557292901, + "grad_norm": 0.03669686520409071, + "learning_rate": 0.0003869737986879895, + "loss": 0.9085, + "step": 3895 + }, + { + "epoch": 1.2319355602937692, + "grad_norm": 0.028568004417021613, + "learning_rate": 0.0003856312982253909, + "loss": 0.98, + "step": 3900 + }, + { + "epoch": 1.2335149648582484, + "grad_norm": 0.0337167611533833, + "learning_rate": 0.0003842896672320506, + "loss": 1.0053, + "step": 3905 + }, + { + "epoch": 1.2350943694227277, + "grad_norm": 0.033327802875302, + "learning_rate": 0.00038294891590749783, + "loss": 1.0156, + "step": 3910 + }, + { + "epoch": 1.236673773987207, + "grad_norm": 0.040464892884495215, + "learning_rate": 0.0003816090544445741, + "loss": 0.9311, + "step": 3915 + }, + { + "epoch": 1.238253178551686, + "grad_norm": 0.031169717492686492, + "learning_rate": 0.0003802700930293563, + "loss": 0.9942, + "step": 3920 + }, + { + "epoch": 1.2398325831161652, + "grad_norm": 0.03608779188581459, + "learning_rate": 0.00037893204184107803, + "loss": 0.9781, + "step": 3925 + }, + { + "epoch": 1.2414119876806444, + "grad_norm": 0.034353318606698664, + "learning_rate": 0.0003775949110520538, + "loss": 0.9602, + "step": 3930 + }, + { + "epoch": 1.2429913922451237, + "grad_norm": 0.032459644273819636, + "learning_rate": 0.00037625871082760064, + "loss": 1.0103, + "step": 3935 + }, + { + "epoch": 1.2445707968096027, + "grad_norm": 0.03287385312411447, + "learning_rate": 0.00037492345132596113, + "loss": 1.0343, + "step": 3940 + }, + { + "epoch": 1.246150201374082, + "grad_norm": 0.09624463706888049, + "learning_rate": 0.0003735891426982262, + "loss": 1.001, + "step": 3945 + }, + { + "epoch": 1.2477296059385612, + "grad_norm": 0.039854815726468845, + "learning_rate": 0.00037225579508825803, + "loss": 0.9973, + "step": 3950 + }, + { + "epoch": 1.2493090105030404, + "grad_norm": 0.03282797081588262, + "learning_rate": 0.0003709234186326124, + "loss": 1.007, + "step": 3955 + }, + { + "epoch": 1.2508884150675195, + "grad_norm": 0.04323405227732425, + "learning_rate": 0.0003695920234604625, + "loss": 0.9519, + "step": 3960 + }, + { + "epoch": 1.2524678196319987, + "grad_norm": 0.030804057889002943, + "learning_rate": 0.00036826161969352137, + "loss": 0.9538, + "step": 3965 + }, + { + "epoch": 1.254047224196478, + "grad_norm": 0.033900286482032485, + "learning_rate": 0.00036693221744596476, + "loss": 0.9878, + "step": 3970 + }, + { + "epoch": 1.255626628760957, + "grad_norm": 0.029308009262548004, + "learning_rate": 0.000365603826824355, + "loss": 1.0579, + "step": 3975 + }, + { + "epoch": 1.2572060333254362, + "grad_norm": 0.03242275017266416, + "learning_rate": 0.00036427645792756335, + "loss": 1.0618, + "step": 3980 + }, + { + "epoch": 1.2587854378899155, + "grad_norm": 0.0302414222139075, + "learning_rate": 0.0003629501208466938, + "loss": 0.9297, + "step": 3985 + }, + { + "epoch": 1.2603648424543947, + "grad_norm": 0.03279689599577794, + "learning_rate": 0.000361624825665006, + "loss": 0.9962, + "step": 3990 + }, + { + "epoch": 1.261944247018874, + "grad_norm": 0.033006660420700024, + "learning_rate": 0.0003603005824578386, + "loss": 0.93, + "step": 3995 + }, + { + "epoch": 1.263523651583353, + "grad_norm": 0.03197012821917078, + "learning_rate": 0.00035897740129253296, + "loss": 0.9839, + "step": 4000 + }, + { + "epoch": 1.2651030561478323, + "grad_norm": 0.03230977381844998, + "learning_rate": 0.00035765529222835666, + "loss": 0.9787, + "step": 4005 + }, + { + "epoch": 1.2666824607123115, + "grad_norm": 0.031821233663209894, + "learning_rate": 0.00035633426531642625, + "loss": 0.9809, + "step": 4010 + }, + { + "epoch": 1.2682618652767905, + "grad_norm": 0.03646680985111504, + "learning_rate": 0.00035501433059963194, + "loss": 0.9671, + "step": 4015 + }, + { + "epoch": 1.2698412698412698, + "grad_norm": 0.034147950422965034, + "learning_rate": 0.00035369549811256043, + "loss": 0.9884, + "step": 4020 + }, + { + "epoch": 1.271420674405749, + "grad_norm": 0.04178973681830019, + "learning_rate": 0.00035237777788141896, + "loss": 1.019, + "step": 4025 + }, + { + "epoch": 1.2730000789702283, + "grad_norm": 0.03610221769429688, + "learning_rate": 0.00035106117992395893, + "loss": 0.9549, + "step": 4030 + }, + { + "epoch": 1.2745794835347075, + "grad_norm": 0.032643633732576, + "learning_rate": 0.00034974571424940007, + "loss": 1.0115, + "step": 4035 + }, + { + "epoch": 1.2761588880991865, + "grad_norm": 0.03551475992576843, + "learning_rate": 0.0003484313908583538, + "loss": 1.0712, + "step": 4040 + }, + { + "epoch": 1.2777382926636658, + "grad_norm": 0.03168699509132295, + "learning_rate": 0.0003471182197427477, + "loss": 0.9868, + "step": 4045 + }, + { + "epoch": 1.279317697228145, + "grad_norm": 0.04010531950002586, + "learning_rate": 0.00034580621088574944, + "loss": 1.0084, + "step": 4050 + }, + { + "epoch": 1.280897101792624, + "grad_norm": 0.034498613645441746, + "learning_rate": 0.00034449537426169065, + "loss": 0.9823, + "step": 4055 + }, + { + "epoch": 1.2824765063571033, + "grad_norm": 0.03821759329478121, + "learning_rate": 0.00034318571983599146, + "loss": 1.0177, + "step": 4060 + }, + { + "epoch": 1.2840559109215826, + "grad_norm": 0.03673721054940819, + "learning_rate": 0.00034187725756508426, + "loss": 0.9367, + "step": 4065 + }, + { + "epoch": 1.2856353154860618, + "grad_norm": 0.03384213798714474, + "learning_rate": 0.0003405699973963384, + "loss": 0.9227, + "step": 4070 + }, + { + "epoch": 1.287214720050541, + "grad_norm": 0.034090758745436614, + "learning_rate": 0.0003392639492679846, + "loss": 1.0423, + "step": 4075 + }, + { + "epoch": 1.28879412461502, + "grad_norm": 0.03245882007994995, + "learning_rate": 0.0003379591231090391, + "loss": 0.9629, + "step": 4080 + }, + { + "epoch": 1.2903735291794993, + "grad_norm": 0.03496114506115646, + "learning_rate": 0.00033665552883922815, + "loss": 1.0429, + "step": 4085 + }, + { + "epoch": 1.2919529337439786, + "grad_norm": 0.03294034716543751, + "learning_rate": 0.00033535317636891306, + "loss": 0.9318, + "step": 4090 + }, + { + "epoch": 1.2935323383084576, + "grad_norm": 0.03295135988949754, + "learning_rate": 0.0003340520755990144, + "loss": 0.9867, + "step": 4095 + }, + { + "epoch": 1.2951117428729368, + "grad_norm": 0.03690628663937476, + "learning_rate": 0.0003327522364209369, + "loss": 0.9245, + "step": 4100 + }, + { + "epoch": 1.296691147437416, + "grad_norm": 0.029297125596864402, + "learning_rate": 0.0003314536687164944, + "loss": 1.0122, + "step": 4105 + }, + { + "epoch": 1.2982705520018953, + "grad_norm": 0.029269281575792814, + "learning_rate": 0.0003301563823578343, + "loss": 0.9307, + "step": 4110 + }, + { + "epoch": 1.2998499565663746, + "grad_norm": 0.034022614599823864, + "learning_rate": 0.0003288603872073631, + "loss": 0.9801, + "step": 4115 + }, + { + "epoch": 1.3014293611308536, + "grad_norm": 0.08295291494782484, + "learning_rate": 0.00032756569311767083, + "loss": 1.0192, + "step": 4120 + }, + { + "epoch": 1.3030087656953329, + "grad_norm": 0.05198063709991113, + "learning_rate": 0.00032627230993145643, + "loss": 0.9641, + "step": 4125 + }, + { + "epoch": 1.304588170259812, + "grad_norm": 0.0386293310510888, + "learning_rate": 0.0003249802474814532, + "loss": 1.0402, + "step": 4130 + }, + { + "epoch": 1.3061675748242911, + "grad_norm": 0.03262356907346096, + "learning_rate": 0.0003236895155903533, + "loss": 0.9112, + "step": 4135 + }, + { + "epoch": 1.3077469793887704, + "grad_norm": 0.036464664779840325, + "learning_rate": 0.000322400124070734, + "loss": 1.0295, + "step": 4140 + }, + { + "epoch": 1.3093263839532496, + "grad_norm": 0.0498083025164631, + "learning_rate": 0.0003211120827249827, + "loss": 0.9529, + "step": 4145 + }, + { + "epoch": 1.3109057885177289, + "grad_norm": 0.03277934651515331, + "learning_rate": 0.0003198254013452214, + "loss": 0.9878, + "step": 4150 + }, + { + "epoch": 1.3124851930822081, + "grad_norm": 0.03799182246479796, + "learning_rate": 0.0003185400897132341, + "loss": 0.9981, + "step": 4155 + }, + { + "epoch": 1.3140645976466871, + "grad_norm": 0.030928525858774297, + "learning_rate": 0.0003172561576003913, + "loss": 0.9849, + "step": 4160 + }, + { + "epoch": 1.3156440022111664, + "grad_norm": 0.032931441479663676, + "learning_rate": 0.00031597361476757587, + "loss": 0.9871, + "step": 4165 + }, + { + "epoch": 1.3172234067756456, + "grad_norm": 0.03860824643776728, + "learning_rate": 0.0003146924709651089, + "loss": 1.0179, + "step": 4170 + }, + { + "epoch": 1.3188028113401247, + "grad_norm": 0.036134302819682045, + "learning_rate": 0.0003134127359326755, + "loss": 0.9913, + "step": 4175 + }, + { + "epoch": 1.320382215904604, + "grad_norm": 0.04505322900858715, + "learning_rate": 0.000312134419399251, + "loss": 0.9884, + "step": 4180 + }, + { + "epoch": 1.3219616204690832, + "grad_norm": 0.03583021484325502, + "learning_rate": 0.0003108575310830266, + "loss": 1.0335, + "step": 4185 + }, + { + "epoch": 1.3235410250335624, + "grad_norm": 0.028584860073887837, + "learning_rate": 0.00030958208069133613, + "loss": 0.9472, + "step": 4190 + }, + { + "epoch": 1.3251204295980417, + "grad_norm": 0.032560726753026195, + "learning_rate": 0.00030830807792058137, + "loss": 0.9358, + "step": 4195 + }, + { + "epoch": 1.3266998341625207, + "grad_norm": 0.03201200510999583, + "learning_rate": 0.0003070355324561591, + "loss": 0.9581, + "step": 4200 + }, + { + "epoch": 1.328279238727, + "grad_norm": 0.02863524059045198, + "learning_rate": 0.0003057644539723871, + "loss": 0.9479, + "step": 4205 + }, + { + "epoch": 1.3298586432914792, + "grad_norm": 0.02861412563443391, + "learning_rate": 0.00030449485213243047, + "loss": 0.9869, + "step": 4210 + }, + { + "epoch": 1.3314380478559582, + "grad_norm": 0.0297763674050006, + "learning_rate": 0.00030322673658822864, + "loss": 0.9998, + "step": 4215 + }, + { + "epoch": 1.3330174524204375, + "grad_norm": 0.03234585505718821, + "learning_rate": 0.0003019601169804216, + "loss": 0.9828, + "step": 4220 + }, + { + "epoch": 1.3345968569849167, + "grad_norm": 0.03260293388067336, + "learning_rate": 0.00030069500293827676, + "loss": 0.9266, + "step": 4225 + }, + { + "epoch": 1.336176261549396, + "grad_norm": 0.032776920157258686, + "learning_rate": 0.00029943140407961565, + "loss": 1.0039, + "step": 4230 + }, + { + "epoch": 1.3377556661138752, + "grad_norm": 0.08314838417227068, + "learning_rate": 0.000298169330010741, + "loss": 0.9715, + "step": 4235 + }, + { + "epoch": 1.3393350706783542, + "grad_norm": 0.028657099663928653, + "learning_rate": 0.0002969087903263635, + "loss": 0.9944, + "step": 4240 + }, + { + "epoch": 1.3409144752428335, + "grad_norm": 0.03255072028797001, + "learning_rate": 0.0002956497946095289, + "loss": 0.9984, + "step": 4245 + }, + { + "epoch": 1.3424938798073127, + "grad_norm": 0.029853697845549056, + "learning_rate": 0.0002943923524315451, + "loss": 0.9864, + "step": 4250 + }, + { + "epoch": 1.3440732843717917, + "grad_norm": 0.02927271301894455, + "learning_rate": 0.00029313647335190975, + "loss": 0.9946, + "step": 4255 + }, + { + "epoch": 1.345652688936271, + "grad_norm": 0.03457055693857293, + "learning_rate": 0.0002918821669182372, + "loss": 0.9119, + "step": 4260 + }, + { + "epoch": 1.3472320935007502, + "grad_norm": 0.030506666861336504, + "learning_rate": 0.00029062944266618565, + "loss": 0.95, + "step": 4265 + }, + { + "epoch": 1.3488114980652295, + "grad_norm": 0.02777303737819793, + "learning_rate": 0.00028937831011938565, + "loss": 0.9663, + "step": 4270 + }, + { + "epoch": 1.3503909026297087, + "grad_norm": 0.028472861399310736, + "learning_rate": 0.0002881287787893666, + "loss": 0.9641, + "step": 4275 + }, + { + "epoch": 1.3519703071941878, + "grad_norm": 0.03335275141344005, + "learning_rate": 0.00028688085817548504, + "loss": 0.9274, + "step": 4280 + }, + { + "epoch": 1.353549711758667, + "grad_norm": 0.03476091838113158, + "learning_rate": 0.0002856345577648526, + "loss": 1.0074, + "step": 4285 + }, + { + "epoch": 1.3551291163231463, + "grad_norm": 0.029396912738058084, + "learning_rate": 0.00028438988703226287, + "loss": 0.95, + "step": 4290 + }, + { + "epoch": 1.3567085208876253, + "grad_norm": 0.03292824454164419, + "learning_rate": 0.000283146855440121, + "loss": 1.0241, + "step": 4295 + }, + { + "epoch": 1.3582879254521045, + "grad_norm": 0.02933063711821805, + "learning_rate": 0.00028190547243836994, + "loss": 0.9273, + "step": 4300 + }, + { + "epoch": 1.3598673300165838, + "grad_norm": 0.0487532524969538, + "learning_rate": 0.0002806657474644204, + "loss": 0.9284, + "step": 4305 + }, + { + "epoch": 1.361446734581063, + "grad_norm": 0.04641854021034858, + "learning_rate": 0.00027942768994307734, + "loss": 0.9597, + "step": 4310 + }, + { + "epoch": 1.363026139145542, + "grad_norm": 0.03899217449204588, + "learning_rate": 0.0002781913092864699, + "loss": 1.0238, + "step": 4315 + }, + { + "epoch": 1.3646055437100213, + "grad_norm": 0.41874580707406633, + "learning_rate": 0.0002769566148939787, + "loss": 0.9863, + "step": 4320 + }, + { + "epoch": 1.3661849482745005, + "grad_norm": 0.03254665656229802, + "learning_rate": 0.0002757236161521647, + "loss": 1.0182, + "step": 4325 + }, + { + "epoch": 1.3677643528389796, + "grad_norm": 0.031051834601741202, + "learning_rate": 0.00027449232243469856, + "loss": 1.0195, + "step": 4330 + }, + { + "epoch": 1.3693437574034588, + "grad_norm": 0.03681112953763141, + "learning_rate": 0.00027326274310228806, + "loss": 1.0055, + "step": 4335 + }, + { + "epoch": 1.370923161967938, + "grad_norm": 0.029500637011015705, + "learning_rate": 0.0002720348875026083, + "loss": 0.9259, + "step": 4340 + }, + { + "epoch": 1.3725025665324173, + "grad_norm": 0.029781036898435677, + "learning_rate": 0.0002708087649702294, + "loss": 0.9307, + "step": 4345 + }, + { + "epoch": 1.3740819710968966, + "grad_norm": 0.033142957599914515, + "learning_rate": 0.00026958438482654667, + "loss": 0.9918, + "step": 4350 + }, + { + "epoch": 1.3756613756613756, + "grad_norm": 0.028352927356558807, + "learning_rate": 0.0002683617563797088, + "loss": 0.982, + "step": 4355 + }, + { + "epoch": 1.3772407802258548, + "grad_norm": 0.03140033057258207, + "learning_rate": 0.0002671408889245475, + "loss": 0.9288, + "step": 4360 + }, + { + "epoch": 1.378820184790334, + "grad_norm": 0.035233655109260735, + "learning_rate": 0.0002659217917425071, + "loss": 0.9746, + "step": 4365 + }, + { + "epoch": 1.380399589354813, + "grad_norm": 0.0310879621744834, + "learning_rate": 0.00026470447410157353, + "loss": 0.9245, + "step": 4370 + }, + { + "epoch": 1.3819789939192924, + "grad_norm": 0.03545557030251083, + "learning_rate": 0.0002634889452562041, + "loss": 0.9832, + "step": 4375 + }, + { + "epoch": 1.3835583984837716, + "grad_norm": 0.03221098890038632, + "learning_rate": 0.00026227521444725685, + "loss": 0.9617, + "step": 4380 + }, + { + "epoch": 1.3851378030482508, + "grad_norm": 0.03575604783785744, + "learning_rate": 0.0002610632909019211, + "loss": 0.9869, + "step": 4385 + }, + { + "epoch": 1.38671720761273, + "grad_norm": 0.03000837213143947, + "learning_rate": 0.0002598531838336461, + "loss": 0.9722, + "step": 4390 + }, + { + "epoch": 1.3882966121772091, + "grad_norm": 0.0313471817306669, + "learning_rate": 0.0002586449024420724, + "loss": 1.0356, + "step": 4395 + }, + { + "epoch": 1.3898760167416884, + "grad_norm": 0.03130166860322959, + "learning_rate": 0.0002574384559129602, + "loss": 0.9598, + "step": 4400 + }, + { + "epoch": 1.3914554213061676, + "grad_norm": 0.08264592100818809, + "learning_rate": 0.00025623385341812135, + "loss": 0.9846, + "step": 4405 + }, + { + "epoch": 1.3930348258706466, + "grad_norm": 0.030114349700177814, + "learning_rate": 0.0002550311041153482, + "loss": 0.9582, + "step": 4410 + }, + { + "epoch": 1.394614230435126, + "grad_norm": 0.03521345245155562, + "learning_rate": 0.0002538302171483444, + "loss": 0.9928, + "step": 4415 + }, + { + "epoch": 1.3961936349996051, + "grad_norm": 0.03229933078643899, + "learning_rate": 0.0002526312016466562, + "loss": 0.9755, + "step": 4420 + }, + { + "epoch": 1.3977730395640844, + "grad_norm": 0.032182885993634974, + "learning_rate": 0.0002514340667256014, + "loss": 0.9613, + "step": 4425 + }, + { + "epoch": 1.3993524441285636, + "grad_norm": 0.030631941234258123, + "learning_rate": 0.00025023882148620205, + "loss": 0.9776, + "step": 4430 + }, + { + "epoch": 1.4009318486930427, + "grad_norm": 0.02987505008410226, + "learning_rate": 0.00024904547501511306, + "loss": 0.913, + "step": 4435 + }, + { + "epoch": 1.402511253257522, + "grad_norm": 0.028145259876542112, + "learning_rate": 0.00024785403638455535, + "loss": 0.934, + "step": 4440 + }, + { + "epoch": 1.4040906578220012, + "grad_norm": 0.028978602256717955, + "learning_rate": 0.000246664514652245, + "loss": 0.9603, + "step": 4445 + }, + { + "epoch": 1.4056700623864802, + "grad_norm": 0.028847643444511347, + "learning_rate": 0.0002454769188613254, + "loss": 0.8939, + "step": 4450 + }, + { + "epoch": 1.4072494669509594, + "grad_norm": 0.029573447000154355, + "learning_rate": 0.00024429125804029865, + "loss": 0.9665, + "step": 4455 + }, + { + "epoch": 1.4088288715154387, + "grad_norm": 0.028470803895962127, + "learning_rate": 0.00024310754120295596, + "loss": 0.9971, + "step": 4460 + }, + { + "epoch": 1.410408276079918, + "grad_norm": 0.02826169613197225, + "learning_rate": 0.00024192577734831046, + "loss": 0.969, + "step": 4465 + }, + { + "epoch": 1.4119876806443972, + "grad_norm": 0.031375362200801706, + "learning_rate": 0.00024074597546052713, + "loss": 0.9437, + "step": 4470 + }, + { + "epoch": 1.4135670852088762, + "grad_norm": 0.030043580137543383, + "learning_rate": 0.00023956814450885633, + "loss": 0.9972, + "step": 4475 + }, + { + "epoch": 1.4151464897733554, + "grad_norm": 0.043291478405645165, + "learning_rate": 0.00023839229344756418, + "loss": 1.0168, + "step": 4480 + }, + { + "epoch": 1.4167258943378347, + "grad_norm": 0.029231571062768233, + "learning_rate": 0.00023721843121586505, + "loss": 0.9198, + "step": 4485 + }, + { + "epoch": 1.4183052989023137, + "grad_norm": 0.030135526164777927, + "learning_rate": 0.0002360465667378534, + "loss": 1.1433, + "step": 4490 + }, + { + "epoch": 1.419884703466793, + "grad_norm": 0.034022565203319415, + "learning_rate": 0.00023487670892243683, + "loss": 1.0713, + "step": 4495 + }, + { + "epoch": 1.4214641080312722, + "grad_norm": 0.029417036409679694, + "learning_rate": 0.0002337088666632668, + "loss": 0.9955, + "step": 4500 + }, + { + "epoch": 1.4230435125957515, + "grad_norm": 0.03610772984932751, + "learning_rate": 0.00023254304883867205, + "loss": 0.9593, + "step": 4505 + }, + { + "epoch": 1.4246229171602307, + "grad_norm": 0.031236680351112527, + "learning_rate": 0.00023137926431159129, + "loss": 0.9978, + "step": 4510 + }, + { + "epoch": 1.4262023217247097, + "grad_norm": 0.030707645526992056, + "learning_rate": 0.00023021752192950472, + "loss": 0.9331, + "step": 4515 + }, + { + "epoch": 1.427781726289189, + "grad_norm": 0.03726122575915596, + "learning_rate": 0.00022905783052436834, + "loss": 1.0044, + "step": 4520 + }, + { + "epoch": 1.4293611308536682, + "grad_norm": 0.03361264973926007, + "learning_rate": 0.00022790019891254506, + "loss": 1.0059, + "step": 4525 + }, + { + "epoch": 1.4309405354181473, + "grad_norm": 0.027948864001244987, + "learning_rate": 0.00022674463589473926, + "loss": 0.906, + "step": 4530 + }, + { + "epoch": 1.4325199399826265, + "grad_norm": 0.028728985899873962, + "learning_rate": 0.0002255911502559287, + "loss": 0.966, + "step": 4535 + }, + { + "epoch": 1.4340993445471057, + "grad_norm": 0.02915994123656055, + "learning_rate": 0.0002244397507652982, + "loss": 0.9845, + "step": 4540 + }, + { + "epoch": 1.435678749111585, + "grad_norm": 0.027532167934338617, + "learning_rate": 0.00022329044617617355, + "loss": 0.9283, + "step": 4545 + }, + { + "epoch": 1.4372581536760642, + "grad_norm": 0.02787896461830844, + "learning_rate": 0.0002221432452259536, + "loss": 0.9426, + "step": 4550 + }, + { + "epoch": 1.4388375582405433, + "grad_norm": 0.02915052050382484, + "learning_rate": 0.00022099815663604533, + "loss": 0.8726, + "step": 4555 + }, + { + "epoch": 1.4404169628050225, + "grad_norm": 0.031834082078436965, + "learning_rate": 0.00021985518911179624, + "loss": 1.0137, + "step": 4560 + }, + { + "epoch": 1.4419963673695018, + "grad_norm": 0.03190501236818658, + "learning_rate": 0.0002187143513424295, + "loss": 0.9506, + "step": 4565 + }, + { + "epoch": 1.4435757719339808, + "grad_norm": 0.033915132392950936, + "learning_rate": 0.0002175756520009765, + "loss": 1.0225, + "step": 4570 + }, + { + "epoch": 1.44515517649846, + "grad_norm": 0.027089734040549717, + "learning_rate": 0.00021643909974421166, + "loss": 0.92, + "step": 4575 + }, + { + "epoch": 1.4467345810629393, + "grad_norm": 0.03244887285384843, + "learning_rate": 0.0002153047032125871, + "loss": 0.9505, + "step": 4580 + }, + { + "epoch": 1.4483139856274185, + "grad_norm": 0.034126797735564554, + "learning_rate": 0.00021417247103016563, + "loss": 0.9887, + "step": 4585 + }, + { + "epoch": 1.4498933901918978, + "grad_norm": 0.03479014165453759, + "learning_rate": 0.00021304241180455675, + "loss": 0.9509, + "step": 4590 + }, + { + "epoch": 1.4514727947563768, + "grad_norm": 0.03606094910601667, + "learning_rate": 0.0002119145341268497, + "loss": 0.9849, + "step": 4595 + }, + { + "epoch": 1.453052199320856, + "grad_norm": 0.033649259298392964, + "learning_rate": 0.00021078884657154922, + "loss": 0.9333, + "step": 4600 + }, + { + "epoch": 1.4546316038853353, + "grad_norm": 0.033304936568712216, + "learning_rate": 0.0002096653576965098, + "loss": 1.0053, + "step": 4605 + }, + { + "epoch": 1.4562110084498143, + "grad_norm": 0.03169005737288277, + "learning_rate": 0.00020854407604287123, + "loss": 0.9485, + "step": 4610 + }, + { + "epoch": 1.4577904130142936, + "grad_norm": 0.03260405386554439, + "learning_rate": 0.0002074250101349927, + "loss": 0.8876, + "step": 4615 + }, + { + "epoch": 1.4593698175787728, + "grad_norm": 0.02873933047741036, + "learning_rate": 0.0002063081684803892, + "loss": 0.9522, + "step": 4620 + }, + { + "epoch": 1.460949222143252, + "grad_norm": 0.038484761951163585, + "learning_rate": 0.00020519355956966567, + "loss": 1.0253, + "step": 4625 + }, + { + "epoch": 1.4625286267077313, + "grad_norm": 0.02998983242693224, + "learning_rate": 0.000204081191876453, + "loss": 0.9557, + "step": 4630 + }, + { + "epoch": 1.4641080312722103, + "grad_norm": 0.0343455817655151, + "learning_rate": 0.0002029710738573441, + "loss": 0.9206, + "step": 4635 + }, + { + "epoch": 1.4656874358366896, + "grad_norm": 0.030228779530593563, + "learning_rate": 0.00020186321395182838, + "loss": 0.9486, + "step": 4640 + }, + { + "epoch": 1.4672668404011688, + "grad_norm": 0.030423290706468993, + "learning_rate": 0.00020075762058222914, + "loss": 0.9493, + "step": 4645 + }, + { + "epoch": 1.4688462449656479, + "grad_norm": 0.029878637950206986, + "learning_rate": 0.00019965430215363779, + "loss": 1.0168, + "step": 4650 + }, + { + "epoch": 1.470425649530127, + "grad_norm": 0.03965356236244884, + "learning_rate": 0.00019855326705385174, + "loss": 1.0224, + "step": 4655 + }, + { + "epoch": 1.4720050540946064, + "grad_norm": 0.03325347822092596, + "learning_rate": 0.00019745452365330923, + "loss": 0.9515, + "step": 4660 + }, + { + "epoch": 1.4735844586590856, + "grad_norm": 0.032050231462300495, + "learning_rate": 0.00019635808030502616, + "loss": 0.9337, + "step": 4665 + }, + { + "epoch": 1.4751638632235649, + "grad_norm": 0.03942197951048198, + "learning_rate": 0.00019526394534453328, + "loss": 0.9647, + "step": 4670 + }, + { + "epoch": 1.4767432677880439, + "grad_norm": 0.03706566932643626, + "learning_rate": 0.00019417212708981146, + "loss": 0.9601, + "step": 4675 + }, + { + "epoch": 1.4783226723525231, + "grad_norm": 0.031943909303215, + "learning_rate": 0.00019308263384122987, + "loss": 0.9851, + "step": 4680 + }, + { + "epoch": 1.4799020769170022, + "grad_norm": 0.030359878975780616, + "learning_rate": 0.00019199547388148148, + "loss": 0.9544, + "step": 4685 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.03122343055875806, + "learning_rate": 0.0001909106554755216, + "loss": 0.983, + "step": 4690 + }, + { + "epoch": 1.4830608860459606, + "grad_norm": 0.03029868467794547, + "learning_rate": 0.00018982818687050368, + "loss": 0.9617, + "step": 4695 + }, + { + "epoch": 1.48464029061044, + "grad_norm": 0.029666257350296574, + "learning_rate": 0.00018874807629571722, + "loss": 0.9558, + "step": 4700 + }, + { + "epoch": 1.4862196951749191, + "grad_norm": 0.03656648047904474, + "learning_rate": 0.0001876703319625257, + "loss": 0.9576, + "step": 4705 + }, + { + "epoch": 1.4877990997393982, + "grad_norm": 0.028585469241267418, + "learning_rate": 0.00018659496206430303, + "loss": 0.9123, + "step": 4710 + }, + { + "epoch": 1.4893785043038774, + "grad_norm": 0.03493954328106872, + "learning_rate": 0.0001855219747763723, + "loss": 0.9263, + "step": 4715 + }, + { + "epoch": 1.4909579088683567, + "grad_norm": 0.030950110520780533, + "learning_rate": 0.0001844513782559426, + "loss": 0.9996, + "step": 4720 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.029563002599243136, + "learning_rate": 0.00018338318064204856, + "loss": 0.9425, + "step": 4725 + }, + { + "epoch": 1.494116717997315, + "grad_norm": 0.02886888207876382, + "learning_rate": 0.00018231739005548654, + "loss": 0.9078, + "step": 4730 + }, + { + "epoch": 1.4956961225617942, + "grad_norm": 0.030011112415565738, + "learning_rate": 0.00018125401459875474, + "loss": 0.9273, + "step": 4735 + }, + { + "epoch": 1.4972755271262734, + "grad_norm": 0.027887955051654714, + "learning_rate": 0.00018019306235598983, + "loss": 0.9781, + "step": 4740 + }, + { + "epoch": 1.4988549316907527, + "grad_norm": 0.03007756668275834, + "learning_rate": 0.0001791345413929073, + "loss": 1.0021, + "step": 4745 + }, + { + "epoch": 1.500434336255232, + "grad_norm": 0.03740847640057817, + "learning_rate": 0.0001780784597567386, + "loss": 0.9867, + "step": 4750 + }, + { + "epoch": 1.502013740819711, + "grad_norm": 0.02990805205321503, + "learning_rate": 0.00017702482547617067, + "loss": 0.9218, + "step": 4755 + }, + { + "epoch": 1.5035931453841902, + "grad_norm": 0.03897827731571156, + "learning_rate": 0.00017597364656128517, + "loss": 1.0062, + "step": 4760 + }, + { + "epoch": 1.5051725499486692, + "grad_norm": 0.027694122754996994, + "learning_rate": 0.0001749249310034969, + "loss": 0.9356, + "step": 4765 + }, + { + "epoch": 1.5067519545131485, + "grad_norm": 0.033195432920009905, + "learning_rate": 0.00017387868677549368, + "loss": 0.9328, + "step": 4770 + }, + { + "epoch": 1.5083313590776277, + "grad_norm": 0.031499993107616385, + "learning_rate": 0.0001728349218311751, + "loss": 0.9293, + "step": 4775 + }, + { + "epoch": 1.509910763642107, + "grad_norm": 0.029061947708505044, + "learning_rate": 0.00017179364410559284, + "loss": 0.9617, + "step": 4780 + }, + { + "epoch": 1.5114901682065862, + "grad_norm": 0.028165272129141487, + "learning_rate": 0.00017075486151488955, + "loss": 0.9074, + "step": 4785 + }, + { + "epoch": 1.5130695727710655, + "grad_norm": 0.02758690423618732, + "learning_rate": 0.00016971858195623897, + "loss": 0.9602, + "step": 4790 + }, + { + "epoch": 1.5146489773355445, + "grad_norm": 0.03188469784355415, + "learning_rate": 0.00016868481330778646, + "loss": 0.9115, + "step": 4795 + }, + { + "epoch": 1.5162283819000237, + "grad_norm": 0.028648464031357315, + "learning_rate": 0.00016765356342858794, + "loss": 0.874, + "step": 4800 + }, + { + "epoch": 1.5178077864645028, + "grad_norm": 0.02934106811153413, + "learning_rate": 0.00016662484015855152, + "loss": 0.9345, + "step": 4805 + }, + { + "epoch": 1.519387191028982, + "grad_norm": 0.03524406400581884, + "learning_rate": 0.0001655986513183763, + "loss": 0.9732, + "step": 4810 + }, + { + "epoch": 1.5209665955934613, + "grad_norm": 0.030630658877105973, + "learning_rate": 0.00016457500470949476, + "loss": 0.9414, + "step": 4815 + }, + { + "epoch": 1.5225460001579405, + "grad_norm": 0.03469678327082884, + "learning_rate": 0.00016355390811401176, + "loss": 0.9199, + "step": 4820 + }, + { + "epoch": 1.5241254047224198, + "grad_norm": 0.030372740472785865, + "learning_rate": 0.0001625353692946464, + "loss": 0.9456, + "step": 4825 + }, + { + "epoch": 1.525704809286899, + "grad_norm": 0.031793028072404264, + "learning_rate": 0.00016151939599467246, + "loss": 0.9958, + "step": 4830 + }, + { + "epoch": 1.527284213851378, + "grad_norm": 0.0310066550568518, + "learning_rate": 0.0001605059959378603, + "loss": 0.9201, + "step": 4835 + }, + { + "epoch": 1.5288636184158573, + "grad_norm": 0.029379841977030172, + "learning_rate": 0.00015949517682841712, + "loss": 0.9616, + "step": 4840 + }, + { + "epoch": 1.5304430229803363, + "grad_norm": 0.033451450559209034, + "learning_rate": 0.00015848694635092896, + "loss": 0.9249, + "step": 4845 + }, + { + "epoch": 1.5320224275448155, + "grad_norm": 0.031728724996236635, + "learning_rate": 0.00015748131217030258, + "loss": 0.9942, + "step": 4850 + }, + { + "epoch": 1.5336018321092948, + "grad_norm": 0.03353294850579401, + "learning_rate": 0.00015647828193170632, + "loss": 0.9606, + "step": 4855 + }, + { + "epoch": 1.535181236673774, + "grad_norm": 0.029190786275313668, + "learning_rate": 0.00015547786326051293, + "loss": 0.9939, + "step": 4860 + }, + { + "epoch": 1.5367606412382533, + "grad_norm": 0.02937275253640311, + "learning_rate": 0.00015448006376224066, + "loss": 0.9728, + "step": 4865 + }, + { + "epoch": 1.5383400458027325, + "grad_norm": 0.02972987748663535, + "learning_rate": 0.00015348489102249657, + "loss": 0.9799, + "step": 4870 + }, + { + "epoch": 1.5399194503672116, + "grad_norm": 0.028079108417540122, + "learning_rate": 0.00015249235260691763, + "loss": 0.9692, + "step": 4875 + }, + { + "epoch": 1.5414988549316906, + "grad_norm": 0.027471739413706998, + "learning_rate": 0.0001515024560611139, + "loss": 0.9209, + "step": 4880 + }, + { + "epoch": 1.5430782594961698, + "grad_norm": 0.028941544868934965, + "learning_rate": 0.00015051520891061143, + "loss": 0.9596, + "step": 4885 + }, + { + "epoch": 1.544657664060649, + "grad_norm": 0.02723478992768709, + "learning_rate": 0.0001495306186607942, + "loss": 0.9286, + "step": 4890 + }, + { + "epoch": 1.5462370686251283, + "grad_norm": 0.028916965276510317, + "learning_rate": 0.00014854869279684808, + "loss": 0.9878, + "step": 4895 + }, + { + "epoch": 1.5478164731896076, + "grad_norm": 0.030312213406193604, + "learning_rate": 0.00014756943878370266, + "loss": 0.9618, + "step": 4900 + }, + { + "epoch": 1.5493958777540868, + "grad_norm": 0.03473525605902538, + "learning_rate": 0.000146592864065976, + "loss": 0.9115, + "step": 4905 + }, + { + "epoch": 1.5509752823185659, + "grad_norm": 0.031870739232060324, + "learning_rate": 0.00014561897606791673, + "loss": 0.9396, + "step": 4910 + }, + { + "epoch": 1.552554686883045, + "grad_norm": 0.03690047717608636, + "learning_rate": 0.00014464778219334812, + "loss": 1.0796, + "step": 4915 + }, + { + "epoch": 1.5541340914475241, + "grad_norm": 0.03234558759055333, + "learning_rate": 0.00014367928982561234, + "loss": 0.96, + "step": 4920 + }, + { + "epoch": 1.5557134960120034, + "grad_norm": 0.031191637794967527, + "learning_rate": 0.00014271350632751313, + "loss": 0.9271, + "step": 4925 + }, + { + "epoch": 1.5572929005764826, + "grad_norm": 0.03077065665554426, + "learning_rate": 0.00014175043904126117, + "loss": 0.9352, + "step": 4930 + }, + { + "epoch": 1.5588723051409619, + "grad_norm": 0.030775870100129745, + "learning_rate": 0.00014079009528841668, + "loss": 1.0264, + "step": 4935 + }, + { + "epoch": 1.5604517097054411, + "grad_norm": 0.02898000438139762, + "learning_rate": 0.0001398324823698357, + "loss": 0.9509, + "step": 4940 + }, + { + "epoch": 1.5620311142699204, + "grad_norm": 0.027784668808561445, + "learning_rate": 0.00013887760756561268, + "loss": 1.0466, + "step": 4945 + }, + { + "epoch": 1.5636105188343994, + "grad_norm": 0.03145514968781478, + "learning_rate": 0.00013792547813502675, + "loss": 0.9426, + "step": 4950 + }, + { + "epoch": 1.5651899233988786, + "grad_norm": 0.031223582023416144, + "learning_rate": 0.0001369761013164851, + "loss": 0.9717, + "step": 4955 + }, + { + "epoch": 1.5667693279633577, + "grad_norm": 0.029446106573280874, + "learning_rate": 0.00013602948432746916, + "loss": 1.0059, + "step": 4960 + }, + { + "epoch": 1.568348732527837, + "grad_norm": 0.03060047149808178, + "learning_rate": 0.000135085634364479, + "loss": 0.9386, + "step": 4965 + }, + { + "epoch": 1.5699281370923162, + "grad_norm": 0.031111881817250466, + "learning_rate": 0.00013414455860297865, + "loss": 0.969, + "step": 4970 + }, + { + "epoch": 1.5715075416567954, + "grad_norm": 0.03352967313819241, + "learning_rate": 0.00013320626419734217, + "loss": 0.9655, + "step": 4975 + }, + { + "epoch": 1.5730869462212747, + "grad_norm": 0.029547593732058695, + "learning_rate": 0.00013227075828079832, + "loss": 0.9662, + "step": 4980 + }, + { + "epoch": 1.574666350785754, + "grad_norm": 0.029508552107430478, + "learning_rate": 0.00013133804796537735, + "loss": 0.9803, + "step": 4985 + }, + { + "epoch": 1.576245755350233, + "grad_norm": 0.02975554398000049, + "learning_rate": 0.00013040814034185588, + "loss": 1.0374, + "step": 4990 + }, + { + "epoch": 1.5778251599147122, + "grad_norm": 0.031578737148424206, + "learning_rate": 0.000129481042479704, + "loss": 0.9395, + "step": 4995 + }, + { + "epoch": 1.5794045644791912, + "grad_norm": 0.029663746491364242, + "learning_rate": 0.00012855676142703077, + "loss": 0.9928, + "step": 5000 + }, + { + "epoch": 1.5809839690436704, + "grad_norm": 0.029549754079616287, + "learning_rate": 0.00012763530421053076, + "loss": 0.8866, + "step": 5005 + }, + { + "epoch": 1.5825633736081497, + "grad_norm": 0.030569386705799836, + "learning_rate": 0.0001267166778354314, + "loss": 0.9777, + "step": 5010 + }, + { + "epoch": 1.584142778172629, + "grad_norm": 0.028048496212648384, + "learning_rate": 0.00012580088928543836, + "loss": 0.962, + "step": 5015 + }, + { + "epoch": 1.5857221827371082, + "grad_norm": 0.030860169017198416, + "learning_rate": 0.00012488794552268395, + "loss": 0.9141, + "step": 5020 + }, + { + "epoch": 1.5873015873015874, + "grad_norm": 0.029711018675052436, + "learning_rate": 0.0001239778534876727, + "loss": 0.9897, + "step": 5025 + }, + { + "epoch": 1.5888809918660665, + "grad_norm": 0.02982786530992063, + "learning_rate": 0.00012307062009923005, + "loss": 0.9729, + "step": 5030 + }, + { + "epoch": 1.5904603964305457, + "grad_norm": 0.029748108496392242, + "learning_rate": 0.0001221662522544486, + "loss": 0.9373, + "step": 5035 + }, + { + "epoch": 1.5920398009950247, + "grad_norm": 0.042065215046750926, + "learning_rate": 0.00012126475682863608, + "loss": 0.9409, + "step": 5040 + }, + { + "epoch": 1.593619205559504, + "grad_norm": 0.029375244036325774, + "learning_rate": 0.00012036614067526364, + "loss": 0.9917, + "step": 5045 + }, + { + "epoch": 1.5951986101239832, + "grad_norm": 0.040792830285248355, + "learning_rate": 0.00011947041062591274, + "loss": 0.9492, + "step": 5050 + }, + { + "epoch": 1.5967780146884625, + "grad_norm": 0.02942938481672079, + "learning_rate": 0.0001185775734902238, + "loss": 1.0339, + "step": 5055 + }, + { + "epoch": 1.5983574192529417, + "grad_norm": 0.027670992950519373, + "learning_rate": 0.00011768763605584437, + "loss": 0.898, + "step": 5060 + }, + { + "epoch": 1.599936823817421, + "grad_norm": 0.029414668742697336, + "learning_rate": 0.0001168006050883777, + "loss": 0.9601, + "step": 5065 + }, + { + "epoch": 1.6015162283819, + "grad_norm": 0.02715641122003609, + "learning_rate": 0.0001159164873313307, + "loss": 0.9179, + "step": 5070 + }, + { + "epoch": 1.6030956329463792, + "grad_norm": 0.0344629215298313, + "learning_rate": 0.00011503528950606363, + "loss": 0.9857, + "step": 5075 + }, + { + "epoch": 1.6046750375108583, + "grad_norm": 0.03305054684378565, + "learning_rate": 0.0001141570183117378, + "loss": 0.9162, + "step": 5080 + }, + { + "epoch": 1.6062544420753375, + "grad_norm": 0.0302924518110162, + "learning_rate": 0.00011328168042526594, + "loss": 0.9946, + "step": 5085 + }, + { + "epoch": 1.6078338466398168, + "grad_norm": 0.0348715109293973, + "learning_rate": 0.00011240928250126026, + "loss": 0.9149, + "step": 5090 + }, + { + "epoch": 1.609413251204296, + "grad_norm": 0.028970342452736657, + "learning_rate": 0.00011153983117198252, + "loss": 0.9643, + "step": 5095 + }, + { + "epoch": 1.6109926557687753, + "grad_norm": 0.02909734940218002, + "learning_rate": 0.00011067333304729382, + "loss": 0.9972, + "step": 5100 + }, + { + "epoch": 1.6125720603332545, + "grad_norm": 0.030720347491084186, + "learning_rate": 0.00010980979471460339, + "loss": 0.9243, + "step": 5105 + }, + { + "epoch": 1.6141514648977335, + "grad_norm": 0.028974249632074664, + "learning_rate": 0.0001089492227388199, + "loss": 0.9629, + "step": 5110 + }, + { + "epoch": 1.6157308694622128, + "grad_norm": 0.03122595729483744, + "learning_rate": 0.00010809162366229996, + "loss": 1.0212, + "step": 5115 + }, + { + "epoch": 1.6173102740266918, + "grad_norm": 0.03136511591330045, + "learning_rate": 0.00010723700400479997, + "loss": 1.0273, + "step": 5120 + }, + { + "epoch": 1.618889678591171, + "grad_norm": 0.02825384699316145, + "learning_rate": 0.00010638537026342515, + "loss": 0.9214, + "step": 5125 + }, + { + "epoch": 1.6204690831556503, + "grad_norm": 0.03165649689077606, + "learning_rate": 0.00010553672891258104, + "loss": 0.9782, + "step": 5130 + }, + { + "epoch": 1.6220484877201296, + "grad_norm": 0.034210643516055385, + "learning_rate": 0.00010469108640392422, + "loss": 1.0466, + "step": 5135 + }, + { + "epoch": 1.6236278922846088, + "grad_norm": 0.026834499698806056, + "learning_rate": 0.00010384844916631264, + "loss": 0.9678, + "step": 5140 + }, + { + "epoch": 1.625207296849088, + "grad_norm": 0.028988043467771108, + "learning_rate": 0.00010300882360575775, + "loss": 0.9699, + "step": 5145 + }, + { + "epoch": 1.626786701413567, + "grad_norm": 0.030870225331222148, + "learning_rate": 0.00010217221610537448, + "loss": 0.97, + "step": 5150 + }, + { + "epoch": 1.6283661059780463, + "grad_norm": 0.0350020851497268, + "learning_rate": 0.0001013386330253343, + "loss": 1.0223, + "step": 5155 + }, + { + "epoch": 1.6299455105425253, + "grad_norm": 0.0281915476142278, + "learning_rate": 0.00010050808070281508, + "loss": 1.0168, + "step": 5160 + }, + { + "epoch": 1.6315249151070046, + "grad_norm": 0.029815543401568854, + "learning_rate": 9.968056545195476e-05, + "loss": 0.9887, + "step": 5165 + }, + { + "epoch": 1.6331043196714838, + "grad_norm": 0.03017266635366265, + "learning_rate": 9.88560935638017e-05, + "loss": 0.9811, + "step": 5170 + }, + { + "epoch": 1.634683724235963, + "grad_norm": 0.02814923543583342, + "learning_rate": 9.80346713062682e-05, + "loss": 0.9602, + "step": 5175 + }, + { + "epoch": 1.6362631288004423, + "grad_norm": 0.027320031708459765, + "learning_rate": 9.72163049240819e-05, + "loss": 0.9273, + "step": 5180 + }, + { + "epoch": 1.6378425333649216, + "grad_norm": 0.029917340873650416, + "learning_rate": 9.640100063873852e-05, + "loss": 0.9804, + "step": 5185 + }, + { + "epoch": 1.6394219379294006, + "grad_norm": 0.027497721634484554, + "learning_rate": 9.558876464845517e-05, + "loss": 0.9379, + "step": 5190 + }, + { + "epoch": 1.6410013424938799, + "grad_norm": 0.029168251943545543, + "learning_rate": 9.477960312812217e-05, + "loss": 0.9427, + "step": 5195 + }, + { + "epoch": 1.6425807470583589, + "grad_norm": 0.029310743945918462, + "learning_rate": 9.397352222925737e-05, + "loss": 0.935, + "step": 5200 + }, + { + "epoch": 1.6441601516228381, + "grad_norm": 0.03490911408333341, + "learning_rate": 9.317052807995797e-05, + "loss": 0.9037, + "step": 5205 + }, + { + "epoch": 1.6457395561873174, + "grad_norm": 0.029412651272320862, + "learning_rate": 9.23706267848553e-05, + "loss": 1.0066, + "step": 5210 + }, + { + "epoch": 1.6473189607517966, + "grad_norm": 0.03265358238275424, + "learning_rate": 9.157382442506734e-05, + "loss": 0.9727, + "step": 5215 + }, + { + "epoch": 1.6488983653162759, + "grad_norm": 0.03207810008519121, + "learning_rate": 9.078012705815297e-05, + "loss": 0.9691, + "step": 5220 + }, + { + "epoch": 1.6504777698807551, + "grad_norm": 0.033924430870274636, + "learning_rate": 8.998954071806625e-05, + "loss": 0.9386, + "step": 5225 + }, + { + "epoch": 1.6520571744452341, + "grad_norm": 0.03025695180847416, + "learning_rate": 8.920207141510962e-05, + "loss": 0.9494, + "step": 5230 + }, + { + "epoch": 1.6536365790097134, + "grad_norm": 0.04335743382466986, + "learning_rate": 8.841772513588919e-05, + "loss": 0.9915, + "step": 5235 + }, + { + "epoch": 1.6552159835741924, + "grad_norm": 0.030378840122880894, + "learning_rate": 8.763650784326855e-05, + "loss": 0.9172, + "step": 5240 + }, + { + "epoch": 1.6567953881386717, + "grad_norm": 0.028274908027186057, + "learning_rate": 8.685842547632395e-05, + "loss": 0.9718, + "step": 5245 + }, + { + "epoch": 1.658374792703151, + "grad_norm": 0.03839721528686618, + "learning_rate": 8.608348395029859e-05, + "loss": 0.9117, + "step": 5250 + }, + { + "epoch": 1.6599541972676302, + "grad_norm": 0.03365555844021039, + "learning_rate": 8.531168915655785e-05, + "loss": 0.9401, + "step": 5255 + }, + { + "epoch": 1.6615336018321094, + "grad_norm": 0.02967917392975241, + "learning_rate": 8.454304696254516e-05, + "loss": 0.9328, + "step": 5260 + }, + { + "epoch": 1.6631130063965884, + "grad_norm": 0.02915381706391397, + "learning_rate": 8.377756321173629e-05, + "loss": 0.898, + "step": 5265 + }, + { + "epoch": 1.6646924109610677, + "grad_norm": 0.02733055819538654, + "learning_rate": 8.30152437235957e-05, + "loss": 0.9382, + "step": 5270 + }, + { + "epoch": 1.6662718155255467, + "grad_norm": 0.03054724576737309, + "learning_rate": 8.225609429353187e-05, + "loss": 0.9498, + "step": 5275 + }, + { + "epoch": 1.667851220090026, + "grad_norm": 0.028099357270467477, + "learning_rate": 8.150012069285373e-05, + "loss": 0.9599, + "step": 5280 + }, + { + "epoch": 1.6694306246545052, + "grad_norm": 0.03324146580650551, + "learning_rate": 8.074732866872619e-05, + "loss": 0.9326, + "step": 5285 + }, + { + "epoch": 1.6710100292189844, + "grad_norm": 0.029211888199556814, + "learning_rate": 7.999772394412713e-05, + "loss": 0.9251, + "step": 5290 + }, + { + "epoch": 1.6725894337834637, + "grad_norm": 0.02838078725555603, + "learning_rate": 7.925131221780297e-05, + "loss": 0.9026, + "step": 5295 + }, + { + "epoch": 1.674168838347943, + "grad_norm": 0.027425274238882927, + "learning_rate": 7.85080991642264e-05, + "loss": 0.9298, + "step": 5300 + }, + { + "epoch": 1.675748242912422, + "grad_norm": 0.031163908927535056, + "learning_rate": 7.776809043355254e-05, + "loss": 0.9946, + "step": 5305 + }, + { + "epoch": 1.6773276474769012, + "grad_norm": 0.030309070448334655, + "learning_rate": 7.703129165157586e-05, + "loss": 1.0243, + "step": 5310 + }, + { + "epoch": 1.6789070520413802, + "grad_norm": 0.031389990741688696, + "learning_rate": 7.629770841968837e-05, + "loss": 1.0145, + "step": 5315 + }, + { + "epoch": 1.6804864566058595, + "grad_norm": 0.032247096608187115, + "learning_rate": 7.556734631483564e-05, + "loss": 0.9475, + "step": 5320 + }, + { + "epoch": 1.6820658611703387, + "grad_norm": 0.03219927524276064, + "learning_rate": 7.484021088947591e-05, + "loss": 0.9795, + "step": 5325 + }, + { + "epoch": 1.683645265734818, + "grad_norm": 0.028086436761979167, + "learning_rate": 7.411630767153643e-05, + "loss": 0.969, + "step": 5330 + }, + { + "epoch": 1.6852246702992972, + "grad_norm": 0.03177602647774136, + "learning_rate": 7.339564216437273e-05, + "loss": 0.9481, + "step": 5335 + }, + { + "epoch": 1.6868040748637765, + "grad_norm": 0.029274911998221066, + "learning_rate": 7.267821984672573e-05, + "loss": 0.9434, + "step": 5340 + }, + { + "epoch": 1.6883834794282555, + "grad_norm": 0.030745705722408884, + "learning_rate": 7.196404617268059e-05, + "loss": 1.0108, + "step": 5345 + }, + { + "epoch": 1.6899628839927348, + "grad_norm": 0.031067297390136, + "learning_rate": 7.125312657162547e-05, + "loss": 0.9551, + "step": 5350 + }, + { + "epoch": 1.6915422885572138, + "grad_norm": 0.02853230768950431, + "learning_rate": 7.054546644820964e-05, + "loss": 0.9126, + "step": 5355 + }, + { + "epoch": 1.693121693121693, + "grad_norm": 0.027851652070922075, + "learning_rate": 6.984107118230309e-05, + "loss": 0.9397, + "step": 5360 + }, + { + "epoch": 1.6947010976861723, + "grad_norm": 0.03409950787348448, + "learning_rate": 6.91399461289548e-05, + "loss": 0.9741, + "step": 5365 + }, + { + "epoch": 1.6962805022506515, + "grad_norm": 0.031246138770397752, + "learning_rate": 6.844209661835299e-05, + "loss": 0.91, + "step": 5370 + }, + { + "epoch": 1.6978599068151308, + "grad_norm": 0.02818147666089724, + "learning_rate": 6.774752795578365e-05, + "loss": 0.9463, + "step": 5375 + }, + { + "epoch": 1.69943931137961, + "grad_norm": 0.02768868505742041, + "learning_rate": 6.705624542159123e-05, + "loss": 0.9933, + "step": 5380 + }, + { + "epoch": 1.701018715944089, + "grad_norm": 0.02933701599491612, + "learning_rate": 6.636825427113718e-05, + "loss": 0.9009, + "step": 5385 + }, + { + "epoch": 1.7025981205085683, + "grad_norm": 0.028010286533507792, + "learning_rate": 6.568355973476136e-05, + "loss": 0.9082, + "step": 5390 + }, + { + "epoch": 1.7041775250730473, + "grad_norm": 0.02810226017668788, + "learning_rate": 6.500216701774147e-05, + "loss": 0.9339, + "step": 5395 + }, + { + "epoch": 1.7057569296375266, + "grad_norm": 0.03158388238396385, + "learning_rate": 6.432408130025347e-05, + "loss": 0.9326, + "step": 5400 + }, + { + "epoch": 1.7073363342020058, + "grad_norm": 0.028939507313212626, + "learning_rate": 6.36493077373328e-05, + "loss": 0.9599, + "step": 5405 + }, + { + "epoch": 1.708915738766485, + "grad_norm": 0.029375924859892183, + "learning_rate": 6.297785145883439e-05, + "loss": 0.9643, + "step": 5410 + }, + { + "epoch": 1.7104951433309643, + "grad_norm": 0.028567698771268964, + "learning_rate": 6.230971756939441e-05, + "loss": 1.0528, + "step": 5415 + }, + { + "epoch": 1.7120745478954436, + "grad_norm": 0.03190272267423509, + "learning_rate": 6.164491114839077e-05, + "loss": 0.9126, + "step": 5420 + }, + { + "epoch": 1.7136539524599226, + "grad_norm": 0.03278810172581454, + "learning_rate": 6.098343724990524e-05, + "loss": 0.9017, + "step": 5425 + }, + { + "epoch": 1.7152333570244018, + "grad_norm": 0.03128291196025068, + "learning_rate": 6.032530090268429e-05, + "loss": 0.9239, + "step": 5430 + }, + { + "epoch": 1.7168127615888809, + "grad_norm": 0.029785540541996565, + "learning_rate": 5.967050711010119e-05, + "loss": 0.9322, + "step": 5435 + }, + { + "epoch": 1.71839216615336, + "grad_norm": 0.030098440572351545, + "learning_rate": 5.9019060850118434e-05, + "loss": 0.9287, + "step": 5440 + }, + { + "epoch": 1.7199715707178393, + "grad_norm": 0.028159321421875484, + "learning_rate": 5.837096707524886e-05, + "loss": 0.8821, + "step": 5445 + }, + { + "epoch": 1.7215509752823186, + "grad_norm": 0.028063721609171772, + "learning_rate": 5.772623071251915e-05, + "loss": 0.9866, + "step": 5450 + }, + { + "epoch": 1.7231303798467978, + "grad_norm": 0.03500911377144775, + "learning_rate": 5.7084856663431216e-05, + "loss": 1.0065, + "step": 5455 + }, + { + "epoch": 1.724709784411277, + "grad_norm": 0.02767605696654335, + "learning_rate": 5.644684980392617e-05, + "loss": 0.8997, + "step": 5460 + }, + { + "epoch": 1.7262891889757561, + "grad_norm": 0.026826975417503198, + "learning_rate": 5.5812214984346074e-05, + "loss": 0.9658, + "step": 5465 + }, + { + "epoch": 1.7278685935402354, + "grad_norm": 0.027611668243687007, + "learning_rate": 5.518095702939807e-05, + "loss": 0.9327, + "step": 5470 + }, + { + "epoch": 1.7294479981047144, + "grad_norm": 0.02754288098514866, + "learning_rate": 5.4553080738116826e-05, + "loss": 0.9229, + "step": 5475 + }, + { + "epoch": 1.7310274026691936, + "grad_norm": 0.027193439151797328, + "learning_rate": 5.392859088382856e-05, + "loss": 0.8753, + "step": 5480 + }, + { + "epoch": 1.7326068072336729, + "grad_norm": 0.027601347646716512, + "learning_rate": 5.330749221411507e-05, + "loss": 0.9554, + "step": 5485 + }, + { + "epoch": 1.7341862117981521, + "grad_norm": 0.03515769678408284, + "learning_rate": 5.268978945077668e-05, + "loss": 1.0282, + "step": 5490 + }, + { + "epoch": 1.7357656163626314, + "grad_norm": 0.02894574402350519, + "learning_rate": 5.207548728979716e-05, + "loss": 0.9606, + "step": 5495 + }, + { + "epoch": 1.7373450209271106, + "grad_norm": 0.027702253930178652, + "learning_rate": 5.1464590401307684e-05, + "loss": 0.8957, + "step": 5500 + }, + { + "epoch": 1.7389244254915897, + "grad_norm": 0.03350890536428408, + "learning_rate": 5.085710342955163e-05, + "loss": 0.9537, + "step": 5505 + }, + { + "epoch": 1.740503830056069, + "grad_norm": 0.03142247327424322, + "learning_rate": 5.0253030992848616e-05, + "loss": 0.9703, + "step": 5510 + }, + { + "epoch": 1.742083234620548, + "grad_norm": 0.030854271335729343, + "learning_rate": 4.965237768356029e-05, + "loss": 0.8991, + "step": 5515 + }, + { + "epoch": 1.7436626391850272, + "grad_norm": 0.02972820758050759, + "learning_rate": 4.905514806805456e-05, + "loss": 0.9566, + "step": 5520 + }, + { + "epoch": 1.7452420437495064, + "grad_norm": 0.027053904669588996, + "learning_rate": 4.8461346686671405e-05, + "loss": 0.9043, + "step": 5525 + }, + { + "epoch": 1.7468214483139857, + "grad_norm": 0.02752918466003326, + "learning_rate": 4.787097805368839e-05, + "loss": 0.9152, + "step": 5530 + }, + { + "epoch": 1.748400852878465, + "grad_norm": 0.027898582750512425, + "learning_rate": 4.728404665728586e-05, + "loss": 0.8763, + "step": 5535 + }, + { + "epoch": 1.7499802574429442, + "grad_norm": 0.03174675042457659, + "learning_rate": 4.670055695951342e-05, + "loss": 0.9377, + "step": 5540 + }, + { + "epoch": 1.7515596620074232, + "grad_norm": 0.02953452769699611, + "learning_rate": 4.6120513396255446e-05, + "loss": 0.9275, + "step": 5545 + }, + { + "epoch": 1.7531390665719024, + "grad_norm": 0.028278131840278015, + "learning_rate": 4.554392037719801e-05, + "loss": 0.9219, + "step": 5550 + }, + { + "epoch": 1.7547184711363815, + "grad_norm": 0.02883058939172368, + "learning_rate": 4.4970782285794484e-05, + "loss": 0.9637, + "step": 5555 + }, + { + "epoch": 1.7562978757008607, + "grad_norm": 0.02892465549363483, + "learning_rate": 4.440110347923332e-05, + "loss": 0.9655, + "step": 5560 + }, + { + "epoch": 1.75787728026534, + "grad_norm": 0.028065875175745866, + "learning_rate": 4.383488828840387e-05, + "loss": 0.9238, + "step": 5565 + }, + { + "epoch": 1.7594566848298192, + "grad_norm": 0.029174737233365906, + "learning_rate": 4.327214101786397e-05, + "loss": 0.9855, + "step": 5570 + }, + { + "epoch": 1.7610360893942985, + "grad_norm": 0.0285435239640047, + "learning_rate": 4.271286594580748e-05, + "loss": 0.9132, + "step": 5575 + }, + { + "epoch": 1.7626154939587777, + "grad_norm": 0.0288796816691122, + "learning_rate": 4.215706732403096e-05, + "loss": 0.9095, + "step": 5580 + }, + { + "epoch": 1.7641948985232567, + "grad_norm": 0.02956368548489994, + "learning_rate": 4.160474937790232e-05, + "loss": 0.9589, + "step": 5585 + }, + { + "epoch": 1.765774303087736, + "grad_norm": 0.028132040846672902, + "learning_rate": 4.105591630632777e-05, + "loss": 0.9333, + "step": 5590 + }, + { + "epoch": 1.767353707652215, + "grad_norm": 0.02957005723683004, + "learning_rate": 4.051057228172073e-05, + "loss": 0.9258, + "step": 5595 + }, + { + "epoch": 1.7689331122166942, + "grad_norm": 0.032839128835579115, + "learning_rate": 3.996872144996938e-05, + "loss": 0.9229, + "step": 5600 + }, + { + "epoch": 1.7705125167811735, + "grad_norm": 0.028340339388521443, + "learning_rate": 3.9430367930405666e-05, + "loss": 0.9532, + "step": 5605 + }, + { + "epoch": 1.7720919213456527, + "grad_norm": 0.02798928468339342, + "learning_rate": 3.8895515815773774e-05, + "loss": 0.9623, + "step": 5610 + }, + { + "epoch": 1.773671325910132, + "grad_norm": 0.027450223264395033, + "learning_rate": 3.836416917219881e-05, + "loss": 0.9309, + "step": 5615 + }, + { + "epoch": 1.775250730474611, + "grad_norm": 0.02861113029376848, + "learning_rate": 3.783633203915654e-05, + "loss": 0.9085, + "step": 5620 + }, + { + "epoch": 1.7768301350390903, + "grad_norm": 0.028077426206203647, + "learning_rate": 3.731200842944182e-05, + "loss": 0.9115, + "step": 5625 + }, + { + "epoch": 1.7784095396035693, + "grad_norm": 0.027683250225642483, + "learning_rate": 3.6791202329138965e-05, + "loss": 0.8986, + "step": 5630 + }, + { + "epoch": 1.7799889441680485, + "grad_norm": 0.028563107486100536, + "learning_rate": 3.6273917697590475e-05, + "loss": 0.9682, + "step": 5635 + }, + { + "epoch": 1.7815683487325278, + "grad_norm": 0.029066045783134917, + "learning_rate": 3.576015846736797e-05, + "loss": 0.9404, + "step": 5640 + }, + { + "epoch": 1.783147753297007, + "grad_norm": 0.031421469836084966, + "learning_rate": 3.524992854424147e-05, + "loss": 0.94, + "step": 5645 + }, + { + "epoch": 1.7847271578614863, + "grad_norm": 0.0290390057106864, + "learning_rate": 3.4743231807150056e-05, + "loss": 0.9669, + "step": 5650 + }, + { + "epoch": 1.7863065624259655, + "grad_norm": 0.027429872792924102, + "learning_rate": 3.4240072108172485e-05, + "loss": 0.9191, + "step": 5655 + }, + { + "epoch": 1.7878859669904446, + "grad_norm": 0.02743182866648419, + "learning_rate": 3.3740453272497585e-05, + "loss": 0.8912, + "step": 5660 + }, + { + "epoch": 1.7894653715549238, + "grad_norm": 0.03289260196623006, + "learning_rate": 3.324437909839556e-05, + "loss": 0.9597, + "step": 5665 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.031067149441916974, + "learning_rate": 3.275185335718861e-05, + "loss": 0.9613, + "step": 5670 + }, + { + "epoch": 1.792624180683882, + "grad_norm": 0.027403840944075043, + "learning_rate": 3.226287979322295e-05, + "loss": 0.9077, + "step": 5675 + }, + { + "epoch": 1.7942035852483613, + "grad_norm": 0.027638003709822466, + "learning_rate": 3.177746212383953e-05, + "loss": 1.0154, + "step": 5680 + }, + { + "epoch": 1.7957829898128406, + "grad_norm": 0.027981137344302076, + "learning_rate": 3.1295604039346615e-05, + "loss": 0.9223, + "step": 5685 + }, + { + "epoch": 1.7973623943773198, + "grad_norm": 0.02869606714386023, + "learning_rate": 3.0817309202990916e-05, + "loss": 0.9966, + "step": 5690 }, { - "epoch": 1.4285714285714286, - "grad_norm": 31.90907042465711, - "learning_rate": 9.549150281252633e-05, - "loss": 19.9897, - "step": 5 + "epoch": 1.798941798941799, + "grad_norm": 0.029474493487004923, + "learning_rate": 3.0342581250930368e-05, + "loss": 1.0082, + "step": 5695 + }, + { + "epoch": 1.800521203506278, + "grad_norm": 0.029469302160620065, + "learning_rate": 2.9871423792206252e-05, + "loss": 0.9922, + "step": 5700 + }, + { + "epoch": 1.8021006080707573, + "grad_norm": 0.030119473606862637, + "learning_rate": 2.940384040871563e-05, + "loss": 0.9676, + "step": 5705 + }, + { + "epoch": 1.8036800126352364, + "grad_norm": 0.03392551374610534, + "learning_rate": 2.893983465518446e-05, + "loss": 0.9468, + "step": 5710 + }, + { + "epoch": 1.8052594171997156, + "grad_norm": 0.028895121987632597, + "learning_rate": 2.847941005914012e-05, + "loss": 0.9319, + "step": 5715 + }, + { + "epoch": 1.8068388217641949, + "grad_norm": 0.03129999681800566, + "learning_rate": 2.8022570120884937e-05, + "loss": 1.0065, + "step": 5720 + }, + { + "epoch": 1.808418226328674, + "grad_norm": 0.027645868323954787, + "learning_rate": 2.756931831346937e-05, + "loss": 0.9222, + "step": 5725 + }, + { + "epoch": 1.8099976308931534, + "grad_norm": 0.029121807025221366, + "learning_rate": 2.7119658082666034e-05, + "loss": 0.895, + "step": 5730 + }, + { + "epoch": 1.8115770354576326, + "grad_norm": 0.02854616322766256, + "learning_rate": 2.6673592846942707e-05, + "loss": 0.988, + "step": 5735 + }, + { + "epoch": 1.8131564400221116, + "grad_norm": 0.030400035928911648, + "learning_rate": 2.62311259974371e-05, + "loss": 0.9156, + "step": 5740 + }, + { + "epoch": 1.8147358445865909, + "grad_norm": 0.02879072839156828, + "learning_rate": 2.579226089793074e-05, + "loss": 0.9106, + "step": 5745 + }, + { + "epoch": 1.81631524915107, + "grad_norm": 0.02891714024162058, + "learning_rate": 2.5357000884823344e-05, + "loss": 0.982, + "step": 5750 + }, + { + "epoch": 1.8178946537155491, + "grad_norm": 0.027645185731986426, + "learning_rate": 2.4925349267107765e-05, + "loss": 0.9915, + "step": 5755 + }, + { + "epoch": 1.8194740582800284, + "grad_norm": 0.028868560331025948, + "learning_rate": 2.4497309326344364e-05, + "loss": 0.9583, + "step": 5760 + }, + { + "epoch": 1.8210534628445076, + "grad_norm": 0.028199422420434998, + "learning_rate": 2.4072884316636512e-05, + "loss": 0.9967, + "step": 5765 + }, + { + "epoch": 1.822632867408987, + "grad_norm": 0.026658531525045798, + "learning_rate": 2.3652077464605514e-05, + "loss": 0.9488, + "step": 5770 + }, + { + "epoch": 1.8242122719734661, + "grad_norm": 0.035756806143134, + "learning_rate": 2.32348919693664e-05, + "loss": 0.9512, + "step": 5775 + }, + { + "epoch": 1.8257916765379452, + "grad_norm": 0.027506596305714767, + "learning_rate": 2.2821331002503276e-05, + "loss": 0.9161, + "step": 5780 + }, + { + "epoch": 1.8273710811024244, + "grad_norm": 0.028991229999471242, + "learning_rate": 2.2411397708045346e-05, + "loss": 0.8853, + "step": 5785 + }, + { + "epoch": 1.8289504856669034, + "grad_norm": 0.02828101088834613, + "learning_rate": 2.200509520244326e-05, + "loss": 0.9581, + "step": 5790 + }, + { + "epoch": 1.8305298902313827, + "grad_norm": 0.08332681448836027, + "learning_rate": 2.1602426574544863e-05, + "loss": 0.916, + "step": 5795 + }, + { + "epoch": 1.832109294795862, + "grad_norm": 0.02904806118099692, + "learning_rate": 2.1203394885572436e-05, + "loss": 0.9825, + "step": 5800 + }, + { + "epoch": 1.8336886993603412, + "grad_norm": 0.027670952342160212, + "learning_rate": 2.0808003169098587e-05, + "loss": 1.0126, + "step": 5805 + }, + { + "epoch": 1.8352681039248204, + "grad_norm": 0.027656961469499166, + "learning_rate": 2.0416254431024073e-05, + "loss": 0.8975, + "step": 5810 + }, + { + "epoch": 1.8368475084892997, + "grad_norm": 0.028413585230628282, + "learning_rate": 2.0028151649554126e-05, + "loss": 0.9336, + "step": 5815 + }, + { + "epoch": 1.8384269130537787, + "grad_norm": 0.027511851975369536, + "learning_rate": 1.964369777517644e-05, + "loss": 0.9466, + "step": 5820 + }, + { + "epoch": 1.840006317618258, + "grad_norm": 0.02815057689234334, + "learning_rate": 1.9262895730638387e-05, + "loss": 0.9238, + "step": 5825 + }, + { + "epoch": 1.841585722182737, + "grad_norm": 0.031429023025303086, + "learning_rate": 1.8885748410924884e-05, + "loss": 0.9359, + "step": 5830 + }, + { + "epoch": 1.8431651267472162, + "grad_norm": 0.027798306464001606, + "learning_rate": 1.8512258683236525e-05, + "loss": 0.9302, + "step": 5835 + }, + { + "epoch": 1.8447445313116955, + "grad_norm": 0.028933369811650223, + "learning_rate": 1.8142429386967473e-05, + "loss": 0.923, + "step": 5840 + }, + { + "epoch": 1.8463239358761747, + "grad_norm": 0.027960711205781297, + "learning_rate": 1.7776263333684316e-05, + "loss": 0.9287, + "step": 5845 + }, + { + "epoch": 1.847903340440654, + "grad_norm": 0.026802432992602397, + "learning_rate": 1.7413763307104092e-05, + "loss": 0.9873, + "step": 5850 + }, + { + "epoch": 1.8494827450051332, + "grad_norm": 0.030407224531095515, + "learning_rate": 1.70549320630739e-05, + "loss": 0.9345, + "step": 5855 + }, + { + "epoch": 1.8510621495696122, + "grad_norm": 0.03016268684640237, + "learning_rate": 1.669977232954911e-05, + "loss": 0.9718, + "step": 5860 + }, + { + "epoch": 1.8526415541340915, + "grad_norm": 0.029031201241125883, + "learning_rate": 1.6348286806573354e-05, + "loss": 0.9148, + "step": 5865 + }, + { + "epoch": 1.8542209586985705, + "grad_norm": 0.026706981294734724, + "learning_rate": 1.6000478166257494e-05, + "loss": 0.9294, + "step": 5870 + }, + { + "epoch": 1.8558003632630498, + "grad_norm": 0.027765072457612058, + "learning_rate": 1.5656349052759533e-05, + "loss": 1.0392, + "step": 5875 + }, + { + "epoch": 1.857379767827529, + "grad_norm": 0.02930140524141711, + "learning_rate": 1.5315902082264577e-05, + "loss": 1.0084, + "step": 5880 + }, + { + "epoch": 1.8589591723920083, + "grad_norm": 0.02615279031598757, + "learning_rate": 1.4979139842964674e-05, + "loss": 0.9409, + "step": 5885 + }, + { + "epoch": 1.8605385769564875, + "grad_norm": 0.027246642272342872, + "learning_rate": 1.4646064895039502e-05, + "loss": 0.8993, + "step": 5890 + }, + { + "epoch": 1.8621179815209667, + "grad_norm": 0.030922173080159505, + "learning_rate": 1.4316679770636498e-05, + "loss": 0.9891, + "step": 5895 + }, + { + "epoch": 1.8636973860854458, + "grad_norm": 0.029229489681193894, + "learning_rate": 1.3990986973852039e-05, + "loss": 0.9553, + "step": 5900 + }, + { + "epoch": 1.865276790649925, + "grad_norm": 0.02719994991525794, + "learning_rate": 1.3668988980712005e-05, + "loss": 0.9284, + "step": 5905 + }, + { + "epoch": 1.866856195214404, + "grad_norm": 0.02866064080644265, + "learning_rate": 1.3350688239153196e-05, + "loss": 0.9299, + "step": 5910 + }, + { + "epoch": 1.8684355997788833, + "grad_norm": 0.06285937243773189, + "learning_rate": 1.303608716900484e-05, + "loss": 1.0569, + "step": 5915 + }, + { + "epoch": 1.8700150043433625, + "grad_norm": 0.02930390176914617, + "learning_rate": 1.2725188161969659e-05, + "loss": 0.9373, + "step": 5920 + }, + { + "epoch": 1.8715944089078418, + "grad_norm": 0.027319721436793096, + "learning_rate": 1.2417993581606447e-05, + "loss": 0.9802, + "step": 5925 + }, + { + "epoch": 1.873173813472321, + "grad_norm": 0.027699411481698355, + "learning_rate": 1.2114505763311356e-05, + "loss": 0.9254, + "step": 5930 + }, + { + "epoch": 1.8747532180368003, + "grad_norm": 0.027341193658824932, + "learning_rate": 1.1814727014300807e-05, + "loss": 0.8946, + "step": 5935 + }, + { + "epoch": 1.8763326226012793, + "grad_norm": 0.02829423795989287, + "learning_rate": 1.151865961359333e-05, + "loss": 0.9129, + "step": 5940 + }, + { + "epoch": 1.8779120271657586, + "grad_norm": 0.02724668316770682, + "learning_rate": 1.1226305811992743e-05, + "loss": 0.9241, + "step": 5945 + }, + { + "epoch": 1.8794914317302376, + "grad_norm": 0.027357269499672484, + "learning_rate": 1.093766783207073e-05, + "loss": 1.029, + "step": 5950 + }, + { + "epoch": 1.8810708362947168, + "grad_norm": 0.02737662342078075, + "learning_rate": 1.0652747868150125e-05, + "loss": 0.9155, + "step": 5955 + }, + { + "epoch": 1.882650240859196, + "grad_norm": 0.02909080740490778, + "learning_rate": 1.0371548086288207e-05, + "loss": 0.9197, + "step": 5960 + }, + { + "epoch": 1.8842296454236753, + "grad_norm": 0.02851471169073095, + "learning_rate": 1.0094070624259877e-05, + "loss": 0.9217, + "step": 5965 + }, + { + "epoch": 1.8858090499881546, + "grad_norm": 0.030988378240067956, + "learning_rate": 9.820317591542172e-06, + "loss": 0.9359, + "step": 5970 + }, + { + "epoch": 1.8873884545526338, + "grad_norm": 0.03114149770416287, + "learning_rate": 9.550291069297445e-06, + "loss": 0.9198, + "step": 5975 + }, + { + "epoch": 1.8889678591171128, + "grad_norm": 0.03499816306156768, + "learning_rate": 9.283993110357936e-06, + "loss": 0.9136, + "step": 5980 + }, + { + "epoch": 1.890547263681592, + "grad_norm": 0.028861934812947413, + "learning_rate": 9.021425739210054e-06, + "loss": 1.0039, + "step": 5985 + }, + { + "epoch": 1.8921266682460711, + "grad_norm": 0.029981362197630507, + "learning_rate": 8.762590951979232e-06, + "loss": 0.9442, + "step": 5990 + }, + { + "epoch": 1.8937060728105504, + "grad_norm": 0.027640443348192135, + "learning_rate": 8.507490716414269e-06, + "loss": 0.878, + "step": 5995 + }, + { + "epoch": 1.8952854773750296, + "grad_norm": 0.027628387408123886, + "learning_rate": 8.256126971872835e-06, + "loss": 0.9266, + "step": 6000 + }, + { + "epoch": 1.8968648819395089, + "grad_norm": 0.028137456844671317, + "learning_rate": 8.008501629306497e-06, + "loss": 0.9445, + "step": 6005 + }, + { + "epoch": 1.898444286503988, + "grad_norm": 0.030635181924957434, + "learning_rate": 7.764616571246218e-06, + "loss": 0.9492, + "step": 6010 + }, + { + "epoch": 1.9000236910684671, + "grad_norm": 0.03022552040333277, + "learning_rate": 7.524473651788044e-06, + "loss": 0.9865, + "step": 6015 + }, + { + "epoch": 1.9016030956329464, + "grad_norm": 0.027532356125552514, + "learning_rate": 7.288074696578995e-06, + "loss": 0.9205, + "step": 6020 + }, + { + "epoch": 1.9031825001974254, + "grad_norm": 0.03037252231239617, + "learning_rate": 7.055421502803416e-06, + "loss": 1.0084, + "step": 6025 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.029191841877876303, + "learning_rate": 6.826515839168934e-06, + "loss": 0.9652, + "step": 6030 + }, + { + "epoch": 1.906341309326384, + "grad_norm": 0.03132558846523111, + "learning_rate": 6.6013594458931295e-06, + "loss": 0.9523, + "step": 6035 + }, + { + "epoch": 1.9079207138908632, + "grad_norm": 0.02818847729664874, + "learning_rate": 6.379954034690605e-06, + "loss": 0.9847, + "step": 6040 + }, + { + "epoch": 1.9095001184553424, + "grad_norm": 0.028021954410402724, + "learning_rate": 6.162301288759498e-06, + "loss": 0.9539, + "step": 6045 + }, + { + "epoch": 1.9110795230198216, + "grad_norm": 0.029421920413576684, + "learning_rate": 5.9484028627692085e-06, + "loss": 0.9986, + "step": 6050 + }, + { + "epoch": 1.9126589275843007, + "grad_norm": 0.026853037333214733, + "learning_rate": 5.738260382847193e-06, + "loss": 0.939, + "step": 6055 + }, + { + "epoch": 1.91423833214878, + "grad_norm": 0.02934335338064871, + "learning_rate": 5.531875446567136e-06, + "loss": 0.9809, + "step": 6060 + }, + { + "epoch": 1.915817736713259, + "grad_norm": 0.02949182368243745, + "learning_rate": 5.3292496229366824e-06, + "loss": 1.0094, + "step": 6065 + }, + { + "epoch": 1.9173971412777382, + "grad_norm": 0.028757305323247218, + "learning_rate": 5.130384452385339e-06, + "loss": 0.9217, + "step": 6070 + }, + { + "epoch": 1.9189765458422174, + "grad_norm": 0.027371319131601297, + "learning_rate": 4.93528144675276e-06, + "loss": 0.8874, + "step": 6075 + }, + { + "epoch": 1.9205559504066967, + "grad_norm": 0.02981376355537962, + "learning_rate": 4.743942089277642e-06, + "loss": 1.0042, + "step": 6080 + }, + { + "epoch": 1.922135354971176, + "grad_norm": 0.027760082391345534, + "learning_rate": 4.556367834585961e-06, + "loss": 0.941, + "step": 6085 + }, + { + "epoch": 1.9237147595356552, + "grad_norm": 0.0279524329788298, + "learning_rate": 4.3725601086800345e-06, + "loss": 0.9074, + "step": 6090 + }, + { + "epoch": 1.9252941641001342, + "grad_norm": 0.02941844183259332, + "learning_rate": 4.192520308928083e-06, + "loss": 0.9607, + "step": 6095 + }, + { + "epoch": 1.9268735686646135, + "grad_norm": 0.028310654875225674, + "learning_rate": 4.016249804052907e-06, + "loss": 0.9221, + "step": 6100 + }, + { + "epoch": 1.9284529732290925, + "grad_norm": 0.02790363498934764, + "learning_rate": 3.843749934122231e-06, + "loss": 0.9637, + "step": 6105 + }, + { + "epoch": 1.9300323777935717, + "grad_norm": 0.02883951143907877, + "learning_rate": 3.6750220105378206e-06, + "loss": 0.926, + "step": 6110 + }, + { + "epoch": 1.931611782358051, + "grad_norm": 0.029616968182773224, + "learning_rate": 3.5100673160260442e-06, + "loss": 0.9938, + "step": 6115 + }, + { + "epoch": 1.9331911869225302, + "grad_norm": 0.03017986090096489, + "learning_rate": 3.3488871046278844e-06, + "loss": 0.9115, + "step": 6120 + }, + { + "epoch": 1.9347705914870095, + "grad_norm": 0.03190498090763136, + "learning_rate": 3.191482601689333e-06, + "loss": 0.9357, + "step": 6125 + }, + { + "epoch": 1.9363499960514887, + "grad_norm": 0.029102205438411393, + "learning_rate": 3.0378550038522855e-06, + "loss": 0.9068, + "step": 6130 + }, + { + "epoch": 1.9379294006159677, + "grad_norm": 0.02897397063321125, + "learning_rate": 2.8880054790453304e-06, + "loss": 0.9336, + "step": 6135 + }, + { + "epoch": 1.939508805180447, + "grad_norm": 0.029670071722107515, + "learning_rate": 2.741935166474807e-06, + "loss": 0.9588, + "step": 6140 + }, + { + "epoch": 1.941088209744926, + "grad_norm": 0.032069141806492965, + "learning_rate": 2.5996451766163165e-06, + "loss": 1.0055, + "step": 6145 + }, + { + "epoch": 1.9426676143094053, + "grad_norm": 0.028650482223296685, + "learning_rate": 2.4611365912061143e-06, + "loss": 0.9142, + "step": 6150 + }, + { + "epoch": 1.9442470188738845, + "grad_norm": 0.02895708267084703, + "learning_rate": 2.3264104632328974e-06, + "loss": 0.9812, + "step": 6155 + }, + { + "epoch": 1.9458264234383638, + "grad_norm": 0.027893607901280315, + "learning_rate": 2.1954678169299745e-06, + "loss": 0.9621, + "step": 6160 + }, + { + "epoch": 1.947405828002843, + "grad_norm": 0.025988777604333536, + "learning_rate": 2.0683096477672747e-06, + "loss": 0.9891, + "step": 6165 + }, + { + "epoch": 1.9489852325673223, + "grad_norm": 0.02763337859997564, + "learning_rate": 1.9449369224438517e-06, + "loss": 0.9667, + "step": 6170 + }, + { + "epoch": 1.9505646371318013, + "grad_norm": 0.02684017287977009, + "learning_rate": 1.8253505788806136e-06, + "loss": 0.9869, + "step": 6175 + }, + { + "epoch": 1.9521440416962805, + "grad_norm": 0.03379619360602131, + "learning_rate": 1.7095515262129935e-06, + "loss": 0.9331, + "step": 6180 + }, + { + "epoch": 1.9537234462607596, + "grad_norm": 0.02904286384558498, + "learning_rate": 1.597540644784401e-06, + "loss": 0.9409, + "step": 6185 + }, + { + "epoch": 1.9553028508252388, + "grad_norm": 0.028873506388812257, + "learning_rate": 1.48931878613906e-06, + "loss": 0.8905, + "step": 6190 + }, + { + "epoch": 1.956882255389718, + "grad_norm": 0.027643832216016774, + "learning_rate": 1.3848867730158476e-06, + "loss": 0.9781, + "step": 6195 + }, + { + "epoch": 1.9584616599541973, + "grad_norm": 0.02723841917931141, + "learning_rate": 1.2842453993420765e-06, + "loss": 0.9066, + "step": 6200 + }, + { + "epoch": 1.9600410645186765, + "grad_norm": 0.027939513974217022, + "learning_rate": 1.1873954302271118e-06, + "loss": 0.9705, + "step": 6205 + }, + { + "epoch": 1.9616204690831558, + "grad_norm": 0.027685289739808893, + "learning_rate": 1.0943376019570962e-06, + "loss": 0.9548, + "step": 6210 + }, + { + "epoch": 1.9631998736476348, + "grad_norm": 0.031984172346232166, + "learning_rate": 1.0050726219886785e-06, + "loss": 0.9961, + "step": 6215 + }, + { + "epoch": 1.964779278212114, + "grad_norm": 0.027664449474453394, + "learning_rate": 9.196011689444061e-07, + "loss": 0.9057, + "step": 6220 + }, + { + "epoch": 1.966358682776593, + "grad_norm": 0.02804811457422032, + "learning_rate": 8.379238926067845e-07, + "loss": 0.9279, + "step": 6225 + }, + { + "epoch": 1.9679380873410723, + "grad_norm": 0.028313917901337752, + "learning_rate": 7.600414139139477e-07, + "loss": 0.9599, + "step": 6230 + }, + { + "epoch": 1.9695174919055516, + "grad_norm": 0.028323922173206457, + "learning_rate": 6.859543249546074e-07, + "loss": 0.9481, + "step": 6235 + }, + { + "epoch": 1.9710968964700308, + "grad_norm": 0.02668801851595795, + "learning_rate": 6.156631889637776e-07, + "loss": 0.8966, + "step": 6240 + }, + { + "epoch": 1.97267630103451, + "grad_norm": 0.027012280802929903, + "learning_rate": 5.491685403181679e-07, + "loss": 0.9364, + "step": 6245 + }, + { + "epoch": 1.9742557055989893, + "grad_norm": 0.03458699490095528, + "learning_rate": 4.864708845324639e-07, + "loss": 0.9719, + "step": 6250 + }, + { + "epoch": 1.9758351101634684, + "grad_norm": 0.029890185354397428, + "learning_rate": 4.275706982552752e-07, + "loss": 0.9293, + "step": 6255 + }, + { + "epoch": 1.9774145147279476, + "grad_norm": 0.028008736761539147, + "learning_rate": 3.724684292655822e-07, + "loss": 0.9318, + "step": 6260 + }, + { + "epoch": 1.9789939192924266, + "grad_norm": 0.02920905193937555, + "learning_rate": 3.21164496469295e-07, + "loss": 0.9837, + "step": 6265 + }, + { + "epoch": 1.9805733238569059, + "grad_norm": 0.02677931624268966, + "learning_rate": 2.736592898961998e-07, + "loss": 0.9539, + "step": 6270 + }, + { + "epoch": 1.9821527284213851, + "grad_norm": 0.027770174529727857, + "learning_rate": 2.29953170696795e-07, + "loss": 0.9783, + "step": 6275 + }, + { + "epoch": 1.9837321329858644, + "grad_norm": 0.02942134960133055, + "learning_rate": 1.900464711396821e-07, + "loss": 0.9974, + "step": 6280 + }, + { + "epoch": 1.9853115375503436, + "grad_norm": 0.02938903540449821, + "learning_rate": 1.5393949460895674e-07, + "loss": 0.8912, + "step": 6285 + }, + { + "epoch": 1.9868909421148229, + "grad_norm": 0.02772513890848461, + "learning_rate": 1.2163251560198817e-07, + "loss": 0.96, + "step": 6290 + }, + { + "epoch": 1.988470346679302, + "grad_norm": 0.02890495825239336, + "learning_rate": 9.312577972725444e-08, + "loss": 0.9747, + "step": 6295 + }, + { + "epoch": 1.9900497512437811, + "grad_norm": 0.0291827619706324, + "learning_rate": 6.841950370256589e-08, + "loss": 0.8942, + "step": 6300 + }, + { + "epoch": 1.9916291558082602, + "grad_norm": 0.02705666292763451, + "learning_rate": 4.751387535328888e-08, + "loss": 0.9338, + "step": 6305 + }, + { + "epoch": 1.9932085603727394, + "grad_norm": 0.03065964434744364, + "learning_rate": 3.0409053611013534e-08, + "loss": 0.8768, + "step": 6310 + }, + { + "epoch": 1.9947879649372187, + "grad_norm": 0.031459567261155605, + "learning_rate": 1.7105168512443482e-08, + "loss": 0.9127, + "step": 6315 + }, + { + "epoch": 1.996367369501698, + "grad_norm": 0.02752383355078723, + "learning_rate": 7.602321198063589e-09, + "loss": 0.9482, + "step": 6320 + }, + { + "epoch": 1.9979467740661772, + "grad_norm": 0.0273474840031279, + "learning_rate": 1.900583911751408e-09, + "loss": 0.9588, + "step": 6325 + }, + { + "epoch": 1.9995261786306564, + "grad_norm": 0.06363709530898917, + "learning_rate": 0.0, + "loss": 0.9738, + "step": 6330 }, { - "epoch": 1.7142857142857144, - "eval_loss": 13.667020797729492, - "eval_runtime": 0.4014, - "eval_samples_per_second": 4.983, - "eval_steps_per_second": 2.491, - "step": 6 + "epoch": 1.9995261786306564, + "eval_loss": 1.0175522565841675, + "eval_runtime": 509.7076, + "eval_samples_per_second": 5.197, + "eval_steps_per_second": 1.301, + "step": 6330 }, { - "epoch": 1.7142857142857144, - "step": 6, - "total_flos": 1113549778714624.0, - "train_loss": 19.26901610692342, - "train_runtime": 77.567, - "train_samples_per_second": 1.444, - "train_steps_per_second": 0.077 + "epoch": 1.9995261786306564, + "step": 6330, + "total_flos": 1.174779724933628e+18, + "train_loss": 1.0698493373337514, + "train_runtime": 77836.8266, + "train_samples_per_second": 1.301, + "train_steps_per_second": 0.081 } ], "logging_steps": 5, - "max_steps": 6, + "max_steps": 6330, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, @@ -65,7 +8920,7 @@ "attributes": {} } }, - "total_flos": 1113549778714624.0, + "total_flos": 1.174779724933628e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null