{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.99960508648606, "eval_steps": 500, "global_step": 15825, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0031593081115235764, "grad_norm": 3.847676639099889, "learning_rate": 6.317119393556539e-08, "loss": 0.5818, "step": 10 }, { "epoch": 0.006318616223047153, "grad_norm": 3.8262659782800736, "learning_rate": 1.2634238787113078e-07, "loss": 0.5856, "step": 20 }, { "epoch": 0.00947792433457073, "grad_norm": 3.2562040895477886, "learning_rate": 1.8951358180669618e-07, "loss": 0.5757, "step": 30 }, { "epoch": 0.012637232446094306, "grad_norm": 2.3293562431253108, "learning_rate": 2.5268477574226156e-07, "loss": 0.5599, "step": 40 }, { "epoch": 0.01579654055761788, "grad_norm": 1.7404444983787009, "learning_rate": 3.158559696778269e-07, "loss": 0.5394, "step": 50 }, { "epoch": 0.01895584866914146, "grad_norm": 1.3641917061000175, "learning_rate": 3.7902716361339236e-07, "loss": 0.5113, "step": 60 }, { "epoch": 0.022115156780665033, "grad_norm": 0.9564559310967178, "learning_rate": 4.421983575489577e-07, "loss": 0.4871, "step": 70 }, { "epoch": 0.02527446489218861, "grad_norm": 0.7099841750448822, "learning_rate": 5.053695514845231e-07, "loss": 0.4714, "step": 80 }, { "epoch": 0.028433773003712186, "grad_norm": 0.5402837514386732, "learning_rate": 5.685407454200885e-07, "loss": 0.451, "step": 90 }, { "epoch": 0.03159308111523576, "grad_norm": 0.4807271971224434, "learning_rate": 6.317119393556538e-07, "loss": 0.4323, "step": 100 }, { "epoch": 0.03475238922675934, "grad_norm": 0.34807058040387906, "learning_rate": 6.948831332912193e-07, "loss": 0.4219, "step": 110 }, { "epoch": 0.03791169733828292, "grad_norm": 0.3269546213934352, "learning_rate": 7.580543272267847e-07, "loss": 0.4152, "step": 120 }, { "epoch": 0.04107100544980649, "grad_norm": 0.29931985054640087, "learning_rate": 8.212255211623501e-07, "loss": 0.4046, "step": 130 }, { "epoch": 0.044230313561330066, "grad_norm": 0.3007912763355078, "learning_rate": 8.843967150979154e-07, "loss": 0.3975, "step": 140 }, { "epoch": 0.04738962167285365, "grad_norm": 0.3314878634217047, "learning_rate": 9.475679090334808e-07, "loss": 0.398, "step": 150 }, { "epoch": 0.05054892978437722, "grad_norm": 0.3099556109376344, "learning_rate": 1.0107391029690462e-06, "loss": 0.3931, "step": 160 }, { "epoch": 0.0537082378959008, "grad_norm": 0.29068208472557255, "learning_rate": 1.0739102969046116e-06, "loss": 0.3858, "step": 170 }, { "epoch": 0.05686754600742437, "grad_norm": 0.27910793332859557, "learning_rate": 1.137081490840177e-06, "loss": 0.3808, "step": 180 }, { "epoch": 0.06002685411894795, "grad_norm": 0.30546375486352645, "learning_rate": 1.2002526847757423e-06, "loss": 0.3762, "step": 190 }, { "epoch": 0.06318616223047152, "grad_norm": 0.3554409408339123, "learning_rate": 1.2634238787113076e-06, "loss": 0.3735, "step": 200 }, { "epoch": 0.0663454703419951, "grad_norm": 0.32484090299900514, "learning_rate": 1.3265950726468732e-06, "loss": 0.3674, "step": 210 }, { "epoch": 0.06950477845351868, "grad_norm": 0.3068743877352151, "learning_rate": 1.3897662665824385e-06, "loss": 0.3666, "step": 220 }, { "epoch": 0.07266408656504225, "grad_norm": 0.29618802262032184, "learning_rate": 1.4529374605180039e-06, "loss": 0.3618, "step": 230 }, { "epoch": 0.07582339467656583, "grad_norm": 0.3148809652314096, "learning_rate": 1.5161086544535694e-06, "loss": 0.3624, "step": 240 }, { "epoch": 0.07898270278808942, "grad_norm": 0.2804644371038647, "learning_rate": 1.5792798483891348e-06, "loss": 0.3613, "step": 250 }, { "epoch": 0.08214201089961298, "grad_norm": 0.3309683996287944, "learning_rate": 1.6424510423247001e-06, "loss": 0.3581, "step": 260 }, { "epoch": 0.08530131901113656, "grad_norm": 0.29235969381132293, "learning_rate": 1.7056222362602653e-06, "loss": 0.3556, "step": 270 }, { "epoch": 0.08846062712266013, "grad_norm": 0.3059950184586691, "learning_rate": 1.7687934301958308e-06, "loss": 0.3485, "step": 280 }, { "epoch": 0.09161993523418371, "grad_norm": 0.3065943367603657, "learning_rate": 1.8319646241313962e-06, "loss": 0.3441, "step": 290 }, { "epoch": 0.0947792433457073, "grad_norm": 0.29754666934058893, "learning_rate": 1.8951358180669615e-06, "loss": 0.3486, "step": 300 }, { "epoch": 0.09793855145723086, "grad_norm": 0.31042175851236, "learning_rate": 1.9583070120025267e-06, "loss": 0.3458, "step": 310 }, { "epoch": 0.10109785956875444, "grad_norm": 0.28167478495509757, "learning_rate": 2.0214782059380925e-06, "loss": 0.3425, "step": 320 }, { "epoch": 0.10425716768027801, "grad_norm": 0.2937174634980692, "learning_rate": 2.084649399873658e-06, "loss": 0.3409, "step": 330 }, { "epoch": 0.1074164757918016, "grad_norm": 0.28351076022747446, "learning_rate": 2.147820593809223e-06, "loss": 0.3377, "step": 340 }, { "epoch": 0.11057578390332518, "grad_norm": 0.2873794548808371, "learning_rate": 2.2109917877447885e-06, "loss": 0.3365, "step": 350 }, { "epoch": 0.11373509201484874, "grad_norm": 0.28077300443160785, "learning_rate": 2.274162981680354e-06, "loss": 0.3395, "step": 360 }, { "epoch": 0.11689440012637232, "grad_norm": 0.2895451240097338, "learning_rate": 2.337334175615919e-06, "loss": 0.3356, "step": 370 }, { "epoch": 0.1200537082378959, "grad_norm": 0.29469241884731345, "learning_rate": 2.4005053695514845e-06, "loss": 0.3323, "step": 380 }, { "epoch": 0.12321301634941947, "grad_norm": 0.31436693982320113, "learning_rate": 2.4636765634870503e-06, "loss": 0.333, "step": 390 }, { "epoch": 0.12637232446094304, "grad_norm": 0.2882094750389832, "learning_rate": 2.5268477574226152e-06, "loss": 0.3316, "step": 400 }, { "epoch": 0.12953163257246664, "grad_norm": 0.33020346533639344, "learning_rate": 2.590018951358181e-06, "loss": 0.328, "step": 410 }, { "epoch": 0.1326909406839902, "grad_norm": 0.3442838133962943, "learning_rate": 2.6531901452937464e-06, "loss": 0.3254, "step": 420 }, { "epoch": 0.13585024879551377, "grad_norm": 0.3032556873766918, "learning_rate": 2.7163613392293113e-06, "loss": 0.3249, "step": 430 }, { "epoch": 0.13900955690703737, "grad_norm": 0.3056937672984876, "learning_rate": 2.779532533164877e-06, "loss": 0.3229, "step": 440 }, { "epoch": 0.14216886501856094, "grad_norm": 0.28744855606538844, "learning_rate": 2.8427037271004424e-06, "loss": 0.3233, "step": 450 }, { "epoch": 0.1453281731300845, "grad_norm": 0.31562420003771263, "learning_rate": 2.9058749210360078e-06, "loss": 0.3226, "step": 460 }, { "epoch": 0.1484874812416081, "grad_norm": 0.2778274627103786, "learning_rate": 2.969046114971573e-06, "loss": 0.3209, "step": 470 }, { "epoch": 0.15164678935313167, "grad_norm": 0.31527911586537605, "learning_rate": 3.032217308907139e-06, "loss": 0.3233, "step": 480 }, { "epoch": 0.15480609746465523, "grad_norm": 0.28579902930861095, "learning_rate": 3.095388502842704e-06, "loss": 0.3187, "step": 490 }, { "epoch": 0.15796540557617883, "grad_norm": 0.2854450060082136, "learning_rate": 3.1585596967782696e-06, "loss": 0.3164, "step": 500 }, { "epoch": 0.1611247136877024, "grad_norm": 0.2783395921293009, "learning_rate": 3.2217308907138345e-06, "loss": 0.3172, "step": 510 }, { "epoch": 0.16428402179922597, "grad_norm": 0.30491989041976003, "learning_rate": 3.2849020846494003e-06, "loss": 0.3129, "step": 520 }, { "epoch": 0.16744332991074953, "grad_norm": 0.32468595503092035, "learning_rate": 3.3480732785849656e-06, "loss": 0.3147, "step": 530 }, { "epoch": 0.17060263802227313, "grad_norm": 0.31416335531159423, "learning_rate": 3.4112444725205306e-06, "loss": 0.3125, "step": 540 }, { "epoch": 0.1737619461337967, "grad_norm": 0.36383194617757575, "learning_rate": 3.4744156664560963e-06, "loss": 0.3138, "step": 550 }, { "epoch": 0.17692125424532026, "grad_norm": 0.29533777762093066, "learning_rate": 3.5375868603916617e-06, "loss": 0.3116, "step": 560 }, { "epoch": 0.18008056235684386, "grad_norm": 0.3493771802079338, "learning_rate": 3.600758054327227e-06, "loss": 0.3135, "step": 570 }, { "epoch": 0.18323987046836743, "grad_norm": 0.2944691289331218, "learning_rate": 3.6639292482627924e-06, "loss": 0.3163, "step": 580 }, { "epoch": 0.186399178579891, "grad_norm": 0.31590918798291073, "learning_rate": 3.727100442198358e-06, "loss": 0.3087, "step": 590 }, { "epoch": 0.1895584866914146, "grad_norm": 0.36702711115113235, "learning_rate": 3.790271636133923e-06, "loss": 0.3048, "step": 600 }, { "epoch": 0.19271779480293816, "grad_norm": 0.34337289204312266, "learning_rate": 3.853442830069489e-06, "loss": 0.3097, "step": 610 }, { "epoch": 0.19587710291446173, "grad_norm": 0.3043737272471374, "learning_rate": 3.916614024005053e-06, "loss": 0.3071, "step": 620 }, { "epoch": 0.19903641102598532, "grad_norm": 0.31254100353406794, "learning_rate": 3.9797852179406196e-06, "loss": 0.3035, "step": 630 }, { "epoch": 0.2021957191375089, "grad_norm": 0.2865164407577743, "learning_rate": 4.042956411876185e-06, "loss": 0.3047, "step": 640 }, { "epoch": 0.20535502724903246, "grad_norm": 0.29447310088157563, "learning_rate": 4.10612760581175e-06, "loss": 0.3037, "step": 650 }, { "epoch": 0.20851433536055602, "grad_norm": 0.29267707519023506, "learning_rate": 4.169298799747316e-06, "loss": 0.3028, "step": 660 }, { "epoch": 0.21167364347207962, "grad_norm": 0.2963460269610618, "learning_rate": 4.232469993682881e-06, "loss": 0.305, "step": 670 }, { "epoch": 0.2148329515836032, "grad_norm": 0.2978257495744734, "learning_rate": 4.295641187618446e-06, "loss": 0.3004, "step": 680 }, { "epoch": 0.21799225969512676, "grad_norm": 0.2906359775378807, "learning_rate": 4.358812381554012e-06, "loss": 0.3019, "step": 690 }, { "epoch": 0.22115156780665035, "grad_norm": 0.3446901435895929, "learning_rate": 4.421983575489577e-06, "loss": 0.3024, "step": 700 }, { "epoch": 0.22431087591817392, "grad_norm": 0.30756534246023176, "learning_rate": 4.485154769425142e-06, "loss": 0.2959, "step": 710 }, { "epoch": 0.2274701840296975, "grad_norm": 0.3155947525143884, "learning_rate": 4.548325963360708e-06, "loss": 0.3005, "step": 720 }, { "epoch": 0.23062949214122108, "grad_norm": 0.2995756375311246, "learning_rate": 4.611497157296273e-06, "loss": 0.2975, "step": 730 }, { "epoch": 0.23378880025274465, "grad_norm": 0.30858061818787486, "learning_rate": 4.674668351231838e-06, "loss": 0.2978, "step": 740 }, { "epoch": 0.23694810836426822, "grad_norm": 0.3338257461588683, "learning_rate": 4.737839545167405e-06, "loss": 0.2933, "step": 750 }, { "epoch": 0.2401074164757918, "grad_norm": 0.29022817054284084, "learning_rate": 4.801010739102969e-06, "loss": 0.2972, "step": 760 }, { "epoch": 0.24326672458731538, "grad_norm": 0.2896880604505418, "learning_rate": 4.8641819330385344e-06, "loss": 0.2946, "step": 770 }, { "epoch": 0.24642603269883895, "grad_norm": 0.2847438243168221, "learning_rate": 4.927353126974101e-06, "loss": 0.2969, "step": 780 }, { "epoch": 0.24958534081036254, "grad_norm": 0.3037714021164623, "learning_rate": 4.990524320909665e-06, "loss": 0.2937, "step": 790 }, { "epoch": 0.2527446489218861, "grad_norm": 0.32395568014513065, "learning_rate": 5.0536955148452305e-06, "loss": 0.2939, "step": 800 }, { "epoch": 0.2559039570334097, "grad_norm": 0.2957719635763268, "learning_rate": 5.116866708780797e-06, "loss": 0.2932, "step": 810 }, { "epoch": 0.2590632651449333, "grad_norm": 0.3165621516450349, "learning_rate": 5.180037902716362e-06, "loss": 0.2928, "step": 820 }, { "epoch": 0.2622225732564568, "grad_norm": 0.3232166339204694, "learning_rate": 5.2432090966519265e-06, "loss": 0.2901, "step": 830 }, { "epoch": 0.2653818813679804, "grad_norm": 0.3040549753465353, "learning_rate": 5.306380290587493e-06, "loss": 0.2881, "step": 840 }, { "epoch": 0.268541189479504, "grad_norm": 0.32781940117167635, "learning_rate": 5.369551484523058e-06, "loss": 0.2908, "step": 850 }, { "epoch": 0.27170049759102755, "grad_norm": 0.3327478795766515, "learning_rate": 5.432722678458623e-06, "loss": 0.2907, "step": 860 }, { "epoch": 0.27485980570255114, "grad_norm": 0.2856004412508293, "learning_rate": 5.495893872394189e-06, "loss": 0.2934, "step": 870 }, { "epoch": 0.27801911381407474, "grad_norm": 0.3604841472498219, "learning_rate": 5.559065066329754e-06, "loss": 0.2855, "step": 880 }, { "epoch": 0.2811784219255983, "grad_norm": 0.32653022277153904, "learning_rate": 5.6222362602653195e-06, "loss": 0.2876, "step": 890 }, { "epoch": 0.28433773003712187, "grad_norm": 0.3123066830281955, "learning_rate": 5.685407454200885e-06, "loss": 0.2887, "step": 900 }, { "epoch": 0.28749703814864547, "grad_norm": 0.31118386648194923, "learning_rate": 5.74857864813645e-06, "loss": 0.2889, "step": 910 }, { "epoch": 0.290656346260169, "grad_norm": 0.29015768543262505, "learning_rate": 5.8117498420720155e-06, "loss": 0.2876, "step": 920 }, { "epoch": 0.2938156543716926, "grad_norm": 0.2885072388494909, "learning_rate": 5.874921036007582e-06, "loss": 0.2868, "step": 930 }, { "epoch": 0.2969749624832162, "grad_norm": 0.3282216748098807, "learning_rate": 5.938092229943146e-06, "loss": 0.2877, "step": 940 }, { "epoch": 0.30013427059473974, "grad_norm": 0.29797634077774454, "learning_rate": 6.001263423878712e-06, "loss": 0.2873, "step": 950 }, { "epoch": 0.30329357870626333, "grad_norm": 0.33087996389175034, "learning_rate": 6.064434617814278e-06, "loss": 0.2838, "step": 960 }, { "epoch": 0.30645288681778693, "grad_norm": 0.29661512432744086, "learning_rate": 6.127605811749843e-06, "loss": 0.2877, "step": 970 }, { "epoch": 0.30961219492931047, "grad_norm": 0.31023012973801395, "learning_rate": 6.190777005685408e-06, "loss": 0.2826, "step": 980 }, { "epoch": 0.31277150304083406, "grad_norm": 0.30885706884471026, "learning_rate": 6.253948199620974e-06, "loss": 0.2845, "step": 990 }, { "epoch": 0.31593081115235766, "grad_norm": 0.3540472583639826, "learning_rate": 6.317119393556539e-06, "loss": 0.2809, "step": 1000 }, { "epoch": 0.3190901192638812, "grad_norm": 0.2730722833734264, "learning_rate": 6.380290587492104e-06, "loss": 0.2787, "step": 1010 }, { "epoch": 0.3222494273754048, "grad_norm": 0.3320814722084985, "learning_rate": 6.443461781427669e-06, "loss": 0.2813, "step": 1020 }, { "epoch": 0.32540873548692834, "grad_norm": 0.290985113799801, "learning_rate": 6.506632975363235e-06, "loss": 0.2782, "step": 1030 }, { "epoch": 0.32856804359845193, "grad_norm": 0.321948058443697, "learning_rate": 6.5698041692988006e-06, "loss": 0.2799, "step": 1040 }, { "epoch": 0.3317273517099755, "grad_norm": 0.3247951324518689, "learning_rate": 6.632975363234365e-06, "loss": 0.2853, "step": 1050 }, { "epoch": 0.33488665982149907, "grad_norm": 0.37466669888496, "learning_rate": 6.696146557169931e-06, "loss": 0.2847, "step": 1060 }, { "epoch": 0.33804596793302266, "grad_norm": 0.3158284283569005, "learning_rate": 6.759317751105497e-06, "loss": 0.2793, "step": 1070 }, { "epoch": 0.34120527604454626, "grad_norm": 0.2844891421044668, "learning_rate": 6.822488945041061e-06, "loss": 0.2812, "step": 1080 }, { "epoch": 0.3443645841560698, "grad_norm": 0.33051991039937395, "learning_rate": 6.885660138976627e-06, "loss": 0.2811, "step": 1090 }, { "epoch": 0.3475238922675934, "grad_norm": 0.3186151235138414, "learning_rate": 6.948831332912193e-06, "loss": 0.2851, "step": 1100 }, { "epoch": 0.350683200379117, "grad_norm": 0.3123969158631383, "learning_rate": 7.012002526847758e-06, "loss": 0.2798, "step": 1110 }, { "epoch": 0.35384250849064053, "grad_norm": 0.3280651223802418, "learning_rate": 7.075173720783323e-06, "loss": 0.2835, "step": 1120 }, { "epoch": 0.3570018166021641, "grad_norm": 0.29332806586366567, "learning_rate": 7.138344914718889e-06, "loss": 0.2775, "step": 1130 }, { "epoch": 0.3601611247136877, "grad_norm": 0.33171088512344676, "learning_rate": 7.201516108654454e-06, "loss": 0.2752, "step": 1140 }, { "epoch": 0.36332043282521126, "grad_norm": 0.3000902915772275, "learning_rate": 7.26468730259002e-06, "loss": 0.2807, "step": 1150 }, { "epoch": 0.36647974093673485, "grad_norm": 0.2883327293237225, "learning_rate": 7.327858496525585e-06, "loss": 0.2762, "step": 1160 }, { "epoch": 0.36963904904825845, "grad_norm": 0.3144189091034534, "learning_rate": 7.39102969046115e-06, "loss": 0.2801, "step": 1170 }, { "epoch": 0.372798357159782, "grad_norm": 0.356268108886898, "learning_rate": 7.454200884396716e-06, "loss": 0.2777, "step": 1180 }, { "epoch": 0.3759576652713056, "grad_norm": 0.3128810321586775, "learning_rate": 7.517372078332281e-06, "loss": 0.2757, "step": 1190 }, { "epoch": 0.3791169733828292, "grad_norm": 0.30434623170055075, "learning_rate": 7.580543272267846e-06, "loss": 0.2784, "step": 1200 }, { "epoch": 0.3822762814943527, "grad_norm": 0.3388464380231131, "learning_rate": 7.643714466203413e-06, "loss": 0.2784, "step": 1210 }, { "epoch": 0.3854355896058763, "grad_norm": 0.3110630401877264, "learning_rate": 7.706885660138978e-06, "loss": 0.2743, "step": 1220 }, { "epoch": 0.3885948977173999, "grad_norm": 0.2771106506778223, "learning_rate": 7.770056854074542e-06, "loss": 0.2757, "step": 1230 }, { "epoch": 0.39175420582892345, "grad_norm": 0.3175411734943708, "learning_rate": 7.833228048010107e-06, "loss": 0.2762, "step": 1240 }, { "epoch": 0.39491351394044705, "grad_norm": 0.3072276379244834, "learning_rate": 7.896399241945673e-06, "loss": 0.2757, "step": 1250 }, { "epoch": 0.39807282205197064, "grad_norm": 0.31787735230000674, "learning_rate": 7.959570435881239e-06, "loss": 0.2716, "step": 1260 }, { "epoch": 0.4012321301634942, "grad_norm": 0.32573848304334413, "learning_rate": 8.022741629816804e-06, "loss": 0.2718, "step": 1270 }, { "epoch": 0.4043914382750178, "grad_norm": 0.31968318506791704, "learning_rate": 8.08591282375237e-06, "loss": 0.2753, "step": 1280 }, { "epoch": 0.4075507463865414, "grad_norm": 0.3038009939073212, "learning_rate": 8.149084017687934e-06, "loss": 0.2709, "step": 1290 }, { "epoch": 0.4107100544980649, "grad_norm": 0.32085469500420516, "learning_rate": 8.2122552116235e-06, "loss": 0.2707, "step": 1300 }, { "epoch": 0.4138693626095885, "grad_norm": 0.3065889908229096, "learning_rate": 8.275426405559067e-06, "loss": 0.2768, "step": 1310 }, { "epoch": 0.41702867072111205, "grad_norm": 0.31763603457220624, "learning_rate": 8.338597599494631e-06, "loss": 0.2709, "step": 1320 }, { "epoch": 0.42018797883263564, "grad_norm": 0.3109152063857626, "learning_rate": 8.401768793430196e-06, "loss": 0.2739, "step": 1330 }, { "epoch": 0.42334728694415924, "grad_norm": 0.29181104171061434, "learning_rate": 8.464939987365762e-06, "loss": 0.2686, "step": 1340 }, { "epoch": 0.4265065950556828, "grad_norm": 0.2935442340016787, "learning_rate": 8.528111181301328e-06, "loss": 0.2707, "step": 1350 }, { "epoch": 0.4296659031672064, "grad_norm": 0.2906035979144556, "learning_rate": 8.591282375236893e-06, "loss": 0.2731, "step": 1360 }, { "epoch": 0.43282521127872997, "grad_norm": 0.29758936608537967, "learning_rate": 8.654453569172459e-06, "loss": 0.2709, "step": 1370 }, { "epoch": 0.4359845193902535, "grad_norm": 0.33154204605407617, "learning_rate": 8.717624763108023e-06, "loss": 0.2707, "step": 1380 }, { "epoch": 0.4391438275017771, "grad_norm": 0.33339797215462064, "learning_rate": 8.780795957043588e-06, "loss": 0.2713, "step": 1390 }, { "epoch": 0.4423031356133007, "grad_norm": 0.3155178562410378, "learning_rate": 8.843967150979154e-06, "loss": 0.2675, "step": 1400 }, { "epoch": 0.44546244372482424, "grad_norm": 0.3265426814247614, "learning_rate": 8.90713834491472e-06, "loss": 0.2683, "step": 1410 }, { "epoch": 0.44862175183634784, "grad_norm": 0.36971141677875463, "learning_rate": 8.970309538850285e-06, "loss": 0.2677, "step": 1420 }, { "epoch": 0.45178105994787143, "grad_norm": 0.30518969540463764, "learning_rate": 9.033480732785851e-06, "loss": 0.2688, "step": 1430 }, { "epoch": 0.454940368059395, "grad_norm": 0.34536630526318685, "learning_rate": 9.096651926721415e-06, "loss": 0.2673, "step": 1440 }, { "epoch": 0.45809967617091857, "grad_norm": 0.35749309238694066, "learning_rate": 9.159823120656982e-06, "loss": 0.2691, "step": 1450 }, { "epoch": 0.46125898428244216, "grad_norm": 0.31258023613884145, "learning_rate": 9.222994314592546e-06, "loss": 0.2664, "step": 1460 }, { "epoch": 0.4644182923939657, "grad_norm": 0.3450789196077563, "learning_rate": 9.286165508528112e-06, "loss": 0.268, "step": 1470 }, { "epoch": 0.4675776005054893, "grad_norm": 0.29121813500205246, "learning_rate": 9.349336702463677e-06, "loss": 0.2677, "step": 1480 }, { "epoch": 0.4707369086170129, "grad_norm": 0.3220463754226227, "learning_rate": 9.412507896399243e-06, "loss": 0.2667, "step": 1490 }, { "epoch": 0.47389621672853643, "grad_norm": 0.33704239601038527, "learning_rate": 9.47567909033481e-06, "loss": 0.2646, "step": 1500 }, { "epoch": 0.47705552484006003, "grad_norm": 0.2863325950552584, "learning_rate": 9.538850284270374e-06, "loss": 0.2705, "step": 1510 }, { "epoch": 0.4802148329515836, "grad_norm": 0.31752841534155907, "learning_rate": 9.602021478205938e-06, "loss": 0.2664, "step": 1520 }, { "epoch": 0.48337414106310717, "grad_norm": 0.32880306966393713, "learning_rate": 9.665192672141504e-06, "loss": 0.2631, "step": 1530 }, { "epoch": 0.48653344917463076, "grad_norm": 0.31843129282620164, "learning_rate": 9.728363866077069e-06, "loss": 0.2625, "step": 1540 }, { "epoch": 0.48969275728615436, "grad_norm": 0.30470755245172276, "learning_rate": 9.791535060012635e-06, "loss": 0.2666, "step": 1550 }, { "epoch": 0.4928520653976779, "grad_norm": 0.33123646966581777, "learning_rate": 9.854706253948201e-06, "loss": 0.2646, "step": 1560 }, { "epoch": 0.4960113735092015, "grad_norm": 0.28677001614790365, "learning_rate": 9.917877447883766e-06, "loss": 0.2622, "step": 1570 }, { "epoch": 0.4991706816207251, "grad_norm": 0.324282072097384, "learning_rate": 9.98104864181933e-06, "loss": 0.2678, "step": 1580 }, { "epoch": 0.5023299897322486, "grad_norm": 0.3255697445472025, "learning_rate": 9.999994039347758e-06, "loss": 0.2637, "step": 1590 }, { "epoch": 0.5054892978437722, "grad_norm": 0.2933750585789845, "learning_rate": 9.999964844350574e-06, "loss": 0.2649, "step": 1600 }, { "epoch": 0.5086486059552958, "grad_norm": 0.33377765453076996, "learning_rate": 9.999911320336655e-06, "loss": 0.267, "step": 1610 }, { "epoch": 0.5118079140668194, "grad_norm": 0.304237717093902, "learning_rate": 9.999833467566438e-06, "loss": 0.2633, "step": 1620 }, { "epoch": 0.5149672221783429, "grad_norm": 0.3058481570433111, "learning_rate": 9.999731286418741e-06, "loss": 0.262, "step": 1630 }, { "epoch": 0.5181265302898665, "grad_norm": 0.3392309580496954, "learning_rate": 9.999604777390763e-06, "loss": 0.2593, "step": 1640 }, { "epoch": 0.5212858384013901, "grad_norm": 0.2895018374347362, "learning_rate": 9.999453941098077e-06, "loss": 0.2625, "step": 1650 }, { "epoch": 0.5244451465129136, "grad_norm": 0.2912777670223621, "learning_rate": 9.999278778274627e-06, "loss": 0.266, "step": 1660 }, { "epoch": 0.5276044546244373, "grad_norm": 0.3529258059433674, "learning_rate": 9.999079289772724e-06, "loss": 0.2619, "step": 1670 }, { "epoch": 0.5307637627359608, "grad_norm": 0.2738135177871275, "learning_rate": 9.99885547656305e-06, "loss": 0.2618, "step": 1680 }, { "epoch": 0.5339230708474844, "grad_norm": 0.2692997188904244, "learning_rate": 9.998607339734643e-06, "loss": 0.2606, "step": 1690 }, { "epoch": 0.537082378959008, "grad_norm": 0.2949366840504707, "learning_rate": 9.998334880494898e-06, "loss": 0.2612, "step": 1700 }, { "epoch": 0.5402416870705316, "grad_norm": 0.3057325239685516, "learning_rate": 9.998038100169554e-06, "loss": 0.261, "step": 1710 }, { "epoch": 0.5434009951820551, "grad_norm": 0.27124805299961813, "learning_rate": 9.997717000202696e-06, "loss": 0.2598, "step": 1720 }, { "epoch": 0.5465603032935787, "grad_norm": 0.30051977914101224, "learning_rate": 9.997371582156747e-06, "loss": 0.2602, "step": 1730 }, { "epoch": 0.5497196114051023, "grad_norm": 0.3075193191916608, "learning_rate": 9.997001847712456e-06, "loss": 0.2601, "step": 1740 }, { "epoch": 0.5528789195166258, "grad_norm": 0.29754589174948665, "learning_rate": 9.996607798668887e-06, "loss": 0.2592, "step": 1750 }, { "epoch": 0.5560382276281495, "grad_norm": 0.33700900362331376, "learning_rate": 9.99618943694342e-06, "loss": 0.262, "step": 1760 }, { "epoch": 0.559197535739673, "grad_norm": 0.3265608460452239, "learning_rate": 9.995746764571736e-06, "loss": 0.259, "step": 1770 }, { "epoch": 0.5623568438511966, "grad_norm": 0.31549240271112217, "learning_rate": 9.995279783707805e-06, "loss": 0.2607, "step": 1780 }, { "epoch": 0.5655161519627202, "grad_norm": 0.3586012625330008, "learning_rate": 9.994788496623884e-06, "loss": 0.2572, "step": 1790 }, { "epoch": 0.5686754600742437, "grad_norm": 0.3056154836660224, "learning_rate": 9.994272905710491e-06, "loss": 0.2526, "step": 1800 }, { "epoch": 0.5718347681857673, "grad_norm": 0.2613411851814494, "learning_rate": 9.993733013476412e-06, "loss": 0.2561, "step": 1810 }, { "epoch": 0.5749940762972909, "grad_norm": 0.3254616196951068, "learning_rate": 9.993168822548672e-06, "loss": 0.257, "step": 1820 }, { "epoch": 0.5781533844088145, "grad_norm": 0.3020785656622962, "learning_rate": 9.992580335672535e-06, "loss": 0.2557, "step": 1830 }, { "epoch": 0.581312692520338, "grad_norm": 0.26993214887498285, "learning_rate": 9.99196755571148e-06, "loss": 0.2536, "step": 1840 }, { "epoch": 0.5844720006318617, "grad_norm": 0.2639927502536523, "learning_rate": 9.991330485647195e-06, "loss": 0.26, "step": 1850 }, { "epoch": 0.5876313087433852, "grad_norm": 0.30065294121954683, "learning_rate": 9.990669128579562e-06, "loss": 0.2605, "step": 1860 }, { "epoch": 0.5907906168549087, "grad_norm": 0.30583938194208843, "learning_rate": 9.989983487726634e-06, "loss": 0.2563, "step": 1870 }, { "epoch": 0.5939499249664324, "grad_norm": 0.2864579077847969, "learning_rate": 9.989273566424629e-06, "loss": 0.2606, "step": 1880 }, { "epoch": 0.5971092330779559, "grad_norm": 0.3026768306473177, "learning_rate": 9.98853936812791e-06, "loss": 0.258, "step": 1890 }, { "epoch": 0.6002685411894795, "grad_norm": 0.29314955217944727, "learning_rate": 9.987780896408966e-06, "loss": 0.2589, "step": 1900 }, { "epoch": 0.6034278493010031, "grad_norm": 0.31058695039428386, "learning_rate": 9.986998154958395e-06, "loss": 0.253, "step": 1910 }, { "epoch": 0.6065871574125267, "grad_norm": 0.2845944676061047, "learning_rate": 9.986191147584893e-06, "loss": 0.2546, "step": 1920 }, { "epoch": 0.6097464655240502, "grad_norm": 0.29933495312410735, "learning_rate": 9.985359878215224e-06, "loss": 0.2552, "step": 1930 }, { "epoch": 0.6129057736355739, "grad_norm": 0.2759459833479636, "learning_rate": 9.984504350894213e-06, "loss": 0.2574, "step": 1940 }, { "epoch": 0.6160650817470974, "grad_norm": 0.3066829895649084, "learning_rate": 9.983624569784714e-06, "loss": 0.2553, "step": 1950 }, { "epoch": 0.6192243898586209, "grad_norm": 0.2690944667845755, "learning_rate": 9.982720539167601e-06, "loss": 0.2568, "step": 1960 }, { "epoch": 0.6223836979701446, "grad_norm": 0.2946036416755621, "learning_rate": 9.981792263441739e-06, "loss": 0.2543, "step": 1970 }, { "epoch": 0.6255430060816681, "grad_norm": 0.2747932187325911, "learning_rate": 9.980839747123967e-06, "loss": 0.2557, "step": 1980 }, { "epoch": 0.6287023141931917, "grad_norm": 0.2851964847837964, "learning_rate": 9.979862994849074e-06, "loss": 0.2541, "step": 1990 }, { "epoch": 0.6318616223047153, "grad_norm": 0.31890735265964804, "learning_rate": 9.978862011369779e-06, "loss": 0.2558, "step": 2000 }, { "epoch": 0.6350209304162389, "grad_norm": 0.28038726746988707, "learning_rate": 9.977836801556705e-06, "loss": 0.2538, "step": 2010 }, { "epoch": 0.6381802385277624, "grad_norm": 0.295646403302159, "learning_rate": 9.976787370398355e-06, "loss": 0.2546, "step": 2020 }, { "epoch": 0.641339546639286, "grad_norm": 0.2902155436032145, "learning_rate": 9.975713723001093e-06, "loss": 0.251, "step": 2030 }, { "epoch": 0.6444988547508096, "grad_norm": 0.2828393586793592, "learning_rate": 9.974615864589112e-06, "loss": 0.2559, "step": 2040 }, { "epoch": 0.6476581628623331, "grad_norm": 0.28577077749952917, "learning_rate": 9.97349380050441e-06, "loss": 0.2531, "step": 2050 }, { "epoch": 0.6508174709738567, "grad_norm": 0.29564024961232643, "learning_rate": 9.972347536206772e-06, "loss": 0.2506, "step": 2060 }, { "epoch": 0.6539767790853803, "grad_norm": 0.33183704608229764, "learning_rate": 9.971177077273732e-06, "loss": 0.2534, "step": 2070 }, { "epoch": 0.6571360871969039, "grad_norm": 0.28977544469285044, "learning_rate": 9.969982429400556e-06, "loss": 0.2537, "step": 2080 }, { "epoch": 0.6602953953084274, "grad_norm": 0.29923378551865065, "learning_rate": 9.968763598400202e-06, "loss": 0.2569, "step": 2090 }, { "epoch": 0.663454703419951, "grad_norm": 0.3012488805857487, "learning_rate": 9.967520590203305e-06, "loss": 0.2509, "step": 2100 }, { "epoch": 0.6666140115314746, "grad_norm": 0.2916467483906387, "learning_rate": 9.966253410858145e-06, "loss": 0.2551, "step": 2110 }, { "epoch": 0.6697733196429981, "grad_norm": 0.27652645594826225, "learning_rate": 9.964962066530604e-06, "loss": 0.2515, "step": 2120 }, { "epoch": 0.6729326277545218, "grad_norm": 0.27562185290008373, "learning_rate": 9.963646563504158e-06, "loss": 0.2544, "step": 2130 }, { "epoch": 0.6760919358660453, "grad_norm": 0.25732340414313354, "learning_rate": 9.962306908179833e-06, "loss": 0.2515, "step": 2140 }, { "epoch": 0.6792512439775689, "grad_norm": 0.26515843698964003, "learning_rate": 9.96094310707617e-06, "loss": 0.2494, "step": 2150 }, { "epoch": 0.6824105520890925, "grad_norm": 0.28168758705492475, "learning_rate": 9.959555166829204e-06, "loss": 0.2494, "step": 2160 }, { "epoch": 0.685569860200616, "grad_norm": 0.24631536243430951, "learning_rate": 9.95814309419243e-06, "loss": 0.2519, "step": 2170 }, { "epoch": 0.6887291683121396, "grad_norm": 0.26426587738324664, "learning_rate": 9.956706896036762e-06, "loss": 0.2533, "step": 2180 }, { "epoch": 0.6918884764236632, "grad_norm": 0.24898568379039823, "learning_rate": 9.955246579350505e-06, "loss": 0.2491, "step": 2190 }, { "epoch": 0.6950477845351868, "grad_norm": 0.30540369742862966, "learning_rate": 9.953762151239327e-06, "loss": 0.2478, "step": 2200 }, { "epoch": 0.6982070926467103, "grad_norm": 0.2709499727026948, "learning_rate": 9.952253618926212e-06, "loss": 0.2515, "step": 2210 }, { "epoch": 0.701366400758234, "grad_norm": 0.3003234996003297, "learning_rate": 9.95072098975143e-06, "loss": 0.2541, "step": 2220 }, { "epoch": 0.7045257088697575, "grad_norm": 0.2736168957433189, "learning_rate": 9.949164271172512e-06, "loss": 0.2499, "step": 2230 }, { "epoch": 0.7076850169812811, "grad_norm": 0.2651616205282266, "learning_rate": 9.947583470764193e-06, "loss": 0.2506, "step": 2240 }, { "epoch": 0.7108443250928047, "grad_norm": 0.2650203207221141, "learning_rate": 9.945978596218391e-06, "loss": 0.2488, "step": 2250 }, { "epoch": 0.7140036332043282, "grad_norm": 0.31633222399887706, "learning_rate": 9.944349655344168e-06, "loss": 0.2504, "step": 2260 }, { "epoch": 0.7171629413158518, "grad_norm": 0.3450314977108967, "learning_rate": 9.942696656067683e-06, "loss": 0.2487, "step": 2270 }, { "epoch": 0.7203222494273754, "grad_norm": 0.2676203266167299, "learning_rate": 9.941019606432163e-06, "loss": 0.2515, "step": 2280 }, { "epoch": 0.723481557538899, "grad_norm": 0.27501120919754213, "learning_rate": 9.93931851459786e-06, "loss": 0.2472, "step": 2290 }, { "epoch": 0.7266408656504225, "grad_norm": 0.2721444766021836, "learning_rate": 9.937593388842008e-06, "loss": 0.2484, "step": 2300 }, { "epoch": 0.7298001737619462, "grad_norm": 0.3055396538597429, "learning_rate": 9.935844237558792e-06, "loss": 0.2491, "step": 2310 }, { "epoch": 0.7329594818734697, "grad_norm": 0.2687664962558202, "learning_rate": 9.934071069259295e-06, "loss": 0.2511, "step": 2320 }, { "epoch": 0.7361187899849932, "grad_norm": 0.26485303698836093, "learning_rate": 9.932273892571467e-06, "loss": 0.2493, "step": 2330 }, { "epoch": 0.7392780980965169, "grad_norm": 0.27030949688183054, "learning_rate": 9.930452716240077e-06, "loss": 0.2465, "step": 2340 }, { "epoch": 0.7424374062080404, "grad_norm": 0.26582715073120317, "learning_rate": 9.928607549126677e-06, "loss": 0.2492, "step": 2350 }, { "epoch": 0.745596714319564, "grad_norm": 0.26161071455089063, "learning_rate": 9.926738400209546e-06, "loss": 0.2473, "step": 2360 }, { "epoch": 0.7487560224310876, "grad_norm": 0.27281673381922744, "learning_rate": 9.924845278583661e-06, "loss": 0.2461, "step": 2370 }, { "epoch": 0.7519153305426112, "grad_norm": 0.25026144589450994, "learning_rate": 9.922928193460644e-06, "loss": 0.2447, "step": 2380 }, { "epoch": 0.7550746386541347, "grad_norm": 0.2928912991449991, "learning_rate": 9.920987154168719e-06, "loss": 0.2461, "step": 2390 }, { "epoch": 0.7582339467656584, "grad_norm": 0.2850696882452383, "learning_rate": 9.919022170152668e-06, "loss": 0.2499, "step": 2400 }, { "epoch": 0.7613932548771819, "grad_norm": 0.30443484331190923, "learning_rate": 9.917033250973786e-06, "loss": 0.2493, "step": 2410 }, { "epoch": 0.7645525629887054, "grad_norm": 0.3212209432208665, "learning_rate": 9.915020406309828e-06, "loss": 0.2491, "step": 2420 }, { "epoch": 0.7677118711002291, "grad_norm": 0.28504423491604935, "learning_rate": 9.912983645954973e-06, "loss": 0.2474, "step": 2430 }, { "epoch": 0.7708711792117526, "grad_norm": 0.2878869891714329, "learning_rate": 9.910922979819762e-06, "loss": 0.2492, "step": 2440 }, { "epoch": 0.7740304873232762, "grad_norm": 0.2771202754746543, "learning_rate": 9.908838417931062e-06, "loss": 0.2472, "step": 2450 }, { "epoch": 0.7771897954347998, "grad_norm": 0.31577544480370157, "learning_rate": 9.906729970432014e-06, "loss": 0.249, "step": 2460 }, { "epoch": 0.7803491035463234, "grad_norm": 0.3082921227301929, "learning_rate": 9.904597647581982e-06, "loss": 0.2468, "step": 2470 }, { "epoch": 0.7835084116578469, "grad_norm": 0.2562560339656447, "learning_rate": 9.9024414597565e-06, "loss": 0.2495, "step": 2480 }, { "epoch": 0.7866677197693706, "grad_norm": 0.257309029607144, "learning_rate": 9.90026141744723e-06, "loss": 0.2474, "step": 2490 }, { "epoch": 0.7898270278808941, "grad_norm": 0.2512470913827528, "learning_rate": 9.898057531261904e-06, "loss": 0.2472, "step": 2500 }, { "epoch": 0.7929863359924176, "grad_norm": 0.255003256149547, "learning_rate": 9.89582981192427e-06, "loss": 0.2443, "step": 2510 }, { "epoch": 0.7961456441039413, "grad_norm": 0.24903458333429462, "learning_rate": 9.893578270274054e-06, "loss": 0.2473, "step": 2520 }, { "epoch": 0.7993049522154648, "grad_norm": 0.25472580330098815, "learning_rate": 9.891302917266886e-06, "loss": 0.2501, "step": 2530 }, { "epoch": 0.8024642603269884, "grad_norm": 0.2876016304280073, "learning_rate": 9.889003763974272e-06, "loss": 0.248, "step": 2540 }, { "epoch": 0.805623568438512, "grad_norm": 0.26547023285812826, "learning_rate": 9.886680821583512e-06, "loss": 0.2462, "step": 2550 }, { "epoch": 0.8087828765500356, "grad_norm": 0.2541356171021246, "learning_rate": 9.884334101397666e-06, "loss": 0.2481, "step": 2560 }, { "epoch": 0.8119421846615591, "grad_norm": 0.27501124097499335, "learning_rate": 9.881963614835499e-06, "loss": 0.2417, "step": 2570 }, { "epoch": 0.8151014927730827, "grad_norm": 0.24719251381186472, "learning_rate": 9.879569373431408e-06, "loss": 0.2466, "step": 2580 }, { "epoch": 0.8182608008846063, "grad_norm": 0.3030702344650738, "learning_rate": 9.877151388835384e-06, "loss": 0.2472, "step": 2590 }, { "epoch": 0.8214201089961298, "grad_norm": 0.2516088806018057, "learning_rate": 9.87470967281295e-06, "loss": 0.2448, "step": 2600 }, { "epoch": 0.8245794171076535, "grad_norm": 0.27693954241387997, "learning_rate": 9.872244237245096e-06, "loss": 0.2453, "step": 2610 }, { "epoch": 0.827738725219177, "grad_norm": 0.25236801181159096, "learning_rate": 9.869755094128234e-06, "loss": 0.2444, "step": 2620 }, { "epoch": 0.8308980333307006, "grad_norm": 0.2511412465708672, "learning_rate": 9.867242255574127e-06, "loss": 0.2459, "step": 2630 }, { "epoch": 0.8340573414422241, "grad_norm": 0.28252199150173957, "learning_rate": 9.864705733809842e-06, "loss": 0.245, "step": 2640 }, { "epoch": 0.8372166495537477, "grad_norm": 0.25900018707343897, "learning_rate": 9.862145541177681e-06, "loss": 0.2434, "step": 2650 }, { "epoch": 0.8403759576652713, "grad_norm": 0.24962743931418693, "learning_rate": 9.859561690135125e-06, "loss": 0.2461, "step": 2660 }, { "epoch": 0.8435352657767948, "grad_norm": 0.28196557811696227, "learning_rate": 9.856954193254773e-06, "loss": 0.2475, "step": 2670 }, { "epoch": 0.8466945738883185, "grad_norm": 0.26537277820682614, "learning_rate": 9.854323063224282e-06, "loss": 0.2451, "step": 2680 }, { "epoch": 0.849853881999842, "grad_norm": 0.2925684725032324, "learning_rate": 9.851668312846303e-06, "loss": 0.2453, "step": 2690 }, { "epoch": 0.8530131901113656, "grad_norm": 0.31194880176939827, "learning_rate": 9.848989955038422e-06, "loss": 0.2446, "step": 2700 }, { "epoch": 0.8561724982228892, "grad_norm": 0.26907966931847127, "learning_rate": 9.84628800283309e-06, "loss": 0.2416, "step": 2710 }, { "epoch": 0.8593318063344128, "grad_norm": 0.2855876195388184, "learning_rate": 9.843562469377568e-06, "loss": 0.2435, "step": 2720 }, { "epoch": 0.8624911144459363, "grad_norm": 0.26929494670820325, "learning_rate": 9.84081336793386e-06, "loss": 0.246, "step": 2730 }, { "epoch": 0.8656504225574599, "grad_norm": 0.2560753102085232, "learning_rate": 9.838040711878648e-06, "loss": 0.2423, "step": 2740 }, { "epoch": 0.8688097306689835, "grad_norm": 0.2726015414676623, "learning_rate": 9.835244514703223e-06, "loss": 0.2427, "step": 2750 }, { "epoch": 0.871969038780507, "grad_norm": 0.24937962474701536, "learning_rate": 9.83242479001343e-06, "loss": 0.2439, "step": 2760 }, { "epoch": 0.8751283468920307, "grad_norm": 0.2758063411552256, "learning_rate": 9.82958155152959e-06, "loss": 0.2448, "step": 2770 }, { "epoch": 0.8782876550035542, "grad_norm": 0.2905878430819007, "learning_rate": 9.826714813086439e-06, "loss": 0.2412, "step": 2780 }, { "epoch": 0.8814469631150778, "grad_norm": 0.2558643463285868, "learning_rate": 9.82382458863306e-06, "loss": 0.2406, "step": 2790 }, { "epoch": 0.8846062712266014, "grad_norm": 0.2642935160526405, "learning_rate": 9.820910892232816e-06, "loss": 0.2444, "step": 2800 }, { "epoch": 0.8877655793381249, "grad_norm": 0.2592764595190498, "learning_rate": 9.817973738063283e-06, "loss": 0.2386, "step": 2810 }, { "epoch": 0.8909248874496485, "grad_norm": 0.2507512507039411, "learning_rate": 9.815013140416171e-06, "loss": 0.246, "step": 2820 }, { "epoch": 0.8940841955611721, "grad_norm": 0.25908992057608105, "learning_rate": 9.812029113697271e-06, "loss": 0.2395, "step": 2830 }, { "epoch": 0.8972435036726957, "grad_norm": 0.2612092586416161, "learning_rate": 9.809021672426371e-06, "loss": 0.24, "step": 2840 }, { "epoch": 0.9004028117842192, "grad_norm": 0.2673187039088244, "learning_rate": 9.805990831237194e-06, "loss": 0.2444, "step": 2850 }, { "epoch": 0.9035621198957429, "grad_norm": 0.2437036853392356, "learning_rate": 9.802936604877316e-06, "loss": 0.241, "step": 2860 }, { "epoch": 0.9067214280072664, "grad_norm": 0.2825822044337206, "learning_rate": 9.799859008208112e-06, "loss": 0.2419, "step": 2870 }, { "epoch": 0.90988073611879, "grad_norm": 0.28354580397363727, "learning_rate": 9.796758056204662e-06, "loss": 0.2427, "step": 2880 }, { "epoch": 0.9130400442303136, "grad_norm": 0.24849879403822714, "learning_rate": 9.7936337639557e-06, "loss": 0.2407, "step": 2890 }, { "epoch": 0.9161993523418371, "grad_norm": 0.26017599732454766, "learning_rate": 9.790486146663522e-06, "loss": 0.2403, "step": 2900 }, { "epoch": 0.9193586604533607, "grad_norm": 0.2718913199408004, "learning_rate": 9.78731521964392e-06, "loss": 0.2428, "step": 2910 }, { "epoch": 0.9225179685648843, "grad_norm": 0.23612120956344837, "learning_rate": 9.784120998326115e-06, "loss": 0.2401, "step": 2920 }, { "epoch": 0.9256772766764079, "grad_norm": 0.24629353188155112, "learning_rate": 9.780903498252665e-06, "loss": 0.2392, "step": 2930 }, { "epoch": 0.9288365847879314, "grad_norm": 0.2459160847664615, "learning_rate": 9.777662735079406e-06, "loss": 0.2404, "step": 2940 }, { "epoch": 0.9319958928994551, "grad_norm": 0.2475965886432302, "learning_rate": 9.77439872457536e-06, "loss": 0.2412, "step": 2950 }, { "epoch": 0.9351552010109786, "grad_norm": 0.28786836243759434, "learning_rate": 9.771111482622677e-06, "loss": 0.2408, "step": 2960 }, { "epoch": 0.9383145091225021, "grad_norm": 0.2556612314956361, "learning_rate": 9.76780102521654e-06, "loss": 0.2395, "step": 2970 }, { "epoch": 0.9414738172340258, "grad_norm": 0.26540547133463965, "learning_rate": 9.764467368465098e-06, "loss": 0.2408, "step": 2980 }, { "epoch": 0.9446331253455493, "grad_norm": 0.27099661447259094, "learning_rate": 9.761110528589382e-06, "loss": 0.2411, "step": 2990 }, { "epoch": 0.9477924334570729, "grad_norm": 0.2665281174710895, "learning_rate": 9.75773052192323e-06, "loss": 0.2411, "step": 3000 }, { "epoch": 0.9509517415685965, "grad_norm": 0.27363802003153875, "learning_rate": 9.754327364913208e-06, "loss": 0.2378, "step": 3010 }, { "epoch": 0.9541110496801201, "grad_norm": 0.2319637502396465, "learning_rate": 9.75090107411852e-06, "loss": 0.2423, "step": 3020 }, { "epoch": 0.9572703577916436, "grad_norm": 0.24790838457350073, "learning_rate": 9.747451666210946e-06, "loss": 0.2418, "step": 3030 }, { "epoch": 0.9604296659031673, "grad_norm": 0.28516778958623945, "learning_rate": 9.743979157974739e-06, "loss": 0.2416, "step": 3040 }, { "epoch": 0.9635889740146908, "grad_norm": 0.24493900212151187, "learning_rate": 9.740483566306565e-06, "loss": 0.2398, "step": 3050 }, { "epoch": 0.9667482821262143, "grad_norm": 0.2873027578326869, "learning_rate": 9.736964908215402e-06, "loss": 0.2396, "step": 3060 }, { "epoch": 0.969907590237738, "grad_norm": 0.24814910114506777, "learning_rate": 9.733423200822469e-06, "loss": 0.2391, "step": 3070 }, { "epoch": 0.9730668983492615, "grad_norm": 0.24906741016474904, "learning_rate": 9.729858461361142e-06, "loss": 0.242, "step": 3080 }, { "epoch": 0.9762262064607851, "grad_norm": 0.2826980721069527, "learning_rate": 9.726270707176859e-06, "loss": 0.2399, "step": 3090 }, { "epoch": 0.9793855145723087, "grad_norm": 0.26392318924970115, "learning_rate": 9.722659955727055e-06, "loss": 0.2395, "step": 3100 }, { "epoch": 0.9825448226838323, "grad_norm": 0.2855609937725699, "learning_rate": 9.719026224581054e-06, "loss": 0.2379, "step": 3110 }, { "epoch": 0.9857041307953558, "grad_norm": 0.2502787654544348, "learning_rate": 9.715369531420006e-06, "loss": 0.2394, "step": 3120 }, { "epoch": 0.9888634389068794, "grad_norm": 0.23500842887120083, "learning_rate": 9.711689894036785e-06, "loss": 0.2366, "step": 3130 }, { "epoch": 0.992022747018403, "grad_norm": 0.24586324530930273, "learning_rate": 9.707987330335906e-06, "loss": 0.2378, "step": 3140 }, { "epoch": 0.9951820551299265, "grad_norm": 0.23967550366976703, "learning_rate": 9.704261858333445e-06, "loss": 0.2388, "step": 3150 }, { "epoch": 0.9983413632414502, "grad_norm": 0.2725545852571603, "learning_rate": 9.700513496156945e-06, "loss": 0.2378, "step": 3160 }, { "epoch": 1.0012637232446093, "grad_norm": 0.25488826469533976, "learning_rate": 9.696742262045324e-06, "loss": 0.2171, "step": 3170 }, { "epoch": 1.004423031356133, "grad_norm": 0.2434423686192198, "learning_rate": 9.692948174348798e-06, "loss": 0.2256, "step": 3180 }, { "epoch": 1.0075823394676566, "grad_norm": 0.29889977976697246, "learning_rate": 9.689131251528778e-06, "loss": 0.2249, "step": 3190 }, { "epoch": 1.0107416475791802, "grad_norm": 0.2533307535141922, "learning_rate": 9.685291512157793e-06, "loss": 0.2265, "step": 3200 }, { "epoch": 1.0139009556907037, "grad_norm": 0.2663724197725429, "learning_rate": 9.68142897491939e-06, "loss": 0.226, "step": 3210 }, { "epoch": 1.0170602638022272, "grad_norm": 0.2791907738011004, "learning_rate": 9.677543658608047e-06, "loss": 0.2262, "step": 3220 }, { "epoch": 1.0202195719137508, "grad_norm": 0.23622696465246065, "learning_rate": 9.673635582129084e-06, "loss": 0.2222, "step": 3230 }, { "epoch": 1.0233788800252746, "grad_norm": 0.28281839802186026, "learning_rate": 9.669704764498564e-06, "loss": 0.2246, "step": 3240 }, { "epoch": 1.026538188136798, "grad_norm": 0.26869842180643333, "learning_rate": 9.66575122484321e-06, "loss": 0.2249, "step": 3250 }, { "epoch": 1.0296974962483216, "grad_norm": 0.26529568216979593, "learning_rate": 9.661774982400301e-06, "loss": 0.223, "step": 3260 }, { "epoch": 1.0328568043598452, "grad_norm": 0.2453451964854557, "learning_rate": 9.65777605651759e-06, "loss": 0.2238, "step": 3270 }, { "epoch": 1.0360161124713687, "grad_norm": 0.24227112445437904, "learning_rate": 9.653754466653195e-06, "loss": 0.222, "step": 3280 }, { "epoch": 1.0391754205828923, "grad_norm": 0.26279583121049077, "learning_rate": 9.649710232375526e-06, "loss": 0.2236, "step": 3290 }, { "epoch": 1.042334728694416, "grad_norm": 0.31955584307225327, "learning_rate": 9.645643373363166e-06, "loss": 0.2229, "step": 3300 }, { "epoch": 1.0454940368059396, "grad_norm": 0.257107141717547, "learning_rate": 9.64155390940479e-06, "loss": 0.2256, "step": 3310 }, { "epoch": 1.048653344917463, "grad_norm": 0.28510160571621773, "learning_rate": 9.637441860399065e-06, "loss": 0.2243, "step": 3320 }, { "epoch": 1.0518126530289866, "grad_norm": 0.24137541551602262, "learning_rate": 9.633307246354558e-06, "loss": 0.2237, "step": 3330 }, { "epoch": 1.0549719611405102, "grad_norm": 0.26654943800407926, "learning_rate": 9.629150087389625e-06, "loss": 0.2253, "step": 3340 }, { "epoch": 1.0581312692520337, "grad_norm": 0.24362961602020153, "learning_rate": 9.624970403732328e-06, "loss": 0.2291, "step": 3350 }, { "epoch": 1.0612905773635575, "grad_norm": 0.24350680136934802, "learning_rate": 9.620768215720327e-06, "loss": 0.2229, "step": 3360 }, { "epoch": 1.064449885475081, "grad_norm": 0.2410847673171125, "learning_rate": 9.61654354380079e-06, "loss": 0.2291, "step": 3370 }, { "epoch": 1.0676091935866046, "grad_norm": 0.24859633294553296, "learning_rate": 9.612296408530279e-06, "loss": 0.224, "step": 3380 }, { "epoch": 1.070768501698128, "grad_norm": 0.33635244899144884, "learning_rate": 9.608026830574666e-06, "loss": 0.2219, "step": 3390 }, { "epoch": 1.0739278098096516, "grad_norm": 0.288768968554062, "learning_rate": 9.603734830709029e-06, "loss": 0.2252, "step": 3400 }, { "epoch": 1.0770871179211752, "grad_norm": 0.2697580145809562, "learning_rate": 9.599420429817534e-06, "loss": 0.2234, "step": 3410 }, { "epoch": 1.080246426032699, "grad_norm": 0.2520287953499914, "learning_rate": 9.595083648893361e-06, "loss": 0.2218, "step": 3420 }, { "epoch": 1.0834057341442225, "grad_norm": 0.2538820882514716, "learning_rate": 9.59072450903858e-06, "loss": 0.2245, "step": 3430 }, { "epoch": 1.086565042255746, "grad_norm": 0.24006555139584898, "learning_rate": 9.586343031464056e-06, "loss": 0.2245, "step": 3440 }, { "epoch": 1.0897243503672696, "grad_norm": 0.24296965136255919, "learning_rate": 9.581939237489347e-06, "loss": 0.2227, "step": 3450 }, { "epoch": 1.092883658478793, "grad_norm": 0.2641190618925988, "learning_rate": 9.577513148542601e-06, "loss": 0.2224, "step": 3460 }, { "epoch": 1.0960429665903166, "grad_norm": 0.22534300284603093, "learning_rate": 9.573064786160447e-06, "loss": 0.2265, "step": 3470 }, { "epoch": 1.0992022747018404, "grad_norm": 0.24227259587968128, "learning_rate": 9.568594171987894e-06, "loss": 0.2269, "step": 3480 }, { "epoch": 1.102361582813364, "grad_norm": 0.24660324114104168, "learning_rate": 9.564101327778223e-06, "loss": 0.2252, "step": 3490 }, { "epoch": 1.1055208909248875, "grad_norm": 0.2727222419616193, "learning_rate": 9.559586275392887e-06, "loss": 0.2222, "step": 3500 }, { "epoch": 1.108680199036411, "grad_norm": 0.23732734292357127, "learning_rate": 9.555049036801394e-06, "loss": 0.2251, "step": 3510 }, { "epoch": 1.1118395071479346, "grad_norm": 0.2499998501537187, "learning_rate": 9.550489634081213e-06, "loss": 0.2235, "step": 3520 }, { "epoch": 1.114998815259458, "grad_norm": 0.2505631262518965, "learning_rate": 9.545908089417655e-06, "loss": 0.2268, "step": 3530 }, { "epoch": 1.1181581233709816, "grad_norm": 0.24831276616046985, "learning_rate": 9.541304425103772e-06, "loss": 0.2258, "step": 3540 }, { "epoch": 1.1213174314825054, "grad_norm": 0.24804609760497534, "learning_rate": 9.536678663540247e-06, "loss": 0.2232, "step": 3550 }, { "epoch": 1.124476739594029, "grad_norm": 0.2268843330169224, "learning_rate": 9.532030827235285e-06, "loss": 0.2223, "step": 3560 }, { "epoch": 1.1276360477055525, "grad_norm": 0.22228229533403326, "learning_rate": 9.527360938804503e-06, "loss": 0.2261, "step": 3570 }, { "epoch": 1.130795355817076, "grad_norm": 0.25616837563557016, "learning_rate": 9.522669020970821e-06, "loss": 0.2248, "step": 3580 }, { "epoch": 1.1339546639285996, "grad_norm": 0.24799181444300183, "learning_rate": 9.517955096564344e-06, "loss": 0.2249, "step": 3590 }, { "epoch": 1.1371139720401233, "grad_norm": 0.2382662787923193, "learning_rate": 9.513219188522266e-06, "loss": 0.2216, "step": 3600 }, { "epoch": 1.1402732801516469, "grad_norm": 0.25171104996591714, "learning_rate": 9.508461319888744e-06, "loss": 0.2225, "step": 3610 }, { "epoch": 1.1434325882631704, "grad_norm": 0.23415739363429328, "learning_rate": 9.503681513814797e-06, "loss": 0.2236, "step": 3620 }, { "epoch": 1.146591896374694, "grad_norm": 0.2432154138833874, "learning_rate": 9.498879793558184e-06, "loss": 0.2234, "step": 3630 }, { "epoch": 1.1497512044862175, "grad_norm": 0.22687839285475797, "learning_rate": 9.494056182483293e-06, "loss": 0.2222, "step": 3640 }, { "epoch": 1.152910512597741, "grad_norm": 0.26826441248684857, "learning_rate": 9.489210704061036e-06, "loss": 0.2216, "step": 3650 }, { "epoch": 1.1560698207092646, "grad_norm": 0.24724769280109898, "learning_rate": 9.484343381868722e-06, "loss": 0.2242, "step": 3660 }, { "epoch": 1.1592291288207883, "grad_norm": 0.2584281422449668, "learning_rate": 9.479454239589948e-06, "loss": 0.2248, "step": 3670 }, { "epoch": 1.1623884369323119, "grad_norm": 0.2445110224992667, "learning_rate": 9.47454330101449e-06, "loss": 0.2207, "step": 3680 }, { "epoch": 1.1655477450438354, "grad_norm": 0.22768485500843333, "learning_rate": 9.469610590038175e-06, "loss": 0.2231, "step": 3690 }, { "epoch": 1.168707053155359, "grad_norm": 0.23303026391453427, "learning_rate": 9.464656130662775e-06, "loss": 0.2237, "step": 3700 }, { "epoch": 1.1718663612668825, "grad_norm": 0.2489422466683634, "learning_rate": 9.45967994699588e-06, "loss": 0.2249, "step": 3710 }, { "epoch": 1.1750256693784062, "grad_norm": 0.26953611174046954, "learning_rate": 9.454682063250798e-06, "loss": 0.2214, "step": 3720 }, { "epoch": 1.1781849774899298, "grad_norm": 0.24590519797285548, "learning_rate": 9.449662503746416e-06, "loss": 0.2238, "step": 3730 }, { "epoch": 1.1813442856014533, "grad_norm": 0.23511526883880332, "learning_rate": 9.444621292907095e-06, "loss": 0.2224, "step": 3740 }, { "epoch": 1.1845035937129769, "grad_norm": 0.22848149314341606, "learning_rate": 9.439558455262547e-06, "loss": 0.2214, "step": 3750 }, { "epoch": 1.1876629018245004, "grad_norm": 0.25030584511199394, "learning_rate": 9.43447401544772e-06, "loss": 0.2206, "step": 3760 }, { "epoch": 1.190822209936024, "grad_norm": 0.2395716904991911, "learning_rate": 9.429367998202671e-06, "loss": 0.2203, "step": 3770 }, { "epoch": 1.1939815180475475, "grad_norm": 0.24389646038824445, "learning_rate": 9.424240428372454e-06, "loss": 0.2231, "step": 3780 }, { "epoch": 1.1971408261590712, "grad_norm": 0.2511376763314995, "learning_rate": 9.419091330906985e-06, "loss": 0.2229, "step": 3790 }, { "epoch": 1.2003001342705948, "grad_norm": 0.2309948734333885, "learning_rate": 9.413920730860936e-06, "loss": 0.2217, "step": 3800 }, { "epoch": 1.2034594423821183, "grad_norm": 0.22403672252100287, "learning_rate": 9.408728653393613e-06, "loss": 0.2209, "step": 3810 }, { "epoch": 1.2066187504936419, "grad_norm": 0.23770720039134421, "learning_rate": 9.403515123768817e-06, "loss": 0.2262, "step": 3820 }, { "epoch": 1.2097780586051654, "grad_norm": 0.23637922011574056, "learning_rate": 9.398280167354737e-06, "loss": 0.2211, "step": 3830 }, { "epoch": 1.2129373667166892, "grad_norm": 0.23752969597662485, "learning_rate": 9.39302380962382e-06, "loss": 0.2232, "step": 3840 }, { "epoch": 1.2160966748282127, "grad_norm": 0.23711964499813667, "learning_rate": 9.38774607615265e-06, "loss": 0.2199, "step": 3850 }, { "epoch": 1.2192559829397362, "grad_norm": 0.2594025147223781, "learning_rate": 9.382446992621822e-06, "loss": 0.2219, "step": 3860 }, { "epoch": 1.2224152910512598, "grad_norm": 0.24109832395545655, "learning_rate": 9.377126584815812e-06, "loss": 0.2212, "step": 3870 }, { "epoch": 1.2255745991627833, "grad_norm": 0.22269010324866226, "learning_rate": 9.371784878622863e-06, "loss": 0.221, "step": 3880 }, { "epoch": 1.2287339072743069, "grad_norm": 0.22390895916446435, "learning_rate": 9.36642190003485e-06, "loss": 0.2206, "step": 3890 }, { "epoch": 1.2318932153858304, "grad_norm": 0.27627761911305704, "learning_rate": 9.361037675147152e-06, "loss": 0.2209, "step": 3900 }, { "epoch": 1.2350525234973542, "grad_norm": 0.2420598861318583, "learning_rate": 9.355632230158537e-06, "loss": 0.2179, "step": 3910 }, { "epoch": 1.2382118316088777, "grad_norm": 0.2325111241248699, "learning_rate": 9.35020559137102e-06, "loss": 0.2195, "step": 3920 }, { "epoch": 1.2413711397204013, "grad_norm": 0.24210875124091263, "learning_rate": 9.344757785189743e-06, "loss": 0.2209, "step": 3930 }, { "epoch": 1.2445304478319248, "grad_norm": 0.23465411344270035, "learning_rate": 9.339288838122848e-06, "loss": 0.2218, "step": 3940 }, { "epoch": 1.2476897559434483, "grad_norm": 0.24940394557867132, "learning_rate": 9.333798776781344e-06, "loss": 0.2207, "step": 3950 }, { "epoch": 1.250849064054972, "grad_norm": 0.24164502781099362, "learning_rate": 9.328287627878974e-06, "loss": 0.2239, "step": 3960 }, { "epoch": 1.2540083721664956, "grad_norm": 0.24030120002479546, "learning_rate": 9.322755418232094e-06, "loss": 0.2222, "step": 3970 }, { "epoch": 1.2571676802780192, "grad_norm": 0.24260325995919346, "learning_rate": 9.317202174759541e-06, "loss": 0.2205, "step": 3980 }, { "epoch": 1.2603269883895427, "grad_norm": 0.24390555474193987, "learning_rate": 9.311627924482494e-06, "loss": 0.2201, "step": 3990 }, { "epoch": 1.2634862965010663, "grad_norm": 0.2373559456916262, "learning_rate": 9.306032694524346e-06, "loss": 0.2211, "step": 4000 }, { "epoch": 1.2669615354237422, "grad_norm": 0.23593037133395878, "learning_rate": 9.300416512110582e-06, "loss": 0.2212, "step": 4010 }, { "epoch": 1.2701208435352658, "grad_norm": 0.2683209112120544, "learning_rate": 9.29477940456863e-06, "loss": 0.2233, "step": 4020 }, { "epoch": 1.2732801516467893, "grad_norm": 0.24625101625854237, "learning_rate": 9.289121399327742e-06, "loss": 0.2204, "step": 4030 }, { "epoch": 1.2764394597583129, "grad_norm": 0.24487225016448577, "learning_rate": 9.283442523918848e-06, "loss": 0.2216, "step": 4040 }, { "epoch": 1.2795987678698366, "grad_norm": 0.22210615337013012, "learning_rate": 9.27774280597444e-06, "loss": 0.2215, "step": 4050 }, { "epoch": 1.2827580759813602, "grad_norm": 0.22108031546901602, "learning_rate": 9.272022273228414e-06, "loss": 0.2204, "step": 4060 }, { "epoch": 1.2859173840928837, "grad_norm": 0.24375060995624975, "learning_rate": 9.266280953515958e-06, "loss": 0.2206, "step": 4070 }, { "epoch": 1.2890766922044072, "grad_norm": 0.2480993257402374, "learning_rate": 9.260518874773395e-06, "loss": 0.2241, "step": 4080 }, { "epoch": 1.2922360003159308, "grad_norm": 0.22812840181798105, "learning_rate": 9.25473606503807e-06, "loss": 0.2229, "step": 4090 }, { "epoch": 1.2953953084274543, "grad_norm": 0.23578170889449204, "learning_rate": 9.248932552448191e-06, "loss": 0.2202, "step": 4100 }, { "epoch": 1.2985546165389779, "grad_norm": 0.23821605850175842, "learning_rate": 9.24310836524271e-06, "loss": 0.223, "step": 4110 }, { "epoch": 1.3017139246505016, "grad_norm": 0.22354575939426694, "learning_rate": 9.237263531761178e-06, "loss": 0.2206, "step": 4120 }, { "epoch": 1.3048732327620252, "grad_norm": 0.25832314829209013, "learning_rate": 9.2313980804436e-06, "loss": 0.2231, "step": 4130 }, { "epoch": 1.3080325408735487, "grad_norm": 0.25538340887467387, "learning_rate": 9.225512039830316e-06, "loss": 0.2175, "step": 4140 }, { "epoch": 1.3111918489850722, "grad_norm": 0.25215426741015406, "learning_rate": 9.219605438561836e-06, "loss": 0.2237, "step": 4150 }, { "epoch": 1.3143511570965958, "grad_norm": 0.2313336444417292, "learning_rate": 9.213678305378728e-06, "loss": 0.2172, "step": 4160 }, { "epoch": 1.3175104652081195, "grad_norm": 0.2278642515048215, "learning_rate": 9.207730669121458e-06, "loss": 0.2198, "step": 4170 }, { "epoch": 1.320669773319643, "grad_norm": 0.2202393432518963, "learning_rate": 9.201762558730256e-06, "loss": 0.2217, "step": 4180 }, { "epoch": 1.3238290814311666, "grad_norm": 0.23993000450443083, "learning_rate": 9.19577400324498e-06, "loss": 0.218, "step": 4190 }, { "epoch": 1.3269883895426902, "grad_norm": 0.21595002622084153, "learning_rate": 9.189765031804965e-06, "loss": 0.2176, "step": 4200 }, { "epoch": 1.3301476976542137, "grad_norm": 0.2526302368911067, "learning_rate": 9.183735673648893e-06, "loss": 0.2201, "step": 4210 }, { "epoch": 1.3333070057657372, "grad_norm": 0.2431796699930924, "learning_rate": 9.177685958114641e-06, "loss": 0.2197, "step": 4220 }, { "epoch": 1.3364663138772608, "grad_norm": 0.22807380467388666, "learning_rate": 9.171615914639143e-06, "loss": 0.2184, "step": 4230 }, { "epoch": 1.3396256219887843, "grad_norm": 0.23018273423897978, "learning_rate": 9.16552557275824e-06, "loss": 0.2212, "step": 4240 }, { "epoch": 1.342784930100308, "grad_norm": 0.25116936857534927, "learning_rate": 9.159414962106551e-06, "loss": 0.2205, "step": 4250 }, { "epoch": 1.3459442382118316, "grad_norm": 0.2535484242407173, "learning_rate": 9.153284112417314e-06, "loss": 0.2173, "step": 4260 }, { "epoch": 1.3491035463233552, "grad_norm": 0.21916754861180746, "learning_rate": 9.147133053522243e-06, "loss": 0.2206, "step": 4270 }, { "epoch": 1.3522628544348787, "grad_norm": 0.24428352296386502, "learning_rate": 9.140961815351399e-06, "loss": 0.2203, "step": 4280 }, { "epoch": 1.3554221625464025, "grad_norm": 0.22564630207596492, "learning_rate": 9.13477042793302e-06, "loss": 0.2185, "step": 4290 }, { "epoch": 1.358581470657926, "grad_norm": 0.2203268383689168, "learning_rate": 9.128558921393391e-06, "loss": 0.2196, "step": 4300 }, { "epoch": 1.3617407787694495, "grad_norm": 0.22885301789778367, "learning_rate": 9.122327325956697e-06, "loss": 0.2188, "step": 4310 }, { "epoch": 1.364900086880973, "grad_norm": 0.23826459623192525, "learning_rate": 9.116075671944865e-06, "loss": 0.2196, "step": 4320 }, { "epoch": 1.3680593949924966, "grad_norm": 0.23832523948959192, "learning_rate": 9.109803989777432e-06, "loss": 0.22, "step": 4330 }, { "epoch": 1.3712187031040202, "grad_norm": 0.23458728809270832, "learning_rate": 9.103512309971381e-06, "loss": 0.2201, "step": 4340 }, { "epoch": 1.3743780112155437, "grad_norm": 0.2505289166305433, "learning_rate": 9.097200663141007e-06, "loss": 0.2204, "step": 4350 }, { "epoch": 1.3775373193270672, "grad_norm": 0.23547611606996885, "learning_rate": 9.090869079997756e-06, "loss": 0.2176, "step": 4360 }, { "epoch": 1.380696627438591, "grad_norm": 0.23300517643120455, "learning_rate": 9.084517591350085e-06, "loss": 0.2196, "step": 4370 }, { "epoch": 1.3838559355501145, "grad_norm": 0.23821378937778367, "learning_rate": 9.078146228103302e-06, "loss": 0.2205, "step": 4380 }, { "epoch": 1.387015243661638, "grad_norm": 0.24777379778962433, "learning_rate": 9.07175502125943e-06, "loss": 0.217, "step": 4390 }, { "epoch": 1.3901745517731616, "grad_norm": 0.2475049075002878, "learning_rate": 9.065344001917042e-06, "loss": 0.2182, "step": 4400 }, { "epoch": 1.3933338598846854, "grad_norm": 0.22578213255742105, "learning_rate": 9.058913201271116e-06, "loss": 0.2193, "step": 4410 }, { "epoch": 1.396493167996209, "grad_norm": 0.2363691948058237, "learning_rate": 9.052462650612886e-06, "loss": 0.2203, "step": 4420 }, { "epoch": 1.3996524761077325, "grad_norm": 0.22912256425928543, "learning_rate": 9.045992381329678e-06, "loss": 0.219, "step": 4430 }, { "epoch": 1.402811784219256, "grad_norm": 0.24248708988856363, "learning_rate": 9.039502424904778e-06, "loss": 0.2197, "step": 4440 }, { "epoch": 1.4059710923307795, "grad_norm": 0.24586583420100006, "learning_rate": 9.032992812917253e-06, "loss": 0.217, "step": 4450 }, { "epoch": 1.409130400442303, "grad_norm": 0.2352960559617627, "learning_rate": 9.026463577041823e-06, "loss": 0.2187, "step": 4460 }, { "epoch": 1.4122897085538266, "grad_norm": 0.24173709757573497, "learning_rate": 9.019914749048689e-06, "loss": 0.221, "step": 4470 }, { "epoch": 1.4154490166653502, "grad_norm": 0.23957045024343024, "learning_rate": 9.01334636080338e-06, "loss": 0.2186, "step": 4480 }, { "epoch": 1.418608324776874, "grad_norm": 0.2419319656288058, "learning_rate": 9.00675844426661e-06, "loss": 0.2194, "step": 4490 }, { "epoch": 1.4217676328883975, "grad_norm": 0.21881593406458033, "learning_rate": 9.00015103149411e-06, "loss": 0.2186, "step": 4500 }, { "epoch": 1.424926940999921, "grad_norm": 0.24136336388843918, "learning_rate": 8.993524154636475e-06, "loss": 0.2194, "step": 4510 }, { "epoch": 1.4280862491114446, "grad_norm": 0.24438868998683702, "learning_rate": 8.986877845939013e-06, "loss": 0.2182, "step": 4520 }, { "epoch": 1.431245557222968, "grad_norm": 0.2885217400112698, "learning_rate": 8.980212137741584e-06, "loss": 0.2177, "step": 4530 }, { "epoch": 1.4344048653344919, "grad_norm": 0.24449644579392688, "learning_rate": 8.973527062478438e-06, "loss": 0.221, "step": 4540 }, { "epoch": 1.4375641734460154, "grad_norm": 0.23524427018475397, "learning_rate": 8.966822652678068e-06, "loss": 0.2187, "step": 4550 }, { "epoch": 1.440723481557539, "grad_norm": 0.2288850724812931, "learning_rate": 8.960098940963042e-06, "loss": 0.2181, "step": 4560 }, { "epoch": 1.4438827896690625, "grad_norm": 0.21740524732083277, "learning_rate": 8.953355960049848e-06, "loss": 0.2171, "step": 4570 }, { "epoch": 1.447042097780586, "grad_norm": 0.22515210418111187, "learning_rate": 8.946593742748737e-06, "loss": 0.2198, "step": 4580 }, { "epoch": 1.4502014058921096, "grad_norm": 0.2241208354134438, "learning_rate": 8.93981232196356e-06, "loss": 0.2176, "step": 4590 }, { "epoch": 1.453360714003633, "grad_norm": 0.23239653090715226, "learning_rate": 8.933011730691609e-06, "loss": 0.2193, "step": 4600 }, { "epoch": 1.4565200221151569, "grad_norm": 0.2591474679289352, "learning_rate": 8.926192002023457e-06, "loss": 0.2152, "step": 4610 }, { "epoch": 1.4596793302266804, "grad_norm": 0.25412246361370894, "learning_rate": 8.919353169142794e-06, "loss": 0.221, "step": 4620 }, { "epoch": 1.462838638338204, "grad_norm": 0.24625096243975705, "learning_rate": 8.912495265326274e-06, "loss": 0.2176, "step": 4630 }, { "epoch": 1.4659979464497275, "grad_norm": 0.23707218603543193, "learning_rate": 8.905618323943337e-06, "loss": 0.2195, "step": 4640 }, { "epoch": 1.469157254561251, "grad_norm": 0.2428731037963279, "learning_rate": 8.898722378456066e-06, "loss": 0.2194, "step": 4650 }, { "epoch": 1.4723165626727748, "grad_norm": 0.23709943181558119, "learning_rate": 8.89180746241901e-06, "loss": 0.2189, "step": 4660 }, { "epoch": 1.4754758707842983, "grad_norm": 0.22433268675724657, "learning_rate": 8.88487360947903e-06, "loss": 0.2177, "step": 4670 }, { "epoch": 1.4786351788958219, "grad_norm": 0.20952198638497543, "learning_rate": 8.877920853375127e-06, "loss": 0.2168, "step": 4680 }, { "epoch": 1.4817944870073454, "grad_norm": 0.21717097039950753, "learning_rate": 8.87094922793828e-06, "loss": 0.2159, "step": 4690 }, { "epoch": 1.484953795118869, "grad_norm": 0.23796957102894456, "learning_rate": 8.86395876709129e-06, "loss": 0.2151, "step": 4700 }, { "epoch": 1.4881131032303925, "grad_norm": 0.2488834756996504, "learning_rate": 8.856949504848602e-06, "loss": 0.2154, "step": 4710 }, { "epoch": 1.491272411341916, "grad_norm": 0.25338534020929043, "learning_rate": 8.849921475316147e-06, "loss": 0.2182, "step": 4720 }, { "epoch": 1.4944317194534398, "grad_norm": 0.22919814445941372, "learning_rate": 8.842874712691176e-06, "loss": 0.2167, "step": 4730 }, { "epoch": 1.4975910275649633, "grad_norm": 0.2256770534043575, "learning_rate": 8.83580925126209e-06, "loss": 0.2173, "step": 4740 }, { "epoch": 1.5007503356764869, "grad_norm": 0.22659615457077606, "learning_rate": 8.828725125408277e-06, "loss": 0.2195, "step": 4750 }, { "epoch": 1.5039096437880104, "grad_norm": 0.2525621807127862, "learning_rate": 8.821622369599945e-06, "loss": 0.2189, "step": 4760 }, { "epoch": 1.5070689518995342, "grad_norm": 0.23576094914242274, "learning_rate": 8.814501018397948e-06, "loss": 0.2169, "step": 4770 }, { "epoch": 1.5102282600110577, "grad_norm": 0.23522191103261228, "learning_rate": 8.807361106453623e-06, "loss": 0.2174, "step": 4780 }, { "epoch": 1.5133875681225812, "grad_norm": 0.25147556181231623, "learning_rate": 8.800202668508624e-06, "loss": 0.2182, "step": 4790 }, { "epoch": 1.5165468762341048, "grad_norm": 0.229827362444795, "learning_rate": 8.793025739394747e-06, "loss": 0.2188, "step": 4800 }, { "epoch": 1.5197061843456283, "grad_norm": 0.23171793705210642, "learning_rate": 8.78583035403376e-06, "loss": 0.2192, "step": 4810 }, { "epoch": 1.5228654924571519, "grad_norm": 0.24117108800540396, "learning_rate": 8.778616547437244e-06, "loss": 0.2154, "step": 4820 }, { "epoch": 1.5260248005686754, "grad_norm": 0.2239538620184076, "learning_rate": 8.771384354706407e-06, "loss": 0.2189, "step": 4830 }, { "epoch": 1.529184108680199, "grad_norm": 0.22643911431321556, "learning_rate": 8.764133811031926e-06, "loss": 0.219, "step": 4840 }, { "epoch": 1.5323434167917225, "grad_norm": 0.2309154933998562, "learning_rate": 8.756864951693767e-06, "loss": 0.2161, "step": 4850 }, { "epoch": 1.5355027249032462, "grad_norm": 0.22487805218107695, "learning_rate": 8.749577812061019e-06, "loss": 0.2155, "step": 4860 }, { "epoch": 1.5386620330147698, "grad_norm": 0.2508153221101582, "learning_rate": 8.74227242759172e-06, "loss": 0.2165, "step": 4870 }, { "epoch": 1.5418213411262933, "grad_norm": 0.2294176475790702, "learning_rate": 8.734948833832684e-06, "loss": 0.2194, "step": 4880 }, { "epoch": 1.544980649237817, "grad_norm": 0.24196930828876226, "learning_rate": 8.72760706641933e-06, "loss": 0.2179, "step": 4890 }, { "epoch": 1.5481399573493406, "grad_norm": 0.2067023139224669, "learning_rate": 8.720247161075504e-06, "loss": 0.2141, "step": 4900 }, { "epoch": 1.5512992654608642, "grad_norm": 0.22150939402397368, "learning_rate": 8.71286915361331e-06, "loss": 0.2167, "step": 4910 }, { "epoch": 1.5544585735723877, "grad_norm": 0.22765148037671673, "learning_rate": 8.705473079932935e-06, "loss": 0.2193, "step": 4920 }, { "epoch": 1.5576178816839112, "grad_norm": 0.2541351407544811, "learning_rate": 8.698058976022473e-06, "loss": 0.2178, "step": 4930 }, { "epoch": 1.5607771897954348, "grad_norm": 0.22926147481484557, "learning_rate": 8.690626877957745e-06, "loss": 0.217, "step": 4940 }, { "epoch": 1.5639364979069583, "grad_norm": 0.22296251412940335, "learning_rate": 8.683176821902135e-06, "loss": 0.2169, "step": 4950 }, { "epoch": 1.5670958060184819, "grad_norm": 0.20884559665213961, "learning_rate": 8.675708844106407e-06, "loss": 0.2177, "step": 4960 }, { "epoch": 1.5702551141300054, "grad_norm": 0.24025626100283112, "learning_rate": 8.668222980908527e-06, "loss": 0.2197, "step": 4970 }, { "epoch": 1.573414422241529, "grad_norm": 0.23773563954990934, "learning_rate": 8.66071926873349e-06, "loss": 0.2152, "step": 4980 }, { "epoch": 1.5765737303530527, "grad_norm": 0.2145137444703373, "learning_rate": 8.65319774409314e-06, "loss": 0.2171, "step": 4990 }, { "epoch": 1.5797330384645762, "grad_norm": 0.22033335790324296, "learning_rate": 8.645658443585992e-06, "loss": 0.2186, "step": 5000 }, { "epoch": 1.5828923465760998, "grad_norm": 0.22804805519869795, "learning_rate": 8.638101403897062e-06, "loss": 0.2174, "step": 5010 }, { "epoch": 1.5860516546876235, "grad_norm": 0.2289269600023062, "learning_rate": 8.630526661797673e-06, "loss": 0.2164, "step": 5020 }, { "epoch": 1.589210962799147, "grad_norm": 0.2165226855369354, "learning_rate": 8.622934254145292e-06, "loss": 0.2173, "step": 5030 }, { "epoch": 1.5923702709106706, "grad_norm": 0.21269449117197298, "learning_rate": 8.615324217883341e-06, "loss": 0.2158, "step": 5040 }, { "epoch": 1.5955295790221942, "grad_norm": 0.21841160515610453, "learning_rate": 8.607696590041021e-06, "loss": 0.2181, "step": 5050 }, { "epoch": 1.5986888871337177, "grad_norm": 0.23472942411314004, "learning_rate": 8.60005140773313e-06, "loss": 0.2158, "step": 5060 }, { "epoch": 1.6018481952452412, "grad_norm": 0.2599264549024524, "learning_rate": 8.592388708159881e-06, "loss": 0.2202, "step": 5070 }, { "epoch": 1.6050075033567648, "grad_norm": 0.22673012824944389, "learning_rate": 8.584708528606728e-06, "loss": 0.2165, "step": 5080 }, { "epoch": 1.6081668114682883, "grad_norm": 0.2206529413228289, "learning_rate": 8.577010906444174e-06, "loss": 0.2187, "step": 5090 }, { "epoch": 1.6113261195798119, "grad_norm": 0.22895351987221169, "learning_rate": 8.569295879127602e-06, "loss": 0.2159, "step": 5100 }, { "epoch": 1.6144854276913356, "grad_norm": 0.23022447681447986, "learning_rate": 8.56156348419708e-06, "loss": 0.2157, "step": 5110 }, { "epoch": 1.6176447358028592, "grad_norm": 0.2183798711751106, "learning_rate": 8.553813759277185e-06, "loss": 0.2169, "step": 5120 }, { "epoch": 1.6208040439143827, "grad_norm": 0.22109788078597084, "learning_rate": 8.546046742076819e-06, "loss": 0.214, "step": 5130 }, { "epoch": 1.6239633520259065, "grad_norm": 0.2690538258988973, "learning_rate": 8.538262470389027e-06, "loss": 0.2189, "step": 5140 }, { "epoch": 1.62712266013743, "grad_norm": 0.22791429863696558, "learning_rate": 8.530460982090812e-06, "loss": 0.2146, "step": 5150 }, { "epoch": 1.6302819682489536, "grad_norm": 0.23247506925321845, "learning_rate": 8.522642315142948e-06, "loss": 0.2174, "step": 5160 }, { "epoch": 1.633441276360477, "grad_norm": 0.22524917238532172, "learning_rate": 8.514806507589796e-06, "loss": 0.2135, "step": 5170 }, { "epoch": 1.6366005844720006, "grad_norm": 0.21074048952604266, "learning_rate": 8.506953597559125e-06, "loss": 0.2168, "step": 5180 }, { "epoch": 1.6397598925835242, "grad_norm": 0.24438387341045803, "learning_rate": 8.49908362326192e-06, "loss": 0.2181, "step": 5190 }, { "epoch": 1.6429192006950477, "grad_norm": 0.2486574671755116, "learning_rate": 8.491196622992196e-06, "loss": 0.2131, "step": 5200 }, { "epoch": 1.6460785088065713, "grad_norm": 0.2256817140951327, "learning_rate": 8.483292635126814e-06, "loss": 0.2168, "step": 5210 }, { "epoch": 1.6492378169180948, "grad_norm": 0.23268219491083575, "learning_rate": 8.475371698125298e-06, "loss": 0.2199, "step": 5220 }, { "epoch": 1.6523971250296186, "grad_norm": 0.22549590195988278, "learning_rate": 8.46743385052964e-06, "loss": 0.2148, "step": 5230 }, { "epoch": 1.655556433141142, "grad_norm": 0.22816476144990433, "learning_rate": 8.459479130964114e-06, "loss": 0.2161, "step": 5240 }, { "epoch": 1.6587157412526656, "grad_norm": 0.22538580259762195, "learning_rate": 8.451507578135099e-06, "loss": 0.2153, "step": 5250 }, { "epoch": 1.6618750493641894, "grad_norm": 0.2378857848685177, "learning_rate": 8.443519230830871e-06, "loss": 0.2165, "step": 5260 }, { "epoch": 1.665034357475713, "grad_norm": 0.21964276783718112, "learning_rate": 8.435514127921432e-06, "loss": 0.2152, "step": 5270 }, { "epoch": 1.6681936655872365, "grad_norm": 0.2278255608379614, "learning_rate": 8.427492308358314e-06, "loss": 0.2151, "step": 5280 }, { "epoch": 1.67135297369876, "grad_norm": 0.22151461379504075, "learning_rate": 8.419453811174384e-06, "loss": 0.2159, "step": 5290 }, { "epoch": 1.6745122818102836, "grad_norm": 0.23436228542360216, "learning_rate": 8.411398675483668e-06, "loss": 0.2139, "step": 5300 }, { "epoch": 1.677671589921807, "grad_norm": 0.23116461868460544, "learning_rate": 8.403326940481146e-06, "loss": 0.2141, "step": 5310 }, { "epoch": 1.6808308980333306, "grad_norm": 0.22621704434121967, "learning_rate": 8.39523864544257e-06, "loss": 0.2178, "step": 5320 }, { "epoch": 1.6839902061448542, "grad_norm": 0.24437450666792623, "learning_rate": 8.387133829724266e-06, "loss": 0.2148, "step": 5330 }, { "epoch": 1.6871495142563777, "grad_norm": 0.2285633102632245, "learning_rate": 8.379012532762956e-06, "loss": 0.211, "step": 5340 }, { "epoch": 1.6903088223679015, "grad_norm": 0.23322244192934505, "learning_rate": 8.370874794075548e-06, "loss": 0.2159, "step": 5350 }, { "epoch": 1.693468130479425, "grad_norm": 0.22379930823118785, "learning_rate": 8.36272065325896e-06, "loss": 0.2139, "step": 5360 }, { "epoch": 1.6966274385909486, "grad_norm": 0.2120873572584099, "learning_rate": 8.354550149989912e-06, "loss": 0.2152, "step": 5370 }, { "epoch": 1.6997867467024723, "grad_norm": 0.2274107944742338, "learning_rate": 8.346363324024752e-06, "loss": 0.2154, "step": 5380 }, { "epoch": 1.7029460548139959, "grad_norm": 0.22062326975674185, "learning_rate": 8.338160215199239e-06, "loss": 0.213, "step": 5390 }, { "epoch": 1.7061053629255194, "grad_norm": 0.20566039440830033, "learning_rate": 8.329940863428372e-06, "loss": 0.2142, "step": 5400 }, { "epoch": 1.709264671037043, "grad_norm": 0.22328688203535454, "learning_rate": 8.321705308706178e-06, "loss": 0.2174, "step": 5410 }, { "epoch": 1.7124239791485665, "grad_norm": 0.2284198217298347, "learning_rate": 8.313453591105534e-06, "loss": 0.2166, "step": 5420 }, { "epoch": 1.71558328726009, "grad_norm": 0.24234082572473092, "learning_rate": 8.305185750777951e-06, "loss": 0.2153, "step": 5430 }, { "epoch": 1.7187425953716136, "grad_norm": 0.21687647127855095, "learning_rate": 8.296901827953403e-06, "loss": 0.2164, "step": 5440 }, { "epoch": 1.721901903483137, "grad_norm": 0.2121864508017843, "learning_rate": 8.288601862940109e-06, "loss": 0.2139, "step": 5450 }, { "epoch": 1.7250612115946606, "grad_norm": 0.22730841948699315, "learning_rate": 8.280285896124351e-06, "loss": 0.2174, "step": 5460 }, { "epoch": 1.7282205197061844, "grad_norm": 0.2364301598385313, "learning_rate": 8.271953967970273e-06, "loss": 0.214, "step": 5470 }, { "epoch": 1.731379827817708, "grad_norm": 0.2502748891595353, "learning_rate": 8.263606119019684e-06, "loss": 0.2158, "step": 5480 }, { "epoch": 1.7345391359292315, "grad_norm": 0.23803855035434604, "learning_rate": 8.255242389891863e-06, "loss": 0.217, "step": 5490 }, { "epoch": 1.737698444040755, "grad_norm": 0.23212647695583102, "learning_rate": 8.246862821283354e-06, "loss": 0.2147, "step": 5500 }, { "epoch": 1.7408577521522788, "grad_norm": 0.24431104358398667, "learning_rate": 8.238467453967778e-06, "loss": 0.2154, "step": 5510 }, { "epoch": 1.7440170602638023, "grad_norm": 0.21470725579185915, "learning_rate": 8.23005632879563e-06, "loss": 0.2141, "step": 5520 }, { "epoch": 1.7471763683753259, "grad_norm": 0.21322893537867463, "learning_rate": 8.221629486694076e-06, "loss": 0.2137, "step": 5530 }, { "epoch": 1.7503356764868494, "grad_norm": 0.22024980684855178, "learning_rate": 8.213186968666761e-06, "loss": 0.216, "step": 5540 }, { "epoch": 1.753494984598373, "grad_norm": 0.21887176967393748, "learning_rate": 8.20472881579361e-06, "loss": 0.2162, "step": 5550 }, { "epoch": 1.7566542927098965, "grad_norm": 0.21439885224310692, "learning_rate": 8.196255069230618e-06, "loss": 0.215, "step": 5560 }, { "epoch": 1.75981360082142, "grad_norm": 0.22279903289185554, "learning_rate": 8.187765770209662e-06, "loss": 0.2149, "step": 5570 }, { "epoch": 1.7629729089329436, "grad_norm": 0.23141919021697108, "learning_rate": 8.179260960038286e-06, "loss": 0.2158, "step": 5580 }, { "epoch": 1.766132217044467, "grad_norm": 0.21177829542109405, "learning_rate": 8.17074068009952e-06, "loss": 0.2162, "step": 5590 }, { "epoch": 1.7692915251559909, "grad_norm": 0.23088215353230407, "learning_rate": 8.162204971851662e-06, "loss": 0.215, "step": 5600 }, { "epoch": 1.7724508332675144, "grad_norm": 0.22123248711116683, "learning_rate": 8.153653876828081e-06, "loss": 0.2128, "step": 5610 }, { "epoch": 1.775610141379038, "grad_norm": 0.21216354452493327, "learning_rate": 8.145087436637014e-06, "loss": 0.2161, "step": 5620 }, { "epoch": 1.7787694494905617, "grad_norm": 0.2176779002539683, "learning_rate": 8.13650569296137e-06, "loss": 0.2147, "step": 5630 }, { "epoch": 1.7819287576020852, "grad_norm": 0.22379805142011377, "learning_rate": 8.12790868755852e-06, "loss": 0.2149, "step": 5640 }, { "epoch": 1.7850880657136088, "grad_norm": 0.23156638443169314, "learning_rate": 8.119296462260094e-06, "loss": 0.2116, "step": 5650 }, { "epoch": 1.7882473738251323, "grad_norm": 0.23142885687079817, "learning_rate": 8.110669058971783e-06, "loss": 0.2168, "step": 5660 }, { "epoch": 1.7914066819366559, "grad_norm": 0.21278539872224353, "learning_rate": 8.102026519673127e-06, "loss": 0.2136, "step": 5670 }, { "epoch": 1.7945659900481794, "grad_norm": 0.22604551176497345, "learning_rate": 8.093368886417323e-06, "loss": 0.2139, "step": 5680 }, { "epoch": 1.797725298159703, "grad_norm": 0.21945486381827597, "learning_rate": 8.084696201331005e-06, "loss": 0.2148, "step": 5690 }, { "epoch": 1.8008846062712265, "grad_norm": 0.2554690895004349, "learning_rate": 8.07600850661405e-06, "loss": 0.215, "step": 5700 }, { "epoch": 1.80404391438275, "grad_norm": 0.24996286626429107, "learning_rate": 8.067305844539369e-06, "loss": 0.2183, "step": 5710 }, { "epoch": 1.8072032224942738, "grad_norm": 0.24383324293020725, "learning_rate": 8.058588257452705e-06, "loss": 0.2121, "step": 5720 }, { "epoch": 1.8103625306057973, "grad_norm": 0.2169636811525225, "learning_rate": 8.049855787772416e-06, "loss": 0.2145, "step": 5730 }, { "epoch": 1.8135218387173209, "grad_norm": 0.24944768584905086, "learning_rate": 8.041108477989283e-06, "loss": 0.2141, "step": 5740 }, { "epoch": 1.8166811468288446, "grad_norm": 0.21757143809376578, "learning_rate": 8.032346370666297e-06, "loss": 0.2138, "step": 5750 }, { "epoch": 1.8198404549403682, "grad_norm": 0.20693258010147997, "learning_rate": 8.023569508438444e-06, "loss": 0.2159, "step": 5760 }, { "epoch": 1.8229997630518917, "grad_norm": 0.21253234218456654, "learning_rate": 8.014777934012515e-06, "loss": 0.2156, "step": 5770 }, { "epoch": 1.8261590711634152, "grad_norm": 0.2199083063027526, "learning_rate": 8.005971690166879e-06, "loss": 0.2147, "step": 5780 }, { "epoch": 1.8293183792749388, "grad_norm": 0.24052412314651891, "learning_rate": 7.99715081975129e-06, "loss": 0.2129, "step": 5790 }, { "epoch": 1.8324776873864623, "grad_norm": 0.21391433161968806, "learning_rate": 7.98831536568667e-06, "loss": 0.2142, "step": 5800 }, { "epoch": 1.8356369954979859, "grad_norm": 0.2071590477738613, "learning_rate": 7.979465370964904e-06, "loss": 0.2111, "step": 5810 }, { "epoch": 1.8387963036095094, "grad_norm": 0.20813594996772833, "learning_rate": 7.97060087864863e-06, "loss": 0.2139, "step": 5820 }, { "epoch": 1.841955611721033, "grad_norm": 0.21828875123993477, "learning_rate": 7.961721931871023e-06, "loss": 0.2125, "step": 5830 }, { "epoch": 1.8451149198325567, "grad_norm": 0.22962238030556992, "learning_rate": 7.9528285738356e-06, "loss": 0.214, "step": 5840 }, { "epoch": 1.8482742279440803, "grad_norm": 0.22398065321127122, "learning_rate": 7.943920847815995e-06, "loss": 0.2111, "step": 5850 }, { "epoch": 1.8514335360556038, "grad_norm": 0.23666023387603707, "learning_rate": 7.934998797155757e-06, "loss": 0.2136, "step": 5860 }, { "epoch": 1.8545928441671276, "grad_norm": 0.24325581332600177, "learning_rate": 7.926062465268133e-06, "loss": 0.2146, "step": 5870 }, { "epoch": 1.857752152278651, "grad_norm": 0.22514971896352026, "learning_rate": 7.917111895635865e-06, "loss": 0.2143, "step": 5880 }, { "epoch": 1.8609114603901746, "grad_norm": 0.21406995986557276, "learning_rate": 7.908147131810968e-06, "loss": 0.2147, "step": 5890 }, { "epoch": 1.8640707685016982, "grad_norm": 0.22801297726410702, "learning_rate": 7.899168217414526e-06, "loss": 0.2124, "step": 5900 }, { "epoch": 1.8672300766132217, "grad_norm": 0.20686065594624745, "learning_rate": 7.890175196136484e-06, "loss": 0.2128, "step": 5910 }, { "epoch": 1.8703893847247453, "grad_norm": 0.219199870102877, "learning_rate": 7.881168111735417e-06, "loss": 0.212, "step": 5920 }, { "epoch": 1.8735486928362688, "grad_norm": 0.2106377311590809, "learning_rate": 7.872147008038335e-06, "loss": 0.2131, "step": 5930 }, { "epoch": 1.8767080009477923, "grad_norm": 0.22171325213770007, "learning_rate": 7.863111928940465e-06, "loss": 0.2144, "step": 5940 }, { "epoch": 1.8798673090593159, "grad_norm": 0.2408651871754287, "learning_rate": 7.854062918405034e-06, "loss": 0.2145, "step": 5950 }, { "epoch": 1.8830266171708396, "grad_norm": 0.22645870425489717, "learning_rate": 7.845000020463058e-06, "loss": 0.2157, "step": 5960 }, { "epoch": 1.8861859252823632, "grad_norm": 0.21229610496145646, "learning_rate": 7.835923279213124e-06, "loss": 0.2153, "step": 5970 }, { "epoch": 1.8893452333938867, "grad_norm": 0.22168579889983991, "learning_rate": 7.826832738821182e-06, "loss": 0.2135, "step": 5980 }, { "epoch": 1.8925045415054105, "grad_norm": 0.2503714943772614, "learning_rate": 7.817728443520324e-06, "loss": 0.214, "step": 5990 }, { "epoch": 1.895663849616934, "grad_norm": 0.21389350571126534, "learning_rate": 7.808610437610572e-06, "loss": 0.2139, "step": 6000 }, { "epoch": 1.8988231577284576, "grad_norm": 0.21396113064462294, "learning_rate": 7.799478765458665e-06, "loss": 0.215, "step": 6010 }, { "epoch": 1.901982465839981, "grad_norm": 0.21646665487064554, "learning_rate": 7.790333471497831e-06, "loss": 0.2137, "step": 6020 }, { "epoch": 1.9051417739515046, "grad_norm": 0.2173302972697527, "learning_rate": 7.781174600227587e-06, "loss": 0.215, "step": 6030 }, { "epoch": 1.9083010820630282, "grad_norm": 0.208526571079878, "learning_rate": 7.772002196213517e-06, "loss": 0.2144, "step": 6040 }, { "epoch": 1.9114603901745517, "grad_norm": 0.23762678091854472, "learning_rate": 7.762816304087042e-06, "loss": 0.2154, "step": 6050 }, { "epoch": 1.9146196982860753, "grad_norm": 0.23394045959731094, "learning_rate": 7.753616968545223e-06, "loss": 0.214, "step": 6060 }, { "epoch": 1.9177790063975988, "grad_norm": 0.21611757282074848, "learning_rate": 7.744404234350536e-06, "loss": 0.2125, "step": 6070 }, { "epoch": 1.9209383145091226, "grad_norm": 0.21843363212182215, "learning_rate": 7.735178146330647e-06, "loss": 0.2119, "step": 6080 }, { "epoch": 1.924097622620646, "grad_norm": 0.20825290374476896, "learning_rate": 7.7259387493782e-06, "loss": 0.2114, "step": 6090 }, { "epoch": 1.9272569307321696, "grad_norm": 0.20701940040618752, "learning_rate": 7.716686088450601e-06, "loss": 0.2118, "step": 6100 }, { "epoch": 1.9304162388436932, "grad_norm": 0.22116343279147632, "learning_rate": 7.707420208569793e-06, "loss": 0.2114, "step": 6110 }, { "epoch": 1.933575546955217, "grad_norm": 0.21636101800504964, "learning_rate": 7.698141154822048e-06, "loss": 0.216, "step": 6120 }, { "epoch": 1.9367348550667405, "grad_norm": 0.21079410410879035, "learning_rate": 7.68884897235773e-06, "loss": 0.213, "step": 6130 }, { "epoch": 1.939894163178264, "grad_norm": 0.24490335061347607, "learning_rate": 7.679543706391088e-06, "loss": 0.2128, "step": 6140 }, { "epoch": 1.9430534712897876, "grad_norm": 0.22964720738637706, "learning_rate": 7.670225402200037e-06, "loss": 0.2111, "step": 6150 }, { "epoch": 1.946212779401311, "grad_norm": 0.21860013623402114, "learning_rate": 7.660894105125932e-06, "loss": 0.2134, "step": 6160 }, { "epoch": 1.9493720875128346, "grad_norm": 0.2564661587161801, "learning_rate": 7.651549860573347e-06, "loss": 0.2097, "step": 6170 }, { "epoch": 1.9525313956243582, "grad_norm": 0.22698394312118578, "learning_rate": 7.642192714009861e-06, "loss": 0.2124, "step": 6180 }, { "epoch": 1.9556907037358817, "grad_norm": 0.2076381468352638, "learning_rate": 7.632822710965826e-06, "loss": 0.2115, "step": 6190 }, { "epoch": 1.9588500118474053, "grad_norm": 0.2262277080179204, "learning_rate": 7.623439897034155e-06, "loss": 0.2161, "step": 6200 }, { "epoch": 1.962009319958929, "grad_norm": 0.22968981943777275, "learning_rate": 7.614044317870099e-06, "loss": 0.212, "step": 6210 }, { "epoch": 1.9651686280704526, "grad_norm": 0.21734537077847677, "learning_rate": 7.604636019191018e-06, "loss": 0.2112, "step": 6220 }, { "epoch": 1.968327936181976, "grad_norm": 0.2162091973430056, "learning_rate": 7.595215046776165e-06, "loss": 0.2112, "step": 6230 }, { "epoch": 1.9714872442934999, "grad_norm": 0.22143440320403707, "learning_rate": 7.585781446466464e-06, "loss": 0.2108, "step": 6240 }, { "epoch": 1.9746465524050234, "grad_norm": 0.21858157789999919, "learning_rate": 7.5763352641642785e-06, "loss": 0.2145, "step": 6250 }, { "epoch": 1.977805860516547, "grad_norm": 0.23297936516023993, "learning_rate": 7.566876545833197e-06, "loss": 0.2123, "step": 6260 }, { "epoch": 1.9809651686280705, "grad_norm": 0.22908027579219403, "learning_rate": 7.55740533749781e-06, "loss": 0.2114, "step": 6270 }, { "epoch": 1.984124476739594, "grad_norm": 0.2202073126271702, "learning_rate": 7.547921685243475e-06, "loss": 0.2113, "step": 6280 }, { "epoch": 1.9872837848511176, "grad_norm": 0.2309779316659973, "learning_rate": 7.538425635216105e-06, "loss": 0.2136, "step": 6290 }, { "epoch": 1.990443092962641, "grad_norm": 0.2116437633996568, "learning_rate": 7.5289172336219375e-06, "loss": 0.2122, "step": 6300 }, { "epoch": 1.9936024010741646, "grad_norm": 0.21586666623876277, "learning_rate": 7.51939652672731e-06, "loss": 0.2098, "step": 6310 }, { "epoch": 1.9967617091856882, "grad_norm": 0.22548426849872574, "learning_rate": 7.509863560858432e-06, "loss": 0.2114, "step": 6320 }, { "epoch": 1.999921017297212, "grad_norm": 0.2263338267805787, "learning_rate": 7.5003183824011726e-06, "loss": 0.2131, "step": 6330 }, { "epoch": 2.0031593081115235, "grad_norm": 0.2127546790886278, "learning_rate": 7.490761037800816e-06, "loss": 0.1968, "step": 6340 }, { "epoch": 2.006318616223047, "grad_norm": 0.2069882189331785, "learning_rate": 7.48119157356185e-06, "loss": 0.1914, "step": 6350 }, { "epoch": 2.0094779243345706, "grad_norm": 0.20703568644153889, "learning_rate": 7.471610036247733e-06, "loss": 0.1897, "step": 6360 }, { "epoch": 2.012637232446094, "grad_norm": 0.2055646371637049, "learning_rate": 7.462016472480668e-06, "loss": 0.194, "step": 6370 }, { "epoch": 2.0157965405576177, "grad_norm": 0.2226339706612743, "learning_rate": 7.452410928941378e-06, "loss": 0.1921, "step": 6380 }, { "epoch": 2.0189558486691412, "grad_norm": 0.22355427767751265, "learning_rate": 7.442793452368879e-06, "loss": 0.1918, "step": 6390 }, { "epoch": 2.0221151567806652, "grad_norm": 0.2073037658523876, "learning_rate": 7.433164089560251e-06, "loss": 0.192, "step": 6400 }, { "epoch": 2.0252744648921888, "grad_norm": 0.23504421002096731, "learning_rate": 7.423522887370405e-06, "loss": 0.1904, "step": 6410 }, { "epoch": 2.0284337730037123, "grad_norm": 0.20581970412966663, "learning_rate": 7.413869892711867e-06, "loss": 0.1918, "step": 6420 }, { "epoch": 2.031593081115236, "grad_norm": 0.21545977722772292, "learning_rate": 7.40420515255454e-06, "loss": 0.1953, "step": 6430 }, { "epoch": 2.0347523892267594, "grad_norm": 0.21275493409460883, "learning_rate": 7.394528713925482e-06, "loss": 0.1926, "step": 6440 }, { "epoch": 2.037911697338283, "grad_norm": 0.2100072176135639, "learning_rate": 7.38484062390867e-06, "loss": 0.1928, "step": 6450 }, { "epoch": 2.0410710054498065, "grad_norm": 0.2152551253488428, "learning_rate": 7.375140929644776e-06, "loss": 0.1913, "step": 6460 }, { "epoch": 2.04423031356133, "grad_norm": 0.23722168142518216, "learning_rate": 7.365429678330938e-06, "loss": 0.193, "step": 6470 }, { "epoch": 2.0473896216728535, "grad_norm": 0.21165262894718706, "learning_rate": 7.355706917220524e-06, "loss": 0.1919, "step": 6480 }, { "epoch": 2.050548929784377, "grad_norm": 0.19900789144822892, "learning_rate": 7.345972693622916e-06, "loss": 0.1934, "step": 6490 }, { "epoch": 2.0537082378959006, "grad_norm": 0.22490326358812657, "learning_rate": 7.336227054903258e-06, "loss": 0.1947, "step": 6500 }, { "epoch": 2.056867546007424, "grad_norm": 0.2140405521043961, "learning_rate": 7.3264700484822504e-06, "loss": 0.1916, "step": 6510 }, { "epoch": 2.060026854118948, "grad_norm": 0.21185203080938914, "learning_rate": 7.316701721835899e-06, "loss": 0.1921, "step": 6520 }, { "epoch": 2.0631861622304717, "grad_norm": 0.21223665536640837, "learning_rate": 7.306922122495296e-06, "loss": 0.1906, "step": 6530 }, { "epoch": 2.0663454703419952, "grad_norm": 0.21789344473823863, "learning_rate": 7.297131298046381e-06, "loss": 0.1912, "step": 6540 }, { "epoch": 2.0695047784535188, "grad_norm": 0.2186374831032629, "learning_rate": 7.287329296129716e-06, "loss": 0.1905, "step": 6550 }, { "epoch": 2.0726640865650423, "grad_norm": 0.21221974732525253, "learning_rate": 7.2775161644402504e-06, "loss": 0.1911, "step": 6560 }, { "epoch": 2.075823394676566, "grad_norm": 0.22468404766263275, "learning_rate": 7.267691950727089e-06, "loss": 0.1918, "step": 6570 }, { "epoch": 2.0789827027880894, "grad_norm": 0.22280157355766247, "learning_rate": 7.257856702793262e-06, "loss": 0.1953, "step": 6580 }, { "epoch": 2.082142010899613, "grad_norm": 0.2185553426185545, "learning_rate": 7.248010468495486e-06, "loss": 0.1927, "step": 6590 }, { "epoch": 2.0853013190111365, "grad_norm": 0.21359923659529395, "learning_rate": 7.238153295743936e-06, "loss": 0.1914, "step": 6600 }, { "epoch": 2.08846062712266, "grad_norm": 0.21132207896334118, "learning_rate": 7.228285232502016e-06, "loss": 0.1916, "step": 6610 }, { "epoch": 2.0916199352341835, "grad_norm": 0.22929120062926403, "learning_rate": 7.218406326786119e-06, "loss": 0.1927, "step": 6620 }, { "epoch": 2.094779243345707, "grad_norm": 0.21485998840313886, "learning_rate": 7.208516626665394e-06, "loss": 0.1937, "step": 6630 }, { "epoch": 2.097938551457231, "grad_norm": 0.21872022787914924, "learning_rate": 7.198616180261515e-06, "loss": 0.1925, "step": 6640 }, { "epoch": 2.1010978595687546, "grad_norm": 0.2221942323992186, "learning_rate": 7.188705035748447e-06, "loss": 0.191, "step": 6650 }, { "epoch": 2.104257167680278, "grad_norm": 0.23507618605208405, "learning_rate": 7.178783241352209e-06, "loss": 0.1913, "step": 6660 }, { "epoch": 2.1074164757918017, "grad_norm": 0.2124831637924298, "learning_rate": 7.168850845350642e-06, "loss": 0.1937, "step": 6670 }, { "epoch": 2.1105757839033252, "grad_norm": 0.27819912434383975, "learning_rate": 7.158907896073171e-06, "loss": 0.1907, "step": 6680 }, { "epoch": 2.1137350920148488, "grad_norm": 0.2140478018660185, "learning_rate": 7.148954441900569e-06, "loss": 0.1937, "step": 6690 }, { "epoch": 2.1168944001263723, "grad_norm": 0.20813723734002745, "learning_rate": 7.13899053126473e-06, "loss": 0.1928, "step": 6700 }, { "epoch": 2.120053708237896, "grad_norm": 0.21914914363643226, "learning_rate": 7.1290162126484255e-06, "loss": 0.1911, "step": 6710 }, { "epoch": 2.1232130163494194, "grad_norm": 0.20921158225021147, "learning_rate": 7.119031534585068e-06, "loss": 0.1911, "step": 6720 }, { "epoch": 2.126372324460943, "grad_norm": 0.21063028898492417, "learning_rate": 7.109036545658478e-06, "loss": 0.1909, "step": 6730 }, { "epoch": 2.1295316325724665, "grad_norm": 0.20519838588236927, "learning_rate": 7.099031294502651e-06, "loss": 0.1926, "step": 6740 }, { "epoch": 2.13269094068399, "grad_norm": 0.19866370874843575, "learning_rate": 7.089015829801513e-06, "loss": 0.1918, "step": 6750 }, { "epoch": 2.1358502487955136, "grad_norm": 0.23526951667689947, "learning_rate": 7.078990200288685e-06, "loss": 0.1914, "step": 6760 }, { "epoch": 2.1390095569070375, "grad_norm": 0.2119435665033379, "learning_rate": 7.0689544547472564e-06, "loss": 0.1926, "step": 6770 }, { "epoch": 2.142168865018561, "grad_norm": 0.2144608848885566, "learning_rate": 7.058908642009532e-06, "loss": 0.1929, "step": 6780 }, { "epoch": 2.1453281731300846, "grad_norm": 0.2359992087538063, "learning_rate": 7.048852810956805e-06, "loss": 0.1939, "step": 6790 }, { "epoch": 2.148487481241608, "grad_norm": 0.2210600215142186, "learning_rate": 7.038787010519117e-06, "loss": 0.1926, "step": 6800 }, { "epoch": 2.1516467893531317, "grad_norm": 0.24951718000615125, "learning_rate": 7.0287112896750166e-06, "loss": 0.1911, "step": 6810 }, { "epoch": 2.1548060974646552, "grad_norm": 0.2175553461891265, "learning_rate": 7.018625697451327e-06, "loss": 0.1934, "step": 6820 }, { "epoch": 2.1579654055761788, "grad_norm": 0.20956901895344687, "learning_rate": 7.008530282922896e-06, "loss": 0.1913, "step": 6830 }, { "epoch": 2.1611247136877023, "grad_norm": 0.2224814576315156, "learning_rate": 6.998425095212378e-06, "loss": 0.1927, "step": 6840 }, { "epoch": 2.164284021799226, "grad_norm": 0.20337670529721538, "learning_rate": 6.9883101834899694e-06, "loss": 0.1922, "step": 6850 }, { "epoch": 2.1674433299107494, "grad_norm": 0.2198594709438925, "learning_rate": 6.978185596973192e-06, "loss": 0.1917, "step": 6860 }, { "epoch": 2.170602638022273, "grad_norm": 0.21033108894938493, "learning_rate": 6.968051384926634e-06, "loss": 0.1938, "step": 6870 }, { "epoch": 2.173761946133797, "grad_norm": 0.22864890990862674, "learning_rate": 6.957907596661729e-06, "loss": 0.1915, "step": 6880 }, { "epoch": 2.1769212542453205, "grad_norm": 0.21930675227402627, "learning_rate": 6.9477542815365025e-06, "loss": 0.1934, "step": 6890 }, { "epoch": 2.180080562356844, "grad_norm": 0.2079046170539124, "learning_rate": 6.937591488955335e-06, "loss": 0.1913, "step": 6900 }, { "epoch": 2.1832398704683675, "grad_norm": 0.227129021822185, "learning_rate": 6.927419268368727e-06, "loss": 0.1928, "step": 6910 }, { "epoch": 2.186399178579891, "grad_norm": 0.21476017288096094, "learning_rate": 6.917237669273047e-06, "loss": 0.195, "step": 6920 }, { "epoch": 2.1895584866914146, "grad_norm": 0.2101092960463205, "learning_rate": 6.907046741210308e-06, "loss": 0.1929, "step": 6930 }, { "epoch": 2.192717794802938, "grad_norm": 0.21529278522230835, "learning_rate": 6.8968465337679056e-06, "loss": 0.1932, "step": 6940 }, { "epoch": 2.1958771029144617, "grad_norm": 0.22276990935035607, "learning_rate": 6.886637096578395e-06, "loss": 0.1932, "step": 6950 }, { "epoch": 2.1990364110259852, "grad_norm": 0.22806514291674243, "learning_rate": 6.876418479319238e-06, "loss": 0.1938, "step": 6960 }, { "epoch": 2.202195719137509, "grad_norm": 0.20310603052055423, "learning_rate": 6.866190731712566e-06, "loss": 0.1945, "step": 6970 }, { "epoch": 2.2053550272490323, "grad_norm": 0.2000583112376738, "learning_rate": 6.8559539035249405e-06, "loss": 0.1906, "step": 6980 }, { "epoch": 2.208514335360556, "grad_norm": 0.21643622482829, "learning_rate": 6.8457080445671e-06, "loss": 0.1928, "step": 6990 }, { "epoch": 2.2116736434720794, "grad_norm": 0.21135933665048376, "learning_rate": 6.835453204693733e-06, "loss": 0.1927, "step": 7000 }, { "epoch": 2.2148329515836034, "grad_norm": 0.21880301226347054, "learning_rate": 6.825189433803223e-06, "loss": 0.1912, "step": 7010 }, { "epoch": 2.217992259695127, "grad_norm": 0.21873208245413148, "learning_rate": 6.814916781837413e-06, "loss": 0.1922, "step": 7020 }, { "epoch": 2.2211515678066505, "grad_norm": 0.19919301034094944, "learning_rate": 6.804635298781358e-06, "loss": 0.1914, "step": 7030 }, { "epoch": 2.224310875918174, "grad_norm": 0.21709044589388574, "learning_rate": 6.7943450346630845e-06, "loss": 0.1949, "step": 7040 }, { "epoch": 2.2274701840296975, "grad_norm": 0.19238634340689642, "learning_rate": 6.784046039553347e-06, "loss": 0.1926, "step": 7050 }, { "epoch": 2.230629492141221, "grad_norm": 0.20786949384009987, "learning_rate": 6.7737383635653805e-06, "loss": 0.191, "step": 7060 }, { "epoch": 2.2337888002527446, "grad_norm": 0.19859606904892732, "learning_rate": 6.763422056854666e-06, "loss": 0.1921, "step": 7070 }, { "epoch": 2.236948108364268, "grad_norm": 0.22013791343337974, "learning_rate": 6.753097169618672e-06, "loss": 0.1892, "step": 7080 }, { "epoch": 2.2401074164757917, "grad_norm": 0.22775949599854897, "learning_rate": 6.742763752096625e-06, "loss": 0.1924, "step": 7090 }, { "epoch": 2.2432667245873152, "grad_norm": 0.2128328590833451, "learning_rate": 6.732421854569254e-06, "loss": 0.191, "step": 7100 }, { "epoch": 2.246426032698839, "grad_norm": 0.21644470882444383, "learning_rate": 6.722071527358557e-06, "loss": 0.1935, "step": 7110 }, { "epoch": 2.2495853408103628, "grad_norm": 0.21312175511661602, "learning_rate": 6.7117128208275384e-06, "loss": 0.1931, "step": 7120 }, { "epoch": 2.252744648921886, "grad_norm": 0.22730243091149457, "learning_rate": 6.701345785379987e-06, "loss": 0.1922, "step": 7130 }, { "epoch": 2.25590395703341, "grad_norm": 0.23578039614437363, "learning_rate": 6.69097047146021e-06, "loss": 0.1911, "step": 7140 }, { "epoch": 2.2590632651449334, "grad_norm": 0.21392963590443645, "learning_rate": 6.6805869295528e-06, "loss": 0.1916, "step": 7150 }, { "epoch": 2.262222573256457, "grad_norm": 0.21695056856445175, "learning_rate": 6.6701952101823885e-06, "loss": 0.1906, "step": 7160 }, { "epoch": 2.2653818813679805, "grad_norm": 0.21461312040450123, "learning_rate": 6.659795363913389e-06, "loss": 0.1902, "step": 7170 }, { "epoch": 2.268541189479504, "grad_norm": 0.2285522952212848, "learning_rate": 6.649387441349767e-06, "loss": 0.1903, "step": 7180 }, { "epoch": 2.2717004975910275, "grad_norm": 0.2005700632922379, "learning_rate": 6.6389714931347825e-06, "loss": 0.1908, "step": 7190 }, { "epoch": 2.274859805702551, "grad_norm": 0.22483814706686375, "learning_rate": 6.628547569950748e-06, "loss": 0.1943, "step": 7200 }, { "epoch": 2.2780191138140746, "grad_norm": 0.21339185695046756, "learning_rate": 6.61811572251878e-06, "loss": 0.1923, "step": 7210 }, { "epoch": 2.281178421925598, "grad_norm": 0.22013090536921387, "learning_rate": 6.607676001598553e-06, "loss": 0.1931, "step": 7220 }, { "epoch": 2.2843377300371217, "grad_norm": 0.21150686936631666, "learning_rate": 6.597228457988053e-06, "loss": 0.1933, "step": 7230 }, { "epoch": 2.2874970381486452, "grad_norm": 0.22030620455805627, "learning_rate": 6.58677314252333e-06, "loss": 0.1913, "step": 7240 }, { "epoch": 2.2906563462601692, "grad_norm": 0.20354686364957472, "learning_rate": 6.576310106078255e-06, "loss": 0.1935, "step": 7250 }, { "epoch": 2.2938156543716928, "grad_norm": 0.2266197363553553, "learning_rate": 6.565839399564258e-06, "loss": 0.1943, "step": 7260 }, { "epoch": 2.2969749624832163, "grad_norm": 0.21248852160067305, "learning_rate": 6.555361073930098e-06, "loss": 0.1923, "step": 7270 }, { "epoch": 2.30013427059474, "grad_norm": 0.20476133265911461, "learning_rate": 6.544875180161605e-06, "loss": 0.1908, "step": 7280 }, { "epoch": 2.3032935787062634, "grad_norm": 0.20464174752072192, "learning_rate": 6.534381769281437e-06, "loss": 0.1905, "step": 7290 }, { "epoch": 2.306452886817787, "grad_norm": 0.21832471477385795, "learning_rate": 6.523880892348824e-06, "loss": 0.194, "step": 7300 }, { "epoch": 2.3096121949293105, "grad_norm": 0.19969512810776804, "learning_rate": 6.513372600459329e-06, "loss": 0.1914, "step": 7310 }, { "epoch": 2.312771503040834, "grad_norm": 0.21518488246695772, "learning_rate": 6.502856944744593e-06, "loss": 0.1937, "step": 7320 }, { "epoch": 2.3159308111523575, "grad_norm": 0.22335845799273413, "learning_rate": 6.49233397637209e-06, "loss": 0.1909, "step": 7330 }, { "epoch": 2.319090119263881, "grad_norm": 0.2258181355415704, "learning_rate": 6.48180374654487e-06, "loss": 0.1918, "step": 7340 }, { "epoch": 2.3222494273754046, "grad_norm": 0.22062705120624707, "learning_rate": 6.471266306501325e-06, "loss": 0.1925, "step": 7350 }, { "epoch": 2.325408735486928, "grad_norm": 0.21451058313158836, "learning_rate": 6.4607217075149265e-06, "loss": 0.1909, "step": 7360 }, { "epoch": 2.3285680435984517, "grad_norm": 0.20119666338740025, "learning_rate": 6.450170000893978e-06, "loss": 0.1912, "step": 7370 }, { "epoch": 2.3317273517099757, "grad_norm": 0.20548278813927204, "learning_rate": 6.439611237981373e-06, "loss": 0.1903, "step": 7380 }, { "epoch": 2.3348866598214992, "grad_norm": 0.2220285180002872, "learning_rate": 6.429045470154333e-06, "loss": 0.1922, "step": 7390 }, { "epoch": 2.3380459679330228, "grad_norm": 0.20989770488757611, "learning_rate": 6.418472748824172e-06, "loss": 0.1905, "step": 7400 }, { "epoch": 2.3412052760445463, "grad_norm": 0.21011715599104358, "learning_rate": 6.407893125436031e-06, "loss": 0.1918, "step": 7410 }, { "epoch": 2.34436458415607, "grad_norm": 0.19838344804989674, "learning_rate": 6.397306651468641e-06, "loss": 0.1909, "step": 7420 }, { "epoch": 2.3475238922675934, "grad_norm": 0.2169687246351887, "learning_rate": 6.386713378434064e-06, "loss": 0.1927, "step": 7430 }, { "epoch": 2.350683200379117, "grad_norm": 0.204594134231941, "learning_rate": 6.376113357877445e-06, "loss": 0.1925, "step": 7440 }, { "epoch": 2.3538425084906405, "grad_norm": 0.20966117593645678, "learning_rate": 6.365506641376762e-06, "loss": 0.1897, "step": 7450 }, { "epoch": 2.357001816602164, "grad_norm": 0.20527775997224504, "learning_rate": 6.354893280542576e-06, "loss": 0.1929, "step": 7460 }, { "epoch": 2.3601611247136876, "grad_norm": 0.20065093914673782, "learning_rate": 6.344273327017778e-06, "loss": 0.193, "step": 7470 }, { "epoch": 2.363320432825211, "grad_norm": 0.22557801837324515, "learning_rate": 6.333646832477334e-06, "loss": 0.1898, "step": 7480 }, { "epoch": 2.366479740936735, "grad_norm": 0.21882992475206528, "learning_rate": 6.32301384862804e-06, "loss": 0.1941, "step": 7490 }, { "epoch": 2.3696390490482586, "grad_norm": 0.21331149519342535, "learning_rate": 6.31237442720827e-06, "loss": 0.1928, "step": 7500 }, { "epoch": 2.372798357159782, "grad_norm": 0.2135806830134628, "learning_rate": 6.301728619987722e-06, "loss": 0.1912, "step": 7510 }, { "epoch": 2.3759576652713057, "grad_norm": 0.20992047195155106, "learning_rate": 6.29107647876716e-06, "loss": 0.1924, "step": 7520 }, { "epoch": 2.3791169733828292, "grad_norm": 0.22347906358520409, "learning_rate": 6.280418055378175e-06, "loss": 0.1929, "step": 7530 }, { "epoch": 2.3822762814943528, "grad_norm": 0.20738356404474784, "learning_rate": 6.269753401682924e-06, "loss": 0.1921, "step": 7540 }, { "epoch": 2.3854355896058763, "grad_norm": 0.21987439925453983, "learning_rate": 6.25908256957388e-06, "loss": 0.1914, "step": 7550 }, { "epoch": 2.3885948977174, "grad_norm": 0.20180229410257866, "learning_rate": 6.248405610973579e-06, "loss": 0.1915, "step": 7560 }, { "epoch": 2.3917542058289234, "grad_norm": 0.204547582458948, "learning_rate": 6.237722577834366e-06, "loss": 0.1926, "step": 7570 }, { "epoch": 2.394913513940447, "grad_norm": 0.20792182149152452, "learning_rate": 6.227033522138145e-06, "loss": 0.1933, "step": 7580 }, { "epoch": 2.3980728220519705, "grad_norm": 0.20943317325322225, "learning_rate": 6.216338495896125e-06, "loss": 0.192, "step": 7590 }, { "epoch": 2.401232130163494, "grad_norm": 0.2184828315646208, "learning_rate": 6.205637551148567e-06, "loss": 0.1931, "step": 7600 }, { "epoch": 2.4043914382750176, "grad_norm": 0.22863771124698407, "learning_rate": 6.194930739964529e-06, "loss": 0.1928, "step": 7610 }, { "epoch": 2.4075507463865415, "grad_norm": 0.22337051398699578, "learning_rate": 6.1842181144416145e-06, "loss": 0.1924, "step": 7620 }, { "epoch": 2.410710054498065, "grad_norm": 0.21034627944404075, "learning_rate": 6.17349972670572e-06, "loss": 0.1916, "step": 7630 }, { "epoch": 2.4138693626095886, "grad_norm": 0.20200815442203482, "learning_rate": 6.162775628910781e-06, "loss": 0.1934, "step": 7640 }, { "epoch": 2.417028670721112, "grad_norm": 0.20113233320753685, "learning_rate": 6.152045873238512e-06, "loss": 0.1915, "step": 7650 }, { "epoch": 2.4201879788326357, "grad_norm": 0.2148019344588209, "learning_rate": 6.141310511898162e-06, "loss": 0.1901, "step": 7660 }, { "epoch": 2.4233472869441592, "grad_norm": 0.2156955634575946, "learning_rate": 6.130569597126257e-06, "loss": 0.1896, "step": 7670 }, { "epoch": 2.426506595055683, "grad_norm": 0.22700196837396544, "learning_rate": 6.119823181186342e-06, "loss": 0.1923, "step": 7680 }, { "epoch": 2.4296659031672063, "grad_norm": 0.20832014160995668, "learning_rate": 6.109071316368732e-06, "loss": 0.1943, "step": 7690 }, { "epoch": 2.43282521127873, "grad_norm": 0.22039569297008865, "learning_rate": 6.0983140549902544e-06, "loss": 0.1918, "step": 7700 }, { "epoch": 2.4359845193902534, "grad_norm": 0.2032368232732068, "learning_rate": 6.087551449393996e-06, "loss": 0.1908, "step": 7710 }, { "epoch": 2.439143827501777, "grad_norm": 0.21501063236631873, "learning_rate": 6.0767835519490455e-06, "loss": 0.193, "step": 7720 }, { "epoch": 2.442303135613301, "grad_norm": 0.21020124149593053, "learning_rate": 6.066010415050246e-06, "loss": 0.1912, "step": 7730 }, { "epoch": 2.445462443724824, "grad_norm": 0.21241622539470262, "learning_rate": 6.0552320911179295e-06, "loss": 0.1909, "step": 7740 }, { "epoch": 2.448621751836348, "grad_norm": 0.21470252522546168, "learning_rate": 6.04444863259767e-06, "loss": 0.1928, "step": 7750 }, { "epoch": 2.4517810599478715, "grad_norm": 0.21167973032496565, "learning_rate": 6.033660091960025e-06, "loss": 0.1936, "step": 7760 }, { "epoch": 2.454940368059395, "grad_norm": 0.2051994018828898, "learning_rate": 6.02286652170028e-06, "loss": 0.1938, "step": 7770 }, { "epoch": 2.4580996761709186, "grad_norm": 0.20457172842704202, "learning_rate": 6.0120679743381945e-06, "loss": 0.19, "step": 7780 }, { "epoch": 2.461258984282442, "grad_norm": 0.2188284827007198, "learning_rate": 6.001264502417749e-06, "loss": 0.1923, "step": 7790 }, { "epoch": 2.4644182923939657, "grad_norm": 0.2196693674996576, "learning_rate": 5.990456158506879e-06, "loss": 0.1905, "step": 7800 }, { "epoch": 2.4675776005054892, "grad_norm": 0.206778112224436, "learning_rate": 5.979642995197231e-06, "loss": 0.1932, "step": 7810 }, { "epoch": 2.470736908617013, "grad_norm": 0.21023136041598797, "learning_rate": 5.968825065103904e-06, "loss": 0.1914, "step": 7820 }, { "epoch": 2.4738962167285363, "grad_norm": 0.20965600357426922, "learning_rate": 5.958002420865184e-06, "loss": 0.1908, "step": 7830 }, { "epoch": 2.47705552484006, "grad_norm": 0.20939819430473985, "learning_rate": 5.947175115142303e-06, "loss": 0.1923, "step": 7840 }, { "epoch": 2.4802148329515834, "grad_norm": 0.21000576536235138, "learning_rate": 5.936343200619171e-06, "loss": 0.1906, "step": 7850 }, { "epoch": 2.4833741410631074, "grad_norm": 0.21638448903532395, "learning_rate": 5.925506730002125e-06, "loss": 0.1922, "step": 7860 }, { "epoch": 2.486533449174631, "grad_norm": 0.21066924946782678, "learning_rate": 5.914665756019672e-06, "loss": 0.1926, "step": 7870 }, { "epoch": 2.4896927572861545, "grad_norm": 0.21573344183841558, "learning_rate": 5.903820331422228e-06, "loss": 0.1929, "step": 7880 }, { "epoch": 2.492852065397678, "grad_norm": 0.20934900894495664, "learning_rate": 5.8929705089818665e-06, "loss": 0.1915, "step": 7890 }, { "epoch": 2.4960113735092015, "grad_norm": 0.19194977706940777, "learning_rate": 5.882116341492063e-06, "loss": 0.1918, "step": 7900 }, { "epoch": 2.499170681620725, "grad_norm": 0.22582333577460095, "learning_rate": 5.8712578817674356e-06, "loss": 0.1909, "step": 7910 }, { "epoch": 2.5023299897322486, "grad_norm": 0.2083520216392237, "learning_rate": 5.860395182643481e-06, "loss": 0.1915, "step": 7920 }, { "epoch": 2.505489297843772, "grad_norm": 0.21101939233964928, "learning_rate": 5.84952829697633e-06, "loss": 0.1907, "step": 7930 }, { "epoch": 2.5086486059552957, "grad_norm": 0.21143622985502697, "learning_rate": 5.838657277642484e-06, "loss": 0.1935, "step": 7940 }, { "epoch": 2.5118079140668192, "grad_norm": 0.20175953224841772, "learning_rate": 5.8277821775385575e-06, "loss": 0.1924, "step": 7950 }, { "epoch": 2.514967222178343, "grad_norm": 0.21068052700964154, "learning_rate": 5.816903049581021e-06, "loss": 0.1937, "step": 7960 }, { "epoch": 2.5181265302898668, "grad_norm": 0.21882832311876035, "learning_rate": 5.806019946705942e-06, "loss": 0.191, "step": 7970 }, { "epoch": 2.52128583840139, "grad_norm": 0.21291172763480593, "learning_rate": 5.795132921868732e-06, "loss": 0.1909, "step": 7980 }, { "epoch": 2.524445146512914, "grad_norm": 0.2062457751313165, "learning_rate": 5.784242028043886e-06, "loss": 0.1913, "step": 7990 }, { "epoch": 2.5276044546244374, "grad_norm": 0.19898272324165425, "learning_rate": 5.773347318224726e-06, "loss": 0.1926, "step": 8000 }, { "epoch": 2.530763762735961, "grad_norm": 0.21382734065324233, "learning_rate": 5.762448845423136e-06, "loss": 0.1897, "step": 8010 }, { "epoch": 2.5339230708474845, "grad_norm": 0.19860488219214512, "learning_rate": 5.751546662669319e-06, "loss": 0.1916, "step": 8020 }, { "epoch": 2.537082378959008, "grad_norm": 0.20598643966772848, "learning_rate": 5.74064082301152e-06, "loss": 0.1889, "step": 8030 }, { "epoch": 2.5402416870705316, "grad_norm": 0.20481756196092588, "learning_rate": 5.729731379515787e-06, "loss": 0.1916, "step": 8040 }, { "epoch": 2.543400995182055, "grad_norm": 0.21953218134084496, "learning_rate": 5.718818385265701e-06, "loss": 0.1921, "step": 8050 }, { "epoch": 2.5465603032935786, "grad_norm": 0.21169625628604868, "learning_rate": 5.707901893362116e-06, "loss": 0.1925, "step": 8060 }, { "epoch": 2.549719611405102, "grad_norm": 0.20604257642179108, "learning_rate": 5.696981956922909e-06, "loss": 0.1906, "step": 8070 }, { "epoch": 2.5528789195166257, "grad_norm": 0.2041617688415921, "learning_rate": 5.686058629082718e-06, "loss": 0.191, "step": 8080 }, { "epoch": 2.5560382276281493, "grad_norm": 0.21493038957804925, "learning_rate": 5.6751319629926834e-06, "loss": 0.1903, "step": 8090 }, { "epoch": 2.5591975357396732, "grad_norm": 0.22786525193461765, "learning_rate": 5.664202011820183e-06, "loss": 0.1907, "step": 8100 }, { "epoch": 2.5623568438511963, "grad_norm": 0.20645959253391757, "learning_rate": 5.653268828748588e-06, "loss": 0.1901, "step": 8110 }, { "epoch": 2.5655161519627203, "grad_norm": 0.19358622503604414, "learning_rate": 5.642332466976989e-06, "loss": 0.1878, "step": 8120 }, { "epoch": 2.568675460074244, "grad_norm": 0.19969978459877627, "learning_rate": 5.631392979719945e-06, "loss": 0.1903, "step": 8130 }, { "epoch": 2.5718347681857674, "grad_norm": 0.2143548911374617, "learning_rate": 5.620450420207227e-06, "loss": 0.1911, "step": 8140 }, { "epoch": 2.574994076297291, "grad_norm": 0.21122139807224502, "learning_rate": 5.609504841683551e-06, "loss": 0.1904, "step": 8150 }, { "epoch": 2.5781533844088145, "grad_norm": 0.21283424203204657, "learning_rate": 5.598556297408322e-06, "loss": 0.1929, "step": 8160 }, { "epoch": 2.581312692520338, "grad_norm": 0.20416368302758592, "learning_rate": 5.587604840655379e-06, "loss": 0.1924, "step": 8170 }, { "epoch": 2.5844720006318616, "grad_norm": 0.20184025843385978, "learning_rate": 5.576650524712734e-06, "loss": 0.1912, "step": 8180 }, { "epoch": 2.587631308743385, "grad_norm": 0.20798749173073353, "learning_rate": 5.565693402882306e-06, "loss": 0.1923, "step": 8190 }, { "epoch": 2.5907906168549086, "grad_norm": 0.2110396741272342, "learning_rate": 5.554733528479672e-06, "loss": 0.1901, "step": 8200 }, { "epoch": 2.5939499249664326, "grad_norm": 0.2166668205307208, "learning_rate": 5.5437709548337985e-06, "loss": 0.191, "step": 8210 }, { "epoch": 2.5971092330779557, "grad_norm": 0.22301181516302854, "learning_rate": 5.53280573528679e-06, "loss": 0.1911, "step": 8220 }, { "epoch": 2.6002685411894797, "grad_norm": 0.20791087766916325, "learning_rate": 5.521837923193621e-06, "loss": 0.1889, "step": 8230 }, { "epoch": 2.6034278493010032, "grad_norm": 0.21867859844216495, "learning_rate": 5.510867571921887e-06, "loss": 0.19, "step": 8240 }, { "epoch": 2.606587157412527, "grad_norm": 0.20256198649149781, "learning_rate": 5.499894734851533e-06, "loss": 0.1908, "step": 8250 }, { "epoch": 2.6097464655240503, "grad_norm": 0.2118493538585681, "learning_rate": 5.488919465374601e-06, "loss": 0.1898, "step": 8260 }, { "epoch": 2.612905773635574, "grad_norm": 0.20908873607701223, "learning_rate": 5.477941816894973e-06, "loss": 0.1904, "step": 8270 }, { "epoch": 2.6160650817470974, "grad_norm": 0.18994734431648216, "learning_rate": 5.4669618428281e-06, "loss": 0.1895, "step": 8280 }, { "epoch": 2.619224389858621, "grad_norm": 0.21464906838442485, "learning_rate": 5.455979596600752e-06, "loss": 0.1906, "step": 8290 }, { "epoch": 2.6223836979701445, "grad_norm": 0.20545088012285806, "learning_rate": 5.444995131650757e-06, "loss": 0.1916, "step": 8300 }, { "epoch": 2.625543006081668, "grad_norm": 0.2308591130429511, "learning_rate": 5.434008501426739e-06, "loss": 0.1915, "step": 8310 }, { "epoch": 2.6287023141931916, "grad_norm": 0.19533870441750978, "learning_rate": 5.423019759387851e-06, "loss": 0.1905, "step": 8320 }, { "epoch": 2.631861622304715, "grad_norm": 0.21360189157598528, "learning_rate": 5.41202895900353e-06, "loss": 0.1891, "step": 8330 }, { "epoch": 2.635020930416239, "grad_norm": 0.2086177679970007, "learning_rate": 5.401036153753224e-06, "loss": 0.1894, "step": 8340 }, { "epoch": 2.638180238527762, "grad_norm": 0.20511520788720256, "learning_rate": 5.390041397126139e-06, "loss": 0.191, "step": 8350 }, { "epoch": 2.641339546639286, "grad_norm": 0.20744339959234745, "learning_rate": 5.379044742620975e-06, "loss": 0.1906, "step": 8360 }, { "epoch": 2.6444988547508097, "grad_norm": 0.21002258208476907, "learning_rate": 5.368046243745664e-06, "loss": 0.19, "step": 8370 }, { "epoch": 2.6476581628623332, "grad_norm": 0.20045395931004378, "learning_rate": 5.357045954017117e-06, "loss": 0.1918, "step": 8380 }, { "epoch": 2.650817470973857, "grad_norm": 0.20206701855858772, "learning_rate": 5.346043926960955e-06, "loss": 0.1914, "step": 8390 }, { "epoch": 2.6539767790853803, "grad_norm": 0.21096628622208463, "learning_rate": 5.335040216111259e-06, "loss": 0.192, "step": 8400 }, { "epoch": 2.657136087196904, "grad_norm": 0.20546853070131513, "learning_rate": 5.324034875010293e-06, "loss": 0.1913, "step": 8410 }, { "epoch": 2.6602953953084274, "grad_norm": 0.21143968803860907, "learning_rate": 5.313027957208262e-06, "loss": 0.19, "step": 8420 }, { "epoch": 2.663454703419951, "grad_norm": 0.20813089241185626, "learning_rate": 5.30201951626304e-06, "loss": 0.1908, "step": 8430 }, { "epoch": 2.6666140115314745, "grad_norm": 0.2065605941239096, "learning_rate": 5.291009605739912e-06, "loss": 0.1914, "step": 8440 }, { "epoch": 2.669773319642998, "grad_norm": 0.211794362752723, "learning_rate": 5.279998279211315e-06, "loss": 0.194, "step": 8450 }, { "epoch": 2.6729326277545216, "grad_norm": 0.21353505062189707, "learning_rate": 5.2689855902565725e-06, "loss": 0.1907, "step": 8460 }, { "epoch": 2.6760919358660455, "grad_norm": 0.20548536484778626, "learning_rate": 5.257971592461643e-06, "loss": 0.1889, "step": 8470 }, { "epoch": 2.6792512439775686, "grad_norm": 0.20132489673742388, "learning_rate": 5.2469563394188485e-06, "loss": 0.1913, "step": 8480 }, { "epoch": 2.6824105520890926, "grad_norm": 0.2183068296850621, "learning_rate": 5.235939884726624e-06, "loss": 0.1899, "step": 8490 }, { "epoch": 2.685569860200616, "grad_norm": 0.21831688267562407, "learning_rate": 5.224922281989245e-06, "loss": 0.1885, "step": 8500 }, { "epoch": 2.6887291683121397, "grad_norm": 0.20042363588520107, "learning_rate": 5.213903584816578e-06, "loss": 0.1919, "step": 8510 }, { "epoch": 2.6918884764236632, "grad_norm": 0.19948817447604472, "learning_rate": 5.202883846823816e-06, "loss": 0.1869, "step": 8520 }, { "epoch": 2.695047784535187, "grad_norm": 0.20763152988351818, "learning_rate": 5.1918631216312095e-06, "loss": 0.1892, "step": 8530 }, { "epoch": 2.6982070926467103, "grad_norm": 0.21738471086652061, "learning_rate": 5.1808414628638206e-06, "loss": 0.1904, "step": 8540 }, { "epoch": 2.701366400758234, "grad_norm": 0.21100460968688434, "learning_rate": 5.16981892415125e-06, "loss": 0.1899, "step": 8550 }, { "epoch": 2.7045257088697574, "grad_norm": 0.21418226702194354, "learning_rate": 5.158795559127379e-06, "loss": 0.191, "step": 8560 }, { "epoch": 2.707685016981281, "grad_norm": 0.20136963314709463, "learning_rate": 5.147771421430112e-06, "loss": 0.1903, "step": 8570 }, { "epoch": 2.710844325092805, "grad_norm": 0.19436149226210073, "learning_rate": 5.136746564701113e-06, "loss": 0.1921, "step": 8580 }, { "epoch": 2.714003633204328, "grad_norm": 0.2140972731026424, "learning_rate": 5.125721042585542e-06, "loss": 0.1896, "step": 8590 }, { "epoch": 2.717162941315852, "grad_norm": 0.20787084627216532, "learning_rate": 5.114694908731801e-06, "loss": 0.1915, "step": 8600 }, { "epoch": 2.7203222494273755, "grad_norm": 0.21044073918475403, "learning_rate": 5.103668216791266e-06, "loss": 0.1924, "step": 8610 }, { "epoch": 2.723481557538899, "grad_norm": 0.20735197165221705, "learning_rate": 5.092641020418026e-06, "loss": 0.1886, "step": 8620 }, { "epoch": 2.7266408656504226, "grad_norm": 0.206930807103157, "learning_rate": 5.0816133732686305e-06, "loss": 0.1898, "step": 8630 }, { "epoch": 2.729800173761946, "grad_norm": 0.19205839670275318, "learning_rate": 5.070585329001819e-06, "loss": 0.1908, "step": 8640 }, { "epoch": 2.7329594818734697, "grad_norm": 0.212328301487025, "learning_rate": 5.059556941278261e-06, "loss": 0.1903, "step": 8650 }, { "epoch": 2.7361187899849932, "grad_norm": 0.2099153942052239, "learning_rate": 5.048528263760301e-06, "loss": 0.1909, "step": 8660 }, { "epoch": 2.739278098096517, "grad_norm": 0.2137298602899187, "learning_rate": 5.037499350111693e-06, "loss": 0.1886, "step": 8670 }, { "epoch": 2.7424374062080403, "grad_norm": 0.20013893402808897, "learning_rate": 5.026470253997339e-06, "loss": 0.1918, "step": 8680 }, { "epoch": 2.745596714319564, "grad_norm": 0.2029041595541199, "learning_rate": 5.0154410290830295e-06, "loss": 0.1896, "step": 8690 }, { "epoch": 2.7487560224310874, "grad_norm": 0.21844048806518204, "learning_rate": 5.004411729035179e-06, "loss": 0.1903, "step": 8700 }, { "epoch": 2.7519153305426114, "grad_norm": 0.20229493690485986, "learning_rate": 4.9933824075205735e-06, "loss": 0.1889, "step": 8710 }, { "epoch": 2.7550746386541345, "grad_norm": 0.20057196812482364, "learning_rate": 4.982353118206095e-06, "loss": 0.1905, "step": 8720 }, { "epoch": 2.7582339467656585, "grad_norm": 0.20127831749959146, "learning_rate": 4.971323914758479e-06, "loss": 0.192, "step": 8730 }, { "epoch": 2.761393254877182, "grad_norm": 0.19694835136970584, "learning_rate": 4.9602948508440365e-06, "loss": 0.1899, "step": 8740 }, { "epoch": 2.7645525629887056, "grad_norm": 0.20744913562422332, "learning_rate": 4.949265980128398e-06, "loss": 0.1903, "step": 8750 }, { "epoch": 2.767711871100229, "grad_norm": 0.1938079327085531, "learning_rate": 4.938237356276261e-06, "loss": 0.1909, "step": 8760 }, { "epoch": 2.7708711792117526, "grad_norm": 0.2007179212194115, "learning_rate": 4.9272090329511136e-06, "loss": 0.1897, "step": 8770 }, { "epoch": 2.774030487323276, "grad_norm": 0.21505457205199155, "learning_rate": 4.916181063814989e-06, "loss": 0.1925, "step": 8780 }, { "epoch": 2.7771897954347997, "grad_norm": 0.21435123984044802, "learning_rate": 4.905153502528193e-06, "loss": 0.1902, "step": 8790 }, { "epoch": 2.7803491035463233, "grad_norm": 0.2061154947415264, "learning_rate": 4.894126402749044e-06, "loss": 0.1898, "step": 8800 }, { "epoch": 2.783508411657847, "grad_norm": 0.2131667185188113, "learning_rate": 4.883099818133624e-06, "loss": 0.193, "step": 8810 }, { "epoch": 2.7866677197693708, "grad_norm": 0.21301174700619682, "learning_rate": 4.872073802335499e-06, "loss": 0.1906, "step": 8820 }, { "epoch": 2.789827027880894, "grad_norm": 0.2108477817285028, "learning_rate": 4.86104840900547e-06, "loss": 0.19, "step": 8830 }, { "epoch": 2.792986335992418, "grad_norm": 0.202466254802374, "learning_rate": 4.850023691791313e-06, "loss": 0.1913, "step": 8840 }, { "epoch": 2.7961456441039414, "grad_norm": 0.21383889045040108, "learning_rate": 4.838999704337507e-06, "loss": 0.1905, "step": 8850 }, { "epoch": 2.799304952215465, "grad_norm": 0.22342476096804859, "learning_rate": 4.82797650028499e-06, "loss": 0.1929, "step": 8860 }, { "epoch": 2.8024642603269885, "grad_norm": 0.2124173070769737, "learning_rate": 4.816954133270879e-06, "loss": 0.1902, "step": 8870 }, { "epoch": 2.805623568438512, "grad_norm": 0.20879593574150696, "learning_rate": 4.805932656928218e-06, "loss": 0.1907, "step": 8880 }, { "epoch": 2.8087828765500356, "grad_norm": 0.2009544133407569, "learning_rate": 4.794912124885728e-06, "loss": 0.1924, "step": 8890 }, { "epoch": 2.811942184661559, "grad_norm": 0.2176494138664626, "learning_rate": 4.78389259076752e-06, "loss": 0.19, "step": 8900 }, { "epoch": 2.8151014927730826, "grad_norm": 0.20657867775507024, "learning_rate": 4.772874108192864e-06, "loss": 0.1886, "step": 8910 }, { "epoch": 2.818260800884606, "grad_norm": 0.19602872268511337, "learning_rate": 4.761856730775902e-06, "loss": 0.1901, "step": 8920 }, { "epoch": 2.8214201089961297, "grad_norm": 0.21196238191751543, "learning_rate": 4.750840512125403e-06, "loss": 0.1883, "step": 8930 }, { "epoch": 2.8245794171076533, "grad_norm": 0.22132755218766037, "learning_rate": 4.7398255058445e-06, "loss": 0.1884, "step": 8940 }, { "epoch": 2.8277387252191772, "grad_norm": 0.21199745533152217, "learning_rate": 4.72881176553042e-06, "loss": 0.1893, "step": 8950 }, { "epoch": 2.8308980333307003, "grad_norm": 0.20115752802009068, "learning_rate": 4.717799344774241e-06, "loss": 0.19, "step": 8960 }, { "epoch": 2.8340573414422243, "grad_norm": 0.1971895330735378, "learning_rate": 4.706788297160608e-06, "loss": 0.1914, "step": 8970 }, { "epoch": 2.837216649553748, "grad_norm": 0.22093757797756977, "learning_rate": 4.69577867626749e-06, "loss": 0.1911, "step": 8980 }, { "epoch": 2.8403759576652714, "grad_norm": 0.20276757417965852, "learning_rate": 4.684770535665917e-06, "loss": 0.1894, "step": 8990 }, { "epoch": 2.843535265776795, "grad_norm": 0.2052738035290844, "learning_rate": 4.673763928919712e-06, "loss": 0.1904, "step": 9000 }, { "epoch": 2.8466945738883185, "grad_norm": 0.1997948903035985, "learning_rate": 4.662758909585233e-06, "loss": 0.1902, "step": 9010 }, { "epoch": 2.849853881999842, "grad_norm": 0.21607370638898796, "learning_rate": 4.651755531211121e-06, "loss": 0.1885, "step": 9020 }, { "epoch": 2.8530131901113656, "grad_norm": 0.2072331158936079, "learning_rate": 4.640753847338022e-06, "loss": 0.1903, "step": 9030 }, { "epoch": 2.856172498222889, "grad_norm": 0.2095788996901622, "learning_rate": 4.629753911498348e-06, "loss": 0.1906, "step": 9040 }, { "epoch": 2.8593318063344126, "grad_norm": 0.21107554604389273, "learning_rate": 4.618755777215998e-06, "loss": 0.1875, "step": 9050 }, { "epoch": 2.862491114445936, "grad_norm": 0.20742194888322327, "learning_rate": 4.607759498006105e-06, "loss": 0.1899, "step": 9060 }, { "epoch": 2.8656504225574597, "grad_norm": 0.1998899750485672, "learning_rate": 4.596765127374781e-06, "loss": 0.1887, "step": 9070 }, { "epoch": 2.8688097306689837, "grad_norm": 0.21434577458006387, "learning_rate": 4.5857727188188426e-06, "loss": 0.19, "step": 9080 }, { "epoch": 2.871969038780507, "grad_norm": 0.20486890206169453, "learning_rate": 4.57478232582557e-06, "loss": 0.1888, "step": 9090 }, { "epoch": 2.875128346892031, "grad_norm": 0.19626003974934564, "learning_rate": 4.563794001872428e-06, "loss": 0.189, "step": 9100 }, { "epoch": 2.8782876550035543, "grad_norm": 0.19598418482256752, "learning_rate": 4.5528078004268125e-06, "loss": 0.1908, "step": 9110 }, { "epoch": 2.881446963115078, "grad_norm": 0.20771737890224903, "learning_rate": 4.5418237749458e-06, "loss": 0.1893, "step": 9120 }, { "epoch": 2.8846062712266014, "grad_norm": 0.21369972050863945, "learning_rate": 4.5308419788758705e-06, "loss": 0.1914, "step": 9130 }, { "epoch": 2.887765579338125, "grad_norm": 0.19830047599777365, "learning_rate": 4.519862465652664e-06, "loss": 0.1891, "step": 9140 }, { "epoch": 2.8909248874496485, "grad_norm": 0.21728853363365294, "learning_rate": 4.508885288700706e-06, "loss": 0.1878, "step": 9150 }, { "epoch": 2.894084195561172, "grad_norm": 0.21489567981218682, "learning_rate": 4.497910501433153e-06, "loss": 0.1892, "step": 9160 }, { "epoch": 2.8972435036726956, "grad_norm": 0.1999440134687983, "learning_rate": 4.486938157251544e-06, "loss": 0.1913, "step": 9170 }, { "epoch": 2.900402811784219, "grad_norm": 0.21121245524106846, "learning_rate": 4.475968309545519e-06, "loss": 0.192, "step": 9180 }, { "epoch": 2.903562119895743, "grad_norm": 0.19232313304792983, "learning_rate": 4.465001011692575e-06, "loss": 0.1884, "step": 9190 }, { "epoch": 2.906721428007266, "grad_norm": 0.2047713363163475, "learning_rate": 4.454036317057804e-06, "loss": 0.1897, "step": 9200 }, { "epoch": 2.90988073611879, "grad_norm": 0.20838735166818767, "learning_rate": 4.443074278993625e-06, "loss": 0.1868, "step": 9210 }, { "epoch": 2.9130400442303137, "grad_norm": 0.19377646969116016, "learning_rate": 4.43211495083954e-06, "loss": 0.1899, "step": 9220 }, { "epoch": 2.9161993523418372, "grad_norm": 0.2081826810286359, "learning_rate": 4.421158385921856e-06, "loss": 0.1901, "step": 9230 }, { "epoch": 2.919358660453361, "grad_norm": 0.19836105918450578, "learning_rate": 4.410204637553437e-06, "loss": 0.1897, "step": 9240 }, { "epoch": 2.9225179685648843, "grad_norm": 0.20794480096225132, "learning_rate": 4.3992537590334485e-06, "loss": 0.1904, "step": 9250 }, { "epoch": 2.925677276676408, "grad_norm": 0.21149092375760936, "learning_rate": 4.38830580364708e-06, "loss": 0.1897, "step": 9260 }, { "epoch": 2.9288365847879314, "grad_norm": 0.20074752053463094, "learning_rate": 4.377360824665309e-06, "loss": 0.1876, "step": 9270 }, { "epoch": 2.931995892899455, "grad_norm": 0.20279658516073718, "learning_rate": 4.366418875344624e-06, "loss": 0.1888, "step": 9280 }, { "epoch": 2.9351552010109785, "grad_norm": 0.202921993136915, "learning_rate": 4.3554800089267705e-06, "loss": 0.192, "step": 9290 }, { "epoch": 2.938314509122502, "grad_norm": 0.2112414397403086, "learning_rate": 4.344544278638499e-06, "loss": 0.1883, "step": 9300 }, { "epoch": 2.9414738172340256, "grad_norm": 0.1979648780466572, "learning_rate": 4.333611737691296e-06, "loss": 0.188, "step": 9310 }, { "epoch": 2.9446331253455496, "grad_norm": 0.19936914995735966, "learning_rate": 4.322682439281126e-06, "loss": 0.1876, "step": 9320 }, { "epoch": 2.9477924334570726, "grad_norm": 0.2045530468116589, "learning_rate": 4.311756436588185e-06, "loss": 0.1861, "step": 9330 }, { "epoch": 2.9509517415685966, "grad_norm": 0.2207404912444104, "learning_rate": 4.300833782776624e-06, "loss": 0.1894, "step": 9340 }, { "epoch": 2.95411104968012, "grad_norm": 0.19298829375532547, "learning_rate": 4.289914530994303e-06, "loss": 0.1885, "step": 9350 }, { "epoch": 2.9572703577916437, "grad_norm": 0.20201449973484348, "learning_rate": 4.27899873437253e-06, "loss": 0.1892, "step": 9360 }, { "epoch": 2.9604296659031673, "grad_norm": 0.2038957223369961, "learning_rate": 4.268086446025793e-06, "loss": 0.1884, "step": 9370 }, { "epoch": 2.963588974014691, "grad_norm": 0.21578914203411362, "learning_rate": 4.25717771905152e-06, "loss": 0.1892, "step": 9380 }, { "epoch": 2.9667482821262143, "grad_norm": 0.21096600784690153, "learning_rate": 4.2462726065298e-06, "loss": 0.1902, "step": 9390 }, { "epoch": 2.969907590237738, "grad_norm": 0.2033196759446758, "learning_rate": 4.235371161523141e-06, "loss": 0.1892, "step": 9400 }, { "epoch": 2.9730668983492614, "grad_norm": 0.20356432378382075, "learning_rate": 4.224473437076204e-06, "loss": 0.1905, "step": 9410 }, { "epoch": 2.976226206460785, "grad_norm": 0.21038412746729923, "learning_rate": 4.2135794862155454e-06, "loss": 0.1912, "step": 9420 }, { "epoch": 2.979385514572309, "grad_norm": 0.2033988073630781, "learning_rate": 4.20268936194936e-06, "loss": 0.1897, "step": 9430 }, { "epoch": 2.982544822683832, "grad_norm": 0.20236599162357977, "learning_rate": 4.191803117267223e-06, "loss": 0.1893, "step": 9440 }, { "epoch": 2.985704130795356, "grad_norm": 0.20641674023136433, "learning_rate": 4.180920805139835e-06, "loss": 0.1888, "step": 9450 }, { "epoch": 2.9888634389068796, "grad_norm": 0.22914246659818197, "learning_rate": 4.170042478518759e-06, "loss": 0.1875, "step": 9460 }, { "epoch": 2.992022747018403, "grad_norm": 0.20738233476923548, "learning_rate": 4.159168190336162e-06, "loss": 0.187, "step": 9470 }, { "epoch": 2.9951820551299266, "grad_norm": 0.20804494541317625, "learning_rate": 4.148297993504566e-06, "loss": 0.1902, "step": 9480 }, { "epoch": 2.99834136324145, "grad_norm": 0.19974693835491714, "learning_rate": 4.137431940916584e-06, "loss": 0.1866, "step": 9490 }, { "epoch": 3.001579654055762, "grad_norm": 0.19231070887741758, "learning_rate": 4.12657008544466e-06, "loss": 0.1846, "step": 9500 }, { "epoch": 3.0047389621672855, "grad_norm": 0.20508127207154292, "learning_rate": 4.115712479940821e-06, "loss": 0.1717, "step": 9510 }, { "epoch": 3.007898270278809, "grad_norm": 0.19947206755336946, "learning_rate": 4.10485917723641e-06, "loss": 0.1711, "step": 9520 }, { "epoch": 3.0110575783903326, "grad_norm": 0.19753840539867382, "learning_rate": 4.0940102301418375e-06, "loss": 0.1721, "step": 9530 }, { "epoch": 3.014216886501856, "grad_norm": 0.1992468045730409, "learning_rate": 4.083165691446314e-06, "loss": 0.1719, "step": 9540 }, { "epoch": 3.0173761946133797, "grad_norm": 0.20255881958221392, "learning_rate": 4.072325613917605e-06, "loss": 0.1719, "step": 9550 }, { "epoch": 3.0205355027249032, "grad_norm": 0.1899039859129075, "learning_rate": 4.061490050301767e-06, "loss": 0.1699, "step": 9560 }, { "epoch": 3.0236948108364268, "grad_norm": 0.20612020049628732, "learning_rate": 4.050659053322892e-06, "loss": 0.1714, "step": 9570 }, { "epoch": 3.0268541189479503, "grad_norm": 0.21789263057401254, "learning_rate": 4.039832675682854e-06, "loss": 0.1723, "step": 9580 }, { "epoch": 3.030013427059474, "grad_norm": 0.2026996020270893, "learning_rate": 4.0290109700610445e-06, "loss": 0.17, "step": 9590 }, { "epoch": 3.0331727351709974, "grad_norm": 0.19422527184239666, "learning_rate": 4.0181939891141276e-06, "loss": 0.1715, "step": 9600 }, { "epoch": 3.036332043282521, "grad_norm": 0.19811136542886754, "learning_rate": 4.007381785475776e-06, "loss": 0.1707, "step": 9610 }, { "epoch": 3.039491351394045, "grad_norm": 0.21473702063694924, "learning_rate": 3.996574411756412e-06, "loss": 0.1717, "step": 9620 }, { "epoch": 3.0426506595055685, "grad_norm": 0.20093271189509854, "learning_rate": 3.9857719205429666e-06, "loss": 0.1698, "step": 9630 }, { "epoch": 3.045809967617092, "grad_norm": 0.2054432390601117, "learning_rate": 3.974974364398604e-06, "loss": 0.1722, "step": 9640 }, { "epoch": 3.0489692757286155, "grad_norm": 0.21106353915988532, "learning_rate": 3.964181795862476e-06, "loss": 0.1702, "step": 9650 }, { "epoch": 3.052128583840139, "grad_norm": 0.20956851733321105, "learning_rate": 3.9533942674494736e-06, "loss": 0.1712, "step": 9660 }, { "epoch": 3.0552878919516626, "grad_norm": 0.2151757551569205, "learning_rate": 3.942611831649953e-06, "loss": 0.1723, "step": 9670 }, { "epoch": 3.058447200063186, "grad_norm": 0.20087929271216232, "learning_rate": 3.931834540929498e-06, "loss": 0.1729, "step": 9680 }, { "epoch": 3.0616065081747097, "grad_norm": 0.19812955505838692, "learning_rate": 3.9210624477286545e-06, "loss": 0.1702, "step": 9690 }, { "epoch": 3.0647658162862332, "grad_norm": 0.19934250074039184, "learning_rate": 3.910295604462675e-06, "loss": 0.1718, "step": 9700 }, { "epoch": 3.0679251243977568, "grad_norm": 0.22292842551894693, "learning_rate": 3.899534063521274e-06, "loss": 0.1703, "step": 9710 }, { "epoch": 3.0710844325092803, "grad_norm": 0.20621823300766834, "learning_rate": 3.888777877268361e-06, "loss": 0.1718, "step": 9720 }, { "epoch": 3.074243740620804, "grad_norm": 0.20627308571255917, "learning_rate": 3.8780270980417865e-06, "loss": 0.1715, "step": 9730 }, { "epoch": 3.077403048732328, "grad_norm": 0.20092589221833093, "learning_rate": 3.867281778153103e-06, "loss": 0.1708, "step": 9740 }, { "epoch": 3.0805623568438514, "grad_norm": 0.20178715984754905, "learning_rate": 3.856541969887284e-06, "loss": 0.1713, "step": 9750 }, { "epoch": 3.083721664955375, "grad_norm": 0.20333774590909526, "learning_rate": 3.8458077255024985e-06, "loss": 0.1711, "step": 9760 }, { "epoch": 3.0868809730668985, "grad_norm": 0.19812370974809254, "learning_rate": 3.835079097229834e-06, "loss": 0.1716, "step": 9770 }, { "epoch": 3.090040281178422, "grad_norm": 0.19168564591899634, "learning_rate": 3.82435613727305e-06, "loss": 0.1712, "step": 9780 }, { "epoch": 3.0931995892899455, "grad_norm": 0.20301128877899655, "learning_rate": 3.8136388978083318e-06, "loss": 0.1717, "step": 9790 }, { "epoch": 3.096358897401469, "grad_norm": 0.21119806157032167, "learning_rate": 3.802927430984024e-06, "loss": 0.1713, "step": 9800 }, { "epoch": 3.0995182055129926, "grad_norm": 0.19958619654256107, "learning_rate": 3.7922217889203815e-06, "loss": 0.1729, "step": 9810 }, { "epoch": 3.102677513624516, "grad_norm": 0.1968913471359473, "learning_rate": 3.781522023709325e-06, "loss": 0.172, "step": 9820 }, { "epoch": 3.1058368217360397, "grad_norm": 0.20650603439149065, "learning_rate": 3.770828187414169e-06, "loss": 0.1714, "step": 9830 }, { "epoch": 3.1089961298475632, "grad_norm": 0.20480879307691913, "learning_rate": 3.7601403320693877e-06, "loss": 0.1731, "step": 9840 }, { "epoch": 3.112155437959087, "grad_norm": 0.19841361175349975, "learning_rate": 3.7494585096803475e-06, "loss": 0.17, "step": 9850 }, { "epoch": 3.1153147460706103, "grad_norm": 0.20023910284222765, "learning_rate": 3.7387827722230592e-06, "loss": 0.1719, "step": 9860 }, { "epoch": 3.1184740541821343, "grad_norm": 0.20538502307132153, "learning_rate": 3.72811317164393e-06, "loss": 0.1714, "step": 9870 }, { "epoch": 3.121633362293658, "grad_norm": 0.2025961874013164, "learning_rate": 3.7174497598595004e-06, "loss": 0.1731, "step": 9880 }, { "epoch": 3.1247926704051814, "grad_norm": 0.19897689525350173, "learning_rate": 3.7067925887562035e-06, "loss": 0.1709, "step": 9890 }, { "epoch": 3.127951978516705, "grad_norm": 0.19674997421298326, "learning_rate": 3.6961417101901004e-06, "loss": 0.1709, "step": 9900 }, { "epoch": 3.1311112866282285, "grad_norm": 0.20048513567462214, "learning_rate": 3.6854971759866343e-06, "loss": 0.168, "step": 9910 }, { "epoch": 3.134270594739752, "grad_norm": 0.20877865427323217, "learning_rate": 3.6748590379403837e-06, "loss": 0.1699, "step": 9920 }, { "epoch": 3.1374299028512755, "grad_norm": 0.20947180408342017, "learning_rate": 3.664227347814796e-06, "loss": 0.1718, "step": 9930 }, { "epoch": 3.140589210962799, "grad_norm": 0.19984590881674016, "learning_rate": 3.653602157341953e-06, "loss": 0.1744, "step": 9940 }, { "epoch": 3.1437485190743226, "grad_norm": 0.20244914750147294, "learning_rate": 3.6429835182223028e-06, "loss": 0.1701, "step": 9950 }, { "epoch": 3.146907827185846, "grad_norm": 0.20651719839215915, "learning_rate": 3.632371482124416e-06, "loss": 0.1722, "step": 9960 }, { "epoch": 3.1500671352973697, "grad_norm": 0.21320130534174414, "learning_rate": 3.621766100684742e-06, "loss": 0.1719, "step": 9970 }, { "epoch": 3.1532264434088937, "grad_norm": 0.1982018761614747, "learning_rate": 3.6111674255073415e-06, "loss": 0.1697, "step": 9980 }, { "epoch": 3.1563857515204172, "grad_norm": 0.19540192155635913, "learning_rate": 3.600575508163643e-06, "loss": 0.1716, "step": 9990 }, { "epoch": 3.1595450596319408, "grad_norm": 0.19265346677541417, "learning_rate": 3.5899904001922014e-06, "loss": 0.1723, "step": 10000 }, { "epoch": 3.1627043677434643, "grad_norm": 0.2602993365623357, "learning_rate": 3.579412153098428e-06, "loss": 0.1717, "step": 10010 }, { "epoch": 3.165863675854988, "grad_norm": 0.2072576388794693, "learning_rate": 3.568840818354359e-06, "loss": 0.1705, "step": 10020 }, { "epoch": 3.1690229839665114, "grad_norm": 0.20292312081490704, "learning_rate": 3.5582764473983898e-06, "loss": 0.1708, "step": 10030 }, { "epoch": 3.172182292078035, "grad_norm": 0.19839564056735176, "learning_rate": 3.5477190916350314e-06, "loss": 0.173, "step": 10040 }, { "epoch": 3.1753416001895585, "grad_norm": 0.21059840553848913, "learning_rate": 3.5371688024346663e-06, "loss": 0.1728, "step": 10050 }, { "epoch": 3.178500908301082, "grad_norm": 0.19127556281230237, "learning_rate": 3.5266256311332838e-06, "loss": 0.1717, "step": 10060 }, { "epoch": 3.1816602164126055, "grad_norm": 0.20280055781097764, "learning_rate": 3.5160896290322466e-06, "loss": 0.1718, "step": 10070 }, { "epoch": 3.184819524524129, "grad_norm": 0.19870253928511597, "learning_rate": 3.5055608473980275e-06, "loss": 0.173, "step": 10080 }, { "epoch": 3.1879788326356526, "grad_norm": 0.20124551003316218, "learning_rate": 3.495039337461966e-06, "loss": 0.1714, "step": 10090 }, { "epoch": 3.191138140747176, "grad_norm": 0.1974863499022992, "learning_rate": 3.484525150420024e-06, "loss": 0.1727, "step": 10100 }, { "epoch": 3.1942974488587, "grad_norm": 0.2056406098760328, "learning_rate": 3.474018337432526e-06, "loss": 0.1711, "step": 10110 }, { "epoch": 3.1974567569702237, "grad_norm": 0.1993829016013529, "learning_rate": 3.4635189496239147e-06, "loss": 0.1723, "step": 10120 }, { "epoch": 3.2006160650817472, "grad_norm": 0.20235980644501536, "learning_rate": 3.4530270380825106e-06, "loss": 0.1719, "step": 10130 }, { "epoch": 3.2037753731932708, "grad_norm": 0.21045502485536316, "learning_rate": 3.442542653860246e-06, "loss": 0.1728, "step": 10140 }, { "epoch": 3.2069346813047943, "grad_norm": 0.1983591302141842, "learning_rate": 3.4320658479724358e-06, "loss": 0.1714, "step": 10150 }, { "epoch": 3.210093989416318, "grad_norm": 0.20220358320589318, "learning_rate": 3.4215966713975137e-06, "loss": 0.1721, "step": 10160 }, { "epoch": 3.2132532975278414, "grad_norm": 0.20148720905716092, "learning_rate": 3.41113517507679e-06, "loss": 0.1722, "step": 10170 }, { "epoch": 3.216412605639365, "grad_norm": 0.21958364967195199, "learning_rate": 3.400681409914211e-06, "loss": 0.1717, "step": 10180 }, { "epoch": 3.2195719137508885, "grad_norm": 0.20443876928391433, "learning_rate": 3.390235426776095e-06, "loss": 0.1723, "step": 10190 }, { "epoch": 3.222731221862412, "grad_norm": 0.20468900776748283, "learning_rate": 3.3797972764909044e-06, "loss": 0.1728, "step": 10200 }, { "epoch": 3.2258905299739355, "grad_norm": 0.20712330871995108, "learning_rate": 3.3693670098489794e-06, "loss": 0.1717, "step": 10210 }, { "epoch": 3.229049838085459, "grad_norm": 0.20539340759709276, "learning_rate": 3.3589446776023026e-06, "loss": 0.1735, "step": 10220 }, { "epoch": 3.2322091461969826, "grad_norm": 0.20853940556733697, "learning_rate": 3.3485303304642523e-06, "loss": 0.1734, "step": 10230 }, { "epoch": 3.2353684543085066, "grad_norm": 0.20622108613855025, "learning_rate": 3.338124019109348e-06, "loss": 0.1731, "step": 10240 }, { "epoch": 3.23852776242003, "grad_norm": 0.20725725534935868, "learning_rate": 3.3277257941730112e-06, "loss": 0.1701, "step": 10250 }, { "epoch": 3.2416870705315537, "grad_norm": 0.20483333136083265, "learning_rate": 3.3173357062513156e-06, "loss": 0.1726, "step": 10260 }, { "epoch": 3.2448463786430772, "grad_norm": 0.19730208138726862, "learning_rate": 3.30695380590074e-06, "loss": 0.1719, "step": 10270 }, { "epoch": 3.2480056867546008, "grad_norm": 0.19161529058623247, "learning_rate": 3.2965801436379268e-06, "loss": 0.1703, "step": 10280 }, { "epoch": 3.2511649948661243, "grad_norm": 0.20183756272218625, "learning_rate": 3.2862147699394308e-06, "loss": 0.1707, "step": 10290 }, { "epoch": 3.254324302977648, "grad_norm": 0.2077259975279863, "learning_rate": 3.2758577352414746e-06, "loss": 0.1724, "step": 10300 }, { "epoch": 3.2574836110891714, "grad_norm": 0.19909554676713112, "learning_rate": 3.2655090899397104e-06, "loss": 0.1727, "step": 10310 }, { "epoch": 3.260642919200695, "grad_norm": 0.19458384557088879, "learning_rate": 3.255168884388962e-06, "loss": 0.1706, "step": 10320 }, { "epoch": 3.2638022273122185, "grad_norm": 0.20300444839784934, "learning_rate": 3.2448371689029917e-06, "loss": 0.17, "step": 10330 }, { "epoch": 3.266961535423742, "grad_norm": 0.2037381038102816, "learning_rate": 3.2345139937542493e-06, "loss": 0.1707, "step": 10340 }, { "epoch": 3.270120843535266, "grad_norm": 0.20261293016006007, "learning_rate": 3.2241994091736264e-06, "loss": 0.1716, "step": 10350 }, { "epoch": 3.2732801516467895, "grad_norm": 0.20445117127878168, "learning_rate": 3.2138934653502157e-06, "loss": 0.1715, "step": 10360 }, { "epoch": 3.276439459758313, "grad_norm": 0.20006168658634946, "learning_rate": 3.2035962124310677e-06, "loss": 0.1699, "step": 10370 }, { "epoch": 3.2795987678698366, "grad_norm": 0.2048357399994623, "learning_rate": 3.1933077005209413e-06, "loss": 0.1714, "step": 10380 }, { "epoch": 3.28275807598136, "grad_norm": 0.20764582211703872, "learning_rate": 3.1830279796820655e-06, "loss": 0.1726, "step": 10390 }, { "epoch": 3.2859173840928837, "grad_norm": 0.19507013108646712, "learning_rate": 3.17275709993389e-06, "loss": 0.1686, "step": 10400 }, { "epoch": 3.2890766922044072, "grad_norm": 0.20376140005700832, "learning_rate": 3.1624951112528486e-06, "loss": 0.1727, "step": 10410 }, { "epoch": 3.2922360003159308, "grad_norm": 0.19814895837042856, "learning_rate": 3.152242063572111e-06, "loss": 0.172, "step": 10420 }, { "epoch": 3.2953953084274543, "grad_norm": 0.20809655011071984, "learning_rate": 3.1419980067813416e-06, "loss": 0.1723, "step": 10430 }, { "epoch": 3.298554616538978, "grad_norm": 0.2130288394869752, "learning_rate": 3.131762990726457e-06, "loss": 0.1693, "step": 10440 }, { "epoch": 3.3017139246505014, "grad_norm": 0.20474107277059672, "learning_rate": 3.1215370652093817e-06, "loss": 0.1728, "step": 10450 }, { "epoch": 3.304873232762025, "grad_norm": 0.19685492816170327, "learning_rate": 3.1113202799878104e-06, "loss": 0.1736, "step": 10460 }, { "epoch": 3.3080325408735485, "grad_norm": 0.20719245855817442, "learning_rate": 3.1011126847749573e-06, "loss": 0.1718, "step": 10470 }, { "epoch": 3.3111918489850725, "grad_norm": 0.19594664304604262, "learning_rate": 3.090914329239325e-06, "loss": 0.1705, "step": 10480 }, { "epoch": 3.314351157096596, "grad_norm": 0.19759328866966638, "learning_rate": 3.0807252630044535e-06, "loss": 0.1738, "step": 10490 }, { "epoch": 3.3175104652081195, "grad_norm": 0.2039023386475025, "learning_rate": 3.0705455356486847e-06, "loss": 0.1709, "step": 10500 }, { "epoch": 3.320669773319643, "grad_norm": 0.20102237377028126, "learning_rate": 3.0603751967049196e-06, "loss": 0.1731, "step": 10510 }, { "epoch": 3.3238290814311666, "grad_norm": 0.20154946206713367, "learning_rate": 3.050214295660373e-06, "loss": 0.1744, "step": 10520 }, { "epoch": 3.32698838954269, "grad_norm": 0.20754788610456004, "learning_rate": 3.0400628819563394e-06, "loss": 0.1725, "step": 10530 }, { "epoch": 3.3301476976542137, "grad_norm": 0.21440535676787434, "learning_rate": 3.02992100498795e-06, "loss": 0.1711, "step": 10540 }, { "epoch": 3.3333070057657372, "grad_norm": 0.1985749560929464, "learning_rate": 3.0197887141039295e-06, "loss": 0.1716, "step": 10550 }, { "epoch": 3.336466313877261, "grad_norm": 0.20486416168285063, "learning_rate": 3.009666058606361e-06, "loss": 0.1712, "step": 10560 }, { "epoch": 3.3396256219887843, "grad_norm": 0.20464416545173006, "learning_rate": 2.999553087750441e-06, "loss": 0.1715, "step": 10570 }, { "epoch": 3.342784930100308, "grad_norm": 0.19277745618987774, "learning_rate": 2.9894498507442403e-06, "loss": 0.1696, "step": 10580 }, { "epoch": 3.345944238211832, "grad_norm": 0.19243002023701336, "learning_rate": 2.979356396748474e-06, "loss": 0.1722, "step": 10590 }, { "epoch": 3.349103546323355, "grad_norm": 0.184939999971442, "learning_rate": 2.969272774876246e-06, "loss": 0.1704, "step": 10600 }, { "epoch": 3.352262854434879, "grad_norm": 0.20963451528121682, "learning_rate": 2.9591990341928233e-06, "loss": 0.172, "step": 10610 }, { "epoch": 3.3554221625464025, "grad_norm": 0.21203412019436482, "learning_rate": 2.9491352237153924e-06, "loss": 0.1719, "step": 10620 }, { "epoch": 3.358581470657926, "grad_norm": 0.19976350882936353, "learning_rate": 2.9390813924128187e-06, "loss": 0.1716, "step": 10630 }, { "epoch": 3.3617407787694495, "grad_norm": 0.19765127820077447, "learning_rate": 2.9290375892054145e-06, "loss": 0.1719, "step": 10640 }, { "epoch": 3.364900086880973, "grad_norm": 0.19819054962322868, "learning_rate": 2.9190038629646928e-06, "loss": 0.1718, "step": 10650 }, { "epoch": 3.3680593949924966, "grad_norm": 0.2018843941060178, "learning_rate": 2.9089802625131357e-06, "loss": 0.1715, "step": 10660 }, { "epoch": 3.37121870310402, "grad_norm": 0.2053114073761089, "learning_rate": 2.898966836623956e-06, "loss": 0.1712, "step": 10670 }, { "epoch": 3.3743780112155437, "grad_norm": 0.1834559540518353, "learning_rate": 2.888963634020856e-06, "loss": 0.1718, "step": 10680 }, { "epoch": 3.3775373193270672, "grad_norm": 0.21217338223459672, "learning_rate": 2.8789707033777958e-06, "loss": 0.17, "step": 10690 }, { "epoch": 3.380696627438591, "grad_norm": 0.20413368942330248, "learning_rate": 2.868988093318755e-06, "loss": 0.17, "step": 10700 }, { "epoch": 3.3838559355501143, "grad_norm": 0.20529368260563738, "learning_rate": 2.8590158524174847e-06, "loss": 0.1706, "step": 10710 }, { "epoch": 3.3870152436616383, "grad_norm": 0.1996839471001918, "learning_rate": 2.849054029197299e-06, "loss": 0.1728, "step": 10720 }, { "epoch": 3.390174551773162, "grad_norm": 0.21107475110462812, "learning_rate": 2.8391026721308048e-06, "loss": 0.1726, "step": 10730 }, { "epoch": 3.3933338598846854, "grad_norm": 0.1977207222688015, "learning_rate": 2.8291618296396906e-06, "loss": 0.1717, "step": 10740 }, { "epoch": 3.396493167996209, "grad_norm": 0.20112571153294398, "learning_rate": 2.819231550094482e-06, "loss": 0.171, "step": 10750 }, { "epoch": 3.3996524761077325, "grad_norm": 0.1998438118160792, "learning_rate": 2.8093118818143054e-06, "loss": 0.1714, "step": 10760 }, { "epoch": 3.402811784219256, "grad_norm": 0.20497839555948202, "learning_rate": 2.799402873066657e-06, "loss": 0.1718, "step": 10770 }, { "epoch": 3.4059710923307795, "grad_norm": 0.20138016538674214, "learning_rate": 2.789504572067163e-06, "loss": 0.1723, "step": 10780 }, { "epoch": 3.409130400442303, "grad_norm": 0.2119166594988366, "learning_rate": 2.7796170269793448e-06, "loss": 0.1714, "step": 10790 }, { "epoch": 3.4122897085538266, "grad_norm": 0.207710303365895, "learning_rate": 2.7697402859143973e-06, "loss": 0.1731, "step": 10800 }, { "epoch": 3.41544901666535, "grad_norm": 0.2002743432404299, "learning_rate": 2.7598743969309323e-06, "loss": 0.1705, "step": 10810 }, { "epoch": 3.4186083247768737, "grad_norm": 0.1979710890338057, "learning_rate": 2.7500194080347652e-06, "loss": 0.1698, "step": 10820 }, { "epoch": 3.4217676328883972, "grad_norm": 0.20374184595460798, "learning_rate": 2.740175367178671e-06, "loss": 0.1731, "step": 10830 }, { "epoch": 3.424926940999921, "grad_norm": 0.21111483778852924, "learning_rate": 2.7303423222621532e-06, "loss": 0.1712, "step": 10840 }, { "epoch": 3.4280862491114448, "grad_norm": 0.209638712778081, "learning_rate": 2.7205203211312113e-06, "loss": 0.1695, "step": 10850 }, { "epoch": 3.4312455572229683, "grad_norm": 0.20749198974074953, "learning_rate": 2.710709411578108e-06, "loss": 0.1701, "step": 10860 }, { "epoch": 3.434404865334492, "grad_norm": 0.20185076643899905, "learning_rate": 2.700909641341136e-06, "loss": 0.1716, "step": 10870 }, { "epoch": 3.4375641734460154, "grad_norm": 0.20360708501600241, "learning_rate": 2.6911210581043827e-06, "loss": 0.1717, "step": 10880 }, { "epoch": 3.440723481557539, "grad_norm": 0.19759420416489065, "learning_rate": 2.6813437094975058e-06, "loss": 0.1702, "step": 10890 }, { "epoch": 3.4438827896690625, "grad_norm": 0.21258398444562132, "learning_rate": 2.6715776430954948e-06, "loss": 0.1712, "step": 10900 }, { "epoch": 3.447042097780586, "grad_norm": 0.204687059081065, "learning_rate": 2.661822906418443e-06, "loss": 0.1713, "step": 10910 }, { "epoch": 3.4502014058921096, "grad_norm": 0.21216451437708195, "learning_rate": 2.652079546931314e-06, "loss": 0.172, "step": 10920 }, { "epoch": 3.453360714003633, "grad_norm": 0.20495187704615014, "learning_rate": 2.642347612043713e-06, "loss": 0.172, "step": 10930 }, { "epoch": 3.4565200221151566, "grad_norm": 0.20577240721425508, "learning_rate": 2.632627149109653e-06, "loss": 0.1724, "step": 10940 }, { "epoch": 3.45967933022668, "grad_norm": 0.20332569317373442, "learning_rate": 2.622918205427332e-06, "loss": 0.1728, "step": 10950 }, { "epoch": 3.462838638338204, "grad_norm": 0.20833963528994082, "learning_rate": 2.613220828238887e-06, "loss": 0.1723, "step": 10960 }, { "epoch": 3.4659979464497277, "grad_norm": 0.20062775671760197, "learning_rate": 2.6035350647301825e-06, "loss": 0.1697, "step": 10970 }, { "epoch": 3.4691572545612512, "grad_norm": 0.1925666545023208, "learning_rate": 2.5938609620305697e-06, "loss": 0.1721, "step": 10980 }, { "epoch": 3.4723165626727748, "grad_norm": 0.20615750012695935, "learning_rate": 2.584198567212663e-06, "loss": 0.1693, "step": 10990 }, { "epoch": 3.4754758707842983, "grad_norm": 0.205868926105775, "learning_rate": 2.5745479272921035e-06, "loss": 0.1715, "step": 11000 }, { "epoch": 3.478635178895822, "grad_norm": 0.19907532345392218, "learning_rate": 2.5649090892273394e-06, "loss": 0.1697, "step": 11010 }, { "epoch": 3.4817944870073454, "grad_norm": 0.20241604647559086, "learning_rate": 2.5552820999193893e-06, "loss": 0.1714, "step": 11020 }, { "epoch": 3.484953795118869, "grad_norm": 0.19925318676474033, "learning_rate": 2.5456670062116227e-06, "loss": 0.1702, "step": 11030 }, { "epoch": 3.4881131032303925, "grad_norm": 0.20118707025616442, "learning_rate": 2.5360638548895177e-06, "loss": 0.1687, "step": 11040 }, { "epoch": 3.491272411341916, "grad_norm": 0.21011772565832204, "learning_rate": 2.526472692680455e-06, "loss": 0.1723, "step": 11050 }, { "epoch": 3.4944317194534396, "grad_norm": 0.2114462945794177, "learning_rate": 2.5168935662534676e-06, "loss": 0.1713, "step": 11060 }, { "epoch": 3.497591027564963, "grad_norm": 0.20322439000336912, "learning_rate": 2.507326522219031e-06, "loss": 0.1722, "step": 11070 }, { "epoch": 3.5007503356764866, "grad_norm": 0.20423924785828376, "learning_rate": 2.497771607128826e-06, "loss": 0.1711, "step": 11080 }, { "epoch": 3.5039096437880106, "grad_norm": 0.21367822416543628, "learning_rate": 2.4882288674755196e-06, "loss": 0.1702, "step": 11090 }, { "epoch": 3.507068951899534, "grad_norm": 0.20077382774697636, "learning_rate": 2.4786983496925273e-06, "loss": 0.1723, "step": 11100 }, { "epoch": 3.5102282600110577, "grad_norm": 0.21321414495009436, "learning_rate": 2.4691801001538083e-06, "loss": 0.1696, "step": 11110 }, { "epoch": 3.5133875681225812, "grad_norm": 0.203537804839117, "learning_rate": 2.459674165173611e-06, "loss": 0.1698, "step": 11120 }, { "epoch": 3.516546876234105, "grad_norm": 0.2009040955436558, "learning_rate": 2.450180591006278e-06, "loss": 0.1716, "step": 11130 }, { "epoch": 3.5197061843456283, "grad_norm": 0.20513705543314245, "learning_rate": 2.440699423845994e-06, "loss": 0.1721, "step": 11140 }, { "epoch": 3.522865492457152, "grad_norm": 0.19738195470784428, "learning_rate": 2.43123070982658e-06, "loss": 0.1716, "step": 11150 }, { "epoch": 3.5260248005686754, "grad_norm": 0.19923618992627787, "learning_rate": 2.4217744950212603e-06, "loss": 0.1722, "step": 11160 }, { "epoch": 3.529184108680199, "grad_norm": 0.19543045451037108, "learning_rate": 2.4123308254424397e-06, "loss": 0.1722, "step": 11170 }, { "epoch": 3.5323434167917225, "grad_norm": 0.20098623956184636, "learning_rate": 2.4028997470414813e-06, "loss": 0.1721, "step": 11180 }, { "epoch": 3.535502724903246, "grad_norm": 0.19666520667412346, "learning_rate": 2.393481305708481e-06, "loss": 0.1718, "step": 11190 }, { "epoch": 3.53866203301477, "grad_norm": 0.20769071659100188, "learning_rate": 2.38407554727204e-06, "loss": 0.1721, "step": 11200 }, { "epoch": 3.541821341126293, "grad_norm": 0.20902129214374304, "learning_rate": 2.3746825174990586e-06, "loss": 0.1734, "step": 11210 }, { "epoch": 3.544980649237817, "grad_norm": 0.18991888715497546, "learning_rate": 2.365302262094485e-06, "loss": 0.1718, "step": 11220 }, { "epoch": 3.5481399573493406, "grad_norm": 0.18872022535120644, "learning_rate": 2.3559348267011265e-06, "loss": 0.1717, "step": 11230 }, { "epoch": 3.551299265460864, "grad_norm": 0.2032395005721165, "learning_rate": 2.3465802568993974e-06, "loss": 0.1696, "step": 11240 }, { "epoch": 3.5544585735723877, "grad_norm": 0.20260872935920152, "learning_rate": 2.3372385982071155e-06, "loss": 0.1699, "step": 11250 }, { "epoch": 3.5576178816839112, "grad_norm": 0.19808829127100105, "learning_rate": 2.3279098960792743e-06, "loss": 0.1693, "step": 11260 }, { "epoch": 3.560777189795435, "grad_norm": 0.20833962705825296, "learning_rate": 2.318594195907826e-06, "loss": 0.1716, "step": 11270 }, { "epoch": 3.5639364979069583, "grad_norm": 0.20099933117353708, "learning_rate": 2.3092915430214486e-06, "loss": 0.171, "step": 11280 }, { "epoch": 3.567095806018482, "grad_norm": 0.20739851880627683, "learning_rate": 2.3000019826853464e-06, "loss": 0.1693, "step": 11290 }, { "epoch": 3.5702551141300054, "grad_norm": 0.19883060983816717, "learning_rate": 2.2907255601010048e-06, "loss": 0.1706, "step": 11300 }, { "epoch": 3.573414422241529, "grad_norm": 0.200914416630117, "learning_rate": 2.2814623204059954e-06, "loss": 0.1705, "step": 11310 }, { "epoch": 3.5765737303530525, "grad_norm": 0.18451322741288337, "learning_rate": 2.272212308673733e-06, "loss": 0.1702, "step": 11320 }, { "epoch": 3.5797330384645765, "grad_norm": 0.20604109813637425, "learning_rate": 2.262975569913274e-06, "loss": 0.1716, "step": 11330 }, { "epoch": 3.5828923465760996, "grad_norm": 0.20483250038192008, "learning_rate": 2.2537521490690885e-06, "loss": 0.1692, "step": 11340 }, { "epoch": 3.5860516546876235, "grad_norm": 0.20171939003906209, "learning_rate": 2.2445420910208444e-06, "loss": 0.1687, "step": 11350 }, { "epoch": 3.589210962799147, "grad_norm": 0.20410002768909777, "learning_rate": 2.2353454405831878e-06, "loss": 0.1681, "step": 11360 }, { "epoch": 3.5923702709106706, "grad_norm": 0.1943525698679139, "learning_rate": 2.2261622425055275e-06, "loss": 0.1726, "step": 11370 }, { "epoch": 3.595529579022194, "grad_norm": 0.19993993998805085, "learning_rate": 2.2169925414718084e-06, "loss": 0.1719, "step": 11380 }, { "epoch": 3.5986888871337177, "grad_norm": 0.20390963358156805, "learning_rate": 2.207836382100314e-06, "loss": 0.1701, "step": 11390 }, { "epoch": 3.6018481952452412, "grad_norm": 0.2052658002840104, "learning_rate": 2.1986938089434217e-06, "loss": 0.1715, "step": 11400 }, { "epoch": 3.605007503356765, "grad_norm": 0.2050183493439441, "learning_rate": 2.1895648664874107e-06, "loss": 0.1719, "step": 11410 }, { "epoch": 3.6081668114682883, "grad_norm": 0.2133941623970417, "learning_rate": 2.1804495991522312e-06, "loss": 0.1704, "step": 11420 }, { "epoch": 3.611326119579812, "grad_norm": 0.19332656069708545, "learning_rate": 2.171348051291293e-06, "loss": 0.1681, "step": 11430 }, { "epoch": 3.614485427691336, "grad_norm": 0.1933434601782223, "learning_rate": 2.1622602671912507e-06, "loss": 0.1704, "step": 11440 }, { "epoch": 3.617644735802859, "grad_norm": 0.1983561300379488, "learning_rate": 2.1531862910717864e-06, "loss": 0.1706, "step": 11450 }, { "epoch": 3.620804043914383, "grad_norm": 0.1979155158963241, "learning_rate": 2.1441261670853886e-06, "loss": 0.1686, "step": 11460 }, { "epoch": 3.6239633520259065, "grad_norm": 0.19862020700111335, "learning_rate": 2.1350799393171565e-06, "loss": 0.1729, "step": 11470 }, { "epoch": 3.62712266013743, "grad_norm": 0.19911281754040644, "learning_rate": 2.1260476517845573e-06, "loss": 0.1715, "step": 11480 }, { "epoch": 3.6302819682489536, "grad_norm": 0.19710089391849986, "learning_rate": 2.117029348437243e-06, "loss": 0.1713, "step": 11490 }, { "epoch": 3.633441276360477, "grad_norm": 0.2043341418376342, "learning_rate": 2.108025073156806e-06, "loss": 0.1719, "step": 11500 }, { "epoch": 3.6366005844720006, "grad_norm": 0.22322690107677232, "learning_rate": 2.09903486975659e-06, "loss": 0.1729, "step": 11510 }, { "epoch": 3.639759892583524, "grad_norm": 0.20236434961535074, "learning_rate": 2.090058781981464e-06, "loss": 0.1711, "step": 11520 }, { "epoch": 3.6429192006950477, "grad_norm": 0.2123665454455019, "learning_rate": 2.0810968535076126e-06, "loss": 0.1701, "step": 11530 }, { "epoch": 3.6460785088065713, "grad_norm": 0.20276458027943825, "learning_rate": 2.0721491279423246e-06, "loss": 0.1716, "step": 11540 }, { "epoch": 3.649237816918095, "grad_norm": 0.18698069166162326, "learning_rate": 2.063215648823781e-06, "loss": 0.1682, "step": 11550 }, { "epoch": 3.6523971250296183, "grad_norm": 0.19593671814658578, "learning_rate": 2.0542964596208344e-06, "loss": 0.1704, "step": 11560 }, { "epoch": 3.6555564331411423, "grad_norm": 0.18813913173493607, "learning_rate": 2.0453916037328174e-06, "loss": 0.1727, "step": 11570 }, { "epoch": 3.6587157412526654, "grad_norm": 0.2012970901960924, "learning_rate": 2.036501124489308e-06, "loss": 0.1703, "step": 11580 }, { "epoch": 3.6618750493641894, "grad_norm": 0.20717799227869307, "learning_rate": 2.0276250651499346e-06, "loss": 0.1706, "step": 11590 }, { "epoch": 3.665034357475713, "grad_norm": 0.197037281134198, "learning_rate": 2.0187634689041603e-06, "loss": 0.1715, "step": 11600 }, { "epoch": 3.6681936655872365, "grad_norm": 0.20136557601918076, "learning_rate": 2.009916378871074e-06, "loss": 0.1709, "step": 11610 }, { "epoch": 3.67135297369876, "grad_norm": 0.20329881900316946, "learning_rate": 2.0010838380991776e-06, "loss": 0.1703, "step": 11620 }, { "epoch": 3.6745122818102836, "grad_norm": 0.20458710965310886, "learning_rate": 1.9922658895661816e-06, "loss": 0.1715, "step": 11630 }, { "epoch": 3.677671589921807, "grad_norm": 0.19784202689125777, "learning_rate": 1.983462576178786e-06, "loss": 0.1715, "step": 11640 }, { "epoch": 3.6808308980333306, "grad_norm": 0.19664140408499303, "learning_rate": 1.9746739407724913e-06, "loss": 0.1707, "step": 11650 }, { "epoch": 3.683990206144854, "grad_norm": 0.20357355393106222, "learning_rate": 1.965900026111364e-06, "loss": 0.1682, "step": 11660 }, { "epoch": 3.6871495142563777, "grad_norm": 0.19933592060492888, "learning_rate": 1.9571408748878495e-06, "loss": 0.1688, "step": 11670 }, { "epoch": 3.6903088223679017, "grad_norm": 0.19698317753323233, "learning_rate": 1.9483965297225545e-06, "loss": 0.1708, "step": 11680 }, { "epoch": 3.693468130479425, "grad_norm": 0.19171609705724965, "learning_rate": 1.9396670331640427e-06, "loss": 0.1714, "step": 11690 }, { "epoch": 3.6966274385909488, "grad_norm": 0.20128320688947327, "learning_rate": 1.930952427688626e-06, "loss": 0.1699, "step": 11700 }, { "epoch": 3.6997867467024723, "grad_norm": 0.19818571115089004, "learning_rate": 1.9222527557001587e-06, "loss": 0.1726, "step": 11710 }, { "epoch": 3.702946054813996, "grad_norm": 0.20461949225316534, "learning_rate": 1.913568059529832e-06, "loss": 0.1708, "step": 11720 }, { "epoch": 3.7061053629255194, "grad_norm": 0.1969396016325418, "learning_rate": 1.9048983814359684e-06, "loss": 0.1726, "step": 11730 }, { "epoch": 3.709264671037043, "grad_norm": 0.20508478156000626, "learning_rate": 1.8962437636038095e-06, "loss": 0.171, "step": 11740 }, { "epoch": 3.7124239791485665, "grad_norm": 0.19143862052589067, "learning_rate": 1.8876042481453222e-06, "loss": 0.1703, "step": 11750 }, { "epoch": 3.71558328726009, "grad_norm": 0.19462055997090028, "learning_rate": 1.8789798770989841e-06, "loss": 0.1695, "step": 11760 }, { "epoch": 3.7187425953716136, "grad_norm": 0.20047619813511314, "learning_rate": 1.870370692429585e-06, "loss": 0.169, "step": 11770 }, { "epoch": 3.721901903483137, "grad_norm": 0.1997610549023301, "learning_rate": 1.8617767360280182e-06, "loss": 0.1722, "step": 11780 }, { "epoch": 3.7250612115946606, "grad_norm": 0.2040821932549548, "learning_rate": 1.8531980497110803e-06, "loss": 0.1715, "step": 11790 }, { "epoch": 3.728220519706184, "grad_norm": 0.20953839611305236, "learning_rate": 1.8446346752212662e-06, "loss": 0.1723, "step": 11800 }, { "epoch": 3.731379827817708, "grad_norm": 0.19797718998398306, "learning_rate": 1.8360866542265626e-06, "loss": 0.1683, "step": 11810 }, { "epoch": 3.7345391359292313, "grad_norm": 0.20300080445572846, "learning_rate": 1.827554028320252e-06, "loss": 0.1714, "step": 11820 }, { "epoch": 3.7376984440407552, "grad_norm": 0.19643175410416572, "learning_rate": 1.8190368390207063e-06, "loss": 0.1733, "step": 11830 }, { "epoch": 3.740857752152279, "grad_norm": 0.1956191661879107, "learning_rate": 1.8105351277711857e-06, "loss": 0.1709, "step": 11840 }, { "epoch": 3.7440170602638023, "grad_norm": 0.20191888545926198, "learning_rate": 1.8020489359396353e-06, "loss": 0.1726, "step": 11850 }, { "epoch": 3.747176368375326, "grad_norm": 0.20203040508749906, "learning_rate": 1.7935783048184868e-06, "loss": 0.1709, "step": 11860 }, { "epoch": 3.7503356764868494, "grad_norm": 0.19868848486501622, "learning_rate": 1.7851232756244542e-06, "loss": 0.171, "step": 11870 }, { "epoch": 3.753494984598373, "grad_norm": 0.20747266365018838, "learning_rate": 1.776683889498339e-06, "loss": 0.1726, "step": 11880 }, { "epoch": 3.7566542927098965, "grad_norm": 0.1977057128148197, "learning_rate": 1.768260187504819e-06, "loss": 0.1712, "step": 11890 }, { "epoch": 3.75981360082142, "grad_norm": 0.20593578039273977, "learning_rate": 1.7598522106322618e-06, "loss": 0.1699, "step": 11900 }, { "epoch": 3.7629729089329436, "grad_norm": 0.205692865090555, "learning_rate": 1.751459999792517e-06, "loss": 0.1693, "step": 11910 }, { "epoch": 3.766132217044467, "grad_norm": 0.20536611729964754, "learning_rate": 1.7430835958207188e-06, "loss": 0.1695, "step": 11920 }, { "epoch": 3.7692915251559906, "grad_norm": 0.20003626633545768, "learning_rate": 1.734723039475089e-06, "loss": 0.1707, "step": 11930 }, { "epoch": 3.7724508332675146, "grad_norm": 0.19756511256291745, "learning_rate": 1.7263783714367388e-06, "loss": 0.1706, "step": 11940 }, { "epoch": 3.7756101413790377, "grad_norm": 0.19369294547074006, "learning_rate": 1.7180496323094609e-06, "loss": 0.1727, "step": 11950 }, { "epoch": 3.7787694494905617, "grad_norm": 0.19163574497757527, "learning_rate": 1.7097368626195548e-06, "loss": 0.1716, "step": 11960 }, { "epoch": 3.7819287576020852, "grad_norm": 0.19945071023523384, "learning_rate": 1.7014401028156003e-06, "loss": 0.17, "step": 11970 }, { "epoch": 3.785088065713609, "grad_norm": 0.1941150399201019, "learning_rate": 1.6931593932682893e-06, "loss": 0.1716, "step": 11980 }, { "epoch": 3.7882473738251323, "grad_norm": 0.19627611530197875, "learning_rate": 1.6848947742702048e-06, "loss": 0.17, "step": 11990 }, { "epoch": 3.791406681936656, "grad_norm": 0.1985759488262882, "learning_rate": 1.6766462860356425e-06, "loss": 0.1705, "step": 12000 }, { "epoch": 3.7945659900481794, "grad_norm": 0.2084135568916697, "learning_rate": 1.6684139687004052e-06, "loss": 0.1703, "step": 12010 }, { "epoch": 3.797725298159703, "grad_norm": 0.20212891125021623, "learning_rate": 1.6601978623216126e-06, "loss": 0.1719, "step": 12020 }, { "epoch": 3.8008846062712265, "grad_norm": 0.20148662876122203, "learning_rate": 1.6519980068775026e-06, "loss": 0.1718, "step": 12030 }, { "epoch": 3.80404391438275, "grad_norm": 0.19966908684766907, "learning_rate": 1.643814442267243e-06, "loss": 0.1703, "step": 12040 }, { "epoch": 3.807203222494274, "grad_norm": 0.19758114437735802, "learning_rate": 1.6356472083107239e-06, "loss": 0.1704, "step": 12050 }, { "epoch": 3.810362530605797, "grad_norm": 0.2043201919277624, "learning_rate": 1.6274963447483855e-06, "loss": 0.1709, "step": 12060 }, { "epoch": 3.813521838717321, "grad_norm": 0.2065094204780346, "learning_rate": 1.6193618912410019e-06, "loss": 0.1719, "step": 12070 }, { "epoch": 3.8166811468288446, "grad_norm": 0.19410241771240708, "learning_rate": 1.611243887369503e-06, "loss": 0.1699, "step": 12080 }, { "epoch": 3.819840454940368, "grad_norm": 0.19582642907268447, "learning_rate": 1.6031423726347778e-06, "loss": 0.1703, "step": 12090 }, { "epoch": 3.8229997630518917, "grad_norm": 0.1995506401309092, "learning_rate": 1.5950573864574808e-06, "loss": 0.1686, "step": 12100 }, { "epoch": 3.8261590711634152, "grad_norm": 0.19809864311658273, "learning_rate": 1.5869889681778411e-06, "loss": 0.1705, "step": 12110 }, { "epoch": 3.829318379274939, "grad_norm": 0.20274297131703675, "learning_rate": 1.5789371570554729e-06, "loss": 0.1727, "step": 12120 }, { "epoch": 3.8324776873864623, "grad_norm": 0.20601567032087037, "learning_rate": 1.570901992269177e-06, "loss": 0.1693, "step": 12130 }, { "epoch": 3.835636995497986, "grad_norm": 0.19456210693609077, "learning_rate": 1.5628835129167662e-06, "loss": 0.1701, "step": 12140 }, { "epoch": 3.8387963036095094, "grad_norm": 0.20115987503568447, "learning_rate": 1.5548817580148517e-06, "loss": 0.1721, "step": 12150 }, { "epoch": 3.841955611721033, "grad_norm": 0.1958665568478278, "learning_rate": 1.54689676649868e-06, "loss": 0.1707, "step": 12160 }, { "epoch": 3.8451149198325565, "grad_norm": 0.20025006804402964, "learning_rate": 1.5389285772219176e-06, "loss": 0.1702, "step": 12170 }, { "epoch": 3.8482742279440805, "grad_norm": 0.20824644715314478, "learning_rate": 1.5309772289564806e-06, "loss": 0.1713, "step": 12180 }, { "epoch": 3.8514335360556036, "grad_norm": 0.20543389482537658, "learning_rate": 1.5230427603923386e-06, "loss": 0.1714, "step": 12190 }, { "epoch": 3.8545928441671276, "grad_norm": 0.201515908971831, "learning_rate": 1.5151252101373266e-06, "loss": 0.1729, "step": 12200 }, { "epoch": 3.857752152278651, "grad_norm": 0.2016189641289871, "learning_rate": 1.5072246167169574e-06, "loss": 0.1701, "step": 12210 }, { "epoch": 3.8609114603901746, "grad_norm": 0.19876889094407768, "learning_rate": 1.4993410185742374e-06, "loss": 0.1689, "step": 12220 }, { "epoch": 3.864070768501698, "grad_norm": 0.20500710077419468, "learning_rate": 1.4914744540694697e-06, "loss": 0.1714, "step": 12230 }, { "epoch": 3.8672300766132217, "grad_norm": 0.20129831840042556, "learning_rate": 1.4836249614800857e-06, "loss": 0.1706, "step": 12240 }, { "epoch": 3.8703893847247453, "grad_norm": 0.1945834822439153, "learning_rate": 1.4757925790004362e-06, "loss": 0.1709, "step": 12250 }, { "epoch": 3.873548692836269, "grad_norm": 0.1987155834635916, "learning_rate": 1.467977344741624e-06, "loss": 0.1717, "step": 12260 }, { "epoch": 3.8767080009477923, "grad_norm": 0.21759266570145575, "learning_rate": 1.4601792967313095e-06, "loss": 0.1712, "step": 12270 }, { "epoch": 3.879867309059316, "grad_norm": 0.20382637953104524, "learning_rate": 1.4523984729135272e-06, "loss": 0.1714, "step": 12280 }, { "epoch": 3.88302661717084, "grad_norm": 0.1916842281105115, "learning_rate": 1.444634911148502e-06, "loss": 0.1692, "step": 12290 }, { "epoch": 3.886185925282363, "grad_norm": 0.1946183364574122, "learning_rate": 1.4368886492124661e-06, "loss": 0.1699, "step": 12300 }, { "epoch": 3.889345233393887, "grad_norm": 0.19920320984504467, "learning_rate": 1.429159724797467e-06, "loss": 0.1697, "step": 12310 }, { "epoch": 3.8925045415054105, "grad_norm": 0.19953879740403313, "learning_rate": 1.421448175511202e-06, "loss": 0.1694, "step": 12320 }, { "epoch": 3.895663849616934, "grad_norm": 0.20468537426685904, "learning_rate": 1.4137540388768107e-06, "loss": 0.1722, "step": 12330 }, { "epoch": 3.8988231577284576, "grad_norm": 0.20676153968954222, "learning_rate": 1.4060773523327175e-06, "loss": 0.173, "step": 12340 }, { "epoch": 3.901982465839981, "grad_norm": 0.21412532800510253, "learning_rate": 1.3984181532324291e-06, "loss": 0.17, "step": 12350 }, { "epoch": 3.9051417739515046, "grad_norm": 0.20530217895484593, "learning_rate": 1.3907764788443651e-06, "loss": 0.1718, "step": 12360 }, { "epoch": 3.908301082063028, "grad_norm": 0.19397608625457688, "learning_rate": 1.383152366351671e-06, "loss": 0.171, "step": 12370 }, { "epoch": 3.9114603901745517, "grad_norm": 0.19792598868494216, "learning_rate": 1.3755458528520422e-06, "loss": 0.1691, "step": 12380 }, { "epoch": 3.9146196982860753, "grad_norm": 0.19530536153646347, "learning_rate": 1.3679569753575321e-06, "loss": 0.1713, "step": 12390 }, { "epoch": 3.917779006397599, "grad_norm": 0.19937572369371176, "learning_rate": 1.3603857707943934e-06, "loss": 0.1718, "step": 12400 }, { "epoch": 3.9209383145091223, "grad_norm": 0.2045657222636314, "learning_rate": 1.3528322760028706e-06, "loss": 0.1705, "step": 12410 }, { "epoch": 3.9240976226206463, "grad_norm": 0.19538054890753861, "learning_rate": 1.345296527737049e-06, "loss": 0.17, "step": 12420 }, { "epoch": 3.9272569307321694, "grad_norm": 0.19898434578570567, "learning_rate": 1.3377785626646505e-06, "loss": 0.1708, "step": 12430 }, { "epoch": 3.9304162388436934, "grad_norm": 0.20793904574642982, "learning_rate": 1.3302784173668732e-06, "loss": 0.17, "step": 12440 }, { "epoch": 3.933575546955217, "grad_norm": 0.20762680354960933, "learning_rate": 1.322796128338207e-06, "loss": 0.1724, "step": 12450 }, { "epoch": 3.9367348550667405, "grad_norm": 0.2018512947411419, "learning_rate": 1.315331731986253e-06, "loss": 0.1695, "step": 12460 }, { "epoch": 3.939894163178264, "grad_norm": 0.19579741223203112, "learning_rate": 1.3078852646315532e-06, "loss": 0.1718, "step": 12470 }, { "epoch": 3.9430534712897876, "grad_norm": 0.20067280716130292, "learning_rate": 1.3004567625074083e-06, "loss": 0.1701, "step": 12480 }, { "epoch": 3.946212779401311, "grad_norm": 0.19388666686105327, "learning_rate": 1.2930462617596996e-06, "loss": 0.1711, "step": 12490 }, { "epoch": 3.9493720875128346, "grad_norm": 0.19875001659515246, "learning_rate": 1.285653798446725e-06, "loss": 0.1725, "step": 12500 }, { "epoch": 3.952531395624358, "grad_norm": 0.20497115291022464, "learning_rate": 1.278279408539006e-06, "loss": 0.1699, "step": 12510 }, { "epoch": 3.9556907037358817, "grad_norm": 0.19011102711328773, "learning_rate": 1.270923127919128e-06, "loss": 0.1697, "step": 12520 }, { "epoch": 3.9588500118474053, "grad_norm": 0.20042416730472096, "learning_rate": 1.2635849923815562e-06, "loss": 0.1711, "step": 12530 }, { "epoch": 3.962009319958929, "grad_norm": 0.19397058835275352, "learning_rate": 1.2562650376324675e-06, "loss": 0.1715, "step": 12540 }, { "epoch": 3.965168628070453, "grad_norm": 0.20187863747529436, "learning_rate": 1.2489632992895722e-06, "loss": 0.173, "step": 12550 }, { "epoch": 3.968327936181976, "grad_norm": 0.19498840804702408, "learning_rate": 1.2416798128819446e-06, "loss": 0.1699, "step": 12560 }, { "epoch": 3.9714872442935, "grad_norm": 0.1971626109701413, "learning_rate": 1.2344146138498414e-06, "loss": 0.1707, "step": 12570 }, { "epoch": 3.9746465524050234, "grad_norm": 0.19237195141765748, "learning_rate": 1.2271677375445474e-06, "loss": 0.1723, "step": 12580 }, { "epoch": 3.977805860516547, "grad_norm": 0.19369835876699062, "learning_rate": 1.2199392192281805e-06, "loss": 0.1722, "step": 12590 }, { "epoch": 3.9809651686280705, "grad_norm": 0.20431461072784543, "learning_rate": 1.2127290940735387e-06, "loss": 0.1688, "step": 12600 }, { "epoch": 3.984124476739594, "grad_norm": 0.20529642623566896, "learning_rate": 1.2055373971639195e-06, "loss": 0.168, "step": 12610 }, { "epoch": 3.9872837848511176, "grad_norm": 0.19084232186590003, "learning_rate": 1.1983641634929522e-06, "loss": 0.17, "step": 12620 }, { "epoch": 3.990443092962641, "grad_norm": 0.20780766829176076, "learning_rate": 1.1912094279644265e-06, "loss": 0.1679, "step": 12630 }, { "epoch": 3.9936024010741646, "grad_norm": 0.19468886303948824, "learning_rate": 1.1840732253921227e-06, "loss": 0.1686, "step": 12640 }, { "epoch": 3.996761709185688, "grad_norm": 0.19648890342838896, "learning_rate": 1.1769555904996454e-06, "loss": 0.1704, "step": 12650 }, { "epoch": 3.999921017297212, "grad_norm": 0.19284335982757503, "learning_rate": 1.1698565579202465e-06, "loss": 0.1704, "step": 12660 }, { "epoch": 4.002843377300371, "grad_norm": 0.19754930089679965, "learning_rate": 1.1627761621966671e-06, "loss": 0.1487, "step": 12670 }, { "epoch": 4.006002685411895, "grad_norm": 0.19520779806174013, "learning_rate": 1.1557144377809626e-06, "loss": 0.1588, "step": 12680 }, { "epoch": 4.009161993523418, "grad_norm": 0.19859317904582857, "learning_rate": 1.1486714190343367e-06, "loss": 0.1596, "step": 12690 }, { "epoch": 4.012321301634942, "grad_norm": 0.20028049175205914, "learning_rate": 1.1416471402269747e-06, "loss": 0.1581, "step": 12700 }, { "epoch": 4.015480609746466, "grad_norm": 0.18619306242942488, "learning_rate": 1.1346416355378764e-06, "loss": 0.1598, "step": 12710 }, { "epoch": 4.018639917857989, "grad_norm": 0.19084584057427986, "learning_rate": 1.1276549390546893e-06, "loss": 0.1598, "step": 12720 }, { "epoch": 4.021799225969513, "grad_norm": 0.1936540709695895, "learning_rate": 1.120687084773545e-06, "loss": 0.159, "step": 12730 }, { "epoch": 4.024958534081036, "grad_norm": 0.1975921539537816, "learning_rate": 1.1137381065988878e-06, "loss": 0.1583, "step": 12740 }, { "epoch": 4.02811784219256, "grad_norm": 0.19969147674976237, "learning_rate": 1.1068080383433188e-06, "loss": 0.1602, "step": 12750 }, { "epoch": 4.031277150304083, "grad_norm": 0.19352808568437072, "learning_rate": 1.0998969137274234e-06, "loss": 0.1597, "step": 12760 }, { "epoch": 4.034436458415607, "grad_norm": 0.19339061581403172, "learning_rate": 1.0930047663796117e-06, "loss": 0.1618, "step": 12770 }, { "epoch": 4.03759576652713, "grad_norm": 0.19477299915795446, "learning_rate": 1.0861316298359537e-06, "loss": 0.1584, "step": 12780 }, { "epoch": 4.040755074638654, "grad_norm": 0.19747662712691139, "learning_rate": 1.0792775375400143e-06, "loss": 0.1598, "step": 12790 }, { "epoch": 4.043914382750177, "grad_norm": 0.19362010628396598, "learning_rate": 1.0724425228426938e-06, "loss": 0.1609, "step": 12800 }, { "epoch": 4.047073690861701, "grad_norm": 0.1978158624641635, "learning_rate": 1.0656266190020648e-06, "loss": 0.1604, "step": 12810 }, { "epoch": 4.050232998973225, "grad_norm": 0.2017835581795631, "learning_rate": 1.058829859183204e-06, "loss": 0.1595, "step": 12820 }, { "epoch": 4.053392307084748, "grad_norm": 0.19505342405207957, "learning_rate": 1.0520522764580466e-06, "loss": 0.1601, "step": 12830 }, { "epoch": 4.056551615196272, "grad_norm": 0.19659192583114876, "learning_rate": 1.0452939038052045e-06, "loss": 0.1582, "step": 12840 }, { "epoch": 4.0597109233077955, "grad_norm": 0.19062889408725014, "learning_rate": 1.0385547741098222e-06, "loss": 0.1594, "step": 12850 }, { "epoch": 4.0628702314193195, "grad_norm": 0.2007575871127392, "learning_rate": 1.0318349201634116e-06, "loss": 0.1609, "step": 12860 }, { "epoch": 4.066029539530843, "grad_norm": 0.19407495103479086, "learning_rate": 1.02513437466369e-06, "loss": 0.1601, "step": 12870 }, { "epoch": 4.0691888476423665, "grad_norm": 0.1926256014241401, "learning_rate": 1.01845317021442e-06, "loss": 0.1597, "step": 12880 }, { "epoch": 4.07234815575389, "grad_norm": 0.19084665134324094, "learning_rate": 1.0117913393252632e-06, "loss": 0.1605, "step": 12890 }, { "epoch": 4.075507463865414, "grad_norm": 0.19709574860841494, "learning_rate": 1.0051489144116e-06, "loss": 0.1608, "step": 12900 }, { "epoch": 4.078666771976937, "grad_norm": 0.20095251133089448, "learning_rate": 9.985259277943977e-07, "loss": 0.1602, "step": 12910 }, { "epoch": 4.081826080088461, "grad_norm": 0.19685311750698353, "learning_rate": 9.919224117000281e-07, "loss": 0.1614, "step": 12920 }, { "epoch": 4.084985388199984, "grad_norm": 0.20241005189325584, "learning_rate": 9.853383982601294e-07, "loss": 0.1596, "step": 12930 }, { "epoch": 4.088144696311508, "grad_norm": 0.19400175840515052, "learning_rate": 9.787739195114427e-07, "loss": 0.1592, "step": 12940 }, { "epoch": 4.091304004423032, "grad_norm": 0.20303783315409038, "learning_rate": 9.722290073956536e-07, "loss": 0.1597, "step": 12950 }, { "epoch": 4.094463312534555, "grad_norm": 0.1936600304781095, "learning_rate": 9.657036937592423e-07, "loss": 0.1621, "step": 12960 }, { "epoch": 4.097622620646079, "grad_norm": 0.19796367201440496, "learning_rate": 9.59198010353326e-07, "loss": 0.1597, "step": 12970 }, { "epoch": 4.100781928757602, "grad_norm": 0.18342546130725573, "learning_rate": 9.527119888334996e-07, "loss": 0.1582, "step": 12980 }, { "epoch": 4.103941236869126, "grad_norm": 0.20317670235586452, "learning_rate": 9.462456607596954e-07, "loss": 0.1603, "step": 12990 }, { "epoch": 4.107100544980649, "grad_norm": 0.19569726312873856, "learning_rate": 9.397990575960103e-07, "loss": 0.1578, "step": 13000 }, { "epoch": 4.110259853092173, "grad_norm": 0.20338486160541247, "learning_rate": 9.333722107105725e-07, "loss": 0.1606, "step": 13010 }, { "epoch": 4.113419161203696, "grad_norm": 0.1875071227775237, "learning_rate": 9.269651513753725e-07, "loss": 0.1603, "step": 13020 }, { "epoch": 4.11657846931522, "grad_norm": 0.1927619122892459, "learning_rate": 9.205779107661201e-07, "loss": 0.1581, "step": 13030 }, { "epoch": 4.119737777426743, "grad_norm": 0.20452574688837152, "learning_rate": 9.142105199620916e-07, "loss": 0.159, "step": 13040 }, { "epoch": 4.122897085538267, "grad_norm": 0.2023565306937573, "learning_rate": 9.078630099459768e-07, "loss": 0.1604, "step": 13050 }, { "epoch": 4.12605639364979, "grad_norm": 0.19951162187159371, "learning_rate": 9.015354116037256e-07, "loss": 0.158, "step": 13060 }, { "epoch": 4.129215701761314, "grad_norm": 0.19668566056760164, "learning_rate": 8.952277557244077e-07, "loss": 0.1589, "step": 13070 }, { "epoch": 4.132375009872838, "grad_norm": 0.18983274506050818, "learning_rate": 8.889400730000475e-07, "loss": 0.1599, "step": 13080 }, { "epoch": 4.135534317984361, "grad_norm": 0.19379806465727592, "learning_rate": 8.826723940254923e-07, "loss": 0.1614, "step": 13090 }, { "epoch": 4.138693626095885, "grad_norm": 0.2015655061435355, "learning_rate": 8.76424749298247e-07, "loss": 0.1596, "step": 13100 }, { "epoch": 4.141852934207408, "grad_norm": 0.20253336827566704, "learning_rate": 8.701971692183365e-07, "loss": 0.1605, "step": 13110 }, { "epoch": 4.145012242318932, "grad_norm": 0.19563497160996118, "learning_rate": 8.639896840881534e-07, "loss": 0.1607, "step": 13120 }, { "epoch": 4.1481715504304555, "grad_norm": 0.20060195439697434, "learning_rate": 8.578023241123134e-07, "loss": 0.16, "step": 13130 }, { "epoch": 4.1513308585419795, "grad_norm": 0.19835862074182956, "learning_rate": 8.516351193975042e-07, "loss": 0.1631, "step": 13140 }, { "epoch": 4.154490166653503, "grad_norm": 0.1869510365958818, "learning_rate": 8.454880999523435e-07, "loss": 0.1587, "step": 13150 }, { "epoch": 4.1576494747650266, "grad_norm": 0.20004584908939843, "learning_rate": 8.393612956872254e-07, "loss": 0.1621, "step": 13160 }, { "epoch": 4.16080878287655, "grad_norm": 0.1856765556024912, "learning_rate": 8.332547364141891e-07, "loss": 0.159, "step": 13170 }, { "epoch": 4.163968090988074, "grad_norm": 0.19589805621856057, "learning_rate": 8.271684518467571e-07, "loss": 0.1602, "step": 13180 }, { "epoch": 4.167127399099598, "grad_norm": 0.19689293762125867, "learning_rate": 8.211024715998023e-07, "loss": 0.1591, "step": 13190 }, { "epoch": 4.170286707211121, "grad_norm": 0.19503964405624044, "learning_rate": 8.150568251893992e-07, "loss": 0.1604, "step": 13200 }, { "epoch": 4.173446015322645, "grad_norm": 0.1932945597906981, "learning_rate": 8.09031542032681e-07, "loss": 0.1596, "step": 13210 }, { "epoch": 4.176605323434168, "grad_norm": 0.1954697082605763, "learning_rate": 8.030266514476976e-07, "loss": 0.1596, "step": 13220 }, { "epoch": 4.179764631545692, "grad_norm": 0.18739853815459867, "learning_rate": 7.97042182653271e-07, "loss": 0.1611, "step": 13230 }, { "epoch": 4.182923939657215, "grad_norm": 0.2008506504946661, "learning_rate": 7.910781647688515e-07, "loss": 0.1594, "step": 13240 }, { "epoch": 4.186083247768739, "grad_norm": 0.1997520617477428, "learning_rate": 7.851346268143861e-07, "loss": 0.1594, "step": 13250 }, { "epoch": 4.189242555880262, "grad_norm": 0.19380452684319097, "learning_rate": 7.7921159771016e-07, "loss": 0.1608, "step": 13260 }, { "epoch": 4.192401863991786, "grad_norm": 0.19204023557075717, "learning_rate": 7.733091062766751e-07, "loss": 0.1603, "step": 13270 }, { "epoch": 4.195561172103309, "grad_norm": 0.1963780227663788, "learning_rate": 7.674271812344935e-07, "loss": 0.1581, "step": 13280 }, { "epoch": 4.198720480214833, "grad_norm": 0.18775094836052963, "learning_rate": 7.615658512041068e-07, "loss": 0.1585, "step": 13290 }, { "epoch": 4.201879788326356, "grad_norm": 0.1879252136754587, "learning_rate": 7.557251447057962e-07, "loss": 0.16, "step": 13300 }, { "epoch": 4.20503909643788, "grad_norm": 0.19489071033267957, "learning_rate": 7.499050901594896e-07, "loss": 0.1587, "step": 13310 }, { "epoch": 4.208198404549404, "grad_norm": 0.20273324039262924, "learning_rate": 7.441057158846276e-07, "loss": 0.1591, "step": 13320 }, { "epoch": 4.211357712660927, "grad_norm": 0.1982379447248168, "learning_rate": 7.383270501000245e-07, "loss": 0.1599, "step": 13330 }, { "epoch": 4.214517020772451, "grad_norm": 0.1856228450158758, "learning_rate": 7.325691209237251e-07, "loss": 0.1581, "step": 13340 }, { "epoch": 4.217676328883974, "grad_norm": 0.19437453651795136, "learning_rate": 7.268319563728831e-07, "loss": 0.1586, "step": 13350 }, { "epoch": 4.220835636995498, "grad_norm": 0.18694690942616607, "learning_rate": 7.211155843636059e-07, "loss": 0.1603, "step": 13360 }, { "epoch": 4.223994945107021, "grad_norm": 0.19393003030705352, "learning_rate": 7.154200327108313e-07, "loss": 0.162, "step": 13370 }, { "epoch": 4.227154253218545, "grad_norm": 0.1927448263517383, "learning_rate": 7.097453291281887e-07, "loss": 0.1612, "step": 13380 }, { "epoch": 4.230313561330068, "grad_norm": 0.20215663142647716, "learning_rate": 7.040915012278648e-07, "loss": 0.1589, "step": 13390 }, { "epoch": 4.233472869441592, "grad_norm": 0.21545803120847262, "learning_rate": 6.984585765204665e-07, "loss": 0.16, "step": 13400 }, { "epoch": 4.2366321775531155, "grad_norm": 0.20075406426026432, "learning_rate": 6.928465824148923e-07, "loss": 0.1594, "step": 13410 }, { "epoch": 4.2397914856646395, "grad_norm": 0.2394877373371559, "learning_rate": 6.872555462181907e-07, "loss": 0.1592, "step": 13420 }, { "epoch": 4.2429507937761635, "grad_norm": 0.19216510992720218, "learning_rate": 6.816854951354396e-07, "loss": 0.1573, "step": 13430 }, { "epoch": 4.246110101887687, "grad_norm": 0.2020202616715345, "learning_rate": 6.761364562695993e-07, "loss": 0.161, "step": 13440 }, { "epoch": 4.2492694099992105, "grad_norm": 0.189014469828388, "learning_rate": 6.706084566213933e-07, "loss": 0.1589, "step": 13450 }, { "epoch": 4.252428718110734, "grad_norm": 0.19279477497914185, "learning_rate": 6.651015230891694e-07, "loss": 0.1608, "step": 13460 }, { "epoch": 4.255588026222258, "grad_norm": 0.20318784264174414, "learning_rate": 6.596156824687722e-07, "loss": 0.1596, "step": 13470 }, { "epoch": 4.258747334333781, "grad_norm": 0.19924677539875918, "learning_rate": 6.541509614534103e-07, "loss": 0.1593, "step": 13480 }, { "epoch": 4.261906642445305, "grad_norm": 0.20657626622701158, "learning_rate": 6.487073866335298e-07, "loss": 0.1598, "step": 13490 }, { "epoch": 4.265065950556828, "grad_norm": 0.18954862382773452, "learning_rate": 6.432849844966782e-07, "loss": 0.1607, "step": 13500 }, { "epoch": 4.268225258668352, "grad_norm": 0.20096767940062027, "learning_rate": 6.378837814273886e-07, "loss": 0.1602, "step": 13510 }, { "epoch": 4.271384566779875, "grad_norm": 0.19873738285793385, "learning_rate": 6.325038037070336e-07, "loss": 0.1602, "step": 13520 }, { "epoch": 4.274543874891399, "grad_norm": 0.2026478071540902, "learning_rate": 6.271450775137116e-07, "loss": 0.1579, "step": 13530 }, { "epoch": 4.277703183002922, "grad_norm": 0.19853177565270944, "learning_rate": 6.218076289221153e-07, "loss": 0.1598, "step": 13540 }, { "epoch": 4.280862491114446, "grad_norm": 0.20829772712743705, "learning_rate": 6.164914839034008e-07, "loss": 0.1587, "step": 13550 }, { "epoch": 4.28402179922597, "grad_norm": 0.196256766990733, "learning_rate": 6.111966683250681e-07, "loss": 0.1604, "step": 13560 }, { "epoch": 4.287181107337493, "grad_norm": 0.19618426976460837, "learning_rate": 6.059232079508276e-07, "loss": 0.1603, "step": 13570 }, { "epoch": 4.290340415449017, "grad_norm": 0.202438781233527, "learning_rate": 6.006711284404837e-07, "loss": 0.1612, "step": 13580 }, { "epoch": 4.29349972356054, "grad_norm": 0.19704572347653887, "learning_rate": 5.954404553497989e-07, "loss": 0.1602, "step": 13590 }, { "epoch": 4.296659031672064, "grad_norm": 0.19168113950072857, "learning_rate": 5.902312141303806e-07, "loss": 0.1604, "step": 13600 }, { "epoch": 4.299818339783587, "grad_norm": 0.2003879313833993, "learning_rate": 5.850434301295494e-07, "loss": 0.1596, "step": 13610 }, { "epoch": 4.302977647895111, "grad_norm": 0.19759760667562565, "learning_rate": 5.798771285902205e-07, "loss": 0.1604, "step": 13620 }, { "epoch": 4.306136956006634, "grad_norm": 0.19863833857948981, "learning_rate": 5.747323346507777e-07, "loss": 0.1592, "step": 13630 }, { "epoch": 4.309296264118158, "grad_norm": 0.2024378958773524, "learning_rate": 5.696090733449528e-07, "loss": 0.1601, "step": 13640 }, { "epoch": 4.312455572229681, "grad_norm": 0.20215707871180005, "learning_rate": 5.645073696017028e-07, "loss": 0.1585, "step": 13650 }, { "epoch": 4.315614880341205, "grad_norm": 0.18689645886289935, "learning_rate": 5.594272482450902e-07, "loss": 0.1573, "step": 13660 }, { "epoch": 4.318774188452728, "grad_norm": 0.2053319708006727, "learning_rate": 5.543687339941584e-07, "loss": 0.1615, "step": 13670 }, { "epoch": 4.321933496564252, "grad_norm": 0.19171061220997784, "learning_rate": 5.493318514628171e-07, "loss": 0.1616, "step": 13680 }, { "epoch": 4.325092804675776, "grad_norm": 0.19563207560203782, "learning_rate": 5.443166251597187e-07, "loss": 0.16, "step": 13690 }, { "epoch": 4.3282521127872995, "grad_norm": 0.19760551499634138, "learning_rate": 5.393230794881399e-07, "loss": 0.1587, "step": 13700 }, { "epoch": 4.3314114208988235, "grad_norm": 0.1919192060783246, "learning_rate": 5.343512387458621e-07, "loss": 0.1598, "step": 13710 }, { "epoch": 4.334570729010347, "grad_norm": 0.19916105439716603, "learning_rate": 5.294011271250549e-07, "loss": 0.1581, "step": 13720 }, { "epoch": 4.3377300371218706, "grad_norm": 0.1924623858324796, "learning_rate": 5.244727687121581e-07, "loss": 0.1585, "step": 13730 }, { "epoch": 4.340889345233394, "grad_norm": 0.19222422456276647, "learning_rate": 5.195661874877633e-07, "loss": 0.1585, "step": 13740 }, { "epoch": 4.344048653344918, "grad_norm": 0.1925599596150807, "learning_rate": 5.14681407326495e-07, "loss": 0.161, "step": 13750 }, { "epoch": 4.347207961456441, "grad_norm": 0.1950428531940909, "learning_rate": 5.098184519969041e-07, "loss": 0.1581, "step": 13760 }, { "epoch": 4.350367269567965, "grad_norm": 0.20354980424119892, "learning_rate": 5.049773451613382e-07, "loss": 0.1607, "step": 13770 }, { "epoch": 4.353526577679488, "grad_norm": 0.1868628417947928, "learning_rate": 5.001581103758374e-07, "loss": 0.1601, "step": 13780 }, { "epoch": 4.356685885791012, "grad_norm": 0.19984754637012053, "learning_rate": 4.95360771090016e-07, "loss": 0.159, "step": 13790 }, { "epoch": 4.359845193902535, "grad_norm": 0.19703967834397978, "learning_rate": 4.905853506469477e-07, "loss": 0.159, "step": 13800 }, { "epoch": 4.363004502014059, "grad_norm": 0.18977597668459417, "learning_rate": 4.858318722830518e-07, "loss": 0.1583, "step": 13810 }, { "epoch": 4.366163810125583, "grad_norm": 0.19339667722600956, "learning_rate": 4.811003591279834e-07, "loss": 0.1585, "step": 13820 }, { "epoch": 4.369323118237106, "grad_norm": 0.1938566607621067, "learning_rate": 4.7639083420451425e-07, "loss": 0.1593, "step": 13830 }, { "epoch": 4.37248242634863, "grad_norm": 0.19830744178047272, "learning_rate": 4.71703320428431e-07, "loss": 0.1591, "step": 13840 }, { "epoch": 4.375641734460153, "grad_norm": 0.20447285160479836, "learning_rate": 4.6703784060841194e-07, "loss": 0.1592, "step": 13850 }, { "epoch": 4.378801042571677, "grad_norm": 0.19999302428118468, "learning_rate": 4.623944174459238e-07, "loss": 0.1596, "step": 13860 }, { "epoch": 4.3819603506832, "grad_norm": 0.19431839969423845, "learning_rate": 4.5777307353511103e-07, "loss": 0.1587, "step": 13870 }, { "epoch": 4.385119658794724, "grad_norm": 0.19213231385230065, "learning_rate": 4.53173831362681e-07, "loss": 0.1598, "step": 13880 }, { "epoch": 4.388278966906247, "grad_norm": 0.19727512610076384, "learning_rate": 4.485967133078001e-07, "loss": 0.1595, "step": 13890 }, { "epoch": 4.391438275017771, "grad_norm": 0.20137928321566095, "learning_rate": 4.440417416419812e-07, "loss": 0.1608, "step": 13900 }, { "epoch": 4.394597583129294, "grad_norm": 0.19192751875350025, "learning_rate": 4.395089385289747e-07, "loss": 0.1582, "step": 13910 }, { "epoch": 4.397756891240818, "grad_norm": 0.19508844609767204, "learning_rate": 4.3499832602466764e-07, "loss": 0.1612, "step": 13920 }, { "epoch": 4.400916199352342, "grad_norm": 0.20085050538502322, "learning_rate": 4.3050992607696354e-07, "loss": 0.1585, "step": 13930 }, { "epoch": 4.404075507463865, "grad_norm": 0.19345802602283768, "learning_rate": 4.260437605256912e-07, "loss": 0.1593, "step": 13940 }, { "epoch": 4.407234815575389, "grad_norm": 0.19853948221248452, "learning_rate": 4.215998511024844e-07, "loss": 0.1593, "step": 13950 }, { "epoch": 4.410394123686912, "grad_norm": 0.19645554805603088, "learning_rate": 4.171782194306856e-07, "loss": 0.1581, "step": 13960 }, { "epoch": 4.413553431798436, "grad_norm": 0.18983249029369767, "learning_rate": 4.127788870252358e-07, "loss": 0.1592, "step": 13970 }, { "epoch": 4.4167127399099595, "grad_norm": 0.19264647219798062, "learning_rate": 4.084018752925728e-07, "loss": 0.162, "step": 13980 }, { "epoch": 4.4198720480214835, "grad_norm": 0.19188102638619134, "learning_rate": 4.0404720553052225e-07, "loss": 0.1599, "step": 13990 }, { "epoch": 4.423031356133007, "grad_norm": 0.19594341326526055, "learning_rate": 3.997148989282035e-07, "loss": 0.1582, "step": 14000 }, { "epoch": 4.426190664244531, "grad_norm": 0.19679190764934815, "learning_rate": 3.9540497656591235e-07, "loss": 0.16, "step": 14010 }, { "epoch": 4.429349972356054, "grad_norm": 0.2001818569734459, "learning_rate": 3.911174594150352e-07, "loss": 0.161, "step": 14020 }, { "epoch": 4.432509280467578, "grad_norm": 0.19932050541026844, "learning_rate": 3.868523683379316e-07, "loss": 0.1609, "step": 14030 }, { "epoch": 4.435668588579102, "grad_norm": 0.19545767797055838, "learning_rate": 3.8260972408784236e-07, "loss": 0.1586, "step": 14040 }, { "epoch": 4.438827896690625, "grad_norm": 0.19560441338063028, "learning_rate": 3.7838954730878505e-07, "loss": 0.1597, "step": 14050 }, { "epoch": 4.441987204802149, "grad_norm": 0.19520076627630972, "learning_rate": 3.741918585354548e-07, "loss": 0.1601, "step": 14060 }, { "epoch": 4.445146512913672, "grad_norm": 0.2021625860562286, "learning_rate": 3.7001667819312303e-07, "loss": 0.1589, "step": 14070 }, { "epoch": 4.448305821025196, "grad_norm": 0.19657547564424072, "learning_rate": 3.6586402659753994e-07, "loss": 0.1593, "step": 14080 }, { "epoch": 4.451465129136719, "grad_norm": 0.2371545042414208, "learning_rate": 3.617339239548312e-07, "loss": 0.1602, "step": 14090 }, { "epoch": 4.454624437248243, "grad_norm": 0.19885312184722165, "learning_rate": 3.5762639036140856e-07, "loss": 0.1595, "step": 14100 }, { "epoch": 4.457783745359766, "grad_norm": 0.20782243257336264, "learning_rate": 3.5354144580385997e-07, "loss": 0.1602, "step": 14110 }, { "epoch": 4.46094305347129, "grad_norm": 0.18932292467991474, "learning_rate": 3.494791101588657e-07, "loss": 0.1616, "step": 14120 }, { "epoch": 4.464102361582813, "grad_norm": 0.1919157424283962, "learning_rate": 3.454394031930885e-07, "loss": 0.1593, "step": 14130 }, { "epoch": 4.467261669694337, "grad_norm": 0.2010076659233086, "learning_rate": 3.414223445630865e-07, "loss": 0.1599, "step": 14140 }, { "epoch": 4.47042097780586, "grad_norm": 0.20255576550229257, "learning_rate": 3.3742795381521533e-07, "loss": 0.1593, "step": 14150 }, { "epoch": 4.473580285917384, "grad_norm": 0.194621411799477, "learning_rate": 3.334562503855321e-07, "loss": 0.1597, "step": 14160 }, { "epoch": 4.476739594028908, "grad_norm": 0.18866219829191938, "learning_rate": 3.295072535996974e-07, "loss": 0.1581, "step": 14170 }, { "epoch": 4.479898902140431, "grad_norm": 0.19349313221379064, "learning_rate": 3.255809826728923e-07, "loss": 0.1601, "step": 14180 }, { "epoch": 4.483058210251955, "grad_norm": 0.20115409541502788, "learning_rate": 3.2167745670970973e-07, "loss": 0.1601, "step": 14190 }, { "epoch": 4.486217518363478, "grad_norm": 0.18780373745693515, "learning_rate": 3.1779669470407615e-07, "loss": 0.1589, "step": 14200 }, { "epoch": 4.489376826475002, "grad_norm": 0.19486811896601117, "learning_rate": 3.1393871553914654e-07, "loss": 0.1587, "step": 14210 }, { "epoch": 4.492536134586525, "grad_norm": 0.2025724767403799, "learning_rate": 3.101035379872219e-07, "loss": 0.1595, "step": 14220 }, { "epoch": 4.495695442698049, "grad_norm": 0.2080673328738384, "learning_rate": 3.06291180709653e-07, "loss": 0.1593, "step": 14230 }, { "epoch": 4.498854750809572, "grad_norm": 0.19149699902846998, "learning_rate": 3.0250166225675115e-07, "loss": 0.1599, "step": 14240 }, { "epoch": 4.502014058921096, "grad_norm": 0.20411585929483625, "learning_rate": 2.987350010676976e-07, "loss": 0.1602, "step": 14250 }, { "epoch": 4.5051733670326195, "grad_norm": 0.19343496961687232, "learning_rate": 2.9499121547045426e-07, "loss": 0.1599, "step": 14260 }, { "epoch": 4.5083326751441435, "grad_norm": 0.19449547358661812, "learning_rate": 2.912703236816722e-07, "loss": 0.1606, "step": 14270 }, { "epoch": 4.511491983255667, "grad_norm": 0.1973387536772748, "learning_rate": 2.8757234380660857e-07, "loss": 0.1599, "step": 14280 }, { "epoch": 4.514651291367191, "grad_norm": 0.18818779692322143, "learning_rate": 2.838972938390311e-07, "loss": 0.1601, "step": 14290 }, { "epoch": 4.5178105994787146, "grad_norm": 0.19985204703053389, "learning_rate": 2.802451916611365e-07, "loss": 0.1583, "step": 14300 }, { "epoch": 4.520969907590238, "grad_norm": 0.19536890400548887, "learning_rate": 2.7661605504346045e-07, "loss": 0.1608, "step": 14310 }, { "epoch": 4.524129215701762, "grad_norm": 0.18811223542835254, "learning_rate": 2.730099016447929e-07, "loss": 0.1596, "step": 14320 }, { "epoch": 4.527288523813285, "grad_norm": 0.19468411001885044, "learning_rate": 2.6942674901209e-07, "loss": 0.16, "step": 14330 }, { "epoch": 4.530447831924809, "grad_norm": 0.20552266635420624, "learning_rate": 2.658666145803912e-07, "loss": 0.1615, "step": 14340 }, { "epoch": 4.533607140036332, "grad_norm": 0.1937881234664753, "learning_rate": 2.623295156727301e-07, "loss": 0.1578, "step": 14350 }, { "epoch": 4.536766448147856, "grad_norm": 0.20412087334287093, "learning_rate": 2.588154695000589e-07, "loss": 0.1612, "step": 14360 }, { "epoch": 4.539925756259379, "grad_norm": 0.19434151561756924, "learning_rate": 2.55324493161152e-07, "loss": 0.1584, "step": 14370 }, { "epoch": 4.543085064370903, "grad_norm": 0.1908270600724035, "learning_rate": 2.5185660364253515e-07, "loss": 0.1593, "step": 14380 }, { "epoch": 4.546244372482426, "grad_norm": 0.2004261876800901, "learning_rate": 2.484118178183953e-07, "loss": 0.1581, "step": 14390 }, { "epoch": 4.54940368059395, "grad_norm": 0.19755205920634125, "learning_rate": 2.4499015245049997e-07, "loss": 0.1601, "step": 14400 }, { "epoch": 4.552562988705473, "grad_norm": 0.198156216790078, "learning_rate": 2.415916241881172e-07, "loss": 0.1606, "step": 14410 }, { "epoch": 4.555722296816997, "grad_norm": 0.195998760066744, "learning_rate": 2.382162495679341e-07, "loss": 0.1601, "step": 14420 }, { "epoch": 4.558881604928521, "grad_norm": 0.18945574448552882, "learning_rate": 2.3486404501397497e-07, "loss": 0.158, "step": 14430 }, { "epoch": 4.562040913040044, "grad_norm": 0.19748534603778248, "learning_rate": 2.315350268375227e-07, "loss": 0.1574, "step": 14440 }, { "epoch": 4.565200221151568, "grad_norm": 0.20176443804299427, "learning_rate": 2.2822921123703822e-07, "loss": 0.1603, "step": 14450 }, { "epoch": 4.568359529263091, "grad_norm": 0.19789674088621473, "learning_rate": 2.249466142980844e-07, "loss": 0.1598, "step": 14460 }, { "epoch": 4.571518837374615, "grad_norm": 0.187818302106187, "learning_rate": 2.2168725199324336e-07, "loss": 0.159, "step": 14470 }, { "epoch": 4.574678145486138, "grad_norm": 0.20046350486590453, "learning_rate": 2.1845114018204382e-07, "loss": 0.16, "step": 14480 }, { "epoch": 4.577837453597662, "grad_norm": 0.1956285243391142, "learning_rate": 2.1523829461087997e-07, "loss": 0.1606, "step": 14490 }, { "epoch": 4.580996761709185, "grad_norm": 0.19955861141039574, "learning_rate": 2.12048730912936e-07, "loss": 0.1599, "step": 14500 }, { "epoch": 4.584156069820709, "grad_norm": 0.19999137325948985, "learning_rate": 2.0888246460811168e-07, "loss": 0.1581, "step": 14510 }, { "epoch": 4.587315377932233, "grad_norm": 0.19609764005409422, "learning_rate": 2.057395111029431e-07, "loss": 0.1587, "step": 14520 }, { "epoch": 4.590474686043756, "grad_norm": 0.19243471223634231, "learning_rate": 2.0261988569053205e-07, "loss": 0.1585, "step": 14530 }, { "epoch": 4.5936339941552795, "grad_norm": 0.20046273900446646, "learning_rate": 1.995236035504694e-07, "loss": 0.1602, "step": 14540 }, { "epoch": 4.5967933022668035, "grad_norm": 0.19670271834661107, "learning_rate": 1.9645067974876086e-07, "loss": 0.1593, "step": 14550 }, { "epoch": 4.5999526103783275, "grad_norm": 0.19348803453560268, "learning_rate": 1.9340112923775467e-07, "loss": 0.1572, "step": 14560 }, { "epoch": 4.603111918489851, "grad_norm": 0.19604507063856969, "learning_rate": 1.9037496685606782e-07, "loss": 0.1615, "step": 14570 }, { "epoch": 4.606271226601375, "grad_norm": 0.19905957494150894, "learning_rate": 1.873722073285156e-07, "loss": 0.1599, "step": 14580 }, { "epoch": 4.609430534712898, "grad_norm": 0.1931564026479933, "learning_rate": 1.8439286526603816e-07, "loss": 0.1605, "step": 14590 }, { "epoch": 4.612589842824422, "grad_norm": 0.1957472742685542, "learning_rate": 1.814369551656281e-07, "loss": 0.1576, "step": 14600 }, { "epoch": 4.615749150935945, "grad_norm": 0.19222186879935804, "learning_rate": 1.7850449141026626e-07, "loss": 0.158, "step": 14610 }, { "epoch": 4.618908459047469, "grad_norm": 0.19288882737133697, "learning_rate": 1.755954882688432e-07, "loss": 0.1599, "step": 14620 }, { "epoch": 4.622067767158992, "grad_norm": 0.19173903599470532, "learning_rate": 1.7270995989609685e-07, "loss": 0.163, "step": 14630 }, { "epoch": 4.625227075270516, "grad_norm": 0.1971599904397844, "learning_rate": 1.6984792033253873e-07, "loss": 0.1624, "step": 14640 }, { "epoch": 4.62838638338204, "grad_norm": 0.20237639199908425, "learning_rate": 1.67009383504389e-07, "loss": 0.1599, "step": 14650 }, { "epoch": 4.631545691493563, "grad_norm": 0.19640473192083982, "learning_rate": 1.6419436322350602e-07, "loss": 0.1582, "step": 14660 }, { "epoch": 4.634704999605087, "grad_norm": 0.19394680624920346, "learning_rate": 1.6140287318732295e-07, "loss": 0.1612, "step": 14670 }, { "epoch": 4.63786430771661, "grad_norm": 0.19411893620985973, "learning_rate": 1.5863492697877403e-07, "loss": 0.1579, "step": 14680 }, { "epoch": 4.641023615828134, "grad_norm": 0.19802298148744443, "learning_rate": 1.5589053806623845e-07, "loss": 0.1599, "step": 14690 }, { "epoch": 4.644182923939657, "grad_norm": 0.19535413598264334, "learning_rate": 1.5316971980346597e-07, "loss": 0.1563, "step": 14700 }, { "epoch": 4.647342232051181, "grad_norm": 0.19497113149503717, "learning_rate": 1.5047248542951586e-07, "loss": 0.1593, "step": 14710 }, { "epoch": 4.650501540162704, "grad_norm": 0.2020301060995925, "learning_rate": 1.4779884806869262e-07, "loss": 0.1579, "step": 14720 }, { "epoch": 4.653660848274228, "grad_norm": 0.19311882101118522, "learning_rate": 1.4514882073048186e-07, "loss": 0.1603, "step": 14730 }, { "epoch": 4.656820156385751, "grad_norm": 0.19069864706937145, "learning_rate": 1.4252241630948515e-07, "loss": 0.159, "step": 14740 }, { "epoch": 4.659979464497275, "grad_norm": 0.19826154207098273, "learning_rate": 1.3991964758536148e-07, "loss": 0.1594, "step": 14750 }, { "epoch": 4.663138772608798, "grad_norm": 0.20463683774068114, "learning_rate": 1.3734052722275849e-07, "loss": 0.1607, "step": 14760 }, { "epoch": 4.666298080720322, "grad_norm": 0.19219549706945396, "learning_rate": 1.3478506777125865e-07, "loss": 0.1574, "step": 14770 }, { "epoch": 4.669457388831846, "grad_norm": 0.19753785214231598, "learning_rate": 1.3225328166531158e-07, "loss": 0.1599, "step": 14780 }, { "epoch": 4.672616696943369, "grad_norm": 0.1970861929609217, "learning_rate": 1.297451812241779e-07, "loss": 0.1607, "step": 14790 }, { "epoch": 4.675776005054893, "grad_norm": 0.1966183263052465, "learning_rate": 1.2726077865186648e-07, "loss": 0.159, "step": 14800 }, { "epoch": 4.678935313166416, "grad_norm": 0.1943429806373708, "learning_rate": 1.2480008603707627e-07, "loss": 0.158, "step": 14810 }, { "epoch": 4.68209462127794, "grad_norm": 0.19589426287899958, "learning_rate": 1.223631153531385e-07, "loss": 0.1577, "step": 14820 }, { "epoch": 4.6852539293894635, "grad_norm": 0.19706763059063206, "learning_rate": 1.1994987845795725e-07, "loss": 0.1597, "step": 14830 }, { "epoch": 4.6884132375009875, "grad_norm": 0.1975866024609479, "learning_rate": 1.1756038709394902e-07, "loss": 0.1593, "step": 14840 }, { "epoch": 4.691572545612511, "grad_norm": 0.19686807427075748, "learning_rate": 1.1519465288799325e-07, "loss": 0.1599, "step": 14850 }, { "epoch": 4.694731853724035, "grad_norm": 0.19405867333452428, "learning_rate": 1.1285268735136634e-07, "loss": 0.1599, "step": 14860 }, { "epoch": 4.697891161835558, "grad_norm": 0.19332590972909364, "learning_rate": 1.1053450187969383e-07, "loss": 0.159, "step": 14870 }, { "epoch": 4.701050469947082, "grad_norm": 0.19949006237363448, "learning_rate": 1.0824010775288829e-07, "loss": 0.1593, "step": 14880 }, { "epoch": 4.704209778058605, "grad_norm": 0.19416481710856007, "learning_rate": 1.0596951613509931e-07, "loss": 0.1592, "step": 14890 }, { "epoch": 4.707369086170129, "grad_norm": 0.19602339515974787, "learning_rate": 1.0372273807465638e-07, "loss": 0.1591, "step": 14900 }, { "epoch": 4.710528394281653, "grad_norm": 0.1960689688312678, "learning_rate": 1.0149978450401776e-07, "loss": 0.1603, "step": 14910 }, { "epoch": 4.713687702393176, "grad_norm": 0.2057850892961581, "learning_rate": 9.930066623971334e-08, "loss": 0.1591, "step": 14920 }, { "epoch": 4.7168470105047, "grad_norm": 0.19301696301593, "learning_rate": 9.712539398229637e-08, "loss": 0.1602, "step": 14930 }, { "epoch": 4.720006318616223, "grad_norm": 0.19265047597399218, "learning_rate": 9.497397831628673e-08, "loss": 0.1594, "step": 14940 }, { "epoch": 4.723165626727747, "grad_norm": 0.19291383326551242, "learning_rate": 9.284642971012559e-08, "loss": 0.1556, "step": 14950 }, { "epoch": 4.72632493483927, "grad_norm": 0.1985670750054423, "learning_rate": 9.074275851611691e-08, "loss": 0.1611, "step": 14960 }, { "epoch": 4.729484242950794, "grad_norm": 0.1956461829616837, "learning_rate": 8.866297497038435e-08, "loss": 0.1595, "step": 14970 }, { "epoch": 4.732643551062317, "grad_norm": 0.1913906186621511, "learning_rate": 8.660708919281613e-08, "loss": 0.1596, "step": 14980 }, { "epoch": 4.735802859173841, "grad_norm": 0.20269780565703013, "learning_rate": 8.457511118701911e-08, "loss": 0.1585, "step": 14990 }, { "epoch": 4.738962167285364, "grad_norm": 0.2006526774187281, "learning_rate": 8.256705084026761e-08, "loss": 0.159, "step": 15000 }, { "epoch": 4.742121475396888, "grad_norm": 0.19809011682211267, "learning_rate": 8.05829179234574e-08, "loss": 0.1585, "step": 15010 }, { "epoch": 4.745280783508411, "grad_norm": 0.1925300643932596, "learning_rate": 7.862272209105625e-08, "loss": 0.1593, "step": 15020 }, { "epoch": 4.748440091619935, "grad_norm": 0.1928029167127188, "learning_rate": 7.668647288106012e-08, "loss": 0.1599, "step": 15030 }, { "epoch": 4.751599399731459, "grad_norm": 0.19481339189265057, "learning_rate": 7.47741797149415e-08, "loss": 0.1601, "step": 15040 }, { "epoch": 4.754758707842982, "grad_norm": 0.20271075417943374, "learning_rate": 7.288585189760944e-08, "loss": 0.1617, "step": 15050 }, { "epoch": 4.757918015954506, "grad_norm": 0.19802943106559054, "learning_rate": 7.102149861735962e-08, "loss": 0.1585, "step": 15060 }, { "epoch": 4.761077324066029, "grad_norm": 0.19714245747298037, "learning_rate": 6.918112894583328e-08, "loss": 0.1618, "step": 15070 }, { "epoch": 4.764236632177553, "grad_norm": 0.20555489147236597, "learning_rate": 6.736475183796887e-08, "loss": 0.1598, "step": 15080 }, { "epoch": 4.767395940289076, "grad_norm": 0.19422658967567297, "learning_rate": 6.557237613196321e-08, "loss": 0.1607, "step": 15090 }, { "epoch": 4.7705552484006, "grad_norm": 0.19909191153347178, "learning_rate": 6.380401054922547e-08, "loss": 0.1594, "step": 15100 }, { "epoch": 4.7737145565121235, "grad_norm": 0.18853836885626313, "learning_rate": 6.205966369433547e-08, "loss": 0.1607, "step": 15110 }, { "epoch": 4.7768738646236475, "grad_norm": 0.2001345804485893, "learning_rate": 6.033934405500042e-08, "loss": 0.1618, "step": 15120 }, { "epoch": 4.7800331727351715, "grad_norm": 0.20108034656670212, "learning_rate": 5.864306000201825e-08, "loss": 0.1625, "step": 15130 }, { "epoch": 4.783192480846695, "grad_norm": 0.1957272210969326, "learning_rate": 5.697081978922936e-08, "loss": 0.16, "step": 15140 }, { "epoch": 4.786351788958218, "grad_norm": 0.20101643906993924, "learning_rate": 5.5322631553484385e-08, "loss": 0.1587, "step": 15150 }, { "epoch": 4.789511097069742, "grad_norm": 0.1932795778151164, "learning_rate": 5.369850331459925e-08, "loss": 0.1609, "step": 15160 }, { "epoch": 4.792670405181266, "grad_norm": 0.2009306860005601, "learning_rate": 5.209844297531796e-08, "loss": 0.159, "step": 15170 }, { "epoch": 4.795829713292789, "grad_norm": 0.18836162035628687, "learning_rate": 5.052245832127434e-08, "loss": 0.1596, "step": 15180 }, { "epoch": 4.798989021404313, "grad_norm": 0.19233309651869823, "learning_rate": 4.8970557020954215e-08, "loss": 0.1614, "step": 15190 }, { "epoch": 4.802148329515836, "grad_norm": 0.20067963469184333, "learning_rate": 4.744274662565662e-08, "loss": 0.16, "step": 15200 }, { "epoch": 4.80530763762736, "grad_norm": 0.1907746195534059, "learning_rate": 4.5939034569458804e-08, "loss": 0.1595, "step": 15210 }, { "epoch": 4.808466945738883, "grad_norm": 0.18618207146361934, "learning_rate": 4.4459428169179583e-08, "loss": 0.1596, "step": 15220 }, { "epoch": 4.811626253850407, "grad_norm": 0.1924146366492045, "learning_rate": 4.3003934624342716e-08, "loss": 0.1581, "step": 15230 }, { "epoch": 4.81478556196193, "grad_norm": 0.19971943762565272, "learning_rate": 4.157256101714413e-08, "loss": 0.1577, "step": 15240 }, { "epoch": 4.817944870073454, "grad_norm": 0.19654001189058995, "learning_rate": 4.016531431241533e-08, "loss": 0.1588, "step": 15250 }, { "epoch": 4.821104178184978, "grad_norm": 0.20256447137055644, "learning_rate": 3.8782201357589475e-08, "loss": 0.1592, "step": 15260 }, { "epoch": 4.824263486296501, "grad_norm": 0.19955523886494098, "learning_rate": 3.742322888267036e-08, "loss": 0.159, "step": 15270 }, { "epoch": 4.827422794408025, "grad_norm": 0.20426304048843397, "learning_rate": 3.6088403500196267e-08, "loss": 0.1585, "step": 15280 }, { "epoch": 4.830582102519548, "grad_norm": 0.19123805822649273, "learning_rate": 3.4777731705211705e-08, "loss": 0.1599, "step": 15290 }, { "epoch": 4.833741410631072, "grad_norm": 0.1961561945761796, "learning_rate": 3.349121987523241e-08, "loss": 0.1603, "step": 15300 }, { "epoch": 4.836900718742595, "grad_norm": 0.19703665699422515, "learning_rate": 3.222887427021537e-08, "loss": 0.1584, "step": 15310 }, { "epoch": 4.840060026854119, "grad_norm": 0.205577530927033, "learning_rate": 3.099070103253055e-08, "loss": 0.1599, "step": 15320 }, { "epoch": 4.843219334965642, "grad_norm": 0.19037949799154982, "learning_rate": 2.977670618692641e-08, "loss": 0.1588, "step": 15330 }, { "epoch": 4.846378643077166, "grad_norm": 0.18634230770244323, "learning_rate": 2.8586895640504986e-08, "loss": 0.1589, "step": 15340 }, { "epoch": 4.849537951188689, "grad_norm": 0.19361637810104884, "learning_rate": 2.7421275182691887e-08, "loss": 0.1576, "step": 15350 }, { "epoch": 4.852697259300213, "grad_norm": 0.1957214714937506, "learning_rate": 2.6279850485206316e-08, "loss": 0.162, "step": 15360 }, { "epoch": 4.855856567411736, "grad_norm": 0.19434731826900606, "learning_rate": 2.5162627102035543e-08, "loss": 0.1607, "step": 15370 }, { "epoch": 4.85901587552326, "grad_norm": 0.19311901028457665, "learning_rate": 2.406961046940659e-08, "loss": 0.1597, "step": 15380 }, { "epoch": 4.862175183634784, "grad_norm": 0.19029235111786966, "learning_rate": 2.3000805905761814e-08, "loss": 0.1571, "step": 15390 }, { "epoch": 4.8653344917463075, "grad_norm": 0.18986353610183304, "learning_rate": 2.1956218611730028e-08, "loss": 0.1599, "step": 15400 }, { "epoch": 4.8684937998578315, "grad_norm": 0.19230056674947954, "learning_rate": 2.0935853670103202e-08, "loss": 0.1587, "step": 15410 }, { "epoch": 4.871653107969355, "grad_norm": 0.18970480828208378, "learning_rate": 1.9939716045811463e-08, "loss": 0.1601, "step": 15420 }, { "epoch": 4.874812416080879, "grad_norm": 0.19866752371194085, "learning_rate": 1.8967810585898695e-08, "loss": 0.162, "step": 15430 }, { "epoch": 4.877971724192402, "grad_norm": 0.19367079176529667, "learning_rate": 1.8020142019499755e-08, "loss": 0.159, "step": 15440 }, { "epoch": 4.881131032303926, "grad_norm": 0.19230026377170165, "learning_rate": 1.7096714957814953e-08, "loss": 0.1581, "step": 15450 }, { "epoch": 4.884290340415449, "grad_norm": 0.18516877508195054, "learning_rate": 1.619753389409062e-08, "loss": 0.1577, "step": 15460 }, { "epoch": 4.887449648526973, "grad_norm": 0.2030225188585794, "learning_rate": 1.5322603203595797e-08, "loss": 0.1584, "step": 15470 }, { "epoch": 4.890608956638496, "grad_norm": 0.19372717095423658, "learning_rate": 1.4471927143601127e-08, "loss": 0.1597, "step": 15480 }, { "epoch": 4.89376826475002, "grad_norm": 0.19197742339555968, "learning_rate": 1.3645509853357775e-08, "loss": 0.1568, "step": 15490 }, { "epoch": 4.896927572861543, "grad_norm": 0.2003946373624122, "learning_rate": 1.2843355354079102e-08, "loss": 0.1588, "step": 15500 }, { "epoch": 4.900086880973067, "grad_norm": 0.19449497450048703, "learning_rate": 1.2065467548917353e-08, "loss": 0.1563, "step": 15510 }, { "epoch": 4.903246189084591, "grad_norm": 0.19730672007764802, "learning_rate": 1.1311850222949227e-08, "loss": 0.1608, "step": 15520 }, { "epoch": 4.906405497196114, "grad_norm": 0.19426023326462366, "learning_rate": 1.0582507043153112e-08, "loss": 0.1587, "step": 15530 }, { "epoch": 4.909564805307638, "grad_norm": 0.18665531141294303, "learning_rate": 9.877441558395761e-09, "loss": 0.1588, "step": 15540 }, { "epoch": 4.912724113419161, "grad_norm": 0.19544733053145186, "learning_rate": 9.196657199410097e-09, "loss": 0.1585, "step": 15550 }, { "epoch": 4.915883421530685, "grad_norm": 0.1931838965682529, "learning_rate": 8.54015727878299e-09, "loss": 0.16, "step": 15560 }, { "epoch": 4.919042729642208, "grad_norm": 0.18609748141590166, "learning_rate": 7.90794499093639e-09, "loss": 0.1597, "step": 15570 }, { "epoch": 4.922202037753732, "grad_norm": 0.20127723441153517, "learning_rate": 7.300023412111779e-09, "loss": 0.1601, "step": 15580 }, { "epoch": 4.925361345865255, "grad_norm": 0.19920490400128663, "learning_rate": 6.716395500357964e-09, "loss": 0.1577, "step": 15590 }, { "epoch": 4.928520653976779, "grad_norm": 0.19375948427344167, "learning_rate": 6.157064095512754e-09, "loss": 0.1599, "step": 15600 }, { "epoch": 4.931679962088302, "grad_norm": 0.19750901689149744, "learning_rate": 5.622031919191862e-09, "loss": 0.1587, "step": 15610 }, { "epoch": 4.934839270199826, "grad_norm": 0.18708793754549835, "learning_rate": 5.1113015747755735e-09, "loss": 0.1601, "step": 15620 }, { "epoch": 4.937998578311349, "grad_norm": 0.1928749265530207, "learning_rate": 4.624875547394325e-09, "loss": 0.1589, "step": 15630 }, { "epoch": 4.941157886422873, "grad_norm": 0.20136315326321008, "learning_rate": 4.16275620391815e-09, "loss": 0.1597, "step": 15640 }, { "epoch": 4.944317194534397, "grad_norm": 0.19196622559471357, "learning_rate": 3.724945792945023e-09, "loss": 0.1594, "step": 15650 }, { "epoch": 4.94747650264592, "grad_norm": 0.19581601175358024, "learning_rate": 3.3114464447892013e-09, "loss": 0.1596, "step": 15660 }, { "epoch": 4.950635810757444, "grad_norm": 0.19173585748074187, "learning_rate": 2.922260171470681e-09, "loss": 0.159, "step": 15670 }, { "epoch": 4.9537951188689675, "grad_norm": 0.19350532117294958, "learning_rate": 2.5573888667079772e-09, "loss": 0.1616, "step": 15680 }, { "epoch": 4.9569544269804915, "grad_norm": 0.19845655941270632, "learning_rate": 2.2168343059042475e-09, "loss": 0.1588, "step": 15690 }, { "epoch": 4.960113735092015, "grad_norm": 0.19860632734242958, "learning_rate": 1.9005981461434065e-09, "loss": 0.1609, "step": 15700 }, { "epoch": 4.963273043203539, "grad_norm": 0.2001952094467118, "learning_rate": 1.6086819261790232e-09, "loss": 0.1585, "step": 15710 }, { "epoch": 4.966432351315062, "grad_norm": 0.1971085780740881, "learning_rate": 1.3410870664276598e-09, "loss": 0.1583, "step": 15720 }, { "epoch": 4.969591659426586, "grad_norm": 0.19391119108700927, "learning_rate": 1.0978148689633205e-09, "loss": 0.1594, "step": 15730 }, { "epoch": 4.97275096753811, "grad_norm": 0.19649511560351368, "learning_rate": 8.788665175085697e-10, "loss": 0.1603, "step": 15740 }, { "epoch": 4.975910275649633, "grad_norm": 0.19371704255850056, "learning_rate": 6.842430774300913e-10, "loss": 0.1588, "step": 15750 }, { "epoch": 4.979069583761156, "grad_norm": 0.20144520587590042, "learning_rate": 5.139454957342471e-10, "loss": 0.1586, "step": 15760 }, { "epoch": 4.98222889187268, "grad_norm": 0.18820551886405948, "learning_rate": 3.6797460106152707e-10, "loss": 0.1585, "step": 15770 }, { "epoch": 4.985388199984204, "grad_norm": 0.1989233181042242, "learning_rate": 2.463311036826621e-10, "loss": 0.1583, "step": 15780 }, { "epoch": 4.988547508095727, "grad_norm": 0.1988452001930714, "learning_rate": 1.490155954947392e-10, "loss": 0.159, "step": 15790 }, { "epoch": 4.991706816207251, "grad_norm": 0.19747851657350868, "learning_rate": 7.602855001953569e-11, "loss": 0.1593, "step": 15800 }, { "epoch": 4.994866124318774, "grad_norm": 0.19954559773754327, "learning_rate": 2.7370322400188665e-11, "loss": 0.1579, "step": 15810 }, { "epoch": 4.998025432430298, "grad_norm": 0.1940730914471369, "learning_rate": 3.041149399529708e-12, "loss": 0.1603, "step": 15820 } ], "logging_steps": 10, "max_steps": 15825, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.002555364979507e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }