{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 1000, "global_step": 7385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006770480704129994, "grad_norm": 2.130030393600464, "learning_rate": 2.7063599458728013e-06, "loss": 2.3319, "step": 10 }, { "epoch": 0.013540961408259987, "grad_norm": 2.666555881500244, "learning_rate": 5.4127198917456026e-06, "loss": 2.3443, "step": 20 }, { "epoch": 0.020311442112389978, "grad_norm": 2.274488687515259, "learning_rate": 8.119079837618404e-06, "loss": 2.3759, "step": 30 }, { "epoch": 0.027081922816519974, "grad_norm": 2.197918653488159, "learning_rate": 1.0825439783491205e-05, "loss": 2.1286, "step": 40 }, { "epoch": 0.033852403520649964, "grad_norm": 2.2513201236724854, "learning_rate": 1.3531799729364006e-05, "loss": 1.9161, "step": 50 }, { "epoch": 0.040622884224779957, "grad_norm": 1.52046537399292, "learning_rate": 1.6238159675236808e-05, "loss": 1.6287, "step": 60 }, { "epoch": 0.04739336492890995, "grad_norm": 1.0912840366363525, "learning_rate": 1.894451962110961e-05, "loss": 1.5206, "step": 70 }, { "epoch": 0.05416384563303995, "grad_norm": 1.050105333328247, "learning_rate": 2.165087956698241e-05, "loss": 1.3484, "step": 80 }, { "epoch": 0.06093432633716994, "grad_norm": 1.138007402420044, "learning_rate": 2.435723951285521e-05, "loss": 1.3352, "step": 90 }, { "epoch": 0.06770480704129993, "grad_norm": 1.0807892084121704, "learning_rate": 2.7063599458728013e-05, "loss": 1.2605, "step": 100 }, { "epoch": 0.07447528774542993, "grad_norm": 1.1421936750411987, "learning_rate": 2.976995940460081e-05, "loss": 1.1888, "step": 110 }, { "epoch": 0.08124576844955991, "grad_norm": 1.2684075832366943, "learning_rate": 3.2476319350473615e-05, "loss": 1.1998, "step": 120 }, { "epoch": 0.08801624915368991, "grad_norm": 1.1413911581039429, "learning_rate": 3.518267929634642e-05, "loss": 1.1426, "step": 130 }, { "epoch": 0.0947867298578199, "grad_norm": 1.3954917192459106, "learning_rate": 3.788903924221922e-05, "loss": 1.1437, "step": 140 }, { "epoch": 0.1015572105619499, "grad_norm": 1.2118768692016602, "learning_rate": 4.059539918809202e-05, "loss": 1.0564, "step": 150 }, { "epoch": 0.1083276912660799, "grad_norm": 1.4291969537734985, "learning_rate": 4.330175913396482e-05, "loss": 1.0382, "step": 160 }, { "epoch": 0.11509817197020988, "grad_norm": 1.351151943206787, "learning_rate": 4.600811907983762e-05, "loss": 1.0717, "step": 170 }, { "epoch": 0.12186865267433988, "grad_norm": 1.3836501836776733, "learning_rate": 4.871447902571042e-05, "loss": 1.0294, "step": 180 }, { "epoch": 0.12863913337846988, "grad_norm": 1.2129018306732178, "learning_rate": 5.142083897158322e-05, "loss": 1.0081, "step": 190 }, { "epoch": 0.13540961408259986, "grad_norm": 1.244095802307129, "learning_rate": 5.4127198917456026e-05, "loss": 0.9383, "step": 200 }, { "epoch": 0.14218009478672985, "grad_norm": 1.3957242965698242, "learning_rate": 5.683355886332883e-05, "loss": 0.927, "step": 210 }, { "epoch": 0.14895057549085985, "grad_norm": 1.688636302947998, "learning_rate": 5.953991880920162e-05, "loss": 0.9617, "step": 220 }, { "epoch": 0.15572105619498985, "grad_norm": 1.376826524734497, "learning_rate": 6.224627875507443e-05, "loss": 1.0176, "step": 230 }, { "epoch": 0.16249153689911983, "grad_norm": 1.4289461374282837, "learning_rate": 6.495263870094723e-05, "loss": 0.9733, "step": 240 }, { "epoch": 0.16926201760324983, "grad_norm": 1.4132306575775146, "learning_rate": 6.765899864682003e-05, "loss": 1.0141, "step": 250 }, { "epoch": 0.17603249830737983, "grad_norm": 1.482531189918518, "learning_rate": 7.036535859269283e-05, "loss": 0.977, "step": 260 }, { "epoch": 0.18280297901150983, "grad_norm": 1.509128212928772, "learning_rate": 7.307171853856563e-05, "loss": 0.9624, "step": 270 }, { "epoch": 0.1895734597156398, "grad_norm": 1.7142691612243652, "learning_rate": 7.577807848443844e-05, "loss": 1.0063, "step": 280 }, { "epoch": 0.1963439404197698, "grad_norm": 1.2345936298370361, "learning_rate": 7.848443843031124e-05, "loss": 0.9562, "step": 290 }, { "epoch": 0.2031144211238998, "grad_norm": 1.4808542728424072, "learning_rate": 8.119079837618404e-05, "loss": 1.0207, "step": 300 }, { "epoch": 0.2098849018280298, "grad_norm": 0.9802400469779968, "learning_rate": 8.389715832205684e-05, "loss": 0.9731, "step": 310 }, { "epoch": 0.2166553825321598, "grad_norm": 1.2837491035461426, "learning_rate": 8.660351826792964e-05, "loss": 0.9732, "step": 320 }, { "epoch": 0.22342586323628977, "grad_norm": 1.6100679636001587, "learning_rate": 8.930987821380244e-05, "loss": 0.9645, "step": 330 }, { "epoch": 0.23019634394041977, "grad_norm": 1.65373957157135, "learning_rate": 9.201623815967524e-05, "loss": 0.9825, "step": 340 }, { "epoch": 0.23696682464454977, "grad_norm": 1.4988625049591064, "learning_rate": 9.472259810554804e-05, "loss": 0.9521, "step": 350 }, { "epoch": 0.24373730534867977, "grad_norm": 1.0492310523986816, "learning_rate": 9.742895805142085e-05, "loss": 0.9418, "step": 360 }, { "epoch": 0.25050778605280977, "grad_norm": 1.26401948928833, "learning_rate": 0.00010013531799729365, "loss": 1.0314, "step": 370 }, { "epoch": 0.25727826675693977, "grad_norm": 1.3206366300582886, "learning_rate": 0.00010284167794316644, "loss": 0.9194, "step": 380 }, { "epoch": 0.2640487474610697, "grad_norm": 1.533471941947937, "learning_rate": 0.00010554803788903924, "loss": 0.9, "step": 390 }, { "epoch": 0.2708192281651997, "grad_norm": 1.2870343923568726, "learning_rate": 0.00010825439783491205, "loss": 0.911, "step": 400 }, { "epoch": 0.2775897088693297, "grad_norm": 1.3480168581008911, "learning_rate": 0.00011096075778078485, "loss": 0.9127, "step": 410 }, { "epoch": 0.2843601895734597, "grad_norm": 1.1548075675964355, "learning_rate": 0.00011366711772665765, "loss": 0.9206, "step": 420 }, { "epoch": 0.2911306702775897, "grad_norm": 1.000781536102295, "learning_rate": 0.00011637347767253047, "loss": 0.9248, "step": 430 }, { "epoch": 0.2979011509817197, "grad_norm": 1.0907179117202759, "learning_rate": 0.00011907983761840324, "loss": 0.897, "step": 440 }, { "epoch": 0.3046716316858497, "grad_norm": 1.3253204822540283, "learning_rate": 0.00012178619756427604, "loss": 0.9503, "step": 450 }, { "epoch": 0.3114421123899797, "grad_norm": 1.186468482017517, "learning_rate": 0.00012449255751014886, "loss": 0.885, "step": 460 }, { "epoch": 0.3182125930941097, "grad_norm": 1.0382546186447144, "learning_rate": 0.00012719891745602166, "loss": 0.937, "step": 470 }, { "epoch": 0.32498307379823965, "grad_norm": 0.9156469702720642, "learning_rate": 0.00012990527740189446, "loss": 0.9407, "step": 480 }, { "epoch": 0.33175355450236965, "grad_norm": 1.2555314302444458, "learning_rate": 0.00013261163734776726, "loss": 0.9349, "step": 490 }, { "epoch": 0.33852403520649965, "grad_norm": 1.1427136659622192, "learning_rate": 0.00013531799729364006, "loss": 0.9034, "step": 500 }, { "epoch": 0.34529451591062965, "grad_norm": 0.9024341106414795, "learning_rate": 0.00013802435723951287, "loss": 0.8431, "step": 510 }, { "epoch": 0.35206499661475965, "grad_norm": 1.0170283317565918, "learning_rate": 0.00014073071718538567, "loss": 0.9392, "step": 520 }, { "epoch": 0.35883547731888965, "grad_norm": 0.9581354856491089, "learning_rate": 0.00014343707713125847, "loss": 0.9557, "step": 530 }, { "epoch": 0.36560595802301965, "grad_norm": 1.1668641567230225, "learning_rate": 0.00014614343707713127, "loss": 0.8982, "step": 540 }, { "epoch": 0.37237643872714965, "grad_norm": 1.249225378036499, "learning_rate": 0.00014884979702300404, "loss": 0.8719, "step": 550 }, { "epoch": 0.3791469194312796, "grad_norm": 0.8681928515434265, "learning_rate": 0.00015155615696887687, "loss": 0.9412, "step": 560 }, { "epoch": 0.3859174001354096, "grad_norm": 0.8795790672302246, "learning_rate": 0.00015426251691474967, "loss": 0.9476, "step": 570 }, { "epoch": 0.3926878808395396, "grad_norm": 1.2251633405685425, "learning_rate": 0.00015696887686062247, "loss": 0.9401, "step": 580 }, { "epoch": 0.3994583615436696, "grad_norm": 0.9845913052558899, "learning_rate": 0.00015967523680649528, "loss": 0.8447, "step": 590 }, { "epoch": 0.4062288422477996, "grad_norm": 1.3847956657409668, "learning_rate": 0.00016238159675236808, "loss": 0.9562, "step": 600 }, { "epoch": 0.4129993229519296, "grad_norm": 0.9039000272750854, "learning_rate": 0.00016508795669824085, "loss": 0.8706, "step": 610 }, { "epoch": 0.4197698036560596, "grad_norm": 0.8315423130989075, "learning_rate": 0.00016779431664411368, "loss": 0.9437, "step": 620 }, { "epoch": 0.4265402843601896, "grad_norm": 0.8760778903961182, "learning_rate": 0.00017050067658998648, "loss": 0.9078, "step": 630 }, { "epoch": 0.4333107650643196, "grad_norm": 1.0592724084854126, "learning_rate": 0.00017320703653585928, "loss": 0.8835, "step": 640 }, { "epoch": 0.44008124576844954, "grad_norm": 0.8527820706367493, "learning_rate": 0.00017591339648173208, "loss": 0.9088, "step": 650 }, { "epoch": 0.44685172647257954, "grad_norm": 0.8774325847625732, "learning_rate": 0.00017861975642760488, "loss": 0.8967, "step": 660 }, { "epoch": 0.45362220717670954, "grad_norm": 0.6633328795433044, "learning_rate": 0.00018132611637347766, "loss": 0.9158, "step": 670 }, { "epoch": 0.46039268788083954, "grad_norm": 0.7048283219337463, "learning_rate": 0.0001840324763193505, "loss": 0.872, "step": 680 }, { "epoch": 0.46716316858496953, "grad_norm": 0.8527712225914001, "learning_rate": 0.0001867388362652233, "loss": 0.9062, "step": 690 }, { "epoch": 0.47393364928909953, "grad_norm": 1.095738172531128, "learning_rate": 0.0001894451962110961, "loss": 0.89, "step": 700 }, { "epoch": 0.48070412999322953, "grad_norm": 0.8880236148834229, "learning_rate": 0.0001921515561569689, "loss": 0.8825, "step": 710 }, { "epoch": 0.48747461069735953, "grad_norm": 0.7381774187088013, "learning_rate": 0.0001948579161028417, "loss": 0.8121, "step": 720 }, { "epoch": 0.4942450914014895, "grad_norm": 0.9708958864212036, "learning_rate": 0.0001975642760487145, "loss": 0.8458, "step": 730 }, { "epoch": 0.5010155721056195, "grad_norm": 1.0069886445999146, "learning_rate": 0.00019999998882753333, "loss": 0.8679, "step": 740 }, { "epoch": 0.5077860528097495, "grad_norm": 0.8364754915237427, "learning_rate": 0.00019999864813455363, "loss": 0.8797, "step": 750 }, { "epoch": 0.5145565335138795, "grad_norm": 0.8467391133308411, "learning_rate": 0.0001999950729825663, "loss": 0.8789, "step": 760 }, { "epoch": 0.5213270142180095, "grad_norm": 0.749064028263092, "learning_rate": 0.00019998926345145775, "loss": 0.9156, "step": 770 }, { "epoch": 0.5280974949221394, "grad_norm": 0.7991885542869568, "learning_rate": 0.00019998121967104132, "loss": 0.919, "step": 780 }, { "epoch": 0.5348679756262694, "grad_norm": 0.8024610877037048, "learning_rate": 0.00019997094182105447, "loss": 0.8619, "step": 790 }, { "epoch": 0.5416384563303994, "grad_norm": 0.8949725031852722, "learning_rate": 0.00019995843013115454, "loss": 0.86, "step": 800 }, { "epoch": 0.5484089370345294, "grad_norm": 0.9048612713813782, "learning_rate": 0.00019994368488091398, "loss": 0.9258, "step": 810 }, { "epoch": 0.5551794177386594, "grad_norm": 1.112876057624817, "learning_rate": 0.00019992670639981376, "loss": 0.8758, "step": 820 }, { "epoch": 0.5619498984427894, "grad_norm": 0.9120655059814453, "learning_rate": 0.00019990749506723624, "loss": 0.9112, "step": 830 }, { "epoch": 0.5687203791469194, "grad_norm": 0.9125117063522339, "learning_rate": 0.00019988605131245662, "loss": 0.899, "step": 840 }, { "epoch": 0.5754908598510494, "grad_norm": 0.8011307716369629, "learning_rate": 0.00019986237561463318, "loss": 0.8604, "step": 850 }, { "epoch": 0.5822613405551794, "grad_norm": 0.7512729167938232, "learning_rate": 0.00019983646850279692, "loss": 0.8411, "step": 860 }, { "epoch": 0.5890318212593094, "grad_norm": 0.7400951981544495, "learning_rate": 0.0001998083305558394, "loss": 0.9106, "step": 870 }, { "epoch": 0.5958023019634394, "grad_norm": 0.8688220381736755, "learning_rate": 0.00019977796240250008, "loss": 0.9071, "step": 880 }, { "epoch": 0.6025727826675694, "grad_norm": 0.9177795052528381, "learning_rate": 0.00019974536472135203, "loss": 0.9038, "step": 890 }, { "epoch": 0.6093432633716994, "grad_norm": 0.986629843711853, "learning_rate": 0.00019971053824078693, "loss": 0.8832, "step": 900 }, { "epoch": 0.6161137440758294, "grad_norm": 0.7033129334449768, "learning_rate": 0.00019967348373899868, "loss": 0.845, "step": 910 }, { "epoch": 0.6228842247799594, "grad_norm": 0.8107329607009888, "learning_rate": 0.0001996342020439662, "loss": 0.9287, "step": 920 }, { "epoch": 0.6296547054840894, "grad_norm": 0.7914236783981323, "learning_rate": 0.00019959269403343474, "loss": 0.8836, "step": 930 }, { "epoch": 0.6364251861882194, "grad_norm": 0.8895307183265686, "learning_rate": 0.00019954896063489622, "loss": 0.8759, "step": 940 }, { "epoch": 0.6431956668923493, "grad_norm": 0.8289987444877625, "learning_rate": 0.0001995030028255688, "loss": 0.9136, "step": 950 }, { "epoch": 0.6499661475964793, "grad_norm": 0.9810376167297363, "learning_rate": 0.00019945482163237472, "loss": 0.8388, "step": 960 }, { "epoch": 0.6567366283006093, "grad_norm": 0.7306379079818726, "learning_rate": 0.0001994044181319176, "loss": 0.8804, "step": 970 }, { "epoch": 0.6635071090047393, "grad_norm": 0.7892174124717712, "learning_rate": 0.00019935179345045815, "loss": 0.8671, "step": 980 }, { "epoch": 0.6702775897088693, "grad_norm": 0.9007791876792908, "learning_rate": 0.0001992969487638893, "loss": 0.8661, "step": 990 }, { "epoch": 0.6770480704129993, "grad_norm": 0.7324849963188171, "learning_rate": 0.00019923988529770958, "loss": 0.7901, "step": 1000 }, { "epoch": 0.6770480704129993, "eval_loss": 0.8919770121574402, "eval_runtime": 23.6227, "eval_samples_per_second": 105.323, "eval_steps_per_second": 13.165, "step": 1000 }, { "epoch": 0.6838185511171293, "grad_norm": 0.8670386672019958, "learning_rate": 0.000199180604326996, "loss": 0.8084, "step": 1010 }, { "epoch": 0.6905890318212593, "grad_norm": 1.3103822469711304, "learning_rate": 0.00019911910717637548, "loss": 0.8708, "step": 1020 }, { "epoch": 0.6973595125253893, "grad_norm": 0.8602836728096008, "learning_rate": 0.00019905539521999517, "loss": 0.8608, "step": 1030 }, { "epoch": 0.7041299932295193, "grad_norm": 0.7158609628677368, "learning_rate": 0.00019898946988149193, "loss": 0.9042, "step": 1040 }, { "epoch": 0.7109004739336493, "grad_norm": 0.6975676417350769, "learning_rate": 0.0001989213326339603, "loss": 0.8896, "step": 1050 }, { "epoch": 0.7176709546377793, "grad_norm": 0.7300527095794678, "learning_rate": 0.00019885098499991972, "loss": 0.8685, "step": 1060 }, { "epoch": 0.7244414353419093, "grad_norm": 0.6200681924819946, "learning_rate": 0.0001987784285512805, "loss": 0.8615, "step": 1070 }, { "epoch": 0.7312119160460393, "grad_norm": 0.7945191860198975, "learning_rate": 0.00019870366490930868, "loss": 0.8786, "step": 1080 }, { "epoch": 0.7379823967501693, "grad_norm": 0.6641054749488831, "learning_rate": 0.0001986266957445897, "loss": 0.8872, "step": 1090 }, { "epoch": 0.7447528774542993, "grad_norm": 0.7063596844673157, "learning_rate": 0.00019854752277699138, "loss": 0.8544, "step": 1100 }, { "epoch": 0.7515233581584293, "grad_norm": 0.6685433983802795, "learning_rate": 0.000198466147775625, "loss": 0.8256, "step": 1110 }, { "epoch": 0.7582938388625592, "grad_norm": 0.6927530765533447, "learning_rate": 0.00019838257255880626, "loss": 0.8642, "step": 1120 }, { "epoch": 0.7650643195666892, "grad_norm": 0.7018571496009827, "learning_rate": 0.00019829679899401436, "loss": 0.8624, "step": 1130 }, { "epoch": 0.7718348002708192, "grad_norm": 0.8826500773429871, "learning_rate": 0.00019820882899785038, "loss": 0.8312, "step": 1140 }, { "epoch": 0.7786052809749492, "grad_norm": 0.9699224233627319, "learning_rate": 0.00019811866453599435, "loss": 0.8467, "step": 1150 }, { "epoch": 0.7853757616790792, "grad_norm": 0.7322418689727783, "learning_rate": 0.00019802630762316145, "loss": 0.8456, "step": 1160 }, { "epoch": 0.7921462423832092, "grad_norm": 0.768301248550415, "learning_rate": 0.00019793176032305697, "loss": 0.8391, "step": 1170 }, { "epoch": 0.7989167230873392, "grad_norm": 0.8243605494499207, "learning_rate": 0.00019783502474833009, "loss": 0.904, "step": 1180 }, { "epoch": 0.8056872037914692, "grad_norm": 0.7215325236320496, "learning_rate": 0.00019773610306052683, "loss": 0.8494, "step": 1190 }, { "epoch": 0.8124576844955992, "grad_norm": 0.7619712948799133, "learning_rate": 0.00019763499747004165, "loss": 0.8865, "step": 1200 }, { "epoch": 0.8192281651997292, "grad_norm": 0.835599958896637, "learning_rate": 0.000197531710236068, "loss": 0.8733, "step": 1210 }, { "epoch": 0.8259986459038592, "grad_norm": 0.8382962942123413, "learning_rate": 0.00019742624366654802, "loss": 0.9122, "step": 1220 }, { "epoch": 0.8327691266079892, "grad_norm": 0.666801393032074, "learning_rate": 0.00019731860011812087, "loss": 0.8429, "step": 1230 }, { "epoch": 0.8395396073121192, "grad_norm": 0.7756575345993042, "learning_rate": 0.00019720878199606996, "loss": 0.9004, "step": 1240 }, { "epoch": 0.8463100880162492, "grad_norm": 0.7014258503913879, "learning_rate": 0.00019709679175426942, "loss": 0.9241, "step": 1250 }, { "epoch": 0.8530805687203792, "grad_norm": 0.6827540397644043, "learning_rate": 0.00019698263189512914, "loss": 0.8566, "step": 1260 }, { "epoch": 0.8598510494245092, "grad_norm": 0.9167826771736145, "learning_rate": 0.00019686630496953882, "loss": 0.9116, "step": 1270 }, { "epoch": 0.8666215301286392, "grad_norm": 0.8172047138214111, "learning_rate": 0.00019674781357681108, "loss": 0.8052, "step": 1280 }, { "epoch": 0.8733920108327691, "grad_norm": 0.7139961123466492, "learning_rate": 0.00019662716036462335, "loss": 0.89, "step": 1290 }, { "epoch": 0.8801624915368991, "grad_norm": 0.9733943939208984, "learning_rate": 0.0001965043480289586, "loss": 0.8191, "step": 1300 }, { "epoch": 0.8869329722410291, "grad_norm": 0.849946916103363, "learning_rate": 0.00019637937931404523, "loss": 0.8995, "step": 1310 }, { "epoch": 0.8937034529451591, "grad_norm": 0.6809601187705994, "learning_rate": 0.00019625225701229573, "loss": 0.8582, "step": 1320 }, { "epoch": 0.9004739336492891, "grad_norm": 0.7891602516174316, "learning_rate": 0.00019612298396424417, "loss": 0.844, "step": 1330 }, { "epoch": 0.9072444143534191, "grad_norm": 0.6357580423355103, "learning_rate": 0.0001959915630584829, "loss": 0.8609, "step": 1340 }, { "epoch": 0.9140148950575491, "grad_norm": 0.9102625846862793, "learning_rate": 0.00019585799723159788, "loss": 0.91, "step": 1350 }, { "epoch": 0.9207853757616791, "grad_norm": 0.690881609916687, "learning_rate": 0.0001957222894681031, "loss": 0.8287, "step": 1360 }, { "epoch": 0.9275558564658091, "grad_norm": 0.6755393743515015, "learning_rate": 0.00019558444280037393, "loss": 0.7931, "step": 1370 }, { "epoch": 0.9343263371699391, "grad_norm": 0.6997596025466919, "learning_rate": 0.00019544446030857922, "loss": 0.8941, "step": 1380 }, { "epoch": 0.9410968178740691, "grad_norm": 0.8115108013153076, "learning_rate": 0.0001953023451206127, "loss": 0.8674, "step": 1390 }, { "epoch": 0.9478672985781991, "grad_norm": 0.6413692235946655, "learning_rate": 0.00019515810041202295, "loss": 0.8462, "step": 1400 }, { "epoch": 0.9546377792823291, "grad_norm": 0.6888745427131653, "learning_rate": 0.00019501172940594242, "loss": 0.8594, "step": 1410 }, { "epoch": 0.9614082599864591, "grad_norm": 0.8250995874404907, "learning_rate": 0.00019486323537301538, "loss": 0.8622, "step": 1420 }, { "epoch": 0.9681787406905891, "grad_norm": 0.7127440571784973, "learning_rate": 0.00019471262163132504, "loss": 0.8626, "step": 1430 }, { "epoch": 0.9749492213947191, "grad_norm": 0.6688849925994873, "learning_rate": 0.0001945598915463192, "loss": 0.871, "step": 1440 }, { "epoch": 0.9817197020988491, "grad_norm": 0.8800045251846313, "learning_rate": 0.00019440504853073516, "loss": 0.8555, "step": 1450 }, { "epoch": 0.988490182802979, "grad_norm": 0.7973435521125793, "learning_rate": 0.00019424809604452338, "loss": 0.826, "step": 1460 }, { "epoch": 0.995260663507109, "grad_norm": 0.7803165316581726, "learning_rate": 0.00019408903759477025, "loss": 0.8657, "step": 1470 }, { "epoch": 1.002031144211239, "grad_norm": 0.9152759313583374, "learning_rate": 0.00019392787673561964, "loss": 0.8114, "step": 1480 }, { "epoch": 1.008801624915369, "grad_norm": 0.717939555644989, "learning_rate": 0.00019376461706819358, "loss": 0.7081, "step": 1490 }, { "epoch": 1.015572105619499, "grad_norm": 0.8752790093421936, "learning_rate": 0.00019359926224051178, "loss": 0.697, "step": 1500 }, { "epoch": 1.022342586323629, "grad_norm": 0.7938421368598938, "learning_rate": 0.00019343181594740996, "loss": 0.7743, "step": 1510 }, { "epoch": 1.029113067027759, "grad_norm": 0.8380940556526184, "learning_rate": 0.00019326228193045753, "loss": 0.7965, "step": 1520 }, { "epoch": 1.035883547731889, "grad_norm": 0.8056864142417908, "learning_rate": 0.00019309066397787378, "loss": 0.7399, "step": 1530 }, { "epoch": 1.042654028436019, "grad_norm": 0.9307854771614075, "learning_rate": 0.0001929169659244434, "loss": 0.7503, "step": 1540 }, { "epoch": 1.0494245091401488, "grad_norm": 0.8573846220970154, "learning_rate": 0.00019274119165143064, "loss": 0.7867, "step": 1550 }, { "epoch": 1.0561949898442788, "grad_norm": 0.7639918327331543, "learning_rate": 0.00019256334508649262, "loss": 0.7303, "step": 1560 }, { "epoch": 1.0629654705484088, "grad_norm": 0.7085719704627991, "learning_rate": 0.00019238343020359174, "loss": 0.7375, "step": 1570 }, { "epoch": 1.0697359512525388, "grad_norm": 0.8645661473274231, "learning_rate": 0.00019220145102290658, "loss": 0.7569, "step": 1580 }, { "epoch": 1.0765064319566688, "grad_norm": 0.8893268704414368, "learning_rate": 0.00019201741161074234, "loss": 0.7594, "step": 1590 }, { "epoch": 1.0832769126607988, "grad_norm": 0.9011455774307251, "learning_rate": 0.00019183131607943983, "loss": 0.7721, "step": 1600 }, { "epoch": 1.0900473933649288, "grad_norm": 0.812759518623352, "learning_rate": 0.00019164316858728364, "loss": 0.6816, "step": 1610 }, { "epoch": 1.0968178740690588, "grad_norm": 0.7881085276603699, "learning_rate": 0.00019145297333840916, "loss": 0.7927, "step": 1620 }, { "epoch": 1.1035883547731888, "grad_norm": 0.9383792281150818, "learning_rate": 0.00019126073458270874, "loss": 0.8416, "step": 1630 }, { "epoch": 1.1103588354773188, "grad_norm": 0.8487265110015869, "learning_rate": 0.00019106645661573667, "loss": 0.7731, "step": 1640 }, { "epoch": 1.1171293161814488, "grad_norm": 1.061084270477295, "learning_rate": 0.0001908701437786131, "loss": 0.7954, "step": 1650 }, { "epoch": 1.1238997968855788, "grad_norm": 0.7608863115310669, "learning_rate": 0.00019067180045792724, "loss": 0.7224, "step": 1660 }, { "epoch": 1.1306702775897088, "grad_norm": 1.0351011753082275, "learning_rate": 0.0001904714310856392, "loss": 0.7761, "step": 1670 }, { "epoch": 1.1374407582938388, "grad_norm": 0.8522539138793945, "learning_rate": 0.00019026904013898097, "loss": 0.7552, "step": 1680 }, { "epoch": 1.1442112389979688, "grad_norm": 0.9050424098968506, "learning_rate": 0.00019006463214035646, "loss": 0.7458, "step": 1690 }, { "epoch": 1.1509817197020988, "grad_norm": 1.0837703943252563, "learning_rate": 0.00018985821165724034, "loss": 0.7811, "step": 1700 }, { "epoch": 1.1577522004062288, "grad_norm": 0.7830744385719299, "learning_rate": 0.00018964978330207605, "loss": 0.7596, "step": 1710 }, { "epoch": 1.1645226811103588, "grad_norm": 0.8530306220054626, "learning_rate": 0.0001894393517321727, "loss": 0.7075, "step": 1720 }, { "epoch": 1.1712931618144888, "grad_norm": 0.9117756485939026, "learning_rate": 0.00018922692164960098, "loss": 0.7585, "step": 1730 }, { "epoch": 1.1780636425186188, "grad_norm": 0.9983711242675781, "learning_rate": 0.00018901249780108823, "loss": 0.7459, "step": 1740 }, { "epoch": 1.1848341232227488, "grad_norm": 0.9291015267372131, "learning_rate": 0.00018879608497791224, "loss": 0.7271, "step": 1750 }, { "epoch": 1.1916046039268788, "grad_norm": 1.0468007326126099, "learning_rate": 0.00018857768801579415, "loss": 0.7932, "step": 1760 }, { "epoch": 1.1983750846310088, "grad_norm": 0.8586043119430542, "learning_rate": 0.00018835731179479056, "loss": 0.8144, "step": 1770 }, { "epoch": 1.2051455653351388, "grad_norm": 0.7450950741767883, "learning_rate": 0.00018813496123918432, "loss": 0.7402, "step": 1780 }, { "epoch": 1.2119160460392688, "grad_norm": 0.9340034127235413, "learning_rate": 0.00018791064131737462, "loss": 0.7852, "step": 1790 }, { "epoch": 1.2186865267433988, "grad_norm": 0.9052138328552246, "learning_rate": 0.00018768435704176597, "loss": 0.7128, "step": 1800 }, { "epoch": 1.2254570074475288, "grad_norm": 0.8574148416519165, "learning_rate": 0.00018745611346865606, "loss": 0.7488, "step": 1810 }, { "epoch": 1.2322274881516588, "grad_norm": 1.0493452548980713, "learning_rate": 0.00018722591569812294, "loss": 0.8368, "step": 1820 }, { "epoch": 1.2389979688557888, "grad_norm": 1.019943356513977, "learning_rate": 0.00018699376887391093, "loss": 0.8279, "step": 1830 }, { "epoch": 1.2457684495599188, "grad_norm": 0.9113163352012634, "learning_rate": 0.0001867596781833158, "loss": 0.7308, "step": 1840 }, { "epoch": 1.2525389302640488, "grad_norm": 0.9192100763320923, "learning_rate": 0.0001865236488570688, "loss": 0.783, "step": 1850 }, { "epoch": 1.2593094109681786, "grad_norm": 0.8824251294136047, "learning_rate": 0.00018628568616921976, "loss": 0.7581, "step": 1860 }, { "epoch": 1.2660798916723088, "grad_norm": 0.8410795331001282, "learning_rate": 0.00018604579543701926, "loss": 0.7696, "step": 1870 }, { "epoch": 1.2728503723764386, "grad_norm": 1.0213907957077026, "learning_rate": 0.00018580398202079987, "loss": 0.7202, "step": 1880 }, { "epoch": 1.2796208530805688, "grad_norm": 0.7865493297576904, "learning_rate": 0.00018556025132385626, "loss": 0.7685, "step": 1890 }, { "epoch": 1.2863913337846986, "grad_norm": 0.9204791784286499, "learning_rate": 0.00018531460879232456, "loss": 0.7814, "step": 1900 }, { "epoch": 1.2931618144888288, "grad_norm": 0.810883104801178, "learning_rate": 0.00018506705991506067, "loss": 0.7202, "step": 1910 }, { "epoch": 1.2999322951929586, "grad_norm": 0.8419713973999023, "learning_rate": 0.00018481761022351757, "loss": 0.785, "step": 1920 }, { "epoch": 1.3067027758970888, "grad_norm": 0.8345950245857239, "learning_rate": 0.0001845662652916217, "loss": 0.7693, "step": 1930 }, { "epoch": 1.3134732566012186, "grad_norm": 0.8708229660987854, "learning_rate": 0.00018431303073564842, "loss": 0.8127, "step": 1940 }, { "epoch": 1.3202437373053486, "grad_norm": 0.800879716873169, "learning_rate": 0.0001840579122140966, "loss": 0.7804, "step": 1950 }, { "epoch": 1.3270142180094786, "grad_norm": 0.8764187097549438, "learning_rate": 0.00018380091542756212, "loss": 0.7563, "step": 1960 }, { "epoch": 1.3337846987136086, "grad_norm": 0.9371510744094849, "learning_rate": 0.00018354204611861042, "loss": 0.7382, "step": 1970 }, { "epoch": 1.3405551794177386, "grad_norm": 0.9174867868423462, "learning_rate": 0.00018328131007164827, "loss": 0.7543, "step": 1980 }, { "epoch": 1.3473256601218686, "grad_norm": 0.9580458998680115, "learning_rate": 0.00018301871311279455, "loss": 0.7877, "step": 1990 }, { "epoch": 1.3540961408259986, "grad_norm": 0.8264724016189575, "learning_rate": 0.00018275426110975, "loss": 0.7599, "step": 2000 }, { "epoch": 1.3540961408259986, "eval_loss": 0.8573334813117981, "eval_runtime": 23.1617, "eval_samples_per_second": 107.419, "eval_steps_per_second": 13.427, "step": 2000 }, { "epoch": 1.3608666215301286, "grad_norm": 0.8695821762084961, "learning_rate": 0.00018248795997166607, "loss": 0.772, "step": 2010 }, { "epoch": 1.3676371022342586, "grad_norm": 0.9564002752304077, "learning_rate": 0.000182219815649013, "loss": 0.8211, "step": 2020 }, { "epoch": 1.3744075829383886, "grad_norm": 0.951923668384552, "learning_rate": 0.00018194983413344674, "loss": 0.7549, "step": 2030 }, { "epoch": 1.3811780636425186, "grad_norm": 0.7695098519325256, "learning_rate": 0.00018167802145767513, "loss": 0.7133, "step": 2040 }, { "epoch": 1.3879485443466486, "grad_norm": 1.255873203277588, "learning_rate": 0.0001814043836953231, "loss": 0.7562, "step": 2050 }, { "epoch": 1.3947190250507786, "grad_norm": 0.8769702315330505, "learning_rate": 0.00018112892696079698, "loss": 0.7411, "step": 2060 }, { "epoch": 1.4014895057549086, "grad_norm": 0.9851005673408508, "learning_rate": 0.00018085165740914776, "loss": 0.7568, "step": 2070 }, { "epoch": 1.4082599864590386, "grad_norm": 0.8695229887962341, "learning_rate": 0.00018057258123593367, "loss": 0.7358, "step": 2080 }, { "epoch": 1.4150304671631686, "grad_norm": 0.9267136454582214, "learning_rate": 0.00018029170467708165, "loss": 0.7352, "step": 2090 }, { "epoch": 1.4218009478672986, "grad_norm": 0.8532856106758118, "learning_rate": 0.00018000903400874823, "loss": 0.8073, "step": 2100 }, { "epoch": 1.4285714285714286, "grad_norm": 0.8961872458457947, "learning_rate": 0.0001797245755471789, "loss": 0.7886, "step": 2110 }, { "epoch": 1.4353419092755586, "grad_norm": 0.8943607211112976, "learning_rate": 0.00017943833564856737, "loss": 0.7216, "step": 2120 }, { "epoch": 1.4421123899796886, "grad_norm": 0.824885904788971, "learning_rate": 0.00017915032070891327, "loss": 0.7077, "step": 2130 }, { "epoch": 1.4488828706838186, "grad_norm": 0.846660315990448, "learning_rate": 0.00017886053716387935, "loss": 0.7511, "step": 2140 }, { "epoch": 1.4556533513879486, "grad_norm": 0.8594396710395813, "learning_rate": 0.00017856899148864774, "loss": 0.7603, "step": 2150 }, { "epoch": 1.4624238320920786, "grad_norm": 0.8377899527549744, "learning_rate": 0.00017827569019777503, "loss": 0.7301, "step": 2160 }, { "epoch": 1.4691943127962086, "grad_norm": 1.0455125570297241, "learning_rate": 0.00017798063984504698, "loss": 0.7858, "step": 2170 }, { "epoch": 1.4759647935003386, "grad_norm": 0.9242769479751587, "learning_rate": 0.00017768384702333188, "loss": 0.8125, "step": 2180 }, { "epoch": 1.4827352742044684, "grad_norm": 0.9363239407539368, "learning_rate": 0.00017738531836443332, "loss": 0.7731, "step": 2190 }, { "epoch": 1.4895057549085986, "grad_norm": 0.8512465953826904, "learning_rate": 0.000177085060538942, "loss": 0.7407, "step": 2200 }, { "epoch": 1.4962762356127284, "grad_norm": 0.9729003310203552, "learning_rate": 0.00017678308025608665, "loss": 0.7751, "step": 2210 }, { "epoch": 1.5030467163168586, "grad_norm": 0.94197678565979, "learning_rate": 0.00017647938426358412, "loss": 0.7642, "step": 2220 }, { "epoch": 1.5098171970209884, "grad_norm": 0.9034068584442139, "learning_rate": 0.00017617397934748859, "loss": 0.8069, "step": 2230 }, { "epoch": 1.5165876777251186, "grad_norm": 0.9055565595626831, "learning_rate": 0.00017586687233204, "loss": 0.7463, "step": 2240 }, { "epoch": 1.5233581584292484, "grad_norm": 0.9645712971687317, "learning_rate": 0.00017555807007951142, "loss": 0.8157, "step": 2250 }, { "epoch": 1.5301286391333786, "grad_norm": 0.9376358389854431, "learning_rate": 0.00017524757949005597, "loss": 0.8012, "step": 2260 }, { "epoch": 1.5368991198375084, "grad_norm": 0.8372974991798401, "learning_rate": 0.00017493540750155236, "loss": 0.7429, "step": 2270 }, { "epoch": 1.5436696005416386, "grad_norm": 0.8159657120704651, "learning_rate": 0.00017462156108944996, "loss": 0.7619, "step": 2280 }, { "epoch": 1.5504400812457684, "grad_norm": 0.9110903143882751, "learning_rate": 0.00017430604726661304, "loss": 0.7792, "step": 2290 }, { "epoch": 1.5572105619498986, "grad_norm": 1.0363059043884277, "learning_rate": 0.00017398887308316393, "loss": 0.7875, "step": 2300 }, { "epoch": 1.5639810426540284, "grad_norm": 0.8779491186141968, "learning_rate": 0.00017367004562632556, "loss": 0.7395, "step": 2310 }, { "epoch": 1.5707515233581584, "grad_norm": 0.7635359168052673, "learning_rate": 0.00017334957202026305, "loss": 0.734, "step": 2320 }, { "epoch": 1.5775220040622884, "grad_norm": 0.7570300698280334, "learning_rate": 0.0001730274594259246, "loss": 0.732, "step": 2330 }, { "epoch": 1.5842924847664184, "grad_norm": 0.8852811455726624, "learning_rate": 0.0001727037150408813, "loss": 0.7176, "step": 2340 }, { "epoch": 1.5910629654705484, "grad_norm": 0.920385479927063, "learning_rate": 0.00017237834609916668, "loss": 0.7883, "step": 2350 }, { "epoch": 1.5978334461746784, "grad_norm": 0.7175299525260925, "learning_rate": 0.00017205135987111446, "loss": 0.7511, "step": 2360 }, { "epoch": 1.6046039268788084, "grad_norm": 0.9640962481498718, "learning_rate": 0.0001717227636631968, "loss": 0.7344, "step": 2370 }, { "epoch": 1.6113744075829384, "grad_norm": 1.0787372589111328, "learning_rate": 0.00017139256481786043, "loss": 0.7388, "step": 2380 }, { "epoch": 1.6181448882870684, "grad_norm": 0.8717492818832397, "learning_rate": 0.00017106077071336298, "loss": 0.8181, "step": 2390 }, { "epoch": 1.6249153689911984, "grad_norm": 0.9693078398704529, "learning_rate": 0.00017072738876360792, "loss": 0.7784, "step": 2400 }, { "epoch": 1.6316858496953284, "grad_norm": 0.9157988429069519, "learning_rate": 0.00017039242641797895, "loss": 0.7631, "step": 2410 }, { "epoch": 1.6384563303994584, "grad_norm": 0.856497585773468, "learning_rate": 0.0001700558911611736, "loss": 0.7572, "step": 2420 }, { "epoch": 1.6452268111035884, "grad_norm": 0.9910064339637756, "learning_rate": 0.0001697177905130358, "loss": 0.79, "step": 2430 }, { "epoch": 1.6519972918077184, "grad_norm": 0.9009943008422852, "learning_rate": 0.00016937813202838817, "loss": 0.7389, "step": 2440 }, { "epoch": 1.6587677725118484, "grad_norm": 0.8572137951850891, "learning_rate": 0.00016903692329686286, "loss": 0.8074, "step": 2450 }, { "epoch": 1.6655382532159784, "grad_norm": 0.9608494639396667, "learning_rate": 0.00016869417194273216, "loss": 0.7493, "step": 2460 }, { "epoch": 1.6723087339201084, "grad_norm": 1.1153324842453003, "learning_rate": 0.00016834988562473813, "loss": 0.7696, "step": 2470 }, { "epoch": 1.6790792146242384, "grad_norm": 0.8839768171310425, "learning_rate": 0.00016800407203592144, "loss": 0.6736, "step": 2480 }, { "epoch": 1.6858496953283684, "grad_norm": 0.8794620633125305, "learning_rate": 0.00016765673890344944, "loss": 0.7678, "step": 2490 }, { "epoch": 1.6926201760324981, "grad_norm": 1.167880892753601, "learning_rate": 0.0001673078939884435, "loss": 0.799, "step": 2500 }, { "epoch": 1.6993906567366284, "grad_norm": 0.8976329565048218, "learning_rate": 0.00016695754508580556, "loss": 0.7445, "step": 2510 }, { "epoch": 1.7061611374407581, "grad_norm": 0.8003941178321838, "learning_rate": 0.00016660570002404414, "loss": 0.7434, "step": 2520 }, { "epoch": 1.7129316181448884, "grad_norm": 1.5716880559921265, "learning_rate": 0.0001662523666650992, "loss": 0.7785, "step": 2530 }, { "epoch": 1.7197020988490181, "grad_norm": 0.7486565113067627, "learning_rate": 0.00016589755290416652, "loss": 0.7415, "step": 2540 }, { "epoch": 1.7264725795531484, "grad_norm": 0.872717559337616, "learning_rate": 0.0001655412666695213, "loss": 0.7568, "step": 2550 }, { "epoch": 1.7332430602572781, "grad_norm": 1.06588876247406, "learning_rate": 0.00016518351592234102, "loss": 0.714, "step": 2560 }, { "epoch": 1.7400135409614084, "grad_norm": 0.8603307008743286, "learning_rate": 0.00016482430865652758, "loss": 0.8015, "step": 2570 }, { "epoch": 1.7467840216655381, "grad_norm": 0.9161677956581116, "learning_rate": 0.0001644636528985286, "loss": 0.7517, "step": 2580 }, { "epoch": 1.7535545023696684, "grad_norm": 0.9165793657302856, "learning_rate": 0.00016410155670715807, "loss": 0.7219, "step": 2590 }, { "epoch": 1.7603249830737981, "grad_norm": 0.9347404837608337, "learning_rate": 0.00016373802817341631, "loss": 0.7544, "step": 2600 }, { "epoch": 1.7670954637779284, "grad_norm": 0.9771521687507629, "learning_rate": 0.00016337307542030924, "loss": 0.7613, "step": 2610 }, { "epoch": 1.7738659444820581, "grad_norm": 0.8616775870323181, "learning_rate": 0.00016300670660266678, "loss": 0.7028, "step": 2620 }, { "epoch": 1.7806364251861884, "grad_norm": 0.9634568095207214, "learning_rate": 0.0001626389299069606, "loss": 0.7776, "step": 2630 }, { "epoch": 1.7874069058903181, "grad_norm": 0.8600468635559082, "learning_rate": 0.00016226975355112134, "loss": 0.7127, "step": 2640 }, { "epoch": 1.7941773865944484, "grad_norm": 0.8130874037742615, "learning_rate": 0.00016189918578435482, "loss": 0.7618, "step": 2650 }, { "epoch": 1.8009478672985781, "grad_norm": 0.8722664713859558, "learning_rate": 0.00016152723488695783, "loss": 0.7364, "step": 2660 }, { "epoch": 1.8077183480027081, "grad_norm": 0.726963222026825, "learning_rate": 0.00016115390917013307, "loss": 0.7449, "step": 2670 }, { "epoch": 1.8144888287068381, "grad_norm": 0.9895104765892029, "learning_rate": 0.00016077921697580343, "loss": 0.7766, "step": 2680 }, { "epoch": 1.8212593094109681, "grad_norm": 0.9779828190803528, "learning_rate": 0.00016040316667642558, "loss": 0.7266, "step": 2690 }, { "epoch": 1.8280297901150981, "grad_norm": 1.04193913936615, "learning_rate": 0.00016002576667480288, "loss": 0.7344, "step": 2700 }, { "epoch": 1.8348002708192281, "grad_norm": 0.8899911046028137, "learning_rate": 0.00015964702540389767, "loss": 0.7546, "step": 2710 }, { "epoch": 1.8415707515233581, "grad_norm": 0.9403987526893616, "learning_rate": 0.0001592669513266428, "loss": 0.7482, "step": 2720 }, { "epoch": 1.8483412322274881, "grad_norm": 0.863129734992981, "learning_rate": 0.00015888555293575254, "loss": 0.7527, "step": 2730 }, { "epoch": 1.8551117129316181, "grad_norm": 1.1445564031600952, "learning_rate": 0.0001585028387535328, "loss": 0.7672, "step": 2740 }, { "epoch": 1.8618821936357481, "grad_norm": 0.8358940482139587, "learning_rate": 0.0001581188173316907, "loss": 0.7877, "step": 2750 }, { "epoch": 1.8686526743398781, "grad_norm": 1.0207701921463013, "learning_rate": 0.00015773349725114352, "loss": 0.7711, "step": 2760 }, { "epoch": 1.8754231550440081, "grad_norm": 0.9382310509681702, "learning_rate": 0.00015734688712182687, "loss": 0.7365, "step": 2770 }, { "epoch": 1.8821936357481381, "grad_norm": 0.7211757898330688, "learning_rate": 0.0001569589955825024, "loss": 0.7144, "step": 2780 }, { "epoch": 1.8889641164522681, "grad_norm": 1.0787826776504517, "learning_rate": 0.00015656983130056472, "loss": 0.7784, "step": 2790 }, { "epoch": 1.8957345971563981, "grad_norm": 1.0936686992645264, "learning_rate": 0.00015617940297184775, "loss": 0.7455, "step": 2800 }, { "epoch": 1.9025050778605281, "grad_norm": 1.0122491121292114, "learning_rate": 0.00015578771932043037, "loss": 0.7711, "step": 2810 }, { "epoch": 1.9092755585646581, "grad_norm": 0.9829614162445068, "learning_rate": 0.00015539478909844156, "loss": 0.7485, "step": 2820 }, { "epoch": 1.9160460392687881, "grad_norm": 0.9822033047676086, "learning_rate": 0.00015500062108586473, "loss": 0.7337, "step": 2830 }, { "epoch": 1.9228165199729181, "grad_norm": 0.8550043702125549, "learning_rate": 0.0001546052240903416, "loss": 0.7547, "step": 2840 }, { "epoch": 1.929587000677048, "grad_norm": 0.7504202723503113, "learning_rate": 0.0001542086069469754, "loss": 0.7329, "step": 2850 }, { "epoch": 1.9363574813811781, "grad_norm": 0.7536128759384155, "learning_rate": 0.00015381077851813342, "loss": 0.6917, "step": 2860 }, { "epoch": 1.943127962085308, "grad_norm": 1.024143934249878, "learning_rate": 0.000153411747693249, "loss": 0.7293, "step": 2870 }, { "epoch": 1.9498984427894381, "grad_norm": 0.8882274031639099, "learning_rate": 0.0001530115233886229, "loss": 0.7067, "step": 2880 }, { "epoch": 1.956668923493568, "grad_norm": 0.814894437789917, "learning_rate": 0.00015261011454722402, "loss": 0.6613, "step": 2890 }, { "epoch": 1.9634394041976981, "grad_norm": 0.8720422387123108, "learning_rate": 0.00015220753013848965, "loss": 0.7931, "step": 2900 }, { "epoch": 1.970209884901828, "grad_norm": 1.070326805114746, "learning_rate": 0.00015180377915812498, "loss": 0.6737, "step": 2910 }, { "epoch": 1.9769803656059581, "grad_norm": 0.9129419922828674, "learning_rate": 0.0001513988706279021, "loss": 0.7693, "step": 2920 }, { "epoch": 1.983750846310088, "grad_norm": 0.9133071303367615, "learning_rate": 0.00015099281359545844, "loss": 0.7222, "step": 2930 }, { "epoch": 1.9905213270142181, "grad_norm": 1.1360323429107666, "learning_rate": 0.00015058561713409465, "loss": 0.7813, "step": 2940 }, { "epoch": 1.997291807718348, "grad_norm": 1.1606559753417969, "learning_rate": 0.0001501772903425717, "loss": 0.7045, "step": 2950 }, { "epoch": 2.004062288422478, "grad_norm": 0.8940277099609375, "learning_rate": 0.0001497678423449077, "loss": 0.6686, "step": 2960 }, { "epoch": 2.010832769126608, "grad_norm": 0.9504866003990173, "learning_rate": 0.00014935728229017404, "loss": 0.5851, "step": 2970 }, { "epoch": 2.017603249830738, "grad_norm": 0.9662072062492371, "learning_rate": 0.00014894561935229083, "loss": 0.5836, "step": 2980 }, { "epoch": 2.024373730534868, "grad_norm": 1.1531829833984375, "learning_rate": 0.00014853286272982206, "loss": 0.5511, "step": 2990 }, { "epoch": 2.031144211238998, "grad_norm": 1.0693235397338867, "learning_rate": 0.00014811902164576986, "loss": 0.5325, "step": 3000 }, { "epoch": 2.031144211238998, "eval_loss": 0.8718012571334839, "eval_runtime": 23.0432, "eval_samples_per_second": 107.971, "eval_steps_per_second": 13.496, "step": 3000 }, { "epoch": 2.037914691943128, "grad_norm": 1.1329638957977295, "learning_rate": 0.0001477041053473687, "loss": 0.5722, "step": 3010 }, { "epoch": 2.044685172647258, "grad_norm": 1.1756556034088135, "learning_rate": 0.0001472881231058785, "loss": 0.57, "step": 3020 }, { "epoch": 2.051455653351388, "grad_norm": 1.1575700044631958, "learning_rate": 0.00014687108421637758, "loss": 0.5845, "step": 3030 }, { "epoch": 2.058226134055518, "grad_norm": 1.0859098434448242, "learning_rate": 0.0001464529979975549, "loss": 0.533, "step": 3040 }, { "epoch": 2.064996614759648, "grad_norm": 0.9851484298706055, "learning_rate": 0.00014603387379150197, "loss": 0.584, "step": 3050 }, { "epoch": 2.071767095463778, "grad_norm": 1.1865367889404297, "learning_rate": 0.00014561372096350402, "loss": 0.5536, "step": 3060 }, { "epoch": 2.078537576167908, "grad_norm": 1.114558219909668, "learning_rate": 0.00014519254890183058, "loss": 0.5627, "step": 3070 }, { "epoch": 2.085308056872038, "grad_norm": 1.0637989044189453, "learning_rate": 0.00014477036701752603, "loss": 0.5625, "step": 3080 }, { "epoch": 2.092078537576168, "grad_norm": 1.2044423818588257, "learning_rate": 0.00014434718474419896, "loss": 0.6045, "step": 3090 }, { "epoch": 2.0988490182802977, "grad_norm": 1.0656991004943848, "learning_rate": 0.00014392301153781168, "loss": 0.5458, "step": 3100 }, { "epoch": 2.105619498984428, "grad_norm": 1.431920051574707, "learning_rate": 0.00014349785687646879, "loss": 0.5798, "step": 3110 }, { "epoch": 2.1123899796885577, "grad_norm": 1.4664020538330078, "learning_rate": 0.00014307173026020524, "loss": 0.5566, "step": 3120 }, { "epoch": 2.119160460392688, "grad_norm": 0.9782803654670715, "learning_rate": 0.00014264464121077435, "loss": 0.5883, "step": 3130 }, { "epoch": 2.1259309410968177, "grad_norm": 1.2193199396133423, "learning_rate": 0.00014221659927143488, "loss": 0.5912, "step": 3140 }, { "epoch": 2.132701421800948, "grad_norm": 1.1089211702346802, "learning_rate": 0.00014178761400673778, "loss": 0.5421, "step": 3150 }, { "epoch": 2.1394719025050777, "grad_norm": 1.6899245977401733, "learning_rate": 0.00014135769500231259, "loss": 0.5477, "step": 3160 }, { "epoch": 2.146242383209208, "grad_norm": 1.1503666639328003, "learning_rate": 0.00014092685186465297, "loss": 0.5703, "step": 3170 }, { "epoch": 2.1530128639133377, "grad_norm": 1.1421773433685303, "learning_rate": 0.0001404950942209025, "loss": 0.6063, "step": 3180 }, { "epoch": 2.159783344617468, "grad_norm": 1.308514952659607, "learning_rate": 0.00014006243171863907, "loss": 0.6101, "step": 3190 }, { "epoch": 2.1665538253215977, "grad_norm": 1.108906626701355, "learning_rate": 0.00013962887402565967, "loss": 0.6067, "step": 3200 }, { "epoch": 2.173324306025728, "grad_norm": 1.3432538509368896, "learning_rate": 0.00013919443082976415, "loss": 0.5724, "step": 3210 }, { "epoch": 2.1800947867298577, "grad_norm": 1.2304880619049072, "learning_rate": 0.00013875911183853896, "loss": 0.5764, "step": 3220 }, { "epoch": 2.186865267433988, "grad_norm": 1.1720483303070068, "learning_rate": 0.0001383229267791399, "loss": 0.565, "step": 3230 }, { "epoch": 2.1936357481381177, "grad_norm": 0.9357210397720337, "learning_rate": 0.00013788588539807517, "loss": 0.525, "step": 3240 }, { "epoch": 2.200406228842248, "grad_norm": 1.2292680740356445, "learning_rate": 0.0001374479974609872, "loss": 0.6126, "step": 3250 }, { "epoch": 2.2071767095463777, "grad_norm": 1.0784507989883423, "learning_rate": 0.0001370092727524348, "loss": 0.5863, "step": 3260 }, { "epoch": 2.213947190250508, "grad_norm": 1.3088752031326294, "learning_rate": 0.00013656972107567423, "loss": 0.5568, "step": 3270 }, { "epoch": 2.2207176709546377, "grad_norm": 1.1142232418060303, "learning_rate": 0.0001361293522524403, "loss": 0.5777, "step": 3280 }, { "epoch": 2.227488151658768, "grad_norm": 1.1168012619018555, "learning_rate": 0.0001356881761227269, "loss": 0.549, "step": 3290 }, { "epoch": 2.2342586323628977, "grad_norm": 1.1179856061935425, "learning_rate": 0.00013524620254456705, "loss": 0.5828, "step": 3300 }, { "epoch": 2.241029113067028, "grad_norm": 1.1862361431121826, "learning_rate": 0.00013480344139381266, "loss": 0.5441, "step": 3310 }, { "epoch": 2.2477995937711577, "grad_norm": 1.2580469846725464, "learning_rate": 0.0001343599025639139, "loss": 0.6452, "step": 3320 }, { "epoch": 2.254570074475288, "grad_norm": 0.9721531271934509, "learning_rate": 0.00013391559596569815, "loss": 0.5803, "step": 3330 }, { "epoch": 2.2613405551794177, "grad_norm": 1.099107265472412, "learning_rate": 0.0001334705315271483, "loss": 0.5768, "step": 3340 }, { "epoch": 2.268111035883548, "grad_norm": 1.0356446504592896, "learning_rate": 0.00013302471919318141, "loss": 0.5759, "step": 3350 }, { "epoch": 2.2748815165876777, "grad_norm": 1.2317684888839722, "learning_rate": 0.00013257816892542582, "loss": 0.5797, "step": 3360 }, { "epoch": 2.281651997291808, "grad_norm": 1.2287174463272095, "learning_rate": 0.0001321308907019992, "loss": 0.5747, "step": 3370 }, { "epoch": 2.2884224779959377, "grad_norm": 1.2517625093460083, "learning_rate": 0.0001316828945172852, "loss": 0.5114, "step": 3380 }, { "epoch": 2.295192958700068, "grad_norm": 1.088796854019165, "learning_rate": 0.00013123419038171024, "loss": 0.5821, "step": 3390 }, { "epoch": 2.3019634394041977, "grad_norm": 1.0487096309661865, "learning_rate": 0.00013078478832151985, "loss": 0.6054, "step": 3400 }, { "epoch": 2.3087339201083275, "grad_norm": 1.1964969635009766, "learning_rate": 0.00013033469837855457, "loss": 0.5621, "step": 3410 }, { "epoch": 2.3155044008124577, "grad_norm": 1.2567753791809082, "learning_rate": 0.00012988393061002566, "loss": 0.5858, "step": 3420 }, { "epoch": 2.322274881516588, "grad_norm": 0.984793484210968, "learning_rate": 0.0001294324950882903, "loss": 0.5961, "step": 3430 }, { "epoch": 2.3290453622207177, "grad_norm": 1.2915070056915283, "learning_rate": 0.00012898040190062647, "loss": 0.5667, "step": 3440 }, { "epoch": 2.3358158429248475, "grad_norm": 1.242781400680542, "learning_rate": 0.00012852766114900777, "loss": 0.5781, "step": 3450 }, { "epoch": 2.3425863236289777, "grad_norm": 1.1402225494384766, "learning_rate": 0.00012807428294987744, "loss": 0.6048, "step": 3460 }, { "epoch": 2.349356804333108, "grad_norm": 1.2243235111236572, "learning_rate": 0.0001276202774339224, "loss": 0.5672, "step": 3470 }, { "epoch": 2.3561272850372377, "grad_norm": 1.2512565851211548, "learning_rate": 0.00012716565474584702, "loss": 0.5992, "step": 3480 }, { "epoch": 2.3628977657413675, "grad_norm": 1.3591067790985107, "learning_rate": 0.00012671042504414619, "loss": 0.5853, "step": 3490 }, { "epoch": 2.3696682464454977, "grad_norm": 1.7091628313064575, "learning_rate": 0.00012625459850087846, "loss": 0.5501, "step": 3500 }, { "epoch": 2.3764387271496275, "grad_norm": 1.2151107788085938, "learning_rate": 0.00012579818530143884, "loss": 0.5684, "step": 3510 }, { "epoch": 2.3832092078537577, "grad_norm": 1.4708514213562012, "learning_rate": 0.000125341195644331, "loss": 0.578, "step": 3520 }, { "epoch": 2.3899796885578874, "grad_norm": 1.2934261560440063, "learning_rate": 0.0001248836397409396, "loss": 0.6235, "step": 3530 }, { "epoch": 2.3967501692620177, "grad_norm": 1.9203015565872192, "learning_rate": 0.00012442552781530186, "loss": 0.5868, "step": 3540 }, { "epoch": 2.4035206499661474, "grad_norm": 1.2564107179641724, "learning_rate": 0.00012396687010387942, "loss": 0.6091, "step": 3550 }, { "epoch": 2.4102911306702777, "grad_norm": 1.3231315612792969, "learning_rate": 0.00012350767685532938, "loss": 0.5492, "step": 3560 }, { "epoch": 2.4170616113744074, "grad_norm": 1.392247200012207, "learning_rate": 0.00012304795833027534, "loss": 0.5809, "step": 3570 }, { "epoch": 2.4238320920785377, "grad_norm": 1.1600557565689087, "learning_rate": 0.00012258772480107816, "loss": 0.5638, "step": 3580 }, { "epoch": 2.4306025727826674, "grad_norm": 1.3254331350326538, "learning_rate": 0.00012212698655160637, "loss": 0.5644, "step": 3590 }, { "epoch": 2.4373730534867977, "grad_norm": 1.2660179138183594, "learning_rate": 0.00012166575387700651, "loss": 0.5852, "step": 3600 }, { "epoch": 2.4441435341909274, "grad_norm": 1.1489580869674683, "learning_rate": 0.00012120403708347298, "loss": 0.5753, "step": 3610 }, { "epoch": 2.4509140148950577, "grad_norm": 1.1386017799377441, "learning_rate": 0.00012074184648801769, "loss": 0.5446, "step": 3620 }, { "epoch": 2.4576844955991874, "grad_norm": 1.3722707033157349, "learning_rate": 0.00012027919241823964, "loss": 0.5771, "step": 3630 }, { "epoch": 2.4644549763033177, "grad_norm": 1.1902090311050415, "learning_rate": 0.00011981608521209413, "loss": 0.5774, "step": 3640 }, { "epoch": 2.4712254570074474, "grad_norm": 1.1676629781723022, "learning_rate": 0.00011935253521766174, "loss": 0.5718, "step": 3650 }, { "epoch": 2.4779959377115777, "grad_norm": 1.1004976034164429, "learning_rate": 0.00011888855279291713, "loss": 0.6151, "step": 3660 }, { "epoch": 2.4847664184157074, "grad_norm": 1.407827377319336, "learning_rate": 0.00011842414830549748, "loss": 0.6025, "step": 3670 }, { "epoch": 2.4915368991198377, "grad_norm": 1.26259183883667, "learning_rate": 0.00011795933213247101, "loss": 0.6008, "step": 3680 }, { "epoch": 2.4983073798239674, "grad_norm": 1.1961734294891357, "learning_rate": 0.000117494114660105, "loss": 0.5598, "step": 3690 }, { "epoch": 2.5050778605280977, "grad_norm": 0.9188928604125977, "learning_rate": 0.00011702850628363365, "loss": 0.5636, "step": 3700 }, { "epoch": 2.5118483412322274, "grad_norm": 0.9072563052177429, "learning_rate": 0.00011656251740702596, "loss": 0.5629, "step": 3710 }, { "epoch": 2.518618821936357, "grad_norm": 1.0292631387710571, "learning_rate": 0.00011609615844275305, "loss": 0.6066, "step": 3720 }, { "epoch": 2.5253893026404874, "grad_norm": 1.229181170463562, "learning_rate": 0.00011562943981155575, "loss": 0.5491, "step": 3730 }, { "epoch": 2.5321597833446177, "grad_norm": 1.1053756475448608, "learning_rate": 0.00011516237194221149, "loss": 0.6065, "step": 3740 }, { "epoch": 2.5389302640487474, "grad_norm": 1.4795639514923096, "learning_rate": 0.0001146949652713015, "loss": 0.5705, "step": 3750 }, { "epoch": 2.545700744752877, "grad_norm": 1.1489176750183105, "learning_rate": 0.00011422723024297737, "loss": 0.5364, "step": 3760 }, { "epoch": 2.5524712254570074, "grad_norm": 1.1073706150054932, "learning_rate": 0.00011375917730872787, "loss": 0.6014, "step": 3770 }, { "epoch": 2.5592417061611377, "grad_norm": 1.5487061738967896, "learning_rate": 0.00011329081692714534, "loss": 0.5477, "step": 3780 }, { "epoch": 2.5660121868652674, "grad_norm": 1.4128634929656982, "learning_rate": 0.00011282215956369204, "loss": 0.6538, "step": 3790 }, { "epoch": 2.572782667569397, "grad_norm": 1.2158820629119873, "learning_rate": 0.00011235321569046615, "loss": 0.594, "step": 3800 }, { "epoch": 2.5795531482735274, "grad_norm": 1.3014835119247437, "learning_rate": 0.00011188399578596795, "loss": 0.5936, "step": 3810 }, { "epoch": 2.5863236289776577, "grad_norm": 1.3620414733886719, "learning_rate": 0.00011141451033486564, "loss": 0.5633, "step": 3820 }, { "epoch": 2.5930941096817874, "grad_norm": 1.224446415901184, "learning_rate": 0.00011094476982776096, "loss": 0.553, "step": 3830 }, { "epoch": 2.599864590385917, "grad_norm": 1.3176541328430176, "learning_rate": 0.00011047478476095487, "loss": 0.5591, "step": 3840 }, { "epoch": 2.6066350710900474, "grad_norm": 1.1520602703094482, "learning_rate": 0.00011000456563621304, "loss": 0.5753, "step": 3850 }, { "epoch": 2.6134055517941777, "grad_norm": 1.2285906076431274, "learning_rate": 0.00010953412296053105, "loss": 0.6055, "step": 3860 }, { "epoch": 2.6201760324983074, "grad_norm": 1.544148564338684, "learning_rate": 0.00010906346724589975, "loss": 0.6062, "step": 3870 }, { "epoch": 2.626946513202437, "grad_norm": 1.2714669704437256, "learning_rate": 0.00010859260900907038, "loss": 0.5867, "step": 3880 }, { "epoch": 2.6337169939065674, "grad_norm": 1.4937471151351929, "learning_rate": 0.00010812155877131945, "loss": 0.5953, "step": 3890 }, { "epoch": 2.640487474610697, "grad_norm": 1.551594614982605, "learning_rate": 0.00010765032705821363, "loss": 0.5537, "step": 3900 }, { "epoch": 2.6472579553148274, "grad_norm": 1.565324068069458, "learning_rate": 0.0001071789243993748, "loss": 0.572, "step": 3910 }, { "epoch": 2.654028436018957, "grad_norm": 1.207514762878418, "learning_rate": 0.00010670736132824455, "loss": 0.5921, "step": 3920 }, { "epoch": 2.6607989167230874, "grad_norm": 1.1995245218276978, "learning_rate": 0.00010623564838184878, "loss": 0.5635, "step": 3930 }, { "epoch": 2.667569397427217, "grad_norm": 1.1889262199401855, "learning_rate": 0.00010576379610056249, "loss": 0.5886, "step": 3940 }, { "epoch": 2.6743398781313474, "grad_norm": 1.0783162117004395, "learning_rate": 0.0001052918150278739, "loss": 0.5831, "step": 3950 }, { "epoch": 2.681110358835477, "grad_norm": 1.4271385669708252, "learning_rate": 0.0001048197157101493, "loss": 0.5335, "step": 3960 }, { "epoch": 2.6878808395396074, "grad_norm": 1.167817234992981, "learning_rate": 0.00010434750869639693, "loss": 0.5331, "step": 3970 }, { "epoch": 2.694651320243737, "grad_norm": 1.3966023921966553, "learning_rate": 0.00010387520453803166, "loss": 0.5931, "step": 3980 }, { "epoch": 2.7014218009478674, "grad_norm": 1.328182578086853, "learning_rate": 0.00010340281378863892, "loss": 0.5472, "step": 3990 }, { "epoch": 2.708192281651997, "grad_norm": 1.3755980730056763, "learning_rate": 0.00010293034700373905, "loss": 0.5875, "step": 4000 }, { "epoch": 2.708192281651997, "eval_loss": 0.8555851578712463, "eval_runtime": 22.9559, "eval_samples_per_second": 108.382, "eval_steps_per_second": 13.548, "step": 4000 }, { "epoch": 2.7149627623561274, "grad_norm": 1.2442570924758911, "learning_rate": 0.0001024578147405514, "loss": 0.6028, "step": 4010 }, { "epoch": 2.721733243060257, "grad_norm": 1.2046414613723755, "learning_rate": 0.0001019852275577585, "loss": 0.5959, "step": 4020 }, { "epoch": 2.7285037237643874, "grad_norm": 1.1981314420700073, "learning_rate": 0.00010151259601526992, "loss": 0.6042, "step": 4030 }, { "epoch": 2.735274204468517, "grad_norm": 1.3695381879806519, "learning_rate": 0.00010103993067398649, "loss": 0.5943, "step": 4040 }, { "epoch": 2.7420446851726474, "grad_norm": 1.1446524858474731, "learning_rate": 0.00010056724209556431, "loss": 0.5853, "step": 4050 }, { "epoch": 2.748815165876777, "grad_norm": 1.2874009609222412, "learning_rate": 0.00010009454084217873, "loss": 0.5967, "step": 4060 }, { "epoch": 2.755585646580907, "grad_norm": 1.3916451930999756, "learning_rate": 9.962183747628819e-05, "loss": 0.5528, "step": 4070 }, { "epoch": 2.762356127285037, "grad_norm": 1.141298532485962, "learning_rate": 9.914914256039847e-05, "loss": 0.5641, "step": 4080 }, { "epoch": 2.7691266079891674, "grad_norm": 1.2546755075454712, "learning_rate": 9.867646665682646e-05, "loss": 0.5638, "step": 4090 }, { "epoch": 2.775897088693297, "grad_norm": 1.2840214967727661, "learning_rate": 9.820382032746426e-05, "loss": 0.5835, "step": 4100 }, { "epoch": 2.782667569397427, "grad_norm": 1.1560393571853638, "learning_rate": 9.773121413354311e-05, "loss": 0.5809, "step": 4110 }, { "epoch": 2.789438050101557, "grad_norm": 1.3474149703979492, "learning_rate": 9.725865863539747e-05, "loss": 0.5768, "step": 4120 }, { "epoch": 2.7962085308056874, "grad_norm": 1.1416068077087402, "learning_rate": 9.678616439222899e-05, "loss": 0.5758, "step": 4130 }, { "epoch": 2.802979011509817, "grad_norm": 1.192691445350647, "learning_rate": 9.631374196187051e-05, "loss": 0.547, "step": 4140 }, { "epoch": 2.809749492213947, "grad_norm": 1.2631511688232422, "learning_rate": 9.584140190055035e-05, "loss": 0.5315, "step": 4150 }, { "epoch": 2.816519972918077, "grad_norm": 1.3457276821136475, "learning_rate": 9.536915476265621e-05, "loss": 0.5824, "step": 4160 }, { "epoch": 2.8232904536222074, "grad_norm": 1.5314511060714722, "learning_rate": 9.489701110049944e-05, "loss": 0.6094, "step": 4170 }, { "epoch": 2.830060934326337, "grad_norm": 1.3376086950302124, "learning_rate": 9.442498146407927e-05, "loss": 0.5914, "step": 4180 }, { "epoch": 2.836831415030467, "grad_norm": 1.5918281078338623, "learning_rate": 9.3953076400847e-05, "loss": 0.5814, "step": 4190 }, { "epoch": 2.843601895734597, "grad_norm": 1.387515902519226, "learning_rate": 9.348130645547042e-05, "loss": 0.5663, "step": 4200 }, { "epoch": 2.850372376438727, "grad_norm": 1.612802267074585, "learning_rate": 9.300968216959805e-05, "loss": 0.5807, "step": 4210 }, { "epoch": 2.857142857142857, "grad_norm": 1.34074068069458, "learning_rate": 9.253821408162366e-05, "loss": 0.5868, "step": 4220 }, { "epoch": 2.863913337846987, "grad_norm": 1.436584234237671, "learning_rate": 9.206691272645087e-05, "loss": 0.5613, "step": 4230 }, { "epoch": 2.870683818551117, "grad_norm": 1.3354675769805908, "learning_rate": 9.159578863525762e-05, "loss": 0.6245, "step": 4240 }, { "epoch": 2.877454299255247, "grad_norm": 1.1248669624328613, "learning_rate": 9.11248523352609e-05, "loss": 0.547, "step": 4250 }, { "epoch": 2.884224779959377, "grad_norm": 1.1722201108932495, "learning_rate": 9.065411434948152e-05, "loss": 0.5432, "step": 4260 }, { "epoch": 2.890995260663507, "grad_norm": 1.2124953269958496, "learning_rate": 9.018358519650909e-05, "loss": 0.534, "step": 4270 }, { "epoch": 2.897765741367637, "grad_norm": 1.258863091468811, "learning_rate": 8.97132753902667e-05, "loss": 0.5651, "step": 4280 }, { "epoch": 2.904536222071767, "grad_norm": 1.2424662113189697, "learning_rate": 8.924319543977631e-05, "loss": 0.5611, "step": 4290 }, { "epoch": 2.911306702775897, "grad_norm": 1.2281653881072998, "learning_rate": 8.877335584892369e-05, "loss": 0.5584, "step": 4300 }, { "epoch": 2.918077183480027, "grad_norm": 1.1419377326965332, "learning_rate": 8.830376711622379e-05, "loss": 0.5939, "step": 4310 }, { "epoch": 2.924847664184157, "grad_norm": 1.0923197269439697, "learning_rate": 8.783443973458625e-05, "loss": 0.5912, "step": 4320 }, { "epoch": 2.931618144888287, "grad_norm": 1.0926480293273926, "learning_rate": 8.736538419108074e-05, "loss": 0.6095, "step": 4330 }, { "epoch": 2.938388625592417, "grad_norm": 1.4442996978759766, "learning_rate": 8.689661096670285e-05, "loss": 0.5618, "step": 4340 }, { "epoch": 2.945159106296547, "grad_norm": 1.2105728387832642, "learning_rate": 8.64281305361397e-05, "loss": 0.5388, "step": 4350 }, { "epoch": 2.951929587000677, "grad_norm": 1.2048066854476929, "learning_rate": 8.595995336753597e-05, "loss": 0.5891, "step": 4360 }, { "epoch": 2.958700067704807, "grad_norm": 1.407758355140686, "learning_rate": 8.549208992226001e-05, "loss": 0.5351, "step": 4370 }, { "epoch": 2.9654705484089368, "grad_norm": 1.075348973274231, "learning_rate": 8.502455065467006e-05, "loss": 0.5939, "step": 4380 }, { "epoch": 2.972241029113067, "grad_norm": 1.2892156839370728, "learning_rate": 8.45573460118806e-05, "loss": 0.5488, "step": 4390 }, { "epoch": 2.979011509817197, "grad_norm": 1.1205973625183105, "learning_rate": 8.4090486433529e-05, "loss": 0.6054, "step": 4400 }, { "epoch": 2.985781990521327, "grad_norm": 1.4507098197937012, "learning_rate": 8.362398235154213e-05, "loss": 0.5542, "step": 4410 }, { "epoch": 2.9925524712254568, "grad_norm": 1.2207527160644531, "learning_rate": 8.31578441899035e-05, "loss": 0.5326, "step": 4420 }, { "epoch": 2.999322951929587, "grad_norm": 1.032354712486267, "learning_rate": 8.269208236442003e-05, "loss": 0.5924, "step": 4430 }, { "epoch": 3.006093432633717, "grad_norm": 1.38179349899292, "learning_rate": 8.222670728248941e-05, "loss": 0.4272, "step": 4440 }, { "epoch": 3.012863913337847, "grad_norm": 1.3886513710021973, "learning_rate": 8.17617293428677e-05, "loss": 0.4442, "step": 4450 }, { "epoch": 3.019634394041977, "grad_norm": 1.5716043710708618, "learning_rate": 8.129715893543681e-05, "loss": 0.3873, "step": 4460 }, { "epoch": 3.026404874746107, "grad_norm": 1.4398396015167236, "learning_rate": 8.08330064409724e-05, "loss": 0.3991, "step": 4470 }, { "epoch": 3.0331753554502368, "grad_norm": 1.4795118570327759, "learning_rate": 8.036928223091187e-05, "loss": 0.4557, "step": 4480 }, { "epoch": 3.039945836154367, "grad_norm": 1.5591235160827637, "learning_rate": 7.990599666712268e-05, "loss": 0.4077, "step": 4490 }, { "epoch": 3.0467163168584968, "grad_norm": 1.3513033390045166, "learning_rate": 7.94431601016708e-05, "loss": 0.3999, "step": 4500 }, { "epoch": 3.053486797562627, "grad_norm": 1.4254108667373657, "learning_rate": 7.898078287658941e-05, "loss": 0.3614, "step": 4510 }, { "epoch": 3.0602572782667568, "grad_norm": 1.2728102207183838, "learning_rate": 7.85188753236477e-05, "loss": 0.4038, "step": 4520 }, { "epoch": 3.067027758970887, "grad_norm": 1.6714439392089844, "learning_rate": 7.805744776412012e-05, "loss": 0.4229, "step": 4530 }, { "epoch": 3.0737982396750168, "grad_norm": 1.4847053289413452, "learning_rate": 7.759651050855568e-05, "loss": 0.3806, "step": 4540 }, { "epoch": 3.080568720379147, "grad_norm": 1.7574979066848755, "learning_rate": 7.713607385654772e-05, "loss": 0.3625, "step": 4550 }, { "epoch": 3.0873392010832768, "grad_norm": 1.495059609413147, "learning_rate": 7.667614809650351e-05, "loss": 0.3889, "step": 4560 }, { "epoch": 3.094109681787407, "grad_norm": 1.2997581958770752, "learning_rate": 7.621674350541461e-05, "loss": 0.3775, "step": 4570 }, { "epoch": 3.1008801624915368, "grad_norm": 1.5862250328063965, "learning_rate": 7.575787034862704e-05, "loss": 0.4023, "step": 4580 }, { "epoch": 3.107650643195667, "grad_norm": 1.5325440168380737, "learning_rate": 7.529953887961197e-05, "loss": 0.3641, "step": 4590 }, { "epoch": 3.1144211238997968, "grad_norm": 1.4811371564865112, "learning_rate": 7.484175933973668e-05, "loss": 0.3818, "step": 4600 }, { "epoch": 3.121191604603927, "grad_norm": 1.7169820070266724, "learning_rate": 7.438454195803559e-05, "loss": 0.4187, "step": 4610 }, { "epoch": 3.1279620853080567, "grad_norm": 1.6318345069885254, "learning_rate": 7.392789695098182e-05, "loss": 0.3718, "step": 4620 }, { "epoch": 3.134732566012187, "grad_norm": 1.633092999458313, "learning_rate": 7.347183452225874e-05, "loss": 0.3969, "step": 4630 }, { "epoch": 3.1415030467163167, "grad_norm": 1.8210922479629517, "learning_rate": 7.301636486253215e-05, "loss": 0.4193, "step": 4640 }, { "epoch": 3.148273527420447, "grad_norm": 2.1533546447753906, "learning_rate": 7.256149814922253e-05, "loss": 0.3923, "step": 4650 }, { "epoch": 3.1550440081245767, "grad_norm": 1.4838796854019165, "learning_rate": 7.210724454627751e-05, "loss": 0.3871, "step": 4660 }, { "epoch": 3.161814488828707, "grad_norm": 1.755631685256958, "learning_rate": 7.165361420394482e-05, "loss": 0.4219, "step": 4670 }, { "epoch": 3.1685849695328367, "grad_norm": 1.197309136390686, "learning_rate": 7.120061725854554e-05, "loss": 0.4219, "step": 4680 }, { "epoch": 3.175355450236967, "grad_norm": 1.7161248922348022, "learning_rate": 7.074826383224761e-05, "loss": 0.4002, "step": 4690 }, { "epoch": 3.1821259309410967, "grad_norm": 1.4585338830947876, "learning_rate": 7.029656403283951e-05, "loss": 0.3984, "step": 4700 }, { "epoch": 3.188896411645227, "grad_norm": 1.5048658847808838, "learning_rate": 6.984552795350453e-05, "loss": 0.4005, "step": 4710 }, { "epoch": 3.1956668923493567, "grad_norm": 1.7454990148544312, "learning_rate": 6.939516567259523e-05, "loss": 0.3999, "step": 4720 }, { "epoch": 3.202437373053487, "grad_norm": 1.4264365434646606, "learning_rate": 6.894548725340822e-05, "loss": 0.3844, "step": 4730 }, { "epoch": 3.2092078537576167, "grad_norm": 1.3761653900146484, "learning_rate": 6.849650274395929e-05, "loss": 0.4107, "step": 4740 }, { "epoch": 3.215978334461747, "grad_norm": 1.6094237565994263, "learning_rate": 6.804822217675885e-05, "loss": 0.3865, "step": 4750 }, { "epoch": 3.2227488151658767, "grad_norm": 1.969099998474121, "learning_rate": 6.760065556858786e-05, "loss": 0.3635, "step": 4760 }, { "epoch": 3.229519295870007, "grad_norm": 1.5209436416625977, "learning_rate": 6.715381292027385e-05, "loss": 0.3754, "step": 4770 }, { "epoch": 3.2362897765741367, "grad_norm": 1.6469786167144775, "learning_rate": 6.670770421646767e-05, "loss": 0.4034, "step": 4780 }, { "epoch": 3.243060257278267, "grad_norm": 1.6617894172668457, "learning_rate": 6.626233942542013e-05, "loss": 0.3946, "step": 4790 }, { "epoch": 3.2498307379823967, "grad_norm": 1.4001210927963257, "learning_rate": 6.581772849875951e-05, "loss": 0.3638, "step": 4800 }, { "epoch": 3.2566012186865265, "grad_norm": 1.7633929252624512, "learning_rate": 6.537388137126899e-05, "loss": 0.3607, "step": 4810 }, { "epoch": 3.2633716993906567, "grad_norm": 1.6892105340957642, "learning_rate": 6.493080796066477e-05, "loss": 0.3797, "step": 4820 }, { "epoch": 3.270142180094787, "grad_norm": 1.4346562623977661, "learning_rate": 6.448851816737443e-05, "loss": 0.3552, "step": 4830 }, { "epoch": 3.2769126607989167, "grad_norm": 1.5974228382110596, "learning_rate": 6.404702187431568e-05, "loss": 0.3905, "step": 4840 }, { "epoch": 3.2836831415030465, "grad_norm": 1.4062926769256592, "learning_rate": 6.360632894667555e-05, "loss": 0.3864, "step": 4850 }, { "epoch": 3.2904536222071767, "grad_norm": 1.6129074096679688, "learning_rate": 6.316644923169007e-05, "loss": 0.3921, "step": 4860 }, { "epoch": 3.2972241029113065, "grad_norm": 1.5494030714035034, "learning_rate": 6.27273925584239e-05, "loss": 0.4138, "step": 4870 }, { "epoch": 3.3039945836154367, "grad_norm": 1.5944302082061768, "learning_rate": 6.228916873755118e-05, "loss": 0.3709, "step": 4880 }, { "epoch": 3.3107650643195665, "grad_norm": 1.4350250959396362, "learning_rate": 6.185178756113586e-05, "loss": 0.3622, "step": 4890 }, { "epoch": 3.3175355450236967, "grad_norm": 1.5585368871688843, "learning_rate": 6.141525880241313e-05, "loss": 0.3969, "step": 4900 }, { "epoch": 3.3243060257278265, "grad_norm": 1.289538860321045, "learning_rate": 6.097959221557108e-05, "loss": 0.394, "step": 4910 }, { "epoch": 3.3310765064319567, "grad_norm": 1.7543057203292847, "learning_rate": 6.054479753553259e-05, "loss": 0.396, "step": 4920 }, { "epoch": 3.3378469871360865, "grad_norm": 1.633093237876892, "learning_rate": 6.0110884477737875e-05, "loss": 0.415, "step": 4930 }, { "epoch": 3.3446174678402167, "grad_norm": 1.537914514541626, "learning_rate": 5.9677862737927415e-05, "loss": 0.399, "step": 4940 }, { "epoch": 3.3513879485443465, "grad_norm": 1.6341283321380615, "learning_rate": 5.924574199192527e-05, "loss": 0.3825, "step": 4950 }, { "epoch": 3.3581584292484767, "grad_norm": 1.4960927963256836, "learning_rate": 5.881453189542295e-05, "loss": 0.3793, "step": 4960 }, { "epoch": 3.3649289099526065, "grad_norm": 1.6509079933166504, "learning_rate": 5.838424208376354e-05, "loss": 0.3939, "step": 4970 }, { "epoch": 3.3716993906567367, "grad_norm": 1.662853479385376, "learning_rate": 5.7954882171726444e-05, "loss": 0.4141, "step": 4980 }, { "epoch": 3.3784698713608665, "grad_norm": 1.639427661895752, "learning_rate": 5.752646175331267e-05, "loss": 0.4112, "step": 4990 }, { "epoch": 3.3852403520649967, "grad_norm": 1.4693089723587036, "learning_rate": 5.709899040153013e-05, "loss": 0.372, "step": 5000 }, { "epoch": 3.3852403520649967, "eval_loss": 0.9812659621238708, "eval_runtime": 23.1744, "eval_samples_per_second": 107.36, "eval_steps_per_second": 13.42, "step": 5000 }, { "epoch": 3.3920108327691265, "grad_norm": 1.4617177248001099, "learning_rate": 5.667247766818018e-05, "loss": 0.385, "step": 5010 }, { "epoch": 3.3987813134732567, "grad_norm": 1.2667337656021118, "learning_rate": 5.6246933083643794e-05, "loss": 0.3759, "step": 5020 }, { "epoch": 3.4055517941773865, "grad_norm": 1.9020839929580688, "learning_rate": 5.582236615666885e-05, "loss": 0.3991, "step": 5030 }, { "epoch": 3.4123222748815167, "grad_norm": 1.4279497861862183, "learning_rate": 5.5398786374157564e-05, "loss": 0.3938, "step": 5040 }, { "epoch": 3.4190927555856465, "grad_norm": 1.5497093200683594, "learning_rate": 5.4976203200954425e-05, "loss": 0.4, "step": 5050 }, { "epoch": 3.4258632362897767, "grad_norm": 1.3598889112472534, "learning_rate": 5.4554626079634906e-05, "loss": 0.4117, "step": 5060 }, { "epoch": 3.4326337169939065, "grad_norm": 1.498186707496643, "learning_rate": 5.413406443029433e-05, "loss": 0.409, "step": 5070 }, { "epoch": 3.4394041976980367, "grad_norm": 1.9175001382827759, "learning_rate": 5.371452765033733e-05, "loss": 0.405, "step": 5080 }, { "epoch": 3.4461746784021665, "grad_norm": 1.9584026336669922, "learning_rate": 5.32960251142681e-05, "loss": 0.3635, "step": 5090 }, { "epoch": 3.4529451591062967, "grad_norm": 1.582276463508606, "learning_rate": 5.287856617348054e-05, "loss": 0.4101, "step": 5100 }, { "epoch": 3.4597156398104265, "grad_norm": 1.6922118663787842, "learning_rate": 5.2462160156049765e-05, "loss": 0.3894, "step": 5110 }, { "epoch": 3.4664861205145563, "grad_norm": 1.7980077266693115, "learning_rate": 5.2046816366523355e-05, "loss": 0.3909, "step": 5120 }, { "epoch": 3.4732566012186865, "grad_norm": 1.5998905897140503, "learning_rate": 5.1632544085713376e-05, "loss": 0.367, "step": 5130 }, { "epoch": 3.4800270819228167, "grad_norm": 1.5311387777328491, "learning_rate": 5.121935257048936e-05, "loss": 0.4053, "step": 5140 }, { "epoch": 3.4867975626269465, "grad_norm": 1.7611960172653198, "learning_rate": 5.080725105357109e-05, "loss": 0.3938, "step": 5150 }, { "epoch": 3.4935680433310763, "grad_norm": 2.3462700843811035, "learning_rate": 5.0396248743322526e-05, "loss": 0.3949, "step": 5160 }, { "epoch": 3.5003385240352065, "grad_norm": 1.386608362197876, "learning_rate": 4.998635482354598e-05, "loss": 0.3593, "step": 5170 }, { "epoch": 3.5071090047393367, "grad_norm": 2.024418592453003, "learning_rate": 4.9577578453276886e-05, "loss": 0.3835, "step": 5180 }, { "epoch": 3.5138794854434665, "grad_norm": 1.9304969310760498, "learning_rate": 4.9169928766579164e-05, "loss": 0.4439, "step": 5190 }, { "epoch": 3.5206499661475963, "grad_norm": 1.6261743307113647, "learning_rate": 4.876341487234105e-05, "loss": 0.4055, "step": 5200 }, { "epoch": 3.5274204468517265, "grad_norm": 1.770004153251648, "learning_rate": 4.83580458540717e-05, "loss": 0.401, "step": 5210 }, { "epoch": 3.5341909275558567, "grad_norm": 2.584394931793213, "learning_rate": 4.7953830769698125e-05, "loss": 0.3809, "step": 5220 }, { "epoch": 3.5409614082599865, "grad_norm": 1.66965651512146, "learning_rate": 4.755077865136274e-05, "loss": 0.4251, "step": 5230 }, { "epoch": 3.5477318889641163, "grad_norm": 1.5093834400177002, "learning_rate": 4.7148898505221685e-05, "loss": 0.3812, "step": 5240 }, { "epoch": 3.5545023696682465, "grad_norm": 1.7326291799545288, "learning_rate": 4.674819931124348e-05, "loss": 0.3606, "step": 5250 }, { "epoch": 3.5612728503723763, "grad_norm": 2.2934281826019287, "learning_rate": 4.63486900230084e-05, "loss": 0.4269, "step": 5260 }, { "epoch": 3.5680433310765065, "grad_norm": 1.787213683128357, "learning_rate": 4.595037956750845e-05, "loss": 0.4109, "step": 5270 }, { "epoch": 3.5748138117806363, "grad_norm": 1.5188498497009277, "learning_rate": 4.5553276844947726e-05, "loss": 0.4027, "step": 5280 }, { "epoch": 3.5815842924847665, "grad_norm": 1.5621033906936646, "learning_rate": 4.515739072854376e-05, "loss": 0.4377, "step": 5290 }, { "epoch": 3.5883547731888963, "grad_norm": 1.4404442310333252, "learning_rate": 4.4762730064329164e-05, "loss": 0.4058, "step": 5300 }, { "epoch": 3.5951252538930265, "grad_norm": 1.506831407546997, "learning_rate": 4.436930367095384e-05, "loss": 0.3852, "step": 5310 }, { "epoch": 3.6018957345971563, "grad_norm": 2.1018640995025635, "learning_rate": 4.3977120339488174e-05, "loss": 0.4128, "step": 5320 }, { "epoch": 3.6086662153012865, "grad_norm": 1.4768526554107666, "learning_rate": 4.358618883322639e-05, "loss": 0.3848, "step": 5330 }, { "epoch": 3.6154366960054163, "grad_norm": 1.3917316198349, "learning_rate": 4.319651788749084e-05, "loss": 0.4186, "step": 5340 }, { "epoch": 3.6222071767095465, "grad_norm": 1.9646469354629517, "learning_rate": 4.280811620943682e-05, "loss": 0.4213, "step": 5350 }, { "epoch": 3.6289776574136763, "grad_norm": 2.266582727432251, "learning_rate": 4.2420992477857856e-05, "loss": 0.4063, "step": 5360 }, { "epoch": 3.6357481381178065, "grad_norm": 1.8989133834838867, "learning_rate": 4.203515534299205e-05, "loss": 0.3786, "step": 5370 }, { "epoch": 3.6425186188219363, "grad_norm": 2.106405258178711, "learning_rate": 4.16506134263285e-05, "loss": 0.406, "step": 5380 }, { "epoch": 3.6492890995260665, "grad_norm": 2.1753334999084473, "learning_rate": 4.12673753204149e-05, "loss": 0.3845, "step": 5390 }, { "epoch": 3.6560595802301963, "grad_norm": 1.5723298788070679, "learning_rate": 4.0885449588665395e-05, "loss": 0.411, "step": 5400 }, { "epoch": 3.6628300609343265, "grad_norm": 2.0291285514831543, "learning_rate": 4.050484476516926e-05, "loss": 0.3926, "step": 5410 }, { "epoch": 3.6696005416384563, "grad_norm": 1.5461398363113403, "learning_rate": 4.012556935450027e-05, "loss": 0.4232, "step": 5420 }, { "epoch": 3.676371022342586, "grad_norm": 1.6446950435638428, "learning_rate": 3.97476318315265e-05, "loss": 0.3882, "step": 5430 }, { "epoch": 3.6831415030467163, "grad_norm": 1.363389015197754, "learning_rate": 3.937104064122117e-05, "loss": 0.3714, "step": 5440 }, { "epoch": 3.6899119837508465, "grad_norm": 1.4707744121551514, "learning_rate": 3.899580419847385e-05, "loss": 0.3633, "step": 5450 }, { "epoch": 3.6966824644549763, "grad_norm": 2.183893918991089, "learning_rate": 3.862193088790231e-05, "loss": 0.3918, "step": 5460 }, { "epoch": 3.703452945159106, "grad_norm": 1.798282504081726, "learning_rate": 3.82494290636654e-05, "loss": 0.4081, "step": 5470 }, { "epoch": 3.7102234258632363, "grad_norm": 1.563833475112915, "learning_rate": 3.7878307049276195e-05, "loss": 0.3772, "step": 5480 }, { "epoch": 3.7169939065673665, "grad_norm": 1.5234781503677368, "learning_rate": 3.7508573137416095e-05, "loss": 0.3923, "step": 5490 }, { "epoch": 3.7237643872714963, "grad_norm": 1.5436840057373047, "learning_rate": 3.71402355897495e-05, "loss": 0.4204, "step": 5500 }, { "epoch": 3.730534867975626, "grad_norm": 1.640419363975525, "learning_rate": 3.6773302636739116e-05, "loss": 0.391, "step": 5510 }, { "epoch": 3.7373053486797563, "grad_norm": 1.8847980499267578, "learning_rate": 3.640778247746226e-05, "loss": 0.3843, "step": 5520 }, { "epoch": 3.7440758293838865, "grad_norm": 1.2375092506408691, "learning_rate": 3.6043683279427484e-05, "loss": 0.3623, "step": 5530 }, { "epoch": 3.7508463100880163, "grad_norm": 1.3256595134735107, "learning_rate": 3.568101317839205e-05, "loss": 0.3923, "step": 5540 }, { "epoch": 3.757616790792146, "grad_norm": 1.5230741500854492, "learning_rate": 3.531978027818027e-05, "loss": 0.3918, "step": 5550 }, { "epoch": 3.7643872714962763, "grad_norm": 1.619551181793213, "learning_rate": 3.4959992650502346e-05, "loss": 0.4316, "step": 5560 }, { "epoch": 3.7711577522004065, "grad_norm": 2.241872787475586, "learning_rate": 3.4601658334774014e-05, "loss": 0.4183, "step": 5570 }, { "epoch": 3.7779282329045363, "grad_norm": 1.427147626876831, "learning_rate": 3.424478533793695e-05, "loss": 0.4036, "step": 5580 }, { "epoch": 3.784698713608666, "grad_norm": 1.646103024482727, "learning_rate": 3.388938163427969e-05, "loss": 0.3846, "step": 5590 }, { "epoch": 3.7914691943127963, "grad_norm": 1.4623626470565796, "learning_rate": 3.3535455165259734e-05, "loss": 0.4339, "step": 5600 }, { "epoch": 3.798239675016926, "grad_norm": 1.5822981595993042, "learning_rate": 3.318301383932586e-05, "loss": 0.4013, "step": 5610 }, { "epoch": 3.8050101557210563, "grad_norm": 1.6035799980163574, "learning_rate": 3.283206553174144e-05, "loss": 0.3765, "step": 5620 }, { "epoch": 3.811780636425186, "grad_norm": 1.4690262079238892, "learning_rate": 3.248261808440858e-05, "loss": 0.3846, "step": 5630 }, { "epoch": 3.8185511171293163, "grad_norm": 1.6690099239349365, "learning_rate": 3.213467930569279e-05, "loss": 0.3908, "step": 5640 }, { "epoch": 3.825321597833446, "grad_norm": 1.9128773212432861, "learning_rate": 3.178825697024859e-05, "loss": 0.4075, "step": 5650 }, { "epoch": 3.8320920785375763, "grad_norm": 1.5227471590042114, "learning_rate": 3.14433588188457e-05, "loss": 0.3949, "step": 5660 }, { "epoch": 3.838862559241706, "grad_norm": 1.8962739706039429, "learning_rate": 3.109999255819607e-05, "loss": 0.3708, "step": 5670 }, { "epoch": 3.8456330399458363, "grad_norm": 1.7166234254837036, "learning_rate": 3.075816586078182e-05, "loss": 0.3853, "step": 5680 }, { "epoch": 3.852403520649966, "grad_norm": 1.603034257888794, "learning_rate": 3.0417886364683578e-05, "loss": 0.3697, "step": 5690 }, { "epoch": 3.8591740013540963, "grad_norm": 1.2980273962020874, "learning_rate": 3.0079161673410006e-05, "loss": 0.3561, "step": 5700 }, { "epoch": 3.865944482058226, "grad_norm": 1.2596299648284912, "learning_rate": 2.974199935572781e-05, "loss": 0.3759, "step": 5710 }, { "epoch": 3.8727149627623563, "grad_norm": 1.6658598184585571, "learning_rate": 2.9406406945492616e-05, "loss": 0.3902, "step": 5720 }, { "epoch": 3.879485443466486, "grad_norm": 1.401743769645691, "learning_rate": 2.907239194148066e-05, "loss": 0.4045, "step": 5730 }, { "epoch": 3.8862559241706163, "grad_norm": 1.7074028253555298, "learning_rate": 2.8739961807221127e-05, "loss": 0.4103, "step": 5740 }, { "epoch": 3.893026404874746, "grad_norm": 1.6622352600097656, "learning_rate": 2.840912397082954e-05, "loss": 0.3718, "step": 5750 }, { "epoch": 3.8997968855788763, "grad_norm": 1.5955240726470947, "learning_rate": 2.807988582484171e-05, "loss": 0.3949, "step": 5760 }, { "epoch": 3.906567366283006, "grad_norm": 1.5108157396316528, "learning_rate": 2.7752254726048422e-05, "loss": 0.3665, "step": 5770 }, { "epoch": 3.913337846987136, "grad_norm": 1.4178344011306763, "learning_rate": 2.7426237995331296e-05, "loss": 0.3835, "step": 5780 }, { "epoch": 3.920108327691266, "grad_norm": 1.7224016189575195, "learning_rate": 2.7101842917498997e-05, "loss": 0.4008, "step": 5790 }, { "epoch": 3.9268788083953963, "grad_norm": 1.513185977935791, "learning_rate": 2.6779076741124576e-05, "loss": 0.4084, "step": 5800 }, { "epoch": 3.933649289099526, "grad_norm": 1.806357741355896, "learning_rate": 2.6457946678383448e-05, "loss": 0.382, "step": 5810 }, { "epoch": 3.940419769803656, "grad_norm": 1.5622941255569458, "learning_rate": 2.6138459904892177e-05, "loss": 0.3943, "step": 5820 }, { "epoch": 3.947190250507786, "grad_norm": 2.032970428466797, "learning_rate": 2.5820623559548285e-05, "loss": 0.3486, "step": 5830 }, { "epoch": 3.9539607312119163, "grad_norm": 1.7815639972686768, "learning_rate": 2.550444474437066e-05, "loss": 0.3772, "step": 5840 }, { "epoch": 3.960731211916046, "grad_norm": 1.6397390365600586, "learning_rate": 2.5189930524340767e-05, "loss": 0.3629, "step": 5850 }, { "epoch": 3.967501692620176, "grad_norm": 1.4618537425994873, "learning_rate": 2.487708792724497e-05, "loss": 0.4054, "step": 5860 }, { "epoch": 3.974272173324306, "grad_norm": 1.5044384002685547, "learning_rate": 2.4565923943517343e-05, "loss": 0.4003, "step": 5870 }, { "epoch": 3.9810426540284363, "grad_norm": 1.5843464136123657, "learning_rate": 2.425644552608356e-05, "loss": 0.3977, "step": 5880 }, { "epoch": 3.987813134732566, "grad_norm": 1.5150847434997559, "learning_rate": 2.3948659590205515e-05, "loss": 0.4088, "step": 5890 }, { "epoch": 3.994583615436696, "grad_norm": 1.9236164093017578, "learning_rate": 2.3642573013326663e-05, "loss": 0.4008, "step": 5900 }, { "epoch": 4.001354096140826, "grad_norm": 1.42927086353302, "learning_rate": 2.3338192634918643e-05, "loss": 0.3427, "step": 5910 }, { "epoch": 4.008124576844956, "grad_norm": 1.3550347089767456, "learning_rate": 2.3035525256328106e-05, "loss": 0.2699, "step": 5920 }, { "epoch": 4.014895057549086, "grad_norm": 1.546830177307129, "learning_rate": 2.2734577640625022e-05, "loss": 0.2694, "step": 5930 }, { "epoch": 4.021665538253216, "grad_norm": 1.7005549669265747, "learning_rate": 2.2435356512451387e-05, "loss": 0.2822, "step": 5940 }, { "epoch": 4.028436018957346, "grad_norm": 1.5947457551956177, "learning_rate": 2.2137868557871067e-05, "loss": 0.2965, "step": 5950 }, { "epoch": 4.035206499661476, "grad_norm": 1.600761890411377, "learning_rate": 2.1842120424220334e-05, "loss": 0.2551, "step": 5960 }, { "epoch": 4.041976980365606, "grad_norm": 1.5094797611236572, "learning_rate": 2.1548118719959286e-05, "loss": 0.2903, "step": 5970 }, { "epoch": 4.048747461069736, "grad_norm": 1.5594260692596436, "learning_rate": 2.1255870014524327e-05, "loss": 0.294, "step": 5980 }, { "epoch": 4.055517941773866, "grad_norm": 1.5365486145019531, "learning_rate": 2.096538083818128e-05, "loss": 0.2838, "step": 5990 }, { "epoch": 4.062288422477996, "grad_norm": 1.9512939453125, "learning_rate": 2.067665768187941e-05, "loss": 0.2649, "step": 6000 }, { "epoch": 4.062288422477996, "eval_loss": 1.1342198848724365, "eval_runtime": 22.903, "eval_samples_per_second": 108.632, "eval_steps_per_second": 13.579, "step": 6000 }, { "epoch": 4.069058903182126, "grad_norm": 1.703903079032898, "learning_rate": 2.0389706997106527e-05, "loss": 0.2606, "step": 6010 }, { "epoch": 4.075829383886256, "grad_norm": 1.8867642879486084, "learning_rate": 2.0104535195744746e-05, "loss": 0.2848, "step": 6020 }, { "epoch": 4.082599864590386, "grad_norm": 1.9352099895477295, "learning_rate": 1.9821148649927212e-05, "loss": 0.2724, "step": 6030 }, { "epoch": 4.089370345294516, "grad_norm": 1.7266086339950562, "learning_rate": 1.953955369189574e-05, "loss": 0.2745, "step": 6040 }, { "epoch": 4.096140825998646, "grad_norm": 1.5754889249801636, "learning_rate": 1.925975661385926e-05, "loss": 0.2737, "step": 6050 }, { "epoch": 4.102911306702776, "grad_norm": 1.6799631118774414, "learning_rate": 1.8981763667853326e-05, "loss": 0.2606, "step": 6060 }, { "epoch": 4.109681787406906, "grad_norm": 1.5695922374725342, "learning_rate": 1.870558106560035e-05, "loss": 0.2621, "step": 6070 }, { "epoch": 4.116452268111036, "grad_norm": 1.550424337387085, "learning_rate": 1.8431214978370758e-05, "loss": 0.2677, "step": 6080 }, { "epoch": 4.123222748815166, "grad_norm": 1.4905930757522583, "learning_rate": 1.8158671536845186e-05, "loss": 0.2562, "step": 6090 }, { "epoch": 4.129993229519296, "grad_norm": 1.688219666481018, "learning_rate": 1.788795683097746e-05, "loss": 0.2591, "step": 6100 }, { "epoch": 4.136763710223426, "grad_norm": 1.8246350288391113, "learning_rate": 1.761907690985847e-05, "loss": 0.2823, "step": 6110 }, { "epoch": 4.143534190927556, "grad_norm": 1.475894808769226, "learning_rate": 1.735203778158109e-05, "loss": 0.2672, "step": 6120 }, { "epoch": 4.150304671631686, "grad_norm": 2.1845951080322266, "learning_rate": 1.7086845413105778e-05, "loss": 0.2607, "step": 6130 }, { "epoch": 4.157075152335816, "grad_norm": 1.9802888631820679, "learning_rate": 1.6823505730127455e-05, "loss": 0.2653, "step": 6140 }, { "epoch": 4.163845633039946, "grad_norm": 1.2355766296386719, "learning_rate": 1.656202461694293e-05, "loss": 0.2787, "step": 6150 }, { "epoch": 4.170616113744076, "grad_norm": 1.6711342334747314, "learning_rate": 1.630240791631945e-05, "loss": 0.2996, "step": 6160 }, { "epoch": 4.177386594448206, "grad_norm": 1.8249988555908203, "learning_rate": 1.6044661429364205e-05, "loss": 0.2617, "step": 6170 }, { "epoch": 4.184157075152336, "grad_norm": 2.0309152603149414, "learning_rate": 1.5788790915394645e-05, "loss": 0.2627, "step": 6180 }, { "epoch": 4.190927555856466, "grad_norm": 1.7783539295196533, "learning_rate": 1.5534802091809818e-05, "loss": 0.2734, "step": 6190 }, { "epoch": 4.197698036560595, "grad_norm": 1.5822839736938477, "learning_rate": 1.528270063396262e-05, "loss": 0.2765, "step": 6200 }, { "epoch": 4.204468517264726, "grad_norm": 1.9683705568313599, "learning_rate": 1.5032492175032876e-05, "loss": 0.2665, "step": 6210 }, { "epoch": 4.211238997968856, "grad_norm": 1.4425179958343506, "learning_rate": 1.4784182305901672e-05, "loss": 0.2644, "step": 6220 }, { "epoch": 4.218009478672986, "grad_norm": 1.8725738525390625, "learning_rate": 1.4537776575026207e-05, "loss": 0.2611, "step": 6230 }, { "epoch": 4.224779959377115, "grad_norm": 1.767899990081787, "learning_rate": 1.4293280488315986e-05, "loss": 0.2851, "step": 6240 }, { "epoch": 4.231550440081246, "grad_norm": 1.2789946794509888, "learning_rate": 1.4050699509009679e-05, "loss": 0.2727, "step": 6250 }, { "epoch": 4.238320920785376, "grad_norm": 1.5606369972229004, "learning_rate": 1.3810039057553138e-05, "loss": 0.2704, "step": 6260 }, { "epoch": 4.245091401489506, "grad_norm": 1.5035715103149414, "learning_rate": 1.3571304511478188e-05, "loss": 0.2847, "step": 6270 }, { "epoch": 4.251861882193635, "grad_norm": 1.8756885528564453, "learning_rate": 1.333450120528249e-05, "loss": 0.2551, "step": 6280 }, { "epoch": 4.258632362897766, "grad_norm": 2.072859048843384, "learning_rate": 1.3099634430310403e-05, "loss": 0.249, "step": 6290 }, { "epoch": 4.265402843601896, "grad_norm": 1.6129212379455566, "learning_rate": 1.2866709434634684e-05, "loss": 0.2961, "step": 6300 }, { "epoch": 4.272173324306026, "grad_norm": 1.705417513847351, "learning_rate": 1.2635731422939212e-05, "loss": 0.2476, "step": 6310 }, { "epoch": 4.278943805010155, "grad_norm": 1.9114418029785156, "learning_rate": 1.2406705556402776e-05, "loss": 0.275, "step": 6320 }, { "epoch": 4.285714285714286, "grad_norm": 1.7978328466415405, "learning_rate": 1.217963695258364e-05, "loss": 0.2605, "step": 6330 }, { "epoch": 4.292484766418416, "grad_norm": 1.7482448816299438, "learning_rate": 1.1954530685305287e-05, "loss": 0.2696, "step": 6340 }, { "epoch": 4.299255247122546, "grad_norm": 2.014146566390991, "learning_rate": 1.1731391784543e-05, "loss": 0.2914, "step": 6350 }, { "epoch": 4.306025727826675, "grad_norm": 2.0617308616638184, "learning_rate": 1.15102252363114e-05, "loss": 0.262, "step": 6360 }, { "epoch": 4.312796208530806, "grad_norm": 1.9172184467315674, "learning_rate": 1.1291035982553189e-05, "loss": 0.2702, "step": 6370 }, { "epoch": 4.319566689234936, "grad_norm": 1.7097840309143066, "learning_rate": 1.1073828921028606e-05, "loss": 0.308, "step": 6380 }, { "epoch": 4.326337169939066, "grad_norm": 1.5703011751174927, "learning_rate": 1.085860890520598e-05, "loss": 0.2536, "step": 6390 }, { "epoch": 4.333107650643195, "grad_norm": 2.1221113204956055, "learning_rate": 1.0645380744153378e-05, "loss": 0.2713, "step": 6400 }, { "epoch": 4.339878131347326, "grad_norm": 1.5522172451019287, "learning_rate": 1.0434149202431054e-05, "loss": 0.259, "step": 6410 }, { "epoch": 4.346648612051456, "grad_norm": 1.7431870698928833, "learning_rate": 1.0224918999985044e-05, "loss": 0.2847, "step": 6420 }, { "epoch": 4.353419092755586, "grad_norm": 1.9679934978485107, "learning_rate": 1.0017694812041656e-05, "loss": 0.2621, "step": 6430 }, { "epoch": 4.360189573459715, "grad_norm": 2.4556872844696045, "learning_rate": 9.812481269002983e-06, "loss": 0.2803, "step": 6440 }, { "epoch": 4.366960054163846, "grad_norm": 1.530918836593628, "learning_rate": 9.609282956343557e-06, "loss": 0.2962, "step": 6450 }, { "epoch": 4.373730534867976, "grad_norm": 1.861484169960022, "learning_rate": 9.408104414507724e-06, "loss": 0.2917, "step": 6460 }, { "epoch": 4.380501015572106, "grad_norm": 2.1292312145233154, "learning_rate": 9.208950138808293e-06, "loss": 0.329, "step": 6470 }, { "epoch": 4.387271496276235, "grad_norm": 1.6679848432540894, "learning_rate": 9.011824579326144e-06, "loss": 0.2768, "step": 6480 }, { "epoch": 4.394041976980366, "grad_norm": 1.5731488466262817, "learning_rate": 8.81673214081058e-06, "loss": 0.2919, "step": 6490 }, { "epoch": 4.400812457684496, "grad_norm": 1.8150240182876587, "learning_rate": 8.623677182581135e-06, "loss": 0.2719, "step": 6500 }, { "epoch": 4.407582938388625, "grad_norm": 2.06569504737854, "learning_rate": 8.432664018430003e-06, "loss": 0.2803, "step": 6510 }, { "epoch": 4.414353419092755, "grad_norm": 1.6544770002365112, "learning_rate": 8.243696916525745e-06, "loss": 0.2508, "step": 6520 }, { "epoch": 4.421123899796886, "grad_norm": 1.6926827430725098, "learning_rate": 8.056780099317885e-06, "loss": 0.2979, "step": 6530 }, { "epoch": 4.427894380501016, "grad_norm": 1.7074532508850098, "learning_rate": 7.871917743442513e-06, "loss": 0.2901, "step": 6540 }, { "epoch": 4.434664861205146, "grad_norm": 2.1102843284606934, "learning_rate": 7.68911397962906e-06, "loss": 0.2615, "step": 6550 }, { "epoch": 4.441435341909275, "grad_norm": 1.4068889617919922, "learning_rate": 7.5083728926079065e-06, "loss": 0.2608, "step": 6560 }, { "epoch": 4.448205822613406, "grad_norm": 1.8090318441390991, "learning_rate": 7.329698521019157e-06, "loss": 0.2904, "step": 6570 }, { "epoch": 4.454976303317536, "grad_norm": 1.7596811056137085, "learning_rate": 7.153094857322374e-06, "loss": 0.2763, "step": 6580 }, { "epoch": 4.461746784021665, "grad_norm": 1.7713943719863892, "learning_rate": 6.978565847707352e-06, "loss": 0.2644, "step": 6590 }, { "epoch": 4.468517264725795, "grad_norm": 1.9358819723129272, "learning_rate": 6.806115392006007e-06, "loss": 0.2758, "step": 6600 }, { "epoch": 4.475287745429926, "grad_norm": 1.916235327720642, "learning_rate": 6.635747343605181e-06, "loss": 0.2952, "step": 6610 }, { "epoch": 4.482058226134056, "grad_norm": 1.6258528232574463, "learning_rate": 6.4674655093605155e-06, "loss": 0.272, "step": 6620 }, { "epoch": 4.488828706838185, "grad_norm": 1.8681087493896484, "learning_rate": 6.301273649511464e-06, "loss": 0.2638, "step": 6630 }, { "epoch": 4.495599187542315, "grad_norm": 1.644300103187561, "learning_rate": 6.137175477597213e-06, "loss": 0.271, "step": 6640 }, { "epoch": 4.502369668246446, "grad_norm": 1.8756589889526367, "learning_rate": 5.975174660373706e-06, "loss": 0.2682, "step": 6650 }, { "epoch": 4.509140148950576, "grad_norm": 1.5481034517288208, "learning_rate": 5.815274817731753e-06, "loss": 0.2926, "step": 6660 }, { "epoch": 4.515910629654705, "grad_norm": 1.8476117849349976, "learning_rate": 5.657479522616071e-06, "loss": 0.2716, "step": 6670 }, { "epoch": 4.522681110358835, "grad_norm": 1.7573695182800293, "learning_rate": 5.501792300945507e-06, "loss": 0.2812, "step": 6680 }, { "epoch": 4.529451591062966, "grad_norm": 1.7136588096618652, "learning_rate": 5.348216631534264e-06, "loss": 0.2416, "step": 6690 }, { "epoch": 4.536222071767096, "grad_norm": 1.662249207496643, "learning_rate": 5.196755946014065e-06, "loss": 0.2571, "step": 6700 }, { "epoch": 4.542992552471225, "grad_norm": 2.3519043922424316, "learning_rate": 5.047413628757658e-06, "loss": 0.2819, "step": 6710 }, { "epoch": 4.549763033175355, "grad_norm": 1.7724781036376953, "learning_rate": 4.900193016802956e-06, "loss": 0.2881, "step": 6720 }, { "epoch": 4.556533513879486, "grad_norm": 1.6066288948059082, "learning_rate": 4.755097399778707e-06, "loss": 0.2837, "step": 6730 }, { "epoch": 4.563303994583616, "grad_norm": 2.2322845458984375, "learning_rate": 4.612130019830774e-06, "loss": 0.2648, "step": 6740 }, { "epoch": 4.570074475287745, "grad_norm": 1.8880157470703125, "learning_rate": 4.471294071549869e-06, "loss": 0.2571, "step": 6750 }, { "epoch": 4.576844955991875, "grad_norm": 1.5234016180038452, "learning_rate": 4.332592701900085e-06, "loss": 0.2567, "step": 6760 }, { "epoch": 4.583615436696006, "grad_norm": 2.566943645477295, "learning_rate": 4.196029010148527e-06, "loss": 0.2462, "step": 6770 }, { "epoch": 4.590385917400136, "grad_norm": 2.2811155319213867, "learning_rate": 4.0616060477961845e-06, "loss": 0.2695, "step": 6780 }, { "epoch": 4.597156398104265, "grad_norm": 2.036428928375244, "learning_rate": 3.929326818509638e-06, "loss": 0.2816, "step": 6790 }, { "epoch": 4.603926878808395, "grad_norm": 1.9326859712600708, "learning_rate": 3.799194278054019e-06, "loss": 0.3004, "step": 6800 }, { "epoch": 4.610697359512526, "grad_norm": 2.0376124382019043, "learning_rate": 3.6712113342269095e-06, "loss": 0.3155, "step": 6810 }, { "epoch": 4.617467840216655, "grad_norm": 1.9327590465545654, "learning_rate": 3.5453808467933558e-06, "loss": 0.2598, "step": 6820 }, { "epoch": 4.624238320920785, "grad_norm": 1.5915392637252808, "learning_rate": 3.421705627422067e-06, "loss": 0.2893, "step": 6830 }, { "epoch": 4.631008801624915, "grad_norm": 1.4876010417938232, "learning_rate": 3.300188439622465e-06, "loss": 0.2702, "step": 6840 }, { "epoch": 4.637779282329046, "grad_norm": 1.8183128833770752, "learning_rate": 3.180831998682987e-06, "loss": 0.26, "step": 6850 }, { "epoch": 4.644549763033176, "grad_norm": 1.5423557758331299, "learning_rate": 3.0636389716104607e-06, "loss": 0.309, "step": 6860 }, { "epoch": 4.651320243737305, "grad_norm": 1.5031051635742188, "learning_rate": 2.9486119770704144e-06, "loss": 0.2541, "step": 6870 }, { "epoch": 4.658090724441435, "grad_norm": 1.648635745048523, "learning_rate": 2.83575358532866e-06, "loss": 0.3016, "step": 6880 }, { "epoch": 4.664861205145566, "grad_norm": 2.3799970149993896, "learning_rate": 2.7250663181937808e-06, "loss": 0.287, "step": 6890 }, { "epoch": 4.671631685849695, "grad_norm": 1.8683040142059326, "learning_rate": 2.6165526489608016e-06, "loss": 0.2414, "step": 6900 }, { "epoch": 4.678402166553825, "grad_norm": 1.5256311893463135, "learning_rate": 2.510215002355987e-06, "loss": 0.2605, "step": 6910 }, { "epoch": 4.685172647257955, "grad_norm": 1.87392258644104, "learning_rate": 2.4060557544825724e-06, "loss": 0.2536, "step": 6920 }, { "epoch": 4.691943127962086, "grad_norm": 1.480167031288147, "learning_rate": 2.3040772327676987e-06, "loss": 0.2773, "step": 6930 }, { "epoch": 4.698713608666216, "grad_norm": 1.5413248538970947, "learning_rate": 2.2042817159104614e-06, "loss": 0.2801, "step": 6940 }, { "epoch": 4.705484089370345, "grad_norm": 1.492633581161499, "learning_rate": 2.106671433830909e-06, "loss": 0.2343, "step": 6950 }, { "epoch": 4.712254570074475, "grad_norm": 1.4329499006271362, "learning_rate": 2.011248567620272e-06, "loss": 0.2628, "step": 6960 }, { "epoch": 4.719025050778606, "grad_norm": 1.9466246366500854, "learning_rate": 1.918015249492211e-06, "loss": 0.258, "step": 6970 }, { "epoch": 4.725795531482735, "grad_norm": 1.604708194732666, "learning_rate": 1.8269735627351459e-06, "loss": 0.2807, "step": 6980 }, { "epoch": 4.732566012186865, "grad_norm": 1.7957441806793213, "learning_rate": 1.7381255416657693e-06, "loss": 0.2476, "step": 6990 }, { "epoch": 4.739336492890995, "grad_norm": 1.6520119905471802, "learning_rate": 1.6514731715835064e-06, "loss": 0.2722, "step": 7000 }, { "epoch": 4.739336492890995, "eval_loss": 1.1487771272659302, "eval_runtime": 23.0937, "eval_samples_per_second": 107.735, "eval_steps_per_second": 13.467, "step": 7000 }, { "epoch": 4.746106973595126, "grad_norm": 1.8763707876205444, "learning_rate": 1.5670183887262268e-06, "loss": 0.253, "step": 7010 }, { "epoch": 4.752877454299255, "grad_norm": 2.0074474811553955, "learning_rate": 1.4847630802269695e-06, "loss": 0.2886, "step": 7020 }, { "epoch": 4.759647935003385, "grad_norm": 1.6623965501785278, "learning_rate": 1.4047090840716982e-06, "loss": 0.2645, "step": 7030 }, { "epoch": 4.766418415707515, "grad_norm": 2.1426522731781006, "learning_rate": 1.3268581890583553e-06, "loss": 0.2834, "step": 7040 }, { "epoch": 4.773188896411646, "grad_norm": 2.4106967449188232, "learning_rate": 1.251212134756763e-06, "loss": 0.2967, "step": 7050 }, { "epoch": 4.779959377115775, "grad_norm": 1.7238754034042358, "learning_rate": 1.1777726114698628e-06, "loss": 0.2819, "step": 7060 }, { "epoch": 4.786729857819905, "grad_norm": 1.9978512525558472, "learning_rate": 1.1065412601958813e-06, "loss": 0.2892, "step": 7070 }, { "epoch": 4.793500338524035, "grad_norm": 1.807606816291809, "learning_rate": 1.0375196725916693e-06, "loss": 0.2751, "step": 7080 }, { "epoch": 4.800270819228166, "grad_norm": 1.8417556285858154, "learning_rate": 9.707093909371745e-07, "loss": 0.277, "step": 7090 }, { "epoch": 4.807041299932295, "grad_norm": 1.6947407722473145, "learning_rate": 9.061119081009262e-07, "loss": 0.2717, "step": 7100 }, { "epoch": 4.813811780636425, "grad_norm": 2.100844621658325, "learning_rate": 8.437286675067046e-07, "loss": 0.2589, "step": 7110 }, { "epoch": 4.820582261340555, "grad_norm": 1.8315235376358032, "learning_rate": 7.835610631013123e-07, "loss": 0.2774, "step": 7120 }, { "epoch": 4.827352742044685, "grad_norm": 1.8022527694702148, "learning_rate": 7.256104393233654e-07, "loss": 0.2826, "step": 7130 }, { "epoch": 4.834123222748815, "grad_norm": 1.8034976720809937, "learning_rate": 6.698780910732949e-07, "loss": 0.287, "step": 7140 }, { "epoch": 4.840893703452945, "grad_norm": 2.1168487071990967, "learning_rate": 6.163652636844375e-07, "loss": 0.2601, "step": 7150 }, { "epoch": 4.847664184157075, "grad_norm": 1.7831007242202759, "learning_rate": 5.650731528951237e-07, "loss": 0.2671, "step": 7160 }, { "epoch": 4.854434664861206, "grad_norm": 1.85152268409729, "learning_rate": 5.160029048220438e-07, "loss": 0.2877, "step": 7170 }, { "epoch": 4.861205145565335, "grad_norm": 1.629766583442688, "learning_rate": 4.691556159346133e-07, "loss": 0.3145, "step": 7180 }, { "epoch": 4.867975626269465, "grad_norm": 2.025866746902466, "learning_rate": 4.2453233303043627e-07, "loss": 0.2634, "step": 7190 }, { "epoch": 4.874746106973595, "grad_norm": 1.8864160776138306, "learning_rate": 3.8213405321195775e-07, "loss": 0.257, "step": 7200 }, { "epoch": 4.881516587677725, "grad_norm": 1.6541404724121094, "learning_rate": 3.4196172386417036e-07, "loss": 0.2942, "step": 7210 }, { "epoch": 4.888287068381855, "grad_norm": 1.627166509628296, "learning_rate": 3.0401624263344254e-07, "loss": 0.2984, "step": 7220 }, { "epoch": 4.895057549085985, "grad_norm": 2.0203287601470947, "learning_rate": 2.682984574074565e-07, "loss": 0.2775, "step": 7230 }, { "epoch": 4.901828029790115, "grad_norm": 1.4823179244995117, "learning_rate": 2.3480916629626816e-07, "loss": 0.2303, "step": 7240 }, { "epoch": 4.908598510494246, "grad_norm": 1.6466970443725586, "learning_rate": 2.035491176144766e-07, "loss": 0.2561, "step": 7250 }, { "epoch": 4.915368991198375, "grad_norm": 1.857335090637207, "learning_rate": 1.7451900986450441e-07, "loss": 0.2478, "step": 7260 }, { "epoch": 4.922139471902505, "grad_norm": 1.615402102470398, "learning_rate": 1.4771949172097677e-07, "loss": 0.2644, "step": 7270 }, { "epoch": 4.928909952606635, "grad_norm": 1.6097745895385742, "learning_rate": 1.2315116201623288e-07, "loss": 0.2687, "step": 7280 }, { "epoch": 4.935680433310765, "grad_norm": 1.6500680446624756, "learning_rate": 1.0081456972694803e-07, "loss": 0.2782, "step": 7290 }, { "epoch": 4.942450914014895, "grad_norm": 1.5854169130325317, "learning_rate": 8.07102139618765e-08, "loss": 0.2503, "step": 7300 }, { "epoch": 4.949221394719025, "grad_norm": 1.917787790298462, "learning_rate": 6.283854395067179e-08, "loss": 0.2688, "step": 7310 }, { "epoch": 4.955991875423155, "grad_norm": 1.3667759895324707, "learning_rate": 4.719995903387231e-08, "loss": 0.2713, "step": 7320 }, { "epoch": 4.962762356127285, "grad_norm": 1.4660590887069702, "learning_rate": 3.379480865397522e-08, "loss": 0.2492, "step": 7330 }, { "epoch": 4.969532836831415, "grad_norm": 1.909756064414978, "learning_rate": 2.2623392347620455e-08, "loss": 0.2528, "step": 7340 }, { "epoch": 4.976303317535545, "grad_norm": 1.9919097423553467, "learning_rate": 1.3685959738907184e-08, "loss": 0.2797, "step": 7350 }, { "epoch": 4.983073798239675, "grad_norm": 1.7295809984207153, "learning_rate": 6.982710533787185e-09, "loss": 0.2527, "step": 7360 }, { "epoch": 4.989844278943805, "grad_norm": 1.575947642326355, "learning_rate": 2.5137945156461507e-09, "loss": 0.3057, "step": 7370 }, { "epoch": 4.996614759647935, "grad_norm": 1.8067814111709595, "learning_rate": 2.7931154193971964e-10, "loss": 0.2525, "step": 7380 }, { "epoch": 5.0, "step": 7385, "total_flos": 1.6593737353978184e+18, "train_loss": 0.5936373706919645, "train_runtime": 5834.3806, "train_samples_per_second": 40.5, "train_steps_per_second": 1.266 } ], "logging_steps": 10, "max_steps": 7385, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6593737353978184e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }