| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 1000, | |
| "global_step": 7385, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.006770480704129994, | |
| "grad_norm": 2.130030393600464, | |
| "learning_rate": 2.7063599458728013e-06, | |
| "loss": 2.3319, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.013540961408259987, | |
| "grad_norm": 2.666555881500244, | |
| "learning_rate": 5.4127198917456026e-06, | |
| "loss": 2.3443, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.020311442112389978, | |
| "grad_norm": 2.274488687515259, | |
| "learning_rate": 8.119079837618404e-06, | |
| "loss": 2.3759, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.027081922816519974, | |
| "grad_norm": 2.197918653488159, | |
| "learning_rate": 1.0825439783491205e-05, | |
| "loss": 2.1286, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.033852403520649964, | |
| "grad_norm": 2.2513201236724854, | |
| "learning_rate": 1.3531799729364006e-05, | |
| "loss": 1.9161, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.040622884224779957, | |
| "grad_norm": 1.52046537399292, | |
| "learning_rate": 1.6238159675236808e-05, | |
| "loss": 1.6287, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.04739336492890995, | |
| "grad_norm": 1.0912840366363525, | |
| "learning_rate": 1.894451962110961e-05, | |
| "loss": 1.5206, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.05416384563303995, | |
| "grad_norm": 1.050105333328247, | |
| "learning_rate": 2.165087956698241e-05, | |
| "loss": 1.3484, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.06093432633716994, | |
| "grad_norm": 1.138007402420044, | |
| "learning_rate": 2.435723951285521e-05, | |
| "loss": 1.3352, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.06770480704129993, | |
| "grad_norm": 1.0807892084121704, | |
| "learning_rate": 2.7063599458728013e-05, | |
| "loss": 1.2605, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.07447528774542993, | |
| "grad_norm": 1.1421936750411987, | |
| "learning_rate": 2.976995940460081e-05, | |
| "loss": 1.1888, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.08124576844955991, | |
| "grad_norm": 1.2684075832366943, | |
| "learning_rate": 3.2476319350473615e-05, | |
| "loss": 1.1998, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.08801624915368991, | |
| "grad_norm": 1.1413911581039429, | |
| "learning_rate": 3.518267929634642e-05, | |
| "loss": 1.1426, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0947867298578199, | |
| "grad_norm": 1.3954917192459106, | |
| "learning_rate": 3.788903924221922e-05, | |
| "loss": 1.1437, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.1015572105619499, | |
| "grad_norm": 1.2118768692016602, | |
| "learning_rate": 4.059539918809202e-05, | |
| "loss": 1.0564, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.1083276912660799, | |
| "grad_norm": 1.4291969537734985, | |
| "learning_rate": 4.330175913396482e-05, | |
| "loss": 1.0382, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.11509817197020988, | |
| "grad_norm": 1.351151943206787, | |
| "learning_rate": 4.600811907983762e-05, | |
| "loss": 1.0717, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.12186865267433988, | |
| "grad_norm": 1.3836501836776733, | |
| "learning_rate": 4.871447902571042e-05, | |
| "loss": 1.0294, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.12863913337846988, | |
| "grad_norm": 1.2129018306732178, | |
| "learning_rate": 5.142083897158322e-05, | |
| "loss": 1.0081, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.13540961408259986, | |
| "grad_norm": 1.244095802307129, | |
| "learning_rate": 5.4127198917456026e-05, | |
| "loss": 0.9383, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.14218009478672985, | |
| "grad_norm": 1.3957242965698242, | |
| "learning_rate": 5.683355886332883e-05, | |
| "loss": 0.927, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.14895057549085985, | |
| "grad_norm": 1.688636302947998, | |
| "learning_rate": 5.953991880920162e-05, | |
| "loss": 0.9617, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.15572105619498985, | |
| "grad_norm": 1.376826524734497, | |
| "learning_rate": 6.224627875507443e-05, | |
| "loss": 1.0176, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.16249153689911983, | |
| "grad_norm": 1.4289461374282837, | |
| "learning_rate": 6.495263870094723e-05, | |
| "loss": 0.9733, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.16926201760324983, | |
| "grad_norm": 1.4132306575775146, | |
| "learning_rate": 6.765899864682003e-05, | |
| "loss": 1.0141, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.17603249830737983, | |
| "grad_norm": 1.482531189918518, | |
| "learning_rate": 7.036535859269283e-05, | |
| "loss": 0.977, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.18280297901150983, | |
| "grad_norm": 1.509128212928772, | |
| "learning_rate": 7.307171853856563e-05, | |
| "loss": 0.9624, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.1895734597156398, | |
| "grad_norm": 1.7142691612243652, | |
| "learning_rate": 7.577807848443844e-05, | |
| "loss": 1.0063, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1963439404197698, | |
| "grad_norm": 1.2345936298370361, | |
| "learning_rate": 7.848443843031124e-05, | |
| "loss": 0.9562, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.2031144211238998, | |
| "grad_norm": 1.4808542728424072, | |
| "learning_rate": 8.119079837618404e-05, | |
| "loss": 1.0207, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.2098849018280298, | |
| "grad_norm": 0.9802400469779968, | |
| "learning_rate": 8.389715832205684e-05, | |
| "loss": 0.9731, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.2166553825321598, | |
| "grad_norm": 1.2837491035461426, | |
| "learning_rate": 8.660351826792964e-05, | |
| "loss": 0.9732, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.22342586323628977, | |
| "grad_norm": 1.6100679636001587, | |
| "learning_rate": 8.930987821380244e-05, | |
| "loss": 0.9645, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.23019634394041977, | |
| "grad_norm": 1.65373957157135, | |
| "learning_rate": 9.201623815967524e-05, | |
| "loss": 0.9825, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.23696682464454977, | |
| "grad_norm": 1.4988625049591064, | |
| "learning_rate": 9.472259810554804e-05, | |
| "loss": 0.9521, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.24373730534867977, | |
| "grad_norm": 1.0492310523986816, | |
| "learning_rate": 9.742895805142085e-05, | |
| "loss": 0.9418, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.25050778605280977, | |
| "grad_norm": 1.26401948928833, | |
| "learning_rate": 0.00010013531799729365, | |
| "loss": 1.0314, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.25727826675693977, | |
| "grad_norm": 1.3206366300582886, | |
| "learning_rate": 0.00010284167794316644, | |
| "loss": 0.9194, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.2640487474610697, | |
| "grad_norm": 1.533471941947937, | |
| "learning_rate": 0.00010554803788903924, | |
| "loss": 0.9, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.2708192281651997, | |
| "grad_norm": 1.2870343923568726, | |
| "learning_rate": 0.00010825439783491205, | |
| "loss": 0.911, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2775897088693297, | |
| "grad_norm": 1.3480168581008911, | |
| "learning_rate": 0.00011096075778078485, | |
| "loss": 0.9127, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.2843601895734597, | |
| "grad_norm": 1.1548075675964355, | |
| "learning_rate": 0.00011366711772665765, | |
| "loss": 0.9206, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.2911306702775897, | |
| "grad_norm": 1.000781536102295, | |
| "learning_rate": 0.00011637347767253047, | |
| "loss": 0.9248, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.2979011509817197, | |
| "grad_norm": 1.0907179117202759, | |
| "learning_rate": 0.00011907983761840324, | |
| "loss": 0.897, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.3046716316858497, | |
| "grad_norm": 1.3253204822540283, | |
| "learning_rate": 0.00012178619756427604, | |
| "loss": 0.9503, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.3114421123899797, | |
| "grad_norm": 1.186468482017517, | |
| "learning_rate": 0.00012449255751014886, | |
| "loss": 0.885, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.3182125930941097, | |
| "grad_norm": 1.0382546186447144, | |
| "learning_rate": 0.00012719891745602166, | |
| "loss": 0.937, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.32498307379823965, | |
| "grad_norm": 0.9156469702720642, | |
| "learning_rate": 0.00012990527740189446, | |
| "loss": 0.9407, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.33175355450236965, | |
| "grad_norm": 1.2555314302444458, | |
| "learning_rate": 0.00013261163734776726, | |
| "loss": 0.9349, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.33852403520649965, | |
| "grad_norm": 1.1427136659622192, | |
| "learning_rate": 0.00013531799729364006, | |
| "loss": 0.9034, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.34529451591062965, | |
| "grad_norm": 0.9024341106414795, | |
| "learning_rate": 0.00013802435723951287, | |
| "loss": 0.8431, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.35206499661475965, | |
| "grad_norm": 1.0170283317565918, | |
| "learning_rate": 0.00014073071718538567, | |
| "loss": 0.9392, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.35883547731888965, | |
| "grad_norm": 0.9581354856491089, | |
| "learning_rate": 0.00014343707713125847, | |
| "loss": 0.9557, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.36560595802301965, | |
| "grad_norm": 1.1668641567230225, | |
| "learning_rate": 0.00014614343707713127, | |
| "loss": 0.8982, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.37237643872714965, | |
| "grad_norm": 1.249225378036499, | |
| "learning_rate": 0.00014884979702300404, | |
| "loss": 0.8719, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.3791469194312796, | |
| "grad_norm": 0.8681928515434265, | |
| "learning_rate": 0.00015155615696887687, | |
| "loss": 0.9412, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.3859174001354096, | |
| "grad_norm": 0.8795790672302246, | |
| "learning_rate": 0.00015426251691474967, | |
| "loss": 0.9476, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.3926878808395396, | |
| "grad_norm": 1.2251633405685425, | |
| "learning_rate": 0.00015696887686062247, | |
| "loss": 0.9401, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.3994583615436696, | |
| "grad_norm": 0.9845913052558899, | |
| "learning_rate": 0.00015967523680649528, | |
| "loss": 0.8447, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.4062288422477996, | |
| "grad_norm": 1.3847956657409668, | |
| "learning_rate": 0.00016238159675236808, | |
| "loss": 0.9562, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.4129993229519296, | |
| "grad_norm": 0.9039000272750854, | |
| "learning_rate": 0.00016508795669824085, | |
| "loss": 0.8706, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.4197698036560596, | |
| "grad_norm": 0.8315423130989075, | |
| "learning_rate": 0.00016779431664411368, | |
| "loss": 0.9437, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.4265402843601896, | |
| "grad_norm": 0.8760778903961182, | |
| "learning_rate": 0.00017050067658998648, | |
| "loss": 0.9078, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.4333107650643196, | |
| "grad_norm": 1.0592724084854126, | |
| "learning_rate": 0.00017320703653585928, | |
| "loss": 0.8835, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.44008124576844954, | |
| "grad_norm": 0.8527820706367493, | |
| "learning_rate": 0.00017591339648173208, | |
| "loss": 0.9088, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.44685172647257954, | |
| "grad_norm": 0.8774325847625732, | |
| "learning_rate": 0.00017861975642760488, | |
| "loss": 0.8967, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.45362220717670954, | |
| "grad_norm": 0.6633328795433044, | |
| "learning_rate": 0.00018132611637347766, | |
| "loss": 0.9158, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.46039268788083954, | |
| "grad_norm": 0.7048283219337463, | |
| "learning_rate": 0.0001840324763193505, | |
| "loss": 0.872, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.46716316858496953, | |
| "grad_norm": 0.8527712225914001, | |
| "learning_rate": 0.0001867388362652233, | |
| "loss": 0.9062, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.47393364928909953, | |
| "grad_norm": 1.095738172531128, | |
| "learning_rate": 0.0001894451962110961, | |
| "loss": 0.89, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.48070412999322953, | |
| "grad_norm": 0.8880236148834229, | |
| "learning_rate": 0.0001921515561569689, | |
| "loss": 0.8825, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.48747461069735953, | |
| "grad_norm": 0.7381774187088013, | |
| "learning_rate": 0.0001948579161028417, | |
| "loss": 0.8121, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.4942450914014895, | |
| "grad_norm": 0.9708958864212036, | |
| "learning_rate": 0.0001975642760487145, | |
| "loss": 0.8458, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.5010155721056195, | |
| "grad_norm": 1.0069886445999146, | |
| "learning_rate": 0.00019999998882753333, | |
| "loss": 0.8679, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.5077860528097495, | |
| "grad_norm": 0.8364754915237427, | |
| "learning_rate": 0.00019999864813455363, | |
| "loss": 0.8797, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.5145565335138795, | |
| "grad_norm": 0.8467391133308411, | |
| "learning_rate": 0.0001999950729825663, | |
| "loss": 0.8789, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.5213270142180095, | |
| "grad_norm": 0.749064028263092, | |
| "learning_rate": 0.00019998926345145775, | |
| "loss": 0.9156, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.5280974949221394, | |
| "grad_norm": 0.7991885542869568, | |
| "learning_rate": 0.00019998121967104132, | |
| "loss": 0.919, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.5348679756262694, | |
| "grad_norm": 0.8024610877037048, | |
| "learning_rate": 0.00019997094182105447, | |
| "loss": 0.8619, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.5416384563303994, | |
| "grad_norm": 0.8949725031852722, | |
| "learning_rate": 0.00019995843013115454, | |
| "loss": 0.86, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.5484089370345294, | |
| "grad_norm": 0.9048612713813782, | |
| "learning_rate": 0.00019994368488091398, | |
| "loss": 0.9258, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.5551794177386594, | |
| "grad_norm": 1.112876057624817, | |
| "learning_rate": 0.00019992670639981376, | |
| "loss": 0.8758, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.5619498984427894, | |
| "grad_norm": 0.9120655059814453, | |
| "learning_rate": 0.00019990749506723624, | |
| "loss": 0.9112, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.5687203791469194, | |
| "grad_norm": 0.9125117063522339, | |
| "learning_rate": 0.00019988605131245662, | |
| "loss": 0.899, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.5754908598510494, | |
| "grad_norm": 0.8011307716369629, | |
| "learning_rate": 0.00019986237561463318, | |
| "loss": 0.8604, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.5822613405551794, | |
| "grad_norm": 0.7512729167938232, | |
| "learning_rate": 0.00019983646850279692, | |
| "loss": 0.8411, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.5890318212593094, | |
| "grad_norm": 0.7400951981544495, | |
| "learning_rate": 0.0001998083305558394, | |
| "loss": 0.9106, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.5958023019634394, | |
| "grad_norm": 0.8688220381736755, | |
| "learning_rate": 0.00019977796240250008, | |
| "loss": 0.9071, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.6025727826675694, | |
| "grad_norm": 0.9177795052528381, | |
| "learning_rate": 0.00019974536472135203, | |
| "loss": 0.9038, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.6093432633716994, | |
| "grad_norm": 0.986629843711853, | |
| "learning_rate": 0.00019971053824078693, | |
| "loss": 0.8832, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.6161137440758294, | |
| "grad_norm": 0.7033129334449768, | |
| "learning_rate": 0.00019967348373899868, | |
| "loss": 0.845, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.6228842247799594, | |
| "grad_norm": 0.8107329607009888, | |
| "learning_rate": 0.0001996342020439662, | |
| "loss": 0.9287, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.6296547054840894, | |
| "grad_norm": 0.7914236783981323, | |
| "learning_rate": 0.00019959269403343474, | |
| "loss": 0.8836, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.6364251861882194, | |
| "grad_norm": 0.8895307183265686, | |
| "learning_rate": 0.00019954896063489622, | |
| "loss": 0.8759, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.6431956668923493, | |
| "grad_norm": 0.8289987444877625, | |
| "learning_rate": 0.0001995030028255688, | |
| "loss": 0.9136, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.6499661475964793, | |
| "grad_norm": 0.9810376167297363, | |
| "learning_rate": 0.00019945482163237472, | |
| "loss": 0.8388, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.6567366283006093, | |
| "grad_norm": 0.7306379079818726, | |
| "learning_rate": 0.0001994044181319176, | |
| "loss": 0.8804, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.6635071090047393, | |
| "grad_norm": 0.7892174124717712, | |
| "learning_rate": 0.00019935179345045815, | |
| "loss": 0.8671, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.6702775897088693, | |
| "grad_norm": 0.9007791876792908, | |
| "learning_rate": 0.0001992969487638893, | |
| "loss": 0.8661, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.6770480704129993, | |
| "grad_norm": 0.7324849963188171, | |
| "learning_rate": 0.00019923988529770958, | |
| "loss": 0.7901, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6770480704129993, | |
| "eval_loss": 0.8919770121574402, | |
| "eval_runtime": 23.6227, | |
| "eval_samples_per_second": 105.323, | |
| "eval_steps_per_second": 13.165, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6838185511171293, | |
| "grad_norm": 0.8670386672019958, | |
| "learning_rate": 0.000199180604326996, | |
| "loss": 0.8084, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.6905890318212593, | |
| "grad_norm": 1.3103822469711304, | |
| "learning_rate": 0.00019911910717637548, | |
| "loss": 0.8708, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.6973595125253893, | |
| "grad_norm": 0.8602836728096008, | |
| "learning_rate": 0.00019905539521999517, | |
| "loss": 0.8608, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.7041299932295193, | |
| "grad_norm": 0.7158609628677368, | |
| "learning_rate": 0.00019898946988149193, | |
| "loss": 0.9042, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.7109004739336493, | |
| "grad_norm": 0.6975676417350769, | |
| "learning_rate": 0.0001989213326339603, | |
| "loss": 0.8896, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.7176709546377793, | |
| "grad_norm": 0.7300527095794678, | |
| "learning_rate": 0.00019885098499991972, | |
| "loss": 0.8685, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.7244414353419093, | |
| "grad_norm": 0.6200681924819946, | |
| "learning_rate": 0.0001987784285512805, | |
| "loss": 0.8615, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.7312119160460393, | |
| "grad_norm": 0.7945191860198975, | |
| "learning_rate": 0.00019870366490930868, | |
| "loss": 0.8786, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.7379823967501693, | |
| "grad_norm": 0.6641054749488831, | |
| "learning_rate": 0.0001986266957445897, | |
| "loss": 0.8872, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.7447528774542993, | |
| "grad_norm": 0.7063596844673157, | |
| "learning_rate": 0.00019854752277699138, | |
| "loss": 0.8544, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.7515233581584293, | |
| "grad_norm": 0.6685433983802795, | |
| "learning_rate": 0.000198466147775625, | |
| "loss": 0.8256, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.7582938388625592, | |
| "grad_norm": 0.6927530765533447, | |
| "learning_rate": 0.00019838257255880626, | |
| "loss": 0.8642, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.7650643195666892, | |
| "grad_norm": 0.7018571496009827, | |
| "learning_rate": 0.00019829679899401436, | |
| "loss": 0.8624, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.7718348002708192, | |
| "grad_norm": 0.8826500773429871, | |
| "learning_rate": 0.00019820882899785038, | |
| "loss": 0.8312, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.7786052809749492, | |
| "grad_norm": 0.9699224233627319, | |
| "learning_rate": 0.00019811866453599435, | |
| "loss": 0.8467, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.7853757616790792, | |
| "grad_norm": 0.7322418689727783, | |
| "learning_rate": 0.00019802630762316145, | |
| "loss": 0.8456, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.7921462423832092, | |
| "grad_norm": 0.768301248550415, | |
| "learning_rate": 0.00019793176032305697, | |
| "loss": 0.8391, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.7989167230873392, | |
| "grad_norm": 0.8243605494499207, | |
| "learning_rate": 0.00019783502474833009, | |
| "loss": 0.904, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.8056872037914692, | |
| "grad_norm": 0.7215325236320496, | |
| "learning_rate": 0.00019773610306052683, | |
| "loss": 0.8494, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.8124576844955992, | |
| "grad_norm": 0.7619712948799133, | |
| "learning_rate": 0.00019763499747004165, | |
| "loss": 0.8865, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.8192281651997292, | |
| "grad_norm": 0.835599958896637, | |
| "learning_rate": 0.000197531710236068, | |
| "loss": 0.8733, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.8259986459038592, | |
| "grad_norm": 0.8382962942123413, | |
| "learning_rate": 0.00019742624366654802, | |
| "loss": 0.9122, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.8327691266079892, | |
| "grad_norm": 0.666801393032074, | |
| "learning_rate": 0.00019731860011812087, | |
| "loss": 0.8429, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.8395396073121192, | |
| "grad_norm": 0.7756575345993042, | |
| "learning_rate": 0.00019720878199606996, | |
| "loss": 0.9004, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.8463100880162492, | |
| "grad_norm": 0.7014258503913879, | |
| "learning_rate": 0.00019709679175426942, | |
| "loss": 0.9241, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.8530805687203792, | |
| "grad_norm": 0.6827540397644043, | |
| "learning_rate": 0.00019698263189512914, | |
| "loss": 0.8566, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.8598510494245092, | |
| "grad_norm": 0.9167826771736145, | |
| "learning_rate": 0.00019686630496953882, | |
| "loss": 0.9116, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.8666215301286392, | |
| "grad_norm": 0.8172047138214111, | |
| "learning_rate": 0.00019674781357681108, | |
| "loss": 0.8052, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.8733920108327691, | |
| "grad_norm": 0.7139961123466492, | |
| "learning_rate": 0.00019662716036462335, | |
| "loss": 0.89, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.8801624915368991, | |
| "grad_norm": 0.9733943939208984, | |
| "learning_rate": 0.0001965043480289586, | |
| "loss": 0.8191, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.8869329722410291, | |
| "grad_norm": 0.849946916103363, | |
| "learning_rate": 0.00019637937931404523, | |
| "loss": 0.8995, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.8937034529451591, | |
| "grad_norm": 0.6809601187705994, | |
| "learning_rate": 0.00019625225701229573, | |
| "loss": 0.8582, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.9004739336492891, | |
| "grad_norm": 0.7891602516174316, | |
| "learning_rate": 0.00019612298396424417, | |
| "loss": 0.844, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.9072444143534191, | |
| "grad_norm": 0.6357580423355103, | |
| "learning_rate": 0.0001959915630584829, | |
| "loss": 0.8609, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.9140148950575491, | |
| "grad_norm": 0.9102625846862793, | |
| "learning_rate": 0.00019585799723159788, | |
| "loss": 0.91, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.9207853757616791, | |
| "grad_norm": 0.690881609916687, | |
| "learning_rate": 0.0001957222894681031, | |
| "loss": 0.8287, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.9275558564658091, | |
| "grad_norm": 0.6755393743515015, | |
| "learning_rate": 0.00019558444280037393, | |
| "loss": 0.7931, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.9343263371699391, | |
| "grad_norm": 0.6997596025466919, | |
| "learning_rate": 0.00019544446030857922, | |
| "loss": 0.8941, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.9410968178740691, | |
| "grad_norm": 0.8115108013153076, | |
| "learning_rate": 0.0001953023451206127, | |
| "loss": 0.8674, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.9478672985781991, | |
| "grad_norm": 0.6413692235946655, | |
| "learning_rate": 0.00019515810041202295, | |
| "loss": 0.8462, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.9546377792823291, | |
| "grad_norm": 0.6888745427131653, | |
| "learning_rate": 0.00019501172940594242, | |
| "loss": 0.8594, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.9614082599864591, | |
| "grad_norm": 0.8250995874404907, | |
| "learning_rate": 0.00019486323537301538, | |
| "loss": 0.8622, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.9681787406905891, | |
| "grad_norm": 0.7127440571784973, | |
| "learning_rate": 0.00019471262163132504, | |
| "loss": 0.8626, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.9749492213947191, | |
| "grad_norm": 0.6688849925994873, | |
| "learning_rate": 0.0001945598915463192, | |
| "loss": 0.871, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.9817197020988491, | |
| "grad_norm": 0.8800045251846313, | |
| "learning_rate": 0.00019440504853073516, | |
| "loss": 0.8555, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.988490182802979, | |
| "grad_norm": 0.7973435521125793, | |
| "learning_rate": 0.00019424809604452338, | |
| "loss": 0.826, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.995260663507109, | |
| "grad_norm": 0.7803165316581726, | |
| "learning_rate": 0.00019408903759477025, | |
| "loss": 0.8657, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.002031144211239, | |
| "grad_norm": 0.9152759313583374, | |
| "learning_rate": 0.00019392787673561964, | |
| "loss": 0.8114, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.008801624915369, | |
| "grad_norm": 0.717939555644989, | |
| "learning_rate": 0.00019376461706819358, | |
| "loss": 0.7081, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.015572105619499, | |
| "grad_norm": 0.8752790093421936, | |
| "learning_rate": 0.00019359926224051178, | |
| "loss": 0.697, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.022342586323629, | |
| "grad_norm": 0.7938421368598938, | |
| "learning_rate": 0.00019343181594740996, | |
| "loss": 0.7743, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.029113067027759, | |
| "grad_norm": 0.8380940556526184, | |
| "learning_rate": 0.00019326228193045753, | |
| "loss": 0.7965, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.035883547731889, | |
| "grad_norm": 0.8056864142417908, | |
| "learning_rate": 0.00019309066397787378, | |
| "loss": 0.7399, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.042654028436019, | |
| "grad_norm": 0.9307854771614075, | |
| "learning_rate": 0.0001929169659244434, | |
| "loss": 0.7503, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.0494245091401488, | |
| "grad_norm": 0.8573846220970154, | |
| "learning_rate": 0.00019274119165143064, | |
| "loss": 0.7867, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.0561949898442788, | |
| "grad_norm": 0.7639918327331543, | |
| "learning_rate": 0.00019256334508649262, | |
| "loss": 0.7303, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.0629654705484088, | |
| "grad_norm": 0.7085719704627991, | |
| "learning_rate": 0.00019238343020359174, | |
| "loss": 0.7375, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.0697359512525388, | |
| "grad_norm": 0.8645661473274231, | |
| "learning_rate": 0.00019220145102290658, | |
| "loss": 0.7569, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.0765064319566688, | |
| "grad_norm": 0.8893268704414368, | |
| "learning_rate": 0.00019201741161074234, | |
| "loss": 0.7594, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.0832769126607988, | |
| "grad_norm": 0.9011455774307251, | |
| "learning_rate": 0.00019183131607943983, | |
| "loss": 0.7721, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.0900473933649288, | |
| "grad_norm": 0.812759518623352, | |
| "learning_rate": 0.00019164316858728364, | |
| "loss": 0.6816, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.0968178740690588, | |
| "grad_norm": 0.7881085276603699, | |
| "learning_rate": 0.00019145297333840916, | |
| "loss": 0.7927, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.1035883547731888, | |
| "grad_norm": 0.9383792281150818, | |
| "learning_rate": 0.00019126073458270874, | |
| "loss": 0.8416, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.1103588354773188, | |
| "grad_norm": 0.8487265110015869, | |
| "learning_rate": 0.00019106645661573667, | |
| "loss": 0.7731, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.1171293161814488, | |
| "grad_norm": 1.061084270477295, | |
| "learning_rate": 0.0001908701437786131, | |
| "loss": 0.7954, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.1238997968855788, | |
| "grad_norm": 0.7608863115310669, | |
| "learning_rate": 0.00019067180045792724, | |
| "loss": 0.7224, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.1306702775897088, | |
| "grad_norm": 1.0351011753082275, | |
| "learning_rate": 0.0001904714310856392, | |
| "loss": 0.7761, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.1374407582938388, | |
| "grad_norm": 0.8522539138793945, | |
| "learning_rate": 0.00019026904013898097, | |
| "loss": 0.7552, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.1442112389979688, | |
| "grad_norm": 0.9050424098968506, | |
| "learning_rate": 0.00019006463214035646, | |
| "loss": 0.7458, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.1509817197020988, | |
| "grad_norm": 1.0837703943252563, | |
| "learning_rate": 0.00018985821165724034, | |
| "loss": 0.7811, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.1577522004062288, | |
| "grad_norm": 0.7830744385719299, | |
| "learning_rate": 0.00018964978330207605, | |
| "loss": 0.7596, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.1645226811103588, | |
| "grad_norm": 0.8530306220054626, | |
| "learning_rate": 0.0001894393517321727, | |
| "loss": 0.7075, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.1712931618144888, | |
| "grad_norm": 0.9117756485939026, | |
| "learning_rate": 0.00018922692164960098, | |
| "loss": 0.7585, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.1780636425186188, | |
| "grad_norm": 0.9983711242675781, | |
| "learning_rate": 0.00018901249780108823, | |
| "loss": 0.7459, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.1848341232227488, | |
| "grad_norm": 0.9291015267372131, | |
| "learning_rate": 0.00018879608497791224, | |
| "loss": 0.7271, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.1916046039268788, | |
| "grad_norm": 1.0468007326126099, | |
| "learning_rate": 0.00018857768801579415, | |
| "loss": 0.7932, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.1983750846310088, | |
| "grad_norm": 0.8586043119430542, | |
| "learning_rate": 0.00018835731179479056, | |
| "loss": 0.8144, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.2051455653351388, | |
| "grad_norm": 0.7450950741767883, | |
| "learning_rate": 0.00018813496123918432, | |
| "loss": 0.7402, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.2119160460392688, | |
| "grad_norm": 0.9340034127235413, | |
| "learning_rate": 0.00018791064131737462, | |
| "loss": 0.7852, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.2186865267433988, | |
| "grad_norm": 0.9052138328552246, | |
| "learning_rate": 0.00018768435704176597, | |
| "loss": 0.7128, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.2254570074475288, | |
| "grad_norm": 0.8574148416519165, | |
| "learning_rate": 0.00018745611346865606, | |
| "loss": 0.7488, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.2322274881516588, | |
| "grad_norm": 1.0493452548980713, | |
| "learning_rate": 0.00018722591569812294, | |
| "loss": 0.8368, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.2389979688557888, | |
| "grad_norm": 1.019943356513977, | |
| "learning_rate": 0.00018699376887391093, | |
| "loss": 0.8279, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.2457684495599188, | |
| "grad_norm": 0.9113163352012634, | |
| "learning_rate": 0.0001867596781833158, | |
| "loss": 0.7308, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.2525389302640488, | |
| "grad_norm": 0.9192100763320923, | |
| "learning_rate": 0.0001865236488570688, | |
| "loss": 0.783, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.2593094109681786, | |
| "grad_norm": 0.8824251294136047, | |
| "learning_rate": 0.00018628568616921976, | |
| "loss": 0.7581, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.2660798916723088, | |
| "grad_norm": 0.8410795331001282, | |
| "learning_rate": 0.00018604579543701926, | |
| "loss": 0.7696, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.2728503723764386, | |
| "grad_norm": 1.0213907957077026, | |
| "learning_rate": 0.00018580398202079987, | |
| "loss": 0.7202, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.2796208530805688, | |
| "grad_norm": 0.7865493297576904, | |
| "learning_rate": 0.00018556025132385626, | |
| "loss": 0.7685, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.2863913337846986, | |
| "grad_norm": 0.9204791784286499, | |
| "learning_rate": 0.00018531460879232456, | |
| "loss": 0.7814, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.2931618144888288, | |
| "grad_norm": 0.810883104801178, | |
| "learning_rate": 0.00018506705991506067, | |
| "loss": 0.7202, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.2999322951929586, | |
| "grad_norm": 0.8419713973999023, | |
| "learning_rate": 0.00018481761022351757, | |
| "loss": 0.785, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.3067027758970888, | |
| "grad_norm": 0.8345950245857239, | |
| "learning_rate": 0.0001845662652916217, | |
| "loss": 0.7693, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.3134732566012186, | |
| "grad_norm": 0.8708229660987854, | |
| "learning_rate": 0.00018431303073564842, | |
| "loss": 0.8127, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.3202437373053486, | |
| "grad_norm": 0.800879716873169, | |
| "learning_rate": 0.0001840579122140966, | |
| "loss": 0.7804, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.3270142180094786, | |
| "grad_norm": 0.8764187097549438, | |
| "learning_rate": 0.00018380091542756212, | |
| "loss": 0.7563, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.3337846987136086, | |
| "grad_norm": 0.9371510744094849, | |
| "learning_rate": 0.00018354204611861042, | |
| "loss": 0.7382, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.3405551794177386, | |
| "grad_norm": 0.9174867868423462, | |
| "learning_rate": 0.00018328131007164827, | |
| "loss": 0.7543, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.3473256601218686, | |
| "grad_norm": 0.9580458998680115, | |
| "learning_rate": 0.00018301871311279455, | |
| "loss": 0.7877, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.3540961408259986, | |
| "grad_norm": 0.8264724016189575, | |
| "learning_rate": 0.00018275426110975, | |
| "loss": 0.7599, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.3540961408259986, | |
| "eval_loss": 0.8573334813117981, | |
| "eval_runtime": 23.1617, | |
| "eval_samples_per_second": 107.419, | |
| "eval_steps_per_second": 13.427, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.3608666215301286, | |
| "grad_norm": 0.8695821762084961, | |
| "learning_rate": 0.00018248795997166607, | |
| "loss": 0.772, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.3676371022342586, | |
| "grad_norm": 0.9564002752304077, | |
| "learning_rate": 0.000182219815649013, | |
| "loss": 0.8211, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.3744075829383886, | |
| "grad_norm": 0.951923668384552, | |
| "learning_rate": 0.00018194983413344674, | |
| "loss": 0.7549, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.3811780636425186, | |
| "grad_norm": 0.7695098519325256, | |
| "learning_rate": 0.00018167802145767513, | |
| "loss": 0.7133, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.3879485443466486, | |
| "grad_norm": 1.255873203277588, | |
| "learning_rate": 0.0001814043836953231, | |
| "loss": 0.7562, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.3947190250507786, | |
| "grad_norm": 0.8769702315330505, | |
| "learning_rate": 0.00018112892696079698, | |
| "loss": 0.7411, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.4014895057549086, | |
| "grad_norm": 0.9851005673408508, | |
| "learning_rate": 0.00018085165740914776, | |
| "loss": 0.7568, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.4082599864590386, | |
| "grad_norm": 0.8695229887962341, | |
| "learning_rate": 0.00018057258123593367, | |
| "loss": 0.7358, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.4150304671631686, | |
| "grad_norm": 0.9267136454582214, | |
| "learning_rate": 0.00018029170467708165, | |
| "loss": 0.7352, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.4218009478672986, | |
| "grad_norm": 0.8532856106758118, | |
| "learning_rate": 0.00018000903400874823, | |
| "loss": 0.8073, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.4285714285714286, | |
| "grad_norm": 0.8961872458457947, | |
| "learning_rate": 0.0001797245755471789, | |
| "loss": 0.7886, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.4353419092755586, | |
| "grad_norm": 0.8943607211112976, | |
| "learning_rate": 0.00017943833564856737, | |
| "loss": 0.7216, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.4421123899796886, | |
| "grad_norm": 0.824885904788971, | |
| "learning_rate": 0.00017915032070891327, | |
| "loss": 0.7077, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.4488828706838186, | |
| "grad_norm": 0.846660315990448, | |
| "learning_rate": 0.00017886053716387935, | |
| "loss": 0.7511, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.4556533513879486, | |
| "grad_norm": 0.8594396710395813, | |
| "learning_rate": 0.00017856899148864774, | |
| "loss": 0.7603, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.4624238320920786, | |
| "grad_norm": 0.8377899527549744, | |
| "learning_rate": 0.00017827569019777503, | |
| "loss": 0.7301, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.4691943127962086, | |
| "grad_norm": 1.0455125570297241, | |
| "learning_rate": 0.00017798063984504698, | |
| "loss": 0.7858, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.4759647935003386, | |
| "grad_norm": 0.9242769479751587, | |
| "learning_rate": 0.00017768384702333188, | |
| "loss": 0.8125, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.4827352742044684, | |
| "grad_norm": 0.9363239407539368, | |
| "learning_rate": 0.00017738531836443332, | |
| "loss": 0.7731, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.4895057549085986, | |
| "grad_norm": 0.8512465953826904, | |
| "learning_rate": 0.000177085060538942, | |
| "loss": 0.7407, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.4962762356127284, | |
| "grad_norm": 0.9729003310203552, | |
| "learning_rate": 0.00017678308025608665, | |
| "loss": 0.7751, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.5030467163168586, | |
| "grad_norm": 0.94197678565979, | |
| "learning_rate": 0.00017647938426358412, | |
| "loss": 0.7642, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.5098171970209884, | |
| "grad_norm": 0.9034068584442139, | |
| "learning_rate": 0.00017617397934748859, | |
| "loss": 0.8069, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.5165876777251186, | |
| "grad_norm": 0.9055565595626831, | |
| "learning_rate": 0.00017586687233204, | |
| "loss": 0.7463, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.5233581584292484, | |
| "grad_norm": 0.9645712971687317, | |
| "learning_rate": 0.00017555807007951142, | |
| "loss": 0.8157, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.5301286391333786, | |
| "grad_norm": 0.9376358389854431, | |
| "learning_rate": 0.00017524757949005597, | |
| "loss": 0.8012, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.5368991198375084, | |
| "grad_norm": 0.8372974991798401, | |
| "learning_rate": 0.00017493540750155236, | |
| "loss": 0.7429, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.5436696005416386, | |
| "grad_norm": 0.8159657120704651, | |
| "learning_rate": 0.00017462156108944996, | |
| "loss": 0.7619, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.5504400812457684, | |
| "grad_norm": 0.9110903143882751, | |
| "learning_rate": 0.00017430604726661304, | |
| "loss": 0.7792, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.5572105619498986, | |
| "grad_norm": 1.0363059043884277, | |
| "learning_rate": 0.00017398887308316393, | |
| "loss": 0.7875, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.5639810426540284, | |
| "grad_norm": 0.8779491186141968, | |
| "learning_rate": 0.00017367004562632556, | |
| "loss": 0.7395, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.5707515233581584, | |
| "grad_norm": 0.7635359168052673, | |
| "learning_rate": 0.00017334957202026305, | |
| "loss": 0.734, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.5775220040622884, | |
| "grad_norm": 0.7570300698280334, | |
| "learning_rate": 0.0001730274594259246, | |
| "loss": 0.732, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.5842924847664184, | |
| "grad_norm": 0.8852811455726624, | |
| "learning_rate": 0.0001727037150408813, | |
| "loss": 0.7176, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.5910629654705484, | |
| "grad_norm": 0.920385479927063, | |
| "learning_rate": 0.00017237834609916668, | |
| "loss": 0.7883, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.5978334461746784, | |
| "grad_norm": 0.7175299525260925, | |
| "learning_rate": 0.00017205135987111446, | |
| "loss": 0.7511, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.6046039268788084, | |
| "grad_norm": 0.9640962481498718, | |
| "learning_rate": 0.0001717227636631968, | |
| "loss": 0.7344, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 1.6113744075829384, | |
| "grad_norm": 1.0787372589111328, | |
| "learning_rate": 0.00017139256481786043, | |
| "loss": 0.7388, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.6181448882870684, | |
| "grad_norm": 0.8717492818832397, | |
| "learning_rate": 0.00017106077071336298, | |
| "loss": 0.8181, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 1.6249153689911984, | |
| "grad_norm": 0.9693078398704529, | |
| "learning_rate": 0.00017072738876360792, | |
| "loss": 0.7784, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.6316858496953284, | |
| "grad_norm": 0.9157988429069519, | |
| "learning_rate": 0.00017039242641797895, | |
| "loss": 0.7631, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 1.6384563303994584, | |
| "grad_norm": 0.856497585773468, | |
| "learning_rate": 0.0001700558911611736, | |
| "loss": 0.7572, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 1.6452268111035884, | |
| "grad_norm": 0.9910064339637756, | |
| "learning_rate": 0.0001697177905130358, | |
| "loss": 0.79, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 1.6519972918077184, | |
| "grad_norm": 0.9009943008422852, | |
| "learning_rate": 0.00016937813202838817, | |
| "loss": 0.7389, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 1.6587677725118484, | |
| "grad_norm": 0.8572137951850891, | |
| "learning_rate": 0.00016903692329686286, | |
| "loss": 0.8074, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.6655382532159784, | |
| "grad_norm": 0.9608494639396667, | |
| "learning_rate": 0.00016869417194273216, | |
| "loss": 0.7493, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 1.6723087339201084, | |
| "grad_norm": 1.1153324842453003, | |
| "learning_rate": 0.00016834988562473813, | |
| "loss": 0.7696, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 1.6790792146242384, | |
| "grad_norm": 0.8839768171310425, | |
| "learning_rate": 0.00016800407203592144, | |
| "loss": 0.6736, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 1.6858496953283684, | |
| "grad_norm": 0.8794620633125305, | |
| "learning_rate": 0.00016765673890344944, | |
| "loss": 0.7678, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 1.6926201760324981, | |
| "grad_norm": 1.167880892753601, | |
| "learning_rate": 0.0001673078939884435, | |
| "loss": 0.799, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.6993906567366284, | |
| "grad_norm": 0.8976329565048218, | |
| "learning_rate": 0.00016695754508580556, | |
| "loss": 0.7445, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 1.7061611374407581, | |
| "grad_norm": 0.8003941178321838, | |
| "learning_rate": 0.00016660570002404414, | |
| "loss": 0.7434, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 1.7129316181448884, | |
| "grad_norm": 1.5716880559921265, | |
| "learning_rate": 0.0001662523666650992, | |
| "loss": 0.7785, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 1.7197020988490181, | |
| "grad_norm": 0.7486565113067627, | |
| "learning_rate": 0.00016589755290416652, | |
| "loss": 0.7415, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 1.7264725795531484, | |
| "grad_norm": 0.872717559337616, | |
| "learning_rate": 0.0001655412666695213, | |
| "loss": 0.7568, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.7332430602572781, | |
| "grad_norm": 1.06588876247406, | |
| "learning_rate": 0.00016518351592234102, | |
| "loss": 0.714, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 1.7400135409614084, | |
| "grad_norm": 0.8603307008743286, | |
| "learning_rate": 0.00016482430865652758, | |
| "loss": 0.8015, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 1.7467840216655381, | |
| "grad_norm": 0.9161677956581116, | |
| "learning_rate": 0.0001644636528985286, | |
| "loss": 0.7517, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 1.7535545023696684, | |
| "grad_norm": 0.9165793657302856, | |
| "learning_rate": 0.00016410155670715807, | |
| "loss": 0.7219, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 1.7603249830737981, | |
| "grad_norm": 0.9347404837608337, | |
| "learning_rate": 0.00016373802817341631, | |
| "loss": 0.7544, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.7670954637779284, | |
| "grad_norm": 0.9771521687507629, | |
| "learning_rate": 0.00016337307542030924, | |
| "loss": 0.7613, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 1.7738659444820581, | |
| "grad_norm": 0.8616775870323181, | |
| "learning_rate": 0.00016300670660266678, | |
| "loss": 0.7028, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 1.7806364251861884, | |
| "grad_norm": 0.9634568095207214, | |
| "learning_rate": 0.0001626389299069606, | |
| "loss": 0.7776, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 1.7874069058903181, | |
| "grad_norm": 0.8600468635559082, | |
| "learning_rate": 0.00016226975355112134, | |
| "loss": 0.7127, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 1.7941773865944484, | |
| "grad_norm": 0.8130874037742615, | |
| "learning_rate": 0.00016189918578435482, | |
| "loss": 0.7618, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.8009478672985781, | |
| "grad_norm": 0.8722664713859558, | |
| "learning_rate": 0.00016152723488695783, | |
| "loss": 0.7364, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 1.8077183480027081, | |
| "grad_norm": 0.726963222026825, | |
| "learning_rate": 0.00016115390917013307, | |
| "loss": 0.7449, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 1.8144888287068381, | |
| "grad_norm": 0.9895104765892029, | |
| "learning_rate": 0.00016077921697580343, | |
| "loss": 0.7766, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 1.8212593094109681, | |
| "grad_norm": 0.9779828190803528, | |
| "learning_rate": 0.00016040316667642558, | |
| "loss": 0.7266, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 1.8280297901150981, | |
| "grad_norm": 1.04193913936615, | |
| "learning_rate": 0.00016002576667480288, | |
| "loss": 0.7344, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.8348002708192281, | |
| "grad_norm": 0.8899911046028137, | |
| "learning_rate": 0.00015964702540389767, | |
| "loss": 0.7546, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 1.8415707515233581, | |
| "grad_norm": 0.9403987526893616, | |
| "learning_rate": 0.0001592669513266428, | |
| "loss": 0.7482, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 1.8483412322274881, | |
| "grad_norm": 0.863129734992981, | |
| "learning_rate": 0.00015888555293575254, | |
| "loss": 0.7527, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 1.8551117129316181, | |
| "grad_norm": 1.1445564031600952, | |
| "learning_rate": 0.0001585028387535328, | |
| "loss": 0.7672, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 1.8618821936357481, | |
| "grad_norm": 0.8358940482139587, | |
| "learning_rate": 0.0001581188173316907, | |
| "loss": 0.7877, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.8686526743398781, | |
| "grad_norm": 1.0207701921463013, | |
| "learning_rate": 0.00015773349725114352, | |
| "loss": 0.7711, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 1.8754231550440081, | |
| "grad_norm": 0.9382310509681702, | |
| "learning_rate": 0.00015734688712182687, | |
| "loss": 0.7365, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 1.8821936357481381, | |
| "grad_norm": 0.7211757898330688, | |
| "learning_rate": 0.0001569589955825024, | |
| "loss": 0.7144, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 1.8889641164522681, | |
| "grad_norm": 1.0787826776504517, | |
| "learning_rate": 0.00015656983130056472, | |
| "loss": 0.7784, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 1.8957345971563981, | |
| "grad_norm": 1.0936686992645264, | |
| "learning_rate": 0.00015617940297184775, | |
| "loss": 0.7455, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.9025050778605281, | |
| "grad_norm": 1.0122491121292114, | |
| "learning_rate": 0.00015578771932043037, | |
| "loss": 0.7711, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 1.9092755585646581, | |
| "grad_norm": 0.9829614162445068, | |
| "learning_rate": 0.00015539478909844156, | |
| "loss": 0.7485, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 1.9160460392687881, | |
| "grad_norm": 0.9822033047676086, | |
| "learning_rate": 0.00015500062108586473, | |
| "loss": 0.7337, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 1.9228165199729181, | |
| "grad_norm": 0.8550043702125549, | |
| "learning_rate": 0.0001546052240903416, | |
| "loss": 0.7547, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 1.929587000677048, | |
| "grad_norm": 0.7504202723503113, | |
| "learning_rate": 0.0001542086069469754, | |
| "loss": 0.7329, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.9363574813811781, | |
| "grad_norm": 0.7536128759384155, | |
| "learning_rate": 0.00015381077851813342, | |
| "loss": 0.6917, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 1.943127962085308, | |
| "grad_norm": 1.024143934249878, | |
| "learning_rate": 0.000153411747693249, | |
| "loss": 0.7293, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 1.9498984427894381, | |
| "grad_norm": 0.8882274031639099, | |
| "learning_rate": 0.0001530115233886229, | |
| "loss": 0.7067, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 1.956668923493568, | |
| "grad_norm": 0.814894437789917, | |
| "learning_rate": 0.00015261011454722402, | |
| "loss": 0.6613, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 1.9634394041976981, | |
| "grad_norm": 0.8720422387123108, | |
| "learning_rate": 0.00015220753013848965, | |
| "loss": 0.7931, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.970209884901828, | |
| "grad_norm": 1.070326805114746, | |
| "learning_rate": 0.00015180377915812498, | |
| "loss": 0.6737, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 1.9769803656059581, | |
| "grad_norm": 0.9129419922828674, | |
| "learning_rate": 0.0001513988706279021, | |
| "loss": 0.7693, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 1.983750846310088, | |
| "grad_norm": 0.9133071303367615, | |
| "learning_rate": 0.00015099281359545844, | |
| "loss": 0.7222, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 1.9905213270142181, | |
| "grad_norm": 1.1360323429107666, | |
| "learning_rate": 0.00015058561713409465, | |
| "loss": 0.7813, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 1.997291807718348, | |
| "grad_norm": 1.1606559753417969, | |
| "learning_rate": 0.0001501772903425717, | |
| "loss": 0.7045, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.004062288422478, | |
| "grad_norm": 0.8940277099609375, | |
| "learning_rate": 0.0001497678423449077, | |
| "loss": 0.6686, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.010832769126608, | |
| "grad_norm": 0.9504866003990173, | |
| "learning_rate": 0.00014935728229017404, | |
| "loss": 0.5851, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.017603249830738, | |
| "grad_norm": 0.9662072062492371, | |
| "learning_rate": 0.00014894561935229083, | |
| "loss": 0.5836, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.024373730534868, | |
| "grad_norm": 1.1531829833984375, | |
| "learning_rate": 0.00014853286272982206, | |
| "loss": 0.5511, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.031144211238998, | |
| "grad_norm": 1.0693235397338867, | |
| "learning_rate": 0.00014811902164576986, | |
| "loss": 0.5325, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.031144211238998, | |
| "eval_loss": 0.8718012571334839, | |
| "eval_runtime": 23.0432, | |
| "eval_samples_per_second": 107.971, | |
| "eval_steps_per_second": 13.496, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.037914691943128, | |
| "grad_norm": 1.1329638957977295, | |
| "learning_rate": 0.0001477041053473687, | |
| "loss": 0.5722, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 2.044685172647258, | |
| "grad_norm": 1.1756556034088135, | |
| "learning_rate": 0.0001472881231058785, | |
| "loss": 0.57, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 2.051455653351388, | |
| "grad_norm": 1.1575700044631958, | |
| "learning_rate": 0.00014687108421637758, | |
| "loss": 0.5845, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 2.058226134055518, | |
| "grad_norm": 1.0859098434448242, | |
| "learning_rate": 0.0001464529979975549, | |
| "loss": 0.533, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 2.064996614759648, | |
| "grad_norm": 0.9851484298706055, | |
| "learning_rate": 0.00014603387379150197, | |
| "loss": 0.584, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.071767095463778, | |
| "grad_norm": 1.1865367889404297, | |
| "learning_rate": 0.00014561372096350402, | |
| "loss": 0.5536, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.078537576167908, | |
| "grad_norm": 1.114558219909668, | |
| "learning_rate": 0.00014519254890183058, | |
| "loss": 0.5627, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 2.085308056872038, | |
| "grad_norm": 1.0637989044189453, | |
| "learning_rate": 0.00014477036701752603, | |
| "loss": 0.5625, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 2.092078537576168, | |
| "grad_norm": 1.2044423818588257, | |
| "learning_rate": 0.00014434718474419896, | |
| "loss": 0.6045, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 2.0988490182802977, | |
| "grad_norm": 1.0656991004943848, | |
| "learning_rate": 0.00014392301153781168, | |
| "loss": 0.5458, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.105619498984428, | |
| "grad_norm": 1.431920051574707, | |
| "learning_rate": 0.00014349785687646879, | |
| "loss": 0.5798, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 2.1123899796885577, | |
| "grad_norm": 1.4664020538330078, | |
| "learning_rate": 0.00014307173026020524, | |
| "loss": 0.5566, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 2.119160460392688, | |
| "grad_norm": 0.9782803654670715, | |
| "learning_rate": 0.00014264464121077435, | |
| "loss": 0.5883, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 2.1259309410968177, | |
| "grad_norm": 1.2193199396133423, | |
| "learning_rate": 0.00014221659927143488, | |
| "loss": 0.5912, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 2.132701421800948, | |
| "grad_norm": 1.1089211702346802, | |
| "learning_rate": 0.00014178761400673778, | |
| "loss": 0.5421, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.1394719025050777, | |
| "grad_norm": 1.6899245977401733, | |
| "learning_rate": 0.00014135769500231259, | |
| "loss": 0.5477, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 2.146242383209208, | |
| "grad_norm": 1.1503666639328003, | |
| "learning_rate": 0.00014092685186465297, | |
| "loss": 0.5703, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 2.1530128639133377, | |
| "grad_norm": 1.1421773433685303, | |
| "learning_rate": 0.0001404950942209025, | |
| "loss": 0.6063, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 2.159783344617468, | |
| "grad_norm": 1.308514952659607, | |
| "learning_rate": 0.00014006243171863907, | |
| "loss": 0.6101, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 2.1665538253215977, | |
| "grad_norm": 1.108906626701355, | |
| "learning_rate": 0.00013962887402565967, | |
| "loss": 0.6067, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.173324306025728, | |
| "grad_norm": 1.3432538509368896, | |
| "learning_rate": 0.00013919443082976415, | |
| "loss": 0.5724, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 2.1800947867298577, | |
| "grad_norm": 1.2304880619049072, | |
| "learning_rate": 0.00013875911183853896, | |
| "loss": 0.5764, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 2.186865267433988, | |
| "grad_norm": 1.1720483303070068, | |
| "learning_rate": 0.0001383229267791399, | |
| "loss": 0.565, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 2.1936357481381177, | |
| "grad_norm": 0.9357210397720337, | |
| "learning_rate": 0.00013788588539807517, | |
| "loss": 0.525, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 2.200406228842248, | |
| "grad_norm": 1.2292680740356445, | |
| "learning_rate": 0.0001374479974609872, | |
| "loss": 0.6126, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 2.2071767095463777, | |
| "grad_norm": 1.0784507989883423, | |
| "learning_rate": 0.0001370092727524348, | |
| "loss": 0.5863, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 2.213947190250508, | |
| "grad_norm": 1.3088752031326294, | |
| "learning_rate": 0.00013656972107567423, | |
| "loss": 0.5568, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 2.2207176709546377, | |
| "grad_norm": 1.1142232418060303, | |
| "learning_rate": 0.0001361293522524403, | |
| "loss": 0.5777, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 2.227488151658768, | |
| "grad_norm": 1.1168012619018555, | |
| "learning_rate": 0.0001356881761227269, | |
| "loss": 0.549, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 2.2342586323628977, | |
| "grad_norm": 1.1179856061935425, | |
| "learning_rate": 0.00013524620254456705, | |
| "loss": 0.5828, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.241029113067028, | |
| "grad_norm": 1.1862361431121826, | |
| "learning_rate": 0.00013480344139381266, | |
| "loss": 0.5441, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 2.2477995937711577, | |
| "grad_norm": 1.2580469846725464, | |
| "learning_rate": 0.0001343599025639139, | |
| "loss": 0.6452, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 2.254570074475288, | |
| "grad_norm": 0.9721531271934509, | |
| "learning_rate": 0.00013391559596569815, | |
| "loss": 0.5803, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 2.2613405551794177, | |
| "grad_norm": 1.099107265472412, | |
| "learning_rate": 0.0001334705315271483, | |
| "loss": 0.5768, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 2.268111035883548, | |
| "grad_norm": 1.0356446504592896, | |
| "learning_rate": 0.00013302471919318141, | |
| "loss": 0.5759, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 2.2748815165876777, | |
| "grad_norm": 1.2317684888839722, | |
| "learning_rate": 0.00013257816892542582, | |
| "loss": 0.5797, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 2.281651997291808, | |
| "grad_norm": 1.2287174463272095, | |
| "learning_rate": 0.0001321308907019992, | |
| "loss": 0.5747, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 2.2884224779959377, | |
| "grad_norm": 1.2517625093460083, | |
| "learning_rate": 0.0001316828945172852, | |
| "loss": 0.5114, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 2.295192958700068, | |
| "grad_norm": 1.088796854019165, | |
| "learning_rate": 0.00013123419038171024, | |
| "loss": 0.5821, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 2.3019634394041977, | |
| "grad_norm": 1.0487096309661865, | |
| "learning_rate": 0.00013078478832151985, | |
| "loss": 0.6054, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.3087339201083275, | |
| "grad_norm": 1.1964969635009766, | |
| "learning_rate": 0.00013033469837855457, | |
| "loss": 0.5621, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 2.3155044008124577, | |
| "grad_norm": 1.2567753791809082, | |
| "learning_rate": 0.00012988393061002566, | |
| "loss": 0.5858, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 2.322274881516588, | |
| "grad_norm": 0.984793484210968, | |
| "learning_rate": 0.0001294324950882903, | |
| "loss": 0.5961, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 2.3290453622207177, | |
| "grad_norm": 1.2915070056915283, | |
| "learning_rate": 0.00012898040190062647, | |
| "loss": 0.5667, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 2.3358158429248475, | |
| "grad_norm": 1.242781400680542, | |
| "learning_rate": 0.00012852766114900777, | |
| "loss": 0.5781, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 2.3425863236289777, | |
| "grad_norm": 1.1402225494384766, | |
| "learning_rate": 0.00012807428294987744, | |
| "loss": 0.6048, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 2.349356804333108, | |
| "grad_norm": 1.2243235111236572, | |
| "learning_rate": 0.0001276202774339224, | |
| "loss": 0.5672, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 2.3561272850372377, | |
| "grad_norm": 1.2512565851211548, | |
| "learning_rate": 0.00012716565474584702, | |
| "loss": 0.5992, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 2.3628977657413675, | |
| "grad_norm": 1.3591067790985107, | |
| "learning_rate": 0.00012671042504414619, | |
| "loss": 0.5853, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 2.3696682464454977, | |
| "grad_norm": 1.7091628313064575, | |
| "learning_rate": 0.00012625459850087846, | |
| "loss": 0.5501, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.3764387271496275, | |
| "grad_norm": 1.2151107788085938, | |
| "learning_rate": 0.00012579818530143884, | |
| "loss": 0.5684, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 2.3832092078537577, | |
| "grad_norm": 1.4708514213562012, | |
| "learning_rate": 0.000125341195644331, | |
| "loss": 0.578, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 2.3899796885578874, | |
| "grad_norm": 1.2934261560440063, | |
| "learning_rate": 0.0001248836397409396, | |
| "loss": 0.6235, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 2.3967501692620177, | |
| "grad_norm": 1.9203015565872192, | |
| "learning_rate": 0.00012442552781530186, | |
| "loss": 0.5868, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 2.4035206499661474, | |
| "grad_norm": 1.2564107179641724, | |
| "learning_rate": 0.00012396687010387942, | |
| "loss": 0.6091, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 2.4102911306702777, | |
| "grad_norm": 1.3231315612792969, | |
| "learning_rate": 0.00012350767685532938, | |
| "loss": 0.5492, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 2.4170616113744074, | |
| "grad_norm": 1.392247200012207, | |
| "learning_rate": 0.00012304795833027534, | |
| "loss": 0.5809, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 2.4238320920785377, | |
| "grad_norm": 1.1600557565689087, | |
| "learning_rate": 0.00012258772480107816, | |
| "loss": 0.5638, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 2.4306025727826674, | |
| "grad_norm": 1.3254331350326538, | |
| "learning_rate": 0.00012212698655160637, | |
| "loss": 0.5644, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 2.4373730534867977, | |
| "grad_norm": 1.2660179138183594, | |
| "learning_rate": 0.00012166575387700651, | |
| "loss": 0.5852, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.4441435341909274, | |
| "grad_norm": 1.1489580869674683, | |
| "learning_rate": 0.00012120403708347298, | |
| "loss": 0.5753, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 2.4509140148950577, | |
| "grad_norm": 1.1386017799377441, | |
| "learning_rate": 0.00012074184648801769, | |
| "loss": 0.5446, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 2.4576844955991874, | |
| "grad_norm": 1.3722707033157349, | |
| "learning_rate": 0.00012027919241823964, | |
| "loss": 0.5771, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 2.4644549763033177, | |
| "grad_norm": 1.1902090311050415, | |
| "learning_rate": 0.00011981608521209413, | |
| "loss": 0.5774, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 2.4712254570074474, | |
| "grad_norm": 1.1676629781723022, | |
| "learning_rate": 0.00011935253521766174, | |
| "loss": 0.5718, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.4779959377115777, | |
| "grad_norm": 1.1004976034164429, | |
| "learning_rate": 0.00011888855279291713, | |
| "loss": 0.6151, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 2.4847664184157074, | |
| "grad_norm": 1.407827377319336, | |
| "learning_rate": 0.00011842414830549748, | |
| "loss": 0.6025, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 2.4915368991198377, | |
| "grad_norm": 1.26259183883667, | |
| "learning_rate": 0.00011795933213247101, | |
| "loss": 0.6008, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 2.4983073798239674, | |
| "grad_norm": 1.1961734294891357, | |
| "learning_rate": 0.000117494114660105, | |
| "loss": 0.5598, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 2.5050778605280977, | |
| "grad_norm": 0.9188928604125977, | |
| "learning_rate": 0.00011702850628363365, | |
| "loss": 0.5636, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.5118483412322274, | |
| "grad_norm": 0.9072563052177429, | |
| "learning_rate": 0.00011656251740702596, | |
| "loss": 0.5629, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 2.518618821936357, | |
| "grad_norm": 1.0292631387710571, | |
| "learning_rate": 0.00011609615844275305, | |
| "loss": 0.6066, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 2.5253893026404874, | |
| "grad_norm": 1.229181170463562, | |
| "learning_rate": 0.00011562943981155575, | |
| "loss": 0.5491, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 2.5321597833446177, | |
| "grad_norm": 1.1053756475448608, | |
| "learning_rate": 0.00011516237194221149, | |
| "loss": 0.6065, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 2.5389302640487474, | |
| "grad_norm": 1.4795639514923096, | |
| "learning_rate": 0.0001146949652713015, | |
| "loss": 0.5705, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 2.545700744752877, | |
| "grad_norm": 1.1489176750183105, | |
| "learning_rate": 0.00011422723024297737, | |
| "loss": 0.5364, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 2.5524712254570074, | |
| "grad_norm": 1.1073706150054932, | |
| "learning_rate": 0.00011375917730872787, | |
| "loss": 0.6014, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 2.5592417061611377, | |
| "grad_norm": 1.5487061738967896, | |
| "learning_rate": 0.00011329081692714534, | |
| "loss": 0.5477, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 2.5660121868652674, | |
| "grad_norm": 1.4128634929656982, | |
| "learning_rate": 0.00011282215956369204, | |
| "loss": 0.6538, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 2.572782667569397, | |
| "grad_norm": 1.2158820629119873, | |
| "learning_rate": 0.00011235321569046615, | |
| "loss": 0.594, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.5795531482735274, | |
| "grad_norm": 1.3014835119247437, | |
| "learning_rate": 0.00011188399578596795, | |
| "loss": 0.5936, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 2.5863236289776577, | |
| "grad_norm": 1.3620414733886719, | |
| "learning_rate": 0.00011141451033486564, | |
| "loss": 0.5633, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 2.5930941096817874, | |
| "grad_norm": 1.224446415901184, | |
| "learning_rate": 0.00011094476982776096, | |
| "loss": 0.553, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 2.599864590385917, | |
| "grad_norm": 1.3176541328430176, | |
| "learning_rate": 0.00011047478476095487, | |
| "loss": 0.5591, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 2.6066350710900474, | |
| "grad_norm": 1.1520602703094482, | |
| "learning_rate": 0.00011000456563621304, | |
| "loss": 0.5753, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 2.6134055517941777, | |
| "grad_norm": 1.2285906076431274, | |
| "learning_rate": 0.00010953412296053105, | |
| "loss": 0.6055, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 2.6201760324983074, | |
| "grad_norm": 1.544148564338684, | |
| "learning_rate": 0.00010906346724589975, | |
| "loss": 0.6062, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 2.626946513202437, | |
| "grad_norm": 1.2714669704437256, | |
| "learning_rate": 0.00010859260900907038, | |
| "loss": 0.5867, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 2.6337169939065674, | |
| "grad_norm": 1.4937471151351929, | |
| "learning_rate": 0.00010812155877131945, | |
| "loss": 0.5953, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 2.640487474610697, | |
| "grad_norm": 1.551594614982605, | |
| "learning_rate": 0.00010765032705821363, | |
| "loss": 0.5537, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.6472579553148274, | |
| "grad_norm": 1.565324068069458, | |
| "learning_rate": 0.0001071789243993748, | |
| "loss": 0.572, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 2.654028436018957, | |
| "grad_norm": 1.207514762878418, | |
| "learning_rate": 0.00010670736132824455, | |
| "loss": 0.5921, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 2.6607989167230874, | |
| "grad_norm": 1.1995245218276978, | |
| "learning_rate": 0.00010623564838184878, | |
| "loss": 0.5635, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 2.667569397427217, | |
| "grad_norm": 1.1889262199401855, | |
| "learning_rate": 0.00010576379610056249, | |
| "loss": 0.5886, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 2.6743398781313474, | |
| "grad_norm": 1.0783162117004395, | |
| "learning_rate": 0.0001052918150278739, | |
| "loss": 0.5831, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 2.681110358835477, | |
| "grad_norm": 1.4271385669708252, | |
| "learning_rate": 0.0001048197157101493, | |
| "loss": 0.5335, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 2.6878808395396074, | |
| "grad_norm": 1.167817234992981, | |
| "learning_rate": 0.00010434750869639693, | |
| "loss": 0.5331, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 2.694651320243737, | |
| "grad_norm": 1.3966023921966553, | |
| "learning_rate": 0.00010387520453803166, | |
| "loss": 0.5931, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 2.7014218009478674, | |
| "grad_norm": 1.328182578086853, | |
| "learning_rate": 0.00010340281378863892, | |
| "loss": 0.5472, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 2.708192281651997, | |
| "grad_norm": 1.3755980730056763, | |
| "learning_rate": 0.00010293034700373905, | |
| "loss": 0.5875, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.708192281651997, | |
| "eval_loss": 0.8555851578712463, | |
| "eval_runtime": 22.9559, | |
| "eval_samples_per_second": 108.382, | |
| "eval_steps_per_second": 13.548, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.7149627623561274, | |
| "grad_norm": 1.2442570924758911, | |
| "learning_rate": 0.0001024578147405514, | |
| "loss": 0.6028, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 2.721733243060257, | |
| "grad_norm": 1.2046414613723755, | |
| "learning_rate": 0.0001019852275577585, | |
| "loss": 0.5959, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 2.7285037237643874, | |
| "grad_norm": 1.1981314420700073, | |
| "learning_rate": 0.00010151259601526992, | |
| "loss": 0.6042, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 2.735274204468517, | |
| "grad_norm": 1.3695381879806519, | |
| "learning_rate": 0.00010103993067398649, | |
| "loss": 0.5943, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 2.7420446851726474, | |
| "grad_norm": 1.1446524858474731, | |
| "learning_rate": 0.00010056724209556431, | |
| "loss": 0.5853, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 2.748815165876777, | |
| "grad_norm": 1.2874009609222412, | |
| "learning_rate": 0.00010009454084217873, | |
| "loss": 0.5967, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 2.755585646580907, | |
| "grad_norm": 1.3916451930999756, | |
| "learning_rate": 9.962183747628819e-05, | |
| "loss": 0.5528, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 2.762356127285037, | |
| "grad_norm": 1.141298532485962, | |
| "learning_rate": 9.914914256039847e-05, | |
| "loss": 0.5641, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 2.7691266079891674, | |
| "grad_norm": 1.2546755075454712, | |
| "learning_rate": 9.867646665682646e-05, | |
| "loss": 0.5638, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 2.775897088693297, | |
| "grad_norm": 1.2840214967727661, | |
| "learning_rate": 9.820382032746426e-05, | |
| "loss": 0.5835, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.782667569397427, | |
| "grad_norm": 1.1560393571853638, | |
| "learning_rate": 9.773121413354311e-05, | |
| "loss": 0.5809, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 2.789438050101557, | |
| "grad_norm": 1.3474149703979492, | |
| "learning_rate": 9.725865863539747e-05, | |
| "loss": 0.5768, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 2.7962085308056874, | |
| "grad_norm": 1.1416068077087402, | |
| "learning_rate": 9.678616439222899e-05, | |
| "loss": 0.5758, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 2.802979011509817, | |
| "grad_norm": 1.192691445350647, | |
| "learning_rate": 9.631374196187051e-05, | |
| "loss": 0.547, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 2.809749492213947, | |
| "grad_norm": 1.2631511688232422, | |
| "learning_rate": 9.584140190055035e-05, | |
| "loss": 0.5315, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 2.816519972918077, | |
| "grad_norm": 1.3457276821136475, | |
| "learning_rate": 9.536915476265621e-05, | |
| "loss": 0.5824, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 2.8232904536222074, | |
| "grad_norm": 1.5314511060714722, | |
| "learning_rate": 9.489701110049944e-05, | |
| "loss": 0.6094, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 2.830060934326337, | |
| "grad_norm": 1.3376086950302124, | |
| "learning_rate": 9.442498146407927e-05, | |
| "loss": 0.5914, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 2.836831415030467, | |
| "grad_norm": 1.5918281078338623, | |
| "learning_rate": 9.3953076400847e-05, | |
| "loss": 0.5814, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 2.843601895734597, | |
| "grad_norm": 1.387515902519226, | |
| "learning_rate": 9.348130645547042e-05, | |
| "loss": 0.5663, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.850372376438727, | |
| "grad_norm": 1.612802267074585, | |
| "learning_rate": 9.300968216959805e-05, | |
| "loss": 0.5807, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 2.857142857142857, | |
| "grad_norm": 1.34074068069458, | |
| "learning_rate": 9.253821408162366e-05, | |
| "loss": 0.5868, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 2.863913337846987, | |
| "grad_norm": 1.436584234237671, | |
| "learning_rate": 9.206691272645087e-05, | |
| "loss": 0.5613, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 2.870683818551117, | |
| "grad_norm": 1.3354675769805908, | |
| "learning_rate": 9.159578863525762e-05, | |
| "loss": 0.6245, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 2.877454299255247, | |
| "grad_norm": 1.1248669624328613, | |
| "learning_rate": 9.11248523352609e-05, | |
| "loss": 0.547, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 2.884224779959377, | |
| "grad_norm": 1.1722201108932495, | |
| "learning_rate": 9.065411434948152e-05, | |
| "loss": 0.5432, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 2.890995260663507, | |
| "grad_norm": 1.2124953269958496, | |
| "learning_rate": 9.018358519650909e-05, | |
| "loss": 0.534, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 2.897765741367637, | |
| "grad_norm": 1.258863091468811, | |
| "learning_rate": 8.97132753902667e-05, | |
| "loss": 0.5651, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 2.904536222071767, | |
| "grad_norm": 1.2424662113189697, | |
| "learning_rate": 8.924319543977631e-05, | |
| "loss": 0.5611, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 2.911306702775897, | |
| "grad_norm": 1.2281653881072998, | |
| "learning_rate": 8.877335584892369e-05, | |
| "loss": 0.5584, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 2.918077183480027, | |
| "grad_norm": 1.1419377326965332, | |
| "learning_rate": 8.830376711622379e-05, | |
| "loss": 0.5939, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 2.924847664184157, | |
| "grad_norm": 1.0923197269439697, | |
| "learning_rate": 8.783443973458625e-05, | |
| "loss": 0.5912, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 2.931618144888287, | |
| "grad_norm": 1.0926480293273926, | |
| "learning_rate": 8.736538419108074e-05, | |
| "loss": 0.6095, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 2.938388625592417, | |
| "grad_norm": 1.4442996978759766, | |
| "learning_rate": 8.689661096670285e-05, | |
| "loss": 0.5618, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 2.945159106296547, | |
| "grad_norm": 1.2105728387832642, | |
| "learning_rate": 8.64281305361397e-05, | |
| "loss": 0.5388, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.951929587000677, | |
| "grad_norm": 1.2048066854476929, | |
| "learning_rate": 8.595995336753597e-05, | |
| "loss": 0.5891, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 2.958700067704807, | |
| "grad_norm": 1.407758355140686, | |
| "learning_rate": 8.549208992226001e-05, | |
| "loss": 0.5351, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 2.9654705484089368, | |
| "grad_norm": 1.075348973274231, | |
| "learning_rate": 8.502455065467006e-05, | |
| "loss": 0.5939, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 2.972241029113067, | |
| "grad_norm": 1.2892156839370728, | |
| "learning_rate": 8.45573460118806e-05, | |
| "loss": 0.5488, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 2.979011509817197, | |
| "grad_norm": 1.1205973625183105, | |
| "learning_rate": 8.4090486433529e-05, | |
| "loss": 0.6054, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.985781990521327, | |
| "grad_norm": 1.4507098197937012, | |
| "learning_rate": 8.362398235154213e-05, | |
| "loss": 0.5542, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 2.9925524712254568, | |
| "grad_norm": 1.2207527160644531, | |
| "learning_rate": 8.31578441899035e-05, | |
| "loss": 0.5326, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 2.999322951929587, | |
| "grad_norm": 1.032354712486267, | |
| "learning_rate": 8.269208236442003e-05, | |
| "loss": 0.5924, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 3.006093432633717, | |
| "grad_norm": 1.38179349899292, | |
| "learning_rate": 8.222670728248941e-05, | |
| "loss": 0.4272, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 3.012863913337847, | |
| "grad_norm": 1.3886513710021973, | |
| "learning_rate": 8.17617293428677e-05, | |
| "loss": 0.4442, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 3.019634394041977, | |
| "grad_norm": 1.5716043710708618, | |
| "learning_rate": 8.129715893543681e-05, | |
| "loss": 0.3873, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 3.026404874746107, | |
| "grad_norm": 1.4398396015167236, | |
| "learning_rate": 8.08330064409724e-05, | |
| "loss": 0.3991, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 3.0331753554502368, | |
| "grad_norm": 1.4795118570327759, | |
| "learning_rate": 8.036928223091187e-05, | |
| "loss": 0.4557, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 3.039945836154367, | |
| "grad_norm": 1.5591235160827637, | |
| "learning_rate": 7.990599666712268e-05, | |
| "loss": 0.4077, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 3.0467163168584968, | |
| "grad_norm": 1.3513033390045166, | |
| "learning_rate": 7.94431601016708e-05, | |
| "loss": 0.3999, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 3.053486797562627, | |
| "grad_norm": 1.4254108667373657, | |
| "learning_rate": 7.898078287658941e-05, | |
| "loss": 0.3614, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 3.0602572782667568, | |
| "grad_norm": 1.2728102207183838, | |
| "learning_rate": 7.85188753236477e-05, | |
| "loss": 0.4038, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 3.067027758970887, | |
| "grad_norm": 1.6714439392089844, | |
| "learning_rate": 7.805744776412012e-05, | |
| "loss": 0.4229, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 3.0737982396750168, | |
| "grad_norm": 1.4847053289413452, | |
| "learning_rate": 7.759651050855568e-05, | |
| "loss": 0.3806, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 3.080568720379147, | |
| "grad_norm": 1.7574979066848755, | |
| "learning_rate": 7.713607385654772e-05, | |
| "loss": 0.3625, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 3.0873392010832768, | |
| "grad_norm": 1.495059609413147, | |
| "learning_rate": 7.667614809650351e-05, | |
| "loss": 0.3889, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 3.094109681787407, | |
| "grad_norm": 1.2997581958770752, | |
| "learning_rate": 7.621674350541461e-05, | |
| "loss": 0.3775, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 3.1008801624915368, | |
| "grad_norm": 1.5862250328063965, | |
| "learning_rate": 7.575787034862704e-05, | |
| "loss": 0.4023, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 3.107650643195667, | |
| "grad_norm": 1.5325440168380737, | |
| "learning_rate": 7.529953887961197e-05, | |
| "loss": 0.3641, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 3.1144211238997968, | |
| "grad_norm": 1.4811371564865112, | |
| "learning_rate": 7.484175933973668e-05, | |
| "loss": 0.3818, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 3.121191604603927, | |
| "grad_norm": 1.7169820070266724, | |
| "learning_rate": 7.438454195803559e-05, | |
| "loss": 0.4187, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 3.1279620853080567, | |
| "grad_norm": 1.6318345069885254, | |
| "learning_rate": 7.392789695098182e-05, | |
| "loss": 0.3718, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 3.134732566012187, | |
| "grad_norm": 1.633092999458313, | |
| "learning_rate": 7.347183452225874e-05, | |
| "loss": 0.3969, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 3.1415030467163167, | |
| "grad_norm": 1.8210922479629517, | |
| "learning_rate": 7.301636486253215e-05, | |
| "loss": 0.4193, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 3.148273527420447, | |
| "grad_norm": 2.1533546447753906, | |
| "learning_rate": 7.256149814922253e-05, | |
| "loss": 0.3923, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 3.1550440081245767, | |
| "grad_norm": 1.4838796854019165, | |
| "learning_rate": 7.210724454627751e-05, | |
| "loss": 0.3871, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 3.161814488828707, | |
| "grad_norm": 1.755631685256958, | |
| "learning_rate": 7.165361420394482e-05, | |
| "loss": 0.4219, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 3.1685849695328367, | |
| "grad_norm": 1.197309136390686, | |
| "learning_rate": 7.120061725854554e-05, | |
| "loss": 0.4219, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 3.175355450236967, | |
| "grad_norm": 1.7161248922348022, | |
| "learning_rate": 7.074826383224761e-05, | |
| "loss": 0.4002, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 3.1821259309410967, | |
| "grad_norm": 1.4585338830947876, | |
| "learning_rate": 7.029656403283951e-05, | |
| "loss": 0.3984, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 3.188896411645227, | |
| "grad_norm": 1.5048658847808838, | |
| "learning_rate": 6.984552795350453e-05, | |
| "loss": 0.4005, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 3.1956668923493567, | |
| "grad_norm": 1.7454990148544312, | |
| "learning_rate": 6.939516567259523e-05, | |
| "loss": 0.3999, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 3.202437373053487, | |
| "grad_norm": 1.4264365434646606, | |
| "learning_rate": 6.894548725340822e-05, | |
| "loss": 0.3844, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 3.2092078537576167, | |
| "grad_norm": 1.3761653900146484, | |
| "learning_rate": 6.849650274395929e-05, | |
| "loss": 0.4107, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 3.215978334461747, | |
| "grad_norm": 1.6094237565994263, | |
| "learning_rate": 6.804822217675885e-05, | |
| "loss": 0.3865, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 3.2227488151658767, | |
| "grad_norm": 1.969099998474121, | |
| "learning_rate": 6.760065556858786e-05, | |
| "loss": 0.3635, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 3.229519295870007, | |
| "grad_norm": 1.5209436416625977, | |
| "learning_rate": 6.715381292027385e-05, | |
| "loss": 0.3754, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 3.2362897765741367, | |
| "grad_norm": 1.6469786167144775, | |
| "learning_rate": 6.670770421646767e-05, | |
| "loss": 0.4034, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 3.243060257278267, | |
| "grad_norm": 1.6617894172668457, | |
| "learning_rate": 6.626233942542013e-05, | |
| "loss": 0.3946, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 3.2498307379823967, | |
| "grad_norm": 1.4001210927963257, | |
| "learning_rate": 6.581772849875951e-05, | |
| "loss": 0.3638, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 3.2566012186865265, | |
| "grad_norm": 1.7633929252624512, | |
| "learning_rate": 6.537388137126899e-05, | |
| "loss": 0.3607, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 3.2633716993906567, | |
| "grad_norm": 1.6892105340957642, | |
| "learning_rate": 6.493080796066477e-05, | |
| "loss": 0.3797, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 3.270142180094787, | |
| "grad_norm": 1.4346562623977661, | |
| "learning_rate": 6.448851816737443e-05, | |
| "loss": 0.3552, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 3.2769126607989167, | |
| "grad_norm": 1.5974228382110596, | |
| "learning_rate": 6.404702187431568e-05, | |
| "loss": 0.3905, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 3.2836831415030465, | |
| "grad_norm": 1.4062926769256592, | |
| "learning_rate": 6.360632894667555e-05, | |
| "loss": 0.3864, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 3.2904536222071767, | |
| "grad_norm": 1.6129074096679688, | |
| "learning_rate": 6.316644923169007e-05, | |
| "loss": 0.3921, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 3.2972241029113065, | |
| "grad_norm": 1.5494030714035034, | |
| "learning_rate": 6.27273925584239e-05, | |
| "loss": 0.4138, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 3.3039945836154367, | |
| "grad_norm": 1.5944302082061768, | |
| "learning_rate": 6.228916873755118e-05, | |
| "loss": 0.3709, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 3.3107650643195665, | |
| "grad_norm": 1.4350250959396362, | |
| "learning_rate": 6.185178756113586e-05, | |
| "loss": 0.3622, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 3.3175355450236967, | |
| "grad_norm": 1.5585368871688843, | |
| "learning_rate": 6.141525880241313e-05, | |
| "loss": 0.3969, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 3.3243060257278265, | |
| "grad_norm": 1.289538860321045, | |
| "learning_rate": 6.097959221557108e-05, | |
| "loss": 0.394, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 3.3310765064319567, | |
| "grad_norm": 1.7543057203292847, | |
| "learning_rate": 6.054479753553259e-05, | |
| "loss": 0.396, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 3.3378469871360865, | |
| "grad_norm": 1.633093237876892, | |
| "learning_rate": 6.0110884477737875e-05, | |
| "loss": 0.415, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 3.3446174678402167, | |
| "grad_norm": 1.537914514541626, | |
| "learning_rate": 5.9677862737927415e-05, | |
| "loss": 0.399, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 3.3513879485443465, | |
| "grad_norm": 1.6341283321380615, | |
| "learning_rate": 5.924574199192527e-05, | |
| "loss": 0.3825, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 3.3581584292484767, | |
| "grad_norm": 1.4960927963256836, | |
| "learning_rate": 5.881453189542295e-05, | |
| "loss": 0.3793, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 3.3649289099526065, | |
| "grad_norm": 1.6509079933166504, | |
| "learning_rate": 5.838424208376354e-05, | |
| "loss": 0.3939, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 3.3716993906567367, | |
| "grad_norm": 1.662853479385376, | |
| "learning_rate": 5.7954882171726444e-05, | |
| "loss": 0.4141, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 3.3784698713608665, | |
| "grad_norm": 1.639427661895752, | |
| "learning_rate": 5.752646175331267e-05, | |
| "loss": 0.4112, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 3.3852403520649967, | |
| "grad_norm": 1.4693089723587036, | |
| "learning_rate": 5.709899040153013e-05, | |
| "loss": 0.372, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 3.3852403520649967, | |
| "eval_loss": 0.9812659621238708, | |
| "eval_runtime": 23.1744, | |
| "eval_samples_per_second": 107.36, | |
| "eval_steps_per_second": 13.42, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 3.3920108327691265, | |
| "grad_norm": 1.4617177248001099, | |
| "learning_rate": 5.667247766818018e-05, | |
| "loss": 0.385, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 3.3987813134732567, | |
| "grad_norm": 1.2667337656021118, | |
| "learning_rate": 5.6246933083643794e-05, | |
| "loss": 0.3759, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 3.4055517941773865, | |
| "grad_norm": 1.9020839929580688, | |
| "learning_rate": 5.582236615666885e-05, | |
| "loss": 0.3991, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 3.4123222748815167, | |
| "grad_norm": 1.4279497861862183, | |
| "learning_rate": 5.5398786374157564e-05, | |
| "loss": 0.3938, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 3.4190927555856465, | |
| "grad_norm": 1.5497093200683594, | |
| "learning_rate": 5.4976203200954425e-05, | |
| "loss": 0.4, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 3.4258632362897767, | |
| "grad_norm": 1.3598889112472534, | |
| "learning_rate": 5.4554626079634906e-05, | |
| "loss": 0.4117, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 3.4326337169939065, | |
| "grad_norm": 1.498186707496643, | |
| "learning_rate": 5.413406443029433e-05, | |
| "loss": 0.409, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 3.4394041976980367, | |
| "grad_norm": 1.9175001382827759, | |
| "learning_rate": 5.371452765033733e-05, | |
| "loss": 0.405, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 3.4461746784021665, | |
| "grad_norm": 1.9584026336669922, | |
| "learning_rate": 5.32960251142681e-05, | |
| "loss": 0.3635, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 3.4529451591062967, | |
| "grad_norm": 1.582276463508606, | |
| "learning_rate": 5.287856617348054e-05, | |
| "loss": 0.4101, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 3.4597156398104265, | |
| "grad_norm": 1.6922118663787842, | |
| "learning_rate": 5.2462160156049765e-05, | |
| "loss": 0.3894, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 3.4664861205145563, | |
| "grad_norm": 1.7980077266693115, | |
| "learning_rate": 5.2046816366523355e-05, | |
| "loss": 0.3909, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 3.4732566012186865, | |
| "grad_norm": 1.5998905897140503, | |
| "learning_rate": 5.1632544085713376e-05, | |
| "loss": 0.367, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 3.4800270819228167, | |
| "grad_norm": 1.5311387777328491, | |
| "learning_rate": 5.121935257048936e-05, | |
| "loss": 0.4053, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 3.4867975626269465, | |
| "grad_norm": 1.7611960172653198, | |
| "learning_rate": 5.080725105357109e-05, | |
| "loss": 0.3938, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 3.4935680433310763, | |
| "grad_norm": 2.3462700843811035, | |
| "learning_rate": 5.0396248743322526e-05, | |
| "loss": 0.3949, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 3.5003385240352065, | |
| "grad_norm": 1.386608362197876, | |
| "learning_rate": 4.998635482354598e-05, | |
| "loss": 0.3593, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 3.5071090047393367, | |
| "grad_norm": 2.024418592453003, | |
| "learning_rate": 4.9577578453276886e-05, | |
| "loss": 0.3835, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 3.5138794854434665, | |
| "grad_norm": 1.9304969310760498, | |
| "learning_rate": 4.9169928766579164e-05, | |
| "loss": 0.4439, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 3.5206499661475963, | |
| "grad_norm": 1.6261743307113647, | |
| "learning_rate": 4.876341487234105e-05, | |
| "loss": 0.4055, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 3.5274204468517265, | |
| "grad_norm": 1.770004153251648, | |
| "learning_rate": 4.83580458540717e-05, | |
| "loss": 0.401, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 3.5341909275558567, | |
| "grad_norm": 2.584394931793213, | |
| "learning_rate": 4.7953830769698125e-05, | |
| "loss": 0.3809, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 3.5409614082599865, | |
| "grad_norm": 1.66965651512146, | |
| "learning_rate": 4.755077865136274e-05, | |
| "loss": 0.4251, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 3.5477318889641163, | |
| "grad_norm": 1.5093834400177002, | |
| "learning_rate": 4.7148898505221685e-05, | |
| "loss": 0.3812, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 3.5545023696682465, | |
| "grad_norm": 1.7326291799545288, | |
| "learning_rate": 4.674819931124348e-05, | |
| "loss": 0.3606, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 3.5612728503723763, | |
| "grad_norm": 2.2934281826019287, | |
| "learning_rate": 4.63486900230084e-05, | |
| "loss": 0.4269, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 3.5680433310765065, | |
| "grad_norm": 1.787213683128357, | |
| "learning_rate": 4.595037956750845e-05, | |
| "loss": 0.4109, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 3.5748138117806363, | |
| "grad_norm": 1.5188498497009277, | |
| "learning_rate": 4.5553276844947726e-05, | |
| "loss": 0.4027, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 3.5815842924847665, | |
| "grad_norm": 1.5621033906936646, | |
| "learning_rate": 4.515739072854376e-05, | |
| "loss": 0.4377, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 3.5883547731888963, | |
| "grad_norm": 1.4404442310333252, | |
| "learning_rate": 4.4762730064329164e-05, | |
| "loss": 0.4058, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 3.5951252538930265, | |
| "grad_norm": 1.506831407546997, | |
| "learning_rate": 4.436930367095384e-05, | |
| "loss": 0.3852, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 3.6018957345971563, | |
| "grad_norm": 2.1018640995025635, | |
| "learning_rate": 4.3977120339488174e-05, | |
| "loss": 0.4128, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 3.6086662153012865, | |
| "grad_norm": 1.4768526554107666, | |
| "learning_rate": 4.358618883322639e-05, | |
| "loss": 0.3848, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 3.6154366960054163, | |
| "grad_norm": 1.3917316198349, | |
| "learning_rate": 4.319651788749084e-05, | |
| "loss": 0.4186, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 3.6222071767095465, | |
| "grad_norm": 1.9646469354629517, | |
| "learning_rate": 4.280811620943682e-05, | |
| "loss": 0.4213, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 3.6289776574136763, | |
| "grad_norm": 2.266582727432251, | |
| "learning_rate": 4.2420992477857856e-05, | |
| "loss": 0.4063, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 3.6357481381178065, | |
| "grad_norm": 1.8989133834838867, | |
| "learning_rate": 4.203515534299205e-05, | |
| "loss": 0.3786, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 3.6425186188219363, | |
| "grad_norm": 2.106405258178711, | |
| "learning_rate": 4.16506134263285e-05, | |
| "loss": 0.406, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 3.6492890995260665, | |
| "grad_norm": 2.1753334999084473, | |
| "learning_rate": 4.12673753204149e-05, | |
| "loss": 0.3845, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 3.6560595802301963, | |
| "grad_norm": 1.5723298788070679, | |
| "learning_rate": 4.0885449588665395e-05, | |
| "loss": 0.411, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 3.6628300609343265, | |
| "grad_norm": 2.0291285514831543, | |
| "learning_rate": 4.050484476516926e-05, | |
| "loss": 0.3926, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 3.6696005416384563, | |
| "grad_norm": 1.5461398363113403, | |
| "learning_rate": 4.012556935450027e-05, | |
| "loss": 0.4232, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 3.676371022342586, | |
| "grad_norm": 1.6446950435638428, | |
| "learning_rate": 3.97476318315265e-05, | |
| "loss": 0.3882, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 3.6831415030467163, | |
| "grad_norm": 1.363389015197754, | |
| "learning_rate": 3.937104064122117e-05, | |
| "loss": 0.3714, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 3.6899119837508465, | |
| "grad_norm": 1.4707744121551514, | |
| "learning_rate": 3.899580419847385e-05, | |
| "loss": 0.3633, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 3.6966824644549763, | |
| "grad_norm": 2.183893918991089, | |
| "learning_rate": 3.862193088790231e-05, | |
| "loss": 0.3918, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 3.703452945159106, | |
| "grad_norm": 1.798282504081726, | |
| "learning_rate": 3.82494290636654e-05, | |
| "loss": 0.4081, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 3.7102234258632363, | |
| "grad_norm": 1.563833475112915, | |
| "learning_rate": 3.7878307049276195e-05, | |
| "loss": 0.3772, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 3.7169939065673665, | |
| "grad_norm": 1.5234781503677368, | |
| "learning_rate": 3.7508573137416095e-05, | |
| "loss": 0.3923, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 3.7237643872714963, | |
| "grad_norm": 1.5436840057373047, | |
| "learning_rate": 3.71402355897495e-05, | |
| "loss": 0.4204, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 3.730534867975626, | |
| "grad_norm": 1.640419363975525, | |
| "learning_rate": 3.6773302636739116e-05, | |
| "loss": 0.391, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 3.7373053486797563, | |
| "grad_norm": 1.8847980499267578, | |
| "learning_rate": 3.640778247746226e-05, | |
| "loss": 0.3843, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 3.7440758293838865, | |
| "grad_norm": 1.2375092506408691, | |
| "learning_rate": 3.6043683279427484e-05, | |
| "loss": 0.3623, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 3.7508463100880163, | |
| "grad_norm": 1.3256595134735107, | |
| "learning_rate": 3.568101317839205e-05, | |
| "loss": 0.3923, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 3.757616790792146, | |
| "grad_norm": 1.5230741500854492, | |
| "learning_rate": 3.531978027818027e-05, | |
| "loss": 0.3918, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 3.7643872714962763, | |
| "grad_norm": 1.619551181793213, | |
| "learning_rate": 3.4959992650502346e-05, | |
| "loss": 0.4316, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 3.7711577522004065, | |
| "grad_norm": 2.241872787475586, | |
| "learning_rate": 3.4601658334774014e-05, | |
| "loss": 0.4183, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 3.7779282329045363, | |
| "grad_norm": 1.427147626876831, | |
| "learning_rate": 3.424478533793695e-05, | |
| "loss": 0.4036, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 3.784698713608666, | |
| "grad_norm": 1.646103024482727, | |
| "learning_rate": 3.388938163427969e-05, | |
| "loss": 0.3846, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 3.7914691943127963, | |
| "grad_norm": 1.4623626470565796, | |
| "learning_rate": 3.3535455165259734e-05, | |
| "loss": 0.4339, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 3.798239675016926, | |
| "grad_norm": 1.5822981595993042, | |
| "learning_rate": 3.318301383932586e-05, | |
| "loss": 0.4013, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 3.8050101557210563, | |
| "grad_norm": 1.6035799980163574, | |
| "learning_rate": 3.283206553174144e-05, | |
| "loss": 0.3765, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 3.811780636425186, | |
| "grad_norm": 1.4690262079238892, | |
| "learning_rate": 3.248261808440858e-05, | |
| "loss": 0.3846, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 3.8185511171293163, | |
| "grad_norm": 1.6690099239349365, | |
| "learning_rate": 3.213467930569279e-05, | |
| "loss": 0.3908, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 3.825321597833446, | |
| "grad_norm": 1.9128773212432861, | |
| "learning_rate": 3.178825697024859e-05, | |
| "loss": 0.4075, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 3.8320920785375763, | |
| "grad_norm": 1.5227471590042114, | |
| "learning_rate": 3.14433588188457e-05, | |
| "loss": 0.3949, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 3.838862559241706, | |
| "grad_norm": 1.8962739706039429, | |
| "learning_rate": 3.109999255819607e-05, | |
| "loss": 0.3708, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 3.8456330399458363, | |
| "grad_norm": 1.7166234254837036, | |
| "learning_rate": 3.075816586078182e-05, | |
| "loss": 0.3853, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 3.852403520649966, | |
| "grad_norm": 1.603034257888794, | |
| "learning_rate": 3.0417886364683578e-05, | |
| "loss": 0.3697, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 3.8591740013540963, | |
| "grad_norm": 1.2980273962020874, | |
| "learning_rate": 3.0079161673410006e-05, | |
| "loss": 0.3561, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 3.865944482058226, | |
| "grad_norm": 1.2596299648284912, | |
| "learning_rate": 2.974199935572781e-05, | |
| "loss": 0.3759, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 3.8727149627623563, | |
| "grad_norm": 1.6658598184585571, | |
| "learning_rate": 2.9406406945492616e-05, | |
| "loss": 0.3902, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 3.879485443466486, | |
| "grad_norm": 1.401743769645691, | |
| "learning_rate": 2.907239194148066e-05, | |
| "loss": 0.4045, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 3.8862559241706163, | |
| "grad_norm": 1.7074028253555298, | |
| "learning_rate": 2.8739961807221127e-05, | |
| "loss": 0.4103, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 3.893026404874746, | |
| "grad_norm": 1.6622352600097656, | |
| "learning_rate": 2.840912397082954e-05, | |
| "loss": 0.3718, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 3.8997968855788763, | |
| "grad_norm": 1.5955240726470947, | |
| "learning_rate": 2.807988582484171e-05, | |
| "loss": 0.3949, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 3.906567366283006, | |
| "grad_norm": 1.5108157396316528, | |
| "learning_rate": 2.7752254726048422e-05, | |
| "loss": 0.3665, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 3.913337846987136, | |
| "grad_norm": 1.4178344011306763, | |
| "learning_rate": 2.7426237995331296e-05, | |
| "loss": 0.3835, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 3.920108327691266, | |
| "grad_norm": 1.7224016189575195, | |
| "learning_rate": 2.7101842917498997e-05, | |
| "loss": 0.4008, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 3.9268788083953963, | |
| "grad_norm": 1.513185977935791, | |
| "learning_rate": 2.6779076741124576e-05, | |
| "loss": 0.4084, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 3.933649289099526, | |
| "grad_norm": 1.806357741355896, | |
| "learning_rate": 2.6457946678383448e-05, | |
| "loss": 0.382, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 3.940419769803656, | |
| "grad_norm": 1.5622941255569458, | |
| "learning_rate": 2.6138459904892177e-05, | |
| "loss": 0.3943, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 3.947190250507786, | |
| "grad_norm": 2.032970428466797, | |
| "learning_rate": 2.5820623559548285e-05, | |
| "loss": 0.3486, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 3.9539607312119163, | |
| "grad_norm": 1.7815639972686768, | |
| "learning_rate": 2.550444474437066e-05, | |
| "loss": 0.3772, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 3.960731211916046, | |
| "grad_norm": 1.6397390365600586, | |
| "learning_rate": 2.5189930524340767e-05, | |
| "loss": 0.3629, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 3.967501692620176, | |
| "grad_norm": 1.4618537425994873, | |
| "learning_rate": 2.487708792724497e-05, | |
| "loss": 0.4054, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 3.974272173324306, | |
| "grad_norm": 1.5044384002685547, | |
| "learning_rate": 2.4565923943517343e-05, | |
| "loss": 0.4003, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 3.9810426540284363, | |
| "grad_norm": 1.5843464136123657, | |
| "learning_rate": 2.425644552608356e-05, | |
| "loss": 0.3977, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 3.987813134732566, | |
| "grad_norm": 1.5150847434997559, | |
| "learning_rate": 2.3948659590205515e-05, | |
| "loss": 0.4088, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 3.994583615436696, | |
| "grad_norm": 1.9236164093017578, | |
| "learning_rate": 2.3642573013326663e-05, | |
| "loss": 0.4008, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 4.001354096140826, | |
| "grad_norm": 1.42927086353302, | |
| "learning_rate": 2.3338192634918643e-05, | |
| "loss": 0.3427, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 4.008124576844956, | |
| "grad_norm": 1.3550347089767456, | |
| "learning_rate": 2.3035525256328106e-05, | |
| "loss": 0.2699, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 4.014895057549086, | |
| "grad_norm": 1.546830177307129, | |
| "learning_rate": 2.2734577640625022e-05, | |
| "loss": 0.2694, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 4.021665538253216, | |
| "grad_norm": 1.7005549669265747, | |
| "learning_rate": 2.2435356512451387e-05, | |
| "loss": 0.2822, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 4.028436018957346, | |
| "grad_norm": 1.5947457551956177, | |
| "learning_rate": 2.2137868557871067e-05, | |
| "loss": 0.2965, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 4.035206499661476, | |
| "grad_norm": 1.600761890411377, | |
| "learning_rate": 2.1842120424220334e-05, | |
| "loss": 0.2551, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 4.041976980365606, | |
| "grad_norm": 1.5094797611236572, | |
| "learning_rate": 2.1548118719959286e-05, | |
| "loss": 0.2903, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 4.048747461069736, | |
| "grad_norm": 1.5594260692596436, | |
| "learning_rate": 2.1255870014524327e-05, | |
| "loss": 0.294, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 4.055517941773866, | |
| "grad_norm": 1.5365486145019531, | |
| "learning_rate": 2.096538083818128e-05, | |
| "loss": 0.2838, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 4.062288422477996, | |
| "grad_norm": 1.9512939453125, | |
| "learning_rate": 2.067665768187941e-05, | |
| "loss": 0.2649, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 4.062288422477996, | |
| "eval_loss": 1.1342198848724365, | |
| "eval_runtime": 22.903, | |
| "eval_samples_per_second": 108.632, | |
| "eval_steps_per_second": 13.579, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 4.069058903182126, | |
| "grad_norm": 1.703903079032898, | |
| "learning_rate": 2.0389706997106527e-05, | |
| "loss": 0.2606, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 4.075829383886256, | |
| "grad_norm": 1.8867642879486084, | |
| "learning_rate": 2.0104535195744746e-05, | |
| "loss": 0.2848, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 4.082599864590386, | |
| "grad_norm": 1.9352099895477295, | |
| "learning_rate": 1.9821148649927212e-05, | |
| "loss": 0.2724, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 4.089370345294516, | |
| "grad_norm": 1.7266086339950562, | |
| "learning_rate": 1.953955369189574e-05, | |
| "loss": 0.2745, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 4.096140825998646, | |
| "grad_norm": 1.5754889249801636, | |
| "learning_rate": 1.925975661385926e-05, | |
| "loss": 0.2737, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 4.102911306702776, | |
| "grad_norm": 1.6799631118774414, | |
| "learning_rate": 1.8981763667853326e-05, | |
| "loss": 0.2606, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 4.109681787406906, | |
| "grad_norm": 1.5695922374725342, | |
| "learning_rate": 1.870558106560035e-05, | |
| "loss": 0.2621, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 4.116452268111036, | |
| "grad_norm": 1.550424337387085, | |
| "learning_rate": 1.8431214978370758e-05, | |
| "loss": 0.2677, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 4.123222748815166, | |
| "grad_norm": 1.4905930757522583, | |
| "learning_rate": 1.8158671536845186e-05, | |
| "loss": 0.2562, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 4.129993229519296, | |
| "grad_norm": 1.688219666481018, | |
| "learning_rate": 1.788795683097746e-05, | |
| "loss": 0.2591, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 4.136763710223426, | |
| "grad_norm": 1.8246350288391113, | |
| "learning_rate": 1.761907690985847e-05, | |
| "loss": 0.2823, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 4.143534190927556, | |
| "grad_norm": 1.475894808769226, | |
| "learning_rate": 1.735203778158109e-05, | |
| "loss": 0.2672, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 4.150304671631686, | |
| "grad_norm": 2.1845951080322266, | |
| "learning_rate": 1.7086845413105778e-05, | |
| "loss": 0.2607, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 4.157075152335816, | |
| "grad_norm": 1.9802888631820679, | |
| "learning_rate": 1.6823505730127455e-05, | |
| "loss": 0.2653, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 4.163845633039946, | |
| "grad_norm": 1.2355766296386719, | |
| "learning_rate": 1.656202461694293e-05, | |
| "loss": 0.2787, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 4.170616113744076, | |
| "grad_norm": 1.6711342334747314, | |
| "learning_rate": 1.630240791631945e-05, | |
| "loss": 0.2996, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 4.177386594448206, | |
| "grad_norm": 1.8249988555908203, | |
| "learning_rate": 1.6044661429364205e-05, | |
| "loss": 0.2617, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 4.184157075152336, | |
| "grad_norm": 2.0309152603149414, | |
| "learning_rate": 1.5788790915394645e-05, | |
| "loss": 0.2627, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 4.190927555856466, | |
| "grad_norm": 1.7783539295196533, | |
| "learning_rate": 1.5534802091809818e-05, | |
| "loss": 0.2734, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 4.197698036560595, | |
| "grad_norm": 1.5822839736938477, | |
| "learning_rate": 1.528270063396262e-05, | |
| "loss": 0.2765, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 4.204468517264726, | |
| "grad_norm": 1.9683705568313599, | |
| "learning_rate": 1.5032492175032876e-05, | |
| "loss": 0.2665, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 4.211238997968856, | |
| "grad_norm": 1.4425179958343506, | |
| "learning_rate": 1.4784182305901672e-05, | |
| "loss": 0.2644, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 4.218009478672986, | |
| "grad_norm": 1.8725738525390625, | |
| "learning_rate": 1.4537776575026207e-05, | |
| "loss": 0.2611, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 4.224779959377115, | |
| "grad_norm": 1.767899990081787, | |
| "learning_rate": 1.4293280488315986e-05, | |
| "loss": 0.2851, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 4.231550440081246, | |
| "grad_norm": 1.2789946794509888, | |
| "learning_rate": 1.4050699509009679e-05, | |
| "loss": 0.2727, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 4.238320920785376, | |
| "grad_norm": 1.5606369972229004, | |
| "learning_rate": 1.3810039057553138e-05, | |
| "loss": 0.2704, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 4.245091401489506, | |
| "grad_norm": 1.5035715103149414, | |
| "learning_rate": 1.3571304511478188e-05, | |
| "loss": 0.2847, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 4.251861882193635, | |
| "grad_norm": 1.8756885528564453, | |
| "learning_rate": 1.333450120528249e-05, | |
| "loss": 0.2551, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 4.258632362897766, | |
| "grad_norm": 2.072859048843384, | |
| "learning_rate": 1.3099634430310403e-05, | |
| "loss": 0.249, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 4.265402843601896, | |
| "grad_norm": 1.6129212379455566, | |
| "learning_rate": 1.2866709434634684e-05, | |
| "loss": 0.2961, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 4.272173324306026, | |
| "grad_norm": 1.705417513847351, | |
| "learning_rate": 1.2635731422939212e-05, | |
| "loss": 0.2476, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 4.278943805010155, | |
| "grad_norm": 1.9114418029785156, | |
| "learning_rate": 1.2406705556402776e-05, | |
| "loss": 0.275, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 4.285714285714286, | |
| "grad_norm": 1.7978328466415405, | |
| "learning_rate": 1.217963695258364e-05, | |
| "loss": 0.2605, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 4.292484766418416, | |
| "grad_norm": 1.7482448816299438, | |
| "learning_rate": 1.1954530685305287e-05, | |
| "loss": 0.2696, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 4.299255247122546, | |
| "grad_norm": 2.014146566390991, | |
| "learning_rate": 1.1731391784543e-05, | |
| "loss": 0.2914, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 4.306025727826675, | |
| "grad_norm": 2.0617308616638184, | |
| "learning_rate": 1.15102252363114e-05, | |
| "loss": 0.262, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 4.312796208530806, | |
| "grad_norm": 1.9172184467315674, | |
| "learning_rate": 1.1291035982553189e-05, | |
| "loss": 0.2702, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 4.319566689234936, | |
| "grad_norm": 1.7097840309143066, | |
| "learning_rate": 1.1073828921028606e-05, | |
| "loss": 0.308, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 4.326337169939066, | |
| "grad_norm": 1.5703011751174927, | |
| "learning_rate": 1.085860890520598e-05, | |
| "loss": 0.2536, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 4.333107650643195, | |
| "grad_norm": 2.1221113204956055, | |
| "learning_rate": 1.0645380744153378e-05, | |
| "loss": 0.2713, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 4.339878131347326, | |
| "grad_norm": 1.5522172451019287, | |
| "learning_rate": 1.0434149202431054e-05, | |
| "loss": 0.259, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 4.346648612051456, | |
| "grad_norm": 1.7431870698928833, | |
| "learning_rate": 1.0224918999985044e-05, | |
| "loss": 0.2847, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 4.353419092755586, | |
| "grad_norm": 1.9679934978485107, | |
| "learning_rate": 1.0017694812041656e-05, | |
| "loss": 0.2621, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 4.360189573459715, | |
| "grad_norm": 2.4556872844696045, | |
| "learning_rate": 9.812481269002983e-06, | |
| "loss": 0.2803, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 4.366960054163846, | |
| "grad_norm": 1.530918836593628, | |
| "learning_rate": 9.609282956343557e-06, | |
| "loss": 0.2962, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 4.373730534867976, | |
| "grad_norm": 1.861484169960022, | |
| "learning_rate": 9.408104414507724e-06, | |
| "loss": 0.2917, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 4.380501015572106, | |
| "grad_norm": 2.1292312145233154, | |
| "learning_rate": 9.208950138808293e-06, | |
| "loss": 0.329, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 4.387271496276235, | |
| "grad_norm": 1.6679848432540894, | |
| "learning_rate": 9.011824579326144e-06, | |
| "loss": 0.2768, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 4.394041976980366, | |
| "grad_norm": 1.5731488466262817, | |
| "learning_rate": 8.81673214081058e-06, | |
| "loss": 0.2919, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 4.400812457684496, | |
| "grad_norm": 1.8150240182876587, | |
| "learning_rate": 8.623677182581135e-06, | |
| "loss": 0.2719, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 4.407582938388625, | |
| "grad_norm": 2.06569504737854, | |
| "learning_rate": 8.432664018430003e-06, | |
| "loss": 0.2803, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 4.414353419092755, | |
| "grad_norm": 1.6544770002365112, | |
| "learning_rate": 8.243696916525745e-06, | |
| "loss": 0.2508, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 4.421123899796886, | |
| "grad_norm": 1.6926827430725098, | |
| "learning_rate": 8.056780099317885e-06, | |
| "loss": 0.2979, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 4.427894380501016, | |
| "grad_norm": 1.7074532508850098, | |
| "learning_rate": 7.871917743442513e-06, | |
| "loss": 0.2901, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 4.434664861205146, | |
| "grad_norm": 2.1102843284606934, | |
| "learning_rate": 7.68911397962906e-06, | |
| "loss": 0.2615, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 4.441435341909275, | |
| "grad_norm": 1.4068889617919922, | |
| "learning_rate": 7.5083728926079065e-06, | |
| "loss": 0.2608, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 4.448205822613406, | |
| "grad_norm": 1.8090318441390991, | |
| "learning_rate": 7.329698521019157e-06, | |
| "loss": 0.2904, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 4.454976303317536, | |
| "grad_norm": 1.7596811056137085, | |
| "learning_rate": 7.153094857322374e-06, | |
| "loss": 0.2763, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 4.461746784021665, | |
| "grad_norm": 1.7713943719863892, | |
| "learning_rate": 6.978565847707352e-06, | |
| "loss": 0.2644, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 4.468517264725795, | |
| "grad_norm": 1.9358819723129272, | |
| "learning_rate": 6.806115392006007e-06, | |
| "loss": 0.2758, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 4.475287745429926, | |
| "grad_norm": 1.916235327720642, | |
| "learning_rate": 6.635747343605181e-06, | |
| "loss": 0.2952, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 4.482058226134056, | |
| "grad_norm": 1.6258528232574463, | |
| "learning_rate": 6.4674655093605155e-06, | |
| "loss": 0.272, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 4.488828706838185, | |
| "grad_norm": 1.8681087493896484, | |
| "learning_rate": 6.301273649511464e-06, | |
| "loss": 0.2638, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 4.495599187542315, | |
| "grad_norm": 1.644300103187561, | |
| "learning_rate": 6.137175477597213e-06, | |
| "loss": 0.271, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 4.502369668246446, | |
| "grad_norm": 1.8756589889526367, | |
| "learning_rate": 5.975174660373706e-06, | |
| "loss": 0.2682, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 4.509140148950576, | |
| "grad_norm": 1.5481034517288208, | |
| "learning_rate": 5.815274817731753e-06, | |
| "loss": 0.2926, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 4.515910629654705, | |
| "grad_norm": 1.8476117849349976, | |
| "learning_rate": 5.657479522616071e-06, | |
| "loss": 0.2716, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 4.522681110358835, | |
| "grad_norm": 1.7573695182800293, | |
| "learning_rate": 5.501792300945507e-06, | |
| "loss": 0.2812, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 4.529451591062966, | |
| "grad_norm": 1.7136588096618652, | |
| "learning_rate": 5.348216631534264e-06, | |
| "loss": 0.2416, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 4.536222071767096, | |
| "grad_norm": 1.662249207496643, | |
| "learning_rate": 5.196755946014065e-06, | |
| "loss": 0.2571, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 4.542992552471225, | |
| "grad_norm": 2.3519043922424316, | |
| "learning_rate": 5.047413628757658e-06, | |
| "loss": 0.2819, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 4.549763033175355, | |
| "grad_norm": 1.7724781036376953, | |
| "learning_rate": 4.900193016802956e-06, | |
| "loss": 0.2881, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 4.556533513879486, | |
| "grad_norm": 1.6066288948059082, | |
| "learning_rate": 4.755097399778707e-06, | |
| "loss": 0.2837, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 4.563303994583616, | |
| "grad_norm": 2.2322845458984375, | |
| "learning_rate": 4.612130019830774e-06, | |
| "loss": 0.2648, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 4.570074475287745, | |
| "grad_norm": 1.8880157470703125, | |
| "learning_rate": 4.471294071549869e-06, | |
| "loss": 0.2571, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 4.576844955991875, | |
| "grad_norm": 1.5234016180038452, | |
| "learning_rate": 4.332592701900085e-06, | |
| "loss": 0.2567, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 4.583615436696006, | |
| "grad_norm": 2.566943645477295, | |
| "learning_rate": 4.196029010148527e-06, | |
| "loss": 0.2462, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 4.590385917400136, | |
| "grad_norm": 2.2811155319213867, | |
| "learning_rate": 4.0616060477961845e-06, | |
| "loss": 0.2695, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 4.597156398104265, | |
| "grad_norm": 2.036428928375244, | |
| "learning_rate": 3.929326818509638e-06, | |
| "loss": 0.2816, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 4.603926878808395, | |
| "grad_norm": 1.9326859712600708, | |
| "learning_rate": 3.799194278054019e-06, | |
| "loss": 0.3004, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 4.610697359512526, | |
| "grad_norm": 2.0376124382019043, | |
| "learning_rate": 3.6712113342269095e-06, | |
| "loss": 0.3155, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 4.617467840216655, | |
| "grad_norm": 1.9327590465545654, | |
| "learning_rate": 3.5453808467933558e-06, | |
| "loss": 0.2598, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 4.624238320920785, | |
| "grad_norm": 1.5915392637252808, | |
| "learning_rate": 3.421705627422067e-06, | |
| "loss": 0.2893, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 4.631008801624915, | |
| "grad_norm": 1.4876010417938232, | |
| "learning_rate": 3.300188439622465e-06, | |
| "loss": 0.2702, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 4.637779282329046, | |
| "grad_norm": 1.8183128833770752, | |
| "learning_rate": 3.180831998682987e-06, | |
| "loss": 0.26, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 4.644549763033176, | |
| "grad_norm": 1.5423557758331299, | |
| "learning_rate": 3.0636389716104607e-06, | |
| "loss": 0.309, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 4.651320243737305, | |
| "grad_norm": 1.5031051635742188, | |
| "learning_rate": 2.9486119770704144e-06, | |
| "loss": 0.2541, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 4.658090724441435, | |
| "grad_norm": 1.648635745048523, | |
| "learning_rate": 2.83575358532866e-06, | |
| "loss": 0.3016, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 4.664861205145566, | |
| "grad_norm": 2.3799970149993896, | |
| "learning_rate": 2.7250663181937808e-06, | |
| "loss": 0.287, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 4.671631685849695, | |
| "grad_norm": 1.8683040142059326, | |
| "learning_rate": 2.6165526489608016e-06, | |
| "loss": 0.2414, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 4.678402166553825, | |
| "grad_norm": 1.5256311893463135, | |
| "learning_rate": 2.510215002355987e-06, | |
| "loss": 0.2605, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 4.685172647257955, | |
| "grad_norm": 1.87392258644104, | |
| "learning_rate": 2.4060557544825724e-06, | |
| "loss": 0.2536, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 4.691943127962086, | |
| "grad_norm": 1.480167031288147, | |
| "learning_rate": 2.3040772327676987e-06, | |
| "loss": 0.2773, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 4.698713608666216, | |
| "grad_norm": 1.5413248538970947, | |
| "learning_rate": 2.2042817159104614e-06, | |
| "loss": 0.2801, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 4.705484089370345, | |
| "grad_norm": 1.492633581161499, | |
| "learning_rate": 2.106671433830909e-06, | |
| "loss": 0.2343, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 4.712254570074475, | |
| "grad_norm": 1.4329499006271362, | |
| "learning_rate": 2.011248567620272e-06, | |
| "loss": 0.2628, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 4.719025050778606, | |
| "grad_norm": 1.9466246366500854, | |
| "learning_rate": 1.918015249492211e-06, | |
| "loss": 0.258, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 4.725795531482735, | |
| "grad_norm": 1.604708194732666, | |
| "learning_rate": 1.8269735627351459e-06, | |
| "loss": 0.2807, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 4.732566012186865, | |
| "grad_norm": 1.7957441806793213, | |
| "learning_rate": 1.7381255416657693e-06, | |
| "loss": 0.2476, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 4.739336492890995, | |
| "grad_norm": 1.6520119905471802, | |
| "learning_rate": 1.6514731715835064e-06, | |
| "loss": 0.2722, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 4.739336492890995, | |
| "eval_loss": 1.1487771272659302, | |
| "eval_runtime": 23.0937, | |
| "eval_samples_per_second": 107.735, | |
| "eval_steps_per_second": 13.467, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 4.746106973595126, | |
| "grad_norm": 1.8763707876205444, | |
| "learning_rate": 1.5670183887262268e-06, | |
| "loss": 0.253, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 4.752877454299255, | |
| "grad_norm": 2.0074474811553955, | |
| "learning_rate": 1.4847630802269695e-06, | |
| "loss": 0.2886, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 4.759647935003385, | |
| "grad_norm": 1.6623965501785278, | |
| "learning_rate": 1.4047090840716982e-06, | |
| "loss": 0.2645, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 4.766418415707515, | |
| "grad_norm": 2.1426522731781006, | |
| "learning_rate": 1.3268581890583553e-06, | |
| "loss": 0.2834, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 4.773188896411646, | |
| "grad_norm": 2.4106967449188232, | |
| "learning_rate": 1.251212134756763e-06, | |
| "loss": 0.2967, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 4.779959377115775, | |
| "grad_norm": 1.7238754034042358, | |
| "learning_rate": 1.1777726114698628e-06, | |
| "loss": 0.2819, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 4.786729857819905, | |
| "grad_norm": 1.9978512525558472, | |
| "learning_rate": 1.1065412601958813e-06, | |
| "loss": 0.2892, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 4.793500338524035, | |
| "grad_norm": 1.807606816291809, | |
| "learning_rate": 1.0375196725916693e-06, | |
| "loss": 0.2751, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 4.800270819228166, | |
| "grad_norm": 1.8417556285858154, | |
| "learning_rate": 9.707093909371745e-07, | |
| "loss": 0.277, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 4.807041299932295, | |
| "grad_norm": 1.6947407722473145, | |
| "learning_rate": 9.061119081009262e-07, | |
| "loss": 0.2717, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 4.813811780636425, | |
| "grad_norm": 2.100844621658325, | |
| "learning_rate": 8.437286675067046e-07, | |
| "loss": 0.2589, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 4.820582261340555, | |
| "grad_norm": 1.8315235376358032, | |
| "learning_rate": 7.835610631013123e-07, | |
| "loss": 0.2774, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 4.827352742044685, | |
| "grad_norm": 1.8022527694702148, | |
| "learning_rate": 7.256104393233654e-07, | |
| "loss": 0.2826, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 4.834123222748815, | |
| "grad_norm": 1.8034976720809937, | |
| "learning_rate": 6.698780910732949e-07, | |
| "loss": 0.287, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 4.840893703452945, | |
| "grad_norm": 2.1168487071990967, | |
| "learning_rate": 6.163652636844375e-07, | |
| "loss": 0.2601, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 4.847664184157075, | |
| "grad_norm": 1.7831007242202759, | |
| "learning_rate": 5.650731528951237e-07, | |
| "loss": 0.2671, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 4.854434664861206, | |
| "grad_norm": 1.85152268409729, | |
| "learning_rate": 5.160029048220438e-07, | |
| "loss": 0.2877, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 4.861205145565335, | |
| "grad_norm": 1.629766583442688, | |
| "learning_rate": 4.691556159346133e-07, | |
| "loss": 0.3145, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 4.867975626269465, | |
| "grad_norm": 2.025866746902466, | |
| "learning_rate": 4.2453233303043627e-07, | |
| "loss": 0.2634, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 4.874746106973595, | |
| "grad_norm": 1.8864160776138306, | |
| "learning_rate": 3.8213405321195775e-07, | |
| "loss": 0.257, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 4.881516587677725, | |
| "grad_norm": 1.6541404724121094, | |
| "learning_rate": 3.4196172386417036e-07, | |
| "loss": 0.2942, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 4.888287068381855, | |
| "grad_norm": 1.627166509628296, | |
| "learning_rate": 3.0401624263344254e-07, | |
| "loss": 0.2984, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 4.895057549085985, | |
| "grad_norm": 2.0203287601470947, | |
| "learning_rate": 2.682984574074565e-07, | |
| "loss": 0.2775, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 4.901828029790115, | |
| "grad_norm": 1.4823179244995117, | |
| "learning_rate": 2.3480916629626816e-07, | |
| "loss": 0.2303, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 4.908598510494246, | |
| "grad_norm": 1.6466970443725586, | |
| "learning_rate": 2.035491176144766e-07, | |
| "loss": 0.2561, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 4.915368991198375, | |
| "grad_norm": 1.857335090637207, | |
| "learning_rate": 1.7451900986450441e-07, | |
| "loss": 0.2478, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 4.922139471902505, | |
| "grad_norm": 1.615402102470398, | |
| "learning_rate": 1.4771949172097677e-07, | |
| "loss": 0.2644, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 4.928909952606635, | |
| "grad_norm": 1.6097745895385742, | |
| "learning_rate": 1.2315116201623288e-07, | |
| "loss": 0.2687, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 4.935680433310765, | |
| "grad_norm": 1.6500680446624756, | |
| "learning_rate": 1.0081456972694803e-07, | |
| "loss": 0.2782, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 4.942450914014895, | |
| "grad_norm": 1.5854169130325317, | |
| "learning_rate": 8.07102139618765e-08, | |
| "loss": 0.2503, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 4.949221394719025, | |
| "grad_norm": 1.917787790298462, | |
| "learning_rate": 6.283854395067179e-08, | |
| "loss": 0.2688, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 4.955991875423155, | |
| "grad_norm": 1.3667759895324707, | |
| "learning_rate": 4.719995903387231e-08, | |
| "loss": 0.2713, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 4.962762356127285, | |
| "grad_norm": 1.4660590887069702, | |
| "learning_rate": 3.379480865397522e-08, | |
| "loss": 0.2492, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 4.969532836831415, | |
| "grad_norm": 1.909756064414978, | |
| "learning_rate": 2.2623392347620455e-08, | |
| "loss": 0.2528, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 4.976303317535545, | |
| "grad_norm": 1.9919097423553467, | |
| "learning_rate": 1.3685959738907184e-08, | |
| "loss": 0.2797, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 4.983073798239675, | |
| "grad_norm": 1.7295809984207153, | |
| "learning_rate": 6.982710533787185e-09, | |
| "loss": 0.2527, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 4.989844278943805, | |
| "grad_norm": 1.575947642326355, | |
| "learning_rate": 2.5137945156461507e-09, | |
| "loss": 0.3057, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 4.996614759647935, | |
| "grad_norm": 1.8067814111709595, | |
| "learning_rate": 2.7931154193971964e-10, | |
| "loss": 0.2525, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "step": 7385, | |
| "total_flos": 1.6593737353978184e+18, | |
| "train_loss": 0.5936373706919645, | |
| "train_runtime": 5834.3806, | |
| "train_samples_per_second": 40.5, | |
| "train_steps_per_second": 1.266 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 7385, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.6593737353978184e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |