{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 6695, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014936798670624918, "grad_norm": 184.9497833251953, "learning_rate": 1.3432835820895523e-07, "loss": 4.6248, "step": 10 }, { "epoch": 0.0029873597341249837, "grad_norm": 123.1812744140625, "learning_rate": 2.8358208955223886e-07, "loss": 4.6243, "step": 20 }, { "epoch": 0.004481039601187476, "grad_norm": 195.16688537597656, "learning_rate": 4.3283582089552244e-07, "loss": 4.2232, "step": 30 }, { "epoch": 0.005974719468249967, "grad_norm": 90.20713806152344, "learning_rate": 5.82089552238806e-07, "loss": 3.2697, "step": 40 }, { "epoch": 0.007468399335312459, "grad_norm": 38.96065139770508, "learning_rate": 7.313432835820897e-07, "loss": 2.3537, "step": 50 }, { "epoch": 0.008962079202374951, "grad_norm": 18.04777717590332, "learning_rate": 8.805970149253732e-07, "loss": 1.4564, "step": 60 }, { "epoch": 0.010455759069437442, "grad_norm": 14.39755630493164, "learning_rate": 1.0298507462686568e-06, "loss": 1.1459, "step": 70 }, { "epoch": 0.011949438936499935, "grad_norm": 14.463386535644531, "learning_rate": 1.1791044776119403e-06, "loss": 0.9877, "step": 80 }, { "epoch": 0.013443118803562427, "grad_norm": 15.058645248413086, "learning_rate": 1.3283582089552241e-06, "loss": 0.8928, "step": 90 }, { "epoch": 0.014936798670624918, "grad_norm": 14.862004280090332, "learning_rate": 1.4776119402985075e-06, "loss": 0.7928, "step": 100 }, { "epoch": 0.01643047853768741, "grad_norm": 14.284353256225586, "learning_rate": 1.626865671641791e-06, "loss": 0.7325, "step": 110 }, { "epoch": 0.017924158404749903, "grad_norm": 13.038077354431152, "learning_rate": 1.7761194029850749e-06, "loss": 0.6793, "step": 120 }, { "epoch": 0.019417838271812395, "grad_norm": 12.484222412109375, "learning_rate": 1.9253731343283582e-06, "loss": 0.6044, "step": 130 }, { "epoch": 0.020911518138874884, "grad_norm": 12.41592788696289, "learning_rate": 2.074626865671642e-06, "loss": 0.5792, "step": 140 }, { "epoch": 0.022405198005937377, "grad_norm": 10.647322654724121, "learning_rate": 2.2238805970149254e-06, "loss": 0.5225, "step": 150 }, { "epoch": 0.02389887787299987, "grad_norm": 10.59830093383789, "learning_rate": 2.373134328358209e-06, "loss": 0.4818, "step": 160 }, { "epoch": 0.02539255774006236, "grad_norm": 8.66019058227539, "learning_rate": 2.5223880597014925e-06, "loss": 0.4556, "step": 170 }, { "epoch": 0.026886237607124854, "grad_norm": 7.149711608886719, "learning_rate": 2.6716417910447763e-06, "loss": 0.3993, "step": 180 }, { "epoch": 0.028379917474187343, "grad_norm": 5.81325101852417, "learning_rate": 2.82089552238806e-06, "loss": 0.4024, "step": 190 }, { "epoch": 0.029873597341249836, "grad_norm": 5.245195388793945, "learning_rate": 2.9701492537313435e-06, "loss": 0.4011, "step": 200 }, { "epoch": 0.03136727720831233, "grad_norm": 5.920019149780273, "learning_rate": 3.1194029850746273e-06, "loss": 0.3821, "step": 210 }, { "epoch": 0.03286095707537482, "grad_norm": 4.425774097442627, "learning_rate": 3.2686567164179106e-06, "loss": 0.3863, "step": 220 }, { "epoch": 0.03435463694243731, "grad_norm": 2.886918067932129, "learning_rate": 3.417910447761194e-06, "loss": 0.3741, "step": 230 }, { "epoch": 0.035848316809499806, "grad_norm": 6.619475841522217, "learning_rate": 3.5671641791044782e-06, "loss": 0.3674, "step": 240 }, { "epoch": 0.037341996676562295, "grad_norm": 5.154594898223877, "learning_rate": 3.7164179104477616e-06, "loss": 0.3615, "step": 250 }, { "epoch": 0.03883567654362479, "grad_norm": 4.2160820960998535, "learning_rate": 3.865671641791045e-06, "loss": 0.3373, "step": 260 }, { "epoch": 0.04032935641068728, "grad_norm": 2.998599052429199, "learning_rate": 4.014925373134328e-06, "loss": 0.3669, "step": 270 }, { "epoch": 0.04182303627774977, "grad_norm": 4.233543395996094, "learning_rate": 4.1641791044776125e-06, "loss": 0.367, "step": 280 }, { "epoch": 0.043316716144812265, "grad_norm": 4.081936359405518, "learning_rate": 4.313432835820896e-06, "loss": 0.3581, "step": 290 }, { "epoch": 0.044810396011874754, "grad_norm": 3.0351035594940186, "learning_rate": 4.462686567164179e-06, "loss": 0.3378, "step": 300 }, { "epoch": 0.04630407587893725, "grad_norm": 5.17939567565918, "learning_rate": 4.611940298507463e-06, "loss": 0.3771, "step": 310 }, { "epoch": 0.04779775574599974, "grad_norm": 2.004800319671631, "learning_rate": 4.761194029850746e-06, "loss": 0.334, "step": 320 }, { "epoch": 0.04929143561306223, "grad_norm": 4.026329040527344, "learning_rate": 4.91044776119403e-06, "loss": 0.3373, "step": 330 }, { "epoch": 0.05078511548012472, "grad_norm": 3.8416078090667725, "learning_rate": 5.059701492537314e-06, "loss": 0.3616, "step": 340 }, { "epoch": 0.05227879534718721, "grad_norm": 2.231009006500244, "learning_rate": 5.208955223880598e-06, "loss": 0.3196, "step": 350 }, { "epoch": 0.05377247521424971, "grad_norm": 3.798912286758423, "learning_rate": 5.358208955223881e-06, "loss": 0.3638, "step": 360 }, { "epoch": 0.0552661550813122, "grad_norm": 2.688854932785034, "learning_rate": 5.5074626865671645e-06, "loss": 0.3411, "step": 370 }, { "epoch": 0.056759834948374686, "grad_norm": 3.7967114448547363, "learning_rate": 5.656716417910449e-06, "loss": 0.3717, "step": 380 }, { "epoch": 0.05825351481543718, "grad_norm": 2.219935655593872, "learning_rate": 5.805970149253732e-06, "loss": 0.33, "step": 390 }, { "epoch": 0.05974719468249967, "grad_norm": 2.561140775680542, "learning_rate": 5.9552238805970155e-06, "loss": 0.3509, "step": 400 }, { "epoch": 0.06124087454956217, "grad_norm": 1.8147597312927246, "learning_rate": 6.1044776119403e-06, "loss": 0.3371, "step": 410 }, { "epoch": 0.06273455441662466, "grad_norm": 2.9185450077056885, "learning_rate": 6.253731343283582e-06, "loss": 0.3351, "step": 420 }, { "epoch": 0.06422823428368715, "grad_norm": 1.7370891571044922, "learning_rate": 6.4029850746268664e-06, "loss": 0.3465, "step": 430 }, { "epoch": 0.06572191415074964, "grad_norm": 1.9567862749099731, "learning_rate": 6.552238805970151e-06, "loss": 0.3617, "step": 440 }, { "epoch": 0.06721559401781213, "grad_norm": 3.0918054580688477, "learning_rate": 6.701492537313433e-06, "loss": 0.344, "step": 450 }, { "epoch": 0.06870927388487462, "grad_norm": 2.9340105056762695, "learning_rate": 6.850746268656717e-06, "loss": 0.3464, "step": 460 }, { "epoch": 0.07020295375193712, "grad_norm": 2.3813934326171875, "learning_rate": 7e-06, "loss": 0.3233, "step": 470 }, { "epoch": 0.07169663361899961, "grad_norm": 1.6313637495040894, "learning_rate": 7.149253731343284e-06, "loss": 0.3377, "step": 480 }, { "epoch": 0.0731903134860621, "grad_norm": 2.1293253898620605, "learning_rate": 7.298507462686568e-06, "loss": 0.3595, "step": 490 }, { "epoch": 0.07468399335312459, "grad_norm": 2.9458706378936768, "learning_rate": 7.447761194029851e-06, "loss": 0.3289, "step": 500 }, { "epoch": 0.07468399335312459, "eval_loss": 0.3393873870372772, "eval_runtime": 77.3397, "eval_samples_per_second": 6.995, "eval_steps_per_second": 3.504, "step": 500 }, { "epoch": 0.07617767322018708, "grad_norm": 3.799473762512207, "learning_rate": 7.597014925373135e-06, "loss": 0.3188, "step": 510 }, { "epoch": 0.07767135308724958, "grad_norm": 2.0290870666503906, "learning_rate": 7.746268656716418e-06, "loss": 0.3592, "step": 520 }, { "epoch": 0.07916503295431207, "grad_norm": 3.1726157665252686, "learning_rate": 7.895522388059703e-06, "loss": 0.3284, "step": 530 }, { "epoch": 0.08065871282137456, "grad_norm": 2.264389991760254, "learning_rate": 8.044776119402986e-06, "loss": 0.3631, "step": 540 }, { "epoch": 0.08215239268843705, "grad_norm": 2.3615527153015137, "learning_rate": 8.19402985074627e-06, "loss": 0.3333, "step": 550 }, { "epoch": 0.08364607255549954, "grad_norm": 2.3802566528320312, "learning_rate": 8.343283582089553e-06, "loss": 0.3482, "step": 560 }, { "epoch": 0.08513975242256204, "grad_norm": 2.536975145339966, "learning_rate": 8.492537313432838e-06, "loss": 0.3316, "step": 570 }, { "epoch": 0.08663343228962453, "grad_norm": 2.01639723777771, "learning_rate": 8.64179104477612e-06, "loss": 0.355, "step": 580 }, { "epoch": 0.08812711215668702, "grad_norm": 2.158482313156128, "learning_rate": 8.791044776119405e-06, "loss": 0.3637, "step": 590 }, { "epoch": 0.08962079202374951, "grad_norm": 3.023801326751709, "learning_rate": 8.940298507462686e-06, "loss": 0.3663, "step": 600 }, { "epoch": 0.091114471890812, "grad_norm": 1.7491657733917236, "learning_rate": 9.089552238805971e-06, "loss": 0.368, "step": 610 }, { "epoch": 0.0926081517578745, "grad_norm": 1.5877282619476318, "learning_rate": 9.238805970149255e-06, "loss": 0.3366, "step": 620 }, { "epoch": 0.09410183162493699, "grad_norm": 3.5212433338165283, "learning_rate": 9.388059701492538e-06, "loss": 0.3501, "step": 630 }, { "epoch": 0.09559551149199948, "grad_norm": 2.3926730155944824, "learning_rate": 9.537313432835821e-06, "loss": 0.3328, "step": 640 }, { "epoch": 0.09708919135906197, "grad_norm": 3.278258800506592, "learning_rate": 9.686567164179105e-06, "loss": 0.3635, "step": 650 }, { "epoch": 0.09858287122612445, "grad_norm": 2.390896797180176, "learning_rate": 9.835820895522388e-06, "loss": 0.3453, "step": 660 }, { "epoch": 0.10007655109318696, "grad_norm": 2.1486220359802246, "learning_rate": 9.985074626865673e-06, "loss": 0.327, "step": 670 }, { "epoch": 0.10157023096024945, "grad_norm": 3.7770419120788574, "learning_rate": 9.999944943338487e-06, "loss": 0.3048, "step": 680 }, { "epoch": 0.10306391082731194, "grad_norm": 3.788212776184082, "learning_rate": 9.999754625571397e-06, "loss": 0.3593, "step": 690 }, { "epoch": 0.10455759069437442, "grad_norm": 2.0790538787841797, "learning_rate": 9.999428372160074e-06, "loss": 0.3782, "step": 700 }, { "epoch": 0.10605127056143691, "grad_norm": 2.0736265182495117, "learning_rate": 9.998966191974846e-06, "loss": 0.3522, "step": 710 }, { "epoch": 0.10754495042849942, "grad_norm": 2.3214290142059326, "learning_rate": 9.998368097581685e-06, "loss": 0.3844, "step": 720 }, { "epoch": 0.1090386302955619, "grad_norm": 1.3843424320220947, "learning_rate": 9.997634105241855e-06, "loss": 0.3387, "step": 730 }, { "epoch": 0.1105323101626244, "grad_norm": 4.11653995513916, "learning_rate": 9.996764234911483e-06, "loss": 0.3523, "step": 740 }, { "epoch": 0.11202599002968688, "grad_norm": 1.6446789503097534, "learning_rate": 9.995758510241003e-06, "loss": 0.3339, "step": 750 }, { "epoch": 0.11351966989674937, "grad_norm": 1.4377137422561646, "learning_rate": 9.994616958574526e-06, "loss": 0.3523, "step": 760 }, { "epoch": 0.11501334976381188, "grad_norm": 1.9575657844543457, "learning_rate": 9.993339610949084e-06, "loss": 0.3654, "step": 770 }, { "epoch": 0.11650702963087436, "grad_norm": 1.8258610963821411, "learning_rate": 9.9919265020938e-06, "loss": 0.3465, "step": 780 }, { "epoch": 0.11800070949793685, "grad_norm": 2.1197669506073, "learning_rate": 9.99037767042893e-06, "loss": 0.36, "step": 790 }, { "epoch": 0.11949438936499934, "grad_norm": 1.671007752418518, "learning_rate": 9.988693158064826e-06, "loss": 0.3182, "step": 800 }, { "epoch": 0.12098806923206183, "grad_norm": 2.0421807765960693, "learning_rate": 9.986873010800792e-06, "loss": 0.3402, "step": 810 }, { "epoch": 0.12248174909912433, "grad_norm": 2.7439417839050293, "learning_rate": 9.984917278123832e-06, "loss": 0.3551, "step": 820 }, { "epoch": 0.12397542896618682, "grad_norm": 1.2339754104614258, "learning_rate": 9.982826013207314e-06, "loss": 0.3407, "step": 830 }, { "epoch": 0.12546910883324933, "grad_norm": 3.45686674118042, "learning_rate": 9.980599272909517e-06, "loss": 0.3262, "step": 840 }, { "epoch": 0.1269627887003118, "grad_norm": 2.196939468383789, "learning_rate": 9.978237117772086e-06, "loss": 0.3537, "step": 850 }, { "epoch": 0.1284564685673743, "grad_norm": 2.2232518196105957, "learning_rate": 9.975739612018391e-06, "loss": 0.3621, "step": 860 }, { "epoch": 0.12995014843443678, "grad_norm": 1.7306561470031738, "learning_rate": 9.973106823551772e-06, "loss": 0.3207, "step": 870 }, { "epoch": 0.13144382830149928, "grad_norm": 1.6579896211624146, "learning_rate": 9.970338823953704e-06, "loss": 0.3399, "step": 880 }, { "epoch": 0.13293750816856179, "grad_norm": 2.4403505325317383, "learning_rate": 9.96743568848184e-06, "loss": 0.3616, "step": 890 }, { "epoch": 0.13443118803562426, "grad_norm": 2.051017999649048, "learning_rate": 9.964397496067972e-06, "loss": 0.3408, "step": 900 }, { "epoch": 0.13592486790268676, "grad_norm": 1.935581922531128, "learning_rate": 9.961224329315886e-06, "loss": 0.3469, "step": 910 }, { "epoch": 0.13741854776974924, "grad_norm": 2.0615079402923584, "learning_rate": 9.957916274499103e-06, "loss": 0.3401, "step": 920 }, { "epoch": 0.13891222763681174, "grad_norm": 2.1460647583007812, "learning_rate": 9.954473421558554e-06, "loss": 0.328, "step": 930 }, { "epoch": 0.14040590750387424, "grad_norm": 1.6261128187179565, "learning_rate": 9.950895864100117e-06, "loss": 0.3483, "step": 940 }, { "epoch": 0.14189958737093672, "grad_norm": 2.0029091835021973, "learning_rate": 9.947183699392083e-06, "loss": 0.3655, "step": 950 }, { "epoch": 0.14339326723799922, "grad_norm": 2.068676233291626, "learning_rate": 9.943337028362503e-06, "loss": 0.3133, "step": 960 }, { "epoch": 0.1448869471050617, "grad_norm": 2.6636133193969727, "learning_rate": 9.93935595559645e-06, "loss": 0.3295, "step": 970 }, { "epoch": 0.1463806269721242, "grad_norm": 1.58219313621521, "learning_rate": 9.935240589333179e-06, "loss": 0.3247, "step": 980 }, { "epoch": 0.1478743068391867, "grad_norm": 1.7050349712371826, "learning_rate": 9.930991041463166e-06, "loss": 0.3172, "step": 990 }, { "epoch": 0.14936798670624918, "grad_norm": 1.4773019552230835, "learning_rate": 9.926607427525094e-06, "loss": 0.3445, "step": 1000 }, { "epoch": 0.14936798670624918, "eval_loss": 0.33145418763160706, "eval_runtime": 76.2509, "eval_samples_per_second": 7.095, "eval_steps_per_second": 3.554, "step": 1000 }, { "epoch": 0.15086166657331168, "grad_norm": 2.2282180786132812, "learning_rate": 9.922089866702685e-06, "loss": 0.3449, "step": 1010 }, { "epoch": 0.15235534644037416, "grad_norm": 1.7335081100463867, "learning_rate": 9.917438481821475e-06, "loss": 0.3664, "step": 1020 }, { "epoch": 0.15384902630743666, "grad_norm": 1.7053015232086182, "learning_rate": 9.912653399345473e-06, "loss": 0.3457, "step": 1030 }, { "epoch": 0.15534270617449916, "grad_norm": 1.1435269117355347, "learning_rate": 9.907734749373712e-06, "loss": 0.3177, "step": 1040 }, { "epoch": 0.15683638604156164, "grad_norm": 3.1279070377349854, "learning_rate": 9.90268266563673e-06, "loss": 0.351, "step": 1050 }, { "epoch": 0.15833006590862414, "grad_norm": 2.90409779548645, "learning_rate": 9.897497285492919e-06, "loss": 0.3403, "step": 1060 }, { "epoch": 0.15982374577568662, "grad_norm": 1.5271624326705933, "learning_rate": 9.892178749924792e-06, "loss": 0.3039, "step": 1070 }, { "epoch": 0.16131742564274912, "grad_norm": 1.622085452079773, "learning_rate": 9.886727203535163e-06, "loss": 0.3323, "step": 1080 }, { "epoch": 0.16281110550981162, "grad_norm": 2.4689345359802246, "learning_rate": 9.881142794543196e-06, "loss": 0.3069, "step": 1090 }, { "epoch": 0.1643047853768741, "grad_norm": 1.3529936075210571, "learning_rate": 9.875425674780388e-06, "loss": 0.3265, "step": 1100 }, { "epoch": 0.1657984652439366, "grad_norm": 1.7921439409255981, "learning_rate": 9.86957599968644e-06, "loss": 0.3439, "step": 1110 }, { "epoch": 0.16729214511099907, "grad_norm": 1.8610994815826416, "learning_rate": 9.863593928305031e-06, "loss": 0.323, "step": 1120 }, { "epoch": 0.16878582497806158, "grad_norm": 1.460547685623169, "learning_rate": 9.857479623279481e-06, "loss": 0.3502, "step": 1130 }, { "epoch": 0.17027950484512408, "grad_norm": 1.4841840267181396, "learning_rate": 9.851233250848355e-06, "loss": 0.332, "step": 1140 }, { "epoch": 0.17177318471218656, "grad_norm": 1.907037377357483, "learning_rate": 9.844854980840914e-06, "loss": 0.3251, "step": 1150 }, { "epoch": 0.17326686457924906, "grad_norm": 1.833953857421875, "learning_rate": 9.838344986672518e-06, "loss": 0.3628, "step": 1160 }, { "epoch": 0.17476054444631153, "grad_norm": 1.5002996921539307, "learning_rate": 9.831703445339904e-06, "loss": 0.3346, "step": 1170 }, { "epoch": 0.17625422431337404, "grad_norm": 1.2558107376098633, "learning_rate": 9.824930537416372e-06, "loss": 0.3429, "step": 1180 }, { "epoch": 0.17774790418043654, "grad_norm": 2.025219678878784, "learning_rate": 9.81802644704688e-06, "loss": 0.3385, "step": 1190 }, { "epoch": 0.17924158404749901, "grad_norm": 1.8134315013885498, "learning_rate": 9.810991361943037e-06, "loss": 0.3362, "step": 1200 }, { "epoch": 0.18073526391456152, "grad_norm": 1.2848788499832153, "learning_rate": 9.80382547337799e-06, "loss": 0.3146, "step": 1210 }, { "epoch": 0.182228943781624, "grad_norm": 1.6937299966812134, "learning_rate": 9.796528976181238e-06, "loss": 0.3192, "step": 1220 }, { "epoch": 0.1837226236486865, "grad_norm": 1.8493894338607788, "learning_rate": 9.78910206873333e-06, "loss": 0.3463, "step": 1230 }, { "epoch": 0.185216303515749, "grad_norm": 1.8786216974258423, "learning_rate": 9.781544952960458e-06, "loss": 0.3178, "step": 1240 }, { "epoch": 0.18670998338281147, "grad_norm": 1.6666313409805298, "learning_rate": 9.773857834328992e-06, "loss": 0.3263, "step": 1250 }, { "epoch": 0.18820366324987398, "grad_norm": 1.6735985279083252, "learning_rate": 9.766040921839867e-06, "loss": 0.3435, "step": 1260 }, { "epoch": 0.18969734311693645, "grad_norm": 1.4434776306152344, "learning_rate": 9.758094428022927e-06, "loss": 0.3291, "step": 1270 }, { "epoch": 0.19119102298399895, "grad_norm": 2.4167513847351074, "learning_rate": 9.750018568931122e-06, "loss": 0.3433, "step": 1280 }, { "epoch": 0.19268470285106146, "grad_norm": 1.7800685167312622, "learning_rate": 9.741813564134647e-06, "loss": 0.3223, "step": 1290 }, { "epoch": 0.19417838271812393, "grad_norm": 1.9175775051116943, "learning_rate": 9.733479636714978e-06, "loss": 0.3549, "step": 1300 }, { "epoch": 0.19567206258518643, "grad_norm": 2.1426539421081543, "learning_rate": 9.725017013258789e-06, "loss": 0.3243, "step": 1310 }, { "epoch": 0.1971657424522489, "grad_norm": 1.8812899589538574, "learning_rate": 9.716425923851804e-06, "loss": 0.3312, "step": 1320 }, { "epoch": 0.1986594223193114, "grad_norm": 1.4907119274139404, "learning_rate": 9.707706602072547e-06, "loss": 0.3499, "step": 1330 }, { "epoch": 0.20015310218637392, "grad_norm": 1.8211297988891602, "learning_rate": 9.69885928498597e-06, "loss": 0.3289, "step": 1340 }, { "epoch": 0.2016467820534364, "grad_norm": 1.4706814289093018, "learning_rate": 9.689884213137033e-06, "loss": 0.3252, "step": 1350 }, { "epoch": 0.2031404619204989, "grad_norm": 2.1436257362365723, "learning_rate": 9.68078163054414e-06, "loss": 0.3314, "step": 1360 }, { "epoch": 0.20463414178756137, "grad_norm": 2.100780725479126, "learning_rate": 9.671551784692529e-06, "loss": 0.3227, "step": 1370 }, { "epoch": 0.20612782165462387, "grad_norm": 1.4741297960281372, "learning_rate": 9.662194926527517e-06, "loss": 0.3467, "step": 1380 }, { "epoch": 0.20762150152168637, "grad_norm": 2.250545024871826, "learning_rate": 9.6527113104477e-06, "loss": 0.3504, "step": 1390 }, { "epoch": 0.20911518138874885, "grad_norm": 2.133129835128784, "learning_rate": 9.643101194298023e-06, "loss": 0.3535, "step": 1400 }, { "epoch": 0.21060886125581135, "grad_norm": 2.9924333095550537, "learning_rate": 9.633364839362777e-06, "loss": 0.3501, "step": 1410 }, { "epoch": 0.21210254112287383, "grad_norm": 2.5759615898132324, "learning_rate": 9.623502510358488e-06, "loss": 0.3427, "step": 1420 }, { "epoch": 0.21359622098993633, "grad_norm": 1.1932740211486816, "learning_rate": 9.613514475426722e-06, "loss": 0.3381, "step": 1430 }, { "epoch": 0.21508990085699883, "grad_norm": 1.5130189657211304, "learning_rate": 9.6034010061268e-06, "loss": 0.3297, "step": 1440 }, { "epoch": 0.2165835807240613, "grad_norm": 1.248481035232544, "learning_rate": 9.59316237742841e-06, "loss": 0.3251, "step": 1450 }, { "epoch": 0.2180772605911238, "grad_norm": 1.7967370748519897, "learning_rate": 9.582798867704131e-06, "loss": 0.3398, "step": 1460 }, { "epoch": 0.2195709404581863, "grad_norm": 1.2705239057540894, "learning_rate": 9.572310758721864e-06, "loss": 0.3053, "step": 1470 }, { "epoch": 0.2210646203252488, "grad_norm": 1.6166293621063232, "learning_rate": 9.561698335637171e-06, "loss": 0.3424, "step": 1480 }, { "epoch": 0.2225583001923113, "grad_norm": 1.8217055797576904, "learning_rate": 9.550961886985528e-06, "loss": 0.347, "step": 1490 }, { "epoch": 0.22405198005937377, "grad_norm": 1.6405028104782104, "learning_rate": 9.540101704674473e-06, "loss": 0.3383, "step": 1500 }, { "epoch": 0.22405198005937377, "eval_loss": 0.326803594827652, "eval_runtime": 76.2278, "eval_samples_per_second": 7.097, "eval_steps_per_second": 3.555, "step": 1500 }, { "epoch": 0.22554565992643627, "grad_norm": 2.2624378204345703, "learning_rate": 9.529118083975672e-06, "loss": 0.335, "step": 1510 }, { "epoch": 0.22703933979349875, "grad_norm": 1.2416856288909912, "learning_rate": 9.518011323516892e-06, "loss": 0.342, "step": 1520 }, { "epoch": 0.22853301966056125, "grad_norm": 1.1462935209274292, "learning_rate": 9.506781725273879e-06, "loss": 0.3226, "step": 1530 }, { "epoch": 0.23002669952762375, "grad_norm": 1.9097304344177246, "learning_rate": 9.495429594562151e-06, "loss": 0.3213, "step": 1540 }, { "epoch": 0.23152037939468623, "grad_norm": 1.6176527738571167, "learning_rate": 9.483955240028695e-06, "loss": 0.3348, "step": 1550 }, { "epoch": 0.23301405926174873, "grad_norm": 1.6169483661651611, "learning_rate": 9.472358973643576e-06, "loss": 0.3237, "step": 1560 }, { "epoch": 0.2345077391288112, "grad_norm": 1.86874258518219, "learning_rate": 9.460641110691456e-06, "loss": 0.3475, "step": 1570 }, { "epoch": 0.2360014189958737, "grad_norm": 1.534540057182312, "learning_rate": 9.448801969763016e-06, "loss": 0.3487, "step": 1580 }, { "epoch": 0.2374950988629362, "grad_norm": 1.68146550655365, "learning_rate": 9.436841872746309e-06, "loss": 0.3128, "step": 1590 }, { "epoch": 0.23898877872999869, "grad_norm": 1.0647422075271606, "learning_rate": 9.424761144817987e-06, "loss": 0.3437, "step": 1600 }, { "epoch": 0.2404824585970612, "grad_norm": 1.4840996265411377, "learning_rate": 9.412560114434477e-06, "loss": 0.3483, "step": 1610 }, { "epoch": 0.24197613846412366, "grad_norm": 2.902223587036133, "learning_rate": 9.400239113323042e-06, "loss": 0.3654, "step": 1620 }, { "epoch": 0.24346981833118617, "grad_norm": 1.711083173751831, "learning_rate": 9.387798476472766e-06, "loss": 0.3369, "step": 1630 }, { "epoch": 0.24496349819824867, "grad_norm": 1.4812666177749634, "learning_rate": 9.37523854212545e-06, "loss": 0.3521, "step": 1640 }, { "epoch": 0.24645717806531114, "grad_norm": 1.1067218780517578, "learning_rate": 9.362559651766402e-06, "loss": 0.302, "step": 1650 }, { "epoch": 0.24795085793237365, "grad_norm": 1.2541941404342651, "learning_rate": 9.349762150115163e-06, "loss": 0.3348, "step": 1660 }, { "epoch": 0.24944453779943612, "grad_norm": 1.125554084777832, "learning_rate": 9.336846385116138e-06, "loss": 0.3444, "step": 1670 }, { "epoch": 0.25093821766649865, "grad_norm": 1.8702635765075684, "learning_rate": 9.323812707929126e-06, "loss": 0.3092, "step": 1680 }, { "epoch": 0.2524318975335611, "grad_norm": 1.7931956052780151, "learning_rate": 9.31066147291978e-06, "loss": 0.3416, "step": 1690 }, { "epoch": 0.2539255774006236, "grad_norm": 1.8561383485794067, "learning_rate": 9.297393037649965e-06, "loss": 0.3521, "step": 1700 }, { "epoch": 0.2554192572676861, "grad_norm": 1.8236286640167236, "learning_rate": 9.284007762868047e-06, "loss": 0.3025, "step": 1710 }, { "epoch": 0.2569129371347486, "grad_norm": 1.4247581958770752, "learning_rate": 9.270506012499072e-06, "loss": 0.336, "step": 1720 }, { "epoch": 0.2584066170018111, "grad_norm": 1.79005765914917, "learning_rate": 9.256888153634888e-06, "loss": 0.3153, "step": 1730 }, { "epoch": 0.25990029686887356, "grad_norm": 2.0392203330993652, "learning_rate": 9.243154556524144e-06, "loss": 0.3462, "step": 1740 }, { "epoch": 0.26139397673593606, "grad_norm": 1.978434681892395, "learning_rate": 9.229305594562236e-06, "loss": 0.3491, "step": 1750 }, { "epoch": 0.26288765660299857, "grad_norm": 2.794302463531494, "learning_rate": 9.215341644281161e-06, "loss": 0.3432, "step": 1760 }, { "epoch": 0.26438133647006107, "grad_norm": 2.9479925632476807, "learning_rate": 9.201263085339266e-06, "loss": 0.3267, "step": 1770 }, { "epoch": 0.26587501633712357, "grad_norm": 1.6784942150115967, "learning_rate": 9.187070300510927e-06, "loss": 0.3403, "step": 1780 }, { "epoch": 0.267368696204186, "grad_norm": 1.38883638381958, "learning_rate": 9.172763675676153e-06, "loss": 0.3242, "step": 1790 }, { "epoch": 0.2688623760712485, "grad_norm": 2.5442349910736084, "learning_rate": 9.158343599810087e-06, "loss": 0.3369, "step": 1800 }, { "epoch": 0.270356055938311, "grad_norm": 1.3056666851043701, "learning_rate": 9.143810464972429e-06, "loss": 0.3129, "step": 1810 }, { "epoch": 0.2718497358053735, "grad_norm": 1.8842471837997437, "learning_rate": 9.12916466629678e-06, "loss": 0.3257, "step": 1820 }, { "epoch": 0.27334341567243603, "grad_norm": 0.9923204183578491, "learning_rate": 9.114406601979895e-06, "loss": 0.3208, "step": 1830 }, { "epoch": 0.2748370955394985, "grad_norm": 1.6141374111175537, "learning_rate": 9.099536673270864e-06, "loss": 0.3253, "step": 1840 }, { "epoch": 0.276330775406561, "grad_norm": 2.0269787311553955, "learning_rate": 9.084555284460192e-06, "loss": 0.3179, "step": 1850 }, { "epoch": 0.2778244552736235, "grad_norm": 1.620477557182312, "learning_rate": 9.06946284286882e-06, "loss": 0.3224, "step": 1860 }, { "epoch": 0.279318135140686, "grad_norm": 1.725224494934082, "learning_rate": 9.054259758837038e-06, "loss": 0.3288, "step": 1870 }, { "epoch": 0.2808118150077485, "grad_norm": 2.209329605102539, "learning_rate": 9.038946445713335e-06, "loss": 0.3421, "step": 1880 }, { "epoch": 0.28230549487481094, "grad_norm": 1.3899812698364258, "learning_rate": 9.02352331984316e-06, "loss": 0.3255, "step": 1890 }, { "epoch": 0.28379917474187344, "grad_norm": 1.5803393125534058, "learning_rate": 9.007990800557601e-06, "loss": 0.3147, "step": 1900 }, { "epoch": 0.28529285460893594, "grad_norm": 1.134922742843628, "learning_rate": 8.992349310161989e-06, "loss": 0.3412, "step": 1910 }, { "epoch": 0.28678653447599844, "grad_norm": 1.9992294311523438, "learning_rate": 8.976599273924406e-06, "loss": 0.3429, "step": 1920 }, { "epoch": 0.28828021434306095, "grad_norm": 1.468029260635376, "learning_rate": 8.960741120064131e-06, "loss": 0.3279, "step": 1930 }, { "epoch": 0.2897738942101234, "grad_norm": 1.7822861671447754, "learning_rate": 8.944775279739996e-06, "loss": 0.3192, "step": 1940 }, { "epoch": 0.2912675740771859, "grad_norm": 1.5257068872451782, "learning_rate": 8.928702187038665e-06, "loss": 0.3359, "step": 1950 }, { "epoch": 0.2927612539442484, "grad_norm": 1.5627810955047607, "learning_rate": 8.91252227896282e-06, "loss": 0.3255, "step": 1960 }, { "epoch": 0.2942549338113109, "grad_norm": 1.1691981554031372, "learning_rate": 8.8962359954193e-06, "loss": 0.3398, "step": 1970 }, { "epoch": 0.2957486136783734, "grad_norm": 2.4454123973846436, "learning_rate": 8.879843779207123e-06, "loss": 0.3137, "step": 1980 }, { "epoch": 0.29724229354543585, "grad_norm": 1.4002143144607544, "learning_rate": 8.863346076005452e-06, "loss": 0.3262, "step": 1990 }, { "epoch": 0.29873597341249836, "grad_norm": 1.3549312353134155, "learning_rate": 8.846743334361486e-06, "loss": 0.3352, "step": 2000 }, { "epoch": 0.29873597341249836, "eval_loss": 0.32243964076042175, "eval_runtime": 76.2222, "eval_samples_per_second": 7.098, "eval_steps_per_second": 3.555, "step": 2000 }, { "epoch": 0.30022965327956086, "grad_norm": 1.2843849658966064, "learning_rate": 8.830036005678253e-06, "loss": 0.3178, "step": 2010 }, { "epoch": 0.30172333314662336, "grad_norm": 1.5276010036468506, "learning_rate": 8.81322454420234e-06, "loss": 0.337, "step": 2020 }, { "epoch": 0.30321701301368587, "grad_norm": 1.4595482349395752, "learning_rate": 8.796309407011553e-06, "loss": 0.3196, "step": 2030 }, { "epoch": 0.3047106928807483, "grad_norm": 1.7560086250305176, "learning_rate": 8.779291054002468e-06, "loss": 0.3407, "step": 2040 }, { "epoch": 0.3062043727478108, "grad_norm": 1.4491099119186401, "learning_rate": 8.762169947877951e-06, "loss": 0.3225, "step": 2050 }, { "epoch": 0.3076980526148733, "grad_norm": 1.2083287239074707, "learning_rate": 8.74494655413457e-06, "loss": 0.3135, "step": 2060 }, { "epoch": 0.3091917324819358, "grad_norm": 1.734601616859436, "learning_rate": 8.727621341049924e-06, "loss": 0.3435, "step": 2070 }, { "epoch": 0.3106854123489983, "grad_norm": 1.7759486436843872, "learning_rate": 8.710194779669932e-06, "loss": 0.3192, "step": 2080 }, { "epoch": 0.31217909221606077, "grad_norm": 1.632818579673767, "learning_rate": 8.692667343796013e-06, "loss": 0.334, "step": 2090 }, { "epoch": 0.3136727720831233, "grad_norm": 1.8493646383285522, "learning_rate": 8.675039509972216e-06, "loss": 0.3345, "step": 2100 }, { "epoch": 0.3151664519501858, "grad_norm": 2.082334280014038, "learning_rate": 8.657311757472247e-06, "loss": 0.3551, "step": 2110 }, { "epoch": 0.3166601318172483, "grad_norm": 2.2276527881622314, "learning_rate": 8.639484568286451e-06, "loss": 0.3335, "step": 2120 }, { "epoch": 0.3181538116843108, "grad_norm": 1.907583475112915, "learning_rate": 8.621558427108705e-06, "loss": 0.3219, "step": 2130 }, { "epoch": 0.31964749155137323, "grad_norm": 1.540067434310913, "learning_rate": 8.603533821323238e-06, "loss": 0.322, "step": 2140 }, { "epoch": 0.32114117141843573, "grad_norm": 1.2835497856140137, "learning_rate": 8.585411240991378e-06, "loss": 0.3143, "step": 2150 }, { "epoch": 0.32263485128549824, "grad_norm": 1.189209222793579, "learning_rate": 8.56719117883823e-06, "loss": 0.3333, "step": 2160 }, { "epoch": 0.32412853115256074, "grad_norm": 1.5749543905258179, "learning_rate": 8.548874130239286e-06, "loss": 0.3257, "step": 2170 }, { "epoch": 0.32562221101962324, "grad_norm": 2.1032919883728027, "learning_rate": 8.530460593206942e-06, "loss": 0.3155, "step": 2180 }, { "epoch": 0.3271158908866857, "grad_norm": 1.9780592918395996, "learning_rate": 8.511951068376975e-06, "loss": 0.3199, "step": 2190 }, { "epoch": 0.3286095707537482, "grad_norm": 1.8979151248931885, "learning_rate": 8.493346058994916e-06, "loss": 0.3323, "step": 2200 }, { "epoch": 0.3301032506208107, "grad_norm": 1.7598830461502075, "learning_rate": 8.474646070902376e-06, "loss": 0.3202, "step": 2210 }, { "epoch": 0.3315969304878732, "grad_norm": 1.7886403799057007, "learning_rate": 8.455851612523291e-06, "loss": 0.3319, "step": 2220 }, { "epoch": 0.3330906103549357, "grad_norm": 1.9333144426345825, "learning_rate": 8.4369631948501e-06, "loss": 0.3377, "step": 2230 }, { "epoch": 0.33458429022199815, "grad_norm": 1.5406423807144165, "learning_rate": 8.417981331429855e-06, "loss": 0.3359, "step": 2240 }, { "epoch": 0.33607797008906065, "grad_norm": 1.1198780536651611, "learning_rate": 8.39890653835024e-06, "loss": 0.3423, "step": 2250 }, { "epoch": 0.33757164995612315, "grad_norm": 1.867664098739624, "learning_rate": 8.379739334225571e-06, "loss": 0.3274, "step": 2260 }, { "epoch": 0.33906532982318566, "grad_norm": 1.5488725900650024, "learning_rate": 8.360480240182666e-06, "loss": 0.3366, "step": 2270 }, { "epoch": 0.34055900969024816, "grad_norm": 1.5203229188919067, "learning_rate": 8.341129779846695e-06, "loss": 0.3229, "step": 2280 }, { "epoch": 0.3420526895573106, "grad_norm": 1.774835228919983, "learning_rate": 8.321688479326935e-06, "loss": 0.3307, "step": 2290 }, { "epoch": 0.3435463694243731, "grad_norm": 1.333151936531067, "learning_rate": 8.302156867202468e-06, "loss": 0.3216, "step": 2300 }, { "epoch": 0.3450400492914356, "grad_norm": 1.3206020593643188, "learning_rate": 8.28253547450781e-06, "loss": 0.3125, "step": 2310 }, { "epoch": 0.3465337291584981, "grad_norm": 1.8065084218978882, "learning_rate": 8.262824834718471e-06, "loss": 0.3201, "step": 2320 }, { "epoch": 0.3480274090255606, "grad_norm": 2.162179708480835, "learning_rate": 8.243025483736458e-06, "loss": 0.3156, "step": 2330 }, { "epoch": 0.34952108889262307, "grad_norm": 1.118371844291687, "learning_rate": 8.22313795987569e-06, "loss": 0.3433, "step": 2340 }, { "epoch": 0.35101476875968557, "grad_norm": 1.838300347328186, "learning_rate": 8.20316280384738e-06, "loss": 0.3154, "step": 2350 }, { "epoch": 0.35250844862674807, "grad_norm": 1.6531926393508911, "learning_rate": 8.183100558745317e-06, "loss": 0.3072, "step": 2360 }, { "epoch": 0.3540021284938106, "grad_norm": 2.1075356006622314, "learning_rate": 8.162951770031116e-06, "loss": 0.3291, "step": 2370 }, { "epoch": 0.3554958083608731, "grad_norm": 1.7505310773849487, "learning_rate": 8.142716985519373e-06, "loss": 0.3222, "step": 2380 }, { "epoch": 0.3569894882279355, "grad_norm": 1.5103789567947388, "learning_rate": 8.122396755362782e-06, "loss": 0.3086, "step": 2390 }, { "epoch": 0.35848316809499803, "grad_norm": 1.8631788492202759, "learning_rate": 8.10199163203717e-06, "loss": 0.3312, "step": 2400 }, { "epoch": 0.35997684796206053, "grad_norm": 1.6605143547058105, "learning_rate": 8.081502170326478e-06, "loss": 0.3228, "step": 2410 }, { "epoch": 0.36147052782912303, "grad_norm": 1.1152336597442627, "learning_rate": 8.060928927307687e-06, "loss": 0.3307, "step": 2420 }, { "epoch": 0.36296420769618554, "grad_norm": 1.3379615545272827, "learning_rate": 8.040272462335648e-06, "loss": 0.323, "step": 2430 }, { "epoch": 0.364457887563248, "grad_norm": 2.2633602619171143, "learning_rate": 8.019533337027903e-06, "loss": 0.3195, "step": 2440 }, { "epoch": 0.3659515674303105, "grad_norm": 1.8531728982925415, "learning_rate": 7.998712115249391e-06, "loss": 0.3531, "step": 2450 }, { "epoch": 0.367445247297373, "grad_norm": 1.6278972625732422, "learning_rate": 7.977809363097135e-06, "loss": 0.3373, "step": 2460 }, { "epoch": 0.3689389271644355, "grad_norm": 1.7813271284103394, "learning_rate": 7.956825648884842e-06, "loss": 0.3506, "step": 2470 }, { "epoch": 0.370432607031498, "grad_norm": 2.0010931491851807, "learning_rate": 7.935761543127449e-06, "loss": 0.3166, "step": 2480 }, { "epoch": 0.37192628689856044, "grad_norm": 2.6339111328125, "learning_rate": 7.91461761852562e-06, "loss": 0.32, "step": 2490 }, { "epoch": 0.37341996676562295, "grad_norm": 1.8536508083343506, "learning_rate": 7.893394449950166e-06, "loss": 0.3027, "step": 2500 }, { "epoch": 0.37341996676562295, "eval_loss": 0.31971076130867004, "eval_runtime": 76.1507, "eval_samples_per_second": 7.104, "eval_steps_per_second": 3.559, "step": 2500 }, { "epoch": 0.37491364663268545, "grad_norm": 1.504650592803955, "learning_rate": 7.87209261442643e-06, "loss": 0.3075, "step": 2510 }, { "epoch": 0.37640732649974795, "grad_norm": 1.0728139877319336, "learning_rate": 7.850712691118577e-06, "loss": 0.3329, "step": 2520 }, { "epoch": 0.37790100636681045, "grad_norm": 1.5715535879135132, "learning_rate": 7.829255261313862e-06, "loss": 0.3105, "step": 2530 }, { "epoch": 0.3793946862338729, "grad_norm": 0.8371075987815857, "learning_rate": 7.807720908406826e-06, "loss": 0.3318, "step": 2540 }, { "epoch": 0.3808883661009354, "grad_norm": 2.6301848888397217, "learning_rate": 7.786110217883429e-06, "loss": 0.3471, "step": 2550 }, { "epoch": 0.3823820459679979, "grad_norm": 1.0217111110687256, "learning_rate": 7.764423777305132e-06, "loss": 0.2987, "step": 2560 }, { "epoch": 0.3838757258350604, "grad_norm": 1.5058764219284058, "learning_rate": 7.742662176292926e-06, "loss": 0.301, "step": 2570 }, { "epoch": 0.3853694057021229, "grad_norm": 1.2323505878448486, "learning_rate": 7.720826006511297e-06, "loss": 0.3135, "step": 2580 }, { "epoch": 0.38686308556918536, "grad_norm": 1.6528573036193848, "learning_rate": 7.698915861652139e-06, "loss": 0.3357, "step": 2590 }, { "epoch": 0.38835676543624786, "grad_norm": 1.556429386138916, "learning_rate": 7.676932337418624e-06, "loss": 0.3063, "step": 2600 }, { "epoch": 0.38985044530331037, "grad_norm": 1.9085198640823364, "learning_rate": 7.654876031508981e-06, "loss": 0.3214, "step": 2610 }, { "epoch": 0.39134412517037287, "grad_norm": 1.279447078704834, "learning_rate": 7.63274754360028e-06, "loss": 0.3206, "step": 2620 }, { "epoch": 0.3928378050374354, "grad_norm": 2.345536231994629, "learning_rate": 7.610547475332089e-06, "loss": 0.3254, "step": 2630 }, { "epoch": 0.3943314849044978, "grad_norm": 0.9263664484024048, "learning_rate": 7.588276430290151e-06, "loss": 0.3234, "step": 2640 }, { "epoch": 0.3958251647715603, "grad_norm": 1.5908204317092896, "learning_rate": 7.56593501398995e-06, "loss": 0.3246, "step": 2650 }, { "epoch": 0.3973188446386228, "grad_norm": 1.5689475536346436, "learning_rate": 7.5435238338602604e-06, "loss": 0.3183, "step": 2660 }, { "epoch": 0.39881252450568533, "grad_norm": 0.8952176570892334, "learning_rate": 7.521043499226625e-06, "loss": 0.3019, "step": 2670 }, { "epoch": 0.40030620437274783, "grad_norm": 1.4977798461914062, "learning_rate": 7.498494621294796e-06, "loss": 0.347, "step": 2680 }, { "epoch": 0.4017998842398103, "grad_norm": 1.0641767978668213, "learning_rate": 7.475877813134106e-06, "loss": 0.341, "step": 2690 }, { "epoch": 0.4032935641068728, "grad_norm": 1.3907352685928345, "learning_rate": 7.453193689660811e-06, "loss": 0.3206, "step": 2700 }, { "epoch": 0.4047872439739353, "grad_norm": 1.4206258058547974, "learning_rate": 7.430442867621365e-06, "loss": 0.3058, "step": 2710 }, { "epoch": 0.4062809238409978, "grad_norm": 1.0893877744674683, "learning_rate": 7.407625965575656e-06, "loss": 0.306, "step": 2720 }, { "epoch": 0.4077746037080603, "grad_norm": 1.5306363105773926, "learning_rate": 7.384743603880181e-06, "loss": 0.3395, "step": 2730 }, { "epoch": 0.40926828357512274, "grad_norm": 1.5694290399551392, "learning_rate": 7.361796404671187e-06, "loss": 0.3044, "step": 2740 }, { "epoch": 0.41076196344218524, "grad_norm": 1.862500786781311, "learning_rate": 7.338784991847755e-06, "loss": 0.3307, "step": 2750 }, { "epoch": 0.41225564330924774, "grad_norm": 1.3926466703414917, "learning_rate": 7.315709991054832e-06, "loss": 0.3052, "step": 2760 }, { "epoch": 0.41374932317631025, "grad_norm": 1.6417464017868042, "learning_rate": 7.292572029666228e-06, "loss": 0.3108, "step": 2770 }, { "epoch": 0.41524300304337275, "grad_norm": 2.3295059204101562, "learning_rate": 7.269371736767552e-06, "loss": 0.3299, "step": 2780 }, { "epoch": 0.4167366829104352, "grad_norm": 1.707053303718567, "learning_rate": 7.246109743139111e-06, "loss": 0.3129, "step": 2790 }, { "epoch": 0.4182303627774977, "grad_norm": 1.233490228652954, "learning_rate": 7.222786681238762e-06, "loss": 0.3234, "step": 2800 }, { "epoch": 0.4197240426445602, "grad_norm": 0.8047583699226379, "learning_rate": 7.1994031851847125e-06, "loss": 0.3038, "step": 2810 }, { "epoch": 0.4212177225116227, "grad_norm": 1.466469168663025, "learning_rate": 7.175959890738282e-06, "loss": 0.3382, "step": 2820 }, { "epoch": 0.4227114023786852, "grad_norm": 1.0184977054595947, "learning_rate": 7.152457435286619e-06, "loss": 0.3143, "step": 2830 }, { "epoch": 0.42420508224574766, "grad_norm": 1.102300763130188, "learning_rate": 7.128896457825364e-06, "loss": 0.3228, "step": 2840 }, { "epoch": 0.42569876211281016, "grad_norm": 1.8604798316955566, "learning_rate": 7.1052775989412855e-06, "loss": 0.2981, "step": 2850 }, { "epoch": 0.42719244197987266, "grad_norm": 1.1831308603286743, "learning_rate": 7.081601500794857e-06, "loss": 0.3297, "step": 2860 }, { "epoch": 0.42868612184693516, "grad_norm": 1.7088931798934937, "learning_rate": 7.057868807102799e-06, "loss": 0.3101, "step": 2870 }, { "epoch": 0.43017980171399767, "grad_norm": 1.3123115301132202, "learning_rate": 7.034080163120579e-06, "loss": 0.3258, "step": 2880 }, { "epoch": 0.4316734815810601, "grad_norm": 1.3527169227600098, "learning_rate": 7.010236215624867e-06, "loss": 0.3029, "step": 2890 }, { "epoch": 0.4331671614481226, "grad_norm": 1.361512541770935, "learning_rate": 6.986337612895949e-06, "loss": 0.3392, "step": 2900 }, { "epoch": 0.4346608413151851, "grad_norm": 1.4390591382980347, "learning_rate": 6.962385004700105e-06, "loss": 0.3351, "step": 2910 }, { "epoch": 0.4361545211822476, "grad_norm": 1.67287278175354, "learning_rate": 6.938379042271939e-06, "loss": 0.3255, "step": 2920 }, { "epoch": 0.4376482010493101, "grad_norm": 1.2548269033432007, "learning_rate": 6.914320378296674e-06, "loss": 0.3262, "step": 2930 }, { "epoch": 0.4391418809163726, "grad_norm": 1.2193247079849243, "learning_rate": 6.89020966689241e-06, "loss": 0.3412, "step": 2940 }, { "epoch": 0.4406355607834351, "grad_norm": 1.1901212930679321, "learning_rate": 6.866047563592334e-06, "loss": 0.3002, "step": 2950 }, { "epoch": 0.4421292406504976, "grad_norm": 1.65078866481781, "learning_rate": 6.841834725326899e-06, "loss": 0.3172, "step": 2960 }, { "epoch": 0.4436229205175601, "grad_norm": 1.3838766813278198, "learning_rate": 6.817571810405967e-06, "loss": 0.3215, "step": 2970 }, { "epoch": 0.4451166003846226, "grad_norm": 1.2286713123321533, "learning_rate": 6.793259478500907e-06, "loss": 0.3208, "step": 2980 }, { "epoch": 0.44661028025168503, "grad_norm": 0.9910550713539124, "learning_rate": 6.7688983906266544e-06, "loss": 0.3293, "step": 2990 }, { "epoch": 0.44810396011874754, "grad_norm": 1.6711299419403076, "learning_rate": 6.74448920912375e-06, "loss": 0.3272, "step": 3000 }, { "epoch": 0.44810396011874754, "eval_loss": 0.31658411026000977, "eval_runtime": 76.2262, "eval_samples_per_second": 7.097, "eval_steps_per_second": 3.555, "step": 3000 }, { "epoch": 0.44959763998581004, "grad_norm": 1.8898260593414307, "learning_rate": 6.720032597640326e-06, "loss": 0.332, "step": 3010 }, { "epoch": 0.45109131985287254, "grad_norm": 1.8445961475372314, "learning_rate": 6.695529221114059e-06, "loss": 0.3165, "step": 3020 }, { "epoch": 0.45258499971993504, "grad_norm": 1.3706282377243042, "learning_rate": 6.670979745754101e-06, "loss": 0.3165, "step": 3030 }, { "epoch": 0.4540786795869975, "grad_norm": 1.7057021856307983, "learning_rate": 6.646384839022955e-06, "loss": 0.3045, "step": 3040 }, { "epoch": 0.45557235945406, "grad_norm": 1.5170303583145142, "learning_rate": 6.621745169618337e-06, "loss": 0.3061, "step": 3050 }, { "epoch": 0.4570660393211225, "grad_norm": 2.1427805423736572, "learning_rate": 6.597061407454987e-06, "loss": 0.31, "step": 3060 }, { "epoch": 0.458559719188185, "grad_norm": 1.1289193630218506, "learning_rate": 6.572334223646468e-06, "loss": 0.3388, "step": 3070 }, { "epoch": 0.4600533990552475, "grad_norm": 1.3998080492019653, "learning_rate": 6.5475642904869004e-06, "loss": 0.3296, "step": 3080 }, { "epoch": 0.46154707892230995, "grad_norm": 1.4870209693908691, "learning_rate": 6.5227522814327e-06, "loss": 0.3441, "step": 3090 }, { "epoch": 0.46304075878937245, "grad_norm": 1.3324146270751953, "learning_rate": 6.4978988710842585e-06, "loss": 0.3072, "step": 3100 }, { "epoch": 0.46453443865643496, "grad_norm": 2.6711628437042236, "learning_rate": 6.473004735167605e-06, "loss": 0.3199, "step": 3110 }, { "epoch": 0.46602811852349746, "grad_norm": 1.2170815467834473, "learning_rate": 6.44807055051604e-06, "loss": 0.3184, "step": 3120 }, { "epoch": 0.46752179839055996, "grad_norm": 1.2690013647079468, "learning_rate": 6.423096995051722e-06, "loss": 0.3292, "step": 3130 }, { "epoch": 0.4690154782576224, "grad_norm": 1.5772716999053955, "learning_rate": 6.398084747767241e-06, "loss": 0.3219, "step": 3140 }, { "epoch": 0.4705091581246849, "grad_norm": 1.8444935083389282, "learning_rate": 6.373034488707159e-06, "loss": 0.3282, "step": 3150 }, { "epoch": 0.4720028379917474, "grad_norm": 1.8097927570343018, "learning_rate": 6.347946898949524e-06, "loss": 0.3426, "step": 3160 }, { "epoch": 0.4734965178588099, "grad_norm": 1.232932209968567, "learning_rate": 6.322822660587343e-06, "loss": 0.3195, "step": 3170 }, { "epoch": 0.4749901977258724, "grad_norm": 1.4135682582855225, "learning_rate": 6.297662456710043e-06, "loss": 0.3125, "step": 3180 }, { "epoch": 0.47648387759293487, "grad_norm": 1.2826404571533203, "learning_rate": 6.272466971384902e-06, "loss": 0.3418, "step": 3190 }, { "epoch": 0.47797755745999737, "grad_norm": 1.1015794277191162, "learning_rate": 6.24723688963844e-06, "loss": 0.3114, "step": 3200 }, { "epoch": 0.4794712373270599, "grad_norm": 1.6762737035751343, "learning_rate": 6.221972897437804e-06, "loss": 0.3315, "step": 3210 }, { "epoch": 0.4809649171941224, "grad_norm": 1.5286458730697632, "learning_rate": 6.1966756816721195e-06, "loss": 0.3081, "step": 3220 }, { "epoch": 0.4824585970611849, "grad_norm": 2.174837827682495, "learning_rate": 6.171345930133798e-06, "loss": 0.3251, "step": 3230 }, { "epoch": 0.4839522769282473, "grad_norm": 1.966800570487976, "learning_rate": 6.145984331499859e-06, "loss": 0.33, "step": 3240 }, { "epoch": 0.48544595679530983, "grad_norm": 1.1323667764663696, "learning_rate": 6.120591575313189e-06, "loss": 0.322, "step": 3250 }, { "epoch": 0.48693963666237233, "grad_norm": 1.1445270776748657, "learning_rate": 6.095168351963805e-06, "loss": 0.3066, "step": 3260 }, { "epoch": 0.48843331652943484, "grad_norm": 0.9923702478408813, "learning_rate": 6.069715352670076e-06, "loss": 0.3006, "step": 3270 }, { "epoch": 0.48992699639649734, "grad_norm": 1.8956522941589355, "learning_rate": 6.044233269459935e-06, "loss": 0.3309, "step": 3280 }, { "epoch": 0.4914206762635598, "grad_norm": 1.9034560918807983, "learning_rate": 6.018722795152062e-06, "loss": 0.3168, "step": 3290 }, { "epoch": 0.4929143561306223, "grad_norm": 1.3808101415634155, "learning_rate": 5.993184623337045e-06, "loss": 0.3148, "step": 3300 }, { "epoch": 0.4944080359976848, "grad_norm": 1.3605296611785889, "learning_rate": 5.967619448358529e-06, "loss": 0.3128, "step": 3310 }, { "epoch": 0.4959017158647473, "grad_norm": 1.7083598375320435, "learning_rate": 5.942027965294329e-06, "loss": 0.3224, "step": 3320 }, { "epoch": 0.4973953957318098, "grad_norm": 2.0568454265594482, "learning_rate": 5.916410869937541e-06, "loss": 0.3199, "step": 3330 }, { "epoch": 0.49888907559887224, "grad_norm": 1.6961411237716675, "learning_rate": 5.890768858777613e-06, "loss": 0.3356, "step": 3340 }, { "epoch": 0.5003827554659348, "grad_norm": 1.5090399980545044, "learning_rate": 5.865102628981424e-06, "loss": 0.3014, "step": 3350 }, { "epoch": 0.5018764353329973, "grad_norm": 1.3667364120483398, "learning_rate": 5.839412878374313e-06, "loss": 0.3386, "step": 3360 }, { "epoch": 0.5033701152000597, "grad_norm": 1.5758767127990723, "learning_rate": 5.813700305421119e-06, "loss": 0.2939, "step": 3370 }, { "epoch": 0.5048637950671222, "grad_norm": 1.0520446300506592, "learning_rate": 5.787965609207184e-06, "loss": 0.2978, "step": 3380 }, { "epoch": 0.5063574749341847, "grad_norm": 1.4224300384521484, "learning_rate": 5.762209489419343e-06, "loss": 0.3168, "step": 3390 }, { "epoch": 0.5078511548012472, "grad_norm": 1.1233537197113037, "learning_rate": 5.736432646326911e-06, "loss": 0.3219, "step": 3400 }, { "epoch": 0.5093448346683097, "grad_norm": 1.480785608291626, "learning_rate": 5.710635780762639e-06, "loss": 0.3227, "step": 3410 }, { "epoch": 0.5108385145353722, "grad_norm": 1.2440319061279297, "learning_rate": 5.68481959410365e-06, "loss": 0.3391, "step": 3420 }, { "epoch": 0.5123321944024347, "grad_norm": 1.2866686582565308, "learning_rate": 5.658984788252384e-06, "loss": 0.2983, "step": 3430 }, { "epoch": 0.5138258742694972, "grad_norm": 1.2832037210464478, "learning_rate": 5.633132065617509e-06, "loss": 0.3066, "step": 3440 }, { "epoch": 0.5153195541365597, "grad_norm": 1.5093879699707031, "learning_rate": 5.607262129094819e-06, "loss": 0.3198, "step": 3450 }, { "epoch": 0.5168132340036222, "grad_norm": 1.5857967138290405, "learning_rate": 5.581375682048131e-06, "loss": 0.3187, "step": 3460 }, { "epoch": 0.5183069138706846, "grad_norm": 1.2231806516647339, "learning_rate": 5.555473428290154e-06, "loss": 0.3029, "step": 3470 }, { "epoch": 0.5198005937377471, "grad_norm": 1.2822185754776, "learning_rate": 5.5295560720633575e-06, "loss": 0.3046, "step": 3480 }, { "epoch": 0.5212942736048096, "grad_norm": 1.2995489835739136, "learning_rate": 5.503624318020829e-06, "loss": 0.3295, "step": 3490 }, { "epoch": 0.5227879534718721, "grad_norm": 1.9352302551269531, "learning_rate": 5.477678871207105e-06, "loss": 0.3216, "step": 3500 }, { "epoch": 0.5227879534718721, "eval_loss": 0.31384068727493286, "eval_runtime": 76.2107, "eval_samples_per_second": 7.099, "eval_steps_per_second": 3.556, "step": 3500 }, { "epoch": 0.5242816333389346, "grad_norm": 1.2406340837478638, "learning_rate": 5.4517204370390086e-06, "loss": 0.3009, "step": 3510 }, { "epoch": 0.5257753132059971, "grad_norm": 1.5014777183532715, "learning_rate": 5.425749721286471e-06, "loss": 0.3138, "step": 3520 }, { "epoch": 0.5272689930730596, "grad_norm": 2.0441832542419434, "learning_rate": 5.399767430053338e-06, "loss": 0.3317, "step": 3530 }, { "epoch": 0.5287626729401221, "grad_norm": 1.4949225187301636, "learning_rate": 5.373774269758178e-06, "loss": 0.3156, "step": 3540 }, { "epoch": 0.5302563528071846, "grad_norm": 1.5877892971038818, "learning_rate": 5.3477709471150716e-06, "loss": 0.2948, "step": 3550 }, { "epoch": 0.5317500326742471, "grad_norm": 1.809065580368042, "learning_rate": 5.321758169114396e-06, "loss": 0.3177, "step": 3560 }, { "epoch": 0.5332437125413095, "grad_norm": 1.4500436782836914, "learning_rate": 5.295736643003605e-06, "loss": 0.2974, "step": 3570 }, { "epoch": 0.534737392408372, "grad_norm": 1.2693545818328857, "learning_rate": 5.269707076268005e-06, "loss": 0.2848, "step": 3580 }, { "epoch": 0.5362310722754345, "grad_norm": 1.21388578414917, "learning_rate": 5.243670176611509e-06, "loss": 0.3199, "step": 3590 }, { "epoch": 0.537724752142497, "grad_norm": 1.2438586950302124, "learning_rate": 5.217626651937404e-06, "loss": 0.3064, "step": 3600 }, { "epoch": 0.5392184320095595, "grad_norm": 2.074819326400757, "learning_rate": 5.1915772103291e-06, "loss": 0.3081, "step": 3610 }, { "epoch": 0.540712111876622, "grad_norm": 1.8442296981811523, "learning_rate": 5.1655225600308765e-06, "loss": 0.3303, "step": 3620 }, { "epoch": 0.5422057917436846, "grad_norm": 1.3743780851364136, "learning_rate": 5.139463409428635e-06, "loss": 0.3368, "step": 3630 }, { "epoch": 0.543699471610747, "grad_norm": 1.440290927886963, "learning_rate": 5.113400467030632e-06, "loss": 0.3332, "step": 3640 }, { "epoch": 0.5451931514778096, "grad_norm": 1.7156881093978882, "learning_rate": 5.087334441448213e-06, "loss": 0.3164, "step": 3650 }, { "epoch": 0.5466868313448721, "grad_norm": 0.9851483106613159, "learning_rate": 5.061266041376553e-06, "loss": 0.3407, "step": 3660 }, { "epoch": 0.5481805112119345, "grad_norm": 1.2339802980422974, "learning_rate": 5.035195975575387e-06, "loss": 0.3115, "step": 3670 }, { "epoch": 0.549674191078997, "grad_norm": 1.1633639335632324, "learning_rate": 5.0091249528497374e-06, "loss": 0.3215, "step": 3680 }, { "epoch": 0.5511678709460595, "grad_norm": 1.794061303138733, "learning_rate": 4.983053682030642e-06, "loss": 0.3222, "step": 3690 }, { "epoch": 0.552661550813122, "grad_norm": 1.7158312797546387, "learning_rate": 4.95698287195589e-06, "loss": 0.3021, "step": 3700 }, { "epoch": 0.5541552306801845, "grad_norm": 1.7610148191452026, "learning_rate": 4.930913231450737e-06, "loss": 0.2871, "step": 3710 }, { "epoch": 0.555648910547247, "grad_norm": 1.8225277662277222, "learning_rate": 4.904845469308642e-06, "loss": 0.2988, "step": 3720 }, { "epoch": 0.5571425904143095, "grad_norm": 1.7422287464141846, "learning_rate": 4.8787802942719955e-06, "loss": 0.3258, "step": 3730 }, { "epoch": 0.558636270281372, "grad_norm": 1.324690818786621, "learning_rate": 4.8527184150128475e-06, "loss": 0.3182, "step": 3740 }, { "epoch": 0.5601299501484345, "grad_norm": 1.0865528583526611, "learning_rate": 4.82666054011364e-06, "loss": 0.309, "step": 3750 }, { "epoch": 0.561623630015497, "grad_norm": 1.5340676307678223, "learning_rate": 4.800607378047944e-06, "loss": 0.3356, "step": 3760 }, { "epoch": 0.5631173098825594, "grad_norm": 1.476318359375, "learning_rate": 4.774559637161197e-06, "loss": 0.31, "step": 3770 }, { "epoch": 0.5646109897496219, "grad_norm": 1.3898128271102905, "learning_rate": 4.74851802565144e-06, "loss": 0.3202, "step": 3780 }, { "epoch": 0.5661046696166844, "grad_norm": 1.4143530130386353, "learning_rate": 4.722483251550067e-06, "loss": 0.3445, "step": 3790 }, { "epoch": 0.5675983494837469, "grad_norm": 0.8102360963821411, "learning_rate": 4.696456022702574e-06, "loss": 0.3087, "step": 3800 }, { "epoch": 0.5690920293508094, "grad_norm": 1.0995668172836304, "learning_rate": 4.670437046749312e-06, "loss": 0.3077, "step": 3810 }, { "epoch": 0.5705857092178719, "grad_norm": 1.710694432258606, "learning_rate": 4.6444270311062496e-06, "loss": 0.3123, "step": 3820 }, { "epoch": 0.5720793890849344, "grad_norm": 1.50558602809906, "learning_rate": 4.618426682945736e-06, "loss": 0.3142, "step": 3830 }, { "epoch": 0.5735730689519969, "grad_norm": 1.3168991804122925, "learning_rate": 4.59243670917728e-06, "loss": 0.3349, "step": 3840 }, { "epoch": 0.5750667488190594, "grad_norm": 1.0681779384613037, "learning_rate": 4.566457816428326e-06, "loss": 0.3153, "step": 3850 }, { "epoch": 0.5765604286861219, "grad_norm": 1.5274810791015625, "learning_rate": 4.5404907110250364e-06, "loss": 0.3263, "step": 3860 }, { "epoch": 0.5780541085531843, "grad_norm": 1.5444824695587158, "learning_rate": 4.514536098973105e-06, "loss": 0.306, "step": 3870 }, { "epoch": 0.5795477884202468, "grad_norm": 1.126636028289795, "learning_rate": 4.488594685938541e-06, "loss": 0.3122, "step": 3880 }, { "epoch": 0.5810414682873093, "grad_norm": 1.2185169458389282, "learning_rate": 4.462667177228496e-06, "loss": 0.2975, "step": 3890 }, { "epoch": 0.5825351481543718, "grad_norm": 1.721125602722168, "learning_rate": 4.4367542777720854e-06, "loss": 0.3174, "step": 3900 }, { "epoch": 0.5840288280214343, "grad_norm": 1.476317048072815, "learning_rate": 4.410856692101219e-06, "loss": 0.3093, "step": 3910 }, { "epoch": 0.5855225078884968, "grad_norm": 1.5350698232650757, "learning_rate": 4.384975124331451e-06, "loss": 0.3243, "step": 3920 }, { "epoch": 0.5870161877555593, "grad_norm": 1.8953022956848145, "learning_rate": 4.35911027814283e-06, "loss": 0.319, "step": 3930 }, { "epoch": 0.5885098676226218, "grad_norm": 1.768258810043335, "learning_rate": 4.333262856760774e-06, "loss": 0.3073, "step": 3940 }, { "epoch": 0.5900035474896843, "grad_norm": 0.974807858467102, "learning_rate": 4.3074335629369455e-06, "loss": 0.3208, "step": 3950 }, { "epoch": 0.5914972273567468, "grad_norm": 1.3250782489776611, "learning_rate": 4.281623098930148e-06, "loss": 0.2884, "step": 3960 }, { "epoch": 0.5929909072238092, "grad_norm": 1.5974177122116089, "learning_rate": 4.25583216648723e-06, "loss": 0.2861, "step": 3970 }, { "epoch": 0.5944845870908717, "grad_norm": 1.2887296676635742, "learning_rate": 4.2300614668240065e-06, "loss": 0.3445, "step": 3980 }, { "epoch": 0.5959782669579342, "grad_norm": 2.0698602199554443, "learning_rate": 4.204311700606195e-06, "loss": 0.3091, "step": 3990 }, { "epoch": 0.5974719468249967, "grad_norm": 1.6275320053100586, "learning_rate": 4.1785835679303635e-06, "loss": 0.3223, "step": 4000 }, { "epoch": 0.5974719468249967, "eval_loss": 0.31077098846435547, "eval_runtime": 76.2168, "eval_samples_per_second": 7.098, "eval_steps_per_second": 3.556, "step": 4000 }, { "epoch": 0.5989656266920592, "grad_norm": 1.140994906425476, "learning_rate": 4.152877768304898e-06, "loss": 0.316, "step": 4010 }, { "epoch": 0.6004593065591217, "grad_norm": 1.963865041732788, "learning_rate": 4.127195000630987e-06, "loss": 0.3173, "step": 4020 }, { "epoch": 0.6019529864261842, "grad_norm": 1.7010706663131714, "learning_rate": 4.1015359631836085e-06, "loss": 0.3318, "step": 4030 }, { "epoch": 0.6034466662932467, "grad_norm": 1.9144036769866943, "learning_rate": 4.0759013535925575e-06, "loss": 0.3229, "step": 4040 }, { "epoch": 0.6049403461603092, "grad_norm": 1.5420873165130615, "learning_rate": 4.050291868823469e-06, "loss": 0.2952, "step": 4050 }, { "epoch": 0.6064340260273717, "grad_norm": 1.2293835878372192, "learning_rate": 4.0247082051588794e-06, "loss": 0.3273, "step": 4060 }, { "epoch": 0.6079277058944341, "grad_norm": 1.203016996383667, "learning_rate": 3.999151058179283e-06, "loss": 0.3301, "step": 4070 }, { "epoch": 0.6094213857614966, "grad_norm": 1.7640305757522583, "learning_rate": 3.973621122744226e-06, "loss": 0.3217, "step": 4080 }, { "epoch": 0.6109150656285591, "grad_norm": 0.998776912689209, "learning_rate": 3.9481190929734185e-06, "loss": 0.2961, "step": 4090 }, { "epoch": 0.6124087454956216, "grad_norm": 1.3041551113128662, "learning_rate": 3.922645662227854e-06, "loss": 0.3178, "step": 4100 }, { "epoch": 0.6139024253626841, "grad_norm": 1.2349125146865845, "learning_rate": 3.897201523090967e-06, "loss": 0.2985, "step": 4110 }, { "epoch": 0.6153961052297466, "grad_norm": 1.5378309488296509, "learning_rate": 3.8717873673497945e-06, "loss": 0.2987, "step": 4120 }, { "epoch": 0.6168897850968091, "grad_norm": 1.2633869647979736, "learning_rate": 3.846403885976175e-06, "loss": 0.2989, "step": 4130 }, { "epoch": 0.6183834649638716, "grad_norm": 1.7205194234848022, "learning_rate": 3.821051769107952e-06, "loss": 0.3105, "step": 4140 }, { "epoch": 0.6198771448309341, "grad_norm": 2.0533735752105713, "learning_rate": 3.7957317060302225e-06, "loss": 0.3204, "step": 4150 }, { "epoch": 0.6213708246979966, "grad_norm": 1.609309434890747, "learning_rate": 3.770444385156587e-06, "loss": 0.3107, "step": 4160 }, { "epoch": 0.622864504565059, "grad_norm": 1.2998265027999878, "learning_rate": 3.745190494010436e-06, "loss": 0.3101, "step": 4170 }, { "epoch": 0.6243581844321215, "grad_norm": 2.1723592281341553, "learning_rate": 3.7199707192062578e-06, "loss": 0.2887, "step": 4180 }, { "epoch": 0.625851864299184, "grad_norm": 1.8159315586090088, "learning_rate": 3.6947857464309695e-06, "loss": 0.3088, "step": 4190 }, { "epoch": 0.6273455441662465, "grad_norm": 1.644352912902832, "learning_rate": 3.6696362604252734e-06, "loss": 0.3128, "step": 4200 }, { "epoch": 0.628839224033309, "grad_norm": 2.0270018577575684, "learning_rate": 3.6445229449650443e-06, "loss": 0.3324, "step": 4210 }, { "epoch": 0.6303329039003716, "grad_norm": 1.4132014513015747, "learning_rate": 3.6194464828427324e-06, "loss": 0.3078, "step": 4220 }, { "epoch": 0.6318265837674341, "grad_norm": 1.3948049545288086, "learning_rate": 3.5944075558488e-06, "loss": 0.315, "step": 4230 }, { "epoch": 0.6333202636344966, "grad_norm": 1.699584722518921, "learning_rate": 3.569406844753196e-06, "loss": 0.3218, "step": 4240 }, { "epoch": 0.6348139435015591, "grad_norm": 1.175880789756775, "learning_rate": 3.544445029286829e-06, "loss": 0.3271, "step": 4250 }, { "epoch": 0.6363076233686216, "grad_norm": 1.1815729141235352, "learning_rate": 3.5195227881230985e-06, "loss": 0.3202, "step": 4260 }, { "epoch": 0.637801303235684, "grad_norm": 1.096339464187622, "learning_rate": 3.4946407988594394e-06, "loss": 0.3212, "step": 4270 }, { "epoch": 0.6392949831027465, "grad_norm": 2.088253974914551, "learning_rate": 3.4697997379988983e-06, "loss": 0.3117, "step": 4280 }, { "epoch": 0.640788662969809, "grad_norm": 1.1896518468856812, "learning_rate": 3.445000280931743e-06, "loss": 0.3055, "step": 4290 }, { "epoch": 0.6422823428368715, "grad_norm": 1.6314555406570435, "learning_rate": 3.4202431019170964e-06, "loss": 0.313, "step": 4300 }, { "epoch": 0.643776022703934, "grad_norm": 1.4211452007293701, "learning_rate": 3.3955288740646064e-06, "loss": 0.2967, "step": 4310 }, { "epoch": 0.6452697025709965, "grad_norm": 1.8873090744018555, "learning_rate": 3.3708582693161473e-06, "loss": 0.3218, "step": 4320 }, { "epoch": 0.646763382438059, "grad_norm": 1.0354822874069214, "learning_rate": 3.346231958427546e-06, "loss": 0.3155, "step": 4330 }, { "epoch": 0.6482570623051215, "grad_norm": 1.7193752527236938, "learning_rate": 3.3216506109503478e-06, "loss": 0.2933, "step": 4340 }, { "epoch": 0.649750742172184, "grad_norm": 1.969494104385376, "learning_rate": 3.297114895213611e-06, "loss": 0.3086, "step": 4350 }, { "epoch": 0.6512444220392465, "grad_norm": 1.3515018224716187, "learning_rate": 3.2726254783057388e-06, "loss": 0.3012, "step": 4360 }, { "epoch": 0.6527381019063089, "grad_norm": 1.2439565658569336, "learning_rate": 3.2481830260563393e-06, "loss": 0.3175, "step": 4370 }, { "epoch": 0.6542317817733714, "grad_norm": 2.0741679668426514, "learning_rate": 3.2237882030181227e-06, "loss": 0.3281, "step": 4380 }, { "epoch": 0.6557254616404339, "grad_norm": 1.3941818475723267, "learning_rate": 3.199441672448838e-06, "loss": 0.3179, "step": 4390 }, { "epoch": 0.6572191415074964, "grad_norm": 1.3664950132369995, "learning_rate": 3.1751440962932324e-06, "loss": 0.3252, "step": 4400 }, { "epoch": 0.6587128213745589, "grad_norm": 1.3657866716384888, "learning_rate": 3.150896135165059e-06, "loss": 0.3274, "step": 4410 }, { "epoch": 0.6602065012416214, "grad_norm": 1.4565297365188599, "learning_rate": 3.126698448329112e-06, "loss": 0.319, "step": 4420 }, { "epoch": 0.6617001811086839, "grad_norm": 1.585686445236206, "learning_rate": 3.1025516936833122e-06, "loss": 0.2937, "step": 4430 }, { "epoch": 0.6631938609757464, "grad_norm": 1.6105479001998901, "learning_rate": 3.0784565277408063e-06, "loss": 0.3247, "step": 4440 }, { "epoch": 0.6646875408428089, "grad_norm": 1.0377700328826904, "learning_rate": 3.0544136056121232e-06, "loss": 0.3215, "step": 4450 }, { "epoch": 0.6661812207098714, "grad_norm": 1.4693603515625, "learning_rate": 3.0304235809873654e-06, "loss": 0.3016, "step": 4460 }, { "epoch": 0.6676749005769338, "grad_norm": 1.3283636569976807, "learning_rate": 3.006487106118433e-06, "loss": 0.3024, "step": 4470 }, { "epoch": 0.6691685804439963, "grad_norm": 1.0531262159347534, "learning_rate": 2.982604831801289e-06, "loss": 0.3287, "step": 4480 }, { "epoch": 0.6706622603110588, "grad_norm": 1.6268073320388794, "learning_rate": 2.9587774073582677e-06, "loss": 0.306, "step": 4490 }, { "epoch": 0.6721559401781213, "grad_norm": 1.7072473764419556, "learning_rate": 2.9350054806204214e-06, "loss": 0.3346, "step": 4500 }, { "epoch": 0.6721559401781213, "eval_loss": 0.307580828666687, "eval_runtime": 76.4751, "eval_samples_per_second": 7.074, "eval_steps_per_second": 3.544, "step": 4500 }, { "epoch": 0.6736496200451838, "grad_norm": 1.3722172975540161, "learning_rate": 2.9112896979099037e-06, "loss": 0.3213, "step": 4510 }, { "epoch": 0.6751432999122463, "grad_norm": 0.8439034819602966, "learning_rate": 2.8876307040223956e-06, "loss": 0.3102, "step": 4520 }, { "epoch": 0.6766369797793088, "grad_norm": 2.1569015979766846, "learning_rate": 2.864029142209579e-06, "loss": 0.3189, "step": 4530 }, { "epoch": 0.6781306596463713, "grad_norm": 0.9060570597648621, "learning_rate": 2.840485654161651e-06, "loss": 0.2811, "step": 4540 }, { "epoch": 0.6796243395134338, "grad_norm": 1.4373691082000732, "learning_rate": 2.817000879989866e-06, "loss": 0.3052, "step": 4550 }, { "epoch": 0.6811180193804963, "grad_norm": 1.3326523303985596, "learning_rate": 2.7935754582091413e-06, "loss": 0.3184, "step": 4560 }, { "epoch": 0.6826116992475587, "grad_norm": 1.3754558563232422, "learning_rate": 2.770210025720691e-06, "loss": 0.3192, "step": 4570 }, { "epoch": 0.6841053791146212, "grad_norm": 2.0747873783111572, "learning_rate": 2.746905217794715e-06, "loss": 0.3408, "step": 4580 }, { "epoch": 0.6855990589816837, "grad_norm": 1.3364531993865967, "learning_rate": 2.7236616680531256e-06, "loss": 0.3005, "step": 4590 }, { "epoch": 0.6870927388487462, "grad_norm": 1.6091829538345337, "learning_rate": 2.7004800084523166e-06, "loss": 0.3288, "step": 4600 }, { "epoch": 0.6885864187158087, "grad_norm": 1.1656900644302368, "learning_rate": 2.6773608692659825e-06, "loss": 0.2837, "step": 4610 }, { "epoch": 0.6900800985828712, "grad_norm": 1.4220030307769775, "learning_rate": 2.6543048790679915e-06, "loss": 0.3119, "step": 4620 }, { "epoch": 0.6915737784499337, "grad_norm": 1.688082218170166, "learning_rate": 2.63131266471528e-06, "loss": 0.3282, "step": 4630 }, { "epoch": 0.6930674583169962, "grad_norm": 1.2834751605987549, "learning_rate": 2.60838485133082e-06, "loss": 0.3018, "step": 4640 }, { "epoch": 0.6945611381840587, "grad_norm": 1.5603129863739014, "learning_rate": 2.5855220622866197e-06, "loss": 0.3035, "step": 4650 }, { "epoch": 0.6960548180511212, "grad_norm": 1.6552413702011108, "learning_rate": 2.562724919186777e-06, "loss": 0.321, "step": 4660 }, { "epoch": 0.6975484979181836, "grad_norm": 1.5287736654281616, "learning_rate": 2.5399940418505754e-06, "loss": 0.3229, "step": 4670 }, { "epoch": 0.6990421777852461, "grad_norm": 1.5035234689712524, "learning_rate": 2.5173300482956346e-06, "loss": 0.2946, "step": 4680 }, { "epoch": 0.7005358576523086, "grad_norm": 2.163083791732788, "learning_rate": 2.4947335547211083e-06, "loss": 0.3239, "step": 4690 }, { "epoch": 0.7020295375193711, "grad_norm": 1.5969173908233643, "learning_rate": 2.472205175490928e-06, "loss": 0.3033, "step": 4700 }, { "epoch": 0.7035232173864336, "grad_norm": 1.2077685594558716, "learning_rate": 2.4497455231171003e-06, "loss": 0.3142, "step": 4710 }, { "epoch": 0.7050168972534961, "grad_norm": 1.0711603164672852, "learning_rate": 2.4273552082430586e-06, "loss": 0.292, "step": 4720 }, { "epoch": 0.7065105771205586, "grad_norm": 1.1325751543045044, "learning_rate": 2.405034839627051e-06, "loss": 0.3309, "step": 4730 }, { "epoch": 0.7080042569876212, "grad_norm": 1.3801145553588867, "learning_rate": 2.3827850241255974e-06, "loss": 0.3266, "step": 4740 }, { "epoch": 0.7094979368546837, "grad_norm": 1.4642720222473145, "learning_rate": 2.3606063666769846e-06, "loss": 0.2985, "step": 4750 }, { "epoch": 0.7109916167217462, "grad_norm": 1.8076415061950684, "learning_rate": 2.3384994702848234e-06, "loss": 0.3185, "step": 4760 }, { "epoch": 0.7124852965888087, "grad_norm": 1.7433451414108276, "learning_rate": 2.3164649360016505e-06, "loss": 0.3004, "step": 4770 }, { "epoch": 0.713978976455871, "grad_norm": 1.4180279970169067, "learning_rate": 2.294503362912589e-06, "loss": 0.3193, "step": 4780 }, { "epoch": 0.7154726563229336, "grad_norm": 1.5062882900238037, "learning_rate": 2.2726153481190588e-06, "loss": 0.3233, "step": 4790 }, { "epoch": 0.7169663361899961, "grad_norm": 1.4006506204605103, "learning_rate": 2.250801486722541e-06, "loss": 0.3125, "step": 4800 }, { "epoch": 0.7184600160570586, "grad_norm": 1.7776737213134766, "learning_rate": 2.2290623718084052e-06, "loss": 0.2971, "step": 4810 }, { "epoch": 0.7199536959241211, "grad_norm": 1.5043376684188843, "learning_rate": 2.207398594429773e-06, "loss": 0.2992, "step": 4820 }, { "epoch": 0.7214473757911836, "grad_norm": 2.0632483959198, "learning_rate": 2.185810743591458e-06, "loss": 0.3223, "step": 4830 }, { "epoch": 0.7229410556582461, "grad_norm": 1.3994874954223633, "learning_rate": 2.1642994062339458e-06, "loss": 0.3374, "step": 4840 }, { "epoch": 0.7244347355253086, "grad_norm": 1.8748818635940552, "learning_rate": 2.1428651672174382e-06, "loss": 0.308, "step": 4850 }, { "epoch": 0.7259284153923711, "grad_norm": 1.1066455841064453, "learning_rate": 2.1215086093059527e-06, "loss": 0.2935, "step": 4860 }, { "epoch": 0.7274220952594336, "grad_norm": 1.480527400970459, "learning_rate": 2.100230313151476e-06, "loss": 0.3537, "step": 4870 }, { "epoch": 0.728915775126496, "grad_norm": 1.5903476476669312, "learning_rate": 2.079030857278179e-06, "loss": 0.3039, "step": 4880 }, { "epoch": 0.7304094549935585, "grad_norm": 1.1910865306854248, "learning_rate": 2.057910818066684e-06, "loss": 0.3233, "step": 4890 }, { "epoch": 0.731903134860621, "grad_norm": 1.305713176727295, "learning_rate": 2.036870769738401e-06, "loss": 0.3295, "step": 4900 }, { "epoch": 0.7333968147276835, "grad_norm": 1.8359174728393555, "learning_rate": 2.0159112843399066e-06, "loss": 0.3121, "step": 4910 }, { "epoch": 0.734890494594746, "grad_norm": 1.200527548789978, "learning_rate": 1.995032931727396e-06, "loss": 0.3155, "step": 4920 }, { "epoch": 0.7363841744618085, "grad_norm": 0.9834126234054565, "learning_rate": 1.97423627955119e-06, "loss": 0.3086, "step": 4930 }, { "epoch": 0.737877854328871, "grad_norm": 1.8965601921081543, "learning_rate": 1.9535218932402987e-06, "loss": 0.296, "step": 4940 }, { "epoch": 0.7393715341959335, "grad_norm": 1.8559459447860718, "learning_rate": 1.9328903359870504e-06, "loss": 0.2943, "step": 4950 }, { "epoch": 0.740865214062996, "grad_norm": 1.5035439729690552, "learning_rate": 1.9123421687317784e-06, "loss": 0.3121, "step": 4960 }, { "epoch": 0.7423588939300585, "grad_norm": 1.1852291822433472, "learning_rate": 1.8918779501475708e-06, "loss": 0.3158, "step": 4970 }, { "epoch": 0.7438525737971209, "grad_norm": 1.259185791015625, "learning_rate": 1.8714982366250796e-06, "loss": 0.2938, "step": 4980 }, { "epoch": 0.7453462536641834, "grad_norm": 1.5034098625183105, "learning_rate": 1.8512035822573915e-06, "loss": 0.2949, "step": 4990 }, { "epoch": 0.7468399335312459, "grad_norm": 1.282578706741333, "learning_rate": 1.8309945388249733e-06, "loss": 0.3098, "step": 5000 }, { "epoch": 0.7468399335312459, "eval_loss": 0.30645084381103516, "eval_runtime": 76.3887, "eval_samples_per_second": 7.082, "eval_steps_per_second": 3.548, "step": 5000 }, { "epoch": 0.7483336133983084, "grad_norm": 1.2789969444274902, "learning_rate": 1.8108716557806545e-06, "loss": 0.3168, "step": 5010 }, { "epoch": 0.7498272932653709, "grad_norm": 1.0109808444976807, "learning_rate": 1.7908354802346982e-06, "loss": 0.2843, "step": 5020 }, { "epoch": 0.7513209731324334, "grad_norm": 1.3991084098815918, "learning_rate": 1.7708865569399247e-06, "loss": 0.3324, "step": 5030 }, { "epoch": 0.7528146529994959, "grad_norm": 1.529976725578308, "learning_rate": 1.751025428276899e-06, "loss": 0.3152, "step": 5040 }, { "epoch": 0.7543083328665584, "grad_norm": 1.9336539506912231, "learning_rate": 1.7312526342391862e-06, "loss": 0.3077, "step": 5050 }, { "epoch": 0.7558020127336209, "grad_norm": 1.4918617010116577, "learning_rate": 1.7115687124186658e-06, "loss": 0.3139, "step": 5060 }, { "epoch": 0.7572956926006834, "grad_norm": 2.2446916103363037, "learning_rate": 1.6919741979909222e-06, "loss": 0.3278, "step": 5070 }, { "epoch": 0.7587893724677458, "grad_norm": 1.2982836961746216, "learning_rate": 1.6724696237006848e-06, "loss": 0.3063, "step": 5080 }, { "epoch": 0.7602830523348083, "grad_norm": 1.0719565153121948, "learning_rate": 1.653055519847357e-06, "loss": 0.2921, "step": 5090 }, { "epoch": 0.7617767322018708, "grad_norm": 1.5067294836044312, "learning_rate": 1.6337324142705836e-06, "loss": 0.3102, "step": 5100 }, { "epoch": 0.7632704120689333, "grad_norm": 1.2776610851287842, "learning_rate": 1.6145008323359068e-06, "loss": 0.2969, "step": 5110 }, { "epoch": 0.7647640919359958, "grad_norm": 1.1912457942962646, "learning_rate": 1.5953612969204834e-06, "loss": 0.2682, "step": 5120 }, { "epoch": 0.7662577718030583, "grad_norm": 1.404762625694275, "learning_rate": 1.5763143283988663e-06, "loss": 0.2963, "step": 5130 }, { "epoch": 0.7677514516701208, "grad_norm": 1.2275928258895874, "learning_rate": 1.5573604446288572e-06, "loss": 0.2801, "step": 5140 }, { "epoch": 0.7692451315371833, "grad_norm": 1.437886118888855, "learning_rate": 1.538500160937424e-06, "loss": 0.31, "step": 5150 }, { "epoch": 0.7707388114042458, "grad_norm": 1.3553423881530762, "learning_rate": 1.519733990106696e-06, "loss": 0.2946, "step": 5160 }, { "epoch": 0.7722324912713083, "grad_norm": 1.8724462985992432, "learning_rate": 1.5010624423600161e-06, "loss": 0.294, "step": 5170 }, { "epoch": 0.7737261711383707, "grad_norm": 1.0624821186065674, "learning_rate": 1.48248602534807e-06, "loss": 0.3292, "step": 5180 }, { "epoch": 0.7752198510054332, "grad_norm": 1.6190390586853027, "learning_rate": 1.4640052441350893e-06, "loss": 0.3258, "step": 5190 }, { "epoch": 0.7767135308724957, "grad_norm": 1.0761600732803345, "learning_rate": 1.4456206011851115e-06, "loss": 0.3226, "step": 5200 }, { "epoch": 0.7782072107395582, "grad_norm": 1.5092500448226929, "learning_rate": 1.4273325963483226e-06, "loss": 0.2854, "step": 5210 }, { "epoch": 0.7797008906066207, "grad_norm": 1.7542755603790283, "learning_rate": 1.4091417268474683e-06, "loss": 0.3071, "step": 5220 }, { "epoch": 0.7811945704736832, "grad_norm": 1.1649888753890991, "learning_rate": 1.3910484872643326e-06, "loss": 0.3309, "step": 5230 }, { "epoch": 0.7826882503407457, "grad_norm": 1.1616461277008057, "learning_rate": 1.3730533695262927e-06, "loss": 0.285, "step": 5240 }, { "epoch": 0.7841819302078082, "grad_norm": 1.3221337795257568, "learning_rate": 1.3551568628929434e-06, "loss": 0.3119, "step": 5250 }, { "epoch": 0.7856756100748707, "grad_norm": 1.7051467895507812, "learning_rate": 1.3373594539427941e-06, "loss": 0.3262, "step": 5260 }, { "epoch": 0.7871692899419332, "grad_norm": 1.3770607709884644, "learning_rate": 1.3196616265600442e-06, "loss": 0.2957, "step": 5270 }, { "epoch": 0.7886629698089956, "grad_norm": 2.04876708984375, "learning_rate": 1.3020638619214199e-06, "loss": 0.3109, "step": 5280 }, { "epoch": 0.7901566496760581, "grad_norm": 1.0968056917190552, "learning_rate": 1.2845666384830951e-06, "loss": 0.325, "step": 5290 }, { "epoch": 0.7916503295431206, "grad_norm": 1.4080263376235962, "learning_rate": 1.2671704319676847e-06, "loss": 0.3151, "step": 5300 }, { "epoch": 0.7931440094101831, "grad_norm": 1.1893798112869263, "learning_rate": 1.2498757153513075e-06, "loss": 0.3196, "step": 5310 }, { "epoch": 0.7946376892772457, "grad_norm": 1.286380410194397, "learning_rate": 1.2326829588507282e-06, "loss": 0.3288, "step": 5320 }, { "epoch": 0.7961313691443082, "grad_norm": 1.841400384902954, "learning_rate": 1.2155926299105737e-06, "loss": 0.3035, "step": 5330 }, { "epoch": 0.7976250490113707, "grad_norm": 1.0659278631210327, "learning_rate": 1.1986051931906207e-06, "loss": 0.3368, "step": 5340 }, { "epoch": 0.7991187288784332, "grad_norm": 1.0193463563919067, "learning_rate": 1.1817211105531667e-06, "loss": 0.3063, "step": 5350 }, { "epoch": 0.8006124087454957, "grad_norm": 1.656656265258789, "learning_rate": 1.1649408410504686e-06, "loss": 0.3059, "step": 5360 }, { "epoch": 0.8021060886125582, "grad_norm": 1.4112011194229126, "learning_rate": 1.148264840912267e-06, "loss": 0.3059, "step": 5370 }, { "epoch": 0.8035997684796206, "grad_norm": 0.9142074584960938, "learning_rate": 1.131693563533376e-06, "loss": 0.3003, "step": 5380 }, { "epoch": 0.8050934483466831, "grad_norm": 1.3341706991195679, "learning_rate": 1.1152274594613588e-06, "loss": 0.3185, "step": 5390 }, { "epoch": 0.8065871282137456, "grad_norm": 1.5966401100158691, "learning_rate": 1.0988669763842786e-06, "loss": 0.3394, "step": 5400 }, { "epoch": 0.8080808080808081, "grad_norm": 1.6179472208023071, "learning_rate": 1.0826125591185265e-06, "loss": 0.3209, "step": 5410 }, { "epoch": 0.8095744879478706, "grad_norm": 1.7330031394958496, "learning_rate": 1.0664646495967263e-06, "loss": 0.3303, "step": 5420 }, { "epoch": 0.8110681678149331, "grad_norm": 1.7587456703186035, "learning_rate": 1.050423686855721e-06, "loss": 0.3356, "step": 5430 }, { "epoch": 0.8125618476819956, "grad_norm": 1.2351861000061035, "learning_rate": 1.0344901070246332e-06, "loss": 0.2924, "step": 5440 }, { "epoch": 0.8140555275490581, "grad_norm": 1.0523043870925903, "learning_rate": 1.0186643433130128e-06, "loss": 0.314, "step": 5450 }, { "epoch": 0.8155492074161206, "grad_norm": 1.5923221111297607, "learning_rate": 1.0029468259990515e-06, "loss": 0.2991, "step": 5460 }, { "epoch": 0.8170428872831831, "grad_norm": 1.6099388599395752, "learning_rate": 9.873379824178886e-07, "loss": 0.3055, "step": 5470 }, { "epoch": 0.8185365671502455, "grad_norm": 1.1426987648010254, "learning_rate": 9.718382369499936e-07, "loss": 0.2959, "step": 5480 }, { "epoch": 0.820030247017308, "grad_norm": 2.282841444015503, "learning_rate": 9.564480110096226e-07, "loss": 0.3473, "step": 5490 }, { "epoch": 0.8215239268843705, "grad_norm": 1.753849983215332, "learning_rate": 9.411677230333672e-07, "loss": 0.2938, "step": 5500 }, { "epoch": 0.8215239268843705, "eval_loss": 0.30588287115097046, "eval_runtime": 76.2019, "eval_samples_per_second": 7.1, "eval_steps_per_second": 3.556, "step": 5500 }, { "epoch": 0.823017606751433, "grad_norm": 1.5381008386611938, "learning_rate": 9.259977884687726e-07, "loss": 0.3001, "step": 5510 }, { "epoch": 0.8245112866184955, "grad_norm": 1.2274119853973389, "learning_rate": 9.10938619763046e-07, "loss": 0.2968, "step": 5520 }, { "epoch": 0.826004966485558, "grad_norm": 1.2566465139389038, "learning_rate": 8.959906263518398e-07, "loss": 0.3135, "step": 5530 }, { "epoch": 0.8274986463526205, "grad_norm": 1.4085761308670044, "learning_rate": 8.811542146481223e-07, "loss": 0.3067, "step": 5540 }, { "epoch": 0.828992326219683, "grad_norm": 1.507485032081604, "learning_rate": 8.664297880311234e-07, "loss": 0.3254, "step": 5550 }, { "epoch": 0.8304860060867455, "grad_norm": 1.4448457956314087, "learning_rate": 8.518177468353767e-07, "loss": 0.3273, "step": 5560 }, { "epoch": 0.831979685953808, "grad_norm": 1.4602904319763184, "learning_rate": 8.373184883398239e-07, "loss": 0.2887, "step": 5570 }, { "epoch": 0.8334733658208704, "grad_norm": 2.004305601119995, "learning_rate": 8.229324067570193e-07, "loss": 0.3068, "step": 5580 }, { "epoch": 0.8349670456879329, "grad_norm": 1.5558735132217407, "learning_rate": 8.086598932224116e-07, "loss": 0.3012, "step": 5590 }, { "epoch": 0.8364607255549954, "grad_norm": 1.6343220472335815, "learning_rate": 7.945013357837089e-07, "loss": 0.3052, "step": 5600 }, { "epoch": 0.8379544054220579, "grad_norm": 1.551553726196289, "learning_rate": 7.804571193903277e-07, "loss": 0.3024, "step": 5610 }, { "epoch": 0.8394480852891204, "grad_norm": 1.437134027481079, "learning_rate": 7.665276258829274e-07, "loss": 0.312, "step": 5620 }, { "epoch": 0.8409417651561829, "grad_norm": 1.3783475160598755, "learning_rate": 7.527132339830273e-07, "loss": 0.2973, "step": 5630 }, { "epoch": 0.8424354450232454, "grad_norm": 1.8350893259048462, "learning_rate": 7.390143192827148e-07, "loss": 0.3183, "step": 5640 }, { "epoch": 0.8439291248903079, "grad_norm": 1.1564915180206299, "learning_rate": 7.25431254234425e-07, "loss": 0.281, "step": 5650 }, { "epoch": 0.8454228047573704, "grad_norm": 2.0109055042266846, "learning_rate": 7.119644081408216e-07, "loss": 0.3059, "step": 5660 }, { "epoch": 0.8469164846244329, "grad_norm": 1.2296003103256226, "learning_rate": 6.986141471447533e-07, "loss": 0.3149, "step": 5670 }, { "epoch": 0.8484101644914953, "grad_norm": 1.5590568780899048, "learning_rate": 6.853808342192981e-07, "loss": 0.31, "step": 5680 }, { "epoch": 0.8499038443585578, "grad_norm": 1.2360633611679077, "learning_rate": 6.72264829157896e-07, "loss": 0.306, "step": 5690 }, { "epoch": 0.8513975242256203, "grad_norm": 0.8478025197982788, "learning_rate": 6.592664885645678e-07, "loss": 0.2989, "step": 5700 }, { "epoch": 0.8528912040926828, "grad_norm": 2.2121694087982178, "learning_rate": 6.463861658442166e-07, "loss": 0.3025, "step": 5710 }, { "epoch": 0.8543848839597453, "grad_norm": 1.8200898170471191, "learning_rate": 6.336242111930224e-07, "loss": 0.2983, "step": 5720 }, { "epoch": 0.8558785638268078, "grad_norm": 1.2733746767044067, "learning_rate": 6.209809715889182e-07, "loss": 0.3251, "step": 5730 }, { "epoch": 0.8573722436938703, "grad_norm": 1.0714962482452393, "learning_rate": 6.084567907821559e-07, "loss": 0.3361, "step": 5740 }, { "epoch": 0.8588659235609328, "grad_norm": 1.6452571153640747, "learning_rate": 5.960520092859668e-07, "loss": 0.3235, "step": 5750 }, { "epoch": 0.8603596034279953, "grad_norm": 0.9949471354484558, "learning_rate": 5.837669643672927e-07, "loss": 0.3074, "step": 5760 }, { "epoch": 0.8618532832950578, "grad_norm": 1.2611801624298096, "learning_rate": 5.716019900376257e-07, "loss": 0.2955, "step": 5770 }, { "epoch": 0.8633469631621202, "grad_norm": 1.4436218738555908, "learning_rate": 5.595574170439199e-07, "loss": 0.3071, "step": 5780 }, { "epoch": 0.8648406430291827, "grad_norm": 1.6352945566177368, "learning_rate": 5.476335728596061e-07, "loss": 0.327, "step": 5790 }, { "epoch": 0.8663343228962452, "grad_norm": 1.846356749534607, "learning_rate": 5.358307816756803e-07, "loss": 0.3174, "step": 5800 }, { "epoch": 0.8678280027633077, "grad_norm": 1.2852689027786255, "learning_rate": 5.24149364391895e-07, "loss": 0.3086, "step": 5810 }, { "epoch": 0.8693216826303702, "grad_norm": 1.4159107208251953, "learning_rate": 5.125896386080348e-07, "loss": 0.2913, "step": 5820 }, { "epoch": 0.8708153624974327, "grad_norm": 1.575850248336792, "learning_rate": 5.011519186152775e-07, "loss": 0.2937, "step": 5830 }, { "epoch": 0.8723090423644952, "grad_norm": 1.3794643878936768, "learning_rate": 4.898365153876505e-07, "loss": 0.3049, "step": 5840 }, { "epoch": 0.8738027222315577, "grad_norm": 1.2364630699157715, "learning_rate": 4.78643736573578e-07, "loss": 0.3129, "step": 5850 }, { "epoch": 0.8752964020986203, "grad_norm": 0.8901196122169495, "learning_rate": 4.675738864875134e-07, "loss": 0.2912, "step": 5860 }, { "epoch": 0.8767900819656828, "grad_norm": 1.1799402236938477, "learning_rate": 4.566272661016674e-07, "loss": 0.3204, "step": 5870 }, { "epoch": 0.8782837618327451, "grad_norm": 1.7847167253494263, "learning_rate": 4.4580417303782487e-07, "loss": 0.3081, "step": 5880 }, { "epoch": 0.8797774416998076, "grad_norm": 2.1496951580047607, "learning_rate": 4.3510490155925235e-07, "loss": 0.3114, "step": 5890 }, { "epoch": 0.8812711215668702, "grad_norm": 1.502504587173462, "learning_rate": 4.245297425626971e-07, "loss": 0.2944, "step": 5900 }, { "epoch": 0.8827648014339327, "grad_norm": 1.207311749458313, "learning_rate": 4.140789835704806e-07, "loss": 0.3059, "step": 5910 }, { "epoch": 0.8842584813009952, "grad_norm": 1.821098804473877, "learning_rate": 4.0375290872267825e-07, "loss": 0.2872, "step": 5920 }, { "epoch": 0.8857521611680577, "grad_norm": 1.6243329048156738, "learning_rate": 3.935517987693932e-07, "loss": 0.3064, "step": 5930 }, { "epoch": 0.8872458410351202, "grad_norm": 1.5818045139312744, "learning_rate": 3.8347593106312974e-07, "loss": 0.2777, "step": 5940 }, { "epoch": 0.8887395209021827, "grad_norm": 1.2436670064926147, "learning_rate": 3.7352557955124437e-07, "loss": 0.3014, "step": 5950 }, { "epoch": 0.8902332007692452, "grad_norm": 1.4755676984786987, "learning_rate": 3.637010147685016e-07, "loss": 0.312, "step": 5960 }, { "epoch": 0.8917268806363077, "grad_norm": 1.5071330070495605, "learning_rate": 3.540025038297196e-07, "loss": 0.3246, "step": 5970 }, { "epoch": 0.8932205605033701, "grad_norm": 1.5448203086853027, "learning_rate": 3.44430310422505e-07, "loss": 0.301, "step": 5980 }, { "epoch": 0.8947142403704326, "grad_norm": 1.1398577690124512, "learning_rate": 3.3498469480008454e-07, "loss": 0.2993, "step": 5990 }, { "epoch": 0.8962079202374951, "grad_norm": 1.4388718605041504, "learning_rate": 3.256659137742313e-07, "loss": 0.315, "step": 6000 }, { "epoch": 0.8962079202374951, "eval_loss": 0.30459803342819214, "eval_runtime": 76.2056, "eval_samples_per_second": 7.099, "eval_steps_per_second": 3.556, "step": 6000 }, { "epoch": 0.8977016001045576, "grad_norm": 1.604347825050354, "learning_rate": 3.164742207082788e-07, "loss": 0.319, "step": 6010 }, { "epoch": 0.8991952799716201, "grad_norm": 1.558813214302063, "learning_rate": 3.0740986551023535e-07, "loss": 0.3084, "step": 6020 }, { "epoch": 0.9006889598386826, "grad_norm": 1.7928036451339722, "learning_rate": 2.9847309462598726e-07, "loss": 0.3147, "step": 6030 }, { "epoch": 0.9021826397057451, "grad_norm": 1.4524924755096436, "learning_rate": 2.896641510326009e-07, "loss": 0.3112, "step": 6040 }, { "epoch": 0.9036763195728076, "grad_norm": 1.2006369829177856, "learning_rate": 2.809832742317137e-07, "loss": 0.3284, "step": 6050 }, { "epoch": 0.9051699994398701, "grad_norm": 1.2945834398269653, "learning_rate": 2.724307002430249e-07, "loss": 0.3057, "step": 6060 }, { "epoch": 0.9066636793069326, "grad_norm": 0.915762722492218, "learning_rate": 2.6400666159787646e-07, "loss": 0.3078, "step": 6070 }, { "epoch": 0.908157359173995, "grad_norm": 1.4049233198165894, "learning_rate": 2.5571138733293255e-07, "loss": 0.3251, "step": 6080 }, { "epoch": 0.9096510390410575, "grad_norm": 1.4237291812896729, "learning_rate": 2.475451029839515e-07, "loss": 0.3224, "step": 6090 }, { "epoch": 0.91114471890812, "grad_norm": 1.0404157638549805, "learning_rate": 2.3950803057965435e-07, "loss": 0.312, "step": 6100 }, { "epoch": 0.9126383987751825, "grad_norm": 1.250205636024475, "learning_rate": 2.3160038863568768e-07, "loss": 0.312, "step": 6110 }, { "epoch": 0.914132078642245, "grad_norm": 1.2475612163543701, "learning_rate": 2.2382239214868152e-07, "loss": 0.3077, "step": 6120 }, { "epoch": 0.9156257585093075, "grad_norm": 1.5456167459487915, "learning_rate": 2.161742525904087e-07, "loss": 0.3301, "step": 6130 }, { "epoch": 0.91711943837637, "grad_norm": 1.449046015739441, "learning_rate": 2.086561779020285e-07, "loss": 0.3371, "step": 6140 }, { "epoch": 0.9186131182434325, "grad_norm": 1.6901681423187256, "learning_rate": 2.012683724884379e-07, "loss": 0.3178, "step": 6150 }, { "epoch": 0.920106798110495, "grad_norm": 1.598301649093628, "learning_rate": 1.9401103721271076e-07, "loss": 0.2795, "step": 6160 }, { "epoch": 0.9216004779775575, "grad_norm": 1.3405296802520752, "learning_rate": 1.8688436939064025e-07, "loss": 0.3362, "step": 6170 }, { "epoch": 0.9230941578446199, "grad_norm": 1.2465345859527588, "learning_rate": 1.798885627853708e-07, "loss": 0.3009, "step": 6180 }, { "epoch": 0.9245878377116824, "grad_norm": 1.2908686399459839, "learning_rate": 1.7302380760213345e-07, "loss": 0.3066, "step": 6190 }, { "epoch": 0.9260815175787449, "grad_norm": 1.4556738138198853, "learning_rate": 1.6629029048307044e-07, "loss": 0.3031, "step": 6200 }, { "epoch": 0.9275751974458074, "grad_norm": 0.9532304406166077, "learning_rate": 1.5968819450216444e-07, "loss": 0.331, "step": 6210 }, { "epoch": 0.9290688773128699, "grad_norm": 1.7296748161315918, "learning_rate": 1.5321769916025798e-07, "loss": 0.3211, "step": 6220 }, { "epoch": 0.9305625571799324, "grad_norm": 1.8877276182174683, "learning_rate": 1.4687898038017513e-07, "loss": 0.3241, "step": 6230 }, { "epoch": 0.9320562370469949, "grad_norm": 1.5971440076828003, "learning_rate": 1.406722105019376e-07, "loss": 0.3089, "step": 6240 }, { "epoch": 0.9335499169140574, "grad_norm": 1.8514761924743652, "learning_rate": 1.3459755827807952e-07, "loss": 0.3199, "step": 6250 }, { "epoch": 0.9350435967811199, "grad_norm": 1.612648367881775, "learning_rate": 1.2865518886905848e-07, "loss": 0.3195, "step": 6260 }, { "epoch": 0.9365372766481824, "grad_norm": 1.5449368953704834, "learning_rate": 1.228452638387656e-07, "loss": 0.3154, "step": 6270 }, { "epoch": 0.9380309565152448, "grad_norm": 1.3344995975494385, "learning_rate": 1.1716794115013419e-07, "loss": 0.3065, "step": 6280 }, { "epoch": 0.9395246363823073, "grad_norm": 1.4628318548202515, "learning_rate": 1.1162337516084253e-07, "loss": 0.3333, "step": 6290 }, { "epoch": 0.9410183162493698, "grad_norm": 1.5249940156936646, "learning_rate": 1.0621171661911844e-07, "loss": 0.3183, "step": 6300 }, { "epoch": 0.9425119961164323, "grad_norm": 1.3567790985107422, "learning_rate": 1.0093311265963967e-07, "loss": 0.2903, "step": 6310 }, { "epoch": 0.9440056759834948, "grad_norm": 2.6283679008483887, "learning_rate": 9.578770679953664e-08, "loss": 0.3182, "step": 6320 }, { "epoch": 0.9454993558505573, "grad_norm": 1.6033724546432495, "learning_rate": 9.07756389344866e-08, "loss": 0.3061, "step": 6330 }, { "epoch": 0.9469930357176198, "grad_norm": 1.2911169528961182, "learning_rate": 8.589704533491173e-08, "loss": 0.3242, "step": 6340 }, { "epoch": 0.9484867155846823, "grad_norm": 1.2943521738052368, "learning_rate": 8.115205864227316e-08, "loss": 0.319, "step": 6350 }, { "epoch": 0.9499803954517448, "grad_norm": 1.4396514892578125, "learning_rate": 7.65408078654678e-08, "loss": 0.3246, "step": 6360 }, { "epoch": 0.9514740753188073, "grad_norm": 1.282568097114563, "learning_rate": 7.206341837731667e-08, "loss": 0.3194, "step": 6370 }, { "epoch": 0.9529677551858697, "grad_norm": 1.448075294494629, "learning_rate": 6.772001191115928e-08, "loss": 0.2985, "step": 6380 }, { "epoch": 0.9544614350529322, "grad_norm": 0.9579274654388428, "learning_rate": 6.351070655754187e-08, "loss": 0.3208, "step": 6390 }, { "epoch": 0.9559551149199947, "grad_norm": 1.2854158878326416, "learning_rate": 5.943561676100773e-08, "loss": 0.2923, "step": 6400 }, { "epoch": 0.9574487947870572, "grad_norm": 1.7156449556350708, "learning_rate": 5.5494853316985786e-08, "loss": 0.3132, "step": 6410 }, { "epoch": 0.9589424746541197, "grad_norm": 1.3507882356643677, "learning_rate": 5.168852336877695e-08, "loss": 0.335, "step": 6420 }, { "epoch": 0.9604361545211823, "grad_norm": 1.696518063545227, "learning_rate": 4.801673040464305e-08, "loss": 0.3196, "step": 6430 }, { "epoch": 0.9619298343882448, "grad_norm": 1.6517608165740967, "learning_rate": 4.447957425499139e-08, "loss": 0.3038, "step": 6440 }, { "epoch": 0.9634235142553073, "grad_norm": 1.3434756994247437, "learning_rate": 4.107715108966237e-08, "loss": 0.3067, "step": 6450 }, { "epoch": 0.9649171941223698, "grad_norm": 1.260919213294983, "learning_rate": 3.7809553415311675e-08, "loss": 0.3052, "step": 6460 }, { "epoch": 0.9664108739894323, "grad_norm": 1.7151823043823242, "learning_rate": 3.467687007289833e-08, "loss": 0.2897, "step": 6470 }, { "epoch": 0.9679045538564947, "grad_norm": 1.9328373670578003, "learning_rate": 3.167918623526833e-08, "loss": 0.2919, "step": 6480 }, { "epoch": 0.9693982337235572, "grad_norm": 1.4327538013458252, "learning_rate": 2.8816583404837616e-08, "loss": 0.2983, "step": 6490 }, { "epoch": 0.9708919135906197, "grad_norm": 1.2370225191116333, "learning_rate": 2.608913941137825e-08, "loss": 0.301, "step": 6500 }, { "epoch": 0.9708919135906197, "eval_loss": 0.3039746582508087, "eval_runtime": 76.1933, "eval_samples_per_second": 7.1, "eval_steps_per_second": 3.557, "step": 6500 }, { "epoch": 0.9723855934576822, "grad_norm": 1.433455467224121, "learning_rate": 2.3496928409900143e-08, "loss": 0.3035, "step": 6510 }, { "epoch": 0.9738792733247447, "grad_norm": 1.2109386920928955, "learning_rate": 2.10400208786371e-08, "loss": 0.3246, "step": 6520 }, { "epoch": 0.9753729531918072, "grad_norm": 1.5277369022369385, "learning_rate": 1.87184836171278e-08, "loss": 0.3185, "step": 6530 }, { "epoch": 0.9768666330588697, "grad_norm": 2.307945728302002, "learning_rate": 1.6532379744403915e-08, "loss": 0.3209, "step": 6540 }, { "epoch": 0.9783603129259322, "grad_norm": 2.4343667030334473, "learning_rate": 1.448176869726814e-08, "loss": 0.3146, "step": 6550 }, { "epoch": 0.9798539927929947, "grad_norm": 2.0399911403656006, "learning_rate": 1.2566706228685499e-08, "loss": 0.3042, "step": 6560 }, { "epoch": 0.9813476726600572, "grad_norm": 1.6942330598831177, "learning_rate": 1.0787244406259556e-08, "loss": 0.2949, "step": 6570 }, { "epoch": 0.9828413525271196, "grad_norm": 1.0024651288986206, "learning_rate": 9.143431610822983e-09, "loss": 0.3046, "step": 6580 }, { "epoch": 0.9843350323941821, "grad_norm": 1.426224946975708, "learning_rate": 7.635312535119732e-09, "loss": 0.3148, "step": 6590 }, { "epoch": 0.9858287122612446, "grad_norm": 1.2437832355499268, "learning_rate": 6.2629281825887785e-09, "loss": 0.3209, "step": 6600 }, { "epoch": 0.9873223921283071, "grad_norm": 1.3072861433029175, "learning_rate": 5.026315866252241e-09, "loss": 0.32, "step": 6610 }, { "epoch": 0.9888160719953696, "grad_norm": 1.787488579750061, "learning_rate": 3.9255092076984084e-09, "loss": 0.3269, "step": 6620 }, { "epoch": 0.9903097518624321, "grad_norm": 1.3305130004882812, "learning_rate": 2.9605381361685893e-09, "loss": 0.3157, "step": 6630 }, { "epoch": 0.9918034317294946, "grad_norm": 1.319860577583313, "learning_rate": 2.131428887742204e-09, "loss": 0.2924, "step": 6640 }, { "epoch": 0.9932971115965571, "grad_norm": 1.1278481483459473, "learning_rate": 1.4382040046267976e-09, "loss": 0.3155, "step": 6650 }, { "epoch": 0.9947907914636196, "grad_norm": 1.3013602495193481, "learning_rate": 8.808823345407558e-10, "loss": 0.3081, "step": 6660 }, { "epoch": 0.9962844713306821, "grad_norm": 1.4793535470962524, "learning_rate": 4.594790302037133e-10, "loss": 0.3225, "step": 6670 }, { "epoch": 0.9977781511977445, "grad_norm": 1.2639180421829224, "learning_rate": 1.7400554892466058e-10, "loss": 0.3101, "step": 6680 }, { "epoch": 0.999271831064807, "grad_norm": 1.553964614868164, "learning_rate": 2.4469652287750777e-11, "loss": 0.3104, "step": 6690 }, { "epoch": 1.0, "step": 6695, "total_flos": 1.727502264531714e+18, "train_loss": 0.35656481113248656, "train_runtime": 34664.1508, "train_samples_per_second": 1.545, "train_steps_per_second": 0.193 } ], "logging_steps": 10, "max_steps": 6695, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.727502264531714e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }