| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 6568, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0015225624726414556, |
| "grad_norm": 18.75, |
| "learning_rate": 0.0009090909090909091, |
| "loss": 2.9749, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.003045124945282911, |
| "grad_norm": 8.0625, |
| "learning_rate": 0.0019191919191919192, |
| "loss": 1.9177, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.004567687417924367, |
| "grad_norm": 5.5, |
| "learning_rate": 0.0029292929292929295, |
| "loss": 1.7954, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.006090249890565822, |
| "grad_norm": 5.0, |
| "learning_rate": 0.00393939393939394, |
| "loss": 1.8602, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.007612812363207278, |
| "grad_norm": 3.9375, |
| "learning_rate": 0.00494949494949495, |
| "loss": 1.9204, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.009135374835848734, |
| "grad_norm": 4.15625, |
| "learning_rate": 0.005959595959595959, |
| "loss": 2.0757, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.01065793730849019, |
| "grad_norm": 3.78125, |
| "learning_rate": 0.00696969696969697, |
| "loss": 2.1545, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.012180499781131645, |
| "grad_norm": 3.0625, |
| "learning_rate": 0.007979797979797981, |
| "loss": 2.3187, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.0137030622537731, |
| "grad_norm": 3.40625, |
| "learning_rate": 0.00898989898989899, |
| "loss": 2.3589, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.015225624726414555, |
| "grad_norm": 2.828125, |
| "learning_rate": 0.01, |
| "loss": 2.4423, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.016748187199056012, |
| "grad_norm": 3.125, |
| "learning_rate": 0.01101010101010101, |
| "loss": 2.6021, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.018270749671697468, |
| "grad_norm": 3.265625, |
| "learning_rate": 0.01202020202020202, |
| "loss": 2.6122, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.019793312144338923, |
| "grad_norm": 2.4375, |
| "learning_rate": 0.013030303030303031, |
| "loss": 2.5902, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.02131587461698038, |
| "grad_norm": 2.328125, |
| "learning_rate": 0.01404040404040404, |
| "loss": 2.6305, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.022838437089621834, |
| "grad_norm": 2.234375, |
| "learning_rate": 0.01505050505050505, |
| "loss": 2.6464, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.02436099956226329, |
| "grad_norm": 2.015625, |
| "learning_rate": 0.01606060606060606, |
| "loss": 2.6488, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.025883562034904745, |
| "grad_norm": 2.109375, |
| "learning_rate": 0.01707070707070707, |
| "loss": 2.6837, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0274061245075462, |
| "grad_norm": 2.109375, |
| "learning_rate": 0.018080808080808083, |
| "loss": 2.7353, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.028928686980187655, |
| "grad_norm": 3.0625, |
| "learning_rate": 0.019090909090909092, |
| "loss": 2.7126, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.03045124945282911, |
| "grad_norm": 1.765625, |
| "learning_rate": 0.019999998783839547, |
| "loss": 2.7545, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.031973811925470566, |
| "grad_norm": 1.9609375, |
| "learning_rate": 0.019999852844943353, |
| "loss": 2.7003, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.033496374398112025, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.019999463678024316, |
| "loss": 2.7056, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.03501893687075348, |
| "grad_norm": 1.625, |
| "learning_rate": 0.019998831292548203, |
| "loss": 2.6891, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.036541499343394936, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.01999795570389663, |
| "loss": 2.6395, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.03806406181603639, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.019996836933366676, |
| "loss": 2.6503, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.039586624288677846, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.01999547500817038, |
| "loss": 2.6254, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.0411091867613193, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.019993869961434065, |
| "loss": 2.6104, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.04263174923396076, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.01999202183219754, |
| "loss": 2.6056, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.04415431170660221, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.019989930665413148, |
| "loss": 2.5625, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.04567687417924367, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.019987596511944674, |
| "loss": 2.5482, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.04719943665188512, |
| "grad_norm": 0.94921875, |
| "learning_rate": 0.019985019428566106, |
| "loss": 2.5616, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.04872199912452658, |
| "grad_norm": 0.9921875, |
| "learning_rate": 0.019982199477960257, |
| "loss": 2.5245, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.05024456159716803, |
| "grad_norm": 0.8984375, |
| "learning_rate": 0.01997913672871724, |
| "loss": 2.5713, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.05176712406980949, |
| "grad_norm": 0.91015625, |
| "learning_rate": 0.019975831255332793, |
| "loss": 2.497, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.05328968654245095, |
| "grad_norm": 0.92578125, |
| "learning_rate": 0.01997228313820647, |
| "loss": 2.4797, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.0548122490150924, |
| "grad_norm": 0.93359375, |
| "learning_rate": 0.019968492463639704, |
| "loss": 2.5012, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.05633481148773386, |
| "grad_norm": 0.953125, |
| "learning_rate": 0.019964459323833665, |
| "loss": 2.4267, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.05785737396037531, |
| "grad_norm": 0.89453125, |
| "learning_rate": 0.01996018381688707, |
| "loss": 2.4353, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.05937993643301677, |
| "grad_norm": 0.90625, |
| "learning_rate": 0.019955666046793757, |
| "loss": 2.4498, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.06090249890565822, |
| "grad_norm": 0.921875, |
| "learning_rate": 0.01995090612344017, |
| "loss": 2.4758, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.06242506137829968, |
| "grad_norm": 0.87890625, |
| "learning_rate": 0.019945904162602685, |
| "loss": 2.4757, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.06394762385094113, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.0199406602859448, |
| "loss": 2.45, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.06547018632358259, |
| "grad_norm": 0.890625, |
| "learning_rate": 0.01993517462101417, |
| "loss": 2.4271, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.06699274879622405, |
| "grad_norm": 0.8984375, |
| "learning_rate": 0.019929447301239498, |
| "loss": 2.4273, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.0685153112688655, |
| "grad_norm": 0.828125, |
| "learning_rate": 0.0199234784659273, |
| "loss": 2.4073, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.07003787374150695, |
| "grad_norm": 0.80078125, |
| "learning_rate": 0.019917268260258518, |
| "loss": 2.3603, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.07156043621414841, |
| "grad_norm": 0.875, |
| "learning_rate": 0.019910816835284974, |
| "loss": 2.3982, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.07308299868678987, |
| "grad_norm": 0.8515625, |
| "learning_rate": 0.01990412434792571, |
| "loss": 2.3634, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.07460556115943132, |
| "grad_norm": 0.88671875, |
| "learning_rate": 0.019897190960963176, |
| "loss": 2.3787, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.07612812363207277, |
| "grad_norm": 0.984375, |
| "learning_rate": 0.01989001684303925, |
| "loss": 2.3464, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.07612812363207277, |
| "eval_loss": 2.4252116680145264, |
| "eval_runtime": 342.391, |
| "eval_samples_per_second": 49.604, |
| "eval_steps_per_second": 24.802, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.07765068610471423, |
| "grad_norm": 0.859375, |
| "learning_rate": 0.019882602168651148, |
| "loss": 2.3568, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.07917324857735569, |
| "grad_norm": 0.8359375, |
| "learning_rate": 0.019874947118147187, |
| "loss": 2.331, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.08069581104999715, |
| "grad_norm": 0.8828125, |
| "learning_rate": 0.019867051877722388, |
| "loss": 2.3502, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.0822183735226386, |
| "grad_norm": 0.84375, |
| "learning_rate": 0.01985891663941395, |
| "loss": 2.3372, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.08374093599528006, |
| "grad_norm": 0.80078125, |
| "learning_rate": 0.019850541601096568, |
| "loss": 2.2897, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.08526349846792151, |
| "grad_norm": 0.828125, |
| "learning_rate": 0.01984192696647765, |
| "loss": 2.3333, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.08678606094056297, |
| "grad_norm": 0.8203125, |
| "learning_rate": 0.01983307294509233, |
| "loss": 2.3197, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.08830862341320442, |
| "grad_norm": 0.94140625, |
| "learning_rate": 0.019823979752298392, |
| "loss": 2.3009, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.08983118588584588, |
| "grad_norm": 0.8125, |
| "learning_rate": 0.01981464760927102, |
| "loss": 2.332, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.09135374835848734, |
| "grad_norm": 0.83203125, |
| "learning_rate": 0.019805076742997422, |
| "loss": 2.3117, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.0928763108311288, |
| "grad_norm": 0.8515625, |
| "learning_rate": 0.019795267386271315, |
| "loss": 2.2978, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.09439887330377024, |
| "grad_norm": 0.84765625, |
| "learning_rate": 0.019785219777687248, |
| "loss": 2.2451, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.0959214357764117, |
| "grad_norm": 0.796875, |
| "learning_rate": 0.019774934161634825, |
| "loss": 2.3117, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.09744399824905316, |
| "grad_norm": 0.890625, |
| "learning_rate": 0.019764410788292722, |
| "loss": 2.2956, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.09896656072169462, |
| "grad_norm": 0.82421875, |
| "learning_rate": 0.01975364991362264, |
| "loss": 2.276, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.10048912319433606, |
| "grad_norm": 0.85546875, |
| "learning_rate": 0.01974265179936306, |
| "loss": 2.2066, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.10201168566697752, |
| "grad_norm": 0.76171875, |
| "learning_rate": 0.019731416713022868, |
| "loss": 2.2396, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.10353424813961898, |
| "grad_norm": 0.8203125, |
| "learning_rate": 0.01971994492787488, |
| "loss": 2.2219, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.10505681061226044, |
| "grad_norm": 0.8359375, |
| "learning_rate": 0.01970823672294916, |
| "loss": 2.2221, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.1065793730849019, |
| "grad_norm": 0.80859375, |
| "learning_rate": 0.019696292383026247, |
| "loss": 2.2533, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.10810193555754334, |
| "grad_norm": 0.8203125, |
| "learning_rate": 0.019684112198630244, |
| "loss": 2.2463, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.1096244980301848, |
| "grad_norm": 0.76171875, |
| "learning_rate": 0.01967169646602172, |
| "loss": 2.2102, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.11114706050282626, |
| "grad_norm": 0.82421875, |
| "learning_rate": 0.01965904548719053, |
| "loss": 2.2254, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.11266962297546772, |
| "grad_norm": 0.8125, |
| "learning_rate": 0.019646159569848463, |
| "loss": 2.2245, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.11419218544810916, |
| "grad_norm": 0.83203125, |
| "learning_rate": 0.019633039027421747, |
| "loss": 2.2012, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.11571474792075062, |
| "grad_norm": 0.84375, |
| "learning_rate": 0.019619684179043438, |
| "loss": 2.2266, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.11723731039339208, |
| "grad_norm": 0.78125, |
| "learning_rate": 0.019606095349545653, |
| "loss": 2.2187, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.11875987286603354, |
| "grad_norm": 0.7890625, |
| "learning_rate": 0.01959227286945167, |
| "loss": 2.1785, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.12028243533867498, |
| "grad_norm": 0.8359375, |
| "learning_rate": 0.019578217074967885, |
| "loss": 2.2224, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.12180499781131644, |
| "grad_norm": 0.86328125, |
| "learning_rate": 0.01956392830797564, |
| "loss": 2.1828, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.1233275602839579, |
| "grad_norm": 0.78515625, |
| "learning_rate": 0.019549406916022905, |
| "loss": 2.2074, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.12485012275659936, |
| "grad_norm": 0.828125, |
| "learning_rate": 0.01953465325231582, |
| "loss": 2.1648, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.1263726852292408, |
| "grad_norm": 0.79296875, |
| "learning_rate": 0.019519667675710114, |
| "loss": 2.1821, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.12789524770188226, |
| "grad_norm": 0.8203125, |
| "learning_rate": 0.01950445055070237, |
| "loss": 2.1723, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.12941781017452372, |
| "grad_norm": 0.7890625, |
| "learning_rate": 0.019489002247421148, |
| "loss": 2.1421, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.13094037264716518, |
| "grad_norm": 0.79296875, |
| "learning_rate": 0.019473323141618013, |
| "loss": 2.1654, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.13246293511980664, |
| "grad_norm": 0.8125, |
| "learning_rate": 0.019457413614658366, |
| "loss": 2.1454, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.1339854975924481, |
| "grad_norm": 0.84765625, |
| "learning_rate": 0.019441274053512175, |
| "loss": 2.1438, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.13550806006508956, |
| "grad_norm": 0.796875, |
| "learning_rate": 0.01942490485074458, |
| "loss": 2.1856, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.137030622537731, |
| "grad_norm": 0.83984375, |
| "learning_rate": 0.019408306404506314, |
| "loss": 2.1291, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.13855318501037245, |
| "grad_norm": 0.8359375, |
| "learning_rate": 0.019391479118524044, |
| "loss": 2.1488, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.1400757474830139, |
| "grad_norm": 0.796875, |
| "learning_rate": 0.01937442340209055, |
| "loss": 2.1124, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.14159830995565537, |
| "grad_norm": 0.78125, |
| "learning_rate": 0.01935713967005475, |
| "loss": 2.1569, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.14312087242829682, |
| "grad_norm": 0.83203125, |
| "learning_rate": 0.019339628342811634, |
| "loss": 2.1194, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.14464343490093828, |
| "grad_norm": 0.77734375, |
| "learning_rate": 0.01932188984629201, |
| "loss": 2.1578, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.14616599737357974, |
| "grad_norm": 0.79296875, |
| "learning_rate": 0.019303924611952177, |
| "loss": 2.1231, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.1476885598462212, |
| "grad_norm": 0.80859375, |
| "learning_rate": 0.01928573307676341, |
| "loss": 2.1173, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.14921112231886263, |
| "grad_norm": 0.7890625, |
| "learning_rate": 0.019267315683201326, |
| "loss": 2.0751, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.1507336847915041, |
| "grad_norm": 0.76953125, |
| "learning_rate": 0.019248672879235148, |
| "loss": 2.1118, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.15225624726414555, |
| "grad_norm": 0.8125, |
| "learning_rate": 0.01922980511831678, |
| "loss": 2.1046, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.15225624726414555, |
| "eval_loss": 2.17815899848938, |
| "eval_runtime": 332.4001, |
| "eval_samples_per_second": 51.095, |
| "eval_steps_per_second": 25.548, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.153778809736787, |
| "grad_norm": 0.7734375, |
| "learning_rate": 0.01921071285936979, |
| "loss": 2.0853, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.15530137220942847, |
| "grad_norm": 0.7421875, |
| "learning_rate": 0.01919139656677826, |
| "loss": 2.1187, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.15682393468206993, |
| "grad_norm": 0.80859375, |
| "learning_rate": 0.01917185671037546, |
| "loss": 2.0578, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.15834649715471139, |
| "grad_norm": 0.76953125, |
| "learning_rate": 0.01915209376543245, |
| "loss": 2.0986, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.15986905962735284, |
| "grad_norm": 0.8046875, |
| "learning_rate": 0.019132108212646513, |
| "loss": 2.0731, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.1613916220999943, |
| "grad_norm": 0.7265625, |
| "learning_rate": 0.01911190053812944, |
| "loss": 2.0556, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.16291418457263573, |
| "grad_norm": 0.9296875, |
| "learning_rate": 0.01909147123339575, |
| "loss": 2.0717, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.1644367470452772, |
| "grad_norm": 0.85546875, |
| "learning_rate": 0.019070820795350683, |
| "loss": 2.0919, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.16595930951791865, |
| "grad_norm": 0.796875, |
| "learning_rate": 0.019049949726278156, |
| "loss": 2.037, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.1674818719905601, |
| "grad_norm": 0.8359375, |
| "learning_rate": 0.01902885853382853, |
| "loss": 2.0452, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.16900443446320157, |
| "grad_norm": 0.7890625, |
| "learning_rate": 0.019007547731006248, |
| "loss": 2.0627, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.17052699693584303, |
| "grad_norm": 0.84765625, |
| "learning_rate": 0.01898601783615739, |
| "loss": 2.0771, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.1720495594084845, |
| "grad_norm": 0.84765625, |
| "learning_rate": 0.018964269372957036, |
| "loss": 2.0802, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.17357212188112595, |
| "grad_norm": 0.82421875, |
| "learning_rate": 0.01894230287039654, |
| "loss": 2.061, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.17509468435376738, |
| "grad_norm": 0.81640625, |
| "learning_rate": 0.018920118862770667, |
| "loss": 2.0693, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.17661724682640884, |
| "grad_norm": 0.87890625, |
| "learning_rate": 0.018897717889664576, |
| "loss": 2.0623, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.1781398092990503, |
| "grad_norm": 0.828125, |
| "learning_rate": 0.01887510049594074, |
| "loss": 2.0643, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.17966237177169175, |
| "grad_norm": 0.84765625, |
| "learning_rate": 0.01885226723172564, |
| "loss": 2.0485, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.1811849342443332, |
| "grad_norm": 0.75, |
| "learning_rate": 0.018829218652396423, |
| "loss": 2.0529, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.18270749671697467, |
| "grad_norm": 0.8046875, |
| "learning_rate": 0.01880595531856738, |
| "loss": 2.0425, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.18423005918961613, |
| "grad_norm": 0.80078125, |
| "learning_rate": 0.018782477796076304, |
| "loss": 2.0552, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.1857526216622576, |
| "grad_norm": 0.859375, |
| "learning_rate": 0.018758786655970732, |
| "loss": 2.0348, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.18727518413489905, |
| "grad_norm": 0.8203125, |
| "learning_rate": 0.018734882474494067, |
| "loss": 2.0424, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.18879774660754048, |
| "grad_norm": 0.828125, |
| "learning_rate": 0.01871076583307154, |
| "loss": 2.03, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.19032030908018194, |
| "grad_norm": 0.8125, |
| "learning_rate": 0.018686437318296084, |
| "loss": 2.0417, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.1918428715528234, |
| "grad_norm": 0.765625, |
| "learning_rate": 0.018661897521914068, |
| "loss": 2.0172, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.19336543402546486, |
| "grad_norm": 0.8046875, |
| "learning_rate": 0.018637147040810886, |
| "loss": 2.0367, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.19488799649810631, |
| "grad_norm": 0.80859375, |
| "learning_rate": 0.018612186476996452, |
| "loss": 2.015, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.19641055897074777, |
| "grad_norm": 0.89453125, |
| "learning_rate": 0.018587016437590562, |
| "loss": 2.0083, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.19793312144338923, |
| "grad_norm": 0.8359375, |
| "learning_rate": 0.01856163753480812, |
| "loss": 2.0318, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.1994556839160307, |
| "grad_norm": 0.796875, |
| "learning_rate": 0.01853605038594424, |
| "loss": 2.0027, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.20097824638867212, |
| "grad_norm": 0.8828125, |
| "learning_rate": 0.01851025561335925, |
| "loss": 1.9983, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.20250080886131358, |
| "grad_norm": 0.8515625, |
| "learning_rate": 0.018484253844463526, |
| "loss": 1.9897, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.20402337133395504, |
| "grad_norm": 0.796875, |
| "learning_rate": 0.018458045711702266, |
| "loss": 1.9809, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.2055459338065965, |
| "grad_norm": 0.84765625, |
| "learning_rate": 0.018431631852540077, |
| "loss": 2.0047, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.20706849627923796, |
| "grad_norm": 0.8359375, |
| "learning_rate": 0.01840501290944549, |
| "loss": 1.9952, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.20859105875187942, |
| "grad_norm": 0.796875, |
| "learning_rate": 0.018378189529875324, |
| "loss": 2.0212, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.21011362122452087, |
| "grad_norm": 0.78515625, |
| "learning_rate": 0.018351162366258937, |
| "loss": 1.9944, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.21163618369716233, |
| "grad_norm": 0.75390625, |
| "learning_rate": 0.01832393207598236, |
| "loss": 1.9731, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.2131587461698038, |
| "grad_norm": 0.80078125, |
| "learning_rate": 0.018296499321372305, |
| "loss": 1.9937, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.21468130864244522, |
| "grad_norm": 0.859375, |
| "learning_rate": 0.018268864769680055, |
| "loss": 1.9897, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.21620387111508668, |
| "grad_norm": 0.84375, |
| "learning_rate": 0.01824102909306524, |
| "loss": 1.991, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.21772643358772814, |
| "grad_norm": 0.84765625, |
| "learning_rate": 0.01821299296857948, |
| "loss": 1.9662, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.2192489960603696, |
| "grad_norm": 0.8359375, |
| "learning_rate": 0.01818475707814993, |
| "loss": 1.9562, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.22077155853301106, |
| "grad_norm": 0.81640625, |
| "learning_rate": 0.018156322108562675, |
| "loss": 1.9599, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.22229412100565252, |
| "grad_norm": 0.8125, |
| "learning_rate": 0.018127688751446026, |
| "loss": 1.9912, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.22381668347829398, |
| "grad_norm": 0.8046875, |
| "learning_rate": 0.018098857703253726, |
| "loss": 1.9604, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.22533924595093544, |
| "grad_norm": 0.84765625, |
| "learning_rate": 0.018069829665247974, |
| "loss": 1.982, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.22686180842357687, |
| "grad_norm": 0.80859375, |
| "learning_rate": 0.01804060534348239, |
| "loss": 1.9447, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.22838437089621832, |
| "grad_norm": 0.78515625, |
| "learning_rate": 0.018011185448784835, |
| "loss": 1.9809, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.22838437089621832, |
| "eval_loss": 2.018177032470703, |
| "eval_runtime": 331.4544, |
| "eval_samples_per_second": 51.241, |
| "eval_steps_per_second": 25.62, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.22990693336885978, |
| "grad_norm": 0.83203125, |
| "learning_rate": 0.017981570696740123, |
| "loss": 1.9661, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.23142949584150124, |
| "grad_norm": 0.7890625, |
| "learning_rate": 0.01795176180767261, |
| "loss": 1.9801, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.2329520583141427, |
| "grad_norm": 0.80078125, |
| "learning_rate": 0.01792175950662868, |
| "loss": 1.9376, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.23447462078678416, |
| "grad_norm": 0.91796875, |
| "learning_rate": 0.017891564523359108, |
| "loss": 1.9701, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.23599718325942562, |
| "grad_norm": 0.859375, |
| "learning_rate": 0.017861177592301318, |
| "loss": 1.9454, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.23751974573206708, |
| "grad_norm": 0.83984375, |
| "learning_rate": 0.017830599452561487, |
| "loss": 1.9301, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.23904230820470854, |
| "grad_norm": 0.828125, |
| "learning_rate": 0.01779983084789662, |
| "loss": 1.9313, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.24056487067734997, |
| "grad_norm": 0.87109375, |
| "learning_rate": 0.01776887252669641, |
| "loss": 1.9767, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.24208743314999143, |
| "grad_norm": 0.7734375, |
| "learning_rate": 0.01773772524196507, |
| "loss": 1.8965, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.24360999562263289, |
| "grad_norm": 0.859375, |
| "learning_rate": 0.017706389751302988, |
| "loss": 1.9254, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.24513255809527434, |
| "grad_norm": 0.8671875, |
| "learning_rate": 0.017674866816888332, |
| "loss": 1.9328, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.2466551205679158, |
| "grad_norm": 0.8828125, |
| "learning_rate": 0.017643157205458483, |
| "loss": 1.9314, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.24817768304055726, |
| "grad_norm": 0.8203125, |
| "learning_rate": 0.0176112616882914, |
| "loss": 1.8842, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.24970024551319872, |
| "grad_norm": 0.8359375, |
| "learning_rate": 0.01757918104118686, |
| "loss": 1.8764, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.2512228079858402, |
| "grad_norm": 0.88671875, |
| "learning_rate": 0.017546916044447573, |
| "loss": 1.884, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.2527453704584816, |
| "grad_norm": 0.84765625, |
| "learning_rate": 0.017514467482860233, |
| "loss": 1.9117, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.2542679329311231, |
| "grad_norm": 0.8125, |
| "learning_rate": 0.017481836145676402, |
| "loss": 1.9062, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.25579049540376453, |
| "grad_norm": 0.8359375, |
| "learning_rate": 0.017449022826593316, |
| "loss": 1.9137, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.257313057876406, |
| "grad_norm": 0.81640625, |
| "learning_rate": 0.017416028323734598, |
| "loss": 1.9257, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.25883562034904745, |
| "grad_norm": 0.87109375, |
| "learning_rate": 0.01738285343963083, |
| "loss": 1.902, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.2603581828216889, |
| "grad_norm": 0.8046875, |
| "learning_rate": 0.01734949898120002, |
| "loss": 1.8808, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.26188074529433036, |
| "grad_norm": 0.84765625, |
| "learning_rate": 0.017315965759728016, |
| "loss": 1.9147, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.2634033077669718, |
| "grad_norm": 0.81640625, |
| "learning_rate": 0.017282254590848728, |
| "loss": 1.9054, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.2649258702396133, |
| "grad_norm": 0.859375, |
| "learning_rate": 0.017248366294524326, |
| "loss": 1.8789, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.2664484327122547, |
| "grad_norm": 0.8671875, |
| "learning_rate": 0.017214301695025268, |
| "loss": 1.8616, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.2679709951848962, |
| "grad_norm": 0.83984375, |
| "learning_rate": 0.017180061620910264, |
| "loss": 1.9031, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.26949355765753763, |
| "grad_norm": 0.828125, |
| "learning_rate": 0.01714564690500612, |
| "loss": 1.8796, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.2710161201301791, |
| "grad_norm": 0.828125, |
| "learning_rate": 0.01711105838438749, |
| "loss": 1.9017, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.27253868260282055, |
| "grad_norm": 0.8984375, |
| "learning_rate": 0.017076296900356495, |
| "loss": 1.8648, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.274061245075462, |
| "grad_norm": 0.87109375, |
| "learning_rate": 0.017041363298422287, |
| "loss": 1.8762, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.27558380754810347, |
| "grad_norm": 0.890625, |
| "learning_rate": 0.017006258428280463, |
| "loss": 1.9024, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.2771063700207449, |
| "grad_norm": 0.93359375, |
| "learning_rate": 0.0169709831437924, |
| "loss": 1.9075, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.2786289324933864, |
| "grad_norm": 0.859375, |
| "learning_rate": 0.016935538302964494, |
| "loss": 1.8724, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.2801514949660278, |
| "grad_norm": 0.83984375, |
| "learning_rate": 0.01689992476792729, |
| "loss": 1.8504, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.2816740574386693, |
| "grad_norm": 0.8828125, |
| "learning_rate": 0.016864143404914506, |
| "loss": 1.8597, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.28319661991131073, |
| "grad_norm": 0.87109375, |
| "learning_rate": 0.01682819508424196, |
| "loss": 1.8458, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.2847191823839522, |
| "grad_norm": 0.8046875, |
| "learning_rate": 0.01679208068028643, |
| "loss": 1.8839, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.28624174485659365, |
| "grad_norm": 0.84375, |
| "learning_rate": 0.016755801071464335, |
| "loss": 1.8525, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.2877643073292351, |
| "grad_norm": 0.83203125, |
| "learning_rate": 0.016719357140210417, |
| "loss": 1.912, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.28928686980187657, |
| "grad_norm": 0.8828125, |
| "learning_rate": 0.016682749772956258, |
| "loss": 1.8613, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.290809432274518, |
| "grad_norm": 0.875, |
| "learning_rate": 0.016645979860108715, |
| "loss": 1.8607, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.2923319947471595, |
| "grad_norm": 0.875, |
| "learning_rate": 0.01660904829602827, |
| "loss": 1.8919, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.2938545572198009, |
| "grad_norm": 0.81640625, |
| "learning_rate": 0.01657195597900727, |
| "loss": 1.9042, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.2953771196924424, |
| "grad_norm": 0.86328125, |
| "learning_rate": 0.016534703811248087, |
| "loss": 1.8477, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.29689968216508383, |
| "grad_norm": 0.88671875, |
| "learning_rate": 0.016497292698841162, |
| "loss": 1.8578, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.29842224463772526, |
| "grad_norm": 0.80078125, |
| "learning_rate": 0.01645972355174298, |
| "loss": 1.8304, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.29994480711036675, |
| "grad_norm": 0.87890625, |
| "learning_rate": 0.016421997283753927, |
| "loss": 1.8321, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.3014673695830082, |
| "grad_norm": 0.89453125, |
| "learning_rate": 0.016384114812496055, |
| "loss": 1.8363, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.30298993205564967, |
| "grad_norm": 0.84765625, |
| "learning_rate": 0.01634607705939079, |
| "loss": 1.833, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.3045124945282911, |
| "grad_norm": 0.953125, |
| "learning_rate": 0.016307884949636493, |
| "loss": 1.86, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.3045124945282911, |
| "eval_loss": 1.8923254013061523, |
| "eval_runtime": 332.0094, |
| "eval_samples_per_second": 51.155, |
| "eval_steps_per_second": 25.578, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.3060350570009326, |
| "grad_norm": 0.890625, |
| "learning_rate": 0.01626953941218597, |
| "loss": 1.8004, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.307557619473574, |
| "grad_norm": 0.90234375, |
| "learning_rate": 0.01623104137972386, |
| "loss": 1.8512, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.3090801819462155, |
| "grad_norm": 0.8671875, |
| "learning_rate": 0.016192391788643987, |
| "loss": 1.8061, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.31060274441885694, |
| "grad_norm": 0.90625, |
| "learning_rate": 0.016153591579026544, |
| "loss": 1.8187, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.31212530689149837, |
| "grad_norm": 0.8046875, |
| "learning_rate": 0.016114641694615246, |
| "loss": 1.8084, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.31364786936413985, |
| "grad_norm": 0.92578125, |
| "learning_rate": 0.01607554308279437, |
| "loss": 1.8253, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.3151704318367813, |
| "grad_norm": 0.8828125, |
| "learning_rate": 0.016036296694565716, |
| "loss": 1.833, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.31669299430942277, |
| "grad_norm": 0.87890625, |
| "learning_rate": 0.015996903484525475, |
| "loss": 1.8359, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.3182155567820642, |
| "grad_norm": 0.90234375, |
| "learning_rate": 0.015957364410841, |
| "loss": 1.8245, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.3197381192547057, |
| "grad_norm": 0.875, |
| "learning_rate": 0.01591768043522752, |
| "loss": 1.8034, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.3212606817273471, |
| "grad_norm": 0.87109375, |
| "learning_rate": 0.015877852522924733, |
| "loss": 1.8042, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.3227832441999886, |
| "grad_norm": 0.89453125, |
| "learning_rate": 0.015837881642673322, |
| "loss": 1.8242, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.32430580667263004, |
| "grad_norm": 0.87109375, |
| "learning_rate": 0.015797768766691426, |
| "loss": 1.7773, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.32582836914527147, |
| "grad_norm": 0.890625, |
| "learning_rate": 0.01575751487065094, |
| "loss": 1.8043, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.32735093161791295, |
| "grad_norm": 0.91015625, |
| "learning_rate": 0.015717120933653836, |
| "loss": 1.796, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.3288734940905544, |
| "grad_norm": 0.79296875, |
| "learning_rate": 0.015676587938208305, |
| "loss": 1.7998, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.3303960565631959, |
| "grad_norm": 0.8671875, |
| "learning_rate": 0.0156359168702049, |
| "loss": 1.8345, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.3319186190358373, |
| "grad_norm": 0.8984375, |
| "learning_rate": 0.015595108718892518, |
| "loss": 1.8046, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.3334411815084788, |
| "grad_norm": 0.87109375, |
| "learning_rate": 0.015554164476854364, |
| "loss": 1.825, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.3349637439811202, |
| "grad_norm": 0.90625, |
| "learning_rate": 0.015513085139983796, |
| "loss": 1.8082, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.3364863064537617, |
| "grad_norm": 0.87109375, |
| "learning_rate": 0.015471871707460108, |
| "loss": 1.7726, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.33800886892640314, |
| "grad_norm": 0.82421875, |
| "learning_rate": 0.015430525181724213, |
| "loss": 1.7326, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.33953143139904457, |
| "grad_norm": 0.90625, |
| "learning_rate": 0.015389046568454292, |
| "loss": 1.8309, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.34105399387168606, |
| "grad_norm": 0.9453125, |
| "learning_rate": 0.015347436876541297, |
| "loss": 1.7874, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.3425765563443275, |
| "grad_norm": 0.88671875, |
| "learning_rate": 0.015305697118064428, |
| "loss": 1.7611, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.344099118816969, |
| "grad_norm": 0.87890625, |
| "learning_rate": 0.015263828308266524, |
| "loss": 1.7601, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.3456216812896104, |
| "grad_norm": 0.90234375, |
| "learning_rate": 0.015221831465529344, |
| "loss": 1.7254, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.3471442437622519, |
| "grad_norm": 0.89453125, |
| "learning_rate": 0.015179707611348832, |
| "loss": 1.7807, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.3486668062348933, |
| "grad_norm": 0.90234375, |
| "learning_rate": 0.015137457770310232, |
| "loss": 1.7915, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.35018936870753475, |
| "grad_norm": 0.94921875, |
| "learning_rate": 0.015095082970063208, |
| "loss": 1.8016, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.35171193118017624, |
| "grad_norm": 0.90625, |
| "learning_rate": 0.015052584241296808, |
| "loss": 1.7567, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.35323449365281767, |
| "grad_norm": 0.95703125, |
| "learning_rate": 0.015009962617714423, |
| "loss": 1.736, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.35475705612545916, |
| "grad_norm": 0.96484375, |
| "learning_rate": 0.01496721913600863, |
| "loss": 1.7302, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.3562796185981006, |
| "grad_norm": 0.91015625, |
| "learning_rate": 0.014924354835835983, |
| "loss": 1.7667, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.3578021810707421, |
| "grad_norm": 0.91015625, |
| "learning_rate": 0.014881370759791726, |
| "loss": 1.7681, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.3593247435433835, |
| "grad_norm": 0.92578125, |
| "learning_rate": 0.01483826795338442, |
| "loss": 1.7485, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.360847306016025, |
| "grad_norm": 0.96875, |
| "learning_rate": 0.014795047465010541, |
| "loss": 1.7623, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.3623698684886664, |
| "grad_norm": 0.93359375, |
| "learning_rate": 0.014751710345928941, |
| "loss": 1.7391, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.36389243096130786, |
| "grad_norm": 0.8828125, |
| "learning_rate": 0.01470825765023532, |
| "loss": 1.8042, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.36541499343394934, |
| "grad_norm": 0.98828125, |
| "learning_rate": 0.01466469043483655, |
| "loss": 1.7204, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.3669375559065908, |
| "grad_norm": 0.953125, |
| "learning_rate": 0.014621009759424992, |
| "loss": 1.7449, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.36846011837923226, |
| "grad_norm": 0.921875, |
| "learning_rate": 0.014577216686452718, |
| "loss": 1.7779, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.3699826808518737, |
| "grad_norm": 0.90234375, |
| "learning_rate": 0.014533312281105657, |
| "loss": 1.7248, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.3715052433245152, |
| "grad_norm": 0.94921875, |
| "learning_rate": 0.014489297611277688, |
| "loss": 1.7367, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.3730278057971566, |
| "grad_norm": 0.8984375, |
| "learning_rate": 0.014445173747544678, |
| "loss": 1.7237, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.3745503682697981, |
| "grad_norm": 0.90234375, |
| "learning_rate": 0.01440094176313844, |
| "loss": 1.7188, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.3760729307424395, |
| "grad_norm": 0.9296875, |
| "learning_rate": 0.014356602733920611, |
| "loss": 1.7373, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.37759549321508096, |
| "grad_norm": 0.921875, |
| "learning_rate": 0.014312157738356509, |
| "loss": 1.7323, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.37911805568772244, |
| "grad_norm": 0.87890625, |
| "learning_rate": 0.014267607857488873, |
| "loss": 1.7167, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.3806406181603639, |
| "grad_norm": 0.96875, |
| "learning_rate": 0.0142229541749116, |
| "loss": 1.7172, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.3806406181603639, |
| "eval_loss": 1.7768399715423584, |
| "eval_runtime": 332.6705, |
| "eval_samples_per_second": 51.054, |
| "eval_steps_per_second": 25.527, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.38216318063300536, |
| "grad_norm": 0.828125, |
| "learning_rate": 0.01417819777674336, |
| "loss": 1.7101, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.3836857431056468, |
| "grad_norm": 0.90625, |
| "learning_rate": 0.0141333397516012, |
| "loss": 1.7252, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.3852083055782883, |
| "grad_norm": 0.92578125, |
| "learning_rate": 0.014088381190574052, |
| "loss": 1.7073, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.3867308680509297, |
| "grad_norm": 0.86328125, |
| "learning_rate": 0.014043323187196198, |
| "loss": 1.725, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.3882534305235712, |
| "grad_norm": 0.91796875, |
| "learning_rate": 0.013998166837420672, |
| "loss": 1.6971, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.38977599299621263, |
| "grad_norm": 0.91015625, |
| "learning_rate": 0.013952913239592604, |
| "loss": 1.7083, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.39129855546885406, |
| "grad_norm": 0.91015625, |
| "learning_rate": 0.013907563494422506, |
| "loss": 1.7085, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.39282111794149555, |
| "grad_norm": 0.88671875, |
| "learning_rate": 0.013862118704959498, |
| "loss": 1.7366, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.394343680414137, |
| "grad_norm": 0.90625, |
| "learning_rate": 0.013816579976564467, |
| "loss": 1.6875, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.39586624288677846, |
| "grad_norm": 0.94140625, |
| "learning_rate": 0.013770948416883205, |
| "loss": 1.7264, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.3973888053594199, |
| "grad_norm": 0.8984375, |
| "learning_rate": 0.013725225135819448, |
| "loss": 1.717, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.3989113678320614, |
| "grad_norm": 0.9296875, |
| "learning_rate": 0.013679411245507889, |
| "loss": 1.6989, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.4004339303047028, |
| "grad_norm": 0.91015625, |
| "learning_rate": 0.013633507860287115, |
| "loss": 1.7102, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.40195649277734424, |
| "grad_norm": 0.9375, |
| "learning_rate": 0.013587516096672527, |
| "loss": 1.719, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.40347905524998573, |
| "grad_norm": 0.8984375, |
| "learning_rate": 0.013541437073329155, |
| "loss": 1.676, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.40500161772262716, |
| "grad_norm": 0.94921875, |
| "learning_rate": 0.01349527191104447, |
| "loss": 1.6962, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.40652418019526865, |
| "grad_norm": 0.95703125, |
| "learning_rate": 0.013449021732701105, |
| "loss": 1.723, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.4080467426679101, |
| "grad_norm": 0.9609375, |
| "learning_rate": 0.013402687663249565, |
| "loss": 1.6729, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.40956930514055157, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.013356270829680836, |
| "loss": 1.6872, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.411091867613193, |
| "grad_norm": 0.94140625, |
| "learning_rate": 0.013309772360999006, |
| "loss": 1.6907, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.4126144300858345, |
| "grad_norm": 0.89453125, |
| "learning_rate": 0.01326319338819377, |
| "loss": 1.6979, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.4141369925584759, |
| "grad_norm": 1.0, |
| "learning_rate": 0.013216535044212952, |
| "loss": 1.7048, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.41565955503111734, |
| "grad_norm": 0.9609375, |
| "learning_rate": 0.013169798463934925, |
| "loss": 1.687, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.41718211750375883, |
| "grad_norm": 0.9296875, |
| "learning_rate": 0.01312298478414102, |
| "loss": 1.6912, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.41870467997640026, |
| "grad_norm": 0.875, |
| "learning_rate": 0.013076095143487874, |
| "loss": 1.6743, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.42022724244904175, |
| "grad_norm": 0.87109375, |
| "learning_rate": 0.013029130682479722, |
| "loss": 1.6587, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.4217498049216832, |
| "grad_norm": 0.8828125, |
| "learning_rate": 0.01298209254344068, |
| "loss": 1.6961, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.42327236739432467, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.012934981870486932, |
| "loss": 1.6898, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.4247949298669661, |
| "grad_norm": 0.93359375, |
| "learning_rate": 0.012887799809498932, |
| "loss": 1.6648, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.4263174923396076, |
| "grad_norm": 0.875, |
| "learning_rate": 0.012840547508093506, |
| "loss": 1.6429, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.427840054812249, |
| "grad_norm": 0.9921875, |
| "learning_rate": 0.01279322611559595, |
| "loss": 1.6833, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.42936261728489045, |
| "grad_norm": 0.91796875, |
| "learning_rate": 0.012745836783012075, |
| "loss": 1.6715, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.43088517975753193, |
| "grad_norm": 0.90234375, |
| "learning_rate": 0.01269838066300021, |
| "loss": 1.6232, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.43240774223017336, |
| "grad_norm": 0.98046875, |
| "learning_rate": 0.01265085890984317, |
| "loss": 1.6643, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.43393030470281485, |
| "grad_norm": 0.859375, |
| "learning_rate": 0.012603272679420166, |
| "loss": 1.675, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.4354528671754563, |
| "grad_norm": 0.9140625, |
| "learning_rate": 0.0125556231291787, |
| "loss": 1.6551, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.43697542964809777, |
| "grad_norm": 0.89453125, |
| "learning_rate": 0.012507911418106423, |
| "loss": 1.6683, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.4384979921207392, |
| "grad_norm": 0.90625, |
| "learning_rate": 0.012460138706702928, |
| "loss": 1.6398, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.4400205545933807, |
| "grad_norm": 0.94140625, |
| "learning_rate": 0.012412306156951524, |
| "loss": 1.6415, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.4415431170660221, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.012364414932290986, |
| "loss": 1.5897, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.44306567953866355, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.012316466197587242, |
| "loss": 1.69, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.44458824201130503, |
| "grad_norm": 0.93359375, |
| "learning_rate": 0.01226846111910505, |
| "loss": 1.6568, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.44611080448394647, |
| "grad_norm": 0.94921875, |
| "learning_rate": 0.01222040086447962, |
| "loss": 1.6666, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.44763336695658795, |
| "grad_norm": 0.91796875, |
| "learning_rate": 0.012172286602688227, |
| "loss": 1.6326, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.4491559294292294, |
| "grad_norm": 0.9765625, |
| "learning_rate": 0.012124119504021775, |
| "loss": 1.6519, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.45067849190187087, |
| "grad_norm": 0.9140625, |
| "learning_rate": 0.012075900740056315, |
| "loss": 1.6014, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.4522010543745123, |
| "grad_norm": 0.94921875, |
| "learning_rate": 0.01202763148362457, |
| "loss": 1.6208, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.45372361684715373, |
| "grad_norm": 0.9140625, |
| "learning_rate": 0.011979312908787398, |
| "loss": 1.6346, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.4552461793197952, |
| "grad_norm": 0.921875, |
| "learning_rate": 0.01193094619080524, |
| "loss": 1.6157, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.45676874179243665, |
| "grad_norm": 0.91015625, |
| "learning_rate": 0.011882532506109517, |
| "loss": 1.6455, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.45676874179243665, |
| "eval_loss": 1.669243335723877, |
| "eval_runtime": 332.838, |
| "eval_samples_per_second": 51.028, |
| "eval_steps_per_second": 25.514, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.45829130426507814, |
| "grad_norm": 0.8828125, |
| "learning_rate": 0.011834073032274042, |
| "loss": 1.6234, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.45981386673771957, |
| "grad_norm": 0.95703125, |
| "learning_rate": 0.011785568947986366, |
| "loss": 1.6443, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.46133642921036105, |
| "grad_norm": 0.9921875, |
| "learning_rate": 0.0117370214330191, |
| "loss": 1.6333, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.4628589916830025, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.011688431668201224, |
| "loss": 1.639, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.46438155415564397, |
| "grad_norm": 0.94140625, |
| "learning_rate": 0.011639800835389376, |
| "loss": 1.614, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.4659041166282854, |
| "grad_norm": 0.93359375, |
| "learning_rate": 0.011591130117439093, |
| "loss": 1.6256, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.46742667910092683, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.011542420698176048, |
| "loss": 1.6143, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.4689492415735683, |
| "grad_norm": 0.95703125, |
| "learning_rate": 0.011493673762367245, |
| "loss": 1.6167, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.47047180404620975, |
| "grad_norm": 0.8984375, |
| "learning_rate": 0.011444890495692212, |
| "loss": 1.604, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.47199436651885124, |
| "grad_norm": 0.88671875, |
| "learning_rate": 0.011396072084714166, |
| "loss": 1.5807, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.47351692899149267, |
| "grad_norm": 0.953125, |
| "learning_rate": 0.011347219716851138, |
| "loss": 1.5897, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.47503949146413416, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.011298334580347099, |
| "loss": 1.6162, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.4765620539367756, |
| "grad_norm": 0.98046875, |
| "learning_rate": 0.011249417864243046, |
| "loss": 1.6291, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.4780846164094171, |
| "grad_norm": 0.9375, |
| "learning_rate": 0.011200470758348114, |
| "loss": 1.6292, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.4796071788820585, |
| "grad_norm": 0.98828125, |
| "learning_rate": 0.011151494453210595, |
| "loss": 1.5768, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.48112974135469994, |
| "grad_norm": 0.91796875, |
| "learning_rate": 0.011102490140089008, |
| "loss": 1.559, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.4826523038273414, |
| "grad_norm": 0.91796875, |
| "learning_rate": 0.011053459010923108, |
| "loss": 1.5899, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.48417486629998285, |
| "grad_norm": 0.94921875, |
| "learning_rate": 0.011004402258304916, |
| "loss": 1.5752, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.48569742877262434, |
| "grad_norm": 0.984375, |
| "learning_rate": 0.010955321075449673, |
| "loss": 1.5958, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.48721999124526577, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.010906216656166857, |
| "loss": 1.5972, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.48874255371790726, |
| "grad_norm": 0.94921875, |
| "learning_rate": 0.010857090194831127, |
| "loss": 1.581, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.4902651161905487, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.010807942886353275, |
| "loss": 1.603, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.4917876786631902, |
| "grad_norm": 0.96875, |
| "learning_rate": 0.010758775926151154, |
| "loss": 1.5316, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.4933102411358316, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.010709590510120616, |
| "loss": 1.6024, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.49483280360847304, |
| "grad_norm": 0.953125, |
| "learning_rate": 0.010660387834606414, |
| "loss": 1.5786, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.4963553660811145, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.010611169096373113, |
| "loss": 1.596, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.49787792855375596, |
| "grad_norm": 0.96875, |
| "learning_rate": 0.01056193549257596, |
| "loss": 1.5916, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.49940049102639744, |
| "grad_norm": 0.9453125, |
| "learning_rate": 0.010512688220731791, |
| "loss": 1.5515, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.5009230534990389, |
| "grad_norm": 0.98046875, |
| "learning_rate": 0.010463428478689895, |
| "loss": 1.5383, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.5024456159716804, |
| "grad_norm": 0.96484375, |
| "learning_rate": 0.010414157464602865, |
| "loss": 1.5439, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.5039681784443218, |
| "grad_norm": 0.9453125, |
| "learning_rate": 0.010364876376897467, |
| "loss": 1.5701, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.5054907409169632, |
| "grad_norm": 0.98046875, |
| "learning_rate": 0.010315586414245497, |
| "loss": 1.5591, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.5070133033896047, |
| "grad_norm": 0.97265625, |
| "learning_rate": 0.010266288775534616, |
| "loss": 1.5644, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.5085358658622462, |
| "grad_norm": 1.0, |
| "learning_rate": 0.010216984659839183, |
| "loss": 1.5664, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.5100584283348876, |
| "grad_norm": 0.9765625, |
| "learning_rate": 0.010167675266391103, |
| "loss": 1.5762, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.5115809908075291, |
| "grad_norm": 0.96875, |
| "learning_rate": 0.010118361794550657, |
| "loss": 1.5568, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.5131035532801705, |
| "grad_norm": 1.0, |
| "learning_rate": 0.010069045443777317, |
| "loss": 1.5914, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.514626115752812, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.01001972741360059, |
| "loss": 1.577, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.5161486782254534, |
| "grad_norm": 0.9375, |
| "learning_rate": 0.009970408903590817, |
| "loss": 1.5464, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.5176712406980949, |
| "grad_norm": 0.9765625, |
| "learning_rate": 0.009921091113330026, |
| "loss": 1.5648, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.5191938031707364, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.009871775242382726, |
| "loss": 1.5445, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.5207163656433778, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.009822462490266753, |
| "loss": 1.5704, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.5222389281160192, |
| "grad_norm": 0.98828125, |
| "learning_rate": 0.009773154056424068, |
| "loss": 1.54, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.5237614905886607, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.009723851140191612, |
| "loss": 1.5441, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.5252840530613022, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.009674554940772118, |
| "loss": 1.5875, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.5268066155339436, |
| "grad_norm": 0.94921875, |
| "learning_rate": 0.009625266657204938, |
| "loss": 1.5179, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.5283291780065851, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.009575987488336891, |
| "loss": 1.5209, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.5298517404792266, |
| "grad_norm": 0.97265625, |
| "learning_rate": 0.0095267186327931, |
| "loss": 1.5544, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.5313743029518679, |
| "grad_norm": 0.953125, |
| "learning_rate": 0.009477461288947827, |
| "loss": 1.52, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.5328968654245094, |
| "grad_norm": 0.953125, |
| "learning_rate": 0.009428216654895339, |
| "loss": 1.5613, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.5328968654245094, |
| "eval_loss": 1.5677825212478638, |
| "eval_runtime": 332.5932, |
| "eval_samples_per_second": 51.065, |
| "eval_steps_per_second": 25.533, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.5344194278971509, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.009378985928420762, |
| "loss": 1.5504, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.5359419903697924, |
| "grad_norm": 0.9765625, |
| "learning_rate": 0.009329770306970941, |
| "loss": 1.5457, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.5374645528424338, |
| "grad_norm": 0.984375, |
| "learning_rate": 0.009280570987625327, |
| "loss": 1.5166, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.5389871153150753, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.009231389167066836, |
| "loss": 1.4936, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.5405096777877167, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.009182226041552777, |
| "loss": 1.5515, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.5420322402603582, |
| "grad_norm": 0.94921875, |
| "learning_rate": 0.009133082806885727, |
| "loss": 1.4913, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.5435548027329996, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.009083960658384455, |
| "loss": 1.5148, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.5450773652056411, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.009034860790854849, |
| "loss": 1.5269, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.5465999276782826, |
| "grad_norm": 0.94921875, |
| "learning_rate": 0.008985784398560856, |
| "loss": 1.5088, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.548122490150924, |
| "grad_norm": 1.0, |
| "learning_rate": 0.008936732675195425, |
| "loss": 1.5022, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.5496450526235654, |
| "grad_norm": 0.95703125, |
| "learning_rate": 0.008887706813851483, |
| "loss": 1.5011, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.5511676150962069, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.008838708006992909, |
| "loss": 1.4785, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.5526901775688484, |
| "grad_norm": 1.0, |
| "learning_rate": 0.008789737446425538, |
| "loss": 1.5353, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.5542127400414898, |
| "grad_norm": 0.96484375, |
| "learning_rate": 0.008740796323268157, |
| "loss": 1.5299, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.5557353025141313, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.008691885827923542, |
| "loss": 1.4997, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.5572578649867728, |
| "grad_norm": 0.98828125, |
| "learning_rate": 0.008643007150049509, |
| "loss": 1.5141, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.5587804274594141, |
| "grad_norm": 0.984375, |
| "learning_rate": 0.008594161478529974, |
| "loss": 1.5105, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.5603029899320556, |
| "grad_norm": 0.984375, |
| "learning_rate": 0.008545350001446026, |
| "loss": 1.4957, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.5618255524046971, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.008496573906047047, |
| "loss": 1.4841, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.5633481148773386, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.008447834378721816, |
| "loss": 1.4886, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.56487067734998, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.00839913260496967, |
| "loss": 1.4766, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.5663932398226215, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.008350469769371649, |
| "loss": 1.4882, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.567915802295263, |
| "grad_norm": 0.93359375, |
| "learning_rate": 0.008301847055561704, |
| "loss": 1.4452, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.5694383647679044, |
| "grad_norm": 0.97265625, |
| "learning_rate": 0.008253265646197891, |
| "loss": 1.4803, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.5709609272405458, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.008204726722933618, |
| "loss": 1.4771, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.5724834897131873, |
| "grad_norm": 0.93359375, |
| "learning_rate": 0.00815623146638888, |
| "loss": 1.4307, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.5740060521858288, |
| "grad_norm": 0.98046875, |
| "learning_rate": 0.008107781056121581, |
| "loss": 1.4843, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.5755286146584702, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.00805937667059881, |
| "loss": 1.4712, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.5770511771311116, |
| "grad_norm": 1.0, |
| "learning_rate": 0.008011019487168192, |
| "loss": 1.473, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.5785737396037531, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.007962710682029245, |
| "loss": 1.4614, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.5800963020763946, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.007914451430204777, |
| "loss": 1.4484, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.581618864549036, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.007866242905512305, |
| "loss": 1.4851, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.5831414270216775, |
| "grad_norm": 0.98046875, |
| "learning_rate": 0.007818086280535493, |
| "loss": 1.4475, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.584663989494319, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.007769982726595648, |
| "loss": 1.4766, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.5861865519669603, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.007721933413723224, |
| "loss": 1.4819, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.5877091144396018, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.007673939510629349, |
| "loss": 1.4827, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.5892316769122433, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.00762600218467742, |
| "loss": 1.4513, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.5907542393848848, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.007578122601854693, |
| "loss": 1.4446, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.5922768018575262, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.0075303019267439365, |
| "loss": 1.459, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.5937993643301677, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.007482541322495094, |
| "loss": 1.4533, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.5953219268028092, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.007434841950796987, |
| "loss": 1.4674, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.5968444892754505, |
| "grad_norm": 0.92578125, |
| "learning_rate": 0.007387204971849082, |
| "loss": 1.4355, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.598367051748092, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.00733963154433325, |
| "loss": 1.4452, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.5998896142207335, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.007292122825385585, |
| "loss": 1.4334, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.601412176693375, |
| "grad_norm": 0.98828125, |
| "learning_rate": 0.007244679970568273, |
| "loss": 1.4053, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.6029347391660164, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.007197304133841477, |
| "loss": 1.457, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.6044573016386579, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.007149996467535253, |
| "loss": 1.4421, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.6059798641112993, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.007102758122321557, |
| "loss": 1.4525, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.6075024265839408, |
| "grad_norm": 0.98828125, |
| "learning_rate": 0.007055590247186224, |
| "loss": 1.4069, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.6090249890565822, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.007008493989401038, |
| "loss": 1.4206, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.6090249890565822, |
| "eval_loss": 1.4739612340927124, |
| "eval_runtime": 330.8337, |
| "eval_samples_per_second": 51.337, |
| "eval_steps_per_second": 25.668, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.6105475515292237, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.006961470494495825, |
| "loss": 1.48, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.6120701140018652, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.0069145209062305805, |
| "loss": 1.4043, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.6135926764745065, |
| "grad_norm": 0.96484375, |
| "learning_rate": 0.006867646366567665, |
| "loss": 1.4538, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.615115238947148, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.006820848015644018, |
| "loss": 1.4233, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.6166378014197895, |
| "grad_norm": 0.96875, |
| "learning_rate": 0.006774126991743424, |
| "loss": 1.4104, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.618160363892431, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.006727484431268831, |
| "loss": 1.4257, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.6196829263650724, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.006680921468714718, |
| "loss": 1.4148, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.6212054888377139, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.006634439236639473, |
| "loss": 1.4257, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.6227280513103554, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.006588038865637882, |
| "loss": 1.3982, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.6242506137829967, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.0065417214843135965, |
| "loss": 1.4606, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.6257731762556382, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.006495488219251705, |
| "loss": 1.41, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.6272957387282797, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.006449340194991325, |
| "loss": 1.4392, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.6288183012009212, |
| "grad_norm": 0.98828125, |
| "learning_rate": 0.006403278533998237, |
| "loss": 1.4079, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.6303408636735626, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.006357304356637605, |
| "loss": 1.4112, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.631863426146204, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.006311418781146709, |
| "loss": 1.4297, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.6333859886188455, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.006265622923607759, |
| "loss": 1.4111, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.634908551091487, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.006219917897920726, |
| "loss": 1.4201, |
| "step": 4170 |
| }, |
| { |
| "epoch": 0.6364311135641284, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.006174304815776282, |
| "loss": 1.4279, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.6379536760367699, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.0061287847866287205, |
| "loss": 1.4095, |
| "step": 4190 |
| }, |
| { |
| "epoch": 0.6394762385094114, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.006083358917669012, |
| "loss": 1.3751, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.6409988009820528, |
| "grad_norm": 0.96875, |
| "learning_rate": 0.00603802831379784, |
| "loss": 1.3945, |
| "step": 4210 |
| }, |
| { |
| "epoch": 0.6425213634546942, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.005992794077598747, |
| "loss": 1.4182, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.6440439259273357, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.005947657309311306, |
| "loss": 1.4575, |
| "step": 4230 |
| }, |
| { |
| "epoch": 0.6455664883999772, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.005902619106804368, |
| "loss": 1.3849, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.6470890508726186, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.005857680565549341, |
| "loss": 1.4159, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.6486116133452601, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.005812842778593572, |
| "loss": 1.4028, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.6501341758179016, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.005768106836533726, |
| "loss": 1.4047, |
| "step": 4270 |
| }, |
| { |
| "epoch": 0.6516567382905429, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.005723473827489301, |
| "loss": 1.4104, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.6531793007631844, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.0056789448370761185, |
| "loss": 1.4118, |
| "step": 4290 |
| }, |
| { |
| "epoch": 0.6547018632358259, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.005634520948379951, |
| "loss": 1.3793, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.6562244257084674, |
| "grad_norm": 1.0, |
| "learning_rate": 0.005590203241930157, |
| "loss": 1.4074, |
| "step": 4310 |
| }, |
| { |
| "epoch": 0.6577469881811088, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.005545992795673408, |
| "loss": 1.4017, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.6592695506537503, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.0055018906849474795, |
| "loss": 1.415, |
| "step": 4330 |
| }, |
| { |
| "epoch": 0.6607921131263917, |
| "grad_norm": 0.9765625, |
| "learning_rate": 0.005457897982455072, |
| "loss": 1.4034, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.6623146755990331, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.005414015758237733, |
| "loss": 1.4103, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.6638372380716746, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.005370245079649841, |
| "loss": 1.372, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.6653598005443161, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.005326587011332616, |
| "loss": 1.3938, |
| "step": 4370 |
| }, |
| { |
| "epoch": 0.6668823630169576, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.005283042615188249, |
| "loss": 1.3771, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.668404925489599, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.005239612950354074, |
| "loss": 1.3552, |
| "step": 4390 |
| }, |
| { |
| "epoch": 0.6699274879622404, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.005196299073176771, |
| "loss": 1.3651, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.6714500504348819, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.0051531020371867265, |
| "loss": 1.401, |
| "step": 4410 |
| }, |
| { |
| "epoch": 0.6729726129075234, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.005110022893072361, |
| "loss": 1.3721, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.6744951753801648, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.0050670626886545975, |
| "loss": 1.3655, |
| "step": 4430 |
| }, |
| { |
| "epoch": 0.6760177378528063, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.005024222468861377, |
| "loss": 1.3572, |
| "step": 4440 |
| }, |
| { |
| "epoch": 0.6775403003254478, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.0049815032757022275, |
| "loss": 1.3766, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.6790628627980891, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.004938906148242921, |
| "loss": 1.3632, |
| "step": 4460 |
| }, |
| { |
| "epoch": 0.6805854252707306, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.004896432122580222, |
| "loss": 1.3825, |
| "step": 4470 |
| }, |
| { |
| "epoch": 0.6821079877433721, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.00485408223181666, |
| "loss": 1.3533, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.6836305502160136, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.004811857506035406, |
| "loss": 1.3628, |
| "step": 4490 |
| }, |
| { |
| "epoch": 0.685153112688655, |
| "grad_norm": 1.0, |
| "learning_rate": 0.0047697589722752445, |
| "loss": 1.3787, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.685153112688655, |
| "eval_loss": 1.4034804105758667, |
| "eval_runtime": 332.3993, |
| "eval_samples_per_second": 51.095, |
| "eval_steps_per_second": 25.548, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.6866756751612965, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.004727787654505539, |
| "loss": 1.3787, |
| "step": 4510 |
| }, |
| { |
| "epoch": 0.688198237633938, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.0046859445736013895, |
| "loss": 1.3493, |
| "step": 4520 |
| }, |
| { |
| "epoch": 0.6897208001065793, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.00464423074731875, |
| "loss": 1.3848, |
| "step": 4530 |
| }, |
| { |
| "epoch": 0.6912433625792208, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.004602647190269701, |
| "loss": 1.3502, |
| "step": 4540 |
| }, |
| { |
| "epoch": 0.6927659250518623, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.004561194913897766, |
| "loss": 1.4067, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.6942884875245038, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.004519874926453302, |
| "loss": 1.3637, |
| "step": 4560 |
| }, |
| { |
| "epoch": 0.6958110499971452, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.004478688232968981, |
| "loss": 1.3609, |
| "step": 4570 |
| }, |
| { |
| "epoch": 0.6973336124697866, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.004437635835235353, |
| "loss": 1.3606, |
| "step": 4580 |
| }, |
| { |
| "epoch": 0.6988561749424281, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.0043967187317764615, |
| "loss": 1.3905, |
| "step": 4590 |
| }, |
| { |
| "epoch": 0.7003787374150695, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.004355937917825566, |
| "loss": 1.3614, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.701901299887711, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.004315294385300951, |
| "loss": 1.364, |
| "step": 4610 |
| }, |
| { |
| "epoch": 0.7034238623603525, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.004274789122781753, |
| "loss": 1.3518, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.704946424832994, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.004234423115483971, |
| "loss": 1.3544, |
| "step": 4630 |
| }, |
| { |
| "epoch": 0.7064689873056353, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.004194197345236467, |
| "loss": 1.3599, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.7079915497782768, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.004154112790457089, |
| "loss": 1.3559, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.7095141122509183, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.0041141704261288955, |
| "loss": 1.362, |
| "step": 4660 |
| }, |
| { |
| "epoch": 0.7110366747235598, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.004074371223776407, |
| "loss": 1.3175, |
| "step": 4670 |
| }, |
| { |
| "epoch": 0.7125592371962012, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.004034716151441996, |
| "loss": 1.3564, |
| "step": 4680 |
| }, |
| { |
| "epoch": 0.7140817996688427, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.003995206173662348, |
| "loss": 1.3867, |
| "step": 4690 |
| }, |
| { |
| "epoch": 0.7156043621414842, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.003955842251444978, |
| "loss": 1.3807, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.7171269246141255, |
| "grad_norm": 0.98828125, |
| "learning_rate": 0.003916625342244869, |
| "loss": 1.3359, |
| "step": 4710 |
| }, |
| { |
| "epoch": 0.718649487086767, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.0038775563999411955, |
| "loss": 1.3355, |
| "step": 4720 |
| }, |
| { |
| "epoch": 0.7201720495594085, |
| "grad_norm": 0.98828125, |
| "learning_rate": 0.0038386363748140894, |
| "loss": 1.3593, |
| "step": 4730 |
| }, |
| { |
| "epoch": 0.72169461203205, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.003799866213521568, |
| "loss": 1.3215, |
| "step": 4740 |
| }, |
| { |
| "epoch": 0.7232171745046914, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.0037612468590764695, |
| "loss": 1.3447, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.7247397369773328, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.003722779250823538, |
| "loss": 1.344, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.7262622994499743, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.0036844643244165775, |
| "loss": 1.3388, |
| "step": 4770 |
| }, |
| { |
| "epoch": 0.7277848619226157, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.0036463030117956795, |
| "loss": 1.3718, |
| "step": 4780 |
| }, |
| { |
| "epoch": 0.7293074243952572, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.0036082962411645614, |
| "loss": 1.3073, |
| "step": 4790 |
| }, |
| { |
| "epoch": 0.7308299868678987, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.0035704449369680005, |
| "loss": 1.3638, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.7323525493405402, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.0035327500198693287, |
| "loss": 1.3734, |
| "step": 4810 |
| }, |
| { |
| "epoch": 0.7338751118131815, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.0034952124067280555, |
| "loss": 1.3498, |
| "step": 4820 |
| }, |
| { |
| "epoch": 0.735397674285823, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.003457833010577558, |
| "loss": 1.3421, |
| "step": 4830 |
| }, |
| { |
| "epoch": 0.7369202367584645, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.003420612740602874, |
| "loss": 1.3433, |
| "step": 4840 |
| }, |
| { |
| "epoch": 0.738442799231106, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.003383552502118602, |
| "loss": 1.3657, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.7399653617037474, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.003346653196546855, |
| "loss": 1.3565, |
| "step": 4860 |
| }, |
| { |
| "epoch": 0.7414879241763889, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.0033099157213953502, |
| "loss": 1.33, |
| "step": 4870 |
| }, |
| { |
| "epoch": 0.7430104866490304, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.00327334097023559, |
| "loss": 1.3577, |
| "step": 4880 |
| }, |
| { |
| "epoch": 0.7445330491216717, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.0032369298326811024, |
| "loss": 1.3191, |
| "step": 4890 |
| }, |
| { |
| "epoch": 0.7460556115943132, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.0032006831943658153, |
| "loss": 1.3329, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.7475781740669547, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.003164601936922528, |
| "loss": 1.3318, |
| "step": 4910 |
| }, |
| { |
| "epoch": 0.7491007365395962, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.003128686937961438, |
| "loss": 1.3116, |
| "step": 4920 |
| }, |
| { |
| "epoch": 0.7506232990122376, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.0030929390710488303, |
| "loss": 1.3419, |
| "step": 4930 |
| }, |
| { |
| "epoch": 0.752145861484879, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.003057359205685788, |
| "loss": 1.3215, |
| "step": 4940 |
| }, |
| { |
| "epoch": 0.7536684239575205, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.0030219482072870764, |
| "loss": 1.3114, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.7551909864301619, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.0029867069371600895, |
| "loss": 1.3132, |
| "step": 4960 |
| }, |
| { |
| "epoch": 0.7567135489028034, |
| "grad_norm": 1.0, |
| "learning_rate": 0.0029516362524838846, |
| "loss": 1.3154, |
| "step": 4970 |
| }, |
| { |
| "epoch": 0.7582361113754449, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.0029167370062883405, |
| "loss": 1.3152, |
| "step": 4980 |
| }, |
| { |
| "epoch": 0.7597586738480864, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.0028820100474334187, |
| "loss": 1.32, |
| "step": 4990 |
| }, |
| { |
| "epoch": 0.7612812363207277, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.002847456220588498, |
| "loss": 1.3324, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.7612812363207277, |
| "eval_loss": 1.3596354722976685, |
| "eval_runtime": 330.8649, |
| "eval_samples_per_second": 51.332, |
| "eval_steps_per_second": 25.666, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.7628037987933692, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.0028130763662118498, |
| "loss": 1.3181, |
| "step": 5010 |
| }, |
| { |
| "epoch": 0.7643263612660107, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.0027788713205301775, |
| "loss": 1.317, |
| "step": 5020 |
| }, |
| { |
| "epoch": 0.7658489237386521, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.0027448419155182858, |
| "loss": 1.3055, |
| "step": 5030 |
| }, |
| { |
| "epoch": 0.7673714862112936, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.002710988978878851, |
| "loss": 1.3281, |
| "step": 5040 |
| }, |
| { |
| "epoch": 0.7688940486839351, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.002677313334022268, |
| "loss": 1.3368, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.7704166111565766, |
| "grad_norm": 1.0, |
| "learning_rate": 0.0026438158000466404, |
| "loss": 1.3504, |
| "step": 5060 |
| }, |
| { |
| "epoch": 0.7719391736292179, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.002610497191717861, |
| "loss": 1.3228, |
| "step": 5070 |
| }, |
| { |
| "epoch": 0.7734617361018594, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.0025773583194497705, |
| "loss": 1.3179, |
| "step": 5080 |
| }, |
| { |
| "epoch": 0.7749842985745009, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.002544399989284476, |
| "loss": 1.3058, |
| "step": 5090 |
| }, |
| { |
| "epoch": 0.7765068610471424, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.0025116230028727183, |
| "loss": 1.34, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.7780294235197838, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.002479028157454387, |
| "loss": 1.3391, |
| "step": 5110 |
| }, |
| { |
| "epoch": 0.7795519859924253, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.002446616245839136, |
| "loss": 1.3133, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.7810745484650667, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.002414388056387079, |
| "loss": 1.3257, |
| "step": 5130 |
| }, |
| { |
| "epoch": 0.7825971109377081, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.00238234437298963, |
| "loss": 1.3274, |
| "step": 5140 |
| }, |
| { |
| "epoch": 0.7841196734103496, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.0023504859750504425, |
| "loss": 1.33, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.7856422358829911, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.0023188136374664224, |
| "loss": 1.3209, |
| "step": 5160 |
| }, |
| { |
| "epoch": 0.7871647983556326, |
| "grad_norm": 1.125, |
| "learning_rate": 0.002287328130608919, |
| "loss": 1.3023, |
| "step": 5170 |
| }, |
| { |
| "epoch": 0.788687360828274, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.0022560302203049575, |
| "loss": 1.317, |
| "step": 5180 |
| }, |
| { |
| "epoch": 0.7902099233009154, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.0022249206678186216, |
| "loss": 1.316, |
| "step": 5190 |
| }, |
| { |
| "epoch": 0.7917324857735569, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.002194000229832547, |
| "loss": 1.3339, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.7932550482461983, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.0021632696584294965, |
| "loss": 1.2943, |
| "step": 5210 |
| }, |
| { |
| "epoch": 0.7947776107188398, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.0021327297010740797, |
| "loss": 1.3415, |
| "step": 5220 |
| }, |
| { |
| "epoch": 0.7963001731914813, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.002102381100594577, |
| "loss": 1.3094, |
| "step": 5230 |
| }, |
| { |
| "epoch": 0.7978227356641228, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.002072224595164859, |
| "loss": 1.3142, |
| "step": 5240 |
| }, |
| { |
| "epoch": 0.7993452981367641, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.0020422609182864336, |
| "loss": 1.3341, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.8008678606094056, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.0020124907987706243, |
| "loss": 1.3107, |
| "step": 5260 |
| }, |
| { |
| "epoch": 0.8023904230820471, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.0019829149607208064, |
| "loss": 1.3188, |
| "step": 5270 |
| }, |
| { |
| "epoch": 0.8039129855546885, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.0019535341235148353, |
| "loss": 1.3194, |
| "step": 5280 |
| }, |
| { |
| "epoch": 0.80543554802733, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.0019243490017875164, |
| "loss": 1.3211, |
| "step": 5290 |
| }, |
| { |
| "epoch": 0.8069581104999715, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.0018953603054132429, |
| "loss": 1.3, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.808480672972613, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.0018665687394887232, |
| "loss": 1.2797, |
| "step": 5310 |
| }, |
| { |
| "epoch": 0.8100032354452543, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.001837975004315826, |
| "loss": 1.3289, |
| "step": 5320 |
| }, |
| { |
| "epoch": 0.8115257979178958, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.0018095797953845505, |
| "loss": 1.2976, |
| "step": 5330 |
| }, |
| { |
| "epoch": 0.8130483603905373, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.0017813838033561191, |
| "loss": 1.2846, |
| "step": 5340 |
| }, |
| { |
| "epoch": 0.8145709228631788, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.0017533877140461585, |
| "loss": 1.3132, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.8160934853358202, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.0017255922084080367, |
| "loss": 1.3321, |
| "step": 5360 |
| }, |
| { |
| "epoch": 0.8176160478084616, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.0016979979625162888, |
| "loss": 1.3128, |
| "step": 5370 |
| }, |
| { |
| "epoch": 0.8191386102811031, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.0016706056475501764, |
| "loss": 1.3186, |
| "step": 5380 |
| }, |
| { |
| "epoch": 0.8206611727537445, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00164341592977737, |
| "loss": 1.2902, |
| "step": 5390 |
| }, |
| { |
| "epoch": 0.822183735226386, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.0016164294705377292, |
| "loss": 1.3005, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.8237062976990275, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.0015896469262272218, |
| "loss": 1.2982, |
| "step": 5410 |
| }, |
| { |
| "epoch": 0.825228860171669, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.0015630689482819715, |
| "loss": 1.3086, |
| "step": 5420 |
| }, |
| { |
| "epoch": 0.8267514226443103, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.0015366961831623882, |
| "loss": 1.2775, |
| "step": 5430 |
| }, |
| { |
| "epoch": 0.8282739851169518, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.0015105292723374632, |
| "loss": 1.2755, |
| "step": 5440 |
| }, |
| { |
| "epoch": 0.8297965475895933, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.0014845688522691647, |
| "loss": 1.3371, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.8313191100622347, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.0014588155543969461, |
| "loss": 1.3178, |
| "step": 5460 |
| }, |
| { |
| "epoch": 0.8328416725348762, |
| "grad_norm": 1.125, |
| "learning_rate": 0.001433270005122399, |
| "loss": 1.3009, |
| "step": 5470 |
| }, |
| { |
| "epoch": 0.8343642350075177, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.0014079328257940104, |
| "loss": 1.2848, |
| "step": 5480 |
| }, |
| { |
| "epoch": 0.8358867974801591, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.0013828046326920496, |
| "loss": 1.3258, |
| "step": 5490 |
| }, |
| { |
| "epoch": 0.8374093599528005, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.0013578860370135881, |
| "loss": 1.3133, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.8374093599528005, |
| "eval_loss": 1.3418630361557007, |
| "eval_runtime": 330.9458, |
| "eval_samples_per_second": 51.32, |
| "eval_steps_per_second": 25.66, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.838931922425442, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.0013331776448576194, |
| "loss": 1.303, |
| "step": 5510 |
| }, |
| { |
| "epoch": 0.8404544848980835, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.001308680057210322, |
| "loss": 1.3289, |
| "step": 5520 |
| }, |
| { |
| "epoch": 0.841977047370725, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.001284393869930448, |
| "loss": 1.3162, |
| "step": 5530 |
| }, |
| { |
| "epoch": 0.8434996098433664, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.001260319673734821, |
| "loss": 1.2969, |
| "step": 5540 |
| }, |
| { |
| "epoch": 0.8450221723160078, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.0012364580541839698, |
| "loss": 1.339, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.8465447347886493, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.0012128095916678927, |
| "loss": 1.3151, |
| "step": 5560 |
| }, |
| { |
| "epoch": 0.8480672972612907, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.001189374861391932, |
| "loss": 1.293, |
| "step": 5570 |
| }, |
| { |
| "epoch": 0.8495898597339322, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.0011661544333627849, |
| "loss": 1.3107, |
| "step": 5580 |
| }, |
| { |
| "epoch": 0.8511124222065737, |
| "grad_norm": 1.125, |
| "learning_rate": 0.001143148872374643, |
| "loss": 1.3048, |
| "step": 5590 |
| }, |
| { |
| "epoch": 0.8526349846792152, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.0011203587379954505, |
| "loss": 1.3071, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.8541575471518565, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.0010977845845533008, |
| "loss": 1.3039, |
| "step": 5610 |
| }, |
| { |
| "epoch": 0.855680109624498, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.0010754269611229427, |
| "loss": 1.2884, |
| "step": 5620 |
| }, |
| { |
| "epoch": 0.8572026720971395, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.0010532864115124318, |
| "loss": 1.3189, |
| "step": 5630 |
| }, |
| { |
| "epoch": 0.8587252345697809, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.0010313634742499067, |
| "loss": 1.3361, |
| "step": 5640 |
| }, |
| { |
| "epoch": 0.8602477970424224, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.00100965868257048, |
| "loss": 1.3082, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.8617703595150639, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.0009881725644032757, |
| "loss": 1.2981, |
| "step": 5660 |
| }, |
| { |
| "epoch": 0.8632929219877054, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.0009669056423585932, |
| "loss": 1.3029, |
| "step": 5670 |
| }, |
| { |
| "epoch": 0.8648154844603467, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.000945858433715181, |
| "loss": 1.2981, |
| "step": 5680 |
| }, |
| { |
| "epoch": 0.8663380469329882, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.0009250314504076684, |
| "loss": 1.3538, |
| "step": 5690 |
| }, |
| { |
| "epoch": 0.8678606094056297, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.0009044251990141061, |
| "loss": 1.3062, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.8693831718782711, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.000884040180743646, |
| "loss": 1.2683, |
| "step": 5710 |
| }, |
| { |
| "epoch": 0.8709057343509126, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.0008638768914243589, |
| "loss": 1.2986, |
| "step": 5720 |
| }, |
| { |
| "epoch": 0.872428296823554, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.0008439358214911586, |
| "loss": 1.3151, |
| "step": 5730 |
| }, |
| { |
| "epoch": 0.8739508592961955, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.0008242174559738802, |
| "loss": 1.3356, |
| "step": 5740 |
| }, |
| { |
| "epoch": 0.8754734217688369, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.0008047222744854942, |
| "loss": 1.2749, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.8769959842414784, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.0007854507512104192, |
| "loss": 1.3067, |
| "step": 5760 |
| }, |
| { |
| "epoch": 0.8785185467141199, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.0007664033548930016, |
| "loss": 1.3232, |
| "step": 5770 |
| }, |
| { |
| "epoch": 0.8800411091867614, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.000747580548826119, |
| "loss": 1.3296, |
| "step": 5780 |
| }, |
| { |
| "epoch": 0.8815636716594027, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.000728982790839895, |
| "loss": 1.2851, |
| "step": 5790 |
| }, |
| { |
| "epoch": 0.8830862341320442, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.0007106105332905777, |
| "loss": 1.2947, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.8846087966046857, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.0006924642230495315, |
| "loss": 1.3003, |
| "step": 5810 |
| }, |
| { |
| "epoch": 0.8861313590773271, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.0006745443014923658, |
| "loss": 1.295, |
| "step": 5820 |
| }, |
| { |
| "epoch": 0.8876539215499686, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.0006568512044882057, |
| "loss": 1.3155, |
| "step": 5830 |
| }, |
| { |
| "epoch": 0.8891764840226101, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.0006393853623890833, |
| "loss": 1.3215, |
| "step": 5840 |
| }, |
| { |
| "epoch": 0.8906990464952516, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.0006221472000194739, |
| "loss": 1.2914, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.8922216089678929, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.0006051371366659642, |
| "loss": 1.3099, |
| "step": 5860 |
| }, |
| { |
| "epoch": 0.8937441714405344, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.0005883555860670487, |
| "loss": 1.3034, |
| "step": 5870 |
| }, |
| { |
| "epoch": 0.8952667339131759, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.0005718029564030702, |
| "loss": 1.3146, |
| "step": 5880 |
| }, |
| { |
| "epoch": 0.8967892963858173, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.0005554796502862957, |
| "loss": 1.3074, |
| "step": 5890 |
| }, |
| { |
| "epoch": 0.8983118588584588, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.0005393860647511129, |
| "loss": 1.3068, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.8998344213311003, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.0005235225912443808, |
| "loss": 1.3098, |
| "step": 5910 |
| }, |
| { |
| "epoch": 0.9013569838037417, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.0005078896156159074, |
| "loss": 1.3097, |
| "step": 5920 |
| }, |
| { |
| "epoch": 0.9028795462763831, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.0004924875181090627, |
| "loss": 1.3344, |
| "step": 5930 |
| }, |
| { |
| "epoch": 0.9044021087490246, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.00047731667335153326, |
| "loss": 1.2884, |
| "step": 5940 |
| }, |
| { |
| "epoch": 0.9059246712216661, |
| "grad_norm": 1.0078125, |
| "learning_rate": 0.0004623774503462064, |
| "loss": 1.2927, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.9074472336943075, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.0004476702124621956, |
| "loss": 1.3137, |
| "step": 5960 |
| }, |
| { |
| "epoch": 0.908969796166949, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.00043319531742600507, |
| "loss": 1.3188, |
| "step": 5970 |
| }, |
| { |
| "epoch": 0.9104923586395904, |
| "grad_norm": 1.0, |
| "learning_rate": 0.0004189531173128258, |
| "loss": 1.3023, |
| "step": 5980 |
| }, |
| { |
| "epoch": 0.9120149211122319, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.0004049439585379733, |
| "loss": 1.3314, |
| "step": 5990 |
| }, |
| { |
| "epoch": 0.9135374835848733, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.00039116818184846137, |
| "loss": 1.2929, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.9135374835848733, |
| "eval_loss": 1.338244915008545, |
| "eval_runtime": 331.5876, |
| "eval_samples_per_second": 51.22, |
| "eval_steps_per_second": 25.61, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.9150600460575148, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.0003776261223147126, |
| "loss": 1.2987, |
| "step": 6010 |
| }, |
| { |
| "epoch": 0.9165826085301563, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.00036431810932241015, |
| "loss": 1.306, |
| "step": 6020 |
| }, |
| { |
| "epoch": 0.9181051710027978, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.0003512444665644865, |
| "loss": 1.3156, |
| "step": 6030 |
| }, |
| { |
| "epoch": 0.9196277334754391, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.00033840551203324855, |
| "loss": 1.338, |
| "step": 6040 |
| }, |
| { |
| "epoch": 0.9211502959480806, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.000325801558012645, |
| "loss": 1.3182, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.9226728584207221, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.0003134329110706691, |
| "loss": 1.301, |
| "step": 6060 |
| }, |
| { |
| "epoch": 0.9241954208933635, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.0003012998720519011, |
| "loss": 1.3151, |
| "step": 6070 |
| }, |
| { |
| "epoch": 0.925717983366005, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.0002894027360701945, |
| "loss": 1.3417, |
| "step": 6080 |
| }, |
| { |
| "epoch": 0.9272405458386465, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.0002777417925014913, |
| "loss": 1.321, |
| "step": 6090 |
| }, |
| { |
| "epoch": 0.9287631083112879, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.0002663173249767936, |
| "loss": 1.3347, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.9302856707839293, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00025512961137525217, |
| "loss": 1.2924, |
| "step": 6110 |
| }, |
| { |
| "epoch": 0.9318082332565708, |
| "grad_norm": 1.125, |
| "learning_rate": 0.00024417892381741857, |
| "loss": 1.3502, |
| "step": 6120 |
| }, |
| { |
| "epoch": 0.9333307957292123, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.00023346552865862182, |
| "loss": 1.2897, |
| "step": 6130 |
| }, |
| { |
| "epoch": 0.9348533582018537, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.0002229896864824865, |
| "loss": 1.3307, |
| "step": 6140 |
| }, |
| { |
| "epoch": 0.9363759206744952, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.00021275165209460047, |
| "loss": 1.2939, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.9378984831471366, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.00020275167451631716, |
| "loss": 1.2911, |
| "step": 6160 |
| }, |
| { |
| "epoch": 0.9394210456197781, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.00019298999697868967, |
| "loss": 1.3012, |
| "step": 6170 |
| }, |
| { |
| "epoch": 0.9409436080924195, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.00018346685691656762, |
| "loss": 1.2888, |
| "step": 6180 |
| }, |
| { |
| "epoch": 0.942466170565061, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.0001741824859628116, |
| "loss": 1.2922, |
| "step": 6190 |
| }, |
| { |
| "epoch": 0.9439887330377025, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.0001651371099426624, |
| "loss": 1.335, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.945511295510344, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.00015633094886825184, |
| "loss": 1.3128, |
| "step": 6210 |
| }, |
| { |
| "epoch": 0.9470338579829853, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.00014776421693324604, |
| "loss": 1.2918, |
| "step": 6220 |
| }, |
| { |
| "epoch": 0.9485564204556268, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.00013943712250763851, |
| "loss": 1.2868, |
| "step": 6230 |
| }, |
| { |
| "epoch": 0.9500789829282683, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.00013134986813267968, |
| "loss": 1.3157, |
| "step": 6240 |
| }, |
| { |
| "epoch": 0.9516015454009097, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.00012350265051595534, |
| "loss": 1.2916, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.9531241078735512, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.00011589566052659594, |
| "loss": 1.3327, |
| "step": 6260 |
| }, |
| { |
| "epoch": 0.9546466703461927, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00010852908319063826, |
| "loss": 1.3101, |
| "step": 6270 |
| }, |
| { |
| "epoch": 0.9561692328188341, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.00010140309768652211, |
| "loss": 1.2918, |
| "step": 6280 |
| }, |
| { |
| "epoch": 0.9576917952914755, |
| "grad_norm": 1.03125, |
| "learning_rate": 9.451787734073514e-05, |
| "loss": 1.2713, |
| "step": 6290 |
| }, |
| { |
| "epoch": 0.959214357764117, |
| "grad_norm": 1.046875, |
| "learning_rate": 8.787358962359493e-05, |
| "loss": 1.3014, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.9607369202367585, |
| "grad_norm": 1.09375, |
| "learning_rate": 8.147039614517571e-05, |
| "loss": 1.3265, |
| "step": 6310 |
| }, |
| { |
| "epoch": 0.9622594827093999, |
| "grad_norm": 1.09375, |
| "learning_rate": 7.53084526513781e-05, |
| "loss": 1.2993, |
| "step": 6320 |
| }, |
| { |
| "epoch": 0.9637820451820414, |
| "grad_norm": 1.0546875, |
| "learning_rate": 6.938790902014325e-05, |
| "loss": 1.3177, |
| "step": 6330 |
| }, |
| { |
| "epoch": 0.9653046076546828, |
| "grad_norm": 1.0625, |
| "learning_rate": 6.370890925779915e-05, |
| "loss": 1.2914, |
| "step": 6340 |
| }, |
| { |
| "epoch": 0.9668271701273243, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.827159149556893e-05, |
| "loss": 1.3135, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.9683497325999657, |
| "grad_norm": 1.078125, |
| "learning_rate": 5.307608798620245e-05, |
| "loss": 1.3195, |
| "step": 6360 |
| }, |
| { |
| "epoch": 0.9698722950726072, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.8122525100765534e-05, |
| "loss": 1.3112, |
| "step": 6370 |
| }, |
| { |
| "epoch": 0.9713948575452487, |
| "grad_norm": 1.03125, |
| "learning_rate": 4.341102332556024e-05, |
| "loss": 1.3055, |
| "step": 6380 |
| }, |
| { |
| "epoch": 0.9729174200178901, |
| "grad_norm": 1.0546875, |
| "learning_rate": 3.8941697259199385e-05, |
| "loss": 1.2961, |
| "step": 6390 |
| }, |
| { |
| "epoch": 0.9744399824905315, |
| "grad_norm": 1.015625, |
| "learning_rate": 3.471465560981768e-05, |
| "loss": 1.3145, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.975962544963173, |
| "grad_norm": 1.0, |
| "learning_rate": 3.073000119242608e-05, |
| "loss": 1.3125, |
| "step": 6410 |
| }, |
| { |
| "epoch": 0.9774851074358145, |
| "grad_norm": 1.046875, |
| "learning_rate": 2.6987830926412658e-05, |
| "loss": 1.3009, |
| "step": 6420 |
| }, |
| { |
| "epoch": 0.9790076699084559, |
| "grad_norm": 1.1015625, |
| "learning_rate": 2.348823583318338e-05, |
| "loss": 1.3013, |
| "step": 6430 |
| }, |
| { |
| "epoch": 0.9805302323810974, |
| "grad_norm": 1.1015625, |
| "learning_rate": 2.0231301033951655e-05, |
| "loss": 1.3234, |
| "step": 6440 |
| }, |
| { |
| "epoch": 0.9820527948537389, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.721710574766333e-05, |
| "loss": 1.2958, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.9835753573263804, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.4445723289072676e-05, |
| "loss": 1.2891, |
| "step": 6460 |
| }, |
| { |
| "epoch": 0.9850979197990217, |
| "grad_norm": 1.1640625, |
| "learning_rate": 1.191722106696158e-05, |
| "loss": 1.3166, |
| "step": 6470 |
| }, |
| { |
| "epoch": 0.9866204822716632, |
| "grad_norm": 1.0703125, |
| "learning_rate": 9.631660582491986e-06, |
| "loss": 1.322, |
| "step": 6480 |
| }, |
| { |
| "epoch": 0.9881430447443047, |
| "grad_norm": 1.0703125, |
| "learning_rate": 7.589097427720404e-06, |
| "loss": 1.3266, |
| "step": 6490 |
| }, |
| { |
| "epoch": 0.9896656072169461, |
| "grad_norm": 1.1015625, |
| "learning_rate": 5.789581284235679e-06, |
| "loss": 1.3226, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.9896656072169461, |
| "eval_loss": 1.3380595445632935, |
| "eval_runtime": 333.6502, |
| "eval_samples_per_second": 50.904, |
| "eval_steps_per_second": 25.452, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.9911881696895876, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.233155921957721e-06, |
| "loss": 1.2931, |
| "step": 6510 |
| }, |
| { |
| "epoch": 0.992710732162229, |
| "grad_norm": 1.0078125, |
| "learning_rate": 2.9198591980705847e-06, |
| "loss": 1.3047, |
| "step": 6520 |
| }, |
| { |
| "epoch": 0.9942332946348705, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.8497230560998724e-06, |
| "loss": 1.3187, |
| "step": 6530 |
| }, |
| { |
| "epoch": 0.9957558571075119, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.0227735251400194e-06, |
| "loss": 1.2968, |
| "step": 6540 |
| }, |
| { |
| "epoch": 0.9972784195801534, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.3903071921480575e-07, |
| "loss": 1.3062, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.9988009820527949, |
| "grad_norm": 1.0390625, |
| "learning_rate": 9.850883679662914e-08, |
| "loss": 1.3219, |
| "step": 6560 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 6568, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.872004116416299e+18, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|