| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 6695, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0014936798670624918, | |
| "grad_norm": 184.9497833251953, | |
| "learning_rate": 1.3432835820895523e-07, | |
| "loss": 4.6248, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0029873597341249837, | |
| "grad_norm": 123.1812744140625, | |
| "learning_rate": 2.8358208955223886e-07, | |
| "loss": 4.6243, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.004481039601187476, | |
| "grad_norm": 195.16688537597656, | |
| "learning_rate": 4.3283582089552244e-07, | |
| "loss": 4.2232, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.005974719468249967, | |
| "grad_norm": 90.20713806152344, | |
| "learning_rate": 5.82089552238806e-07, | |
| "loss": 3.2697, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.007468399335312459, | |
| "grad_norm": 38.96065139770508, | |
| "learning_rate": 7.313432835820897e-07, | |
| "loss": 2.3537, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.008962079202374951, | |
| "grad_norm": 18.04777717590332, | |
| "learning_rate": 8.805970149253732e-07, | |
| "loss": 1.4564, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.010455759069437442, | |
| "grad_norm": 14.39755630493164, | |
| "learning_rate": 1.0298507462686568e-06, | |
| "loss": 1.1459, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.011949438936499935, | |
| "grad_norm": 14.463386535644531, | |
| "learning_rate": 1.1791044776119403e-06, | |
| "loss": 0.9877, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.013443118803562427, | |
| "grad_norm": 15.058645248413086, | |
| "learning_rate": 1.3283582089552241e-06, | |
| "loss": 0.8928, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.014936798670624918, | |
| "grad_norm": 14.862004280090332, | |
| "learning_rate": 1.4776119402985075e-06, | |
| "loss": 0.7928, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01643047853768741, | |
| "grad_norm": 14.284353256225586, | |
| "learning_rate": 1.626865671641791e-06, | |
| "loss": 0.7325, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.017924158404749903, | |
| "grad_norm": 13.038077354431152, | |
| "learning_rate": 1.7761194029850749e-06, | |
| "loss": 0.6793, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.019417838271812395, | |
| "grad_norm": 12.484222412109375, | |
| "learning_rate": 1.9253731343283582e-06, | |
| "loss": 0.6044, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.020911518138874884, | |
| "grad_norm": 12.41592788696289, | |
| "learning_rate": 2.074626865671642e-06, | |
| "loss": 0.5792, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.022405198005937377, | |
| "grad_norm": 10.647322654724121, | |
| "learning_rate": 2.2238805970149254e-06, | |
| "loss": 0.5225, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.02389887787299987, | |
| "grad_norm": 10.59830093383789, | |
| "learning_rate": 2.373134328358209e-06, | |
| "loss": 0.4818, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.02539255774006236, | |
| "grad_norm": 8.66019058227539, | |
| "learning_rate": 2.5223880597014925e-06, | |
| "loss": 0.4556, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.026886237607124854, | |
| "grad_norm": 7.149711608886719, | |
| "learning_rate": 2.6716417910447763e-06, | |
| "loss": 0.3993, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.028379917474187343, | |
| "grad_norm": 5.81325101852417, | |
| "learning_rate": 2.82089552238806e-06, | |
| "loss": 0.4024, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.029873597341249836, | |
| "grad_norm": 5.245195388793945, | |
| "learning_rate": 2.9701492537313435e-06, | |
| "loss": 0.4011, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.03136727720831233, | |
| "grad_norm": 5.920019149780273, | |
| "learning_rate": 3.1194029850746273e-06, | |
| "loss": 0.3821, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.03286095707537482, | |
| "grad_norm": 4.425774097442627, | |
| "learning_rate": 3.2686567164179106e-06, | |
| "loss": 0.3863, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.03435463694243731, | |
| "grad_norm": 2.886918067932129, | |
| "learning_rate": 3.417910447761194e-06, | |
| "loss": 0.3741, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.035848316809499806, | |
| "grad_norm": 6.619475841522217, | |
| "learning_rate": 3.5671641791044782e-06, | |
| "loss": 0.3674, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.037341996676562295, | |
| "grad_norm": 5.154594898223877, | |
| "learning_rate": 3.7164179104477616e-06, | |
| "loss": 0.3615, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.03883567654362479, | |
| "grad_norm": 4.2160820960998535, | |
| "learning_rate": 3.865671641791045e-06, | |
| "loss": 0.3373, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.04032935641068728, | |
| "grad_norm": 2.998599052429199, | |
| "learning_rate": 4.014925373134328e-06, | |
| "loss": 0.3669, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.04182303627774977, | |
| "grad_norm": 4.233543395996094, | |
| "learning_rate": 4.1641791044776125e-06, | |
| "loss": 0.367, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.043316716144812265, | |
| "grad_norm": 4.081936359405518, | |
| "learning_rate": 4.313432835820896e-06, | |
| "loss": 0.3581, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.044810396011874754, | |
| "grad_norm": 3.0351035594940186, | |
| "learning_rate": 4.462686567164179e-06, | |
| "loss": 0.3378, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.04630407587893725, | |
| "grad_norm": 5.17939567565918, | |
| "learning_rate": 4.611940298507463e-06, | |
| "loss": 0.3771, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.04779775574599974, | |
| "grad_norm": 2.004800319671631, | |
| "learning_rate": 4.761194029850746e-06, | |
| "loss": 0.334, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.04929143561306223, | |
| "grad_norm": 4.026329040527344, | |
| "learning_rate": 4.91044776119403e-06, | |
| "loss": 0.3373, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.05078511548012472, | |
| "grad_norm": 3.8416078090667725, | |
| "learning_rate": 5.059701492537314e-06, | |
| "loss": 0.3616, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.05227879534718721, | |
| "grad_norm": 2.231009006500244, | |
| "learning_rate": 5.208955223880598e-06, | |
| "loss": 0.3196, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.05377247521424971, | |
| "grad_norm": 3.798912286758423, | |
| "learning_rate": 5.358208955223881e-06, | |
| "loss": 0.3638, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.0552661550813122, | |
| "grad_norm": 2.688854932785034, | |
| "learning_rate": 5.5074626865671645e-06, | |
| "loss": 0.3411, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.056759834948374686, | |
| "grad_norm": 3.7967114448547363, | |
| "learning_rate": 5.656716417910449e-06, | |
| "loss": 0.3717, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.05825351481543718, | |
| "grad_norm": 2.219935655593872, | |
| "learning_rate": 5.805970149253732e-06, | |
| "loss": 0.33, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.05974719468249967, | |
| "grad_norm": 2.561140775680542, | |
| "learning_rate": 5.9552238805970155e-06, | |
| "loss": 0.3509, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.06124087454956217, | |
| "grad_norm": 1.8147597312927246, | |
| "learning_rate": 6.1044776119403e-06, | |
| "loss": 0.3371, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.06273455441662466, | |
| "grad_norm": 2.9185450077056885, | |
| "learning_rate": 6.253731343283582e-06, | |
| "loss": 0.3351, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.06422823428368715, | |
| "grad_norm": 1.7370891571044922, | |
| "learning_rate": 6.4029850746268664e-06, | |
| "loss": 0.3465, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.06572191415074964, | |
| "grad_norm": 1.9567862749099731, | |
| "learning_rate": 6.552238805970151e-06, | |
| "loss": 0.3617, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.06721559401781213, | |
| "grad_norm": 3.0918054580688477, | |
| "learning_rate": 6.701492537313433e-06, | |
| "loss": 0.344, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.06870927388487462, | |
| "grad_norm": 2.9340105056762695, | |
| "learning_rate": 6.850746268656717e-06, | |
| "loss": 0.3464, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.07020295375193712, | |
| "grad_norm": 2.3813934326171875, | |
| "learning_rate": 7e-06, | |
| "loss": 0.3233, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.07169663361899961, | |
| "grad_norm": 1.6313637495040894, | |
| "learning_rate": 7.149253731343284e-06, | |
| "loss": 0.3377, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.0731903134860621, | |
| "grad_norm": 2.1293253898620605, | |
| "learning_rate": 7.298507462686568e-06, | |
| "loss": 0.3595, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.07468399335312459, | |
| "grad_norm": 2.9458706378936768, | |
| "learning_rate": 7.447761194029851e-06, | |
| "loss": 0.3289, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.07468399335312459, | |
| "eval_loss": 0.3393873870372772, | |
| "eval_runtime": 77.3397, | |
| "eval_samples_per_second": 6.995, | |
| "eval_steps_per_second": 3.504, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.07617767322018708, | |
| "grad_norm": 3.799473762512207, | |
| "learning_rate": 7.597014925373135e-06, | |
| "loss": 0.3188, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.07767135308724958, | |
| "grad_norm": 2.0290870666503906, | |
| "learning_rate": 7.746268656716418e-06, | |
| "loss": 0.3592, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.07916503295431207, | |
| "grad_norm": 3.1726157665252686, | |
| "learning_rate": 7.895522388059703e-06, | |
| "loss": 0.3284, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.08065871282137456, | |
| "grad_norm": 2.264389991760254, | |
| "learning_rate": 8.044776119402986e-06, | |
| "loss": 0.3631, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.08215239268843705, | |
| "grad_norm": 2.3615527153015137, | |
| "learning_rate": 8.19402985074627e-06, | |
| "loss": 0.3333, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.08364607255549954, | |
| "grad_norm": 2.3802566528320312, | |
| "learning_rate": 8.343283582089553e-06, | |
| "loss": 0.3482, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.08513975242256204, | |
| "grad_norm": 2.536975145339966, | |
| "learning_rate": 8.492537313432838e-06, | |
| "loss": 0.3316, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.08663343228962453, | |
| "grad_norm": 2.01639723777771, | |
| "learning_rate": 8.64179104477612e-06, | |
| "loss": 0.355, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.08812711215668702, | |
| "grad_norm": 2.158482313156128, | |
| "learning_rate": 8.791044776119405e-06, | |
| "loss": 0.3637, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.08962079202374951, | |
| "grad_norm": 3.023801326751709, | |
| "learning_rate": 8.940298507462686e-06, | |
| "loss": 0.3663, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.091114471890812, | |
| "grad_norm": 1.7491657733917236, | |
| "learning_rate": 9.089552238805971e-06, | |
| "loss": 0.368, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.0926081517578745, | |
| "grad_norm": 1.5877282619476318, | |
| "learning_rate": 9.238805970149255e-06, | |
| "loss": 0.3366, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.09410183162493699, | |
| "grad_norm": 3.5212433338165283, | |
| "learning_rate": 9.388059701492538e-06, | |
| "loss": 0.3501, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.09559551149199948, | |
| "grad_norm": 2.3926730155944824, | |
| "learning_rate": 9.537313432835821e-06, | |
| "loss": 0.3328, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.09708919135906197, | |
| "grad_norm": 3.278258800506592, | |
| "learning_rate": 9.686567164179105e-06, | |
| "loss": 0.3635, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.09858287122612445, | |
| "grad_norm": 2.390896797180176, | |
| "learning_rate": 9.835820895522388e-06, | |
| "loss": 0.3453, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.10007655109318696, | |
| "grad_norm": 2.1486220359802246, | |
| "learning_rate": 9.985074626865673e-06, | |
| "loss": 0.327, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.10157023096024945, | |
| "grad_norm": 3.7770419120788574, | |
| "learning_rate": 9.999944943338487e-06, | |
| "loss": 0.3048, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.10306391082731194, | |
| "grad_norm": 3.788212776184082, | |
| "learning_rate": 9.999754625571397e-06, | |
| "loss": 0.3593, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.10455759069437442, | |
| "grad_norm": 2.0790538787841797, | |
| "learning_rate": 9.999428372160074e-06, | |
| "loss": 0.3782, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.10605127056143691, | |
| "grad_norm": 2.0736265182495117, | |
| "learning_rate": 9.998966191974846e-06, | |
| "loss": 0.3522, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.10754495042849942, | |
| "grad_norm": 2.3214290142059326, | |
| "learning_rate": 9.998368097581685e-06, | |
| "loss": 0.3844, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.1090386302955619, | |
| "grad_norm": 1.3843424320220947, | |
| "learning_rate": 9.997634105241855e-06, | |
| "loss": 0.3387, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.1105323101626244, | |
| "grad_norm": 4.11653995513916, | |
| "learning_rate": 9.996764234911483e-06, | |
| "loss": 0.3523, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.11202599002968688, | |
| "grad_norm": 1.6446789503097534, | |
| "learning_rate": 9.995758510241003e-06, | |
| "loss": 0.3339, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.11351966989674937, | |
| "grad_norm": 1.4377137422561646, | |
| "learning_rate": 9.994616958574526e-06, | |
| "loss": 0.3523, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.11501334976381188, | |
| "grad_norm": 1.9575657844543457, | |
| "learning_rate": 9.993339610949084e-06, | |
| "loss": 0.3654, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.11650702963087436, | |
| "grad_norm": 1.8258610963821411, | |
| "learning_rate": 9.9919265020938e-06, | |
| "loss": 0.3465, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.11800070949793685, | |
| "grad_norm": 2.1197669506073, | |
| "learning_rate": 9.99037767042893e-06, | |
| "loss": 0.36, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.11949438936499934, | |
| "grad_norm": 1.671007752418518, | |
| "learning_rate": 9.988693158064826e-06, | |
| "loss": 0.3182, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.12098806923206183, | |
| "grad_norm": 2.0421807765960693, | |
| "learning_rate": 9.986873010800792e-06, | |
| "loss": 0.3402, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.12248174909912433, | |
| "grad_norm": 2.7439417839050293, | |
| "learning_rate": 9.984917278123832e-06, | |
| "loss": 0.3551, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.12397542896618682, | |
| "grad_norm": 1.2339754104614258, | |
| "learning_rate": 9.982826013207314e-06, | |
| "loss": 0.3407, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.12546910883324933, | |
| "grad_norm": 3.45686674118042, | |
| "learning_rate": 9.980599272909517e-06, | |
| "loss": 0.3262, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.1269627887003118, | |
| "grad_norm": 2.196939468383789, | |
| "learning_rate": 9.978237117772086e-06, | |
| "loss": 0.3537, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.1284564685673743, | |
| "grad_norm": 2.2232518196105957, | |
| "learning_rate": 9.975739612018391e-06, | |
| "loss": 0.3621, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.12995014843443678, | |
| "grad_norm": 1.7306561470031738, | |
| "learning_rate": 9.973106823551772e-06, | |
| "loss": 0.3207, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.13144382830149928, | |
| "grad_norm": 1.6579896211624146, | |
| "learning_rate": 9.970338823953704e-06, | |
| "loss": 0.3399, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.13293750816856179, | |
| "grad_norm": 2.4403505325317383, | |
| "learning_rate": 9.96743568848184e-06, | |
| "loss": 0.3616, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.13443118803562426, | |
| "grad_norm": 2.051017999649048, | |
| "learning_rate": 9.964397496067972e-06, | |
| "loss": 0.3408, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.13592486790268676, | |
| "grad_norm": 1.935581922531128, | |
| "learning_rate": 9.961224329315886e-06, | |
| "loss": 0.3469, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.13741854776974924, | |
| "grad_norm": 2.0615079402923584, | |
| "learning_rate": 9.957916274499103e-06, | |
| "loss": 0.3401, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.13891222763681174, | |
| "grad_norm": 2.1460647583007812, | |
| "learning_rate": 9.954473421558554e-06, | |
| "loss": 0.328, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.14040590750387424, | |
| "grad_norm": 1.6261128187179565, | |
| "learning_rate": 9.950895864100117e-06, | |
| "loss": 0.3483, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.14189958737093672, | |
| "grad_norm": 2.0029091835021973, | |
| "learning_rate": 9.947183699392083e-06, | |
| "loss": 0.3655, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.14339326723799922, | |
| "grad_norm": 2.068676233291626, | |
| "learning_rate": 9.943337028362503e-06, | |
| "loss": 0.3133, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.1448869471050617, | |
| "grad_norm": 2.6636133193969727, | |
| "learning_rate": 9.93935595559645e-06, | |
| "loss": 0.3295, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.1463806269721242, | |
| "grad_norm": 1.58219313621521, | |
| "learning_rate": 9.935240589333179e-06, | |
| "loss": 0.3247, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.1478743068391867, | |
| "grad_norm": 1.7050349712371826, | |
| "learning_rate": 9.930991041463166e-06, | |
| "loss": 0.3172, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.14936798670624918, | |
| "grad_norm": 1.4773019552230835, | |
| "learning_rate": 9.926607427525094e-06, | |
| "loss": 0.3445, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.14936798670624918, | |
| "eval_loss": 0.33145418763160706, | |
| "eval_runtime": 76.2509, | |
| "eval_samples_per_second": 7.095, | |
| "eval_steps_per_second": 3.554, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.15086166657331168, | |
| "grad_norm": 2.2282180786132812, | |
| "learning_rate": 9.922089866702685e-06, | |
| "loss": 0.3449, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.15235534644037416, | |
| "grad_norm": 1.7335081100463867, | |
| "learning_rate": 9.917438481821475e-06, | |
| "loss": 0.3664, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.15384902630743666, | |
| "grad_norm": 1.7053015232086182, | |
| "learning_rate": 9.912653399345473e-06, | |
| "loss": 0.3457, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.15534270617449916, | |
| "grad_norm": 1.1435269117355347, | |
| "learning_rate": 9.907734749373712e-06, | |
| "loss": 0.3177, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.15683638604156164, | |
| "grad_norm": 3.1279070377349854, | |
| "learning_rate": 9.90268266563673e-06, | |
| "loss": 0.351, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.15833006590862414, | |
| "grad_norm": 2.90409779548645, | |
| "learning_rate": 9.897497285492919e-06, | |
| "loss": 0.3403, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.15982374577568662, | |
| "grad_norm": 1.5271624326705933, | |
| "learning_rate": 9.892178749924792e-06, | |
| "loss": 0.3039, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.16131742564274912, | |
| "grad_norm": 1.622085452079773, | |
| "learning_rate": 9.886727203535163e-06, | |
| "loss": 0.3323, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.16281110550981162, | |
| "grad_norm": 2.4689345359802246, | |
| "learning_rate": 9.881142794543196e-06, | |
| "loss": 0.3069, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.1643047853768741, | |
| "grad_norm": 1.3529936075210571, | |
| "learning_rate": 9.875425674780388e-06, | |
| "loss": 0.3265, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.1657984652439366, | |
| "grad_norm": 1.7921439409255981, | |
| "learning_rate": 9.86957599968644e-06, | |
| "loss": 0.3439, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.16729214511099907, | |
| "grad_norm": 1.8610994815826416, | |
| "learning_rate": 9.863593928305031e-06, | |
| "loss": 0.323, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.16878582497806158, | |
| "grad_norm": 1.460547685623169, | |
| "learning_rate": 9.857479623279481e-06, | |
| "loss": 0.3502, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.17027950484512408, | |
| "grad_norm": 1.4841840267181396, | |
| "learning_rate": 9.851233250848355e-06, | |
| "loss": 0.332, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.17177318471218656, | |
| "grad_norm": 1.907037377357483, | |
| "learning_rate": 9.844854980840914e-06, | |
| "loss": 0.3251, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.17326686457924906, | |
| "grad_norm": 1.833953857421875, | |
| "learning_rate": 9.838344986672518e-06, | |
| "loss": 0.3628, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.17476054444631153, | |
| "grad_norm": 1.5002996921539307, | |
| "learning_rate": 9.831703445339904e-06, | |
| "loss": 0.3346, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.17625422431337404, | |
| "grad_norm": 1.2558107376098633, | |
| "learning_rate": 9.824930537416372e-06, | |
| "loss": 0.3429, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.17774790418043654, | |
| "grad_norm": 2.025219678878784, | |
| "learning_rate": 9.81802644704688e-06, | |
| "loss": 0.3385, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.17924158404749901, | |
| "grad_norm": 1.8134315013885498, | |
| "learning_rate": 9.810991361943037e-06, | |
| "loss": 0.3362, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.18073526391456152, | |
| "grad_norm": 1.2848788499832153, | |
| "learning_rate": 9.80382547337799e-06, | |
| "loss": 0.3146, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.182228943781624, | |
| "grad_norm": 1.6937299966812134, | |
| "learning_rate": 9.796528976181238e-06, | |
| "loss": 0.3192, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.1837226236486865, | |
| "grad_norm": 1.8493894338607788, | |
| "learning_rate": 9.78910206873333e-06, | |
| "loss": 0.3463, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.185216303515749, | |
| "grad_norm": 1.8786216974258423, | |
| "learning_rate": 9.781544952960458e-06, | |
| "loss": 0.3178, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.18670998338281147, | |
| "grad_norm": 1.6666313409805298, | |
| "learning_rate": 9.773857834328992e-06, | |
| "loss": 0.3263, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.18820366324987398, | |
| "grad_norm": 1.6735985279083252, | |
| "learning_rate": 9.766040921839867e-06, | |
| "loss": 0.3435, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.18969734311693645, | |
| "grad_norm": 1.4434776306152344, | |
| "learning_rate": 9.758094428022927e-06, | |
| "loss": 0.3291, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.19119102298399895, | |
| "grad_norm": 2.4167513847351074, | |
| "learning_rate": 9.750018568931122e-06, | |
| "loss": 0.3433, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.19268470285106146, | |
| "grad_norm": 1.7800685167312622, | |
| "learning_rate": 9.741813564134647e-06, | |
| "loss": 0.3223, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.19417838271812393, | |
| "grad_norm": 1.9175775051116943, | |
| "learning_rate": 9.733479636714978e-06, | |
| "loss": 0.3549, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.19567206258518643, | |
| "grad_norm": 2.1426539421081543, | |
| "learning_rate": 9.725017013258789e-06, | |
| "loss": 0.3243, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.1971657424522489, | |
| "grad_norm": 1.8812899589538574, | |
| "learning_rate": 9.716425923851804e-06, | |
| "loss": 0.3312, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.1986594223193114, | |
| "grad_norm": 1.4907119274139404, | |
| "learning_rate": 9.707706602072547e-06, | |
| "loss": 0.3499, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.20015310218637392, | |
| "grad_norm": 1.8211297988891602, | |
| "learning_rate": 9.69885928498597e-06, | |
| "loss": 0.3289, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.2016467820534364, | |
| "grad_norm": 1.4706814289093018, | |
| "learning_rate": 9.689884213137033e-06, | |
| "loss": 0.3252, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.2031404619204989, | |
| "grad_norm": 2.1436257362365723, | |
| "learning_rate": 9.68078163054414e-06, | |
| "loss": 0.3314, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.20463414178756137, | |
| "grad_norm": 2.100780725479126, | |
| "learning_rate": 9.671551784692529e-06, | |
| "loss": 0.3227, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.20612782165462387, | |
| "grad_norm": 1.4741297960281372, | |
| "learning_rate": 9.662194926527517e-06, | |
| "loss": 0.3467, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.20762150152168637, | |
| "grad_norm": 2.250545024871826, | |
| "learning_rate": 9.6527113104477e-06, | |
| "loss": 0.3504, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.20911518138874885, | |
| "grad_norm": 2.133129835128784, | |
| "learning_rate": 9.643101194298023e-06, | |
| "loss": 0.3535, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.21060886125581135, | |
| "grad_norm": 2.9924333095550537, | |
| "learning_rate": 9.633364839362777e-06, | |
| "loss": 0.3501, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.21210254112287383, | |
| "grad_norm": 2.5759615898132324, | |
| "learning_rate": 9.623502510358488e-06, | |
| "loss": 0.3427, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.21359622098993633, | |
| "grad_norm": 1.1932740211486816, | |
| "learning_rate": 9.613514475426722e-06, | |
| "loss": 0.3381, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.21508990085699883, | |
| "grad_norm": 1.5130189657211304, | |
| "learning_rate": 9.6034010061268e-06, | |
| "loss": 0.3297, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.2165835807240613, | |
| "grad_norm": 1.248481035232544, | |
| "learning_rate": 9.59316237742841e-06, | |
| "loss": 0.3251, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.2180772605911238, | |
| "grad_norm": 1.7967370748519897, | |
| "learning_rate": 9.582798867704131e-06, | |
| "loss": 0.3398, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.2195709404581863, | |
| "grad_norm": 1.2705239057540894, | |
| "learning_rate": 9.572310758721864e-06, | |
| "loss": 0.3053, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.2210646203252488, | |
| "grad_norm": 1.6166293621063232, | |
| "learning_rate": 9.561698335637171e-06, | |
| "loss": 0.3424, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.2225583001923113, | |
| "grad_norm": 1.8217055797576904, | |
| "learning_rate": 9.550961886985528e-06, | |
| "loss": 0.347, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.22405198005937377, | |
| "grad_norm": 1.6405028104782104, | |
| "learning_rate": 9.540101704674473e-06, | |
| "loss": 0.3383, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.22405198005937377, | |
| "eval_loss": 0.326803594827652, | |
| "eval_runtime": 76.2278, | |
| "eval_samples_per_second": 7.097, | |
| "eval_steps_per_second": 3.555, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.22554565992643627, | |
| "grad_norm": 2.2624378204345703, | |
| "learning_rate": 9.529118083975672e-06, | |
| "loss": 0.335, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.22703933979349875, | |
| "grad_norm": 1.2416856288909912, | |
| "learning_rate": 9.518011323516892e-06, | |
| "loss": 0.342, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.22853301966056125, | |
| "grad_norm": 1.1462935209274292, | |
| "learning_rate": 9.506781725273879e-06, | |
| "loss": 0.3226, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.23002669952762375, | |
| "grad_norm": 1.9097304344177246, | |
| "learning_rate": 9.495429594562151e-06, | |
| "loss": 0.3213, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.23152037939468623, | |
| "grad_norm": 1.6176527738571167, | |
| "learning_rate": 9.483955240028695e-06, | |
| "loss": 0.3348, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.23301405926174873, | |
| "grad_norm": 1.6169483661651611, | |
| "learning_rate": 9.472358973643576e-06, | |
| "loss": 0.3237, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.2345077391288112, | |
| "grad_norm": 1.86874258518219, | |
| "learning_rate": 9.460641110691456e-06, | |
| "loss": 0.3475, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.2360014189958737, | |
| "grad_norm": 1.534540057182312, | |
| "learning_rate": 9.448801969763016e-06, | |
| "loss": 0.3487, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.2374950988629362, | |
| "grad_norm": 1.68146550655365, | |
| "learning_rate": 9.436841872746309e-06, | |
| "loss": 0.3128, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.23898877872999869, | |
| "grad_norm": 1.0647422075271606, | |
| "learning_rate": 9.424761144817987e-06, | |
| "loss": 0.3437, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.2404824585970612, | |
| "grad_norm": 1.4840996265411377, | |
| "learning_rate": 9.412560114434477e-06, | |
| "loss": 0.3483, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.24197613846412366, | |
| "grad_norm": 2.902223587036133, | |
| "learning_rate": 9.400239113323042e-06, | |
| "loss": 0.3654, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.24346981833118617, | |
| "grad_norm": 1.711083173751831, | |
| "learning_rate": 9.387798476472766e-06, | |
| "loss": 0.3369, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.24496349819824867, | |
| "grad_norm": 1.4812666177749634, | |
| "learning_rate": 9.37523854212545e-06, | |
| "loss": 0.3521, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.24645717806531114, | |
| "grad_norm": 1.1067218780517578, | |
| "learning_rate": 9.362559651766402e-06, | |
| "loss": 0.302, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.24795085793237365, | |
| "grad_norm": 1.2541941404342651, | |
| "learning_rate": 9.349762150115163e-06, | |
| "loss": 0.3348, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.24944453779943612, | |
| "grad_norm": 1.125554084777832, | |
| "learning_rate": 9.336846385116138e-06, | |
| "loss": 0.3444, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.25093821766649865, | |
| "grad_norm": 1.8702635765075684, | |
| "learning_rate": 9.323812707929126e-06, | |
| "loss": 0.3092, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.2524318975335611, | |
| "grad_norm": 1.7931956052780151, | |
| "learning_rate": 9.31066147291978e-06, | |
| "loss": 0.3416, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.2539255774006236, | |
| "grad_norm": 1.8561383485794067, | |
| "learning_rate": 9.297393037649965e-06, | |
| "loss": 0.3521, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.2554192572676861, | |
| "grad_norm": 1.8236286640167236, | |
| "learning_rate": 9.284007762868047e-06, | |
| "loss": 0.3025, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.2569129371347486, | |
| "grad_norm": 1.4247581958770752, | |
| "learning_rate": 9.270506012499072e-06, | |
| "loss": 0.336, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.2584066170018111, | |
| "grad_norm": 1.79005765914917, | |
| "learning_rate": 9.256888153634888e-06, | |
| "loss": 0.3153, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.25990029686887356, | |
| "grad_norm": 2.0392203330993652, | |
| "learning_rate": 9.243154556524144e-06, | |
| "loss": 0.3462, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.26139397673593606, | |
| "grad_norm": 1.978434681892395, | |
| "learning_rate": 9.229305594562236e-06, | |
| "loss": 0.3491, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.26288765660299857, | |
| "grad_norm": 2.794302463531494, | |
| "learning_rate": 9.215341644281161e-06, | |
| "loss": 0.3432, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.26438133647006107, | |
| "grad_norm": 2.9479925632476807, | |
| "learning_rate": 9.201263085339266e-06, | |
| "loss": 0.3267, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.26587501633712357, | |
| "grad_norm": 1.6784942150115967, | |
| "learning_rate": 9.187070300510927e-06, | |
| "loss": 0.3403, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.267368696204186, | |
| "grad_norm": 1.38883638381958, | |
| "learning_rate": 9.172763675676153e-06, | |
| "loss": 0.3242, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.2688623760712485, | |
| "grad_norm": 2.5442349910736084, | |
| "learning_rate": 9.158343599810087e-06, | |
| "loss": 0.3369, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.270356055938311, | |
| "grad_norm": 1.3056666851043701, | |
| "learning_rate": 9.143810464972429e-06, | |
| "loss": 0.3129, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.2718497358053735, | |
| "grad_norm": 1.8842471837997437, | |
| "learning_rate": 9.12916466629678e-06, | |
| "loss": 0.3257, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.27334341567243603, | |
| "grad_norm": 0.9923204183578491, | |
| "learning_rate": 9.114406601979895e-06, | |
| "loss": 0.3208, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.2748370955394985, | |
| "grad_norm": 1.6141374111175537, | |
| "learning_rate": 9.099536673270864e-06, | |
| "loss": 0.3253, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.276330775406561, | |
| "grad_norm": 2.0269787311553955, | |
| "learning_rate": 9.084555284460192e-06, | |
| "loss": 0.3179, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.2778244552736235, | |
| "grad_norm": 1.620477557182312, | |
| "learning_rate": 9.06946284286882e-06, | |
| "loss": 0.3224, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.279318135140686, | |
| "grad_norm": 1.725224494934082, | |
| "learning_rate": 9.054259758837038e-06, | |
| "loss": 0.3288, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.2808118150077485, | |
| "grad_norm": 2.209329605102539, | |
| "learning_rate": 9.038946445713335e-06, | |
| "loss": 0.3421, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.28230549487481094, | |
| "grad_norm": 1.3899812698364258, | |
| "learning_rate": 9.02352331984316e-06, | |
| "loss": 0.3255, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.28379917474187344, | |
| "grad_norm": 1.5803393125534058, | |
| "learning_rate": 9.007990800557601e-06, | |
| "loss": 0.3147, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.28529285460893594, | |
| "grad_norm": 1.134922742843628, | |
| "learning_rate": 8.992349310161989e-06, | |
| "loss": 0.3412, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.28678653447599844, | |
| "grad_norm": 1.9992294311523438, | |
| "learning_rate": 8.976599273924406e-06, | |
| "loss": 0.3429, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.28828021434306095, | |
| "grad_norm": 1.468029260635376, | |
| "learning_rate": 8.960741120064131e-06, | |
| "loss": 0.3279, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.2897738942101234, | |
| "grad_norm": 1.7822861671447754, | |
| "learning_rate": 8.944775279739996e-06, | |
| "loss": 0.3192, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.2912675740771859, | |
| "grad_norm": 1.5257068872451782, | |
| "learning_rate": 8.928702187038665e-06, | |
| "loss": 0.3359, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.2927612539442484, | |
| "grad_norm": 1.5627810955047607, | |
| "learning_rate": 8.91252227896282e-06, | |
| "loss": 0.3255, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.2942549338113109, | |
| "grad_norm": 1.1691981554031372, | |
| "learning_rate": 8.8962359954193e-06, | |
| "loss": 0.3398, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.2957486136783734, | |
| "grad_norm": 2.4454123973846436, | |
| "learning_rate": 8.879843779207123e-06, | |
| "loss": 0.3137, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.29724229354543585, | |
| "grad_norm": 1.4002143144607544, | |
| "learning_rate": 8.863346076005452e-06, | |
| "loss": 0.3262, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.29873597341249836, | |
| "grad_norm": 1.3549312353134155, | |
| "learning_rate": 8.846743334361486e-06, | |
| "loss": 0.3352, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.29873597341249836, | |
| "eval_loss": 0.32243964076042175, | |
| "eval_runtime": 76.2222, | |
| "eval_samples_per_second": 7.098, | |
| "eval_steps_per_second": 3.555, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.30022965327956086, | |
| "grad_norm": 1.2843849658966064, | |
| "learning_rate": 8.830036005678253e-06, | |
| "loss": 0.3178, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.30172333314662336, | |
| "grad_norm": 1.5276010036468506, | |
| "learning_rate": 8.81322454420234e-06, | |
| "loss": 0.337, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.30321701301368587, | |
| "grad_norm": 1.4595482349395752, | |
| "learning_rate": 8.796309407011553e-06, | |
| "loss": 0.3196, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.3047106928807483, | |
| "grad_norm": 1.7560086250305176, | |
| "learning_rate": 8.779291054002468e-06, | |
| "loss": 0.3407, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.3062043727478108, | |
| "grad_norm": 1.4491099119186401, | |
| "learning_rate": 8.762169947877951e-06, | |
| "loss": 0.3225, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.3076980526148733, | |
| "grad_norm": 1.2083287239074707, | |
| "learning_rate": 8.74494655413457e-06, | |
| "loss": 0.3135, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.3091917324819358, | |
| "grad_norm": 1.734601616859436, | |
| "learning_rate": 8.727621341049924e-06, | |
| "loss": 0.3435, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.3106854123489983, | |
| "grad_norm": 1.7759486436843872, | |
| "learning_rate": 8.710194779669932e-06, | |
| "loss": 0.3192, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.31217909221606077, | |
| "grad_norm": 1.632818579673767, | |
| "learning_rate": 8.692667343796013e-06, | |
| "loss": 0.334, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.3136727720831233, | |
| "grad_norm": 1.8493646383285522, | |
| "learning_rate": 8.675039509972216e-06, | |
| "loss": 0.3345, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.3151664519501858, | |
| "grad_norm": 2.082334280014038, | |
| "learning_rate": 8.657311757472247e-06, | |
| "loss": 0.3551, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.3166601318172483, | |
| "grad_norm": 2.2276527881622314, | |
| "learning_rate": 8.639484568286451e-06, | |
| "loss": 0.3335, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.3181538116843108, | |
| "grad_norm": 1.907583475112915, | |
| "learning_rate": 8.621558427108705e-06, | |
| "loss": 0.3219, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.31964749155137323, | |
| "grad_norm": 1.540067434310913, | |
| "learning_rate": 8.603533821323238e-06, | |
| "loss": 0.322, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.32114117141843573, | |
| "grad_norm": 1.2835497856140137, | |
| "learning_rate": 8.585411240991378e-06, | |
| "loss": 0.3143, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.32263485128549824, | |
| "grad_norm": 1.189209222793579, | |
| "learning_rate": 8.56719117883823e-06, | |
| "loss": 0.3333, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.32412853115256074, | |
| "grad_norm": 1.5749543905258179, | |
| "learning_rate": 8.548874130239286e-06, | |
| "loss": 0.3257, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.32562221101962324, | |
| "grad_norm": 2.1032919883728027, | |
| "learning_rate": 8.530460593206942e-06, | |
| "loss": 0.3155, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.3271158908866857, | |
| "grad_norm": 1.9780592918395996, | |
| "learning_rate": 8.511951068376975e-06, | |
| "loss": 0.3199, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.3286095707537482, | |
| "grad_norm": 1.8979151248931885, | |
| "learning_rate": 8.493346058994916e-06, | |
| "loss": 0.3323, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.3301032506208107, | |
| "grad_norm": 1.7598830461502075, | |
| "learning_rate": 8.474646070902376e-06, | |
| "loss": 0.3202, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.3315969304878732, | |
| "grad_norm": 1.7886403799057007, | |
| "learning_rate": 8.455851612523291e-06, | |
| "loss": 0.3319, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.3330906103549357, | |
| "grad_norm": 1.9333144426345825, | |
| "learning_rate": 8.4369631948501e-06, | |
| "loss": 0.3377, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.33458429022199815, | |
| "grad_norm": 1.5406423807144165, | |
| "learning_rate": 8.417981331429855e-06, | |
| "loss": 0.3359, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.33607797008906065, | |
| "grad_norm": 1.1198780536651611, | |
| "learning_rate": 8.39890653835024e-06, | |
| "loss": 0.3423, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.33757164995612315, | |
| "grad_norm": 1.867664098739624, | |
| "learning_rate": 8.379739334225571e-06, | |
| "loss": 0.3274, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.33906532982318566, | |
| "grad_norm": 1.5488725900650024, | |
| "learning_rate": 8.360480240182666e-06, | |
| "loss": 0.3366, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.34055900969024816, | |
| "grad_norm": 1.5203229188919067, | |
| "learning_rate": 8.341129779846695e-06, | |
| "loss": 0.3229, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.3420526895573106, | |
| "grad_norm": 1.774835228919983, | |
| "learning_rate": 8.321688479326935e-06, | |
| "loss": 0.3307, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.3435463694243731, | |
| "grad_norm": 1.333151936531067, | |
| "learning_rate": 8.302156867202468e-06, | |
| "loss": 0.3216, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.3450400492914356, | |
| "grad_norm": 1.3206020593643188, | |
| "learning_rate": 8.28253547450781e-06, | |
| "loss": 0.3125, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.3465337291584981, | |
| "grad_norm": 1.8065084218978882, | |
| "learning_rate": 8.262824834718471e-06, | |
| "loss": 0.3201, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.3480274090255606, | |
| "grad_norm": 2.162179708480835, | |
| "learning_rate": 8.243025483736458e-06, | |
| "loss": 0.3156, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.34952108889262307, | |
| "grad_norm": 1.118371844291687, | |
| "learning_rate": 8.22313795987569e-06, | |
| "loss": 0.3433, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.35101476875968557, | |
| "grad_norm": 1.838300347328186, | |
| "learning_rate": 8.20316280384738e-06, | |
| "loss": 0.3154, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.35250844862674807, | |
| "grad_norm": 1.6531926393508911, | |
| "learning_rate": 8.183100558745317e-06, | |
| "loss": 0.3072, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.3540021284938106, | |
| "grad_norm": 2.1075356006622314, | |
| "learning_rate": 8.162951770031116e-06, | |
| "loss": 0.3291, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.3554958083608731, | |
| "grad_norm": 1.7505310773849487, | |
| "learning_rate": 8.142716985519373e-06, | |
| "loss": 0.3222, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.3569894882279355, | |
| "grad_norm": 1.5103789567947388, | |
| "learning_rate": 8.122396755362782e-06, | |
| "loss": 0.3086, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.35848316809499803, | |
| "grad_norm": 1.8631788492202759, | |
| "learning_rate": 8.10199163203717e-06, | |
| "loss": 0.3312, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.35997684796206053, | |
| "grad_norm": 1.6605143547058105, | |
| "learning_rate": 8.081502170326478e-06, | |
| "loss": 0.3228, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.36147052782912303, | |
| "grad_norm": 1.1152336597442627, | |
| "learning_rate": 8.060928927307687e-06, | |
| "loss": 0.3307, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.36296420769618554, | |
| "grad_norm": 1.3379615545272827, | |
| "learning_rate": 8.040272462335648e-06, | |
| "loss": 0.323, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.364457887563248, | |
| "grad_norm": 2.2633602619171143, | |
| "learning_rate": 8.019533337027903e-06, | |
| "loss": 0.3195, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.3659515674303105, | |
| "grad_norm": 1.8531728982925415, | |
| "learning_rate": 7.998712115249391e-06, | |
| "loss": 0.3531, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.367445247297373, | |
| "grad_norm": 1.6278972625732422, | |
| "learning_rate": 7.977809363097135e-06, | |
| "loss": 0.3373, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.3689389271644355, | |
| "grad_norm": 1.7813271284103394, | |
| "learning_rate": 7.956825648884842e-06, | |
| "loss": 0.3506, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.370432607031498, | |
| "grad_norm": 2.0010931491851807, | |
| "learning_rate": 7.935761543127449e-06, | |
| "loss": 0.3166, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.37192628689856044, | |
| "grad_norm": 2.6339111328125, | |
| "learning_rate": 7.91461761852562e-06, | |
| "loss": 0.32, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.37341996676562295, | |
| "grad_norm": 1.8536508083343506, | |
| "learning_rate": 7.893394449950166e-06, | |
| "loss": 0.3027, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.37341996676562295, | |
| "eval_loss": 0.31971076130867004, | |
| "eval_runtime": 76.1507, | |
| "eval_samples_per_second": 7.104, | |
| "eval_steps_per_second": 3.559, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.37491364663268545, | |
| "grad_norm": 1.504650592803955, | |
| "learning_rate": 7.87209261442643e-06, | |
| "loss": 0.3075, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.37640732649974795, | |
| "grad_norm": 1.0728139877319336, | |
| "learning_rate": 7.850712691118577e-06, | |
| "loss": 0.3329, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.37790100636681045, | |
| "grad_norm": 1.5715535879135132, | |
| "learning_rate": 7.829255261313862e-06, | |
| "loss": 0.3105, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.3793946862338729, | |
| "grad_norm": 0.8371075987815857, | |
| "learning_rate": 7.807720908406826e-06, | |
| "loss": 0.3318, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.3808883661009354, | |
| "grad_norm": 2.6301848888397217, | |
| "learning_rate": 7.786110217883429e-06, | |
| "loss": 0.3471, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.3823820459679979, | |
| "grad_norm": 1.0217111110687256, | |
| "learning_rate": 7.764423777305132e-06, | |
| "loss": 0.2987, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.3838757258350604, | |
| "grad_norm": 1.5058764219284058, | |
| "learning_rate": 7.742662176292926e-06, | |
| "loss": 0.301, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.3853694057021229, | |
| "grad_norm": 1.2323505878448486, | |
| "learning_rate": 7.720826006511297e-06, | |
| "loss": 0.3135, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.38686308556918536, | |
| "grad_norm": 1.6528573036193848, | |
| "learning_rate": 7.698915861652139e-06, | |
| "loss": 0.3357, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.38835676543624786, | |
| "grad_norm": 1.556429386138916, | |
| "learning_rate": 7.676932337418624e-06, | |
| "loss": 0.3063, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.38985044530331037, | |
| "grad_norm": 1.9085198640823364, | |
| "learning_rate": 7.654876031508981e-06, | |
| "loss": 0.3214, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.39134412517037287, | |
| "grad_norm": 1.279447078704834, | |
| "learning_rate": 7.63274754360028e-06, | |
| "loss": 0.3206, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.3928378050374354, | |
| "grad_norm": 2.345536231994629, | |
| "learning_rate": 7.610547475332089e-06, | |
| "loss": 0.3254, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.3943314849044978, | |
| "grad_norm": 0.9263664484024048, | |
| "learning_rate": 7.588276430290151e-06, | |
| "loss": 0.3234, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.3958251647715603, | |
| "grad_norm": 1.5908204317092896, | |
| "learning_rate": 7.56593501398995e-06, | |
| "loss": 0.3246, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.3973188446386228, | |
| "grad_norm": 1.5689475536346436, | |
| "learning_rate": 7.5435238338602604e-06, | |
| "loss": 0.3183, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.39881252450568533, | |
| "grad_norm": 0.8952176570892334, | |
| "learning_rate": 7.521043499226625e-06, | |
| "loss": 0.3019, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.40030620437274783, | |
| "grad_norm": 1.4977798461914062, | |
| "learning_rate": 7.498494621294796e-06, | |
| "loss": 0.347, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.4017998842398103, | |
| "grad_norm": 1.0641767978668213, | |
| "learning_rate": 7.475877813134106e-06, | |
| "loss": 0.341, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.4032935641068728, | |
| "grad_norm": 1.3907352685928345, | |
| "learning_rate": 7.453193689660811e-06, | |
| "loss": 0.3206, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.4047872439739353, | |
| "grad_norm": 1.4206258058547974, | |
| "learning_rate": 7.430442867621365e-06, | |
| "loss": 0.3058, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.4062809238409978, | |
| "grad_norm": 1.0893877744674683, | |
| "learning_rate": 7.407625965575656e-06, | |
| "loss": 0.306, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.4077746037080603, | |
| "grad_norm": 1.5306363105773926, | |
| "learning_rate": 7.384743603880181e-06, | |
| "loss": 0.3395, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.40926828357512274, | |
| "grad_norm": 1.5694290399551392, | |
| "learning_rate": 7.361796404671187e-06, | |
| "loss": 0.3044, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.41076196344218524, | |
| "grad_norm": 1.862500786781311, | |
| "learning_rate": 7.338784991847755e-06, | |
| "loss": 0.3307, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.41225564330924774, | |
| "grad_norm": 1.3926466703414917, | |
| "learning_rate": 7.315709991054832e-06, | |
| "loss": 0.3052, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.41374932317631025, | |
| "grad_norm": 1.6417464017868042, | |
| "learning_rate": 7.292572029666228e-06, | |
| "loss": 0.3108, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.41524300304337275, | |
| "grad_norm": 2.3295059204101562, | |
| "learning_rate": 7.269371736767552e-06, | |
| "loss": 0.3299, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.4167366829104352, | |
| "grad_norm": 1.707053303718567, | |
| "learning_rate": 7.246109743139111e-06, | |
| "loss": 0.3129, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.4182303627774977, | |
| "grad_norm": 1.233490228652954, | |
| "learning_rate": 7.222786681238762e-06, | |
| "loss": 0.3234, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.4197240426445602, | |
| "grad_norm": 0.8047583699226379, | |
| "learning_rate": 7.1994031851847125e-06, | |
| "loss": 0.3038, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.4212177225116227, | |
| "grad_norm": 1.466469168663025, | |
| "learning_rate": 7.175959890738282e-06, | |
| "loss": 0.3382, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.4227114023786852, | |
| "grad_norm": 1.0184977054595947, | |
| "learning_rate": 7.152457435286619e-06, | |
| "loss": 0.3143, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.42420508224574766, | |
| "grad_norm": 1.102300763130188, | |
| "learning_rate": 7.128896457825364e-06, | |
| "loss": 0.3228, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.42569876211281016, | |
| "grad_norm": 1.8604798316955566, | |
| "learning_rate": 7.1052775989412855e-06, | |
| "loss": 0.2981, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.42719244197987266, | |
| "grad_norm": 1.1831308603286743, | |
| "learning_rate": 7.081601500794857e-06, | |
| "loss": 0.3297, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.42868612184693516, | |
| "grad_norm": 1.7088931798934937, | |
| "learning_rate": 7.057868807102799e-06, | |
| "loss": 0.3101, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.43017980171399767, | |
| "grad_norm": 1.3123115301132202, | |
| "learning_rate": 7.034080163120579e-06, | |
| "loss": 0.3258, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.4316734815810601, | |
| "grad_norm": 1.3527169227600098, | |
| "learning_rate": 7.010236215624867e-06, | |
| "loss": 0.3029, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.4331671614481226, | |
| "grad_norm": 1.361512541770935, | |
| "learning_rate": 6.986337612895949e-06, | |
| "loss": 0.3392, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.4346608413151851, | |
| "grad_norm": 1.4390591382980347, | |
| "learning_rate": 6.962385004700105e-06, | |
| "loss": 0.3351, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.4361545211822476, | |
| "grad_norm": 1.67287278175354, | |
| "learning_rate": 6.938379042271939e-06, | |
| "loss": 0.3255, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.4376482010493101, | |
| "grad_norm": 1.2548269033432007, | |
| "learning_rate": 6.914320378296674e-06, | |
| "loss": 0.3262, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.4391418809163726, | |
| "grad_norm": 1.2193247079849243, | |
| "learning_rate": 6.89020966689241e-06, | |
| "loss": 0.3412, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.4406355607834351, | |
| "grad_norm": 1.1901212930679321, | |
| "learning_rate": 6.866047563592334e-06, | |
| "loss": 0.3002, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.4421292406504976, | |
| "grad_norm": 1.65078866481781, | |
| "learning_rate": 6.841834725326899e-06, | |
| "loss": 0.3172, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.4436229205175601, | |
| "grad_norm": 1.3838766813278198, | |
| "learning_rate": 6.817571810405967e-06, | |
| "loss": 0.3215, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.4451166003846226, | |
| "grad_norm": 1.2286713123321533, | |
| "learning_rate": 6.793259478500907e-06, | |
| "loss": 0.3208, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.44661028025168503, | |
| "grad_norm": 0.9910550713539124, | |
| "learning_rate": 6.7688983906266544e-06, | |
| "loss": 0.3293, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.44810396011874754, | |
| "grad_norm": 1.6711299419403076, | |
| "learning_rate": 6.74448920912375e-06, | |
| "loss": 0.3272, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.44810396011874754, | |
| "eval_loss": 0.31658411026000977, | |
| "eval_runtime": 76.2262, | |
| "eval_samples_per_second": 7.097, | |
| "eval_steps_per_second": 3.555, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.44959763998581004, | |
| "grad_norm": 1.8898260593414307, | |
| "learning_rate": 6.720032597640326e-06, | |
| "loss": 0.332, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.45109131985287254, | |
| "grad_norm": 1.8445961475372314, | |
| "learning_rate": 6.695529221114059e-06, | |
| "loss": 0.3165, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.45258499971993504, | |
| "grad_norm": 1.3706282377243042, | |
| "learning_rate": 6.670979745754101e-06, | |
| "loss": 0.3165, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.4540786795869975, | |
| "grad_norm": 1.7057021856307983, | |
| "learning_rate": 6.646384839022955e-06, | |
| "loss": 0.3045, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.45557235945406, | |
| "grad_norm": 1.5170303583145142, | |
| "learning_rate": 6.621745169618337e-06, | |
| "loss": 0.3061, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.4570660393211225, | |
| "grad_norm": 2.1427805423736572, | |
| "learning_rate": 6.597061407454987e-06, | |
| "loss": 0.31, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.458559719188185, | |
| "grad_norm": 1.1289193630218506, | |
| "learning_rate": 6.572334223646468e-06, | |
| "loss": 0.3388, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.4600533990552475, | |
| "grad_norm": 1.3998080492019653, | |
| "learning_rate": 6.5475642904869004e-06, | |
| "loss": 0.3296, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.46154707892230995, | |
| "grad_norm": 1.4870209693908691, | |
| "learning_rate": 6.5227522814327e-06, | |
| "loss": 0.3441, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.46304075878937245, | |
| "grad_norm": 1.3324146270751953, | |
| "learning_rate": 6.4978988710842585e-06, | |
| "loss": 0.3072, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.46453443865643496, | |
| "grad_norm": 2.6711628437042236, | |
| "learning_rate": 6.473004735167605e-06, | |
| "loss": 0.3199, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.46602811852349746, | |
| "grad_norm": 1.2170815467834473, | |
| "learning_rate": 6.44807055051604e-06, | |
| "loss": 0.3184, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.46752179839055996, | |
| "grad_norm": 1.2690013647079468, | |
| "learning_rate": 6.423096995051722e-06, | |
| "loss": 0.3292, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.4690154782576224, | |
| "grad_norm": 1.5772716999053955, | |
| "learning_rate": 6.398084747767241e-06, | |
| "loss": 0.3219, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.4705091581246849, | |
| "grad_norm": 1.8444935083389282, | |
| "learning_rate": 6.373034488707159e-06, | |
| "loss": 0.3282, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.4720028379917474, | |
| "grad_norm": 1.8097927570343018, | |
| "learning_rate": 6.347946898949524e-06, | |
| "loss": 0.3426, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.4734965178588099, | |
| "grad_norm": 1.232932209968567, | |
| "learning_rate": 6.322822660587343e-06, | |
| "loss": 0.3195, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.4749901977258724, | |
| "grad_norm": 1.4135682582855225, | |
| "learning_rate": 6.297662456710043e-06, | |
| "loss": 0.3125, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.47648387759293487, | |
| "grad_norm": 1.2826404571533203, | |
| "learning_rate": 6.272466971384902e-06, | |
| "loss": 0.3418, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.47797755745999737, | |
| "grad_norm": 1.1015794277191162, | |
| "learning_rate": 6.24723688963844e-06, | |
| "loss": 0.3114, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.4794712373270599, | |
| "grad_norm": 1.6762737035751343, | |
| "learning_rate": 6.221972897437804e-06, | |
| "loss": 0.3315, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.4809649171941224, | |
| "grad_norm": 1.5286458730697632, | |
| "learning_rate": 6.1966756816721195e-06, | |
| "loss": 0.3081, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.4824585970611849, | |
| "grad_norm": 2.174837827682495, | |
| "learning_rate": 6.171345930133798e-06, | |
| "loss": 0.3251, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.4839522769282473, | |
| "grad_norm": 1.966800570487976, | |
| "learning_rate": 6.145984331499859e-06, | |
| "loss": 0.33, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.48544595679530983, | |
| "grad_norm": 1.1323667764663696, | |
| "learning_rate": 6.120591575313189e-06, | |
| "loss": 0.322, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.48693963666237233, | |
| "grad_norm": 1.1445270776748657, | |
| "learning_rate": 6.095168351963805e-06, | |
| "loss": 0.3066, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.48843331652943484, | |
| "grad_norm": 0.9923702478408813, | |
| "learning_rate": 6.069715352670076e-06, | |
| "loss": 0.3006, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.48992699639649734, | |
| "grad_norm": 1.8956522941589355, | |
| "learning_rate": 6.044233269459935e-06, | |
| "loss": 0.3309, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.4914206762635598, | |
| "grad_norm": 1.9034560918807983, | |
| "learning_rate": 6.018722795152062e-06, | |
| "loss": 0.3168, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.4929143561306223, | |
| "grad_norm": 1.3808101415634155, | |
| "learning_rate": 5.993184623337045e-06, | |
| "loss": 0.3148, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.4944080359976848, | |
| "grad_norm": 1.3605296611785889, | |
| "learning_rate": 5.967619448358529e-06, | |
| "loss": 0.3128, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.4959017158647473, | |
| "grad_norm": 1.7083598375320435, | |
| "learning_rate": 5.942027965294329e-06, | |
| "loss": 0.3224, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.4973953957318098, | |
| "grad_norm": 2.0568454265594482, | |
| "learning_rate": 5.916410869937541e-06, | |
| "loss": 0.3199, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.49888907559887224, | |
| "grad_norm": 1.6961411237716675, | |
| "learning_rate": 5.890768858777613e-06, | |
| "loss": 0.3356, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.5003827554659348, | |
| "grad_norm": 1.5090399980545044, | |
| "learning_rate": 5.865102628981424e-06, | |
| "loss": 0.3014, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.5018764353329973, | |
| "grad_norm": 1.3667364120483398, | |
| "learning_rate": 5.839412878374313e-06, | |
| "loss": 0.3386, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.5033701152000597, | |
| "grad_norm": 1.5758767127990723, | |
| "learning_rate": 5.813700305421119e-06, | |
| "loss": 0.2939, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.5048637950671222, | |
| "grad_norm": 1.0520446300506592, | |
| "learning_rate": 5.787965609207184e-06, | |
| "loss": 0.2978, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.5063574749341847, | |
| "grad_norm": 1.4224300384521484, | |
| "learning_rate": 5.762209489419343e-06, | |
| "loss": 0.3168, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.5078511548012472, | |
| "grad_norm": 1.1233537197113037, | |
| "learning_rate": 5.736432646326911e-06, | |
| "loss": 0.3219, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.5093448346683097, | |
| "grad_norm": 1.480785608291626, | |
| "learning_rate": 5.710635780762639e-06, | |
| "loss": 0.3227, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.5108385145353722, | |
| "grad_norm": 1.2440319061279297, | |
| "learning_rate": 5.68481959410365e-06, | |
| "loss": 0.3391, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.5123321944024347, | |
| "grad_norm": 1.2866686582565308, | |
| "learning_rate": 5.658984788252384e-06, | |
| "loss": 0.2983, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.5138258742694972, | |
| "grad_norm": 1.2832037210464478, | |
| "learning_rate": 5.633132065617509e-06, | |
| "loss": 0.3066, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.5153195541365597, | |
| "grad_norm": 1.5093879699707031, | |
| "learning_rate": 5.607262129094819e-06, | |
| "loss": 0.3198, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.5168132340036222, | |
| "grad_norm": 1.5857967138290405, | |
| "learning_rate": 5.581375682048131e-06, | |
| "loss": 0.3187, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.5183069138706846, | |
| "grad_norm": 1.2231806516647339, | |
| "learning_rate": 5.555473428290154e-06, | |
| "loss": 0.3029, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.5198005937377471, | |
| "grad_norm": 1.2822185754776, | |
| "learning_rate": 5.5295560720633575e-06, | |
| "loss": 0.3046, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.5212942736048096, | |
| "grad_norm": 1.2995489835739136, | |
| "learning_rate": 5.503624318020829e-06, | |
| "loss": 0.3295, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.5227879534718721, | |
| "grad_norm": 1.9352302551269531, | |
| "learning_rate": 5.477678871207105e-06, | |
| "loss": 0.3216, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5227879534718721, | |
| "eval_loss": 0.31384068727493286, | |
| "eval_runtime": 76.2107, | |
| "eval_samples_per_second": 7.099, | |
| "eval_steps_per_second": 3.556, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5242816333389346, | |
| "grad_norm": 1.2406340837478638, | |
| "learning_rate": 5.4517204370390086e-06, | |
| "loss": 0.3009, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.5257753132059971, | |
| "grad_norm": 1.5014777183532715, | |
| "learning_rate": 5.425749721286471e-06, | |
| "loss": 0.3138, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.5272689930730596, | |
| "grad_norm": 2.0441832542419434, | |
| "learning_rate": 5.399767430053338e-06, | |
| "loss": 0.3317, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.5287626729401221, | |
| "grad_norm": 1.4949225187301636, | |
| "learning_rate": 5.373774269758178e-06, | |
| "loss": 0.3156, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.5302563528071846, | |
| "grad_norm": 1.5877892971038818, | |
| "learning_rate": 5.3477709471150716e-06, | |
| "loss": 0.2948, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.5317500326742471, | |
| "grad_norm": 1.809065580368042, | |
| "learning_rate": 5.321758169114396e-06, | |
| "loss": 0.3177, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.5332437125413095, | |
| "grad_norm": 1.4500436782836914, | |
| "learning_rate": 5.295736643003605e-06, | |
| "loss": 0.2974, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.534737392408372, | |
| "grad_norm": 1.2693545818328857, | |
| "learning_rate": 5.269707076268005e-06, | |
| "loss": 0.2848, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.5362310722754345, | |
| "grad_norm": 1.21388578414917, | |
| "learning_rate": 5.243670176611509e-06, | |
| "loss": 0.3199, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.537724752142497, | |
| "grad_norm": 1.2438586950302124, | |
| "learning_rate": 5.217626651937404e-06, | |
| "loss": 0.3064, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.5392184320095595, | |
| "grad_norm": 2.074819326400757, | |
| "learning_rate": 5.1915772103291e-06, | |
| "loss": 0.3081, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.540712111876622, | |
| "grad_norm": 1.8442296981811523, | |
| "learning_rate": 5.1655225600308765e-06, | |
| "loss": 0.3303, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.5422057917436846, | |
| "grad_norm": 1.3743780851364136, | |
| "learning_rate": 5.139463409428635e-06, | |
| "loss": 0.3368, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.543699471610747, | |
| "grad_norm": 1.440290927886963, | |
| "learning_rate": 5.113400467030632e-06, | |
| "loss": 0.3332, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.5451931514778096, | |
| "grad_norm": 1.7156881093978882, | |
| "learning_rate": 5.087334441448213e-06, | |
| "loss": 0.3164, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.5466868313448721, | |
| "grad_norm": 0.9851483106613159, | |
| "learning_rate": 5.061266041376553e-06, | |
| "loss": 0.3407, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.5481805112119345, | |
| "grad_norm": 1.2339802980422974, | |
| "learning_rate": 5.035195975575387e-06, | |
| "loss": 0.3115, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.549674191078997, | |
| "grad_norm": 1.1633639335632324, | |
| "learning_rate": 5.0091249528497374e-06, | |
| "loss": 0.3215, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.5511678709460595, | |
| "grad_norm": 1.794061303138733, | |
| "learning_rate": 4.983053682030642e-06, | |
| "loss": 0.3222, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.552661550813122, | |
| "grad_norm": 1.7158312797546387, | |
| "learning_rate": 4.95698287195589e-06, | |
| "loss": 0.3021, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.5541552306801845, | |
| "grad_norm": 1.7610148191452026, | |
| "learning_rate": 4.930913231450737e-06, | |
| "loss": 0.2871, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.555648910547247, | |
| "grad_norm": 1.8225277662277222, | |
| "learning_rate": 4.904845469308642e-06, | |
| "loss": 0.2988, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.5571425904143095, | |
| "grad_norm": 1.7422287464141846, | |
| "learning_rate": 4.8787802942719955e-06, | |
| "loss": 0.3258, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.558636270281372, | |
| "grad_norm": 1.324690818786621, | |
| "learning_rate": 4.8527184150128475e-06, | |
| "loss": 0.3182, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.5601299501484345, | |
| "grad_norm": 1.0865528583526611, | |
| "learning_rate": 4.82666054011364e-06, | |
| "loss": 0.309, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.561623630015497, | |
| "grad_norm": 1.5340676307678223, | |
| "learning_rate": 4.800607378047944e-06, | |
| "loss": 0.3356, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.5631173098825594, | |
| "grad_norm": 1.476318359375, | |
| "learning_rate": 4.774559637161197e-06, | |
| "loss": 0.31, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.5646109897496219, | |
| "grad_norm": 1.3898128271102905, | |
| "learning_rate": 4.74851802565144e-06, | |
| "loss": 0.3202, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.5661046696166844, | |
| "grad_norm": 1.4143530130386353, | |
| "learning_rate": 4.722483251550067e-06, | |
| "loss": 0.3445, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.5675983494837469, | |
| "grad_norm": 0.8102360963821411, | |
| "learning_rate": 4.696456022702574e-06, | |
| "loss": 0.3087, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.5690920293508094, | |
| "grad_norm": 1.0995668172836304, | |
| "learning_rate": 4.670437046749312e-06, | |
| "loss": 0.3077, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.5705857092178719, | |
| "grad_norm": 1.710694432258606, | |
| "learning_rate": 4.6444270311062496e-06, | |
| "loss": 0.3123, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.5720793890849344, | |
| "grad_norm": 1.50558602809906, | |
| "learning_rate": 4.618426682945736e-06, | |
| "loss": 0.3142, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.5735730689519969, | |
| "grad_norm": 1.3168991804122925, | |
| "learning_rate": 4.59243670917728e-06, | |
| "loss": 0.3349, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.5750667488190594, | |
| "grad_norm": 1.0681779384613037, | |
| "learning_rate": 4.566457816428326e-06, | |
| "loss": 0.3153, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.5765604286861219, | |
| "grad_norm": 1.5274810791015625, | |
| "learning_rate": 4.5404907110250364e-06, | |
| "loss": 0.3263, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.5780541085531843, | |
| "grad_norm": 1.5444824695587158, | |
| "learning_rate": 4.514536098973105e-06, | |
| "loss": 0.306, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.5795477884202468, | |
| "grad_norm": 1.126636028289795, | |
| "learning_rate": 4.488594685938541e-06, | |
| "loss": 0.3122, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.5810414682873093, | |
| "grad_norm": 1.2185169458389282, | |
| "learning_rate": 4.462667177228496e-06, | |
| "loss": 0.2975, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.5825351481543718, | |
| "grad_norm": 1.721125602722168, | |
| "learning_rate": 4.4367542777720854e-06, | |
| "loss": 0.3174, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.5840288280214343, | |
| "grad_norm": 1.476317048072815, | |
| "learning_rate": 4.410856692101219e-06, | |
| "loss": 0.3093, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.5855225078884968, | |
| "grad_norm": 1.5350698232650757, | |
| "learning_rate": 4.384975124331451e-06, | |
| "loss": 0.3243, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.5870161877555593, | |
| "grad_norm": 1.8953022956848145, | |
| "learning_rate": 4.35911027814283e-06, | |
| "loss": 0.319, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.5885098676226218, | |
| "grad_norm": 1.768258810043335, | |
| "learning_rate": 4.333262856760774e-06, | |
| "loss": 0.3073, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.5900035474896843, | |
| "grad_norm": 0.974807858467102, | |
| "learning_rate": 4.3074335629369455e-06, | |
| "loss": 0.3208, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.5914972273567468, | |
| "grad_norm": 1.3250782489776611, | |
| "learning_rate": 4.281623098930148e-06, | |
| "loss": 0.2884, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.5929909072238092, | |
| "grad_norm": 1.5974177122116089, | |
| "learning_rate": 4.25583216648723e-06, | |
| "loss": 0.2861, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.5944845870908717, | |
| "grad_norm": 1.2887296676635742, | |
| "learning_rate": 4.2300614668240065e-06, | |
| "loss": 0.3445, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.5959782669579342, | |
| "grad_norm": 2.0698602199554443, | |
| "learning_rate": 4.204311700606195e-06, | |
| "loss": 0.3091, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.5974719468249967, | |
| "grad_norm": 1.6275320053100586, | |
| "learning_rate": 4.1785835679303635e-06, | |
| "loss": 0.3223, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.5974719468249967, | |
| "eval_loss": 0.31077098846435547, | |
| "eval_runtime": 76.2168, | |
| "eval_samples_per_second": 7.098, | |
| "eval_steps_per_second": 3.556, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.5989656266920592, | |
| "grad_norm": 1.140994906425476, | |
| "learning_rate": 4.152877768304898e-06, | |
| "loss": 0.316, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.6004593065591217, | |
| "grad_norm": 1.963865041732788, | |
| "learning_rate": 4.127195000630987e-06, | |
| "loss": 0.3173, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.6019529864261842, | |
| "grad_norm": 1.7010706663131714, | |
| "learning_rate": 4.1015359631836085e-06, | |
| "loss": 0.3318, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.6034466662932467, | |
| "grad_norm": 1.9144036769866943, | |
| "learning_rate": 4.0759013535925575e-06, | |
| "loss": 0.3229, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.6049403461603092, | |
| "grad_norm": 1.5420873165130615, | |
| "learning_rate": 4.050291868823469e-06, | |
| "loss": 0.2952, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.6064340260273717, | |
| "grad_norm": 1.2293835878372192, | |
| "learning_rate": 4.0247082051588794e-06, | |
| "loss": 0.3273, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.6079277058944341, | |
| "grad_norm": 1.203016996383667, | |
| "learning_rate": 3.999151058179283e-06, | |
| "loss": 0.3301, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.6094213857614966, | |
| "grad_norm": 1.7640305757522583, | |
| "learning_rate": 3.973621122744226e-06, | |
| "loss": 0.3217, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.6109150656285591, | |
| "grad_norm": 0.998776912689209, | |
| "learning_rate": 3.9481190929734185e-06, | |
| "loss": 0.2961, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.6124087454956216, | |
| "grad_norm": 1.3041551113128662, | |
| "learning_rate": 3.922645662227854e-06, | |
| "loss": 0.3178, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.6139024253626841, | |
| "grad_norm": 1.2349125146865845, | |
| "learning_rate": 3.897201523090967e-06, | |
| "loss": 0.2985, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.6153961052297466, | |
| "grad_norm": 1.5378309488296509, | |
| "learning_rate": 3.8717873673497945e-06, | |
| "loss": 0.2987, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.6168897850968091, | |
| "grad_norm": 1.2633869647979736, | |
| "learning_rate": 3.846403885976175e-06, | |
| "loss": 0.2989, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.6183834649638716, | |
| "grad_norm": 1.7205194234848022, | |
| "learning_rate": 3.821051769107952e-06, | |
| "loss": 0.3105, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.6198771448309341, | |
| "grad_norm": 2.0533735752105713, | |
| "learning_rate": 3.7957317060302225e-06, | |
| "loss": 0.3204, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.6213708246979966, | |
| "grad_norm": 1.609309434890747, | |
| "learning_rate": 3.770444385156587e-06, | |
| "loss": 0.3107, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.622864504565059, | |
| "grad_norm": 1.2998265027999878, | |
| "learning_rate": 3.745190494010436e-06, | |
| "loss": 0.3101, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.6243581844321215, | |
| "grad_norm": 2.1723592281341553, | |
| "learning_rate": 3.7199707192062578e-06, | |
| "loss": 0.2887, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.625851864299184, | |
| "grad_norm": 1.8159315586090088, | |
| "learning_rate": 3.6947857464309695e-06, | |
| "loss": 0.3088, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.6273455441662465, | |
| "grad_norm": 1.644352912902832, | |
| "learning_rate": 3.6696362604252734e-06, | |
| "loss": 0.3128, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.628839224033309, | |
| "grad_norm": 2.0270018577575684, | |
| "learning_rate": 3.6445229449650443e-06, | |
| "loss": 0.3324, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.6303329039003716, | |
| "grad_norm": 1.4132014513015747, | |
| "learning_rate": 3.6194464828427324e-06, | |
| "loss": 0.3078, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.6318265837674341, | |
| "grad_norm": 1.3948049545288086, | |
| "learning_rate": 3.5944075558488e-06, | |
| "loss": 0.315, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.6333202636344966, | |
| "grad_norm": 1.699584722518921, | |
| "learning_rate": 3.569406844753196e-06, | |
| "loss": 0.3218, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.6348139435015591, | |
| "grad_norm": 1.175880789756775, | |
| "learning_rate": 3.544445029286829e-06, | |
| "loss": 0.3271, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.6363076233686216, | |
| "grad_norm": 1.1815729141235352, | |
| "learning_rate": 3.5195227881230985e-06, | |
| "loss": 0.3202, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.637801303235684, | |
| "grad_norm": 1.096339464187622, | |
| "learning_rate": 3.4946407988594394e-06, | |
| "loss": 0.3212, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.6392949831027465, | |
| "grad_norm": 2.088253974914551, | |
| "learning_rate": 3.4697997379988983e-06, | |
| "loss": 0.3117, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.640788662969809, | |
| "grad_norm": 1.1896518468856812, | |
| "learning_rate": 3.445000280931743e-06, | |
| "loss": 0.3055, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.6422823428368715, | |
| "grad_norm": 1.6314555406570435, | |
| "learning_rate": 3.4202431019170964e-06, | |
| "loss": 0.313, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.643776022703934, | |
| "grad_norm": 1.4211452007293701, | |
| "learning_rate": 3.3955288740646064e-06, | |
| "loss": 0.2967, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.6452697025709965, | |
| "grad_norm": 1.8873090744018555, | |
| "learning_rate": 3.3708582693161473e-06, | |
| "loss": 0.3218, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.646763382438059, | |
| "grad_norm": 1.0354822874069214, | |
| "learning_rate": 3.346231958427546e-06, | |
| "loss": 0.3155, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.6482570623051215, | |
| "grad_norm": 1.7193752527236938, | |
| "learning_rate": 3.3216506109503478e-06, | |
| "loss": 0.2933, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.649750742172184, | |
| "grad_norm": 1.969494104385376, | |
| "learning_rate": 3.297114895213611e-06, | |
| "loss": 0.3086, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.6512444220392465, | |
| "grad_norm": 1.3515018224716187, | |
| "learning_rate": 3.2726254783057388e-06, | |
| "loss": 0.3012, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.6527381019063089, | |
| "grad_norm": 1.2439565658569336, | |
| "learning_rate": 3.2481830260563393e-06, | |
| "loss": 0.3175, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.6542317817733714, | |
| "grad_norm": 2.0741679668426514, | |
| "learning_rate": 3.2237882030181227e-06, | |
| "loss": 0.3281, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.6557254616404339, | |
| "grad_norm": 1.3941818475723267, | |
| "learning_rate": 3.199441672448838e-06, | |
| "loss": 0.3179, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.6572191415074964, | |
| "grad_norm": 1.3664950132369995, | |
| "learning_rate": 3.1751440962932324e-06, | |
| "loss": 0.3252, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.6587128213745589, | |
| "grad_norm": 1.3657866716384888, | |
| "learning_rate": 3.150896135165059e-06, | |
| "loss": 0.3274, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.6602065012416214, | |
| "grad_norm": 1.4565297365188599, | |
| "learning_rate": 3.126698448329112e-06, | |
| "loss": 0.319, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.6617001811086839, | |
| "grad_norm": 1.585686445236206, | |
| "learning_rate": 3.1025516936833122e-06, | |
| "loss": 0.2937, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.6631938609757464, | |
| "grad_norm": 1.6105479001998901, | |
| "learning_rate": 3.0784565277408063e-06, | |
| "loss": 0.3247, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.6646875408428089, | |
| "grad_norm": 1.0377700328826904, | |
| "learning_rate": 3.0544136056121232e-06, | |
| "loss": 0.3215, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.6661812207098714, | |
| "grad_norm": 1.4693603515625, | |
| "learning_rate": 3.0304235809873654e-06, | |
| "loss": 0.3016, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.6676749005769338, | |
| "grad_norm": 1.3283636569976807, | |
| "learning_rate": 3.006487106118433e-06, | |
| "loss": 0.3024, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.6691685804439963, | |
| "grad_norm": 1.0531262159347534, | |
| "learning_rate": 2.982604831801289e-06, | |
| "loss": 0.3287, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.6706622603110588, | |
| "grad_norm": 1.6268073320388794, | |
| "learning_rate": 2.9587774073582677e-06, | |
| "loss": 0.306, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.6721559401781213, | |
| "grad_norm": 1.7072473764419556, | |
| "learning_rate": 2.9350054806204214e-06, | |
| "loss": 0.3346, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.6721559401781213, | |
| "eval_loss": 0.307580828666687, | |
| "eval_runtime": 76.4751, | |
| "eval_samples_per_second": 7.074, | |
| "eval_steps_per_second": 3.544, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.6736496200451838, | |
| "grad_norm": 1.3722172975540161, | |
| "learning_rate": 2.9112896979099037e-06, | |
| "loss": 0.3213, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.6751432999122463, | |
| "grad_norm": 0.8439034819602966, | |
| "learning_rate": 2.8876307040223956e-06, | |
| "loss": 0.3102, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.6766369797793088, | |
| "grad_norm": 2.1569015979766846, | |
| "learning_rate": 2.864029142209579e-06, | |
| "loss": 0.3189, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.6781306596463713, | |
| "grad_norm": 0.9060570597648621, | |
| "learning_rate": 2.840485654161651e-06, | |
| "loss": 0.2811, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.6796243395134338, | |
| "grad_norm": 1.4373691082000732, | |
| "learning_rate": 2.817000879989866e-06, | |
| "loss": 0.3052, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.6811180193804963, | |
| "grad_norm": 1.3326523303985596, | |
| "learning_rate": 2.7935754582091413e-06, | |
| "loss": 0.3184, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.6826116992475587, | |
| "grad_norm": 1.3754558563232422, | |
| "learning_rate": 2.770210025720691e-06, | |
| "loss": 0.3192, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.6841053791146212, | |
| "grad_norm": 2.0747873783111572, | |
| "learning_rate": 2.746905217794715e-06, | |
| "loss": 0.3408, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.6855990589816837, | |
| "grad_norm": 1.3364531993865967, | |
| "learning_rate": 2.7236616680531256e-06, | |
| "loss": 0.3005, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.6870927388487462, | |
| "grad_norm": 1.6091829538345337, | |
| "learning_rate": 2.7004800084523166e-06, | |
| "loss": 0.3288, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.6885864187158087, | |
| "grad_norm": 1.1656900644302368, | |
| "learning_rate": 2.6773608692659825e-06, | |
| "loss": 0.2837, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.6900800985828712, | |
| "grad_norm": 1.4220030307769775, | |
| "learning_rate": 2.6543048790679915e-06, | |
| "loss": 0.3119, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.6915737784499337, | |
| "grad_norm": 1.688082218170166, | |
| "learning_rate": 2.63131266471528e-06, | |
| "loss": 0.3282, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.6930674583169962, | |
| "grad_norm": 1.2834751605987549, | |
| "learning_rate": 2.60838485133082e-06, | |
| "loss": 0.3018, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.6945611381840587, | |
| "grad_norm": 1.5603129863739014, | |
| "learning_rate": 2.5855220622866197e-06, | |
| "loss": 0.3035, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.6960548180511212, | |
| "grad_norm": 1.6552413702011108, | |
| "learning_rate": 2.562724919186777e-06, | |
| "loss": 0.321, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.6975484979181836, | |
| "grad_norm": 1.5287736654281616, | |
| "learning_rate": 2.5399940418505754e-06, | |
| "loss": 0.3229, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.6990421777852461, | |
| "grad_norm": 1.5035234689712524, | |
| "learning_rate": 2.5173300482956346e-06, | |
| "loss": 0.2946, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.7005358576523086, | |
| "grad_norm": 2.163083791732788, | |
| "learning_rate": 2.4947335547211083e-06, | |
| "loss": 0.3239, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.7020295375193711, | |
| "grad_norm": 1.5969173908233643, | |
| "learning_rate": 2.472205175490928e-06, | |
| "loss": 0.3033, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.7035232173864336, | |
| "grad_norm": 1.2077685594558716, | |
| "learning_rate": 2.4497455231171003e-06, | |
| "loss": 0.3142, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.7050168972534961, | |
| "grad_norm": 1.0711603164672852, | |
| "learning_rate": 2.4273552082430586e-06, | |
| "loss": 0.292, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.7065105771205586, | |
| "grad_norm": 1.1325751543045044, | |
| "learning_rate": 2.405034839627051e-06, | |
| "loss": 0.3309, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.7080042569876212, | |
| "grad_norm": 1.3801145553588867, | |
| "learning_rate": 2.3827850241255974e-06, | |
| "loss": 0.3266, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.7094979368546837, | |
| "grad_norm": 1.4642720222473145, | |
| "learning_rate": 2.3606063666769846e-06, | |
| "loss": 0.2985, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.7109916167217462, | |
| "grad_norm": 1.8076415061950684, | |
| "learning_rate": 2.3384994702848234e-06, | |
| "loss": 0.3185, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.7124852965888087, | |
| "grad_norm": 1.7433451414108276, | |
| "learning_rate": 2.3164649360016505e-06, | |
| "loss": 0.3004, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.713978976455871, | |
| "grad_norm": 1.4180279970169067, | |
| "learning_rate": 2.294503362912589e-06, | |
| "loss": 0.3193, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.7154726563229336, | |
| "grad_norm": 1.5062882900238037, | |
| "learning_rate": 2.2726153481190588e-06, | |
| "loss": 0.3233, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.7169663361899961, | |
| "grad_norm": 1.4006506204605103, | |
| "learning_rate": 2.250801486722541e-06, | |
| "loss": 0.3125, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.7184600160570586, | |
| "grad_norm": 1.7776737213134766, | |
| "learning_rate": 2.2290623718084052e-06, | |
| "loss": 0.2971, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.7199536959241211, | |
| "grad_norm": 1.5043376684188843, | |
| "learning_rate": 2.207398594429773e-06, | |
| "loss": 0.2992, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.7214473757911836, | |
| "grad_norm": 2.0632483959198, | |
| "learning_rate": 2.185810743591458e-06, | |
| "loss": 0.3223, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.7229410556582461, | |
| "grad_norm": 1.3994874954223633, | |
| "learning_rate": 2.1642994062339458e-06, | |
| "loss": 0.3374, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.7244347355253086, | |
| "grad_norm": 1.8748818635940552, | |
| "learning_rate": 2.1428651672174382e-06, | |
| "loss": 0.308, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.7259284153923711, | |
| "grad_norm": 1.1066455841064453, | |
| "learning_rate": 2.1215086093059527e-06, | |
| "loss": 0.2935, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.7274220952594336, | |
| "grad_norm": 1.480527400970459, | |
| "learning_rate": 2.100230313151476e-06, | |
| "loss": 0.3537, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.728915775126496, | |
| "grad_norm": 1.5903476476669312, | |
| "learning_rate": 2.079030857278179e-06, | |
| "loss": 0.3039, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.7304094549935585, | |
| "grad_norm": 1.1910865306854248, | |
| "learning_rate": 2.057910818066684e-06, | |
| "loss": 0.3233, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.731903134860621, | |
| "grad_norm": 1.305713176727295, | |
| "learning_rate": 2.036870769738401e-06, | |
| "loss": 0.3295, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.7333968147276835, | |
| "grad_norm": 1.8359174728393555, | |
| "learning_rate": 2.0159112843399066e-06, | |
| "loss": 0.3121, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.734890494594746, | |
| "grad_norm": 1.200527548789978, | |
| "learning_rate": 1.995032931727396e-06, | |
| "loss": 0.3155, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.7363841744618085, | |
| "grad_norm": 0.9834126234054565, | |
| "learning_rate": 1.97423627955119e-06, | |
| "loss": 0.3086, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.737877854328871, | |
| "grad_norm": 1.8965601921081543, | |
| "learning_rate": 1.9535218932402987e-06, | |
| "loss": 0.296, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.7393715341959335, | |
| "grad_norm": 1.8559459447860718, | |
| "learning_rate": 1.9328903359870504e-06, | |
| "loss": 0.2943, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.740865214062996, | |
| "grad_norm": 1.5035439729690552, | |
| "learning_rate": 1.9123421687317784e-06, | |
| "loss": 0.3121, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.7423588939300585, | |
| "grad_norm": 1.1852291822433472, | |
| "learning_rate": 1.8918779501475708e-06, | |
| "loss": 0.3158, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.7438525737971209, | |
| "grad_norm": 1.259185791015625, | |
| "learning_rate": 1.8714982366250796e-06, | |
| "loss": 0.2938, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.7453462536641834, | |
| "grad_norm": 1.5034098625183105, | |
| "learning_rate": 1.8512035822573915e-06, | |
| "loss": 0.2949, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.7468399335312459, | |
| "grad_norm": 1.282578706741333, | |
| "learning_rate": 1.8309945388249733e-06, | |
| "loss": 0.3098, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.7468399335312459, | |
| "eval_loss": 0.30645084381103516, | |
| "eval_runtime": 76.3887, | |
| "eval_samples_per_second": 7.082, | |
| "eval_steps_per_second": 3.548, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.7483336133983084, | |
| "grad_norm": 1.2789969444274902, | |
| "learning_rate": 1.8108716557806545e-06, | |
| "loss": 0.3168, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.7498272932653709, | |
| "grad_norm": 1.0109808444976807, | |
| "learning_rate": 1.7908354802346982e-06, | |
| "loss": 0.2843, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.7513209731324334, | |
| "grad_norm": 1.3991084098815918, | |
| "learning_rate": 1.7708865569399247e-06, | |
| "loss": 0.3324, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 0.7528146529994959, | |
| "grad_norm": 1.529976725578308, | |
| "learning_rate": 1.751025428276899e-06, | |
| "loss": 0.3152, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.7543083328665584, | |
| "grad_norm": 1.9336539506912231, | |
| "learning_rate": 1.7312526342391862e-06, | |
| "loss": 0.3077, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.7558020127336209, | |
| "grad_norm": 1.4918617010116577, | |
| "learning_rate": 1.7115687124186658e-06, | |
| "loss": 0.3139, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.7572956926006834, | |
| "grad_norm": 2.2446916103363037, | |
| "learning_rate": 1.6919741979909222e-06, | |
| "loss": 0.3278, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 0.7587893724677458, | |
| "grad_norm": 1.2982836961746216, | |
| "learning_rate": 1.6724696237006848e-06, | |
| "loss": 0.3063, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.7602830523348083, | |
| "grad_norm": 1.0719565153121948, | |
| "learning_rate": 1.653055519847357e-06, | |
| "loss": 0.2921, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 0.7617767322018708, | |
| "grad_norm": 1.5067294836044312, | |
| "learning_rate": 1.6337324142705836e-06, | |
| "loss": 0.3102, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.7632704120689333, | |
| "grad_norm": 1.2776610851287842, | |
| "learning_rate": 1.6145008323359068e-06, | |
| "loss": 0.2969, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 0.7647640919359958, | |
| "grad_norm": 1.1912457942962646, | |
| "learning_rate": 1.5953612969204834e-06, | |
| "loss": 0.2682, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.7662577718030583, | |
| "grad_norm": 1.404762625694275, | |
| "learning_rate": 1.5763143283988663e-06, | |
| "loss": 0.2963, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 0.7677514516701208, | |
| "grad_norm": 1.2275928258895874, | |
| "learning_rate": 1.5573604446288572e-06, | |
| "loss": 0.2801, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.7692451315371833, | |
| "grad_norm": 1.437886118888855, | |
| "learning_rate": 1.538500160937424e-06, | |
| "loss": 0.31, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.7707388114042458, | |
| "grad_norm": 1.3553423881530762, | |
| "learning_rate": 1.519733990106696e-06, | |
| "loss": 0.2946, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.7722324912713083, | |
| "grad_norm": 1.8724462985992432, | |
| "learning_rate": 1.5010624423600161e-06, | |
| "loss": 0.294, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 0.7737261711383707, | |
| "grad_norm": 1.0624821186065674, | |
| "learning_rate": 1.48248602534807e-06, | |
| "loss": 0.3292, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.7752198510054332, | |
| "grad_norm": 1.6190390586853027, | |
| "learning_rate": 1.4640052441350893e-06, | |
| "loss": 0.3258, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 0.7767135308724957, | |
| "grad_norm": 1.0761600732803345, | |
| "learning_rate": 1.4456206011851115e-06, | |
| "loss": 0.3226, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.7782072107395582, | |
| "grad_norm": 1.5092500448226929, | |
| "learning_rate": 1.4273325963483226e-06, | |
| "loss": 0.2854, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 0.7797008906066207, | |
| "grad_norm": 1.7542755603790283, | |
| "learning_rate": 1.4091417268474683e-06, | |
| "loss": 0.3071, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.7811945704736832, | |
| "grad_norm": 1.1649888753890991, | |
| "learning_rate": 1.3910484872643326e-06, | |
| "loss": 0.3309, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 0.7826882503407457, | |
| "grad_norm": 1.1616461277008057, | |
| "learning_rate": 1.3730533695262927e-06, | |
| "loss": 0.285, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.7841819302078082, | |
| "grad_norm": 1.3221337795257568, | |
| "learning_rate": 1.3551568628929434e-06, | |
| "loss": 0.3119, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.7856756100748707, | |
| "grad_norm": 1.7051467895507812, | |
| "learning_rate": 1.3373594539427941e-06, | |
| "loss": 0.3262, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.7871692899419332, | |
| "grad_norm": 1.3770607709884644, | |
| "learning_rate": 1.3196616265600442e-06, | |
| "loss": 0.2957, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 0.7886629698089956, | |
| "grad_norm": 2.04876708984375, | |
| "learning_rate": 1.3020638619214199e-06, | |
| "loss": 0.3109, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.7901566496760581, | |
| "grad_norm": 1.0968056917190552, | |
| "learning_rate": 1.2845666384830951e-06, | |
| "loss": 0.325, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 0.7916503295431206, | |
| "grad_norm": 1.4080263376235962, | |
| "learning_rate": 1.2671704319676847e-06, | |
| "loss": 0.3151, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.7931440094101831, | |
| "grad_norm": 1.1893798112869263, | |
| "learning_rate": 1.2498757153513075e-06, | |
| "loss": 0.3196, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 0.7946376892772457, | |
| "grad_norm": 1.286380410194397, | |
| "learning_rate": 1.2326829588507282e-06, | |
| "loss": 0.3288, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.7961313691443082, | |
| "grad_norm": 1.841400384902954, | |
| "learning_rate": 1.2155926299105737e-06, | |
| "loss": 0.3035, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 0.7976250490113707, | |
| "grad_norm": 1.0659278631210327, | |
| "learning_rate": 1.1986051931906207e-06, | |
| "loss": 0.3368, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.7991187288784332, | |
| "grad_norm": 1.0193463563919067, | |
| "learning_rate": 1.1817211105531667e-06, | |
| "loss": 0.3063, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.8006124087454957, | |
| "grad_norm": 1.656656265258789, | |
| "learning_rate": 1.1649408410504686e-06, | |
| "loss": 0.3059, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.8021060886125582, | |
| "grad_norm": 1.4112011194229126, | |
| "learning_rate": 1.148264840912267e-06, | |
| "loss": 0.3059, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 0.8035997684796206, | |
| "grad_norm": 0.9142074584960938, | |
| "learning_rate": 1.131693563533376e-06, | |
| "loss": 0.3003, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.8050934483466831, | |
| "grad_norm": 1.3341706991195679, | |
| "learning_rate": 1.1152274594613588e-06, | |
| "loss": 0.3185, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 0.8065871282137456, | |
| "grad_norm": 1.5966401100158691, | |
| "learning_rate": 1.0988669763842786e-06, | |
| "loss": 0.3394, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.8080808080808081, | |
| "grad_norm": 1.6179472208023071, | |
| "learning_rate": 1.0826125591185265e-06, | |
| "loss": 0.3209, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 0.8095744879478706, | |
| "grad_norm": 1.7330031394958496, | |
| "learning_rate": 1.0664646495967263e-06, | |
| "loss": 0.3303, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.8110681678149331, | |
| "grad_norm": 1.7587456703186035, | |
| "learning_rate": 1.050423686855721e-06, | |
| "loss": 0.3356, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 0.8125618476819956, | |
| "grad_norm": 1.2351861000061035, | |
| "learning_rate": 1.0344901070246332e-06, | |
| "loss": 0.2924, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.8140555275490581, | |
| "grad_norm": 1.0523043870925903, | |
| "learning_rate": 1.0186643433130128e-06, | |
| "loss": 0.314, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.8155492074161206, | |
| "grad_norm": 1.5923221111297607, | |
| "learning_rate": 1.0029468259990515e-06, | |
| "loss": 0.2991, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.8170428872831831, | |
| "grad_norm": 1.6099388599395752, | |
| "learning_rate": 9.873379824178886e-07, | |
| "loss": 0.3055, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 0.8185365671502455, | |
| "grad_norm": 1.1426987648010254, | |
| "learning_rate": 9.718382369499936e-07, | |
| "loss": 0.2959, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.820030247017308, | |
| "grad_norm": 2.282841444015503, | |
| "learning_rate": 9.564480110096226e-07, | |
| "loss": 0.3473, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 0.8215239268843705, | |
| "grad_norm": 1.753849983215332, | |
| "learning_rate": 9.411677230333672e-07, | |
| "loss": 0.2938, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.8215239268843705, | |
| "eval_loss": 0.30588287115097046, | |
| "eval_runtime": 76.2019, | |
| "eval_samples_per_second": 7.1, | |
| "eval_steps_per_second": 3.556, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.823017606751433, | |
| "grad_norm": 1.5381008386611938, | |
| "learning_rate": 9.259977884687726e-07, | |
| "loss": 0.3001, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 0.8245112866184955, | |
| "grad_norm": 1.2274119853973389, | |
| "learning_rate": 9.10938619763046e-07, | |
| "loss": 0.2968, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 0.826004966485558, | |
| "grad_norm": 1.2566465139389038, | |
| "learning_rate": 8.959906263518398e-07, | |
| "loss": 0.3135, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 0.8274986463526205, | |
| "grad_norm": 1.4085761308670044, | |
| "learning_rate": 8.811542146481223e-07, | |
| "loss": 0.3067, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 0.828992326219683, | |
| "grad_norm": 1.507485032081604, | |
| "learning_rate": 8.664297880311234e-07, | |
| "loss": 0.3254, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.8304860060867455, | |
| "grad_norm": 1.4448457956314087, | |
| "learning_rate": 8.518177468353767e-07, | |
| "loss": 0.3273, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 0.831979685953808, | |
| "grad_norm": 1.4602904319763184, | |
| "learning_rate": 8.373184883398239e-07, | |
| "loss": 0.2887, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 0.8334733658208704, | |
| "grad_norm": 2.004305601119995, | |
| "learning_rate": 8.229324067570193e-07, | |
| "loss": 0.3068, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 0.8349670456879329, | |
| "grad_norm": 1.5558735132217407, | |
| "learning_rate": 8.086598932224116e-07, | |
| "loss": 0.3012, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 0.8364607255549954, | |
| "grad_norm": 1.6343220472335815, | |
| "learning_rate": 7.945013357837089e-07, | |
| "loss": 0.3052, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.8379544054220579, | |
| "grad_norm": 1.551553726196289, | |
| "learning_rate": 7.804571193903277e-07, | |
| "loss": 0.3024, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 0.8394480852891204, | |
| "grad_norm": 1.437134027481079, | |
| "learning_rate": 7.665276258829274e-07, | |
| "loss": 0.312, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 0.8409417651561829, | |
| "grad_norm": 1.3783475160598755, | |
| "learning_rate": 7.527132339830273e-07, | |
| "loss": 0.2973, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 0.8424354450232454, | |
| "grad_norm": 1.8350893259048462, | |
| "learning_rate": 7.390143192827148e-07, | |
| "loss": 0.3183, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 0.8439291248903079, | |
| "grad_norm": 1.1564915180206299, | |
| "learning_rate": 7.25431254234425e-07, | |
| "loss": 0.281, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.8454228047573704, | |
| "grad_norm": 2.0109055042266846, | |
| "learning_rate": 7.119644081408216e-07, | |
| "loss": 0.3059, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 0.8469164846244329, | |
| "grad_norm": 1.2296003103256226, | |
| "learning_rate": 6.986141471447533e-07, | |
| "loss": 0.3149, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 0.8484101644914953, | |
| "grad_norm": 1.5590568780899048, | |
| "learning_rate": 6.853808342192981e-07, | |
| "loss": 0.31, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 0.8499038443585578, | |
| "grad_norm": 1.2360633611679077, | |
| "learning_rate": 6.72264829157896e-07, | |
| "loss": 0.306, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 0.8513975242256203, | |
| "grad_norm": 0.8478025197982788, | |
| "learning_rate": 6.592664885645678e-07, | |
| "loss": 0.2989, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.8528912040926828, | |
| "grad_norm": 2.2121694087982178, | |
| "learning_rate": 6.463861658442166e-07, | |
| "loss": 0.3025, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 0.8543848839597453, | |
| "grad_norm": 1.8200898170471191, | |
| "learning_rate": 6.336242111930224e-07, | |
| "loss": 0.2983, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 0.8558785638268078, | |
| "grad_norm": 1.2733746767044067, | |
| "learning_rate": 6.209809715889182e-07, | |
| "loss": 0.3251, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 0.8573722436938703, | |
| "grad_norm": 1.0714962482452393, | |
| "learning_rate": 6.084567907821559e-07, | |
| "loss": 0.3361, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 0.8588659235609328, | |
| "grad_norm": 1.6452571153640747, | |
| "learning_rate": 5.960520092859668e-07, | |
| "loss": 0.3235, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.8603596034279953, | |
| "grad_norm": 0.9949471354484558, | |
| "learning_rate": 5.837669643672927e-07, | |
| "loss": 0.3074, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 0.8618532832950578, | |
| "grad_norm": 1.2611801624298096, | |
| "learning_rate": 5.716019900376257e-07, | |
| "loss": 0.2955, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 0.8633469631621202, | |
| "grad_norm": 1.4436218738555908, | |
| "learning_rate": 5.595574170439199e-07, | |
| "loss": 0.3071, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 0.8648406430291827, | |
| "grad_norm": 1.6352945566177368, | |
| "learning_rate": 5.476335728596061e-07, | |
| "loss": 0.327, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 0.8663343228962452, | |
| "grad_norm": 1.846356749534607, | |
| "learning_rate": 5.358307816756803e-07, | |
| "loss": 0.3174, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.8678280027633077, | |
| "grad_norm": 1.2852689027786255, | |
| "learning_rate": 5.24149364391895e-07, | |
| "loss": 0.3086, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 0.8693216826303702, | |
| "grad_norm": 1.4159107208251953, | |
| "learning_rate": 5.125896386080348e-07, | |
| "loss": 0.2913, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 0.8708153624974327, | |
| "grad_norm": 1.575850248336792, | |
| "learning_rate": 5.011519186152775e-07, | |
| "loss": 0.2937, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 0.8723090423644952, | |
| "grad_norm": 1.3794643878936768, | |
| "learning_rate": 4.898365153876505e-07, | |
| "loss": 0.3049, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 0.8738027222315577, | |
| "grad_norm": 1.2364630699157715, | |
| "learning_rate": 4.78643736573578e-07, | |
| "loss": 0.3129, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.8752964020986203, | |
| "grad_norm": 0.8901196122169495, | |
| "learning_rate": 4.675738864875134e-07, | |
| "loss": 0.2912, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 0.8767900819656828, | |
| "grad_norm": 1.1799402236938477, | |
| "learning_rate": 4.566272661016674e-07, | |
| "loss": 0.3204, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 0.8782837618327451, | |
| "grad_norm": 1.7847167253494263, | |
| "learning_rate": 4.4580417303782487e-07, | |
| "loss": 0.3081, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 0.8797774416998076, | |
| "grad_norm": 2.1496951580047607, | |
| "learning_rate": 4.3510490155925235e-07, | |
| "loss": 0.3114, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 0.8812711215668702, | |
| "grad_norm": 1.502504587173462, | |
| "learning_rate": 4.245297425626971e-07, | |
| "loss": 0.2944, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.8827648014339327, | |
| "grad_norm": 1.207311749458313, | |
| "learning_rate": 4.140789835704806e-07, | |
| "loss": 0.3059, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 0.8842584813009952, | |
| "grad_norm": 1.821098804473877, | |
| "learning_rate": 4.0375290872267825e-07, | |
| "loss": 0.2872, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 0.8857521611680577, | |
| "grad_norm": 1.6243329048156738, | |
| "learning_rate": 3.935517987693932e-07, | |
| "loss": 0.3064, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 0.8872458410351202, | |
| "grad_norm": 1.5818045139312744, | |
| "learning_rate": 3.8347593106312974e-07, | |
| "loss": 0.2777, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 0.8887395209021827, | |
| "grad_norm": 1.2436670064926147, | |
| "learning_rate": 3.7352557955124437e-07, | |
| "loss": 0.3014, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.8902332007692452, | |
| "grad_norm": 1.4755676984786987, | |
| "learning_rate": 3.637010147685016e-07, | |
| "loss": 0.312, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 0.8917268806363077, | |
| "grad_norm": 1.5071330070495605, | |
| "learning_rate": 3.540025038297196e-07, | |
| "loss": 0.3246, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 0.8932205605033701, | |
| "grad_norm": 1.5448203086853027, | |
| "learning_rate": 3.44430310422505e-07, | |
| "loss": 0.301, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 0.8947142403704326, | |
| "grad_norm": 1.1398577690124512, | |
| "learning_rate": 3.3498469480008454e-07, | |
| "loss": 0.2993, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 0.8962079202374951, | |
| "grad_norm": 1.4388718605041504, | |
| "learning_rate": 3.256659137742313e-07, | |
| "loss": 0.315, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.8962079202374951, | |
| "eval_loss": 0.30459803342819214, | |
| "eval_runtime": 76.2056, | |
| "eval_samples_per_second": 7.099, | |
| "eval_steps_per_second": 3.556, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.8977016001045576, | |
| "grad_norm": 1.604347825050354, | |
| "learning_rate": 3.164742207082788e-07, | |
| "loss": 0.319, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 0.8991952799716201, | |
| "grad_norm": 1.558813214302063, | |
| "learning_rate": 3.0740986551023535e-07, | |
| "loss": 0.3084, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 0.9006889598386826, | |
| "grad_norm": 1.7928036451339722, | |
| "learning_rate": 2.9847309462598726e-07, | |
| "loss": 0.3147, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 0.9021826397057451, | |
| "grad_norm": 1.4524924755096436, | |
| "learning_rate": 2.896641510326009e-07, | |
| "loss": 0.3112, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 0.9036763195728076, | |
| "grad_norm": 1.2006369829177856, | |
| "learning_rate": 2.809832742317137e-07, | |
| "loss": 0.3284, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.9051699994398701, | |
| "grad_norm": 1.2945834398269653, | |
| "learning_rate": 2.724307002430249e-07, | |
| "loss": 0.3057, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 0.9066636793069326, | |
| "grad_norm": 0.915762722492218, | |
| "learning_rate": 2.6400666159787646e-07, | |
| "loss": 0.3078, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 0.908157359173995, | |
| "grad_norm": 1.4049233198165894, | |
| "learning_rate": 2.5571138733293255e-07, | |
| "loss": 0.3251, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 0.9096510390410575, | |
| "grad_norm": 1.4237291812896729, | |
| "learning_rate": 2.475451029839515e-07, | |
| "loss": 0.3224, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 0.91114471890812, | |
| "grad_norm": 1.0404157638549805, | |
| "learning_rate": 2.3950803057965435e-07, | |
| "loss": 0.312, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.9126383987751825, | |
| "grad_norm": 1.250205636024475, | |
| "learning_rate": 2.3160038863568768e-07, | |
| "loss": 0.312, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 0.914132078642245, | |
| "grad_norm": 1.2475612163543701, | |
| "learning_rate": 2.2382239214868152e-07, | |
| "loss": 0.3077, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 0.9156257585093075, | |
| "grad_norm": 1.5456167459487915, | |
| "learning_rate": 2.161742525904087e-07, | |
| "loss": 0.3301, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 0.91711943837637, | |
| "grad_norm": 1.449046015739441, | |
| "learning_rate": 2.086561779020285e-07, | |
| "loss": 0.3371, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 0.9186131182434325, | |
| "grad_norm": 1.6901681423187256, | |
| "learning_rate": 2.012683724884379e-07, | |
| "loss": 0.3178, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.920106798110495, | |
| "grad_norm": 1.598301649093628, | |
| "learning_rate": 1.9401103721271076e-07, | |
| "loss": 0.2795, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 0.9216004779775575, | |
| "grad_norm": 1.3405296802520752, | |
| "learning_rate": 1.8688436939064025e-07, | |
| "loss": 0.3362, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 0.9230941578446199, | |
| "grad_norm": 1.2465345859527588, | |
| "learning_rate": 1.798885627853708e-07, | |
| "loss": 0.3009, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 0.9245878377116824, | |
| "grad_norm": 1.2908686399459839, | |
| "learning_rate": 1.7302380760213345e-07, | |
| "loss": 0.3066, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 0.9260815175787449, | |
| "grad_norm": 1.4556738138198853, | |
| "learning_rate": 1.6629029048307044e-07, | |
| "loss": 0.3031, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.9275751974458074, | |
| "grad_norm": 0.9532304406166077, | |
| "learning_rate": 1.5968819450216444e-07, | |
| "loss": 0.331, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 0.9290688773128699, | |
| "grad_norm": 1.7296748161315918, | |
| "learning_rate": 1.5321769916025798e-07, | |
| "loss": 0.3211, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 0.9305625571799324, | |
| "grad_norm": 1.8877276182174683, | |
| "learning_rate": 1.4687898038017513e-07, | |
| "loss": 0.3241, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 0.9320562370469949, | |
| "grad_norm": 1.5971440076828003, | |
| "learning_rate": 1.406722105019376e-07, | |
| "loss": 0.3089, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 0.9335499169140574, | |
| "grad_norm": 1.8514761924743652, | |
| "learning_rate": 1.3459755827807952e-07, | |
| "loss": 0.3199, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.9350435967811199, | |
| "grad_norm": 1.612648367881775, | |
| "learning_rate": 1.2865518886905848e-07, | |
| "loss": 0.3195, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 0.9365372766481824, | |
| "grad_norm": 1.5449368953704834, | |
| "learning_rate": 1.228452638387656e-07, | |
| "loss": 0.3154, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 0.9380309565152448, | |
| "grad_norm": 1.3344995975494385, | |
| "learning_rate": 1.1716794115013419e-07, | |
| "loss": 0.3065, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 0.9395246363823073, | |
| "grad_norm": 1.4628318548202515, | |
| "learning_rate": 1.1162337516084253e-07, | |
| "loss": 0.3333, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 0.9410183162493698, | |
| "grad_norm": 1.5249940156936646, | |
| "learning_rate": 1.0621171661911844e-07, | |
| "loss": 0.3183, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.9425119961164323, | |
| "grad_norm": 1.3567790985107422, | |
| "learning_rate": 1.0093311265963967e-07, | |
| "loss": 0.2903, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 0.9440056759834948, | |
| "grad_norm": 2.6283679008483887, | |
| "learning_rate": 9.578770679953664e-08, | |
| "loss": 0.3182, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 0.9454993558505573, | |
| "grad_norm": 1.6033724546432495, | |
| "learning_rate": 9.07756389344866e-08, | |
| "loss": 0.3061, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 0.9469930357176198, | |
| "grad_norm": 1.2911169528961182, | |
| "learning_rate": 8.589704533491173e-08, | |
| "loss": 0.3242, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 0.9484867155846823, | |
| "grad_norm": 1.2943521738052368, | |
| "learning_rate": 8.115205864227316e-08, | |
| "loss": 0.319, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.9499803954517448, | |
| "grad_norm": 1.4396514892578125, | |
| "learning_rate": 7.65408078654678e-08, | |
| "loss": 0.3246, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 0.9514740753188073, | |
| "grad_norm": 1.282568097114563, | |
| "learning_rate": 7.206341837731667e-08, | |
| "loss": 0.3194, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 0.9529677551858697, | |
| "grad_norm": 1.448075294494629, | |
| "learning_rate": 6.772001191115928e-08, | |
| "loss": 0.2985, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 0.9544614350529322, | |
| "grad_norm": 0.9579274654388428, | |
| "learning_rate": 6.351070655754187e-08, | |
| "loss": 0.3208, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 0.9559551149199947, | |
| "grad_norm": 1.2854158878326416, | |
| "learning_rate": 5.943561676100773e-08, | |
| "loss": 0.2923, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.9574487947870572, | |
| "grad_norm": 1.7156449556350708, | |
| "learning_rate": 5.5494853316985786e-08, | |
| "loss": 0.3132, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 0.9589424746541197, | |
| "grad_norm": 1.3507882356643677, | |
| "learning_rate": 5.168852336877695e-08, | |
| "loss": 0.335, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 0.9604361545211823, | |
| "grad_norm": 1.696518063545227, | |
| "learning_rate": 4.801673040464305e-08, | |
| "loss": 0.3196, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 0.9619298343882448, | |
| "grad_norm": 1.6517608165740967, | |
| "learning_rate": 4.447957425499139e-08, | |
| "loss": 0.3038, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 0.9634235142553073, | |
| "grad_norm": 1.3434756994247437, | |
| "learning_rate": 4.107715108966237e-08, | |
| "loss": 0.3067, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.9649171941223698, | |
| "grad_norm": 1.260919213294983, | |
| "learning_rate": 3.7809553415311675e-08, | |
| "loss": 0.3052, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 0.9664108739894323, | |
| "grad_norm": 1.7151823043823242, | |
| "learning_rate": 3.467687007289833e-08, | |
| "loss": 0.2897, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 0.9679045538564947, | |
| "grad_norm": 1.9328373670578003, | |
| "learning_rate": 3.167918623526833e-08, | |
| "loss": 0.2919, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 0.9693982337235572, | |
| "grad_norm": 1.4327538013458252, | |
| "learning_rate": 2.8816583404837616e-08, | |
| "loss": 0.2983, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 0.9708919135906197, | |
| "grad_norm": 1.2370225191116333, | |
| "learning_rate": 2.608913941137825e-08, | |
| "loss": 0.301, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.9708919135906197, | |
| "eval_loss": 0.3039746582508087, | |
| "eval_runtime": 76.1933, | |
| "eval_samples_per_second": 7.1, | |
| "eval_steps_per_second": 3.557, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.9723855934576822, | |
| "grad_norm": 1.433455467224121, | |
| "learning_rate": 2.3496928409900143e-08, | |
| "loss": 0.3035, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 0.9738792733247447, | |
| "grad_norm": 1.2109386920928955, | |
| "learning_rate": 2.10400208786371e-08, | |
| "loss": 0.3246, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 0.9753729531918072, | |
| "grad_norm": 1.5277369022369385, | |
| "learning_rate": 1.87184836171278e-08, | |
| "loss": 0.3185, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 0.9768666330588697, | |
| "grad_norm": 2.307945728302002, | |
| "learning_rate": 1.6532379744403915e-08, | |
| "loss": 0.3209, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 0.9783603129259322, | |
| "grad_norm": 2.4343667030334473, | |
| "learning_rate": 1.448176869726814e-08, | |
| "loss": 0.3146, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.9798539927929947, | |
| "grad_norm": 2.0399911403656006, | |
| "learning_rate": 1.2566706228685499e-08, | |
| "loss": 0.3042, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 0.9813476726600572, | |
| "grad_norm": 1.6942330598831177, | |
| "learning_rate": 1.0787244406259556e-08, | |
| "loss": 0.2949, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 0.9828413525271196, | |
| "grad_norm": 1.0024651288986206, | |
| "learning_rate": 9.143431610822983e-09, | |
| "loss": 0.3046, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 0.9843350323941821, | |
| "grad_norm": 1.426224946975708, | |
| "learning_rate": 7.635312535119732e-09, | |
| "loss": 0.3148, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 0.9858287122612446, | |
| "grad_norm": 1.2437832355499268, | |
| "learning_rate": 6.2629281825887785e-09, | |
| "loss": 0.3209, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.9873223921283071, | |
| "grad_norm": 1.3072861433029175, | |
| "learning_rate": 5.026315866252241e-09, | |
| "loss": 0.32, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 0.9888160719953696, | |
| "grad_norm": 1.787488579750061, | |
| "learning_rate": 3.9255092076984084e-09, | |
| "loss": 0.3269, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 0.9903097518624321, | |
| "grad_norm": 1.3305130004882812, | |
| "learning_rate": 2.9605381361685893e-09, | |
| "loss": 0.3157, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 0.9918034317294946, | |
| "grad_norm": 1.319860577583313, | |
| "learning_rate": 2.131428887742204e-09, | |
| "loss": 0.2924, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 0.9932971115965571, | |
| "grad_norm": 1.1278481483459473, | |
| "learning_rate": 1.4382040046267976e-09, | |
| "loss": 0.3155, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.9947907914636196, | |
| "grad_norm": 1.3013602495193481, | |
| "learning_rate": 8.808823345407558e-10, | |
| "loss": 0.3081, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 0.9962844713306821, | |
| "grad_norm": 1.4793535470962524, | |
| "learning_rate": 4.594790302037133e-10, | |
| "loss": 0.3225, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 0.9977781511977445, | |
| "grad_norm": 1.2639180421829224, | |
| "learning_rate": 1.7400554892466058e-10, | |
| "loss": 0.3101, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 0.999271831064807, | |
| "grad_norm": 1.553964614868164, | |
| "learning_rate": 2.4469652287750777e-11, | |
| "loss": 0.3104, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 6695, | |
| "total_flos": 1.727502264531714e+18, | |
| "train_loss": 0.35656481113248656, | |
| "train_runtime": 34664.1508, | |
| "train_samples_per_second": 1.545, | |
| "train_steps_per_second": 0.193 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 6695, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.727502264531714e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |