| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 3624, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0027597626604112047, | |
| "grad_norm": 9.687899712924036, | |
| "learning_rate": 2.4793388429752067e-07, | |
| "loss": 1.2489, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.005519525320822409, | |
| "grad_norm": 7.993583387794843, | |
| "learning_rate": 5.234159779614326e-07, | |
| "loss": 1.23, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.008279287981233613, | |
| "grad_norm": 4.419911118410825, | |
| "learning_rate": 7.988980716253444e-07, | |
| "loss": 1.134, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.011039050641644819, | |
| "grad_norm": 3.0732247545437965, | |
| "learning_rate": 1.0743801652892562e-06, | |
| "loss": 0.993, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.013798813302056023, | |
| "grad_norm": 2.005789581836949, | |
| "learning_rate": 1.3498622589531682e-06, | |
| "loss": 0.8782, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.016558575962467226, | |
| "grad_norm": 1.3028620164242184, | |
| "learning_rate": 1.62534435261708e-06, | |
| "loss": 0.8004, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.019318338622878434, | |
| "grad_norm": 1.2871645472343145, | |
| "learning_rate": 1.900826446280992e-06, | |
| "loss": 0.7618, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.022078101283289638, | |
| "grad_norm": 2.0977280137118854, | |
| "learning_rate": 2.1763085399449038e-06, | |
| "loss": 0.7363, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.02483786394370084, | |
| "grad_norm": 1.235679470482907, | |
| "learning_rate": 2.4517906336088157e-06, | |
| "loss": 0.7218, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.027597626604112045, | |
| "grad_norm": 1.2008847267657137, | |
| "learning_rate": 2.7272727272727272e-06, | |
| "loss": 0.7039, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.030357389264523253, | |
| "grad_norm": 1.2865692124934534, | |
| "learning_rate": 3.002754820936639e-06, | |
| "loss": 0.7016, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.03311715192493445, | |
| "grad_norm": 1.225519692241417, | |
| "learning_rate": 3.278236914600551e-06, | |
| "loss": 0.6911, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.035876914585345664, | |
| "grad_norm": 1.1074744380119828, | |
| "learning_rate": 3.553719008264463e-06, | |
| "loss": 0.6837, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.03863667724575687, | |
| "grad_norm": 1.2255853901709166, | |
| "learning_rate": 3.8292011019283746e-06, | |
| "loss": 0.6733, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.04139643990616807, | |
| "grad_norm": 1.2160790653325682, | |
| "learning_rate": 4.104683195592287e-06, | |
| "loss": 0.6663, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.044156202566579275, | |
| "grad_norm": 1.2160126107257938, | |
| "learning_rate": 4.3801652892561984e-06, | |
| "loss": 0.6672, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.04691596522699048, | |
| "grad_norm": 1.3030345619512333, | |
| "learning_rate": 4.655647382920111e-06, | |
| "loss": 0.6604, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.04967572788740168, | |
| "grad_norm": 1.1675762652939756, | |
| "learning_rate": 4.931129476584022e-06, | |
| "loss": 0.654, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.05243549054781289, | |
| "grad_norm": 1.3421701645033057, | |
| "learning_rate": 5.206611570247935e-06, | |
| "loss": 0.653, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.05519525320822409, | |
| "grad_norm": 1.1987056035659454, | |
| "learning_rate": 5.482093663911846e-06, | |
| "loss": 0.6508, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.057955015868635294, | |
| "grad_norm": 1.3869845490974586, | |
| "learning_rate": 5.7575757575757586e-06, | |
| "loss": 0.6498, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.060714778529046505, | |
| "grad_norm": 1.116588718762935, | |
| "learning_rate": 6.03305785123967e-06, | |
| "loss": 0.6453, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.06347454118945771, | |
| "grad_norm": 1.0960389471025322, | |
| "learning_rate": 6.3085399449035824e-06, | |
| "loss": 0.6414, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.0662343038498689, | |
| "grad_norm": 1.2133969264114732, | |
| "learning_rate": 6.584022038567494e-06, | |
| "loss": 0.6345, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.06899406651028012, | |
| "grad_norm": 1.2700545101041685, | |
| "learning_rate": 6.859504132231406e-06, | |
| "loss": 0.6386, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.07175382917069133, | |
| "grad_norm": 1.2174406212987252, | |
| "learning_rate": 7.134986225895317e-06, | |
| "loss": 0.6403, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.07451359183110252, | |
| "grad_norm": 1.1500811137175568, | |
| "learning_rate": 7.410468319559229e-06, | |
| "loss": 0.6361, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.07727335449151373, | |
| "grad_norm": 1.2075741428461082, | |
| "learning_rate": 7.685950413223142e-06, | |
| "loss": 0.6348, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.08003311715192493, | |
| "grad_norm": 1.6661536273928046, | |
| "learning_rate": 7.961432506887054e-06, | |
| "loss": 0.6263, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.08279287981233614, | |
| "grad_norm": 1.2692327811914956, | |
| "learning_rate": 8.236914600550965e-06, | |
| "loss": 0.6294, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.08555264247274734, | |
| "grad_norm": 1.3688551805953726, | |
| "learning_rate": 8.512396694214877e-06, | |
| "loss": 0.6268, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.08831240513315855, | |
| "grad_norm": 1.3470949176246652, | |
| "learning_rate": 8.787878787878788e-06, | |
| "loss": 0.6247, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.09107216779356975, | |
| "grad_norm": 1.136787185698281, | |
| "learning_rate": 9.063360881542702e-06, | |
| "loss": 0.619, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.09383193045398096, | |
| "grad_norm": 1.1593643608420736, | |
| "learning_rate": 9.338842975206613e-06, | |
| "loss": 0.6262, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.09659169311439217, | |
| "grad_norm": 1.1708021309308343, | |
| "learning_rate": 9.614325068870525e-06, | |
| "loss": 0.6241, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.09935145577480337, | |
| "grad_norm": 1.1394208261112413, | |
| "learning_rate": 9.889807162534436e-06, | |
| "loss": 0.6247, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.10211121843521458, | |
| "grad_norm": 1.243982861233763, | |
| "learning_rate": 9.999916470583429e-06, | |
| "loss": 0.6223, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.10487098109562577, | |
| "grad_norm": 1.0950836157642558, | |
| "learning_rate": 9.999406023144514e-06, | |
| "loss": 0.6172, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.10763074375603698, | |
| "grad_norm": 1.1223718246628647, | |
| "learning_rate": 9.998431580815314e-06, | |
| "loss": 0.6178, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.11039050641644818, | |
| "grad_norm": 1.2801215739716154, | |
| "learning_rate": 9.996993234033826e-06, | |
| "loss": 0.6187, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.11315026907685939, | |
| "grad_norm": 1.1553035959732303, | |
| "learning_rate": 9.995091116293022e-06, | |
| "loss": 0.6195, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.11591003173727059, | |
| "grad_norm": 1.163353982074495, | |
| "learning_rate": 9.992725404128452e-06, | |
| "loss": 0.6162, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.1186697943976818, | |
| "grad_norm": 1.0553152985762932, | |
| "learning_rate": 9.989896317101873e-06, | |
| "loss": 0.6075, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.12142955705809301, | |
| "grad_norm": 1.061674081293769, | |
| "learning_rate": 9.986604117780861e-06, | |
| "loss": 0.6157, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.1241893197185042, | |
| "grad_norm": 1.0865591957408132, | |
| "learning_rate": 9.982849111714445e-06, | |
| "loss": 0.6147, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.12694908237891542, | |
| "grad_norm": 1.039091554305776, | |
| "learning_rate": 9.978631647404755e-06, | |
| "loss": 0.6089, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.12970884503932661, | |
| "grad_norm": 1.1565672987905995, | |
| "learning_rate": 9.973952116274664e-06, | |
| "loss": 0.6132, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.1324686076997378, | |
| "grad_norm": 1.141107303982074, | |
| "learning_rate": 9.968810952631473e-06, | |
| "loss": 0.6077, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.13522837036014904, | |
| "grad_norm": 1.1858850784459474, | |
| "learning_rate": 9.9632086336266e-06, | |
| "loss": 0.6116, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.13798813302056023, | |
| "grad_norm": 1.0965450940865213, | |
| "learning_rate": 9.957145679211288e-06, | |
| "loss": 0.6131, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.13798813302056023, | |
| "eval_loss": 0.6089209318161011, | |
| "eval_runtime": 49.9455, | |
| "eval_samples_per_second": 58.624, | |
| "eval_steps_per_second": 3.664, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.14074789568097143, | |
| "grad_norm": 1.076032821296556, | |
| "learning_rate": 9.95062265208836e-06, | |
| "loss": 0.606, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.14350765834138265, | |
| "grad_norm": 1.0371790631769926, | |
| "learning_rate": 9.943640157659984e-06, | |
| "loss": 0.6071, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.14626742100179385, | |
| "grad_norm": 1.0166966431904405, | |
| "learning_rate": 9.936198843971493e-06, | |
| "loss": 0.6021, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.14902718366220505, | |
| "grad_norm": 1.3219793417334544, | |
| "learning_rate": 9.928299401651236e-06, | |
| "loss": 0.5981, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.15178694632261625, | |
| "grad_norm": 1.0389071117117548, | |
| "learning_rate": 9.919942563846482e-06, | |
| "loss": 0.6005, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.15454670898302747, | |
| "grad_norm": 1.0246893171620508, | |
| "learning_rate": 9.911129106155375e-06, | |
| "loss": 0.6046, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.15730647164343867, | |
| "grad_norm": 1.0445099757660354, | |
| "learning_rate": 9.901859846554955e-06, | |
| "loss": 0.6053, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.16006623430384986, | |
| "grad_norm": 1.064106651611097, | |
| "learning_rate": 9.892135645325238e-06, | |
| "loss": 0.5984, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.16282599696426106, | |
| "grad_norm": 1.0462870772071302, | |
| "learning_rate": 9.881957404969373e-06, | |
| "loss": 0.5978, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.16558575962467229, | |
| "grad_norm": 1.0000683469984344, | |
| "learning_rate": 9.871326070129885e-06, | |
| "loss": 0.6094, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.16834552228508348, | |
| "grad_norm": 0.9977951236432633, | |
| "learning_rate": 9.860242627500994e-06, | |
| "loss": 0.5974, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.17110528494549468, | |
| "grad_norm": 4.8609383522246405, | |
| "learning_rate": 9.848708105737049e-06, | |
| "loss": 0.598, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.1738650476059059, | |
| "grad_norm": 1.0005981539720457, | |
| "learning_rate": 9.836723575357056e-06, | |
| "loss": 0.5988, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.1766248102663171, | |
| "grad_norm": 1.0638070414980612, | |
| "learning_rate": 9.824290148645322e-06, | |
| "loss": 0.5957, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.1793845729267283, | |
| "grad_norm": 1.0406376026434327, | |
| "learning_rate": 9.811408979548219e-06, | |
| "loss": 0.591, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.1821443355871395, | |
| "grad_norm": 1.07624118905585, | |
| "learning_rate": 9.7980812635671e-06, | |
| "loss": 0.5948, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.18490409824755072, | |
| "grad_norm": 1.049133255644815, | |
| "learning_rate": 9.784308237647329e-06, | |
| "loss": 0.5954, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.18766386090796192, | |
| "grad_norm": 1.0143310556909468, | |
| "learning_rate": 9.770091180063489e-06, | |
| "loss": 0.596, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.1904236235683731, | |
| "grad_norm": 1.0508407539599804, | |
| "learning_rate": 9.755431410300743e-06, | |
| "loss": 0.5871, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.19318338622878434, | |
| "grad_norm": 1.2937338319267142, | |
| "learning_rate": 9.740330288932379e-06, | |
| "loss": 0.5948, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.19594314888919553, | |
| "grad_norm": 1.0318704519944248, | |
| "learning_rate": 9.724789217493514e-06, | |
| "loss": 0.5898, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.19870291154960673, | |
| "grad_norm": 0.9605096481233693, | |
| "learning_rate": 9.708809638351048e-06, | |
| "loss": 0.5925, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.20146267421001793, | |
| "grad_norm": 1.0371369229103382, | |
| "learning_rate": 9.692393034569776e-06, | |
| "loss": 0.5917, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.20422243687042915, | |
| "grad_norm": 0.97367889824368, | |
| "learning_rate": 9.675540929774751e-06, | |
| "loss": 0.5911, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.20698219953084035, | |
| "grad_norm": 0.9914313419740074, | |
| "learning_rate": 9.658254888009877e-06, | |
| "loss": 0.5862, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.20974196219125155, | |
| "grad_norm": 0.9822682480321101, | |
| "learning_rate": 9.64053651359275e-06, | |
| "loss": 0.5871, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.21250172485166274, | |
| "grad_norm": 1.001214024547401, | |
| "learning_rate": 9.622387450965758e-06, | |
| "loss": 0.5851, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.21526148751207397, | |
| "grad_norm": 0.9608822251160972, | |
| "learning_rate": 9.603809384543472e-06, | |
| "loss": 0.5937, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.21802125017248516, | |
| "grad_norm": 1.003270353697929, | |
| "learning_rate": 9.584804038556297e-06, | |
| "loss": 0.5834, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.22078101283289636, | |
| "grad_norm": 1.0623920143347612, | |
| "learning_rate": 9.56537317689046e-06, | |
| "loss": 0.5903, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.2235407754933076, | |
| "grad_norm": 0.9490080408477698, | |
| "learning_rate": 9.5455186029243e-06, | |
| "loss": 0.5872, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.22630053815371878, | |
| "grad_norm": 0.944874262643946, | |
| "learning_rate": 9.525242159360897e-06, | |
| "loss": 0.5888, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.22906030081412998, | |
| "grad_norm": 0.9755675580087483, | |
| "learning_rate": 9.504545728057046e-06, | |
| "loss": 0.5822, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.23182006347454118, | |
| "grad_norm": 0.9892178439036814, | |
| "learning_rate": 9.483431229848607e-06, | |
| "loss": 0.5847, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.2345798261349524, | |
| "grad_norm": 1.3473013721931224, | |
| "learning_rate": 9.461900624372233e-06, | |
| "loss": 0.5763, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2373395887953636, | |
| "grad_norm": 1.1368854612150283, | |
| "learning_rate": 9.439955909883493e-06, | |
| "loss": 0.5815, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.2400993514557748, | |
| "grad_norm": 1.0103375787124518, | |
| "learning_rate": 9.417599123071417e-06, | |
| "loss": 0.5838, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.24285911411618602, | |
| "grad_norm": 1.0432433126635938, | |
| "learning_rate": 9.39483233886946e-06, | |
| "loss": 0.5785, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.24561887677659722, | |
| "grad_norm": 1.1847765522018567, | |
| "learning_rate": 9.371657670262947e-06, | |
| "loss": 0.5742, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.2483786394370084, | |
| "grad_norm": 0.9243729444752771, | |
| "learning_rate": 9.348077268092951e-06, | |
| "loss": 0.5763, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.2511384020974196, | |
| "grad_norm": 1.3733927460707915, | |
| "learning_rate": 9.324093320856679e-06, | |
| "loss": 0.5777, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.25389816475783084, | |
| "grad_norm": 1.098267274159629, | |
| "learning_rate": 9.299708054504355e-06, | |
| "loss": 0.5781, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.256657927418242, | |
| "grad_norm": 0.940463162328034, | |
| "learning_rate": 9.274923732232635e-06, | |
| "loss": 0.5813, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.25941769007865323, | |
| "grad_norm": 0.936901862695869, | |
| "learning_rate": 9.249742654274554e-06, | |
| "loss": 0.576, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.26217745273906445, | |
| "grad_norm": 0.9160606268594537, | |
| "learning_rate": 9.224167157686044e-06, | |
| "loss": 0.5756, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.2649372153994756, | |
| "grad_norm": 0.902157881662283, | |
| "learning_rate": 9.198199616129033e-06, | |
| "loss": 0.5718, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.26769697805988685, | |
| "grad_norm": 0.9251021746256725, | |
| "learning_rate": 9.171842439651143e-06, | |
| "loss": 0.5722, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.27045674072029807, | |
| "grad_norm": 0.9049498340659763, | |
| "learning_rate": 9.145098074462012e-06, | |
| "loss": 0.577, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.27321650338070924, | |
| "grad_norm": 1.0139196145776603, | |
| "learning_rate": 9.117969002706267e-06, | |
| "loss": 0.5738, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.27597626604112047, | |
| "grad_norm": 0.9648188962102778, | |
| "learning_rate": 9.090457742233152e-06, | |
| "loss": 0.5714, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.27597626604112047, | |
| "eval_loss": 0.5767696499824524, | |
| "eval_runtime": 48.8421, | |
| "eval_samples_per_second": 59.948, | |
| "eval_steps_per_second": 3.747, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2787360287015317, | |
| "grad_norm": 0.994421928125615, | |
| "learning_rate": 9.062566846362843e-06, | |
| "loss": 0.5746, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.28149579136194286, | |
| "grad_norm": 0.9627083740767366, | |
| "learning_rate": 9.034298903649485e-06, | |
| "loss": 0.5731, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.2842555540223541, | |
| "grad_norm": 0.9143645503640928, | |
| "learning_rate": 9.005656537640942e-06, | |
| "loss": 0.574, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.2870153166827653, | |
| "grad_norm": 1.0384970550188914, | |
| "learning_rate": 8.976642406635295e-06, | |
| "loss": 0.5725, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.2897750793431765, | |
| "grad_norm": 0.9669853254759909, | |
| "learning_rate": 8.947259203434147e-06, | |
| "loss": 0.5737, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.2925348420035877, | |
| "grad_norm": 0.965075042317439, | |
| "learning_rate": 8.917509655092691e-06, | |
| "loss": 0.5723, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.29529460466399887, | |
| "grad_norm": 0.9720003134521186, | |
| "learning_rate": 8.887396522666608e-06, | |
| "loss": 0.566, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.2980543673244101, | |
| "grad_norm": 1.0399992404707865, | |
| "learning_rate": 8.85692260095582e-06, | |
| "loss": 0.5683, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.3008141299848213, | |
| "grad_norm": 1.0679186267160017, | |
| "learning_rate": 8.826090718245112e-06, | |
| "loss": 0.5661, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.3035738926452325, | |
| "grad_norm": 0.9643258915292706, | |
| "learning_rate": 8.794903736041622e-06, | |
| "loss": 0.5717, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3063336553056437, | |
| "grad_norm": 0.9136180676106652, | |
| "learning_rate": 8.763364548809279e-06, | |
| "loss": 0.5658, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.30909341796605494, | |
| "grad_norm": 0.9512590668833296, | |
| "learning_rate": 8.731476083700154e-06, | |
| "loss": 0.5679, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.3118531806264661, | |
| "grad_norm": 0.8874320591828697, | |
| "learning_rate": 8.699241300282806e-06, | |
| "loss": 0.5684, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.31461294328687733, | |
| "grad_norm": 0.8922502034099749, | |
| "learning_rate": 8.666663190267596e-06, | |
| "loss": 0.5621, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.31737270594728856, | |
| "grad_norm": 1.6201987068583557, | |
| "learning_rate": 8.633744777229029e-06, | |
| "loss": 0.569, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3201324686076997, | |
| "grad_norm": 0.9202299543891491, | |
| "learning_rate": 8.600489116325128e-06, | |
| "loss": 0.5678, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.32289223126811095, | |
| "grad_norm": 1.007642513849446, | |
| "learning_rate": 8.566899294013901e-06, | |
| "loss": 0.5696, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.3256519939285221, | |
| "grad_norm": 0.9837333100849195, | |
| "learning_rate": 8.53297842776687e-06, | |
| "loss": 0.5655, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.32841175658893335, | |
| "grad_norm": 0.9874781114751451, | |
| "learning_rate": 8.498729665779751e-06, | |
| "loss": 0.5653, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.33117151924934457, | |
| "grad_norm": 0.9230978368811615, | |
| "learning_rate": 8.464156186680262e-06, | |
| "loss": 0.5665, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.33393128190975574, | |
| "grad_norm": 0.8844537320977038, | |
| "learning_rate": 8.429261199233114e-06, | |
| "loss": 0.5633, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.33669104457016696, | |
| "grad_norm": 0.8743072862956215, | |
| "learning_rate": 8.394047942042215e-06, | |
| "loss": 0.5648, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.3394508072305782, | |
| "grad_norm": 0.9571103987457272, | |
| "learning_rate": 8.358519683250087e-06, | |
| "loss": 0.5628, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.34221056989098936, | |
| "grad_norm": 0.9109271244972179, | |
| "learning_rate": 8.322679720234553e-06, | |
| "loss": 0.5665, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.3449703325514006, | |
| "grad_norm": 0.9812446794213074, | |
| "learning_rate": 8.286531379302703e-06, | |
| "loss": 0.5615, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.3477300952118118, | |
| "grad_norm": 0.9841621501852277, | |
| "learning_rate": 8.25007801538218e-06, | |
| "loss": 0.5626, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.350489857872223, | |
| "grad_norm": 0.9203538923510197, | |
| "learning_rate": 8.21332301170982e-06, | |
| "loss": 0.5645, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.3532496205326342, | |
| "grad_norm": 1.0479420531686792, | |
| "learning_rate": 8.17626977951764e-06, | |
| "loss": 0.5622, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.3560093831930454, | |
| "grad_norm": 0.9787960431652523, | |
| "learning_rate": 8.138921757716245e-06, | |
| "loss": 0.5581, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.3587691458534566, | |
| "grad_norm": 0.9530332373900086, | |
| "learning_rate": 8.101282412575673e-06, | |
| "loss": 0.5618, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.3615289085138678, | |
| "grad_norm": 1.0048234119883286, | |
| "learning_rate": 8.063355237403672e-06, | |
| "loss": 0.5555, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.364288671174279, | |
| "grad_norm": 0.9557255183834055, | |
| "learning_rate": 8.0251437522215e-06, | |
| "loss": 0.5588, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.3670484338346902, | |
| "grad_norm": 0.8586667700859966, | |
| "learning_rate": 7.986651503437233e-06, | |
| "loss": 0.5638, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.36980819649510144, | |
| "grad_norm": 0.910151087145653, | |
| "learning_rate": 7.947882063516612e-06, | |
| "loss": 0.561, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.3725679591555126, | |
| "grad_norm": 1.055588127662, | |
| "learning_rate": 7.908839030651488e-06, | |
| "loss": 0.558, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.37532772181592383, | |
| "grad_norm": 0.9158993525267682, | |
| "learning_rate": 7.869526028425878e-06, | |
| "loss": 0.5587, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.37808748447633506, | |
| "grad_norm": 0.9374100988045402, | |
| "learning_rate": 7.829946705479654e-06, | |
| "loss": 0.5584, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.3808472471367462, | |
| "grad_norm": 0.9220438433292831, | |
| "learning_rate": 7.790104735169915e-06, | |
| "loss": 0.5576, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.38360700979715745, | |
| "grad_norm": 0.987945504497949, | |
| "learning_rate": 7.750003815230062e-06, | |
| "loss": 0.5558, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.3863667724575687, | |
| "grad_norm": 0.9218901797189658, | |
| "learning_rate": 7.70964766742662e-06, | |
| "loss": 0.5611, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.38912653511797984, | |
| "grad_norm": 0.964522451572826, | |
| "learning_rate": 7.669040037213795e-06, | |
| "loss": 0.5559, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.39188629777839107, | |
| "grad_norm": 0.8962914985254243, | |
| "learning_rate": 7.628184693385896e-06, | |
| "loss": 0.5601, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.39464606043880224, | |
| "grad_norm": 1.0900746038682583, | |
| "learning_rate": 7.587085427727523e-06, | |
| "loss": 0.5555, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.39740582309921346, | |
| "grad_norm": 0.8919597256872009, | |
| "learning_rate": 7.54574605466166e-06, | |
| "loss": 0.554, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.4001655857596247, | |
| "grad_norm": 0.937111912961035, | |
| "learning_rate": 7.504170410895668e-06, | |
| "loss": 0.5576, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.40292534842003586, | |
| "grad_norm": 0.9458249661943667, | |
| "learning_rate": 7.462362355065189e-06, | |
| "loss": 0.5531, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.4056851110804471, | |
| "grad_norm": 0.9326011700869207, | |
| "learning_rate": 7.420325767376026e-06, | |
| "loss": 0.5536, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.4084448737408583, | |
| "grad_norm": 1.0186083692765828, | |
| "learning_rate": 7.378064549244031e-06, | |
| "loss": 0.5512, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.4112046364012695, | |
| "grad_norm": 1.2256145317450289, | |
| "learning_rate": 7.335582622933e-06, | |
| "loss": 0.5601, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.4139643990616807, | |
| "grad_norm": 0.8742188645170639, | |
| "learning_rate": 7.292883931190667e-06, | |
| "loss": 0.5524, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4139643990616807, | |
| "eval_loss": 0.5561984777450562, | |
| "eval_runtime": 49.4336, | |
| "eval_samples_per_second": 59.231, | |
| "eval_steps_per_second": 3.702, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4167241617220919, | |
| "grad_norm": 1.2121655526375013, | |
| "learning_rate": 7.249972436882756e-06, | |
| "loss": 0.5542, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.4194839243825031, | |
| "grad_norm": 0.9357492214275386, | |
| "learning_rate": 7.206852122625203e-06, | |
| "loss": 0.5524, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.4222436870429143, | |
| "grad_norm": 0.9433912153799177, | |
| "learning_rate": 7.163526990414522e-06, | |
| "loss": 0.5542, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.4250034497033255, | |
| "grad_norm": 1.1073143594605637, | |
| "learning_rate": 7.120001061256387e-06, | |
| "loss": 0.5525, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.4277632123637367, | |
| "grad_norm": 0.9302028752309601, | |
| "learning_rate": 7.076278374792429e-06, | |
| "loss": 0.5541, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.43052297502414794, | |
| "grad_norm": 0.9606456277899846, | |
| "learning_rate": 7.032362988925332e-06, | |
| "loss": 0.5553, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.4332827376845591, | |
| "grad_norm": 0.9251439989697735, | |
| "learning_rate": 6.9882589794422105e-06, | |
| "loss": 0.5489, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.43604250034497033, | |
| "grad_norm": 0.9323884014701264, | |
| "learning_rate": 6.943970439636336e-06, | |
| "loss": 0.5494, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.43880226300538155, | |
| "grad_norm": 1.3158449326861952, | |
| "learning_rate": 6.899501479927242e-06, | |
| "loss": 0.5484, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.4415620256657927, | |
| "grad_norm": 0.8775790961507058, | |
| "learning_rate": 6.8548562274792325e-06, | |
| "loss": 0.5453, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.44432178832620395, | |
| "grad_norm": 0.9478549256736298, | |
| "learning_rate": 6.81003882581834e-06, | |
| "loss": 0.5482, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.4470815509866152, | |
| "grad_norm": 0.9723342894189143, | |
| "learning_rate": 6.765053434447769e-06, | |
| "loss": 0.5487, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.44984131364702634, | |
| "grad_norm": 0.974749959194784, | |
| "learning_rate": 6.7199042284618484e-06, | |
| "loss": 0.5505, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.45260107630743757, | |
| "grad_norm": 0.9255671399322642, | |
| "learning_rate": 6.674595398158541e-06, | |
| "loss": 0.5493, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.4553608389678488, | |
| "grad_norm": 0.9465763497145431, | |
| "learning_rate": 6.629131148650543e-06, | |
| "loss": 0.5444, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.45812060162825996, | |
| "grad_norm": 0.9292686081943923, | |
| "learning_rate": 6.583515699475009e-06, | |
| "loss": 0.5496, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.4608803642886712, | |
| "grad_norm": 1.108323416936679, | |
| "learning_rate": 6.537753284201935e-06, | |
| "loss": 0.5487, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.46364012694908235, | |
| "grad_norm": 0.9042384031621171, | |
| "learning_rate": 6.491848150041242e-06, | |
| "loss": 0.5477, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.4663998896094936, | |
| "grad_norm": 0.850329034972769, | |
| "learning_rate": 6.4458045574485875e-06, | |
| "loss": 0.5519, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.4691596522699048, | |
| "grad_norm": 0.9490531745735549, | |
| "learning_rate": 6.399626779729959e-06, | |
| "loss": 0.5439, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.471919414930316, | |
| "grad_norm": 0.9352622534171564, | |
| "learning_rate": 6.353319102645069e-06, | |
| "loss": 0.5465, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.4746791775907272, | |
| "grad_norm": 0.9191680266246547, | |
| "learning_rate": 6.306885824009585e-06, | |
| "loss": 0.547, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.4774389402511384, | |
| "grad_norm": 0.8597116149610817, | |
| "learning_rate": 6.260331253296259e-06, | |
| "loss": 0.5495, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.4801987029115496, | |
| "grad_norm": 0.929502998942292, | |
| "learning_rate": 6.213659711234958e-06, | |
| "loss": 0.5481, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.4829584655719608, | |
| "grad_norm": 0.9675046561306946, | |
| "learning_rate": 6.1668755294116655e-06, | |
| "loss": 0.5468, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.48571822823237204, | |
| "grad_norm": 0.9073333178919157, | |
| "learning_rate": 6.119983049866456e-06, | |
| "loss": 0.5438, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.4884779908927832, | |
| "grad_norm": 0.9109565773453232, | |
| "learning_rate": 6.072986624690516e-06, | |
| "loss": 0.5478, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.49123775355319443, | |
| "grad_norm": 0.8955027642684514, | |
| "learning_rate": 6.025890615622233e-06, | |
| "loss": 0.5415, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.4939975162136056, | |
| "grad_norm": 0.8648546291027864, | |
| "learning_rate": 5.97869939364237e-06, | |
| "loss": 0.5411, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.4967572788740168, | |
| "grad_norm": 0.9523614008291538, | |
| "learning_rate": 5.9314173385683986e-06, | |
| "loss": 0.5453, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.49951704153442805, | |
| "grad_norm": 0.9383064266388607, | |
| "learning_rate": 5.884048838648017e-06, | |
| "loss": 0.5384, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.5022768041948392, | |
| "grad_norm": 0.9021624786717238, | |
| "learning_rate": 5.836598290151866e-06, | |
| "loss": 0.5456, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.5050365668552504, | |
| "grad_norm": 0.9088357380698893, | |
| "learning_rate": 5.789070096965514e-06, | |
| "loss": 0.5426, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.5077963295156617, | |
| "grad_norm": 0.922862032580903, | |
| "learning_rate": 5.741468670180737e-06, | |
| "loss": 0.5434, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.5105560921760729, | |
| "grad_norm": 1.4926573419318607, | |
| "learning_rate": 5.6937984276861195e-06, | |
| "loss": 0.5441, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.513315854836484, | |
| "grad_norm": 0.8580464247033338, | |
| "learning_rate": 5.646063793757028e-06, | |
| "loss": 0.5413, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.5160756174968952, | |
| "grad_norm": 0.8836024449571962, | |
| "learning_rate": 5.598269198645008e-06, | |
| "loss": 0.5457, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.5188353801573065, | |
| "grad_norm": 0.8951578721801148, | |
| "learning_rate": 5.550419078166594e-06, | |
| "loss": 0.5452, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.5215951428177177, | |
| "grad_norm": 0.9955192132312739, | |
| "learning_rate": 5.502517873291632e-06, | |
| "loss": 0.5467, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.5243549054781289, | |
| "grad_norm": 1.1988654703686727, | |
| "learning_rate": 5.454570029731115e-06, | |
| "loss": 0.5413, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5271146681385401, | |
| "grad_norm": 0.8839237136665734, | |
| "learning_rate": 5.406579997524567e-06, | |
| "loss": 0.5411, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.5298744307989512, | |
| "grad_norm": 0.9047836025418462, | |
| "learning_rate": 5.358552230627044e-06, | |
| "loss": 0.539, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.5326341934593625, | |
| "grad_norm": 0.875356281556686, | |
| "learning_rate": 5.310491186495757e-06, | |
| "loss": 0.5429, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.5353939561197737, | |
| "grad_norm": 0.9167892739496736, | |
| "learning_rate": 5.262401325676378e-06, | |
| "loss": 0.5384, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.5381537187801849, | |
| "grad_norm": 0.9525976300842104, | |
| "learning_rate": 5.214287111389057e-06, | |
| "loss": 0.5362, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.5409134814405961, | |
| "grad_norm": 0.885862636010332, | |
| "learning_rate": 5.166153009114188e-06, | |
| "loss": 0.5378, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.5436732441010073, | |
| "grad_norm": 0.9013246458040829, | |
| "learning_rate": 5.1180034861779685e-06, | |
| "loss": 0.5399, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.5464330067614185, | |
| "grad_norm": 0.9017690594117211, | |
| "learning_rate": 5.069843011337789e-06, | |
| "loss": 0.5412, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.5491927694218297, | |
| "grad_norm": 0.9063217665862804, | |
| "learning_rate": 5.0216760543674855e-06, | |
| "loss": 0.5415, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.5519525320822409, | |
| "grad_norm": 0.8411472472974292, | |
| "learning_rate": 4.973507085642502e-06, | |
| "loss": 0.537, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5519525320822409, | |
| "eval_loss": 0.540687620639801, | |
| "eval_runtime": 48.7225, | |
| "eval_samples_per_second": 60.095, | |
| "eval_steps_per_second": 3.756, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5547122947426522, | |
| "grad_norm": 0.8884146283634436, | |
| "learning_rate": 4.92534057572499e-06, | |
| "loss": 0.5406, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.5574720574030634, | |
| "grad_norm": 0.8914897264034541, | |
| "learning_rate": 4.8771809949489056e-06, | |
| "loss": 0.5385, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.5602318200634745, | |
| "grad_norm": 0.936304320970765, | |
| "learning_rate": 4.829032813005103e-06, | |
| "loss": 0.542, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.5629915827238857, | |
| "grad_norm": 0.9310122808691009, | |
| "learning_rate": 4.780900498526515e-06, | |
| "loss": 0.5365, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.5657513453842969, | |
| "grad_norm": 0.9518666584186597, | |
| "learning_rate": 4.732788518673418e-06, | |
| "loss": 0.5353, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.5685111080447082, | |
| "grad_norm": 0.9510071761987918, | |
| "learning_rate": 4.684701338718825e-06, | |
| "loss": 0.5317, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.5712708707051194, | |
| "grad_norm": 0.9432991523879628, | |
| "learning_rate": 4.636643421634075e-06, | |
| "loss": 0.5371, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.5740306333655306, | |
| "grad_norm": 0.8853811386420297, | |
| "learning_rate": 4.588619227674619e-06, | |
| "loss": 0.5365, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.5767903960259417, | |
| "grad_norm": 0.9290294341366813, | |
| "learning_rate": 4.540633213966064e-06, | |
| "loss": 0.5334, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.579550158686353, | |
| "grad_norm": 0.9327668087903835, | |
| "learning_rate": 4.492689834090508e-06, | |
| "loss": 0.5341, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.5823099213467642, | |
| "grad_norm": 1.3176571259191412, | |
| "learning_rate": 4.444793537673204e-06, | |
| "loss": 0.5306, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.5850696840071754, | |
| "grad_norm": 0.9485574547153368, | |
| "learning_rate": 4.396948769969587e-06, | |
| "loss": 0.5379, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.5878294466675866, | |
| "grad_norm": 0.951959101524703, | |
| "learning_rate": 4.3491599714527115e-06, | |
| "loss": 0.5348, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.5905892093279977, | |
| "grad_norm": 0.8736078038744113, | |
| "learning_rate": 4.301431577401136e-06, | |
| "loss": 0.5323, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.593348971988409, | |
| "grad_norm": 0.8979881907613325, | |
| "learning_rate": 4.253768017487275e-06, | |
| "loss": 0.5389, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.5961087346488202, | |
| "grad_norm": 0.9371779488004714, | |
| "learning_rate": 4.206173715366289e-06, | |
| "loss": 0.5343, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.5988684973092314, | |
| "grad_norm": 0.9563875289878281, | |
| "learning_rate": 4.1586530882655226e-06, | |
| "loss": 0.5352, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.6016282599696426, | |
| "grad_norm": 0.9747450112581771, | |
| "learning_rate": 4.111210546574545e-06, | |
| "loss": 0.534, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.6043880226300539, | |
| "grad_norm": 0.9351891869474309, | |
| "learning_rate": 4.063850493435808e-06, | |
| "loss": 0.5298, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.607147785290465, | |
| "grad_norm": 0.8980425451702817, | |
| "learning_rate": 4.0165773243360105e-06, | |
| "loss": 0.5338, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.6099075479508762, | |
| "grad_norm": 0.9224682367333663, | |
| "learning_rate": 3.96939542669814e-06, | |
| "loss": 0.5306, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.6126673106112874, | |
| "grad_norm": 0.8491438680952316, | |
| "learning_rate": 3.922309179474279e-06, | |
| "loss": 0.5306, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.6154270732716987, | |
| "grad_norm": 0.9550543136397366, | |
| "learning_rate": 3.875322952739196e-06, | |
| "loss": 0.5348, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.6181868359321099, | |
| "grad_norm": 0.8390020003541692, | |
| "learning_rate": 3.828441107284755e-06, | |
| "loss": 0.5343, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.620946598592521, | |
| "grad_norm": 0.892945059961568, | |
| "learning_rate": 3.7816679942151945e-06, | |
| "loss": 0.523, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.6237063612529322, | |
| "grad_norm": 0.9210249115479134, | |
| "learning_rate": 3.7350079545433014e-06, | |
| "loss": 0.5279, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.6264661239133434, | |
| "grad_norm": 0.9570746572961814, | |
| "learning_rate": 3.6884653187875193e-06, | |
| "loss": 0.5293, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.6292258865737547, | |
| "grad_norm": 0.8534492122472642, | |
| "learning_rate": 3.642044406570031e-06, | |
| "loss": 0.5256, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.6319856492341659, | |
| "grad_norm": 0.8732168431838571, | |
| "learning_rate": 3.595749526215862e-06, | |
| "loss": 0.5328, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.6347454118945771, | |
| "grad_norm": 0.8962727112777485, | |
| "learning_rate": 3.549584974353018e-06, | |
| "loss": 0.5336, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6375051745549882, | |
| "grad_norm": 0.8730073595406601, | |
| "learning_rate": 3.5035550355137156e-06, | |
| "loss": 0.5318, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.6402649372153995, | |
| "grad_norm": 0.9162256052176202, | |
| "learning_rate": 3.457663981736739e-06, | |
| "loss": 0.5346, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.6430246998758107, | |
| "grad_norm": 0.8555722066919174, | |
| "learning_rate": 3.411916072170946e-06, | |
| "loss": 0.5318, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.6457844625362219, | |
| "grad_norm": 0.8407652045731654, | |
| "learning_rate": 3.3663155526799827e-06, | |
| "loss": 0.5306, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.6485442251966331, | |
| "grad_norm": 0.876217339145573, | |
| "learning_rate": 3.3208666554482216e-06, | |
| "loss": 0.5291, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6513039878570442, | |
| "grad_norm": 0.897083649993733, | |
| "learning_rate": 3.275573598587969e-06, | |
| "loss": 0.5229, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.6540637505174555, | |
| "grad_norm": 0.7995897085360415, | |
| "learning_rate": 3.230440585747991e-06, | |
| "loss": 0.527, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.6568235131778667, | |
| "grad_norm": 0.9014903537041772, | |
| "learning_rate": 3.185471805723365e-06, | |
| "loss": 0.5261, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.6595832758382779, | |
| "grad_norm": 0.856492997294296, | |
| "learning_rate": 3.140671432066719e-06, | |
| "loss": 0.5308, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.6623430384986891, | |
| "grad_norm": 0.888225217505466, | |
| "learning_rate": 3.096043622700888e-06, | |
| "loss": 0.5294, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.6651028011591004, | |
| "grad_norm": 0.9531940854665482, | |
| "learning_rate": 3.0515925195330148e-06, | |
| "loss": 0.5283, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.6678625638195115, | |
| "grad_norm": 1.2539256252009454, | |
| "learning_rate": 3.0073222480701354e-06, | |
| "loss": 0.5275, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.6706223264799227, | |
| "grad_norm": 0.9137548818626098, | |
| "learning_rate": 2.9632369170362977e-06, | |
| "loss": 0.5235, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.6733820891403339, | |
| "grad_norm": 1.6494495739553416, | |
| "learning_rate": 2.9193406179912297e-06, | |
| "loss": 0.5271, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.6761418518007452, | |
| "grad_norm": 0.8590622849369576, | |
| "learning_rate": 2.875637424950595e-06, | |
| "loss": 0.522, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.6789016144611564, | |
| "grad_norm": 0.9186749142898574, | |
| "learning_rate": 2.832131394007891e-06, | |
| "loss": 0.5221, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.6816613771215675, | |
| "grad_norm": 0.9090102002494171, | |
| "learning_rate": 2.788826562958e-06, | |
| "loss": 0.5289, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.6844211397819787, | |
| "grad_norm": 1.0599147672457048, | |
| "learning_rate": 2.745726950922444e-06, | |
| "loss": 0.5241, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.6871809024423899, | |
| "grad_norm": 0.8881000835172156, | |
| "learning_rate": 2.7028365579763606e-06, | |
| "loss": 0.5298, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.6899406651028012, | |
| "grad_norm": 0.8992940208666876, | |
| "learning_rate": 2.6601593647772696e-06, | |
| "loss": 0.5282, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.6899406651028012, | |
| "eval_loss": 0.5283246636390686, | |
| "eval_runtime": 51.449, | |
| "eval_samples_per_second": 56.911, | |
| "eval_steps_per_second": 3.557, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.6927004277632124, | |
| "grad_norm": 0.9769386150489252, | |
| "learning_rate": 2.6176993321956185e-06, | |
| "loss": 0.5253, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.6954601904236236, | |
| "grad_norm": 0.9191890198831723, | |
| "learning_rate": 2.5754604009471786e-06, | |
| "loss": 0.5229, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.6982199530840347, | |
| "grad_norm": 0.8879005162140707, | |
| "learning_rate": 2.533446491227305e-06, | |
| "loss": 0.5247, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.700979715744446, | |
| "grad_norm": 0.9109715281747162, | |
| "learning_rate": 2.491661502347106e-06, | |
| "loss": 0.5218, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.7037394784048572, | |
| "grad_norm": 0.8969876737597425, | |
| "learning_rate": 2.4501093123715395e-06, | |
| "loss": 0.5238, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.7064992410652684, | |
| "grad_norm": 0.9123939456680997, | |
| "learning_rate": 2.408793777759504e-06, | |
| "loss": 0.5234, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.7092590037256796, | |
| "grad_norm": 0.8767338790553574, | |
| "learning_rate": 2.3677187330059084e-06, | |
| "loss": 0.5247, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.7120187663860909, | |
| "grad_norm": 0.9605486388857951, | |
| "learning_rate": 2.3268879902857978e-06, | |
| "loss": 0.5194, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.714778529046502, | |
| "grad_norm": 0.9895359728969572, | |
| "learning_rate": 2.2863053391005462e-06, | |
| "loss": 0.5236, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.7175382917069132, | |
| "grad_norm": 0.957733912839131, | |
| "learning_rate": 2.245974545926152e-06, | |
| "loss": 0.5179, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.7202980543673244, | |
| "grad_norm": 0.9601154451397089, | |
| "learning_rate": 2.205899353863665e-06, | |
| "loss": 0.5224, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.7230578170277356, | |
| "grad_norm": 0.9084441231449814, | |
| "learning_rate": 2.166083482291801e-06, | |
| "loss": 0.5254, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.7258175796881469, | |
| "grad_norm": 1.0841389588139543, | |
| "learning_rate": 2.1265306265217382e-06, | |
| "loss": 0.5214, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.728577342348558, | |
| "grad_norm": 1.451146075094232, | |
| "learning_rate": 2.0872444574541574e-06, | |
| "loss": 0.5298, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.7313371050089692, | |
| "grad_norm": 0.9009140743152368, | |
| "learning_rate": 2.048228621238547e-06, | |
| "loss": 0.5209, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.7340968676693804, | |
| "grad_norm": 0.8967623610111647, | |
| "learning_rate": 2.0094867389347982e-06, | |
| "loss": 0.5248, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.7368566303297917, | |
| "grad_norm": 0.9029822617189499, | |
| "learning_rate": 1.971022406177142e-06, | |
| "loss": 0.5227, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.7396163929902029, | |
| "grad_norm": 0.9009675358712146, | |
| "learning_rate": 1.932839192840436e-06, | |
| "loss": 0.522, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.7423761556506141, | |
| "grad_norm": 0.8816707412986489, | |
| "learning_rate": 1.8949406427088407e-06, | |
| "loss": 0.5223, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.7451359183110252, | |
| "grad_norm": 0.8605202814409114, | |
| "learning_rate": 1.8573302731469255e-06, | |
| "loss": 0.5241, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.7478956809714364, | |
| "grad_norm": 0.9612849756128029, | |
| "learning_rate": 1.820011574773221e-06, | |
| "loss": 0.5264, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.7506554436318477, | |
| "grad_norm": 0.8793842539758803, | |
| "learning_rate": 1.7829880111362486e-06, | |
| "loss": 0.5181, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.7534152062922589, | |
| "grad_norm": 1.2081068952668312, | |
| "learning_rate": 1.746263018393079e-06, | |
| "loss": 0.5292, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.7561749689526701, | |
| "grad_norm": 0.8860653822887559, | |
| "learning_rate": 1.7098400049904163e-06, | |
| "loss": 0.5219, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.7589347316130812, | |
| "grad_norm": 0.8901286343549357, | |
| "learning_rate": 1.6737223513482591e-06, | |
| "loss": 0.5259, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.7616944942734925, | |
| "grad_norm": 0.9081512876330592, | |
| "learning_rate": 1.6379134095461673e-06, | |
| "loss": 0.5171, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.7644542569339037, | |
| "grad_norm": 0.854771397996212, | |
| "learning_rate": 1.6024165030121542e-06, | |
| "loss": 0.521, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.7672140195943149, | |
| "grad_norm": 0.9522518174715512, | |
| "learning_rate": 1.567234926214236e-06, | |
| "loss": 0.5149, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.7699737822547261, | |
| "grad_norm": 0.855171965732468, | |
| "learning_rate": 1.5323719443546785e-06, | |
| "loss": 0.5217, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.7727335449151373, | |
| "grad_norm": 0.846551026714045, | |
| "learning_rate": 1.4978307930669483e-06, | |
| "loss": 0.5174, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.7754933075755485, | |
| "grad_norm": 0.9109328397942944, | |
| "learning_rate": 1.4636146781154164e-06, | |
| "loss": 0.5262, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.7782530702359597, | |
| "grad_norm": 0.8916543609749644, | |
| "learning_rate": 1.4297267750978277e-06, | |
| "loss": 0.5258, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.7810128328963709, | |
| "grad_norm": 0.839376705554234, | |
| "learning_rate": 1.3961702291505791e-06, | |
| "loss": 0.5233, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.7837725955567821, | |
| "grad_norm": 0.9530767315659909, | |
| "learning_rate": 1.3629481546568163e-06, | |
| "loss": 0.5153, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.7865323582171934, | |
| "grad_norm": 0.9433520691735623, | |
| "learning_rate": 1.3300636349573882e-06, | |
| "loss": 0.5169, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.7892921208776045, | |
| "grad_norm": 0.8633774616839734, | |
| "learning_rate": 1.2975197220646807e-06, | |
| "loss": 0.5211, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.7920518835380157, | |
| "grad_norm": 0.8830161483028809, | |
| "learning_rate": 1.2653194363793642e-06, | |
| "loss": 0.5169, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.7948116461984269, | |
| "grad_norm": 0.905868276953366, | |
| "learning_rate": 1.2334657664100614e-06, | |
| "loss": 0.5167, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.7975714088588381, | |
| "grad_norm": 0.8402578283356454, | |
| "learning_rate": 1.2019616684959934e-06, | |
| "loss": 0.5207, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.8003311715192494, | |
| "grad_norm": 1.0070145612320756, | |
| "learning_rate": 1.1708100665325967e-06, | |
| "loss": 0.5176, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.8030909341796606, | |
| "grad_norm": 0.8631246623697408, | |
| "learning_rate": 1.1400138517001564e-06, | |
| "loss": 0.5147, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.8058506968400717, | |
| "grad_norm": 0.8746596064396687, | |
| "learning_rate": 1.1095758821954788e-06, | |
| "loss": 0.5212, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.8086104595004829, | |
| "grad_norm": 0.8440888494194767, | |
| "learning_rate": 1.0794989829666197e-06, | |
| "loss": 0.5154, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.8113702221608942, | |
| "grad_norm": 0.8369885923181033, | |
| "learning_rate": 1.049785945450697e-06, | |
| "loss": 0.516, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.8141299848213054, | |
| "grad_norm": 1.5997002594596106, | |
| "learning_rate": 1.0204395273148277e-06, | |
| "loss": 0.5197, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.8168897474817166, | |
| "grad_norm": 0.8873807689235719, | |
| "learning_rate": 9.914624522001792e-07, | |
| "loss": 0.5232, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.8196495101421277, | |
| "grad_norm": 0.8697811520680214, | |
| "learning_rate": 9.628574094691945e-07, | |
| "loss": 0.5139, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.822409272802539, | |
| "grad_norm": 0.8378502425029508, | |
| "learning_rate": 9.346270539559882e-07, | |
| "loss": 0.52, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.8251690354629502, | |
| "grad_norm": 0.8849196731646373, | |
| "learning_rate": 9.067740057199514e-07, | |
| "loss": 0.518, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.8279287981233614, | |
| "grad_norm": 0.8664792504731555, | |
| "learning_rate": 8.793008498025879e-07, | |
| "loss": 0.5155, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8279287981233614, | |
| "eval_loss": 0.5207065939903259, | |
| "eval_runtime": 49.1882, | |
| "eval_samples_per_second": 59.526, | |
| "eval_steps_per_second": 3.72, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8306885607837726, | |
| "grad_norm": 0.8700644465815897, | |
| "learning_rate": 8.522101359875934e-07, | |
| "loss": 0.5131, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.8334483234441838, | |
| "grad_norm": 0.9097989981675693, | |
| "learning_rate": 8.255043785642108e-07, | |
| "loss": 0.5125, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.836208086104595, | |
| "grad_norm": 0.8631047164247316, | |
| "learning_rate": 7.991860560938786e-07, | |
| "loss": 0.5234, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.8389678487650062, | |
| "grad_norm": 1.2358190890284166, | |
| "learning_rate": 7.732576111801982e-07, | |
| "loss": 0.5176, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.8417276114254174, | |
| "grad_norm": 0.8514248482095018, | |
| "learning_rate": 7.477214502422281e-07, | |
| "loss": 0.5223, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.8444873740858286, | |
| "grad_norm": 0.8563129413552018, | |
| "learning_rate": 7.225799432911557e-07, | |
| "loss": 0.5219, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.8472471367462399, | |
| "grad_norm": 1.6151093899813862, | |
| "learning_rate": 6.978354237103264e-07, | |
| "loss": 0.516, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.850006899406651, | |
| "grad_norm": 2.00845392683435, | |
| "learning_rate": 6.734901880386896e-07, | |
| "loss": 0.5185, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.8527666620670622, | |
| "grad_norm": 1.1862873972091095, | |
| "learning_rate": 6.495464957576508e-07, | |
| "loss": 0.5122, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.8555264247274734, | |
| "grad_norm": 0.8470639388291085, | |
| "learning_rate": 6.260065690813754e-07, | |
| "loss": 0.5158, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.8582861873878846, | |
| "grad_norm": 0.9219107715988072, | |
| "learning_rate": 6.028725927505369e-07, | |
| "loss": 0.5201, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.8610459500482959, | |
| "grad_norm": 0.8827729120692012, | |
| "learning_rate": 5.801467138295597e-07, | |
| "loss": 0.5189, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.8638057127087071, | |
| "grad_norm": 0.8610445224310359, | |
| "learning_rate": 5.578310415073451e-07, | |
| "loss": 0.515, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.8665654753691182, | |
| "grad_norm": 0.870963273635567, | |
| "learning_rate": 5.359276469015179e-07, | |
| "loss": 0.5151, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.8693252380295294, | |
| "grad_norm": 0.8396590023282043, | |
| "learning_rate": 5.14438562866208e-07, | |
| "loss": 0.5223, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.8720850006899407, | |
| "grad_norm": 0.8992961235238134, | |
| "learning_rate": 4.933657838033795e-07, | |
| "loss": 0.5144, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.8748447633503519, | |
| "grad_norm": 0.9038948906592709, | |
| "learning_rate": 4.7271126547772773e-07, | |
| "loss": 0.5165, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.8776045260107631, | |
| "grad_norm": 0.8686028788473747, | |
| "learning_rate": 4.524769248351718e-07, | |
| "loss": 0.5157, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.8803642886711743, | |
| "grad_norm": 0.8744713706157983, | |
| "learning_rate": 4.3266463982493566e-07, | |
| "loss": 0.5167, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.8831240513315854, | |
| "grad_norm": 0.8953682081464681, | |
| "learning_rate": 4.132762492252601e-07, | |
| "loss": 0.5229, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.8858838139919967, | |
| "grad_norm": 1.0500886926956579, | |
| "learning_rate": 3.943135524727448e-07, | |
| "loss": 0.5168, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.8886435766524079, | |
| "grad_norm": 0.8891911066911182, | |
| "learning_rate": 3.757783094953382e-07, | |
| "loss": 0.52, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.8914033393128191, | |
| "grad_norm": 0.8469428073562058, | |
| "learning_rate": 3.5767224054900687e-07, | |
| "loss": 0.514, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.8941631019732303, | |
| "grad_norm": 0.8985979256834127, | |
| "learning_rate": 3.3999702605807203e-07, | |
| "loss": 0.5153, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.8969228646336415, | |
| "grad_norm": 0.8740663379412131, | |
| "learning_rate": 3.227543064592514e-07, | |
| "loss": 0.5179, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.8996826272940527, | |
| "grad_norm": 0.9253549886032714, | |
| "learning_rate": 3.059456820494111e-07, | |
| "loss": 0.5187, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.9024423899544639, | |
| "grad_norm": 0.8657401705396123, | |
| "learning_rate": 2.8957271283704067e-07, | |
| "loss": 0.5188, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.9052021526148751, | |
| "grad_norm": 0.8317058461633765, | |
| "learning_rate": 2.736369183974685e-07, | |
| "loss": 0.5161, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.9079619152752864, | |
| "grad_norm": 0.8738336902844653, | |
| "learning_rate": 2.5813977773183175e-07, | |
| "loss": 0.515, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.9107216779356976, | |
| "grad_norm": 0.9137477549092036, | |
| "learning_rate": 2.430827291298099e-07, | |
| "loss": 0.5198, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.9134814405961087, | |
| "grad_norm": 0.8521777662643962, | |
| "learning_rate": 2.2846717003613462e-07, | |
| "loss": 0.5221, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.9162412032565199, | |
| "grad_norm": 1.12701228509693, | |
| "learning_rate": 2.1429445692089712e-07, | |
| "loss": 0.5154, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.9190009659169311, | |
| "grad_norm": 0.9027916863842183, | |
| "learning_rate": 2.0056590515365016e-07, | |
| "loss": 0.5159, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.9217607285773424, | |
| "grad_norm": 0.839347519350046, | |
| "learning_rate": 1.8728278888132944e-07, | |
| "loss": 0.518, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.9245204912377536, | |
| "grad_norm": 0.8627735870099915, | |
| "learning_rate": 1.744463409100039e-07, | |
| "loss": 0.5189, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.9272802538981647, | |
| "grad_norm": 0.8677279393067534, | |
| "learning_rate": 1.620577525904532e-07, | |
| "loss": 0.5099, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.9300400165585759, | |
| "grad_norm": 0.8705917700670992, | |
| "learning_rate": 1.501181737076035e-07, | |
| "loss": 0.5154, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.9327997792189872, | |
| "grad_norm": 0.9046790697526798, | |
| "learning_rate": 1.3862871237381004e-07, | |
| "loss": 0.5108, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.9355595418793984, | |
| "grad_norm": 0.8203655507439958, | |
| "learning_rate": 1.2759043492601986e-07, | |
| "loss": 0.5136, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.9383193045398096, | |
| "grad_norm": 0.8949093577414237, | |
| "learning_rate": 1.1700436582680108e-07, | |
| "loss": 0.5192, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.9410790672002208, | |
| "grad_norm": 0.8425742363423367, | |
| "learning_rate": 1.068714875692628e-07, | |
| "loss": 0.5116, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.943838829860632, | |
| "grad_norm": 0.9030687400394452, | |
| "learning_rate": 9.719274058587247e-08, | |
| "loss": 0.5125, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.9465985925210432, | |
| "grad_norm": 0.8753228488205499, | |
| "learning_rate": 8.796902316117018e-08, | |
| "loss": 0.5156, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.9493583551814544, | |
| "grad_norm": 0.8396184463940205, | |
| "learning_rate": 7.920119134840199e-08, | |
| "loss": 0.5141, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.9521181178418656, | |
| "grad_norm": 0.882889766020658, | |
| "learning_rate": 7.08900588900685e-08, | |
| "loss": 0.5131, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.9548778805022768, | |
| "grad_norm": 0.8504174898031396, | |
| "learning_rate": 6.303639714240196e-08, | |
| "loss": 0.5193, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.957637643162688, | |
| "grad_norm": 0.8595498992279648, | |
| "learning_rate": 5.564093500377732e-08, | |
| "loss": 0.515, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.9603974058230992, | |
| "grad_norm": 0.8244758548316357, | |
| "learning_rate": 4.870435884705982e-08, | |
| "loss": 0.5135, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.9631571684835104, | |
| "grad_norm": 0.9192549923625648, | |
| "learning_rate": 4.22273124559075e-08, | |
| "loss": 0.5165, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.9659169311439216, | |
| "grad_norm": 1.8467401343991516, | |
| "learning_rate": 3.621039696501794e-08, | |
| "loss": 0.5106, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.9659169311439216, | |
| "eval_loss": 0.5180693864822388, | |
| "eval_runtime": 48.9508, | |
| "eval_samples_per_second": 59.815, | |
| "eval_steps_per_second": 3.738, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.9686766938043329, | |
| "grad_norm": 0.9145993584663505, | |
| "learning_rate": 3.065417080433841e-08, | |
| "loss": 0.517, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.9714364564647441, | |
| "grad_norm": 0.8570121000540161, | |
| "learning_rate": 2.555914964723849e-08, | |
| "loss": 0.5142, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.9741962191251552, | |
| "grad_norm": 0.8678615083324009, | |
| "learning_rate": 2.0925806362648847e-08, | |
| "loss": 0.5198, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.9769559817855664, | |
| "grad_norm": 0.8381539016578547, | |
| "learning_rate": 1.6754570971176944e-08, | |
| "loss": 0.5125, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.9797157444459776, | |
| "grad_norm": 0.9056379885360033, | |
| "learning_rate": 1.3045830605192266e-08, | |
| "loss": 0.5192, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.9824755071063889, | |
| "grad_norm": 0.8696346972561173, | |
| "learning_rate": 9.799929472902315e-09, | |
| "loss": 0.5205, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.9852352697668001, | |
| "grad_norm": 3.314581793197866, | |
| "learning_rate": 7.017168826401466e-09, | |
| "loss": 0.5169, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.9879950324272112, | |
| "grad_norm": 0.8566695475770818, | |
| "learning_rate": 4.697806933715021e-09, | |
| "loss": 0.5095, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.9907547950876224, | |
| "grad_norm": 0.8718028954806468, | |
| "learning_rate": 2.8420590548294825e-09, | |
| "loss": 0.5175, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.9935145577480337, | |
| "grad_norm": 0.8678996243220692, | |
| "learning_rate": 1.450097421710206e-09, | |
| "loss": 0.519, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.9962743204084449, | |
| "grad_norm": 0.8913548884246177, | |
| "learning_rate": 5.220512223219621e-10, | |
| "loss": 0.5135, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.9990340830688561, | |
| "grad_norm": 0.8708615108897616, | |
| "learning_rate": 5.8006588636305704e-11, | |
| "loss": 0.5151, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 3624, | |
| "total_flos": 1884305306419200.0, | |
| "train_loss": 0.5639046454929622, | |
| "train_runtime": 12600.7671, | |
| "train_samples_per_second": 23.004, | |
| "train_steps_per_second": 0.288 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3624, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1884305306419200.0, | |
| "train_batch_size": 5, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
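
The records above follow the standard Hugging Face Trainer state layout, so they can be inspected programmatically. Below is a minimal sketch, not part of the original state file, assuming the contents above are saved as `trainer_state.json`; the field names (`log_history`, `loss`, `eval_loss`, `step`, `logging_steps`, `max_steps`, `num_train_epochs`) are taken from the file itself, while the file path is a placeholder.

```python
import json

# Illustrative sketch: read a Trainer state file like the one above and
# summarize the training/eval loss curves. "trainer_state.json" is an
# assumed path; adjust it to wherever the file is stored.
with open("trainer_state.json") as f:
    state = json.load(f)

# Per-logging-step training losses (entries that carry a "loss" key).
train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
# Periodic evaluation losses (entries that carry an "eval_loss" key).
eval_points = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"logged every {state['logging_steps']} steps, "
      f"{state['max_steps']} steps total, {state['num_train_epochs']} epoch(s)")
if train_points:
    print("last logged train loss:", train_points[-1][1])
for step, loss in eval_points:
    print(f"eval @ step {step}: loss={loss:.4f}")
```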