diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -3,263 +3,8623 @@ "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, - "global_step": 325, + "global_step": 12285, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.15384615384615385, - "grad_norm": 1.6880494356155396, - "learning_rate": 1.9384615384615386e-05, - "loss": 1.0254, + "epoch": 0.0, + "grad_norm": 1.3860865831375122, + "learning_rate": 1.9983719983719984e-05, + "loss": 0.684, "step": 10 }, { - "epoch": 0.3076923076923077, - "grad_norm": 1.6863218545913696, - "learning_rate": 1.876923076923077e-05, - "loss": 0.875, + "epoch": 0.01, + "grad_norm": 1.3011243343353271, + "learning_rate": 1.996743996743997e-05, + "loss": 0.6568, "step": 20 }, { - "epoch": 0.46153846153846156, - "grad_norm": 1.5608391761779785, - "learning_rate": 1.8153846153846155e-05, - "loss": 0.7825, + "epoch": 0.01, + "grad_norm": 1.646723985671997, + "learning_rate": 1.9951159951159952e-05, + "loss": 0.6477, "step": 30 }, { - "epoch": 0.6153846153846154, - "grad_norm": 1.4615881443023682, - "learning_rate": 1.753846153846154e-05, - "loss": 0.6326, + "epoch": 0.02, + "grad_norm": 1.43569016456604, + "learning_rate": 1.9934879934879937e-05, + "loss": 0.61, "step": 40 }, { - "epoch": 0.7692307692307693, - "grad_norm": 2.1352627277374268, - "learning_rate": 1.6923076923076924e-05, - "loss": 0.5693, + "epoch": 0.02, + "grad_norm": 1.4323921203613281, + "learning_rate": 1.991859991859992e-05, + "loss": 0.5758, "step": 50 }, { - "epoch": 0.9230769230769231, - "grad_norm": 1.560471773147583, - "learning_rate": 1.630769230769231e-05, - "loss": 0.4709, + "epoch": 0.02, + "grad_norm": 2.1664583683013916, + "learning_rate": 1.9902319902319905e-05, + "loss": 0.5529, "step": 60 }, { - "epoch": 1.0769230769230769, - "grad_norm": 1.3950369358062744, - "learning_rate": 1.5692307692307693e-05, - "loss": 0.4044, + "epoch": 0.03, + "grad_norm": 1.935644268989563, + "learning_rate": 1.9886039886039888e-05, + "loss": 0.4969, "step": 70 }, { - "epoch": 1.2307692307692308, - "grad_norm": 1.1524418592453003, - "learning_rate": 1.5076923076923078e-05, - "loss": 0.3491, + "epoch": 0.03, + "grad_norm": 2.984022617340088, + "learning_rate": 1.986975986975987e-05, + "loss": 0.5017, "step": 80 }, { - "epoch": 1.3846153846153846, - "grad_norm": 1.3851335048675537, - "learning_rate": 1.4461538461538462e-05, - "loss": 0.3191, + "epoch": 0.04, + "grad_norm": 1.9753074645996094, + "learning_rate": 1.9853479853479855e-05, + "loss": 0.4438, "step": 90 }, { - "epoch": 1.5384615384615383, - "grad_norm": 0.7817617654800415, - "learning_rate": 1.3846153846153847e-05, - "loss": 0.2975, + "epoch": 0.04, + "grad_norm": 4.39138650894165, + "learning_rate": 1.9837199837199838e-05, + "loss": 0.4033, "step": 100 }, { - "epoch": 1.6923076923076923, - "grad_norm": 2.4296486377716064, - "learning_rate": 1.3230769230769231e-05, - "loss": 0.2719, + "epoch": 0.04, + "grad_norm": 3.0486788749694824, + "learning_rate": 1.9820919820919823e-05, + "loss": 0.3642, "step": 110 }, { - "epoch": 1.8461538461538463, - "grad_norm": 3.495382308959961, - "learning_rate": 1.2615384615384616e-05, - "loss": 0.2863, + "epoch": 0.05, + "grad_norm": 1.738529920578003, + "learning_rate": 1.9804639804639806e-05, + "loss": 0.3557, "step": 120 }, { - "epoch": 2.0, - "grad_norm": 0.9178031086921692, - "learning_rate": 1.2e-05, - "loss": 0.2066, + "epoch": 0.05, + "grad_norm": 2.9336562156677246, + "learning_rate": 1.978835978835979e-05, + "loss": 0.3655, "step": 130 }, { - "epoch": 2.1538461538461537, - "grad_norm": 1.042531967163086, - "learning_rate": 1.1384615384615385e-05, - "loss": 0.1934, + "epoch": 0.06, + "grad_norm": 2.0220277309417725, + "learning_rate": 1.9772079772079773e-05, + "loss": 0.2903, "step": 140 }, { - "epoch": 2.3076923076923075, - "grad_norm": 2.2865443229675293, - "learning_rate": 1.076923076923077e-05, - "loss": 0.1905, + "epoch": 0.06, + "grad_norm": 2.4428532123565674, + "learning_rate": 1.975579975579976e-05, + "loss": 0.2706, "step": 150 }, { - "epoch": 2.4615384615384617, - "grad_norm": 2.3291144371032715, - "learning_rate": 1.0153846153846154e-05, - "loss": 0.2134, + "epoch": 0.07, + "grad_norm": 5.031763076782227, + "learning_rate": 1.973951973951974e-05, + "loss": 0.3558, "step": 160 }, { - "epoch": 2.6153846153846154, - "grad_norm": 1.420284628868103, - "learning_rate": 9.53846153846154e-06, - "loss": 0.1895, + "epoch": 0.07, + "grad_norm": 3.3514373302459717, + "learning_rate": 1.9723239723239724e-05, + "loss": 0.232, "step": 170 }, { - "epoch": 2.769230769230769, - "grad_norm": 2.7554662227630615, - "learning_rate": 8.923076923076925e-06, - "loss": 0.1897, + "epoch": 0.07, + "grad_norm": 1.0613574981689453, + "learning_rate": 1.970695970695971e-05, + "loss": 0.2499, "step": 180 }, { - "epoch": 2.9230769230769234, - "grad_norm": 1.4687916040420532, - "learning_rate": 8.307692307692309e-06, - "loss": 0.1761, + "epoch": 0.08, + "grad_norm": 7.994803428649902, + "learning_rate": 1.969067969067969e-05, + "loss": 0.2609, "step": 190 }, { - "epoch": 3.076923076923077, - "grad_norm": 3.535647392272949, - "learning_rate": 7.692307692307694e-06, - "loss": 0.1862, + "epoch": 0.08, + "grad_norm": 1.9980034828186035, + "learning_rate": 1.9674399674399677e-05, + "loss": 0.2267, "step": 200 }, { - "epoch": 3.230769230769231, - "grad_norm": 3.035341739654541, - "learning_rate": 7.076923076923078e-06, - "loss": 0.1937, + "epoch": 0.09, + "grad_norm": 2.297769069671631, + "learning_rate": 1.965811965811966e-05, + "loss": 0.2371, "step": 210 }, { - "epoch": 3.3846153846153846, - "grad_norm": 2.828181028366089, - "learning_rate": 6.461538461538463e-06, - "loss": 0.1407, + "epoch": 0.09, + "grad_norm": 5.071080207824707, + "learning_rate": 1.9641839641839645e-05, + "loss": 0.248, "step": 220 }, { - "epoch": 3.5384615384615383, - "grad_norm": 2.125542163848877, - "learning_rate": 5.846153846153847e-06, - "loss": 0.1338, + "epoch": 0.09, + "grad_norm": 0.43589872121810913, + "learning_rate": 1.9625559625559627e-05, + "loss": 0.151, "step": 230 }, { - "epoch": 3.6923076923076925, - "grad_norm": 1.5795857906341553, - "learning_rate": 5.230769230769232e-06, - "loss": 0.131, + "epoch": 0.1, + "grad_norm": 6.790423393249512, + "learning_rate": 1.960927960927961e-05, + "loss": 0.1886, "step": 240 }, { - "epoch": 3.8461538461538463, - "grad_norm": 0.5753270387649536, - "learning_rate": 4.615384615384616e-06, - "loss": 0.1493, + "epoch": 0.1, + "grad_norm": 1.1880414485931396, + "learning_rate": 1.9592999592999595e-05, + "loss": 0.2104, "step": 250 }, { - "epoch": 4.0, - "grad_norm": 0.9189938306808472, - "learning_rate": 4.000000000000001e-06, - "loss": 0.1489, + "epoch": 0.11, + "grad_norm": 6.434396266937256, + "learning_rate": 1.9576719576719577e-05, + "loss": 0.2847, "step": 260 }, { - "epoch": 4.153846153846154, - "grad_norm": 0.939368724822998, - "learning_rate": 3.384615384615385e-06, - "loss": 0.1615, + "epoch": 0.11, + "grad_norm": 2.175398826599121, + "learning_rate": 1.9560439560439563e-05, + "loss": 0.261, "step": 270 }, { - "epoch": 4.3076923076923075, - "grad_norm": 1.0938255786895752, - "learning_rate": 2.7692307692307697e-06, - "loss": 0.1433, + "epoch": 0.11, + "grad_norm": 2.2410614490509033, + "learning_rate": 1.9544159544159545e-05, + "loss": 0.1376, "step": 280 }, { - "epoch": 4.461538461538462, - "grad_norm": 0.398496150970459, - "learning_rate": 2.153846153846154e-06, - "loss": 0.1485, + "epoch": 0.12, + "grad_norm": 0.5503996014595032, + "learning_rate": 1.952787952787953e-05, + "loss": 0.2034, "step": 290 }, { - "epoch": 4.615384615384615, - "grad_norm": 1.6919806003570557, - "learning_rate": 1.5384615384615387e-06, - "loss": 0.131, + "epoch": 0.12, + "grad_norm": 3.550145387649536, + "learning_rate": 1.9511599511599513e-05, + "loss": 0.1802, "step": 300 }, { - "epoch": 4.769230769230769, - "grad_norm": 0.9723582863807678, - "learning_rate": 9.230769230769232e-07, - "loss": 0.1288, + "epoch": 0.13, + "grad_norm": 0.35100820660591125, + "learning_rate": 1.94953194953195e-05, + "loss": 0.1778, "step": 310 }, { - "epoch": 4.923076923076923, - "grad_norm": 0.5153496265411377, - "learning_rate": 3.0769230769230774e-07, - "loss": 0.109, + "epoch": 0.13, + "grad_norm": 1.0361884832382202, + "learning_rate": 1.947903947903948e-05, + "loss": 0.2186, "step": 320 }, { - "epoch": 5.0, - "step": 325, - "total_flos": 4.006371770595533e+17, - "train_loss": 0.29627929100623496, - "train_runtime": 105.6493, - "train_samples_per_second": 48.935, - "train_steps_per_second": 3.076 - } - ], - "logging_steps": 10, - "max_steps": 325, - "num_input_tokens_seen": 0, - "num_train_epochs": 5, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": false, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 4.006371770595533e+17, + "epoch": 0.13, + "grad_norm": 7.980532169342041, + "learning_rate": 1.9462759462759463e-05, + "loss": 0.2742, + "step": 330 + }, + { + "epoch": 0.14, + "grad_norm": 4.8567585945129395, + "learning_rate": 1.9446479446479445e-05, + "loss": 0.1544, + "step": 340 + }, + { + "epoch": 0.14, + "grad_norm": 5.355805397033691, + "learning_rate": 1.943019943019943e-05, + "loss": 0.2739, + "step": 350 + }, + { + "epoch": 0.15, + "grad_norm": 6.359828472137451, + "learning_rate": 1.9413919413919417e-05, + "loss": 0.2076, + "step": 360 + }, + { + "epoch": 0.15, + "grad_norm": 0.9936553239822388, + "learning_rate": 1.93976393976394e-05, + "loss": 0.1666, + "step": 370 + }, + { + "epoch": 0.15, + "grad_norm": 6.090355396270752, + "learning_rate": 1.9381359381359385e-05, + "loss": 0.1941, + "step": 380 + }, + { + "epoch": 0.16, + "grad_norm": 2.5009548664093018, + "learning_rate": 1.9365079365079367e-05, + "loss": 0.1905, + "step": 390 + }, + { + "epoch": 0.16, + "grad_norm": 8.634650230407715, + "learning_rate": 1.934879934879935e-05, + "loss": 0.1431, + "step": 400 + }, + { + "epoch": 0.17, + "grad_norm": 2.43247389793396, + "learning_rate": 1.9332519332519335e-05, + "loss": 0.1736, + "step": 410 + }, + { + "epoch": 0.17, + "grad_norm": 15.868481636047363, + "learning_rate": 1.9316239316239317e-05, + "loss": 0.249, + "step": 420 + }, + { + "epoch": 0.18, + "grad_norm": 1.8422390222549438, + "learning_rate": 1.9299959299959303e-05, + "loss": 0.1407, + "step": 430 + }, + { + "epoch": 0.18, + "grad_norm": 5.148740291595459, + "learning_rate": 1.9283679283679285e-05, + "loss": 0.1503, + "step": 440 + }, + { + "epoch": 0.18, + "grad_norm": 2.3315675258636475, + "learning_rate": 1.926739926739927e-05, + "loss": 0.1885, + "step": 450 + }, + { + "epoch": 0.19, + "grad_norm": 3.6225030422210693, + "learning_rate": 1.9251119251119253e-05, + "loss": 0.1403, + "step": 460 + }, + { + "epoch": 0.19, + "grad_norm": 4.605388641357422, + "learning_rate": 1.9234839234839235e-05, + "loss": 0.2384, + "step": 470 + }, + { + "epoch": 0.2, + "grad_norm": 2.3162589073181152, + "learning_rate": 1.921855921855922e-05, + "loss": 0.129, + "step": 480 + }, + { + "epoch": 0.2, + "grad_norm": 0.4153892695903778, + "learning_rate": 1.9202279202279203e-05, + "loss": 0.1109, + "step": 490 + }, + { + "epoch": 0.2, + "grad_norm": 7.691011905670166, + "learning_rate": 1.9185999185999185e-05, + "loss": 0.1846, + "step": 500 + }, + { + "epoch": 0.21, + "grad_norm": 7.940028667449951, + "learning_rate": 1.916971916971917e-05, + "loss": 0.1391, + "step": 510 + }, + { + "epoch": 0.21, + "grad_norm": 0.5145124793052673, + "learning_rate": 1.9153439153439156e-05, + "loss": 0.1288, + "step": 520 + }, + { + "epoch": 0.22, + "grad_norm": 2.5785932540893555, + "learning_rate": 1.913715913715914e-05, + "loss": 0.1537, + "step": 530 + }, + { + "epoch": 0.22, + "grad_norm": 6.997181415557861, + "learning_rate": 1.9120879120879124e-05, + "loss": 0.1578, + "step": 540 + }, + { + "epoch": 0.22, + "grad_norm": 2.4879519939422607, + "learning_rate": 1.9104599104599107e-05, + "loss": 0.1954, + "step": 550 + }, + { + "epoch": 0.23, + "grad_norm": 5.291905879974365, + "learning_rate": 1.908831908831909e-05, + "loss": 0.1557, + "step": 560 + }, + { + "epoch": 0.23, + "grad_norm": 5.735557556152344, + "learning_rate": 1.9072039072039074e-05, + "loss": 0.1621, + "step": 570 + }, + { + "epoch": 0.24, + "grad_norm": 5.979973316192627, + "learning_rate": 1.9055759055759057e-05, + "loss": 0.1503, + "step": 580 + }, + { + "epoch": 0.24, + "grad_norm": 0.19126015901565552, + "learning_rate": 1.9039479039479042e-05, + "loss": 0.094, + "step": 590 + }, + { + "epoch": 0.24, + "grad_norm": 1.0556552410125732, + "learning_rate": 1.9023199023199025e-05, + "loss": 0.1876, + "step": 600 + }, + { + "epoch": 0.25, + "grad_norm": 3.954843759536743, + "learning_rate": 1.900691900691901e-05, + "loss": 0.3162, + "step": 610 + }, + { + "epoch": 0.25, + "grad_norm": 0.14296281337738037, + "learning_rate": 1.8990638990638992e-05, + "loss": 0.1288, + "step": 620 + }, + { + "epoch": 0.26, + "grad_norm": 8.772310256958008, + "learning_rate": 1.8974358974358975e-05, + "loss": 0.2622, + "step": 630 + }, + { + "epoch": 0.26, + "grad_norm": 2.3941524028778076, + "learning_rate": 1.895807895807896e-05, + "loss": 0.1524, + "step": 640 + }, + { + "epoch": 0.26, + "grad_norm": 8.104179382324219, + "learning_rate": 1.8941798941798943e-05, + "loss": 0.1109, + "step": 650 + }, + { + "epoch": 0.27, + "grad_norm": 1.5782121419906616, + "learning_rate": 1.8925518925518925e-05, + "loss": 0.0729, + "step": 660 + }, + { + "epoch": 0.27, + "grad_norm": 0.39667731523513794, + "learning_rate": 1.890923890923891e-05, + "loss": 0.1116, + "step": 670 + }, + { + "epoch": 0.28, + "grad_norm": 5.58447265625, + "learning_rate": 1.8892958892958896e-05, + "loss": 0.1312, + "step": 680 + }, + { + "epoch": 0.28, + "grad_norm": 1.3114192485809326, + "learning_rate": 1.887667887667888e-05, + "loss": 0.2522, + "step": 690 + }, + { + "epoch": 0.28, + "grad_norm": 0.997601330280304, + "learning_rate": 1.8860398860398864e-05, + "loss": 0.1605, + "step": 700 + }, + { + "epoch": 0.29, + "grad_norm": 3.605452537536621, + "learning_rate": 1.8844118844118846e-05, + "loss": 0.2443, + "step": 710 + }, + { + "epoch": 0.29, + "grad_norm": 18.868513107299805, + "learning_rate": 1.882783882783883e-05, + "loss": 0.1923, + "step": 720 + }, + { + "epoch": 0.3, + "grad_norm": 3.4352970123291016, + "learning_rate": 1.881155881155881e-05, + "loss": 0.1099, + "step": 730 + }, + { + "epoch": 0.3, + "grad_norm": 2.06532883644104, + "learning_rate": 1.8795278795278796e-05, + "loss": 0.1426, + "step": 740 + }, + { + "epoch": 0.31, + "grad_norm": 5.7250237464904785, + "learning_rate": 1.8778998778998782e-05, + "loss": 0.209, + "step": 750 + }, + { + "epoch": 0.31, + "grad_norm": 0.23493361473083496, + "learning_rate": 1.8762718762718764e-05, + "loss": 0.1986, + "step": 760 + }, + { + "epoch": 0.31, + "grad_norm": 17.769451141357422, + "learning_rate": 1.874643874643875e-05, + "loss": 0.1267, + "step": 770 + }, + { + "epoch": 0.32, + "grad_norm": 0.27574750781059265, + "learning_rate": 1.8730158730158732e-05, + "loss": 0.1484, + "step": 780 + }, + { + "epoch": 0.32, + "grad_norm": 0.30309033393859863, + "learning_rate": 1.8713878713878714e-05, + "loss": 0.1838, + "step": 790 + }, + { + "epoch": 0.33, + "grad_norm": 17.183013916015625, + "learning_rate": 1.86975986975987e-05, + "loss": 0.135, + "step": 800 + }, + { + "epoch": 0.33, + "grad_norm": 0.5572558641433716, + "learning_rate": 1.8681318681318682e-05, + "loss": 0.1456, + "step": 810 + }, + { + "epoch": 0.33, + "grad_norm": 0.4613451659679413, + "learning_rate": 1.8665038665038664e-05, + "loss": 0.1337, + "step": 820 + }, + { + "epoch": 0.34, + "grad_norm": 6.645438194274902, + "learning_rate": 1.864875864875865e-05, + "loss": 0.1446, + "step": 830 + }, + { + "epoch": 0.34, + "grad_norm": 5.389886856079102, + "learning_rate": 1.8632478632478636e-05, + "loss": 0.1253, + "step": 840 + }, + { + "epoch": 0.35, + "grad_norm": 14.86754322052002, + "learning_rate": 1.8616198616198618e-05, + "loss": 0.1346, + "step": 850 + }, + { + "epoch": 0.35, + "grad_norm": 13.419057846069336, + "learning_rate": 1.85999185999186e-05, + "loss": 0.0926, + "step": 860 + }, + { + "epoch": 0.35, + "grad_norm": 13.904304504394531, + "learning_rate": 1.8583638583638586e-05, + "loss": 0.1944, + "step": 870 + }, + { + "epoch": 0.36, + "grad_norm": 0.28235912322998047, + "learning_rate": 1.8567358567358568e-05, + "loss": 0.1261, + "step": 880 + }, + { + "epoch": 0.36, + "grad_norm": 5.711563587188721, + "learning_rate": 1.855107855107855e-05, + "loss": 0.1824, + "step": 890 + }, + { + "epoch": 0.37, + "grad_norm": 17.74437141418457, + "learning_rate": 1.8534798534798536e-05, + "loss": 0.1742, + "step": 900 + }, + { + "epoch": 0.37, + "grad_norm": 3.648202657699585, + "learning_rate": 1.851851851851852e-05, + "loss": 0.2103, + "step": 910 + }, + { + "epoch": 0.37, + "grad_norm": 2.0693366527557373, + "learning_rate": 1.8502238502238504e-05, + "loss": 0.1868, + "step": 920 + }, + { + "epoch": 0.38, + "grad_norm": 2.299172878265381, + "learning_rate": 1.848595848595849e-05, + "loss": 0.168, + "step": 930 + }, + { + "epoch": 0.38, + "grad_norm": 1.4059839248657227, + "learning_rate": 1.8469678469678472e-05, + "loss": 0.1455, + "step": 940 + }, + { + "epoch": 0.39, + "grad_norm": 0.926304817199707, + "learning_rate": 1.8453398453398454e-05, + "loss": 0.2002, + "step": 950 + }, + { + "epoch": 0.39, + "grad_norm": 4.728667736053467, + "learning_rate": 1.8437118437118436e-05, + "loss": 0.1245, + "step": 960 + }, + { + "epoch": 0.39, + "grad_norm": 0.5045005083084106, + "learning_rate": 1.8420838420838422e-05, + "loss": 0.0638, + "step": 970 + }, + { + "epoch": 0.4, + "grad_norm": 8.82520580291748, + "learning_rate": 1.8404558404558404e-05, + "loss": 0.1127, + "step": 980 + }, + { + "epoch": 0.4, + "grad_norm": 5.101595401763916, + "learning_rate": 1.838827838827839e-05, + "loss": 0.2363, + "step": 990 + }, + { + "epoch": 0.41, + "grad_norm": 7.01576566696167, + "learning_rate": 1.8371998371998375e-05, + "loss": 0.1026, + "step": 1000 + }, + { + "epoch": 0.41, + "grad_norm": 0.865003764629364, + "learning_rate": 1.8355718355718358e-05, + "loss": 0.0965, + "step": 1010 + }, + { + "epoch": 0.42, + "grad_norm": 9.897397994995117, + "learning_rate": 1.833943833943834e-05, + "loss": 0.1156, + "step": 1020 + }, + { + "epoch": 0.42, + "grad_norm": 1.5007679462432861, + "learning_rate": 1.8323158323158326e-05, + "loss": 0.1888, + "step": 1030 + }, + { + "epoch": 0.42, + "grad_norm": 4.676563262939453, + "learning_rate": 1.8306878306878308e-05, + "loss": 0.1552, + "step": 1040 + }, + { + "epoch": 0.43, + "grad_norm": 5.3361430168151855, + "learning_rate": 1.829059829059829e-05, + "loss": 0.1447, + "step": 1050 + }, + { + "epoch": 0.43, + "grad_norm": 0.8933970332145691, + "learning_rate": 1.8274318274318276e-05, + "loss": 0.1394, + "step": 1060 + }, + { + "epoch": 0.44, + "grad_norm": 7.401905059814453, + "learning_rate": 1.825803825803826e-05, + "loss": 0.2188, + "step": 1070 + }, + { + "epoch": 0.44, + "grad_norm": 0.4379027783870697, + "learning_rate": 1.8241758241758244e-05, + "loss": 0.1277, + "step": 1080 + }, + { + "epoch": 0.44, + "grad_norm": 2.8909428119659424, + "learning_rate": 1.8225478225478226e-05, + "loss": 0.1726, + "step": 1090 + }, + { + "epoch": 0.45, + "grad_norm": 0.11447061598300934, + "learning_rate": 1.820919820919821e-05, + "loss": 0.1523, + "step": 1100 + }, + { + "epoch": 0.45, + "grad_norm": 0.12276914715766907, + "learning_rate": 1.8192918192918194e-05, + "loss": 0.1823, + "step": 1110 + }, + { + "epoch": 0.46, + "grad_norm": 1.3844455480575562, + "learning_rate": 1.8176638176638176e-05, + "loss": 0.1006, + "step": 1120 + }, + { + "epoch": 0.46, + "grad_norm": 3.1034061908721924, + "learning_rate": 1.816035816035816e-05, + "loss": 0.1387, + "step": 1130 + }, + { + "epoch": 0.46, + "grad_norm": 8.602412223815918, + "learning_rate": 1.8144078144078144e-05, + "loss": 0.2345, + "step": 1140 + }, + { + "epoch": 0.47, + "grad_norm": 0.13919875025749207, + "learning_rate": 1.812779812779813e-05, + "loss": 0.1653, + "step": 1150 + }, + { + "epoch": 0.47, + "grad_norm": 0.34385234117507935, + "learning_rate": 1.8111518111518115e-05, + "loss": 0.2003, + "step": 1160 + }, + { + "epoch": 0.48, + "grad_norm": 4.868250846862793, + "learning_rate": 1.8095238095238097e-05, + "loss": 0.1504, + "step": 1170 + }, + { + "epoch": 0.48, + "grad_norm": 2.267928123474121, + "learning_rate": 1.807895807895808e-05, + "loss": 0.2023, + "step": 1180 + }, + { + "epoch": 0.48, + "grad_norm": 3.634040594100952, + "learning_rate": 1.8062678062678065e-05, + "loss": 0.1682, + "step": 1190 + }, + { + "epoch": 0.49, + "grad_norm": 1.8135625123977661, + "learning_rate": 1.8046398046398047e-05, + "loss": 0.2059, + "step": 1200 + }, + { + "epoch": 0.49, + "grad_norm": 2.294635057449341, + "learning_rate": 1.803011803011803e-05, + "loss": 0.1498, + "step": 1210 + }, + { + "epoch": 0.5, + "grad_norm": 3.8841567039489746, + "learning_rate": 1.8013838013838015e-05, + "loss": 0.0727, + "step": 1220 + }, + { + "epoch": 0.5, + "grad_norm": 0.9216524958610535, + "learning_rate": 1.7997557997558e-05, + "loss": 0.1273, + "step": 1230 + }, + { + "epoch": 0.5, + "grad_norm": 0.08572695404291153, + "learning_rate": 1.7981277981277983e-05, + "loss": 0.1066, + "step": 1240 + }, + { + "epoch": 0.51, + "grad_norm": 5.445361137390137, + "learning_rate": 1.7964997964997966e-05, + "loss": 0.1894, + "step": 1250 + }, + { + "epoch": 0.51, + "grad_norm": 4.239029407501221, + "learning_rate": 1.794871794871795e-05, + "loss": 0.0947, + "step": 1260 + }, + { + "epoch": 0.52, + "grad_norm": 0.7807052135467529, + "learning_rate": 1.7932437932437933e-05, + "loss": 0.1692, + "step": 1270 + }, + { + "epoch": 0.52, + "grad_norm": 0.1252571940422058, + "learning_rate": 1.7916157916157916e-05, + "loss": 0.0901, + "step": 1280 + }, + { + "epoch": 0.53, + "grad_norm": 5.491313457489014, + "learning_rate": 1.78998778998779e-05, + "loss": 0.0849, + "step": 1290 + }, + { + "epoch": 0.53, + "grad_norm": 0.3406262695789337, + "learning_rate": 1.7883597883597884e-05, + "loss": 0.1345, + "step": 1300 + }, + { + "epoch": 0.53, + "grad_norm": 3.4588377475738525, + "learning_rate": 1.786731786731787e-05, + "loss": 0.1501, + "step": 1310 + }, + { + "epoch": 0.54, + "grad_norm": 3.2964069843292236, + "learning_rate": 1.7851037851037855e-05, + "loss": 0.1679, + "step": 1320 + }, + { + "epoch": 0.54, + "grad_norm": 6.95346212387085, + "learning_rate": 1.7834757834757837e-05, + "loss": 0.095, + "step": 1330 + }, + { + "epoch": 0.55, + "grad_norm": 2.9120900630950928, + "learning_rate": 1.781847781847782e-05, + "loss": 0.1089, + "step": 1340 + }, + { + "epoch": 0.55, + "grad_norm": 8.793939590454102, + "learning_rate": 1.78021978021978e-05, + "loss": 0.1338, + "step": 1350 + }, + { + "epoch": 0.55, + "grad_norm": 0.08519359678030014, + "learning_rate": 1.7785917785917787e-05, + "loss": 0.0932, + "step": 1360 + }, + { + "epoch": 0.56, + "grad_norm": 16.41631317138672, + "learning_rate": 1.776963776963777e-05, + "loss": 0.1376, + "step": 1370 + }, + { + "epoch": 0.56, + "grad_norm": 3.0415103435516357, + "learning_rate": 1.7753357753357755e-05, + "loss": 0.1567, + "step": 1380 + }, + { + "epoch": 0.57, + "grad_norm": 0.8246403336524963, + "learning_rate": 1.773707773707774e-05, + "loss": 0.122, + "step": 1390 + }, + { + "epoch": 0.57, + "grad_norm": 2.198512077331543, + "learning_rate": 1.7720797720797723e-05, + "loss": 0.1528, + "step": 1400 + }, + { + "epoch": 0.57, + "grad_norm": 0.5246292352676392, + "learning_rate": 1.7704517704517705e-05, + "loss": 0.1088, + "step": 1410 + }, + { + "epoch": 0.58, + "grad_norm": 12.515607833862305, + "learning_rate": 1.768823768823769e-05, + "loss": 0.1627, + "step": 1420 + }, + { + "epoch": 0.58, + "grad_norm": 13.734766006469727, + "learning_rate": 1.7671957671957673e-05, + "loss": 0.1475, + "step": 1430 + }, + { + "epoch": 0.59, + "grad_norm": 2.593158483505249, + "learning_rate": 1.7655677655677655e-05, + "loss": 0.0968, + "step": 1440 + }, + { + "epoch": 0.59, + "grad_norm": 0.3462279736995697, + "learning_rate": 1.763939763939764e-05, + "loss": 0.0854, + "step": 1450 + }, + { + "epoch": 0.59, + "grad_norm": 1.6409497261047363, + "learning_rate": 1.7623117623117623e-05, + "loss": 0.2295, + "step": 1460 + }, + { + "epoch": 0.6, + "grad_norm": 2.9609594345092773, + "learning_rate": 1.760683760683761e-05, + "loss": 0.1883, + "step": 1470 + }, + { + "epoch": 0.6, + "grad_norm": 0.673570454120636, + "learning_rate": 1.759055759055759e-05, + "loss": 0.1369, + "step": 1480 + }, + { + "epoch": 0.61, + "grad_norm": 0.2929579019546509, + "learning_rate": 1.7574277574277577e-05, + "loss": 0.1189, + "step": 1490 + }, + { + "epoch": 0.61, + "grad_norm": 1.4493731260299683, + "learning_rate": 1.755799755799756e-05, + "loss": 0.1187, + "step": 1500 + }, + { + "epoch": 0.61, + "grad_norm": 0.07135419547557831, + "learning_rate": 1.754171754171754e-05, + "loss": 0.0603, + "step": 1510 + }, + { + "epoch": 0.62, + "grad_norm": 0.10734464973211288, + "learning_rate": 1.7525437525437527e-05, + "loss": 0.0217, + "step": 1520 + }, + { + "epoch": 0.62, + "grad_norm": 0.2217961698770523, + "learning_rate": 1.750915750915751e-05, + "loss": 0.0303, + "step": 1530 + }, + { + "epoch": 0.63, + "grad_norm": 0.14218159019947052, + "learning_rate": 1.7492877492877495e-05, + "loss": 0.0603, + "step": 1540 + }, + { + "epoch": 0.63, + "grad_norm": 0.09605604410171509, + "learning_rate": 1.747659747659748e-05, + "loss": 0.0407, + "step": 1550 + }, + { + "epoch": 0.63, + "grad_norm": 0.07094033062458038, + "learning_rate": 1.7460317460317463e-05, + "loss": 0.0202, + "step": 1560 + }, + { + "epoch": 0.64, + "grad_norm": 4.650410175323486, + "learning_rate": 1.7444037444037445e-05, + "loss": 0.0611, + "step": 1570 + }, + { + "epoch": 0.64, + "grad_norm": 23.229633331298828, + "learning_rate": 1.742775742775743e-05, + "loss": 0.0528, + "step": 1580 + }, + { + "epoch": 0.65, + "grad_norm": 1.466739535331726, + "learning_rate": 1.7411477411477413e-05, + "loss": 0.07, + "step": 1590 + }, + { + "epoch": 0.65, + "grad_norm": 0.05839679762721062, + "learning_rate": 1.7395197395197395e-05, + "loss": 0.0077, + "step": 1600 + }, + { + "epoch": 0.66, + "grad_norm": 1.6192926168441772, + "learning_rate": 1.737891737891738e-05, + "loss": 0.0239, + "step": 1610 + }, + { + "epoch": 0.66, + "grad_norm": 3.8529036045074463, + "learning_rate": 1.7362637362637363e-05, + "loss": 0.0976, + "step": 1620 + }, + { + "epoch": 0.66, + "grad_norm": 0.24398411810398102, + "learning_rate": 1.734635734635735e-05, + "loss": 0.0079, + "step": 1630 + }, + { + "epoch": 0.67, + "grad_norm": 0.04527588561177254, + "learning_rate": 1.733007733007733e-05, + "loss": 0.0065, + "step": 1640 + }, + { + "epoch": 0.67, + "grad_norm": 6.153138160705566, + "learning_rate": 1.7313797313797316e-05, + "loss": 0.0364, + "step": 1650 + }, + { + "epoch": 0.68, + "grad_norm": 0.03938959911465645, + "learning_rate": 1.72975172975173e-05, + "loss": 0.009, + "step": 1660 + }, + { + "epoch": 0.68, + "grad_norm": 0.04055130481719971, + "learning_rate": 1.728123728123728e-05, + "loss": 0.0472, + "step": 1670 + }, + { + "epoch": 0.68, + "grad_norm": 0.07095145434141159, + "learning_rate": 1.7264957264957267e-05, + "loss": 0.0078, + "step": 1680 + }, + { + "epoch": 0.69, + "grad_norm": 2.7965128421783447, + "learning_rate": 1.724867724867725e-05, + "loss": 0.0559, + "step": 1690 + }, + { + "epoch": 0.69, + "grad_norm": 6.2940592765808105, + "learning_rate": 1.7232397232397234e-05, + "loss": 0.0366, + "step": 1700 + }, + { + "epoch": 0.7, + "grad_norm": 0.11980397999286652, + "learning_rate": 1.721611721611722e-05, + "loss": 0.0125, + "step": 1710 + }, + { + "epoch": 0.7, + "grad_norm": 8.26235294342041, + "learning_rate": 1.7199837199837202e-05, + "loss": 0.0137, + "step": 1720 + }, + { + "epoch": 0.7, + "grad_norm": 0.04125256836414337, + "learning_rate": 1.7183557183557185e-05, + "loss": 0.0051, + "step": 1730 + }, + { + "epoch": 0.71, + "grad_norm": 0.03920783847570419, + "learning_rate": 1.7167277167277167e-05, + "loss": 0.0067, + "step": 1740 + }, + { + "epoch": 0.71, + "grad_norm": 0.13922813534736633, + "learning_rate": 1.7150997150997152e-05, + "loss": 0.0374, + "step": 1750 + }, + { + "epoch": 0.72, + "grad_norm": 0.034091122448444366, + "learning_rate": 1.7134717134717135e-05, + "loss": 0.006, + "step": 1760 + }, + { + "epoch": 0.72, + "grad_norm": 10.509510040283203, + "learning_rate": 1.711843711843712e-05, + "loss": 0.0589, + "step": 1770 + }, + { + "epoch": 0.72, + "grad_norm": 0.043251294642686844, + "learning_rate": 1.7102157102157103e-05, + "loss": 0.0226, + "step": 1780 + }, + { + "epoch": 0.73, + "grad_norm": 0.8053480982780457, + "learning_rate": 1.7085877085877088e-05, + "loss": 0.0582, + "step": 1790 + }, + { + "epoch": 0.73, + "grad_norm": 0.04081906005740166, + "learning_rate": 1.706959706959707e-05, + "loss": 0.0391, + "step": 1800 + }, + { + "epoch": 0.74, + "grad_norm": 0.03760745748877525, + "learning_rate": 1.7053317053317056e-05, + "loss": 0.027, + "step": 1810 + }, + { + "epoch": 0.74, + "grad_norm": 0.04111940413713455, + "learning_rate": 1.7037037037037038e-05, + "loss": 0.0368, + "step": 1820 + }, + { + "epoch": 0.74, + "grad_norm": 2.6297411918640137, + "learning_rate": 1.702075702075702e-05, + "loss": 0.0478, + "step": 1830 + }, + { + "epoch": 0.75, + "grad_norm": 0.1751009225845337, + "learning_rate": 1.7004477004477006e-05, + "loss": 0.0298, + "step": 1840 + }, + { + "epoch": 0.75, + "grad_norm": 0.042650580406188965, + "learning_rate": 1.698819698819699e-05, + "loss": 0.0203, + "step": 1850 + }, + { + "epoch": 0.76, + "grad_norm": 0.034141793847084045, + "learning_rate": 1.6971916971916974e-05, + "loss": 0.0044, + "step": 1860 + }, + { + "epoch": 0.76, + "grad_norm": 0.03497103974223137, + "learning_rate": 1.6955636955636956e-05, + "loss": 0.0304, + "step": 1870 + }, + { + "epoch": 0.77, + "grad_norm": 3.8585641384124756, + "learning_rate": 1.6939356939356942e-05, + "loss": 0.0365, + "step": 1880 + }, + { + "epoch": 0.77, + "grad_norm": 0.0322452187538147, + "learning_rate": 1.6923076923076924e-05, + "loss": 0.0596, + "step": 1890 + }, + { + "epoch": 0.77, + "grad_norm": 0.034800559282302856, + "learning_rate": 1.6906796906796906e-05, + "loss": 0.0061, + "step": 1900 + }, + { + "epoch": 0.78, + "grad_norm": 0.0860045924782753, + "learning_rate": 1.6890516890516892e-05, + "loss": 0.0172, + "step": 1910 + }, + { + "epoch": 0.78, + "grad_norm": 0.031149201095104218, + "learning_rate": 1.6874236874236874e-05, + "loss": 0.0238, + "step": 1920 + }, + { + "epoch": 0.79, + "grad_norm": 0.03368987515568733, + "learning_rate": 1.685795685795686e-05, + "loss": 0.0043, + "step": 1930 + }, + { + "epoch": 0.79, + "grad_norm": 0.03161125257611275, + "learning_rate": 1.6841676841676846e-05, + "loss": 0.0146, + "step": 1940 + }, + { + "epoch": 0.79, + "grad_norm": 0.029046092182397842, + "learning_rate": 1.6825396825396828e-05, + "loss": 0.0236, + "step": 1950 + }, + { + "epoch": 0.8, + "grad_norm": 0.9345057606697083, + "learning_rate": 1.680911680911681e-05, + "loss": 0.0042, + "step": 1960 + }, + { + "epoch": 0.8, + "grad_norm": 0.028860267251729965, + "learning_rate": 1.6792836792836796e-05, + "loss": 0.0286, + "step": 1970 + }, + { + "epoch": 0.81, + "grad_norm": 0.02852853201329708, + "learning_rate": 1.6776556776556778e-05, + "loss": 0.023, + "step": 1980 + }, + { + "epoch": 0.81, + "grad_norm": 0.03128168359398842, + "learning_rate": 1.676027676027676e-05, + "loss": 0.0036, + "step": 1990 + }, + { + "epoch": 0.81, + "grad_norm": 0.037479083985090256, + "learning_rate": 1.6743996743996746e-05, + "loss": 0.0591, + "step": 2000 + }, + { + "epoch": 0.82, + "grad_norm": 0.04688659682869911, + "learning_rate": 1.6727716727716728e-05, + "loss": 0.0316, + "step": 2010 + }, + { + "epoch": 0.82, + "grad_norm": 0.03302760049700737, + "learning_rate": 1.6711436711436714e-05, + "loss": 0.0668, + "step": 2020 + }, + { + "epoch": 0.83, + "grad_norm": 0.06181880831718445, + "learning_rate": 1.6695156695156696e-05, + "loss": 0.0281, + "step": 2030 + }, + { + "epoch": 0.83, + "grad_norm": 0.0320013165473938, + "learning_rate": 1.667887667887668e-05, + "loss": 0.0232, + "step": 2040 + }, + { + "epoch": 0.83, + "grad_norm": 0.13600216805934906, + "learning_rate": 1.6662596662596664e-05, + "loss": 0.0442, + "step": 2050 + }, + { + "epoch": 0.84, + "grad_norm": 0.12886099517345428, + "learning_rate": 1.6646316646316646e-05, + "loss": 0.0305, + "step": 2060 + }, + { + "epoch": 0.84, + "grad_norm": 0.0625109001994133, + "learning_rate": 1.6630036630036632e-05, + "loss": 0.0233, + "step": 2070 + }, + { + "epoch": 0.85, + "grad_norm": 13.604376792907715, + "learning_rate": 1.6613756613756614e-05, + "loss": 0.0288, + "step": 2080 + }, + { + "epoch": 0.85, + "grad_norm": 0.029248738661408424, + "learning_rate": 1.65974765974766e-05, + "loss": 0.0039, + "step": 2090 + }, + { + "epoch": 0.85, + "grad_norm": 1.4231517314910889, + "learning_rate": 1.6581196581196585e-05, + "loss": 0.0095, + "step": 2100 + }, + { + "epoch": 0.86, + "grad_norm": 0.02830047346651554, + "learning_rate": 1.6564916564916568e-05, + "loss": 0.007, + "step": 2110 + }, + { + "epoch": 0.86, + "grad_norm": 0.027091912925243378, + "learning_rate": 1.654863654863655e-05, + "loss": 0.0041, + "step": 2120 + }, + { + "epoch": 0.87, + "grad_norm": 0.02793751284480095, + "learning_rate": 1.6532356532356532e-05, + "loss": 0.0087, + "step": 2130 + }, + { + "epoch": 0.87, + "grad_norm": 0.030688917264342308, + "learning_rate": 1.6516076516076518e-05, + "loss": 0.0033, + "step": 2140 + }, + { + "epoch": 0.88, + "grad_norm": 0.02540646307170391, + "learning_rate": 1.64997964997965e-05, + "loss": 0.0254, + "step": 2150 + }, + { + "epoch": 0.88, + "grad_norm": 0.026573829352855682, + "learning_rate": 1.6483516483516486e-05, + "loss": 0.0195, + "step": 2160 + }, + { + "epoch": 0.88, + "grad_norm": 0.025454262271523476, + "learning_rate": 1.6467236467236468e-05, + "loss": 0.0031, + "step": 2170 + }, + { + "epoch": 0.89, + "grad_norm": 0.038121115416288376, + "learning_rate": 1.6450956450956453e-05, + "loss": 0.0035, + "step": 2180 + }, + { + "epoch": 0.89, + "grad_norm": 0.025772370398044586, + "learning_rate": 1.6434676434676436e-05, + "loss": 0.003, + "step": 2190 + }, + { + "epoch": 0.9, + "grad_norm": 3.4986250400543213, + "learning_rate": 1.641839641839642e-05, + "loss": 0.0038, + "step": 2200 + }, + { + "epoch": 0.9, + "grad_norm": 25.038734436035156, + "learning_rate": 1.6402116402116404e-05, + "loss": 0.0119, + "step": 2210 + }, + { + "epoch": 0.9, + "grad_norm": 0.025794176384806633, + "learning_rate": 1.6385836385836386e-05, + "loss": 0.0353, + "step": 2220 + }, + { + "epoch": 0.91, + "grad_norm": 4.056914806365967, + "learning_rate": 1.636955636955637e-05, + "loss": 0.0517, + "step": 2230 + }, + { + "epoch": 0.91, + "grad_norm": 0.19518433511257172, + "learning_rate": 1.6353276353276354e-05, + "loss": 0.0291, + "step": 2240 + }, + { + "epoch": 0.92, + "grad_norm": 0.02424285002052784, + "learning_rate": 1.633699633699634e-05, + "loss": 0.0359, + "step": 2250 + }, + { + "epoch": 0.92, + "grad_norm": 0.03164544701576233, + "learning_rate": 1.632071632071632e-05, + "loss": 0.0382, + "step": 2260 + }, + { + "epoch": 0.92, + "grad_norm": 0.022855272516608238, + "learning_rate": 1.6304436304436307e-05, + "loss": 0.003, + "step": 2270 + }, + { + "epoch": 0.93, + "grad_norm": 0.023591142147779465, + "learning_rate": 1.628815628815629e-05, + "loss": 0.0497, + "step": 2280 + }, + { + "epoch": 0.93, + "grad_norm": 0.02427799627184868, + "learning_rate": 1.627187627187627e-05, + "loss": 0.0381, + "step": 2290 + }, + { + "epoch": 0.94, + "grad_norm": 0.022075733169913292, + "learning_rate": 1.6255596255596257e-05, + "loss": 0.0038, + "step": 2300 + }, + { + "epoch": 0.94, + "grad_norm": 0.25007203221321106, + "learning_rate": 1.623931623931624e-05, + "loss": 0.0364, + "step": 2310 + }, + { + "epoch": 0.94, + "grad_norm": 0.02502160519361496, + "learning_rate": 1.6223036223036225e-05, + "loss": 0.0029, + "step": 2320 + }, + { + "epoch": 0.95, + "grad_norm": 0.036409296095371246, + "learning_rate": 1.6206756206756207e-05, + "loss": 0.0387, + "step": 2330 + }, + { + "epoch": 0.95, + "grad_norm": 0.027146685868501663, + "learning_rate": 1.6190476190476193e-05, + "loss": 0.0045, + "step": 2340 + }, + { + "epoch": 0.96, + "grad_norm": 0.024981442838907242, + "learning_rate": 1.6174196174196175e-05, + "loss": 0.0264, + "step": 2350 + }, + { + "epoch": 0.96, + "grad_norm": 0.027865292504429817, + "learning_rate": 1.615791615791616e-05, + "loss": 0.0029, + "step": 2360 + }, + { + "epoch": 0.96, + "grad_norm": 0.034725822508335114, + "learning_rate": 1.6141636141636143e-05, + "loss": 0.0029, + "step": 2370 + }, + { + "epoch": 0.97, + "grad_norm": 0.022250523790717125, + "learning_rate": 1.6125356125356125e-05, + "loss": 0.0337, + "step": 2380 + }, + { + "epoch": 0.97, + "grad_norm": 0.024188194423913956, + "learning_rate": 1.610907610907611e-05, + "loss": 0.0026, + "step": 2390 + }, + { + "epoch": 0.98, + "grad_norm": 0.02303464338183403, + "learning_rate": 1.6092796092796093e-05, + "loss": 0.0285, + "step": 2400 + }, + { + "epoch": 0.98, + "grad_norm": 0.020316725596785545, + "learning_rate": 1.607651607651608e-05, + "loss": 0.0026, + "step": 2410 + }, + { + "epoch": 0.98, + "grad_norm": 0.023156961426138878, + "learning_rate": 1.606023606023606e-05, + "loss": 0.0031, + "step": 2420 + }, + { + "epoch": 0.99, + "grad_norm": 2.9847331047058105, + "learning_rate": 1.6043956043956047e-05, + "loss": 0.0034, + "step": 2430 + }, + { + "epoch": 0.99, + "grad_norm": 10.845735549926758, + "learning_rate": 1.602767602767603e-05, + "loss": 0.0557, + "step": 2440 + }, + { + "epoch": 1.0, + "grad_norm": 0.02037137933075428, + "learning_rate": 1.601139601139601e-05, + "loss": 0.0333, + "step": 2450 + }, + { + "epoch": 1.0, + "grad_norm": 0.019075889140367508, + "learning_rate": 1.5995115995115997e-05, + "loss": 0.0029, + "step": 2460 + }, + { + "epoch": 1.01, + "grad_norm": 0.02034451812505722, + "learning_rate": 1.597883597883598e-05, + "loss": 0.0035, + "step": 2470 + }, + { + "epoch": 1.01, + "grad_norm": 0.02513672597706318, + "learning_rate": 1.5962555962555965e-05, + "loss": 0.0149, + "step": 2480 + }, + { + "epoch": 1.01, + "grad_norm": 0.0232282355427742, + "learning_rate": 1.5946275946275947e-05, + "loss": 0.0066, + "step": 2490 + }, + { + "epoch": 1.02, + "grad_norm": 0.019541621208190918, + "learning_rate": 1.5929995929995933e-05, + "loss": 0.003, + "step": 2500 + }, + { + "epoch": 1.02, + "grad_norm": 0.027926787734031677, + "learning_rate": 1.5913715913715915e-05, + "loss": 0.0024, + "step": 2510 + }, + { + "epoch": 1.03, + "grad_norm": 0.021236905828118324, + "learning_rate": 1.5897435897435897e-05, + "loss": 0.0023, + "step": 2520 + }, + { + "epoch": 1.03, + "grad_norm": 0.017625728622078896, + "learning_rate": 1.5881155881155883e-05, + "loss": 0.0023, + "step": 2530 + }, + { + "epoch": 1.03, + "grad_norm": 3.0908312797546387, + "learning_rate": 1.5864875864875865e-05, + "loss": 0.0032, + "step": 2540 + }, + { + "epoch": 1.04, + "grad_norm": 0.025432445108890533, + "learning_rate": 1.584859584859585e-05, + "loss": 0.0246, + "step": 2550 + }, + { + "epoch": 1.04, + "grad_norm": 0.0189252570271492, + "learning_rate": 1.5832315832315833e-05, + "loss": 0.0025, + "step": 2560 + }, + { + "epoch": 1.05, + "grad_norm": 0.16396763920783997, + "learning_rate": 1.581603581603582e-05, + "loss": 0.0378, + "step": 2570 + }, + { + "epoch": 1.05, + "grad_norm": 0.019563721492886543, + "learning_rate": 1.57997557997558e-05, + "loss": 0.0281, + "step": 2580 + }, + { + "epoch": 1.05, + "grad_norm": 0.02156243473291397, + "learning_rate": 1.5783475783475787e-05, + "loss": 0.1073, + "step": 2590 + }, + { + "epoch": 1.06, + "grad_norm": 3.184936285018921, + "learning_rate": 1.576719576719577e-05, + "loss": 0.0413, + "step": 2600 + }, + { + "epoch": 1.06, + "grad_norm": 0.0187922902405262, + "learning_rate": 1.575091575091575e-05, + "loss": 0.0423, + "step": 2610 + }, + { + "epoch": 1.07, + "grad_norm": 0.020309004932641983, + "learning_rate": 1.5734635734635737e-05, + "loss": 0.0026, + "step": 2620 + }, + { + "epoch": 1.07, + "grad_norm": 0.028299883008003235, + "learning_rate": 1.571835571835572e-05, + "loss": 0.0026, + "step": 2630 + }, + { + "epoch": 1.07, + "grad_norm": 0.022750265896320343, + "learning_rate": 1.5702075702075705e-05, + "loss": 0.0026, + "step": 2640 + }, + { + "epoch": 1.08, + "grad_norm": 0.017459379509091377, + "learning_rate": 1.5685795685795687e-05, + "loss": 0.0026, + "step": 2650 + }, + { + "epoch": 1.08, + "grad_norm": 0.02400645986199379, + "learning_rate": 1.5669515669515672e-05, + "loss": 0.0022, + "step": 2660 + }, + { + "epoch": 1.09, + "grad_norm": 0.037710972130298615, + "learning_rate": 1.5653235653235655e-05, + "loss": 0.0024, + "step": 2670 + }, + { + "epoch": 1.09, + "grad_norm": 0.01844876818358898, + "learning_rate": 1.5636955636955637e-05, + "loss": 0.0022, + "step": 2680 + }, + { + "epoch": 1.09, + "grad_norm": 0.015886761248111725, + "learning_rate": 1.5620675620675623e-05, + "loss": 0.0021, + "step": 2690 + }, + { + "epoch": 1.1, + "grad_norm": 0.016119027510285378, + "learning_rate": 1.5604395604395605e-05, + "loss": 0.0024, + "step": 2700 + }, + { + "epoch": 1.1, + "grad_norm": 0.01977747306227684, + "learning_rate": 1.558811558811559e-05, + "loss": 0.0405, + "step": 2710 + }, + { + "epoch": 1.11, + "grad_norm": 0.01591884344816208, + "learning_rate": 1.5571835571835573e-05, + "loss": 0.0021, + "step": 2720 + }, + { + "epoch": 1.11, + "grad_norm": 0.017170535400509834, + "learning_rate": 1.555555555555556e-05, + "loss": 0.0102, + "step": 2730 + }, + { + "epoch": 1.12, + "grad_norm": 0.02160962112247944, + "learning_rate": 1.553927553927554e-05, + "loss": 0.0164, + "step": 2740 + }, + { + "epoch": 1.12, + "grad_norm": 0.04177393019199371, + "learning_rate": 1.5522995522995526e-05, + "loss": 0.002, + "step": 2750 + }, + { + "epoch": 1.12, + "grad_norm": 0.01732414774596691, + "learning_rate": 1.550671550671551e-05, + "loss": 0.0022, + "step": 2760 + }, + { + "epoch": 1.13, + "grad_norm": 0.05687391385436058, + "learning_rate": 1.549043549043549e-05, + "loss": 0.002, + "step": 2770 + }, + { + "epoch": 1.13, + "grad_norm": 0.015546981245279312, + "learning_rate": 1.5474155474155473e-05, + "loss": 0.0296, + "step": 2780 + }, + { + "epoch": 1.14, + "grad_norm": 11.891217231750488, + "learning_rate": 1.545787545787546e-05, + "loss": 0.0303, + "step": 2790 + }, + { + "epoch": 1.14, + "grad_norm": 3.074970245361328, + "learning_rate": 1.5441595441595444e-05, + "loss": 0.0346, + "step": 2800 + }, + { + "epoch": 1.14, + "grad_norm": 1.3277289867401123, + "learning_rate": 1.5425315425315426e-05, + "loss": 0.0053, + "step": 2810 + }, + { + "epoch": 1.15, + "grad_norm": 0.014851146377623081, + "learning_rate": 1.5409035409035412e-05, + "loss": 0.0021, + "step": 2820 + }, + { + "epoch": 1.15, + "grad_norm": 0.02586003951728344, + "learning_rate": 1.5392755392755394e-05, + "loss": 0.0194, + "step": 2830 + }, + { + "epoch": 1.16, + "grad_norm": 0.018063299357891083, + "learning_rate": 1.5376475376475377e-05, + "loss": 0.0374, + "step": 2840 + }, + { + "epoch": 1.16, + "grad_norm": 0.014860156923532486, + "learning_rate": 1.5360195360195362e-05, + "loss": 0.0368, + "step": 2850 + }, + { + "epoch": 1.16, + "grad_norm": 0.016715556383132935, + "learning_rate": 1.5343915343915344e-05, + "loss": 0.0232, + "step": 2860 + }, + { + "epoch": 1.17, + "grad_norm": 0.017222585156559944, + "learning_rate": 1.532763532763533e-05, + "loss": 0.0021, + "step": 2870 + }, + { + "epoch": 1.17, + "grad_norm": 0.015297485515475273, + "learning_rate": 1.5311355311355312e-05, + "loss": 0.002, + "step": 2880 + }, + { + "epoch": 1.18, + "grad_norm": 0.01927722617983818, + "learning_rate": 1.5295075295075298e-05, + "loss": 0.0344, + "step": 2890 + }, + { + "epoch": 1.18, + "grad_norm": 0.014726200141012669, + "learning_rate": 1.527879527879528e-05, + "loss": 0.0105, + "step": 2900 + }, + { + "epoch": 1.18, + "grad_norm": 0.015239718370139599, + "learning_rate": 1.5262515262515263e-05, + "loss": 0.0019, + "step": 2910 + }, + { + "epoch": 1.19, + "grad_norm": 0.014116072095930576, + "learning_rate": 1.5246235246235248e-05, + "loss": 0.0482, + "step": 2920 + }, + { + "epoch": 1.19, + "grad_norm": 0.014437291771173477, + "learning_rate": 1.522995522995523e-05, + "loss": 0.0028, + "step": 2930 + }, + { + "epoch": 1.2, + "grad_norm": 0.017663761973381042, + "learning_rate": 1.5213675213675214e-05, + "loss": 0.007, + "step": 2940 + }, + { + "epoch": 1.2, + "grad_norm": 0.024807853624224663, + "learning_rate": 1.51973951973952e-05, + "loss": 0.0044, + "step": 2950 + }, + { + "epoch": 1.2, + "grad_norm": 0.01389392837882042, + "learning_rate": 1.5181115181115182e-05, + "loss": 0.021, + "step": 2960 + }, + { + "epoch": 1.21, + "grad_norm": 0.014578912407159805, + "learning_rate": 1.5164835164835166e-05, + "loss": 0.002, + "step": 2970 + }, + { + "epoch": 1.21, + "grad_norm": 0.013830927200615406, + "learning_rate": 1.514855514855515e-05, + "loss": 0.0017, + "step": 2980 + }, + { + "epoch": 1.22, + "grad_norm": 0.012908479198813438, + "learning_rate": 1.5132275132275134e-05, + "loss": 0.0047, + "step": 2990 + }, + { + "epoch": 1.22, + "grad_norm": 0.013685975223779678, + "learning_rate": 1.5115995115995116e-05, + "loss": 0.0062, + "step": 3000 + }, + { + "epoch": 1.23, + "grad_norm": 0.015914512798190117, + "learning_rate": 1.50997150997151e-05, + "loss": 0.0415, + "step": 3010 + }, + { + "epoch": 1.23, + "grad_norm": 0.09328664839267731, + "learning_rate": 1.5083435083435086e-05, + "loss": 0.0017, + "step": 3020 + }, + { + "epoch": 1.23, + "grad_norm": 0.013503558933734894, + "learning_rate": 1.5067155067155068e-05, + "loss": 0.0292, + "step": 3030 + }, + { + "epoch": 1.24, + "grad_norm": 0.012664329260587692, + "learning_rate": 1.505087505087505e-05, + "loss": 0.0108, + "step": 3040 + }, + { + "epoch": 1.24, + "grad_norm": 0.013521691784262657, + "learning_rate": 1.5034595034595036e-05, + "loss": 0.0016, + "step": 3050 + }, + { + "epoch": 1.25, + "grad_norm": 0.017031285911798477, + "learning_rate": 1.501831501831502e-05, + "loss": 0.0056, + "step": 3060 + }, + { + "epoch": 1.25, + "grad_norm": 0.0123978890478611, + "learning_rate": 1.5002035002035002e-05, + "loss": 0.0454, + "step": 3070 + }, + { + "epoch": 1.25, + "grad_norm": 0.01293584518134594, + "learning_rate": 1.4985754985754988e-05, + "loss": 0.004, + "step": 3080 + }, + { + "epoch": 1.26, + "grad_norm": 0.013730690814554691, + "learning_rate": 1.496947496947497e-05, + "loss": 0.0355, + "step": 3090 + }, + { + "epoch": 1.26, + "grad_norm": 0.01241120882332325, + "learning_rate": 1.4953194953194954e-05, + "loss": 0.0017, + "step": 3100 + }, + { + "epoch": 1.27, + "grad_norm": 0.016001150012016296, + "learning_rate": 1.493691493691494e-05, + "loss": 0.0017, + "step": 3110 + }, + { + "epoch": 1.27, + "grad_norm": 0.019151071086525917, + "learning_rate": 1.4920634920634922e-05, + "loss": 0.0335, + "step": 3120 + }, + { + "epoch": 1.27, + "grad_norm": 0.014675545506179333, + "learning_rate": 1.4904354904354906e-05, + "loss": 0.0203, + "step": 3130 + }, + { + "epoch": 1.28, + "grad_norm": 0.5518173575401306, + "learning_rate": 1.4888074888074888e-05, + "loss": 0.0025, + "step": 3140 + }, + { + "epoch": 1.28, + "grad_norm": 0.012442667037248611, + "learning_rate": 1.4871794871794874e-05, + "loss": 0.0021, + "step": 3150 + }, + { + "epoch": 1.29, + "grad_norm": 0.013752995058894157, + "learning_rate": 1.4855514855514856e-05, + "loss": 0.0018, + "step": 3160 + }, + { + "epoch": 1.29, + "grad_norm": 0.011561810038983822, + "learning_rate": 1.483923483923484e-05, + "loss": 0.0016, + "step": 3170 + }, + { + "epoch": 1.29, + "grad_norm": 0.011732109822332859, + "learning_rate": 1.4822954822954826e-05, + "loss": 0.0015, + "step": 3180 + }, + { + "epoch": 1.3, + "grad_norm": 0.011794438585639, + "learning_rate": 1.4806674806674808e-05, + "loss": 0.0014, + "step": 3190 + }, + { + "epoch": 1.3, + "grad_norm": 0.011947757564485073, + "learning_rate": 1.479039479039479e-05, + "loss": 0.0026, + "step": 3200 + }, + { + "epoch": 1.31, + "grad_norm": 0.017924221232533455, + "learning_rate": 1.4774114774114776e-05, + "loss": 0.0015, + "step": 3210 + }, + { + "epoch": 1.31, + "grad_norm": 0.011501024477183819, + "learning_rate": 1.475783475783476e-05, + "loss": 0.0021, + "step": 3220 + }, + { + "epoch": 1.31, + "grad_norm": 0.05062294751405716, + "learning_rate": 1.4741554741554742e-05, + "loss": 0.0015, + "step": 3230 + }, + { + "epoch": 1.32, + "grad_norm": 0.011451934464275837, + "learning_rate": 1.4725274725274727e-05, + "loss": 0.0015, + "step": 3240 + }, + { + "epoch": 1.32, + "grad_norm": 0.011398130096495152, + "learning_rate": 1.470899470899471e-05, + "loss": 0.0262, + "step": 3250 + }, + { + "epoch": 1.33, + "grad_norm": 0.011111021041870117, + "learning_rate": 1.4692714692714694e-05, + "loss": 0.0015, + "step": 3260 + }, + { + "epoch": 1.33, + "grad_norm": 0.011720293201506138, + "learning_rate": 1.4676434676434676e-05, + "loss": 0.0014, + "step": 3270 + }, + { + "epoch": 1.33, + "grad_norm": 0.01106089074164629, + "learning_rate": 1.4660154660154662e-05, + "loss": 0.0248, + "step": 3280 + }, + { + "epoch": 1.34, + "grad_norm": 0.031572628766298294, + "learning_rate": 1.4643874643874645e-05, + "loss": 0.0015, + "step": 3290 + }, + { + "epoch": 1.34, + "grad_norm": 0.010560325346887112, + "learning_rate": 1.4627594627594628e-05, + "loss": 0.0014, + "step": 3300 + }, + { + "epoch": 1.35, + "grad_norm": 31.388111114501953, + "learning_rate": 1.4611314611314613e-05, + "loss": 0.0255, + "step": 3310 + }, + { + "epoch": 1.35, + "grad_norm": 0.016965394839644432, + "learning_rate": 1.4595034595034596e-05, + "loss": 0.0014, + "step": 3320 + }, + { + "epoch": 1.36, + "grad_norm": 0.022373100742697716, + "learning_rate": 1.457875457875458e-05, + "loss": 0.0013, + "step": 3330 + }, + { + "epoch": 1.36, + "grad_norm": 0.011025676503777504, + "learning_rate": 1.4562474562474565e-05, + "loss": 0.0374, + "step": 3340 + }, + { + "epoch": 1.36, + "grad_norm": 0.016683539375662804, + "learning_rate": 1.4546194546194547e-05, + "loss": 0.0389, + "step": 3350 + }, + { + "epoch": 1.37, + "grad_norm": 0.012086950242519379, + "learning_rate": 1.4529914529914531e-05, + "loss": 0.0304, + "step": 3360 + }, + { + "epoch": 1.37, + "grad_norm": 0.011172090657055378, + "learning_rate": 1.4513634513634515e-05, + "loss": 0.0178, + "step": 3370 + }, + { + "epoch": 1.38, + "grad_norm": 0.013024254702031612, + "learning_rate": 1.44973544973545e-05, + "loss": 0.0014, + "step": 3380 + }, + { + "epoch": 1.38, + "grad_norm": 0.010836287401616573, + "learning_rate": 1.4481074481074482e-05, + "loss": 0.0014, + "step": 3390 + }, + { + "epoch": 1.38, + "grad_norm": 0.014210844412446022, + "learning_rate": 1.4464794464794465e-05, + "loss": 0.0014, + "step": 3400 + }, + { + "epoch": 1.39, + "grad_norm": 0.010528087615966797, + "learning_rate": 1.444851444851445e-05, + "loss": 0.0044, + "step": 3410 + }, + { + "epoch": 1.39, + "grad_norm": 0.01593305543065071, + "learning_rate": 1.4432234432234433e-05, + "loss": 0.0455, + "step": 3420 + }, + { + "epoch": 1.4, + "grad_norm": 0.015049874782562256, + "learning_rate": 1.4415954415954416e-05, + "loss": 0.0027, + "step": 3430 + }, + { + "epoch": 1.4, + "grad_norm": 0.011662309989333153, + "learning_rate": 1.4399674399674401e-05, + "loss": 0.0013, + "step": 3440 + }, + { + "epoch": 1.4, + "grad_norm": 0.011207195930182934, + "learning_rate": 1.4383394383394385e-05, + "loss": 0.0018, + "step": 3450 + }, + { + "epoch": 1.41, + "grad_norm": 3.6042699813842773, + "learning_rate": 1.4367114367114367e-05, + "loss": 0.0029, + "step": 3460 + }, + { + "epoch": 1.41, + "grad_norm": 0.09215729683637619, + "learning_rate": 1.4350834350834353e-05, + "loss": 0.002, + "step": 3470 + }, + { + "epoch": 1.42, + "grad_norm": 0.010877463966608047, + "learning_rate": 1.4334554334554335e-05, + "loss": 0.0014, + "step": 3480 + }, + { + "epoch": 1.42, + "grad_norm": 0.009993131272494793, + "learning_rate": 1.431827431827432e-05, + "loss": 0.0016, + "step": 3490 + }, + { + "epoch": 1.42, + "grad_norm": 1.349046230316162, + "learning_rate": 1.4301994301994305e-05, + "loss": 0.0018, + "step": 3500 + }, + { + "epoch": 1.43, + "grad_norm": 0.009341539815068245, + "learning_rate": 1.4285714285714287e-05, + "loss": 0.0012, + "step": 3510 + }, + { + "epoch": 1.43, + "grad_norm": 0.009393510408699512, + "learning_rate": 1.4269434269434271e-05, + "loss": 0.0011, + "step": 3520 + }, + { + "epoch": 1.44, + "grad_norm": 0.009326926432549953, + "learning_rate": 1.4253154253154253e-05, + "loss": 0.0012, + "step": 3530 + }, + { + "epoch": 1.44, + "grad_norm": 0.009275635704398155, + "learning_rate": 1.4236874236874239e-05, + "loss": 0.0384, + "step": 3540 + }, + { + "epoch": 1.44, + "grad_norm": 22.40707778930664, + "learning_rate": 1.4220594220594221e-05, + "loss": 0.0131, + "step": 3550 + }, + { + "epoch": 1.45, + "grad_norm": 0.00953533872961998, + "learning_rate": 1.4204314204314205e-05, + "loss": 0.0347, + "step": 3560 + }, + { + "epoch": 1.45, + "grad_norm": 0.5032986998558044, + "learning_rate": 1.4188034188034189e-05, + "loss": 0.0402, + "step": 3570 + }, + { + "epoch": 1.46, + "grad_norm": 0.011732584796845913, + "learning_rate": 1.4171754171754173e-05, + "loss": 0.0592, + "step": 3580 + }, + { + "epoch": 1.46, + "grad_norm": 0.010645696893334389, + "learning_rate": 1.4155474155474155e-05, + "loss": 0.0268, + "step": 3590 + }, + { + "epoch": 1.47, + "grad_norm": 0.013740918599069118, + "learning_rate": 1.4139194139194141e-05, + "loss": 0.0252, + "step": 3600 + }, + { + "epoch": 1.47, + "grad_norm": 0.013372181914746761, + "learning_rate": 1.4122914122914125e-05, + "loss": 0.0376, + "step": 3610 + }, + { + "epoch": 1.47, + "grad_norm": 0.015505131334066391, + "learning_rate": 1.4106634106634107e-05, + "loss": 0.0014, + "step": 3620 + }, + { + "epoch": 1.48, + "grad_norm": 0.014338747598230839, + "learning_rate": 1.4090354090354093e-05, + "loss": 0.0853, + "step": 3630 + }, + { + "epoch": 1.48, + "grad_norm": 0.01571911759674549, + "learning_rate": 1.4074074074074075e-05, + "loss": 0.0298, + "step": 3640 + }, + { + "epoch": 1.49, + "grad_norm": 0.020005526021122932, + "learning_rate": 1.4057794057794059e-05, + "loss": 0.0017, + "step": 3650 + }, + { + "epoch": 1.49, + "grad_norm": 0.018354693427681923, + "learning_rate": 1.4041514041514041e-05, + "loss": 0.0016, + "step": 3660 + }, + { + "epoch": 1.49, + "grad_norm": 0.021922029554843903, + "learning_rate": 1.4025234025234027e-05, + "loss": 0.0017, + "step": 3670 + }, + { + "epoch": 1.5, + "grad_norm": 0.013702883385121822, + "learning_rate": 1.400895400895401e-05, + "loss": 0.0014, + "step": 3680 + }, + { + "epoch": 1.5, + "grad_norm": 0.010742840357124805, + "learning_rate": 1.3992673992673993e-05, + "loss": 0.0026, + "step": 3690 + }, + { + "epoch": 1.51, + "grad_norm": 0.15446045994758606, + "learning_rate": 1.3976393976393979e-05, + "loss": 0.0013, + "step": 3700 + }, + { + "epoch": 1.51, + "grad_norm": 0.01300391647964716, + "learning_rate": 1.3960113960113961e-05, + "loss": 0.0012, + "step": 3710 + }, + { + "epoch": 1.51, + "grad_norm": 0.017101220786571503, + "learning_rate": 1.3943833943833945e-05, + "loss": 0.0012, + "step": 3720 + }, + { + "epoch": 1.52, + "grad_norm": 0.009062445722520351, + "learning_rate": 1.3927553927553929e-05, + "loss": 0.0012, + "step": 3730 + }, + { + "epoch": 1.52, + "grad_norm": 0.008803702890872955, + "learning_rate": 1.3911273911273913e-05, + "loss": 0.0011, + "step": 3740 + }, + { + "epoch": 1.53, + "grad_norm": 0.008593735285103321, + "learning_rate": 1.3894993894993895e-05, + "loss": 0.0012, + "step": 3750 + }, + { + "epoch": 1.53, + "grad_norm": 0.009692203253507614, + "learning_rate": 1.387871387871388e-05, + "loss": 0.0011, + "step": 3760 + }, + { + "epoch": 1.53, + "grad_norm": 0.011008762754499912, + "learning_rate": 1.3862433862433865e-05, + "loss": 0.0011, + "step": 3770 + }, + { + "epoch": 1.54, + "grad_norm": 0.009994535706937313, + "learning_rate": 1.3846153846153847e-05, + "loss": 0.022, + "step": 3780 + }, + { + "epoch": 1.54, + "grad_norm": 0.009117243811488152, + "learning_rate": 1.382987382987383e-05, + "loss": 0.0011, + "step": 3790 + }, + { + "epoch": 1.55, + "grad_norm": 0.008967447094619274, + "learning_rate": 1.3813593813593815e-05, + "loss": 0.0057, + "step": 3800 + }, + { + "epoch": 1.55, + "grad_norm": 0.008691845461726189, + "learning_rate": 1.3797313797313799e-05, + "loss": 0.0013, + "step": 3810 + }, + { + "epoch": 1.55, + "grad_norm": 0.011074850335717201, + "learning_rate": 1.378103378103378e-05, + "loss": 0.001, + "step": 3820 + }, + { + "epoch": 1.56, + "grad_norm": 0.00832684338092804, + "learning_rate": 1.3764753764753766e-05, + "loss": 0.0011, + "step": 3830 + }, + { + "epoch": 1.56, + "grad_norm": 0.008292116224765778, + "learning_rate": 1.374847374847375e-05, + "loss": 0.001, + "step": 3840 + }, + { + "epoch": 1.57, + "grad_norm": 0.009205167181789875, + "learning_rate": 1.3732193732193733e-05, + "loss": 0.0011, + "step": 3850 + }, + { + "epoch": 1.57, + "grad_norm": 0.008790573105216026, + "learning_rate": 1.3715913715913718e-05, + "loss": 0.001, + "step": 3860 + }, + { + "epoch": 1.58, + "grad_norm": 0.008000485599040985, + "learning_rate": 1.36996336996337e-05, + "loss": 0.008, + "step": 3870 + }, + { + "epoch": 1.58, + "grad_norm": 0.00819096527993679, + "learning_rate": 1.3683353683353684e-05, + "loss": 0.001, + "step": 3880 + }, + { + "epoch": 1.58, + "grad_norm": 0.014848892576992512, + "learning_rate": 1.3667073667073668e-05, + "loss": 0.015, + "step": 3890 + }, + { + "epoch": 1.59, + "grad_norm": 0.008053899742662907, + "learning_rate": 1.3650793650793652e-05, + "loss": 0.0009, + "step": 3900 + }, + { + "epoch": 1.59, + "grad_norm": 6.416678428649902, + "learning_rate": 1.3634513634513635e-05, + "loss": 0.0344, + "step": 3910 + }, + { + "epoch": 1.6, + "grad_norm": 0.10300695151090622, + "learning_rate": 1.3618233618233619e-05, + "loss": 0.001, + "step": 3920 + }, + { + "epoch": 1.6, + "grad_norm": 0.008424129337072372, + "learning_rate": 1.3601953601953604e-05, + "loss": 0.0267, + "step": 3930 + }, + { + "epoch": 1.6, + "grad_norm": 0.00800679437816143, + "learning_rate": 1.3585673585673586e-05, + "loss": 0.0326, + "step": 3940 + }, + { + "epoch": 1.61, + "grad_norm": 0.009919759817421436, + "learning_rate": 1.356939356939357e-05, + "loss": 0.0011, + "step": 3950 + }, + { + "epoch": 1.61, + "grad_norm": 0.02416282147169113, + "learning_rate": 1.3553113553113554e-05, + "loss": 0.0012, + "step": 3960 + }, + { + "epoch": 1.62, + "grad_norm": 5.555994033813477, + "learning_rate": 1.3536833536833538e-05, + "loss": 0.043, + "step": 3970 + }, + { + "epoch": 1.62, + "grad_norm": 0.10745339095592499, + "learning_rate": 1.352055352055352e-05, + "loss": 0.0011, + "step": 3980 + }, + { + "epoch": 1.62, + "grad_norm": 0.00835937075316906, + "learning_rate": 1.3504273504273506e-05, + "loss": 0.0009, + "step": 3990 + }, + { + "epoch": 1.63, + "grad_norm": 0.007618330419063568, + "learning_rate": 1.348799348799349e-05, + "loss": 0.0241, + "step": 4000 + }, + { + "epoch": 1.63, + "grad_norm": 0.022973209619522095, + "learning_rate": 1.3471713471713472e-05, + "loss": 0.001, + "step": 4010 + }, + { + "epoch": 1.64, + "grad_norm": 0.008424985222518444, + "learning_rate": 1.3455433455433458e-05, + "loss": 0.0018, + "step": 4020 + }, + { + "epoch": 1.64, + "grad_norm": 0.015286185778677464, + "learning_rate": 1.343915343915344e-05, + "loss": 0.0009, + "step": 4030 + }, + { + "epoch": 1.64, + "grad_norm": 0.007264839485287666, + "learning_rate": 1.3422873422873424e-05, + "loss": 0.0009, + "step": 4040 + }, + { + "epoch": 1.65, + "grad_norm": 0.0074860285967588425, + "learning_rate": 1.3406593406593406e-05, + "loss": 0.0009, + "step": 4050 + }, + { + "epoch": 1.65, + "grad_norm": 0.008237460628151894, + "learning_rate": 1.3390313390313392e-05, + "loss": 0.0373, + "step": 4060 + }, + { + "epoch": 1.66, + "grad_norm": 0.007270953617990017, + "learning_rate": 1.3374033374033374e-05, + "loss": 0.0009, + "step": 4070 + }, + { + "epoch": 1.66, + "grad_norm": 0.03919156640768051, + "learning_rate": 1.3357753357753358e-05, + "loss": 0.001, + "step": 4080 + }, + { + "epoch": 1.66, + "grad_norm": 0.11515277624130249, + "learning_rate": 1.3341473341473344e-05, + "loss": 0.001, + "step": 4090 + }, + { + "epoch": 1.67, + "grad_norm": 0.007153298240154982, + "learning_rate": 1.3325193325193326e-05, + "loss": 0.0014, + "step": 4100 + }, + { + "epoch": 1.67, + "grad_norm": 0.00894332304596901, + "learning_rate": 1.330891330891331e-05, + "loss": 0.0022, + "step": 4110 + }, + { + "epoch": 1.68, + "grad_norm": 0.046884216368198395, + "learning_rate": 1.3292633292633294e-05, + "loss": 0.001, + "step": 4120 + }, + { + "epoch": 1.68, + "grad_norm": 0.0074531338177621365, + "learning_rate": 1.3276353276353278e-05, + "loss": 0.0009, + "step": 4130 + }, + { + "epoch": 1.68, + "grad_norm": 0.008025778457522392, + "learning_rate": 1.326007326007326e-05, + "loss": 0.0008, + "step": 4140 + }, + { + "epoch": 1.69, + "grad_norm": 0.007099485024809837, + "learning_rate": 1.3243793243793246e-05, + "loss": 0.0349, + "step": 4150 + }, + { + "epoch": 1.69, + "grad_norm": 0.007894063368439674, + "learning_rate": 1.322751322751323e-05, + "loss": 0.0008, + "step": 4160 + }, + { + "epoch": 1.7, + "grad_norm": 0.008376212790608406, + "learning_rate": 1.3211233211233212e-05, + "loss": 0.0009, + "step": 4170 + }, + { + "epoch": 1.7, + "grad_norm": 0.007172748912125826, + "learning_rate": 1.3194953194953194e-05, + "loss": 0.0011, + "step": 4180 + }, + { + "epoch": 1.71, + "grad_norm": 0.007325605023652315, + "learning_rate": 1.317867317867318e-05, + "loss": 0.0008, + "step": 4190 + }, + { + "epoch": 1.71, + "grad_norm": 0.007277225609868765, + "learning_rate": 1.3162393162393164e-05, + "loss": 0.0009, + "step": 4200 + }, + { + "epoch": 1.71, + "grad_norm": 0.007008700165897608, + "learning_rate": 1.3146113146113146e-05, + "loss": 0.0009, + "step": 4210 + }, + { + "epoch": 1.72, + "grad_norm": 0.007119116373360157, + "learning_rate": 1.3129833129833132e-05, + "loss": 0.0088, + "step": 4220 + }, + { + "epoch": 1.72, + "grad_norm": 0.006735885515809059, + "learning_rate": 1.3113553113553114e-05, + "loss": 0.0011, + "step": 4230 + }, + { + "epoch": 1.73, + "grad_norm": 0.006696558557450771, + "learning_rate": 1.3097273097273098e-05, + "loss": 0.0057, + "step": 4240 + }, + { + "epoch": 1.73, + "grad_norm": 0.01188244204968214, + "learning_rate": 1.3080993080993084e-05, + "loss": 0.0011, + "step": 4250 + }, + { + "epoch": 1.73, + "grad_norm": 0.007251105271279812, + "learning_rate": 1.3064713064713066e-05, + "loss": 0.0357, + "step": 4260 + }, + { + "epoch": 1.74, + "grad_norm": 0.006903903558850288, + "learning_rate": 1.304843304843305e-05, + "loss": 0.0008, + "step": 4270 + }, + { + "epoch": 1.74, + "grad_norm": 0.008923369459807873, + "learning_rate": 1.3032153032153034e-05, + "loss": 0.0008, + "step": 4280 + }, + { + "epoch": 1.75, + "grad_norm": 0.006224838085472584, + "learning_rate": 1.3015873015873018e-05, + "loss": 0.0077, + "step": 4290 + }, + { + "epoch": 1.75, + "grad_norm": 0.00695427879691124, + "learning_rate": 1.2999592999593e-05, + "loss": 0.0008, + "step": 4300 + }, + { + "epoch": 1.75, + "grad_norm": 0.007040718570351601, + "learning_rate": 1.2983312983312984e-05, + "loss": 0.0008, + "step": 4310 + }, + { + "epoch": 1.76, + "grad_norm": 0.006210348103195429, + "learning_rate": 1.296703296703297e-05, + "loss": 0.0015, + "step": 4320 + }, + { + "epoch": 1.76, + "grad_norm": 0.0062638637609779835, + "learning_rate": 1.2950752950752952e-05, + "loss": 0.0044, + "step": 4330 + }, + { + "epoch": 1.77, + "grad_norm": 0.006666597910225391, + "learning_rate": 1.2934472934472934e-05, + "loss": 0.0007, + "step": 4340 + }, + { + "epoch": 1.77, + "grad_norm": 0.0061942501924932, + "learning_rate": 1.291819291819292e-05, + "loss": 0.0011, + "step": 4350 + }, + { + "epoch": 1.77, + "grad_norm": 0.00600019795820117, + "learning_rate": 1.2901912901912904e-05, + "loss": 0.0008, + "step": 4360 + }, + { + "epoch": 1.78, + "grad_norm": 0.006045353598892689, + "learning_rate": 1.2885632885632886e-05, + "loss": 0.0451, + "step": 4370 + }, + { + "epoch": 1.78, + "grad_norm": 0.006641109474003315, + "learning_rate": 1.2869352869352871e-05, + "loss": 0.0008, + "step": 4380 + }, + { + "epoch": 1.79, + "grad_norm": 0.4562086760997772, + "learning_rate": 1.2853072853072854e-05, + "loss": 0.0009, + "step": 4390 + }, + { + "epoch": 1.79, + "grad_norm": 0.0076696197502315044, + "learning_rate": 1.2836792836792838e-05, + "loss": 0.0348, + "step": 4400 + }, + { + "epoch": 1.79, + "grad_norm": 0.006937106605619192, + "learning_rate": 1.2820512820512823e-05, + "loss": 0.0596, + "step": 4410 + }, + { + "epoch": 1.8, + "grad_norm": 0.00782240740954876, + "learning_rate": 1.2804232804232805e-05, + "loss": 0.0851, + "step": 4420 + }, + { + "epoch": 1.8, + "grad_norm": 0.007307849358767271, + "learning_rate": 1.278795278795279e-05, + "loss": 0.0009, + "step": 4430 + }, + { + "epoch": 1.81, + "grad_norm": 0.008858690969645977, + "learning_rate": 1.2771672771672772e-05, + "loss": 0.0021, + "step": 4440 + }, + { + "epoch": 1.81, + "grad_norm": 0.006560084410011768, + "learning_rate": 1.2755392755392757e-05, + "loss": 0.0008, + "step": 4450 + }, + { + "epoch": 1.82, + "grad_norm": 0.06266916543245316, + "learning_rate": 1.273911273911274e-05, + "loss": 0.0011, + "step": 4460 + }, + { + "epoch": 1.82, + "grad_norm": 0.00679628923535347, + "learning_rate": 1.2722832722832723e-05, + "loss": 0.0009, + "step": 4470 + }, + { + "epoch": 1.82, + "grad_norm": 0.006765253376215696, + "learning_rate": 1.2706552706552709e-05, + "loss": 0.0013, + "step": 4480 + }, + { + "epoch": 1.83, + "grad_norm": 0.005858385004103184, + "learning_rate": 1.2690272690272691e-05, + "loss": 0.0007, + "step": 4490 + }, + { + "epoch": 1.83, + "grad_norm": 0.006266339216381311, + "learning_rate": 1.2673992673992674e-05, + "loss": 0.0008, + "step": 4500 + }, + { + "epoch": 1.84, + "grad_norm": 0.006281218025833368, + "learning_rate": 1.265771265771266e-05, + "loss": 0.1082, + "step": 4510 + }, + { + "epoch": 1.84, + "grad_norm": 0.006863302085548639, + "learning_rate": 1.2641432641432643e-05, + "loss": 0.0009, + "step": 4520 + }, + { + "epoch": 1.84, + "grad_norm": 0.013896014541387558, + "learning_rate": 1.2625152625152625e-05, + "loss": 0.0281, + "step": 4530 + }, + { + "epoch": 1.85, + "grad_norm": 0.24578307569026947, + "learning_rate": 1.2608872608872611e-05, + "loss": 0.001, + "step": 4540 + }, + { + "epoch": 1.85, + "grad_norm": 0.011449114419519901, + "learning_rate": 1.2592592592592593e-05, + "loss": 0.0007, + "step": 4550 + }, + { + "epoch": 1.86, + "grad_norm": 36.35368728637695, + "learning_rate": 1.2576312576312577e-05, + "loss": 0.0217, + "step": 4560 + }, + { + "epoch": 1.86, + "grad_norm": 0.011718428693711758, + "learning_rate": 1.256003256003256e-05, + "loss": 0.0008, + "step": 4570 + }, + { + "epoch": 1.86, + "grad_norm": 10.411919593811035, + "learning_rate": 1.2543752543752545e-05, + "loss": 0.0159, + "step": 4580 + }, + { + "epoch": 1.87, + "grad_norm": 0.006179590709507465, + "learning_rate": 1.2527472527472529e-05, + "loss": 0.0307, + "step": 4590 + }, + { + "epoch": 1.87, + "grad_norm": 0.0063836839981377125, + "learning_rate": 1.2511192511192511e-05, + "loss": 0.0034, + "step": 4600 + }, + { + "epoch": 1.88, + "grad_norm": 0.008047536946833134, + "learning_rate": 1.2494912494912497e-05, + "loss": 0.001, + "step": 4610 + }, + { + "epoch": 1.88, + "grad_norm": 0.010491227731108665, + "learning_rate": 1.247863247863248e-05, + "loss": 0.0008, + "step": 4620 + }, + { + "epoch": 1.88, + "grad_norm": 0.005860119592398405, + "learning_rate": 1.2462352462352463e-05, + "loss": 0.0007, + "step": 4630 + }, + { + "epoch": 1.89, + "grad_norm": 10.03593635559082, + "learning_rate": 1.2446072446072449e-05, + "loss": 0.0314, + "step": 4640 + }, + { + "epoch": 1.89, + "grad_norm": 0.006240949500352144, + "learning_rate": 1.2429792429792431e-05, + "loss": 0.0009, + "step": 4650 + }, + { + "epoch": 1.9, + "grad_norm": 0.00653426069766283, + "learning_rate": 1.2413512413512413e-05, + "loss": 0.0008, + "step": 4660 + }, + { + "epoch": 1.9, + "grad_norm": 0.0061131748370826244, + "learning_rate": 1.2397232397232399e-05, + "loss": 0.0385, + "step": 4670 + }, + { + "epoch": 1.9, + "grad_norm": 0.018757157027721405, + "learning_rate": 1.2380952380952383e-05, + "loss": 0.0008, + "step": 4680 + }, + { + "epoch": 1.91, + "grad_norm": 0.005603988189250231, + "learning_rate": 1.2364672364672365e-05, + "loss": 0.0007, + "step": 4690 + }, + { + "epoch": 1.91, + "grad_norm": 0.008327238261699677, + "learning_rate": 1.2348392348392349e-05, + "loss": 0.0007, + "step": 4700 + }, + { + "epoch": 1.92, + "grad_norm": 0.006342690903693438, + "learning_rate": 1.2332112332112333e-05, + "loss": 0.0027, + "step": 4710 + }, + { + "epoch": 1.92, + "grad_norm": 0.007467071060091257, + "learning_rate": 1.2315832315832317e-05, + "loss": 0.001, + "step": 4720 + }, + { + "epoch": 1.93, + "grad_norm": 0.005770612042397261, + "learning_rate": 1.22995522995523e-05, + "loss": 0.0422, + "step": 4730 + }, + { + "epoch": 1.93, + "grad_norm": 0.01268511638045311, + "learning_rate": 1.2283272283272285e-05, + "loss": 0.001, + "step": 4740 + }, + { + "epoch": 1.93, + "grad_norm": 0.025519585236907005, + "learning_rate": 1.2266992266992269e-05, + "loss": 0.019, + "step": 4750 + }, + { + "epoch": 1.94, + "grad_norm": 12.875621795654297, + "learning_rate": 1.2250712250712251e-05, + "loss": 0.0206, + "step": 4760 + }, + { + "epoch": 1.94, + "grad_norm": 0.018496304750442505, + "learning_rate": 1.2234432234432237e-05, + "loss": 0.0008, + "step": 4770 + }, + { + "epoch": 1.95, + "grad_norm": 0.005795106291770935, + "learning_rate": 1.2218152218152219e-05, + "loss": 0.0032, + "step": 4780 + }, + { + "epoch": 1.95, + "grad_norm": 0.005989160854369402, + "learning_rate": 1.2201872201872203e-05, + "loss": 0.0007, + "step": 4790 + }, + { + "epoch": 1.95, + "grad_norm": 0.005859148222953081, + "learning_rate": 1.2185592185592185e-05, + "loss": 0.0007, + "step": 4800 + }, + { + "epoch": 1.96, + "grad_norm": 0.008097686804831028, + "learning_rate": 1.216931216931217e-05, + "loss": 0.0007, + "step": 4810 + }, + { + "epoch": 1.96, + "grad_norm": 0.005901312455534935, + "learning_rate": 1.2153032153032153e-05, + "loss": 0.0007, + "step": 4820 + }, + { + "epoch": 1.97, + "grad_norm": 0.006804050877690315, + "learning_rate": 1.2136752136752137e-05, + "loss": 0.0009, + "step": 4830 + }, + { + "epoch": 1.97, + "grad_norm": 0.006251387298107147, + "learning_rate": 1.2120472120472123e-05, + "loss": 0.0423, + "step": 4840 + }, + { + "epoch": 1.97, + "grad_norm": 0.0055562574416399, + "learning_rate": 1.2104192104192105e-05, + "loss": 0.0008, + "step": 4850 + }, + { + "epoch": 1.98, + "grad_norm": 0.006534604821354151, + "learning_rate": 1.2087912087912089e-05, + "loss": 0.0038, + "step": 4860 + }, + { + "epoch": 1.98, + "grad_norm": 0.010235198773443699, + "learning_rate": 1.2071632071632073e-05, + "loss": 0.003, + "step": 4870 + }, + { + "epoch": 1.99, + "grad_norm": 0.006196849979460239, + "learning_rate": 1.2055352055352057e-05, + "loss": 0.0007, + "step": 4880 + }, + { + "epoch": 1.99, + "grad_norm": 0.015244298614561558, + "learning_rate": 1.2039072039072039e-05, + "loss": 0.0007, + "step": 4890 + }, + { + "epoch": 1.99, + "grad_norm": 0.03133594989776611, + "learning_rate": 1.2022792022792024e-05, + "loss": 0.0319, + "step": 4900 + }, + { + "epoch": 2.0, + "grad_norm": 0.012942776083946228, + "learning_rate": 1.2006512006512008e-05, + "loss": 0.0007, + "step": 4910 + }, + { + "epoch": 2.0, + "grad_norm": 0.0054002669639885426, + "learning_rate": 1.199023199023199e-05, + "loss": 0.0386, + "step": 4920 + }, + { + "epoch": 2.01, + "grad_norm": 0.006965090055018663, + "learning_rate": 1.1973951973951975e-05, + "loss": 0.0414, + "step": 4930 + }, + { + "epoch": 2.01, + "grad_norm": 0.005913823377341032, + "learning_rate": 1.1957671957671959e-05, + "loss": 0.0008, + "step": 4940 + }, + { + "epoch": 2.01, + "grad_norm": 0.00729360431432724, + "learning_rate": 1.1941391941391942e-05, + "loss": 0.0015, + "step": 4950 + }, + { + "epoch": 2.02, + "grad_norm": 0.005881543271243572, + "learning_rate": 1.1925111925111925e-05, + "loss": 0.0017, + "step": 4960 + }, + { + "epoch": 2.02, + "grad_norm": 0.00946744717657566, + "learning_rate": 1.190883190883191e-05, + "loss": 0.0008, + "step": 4970 + }, + { + "epoch": 2.03, + "grad_norm": 0.7791256904602051, + "learning_rate": 1.1892551892551893e-05, + "loss": 0.0456, + "step": 4980 + }, + { + "epoch": 2.03, + "grad_norm": 0.08430014550685883, + "learning_rate": 1.1876271876271877e-05, + "loss": 0.0048, + "step": 4990 + }, + { + "epoch": 2.04, + "grad_norm": 0.007524729706346989, + "learning_rate": 1.1859991859991862e-05, + "loss": 0.0008, + "step": 5000 + }, + { + "epoch": 2.04, + "grad_norm": 0.007158556021749973, + "learning_rate": 1.1843711843711844e-05, + "loss": 0.0007, + "step": 5010 + }, + { + "epoch": 2.04, + "grad_norm": 0.006158571690320969, + "learning_rate": 1.1827431827431828e-05, + "loss": 0.0007, + "step": 5020 + }, + { + "epoch": 2.05, + "grad_norm": 0.0062376465648412704, + "learning_rate": 1.1811151811151812e-05, + "loss": 0.0007, + "step": 5030 + }, + { + "epoch": 2.05, + "grad_norm": 0.009434174746274948, + "learning_rate": 1.1794871794871796e-05, + "loss": 0.0333, + "step": 5040 + }, + { + "epoch": 2.06, + "grad_norm": 0.006017903331667185, + "learning_rate": 1.1778591778591779e-05, + "loss": 0.0007, + "step": 5050 + }, + { + "epoch": 2.06, + "grad_norm": 0.007532346062362194, + "learning_rate": 1.1762311762311762e-05, + "loss": 0.0007, + "step": 5060 + }, + { + "epoch": 2.06, + "grad_norm": 0.005684974603354931, + "learning_rate": 1.1746031746031748e-05, + "loss": 0.0008, + "step": 5070 + }, + { + "epoch": 2.07, + "grad_norm": 0.005241623613983393, + "learning_rate": 1.172975172975173e-05, + "loss": 0.0306, + "step": 5080 + }, + { + "epoch": 2.07, + "grad_norm": 0.019347479566931725, + "learning_rate": 1.1713471713471714e-05, + "loss": 0.0008, + "step": 5090 + }, + { + "epoch": 2.08, + "grad_norm": 0.08700444549322128, + "learning_rate": 1.1697191697191698e-05, + "loss": 0.0009, + "step": 5100 + }, + { + "epoch": 2.08, + "grad_norm": 0.005539617035537958, + "learning_rate": 1.1680911680911682e-05, + "loss": 0.0009, + "step": 5110 + }, + { + "epoch": 2.08, + "grad_norm": 0.005851482041180134, + "learning_rate": 1.1664631664631664e-05, + "loss": 0.0007, + "step": 5120 + }, + { + "epoch": 2.09, + "grad_norm": 0.007532169576734304, + "learning_rate": 1.164835164835165e-05, + "loss": 0.0011, + "step": 5130 + }, + { + "epoch": 2.09, + "grad_norm": 0.00506225973367691, + "learning_rate": 1.1632071632071634e-05, + "loss": 0.0007, + "step": 5140 + }, + { + "epoch": 2.1, + "grad_norm": 0.005589496809989214, + "learning_rate": 1.1615791615791616e-05, + "loss": 0.0007, + "step": 5150 + }, + { + "epoch": 2.1, + "grad_norm": 0.004957486409693956, + "learning_rate": 1.1599511599511602e-05, + "loss": 0.0156, + "step": 5160 + }, + { + "epoch": 2.1, + "grad_norm": 0.00666527496650815, + "learning_rate": 1.1583231583231584e-05, + "loss": 0.0007, + "step": 5170 + }, + { + "epoch": 2.11, + "grad_norm": 0.006306789815425873, + "learning_rate": 1.1566951566951568e-05, + "loss": 0.0006, + "step": 5180 + }, + { + "epoch": 2.11, + "grad_norm": 0.005329395178705454, + "learning_rate": 1.155067155067155e-05, + "loss": 0.0006, + "step": 5190 + }, + { + "epoch": 2.12, + "grad_norm": 0.0049823857843875885, + "learning_rate": 1.1534391534391536e-05, + "loss": 0.0006, + "step": 5200 + }, + { + "epoch": 2.12, + "grad_norm": 0.0051444037817418575, + "learning_rate": 1.1518111518111518e-05, + "loss": 0.0022, + "step": 5210 + }, + { + "epoch": 2.12, + "grad_norm": 0.00532697094604373, + "learning_rate": 1.1501831501831502e-05, + "loss": 0.0006, + "step": 5220 + }, + { + "epoch": 2.13, + "grad_norm": 0.006971771828830242, + "learning_rate": 1.1485551485551488e-05, + "loss": 0.0007, + "step": 5230 + }, + { + "epoch": 2.13, + "grad_norm": 0.005065458826720715, + "learning_rate": 1.146927146927147e-05, + "loss": 0.0006, + "step": 5240 + }, + { + "epoch": 2.14, + "grad_norm": 0.00542556494474411, + "learning_rate": 1.1452991452991454e-05, + "loss": 0.0006, + "step": 5250 + }, + { + "epoch": 2.14, + "grad_norm": 0.005721778143197298, + "learning_rate": 1.1436711436711438e-05, + "loss": 0.0006, + "step": 5260 + }, + { + "epoch": 2.14, + "grad_norm": 0.0050778863951563835, + "learning_rate": 1.1420431420431422e-05, + "loss": 0.0006, + "step": 5270 + }, + { + "epoch": 2.15, + "grad_norm": 0.005689846817404032, + "learning_rate": 1.1404151404151404e-05, + "loss": 0.0007, + "step": 5280 + }, + { + "epoch": 2.15, + "grad_norm": 0.005032387096434832, + "learning_rate": 1.138787138787139e-05, + "loss": 0.0053, + "step": 5290 + }, + { + "epoch": 2.16, + "grad_norm": 0.004602556582540274, + "learning_rate": 1.1371591371591374e-05, + "loss": 0.0006, + "step": 5300 + }, + { + "epoch": 2.16, + "grad_norm": 0.005181928165256977, + "learning_rate": 1.1355311355311356e-05, + "loss": 0.0006, + "step": 5310 + }, + { + "epoch": 2.17, + "grad_norm": 0.004627116955816746, + "learning_rate": 1.1339031339031338e-05, + "loss": 0.0006, + "step": 5320 + }, + { + "epoch": 2.17, + "grad_norm": 0.004680185578763485, + "learning_rate": 1.1322751322751324e-05, + "loss": 0.0006, + "step": 5330 + }, + { + "epoch": 2.17, + "grad_norm": 0.00517154298722744, + "learning_rate": 1.1306471306471308e-05, + "loss": 0.0006, + "step": 5340 + }, + { + "epoch": 2.18, + "grad_norm": 0.2655492126941681, + "learning_rate": 1.129019129019129e-05, + "loss": 0.04, + "step": 5350 + }, + { + "epoch": 2.18, + "grad_norm": 0.004791987128555775, + "learning_rate": 1.1273911273911276e-05, + "loss": 0.0027, + "step": 5360 + }, + { + "epoch": 2.19, + "grad_norm": 0.00524140102788806, + "learning_rate": 1.1257631257631258e-05, + "loss": 0.0019, + "step": 5370 + }, + { + "epoch": 2.19, + "grad_norm": 0.004854326602071524, + "learning_rate": 1.1241351241351242e-05, + "loss": 0.0006, + "step": 5380 + }, + { + "epoch": 2.19, + "grad_norm": 0.004912737291306257, + "learning_rate": 1.1225071225071227e-05, + "loss": 0.0229, + "step": 5390 + }, + { + "epoch": 2.2, + "grad_norm": 0.009351348504424095, + "learning_rate": 1.120879120879121e-05, + "loss": 0.0006, + "step": 5400 + }, + { + "epoch": 2.2, + "grad_norm": 0.006594196427613497, + "learning_rate": 1.1192511192511194e-05, + "loss": 0.0007, + "step": 5410 + }, + { + "epoch": 2.21, + "grad_norm": 0.004785753786563873, + "learning_rate": 1.1176231176231178e-05, + "loss": 0.0006, + "step": 5420 + }, + { + "epoch": 2.21, + "grad_norm": 0.010175659321248531, + "learning_rate": 1.1159951159951162e-05, + "loss": 0.0347, + "step": 5430 + }, + { + "epoch": 2.21, + "grad_norm": 0.007659697439521551, + "learning_rate": 1.1143671143671144e-05, + "loss": 0.0006, + "step": 5440 + }, + { + "epoch": 2.22, + "grad_norm": 0.005518093705177307, + "learning_rate": 1.1127391127391128e-05, + "loss": 0.0007, + "step": 5450 + }, + { + "epoch": 2.22, + "grad_norm": 0.004838414024561644, + "learning_rate": 1.1111111111111113e-05, + "loss": 0.0006, + "step": 5460 + }, + { + "epoch": 2.23, + "grad_norm": 0.004535248037427664, + "learning_rate": 1.1094831094831096e-05, + "loss": 0.0007, + "step": 5470 + }, + { + "epoch": 2.23, + "grad_norm": 0.004755628295242786, + "learning_rate": 1.1078551078551078e-05, + "loss": 0.0006, + "step": 5480 + }, + { + "epoch": 2.23, + "grad_norm": 0.007153332699090242, + "learning_rate": 1.1062271062271063e-05, + "loss": 0.0006, + "step": 5490 + }, + { + "epoch": 2.24, + "grad_norm": 0.004593558143824339, + "learning_rate": 1.1045991045991047e-05, + "loss": 0.0006, + "step": 5500 + }, + { + "epoch": 2.24, + "grad_norm": 0.004781143739819527, + "learning_rate": 1.102971102971103e-05, + "loss": 0.0187, + "step": 5510 + }, + { + "epoch": 2.25, + "grad_norm": 0.022694548591971397, + "learning_rate": 1.1013431013431015e-05, + "loss": 0.0006, + "step": 5520 + }, + { + "epoch": 2.25, + "grad_norm": 0.004701571073383093, + "learning_rate": 1.0997150997150998e-05, + "loss": 0.0005, + "step": 5530 + }, + { + "epoch": 2.25, + "grad_norm": 0.014217639341950417, + "learning_rate": 1.0980870980870981e-05, + "loss": 0.0006, + "step": 5540 + }, + { + "epoch": 2.26, + "grad_norm": 0.0047623575665056705, + "learning_rate": 1.0964590964590967e-05, + "loss": 0.0005, + "step": 5550 + }, + { + "epoch": 2.26, + "grad_norm": 0.004431570880115032, + "learning_rate": 1.094831094831095e-05, + "loss": 0.0006, + "step": 5560 + }, + { + "epoch": 2.27, + "grad_norm": 0.006182719487696886, + "learning_rate": 1.0932030932030933e-05, + "loss": 0.0006, + "step": 5570 + }, + { + "epoch": 2.27, + "grad_norm": 0.004717973992228508, + "learning_rate": 1.0915750915750916e-05, + "loss": 0.0005, + "step": 5580 + }, + { + "epoch": 2.28, + "grad_norm": 0.005284770391881466, + "learning_rate": 1.0899470899470901e-05, + "loss": 0.0051, + "step": 5590 + }, + { + "epoch": 2.28, + "grad_norm": 0.004852925427258015, + "learning_rate": 1.0883190883190883e-05, + "loss": 0.0129, + "step": 5600 + }, + { + "epoch": 2.28, + "grad_norm": 0.011825304478406906, + "learning_rate": 1.0866910866910867e-05, + "loss": 0.0006, + "step": 5610 + }, + { + "epoch": 2.29, + "grad_norm": 5.4084672927856445, + "learning_rate": 1.0850630850630853e-05, + "loss": 0.0014, + "step": 5620 + }, + { + "epoch": 2.29, + "grad_norm": 0.0045865620486438274, + "learning_rate": 1.0834350834350835e-05, + "loss": 0.0012, + "step": 5630 + }, + { + "epoch": 2.3, + "grad_norm": 0.004212076775729656, + "learning_rate": 1.0818070818070818e-05, + "loss": 0.0005, + "step": 5640 + }, + { + "epoch": 2.3, + "grad_norm": 0.0043626646511256695, + "learning_rate": 1.0801790801790803e-05, + "loss": 0.0005, + "step": 5650 + }, + { + "epoch": 2.3, + "grad_norm": 0.003995486069470644, + "learning_rate": 1.0785510785510787e-05, + "loss": 0.0005, + "step": 5660 + }, + { + "epoch": 2.31, + "grad_norm": 12.674348831176758, + "learning_rate": 1.076923076923077e-05, + "loss": 0.0288, + "step": 5670 + }, + { + "epoch": 2.31, + "grad_norm": 0.004922170657664537, + "learning_rate": 1.0752950752950755e-05, + "loss": 0.0005, + "step": 5680 + }, + { + "epoch": 2.32, + "grad_norm": 0.013311301358044147, + "learning_rate": 1.0736670736670737e-05, + "loss": 0.0006, + "step": 5690 + }, + { + "epoch": 2.32, + "grad_norm": 0.004092982970178127, + "learning_rate": 1.0720390720390721e-05, + "loss": 0.0306, + "step": 5700 + }, + { + "epoch": 2.32, + "grad_norm": 0.005637271795421839, + "learning_rate": 1.0704110704110703e-05, + "loss": 0.0483, + "step": 5710 + }, + { + "epoch": 2.33, + "grad_norm": 8.750419616699219, + "learning_rate": 1.0687830687830689e-05, + "loss": 0.0386, + "step": 5720 + }, + { + "epoch": 2.33, + "grad_norm": 0.01064012385904789, + "learning_rate": 1.0671550671550673e-05, + "loss": 0.0006, + "step": 5730 + }, + { + "epoch": 2.34, + "grad_norm": 0.004589624237269163, + "learning_rate": 1.0655270655270655e-05, + "loss": 0.0006, + "step": 5740 + }, + { + "epoch": 2.34, + "grad_norm": 0.004802080802619457, + "learning_rate": 1.0638990638990641e-05, + "loss": 0.0009, + "step": 5750 + }, + { + "epoch": 2.34, + "grad_norm": 0.004713993053883314, + "learning_rate": 1.0622710622710623e-05, + "loss": 0.0006, + "step": 5760 + }, + { + "epoch": 2.35, + "grad_norm": 0.004530477803200483, + "learning_rate": 1.0606430606430607e-05, + "loss": 0.001, + "step": 5770 + }, + { + "epoch": 2.35, + "grad_norm": 0.00422940356656909, + "learning_rate": 1.0590150590150593e-05, + "loss": 0.0007, + "step": 5780 + }, + { + "epoch": 2.36, + "grad_norm": 0.004178835544735193, + "learning_rate": 1.0573870573870575e-05, + "loss": 0.0006, + "step": 5790 + }, + { + "epoch": 2.36, + "grad_norm": 0.006506350357085466, + "learning_rate": 1.0557590557590557e-05, + "loss": 0.0005, + "step": 5800 + }, + { + "epoch": 2.36, + "grad_norm": 0.004273206926882267, + "learning_rate": 1.0541310541310543e-05, + "loss": 0.0005, + "step": 5810 + }, + { + "epoch": 2.37, + "grad_norm": 0.004112168215215206, + "learning_rate": 1.0525030525030527e-05, + "loss": 0.0005, + "step": 5820 + }, + { + "epoch": 2.37, + "grad_norm": 0.005212805233895779, + "learning_rate": 1.0508750508750509e-05, + "loss": 0.0005, + "step": 5830 + }, + { + "epoch": 2.38, + "grad_norm": 0.004351438954472542, + "learning_rate": 1.0492470492470493e-05, + "loss": 0.0005, + "step": 5840 + }, + { + "epoch": 2.38, + "grad_norm": 0.011514941230416298, + "learning_rate": 1.0476190476190477e-05, + "loss": 0.0005, + "step": 5850 + }, + { + "epoch": 2.39, + "grad_norm": 0.005969716235995293, + "learning_rate": 1.045991045991046e-05, + "loss": 0.0005, + "step": 5860 + }, + { + "epoch": 2.39, + "grad_norm": 0.004150481894612312, + "learning_rate": 1.0443630443630443e-05, + "loss": 0.0005, + "step": 5870 + }, + { + "epoch": 2.39, + "grad_norm": 0.003940541297197342, + "learning_rate": 1.0427350427350429e-05, + "loss": 0.0005, + "step": 5880 + }, + { + "epoch": 2.4, + "grad_norm": 0.00408910820260644, + "learning_rate": 1.0411070411070413e-05, + "loss": 0.0005, + "step": 5890 + }, + { + "epoch": 2.4, + "grad_norm": 0.0038459610659629107, + "learning_rate": 1.0394790394790395e-05, + "loss": 0.0006, + "step": 5900 + }, + { + "epoch": 2.41, + "grad_norm": 0.004051607567816973, + "learning_rate": 1.037851037851038e-05, + "loss": 0.0449, + "step": 5910 + }, + { + "epoch": 2.41, + "grad_norm": 0.005520727019757032, + "learning_rate": 1.0362230362230363e-05, + "loss": 0.0078, + "step": 5920 + }, + { + "epoch": 2.41, + "grad_norm": 0.004394978284835815, + "learning_rate": 1.0345950345950347e-05, + "loss": 0.0564, + "step": 5930 + }, + { + "epoch": 2.42, + "grad_norm": 0.004857844207435846, + "learning_rate": 1.0329670329670332e-05, + "loss": 0.0005, + "step": 5940 + }, + { + "epoch": 2.42, + "grad_norm": 0.06114115193486214, + "learning_rate": 1.0313390313390315e-05, + "loss": 0.0007, + "step": 5950 + }, + { + "epoch": 2.43, + "grad_norm": 0.004661387763917446, + "learning_rate": 1.0297110297110297e-05, + "loss": 0.0014, + "step": 5960 + }, + { + "epoch": 2.43, + "grad_norm": 0.005134343635290861, + "learning_rate": 1.028083028083028e-05, + "loss": 0.001, + "step": 5970 + }, + { + "epoch": 2.43, + "grad_norm": 0.004655875731259584, + "learning_rate": 1.0264550264550266e-05, + "loss": 0.0006, + "step": 5980 + }, + { + "epoch": 2.44, + "grad_norm": 0.0050579700618982315, + "learning_rate": 1.0248270248270249e-05, + "loss": 0.0015, + "step": 5990 + }, + { + "epoch": 2.44, + "grad_norm": 0.0047796061262488365, + "learning_rate": 1.0231990231990233e-05, + "loss": 0.0005, + "step": 6000 + }, + { + "epoch": 2.45, + "grad_norm": 0.003949730657041073, + "learning_rate": 1.0215710215710217e-05, + "loss": 0.0005, + "step": 6010 + }, + { + "epoch": 2.45, + "grad_norm": 0.004095940385013819, + "learning_rate": 1.01994301994302e-05, + "loss": 0.0005, + "step": 6020 + }, + { + "epoch": 2.45, + "grad_norm": 0.005133763421326876, + "learning_rate": 1.0183150183150183e-05, + "loss": 0.0005, + "step": 6030 + }, + { + "epoch": 2.46, + "grad_norm": 0.01469303946942091, + "learning_rate": 1.0166870166870168e-05, + "loss": 0.0013, + "step": 6040 + }, + { + "epoch": 2.46, + "grad_norm": 0.004049224779009819, + "learning_rate": 1.0150590150590152e-05, + "loss": 0.0018, + "step": 6050 + }, + { + "epoch": 2.47, + "grad_norm": 0.004054594319313765, + "learning_rate": 1.0134310134310135e-05, + "loss": 0.0184, + "step": 6060 + }, + { + "epoch": 2.47, + "grad_norm": 0.004326994996517897, + "learning_rate": 1.011803011803012e-05, + "loss": 0.0005, + "step": 6070 + }, + { + "epoch": 2.47, + "grad_norm": 0.004046597983688116, + "learning_rate": 1.0101750101750102e-05, + "loss": 0.0005, + "step": 6080 + }, + { + "epoch": 2.48, + "grad_norm": 0.00401376374065876, + "learning_rate": 1.0085470085470086e-05, + "loss": 0.0006, + "step": 6090 + }, + { + "epoch": 2.48, + "grad_norm": 0.005812219809740782, + "learning_rate": 1.0069190069190069e-05, + "loss": 0.0006, + "step": 6100 + }, + { + "epoch": 2.49, + "grad_norm": 0.003832985181361437, + "learning_rate": 1.0052910052910054e-05, + "loss": 0.0005, + "step": 6110 + }, + { + "epoch": 2.49, + "grad_norm": 0.07061895728111267, + "learning_rate": 1.0036630036630037e-05, + "loss": 0.0005, + "step": 6120 + }, + { + "epoch": 2.49, + "grad_norm": 0.0039010499604046345, + "learning_rate": 1.002035002035002e-05, + "loss": 0.0017, + "step": 6130 + }, + { + "epoch": 2.5, + "grad_norm": 0.004468753468245268, + "learning_rate": 1.0004070004070006e-05, + "loss": 0.0022, + "step": 6140 + }, + { + "epoch": 2.5, + "grad_norm": 0.004342631436884403, + "learning_rate": 9.987789987789988e-06, + "loss": 0.0005, + "step": 6150 + }, + { + "epoch": 2.51, + "grad_norm": 0.011564524844288826, + "learning_rate": 9.971509971509972e-06, + "loss": 0.0005, + "step": 6160 + }, + { + "epoch": 2.51, + "grad_norm": 0.00380577496252954, + "learning_rate": 9.955229955229956e-06, + "loss": 0.0005, + "step": 6170 + }, + { + "epoch": 2.52, + "grad_norm": 0.0037414473481476307, + "learning_rate": 9.93894993894994e-06, + "loss": 0.0011, + "step": 6180 + }, + { + "epoch": 2.52, + "grad_norm": 0.003814409486949444, + "learning_rate": 9.922669922669922e-06, + "loss": 0.0005, + "step": 6190 + }, + { + "epoch": 2.52, + "grad_norm": 0.0039341021329164505, + "learning_rate": 9.906389906389906e-06, + "loss": 0.0005, + "step": 6200 + }, + { + "epoch": 2.53, + "grad_norm": 0.003710733028128743, + "learning_rate": 9.890109890109892e-06, + "loss": 0.0005, + "step": 6210 + }, + { + "epoch": 2.53, + "grad_norm": 3.491090774536133, + "learning_rate": 9.873829873829874e-06, + "loss": 0.0356, + "step": 6220 + }, + { + "epoch": 2.54, + "grad_norm": 0.005923949647694826, + "learning_rate": 9.857549857549858e-06, + "loss": 0.0005, + "step": 6230 + }, + { + "epoch": 2.54, + "grad_norm": 0.0036399513483047485, + "learning_rate": 9.841269841269842e-06, + "loss": 0.0004, + "step": 6240 + }, + { + "epoch": 2.54, + "grad_norm": 0.003884287318214774, + "learning_rate": 9.824989824989826e-06, + "loss": 0.0005, + "step": 6250 + }, + { + "epoch": 2.55, + "grad_norm": 0.0036194841377437115, + "learning_rate": 9.80870980870981e-06, + "loss": 0.0004, + "step": 6260 + }, + { + "epoch": 2.55, + "grad_norm": 0.005207626614719629, + "learning_rate": 9.792429792429792e-06, + "loss": 0.0004, + "step": 6270 + }, + { + "epoch": 2.56, + "grad_norm": 0.008327057585120201, + "learning_rate": 9.776149776149776e-06, + "loss": 0.0005, + "step": 6280 + }, + { + "epoch": 2.56, + "grad_norm": 0.003949583508074284, + "learning_rate": 9.759869759869762e-06, + "loss": 0.0004, + "step": 6290 + }, + { + "epoch": 2.56, + "grad_norm": 0.004071325063705444, + "learning_rate": 9.743589743589744e-06, + "loss": 0.0005, + "step": 6300 + }, + { + "epoch": 2.57, + "grad_norm": 0.0036700996570289135, + "learning_rate": 9.727309727309728e-06, + "loss": 0.0004, + "step": 6310 + }, + { + "epoch": 2.57, + "grad_norm": 0.005211398471146822, + "learning_rate": 9.711029711029712e-06, + "loss": 0.0005, + "step": 6320 + }, + { + "epoch": 2.58, + "grad_norm": 0.003708272473886609, + "learning_rate": 9.694749694749696e-06, + "loss": 0.0004, + "step": 6330 + }, + { + "epoch": 2.58, + "grad_norm": 0.0042539420537650585, + "learning_rate": 9.67846967846968e-06, + "loss": 0.0004, + "step": 6340 + }, + { + "epoch": 2.58, + "grad_norm": 0.0038529515732079744, + "learning_rate": 9.662189662189662e-06, + "loss": 0.0004, + "step": 6350 + }, + { + "epoch": 2.59, + "grad_norm": 0.003946115728467703, + "learning_rate": 9.645909645909646e-06, + "loss": 0.0005, + "step": 6360 + }, + { + "epoch": 2.59, + "grad_norm": 0.004324799869209528, + "learning_rate": 9.62962962962963e-06, + "loss": 0.0004, + "step": 6370 + }, + { + "epoch": 2.6, + "grad_norm": 0.0038023737724870443, + "learning_rate": 9.613349613349614e-06, + "loss": 0.0004, + "step": 6380 + }, + { + "epoch": 2.6, + "grad_norm": 0.0037666463758796453, + "learning_rate": 9.597069597069598e-06, + "loss": 0.0004, + "step": 6390 + }, + { + "epoch": 2.6, + "grad_norm": 0.0034590172581374645, + "learning_rate": 9.580789580789582e-06, + "loss": 0.017, + "step": 6400 + }, + { + "epoch": 2.61, + "grad_norm": 0.0038201683200895786, + "learning_rate": 9.564509564509566e-06, + "loss": 0.0004, + "step": 6410 + }, + { + "epoch": 2.61, + "grad_norm": 0.004171228501945734, + "learning_rate": 9.54822954822955e-06, + "loss": 0.0004, + "step": 6420 + }, + { + "epoch": 2.62, + "grad_norm": 0.0038926773704588413, + "learning_rate": 9.531949531949532e-06, + "loss": 0.0004, + "step": 6430 + }, + { + "epoch": 2.62, + "grad_norm": 0.0037587357219308615, + "learning_rate": 9.515669515669516e-06, + "loss": 0.0004, + "step": 6440 + }, + { + "epoch": 2.63, + "grad_norm": 0.0034505994990468025, + "learning_rate": 9.4993894993895e-06, + "loss": 0.0024, + "step": 6450 + }, + { + "epoch": 2.63, + "grad_norm": 0.0034958263859152794, + "learning_rate": 9.483109483109484e-06, + "loss": 0.0004, + "step": 6460 + }, + { + "epoch": 2.63, + "grad_norm": 0.0037652612663805485, + "learning_rate": 9.466829466829468e-06, + "loss": 0.0004, + "step": 6470 + }, + { + "epoch": 2.64, + "grad_norm": 0.003452475182712078, + "learning_rate": 9.450549450549452e-06, + "loss": 0.0004, + "step": 6480 + }, + { + "epoch": 2.64, + "grad_norm": 0.005090977996587753, + "learning_rate": 9.434269434269436e-06, + "loss": 0.0004, + "step": 6490 + }, + { + "epoch": 2.65, + "grad_norm": 0.0036007205490022898, + "learning_rate": 9.417989417989418e-06, + "loss": 0.0004, + "step": 6500 + }, + { + "epoch": 2.65, + "grad_norm": 0.0033244409132748842, + "learning_rate": 9.401709401709402e-06, + "loss": 0.0004, + "step": 6510 + }, + { + "epoch": 2.65, + "grad_norm": 0.00387198431417346, + "learning_rate": 9.385429385429386e-06, + "loss": 0.0004, + "step": 6520 + }, + { + "epoch": 2.66, + "grad_norm": 0.003582969307899475, + "learning_rate": 9.36914936914937e-06, + "loss": 0.0004, + "step": 6530 + }, + { + "epoch": 2.66, + "grad_norm": 0.0032744621858000755, + "learning_rate": 9.352869352869354e-06, + "loss": 0.0005, + "step": 6540 + }, + { + "epoch": 2.67, + "grad_norm": 0.0034951018169522285, + "learning_rate": 9.336589336589338e-06, + "loss": 0.0004, + "step": 6550 + }, + { + "epoch": 2.67, + "grad_norm": 0.0034060273319482803, + "learning_rate": 9.320309320309321e-06, + "loss": 0.0004, + "step": 6560 + }, + { + "epoch": 2.67, + "grad_norm": 0.0034066797234117985, + "learning_rate": 9.304029304029305e-06, + "loss": 0.0004, + "step": 6570 + }, + { + "epoch": 2.68, + "grad_norm": 0.0035453049931675196, + "learning_rate": 9.287749287749288e-06, + "loss": 0.0004, + "step": 6580 + }, + { + "epoch": 2.68, + "grad_norm": 0.0033404843416064978, + "learning_rate": 9.271469271469272e-06, + "loss": 0.0004, + "step": 6590 + }, + { + "epoch": 2.69, + "grad_norm": 0.0032289137598127127, + "learning_rate": 9.255189255189256e-06, + "loss": 0.0004, + "step": 6600 + }, + { + "epoch": 2.69, + "grad_norm": 0.0035338301677256823, + "learning_rate": 9.23890923890924e-06, + "loss": 0.0004, + "step": 6610 + }, + { + "epoch": 2.69, + "grad_norm": 0.0032329142559319735, + "learning_rate": 9.222629222629223e-06, + "loss": 0.0004, + "step": 6620 + }, + { + "epoch": 2.7, + "grad_norm": 0.0033918411936610937, + "learning_rate": 9.206349206349207e-06, + "loss": 0.0004, + "step": 6630 + }, + { + "epoch": 2.7, + "grad_norm": 0.003434843849390745, + "learning_rate": 9.190069190069191e-06, + "loss": 0.0004, + "step": 6640 + }, + { + "epoch": 2.71, + "grad_norm": 0.0032904883846640587, + "learning_rate": 9.173789173789175e-06, + "loss": 0.0004, + "step": 6650 + }, + { + "epoch": 2.71, + "grad_norm": 0.003165784990414977, + "learning_rate": 9.157509157509158e-06, + "loss": 0.0004, + "step": 6660 + }, + { + "epoch": 2.71, + "grad_norm": 0.0034379889257252216, + "learning_rate": 9.141229141229141e-06, + "loss": 0.0004, + "step": 6670 + }, + { + "epoch": 2.72, + "grad_norm": 0.0032244266476482153, + "learning_rate": 9.124949124949125e-06, + "loss": 0.001, + "step": 6680 + }, + { + "epoch": 2.72, + "grad_norm": 0.003119837259873748, + "learning_rate": 9.10866910866911e-06, + "loss": 0.0004, + "step": 6690 + }, + { + "epoch": 2.73, + "grad_norm": 0.0038290254306048155, + "learning_rate": 9.092389092389093e-06, + "loss": 0.0004, + "step": 6700 + }, + { + "epoch": 2.73, + "grad_norm": 0.0032256192062050104, + "learning_rate": 9.076109076109077e-06, + "loss": 0.0004, + "step": 6710 + }, + { + "epoch": 2.74, + "grad_norm": 0.004083781037479639, + "learning_rate": 9.059829059829061e-06, + "loss": 0.0004, + "step": 6720 + }, + { + "epoch": 2.74, + "grad_norm": 0.003274232381954789, + "learning_rate": 9.043549043549045e-06, + "loss": 0.0004, + "step": 6730 + }, + { + "epoch": 2.74, + "grad_norm": 0.0032298911828547716, + "learning_rate": 9.027269027269027e-06, + "loss": 0.0004, + "step": 6740 + }, + { + "epoch": 2.75, + "grad_norm": 0.0031462605111300945, + "learning_rate": 9.010989010989011e-06, + "loss": 0.0425, + "step": 6750 + }, + { + "epoch": 2.75, + "grad_norm": 0.00312459422275424, + "learning_rate": 8.994708994708995e-06, + "loss": 0.0004, + "step": 6760 + }, + { + "epoch": 2.76, + "grad_norm": 0.0036323266103863716, + "learning_rate": 8.97842897842898e-06, + "loss": 0.0004, + "step": 6770 + }, + { + "epoch": 2.76, + "grad_norm": 0.0033034805674105883, + "learning_rate": 8.962148962148963e-06, + "loss": 0.0004, + "step": 6780 + }, + { + "epoch": 2.76, + "grad_norm": 0.003054459812119603, + "learning_rate": 8.945868945868947e-06, + "loss": 0.0004, + "step": 6790 + }, + { + "epoch": 2.77, + "grad_norm": 0.005314236972481012, + "learning_rate": 8.929588929588931e-06, + "loss": 0.0004, + "step": 6800 + }, + { + "epoch": 2.77, + "grad_norm": 0.010932357981801033, + "learning_rate": 8.913308913308915e-06, + "loss": 0.0004, + "step": 6810 + }, + { + "epoch": 2.78, + "grad_norm": 0.0031523159705102444, + "learning_rate": 8.897028897028897e-06, + "loss": 0.0004, + "step": 6820 + }, + { + "epoch": 2.78, + "grad_norm": 0.0034312924835830927, + "learning_rate": 8.880748880748881e-06, + "loss": 0.0101, + "step": 6830 + }, + { + "epoch": 2.78, + "grad_norm": 0.00318572367541492, + "learning_rate": 8.864468864468865e-06, + "loss": 0.0004, + "step": 6840 + }, + { + "epoch": 2.79, + "grad_norm": 0.0032511164899915457, + "learning_rate": 8.848188848188849e-06, + "loss": 0.0379, + "step": 6850 + }, + { + "epoch": 2.79, + "grad_norm": 0.0037167894188314676, + "learning_rate": 8.831908831908833e-06, + "loss": 0.0004, + "step": 6860 + }, + { + "epoch": 2.8, + "grad_norm": 0.003721152199432254, + "learning_rate": 8.815628815628817e-06, + "loss": 0.0008, + "step": 6870 + }, + { + "epoch": 2.8, + "grad_norm": 0.0030927169136703014, + "learning_rate": 8.7993487993488e-06, + "loss": 0.0004, + "step": 6880 + }, + { + "epoch": 2.8, + "grad_norm": 0.0034221247769892216, + "learning_rate": 8.783068783068783e-06, + "loss": 0.0031, + "step": 6890 + }, + { + "epoch": 2.81, + "grad_norm": 0.0033293466549366713, + "learning_rate": 8.766788766788767e-06, + "loss": 0.0004, + "step": 6900 + }, + { + "epoch": 2.81, + "grad_norm": 0.003214113647118211, + "learning_rate": 8.750508750508751e-06, + "loss": 0.0004, + "step": 6910 + }, + { + "epoch": 2.82, + "grad_norm": 0.0032116910442709923, + "learning_rate": 8.734228734228735e-06, + "loss": 0.034, + "step": 6920 + }, + { + "epoch": 2.82, + "grad_norm": 0.003405655035749078, + "learning_rate": 8.717948717948719e-06, + "loss": 0.0515, + "step": 6930 + }, + { + "epoch": 2.82, + "grad_norm": 0.003949843347072601, + "learning_rate": 8.701668701668703e-06, + "loss": 0.0508, + "step": 6940 + }, + { + "epoch": 2.83, + "grad_norm": 0.0030140685848891735, + "learning_rate": 8.685388685388687e-06, + "loss": 0.0385, + "step": 6950 + }, + { + "epoch": 2.83, + "grad_norm": 0.0034769896883517504, + "learning_rate": 8.66910866910867e-06, + "loss": 0.0004, + "step": 6960 + }, + { + "epoch": 2.84, + "grad_norm": 0.005965403746813536, + "learning_rate": 8.652828652828653e-06, + "loss": 0.0454, + "step": 6970 + }, + { + "epoch": 2.84, + "grad_norm": 0.004475270863622427, + "learning_rate": 8.636548636548637e-06, + "loss": 0.0005, + "step": 6980 + }, + { + "epoch": 2.84, + "grad_norm": 0.0039094300009310246, + "learning_rate": 8.62026862026862e-06, + "loss": 0.0005, + "step": 6990 + }, + { + "epoch": 2.85, + "grad_norm": 0.004547227174043655, + "learning_rate": 8.603988603988605e-06, + "loss": 0.0004, + "step": 7000 + }, + { + "epoch": 2.85, + "grad_norm": 0.0033658877946436405, + "learning_rate": 8.587708587708589e-06, + "loss": 0.0005, + "step": 7010 + }, + { + "epoch": 2.86, + "grad_norm": 0.0037282053381204605, + "learning_rate": 8.571428571428571e-06, + "loss": 0.0005, + "step": 7020 + }, + { + "epoch": 2.86, + "grad_norm": 0.012108271941542625, + "learning_rate": 8.555148555148557e-06, + "loss": 0.0005, + "step": 7030 + }, + { + "epoch": 2.87, + "grad_norm": 0.00378889380954206, + "learning_rate": 8.53886853886854e-06, + "loss": 0.0142, + "step": 7040 + }, + { + "epoch": 2.87, + "grad_norm": 0.0037225610576570034, + "learning_rate": 8.522588522588523e-06, + "loss": 0.0009, + "step": 7050 + }, + { + "epoch": 2.87, + "grad_norm": 0.005083514377474785, + "learning_rate": 8.506308506308507e-06, + "loss": 0.0004, + "step": 7060 + }, + { + "epoch": 2.88, + "grad_norm": 0.0035945470444858074, + "learning_rate": 8.49002849002849e-06, + "loss": 0.0005, + "step": 7070 + }, + { + "epoch": 2.88, + "grad_norm": 0.0031938895117491484, + "learning_rate": 8.473748473748475e-06, + "loss": 0.0007, + "step": 7080 + }, + { + "epoch": 2.89, + "grad_norm": 0.007891247980296612, + "learning_rate": 8.457468457468459e-06, + "loss": 0.0004, + "step": 7090 + }, + { + "epoch": 2.89, + "grad_norm": 0.003397688502445817, + "learning_rate": 8.44118844118844e-06, + "loss": 0.0004, + "step": 7100 + }, + { + "epoch": 2.89, + "grad_norm": 0.004096095450222492, + "learning_rate": 8.424908424908426e-06, + "loss": 0.0004, + "step": 7110 + }, + { + "epoch": 2.9, + "grad_norm": 0.004969074856489897, + "learning_rate": 8.40862840862841e-06, + "loss": 0.0004, + "step": 7120 + }, + { + "epoch": 2.9, + "grad_norm": 0.002869043732061982, + "learning_rate": 8.392348392348393e-06, + "loss": 0.0376, + "step": 7130 + }, + { + "epoch": 2.91, + "grad_norm": 0.004255395848304033, + "learning_rate": 8.376068376068377e-06, + "loss": 0.0004, + "step": 7140 + }, + { + "epoch": 2.91, + "grad_norm": 0.003371414029970765, + "learning_rate": 8.35978835978836e-06, + "loss": 0.0005, + "step": 7150 + }, + { + "epoch": 2.91, + "grad_norm": 0.0031468465458601713, + "learning_rate": 8.343508343508344e-06, + "loss": 0.0004, + "step": 7160 + }, + { + "epoch": 2.92, + "grad_norm": 0.004064807202666998, + "learning_rate": 8.327228327228328e-06, + "loss": 0.0004, + "step": 7170 + }, + { + "epoch": 2.92, + "grad_norm": 0.0038253762759268284, + "learning_rate": 8.31094831094831e-06, + "loss": 0.0275, + "step": 7180 + }, + { + "epoch": 2.93, + "grad_norm": 0.0029601927381008863, + "learning_rate": 8.294668294668296e-06, + "loss": 0.0162, + "step": 7190 + }, + { + "epoch": 2.93, + "grad_norm": 0.0035592832136899233, + "learning_rate": 8.278388278388278e-06, + "loss": 0.001, + "step": 7200 + }, + { + "epoch": 2.93, + "grad_norm": 0.003166656941175461, + "learning_rate": 8.262108262108262e-06, + "loss": 0.0004, + "step": 7210 + }, + { + "epoch": 2.94, + "grad_norm": 0.0038591506890952587, + "learning_rate": 8.245828245828246e-06, + "loss": 0.0004, + "step": 7220 + }, + { + "epoch": 2.94, + "grad_norm": 0.004316645674407482, + "learning_rate": 8.22954822954823e-06, + "loss": 0.0339, + "step": 7230 + }, + { + "epoch": 2.95, + "grad_norm": 0.003106352873146534, + "learning_rate": 8.213268213268214e-06, + "loss": 0.0004, + "step": 7240 + }, + { + "epoch": 2.95, + "grad_norm": 0.003383921692147851, + "learning_rate": 8.196988196988198e-06, + "loss": 0.0004, + "step": 7250 + }, + { + "epoch": 2.95, + "grad_norm": 0.003904301906004548, + "learning_rate": 8.18070818070818e-06, + "loss": 0.009, + "step": 7260 + }, + { + "epoch": 2.96, + "grad_norm": 0.002857522340491414, + "learning_rate": 8.164428164428166e-06, + "loss": 0.0004, + "step": 7270 + }, + { + "epoch": 2.96, + "grad_norm": 0.0028671324253082275, + "learning_rate": 8.148148148148148e-06, + "loss": 0.0004, + "step": 7280 + }, + { + "epoch": 2.97, + "grad_norm": 0.0028230687603354454, + "learning_rate": 8.131868131868132e-06, + "loss": 0.0009, + "step": 7290 + }, + { + "epoch": 2.97, + "grad_norm": 0.0028381363954395056, + "learning_rate": 8.115588115588116e-06, + "loss": 0.0003, + "step": 7300 + }, + { + "epoch": 2.98, + "grad_norm": 0.0028295046649873257, + "learning_rate": 8.0993080993081e-06, + "loss": 0.0099, + "step": 7310 + }, + { + "epoch": 2.98, + "grad_norm": 0.0051268660463392735, + "learning_rate": 8.083028083028084e-06, + "loss": 0.0004, + "step": 7320 + }, + { + "epoch": 2.98, + "grad_norm": 0.006851341109722853, + "learning_rate": 8.066748066748066e-06, + "loss": 0.0569, + "step": 7330 + }, + { + "epoch": 2.99, + "grad_norm": 0.003248844761401415, + "learning_rate": 8.05046805046805e-06, + "loss": 0.0004, + "step": 7340 + }, + { + "epoch": 2.99, + "grad_norm": 0.003859333461150527, + "learning_rate": 8.034188034188036e-06, + "loss": 0.0012, + "step": 7350 + }, + { + "epoch": 3.0, + "grad_norm": 0.002941732294857502, + "learning_rate": 8.017908017908018e-06, + "loss": 0.0145, + "step": 7360 + }, + { + "epoch": 3.0, + "grad_norm": 0.0032136046793311834, + "learning_rate": 8.001628001628002e-06, + "loss": 0.0004, + "step": 7370 + }, + { + "epoch": 3.0, + "grad_norm": 0.0037520972546190023, + "learning_rate": 7.985347985347986e-06, + "loss": 0.0003, + "step": 7380 + }, + { + "epoch": 3.01, + "grad_norm": 0.002765586832538247, + "learning_rate": 7.96906796906797e-06, + "loss": 0.0003, + "step": 7390 + }, + { + "epoch": 3.01, + "grad_norm": 0.002917984500527382, + "learning_rate": 7.952787952787954e-06, + "loss": 0.0003, + "step": 7400 + }, + { + "epoch": 3.02, + "grad_norm": 0.002771808998659253, + "learning_rate": 7.936507936507936e-06, + "loss": 0.0003, + "step": 7410 + }, + { + "epoch": 3.02, + "grad_norm": 0.0028077957686036825, + "learning_rate": 7.92022792022792e-06, + "loss": 0.0003, + "step": 7420 + }, + { + "epoch": 3.02, + "grad_norm": 0.014859122224152088, + "learning_rate": 7.903947903947906e-06, + "loss": 0.0187, + "step": 7430 + }, + { + "epoch": 3.03, + "grad_norm": 0.0029327922966331244, + "learning_rate": 7.887667887667888e-06, + "loss": 0.0003, + "step": 7440 + }, + { + "epoch": 3.03, + "grad_norm": 0.003155101090669632, + "learning_rate": 7.871387871387872e-06, + "loss": 0.0003, + "step": 7450 + }, + { + "epoch": 3.04, + "grad_norm": 0.002822224283590913, + "learning_rate": 7.855107855107856e-06, + "loss": 0.0003, + "step": 7460 + }, + { + "epoch": 3.04, + "grad_norm": 0.002726204926148057, + "learning_rate": 7.83882783882784e-06, + "loss": 0.0003, + "step": 7470 + }, + { + "epoch": 3.04, + "grad_norm": 0.0026202842127531767, + "learning_rate": 7.822547822547824e-06, + "loss": 0.0004, + "step": 7480 + }, + { + "epoch": 3.05, + "grad_norm": 0.0026620635762810707, + "learning_rate": 7.806267806267806e-06, + "loss": 0.0003, + "step": 7490 + }, + { + "epoch": 3.05, + "grad_norm": 0.0026845140382647514, + "learning_rate": 7.78998778998779e-06, + "loss": 0.0003, + "step": 7500 + }, + { + "epoch": 3.06, + "grad_norm": 0.002793940482661128, + "learning_rate": 7.773707773707776e-06, + "loss": 0.0004, + "step": 7510 + }, + { + "epoch": 3.06, + "grad_norm": 0.002819318324327469, + "learning_rate": 7.757427757427758e-06, + "loss": 0.0003, + "step": 7520 + }, + { + "epoch": 3.06, + "grad_norm": 0.0027769345324486494, + "learning_rate": 7.741147741147742e-06, + "loss": 0.0003, + "step": 7530 + }, + { + "epoch": 3.07, + "grad_norm": 0.002659664023667574, + "learning_rate": 7.724867724867726e-06, + "loss": 0.0003, + "step": 7540 + }, + { + "epoch": 3.07, + "grad_norm": 0.0025388060603290796, + "learning_rate": 7.70858770858771e-06, + "loss": 0.0003, + "step": 7550 + }, + { + "epoch": 3.08, + "grad_norm": 0.002629263559356332, + "learning_rate": 7.692307692307694e-06, + "loss": 0.0003, + "step": 7560 + }, + { + "epoch": 3.08, + "grad_norm": 0.0025471756234765053, + "learning_rate": 7.676027676027676e-06, + "loss": 0.0003, + "step": 7570 + }, + { + "epoch": 3.09, + "grad_norm": 0.006246237549930811, + "learning_rate": 7.65974765974766e-06, + "loss": 0.0003, + "step": 7580 + }, + { + "epoch": 3.09, + "grad_norm": 0.0031642026733607054, + "learning_rate": 7.643467643467644e-06, + "loss": 0.0003, + "step": 7590 + }, + { + "epoch": 3.09, + "grad_norm": 0.0028460524044930935, + "learning_rate": 7.627187627187628e-06, + "loss": 0.0003, + "step": 7600 + }, + { + "epoch": 3.1, + "grad_norm": 0.0027321220841258764, + "learning_rate": 7.610907610907612e-06, + "loss": 0.0004, + "step": 7610 + }, + { + "epoch": 3.1, + "grad_norm": 0.07277552038431168, + "learning_rate": 7.594627594627595e-06, + "loss": 0.0003, + "step": 7620 + }, + { + "epoch": 3.11, + "grad_norm": 0.002561114262789488, + "learning_rate": 7.578347578347579e-06, + "loss": 0.0003, + "step": 7630 + }, + { + "epoch": 3.11, + "grad_norm": 0.002666006563231349, + "learning_rate": 7.5620675620675634e-06, + "loss": 0.0003, + "step": 7640 + }, + { + "epoch": 3.11, + "grad_norm": 0.003249433124437928, + "learning_rate": 7.5457875457875465e-06, + "loss": 0.0003, + "step": 7650 + }, + { + "epoch": 3.12, + "grad_norm": 0.002814142033457756, + "learning_rate": 7.5295075295075305e-06, + "loss": 0.0003, + "step": 7660 + }, + { + "epoch": 3.12, + "grad_norm": 0.002647695131599903, + "learning_rate": 7.5132275132275136e-06, + "loss": 0.0003, + "step": 7670 + }, + { + "epoch": 3.13, + "grad_norm": 0.0028357000555843115, + "learning_rate": 7.4969474969474975e-06, + "loss": 0.0003, + "step": 7680 + }, + { + "epoch": 3.13, + "grad_norm": 0.002574663609266281, + "learning_rate": 7.4806674806674814e-06, + "loss": 0.0003, + "step": 7690 + }, + { + "epoch": 3.13, + "grad_norm": 0.002485772827640176, + "learning_rate": 7.4643874643874645e-06, + "loss": 0.0004, + "step": 7700 + }, + { + "epoch": 3.14, + "grad_norm": 0.0026384114753454924, + "learning_rate": 7.448107448107449e-06, + "loss": 0.0003, + "step": 7710 + }, + { + "epoch": 3.14, + "grad_norm": 0.0025012667756527662, + "learning_rate": 7.4318274318274316e-06, + "loss": 0.0003, + "step": 7720 + }, + { + "epoch": 3.15, + "grad_norm": 0.0023603325244039297, + "learning_rate": 7.415547415547416e-06, + "loss": 0.0008, + "step": 7730 + }, + { + "epoch": 3.15, + "grad_norm": 0.006851641461253166, + "learning_rate": 7.3992673992674e-06, + "loss": 0.0003, + "step": 7740 + }, + { + "epoch": 3.15, + "grad_norm": 0.0029785565566271544, + "learning_rate": 7.382987382987383e-06, + "loss": 0.0003, + "step": 7750 + }, + { + "epoch": 3.16, + "grad_norm": 0.002378121018409729, + "learning_rate": 7.366707366707367e-06, + "loss": 0.0062, + "step": 7760 + }, + { + "epoch": 3.16, + "grad_norm": 0.0024877325631678104, + "learning_rate": 7.350427350427351e-06, + "loss": 0.0003, + "step": 7770 + }, + { + "epoch": 3.17, + "grad_norm": 0.004979600198566914, + "learning_rate": 7.334147334147334e-06, + "loss": 0.0003, + "step": 7780 + }, + { + "epoch": 3.17, + "grad_norm": 0.002649629721418023, + "learning_rate": 7.317867317867319e-06, + "loss": 0.0003, + "step": 7790 + }, + { + "epoch": 3.17, + "grad_norm": 0.0030928929336369038, + "learning_rate": 7.301587301587301e-06, + "loss": 0.0003, + "step": 7800 + }, + { + "epoch": 3.18, + "grad_norm": 0.00250143650919199, + "learning_rate": 7.285307285307286e-06, + "loss": 0.0003, + "step": 7810 + }, + { + "epoch": 3.18, + "grad_norm": 0.002448960905894637, + "learning_rate": 7.26902726902727e-06, + "loss": 0.0003, + "step": 7820 + }, + { + "epoch": 3.19, + "grad_norm": 0.0023297348525375128, + "learning_rate": 7.252747252747253e-06, + "loss": 0.0003, + "step": 7830 + }, + { + "epoch": 3.19, + "grad_norm": 0.0023908980656415224, + "learning_rate": 7.236467236467237e-06, + "loss": 0.0003, + "step": 7840 + }, + { + "epoch": 3.19, + "grad_norm": 0.003359014866873622, + "learning_rate": 7.22018722018722e-06, + "loss": 0.0003, + "step": 7850 + }, + { + "epoch": 3.2, + "grad_norm": 0.002836639992892742, + "learning_rate": 7.203907203907204e-06, + "loss": 0.0003, + "step": 7860 + }, + { + "epoch": 3.2, + "grad_norm": 0.0024746765848249197, + "learning_rate": 7.187627187627189e-06, + "loss": 0.0003, + "step": 7870 + }, + { + "epoch": 3.21, + "grad_norm": 0.002388924825936556, + "learning_rate": 7.171347171347171e-06, + "loss": 0.0003, + "step": 7880 + }, + { + "epoch": 3.21, + "grad_norm": 0.0024251139257103205, + "learning_rate": 7.155067155067156e-06, + "loss": 0.0003, + "step": 7890 + }, + { + "epoch": 3.22, + "grad_norm": 0.00242208456620574, + "learning_rate": 7.13878713878714e-06, + "loss": 0.0003, + "step": 7900 + }, + { + "epoch": 3.22, + "grad_norm": 0.0023256507702171803, + "learning_rate": 7.122507122507123e-06, + "loss": 0.0003, + "step": 7910 + }, + { + "epoch": 3.22, + "grad_norm": 0.0022887035738676786, + "learning_rate": 7.106227106227107e-06, + "loss": 0.0003, + "step": 7920 + }, + { + "epoch": 3.23, + "grad_norm": 0.0022210038732737303, + "learning_rate": 7.08994708994709e-06, + "loss": 0.0003, + "step": 7930 + }, + { + "epoch": 3.23, + "grad_norm": 0.002328604692593217, + "learning_rate": 7.073667073667074e-06, + "loss": 0.0003, + "step": 7940 + }, + { + "epoch": 3.24, + "grad_norm": 0.002483953256160021, + "learning_rate": 7.057387057387059e-06, + "loss": 0.0003, + "step": 7950 + }, + { + "epoch": 3.24, + "grad_norm": 0.002675483236089349, + "learning_rate": 7.041107041107041e-06, + "loss": 0.0003, + "step": 7960 + }, + { + "epoch": 3.24, + "grad_norm": 0.0023732264526188374, + "learning_rate": 7.024827024827026e-06, + "loss": 0.0003, + "step": 7970 + }, + { + "epoch": 3.25, + "grad_norm": 0.002226916840299964, + "learning_rate": 7.008547008547009e-06, + "loss": 0.0003, + "step": 7980 + }, + { + "epoch": 3.25, + "grad_norm": 0.003264982718974352, + "learning_rate": 6.992266992266993e-06, + "loss": 0.0003, + "step": 7990 + }, + { + "epoch": 3.26, + "grad_norm": 0.0026976047083735466, + "learning_rate": 6.975986975986977e-06, + "loss": 0.0003, + "step": 8000 + }, + { + "epoch": 3.26, + "grad_norm": 0.002336106961593032, + "learning_rate": 6.95970695970696e-06, + "loss": 0.0003, + "step": 8010 + }, + { + "epoch": 3.26, + "grad_norm": 0.0023025060072541237, + "learning_rate": 6.943426943426944e-06, + "loss": 0.0003, + "step": 8020 + }, + { + "epoch": 3.27, + "grad_norm": 0.0024826654698699713, + "learning_rate": 6.927146927146929e-06, + "loss": 0.0003, + "step": 8030 + }, + { + "epoch": 3.27, + "grad_norm": 0.002214565174654126, + "learning_rate": 6.910866910866911e-06, + "loss": 0.0003, + "step": 8040 + }, + { + "epoch": 3.28, + "grad_norm": 0.002279749372974038, + "learning_rate": 6.894586894586896e-06, + "loss": 0.0003, + "step": 8050 + }, + { + "epoch": 3.28, + "grad_norm": 0.002262295223772526, + "learning_rate": 6.878306878306879e-06, + "loss": 0.0003, + "step": 8060 + }, + { + "epoch": 3.28, + "grad_norm": 0.0022824567276984453, + "learning_rate": 6.862026862026863e-06, + "loss": 0.0003, + "step": 8070 + }, + { + "epoch": 3.29, + "grad_norm": 0.0022059327457100153, + "learning_rate": 6.845746845746847e-06, + "loss": 0.0003, + "step": 8080 + }, + { + "epoch": 3.29, + "grad_norm": 0.0022225133143365383, + "learning_rate": 6.82946682946683e-06, + "loss": 0.0003, + "step": 8090 + }, + { + "epoch": 3.3, + "grad_norm": 0.0030766648706048727, + "learning_rate": 6.813186813186814e-06, + "loss": 0.0003, + "step": 8100 + }, + { + "epoch": 3.3, + "grad_norm": 0.0020688914228230715, + "learning_rate": 6.796906796906797e-06, + "loss": 0.0003, + "step": 8110 + }, + { + "epoch": 3.3, + "grad_norm": 0.0026230113580822945, + "learning_rate": 6.780626780626781e-06, + "loss": 0.0003, + "step": 8120 + }, + { + "epoch": 3.31, + "grad_norm": 0.0027380469255149364, + "learning_rate": 6.7643467643467655e-06, + "loss": 0.0002, + "step": 8130 + }, + { + "epoch": 3.31, + "grad_norm": 0.0020218545105308294, + "learning_rate": 6.748066748066749e-06, + "loss": 0.0002, + "step": 8140 + }, + { + "epoch": 3.32, + "grad_norm": 0.0022498080506920815, + "learning_rate": 6.7317867317867326e-06, + "loss": 0.0002, + "step": 8150 + }, + { + "epoch": 3.32, + "grad_norm": 0.0026646710466593504, + "learning_rate": 6.715506715506716e-06, + "loss": 0.0002, + "step": 8160 + }, + { + "epoch": 3.33, + "grad_norm": 0.0021166689693927765, + "learning_rate": 6.6992266992267e-06, + "loss": 0.0002, + "step": 8170 + }, + { + "epoch": 3.33, + "grad_norm": 0.0022176315542310476, + "learning_rate": 6.6829466829466836e-06, + "loss": 0.0003, + "step": 8180 + }, + { + "epoch": 3.33, + "grad_norm": 0.0020941500551998615, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0002, + "step": 8190 + }, + { + "epoch": 3.34, + "grad_norm": 0.002201402559876442, + "learning_rate": 6.650386650386651e-06, + "loss": 0.0003, + "step": 8200 + }, + { + "epoch": 3.34, + "grad_norm": 0.002235386986285448, + "learning_rate": 6.634106634106635e-06, + "loss": 0.0062, + "step": 8210 + }, + { + "epoch": 3.35, + "grad_norm": 0.002202383242547512, + "learning_rate": 6.6178266178266185e-06, + "loss": 0.0002, + "step": 8220 + }, + { + "epoch": 3.35, + "grad_norm": 0.002144381171092391, + "learning_rate": 6.601546601546602e-06, + "loss": 0.0002, + "step": 8230 + }, + { + "epoch": 3.35, + "grad_norm": 0.0027761892415583134, + "learning_rate": 6.5852665852665855e-06, + "loss": 0.0003, + "step": 8240 + }, + { + "epoch": 3.36, + "grad_norm": 0.002119843615218997, + "learning_rate": 6.5689865689865694e-06, + "loss": 0.0002, + "step": 8250 + }, + { + "epoch": 3.36, + "grad_norm": 0.003361073322594166, + "learning_rate": 6.552706552706553e-06, + "loss": 0.0002, + "step": 8260 + }, + { + "epoch": 3.37, + "grad_norm": 0.0021668022964149714, + "learning_rate": 6.5364265364265365e-06, + "loss": 0.0003, + "step": 8270 + }, + { + "epoch": 3.37, + "grad_norm": 0.0020495818462222815, + "learning_rate": 6.5201465201465204e-06, + "loss": 0.0002, + "step": 8280 + }, + { + "epoch": 3.37, + "grad_norm": 0.002108585089445114, + "learning_rate": 6.5038665038665035e-06, + "loss": 0.0004, + "step": 8290 + }, + { + "epoch": 3.38, + "grad_norm": 0.0022084820084273815, + "learning_rate": 6.487586487586488e-06, + "loss": 0.0094, + "step": 8300 + }, + { + "epoch": 3.38, + "grad_norm": 0.002132968744263053, + "learning_rate": 6.471306471306472e-06, + "loss": 0.0002, + "step": 8310 + }, + { + "epoch": 3.39, + "grad_norm": 0.002239073161035776, + "learning_rate": 6.455026455026455e-06, + "loss": 0.0002, + "step": 8320 + }, + { + "epoch": 3.39, + "grad_norm": 0.00218349602073431, + "learning_rate": 6.438746438746439e-06, + "loss": 0.0002, + "step": 8330 + }, + { + "epoch": 3.39, + "grad_norm": 0.00208345171995461, + "learning_rate": 6.422466422466423e-06, + "loss": 0.0002, + "step": 8340 + }, + { + "epoch": 3.4, + "grad_norm": 0.003050567815080285, + "learning_rate": 6.406186406186406e-06, + "loss": 0.0002, + "step": 8350 + }, + { + "epoch": 3.4, + "grad_norm": 0.0019847999792546034, + "learning_rate": 6.38990638990639e-06, + "loss": 0.0003, + "step": 8360 + }, + { + "epoch": 3.41, + "grad_norm": 0.0020100034307688475, + "learning_rate": 6.373626373626373e-06, + "loss": 0.0002, + "step": 8370 + }, + { + "epoch": 3.41, + "grad_norm": 0.0020706066861748695, + "learning_rate": 6.357346357346358e-06, + "loss": 0.0002, + "step": 8380 + }, + { + "epoch": 3.41, + "grad_norm": 0.0019506254466250539, + "learning_rate": 6.341066341066342e-06, + "loss": 0.0002, + "step": 8390 + }, + { + "epoch": 3.42, + "grad_norm": 0.0020071598701179028, + "learning_rate": 6.324786324786325e-06, + "loss": 0.0002, + "step": 8400 + }, + { + "epoch": 3.42, + "grad_norm": 0.002606179565191269, + "learning_rate": 6.308506308506309e-06, + "loss": 0.0467, + "step": 8410 + }, + { + "epoch": 3.43, + "grad_norm": 0.0021410868503153324, + "learning_rate": 6.292226292226292e-06, + "loss": 0.0002, + "step": 8420 + }, + { + "epoch": 3.43, + "grad_norm": 0.002439359435811639, + "learning_rate": 6.275946275946276e-06, + "loss": 0.0002, + "step": 8430 + }, + { + "epoch": 3.44, + "grad_norm": 0.0037037180736660957, + "learning_rate": 6.25966625966626e-06, + "loss": 0.0002, + "step": 8440 + }, + { + "epoch": 3.44, + "grad_norm": 0.0022582276724278927, + "learning_rate": 6.243386243386243e-06, + "loss": 0.0002, + "step": 8450 + }, + { + "epoch": 3.44, + "grad_norm": 0.006983071565628052, + "learning_rate": 6.227106227106228e-06, + "loss": 0.0002, + "step": 8460 + }, + { + "epoch": 3.45, + "grad_norm": 0.01085950993001461, + "learning_rate": 6.210826210826212e-06, + "loss": 0.0003, + "step": 8470 + }, + { + "epoch": 3.45, + "grad_norm": 0.0021798298694193363, + "learning_rate": 6.194546194546195e-06, + "loss": 0.0002, + "step": 8480 + }, + { + "epoch": 3.46, + "grad_norm": 0.0021102093160152435, + "learning_rate": 6.178266178266179e-06, + "loss": 0.0002, + "step": 8490 + }, + { + "epoch": 3.46, + "grad_norm": 0.0021143911872059107, + "learning_rate": 6.161986161986162e-06, + "loss": 0.0002, + "step": 8500 + }, + { + "epoch": 3.46, + "grad_norm": 0.002472953638061881, + "learning_rate": 6.145706145706146e-06, + "loss": 0.0002, + "step": 8510 + }, + { + "epoch": 3.47, + "grad_norm": 0.0019736222457140684, + "learning_rate": 6.12942612942613e-06, + "loss": 0.0002, + "step": 8520 + }, + { + "epoch": 3.47, + "grad_norm": 0.001965272007510066, + "learning_rate": 6.113146113146113e-06, + "loss": 0.0002, + "step": 8530 + }, + { + "epoch": 3.48, + "grad_norm": 0.001975101651623845, + "learning_rate": 6.096866096866098e-06, + "loss": 0.0002, + "step": 8540 + }, + { + "epoch": 3.48, + "grad_norm": 0.002040453255176544, + "learning_rate": 6.080586080586081e-06, + "loss": 0.0002, + "step": 8550 + }, + { + "epoch": 3.48, + "grad_norm": 7.389462471008301, + "learning_rate": 6.064306064306065e-06, + "loss": 0.034, + "step": 8560 + }, + { + "epoch": 3.49, + "grad_norm": 0.0022456683218479156, + "learning_rate": 6.048026048026049e-06, + "loss": 0.0002, + "step": 8570 + }, + { + "epoch": 3.49, + "grad_norm": 0.003975760657340288, + "learning_rate": 6.031746031746032e-06, + "loss": 0.0002, + "step": 8580 + }, + { + "epoch": 3.5, + "grad_norm": 0.0020120914559811354, + "learning_rate": 6.015466015466016e-06, + "loss": 0.0002, + "step": 8590 + }, + { + "epoch": 3.5, + "grad_norm": 0.0022050223778933287, + "learning_rate": 5.999185999186001e-06, + "loss": 0.0002, + "step": 8600 + }, + { + "epoch": 3.5, + "grad_norm": 6.6309919357299805, + "learning_rate": 5.982905982905983e-06, + "loss": 0.043, + "step": 8610 + }, + { + "epoch": 3.51, + "grad_norm": 0.0022617350332438946, + "learning_rate": 5.966625966625968e-06, + "loss": 0.0002, + "step": 8620 + }, + { + "epoch": 3.51, + "grad_norm": 0.0019437572918832302, + "learning_rate": 5.950345950345951e-06, + "loss": 0.0002, + "step": 8630 + }, + { + "epoch": 3.52, + "grad_norm": 0.001993882469832897, + "learning_rate": 5.934065934065935e-06, + "loss": 0.0002, + "step": 8640 + }, + { + "epoch": 3.52, + "grad_norm": 0.0022044796496629715, + "learning_rate": 5.917785917785919e-06, + "loss": 0.0002, + "step": 8650 + }, + { + "epoch": 3.52, + "grad_norm": 0.0020595360547304153, + "learning_rate": 5.901505901505902e-06, + "loss": 0.0004, + "step": 8660 + }, + { + "epoch": 3.53, + "grad_norm": 0.002459390088915825, + "learning_rate": 5.885225885225886e-06, + "loss": 0.0002, + "step": 8670 + }, + { + "epoch": 3.53, + "grad_norm": 0.0018390618497505784, + "learning_rate": 5.868945868945869e-06, + "loss": 0.0002, + "step": 8680 + }, + { + "epoch": 3.54, + "grad_norm": 0.002049500122666359, + "learning_rate": 5.852665852665853e-06, + "loss": 0.0002, + "step": 8690 + }, + { + "epoch": 3.54, + "grad_norm": 0.001947426819242537, + "learning_rate": 5.8363858363858375e-06, + "loss": 0.0002, + "step": 8700 + }, + { + "epoch": 3.54, + "grad_norm": 0.0030878265388309956, + "learning_rate": 5.820105820105821e-06, + "loss": 0.0002, + "step": 8710 + }, + { + "epoch": 3.55, + "grad_norm": 0.001884807599708438, + "learning_rate": 5.8038258038258045e-06, + "loss": 0.0002, + "step": 8720 + }, + { + "epoch": 3.55, + "grad_norm": 0.0019810153171420097, + "learning_rate": 5.7875457875457885e-06, + "loss": 0.0002, + "step": 8730 + }, + { + "epoch": 3.56, + "grad_norm": 0.001923812204040587, + "learning_rate": 5.7712657712657716e-06, + "loss": 0.0002, + "step": 8740 + }, + { + "epoch": 3.56, + "grad_norm": 0.001998158637434244, + "learning_rate": 5.7549857549857555e-06, + "loss": 0.0002, + "step": 8750 + }, + { + "epoch": 3.57, + "grad_norm": 0.0018681226065382361, + "learning_rate": 5.738705738705739e-06, + "loss": 0.0002, + "step": 8760 + }, + { + "epoch": 3.57, + "grad_norm": 0.006764058023691177, + "learning_rate": 5.7224257224257225e-06, + "loss": 0.0068, + "step": 8770 + }, + { + "epoch": 3.57, + "grad_norm": 0.003150691743940115, + "learning_rate": 5.706145706145707e-06, + "loss": 0.0007, + "step": 8780 + }, + { + "epoch": 3.58, + "grad_norm": 0.00217335089109838, + "learning_rate": 5.68986568986569e-06, + "loss": 0.0002, + "step": 8790 + }, + { + "epoch": 3.58, + "grad_norm": 0.00865620281547308, + "learning_rate": 5.673585673585674e-06, + "loss": 0.0002, + "step": 8800 + }, + { + "epoch": 3.59, + "grad_norm": 0.0020344313234090805, + "learning_rate": 5.6573056573056575e-06, + "loss": 0.0002, + "step": 8810 + }, + { + "epoch": 3.59, + "grad_norm": 0.0018948889337480068, + "learning_rate": 5.641025641025641e-06, + "loss": 0.0002, + "step": 8820 + }, + { + "epoch": 3.59, + "grad_norm": 0.001868214923888445, + "learning_rate": 5.624745624745625e-06, + "loss": 0.0002, + "step": 8830 + }, + { + "epoch": 3.6, + "grad_norm": 0.0019942354410886765, + "learning_rate": 5.6084656084656084e-06, + "loss": 0.0002, + "step": 8840 + }, + { + "epoch": 3.6, + "grad_norm": 0.0018839197000488639, + "learning_rate": 5.592185592185592e-06, + "loss": 0.0002, + "step": 8850 + }, + { + "epoch": 3.61, + "grad_norm": 0.0022100857459008694, + "learning_rate": 5.575905575905577e-06, + "loss": 0.0002, + "step": 8860 + }, + { + "epoch": 3.61, + "grad_norm": 0.0019310906063765287, + "learning_rate": 5.55962555962556e-06, + "loss": 0.0002, + "step": 8870 + }, + { + "epoch": 3.61, + "grad_norm": 0.002325033536180854, + "learning_rate": 5.543345543345544e-06, + "loss": 0.0002, + "step": 8880 + }, + { + "epoch": 3.62, + "grad_norm": 0.0017883091932162642, + "learning_rate": 5.527065527065527e-06, + "loss": 0.0002, + "step": 8890 + }, + { + "epoch": 3.62, + "grad_norm": 0.0018799800891429186, + "learning_rate": 5.510785510785511e-06, + "loss": 0.0002, + "step": 8900 + }, + { + "epoch": 3.63, + "grad_norm": 0.0017864195397123694, + "learning_rate": 5.494505494505495e-06, + "loss": 0.0002, + "step": 8910 + }, + { + "epoch": 3.63, + "grad_norm": 0.0018234961898997426, + "learning_rate": 5.478225478225478e-06, + "loss": 0.0002, + "step": 8920 + }, + { + "epoch": 3.63, + "grad_norm": 0.0017443567048758268, + "learning_rate": 5.461945461945462e-06, + "loss": 0.0002, + "step": 8930 + }, + { + "epoch": 3.64, + "grad_norm": 0.0017067781882360578, + "learning_rate": 5.445665445665445e-06, + "loss": 0.0002, + "step": 8940 + }, + { + "epoch": 3.64, + "grad_norm": 0.002276692306622863, + "learning_rate": 5.42938542938543e-06, + "loss": 0.0186, + "step": 8950 + }, + { + "epoch": 3.65, + "grad_norm": 0.0017357119359076023, + "learning_rate": 5.413105413105414e-06, + "loss": 0.0002, + "step": 8960 + }, + { + "epoch": 3.65, + "grad_norm": 0.0019965972751379013, + "learning_rate": 5.396825396825397e-06, + "loss": 0.0002, + "step": 8970 + }, + { + "epoch": 3.65, + "grad_norm": 0.0017553390935063362, + "learning_rate": 5.380545380545381e-06, + "loss": 0.0003, + "step": 8980 + }, + { + "epoch": 3.66, + "grad_norm": 0.0019675048533827066, + "learning_rate": 5.364265364265364e-06, + "loss": 0.0002, + "step": 8990 + }, + { + "epoch": 3.66, + "grad_norm": 0.002049475908279419, + "learning_rate": 5.347985347985348e-06, + "loss": 0.0002, + "step": 9000 + }, + { + "epoch": 3.67, + "grad_norm": 0.0019142305245622993, + "learning_rate": 5.331705331705332e-06, + "loss": 0.0002, + "step": 9010 + }, + { + "epoch": 3.67, + "grad_norm": 0.0018189084948971868, + "learning_rate": 5.315425315425315e-06, + "loss": 0.042, + "step": 9020 + }, + { + "epoch": 3.68, + "grad_norm": 0.0019228870514780283, + "learning_rate": 5.2991452991453e-06, + "loss": 0.0005, + "step": 9030 + }, + { + "epoch": 3.68, + "grad_norm": 0.002307659713551402, + "learning_rate": 5.282865282865284e-06, + "loss": 0.0002, + "step": 9040 + }, + { + "epoch": 3.68, + "grad_norm": 0.0021766172721982002, + "learning_rate": 5.266585266585267e-06, + "loss": 0.0002, + "step": 9050 + }, + { + "epoch": 3.69, + "grad_norm": 0.0017359366174787283, + "learning_rate": 5.250305250305251e-06, + "loss": 0.0341, + "step": 9060 + }, + { + "epoch": 3.69, + "grad_norm": 0.0017763186478987336, + "learning_rate": 5.234025234025234e-06, + "loss": 0.0002, + "step": 9070 + }, + { + "epoch": 3.7, + "grad_norm": 0.001665986143052578, + "learning_rate": 5.217745217745218e-06, + "loss": 0.0008, + "step": 9080 + }, + { + "epoch": 3.7, + "grad_norm": 0.0017538231331855059, + "learning_rate": 5.201465201465202e-06, + "loss": 0.0002, + "step": 9090 + }, + { + "epoch": 3.7, + "grad_norm": 0.0016558667412027717, + "learning_rate": 5.185185185185185e-06, + "loss": 0.0002, + "step": 9100 + }, + { + "epoch": 3.71, + "grad_norm": 0.0018909978680312634, + "learning_rate": 5.16890516890517e-06, + "loss": 0.0002, + "step": 9110 + }, + { + "epoch": 3.71, + "grad_norm": 0.0017842132365331054, + "learning_rate": 5.152625152625153e-06, + "loss": 0.0002, + "step": 9120 + }, + { + "epoch": 3.72, + "grad_norm": 0.0017819767817854881, + "learning_rate": 5.136345136345137e-06, + "loss": 0.0002, + "step": 9130 + }, + { + "epoch": 3.72, + "grad_norm": 0.00168974872212857, + "learning_rate": 5.120065120065121e-06, + "loss": 0.0002, + "step": 9140 + }, + { + "epoch": 3.72, + "grad_norm": 0.0017720448086038232, + "learning_rate": 5.103785103785104e-06, + "loss": 0.0002, + "step": 9150 + }, + { + "epoch": 3.73, + "grad_norm": 0.0017071804031729698, + "learning_rate": 5.087505087505088e-06, + "loss": 0.0002, + "step": 9160 + }, + { + "epoch": 3.73, + "grad_norm": 0.0018827036255970597, + "learning_rate": 5.071225071225072e-06, + "loss": 0.0002, + "step": 9170 + }, + { + "epoch": 3.74, + "grad_norm": 0.0022226206492632627, + "learning_rate": 5.054945054945055e-06, + "loss": 0.0002, + "step": 9180 + }, + { + "epoch": 3.74, + "grad_norm": 0.0019109738059341908, + "learning_rate": 5.03866503866504e-06, + "loss": 0.0494, + "step": 9190 + }, + { + "epoch": 3.74, + "grad_norm": 0.0019527949625626206, + "learning_rate": 5.022385022385023e-06, + "loss": 0.0002, + "step": 9200 + }, + { + "epoch": 3.75, + "grad_norm": 0.0020662578754127026, + "learning_rate": 5.006105006105007e-06, + "loss": 0.0415, + "step": 9210 + }, + { + "epoch": 3.75, + "grad_norm": 0.002151912311092019, + "learning_rate": 4.98982498982499e-06, + "loss": 0.0003, + "step": 9220 + }, + { + "epoch": 3.76, + "grad_norm": 0.001946290722116828, + "learning_rate": 4.973544973544974e-06, + "loss": 0.0002, + "step": 9230 + }, + { + "epoch": 3.76, + "grad_norm": 0.001949216122739017, + "learning_rate": 4.957264957264958e-06, + "loss": 0.0002, + "step": 9240 + }, + { + "epoch": 3.76, + "grad_norm": 0.002394681563600898, + "learning_rate": 4.9409849409849416e-06, + "loss": 0.0352, + "step": 9250 + }, + { + "epoch": 3.77, + "grad_norm": 0.0022585808765143156, + "learning_rate": 4.924704924704925e-06, + "loss": 0.0471, + "step": 9260 + }, + { + "epoch": 3.77, + "grad_norm": 0.002393248025327921, + "learning_rate": 4.908424908424909e-06, + "loss": 0.0002, + "step": 9270 + }, + { + "epoch": 3.78, + "grad_norm": 0.02360522374510765, + "learning_rate": 4.8921448921448925e-06, + "loss": 0.0003, + "step": 9280 + }, + { + "epoch": 3.78, + "grad_norm": 0.0023453827016055584, + "learning_rate": 4.8758648758648765e-06, + "loss": 0.0003, + "step": 9290 + }, + { + "epoch": 3.79, + "grad_norm": 0.00220270873978734, + "learning_rate": 4.8595848595848596e-06, + "loss": 0.0003, + "step": 9300 + }, + { + "epoch": 3.79, + "grad_norm": 0.0021812734194099903, + "learning_rate": 4.8433048433048435e-06, + "loss": 0.0003, + "step": 9310 + }, + { + "epoch": 3.79, + "grad_norm": 0.0021124074701219797, + "learning_rate": 4.8270248270248275e-06, + "loss": 0.0002, + "step": 9320 + }, + { + "epoch": 3.8, + "grad_norm": 0.002312874887138605, + "learning_rate": 4.810744810744811e-06, + "loss": 0.0003, + "step": 9330 + }, + { + "epoch": 3.8, + "grad_norm": 0.0025071410927921534, + "learning_rate": 4.7944647944647945e-06, + "loss": 0.0002, + "step": 9340 + }, + { + "epoch": 3.81, + "grad_norm": 0.0022760110441595316, + "learning_rate": 4.7781847781847784e-06, + "loss": 0.0003, + "step": 9350 + }, + { + "epoch": 3.81, + "grad_norm": 0.002391684567555785, + "learning_rate": 4.761904761904762e-06, + "loss": 0.0002, + "step": 9360 + }, + { + "epoch": 3.81, + "grad_norm": 0.0021324707195162773, + "learning_rate": 4.745624745624746e-06, + "loss": 0.0002, + "step": 9370 + }, + { + "epoch": 3.82, + "grad_norm": 0.0021602713968604803, + "learning_rate": 4.729344729344729e-06, + "loss": 0.0002, + "step": 9380 + }, + { + "epoch": 3.82, + "grad_norm": 0.003342408686876297, + "learning_rate": 4.713064713064713e-06, + "loss": 0.0002, + "step": 9390 + }, + { + "epoch": 3.83, + "grad_norm": 0.003199818776920438, + "learning_rate": 4.696784696784697e-06, + "loss": 0.0002, + "step": 9400 + }, + { + "epoch": 3.83, + "grad_norm": 0.002259862143546343, + "learning_rate": 4.680504680504681e-06, + "loss": 0.0005, + "step": 9410 + }, + { + "epoch": 3.83, + "grad_norm": 0.0020261441823095083, + "learning_rate": 4.664224664224664e-06, + "loss": 0.0002, + "step": 9420 + }, + { + "epoch": 3.84, + "grad_norm": 0.001844863872975111, + "learning_rate": 4.647944647944648e-06, + "loss": 0.0003, + "step": 9430 + }, + { + "epoch": 3.84, + "grad_norm": 0.0022536173928529024, + "learning_rate": 4.631664631664632e-06, + "loss": 0.0002, + "step": 9440 + }, + { + "epoch": 3.85, + "grad_norm": 0.001871871529147029, + "learning_rate": 4.615384615384616e-06, + "loss": 0.0003, + "step": 9450 + }, + { + "epoch": 3.85, + "grad_norm": 0.0019549911376088858, + "learning_rate": 4.599104599104599e-06, + "loss": 0.0414, + "step": 9460 + }, + { + "epoch": 3.85, + "grad_norm": 0.002595256781205535, + "learning_rate": 4.582824582824583e-06, + "loss": 0.0002, + "step": 9470 + }, + { + "epoch": 3.86, + "grad_norm": 0.0021485532633960247, + "learning_rate": 4.566544566544567e-06, + "loss": 0.0002, + "step": 9480 + }, + { + "epoch": 3.86, + "grad_norm": 0.0018896989058703184, + "learning_rate": 4.55026455026455e-06, + "loss": 0.0264, + "step": 9490 + }, + { + "epoch": 3.87, + "grad_norm": 0.0023643136955797672, + "learning_rate": 4.533984533984534e-06, + "loss": 0.0003, + "step": 9500 + }, + { + "epoch": 3.87, + "grad_norm": 0.0017869413131847978, + "learning_rate": 4.517704517704518e-06, + "loss": 0.0003, + "step": 9510 + }, + { + "epoch": 3.87, + "grad_norm": 0.0022810434456914663, + "learning_rate": 4.501424501424502e-06, + "loss": 0.0002, + "step": 9520 + }, + { + "epoch": 3.88, + "grad_norm": 0.0020936301443725824, + "learning_rate": 4.485144485144485e-06, + "loss": 0.0003, + "step": 9530 + }, + { + "epoch": 3.88, + "grad_norm": 0.0017164949094876647, + "learning_rate": 4.468864468864469e-06, + "loss": 0.0147, + "step": 9540 + }, + { + "epoch": 3.89, + "grad_norm": 0.008885451592504978, + "learning_rate": 4.452584452584453e-06, + "loss": 0.0003, + "step": 9550 + }, + { + "epoch": 3.89, + "grad_norm": 0.20433610677719116, + "learning_rate": 4.436304436304437e-06, + "loss": 0.0004, + "step": 9560 + }, + { + "epoch": 3.89, + "grad_norm": 0.0018083051545545459, + "learning_rate": 4.42002442002442e-06, + "loss": 0.0002, + "step": 9570 + }, + { + "epoch": 3.9, + "grad_norm": 0.00233688997104764, + "learning_rate": 4.403744403744404e-06, + "loss": 0.0002, + "step": 9580 + }, + { + "epoch": 3.9, + "grad_norm": 0.0020819292403757572, + "learning_rate": 4.387464387464388e-06, + "loss": 0.0012, + "step": 9590 + }, + { + "epoch": 3.91, + "grad_norm": 0.0069807544350624084, + "learning_rate": 4.371184371184372e-06, + "loss": 0.0003, + "step": 9600 + }, + { + "epoch": 3.91, + "grad_norm": 0.0027952860109508038, + "learning_rate": 4.354904354904355e-06, + "loss": 0.0002, + "step": 9610 + }, + { + "epoch": 3.92, + "grad_norm": 0.0018937455024570227, + "learning_rate": 4.338624338624339e-06, + "loss": 0.0272, + "step": 9620 + }, + { + "epoch": 3.92, + "grad_norm": 0.001811556052416563, + "learning_rate": 4.322344322344323e-06, + "loss": 0.0002, + "step": 9630 + }, + { + "epoch": 3.92, + "grad_norm": 0.0017631722148507833, + "learning_rate": 4.306064306064307e-06, + "loss": 0.0002, + "step": 9640 + }, + { + "epoch": 3.93, + "grad_norm": 0.001867889310233295, + "learning_rate": 4.28978428978429e-06, + "loss": 0.0197, + "step": 9650 + }, + { + "epoch": 3.93, + "grad_norm": 0.0020562438294291496, + "learning_rate": 4.273504273504274e-06, + "loss": 0.0002, + "step": 9660 + }, + { + "epoch": 3.94, + "grad_norm": 0.007918364368379116, + "learning_rate": 4.257224257224258e-06, + "loss": 0.0003, + "step": 9670 + }, + { + "epoch": 3.94, + "grad_norm": 0.0026931529864668846, + "learning_rate": 4.240944240944242e-06, + "loss": 0.0003, + "step": 9680 + }, + { + "epoch": 3.94, + "grad_norm": 0.002624350832775235, + "learning_rate": 4.224664224664225e-06, + "loss": 0.0003, + "step": 9690 + }, + { + "epoch": 3.95, + "grad_norm": 0.001771993818692863, + "learning_rate": 4.208384208384209e-06, + "loss": 0.0002, + "step": 9700 + }, + { + "epoch": 3.95, + "grad_norm": 0.010523835197091103, + "learning_rate": 4.192104192104192e-06, + "loss": 0.0003, + "step": 9710 + }, + { + "epoch": 3.96, + "grad_norm": 0.0034396941773593426, + "learning_rate": 4.175824175824177e-06, + "loss": 0.0003, + "step": 9720 + }, + { + "epoch": 3.96, + "grad_norm": 0.003138788277283311, + "learning_rate": 4.15954415954416e-06, + "loss": 0.0058, + "step": 9730 + }, + { + "epoch": 3.96, + "grad_norm": 0.002142369979992509, + "learning_rate": 4.143264143264144e-06, + "loss": 0.0002, + "step": 9740 + }, + { + "epoch": 3.97, + "grad_norm": 0.006518381182104349, + "learning_rate": 4.126984126984127e-06, + "loss": 0.0002, + "step": 9750 + }, + { + "epoch": 3.97, + "grad_norm": 0.0019359017023816705, + "learning_rate": 4.1107041107041116e-06, + "loss": 0.0002, + "step": 9760 + }, + { + "epoch": 3.98, + "grad_norm": 0.0018001939170062542, + "learning_rate": 4.094424094424095e-06, + "loss": 0.0002, + "step": 9770 + }, + { + "epoch": 3.98, + "grad_norm": 0.002167722210288048, + "learning_rate": 4.078144078144079e-06, + "loss": 0.0002, + "step": 9780 + }, + { + "epoch": 3.98, + "grad_norm": 0.008154891431331635, + "learning_rate": 4.061864061864062e-06, + "loss": 0.0002, + "step": 9790 + }, + { + "epoch": 3.99, + "grad_norm": 0.001978978980332613, + "learning_rate": 4.0455840455840465e-06, + "loss": 0.0002, + "step": 9800 + }, + { + "epoch": 3.99, + "grad_norm": 0.0018466059118509293, + "learning_rate": 4.0293040293040296e-06, + "loss": 0.0002, + "step": 9810 + }, + { + "epoch": 4.0, + "grad_norm": 0.00179979985114187, + "learning_rate": 4.0130240130240135e-06, + "loss": 0.0002, + "step": 9820 + }, + { + "epoch": 4.0, + "grad_norm": 0.002002492779865861, + "learning_rate": 3.996743996743997e-06, + "loss": 0.0002, + "step": 9830 + }, + { + "epoch": 4.0, + "grad_norm": 0.0019970801658928394, + "learning_rate": 3.9804639804639805e-06, + "loss": 0.0002, + "step": 9840 + }, + { + "epoch": 4.01, + "grad_norm": 0.0017706368817016482, + "learning_rate": 3.9641839641839645e-06, + "loss": 0.0002, + "step": 9850 + }, + { + "epoch": 4.01, + "grad_norm": 0.0017488128505647182, + "learning_rate": 3.9479039479039484e-06, + "loss": 0.0003, + "step": 9860 + }, + { + "epoch": 4.02, + "grad_norm": 0.0025758370757102966, + "learning_rate": 3.9316239316239315e-06, + "loss": 0.0002, + "step": 9870 + }, + { + "epoch": 4.02, + "grad_norm": 0.002105166669934988, + "learning_rate": 3.9153439153439155e-06, + "loss": 0.0002, + "step": 9880 + }, + { + "epoch": 4.03, + "grad_norm": 0.0027692352887243032, + "learning_rate": 3.899063899063899e-06, + "loss": 0.0043, + "step": 9890 + }, + { + "epoch": 4.03, + "grad_norm": 0.0020704329945147038, + "learning_rate": 3.882783882783883e-06, + "loss": 0.0002, + "step": 9900 + }, + { + "epoch": 4.03, + "grad_norm": 0.0019208292942494154, + "learning_rate": 3.8665038665038664e-06, + "loss": 0.0002, + "step": 9910 + }, + { + "epoch": 4.04, + "grad_norm": 0.0017399511998519301, + "learning_rate": 3.85022385022385e-06, + "loss": 0.0002, + "step": 9920 + }, + { + "epoch": 4.04, + "grad_norm": 0.0017688291845843196, + "learning_rate": 3.833943833943834e-06, + "loss": 0.0002, + "step": 9930 + }, + { + "epoch": 4.05, + "grad_norm": 4.471590995788574, + "learning_rate": 3.817663817663818e-06, + "loss": 0.0023, + "step": 9940 + }, + { + "epoch": 4.05, + "grad_norm": 0.0016602250980213284, + "learning_rate": 3.8013838013838018e-06, + "loss": 0.0002, + "step": 9950 + }, + { + "epoch": 4.05, + "grad_norm": 0.001645643264055252, + "learning_rate": 3.7851037851037853e-06, + "loss": 0.0002, + "step": 9960 + }, + { + "epoch": 4.06, + "grad_norm": 0.0017087948508560658, + "learning_rate": 3.768823768823769e-06, + "loss": 0.0002, + "step": 9970 + }, + { + "epoch": 4.06, + "grad_norm": 0.002038088161498308, + "learning_rate": 3.752543752543753e-06, + "loss": 0.0002, + "step": 9980 + }, + { + "epoch": 4.07, + "grad_norm": 0.0071817911230027676, + "learning_rate": 3.7362637362637367e-06, + "loss": 0.0002, + "step": 9990 + }, + { + "epoch": 4.07, + "grad_norm": 0.0021325184497982264, + "learning_rate": 3.7199837199837202e-06, + "loss": 0.0002, + "step": 10000 + }, + { + "epoch": 4.07, + "grad_norm": 0.001710103009827435, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.0002, + "step": 10010 + }, + { + "epoch": 4.08, + "grad_norm": 0.0015926583437249064, + "learning_rate": 3.687423687423688e-06, + "loss": 0.0003, + "step": 10020 + }, + { + "epoch": 4.08, + "grad_norm": 0.0016407363582402468, + "learning_rate": 3.6711436711436716e-06, + "loss": 0.0002, + "step": 10030 + }, + { + "epoch": 4.09, + "grad_norm": 0.005499332211911678, + "learning_rate": 3.654863654863655e-06, + "loss": 0.0002, + "step": 10040 + }, + { + "epoch": 4.09, + "grad_norm": 0.0018358811503276229, + "learning_rate": 3.6385836385836387e-06, + "loss": 0.0002, + "step": 10050 + }, + { + "epoch": 4.09, + "grad_norm": 0.0016708581242710352, + "learning_rate": 3.622303622303623e-06, + "loss": 0.0002, + "step": 10060 + }, + { + "epoch": 4.1, + "grad_norm": 0.0017990090418606997, + "learning_rate": 3.6060236060236065e-06, + "loss": 0.0002, + "step": 10070 + }, + { + "epoch": 4.1, + "grad_norm": 0.0018943555187433958, + "learning_rate": 3.58974358974359e-06, + "loss": 0.0002, + "step": 10080 + }, + { + "epoch": 4.11, + "grad_norm": 0.0016270867781713605, + "learning_rate": 3.5734635734635736e-06, + "loss": 0.0002, + "step": 10090 + }, + { + "epoch": 4.11, + "grad_norm": 0.009296965785324574, + "learning_rate": 3.557183557183557e-06, + "loss": 0.0002, + "step": 10100 + }, + { + "epoch": 4.11, + "grad_norm": 0.0016765177715569735, + "learning_rate": 3.5409035409035415e-06, + "loss": 0.0002, + "step": 10110 + }, + { + "epoch": 4.12, + "grad_norm": 0.004676634445786476, + "learning_rate": 3.524623524623525e-06, + "loss": 0.0002, + "step": 10120 + }, + { + "epoch": 4.12, + "grad_norm": 0.001882671844214201, + "learning_rate": 3.5083435083435085e-06, + "loss": 0.0011, + "step": 10130 + }, + { + "epoch": 4.13, + "grad_norm": 0.0016701704589650035, + "learning_rate": 3.492063492063492e-06, + "loss": 0.0002, + "step": 10140 + }, + { + "epoch": 4.13, + "grad_norm": 0.0018036847468465567, + "learning_rate": 3.4757834757834764e-06, + "loss": 0.0002, + "step": 10150 + }, + { + "epoch": 4.14, + "grad_norm": 0.0019449255196377635, + "learning_rate": 3.45950345950346e-06, + "loss": 0.0002, + "step": 10160 + }, + { + "epoch": 4.14, + "grad_norm": 0.0023109372705221176, + "learning_rate": 3.4432234432234434e-06, + "loss": 0.0002, + "step": 10170 + }, + { + "epoch": 4.14, + "grad_norm": 0.001794449402950704, + "learning_rate": 3.426943426943427e-06, + "loss": 0.0002, + "step": 10180 + }, + { + "epoch": 4.15, + "grad_norm": 0.0016366175841540098, + "learning_rate": 3.410663410663411e-06, + "loss": 0.0002, + "step": 10190 + }, + { + "epoch": 4.15, + "grad_norm": 0.0022932947613298893, + "learning_rate": 3.394383394383395e-06, + "loss": 0.0002, + "step": 10200 + }, + { + "epoch": 4.16, + "grad_norm": 0.003153660800307989, + "learning_rate": 3.3781033781033783e-06, + "loss": 0.0002, + "step": 10210 + }, + { + "epoch": 4.16, + "grad_norm": 0.0018573219422250986, + "learning_rate": 3.361823361823362e-06, + "loss": 0.0002, + "step": 10220 + }, + { + "epoch": 4.16, + "grad_norm": 0.0016019688919186592, + "learning_rate": 3.345543345543346e-06, + "loss": 0.0002, + "step": 10230 + }, + { + "epoch": 4.17, + "grad_norm": 0.0016897093737497926, + "learning_rate": 3.3292633292633297e-06, + "loss": 0.0002, + "step": 10240 + }, + { + "epoch": 4.17, + "grad_norm": 0.0018914591055363417, + "learning_rate": 3.3129833129833133e-06, + "loss": 0.0003, + "step": 10250 + }, + { + "epoch": 4.18, + "grad_norm": 0.0018889600178226829, + "learning_rate": 3.2967032967032968e-06, + "loss": 0.0002, + "step": 10260 + }, + { + "epoch": 4.18, + "grad_norm": 0.00160633132327348, + "learning_rate": 3.2804232804232807e-06, + "loss": 0.0002, + "step": 10270 + }, + { + "epoch": 4.18, + "grad_norm": 0.00516732269898057, + "learning_rate": 3.2641432641432647e-06, + "loss": 0.0002, + "step": 10280 + }, + { + "epoch": 4.19, + "grad_norm": 0.0015665347455069423, + "learning_rate": 3.247863247863248e-06, + "loss": 0.0002, + "step": 10290 + }, + { + "epoch": 4.19, + "grad_norm": 0.0016588406870141625, + "learning_rate": 3.2315832315832317e-06, + "loss": 0.0002, + "step": 10300 + }, + { + "epoch": 4.2, + "grad_norm": 0.00242376746609807, + "learning_rate": 3.2153032153032156e-06, + "loss": 0.0002, + "step": 10310 + }, + { + "epoch": 4.2, + "grad_norm": 0.0070383488200604916, + "learning_rate": 3.199023199023199e-06, + "loss": 0.0002, + "step": 10320 + }, + { + "epoch": 4.2, + "grad_norm": 0.0019135623006150126, + "learning_rate": 3.182743182743183e-06, + "loss": 0.0002, + "step": 10330 + }, + { + "epoch": 4.21, + "grad_norm": 0.0018966845236718655, + "learning_rate": 3.1664631664631666e-06, + "loss": 0.0002, + "step": 10340 + }, + { + "epoch": 4.21, + "grad_norm": 0.0014899246161803603, + "learning_rate": 3.1501831501831505e-06, + "loss": 0.0002, + "step": 10350 + }, + { + "epoch": 4.22, + "grad_norm": 0.001564052072353661, + "learning_rate": 3.133903133903134e-06, + "loss": 0.0002, + "step": 10360 + }, + { + "epoch": 4.22, + "grad_norm": 0.001840132987126708, + "learning_rate": 3.117623117623118e-06, + "loss": 0.0002, + "step": 10370 + }, + { + "epoch": 4.22, + "grad_norm": 0.0020550840999931097, + "learning_rate": 3.1013431013431015e-06, + "loss": 0.0002, + "step": 10380 + }, + { + "epoch": 4.23, + "grad_norm": 0.0018264094833284616, + "learning_rate": 3.0850630850630855e-06, + "loss": 0.0002, + "step": 10390 + }, + { + "epoch": 4.23, + "grad_norm": 0.001516546355560422, + "learning_rate": 3.068783068783069e-06, + "loss": 0.0002, + "step": 10400 + }, + { + "epoch": 4.24, + "grad_norm": 0.0016487749526277184, + "learning_rate": 3.052503052503053e-06, + "loss": 0.0002, + "step": 10410 + }, + { + "epoch": 4.24, + "grad_norm": 0.0016116101760417223, + "learning_rate": 3.0362230362230364e-06, + "loss": 0.0002, + "step": 10420 + }, + { + "epoch": 4.25, + "grad_norm": 0.001680860761553049, + "learning_rate": 3.0199430199430204e-06, + "loss": 0.0002, + "step": 10430 + }, + { + "epoch": 4.25, + "grad_norm": 0.002029112773016095, + "learning_rate": 3.003663003663004e-06, + "loss": 0.0002, + "step": 10440 + }, + { + "epoch": 4.25, + "grad_norm": 0.002056869911029935, + "learning_rate": 2.9873829873829874e-06, + "loss": 0.0002, + "step": 10450 + }, + { + "epoch": 4.26, + "grad_norm": 0.0016365089686587453, + "learning_rate": 2.9711029711029714e-06, + "loss": 0.0017, + "step": 10460 + }, + { + "epoch": 4.26, + "grad_norm": 0.001570598571561277, + "learning_rate": 2.9548229548229553e-06, + "loss": 0.0002, + "step": 10470 + }, + { + "epoch": 4.27, + "grad_norm": 0.0019338660640642047, + "learning_rate": 2.938542938542939e-06, + "loss": 0.0002, + "step": 10480 + }, + { + "epoch": 4.27, + "grad_norm": 0.001604044926352799, + "learning_rate": 2.9222629222629223e-06, + "loss": 0.0002, + "step": 10490 + }, + { + "epoch": 4.27, + "grad_norm": 0.0015405503800138831, + "learning_rate": 2.9059829059829063e-06, + "loss": 0.0003, + "step": 10500 + }, + { + "epoch": 4.28, + "grad_norm": 0.001597168273292482, + "learning_rate": 2.8897028897028902e-06, + "loss": 0.0002, + "step": 10510 + }, + { + "epoch": 4.28, + "grad_norm": 0.001601763884536922, + "learning_rate": 2.8734228734228737e-06, + "loss": 0.0002, + "step": 10520 + }, + { + "epoch": 4.29, + "grad_norm": 0.0014684420311823487, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.0002, + "step": 10530 + }, + { + "epoch": 4.29, + "grad_norm": 0.0019548002164810896, + "learning_rate": 2.840862840862841e-06, + "loss": 0.0002, + "step": 10540 + }, + { + "epoch": 4.29, + "grad_norm": 0.0019341334700584412, + "learning_rate": 2.824582824582825e-06, + "loss": 0.0002, + "step": 10550 + }, + { + "epoch": 4.3, + "grad_norm": 0.0015359672252088785, + "learning_rate": 2.8083028083028087e-06, + "loss": 0.0002, + "step": 10560 + }, + { + "epoch": 4.3, + "grad_norm": 0.001660957932472229, + "learning_rate": 2.792022792022792e-06, + "loss": 0.0002, + "step": 10570 + }, + { + "epoch": 4.31, + "grad_norm": 0.002642634091898799, + "learning_rate": 2.7757427757427757e-06, + "loss": 0.0002, + "step": 10580 + }, + { + "epoch": 4.31, + "grad_norm": 0.001577245187945664, + "learning_rate": 2.75946275946276e-06, + "loss": 0.0002, + "step": 10590 + }, + { + "epoch": 4.31, + "grad_norm": 0.0016623500268906355, + "learning_rate": 2.7431827431827436e-06, + "loss": 0.0274, + "step": 10600 + }, + { + "epoch": 4.32, + "grad_norm": 0.001559157157316804, + "learning_rate": 2.726902726902727e-06, + "loss": 0.0002, + "step": 10610 + }, + { + "epoch": 4.32, + "grad_norm": 0.0015379212563857436, + "learning_rate": 2.7106227106227106e-06, + "loss": 0.0002, + "step": 10620 + }, + { + "epoch": 4.33, + "grad_norm": 0.001682962873019278, + "learning_rate": 2.694342694342695e-06, + "loss": 0.0002, + "step": 10630 + }, + { + "epoch": 4.33, + "grad_norm": 0.0015784628922119737, + "learning_rate": 2.6780626780626785e-06, + "loss": 0.0002, + "step": 10640 + }, + { + "epoch": 4.33, + "grad_norm": 0.0015349604655057192, + "learning_rate": 2.661782661782662e-06, + "loss": 0.0002, + "step": 10650 + }, + { + "epoch": 4.34, + "grad_norm": 0.0015412438660860062, + "learning_rate": 2.6455026455026455e-06, + "loss": 0.0002, + "step": 10660 + }, + { + "epoch": 4.34, + "grad_norm": 0.0016461275517940521, + "learning_rate": 2.629222629222629e-06, + "loss": 0.0002, + "step": 10670 + }, + { + "epoch": 4.35, + "grad_norm": 0.0016684934962540865, + "learning_rate": 2.6129426129426134e-06, + "loss": 0.0002, + "step": 10680 + }, + { + "epoch": 4.35, + "grad_norm": 0.0015019102720543742, + "learning_rate": 2.596662596662597e-06, + "loss": 0.0002, + "step": 10690 + }, + { + "epoch": 4.35, + "grad_norm": 0.0015912950038909912, + "learning_rate": 2.5803825803825804e-06, + "loss": 0.0002, + "step": 10700 + }, + { + "epoch": 4.36, + "grad_norm": 0.002051288727670908, + "learning_rate": 2.564102564102564e-06, + "loss": 0.0002, + "step": 10710 + }, + { + "epoch": 4.36, + "grad_norm": 0.0014287488302215934, + "learning_rate": 2.5478225478225483e-06, + "loss": 0.0002, + "step": 10720 + }, + { + "epoch": 4.37, + "grad_norm": 0.0014953837962821126, + "learning_rate": 2.531542531542532e-06, + "loss": 0.0002, + "step": 10730 + }, + { + "epoch": 4.37, + "grad_norm": 0.0016842116601765156, + "learning_rate": 2.5152625152625154e-06, + "loss": 0.0002, + "step": 10740 + }, + { + "epoch": 4.38, + "grad_norm": 0.0016165722627192736, + "learning_rate": 2.4989824989824993e-06, + "loss": 0.0004, + "step": 10750 + }, + { + "epoch": 4.38, + "grad_norm": 0.0016578533686697483, + "learning_rate": 2.482702482702483e-06, + "loss": 0.0002, + "step": 10760 + }, + { + "epoch": 4.38, + "grad_norm": 0.001627171179279685, + "learning_rate": 2.4664224664224668e-06, + "loss": 0.0002, + "step": 10770 + }, + { + "epoch": 4.39, + "grad_norm": 0.0029889908619225025, + "learning_rate": 2.4501424501424503e-06, + "loss": 0.0002, + "step": 10780 + }, + { + "epoch": 4.39, + "grad_norm": 0.0015365415019914508, + "learning_rate": 2.433862433862434e-06, + "loss": 0.0002, + "step": 10790 + }, + { + "epoch": 4.4, + "grad_norm": 0.0019263201393187046, + "learning_rate": 2.4175824175824177e-06, + "loss": 0.0002, + "step": 10800 + }, + { + "epoch": 4.4, + "grad_norm": 0.001516710501164198, + "learning_rate": 2.4013024013024013e-06, + "loss": 0.0002, + "step": 10810 + }, + { + "epoch": 4.4, + "grad_norm": 0.001614395878277719, + "learning_rate": 2.385022385022385e-06, + "loss": 0.0002, + "step": 10820 + }, + { + "epoch": 4.41, + "grad_norm": 0.0014490768080577254, + "learning_rate": 2.3687423687423687e-06, + "loss": 0.0004, + "step": 10830 + }, + { + "epoch": 4.41, + "grad_norm": 0.0015428679762408137, + "learning_rate": 2.3524623524623527e-06, + "loss": 0.0002, + "step": 10840 + }, + { + "epoch": 4.42, + "grad_norm": 0.0015440605347976089, + "learning_rate": 2.336182336182336e-06, + "loss": 0.0004, + "step": 10850 + }, + { + "epoch": 4.42, + "grad_norm": 0.00148781796451658, + "learning_rate": 2.31990231990232e-06, + "loss": 0.0002, + "step": 10860 + }, + { + "epoch": 4.42, + "grad_norm": 0.0015348844463005662, + "learning_rate": 2.3036223036223036e-06, + "loss": 0.0002, + "step": 10870 + }, + { + "epoch": 4.43, + "grad_norm": 0.001880201743915677, + "learning_rate": 2.2873422873422876e-06, + "loss": 0.0002, + "step": 10880 + }, + { + "epoch": 4.43, + "grad_norm": 0.001558057265356183, + "learning_rate": 2.271062271062271e-06, + "loss": 0.0002, + "step": 10890 + }, + { + "epoch": 4.44, + "grad_norm": 0.010920335538685322, + "learning_rate": 2.254782254782255e-06, + "loss": 0.0002, + "step": 10900 + }, + { + "epoch": 4.44, + "grad_norm": 0.0014644470065832138, + "learning_rate": 2.2385022385022386e-06, + "loss": 0.0002, + "step": 10910 + }, + { + "epoch": 4.44, + "grad_norm": 0.0014618238201364875, + "learning_rate": 2.222222222222222e-06, + "loss": 0.0002, + "step": 10920 + }, + { + "epoch": 4.45, + "grad_norm": 0.0016169185983017087, + "learning_rate": 2.205942205942206e-06, + "loss": 0.0002, + "step": 10930 + }, + { + "epoch": 4.45, + "grad_norm": 0.0014386329567059875, + "learning_rate": 2.1896621896621895e-06, + "loss": 0.0002, + "step": 10940 + }, + { + "epoch": 4.46, + "grad_norm": 0.0015079034492373466, + "learning_rate": 2.1733821733821735e-06, + "loss": 0.0002, + "step": 10950 + }, + { + "epoch": 4.46, + "grad_norm": 0.00197400595061481, + "learning_rate": 2.157102157102157e-06, + "loss": 0.0002, + "step": 10960 + }, + { + "epoch": 4.46, + "grad_norm": 0.001524322316981852, + "learning_rate": 2.140822140822141e-06, + "loss": 0.0311, + "step": 10970 + }, + { + "epoch": 4.47, + "grad_norm": 0.0014644163893535733, + "learning_rate": 2.1245421245421245e-06, + "loss": 0.0002, + "step": 10980 + }, + { + "epoch": 4.47, + "grad_norm": 0.0014774493174627423, + "learning_rate": 2.1082621082621084e-06, + "loss": 0.0002, + "step": 10990 + }, + { + "epoch": 4.48, + "grad_norm": 0.0014835140900686383, + "learning_rate": 2.091982091982092e-06, + "loss": 0.0002, + "step": 11000 + }, + { + "epoch": 4.48, + "grad_norm": 0.001458540791645646, + "learning_rate": 2.075702075702076e-06, + "loss": 0.0002, + "step": 11010 + }, + { + "epoch": 4.49, + "grad_norm": 0.002432051347568631, + "learning_rate": 2.05942205942206e-06, + "loss": 0.0002, + "step": 11020 + }, + { + "epoch": 4.49, + "grad_norm": 0.001562487450428307, + "learning_rate": 2.0431420431420433e-06, + "loss": 0.0353, + "step": 11030 + }, + { + "epoch": 4.49, + "grad_norm": 0.0016052748542279005, + "learning_rate": 2.0268620268620273e-06, + "loss": 0.0002, + "step": 11040 + }, + { + "epoch": 4.5, + "grad_norm": 0.001545790466479957, + "learning_rate": 2.0105820105820108e-06, + "loss": 0.0002, + "step": 11050 + }, + { + "epoch": 4.5, + "grad_norm": 0.001812846981920302, + "learning_rate": 1.9943019943019947e-06, + "loss": 0.0002, + "step": 11060 + }, + { + "epoch": 4.51, + "grad_norm": 0.0017415074398741126, + "learning_rate": 1.9780219780219782e-06, + "loss": 0.0002, + "step": 11070 + }, + { + "epoch": 4.51, + "grad_norm": 0.0016338001005351543, + "learning_rate": 1.961741961741962e-06, + "loss": 0.0002, + "step": 11080 + }, + { + "epoch": 4.51, + "grad_norm": 0.0014169925125315785, + "learning_rate": 1.9454619454619457e-06, + "loss": 0.0006, + "step": 11090 + }, + { + "epoch": 4.52, + "grad_norm": 0.0016671591438353062, + "learning_rate": 1.9291819291819296e-06, + "loss": 0.0002, + "step": 11100 + }, + { + "epoch": 4.52, + "grad_norm": 0.0033444638829678297, + "learning_rate": 1.912901912901913e-06, + "loss": 0.0002, + "step": 11110 + }, + { + "epoch": 4.53, + "grad_norm": 0.0015689071733504534, + "learning_rate": 1.8966218966218969e-06, + "loss": 0.0002, + "step": 11120 + }, + { + "epoch": 4.53, + "grad_norm": 0.0018193925498053432, + "learning_rate": 1.8803418803418804e-06, + "loss": 0.0002, + "step": 11130 + }, + { + "epoch": 4.53, + "grad_norm": 0.0015975474379956722, + "learning_rate": 1.8640618640618643e-06, + "loss": 0.0002, + "step": 11140 + }, + { + "epoch": 4.54, + "grad_norm": 0.0015228153206408024, + "learning_rate": 1.8477818477818479e-06, + "loss": 0.0002, + "step": 11150 + }, + { + "epoch": 4.54, + "grad_norm": 0.0017481072572991252, + "learning_rate": 1.8315018315018316e-06, + "loss": 0.0002, + "step": 11160 + }, + { + "epoch": 4.55, + "grad_norm": 0.0014254804700613022, + "learning_rate": 1.8152218152218153e-06, + "loss": 0.0002, + "step": 11170 + }, + { + "epoch": 4.55, + "grad_norm": 0.0014639191795140505, + "learning_rate": 1.798941798941799e-06, + "loss": 0.0002, + "step": 11180 + }, + { + "epoch": 4.55, + "grad_norm": 0.0014739630278199911, + "learning_rate": 1.7826617826617828e-06, + "loss": 0.0002, + "step": 11190 + }, + { + "epoch": 4.56, + "grad_norm": 0.001486291061155498, + "learning_rate": 1.7663817663817665e-06, + "loss": 0.0002, + "step": 11200 + }, + { + "epoch": 4.56, + "grad_norm": 0.0021130377426743507, + "learning_rate": 1.7501017501017502e-06, + "loss": 0.0002, + "step": 11210 + }, + { + "epoch": 4.57, + "grad_norm": 0.0014680501772090793, + "learning_rate": 1.733821733821734e-06, + "loss": 0.0002, + "step": 11220 + }, + { + "epoch": 4.57, + "grad_norm": 0.0018635701853781939, + "learning_rate": 1.7175417175417177e-06, + "loss": 0.0002, + "step": 11230 + }, + { + "epoch": 4.57, + "grad_norm": 0.0015968162333592772, + "learning_rate": 1.7012617012617014e-06, + "loss": 0.0094, + "step": 11240 + }, + { + "epoch": 4.58, + "grad_norm": 0.0017093609785661101, + "learning_rate": 1.6849816849816852e-06, + "loss": 0.0002, + "step": 11250 + }, + { + "epoch": 4.58, + "grad_norm": 0.0014892058679834008, + "learning_rate": 1.6687016687016689e-06, + "loss": 0.0139, + "step": 11260 + }, + { + "epoch": 4.59, + "grad_norm": 0.0014219109434634447, + "learning_rate": 1.6524216524216524e-06, + "loss": 0.0002, + "step": 11270 + }, + { + "epoch": 4.59, + "grad_norm": 0.004563894122838974, + "learning_rate": 1.6361416361416363e-06, + "loss": 0.0002, + "step": 11280 + }, + { + "epoch": 4.6, + "grad_norm": 0.0014352177968248725, + "learning_rate": 1.6198616198616199e-06, + "loss": 0.0002, + "step": 11290 + }, + { + "epoch": 4.6, + "grad_norm": 0.001390959369018674, + "learning_rate": 1.6035816035816038e-06, + "loss": 0.0002, + "step": 11300 + }, + { + "epoch": 4.6, + "grad_norm": 0.0038959532976150513, + "learning_rate": 1.5873015873015873e-06, + "loss": 0.0002, + "step": 11310 + }, + { + "epoch": 4.61, + "grad_norm": 0.001589680789038539, + "learning_rate": 1.5710215710215713e-06, + "loss": 0.0002, + "step": 11320 + }, + { + "epoch": 4.61, + "grad_norm": 0.001737966202199459, + "learning_rate": 1.5547415547415548e-06, + "loss": 0.0002, + "step": 11330 + }, + { + "epoch": 4.62, + "grad_norm": 0.0014157581608742476, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.0002, + "step": 11340 + }, + { + "epoch": 4.62, + "grad_norm": 0.0018974760314449668, + "learning_rate": 1.5221815221815222e-06, + "loss": 0.0002, + "step": 11350 + }, + { + "epoch": 4.62, + "grad_norm": 0.0015809000469744205, + "learning_rate": 1.5059015059015062e-06, + "loss": 0.0002, + "step": 11360 + }, + { + "epoch": 4.63, + "grad_norm": 0.01224368717521429, + "learning_rate": 1.4896214896214897e-06, + "loss": 0.0002, + "step": 11370 + }, + { + "epoch": 4.63, + "grad_norm": 0.0015656572068110108, + "learning_rate": 1.4733414733414736e-06, + "loss": 0.0002, + "step": 11380 + }, + { + "epoch": 4.64, + "grad_norm": 0.0015062758466228843, + "learning_rate": 1.4570614570614572e-06, + "loss": 0.0002, + "step": 11390 + }, + { + "epoch": 4.64, + "grad_norm": 0.001575302449055016, + "learning_rate": 1.4407814407814407e-06, + "loss": 0.0002, + "step": 11400 + }, + { + "epoch": 4.64, + "grad_norm": 0.0014867339050397277, + "learning_rate": 1.4245014245014246e-06, + "loss": 0.0002, + "step": 11410 + }, + { + "epoch": 4.65, + "grad_norm": 0.0014463013503700495, + "learning_rate": 1.4082214082214083e-06, + "loss": 0.0002, + "step": 11420 + }, + { + "epoch": 4.65, + "grad_norm": 0.0014738457975909114, + "learning_rate": 1.391941391941392e-06, + "loss": 0.0002, + "step": 11430 + }, + { + "epoch": 4.66, + "grad_norm": 0.0033326647244393826, + "learning_rate": 1.3756613756613758e-06, + "loss": 0.0002, + "step": 11440 + }, + { + "epoch": 4.66, + "grad_norm": 0.0014288641978055239, + "learning_rate": 1.3593813593813595e-06, + "loss": 0.0002, + "step": 11450 + }, + { + "epoch": 4.66, + "grad_norm": 0.0015226053074002266, + "learning_rate": 1.3431013431013433e-06, + "loss": 0.0002, + "step": 11460 + }, + { + "epoch": 4.67, + "grad_norm": 0.0014885494019836187, + "learning_rate": 1.326821326821327e-06, + "loss": 0.0002, + "step": 11470 + }, + { + "epoch": 4.67, + "grad_norm": 0.0014367675175890326, + "learning_rate": 1.3105413105413107e-06, + "loss": 0.0002, + "step": 11480 + }, + { + "epoch": 4.68, + "grad_norm": 0.0014275162247940898, + "learning_rate": 1.2942612942612944e-06, + "loss": 0.0002, + "step": 11490 + }, + { + "epoch": 4.68, + "grad_norm": 0.0014797335024923086, + "learning_rate": 1.2779812779812782e-06, + "loss": 0.0002, + "step": 11500 + }, + { + "epoch": 4.68, + "grad_norm": 0.0014295239234343171, + "learning_rate": 1.2617012617012617e-06, + "loss": 0.0002, + "step": 11510 + }, + { + "epoch": 4.69, + "grad_norm": 0.0014915807405486703, + "learning_rate": 1.2454212454212456e-06, + "loss": 0.0002, + "step": 11520 + }, + { + "epoch": 4.69, + "grad_norm": 0.0016227615997195244, + "learning_rate": 1.2291412291412294e-06, + "loss": 0.0002, + "step": 11530 + }, + { + "epoch": 4.7, + "grad_norm": 0.0014580420684069395, + "learning_rate": 1.212861212861213e-06, + "loss": 0.0002, + "step": 11540 + }, + { + "epoch": 4.7, + "grad_norm": 0.0017063523409888148, + "learning_rate": 1.1965811965811968e-06, + "loss": 0.0002, + "step": 11550 + }, + { + "epoch": 4.7, + "grad_norm": 0.0014365671668201685, + "learning_rate": 1.1803011803011806e-06, + "loss": 0.0002, + "step": 11560 + }, + { + "epoch": 4.71, + "grad_norm": 1.2260816097259521, + "learning_rate": 1.164021164021164e-06, + "loss": 0.006, + "step": 11570 + }, + { + "epoch": 4.71, + "grad_norm": 0.0018874687375500798, + "learning_rate": 1.1477411477411478e-06, + "loss": 0.0002, + "step": 11580 + }, + { + "epoch": 4.72, + "grad_norm": 0.0013750126818194985, + "learning_rate": 1.1314611314611315e-06, + "loss": 0.0002, + "step": 11590 + }, + { + "epoch": 4.72, + "grad_norm": 0.001419686945155263, + "learning_rate": 1.1151811151811153e-06, + "loss": 0.0002, + "step": 11600 + }, + { + "epoch": 4.73, + "grad_norm": 0.0014118729159235954, + "learning_rate": 1.098901098901099e-06, + "loss": 0.0002, + "step": 11610 + }, + { + "epoch": 4.73, + "grad_norm": 0.0014139912091195583, + "learning_rate": 1.0826210826210827e-06, + "loss": 0.0002, + "step": 11620 + }, + { + "epoch": 4.73, + "grad_norm": 0.0015478282002732158, + "learning_rate": 1.0663410663410665e-06, + "loss": 0.0002, + "step": 11630 + }, + { + "epoch": 4.74, + "grad_norm": 0.0015366330044344068, + "learning_rate": 1.0500610500610502e-06, + "loss": 0.0002, + "step": 11640 + }, + { + "epoch": 4.74, + "grad_norm": 0.001490729977376759, + "learning_rate": 1.033781033781034e-06, + "loss": 0.0002, + "step": 11650 + }, + { + "epoch": 4.75, + "grad_norm": 0.001423732377588749, + "learning_rate": 1.0175010175010176e-06, + "loss": 0.0002, + "step": 11660 + }, + { + "epoch": 4.75, + "grad_norm": 0.0014315071748569608, + "learning_rate": 1.0012210012210014e-06, + "loss": 0.0002, + "step": 11670 + }, + { + "epoch": 4.75, + "grad_norm": 0.001486779423430562, + "learning_rate": 9.84940984940985e-07, + "loss": 0.0002, + "step": 11680 + }, + { + "epoch": 4.76, + "grad_norm": 0.004522906616330147, + "learning_rate": 9.686609686609686e-07, + "loss": 0.0002, + "step": 11690 + }, + { + "epoch": 4.76, + "grad_norm": 0.0014580250717699528, + "learning_rate": 9.523809523809525e-07, + "loss": 0.0002, + "step": 11700 + }, + { + "epoch": 4.77, + "grad_norm": 0.0015162237687036395, + "learning_rate": 9.361009361009362e-07, + "loss": 0.0092, + "step": 11710 + }, + { + "epoch": 4.77, + "grad_norm": 0.0013567224377766252, + "learning_rate": 9.198209198209199e-07, + "loss": 0.0002, + "step": 11720 + }, + { + "epoch": 4.77, + "grad_norm": 0.0014696550788357854, + "learning_rate": 9.035409035409036e-07, + "loss": 0.0002, + "step": 11730 + }, + { + "epoch": 4.78, + "grad_norm": 0.0014795665629208088, + "learning_rate": 8.872608872608874e-07, + "loss": 0.0002, + "step": 11740 + }, + { + "epoch": 4.78, + "grad_norm": 0.0015325862914323807, + "learning_rate": 8.709808709808711e-07, + "loss": 0.0002, + "step": 11750 + }, + { + "epoch": 4.79, + "grad_norm": 0.0014404217945411801, + "learning_rate": 8.547008547008548e-07, + "loss": 0.0002, + "step": 11760 + }, + { + "epoch": 4.79, + "grad_norm": 0.0019404751947149634, + "learning_rate": 8.384208384208386e-07, + "loss": 0.0002, + "step": 11770 + }, + { + "epoch": 4.79, + "grad_norm": 0.0016487601678818464, + "learning_rate": 8.221408221408223e-07, + "loss": 0.0002, + "step": 11780 + }, + { + "epoch": 4.8, + "grad_norm": 0.0020900049712508917, + "learning_rate": 8.05860805860806e-07, + "loss": 0.0002, + "step": 11790 + }, + { + "epoch": 4.8, + "grad_norm": 0.044903818517923355, + "learning_rate": 7.895807895807897e-07, + "loss": 0.0002, + "step": 11800 + }, + { + "epoch": 4.81, + "grad_norm": 0.006237754598259926, + "learning_rate": 7.733007733007733e-07, + "loss": 0.0002, + "step": 11810 + }, + { + "epoch": 4.81, + "grad_norm": 0.001496842596679926, + "learning_rate": 7.57020757020757e-07, + "loss": 0.0002, + "step": 11820 + }, + { + "epoch": 4.81, + "grad_norm": 0.0014312907587736845, + "learning_rate": 7.407407407407407e-07, + "loss": 0.0002, + "step": 11830 + }, + { + "epoch": 4.82, + "grad_norm": 0.0020331472624093294, + "learning_rate": 7.244607244607245e-07, + "loss": 0.0002, + "step": 11840 + }, + { + "epoch": 4.82, + "grad_norm": 0.0015430136118084192, + "learning_rate": 7.081807081807082e-07, + "loss": 0.0002, + "step": 11850 + }, + { + "epoch": 4.83, + "grad_norm": 0.0014734879368916154, + "learning_rate": 6.919006919006919e-07, + "loss": 0.0002, + "step": 11860 + }, + { + "epoch": 4.83, + "grad_norm": 0.006134878844022751, + "learning_rate": 6.756206756206756e-07, + "loss": 0.0002, + "step": 11870 + }, + { + "epoch": 4.84, + "grad_norm": 0.0013609755551442504, + "learning_rate": 6.593406593406594e-07, + "loss": 0.0002, + "step": 11880 + }, + { + "epoch": 4.84, + "grad_norm": 0.002070717280730605, + "learning_rate": 6.430606430606431e-07, + "loss": 0.0002, + "step": 11890 + }, + { + "epoch": 4.84, + "grad_norm": 0.0014169508358463645, + "learning_rate": 6.267806267806268e-07, + "loss": 0.0002, + "step": 11900 + }, + { + "epoch": 4.85, + "grad_norm": 0.0014770817942917347, + "learning_rate": 6.105006105006106e-07, + "loss": 0.0002, + "step": 11910 + }, + { + "epoch": 4.85, + "grad_norm": 0.0014419537037611008, + "learning_rate": 5.942205942205943e-07, + "loss": 0.0002, + "step": 11920 + }, + { + "epoch": 4.86, + "grad_norm": 0.001446893555112183, + "learning_rate": 5.77940577940578e-07, + "loss": 0.0002, + "step": 11930 + }, + { + "epoch": 4.86, + "grad_norm": 0.001416919520124793, + "learning_rate": 5.616605616605618e-07, + "loss": 0.0002, + "step": 11940 + }, + { + "epoch": 4.86, + "grad_norm": 0.0034949700348079205, + "learning_rate": 5.453805453805455e-07, + "loss": 0.0002, + "step": 11950 + }, + { + "epoch": 4.87, + "grad_norm": 0.0014441277598962188, + "learning_rate": 5.291005291005291e-07, + "loss": 0.0002, + "step": 11960 + }, + { + "epoch": 4.87, + "grad_norm": 0.0015632550930604339, + "learning_rate": 5.128205128205128e-07, + "loss": 0.0002, + "step": 11970 + }, + { + "epoch": 4.88, + "grad_norm": 0.001399176544509828, + "learning_rate": 4.965404965404966e-07, + "loss": 0.0002, + "step": 11980 + }, + { + "epoch": 4.88, + "grad_norm": 0.0013975553447380662, + "learning_rate": 4.802604802604803e-07, + "loss": 0.0002, + "step": 11990 + }, + { + "epoch": 4.88, + "grad_norm": 0.0013712114887312055, + "learning_rate": 4.63980463980464e-07, + "loss": 0.0002, + "step": 12000 + }, + { + "epoch": 4.89, + "grad_norm": 0.001828977488912642, + "learning_rate": 4.4770044770044775e-07, + "loss": 0.0002, + "step": 12010 + }, + { + "epoch": 4.89, + "grad_norm": 0.0014294543070718646, + "learning_rate": 4.3142043142043143e-07, + "loss": 0.0002, + "step": 12020 + }, + { + "epoch": 4.9, + "grad_norm": 0.0013922780053690076, + "learning_rate": 4.1514041514041516e-07, + "loss": 0.0002, + "step": 12030 + }, + { + "epoch": 4.9, + "grad_norm": 0.0016130340518429875, + "learning_rate": 3.988603988603989e-07, + "loss": 0.0002, + "step": 12040 + }, + { + "epoch": 4.9, + "grad_norm": 0.0013872876297682524, + "learning_rate": 3.825803825803826e-07, + "loss": 0.0002, + "step": 12050 + }, + { + "epoch": 4.91, + "grad_norm": 0.0014586036559194326, + "learning_rate": 3.6630036630036635e-07, + "loss": 0.0002, + "step": 12060 + }, + { + "epoch": 4.91, + "grad_norm": 0.0014334677252918482, + "learning_rate": 3.500203500203501e-07, + "loss": 0.0002, + "step": 12070 + }, + { + "epoch": 4.92, + "grad_norm": 0.0014047607546672225, + "learning_rate": 3.3374033374033376e-07, + "loss": 0.0002, + "step": 12080 + }, + { + "epoch": 4.92, + "grad_norm": 0.0013850359246134758, + "learning_rate": 3.174603174603175e-07, + "loss": 0.0002, + "step": 12090 + }, + { + "epoch": 4.92, + "grad_norm": 0.0013912185095250607, + "learning_rate": 3.011803011803012e-07, + "loss": 0.0002, + "step": 12100 + }, + { + "epoch": 4.93, + "grad_norm": 0.001442193053662777, + "learning_rate": 2.8490028490028494e-07, + "loss": 0.0002, + "step": 12110 + }, + { + "epoch": 4.93, + "grad_norm": 0.0014724673237651587, + "learning_rate": 2.6862026862026867e-07, + "loss": 0.0002, + "step": 12120 + }, + { + "epoch": 4.94, + "grad_norm": 0.0017670753877609968, + "learning_rate": 2.5234025234025235e-07, + "loss": 0.0002, + "step": 12130 + }, + { + "epoch": 4.94, + "grad_norm": 0.001458752085454762, + "learning_rate": 2.3606023606023608e-07, + "loss": 0.0002, + "step": 12140 + }, + { + "epoch": 4.95, + "grad_norm": 0.0015336443902924657, + "learning_rate": 2.197802197802198e-07, + "loss": 0.0002, + "step": 12150 + }, + { + "epoch": 4.95, + "grad_norm": 0.001413301331922412, + "learning_rate": 2.035002035002035e-07, + "loss": 0.0002, + "step": 12160 + }, + { + "epoch": 4.95, + "grad_norm": 0.001454474637284875, + "learning_rate": 1.8722018722018724e-07, + "loss": 0.0002, + "step": 12170 + }, + { + "epoch": 4.96, + "grad_norm": 0.0014644395560026169, + "learning_rate": 1.7094017094017097e-07, + "loss": 0.0002, + "step": 12180 + }, + { + "epoch": 4.96, + "grad_norm": 0.0014874679036438465, + "learning_rate": 1.5466015466015467e-07, + "loss": 0.0002, + "step": 12190 + }, + { + "epoch": 4.97, + "grad_norm": 0.0014028714504092932, + "learning_rate": 1.383801383801384e-07, + "loss": 0.0002, + "step": 12200 + }, + { + "epoch": 4.97, + "grad_norm": 0.0014859441434964538, + "learning_rate": 1.221001221001221e-07, + "loss": 0.0002, + "step": 12210 + }, + { + "epoch": 4.97, + "grad_norm": 0.0014206055784597993, + "learning_rate": 1.0582010582010582e-07, + "loss": 0.0002, + "step": 12220 + }, + { + "epoch": 4.98, + "grad_norm": 0.0013865531655028462, + "learning_rate": 8.954008954008955e-08, + "loss": 0.0002, + "step": 12230 + }, + { + "epoch": 4.98, + "grad_norm": 0.0014404187677428126, + "learning_rate": 7.326007326007327e-08, + "loss": 0.0002, + "step": 12240 + }, + { + "epoch": 4.99, + "grad_norm": 0.0015573910204693675, + "learning_rate": 5.6980056980056986e-08, + "loss": 0.0003, + "step": 12250 + }, + { + "epoch": 4.99, + "grad_norm": 0.0015043216990306973, + "learning_rate": 4.07000407000407e-08, + "loss": 0.0002, + "step": 12260 + }, + { + "epoch": 4.99, + "grad_norm": 0.0015565322246402502, + "learning_rate": 2.4420024420024422e-08, + "loss": 0.0002, + "step": 12270 + }, + { + "epoch": 5.0, + "grad_norm": 0.003684895345941186, + "learning_rate": 8.14000814000814e-09, + "loss": 0.0002, + "step": 12280 + }, + { + "epoch": 5.0, + "step": 12285, + "total_flos": 1.523143801869613e+19, + "train_loss": 0.006059203078136595, + "train_runtime": 4479.7513, + "train_samples_per_second": 43.876, + "train_steps_per_second": 2.742 + } + ], + "logging_steps": 10, + "max_steps": 12285, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 1.523143801869613e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null