diff --git a/blip3o_fast_vlm_3b_distillation_dit_learnable_512_model_1024_no_queries_frozen_mobile_condition_last_4_v2_blip3o_sharegpt_after_pretrain_3_cfm/trainer_state.json b/blip3o_fast_vlm_3b_distillation_dit_learnable_512_model_1024_no_queries_frozen_mobile_condition_last_4_v2_blip3o_sharegpt_after_pretrain_3_cfm/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d98d626204bfdda2a79b37eeb4fb62ab32063933 --- /dev/null +++ b/blip3o_fast_vlm_3b_distillation_dit_learnable_512_model_1024_no_queries_frozen_mobile_condition_last_4_v2_blip3o_sharegpt_after_pretrain_3_cfm/trainer_state.json @@ -0,0 +1,3865 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.0, + "eval_steps": 500, + "global_step": 5460, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03663003663003663, + "grad_norm": 0.21381588280200958, + "learning_rate": 6.5934065934065935e-06, + "loss": 0.6563, + "step": 10 + }, + { + "epoch": 0.07326007326007326, + "grad_norm": 0.16336338222026825, + "learning_rate": 1.391941391941392e-05, + "loss": 0.663, + "step": 20 + }, + { + "epoch": 0.10989010989010989, + "grad_norm": 0.13548393547534943, + "learning_rate": 2.1245421245421246e-05, + "loss": 0.6513, + "step": 30 + }, + { + "epoch": 0.14652014652014653, + "grad_norm": 0.12504926323890686, + "learning_rate": 2.857142857142857e-05, + "loss": 0.6433, + "step": 40 + }, + { + "epoch": 0.18315018315018314, + "grad_norm": 0.14509989321231842, + "learning_rate": 3.58974358974359e-05, + "loss": 0.627, + "step": 50 + }, + { + "epoch": 0.21978021978021978, + "grad_norm": 0.12336988747119904, + "learning_rate": 4.322344322344323e-05, + "loss": 0.6453, + "step": 60 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.11007435619831085, + "learning_rate": 5.054945054945055e-05, + "loss": 0.6319, + "step": 70 + }, + { + "epoch": 0.29304029304029305, + "grad_norm": 0.12032974511384964, + "learning_rate": 5.787545787545788e-05, + "loss": 0.6345, + "step": 80 + }, + { + "epoch": 0.32967032967032966, + "grad_norm": 0.0944727212190628, + "learning_rate": 6.52014652014652e-05, + "loss": 0.6347, + "step": 90 + }, + { + "epoch": 0.3663003663003663, + "grad_norm": 0.12763062119483948, + "learning_rate": 7.252747252747253e-05, + "loss": 0.6296, + "step": 100 + }, + { + "epoch": 0.40293040293040294, + "grad_norm": 0.1317506730556488, + "learning_rate": 7.985347985347986e-05, + "loss": 0.6458, + "step": 110 + }, + { + "epoch": 0.43956043956043955, + "grad_norm": 0.1032543033361435, + "learning_rate": 8.717948717948718e-05, + "loss": 0.6344, + "step": 120 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.1516275852918625, + "learning_rate": 9.450549450549451e-05, + "loss": 0.6285, + "step": 130 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.09366358071565628, + "learning_rate": 0.00010183150183150183, + "loss": 0.6356, + "step": 140 + }, + { + "epoch": 0.5494505494505495, + "grad_norm": 0.11660020053386688, + "learning_rate": 0.00010915750915750915, + "loss": 0.625, + "step": 150 + }, + { + "epoch": 0.5860805860805861, + "grad_norm": 0.1604832112789154, + "learning_rate": 0.0001164835164835165, + "loss": 0.6301, + "step": 160 + }, + { + "epoch": 0.6227106227106227, + "grad_norm": 0.11046287417411804, + "learning_rate": 0.0001238095238095238, + "loss": 0.6213, + "step": 170 + }, + { + "epoch": 0.6593406593406593, + "grad_norm": 0.13051177561283112, + "learning_rate": 0.00013113553113553114, + "loss": 0.6348, + "step": 180 + }, + { + "epoch": 0.6959706959706959, + "grad_norm": 0.15282383561134338, + "learning_rate": 0.00013846153846153847, + "loss": 0.6223, + "step": 190 + }, + { + "epoch": 0.7326007326007326, + "grad_norm": 0.09235327690839767, + "learning_rate": 0.00014578754578754578, + "loss": 0.6302, + "step": 200 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.15384027361869812, + "learning_rate": 0.0001531135531135531, + "loss": 0.6235, + "step": 210 + }, + { + "epoch": 0.8058608058608059, + "grad_norm": 0.1622333526611328, + "learning_rate": 0.00016043956043956044, + "loss": 0.6248, + "step": 220 + }, + { + "epoch": 0.8424908424908425, + "grad_norm": 0.1282617151737213, + "learning_rate": 0.00016776556776556777, + "loss": 0.6217, + "step": 230 + }, + { + "epoch": 0.8791208791208791, + "grad_norm": 0.15851661562919617, + "learning_rate": 0.0001750915750915751, + "loss": 0.6209, + "step": 240 + }, + { + "epoch": 0.9157509157509157, + "grad_norm": 0.11857573688030243, + "learning_rate": 0.0001824175824175824, + "loss": 0.6321, + "step": 250 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.16058525443077087, + "learning_rate": 0.00018974358974358974, + "loss": 0.6237, + "step": 260 + }, + { + "epoch": 0.989010989010989, + "grad_norm": 0.12434748560190201, + "learning_rate": 0.0001970695970695971, + "loss": 0.6194, + "step": 270 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.13378217816352844, + "learning_rate": 0.00019999934300455693, + "loss": 0.6161, + "step": 280 + }, + { + "epoch": 1.0622710622710623, + "grad_norm": 0.13045711815357208, + "learning_rate": 0.0001999953280638248, + "loss": 0.6309, + "step": 290 + }, + { + "epoch": 1.098901098901099, + "grad_norm": 0.13684824109077454, + "learning_rate": 0.00019998766332693084, + "loss": 0.6353, + "step": 300 + }, + { + "epoch": 1.1355311355311355, + "grad_norm": 0.13938014209270477, + "learning_rate": 0.0001999763490750413, + "loss": 0.6194, + "step": 310 + }, + { + "epoch": 1.1721611721611722, + "grad_norm": 0.1351129710674286, + "learning_rate": 0.00019996138572319787, + "loss": 0.6288, + "step": 320 + }, + { + "epoch": 1.2087912087912087, + "grad_norm": 0.14925244450569153, + "learning_rate": 0.00019994277382030259, + "loss": 0.6092, + "step": 330 + }, + { + "epoch": 1.2454212454212454, + "grad_norm": 0.10388221591711044, + "learning_rate": 0.00019992051404909758, + "loss": 0.6346, + "step": 340 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.12104734778404236, + "learning_rate": 0.00019989460722614002, + "loss": 0.6315, + "step": 350 + }, + { + "epoch": 1.3186813186813187, + "grad_norm": 0.12242613732814789, + "learning_rate": 0.0001998650543017723, + "loss": 0.623, + "step": 360 + }, + { + "epoch": 1.3553113553113554, + "grad_norm": 0.12440565973520279, + "learning_rate": 0.000199831856360087, + "loss": 0.6134, + "step": 370 + }, + { + "epoch": 1.3919413919413919, + "grad_norm": 0.10807174444198608, + "learning_rate": 0.0001997950146188872, + "loss": 0.6262, + "step": 380 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.14467234909534454, + "learning_rate": 0.00019975453042964187, + "loss": 0.6146, + "step": 390 + }, + { + "epoch": 1.4652014652014653, + "grad_norm": 0.12609444558620453, + "learning_rate": 0.00019971040527743626, + "loss": 0.621, + "step": 400 + }, + { + "epoch": 1.5018315018315018, + "grad_norm": 0.09819615632295609, + "learning_rate": 0.00019966264078091723, + "loss": 0.6162, + "step": 410 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.11803926527500153, + "learning_rate": 0.00019961123869223423, + "loss": 0.6168, + "step": 420 + }, + { + "epoch": 1.575091575091575, + "grad_norm": 0.12449613213539124, + "learning_rate": 0.00019955620089697474, + "loss": 0.6221, + "step": 430 + }, + { + "epoch": 1.6117216117216118, + "grad_norm": 0.12128981202840805, + "learning_rate": 0.00019949752941409514, + "loss": 0.6229, + "step": 440 + }, + { + "epoch": 1.6483516483516483, + "grad_norm": 0.14718155562877655, + "learning_rate": 0.0001994352263958469, + "loss": 0.6173, + "step": 450 + }, + { + "epoch": 1.684981684981685, + "grad_norm": 0.14411243796348572, + "learning_rate": 0.00019936929412769713, + "loss": 0.6337, + "step": 460 + }, + { + "epoch": 1.7216117216117217, + "grad_norm": 0.1278337836265564, + "learning_rate": 0.00019929973502824533, + "loss": 0.6238, + "step": 470 + }, + { + "epoch": 1.7582417582417582, + "grad_norm": 0.13230876624584198, + "learning_rate": 0.00019922655164913427, + "loss": 0.6259, + "step": 480 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 0.12312595546245575, + "learning_rate": 0.00019914974667495646, + "loss": 0.6125, + "step": 490 + }, + { + "epoch": 1.8315018315018317, + "grad_norm": 0.09721134603023529, + "learning_rate": 0.00019906932292315573, + "loss": 0.6174, + "step": 500 + }, + { + "epoch": 1.8681318681318682, + "grad_norm": 0.12478078901767731, + "learning_rate": 0.0001989852833439239, + "loss": 0.6252, + "step": 510 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.11270100623369217, + "learning_rate": 0.0001988976310200925, + "loss": 0.6216, + "step": 520 + }, + { + "epoch": 1.9413919413919414, + "grad_norm": 0.11008603870868683, + "learning_rate": 0.0001988063691670197, + "loss": 0.6163, + "step": 530 + }, + { + "epoch": 1.978021978021978, + "grad_norm": 0.11471746116876602, + "learning_rate": 0.0001987115011324723, + "loss": 0.6328, + "step": 540 + }, + { + "epoch": 2.0146520146520146, + "grad_norm": 0.11322673410177231, + "learning_rate": 0.00019861303039650315, + "loss": 0.6099, + "step": 550 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 0.11044928431510925, + "learning_rate": 0.00019851096057132308, + "loss": 0.619, + "step": 560 + }, + { + "epoch": 2.087912087912088, + "grad_norm": 0.09827021509408951, + "learning_rate": 0.0001984052954011688, + "loss": 0.6299, + "step": 570 + }, + { + "epoch": 2.1245421245421245, + "grad_norm": 0.12820111215114594, + "learning_rate": 0.00019829603876216537, + "loss": 0.6088, + "step": 580 + }, + { + "epoch": 2.161172161172161, + "grad_norm": 0.10461625456809998, + "learning_rate": 0.000198183194662184, + "loss": 0.6251, + "step": 590 + }, + { + "epoch": 2.197802197802198, + "grad_norm": 0.13258452713489532, + "learning_rate": 0.00019806676724069505, + "loss": 0.6124, + "step": 600 + }, + { + "epoch": 2.2344322344322345, + "grad_norm": 0.11297419667243958, + "learning_rate": 0.0001979467607686162, + "loss": 0.617, + "step": 610 + }, + { + "epoch": 2.271062271062271, + "grad_norm": 0.11036292463541031, + "learning_rate": 0.00019782317964815585, + "loss": 0.6274, + "step": 620 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.09046752005815506, + "learning_rate": 0.00019769602841265135, + "loss": 0.6175, + "step": 630 + }, + { + "epoch": 2.3443223443223444, + "grad_norm": 0.15168985724449158, + "learning_rate": 0.00019756531172640312, + "loss": 0.611, + "step": 640 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 0.12152957171201706, + "learning_rate": 0.00019743103438450322, + "loss": 0.6206, + "step": 650 + }, + { + "epoch": 2.4175824175824174, + "grad_norm": 0.11831982433795929, + "learning_rate": 0.00019729320131265957, + "loss": 0.6092, + "step": 660 + }, + { + "epoch": 2.4542124542124544, + "grad_norm": 0.12286005914211273, + "learning_rate": 0.00019715181756701526, + "loss": 0.6131, + "step": 670 + }, + { + "epoch": 2.490842490842491, + "grad_norm": 0.14576148986816406, + "learning_rate": 0.00019700688833396314, + "loss": 0.6149, + "step": 680 + }, + { + "epoch": 2.5274725274725274, + "grad_norm": 0.13091808557510376, + "learning_rate": 0.00019685841892995537, + "loss": 0.6204, + "step": 690 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.09313470125198364, + "learning_rate": 0.00019670641480130861, + "loss": 0.6148, + "step": 700 + }, + { + "epoch": 2.600732600732601, + "grad_norm": 0.12012402713298798, + "learning_rate": 0.00019655088152400416, + "loss": 0.6219, + "step": 710 + }, + { + "epoch": 2.6373626373626373, + "grad_norm": 0.13228215277194977, + "learning_rate": 0.00019639182480348332, + "loss": 0.6226, + "step": 720 + }, + { + "epoch": 2.6739926739926743, + "grad_norm": 0.12784317135810852, + "learning_rate": 0.00019622925047443825, + "loss": 0.6188, + "step": 730 + }, + { + "epoch": 2.7106227106227108, + "grad_norm": 0.12311346083879471, + "learning_rate": 0.00019606316450059786, + "loss": 0.6174, + "step": 740 + }, + { + "epoch": 2.7472527472527473, + "grad_norm": 0.11249978840351105, + "learning_rate": 0.00019589357297450897, + "loss": 0.6086, + "step": 750 + }, + { + "epoch": 2.7838827838827838, + "grad_norm": 0.0970926508307457, + "learning_rate": 0.00019572048211731295, + "loss": 0.6099, + "step": 760 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.1097983717918396, + "learning_rate": 0.00019554389827851752, + "loss": 0.6291, + "step": 770 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.11450190097093582, + "learning_rate": 0.0001953638279357636, + "loss": 0.6158, + "step": 780 + }, + { + "epoch": 2.8937728937728937, + "grad_norm": 0.08861864358186722, + "learning_rate": 0.00019518027769458799, + "loss": 0.6255, + "step": 790 + }, + { + "epoch": 2.9304029304029307, + "grad_norm": 0.10655124485492706, + "learning_rate": 0.00019499325428818084, + "loss": 0.6149, + "step": 800 + }, + { + "epoch": 2.967032967032967, + "grad_norm": 0.1027015745639801, + "learning_rate": 0.00019480276457713874, + "loss": 0.6182, + "step": 810 + }, + { + "epoch": 3.0036630036630036, + "grad_norm": 0.12306101620197296, + "learning_rate": 0.00019460881554921315, + "loss": 0.6164, + "step": 820 + }, + { + "epoch": 3.04029304029304, + "grad_norm": 0.11898409575223923, + "learning_rate": 0.00019441141431905383, + "loss": 0.6062, + "step": 830 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 0.1291988044977188, + "learning_rate": 0.0001942105681279481, + "loss": 0.6258, + "step": 840 + }, + { + "epoch": 3.1135531135531136, + "grad_norm": 0.11260422319173813, + "learning_rate": 0.00019400628434355495, + "loss": 0.6248, + "step": 850 + }, + { + "epoch": 3.15018315018315, + "grad_norm": 0.10207366943359375, + "learning_rate": 0.00019379857045963509, + "loss": 0.6159, + "step": 860 + }, + { + "epoch": 3.186813186813187, + "grad_norm": 0.10797443240880966, + "learning_rate": 0.00019358743409577577, + "loss": 0.6134, + "step": 870 + }, + { + "epoch": 3.2234432234432235, + "grad_norm": 0.09320534765720367, + "learning_rate": 0.00019337288299711144, + "loss": 0.6104, + "step": 880 + }, + { + "epoch": 3.26007326007326, + "grad_norm": 0.15332849323749542, + "learning_rate": 0.0001931549250340395, + "loss": 0.6178, + "step": 890 + }, + { + "epoch": 3.2967032967032965, + "grad_norm": 0.12163515388965607, + "learning_rate": 0.00019293356820193177, + "loss": 0.606, + "step": 900 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.12709611654281616, + "learning_rate": 0.00019270882062084099, + "loss": 0.6104, + "step": 910 + }, + { + "epoch": 3.36996336996337, + "grad_norm": 0.11496417224407196, + "learning_rate": 0.00019248069053520316, + "loss": 0.6177, + "step": 920 + }, + { + "epoch": 3.4065934065934065, + "grad_norm": 0.13935399055480957, + "learning_rate": 0.0001922491863135349, + "loss": 0.6159, + "step": 930 + }, + { + "epoch": 3.4432234432234434, + "grad_norm": 0.11748811602592468, + "learning_rate": 0.0001920143164481266, + "loss": 0.6223, + "step": 940 + }, + { + "epoch": 3.47985347985348, + "grad_norm": 0.11896082758903503, + "learning_rate": 0.00019177608955473096, + "loss": 0.6176, + "step": 950 + }, + { + "epoch": 3.5164835164835164, + "grad_norm": 0.12537701427936554, + "learning_rate": 0.0001915345143722467, + "loss": 0.6108, + "step": 960 + }, + { + "epoch": 3.553113553113553, + "grad_norm": 0.172188863158226, + "learning_rate": 0.0001912895997623982, + "loss": 0.6186, + "step": 970 + }, + { + "epoch": 3.58974358974359, + "grad_norm": 0.12360432744026184, + "learning_rate": 0.00019104135470941034, + "loss": 0.6219, + "step": 980 + }, + { + "epoch": 3.6263736263736264, + "grad_norm": 0.15684974193572998, + "learning_rate": 0.000190789788319679, + "loss": 0.6084, + "step": 990 + }, + { + "epoch": 3.663003663003663, + "grad_norm": 0.14988037943840027, + "learning_rate": 0.00019053490982143687, + "loss": 0.6028, + "step": 1000 + }, + { + "epoch": 3.6996336996337, + "grad_norm": 0.12309486418962479, + "learning_rate": 0.00019027672856441512, + "loss": 0.621, + "step": 1010 + }, + { + "epoch": 3.7362637362637363, + "grad_norm": 0.11249048262834549, + "learning_rate": 0.00019001525401950018, + "loss": 0.6088, + "step": 1020 + }, + { + "epoch": 3.772893772893773, + "grad_norm": 0.12147108465433121, + "learning_rate": 0.00018975049577838664, + "loss": 0.6169, + "step": 1030 + }, + { + "epoch": 3.8095238095238093, + "grad_norm": 0.11105571687221527, + "learning_rate": 0.0001894824635532251, + "loss": 0.6047, + "step": 1040 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 0.12829916179180145, + "learning_rate": 0.00018921116717626594, + "loss": 0.6197, + "step": 1050 + }, + { + "epoch": 3.8827838827838828, + "grad_norm": 0.1178424134850502, + "learning_rate": 0.00018893661659949895, + "loss": 0.602, + "step": 1060 + }, + { + "epoch": 3.9194139194139193, + "grad_norm": 0.09845280647277832, + "learning_rate": 0.00018865882189428778, + "loss": 0.61, + "step": 1070 + }, + { + "epoch": 3.956043956043956, + "grad_norm": 0.1087396964430809, + "learning_rate": 0.0001883777932510009, + "loss": 0.6275, + "step": 1080 + }, + { + "epoch": 3.9926739926739927, + "grad_norm": 0.15589557588100433, + "learning_rate": 0.0001880935409786376, + "loss": 0.6167, + "step": 1090 + }, + { + "epoch": 4.029304029304029, + "grad_norm": 0.0800589993596077, + "learning_rate": 0.00018780607550444978, + "loss": 0.6014, + "step": 1100 + }, + { + "epoch": 4.065934065934066, + "grad_norm": 0.11557458341121674, + "learning_rate": 0.00018751540737355958, + "loss": 0.6157, + "step": 1110 + }, + { + "epoch": 4.102564102564102, + "grad_norm": 0.11841552704572678, + "learning_rate": 0.00018722154724857241, + "loss": 0.6131, + "step": 1120 + }, + { + "epoch": 4.13919413919414, + "grad_norm": 0.09185516089200974, + "learning_rate": 0.00018692450590918598, + "loss": 0.6151, + "step": 1130 + }, + { + "epoch": 4.175824175824176, + "grad_norm": 0.1254640370607376, + "learning_rate": 0.00018662429425179476, + "loss": 0.6158, + "step": 1140 + }, + { + "epoch": 4.212454212454213, + "grad_norm": 0.08927269279956818, + "learning_rate": 0.00018632092328909022, + "loss": 0.6092, + "step": 1150 + }, + { + "epoch": 4.249084249084249, + "grad_norm": 0.09850620478391647, + "learning_rate": 0.000186014404149657, + "loss": 0.602, + "step": 1160 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.12001339346170425, + "learning_rate": 0.00018570474807756452, + "loss": 0.6041, + "step": 1170 + }, + { + "epoch": 4.322344322344322, + "grad_norm": 0.11746009439229965, + "learning_rate": 0.0001853919664319546, + "loss": 0.6096, + "step": 1180 + }, + { + "epoch": 4.358974358974359, + "grad_norm": 0.10340318828821182, + "learning_rate": 0.0001850760706866248, + "loss": 0.6023, + "step": 1190 + }, + { + "epoch": 4.395604395604396, + "grad_norm": 0.11639545857906342, + "learning_rate": 0.00018475707242960742, + "loss": 0.6191, + "step": 1200 + }, + { + "epoch": 4.4322344322344325, + "grad_norm": 0.15589426457881927, + "learning_rate": 0.00018443498336274462, + "loss": 0.6054, + "step": 1210 + }, + { + "epoch": 4.468864468864469, + "grad_norm": 0.09896785765886307, + "learning_rate": 0.00018410981530125875, + "loss": 0.6215, + "step": 1220 + }, + { + "epoch": 4.5054945054945055, + "grad_norm": 0.12751416862010956, + "learning_rate": 0.0001837815801733195, + "loss": 0.5995, + "step": 1230 + }, + { + "epoch": 4.542124542124542, + "grad_norm": 0.08719929307699203, + "learning_rate": 0.0001834502900196058, + "loss": 0.6196, + "step": 1240 + }, + { + "epoch": 4.5787545787545785, + "grad_norm": 0.1458660364151001, + "learning_rate": 0.00018311595699286454, + "loss": 0.6178, + "step": 1250 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 0.14082370698451996, + "learning_rate": 0.0001827785933574645, + "loss": 0.6038, + "step": 1260 + }, + { + "epoch": 4.652014652014652, + "grad_norm": 0.09783344715833664, + "learning_rate": 0.00018243821148894654, + "loss": 0.6152, + "step": 1270 + }, + { + "epoch": 4.688644688644689, + "grad_norm": 0.13895457983016968, + "learning_rate": 0.00018209482387356978, + "loss": 0.611, + "step": 1280 + }, + { + "epoch": 4.725274725274725, + "grad_norm": 0.12415605038404465, + "learning_rate": 0.00018174844310785322, + "loss": 0.6189, + "step": 1290 + }, + { + "epoch": 4.761904761904762, + "grad_norm": 0.1300516426563263, + "learning_rate": 0.000181399081898114, + "loss": 0.601, + "step": 1300 + }, + { + "epoch": 4.798534798534798, + "grad_norm": 0.10010094940662384, + "learning_rate": 0.0001810467530600011, + "loss": 0.6111, + "step": 1310 + }, + { + "epoch": 4.835164835164835, + "grad_norm": 0.10929550230503082, + "learning_rate": 0.00018069146951802542, + "loss": 0.6053, + "step": 1320 + }, + { + "epoch": 4.871794871794872, + "grad_norm": 0.1457078754901886, + "learning_rate": 0.0001803332443050853, + "loss": 0.6112, + "step": 1330 + }, + { + "epoch": 4.908424908424909, + "grad_norm": 0.1061316654086113, + "learning_rate": 0.00017997209056198882, + "loss": 0.6199, + "step": 1340 + }, + { + "epoch": 4.945054945054945, + "grad_norm": 0.1740475445985794, + "learning_rate": 0.0001796080215369716, + "loss": 0.6048, + "step": 1350 + }, + { + "epoch": 4.981684981684982, + "grad_norm": 0.12337993085384369, + "learning_rate": 0.0001792410505852107, + "loss": 0.6091, + "step": 1360 + }, + { + "epoch": 5.018315018315018, + "grad_norm": 0.11758992075920105, + "learning_rate": 0.00017887119116833495, + "loss": 0.62, + "step": 1370 + }, + { + "epoch": 5.054945054945055, + "grad_norm": 0.11902181804180145, + "learning_rate": 0.00017849845685393093, + "loss": 0.6145, + "step": 1380 + }, + { + "epoch": 5.091575091575091, + "grad_norm": 0.10019928216934204, + "learning_rate": 0.0001781228613150454, + "loss": 0.6124, + "step": 1390 + }, + { + "epoch": 5.128205128205128, + "grad_norm": 0.14300206303596497, + "learning_rate": 0.0001777444183296836, + "loss": 0.6141, + "step": 1400 + }, + { + "epoch": 5.164835164835165, + "grad_norm": 0.15143392980098724, + "learning_rate": 0.000177363141780304, + "loss": 0.6066, + "step": 1410 + }, + { + "epoch": 5.201465201465202, + "grad_norm": 0.11212984472513199, + "learning_rate": 0.0001769790456533089, + "loss": 0.6041, + "step": 1420 + }, + { + "epoch": 5.238095238095238, + "grad_norm": 0.11619605123996735, + "learning_rate": 0.00017659214403853145, + "loss": 0.614, + "step": 1430 + }, + { + "epoch": 5.274725274725275, + "grad_norm": 0.10530748963356018, + "learning_rate": 0.00017620245112871867, + "loss": 0.6131, + "step": 1440 + }, + { + "epoch": 5.311355311355311, + "grad_norm": 0.18820564448833466, + "learning_rate": 0.00017580998121901107, + "loss": 0.6058, + "step": 1450 + }, + { + "epoch": 5.347985347985348, + "grad_norm": 0.10431553423404694, + "learning_rate": 0.00017541474870641797, + "loss": 0.6058, + "step": 1460 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 0.09268366545438766, + "learning_rate": 0.0001750167680892896, + "loss": 0.5981, + "step": 1470 + }, + { + "epoch": 5.4212454212454215, + "grad_norm": 0.1386278122663498, + "learning_rate": 0.00017461605396678506, + "loss": 0.6086, + "step": 1480 + }, + { + "epoch": 5.457875457875458, + "grad_norm": 0.11717536300420761, + "learning_rate": 0.000174212621038337, + "loss": 0.6032, + "step": 1490 + }, + { + "epoch": 5.4945054945054945, + "grad_norm": 0.10783743113279343, + "learning_rate": 0.00017380648410311207, + "loss": 0.5993, + "step": 1500 + }, + { + "epoch": 5.531135531135531, + "grad_norm": 0.13064159452915192, + "learning_rate": 0.00017339765805946846, + "loss": 0.6107, + "step": 1510 + }, + { + "epoch": 5.5677655677655675, + "grad_norm": 0.12426242977380753, + "learning_rate": 0.0001729861579044091, + "loss": 0.6029, + "step": 1520 + }, + { + "epoch": 5.604395604395604, + "grad_norm": 0.12688453495502472, + "learning_rate": 0.00017257199873303155, + "loss": 0.6154, + "step": 1530 + }, + { + "epoch": 5.641025641025641, + "grad_norm": 0.11696295440196991, + "learning_rate": 0.0001721551957379743, + "loss": 0.61, + "step": 1540 + }, + { + "epoch": 5.677655677655678, + "grad_norm": 0.11864680051803589, + "learning_rate": 0.00017173576420885956, + "loss": 0.6085, + "step": 1550 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.09387882798910141, + "learning_rate": 0.00017131371953173222, + "loss": 0.6015, + "step": 1560 + }, + { + "epoch": 5.750915750915751, + "grad_norm": 0.09436681121587753, + "learning_rate": 0.00017088907718849545, + "loss": 0.6032, + "step": 1570 + }, + { + "epoch": 5.787545787545787, + "grad_norm": 0.11666887998580933, + "learning_rate": 0.00017046185275634303, + "loss": 0.6067, + "step": 1580 + }, + { + "epoch": 5.824175824175824, + "grad_norm": 0.11687021702528, + "learning_rate": 0.00017003206190718758, + "loss": 0.6112, + "step": 1590 + }, + { + "epoch": 5.860805860805861, + "grad_norm": 0.11235155165195465, + "learning_rate": 0.00016959972040708586, + "loss": 0.6169, + "step": 1600 + }, + { + "epoch": 5.897435897435898, + "grad_norm": 0.10137765854597092, + "learning_rate": 0.00016916484411566045, + "loss": 0.6028, + "step": 1610 + }, + { + "epoch": 5.934065934065934, + "grad_norm": 0.09540582448244095, + "learning_rate": 0.00016872744898551787, + "loss": 0.6091, + "step": 1620 + }, + { + "epoch": 5.970695970695971, + "grad_norm": 0.08847875148057938, + "learning_rate": 0.0001682875510616634, + "loss": 0.6279, + "step": 1630 + }, + { + "epoch": 6.007326007326007, + "grad_norm": 0.13143204152584076, + "learning_rate": 0.00016784516648091254, + "loss": 0.601, + "step": 1640 + }, + { + "epoch": 6.043956043956044, + "grad_norm": 0.11306729912757874, + "learning_rate": 0.00016740031147129917, + "loss": 0.6093, + "step": 1650 + }, + { + "epoch": 6.08058608058608, + "grad_norm": 0.08409835398197174, + "learning_rate": 0.00016695300235147994, + "loss": 0.6051, + "step": 1660 + }, + { + "epoch": 6.117216117216117, + "grad_norm": 0.0728335753083229, + "learning_rate": 0.000166503255530136, + "loss": 0.5947, + "step": 1670 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 0.11714360117912292, + "learning_rate": 0.00016605108750537083, + "loss": 0.6103, + "step": 1680 + }, + { + "epoch": 6.190476190476191, + "grad_norm": 0.10780514776706696, + "learning_rate": 0.0001655965148641052, + "loss": 0.59, + "step": 1690 + }, + { + "epoch": 6.227106227106227, + "grad_norm": 0.10457523167133331, + "learning_rate": 0.00016513955428146864, + "loss": 0.6014, + "step": 1700 + }, + { + "epoch": 6.263736263736264, + "grad_norm": 0.11372072249650955, + "learning_rate": 0.00016468022252018774, + "loss": 0.6053, + "step": 1710 + }, + { + "epoch": 6.3003663003663, + "grad_norm": 0.10791448503732681, + "learning_rate": 0.00016421853642997118, + "loss": 0.6123, + "step": 1720 + }, + { + "epoch": 6.336996336996337, + "grad_norm": 0.10883180052042007, + "learning_rate": 0.00016375451294689186, + "loss": 0.6127, + "step": 1730 + }, + { + "epoch": 6.373626373626374, + "grad_norm": 0.1315256953239441, + "learning_rate": 0.00016328816909276525, + "loss": 0.6111, + "step": 1740 + }, + { + "epoch": 6.410256410256411, + "grad_norm": 0.10748538374900818, + "learning_rate": 0.00016281952197452545, + "loss": 0.6016, + "step": 1750 + }, + { + "epoch": 6.446886446886447, + "grad_norm": 0.10810081660747528, + "learning_rate": 0.00016234858878359725, + "loss": 0.6128, + "step": 1760 + }, + { + "epoch": 6.483516483516484, + "grad_norm": 0.10564674437046051, + "learning_rate": 0.0001618753867952657, + "loss": 0.6025, + "step": 1770 + }, + { + "epoch": 6.52014652014652, + "grad_norm": 0.13300639390945435, + "learning_rate": 0.00016139993336804234, + "loss": 0.614, + "step": 1780 + }, + { + "epoch": 6.556776556776557, + "grad_norm": 0.09764187037944794, + "learning_rate": 0.00016092224594302845, + "loss": 0.6077, + "step": 1790 + }, + { + "epoch": 6.593406593406593, + "grad_norm": 0.08439364284276962, + "learning_rate": 0.00016044234204327527, + "loss": 0.6098, + "step": 1800 + }, + { + "epoch": 6.63003663003663, + "grad_norm": 0.13017664849758148, + "learning_rate": 0.00015996023927314113, + "loss": 0.6111, + "step": 1810 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.08291947841644287, + "learning_rate": 0.00015947595531764578, + "loss": 0.614, + "step": 1820 + }, + { + "epoch": 6.7032967032967035, + "grad_norm": 0.14750197529792786, + "learning_rate": 0.0001589895079418216, + "loss": 0.5958, + "step": 1830 + }, + { + "epoch": 6.73992673992674, + "grad_norm": 0.10692450404167175, + "learning_rate": 0.00015850091499006188, + "loss": 0.599, + "step": 1840 + }, + { + "epoch": 6.7765567765567765, + "grad_norm": 0.13206391036510468, + "learning_rate": 0.00015801019438546626, + "loss": 0.6066, + "step": 1850 + }, + { + "epoch": 6.813186813186813, + "grad_norm": 0.08591387420892715, + "learning_rate": 0.00015751736412918327, + "loss": 0.6093, + "step": 1860 + }, + { + "epoch": 6.8498168498168495, + "grad_norm": 0.09769956022500992, + "learning_rate": 0.00015702244229975, + "loss": 0.6002, + "step": 1870 + }, + { + "epoch": 6.886446886446887, + "grad_norm": 0.10705573111772537, + "learning_rate": 0.0001565254470524289, + "loss": 0.6071, + "step": 1880 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 0.11658084392547607, + "learning_rate": 0.00015602639661854183, + "loss": 0.607, + "step": 1890 + }, + { + "epoch": 6.95970695970696, + "grad_norm": 0.10922308266162872, + "learning_rate": 0.00015552530930480114, + "loss": 0.6082, + "step": 1900 + }, + { + "epoch": 6.996336996336996, + "grad_norm": 0.10610403120517731, + "learning_rate": 0.00015502220349263837, + "loss": 0.6099, + "step": 1910 + }, + { + "epoch": 7.032967032967033, + "grad_norm": 0.14902709424495697, + "learning_rate": 0.0001545170976375297, + "loss": 0.5997, + "step": 1920 + }, + { + "epoch": 7.069597069597069, + "grad_norm": 0.1101558580994606, + "learning_rate": 0.00015401001026831915, + "loss": 0.6117, + "step": 1930 + }, + { + "epoch": 7.106227106227106, + "grad_norm": 0.11664875596761703, + "learning_rate": 0.00015350095998653874, + "loss": 0.6002, + "step": 1940 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.13412544131278992, + "learning_rate": 0.00015298996546572627, + "loss": 0.5914, + "step": 1950 + }, + { + "epoch": 7.17948717948718, + "grad_norm": 0.15238548815250397, + "learning_rate": 0.00015247704545074008, + "loss": 0.6113, + "step": 1960 + }, + { + "epoch": 7.216117216117216, + "grad_norm": 0.12290948629379272, + "learning_rate": 0.00015196221875707184, + "loss": 0.6094, + "step": 1970 + }, + { + "epoch": 7.252747252747253, + "grad_norm": 0.0918455421924591, + "learning_rate": 0.00015144550427015588, + "loss": 0.6126, + "step": 1980 + }, + { + "epoch": 7.289377289377289, + "grad_norm": 0.12030185759067535, + "learning_rate": 0.00015092692094467668, + "loss": 0.5912, + "step": 1990 + }, + { + "epoch": 7.326007326007326, + "grad_norm": 0.10280390083789825, + "learning_rate": 0.0001504064878038735, + "loss": 0.6072, + "step": 2000 + }, + { + "epoch": 7.362637362637362, + "grad_norm": 0.10017659515142441, + "learning_rate": 0.00014988422393884254, + "loss": 0.6053, + "step": 2010 + }, + { + "epoch": 7.3992673992674, + "grad_norm": 0.08514354377985, + "learning_rate": 0.0001493601485078366, + "loss": 0.6218, + "step": 2020 + }, + { + "epoch": 7.435897435897436, + "grad_norm": 0.10472512245178223, + "learning_rate": 0.00014883428073556238, + "loss": 0.6064, + "step": 2030 + }, + { + "epoch": 7.472527472527473, + "grad_norm": 0.11152771860361099, + "learning_rate": 0.0001483066399124752, + "loss": 0.5951, + "step": 2040 + }, + { + "epoch": 7.509157509157509, + "grad_norm": 0.11035049706697464, + "learning_rate": 0.00014777724539407122, + "loss": 0.6017, + "step": 2050 + }, + { + "epoch": 7.545787545787546, + "grad_norm": 0.11814115196466446, + "learning_rate": 0.0001472461166001778, + "loss": 0.5947, + "step": 2060 + }, + { + "epoch": 7.582417582417582, + "grad_norm": 0.11158854514360428, + "learning_rate": 0.00014671327301424067, + "loss": 0.6004, + "step": 2070 + }, + { + "epoch": 7.619047619047619, + "grad_norm": 0.1224694699048996, + "learning_rate": 0.00014617873418260952, + "loss": 0.6072, + "step": 2080 + }, + { + "epoch": 7.655677655677656, + "grad_norm": 0.1077955961227417, + "learning_rate": 0.00014564251971382097, + "loss": 0.6007, + "step": 2090 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 0.10596826672554016, + "learning_rate": 0.00014510464927787898, + "loss": 0.6029, + "step": 2100 + }, + { + "epoch": 7.728937728937729, + "grad_norm": 0.13761618733406067, + "learning_rate": 0.00014456514260553375, + "loss": 0.6025, + "step": 2110 + }, + { + "epoch": 7.7655677655677655, + "grad_norm": 0.13236203789710999, + "learning_rate": 0.00014402401948755754, + "loss": 0.6041, + "step": 2120 + }, + { + "epoch": 7.802197802197802, + "grad_norm": 0.09635107964277267, + "learning_rate": 0.00014348129977401886, + "loss": 0.606, + "step": 2130 + }, + { + "epoch": 7.8388278388278385, + "grad_norm": 0.0904504582285881, + "learning_rate": 0.00014293700337355427, + "loss": 0.6049, + "step": 2140 + }, + { + "epoch": 7.875457875457876, + "grad_norm": 0.10969776660203934, + "learning_rate": 0.00014239115025263812, + "loss": 0.5982, + "step": 2150 + }, + { + "epoch": 7.912087912087912, + "grad_norm": 0.1088518351316452, + "learning_rate": 0.00014184376043485005, + "loss": 0.6021, + "step": 2160 + }, + { + "epoch": 7.948717948717949, + "grad_norm": 0.11425957083702087, + "learning_rate": 0.0001412948540001405, + "loss": 0.6022, + "step": 2170 + }, + { + "epoch": 7.985347985347985, + "grad_norm": 0.11807142943143845, + "learning_rate": 0.00014074445108409406, + "loss": 0.616, + "step": 2180 + }, + { + "epoch": 8.021978021978022, + "grad_norm": 0.13567444682121277, + "learning_rate": 0.00014019257187719098, + "loss": 0.6025, + "step": 2190 + }, + { + "epoch": 8.058608058608058, + "grad_norm": 0.12121989578008652, + "learning_rate": 0.00013963923662406634, + "loss": 0.6046, + "step": 2200 + }, + { + "epoch": 8.095238095238095, + "grad_norm": 0.11369021236896515, + "learning_rate": 0.0001390844656227675, + "loss": 0.6061, + "step": 2210 + }, + { + "epoch": 8.131868131868131, + "grad_norm": 0.10813486576080322, + "learning_rate": 0.00013852827922400954, + "loss": 0.6025, + "step": 2220 + }, + { + "epoch": 8.168498168498168, + "grad_norm": 0.11306055635213852, + "learning_rate": 0.00013797069783042865, + "loss": 0.6105, + "step": 2230 + }, + { + "epoch": 8.205128205128204, + "grad_norm": 0.171703040599823, + "learning_rate": 0.0001374117418958338, + "loss": 0.5866, + "step": 2240 + }, + { + "epoch": 8.241758241758241, + "grad_norm": 0.10016179084777832, + "learning_rate": 0.00013685143192445633, + "loss": 0.6078, + "step": 2250 + }, + { + "epoch": 8.27838827838828, + "grad_norm": 0.09870107471942902, + "learning_rate": 0.00013628978847019782, + "loss": 0.6074, + "step": 2260 + }, + { + "epoch": 8.315018315018316, + "grad_norm": 0.11638665944337845, + "learning_rate": 0.00013572683213587618, + "loss": 0.6018, + "step": 2270 + }, + { + "epoch": 8.351648351648352, + "grad_norm": 0.11457157135009766, + "learning_rate": 0.00013516258357246976, + "loss": 0.5986, + "step": 2280 + }, + { + "epoch": 8.388278388278389, + "grad_norm": 0.09893471002578735, + "learning_rate": 0.00013459706347835988, + "loss": 0.6042, + "step": 2290 + }, + { + "epoch": 8.424908424908425, + "grad_norm": 0.09442324936389923, + "learning_rate": 0.0001340302925985716, + "loss": 0.604, + "step": 2300 + }, + { + "epoch": 8.461538461538462, + "grad_norm": 0.11142978072166443, + "learning_rate": 0.00013346229172401256, + "loss": 0.5966, + "step": 2310 + }, + { + "epoch": 8.498168498168498, + "grad_norm": 0.13073523342609406, + "learning_rate": 0.00013289308169071051, + "loss": 0.5908, + "step": 2320 + }, + { + "epoch": 8.534798534798535, + "grad_norm": 0.11161026358604431, + "learning_rate": 0.0001323226833790488, + "loss": 0.6105, + "step": 2330 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 0.09900886565446854, + "learning_rate": 0.00013175111771300053, + "loss": 0.6013, + "step": 2340 + }, + { + "epoch": 8.608058608058608, + "grad_norm": 0.130936399102211, + "learning_rate": 0.00013117840565936103, + "loss": 0.6022, + "step": 2350 + }, + { + "epoch": 8.644688644688644, + "grad_norm": 0.0852714255452156, + "learning_rate": 0.00013060456822697848, + "loss": 0.606, + "step": 2360 + }, + { + "epoch": 8.68131868131868, + "grad_norm": 0.09974522143602371, + "learning_rate": 0.00013002962646598373, + "loss": 0.5918, + "step": 2370 + }, + { + "epoch": 8.717948717948717, + "grad_norm": 0.08324407786130905, + "learning_rate": 0.0001294536014670176, + "loss": 0.6011, + "step": 2380 + }, + { + "epoch": 8.754578754578755, + "grad_norm": 0.09671464562416077, + "learning_rate": 0.00012887651436045737, + "loss": 0.606, + "step": 2390 + }, + { + "epoch": 8.791208791208792, + "grad_norm": 0.10795936733484268, + "learning_rate": 0.00012829838631564196, + "loss": 0.6008, + "step": 2400 + }, + { + "epoch": 8.827838827838828, + "grad_norm": 0.10716196894645691, + "learning_rate": 0.00012771923854009495, + "loss": 0.5983, + "step": 2410 + }, + { + "epoch": 8.864468864468865, + "grad_norm": 0.09245418757200241, + "learning_rate": 0.00012713909227874675, + "loss": 0.6045, + "step": 2420 + }, + { + "epoch": 8.901098901098901, + "grad_norm": 0.11781131476163864, + "learning_rate": 0.00012655796881315547, + "loss": 0.6028, + "step": 2430 + }, + { + "epoch": 8.937728937728938, + "grad_norm": 0.12876753509044647, + "learning_rate": 0.00012597588946072595, + "loss": 0.6002, + "step": 2440 + }, + { + "epoch": 8.974358974358974, + "grad_norm": 0.08043269068002701, + "learning_rate": 0.00012539287557392803, + "loss": 0.5973, + "step": 2450 + }, + { + "epoch": 9.010989010989011, + "grad_norm": 0.11363924294710159, + "learning_rate": 0.0001248089485395131, + "loss": 0.591, + "step": 2460 + }, + { + "epoch": 9.047619047619047, + "grad_norm": 0.09732921421527863, + "learning_rate": 0.00012422412977772956, + "loss": 0.6034, + "step": 2470 + }, + { + "epoch": 9.084249084249084, + "grad_norm": 0.1470343917608261, + "learning_rate": 0.0001236384407415373, + "loss": 0.5994, + "step": 2480 + }, + { + "epoch": 9.12087912087912, + "grad_norm": 0.11025113612413406, + "learning_rate": 0.0001230519029158204, + "loss": 0.5965, + "step": 2490 + }, + { + "epoch": 9.157509157509157, + "grad_norm": 0.1023898497223854, + "learning_rate": 0.00012246453781659928, + "loss": 0.5966, + "step": 2500 + }, + { + "epoch": 9.194139194139193, + "grad_norm": 0.09586931765079498, + "learning_rate": 0.00012187636699024118, + "loss": 0.6094, + "step": 2510 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 0.1178598701953888, + "learning_rate": 0.00012128741201267007, + "loss": 0.5996, + "step": 2520 + }, + { + "epoch": 9.267399267399268, + "grad_norm": 0.10787366330623627, + "learning_rate": 0.00012069769448857484, + "loss": 0.5964, + "step": 2530 + }, + { + "epoch": 9.304029304029305, + "grad_norm": 0.10359363257884979, + "learning_rate": 0.00012010723605061705, + "loss": 0.5919, + "step": 2540 + }, + { + "epoch": 9.340659340659341, + "grad_norm": 0.1379256248474121, + "learning_rate": 0.00011951605835863724, + "loss": 0.6008, + "step": 2550 + }, + { + "epoch": 9.377289377289378, + "grad_norm": 0.12038258463144302, + "learning_rate": 0.00011892418309886044, + "loss": 0.6027, + "step": 2560 + }, + { + "epoch": 9.413919413919414, + "grad_norm": 0.07978381216526031, + "learning_rate": 0.00011833163198310055, + "loss": 0.5954, + "step": 2570 + }, + { + "epoch": 9.45054945054945, + "grad_norm": 0.11855828762054443, + "learning_rate": 0.00011773842674796405, + "loss": 0.5904, + "step": 2580 + }, + { + "epoch": 9.487179487179487, + "grad_norm": 0.10759320110082626, + "learning_rate": 0.00011714458915405246, + "loss": 0.6129, + "step": 2590 + }, + { + "epoch": 9.523809523809524, + "grad_norm": 0.1048365980386734, + "learning_rate": 0.00011655014098516422, + "loss": 0.5933, + "step": 2600 + }, + { + "epoch": 9.56043956043956, + "grad_norm": 0.08056604862213135, + "learning_rate": 0.0001159551040474955, + "loss": 0.6053, + "step": 2610 + }, + { + "epoch": 9.597069597069597, + "grad_norm": 0.16295000910758972, + "learning_rate": 0.00011535950016884043, + "loss": 0.5994, + "step": 2620 + }, + { + "epoch": 9.633699633699633, + "grad_norm": 0.09486577659845352, + "learning_rate": 0.00011476335119779015, + "loss": 0.6012, + "step": 2630 + }, + { + "epoch": 9.67032967032967, + "grad_norm": 0.11261876672506332, + "learning_rate": 0.00011416667900293163, + "loss": 0.6046, + "step": 2640 + }, + { + "epoch": 9.706959706959706, + "grad_norm": 0.14503304660320282, + "learning_rate": 0.00011356950547204514, + "loss": 0.5988, + "step": 2650 + }, + { + "epoch": 9.743589743589745, + "grad_norm": 0.11020506173372269, + "learning_rate": 0.00011297185251130165, + "loss": 0.6072, + "step": 2660 + }, + { + "epoch": 9.780219780219781, + "grad_norm": 0.11951778084039688, + "learning_rate": 0.00011237374204445894, + "loss": 0.6145, + "step": 2670 + }, + { + "epoch": 9.816849816849818, + "grad_norm": 0.13774767518043518, + "learning_rate": 0.00011177519601205772, + "loss": 0.5822, + "step": 2680 + }, + { + "epoch": 9.853479853479854, + "grad_norm": 0.09933178126811981, + "learning_rate": 0.00011117623637061649, + "loss": 0.6056, + "step": 2690 + }, + { + "epoch": 9.89010989010989, + "grad_norm": 0.09804636985063553, + "learning_rate": 0.00011057688509182616, + "loss": 0.6002, + "step": 2700 + }, + { + "epoch": 9.926739926739927, + "grad_norm": 0.14475852251052856, + "learning_rate": 0.0001099771641617442, + "loss": 0.6005, + "step": 2710 + }, + { + "epoch": 9.963369963369964, + "grad_norm": 0.12410984933376312, + "learning_rate": 0.00010937709557998797, + "loss": 0.597, + "step": 2720 + }, + { + "epoch": 10.0, + "grad_norm": 0.12814611196517944, + "learning_rate": 0.00010877670135892781, + "loss": 0.5909, + "step": 2730 + }, + { + "epoch": 10.036630036630036, + "grad_norm": 0.1395956575870514, + "learning_rate": 0.00010817600352287946, + "loss": 0.6083, + "step": 2740 + }, + { + "epoch": 10.073260073260073, + "grad_norm": 0.1210312694311142, + "learning_rate": 0.00010757502410729625, + "loss": 0.5917, + "step": 2750 + }, + { + "epoch": 10.10989010989011, + "grad_norm": 0.10577362775802612, + "learning_rate": 0.00010697378515796078, + "loss": 0.6027, + "step": 2760 + }, + { + "epoch": 10.146520146520146, + "grad_norm": 0.10341820120811462, + "learning_rate": 0.00010637230873017592, + "loss": 0.5922, + "step": 2770 + }, + { + "epoch": 10.183150183150182, + "grad_norm": 0.09854844957590103, + "learning_rate": 0.00010577061688795627, + "loss": 0.6048, + "step": 2780 + }, + { + "epoch": 10.219780219780219, + "grad_norm": 0.13221371173858643, + "learning_rate": 0.00010516873170321829, + "loss": 0.594, + "step": 2790 + }, + { + "epoch": 10.256410256410255, + "grad_norm": 0.11323889344930649, + "learning_rate": 0.0001045666752549709, + "loss": 0.5854, + "step": 2800 + }, + { + "epoch": 10.293040293040294, + "grad_norm": 0.10400225222110748, + "learning_rate": 0.00010396446962850557, + "loss": 0.5999, + "step": 2810 + }, + { + "epoch": 10.32967032967033, + "grad_norm": 0.11270953714847565, + "learning_rate": 0.00010336213691458601, + "loss": 0.5936, + "step": 2820 + }, + { + "epoch": 10.366300366300367, + "grad_norm": 0.11540117859840393, + "learning_rate": 0.00010275969920863786, + "loss": 0.598, + "step": 2830 + }, + { + "epoch": 10.402930402930403, + "grad_norm": 0.09625425189733505, + "learning_rate": 0.00010215717860993831, + "loss": 0.593, + "step": 2840 + }, + { + "epoch": 10.43956043956044, + "grad_norm": 0.10338188707828522, + "learning_rate": 0.00010155459722080518, + "loss": 0.5903, + "step": 2850 + }, + { + "epoch": 10.476190476190476, + "grad_norm": 0.1506458967924118, + "learning_rate": 0.00010095197714578635, + "loss": 0.5821, + "step": 2860 + }, + { + "epoch": 10.512820512820513, + "grad_norm": 0.12155159562826157, + "learning_rate": 0.00010034934049084881, + "loss": 0.6108, + "step": 2870 + }, + { + "epoch": 10.54945054945055, + "grad_norm": 0.10831907391548157, + "learning_rate": 9.974670936256771e-05, + "loss": 0.5872, + "step": 2880 + }, + { + "epoch": 10.586080586080586, + "grad_norm": 0.1336948573589325, + "learning_rate": 9.914410586731555e-05, + "loss": 0.5929, + "step": 2890 + }, + { + "epoch": 10.622710622710622, + "grad_norm": 0.10216037929058075, + "learning_rate": 9.854155211045111e-05, + "loss": 0.6144, + "step": 2900 + }, + { + "epoch": 10.659340659340659, + "grad_norm": 0.11720311641693115, + "learning_rate": 9.793907019550857e-05, + "loss": 0.598, + "step": 2910 + }, + { + "epoch": 10.695970695970695, + "grad_norm": 0.11751140654087067, + "learning_rate": 9.733668222338677e-05, + "loss": 0.6086, + "step": 2920 + }, + { + "epoch": 10.732600732600732, + "grad_norm": 0.09198504686355591, + "learning_rate": 9.67344102915385e-05, + "loss": 0.604, + "step": 2930 + }, + { + "epoch": 10.76923076923077, + "grad_norm": 0.1153954491019249, + "learning_rate": 9.613227649315973e-05, + "loss": 0.6086, + "step": 2940 + }, + { + "epoch": 10.805860805860807, + "grad_norm": 0.09167054295539856, + "learning_rate": 9.553030291637929e-05, + "loss": 0.5991, + "step": 2950 + }, + { + "epoch": 10.842490842490843, + "grad_norm": 0.11931636184453964, + "learning_rate": 9.492851164344865e-05, + "loss": 0.6154, + "step": 2960 + }, + { + "epoch": 10.87912087912088, + "grad_norm": 0.07273609191179276, + "learning_rate": 9.432692474993173e-05, + "loss": 0.5948, + "step": 2970 + }, + { + "epoch": 10.915750915750916, + "grad_norm": 0.12160748243331909, + "learning_rate": 9.372556430389523e-05, + "loss": 0.602, + "step": 2980 + }, + { + "epoch": 10.952380952380953, + "grad_norm": 0.14035232365131378, + "learning_rate": 9.312445236509911e-05, + "loss": 0.5985, + "step": 2990 + }, + { + "epoch": 10.989010989010989, + "grad_norm": 0.12618041038513184, + "learning_rate": 9.252361098418715e-05, + "loss": 0.6169, + "step": 3000 + }, + { + "epoch": 11.025641025641026, + "grad_norm": 0.1227792352437973, + "learning_rate": 9.192306220187838e-05, + "loss": 0.5856, + "step": 3010 + }, + { + "epoch": 11.062271062271062, + "grad_norm": 0.12947812676429749, + "learning_rate": 9.132282804815835e-05, + "loss": 0.5869, + "step": 3020 + }, + { + "epoch": 11.098901098901099, + "grad_norm": 0.14743156731128693, + "learning_rate": 9.072293054147112e-05, + "loss": 0.6026, + "step": 3030 + }, + { + "epoch": 11.135531135531135, + "grad_norm": 0.10158231854438782, + "learning_rate": 9.012339168791138e-05, + "loss": 0.604, + "step": 3040 + }, + { + "epoch": 11.172161172161172, + "grad_norm": 0.10932044684886932, + "learning_rate": 8.95242334804174e-05, + "loss": 0.6072, + "step": 3050 + }, + { + "epoch": 11.208791208791208, + "grad_norm": 0.10173647105693817, + "learning_rate": 8.892547789796422e-05, + "loss": 0.6042, + "step": 3060 + }, + { + "epoch": 11.245421245421245, + "grad_norm": 0.11170593649148941, + "learning_rate": 8.832714690475722e-05, + "loss": 0.5958, + "step": 3070 + }, + { + "epoch": 11.282051282051283, + "grad_norm": 0.1408560425043106, + "learning_rate": 8.77292624494266e-05, + "loss": 0.5912, + "step": 3080 + }, + { + "epoch": 11.31868131868132, + "grad_norm": 0.08870992064476013, + "learning_rate": 8.71318464642222e-05, + "loss": 0.593, + "step": 3090 + }, + { + "epoch": 11.355311355311356, + "grad_norm": 0.11843115091323853, + "learning_rate": 8.653492086420894e-05, + "loss": 0.6056, + "step": 3100 + }, + { + "epoch": 11.391941391941392, + "grad_norm": 0.09680238366127014, + "learning_rate": 8.593850754646276e-05, + "loss": 0.6032, + "step": 3110 + }, + { + "epoch": 11.428571428571429, + "grad_norm": 0.08638516068458557, + "learning_rate": 8.534262838926766e-05, + "loss": 0.5969, + "step": 3120 + }, + { + "epoch": 11.465201465201465, + "grad_norm": 0.10992743074893951, + "learning_rate": 8.474730525131291e-05, + "loss": 0.6078, + "step": 3130 + }, + { + "epoch": 11.501831501831502, + "grad_norm": 0.12074992805719376, + "learning_rate": 8.415255997089129e-05, + "loss": 0.6033, + "step": 3140 + }, + { + "epoch": 11.538461538461538, + "grad_norm": 0.14779578149318695, + "learning_rate": 8.355841436509795e-05, + "loss": 0.5936, + "step": 3150 + }, + { + "epoch": 11.575091575091575, + "grad_norm": 0.1313851773738861, + "learning_rate": 8.29648902290302e-05, + "loss": 0.5947, + "step": 3160 + }, + { + "epoch": 11.611721611721611, + "grad_norm": 0.12997916340827942, + "learning_rate": 8.23720093349878e-05, + "loss": 0.5981, + "step": 3170 + }, + { + "epoch": 11.648351648351648, + "grad_norm": 0.1216355711221695, + "learning_rate": 8.177979343167449e-05, + "loss": 0.6024, + "step": 3180 + }, + { + "epoch": 11.684981684981684, + "grad_norm": 0.12574613094329834, + "learning_rate": 8.118826424340013e-05, + "loss": 0.5986, + "step": 3190 + }, + { + "epoch": 11.72161172161172, + "grad_norm": 0.13364483416080475, + "learning_rate": 8.05974434692836e-05, + "loss": 0.6075, + "step": 3200 + }, + { + "epoch": 11.758241758241759, + "grad_norm": 0.10564299672842026, + "learning_rate": 8.000735278245708e-05, + "loss": 0.5886, + "step": 3210 + }, + { + "epoch": 11.794871794871796, + "grad_norm": 0.12722384929656982, + "learning_rate": 7.941801382927094e-05, + "loss": 0.5962, + "step": 3220 + }, + { + "epoch": 11.831501831501832, + "grad_norm": 0.09655703604221344, + "learning_rate": 7.882944822849946e-05, + "loss": 0.5918, + "step": 3230 + }, + { + "epoch": 11.868131868131869, + "grad_norm": 0.10955348610877991, + "learning_rate": 7.82416775705481e-05, + "loss": 0.5958, + "step": 3240 + }, + { + "epoch": 11.904761904761905, + "grad_norm": 0.08911681175231934, + "learning_rate": 7.765472341666133e-05, + "loss": 0.5923, + "step": 3250 + }, + { + "epoch": 11.941391941391942, + "grad_norm": 0.11370093375444412, + "learning_rate": 7.706860729813177e-05, + "loss": 0.6032, + "step": 3260 + }, + { + "epoch": 11.978021978021978, + "grad_norm": 0.1376587450504303, + "learning_rate": 7.648335071551016e-05, + "loss": 0.582, + "step": 3270 + }, + { + "epoch": 12.014652014652015, + "grad_norm": 0.09854473918676376, + "learning_rate": 7.589897513781698e-05, + "loss": 0.5944, + "step": 3280 + }, + { + "epoch": 12.051282051282051, + "grad_norm": 0.10749837011098862, + "learning_rate": 7.531550200175465e-05, + "loss": 0.5958, + "step": 3290 + }, + { + "epoch": 12.087912087912088, + "grad_norm": 0.11621621996164322, + "learning_rate": 7.473295271092125e-05, + "loss": 0.601, + "step": 3300 + }, + { + "epoch": 12.124542124542124, + "grad_norm": 0.15026544034481049, + "learning_rate": 7.415134863502532e-05, + "loss": 0.5934, + "step": 3310 + }, + { + "epoch": 12.16117216117216, + "grad_norm": 0.09685179591178894, + "learning_rate": 7.35707111091021e-05, + "loss": 0.6051, + "step": 3320 + }, + { + "epoch": 12.197802197802197, + "grad_norm": 0.09188985824584961, + "learning_rate": 7.299106143273067e-05, + "loss": 0.5978, + "step": 3330 + }, + { + "epoch": 12.234432234432234, + "grad_norm": 0.10112758725881577, + "learning_rate": 7.24124208692528e-05, + "loss": 0.5981, + "step": 3340 + }, + { + "epoch": 12.271062271062272, + "grad_norm": 0.10160616040229797, + "learning_rate": 7.18348106449929e-05, + "loss": 0.5923, + "step": 3350 + }, + { + "epoch": 12.307692307692308, + "grad_norm": 0.12250513583421707, + "learning_rate": 7.125825194847929e-05, + "loss": 0.5819, + "step": 3360 + }, + { + "epoch": 12.344322344322345, + "grad_norm": 0.12580272555351257, + "learning_rate": 7.068276592966702e-05, + "loss": 0.5921, + "step": 3370 + }, + { + "epoch": 12.380952380952381, + "grad_norm": 0.09846355020999908, + "learning_rate": 7.010837369916204e-05, + "loss": 0.6016, + "step": 3380 + }, + { + "epoch": 12.417582417582418, + "grad_norm": 0.1316191852092743, + "learning_rate": 6.953509632744678e-05, + "loss": 0.5908, + "step": 3390 + }, + { + "epoch": 12.454212454212454, + "grad_norm": 0.10895272344350815, + "learning_rate": 6.896295484410712e-05, + "loss": 0.5953, + "step": 3400 + }, + { + "epoch": 12.49084249084249, + "grad_norm": 0.1075824722647667, + "learning_rate": 6.839197023706109e-05, + "loss": 0.5882, + "step": 3410 + }, + { + "epoch": 12.527472527472527, + "grad_norm": 0.09522432088851929, + "learning_rate": 6.782216345178898e-05, + "loss": 0.5873, + "step": 3420 + }, + { + "epoch": 12.564102564102564, + "grad_norm": 0.11377083510160446, + "learning_rate": 6.725355539056478e-05, + "loss": 0.6092, + "step": 3430 + }, + { + "epoch": 12.6007326007326, + "grad_norm": 0.10006774961948395, + "learning_rate": 6.668616691168976e-05, + "loss": 0.5863, + "step": 3440 + }, + { + "epoch": 12.637362637362637, + "grad_norm": 0.10102878510951996, + "learning_rate": 6.612001882872705e-05, + "loss": 0.5885, + "step": 3450 + }, + { + "epoch": 12.673992673992673, + "grad_norm": 0.09821674972772598, + "learning_rate": 6.555513190973818e-05, + "loss": 0.5988, + "step": 3460 + }, + { + "epoch": 12.71062271062271, + "grad_norm": 0.09962121397256851, + "learning_rate": 6.499152687652136e-05, + "loss": 0.5859, + "step": 3470 + }, + { + "epoch": 12.747252747252748, + "grad_norm": 0.1359899640083313, + "learning_rate": 6.442922440385134e-05, + "loss": 0.5915, + "step": 3480 + }, + { + "epoch": 12.783882783882785, + "grad_norm": 0.10209459066390991, + "learning_rate": 6.386824511872068e-05, + "loss": 0.6009, + "step": 3490 + }, + { + "epoch": 12.820512820512821, + "grad_norm": 0.10285753011703491, + "learning_rate": 6.330860959958354e-05, + "loss": 0.5894, + "step": 3500 + }, + { + "epoch": 12.857142857142858, + "grad_norm": 0.10430514812469482, + "learning_rate": 6.275033837560054e-05, + "loss": 0.5952, + "step": 3510 + }, + { + "epoch": 12.893772893772894, + "grad_norm": 0.09391837567090988, + "learning_rate": 6.219345192588576e-05, + "loss": 0.6002, + "step": 3520 + }, + { + "epoch": 12.93040293040293, + "grad_norm": 0.11428876221179962, + "learning_rate": 6.163797067875537e-05, + "loss": 0.6024, + "step": 3530 + }, + { + "epoch": 12.967032967032967, + "grad_norm": 0.09616833925247192, + "learning_rate": 6.108391501097848e-05, + "loss": 0.5969, + "step": 3540 + }, + { + "epoch": 13.003663003663004, + "grad_norm": 0.111970454454422, + "learning_rate": 6.053130524702956e-05, + "loss": 0.592, + "step": 3550 + }, + { + "epoch": 13.04029304029304, + "grad_norm": 0.14766483008861542, + "learning_rate": 5.99801616583427e-05, + "loss": 0.6047, + "step": 3560 + }, + { + "epoch": 13.076923076923077, + "grad_norm": 0.09645646065473557, + "learning_rate": 5.9430504462568315e-05, + "loss": 0.5963, + "step": 3570 + }, + { + "epoch": 13.113553113553113, + "grad_norm": 0.10448257625102997, + "learning_rate": 5.8882353822831294e-05, + "loss": 0.6018, + "step": 3580 + }, + { + "epoch": 13.15018315018315, + "grad_norm": 0.09417391568422318, + "learning_rate": 5.8335729846991304e-05, + "loss": 0.5873, + "step": 3590 + }, + { + "epoch": 13.186813186813186, + "grad_norm": 0.12039947509765625, + "learning_rate": 5.779065258690537e-05, + "loss": 0.5941, + "step": 3600 + }, + { + "epoch": 13.223443223443223, + "grad_norm": 0.17030514776706696, + "learning_rate": 5.724714203769218e-05, + "loss": 0.5967, + "step": 3610 + }, + { + "epoch": 13.260073260073261, + "grad_norm": 0.10252691805362701, + "learning_rate": 5.670521813699856e-05, + "loss": 0.5962, + "step": 3620 + }, + { + "epoch": 13.296703296703297, + "grad_norm": 0.10711918771266937, + "learning_rate": 5.6164900764268235e-05, + "loss": 0.6012, + "step": 3630 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 0.11735249310731888, + "learning_rate": 5.56262097400125e-05, + "loss": 0.5935, + "step": 3640 + }, + { + "epoch": 13.36996336996337, + "grad_norm": 0.09693988412618637, + "learning_rate": 5.508916482508318e-05, + "loss": 0.6063, + "step": 3650 + }, + { + "epoch": 13.406593406593407, + "grad_norm": 0.1074470654129982, + "learning_rate": 5.455378571994764e-05, + "loss": 0.5857, + "step": 3660 + }, + { + "epoch": 13.443223443223443, + "grad_norm": 0.15257665514945984, + "learning_rate": 5.402009206396627e-05, + "loss": 0.5926, + "step": 3670 + }, + { + "epoch": 13.47985347985348, + "grad_norm": 0.12574537098407745, + "learning_rate": 5.348810343467197e-05, + "loss": 0.5909, + "step": 3680 + }, + { + "epoch": 13.516483516483516, + "grad_norm": 0.1099851056933403, + "learning_rate": 5.295783934705201e-05, + "loss": 0.5953, + "step": 3690 + }, + { + "epoch": 13.553113553113553, + "grad_norm": 0.09711455553770065, + "learning_rate": 5.2429319252832045e-05, + "loss": 0.6059, + "step": 3700 + }, + { + "epoch": 13.58974358974359, + "grad_norm": 0.10976620763540268, + "learning_rate": 5.1902562539762736e-05, + "loss": 0.5806, + "step": 3710 + }, + { + "epoch": 13.626373626373626, + "grad_norm": 0.1407400518655777, + "learning_rate": 5.137758853090848e-05, + "loss": 0.589, + "step": 3720 + }, + { + "epoch": 13.663003663003662, + "grad_norm": 0.09327317029237747, + "learning_rate": 5.085441648393856e-05, + "loss": 0.5897, + "step": 3730 + }, + { + "epoch": 13.699633699633699, + "grad_norm": 0.1264810413122177, + "learning_rate": 5.033306559042075e-05, + "loss": 0.5972, + "step": 3740 + }, + { + "epoch": 13.736263736263737, + "grad_norm": 0.12380846589803696, + "learning_rate": 4.98135549751172e-05, + "loss": 0.6, + "step": 3750 + }, + { + "epoch": 13.772893772893774, + "grad_norm": 0.12347488850355148, + "learning_rate": 4.929590369528302e-05, + "loss": 0.5797, + "step": 3760 + }, + { + "epoch": 13.80952380952381, + "grad_norm": 0.11884108930826187, + "learning_rate": 4.8780130739967216e-05, + "loss": 0.5931, + "step": 3770 + }, + { + "epoch": 13.846153846153847, + "grad_norm": 0.11143997311592102, + "learning_rate": 4.826625502931597e-05, + "loss": 0.5803, + "step": 3780 + }, + { + "epoch": 13.882783882783883, + "grad_norm": 0.10896267741918564, + "learning_rate": 4.775429541387864e-05, + "loss": 0.602, + "step": 3790 + }, + { + "epoch": 13.91941391941392, + "grad_norm": 0.10565280169248581, + "learning_rate": 4.724427067391632e-05, + "loss": 0.5949, + "step": 3800 + }, + { + "epoch": 13.956043956043956, + "grad_norm": 0.12976156175136566, + "learning_rate": 4.673619951871299e-05, + "loss": 0.5924, + "step": 3810 + }, + { + "epoch": 13.992673992673993, + "grad_norm": 0.1173262670636177, + "learning_rate": 4.6230100585888905e-05, + "loss": 0.594, + "step": 3820 + }, + { + "epoch": 14.02930402930403, + "grad_norm": 0.12908582389354706, + "learning_rate": 4.572599244071726e-05, + "loss": 0.5992, + "step": 3830 + }, + { + "epoch": 14.065934065934066, + "grad_norm": 0.11487755179405212, + "learning_rate": 4.522389357544297e-05, + "loss": 0.5932, + "step": 3840 + }, + { + "epoch": 14.102564102564102, + "grad_norm": 0.1192779615521431, + "learning_rate": 4.472382240860431e-05, + "loss": 0.5866, + "step": 3850 + }, + { + "epoch": 14.139194139194139, + "grad_norm": 0.13515004515647888, + "learning_rate": 4.4225797284357354e-05, + "loss": 0.5834, + "step": 3860 + }, + { + "epoch": 14.175824175824175, + "grad_norm": 0.12698876857757568, + "learning_rate": 4.372983647180302e-05, + "loss": 0.5984, + "step": 3870 + }, + { + "epoch": 14.212454212454212, + "grad_norm": 0.1023423969745636, + "learning_rate": 4.323595816431687e-05, + "loss": 0.5906, + "step": 3880 + }, + { + "epoch": 14.249084249084248, + "grad_norm": 0.1128733679652214, + "learning_rate": 4.27441804788817e-05, + "loss": 0.5964, + "step": 3890 + }, + { + "epoch": 14.285714285714286, + "grad_norm": 0.15929995477199554, + "learning_rate": 4.225452145542305e-05, + "loss": 0.5722, + "step": 3900 + }, + { + "epoch": 14.322344322344323, + "grad_norm": 0.12099599838256836, + "learning_rate": 4.1766999056147435e-05, + "loss": 0.5987, + "step": 3910 + }, + { + "epoch": 14.35897435897436, + "grad_norm": 0.12424413114786148, + "learning_rate": 4.1281631164883264e-05, + "loss": 0.5856, + "step": 3920 + }, + { + "epoch": 14.395604395604396, + "grad_norm": 0.1399509310722351, + "learning_rate": 4.079843558642504e-05, + "loss": 0.5883, + "step": 3930 + }, + { + "epoch": 14.432234432234432, + "grad_norm": 0.11879005283117294, + "learning_rate": 4.031743004588009e-05, + "loss": 0.595, + "step": 3940 + }, + { + "epoch": 14.468864468864469, + "grad_norm": 0.11288290470838547, + "learning_rate": 3.9838632188018345e-05, + "loss": 0.5859, + "step": 3950 + }, + { + "epoch": 14.505494505494505, + "grad_norm": 0.08327735960483551, + "learning_rate": 3.936205957662514e-05, + "loss": 0.6001, + "step": 3960 + }, + { + "epoch": 14.542124542124542, + "grad_norm": 0.11755986511707306, + "learning_rate": 3.888772969385695e-05, + "loss": 0.5834, + "step": 3970 + }, + { + "epoch": 14.578754578754578, + "grad_norm": 0.13514429330825806, + "learning_rate": 3.8415659939599886e-05, + "loss": 0.6095, + "step": 3980 + }, + { + "epoch": 14.615384615384615, + "grad_norm": 0.12506259977817535, + "learning_rate": 3.79458676308317e-05, + "loss": 0.5889, + "step": 3990 + }, + { + "epoch": 14.652014652014651, + "grad_norm": 0.10506050288677216, + "learning_rate": 3.747837000098633e-05, + "loss": 0.5969, + "step": 4000 + }, + { + "epoch": 14.688644688644688, + "grad_norm": 0.12048743665218353, + "learning_rate": 3.7013184199321865e-05, + "loss": 0.5928, + "step": 4010 + }, + { + "epoch": 14.725274725274724, + "grad_norm": 0.10874942690134048, + "learning_rate": 3.655032729029127e-05, + "loss": 0.5942, + "step": 4020 + }, + { + "epoch": 14.761904761904763, + "grad_norm": 0.12899090349674225, + "learning_rate": 3.608981625291665e-05, + "loss": 0.5906, + "step": 4030 + }, + { + "epoch": 14.7985347985348, + "grad_norm": 0.10427646338939667, + "learning_rate": 3.563166798016624e-05, + "loss": 0.5942, + "step": 4040 + }, + { + "epoch": 14.835164835164836, + "grad_norm": 0.08348722010850906, + "learning_rate": 3.517589927833476e-05, + "loss": 0.6057, + "step": 4050 + }, + { + "epoch": 14.871794871794872, + "grad_norm": 0.1019279882311821, + "learning_rate": 3.472252686642692e-05, + "loss": 0.5883, + "step": 4060 + }, + { + "epoch": 14.908424908424909, + "grad_norm": 0.13301706314086914, + "learning_rate": 3.427156737554416e-05, + "loss": 0.5862, + "step": 4070 + }, + { + "epoch": 14.945054945054945, + "grad_norm": 0.07168400287628174, + "learning_rate": 3.382303734827438e-05, + "loss": 0.5917, + "step": 4080 + }, + { + "epoch": 14.981684981684982, + "grad_norm": 0.13180795311927795, + "learning_rate": 3.337695323808538e-05, + "loss": 0.5866, + "step": 4090 + }, + { + "epoch": 15.018315018315018, + "grad_norm": 0.11809457838535309, + "learning_rate": 3.293333140872115e-05, + "loss": 0.5933, + "step": 4100 + }, + { + "epoch": 15.054945054945055, + "grad_norm": 0.15424320101737976, + "learning_rate": 3.24921881336015e-05, + "loss": 0.5883, + "step": 4110 + }, + { + "epoch": 15.091575091575091, + "grad_norm": 0.10090993344783783, + "learning_rate": 3.205353959522531e-05, + "loss": 0.5962, + "step": 4120 + }, + { + "epoch": 15.128205128205128, + "grad_norm": 0.10715416073799133, + "learning_rate": 3.161740188457681e-05, + "loss": 0.5892, + "step": 4130 + }, + { + "epoch": 15.164835164835164, + "grad_norm": 0.11065078526735306, + "learning_rate": 3.118379100053528e-05, + "loss": 0.5763, + "step": 4140 + }, + { + "epoch": 15.2014652014652, + "grad_norm": 0.0856284573674202, + "learning_rate": 3.075272284928813e-05, + "loss": 0.5919, + "step": 4150 + }, + { + "epoch": 15.238095238095237, + "grad_norm": 0.10988280922174454, + "learning_rate": 3.0324213243747567e-05, + "loss": 0.6042, + "step": 4160 + }, + { + "epoch": 15.274725274725276, + "grad_norm": 0.12171415239572525, + "learning_rate": 2.989827790297043e-05, + "loss": 0.5836, + "step": 4170 + }, + { + "epoch": 15.311355311355312, + "grad_norm": 0.16990408301353455, + "learning_rate": 2.947493245158146e-05, + "loss": 0.6058, + "step": 4180 + }, + { + "epoch": 15.347985347985349, + "grad_norm": 0.12295118719339371, + "learning_rate": 2.9054192419200365e-05, + "loss": 0.6056, + "step": 4190 + }, + { + "epoch": 15.384615384615385, + "grad_norm": 0.10608595609664917, + "learning_rate": 2.863607323987203e-05, + "loss": 0.5909, + "step": 4200 + }, + { + "epoch": 15.421245421245422, + "grad_norm": 0.12604480981826782, + "learning_rate": 2.8220590251500228e-05, + "loss": 0.5878, + "step": 4210 + }, + { + "epoch": 15.457875457875458, + "grad_norm": 0.11635804176330566, + "learning_rate": 2.780775869528522e-05, + "loss": 0.5818, + "step": 4220 + }, + { + "epoch": 15.494505494505495, + "grad_norm": 0.11485312134027481, + "learning_rate": 2.7397593715164548e-05, + "loss": 0.5871, + "step": 4230 + }, + { + "epoch": 15.531135531135531, + "grad_norm": 0.13255690038204193, + "learning_rate": 2.699011035725738e-05, + "loss": 0.5851, + "step": 4240 + }, + { + "epoch": 15.567765567765568, + "grad_norm": 0.06791131943464279, + "learning_rate": 2.6585323569312786e-05, + "loss": 0.5916, + "step": 4250 + }, + { + "epoch": 15.604395604395604, + "grad_norm": 0.12282761931419373, + "learning_rate": 2.618324820016133e-05, + "loss": 0.5909, + "step": 4260 + }, + { + "epoch": 15.64102564102564, + "grad_norm": 0.08577387779951096, + "learning_rate": 2.5783898999170314e-05, + "loss": 0.5876, + "step": 4270 + }, + { + "epoch": 15.677655677655677, + "grad_norm": 0.12403818964958191, + "learning_rate": 2.538729061570274e-05, + "loss": 0.5888, + "step": 4280 + }, + { + "epoch": 15.714285714285714, + "grad_norm": 0.11415976285934448, + "learning_rate": 2.4993437598579994e-05, + "loss": 0.5947, + "step": 4290 + }, + { + "epoch": 15.750915750915752, + "grad_norm": 0.12366656213998795, + "learning_rate": 2.4602354395548148e-05, + "loss": 0.592, + "step": 4300 + }, + { + "epoch": 15.787545787545788, + "grad_norm": 0.09358074516057968, + "learning_rate": 2.421405535274781e-05, + "loss": 0.609, + "step": 4310 + }, + { + "epoch": 15.824175824175825, + "grad_norm": 0.08951131254434586, + "learning_rate": 2.3828554714188076e-05, + "loss": 0.5835, + "step": 4320 + }, + { + "epoch": 15.860805860805861, + "grad_norm": 0.1570052057504654, + "learning_rate": 2.3445866621223954e-05, + "loss": 0.5947, + "step": 4330 + }, + { + "epoch": 15.897435897435898, + "grad_norm": 0.09661392867565155, + "learning_rate": 2.3066005112037474e-05, + "loss": 0.5992, + "step": 4340 + }, + { + "epoch": 15.934065934065934, + "grad_norm": 0.11398851126432419, + "learning_rate": 2.2688984121122932e-05, + "loss": 0.5893, + "step": 4350 + }, + { + "epoch": 15.97069597069597, + "grad_norm": 0.15302923321723938, + "learning_rate": 2.2314817478775594e-05, + "loss": 0.6001, + "step": 4360 + }, + { + "epoch": 16.007326007326007, + "grad_norm": 0.09366681426763535, + "learning_rate": 2.1943518910584413e-05, + "loss": 0.5932, + "step": 4370 + }, + { + "epoch": 16.043956043956044, + "grad_norm": 0.08003173023462296, + "learning_rate": 2.157510203692851e-05, + "loss": 0.6002, + "step": 4380 + }, + { + "epoch": 16.08058608058608, + "grad_norm": 0.13794177770614624, + "learning_rate": 2.1209580372477568e-05, + "loss": 0.5839, + "step": 4390 + }, + { + "epoch": 16.117216117216117, + "grad_norm": 0.15384089946746826, + "learning_rate": 2.0846967325695974e-05, + "loss": 0.5906, + "step": 4400 + }, + { + "epoch": 16.153846153846153, + "grad_norm": 0.11059562861919403, + "learning_rate": 2.0487276198351102e-05, + "loss": 0.5955, + "step": 4410 + }, + { + "epoch": 16.19047619047619, + "grad_norm": 0.10853768140077591, + "learning_rate": 2.0130520185025272e-05, + "loss": 0.5871, + "step": 4420 + }, + { + "epoch": 16.227106227106226, + "grad_norm": 0.10565737634897232, + "learning_rate": 1.9776712372631758e-05, + "loss": 0.5824, + "step": 4430 + }, + { + "epoch": 16.263736263736263, + "grad_norm": 0.14627011120319366, + "learning_rate": 1.942586573993467e-05, + "loss": 0.5904, + "step": 4440 + }, + { + "epoch": 16.3003663003663, + "grad_norm": 0.08151823282241821, + "learning_rate": 1.907799315707292e-05, + "loss": 0.5884, + "step": 4450 + }, + { + "epoch": 16.336996336996336, + "grad_norm": 0.11814941465854645, + "learning_rate": 1.8733107385088118e-05, + "loss": 0.6046, + "step": 4460 + }, + { + "epoch": 16.373626373626372, + "grad_norm": 0.1151585578918457, + "learning_rate": 1.839122107545633e-05, + "loss": 0.5931, + "step": 4470 + }, + { + "epoch": 16.41025641025641, + "grad_norm": 0.11447068303823471, + "learning_rate": 1.805234676962415e-05, + "loss": 0.5864, + "step": 4480 + }, + { + "epoch": 16.446886446886445, + "grad_norm": 0.14155898988246918, + "learning_rate": 1.7716496898548574e-05, + "loss": 0.5807, + "step": 4490 + }, + { + "epoch": 16.483516483516482, + "grad_norm": 0.12283307313919067, + "learning_rate": 1.7383683782240874e-05, + "loss": 0.5875, + "step": 4500 + }, + { + "epoch": 16.520146520146522, + "grad_norm": 0.1172984391450882, + "learning_rate": 1.7053919629314873e-05, + "loss": 0.5904, + "step": 4510 + }, + { + "epoch": 16.55677655677656, + "grad_norm": 0.10092102736234665, + "learning_rate": 1.6727216536538968e-05, + "loss": 0.5808, + "step": 4520 + }, + { + "epoch": 16.593406593406595, + "grad_norm": 0.08429203927516937, + "learning_rate": 1.640358648839244e-05, + "loss": 0.588, + "step": 4530 + }, + { + "epoch": 16.63003663003663, + "grad_norm": 0.10441552102565765, + "learning_rate": 1.608304135662568e-05, + "loss": 0.581, + "step": 4540 + }, + { + "epoch": 16.666666666666668, + "grad_norm": 0.12228482216596603, + "learning_rate": 1.5765592899824945e-05, + "loss": 0.5886, + "step": 4550 + }, + { + "epoch": 16.703296703296704, + "grad_norm": 0.12790054082870483, + "learning_rate": 1.545125276298082e-05, + "loss": 0.6001, + "step": 4560 + }, + { + "epoch": 16.73992673992674, + "grad_norm": 0.10372098535299301, + "learning_rate": 1.5140032477061087e-05, + "loss": 0.5895, + "step": 4570 + }, + { + "epoch": 16.776556776556777, + "grad_norm": 0.09895550459623337, + "learning_rate": 1.483194345858779e-05, + "loss": 0.5913, + "step": 4580 + }, + { + "epoch": 16.813186813186814, + "grad_norm": 0.09903260320425034, + "learning_rate": 1.4526997009218429e-05, + "loss": 0.594, + "step": 4590 + }, + { + "epoch": 16.84981684981685, + "grad_norm": 0.10232320427894592, + "learning_rate": 1.4225204315331267e-05, + "loss": 0.6048, + "step": 4600 + }, + { + "epoch": 16.886446886446887, + "grad_norm": 0.12157467007637024, + "learning_rate": 1.3926576447615141e-05, + "loss": 0.591, + "step": 4610 + }, + { + "epoch": 16.923076923076923, + "grad_norm": 0.10378751158714294, + "learning_rate": 1.3631124360663295e-05, + "loss": 0.5841, + "step": 4620 + }, + { + "epoch": 16.95970695970696, + "grad_norm": 0.12422292679548264, + "learning_rate": 1.3338858892571438e-05, + "loss": 0.5997, + "step": 4630 + }, + { + "epoch": 16.996336996336996, + "grad_norm": 0.10485445708036423, + "learning_rate": 1.3049790764540315e-05, + "loss": 0.5861, + "step": 4640 + }, + { + "epoch": 17.032967032967033, + "grad_norm": 0.11898715049028397, + "learning_rate": 1.2763930580482347e-05, + "loss": 0.5898, + "step": 4650 + }, + { + "epoch": 17.06959706959707, + "grad_norm": 0.10091210901737213, + "learning_rate": 1.248128882663267e-05, + "loss": 0.6032, + "step": 4660 + }, + { + "epoch": 17.106227106227106, + "grad_norm": 0.1248786523938179, + "learning_rate": 1.220187587116436e-05, + "loss": 0.5856, + "step": 4670 + }, + { + "epoch": 17.142857142857142, + "grad_norm": 0.09647556394338608, + "learning_rate": 1.1925701963808304e-05, + "loss": 0.5955, + "step": 4680 + }, + { + "epoch": 17.17948717948718, + "grad_norm": 0.10741813480854034, + "learning_rate": 1.1652777235477038e-05, + "loss": 0.587, + "step": 4690 + }, + { + "epoch": 17.216117216117215, + "grad_norm": 0.10575282573699951, + "learning_rate": 1.1383111697893177e-05, + "loss": 0.5918, + "step": 4700 + }, + { + "epoch": 17.252747252747252, + "grad_norm": 0.11025935411453247, + "learning_rate": 1.1116715243222157e-05, + "loss": 0.5894, + "step": 4710 + }, + { + "epoch": 17.28937728937729, + "grad_norm": 0.13426312804222107, + "learning_rate": 1.0853597643709306e-05, + "loss": 0.5846, + "step": 4720 + }, + { + "epoch": 17.326007326007325, + "grad_norm": 0.14736682176589966, + "learning_rate": 1.0593768551321449e-05, + "loss": 0.5953, + "step": 4730 + }, + { + "epoch": 17.36263736263736, + "grad_norm": 0.12300019711256027, + "learning_rate": 1.03372374973928e-05, + "loss": 0.5892, + "step": 4740 + }, + { + "epoch": 17.399267399267398, + "grad_norm": 0.07489597797393799, + "learning_rate": 1.008401389227533e-05, + "loss": 0.5895, + "step": 4750 + }, + { + "epoch": 17.435897435897434, + "grad_norm": 0.09103349596261978, + "learning_rate": 9.834107024993564e-06, + "loss": 0.5897, + "step": 4760 + }, + { + "epoch": 17.47252747252747, + "grad_norm": 0.10881728678941727, + "learning_rate": 9.587526062903813e-06, + "loss": 0.5944, + "step": 4770 + }, + { + "epoch": 17.50915750915751, + "grad_norm": 0.1256643533706665, + "learning_rate": 9.34428005135792e-06, + "loss": 0.5954, + "step": 4780 + }, + { + "epoch": 17.545787545787547, + "grad_norm": 0.09786739200353622, + "learning_rate": 9.10437791337146e-06, + "loss": 0.586, + "step": 4790 + }, + { + "epoch": 17.582417582417584, + "grad_norm": 0.12305913120508194, + "learning_rate": 8.86782844929632e-06, + "loss": 0.5849, + "step": 4800 + }, + { + "epoch": 17.61904761904762, + "grad_norm": 0.094321109354496, + "learning_rate": 8.634640336498005e-06, + "loss": 0.6047, + "step": 4810 + }, + { + "epoch": 17.655677655677657, + "grad_norm": 0.10959235578775406, + "learning_rate": 8.404822129037282e-06, + "loss": 0.5845, + "step": 4820 + }, + { + "epoch": 17.692307692307693, + "grad_norm": 0.12101900577545166, + "learning_rate": 8.178382257356299e-06, + "loss": 0.5957, + "step": 4830 + }, + { + "epoch": 17.72893772893773, + "grad_norm": 0.0871628075838089, + "learning_rate": 7.955329027969454e-06, + "loss": 0.5982, + "step": 4840 + }, + { + "epoch": 17.765567765567766, + "grad_norm": 0.11070037633180618, + "learning_rate": 7.735670623158669e-06, + "loss": 0.5968, + "step": 4850 + }, + { + "epoch": 17.802197802197803, + "grad_norm": 0.12083383649587631, + "learning_rate": 7.519415100673134e-06, + "loss": 0.5903, + "step": 4860 + }, + { + "epoch": 17.83882783882784, + "grad_norm": 0.12383836507797241, + "learning_rate": 7.306570393433841e-06, + "loss": 0.5816, + "step": 4870 + }, + { + "epoch": 17.875457875457876, + "grad_norm": 0.127301886677742, + "learning_rate": 7.09714430924253e-06, + "loss": 0.5868, + "step": 4880 + }, + { + "epoch": 17.912087912087912, + "grad_norm": 0.10558178275823593, + "learning_rate": 6.891144530495308e-06, + "loss": 0.5934, + "step": 4890 + }, + { + "epoch": 17.94871794871795, + "grad_norm": 0.1014784723520279, + "learning_rate": 6.688578613900741e-06, + "loss": 0.5774, + "step": 4900 + }, + { + "epoch": 17.985347985347985, + "grad_norm": 0.09873995184898376, + "learning_rate": 6.489453990202765e-06, + "loss": 0.5889, + "step": 4910 + }, + { + "epoch": 18.021978021978022, + "grad_norm": 0.08788427710533142, + "learning_rate": 6.29377796390809e-06, + "loss": 0.6059, + "step": 4920 + }, + { + "epoch": 18.05860805860806, + "grad_norm": 0.11927574127912521, + "learning_rate": 6.101557713018109e-06, + "loss": 0.5867, + "step": 4930 + }, + { + "epoch": 18.095238095238095, + "grad_norm": 0.11732903867959976, + "learning_rate": 5.912800288765798e-06, + "loss": 0.5955, + "step": 4940 + }, + { + "epoch": 18.13186813186813, + "grad_norm": 0.10442673414945602, + "learning_rate": 5.727512615356896e-06, + "loss": 0.5861, + "step": 4950 + }, + { + "epoch": 18.168498168498168, + "grad_norm": 0.1446387618780136, + "learning_rate": 5.54570148971596e-06, + "loss": 0.5933, + "step": 4960 + }, + { + "epoch": 18.205128205128204, + "grad_norm": 0.12358086556196213, + "learning_rate": 5.367373581237032e-06, + "loss": 0.5851, + "step": 4970 + }, + { + "epoch": 18.24175824175824, + "grad_norm": 0.13660740852355957, + "learning_rate": 5.192535431539018e-06, + "loss": 0.5857, + "step": 4980 + }, + { + "epoch": 18.278388278388277, + "grad_norm": 0.16124746203422546, + "learning_rate": 5.021193454225608e-06, + "loss": 0.5818, + "step": 4990 + }, + { + "epoch": 18.315018315018314, + "grad_norm": 0.11943217366933823, + "learning_rate": 4.853353934650142e-06, + "loss": 0.5777, + "step": 5000 + }, + { + "epoch": 18.35164835164835, + "grad_norm": 0.13698545098304749, + "learning_rate": 4.689023029684991e-06, + "loss": 0.5885, + "step": 5010 + }, + { + "epoch": 18.388278388278387, + "grad_norm": 0.12822367250919342, + "learning_rate": 4.528206767495648e-06, + "loss": 0.5958, + "step": 5020 + }, + { + "epoch": 18.424908424908423, + "grad_norm": 0.09347227960824966, + "learning_rate": 4.370911047319707e-06, + "loss": 0.5865, + "step": 5030 + }, + { + "epoch": 18.46153846153846, + "grad_norm": 0.09904488921165466, + "learning_rate": 4.2171416392503395e-06, + "loss": 0.5998, + "step": 5040 + }, + { + "epoch": 18.498168498168496, + "grad_norm": 0.0935777798295021, + "learning_rate": 4.066904184024721e-06, + "loss": 0.602, + "step": 5050 + }, + { + "epoch": 18.534798534798536, + "grad_norm": 0.12104463577270508, + "learning_rate": 3.920204192817085e-06, + "loss": 0.5911, + "step": 5060 + }, + { + "epoch": 18.571428571428573, + "grad_norm": 0.10362918674945831, + "learning_rate": 3.7770470470365394e-06, + "loss": 0.5977, + "step": 5070 + }, + { + "epoch": 18.60805860805861, + "grad_norm": 0.08118361979722977, + "learning_rate": 3.6374379981296724e-06, + "loss": 0.6034, + "step": 5080 + }, + { + "epoch": 18.644688644688646, + "grad_norm": 0.11816778033971786, + "learning_rate": 3.5013821673879024e-06, + "loss": 0.5813, + "step": 5090 + }, + { + "epoch": 18.681318681318682, + "grad_norm": 0.12376552075147629, + "learning_rate": 3.368884545759655e-06, + "loss": 0.6031, + "step": 5100 + }, + { + "epoch": 18.71794871794872, + "grad_norm": 0.1341128796339035, + "learning_rate": 3.2399499936672143e-06, + "loss": 0.5848, + "step": 5110 + }, + { + "epoch": 18.754578754578755, + "grad_norm": 0.108884297311306, + "learning_rate": 3.1145832408284605e-06, + "loss": 0.5881, + "step": 5120 + }, + { + "epoch": 18.791208791208792, + "grad_norm": 0.12129422277212143, + "learning_rate": 2.9927888860833994e-06, + "loss": 0.5913, + "step": 5130 + }, + { + "epoch": 18.82783882783883, + "grad_norm": 0.12739436328411102, + "learning_rate": 2.8745713972254186e-06, + "loss": 0.5986, + "step": 5140 + }, + { + "epoch": 18.864468864468865, + "grad_norm": 0.11512988805770874, + "learning_rate": 2.759935110837394e-06, + "loss": 0.5859, + "step": 5150 + }, + { + "epoch": 18.9010989010989, + "grad_norm": 0.09803087264299393, + "learning_rate": 2.6488842321326404e-06, + "loss": 0.5956, + "step": 5160 + }, + { + "epoch": 18.937728937728938, + "grad_norm": 0.1529194414615631, + "learning_rate": 2.541422834800623e-06, + "loss": 0.587, + "step": 5170 + }, + { + "epoch": 18.974358974358974, + "grad_norm": 0.13892097771167755, + "learning_rate": 2.437554860857573e-06, + "loss": 0.593, + "step": 5180 + }, + { + "epoch": 19.01098901098901, + "grad_norm": 0.14193354547023773, + "learning_rate": 2.337284120501794e-06, + "loss": 0.59, + "step": 5190 + }, + { + "epoch": 19.047619047619047, + "grad_norm": 0.12481102347373962, + "learning_rate": 2.2406142919739795e-06, + "loss": 0.6003, + "step": 5200 + }, + { + "epoch": 19.084249084249084, + "grad_norm": 0.09259341657161713, + "learning_rate": 2.147548921422245e-06, + "loss": 0.5781, + "step": 5210 + }, + { + "epoch": 19.12087912087912, + "grad_norm": 0.12095209211111069, + "learning_rate": 2.058091422772005e-06, + "loss": 0.579, + "step": 5220 + }, + { + "epoch": 19.157509157509157, + "grad_norm": 0.12661872804164886, + "learning_rate": 1.9722450776008397e-06, + "loss": 0.5917, + "step": 5230 + }, + { + "epoch": 19.194139194139193, + "grad_norm": 0.13092070817947388, + "learning_rate": 1.8900130350180266e-06, + "loss": 0.5867, + "step": 5240 + }, + { + "epoch": 19.23076923076923, + "grad_norm": 0.12919223308563232, + "learning_rate": 1.8113983115490504e-06, + "loss": 0.6015, + "step": 5250 + }, + { + "epoch": 19.267399267399266, + "grad_norm": 0.11888205260038376, + "learning_rate": 1.7364037910249573e-06, + "loss": 0.5846, + "step": 5260 + }, + { + "epoch": 19.304029304029303, + "grad_norm": 0.11549215018749237, + "learning_rate": 1.6650322244765616e-06, + "loss": 0.5927, + "step": 5270 + }, + { + "epoch": 19.34065934065934, + "grad_norm": 0.1435500979423523, + "learning_rate": 1.5972862300335327e-06, + "loss": 0.5918, + "step": 5280 + }, + { + "epoch": 19.377289377289376, + "grad_norm": 0.10421594977378845, + "learning_rate": 1.5331682928283338e-06, + "loss": 0.6008, + "step": 5290 + }, + { + "epoch": 19.413919413919412, + "grad_norm": 0.10365405678749084, + "learning_rate": 1.4726807649050746e-06, + "loss": 0.5876, + "step": 5300 + }, + { + "epoch": 19.45054945054945, + "grad_norm": 0.1124468669295311, + "learning_rate": 1.4158258651332599e-06, + "loss": 0.603, + "step": 5310 + }, + { + "epoch": 19.487179487179485, + "grad_norm": 0.07424788922071457, + "learning_rate": 1.3626056791263295e-06, + "loss": 0.5959, + "step": 5320 + }, + { + "epoch": 19.523809523809526, + "grad_norm": 0.12210869044065475, + "learning_rate": 1.3130221591652044e-06, + "loss": 0.5884, + "step": 5330 + }, + { + "epoch": 19.560439560439562, + "grad_norm": 0.14640656113624573, + "learning_rate": 1.267077124126661e-06, + "loss": 0.5899, + "step": 5340 + }, + { + "epoch": 19.5970695970696, + "grad_norm": 0.11317311227321625, + "learning_rate": 1.2247722594165962e-06, + "loss": 0.5853, + "step": 5350 + }, + { + "epoch": 19.633699633699635, + "grad_norm": 0.12382689118385315, + "learning_rate": 1.1861091169081995e-06, + "loss": 0.6015, + "step": 5360 + }, + { + "epoch": 19.67032967032967, + "grad_norm": 0.12879708409309387, + "learning_rate": 1.1510891148850406e-06, + "loss": 0.5851, + "step": 5370 + }, + { + "epoch": 19.706959706959708, + "grad_norm": 0.18461808562278748, + "learning_rate": 1.1197135379890394e-06, + "loss": 0.5805, + "step": 5380 + }, + { + "epoch": 19.743589743589745, + "grad_norm": 0.09487687051296234, + "learning_rate": 1.0919835371733294e-06, + "loss": 0.5875, + "step": 5390 + }, + { + "epoch": 19.78021978021978, + "grad_norm": 0.1067289263010025, + "learning_rate": 1.0679001296600698e-06, + "loss": 0.5817, + "step": 5400 + }, + { + "epoch": 19.816849816849818, + "grad_norm": 0.12654584646224976, + "learning_rate": 1.0474641989030647e-06, + "loss": 0.5846, + "step": 5410 + }, + { + "epoch": 19.853479853479854, + "grad_norm": 0.09692192822694778, + "learning_rate": 1.0306764945554389e-06, + "loss": 0.5921, + "step": 5420 + }, + { + "epoch": 19.89010989010989, + "grad_norm": 0.10969322174787521, + "learning_rate": 1.017537632442055e-06, + "loss": 0.5885, + "step": 5430 + }, + { + "epoch": 19.926739926739927, + "grad_norm": 0.089121513068676, + "learning_rate": 1.0080480945370008e-06, + "loss": 0.601, + "step": 5440 + }, + { + "epoch": 19.963369963369964, + "grad_norm": 0.08970560878515244, + "learning_rate": 1.002208228945858e-06, + "loss": 0.5843, + "step": 5450 + }, + { + "epoch": 20.0, + "grad_norm": 0.13014668226242065, + "learning_rate": 1.0000182498929442e-06, + "loss": 0.5894, + "step": 5460 + }, + { + "epoch": 20.0, + "step": 5460, + "total_flos": 6.021259110266175e+18, + "train_loss": 0.6025372538374457, + "train_runtime": 24164.0932, + "train_samples_per_second": 86.611, + "train_steps_per_second": 0.226 + } + ], + "logging_steps": 10, + "max_steps": 5460, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.021259110266175e+18, + "train_batch_size": 24, + "trial_name": null, + "trial_params": null +}