diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.56, + "eval_steps": 500, + "global_step": 8000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 4.3523268699646, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.7342, + "step": 10 + }, + { + "epoch": 0.0064, + "grad_norm": 4.409250259399414, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 20 + }, + { + "epoch": 0.0096, + "grad_norm": 6.984313488006592, + "learning_rate": 3e-06, + "loss": 0.6975, + "step": 30 + }, + { + "epoch": 0.0128, + "grad_norm": 4.685063362121582, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6833, + "step": 40 + }, + { + "epoch": 0.016, + "grad_norm": 8.828569412231445, + "learning_rate": 5e-06, + "loss": 0.7077, + "step": 50 + }, + { + "epoch": 0.0192, + "grad_norm": 5.112845420837402, + "learning_rate": 6e-06, + "loss": 0.66, + "step": 60 + }, + { + "epoch": 0.0224, + "grad_norm": 6.451657772064209, + "learning_rate": 7.000000000000001e-06, + "loss": 0.6894, + "step": 70 + }, + { + "epoch": 0.0256, + "grad_norm": 8.004484176635742, + "learning_rate": 8.000000000000001e-06, + "loss": 0.6722, + "step": 80 + }, + { + "epoch": 0.0288, + "grad_norm": 4.263449668884277, + "learning_rate": 9e-06, + "loss": 0.6639, + "step": 90 + }, + { + "epoch": 0.032, + "grad_norm": 6.908220291137695, + "learning_rate": 1e-05, + "loss": 0.6664, + "step": 100 + }, + { + "epoch": 0.0352, + "grad_norm": 9.965716361999512, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.6425, + "step": 110 + }, + { + "epoch": 0.0384, + "grad_norm": 9.22775650024414, + "learning_rate": 1.2e-05, + "loss": 0.5719, + "step": 120 + }, + { + "epoch": 0.0416, + "grad_norm": 8.442060470581055, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.5148, + "step": 130 + }, + { + "epoch": 0.0448, + "grad_norm": 3.89926815032959, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.4995, + "step": 140 + }, + { + "epoch": 0.048, + "grad_norm": 10.715374946594238, + "learning_rate": 1.5e-05, + "loss": 0.445, + "step": 150 + }, + { + "epoch": 0.0512, + "grad_norm": 9.363809585571289, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.5027, + "step": 160 + }, + { + "epoch": 0.0544, + "grad_norm": 12.60737419128418, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.3606, + "step": 170 + }, + { + "epoch": 0.0576, + "grad_norm": 6.228287696838379, + "learning_rate": 1.8e-05, + "loss": 0.4815, + "step": 180 + }, + { + "epoch": 0.0608, + "grad_norm": 21.15777015686035, + "learning_rate": 1.9e-05, + "loss": 0.3848, + "step": 190 + }, + { + "epoch": 0.064, + "grad_norm": 7.5884809494018555, + "learning_rate": 2e-05, + "loss": 0.4474, + "step": 200 + }, + { + "epoch": 0.0672, + "grad_norm": 14.817816734313965, + "learning_rate": 2.1e-05, + "loss": 0.3291, + "step": 210 + }, + { + "epoch": 0.0704, + "grad_norm": 25.30421257019043, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.4788, + "step": 220 + }, + { + "epoch": 0.0736, + "grad_norm": 5.897189617156982, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.4708, + "step": 230 + }, + { + "epoch": 0.0768, + "grad_norm": 5.656806468963623, + "learning_rate": 2.4e-05, + "loss": 0.4214, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 39.75941467285156, + "learning_rate": 2.5e-05, + "loss": 0.5151, + "step": 250 + }, + { + "epoch": 0.0832, + "grad_norm": 9.505982398986816, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.5456, + "step": 260 + }, + { + "epoch": 0.0864, + "grad_norm": 9.837905883789062, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.3981, + "step": 270 + }, + { + "epoch": 0.0896, + "grad_norm": 6.425085544586182, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.3444, + "step": 280 + }, + { + "epoch": 0.0928, + "grad_norm": 8.601673126220703, + "learning_rate": 2.9e-05, + "loss": 0.3116, + "step": 290 + }, + { + "epoch": 0.096, + "grad_norm": 24.99056625366211, + "learning_rate": 3e-05, + "loss": 0.4125, + "step": 300 + }, + { + "epoch": 0.0992, + "grad_norm": 4.368201732635498, + "learning_rate": 3.1e-05, + "loss": 0.2946, + "step": 310 + }, + { + "epoch": 0.1024, + "grad_norm": 7.49916934967041, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.4568, + "step": 320 + }, + { + "epoch": 0.1056, + "grad_norm": 6.2486138343811035, + "learning_rate": 3.3e-05, + "loss": 0.4596, + "step": 330 + }, + { + "epoch": 0.1088, + "grad_norm": 5.9687886238098145, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.4197, + "step": 340 + }, + { + "epoch": 0.112, + "grad_norm": 5.545505046844482, + "learning_rate": 3.5e-05, + "loss": 0.3072, + "step": 350 + }, + { + "epoch": 0.1152, + "grad_norm": 29.903961181640625, + "learning_rate": 3.6e-05, + "loss": 0.5313, + "step": 360 + }, + { + "epoch": 0.1184, + "grad_norm": 7.169201850891113, + "learning_rate": 3.7e-05, + "loss": 0.4665, + "step": 370 + }, + { + "epoch": 0.1216, + "grad_norm": 11.079299926757812, + "learning_rate": 3.8e-05, + "loss": 0.5497, + "step": 380 + }, + { + "epoch": 0.1248, + "grad_norm": 4.827323913574219, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.4849, + "step": 390 + }, + { + "epoch": 0.128, + "grad_norm": 6.925987720489502, + "learning_rate": 4e-05, + "loss": 0.4411, + "step": 400 + }, + { + "epoch": 0.1312, + "grad_norm": 8.159820556640625, + "learning_rate": 4.1e-05, + "loss": 0.4872, + "step": 410 + }, + { + "epoch": 0.1344, + "grad_norm": 10.454991340637207, + "learning_rate": 4.2e-05, + "loss": 0.3407, + "step": 420 + }, + { + "epoch": 0.1376, + "grad_norm": 7.866086959838867, + "learning_rate": 4.3e-05, + "loss": 0.5081, + "step": 430 + }, + { + "epoch": 0.1408, + "grad_norm": 11.918012619018555, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.5015, + "step": 440 + }, + { + "epoch": 0.144, + "grad_norm": 14.668400764465332, + "learning_rate": 4.5e-05, + "loss": 0.3979, + "step": 450 + }, + { + "epoch": 0.1472, + "grad_norm": 13.602070808410645, + "learning_rate": 4.600000000000001e-05, + "loss": 0.4356, + "step": 460 + }, + { + "epoch": 0.1504, + "grad_norm": 16.491836547851562, + "learning_rate": 4.7e-05, + "loss": 0.5188, + "step": 470 + }, + { + "epoch": 0.1536, + "grad_norm": 7.220741271972656, + "learning_rate": 4.8e-05, + "loss": 0.3895, + "step": 480 + }, + { + "epoch": 0.1568, + "grad_norm": 11.220433235168457, + "learning_rate": 4.9e-05, + "loss": 0.4704, + "step": 490 + }, + { + "epoch": 0.16, + "grad_norm": 19.75952911376953, + "learning_rate": 5e-05, + "loss": 0.3499, + "step": 500 + }, + { + "epoch": 0.1632, + "grad_norm": 6.53499174118042, + "learning_rate": 4.994366197183099e-05, + "loss": 0.5274, + "step": 510 + }, + { + "epoch": 0.1664, + "grad_norm": 7.6956000328063965, + "learning_rate": 4.9887323943661973e-05, + "loss": 0.3979, + "step": 520 + }, + { + "epoch": 0.1696, + "grad_norm": 21.266582489013672, + "learning_rate": 4.983098591549296e-05, + "loss": 0.3423, + "step": 530 + }, + { + "epoch": 0.1728, + "grad_norm": 1.1490899324417114, + "learning_rate": 4.9774647887323944e-05, + "loss": 0.3753, + "step": 540 + }, + { + "epoch": 0.176, + "grad_norm": 10.279012680053711, + "learning_rate": 4.971830985915493e-05, + "loss": 0.5932, + "step": 550 + }, + { + "epoch": 0.1792, + "grad_norm": 6.127996444702148, + "learning_rate": 4.966197183098592e-05, + "loss": 0.608, + "step": 560 + }, + { + "epoch": 0.1824, + "grad_norm": 6.2316718101501465, + "learning_rate": 4.96056338028169e-05, + "loss": 0.4464, + "step": 570 + }, + { + "epoch": 0.1856, + "grad_norm": 2.966583251953125, + "learning_rate": 4.954929577464789e-05, + "loss": 0.4239, + "step": 580 + }, + { + "epoch": 0.1888, + "grad_norm": 13.743029594421387, + "learning_rate": 4.949295774647887e-05, + "loss": 0.5533, + "step": 590 + }, + { + "epoch": 0.192, + "grad_norm": 18.420978546142578, + "learning_rate": 4.9436619718309864e-05, + "loss": 0.4474, + "step": 600 + }, + { + "epoch": 0.1952, + "grad_norm": 7.505041122436523, + "learning_rate": 4.938028169014084e-05, + "loss": 0.443, + "step": 610 + }, + { + "epoch": 0.1984, + "grad_norm": 12.293984413146973, + "learning_rate": 4.9323943661971835e-05, + "loss": 0.4537, + "step": 620 + }, + { + "epoch": 0.2016, + "grad_norm": 4.876405239105225, + "learning_rate": 4.926760563380282e-05, + "loss": 0.5191, + "step": 630 + }, + { + "epoch": 0.2048, + "grad_norm": 8.690363883972168, + "learning_rate": 4.9211267605633806e-05, + "loss": 0.556, + "step": 640 + }, + { + "epoch": 0.208, + "grad_norm": 15.03184700012207, + "learning_rate": 4.915492957746479e-05, + "loss": 0.3694, + "step": 650 + }, + { + "epoch": 0.2112, + "grad_norm": 5.66799259185791, + "learning_rate": 4.909859154929578e-05, + "loss": 0.5219, + "step": 660 + }, + { + "epoch": 0.2144, + "grad_norm": 12.765690803527832, + "learning_rate": 4.904225352112676e-05, + "loss": 0.4007, + "step": 670 + }, + { + "epoch": 0.2176, + "grad_norm": 3.547962188720703, + "learning_rate": 4.898591549295775e-05, + "loss": 0.2905, + "step": 680 + }, + { + "epoch": 0.2208, + "grad_norm": 3.7791709899902344, + "learning_rate": 4.8929577464788734e-05, + "loss": 0.4592, + "step": 690 + }, + { + "epoch": 0.224, + "grad_norm": 18.622026443481445, + "learning_rate": 4.887323943661972e-05, + "loss": 0.4226, + "step": 700 + }, + { + "epoch": 0.2272, + "grad_norm": 7.643071174621582, + "learning_rate": 4.8816901408450705e-05, + "loss": 0.3244, + "step": 710 + }, + { + "epoch": 0.2304, + "grad_norm": 27.778474807739258, + "learning_rate": 4.876056338028169e-05, + "loss": 0.3863, + "step": 720 + }, + { + "epoch": 0.2336, + "grad_norm": 4.5000834465026855, + "learning_rate": 4.8704225352112676e-05, + "loss": 0.3431, + "step": 730 + }, + { + "epoch": 0.2368, + "grad_norm": 44.82728576660156, + "learning_rate": 4.864788732394366e-05, + "loss": 0.4113, + "step": 740 + }, + { + "epoch": 0.24, + "grad_norm": 29.374372482299805, + "learning_rate": 4.8591549295774653e-05, + "loss": 0.5123, + "step": 750 + }, + { + "epoch": 0.2432, + "grad_norm": 5.750948429107666, + "learning_rate": 4.853521126760563e-05, + "loss": 0.3296, + "step": 760 + }, + { + "epoch": 0.2464, + "grad_norm": 15.520541191101074, + "learning_rate": 4.8478873239436624e-05, + "loss": 0.3358, + "step": 770 + }, + { + "epoch": 0.2496, + "grad_norm": 5.127716541290283, + "learning_rate": 4.84225352112676e-05, + "loss": 0.3469, + "step": 780 + }, + { + "epoch": 0.2528, + "grad_norm": 20.350370407104492, + "learning_rate": 4.8366197183098595e-05, + "loss": 0.3067, + "step": 790 + }, + { + "epoch": 0.256, + "grad_norm": 11.152381896972656, + "learning_rate": 4.830985915492958e-05, + "loss": 0.5645, + "step": 800 + }, + { + "epoch": 0.2592, + "grad_norm": 19.948450088500977, + "learning_rate": 4.8253521126760566e-05, + "loss": 0.6015, + "step": 810 + }, + { + "epoch": 0.2624, + "grad_norm": 7.729649543762207, + "learning_rate": 4.819718309859155e-05, + "loss": 0.3755, + "step": 820 + }, + { + "epoch": 0.2656, + "grad_norm": 9.633999824523926, + "learning_rate": 4.814084507042254e-05, + "loss": 0.4016, + "step": 830 + }, + { + "epoch": 0.2688, + "grad_norm": 10.421425819396973, + "learning_rate": 4.808450704225352e-05, + "loss": 0.3536, + "step": 840 + }, + { + "epoch": 0.272, + "grad_norm": 7.2195515632629395, + "learning_rate": 4.8028169014084515e-05, + "loss": 0.4654, + "step": 850 + }, + { + "epoch": 0.2752, + "grad_norm": 17.624547958374023, + "learning_rate": 4.7971830985915494e-05, + "loss": 0.3924, + "step": 860 + }, + { + "epoch": 0.2784, + "grad_norm": 12.011979103088379, + "learning_rate": 4.791549295774648e-05, + "loss": 0.3513, + "step": 870 + }, + { + "epoch": 0.2816, + "grad_norm": 5.497714519500732, + "learning_rate": 4.7859154929577465e-05, + "loss": 0.3268, + "step": 880 + }, + { + "epoch": 0.2848, + "grad_norm": 24.21810531616211, + "learning_rate": 4.780281690140845e-05, + "loss": 0.4526, + "step": 890 + }, + { + "epoch": 0.288, + "grad_norm": 20.337770462036133, + "learning_rate": 4.7746478873239436e-05, + "loss": 0.3238, + "step": 900 + }, + { + "epoch": 0.2912, + "grad_norm": 3.2851223945617676, + "learning_rate": 4.769014084507042e-05, + "loss": 0.3168, + "step": 910 + }, + { + "epoch": 0.2944, + "grad_norm": 37.14635467529297, + "learning_rate": 4.7633802816901414e-05, + "loss": 0.4194, + "step": 920 + }, + { + "epoch": 0.2976, + "grad_norm": 16.439712524414062, + "learning_rate": 4.757746478873239e-05, + "loss": 0.4244, + "step": 930 + }, + { + "epoch": 0.3008, + "grad_norm": 20.88243293762207, + "learning_rate": 4.7521126760563385e-05, + "loss": 0.5273, + "step": 940 + }, + { + "epoch": 0.304, + "grad_norm": 7.504245758056641, + "learning_rate": 4.7464788732394363e-05, + "loss": 0.4809, + "step": 950 + }, + { + "epoch": 0.3072, + "grad_norm": 4.482902526855469, + "learning_rate": 4.7408450704225356e-05, + "loss": 0.3758, + "step": 960 + }, + { + "epoch": 0.3104, + "grad_norm": 7.861649513244629, + "learning_rate": 4.735211267605634e-05, + "loss": 0.2721, + "step": 970 + }, + { + "epoch": 0.3136, + "grad_norm": 7.620471000671387, + "learning_rate": 4.729577464788733e-05, + "loss": 0.4302, + "step": 980 + }, + { + "epoch": 0.3168, + "grad_norm": 9.727157592773438, + "learning_rate": 4.723943661971831e-05, + "loss": 0.3482, + "step": 990 + }, + { + "epoch": 0.32, + "grad_norm": 20.1031494140625, + "learning_rate": 4.71830985915493e-05, + "loss": 0.3377, + "step": 1000 + }, + { + "epoch": 0.3232, + "grad_norm": 5.509233474731445, + "learning_rate": 4.712676056338028e-05, + "loss": 0.583, + "step": 1010 + }, + { + "epoch": 0.3264, + "grad_norm": 7.83276891708374, + "learning_rate": 4.707042253521127e-05, + "loss": 0.4061, + "step": 1020 + }, + { + "epoch": 0.3296, + "grad_norm": 11.982447624206543, + "learning_rate": 4.7014084507042254e-05, + "loss": 0.3287, + "step": 1030 + }, + { + "epoch": 0.3328, + "grad_norm": 5.372861385345459, + "learning_rate": 4.6957746478873247e-05, + "loss": 0.2898, + "step": 1040 + }, + { + "epoch": 0.336, + "grad_norm": 6.437655448913574, + "learning_rate": 4.6901408450704225e-05, + "loss": 0.3084, + "step": 1050 + }, + { + "epoch": 0.3392, + "grad_norm": 14.815783500671387, + "learning_rate": 4.684507042253522e-05, + "loss": 0.444, + "step": 1060 + }, + { + "epoch": 0.3424, + "grad_norm": 5.138498306274414, + "learning_rate": 4.6788732394366196e-05, + "loss": 0.3657, + "step": 1070 + }, + { + "epoch": 0.3456, + "grad_norm": 12.453933715820312, + "learning_rate": 4.673239436619719e-05, + "loss": 0.6394, + "step": 1080 + }, + { + "epoch": 0.3488, + "grad_norm": 12.436538696289062, + "learning_rate": 4.6676056338028174e-05, + "loss": 0.3662, + "step": 1090 + }, + { + "epoch": 0.352, + "grad_norm": 8.125226974487305, + "learning_rate": 4.661971830985915e-05, + "loss": 0.486, + "step": 1100 + }, + { + "epoch": 0.3552, + "grad_norm": 12.688572883605957, + "learning_rate": 4.6563380281690145e-05, + "loss": 0.5292, + "step": 1110 + }, + { + "epoch": 0.3584, + "grad_norm": 8.127309799194336, + "learning_rate": 4.6507042253521124e-05, + "loss": 0.2529, + "step": 1120 + }, + { + "epoch": 0.3616, + "grad_norm": 14.914011001586914, + "learning_rate": 4.6450704225352116e-05, + "loss": 0.381, + "step": 1130 + }, + { + "epoch": 0.3648, + "grad_norm": 15.199287414550781, + "learning_rate": 4.63943661971831e-05, + "loss": 0.5236, + "step": 1140 + }, + { + "epoch": 0.368, + "grad_norm": 6.843156814575195, + "learning_rate": 4.633802816901409e-05, + "loss": 0.2929, + "step": 1150 + }, + { + "epoch": 0.3712, + "grad_norm": 12.875916481018066, + "learning_rate": 4.628169014084507e-05, + "loss": 0.5681, + "step": 1160 + }, + { + "epoch": 0.3744, + "grad_norm": 7.395442485809326, + "learning_rate": 4.622535211267606e-05, + "loss": 0.4717, + "step": 1170 + }, + { + "epoch": 0.3776, + "grad_norm": 5.369324207305908, + "learning_rate": 4.6169014084507044e-05, + "loss": 0.4466, + "step": 1180 + }, + { + "epoch": 0.3808, + "grad_norm": 5.074844837188721, + "learning_rate": 4.611267605633803e-05, + "loss": 0.4263, + "step": 1190 + }, + { + "epoch": 0.384, + "grad_norm": 15.802391052246094, + "learning_rate": 4.6056338028169015e-05, + "loss": 0.351, + "step": 1200 + }, + { + "epoch": 0.3872, + "grad_norm": 20.11571502685547, + "learning_rate": 4.600000000000001e-05, + "loss": 0.3114, + "step": 1210 + }, + { + "epoch": 0.3904, + "grad_norm": 20.25322723388672, + "learning_rate": 4.5943661971830986e-05, + "loss": 0.4008, + "step": 1220 + }, + { + "epoch": 0.3936, + "grad_norm": 4.877046585083008, + "learning_rate": 4.588732394366198e-05, + "loss": 0.6073, + "step": 1230 + }, + { + "epoch": 0.3968, + "grad_norm": 6.517822742462158, + "learning_rate": 4.5830985915492957e-05, + "loss": 0.4318, + "step": 1240 + }, + { + "epoch": 0.4, + "grad_norm": 6.672747611999512, + "learning_rate": 4.577464788732395e-05, + "loss": 0.414, + "step": 1250 + }, + { + "epoch": 0.4032, + "grad_norm": 4.382776260375977, + "learning_rate": 4.5718309859154934e-05, + "loss": 0.3432, + "step": 1260 + }, + { + "epoch": 0.4064, + "grad_norm": 9.080897331237793, + "learning_rate": 4.566197183098592e-05, + "loss": 0.4862, + "step": 1270 + }, + { + "epoch": 0.4096, + "grad_norm": 5.132823944091797, + "learning_rate": 4.5605633802816905e-05, + "loss": 0.4707, + "step": 1280 + }, + { + "epoch": 0.4128, + "grad_norm": 4.521566867828369, + "learning_rate": 4.554929577464789e-05, + "loss": 0.4951, + "step": 1290 + }, + { + "epoch": 0.416, + "grad_norm": 9.381317138671875, + "learning_rate": 4.5492957746478876e-05, + "loss": 0.3341, + "step": 1300 + }, + { + "epoch": 0.4192, + "grad_norm": 10.4902982711792, + "learning_rate": 4.543661971830986e-05, + "loss": 0.4471, + "step": 1310 + }, + { + "epoch": 0.4224, + "grad_norm": 5.194609642028809, + "learning_rate": 4.538028169014085e-05, + "loss": 0.4406, + "step": 1320 + }, + { + "epoch": 0.4256, + "grad_norm": 13.40365982055664, + "learning_rate": 4.532394366197183e-05, + "loss": 0.3805, + "step": 1330 + }, + { + "epoch": 0.4288, + "grad_norm": 15.255276679992676, + "learning_rate": 4.526760563380282e-05, + "loss": 0.2003, + "step": 1340 + }, + { + "epoch": 0.432, + "grad_norm": 13.552937507629395, + "learning_rate": 4.5211267605633804e-05, + "loss": 0.4048, + "step": 1350 + }, + { + "epoch": 0.4352, + "grad_norm": 24.772096633911133, + "learning_rate": 4.515492957746479e-05, + "loss": 0.586, + "step": 1360 + }, + { + "epoch": 0.4384, + "grad_norm": 29.092702865600586, + "learning_rate": 4.5098591549295775e-05, + "loss": 0.6368, + "step": 1370 + }, + { + "epoch": 0.4416, + "grad_norm": 15.915087699890137, + "learning_rate": 4.504225352112677e-05, + "loss": 0.3818, + "step": 1380 + }, + { + "epoch": 0.4448, + "grad_norm": 14.00623607635498, + "learning_rate": 4.4985915492957746e-05, + "loss": 0.4146, + "step": 1390 + }, + { + "epoch": 0.448, + "grad_norm": 9.716373443603516, + "learning_rate": 4.492957746478874e-05, + "loss": 0.3421, + "step": 1400 + }, + { + "epoch": 0.4512, + "grad_norm": 5.982295989990234, + "learning_rate": 4.487323943661972e-05, + "loss": 0.4263, + "step": 1410 + }, + { + "epoch": 0.4544, + "grad_norm": 10.845952987670898, + "learning_rate": 4.481690140845071e-05, + "loss": 0.2946, + "step": 1420 + }, + { + "epoch": 0.4576, + "grad_norm": 17.834733963012695, + "learning_rate": 4.4760563380281695e-05, + "loss": 0.3593, + "step": 1430 + }, + { + "epoch": 0.4608, + "grad_norm": 7.576904296875, + "learning_rate": 4.470422535211268e-05, + "loss": 0.4492, + "step": 1440 + }, + { + "epoch": 0.464, + "grad_norm": 7.559220790863037, + "learning_rate": 4.4647887323943666e-05, + "loss": 0.4127, + "step": 1450 + }, + { + "epoch": 0.4672, + "grad_norm": 4.112594127655029, + "learning_rate": 4.459154929577465e-05, + "loss": 0.3307, + "step": 1460 + }, + { + "epoch": 0.4704, + "grad_norm": 2.598599910736084, + "learning_rate": 4.4535211267605637e-05, + "loss": 0.2452, + "step": 1470 + }, + { + "epoch": 0.4736, + "grad_norm": 5.336888790130615, + "learning_rate": 4.447887323943662e-05, + "loss": 0.4106, + "step": 1480 + }, + { + "epoch": 0.4768, + "grad_norm": 7.816699028015137, + "learning_rate": 4.442253521126761e-05, + "loss": 0.492, + "step": 1490 + }, + { + "epoch": 0.48, + "grad_norm": 14.75847053527832, + "learning_rate": 4.436619718309859e-05, + "loss": 0.4283, + "step": 1500 + }, + { + "epoch": 0.4832, + "grad_norm": 5.2887959480285645, + "learning_rate": 4.430985915492958e-05, + "loss": 0.3547, + "step": 1510 + }, + { + "epoch": 0.4864, + "grad_norm": 3.928128242492676, + "learning_rate": 4.4253521126760564e-05, + "loss": 0.3868, + "step": 1520 + }, + { + "epoch": 0.4896, + "grad_norm": 6.465476036071777, + "learning_rate": 4.419718309859155e-05, + "loss": 0.3789, + "step": 1530 + }, + { + "epoch": 0.4928, + "grad_norm": 5.5618743896484375, + "learning_rate": 4.4140845070422535e-05, + "loss": 0.4552, + "step": 1540 + }, + { + "epoch": 0.496, + "grad_norm": 6.504174709320068, + "learning_rate": 4.408450704225353e-05, + "loss": 0.4515, + "step": 1550 + }, + { + "epoch": 0.4992, + "grad_norm": 11.997910499572754, + "learning_rate": 4.4028169014084506e-05, + "loss": 0.2976, + "step": 1560 + }, + { + "epoch": 0.5024, + "grad_norm": 5.83452844619751, + "learning_rate": 4.39718309859155e-05, + "loss": 0.3886, + "step": 1570 + }, + { + "epoch": 0.5056, + "grad_norm": 11.406950950622559, + "learning_rate": 4.391549295774648e-05, + "loss": 0.3743, + "step": 1580 + }, + { + "epoch": 0.5088, + "grad_norm": 4.557556629180908, + "learning_rate": 4.385915492957747e-05, + "loss": 0.3984, + "step": 1590 + }, + { + "epoch": 0.512, + "grad_norm": 10.334356307983398, + "learning_rate": 4.3802816901408455e-05, + "loss": 0.3593, + "step": 1600 + }, + { + "epoch": 0.5152, + "grad_norm": 10.674864768981934, + "learning_rate": 4.374647887323944e-05, + "loss": 0.1878, + "step": 1610 + }, + { + "epoch": 0.5184, + "grad_norm": 3.705169916152954, + "learning_rate": 4.3690140845070426e-05, + "loss": 0.446, + "step": 1620 + }, + { + "epoch": 0.5216, + "grad_norm": 5.510721683502197, + "learning_rate": 4.363380281690141e-05, + "loss": 0.4812, + "step": 1630 + }, + { + "epoch": 0.5248, + "grad_norm": 2.3618953227996826, + "learning_rate": 4.35774647887324e-05, + "loss": 0.4004, + "step": 1640 + }, + { + "epoch": 0.528, + "grad_norm": 10.285249710083008, + "learning_rate": 4.352112676056338e-05, + "loss": 0.3904, + "step": 1650 + }, + { + "epoch": 0.5312, + "grad_norm": 15.25522518157959, + "learning_rate": 4.346478873239437e-05, + "loss": 0.3363, + "step": 1660 + }, + { + "epoch": 0.5344, + "grad_norm": 10.684788703918457, + "learning_rate": 4.340845070422535e-05, + "loss": 0.5174, + "step": 1670 + }, + { + "epoch": 0.5376, + "grad_norm": 4.573671340942383, + "learning_rate": 4.335211267605634e-05, + "loss": 0.2947, + "step": 1680 + }, + { + "epoch": 0.5408, + "grad_norm": 13.247304916381836, + "learning_rate": 4.3295774647887324e-05, + "loss": 0.4169, + "step": 1690 + }, + { + "epoch": 0.544, + "grad_norm": 16.0648250579834, + "learning_rate": 4.323943661971831e-05, + "loss": 0.2454, + "step": 1700 + }, + { + "epoch": 0.5472, + "grad_norm": 7.58563232421875, + "learning_rate": 4.3183098591549295e-05, + "loss": 0.4982, + "step": 1710 + }, + { + "epoch": 0.5504, + "grad_norm": 4.593902587890625, + "learning_rate": 4.312676056338029e-05, + "loss": 0.4422, + "step": 1720 + }, + { + "epoch": 0.5536, + "grad_norm": 6.4184370040893555, + "learning_rate": 4.3070422535211266e-05, + "loss": 0.3147, + "step": 1730 + }, + { + "epoch": 0.5568, + "grad_norm": 16.60883140563965, + "learning_rate": 4.301408450704226e-05, + "loss": 0.3153, + "step": 1740 + }, + { + "epoch": 0.56, + "grad_norm": 2.526179552078247, + "learning_rate": 4.295774647887324e-05, + "loss": 0.3074, + "step": 1750 + }, + { + "epoch": 0.5632, + "grad_norm": 17.307958602905273, + "learning_rate": 4.290140845070423e-05, + "loss": 0.3178, + "step": 1760 + }, + { + "epoch": 0.5664, + "grad_norm": 9.892918586730957, + "learning_rate": 4.284507042253521e-05, + "loss": 0.2585, + "step": 1770 + }, + { + "epoch": 0.5696, + "grad_norm": 11.281522750854492, + "learning_rate": 4.27887323943662e-05, + "loss": 0.2706, + "step": 1780 + }, + { + "epoch": 0.5728, + "grad_norm": 22.379169464111328, + "learning_rate": 4.2732394366197186e-05, + "loss": 0.4405, + "step": 1790 + }, + { + "epoch": 0.576, + "grad_norm": 7.15933084487915, + "learning_rate": 4.267605633802817e-05, + "loss": 0.3592, + "step": 1800 + }, + { + "epoch": 0.5792, + "grad_norm": 6.169369220733643, + "learning_rate": 4.261971830985916e-05, + "loss": 0.3866, + "step": 1810 + }, + { + "epoch": 0.5824, + "grad_norm": 4.099594593048096, + "learning_rate": 4.256338028169014e-05, + "loss": 0.4223, + "step": 1820 + }, + { + "epoch": 0.5856, + "grad_norm": 18.78368377685547, + "learning_rate": 4.250704225352113e-05, + "loss": 0.376, + "step": 1830 + }, + { + "epoch": 0.5888, + "grad_norm": 2.630387306213379, + "learning_rate": 4.2450704225352114e-05, + "loss": 0.2327, + "step": 1840 + }, + { + "epoch": 0.592, + "grad_norm": 17.63787269592285, + "learning_rate": 4.23943661971831e-05, + "loss": 0.5726, + "step": 1850 + }, + { + "epoch": 0.5952, + "grad_norm": 6.631278038024902, + "learning_rate": 4.2338028169014085e-05, + "loss": 0.3042, + "step": 1860 + }, + { + "epoch": 0.5984, + "grad_norm": 10.951118469238281, + "learning_rate": 4.228169014084507e-05, + "loss": 0.5125, + "step": 1870 + }, + { + "epoch": 0.6016, + "grad_norm": 8.784004211425781, + "learning_rate": 4.2225352112676056e-05, + "loss": 0.4033, + "step": 1880 + }, + { + "epoch": 0.6048, + "grad_norm": 4.034893989562988, + "learning_rate": 4.216901408450705e-05, + "loss": 0.3788, + "step": 1890 + }, + { + "epoch": 0.608, + "grad_norm": 4.547167778015137, + "learning_rate": 4.211267605633803e-05, + "loss": 0.3065, + "step": 1900 + }, + { + "epoch": 0.6112, + "grad_norm": 8.01904582977295, + "learning_rate": 4.205633802816902e-05, + "loss": 0.3656, + "step": 1910 + }, + { + "epoch": 0.6144, + "grad_norm": 3.676229953765869, + "learning_rate": 4.2e-05, + "loss": 0.4528, + "step": 1920 + }, + { + "epoch": 0.6176, + "grad_norm": 14.89476203918457, + "learning_rate": 4.194366197183099e-05, + "loss": 0.3974, + "step": 1930 + }, + { + "epoch": 0.6208, + "grad_norm": 6.517081260681152, + "learning_rate": 4.188732394366197e-05, + "loss": 0.3502, + "step": 1940 + }, + { + "epoch": 0.624, + "grad_norm": 9.692541122436523, + "learning_rate": 4.183098591549296e-05, + "loss": 0.2676, + "step": 1950 + }, + { + "epoch": 0.6272, + "grad_norm": 2.047581434249878, + "learning_rate": 4.1774647887323946e-05, + "loss": 0.3422, + "step": 1960 + }, + { + "epoch": 0.6304, + "grad_norm": 0.5576546788215637, + "learning_rate": 4.171830985915493e-05, + "loss": 0.2076, + "step": 1970 + }, + { + "epoch": 0.6336, + "grad_norm": 3.5656802654266357, + "learning_rate": 4.166197183098592e-05, + "loss": 0.4356, + "step": 1980 + }, + { + "epoch": 0.6368, + "grad_norm": 1.7690439224243164, + "learning_rate": 4.16056338028169e-05, + "loss": 0.2592, + "step": 1990 + }, + { + "epoch": 0.64, + "grad_norm": 21.8055362701416, + "learning_rate": 4.154929577464789e-05, + "loss": 0.4841, + "step": 2000 + }, + { + "epoch": 0.6432, + "grad_norm": 2.1585135459899902, + "learning_rate": 4.149295774647888e-05, + "loss": 0.2427, + "step": 2010 + }, + { + "epoch": 0.6464, + "grad_norm": 22.61993980407715, + "learning_rate": 4.143661971830986e-05, + "loss": 0.3264, + "step": 2020 + }, + { + "epoch": 0.6496, + "grad_norm": 3.826843500137329, + "learning_rate": 4.138028169014085e-05, + "loss": 0.3896, + "step": 2030 + }, + { + "epoch": 0.6528, + "grad_norm": 14.643287658691406, + "learning_rate": 4.132394366197183e-05, + "loss": 0.5588, + "step": 2040 + }, + { + "epoch": 0.656, + "grad_norm": 1.5682073831558228, + "learning_rate": 4.126760563380282e-05, + "loss": 0.4475, + "step": 2050 + }, + { + "epoch": 0.6592, + "grad_norm": 11.66586971282959, + "learning_rate": 4.12112676056338e-05, + "loss": 0.3633, + "step": 2060 + }, + { + "epoch": 0.6624, + "grad_norm": 11.989961624145508, + "learning_rate": 4.115492957746479e-05, + "loss": 0.4635, + "step": 2070 + }, + { + "epoch": 0.6656, + "grad_norm": 3.4878032207489014, + "learning_rate": 4.109859154929578e-05, + "loss": 0.2872, + "step": 2080 + }, + { + "epoch": 0.6688, + "grad_norm": 3.756565570831299, + "learning_rate": 4.104225352112676e-05, + "loss": 0.4745, + "step": 2090 + }, + { + "epoch": 0.672, + "grad_norm": 6.970531940460205, + "learning_rate": 4.098591549295775e-05, + "loss": 0.3618, + "step": 2100 + }, + { + "epoch": 0.6752, + "grad_norm": 16.83983039855957, + "learning_rate": 4.092957746478873e-05, + "loss": 0.3934, + "step": 2110 + }, + { + "epoch": 0.6784, + "grad_norm": 11.064716339111328, + "learning_rate": 4.087323943661972e-05, + "loss": 0.2683, + "step": 2120 + }, + { + "epoch": 0.6816, + "grad_norm": 18.883390426635742, + "learning_rate": 4.081690140845071e-05, + "loss": 0.3461, + "step": 2130 + }, + { + "epoch": 0.6848, + "grad_norm": 4.280035972595215, + "learning_rate": 4.076056338028169e-05, + "loss": 0.372, + "step": 2140 + }, + { + "epoch": 0.688, + "grad_norm": 10.117981910705566, + "learning_rate": 4.070422535211268e-05, + "loss": 0.2614, + "step": 2150 + }, + { + "epoch": 0.6912, + "grad_norm": 11.863015174865723, + "learning_rate": 4.064788732394366e-05, + "loss": 0.3622, + "step": 2160 + }, + { + "epoch": 0.6944, + "grad_norm": 7.1603875160217285, + "learning_rate": 4.059154929577465e-05, + "loss": 0.278, + "step": 2170 + }, + { + "epoch": 0.6976, + "grad_norm": 9.962820053100586, + "learning_rate": 4.053521126760564e-05, + "loss": 0.544, + "step": 2180 + }, + { + "epoch": 0.7008, + "grad_norm": 7.794748306274414, + "learning_rate": 4.047887323943662e-05, + "loss": 0.3095, + "step": 2190 + }, + { + "epoch": 0.704, + "grad_norm": 20.568464279174805, + "learning_rate": 4.042253521126761e-05, + "loss": 0.3264, + "step": 2200 + }, + { + "epoch": 0.7072, + "grad_norm": 4.824507236480713, + "learning_rate": 4.036619718309859e-05, + "loss": 0.3093, + "step": 2210 + }, + { + "epoch": 0.7104, + "grad_norm": 6.159689426422119, + "learning_rate": 4.030985915492958e-05, + "loss": 0.3073, + "step": 2220 + }, + { + "epoch": 0.7136, + "grad_norm": 22.985084533691406, + "learning_rate": 4.025352112676056e-05, + "loss": 0.2162, + "step": 2230 + }, + { + "epoch": 0.7168, + "grad_norm": 19.654817581176758, + "learning_rate": 4.0197183098591554e-05, + "loss": 0.6091, + "step": 2240 + }, + { + "epoch": 0.72, + "grad_norm": 6.315866947174072, + "learning_rate": 4.014084507042254e-05, + "loss": 0.459, + "step": 2250 + }, + { + "epoch": 0.7232, + "grad_norm": 7.05145788192749, + "learning_rate": 4.0084507042253525e-05, + "loss": 0.3191, + "step": 2260 + }, + { + "epoch": 0.7264, + "grad_norm": 11.295578956604004, + "learning_rate": 4.002816901408451e-05, + "loss": 0.3457, + "step": 2270 + }, + { + "epoch": 0.7296, + "grad_norm": 4.964128494262695, + "learning_rate": 3.9971830985915496e-05, + "loss": 0.3634, + "step": 2280 + }, + { + "epoch": 0.7328, + "grad_norm": 9.028850555419922, + "learning_rate": 3.991549295774648e-05, + "loss": 0.4049, + "step": 2290 + }, + { + "epoch": 0.736, + "grad_norm": 7.955386161804199, + "learning_rate": 3.985915492957747e-05, + "loss": 0.3013, + "step": 2300 + }, + { + "epoch": 0.7392, + "grad_norm": 9.309741020202637, + "learning_rate": 3.980281690140845e-05, + "loss": 0.3583, + "step": 2310 + }, + { + "epoch": 0.7424, + "grad_norm": 13.393871307373047, + "learning_rate": 3.974647887323944e-05, + "loss": 0.433, + "step": 2320 + }, + { + "epoch": 0.7456, + "grad_norm": 13.058290481567383, + "learning_rate": 3.9690140845070424e-05, + "loss": 0.5419, + "step": 2330 + }, + { + "epoch": 0.7488, + "grad_norm": 5.2141900062561035, + "learning_rate": 3.963380281690141e-05, + "loss": 0.2244, + "step": 2340 + }, + { + "epoch": 0.752, + "grad_norm": 10.393515586853027, + "learning_rate": 3.9577464788732395e-05, + "loss": 0.4972, + "step": 2350 + }, + { + "epoch": 0.7552, + "grad_norm": 2.1989641189575195, + "learning_rate": 3.952112676056338e-05, + "loss": 0.395, + "step": 2360 + }, + { + "epoch": 0.7584, + "grad_norm": 10.207283973693848, + "learning_rate": 3.946478873239437e-05, + "loss": 0.5946, + "step": 2370 + }, + { + "epoch": 0.7616, + "grad_norm": 5.437625408172607, + "learning_rate": 3.940845070422535e-05, + "loss": 0.2024, + "step": 2380 + }, + { + "epoch": 0.7648, + "grad_norm": 5.534853458404541, + "learning_rate": 3.935211267605634e-05, + "loss": 0.3462, + "step": 2390 + }, + { + "epoch": 0.768, + "grad_norm": 14.348875045776367, + "learning_rate": 3.929577464788732e-05, + "loss": 0.2925, + "step": 2400 + }, + { + "epoch": 0.7712, + "grad_norm": 1.4454820156097412, + "learning_rate": 3.9239436619718314e-05, + "loss": 0.3436, + "step": 2410 + }, + { + "epoch": 0.7744, + "grad_norm": 11.598908424377441, + "learning_rate": 3.91830985915493e-05, + "loss": 0.3707, + "step": 2420 + }, + { + "epoch": 0.7776, + "grad_norm": 23.476207733154297, + "learning_rate": 3.9126760563380285e-05, + "loss": 0.5327, + "step": 2430 + }, + { + "epoch": 0.7808, + "grad_norm": 11.371722221374512, + "learning_rate": 3.907042253521127e-05, + "loss": 0.4592, + "step": 2440 + }, + { + "epoch": 0.784, + "grad_norm": 7.114121437072754, + "learning_rate": 3.9014084507042256e-05, + "loss": 0.2222, + "step": 2450 + }, + { + "epoch": 0.7872, + "grad_norm": 9.982179641723633, + "learning_rate": 3.895774647887324e-05, + "loss": 0.4403, + "step": 2460 + }, + { + "epoch": 0.7904, + "grad_norm": 4.670555114746094, + "learning_rate": 3.890140845070423e-05, + "loss": 0.3811, + "step": 2470 + }, + { + "epoch": 0.7936, + "grad_norm": 4.608165740966797, + "learning_rate": 3.884507042253521e-05, + "loss": 0.2737, + "step": 2480 + }, + { + "epoch": 0.7968, + "grad_norm": 10.740816116333008, + "learning_rate": 3.87887323943662e-05, + "loss": 0.3186, + "step": 2490 + }, + { + "epoch": 0.8, + "grad_norm": 21.781532287597656, + "learning_rate": 3.8732394366197184e-05, + "loss": 0.2801, + "step": 2500 + }, + { + "epoch": 0.8032, + "grad_norm": 18.888141632080078, + "learning_rate": 3.867605633802817e-05, + "loss": 0.2524, + "step": 2510 + }, + { + "epoch": 0.8064, + "grad_norm": 14.424897193908691, + "learning_rate": 3.8619718309859155e-05, + "loss": 0.4049, + "step": 2520 + }, + { + "epoch": 0.8096, + "grad_norm": 2.566080093383789, + "learning_rate": 3.856338028169014e-05, + "loss": 0.542, + "step": 2530 + }, + { + "epoch": 0.8128, + "grad_norm": 20.100793838500977, + "learning_rate": 3.850704225352113e-05, + "loss": 0.331, + "step": 2540 + }, + { + "epoch": 0.816, + "grad_norm": 17.21098518371582, + "learning_rate": 3.845070422535211e-05, + "loss": 0.5059, + "step": 2550 + }, + { + "epoch": 0.8192, + "grad_norm": 26.911142349243164, + "learning_rate": 3.8394366197183104e-05, + "loss": 0.3813, + "step": 2560 + }, + { + "epoch": 0.8224, + "grad_norm": 8.0260591506958, + "learning_rate": 3.833802816901408e-05, + "loss": 0.3696, + "step": 2570 + }, + { + "epoch": 0.8256, + "grad_norm": 12.544899940490723, + "learning_rate": 3.8281690140845075e-05, + "loss": 0.2778, + "step": 2580 + }, + { + "epoch": 0.8288, + "grad_norm": 8.61177921295166, + "learning_rate": 3.822535211267606e-05, + "loss": 0.3647, + "step": 2590 + }, + { + "epoch": 0.832, + "grad_norm": 2.9444468021392822, + "learning_rate": 3.8169014084507046e-05, + "loss": 0.3391, + "step": 2600 + }, + { + "epoch": 0.8352, + "grad_norm": 12.346148490905762, + "learning_rate": 3.811267605633803e-05, + "loss": 0.3134, + "step": 2610 + }, + { + "epoch": 0.8384, + "grad_norm": 16.827272415161133, + "learning_rate": 3.8056338028169017e-05, + "loss": 0.2674, + "step": 2620 + }, + { + "epoch": 0.8416, + "grad_norm": 16.586444854736328, + "learning_rate": 3.8e-05, + "loss": 0.2912, + "step": 2630 + }, + { + "epoch": 0.8448, + "grad_norm": 6.491583347320557, + "learning_rate": 3.794366197183099e-05, + "loss": 0.3261, + "step": 2640 + }, + { + "epoch": 0.848, + "grad_norm": 35.861572265625, + "learning_rate": 3.788732394366197e-05, + "loss": 0.2646, + "step": 2650 + }, + { + "epoch": 0.8512, + "grad_norm": 8.829320907592773, + "learning_rate": 3.783098591549296e-05, + "loss": 0.5009, + "step": 2660 + }, + { + "epoch": 0.8544, + "grad_norm": 1.658776879310608, + "learning_rate": 3.7774647887323944e-05, + "loss": 0.4636, + "step": 2670 + }, + { + "epoch": 0.8576, + "grad_norm": 2.2379770278930664, + "learning_rate": 3.771830985915493e-05, + "loss": 0.3915, + "step": 2680 + }, + { + "epoch": 0.8608, + "grad_norm": 4.984516143798828, + "learning_rate": 3.7661971830985915e-05, + "loss": 0.1507, + "step": 2690 + }, + { + "epoch": 0.864, + "grad_norm": 5.562011241912842, + "learning_rate": 3.76056338028169e-05, + "loss": 0.5398, + "step": 2700 + }, + { + "epoch": 0.8672, + "grad_norm": 21.320629119873047, + "learning_rate": 3.754929577464789e-05, + "loss": 0.4201, + "step": 2710 + }, + { + "epoch": 0.8704, + "grad_norm": 19.99195671081543, + "learning_rate": 3.749295774647887e-05, + "loss": 0.4603, + "step": 2720 + }, + { + "epoch": 0.8736, + "grad_norm": 3.42061185836792, + "learning_rate": 3.7436619718309864e-05, + "loss": 0.2968, + "step": 2730 + }, + { + "epoch": 0.8768, + "grad_norm": 27.126548767089844, + "learning_rate": 3.738028169014084e-05, + "loss": 0.3913, + "step": 2740 + }, + { + "epoch": 0.88, + "grad_norm": 11.521971702575684, + "learning_rate": 3.7323943661971835e-05, + "loss": 0.2858, + "step": 2750 + }, + { + "epoch": 0.8832, + "grad_norm": 20.222986221313477, + "learning_rate": 3.726760563380282e-05, + "loss": 0.3669, + "step": 2760 + }, + { + "epoch": 0.8864, + "grad_norm": 1.7271472215652466, + "learning_rate": 3.7211267605633806e-05, + "loss": 0.3193, + "step": 2770 + }, + { + "epoch": 0.8896, + "grad_norm": 29.57312774658203, + "learning_rate": 3.715492957746479e-05, + "loss": 0.5223, + "step": 2780 + }, + { + "epoch": 0.8928, + "grad_norm": 6.471129894256592, + "learning_rate": 3.709859154929578e-05, + "loss": 0.3132, + "step": 2790 + }, + { + "epoch": 0.896, + "grad_norm": 13.332724571228027, + "learning_rate": 3.704225352112676e-05, + "loss": 0.5883, + "step": 2800 + }, + { + "epoch": 0.8992, + "grad_norm": 16.782337188720703, + "learning_rate": 3.698591549295775e-05, + "loss": 0.3876, + "step": 2810 + }, + { + "epoch": 0.9024, + "grad_norm": 12.669254302978516, + "learning_rate": 3.692957746478873e-05, + "loss": 0.5085, + "step": 2820 + }, + { + "epoch": 0.9056, + "grad_norm": 6.967997074127197, + "learning_rate": 3.687323943661972e-05, + "loss": 0.3927, + "step": 2830 + }, + { + "epoch": 0.9088, + "grad_norm": 11.67349624633789, + "learning_rate": 3.6816901408450704e-05, + "loss": 0.2962, + "step": 2840 + }, + { + "epoch": 0.912, + "grad_norm": 9.875104904174805, + "learning_rate": 3.676056338028169e-05, + "loss": 0.3898, + "step": 2850 + }, + { + "epoch": 0.9152, + "grad_norm": 1.1998872756958008, + "learning_rate": 3.6704225352112675e-05, + "loss": 0.3745, + "step": 2860 + }, + { + "epoch": 0.9184, + "grad_norm": 6.544206619262695, + "learning_rate": 3.664788732394366e-05, + "loss": 0.2814, + "step": 2870 + }, + { + "epoch": 0.9216, + "grad_norm": 27.720895767211914, + "learning_rate": 3.659154929577465e-05, + "loss": 0.5197, + "step": 2880 + }, + { + "epoch": 0.9248, + "grad_norm": 5.2081403732299805, + "learning_rate": 3.653521126760563e-05, + "loss": 0.2418, + "step": 2890 + }, + { + "epoch": 0.928, + "grad_norm": 25.75909996032715, + "learning_rate": 3.6478873239436624e-05, + "loss": 0.43, + "step": 2900 + }, + { + "epoch": 0.9312, + "grad_norm": 11.020965576171875, + "learning_rate": 3.64225352112676e-05, + "loss": 0.2788, + "step": 2910 + }, + { + "epoch": 0.9344, + "grad_norm": 5.504922866821289, + "learning_rate": 3.6366197183098595e-05, + "loss": 0.2734, + "step": 2920 + }, + { + "epoch": 0.9376, + "grad_norm": 10.418407440185547, + "learning_rate": 3.630985915492958e-05, + "loss": 0.4713, + "step": 2930 + }, + { + "epoch": 0.9408, + "grad_norm": 7.805202960968018, + "learning_rate": 3.6253521126760566e-05, + "loss": 0.2821, + "step": 2940 + }, + { + "epoch": 0.944, + "grad_norm": 7.880125045776367, + "learning_rate": 3.619718309859155e-05, + "loss": 0.1627, + "step": 2950 + }, + { + "epoch": 0.9472, + "grad_norm": 33.17704772949219, + "learning_rate": 3.614084507042254e-05, + "loss": 0.4046, + "step": 2960 + }, + { + "epoch": 0.9504, + "grad_norm": 3.1542086601257324, + "learning_rate": 3.608450704225352e-05, + "loss": 0.3558, + "step": 2970 + }, + { + "epoch": 0.9536, + "grad_norm": 21.562021255493164, + "learning_rate": 3.602816901408451e-05, + "loss": 0.3107, + "step": 2980 + }, + { + "epoch": 0.9568, + "grad_norm": 17.724111557006836, + "learning_rate": 3.5971830985915494e-05, + "loss": 0.2855, + "step": 2990 + }, + { + "epoch": 0.96, + "grad_norm": 16.4515323638916, + "learning_rate": 3.5915492957746486e-05, + "loss": 0.3282, + "step": 3000 + }, + { + "epoch": 0.9632, + "grad_norm": 3.998889684677124, + "learning_rate": 3.5859154929577465e-05, + "loss": 0.4798, + "step": 3010 + }, + { + "epoch": 0.9664, + "grad_norm": 13.89487361907959, + "learning_rate": 3.580281690140846e-05, + "loss": 0.3655, + "step": 3020 + }, + { + "epoch": 0.9696, + "grad_norm": 12.545125007629395, + "learning_rate": 3.5746478873239436e-05, + "loss": 0.374, + "step": 3030 + }, + { + "epoch": 0.9728, + "grad_norm": 16.73505210876465, + "learning_rate": 3.569014084507042e-05, + "loss": 0.4029, + "step": 3040 + }, + { + "epoch": 0.976, + "grad_norm": 3.9983644485473633, + "learning_rate": 3.5633802816901413e-05, + "loss": 0.1769, + "step": 3050 + }, + { + "epoch": 0.9792, + "grad_norm": 19.214399337768555, + "learning_rate": 3.557746478873239e-05, + "loss": 0.335, + "step": 3060 + }, + { + "epoch": 0.9824, + "grad_norm": 27.911151885986328, + "learning_rate": 3.5521126760563384e-05, + "loss": 0.3406, + "step": 3070 + }, + { + "epoch": 0.9856, + "grad_norm": 8.778318405151367, + "learning_rate": 3.546478873239436e-05, + "loss": 0.4602, + "step": 3080 + }, + { + "epoch": 0.9888, + "grad_norm": 5.281238555908203, + "learning_rate": 3.5408450704225355e-05, + "loss": 0.3499, + "step": 3090 + }, + { + "epoch": 0.992, + "grad_norm": 7.649629592895508, + "learning_rate": 3.5352112676056334e-05, + "loss": 0.2717, + "step": 3100 + }, + { + "epoch": 0.9952, + "grad_norm": 8.83627986907959, + "learning_rate": 3.5295774647887326e-05, + "loss": 0.3544, + "step": 3110 + }, + { + "epoch": 0.9984, + "grad_norm": 13.762328147888184, + "learning_rate": 3.523943661971831e-05, + "loss": 0.3039, + "step": 3120 + }, + { + "epoch": 1.0016, + "grad_norm": 28.637189865112305, + "learning_rate": 3.51830985915493e-05, + "loss": 0.1381, + "step": 3130 + }, + { + "epoch": 1.0048, + "grad_norm": 6.5435791015625, + "learning_rate": 3.512676056338028e-05, + "loss": 0.4339, + "step": 3140 + }, + { + "epoch": 1.008, + "grad_norm": 8.024589538574219, + "learning_rate": 3.507042253521127e-05, + "loss": 0.3516, + "step": 3150 + }, + { + "epoch": 1.0112, + "grad_norm": 0.35201606154441833, + "learning_rate": 3.5014084507042254e-05, + "loss": 0.232, + "step": 3160 + }, + { + "epoch": 1.0144, + "grad_norm": 0.8222833275794983, + "learning_rate": 3.4957746478873246e-05, + "loss": 0.555, + "step": 3170 + }, + { + "epoch": 1.0176, + "grad_norm": 0.6108925342559814, + "learning_rate": 3.4901408450704225e-05, + "loss": 0.4073, + "step": 3180 + }, + { + "epoch": 1.0208, + "grad_norm": 33.10356521606445, + "learning_rate": 3.484507042253522e-05, + "loss": 0.2513, + "step": 3190 + }, + { + "epoch": 1.024, + "grad_norm": 0.9735102653503418, + "learning_rate": 3.4788732394366196e-05, + "loss": 0.2464, + "step": 3200 + }, + { + "epoch": 1.0272, + "grad_norm": 28.903047561645508, + "learning_rate": 3.473239436619719e-05, + "loss": 0.2955, + "step": 3210 + }, + { + "epoch": 1.0304, + "grad_norm": 2.8575825691223145, + "learning_rate": 3.4676056338028174e-05, + "loss": 0.1999, + "step": 3220 + }, + { + "epoch": 1.0336, + "grad_norm": 20.94749641418457, + "learning_rate": 3.461971830985916e-05, + "loss": 0.2652, + "step": 3230 + }, + { + "epoch": 1.0368, + "grad_norm": 56.3437385559082, + "learning_rate": 3.4563380281690145e-05, + "loss": 0.2871, + "step": 3240 + }, + { + "epoch": 1.04, + "grad_norm": 0.698115885257721, + "learning_rate": 3.450704225352113e-05, + "loss": 0.1267, + "step": 3250 + }, + { + "epoch": 1.0432, + "grad_norm": 0.10409284383058548, + "learning_rate": 3.4450704225352116e-05, + "loss": 0.3235, + "step": 3260 + }, + { + "epoch": 1.0464, + "grad_norm": 22.576404571533203, + "learning_rate": 3.4394366197183094e-05, + "loss": 0.3016, + "step": 3270 + }, + { + "epoch": 1.0496, + "grad_norm": 15.452841758728027, + "learning_rate": 3.433802816901409e-05, + "loss": 0.43, + "step": 3280 + }, + { + "epoch": 1.0528, + "grad_norm": 1.0227371454238892, + "learning_rate": 3.428169014084507e-05, + "loss": 0.261, + "step": 3290 + }, + { + "epoch": 1.056, + "grad_norm": 12.007558822631836, + "learning_rate": 3.422535211267606e-05, + "loss": 0.1861, + "step": 3300 + }, + { + "epoch": 1.0592, + "grad_norm": 4.2839484214782715, + "learning_rate": 3.416901408450704e-05, + "loss": 0.2487, + "step": 3310 + }, + { + "epoch": 1.0624, + "grad_norm": 31.287580490112305, + "learning_rate": 3.411267605633803e-05, + "loss": 0.2447, + "step": 3320 + }, + { + "epoch": 1.0656, + "grad_norm": 4.608018398284912, + "learning_rate": 3.4056338028169014e-05, + "loss": 0.3264, + "step": 3330 + }, + { + "epoch": 1.0688, + "grad_norm": 10.261385917663574, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.1514, + "step": 3340 + }, + { + "epoch": 1.072, + "grad_norm": 28.37779426574707, + "learning_rate": 3.3943661971830985e-05, + "loss": 0.3787, + "step": 3350 + }, + { + "epoch": 1.0752, + "grad_norm": 4.77971076965332, + "learning_rate": 3.388732394366198e-05, + "loss": 0.2528, + "step": 3360 + }, + { + "epoch": 1.0784, + "grad_norm": 5.056029319763184, + "learning_rate": 3.3830985915492956e-05, + "loss": 0.1816, + "step": 3370 + }, + { + "epoch": 1.0816, + "grad_norm": 1.7177847623825073, + "learning_rate": 3.377464788732395e-05, + "loss": 0.1776, + "step": 3380 + }, + { + "epoch": 1.0848, + "grad_norm": 28.095943450927734, + "learning_rate": 3.371830985915493e-05, + "loss": 0.5433, + "step": 3390 + }, + { + "epoch": 1.088, + "grad_norm": 18.927244186401367, + "learning_rate": 3.366197183098592e-05, + "loss": 0.1047, + "step": 3400 + }, + { + "epoch": 1.0912, + "grad_norm": 13.517168998718262, + "learning_rate": 3.3605633802816905e-05, + "loss": 0.4499, + "step": 3410 + }, + { + "epoch": 1.0944, + "grad_norm": 8.441484451293945, + "learning_rate": 3.354929577464789e-05, + "loss": 0.1514, + "step": 3420 + }, + { + "epoch": 1.0976, + "grad_norm": 12.520185470581055, + "learning_rate": 3.3492957746478876e-05, + "loss": 0.329, + "step": 3430 + }, + { + "epoch": 1.1008, + "grad_norm": 5.51774263381958, + "learning_rate": 3.343661971830986e-05, + "loss": 0.4714, + "step": 3440 + }, + { + "epoch": 1.104, + "grad_norm": 0.26915284991264343, + "learning_rate": 3.338028169014085e-05, + "loss": 0.0801, + "step": 3450 + }, + { + "epoch": 1.1072, + "grad_norm": 8.110269546508789, + "learning_rate": 3.332394366197183e-05, + "loss": 0.2196, + "step": 3460 + }, + { + "epoch": 1.1104, + "grad_norm": 24.571348190307617, + "learning_rate": 3.326760563380282e-05, + "loss": 0.2151, + "step": 3470 + }, + { + "epoch": 1.1136, + "grad_norm": 4.834783554077148, + "learning_rate": 3.3211267605633804e-05, + "loss": 0.2286, + "step": 3480 + }, + { + "epoch": 1.1168, + "grad_norm": 0.4705199599266052, + "learning_rate": 3.315492957746479e-05, + "loss": 0.262, + "step": 3490 + }, + { + "epoch": 1.12, + "grad_norm": 10.442275047302246, + "learning_rate": 3.3098591549295775e-05, + "loss": 0.1664, + "step": 3500 + }, + { + "epoch": 1.1232, + "grad_norm": 18.814613342285156, + "learning_rate": 3.304225352112677e-05, + "loss": 0.2115, + "step": 3510 + }, + { + "epoch": 1.1264, + "grad_norm": 0.24957086145877838, + "learning_rate": 3.2985915492957746e-05, + "loss": 0.3388, + "step": 3520 + }, + { + "epoch": 1.1296, + "grad_norm": 0.16773709654808044, + "learning_rate": 3.292957746478874e-05, + "loss": 0.0763, + "step": 3530 + }, + { + "epoch": 1.1328, + "grad_norm": 13.98716926574707, + "learning_rate": 3.2873239436619717e-05, + "loss": 0.1653, + "step": 3540 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.13978277146816254, + "learning_rate": 3.281690140845071e-05, + "loss": 0.4153, + "step": 3550 + }, + { + "epoch": 1.1392, + "grad_norm": 1.202662706375122, + "learning_rate": 3.276056338028169e-05, + "loss": 0.1792, + "step": 3560 + }, + { + "epoch": 1.1424, + "grad_norm": 0.2656092643737793, + "learning_rate": 3.270422535211268e-05, + "loss": 0.1704, + "step": 3570 + }, + { + "epoch": 1.1456, + "grad_norm": 0.1266532689332962, + "learning_rate": 3.2647887323943665e-05, + "loss": 0.1174, + "step": 3580 + }, + { + "epoch": 1.1488, + "grad_norm": 5.823769569396973, + "learning_rate": 3.259154929577465e-05, + "loss": 0.4392, + "step": 3590 + }, + { + "epoch": 1.152, + "grad_norm": 0.7768305540084839, + "learning_rate": 3.2535211267605636e-05, + "loss": 0.5019, + "step": 3600 + }, + { + "epoch": 1.1552, + "grad_norm": 4.988490104675293, + "learning_rate": 3.247887323943662e-05, + "loss": 0.1525, + "step": 3610 + }, + { + "epoch": 1.1584, + "grad_norm": 7.760066509246826, + "learning_rate": 3.242253521126761e-05, + "loss": 0.2967, + "step": 3620 + }, + { + "epoch": 1.1616, + "grad_norm": 0.3713386356830597, + "learning_rate": 3.236619718309859e-05, + "loss": 0.2972, + "step": 3630 + }, + { + "epoch": 1.1648, + "grad_norm": 0.1917293667793274, + "learning_rate": 3.230985915492958e-05, + "loss": 0.1505, + "step": 3640 + }, + { + "epoch": 1.168, + "grad_norm": 0.31380757689476013, + "learning_rate": 3.2253521126760564e-05, + "loss": 0.3006, + "step": 3650 + }, + { + "epoch": 1.1712, + "grad_norm": 18.34935188293457, + "learning_rate": 3.219718309859155e-05, + "loss": 0.5165, + "step": 3660 + }, + { + "epoch": 1.1743999999999999, + "grad_norm": 1.479733943939209, + "learning_rate": 3.2140845070422535e-05, + "loss": 0.104, + "step": 3670 + }, + { + "epoch": 1.1776, + "grad_norm": 29.272424697875977, + "learning_rate": 3.208450704225353e-05, + "loss": 0.1871, + "step": 3680 + }, + { + "epoch": 1.1808, + "grad_norm": 31.237834930419922, + "learning_rate": 3.2028169014084506e-05, + "loss": 0.3294, + "step": 3690 + }, + { + "epoch": 1.184, + "grad_norm": 0.5057691335678101, + "learning_rate": 3.19718309859155e-05, + "loss": 0.3202, + "step": 3700 + }, + { + "epoch": 1.1872, + "grad_norm": 35.599571228027344, + "learning_rate": 3.191549295774648e-05, + "loss": 0.5781, + "step": 3710 + }, + { + "epoch": 1.1904, + "grad_norm": 18.676931381225586, + "learning_rate": 3.185915492957747e-05, + "loss": 0.2537, + "step": 3720 + }, + { + "epoch": 1.1936, + "grad_norm": 17.747034072875977, + "learning_rate": 3.180281690140845e-05, + "loss": 0.2176, + "step": 3730 + }, + { + "epoch": 1.1968, + "grad_norm": 20.384511947631836, + "learning_rate": 3.174647887323944e-05, + "loss": 0.3224, + "step": 3740 + }, + { + "epoch": 1.2, + "grad_norm": 1.9572980403900146, + "learning_rate": 3.1690140845070426e-05, + "loss": 0.3297, + "step": 3750 + }, + { + "epoch": 1.2032, + "grad_norm": 11.518452644348145, + "learning_rate": 3.163380281690141e-05, + "loss": 0.0754, + "step": 3760 + }, + { + "epoch": 1.2064, + "grad_norm": 21.269893646240234, + "learning_rate": 3.1577464788732397e-05, + "loss": 0.3698, + "step": 3770 + }, + { + "epoch": 1.2096, + "grad_norm": 6.943552494049072, + "learning_rate": 3.152112676056338e-05, + "loss": 0.3419, + "step": 3780 + }, + { + "epoch": 1.2128, + "grad_norm": 2.637138605117798, + "learning_rate": 3.146478873239437e-05, + "loss": 0.4475, + "step": 3790 + }, + { + "epoch": 1.216, + "grad_norm": 0.2345736175775528, + "learning_rate": 3.140845070422535e-05, + "loss": 0.26, + "step": 3800 + }, + { + "epoch": 1.2192, + "grad_norm": 23.990619659423828, + "learning_rate": 3.135211267605634e-05, + "loss": 0.4548, + "step": 3810 + }, + { + "epoch": 1.2224, + "grad_norm": 30.698591232299805, + "learning_rate": 3.1295774647887324e-05, + "loss": 0.2941, + "step": 3820 + }, + { + "epoch": 1.2256, + "grad_norm": 8.31469440460205, + "learning_rate": 3.123943661971831e-05, + "loss": 0.2317, + "step": 3830 + }, + { + "epoch": 1.2288000000000001, + "grad_norm": 0.8447297215461731, + "learning_rate": 3.1183098591549295e-05, + "loss": 0.1829, + "step": 3840 + }, + { + "epoch": 1.232, + "grad_norm": 1.337220549583435, + "learning_rate": 3.112676056338028e-05, + "loss": 0.1399, + "step": 3850 + }, + { + "epoch": 1.2352, + "grad_norm": 0.0955493152141571, + "learning_rate": 3.1070422535211266e-05, + "loss": 0.4618, + "step": 3860 + }, + { + "epoch": 1.2384, + "grad_norm": 0.7334279417991638, + "learning_rate": 3.101408450704226e-05, + "loss": 0.0653, + "step": 3870 + }, + { + "epoch": 1.2416, + "grad_norm": 0.8991394639015198, + "learning_rate": 3.095774647887324e-05, + "loss": 0.6064, + "step": 3880 + }, + { + "epoch": 1.2448, + "grad_norm": 30.786052703857422, + "learning_rate": 3.090140845070423e-05, + "loss": 0.2257, + "step": 3890 + }, + { + "epoch": 1.248, + "grad_norm": 8.322766304016113, + "learning_rate": 3.084507042253521e-05, + "loss": 0.3758, + "step": 3900 + }, + { + "epoch": 1.2511999999999999, + "grad_norm": 1.9371285438537598, + "learning_rate": 3.07887323943662e-05, + "loss": 0.1619, + "step": 3910 + }, + { + "epoch": 1.2544, + "grad_norm": 30.936664581298828, + "learning_rate": 3.0732394366197186e-05, + "loss": 0.226, + "step": 3920 + }, + { + "epoch": 1.2576, + "grad_norm": 18.97284507751465, + "learning_rate": 3.067605633802817e-05, + "loss": 0.2106, + "step": 3930 + }, + { + "epoch": 1.2608, + "grad_norm": 0.17548918724060059, + "learning_rate": 3.061971830985916e-05, + "loss": 0.1827, + "step": 3940 + }, + { + "epoch": 1.264, + "grad_norm": 53.99245834350586, + "learning_rate": 3.056338028169014e-05, + "loss": 0.1693, + "step": 3950 + }, + { + "epoch": 1.2671999999999999, + "grad_norm": 35.83251190185547, + "learning_rate": 3.0507042253521128e-05, + "loss": 0.2246, + "step": 3960 + }, + { + "epoch": 1.2704, + "grad_norm": 0.14614693820476532, + "learning_rate": 3.0450704225352117e-05, + "loss": 0.0399, + "step": 3970 + }, + { + "epoch": 1.2736, + "grad_norm": 0.09585163742303848, + "learning_rate": 3.03943661971831e-05, + "loss": 0.3103, + "step": 3980 + }, + { + "epoch": 1.2768, + "grad_norm": 0.35296738147735596, + "learning_rate": 3.0338028169014088e-05, + "loss": 0.4458, + "step": 3990 + }, + { + "epoch": 1.28, + "grad_norm": 25.493444442749023, + "learning_rate": 3.028169014084507e-05, + "loss": 0.2397, + "step": 4000 + }, + { + "epoch": 1.2832, + "grad_norm": 21.99680519104004, + "learning_rate": 3.022535211267606e-05, + "loss": 0.3189, + "step": 4010 + }, + { + "epoch": 1.2864, + "grad_norm": 1.8091436624526978, + "learning_rate": 3.016901408450704e-05, + "loss": 0.1739, + "step": 4020 + }, + { + "epoch": 1.2896, + "grad_norm": 0.42829862236976624, + "learning_rate": 3.011267605633803e-05, + "loss": 0.1976, + "step": 4030 + }, + { + "epoch": 1.2928, + "grad_norm": 0.08488719165325165, + "learning_rate": 3.005633802816902e-05, + "loss": 0.2067, + "step": 4040 + }, + { + "epoch": 1.296, + "grad_norm": 9.230164527893066, + "learning_rate": 3e-05, + "loss": 0.2783, + "step": 4050 + }, + { + "epoch": 1.2992, + "grad_norm": 2.309288263320923, + "learning_rate": 2.994366197183099e-05, + "loss": 0.2829, + "step": 4060 + }, + { + "epoch": 1.3024, + "grad_norm": 38.344730377197266, + "learning_rate": 2.9887323943661972e-05, + "loss": 0.2297, + "step": 4070 + }, + { + "epoch": 1.3056, + "grad_norm": 6.4838337898254395, + "learning_rate": 2.983098591549296e-05, + "loss": 0.4281, + "step": 4080 + }, + { + "epoch": 1.3088, + "grad_norm": 30.31648826599121, + "learning_rate": 2.9774647887323946e-05, + "loss": 0.1629, + "step": 4090 + }, + { + "epoch": 1.312, + "grad_norm": 2.2914836406707764, + "learning_rate": 2.971830985915493e-05, + "loss": 0.2485, + "step": 4100 + }, + { + "epoch": 1.3152, + "grad_norm": 0.811667263507843, + "learning_rate": 2.9661971830985917e-05, + "loss": 0.096, + "step": 4110 + }, + { + "epoch": 1.3184, + "grad_norm": 10.037412643432617, + "learning_rate": 2.9605633802816903e-05, + "loss": 0.4057, + "step": 4120 + }, + { + "epoch": 1.3216, + "grad_norm": 15.876739501953125, + "learning_rate": 2.9549295774647888e-05, + "loss": 0.2493, + "step": 4130 + }, + { + "epoch": 1.3248, + "grad_norm": 0.10714894533157349, + "learning_rate": 2.9492957746478874e-05, + "loss": 0.1766, + "step": 4140 + }, + { + "epoch": 1.328, + "grad_norm": 0.9898651242256165, + "learning_rate": 2.943661971830986e-05, + "loss": 0.1064, + "step": 4150 + }, + { + "epoch": 1.3312, + "grad_norm": 17.39281463623047, + "learning_rate": 2.9380281690140848e-05, + "loss": 0.1863, + "step": 4160 + }, + { + "epoch": 1.3344, + "grad_norm": 4.467952728271484, + "learning_rate": 2.932394366197183e-05, + "loss": 0.3616, + "step": 4170 + }, + { + "epoch": 1.3376000000000001, + "grad_norm": 0.1744953840970993, + "learning_rate": 2.926760563380282e-05, + "loss": 0.3911, + "step": 4180 + }, + { + "epoch": 1.3408, + "grad_norm": 11.605582237243652, + "learning_rate": 2.92112676056338e-05, + "loss": 0.086, + "step": 4190 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 6.277223110198975, + "learning_rate": 2.915492957746479e-05, + "loss": 0.1406, + "step": 4200 + }, + { + "epoch": 1.3472, + "grad_norm": 55.23845672607422, + "learning_rate": 2.909859154929578e-05, + "loss": 0.3971, + "step": 4210 + }, + { + "epoch": 1.3504, + "grad_norm": 14.972460746765137, + "learning_rate": 2.904225352112676e-05, + "loss": 0.2301, + "step": 4220 + }, + { + "epoch": 1.3536000000000001, + "grad_norm": 0.19094644486904144, + "learning_rate": 2.898591549295775e-05, + "loss": 0.3753, + "step": 4230 + }, + { + "epoch": 1.3568, + "grad_norm": 4.376794815063477, + "learning_rate": 2.8929577464788732e-05, + "loss": 0.1735, + "step": 4240 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 50.90641403198242, + "learning_rate": 2.887323943661972e-05, + "loss": 0.22, + "step": 4250 + }, + { + "epoch": 1.3632, + "grad_norm": 10.364707946777344, + "learning_rate": 2.881690140845071e-05, + "loss": 0.1155, + "step": 4260 + }, + { + "epoch": 1.3664, + "grad_norm": 1.0661869049072266, + "learning_rate": 2.8760563380281692e-05, + "loss": 0.3827, + "step": 4270 + }, + { + "epoch": 1.3696, + "grad_norm": 17.855276107788086, + "learning_rate": 2.870422535211268e-05, + "loss": 0.3932, + "step": 4280 + }, + { + "epoch": 1.3728, + "grad_norm": 27.09354019165039, + "learning_rate": 2.8647887323943663e-05, + "loss": 0.3695, + "step": 4290 + }, + { + "epoch": 1.376, + "grad_norm": 0.13813486695289612, + "learning_rate": 2.859154929577465e-05, + "loss": 0.2251, + "step": 4300 + }, + { + "epoch": 1.3792, + "grad_norm": 16.627347946166992, + "learning_rate": 2.8535211267605634e-05, + "loss": 0.3089, + "step": 4310 + }, + { + "epoch": 1.3824, + "grad_norm": 0.3655046820640564, + "learning_rate": 2.847887323943662e-05, + "loss": 0.2134, + "step": 4320 + }, + { + "epoch": 1.3856, + "grad_norm": 38.9260368347168, + "learning_rate": 2.842253521126761e-05, + "loss": 0.3283, + "step": 4330 + }, + { + "epoch": 1.3888, + "grad_norm": 26.279027938842773, + "learning_rate": 2.836619718309859e-05, + "loss": 0.2663, + "step": 4340 + }, + { + "epoch": 1.392, + "grad_norm": 49.378902435302734, + "learning_rate": 2.830985915492958e-05, + "loss": 0.2607, + "step": 4350 + }, + { + "epoch": 1.3952, + "grad_norm": 0.32010015845298767, + "learning_rate": 2.825352112676056e-05, + "loss": 0.423, + "step": 4360 + }, + { + "epoch": 1.3984, + "grad_norm": 5.871781349182129, + "learning_rate": 2.819718309859155e-05, + "loss": 0.1471, + "step": 4370 + }, + { + "epoch": 1.4016, + "grad_norm": 3.814654588699341, + "learning_rate": 2.814084507042254e-05, + "loss": 0.2127, + "step": 4380 + }, + { + "epoch": 1.4048, + "grad_norm": 0.4913332164287567, + "learning_rate": 2.808450704225352e-05, + "loss": 0.4049, + "step": 4390 + }, + { + "epoch": 1.408, + "grad_norm": 1.195360779762268, + "learning_rate": 2.802816901408451e-05, + "loss": 0.3488, + "step": 4400 + }, + { + "epoch": 1.4112, + "grad_norm": 30.266616821289062, + "learning_rate": 2.7971830985915492e-05, + "loss": 0.2471, + "step": 4410 + }, + { + "epoch": 1.4144, + "grad_norm": 0.1005750447511673, + "learning_rate": 2.791549295774648e-05, + "loss": 0.1442, + "step": 4420 + }, + { + "epoch": 1.4176, + "grad_norm": 19.245065689086914, + "learning_rate": 2.7859154929577463e-05, + "loss": 0.3952, + "step": 4430 + }, + { + "epoch": 1.4208, + "grad_norm": 0.5281161069869995, + "learning_rate": 2.7802816901408452e-05, + "loss": 0.1281, + "step": 4440 + }, + { + "epoch": 1.424, + "grad_norm": 0.20444443821907043, + "learning_rate": 2.774647887323944e-05, + "loss": 0.3929, + "step": 4450 + }, + { + "epoch": 1.4272, + "grad_norm": 0.12726615369319916, + "learning_rate": 2.7690140845070423e-05, + "loss": 0.1062, + "step": 4460 + }, + { + "epoch": 1.4304000000000001, + "grad_norm": 108.31690216064453, + "learning_rate": 2.7633802816901412e-05, + "loss": 0.1228, + "step": 4470 + }, + { + "epoch": 1.4336, + "grad_norm": 25.274940490722656, + "learning_rate": 2.7577464788732394e-05, + "loss": 0.2112, + "step": 4480 + }, + { + "epoch": 1.4368, + "grad_norm": 7.059344291687012, + "learning_rate": 2.7521126760563383e-05, + "loss": 0.4667, + "step": 4490 + }, + { + "epoch": 1.44, + "grad_norm": 24.798084259033203, + "learning_rate": 2.746478873239437e-05, + "loss": 0.3644, + "step": 4500 + }, + { + "epoch": 1.4432, + "grad_norm": 11.129374504089355, + "learning_rate": 2.7408450704225354e-05, + "loss": 0.4076, + "step": 4510 + }, + { + "epoch": 1.4464000000000001, + "grad_norm": 6.293646335601807, + "learning_rate": 2.735211267605634e-05, + "loss": 0.2754, + "step": 4520 + }, + { + "epoch": 1.4496, + "grad_norm": 22.136383056640625, + "learning_rate": 2.7295774647887322e-05, + "loss": 0.4665, + "step": 4530 + }, + { + "epoch": 1.4527999999999999, + "grad_norm": 36.15532684326172, + "learning_rate": 2.723943661971831e-05, + "loss": 0.2789, + "step": 4540 + }, + { + "epoch": 1.456, + "grad_norm": 58.91614532470703, + "learning_rate": 2.71830985915493e-05, + "loss": 0.2236, + "step": 4550 + }, + { + "epoch": 1.4592, + "grad_norm": 5.225749492645264, + "learning_rate": 2.712676056338028e-05, + "loss": 0.459, + "step": 4560 + }, + { + "epoch": 1.4624, + "grad_norm": 11.404582977294922, + "learning_rate": 2.707042253521127e-05, + "loss": 0.2615, + "step": 4570 + }, + { + "epoch": 1.4656, + "grad_norm": 15.184187889099121, + "learning_rate": 2.7014084507042253e-05, + "loss": 0.242, + "step": 4580 + }, + { + "epoch": 1.4687999999999999, + "grad_norm": 29.988828659057617, + "learning_rate": 2.695774647887324e-05, + "loss": 0.1761, + "step": 4590 + }, + { + "epoch": 1.472, + "grad_norm": 0.3717154562473297, + "learning_rate": 2.6901408450704224e-05, + "loss": 0.1711, + "step": 4600 + }, + { + "epoch": 1.4752, + "grad_norm": 0.5243228077888489, + "learning_rate": 2.6845070422535213e-05, + "loss": 0.3551, + "step": 4610 + }, + { + "epoch": 1.4784, + "grad_norm": 0.14952997863292694, + "learning_rate": 2.67887323943662e-05, + "loss": 0.2119, + "step": 4620 + }, + { + "epoch": 1.4816, + "grad_norm": 0.5155125856399536, + "learning_rate": 2.6732394366197184e-05, + "loss": 0.1516, + "step": 4630 + }, + { + "epoch": 1.4848, + "grad_norm": 16.079330444335938, + "learning_rate": 2.6676056338028172e-05, + "loss": 0.3477, + "step": 4640 + }, + { + "epoch": 1.488, + "grad_norm": 0.5251998901367188, + "learning_rate": 2.6619718309859155e-05, + "loss": 0.3483, + "step": 4650 + }, + { + "epoch": 1.4912, + "grad_norm": 12.90518569946289, + "learning_rate": 2.6563380281690143e-05, + "loss": 0.3346, + "step": 4660 + }, + { + "epoch": 1.4944, + "grad_norm": 3.163393259048462, + "learning_rate": 2.650704225352113e-05, + "loss": 0.4457, + "step": 4670 + }, + { + "epoch": 1.4976, + "grad_norm": 0.8409318327903748, + "learning_rate": 2.6450704225352114e-05, + "loss": 0.1024, + "step": 4680 + }, + { + "epoch": 1.5008, + "grad_norm": 0.23881012201309204, + "learning_rate": 2.63943661971831e-05, + "loss": 0.2065, + "step": 4690 + }, + { + "epoch": 1.504, + "grad_norm": 31.078039169311523, + "learning_rate": 2.6338028169014085e-05, + "loss": 0.2194, + "step": 4700 + }, + { + "epoch": 1.5072, + "grad_norm": 0.28144362568855286, + "learning_rate": 2.628169014084507e-05, + "loss": 0.0813, + "step": 4710 + }, + { + "epoch": 1.5104, + "grad_norm": 10.667701721191406, + "learning_rate": 2.6225352112676056e-05, + "loss": 0.3627, + "step": 4720 + }, + { + "epoch": 1.5135999999999998, + "grad_norm": 0.4722766578197479, + "learning_rate": 2.6169014084507042e-05, + "loss": 0.3747, + "step": 4730 + }, + { + "epoch": 1.5168, + "grad_norm": 13.2311429977417, + "learning_rate": 2.611267605633803e-05, + "loss": 0.1583, + "step": 4740 + }, + { + "epoch": 1.52, + "grad_norm": 1.4900763034820557, + "learning_rate": 2.6056338028169013e-05, + "loss": 0.2564, + "step": 4750 + }, + { + "epoch": 1.5232, + "grad_norm": 32.169681549072266, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.2757, + "step": 4760 + }, + { + "epoch": 1.5264, + "grad_norm": 25.864120483398438, + "learning_rate": 2.5943661971830984e-05, + "loss": 0.2264, + "step": 4770 + }, + { + "epoch": 1.5295999999999998, + "grad_norm": 5.702986717224121, + "learning_rate": 2.5887323943661973e-05, + "loss": 0.1946, + "step": 4780 + }, + { + "epoch": 1.5328, + "grad_norm": 0.28651973605155945, + "learning_rate": 2.583098591549296e-05, + "loss": 0.0838, + "step": 4790 + }, + { + "epoch": 1.536, + "grad_norm": 0.4322168529033661, + "learning_rate": 2.5774647887323944e-05, + "loss": 0.2331, + "step": 4800 + }, + { + "epoch": 1.5392000000000001, + "grad_norm": 0.17899250984191895, + "learning_rate": 2.5718309859154933e-05, + "loss": 0.2475, + "step": 4810 + }, + { + "epoch": 1.5424, + "grad_norm": 30.139265060424805, + "learning_rate": 2.5661971830985915e-05, + "loss": 0.4416, + "step": 4820 + }, + { + "epoch": 1.5455999999999999, + "grad_norm": 0.23452678322792053, + "learning_rate": 2.5605633802816904e-05, + "loss": 0.2825, + "step": 4830 + }, + { + "epoch": 1.5488, + "grad_norm": 0.22751548886299133, + "learning_rate": 2.5549295774647893e-05, + "loss": 0.2478, + "step": 4840 + }, + { + "epoch": 1.552, + "grad_norm": 6.262718200683594, + "learning_rate": 2.5492957746478875e-05, + "loss": 0.2214, + "step": 4850 + }, + { + "epoch": 1.5552000000000001, + "grad_norm": 0.2855672836303711, + "learning_rate": 2.5436619718309864e-05, + "loss": 0.1745, + "step": 4860 + }, + { + "epoch": 1.5584, + "grad_norm": 4.517999649047852, + "learning_rate": 2.5380281690140846e-05, + "loss": 0.2678, + "step": 4870 + }, + { + "epoch": 1.5615999999999999, + "grad_norm": 31.07318115234375, + "learning_rate": 2.5323943661971835e-05, + "loss": 0.2528, + "step": 4880 + }, + { + "epoch": 1.5648, + "grad_norm": 1.6451767683029175, + "learning_rate": 2.5267605633802817e-05, + "loss": 0.1002, + "step": 4890 + }, + { + "epoch": 1.568, + "grad_norm": 32.15398406982422, + "learning_rate": 2.5211267605633802e-05, + "loss": 0.3257, + "step": 4900 + }, + { + "epoch": 1.5712000000000002, + "grad_norm": 7.450695037841797, + "learning_rate": 2.515492957746479e-05, + "loss": 0.2022, + "step": 4910 + }, + { + "epoch": 1.5744, + "grad_norm": 14.619000434875488, + "learning_rate": 2.5098591549295773e-05, + "loss": 0.346, + "step": 4920 + }, + { + "epoch": 1.5776, + "grad_norm": 0.7622524499893188, + "learning_rate": 2.5042253521126762e-05, + "loss": 0.1638, + "step": 4930 + }, + { + "epoch": 1.5808, + "grad_norm": 2.6016695499420166, + "learning_rate": 2.4985915492957748e-05, + "loss": 0.0822, + "step": 4940 + }, + { + "epoch": 1.584, + "grad_norm": 2.1474409103393555, + "learning_rate": 2.4929577464788733e-05, + "loss": 0.1602, + "step": 4950 + }, + { + "epoch": 1.5872000000000002, + "grad_norm": 65.45417785644531, + "learning_rate": 2.487323943661972e-05, + "loss": 0.1883, + "step": 4960 + }, + { + "epoch": 1.5904, + "grad_norm": 13.360310554504395, + "learning_rate": 2.4816901408450704e-05, + "loss": 0.2619, + "step": 4970 + }, + { + "epoch": 1.5936, + "grad_norm": 0.6584329009056091, + "learning_rate": 2.476056338028169e-05, + "loss": 0.2612, + "step": 4980 + }, + { + "epoch": 1.5968, + "grad_norm": 54.88881301879883, + "learning_rate": 2.470422535211268e-05, + "loss": 0.6558, + "step": 4990 + }, + { + "epoch": 1.6, + "grad_norm": 0.08469274640083313, + "learning_rate": 2.4647887323943664e-05, + "loss": 0.2935, + "step": 5000 + }, + { + "epoch": 1.6032, + "grad_norm": 38.17769241333008, + "learning_rate": 2.459154929577465e-05, + "loss": 0.2391, + "step": 5010 + }, + { + "epoch": 1.6064, + "grad_norm": 0.1355709284543991, + "learning_rate": 2.4535211267605635e-05, + "loss": 0.1574, + "step": 5020 + }, + { + "epoch": 1.6096, + "grad_norm": 7.013975143432617, + "learning_rate": 2.447887323943662e-05, + "loss": 0.1076, + "step": 5030 + }, + { + "epoch": 1.6128, + "grad_norm": 13.909317970275879, + "learning_rate": 2.442253521126761e-05, + "loss": 0.4274, + "step": 5040 + }, + { + "epoch": 1.616, + "grad_norm": 4.903537273406982, + "learning_rate": 2.4366197183098595e-05, + "loss": 0.2527, + "step": 5050 + }, + { + "epoch": 1.6192, + "grad_norm": 9.500699996948242, + "learning_rate": 2.430985915492958e-05, + "loss": 0.4478, + "step": 5060 + }, + { + "epoch": 1.6223999999999998, + "grad_norm": 47.62290954589844, + "learning_rate": 2.4253521126760566e-05, + "loss": 0.2439, + "step": 5070 + }, + { + "epoch": 1.6256, + "grad_norm": 0.21192322671413422, + "learning_rate": 2.419718309859155e-05, + "loss": 0.2524, + "step": 5080 + }, + { + "epoch": 1.6288, + "grad_norm": 3.06548810005188, + "learning_rate": 2.4140845070422537e-05, + "loss": 0.1995, + "step": 5090 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 36.12741470336914, + "learning_rate": 2.4084507042253522e-05, + "loss": 0.2553, + "step": 5100 + }, + { + "epoch": 1.6352, + "grad_norm": 9.318374633789062, + "learning_rate": 2.4028169014084508e-05, + "loss": 0.2452, + "step": 5110 + }, + { + "epoch": 1.6383999999999999, + "grad_norm": 27.07297134399414, + "learning_rate": 2.3971830985915493e-05, + "loss": 0.4859, + "step": 5120 + }, + { + "epoch": 1.6416, + "grad_norm": 17.92713165283203, + "learning_rate": 2.391549295774648e-05, + "loss": 0.2726, + "step": 5130 + }, + { + "epoch": 1.6448, + "grad_norm": 20.595443725585938, + "learning_rate": 2.3859154929577464e-05, + "loss": 0.4673, + "step": 5140 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 15.670424461364746, + "learning_rate": 2.380281690140845e-05, + "loss": 0.131, + "step": 5150 + }, + { + "epoch": 1.6512, + "grad_norm": 0.27188238501548767, + "learning_rate": 2.374647887323944e-05, + "loss": 0.0991, + "step": 5160 + }, + { + "epoch": 1.6543999999999999, + "grad_norm": 0.1418936550617218, + "learning_rate": 2.3690140845070424e-05, + "loss": 0.3724, + "step": 5170 + }, + { + "epoch": 1.6576, + "grad_norm": 14.037035942077637, + "learning_rate": 2.363380281690141e-05, + "loss": 0.3105, + "step": 5180 + }, + { + "epoch": 1.6608, + "grad_norm": 1.6368416547775269, + "learning_rate": 2.3577464788732395e-05, + "loss": 0.174, + "step": 5190 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 16.241477966308594, + "learning_rate": 2.352112676056338e-05, + "loss": 0.2609, + "step": 5200 + }, + { + "epoch": 1.6672, + "grad_norm": 0.27356526255607605, + "learning_rate": 2.3464788732394366e-05, + "loss": 0.1827, + "step": 5210 + }, + { + "epoch": 1.6703999999999999, + "grad_norm": 35.08028030395508, + "learning_rate": 2.3408450704225355e-05, + "loss": 0.2963, + "step": 5220 + }, + { + "epoch": 1.6736, + "grad_norm": 0.12633004784584045, + "learning_rate": 2.335211267605634e-05, + "loss": 0.1284, + "step": 5230 + }, + { + "epoch": 1.6768, + "grad_norm": 10.867715835571289, + "learning_rate": 2.3295774647887326e-05, + "loss": 0.3947, + "step": 5240 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.37642917037010193, + "learning_rate": 2.323943661971831e-05, + "loss": 0.0878, + "step": 5250 + }, + { + "epoch": 1.6832, + "grad_norm": 9.886677742004395, + "learning_rate": 2.3183098591549297e-05, + "loss": 0.1667, + "step": 5260 + }, + { + "epoch": 1.6864, + "grad_norm": 0.5025785565376282, + "learning_rate": 2.3126760563380283e-05, + "loss": 0.2399, + "step": 5270 + }, + { + "epoch": 1.6896, + "grad_norm": 0.07013744115829468, + "learning_rate": 2.3070422535211268e-05, + "loss": 0.2592, + "step": 5280 + }, + { + "epoch": 1.6928, + "grad_norm": 9.038287162780762, + "learning_rate": 2.3014084507042254e-05, + "loss": 0.3243, + "step": 5290 + }, + { + "epoch": 1.696, + "grad_norm": 0.15734457969665527, + "learning_rate": 2.295774647887324e-05, + "loss": 0.0506, + "step": 5300 + }, + { + "epoch": 1.6992, + "grad_norm": 0.21506910026073456, + "learning_rate": 2.2901408450704225e-05, + "loss": 0.2006, + "step": 5310 + }, + { + "epoch": 1.7024, + "grad_norm": 11.207597732543945, + "learning_rate": 2.284507042253521e-05, + "loss": 0.2125, + "step": 5320 + }, + { + "epoch": 1.7056, + "grad_norm": 7.165248394012451, + "learning_rate": 2.27887323943662e-05, + "loss": 0.1971, + "step": 5330 + }, + { + "epoch": 1.7088, + "grad_norm": 0.8289473056793213, + "learning_rate": 2.2732394366197185e-05, + "loss": 0.1824, + "step": 5340 + }, + { + "epoch": 1.712, + "grad_norm": 1.2633789777755737, + "learning_rate": 2.267605633802817e-05, + "loss": 0.2601, + "step": 5350 + }, + { + "epoch": 1.7151999999999998, + "grad_norm": 38.94256591796875, + "learning_rate": 2.2619718309859156e-05, + "loss": 0.345, + "step": 5360 + }, + { + "epoch": 1.7184, + "grad_norm": 0.10120674222707748, + "learning_rate": 2.256338028169014e-05, + "loss": 0.1882, + "step": 5370 + }, + { + "epoch": 1.7216, + "grad_norm": 17.41254425048828, + "learning_rate": 2.2507042253521127e-05, + "loss": 0.1648, + "step": 5380 + }, + { + "epoch": 1.7248, + "grad_norm": 0.33858543634414673, + "learning_rate": 2.2450704225352115e-05, + "loss": 0.3624, + "step": 5390 + }, + { + "epoch": 1.728, + "grad_norm": 0.3513981103897095, + "learning_rate": 2.23943661971831e-05, + "loss": 0.3238, + "step": 5400 + }, + { + "epoch": 1.7311999999999999, + "grad_norm": 0.7570049166679382, + "learning_rate": 2.2338028169014086e-05, + "loss": 0.3377, + "step": 5410 + }, + { + "epoch": 1.7344, + "grad_norm": 0.7027788162231445, + "learning_rate": 2.2281690140845072e-05, + "loss": 0.1963, + "step": 5420 + }, + { + "epoch": 1.7376, + "grad_norm": 55.278343200683594, + "learning_rate": 2.2225352112676057e-05, + "loss": 0.4997, + "step": 5430 + }, + { + "epoch": 1.7408000000000001, + "grad_norm": 2.759753704071045, + "learning_rate": 2.2169014084507043e-05, + "loss": 0.2161, + "step": 5440 + }, + { + "epoch": 1.744, + "grad_norm": 13.195887565612793, + "learning_rate": 2.2112676056338032e-05, + "loss": 0.2539, + "step": 5450 + }, + { + "epoch": 1.7471999999999999, + "grad_norm": 12.78817081451416, + "learning_rate": 2.2056338028169017e-05, + "loss": 0.2056, + "step": 5460 + }, + { + "epoch": 1.7504, + "grad_norm": 40.1257209777832, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.2951, + "step": 5470 + }, + { + "epoch": 1.7536, + "grad_norm": 0.3393701910972595, + "learning_rate": 2.1943661971830985e-05, + "loss": 0.2428, + "step": 5480 + }, + { + "epoch": 1.7568000000000001, + "grad_norm": 13.551216125488281, + "learning_rate": 2.188732394366197e-05, + "loss": 0.3264, + "step": 5490 + }, + { + "epoch": 1.76, + "grad_norm": 41.21603012084961, + "learning_rate": 2.1830985915492956e-05, + "loss": 0.4246, + "step": 5500 + }, + { + "epoch": 1.7631999999999999, + "grad_norm": 9.464485168457031, + "learning_rate": 2.1774647887323945e-05, + "loss": 0.1579, + "step": 5510 + }, + { + "epoch": 1.7664, + "grad_norm": 45.843814849853516, + "learning_rate": 2.171830985915493e-05, + "loss": 0.1919, + "step": 5520 + }, + { + "epoch": 1.7696, + "grad_norm": 1.6334397792816162, + "learning_rate": 2.1661971830985916e-05, + "loss": 0.2822, + "step": 5530 + }, + { + "epoch": 1.7728000000000002, + "grad_norm": 0.7097220420837402, + "learning_rate": 2.16056338028169e-05, + "loss": 0.1146, + "step": 5540 + }, + { + "epoch": 1.776, + "grad_norm": 11.706197738647461, + "learning_rate": 2.1549295774647887e-05, + "loss": 0.2456, + "step": 5550 + }, + { + "epoch": 1.7792, + "grad_norm": 0.8858042359352112, + "learning_rate": 2.1492957746478876e-05, + "loss": 0.461, + "step": 5560 + }, + { + "epoch": 1.7824, + "grad_norm": 2.2900185585021973, + "learning_rate": 2.143661971830986e-05, + "loss": 0.0824, + "step": 5570 + }, + { + "epoch": 1.7856, + "grad_norm": 1.9041435718536377, + "learning_rate": 2.1380281690140847e-05, + "loss": 0.1843, + "step": 5580 + }, + { + "epoch": 1.7888, + "grad_norm": 9.106405258178711, + "learning_rate": 2.1323943661971832e-05, + "loss": 0.1421, + "step": 5590 + }, + { + "epoch": 1.792, + "grad_norm": 19.528039932250977, + "learning_rate": 2.1267605633802818e-05, + "loss": 0.3623, + "step": 5600 + }, + { + "epoch": 1.7952, + "grad_norm": 0.16566412150859833, + "learning_rate": 2.1211267605633803e-05, + "loss": 0.4338, + "step": 5610 + }, + { + "epoch": 1.7984, + "grad_norm": 0.8347293138504028, + "learning_rate": 2.1154929577464792e-05, + "loss": 0.2157, + "step": 5620 + }, + { + "epoch": 1.8016, + "grad_norm": 22.129648208618164, + "learning_rate": 2.1098591549295778e-05, + "loss": 0.2917, + "step": 5630 + }, + { + "epoch": 1.8048, + "grad_norm": 0.25225210189819336, + "learning_rate": 2.1042253521126763e-05, + "loss": 0.2006, + "step": 5640 + }, + { + "epoch": 1.808, + "grad_norm": 0.3600423336029053, + "learning_rate": 2.098591549295775e-05, + "loss": 0.3905, + "step": 5650 + }, + { + "epoch": 1.8112, + "grad_norm": 22.235361099243164, + "learning_rate": 2.0929577464788734e-05, + "loss": 0.2482, + "step": 5660 + }, + { + "epoch": 1.8144, + "grad_norm": 9.403947830200195, + "learning_rate": 2.087323943661972e-05, + "loss": 0.2406, + "step": 5670 + }, + { + "epoch": 1.8176, + "grad_norm": 1.0296498537063599, + "learning_rate": 2.0816901408450705e-05, + "loss": 0.1626, + "step": 5680 + }, + { + "epoch": 1.8208, + "grad_norm": 25.019081115722656, + "learning_rate": 2.076056338028169e-05, + "loss": 0.1861, + "step": 5690 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 6.003271579742432, + "learning_rate": 2.0704225352112676e-05, + "loss": 0.4198, + "step": 5700 + }, + { + "epoch": 1.8272, + "grad_norm": 0.24664323031902313, + "learning_rate": 2.064788732394366e-05, + "loss": 0.2311, + "step": 5710 + }, + { + "epoch": 1.8304, + "grad_norm": 0.3681061863899231, + "learning_rate": 2.0591549295774647e-05, + "loss": 0.3282, + "step": 5720 + }, + { + "epoch": 1.8336000000000001, + "grad_norm": 1.1691765785217285, + "learning_rate": 2.0535211267605633e-05, + "loss": 0.1885, + "step": 5730 + }, + { + "epoch": 1.8368, + "grad_norm": 43.80043029785156, + "learning_rate": 2.047887323943662e-05, + "loss": 0.2497, + "step": 5740 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 22.248729705810547, + "learning_rate": 2.0422535211267607e-05, + "loss": 0.1009, + "step": 5750 + }, + { + "epoch": 1.8432, + "grad_norm": 0.1887352615594864, + "learning_rate": 2.0366197183098592e-05, + "loss": 0.2099, + "step": 5760 + }, + { + "epoch": 1.8464, + "grad_norm": 39.37889862060547, + "learning_rate": 2.0309859154929578e-05, + "loss": 0.2349, + "step": 5770 + }, + { + "epoch": 1.8496000000000001, + "grad_norm": 2.4076569080352783, + "learning_rate": 2.0253521126760563e-05, + "loss": 0.3262, + "step": 5780 + }, + { + "epoch": 1.8528, + "grad_norm": 0.38582533597946167, + "learning_rate": 2.019718309859155e-05, + "loss": 0.0311, + "step": 5790 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.12109358608722687, + "learning_rate": 2.0140845070422538e-05, + "loss": 0.311, + "step": 5800 + }, + { + "epoch": 1.8592, + "grad_norm": 6.223245620727539, + "learning_rate": 2.0084507042253523e-05, + "loss": 0.4537, + "step": 5810 + }, + { + "epoch": 1.8624, + "grad_norm": 0.16367606818675995, + "learning_rate": 2.002816901408451e-05, + "loss": 0.3076, + "step": 5820 + }, + { + "epoch": 1.8656000000000001, + "grad_norm": 30.715484619140625, + "learning_rate": 1.9971830985915494e-05, + "loss": 0.346, + "step": 5830 + }, + { + "epoch": 1.8688, + "grad_norm": 42.86243438720703, + "learning_rate": 1.991549295774648e-05, + "loss": 0.0792, + "step": 5840 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 15.401847839355469, + "learning_rate": 1.9859154929577465e-05, + "loss": 0.2927, + "step": 5850 + }, + { + "epoch": 1.8752, + "grad_norm": 0.9566125273704529, + "learning_rate": 1.980281690140845e-05, + "loss": 0.2851, + "step": 5860 + }, + { + "epoch": 1.8784, + "grad_norm": 11.325541496276855, + "learning_rate": 1.9746478873239436e-05, + "loss": 0.4101, + "step": 5870 + }, + { + "epoch": 1.8816000000000002, + "grad_norm": 0.608905553817749, + "learning_rate": 1.9690140845070422e-05, + "loss": 0.0911, + "step": 5880 + }, + { + "epoch": 1.8848, + "grad_norm": 14.199214935302734, + "learning_rate": 1.9633802816901407e-05, + "loss": 0.4375, + "step": 5890 + }, + { + "epoch": 1.888, + "grad_norm": 20.619394302368164, + "learning_rate": 1.9577464788732393e-05, + "loss": 0.3372, + "step": 5900 + }, + { + "epoch": 1.8912, + "grad_norm": 11.778953552246094, + "learning_rate": 1.9521126760563382e-05, + "loss": 0.2411, + "step": 5910 + }, + { + "epoch": 1.8944, + "grad_norm": 111.18775939941406, + "learning_rate": 1.9464788732394367e-05, + "loss": 0.239, + "step": 5920 + }, + { + "epoch": 1.8976, + "grad_norm": 0.6485037207603455, + "learning_rate": 1.9408450704225353e-05, + "loss": 0.1002, + "step": 5930 + }, + { + "epoch": 1.9008, + "grad_norm": 51.51342010498047, + "learning_rate": 1.9352112676056338e-05, + "loss": 0.3721, + "step": 5940 + }, + { + "epoch": 1.904, + "grad_norm": 9.155081748962402, + "learning_rate": 1.9295774647887324e-05, + "loss": 0.2457, + "step": 5950 + }, + { + "epoch": 1.9072, + "grad_norm": 31.439834594726562, + "learning_rate": 1.923943661971831e-05, + "loss": 0.3378, + "step": 5960 + }, + { + "epoch": 1.9104, + "grad_norm": 38.59767532348633, + "learning_rate": 1.9183098591549298e-05, + "loss": 0.0842, + "step": 5970 + }, + { + "epoch": 1.9136, + "grad_norm": 30.20688819885254, + "learning_rate": 1.9126760563380284e-05, + "loss": 0.1471, + "step": 5980 + }, + { + "epoch": 1.9167999999999998, + "grad_norm": 76.58573150634766, + "learning_rate": 1.907042253521127e-05, + "loss": 0.2427, + "step": 5990 + }, + { + "epoch": 1.92, + "grad_norm": 0.44934359192848206, + "learning_rate": 1.9014084507042255e-05, + "loss": 0.2053, + "step": 6000 + }, + { + "epoch": 1.9232, + "grad_norm": 12.041892051696777, + "learning_rate": 1.895774647887324e-05, + "loss": 0.235, + "step": 6010 + }, + { + "epoch": 1.9264000000000001, + "grad_norm": 0.10604248195886612, + "learning_rate": 1.8901408450704226e-05, + "loss": 0.2931, + "step": 6020 + }, + { + "epoch": 1.9296, + "grad_norm": 52.336849212646484, + "learning_rate": 1.8845070422535215e-05, + "loss": 0.46, + "step": 6030 + }, + { + "epoch": 1.9327999999999999, + "grad_norm": 0.30715158581733704, + "learning_rate": 1.87887323943662e-05, + "loss": 0.086, + "step": 6040 + }, + { + "epoch": 1.936, + "grad_norm": 12.268070220947266, + "learning_rate": 1.8732394366197186e-05, + "loss": 0.3663, + "step": 6050 + }, + { + "epoch": 1.9392, + "grad_norm": 0.7918230891227722, + "learning_rate": 1.867605633802817e-05, + "loss": 0.2688, + "step": 6060 + }, + { + "epoch": 1.9424000000000001, + "grad_norm": 0.276155024766922, + "learning_rate": 1.8619718309859157e-05, + "loss": 0.2618, + "step": 6070 + }, + { + "epoch": 1.9456, + "grad_norm": 0.6748977899551392, + "learning_rate": 1.8563380281690142e-05, + "loss": 0.0704, + "step": 6080 + }, + { + "epoch": 1.9487999999999999, + "grad_norm": 27.20131492614746, + "learning_rate": 1.8507042253521128e-05, + "loss": 0.4101, + "step": 6090 + }, + { + "epoch": 1.952, + "grad_norm": 0.1357765942811966, + "learning_rate": 1.8450704225352113e-05, + "loss": 0.354, + "step": 6100 + }, + { + "epoch": 1.9552, + "grad_norm": 21.598268508911133, + "learning_rate": 1.83943661971831e-05, + "loss": 0.2213, + "step": 6110 + }, + { + "epoch": 1.9584000000000001, + "grad_norm": 0.2529258728027344, + "learning_rate": 1.8338028169014084e-05, + "loss": 0.3205, + "step": 6120 + }, + { + "epoch": 1.9616, + "grad_norm": 2.0377094745635986, + "learning_rate": 1.828169014084507e-05, + "loss": 0.2115, + "step": 6130 + }, + { + "epoch": 1.9647999999999999, + "grad_norm": 9.584620475769043, + "learning_rate": 1.822535211267606e-05, + "loss": 0.4502, + "step": 6140 + }, + { + "epoch": 1.968, + "grad_norm": 0.26572760939598083, + "learning_rate": 1.8169014084507044e-05, + "loss": 0.4565, + "step": 6150 + }, + { + "epoch": 1.9712, + "grad_norm": 0.4592236876487732, + "learning_rate": 1.811267605633803e-05, + "loss": 0.1688, + "step": 6160 + }, + { + "epoch": 1.9744000000000002, + "grad_norm": 2.714552879333496, + "learning_rate": 1.8056338028169015e-05, + "loss": 0.1334, + "step": 6170 + }, + { + "epoch": 1.9776, + "grad_norm": 1.833774209022522, + "learning_rate": 1.8e-05, + "loss": 0.2703, + "step": 6180 + }, + { + "epoch": 1.9808, + "grad_norm": 0.23094186186790466, + "learning_rate": 1.7943661971830986e-05, + "loss": 0.2316, + "step": 6190 + }, + { + "epoch": 1.984, + "grad_norm": 2.593341112136841, + "learning_rate": 1.7887323943661975e-05, + "loss": 0.1416, + "step": 6200 + }, + { + "epoch": 1.9872, + "grad_norm": 63.741641998291016, + "learning_rate": 1.783098591549296e-05, + "loss": 0.1999, + "step": 6210 + }, + { + "epoch": 1.9904, + "grad_norm": 59.86637878417969, + "learning_rate": 1.7774647887323946e-05, + "loss": 0.7514, + "step": 6220 + }, + { + "epoch": 1.9936, + "grad_norm": 1.3984020948410034, + "learning_rate": 1.771830985915493e-05, + "loss": 0.3885, + "step": 6230 + }, + { + "epoch": 1.9968, + "grad_norm": 25.51970863342285, + "learning_rate": 1.7661971830985917e-05, + "loss": 0.1954, + "step": 6240 + }, + { + "epoch": 2.0, + "grad_norm": 0.26810941100120544, + "learning_rate": 1.7605633802816902e-05, + "loss": 0.2831, + "step": 6250 + }, + { + "epoch": 2.0032, + "grad_norm": 25.82304573059082, + "learning_rate": 1.7549295774647888e-05, + "loss": 0.1234, + "step": 6260 + }, + { + "epoch": 2.0064, + "grad_norm": 0.15285342931747437, + "learning_rate": 1.7492957746478873e-05, + "loss": 0.1264, + "step": 6270 + }, + { + "epoch": 2.0096, + "grad_norm": 0.1930648535490036, + "learning_rate": 1.743661971830986e-05, + "loss": 0.1633, + "step": 6280 + }, + { + "epoch": 2.0128, + "grad_norm": 10.310894966125488, + "learning_rate": 1.7380281690140844e-05, + "loss": 0.0062, + "step": 6290 + }, + { + "epoch": 2.016, + "grad_norm": 0.060901541262865067, + "learning_rate": 1.732394366197183e-05, + "loss": 0.0814, + "step": 6300 + }, + { + "epoch": 2.0192, + "grad_norm": 0.14364077150821686, + "learning_rate": 1.7267605633802815e-05, + "loss": 0.0068, + "step": 6310 + }, + { + "epoch": 2.0224, + "grad_norm": 0.5780632495880127, + "learning_rate": 1.7211267605633804e-05, + "loss": 0.0471, + "step": 6320 + }, + { + "epoch": 2.0256, + "grad_norm": 0.5313758850097656, + "learning_rate": 1.715492957746479e-05, + "loss": 0.0619, + "step": 6330 + }, + { + "epoch": 2.0288, + "grad_norm": 0.028105057775974274, + "learning_rate": 1.7098591549295775e-05, + "loss": 0.009, + "step": 6340 + }, + { + "epoch": 2.032, + "grad_norm": 1.1972317695617676, + "learning_rate": 1.704225352112676e-05, + "loss": 0.0622, + "step": 6350 + }, + { + "epoch": 2.0352, + "grad_norm": 0.027558835223317146, + "learning_rate": 1.6985915492957746e-05, + "loss": 0.2316, + "step": 6360 + }, + { + "epoch": 2.0384, + "grad_norm": 0.04284098371863365, + "learning_rate": 1.6929577464788735e-05, + "loss": 0.0582, + "step": 6370 + }, + { + "epoch": 2.0416, + "grad_norm": 0.1924617737531662, + "learning_rate": 1.687323943661972e-05, + "loss": 0.0691, + "step": 6380 + }, + { + "epoch": 2.0448, + "grad_norm": 0.036435432732105255, + "learning_rate": 1.6816901408450706e-05, + "loss": 0.1442, + "step": 6390 + }, + { + "epoch": 2.048, + "grad_norm": 0.8796645402908325, + "learning_rate": 1.676056338028169e-05, + "loss": 0.0757, + "step": 6400 + }, + { + "epoch": 2.0512, + "grad_norm": 0.6916587352752686, + "learning_rate": 1.6704225352112677e-05, + "loss": 0.1356, + "step": 6410 + }, + { + "epoch": 2.0544, + "grad_norm": 0.10934862494468689, + "learning_rate": 1.6647887323943663e-05, + "loss": 0.211, + "step": 6420 + }, + { + "epoch": 2.0576, + "grad_norm": 0.03238527849316597, + "learning_rate": 1.659154929577465e-05, + "loss": 0.0556, + "step": 6430 + }, + { + "epoch": 2.0608, + "grad_norm": 0.25189611315727234, + "learning_rate": 1.6535211267605634e-05, + "loss": 0.0208, + "step": 6440 + }, + { + "epoch": 2.064, + "grad_norm": 0.08050217479467392, + "learning_rate": 1.647887323943662e-05, + "loss": 0.0021, + "step": 6450 + }, + { + "epoch": 2.0672, + "grad_norm": 0.045152150094509125, + "learning_rate": 1.6422535211267605e-05, + "loss": 0.0743, + "step": 6460 + }, + { + "epoch": 2.0704, + "grad_norm": 0.036941394209861755, + "learning_rate": 1.636619718309859e-05, + "loss": 0.1484, + "step": 6470 + }, + { + "epoch": 2.0736, + "grad_norm": 0.024720242246985435, + "learning_rate": 1.6309859154929576e-05, + "loss": 0.1229, + "step": 6480 + }, + { + "epoch": 2.0768, + "grad_norm": 0.033186838030815125, + "learning_rate": 1.6253521126760565e-05, + "loss": 0.1691, + "step": 6490 + }, + { + "epoch": 2.08, + "grad_norm": 0.04443328082561493, + "learning_rate": 1.619718309859155e-05, + "loss": 0.1783, + "step": 6500 + }, + { + "epoch": 2.0832, + "grad_norm": 134.47421264648438, + "learning_rate": 1.6140845070422536e-05, + "loss": 0.2483, + "step": 6510 + }, + { + "epoch": 2.0864, + "grad_norm": 5.727556228637695, + "learning_rate": 1.608450704225352e-05, + "loss": 0.1534, + "step": 6520 + }, + { + "epoch": 2.0896, + "grad_norm": 0.7954875230789185, + "learning_rate": 1.6028169014084507e-05, + "loss": 0.1282, + "step": 6530 + }, + { + "epoch": 2.0928, + "grad_norm": 0.08250103145837784, + "learning_rate": 1.5971830985915492e-05, + "loss": 0.2734, + "step": 6540 + }, + { + "epoch": 2.096, + "grad_norm": 0.04844718798995018, + "learning_rate": 1.591549295774648e-05, + "loss": 0.0514, + "step": 6550 + }, + { + "epoch": 2.0992, + "grad_norm": 0.04677910357713699, + "learning_rate": 1.5859154929577466e-05, + "loss": 0.0621, + "step": 6560 + }, + { + "epoch": 2.1024, + "grad_norm": 0.12014532834291458, + "learning_rate": 1.5802816901408452e-05, + "loss": 0.0712, + "step": 6570 + }, + { + "epoch": 2.1056, + "grad_norm": 0.18135568499565125, + "learning_rate": 1.5746478873239437e-05, + "loss": 0.0642, + "step": 6580 + }, + { + "epoch": 2.1088, + "grad_norm": 0.13500288128852844, + "learning_rate": 1.5690140845070423e-05, + "loss": 0.0566, + "step": 6590 + }, + { + "epoch": 2.112, + "grad_norm": 0.03971581906080246, + "learning_rate": 1.5633802816901412e-05, + "loss": 0.0039, + "step": 6600 + }, + { + "epoch": 2.1152, + "grad_norm": 0.12814994156360626, + "learning_rate": 1.5577464788732397e-05, + "loss": 0.1774, + "step": 6610 + }, + { + "epoch": 2.1184, + "grad_norm": 0.02763848565518856, + "learning_rate": 1.5521126760563383e-05, + "loss": 0.2017, + "step": 6620 + }, + { + "epoch": 2.1216, + "grad_norm": 0.16662102937698364, + "learning_rate": 1.546478873239437e-05, + "loss": 0.0964, + "step": 6630 + }, + { + "epoch": 2.1248, + "grad_norm": 0.0411493182182312, + "learning_rate": 1.5408450704225354e-05, + "loss": 0.055, + "step": 6640 + }, + { + "epoch": 2.128, + "grad_norm": 0.10390494018793106, + "learning_rate": 1.535211267605634e-05, + "loss": 0.1993, + "step": 6650 + }, + { + "epoch": 2.1312, + "grad_norm": 32.90834426879883, + "learning_rate": 1.5295774647887325e-05, + "loss": 0.2295, + "step": 6660 + }, + { + "epoch": 2.1344, + "grad_norm": 0.20629918575286865, + "learning_rate": 1.5239436619718312e-05, + "loss": 0.1365, + "step": 6670 + }, + { + "epoch": 2.1376, + "grad_norm": 0.06436211615800858, + "learning_rate": 1.5183098591549298e-05, + "loss": 0.2804, + "step": 6680 + }, + { + "epoch": 2.1408, + "grad_norm": 6.357541561126709, + "learning_rate": 1.5126760563380283e-05, + "loss": 0.1547, + "step": 6690 + }, + { + "epoch": 2.144, + "grad_norm": 0.031177496537566185, + "learning_rate": 1.5070422535211269e-05, + "loss": 0.1221, + "step": 6700 + }, + { + "epoch": 2.1471999999999998, + "grad_norm": 0.0401877760887146, + "learning_rate": 1.5014084507042252e-05, + "loss": 0.0467, + "step": 6710 + }, + { + "epoch": 2.1504, + "grad_norm": 0.04002746194601059, + "learning_rate": 1.4957746478873241e-05, + "loss": 0.2081, + "step": 6720 + }, + { + "epoch": 2.1536, + "grad_norm": 0.08599916845560074, + "learning_rate": 1.4901408450704227e-05, + "loss": 0.0094, + "step": 6730 + }, + { + "epoch": 2.1568, + "grad_norm": 792.8843994140625, + "learning_rate": 1.4845070422535212e-05, + "loss": 0.0313, + "step": 6740 + }, + { + "epoch": 2.16, + "grad_norm": 49.27170944213867, + "learning_rate": 1.4788732394366198e-05, + "loss": 0.0488, + "step": 6750 + }, + { + "epoch": 2.1632, + "grad_norm": 0.027708498761057854, + "learning_rate": 1.4732394366197183e-05, + "loss": 0.2522, + "step": 6760 + }, + { + "epoch": 2.1664, + "grad_norm": 144.439697265625, + "learning_rate": 1.4676056338028169e-05, + "loss": 0.2231, + "step": 6770 + }, + { + "epoch": 2.1696, + "grad_norm": 0.05224217101931572, + "learning_rate": 1.4619718309859156e-05, + "loss": 0.0018, + "step": 6780 + }, + { + "epoch": 2.1728, + "grad_norm": 85.55796813964844, + "learning_rate": 1.4563380281690141e-05, + "loss": 0.218, + "step": 6790 + }, + { + "epoch": 2.176, + "grad_norm": 0.17730030417442322, + "learning_rate": 1.4507042253521127e-05, + "loss": 0.13, + "step": 6800 + }, + { + "epoch": 2.1792, + "grad_norm": 0.05483116954565048, + "learning_rate": 1.4450704225352112e-05, + "loss": 0.0386, + "step": 6810 + }, + { + "epoch": 2.1824, + "grad_norm": 0.03330325335264206, + "learning_rate": 1.4394366197183098e-05, + "loss": 0.0036, + "step": 6820 + }, + { + "epoch": 2.1856, + "grad_norm": 0.030421894043684006, + "learning_rate": 1.4338028169014083e-05, + "loss": 0.1214, + "step": 6830 + }, + { + "epoch": 2.1888, + "grad_norm": 0.037813425064086914, + "learning_rate": 1.4281690140845072e-05, + "loss": 0.002, + "step": 6840 + }, + { + "epoch": 2.192, + "grad_norm": 0.4608314335346222, + "learning_rate": 1.4225352112676058e-05, + "loss": 0.2043, + "step": 6850 + }, + { + "epoch": 2.1952, + "grad_norm": 0.16903652250766754, + "learning_rate": 1.4169014084507043e-05, + "loss": 0.2208, + "step": 6860 + }, + { + "epoch": 2.1984, + "grad_norm": 0.09764442592859268, + "learning_rate": 1.4112676056338029e-05, + "loss": 0.2925, + "step": 6870 + }, + { + "epoch": 2.2016, + "grad_norm": 0.04944216087460518, + "learning_rate": 1.4056338028169014e-05, + "loss": 0.0023, + "step": 6880 + }, + { + "epoch": 2.2048, + "grad_norm": 18.86257553100586, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.1508, + "step": 6890 + }, + { + "epoch": 2.208, + "grad_norm": 5.072443962097168, + "learning_rate": 1.3943661971830987e-05, + "loss": 0.1069, + "step": 6900 + }, + { + "epoch": 2.2112, + "grad_norm": 0.12859505414962769, + "learning_rate": 1.3887323943661972e-05, + "loss": 0.0233, + "step": 6910 + }, + { + "epoch": 2.2144, + "grad_norm": 0.06567766517400742, + "learning_rate": 1.3830985915492958e-05, + "loss": 0.0834, + "step": 6920 + }, + { + "epoch": 2.2176, + "grad_norm": 15.95632266998291, + "learning_rate": 1.3774647887323943e-05, + "loss": 0.3579, + "step": 6930 + }, + { + "epoch": 2.2208, + "grad_norm": 0.274181067943573, + "learning_rate": 1.3718309859154929e-05, + "loss": 0.1515, + "step": 6940 + }, + { + "epoch": 2.224, + "grad_norm": 0.13101747632026672, + "learning_rate": 1.3661971830985918e-05, + "loss": 0.0393, + "step": 6950 + }, + { + "epoch": 2.2272, + "grad_norm": 1.139413595199585, + "learning_rate": 1.3605633802816903e-05, + "loss": 0.1484, + "step": 6960 + }, + { + "epoch": 2.2304, + "grad_norm": 0.057852111756801605, + "learning_rate": 1.3549295774647889e-05, + "loss": 0.0394, + "step": 6970 + }, + { + "epoch": 2.2336, + "grad_norm": 0.6658930778503418, + "learning_rate": 1.3492957746478874e-05, + "loss": 0.0675, + "step": 6980 + }, + { + "epoch": 2.2368, + "grad_norm": 0.057538602501153946, + "learning_rate": 1.343661971830986e-05, + "loss": 0.1998, + "step": 6990 + }, + { + "epoch": 2.24, + "grad_norm": 0.08062786608934402, + "learning_rate": 1.3380281690140845e-05, + "loss": 0.1043, + "step": 7000 + }, + { + "epoch": 2.2432, + "grad_norm": 12.996604919433594, + "learning_rate": 1.3323943661971833e-05, + "loss": 0.0718, + "step": 7010 + }, + { + "epoch": 2.2464, + "grad_norm": 0.1011863648891449, + "learning_rate": 1.3267605633802818e-05, + "loss": 0.0694, + "step": 7020 + }, + { + "epoch": 2.2496, + "grad_norm": 20.72796058654785, + "learning_rate": 1.3211267605633804e-05, + "loss": 0.1426, + "step": 7030 + }, + { + "epoch": 2.2528, + "grad_norm": 0.22724929451942444, + "learning_rate": 1.3154929577464789e-05, + "loss": 0.0025, + "step": 7040 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 229.4677734375, + "learning_rate": 1.3098591549295775e-05, + "loss": 0.2219, + "step": 7050 + }, + { + "epoch": 2.2592, + "grad_norm": 0.1337730884552002, + "learning_rate": 1.304225352112676e-05, + "loss": 0.1146, + "step": 7060 + }, + { + "epoch": 2.2624, + "grad_norm": 0.08331338316202164, + "learning_rate": 1.2985915492957749e-05, + "loss": 0.0336, + "step": 7070 + }, + { + "epoch": 2.2656, + "grad_norm": 0.047301776707172394, + "learning_rate": 1.2929577464788733e-05, + "loss": 0.0719, + "step": 7080 + }, + { + "epoch": 2.2688, + "grad_norm": 0.0657852441072464, + "learning_rate": 1.2873239436619718e-05, + "loss": 0.1904, + "step": 7090 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 0.033138763159513474, + "learning_rate": 1.2816901408450704e-05, + "loss": 0.0779, + "step": 7100 + }, + { + "epoch": 2.2752, + "grad_norm": 37.34537887573242, + "learning_rate": 1.276056338028169e-05, + "loss": 0.1858, + "step": 7110 + }, + { + "epoch": 2.2784, + "grad_norm": 0.10586226731538773, + "learning_rate": 1.2704225352112675e-05, + "loss": 0.002, + "step": 7120 + }, + { + "epoch": 2.2816, + "grad_norm": 0.0424388162791729, + "learning_rate": 1.2647887323943664e-05, + "loss": 0.1177, + "step": 7130 + }, + { + "epoch": 2.2848, + "grad_norm": 0.05596411973237991, + "learning_rate": 1.259154929577465e-05, + "loss": 0.2422, + "step": 7140 + }, + { + "epoch": 2.288, + "grad_norm": 0.05766447260975838, + "learning_rate": 1.2535211267605635e-05, + "loss": 0.0803, + "step": 7150 + }, + { + "epoch": 2.2912, + "grad_norm": 1.7550102472305298, + "learning_rate": 1.247887323943662e-05, + "loss": 0.1408, + "step": 7160 + }, + { + "epoch": 2.2944, + "grad_norm": 0.13066236674785614, + "learning_rate": 1.2422535211267607e-05, + "loss": 0.002, + "step": 7170 + }, + { + "epoch": 2.2976, + "grad_norm": 0.1156509518623352, + "learning_rate": 1.2366197183098593e-05, + "loss": 0.2522, + "step": 7180 + }, + { + "epoch": 2.3008, + "grad_norm": 0.09673482924699783, + "learning_rate": 1.2309859154929577e-05, + "loss": 0.0857, + "step": 7190 + }, + { + "epoch": 2.304, + "grad_norm": 0.8121844530105591, + "learning_rate": 1.2253521126760564e-05, + "loss": 0.193, + "step": 7200 + }, + { + "epoch": 2.3072, + "grad_norm": 0.15363769233226776, + "learning_rate": 1.219718309859155e-05, + "loss": 0.1038, + "step": 7210 + }, + { + "epoch": 2.3104, + "grad_norm": 0.10093524307012558, + "learning_rate": 1.2140845070422535e-05, + "loss": 0.2331, + "step": 7220 + }, + { + "epoch": 2.3136, + "grad_norm": 0.4434497058391571, + "learning_rate": 1.2084507042253522e-05, + "loss": 0.0804, + "step": 7230 + }, + { + "epoch": 2.3168, + "grad_norm": 8.138230323791504, + "learning_rate": 1.2028169014084508e-05, + "loss": 0.2648, + "step": 7240 + }, + { + "epoch": 2.32, + "grad_norm": 45.24201583862305, + "learning_rate": 1.1971830985915493e-05, + "loss": 0.2238, + "step": 7250 + }, + { + "epoch": 2.3232, + "grad_norm": 0.13272492587566376, + "learning_rate": 1.191549295774648e-05, + "loss": 0.2196, + "step": 7260 + }, + { + "epoch": 2.3264, + "grad_norm": 89.29029846191406, + "learning_rate": 1.1859154929577466e-05, + "loss": 0.3195, + "step": 7270 + }, + { + "epoch": 2.3296, + "grad_norm": 0.09818959981203079, + "learning_rate": 1.1802816901408451e-05, + "loss": 0.2584, + "step": 7280 + }, + { + "epoch": 2.3327999999999998, + "grad_norm": 0.21061986684799194, + "learning_rate": 1.1746478873239437e-05, + "loss": 0.0191, + "step": 7290 + }, + { + "epoch": 2.336, + "grad_norm": 0.12225896865129471, + "learning_rate": 1.1690140845070422e-05, + "loss": 0.1498, + "step": 7300 + }, + { + "epoch": 2.3392, + "grad_norm": 0.10590647161006927, + "learning_rate": 1.163380281690141e-05, + "loss": 0.0103, + "step": 7310 + }, + { + "epoch": 2.3424, + "grad_norm": 0.0610116645693779, + "learning_rate": 1.1577464788732395e-05, + "loss": 0.1074, + "step": 7320 + }, + { + "epoch": 2.3456, + "grad_norm": 0.21642152965068817, + "learning_rate": 1.152112676056338e-05, + "loss": 0.0459, + "step": 7330 + }, + { + "epoch": 2.3487999999999998, + "grad_norm": 0.12459522485733032, + "learning_rate": 1.1464788732394368e-05, + "loss": 0.0251, + "step": 7340 + }, + { + "epoch": 2.352, + "grad_norm": 0.03281530365347862, + "learning_rate": 1.1408450704225353e-05, + "loss": 0.0669, + "step": 7350 + }, + { + "epoch": 2.3552, + "grad_norm": 26.73065757751465, + "learning_rate": 1.1352112676056339e-05, + "loss": 0.287, + "step": 7360 + }, + { + "epoch": 2.3584, + "grad_norm": 55.194541931152344, + "learning_rate": 1.1295774647887324e-05, + "loss": 0.034, + "step": 7370 + }, + { + "epoch": 2.3616, + "grad_norm": 2.0578792095184326, + "learning_rate": 1.123943661971831e-05, + "loss": 0.0027, + "step": 7380 + }, + { + "epoch": 2.3648, + "grad_norm": 4.148108005523682, + "learning_rate": 1.1183098591549295e-05, + "loss": 0.2128, + "step": 7390 + }, + { + "epoch": 2.368, + "grad_norm": 0.11812355369329453, + "learning_rate": 1.1126760563380282e-05, + "loss": 0.0027, + "step": 7400 + }, + { + "epoch": 2.3712, + "grad_norm": 0.03446757793426514, + "learning_rate": 1.1070422535211268e-05, + "loss": 0.2747, + "step": 7410 + }, + { + "epoch": 2.3744, + "grad_norm": 16.427898406982422, + "learning_rate": 1.1014084507042253e-05, + "loss": 0.0905, + "step": 7420 + }, + { + "epoch": 2.3776, + "grad_norm": 0.08660475164651871, + "learning_rate": 1.095774647887324e-05, + "loss": 0.0019, + "step": 7430 + }, + { + "epoch": 2.3808, + "grad_norm": 0.038691841065883636, + "learning_rate": 1.0901408450704226e-05, + "loss": 0.0015, + "step": 7440 + }, + { + "epoch": 2.384, + "grad_norm": 0.037728451192379, + "learning_rate": 1.0845070422535212e-05, + "loss": 0.0715, + "step": 7450 + }, + { + "epoch": 2.3872, + "grad_norm": 0.17045029997825623, + "learning_rate": 1.0788732394366199e-05, + "loss": 0.1117, + "step": 7460 + }, + { + "epoch": 2.3904, + "grad_norm": 0.061354752629995346, + "learning_rate": 1.0732394366197184e-05, + "loss": 0.1794, + "step": 7470 + }, + { + "epoch": 2.3936, + "grad_norm": 0.052094943821430206, + "learning_rate": 1.067605633802817e-05, + "loss": 0.0585, + "step": 7480 + }, + { + "epoch": 2.3968, + "grad_norm": 81.28450012207031, + "learning_rate": 1.0619718309859155e-05, + "loss": 0.0784, + "step": 7490 + }, + { + "epoch": 2.4, + "grad_norm": 3.190966844558716, + "learning_rate": 1.056338028169014e-05, + "loss": 0.2101, + "step": 7500 + }, + { + "epoch": 2.4032, + "grad_norm": 0.030748562887310982, + "learning_rate": 1.0507042253521126e-05, + "loss": 0.0671, + "step": 7510 + }, + { + "epoch": 2.4064, + "grad_norm": 0.1670941859483719, + "learning_rate": 1.0450704225352113e-05, + "loss": 0.2274, + "step": 7520 + }, + { + "epoch": 2.4096, + "grad_norm": 0.06420325487852097, + "learning_rate": 1.0394366197183099e-05, + "loss": 0.0918, + "step": 7530 + }, + { + "epoch": 2.4128, + "grad_norm": 0.030242426320910454, + "learning_rate": 1.0338028169014086e-05, + "loss": 0.0561, + "step": 7540 + }, + { + "epoch": 2.416, + "grad_norm": 0.0714086964726448, + "learning_rate": 1.0281690140845072e-05, + "loss": 0.2359, + "step": 7550 + }, + { + "epoch": 2.4192, + "grad_norm": 0.17600581049919128, + "learning_rate": 1.0225352112676057e-05, + "loss": 0.0605, + "step": 7560 + }, + { + "epoch": 2.4224, + "grad_norm": 0.03392624855041504, + "learning_rate": 1.0169014084507043e-05, + "loss": 0.0405, + "step": 7570 + }, + { + "epoch": 2.4256, + "grad_norm": 0.02195708081126213, + "learning_rate": 1.0112676056338028e-05, + "loss": 0.0828, + "step": 7580 + }, + { + "epoch": 2.4288, + "grad_norm": 0.08389753103256226, + "learning_rate": 1.0056338028169014e-05, + "loss": 0.1063, + "step": 7590 + }, + { + "epoch": 2.432, + "grad_norm": 4.389575481414795, + "learning_rate": 1e-05, + "loss": 0.5413, + "step": 7600 + }, + { + "epoch": 2.4352, + "grad_norm": 0.2542371153831482, + "learning_rate": 9.943661971830986e-06, + "loss": 0.0079, + "step": 7610 + }, + { + "epoch": 2.4384, + "grad_norm": 0.2106814831495285, + "learning_rate": 9.887323943661972e-06, + "loss": 0.1727, + "step": 7620 + }, + { + "epoch": 2.4416, + "grad_norm": 212.36619567871094, + "learning_rate": 9.830985915492959e-06, + "loss": 0.0392, + "step": 7630 + }, + { + "epoch": 2.4448, + "grad_norm": 80.81587219238281, + "learning_rate": 9.774647887323945e-06, + "loss": 0.1838, + "step": 7640 + }, + { + "epoch": 2.448, + "grad_norm": 23.665437698364258, + "learning_rate": 9.71830985915493e-06, + "loss": 0.2827, + "step": 7650 + }, + { + "epoch": 2.4512, + "grad_norm": 0.037149883806705475, + "learning_rate": 9.661971830985917e-06, + "loss": 0.0753, + "step": 7660 + }, + { + "epoch": 2.4544, + "grad_norm": 0.2889798581600189, + "learning_rate": 9.605633802816901e-06, + "loss": 0.0021, + "step": 7670 + }, + { + "epoch": 2.4576000000000002, + "grad_norm": 39.32601547241211, + "learning_rate": 9.549295774647887e-06, + "loss": 0.1529, + "step": 7680 + }, + { + "epoch": 2.4608, + "grad_norm": 14.48507022857666, + "learning_rate": 9.492957746478874e-06, + "loss": 0.3042, + "step": 7690 + }, + { + "epoch": 2.464, + "grad_norm": 13.561240196228027, + "learning_rate": 9.43661971830986e-06, + "loss": 0.2097, + "step": 7700 + }, + { + "epoch": 2.4672, + "grad_norm": 0.1803174614906311, + "learning_rate": 9.380281690140845e-06, + "loss": 0.198, + "step": 7710 + }, + { + "epoch": 2.4704, + "grad_norm": 0.10523468255996704, + "learning_rate": 9.323943661971832e-06, + "loss": 0.138, + "step": 7720 + }, + { + "epoch": 2.4736000000000002, + "grad_norm": 0.13104431331157684, + "learning_rate": 9.267605633802817e-06, + "loss": 0.0508, + "step": 7730 + }, + { + "epoch": 2.4768, + "grad_norm": 0.6252680420875549, + "learning_rate": 9.211267605633803e-06, + "loss": 0.0026, + "step": 7740 + }, + { + "epoch": 2.48, + "grad_norm": 0.3997954726219177, + "learning_rate": 9.15492957746479e-06, + "loss": 0.0091, + "step": 7750 + }, + { + "epoch": 2.4832, + "grad_norm": 0.063034288585186, + "learning_rate": 9.098591549295776e-06, + "loss": 0.0568, + "step": 7760 + }, + { + "epoch": 2.4864, + "grad_norm": 0.26265960931777954, + "learning_rate": 9.042253521126761e-06, + "loss": 0.1125, + "step": 7770 + }, + { + "epoch": 2.4896, + "grad_norm": 16.294443130493164, + "learning_rate": 8.985915492957747e-06, + "loss": 0.0943, + "step": 7780 + }, + { + "epoch": 2.4928, + "grad_norm": 0.042526353150606155, + "learning_rate": 8.929577464788732e-06, + "loss": 0.0812, + "step": 7790 + }, + { + "epoch": 2.496, + "grad_norm": 0.23846685886383057, + "learning_rate": 8.87323943661972e-06, + "loss": 0.2302, + "step": 7800 + }, + { + "epoch": 2.4992, + "grad_norm": 0.08233381807804108, + "learning_rate": 8.816901408450705e-06, + "loss": 0.2928, + "step": 7810 + }, + { + "epoch": 2.5023999999999997, + "grad_norm": 0.319055438041687, + "learning_rate": 8.76056338028169e-06, + "loss": 0.0741, + "step": 7820 + }, + { + "epoch": 2.5056000000000003, + "grad_norm": 4.767480850219727, + "learning_rate": 8.704225352112677e-06, + "loss": 0.2083, + "step": 7830 + }, + { + "epoch": 2.5088, + "grad_norm": 0.0436800941824913, + "learning_rate": 8.647887323943663e-06, + "loss": 0.084, + "step": 7840 + }, + { + "epoch": 2.512, + "grad_norm": 102.92646789550781, + "learning_rate": 8.591549295774648e-06, + "loss": 0.0866, + "step": 7850 + }, + { + "epoch": 2.5152, + "grad_norm": 0.14441460371017456, + "learning_rate": 8.535211267605634e-06, + "loss": 0.0812, + "step": 7860 + }, + { + "epoch": 2.5183999999999997, + "grad_norm": 43.7078971862793, + "learning_rate": 8.47887323943662e-06, + "loss": 0.186, + "step": 7870 + }, + { + "epoch": 2.5216, + "grad_norm": 0.21896377205848694, + "learning_rate": 8.422535211267605e-06, + "loss": 0.022, + "step": 7880 + }, + { + "epoch": 2.5248, + "grad_norm": 0.0571066252887249, + "learning_rate": 8.366197183098592e-06, + "loss": 0.0039, + "step": 7890 + }, + { + "epoch": 2.528, + "grad_norm": 0.02678661048412323, + "learning_rate": 8.309859154929578e-06, + "loss": 0.0926, + "step": 7900 + }, + { + "epoch": 2.5312, + "grad_norm": 11.297701835632324, + "learning_rate": 8.253521126760563e-06, + "loss": 0.1038, + "step": 7910 + }, + { + "epoch": 2.5343999999999998, + "grad_norm": 0.04962446540594101, + "learning_rate": 8.19718309859155e-06, + "loss": 0.0019, + "step": 7920 + }, + { + "epoch": 2.5376, + "grad_norm": 17.65663719177246, + "learning_rate": 8.140845070422536e-06, + "loss": 0.128, + "step": 7930 + }, + { + "epoch": 2.5408, + "grad_norm": 33.21250915527344, + "learning_rate": 8.084507042253521e-06, + "loss": 0.1849, + "step": 7940 + }, + { + "epoch": 2.544, + "grad_norm": 52.65444564819336, + "learning_rate": 8.028169014084509e-06, + "loss": 0.0707, + "step": 7950 + }, + { + "epoch": 2.5472, + "grad_norm": 18.572467803955078, + "learning_rate": 7.971830985915494e-06, + "loss": 0.1295, + "step": 7960 + }, + { + "epoch": 2.5504, + "grad_norm": 0.7327030897140503, + "learning_rate": 7.915492957746478e-06, + "loss": 0.0859, + "step": 7970 + }, + { + "epoch": 2.5536, + "grad_norm": 0.0810910239815712, + "learning_rate": 7.859154929577465e-06, + "loss": 0.0147, + "step": 7980 + }, + { + "epoch": 2.5568, + "grad_norm": 0.1518411636352539, + "learning_rate": 7.80281690140845e-06, + "loss": 0.0688, + "step": 7990 + }, + { + "epoch": 2.56, + "grad_norm": 0.07638181000947952, + "learning_rate": 7.746478873239436e-06, + "loss": 0.2128, + "step": 8000 + } + ], + "logging_steps": 10, + "max_steps": 9375, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4209776885760000.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}