| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.56, | |
| "eval_steps": 500, | |
| "global_step": 8000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0032, | |
| "grad_norm": 4.3523268699646, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 0.7342, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0064, | |
| "grad_norm": 4.409250259399414, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.7042, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0096, | |
| "grad_norm": 6.984313488006592, | |
| "learning_rate": 3e-06, | |
| "loss": 0.6975, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0128, | |
| "grad_norm": 4.685063362121582, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.6833, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 8.828569412231445, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7077, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0192, | |
| "grad_norm": 5.112845420837402, | |
| "learning_rate": 6e-06, | |
| "loss": 0.66, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0224, | |
| "grad_norm": 6.451657772064209, | |
| "learning_rate": 7.000000000000001e-06, | |
| "loss": 0.6894, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0256, | |
| "grad_norm": 8.004484176635742, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 0.6722, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0288, | |
| "grad_norm": 4.263449668884277, | |
| "learning_rate": 9e-06, | |
| "loss": 0.6639, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 6.908220291137695, | |
| "learning_rate": 1e-05, | |
| "loss": 0.6664, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0352, | |
| "grad_norm": 9.965716361999512, | |
| "learning_rate": 1.1000000000000001e-05, | |
| "loss": 0.6425, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.0384, | |
| "grad_norm": 9.22775650024414, | |
| "learning_rate": 1.2e-05, | |
| "loss": 0.5719, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.0416, | |
| "grad_norm": 8.442060470581055, | |
| "learning_rate": 1.3000000000000001e-05, | |
| "loss": 0.5148, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0448, | |
| "grad_norm": 3.89926815032959, | |
| "learning_rate": 1.4000000000000001e-05, | |
| "loss": 0.4995, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 10.715374946594238, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.445, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0512, | |
| "grad_norm": 9.363809585571289, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.5027, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.0544, | |
| "grad_norm": 12.60737419128418, | |
| "learning_rate": 1.7000000000000003e-05, | |
| "loss": 0.3606, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.0576, | |
| "grad_norm": 6.228287696838379, | |
| "learning_rate": 1.8e-05, | |
| "loss": 0.4815, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.0608, | |
| "grad_norm": 21.15777015686035, | |
| "learning_rate": 1.9e-05, | |
| "loss": 0.3848, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 7.5884809494018555, | |
| "learning_rate": 2e-05, | |
| "loss": 0.4474, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0672, | |
| "grad_norm": 14.817816734313965, | |
| "learning_rate": 2.1e-05, | |
| "loss": 0.3291, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.0704, | |
| "grad_norm": 25.30421257019043, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 0.4788, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.0736, | |
| "grad_norm": 5.897189617156982, | |
| "learning_rate": 2.3000000000000003e-05, | |
| "loss": 0.4708, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.0768, | |
| "grad_norm": 5.656806468963623, | |
| "learning_rate": 2.4e-05, | |
| "loss": 0.4214, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 39.75941467285156, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.5151, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0832, | |
| "grad_norm": 9.505982398986816, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 0.5456, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.0864, | |
| "grad_norm": 9.837905883789062, | |
| "learning_rate": 2.7000000000000002e-05, | |
| "loss": 0.3981, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.0896, | |
| "grad_norm": 6.425085544586182, | |
| "learning_rate": 2.8000000000000003e-05, | |
| "loss": 0.3444, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.0928, | |
| "grad_norm": 8.601673126220703, | |
| "learning_rate": 2.9e-05, | |
| "loss": 0.3116, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 24.99056625366211, | |
| "learning_rate": 3e-05, | |
| "loss": 0.4125, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0992, | |
| "grad_norm": 4.368201732635498, | |
| "learning_rate": 3.1e-05, | |
| "loss": 0.2946, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.1024, | |
| "grad_norm": 7.49916934967041, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 0.4568, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.1056, | |
| "grad_norm": 6.2486138343811035, | |
| "learning_rate": 3.3e-05, | |
| "loss": 0.4596, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.1088, | |
| "grad_norm": 5.9687886238098145, | |
| "learning_rate": 3.4000000000000007e-05, | |
| "loss": 0.4197, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 5.545505046844482, | |
| "learning_rate": 3.5e-05, | |
| "loss": 0.3072, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.1152, | |
| "grad_norm": 29.903961181640625, | |
| "learning_rate": 3.6e-05, | |
| "loss": 0.5313, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.1184, | |
| "grad_norm": 7.169201850891113, | |
| "learning_rate": 3.7e-05, | |
| "loss": 0.4665, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.1216, | |
| "grad_norm": 11.079299926757812, | |
| "learning_rate": 3.8e-05, | |
| "loss": 0.5497, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.1248, | |
| "grad_norm": 4.827323913574219, | |
| "learning_rate": 3.9000000000000006e-05, | |
| "loss": 0.4849, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 6.925987720489502, | |
| "learning_rate": 4e-05, | |
| "loss": 0.4411, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.1312, | |
| "grad_norm": 8.159820556640625, | |
| "learning_rate": 4.1e-05, | |
| "loss": 0.4872, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.1344, | |
| "grad_norm": 10.454991340637207, | |
| "learning_rate": 4.2e-05, | |
| "loss": 0.3407, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.1376, | |
| "grad_norm": 7.866086959838867, | |
| "learning_rate": 4.3e-05, | |
| "loss": 0.5081, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.1408, | |
| "grad_norm": 11.918012619018555, | |
| "learning_rate": 4.4000000000000006e-05, | |
| "loss": 0.5015, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 14.668400764465332, | |
| "learning_rate": 4.5e-05, | |
| "loss": 0.3979, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.1472, | |
| "grad_norm": 13.602070808410645, | |
| "learning_rate": 4.600000000000001e-05, | |
| "loss": 0.4356, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.1504, | |
| "grad_norm": 16.491836547851562, | |
| "learning_rate": 4.7e-05, | |
| "loss": 0.5188, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.1536, | |
| "grad_norm": 7.220741271972656, | |
| "learning_rate": 4.8e-05, | |
| "loss": 0.3895, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.1568, | |
| "grad_norm": 11.220433235168457, | |
| "learning_rate": 4.9e-05, | |
| "loss": 0.4704, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 19.75952911376953, | |
| "learning_rate": 5e-05, | |
| "loss": 0.3499, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1632, | |
| "grad_norm": 6.53499174118042, | |
| "learning_rate": 4.994366197183099e-05, | |
| "loss": 0.5274, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.1664, | |
| "grad_norm": 7.6956000328063965, | |
| "learning_rate": 4.9887323943661973e-05, | |
| "loss": 0.3979, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.1696, | |
| "grad_norm": 21.266582489013672, | |
| "learning_rate": 4.983098591549296e-05, | |
| "loss": 0.3423, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.1728, | |
| "grad_norm": 1.1490899324417114, | |
| "learning_rate": 4.9774647887323944e-05, | |
| "loss": 0.3753, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 10.279012680053711, | |
| "learning_rate": 4.971830985915493e-05, | |
| "loss": 0.5932, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.1792, | |
| "grad_norm": 6.127996444702148, | |
| "learning_rate": 4.966197183098592e-05, | |
| "loss": 0.608, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.1824, | |
| "grad_norm": 6.2316718101501465, | |
| "learning_rate": 4.96056338028169e-05, | |
| "loss": 0.4464, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.1856, | |
| "grad_norm": 2.966583251953125, | |
| "learning_rate": 4.954929577464789e-05, | |
| "loss": 0.4239, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.1888, | |
| "grad_norm": 13.743029594421387, | |
| "learning_rate": 4.949295774647887e-05, | |
| "loss": 0.5533, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 18.420978546142578, | |
| "learning_rate": 4.9436619718309864e-05, | |
| "loss": 0.4474, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.1952, | |
| "grad_norm": 7.505041122436523, | |
| "learning_rate": 4.938028169014084e-05, | |
| "loss": 0.443, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.1984, | |
| "grad_norm": 12.293984413146973, | |
| "learning_rate": 4.9323943661971835e-05, | |
| "loss": 0.4537, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.2016, | |
| "grad_norm": 4.876405239105225, | |
| "learning_rate": 4.926760563380282e-05, | |
| "loss": 0.5191, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.2048, | |
| "grad_norm": 8.690363883972168, | |
| "learning_rate": 4.9211267605633806e-05, | |
| "loss": 0.556, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 15.03184700012207, | |
| "learning_rate": 4.915492957746479e-05, | |
| "loss": 0.3694, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.2112, | |
| "grad_norm": 5.66799259185791, | |
| "learning_rate": 4.909859154929578e-05, | |
| "loss": 0.5219, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.2144, | |
| "grad_norm": 12.765690803527832, | |
| "learning_rate": 4.904225352112676e-05, | |
| "loss": 0.4007, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.2176, | |
| "grad_norm": 3.547962188720703, | |
| "learning_rate": 4.898591549295775e-05, | |
| "loss": 0.2905, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.2208, | |
| "grad_norm": 3.7791709899902344, | |
| "learning_rate": 4.8929577464788734e-05, | |
| "loss": 0.4592, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 18.622026443481445, | |
| "learning_rate": 4.887323943661972e-05, | |
| "loss": 0.4226, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.2272, | |
| "grad_norm": 7.643071174621582, | |
| "learning_rate": 4.8816901408450705e-05, | |
| "loss": 0.3244, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.2304, | |
| "grad_norm": 27.778474807739258, | |
| "learning_rate": 4.876056338028169e-05, | |
| "loss": 0.3863, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.2336, | |
| "grad_norm": 4.5000834465026855, | |
| "learning_rate": 4.8704225352112676e-05, | |
| "loss": 0.3431, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.2368, | |
| "grad_norm": 44.82728576660156, | |
| "learning_rate": 4.864788732394366e-05, | |
| "loss": 0.4113, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 29.374372482299805, | |
| "learning_rate": 4.8591549295774653e-05, | |
| "loss": 0.5123, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.2432, | |
| "grad_norm": 5.750948429107666, | |
| "learning_rate": 4.853521126760563e-05, | |
| "loss": 0.3296, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.2464, | |
| "grad_norm": 15.520541191101074, | |
| "learning_rate": 4.8478873239436624e-05, | |
| "loss": 0.3358, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.2496, | |
| "grad_norm": 5.127716541290283, | |
| "learning_rate": 4.84225352112676e-05, | |
| "loss": 0.3469, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.2528, | |
| "grad_norm": 20.350370407104492, | |
| "learning_rate": 4.8366197183098595e-05, | |
| "loss": 0.3067, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 11.152381896972656, | |
| "learning_rate": 4.830985915492958e-05, | |
| "loss": 0.5645, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.2592, | |
| "grad_norm": 19.948450088500977, | |
| "learning_rate": 4.8253521126760566e-05, | |
| "loss": 0.6015, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.2624, | |
| "grad_norm": 7.729649543762207, | |
| "learning_rate": 4.819718309859155e-05, | |
| "loss": 0.3755, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.2656, | |
| "grad_norm": 9.633999824523926, | |
| "learning_rate": 4.814084507042254e-05, | |
| "loss": 0.4016, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.2688, | |
| "grad_norm": 10.421425819396973, | |
| "learning_rate": 4.808450704225352e-05, | |
| "loss": 0.3536, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 7.2195515632629395, | |
| "learning_rate": 4.8028169014084515e-05, | |
| "loss": 0.4654, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2752, | |
| "grad_norm": 17.624547958374023, | |
| "learning_rate": 4.7971830985915494e-05, | |
| "loss": 0.3924, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.2784, | |
| "grad_norm": 12.011979103088379, | |
| "learning_rate": 4.791549295774648e-05, | |
| "loss": 0.3513, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.2816, | |
| "grad_norm": 5.497714519500732, | |
| "learning_rate": 4.7859154929577465e-05, | |
| "loss": 0.3268, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.2848, | |
| "grad_norm": 24.21810531616211, | |
| "learning_rate": 4.780281690140845e-05, | |
| "loss": 0.4526, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 20.337770462036133, | |
| "learning_rate": 4.7746478873239436e-05, | |
| "loss": 0.3238, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.2912, | |
| "grad_norm": 3.2851223945617676, | |
| "learning_rate": 4.769014084507042e-05, | |
| "loss": 0.3168, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.2944, | |
| "grad_norm": 37.14635467529297, | |
| "learning_rate": 4.7633802816901414e-05, | |
| "loss": 0.4194, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.2976, | |
| "grad_norm": 16.439712524414062, | |
| "learning_rate": 4.757746478873239e-05, | |
| "loss": 0.4244, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.3008, | |
| "grad_norm": 20.88243293762207, | |
| "learning_rate": 4.7521126760563385e-05, | |
| "loss": 0.5273, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 7.504245758056641, | |
| "learning_rate": 4.7464788732394363e-05, | |
| "loss": 0.4809, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.3072, | |
| "grad_norm": 4.482902526855469, | |
| "learning_rate": 4.7408450704225356e-05, | |
| "loss": 0.3758, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.3104, | |
| "grad_norm": 7.861649513244629, | |
| "learning_rate": 4.735211267605634e-05, | |
| "loss": 0.2721, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.3136, | |
| "grad_norm": 7.620471000671387, | |
| "learning_rate": 4.729577464788733e-05, | |
| "loss": 0.4302, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.3168, | |
| "grad_norm": 9.727157592773438, | |
| "learning_rate": 4.723943661971831e-05, | |
| "loss": 0.3482, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 20.1031494140625, | |
| "learning_rate": 4.71830985915493e-05, | |
| "loss": 0.3377, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3232, | |
| "grad_norm": 5.509233474731445, | |
| "learning_rate": 4.712676056338028e-05, | |
| "loss": 0.583, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.3264, | |
| "grad_norm": 7.83276891708374, | |
| "learning_rate": 4.707042253521127e-05, | |
| "loss": 0.4061, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.3296, | |
| "grad_norm": 11.982447624206543, | |
| "learning_rate": 4.7014084507042254e-05, | |
| "loss": 0.3287, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.3328, | |
| "grad_norm": 5.372861385345459, | |
| "learning_rate": 4.6957746478873247e-05, | |
| "loss": 0.2898, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 6.437655448913574, | |
| "learning_rate": 4.6901408450704225e-05, | |
| "loss": 0.3084, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3392, | |
| "grad_norm": 14.815783500671387, | |
| "learning_rate": 4.684507042253522e-05, | |
| "loss": 0.444, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.3424, | |
| "grad_norm": 5.138498306274414, | |
| "learning_rate": 4.6788732394366196e-05, | |
| "loss": 0.3657, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.3456, | |
| "grad_norm": 12.453933715820312, | |
| "learning_rate": 4.673239436619719e-05, | |
| "loss": 0.6394, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.3488, | |
| "grad_norm": 12.436538696289062, | |
| "learning_rate": 4.6676056338028174e-05, | |
| "loss": 0.3662, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 8.125226974487305, | |
| "learning_rate": 4.661971830985915e-05, | |
| "loss": 0.486, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3552, | |
| "grad_norm": 12.688572883605957, | |
| "learning_rate": 4.6563380281690145e-05, | |
| "loss": 0.5292, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.3584, | |
| "grad_norm": 8.127309799194336, | |
| "learning_rate": 4.6507042253521124e-05, | |
| "loss": 0.2529, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.3616, | |
| "grad_norm": 14.914011001586914, | |
| "learning_rate": 4.6450704225352116e-05, | |
| "loss": 0.381, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.3648, | |
| "grad_norm": 15.199287414550781, | |
| "learning_rate": 4.63943661971831e-05, | |
| "loss": 0.5236, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 6.843156814575195, | |
| "learning_rate": 4.633802816901409e-05, | |
| "loss": 0.2929, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3712, | |
| "grad_norm": 12.875916481018066, | |
| "learning_rate": 4.628169014084507e-05, | |
| "loss": 0.5681, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.3744, | |
| "grad_norm": 7.395442485809326, | |
| "learning_rate": 4.622535211267606e-05, | |
| "loss": 0.4717, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.3776, | |
| "grad_norm": 5.369324207305908, | |
| "learning_rate": 4.6169014084507044e-05, | |
| "loss": 0.4466, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.3808, | |
| "grad_norm": 5.074844837188721, | |
| "learning_rate": 4.611267605633803e-05, | |
| "loss": 0.4263, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 15.802391052246094, | |
| "learning_rate": 4.6056338028169015e-05, | |
| "loss": 0.351, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.3872, | |
| "grad_norm": 20.11571502685547, | |
| "learning_rate": 4.600000000000001e-05, | |
| "loss": 0.3114, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.3904, | |
| "grad_norm": 20.25322723388672, | |
| "learning_rate": 4.5943661971830986e-05, | |
| "loss": 0.4008, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.3936, | |
| "grad_norm": 4.877046585083008, | |
| "learning_rate": 4.588732394366198e-05, | |
| "loss": 0.6073, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.3968, | |
| "grad_norm": 6.517822742462158, | |
| "learning_rate": 4.5830985915492957e-05, | |
| "loss": 0.4318, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 6.672747611999512, | |
| "learning_rate": 4.577464788732395e-05, | |
| "loss": 0.414, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.4032, | |
| "grad_norm": 4.382776260375977, | |
| "learning_rate": 4.5718309859154934e-05, | |
| "loss": 0.3432, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.4064, | |
| "grad_norm": 9.080897331237793, | |
| "learning_rate": 4.566197183098592e-05, | |
| "loss": 0.4862, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.4096, | |
| "grad_norm": 5.132823944091797, | |
| "learning_rate": 4.5605633802816905e-05, | |
| "loss": 0.4707, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.4128, | |
| "grad_norm": 4.521566867828369, | |
| "learning_rate": 4.554929577464789e-05, | |
| "loss": 0.4951, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 9.381317138671875, | |
| "learning_rate": 4.5492957746478876e-05, | |
| "loss": 0.3341, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.4192, | |
| "grad_norm": 10.4902982711792, | |
| "learning_rate": 4.543661971830986e-05, | |
| "loss": 0.4471, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.4224, | |
| "grad_norm": 5.194609642028809, | |
| "learning_rate": 4.538028169014085e-05, | |
| "loss": 0.4406, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.4256, | |
| "grad_norm": 13.40365982055664, | |
| "learning_rate": 4.532394366197183e-05, | |
| "loss": 0.3805, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.4288, | |
| "grad_norm": 15.255276679992676, | |
| "learning_rate": 4.526760563380282e-05, | |
| "loss": 0.2003, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 13.552937507629395, | |
| "learning_rate": 4.5211267605633804e-05, | |
| "loss": 0.4048, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.4352, | |
| "grad_norm": 24.772096633911133, | |
| "learning_rate": 4.515492957746479e-05, | |
| "loss": 0.586, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.4384, | |
| "grad_norm": 29.092702865600586, | |
| "learning_rate": 4.5098591549295775e-05, | |
| "loss": 0.6368, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.4416, | |
| "grad_norm": 15.915087699890137, | |
| "learning_rate": 4.504225352112677e-05, | |
| "loss": 0.3818, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.4448, | |
| "grad_norm": 14.00623607635498, | |
| "learning_rate": 4.4985915492957746e-05, | |
| "loss": 0.4146, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 9.716373443603516, | |
| "learning_rate": 4.492957746478874e-05, | |
| "loss": 0.3421, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.4512, | |
| "grad_norm": 5.982295989990234, | |
| "learning_rate": 4.487323943661972e-05, | |
| "loss": 0.4263, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.4544, | |
| "grad_norm": 10.845952987670898, | |
| "learning_rate": 4.481690140845071e-05, | |
| "loss": 0.2946, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.4576, | |
| "grad_norm": 17.834733963012695, | |
| "learning_rate": 4.4760563380281695e-05, | |
| "loss": 0.3593, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.4608, | |
| "grad_norm": 7.576904296875, | |
| "learning_rate": 4.470422535211268e-05, | |
| "loss": 0.4492, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 7.559220790863037, | |
| "learning_rate": 4.4647887323943666e-05, | |
| "loss": 0.4127, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.4672, | |
| "grad_norm": 4.112594127655029, | |
| "learning_rate": 4.459154929577465e-05, | |
| "loss": 0.3307, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.4704, | |
| "grad_norm": 2.598599910736084, | |
| "learning_rate": 4.4535211267605637e-05, | |
| "loss": 0.2452, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.4736, | |
| "grad_norm": 5.336888790130615, | |
| "learning_rate": 4.447887323943662e-05, | |
| "loss": 0.4106, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.4768, | |
| "grad_norm": 7.816699028015137, | |
| "learning_rate": 4.442253521126761e-05, | |
| "loss": 0.492, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 14.75847053527832, | |
| "learning_rate": 4.436619718309859e-05, | |
| "loss": 0.4283, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4832, | |
| "grad_norm": 5.2887959480285645, | |
| "learning_rate": 4.430985915492958e-05, | |
| "loss": 0.3547, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.4864, | |
| "grad_norm": 3.928128242492676, | |
| "learning_rate": 4.4253521126760564e-05, | |
| "loss": 0.3868, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.4896, | |
| "grad_norm": 6.465476036071777, | |
| "learning_rate": 4.419718309859155e-05, | |
| "loss": 0.3789, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.4928, | |
| "grad_norm": 5.5618743896484375, | |
| "learning_rate": 4.4140845070422535e-05, | |
| "loss": 0.4552, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 6.504174709320068, | |
| "learning_rate": 4.408450704225353e-05, | |
| "loss": 0.4515, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.4992, | |
| "grad_norm": 11.997910499572754, | |
| "learning_rate": 4.4028169014084506e-05, | |
| "loss": 0.2976, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.5024, | |
| "grad_norm": 5.83452844619751, | |
| "learning_rate": 4.39718309859155e-05, | |
| "loss": 0.3886, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.5056, | |
| "grad_norm": 11.406950950622559, | |
| "learning_rate": 4.391549295774648e-05, | |
| "loss": 0.3743, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.5088, | |
| "grad_norm": 4.557556629180908, | |
| "learning_rate": 4.385915492957747e-05, | |
| "loss": 0.3984, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 10.334356307983398, | |
| "learning_rate": 4.3802816901408455e-05, | |
| "loss": 0.3593, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5152, | |
| "grad_norm": 10.674864768981934, | |
| "learning_rate": 4.374647887323944e-05, | |
| "loss": 0.1878, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.5184, | |
| "grad_norm": 3.705169916152954, | |
| "learning_rate": 4.3690140845070426e-05, | |
| "loss": 0.446, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.5216, | |
| "grad_norm": 5.510721683502197, | |
| "learning_rate": 4.363380281690141e-05, | |
| "loss": 0.4812, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.5248, | |
| "grad_norm": 2.3618953227996826, | |
| "learning_rate": 4.35774647887324e-05, | |
| "loss": 0.4004, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 10.285249710083008, | |
| "learning_rate": 4.352112676056338e-05, | |
| "loss": 0.3904, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.5312, | |
| "grad_norm": 15.25522518157959, | |
| "learning_rate": 4.346478873239437e-05, | |
| "loss": 0.3363, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.5344, | |
| "grad_norm": 10.684788703918457, | |
| "learning_rate": 4.340845070422535e-05, | |
| "loss": 0.5174, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.5376, | |
| "grad_norm": 4.573671340942383, | |
| "learning_rate": 4.335211267605634e-05, | |
| "loss": 0.2947, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.5408, | |
| "grad_norm": 13.247304916381836, | |
| "learning_rate": 4.3295774647887324e-05, | |
| "loss": 0.4169, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 16.0648250579834, | |
| "learning_rate": 4.323943661971831e-05, | |
| "loss": 0.2454, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5472, | |
| "grad_norm": 7.58563232421875, | |
| "learning_rate": 4.3183098591549295e-05, | |
| "loss": 0.4982, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.5504, | |
| "grad_norm": 4.593902587890625, | |
| "learning_rate": 4.312676056338029e-05, | |
| "loss": 0.4422, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.5536, | |
| "grad_norm": 6.4184370040893555, | |
| "learning_rate": 4.3070422535211266e-05, | |
| "loss": 0.3147, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.5568, | |
| "grad_norm": 16.60883140563965, | |
| "learning_rate": 4.301408450704226e-05, | |
| "loss": 0.3153, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 2.526179552078247, | |
| "learning_rate": 4.295774647887324e-05, | |
| "loss": 0.3074, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5632, | |
| "grad_norm": 17.307958602905273, | |
| "learning_rate": 4.290140845070423e-05, | |
| "loss": 0.3178, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.5664, | |
| "grad_norm": 9.892918586730957, | |
| "learning_rate": 4.284507042253521e-05, | |
| "loss": 0.2585, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.5696, | |
| "grad_norm": 11.281522750854492, | |
| "learning_rate": 4.27887323943662e-05, | |
| "loss": 0.2706, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.5728, | |
| "grad_norm": 22.379169464111328, | |
| "learning_rate": 4.2732394366197186e-05, | |
| "loss": 0.4405, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 7.15933084487915, | |
| "learning_rate": 4.267605633802817e-05, | |
| "loss": 0.3592, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5792, | |
| "grad_norm": 6.169369220733643, | |
| "learning_rate": 4.261971830985916e-05, | |
| "loss": 0.3866, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.5824, | |
| "grad_norm": 4.099594593048096, | |
| "learning_rate": 4.256338028169014e-05, | |
| "loss": 0.4223, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.5856, | |
| "grad_norm": 18.78368377685547, | |
| "learning_rate": 4.250704225352113e-05, | |
| "loss": 0.376, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.5888, | |
| "grad_norm": 2.630387306213379, | |
| "learning_rate": 4.2450704225352114e-05, | |
| "loss": 0.2327, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 17.63787269592285, | |
| "learning_rate": 4.23943661971831e-05, | |
| "loss": 0.5726, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.5952, | |
| "grad_norm": 6.631278038024902, | |
| "learning_rate": 4.2338028169014085e-05, | |
| "loss": 0.3042, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.5984, | |
| "grad_norm": 10.951118469238281, | |
| "learning_rate": 4.228169014084507e-05, | |
| "loss": 0.5125, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.6016, | |
| "grad_norm": 8.784004211425781, | |
| "learning_rate": 4.2225352112676056e-05, | |
| "loss": 0.4033, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.6048, | |
| "grad_norm": 4.034893989562988, | |
| "learning_rate": 4.216901408450705e-05, | |
| "loss": 0.3788, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 4.547167778015137, | |
| "learning_rate": 4.211267605633803e-05, | |
| "loss": 0.3065, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6112, | |
| "grad_norm": 8.01904582977295, | |
| "learning_rate": 4.205633802816902e-05, | |
| "loss": 0.3656, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.6144, | |
| "grad_norm": 3.676229953765869, | |
| "learning_rate": 4.2e-05, | |
| "loss": 0.4528, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.6176, | |
| "grad_norm": 14.89476203918457, | |
| "learning_rate": 4.194366197183099e-05, | |
| "loss": 0.3974, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.6208, | |
| "grad_norm": 6.517081260681152, | |
| "learning_rate": 4.188732394366197e-05, | |
| "loss": 0.3502, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 9.692541122436523, | |
| "learning_rate": 4.183098591549296e-05, | |
| "loss": 0.2676, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.6272, | |
| "grad_norm": 2.047581434249878, | |
| "learning_rate": 4.1774647887323946e-05, | |
| "loss": 0.3422, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.6304, | |
| "grad_norm": 0.5576546788215637, | |
| "learning_rate": 4.171830985915493e-05, | |
| "loss": 0.2076, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.6336, | |
| "grad_norm": 3.5656802654266357, | |
| "learning_rate": 4.166197183098592e-05, | |
| "loss": 0.4356, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.6368, | |
| "grad_norm": 1.7690439224243164, | |
| "learning_rate": 4.16056338028169e-05, | |
| "loss": 0.2592, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 21.8055362701416, | |
| "learning_rate": 4.154929577464789e-05, | |
| "loss": 0.4841, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6432, | |
| "grad_norm": 2.1585135459899902, | |
| "learning_rate": 4.149295774647888e-05, | |
| "loss": 0.2427, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.6464, | |
| "grad_norm": 22.61993980407715, | |
| "learning_rate": 4.143661971830986e-05, | |
| "loss": 0.3264, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.6496, | |
| "grad_norm": 3.826843500137329, | |
| "learning_rate": 4.138028169014085e-05, | |
| "loss": 0.3896, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.6528, | |
| "grad_norm": 14.643287658691406, | |
| "learning_rate": 4.132394366197183e-05, | |
| "loss": 0.5588, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 1.5682073831558228, | |
| "learning_rate": 4.126760563380282e-05, | |
| "loss": 0.4475, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6592, | |
| "grad_norm": 11.66586971282959, | |
| "learning_rate": 4.12112676056338e-05, | |
| "loss": 0.3633, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.6624, | |
| "grad_norm": 11.989961624145508, | |
| "learning_rate": 4.115492957746479e-05, | |
| "loss": 0.4635, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.6656, | |
| "grad_norm": 3.4878032207489014, | |
| "learning_rate": 4.109859154929578e-05, | |
| "loss": 0.2872, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.6688, | |
| "grad_norm": 3.756565570831299, | |
| "learning_rate": 4.104225352112676e-05, | |
| "loss": 0.4745, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 6.970531940460205, | |
| "learning_rate": 4.098591549295775e-05, | |
| "loss": 0.3618, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6752, | |
| "grad_norm": 16.83983039855957, | |
| "learning_rate": 4.092957746478873e-05, | |
| "loss": 0.3934, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.6784, | |
| "grad_norm": 11.064716339111328, | |
| "learning_rate": 4.087323943661972e-05, | |
| "loss": 0.2683, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.6816, | |
| "grad_norm": 18.883390426635742, | |
| "learning_rate": 4.081690140845071e-05, | |
| "loss": 0.3461, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.6848, | |
| "grad_norm": 4.280035972595215, | |
| "learning_rate": 4.076056338028169e-05, | |
| "loss": 0.372, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 10.117981910705566, | |
| "learning_rate": 4.070422535211268e-05, | |
| "loss": 0.2614, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6912, | |
| "grad_norm": 11.863015174865723, | |
| "learning_rate": 4.064788732394366e-05, | |
| "loss": 0.3622, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.6944, | |
| "grad_norm": 7.1603875160217285, | |
| "learning_rate": 4.059154929577465e-05, | |
| "loss": 0.278, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.6976, | |
| "grad_norm": 9.962820053100586, | |
| "learning_rate": 4.053521126760564e-05, | |
| "loss": 0.544, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.7008, | |
| "grad_norm": 7.794748306274414, | |
| "learning_rate": 4.047887323943662e-05, | |
| "loss": 0.3095, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 20.568464279174805, | |
| "learning_rate": 4.042253521126761e-05, | |
| "loss": 0.3264, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7072, | |
| "grad_norm": 4.824507236480713, | |
| "learning_rate": 4.036619718309859e-05, | |
| "loss": 0.3093, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.7104, | |
| "grad_norm": 6.159689426422119, | |
| "learning_rate": 4.030985915492958e-05, | |
| "loss": 0.3073, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.7136, | |
| "grad_norm": 22.985084533691406, | |
| "learning_rate": 4.025352112676056e-05, | |
| "loss": 0.2162, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.7168, | |
| "grad_norm": 19.654817581176758, | |
| "learning_rate": 4.0197183098591554e-05, | |
| "loss": 0.6091, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 6.315866947174072, | |
| "learning_rate": 4.014084507042254e-05, | |
| "loss": 0.459, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.7232, | |
| "grad_norm": 7.05145788192749, | |
| "learning_rate": 4.0084507042253525e-05, | |
| "loss": 0.3191, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.7264, | |
| "grad_norm": 11.295578956604004, | |
| "learning_rate": 4.002816901408451e-05, | |
| "loss": 0.3457, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.7296, | |
| "grad_norm": 4.964128494262695, | |
| "learning_rate": 3.9971830985915496e-05, | |
| "loss": 0.3634, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.7328, | |
| "grad_norm": 9.028850555419922, | |
| "learning_rate": 3.991549295774648e-05, | |
| "loss": 0.4049, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 7.955386161804199, | |
| "learning_rate": 3.985915492957747e-05, | |
| "loss": 0.3013, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.7392, | |
| "grad_norm": 9.309741020202637, | |
| "learning_rate": 3.980281690140845e-05, | |
| "loss": 0.3583, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.7424, | |
| "grad_norm": 13.393871307373047, | |
| "learning_rate": 3.974647887323944e-05, | |
| "loss": 0.433, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.7456, | |
| "grad_norm": 13.058290481567383, | |
| "learning_rate": 3.9690140845070424e-05, | |
| "loss": 0.5419, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.7488, | |
| "grad_norm": 5.2141900062561035, | |
| "learning_rate": 3.963380281690141e-05, | |
| "loss": 0.2244, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 10.393515586853027, | |
| "learning_rate": 3.9577464788732395e-05, | |
| "loss": 0.4972, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.7552, | |
| "grad_norm": 2.1989641189575195, | |
| "learning_rate": 3.952112676056338e-05, | |
| "loss": 0.395, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.7584, | |
| "grad_norm": 10.207283973693848, | |
| "learning_rate": 3.946478873239437e-05, | |
| "loss": 0.5946, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.7616, | |
| "grad_norm": 5.437625408172607, | |
| "learning_rate": 3.940845070422535e-05, | |
| "loss": 0.2024, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.7648, | |
| "grad_norm": 5.534853458404541, | |
| "learning_rate": 3.935211267605634e-05, | |
| "loss": 0.3462, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 14.348875045776367, | |
| "learning_rate": 3.929577464788732e-05, | |
| "loss": 0.2925, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7712, | |
| "grad_norm": 1.4454820156097412, | |
| "learning_rate": 3.9239436619718314e-05, | |
| "loss": 0.3436, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.7744, | |
| "grad_norm": 11.598908424377441, | |
| "learning_rate": 3.91830985915493e-05, | |
| "loss": 0.3707, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.7776, | |
| "grad_norm": 23.476207733154297, | |
| "learning_rate": 3.9126760563380285e-05, | |
| "loss": 0.5327, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.7808, | |
| "grad_norm": 11.371722221374512, | |
| "learning_rate": 3.907042253521127e-05, | |
| "loss": 0.4592, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 7.114121437072754, | |
| "learning_rate": 3.9014084507042256e-05, | |
| "loss": 0.2222, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.7872, | |
| "grad_norm": 9.982179641723633, | |
| "learning_rate": 3.895774647887324e-05, | |
| "loss": 0.4403, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.7904, | |
| "grad_norm": 4.670555114746094, | |
| "learning_rate": 3.890140845070423e-05, | |
| "loss": 0.3811, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.7936, | |
| "grad_norm": 4.608165740966797, | |
| "learning_rate": 3.884507042253521e-05, | |
| "loss": 0.2737, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.7968, | |
| "grad_norm": 10.740816116333008, | |
| "learning_rate": 3.87887323943662e-05, | |
| "loss": 0.3186, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 21.781532287597656, | |
| "learning_rate": 3.8732394366197184e-05, | |
| "loss": 0.2801, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8032, | |
| "grad_norm": 18.888141632080078, | |
| "learning_rate": 3.867605633802817e-05, | |
| "loss": 0.2524, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.8064, | |
| "grad_norm": 14.424897193908691, | |
| "learning_rate": 3.8619718309859155e-05, | |
| "loss": 0.4049, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.8096, | |
| "grad_norm": 2.566080093383789, | |
| "learning_rate": 3.856338028169014e-05, | |
| "loss": 0.542, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.8128, | |
| "grad_norm": 20.100793838500977, | |
| "learning_rate": 3.850704225352113e-05, | |
| "loss": 0.331, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 17.21098518371582, | |
| "learning_rate": 3.845070422535211e-05, | |
| "loss": 0.5059, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.8192, | |
| "grad_norm": 26.911142349243164, | |
| "learning_rate": 3.8394366197183104e-05, | |
| "loss": 0.3813, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.8224, | |
| "grad_norm": 8.0260591506958, | |
| "learning_rate": 3.833802816901408e-05, | |
| "loss": 0.3696, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.8256, | |
| "grad_norm": 12.544899940490723, | |
| "learning_rate": 3.8281690140845075e-05, | |
| "loss": 0.2778, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.8288, | |
| "grad_norm": 8.61177921295166, | |
| "learning_rate": 3.822535211267606e-05, | |
| "loss": 0.3647, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 2.9444468021392822, | |
| "learning_rate": 3.8169014084507046e-05, | |
| "loss": 0.3391, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.8352, | |
| "grad_norm": 12.346148490905762, | |
| "learning_rate": 3.811267605633803e-05, | |
| "loss": 0.3134, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.8384, | |
| "grad_norm": 16.827272415161133, | |
| "learning_rate": 3.8056338028169017e-05, | |
| "loss": 0.2674, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.8416, | |
| "grad_norm": 16.586444854736328, | |
| "learning_rate": 3.8e-05, | |
| "loss": 0.2912, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.8448, | |
| "grad_norm": 6.491583347320557, | |
| "learning_rate": 3.794366197183099e-05, | |
| "loss": 0.3261, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 35.861572265625, | |
| "learning_rate": 3.788732394366197e-05, | |
| "loss": 0.2646, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.8512, | |
| "grad_norm": 8.829320907592773, | |
| "learning_rate": 3.783098591549296e-05, | |
| "loss": 0.5009, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.8544, | |
| "grad_norm": 1.658776879310608, | |
| "learning_rate": 3.7774647887323944e-05, | |
| "loss": 0.4636, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.8576, | |
| "grad_norm": 2.2379770278930664, | |
| "learning_rate": 3.771830985915493e-05, | |
| "loss": 0.3915, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.8608, | |
| "grad_norm": 4.984516143798828, | |
| "learning_rate": 3.7661971830985915e-05, | |
| "loss": 0.1507, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 5.562011241912842, | |
| "learning_rate": 3.76056338028169e-05, | |
| "loss": 0.5398, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8672, | |
| "grad_norm": 21.320629119873047, | |
| "learning_rate": 3.754929577464789e-05, | |
| "loss": 0.4201, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.8704, | |
| "grad_norm": 19.99195671081543, | |
| "learning_rate": 3.749295774647887e-05, | |
| "loss": 0.4603, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.8736, | |
| "grad_norm": 3.42061185836792, | |
| "learning_rate": 3.7436619718309864e-05, | |
| "loss": 0.2968, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.8768, | |
| "grad_norm": 27.126548767089844, | |
| "learning_rate": 3.738028169014084e-05, | |
| "loss": 0.3913, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 11.521971702575684, | |
| "learning_rate": 3.7323943661971835e-05, | |
| "loss": 0.2858, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8832, | |
| "grad_norm": 20.222986221313477, | |
| "learning_rate": 3.726760563380282e-05, | |
| "loss": 0.3669, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.8864, | |
| "grad_norm": 1.7271472215652466, | |
| "learning_rate": 3.7211267605633806e-05, | |
| "loss": 0.3193, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.8896, | |
| "grad_norm": 29.57312774658203, | |
| "learning_rate": 3.715492957746479e-05, | |
| "loss": 0.5223, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.8928, | |
| "grad_norm": 6.471129894256592, | |
| "learning_rate": 3.709859154929578e-05, | |
| "loss": 0.3132, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 13.332724571228027, | |
| "learning_rate": 3.704225352112676e-05, | |
| "loss": 0.5883, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8992, | |
| "grad_norm": 16.782337188720703, | |
| "learning_rate": 3.698591549295775e-05, | |
| "loss": 0.3876, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.9024, | |
| "grad_norm": 12.669254302978516, | |
| "learning_rate": 3.692957746478873e-05, | |
| "loss": 0.5085, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.9056, | |
| "grad_norm": 6.967997074127197, | |
| "learning_rate": 3.687323943661972e-05, | |
| "loss": 0.3927, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.9088, | |
| "grad_norm": 11.67349624633789, | |
| "learning_rate": 3.6816901408450704e-05, | |
| "loss": 0.2962, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 9.875104904174805, | |
| "learning_rate": 3.676056338028169e-05, | |
| "loss": 0.3898, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.9152, | |
| "grad_norm": 1.1998872756958008, | |
| "learning_rate": 3.6704225352112675e-05, | |
| "loss": 0.3745, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.9184, | |
| "grad_norm": 6.544206619262695, | |
| "learning_rate": 3.664788732394366e-05, | |
| "loss": 0.2814, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.9216, | |
| "grad_norm": 27.720895767211914, | |
| "learning_rate": 3.659154929577465e-05, | |
| "loss": 0.5197, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.9248, | |
| "grad_norm": 5.2081403732299805, | |
| "learning_rate": 3.653521126760563e-05, | |
| "loss": 0.2418, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 25.75909996032715, | |
| "learning_rate": 3.6478873239436624e-05, | |
| "loss": 0.43, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.9312, | |
| "grad_norm": 11.020965576171875, | |
| "learning_rate": 3.64225352112676e-05, | |
| "loss": 0.2788, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.9344, | |
| "grad_norm": 5.504922866821289, | |
| "learning_rate": 3.6366197183098595e-05, | |
| "loss": 0.2734, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.9376, | |
| "grad_norm": 10.418407440185547, | |
| "learning_rate": 3.630985915492958e-05, | |
| "loss": 0.4713, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.9408, | |
| "grad_norm": 7.805202960968018, | |
| "learning_rate": 3.6253521126760566e-05, | |
| "loss": 0.2821, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 7.880125045776367, | |
| "learning_rate": 3.619718309859155e-05, | |
| "loss": 0.1627, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.9472, | |
| "grad_norm": 33.17704772949219, | |
| "learning_rate": 3.614084507042254e-05, | |
| "loss": 0.4046, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.9504, | |
| "grad_norm": 3.1542086601257324, | |
| "learning_rate": 3.608450704225352e-05, | |
| "loss": 0.3558, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.9536, | |
| "grad_norm": 21.562021255493164, | |
| "learning_rate": 3.602816901408451e-05, | |
| "loss": 0.3107, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.9568, | |
| "grad_norm": 17.724111557006836, | |
| "learning_rate": 3.5971830985915494e-05, | |
| "loss": 0.2855, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 16.4515323638916, | |
| "learning_rate": 3.5915492957746486e-05, | |
| "loss": 0.3282, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.9632, | |
| "grad_norm": 3.998889684677124, | |
| "learning_rate": 3.5859154929577465e-05, | |
| "loss": 0.4798, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.9664, | |
| "grad_norm": 13.89487361907959, | |
| "learning_rate": 3.580281690140846e-05, | |
| "loss": 0.3655, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.9696, | |
| "grad_norm": 12.545125007629395, | |
| "learning_rate": 3.5746478873239436e-05, | |
| "loss": 0.374, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.9728, | |
| "grad_norm": 16.73505210876465, | |
| "learning_rate": 3.569014084507042e-05, | |
| "loss": 0.4029, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 3.9983644485473633, | |
| "learning_rate": 3.5633802816901413e-05, | |
| "loss": 0.1769, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.9792, | |
| "grad_norm": 19.214399337768555, | |
| "learning_rate": 3.557746478873239e-05, | |
| "loss": 0.335, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.9824, | |
| "grad_norm": 27.911151885986328, | |
| "learning_rate": 3.5521126760563384e-05, | |
| "loss": 0.3406, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.9856, | |
| "grad_norm": 8.778318405151367, | |
| "learning_rate": 3.546478873239436e-05, | |
| "loss": 0.4602, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.9888, | |
| "grad_norm": 5.281238555908203, | |
| "learning_rate": 3.5408450704225355e-05, | |
| "loss": 0.3499, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 7.649629592895508, | |
| "learning_rate": 3.5352112676056334e-05, | |
| "loss": 0.2717, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.9952, | |
| "grad_norm": 8.83627986907959, | |
| "learning_rate": 3.5295774647887326e-05, | |
| "loss": 0.3544, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.9984, | |
| "grad_norm": 13.762328147888184, | |
| "learning_rate": 3.523943661971831e-05, | |
| "loss": 0.3039, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.0016, | |
| "grad_norm": 28.637189865112305, | |
| "learning_rate": 3.51830985915493e-05, | |
| "loss": 0.1381, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 1.0048, | |
| "grad_norm": 6.5435791015625, | |
| "learning_rate": 3.512676056338028e-05, | |
| "loss": 0.4339, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 1.008, | |
| "grad_norm": 8.024589538574219, | |
| "learning_rate": 3.507042253521127e-05, | |
| "loss": 0.3516, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.0112, | |
| "grad_norm": 0.35201606154441833, | |
| "learning_rate": 3.5014084507042254e-05, | |
| "loss": 0.232, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 1.0144, | |
| "grad_norm": 0.8222833275794983, | |
| "learning_rate": 3.4957746478873246e-05, | |
| "loss": 0.555, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 1.0176, | |
| "grad_norm": 0.6108925342559814, | |
| "learning_rate": 3.4901408450704225e-05, | |
| "loss": 0.4073, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 1.0208, | |
| "grad_norm": 33.10356521606445, | |
| "learning_rate": 3.484507042253522e-05, | |
| "loss": 0.2513, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": 0.9735102653503418, | |
| "learning_rate": 3.4788732394366196e-05, | |
| "loss": 0.2464, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.0272, | |
| "grad_norm": 28.903047561645508, | |
| "learning_rate": 3.473239436619719e-05, | |
| "loss": 0.2955, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 1.0304, | |
| "grad_norm": 2.8575825691223145, | |
| "learning_rate": 3.4676056338028174e-05, | |
| "loss": 0.1999, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 1.0336, | |
| "grad_norm": 20.94749641418457, | |
| "learning_rate": 3.461971830985916e-05, | |
| "loss": 0.2652, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 1.0368, | |
| "grad_norm": 56.3437385559082, | |
| "learning_rate": 3.4563380281690145e-05, | |
| "loss": 0.2871, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 0.698115885257721, | |
| "learning_rate": 3.450704225352113e-05, | |
| "loss": 0.1267, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.0432, | |
| "grad_norm": 0.10409284383058548, | |
| "learning_rate": 3.4450704225352116e-05, | |
| "loss": 0.3235, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.0464, | |
| "grad_norm": 22.576404571533203, | |
| "learning_rate": 3.4394366197183094e-05, | |
| "loss": 0.3016, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 1.0496, | |
| "grad_norm": 15.452841758728027, | |
| "learning_rate": 3.433802816901409e-05, | |
| "loss": 0.43, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.0528, | |
| "grad_norm": 1.0227371454238892, | |
| "learning_rate": 3.428169014084507e-05, | |
| "loss": 0.261, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": 12.007558822631836, | |
| "learning_rate": 3.422535211267606e-05, | |
| "loss": 0.1861, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.0592, | |
| "grad_norm": 4.2839484214782715, | |
| "learning_rate": 3.416901408450704e-05, | |
| "loss": 0.2487, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 1.0624, | |
| "grad_norm": 31.287580490112305, | |
| "learning_rate": 3.411267605633803e-05, | |
| "loss": 0.2447, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.0656, | |
| "grad_norm": 4.608018398284912, | |
| "learning_rate": 3.4056338028169014e-05, | |
| "loss": 0.3264, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 1.0688, | |
| "grad_norm": 10.261385917663574, | |
| "learning_rate": 3.4000000000000007e-05, | |
| "loss": 0.1514, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.072, | |
| "grad_norm": 28.37779426574707, | |
| "learning_rate": 3.3943661971830985e-05, | |
| "loss": 0.3787, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.0752, | |
| "grad_norm": 4.77971076965332, | |
| "learning_rate": 3.388732394366198e-05, | |
| "loss": 0.2528, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.0784, | |
| "grad_norm": 5.056029319763184, | |
| "learning_rate": 3.3830985915492956e-05, | |
| "loss": 0.1816, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 1.0816, | |
| "grad_norm": 1.7177847623825073, | |
| "learning_rate": 3.377464788732395e-05, | |
| "loss": 0.1776, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.0848, | |
| "grad_norm": 28.095943450927734, | |
| "learning_rate": 3.371830985915493e-05, | |
| "loss": 0.5433, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": 18.927244186401367, | |
| "learning_rate": 3.366197183098592e-05, | |
| "loss": 0.1047, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.0912, | |
| "grad_norm": 13.517168998718262, | |
| "learning_rate": 3.3605633802816905e-05, | |
| "loss": 0.4499, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 1.0944, | |
| "grad_norm": 8.441484451293945, | |
| "learning_rate": 3.354929577464789e-05, | |
| "loss": 0.1514, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.0976, | |
| "grad_norm": 12.520185470581055, | |
| "learning_rate": 3.3492957746478876e-05, | |
| "loss": 0.329, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 1.1008, | |
| "grad_norm": 5.51774263381958, | |
| "learning_rate": 3.343661971830986e-05, | |
| "loss": 0.4714, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.104, | |
| "grad_norm": 0.26915284991264343, | |
| "learning_rate": 3.338028169014085e-05, | |
| "loss": 0.0801, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.1072, | |
| "grad_norm": 8.110269546508789, | |
| "learning_rate": 3.332394366197183e-05, | |
| "loss": 0.2196, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.1104, | |
| "grad_norm": 24.571348190307617, | |
| "learning_rate": 3.326760563380282e-05, | |
| "loss": 0.2151, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 1.1136, | |
| "grad_norm": 4.834783554077148, | |
| "learning_rate": 3.3211267605633804e-05, | |
| "loss": 0.2286, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.1168, | |
| "grad_norm": 0.4705199599266052, | |
| "learning_rate": 3.315492957746479e-05, | |
| "loss": 0.262, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 10.442275047302246, | |
| "learning_rate": 3.3098591549295775e-05, | |
| "loss": 0.1664, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.1232, | |
| "grad_norm": 18.814613342285156, | |
| "learning_rate": 3.304225352112677e-05, | |
| "loss": 0.2115, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 1.1264, | |
| "grad_norm": 0.24957086145877838, | |
| "learning_rate": 3.2985915492957746e-05, | |
| "loss": 0.3388, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.1296, | |
| "grad_norm": 0.16773709654808044, | |
| "learning_rate": 3.292957746478874e-05, | |
| "loss": 0.0763, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 1.1328, | |
| "grad_norm": 13.98716926574707, | |
| "learning_rate": 3.2873239436619717e-05, | |
| "loss": 0.1653, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 0.13978277146816254, | |
| "learning_rate": 3.281690140845071e-05, | |
| "loss": 0.4153, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.1392, | |
| "grad_norm": 1.202662706375122, | |
| "learning_rate": 3.276056338028169e-05, | |
| "loss": 0.1792, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.1424, | |
| "grad_norm": 0.2656092643737793, | |
| "learning_rate": 3.270422535211268e-05, | |
| "loss": 0.1704, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 1.1456, | |
| "grad_norm": 0.1266532689332962, | |
| "learning_rate": 3.2647887323943665e-05, | |
| "loss": 0.1174, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.1488, | |
| "grad_norm": 5.823769569396973, | |
| "learning_rate": 3.259154929577465e-05, | |
| "loss": 0.4392, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": 0.7768305540084839, | |
| "learning_rate": 3.2535211267605636e-05, | |
| "loss": 0.5019, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.1552, | |
| "grad_norm": 4.988490104675293, | |
| "learning_rate": 3.247887323943662e-05, | |
| "loss": 0.1525, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 1.1584, | |
| "grad_norm": 7.760066509246826, | |
| "learning_rate": 3.242253521126761e-05, | |
| "loss": 0.2967, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.1616, | |
| "grad_norm": 0.3713386356830597, | |
| "learning_rate": 3.236619718309859e-05, | |
| "loss": 0.2972, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 1.1648, | |
| "grad_norm": 0.1917293667793274, | |
| "learning_rate": 3.230985915492958e-05, | |
| "loss": 0.1505, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.168, | |
| "grad_norm": 0.31380757689476013, | |
| "learning_rate": 3.2253521126760564e-05, | |
| "loss": 0.3006, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.1712, | |
| "grad_norm": 18.34935188293457, | |
| "learning_rate": 3.219718309859155e-05, | |
| "loss": 0.5165, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.1743999999999999, | |
| "grad_norm": 1.479733943939209, | |
| "learning_rate": 3.2140845070422535e-05, | |
| "loss": 0.104, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 1.1776, | |
| "grad_norm": 29.272424697875977, | |
| "learning_rate": 3.208450704225353e-05, | |
| "loss": 0.1871, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.1808, | |
| "grad_norm": 31.237834930419922, | |
| "learning_rate": 3.2028169014084506e-05, | |
| "loss": 0.3294, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": 0.5057691335678101, | |
| "learning_rate": 3.19718309859155e-05, | |
| "loss": 0.3202, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.1872, | |
| "grad_norm": 35.599571228027344, | |
| "learning_rate": 3.191549295774648e-05, | |
| "loss": 0.5781, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 1.1904, | |
| "grad_norm": 18.676931381225586, | |
| "learning_rate": 3.185915492957747e-05, | |
| "loss": 0.2537, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 1.1936, | |
| "grad_norm": 17.747034072875977, | |
| "learning_rate": 3.180281690140845e-05, | |
| "loss": 0.2176, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 1.1968, | |
| "grad_norm": 20.384511947631836, | |
| "learning_rate": 3.174647887323944e-05, | |
| "loss": 0.3224, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.9572980403900146, | |
| "learning_rate": 3.1690140845070426e-05, | |
| "loss": 0.3297, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.2032, | |
| "grad_norm": 11.518452644348145, | |
| "learning_rate": 3.163380281690141e-05, | |
| "loss": 0.0754, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.2064, | |
| "grad_norm": 21.269893646240234, | |
| "learning_rate": 3.1577464788732397e-05, | |
| "loss": 0.3698, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 1.2096, | |
| "grad_norm": 6.943552494049072, | |
| "learning_rate": 3.152112676056338e-05, | |
| "loss": 0.3419, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.2128, | |
| "grad_norm": 2.637138605117798, | |
| "learning_rate": 3.146478873239437e-05, | |
| "loss": 0.4475, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": 0.2345736175775528, | |
| "learning_rate": 3.140845070422535e-05, | |
| "loss": 0.26, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.2192, | |
| "grad_norm": 23.990619659423828, | |
| "learning_rate": 3.135211267605634e-05, | |
| "loss": 0.4548, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 1.2224, | |
| "grad_norm": 30.698591232299805, | |
| "learning_rate": 3.1295774647887324e-05, | |
| "loss": 0.2941, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.2256, | |
| "grad_norm": 8.31469440460205, | |
| "learning_rate": 3.123943661971831e-05, | |
| "loss": 0.2317, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 1.2288000000000001, | |
| "grad_norm": 0.8447297215461731, | |
| "learning_rate": 3.1183098591549295e-05, | |
| "loss": 0.1829, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.232, | |
| "grad_norm": 1.337220549583435, | |
| "learning_rate": 3.112676056338028e-05, | |
| "loss": 0.1399, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.2352, | |
| "grad_norm": 0.0955493152141571, | |
| "learning_rate": 3.1070422535211266e-05, | |
| "loss": 0.4618, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.2384, | |
| "grad_norm": 0.7334279417991638, | |
| "learning_rate": 3.101408450704226e-05, | |
| "loss": 0.0653, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 1.2416, | |
| "grad_norm": 0.8991394639015198, | |
| "learning_rate": 3.095774647887324e-05, | |
| "loss": 0.6064, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.2448, | |
| "grad_norm": 30.786052703857422, | |
| "learning_rate": 3.090140845070423e-05, | |
| "loss": 0.2257, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": 8.322766304016113, | |
| "learning_rate": 3.084507042253521e-05, | |
| "loss": 0.3758, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.2511999999999999, | |
| "grad_norm": 1.9371285438537598, | |
| "learning_rate": 3.07887323943662e-05, | |
| "loss": 0.1619, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 1.2544, | |
| "grad_norm": 30.936664581298828, | |
| "learning_rate": 3.0732394366197186e-05, | |
| "loss": 0.226, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.2576, | |
| "grad_norm": 18.97284507751465, | |
| "learning_rate": 3.067605633802817e-05, | |
| "loss": 0.2106, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 1.2608, | |
| "grad_norm": 0.17548918724060059, | |
| "learning_rate": 3.061971830985916e-05, | |
| "loss": 0.1827, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.264, | |
| "grad_norm": 53.99245834350586, | |
| "learning_rate": 3.056338028169014e-05, | |
| "loss": 0.1693, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.2671999999999999, | |
| "grad_norm": 35.83251190185547, | |
| "learning_rate": 3.0507042253521128e-05, | |
| "loss": 0.2246, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.2704, | |
| "grad_norm": 0.14614693820476532, | |
| "learning_rate": 3.0450704225352117e-05, | |
| "loss": 0.0399, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 1.2736, | |
| "grad_norm": 0.09585163742303848, | |
| "learning_rate": 3.03943661971831e-05, | |
| "loss": 0.3103, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.2768, | |
| "grad_norm": 0.35296738147735596, | |
| "learning_rate": 3.0338028169014088e-05, | |
| "loss": 0.4458, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 25.493444442749023, | |
| "learning_rate": 3.028169014084507e-05, | |
| "loss": 0.2397, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.2832, | |
| "grad_norm": 21.99680519104004, | |
| "learning_rate": 3.022535211267606e-05, | |
| "loss": 0.3189, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 1.2864, | |
| "grad_norm": 1.8091436624526978, | |
| "learning_rate": 3.016901408450704e-05, | |
| "loss": 0.1739, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.2896, | |
| "grad_norm": 0.42829862236976624, | |
| "learning_rate": 3.011267605633803e-05, | |
| "loss": 0.1976, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 1.2928, | |
| "grad_norm": 0.08488719165325165, | |
| "learning_rate": 3.005633802816902e-05, | |
| "loss": 0.2067, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.296, | |
| "grad_norm": 9.230164527893066, | |
| "learning_rate": 3e-05, | |
| "loss": 0.2783, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.2992, | |
| "grad_norm": 2.309288263320923, | |
| "learning_rate": 2.994366197183099e-05, | |
| "loss": 0.2829, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.3024, | |
| "grad_norm": 38.344730377197266, | |
| "learning_rate": 2.9887323943661972e-05, | |
| "loss": 0.2297, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 1.3056, | |
| "grad_norm": 6.4838337898254395, | |
| "learning_rate": 2.983098591549296e-05, | |
| "loss": 0.4281, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.3088, | |
| "grad_norm": 30.31648826599121, | |
| "learning_rate": 2.9774647887323946e-05, | |
| "loss": 0.1629, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": 2.2914836406707764, | |
| "learning_rate": 2.971830985915493e-05, | |
| "loss": 0.2485, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.3152, | |
| "grad_norm": 0.811667263507843, | |
| "learning_rate": 2.9661971830985917e-05, | |
| "loss": 0.096, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 1.3184, | |
| "grad_norm": 10.037412643432617, | |
| "learning_rate": 2.9605633802816903e-05, | |
| "loss": 0.4057, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.3216, | |
| "grad_norm": 15.876739501953125, | |
| "learning_rate": 2.9549295774647888e-05, | |
| "loss": 0.2493, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 1.3248, | |
| "grad_norm": 0.10714894533157349, | |
| "learning_rate": 2.9492957746478874e-05, | |
| "loss": 0.1766, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.328, | |
| "grad_norm": 0.9898651242256165, | |
| "learning_rate": 2.943661971830986e-05, | |
| "loss": 0.1064, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.3312, | |
| "grad_norm": 17.39281463623047, | |
| "learning_rate": 2.9380281690140848e-05, | |
| "loss": 0.1863, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.3344, | |
| "grad_norm": 4.467952728271484, | |
| "learning_rate": 2.932394366197183e-05, | |
| "loss": 0.3616, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 1.3376000000000001, | |
| "grad_norm": 0.1744953840970993, | |
| "learning_rate": 2.926760563380282e-05, | |
| "loss": 0.3911, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.3408, | |
| "grad_norm": 11.605582237243652, | |
| "learning_rate": 2.92112676056338e-05, | |
| "loss": 0.086, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 6.277223110198975, | |
| "learning_rate": 2.915492957746479e-05, | |
| "loss": 0.1406, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.3472, | |
| "grad_norm": 55.23845672607422, | |
| "learning_rate": 2.909859154929578e-05, | |
| "loss": 0.3971, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 1.3504, | |
| "grad_norm": 14.972460746765137, | |
| "learning_rate": 2.904225352112676e-05, | |
| "loss": 0.2301, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.3536000000000001, | |
| "grad_norm": 0.19094644486904144, | |
| "learning_rate": 2.898591549295775e-05, | |
| "loss": 0.3753, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 1.3568, | |
| "grad_norm": 4.376794815063477, | |
| "learning_rate": 2.8929577464788732e-05, | |
| "loss": 0.1735, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 50.90641403198242, | |
| "learning_rate": 2.887323943661972e-05, | |
| "loss": 0.22, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.3632, | |
| "grad_norm": 10.364707946777344, | |
| "learning_rate": 2.881690140845071e-05, | |
| "loss": 0.1155, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.3664, | |
| "grad_norm": 1.0661869049072266, | |
| "learning_rate": 2.8760563380281692e-05, | |
| "loss": 0.3827, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 1.3696, | |
| "grad_norm": 17.855276107788086, | |
| "learning_rate": 2.870422535211268e-05, | |
| "loss": 0.3932, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.3728, | |
| "grad_norm": 27.09354019165039, | |
| "learning_rate": 2.8647887323943663e-05, | |
| "loss": 0.3695, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": 0.13813486695289612, | |
| "learning_rate": 2.859154929577465e-05, | |
| "loss": 0.2251, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.3792, | |
| "grad_norm": 16.627347946166992, | |
| "learning_rate": 2.8535211267605634e-05, | |
| "loss": 0.3089, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 1.3824, | |
| "grad_norm": 0.3655046820640564, | |
| "learning_rate": 2.847887323943662e-05, | |
| "loss": 0.2134, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.3856, | |
| "grad_norm": 38.9260368347168, | |
| "learning_rate": 2.842253521126761e-05, | |
| "loss": 0.3283, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 1.3888, | |
| "grad_norm": 26.279027938842773, | |
| "learning_rate": 2.836619718309859e-05, | |
| "loss": 0.2663, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.392, | |
| "grad_norm": 49.378902435302734, | |
| "learning_rate": 2.830985915492958e-05, | |
| "loss": 0.2607, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.3952, | |
| "grad_norm": 0.32010015845298767, | |
| "learning_rate": 2.825352112676056e-05, | |
| "loss": 0.423, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.3984, | |
| "grad_norm": 5.871781349182129, | |
| "learning_rate": 2.819718309859155e-05, | |
| "loss": 0.1471, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 1.4016, | |
| "grad_norm": 3.814654588699341, | |
| "learning_rate": 2.814084507042254e-05, | |
| "loss": 0.2127, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.4048, | |
| "grad_norm": 0.4913332164287567, | |
| "learning_rate": 2.808450704225352e-05, | |
| "loss": 0.4049, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": 1.195360779762268, | |
| "learning_rate": 2.802816901408451e-05, | |
| "loss": 0.3488, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.4112, | |
| "grad_norm": 30.266616821289062, | |
| "learning_rate": 2.7971830985915492e-05, | |
| "loss": 0.2471, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 1.4144, | |
| "grad_norm": 0.1005750447511673, | |
| "learning_rate": 2.791549295774648e-05, | |
| "loss": 0.1442, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.4176, | |
| "grad_norm": 19.245065689086914, | |
| "learning_rate": 2.7859154929577463e-05, | |
| "loss": 0.3952, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 1.4208, | |
| "grad_norm": 0.5281161069869995, | |
| "learning_rate": 2.7802816901408452e-05, | |
| "loss": 0.1281, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.424, | |
| "grad_norm": 0.20444443821907043, | |
| "learning_rate": 2.774647887323944e-05, | |
| "loss": 0.3929, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.4272, | |
| "grad_norm": 0.12726615369319916, | |
| "learning_rate": 2.7690140845070423e-05, | |
| "loss": 0.1062, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.4304000000000001, | |
| "grad_norm": 108.31690216064453, | |
| "learning_rate": 2.7633802816901412e-05, | |
| "loss": 0.1228, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 1.4336, | |
| "grad_norm": 25.274940490722656, | |
| "learning_rate": 2.7577464788732394e-05, | |
| "loss": 0.2112, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.4368, | |
| "grad_norm": 7.059344291687012, | |
| "learning_rate": 2.7521126760563383e-05, | |
| "loss": 0.4667, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 24.798084259033203, | |
| "learning_rate": 2.746478873239437e-05, | |
| "loss": 0.3644, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.4432, | |
| "grad_norm": 11.129374504089355, | |
| "learning_rate": 2.7408450704225354e-05, | |
| "loss": 0.4076, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 1.4464000000000001, | |
| "grad_norm": 6.293646335601807, | |
| "learning_rate": 2.735211267605634e-05, | |
| "loss": 0.2754, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.4496, | |
| "grad_norm": 22.136383056640625, | |
| "learning_rate": 2.7295774647887322e-05, | |
| "loss": 0.4665, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.4527999999999999, | |
| "grad_norm": 36.15532684326172, | |
| "learning_rate": 2.723943661971831e-05, | |
| "loss": 0.2789, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.456, | |
| "grad_norm": 58.91614532470703, | |
| "learning_rate": 2.71830985915493e-05, | |
| "loss": 0.2236, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.4592, | |
| "grad_norm": 5.225749492645264, | |
| "learning_rate": 2.712676056338028e-05, | |
| "loss": 0.459, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.4624, | |
| "grad_norm": 11.404582977294922, | |
| "learning_rate": 2.707042253521127e-05, | |
| "loss": 0.2615, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.4656, | |
| "grad_norm": 15.184187889099121, | |
| "learning_rate": 2.7014084507042253e-05, | |
| "loss": 0.242, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.4687999999999999, | |
| "grad_norm": 29.988828659057617, | |
| "learning_rate": 2.695774647887324e-05, | |
| "loss": 0.1761, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": 0.3717154562473297, | |
| "learning_rate": 2.6901408450704224e-05, | |
| "loss": 0.1711, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.4752, | |
| "grad_norm": 0.5243228077888489, | |
| "learning_rate": 2.6845070422535213e-05, | |
| "loss": 0.3551, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.4784, | |
| "grad_norm": 0.14952997863292694, | |
| "learning_rate": 2.67887323943662e-05, | |
| "loss": 0.2119, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.4816, | |
| "grad_norm": 0.5155125856399536, | |
| "learning_rate": 2.6732394366197184e-05, | |
| "loss": 0.1516, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.4848, | |
| "grad_norm": 16.079330444335938, | |
| "learning_rate": 2.6676056338028172e-05, | |
| "loss": 0.3477, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.488, | |
| "grad_norm": 0.5251998901367188, | |
| "learning_rate": 2.6619718309859155e-05, | |
| "loss": 0.3483, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.4912, | |
| "grad_norm": 12.90518569946289, | |
| "learning_rate": 2.6563380281690143e-05, | |
| "loss": 0.3346, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.4944, | |
| "grad_norm": 3.163393259048462, | |
| "learning_rate": 2.650704225352113e-05, | |
| "loss": 0.4457, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 1.4976, | |
| "grad_norm": 0.8409318327903748, | |
| "learning_rate": 2.6450704225352114e-05, | |
| "loss": 0.1024, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.5008, | |
| "grad_norm": 0.23881012201309204, | |
| "learning_rate": 2.63943661971831e-05, | |
| "loss": 0.2065, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": 31.078039169311523, | |
| "learning_rate": 2.6338028169014085e-05, | |
| "loss": 0.2194, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.5072, | |
| "grad_norm": 0.28144362568855286, | |
| "learning_rate": 2.628169014084507e-05, | |
| "loss": 0.0813, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 1.5104, | |
| "grad_norm": 10.667701721191406, | |
| "learning_rate": 2.6225352112676056e-05, | |
| "loss": 0.3627, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.5135999999999998, | |
| "grad_norm": 0.4722766578197479, | |
| "learning_rate": 2.6169014084507042e-05, | |
| "loss": 0.3747, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 1.5168, | |
| "grad_norm": 13.2311429977417, | |
| "learning_rate": 2.611267605633803e-05, | |
| "loss": 0.1583, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.4900763034820557, | |
| "learning_rate": 2.6056338028169013e-05, | |
| "loss": 0.2564, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.5232, | |
| "grad_norm": 32.169681549072266, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 0.2757, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.5264, | |
| "grad_norm": 25.864120483398438, | |
| "learning_rate": 2.5943661971830984e-05, | |
| "loss": 0.2264, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 1.5295999999999998, | |
| "grad_norm": 5.702986717224121, | |
| "learning_rate": 2.5887323943661973e-05, | |
| "loss": 0.1946, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.5328, | |
| "grad_norm": 0.28651973605155945, | |
| "learning_rate": 2.583098591549296e-05, | |
| "loss": 0.0838, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 1.536, | |
| "grad_norm": 0.4322168529033661, | |
| "learning_rate": 2.5774647887323944e-05, | |
| "loss": 0.2331, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.5392000000000001, | |
| "grad_norm": 0.17899250984191895, | |
| "learning_rate": 2.5718309859154933e-05, | |
| "loss": 0.2475, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 1.5424, | |
| "grad_norm": 30.139265060424805, | |
| "learning_rate": 2.5661971830985915e-05, | |
| "loss": 0.4416, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.5455999999999999, | |
| "grad_norm": 0.23452678322792053, | |
| "learning_rate": 2.5605633802816904e-05, | |
| "loss": 0.2825, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.5488, | |
| "grad_norm": 0.22751548886299133, | |
| "learning_rate": 2.5549295774647893e-05, | |
| "loss": 0.2478, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.552, | |
| "grad_norm": 6.262718200683594, | |
| "learning_rate": 2.5492957746478875e-05, | |
| "loss": 0.2214, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.5552000000000001, | |
| "grad_norm": 0.2855672836303711, | |
| "learning_rate": 2.5436619718309864e-05, | |
| "loss": 0.1745, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.5584, | |
| "grad_norm": 4.517999649047852, | |
| "learning_rate": 2.5380281690140846e-05, | |
| "loss": 0.2678, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.5615999999999999, | |
| "grad_norm": 31.07318115234375, | |
| "learning_rate": 2.5323943661971835e-05, | |
| "loss": 0.2528, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.5648, | |
| "grad_norm": 1.6451767683029175, | |
| "learning_rate": 2.5267605633802817e-05, | |
| "loss": 0.1002, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.568, | |
| "grad_norm": 32.15398406982422, | |
| "learning_rate": 2.5211267605633802e-05, | |
| "loss": 0.3257, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.5712000000000002, | |
| "grad_norm": 7.450695037841797, | |
| "learning_rate": 2.515492957746479e-05, | |
| "loss": 0.2022, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.5744, | |
| "grad_norm": 14.619000434875488, | |
| "learning_rate": 2.5098591549295773e-05, | |
| "loss": 0.346, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.5776, | |
| "grad_norm": 0.7622524499893188, | |
| "learning_rate": 2.5042253521126762e-05, | |
| "loss": 0.1638, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.5808, | |
| "grad_norm": 2.6016695499420166, | |
| "learning_rate": 2.4985915492957748e-05, | |
| "loss": 0.0822, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.584, | |
| "grad_norm": 2.1474409103393555, | |
| "learning_rate": 2.4929577464788733e-05, | |
| "loss": 0.1602, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.5872000000000002, | |
| "grad_norm": 65.45417785644531, | |
| "learning_rate": 2.487323943661972e-05, | |
| "loss": 0.1883, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.5904, | |
| "grad_norm": 13.360310554504395, | |
| "learning_rate": 2.4816901408450704e-05, | |
| "loss": 0.2619, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.5936, | |
| "grad_norm": 0.6584329009056091, | |
| "learning_rate": 2.476056338028169e-05, | |
| "loss": 0.2612, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.5968, | |
| "grad_norm": 54.88881301879883, | |
| "learning_rate": 2.470422535211268e-05, | |
| "loss": 0.6558, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.08469274640083313, | |
| "learning_rate": 2.4647887323943664e-05, | |
| "loss": 0.2935, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.6032, | |
| "grad_norm": 38.17769241333008, | |
| "learning_rate": 2.459154929577465e-05, | |
| "loss": 0.2391, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 1.6064, | |
| "grad_norm": 0.1355709284543991, | |
| "learning_rate": 2.4535211267605635e-05, | |
| "loss": 0.1574, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 1.6096, | |
| "grad_norm": 7.013975143432617, | |
| "learning_rate": 2.447887323943662e-05, | |
| "loss": 0.1076, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 1.6128, | |
| "grad_norm": 13.909317970275879, | |
| "learning_rate": 2.442253521126761e-05, | |
| "loss": 0.4274, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 1.616, | |
| "grad_norm": 4.903537273406982, | |
| "learning_rate": 2.4366197183098595e-05, | |
| "loss": 0.2527, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.6192, | |
| "grad_norm": 9.500699996948242, | |
| "learning_rate": 2.430985915492958e-05, | |
| "loss": 0.4478, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 1.6223999999999998, | |
| "grad_norm": 47.62290954589844, | |
| "learning_rate": 2.4253521126760566e-05, | |
| "loss": 0.2439, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 1.6256, | |
| "grad_norm": 0.21192322671413422, | |
| "learning_rate": 2.419718309859155e-05, | |
| "loss": 0.2524, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 1.6288, | |
| "grad_norm": 3.06548810005188, | |
| "learning_rate": 2.4140845070422537e-05, | |
| "loss": 0.1995, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 36.12741470336914, | |
| "learning_rate": 2.4084507042253522e-05, | |
| "loss": 0.2553, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.6352, | |
| "grad_norm": 9.318374633789062, | |
| "learning_rate": 2.4028169014084508e-05, | |
| "loss": 0.2452, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 1.6383999999999999, | |
| "grad_norm": 27.07297134399414, | |
| "learning_rate": 2.3971830985915493e-05, | |
| "loss": 0.4859, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 1.6416, | |
| "grad_norm": 17.92713165283203, | |
| "learning_rate": 2.391549295774648e-05, | |
| "loss": 0.2726, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 1.6448, | |
| "grad_norm": 20.595443725585938, | |
| "learning_rate": 2.3859154929577464e-05, | |
| "loss": 0.4673, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 15.670424461364746, | |
| "learning_rate": 2.380281690140845e-05, | |
| "loss": 0.131, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.6512, | |
| "grad_norm": 0.27188238501548767, | |
| "learning_rate": 2.374647887323944e-05, | |
| "loss": 0.0991, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 1.6543999999999999, | |
| "grad_norm": 0.1418936550617218, | |
| "learning_rate": 2.3690140845070424e-05, | |
| "loss": 0.3724, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 1.6576, | |
| "grad_norm": 14.037035942077637, | |
| "learning_rate": 2.363380281690141e-05, | |
| "loss": 0.3105, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 1.6608, | |
| "grad_norm": 1.6368416547775269, | |
| "learning_rate": 2.3577464788732395e-05, | |
| "loss": 0.174, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 16.241477966308594, | |
| "learning_rate": 2.352112676056338e-05, | |
| "loss": 0.2609, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.6672, | |
| "grad_norm": 0.27356526255607605, | |
| "learning_rate": 2.3464788732394366e-05, | |
| "loss": 0.1827, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 1.6703999999999999, | |
| "grad_norm": 35.08028030395508, | |
| "learning_rate": 2.3408450704225355e-05, | |
| "loss": 0.2963, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 1.6736, | |
| "grad_norm": 0.12633004784584045, | |
| "learning_rate": 2.335211267605634e-05, | |
| "loss": 0.1284, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 1.6768, | |
| "grad_norm": 10.867715835571289, | |
| "learning_rate": 2.3295774647887326e-05, | |
| "loss": 0.3947, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 0.37642917037010193, | |
| "learning_rate": 2.323943661971831e-05, | |
| "loss": 0.0878, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.6832, | |
| "grad_norm": 9.886677742004395, | |
| "learning_rate": 2.3183098591549297e-05, | |
| "loss": 0.1667, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 1.6864, | |
| "grad_norm": 0.5025785565376282, | |
| "learning_rate": 2.3126760563380283e-05, | |
| "loss": 0.2399, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 1.6896, | |
| "grad_norm": 0.07013744115829468, | |
| "learning_rate": 2.3070422535211268e-05, | |
| "loss": 0.2592, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 1.6928, | |
| "grad_norm": 9.038287162780762, | |
| "learning_rate": 2.3014084507042254e-05, | |
| "loss": 0.3243, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 1.696, | |
| "grad_norm": 0.15734457969665527, | |
| "learning_rate": 2.295774647887324e-05, | |
| "loss": 0.0506, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.6992, | |
| "grad_norm": 0.21506910026073456, | |
| "learning_rate": 2.2901408450704225e-05, | |
| "loss": 0.2006, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 1.7024, | |
| "grad_norm": 11.207597732543945, | |
| "learning_rate": 2.284507042253521e-05, | |
| "loss": 0.2125, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 1.7056, | |
| "grad_norm": 7.165248394012451, | |
| "learning_rate": 2.27887323943662e-05, | |
| "loss": 0.1971, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 1.7088, | |
| "grad_norm": 0.8289473056793213, | |
| "learning_rate": 2.2732394366197185e-05, | |
| "loss": 0.1824, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 1.712, | |
| "grad_norm": 1.2633789777755737, | |
| "learning_rate": 2.267605633802817e-05, | |
| "loss": 0.2601, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.7151999999999998, | |
| "grad_norm": 38.94256591796875, | |
| "learning_rate": 2.2619718309859156e-05, | |
| "loss": 0.345, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 1.7184, | |
| "grad_norm": 0.10120674222707748, | |
| "learning_rate": 2.256338028169014e-05, | |
| "loss": 0.1882, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 1.7216, | |
| "grad_norm": 17.41254425048828, | |
| "learning_rate": 2.2507042253521127e-05, | |
| "loss": 0.1648, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 1.7248, | |
| "grad_norm": 0.33858543634414673, | |
| "learning_rate": 2.2450704225352115e-05, | |
| "loss": 0.3624, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 1.728, | |
| "grad_norm": 0.3513981103897095, | |
| "learning_rate": 2.23943661971831e-05, | |
| "loss": 0.3238, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.7311999999999999, | |
| "grad_norm": 0.7570049166679382, | |
| "learning_rate": 2.2338028169014086e-05, | |
| "loss": 0.3377, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 1.7344, | |
| "grad_norm": 0.7027788162231445, | |
| "learning_rate": 2.2281690140845072e-05, | |
| "loss": 0.1963, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 1.7376, | |
| "grad_norm": 55.278343200683594, | |
| "learning_rate": 2.2225352112676057e-05, | |
| "loss": 0.4997, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 1.7408000000000001, | |
| "grad_norm": 2.759753704071045, | |
| "learning_rate": 2.2169014084507043e-05, | |
| "loss": 0.2161, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 1.744, | |
| "grad_norm": 13.195887565612793, | |
| "learning_rate": 2.2112676056338032e-05, | |
| "loss": 0.2539, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.7471999999999999, | |
| "grad_norm": 12.78817081451416, | |
| "learning_rate": 2.2056338028169017e-05, | |
| "loss": 0.2056, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 1.7504, | |
| "grad_norm": 40.1257209777832, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 0.2951, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 1.7536, | |
| "grad_norm": 0.3393701910972595, | |
| "learning_rate": 2.1943661971830985e-05, | |
| "loss": 0.2428, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 1.7568000000000001, | |
| "grad_norm": 13.551216125488281, | |
| "learning_rate": 2.188732394366197e-05, | |
| "loss": 0.3264, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 41.21603012084961, | |
| "learning_rate": 2.1830985915492956e-05, | |
| "loss": 0.4246, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.7631999999999999, | |
| "grad_norm": 9.464485168457031, | |
| "learning_rate": 2.1774647887323945e-05, | |
| "loss": 0.1579, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 1.7664, | |
| "grad_norm": 45.843814849853516, | |
| "learning_rate": 2.171830985915493e-05, | |
| "loss": 0.1919, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 1.7696, | |
| "grad_norm": 1.6334397792816162, | |
| "learning_rate": 2.1661971830985916e-05, | |
| "loss": 0.2822, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 1.7728000000000002, | |
| "grad_norm": 0.7097220420837402, | |
| "learning_rate": 2.16056338028169e-05, | |
| "loss": 0.1146, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 1.776, | |
| "grad_norm": 11.706197738647461, | |
| "learning_rate": 2.1549295774647887e-05, | |
| "loss": 0.2456, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.7792, | |
| "grad_norm": 0.8858042359352112, | |
| "learning_rate": 2.1492957746478876e-05, | |
| "loss": 0.461, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 1.7824, | |
| "grad_norm": 2.2900185585021973, | |
| "learning_rate": 2.143661971830986e-05, | |
| "loss": 0.0824, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 1.7856, | |
| "grad_norm": 1.9041435718536377, | |
| "learning_rate": 2.1380281690140847e-05, | |
| "loss": 0.1843, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 1.7888, | |
| "grad_norm": 9.106405258178711, | |
| "learning_rate": 2.1323943661971832e-05, | |
| "loss": 0.1421, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 1.792, | |
| "grad_norm": 19.528039932250977, | |
| "learning_rate": 2.1267605633802818e-05, | |
| "loss": 0.3623, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.7952, | |
| "grad_norm": 0.16566412150859833, | |
| "learning_rate": 2.1211267605633803e-05, | |
| "loss": 0.4338, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 1.7984, | |
| "grad_norm": 0.8347293138504028, | |
| "learning_rate": 2.1154929577464792e-05, | |
| "loss": 0.2157, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 1.8016, | |
| "grad_norm": 22.129648208618164, | |
| "learning_rate": 2.1098591549295778e-05, | |
| "loss": 0.2917, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 1.8048, | |
| "grad_norm": 0.25225210189819336, | |
| "learning_rate": 2.1042253521126763e-05, | |
| "loss": 0.2006, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 1.808, | |
| "grad_norm": 0.3600423336029053, | |
| "learning_rate": 2.098591549295775e-05, | |
| "loss": 0.3905, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.8112, | |
| "grad_norm": 22.235361099243164, | |
| "learning_rate": 2.0929577464788734e-05, | |
| "loss": 0.2482, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 1.8144, | |
| "grad_norm": 9.403947830200195, | |
| "learning_rate": 2.087323943661972e-05, | |
| "loss": 0.2406, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 1.8176, | |
| "grad_norm": 1.0296498537063599, | |
| "learning_rate": 2.0816901408450705e-05, | |
| "loss": 0.1626, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 1.8208, | |
| "grad_norm": 25.019081115722656, | |
| "learning_rate": 2.076056338028169e-05, | |
| "loss": 0.1861, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 6.003271579742432, | |
| "learning_rate": 2.0704225352112676e-05, | |
| "loss": 0.4198, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.8272, | |
| "grad_norm": 0.24664323031902313, | |
| "learning_rate": 2.064788732394366e-05, | |
| "loss": 0.2311, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 1.8304, | |
| "grad_norm": 0.3681061863899231, | |
| "learning_rate": 2.0591549295774647e-05, | |
| "loss": 0.3282, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 1.8336000000000001, | |
| "grad_norm": 1.1691765785217285, | |
| "learning_rate": 2.0535211267605633e-05, | |
| "loss": 0.1885, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 1.8368, | |
| "grad_norm": 43.80043029785156, | |
| "learning_rate": 2.047887323943662e-05, | |
| "loss": 0.2497, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 22.248729705810547, | |
| "learning_rate": 2.0422535211267607e-05, | |
| "loss": 0.1009, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.8432, | |
| "grad_norm": 0.1887352615594864, | |
| "learning_rate": 2.0366197183098592e-05, | |
| "loss": 0.2099, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 1.8464, | |
| "grad_norm": 39.37889862060547, | |
| "learning_rate": 2.0309859154929578e-05, | |
| "loss": 0.2349, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 1.8496000000000001, | |
| "grad_norm": 2.4076569080352783, | |
| "learning_rate": 2.0253521126760563e-05, | |
| "loss": 0.3262, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 1.8528, | |
| "grad_norm": 0.38582533597946167, | |
| "learning_rate": 2.019718309859155e-05, | |
| "loss": 0.0311, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 0.12109358608722687, | |
| "learning_rate": 2.0140845070422538e-05, | |
| "loss": 0.311, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.8592, | |
| "grad_norm": 6.223245620727539, | |
| "learning_rate": 2.0084507042253523e-05, | |
| "loss": 0.4537, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 1.8624, | |
| "grad_norm": 0.16367606818675995, | |
| "learning_rate": 2.002816901408451e-05, | |
| "loss": 0.3076, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 1.8656000000000001, | |
| "grad_norm": 30.715484619140625, | |
| "learning_rate": 1.9971830985915494e-05, | |
| "loss": 0.346, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 1.8688, | |
| "grad_norm": 42.86243438720703, | |
| "learning_rate": 1.991549295774648e-05, | |
| "loss": 0.0792, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 15.401847839355469, | |
| "learning_rate": 1.9859154929577465e-05, | |
| "loss": 0.2927, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.8752, | |
| "grad_norm": 0.9566125273704529, | |
| "learning_rate": 1.980281690140845e-05, | |
| "loss": 0.2851, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 1.8784, | |
| "grad_norm": 11.325541496276855, | |
| "learning_rate": 1.9746478873239436e-05, | |
| "loss": 0.4101, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 1.8816000000000002, | |
| "grad_norm": 0.608905553817749, | |
| "learning_rate": 1.9690140845070422e-05, | |
| "loss": 0.0911, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 1.8848, | |
| "grad_norm": 14.199214935302734, | |
| "learning_rate": 1.9633802816901407e-05, | |
| "loss": 0.4375, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 1.888, | |
| "grad_norm": 20.619394302368164, | |
| "learning_rate": 1.9577464788732393e-05, | |
| "loss": 0.3372, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.8912, | |
| "grad_norm": 11.778953552246094, | |
| "learning_rate": 1.9521126760563382e-05, | |
| "loss": 0.2411, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 1.8944, | |
| "grad_norm": 111.18775939941406, | |
| "learning_rate": 1.9464788732394367e-05, | |
| "loss": 0.239, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 1.8976, | |
| "grad_norm": 0.6485037207603455, | |
| "learning_rate": 1.9408450704225353e-05, | |
| "loss": 0.1002, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 1.9008, | |
| "grad_norm": 51.51342010498047, | |
| "learning_rate": 1.9352112676056338e-05, | |
| "loss": 0.3721, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 1.904, | |
| "grad_norm": 9.155081748962402, | |
| "learning_rate": 1.9295774647887324e-05, | |
| "loss": 0.2457, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.9072, | |
| "grad_norm": 31.439834594726562, | |
| "learning_rate": 1.923943661971831e-05, | |
| "loss": 0.3378, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 1.9104, | |
| "grad_norm": 38.59767532348633, | |
| "learning_rate": 1.9183098591549298e-05, | |
| "loss": 0.0842, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 1.9136, | |
| "grad_norm": 30.20688819885254, | |
| "learning_rate": 1.9126760563380284e-05, | |
| "loss": 0.1471, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 1.9167999999999998, | |
| "grad_norm": 76.58573150634766, | |
| "learning_rate": 1.907042253521127e-05, | |
| "loss": 0.2427, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 0.44934359192848206, | |
| "learning_rate": 1.9014084507042255e-05, | |
| "loss": 0.2053, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.9232, | |
| "grad_norm": 12.041892051696777, | |
| "learning_rate": 1.895774647887324e-05, | |
| "loss": 0.235, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 1.9264000000000001, | |
| "grad_norm": 0.10604248195886612, | |
| "learning_rate": 1.8901408450704226e-05, | |
| "loss": 0.2931, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 1.9296, | |
| "grad_norm": 52.336849212646484, | |
| "learning_rate": 1.8845070422535215e-05, | |
| "loss": 0.46, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 1.9327999999999999, | |
| "grad_norm": 0.30715158581733704, | |
| "learning_rate": 1.87887323943662e-05, | |
| "loss": 0.086, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 1.936, | |
| "grad_norm": 12.268070220947266, | |
| "learning_rate": 1.8732394366197186e-05, | |
| "loss": 0.3663, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.9392, | |
| "grad_norm": 0.7918230891227722, | |
| "learning_rate": 1.867605633802817e-05, | |
| "loss": 0.2688, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 1.9424000000000001, | |
| "grad_norm": 0.276155024766922, | |
| "learning_rate": 1.8619718309859157e-05, | |
| "loss": 0.2618, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 1.9456, | |
| "grad_norm": 0.6748977899551392, | |
| "learning_rate": 1.8563380281690142e-05, | |
| "loss": 0.0704, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 1.9487999999999999, | |
| "grad_norm": 27.20131492614746, | |
| "learning_rate": 1.8507042253521128e-05, | |
| "loss": 0.4101, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 1.952, | |
| "grad_norm": 0.1357765942811966, | |
| "learning_rate": 1.8450704225352113e-05, | |
| "loss": 0.354, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.9552, | |
| "grad_norm": 21.598268508911133, | |
| "learning_rate": 1.83943661971831e-05, | |
| "loss": 0.2213, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 1.9584000000000001, | |
| "grad_norm": 0.2529258728027344, | |
| "learning_rate": 1.8338028169014084e-05, | |
| "loss": 0.3205, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 1.9616, | |
| "grad_norm": 2.0377094745635986, | |
| "learning_rate": 1.828169014084507e-05, | |
| "loss": 0.2115, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 1.9647999999999999, | |
| "grad_norm": 9.584620475769043, | |
| "learning_rate": 1.822535211267606e-05, | |
| "loss": 0.4502, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 1.968, | |
| "grad_norm": 0.26572760939598083, | |
| "learning_rate": 1.8169014084507044e-05, | |
| "loss": 0.4565, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.9712, | |
| "grad_norm": 0.4592236876487732, | |
| "learning_rate": 1.811267605633803e-05, | |
| "loss": 0.1688, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 1.9744000000000002, | |
| "grad_norm": 2.714552879333496, | |
| "learning_rate": 1.8056338028169015e-05, | |
| "loss": 0.1334, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 1.9776, | |
| "grad_norm": 1.833774209022522, | |
| "learning_rate": 1.8e-05, | |
| "loss": 0.2703, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 1.9808, | |
| "grad_norm": 0.23094186186790466, | |
| "learning_rate": 1.7943661971830986e-05, | |
| "loss": 0.2316, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 1.984, | |
| "grad_norm": 2.593341112136841, | |
| "learning_rate": 1.7887323943661975e-05, | |
| "loss": 0.1416, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.9872, | |
| "grad_norm": 63.741641998291016, | |
| "learning_rate": 1.783098591549296e-05, | |
| "loss": 0.1999, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 1.9904, | |
| "grad_norm": 59.86637878417969, | |
| "learning_rate": 1.7774647887323946e-05, | |
| "loss": 0.7514, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 1.9936, | |
| "grad_norm": 1.3984020948410034, | |
| "learning_rate": 1.771830985915493e-05, | |
| "loss": 0.3885, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 1.9968, | |
| "grad_norm": 25.51970863342285, | |
| "learning_rate": 1.7661971830985917e-05, | |
| "loss": 0.1954, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.26810941100120544, | |
| "learning_rate": 1.7605633802816902e-05, | |
| "loss": 0.2831, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.0032, | |
| "grad_norm": 25.82304573059082, | |
| "learning_rate": 1.7549295774647888e-05, | |
| "loss": 0.1234, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 2.0064, | |
| "grad_norm": 0.15285342931747437, | |
| "learning_rate": 1.7492957746478873e-05, | |
| "loss": 0.1264, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 2.0096, | |
| "grad_norm": 0.1930648535490036, | |
| "learning_rate": 1.743661971830986e-05, | |
| "loss": 0.1633, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 2.0128, | |
| "grad_norm": 10.310894966125488, | |
| "learning_rate": 1.7380281690140844e-05, | |
| "loss": 0.0062, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 2.016, | |
| "grad_norm": 0.060901541262865067, | |
| "learning_rate": 1.732394366197183e-05, | |
| "loss": 0.0814, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.0192, | |
| "grad_norm": 0.14364077150821686, | |
| "learning_rate": 1.7267605633802815e-05, | |
| "loss": 0.0068, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 2.0224, | |
| "grad_norm": 0.5780632495880127, | |
| "learning_rate": 1.7211267605633804e-05, | |
| "loss": 0.0471, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 2.0256, | |
| "grad_norm": 0.5313758850097656, | |
| "learning_rate": 1.715492957746479e-05, | |
| "loss": 0.0619, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 2.0288, | |
| "grad_norm": 0.028105057775974274, | |
| "learning_rate": 1.7098591549295775e-05, | |
| "loss": 0.009, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 2.032, | |
| "grad_norm": 1.1972317695617676, | |
| "learning_rate": 1.704225352112676e-05, | |
| "loss": 0.0622, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.0352, | |
| "grad_norm": 0.027558835223317146, | |
| "learning_rate": 1.6985915492957746e-05, | |
| "loss": 0.2316, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 2.0384, | |
| "grad_norm": 0.04284098371863365, | |
| "learning_rate": 1.6929577464788735e-05, | |
| "loss": 0.0582, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 2.0416, | |
| "grad_norm": 0.1924617737531662, | |
| "learning_rate": 1.687323943661972e-05, | |
| "loss": 0.0691, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 2.0448, | |
| "grad_norm": 0.036435432732105255, | |
| "learning_rate": 1.6816901408450706e-05, | |
| "loss": 0.1442, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 2.048, | |
| "grad_norm": 0.8796645402908325, | |
| "learning_rate": 1.676056338028169e-05, | |
| "loss": 0.0757, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.0512, | |
| "grad_norm": 0.6916587352752686, | |
| "learning_rate": 1.6704225352112677e-05, | |
| "loss": 0.1356, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 2.0544, | |
| "grad_norm": 0.10934862494468689, | |
| "learning_rate": 1.6647887323943663e-05, | |
| "loss": 0.211, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 2.0576, | |
| "grad_norm": 0.03238527849316597, | |
| "learning_rate": 1.659154929577465e-05, | |
| "loss": 0.0556, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 2.0608, | |
| "grad_norm": 0.25189611315727234, | |
| "learning_rate": 1.6535211267605634e-05, | |
| "loss": 0.0208, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 2.064, | |
| "grad_norm": 0.08050217479467392, | |
| "learning_rate": 1.647887323943662e-05, | |
| "loss": 0.0021, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 2.0672, | |
| "grad_norm": 0.045152150094509125, | |
| "learning_rate": 1.6422535211267605e-05, | |
| "loss": 0.0743, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 2.0704, | |
| "grad_norm": 0.036941394209861755, | |
| "learning_rate": 1.636619718309859e-05, | |
| "loss": 0.1484, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 2.0736, | |
| "grad_norm": 0.024720242246985435, | |
| "learning_rate": 1.6309859154929576e-05, | |
| "loss": 0.1229, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 2.0768, | |
| "grad_norm": 0.033186838030815125, | |
| "learning_rate": 1.6253521126760565e-05, | |
| "loss": 0.1691, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 0.04443328082561493, | |
| "learning_rate": 1.619718309859155e-05, | |
| "loss": 0.1783, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.0832, | |
| "grad_norm": 134.47421264648438, | |
| "learning_rate": 1.6140845070422536e-05, | |
| "loss": 0.2483, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 2.0864, | |
| "grad_norm": 5.727556228637695, | |
| "learning_rate": 1.608450704225352e-05, | |
| "loss": 0.1534, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 2.0896, | |
| "grad_norm": 0.7954875230789185, | |
| "learning_rate": 1.6028169014084507e-05, | |
| "loss": 0.1282, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 2.0928, | |
| "grad_norm": 0.08250103145837784, | |
| "learning_rate": 1.5971830985915492e-05, | |
| "loss": 0.2734, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 2.096, | |
| "grad_norm": 0.04844718798995018, | |
| "learning_rate": 1.591549295774648e-05, | |
| "loss": 0.0514, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 2.0992, | |
| "grad_norm": 0.04677910357713699, | |
| "learning_rate": 1.5859154929577466e-05, | |
| "loss": 0.0621, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 2.1024, | |
| "grad_norm": 0.12014532834291458, | |
| "learning_rate": 1.5802816901408452e-05, | |
| "loss": 0.0712, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 2.1056, | |
| "grad_norm": 0.18135568499565125, | |
| "learning_rate": 1.5746478873239437e-05, | |
| "loss": 0.0642, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 2.1088, | |
| "grad_norm": 0.13500288128852844, | |
| "learning_rate": 1.5690140845070423e-05, | |
| "loss": 0.0566, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 2.112, | |
| "grad_norm": 0.03971581906080246, | |
| "learning_rate": 1.5633802816901412e-05, | |
| "loss": 0.0039, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.1152, | |
| "grad_norm": 0.12814994156360626, | |
| "learning_rate": 1.5577464788732397e-05, | |
| "loss": 0.1774, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 2.1184, | |
| "grad_norm": 0.02763848565518856, | |
| "learning_rate": 1.5521126760563383e-05, | |
| "loss": 0.2017, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 2.1216, | |
| "grad_norm": 0.16662102937698364, | |
| "learning_rate": 1.546478873239437e-05, | |
| "loss": 0.0964, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 2.1248, | |
| "grad_norm": 0.0411493182182312, | |
| "learning_rate": 1.5408450704225354e-05, | |
| "loss": 0.055, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 2.128, | |
| "grad_norm": 0.10390494018793106, | |
| "learning_rate": 1.535211267605634e-05, | |
| "loss": 0.1993, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 2.1312, | |
| "grad_norm": 32.90834426879883, | |
| "learning_rate": 1.5295774647887325e-05, | |
| "loss": 0.2295, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 2.1344, | |
| "grad_norm": 0.20629918575286865, | |
| "learning_rate": 1.5239436619718312e-05, | |
| "loss": 0.1365, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 2.1376, | |
| "grad_norm": 0.06436211615800858, | |
| "learning_rate": 1.5183098591549298e-05, | |
| "loss": 0.2804, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 2.1408, | |
| "grad_norm": 6.357541561126709, | |
| "learning_rate": 1.5126760563380283e-05, | |
| "loss": 0.1547, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 2.144, | |
| "grad_norm": 0.031177496537566185, | |
| "learning_rate": 1.5070422535211269e-05, | |
| "loss": 0.1221, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.1471999999999998, | |
| "grad_norm": 0.0401877760887146, | |
| "learning_rate": 1.5014084507042252e-05, | |
| "loss": 0.0467, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 2.1504, | |
| "grad_norm": 0.04002746194601059, | |
| "learning_rate": 1.4957746478873241e-05, | |
| "loss": 0.2081, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 2.1536, | |
| "grad_norm": 0.08599916845560074, | |
| "learning_rate": 1.4901408450704227e-05, | |
| "loss": 0.0094, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 2.1568, | |
| "grad_norm": 792.8843994140625, | |
| "learning_rate": 1.4845070422535212e-05, | |
| "loss": 0.0313, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 49.27170944213867, | |
| "learning_rate": 1.4788732394366198e-05, | |
| "loss": 0.0488, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 2.1632, | |
| "grad_norm": 0.027708498761057854, | |
| "learning_rate": 1.4732394366197183e-05, | |
| "loss": 0.2522, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 2.1664, | |
| "grad_norm": 144.439697265625, | |
| "learning_rate": 1.4676056338028169e-05, | |
| "loss": 0.2231, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 2.1696, | |
| "grad_norm": 0.05224217101931572, | |
| "learning_rate": 1.4619718309859156e-05, | |
| "loss": 0.0018, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 2.1728, | |
| "grad_norm": 85.55796813964844, | |
| "learning_rate": 1.4563380281690141e-05, | |
| "loss": 0.218, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 2.176, | |
| "grad_norm": 0.17730030417442322, | |
| "learning_rate": 1.4507042253521127e-05, | |
| "loss": 0.13, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.1792, | |
| "grad_norm": 0.05483116954565048, | |
| "learning_rate": 1.4450704225352112e-05, | |
| "loss": 0.0386, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 2.1824, | |
| "grad_norm": 0.03330325335264206, | |
| "learning_rate": 1.4394366197183098e-05, | |
| "loss": 0.0036, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 2.1856, | |
| "grad_norm": 0.030421894043684006, | |
| "learning_rate": 1.4338028169014083e-05, | |
| "loss": 0.1214, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 2.1888, | |
| "grad_norm": 0.037813425064086914, | |
| "learning_rate": 1.4281690140845072e-05, | |
| "loss": 0.002, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 2.192, | |
| "grad_norm": 0.4608314335346222, | |
| "learning_rate": 1.4225352112676058e-05, | |
| "loss": 0.2043, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.1952, | |
| "grad_norm": 0.16903652250766754, | |
| "learning_rate": 1.4169014084507043e-05, | |
| "loss": 0.2208, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 2.1984, | |
| "grad_norm": 0.09764442592859268, | |
| "learning_rate": 1.4112676056338029e-05, | |
| "loss": 0.2925, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 2.2016, | |
| "grad_norm": 0.04944216087460518, | |
| "learning_rate": 1.4056338028169014e-05, | |
| "loss": 0.0023, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 2.2048, | |
| "grad_norm": 18.86257553100586, | |
| "learning_rate": 1.4000000000000001e-05, | |
| "loss": 0.1508, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 2.208, | |
| "grad_norm": 5.072443962097168, | |
| "learning_rate": 1.3943661971830987e-05, | |
| "loss": 0.1069, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.2112, | |
| "grad_norm": 0.12859505414962769, | |
| "learning_rate": 1.3887323943661972e-05, | |
| "loss": 0.0233, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 2.2144, | |
| "grad_norm": 0.06567766517400742, | |
| "learning_rate": 1.3830985915492958e-05, | |
| "loss": 0.0834, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 2.2176, | |
| "grad_norm": 15.95632266998291, | |
| "learning_rate": 1.3774647887323943e-05, | |
| "loss": 0.3579, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 2.2208, | |
| "grad_norm": 0.274181067943573, | |
| "learning_rate": 1.3718309859154929e-05, | |
| "loss": 0.1515, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 2.224, | |
| "grad_norm": 0.13101747632026672, | |
| "learning_rate": 1.3661971830985918e-05, | |
| "loss": 0.0393, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.2272, | |
| "grad_norm": 1.139413595199585, | |
| "learning_rate": 1.3605633802816903e-05, | |
| "loss": 0.1484, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 2.2304, | |
| "grad_norm": 0.057852111756801605, | |
| "learning_rate": 1.3549295774647889e-05, | |
| "loss": 0.0394, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 2.2336, | |
| "grad_norm": 0.6658930778503418, | |
| "learning_rate": 1.3492957746478874e-05, | |
| "loss": 0.0675, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 2.2368, | |
| "grad_norm": 0.057538602501153946, | |
| "learning_rate": 1.343661971830986e-05, | |
| "loss": 0.1998, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 0.08062786608934402, | |
| "learning_rate": 1.3380281690140845e-05, | |
| "loss": 0.1043, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.2432, | |
| "grad_norm": 12.996604919433594, | |
| "learning_rate": 1.3323943661971833e-05, | |
| "loss": 0.0718, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 2.2464, | |
| "grad_norm": 0.1011863648891449, | |
| "learning_rate": 1.3267605633802818e-05, | |
| "loss": 0.0694, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 2.2496, | |
| "grad_norm": 20.72796058654785, | |
| "learning_rate": 1.3211267605633804e-05, | |
| "loss": 0.1426, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 2.2528, | |
| "grad_norm": 0.22724929451942444, | |
| "learning_rate": 1.3154929577464789e-05, | |
| "loss": 0.0025, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 229.4677734375, | |
| "learning_rate": 1.3098591549295775e-05, | |
| "loss": 0.2219, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.2592, | |
| "grad_norm": 0.1337730884552002, | |
| "learning_rate": 1.304225352112676e-05, | |
| "loss": 0.1146, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 2.2624, | |
| "grad_norm": 0.08331338316202164, | |
| "learning_rate": 1.2985915492957749e-05, | |
| "loss": 0.0336, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 2.2656, | |
| "grad_norm": 0.047301776707172394, | |
| "learning_rate": 1.2929577464788733e-05, | |
| "loss": 0.0719, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 2.2688, | |
| "grad_norm": 0.0657852441072464, | |
| "learning_rate": 1.2873239436619718e-05, | |
| "loss": 0.1904, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 0.033138763159513474, | |
| "learning_rate": 1.2816901408450704e-05, | |
| "loss": 0.0779, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.2752, | |
| "grad_norm": 37.34537887573242, | |
| "learning_rate": 1.276056338028169e-05, | |
| "loss": 0.1858, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 2.2784, | |
| "grad_norm": 0.10586226731538773, | |
| "learning_rate": 1.2704225352112675e-05, | |
| "loss": 0.002, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 2.2816, | |
| "grad_norm": 0.0424388162791729, | |
| "learning_rate": 1.2647887323943664e-05, | |
| "loss": 0.1177, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 2.2848, | |
| "grad_norm": 0.05596411973237991, | |
| "learning_rate": 1.259154929577465e-05, | |
| "loss": 0.2422, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 2.288, | |
| "grad_norm": 0.05766447260975838, | |
| "learning_rate": 1.2535211267605635e-05, | |
| "loss": 0.0803, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.2912, | |
| "grad_norm": 1.7550102472305298, | |
| "learning_rate": 1.247887323943662e-05, | |
| "loss": 0.1408, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 2.2944, | |
| "grad_norm": 0.13066236674785614, | |
| "learning_rate": 1.2422535211267607e-05, | |
| "loss": 0.002, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 2.2976, | |
| "grad_norm": 0.1156509518623352, | |
| "learning_rate": 1.2366197183098593e-05, | |
| "loss": 0.2522, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 2.3008, | |
| "grad_norm": 0.09673482924699783, | |
| "learning_rate": 1.2309859154929577e-05, | |
| "loss": 0.0857, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 2.304, | |
| "grad_norm": 0.8121844530105591, | |
| "learning_rate": 1.2253521126760564e-05, | |
| "loss": 0.193, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.3072, | |
| "grad_norm": 0.15363769233226776, | |
| "learning_rate": 1.219718309859155e-05, | |
| "loss": 0.1038, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 2.3104, | |
| "grad_norm": 0.10093524307012558, | |
| "learning_rate": 1.2140845070422535e-05, | |
| "loss": 0.2331, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 2.3136, | |
| "grad_norm": 0.4434497058391571, | |
| "learning_rate": 1.2084507042253522e-05, | |
| "loss": 0.0804, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 2.3168, | |
| "grad_norm": 8.138230323791504, | |
| "learning_rate": 1.2028169014084508e-05, | |
| "loss": 0.2648, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 45.24201583862305, | |
| "learning_rate": 1.1971830985915493e-05, | |
| "loss": 0.2238, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.3232, | |
| "grad_norm": 0.13272492587566376, | |
| "learning_rate": 1.191549295774648e-05, | |
| "loss": 0.2196, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 2.3264, | |
| "grad_norm": 89.29029846191406, | |
| "learning_rate": 1.1859154929577466e-05, | |
| "loss": 0.3195, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 2.3296, | |
| "grad_norm": 0.09818959981203079, | |
| "learning_rate": 1.1802816901408451e-05, | |
| "loss": 0.2584, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 2.3327999999999998, | |
| "grad_norm": 0.21061986684799194, | |
| "learning_rate": 1.1746478873239437e-05, | |
| "loss": 0.0191, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 2.336, | |
| "grad_norm": 0.12225896865129471, | |
| "learning_rate": 1.1690140845070422e-05, | |
| "loss": 0.1498, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.3392, | |
| "grad_norm": 0.10590647161006927, | |
| "learning_rate": 1.163380281690141e-05, | |
| "loss": 0.0103, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 2.3424, | |
| "grad_norm": 0.0610116645693779, | |
| "learning_rate": 1.1577464788732395e-05, | |
| "loss": 0.1074, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 2.3456, | |
| "grad_norm": 0.21642152965068817, | |
| "learning_rate": 1.152112676056338e-05, | |
| "loss": 0.0459, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 2.3487999999999998, | |
| "grad_norm": 0.12459522485733032, | |
| "learning_rate": 1.1464788732394368e-05, | |
| "loss": 0.0251, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 2.352, | |
| "grad_norm": 0.03281530365347862, | |
| "learning_rate": 1.1408450704225353e-05, | |
| "loss": 0.0669, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.3552, | |
| "grad_norm": 26.73065757751465, | |
| "learning_rate": 1.1352112676056339e-05, | |
| "loss": 0.287, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 2.3584, | |
| "grad_norm": 55.194541931152344, | |
| "learning_rate": 1.1295774647887324e-05, | |
| "loss": 0.034, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 2.3616, | |
| "grad_norm": 2.0578792095184326, | |
| "learning_rate": 1.123943661971831e-05, | |
| "loss": 0.0027, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 2.3648, | |
| "grad_norm": 4.148108005523682, | |
| "learning_rate": 1.1183098591549295e-05, | |
| "loss": 0.2128, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 2.368, | |
| "grad_norm": 0.11812355369329453, | |
| "learning_rate": 1.1126760563380282e-05, | |
| "loss": 0.0027, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.3712, | |
| "grad_norm": 0.03446757793426514, | |
| "learning_rate": 1.1070422535211268e-05, | |
| "loss": 0.2747, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 2.3744, | |
| "grad_norm": 16.427898406982422, | |
| "learning_rate": 1.1014084507042253e-05, | |
| "loss": 0.0905, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 2.3776, | |
| "grad_norm": 0.08660475164651871, | |
| "learning_rate": 1.095774647887324e-05, | |
| "loss": 0.0019, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 2.3808, | |
| "grad_norm": 0.038691841065883636, | |
| "learning_rate": 1.0901408450704226e-05, | |
| "loss": 0.0015, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 2.384, | |
| "grad_norm": 0.037728451192379, | |
| "learning_rate": 1.0845070422535212e-05, | |
| "loss": 0.0715, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.3872, | |
| "grad_norm": 0.17045029997825623, | |
| "learning_rate": 1.0788732394366199e-05, | |
| "loss": 0.1117, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 2.3904, | |
| "grad_norm": 0.061354752629995346, | |
| "learning_rate": 1.0732394366197184e-05, | |
| "loss": 0.1794, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 2.3936, | |
| "grad_norm": 0.052094943821430206, | |
| "learning_rate": 1.067605633802817e-05, | |
| "loss": 0.0585, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 2.3968, | |
| "grad_norm": 81.28450012207031, | |
| "learning_rate": 1.0619718309859155e-05, | |
| "loss": 0.0784, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 3.190966844558716, | |
| "learning_rate": 1.056338028169014e-05, | |
| "loss": 0.2101, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.4032, | |
| "grad_norm": 0.030748562887310982, | |
| "learning_rate": 1.0507042253521126e-05, | |
| "loss": 0.0671, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 2.4064, | |
| "grad_norm": 0.1670941859483719, | |
| "learning_rate": 1.0450704225352113e-05, | |
| "loss": 0.2274, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 2.4096, | |
| "grad_norm": 0.06420325487852097, | |
| "learning_rate": 1.0394366197183099e-05, | |
| "loss": 0.0918, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 2.4128, | |
| "grad_norm": 0.030242426320910454, | |
| "learning_rate": 1.0338028169014086e-05, | |
| "loss": 0.0561, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 2.416, | |
| "grad_norm": 0.0714086964726448, | |
| "learning_rate": 1.0281690140845072e-05, | |
| "loss": 0.2359, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.4192, | |
| "grad_norm": 0.17600581049919128, | |
| "learning_rate": 1.0225352112676057e-05, | |
| "loss": 0.0605, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 2.4224, | |
| "grad_norm": 0.03392624855041504, | |
| "learning_rate": 1.0169014084507043e-05, | |
| "loss": 0.0405, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 2.4256, | |
| "grad_norm": 0.02195708081126213, | |
| "learning_rate": 1.0112676056338028e-05, | |
| "loss": 0.0828, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 2.4288, | |
| "grad_norm": 0.08389753103256226, | |
| "learning_rate": 1.0056338028169014e-05, | |
| "loss": 0.1063, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 2.432, | |
| "grad_norm": 4.389575481414795, | |
| "learning_rate": 1e-05, | |
| "loss": 0.5413, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.4352, | |
| "grad_norm": 0.2542371153831482, | |
| "learning_rate": 9.943661971830986e-06, | |
| "loss": 0.0079, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 2.4384, | |
| "grad_norm": 0.2106814831495285, | |
| "learning_rate": 9.887323943661972e-06, | |
| "loss": 0.1727, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 2.4416, | |
| "grad_norm": 212.36619567871094, | |
| "learning_rate": 9.830985915492959e-06, | |
| "loss": 0.0392, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 2.4448, | |
| "grad_norm": 80.81587219238281, | |
| "learning_rate": 9.774647887323945e-06, | |
| "loss": 0.1838, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 2.448, | |
| "grad_norm": 23.665437698364258, | |
| "learning_rate": 9.71830985915493e-06, | |
| "loss": 0.2827, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.4512, | |
| "grad_norm": 0.037149883806705475, | |
| "learning_rate": 9.661971830985917e-06, | |
| "loss": 0.0753, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 2.4544, | |
| "grad_norm": 0.2889798581600189, | |
| "learning_rate": 9.605633802816901e-06, | |
| "loss": 0.0021, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 2.4576000000000002, | |
| "grad_norm": 39.32601547241211, | |
| "learning_rate": 9.549295774647887e-06, | |
| "loss": 0.1529, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 2.4608, | |
| "grad_norm": 14.48507022857666, | |
| "learning_rate": 9.492957746478874e-06, | |
| "loss": 0.3042, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 2.464, | |
| "grad_norm": 13.561240196228027, | |
| "learning_rate": 9.43661971830986e-06, | |
| "loss": 0.2097, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.4672, | |
| "grad_norm": 0.1803174614906311, | |
| "learning_rate": 9.380281690140845e-06, | |
| "loss": 0.198, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 2.4704, | |
| "grad_norm": 0.10523468255996704, | |
| "learning_rate": 9.323943661971832e-06, | |
| "loss": 0.138, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 2.4736000000000002, | |
| "grad_norm": 0.13104431331157684, | |
| "learning_rate": 9.267605633802817e-06, | |
| "loss": 0.0508, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 2.4768, | |
| "grad_norm": 0.6252680420875549, | |
| "learning_rate": 9.211267605633803e-06, | |
| "loss": 0.0026, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 0.3997954726219177, | |
| "learning_rate": 9.15492957746479e-06, | |
| "loss": 0.0091, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.4832, | |
| "grad_norm": 0.063034288585186, | |
| "learning_rate": 9.098591549295776e-06, | |
| "loss": 0.0568, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 2.4864, | |
| "grad_norm": 0.26265960931777954, | |
| "learning_rate": 9.042253521126761e-06, | |
| "loss": 0.1125, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 2.4896, | |
| "grad_norm": 16.294443130493164, | |
| "learning_rate": 8.985915492957747e-06, | |
| "loss": 0.0943, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 2.4928, | |
| "grad_norm": 0.042526353150606155, | |
| "learning_rate": 8.929577464788732e-06, | |
| "loss": 0.0812, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 2.496, | |
| "grad_norm": 0.23846685886383057, | |
| "learning_rate": 8.87323943661972e-06, | |
| "loss": 0.2302, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.4992, | |
| "grad_norm": 0.08233381807804108, | |
| "learning_rate": 8.816901408450705e-06, | |
| "loss": 0.2928, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 2.5023999999999997, | |
| "grad_norm": 0.319055438041687, | |
| "learning_rate": 8.76056338028169e-06, | |
| "loss": 0.0741, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 2.5056000000000003, | |
| "grad_norm": 4.767480850219727, | |
| "learning_rate": 8.704225352112677e-06, | |
| "loss": 0.2083, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 2.5088, | |
| "grad_norm": 0.0436800941824913, | |
| "learning_rate": 8.647887323943663e-06, | |
| "loss": 0.084, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 2.512, | |
| "grad_norm": 102.92646789550781, | |
| "learning_rate": 8.591549295774648e-06, | |
| "loss": 0.0866, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.5152, | |
| "grad_norm": 0.14441460371017456, | |
| "learning_rate": 8.535211267605634e-06, | |
| "loss": 0.0812, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 2.5183999999999997, | |
| "grad_norm": 43.7078971862793, | |
| "learning_rate": 8.47887323943662e-06, | |
| "loss": 0.186, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 2.5216, | |
| "grad_norm": 0.21896377205848694, | |
| "learning_rate": 8.422535211267605e-06, | |
| "loss": 0.022, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 2.5248, | |
| "grad_norm": 0.0571066252887249, | |
| "learning_rate": 8.366197183098592e-06, | |
| "loss": 0.0039, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 2.528, | |
| "grad_norm": 0.02678661048412323, | |
| "learning_rate": 8.309859154929578e-06, | |
| "loss": 0.0926, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.5312, | |
| "grad_norm": 11.297701835632324, | |
| "learning_rate": 8.253521126760563e-06, | |
| "loss": 0.1038, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 2.5343999999999998, | |
| "grad_norm": 0.04962446540594101, | |
| "learning_rate": 8.19718309859155e-06, | |
| "loss": 0.0019, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 2.5376, | |
| "grad_norm": 17.65663719177246, | |
| "learning_rate": 8.140845070422536e-06, | |
| "loss": 0.128, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 2.5408, | |
| "grad_norm": 33.21250915527344, | |
| "learning_rate": 8.084507042253521e-06, | |
| "loss": 0.1849, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 2.544, | |
| "grad_norm": 52.65444564819336, | |
| "learning_rate": 8.028169014084509e-06, | |
| "loss": 0.0707, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.5472, | |
| "grad_norm": 18.572467803955078, | |
| "learning_rate": 7.971830985915494e-06, | |
| "loss": 0.1295, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 2.5504, | |
| "grad_norm": 0.7327030897140503, | |
| "learning_rate": 7.915492957746478e-06, | |
| "loss": 0.0859, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 2.5536, | |
| "grad_norm": 0.0810910239815712, | |
| "learning_rate": 7.859154929577465e-06, | |
| "loss": 0.0147, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 2.5568, | |
| "grad_norm": 0.1518411636352539, | |
| "learning_rate": 7.80281690140845e-06, | |
| "loss": 0.0688, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.07638181000947952, | |
| "learning_rate": 7.746478873239436e-06, | |
| "loss": 0.2128, | |
| "step": 8000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 9375, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4209776885760000.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |