| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0051031256644696, | |
| "eval_steps": 500, | |
| "global_step": 6500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0015464017165059054, | |
| "grad_norm": 4352.0, | |
| "learning_rate": 1.9972307692307693e-05, | |
| "loss": 10.9174, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0030928034330118107, | |
| "grad_norm": 71168.0, | |
| "learning_rate": 1.9941538461538464e-05, | |
| "loss": 11.9649, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.004639205149517716, | |
| "grad_norm": 190.0, | |
| "learning_rate": 1.9910769230769232e-05, | |
| "loss": 5.27, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0061856068660236215, | |
| "grad_norm": 16.125, | |
| "learning_rate": 1.9880000000000003e-05, | |
| "loss": 0.3647, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.007732008582529527, | |
| "grad_norm": 3.890625, | |
| "learning_rate": 1.984923076923077e-05, | |
| "loss": 0.3099, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.009278410299035433, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.9818461538461538e-05, | |
| "loss": 0.2842, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.010824812015541337, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.978769230769231e-05, | |
| "loss": 0.2943, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.012371213732047243, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 1.9756923076923077e-05, | |
| "loss": 0.3539, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.013917615448553147, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.9726153846153848e-05, | |
| "loss": 0.259, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.015464017165059053, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 1.9695384615384616e-05, | |
| "loss": 0.2741, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01701041888156496, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.9664615384615387e-05, | |
| "loss": 0.281, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.018556820598070865, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 1.9633846153846155e-05, | |
| "loss": 0.2586, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.020103222314576768, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 1.9603076923076926e-05, | |
| "loss": 0.2776, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.021649624031082674, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 1.9572307692307693e-05, | |
| "loss": 0.3186, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.02319602574758858, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 1.9541538461538464e-05, | |
| "loss": 0.3315, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.024742427464094486, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 1.9510769230769232e-05, | |
| "loss": 0.257, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.026288829180600392, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 1.948e-05, | |
| "loss": 0.2592, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.027835230897106295, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 1.944923076923077e-05, | |
| "loss": 0.2703, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.0293816326136122, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 1.941846153846154e-05, | |
| "loss": 0.2547, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.030928034330118107, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 1.938769230769231e-05, | |
| "loss": 0.3182, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.03247443604662401, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 1.9356923076923077e-05, | |
| "loss": 0.3005, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.03402083776312992, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 1.932615384615385e-05, | |
| "loss": 0.2693, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.035567239479635825, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 1.929538461538462e-05, | |
| "loss": 0.2925, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.03711364119614173, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 1.9264615384615387e-05, | |
| "loss": 0.3165, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.03866004291264763, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 1.9233846153846155e-05, | |
| "loss": 0.2606, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.040206444629153536, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 1.9203076923076923e-05, | |
| "loss": 0.324, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.04175284634565944, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.9172307692307694e-05, | |
| "loss": 0.2787, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.04329924806216535, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.914153846153846e-05, | |
| "loss": 0.3092, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.044845649778671254, | |
| "grad_norm": 0.875, | |
| "learning_rate": 1.9110769230769233e-05, | |
| "loss": 0.2831, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.04639205149517716, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.908e-05, | |
| "loss": 0.282, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.047938453211683066, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.904923076923077e-05, | |
| "loss": 0.3863, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.04948485492818897, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 1.901846153846154e-05, | |
| "loss": 0.246, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.05103125664469488, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 1.898769230769231e-05, | |
| "loss": 0.3483, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.052577658361200784, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 1.8956923076923078e-05, | |
| "loss": 0.4107, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.05412406007770668, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.892615384615385e-05, | |
| "loss": 0.2813, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.05567046179421259, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 1.8895384615384617e-05, | |
| "loss": 0.283, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.057216863510718495, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 1.8864615384615384e-05, | |
| "loss": 0.268, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.0587632652272244, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.8833846153846155e-05, | |
| "loss": 0.2852, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.06030966694373031, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 1.8803076923076923e-05, | |
| "loss": 0.2477, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.06185606866023621, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.8772307692307694e-05, | |
| "loss": 0.2418, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.06340247037674211, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 1.8741538461538462e-05, | |
| "loss": 0.2218, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.06494887209324803, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 1.8710769230769233e-05, | |
| "loss": 0.2616, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.06649527380975392, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 1.8680000000000004e-05, | |
| "loss": 0.3475, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.06804167552625984, | |
| "grad_norm": 1.125, | |
| "learning_rate": 1.8649230769230772e-05, | |
| "loss": 0.3025, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.06958807724276574, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 1.861846153846154e-05, | |
| "loss": 0.3119, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.07113447895927165, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 1.8587692307692307e-05, | |
| "loss": 0.3004, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.07268088067577755, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 1.8556923076923078e-05, | |
| "loss": 0.2957, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.07422728239228346, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 1.8526153846153846e-05, | |
| "loss": 0.3162, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.07577368410878936, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.8495384615384617e-05, | |
| "loss": 0.3637, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.07732008582529526, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.8464615384615385e-05, | |
| "loss": 0.2379, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.07886648754180117, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 1.8433846153846156e-05, | |
| "loss": 0.3098, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.08041288925830707, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.8403076923076924e-05, | |
| "loss": 0.3977, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.08195929097481298, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 1.8372307692307695e-05, | |
| "loss": 0.3034, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.08350569269131888, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 1.8341538461538462e-05, | |
| "loss": 0.2327, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.0850520944078248, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 1.8310769230769233e-05, | |
| "loss": 0.2561, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.0865984961243307, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 1.828e-05, | |
| "loss": 0.3739, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.08814489784083661, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 1.824923076923077e-05, | |
| "loss": 0.3605, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.08969129955734251, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.821846153846154e-05, | |
| "loss": 0.2557, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.09123770127384842, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.8187692307692308e-05, | |
| "loss": 0.2806, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.09278410299035432, | |
| "grad_norm": 0.875, | |
| "learning_rate": 1.815692307692308e-05, | |
| "loss": 0.2977, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.09433050470686022, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 1.8126153846153846e-05, | |
| "loss": 0.2845, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.09587690642336613, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 1.8095384615384618e-05, | |
| "loss": 0.3309, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.09742330813987203, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 1.806461538461539e-05, | |
| "loss": 0.3197, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.09896970985637794, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 1.8033846153846156e-05, | |
| "loss": 0.2654, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.10051611157288384, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 1.8003076923076924e-05, | |
| "loss": 0.2954, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.10206251328938976, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.7972307692307692e-05, | |
| "loss": 0.3237, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.10360891500589565, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 1.7941538461538463e-05, | |
| "loss": 0.2887, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.10515531672240157, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 1.791076923076923e-05, | |
| "loss": 0.3018, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.10670171843890747, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 1.788e-05, | |
| "loss": 0.261, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.10824812015541337, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 1.784923076923077e-05, | |
| "loss": 0.254, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.10979452187191928, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 1.781846153846154e-05, | |
| "loss": 0.2944, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.11134092358842518, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 1.778769230769231e-05, | |
| "loss": 0.3163, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.11288732530493109, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 1.775692307692308e-05, | |
| "loss": 0.2838, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.11443372702143699, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 1.7726153846153847e-05, | |
| "loss": 0.236, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.1159801287379429, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 1.7695384615384618e-05, | |
| "loss": 0.2164, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.1175265304544488, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 1.7664615384615386e-05, | |
| "loss": 0.3331, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.11907293217095472, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 1.7633846153846153e-05, | |
| "loss": 0.303, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.12061933388746061, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 1.7603076923076924e-05, | |
| "loss": 0.3264, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.12216573560396653, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 1.7572307692307692e-05, | |
| "loss": 0.2097, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.12371213732047243, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.7541538461538463e-05, | |
| "loss": 0.2456, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.12525853903697834, | |
| "grad_norm": 0.875, | |
| "learning_rate": 1.751076923076923e-05, | |
| "loss": 0.2877, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.12680494075348422, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 1.7480000000000002e-05, | |
| "loss": 0.2902, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.12835134246999014, | |
| "grad_norm": 0.875, | |
| "learning_rate": 1.7449230769230773e-05, | |
| "loss": 0.2357, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.12989774418649605, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 1.741846153846154e-05, | |
| "loss": 0.2926, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.13144414590300196, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 1.738769230769231e-05, | |
| "loss": 0.2301, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.13299054761950785, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 1.7356923076923076e-05, | |
| "loss": 0.2501, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.13453694933601376, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.7326153846153847e-05, | |
| "loss": 0.2393, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.13608335105251967, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 1.7295384615384615e-05, | |
| "loss": 0.2337, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.1376297527690256, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 1.7264615384615386e-05, | |
| "loss": 0.3147, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.13917615448553147, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 1.7233846153846154e-05, | |
| "loss": 0.2949, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.14072255620203739, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 1.7203076923076925e-05, | |
| "loss": 0.3394, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.1422689579185433, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 1.7172307692307696e-05, | |
| "loss": 0.3119, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.14381535963504918, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 1.7141538461538464e-05, | |
| "loss": 0.2959, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.1453617613515551, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 1.711076923076923e-05, | |
| "loss": 0.2677, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.146908163068061, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.7080000000000002e-05, | |
| "loss": 0.2575, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.14845456478456692, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 1.704923076923077e-05, | |
| "loss": 0.2419, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.1500009665010728, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 1.7018461538461538e-05, | |
| "loss": 0.2631, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.15154736821757872, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 1.698769230769231e-05, | |
| "loss": 0.2415, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.15309376993408463, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 1.6956923076923077e-05, | |
| "loss": 0.2498, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.15464017165059052, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 1.6926153846153848e-05, | |
| "loss": 0.2845, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.15618657336709643, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.6895384615384615e-05, | |
| "loss": 0.3159, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.15773297508360234, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 1.6864615384615387e-05, | |
| "loss": 0.2969, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.15927937680010826, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 1.6833846153846158e-05, | |
| "loss": 0.3195, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.16082577851661414, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 1.6803076923076925e-05, | |
| "loss": 0.3086, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.16237218023312006, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 1.6772307692307693e-05, | |
| "loss": 0.297, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.16391858194962597, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.674153846153846e-05, | |
| "loss": 0.2677, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.16546498366613188, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.6710769230769232e-05, | |
| "loss": 0.294, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.16701138538263777, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 1.668e-05, | |
| "loss": 0.2483, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.16855778709914368, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 1.664923076923077e-05, | |
| "loss": 0.2564, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.1701041888156496, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 1.661846153846154e-05, | |
| "loss": 0.2363, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.17165059053215548, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.658769230769231e-05, | |
| "loss": 0.2486, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.1731969922486614, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.655692307692308e-05, | |
| "loss": 0.3142, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.1747433939651673, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 1.6526153846153848e-05, | |
| "loss": 0.4319, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.17628979568167322, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 1.6495384615384616e-05, | |
| "loss": 0.2727, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.1778361973981791, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 1.6464615384615387e-05, | |
| "loss": 0.2472, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.17938259911468502, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 1.6433846153846155e-05, | |
| "loss": 0.3036, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.18092900083119093, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.6403076923076922e-05, | |
| "loss": 0.2199, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.18247540254769684, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 1.6372307692307693e-05, | |
| "loss": 0.2474, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.18402180426420273, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 1.634153846153846e-05, | |
| "loss": 0.2892, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.18556820598070864, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 1.6310769230769232e-05, | |
| "loss": 0.3317, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.18711460769721455, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 1.628e-05, | |
| "loss": 0.3066, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.18866100941372044, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 1.624923076923077e-05, | |
| "loss": 0.2811, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.19020741113022635, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 1.6218461538461542e-05, | |
| "loss": 0.2503, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.19175381284673226, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 1.618769230769231e-05, | |
| "loss": 0.3128, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.19330021456323818, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 1.6156923076923078e-05, | |
| "loss": 0.3067, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.19484661627974406, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 1.6126153846153845e-05, | |
| "loss": 0.2975, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.19639301799624997, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 1.6095384615384616e-05, | |
| "loss": 0.3083, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.1979394197127559, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 1.6064615384615384e-05, | |
| "loss": 0.2786, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.19948582142926177, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 1.6033846153846155e-05, | |
| "loss": 0.404, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.20103222314576769, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 1.6003076923076923e-05, | |
| "loss": 0.3213, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.2025786248622736, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 1.5972307692307694e-05, | |
| "loss": 0.24, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.2041250265787795, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 1.5941538461538465e-05, | |
| "loss": 0.2711, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.2056714282952854, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 1.5910769230769233e-05, | |
| "loss": 0.2493, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.2072178300117913, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 1.588e-05, | |
| "loss": 0.353, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.20876423172829722, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 1.584923076923077e-05, | |
| "loss": 0.2534, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.21031063344480314, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 1.581846153846154e-05, | |
| "loss": 0.2088, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.21185703516130902, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 1.5787692307692307e-05, | |
| "loss": 0.3146, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.21340343687781493, | |
| "grad_norm": 1.125, | |
| "learning_rate": 1.5756923076923078e-05, | |
| "loss": 0.2947, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.21494983859432085, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 1.5726153846153846e-05, | |
| "loss": 0.2039, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.21649624031082673, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 1.5695384615384617e-05, | |
| "loss": 0.252, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.21804264202733264, | |
| "grad_norm": 0.875, | |
| "learning_rate": 1.5664615384615388e-05, | |
| "loss": 0.2689, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.21958904374383856, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.5633846153846156e-05, | |
| "loss": 0.3239, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.22113544546034447, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.5603076923076927e-05, | |
| "loss": 0.2891, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.22268184717685036, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 1.5572307692307694e-05, | |
| "loss": 0.3306, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.22422824889335627, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.5541538461538462e-05, | |
| "loss": 0.2971, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.22577465060986218, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 1.551076923076923e-05, | |
| "loss": 0.2892, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.2273210523263681, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 1.548e-05, | |
| "loss": 0.2773, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.22886745404287398, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 1.544923076923077e-05, | |
| "loss": 0.2767, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.2304138557593799, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.541846153846154e-05, | |
| "loss": 0.2899, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.2319602574758858, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 1.5387692307692307e-05, | |
| "loss": 0.2521, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.2335066591923917, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.535692307692308e-05, | |
| "loss": 0.2479, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.2350530609088976, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.532615384615385e-05, | |
| "loss": 0.3154, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.23659946262540352, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 1.5295384615384617e-05, | |
| "loss": 0.3391, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.23814586434190943, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 1.5264615384615385e-05, | |
| "loss": 0.265, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.23969226605841532, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.5233846153846154e-05, | |
| "loss": 0.2949, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.24123866777492123, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 1.5203076923076925e-05, | |
| "loss": 0.3136, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.24278506949142714, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 1.5172307692307693e-05, | |
| "loss": 0.3073, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.24433147120793305, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 1.5141538461538463e-05, | |
| "loss": 0.3271, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.24587787292443894, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.5110769230769232e-05, | |
| "loss": 0.2722, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.24742427464094485, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 1.5080000000000001e-05, | |
| "loss": 0.3513, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.24897067635745077, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 1.504923076923077e-05, | |
| "loss": 0.2212, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.2505170780739567, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 1.501846153846154e-05, | |
| "loss": 0.2914, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.25206347979046256, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 1.498769230769231e-05, | |
| "loss": 0.281, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.25360988150696845, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 1.4956923076923077e-05, | |
| "loss": 0.2509, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.2551562832234744, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 1.4926153846153848e-05, | |
| "loss": 0.2994, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.2567026849399803, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 1.4895384615384616e-05, | |
| "loss": 0.2839, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.2582490866564862, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 1.4864615384615385e-05, | |
| "loss": 0.229, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.2597954883729921, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 1.4833846153846155e-05, | |
| "loss": 0.2381, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.261341890089498, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 1.4803076923076924e-05, | |
| "loss": 0.3495, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.2628882918060039, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 1.4772307692307692e-05, | |
| "loss": 0.2756, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.2644346935225098, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 1.4741538461538463e-05, | |
| "loss": 0.3189, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.2659810952390157, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 1.4710769230769232e-05, | |
| "loss": 0.289, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.26752749695552164, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 1.4680000000000002e-05, | |
| "loss": 0.2848, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.2690738986720275, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 1.4649230769230771e-05, | |
| "loss": 0.3115, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.2706203003885334, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 1.4618461538461539e-05, | |
| "loss": 0.3075, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.27216670210503935, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 1.458769230769231e-05, | |
| "loss": 0.2705, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.27371310382154523, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 1.4556923076923078e-05, | |
| "loss": 0.266, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.2752595055380512, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 1.4526153846153847e-05, | |
| "loss": 0.3179, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.27680590725455706, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 1.4495384615384616e-05, | |
| "loss": 0.2775, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.27835230897106295, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 1.4464615384615386e-05, | |
| "loss": 0.3279, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.2798987106875689, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 1.4433846153846155e-05, | |
| "loss": 0.2373, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.28144511240407477, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.4403076923076925e-05, | |
| "loss": 0.2216, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.28299151412058066, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 1.4372307692307694e-05, | |
| "loss": 0.3206, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.2845379158370866, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 1.4341538461538462e-05, | |
| "loss": 0.2467, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.2860843175535925, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 1.4310769230769233e-05, | |
| "loss": 0.2818, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.28763071927009837, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 1.428e-05, | |
| "loss": 0.2477, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.2891771209866043, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 1.4249230769230772e-05, | |
| "loss": 0.2576, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.2907235227031102, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 1.421846153846154e-05, | |
| "loss": 0.2841, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.2922699244196161, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 1.4187692307692309e-05, | |
| "loss": 0.3371, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.293816326136122, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.4156923076923076e-05, | |
| "loss": 0.3037, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.2953627278526279, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.4126153846153847e-05, | |
| "loss": 0.2526, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.29690912956913385, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 1.4095384615384617e-05, | |
| "loss": 0.2125, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.29845553128563973, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 1.4064615384615386e-05, | |
| "loss": 0.2783, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.3000019330021456, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 1.4033846153846156e-05, | |
| "loss": 0.3131, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.30154833471865156, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 1.4003076923076923e-05, | |
| "loss": 0.3226, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.30309473643515744, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 1.3972307692307694e-05, | |
| "loss": 0.2819, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.3046411381516633, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 1.3941538461538462e-05, | |
| "loss": 0.2868, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.30618753986816927, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 1.3910769230769232e-05, | |
| "loss": 0.2615, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.30773394158467515, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 1.3880000000000001e-05, | |
| "loss": 0.255, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.30928034330118104, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 1.384923076923077e-05, | |
| "loss": 0.251, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.310826745017687, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 1.3818461538461541e-05, | |
| "loss": 0.2983, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.31237314673419286, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.3787692307692309e-05, | |
| "loss": 0.2705, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.3139195484506988, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 1.3756923076923079e-05, | |
| "loss": 0.2937, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.3154659501672047, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 1.3726153846153846e-05, | |
| "loss": 0.3296, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.3170123518837106, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 1.3695384615384617e-05, | |
| "loss": 0.2666, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.3185587536002165, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.3664615384615385e-05, | |
| "loss": 0.3124, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.3201051553167224, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.3633846153846156e-05, | |
| "loss": 0.3752, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.3216515570332283, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 1.3603076923076924e-05, | |
| "loss": 0.2622, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.3231979587497342, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 1.3572307692307693e-05, | |
| "loss": 0.2526, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.3247443604662401, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 1.3541538461538464e-05, | |
| "loss": 0.2775, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.326290762182746, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 1.3510769230769232e-05, | |
| "loss": 0.3322, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.32783716389925194, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.3480000000000001e-05, | |
| "loss": 0.2897, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.3293835656157578, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.344923076923077e-05, | |
| "loss": 0.2815, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.33092996733226376, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 1.341846153846154e-05, | |
| "loss": 0.2959, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.33247636904876965, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 1.3387692307692308e-05, | |
| "loss": 0.2571, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.33402277076527553, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 1.3356923076923079e-05, | |
| "loss": 0.247, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.3355691724817815, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 1.3326153846153847e-05, | |
| "loss": 0.248, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.33711557419828736, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 1.3295384615384616e-05, | |
| "loss": 0.2438, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.33866197591479325, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 1.3264615384615385e-05, | |
| "loss": 0.3612, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.3402083776312992, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 1.3233846153846155e-05, | |
| "loss": 0.3287, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.34175477934780507, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 1.3203076923076926e-05, | |
| "loss": 0.2756, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.34330118106431096, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 1.3172307692307694e-05, | |
| "loss": 0.2886, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.3448475827808169, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 1.3141538461538463e-05, | |
| "loss": 0.2446, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.3463939844973228, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 1.311076923076923e-05, | |
| "loss": 0.3064, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.3479403862138287, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 1.3080000000000002e-05, | |
| "loss": 0.2376, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.3494867879303346, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 1.304923076923077e-05, | |
| "loss": 0.2932, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.3510331896468405, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 1.301846153846154e-05, | |
| "loss": 0.2979, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.35257959136334643, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 1.2987692307692308e-05, | |
| "loss": 0.2897, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.3541259930798523, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.2956923076923078e-05, | |
| "loss": 0.2744, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.3556723947963582, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 1.2926153846153849e-05, | |
| "loss": 0.2708, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.35721879651286415, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 1.2895384615384616e-05, | |
| "loss": 0.2224, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.35876519822937003, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 1.2864615384615386e-05, | |
| "loss": 0.2728, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.3603115999458759, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.2833846153846155e-05, | |
| "loss": 0.2661, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.36185800166238186, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.2803076923076925e-05, | |
| "loss": 0.2892, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.36340440337888774, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 1.2772307692307692e-05, | |
| "loss": 0.3092, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.3649508050953937, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 1.2741538461538463e-05, | |
| "loss": 0.2542, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.36649720681189957, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 1.2710769230769231e-05, | |
| "loss": 0.3589, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.36804360852840545, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 1.268e-05, | |
| "loss": 0.2237, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.3695900102449114, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 1.264923076923077e-05, | |
| "loss": 0.3413, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.3711364119614173, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.261846153846154e-05, | |
| "loss": 0.2556, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.37268281367792316, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 1.258769230769231e-05, | |
| "loss": 0.3087, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.3742292153944291, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 1.2556923076923078e-05, | |
| "loss": 0.2609, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.375775617110935, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 1.2526153846153848e-05, | |
| "loss": 0.2572, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.3773220188274409, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 1.2495384615384615e-05, | |
| "loss": 0.3003, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.3788684205439468, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 1.2464615384615386e-05, | |
| "loss": 0.259, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.3804148222604527, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 1.2433846153846154e-05, | |
| "loss": 0.2606, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.38196122397695864, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 1.2403076923076925e-05, | |
| "loss": 0.2351, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.3835076256934645, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 1.2372307692307693e-05, | |
| "loss": 0.2664, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.3850540274099704, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 1.2341538461538462e-05, | |
| "loss": 0.245, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.38660042912647635, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 1.2310769230769233e-05, | |
| "loss": 0.2781, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.38814683084298224, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.2280000000000001e-05, | |
| "loss": 0.2847, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.3896932325594881, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 1.224923076923077e-05, | |
| "loss": 0.3223, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.39123963427599406, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 1.221846153846154e-05, | |
| "loss": 0.3068, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.39278603599249995, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 1.218769230769231e-05, | |
| "loss": 0.2393, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.39433243770900583, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 1.2156923076923077e-05, | |
| "loss": 0.2918, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.3958788394255118, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 1.2126153846153848e-05, | |
| "loss": 0.2146, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.39742524114201766, | |
| "grad_norm": 0.875, | |
| "learning_rate": 1.2095384615384616e-05, | |
| "loss": 0.3178, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.39897164285852355, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 1.2064615384615385e-05, | |
| "loss": 0.2247, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.4005180445750295, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 1.2033846153846154e-05, | |
| "loss": 0.2684, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.40206444629153537, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.2003076923076924e-05, | |
| "loss": 0.2332, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.4036108480080413, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 1.1972307692307695e-05, | |
| "loss": 0.4153, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.4051572497245472, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 1.1941538461538463e-05, | |
| "loss": 0.2559, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.4067036514410531, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 1.1910769230769232e-05, | |
| "loss": 0.2974, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.408250053157559, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 1.188e-05, | |
| "loss": 0.3523, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.4097964548740649, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 1.1849230769230771e-05, | |
| "loss": 0.2619, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.4113428565905708, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 1.1818461538461539e-05, | |
| "loss": 0.2833, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.41288925830707673, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 1.178769230769231e-05, | |
| "loss": 0.244, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.4144356600235826, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 1.1756923076923077e-05, | |
| "loss": 0.2238, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.4159820617400885, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 1.1726153846153847e-05, | |
| "loss": 0.2839, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.41752846345659445, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 1.1695384615384618e-05, | |
| "loss": 0.3264, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.41907486517310033, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 1.1664615384615386e-05, | |
| "loss": 0.2501, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.42062126688960627, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 1.1633846153846155e-05, | |
| "loss": 0.3618, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.42216766860611216, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.1603076923076924e-05, | |
| "loss": 0.2353, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.42371407032261804, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 1.1572307692307694e-05, | |
| "loss": 0.2745, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.425260472039124, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 1.1541538461538461e-05, | |
| "loss": 0.2673, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.42680687375562987, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 1.1510769230769232e-05, | |
| "loss": 0.2448, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.42835327547213575, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 1.148e-05, | |
| "loss": 0.2428, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.4298996771886417, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 1.144923076923077e-05, | |
| "loss": 0.2259, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.4314460789051476, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 1.141846153846154e-05, | |
| "loss": 0.3214, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.43299248062165346, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 1.1387692307692308e-05, | |
| "loss": 0.272, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.4345388823381594, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 1.135692307692308e-05, | |
| "loss": 0.3055, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.4360852840546653, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 1.1326153846153847e-05, | |
| "loss": 0.3783, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.43763168577117123, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 1.1295384615384617e-05, | |
| "loss": 0.2318, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.4391780874876771, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.1264615384615384e-05, | |
| "loss": 0.3072, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.440724489204183, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 1.1233846153846155e-05, | |
| "loss": 0.3272, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.44227089092068894, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.1203076923076923e-05, | |
| "loss": 0.2422, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.4438172926371948, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 1.1172307692307694e-05, | |
| "loss": 0.2424, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.4453636943537007, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 1.1141538461538462e-05, | |
| "loss": 0.288, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.44691009607020665, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 1.1110769230769231e-05, | |
| "loss": 0.2376, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.44845649778671254, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.1080000000000002e-05, | |
| "loss": 0.2541, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.4500028995032184, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 1.104923076923077e-05, | |
| "loss": 0.2928, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.45154930121972436, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 1.101846153846154e-05, | |
| "loss": 0.2582, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.45309570293623025, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 1.0987692307692309e-05, | |
| "loss": 0.2548, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.4546421046527362, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 1.0956923076923078e-05, | |
| "loss": 0.3462, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.4561885063692421, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.0926153846153846e-05, | |
| "loss": 0.3076, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.45773490808574796, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 1.0895384615384617e-05, | |
| "loss": 0.2761, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.4592813098022539, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 1.0864615384615385e-05, | |
| "loss": 0.3359, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.4608277115187598, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 1.0833846153846154e-05, | |
| "loss": 0.3213, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.46237411323526567, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 1.0803076923076925e-05, | |
| "loss": 0.2917, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.4639205149517716, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 1.0772307692307693e-05, | |
| "loss": 0.2774, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.4654669166682775, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 1.0741538461538464e-05, | |
| "loss": 0.3373, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.4670133183847834, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.0710769230769232e-05, | |
| "loss": 0.248, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.4685597201012893, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 1.0680000000000001e-05, | |
| "loss": 0.2782, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.4701061218177952, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.0649230769230769e-05, | |
| "loss": 0.3041, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.47165252353430115, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 1.061846153846154e-05, | |
| "loss": 0.2109, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.47319892525080703, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.0587692307692308e-05, | |
| "loss": 0.2815, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.4747453269673129, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 1.0556923076923079e-05, | |
| "loss": 0.2775, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.47629172868381886, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 1.0526153846153846e-05, | |
| "loss": 0.2645, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.47783813040032475, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 1.0495384615384616e-05, | |
| "loss": 0.2738, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.47938453211683063, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 1.0464615384615387e-05, | |
| "loss": 0.2912, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.48093093383333657, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.0433846153846155e-05, | |
| "loss": 0.217, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.48247733554984246, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 1.0403076923076924e-05, | |
| "loss": 0.3397, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.48402373726634834, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.0372307692307693e-05, | |
| "loss": 0.215, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.4855701389828543, | |
| "grad_norm": 1.125, | |
| "learning_rate": 1.0341538461538463e-05, | |
| "loss": 0.2587, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.48711654069936017, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 1.031076923076923e-05, | |
| "loss": 0.2183, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.4886629424158661, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 1.0280000000000002e-05, | |
| "loss": 0.2551, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.490209344132372, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 1.024923076923077e-05, | |
| "loss": 0.2389, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.4917557458488779, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 1.0218461538461539e-05, | |
| "loss": 0.2774, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.4933021475653838, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 1.018769230769231e-05, | |
| "loss": 0.2608, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.4948485492818897, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 1.0156923076923077e-05, | |
| "loss": 0.3287, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.4963949509983956, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.0126153846153849e-05, | |
| "loss": 0.236, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.49794135271490153, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.0095384615384616e-05, | |
| "loss": 0.259, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.4994877544314074, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 1.0064615384615386e-05, | |
| "loss": 0.2668, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.5010341561479134, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 1.0033846153846153e-05, | |
| "loss": 0.3078, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.5025805578644192, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 1.0003076923076924e-05, | |
| "loss": 0.2674, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.5041269595809251, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 9.972307692307694e-06, | |
| "loss": 0.276, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.505673361297431, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 9.941538461538463e-06, | |
| "loss": 0.2331, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.5072197630139369, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 9.910769230769231e-06, | |
| "loss": 0.2518, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.5087661647304429, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 9.88e-06, | |
| "loss": 0.3217, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.5103125664469488, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 9.84923076923077e-06, | |
| "loss": 0.2582, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.5118589681634547, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 9.818461538461539e-06, | |
| "loss": 0.2967, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.5134053698799605, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 9.787692307692308e-06, | |
| "loss": 0.2508, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.5149517715964664, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 9.756923076923078e-06, | |
| "loss": 0.247, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.5164981733129724, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 9.726153846153847e-06, | |
| "loss": 0.2664, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.5180445750294783, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 9.695384615384617e-06, | |
| "loss": 0.2963, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.5195909767459842, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 9.664615384615386e-06, | |
| "loss": 0.2869, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.5211373784624901, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 9.633846153846155e-06, | |
| "loss": 0.2125, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.522683780178996, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 9.603076923076923e-06, | |
| "loss": 0.2379, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.5242301818955019, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 9.572307692307693e-06, | |
| "loss": 0.2897, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.5257765836120079, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 9.541538461538462e-06, | |
| "loss": 0.2954, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.5273229853285137, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 9.510769230769231e-06, | |
| "loss": 0.257, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.5288693870450196, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 9.48e-06, | |
| "loss": 0.3107, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.5304157887615255, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 9.44923076923077e-06, | |
| "loss": 0.2178, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.5319621904780314, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 9.41846153846154e-06, | |
| "loss": 0.2407, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.5335085921945374, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 9.387692307692309e-06, | |
| "loss": 0.284, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.5350549939110433, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 9.356923076923078e-06, | |
| "loss": 0.2358, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.5366013956275492, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 9.326153846153848e-06, | |
| "loss": 0.2455, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.538147797344055, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 9.295384615384615e-06, | |
| "loss": 0.3416, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.5396941990605609, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 9.264615384615385e-06, | |
| "loss": 0.2908, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.5412406007770668, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 9.233846153846154e-06, | |
| "loss": 0.2648, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5427870024935728, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 9.203076923076924e-06, | |
| "loss": 0.2159, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.5443334042100787, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 9.172307692307693e-06, | |
| "loss": 0.3019, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.5458798059265846, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 9.141538461538462e-06, | |
| "loss": 0.2886, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.5474262076430905, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 9.110769230769232e-06, | |
| "loss": 0.2674, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.5489726093595964, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 9.080000000000001e-06, | |
| "loss": 0.2809, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.5505190110761023, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 9.04923076923077e-06, | |
| "loss": 0.3105, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.5520654127926082, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 9.01846153846154e-06, | |
| "loss": 0.3115, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.5536118145091141, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 8.987692307692308e-06, | |
| "loss": 0.2605, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.55515821622562, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 8.956923076923077e-06, | |
| "loss": 0.2281, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.5567046179421259, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 8.926153846153846e-06, | |
| "loss": 0.2732, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.5582510196586318, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 8.895384615384616e-06, | |
| "loss": 0.2134, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.5597974213751378, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 8.864615384615385e-06, | |
| "loss": 0.2788, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.5613438230916437, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 8.833846153846155e-06, | |
| "loss": 0.2558, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.5628902248081495, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 8.803076923076924e-06, | |
| "loss": 0.2719, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.5644366265246554, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 8.772307692307693e-06, | |
| "loss": 0.2596, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.5659830282411613, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 8.741538461538463e-06, | |
| "loss": 0.2484, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.5675294299576673, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 8.710769230769232e-06, | |
| "loss": 0.2734, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.5690758316741732, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 8.68e-06, | |
| "loss": 0.2728, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.5706222333906791, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 8.64923076923077e-06, | |
| "loss": 0.2746, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.572168635107185, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 8.618461538461539e-06, | |
| "loss": 0.2767, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.5737150368236908, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 8.587692307692308e-06, | |
| "loss": 0.2798, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.5752614385401967, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 8.556923076923077e-06, | |
| "loss": 0.2573, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.5768078402567027, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 8.526153846153847e-06, | |
| "loss": 0.2756, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.5783542419732086, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 8.495384615384616e-06, | |
| "loss": 0.2819, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.5799006436897145, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 8.464615384615386e-06, | |
| "loss": 0.22, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.5814470454062204, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 8.433846153846155e-06, | |
| "loss": 0.2857, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.5829934471227263, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 8.403076923076924e-06, | |
| "loss": 0.2803, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.5845398488392322, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 8.372307692307692e-06, | |
| "loss": 0.2207, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.5860862505557382, | |
| "grad_norm": 1.375, | |
| "learning_rate": 8.341538461538462e-06, | |
| "loss": 0.2684, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.587632652272244, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 8.310769230769231e-06, | |
| "loss": 0.2353, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.5891790539887499, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 8.28e-06, | |
| "loss": 0.3244, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.5907254557052558, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 8.24923076923077e-06, | |
| "loss": 0.288, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.5922718574217617, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 8.218461538461539e-06, | |
| "loss": 0.261, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.5938182591382677, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 8.187692307692309e-06, | |
| "loss": 0.3277, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.5953646608547736, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 8.156923076923078e-06, | |
| "loss": 0.2727, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.5969110625712795, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 8.126153846153847e-06, | |
| "loss": 0.2319, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.5984574642877853, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 8.095384615384617e-06, | |
| "loss": 0.252, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.6000038660042912, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 8.064615384615384e-06, | |
| "loss": 0.2683, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.6015502677207971, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 8.033846153846154e-06, | |
| "loss": 0.3251, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.6030966694373031, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 8.003076923076923e-06, | |
| "loss": 0.3153, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.604643071153809, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 7.972307692307693e-06, | |
| "loss": 0.3234, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.6061894728703149, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 7.941538461538462e-06, | |
| "loss": 0.2812, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.6077358745868208, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 7.910769230769231e-06, | |
| "loss": 0.2959, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.6092822763033267, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 7.88e-06, | |
| "loss": 0.2818, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.6108286780198326, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 7.84923076923077e-06, | |
| "loss": 0.3152, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.6123750797363385, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 7.81846153846154e-06, | |
| "loss": 0.2988, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.6139214814528444, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 7.787692307692309e-06, | |
| "loss": 0.2835, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.6154678831693503, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 7.756923076923077e-06, | |
| "loss": 0.3486, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.6170142848858562, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 7.726153846153846e-06, | |
| "loss": 0.2934, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.6185606866023621, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 7.695384615384615e-06, | |
| "loss": 0.2678, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6201070883188681, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 7.664615384615385e-06, | |
| "loss": 0.2608, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.621653490035374, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 7.633846153846154e-06, | |
| "loss": 0.289, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.6231998917518798, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 7.6030769230769245e-06, | |
| "loss": 0.2856, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.6247462934683857, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 7.572307692307693e-06, | |
| "loss": 0.2569, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.6262926951848916, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 7.5415384615384624e-06, | |
| "loss": 0.2727, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.6278390969013976, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 7.510769230769232e-06, | |
| "loss": 0.279, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.6293854986179035, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 7.48e-06, | |
| "loss": 0.3606, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.6309319003344094, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 7.44923076923077e-06, | |
| "loss": 0.2959, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.6324783020509153, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 7.418461538461539e-06, | |
| "loss": 0.2622, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.6340247037674211, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 7.387692307692308e-06, | |
| "loss": 0.2207, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.635571105483927, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 7.356923076923077e-06, | |
| "loss": 0.3007, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.637117507200433, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 7.326153846153847e-06, | |
| "loss": 0.2815, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.6386639089169389, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 7.295384615384617e-06, | |
| "loss": 0.2587, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.6402103106334448, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 7.264615384615385e-06, | |
| "loss": 0.2999, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.6417567123499507, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 7.233846153846155e-06, | |
| "loss": 0.2398, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.6433031140664566, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 7.203076923076924e-06, | |
| "loss": 0.2716, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.6448495157829626, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 7.172307692307693e-06, | |
| "loss": 0.2222, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.6463959174994685, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 7.141538461538462e-06, | |
| "loss": 0.285, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.6479423192159743, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 7.1107692307692314e-06, | |
| "loss": 0.3562, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.6494887209324802, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 7.08e-06, | |
| "loss": 0.283, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.6510351226489861, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 7.049230769230769e-06, | |
| "loss": 0.2915, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.652581524365492, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 7.01846153846154e-06, | |
| "loss": 0.2424, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.654127926081998, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 6.987692307692309e-06, | |
| "loss": 0.2456, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.6556743277985039, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 6.9569230769230776e-06, | |
| "loss": 0.2946, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.6572207295150098, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 6.926153846153847e-06, | |
| "loss": 0.3338, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.6587671312315156, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 6.895384615384616e-06, | |
| "loss": 0.2645, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.6603135329480215, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 6.864615384615385e-06, | |
| "loss": 0.2671, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.6618599346645275, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 6.833846153846154e-06, | |
| "loss": 0.2627, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.6634063363810334, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 6.803076923076924e-06, | |
| "loss": 0.2972, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.6649527380975393, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 6.772307692307692e-06, | |
| "loss": 0.2637, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.6664991398140452, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 6.741538461538462e-06, | |
| "loss": 0.2459, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.6680455415305511, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 6.710769230769232e-06, | |
| "loss": 0.3008, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.669591943247057, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 6.680000000000001e-06, | |
| "loss": 0.2881, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.671138344963563, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 6.64923076923077e-06, | |
| "loss": 0.2887, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.6726847466800688, | |
| "grad_norm": 0.875, | |
| "learning_rate": 6.618461538461539e-06, | |
| "loss": 0.3097, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.6742311483965747, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 6.587692307692309e-06, | |
| "loss": 0.2623, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.6757775501130806, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 6.556923076923077e-06, | |
| "loss": 0.2589, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.6773239518295865, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 6.5261538461538465e-06, | |
| "loss": 0.2149, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.6788703535460925, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 6.495384615384616e-06, | |
| "loss": 0.22, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.6804167552625984, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 6.4646153846153845e-06, | |
| "loss": 0.2636, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.6819631569791043, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 6.433846153846154e-06, | |
| "loss": 0.3088, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.6835095586956101, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 6.403076923076924e-06, | |
| "loss": 0.2378, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.685055960412116, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 6.3723076923076935e-06, | |
| "loss": 0.2405, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.6866023621286219, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 6.341538461538462e-06, | |
| "loss": 0.2825, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.6881487638451279, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 6.3107692307692315e-06, | |
| "loss": 0.3513, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.6896951655616338, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 6.280000000000001e-06, | |
| "loss": 0.3271, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.6912415672781397, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 6.249230769230769e-06, | |
| "loss": 0.2742, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.6927879689946456, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 6.218461538461539e-06, | |
| "loss": 0.2642, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.6943343707111514, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 6.187692307692308e-06, | |
| "loss": 0.2364, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.6958807724276574, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 6.156923076923077e-06, | |
| "loss": 0.2883, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.6974271741441633, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 6.126153846153846e-06, | |
| "loss": 0.2219, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.6989735758606692, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 6.095384615384616e-06, | |
| "loss": 0.2639, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.7005199775771751, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 6.064615384615386e-06, | |
| "loss": 0.2764, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.702066379293681, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 6.033846153846154e-06, | |
| "loss": 0.3097, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.7036127810101869, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 6.003076923076924e-06, | |
| "loss": 0.2262, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.7051591827266929, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 5.972307692307693e-06, | |
| "loss": 0.2537, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.7067055844431988, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 5.941538461538462e-06, | |
| "loss": 0.347, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.7082519861597046, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 5.910769230769231e-06, | |
| "loss": 0.3193, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.7097983878762105, | |
| "grad_norm": 0.75, | |
| "learning_rate": 5.8800000000000005e-06, | |
| "loss": 0.273, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.7113447895927164, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 5.849230769230769e-06, | |
| "loss": 0.2902, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.7128911913092224, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 5.818461538461538e-06, | |
| "loss": 0.3653, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.7144375930257283, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 5.787692307692309e-06, | |
| "loss": 0.3106, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.7159839947422342, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 5.756923076923078e-06, | |
| "loss": 0.2368, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.7175303964587401, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 5.726153846153847e-06, | |
| "loss": 0.249, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.719076798175246, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 5.695384615384616e-06, | |
| "loss": 0.3709, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.7206231998917518, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 5.664615384615385e-06, | |
| "loss": 0.2921, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.7221696016082578, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 5.633846153846154e-06, | |
| "loss": 0.3115, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.7237160033247637, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 5.603076923076923e-06, | |
| "loss": 0.2479, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.7252624050412696, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 5.572307692307693e-06, | |
| "loss": 0.2297, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.7268088067577755, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 5.541538461538461e-06, | |
| "loss": 0.2454, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.7283552084742814, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 5.5107692307692315e-06, | |
| "loss": 0.2849, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.7299016101907874, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 5.480000000000001e-06, | |
| "loss": 0.2797, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.7314480119072932, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 5.44923076923077e-06, | |
| "loss": 0.3882, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.7329944136237991, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 5.418461538461539e-06, | |
| "loss": 0.2509, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.734540815340305, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 5.387692307692308e-06, | |
| "loss": 0.2408, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.7360872170568109, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 5.356923076923078e-06, | |
| "loss": 0.2413, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.7376336187733168, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 5.326153846153846e-06, | |
| "loss": 0.2432, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.7391800204898228, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 5.2953846153846156e-06, | |
| "loss": 0.277, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.7407264222063287, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 5.264615384615385e-06, | |
| "loss": 0.2486, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.7422728239228346, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 5.2338461538461535e-06, | |
| "loss": 0.3, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.7438192256393404, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 5.203076923076924e-06, | |
| "loss": 0.26, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.7453656273558463, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 5.172307692307693e-06, | |
| "loss": 0.2937, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.7469120290723523, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 5.1415384615384625e-06, | |
| "loss": 0.3057, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.7484584307888582, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 5.110769230769231e-06, | |
| "loss": 0.3284, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.7500048325053641, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 5.0800000000000005e-06, | |
| "loss": 0.2434, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.75155123422187, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 5.04923076923077e-06, | |
| "loss": 0.257, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.7530976359383759, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 5.0184615384615384e-06, | |
| "loss": 0.2714, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.7546440376548817, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 4.987692307692308e-06, | |
| "loss": 0.2182, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.7561904393713877, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 4.956923076923077e-06, | |
| "loss": 0.2855, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.7577368410878936, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 4.926153846153847e-06, | |
| "loss": 0.2774, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.7592832428043995, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 4.895384615384616e-06, | |
| "loss": 0.2489, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.7608296445209054, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 4.8646153846153846e-06, | |
| "loss": 0.3157, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.7623760462374113, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 4.833846153846154e-06, | |
| "loss": 0.2704, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.7639224479539173, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 4.803076923076923e-06, | |
| "loss": 0.2995, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.7654688496704232, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 4.772307692307693e-06, | |
| "loss": 0.2422, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.767015251386929, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 4.741538461538462e-06, | |
| "loss": 0.2692, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.7685616531034349, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 4.710769230769231e-06, | |
| "loss": 0.2704, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.7701080548199408, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 4.680000000000001e-06, | |
| "loss": 0.3147, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.7716544565364467, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 4.6492307692307695e-06, | |
| "loss": 0.2867, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.7732008582529527, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 4.618461538461539e-06, | |
| "loss": 0.2896, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.7747472599694586, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 4.587692307692308e-06, | |
| "loss": 0.2335, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.7762936616859645, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 4.556923076923077e-06, | |
| "loss": 0.2441, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.7778400634024704, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 4.526153846153847e-06, | |
| "loss": 0.3049, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 0.7793864651189762, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 4.495384615384616e-06, | |
| "loss": 0.2605, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.7809328668354822, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 4.464615384615385e-06, | |
| "loss": 0.2876, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.7824792685519881, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 4.433846153846154e-06, | |
| "loss": 0.3434, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.784025670268494, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 4.403076923076923e-06, | |
| "loss": 0.2956, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 0.7855720719849999, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 4.372307692307693e-06, | |
| "loss": 0.3175, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.7871184737015058, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 4.341538461538462e-06, | |
| "loss": 0.2914, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 0.7886648754180117, | |
| "grad_norm": 1.125, | |
| "learning_rate": 4.310769230769231e-06, | |
| "loss": 0.2657, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.7902112771345177, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 4.2800000000000005e-06, | |
| "loss": 0.3168, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 0.7917576788510235, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 4.249230769230769e-06, | |
| "loss": 0.2422, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.7933040805675294, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 4.218461538461539e-06, | |
| "loss": 0.2651, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 0.7948504822840353, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 4.187692307692308e-06, | |
| "loss": 0.245, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.7963968840005412, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 4.156923076923077e-06, | |
| "loss": 0.3055, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.7979432857170471, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 4.126153846153847e-06, | |
| "loss": 0.2992, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.7994896874335531, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 4.095384615384615e-06, | |
| "loss": 0.3123, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 0.801036089150059, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 4.0646153846153854e-06, | |
| "loss": 0.2849, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.8025824908665649, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 4.033846153846154e-06, | |
| "loss": 0.317, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 0.8041288925830707, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 4.003076923076923e-06, | |
| "loss": 0.2567, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.8056752942995766, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 3.972307692307693e-06, | |
| "loss": 0.2918, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 0.8072216960160826, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 3.941538461538461e-06, | |
| "loss": 0.3973, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.8087680977325885, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 3.9107692307692316e-06, | |
| "loss": 0.3034, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 0.8103144994490944, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 3.88e-06, | |
| "loss": 0.2369, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.8118609011656003, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 3.8492307692307695e-06, | |
| "loss": 0.261, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.8134073028821062, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 3.818461538461539e-06, | |
| "loss": 0.2657, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.814953704598612, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 3.787692307692308e-06, | |
| "loss": 0.2336, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 0.816500106315118, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 3.7569230769230773e-06, | |
| "loss": 0.2683, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.8180465080316239, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 3.7261538461538467e-06, | |
| "loss": 0.2703, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 0.8195929097481298, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 3.6953846153846156e-06, | |
| "loss": 0.2907, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.8211393114646357, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 3.6646153846153846e-06, | |
| "loss": 0.3177, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 0.8226857131811416, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 3.633846153846154e-06, | |
| "loss": 0.3023, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.8242321148976476, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 3.6030769230769234e-06, | |
| "loss": 0.2169, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 0.8257785166141535, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 3.572307692307693e-06, | |
| "loss": 0.24, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.8273249183306594, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 3.5415384615384618e-06, | |
| "loss": 0.3394, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.8288713200471652, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 3.5107692307692307e-06, | |
| "loss": 0.2527, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.8304177217636711, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 3.48e-06, | |
| "loss": 0.2447, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 0.831964123480177, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 3.4492307692307695e-06, | |
| "loss": 0.2509, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.833510525196683, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 3.418461538461539e-06, | |
| "loss": 0.3633, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 0.8350569269131889, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 3.387692307692308e-06, | |
| "loss": 0.3206, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.8366033286296948, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 3.356923076923077e-06, | |
| "loss": 0.3542, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 0.8381497303462007, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 3.3261538461538463e-06, | |
| "loss": 0.2731, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.8396961320627065, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 3.2953846153846157e-06, | |
| "loss": 0.3256, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 0.8412425337792125, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 3.264615384615385e-06, | |
| "loss": 0.2471, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.8427889354957184, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 3.233846153846154e-06, | |
| "loss": 0.2755, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.8443353372122243, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 3.203076923076923e-06, | |
| "loss": 0.3139, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.8458817389287302, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 3.1723076923076924e-06, | |
| "loss": 0.2722, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 0.8474281406452361, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 3.141538461538462e-06, | |
| "loss": 0.3058, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.848974542361742, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 3.110769230769231e-06, | |
| "loss": 0.2424, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 0.850520944078248, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 3.08e-06, | |
| "loss": 0.2752, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.8520673457947538, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 3.049230769230769e-06, | |
| "loss": 0.2309, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 0.8536137475112597, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 3.0184615384615385e-06, | |
| "loss": 0.33, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 0.8551601492277656, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 2.987692307692308e-06, | |
| "loss": 0.2942, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 0.8567065509442715, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 2.9569230769230773e-06, | |
| "loss": 0.3103, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 0.8582529526607775, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 2.9261538461538463e-06, | |
| "loss": 0.2775, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.8597993543772834, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 2.8953846153846153e-06, | |
| "loss": 0.2941, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 0.8613457560937893, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 2.8646153846153847e-06, | |
| "loss": 0.2591, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 0.8628921578102952, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 2.833846153846154e-06, | |
| "loss": 0.2801, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 0.864438559526801, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 2.8030769230769234e-06, | |
| "loss": 0.3041, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 0.8659849612433069, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 2.7723076923076924e-06, | |
| "loss": 0.2866, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.8675313629598129, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 2.7415384615384614e-06, | |
| "loss": 0.3128, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 0.8690777646763188, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 2.710769230769231e-06, | |
| "loss": 0.3121, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 0.8706241663928247, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 2.68e-06, | |
| "loss": 0.2264, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 0.8721705681093306, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 2.6492307692307696e-06, | |
| "loss": 0.2619, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 0.8737169698258365, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 2.6184615384615385e-06, | |
| "loss": 0.2631, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.8752633715423425, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 2.587692307692308e-06, | |
| "loss": 0.2636, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 0.8768097732588483, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 2.5569230769230773e-06, | |
| "loss": 0.3569, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 0.8783561749753542, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 2.5261538461538463e-06, | |
| "loss": 0.2297, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 0.8799025766918601, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 2.4953846153846157e-06, | |
| "loss": 0.2181, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 0.881448978408366, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 2.4646153846153847e-06, | |
| "loss": 0.3117, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.8829953801248719, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 2.433846153846154e-06, | |
| "loss": 0.3071, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 0.8845417818413779, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 2.4030769230769235e-06, | |
| "loss": 0.2599, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 0.8860881835578838, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 2.3723076923076924e-06, | |
| "loss": 0.265, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 0.8876345852743897, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 2.341538461538462e-06, | |
| "loss": 0.2922, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 0.8891809869908955, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 2.310769230769231e-06, | |
| "loss": 0.3616, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.8907273887074014, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 2.28e-06, | |
| "loss": 0.2587, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 0.8922737904239074, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 2.2492307692307696e-06, | |
| "loss": 0.335, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 0.8938201921404133, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 2.218461538461539e-06, | |
| "loss": 0.288, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 0.8953665938569192, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 2.187692307692308e-06, | |
| "loss": 0.2932, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 0.8969129955734251, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 2.156923076923077e-06, | |
| "loss": 0.282, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.898459397289931, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 2.1261538461538463e-06, | |
| "loss": 0.2073, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 0.9000057990064368, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 2.0953846153846157e-06, | |
| "loss": 0.2583, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 0.9015522007229428, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 2.064615384615385e-06, | |
| "loss": 0.2805, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 0.9030986024394487, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 2.033846153846154e-06, | |
| "loss": 0.2416, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 0.9046450041559546, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 2.003076923076923e-06, | |
| "loss": 0.2826, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.9061914058724605, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 1.9723076923076924e-06, | |
| "loss": 0.3072, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 0.9077378075889664, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 1.941538461538462e-06, | |
| "loss": 0.3551, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 0.9092842093054724, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 1.9107692307692312e-06, | |
| "loss": 0.3224, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 0.9108306110219783, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 1.8800000000000002e-06, | |
| "loss": 0.2501, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 0.9123770127384841, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 1.8492307692307692e-06, | |
| "loss": 0.2555, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.91392341445499, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 1.8184615384615386e-06, | |
| "loss": 0.329, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 0.9154698161714959, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 1.7876923076923078e-06, | |
| "loss": 0.3193, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 0.9170162178880018, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 1.7569230769230772e-06, | |
| "loss": 0.3162, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 0.9185626196045078, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 1.7261538461538463e-06, | |
| "loss": 0.29, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 0.9201090213210137, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 1.6953846153846153e-06, | |
| "loss": 0.3122, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.9216554230375196, | |
| "grad_norm": 0.75, | |
| "learning_rate": 1.6646153846153847e-06, | |
| "loss": 0.2374, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 0.9232018247540255, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.6338461538461539e-06, | |
| "loss": 0.2562, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 0.9247482264705313, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.6030769230769233e-06, | |
| "loss": 0.2854, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 0.9262946281870373, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 1.5723076923076925e-06, | |
| "loss": 0.3549, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 0.9278410299035432, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 1.5415384615384614e-06, | |
| "loss": 0.4152, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.9293874316200491, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 1.5107692307692308e-06, | |
| "loss": 0.2626, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 0.930933833336555, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 1.48e-06, | |
| "loss": 0.3302, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 0.9324802350530609, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.4492307692307694e-06, | |
| "loss": 0.3063, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 0.9340266367695668, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 1.4184615384615386e-06, | |
| "loss": 0.2758, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 0.9355730384860728, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.3876923076923076e-06, | |
| "loss": 0.2999, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.9371194402025786, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.356923076923077e-06, | |
| "loss": 0.3438, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 0.9386658419190845, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 1.3261538461538461e-06, | |
| "loss": 0.2368, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 0.9402122436355904, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.2953846153846155e-06, | |
| "loss": 0.2524, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 0.9417586453520963, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 1.2646153846153847e-06, | |
| "loss": 0.2264, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 0.9433050470686023, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 1.233846153846154e-06, | |
| "loss": 0.2067, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.9448514487851082, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 1.2030769230769233e-06, | |
| "loss": 0.231, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 0.9463978505016141, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 1.1723076923076925e-06, | |
| "loss": 0.2731, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 0.94794425221812, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 1.1415384615384617e-06, | |
| "loss": 0.2837, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 0.9494906539346258, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 1.1107692307692309e-06, | |
| "loss": 0.2272, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 0.9510370556511317, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 1.08e-06, | |
| "loss": 0.3152, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.9525834573676377, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 1.0492307692307694e-06, | |
| "loss": 0.3017, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 0.9541298590841436, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 1.0184615384615386e-06, | |
| "loss": 0.3378, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 0.9556762608006495, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 9.876923076923078e-07, | |
| "loss": 0.2503, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 0.9572226625171554, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 9.56923076923077e-07, | |
| "loss": 0.3438, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 0.9587690642336613, | |
| "grad_norm": 0.875, | |
| "learning_rate": 9.261538461538462e-07, | |
| "loss": 0.2667, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.9603154659501673, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 8.953846153846155e-07, | |
| "loss": 0.2745, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 0.9618618676666731, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 8.646153846153847e-07, | |
| "loss": 0.2683, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 0.963408269383179, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 8.338461538461539e-07, | |
| "loss": 0.2422, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 0.9649546710996849, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 8.030769230769231e-07, | |
| "loss": 0.2588, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 0.9665010728161908, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 7.723076923076923e-07, | |
| "loss": 0.2812, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.9680474745326967, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 7.415384615384616e-07, | |
| "loss": 0.3232, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 0.9695938762492027, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 7.107692307692309e-07, | |
| "loss": 0.2422, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 0.9711402779657086, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.2833, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 0.9726866796822145, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 6.492307692307692e-07, | |
| "loss": 0.2517, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 0.9742330813987203, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 6.184615384615385e-07, | |
| "loss": 0.2534, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.9757794831152262, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 5.876923076923077e-07, | |
| "loss": 0.2911, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 0.9773258848317322, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 5.56923076923077e-07, | |
| "loss": 0.2628, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 0.9788722865482381, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 5.261538461538462e-07, | |
| "loss": 0.3191, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 0.980418688264744, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 4.953846153846155e-07, | |
| "loss": 0.3225, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 0.9819650899812499, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 4.6461538461538465e-07, | |
| "loss": 0.2836, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.9835114916977558, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 4.3384615384615384e-07, | |
| "loss": 0.303, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 0.9850578934142616, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 4.0307692307692313e-07, | |
| "loss": 0.2381, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 0.9866042951307676, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 3.7230769230769236e-07, | |
| "loss": 0.2821, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 0.9881506968472735, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 3.4153846153846155e-07, | |
| "loss": 0.3187, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 0.9896970985637794, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 3.107692307692308e-07, | |
| "loss": 0.2993, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.9912435002802853, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.2923, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 0.9927899019967912, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 2.4923076923076926e-07, | |
| "loss": 0.2407, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 0.9943363037132972, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 2.1846153846153847e-07, | |
| "loss": 0.257, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 0.9958827054298031, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 1.8769230769230773e-07, | |
| "loss": 0.2531, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 0.997429107146309, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 1.5692307692307694e-07, | |
| "loss": 0.273, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.9989755088628148, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.2615384615384617e-07, | |
| "loss": 0.313, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 1.0004639205149517, | |
| "grad_norm": 0.75, | |
| "learning_rate": 9.53846153846154e-08, | |
| "loss": 0.2218, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 1.0020103222314576, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 6.461538461538462e-08, | |
| "loss": 0.3402, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 1.0035567239479637, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 3.384615384615385e-08, | |
| "loss": 0.2327, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 1.0051031256644696, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 3.0769230769230774e-09, | |
| "loss": 0.2185, | |
| "step": 6500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 6500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.4935783864782275e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |