| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.4745, |
| "eval_steps": 500, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005, |
| "grad_norm": 1824.0, |
| "learning_rate": 4.5000000000000003e-07, |
| "loss": 13.3646, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 1160.0, |
| "learning_rate": 9.500000000000001e-07, |
| "loss": 11.1271, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 174.0, |
| "learning_rate": 1.45e-06, |
| "loss": 4.3183, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 15.875, |
| "learning_rate": 1.9500000000000004e-06, |
| "loss": 0.3922, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 227.0, |
| "learning_rate": 2.4500000000000003e-06, |
| "loss": 0.3421, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 2.734375, |
| "learning_rate": 2.95e-06, |
| "loss": 0.3052, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 2.140625, |
| "learning_rate": 3.45e-06, |
| "loss": 0.2949, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 2.1875, |
| "learning_rate": 3.95e-06, |
| "loss": 0.2687, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 4.9375, |
| "learning_rate": 4.450000000000001e-06, |
| "loss": 0.2949, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.95e-06, |
| "loss": 0.2766, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 1.03125, |
| "learning_rate": 5.450000000000001e-06, |
| "loss": 0.2721, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 1.046875, |
| "learning_rate": 5.950000000000001e-06, |
| "loss": 0.266, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 3.421875, |
| "learning_rate": 6.450000000000001e-06, |
| "loss": 0.2815, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 1.078125, |
| "learning_rate": 6.95e-06, |
| "loss": 0.2694, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 0.9140625, |
| "learning_rate": 7.450000000000001e-06, |
| "loss": 0.256, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 3.0, |
| "learning_rate": 7.950000000000002e-06, |
| "loss": 0.2588, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 0.92578125, |
| "learning_rate": 8.45e-06, |
| "loss": 0.2729, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.890625, |
| "learning_rate": 8.95e-06, |
| "loss": 0.2643, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 1.0703125, |
| "learning_rate": 9.450000000000001e-06, |
| "loss": 0.2596, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.9296875, |
| "learning_rate": 9.950000000000001e-06, |
| "loss": 0.2518, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 0.8125, |
| "learning_rate": 9.999383162408303e-06, |
| "loss": 0.2365, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.8984375, |
| "learning_rate": 9.997251079708788e-06, |
| "loss": 0.2479, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 0.84765625, |
| "learning_rate": 9.993596785920932e-06, |
| "loss": 0.2473, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.76953125, |
| "learning_rate": 9.988421394178027e-06, |
| "loss": 0.2337, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 0.7734375, |
| "learning_rate": 9.981726480954532e-06, |
| "loss": 0.2287, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.81640625, |
| "learning_rate": 9.973514085585871e-06, |
| "loss": 0.2418, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 0.80078125, |
| "learning_rate": 9.963786709647228e-06, |
| "loss": 0.2229, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.80078125, |
| "learning_rate": 9.952547316191545e-06, |
| "loss": 0.2329, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 0.8046875, |
| "learning_rate": 9.939799328846947e-06, |
| "loss": 0.2317, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.78515625, |
| "learning_rate": 9.92554663077387e-06, |
| "loss": 0.2486, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 0.82421875, |
| "learning_rate": 9.90979356348222e-06, |
| "loss": 0.231, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 1.1171875, |
| "learning_rate": 9.892544925508894e-06, |
| "loss": 0.2297, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 0.74609375, |
| "learning_rate": 9.87380597095611e-06, |
| "loss": 0.2325, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.7421875, |
| "learning_rate": 9.853582407890954e-06, |
| "loss": 0.2352, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 0.79296875, |
| "learning_rate": 9.831880396606649e-06, |
| "loss": 0.2271, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.76953125, |
| "learning_rate": 9.808706547746057e-06, |
| "loss": 0.2403, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 0.83203125, |
| "learning_rate": 9.78406792028804e-06, |
| "loss": 0.2321, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.73828125, |
| "learning_rate": 9.757972019397192e-06, |
| "loss": 0.2254, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 10.125, |
| "learning_rate": 9.730426794137727e-06, |
| "loss": 0.2302, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 2.328125, |
| "learning_rate": 9.701440635052094e-06, |
| "loss": 1.9073, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.205, |
| "grad_norm": 0.72265625, |
| "learning_rate": 9.671022371605148e-06, |
| "loss": 0.2439, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.7890625, |
| "learning_rate": 9.6391812694946e-06, |
| "loss": 0.2406, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.215, |
| "grad_norm": 0.73046875, |
| "learning_rate": 9.605927027828608e-06, |
| "loss": 0.2206, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.80859375, |
| "learning_rate": 9.571269776171319e-06, |
| "loss": 0.2318, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 0.74609375, |
| "learning_rate": 9.535220071457325e-06, |
| "loss": 0.2332, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.796875, |
| "learning_rate": 9.497788894775903e-06, |
| "loss": 0.2328, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.235, |
| "grad_norm": 0.80078125, |
| "learning_rate": 9.458987648026071e-06, |
| "loss": 0.2221, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.66796875, |
| "learning_rate": 9.418828150443469e-06, |
| "loss": 0.2227, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.245, |
| "grad_norm": 0.77734375, |
| "learning_rate": 9.37732263500009e-06, |
| "loss": 0.2345, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.69921875, |
| "learning_rate": 9.334483744678015e-06, |
| "loss": 0.2288, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.255, |
| "grad_norm": 0.765625, |
| "learning_rate": 9.290324528618225e-06, |
| "loss": 0.2377, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.6796875, |
| "learning_rate": 9.244858438145709e-06, |
| "loss": 0.2341, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.265, |
| "grad_norm": 0.7109375, |
| "learning_rate": 9.198099322672066e-06, |
| "loss": 0.219, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.671875, |
| "learning_rate": 9.150061425476839e-06, |
| "loss": 0.2245, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.275, |
| "grad_norm": 0.7109375, |
| "learning_rate": 9.100759379368863e-06, |
| "loss": 0.2417, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.75390625, |
| "learning_rate": 9.050208202228981e-06, |
| "loss": 0.2386, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.285, |
| "grad_norm": 0.78515625, |
| "learning_rate": 8.998423292435455e-06, |
| "loss": 0.2378, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.73828125, |
| "learning_rate": 8.945420424173455e-06, |
| "loss": 0.2359, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.295, |
| "grad_norm": 0.6796875, |
| "learning_rate": 8.891215742630106e-06, |
| "loss": 0.2385, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.6484375, |
| "learning_rate": 8.8358257590765e-06, |
| "loss": 0.2183, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.305, |
| "grad_norm": 0.69921875, |
| "learning_rate": 8.779267345838198e-06, |
| "loss": 0.2364, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.6640625, |
| "learning_rate": 8.72155773115577e-06, |
| "loss": 0.222, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.315, |
| "grad_norm": 0.71484375, |
| "learning_rate": 8.662714493936895e-06, |
| "loss": 0.2281, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.63671875, |
| "learning_rate": 8.602755558401653e-06, |
| "loss": 0.2302, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.325, |
| "grad_norm": 0.6875, |
| "learning_rate": 8.541699188622645e-06, |
| "loss": 0.2299, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.81640625, |
| "learning_rate": 8.479563982961572e-06, |
| "loss": 0.2361, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.335, |
| "grad_norm": 0.8203125, |
| "learning_rate": 8.416368868403997e-06, |
| "loss": 0.2353, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.703125, |
| "learning_rate": 8.352133094793996e-06, |
| "loss": 0.2159, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.345, |
| "grad_norm": 0.6796875, |
| "learning_rate": 8.28687622897048e-06, |
| "loss": 0.2301, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.75, |
| "learning_rate": 8.220618148806934e-06, |
| "loss": 0.2371, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.355, |
| "grad_norm": 0.71875, |
| "learning_rate": 8.153379037156433e-06, |
| "loss": 0.2314, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.66796875, |
| "learning_rate": 8.085179375703745e-06, |
| "loss": 0.2285, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.365, |
| "grad_norm": 0.75, |
| "learning_rate": 8.016039938726413e-06, |
| "loss": 0.2328, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.6875, |
| "learning_rate": 7.945981786766712e-06, |
| "loss": 0.2271, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.375, |
| "grad_norm": 0.69140625, |
| "learning_rate": 7.875026260216395e-06, |
| "loss": 0.2296, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.734375, |
| "learning_rate": 7.80319497281621e-06, |
| "loss": 0.2341, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.385, |
| "grad_norm": 0.6796875, |
| "learning_rate": 7.730509805072146e-06, |
| "loss": 0.2262, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.71875, |
| "learning_rate": 7.656992897590416e-06, |
| "loss": 0.2269, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.395, |
| "grad_norm": 0.72265625, |
| "learning_rate": 7.58266664433321e-06, |
| "loss": 0.231, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.74609375, |
| "learning_rate": 7.507553685797288e-06, |
| "loss": 0.2244, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.405, |
| "grad_norm": 0.65625, |
| "learning_rate": 7.431676902117453e-06, |
| "loss": 0.2076, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.69140625, |
| "learning_rate": 7.35505940609705e-06, |
| "loss": 0.2256, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.415, |
| "grad_norm": 0.6484375, |
| "learning_rate": 7.2777245361675786e-06, |
| "loss": 0.2251, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.71875, |
| "learning_rate": 7.199695849279576e-06, |
| "loss": 0.2266, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.425, |
| "grad_norm": 0.7109375, |
| "learning_rate": 7.120997113726951e-06, |
| "loss": 0.2357, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.68359375, |
| "learning_rate": 7.041652301906925e-06, |
| "loss": 0.2244, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.435, |
| "grad_norm": 0.6328125, |
| "learning_rate": 6.961685583017808e-06, |
| "loss": 0.2394, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.7421875, |
| "learning_rate": 6.881121315696828e-06, |
| "loss": 0.2304, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.445, |
| "grad_norm": 0.69921875, |
| "learning_rate": 6.799984040600257e-06, |
| "loss": 0.2314, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.703125, |
| "learning_rate": 6.718298472928082e-06, |
| "loss": 0.2287, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.455, |
| "grad_norm": 0.66796875, |
| "learning_rate": 6.63608949489552e-06, |
| "loss": 0.2288, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.72265625, |
| "learning_rate": 6.55338214815366e-06, |
| "loss": 0.2265, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.465, |
| "grad_norm": 0.68359375, |
| "learning_rate": 6.47020162616152e-06, |
| "loss": 0.226, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.67578125, |
| "learning_rate": 6.386573266511891e-06, |
| "loss": 0.2325, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.475, |
| "grad_norm": 0.69921875, |
| "learning_rate": 6.3025225432132434e-06, |
| "loss": 0.2264, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.59765625, |
| "learning_rate": 6.218075058930113e-06, |
| "loss": 0.2184, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.485, |
| "grad_norm": 0.67578125, |
| "learning_rate": 6.133256537184276e-06, |
| "loss": 0.2268, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.6640625, |
| "learning_rate": 6.048092814519109e-06, |
| "loss": 0.2109, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.495, |
| "grad_norm": 0.6875, |
| "learning_rate": 5.962609832629538e-06, |
| "loss": 0.2188, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.69921875, |
| "learning_rate": 5.876833630459936e-06, |
| "loss": 0.223, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.505, |
| "grad_norm": 0.625, |
| "learning_rate": 5.7907903362724195e-06, |
| "loss": 0.2131, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.71875, |
| "learning_rate": 5.704506159687914e-06, |
| "loss": 0.2226, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.515, |
| "grad_norm": 0.75, |
| "learning_rate": 5.618007383702464e-06, |
| "loss": 0.2326, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.7265625, |
| "learning_rate": 5.5313203566811666e-06, |
| "loss": 0.2248, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.525, |
| "grad_norm": 0.734375, |
| "learning_rate": 5.4444714843322085e-06, |
| "loss": 0.2225, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.0045, |
| "grad_norm": 0.71875, |
| "learning_rate": 5.35748722166343e-06, |
| "loss": 0.2348, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.0095, |
| "grad_norm": 0.64453125, |
| "learning_rate": 5.270394064923878e-06, |
| "loss": 0.2212, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.0145, |
| "grad_norm": 0.71875, |
| "learning_rate": 5.183218543532782e-06, |
| "loss": 0.2157, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.0195, |
| "grad_norm": 0.6953125, |
| "learning_rate": 5.09598721199845e-06, |
| "loss": 0.218, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.0245, |
| "grad_norm": 0.66015625, |
| "learning_rate": 5.008726641829492e-06, |
| "loss": 0.2199, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.0295, |
| "grad_norm": 0.66015625, |
| "learning_rate": 4.921463413440898e-06, |
| "loss": 0.2228, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.0345, |
| "grad_norm": 0.64453125, |
| "learning_rate": 4.8342241080573696e-06, |
| "loss": 0.2108, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.0395, |
| "grad_norm": 0.64453125, |
| "learning_rate": 4.747035299616434e-06, |
| "loss": 0.1961, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.0445, |
| "grad_norm": 0.71875, |
| "learning_rate": 4.659923546673761e-06, |
| "loss": 0.2114, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.0495, |
| "grad_norm": 0.6328125, |
| "learning_rate": 4.572915384313163e-06, |
| "loss": 0.1996, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.0545, |
| "grad_norm": 0.625, |
| "learning_rate": 4.4860373160637665e-06, |
| "loss": 0.1923, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.0594999999999999, |
| "grad_norm": 0.64453125, |
| "learning_rate": 4.399315805826765e-06, |
| "loss": 0.1881, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.0645, |
| "grad_norm": 0.66015625, |
| "learning_rate": 4.312777269814268e-06, |
| "loss": 0.1996, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.0695000000000001, |
| "grad_norm": 0.703125, |
| "learning_rate": 4.226448068502661e-06, |
| "loss": 0.1856, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.0745, |
| "grad_norm": 0.68359375, |
| "learning_rate": 4.140354498602952e-06, |
| "loss": 0.1757, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.0795, |
| "grad_norm": 0.68359375, |
| "learning_rate": 4.054522785050543e-06, |
| "loss": 0.1643, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.0845, |
| "grad_norm": 0.73046875, |
| "learning_rate": 3.968979073016853e-06, |
| "loss": 0.1871, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.0895, |
| "grad_norm": 0.86328125, |
| "learning_rate": 3.883749419945244e-06, |
| "loss": 0.1667, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.0945, |
| "grad_norm": 0.94921875, |
| "learning_rate": 3.798859787613682e-06, |
| "loss": 0.1633, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.0995, |
| "grad_norm": 0.73828125, |
| "learning_rate": 3.7143360342265206e-06, |
| "loss": 0.156, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.1045, |
| "grad_norm": 0.71875, |
| "learning_rate": 3.630203906537838e-06, |
| "loss": 0.1473, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.1095, |
| "grad_norm": 0.6953125, |
| "learning_rate": 3.5464890320087374e-06, |
| "loss": 0.1561, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.1145, |
| "grad_norm": 0.80078125, |
| "learning_rate": 3.463216911000965e-06, |
| "loss": 0.1595, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.1195, |
| "grad_norm": 0.7734375, |
| "learning_rate": 3.3804129090092542e-06, |
| "loss": 0.1487, |
| "step": 1290 |
| }, |
| { |
| "epoch": 1.1245, |
| "grad_norm": 0.80859375, |
| "learning_rate": 3.2981022489347503e-06, |
| "loss": 0.1442, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.1295, |
| "grad_norm": 0.7265625, |
| "learning_rate": 3.2163100034018735e-06, |
| "loss": 0.1573, |
| "step": 1310 |
| }, |
| { |
| "epoch": 1.1345, |
| "grad_norm": 0.6796875, |
| "learning_rate": 3.1350610871209553e-06, |
| "loss": 0.1463, |
| "step": 1320 |
| }, |
| { |
| "epoch": 1.1395, |
| "grad_norm": 0.6875, |
| "learning_rate": 3.0543802492989693e-06, |
| "loss": 0.1477, |
| "step": 1330 |
| }, |
| { |
| "epoch": 1.1445, |
| "grad_norm": 0.71875, |
| "learning_rate": 2.974292066100688e-06, |
| "loss": 0.1541, |
| "step": 1340 |
| }, |
| { |
| "epoch": 1.1495, |
| "grad_norm": 0.81640625, |
| "learning_rate": 2.8948209331625454e-06, |
| "loss": 0.1616, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.1545, |
| "grad_norm": 0.70703125, |
| "learning_rate": 2.8159910581614904e-06, |
| "loss": 0.15, |
| "step": 1360 |
| }, |
| { |
| "epoch": 1.1595, |
| "grad_norm": 0.734375, |
| "learning_rate": 2.7378264534410865e-06, |
| "loss": 0.1537, |
| "step": 1370 |
| }, |
| { |
| "epoch": 1.1645, |
| "grad_norm": 0.765625, |
| "learning_rate": 2.6603509286971342e-06, |
| "loss": 0.1505, |
| "step": 1380 |
| }, |
| { |
| "epoch": 1.1695, |
| "grad_norm": 0.7421875, |
| "learning_rate": 2.5835880837249884e-06, |
| "loss": 0.1534, |
| "step": 1390 |
| }, |
| { |
| "epoch": 1.1745, |
| "grad_norm": 0.7421875, |
| "learning_rate": 2.507561301230849e-06, |
| "loss": 0.1454, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.1795, |
| "grad_norm": 0.79296875, |
| "learning_rate": 2.432293739709151e-06, |
| "loss": 0.1538, |
| "step": 1410 |
| }, |
| { |
| "epoch": 1.1844999999999999, |
| "grad_norm": 0.82421875, |
| "learning_rate": 2.357808326388265e-06, |
| "loss": 0.1513, |
| "step": 1420 |
| }, |
| { |
| "epoch": 1.1895, |
| "grad_norm": 0.828125, |
| "learning_rate": 2.284127750246646e-06, |
| "loss": 0.1466, |
| "step": 1430 |
| }, |
| { |
| "epoch": 1.1945000000000001, |
| "grad_norm": 0.828125, |
| "learning_rate": 2.2112744551015496e-06, |
| "loss": 0.1531, |
| "step": 1440 |
| }, |
| { |
| "epoch": 1.1995, |
| "grad_norm": 0.81640625, |
| "learning_rate": 2.13927063277242e-06, |
| "loss": 0.202, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.2045, |
| "grad_norm": 0.69921875, |
| "learning_rate": 2.0681382163210533e-06, |
| "loss": 0.1779, |
| "step": 1460 |
| }, |
| { |
| "epoch": 1.2095, |
| "grad_norm": 0.78125, |
| "learning_rate": 1.9978988733705807e-06, |
| "loss": 0.155, |
| "step": 1470 |
| }, |
| { |
| "epoch": 1.2145, |
| "grad_norm": 0.703125, |
| "learning_rate": 1.928573999505284e-06, |
| "loss": 0.1436, |
| "step": 1480 |
| }, |
| { |
| "epoch": 1.2195, |
| "grad_norm": 0.6484375, |
| "learning_rate": 1.8601847117533112e-06, |
| "loss": 0.1491, |
| "step": 1490 |
| }, |
| { |
| "epoch": 1.2245, |
| "grad_norm": 0.66796875, |
| "learning_rate": 1.7927518421542106e-06, |
| "loss": 0.1535, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.2295, |
| "grad_norm": 0.7734375, |
| "learning_rate": 1.7262959314133015e-06, |
| "loss": 0.15, |
| "step": 1510 |
| }, |
| { |
| "epoch": 1.2345, |
| "grad_norm": 0.74609375, |
| "learning_rate": 1.6608372226447678e-06, |
| "loss": 0.1484, |
| "step": 1520 |
| }, |
| { |
| "epoch": 1.2395, |
| "grad_norm": 0.7109375, |
| "learning_rate": 1.596395655205411e-06, |
| "loss": 0.149, |
| "step": 1530 |
| }, |
| { |
| "epoch": 1.2445, |
| "grad_norm": 0.79296875, |
| "learning_rate": 1.5329908586209347e-06, |
| "loss": 0.1541, |
| "step": 1540 |
| }, |
| { |
| "epoch": 1.2495, |
| "grad_norm": 0.76953125, |
| "learning_rate": 1.4706421466065952e-06, |
| "loss": 0.1456, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.2545, |
| "grad_norm": 0.703125, |
| "learning_rate": 1.4093685111840567e-06, |
| "loss": 0.1561, |
| "step": 1560 |
| }, |
| { |
| "epoch": 1.2595, |
| "grad_norm": 0.765625, |
| "learning_rate": 1.349188616896238e-06, |
| "loss": 0.1581, |
| "step": 1570 |
| }, |
| { |
| "epoch": 1.2645, |
| "grad_norm": 0.73828125, |
| "learning_rate": 1.2901207951219186e-06, |
| "loss": 0.1436, |
| "step": 1580 |
| }, |
| { |
| "epoch": 1.2695, |
| "grad_norm": 0.7890625, |
| "learning_rate": 1.2321830384918116e-06, |
| "loss": 0.1503, |
| "step": 1590 |
| }, |
| { |
| "epoch": 1.2745, |
| "grad_norm": 0.75390625, |
| "learning_rate": 1.1753929954078414e-06, |
| "loss": 0.1608, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.2795, |
| "grad_norm": 0.72265625, |
| "learning_rate": 1.1197679646672698e-06, |
| "loss": 0.153, |
| "step": 1610 |
| }, |
| { |
| "epoch": 1.2845, |
| "grad_norm": 0.79296875, |
| "learning_rate": 1.065324890193314e-06, |
| "loss": 0.1558, |
| "step": 1620 |
| }, |
| { |
| "epoch": 1.2894999999999999, |
| "grad_norm": 0.71875, |
| "learning_rate": 1.0120803558738585e-06, |
| "loss": 0.1597, |
| "step": 1630 |
| }, |
| { |
| "epoch": 1.2945, |
| "grad_norm": 0.7265625, |
| "learning_rate": 9.600505805098486e-07, |
| "loss": 0.1583, |
| "step": 1640 |
| }, |
| { |
| "epoch": 1.2995, |
| "grad_norm": 0.7578125, |
| "learning_rate": 9.09251412874882e-07, |
| "loss": 0.1486, |
| "step": 1650 |
| }, |
| { |
| "epoch": 1.3045, |
| "grad_norm": 0.7265625, |
| "learning_rate": 8.596983268875281e-07, |
| "loss": 0.1571, |
| "step": 1660 |
| }, |
| { |
| "epoch": 1.3094999999999999, |
| "grad_norm": 0.70703125, |
| "learning_rate": 8.114064168978064e-07, |
| "loss": 0.1494, |
| "step": 1670 |
| }, |
| { |
| "epoch": 1.3145, |
| "grad_norm": 0.68359375, |
| "learning_rate": 7.643903930893154e-07, |
| "loss": 0.153, |
| "step": 1680 |
| }, |
| { |
| "epoch": 1.3195000000000001, |
| "grad_norm": 0.765625, |
| "learning_rate": 7.186645769983591e-07, |
| "loss": 0.1592, |
| "step": 1690 |
| }, |
| { |
| "epoch": 1.3245, |
| "grad_norm": 0.75390625, |
| "learning_rate": 6.742428971514786e-07, |
| "loss": 0.1588, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.3295, |
| "grad_norm": 0.82421875, |
| "learning_rate": 6.311388848226741e-07, |
| "loss": 0.1593, |
| "step": 1710 |
| }, |
| { |
| "epoch": 1.3345, |
| "grad_norm": 0.66796875, |
| "learning_rate": 5.893656699116618e-07, |
| "loss": 0.1592, |
| "step": 1720 |
| }, |
| { |
| "epoch": 1.3395000000000001, |
| "grad_norm": 0.70703125, |
| "learning_rate": 5.489359769443675e-07, |
| "loss": 0.1565, |
| "step": 1730 |
| }, |
| { |
| "epoch": 1.3445, |
| "grad_norm": 0.72265625, |
| "learning_rate": 5.098621211969224e-07, |
| "loss": 0.1577, |
| "step": 1740 |
| }, |
| { |
| "epoch": 1.3495, |
| "grad_norm": 0.796875, |
| "learning_rate": 4.72156004944303e-07, |
| "loss": 0.1622, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.3545, |
| "grad_norm": 0.75390625, |
| "learning_rate": 4.3582911383478646e-07, |
| "loss": 0.1613, |
| "step": 1760 |
| }, |
| { |
| "epoch": 1.3595, |
| "grad_norm": 0.7578125, |
| "learning_rate": 4.0089251339131164e-07, |
| "loss": 0.1574, |
| "step": 1770 |
| }, |
| { |
| "epoch": 1.3645, |
| "grad_norm": 0.70703125, |
| "learning_rate": 3.6735684564081385e-07, |
| "loss": 0.1583, |
| "step": 1780 |
| }, |
| { |
| "epoch": 1.3695, |
| "grad_norm": 0.671875, |
| "learning_rate": 3.352323258725554e-07, |
| "loss": 0.1633, |
| "step": 1790 |
| }, |
| { |
| "epoch": 1.3745, |
| "grad_norm": 0.91015625, |
| "learning_rate": 3.0452873952645455e-07, |
| "loss": 0.1611, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.3795, |
| "grad_norm": 0.75, |
| "learning_rate": 2.752554392123463e-07, |
| "loss": 0.1613, |
| "step": 1810 |
| }, |
| { |
| "epoch": 1.3845, |
| "grad_norm": 0.7265625, |
| "learning_rate": 2.474213418610816e-07, |
| "loss": 0.1616, |
| "step": 1820 |
| }, |
| { |
| "epoch": 1.3895, |
| "grad_norm": 0.703125, |
| "learning_rate": 2.210349260083494e-07, |
| "loss": 0.1621, |
| "step": 1830 |
| }, |
| { |
| "epoch": 1.3945, |
| "grad_norm": 0.7265625, |
| "learning_rate": 1.961042292120291e-07, |
| "loss": 0.1628, |
| "step": 1840 |
| }, |
| { |
| "epoch": 1.3995, |
| "grad_norm": 0.7109375, |
| "learning_rate": 1.7263684560387518e-07, |
| "loss": 0.1583, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.4045, |
| "grad_norm": 0.6953125, |
| "learning_rate": 1.5063992357626623e-07, |
| "loss": 0.152, |
| "step": 1860 |
| }, |
| { |
| "epoch": 1.4095, |
| "grad_norm": 0.7265625, |
| "learning_rate": 1.3012016360474223e-07, |
| "loss": 0.1605, |
| "step": 1870 |
| }, |
| { |
| "epoch": 1.4144999999999999, |
| "grad_norm": 0.70703125, |
| "learning_rate": 1.1108381620696885e-07, |
| "loss": 0.1632, |
| "step": 1880 |
| }, |
| { |
| "epoch": 1.4195, |
| "grad_norm": 0.7578125, |
| "learning_rate": 9.353668003877437e-08, |
| "loss": 0.1661, |
| "step": 1890 |
| }, |
| { |
| "epoch": 1.4245, |
| "grad_norm": 0.80859375, |
| "learning_rate": 7.748410012781705e-08, |
| "loss": 0.174, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.4295, |
| "grad_norm": 0.72265625, |
| "learning_rate": 6.293096624544304e-08, |
| "loss": 0.1661, |
| "step": 1910 |
| }, |
| { |
| "epoch": 1.4344999999999999, |
| "grad_norm": 0.75, |
| "learning_rate": 4.988171141721232e-08, |
| "loss": 0.179, |
| "step": 1920 |
| }, |
| { |
| "epoch": 1.4395, |
| "grad_norm": 0.87890625, |
| "learning_rate": 3.83403105725566e-08, |
| "loss": 0.1701, |
| "step": 1930 |
| }, |
| { |
| "epoch": 1.4445000000000001, |
| "grad_norm": 0.78515625, |
| "learning_rate": 2.8310279333976786e-08, |
| "loss": 0.1732, |
| "step": 1940 |
| }, |
| { |
| "epoch": 1.4495, |
| "grad_norm": 0.7265625, |
| "learning_rate": 1.9794672946152337e-08, |
| "loss": 0.1709, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.4545, |
| "grad_norm": 0.75390625, |
| "learning_rate": 1.2796085345280207e-08, |
| "loss": 0.1739, |
| "step": 1960 |
| }, |
| { |
| "epoch": 1.4595, |
| "grad_norm": 0.77734375, |
| "learning_rate": 7.3166483689413035e-09, |
| "loss": 0.1715, |
| "step": 1970 |
| }, |
| { |
| "epoch": 1.4645000000000001, |
| "grad_norm": 0.76953125, |
| "learning_rate": 3.3580311067188396e-09, |
| "loss": 0.1723, |
| "step": 1980 |
| }, |
| { |
| "epoch": 1.4695, |
| "grad_norm": 0.75, |
| "learning_rate": 9.214393917789111e-10, |
| "loss": 0.1797, |
| "step": 1990 |
| }, |
| { |
| "epoch": 1.4745, |
| "grad_norm": 0.7890625, |
| "learning_rate": 7.615433561536379e-12, |
| "loss": 0.174, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.904385425501323e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|