| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005, |
| "grad_norm": 253.0, |
| "learning_rate": 4.5000000000000003e-07, |
| "loss": 1.7315, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 146.0, |
| "learning_rate": 9.500000000000001e-07, |
| "loss": 1.7698, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 416.0, |
| "learning_rate": 1.45e-06, |
| "loss": 1.6967, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 227.0, |
| "learning_rate": 1.9500000000000004e-06, |
| "loss": 1.7892, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 344.0, |
| "learning_rate": 2.4500000000000003e-06, |
| "loss": 1.6764, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 177.0, |
| "learning_rate": 2.95e-06, |
| "loss": 1.6747, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 165.0, |
| "learning_rate": 3.45e-06, |
| "loss": 1.6633, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 80.0, |
| "learning_rate": 3.95e-06, |
| "loss": 1.6268, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 96.5, |
| "learning_rate": 4.450000000000001e-06, |
| "loss": 1.6238, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 78.0, |
| "learning_rate": 4.95e-06, |
| "loss": 1.5828, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 66.5, |
| "learning_rate": 5.450000000000001e-06, |
| "loss": 1.6216, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 129.0, |
| "learning_rate": 5.950000000000001e-06, |
| "loss": 1.5087, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 70.5, |
| "learning_rate": 6.450000000000001e-06, |
| "loss": 1.4589, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 186.0, |
| "learning_rate": 6.95e-06, |
| "loss": 1.3991, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 98.0, |
| "learning_rate": 7.450000000000001e-06, |
| "loss": 1.4253, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 75.0, |
| "learning_rate": 7.950000000000002e-06, |
| "loss": 1.3915, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 60.0, |
| "learning_rate": 8.45e-06, |
| "loss": 1.2803, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 124.0, |
| "learning_rate": 8.95e-06, |
| "loss": 1.2138, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 183.0, |
| "learning_rate": 9.450000000000001e-06, |
| "loss": 1.2579, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 87.0, |
| "learning_rate": 9.950000000000001e-06, |
| "loss": 1.2237, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 53.5, |
| "learning_rate": 9.999383162408303e-06, |
| "loss": 1.1624, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 77.5, |
| "learning_rate": 9.997251079708788e-06, |
| "loss": 1.1821, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 60.25, |
| "learning_rate": 9.993596785920932e-06, |
| "loss": 1.1511, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 75.0, |
| "learning_rate": 9.988421394178027e-06, |
| "loss": 1.167, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 50.75, |
| "learning_rate": 9.981726480954532e-06, |
| "loss": 1.0907, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 34.75, |
| "learning_rate": 9.973514085585871e-06, |
| "loss": 1.1361, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 36.5, |
| "learning_rate": 9.963786709647228e-06, |
| "loss": 1.0804, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 22.625, |
| "learning_rate": 9.952547316191545e-06, |
| "loss": 1.0901, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 22.875, |
| "learning_rate": 9.939799328846947e-06, |
| "loss": 0.9906, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 22.875, |
| "learning_rate": 9.92554663077387e-06, |
| "loss": 1.0076, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 21.5, |
| "learning_rate": 9.90979356348222e-06, |
| "loss": 0.9819, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 19.875, |
| "learning_rate": 9.892544925508894e-06, |
| "loss": 0.9892, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 19.875, |
| "learning_rate": 9.87380597095611e-06, |
| "loss": 1.023, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 18.375, |
| "learning_rate": 9.853582407890954e-06, |
| "loss": 0.9444, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 24.0, |
| "learning_rate": 9.831880396606649e-06, |
| "loss": 0.9955, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 21.5, |
| "learning_rate": 9.808706547746057e-06, |
| "loss": 0.9076, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 20.875, |
| "learning_rate": 9.78406792028804e-06, |
| "loss": 0.973, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 17.375, |
| "learning_rate": 9.757972019397192e-06, |
| "loss": 0.893, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 17.625, |
| "learning_rate": 9.730426794137727e-06, |
| "loss": 0.9694, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 21.75, |
| "learning_rate": 9.701440635052094e-06, |
| "loss": 0.9668, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.205, |
| "grad_norm": 22.0, |
| "learning_rate": 9.671022371605148e-06, |
| "loss": 0.9044, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 18.625, |
| "learning_rate": 9.6391812694946e-06, |
| "loss": 0.8733, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.215, |
| "grad_norm": 27.25, |
| "learning_rate": 9.605927027828608e-06, |
| "loss": 0.8956, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 25.5, |
| "learning_rate": 9.571269776171319e-06, |
| "loss": 0.9276, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 18.875, |
| "learning_rate": 9.535220071457325e-06, |
| "loss": 0.8804, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 53.75, |
| "learning_rate": 9.497788894775903e-06, |
| "loss": 0.921, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.235, |
| "grad_norm": 16.875, |
| "learning_rate": 9.458987648026071e-06, |
| "loss": 0.9119, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 28.125, |
| "learning_rate": 9.418828150443469e-06, |
| "loss": 0.9022, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.245, |
| "grad_norm": 23.0, |
| "learning_rate": 9.37732263500009e-06, |
| "loss": 0.8735, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 14.125, |
| "learning_rate": 9.334483744678015e-06, |
| "loss": 0.907, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.255, |
| "grad_norm": 23.75, |
| "learning_rate": 9.290324528618225e-06, |
| "loss": 0.9666, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 12.5, |
| "learning_rate": 9.244858438145709e-06, |
| "loss": 0.9087, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.265, |
| "grad_norm": 16.0, |
| "learning_rate": 9.198099322672066e-06, |
| "loss": 0.8592, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 13.5625, |
| "learning_rate": 9.150061425476839e-06, |
| "loss": 0.8687, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.275, |
| "grad_norm": 15.9375, |
| "learning_rate": 9.100759379368863e-06, |
| "loss": 0.8926, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 19.125, |
| "learning_rate": 9.050208202228981e-06, |
| "loss": 0.9293, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.285, |
| "grad_norm": 15.5, |
| "learning_rate": 8.998423292435455e-06, |
| "loss": 0.8354, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 17.125, |
| "learning_rate": 8.945420424173455e-06, |
| "loss": 0.8671, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.295, |
| "grad_norm": 16.375, |
| "learning_rate": 8.891215742630106e-06, |
| "loss": 0.878, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 23.0, |
| "learning_rate": 8.8358257590765e-06, |
| "loss": 0.8117, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.305, |
| "grad_norm": 17.625, |
| "learning_rate": 8.779267345838198e-06, |
| "loss": 0.8938, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 14.6875, |
| "learning_rate": 8.72155773115577e-06, |
| "loss": 0.8384, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.315, |
| "grad_norm": 11.5625, |
| "learning_rate": 8.662714493936895e-06, |
| "loss": 0.9152, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 12.9375, |
| "learning_rate": 8.602755558401653e-06, |
| "loss": 0.8899, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.325, |
| "grad_norm": 11.875, |
| "learning_rate": 8.541699188622645e-06, |
| "loss": 0.8906, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 14.75, |
| "learning_rate": 8.479563982961572e-06, |
| "loss": 0.9039, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.335, |
| "grad_norm": 9.8125, |
| "learning_rate": 8.416368868403997e-06, |
| "loss": 0.8605, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 12.8125, |
| "learning_rate": 8.352133094793996e-06, |
| "loss": 0.8883, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.345, |
| "grad_norm": 12.125, |
| "learning_rate": 8.28687622897048e-06, |
| "loss": 0.8559, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 13.0, |
| "learning_rate": 8.220618148806934e-06, |
| "loss": 0.8561, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.355, |
| "grad_norm": 10.5625, |
| "learning_rate": 8.153379037156433e-06, |
| "loss": 0.7992, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 10.5, |
| "learning_rate": 8.085179375703745e-06, |
| "loss": 0.8422, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.365, |
| "grad_norm": 11.6875, |
| "learning_rate": 8.016039938726413e-06, |
| "loss": 0.8568, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 13.8125, |
| "learning_rate": 7.945981786766712e-06, |
| "loss": 0.8133, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.375, |
| "grad_norm": 16.875, |
| "learning_rate": 7.875026260216395e-06, |
| "loss": 0.8552, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 9.6875, |
| "learning_rate": 7.80319497281621e-06, |
| "loss": 0.8579, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.385, |
| "grad_norm": 11.1875, |
| "learning_rate": 7.730509805072146e-06, |
| "loss": 0.8212, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 13.8125, |
| "learning_rate": 7.656992897590416e-06, |
| "loss": 0.8569, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.395, |
| "grad_norm": 8.5, |
| "learning_rate": 7.58266664433321e-06, |
| "loss": 0.8502, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 11.5625, |
| "learning_rate": 7.507553685797288e-06, |
| "loss": 0.8381, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.405, |
| "grad_norm": 12.125, |
| "learning_rate": 7.431676902117453e-06, |
| "loss": 0.8611, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 15.1875, |
| "learning_rate": 7.35505940609705e-06, |
| "loss": 0.809, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.415, |
| "grad_norm": 9.625, |
| "learning_rate": 7.2777245361675786e-06, |
| "loss": 0.8613, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 12.0625, |
| "learning_rate": 7.199695849279576e-06, |
| "loss": 0.8252, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.425, |
| "grad_norm": 13.25, |
| "learning_rate": 7.120997113726951e-06, |
| "loss": 0.8994, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 11.625, |
| "learning_rate": 7.041652301906925e-06, |
| "loss": 0.806, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.435, |
| "grad_norm": 12.0, |
| "learning_rate": 6.961685583017808e-06, |
| "loss": 0.844, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 15.25, |
| "learning_rate": 6.881121315696828e-06, |
| "loss": 0.811, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.445, |
| "grad_norm": 11.1875, |
| "learning_rate": 6.799984040600257e-06, |
| "loss": 0.855, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 12.875, |
| "learning_rate": 6.718298472928082e-06, |
| "loss": 0.8385, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.455, |
| "grad_norm": 16.125, |
| "learning_rate": 6.63608949489552e-06, |
| "loss": 0.8543, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 12.75, |
| "learning_rate": 6.55338214815366e-06, |
| "loss": 0.9006, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.465, |
| "grad_norm": 11.125, |
| "learning_rate": 6.47020162616152e-06, |
| "loss": 0.8139, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 11.5625, |
| "learning_rate": 6.386573266511891e-06, |
| "loss": 0.8247, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.475, |
| "grad_norm": 9.6875, |
| "learning_rate": 6.3025225432132434e-06, |
| "loss": 0.8145, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 12.3125, |
| "learning_rate": 6.218075058930113e-06, |
| "loss": 0.8642, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.485, |
| "grad_norm": 10.875, |
| "learning_rate": 6.133256537184276e-06, |
| "loss": 0.7867, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 14.1875, |
| "learning_rate": 6.048092814519109e-06, |
| "loss": 0.8341, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.495, |
| "grad_norm": 11.75, |
| "learning_rate": 5.962609832629538e-06, |
| "loss": 0.8033, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 16.875, |
| "learning_rate": 5.876833630459936e-06, |
| "loss": 0.7778, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.505, |
| "grad_norm": 13.125, |
| "learning_rate": 5.7907903362724195e-06, |
| "loss": 0.8177, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 14.6875, |
| "learning_rate": 5.704506159687914e-06, |
| "loss": 0.837, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.515, |
| "grad_norm": 14.8125, |
| "learning_rate": 5.618007383702464e-06, |
| "loss": 0.8762, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 14.375, |
| "learning_rate": 5.5313203566811666e-06, |
| "loss": 0.825, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.525, |
| "grad_norm": 12.6875, |
| "learning_rate": 5.4444714843322085e-06, |
| "loss": 0.8447, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 11.6875, |
| "learning_rate": 5.35748722166343e-06, |
| "loss": 0.8196, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.535, |
| "grad_norm": 12.5625, |
| "learning_rate": 5.270394064923878e-06, |
| "loss": 0.8216, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 11.4375, |
| "learning_rate": 5.183218543532782e-06, |
| "loss": 0.8063, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.545, |
| "grad_norm": 11.25, |
| "learning_rate": 5.09598721199845e-06, |
| "loss": 0.8679, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 11.6875, |
| "learning_rate": 5.008726641829492e-06, |
| "loss": 0.8548, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.555, |
| "grad_norm": 10.75, |
| "learning_rate": 4.921463413440898e-06, |
| "loss": 0.8687, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 12.0625, |
| "learning_rate": 4.8342241080573696e-06, |
| "loss": 0.8212, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.565, |
| "grad_norm": 11.0, |
| "learning_rate": 4.747035299616434e-06, |
| "loss": 0.8318, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 10.6875, |
| "learning_rate": 4.659923546673761e-06, |
| "loss": 0.8049, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.575, |
| "grad_norm": 10.5, |
| "learning_rate": 4.572915384313163e-06, |
| "loss": 0.8397, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 12.4375, |
| "learning_rate": 4.4860373160637665e-06, |
| "loss": 0.8489, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.585, |
| "grad_norm": 14.1875, |
| "learning_rate": 4.399315805826765e-06, |
| "loss": 0.8585, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 12.9375, |
| "learning_rate": 4.312777269814268e-06, |
| "loss": 0.835, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.595, |
| "grad_norm": 12.5625, |
| "learning_rate": 4.226448068502661e-06, |
| "loss": 0.8303, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 11.75, |
| "learning_rate": 4.140354498602952e-06, |
| "loss": 0.8348, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.605, |
| "grad_norm": 11.1875, |
| "learning_rate": 4.054522785050543e-06, |
| "loss": 0.9092, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 12.4375, |
| "learning_rate": 3.968979073016853e-06, |
| "loss": 0.8015, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.615, |
| "grad_norm": 12.125, |
| "learning_rate": 3.883749419945244e-06, |
| "loss": 0.8342, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 9.5, |
| "learning_rate": 3.798859787613682e-06, |
| "loss": 0.8457, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.625, |
| "grad_norm": 12.75, |
| "learning_rate": 3.7143360342265206e-06, |
| "loss": 0.8561, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 10.1875, |
| "learning_rate": 3.630203906537838e-06, |
| "loss": 0.8619, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.635, |
| "grad_norm": 17.25, |
| "learning_rate": 3.5464890320087374e-06, |
| "loss": 0.8599, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 9.3125, |
| "learning_rate": 3.463216911000965e-06, |
| "loss": 0.7845, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.645, |
| "grad_norm": 11.25, |
| "learning_rate": 3.3804129090092542e-06, |
| "loss": 0.848, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 8.6875, |
| "learning_rate": 3.2981022489347503e-06, |
| "loss": 0.8123, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.655, |
| "grad_norm": 10.875, |
| "learning_rate": 3.2163100034018735e-06, |
| "loss": 0.8516, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 12.5625, |
| "learning_rate": 3.1350610871209553e-06, |
| "loss": 0.7952, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.665, |
| "grad_norm": 10.6875, |
| "learning_rate": 3.0543802492989693e-06, |
| "loss": 0.7857, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 11.4375, |
| "learning_rate": 2.974292066100688e-06, |
| "loss": 0.8264, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.675, |
| "grad_norm": 12.3125, |
| "learning_rate": 2.8948209331625454e-06, |
| "loss": 0.8206, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 14.375, |
| "learning_rate": 2.8159910581614904e-06, |
| "loss": 0.8148, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.685, |
| "grad_norm": 13.1875, |
| "learning_rate": 2.7378264534410865e-06, |
| "loss": 0.7992, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 16.375, |
| "learning_rate": 2.6603509286971342e-06, |
| "loss": 0.8353, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.695, |
| "grad_norm": 12.3125, |
| "learning_rate": 2.5835880837249884e-06, |
| "loss": 0.8504, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 10.875, |
| "learning_rate": 2.507561301230849e-06, |
| "loss": 0.8375, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.705, |
| "grad_norm": 9.125, |
| "learning_rate": 2.432293739709151e-06, |
| "loss": 0.8263, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 12.4375, |
| "learning_rate": 2.357808326388265e-06, |
| "loss": 0.7837, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.715, |
| "grad_norm": 14.1875, |
| "learning_rate": 2.284127750246646e-06, |
| "loss": 0.8091, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 10.9375, |
| "learning_rate": 2.2112744551015496e-06, |
| "loss": 0.8337, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.725, |
| "grad_norm": 10.625, |
| "learning_rate": 2.13927063277242e-06, |
| "loss": 0.8336, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 10.0, |
| "learning_rate": 2.0681382163210533e-06, |
| "loss": 0.812, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.735, |
| "grad_norm": 13.9375, |
| "learning_rate": 1.9978988733705807e-06, |
| "loss": 0.8673, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 9.8125, |
| "learning_rate": 1.928573999505284e-06, |
| "loss": 0.7956, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.745, |
| "grad_norm": 12.1875, |
| "learning_rate": 1.8601847117533112e-06, |
| "loss": 0.8501, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 12.75, |
| "learning_rate": 1.7927518421542106e-06, |
| "loss": 0.8574, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.755, |
| "grad_norm": 14.25, |
| "learning_rate": 1.7262959314133015e-06, |
| "loss": 0.8975, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 13.25, |
| "learning_rate": 1.6608372226447678e-06, |
| "loss": 0.791, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.765, |
| "grad_norm": 11.6875, |
| "learning_rate": 1.596395655205411e-06, |
| "loss": 0.7473, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 13.5625, |
| "learning_rate": 1.5329908586209347e-06, |
| "loss": 0.7871, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.775, |
| "grad_norm": 12.5, |
| "learning_rate": 1.4706421466065952e-06, |
| "loss": 0.8209, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 10.125, |
| "learning_rate": 1.4093685111840567e-06, |
| "loss": 0.8152, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.785, |
| "grad_norm": 10.6875, |
| "learning_rate": 1.349188616896238e-06, |
| "loss": 0.7958, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 16.625, |
| "learning_rate": 1.2901207951219186e-06, |
| "loss": 0.8766, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.795, |
| "grad_norm": 15.8125, |
| "learning_rate": 1.2321830384918116e-06, |
| "loss": 0.8304, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 11.5625, |
| "learning_rate": 1.1753929954078414e-06, |
| "loss": 0.847, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.805, |
| "grad_norm": 13.1875, |
| "learning_rate": 1.1197679646672698e-06, |
| "loss": 0.8618, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 14.875, |
| "learning_rate": 1.065324890193314e-06, |
| "loss": 0.8767, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.815, |
| "grad_norm": 11.625, |
| "learning_rate": 1.0120803558738585e-06, |
| "loss": 0.7807, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 15.625, |
| "learning_rate": 9.600505805098486e-07, |
| "loss": 0.7866, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.825, |
| "grad_norm": 13.0, |
| "learning_rate": 9.09251412874882e-07, |
| "loss": 0.8144, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 12.75, |
| "learning_rate": 8.596983268875281e-07, |
| "loss": 0.8139, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.835, |
| "grad_norm": 14.8125, |
| "learning_rate": 8.114064168978064e-07, |
| "loss": 0.8343, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 10.875, |
| "learning_rate": 7.643903930893154e-07, |
| "loss": 0.8079, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.845, |
| "grad_norm": 12.0625, |
| "learning_rate": 7.186645769983591e-07, |
| "loss": 0.8719, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 16.0, |
| "learning_rate": 6.742428971514786e-07, |
| "loss": 0.8109, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.855, |
| "grad_norm": 11.25, |
| "learning_rate": 6.311388848226741e-07, |
| "loss": 0.8224, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 14.0, |
| "learning_rate": 5.893656699116618e-07, |
| "loss": 0.8492, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.865, |
| "grad_norm": 21.5, |
| "learning_rate": 5.489359769443675e-07, |
| "loss": 0.8314, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 11.625, |
| "learning_rate": 5.098621211969224e-07, |
| "loss": 0.8347, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.875, |
| "grad_norm": 13.3125, |
| "learning_rate": 4.72156004944303e-07, |
| "loss": 0.8807, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 11.375, |
| "learning_rate": 4.3582911383478646e-07, |
| "loss": 0.8198, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.885, |
| "grad_norm": 10.875, |
| "learning_rate": 4.0089251339131164e-07, |
| "loss": 0.8313, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 12.375, |
| "learning_rate": 3.6735684564081385e-07, |
| "loss": 0.8023, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.895, |
| "grad_norm": 11.1875, |
| "learning_rate": 3.352323258725554e-07, |
| "loss": 0.8729, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 26.375, |
| "learning_rate": 3.0452873952645455e-07, |
| "loss": 0.8551, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.905, |
| "grad_norm": 13.5625, |
| "learning_rate": 2.752554392123463e-07, |
| "loss": 0.8927, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 12.375, |
| "learning_rate": 2.474213418610816e-07, |
| "loss": 0.8577, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.915, |
| "grad_norm": 11.5, |
| "learning_rate": 2.210349260083494e-07, |
| "loss": 0.8146, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 11.0, |
| "learning_rate": 1.961042292120291e-07, |
| "loss": 0.781, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.925, |
| "grad_norm": 11.8125, |
| "learning_rate": 1.7263684560387518e-07, |
| "loss": 0.8772, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 9.625, |
| "learning_rate": 1.5063992357626623e-07, |
| "loss": 0.8103, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.935, |
| "grad_norm": 14.9375, |
| "learning_rate": 1.3012016360474223e-07, |
| "loss": 0.8103, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 15.5, |
| "learning_rate": 1.1108381620696885e-07, |
| "loss": 0.7785, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.945, |
| "grad_norm": 19.0, |
| "learning_rate": 9.353668003877437e-08, |
| "loss": 0.7905, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 15.375, |
| "learning_rate": 7.748410012781705e-08, |
| "loss": 0.7977, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.955, |
| "grad_norm": 16.75, |
| "learning_rate": 6.293096624544304e-08, |
| "loss": 0.8159, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 12.125, |
| "learning_rate": 4.988171141721232e-08, |
| "loss": 0.7816, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.965, |
| "grad_norm": 9.25, |
| "learning_rate": 3.83403105725566e-08, |
| "loss": 0.7862, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 12.125, |
| "learning_rate": 2.8310279333976786e-08, |
| "loss": 0.815, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.975, |
| "grad_norm": 12.0, |
| "learning_rate": 1.9794672946152337e-08, |
| "loss": 0.8259, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 11.9375, |
| "learning_rate": 1.2796085345280207e-08, |
| "loss": 0.8543, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.985, |
| "grad_norm": 12.8125, |
| "learning_rate": 7.3166483689413035e-09, |
| "loss": 0.878, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 11.375, |
| "learning_rate": 3.3580311067188396e-09, |
| "loss": 0.833, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.995, |
| "grad_norm": 11.3125, |
| "learning_rate": 9.214393917789111e-10, |
| "loss": 0.8103, |
| "step": 1990 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 10.5625, |
| "learning_rate": 7.615433561536379e-12, |
| "loss": 0.8275, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.1659061057355776e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|