| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.998870907038013, | |
| "eval_steps": 500, | |
| "global_step": 1992, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.2066432386636734, | |
| "learning_rate": 9.999384369486675e-05, | |
| "loss": 0.6539, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.1350802630186081, | |
| "learning_rate": 9.997525241303441e-05, | |
| "loss": 0.4242, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.12290512770414352, | |
| "learning_rate": 9.994423062331178e-05, | |
| "loss": 0.4085, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.10319065302610397, | |
| "learning_rate": 9.990078604185e-05, | |
| "loss": 0.3843, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.1067107692360878, | |
| "learning_rate": 9.984492947476183e-05, | |
| "loss": 0.3814, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.10464764386415482, | |
| "learning_rate": 9.977667481543383e-05, | |
| "loss": 0.3806, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.10096515715122223, | |
| "learning_rate": 9.969603904107045e-05, | |
| "loss": 0.3823, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.10181506723165512, | |
| "learning_rate": 9.960304220847147e-05, | |
| "loss": 0.3717, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.10328900068998337, | |
| "learning_rate": 9.949770744904306e-05, | |
| "loss": 0.3761, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.10094640403985977, | |
| "learning_rate": 9.938006096304422e-05, | |
| "loss": 0.3766, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.10436520725488663, | |
| "learning_rate": 9.925013201306999e-05, | |
| "loss": 0.3815, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.09791135042905807, | |
| "learning_rate": 9.910795291677279e-05, | |
| "loss": 0.3793, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.10530658811330795, | |
| "learning_rate": 9.8953559038824e-05, | |
| "loss": 0.3746, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.10106656700372696, | |
| "learning_rate": 9.878698878211756e-05, | |
| "loss": 0.3627, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.10589198768138885, | |
| "learning_rate": 9.86082835782179e-05, | |
| "loss": 0.3585, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.09950920939445496, | |
| "learning_rate": 9.841748787705453e-05, | |
| "loss": 0.3648, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.09534649550914764, | |
| "learning_rate": 9.821464913586586e-05, | |
| "loss": 0.3714, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.1069357767701149, | |
| "learning_rate": 9.799981780739504e-05, | |
| "loss": 0.3691, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.0956304594874382, | |
| "learning_rate": 9.777304732734063e-05, | |
| "loss": 0.3621, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.1080741360783577, | |
| "learning_rate": 9.753439410106537e-05, | |
| "loss": 0.3627, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.11919151246547699, | |
| "learning_rate": 9.728391748956637e-05, | |
| "loss": 0.358, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.09667570888996124, | |
| "learning_rate": 9.702167979470994e-05, | |
| "loss": 0.3587, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.09914813190698624, | |
| "learning_rate": 9.67477462437351e-05, | |
| "loss": 0.3593, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.09963402152061462, | |
| "learning_rate": 9.646218497302945e-05, | |
| "loss": 0.3619, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.0956585481762886, | |
| "learning_rate": 9.616506701118124e-05, | |
| "loss": 0.3592, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.08918160200119019, | |
| "learning_rate": 9.585646626131237e-05, | |
| "loss": 0.3572, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.10680545121431351, | |
| "learning_rate": 9.553645948269607e-05, | |
| "loss": 0.3584, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.09338073432445526, | |
| "learning_rate": 9.520512627166445e-05, | |
| "loss": 0.3569, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.09849222749471664, | |
| "learning_rate": 9.48625490418101e-05, | |
| "loss": 0.353, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.09319322556257248, | |
| "learning_rate": 9.450881300348724e-05, | |
| "loss": 0.3492, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.09313003718852997, | |
| "learning_rate": 9.414400614261693e-05, | |
| "loss": 0.3618, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.09641731530427933, | |
| "learning_rate": 9.376821919880219e-05, | |
| "loss": 0.3668, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.10052476078271866, | |
| "learning_rate": 9.338154564275788e-05, | |
| "loss": 0.3445, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.09882521629333496, | |
| "learning_rate": 9.298408165306157e-05, | |
| "loss": 0.3459, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.0998958945274353, | |
| "learning_rate": 9.257592609223059e-05, | |
| "loss": 0.3532, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.0996759906411171, | |
| "learning_rate": 9.21571804821318e-05, | |
| "loss": 0.3542, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.09207039326429367, | |
| "learning_rate": 9.172794897872957e-05, | |
| "loss": 0.3424, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.09934650361537933, | |
| "learning_rate": 9.128833834617876e-05, | |
| "loss": 0.35, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.09678385406732559, | |
| "learning_rate": 9.083845793026905e-05, | |
| "loss": 0.3461, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.09645042568445206, | |
| "learning_rate": 9.037841963122682e-05, | |
| "loss": 0.3367, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.09238140285015106, | |
| "learning_rate": 8.990833787588194e-05, | |
| "loss": 0.3504, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.09661918878555298, | |
| "learning_rate": 8.942832958920602e-05, | |
| "loss": 0.3496, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.09576547890901566, | |
| "learning_rate": 8.893851416522925e-05, | |
| "loss": 0.3513, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.091176837682724, | |
| "learning_rate": 8.843901343734309e-05, | |
| "loss": 0.3409, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.09654372185468674, | |
| "learning_rate": 8.792995164799637e-05, | |
| "loss": 0.3446, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.08725057542324066, | |
| "learning_rate": 8.741145541779199e-05, | |
| "loss": 0.3442, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.09062401205301285, | |
| "learning_rate": 8.688365371399208e-05, | |
| "loss": 0.3444, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.09887181222438812, | |
| "learning_rate": 8.63466778184397e-05, | |
| "loss": 0.3402, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.09595289826393127, | |
| "learning_rate": 8.580066129490462e-05, | |
| "loss": 0.3424, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.0977388396859169, | |
| "learning_rate": 8.524573995586153e-05, | |
| "loss": 0.358, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "eval_loss": 0.26840201020240784, | |
| "eval_runtime": 20.7882, | |
| "eval_samples_per_second": 0.529, | |
| "eval_steps_per_second": 0.529, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.08854895085096359, | |
| "learning_rate": 8.468205182870901e-05, | |
| "loss": 0.3328, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.09385243058204651, | |
| "learning_rate": 8.410973712143747e-05, | |
| "loss": 0.3441, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.09778619557619095, | |
| "learning_rate": 8.352893818775484e-05, | |
| "loss": 0.3451, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.09615397453308105, | |
| "learning_rate": 8.293979949167839e-05, | |
| "loss": 0.3441, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.10019069164991379, | |
| "learning_rate": 8.234246757160174e-05, | |
| "loss": 0.3309, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.09099920839071274, | |
| "learning_rate": 8.17370910038459e-05, | |
| "loss": 0.3319, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.10040932148694992, | |
| "learning_rate": 8.112382036570344e-05, | |
| "loss": 0.342, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.09268151968717575, | |
| "learning_rate": 8.050280819798481e-05, | |
| "loss": 0.334, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.09127725660800934, | |
| "learning_rate": 7.987420896707645e-05, | |
| "loss": 0.3476, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.09117837995290756, | |
| "learning_rate": 7.923817902651978e-05, | |
| "loss": 0.3351, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.09493087977170944, | |
| "learning_rate": 7.859487657812095e-05, | |
| "loss": 0.3408, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.09857796877622604, | |
| "learning_rate": 7.794446163260077e-05, | |
| "loss": 0.3416, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.09530830383300781, | |
| "learning_rate": 7.728709596979471e-05, | |
| "loss": 0.3403, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.10255115479230881, | |
| "learning_rate": 7.662294309841283e-05, | |
| "loss": 0.3349, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.09442761540412903, | |
| "learning_rate": 7.595216821536981e-05, | |
| "loss": 0.3469, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.09131593257188797, | |
| "learning_rate": 7.527493816469492e-05, | |
| "loss": 0.3232, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 0.10782450437545776, | |
| "learning_rate": 7.459142139603236e-05, | |
| "loss": 0.3275, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 0.10051246732473373, | |
| "learning_rate": 7.390178792274227e-05, | |
| "loss": 0.3168, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 0.09489897638559341, | |
| "learning_rate": 7.32062092796127e-05, | |
| "loss": 0.3205, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 0.10021229833364487, | |
| "learning_rate": 7.250485848019326e-05, | |
| "loss": 0.314, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 0.09837724268436432, | |
| "learning_rate": 7.179790997376083e-05, | |
| "loss": 0.3131, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 0.11071084439754486, | |
| "learning_rate": 7.108553960192827e-05, | |
| "loss": 0.3141, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 0.09489303082227707, | |
| "learning_rate": 7.036792455490675e-05, | |
| "loss": 0.3124, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 0.10039713978767395, | |
| "learning_rate": 6.964524332743263e-05, | |
| "loss": 0.3258, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 0.09435191005468369, | |
| "learning_rate": 6.891767567436988e-05, | |
| "loss": 0.318, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 0.10240574926137924, | |
| "learning_rate": 6.818540256599913e-05, | |
| "loss": 0.3286, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 0.10050353407859802, | |
| "learning_rate": 6.744860614300426e-05, | |
| "loss": 0.3096, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 0.09765986353158951, | |
| "learning_rate": 6.670746967116793e-05, | |
| "loss": 0.318, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 0.10131178051233292, | |
| "learning_rate": 6.596217749578743e-05, | |
| "loss": 0.3199, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 0.0985412448644638, | |
| "learning_rate": 6.521291499582172e-05, | |
| "loss": 0.3173, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 0.10045495629310608, | |
| "learning_rate": 6.445986853778156e-05, | |
| "loss": 0.304, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 0.10255653411149979, | |
| "learning_rate": 6.370322542937403e-05, | |
| "loss": 0.3215, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 0.10014262050390244, | |
| "learning_rate": 6.294317387291276e-05, | |
| "loss": 0.3185, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 0.10973095148801804, | |
| "learning_rate": 6.217990291850581e-05, | |
| "loss": 0.3128, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 0.1075139194726944, | |
| "learning_rate": 6.141360241703264e-05, | |
| "loss": 0.3117, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 0.10679470747709274, | |
| "learning_rate": 6.0644462972921845e-05, | |
| "loss": 0.314, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 0.10646017640829086, | |
| "learning_rate": 5.98726758967415e-05, | |
| "loss": 0.3166, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 0.10854795575141907, | |
| "learning_rate": 5.909843315761385e-05, | |
| "loss": 0.3104, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 0.09923075139522552, | |
| "learning_rate": 5.832192733546621e-05, | |
| "loss": 0.3085, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 0.10540423542261124, | |
| "learning_rate": 5.7543351573129964e-05, | |
| "loss": 0.3035, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 0.10513672232627869, | |
| "learning_rate": 5.676289952829945e-05, | |
| "loss": 0.3069, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 0.10602447390556335, | |
| "learning_rate": 5.598076532536291e-05, | |
| "loss": 0.3126, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 0.10258585214614868, | |
| "learning_rate": 5.5197143507117234e-05, | |
| "loss": 0.3148, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 0.10304014384746552, | |
| "learning_rate": 5.441222898637877e-05, | |
| "loss": 0.3138, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 0.10201530903577805, | |
| "learning_rate": 5.362621699750196e-05, | |
| "loss": 0.3104, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 0.10214639455080032, | |
| "learning_rate": 5.28393030478181e-05, | |
| "loss": 0.3081, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 0.09911152720451355, | |
| "learning_rate": 5.2051682869006126e-05, | |
| "loss": 0.3081, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 0.10031607747077942, | |
| "learning_rate": 5.126355236840764e-05, | |
| "loss": 0.3134, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 0.1077575534582138, | |
| "learning_rate": 5.047510758029832e-05, | |
| "loss": 0.3272, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 0.11312615871429443, | |
| "learning_rate": 4.968654461712753e-05, | |
| "loss": 0.3167, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "eval_loss": 0.2502507269382477, | |
| "eval_runtime": 14.5906, | |
| "eval_samples_per_second": 0.754, | |
| "eval_steps_per_second": 0.754, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 0.10545619577169418, | |
| "learning_rate": 4.889805962073874e-05, | |
| "loss": 0.3142, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 0.11502473056316376, | |
| "learning_rate": 4.8109848713582475e-05, | |
| "loss": 0.3164, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 0.11116404086351395, | |
| "learning_rate": 4.7322107949934146e-05, | |
| "loss": 0.3191, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 0.11101904511451721, | |
| "learning_rate": 4.653503326712886e-05, | |
| "loss": 0.3223, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 0.10275658220052719, | |
| "learning_rate": 4.5748820436825204e-05, | |
| "loss": 0.3127, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.10536840558052063, | |
| "learning_rate": 4.496366501631043e-05, | |
| "loss": 0.3104, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 0.11228681355714798, | |
| "learning_rate": 4.417976229985876e-05, | |
| "loss": 0.3181, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 0.10409337282180786, | |
| "learning_rate": 4.339730727015527e-05, | |
| "loss": 0.3085, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 0.11756953597068787, | |
| "learning_rate": 4.261649454979714e-05, | |
| "loss": 0.3105, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 0.10340123623609543, | |
| "learning_rate": 4.183751835288463e-05, | |
| "loss": 0.3168, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 0.11595764011144638, | |
| "learning_rate": 4.10605724367135e-05, | |
| "loss": 0.3172, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 0.1063317060470581, | |
| "learning_rate": 4.0285850053581105e-05, | |
| "loss": 0.319, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 0.10760544240474701, | |
| "learning_rate": 3.9513543902718206e-05, | |
| "loss": 0.3096, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 0.11430344730615616, | |
| "learning_rate": 3.87438460823582e-05, | |
| "loss": 0.3119, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 0.10582321882247925, | |
| "learning_rate": 3.7976948041955904e-05, | |
| "loss": 0.3179, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 0.11124568432569504, | |
| "learning_rate": 3.7213040534567725e-05, | |
| "loss": 0.3099, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 0.10317942500114441, | |
| "learning_rate": 3.645231356940501e-05, | |
| "loss": 0.3081, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 0.10943736135959625, | |
| "learning_rate": 3.569495636457244e-05, | |
| "loss": 0.3103, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 0.10795300453901291, | |
| "learning_rate": 3.494115730000321e-05, | |
| "loss": 0.3123, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 0.1120479553937912, | |
| "learning_rate": 3.4191103870602656e-05, | |
| "loss": 0.3072, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 0.10475881397724152, | |
| "learning_rate": 3.344498263961201e-05, | |
| "loss": 0.3107, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 0.10638121515512466, | |
| "learning_rate": 3.270297919220395e-05, | |
| "loss": 0.3101, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 0.10646604746580124, | |
| "learning_rate": 3.1965278089321396e-05, | |
| "loss": 0.3201, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 0.10810079425573349, | |
| "learning_rate": 3.123206282177105e-05, | |
| "loss": 0.3129, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 0.10837584733963013, | |
| "learning_rate": 3.05035157645831e-05, | |
| "loss": 0.3138, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 0.11282283812761307, | |
| "learning_rate": 2.9779818131648563e-05, | |
| "loss": 0.3048, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 0.10869032144546509, | |
| "learning_rate": 2.9061149930645243e-05, | |
| "loss": 0.3163, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 0.11098149418830872, | |
| "learning_rate": 2.8347689918263976e-05, | |
| "loss": 0.3083, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 0.10902676731348038, | |
| "learning_rate": 2.763961555574575e-05, | |
| "loss": 0.3008, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 0.10680384933948517, | |
| "learning_rate": 2.69371029647413e-05, | |
| "loss": 0.3022, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 0.10289633274078369, | |
| "learning_rate": 2.624032688350374e-05, | |
| "loss": 0.3045, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 0.11078934371471405, | |
| "learning_rate": 2.5549460623425354e-05, | |
| "loss": 0.3065, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.10170484334230423, | |
| "learning_rate": 2.486467602592929e-05, | |
| "loss": 0.2956, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 0.1094578206539154, | |
| "learning_rate": 2.4186143419726885e-05, | |
| "loss": 0.2938, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 0.10822325944900513, | |
| "learning_rate": 2.351403157845125e-05, | |
| "loss": 0.2863, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 0.11142423003911972, | |
| "learning_rate": 2.2848507678677633e-05, | |
| "loss": 0.2846, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 0.11497773230075836, | |
| "learning_rate": 2.218973725834109e-05, | |
| "loss": 0.2936, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 0.10550795495510101, | |
| "learning_rate": 2.153788417556164e-05, | |
| "loss": 0.2888, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 0.10813359171152115, | |
| "learning_rate": 2.089311056788731e-05, | |
| "loss": 0.2889, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 0.10650567710399628, | |
| "learning_rate": 2.0255576811965154e-05, | |
| "loss": 0.2925, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 0.11273263394832611, | |
| "learning_rate": 1.9625441483650235e-05, | |
| "loss": 0.2856, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 0.11437318474054337, | |
| "learning_rate": 1.9002861318562536e-05, | |
| "loss": 0.2845, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 0.11178340762853622, | |
| "learning_rate": 1.8387991173101587e-05, | |
| "loss": 0.2904, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 0.11630496382713318, | |
| "learning_rate": 1.7780983985928534e-05, | |
| "loss": 0.2851, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 0.11532077938318253, | |
| "learning_rate": 1.7181990739925213e-05, | |
| "loss": 0.2797, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 0.11311406642198563, | |
| "learning_rate": 1.6591160424639675e-05, | |
| "loss": 0.288, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 0.11721379309892654, | |
| "learning_rate": 1.6008639999227527e-05, | |
| "loss": 0.2926, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 0.11629457771778107, | |
| "learning_rate": 1.5434574355898306e-05, | |
| "loss": 0.2883, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 0.11253120750188828, | |
| "learning_rate": 1.4869106283875972e-05, | |
| "loss": 0.2878, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 0.1093481034040451, | |
| "learning_rate": 1.4312376433882457e-05, | |
| "loss": 0.2893, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "eval_loss": 0.24623431265354156, | |
| "eval_runtime": 15.3708, | |
| "eval_samples_per_second": 0.716, | |
| "eval_steps_per_second": 0.716, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 0.114622101187706, | |
| "learning_rate": 1.376452328315318e-05, | |
| "loss": 0.2905, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 0.11465822905302048, | |
| "learning_rate": 1.3225683100993113e-05, | |
| "loss": 0.2886, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 0.11255551874637604, | |
| "learning_rate": 1.2695989914882128e-05, | |
| "loss": 0.2873, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 0.11598910391330719, | |
| "learning_rate": 1.2175575477137824e-05, | |
| "loss": 0.2853, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 0.11426587402820587, | |
| "learning_rate": 1.1664569232144445e-05, | |
| "loss": 0.2934, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 0.11229728907346725, | |
| "learning_rate": 1.1163098284155665e-05, | |
| "loss": 0.2878, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 0.10811661183834076, | |
| "learning_rate": 1.0671287365679567e-05, | |
| "loss": 0.2818, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 0.109583280980587, | |
| "learning_rate": 1.018925880645351e-05, | |
| "loss": 0.2915, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 0.1194503903388977, | |
| "learning_rate": 9.717132503016685e-06, | |
| "loss": 0.2922, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 0.11107660830020905, | |
| "learning_rate": 9.255025888887814e-06, | |
| "loss": 0.2843, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 0.12213042378425598, | |
| "learning_rate": 8.80305390535554e-06, | |
| "loss": 0.2867, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 0.11774149537086487, | |
| "learning_rate": 8.361328972888732e-06, | |
| "loss": 0.2838, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 0.11726924777030945, | |
| "learning_rate": 7.929960963173727e-06, | |
| "loss": 0.288, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 0.12220928817987442, | |
| "learning_rate": 7.509057171785639e-06, | |
| "loss": 0.2844, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 0.11534737050533295, | |
| "learning_rate": 7.098722291500331e-06, | |
| "loss": 0.2842, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.11947780847549438, | |
| "learning_rate": 6.699058386253865e-06, | |
| "loss": 0.2827, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 0.1192295104265213, | |
| "learning_rate": 6.310164865755808e-06, | |
| "loss": 0.2907, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 0.11454175412654877, | |
| "learning_rate": 5.93213846076271e-06, | |
| "loss": 0.2833, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 0.11072956025600433, | |
| "learning_rate": 5.5650731990179674e-06, | |
| "loss": 0.2869, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.10620978474617004, | |
| "learning_rate": 5.20906038186399e-06, | |
| "loss": 0.2775, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 0.1189485564827919, | |
| "learning_rate": 4.864188561532507e-06, | |
| "loss": 0.2842, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 0.1264795958995819, | |
| "learning_rate": 4.530543519118702e-06, | |
| "loss": 0.2944, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 0.10882660746574402, | |
| "learning_rate": 4.208208243244577e-06, | |
| "loss": 0.2903, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 0.1184026375412941, | |
| "learning_rate": 3.8972629094169485e-06, | |
| "loss": 0.2934, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 0.115041583776474, | |
| "learning_rate": 3.5977848600851016e-06, | |
| "loss": 0.2831, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 0.1151689738035202, | |
| "learning_rate": 3.309848585403169e-06, | |
| "loss": 0.2828, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 0.11265784502029419, | |
| "learning_rate": 3.033525704701956e-06, | |
| "loss": 0.2868, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 0.10989955067634583, | |
| "learning_rate": 2.768884948674816e-06, | |
| "loss": 0.2798, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 0.12034480273723602, | |
| "learning_rate": 2.515992142282042e-06, | |
| "loss": 0.2932, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 0.11389750242233276, | |
| "learning_rate": 2.2749101883780157e-06, | |
| "loss": 0.2877, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 0.12224046885967255, | |
| "learning_rate": 2.0456990520651696e-06, | |
| "loss": 0.29, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 0.11018862575292587, | |
| "learning_rate": 1.8284157457786833e-06, | |
| "loss": 0.2903, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 0.11373750865459442, | |
| "learning_rate": 1.6231143151055838e-06, | |
| "loss": 0.2823, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 0.11346758902072906, | |
| "learning_rate": 1.4298458253417968e-06, | |
| "loss": 0.2918, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 0.12030266970396042, | |
| "learning_rate": 1.2486583487905324e-06, | |
| "loss": 0.2793, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.11971867829561234, | |
| "learning_rate": 1.079596952805101e-06, | |
| "loss": 0.2835, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 0.11345378309488297, | |
| "learning_rate": 9.227036885791352e-07, | |
| "loss": 0.287, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 0.12087972462177277, | |
| "learning_rate": 7.78017580687107e-07, | |
| "loss": 0.2834, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 0.11702313274145126, | |
| "learning_rate": 6.455746173775701e-07, | |
| "loss": 0.2828, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 0.11680703610181808, | |
| "learning_rate": 5.25407741621714e-07, | |
| "loss": 0.2821, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.11461573839187622, | |
| "learning_rate": 4.1754684291934744e-07, | |
| "loss": 0.2784, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 0.12143554538488388, | |
| "learning_rate": 3.2201874986437784e-07, | |
| "loss": 0.278, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 0.11503802239894867, | |
| "learning_rate": 2.3884722347164434e-07, | |
| "loss": 0.2881, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 0.11443481594324112, | |
| "learning_rate": 1.6805295126677833e-07, | |
| "loss": 0.2952, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 0.1119546890258789, | |
| "learning_rate": 1.0965354214051982e-07, | |
| "loss": 0.2833, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 0.11658598482608795, | |
| "learning_rate": 6.366352196878756e-08, | |
| "loss": 0.2954, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 0.11841295659542084, | |
| "learning_rate": 3.0094329999635906e-08, | |
| "loss": 0.2792, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 0.11211774498224258, | |
| "learning_rate": 8.954316007908636e-09, | |
| "loss": 0.2914, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.12004334479570389, | |
| "learning_rate": 2.4873821838911073e-10, | |
| "loss": 0.2895, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 1992, | |
| "total_flos": 7.377073297607885e+18, | |
| "train_loss": 0.32009275511924523, | |
| "train_runtime": 49821.5539, | |
| "train_samples_per_second": 6.399, | |
| "train_steps_per_second": 0.04 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1992, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 250, | |
| "total_flos": 7.377073297607885e+18, | |
| "train_batch_size": 20, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |