{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.87109375,
  "eval_steps": 500,
  "global_step": 1470,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01953125,
      "grad_norm": 7.15625,
      "learning_rate": 1e-05,
      "loss": 12.1665,
      "step": 10
    },
    {
      "epoch": 0.0390625,
      "grad_norm": 5.6875,
      "learning_rate": 2e-05,
      "loss": 11.9295,
      "step": 20
    },
    {
      "epoch": 0.05859375,
      "grad_norm": 7.28125,
      "learning_rate": 3e-05,
      "loss": 11.3886,
      "step": 30
    },
    {
      "epoch": 0.078125,
      "grad_norm": 6.90625,
      "learning_rate": 4e-05,
      "loss": 10.1535,
      "step": 40
    },
    {
      "epoch": 0.09765625,
      "grad_norm": 6.375,
      "learning_rate": 5e-05,
      "loss": 8.626,
      "step": 50
    },
    {
      "epoch": 0.1171875,
      "grad_norm": 6.75,
      "learning_rate": 6e-05,
      "loss": 7.467,
      "step": 60
    },
    {
      "epoch": 0.13671875,
      "grad_norm": 9.375,
      "learning_rate": 7e-05,
      "loss": 6.5431,
      "step": 70
    },
    {
      "epoch": 0.15625,
      "grad_norm": 12.8125,
      "learning_rate": 8e-05,
      "loss": 5.5603,
      "step": 80
    },
    {
      "epoch": 0.17578125,
      "grad_norm": 14.1875,
      "learning_rate": 9e-05,
      "loss": 4.1738,
      "step": 90
    },
    {
      "epoch": 0.1953125,
      "grad_norm": 14.8125,
      "learning_rate": 0.0001,
      "loss": 2.5115,
      "step": 100
    },
    {
      "epoch": 0.21484375,
      "grad_norm": 4.34375,
      "learning_rate": 9.930362116991644e-05,
      "loss": 0.7972,
      "step": 110
    },
    {
      "epoch": 0.234375,
      "grad_norm": 2.28125,
      "learning_rate": 9.860724233983287e-05,
      "loss": 0.3554,
      "step": 120
    },
    {
      "epoch": 0.25390625,
      "grad_norm": 0.9375,
      "learning_rate": 9.79108635097493e-05,
      "loss": 0.2982,
      "step": 130
    },
    {
      "epoch": 0.2734375,
      "grad_norm": 3.515625,
      "learning_rate": 9.721448467966574e-05,
      "loss": 0.3548,
      "step": 140
    },
    {
      "epoch": 0.29296875,
      "grad_norm": 1.40625,
      "learning_rate": 9.651810584958218e-05,
      "loss": 0.2728,
      "step": 150
    },
    {
      "epoch": 0.3125,
      "grad_norm": 1.046875,
      "learning_rate": 9.58217270194986e-05,
      "loss": 0.275,
      "step": 160
    },
    {
      "epoch": 0.33203125,
      "grad_norm": 2.6875,
      "learning_rate": 9.512534818941504e-05,
      "loss": 0.2812,
      "step": 170
    },
    {
      "epoch": 0.3515625,
      "grad_norm": 1.171875,
      "learning_rate": 9.442896935933148e-05,
      "loss": 0.2561,
      "step": 180
    },
    {
      "epoch": 0.37109375,
      "grad_norm": 1.4453125,
      "learning_rate": 9.373259052924791e-05,
      "loss": 0.2348,
      "step": 190
    },
    {
      "epoch": 0.390625,
      "grad_norm": 2.03125,
      "learning_rate": 9.303621169916435e-05,
      "loss": 0.2322,
      "step": 200
    },
    {
      "epoch": 0.41015625,
      "grad_norm": 1.2890625,
      "learning_rate": 9.233983286908079e-05,
      "loss": 0.2403,
      "step": 210
    },
    {
      "epoch": 0.4296875,
      "grad_norm": 0.78125,
      "learning_rate": 9.164345403899723e-05,
      "loss": 0.2299,
      "step": 220
    },
    {
      "epoch": 0.44921875,
      "grad_norm": 1.0625,
      "learning_rate": 9.094707520891366e-05,
      "loss": 0.2166,
      "step": 230
    },
    {
      "epoch": 0.46875,
      "grad_norm": 0.79296875,
      "learning_rate": 9.025069637883009e-05,
      "loss": 0.2118,
      "step": 240
    },
    {
      "epoch": 0.48828125,
      "grad_norm": 0.94921875,
      "learning_rate": 8.955431754874652e-05,
      "loss": 0.2317,
      "step": 250
    },
    {
      "epoch": 0.5078125,
      "grad_norm": 1.2578125,
      "learning_rate": 8.885793871866296e-05,
      "loss": 0.2111,
      "step": 260
    },
    {
      "epoch": 0.52734375,
      "grad_norm": 0.984375,
      "learning_rate": 8.81615598885794e-05,
      "loss": 0.2349,
      "step": 270
    },
    {
      "epoch": 0.546875,
      "grad_norm": 0.5859375,
      "learning_rate": 8.746518105849582e-05,
      "loss": 0.2059,
      "step": 280
    },
    {
      "epoch": 0.56640625,
      "grad_norm": 0.79296875,
      "learning_rate": 8.676880222841226e-05,
      "loss": 0.1923,
      "step": 290
    },
    {
      "epoch": 0.5859375,
      "grad_norm": 0.90625,
      "learning_rate": 8.60724233983287e-05,
      "loss": 0.1994,
      "step": 300
    },
    {
      "epoch": 0.60546875,
      "grad_norm": 0.77734375,
      "learning_rate": 8.537604456824512e-05,
      "loss": 0.2052,
      "step": 310
    },
    {
      "epoch": 0.625,
      "grad_norm": 30.125,
      "learning_rate": 8.467966573816156e-05,
      "loss": 0.1987,
      "step": 320
    },
    {
      "epoch": 0.64453125,
      "grad_norm": 1.125,
      "learning_rate": 8.3983286908078e-05,
      "loss": 0.2071,
      "step": 330
    },
    {
      "epoch": 0.6640625,
      "grad_norm": 0.58203125,
      "learning_rate": 8.328690807799443e-05,
      "loss": 0.1882,
      "step": 340
    },
    {
      "epoch": 0.68359375,
      "grad_norm": 1.046875,
      "learning_rate": 8.259052924791086e-05,
      "loss": 0.1953,
      "step": 350
    },
    {
      "epoch": 0.703125,
      "grad_norm": 0.7265625,
      "learning_rate": 8.18941504178273e-05,
      "loss": 0.1923,
      "step": 360
    },
    {
      "epoch": 0.72265625,
      "grad_norm": 0.66796875,
      "learning_rate": 8.119777158774373e-05,
      "loss": 0.1866,
      "step": 370
    },
    {
      "epoch": 0.7421875,
      "grad_norm": 0.5859375,
      "learning_rate": 8.050139275766017e-05,
      "loss": 0.1754,
      "step": 380
    },
    {
      "epoch": 0.76171875,
      "grad_norm": 0.83203125,
      "learning_rate": 7.980501392757661e-05,
      "loss": 0.1762,
      "step": 390
    },
    {
      "epoch": 0.78125,
      "grad_norm": 0.64453125,
      "learning_rate": 7.910863509749304e-05,
      "loss": 0.1818,
      "step": 400
    },
    {
      "epoch": 0.80078125,
      "grad_norm": 0.62109375,
      "learning_rate": 7.841225626740948e-05,
      "loss": 0.1762,
      "step": 410
    },
    {
      "epoch": 0.8203125,
      "grad_norm": 0.64453125,
      "learning_rate": 7.771587743732592e-05,
      "loss": 0.1752,
      "step": 420
    },
    {
      "epoch": 0.83984375,
      "grad_norm": 0.55859375,
      "learning_rate": 7.701949860724234e-05,
      "loss": 0.163,
      "step": 430
    },
    {
      "epoch": 0.859375,
      "grad_norm": 0.58984375,
      "learning_rate": 7.632311977715878e-05,
      "loss": 0.1531,
      "step": 440
    },
    {
      "epoch": 0.87890625,
      "grad_norm": 0.9296875,
      "learning_rate": 7.562674094707522e-05,
      "loss": 0.1657,
      "step": 450
    },
    {
      "epoch": 0.8984375,
      "grad_norm": 0.6953125,
      "learning_rate": 7.493036211699165e-05,
      "loss": 0.1326,
      "step": 460
    },
    {
      "epoch": 0.91796875,
      "grad_norm": 0.69921875,
      "learning_rate": 7.423398328690808e-05,
      "loss": 0.1369,
      "step": 470
    },
    {
      "epoch": 0.9375,
      "grad_norm": 0.82421875,
      "learning_rate": 7.353760445682452e-05,
      "loss": 0.1431,
      "step": 480
    },
    {
      "epoch": 0.95703125,
      "grad_norm": 0.59375,
      "learning_rate": 7.284122562674095e-05,
      "loss": 0.1341,
      "step": 490
    },
    {
      "epoch": 0.9765625,
      "grad_norm": 0.8515625,
      "learning_rate": 7.214484679665738e-05,
      "loss": 0.1319,
      "step": 500
    },
    {
      "epoch": 0.99609375,
      "grad_norm": 0.94140625,
      "learning_rate": 7.144846796657381e-05,
      "loss": 0.1243,
      "step": 510
    },
    {
      "epoch": 1.015625,
      "grad_norm": 0.84765625,
      "learning_rate": 7.075208913649025e-05,
      "loss": 0.1269,
      "step": 520
    },
    {
      "epoch": 1.03515625,
      "grad_norm": 0.8203125,
      "learning_rate": 7.005571030640669e-05,
      "loss": 0.1208,
      "step": 530
    },
    {
      "epoch": 1.0546875,
      "grad_norm": 0.78515625,
      "learning_rate": 6.935933147632311e-05,
      "loss": 0.114,
      "step": 540
    },
    {
      "epoch": 1.07421875,
      "grad_norm": 0.83203125,
      "learning_rate": 6.866295264623955e-05,
      "loss": 0.1116,
      "step": 550
    },
    {
      "epoch": 1.09375,
      "grad_norm": 0.65625,
      "learning_rate": 6.796657381615599e-05,
      "loss": 0.1146,
      "step": 560
    },
    {
      "epoch": 1.11328125,
      "grad_norm": 0.953125,
      "learning_rate": 6.727019498607243e-05,
      "loss": 0.1156,
      "step": 570
    },
    {
      "epoch": 1.1328125,
      "grad_norm": 0.9765625,
      "learning_rate": 6.657381615598886e-05,
      "loss": 0.1068,
      "step": 580
    },
    {
      "epoch": 1.15234375,
      "grad_norm": 0.6953125,
      "learning_rate": 6.58774373259053e-05,
      "loss": 0.1211,
      "step": 590
    },
    {
      "epoch": 1.171875,
      "grad_norm": 1.6328125,
      "learning_rate": 6.518105849582174e-05,
      "loss": 0.1099,
      "step": 600
    },
    {
      "epoch": 1.19140625,
      "grad_norm": 1.1484375,
      "learning_rate": 6.448467966573817e-05,
      "loss": 0.1008,
      "step": 610
    },
    {
      "epoch": 1.2109375,
      "grad_norm": 0.58203125,
      "learning_rate": 6.37883008356546e-05,
      "loss": 0.1115,
      "step": 620
    },
    {
      "epoch": 1.23046875,
      "grad_norm": 0.77734375,
      "learning_rate": 6.309192200557104e-05,
      "loss": 0.1067,
      "step": 630
    },
    {
      "epoch": 1.25,
      "grad_norm": 1.1171875,
      "learning_rate": 6.239554317548747e-05,
      "loss": 0.1081,
      "step": 640
    },
    {
      "epoch": 1.26953125,
      "grad_norm": 0.515625,
      "learning_rate": 6.169916434540391e-05,
      "loss": 0.0991,
      "step": 650
    },
    {
      "epoch": 1.2890625,
      "grad_norm": 0.58203125,
      "learning_rate": 6.100278551532034e-05,
      "loss": 0.1027,
      "step": 660
    },
    {
      "epoch": 1.30859375,
      "grad_norm": 0.65234375,
      "learning_rate": 6.030640668523677e-05,
      "loss": 0.102,
      "step": 670
    },
    {
      "epoch": 1.328125,
      "grad_norm": 0.41796875,
      "learning_rate": 5.96100278551532e-05,
      "loss": 0.1012,
      "step": 680
    },
    {
      "epoch": 1.34765625,
      "grad_norm": 0.443359375,
      "learning_rate": 5.891364902506964e-05,
      "loss": 0.0978,
      "step": 690
    },
    {
      "epoch": 1.3671875,
      "grad_norm": 0.51953125,
      "learning_rate": 5.821727019498607e-05,
      "loss": 0.1196,
      "step": 700
    },
    {
      "epoch": 1.38671875,
      "grad_norm": 0.58984375,
      "learning_rate": 5.752089136490251e-05,
      "loss": 0.1047,
      "step": 710
    },
    {
      "epoch": 1.40625,
      "grad_norm": 0.56640625,
      "learning_rate": 5.682451253481894e-05,
      "loss": 0.1071,
      "step": 720
    },
    {
      "epoch": 1.42578125,
      "grad_norm": 0.71875,
      "learning_rate": 5.6128133704735375e-05,
      "loss": 0.105,
      "step": 730
    },
    {
      "epoch": 1.4453125,
      "grad_norm": 0.57421875,
      "learning_rate": 5.5431754874651806e-05,
      "loss": 0.1121,
      "step": 740
    },
    {
      "epoch": 1.46484375,
      "grad_norm": 0.515625,
      "learning_rate": 5.473537604456824e-05,
      "loss": 0.1016,
      "step": 750
    },
    {
      "epoch": 1.484375,
      "grad_norm": 1.734375,
      "learning_rate": 5.4038997214484674e-05,
      "loss": 0.1079,
      "step": 760
    },
    {
      "epoch": 1.50390625,
      "grad_norm": 0.7109375,
      "learning_rate": 5.3342618384401125e-05,
      "loss": 0.1079,
      "step": 770
    },
    {
      "epoch": 1.5234375,
      "grad_norm": 0.62890625,
      "learning_rate": 5.2646239554317555e-05,
      "loss": 0.1076,
      "step": 780
    },
    {
      "epoch": 1.54296875,
      "grad_norm": 0.55078125,
      "learning_rate": 5.194986072423399e-05,
      "loss": 0.1192,
      "step": 790
    },
    {
      "epoch": 1.5625,
      "grad_norm": 0.7109375,
      "learning_rate": 5.125348189415042e-05,
      "loss": 0.1,
      "step": 800
    },
    {
      "epoch": 1.58203125,
      "grad_norm": 1.0546875,
      "learning_rate": 5.055710306406686e-05,
      "loss": 0.1075,
      "step": 810
    },
    {
      "epoch": 1.6015625,
      "grad_norm": 0.43359375,
      "learning_rate": 4.986072423398329e-05,
      "loss": 0.1036,
      "step": 820
    },
    {
      "epoch": 1.62109375,
      "grad_norm": 1.1875,
      "learning_rate": 4.916434540389973e-05,
      "loss": 0.1016,
      "step": 830
    },
    {
      "epoch": 1.640625,
      "grad_norm": 0.76171875,
      "learning_rate": 4.846796657381616e-05,
      "loss": 0.1041,
      "step": 840
    },
    {
      "epoch": 1.66015625,
      "grad_norm": 1.1484375,
      "learning_rate": 4.7771587743732597e-05,
      "loss": 0.108,
      "step": 850
    },
    {
      "epoch": 1.6796875,
      "grad_norm": 0.66796875,
      "learning_rate": 4.707520891364903e-05,
      "loss": 0.1063,
      "step": 860
    },
    {
      "epoch": 1.69921875,
      "grad_norm": 0.419921875,
      "learning_rate": 4.637883008356546e-05,
      "loss": 0.1048,
      "step": 870
    },
    {
      "epoch": 1.71875,
      "grad_norm": 0.6328125,
      "learning_rate": 4.5682451253481895e-05,
      "loss": 0.0985,
      "step": 880
    },
    {
      "epoch": 1.73828125,
      "grad_norm": 0.55078125,
      "learning_rate": 4.4986072423398326e-05,
      "loss": 0.111,
      "step": 890
    },
    {
      "epoch": 1.7578125,
      "grad_norm": 0.5859375,
      "learning_rate": 4.428969359331476e-05,
      "loss": 0.1007,
      "step": 900
    },
    {
      "epoch": 1.77734375,
      "grad_norm": 0.5546875,
      "learning_rate": 4.35933147632312e-05,
      "loss": 0.0981,
      "step": 910
    },
    {
      "epoch": 1.796875,
      "grad_norm": 0.56640625,
      "learning_rate": 4.289693593314764e-05,
      "loss": 0.1016,
      "step": 920
    },
    {
      "epoch": 1.81640625,
      "grad_norm": 0.44140625,
      "learning_rate": 4.220055710306407e-05,
      "loss": 0.1045,
      "step": 930
    },
    {
      "epoch": 1.8359375,
      "grad_norm": 0.9609375,
      "learning_rate": 4.1504178272980506e-05,
      "loss": 0.1019,
      "step": 940
    },
    {
      "epoch": 1.85546875,
      "grad_norm": 0.53125,
      "learning_rate": 4.0807799442896936e-05,
      "loss": 0.1061,
      "step": 950
    },
    {
      "epoch": 1.875,
      "grad_norm": 0.80078125,
      "learning_rate": 4.0111420612813374e-05,
      "loss": 0.1098,
      "step": 960
    },
    {
      "epoch": 1.89453125,
      "grad_norm": 0.53125,
      "learning_rate": 3.9415041782729804e-05,
      "loss": 0.0954,
      "step": 970
    },
    {
      "epoch": 1.9140625,
      "grad_norm": 0.65625,
      "learning_rate": 3.871866295264624e-05,
      "loss": 0.1012,
      "step": 980
    },
    {
      "epoch": 1.93359375,
      "grad_norm": 0.4765625,
      "learning_rate": 3.802228412256267e-05,
      "loss": 0.099,
      "step": 990
    },
    {
      "epoch": 1.953125,
      "grad_norm": 0.498046875,
      "learning_rate": 3.7325905292479116e-05,
      "loss": 0.1046,
      "step": 1000
    },
    {
      "epoch": 1.97265625,
      "grad_norm": 0.46875,
      "learning_rate": 3.662952646239555e-05,
      "loss": 0.097,
      "step": 1010
    },
    {
      "epoch": 1.9921875,
      "grad_norm": 0.6953125,
      "learning_rate": 3.5933147632311984e-05,
      "loss": 0.0964,
      "step": 1020
    },
    {
      "epoch": 2.01171875,
      "grad_norm": 0.59375,
      "learning_rate": 3.5236768802228415e-05,
      "loss": 0.0986,
      "step": 1030
    },
    {
      "epoch": 2.03125,
      "grad_norm": 0.484375,
      "learning_rate": 3.454038997214485e-05,
      "loss": 0.1056,
      "step": 1040
    },
    {
      "epoch": 2.05078125,
      "grad_norm": 0.78125,
      "learning_rate": 3.384401114206128e-05,
      "loss": 0.1071,
      "step": 1050
    },
    {
      "epoch": 2.0703125,
      "grad_norm": 0.78125,
      "learning_rate": 3.314763231197771e-05,
      "loss": 0.1053,
      "step": 1060
    },
    {
      "epoch": 2.08984375,
      "grad_norm": 0.45703125,
      "learning_rate": 3.245125348189415e-05,
      "loss": 0.1018,
      "step": 1070
    },
    {
      "epoch": 2.109375,
      "grad_norm": 5.125,
      "learning_rate": 3.175487465181058e-05,
      "loss": 0.093,
      "step": 1080
    },
    {
      "epoch": 2.12890625,
      "grad_norm": 0.71484375,
      "learning_rate": 3.105849582172702e-05,
      "loss": 0.102,
      "step": 1090
    },
    {
      "epoch": 2.1484375,
      "grad_norm": 0.734375,
      "learning_rate": 3.036211699164346e-05,
      "loss": 0.1046,
      "step": 1100
    },
    {
      "epoch": 2.16796875,
      "grad_norm": 0.6640625,
      "learning_rate": 2.9665738161559893e-05,
      "loss": 0.1016,
      "step": 1110
    },
    {
      "epoch": 2.1875,
      "grad_norm": 0.482421875,
      "learning_rate": 2.8969359331476327e-05,
      "loss": 0.0973,
      "step": 1120
    },
    {
      "epoch": 2.20703125,
      "grad_norm": 0.53125,
      "learning_rate": 2.827298050139276e-05,
      "loss": 0.0925,
      "step": 1130
    },
    {
      "epoch": 2.2265625,
      "grad_norm": 0.58203125,
      "learning_rate": 2.7576601671309192e-05,
      "loss": 0.1027,
      "step": 1140
    },
    {
      "epoch": 2.24609375,
      "grad_norm": 0.5078125,
      "learning_rate": 2.6880222841225626e-05,
      "loss": 0.0926,
      "step": 1150
    },
    {
      "epoch": 2.265625,
      "grad_norm": 0.5078125,
      "learning_rate": 2.618384401114206e-05,
      "loss": 0.0987,
      "step": 1160
    },
    {
      "epoch": 2.28515625,
      "grad_norm": 0.65234375,
      "learning_rate": 2.5487465181058494e-05,
      "loss": 0.0975,
      "step": 1170
    },
    {
      "epoch": 2.3046875,
      "grad_norm": 0.66796875,
      "learning_rate": 2.479108635097493e-05,
      "loss": 0.1016,
      "step": 1180
    },
    {
      "epoch": 2.32421875,
      "grad_norm": 0.490234375,
      "learning_rate": 2.4094707520891365e-05,
      "loss": 0.1005,
      "step": 1190
    },
    {
      "epoch": 2.34375,
      "grad_norm": 1.4921875,
      "learning_rate": 2.33983286908078e-05,
      "loss": 0.0971,
      "step": 1200
    },
    {
      "epoch": 2.36328125,
      "grad_norm": 0.421875,
      "learning_rate": 2.2701949860724233e-05,
      "loss": 0.096,
      "step": 1210
    },
    {
      "epoch": 2.3828125,
      "grad_norm": 0.59765625,
      "learning_rate": 2.200557103064067e-05,
      "loss": 0.0988,
      "step": 1220
    },
    {
      "epoch": 2.40234375,
      "grad_norm": 0.578125,
      "learning_rate": 2.1309192200557104e-05,
      "loss": 0.1,
      "step": 1230
    },
    {
      "epoch": 2.421875,
      "grad_norm": 0.390625,
      "learning_rate": 2.0612813370473538e-05,
      "loss": 0.0996,
      "step": 1240
    },
    {
      "epoch": 2.44140625,
      "grad_norm": 0.7578125,
      "learning_rate": 1.9916434540389972e-05,
      "loss": 0.1018,
      "step": 1250
    },
    {
      "epoch": 2.4609375,
      "grad_norm": 0.462890625,
      "learning_rate": 1.922005571030641e-05,
      "loss": 0.0939,
      "step": 1260
    },
    {
      "epoch": 2.48046875,
      "grad_norm": 0.5078125,
      "learning_rate": 1.8523676880222844e-05,
      "loss": 0.0981,
      "step": 1270
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.5390625,
      "learning_rate": 1.7827298050139278e-05,
      "loss": 0.1044,
      "step": 1280
    },
    {
      "epoch": 2.51953125,
      "grad_norm": 0.54296875,
      "learning_rate": 1.713091922005571e-05,
      "loss": 0.1002,
      "step": 1290
    },
    {
      "epoch": 2.5390625,
      "grad_norm": 0.53125,
      "learning_rate": 1.6434540389972145e-05,
      "loss": 0.0922,
      "step": 1300
    },
    {
      "epoch": 2.55859375,
      "grad_norm": 0.609375,
      "learning_rate": 1.5738161559888583e-05,
      "loss": 0.0959,
      "step": 1310
    },
    {
      "epoch": 2.578125,
      "grad_norm": 0.60546875,
      "learning_rate": 1.5041782729805015e-05,
      "loss": 0.0932,
      "step": 1320
    },
    {
      "epoch": 2.59765625,
      "grad_norm": 0.5390625,
      "learning_rate": 1.4345403899721449e-05,
      "loss": 0.0982,
      "step": 1330
    },
    {
      "epoch": 2.6171875,
      "grad_norm": 0.458984375,
      "learning_rate": 1.3649025069637883e-05,
      "loss": 0.0994,
      "step": 1340
    },
    {
      "epoch": 2.63671875,
      "grad_norm": 0.5625,
      "learning_rate": 1.2952646239554317e-05,
      "loss": 0.0995,
      "step": 1350
    },
    {
      "epoch": 2.65625,
      "grad_norm": 0.51953125,
      "learning_rate": 1.2256267409470753e-05,
      "loss": 0.0947,
      "step": 1360
    },
    {
      "epoch": 2.67578125,
      "grad_norm": 0.49609375,
      "learning_rate": 1.1559888579387188e-05,
      "loss": 0.0923,
      "step": 1370
    },
    {
      "epoch": 2.6953125,
      "grad_norm": 1.2734375,
      "learning_rate": 1.086350974930362e-05,
      "loss": 0.1004,
      "step": 1380
    },
    {
      "epoch": 2.71484375,
      "grad_norm": 0.6953125,
      "learning_rate": 1.0167130919220056e-05,
      "loss": 0.095,
      "step": 1390
    },
    {
      "epoch": 2.734375,
      "grad_norm": 0.71484375,
      "learning_rate": 9.47075208913649e-06,
      "loss": 0.098,
      "step": 1400
    },
    {
      "epoch": 2.75390625,
      "grad_norm": 1.4296875,
      "learning_rate": 8.774373259052924e-06,
      "loss": 0.1082,
      "step": 1410
    },
    {
      "epoch": 2.7734375,
      "grad_norm": 0.6015625,
      "learning_rate": 8.07799442896936e-06,
      "loss": 0.0969,
      "step": 1420
    },
    {
      "epoch": 2.79296875,
      "grad_norm": 0.46484375,
      "learning_rate": 7.381615598885794e-06,
      "loss": 0.0922,
      "step": 1430
    },
    {
      "epoch": 2.8125,
      "grad_norm": 0.62109375,
      "learning_rate": 6.6852367688022295e-06,
      "loss": 0.0951,
      "step": 1440
    },
    {
      "epoch": 2.83203125,
      "grad_norm": 0.61328125,
      "learning_rate": 5.988857938718663e-06,
      "loss": 0.0947,
      "step": 1450
    },
    {
      "epoch": 2.8515625,
      "grad_norm": 0.458984375,
      "learning_rate": 5.2924791086350974e-06,
      "loss": 0.1013,
      "step": 1460
    },
    {
      "epoch": 2.87109375,
      "grad_norm": 0.6171875,
      "learning_rate": 4.596100278551532e-06,
      "loss": 0.0897,
      "step": 1470
    }
  ],
  "logging_steps": 10,
  "max_steps": 1536,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5658272704198656.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}