| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0863531225905936, | |
| "eval_steps": 64, | |
| "global_step": 352, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.003108003108003108, | |
| "grad_norm": 10.027831077575684, | |
| "learning_rate": 0.0, | |
| "loss": 0.6767, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.006216006216006216, | |
| "grad_norm": 9.679778099060059, | |
| "learning_rate": 5.000000000000001e-07, | |
| "loss": 0.6644, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.009324009324009324, | |
| "grad_norm": 10.520271301269531, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 0.6934, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.012432012432012432, | |
| "grad_norm": 8.677583694458008, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.6617, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.01554001554001554, | |
| "grad_norm": 6.502548694610596, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.6509, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.018648018648018648, | |
| "grad_norm": 4.257171154022217, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.639, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.021756021756021756, | |
| "grad_norm": 3.460066556930542, | |
| "learning_rate": 3e-06, | |
| "loss": 0.6286, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.024864024864024864, | |
| "grad_norm": 3.0126283168792725, | |
| "learning_rate": 3.5e-06, | |
| "loss": 0.5948, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.027972027972027972, | |
| "grad_norm": 2.567995309829712, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.5744, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.03108003108003108, | |
| "grad_norm": 2.516597032546997, | |
| "learning_rate": 4.5e-06, | |
| "loss": 0.5496, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03418803418803419, | |
| "grad_norm": 1.8187586069107056, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5397, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.037296037296037296, | |
| "grad_norm": 1.7935529947280884, | |
| "learning_rate": 5.500000000000001e-06, | |
| "loss": 0.5229, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.04040404040404041, | |
| "grad_norm": 1.8665963411331177, | |
| "learning_rate": 6e-06, | |
| "loss": 0.5227, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.04351204351204351, | |
| "grad_norm": 2.0106680393218994, | |
| "learning_rate": 6.5000000000000004e-06, | |
| "loss": 0.4882, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.046620046620046623, | |
| "grad_norm": 3.305211305618286, | |
| "learning_rate": 7e-06, | |
| "loss": 0.4772, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.04972804972804973, | |
| "grad_norm": 3.047219753265381, | |
| "learning_rate": 7.500000000000001e-06, | |
| "loss": 0.452, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.05283605283605284, | |
| "grad_norm": 2.5453591346740723, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 0.4138, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.055944055944055944, | |
| "grad_norm": 5.414841175079346, | |
| "learning_rate": 8.5e-06, | |
| "loss": 0.4238, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.059052059052059055, | |
| "grad_norm": 2.979440927505493, | |
| "learning_rate": 9e-06, | |
| "loss": 0.3987, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.06216006216006216, | |
| "grad_norm": 1.981175422668457, | |
| "learning_rate": 9.5e-06, | |
| "loss": 0.3874, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.06526806526806526, | |
| "grad_norm": 1.7793089151382446, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3631, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.06837606837606838, | |
| "grad_norm": 1.1854480504989624, | |
| "learning_rate": 9.989429175475688e-06, | |
| "loss": 0.3765, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.07148407148407149, | |
| "grad_norm": 0.8928348422050476, | |
| "learning_rate": 9.978858350951375e-06, | |
| "loss": 0.3481, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.07459207459207459, | |
| "grad_norm": 1.7531942129135132, | |
| "learning_rate": 9.968287526427062e-06, | |
| "loss": 0.3693, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.0777000777000777, | |
| "grad_norm": 1.0829464197158813, | |
| "learning_rate": 9.957716701902749e-06, | |
| "loss": 0.3644, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.08080808080808081, | |
| "grad_norm": 0.98089200258255, | |
| "learning_rate": 9.947145877378436e-06, | |
| "loss": 0.3616, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.08391608391608392, | |
| "grad_norm": 0.795221745967865, | |
| "learning_rate": 9.936575052854123e-06, | |
| "loss": 0.3679, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.08702408702408702, | |
| "grad_norm": 1.091843605041504, | |
| "learning_rate": 9.92600422832981e-06, | |
| "loss": 0.3439, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.09013209013209013, | |
| "grad_norm": 0.8538377285003662, | |
| "learning_rate": 9.915433403805497e-06, | |
| "loss": 0.3401, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.09324009324009325, | |
| "grad_norm": 0.9114591479301453, | |
| "learning_rate": 9.904862579281184e-06, | |
| "loss": 0.3515, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.09634809634809635, | |
| "grad_norm": 0.9083001017570496, | |
| "learning_rate": 9.894291754756871e-06, | |
| "loss": 0.3449, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.09945609945609946, | |
| "grad_norm": 0.9144365787506104, | |
| "learning_rate": 9.883720930232558e-06, | |
| "loss": 0.3393, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.10256410256410256, | |
| "grad_norm": 1.0221809148788452, | |
| "learning_rate": 9.873150105708245e-06, | |
| "loss": 0.353, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.10567210567210568, | |
| "grad_norm": 1.0219439268112183, | |
| "learning_rate": 9.862579281183932e-06, | |
| "loss": 0.3439, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.10878010878010878, | |
| "grad_norm": 1.5430618524551392, | |
| "learning_rate": 9.852008456659621e-06, | |
| "loss": 0.3338, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.11188811188811189, | |
| "grad_norm": 1.4754544496536255, | |
| "learning_rate": 9.841437632135308e-06, | |
| "loss": 0.3363, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.11499611499611499, | |
| "grad_norm": 1.1298989057540894, | |
| "learning_rate": 9.830866807610995e-06, | |
| "loss": 0.3423, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.11810411810411811, | |
| "grad_norm": 1.0130062103271484, | |
| "learning_rate": 9.820295983086682e-06, | |
| "loss": 0.3298, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.12121212121212122, | |
| "grad_norm": 1.8003513813018799, | |
| "learning_rate": 9.80972515856237e-06, | |
| "loss": 0.3272, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.12432012432012432, | |
| "grad_norm": 0.9532265067100525, | |
| "learning_rate": 9.799154334038056e-06, | |
| "loss": 0.3282, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.12742812742812742, | |
| "grad_norm": 1.5232913494110107, | |
| "learning_rate": 9.788583509513743e-06, | |
| "loss": 0.3469, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.13053613053613053, | |
| "grad_norm": 0.8918169736862183, | |
| "learning_rate": 9.77801268498943e-06, | |
| "loss": 0.326, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.13364413364413363, | |
| "grad_norm": 0.8845950365066528, | |
| "learning_rate": 9.767441860465117e-06, | |
| "loss": 0.3313, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.13675213675213677, | |
| "grad_norm": 0.8410794138908386, | |
| "learning_rate": 9.756871035940804e-06, | |
| "loss": 0.3318, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.13986013986013987, | |
| "grad_norm": 0.7157808542251587, | |
| "learning_rate": 9.746300211416491e-06, | |
| "loss": 0.3381, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.14296814296814297, | |
| "grad_norm": 1.1680670976638794, | |
| "learning_rate": 9.735729386892178e-06, | |
| "loss": 0.3281, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.14607614607614608, | |
| "grad_norm": 0.9500836133956909, | |
| "learning_rate": 9.725158562367865e-06, | |
| "loss": 0.336, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.14918414918414918, | |
| "grad_norm": 0.8565309643745422, | |
| "learning_rate": 9.714587737843552e-06, | |
| "loss": 0.3207, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.1522921522921523, | |
| "grad_norm": 1.1311777830123901, | |
| "learning_rate": 9.70401691331924e-06, | |
| "loss": 0.3339, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.1554001554001554, | |
| "grad_norm": 1.0368160009384155, | |
| "learning_rate": 9.693446088794927e-06, | |
| "loss": 0.3262, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1585081585081585, | |
| "grad_norm": 0.9648517370223999, | |
| "learning_rate": 9.682875264270614e-06, | |
| "loss": 0.3376, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.16161616161616163, | |
| "grad_norm": 1.1039059162139893, | |
| "learning_rate": 9.6723044397463e-06, | |
| "loss": 0.3352, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.16472416472416473, | |
| "grad_norm": 1.0544918775558472, | |
| "learning_rate": 9.661733615221988e-06, | |
| "loss": 0.3237, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.16783216783216784, | |
| "grad_norm": 1.533158302307129, | |
| "learning_rate": 9.651162790697676e-06, | |
| "loss": 0.3287, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.17094017094017094, | |
| "grad_norm": 1.2342826128005981, | |
| "learning_rate": 9.640591966173363e-06, | |
| "loss": 0.3162, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.17404817404817405, | |
| "grad_norm": 1.0702942609786987, | |
| "learning_rate": 9.63002114164905e-06, | |
| "loss": 0.3143, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.17715617715617715, | |
| "grad_norm": 1.02211594581604, | |
| "learning_rate": 9.619450317124736e-06, | |
| "loss": 0.3318, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.18026418026418026, | |
| "grad_norm": 0.8379388451576233, | |
| "learning_rate": 9.608879492600423e-06, | |
| "loss": 0.3239, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.18337218337218336, | |
| "grad_norm": 0.9620960354804993, | |
| "learning_rate": 9.59830866807611e-06, | |
| "loss": 0.3246, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.1864801864801865, | |
| "grad_norm": 0.9239097833633423, | |
| "learning_rate": 9.587737843551797e-06, | |
| "loss": 0.3278, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.1895881895881896, | |
| "grad_norm": 0.7097995281219482, | |
| "learning_rate": 9.577167019027484e-06, | |
| "loss": 0.3152, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.1926961926961927, | |
| "grad_norm": 0.9077997803688049, | |
| "learning_rate": 9.566596194503171e-06, | |
| "loss": 0.3219, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.1958041958041958, | |
| "grad_norm": 0.8704112768173218, | |
| "learning_rate": 9.55602536997886e-06, | |
| "loss": 0.3262, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.1989121989121989, | |
| "grad_norm": 0.9264605641365051, | |
| "learning_rate": 9.545454545454547e-06, | |
| "loss": 0.3176, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.1989121989121989, | |
| "eval_loss": 0.3377174139022827, | |
| "eval_runtime": 149.1316, | |
| "eval_samples_per_second": 1.911, | |
| "eval_steps_per_second": 0.959, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.20202020202020202, | |
| "grad_norm": 0.9881049394607544, | |
| "learning_rate": 9.534883720930234e-06, | |
| "loss": 0.3312, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.20512820512820512, | |
| "grad_norm": 1.1825007200241089, | |
| "learning_rate": 9.524312896405921e-06, | |
| "loss": 0.3189, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.20823620823620823, | |
| "grad_norm": 0.8272495865821838, | |
| "learning_rate": 9.513742071881608e-06, | |
| "loss": 0.3293, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.21134421134421136, | |
| "grad_norm": 1.0992769002914429, | |
| "learning_rate": 9.503171247357295e-06, | |
| "loss": 0.3119, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.21445221445221446, | |
| "grad_norm": 0.9182390570640564, | |
| "learning_rate": 9.492600422832982e-06, | |
| "loss": 0.331, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.21756021756021757, | |
| "grad_norm": 0.8677308559417725, | |
| "learning_rate": 9.482029598308669e-06, | |
| "loss": 0.3168, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.22066822066822067, | |
| "grad_norm": 1.2915256023406982, | |
| "learning_rate": 9.471458773784356e-06, | |
| "loss": 0.3181, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.22377622377622378, | |
| "grad_norm": 1.6176910400390625, | |
| "learning_rate": 9.460887949260043e-06, | |
| "loss": 0.3254, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.22688422688422688, | |
| "grad_norm": 0.6357202529907227, | |
| "learning_rate": 9.45031712473573e-06, | |
| "loss": 0.3298, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.22999222999222999, | |
| "grad_norm": 0.911662220954895, | |
| "learning_rate": 9.439746300211417e-06, | |
| "loss": 0.3248, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.2331002331002331, | |
| "grad_norm": 0.7426556944847107, | |
| "learning_rate": 9.429175475687104e-06, | |
| "loss": 0.3301, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.23620823620823622, | |
| "grad_norm": 0.7509779930114746, | |
| "learning_rate": 9.418604651162791e-06, | |
| "loss": 0.3209, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.23931623931623933, | |
| "grad_norm": 0.7699870467185974, | |
| "learning_rate": 9.408033826638478e-06, | |
| "loss": 0.3171, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.24242424242424243, | |
| "grad_norm": 0.7583193182945251, | |
| "learning_rate": 9.397463002114165e-06, | |
| "loss": 0.3128, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.24553224553224554, | |
| "grad_norm": 0.968973696231842, | |
| "learning_rate": 9.386892177589852e-06, | |
| "loss": 0.3293, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.24864024864024864, | |
| "grad_norm": 0.9967902302742004, | |
| "learning_rate": 9.37632135306554e-06, | |
| "loss": 0.3209, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.2517482517482518, | |
| "grad_norm": 0.7837809920310974, | |
| "learning_rate": 9.365750528541226e-06, | |
| "loss": 0.3152, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.25485625485625485, | |
| "grad_norm": 1.6905367374420166, | |
| "learning_rate": 9.355179704016915e-06, | |
| "loss": 0.3163, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.257964257964258, | |
| "grad_norm": 0.8734452128410339, | |
| "learning_rate": 9.344608879492602e-06, | |
| "loss": 0.3306, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.26107226107226106, | |
| "grad_norm": 3.6059653759002686, | |
| "learning_rate": 9.33403805496829e-06, | |
| "loss": 0.3104, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.2641802641802642, | |
| "grad_norm": 1.1703656911849976, | |
| "learning_rate": 9.323467230443976e-06, | |
| "loss": 0.3071, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.26728826728826727, | |
| "grad_norm": 0.8762909770011902, | |
| "learning_rate": 9.312896405919663e-06, | |
| "loss": 0.3022, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.2703962703962704, | |
| "grad_norm": 2.158876419067383, | |
| "learning_rate": 9.30232558139535e-06, | |
| "loss": 0.3217, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.27350427350427353, | |
| "grad_norm": 0.8010348081588745, | |
| "learning_rate": 9.291754756871036e-06, | |
| "loss": 0.322, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.2766122766122766, | |
| "grad_norm": 1.119739055633545, | |
| "learning_rate": 9.281183932346723e-06, | |
| "loss": 0.3248, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.27972027972027974, | |
| "grad_norm": 0.7900079488754272, | |
| "learning_rate": 9.27061310782241e-06, | |
| "loss": 0.3102, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2828282828282828, | |
| "grad_norm": 0.8093041181564331, | |
| "learning_rate": 9.260042283298098e-06, | |
| "loss": 0.3259, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.28593628593628595, | |
| "grad_norm": 0.7240622043609619, | |
| "learning_rate": 9.249471458773785e-06, | |
| "loss": 0.3002, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.289044289044289, | |
| "grad_norm": 0.9449782371520996, | |
| "learning_rate": 9.238900634249473e-06, | |
| "loss": 0.3076, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.29215229215229216, | |
| "grad_norm": 0.9448596835136414, | |
| "learning_rate": 9.22832980972516e-06, | |
| "loss": 0.3012, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.29526029526029524, | |
| "grad_norm": 0.9209067821502686, | |
| "learning_rate": 9.217758985200847e-06, | |
| "loss": 0.3131, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.29836829836829837, | |
| "grad_norm": 0.878709614276886, | |
| "learning_rate": 9.207188160676534e-06, | |
| "loss": 0.3157, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.3014763014763015, | |
| "grad_norm": 1.1178463697433472, | |
| "learning_rate": 9.19661733615222e-06, | |
| "loss": 0.3166, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.3045843045843046, | |
| "grad_norm": 0.9717866778373718, | |
| "learning_rate": 9.186046511627908e-06, | |
| "loss": 0.3144, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.3076923076923077, | |
| "grad_norm": 0.9905857443809509, | |
| "learning_rate": 9.175475687103595e-06, | |
| "loss": 0.3263, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.3108003108003108, | |
| "grad_norm": 1.0447399616241455, | |
| "learning_rate": 9.164904862579282e-06, | |
| "loss": 0.3074, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3139083139083139, | |
| "grad_norm": 0.9876366853713989, | |
| "learning_rate": 9.154334038054969e-06, | |
| "loss": 0.3221, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.317016317016317, | |
| "grad_norm": 1.3406106233596802, | |
| "learning_rate": 9.143763213530656e-06, | |
| "loss": 0.3209, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.3201243201243201, | |
| "grad_norm": 1.1402978897094727, | |
| "learning_rate": 9.133192389006343e-06, | |
| "loss": 0.3181, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.32323232323232326, | |
| "grad_norm": 1.0274314880371094, | |
| "learning_rate": 9.12262156448203e-06, | |
| "loss": 0.3179, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.32634032634032634, | |
| "grad_norm": 1.0853135585784912, | |
| "learning_rate": 9.112050739957717e-06, | |
| "loss": 0.3068, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.32944832944832947, | |
| "grad_norm": 0.9549627900123596, | |
| "learning_rate": 9.101479915433404e-06, | |
| "loss": 0.3058, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.33255633255633255, | |
| "grad_norm": 0.9081363081932068, | |
| "learning_rate": 9.090909090909091e-06, | |
| "loss": 0.305, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.3356643356643357, | |
| "grad_norm": 1.083267092704773, | |
| "learning_rate": 9.080338266384778e-06, | |
| "loss": 0.3293, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.33877233877233875, | |
| "grad_norm": 0.9146764278411865, | |
| "learning_rate": 9.069767441860465e-06, | |
| "loss": 0.3308, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.3418803418803419, | |
| "grad_norm": 0.8309290409088135, | |
| "learning_rate": 9.059196617336154e-06, | |
| "loss": 0.3219, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.34498834498834496, | |
| "grad_norm": 0.7540556788444519, | |
| "learning_rate": 9.048625792811841e-06, | |
| "loss": 0.3165, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.3480963480963481, | |
| "grad_norm": 0.7756165862083435, | |
| "learning_rate": 9.038054968287528e-06, | |
| "loss": 0.3201, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.35120435120435123, | |
| "grad_norm": 1.016161561012268, | |
| "learning_rate": 9.027484143763215e-06, | |
| "loss": 0.318, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.3543123543123543, | |
| "grad_norm": 1.1762275695800781, | |
| "learning_rate": 9.016913319238902e-06, | |
| "loss": 0.3071, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.35742035742035744, | |
| "grad_norm": 1.0186941623687744, | |
| "learning_rate": 9.006342494714589e-06, | |
| "loss": 0.3094, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.3605283605283605, | |
| "grad_norm": 1.3835426568984985, | |
| "learning_rate": 8.995771670190276e-06, | |
| "loss": 0.3203, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.36363636363636365, | |
| "grad_norm": 0.9151639938354492, | |
| "learning_rate": 8.985200845665963e-06, | |
| "loss": 0.3075, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.3667443667443667, | |
| "grad_norm": 0.9079708456993103, | |
| "learning_rate": 8.974630021141648e-06, | |
| "loss": 0.3111, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.36985236985236986, | |
| "grad_norm": 0.7135366201400757, | |
| "learning_rate": 8.964059196617337e-06, | |
| "loss": 0.3131, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.372960372960373, | |
| "grad_norm": 0.7310993671417236, | |
| "learning_rate": 8.953488372093024e-06, | |
| "loss": 0.3181, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.37606837606837606, | |
| "grad_norm": 0.9562262296676636, | |
| "learning_rate": 8.942917547568711e-06, | |
| "loss": 0.3114, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.3791763791763792, | |
| "grad_norm": 1.088692545890808, | |
| "learning_rate": 8.932346723044398e-06, | |
| "loss": 0.2985, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.3822843822843823, | |
| "grad_norm": 1.3334287405014038, | |
| "learning_rate": 8.921775898520085e-06, | |
| "loss": 0.3198, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.3853923853923854, | |
| "grad_norm": 1.1457082033157349, | |
| "learning_rate": 8.911205073995772e-06, | |
| "loss": 0.3027, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.3885003885003885, | |
| "grad_norm": 1.0944201946258545, | |
| "learning_rate": 8.90063424947146e-06, | |
| "loss": 0.3195, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.3916083916083916, | |
| "grad_norm": 1.679890513420105, | |
| "learning_rate": 8.890063424947146e-06, | |
| "loss": 0.3118, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.3947163947163947, | |
| "grad_norm": 1.0934737920761108, | |
| "learning_rate": 8.879492600422833e-06, | |
| "loss": 0.3125, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.3978243978243978, | |
| "grad_norm": 0.9423776865005493, | |
| "learning_rate": 8.86892177589852e-06, | |
| "loss": 0.3069, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.3978243978243978, | |
| "eval_loss": 0.33542340993881226, | |
| "eval_runtime": 147.0915, | |
| "eval_samples_per_second": 1.938, | |
| "eval_steps_per_second": 0.972, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.40093240093240096, | |
| "grad_norm": 1.373064637184143, | |
| "learning_rate": 8.858350951374208e-06, | |
| "loss": 0.3113, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.40404040404040403, | |
| "grad_norm": 0.9782734513282776, | |
| "learning_rate": 8.847780126849895e-06, | |
| "loss": 0.3176, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.40714840714840717, | |
| "grad_norm": 1.1988129615783691, | |
| "learning_rate": 8.837209302325582e-06, | |
| "loss": 0.3036, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.41025641025641024, | |
| "grad_norm": 1.3978164196014404, | |
| "learning_rate": 8.826638477801269e-06, | |
| "loss": 0.3067, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.4133644133644134, | |
| "grad_norm": 0.8266012072563171, | |
| "learning_rate": 8.816067653276956e-06, | |
| "loss": 0.3105, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.41647241647241645, | |
| "grad_norm": 1.0358003377914429, | |
| "learning_rate": 8.805496828752643e-06, | |
| "loss": 0.3176, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.4195804195804196, | |
| "grad_norm": 0.9363102316856384, | |
| "learning_rate": 8.79492600422833e-06, | |
| "loss": 0.3151, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.4226884226884227, | |
| "grad_norm": 0.9805242419242859, | |
| "learning_rate": 8.784355179704017e-06, | |
| "loss": 0.3164, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.4257964257964258, | |
| "grad_norm": 1.4923985004425049, | |
| "learning_rate": 8.773784355179706e-06, | |
| "loss": 0.3059, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.4289044289044289, | |
| "grad_norm": 1.7009886503219604, | |
| "learning_rate": 8.763213530655393e-06, | |
| "loss": 0.2937, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.432012432012432, | |
| "grad_norm": 0.8320425748825073, | |
| "learning_rate": 8.75264270613108e-06, | |
| "loss": 0.288, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.43512043512043513, | |
| "grad_norm": 1.3431979417800903, | |
| "learning_rate": 8.742071881606767e-06, | |
| "loss": 0.3063, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.4382284382284382, | |
| "grad_norm": 1.0519447326660156, | |
| "learning_rate": 8.731501057082454e-06, | |
| "loss": 0.3043, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.44133644133644134, | |
| "grad_norm": 1.0041645765304565, | |
| "learning_rate": 8.72093023255814e-06, | |
| "loss": 0.3207, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 1.176352620124817, | |
| "learning_rate": 8.710359408033828e-06, | |
| "loss": 0.3099, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.44755244755244755, | |
| "grad_norm": 0.8591434955596924, | |
| "learning_rate": 8.699788583509515e-06, | |
| "loss": 0.2913, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.4471858134155744, | |
| "grad_norm": 1.2351419925689697, | |
| "learning_rate": 8.689217758985202e-06, | |
| "loss": 0.3099, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.4502698535080956, | |
| "grad_norm": 1.8375589847564697, | |
| "learning_rate": 8.691099476439791e-06, | |
| "loss": 0.3092, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.4533538936006168, | |
| "grad_norm": 1.07125985622406, | |
| "learning_rate": 8.680628272251308e-06, | |
| "loss": 0.3016, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.456437933693138, | |
| "grad_norm": 1.1839478015899658, | |
| "learning_rate": 8.670157068062827e-06, | |
| "loss": 0.3003, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.45952197378565923, | |
| "grad_norm": 1.294833779335022, | |
| "learning_rate": 8.659685863874346e-06, | |
| "loss": 0.2972, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.4626060138781804, | |
| "grad_norm": 1.0540661811828613, | |
| "learning_rate": 8.649214659685865e-06, | |
| "loss": 0.2837, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4656900539707016, | |
| "grad_norm": 1.1067568063735962, | |
| "learning_rate": 8.638743455497383e-06, | |
| "loss": 0.2966, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.46877409406322285, | |
| "grad_norm": 0.9972389340400696, | |
| "learning_rate": 8.6282722513089e-06, | |
| "loss": 0.2934, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.471858134155744, | |
| "grad_norm": 1.1589370965957642, | |
| "learning_rate": 8.61780104712042e-06, | |
| "loss": 0.3026, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.47494217424826524, | |
| "grad_norm": 1.1224210262298584, | |
| "learning_rate": 8.607329842931938e-06, | |
| "loss": 0.3042, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.4780262143407864, | |
| "grad_norm": 1.3200238943099976, | |
| "learning_rate": 8.596858638743457e-06, | |
| "loss": 0.3124, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.4811102544333076, | |
| "grad_norm": 1.1300067901611328, | |
| "learning_rate": 8.586387434554974e-06, | |
| "loss": 0.3167, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.48419429452582885, | |
| "grad_norm": 0.9678866863250732, | |
| "learning_rate": 8.575916230366493e-06, | |
| "loss": 0.3039, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.48727833461835, | |
| "grad_norm": 0.9656190872192383, | |
| "learning_rate": 8.565445026178011e-06, | |
| "loss": 0.3067, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.49036237471087124, | |
| "grad_norm": 0.9618685245513916, | |
| "learning_rate": 8.55497382198953e-06, | |
| "loss": 0.2992, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.49344641480339246, | |
| "grad_norm": 1.1055867671966553, | |
| "learning_rate": 8.544502617801049e-06, | |
| "loss": 0.2986, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.49653045489591363, | |
| "grad_norm": 0.8761485815048218, | |
| "learning_rate": 8.534031413612566e-06, | |
| "loss": 0.3071, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.49961449498843485, | |
| "grad_norm": 1.0709651708602905, | |
| "learning_rate": 8.523560209424085e-06, | |
| "loss": 0.2965, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.5026985350809561, | |
| "grad_norm": 1.2407382726669312, | |
| "learning_rate": 8.513089005235604e-06, | |
| "loss": 0.3134, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.5057825751734772, | |
| "grad_norm": 1.46315598487854, | |
| "learning_rate": 8.502617801047122e-06, | |
| "loss": 0.2886, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.5088666152659984, | |
| "grad_norm": 1.2314726114273071, | |
| "learning_rate": 8.49214659685864e-06, | |
| "loss": 0.2902, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.5119506553585197, | |
| "grad_norm": 1.223716378211975, | |
| "learning_rate": 8.481675392670158e-06, | |
| "loss": 0.3088, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.5150346954510409, | |
| "grad_norm": 1.1966098546981812, | |
| "learning_rate": 8.471204188481677e-06, | |
| "loss": 0.3139, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.518118735543562, | |
| "grad_norm": 1.1182276010513306, | |
| "learning_rate": 8.460732984293194e-06, | |
| "loss": 0.3161, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.5212027756360833, | |
| "grad_norm": 1.1583510637283325, | |
| "learning_rate": 8.450261780104713e-06, | |
| "loss": 0.3041, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.5242868157286045, | |
| "grad_norm": 1.1864618062973022, | |
| "learning_rate": 8.439790575916232e-06, | |
| "loss": 0.3008, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5273708558211256, | |
| "grad_norm": 1.3757935762405396, | |
| "learning_rate": 8.429319371727749e-06, | |
| "loss": 0.2865, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.5304548959136469, | |
| "grad_norm": 1.4410743713378906, | |
| "learning_rate": 8.418848167539267e-06, | |
| "loss": 0.3081, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.5335389360061681, | |
| "grad_norm": 1.3494313955307007, | |
| "learning_rate": 8.408376963350786e-06, | |
| "loss": 0.2988, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.5366229760986893, | |
| "grad_norm": 1.3871009349822998, | |
| "learning_rate": 8.397905759162305e-06, | |
| "loss": 0.3045, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.5397070161912105, | |
| "grad_norm": 1.183766484260559, | |
| "learning_rate": 8.387434554973822e-06, | |
| "loss": 0.2969, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.5427910562837317, | |
| "grad_norm": 1.1075443029403687, | |
| "learning_rate": 8.37696335078534e-06, | |
| "loss": 0.2834, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.5458750963762529, | |
| "grad_norm": 1.3118195533752441, | |
| "learning_rate": 8.36649214659686e-06, | |
| "loss": 0.2945, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.5489591364687741, | |
| "grad_norm": 1.3226675987243652, | |
| "learning_rate": 8.356020942408377e-06, | |
| "loss": 0.3085, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.5520431765612953, | |
| "grad_norm": 1.1877515316009521, | |
| "learning_rate": 8.345549738219895e-06, | |
| "loss": 0.2757, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.5551272166538165, | |
| "grad_norm": 1.379599928855896, | |
| "learning_rate": 8.335078534031414e-06, | |
| "loss": 0.2968, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5582112567463376, | |
| "grad_norm": 1.2975775003433228, | |
| "learning_rate": 8.324607329842933e-06, | |
| "loss": 0.3074, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.5612952968388589, | |
| "grad_norm": 1.2829333543777466, | |
| "learning_rate": 8.31413612565445e-06, | |
| "loss": 0.3014, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.5643793369313801, | |
| "grad_norm": 1.4759114980697632, | |
| "learning_rate": 8.303664921465969e-06, | |
| "loss": 0.3014, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.5674633770239013, | |
| "grad_norm": 1.3108978271484375, | |
| "learning_rate": 8.293193717277488e-06, | |
| "loss": 0.2914, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.5705474171164225, | |
| "grad_norm": 1.271666407585144, | |
| "learning_rate": 8.282722513089005e-06, | |
| "loss": 0.305, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.5736314572089437, | |
| "grad_norm": 1.1115907430648804, | |
| "learning_rate": 8.272251308900523e-06, | |
| "loss": 0.2963, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.5767154973014649, | |
| "grad_norm": 1.089092493057251, | |
| "learning_rate": 8.261780104712042e-06, | |
| "loss": 0.303, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.5797995373939862, | |
| "grad_norm": 1.1514776945114136, | |
| "learning_rate": 8.251308900523561e-06, | |
| "loss": 0.3073, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.5828835774865073, | |
| "grad_norm": 1.1654891967773438, | |
| "learning_rate": 8.240837696335078e-06, | |
| "loss": 0.2883, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.5859676175790285, | |
| "grad_norm": 1.2040210962295532, | |
| "learning_rate": 8.230366492146597e-06, | |
| "loss": 0.295, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.5890516576715498, | |
| "grad_norm": 1.203511118888855, | |
| "learning_rate": 8.219895287958116e-06, | |
| "loss": 0.2795, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.5921356977640709, | |
| "grad_norm": 1.5743706226348877, | |
| "learning_rate": 8.209424083769634e-06, | |
| "loss": 0.3123, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.5921356977640709, | |
| "eval_loss": 0.3412991166114807, | |
| "eval_runtime": 149.387, | |
| "eval_samples_per_second": 1.928, | |
| "eval_steps_per_second": 0.964, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.5952197378565921, | |
| "grad_norm": 1.4109128713607788, | |
| "learning_rate": 8.198952879581153e-06, | |
| "loss": 0.2996, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.5983037779491134, | |
| "grad_norm": 1.3817074298858643, | |
| "learning_rate": 8.18848167539267e-06, | |
| "loss": 0.2964, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.6013878180416345, | |
| "grad_norm": 1.3587619066238403, | |
| "learning_rate": 8.178010471204189e-06, | |
| "loss": 0.3004, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.6044718581341557, | |
| "grad_norm": 1.502744197845459, | |
| "learning_rate": 8.167539267015708e-06, | |
| "loss": 0.2957, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.607555898226677, | |
| "grad_norm": 1.4416728019714355, | |
| "learning_rate": 8.157068062827227e-06, | |
| "loss": 0.2962, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.6106399383191982, | |
| "grad_norm": 2.2597157955169678, | |
| "learning_rate": 8.146596858638745e-06, | |
| "loss": 0.2853, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.6137239784117193, | |
| "grad_norm": 1.854837417602539, | |
| "learning_rate": 8.136125654450262e-06, | |
| "loss": 0.2918, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.6168080185042406, | |
| "grad_norm": 2.1409687995910645, | |
| "learning_rate": 8.125654450261781e-06, | |
| "loss": 0.3118, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6198920585967618, | |
| "grad_norm": 1.7128517627716064, | |
| "learning_rate": 8.1151832460733e-06, | |
| "loss": 0.2822, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.6229760986892829, | |
| "grad_norm": 1.4401497840881348, | |
| "learning_rate": 8.104712041884819e-06, | |
| "loss": 0.2802, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.6260601387818041, | |
| "grad_norm": 1.7307312488555908, | |
| "learning_rate": 8.094240837696336e-06, | |
| "loss": 0.2973, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.6291441788743254, | |
| "grad_norm": 1.263535737991333, | |
| "learning_rate": 8.083769633507855e-06, | |
| "loss": 0.3016, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.6322282189668466, | |
| "grad_norm": 1.4065901041030884, | |
| "learning_rate": 8.073298429319373e-06, | |
| "loss": 0.284, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.6353122590593677, | |
| "grad_norm": 1.6004809141159058, | |
| "learning_rate": 8.06282722513089e-06, | |
| "loss": 0.2908, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.638396299151889, | |
| "grad_norm": 1.458287239074707, | |
| "learning_rate": 8.05235602094241e-06, | |
| "loss": 0.2832, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.6414803392444102, | |
| "grad_norm": 1.8239188194274902, | |
| "learning_rate": 8.041884816753928e-06, | |
| "loss": 0.2993, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.6445643793369313, | |
| "grad_norm": 1.8187966346740723, | |
| "learning_rate": 8.031413612565445e-06, | |
| "loss": 0.311, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.6476484194294526, | |
| "grad_norm": 1.5089385509490967, | |
| "learning_rate": 8.020942408376964e-06, | |
| "loss": 0.2835, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6507324595219738, | |
| "grad_norm": 1.5591213703155518, | |
| "learning_rate": 8.010471204188483e-06, | |
| "loss": 0.2985, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.653816499614495, | |
| "grad_norm": 1.5221312046051025, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 0.2805, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.6569005397070162, | |
| "grad_norm": 1.8211005926132202, | |
| "learning_rate": 7.989528795811518e-06, | |
| "loss": 0.2728, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.6599845797995374, | |
| "grad_norm": 2.2500016689300537, | |
| "learning_rate": 7.979057591623037e-06, | |
| "loss": 0.2932, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.6630686198920586, | |
| "grad_norm": 1.7227460145950317, | |
| "learning_rate": 7.968586387434556e-06, | |
| "loss": 0.2927, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.6661526599845798, | |
| "grad_norm": 2.1821672916412354, | |
| "learning_rate": 7.958115183246073e-06, | |
| "loss": 0.2919, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.669236700077101, | |
| "grad_norm": 1.3368958234786987, | |
| "learning_rate": 7.947643979057592e-06, | |
| "loss": 0.2789, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.6723207401696222, | |
| "grad_norm": 1.4419403076171875, | |
| "learning_rate": 7.93717277486911e-06, | |
| "loss": 0.2876, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.6754047802621435, | |
| "grad_norm": 2.0355281829833984, | |
| "learning_rate": 7.92670157068063e-06, | |
| "loss": 0.3059, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.6784888203546646, | |
| "grad_norm": 1.7871628999710083, | |
| "learning_rate": 7.916230366492146e-06, | |
| "loss": 0.2804, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6815728604471858, | |
| "grad_norm": 1.8160405158996582, | |
| "learning_rate": 7.905759162303665e-06, | |
| "loss": 0.2842, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.6846569005397071, | |
| "grad_norm": 2.1498160362243652, | |
| "learning_rate": 7.895287958115184e-06, | |
| "loss": 0.2875, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.6877409406322282, | |
| "grad_norm": 1.9483954906463623, | |
| "learning_rate": 7.884816753926701e-06, | |
| "loss": 0.2874, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.6908249807247494, | |
| "grad_norm": 2.0145816802978516, | |
| "learning_rate": 7.87434554973822e-06, | |
| "loss": 0.2879, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.6939090208172706, | |
| "grad_norm": 1.680413007736206, | |
| "learning_rate": 7.863874345549739e-06, | |
| "loss": 0.2755, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.6969930609097919, | |
| "grad_norm": 1.5203242301940918, | |
| "learning_rate": 7.853403141361257e-06, | |
| "loss": 0.284, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.700077101002313, | |
| "grad_norm": 1.892943263053894, | |
| "learning_rate": 7.842931937172774e-06, | |
| "loss": 0.2799, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.7031611410948342, | |
| "grad_norm": 1.5476278066635132, | |
| "learning_rate": 7.832460732984293e-06, | |
| "loss": 0.2767, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.7062451811873555, | |
| "grad_norm": 2.2650210857391357, | |
| "learning_rate": 7.821989528795812e-06, | |
| "loss": 0.2905, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.7093292212798766, | |
| "grad_norm": 2.1595096588134766, | |
| "learning_rate": 7.81151832460733e-06, | |
| "loss": 0.274, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7124132613723978, | |
| "grad_norm": 1.587994933128357, | |
| "learning_rate": 7.80104712041885e-06, | |
| "loss": 0.2743, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.7154973014649191, | |
| "grad_norm": 1.9411978721618652, | |
| "learning_rate": 7.790575916230367e-06, | |
| "loss": 0.272, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.7185813415574402, | |
| "grad_norm": 2.1039252281188965, | |
| "learning_rate": 7.780104712041885e-06, | |
| "loss": 0.2884, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.7216653816499614, | |
| "grad_norm": 1.834591269493103, | |
| "learning_rate": 7.769633507853404e-06, | |
| "loss": 0.2756, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.7247494217424827, | |
| "grad_norm": 2.1758062839508057, | |
| "learning_rate": 7.759162303664923e-06, | |
| "loss": 0.287, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.7278334618350039, | |
| "grad_norm": 2.0601179599761963, | |
| "learning_rate": 7.748691099476442e-06, | |
| "loss": 0.2683, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.730917501927525, | |
| "grad_norm": 1.7605801820755005, | |
| "learning_rate": 7.738219895287959e-06, | |
| "loss": 0.2552, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.7340015420200463, | |
| "grad_norm": 2.0951759815216064, | |
| "learning_rate": 7.727748691099478e-06, | |
| "loss": 0.258, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.7370855821125675, | |
| "grad_norm": 2.2250118255615234, | |
| "learning_rate": 7.717277486910996e-06, | |
| "loss": 0.2627, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.7401696222050886, | |
| "grad_norm": 2.54436993598938, | |
| "learning_rate": 7.706806282722513e-06, | |
| "loss": 0.278, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7432536622976099, | |
| "grad_norm": 1.810699701309204, | |
| "learning_rate": 7.696335078534032e-06, | |
| "loss": 0.2684, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.7463377023901311, | |
| "grad_norm": 2.161043882369995, | |
| "learning_rate": 7.685863874345551e-06, | |
| "loss": 0.2828, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.7494217424826523, | |
| "grad_norm": 1.7965888977050781, | |
| "learning_rate": 7.67539267015707e-06, | |
| "loss": 0.2677, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.7525057825751735, | |
| "grad_norm": 1.9139559268951416, | |
| "learning_rate": 7.664921465968587e-06, | |
| "loss": 0.2701, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.7555898226676947, | |
| "grad_norm": 2.0285589694976807, | |
| "learning_rate": 7.654450261780106e-06, | |
| "loss": 0.2726, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.7586738627602159, | |
| "grad_norm": 2.2968027591705322, | |
| "learning_rate": 7.643979057591624e-06, | |
| "loss": 0.2606, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.761757902852737, | |
| "grad_norm": 2.4324936866760254, | |
| "learning_rate": 7.633507853403141e-06, | |
| "loss": 0.2659, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.7648419429452583, | |
| "grad_norm": 2.66330885887146, | |
| "learning_rate": 7.62303664921466e-06, | |
| "loss": 0.2627, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.7679259830377795, | |
| "grad_norm": 2.435866355895996, | |
| "learning_rate": 7.612565445026179e-06, | |
| "loss": 0.2713, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.7710100231303006, | |
| "grad_norm": 2.2584385871887207, | |
| "learning_rate": 7.602094240837698e-06, | |
| "loss": 0.2754, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7740940632228219, | |
| "grad_norm": 2.1898317337036133, | |
| "learning_rate": 7.591623036649215e-06, | |
| "loss": 0.2705, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.7771781033153431, | |
| "grad_norm": 2.051255464553833, | |
| "learning_rate": 7.5811518324607335e-06, | |
| "loss": 0.2491, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.7802621434078643, | |
| "grad_norm": 2.353940725326538, | |
| "learning_rate": 7.570680628272252e-06, | |
| "loss": 0.277, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.7833461835003855, | |
| "grad_norm": 2.3826687335968018, | |
| "learning_rate": 7.560209424083769e-06, | |
| "loss": 0.2693, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.7864302235929067, | |
| "grad_norm": 2.522019863128662, | |
| "learning_rate": 7.549738219895288e-06, | |
| "loss": 0.2706, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.7895142636854279, | |
| "grad_norm": 2.3525524139404297, | |
| "learning_rate": 7.539267015706807e-06, | |
| "loss": 0.2509, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.7895142636854279, | |
| "eval_loss": 0.3851300776004791, | |
| "eval_runtime": 149.046, | |
| "eval_samples_per_second": 1.932, | |
| "eval_steps_per_second": 0.966, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.7925983037779492, | |
| "grad_norm": 2.7143642902374268, | |
| "learning_rate": 7.528795811518326e-06, | |
| "loss": 0.2701, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.7956823438704703, | |
| "grad_norm": 2.6725356578826904, | |
| "learning_rate": 7.518324607329844e-06, | |
| "loss": 0.2718, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.7987663839629915, | |
| "grad_norm": 2.4051880836486816, | |
| "learning_rate": 7.5078534031413615e-06, | |
| "loss": 0.2554, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.8018504240555128, | |
| "grad_norm": 2.472904920578003, | |
| "learning_rate": 7.49738219895288e-06, | |
| "loss": 0.2666, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.8049344641480339, | |
| "grad_norm": 2.3598804473876953, | |
| "learning_rate": 7.486910994764398e-06, | |
| "loss": 0.2532, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.8080185042405551, | |
| "grad_norm": 2.383300542831421, | |
| "learning_rate": 7.476439790575917e-06, | |
| "loss": 0.2568, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.8111025443330764, | |
| "grad_norm": 2.999469518661499, | |
| "learning_rate": 7.465968586387436e-06, | |
| "loss": 0.2403, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.8141865844255975, | |
| "grad_norm": 4.071384429931641, | |
| "learning_rate": 7.455497382198954e-06, | |
| "loss": 0.265, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.8172706245181187, | |
| "grad_norm": 3.5529489517211914, | |
| "learning_rate": 7.445026178010472e-06, | |
| "loss": 0.2647, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.8203546646106399, | |
| "grad_norm": 2.8842644691467285, | |
| "learning_rate": 7.43455497382199e-06, | |
| "loss": 0.2725, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.8234387047031612, | |
| "grad_norm": 2.1277332305908203, | |
| "learning_rate": 7.424083769633509e-06, | |
| "loss": 0.2657, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.8265227447956823, | |
| "grad_norm": 2.832111358642578, | |
| "learning_rate": 7.413612565445026e-06, | |
| "loss": 0.255, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.8296067848882035, | |
| "grad_norm": 2.7438676357269287, | |
| "learning_rate": 7.403141361256545e-06, | |
| "loss": 0.2596, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.8326908249807248, | |
| "grad_norm": 2.7950987815856934, | |
| "learning_rate": 7.392670157068064e-06, | |
| "loss": 0.2624, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.8357748650732459, | |
| "grad_norm": 3.497069835662842, | |
| "learning_rate": 7.382198952879581e-06, | |
| "loss": 0.2385, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.8388589051657671, | |
| "grad_norm": 5.024068832397461, | |
| "learning_rate": 7.3717277486911e-06, | |
| "loss": 0.2526, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.8419429452582884, | |
| "grad_norm": 3.5298011302948, | |
| "learning_rate": 7.361256544502618e-06, | |
| "loss": 0.2452, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.8450269853508096, | |
| "grad_norm": 2.701545238494873, | |
| "learning_rate": 7.350785340314137e-06, | |
| "loss": 0.2293, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.8481110254433307, | |
| "grad_norm": 2.838541030883789, | |
| "learning_rate": 7.340314136125655e-06, | |
| "loss": 0.2554, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.851195065535852, | |
| "grad_norm": 2.5854012966156006, | |
| "learning_rate": 7.329842931937173e-06, | |
| "loss": 0.245, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.8542791056283732, | |
| "grad_norm": 2.9351906776428223, | |
| "learning_rate": 7.319371727748692e-06, | |
| "loss": 0.2556, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.8573631457208943, | |
| "grad_norm": 3.0675830841064453, | |
| "learning_rate": 7.30890052356021e-06, | |
| "loss": 0.2501, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.8604471858134156, | |
| "grad_norm": 3.1958088874816895, | |
| "learning_rate": 7.2984293193717285e-06, | |
| "loss": 0.2347, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.8635312259059368, | |
| "grad_norm": 3.0006463527679443, | |
| "learning_rate": 7.287958115183246e-06, | |
| "loss": 0.242, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.866615265998458, | |
| "grad_norm": 2.862990379333496, | |
| "learning_rate": 7.277486910994765e-06, | |
| "loss": 0.2442, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.8696993060909792, | |
| "grad_norm": 3.1585986614227295, | |
| "learning_rate": 7.267015706806283e-06, | |
| "loss": 0.2401, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.8727833461835004, | |
| "grad_norm": 2.6111812591552734, | |
| "learning_rate": 7.256544502617802e-06, | |
| "loss": 0.2324, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.8758673862760216, | |
| "grad_norm": 3.1289191246032715, | |
| "learning_rate": 7.246073298429321e-06, | |
| "loss": 0.2426, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.8789514263685428, | |
| "grad_norm": 3.448789358139038, | |
| "learning_rate": 7.235602094240838e-06, | |
| "loss": 0.2224, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.882035466461064, | |
| "grad_norm": 3.018432855606079, | |
| "learning_rate": 7.2251308900523565e-06, | |
| "loss": 0.2238, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.8851195065535852, | |
| "grad_norm": 4.171509742736816, | |
| "learning_rate": 7.214659685863875e-06, | |
| "loss": 0.2546, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.8882035466461063, | |
| "grad_norm": 3.5390446186065674, | |
| "learning_rate": 7.204188481675394e-06, | |
| "loss": 0.2417, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.8912875867386276, | |
| "grad_norm": 2.8169162273406982, | |
| "learning_rate": 7.193717277486911e-06, | |
| "loss": 0.2348, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.8943716268311488, | |
| "grad_norm": 2.9175827503204346, | |
| "learning_rate": 7.18324607329843e-06, | |
| "loss": 0.214, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.89745566692367, | |
| "grad_norm": 3.939680576324463, | |
| "learning_rate": 7.172774869109949e-06, | |
| "loss": 0.2489, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.9005397070161912, | |
| "grad_norm": 2.874373435974121, | |
| "learning_rate": 7.162303664921466e-06, | |
| "loss": 0.2219, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.9036237471087124, | |
| "grad_norm": 4.381021976470947, | |
| "learning_rate": 7.1518324607329845e-06, | |
| "loss": 0.2419, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.9067077872012336, | |
| "grad_norm": 3.9895918369293213, | |
| "learning_rate": 7.141361256544503e-06, | |
| "loss": 0.2552, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.9097918272937549, | |
| "grad_norm": 2.9028842449188232, | |
| "learning_rate": 7.130890052356022e-06, | |
| "loss": 0.2323, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.912875867386276, | |
| "grad_norm": 3.5980117321014404, | |
| "learning_rate": 7.12041884816754e-06, | |
| "loss": 0.2404, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.9159599074787972, | |
| "grad_norm": 3.490727186203003, | |
| "learning_rate": 7.109947643979058e-06, | |
| "loss": 0.22, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.9190439475713185, | |
| "grad_norm": 3.256279706954956, | |
| "learning_rate": 7.099476439790577e-06, | |
| "loss": 0.2368, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.9221279876638396, | |
| "grad_norm": 3.92038893699646, | |
| "learning_rate": 7.089005235602095e-06, | |
| "loss": 0.2331, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.9252120277563608, | |
| "grad_norm": 3.6917364597320557, | |
| "learning_rate": 7.078534031413613e-06, | |
| "loss": 0.2139, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9282960678488821, | |
| "grad_norm": 3.058729887008667, | |
| "learning_rate": 7.068062827225132e-06, | |
| "loss": 0.2199, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.9313801079414032, | |
| "grad_norm": 3.150188446044922, | |
| "learning_rate": 7.057591623036649e-06, | |
| "loss": 0.2137, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.9344641480339244, | |
| "grad_norm": 5.77610445022583, | |
| "learning_rate": 7.047120418848168e-06, | |
| "loss": 0.2478, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.9375481881264457, | |
| "grad_norm": 2.8851089477539062, | |
| "learning_rate": 7.036649214659687e-06, | |
| "loss": 0.227, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.9406322282189669, | |
| "grad_norm": 3.1656086444854736, | |
| "learning_rate": 7.0261780104712055e-06, | |
| "loss": 0.2335, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.943716268311488, | |
| "grad_norm": 3.3355696201324463, | |
| "learning_rate": 7.015706806282723e-06, | |
| "loss": 0.2169, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.9468003084040093, | |
| "grad_norm": 3.5095317363739014, | |
| "learning_rate": 7.005235602094241e-06, | |
| "loss": 0.2161, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.9498843484965305, | |
| "grad_norm": 3.5365262031555176, | |
| "learning_rate": 6.99476439790576e-06, | |
| "loss": 0.2097, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.9529683885890516, | |
| "grad_norm": 4.159248352050781, | |
| "learning_rate": 6.984293193717277e-06, | |
| "loss": 0.2337, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.9560524286815728, | |
| "grad_norm": 2.9792213439941406, | |
| "learning_rate": 6.973821989528796e-06, | |
| "loss": 0.2149, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.9591364687740941, | |
| "grad_norm": 3.2603046894073486, | |
| "learning_rate": 6.963350785340315e-06, | |
| "loss": 0.2218, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.9622205088666153, | |
| "grad_norm": 3.5064327716827393, | |
| "learning_rate": 6.9528795811518335e-06, | |
| "loss": 0.2128, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.9653045489591364, | |
| "grad_norm": 3.971139430999756, | |
| "learning_rate": 6.942408376963351e-06, | |
| "loss": 0.2172, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.9683885890516577, | |
| "grad_norm": 3.651603937149048, | |
| "learning_rate": 6.931937172774869e-06, | |
| "loss": 0.2036, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.9714726291441789, | |
| "grad_norm": 5.394900321960449, | |
| "learning_rate": 6.921465968586388e-06, | |
| "loss": 0.2157, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.9745566692367, | |
| "grad_norm": 3.7696452140808105, | |
| "learning_rate": 6.910994764397906e-06, | |
| "loss": 0.2168, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.9776407093292213, | |
| "grad_norm": 3.3137505054473877, | |
| "learning_rate": 6.900523560209425e-06, | |
| "loss": 0.2217, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.9807247494217425, | |
| "grad_norm": 3.927021026611328, | |
| "learning_rate": 6.890052356020943e-06, | |
| "loss": 0.2149, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.9838087895142636, | |
| "grad_norm": 3.598501443862915, | |
| "learning_rate": 6.8795811518324615e-06, | |
| "loss": 0.2007, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.9868928296067849, | |
| "grad_norm": 4.063229084014893, | |
| "learning_rate": 6.8691099476439794e-06, | |
| "loss": 0.2142, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.9868928296067849, | |
| "eval_loss": 0.46243318915367126, | |
| "eval_runtime": 150.4594, | |
| "eval_samples_per_second": 1.914, | |
| "eval_steps_per_second": 0.957, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.9899768696993061, | |
| "grad_norm": 4.520982265472412, | |
| "learning_rate": 6.858638743455498e-06, | |
| "loss": 0.1978, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.9930609097918273, | |
| "grad_norm": 3.6312687397003174, | |
| "learning_rate": 6.848167539267017e-06, | |
| "loss": 0.1896, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.9961449498843485, | |
| "grad_norm": 3.1252243518829346, | |
| "learning_rate": 6.837696335078534e-06, | |
| "loss": 0.1817, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.9992289899768697, | |
| "grad_norm": 4.3829264640808105, | |
| "learning_rate": 6.827225130890053e-06, | |
| "loss": 0.2199, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 1.0030840400925212, | |
| "grad_norm": 9.755841255187988, | |
| "learning_rate": 6.816753926701572e-06, | |
| "loss": 0.4578, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.0061680801850423, | |
| "grad_norm": 3.9052581787109375, | |
| "learning_rate": 6.80628272251309e-06, | |
| "loss": 0.1959, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 1.0092521202775635, | |
| "grad_norm": 3.6258931159973145, | |
| "learning_rate": 6.7958115183246075e-06, | |
| "loss": 0.2062, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 1.012336160370085, | |
| "grad_norm": 4.131122589111328, | |
| "learning_rate": 6.785340314136126e-06, | |
| "loss": 0.1915, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 1.015420200462606, | |
| "grad_norm": 4.387429237365723, | |
| "learning_rate": 6.774869109947645e-06, | |
| "loss": 0.1792, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.0185042405551272, | |
| "grad_norm": 3.873361110687256, | |
| "learning_rate": 6.764397905759162e-06, | |
| "loss": 0.1895, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.0215882806476484, | |
| "grad_norm": 4.318599700927734, | |
| "learning_rate": 6.753926701570681e-06, | |
| "loss": 0.1836, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.0246723207401696, | |
| "grad_norm": 4.9434494972229, | |
| "learning_rate": 6.7434554973822e-06, | |
| "loss": 0.2199, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.0277563608326907, | |
| "grad_norm": 3.8584797382354736, | |
| "learning_rate": 6.732984293193718e-06, | |
| "loss": 0.1796, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.0308404009252121, | |
| "grad_norm": 4.104945659637451, | |
| "learning_rate": 6.722513089005236e-06, | |
| "loss": 0.1812, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.0339244410177333, | |
| "grad_norm": 4.125020503997803, | |
| "learning_rate": 6.712041884816754e-06, | |
| "loss": 0.197, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.0370084811102545, | |
| "grad_norm": 3.783364772796631, | |
| "learning_rate": 6.701570680628273e-06, | |
| "loss": 0.1798, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.0400925212027756, | |
| "grad_norm": 4.799828052520752, | |
| "learning_rate": 6.691099476439791e-06, | |
| "loss": 0.1837, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.0431765612952968, | |
| "grad_norm": 5.570056438446045, | |
| "learning_rate": 6.68062827225131e-06, | |
| "loss": 0.1987, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.046260601387818, | |
| "grad_norm": 3.9299843311309814, | |
| "learning_rate": 6.670157068062828e-06, | |
| "loss": 0.1728, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.0493446414803393, | |
| "grad_norm": 4.746124267578125, | |
| "learning_rate": 6.6596858638743455e-06, | |
| "loss": 0.2055, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.0524286815728605, | |
| "grad_norm": 3.6969268321990967, | |
| "learning_rate": 6.649214659685864e-06, | |
| "loss": 0.1919, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 1.0555127216653817, | |
| "grad_norm": 4.096460819244385, | |
| "learning_rate": 6.638743455497383e-06, | |
| "loss": 0.1725, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.0585967617579028, | |
| "grad_norm": 3.819343328475952, | |
| "learning_rate": 6.628272251308902e-06, | |
| "loss": 0.1727, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.061680801850424, | |
| "grad_norm": 4.487940788269043, | |
| "learning_rate": 6.617801047120419e-06, | |
| "loss": 0.176, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.0647648419429452, | |
| "grad_norm": 4.727810382843018, | |
| "learning_rate": 6.607329842931938e-06, | |
| "loss": 0.1694, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.0678488820354666, | |
| "grad_norm": 5.403895854949951, | |
| "learning_rate": 6.5968586387434565e-06, | |
| "loss": 0.1853, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.0709329221279877, | |
| "grad_norm": 3.548576831817627, | |
| "learning_rate": 6.5863874345549736e-06, | |
| "loss": 0.1711, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.074016962220509, | |
| "grad_norm": 3.6849658489227295, | |
| "learning_rate": 6.575916230366492e-06, | |
| "loss": 0.1877, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.07710100231303, | |
| "grad_norm": 3.7493557929992676, | |
| "learning_rate": 6.565445026178011e-06, | |
| "loss": 0.1858, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.0801850424055512, | |
| "grad_norm": 3.9486773014068604, | |
| "learning_rate": 6.55497382198953e-06, | |
| "loss": 0.1515, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.0832690824980724, | |
| "grad_norm": 4.970436096191406, | |
| "learning_rate": 6.544502617801047e-06, | |
| "loss": 0.172, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.0863531225905936, | |
| "grad_norm": 5.032225131988525, | |
| "learning_rate": 6.534031413612566e-06, | |
| "loss": 0.1611, | |
| "step": 352 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 975, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 16, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.458954269238886e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |