{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0863531225905936, "eval_steps": 64, "global_step": 352, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003108003108003108, "grad_norm": 10.027831077575684, "learning_rate": 0.0, "loss": 0.6767, "step": 1 }, { "epoch": 0.006216006216006216, "grad_norm": 9.679778099060059, "learning_rate": 5.000000000000001e-07, "loss": 0.6644, "step": 2 }, { "epoch": 0.009324009324009324, "grad_norm": 10.520271301269531, "learning_rate": 1.0000000000000002e-06, "loss": 0.6934, "step": 3 }, { "epoch": 0.012432012432012432, "grad_norm": 8.677583694458008, "learning_rate": 1.5e-06, "loss": 0.6617, "step": 4 }, { "epoch": 0.01554001554001554, "grad_norm": 6.502548694610596, "learning_rate": 2.0000000000000003e-06, "loss": 0.6509, "step": 5 }, { "epoch": 0.018648018648018648, "grad_norm": 4.257171154022217, "learning_rate": 2.5e-06, "loss": 0.639, "step": 6 }, { "epoch": 0.021756021756021756, "grad_norm": 3.460066556930542, "learning_rate": 3e-06, "loss": 0.6286, "step": 7 }, { "epoch": 0.024864024864024864, "grad_norm": 3.0126283168792725, "learning_rate": 3.5e-06, "loss": 0.5948, "step": 8 }, { "epoch": 0.027972027972027972, "grad_norm": 2.567995309829712, "learning_rate": 4.000000000000001e-06, "loss": 0.5744, "step": 9 }, { "epoch": 0.03108003108003108, "grad_norm": 2.516597032546997, "learning_rate": 4.5e-06, "loss": 0.5496, "step": 10 }, { "epoch": 0.03418803418803419, "grad_norm": 1.8187586069107056, "learning_rate": 5e-06, "loss": 0.5397, "step": 11 }, { "epoch": 0.037296037296037296, "grad_norm": 1.7935529947280884, "learning_rate": 5.500000000000001e-06, "loss": 0.5229, "step": 12 }, { "epoch": 0.04040404040404041, "grad_norm": 1.8665963411331177, "learning_rate": 6e-06, "loss": 0.5227, "step": 13 }, { "epoch": 0.04351204351204351, "grad_norm": 2.0106680393218994, "learning_rate": 6.5000000000000004e-06, "loss": 0.4882, "step": 14 }, { "epoch": 0.046620046620046623, "grad_norm": 3.305211305618286, "learning_rate": 7e-06, "loss": 0.4772, "step": 15 }, { "epoch": 0.04972804972804973, "grad_norm": 3.047219753265381, "learning_rate": 7.500000000000001e-06, "loss": 0.452, "step": 16 }, { "epoch": 0.05283605283605284, "grad_norm": 2.5453591346740723, "learning_rate": 8.000000000000001e-06, "loss": 0.4138, "step": 17 }, { "epoch": 0.055944055944055944, "grad_norm": 5.414841175079346, "learning_rate": 8.5e-06, "loss": 0.4238, "step": 18 }, { "epoch": 0.059052059052059055, "grad_norm": 2.979440927505493, "learning_rate": 9e-06, "loss": 0.3987, "step": 19 }, { "epoch": 0.06216006216006216, "grad_norm": 1.981175422668457, "learning_rate": 9.5e-06, "loss": 0.3874, "step": 20 }, { "epoch": 0.06526806526806526, "grad_norm": 1.7793089151382446, "learning_rate": 1e-05, "loss": 0.3631, "step": 21 }, { "epoch": 0.06837606837606838, "grad_norm": 1.1854480504989624, "learning_rate": 9.989429175475688e-06, "loss": 0.3765, "step": 22 }, { "epoch": 0.07148407148407149, "grad_norm": 0.8928348422050476, "learning_rate": 9.978858350951375e-06, "loss": 0.3481, "step": 23 }, { "epoch": 0.07459207459207459, "grad_norm": 1.7531942129135132, "learning_rate": 9.968287526427062e-06, "loss": 0.3693, "step": 24 }, { "epoch": 0.0777000777000777, "grad_norm": 1.0829464197158813, "learning_rate": 9.957716701902749e-06, "loss": 0.3644, "step": 25 }, { "epoch": 0.08080808080808081, "grad_norm": 0.98089200258255, "learning_rate": 9.947145877378436e-06, "loss": 0.3616, "step": 26 }, { "epoch": 0.08391608391608392, "grad_norm": 0.795221745967865, "learning_rate": 9.936575052854123e-06, "loss": 0.3679, "step": 27 }, { "epoch": 0.08702408702408702, "grad_norm": 1.091843605041504, "learning_rate": 9.92600422832981e-06, "loss": 0.3439, "step": 28 }, { "epoch": 0.09013209013209013, "grad_norm": 0.8538377285003662, "learning_rate": 9.915433403805497e-06, "loss": 0.3401, "step": 29 }, { "epoch": 0.09324009324009325, "grad_norm": 0.9114591479301453, "learning_rate": 9.904862579281184e-06, "loss": 0.3515, "step": 30 }, { "epoch": 0.09634809634809635, "grad_norm": 0.9083001017570496, "learning_rate": 9.894291754756871e-06, "loss": 0.3449, "step": 31 }, { "epoch": 0.09945609945609946, "grad_norm": 0.9144365787506104, "learning_rate": 9.883720930232558e-06, "loss": 0.3393, "step": 32 }, { "epoch": 0.10256410256410256, "grad_norm": 1.0221809148788452, "learning_rate": 9.873150105708245e-06, "loss": 0.353, "step": 33 }, { "epoch": 0.10567210567210568, "grad_norm": 1.0219439268112183, "learning_rate": 9.862579281183932e-06, "loss": 0.3439, "step": 34 }, { "epoch": 0.10878010878010878, "grad_norm": 1.5430618524551392, "learning_rate": 9.852008456659621e-06, "loss": 0.3338, "step": 35 }, { "epoch": 0.11188811188811189, "grad_norm": 1.4754544496536255, "learning_rate": 9.841437632135308e-06, "loss": 0.3363, "step": 36 }, { "epoch": 0.11499611499611499, "grad_norm": 1.1298989057540894, "learning_rate": 9.830866807610995e-06, "loss": 0.3423, "step": 37 }, { "epoch": 0.11810411810411811, "grad_norm": 1.0130062103271484, "learning_rate": 9.820295983086682e-06, "loss": 0.3298, "step": 38 }, { "epoch": 0.12121212121212122, "grad_norm": 1.8003513813018799, "learning_rate": 9.80972515856237e-06, "loss": 0.3272, "step": 39 }, { "epoch": 0.12432012432012432, "grad_norm": 0.9532265067100525, "learning_rate": 9.799154334038056e-06, "loss": 0.3282, "step": 40 }, { "epoch": 0.12742812742812742, "grad_norm": 1.5232913494110107, "learning_rate": 9.788583509513743e-06, "loss": 0.3469, "step": 41 }, { "epoch": 0.13053613053613053, "grad_norm": 0.8918169736862183, "learning_rate": 9.77801268498943e-06, "loss": 0.326, "step": 42 }, { "epoch": 0.13364413364413363, "grad_norm": 0.8845950365066528, "learning_rate": 9.767441860465117e-06, "loss": 0.3313, "step": 43 }, { "epoch": 0.13675213675213677, "grad_norm": 0.8410794138908386, "learning_rate": 9.756871035940804e-06, "loss": 0.3318, "step": 44 }, { "epoch": 0.13986013986013987, "grad_norm": 0.7157808542251587, "learning_rate": 9.746300211416491e-06, "loss": 0.3381, "step": 45 }, { "epoch": 0.14296814296814297, "grad_norm": 1.1680670976638794, "learning_rate": 9.735729386892178e-06, "loss": 0.3281, "step": 46 }, { "epoch": 0.14607614607614608, "grad_norm": 0.9500836133956909, "learning_rate": 9.725158562367865e-06, "loss": 0.336, "step": 47 }, { "epoch": 0.14918414918414918, "grad_norm": 0.8565309643745422, "learning_rate": 9.714587737843552e-06, "loss": 0.3207, "step": 48 }, { "epoch": 0.1522921522921523, "grad_norm": 1.1311777830123901, "learning_rate": 9.70401691331924e-06, "loss": 0.3339, "step": 49 }, { "epoch": 0.1554001554001554, "grad_norm": 1.0368160009384155, "learning_rate": 9.693446088794927e-06, "loss": 0.3262, "step": 50 }, { "epoch": 0.1585081585081585, "grad_norm": 0.9648517370223999, "learning_rate": 9.682875264270614e-06, "loss": 0.3376, "step": 51 }, { "epoch": 0.16161616161616163, "grad_norm": 1.1039059162139893, "learning_rate": 9.6723044397463e-06, "loss": 0.3352, "step": 52 }, { "epoch": 0.16472416472416473, "grad_norm": 1.0544918775558472, "learning_rate": 9.661733615221988e-06, "loss": 0.3237, "step": 53 }, { "epoch": 0.16783216783216784, "grad_norm": 1.533158302307129, "learning_rate": 9.651162790697676e-06, "loss": 0.3287, "step": 54 }, { "epoch": 0.17094017094017094, "grad_norm": 1.2342826128005981, "learning_rate": 9.640591966173363e-06, "loss": 0.3162, "step": 55 }, { "epoch": 0.17404817404817405, "grad_norm": 1.0702942609786987, "learning_rate": 9.63002114164905e-06, "loss": 0.3143, "step": 56 }, { "epoch": 0.17715617715617715, "grad_norm": 1.02211594581604, "learning_rate": 9.619450317124736e-06, "loss": 0.3318, "step": 57 }, { "epoch": 0.18026418026418026, "grad_norm": 0.8379388451576233, "learning_rate": 9.608879492600423e-06, "loss": 0.3239, "step": 58 }, { "epoch": 0.18337218337218336, "grad_norm": 0.9620960354804993, "learning_rate": 9.59830866807611e-06, "loss": 0.3246, "step": 59 }, { "epoch": 0.1864801864801865, "grad_norm": 0.9239097833633423, "learning_rate": 9.587737843551797e-06, "loss": 0.3278, "step": 60 }, { "epoch": 0.1895881895881896, "grad_norm": 0.7097995281219482, "learning_rate": 9.577167019027484e-06, "loss": 0.3152, "step": 61 }, { "epoch": 0.1926961926961927, "grad_norm": 0.9077997803688049, "learning_rate": 9.566596194503171e-06, "loss": 0.3219, "step": 62 }, { "epoch": 0.1958041958041958, "grad_norm": 0.8704112768173218, "learning_rate": 9.55602536997886e-06, "loss": 0.3262, "step": 63 }, { "epoch": 0.1989121989121989, "grad_norm": 0.9264605641365051, "learning_rate": 9.545454545454547e-06, "loss": 0.3176, "step": 64 }, { "epoch": 0.1989121989121989, "eval_loss": 0.3377174139022827, "eval_runtime": 149.1316, "eval_samples_per_second": 1.911, "eval_steps_per_second": 0.959, "step": 64 }, { "epoch": 0.20202020202020202, "grad_norm": 0.9881049394607544, "learning_rate": 9.534883720930234e-06, "loss": 0.3312, "step": 65 }, { "epoch": 0.20512820512820512, "grad_norm": 1.1825007200241089, "learning_rate": 9.524312896405921e-06, "loss": 0.3189, "step": 66 }, { "epoch": 0.20823620823620823, "grad_norm": 0.8272495865821838, "learning_rate": 9.513742071881608e-06, "loss": 0.3293, "step": 67 }, { "epoch": 0.21134421134421136, "grad_norm": 1.0992769002914429, "learning_rate": 9.503171247357295e-06, "loss": 0.3119, "step": 68 }, { "epoch": 0.21445221445221446, "grad_norm": 0.9182390570640564, "learning_rate": 9.492600422832982e-06, "loss": 0.331, "step": 69 }, { "epoch": 0.21756021756021757, "grad_norm": 0.8677308559417725, "learning_rate": 9.482029598308669e-06, "loss": 0.3168, "step": 70 }, { "epoch": 0.22066822066822067, "grad_norm": 1.2915256023406982, "learning_rate": 9.471458773784356e-06, "loss": 0.3181, "step": 71 }, { "epoch": 0.22377622377622378, "grad_norm": 1.6176910400390625, "learning_rate": 9.460887949260043e-06, "loss": 0.3254, "step": 72 }, { "epoch": 0.22688422688422688, "grad_norm": 0.6357202529907227, "learning_rate": 9.45031712473573e-06, "loss": 0.3298, "step": 73 }, { "epoch": 0.22999222999222999, "grad_norm": 0.911662220954895, "learning_rate": 9.439746300211417e-06, "loss": 0.3248, "step": 74 }, { "epoch": 0.2331002331002331, "grad_norm": 0.7426556944847107, "learning_rate": 9.429175475687104e-06, "loss": 0.3301, "step": 75 }, { "epoch": 0.23620823620823622, "grad_norm": 0.7509779930114746, "learning_rate": 9.418604651162791e-06, "loss": 0.3209, "step": 76 }, { "epoch": 0.23931623931623933, "grad_norm": 0.7699870467185974, "learning_rate": 9.408033826638478e-06, "loss": 0.3171, "step": 77 }, { "epoch": 0.24242424242424243, "grad_norm": 0.7583193182945251, "learning_rate": 9.397463002114165e-06, "loss": 0.3128, "step": 78 }, { "epoch": 0.24553224553224554, "grad_norm": 0.968973696231842, "learning_rate": 9.386892177589852e-06, "loss": 0.3293, "step": 79 }, { "epoch": 0.24864024864024864, "grad_norm": 0.9967902302742004, "learning_rate": 9.37632135306554e-06, "loss": 0.3209, "step": 80 }, { "epoch": 0.2517482517482518, "grad_norm": 0.7837809920310974, "learning_rate": 9.365750528541226e-06, "loss": 0.3152, "step": 81 }, { "epoch": 0.25485625485625485, "grad_norm": 1.6905367374420166, "learning_rate": 9.355179704016915e-06, "loss": 0.3163, "step": 82 }, { "epoch": 0.257964257964258, "grad_norm": 0.8734452128410339, "learning_rate": 9.344608879492602e-06, "loss": 0.3306, "step": 83 }, { "epoch": 0.26107226107226106, "grad_norm": 3.6059653759002686, "learning_rate": 9.33403805496829e-06, "loss": 0.3104, "step": 84 }, { "epoch": 0.2641802641802642, "grad_norm": 1.1703656911849976, "learning_rate": 9.323467230443976e-06, "loss": 0.3071, "step": 85 }, { "epoch": 0.26728826728826727, "grad_norm": 0.8762909770011902, "learning_rate": 9.312896405919663e-06, "loss": 0.3022, "step": 86 }, { "epoch": 0.2703962703962704, "grad_norm": 2.158876419067383, "learning_rate": 9.30232558139535e-06, "loss": 0.3217, "step": 87 }, { "epoch": 0.27350427350427353, "grad_norm": 0.8010348081588745, "learning_rate": 9.291754756871036e-06, "loss": 0.322, "step": 88 }, { "epoch": 0.2766122766122766, "grad_norm": 1.119739055633545, "learning_rate": 9.281183932346723e-06, "loss": 0.3248, "step": 89 }, { "epoch": 0.27972027972027974, "grad_norm": 0.7900079488754272, "learning_rate": 9.27061310782241e-06, "loss": 0.3102, "step": 90 }, { "epoch": 0.2828282828282828, "grad_norm": 0.8093041181564331, "learning_rate": 9.260042283298098e-06, "loss": 0.3259, "step": 91 }, { "epoch": 0.28593628593628595, "grad_norm": 0.7240622043609619, "learning_rate": 9.249471458773785e-06, "loss": 0.3002, "step": 92 }, { "epoch": 0.289044289044289, "grad_norm": 0.9449782371520996, "learning_rate": 9.238900634249473e-06, "loss": 0.3076, "step": 93 }, { "epoch": 0.29215229215229216, "grad_norm": 0.9448596835136414, "learning_rate": 9.22832980972516e-06, "loss": 0.3012, "step": 94 }, { "epoch": 0.29526029526029524, "grad_norm": 0.9209067821502686, "learning_rate": 9.217758985200847e-06, "loss": 0.3131, "step": 95 }, { "epoch": 0.29836829836829837, "grad_norm": 0.878709614276886, "learning_rate": 9.207188160676534e-06, "loss": 0.3157, "step": 96 }, { "epoch": 0.3014763014763015, "grad_norm": 1.1178463697433472, "learning_rate": 9.19661733615222e-06, "loss": 0.3166, "step": 97 }, { "epoch": 0.3045843045843046, "grad_norm": 0.9717866778373718, "learning_rate": 9.186046511627908e-06, "loss": 0.3144, "step": 98 }, { "epoch": 0.3076923076923077, "grad_norm": 0.9905857443809509, "learning_rate": 9.175475687103595e-06, "loss": 0.3263, "step": 99 }, { "epoch": 0.3108003108003108, "grad_norm": 1.0447399616241455, "learning_rate": 9.164904862579282e-06, "loss": 0.3074, "step": 100 }, { "epoch": 0.3139083139083139, "grad_norm": 0.9876366853713989, "learning_rate": 9.154334038054969e-06, "loss": 0.3221, "step": 101 }, { "epoch": 0.317016317016317, "grad_norm": 1.3406106233596802, "learning_rate": 9.143763213530656e-06, "loss": 0.3209, "step": 102 }, { "epoch": 0.3201243201243201, "grad_norm": 1.1402978897094727, "learning_rate": 9.133192389006343e-06, "loss": 0.3181, "step": 103 }, { "epoch": 0.32323232323232326, "grad_norm": 1.0274314880371094, "learning_rate": 9.12262156448203e-06, "loss": 0.3179, "step": 104 }, { "epoch": 0.32634032634032634, "grad_norm": 1.0853135585784912, "learning_rate": 9.112050739957717e-06, "loss": 0.3068, "step": 105 }, { "epoch": 0.32944832944832947, "grad_norm": 0.9549627900123596, "learning_rate": 9.101479915433404e-06, "loss": 0.3058, "step": 106 }, { "epoch": 0.33255633255633255, "grad_norm": 0.9081363081932068, "learning_rate": 9.090909090909091e-06, "loss": 0.305, "step": 107 }, { "epoch": 0.3356643356643357, "grad_norm": 1.083267092704773, "learning_rate": 9.080338266384778e-06, "loss": 0.3293, "step": 108 }, { "epoch": 0.33877233877233875, "grad_norm": 0.9146764278411865, "learning_rate": 9.069767441860465e-06, "loss": 0.3308, "step": 109 }, { "epoch": 0.3418803418803419, "grad_norm": 0.8309290409088135, "learning_rate": 9.059196617336154e-06, "loss": 0.3219, "step": 110 }, { "epoch": 0.34498834498834496, "grad_norm": 0.7540556788444519, "learning_rate": 9.048625792811841e-06, "loss": 0.3165, "step": 111 }, { "epoch": 0.3480963480963481, "grad_norm": 0.7756165862083435, "learning_rate": 9.038054968287528e-06, "loss": 0.3201, "step": 112 }, { "epoch": 0.35120435120435123, "grad_norm": 1.016161561012268, "learning_rate": 9.027484143763215e-06, "loss": 0.318, "step": 113 }, { "epoch": 0.3543123543123543, "grad_norm": 1.1762275695800781, "learning_rate": 9.016913319238902e-06, "loss": 0.3071, "step": 114 }, { "epoch": 0.35742035742035744, "grad_norm": 1.0186941623687744, "learning_rate": 9.006342494714589e-06, "loss": 0.3094, "step": 115 }, { "epoch": 0.3605283605283605, "grad_norm": 1.3835426568984985, "learning_rate": 8.995771670190276e-06, "loss": 0.3203, "step": 116 }, { "epoch": 0.36363636363636365, "grad_norm": 0.9151639938354492, "learning_rate": 8.985200845665963e-06, "loss": 0.3075, "step": 117 }, { "epoch": 0.3667443667443667, "grad_norm": 0.9079708456993103, "learning_rate": 8.974630021141648e-06, "loss": 0.3111, "step": 118 }, { "epoch": 0.36985236985236986, "grad_norm": 0.7135366201400757, "learning_rate": 8.964059196617337e-06, "loss": 0.3131, "step": 119 }, { "epoch": 0.372960372960373, "grad_norm": 0.7310993671417236, "learning_rate": 8.953488372093024e-06, "loss": 0.3181, "step": 120 }, { "epoch": 0.37606837606837606, "grad_norm": 0.9562262296676636, "learning_rate": 8.942917547568711e-06, "loss": 0.3114, "step": 121 }, { "epoch": 0.3791763791763792, "grad_norm": 1.088692545890808, "learning_rate": 8.932346723044398e-06, "loss": 0.2985, "step": 122 }, { "epoch": 0.3822843822843823, "grad_norm": 1.3334287405014038, "learning_rate": 8.921775898520085e-06, "loss": 0.3198, "step": 123 }, { "epoch": 0.3853923853923854, "grad_norm": 1.1457082033157349, "learning_rate": 8.911205073995772e-06, "loss": 0.3027, "step": 124 }, { "epoch": 0.3885003885003885, "grad_norm": 1.0944201946258545, "learning_rate": 8.90063424947146e-06, "loss": 0.3195, "step": 125 }, { "epoch": 0.3916083916083916, "grad_norm": 1.679890513420105, "learning_rate": 8.890063424947146e-06, "loss": 0.3118, "step": 126 }, { "epoch": 0.3947163947163947, "grad_norm": 1.0934737920761108, "learning_rate": 8.879492600422833e-06, "loss": 0.3125, "step": 127 }, { "epoch": 0.3978243978243978, "grad_norm": 0.9423776865005493, "learning_rate": 8.86892177589852e-06, "loss": 0.3069, "step": 128 }, { "epoch": 0.3978243978243978, "eval_loss": 0.33542340993881226, "eval_runtime": 147.0915, "eval_samples_per_second": 1.938, "eval_steps_per_second": 0.972, "step": 128 }, { "epoch": 0.40093240093240096, "grad_norm": 1.373064637184143, "learning_rate": 8.858350951374208e-06, "loss": 0.3113, "step": 129 }, { "epoch": 0.40404040404040403, "grad_norm": 0.9782734513282776, "learning_rate": 8.847780126849895e-06, "loss": 0.3176, "step": 130 }, { "epoch": 0.40714840714840717, "grad_norm": 1.1988129615783691, "learning_rate": 8.837209302325582e-06, "loss": 0.3036, "step": 131 }, { "epoch": 0.41025641025641024, "grad_norm": 1.3978164196014404, "learning_rate": 8.826638477801269e-06, "loss": 0.3067, "step": 132 }, { "epoch": 0.4133644133644134, "grad_norm": 0.8266012072563171, "learning_rate": 8.816067653276956e-06, "loss": 0.3105, "step": 133 }, { "epoch": 0.41647241647241645, "grad_norm": 1.0358003377914429, "learning_rate": 8.805496828752643e-06, "loss": 0.3176, "step": 134 }, { "epoch": 0.4195804195804196, "grad_norm": 0.9363102316856384, "learning_rate": 8.79492600422833e-06, "loss": 0.3151, "step": 135 }, { "epoch": 0.4226884226884227, "grad_norm": 0.9805242419242859, "learning_rate": 8.784355179704017e-06, "loss": 0.3164, "step": 136 }, { "epoch": 0.4257964257964258, "grad_norm": 1.4923985004425049, "learning_rate": 8.773784355179706e-06, "loss": 0.3059, "step": 137 }, { "epoch": 0.4289044289044289, "grad_norm": 1.7009886503219604, "learning_rate": 8.763213530655393e-06, "loss": 0.2937, "step": 138 }, { "epoch": 0.432012432012432, "grad_norm": 0.8320425748825073, "learning_rate": 8.75264270613108e-06, "loss": 0.288, "step": 139 }, { "epoch": 0.43512043512043513, "grad_norm": 1.3431979417800903, "learning_rate": 8.742071881606767e-06, "loss": 0.3063, "step": 140 }, { "epoch": 0.4382284382284382, "grad_norm": 1.0519447326660156, "learning_rate": 8.731501057082454e-06, "loss": 0.3043, "step": 141 }, { "epoch": 0.44133644133644134, "grad_norm": 1.0041645765304565, "learning_rate": 8.72093023255814e-06, "loss": 0.3207, "step": 142 }, { "epoch": 0.4444444444444444, "grad_norm": 1.176352620124817, "learning_rate": 8.710359408033828e-06, "loss": 0.3099, "step": 143 }, { "epoch": 0.44755244755244755, "grad_norm": 0.8591434955596924, "learning_rate": 8.699788583509515e-06, "loss": 0.2913, "step": 144 }, { "epoch": 0.4471858134155744, "grad_norm": 1.2351419925689697, "learning_rate": 8.689217758985202e-06, "loss": 0.3099, "step": 145 }, { "epoch": 0.4502698535080956, "grad_norm": 1.8375589847564697, "learning_rate": 8.691099476439791e-06, "loss": 0.3092, "step": 146 }, { "epoch": 0.4533538936006168, "grad_norm": 1.07125985622406, "learning_rate": 8.680628272251308e-06, "loss": 0.3016, "step": 147 }, { "epoch": 0.456437933693138, "grad_norm": 1.1839478015899658, "learning_rate": 8.670157068062827e-06, "loss": 0.3003, "step": 148 }, { "epoch": 0.45952197378565923, "grad_norm": 1.294833779335022, "learning_rate": 8.659685863874346e-06, "loss": 0.2972, "step": 149 }, { "epoch": 0.4626060138781804, "grad_norm": 1.0540661811828613, "learning_rate": 8.649214659685865e-06, "loss": 0.2837, "step": 150 }, { "epoch": 0.4656900539707016, "grad_norm": 1.1067568063735962, "learning_rate": 8.638743455497383e-06, "loss": 0.2966, "step": 151 }, { "epoch": 0.46877409406322285, "grad_norm": 0.9972389340400696, "learning_rate": 8.6282722513089e-06, "loss": 0.2934, "step": 152 }, { "epoch": 0.471858134155744, "grad_norm": 1.1589370965957642, "learning_rate": 8.61780104712042e-06, "loss": 0.3026, "step": 153 }, { "epoch": 0.47494217424826524, "grad_norm": 1.1224210262298584, "learning_rate": 8.607329842931938e-06, "loss": 0.3042, "step": 154 }, { "epoch": 0.4780262143407864, "grad_norm": 1.3200238943099976, "learning_rate": 8.596858638743457e-06, "loss": 0.3124, "step": 155 }, { "epoch": 0.4811102544333076, "grad_norm": 1.1300067901611328, "learning_rate": 8.586387434554974e-06, "loss": 0.3167, "step": 156 }, { "epoch": 0.48419429452582885, "grad_norm": 0.9678866863250732, "learning_rate": 8.575916230366493e-06, "loss": 0.3039, "step": 157 }, { "epoch": 0.48727833461835, "grad_norm": 0.9656190872192383, "learning_rate": 8.565445026178011e-06, "loss": 0.3067, "step": 158 }, { "epoch": 0.49036237471087124, "grad_norm": 0.9618685245513916, "learning_rate": 8.55497382198953e-06, "loss": 0.2992, "step": 159 }, { "epoch": 0.49344641480339246, "grad_norm": 1.1055867671966553, "learning_rate": 8.544502617801049e-06, "loss": 0.2986, "step": 160 }, { "epoch": 0.49653045489591363, "grad_norm": 0.8761485815048218, "learning_rate": 8.534031413612566e-06, "loss": 0.3071, "step": 161 }, { "epoch": 0.49961449498843485, "grad_norm": 1.0709651708602905, "learning_rate": 8.523560209424085e-06, "loss": 0.2965, "step": 162 }, { "epoch": 0.5026985350809561, "grad_norm": 1.2407382726669312, "learning_rate": 8.513089005235604e-06, "loss": 0.3134, "step": 163 }, { "epoch": 0.5057825751734772, "grad_norm": 1.46315598487854, "learning_rate": 8.502617801047122e-06, "loss": 0.2886, "step": 164 }, { "epoch": 0.5088666152659984, "grad_norm": 1.2314726114273071, "learning_rate": 8.49214659685864e-06, "loss": 0.2902, "step": 165 }, { "epoch": 0.5119506553585197, "grad_norm": 1.223716378211975, "learning_rate": 8.481675392670158e-06, "loss": 0.3088, "step": 166 }, { "epoch": 0.5150346954510409, "grad_norm": 1.1966098546981812, "learning_rate": 8.471204188481677e-06, "loss": 0.3139, "step": 167 }, { "epoch": 0.518118735543562, "grad_norm": 1.1182276010513306, "learning_rate": 8.460732984293194e-06, "loss": 0.3161, "step": 168 }, { "epoch": 0.5212027756360833, "grad_norm": 1.1583510637283325, "learning_rate": 8.450261780104713e-06, "loss": 0.3041, "step": 169 }, { "epoch": 0.5242868157286045, "grad_norm": 1.1864618062973022, "learning_rate": 8.439790575916232e-06, "loss": 0.3008, "step": 170 }, { "epoch": 0.5273708558211256, "grad_norm": 1.3757935762405396, "learning_rate": 8.429319371727749e-06, "loss": 0.2865, "step": 171 }, { "epoch": 0.5304548959136469, "grad_norm": 1.4410743713378906, "learning_rate": 8.418848167539267e-06, "loss": 0.3081, "step": 172 }, { "epoch": 0.5335389360061681, "grad_norm": 1.3494313955307007, "learning_rate": 8.408376963350786e-06, "loss": 0.2988, "step": 173 }, { "epoch": 0.5366229760986893, "grad_norm": 1.3871009349822998, "learning_rate": 8.397905759162305e-06, "loss": 0.3045, "step": 174 }, { "epoch": 0.5397070161912105, "grad_norm": 1.183766484260559, "learning_rate": 8.387434554973822e-06, "loss": 0.2969, "step": 175 }, { "epoch": 0.5427910562837317, "grad_norm": 1.1075443029403687, "learning_rate": 8.37696335078534e-06, "loss": 0.2834, "step": 176 }, { "epoch": 0.5458750963762529, "grad_norm": 1.3118195533752441, "learning_rate": 8.36649214659686e-06, "loss": 0.2945, "step": 177 }, { "epoch": 0.5489591364687741, "grad_norm": 1.3226675987243652, "learning_rate": 8.356020942408377e-06, "loss": 0.3085, "step": 178 }, { "epoch": 0.5520431765612953, "grad_norm": 1.1877515316009521, "learning_rate": 8.345549738219895e-06, "loss": 0.2757, "step": 179 }, { "epoch": 0.5551272166538165, "grad_norm": 1.379599928855896, "learning_rate": 8.335078534031414e-06, "loss": 0.2968, "step": 180 }, { "epoch": 0.5582112567463376, "grad_norm": 1.2975775003433228, "learning_rate": 8.324607329842933e-06, "loss": 0.3074, "step": 181 }, { "epoch": 0.5612952968388589, "grad_norm": 1.2829333543777466, "learning_rate": 8.31413612565445e-06, "loss": 0.3014, "step": 182 }, { "epoch": 0.5643793369313801, "grad_norm": 1.4759114980697632, "learning_rate": 8.303664921465969e-06, "loss": 0.3014, "step": 183 }, { "epoch": 0.5674633770239013, "grad_norm": 1.3108978271484375, "learning_rate": 8.293193717277488e-06, "loss": 0.2914, "step": 184 }, { "epoch": 0.5705474171164225, "grad_norm": 1.271666407585144, "learning_rate": 8.282722513089005e-06, "loss": 0.305, "step": 185 }, { "epoch": 0.5736314572089437, "grad_norm": 1.1115907430648804, "learning_rate": 8.272251308900523e-06, "loss": 0.2963, "step": 186 }, { "epoch": 0.5767154973014649, "grad_norm": 1.089092493057251, "learning_rate": 8.261780104712042e-06, "loss": 0.303, "step": 187 }, { "epoch": 0.5797995373939862, "grad_norm": 1.1514776945114136, "learning_rate": 8.251308900523561e-06, "loss": 0.3073, "step": 188 }, { "epoch": 0.5828835774865073, "grad_norm": 1.1654891967773438, "learning_rate": 8.240837696335078e-06, "loss": 0.2883, "step": 189 }, { "epoch": 0.5859676175790285, "grad_norm": 1.2040210962295532, "learning_rate": 8.230366492146597e-06, "loss": 0.295, "step": 190 }, { "epoch": 0.5890516576715498, "grad_norm": 1.203511118888855, "learning_rate": 8.219895287958116e-06, "loss": 0.2795, "step": 191 }, { "epoch": 0.5921356977640709, "grad_norm": 1.5743706226348877, "learning_rate": 8.209424083769634e-06, "loss": 0.3123, "step": 192 }, { "epoch": 0.5921356977640709, "eval_loss": 0.3412991166114807, "eval_runtime": 149.387, "eval_samples_per_second": 1.928, "eval_steps_per_second": 0.964, "step": 192 }, { "epoch": 0.5952197378565921, "grad_norm": 1.4109128713607788, "learning_rate": 8.198952879581153e-06, "loss": 0.2996, "step": 193 }, { "epoch": 0.5983037779491134, "grad_norm": 1.3817074298858643, "learning_rate": 8.18848167539267e-06, "loss": 0.2964, "step": 194 }, { "epoch": 0.6013878180416345, "grad_norm": 1.3587619066238403, "learning_rate": 8.178010471204189e-06, "loss": 0.3004, "step": 195 }, { "epoch": 0.6044718581341557, "grad_norm": 1.502744197845459, "learning_rate": 8.167539267015708e-06, "loss": 0.2957, "step": 196 }, { "epoch": 0.607555898226677, "grad_norm": 1.4416728019714355, "learning_rate": 8.157068062827227e-06, "loss": 0.2962, "step": 197 }, { "epoch": 0.6106399383191982, "grad_norm": 2.2597157955169678, "learning_rate": 8.146596858638745e-06, "loss": 0.2853, "step": 198 }, { "epoch": 0.6137239784117193, "grad_norm": 1.854837417602539, "learning_rate": 8.136125654450262e-06, "loss": 0.2918, "step": 199 }, { "epoch": 0.6168080185042406, "grad_norm": 2.1409687995910645, "learning_rate": 8.125654450261781e-06, "loss": 0.3118, "step": 200 }, { "epoch": 0.6198920585967618, "grad_norm": 1.7128517627716064, "learning_rate": 8.1151832460733e-06, "loss": 0.2822, "step": 201 }, { "epoch": 0.6229760986892829, "grad_norm": 1.4401497840881348, "learning_rate": 8.104712041884819e-06, "loss": 0.2802, "step": 202 }, { "epoch": 0.6260601387818041, "grad_norm": 1.7307312488555908, "learning_rate": 8.094240837696336e-06, "loss": 0.2973, "step": 203 }, { "epoch": 0.6291441788743254, "grad_norm": 1.263535737991333, "learning_rate": 8.083769633507855e-06, "loss": 0.3016, "step": 204 }, { "epoch": 0.6322282189668466, "grad_norm": 1.4065901041030884, "learning_rate": 8.073298429319373e-06, "loss": 0.284, "step": 205 }, { "epoch": 0.6353122590593677, "grad_norm": 1.6004809141159058, "learning_rate": 8.06282722513089e-06, "loss": 0.2908, "step": 206 }, { "epoch": 0.638396299151889, "grad_norm": 1.458287239074707, "learning_rate": 8.05235602094241e-06, "loss": 0.2832, "step": 207 }, { "epoch": 0.6414803392444102, "grad_norm": 1.8239188194274902, "learning_rate": 8.041884816753928e-06, "loss": 0.2993, "step": 208 }, { "epoch": 0.6445643793369313, "grad_norm": 1.8187966346740723, "learning_rate": 8.031413612565445e-06, "loss": 0.311, "step": 209 }, { "epoch": 0.6476484194294526, "grad_norm": 1.5089385509490967, "learning_rate": 8.020942408376964e-06, "loss": 0.2835, "step": 210 }, { "epoch": 0.6507324595219738, "grad_norm": 1.5591213703155518, "learning_rate": 8.010471204188483e-06, "loss": 0.2985, "step": 211 }, { "epoch": 0.653816499614495, "grad_norm": 1.5221312046051025, "learning_rate": 8.000000000000001e-06, "loss": 0.2805, "step": 212 }, { "epoch": 0.6569005397070162, "grad_norm": 1.8211005926132202, "learning_rate": 7.989528795811518e-06, "loss": 0.2728, "step": 213 }, { "epoch": 0.6599845797995374, "grad_norm": 2.2500016689300537, "learning_rate": 7.979057591623037e-06, "loss": 0.2932, "step": 214 }, { "epoch": 0.6630686198920586, "grad_norm": 1.7227460145950317, "learning_rate": 7.968586387434556e-06, "loss": 0.2927, "step": 215 }, { "epoch": 0.6661526599845798, "grad_norm": 2.1821672916412354, "learning_rate": 7.958115183246073e-06, "loss": 0.2919, "step": 216 }, { "epoch": 0.669236700077101, "grad_norm": 1.3368958234786987, "learning_rate": 7.947643979057592e-06, "loss": 0.2789, "step": 217 }, { "epoch": 0.6723207401696222, "grad_norm": 1.4419403076171875, "learning_rate": 7.93717277486911e-06, "loss": 0.2876, "step": 218 }, { "epoch": 0.6754047802621435, "grad_norm": 2.0355281829833984, "learning_rate": 7.92670157068063e-06, "loss": 0.3059, "step": 219 }, { "epoch": 0.6784888203546646, "grad_norm": 1.7871628999710083, "learning_rate": 7.916230366492146e-06, "loss": 0.2804, "step": 220 }, { "epoch": 0.6815728604471858, "grad_norm": 1.8160405158996582, "learning_rate": 7.905759162303665e-06, "loss": 0.2842, "step": 221 }, { "epoch": 0.6846569005397071, "grad_norm": 2.1498160362243652, "learning_rate": 7.895287958115184e-06, "loss": 0.2875, "step": 222 }, { "epoch": 0.6877409406322282, "grad_norm": 1.9483954906463623, "learning_rate": 7.884816753926701e-06, "loss": 0.2874, "step": 223 }, { "epoch": 0.6908249807247494, "grad_norm": 2.0145816802978516, "learning_rate": 7.87434554973822e-06, "loss": 0.2879, "step": 224 }, { "epoch": 0.6939090208172706, "grad_norm": 1.680413007736206, "learning_rate": 7.863874345549739e-06, "loss": 0.2755, "step": 225 }, { "epoch": 0.6969930609097919, "grad_norm": 1.5203242301940918, "learning_rate": 7.853403141361257e-06, "loss": 0.284, "step": 226 }, { "epoch": 0.700077101002313, "grad_norm": 1.892943263053894, "learning_rate": 7.842931937172774e-06, "loss": 0.2799, "step": 227 }, { "epoch": 0.7031611410948342, "grad_norm": 1.5476278066635132, "learning_rate": 7.832460732984293e-06, "loss": 0.2767, "step": 228 }, { "epoch": 0.7062451811873555, "grad_norm": 2.2650210857391357, "learning_rate": 7.821989528795812e-06, "loss": 0.2905, "step": 229 }, { "epoch": 0.7093292212798766, "grad_norm": 2.1595096588134766, "learning_rate": 7.81151832460733e-06, "loss": 0.274, "step": 230 }, { "epoch": 0.7124132613723978, "grad_norm": 1.587994933128357, "learning_rate": 7.80104712041885e-06, "loss": 0.2743, "step": 231 }, { "epoch": 0.7154973014649191, "grad_norm": 1.9411978721618652, "learning_rate": 7.790575916230367e-06, "loss": 0.272, "step": 232 }, { "epoch": 0.7185813415574402, "grad_norm": 2.1039252281188965, "learning_rate": 7.780104712041885e-06, "loss": 0.2884, "step": 233 }, { "epoch": 0.7216653816499614, "grad_norm": 1.834591269493103, "learning_rate": 7.769633507853404e-06, "loss": 0.2756, "step": 234 }, { "epoch": 0.7247494217424827, "grad_norm": 2.1758062839508057, "learning_rate": 7.759162303664923e-06, "loss": 0.287, "step": 235 }, { "epoch": 0.7278334618350039, "grad_norm": 2.0601179599761963, "learning_rate": 7.748691099476442e-06, "loss": 0.2683, "step": 236 }, { "epoch": 0.730917501927525, "grad_norm": 1.7605801820755005, "learning_rate": 7.738219895287959e-06, "loss": 0.2552, "step": 237 }, { "epoch": 0.7340015420200463, "grad_norm": 2.0951759815216064, "learning_rate": 7.727748691099478e-06, "loss": 0.258, "step": 238 }, { "epoch": 0.7370855821125675, "grad_norm": 2.2250118255615234, "learning_rate": 7.717277486910996e-06, "loss": 0.2627, "step": 239 }, { "epoch": 0.7401696222050886, "grad_norm": 2.54436993598938, "learning_rate": 7.706806282722513e-06, "loss": 0.278, "step": 240 }, { "epoch": 0.7432536622976099, "grad_norm": 1.810699701309204, "learning_rate": 7.696335078534032e-06, "loss": 0.2684, "step": 241 }, { "epoch": 0.7463377023901311, "grad_norm": 2.161043882369995, "learning_rate": 7.685863874345551e-06, "loss": 0.2828, "step": 242 }, { "epoch": 0.7494217424826523, "grad_norm": 1.7965888977050781, "learning_rate": 7.67539267015707e-06, "loss": 0.2677, "step": 243 }, { "epoch": 0.7525057825751735, "grad_norm": 1.9139559268951416, "learning_rate": 7.664921465968587e-06, "loss": 0.2701, "step": 244 }, { "epoch": 0.7555898226676947, "grad_norm": 2.0285589694976807, "learning_rate": 7.654450261780106e-06, "loss": 0.2726, "step": 245 }, { "epoch": 0.7586738627602159, "grad_norm": 2.2968027591705322, "learning_rate": 7.643979057591624e-06, "loss": 0.2606, "step": 246 }, { "epoch": 0.761757902852737, "grad_norm": 2.4324936866760254, "learning_rate": 7.633507853403141e-06, "loss": 0.2659, "step": 247 }, { "epoch": 0.7648419429452583, "grad_norm": 2.66330885887146, "learning_rate": 7.62303664921466e-06, "loss": 0.2627, "step": 248 }, { "epoch": 0.7679259830377795, "grad_norm": 2.435866355895996, "learning_rate": 7.612565445026179e-06, "loss": 0.2713, "step": 249 }, { "epoch": 0.7710100231303006, "grad_norm": 2.2584385871887207, "learning_rate": 7.602094240837698e-06, "loss": 0.2754, "step": 250 }, { "epoch": 0.7740940632228219, "grad_norm": 2.1898317337036133, "learning_rate": 7.591623036649215e-06, "loss": 0.2705, "step": 251 }, { "epoch": 0.7771781033153431, "grad_norm": 2.051255464553833, "learning_rate": 7.5811518324607335e-06, "loss": 0.2491, "step": 252 }, { "epoch": 0.7802621434078643, "grad_norm": 2.353940725326538, "learning_rate": 7.570680628272252e-06, "loss": 0.277, "step": 253 }, { "epoch": 0.7833461835003855, "grad_norm": 2.3826687335968018, "learning_rate": 7.560209424083769e-06, "loss": 0.2693, "step": 254 }, { "epoch": 0.7864302235929067, "grad_norm": 2.522019863128662, "learning_rate": 7.549738219895288e-06, "loss": 0.2706, "step": 255 }, { "epoch": 0.7895142636854279, "grad_norm": 2.3525524139404297, "learning_rate": 7.539267015706807e-06, "loss": 0.2509, "step": 256 }, { "epoch": 0.7895142636854279, "eval_loss": 0.3851300776004791, "eval_runtime": 149.046, "eval_samples_per_second": 1.932, "eval_steps_per_second": 0.966, "step": 256 }, { "epoch": 0.7925983037779492, "grad_norm": 2.7143642902374268, "learning_rate": 7.528795811518326e-06, "loss": 0.2701, "step": 257 }, { "epoch": 0.7956823438704703, "grad_norm": 2.6725356578826904, "learning_rate": 7.518324607329844e-06, "loss": 0.2718, "step": 258 }, { "epoch": 0.7987663839629915, "grad_norm": 2.4051880836486816, "learning_rate": 7.5078534031413615e-06, "loss": 0.2554, "step": 259 }, { "epoch": 0.8018504240555128, "grad_norm": 2.472904920578003, "learning_rate": 7.49738219895288e-06, "loss": 0.2666, "step": 260 }, { "epoch": 0.8049344641480339, "grad_norm": 2.3598804473876953, "learning_rate": 7.486910994764398e-06, "loss": 0.2532, "step": 261 }, { "epoch": 0.8080185042405551, "grad_norm": 2.383300542831421, "learning_rate": 7.476439790575917e-06, "loss": 0.2568, "step": 262 }, { "epoch": 0.8111025443330764, "grad_norm": 2.999469518661499, "learning_rate": 7.465968586387436e-06, "loss": 0.2403, "step": 263 }, { "epoch": 0.8141865844255975, "grad_norm": 4.071384429931641, "learning_rate": 7.455497382198954e-06, "loss": 0.265, "step": 264 }, { "epoch": 0.8172706245181187, "grad_norm": 3.5529489517211914, "learning_rate": 7.445026178010472e-06, "loss": 0.2647, "step": 265 }, { "epoch": 0.8203546646106399, "grad_norm": 2.8842644691467285, "learning_rate": 7.43455497382199e-06, "loss": 0.2725, "step": 266 }, { "epoch": 0.8234387047031612, "grad_norm": 2.1277332305908203, "learning_rate": 7.424083769633509e-06, "loss": 0.2657, "step": 267 }, { "epoch": 0.8265227447956823, "grad_norm": 2.832111358642578, "learning_rate": 7.413612565445026e-06, "loss": 0.255, "step": 268 }, { "epoch": 0.8296067848882035, "grad_norm": 2.7438676357269287, "learning_rate": 7.403141361256545e-06, "loss": 0.2596, "step": 269 }, { "epoch": 0.8326908249807248, "grad_norm": 2.7950987815856934, "learning_rate": 7.392670157068064e-06, "loss": 0.2624, "step": 270 }, { "epoch": 0.8357748650732459, "grad_norm": 3.497069835662842, "learning_rate": 7.382198952879581e-06, "loss": 0.2385, "step": 271 }, { "epoch": 0.8388589051657671, "grad_norm": 5.024068832397461, "learning_rate": 7.3717277486911e-06, "loss": 0.2526, "step": 272 }, { "epoch": 0.8419429452582884, "grad_norm": 3.5298011302948, "learning_rate": 7.361256544502618e-06, "loss": 0.2452, "step": 273 }, { "epoch": 0.8450269853508096, "grad_norm": 2.701545238494873, "learning_rate": 7.350785340314137e-06, "loss": 0.2293, "step": 274 }, { "epoch": 0.8481110254433307, "grad_norm": 2.838541030883789, "learning_rate": 7.340314136125655e-06, "loss": 0.2554, "step": 275 }, { "epoch": 0.851195065535852, "grad_norm": 2.5854012966156006, "learning_rate": 7.329842931937173e-06, "loss": 0.245, "step": 276 }, { "epoch": 0.8542791056283732, "grad_norm": 2.9351906776428223, "learning_rate": 7.319371727748692e-06, "loss": 0.2556, "step": 277 }, { "epoch": 0.8573631457208943, "grad_norm": 3.0675830841064453, "learning_rate": 7.30890052356021e-06, "loss": 0.2501, "step": 278 }, { "epoch": 0.8604471858134156, "grad_norm": 3.1958088874816895, "learning_rate": 7.2984293193717285e-06, "loss": 0.2347, "step": 279 }, { "epoch": 0.8635312259059368, "grad_norm": 3.0006463527679443, "learning_rate": 7.287958115183246e-06, "loss": 0.242, "step": 280 }, { "epoch": 0.866615265998458, "grad_norm": 2.862990379333496, "learning_rate": 7.277486910994765e-06, "loss": 0.2442, "step": 281 }, { "epoch": 0.8696993060909792, "grad_norm": 3.1585986614227295, "learning_rate": 7.267015706806283e-06, "loss": 0.2401, "step": 282 }, { "epoch": 0.8727833461835004, "grad_norm": 2.6111812591552734, "learning_rate": 7.256544502617802e-06, "loss": 0.2324, "step": 283 }, { "epoch": 0.8758673862760216, "grad_norm": 3.1289191246032715, "learning_rate": 7.246073298429321e-06, "loss": 0.2426, "step": 284 }, { "epoch": 0.8789514263685428, "grad_norm": 3.448789358139038, "learning_rate": 7.235602094240838e-06, "loss": 0.2224, "step": 285 }, { "epoch": 0.882035466461064, "grad_norm": 3.018432855606079, "learning_rate": 7.2251308900523565e-06, "loss": 0.2238, "step": 286 }, { "epoch": 0.8851195065535852, "grad_norm": 4.171509742736816, "learning_rate": 7.214659685863875e-06, "loss": 0.2546, "step": 287 }, { "epoch": 0.8882035466461063, "grad_norm": 3.5390446186065674, "learning_rate": 7.204188481675394e-06, "loss": 0.2417, "step": 288 }, { "epoch": 0.8912875867386276, "grad_norm": 2.8169162273406982, "learning_rate": 7.193717277486911e-06, "loss": 0.2348, "step": 289 }, { "epoch": 0.8943716268311488, "grad_norm": 2.9175827503204346, "learning_rate": 7.18324607329843e-06, "loss": 0.214, "step": 290 }, { "epoch": 0.89745566692367, "grad_norm": 3.939680576324463, "learning_rate": 7.172774869109949e-06, "loss": 0.2489, "step": 291 }, { "epoch": 0.9005397070161912, "grad_norm": 2.874373435974121, "learning_rate": 7.162303664921466e-06, "loss": 0.2219, "step": 292 }, { "epoch": 0.9036237471087124, "grad_norm": 4.381021976470947, "learning_rate": 7.1518324607329845e-06, "loss": 0.2419, "step": 293 }, { "epoch": 0.9067077872012336, "grad_norm": 3.9895918369293213, "learning_rate": 7.141361256544503e-06, "loss": 0.2552, "step": 294 }, { "epoch": 0.9097918272937549, "grad_norm": 2.9028842449188232, "learning_rate": 7.130890052356022e-06, "loss": 0.2323, "step": 295 }, { "epoch": 0.912875867386276, "grad_norm": 3.5980117321014404, "learning_rate": 7.12041884816754e-06, "loss": 0.2404, "step": 296 }, { "epoch": 0.9159599074787972, "grad_norm": 3.490727186203003, "learning_rate": 7.109947643979058e-06, "loss": 0.22, "step": 297 }, { "epoch": 0.9190439475713185, "grad_norm": 3.256279706954956, "learning_rate": 7.099476439790577e-06, "loss": 0.2368, "step": 298 }, { "epoch": 0.9221279876638396, "grad_norm": 3.92038893699646, "learning_rate": 7.089005235602095e-06, "loss": 0.2331, "step": 299 }, { "epoch": 0.9252120277563608, "grad_norm": 3.6917364597320557, "learning_rate": 7.078534031413613e-06, "loss": 0.2139, "step": 300 }, { "epoch": 0.9282960678488821, "grad_norm": 3.058729887008667, "learning_rate": 7.068062827225132e-06, "loss": 0.2199, "step": 301 }, { "epoch": 0.9313801079414032, "grad_norm": 3.150188446044922, "learning_rate": 7.057591623036649e-06, "loss": 0.2137, "step": 302 }, { "epoch": 0.9344641480339244, "grad_norm": 5.77610445022583, "learning_rate": 7.047120418848168e-06, "loss": 0.2478, "step": 303 }, { "epoch": 0.9375481881264457, "grad_norm": 2.8851089477539062, "learning_rate": 7.036649214659687e-06, "loss": 0.227, "step": 304 }, { "epoch": 0.9406322282189669, "grad_norm": 3.1656086444854736, "learning_rate": 7.0261780104712055e-06, "loss": 0.2335, "step": 305 }, { "epoch": 0.943716268311488, "grad_norm": 3.3355696201324463, "learning_rate": 7.015706806282723e-06, "loss": 0.2169, "step": 306 }, { "epoch": 0.9468003084040093, "grad_norm": 3.5095317363739014, "learning_rate": 7.005235602094241e-06, "loss": 0.2161, "step": 307 }, { "epoch": 0.9498843484965305, "grad_norm": 3.5365262031555176, "learning_rate": 6.99476439790576e-06, "loss": 0.2097, "step": 308 }, { "epoch": 0.9529683885890516, "grad_norm": 4.159248352050781, "learning_rate": 6.984293193717277e-06, "loss": 0.2337, "step": 309 }, { "epoch": 0.9560524286815728, "grad_norm": 2.9792213439941406, "learning_rate": 6.973821989528796e-06, "loss": 0.2149, "step": 310 }, { "epoch": 0.9591364687740941, "grad_norm": 3.2603046894073486, "learning_rate": 6.963350785340315e-06, "loss": 0.2218, "step": 311 }, { "epoch": 0.9622205088666153, "grad_norm": 3.5064327716827393, "learning_rate": 6.9528795811518335e-06, "loss": 0.2128, "step": 312 }, { "epoch": 0.9653045489591364, "grad_norm": 3.971139430999756, "learning_rate": 6.942408376963351e-06, "loss": 0.2172, "step": 313 }, { "epoch": 0.9683885890516577, "grad_norm": 3.651603937149048, "learning_rate": 6.931937172774869e-06, "loss": 0.2036, "step": 314 }, { "epoch": 0.9714726291441789, "grad_norm": 5.394900321960449, "learning_rate": 6.921465968586388e-06, "loss": 0.2157, "step": 315 }, { "epoch": 0.9745566692367, "grad_norm": 3.7696452140808105, "learning_rate": 6.910994764397906e-06, "loss": 0.2168, "step": 316 }, { "epoch": 0.9776407093292213, "grad_norm": 3.3137505054473877, "learning_rate": 6.900523560209425e-06, "loss": 0.2217, "step": 317 }, { "epoch": 0.9807247494217425, "grad_norm": 3.927021026611328, "learning_rate": 6.890052356020943e-06, "loss": 0.2149, "step": 318 }, { "epoch": 0.9838087895142636, "grad_norm": 3.598501443862915, "learning_rate": 6.8795811518324615e-06, "loss": 0.2007, "step": 319 }, { "epoch": 0.9868928296067849, "grad_norm": 4.063229084014893, "learning_rate": 6.8691099476439794e-06, "loss": 0.2142, "step": 320 }, { "epoch": 0.9868928296067849, "eval_loss": 0.46243318915367126, "eval_runtime": 150.4594, "eval_samples_per_second": 1.914, "eval_steps_per_second": 0.957, "step": 320 }, { "epoch": 0.9899768696993061, "grad_norm": 4.520982265472412, "learning_rate": 6.858638743455498e-06, "loss": 0.1978, "step": 321 }, { "epoch": 0.9930609097918273, "grad_norm": 3.6312687397003174, "learning_rate": 6.848167539267017e-06, "loss": 0.1896, "step": 322 }, { "epoch": 0.9961449498843485, "grad_norm": 3.1252243518829346, "learning_rate": 6.837696335078534e-06, "loss": 0.1817, "step": 323 }, { "epoch": 0.9992289899768697, "grad_norm": 4.3829264640808105, "learning_rate": 6.827225130890053e-06, "loss": 0.2199, "step": 324 }, { "epoch": 1.0030840400925212, "grad_norm": 9.755841255187988, "learning_rate": 6.816753926701572e-06, "loss": 0.4578, "step": 325 }, { "epoch": 1.0061680801850423, "grad_norm": 3.9052581787109375, "learning_rate": 6.80628272251309e-06, "loss": 0.1959, "step": 326 }, { "epoch": 1.0092521202775635, "grad_norm": 3.6258931159973145, "learning_rate": 6.7958115183246075e-06, "loss": 0.2062, "step": 327 }, { "epoch": 1.012336160370085, "grad_norm": 4.131122589111328, "learning_rate": 6.785340314136126e-06, "loss": 0.1915, "step": 328 }, { "epoch": 1.015420200462606, "grad_norm": 4.387429237365723, "learning_rate": 6.774869109947645e-06, "loss": 0.1792, "step": 329 }, { "epoch": 1.0185042405551272, "grad_norm": 3.873361110687256, "learning_rate": 6.764397905759162e-06, "loss": 0.1895, "step": 330 }, { "epoch": 1.0215882806476484, "grad_norm": 4.318599700927734, "learning_rate": 6.753926701570681e-06, "loss": 0.1836, "step": 331 }, { "epoch": 1.0246723207401696, "grad_norm": 4.9434494972229, "learning_rate": 6.7434554973822e-06, "loss": 0.2199, "step": 332 }, { "epoch": 1.0277563608326907, "grad_norm": 3.8584797382354736, "learning_rate": 6.732984293193718e-06, "loss": 0.1796, "step": 333 }, { "epoch": 1.0308404009252121, "grad_norm": 4.104945659637451, "learning_rate": 6.722513089005236e-06, "loss": 0.1812, "step": 334 }, { "epoch": 1.0339244410177333, "grad_norm": 4.125020503997803, "learning_rate": 6.712041884816754e-06, "loss": 0.197, "step": 335 }, { "epoch": 1.0370084811102545, "grad_norm": 3.783364772796631, "learning_rate": 6.701570680628273e-06, "loss": 0.1798, "step": 336 }, { "epoch": 1.0400925212027756, "grad_norm": 4.799828052520752, "learning_rate": 6.691099476439791e-06, "loss": 0.1837, "step": 337 }, { "epoch": 1.0431765612952968, "grad_norm": 5.570056438446045, "learning_rate": 6.68062827225131e-06, "loss": 0.1987, "step": 338 }, { "epoch": 1.046260601387818, "grad_norm": 3.9299843311309814, "learning_rate": 6.670157068062828e-06, "loss": 0.1728, "step": 339 }, { "epoch": 1.0493446414803393, "grad_norm": 4.746124267578125, "learning_rate": 6.6596858638743455e-06, "loss": 0.2055, "step": 340 }, { "epoch": 1.0524286815728605, "grad_norm": 3.6969268321990967, "learning_rate": 6.649214659685864e-06, "loss": 0.1919, "step": 341 }, { "epoch": 1.0555127216653817, "grad_norm": 4.096460819244385, "learning_rate": 6.638743455497383e-06, "loss": 0.1725, "step": 342 }, { "epoch": 1.0585967617579028, "grad_norm": 3.819343328475952, "learning_rate": 6.628272251308902e-06, "loss": 0.1727, "step": 343 }, { "epoch": 1.061680801850424, "grad_norm": 4.487940788269043, "learning_rate": 6.617801047120419e-06, "loss": 0.176, "step": 344 }, { "epoch": 1.0647648419429452, "grad_norm": 4.727810382843018, "learning_rate": 6.607329842931938e-06, "loss": 0.1694, "step": 345 }, { "epoch": 1.0678488820354666, "grad_norm": 5.403895854949951, "learning_rate": 6.5968586387434565e-06, "loss": 0.1853, "step": 346 }, { "epoch": 1.0709329221279877, "grad_norm": 3.548576831817627, "learning_rate": 6.5863874345549736e-06, "loss": 0.1711, "step": 347 }, { "epoch": 1.074016962220509, "grad_norm": 3.6849658489227295, "learning_rate": 6.575916230366492e-06, "loss": 0.1877, "step": 348 }, { "epoch": 1.07710100231303, "grad_norm": 3.7493557929992676, "learning_rate": 6.565445026178011e-06, "loss": 0.1858, "step": 349 }, { "epoch": 1.0801850424055512, "grad_norm": 3.9486773014068604, "learning_rate": 6.55497382198953e-06, "loss": 0.1515, "step": 350 }, { "epoch": 1.0832690824980724, "grad_norm": 4.970436096191406, "learning_rate": 6.544502617801047e-06, "loss": 0.172, "step": 351 }, { "epoch": 1.0863531225905936, "grad_norm": 5.032225131988525, "learning_rate": 6.534031413612566e-06, "loss": 0.1611, "step": 352 } ], "logging_steps": 1, "max_steps": 975, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 16, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.458954269238886e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }