{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 50420, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.039666798889329634, "grad_norm": 46.122135162353516, "learning_rate": 4.990182467274891e-05, "loss": 11.8425, "step": 100 }, { "epoch": 0.07933359777865927, "grad_norm": 23.861114501953125, "learning_rate": 4.9802657675525586e-05, "loss": 9.0419, "step": 200 }, { "epoch": 0.1190003966679889, "grad_norm": 11.361534118652344, "learning_rate": 4.970349067830226e-05, "loss": 7.4942, "step": 300 }, { "epoch": 0.15866719555731854, "grad_norm": 9.868680000305176, "learning_rate": 4.960432368107894e-05, "loss": 6.5606, "step": 400 }, { "epoch": 0.19833399444664815, "grad_norm": 19.773534774780273, "learning_rate": 4.9505156683855616e-05, "loss": 6.0234, "step": 500 }, { "epoch": 0.2380007933359778, "grad_norm": 6.297336101531982, "learning_rate": 4.9405989686632294e-05, "loss": 5.7502, "step": 600 }, { "epoch": 0.2776675922253074, "grad_norm": 6.993984222412109, "learning_rate": 4.9306822689408966e-05, "loss": 5.4746, "step": 700 }, { "epoch": 0.31733439111463707, "grad_norm": 5.947575569152832, "learning_rate": 4.9207655692185645e-05, "loss": 5.2263, "step": 800 }, { "epoch": 0.3570011900039667, "grad_norm": 6.459949493408203, "learning_rate": 4.9108488694962317e-05, "loss": 5.071, "step": 900 }, { "epoch": 0.3966679888932963, "grad_norm": 6.12597131729126, "learning_rate": 4.9009321697738995e-05, "loss": 4.9277, "step": 1000 }, { "epoch": 0.43633478778262597, "grad_norm": 7.8361406326293945, "learning_rate": 4.891015470051567e-05, "loss": 4.7178, "step": 1100 }, { "epoch": 0.4760015866719556, "grad_norm": 8.883167266845703, "learning_rate": 4.881098770329235e-05, "loss": 4.5832, "step": 1200 }, { "epoch": 0.5156683855612852, "grad_norm": 5.529339790344238, "learning_rate": 4.8711820706069024e-05, "loss": 4.5019, "step": 1300 }, { "epoch": 0.5553351844506148, "grad_norm": 5.6431121826171875, "learning_rate": 4.86126537088457e-05, "loss": 4.4477, "step": 1400 }, { "epoch": 0.5950019833399445, "grad_norm": 5.4094367027282715, "learning_rate": 4.8513486711622375e-05, "loss": 4.317, "step": 1500 }, { "epoch": 0.6346687822292741, "grad_norm": 4.631023406982422, "learning_rate": 4.841431971439905e-05, "loss": 4.2006, "step": 1600 }, { "epoch": 0.6743355811186037, "grad_norm": 5.530189037322998, "learning_rate": 4.8315152717175725e-05, "loss": 4.0726, "step": 1700 }, { "epoch": 0.7140023800079334, "grad_norm": 6.281075477600098, "learning_rate": 4.82159857199524e-05, "loss": 4.005, "step": 1800 }, { "epoch": 0.753669178897263, "grad_norm": 8.573356628417969, "learning_rate": 4.8116818722729076e-05, "loss": 3.8903, "step": 1900 }, { "epoch": 0.7933359777865926, "grad_norm": 7.195920467376709, "learning_rate": 4.8017651725505754e-05, "loss": 3.8038, "step": 2000 }, { "epoch": 0.8330027766759223, "grad_norm": 6.207021236419678, "learning_rate": 4.791848472828243e-05, "loss": 3.775, "step": 2100 }, { "epoch": 0.8726695755652519, "grad_norm": 6.628279685974121, "learning_rate": 4.7819317731059105e-05, "loss": 3.711, "step": 2200 }, { "epoch": 0.9123363744545815, "grad_norm": 5.220765590667725, "learning_rate": 4.7720150733835784e-05, "loss": 3.6694, "step": 2300 }, { "epoch": 0.9520031733439112, "grad_norm": 4.995284557342529, "learning_rate": 4.7620983736612455e-05, "loss": 3.5359, "step": 2400 }, { "epoch": 0.9916699722332408, "grad_norm": 5.370284557342529, "learning_rate": 4.7521816739389134e-05, "loss": 3.5537, "step": 2500 }, { "epoch": 1.0, "eval_loss": 2.6742756366729736, "eval_runtime": 33.2175, "eval_samples_per_second": 45.699, "eval_steps_per_second": 5.72, "step": 2521 }, { "epoch": 1.0313367711225705, "grad_norm": 6.217808246612549, "learning_rate": 4.7422649742165806e-05, "loss": 3.4208, "step": 2600 }, { "epoch": 1.0710035700119, "grad_norm": 5.972238540649414, "learning_rate": 4.7323482744942484e-05, "loss": 3.3766, "step": 2700 }, { "epoch": 1.1106703689012296, "grad_norm": 6.439736366271973, "learning_rate": 4.722431574771916e-05, "loss": 3.2706, "step": 2800 }, { "epoch": 1.1503371677905594, "grad_norm": 4.722689151763916, "learning_rate": 4.712514875049584e-05, "loss": 3.1785, "step": 2900 }, { "epoch": 1.190003966679889, "grad_norm": 4.344363689422607, "learning_rate": 4.7025981753272514e-05, "loss": 3.2959, "step": 3000 }, { "epoch": 1.2296707655692185, "grad_norm": 5.724086284637451, "learning_rate": 4.6926814756049185e-05, "loss": 3.1135, "step": 3100 }, { "epoch": 1.269337564458548, "grad_norm": 5.9762163162231445, "learning_rate": 4.6827647758825864e-05, "loss": 3.2194, "step": 3200 }, { "epoch": 1.3090043633478778, "grad_norm": 5.490255355834961, "learning_rate": 4.6728480761602536e-05, "loss": 3.1226, "step": 3300 }, { "epoch": 1.3486711622372074, "grad_norm": 5.34173583984375, "learning_rate": 4.6629313764379215e-05, "loss": 3.0641, "step": 3400 }, { "epoch": 1.388337961126537, "grad_norm": 6.500023365020752, "learning_rate": 4.653014676715589e-05, "loss": 3.0342, "step": 3500 }, { "epoch": 1.4280047600158667, "grad_norm": 4.705812931060791, "learning_rate": 4.643097976993257e-05, "loss": 2.9638, "step": 3600 }, { "epoch": 1.4676715589051963, "grad_norm": 5.796449661254883, "learning_rate": 4.6331812772709244e-05, "loss": 2.9635, "step": 3700 }, { "epoch": 1.5073383577945259, "grad_norm": 5.73616886138916, "learning_rate": 4.623264577548592e-05, "loss": 2.9933, "step": 3800 }, { "epoch": 1.5470051566838556, "grad_norm": 5.073670864105225, "learning_rate": 4.6133478778262594e-05, "loss": 2.9872, "step": 3900 }, { "epoch": 1.5866719555731852, "grad_norm": 5.04343318939209, "learning_rate": 4.603431178103927e-05, "loss": 2.9624, "step": 4000 }, { "epoch": 1.6263387544625147, "grad_norm": 4.266116619110107, "learning_rate": 4.5935144783815945e-05, "loss": 2.9711, "step": 4100 }, { "epoch": 1.6660055533518445, "grad_norm": 4.732306957244873, "learning_rate": 4.583597778659262e-05, "loss": 2.8238, "step": 4200 }, { "epoch": 1.705672352241174, "grad_norm": 5.156635284423828, "learning_rate": 4.57368107893693e-05, "loss": 2.839, "step": 4300 }, { "epoch": 1.7453391511305036, "grad_norm": 6.178804874420166, "learning_rate": 4.563764379214598e-05, "loss": 2.7441, "step": 4400 }, { "epoch": 1.7850059500198334, "grad_norm": 6.307518482208252, "learning_rate": 4.553847679492265e-05, "loss": 2.7271, "step": 4500 }, { "epoch": 1.824672748909163, "grad_norm": 4.5322136878967285, "learning_rate": 4.5439309797699324e-05, "loss": 2.6839, "step": 4600 }, { "epoch": 1.8643395477984925, "grad_norm": 4.728321552276611, "learning_rate": 4.5340142800476e-05, "loss": 2.815, "step": 4700 }, { "epoch": 1.9040063466878223, "grad_norm": 5.051918029785156, "learning_rate": 4.5240975803252675e-05, "loss": 2.6735, "step": 4800 }, { "epoch": 1.9436731455771519, "grad_norm": 4.968688011169434, "learning_rate": 4.5141808806029353e-05, "loss": 2.6604, "step": 4900 }, { "epoch": 1.9833399444664814, "grad_norm": 4.792623996734619, "learning_rate": 4.504264180880603e-05, "loss": 2.6375, "step": 5000 }, { "epoch": 2.0, "eval_loss": 2.0515847206115723, "eval_runtime": 33.0931, "eval_samples_per_second": 45.871, "eval_steps_per_second": 5.741, "step": 5042 }, { "epoch": 2.023006743355811, "grad_norm": 7.228871822357178, "learning_rate": 4.494347481158271e-05, "loss": 2.5813, "step": 5100 }, { "epoch": 2.062673542245141, "grad_norm": 4.44078254699707, "learning_rate": 4.484430781435938e-05, "loss": 2.5798, "step": 5200 }, { "epoch": 2.1023403411344703, "grad_norm": 5.475325107574463, "learning_rate": 4.474514081713606e-05, "loss": 2.5297, "step": 5300 }, { "epoch": 2.1420071400238, "grad_norm": 4.271339416503906, "learning_rate": 4.464597381991273e-05, "loss": 2.5595, "step": 5400 }, { "epoch": 2.18167393891313, "grad_norm": 3.9716315269470215, "learning_rate": 4.454680682268941e-05, "loss": 2.5629, "step": 5500 }, { "epoch": 2.221340737802459, "grad_norm": 5.6469807624816895, "learning_rate": 4.4447639825466084e-05, "loss": 2.4691, "step": 5600 }, { "epoch": 2.261007536691789, "grad_norm": 4.760526657104492, "learning_rate": 4.434847282824276e-05, "loss": 2.606, "step": 5700 }, { "epoch": 2.300674335581119, "grad_norm": 5.259726047515869, "learning_rate": 4.424930583101944e-05, "loss": 2.4984, "step": 5800 }, { "epoch": 2.340341134470448, "grad_norm": 4.372512340545654, "learning_rate": 4.415013883379612e-05, "loss": 2.4104, "step": 5900 }, { "epoch": 2.380007933359778, "grad_norm": 5.21671724319458, "learning_rate": 4.405097183657279e-05, "loss": 2.4612, "step": 6000 }, { "epoch": 2.4196747322491077, "grad_norm": 4.706778049468994, "learning_rate": 4.395180483934946e-05, "loss": 2.3845, "step": 6100 }, { "epoch": 2.459341531138437, "grad_norm": 4.4265217781066895, "learning_rate": 4.385263784212614e-05, "loss": 2.4508, "step": 6200 }, { "epoch": 2.499008330027767, "grad_norm": 112.53572082519531, "learning_rate": 4.3753470844902814e-05, "loss": 2.3788, "step": 6300 }, { "epoch": 2.538675128917096, "grad_norm": 5.193419933319092, "learning_rate": 4.365430384767949e-05, "loss": 2.3999, "step": 6400 }, { "epoch": 2.578341927806426, "grad_norm": 4.786646842956543, "learning_rate": 4.355513685045617e-05, "loss": 2.3964, "step": 6500 }, { "epoch": 2.6180087266957557, "grad_norm": 4.764982223510742, "learning_rate": 4.345596985323285e-05, "loss": 2.2939, "step": 6600 }, { "epoch": 2.657675525585085, "grad_norm": 8.752727508544922, "learning_rate": 4.335680285600952e-05, "loss": 2.2859, "step": 6700 }, { "epoch": 2.697342324474415, "grad_norm": 5.419288158416748, "learning_rate": 4.32576358587862e-05, "loss": 2.3073, "step": 6800 }, { "epoch": 2.7370091233637446, "grad_norm": 3.573631763458252, "learning_rate": 4.315846886156287e-05, "loss": 2.1833, "step": 6900 }, { "epoch": 2.776675922253074, "grad_norm": 5.297525882720947, "learning_rate": 4.305930186433955e-05, "loss": 2.4245, "step": 7000 }, { "epoch": 2.8163427211424037, "grad_norm": 4.3615827560424805, "learning_rate": 4.296013486711622e-05, "loss": 2.2811, "step": 7100 }, { "epoch": 2.8560095200317335, "grad_norm": 6.935328960418701, "learning_rate": 4.28609678698929e-05, "loss": 2.2544, "step": 7200 }, { "epoch": 2.895676318921063, "grad_norm": 3.9425063133239746, "learning_rate": 4.276180087266958e-05, "loss": 2.2934, "step": 7300 }, { "epoch": 2.9353431178103926, "grad_norm": 6.062328815460205, "learning_rate": 4.266263387544626e-05, "loss": 2.3048, "step": 7400 }, { "epoch": 2.9750099166997224, "grad_norm": 4.808726787567139, "learning_rate": 4.256346687822293e-05, "loss": 2.2118, "step": 7500 }, { "epoch": 3.0, "eval_loss": 1.770484209060669, "eval_runtime": 33.1033, "eval_samples_per_second": 45.856, "eval_steps_per_second": 5.74, "step": 7563 }, { "epoch": 3.014676715589052, "grad_norm": 4.881776809692383, "learning_rate": 4.24642998809996e-05, "loss": 2.2472, "step": 7600 }, { "epoch": 3.0543435144783815, "grad_norm": 6.6921706199646, "learning_rate": 4.236513288377628e-05, "loss": 2.1728, "step": 7700 }, { "epoch": 3.0940103133677113, "grad_norm": 3.29506254196167, "learning_rate": 4.226596588655295e-05, "loss": 2.0689, "step": 7800 }, { "epoch": 3.133677112257041, "grad_norm": 4.864801406860352, "learning_rate": 4.216679888932963e-05, "loss": 2.2328, "step": 7900 }, { "epoch": 3.1733439111463704, "grad_norm": 3.8594539165496826, "learning_rate": 4.206763189210631e-05, "loss": 2.0911, "step": 8000 }, { "epoch": 3.2130107100357, "grad_norm": 5.1737380027771, "learning_rate": 4.196846489488299e-05, "loss": 2.0999, "step": 8100 }, { "epoch": 3.25267750892503, "grad_norm": 4.454146385192871, "learning_rate": 4.186929789765966e-05, "loss": 2.0902, "step": 8200 }, { "epoch": 3.2923443078143593, "grad_norm": 5.417801380157471, "learning_rate": 4.177013090043634e-05, "loss": 2.0971, "step": 8300 }, { "epoch": 3.332011106703689, "grad_norm": 2.7768959999084473, "learning_rate": 4.167096390321301e-05, "loss": 2.1635, "step": 8400 }, { "epoch": 3.371677905593019, "grad_norm": 4.387384414672852, "learning_rate": 4.157179690598969e-05, "loss": 2.0166, "step": 8500 }, { "epoch": 3.411344704482348, "grad_norm": 4.593613624572754, "learning_rate": 4.147262990876636e-05, "loss": 2.0944, "step": 8600 }, { "epoch": 3.451011503371678, "grad_norm": 5.243652820587158, "learning_rate": 4.137346291154304e-05, "loss": 2.0518, "step": 8700 }, { "epoch": 3.4906783022610077, "grad_norm": 5.076266765594482, "learning_rate": 4.127429591431972e-05, "loss": 2.0412, "step": 8800 }, { "epoch": 3.530345101150337, "grad_norm": 5.36345911026001, "learning_rate": 4.11751289170964e-05, "loss": 2.0586, "step": 8900 }, { "epoch": 3.570011900039667, "grad_norm": 6.591952800750732, "learning_rate": 4.107596191987307e-05, "loss": 2.059, "step": 9000 }, { "epoch": 3.609678698928996, "grad_norm": 5.091315746307373, "learning_rate": 4.097679492264974e-05, "loss": 2.0451, "step": 9100 }, { "epoch": 3.649345497818326, "grad_norm": 4.647657871246338, "learning_rate": 4.087762792542642e-05, "loss": 2.0488, "step": 9200 }, { "epoch": 3.6890122967076557, "grad_norm": 5.167809963226318, "learning_rate": 4.077846092820309e-05, "loss": 2.0688, "step": 9300 }, { "epoch": 3.728679095596985, "grad_norm": 67.48959350585938, "learning_rate": 4.067929393097977e-05, "loss": 2.0513, "step": 9400 }, { "epoch": 3.768345894486315, "grad_norm": 3.942390203475952, "learning_rate": 4.058012693375645e-05, "loss": 1.9697, "step": 9500 }, { "epoch": 3.8080126933756446, "grad_norm": 5.491151332855225, "learning_rate": 4.048095993653313e-05, "loss": 2.0849, "step": 9600 }, { "epoch": 3.847679492264974, "grad_norm": 4.637006759643555, "learning_rate": 4.03817929393098e-05, "loss": 1.9753, "step": 9700 }, { "epoch": 3.8873462911543037, "grad_norm": 4.818416595458984, "learning_rate": 4.028262594208648e-05, "loss": 2.0526, "step": 9800 }, { "epoch": 3.9270130900436335, "grad_norm": 4.810122013092041, "learning_rate": 4.018345894486315e-05, "loss": 2.0148, "step": 9900 }, { "epoch": 3.966679888932963, "grad_norm": 4.372331142425537, "learning_rate": 4.008429194763983e-05, "loss": 2.0324, "step": 10000 }, { "epoch": 4.0, "eval_loss": 1.5883285999298096, "eval_runtime": 33.141, "eval_samples_per_second": 45.804, "eval_steps_per_second": 5.733, "step": 10084 }, { "epoch": 4.006346687822293, "grad_norm": 4.643691539764404, "learning_rate": 3.99851249504165e-05, "loss": 2.0171, "step": 10100 }, { "epoch": 4.046013486711622, "grad_norm": 5.210694789886475, "learning_rate": 3.988595795319318e-05, "loss": 1.9401, "step": 10200 }, { "epoch": 4.085680285600952, "grad_norm": 5.724204063415527, "learning_rate": 3.978679095596986e-05, "loss": 1.9012, "step": 10300 }, { "epoch": 4.125347084490282, "grad_norm": 3.6750075817108154, "learning_rate": 3.9687623958746536e-05, "loss": 1.7982, "step": 10400 }, { "epoch": 4.165013883379611, "grad_norm": 4.948938369750977, "learning_rate": 3.958845696152321e-05, "loss": 1.9426, "step": 10500 }, { "epoch": 4.204680682268941, "grad_norm": 5.098011016845703, "learning_rate": 3.948928996429988e-05, "loss": 1.9476, "step": 10600 }, { "epoch": 4.244347481158271, "grad_norm": 3.605708599090576, "learning_rate": 3.939012296707656e-05, "loss": 1.91, "step": 10700 }, { "epoch": 4.2840142800476, "grad_norm": 4.1741766929626465, "learning_rate": 3.929095596985323e-05, "loss": 1.9512, "step": 10800 }, { "epoch": 4.3236810789369295, "grad_norm": 4.427469730377197, "learning_rate": 3.919178897262991e-05, "loss": 1.859, "step": 10900 }, { "epoch": 4.36334787782626, "grad_norm": 4.128306865692139, "learning_rate": 3.909262197540659e-05, "loss": 1.8465, "step": 11000 }, { "epoch": 4.403014676715589, "grad_norm": 3.959047317504883, "learning_rate": 3.8993454978183266e-05, "loss": 1.9168, "step": 11100 }, { "epoch": 4.442681475604918, "grad_norm": 5.283690452575684, "learning_rate": 3.889428798095994e-05, "loss": 1.8212, "step": 11200 }, { "epoch": 4.482348274494249, "grad_norm": 4.190108299255371, "learning_rate": 3.8795120983736616e-05, "loss": 1.7881, "step": 11300 }, { "epoch": 4.522015073383578, "grad_norm": 5.957630157470703, "learning_rate": 3.869595398651329e-05, "loss": 1.8606, "step": 11400 }, { "epoch": 4.561681872272907, "grad_norm": 4.41494607925415, "learning_rate": 3.859678698928997e-05, "loss": 1.8808, "step": 11500 }, { "epoch": 4.601348671162238, "grad_norm": 4.355372428894043, "learning_rate": 3.849761999206664e-05, "loss": 1.8034, "step": 11600 }, { "epoch": 4.641015470051567, "grad_norm": 4.594727993011475, "learning_rate": 3.839845299484332e-05, "loss": 1.8631, "step": 11700 }, { "epoch": 4.680682268940896, "grad_norm": 3.8081648349761963, "learning_rate": 3.8299285997619996e-05, "loss": 1.7664, "step": 11800 }, { "epoch": 4.7203490678302265, "grad_norm": 5.383887767791748, "learning_rate": 3.8200119000396675e-05, "loss": 1.8918, "step": 11900 }, { "epoch": 4.760015866719556, "grad_norm": 4.703048229217529, "learning_rate": 3.8100952003173347e-05, "loss": 1.7821, "step": 12000 }, { "epoch": 4.799682665608885, "grad_norm": 5.115866661071777, "learning_rate": 3.800178500595002e-05, "loss": 1.8276, "step": 12100 }, { "epoch": 4.839349464498215, "grad_norm": 4.647130012512207, "learning_rate": 3.79026180087267e-05, "loss": 1.7743, "step": 12200 }, { "epoch": 4.879016263387545, "grad_norm": 4.2948994636535645, "learning_rate": 3.780345101150337e-05, "loss": 1.7567, "step": 12300 }, { "epoch": 4.918683062276874, "grad_norm": 4.055002212524414, "learning_rate": 3.770428401428005e-05, "loss": 1.832, "step": 12400 }, { "epoch": 4.958349861166204, "grad_norm": 4.373877048492432, "learning_rate": 3.7605117017056726e-05, "loss": 1.7995, "step": 12500 }, { "epoch": 4.998016660055534, "grad_norm": 5.246423721313477, "learning_rate": 3.7505950019833405e-05, "loss": 1.7464, "step": 12600 }, { "epoch": 5.0, "eval_loss": 1.4670053720474243, "eval_runtime": 33.135, "eval_samples_per_second": 45.813, "eval_steps_per_second": 5.734, "step": 12605 }, { "epoch": 5.037683458944863, "grad_norm": 5.669796943664551, "learning_rate": 3.740678302261008e-05, "loss": 1.6803, "step": 12700 }, { "epoch": 5.077350257834193, "grad_norm": 4.203566074371338, "learning_rate": 3.7307616025386755e-05, "loss": 1.665, "step": 12800 }, { "epoch": 5.1170170567235225, "grad_norm": 3.6892035007476807, "learning_rate": 3.720844902816343e-05, "loss": 1.7269, "step": 12900 }, { "epoch": 5.156683855612852, "grad_norm": 4.452983379364014, "learning_rate": 3.7109282030940106e-05, "loss": 1.7276, "step": 13000 }, { "epoch": 5.196350654502182, "grad_norm": 3.7172744274139404, "learning_rate": 3.701011503371678e-05, "loss": 1.6838, "step": 13100 }, { "epoch": 5.236017453391511, "grad_norm": 4.209805488586426, "learning_rate": 3.6910948036493456e-05, "loss": 1.6982, "step": 13200 }, { "epoch": 5.275684252280841, "grad_norm": 4.2851362228393555, "learning_rate": 3.6811781039270135e-05, "loss": 1.6656, "step": 13300 }, { "epoch": 5.315351051170171, "grad_norm": 3.345033884048462, "learning_rate": 3.6712614042046814e-05, "loss": 1.7176, "step": 13400 }, { "epoch": 5.3550178500595, "grad_norm": 7.854482173919678, "learning_rate": 3.6613447044823485e-05, "loss": 1.7395, "step": 13500 }, { "epoch": 5.39468464894883, "grad_norm": 5.201159477233887, "learning_rate": 3.651428004760016e-05, "loss": 1.7159, "step": 13600 }, { "epoch": 5.43435144783816, "grad_norm": 5.032053470611572, "learning_rate": 3.6415113050376836e-05, "loss": 1.7993, "step": 13700 }, { "epoch": 5.474018246727489, "grad_norm": 4.350612640380859, "learning_rate": 3.631594605315351e-05, "loss": 1.722, "step": 13800 }, { "epoch": 5.5136850456168185, "grad_norm": 5.67685604095459, "learning_rate": 3.6216779055930186e-05, "loss": 1.5882, "step": 13900 }, { "epoch": 5.553351844506149, "grad_norm": 3.8744733333587646, "learning_rate": 3.6117612058706865e-05, "loss": 1.6864, "step": 14000 }, { "epoch": 5.593018643395478, "grad_norm": 3.0556750297546387, "learning_rate": 3.6018445061483544e-05, "loss": 1.6494, "step": 14100 }, { "epoch": 5.632685442284807, "grad_norm": 5.0797343254089355, "learning_rate": 3.5919278064260215e-05, "loss": 1.747, "step": 14200 }, { "epoch": 5.672352241174138, "grad_norm": 4.625453948974609, "learning_rate": 3.5820111067036894e-05, "loss": 1.689, "step": 14300 }, { "epoch": 5.712019040063467, "grad_norm": 5.2560133934021, "learning_rate": 3.5720944069813566e-05, "loss": 1.6218, "step": 14400 }, { "epoch": 5.751685838952796, "grad_norm": 4.45328950881958, "learning_rate": 3.5621777072590245e-05, "loss": 1.7095, "step": 14500 }, { "epoch": 5.7913526378421265, "grad_norm": 3.0788328647613525, "learning_rate": 3.5522610075366916e-05, "loss": 1.7223, "step": 14600 }, { "epoch": 5.831019436731456, "grad_norm": 6.247290134429932, "learning_rate": 3.5423443078143595e-05, "loss": 1.6158, "step": 14700 }, { "epoch": 5.870686235620785, "grad_norm": 4.095520973205566, "learning_rate": 3.5324276080920274e-05, "loss": 1.6313, "step": 14800 }, { "epoch": 5.910353034510115, "grad_norm": 4.251845836639404, "learning_rate": 3.522510908369695e-05, "loss": 1.6458, "step": 14900 }, { "epoch": 5.950019833399445, "grad_norm": 3.833702802658081, "learning_rate": 3.5125942086473624e-05, "loss": 1.6425, "step": 15000 }, { "epoch": 5.989686632288774, "grad_norm": 4.577655792236328, "learning_rate": 3.5026775089250296e-05, "loss": 1.6747, "step": 15100 }, { "epoch": 6.0, "eval_loss": 1.3812555074691772, "eval_runtime": 33.4674, "eval_samples_per_second": 45.358, "eval_steps_per_second": 5.677, "step": 15126 }, { "epoch": 6.029353431178104, "grad_norm": 3.609616279602051, "learning_rate": 3.4927608092026975e-05, "loss": 1.6579, "step": 15200 }, { "epoch": 6.069020230067434, "grad_norm": 3.5658130645751953, "learning_rate": 3.4828441094803647e-05, "loss": 1.6842, "step": 15300 }, { "epoch": 6.108687028956763, "grad_norm": 4.586058139801025, "learning_rate": 3.4729274097580325e-05, "loss": 1.5563, "step": 15400 }, { "epoch": 6.148353827846093, "grad_norm": 5.103824615478516, "learning_rate": 3.4630107100357004e-05, "loss": 1.5386, "step": 15500 }, { "epoch": 6.1880206267354225, "grad_norm": 5.14306116104126, "learning_rate": 3.453094010313368e-05, "loss": 1.6081, "step": 15600 }, { "epoch": 6.227687425624752, "grad_norm": 4.270661354064941, "learning_rate": 3.4431773105910354e-05, "loss": 1.5569, "step": 15700 }, { "epoch": 6.267354224514082, "grad_norm": 13.869562149047852, "learning_rate": 3.433260610868703e-05, "loss": 1.5484, "step": 15800 }, { "epoch": 6.307021023403411, "grad_norm": 3.9378180503845215, "learning_rate": 3.4233439111463705e-05, "loss": 1.5441, "step": 15900 }, { "epoch": 6.346687822292741, "grad_norm": 4.3542656898498535, "learning_rate": 3.4134272114240383e-05, "loss": 1.58, "step": 16000 }, { "epoch": 6.386354621182071, "grad_norm": 3.8545126914978027, "learning_rate": 3.4035105117017055e-05, "loss": 1.4445, "step": 16100 }, { "epoch": 6.4260214200714, "grad_norm": 3.9810452461242676, "learning_rate": 3.3935938119793734e-05, "loss": 1.6052, "step": 16200 }, { "epoch": 6.46568821896073, "grad_norm": 7.306039333343506, "learning_rate": 3.383677112257041e-05, "loss": 1.608, "step": 16300 }, { "epoch": 6.50535501785006, "grad_norm": 4.018649578094482, "learning_rate": 3.373760412534709e-05, "loss": 1.581, "step": 16400 }, { "epoch": 6.545021816739389, "grad_norm": 5.1577019691467285, "learning_rate": 3.363843712812376e-05, "loss": 1.548, "step": 16500 }, { "epoch": 6.5846886156287185, "grad_norm": 6.858482837677002, "learning_rate": 3.3539270130900435e-05, "loss": 1.5823, "step": 16600 }, { "epoch": 6.624355414518049, "grad_norm": 4.213831901550293, "learning_rate": 3.3440103133677114e-05, "loss": 1.5199, "step": 16700 }, { "epoch": 6.664022213407378, "grad_norm": 3.531313180923462, "learning_rate": 3.3340936136453785e-05, "loss": 1.5993, "step": 16800 }, { "epoch": 6.703689012296707, "grad_norm": 4.222484588623047, "learning_rate": 3.3241769139230464e-05, "loss": 1.5839, "step": 16900 }, { "epoch": 6.743355811186038, "grad_norm": 3.11354660987854, "learning_rate": 3.314260214200714e-05, "loss": 1.5302, "step": 17000 }, { "epoch": 6.783022610075367, "grad_norm": 3.699721574783325, "learning_rate": 3.304343514478382e-05, "loss": 1.5662, "step": 17100 }, { "epoch": 6.822689408964696, "grad_norm": 6.095912456512451, "learning_rate": 3.294426814756049e-05, "loss": 1.6208, "step": 17200 }, { "epoch": 6.862356207854027, "grad_norm": 3.0489301681518555, "learning_rate": 3.284510115033717e-05, "loss": 1.4306, "step": 17300 }, { "epoch": 6.902023006743356, "grad_norm": 4.094913005828857, "learning_rate": 3.2745934153113844e-05, "loss": 1.438, "step": 17400 }, { "epoch": 6.941689805632685, "grad_norm": 3.900447130203247, "learning_rate": 3.264676715589052e-05, "loss": 1.4798, "step": 17500 }, { "epoch": 6.9813566045220155, "grad_norm": 4.244141578674316, "learning_rate": 3.2547600158667194e-05, "loss": 1.5627, "step": 17600 }, { "epoch": 7.0, "eval_loss": 1.3121882677078247, "eval_runtime": 33.1418, "eval_samples_per_second": 45.803, "eval_steps_per_second": 5.733, "step": 17647 }, { "epoch": 7.021023403411345, "grad_norm": 5.134289264678955, "learning_rate": 3.244843316144387e-05, "loss": 1.4704, "step": 17700 }, { "epoch": 7.060690202300674, "grad_norm": 4.705554008483887, "learning_rate": 3.234926616422055e-05, "loss": 1.4257, "step": 17800 }, { "epoch": 7.100357001190004, "grad_norm": 5.20936918258667, "learning_rate": 3.225009916699723e-05, "loss": 1.4684, "step": 17900 }, { "epoch": 7.140023800079334, "grad_norm": 5.669763565063477, "learning_rate": 3.21509321697739e-05, "loss": 1.4626, "step": 18000 }, { "epoch": 7.179690598968663, "grad_norm": 4.726533889770508, "learning_rate": 3.2051765172550574e-05, "loss": 1.4362, "step": 18100 }, { "epoch": 7.219357397857993, "grad_norm": 3.413167715072632, "learning_rate": 3.195259817532725e-05, "loss": 1.4591, "step": 18200 }, { "epoch": 7.259024196747323, "grad_norm": 3.3368911743164062, "learning_rate": 3.1853431178103924e-05, "loss": 1.5077, "step": 18300 }, { "epoch": 7.298690995636652, "grad_norm": 3.5089704990386963, "learning_rate": 3.17542641808806e-05, "loss": 1.509, "step": 18400 }, { "epoch": 7.338357794525982, "grad_norm": 4.13053035736084, "learning_rate": 3.165509718365728e-05, "loss": 1.5048, "step": 18500 }, { "epoch": 7.3780245934153115, "grad_norm": 4.646170139312744, "learning_rate": 3.155593018643396e-05, "loss": 1.474, "step": 18600 }, { "epoch": 7.417691392304641, "grad_norm": 4.4724812507629395, "learning_rate": 3.145676318921063e-05, "loss": 1.5448, "step": 18700 }, { "epoch": 7.457358191193971, "grad_norm": 3.79464054107666, "learning_rate": 3.135759619198731e-05, "loss": 1.4379, "step": 18800 }, { "epoch": 7.4970249900833, "grad_norm": 3.2396161556243896, "learning_rate": 3.125842919476398e-05, "loss": 1.4857, "step": 18900 }, { "epoch": 7.53669178897263, "grad_norm": 3.6047024726867676, "learning_rate": 3.115926219754066e-05, "loss": 1.453, "step": 19000 }, { "epoch": 7.57635858786196, "grad_norm": 4.998748779296875, "learning_rate": 3.106009520031733e-05, "loss": 1.4062, "step": 19100 }, { "epoch": 7.616025386751289, "grad_norm": 4.068435192108154, "learning_rate": 3.096092820309401e-05, "loss": 1.3582, "step": 19200 }, { "epoch": 7.655692185640619, "grad_norm": 5.680367469787598, "learning_rate": 3.086176120587069e-05, "loss": 1.4897, "step": 19300 }, { "epoch": 7.695358984529949, "grad_norm": 3.917802333831787, "learning_rate": 3.076259420864737e-05, "loss": 1.4195, "step": 19400 }, { "epoch": 7.735025783419278, "grad_norm": 3.1522891521453857, "learning_rate": 3.066342721142404e-05, "loss": 1.4599, "step": 19500 }, { "epoch": 7.7746925823086075, "grad_norm": 4.597601890563965, "learning_rate": 3.056426021420071e-05, "loss": 1.4701, "step": 19600 }, { "epoch": 7.814359381197937, "grad_norm": 4.217317581176758, "learning_rate": 3.046509321697739e-05, "loss": 1.4263, "step": 19700 }, { "epoch": 7.854026180087267, "grad_norm": 4.17954158782959, "learning_rate": 3.0365926219754066e-05, "loss": 1.4155, "step": 19800 }, { "epoch": 7.893692978976596, "grad_norm": 4.049231052398682, "learning_rate": 3.0266759222530745e-05, "loss": 1.4343, "step": 19900 }, { "epoch": 7.933359777865926, "grad_norm": 3.9351389408111572, "learning_rate": 3.0167592225307417e-05, "loss": 1.4247, "step": 20000 }, { "epoch": 7.973026576755256, "grad_norm": 6.478794097900391, "learning_rate": 3.0068425228084096e-05, "loss": 1.4336, "step": 20100 }, { "epoch": 8.0, "eval_loss": 1.2493535280227661, "eval_runtime": 33.1532, "eval_samples_per_second": 45.787, "eval_steps_per_second": 5.731, "step": 20168 }, { "epoch": 8.012693375644586, "grad_norm": 7.988471508026123, "learning_rate": 2.996925823086077e-05, "loss": 1.4408, "step": 20200 }, { "epoch": 8.052360174533915, "grad_norm": 3.978797674179077, "learning_rate": 2.987009123363745e-05, "loss": 1.4227, "step": 20300 }, { "epoch": 8.092026973423245, "grad_norm": 2.8589699268341064, "learning_rate": 2.977092423641412e-05, "loss": 1.3348, "step": 20400 }, { "epoch": 8.131693772312575, "grad_norm": 4.3820061683654785, "learning_rate": 2.96717572391908e-05, "loss": 1.3374, "step": 20500 }, { "epoch": 8.171360571201904, "grad_norm": 4.421834468841553, "learning_rate": 2.9572590241967475e-05, "loss": 1.379, "step": 20600 }, { "epoch": 8.211027370091234, "grad_norm": 3.6717193126678467, "learning_rate": 2.9473423244744154e-05, "loss": 1.3878, "step": 20700 }, { "epoch": 8.250694168980564, "grad_norm": 5.8960466384887695, "learning_rate": 2.9374256247520826e-05, "loss": 1.418, "step": 20800 }, { "epoch": 8.290360967869892, "grad_norm": 4.1541428565979, "learning_rate": 2.9275089250297504e-05, "loss": 1.3427, "step": 20900 }, { "epoch": 8.330027766759223, "grad_norm": 4.0375566482543945, "learning_rate": 2.917592225307418e-05, "loss": 1.3659, "step": 21000 }, { "epoch": 8.369694565648553, "grad_norm": 2.6886465549468994, "learning_rate": 2.907675525585085e-05, "loss": 1.3568, "step": 21100 }, { "epoch": 8.409361364537881, "grad_norm": 4.069731712341309, "learning_rate": 2.897758825862753e-05, "loss": 1.4326, "step": 21200 }, { "epoch": 8.449028163427212, "grad_norm": 4.844085693359375, "learning_rate": 2.8878421261404205e-05, "loss": 1.4363, "step": 21300 }, { "epoch": 8.488694962316542, "grad_norm": 2.894545316696167, "learning_rate": 2.8779254264180884e-05, "loss": 1.362, "step": 21400 }, { "epoch": 8.52836176120587, "grad_norm": 3.8921375274658203, "learning_rate": 2.8680087266957556e-05, "loss": 1.3303, "step": 21500 }, { "epoch": 8.5680285600952, "grad_norm": 3.6468684673309326, "learning_rate": 2.8580920269734234e-05, "loss": 1.387, "step": 21600 }, { "epoch": 8.60769535898453, "grad_norm": 4.2180938720703125, "learning_rate": 2.848175327251091e-05, "loss": 1.366, "step": 21700 }, { "epoch": 8.647362157873859, "grad_norm": 4.113888263702393, "learning_rate": 2.8382586275287588e-05, "loss": 1.4047, "step": 21800 }, { "epoch": 8.68702895676319, "grad_norm": 4.009461402893066, "learning_rate": 2.828341927806426e-05, "loss": 1.3446, "step": 21900 }, { "epoch": 8.72669575565252, "grad_norm": 3.8195252418518066, "learning_rate": 2.818425228084094e-05, "loss": 1.3304, "step": 22000 }, { "epoch": 8.766362554541848, "grad_norm": 4.5541300773620605, "learning_rate": 2.8085085283617614e-05, "loss": 1.4156, "step": 22100 }, { "epoch": 8.806029353431178, "grad_norm": 4.221588611602783, "learning_rate": 2.7985918286394293e-05, "loss": 1.3258, "step": 22200 }, { "epoch": 8.845696152320508, "grad_norm": 3.7638354301452637, "learning_rate": 2.7886751289170964e-05, "loss": 1.2697, "step": 22300 }, { "epoch": 8.885362951209837, "grad_norm": 3.7174267768859863, "learning_rate": 2.7787584291947643e-05, "loss": 1.3468, "step": 22400 }, { "epoch": 8.925029750099167, "grad_norm": 4.4955153465271, "learning_rate": 2.768841729472432e-05, "loss": 1.3074, "step": 22500 }, { "epoch": 8.964696548988497, "grad_norm": 4.170012950897217, "learning_rate": 2.758925029750099e-05, "loss": 1.3324, "step": 22600 }, { "epoch": 9.0, "eval_loss": 1.204575538635254, "eval_runtime": 33.1963, "eval_samples_per_second": 45.728, "eval_steps_per_second": 5.724, "step": 22689 }, { "epoch": 9.004363347877826, "grad_norm": 3.331163167953491, "learning_rate": 2.749008330027767e-05, "loss": 1.3285, "step": 22700 }, { "epoch": 9.044030146767156, "grad_norm": 3.822847843170166, "learning_rate": 2.7390916303054344e-05, "loss": 1.3589, "step": 22800 }, { "epoch": 9.083696945656486, "grad_norm": 3.4321391582489014, "learning_rate": 2.7291749305831023e-05, "loss": 1.2863, "step": 22900 }, { "epoch": 9.123363744545815, "grad_norm": 4.23520040512085, "learning_rate": 2.7192582308607695e-05, "loss": 1.297, "step": 23000 }, { "epoch": 9.163030543435145, "grad_norm": 3.0839881896972656, "learning_rate": 2.7093415311384373e-05, "loss": 1.272, "step": 23100 }, { "epoch": 9.202697342324475, "grad_norm": 5.115342617034912, "learning_rate": 2.699424831416105e-05, "loss": 1.2667, "step": 23200 }, { "epoch": 9.242364141213804, "grad_norm": 3.8965401649475098, "learning_rate": 2.6895081316937727e-05, "loss": 1.2995, "step": 23300 }, { "epoch": 9.282030940103134, "grad_norm": 3.395707368850708, "learning_rate": 2.67959143197144e-05, "loss": 1.2064, "step": 23400 }, { "epoch": 9.321697738992464, "grad_norm": 3.7783238887786865, "learning_rate": 2.6696747322491078e-05, "loss": 1.354, "step": 23500 }, { "epoch": 9.361364537881792, "grad_norm": 3.6201136112213135, "learning_rate": 2.6597580325267753e-05, "loss": 1.318, "step": 23600 }, { "epoch": 9.401031336771123, "grad_norm": 7.127315044403076, "learning_rate": 2.649841332804443e-05, "loss": 1.2809, "step": 23700 }, { "epoch": 9.440698135660453, "grad_norm": 3.341298818588257, "learning_rate": 2.6399246330821103e-05, "loss": 1.3285, "step": 23800 }, { "epoch": 9.480364934549781, "grad_norm": 3.38814377784729, "learning_rate": 2.6300079333597782e-05, "loss": 1.3326, "step": 23900 }, { "epoch": 9.520031733439112, "grad_norm": 2.880125045776367, "learning_rate": 2.6200912336374457e-05, "loss": 1.3142, "step": 24000 }, { "epoch": 9.559698532328442, "grad_norm": 3.778383731842041, "learning_rate": 2.610174533915113e-05, "loss": 1.3217, "step": 24100 }, { "epoch": 9.59936533121777, "grad_norm": 5.5109734535217285, "learning_rate": 2.6002578341927808e-05, "loss": 1.2715, "step": 24200 }, { "epoch": 9.6390321301071, "grad_norm": 3.931368112564087, "learning_rate": 2.5903411344704483e-05, "loss": 1.318, "step": 24300 }, { "epoch": 9.67869892899643, "grad_norm": 3.6587719917297363, "learning_rate": 2.580424434748116e-05, "loss": 1.2384, "step": 24400 }, { "epoch": 9.71836572788576, "grad_norm": 3.4478108882904053, "learning_rate": 2.5705077350257833e-05, "loss": 1.2682, "step": 24500 }, { "epoch": 9.75803252677509, "grad_norm": 3.9226527214050293, "learning_rate": 2.5605910353034512e-05, "loss": 1.2966, "step": 24600 }, { "epoch": 9.79769932566442, "grad_norm": 4.621306419372559, "learning_rate": 2.5506743355811187e-05, "loss": 1.2788, "step": 24700 }, { "epoch": 9.837366124553748, "grad_norm": 3.4298593997955322, "learning_rate": 2.5407576358587866e-05, "loss": 1.3299, "step": 24800 }, { "epoch": 9.877032923443078, "grad_norm": 3.7832400798797607, "learning_rate": 2.5308409361364538e-05, "loss": 1.2634, "step": 24900 }, { "epoch": 9.916699722332408, "grad_norm": 5.351818561553955, "learning_rate": 2.5209242364141216e-05, "loss": 1.3117, "step": 25000 }, { "epoch": 9.956366521221737, "grad_norm": 4.65415096282959, "learning_rate": 2.511007536691789e-05, "loss": 1.2613, "step": 25100 }, { "epoch": 9.996033320111067, "grad_norm": 3.2736618518829346, "learning_rate": 2.501090836969457e-05, "loss": 1.3156, "step": 25200 }, { "epoch": 10.0, "eval_loss": 1.164570689201355, "eval_runtime": 33.0846, "eval_samples_per_second": 45.882, "eval_steps_per_second": 5.743, "step": 25210 }, { "epoch": 10.035700119000397, "grad_norm": 3.6819069385528564, "learning_rate": 2.4911741372471242e-05, "loss": 1.2604, "step": 25300 }, { "epoch": 10.075366917889726, "grad_norm": 3.9212143421173096, "learning_rate": 2.4812574375247917e-05, "loss": 1.2308, "step": 25400 }, { "epoch": 10.115033716779056, "grad_norm": 3.3087549209594727, "learning_rate": 2.4713407378024596e-05, "loss": 1.1652, "step": 25500 }, { "epoch": 10.154700515668386, "grad_norm": 3.8680827617645264, "learning_rate": 2.461424038080127e-05, "loss": 1.2311, "step": 25600 }, { "epoch": 10.194367314557715, "grad_norm": 5.244319438934326, "learning_rate": 2.4515073383577946e-05, "loss": 1.1819, "step": 25700 }, { "epoch": 10.234034113447045, "grad_norm": 3.2293717861175537, "learning_rate": 2.4415906386354622e-05, "loss": 1.249, "step": 25800 }, { "epoch": 10.273700912336375, "grad_norm": 4.391103744506836, "learning_rate": 2.43167393891313e-05, "loss": 1.2283, "step": 25900 }, { "epoch": 10.313367711225704, "grad_norm": 4.615547180175781, "learning_rate": 2.4217572391907976e-05, "loss": 1.2915, "step": 26000 }, { "epoch": 10.353034510115034, "grad_norm": 3.3367502689361572, "learning_rate": 2.411840539468465e-05, "loss": 1.2221, "step": 26100 }, { "epoch": 10.392701309004364, "grad_norm": 5.194177150726318, "learning_rate": 2.4019238397461326e-05, "loss": 1.2611, "step": 26200 }, { "epoch": 10.432368107893693, "grad_norm": 5.576562404632568, "learning_rate": 2.3920071400238005e-05, "loss": 1.2764, "step": 26300 }, { "epoch": 10.472034906783023, "grad_norm": 4.902477264404297, "learning_rate": 2.3820904403014677e-05, "loss": 1.2066, "step": 26400 }, { "epoch": 10.511701705672353, "grad_norm": 4.312764644622803, "learning_rate": 2.3721737405791352e-05, "loss": 1.219, "step": 26500 }, { "epoch": 10.551368504561681, "grad_norm": 4.345120429992676, "learning_rate": 2.362257040856803e-05, "loss": 1.2679, "step": 26600 }, { "epoch": 10.591035303451012, "grad_norm": 3.9365150928497314, "learning_rate": 2.3523403411344706e-05, "loss": 1.1752, "step": 26700 }, { "epoch": 10.630702102340342, "grad_norm": 3.843207597732544, "learning_rate": 2.342423641412138e-05, "loss": 1.2706, "step": 26800 }, { "epoch": 10.67036890122967, "grad_norm": 4.076716423034668, "learning_rate": 2.3325069416898056e-05, "loss": 1.1561, "step": 26900 }, { "epoch": 10.710035700119, "grad_norm": 4.182331562042236, "learning_rate": 2.3225902419674735e-05, "loss": 1.2027, "step": 27000 }, { "epoch": 10.74970249900833, "grad_norm": 5.730105876922607, "learning_rate": 2.312673542245141e-05, "loss": 1.2703, "step": 27100 }, { "epoch": 10.78936929789766, "grad_norm": 5.552068710327148, "learning_rate": 2.3027568425228085e-05, "loss": 1.252, "step": 27200 }, { "epoch": 10.82903609678699, "grad_norm": 4.406209945678711, "learning_rate": 2.292840142800476e-05, "loss": 1.183, "step": 27300 }, { "epoch": 10.86870289567632, "grad_norm": 3.434688091278076, "learning_rate": 2.282923443078144e-05, "loss": 1.3214, "step": 27400 }, { "epoch": 10.908369694565648, "grad_norm": 5.0344085693359375, "learning_rate": 2.2730067433558114e-05, "loss": 1.3043, "step": 27500 }, { "epoch": 10.948036493454978, "grad_norm": 3.3030033111572266, "learning_rate": 2.263090043633479e-05, "loss": 1.1764, "step": 27600 }, { "epoch": 10.987703292344309, "grad_norm": 5.79923152923584, "learning_rate": 2.2531733439111465e-05, "loss": 1.2218, "step": 27700 }, { "epoch": 11.0, "eval_loss": 1.1352205276489258, "eval_runtime": 31.6991, "eval_samples_per_second": 47.888, "eval_steps_per_second": 5.994, "step": 27731 }, { "epoch": 11.027370091233637, "grad_norm": 4.073122501373291, "learning_rate": 2.2432566441888144e-05, "loss": 1.1861, "step": 27800 }, { "epoch": 11.067036890122967, "grad_norm": 2.8648433685302734, "learning_rate": 2.2333399444664815e-05, "loss": 1.1659, "step": 27900 }, { "epoch": 11.106703689012297, "grad_norm": 3.6053709983825684, "learning_rate": 2.223423244744149e-05, "loss": 1.2087, "step": 28000 }, { "epoch": 11.146370487901626, "grad_norm": 3.5773251056671143, "learning_rate": 2.2135065450218166e-05, "loss": 1.1787, "step": 28100 }, { "epoch": 11.186037286790956, "grad_norm": 5.5593485832214355, "learning_rate": 2.2035898452994845e-05, "loss": 1.1941, "step": 28200 }, { "epoch": 11.225704085680286, "grad_norm": 3.9467504024505615, "learning_rate": 2.193673145577152e-05, "loss": 1.2505, "step": 28300 }, { "epoch": 11.265370884569615, "grad_norm": 4.707422733306885, "learning_rate": 2.1837564458548195e-05, "loss": 1.1165, "step": 28400 }, { "epoch": 11.305037683458945, "grad_norm": 4.517952919006348, "learning_rate": 2.1738397461324874e-05, "loss": 1.2379, "step": 28500 }, { "epoch": 11.344704482348275, "grad_norm": 2.318586587905884, "learning_rate": 2.163923046410155e-05, "loss": 1.2098, "step": 28600 }, { "epoch": 11.384371281237604, "grad_norm": 3.655980110168457, "learning_rate": 2.1540063466878224e-05, "loss": 1.2044, "step": 28700 }, { "epoch": 11.424038080126934, "grad_norm": 4.038224697113037, "learning_rate": 2.14408964696549e-05, "loss": 1.1651, "step": 28800 }, { "epoch": 11.463704879016264, "grad_norm": 3.9811367988586426, "learning_rate": 2.1341729472431578e-05, "loss": 1.199, "step": 28900 }, { "epoch": 11.503371677905593, "grad_norm": 6.200103759765625, "learning_rate": 2.1242562475208253e-05, "loss": 1.1094, "step": 29000 }, { "epoch": 11.543038476794923, "grad_norm": 3.919187545776367, "learning_rate": 2.114339547798493e-05, "loss": 1.1522, "step": 29100 }, { "epoch": 11.582705275684253, "grad_norm": 3.701822519302368, "learning_rate": 2.1044228480761604e-05, "loss": 1.1556, "step": 29200 }, { "epoch": 11.622372074573581, "grad_norm": 4.491922855377197, "learning_rate": 2.0945061483538282e-05, "loss": 1.1779, "step": 29300 }, { "epoch": 11.662038873462912, "grad_norm": 4.367665767669678, "learning_rate": 2.0845894486314954e-05, "loss": 1.1392, "step": 29400 }, { "epoch": 11.701705672352242, "grad_norm": 4.0435028076171875, "learning_rate": 2.074672748909163e-05, "loss": 1.1621, "step": 29500 }, { "epoch": 11.74137247124157, "grad_norm": 4.151968955993652, "learning_rate": 2.0647560491868305e-05, "loss": 1.1983, "step": 29600 }, { "epoch": 11.7810392701309, "grad_norm": 4.687623500823975, "learning_rate": 2.0548393494644983e-05, "loss": 1.1563, "step": 29700 }, { "epoch": 11.82070606902023, "grad_norm": 4.415579795837402, "learning_rate": 2.044922649742166e-05, "loss": 1.1497, "step": 29800 }, { "epoch": 11.86037286790956, "grad_norm": 4.241002082824707, "learning_rate": 2.0350059500198334e-05, "loss": 1.2298, "step": 29900 }, { "epoch": 11.90003966679889, "grad_norm": 5.38535213470459, "learning_rate": 2.025089250297501e-05, "loss": 1.1403, "step": 30000 }, { "epoch": 11.93970646568822, "grad_norm": 3.886983633041382, "learning_rate": 2.0151725505751688e-05, "loss": 1.237, "step": 30100 }, { "epoch": 11.979373264577548, "grad_norm": 4.2845048904418945, "learning_rate": 2.0052558508528363e-05, "loss": 1.2216, "step": 30200 }, { "epoch": 12.0, "eval_loss": 1.097899317741394, "eval_runtime": 31.7141, "eval_samples_per_second": 47.865, "eval_steps_per_second": 5.991, "step": 30252 }, { "epoch": 12.019040063466878, "grad_norm": 4.043181896209717, "learning_rate": 1.9953391511305038e-05, "loss": 1.1738, "step": 30300 }, { "epoch": 12.058706862356209, "grad_norm": 3.213641405105591, "learning_rate": 1.9854224514081713e-05, "loss": 1.1143, "step": 30400 }, { "epoch": 12.098373661245537, "grad_norm": 4.7294511795043945, "learning_rate": 1.9755057516858392e-05, "loss": 1.142, "step": 30500 }, { "epoch": 12.138040460134867, "grad_norm": 4.42033052444458, "learning_rate": 1.9655890519635067e-05, "loss": 1.1422, "step": 30600 }, { "epoch": 12.177707259024197, "grad_norm": 4.57334041595459, "learning_rate": 1.9556723522411743e-05, "loss": 1.1148, "step": 30700 }, { "epoch": 12.217374057913526, "grad_norm": 4.560477256774902, "learning_rate": 1.945755652518842e-05, "loss": 1.1742, "step": 30800 }, { "epoch": 12.257040856802856, "grad_norm": 3.4284374713897705, "learning_rate": 1.9358389527965093e-05, "loss": 1.1115, "step": 30900 }, { "epoch": 12.296707655692186, "grad_norm": 3.185410499572754, "learning_rate": 1.925922253074177e-05, "loss": 1.1542, "step": 31000 }, { "epoch": 12.336374454581515, "grad_norm": 3.674408435821533, "learning_rate": 1.9160055533518444e-05, "loss": 1.1787, "step": 31100 }, { "epoch": 12.376041253470845, "grad_norm": 3.7118613719940186, "learning_rate": 1.9060888536295122e-05, "loss": 1.1716, "step": 31200 }, { "epoch": 12.415708052360175, "grad_norm": 4.5831756591796875, "learning_rate": 1.8961721539071797e-05, "loss": 1.1372, "step": 31300 }, { "epoch": 12.455374851249504, "grad_norm": 7.098066806793213, "learning_rate": 1.8862554541848473e-05, "loss": 1.1361, "step": 31400 }, { "epoch": 12.495041650138834, "grad_norm": 3.451817512512207, "learning_rate": 1.8763387544625148e-05, "loss": 1.1458, "step": 31500 }, { "epoch": 12.534708449028164, "grad_norm": 2.6188955307006836, "learning_rate": 1.8664220547401827e-05, "loss": 1.0782, "step": 31600 }, { "epoch": 12.574375247917493, "grad_norm": 3.3588056564331055, "learning_rate": 1.8565053550178502e-05, "loss": 1.1593, "step": 31700 }, { "epoch": 12.614042046806823, "grad_norm": 5.186858654022217, "learning_rate": 1.8465886552955177e-05, "loss": 1.137, "step": 31800 }, { "epoch": 12.653708845696153, "grad_norm": 4.593524932861328, "learning_rate": 1.8366719555731852e-05, "loss": 1.1715, "step": 31900 }, { "epoch": 12.693375644585482, "grad_norm": 4.951717853546143, "learning_rate": 1.826755255850853e-05, "loss": 1.0765, "step": 32000 }, { "epoch": 12.733042443474812, "grad_norm": 6.989925384521484, "learning_rate": 1.8168385561285206e-05, "loss": 1.1062, "step": 32100 }, { "epoch": 12.772709242364142, "grad_norm": 3.6436753273010254, "learning_rate": 1.806921856406188e-05, "loss": 1.1574, "step": 32200 }, { "epoch": 12.81237604125347, "grad_norm": 4.659509181976318, "learning_rate": 1.7970051566838557e-05, "loss": 1.1257, "step": 32300 }, { "epoch": 12.8520428401428, "grad_norm": 2.914414882659912, "learning_rate": 1.7870884569615232e-05, "loss": 1.1131, "step": 32400 }, { "epoch": 12.89170963903213, "grad_norm": 3.9510741233825684, "learning_rate": 1.7771717572391907e-05, "loss": 1.1144, "step": 32500 }, { "epoch": 12.93137643792146, "grad_norm": 4.820216178894043, "learning_rate": 1.7672550575168582e-05, "loss": 1.1628, "step": 32600 }, { "epoch": 12.97104323681079, "grad_norm": 4.699492931365967, "learning_rate": 1.757338357794526e-05, "loss": 1.1587, "step": 32700 }, { "epoch": 13.0, "eval_loss": 1.081364631652832, "eval_runtime": 31.6894, "eval_samples_per_second": 47.902, "eval_steps_per_second": 5.996, "step": 32773 }, { "epoch": 13.01071003570012, "grad_norm": 3.7646989822387695, "learning_rate": 1.7474216580721936e-05, "loss": 1.1084, "step": 32800 }, { "epoch": 13.050376834589448, "grad_norm": 4.074378967285156, "learning_rate": 1.737504958349861e-05, "loss": 1.1007, "step": 32900 }, { "epoch": 13.090043633478778, "grad_norm": 4.0714521408081055, "learning_rate": 1.7275882586275287e-05, "loss": 1.1298, "step": 33000 }, { "epoch": 13.129710432368109, "grad_norm": 3.7556121349334717, "learning_rate": 1.7176715589051965e-05, "loss": 1.1407, "step": 33100 }, { "epoch": 13.169377231257437, "grad_norm": 3.3032736778259277, "learning_rate": 1.707754859182864e-05, "loss": 1.1437, "step": 33200 }, { "epoch": 13.209044030146767, "grad_norm": 4.428369522094727, "learning_rate": 1.6978381594605316e-05, "loss": 1.0659, "step": 33300 }, { "epoch": 13.248710829036098, "grad_norm": 3.486649990081787, "learning_rate": 1.687921459738199e-05, "loss": 1.0744, "step": 33400 }, { "epoch": 13.288377627925426, "grad_norm": 4.116626262664795, "learning_rate": 1.678004760015867e-05, "loss": 1.0933, "step": 33500 }, { "epoch": 13.328044426814756, "grad_norm": 5.455049991607666, "learning_rate": 1.6680880602935345e-05, "loss": 1.0387, "step": 33600 }, { "epoch": 13.367711225704086, "grad_norm": 4.454029083251953, "learning_rate": 1.658171360571202e-05, "loss": 1.0488, "step": 33700 }, { "epoch": 13.407378024593415, "grad_norm": 3.605964422225952, "learning_rate": 1.6482546608488695e-05, "loss": 1.1565, "step": 33800 }, { "epoch": 13.447044823482745, "grad_norm": 3.3428781032562256, "learning_rate": 1.638337961126537e-05, "loss": 1.1255, "step": 33900 }, { "epoch": 13.486711622372075, "grad_norm": 5.9332709312438965, "learning_rate": 1.6284212614042046e-05, "loss": 1.0814, "step": 34000 }, { "epoch": 13.526378421261404, "grad_norm": 3.3487417697906494, "learning_rate": 1.618504561681872e-05, "loss": 1.1105, "step": 34100 }, { "epoch": 13.566045220150734, "grad_norm": 3.4275264739990234, "learning_rate": 1.60858786195954e-05, "loss": 1.0292, "step": 34200 }, { "epoch": 13.605712019040064, "grad_norm": 5.602040767669678, "learning_rate": 1.5986711622372075e-05, "loss": 1.0629, "step": 34300 }, { "epoch": 13.645378817929393, "grad_norm": 2.6752493381500244, "learning_rate": 1.588754462514875e-05, "loss": 1.0761, "step": 34400 }, { "epoch": 13.685045616818723, "grad_norm": 3.2931220531463623, "learning_rate": 1.5788377627925426e-05, "loss": 0.9885, "step": 34500 }, { "epoch": 13.724712415708053, "grad_norm": 8.223132133483887, "learning_rate": 1.5689210630702104e-05, "loss": 1.1423, "step": 34600 }, { "epoch": 13.764379214597382, "grad_norm": 4.580158233642578, "learning_rate": 1.559004363347878e-05, "loss": 1.0879, "step": 34700 }, { "epoch": 13.804046013486712, "grad_norm": 3.891131639480591, "learning_rate": 1.5490876636255455e-05, "loss": 1.0819, "step": 34800 }, { "epoch": 13.843712812376042, "grad_norm": 5.4781084060668945, "learning_rate": 1.539170963903213e-05, "loss": 1.1007, "step": 34900 }, { "epoch": 13.88337961126537, "grad_norm": 5.0408220291137695, "learning_rate": 1.529254264180881e-05, "loss": 1.1124, "step": 35000 }, { "epoch": 13.9230464101547, "grad_norm": 4.6583452224731445, "learning_rate": 1.5193375644585484e-05, "loss": 1.1607, "step": 35100 }, { "epoch": 13.962713209044031, "grad_norm": 5.026098251342773, "learning_rate": 1.5094208647362159e-05, "loss": 1.0744, "step": 35200 }, { "epoch": 14.0, "eval_loss": 1.068395733833313, "eval_runtime": 31.6512, "eval_samples_per_second": 47.96, "eval_steps_per_second": 6.003, "step": 35294 }, { "epoch": 14.00238000793336, "grad_norm": 2.9335262775421143, "learning_rate": 1.4995041650138836e-05, "loss": 1.0841, "step": 35300 }, { "epoch": 14.04204680682269, "grad_norm": 4.208588123321533, "learning_rate": 1.489587465291551e-05, "loss": 1.0901, "step": 35400 }, { "epoch": 14.08171360571202, "grad_norm": 5.132387638092041, "learning_rate": 1.4796707655692185e-05, "loss": 1.1201, "step": 35500 }, { "epoch": 14.121380404601348, "grad_norm": 3.9229278564453125, "learning_rate": 1.4697540658468862e-05, "loss": 1.0782, "step": 35600 }, { "epoch": 14.161047203490678, "grad_norm": 6.1097259521484375, "learning_rate": 1.4598373661245537e-05, "loss": 1.1051, "step": 35700 }, { "epoch": 14.200714002380009, "grad_norm": 4.1445417404174805, "learning_rate": 1.4499206664022214e-05, "loss": 1.1283, "step": 35800 }, { "epoch": 14.240380801269337, "grad_norm": 3.5986008644104004, "learning_rate": 1.440003966679889e-05, "loss": 1.0453, "step": 35900 }, { "epoch": 14.280047600158667, "grad_norm": 3.8175106048583984, "learning_rate": 1.4300872669575566e-05, "loss": 1.0585, "step": 36000 }, { "epoch": 14.319714399047998, "grad_norm": 2.821758985519409, "learning_rate": 1.4201705672352241e-05, "loss": 1.06, "step": 36100 }, { "epoch": 14.359381197937326, "grad_norm": 3.65065860748291, "learning_rate": 1.4102538675128918e-05, "loss": 1.1064, "step": 36200 }, { "epoch": 14.399047996826656, "grad_norm": 5.7176713943481445, "learning_rate": 1.4003371677905594e-05, "loss": 1.008, "step": 36300 }, { "epoch": 14.438714795715986, "grad_norm": 5.075132846832275, "learning_rate": 1.390420468068227e-05, "loss": 1.114, "step": 36400 }, { "epoch": 14.478381594605315, "grad_norm": 5.210816860198975, "learning_rate": 1.3805037683458946e-05, "loss": 1.0944, "step": 36500 }, { "epoch": 14.518048393494645, "grad_norm": 4.964089870452881, "learning_rate": 1.3705870686235623e-05, "loss": 1.0904, "step": 36600 }, { "epoch": 14.557715192383975, "grad_norm": 3.131520986557007, "learning_rate": 1.3606703689012298e-05, "loss": 1.063, "step": 36700 }, { "epoch": 14.597381991273304, "grad_norm": 6.203433036804199, "learning_rate": 1.3507536691788975e-05, "loss": 1.0885, "step": 36800 }, { "epoch": 14.637048790162634, "grad_norm": 2.8487484455108643, "learning_rate": 1.3408369694565648e-05, "loss": 1.0785, "step": 36900 }, { "epoch": 14.676715589051964, "grad_norm": 3.4533579349517822, "learning_rate": 1.3309202697342324e-05, "loss": 1.0956, "step": 37000 }, { "epoch": 14.716382387941293, "grad_norm": 5.409042835235596, "learning_rate": 1.3210035700119e-05, "loss": 1.0635, "step": 37100 }, { "epoch": 14.756049186830623, "grad_norm": 4.514674186706543, "learning_rate": 1.3110868702895676e-05, "loss": 1.0829, "step": 37200 }, { "epoch": 14.795715985719953, "grad_norm": 4.7005791664123535, "learning_rate": 1.3011701705672353e-05, "loss": 1.0003, "step": 37300 }, { "epoch": 14.835382784609282, "grad_norm": 4.253646373748779, "learning_rate": 1.2912534708449028e-05, "loss": 1.0562, "step": 37400 }, { "epoch": 14.875049583498612, "grad_norm": 4.305023193359375, "learning_rate": 1.2813367711225705e-05, "loss": 1.0712, "step": 37500 }, { "epoch": 14.914716382387942, "grad_norm": 4.189399719238281, "learning_rate": 1.271420071400238e-05, "loss": 1.0761, "step": 37600 }, { "epoch": 14.95438318127727, "grad_norm": 3.2512216567993164, "learning_rate": 1.2615033716779057e-05, "loss": 1.0336, "step": 37700 }, { "epoch": 14.9940499801666, "grad_norm": 3.3554651737213135, "learning_rate": 1.2515866719555732e-05, "loss": 1.0636, "step": 37800 }, { "epoch": 15.0, "eval_loss": 1.051405906677246, "eval_runtime": 31.6428, "eval_samples_per_second": 47.973, "eval_steps_per_second": 6.005, "step": 37815 }, { "epoch": 15.033716779055931, "grad_norm": 3.6472902297973633, "learning_rate": 1.241669972233241e-05, "loss": 1.0596, "step": 37900 }, { "epoch": 15.07338357794526, "grad_norm": 5.338723659515381, "learning_rate": 1.2317532725109085e-05, "loss": 1.0462, "step": 38000 }, { "epoch": 15.11305037683459, "grad_norm": 4.401419639587402, "learning_rate": 1.221836572788576e-05, "loss": 1.0869, "step": 38100 }, { "epoch": 15.15271717572392, "grad_norm": 9.426093101501465, "learning_rate": 1.2119198730662435e-05, "loss": 1.0198, "step": 38200 }, { "epoch": 15.192383974613248, "grad_norm": 3.7169394493103027, "learning_rate": 1.2020031733439112e-05, "loss": 1.1285, "step": 38300 }, { "epoch": 15.232050773502579, "grad_norm": 3.466498851776123, "learning_rate": 1.1920864736215787e-05, "loss": 1.0125, "step": 38400 }, { "epoch": 15.271717572391909, "grad_norm": 2.7933382987976074, "learning_rate": 1.1821697738992464e-05, "loss": 1.0545, "step": 38500 }, { "epoch": 15.311384371281237, "grad_norm": 2.926934003829956, "learning_rate": 1.172253074176914e-05, "loss": 1.1035, "step": 38600 }, { "epoch": 15.351051170170567, "grad_norm": 3.2757022380828857, "learning_rate": 1.1623363744545816e-05, "loss": 1.0479, "step": 38700 }, { "epoch": 15.390717969059898, "grad_norm": 4.160761833190918, "learning_rate": 1.1524196747322492e-05, "loss": 1.064, "step": 38800 }, { "epoch": 15.430384767949226, "grad_norm": 3.412480592727661, "learning_rate": 1.1425029750099167e-05, "loss": 0.9485, "step": 38900 }, { "epoch": 15.470051566838556, "grad_norm": 3.1907808780670166, "learning_rate": 1.1325862752875844e-05, "loss": 1.0605, "step": 39000 }, { "epoch": 15.509718365727887, "grad_norm": 4.184901714324951, "learning_rate": 1.1226695755652519e-05, "loss": 1.0551, "step": 39100 }, { "epoch": 15.549385164617215, "grad_norm": 4.784205436706543, "learning_rate": 1.1127528758429196e-05, "loss": 0.9941, "step": 39200 }, { "epoch": 15.589051963506545, "grad_norm": 4.00923490524292, "learning_rate": 1.1028361761205871e-05, "loss": 1.076, "step": 39300 }, { "epoch": 15.628718762395875, "grad_norm": 4.559725284576416, "learning_rate": 1.0929194763982548e-05, "loss": 0.9979, "step": 39400 }, { "epoch": 15.668385561285204, "grad_norm": 3.8985109329223633, "learning_rate": 1.0830027766759223e-05, "loss": 1.0397, "step": 39500 }, { "epoch": 15.708052360174534, "grad_norm": 3.3521323204040527, "learning_rate": 1.0730860769535899e-05, "loss": 1.006, "step": 39600 }, { "epoch": 15.747719159063864, "grad_norm": 3.2745351791381836, "learning_rate": 1.0631693772312574e-05, "loss": 1.0642, "step": 39700 }, { "epoch": 15.787385957953193, "grad_norm": 3.955242156982422, "learning_rate": 1.053252677508925e-05, "loss": 1.0455, "step": 39800 }, { "epoch": 15.827052756842523, "grad_norm": 3.2223598957061768, "learning_rate": 1.0433359777865926e-05, "loss": 1.0675, "step": 39900 }, { "epoch": 15.866719555731853, "grad_norm": 4.809605121612549, "learning_rate": 1.0334192780642603e-05, "loss": 1.0992, "step": 40000 }, { "epoch": 15.906386354621182, "grad_norm": 2.6435019969940186, "learning_rate": 1.0235025783419278e-05, "loss": 0.9905, "step": 40100 }, { "epoch": 15.946053153510512, "grad_norm": 6.68290376663208, "learning_rate": 1.0135858786195955e-05, "loss": 1.0951, "step": 40200 }, { "epoch": 15.985719952399842, "grad_norm": 2.6426591873168945, "learning_rate": 1.003669178897263e-05, "loss": 1.073, "step": 40300 }, { "epoch": 16.0, "eval_loss": 1.039953351020813, "eval_runtime": 31.6995, "eval_samples_per_second": 47.887, "eval_steps_per_second": 5.994, "step": 40336 }, { "epoch": 16.025386751289172, "grad_norm": 3.456146001815796, "learning_rate": 9.937524791749306e-06, "loss": 1.0191, "step": 40400 }, { "epoch": 16.0650535501785, "grad_norm": 5.939918518066406, "learning_rate": 9.838357794525983e-06, "loss": 1.0433, "step": 40500 }, { "epoch": 16.10472034906783, "grad_norm": 3.538282871246338, "learning_rate": 9.739190797302658e-06, "loss": 1.0295, "step": 40600 }, { "epoch": 16.14438714795716, "grad_norm": 4.2307844161987305, "learning_rate": 9.640023800079335e-06, "loss": 1.0371, "step": 40700 }, { "epoch": 16.18405394684649, "grad_norm": 4.40711784362793, "learning_rate": 9.54085680285601e-06, "loss": 1.0236, "step": 40800 }, { "epoch": 16.223720745735818, "grad_norm": 3.8492507934570312, "learning_rate": 9.441689805632687e-06, "loss": 1.0628, "step": 40900 }, { "epoch": 16.26338754462515, "grad_norm": 4.397724628448486, "learning_rate": 9.342522808409362e-06, "loss": 1.0072, "step": 41000 }, { "epoch": 16.30305434351448, "grad_norm": 3.3145904541015625, "learning_rate": 9.243355811186037e-06, "loss": 1.045, "step": 41100 }, { "epoch": 16.342721142403807, "grad_norm": 5.359413146972656, "learning_rate": 9.144188813962713e-06, "loss": 1.0299, "step": 41200 }, { "epoch": 16.38238794129314, "grad_norm": 3.4849679470062256, "learning_rate": 9.04502181673939e-06, "loss": 1.012, "step": 41300 }, { "epoch": 16.422054740182467, "grad_norm": 2.9378600120544434, "learning_rate": 8.945854819516065e-06, "loss": 1.0269, "step": 41400 }, { "epoch": 16.461721539071796, "grad_norm": 3.024475574493408, "learning_rate": 8.846687822292742e-06, "loss": 1.0373, "step": 41500 }, { "epoch": 16.501388337961128, "grad_norm": 3.2381701469421387, "learning_rate": 8.747520825069417e-06, "loss": 0.9888, "step": 41600 }, { "epoch": 16.541055136850456, "grad_norm": 3.816202163696289, "learning_rate": 8.648353827846094e-06, "loss": 0.9384, "step": 41700 }, { "epoch": 16.580721935739785, "grad_norm": 4.290541648864746, "learning_rate": 8.54918683062277e-06, "loss": 1.0653, "step": 41800 }, { "epoch": 16.620388734629117, "grad_norm": 4.712522029876709, "learning_rate": 8.450019833399444e-06, "loss": 0.9951, "step": 41900 }, { "epoch": 16.660055533518445, "grad_norm": 3.3500611782073975, "learning_rate": 8.350852836176121e-06, "loss": 1.0356, "step": 42000 }, { "epoch": 16.699722332407774, "grad_norm": 3.6570308208465576, "learning_rate": 8.251685838952797e-06, "loss": 1.0205, "step": 42100 }, { "epoch": 16.739389131297106, "grad_norm": 3.4734184741973877, "learning_rate": 8.152518841729474e-06, "loss": 1.037, "step": 42200 }, { "epoch": 16.779055930186434, "grad_norm": 3.528817653656006, "learning_rate": 8.053351844506149e-06, "loss": 0.9402, "step": 42300 }, { "epoch": 16.818722729075763, "grad_norm": 4.3084025382995605, "learning_rate": 7.954184847282826e-06, "loss": 1.0702, "step": 42400 }, { "epoch": 16.858389527965095, "grad_norm": 3.520242214202881, "learning_rate": 7.855017850059501e-06, "loss": 1.0474, "step": 42500 }, { "epoch": 16.898056326854423, "grad_norm": 4.44198751449585, "learning_rate": 7.755850852836176e-06, "loss": 1.0506, "step": 42600 }, { "epoch": 16.93772312574375, "grad_norm": 2.8113813400268555, "learning_rate": 7.656683855612852e-06, "loss": 1.0167, "step": 42700 }, { "epoch": 16.977389924633083, "grad_norm": 3.3131535053253174, "learning_rate": 7.5575168583895284e-06, "loss": 1.0077, "step": 42800 }, { "epoch": 17.0, "eval_loss": 1.0288244485855103, "eval_runtime": 31.6731, "eval_samples_per_second": 47.927, "eval_steps_per_second": 5.999, "step": 42857 }, { "epoch": 17.017056723522412, "grad_norm": 5.444199562072754, "learning_rate": 7.4583498611662045e-06, "loss": 0.9996, "step": 42900 }, { "epoch": 17.05672352241174, "grad_norm": 4.1272783279418945, "learning_rate": 7.359182863942881e-06, "loss": 1.0256, "step": 43000 }, { "epoch": 17.096390321301072, "grad_norm": 4.819570064544678, "learning_rate": 7.260015866719557e-06, "loss": 1.0325, "step": 43100 }, { "epoch": 17.1360571201904, "grad_norm": 4.795453071594238, "learning_rate": 7.160848869496233e-06, "loss": 0.9845, "step": 43200 }, { "epoch": 17.17572391907973, "grad_norm": 5.2741827964782715, "learning_rate": 7.061681872272907e-06, "loss": 1.0406, "step": 43300 }, { "epoch": 17.21539071796906, "grad_norm": 5.457202911376953, "learning_rate": 6.962514875049583e-06, "loss": 1.0704, "step": 43400 }, { "epoch": 17.25505751685839, "grad_norm": 6.256078243255615, "learning_rate": 6.863347877826259e-06, "loss": 1.0182, "step": 43500 }, { "epoch": 17.294724315747718, "grad_norm": 3.9407060146331787, "learning_rate": 6.7641808806029355e-06, "loss": 0.9889, "step": 43600 }, { "epoch": 17.33439111463705, "grad_norm": 3.250436782836914, "learning_rate": 6.6650138833796116e-06, "loss": 1.0079, "step": 43700 }, { "epoch": 17.37405791352638, "grad_norm": 2.7779972553253174, "learning_rate": 6.565846886156288e-06, "loss": 1.0134, "step": 43800 }, { "epoch": 17.413724712415707, "grad_norm": 4.296668529510498, "learning_rate": 6.466679888932964e-06, "loss": 0.9585, "step": 43900 }, { "epoch": 17.45339151130504, "grad_norm": 3.737541437149048, "learning_rate": 6.36751289170964e-06, "loss": 1.0307, "step": 44000 }, { "epoch": 17.493058310194368, "grad_norm": 5.0776848793029785, "learning_rate": 6.268345894486314e-06, "loss": 1.0395, "step": 44100 }, { "epoch": 17.532725109083696, "grad_norm": 6.334095001220703, "learning_rate": 6.169178897262991e-06, "loss": 0.9772, "step": 44200 }, { "epoch": 17.572391907973028, "grad_norm": 5.443525314331055, "learning_rate": 6.070011900039667e-06, "loss": 0.9264, "step": 44300 }, { "epoch": 17.612058706862356, "grad_norm": 4.61970853805542, "learning_rate": 5.970844902816343e-06, "loss": 1.0307, "step": 44400 }, { "epoch": 17.651725505751685, "grad_norm": 3.089509963989258, "learning_rate": 5.8716779055930195e-06, "loss": 0.9633, "step": 44500 }, { "epoch": 17.691392304641017, "grad_norm": 4.635293006896973, "learning_rate": 5.7725109083696955e-06, "loss": 1.0537, "step": 44600 }, { "epoch": 17.731059103530345, "grad_norm": 3.052475929260254, "learning_rate": 5.673343911146371e-06, "loss": 0.9983, "step": 44700 }, { "epoch": 17.770725902419674, "grad_norm": 3.9765052795410156, "learning_rate": 5.574176913923047e-06, "loss": 1.0687, "step": 44800 }, { "epoch": 17.810392701309006, "grad_norm": 4.3488030433654785, "learning_rate": 5.475009916699723e-06, "loss": 1.012, "step": 44900 }, { "epoch": 17.850059500198334, "grad_norm": 3.6032917499542236, "learning_rate": 5.375842919476398e-06, "loss": 0.9933, "step": 45000 }, { "epoch": 17.889726299087663, "grad_norm": 3.2621772289276123, "learning_rate": 5.276675922253074e-06, "loss": 0.9657, "step": 45100 }, { "epoch": 17.929393097976995, "grad_norm": 3.9976959228515625, "learning_rate": 5.17750892502975e-06, "loss": 0.9799, "step": 45200 }, { "epoch": 17.969059896866323, "grad_norm": 4.725791931152344, "learning_rate": 5.0783419278064265e-06, "loss": 1.043, "step": 45300 }, { "epoch": 18.0, "eval_loss": 1.0184741020202637, "eval_runtime": 31.6571, "eval_samples_per_second": 47.951, "eval_steps_per_second": 6.002, "step": 45378 }, { "epoch": 18.00872669575565, "grad_norm": 3.418588876724243, "learning_rate": 4.979174930583102e-06, "loss": 0.9965, "step": 45400 }, { "epoch": 18.048393494644984, "grad_norm": 4.835160255432129, "learning_rate": 4.880007933359778e-06, "loss": 0.9789, "step": 45500 }, { "epoch": 18.088060293534312, "grad_norm": 4.275815486907959, "learning_rate": 4.780840936136454e-06, "loss": 1.0233, "step": 45600 }, { "epoch": 18.12772709242364, "grad_norm": 4.429009914398193, "learning_rate": 4.68167393891313e-06, "loss": 1.0487, "step": 45700 }, { "epoch": 18.167393891312972, "grad_norm": 4.390066146850586, "learning_rate": 4.582506941689805e-06, "loss": 0.968, "step": 45800 }, { "epoch": 18.2070606902023, "grad_norm": 3.265092372894287, "learning_rate": 4.483339944466481e-06, "loss": 1.0171, "step": 45900 }, { "epoch": 18.24672748909163, "grad_norm": 4.843317031860352, "learning_rate": 4.3841729472431574e-06, "loss": 1.0204, "step": 46000 }, { "epoch": 18.28639428798096, "grad_norm": 4.457988262176514, "learning_rate": 4.2850059500198335e-06, "loss": 1.0246, "step": 46100 }, { "epoch": 18.32606108687029, "grad_norm": 3.9527127742767334, "learning_rate": 4.18583895279651e-06, "loss": 0.9241, "step": 46200 }, { "epoch": 18.36572788575962, "grad_norm": 3.7694692611694336, "learning_rate": 4.086671955573186e-06, "loss": 1.0318, "step": 46300 }, { "epoch": 18.40539468464895, "grad_norm": 5.390737533569336, "learning_rate": 3.987504958349862e-06, "loss": 0.9938, "step": 46400 }, { "epoch": 18.44506148353828, "grad_norm": 3.8084776401519775, "learning_rate": 3.888337961126538e-06, "loss": 0.9652, "step": 46500 }, { "epoch": 18.484728282427607, "grad_norm": 3.5767834186553955, "learning_rate": 3.789170963903213e-06, "loss": 0.9582, "step": 46600 }, { "epoch": 18.52439508131694, "grad_norm": 3.4777605533599854, "learning_rate": 3.6900039666798892e-06, "loss": 0.9981, "step": 46700 }, { "epoch": 18.564061880206268, "grad_norm": 4.1490092277526855, "learning_rate": 3.5908369694565653e-06, "loss": 0.9607, "step": 46800 }, { "epoch": 18.603728679095596, "grad_norm": 4.089176654815674, "learning_rate": 3.4916699722332406e-06, "loss": 1.0168, "step": 46900 }, { "epoch": 18.643395477984928, "grad_norm": 3.9602725505828857, "learning_rate": 3.3925029750099167e-06, "loss": 0.9785, "step": 47000 }, { "epoch": 18.683062276874256, "grad_norm": 4.800217628479004, "learning_rate": 3.2933359777865927e-06, "loss": 1.0514, "step": 47100 }, { "epoch": 18.722729075763585, "grad_norm": 4.848387718200684, "learning_rate": 3.194168980563269e-06, "loss": 0.9798, "step": 47200 }, { "epoch": 18.762395874652917, "grad_norm": 3.5444610118865967, "learning_rate": 3.0950019833399445e-06, "loss": 1.0602, "step": 47300 }, { "epoch": 18.802062673542245, "grad_norm": 3.4162533283233643, "learning_rate": 2.9958349861166206e-06, "loss": 0.9881, "step": 47400 }, { "epoch": 18.841729472431574, "grad_norm": 4.719314098358154, "learning_rate": 2.8966679888932967e-06, "loss": 0.9503, "step": 47500 }, { "epoch": 18.881396271320906, "grad_norm": 5.332608222961426, "learning_rate": 2.7975009916699724e-06, "loss": 1.0245, "step": 47600 }, { "epoch": 18.921063070210234, "grad_norm": 5.230047702789307, "learning_rate": 2.6983339944466484e-06, "loss": 0.9947, "step": 47700 }, { "epoch": 18.960729869099563, "grad_norm": 3.1582813262939453, "learning_rate": 2.599166997223324e-06, "loss": 1.0198, "step": 47800 }, { "epoch": 19.0, "eval_loss": 1.017343521118164, "eval_runtime": 31.7039, "eval_samples_per_second": 47.881, "eval_steps_per_second": 5.993, "step": 47899 }, { "epoch": 19.000396667988895, "grad_norm": 5.45066499710083, "learning_rate": 2.5e-06, "loss": 0.9753, "step": 47900 }, { "epoch": 19.040063466878223, "grad_norm": 3.2004072666168213, "learning_rate": 2.400833002776676e-06, "loss": 0.946, "step": 48000 }, { "epoch": 19.07973026576755, "grad_norm": 3.971540689468384, "learning_rate": 2.301666005553352e-06, "loss": 0.9783, "step": 48100 }, { "epoch": 19.119397064656884, "grad_norm": 4.348784923553467, "learning_rate": 2.202499008330028e-06, "loss": 0.952, "step": 48200 }, { "epoch": 19.159063863546212, "grad_norm": 3.7044036388397217, "learning_rate": 2.1033320111067037e-06, "loss": 1.0662, "step": 48300 }, { "epoch": 19.19873066243554, "grad_norm": 2.662105083465576, "learning_rate": 2.00416501388338e-06, "loss": 0.9647, "step": 48400 }, { "epoch": 19.238397461324872, "grad_norm": 4.103559494018555, "learning_rate": 1.9049980166600555e-06, "loss": 0.9142, "step": 48500 }, { "epoch": 19.2780642602142, "grad_norm": 2.8791961669921875, "learning_rate": 1.8058310194367316e-06, "loss": 1.0167, "step": 48600 }, { "epoch": 19.31773105910353, "grad_norm": 2.689680576324463, "learning_rate": 1.7066640222134072e-06, "loss": 1.0012, "step": 48700 }, { "epoch": 19.35739785799286, "grad_norm": 3.3067831993103027, "learning_rate": 1.6074970249900833e-06, "loss": 0.967, "step": 48800 }, { "epoch": 19.39706465688219, "grad_norm": 3.9777708053588867, "learning_rate": 1.5083300277667594e-06, "loss": 0.9081, "step": 48900 }, { "epoch": 19.43673145577152, "grad_norm": 3.582973003387451, "learning_rate": 1.4091630305434353e-06, "loss": 1.0104, "step": 49000 }, { "epoch": 19.47639825466085, "grad_norm": 5.202731132507324, "learning_rate": 1.309996033320111e-06, "loss": 0.9648, "step": 49100 }, { "epoch": 19.51606505355018, "grad_norm": 3.264211893081665, "learning_rate": 1.210829036096787e-06, "loss": 0.9599, "step": 49200 }, { "epoch": 19.555731852439507, "grad_norm": 4.432053565979004, "learning_rate": 1.111662038873463e-06, "loss": 0.9935, "step": 49300 }, { "epoch": 19.59539865132884, "grad_norm": 3.386671781539917, "learning_rate": 1.0124950416501388e-06, "loss": 1.002, "step": 49400 }, { "epoch": 19.635065450218168, "grad_norm": 4.273075103759766, "learning_rate": 9.133280444268148e-07, "loss": 1.0225, "step": 49500 }, { "epoch": 19.674732249107496, "grad_norm": 3.5673136711120605, "learning_rate": 8.141610472034907e-07, "loss": 1.0149, "step": 49600 }, { "epoch": 19.714399047996828, "grad_norm": 3.68278431892395, "learning_rate": 7.149940499801666e-07, "loss": 0.996, "step": 49700 }, { "epoch": 19.754065846886157, "grad_norm": 4.8836870193481445, "learning_rate": 6.158270527568425e-07, "loss": 1.0097, "step": 49800 }, { "epoch": 19.793732645775485, "grad_norm": 3.579880475997925, "learning_rate": 5.166600555335184e-07, "loss": 0.9482, "step": 49900 }, { "epoch": 19.833399444664817, "grad_norm": 2.7329444885253906, "learning_rate": 4.174930583101944e-07, "loss": 1.0365, "step": 50000 }, { "epoch": 19.873066243554145, "grad_norm": 5.478430271148682, "learning_rate": 3.1832606108687035e-07, "loss": 1.0543, "step": 50100 }, { "epoch": 19.912733042443474, "grad_norm": 3.1377158164978027, "learning_rate": 2.191590638635462e-07, "loss": 0.9637, "step": 50200 }, { "epoch": 19.952399841332806, "grad_norm": 3.789954662322998, "learning_rate": 1.1999206664022213e-07, "loss": 0.9847, "step": 50300 }, { "epoch": 19.992066640222134, "grad_norm": 4.29661226272583, "learning_rate": 2.0825069416898058e-08, "loss": 1.0306, "step": 50400 }, { "epoch": 20.0, "eval_loss": 1.0158944129943848, "eval_runtime": 31.6451, "eval_samples_per_second": 47.97, "eval_steps_per_second": 6.004, "step": 50420 } ], "logging_steps": 100, "max_steps": 50420, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.902420484390912e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }