| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 20.0, | |
| "eval_steps": 500, | |
| "global_step": 50420, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.039666798889329634, | |
| "grad_norm": 46.122135162353516, | |
| "learning_rate": 4.990182467274891e-05, | |
| "loss": 11.8425, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.07933359777865927, | |
| "grad_norm": 23.861114501953125, | |
| "learning_rate": 4.9802657675525586e-05, | |
| "loss": 9.0419, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1190003966679889, | |
| "grad_norm": 11.361534118652344, | |
| "learning_rate": 4.970349067830226e-05, | |
| "loss": 7.4942, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.15866719555731854, | |
| "grad_norm": 9.868680000305176, | |
| "learning_rate": 4.960432368107894e-05, | |
| "loss": 6.5606, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.19833399444664815, | |
| "grad_norm": 19.773534774780273, | |
| "learning_rate": 4.9505156683855616e-05, | |
| "loss": 6.0234, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2380007933359778, | |
| "grad_norm": 6.297336101531982, | |
| "learning_rate": 4.9405989686632294e-05, | |
| "loss": 5.7502, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.2776675922253074, | |
| "grad_norm": 6.993984222412109, | |
| "learning_rate": 4.9306822689408966e-05, | |
| "loss": 5.4746, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.31733439111463707, | |
| "grad_norm": 5.947575569152832, | |
| "learning_rate": 4.9207655692185645e-05, | |
| "loss": 5.2263, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.3570011900039667, | |
| "grad_norm": 6.459949493408203, | |
| "learning_rate": 4.9108488694962317e-05, | |
| "loss": 5.071, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.3966679888932963, | |
| "grad_norm": 6.12597131729126, | |
| "learning_rate": 4.9009321697738995e-05, | |
| "loss": 4.9277, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.43633478778262597, | |
| "grad_norm": 7.8361406326293945, | |
| "learning_rate": 4.891015470051567e-05, | |
| "loss": 4.7178, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.4760015866719556, | |
| "grad_norm": 8.883167266845703, | |
| "learning_rate": 4.881098770329235e-05, | |
| "loss": 4.5832, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.5156683855612852, | |
| "grad_norm": 5.529339790344238, | |
| "learning_rate": 4.8711820706069024e-05, | |
| "loss": 4.5019, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.5553351844506148, | |
| "grad_norm": 5.6431121826171875, | |
| "learning_rate": 4.86126537088457e-05, | |
| "loss": 4.4477, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.5950019833399445, | |
| "grad_norm": 5.4094367027282715, | |
| "learning_rate": 4.8513486711622375e-05, | |
| "loss": 4.317, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6346687822292741, | |
| "grad_norm": 4.631023406982422, | |
| "learning_rate": 4.841431971439905e-05, | |
| "loss": 4.2006, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.6743355811186037, | |
| "grad_norm": 5.530189037322998, | |
| "learning_rate": 4.8315152717175725e-05, | |
| "loss": 4.0726, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.7140023800079334, | |
| "grad_norm": 6.281075477600098, | |
| "learning_rate": 4.82159857199524e-05, | |
| "loss": 4.005, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.753669178897263, | |
| "grad_norm": 8.573356628417969, | |
| "learning_rate": 4.8116818722729076e-05, | |
| "loss": 3.8903, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.7933359777865926, | |
| "grad_norm": 7.195920467376709, | |
| "learning_rate": 4.8017651725505754e-05, | |
| "loss": 3.8038, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8330027766759223, | |
| "grad_norm": 6.207021236419678, | |
| "learning_rate": 4.791848472828243e-05, | |
| "loss": 3.775, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.8726695755652519, | |
| "grad_norm": 6.628279685974121, | |
| "learning_rate": 4.7819317731059105e-05, | |
| "loss": 3.711, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.9123363744545815, | |
| "grad_norm": 5.220765590667725, | |
| "learning_rate": 4.7720150733835784e-05, | |
| "loss": 3.6694, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.9520031733439112, | |
| "grad_norm": 4.995284557342529, | |
| "learning_rate": 4.7620983736612455e-05, | |
| "loss": 3.5359, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.9916699722332408, | |
| "grad_norm": 5.370284557342529, | |
| "learning_rate": 4.7521816739389134e-05, | |
| "loss": 3.5537, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 2.6742756366729736, | |
| "eval_runtime": 33.2175, | |
| "eval_samples_per_second": 45.699, | |
| "eval_steps_per_second": 5.72, | |
| "step": 2521 | |
| }, | |
| { | |
| "epoch": 1.0313367711225705, | |
| "grad_norm": 6.217808246612549, | |
| "learning_rate": 4.7422649742165806e-05, | |
| "loss": 3.4208, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.0710035700119, | |
| "grad_norm": 5.972238540649414, | |
| "learning_rate": 4.7323482744942484e-05, | |
| "loss": 3.3766, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.1106703689012296, | |
| "grad_norm": 6.439736366271973, | |
| "learning_rate": 4.722431574771916e-05, | |
| "loss": 3.2706, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.1503371677905594, | |
| "grad_norm": 4.722689151763916, | |
| "learning_rate": 4.712514875049584e-05, | |
| "loss": 3.1785, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.190003966679889, | |
| "grad_norm": 4.344363689422607, | |
| "learning_rate": 4.7025981753272514e-05, | |
| "loss": 3.2959, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.2296707655692185, | |
| "grad_norm": 5.724086284637451, | |
| "learning_rate": 4.6926814756049185e-05, | |
| "loss": 3.1135, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.269337564458548, | |
| "grad_norm": 5.9762163162231445, | |
| "learning_rate": 4.6827647758825864e-05, | |
| "loss": 3.2194, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.3090043633478778, | |
| "grad_norm": 5.490255355834961, | |
| "learning_rate": 4.6728480761602536e-05, | |
| "loss": 3.1226, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.3486711622372074, | |
| "grad_norm": 5.34173583984375, | |
| "learning_rate": 4.6629313764379215e-05, | |
| "loss": 3.0641, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.388337961126537, | |
| "grad_norm": 6.500023365020752, | |
| "learning_rate": 4.653014676715589e-05, | |
| "loss": 3.0342, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.4280047600158667, | |
| "grad_norm": 4.705812931060791, | |
| "learning_rate": 4.643097976993257e-05, | |
| "loss": 2.9638, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.4676715589051963, | |
| "grad_norm": 5.796449661254883, | |
| "learning_rate": 4.6331812772709244e-05, | |
| "loss": 2.9635, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.5073383577945259, | |
| "grad_norm": 5.73616886138916, | |
| "learning_rate": 4.623264577548592e-05, | |
| "loss": 2.9933, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.5470051566838556, | |
| "grad_norm": 5.073670864105225, | |
| "learning_rate": 4.6133478778262594e-05, | |
| "loss": 2.9872, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.5866719555731852, | |
| "grad_norm": 5.04343318939209, | |
| "learning_rate": 4.603431178103927e-05, | |
| "loss": 2.9624, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.6263387544625147, | |
| "grad_norm": 4.266116619110107, | |
| "learning_rate": 4.5935144783815945e-05, | |
| "loss": 2.9711, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.6660055533518445, | |
| "grad_norm": 4.732306957244873, | |
| "learning_rate": 4.583597778659262e-05, | |
| "loss": 2.8238, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.705672352241174, | |
| "grad_norm": 5.156635284423828, | |
| "learning_rate": 4.57368107893693e-05, | |
| "loss": 2.839, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.7453391511305036, | |
| "grad_norm": 6.178804874420166, | |
| "learning_rate": 4.563764379214598e-05, | |
| "loss": 2.7441, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.7850059500198334, | |
| "grad_norm": 6.307518482208252, | |
| "learning_rate": 4.553847679492265e-05, | |
| "loss": 2.7271, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.824672748909163, | |
| "grad_norm": 4.5322136878967285, | |
| "learning_rate": 4.5439309797699324e-05, | |
| "loss": 2.6839, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.8643395477984925, | |
| "grad_norm": 4.728321552276611, | |
| "learning_rate": 4.5340142800476e-05, | |
| "loss": 2.815, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.9040063466878223, | |
| "grad_norm": 5.051918029785156, | |
| "learning_rate": 4.5240975803252675e-05, | |
| "loss": 2.6735, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.9436731455771519, | |
| "grad_norm": 4.968688011169434, | |
| "learning_rate": 4.5141808806029353e-05, | |
| "loss": 2.6604, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.9833399444664814, | |
| "grad_norm": 4.792623996734619, | |
| "learning_rate": 4.504264180880603e-05, | |
| "loss": 2.6375, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 2.0515847206115723, | |
| "eval_runtime": 33.0931, | |
| "eval_samples_per_second": 45.871, | |
| "eval_steps_per_second": 5.741, | |
| "step": 5042 | |
| }, | |
| { | |
| "epoch": 2.023006743355811, | |
| "grad_norm": 7.228871822357178, | |
| "learning_rate": 4.494347481158271e-05, | |
| "loss": 2.5813, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.062673542245141, | |
| "grad_norm": 4.44078254699707, | |
| "learning_rate": 4.484430781435938e-05, | |
| "loss": 2.5798, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.1023403411344703, | |
| "grad_norm": 5.475325107574463, | |
| "learning_rate": 4.474514081713606e-05, | |
| "loss": 2.5297, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.1420071400238, | |
| "grad_norm": 4.271339416503906, | |
| "learning_rate": 4.464597381991273e-05, | |
| "loss": 2.5595, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.18167393891313, | |
| "grad_norm": 3.9716315269470215, | |
| "learning_rate": 4.454680682268941e-05, | |
| "loss": 2.5629, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.221340737802459, | |
| "grad_norm": 5.6469807624816895, | |
| "learning_rate": 4.4447639825466084e-05, | |
| "loss": 2.4691, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.261007536691789, | |
| "grad_norm": 4.760526657104492, | |
| "learning_rate": 4.434847282824276e-05, | |
| "loss": 2.606, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 2.300674335581119, | |
| "grad_norm": 5.259726047515869, | |
| "learning_rate": 4.424930583101944e-05, | |
| "loss": 2.4984, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.340341134470448, | |
| "grad_norm": 4.372512340545654, | |
| "learning_rate": 4.415013883379612e-05, | |
| "loss": 2.4104, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.380007933359778, | |
| "grad_norm": 5.21671724319458, | |
| "learning_rate": 4.405097183657279e-05, | |
| "loss": 2.4612, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.4196747322491077, | |
| "grad_norm": 4.706778049468994, | |
| "learning_rate": 4.395180483934946e-05, | |
| "loss": 2.3845, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.459341531138437, | |
| "grad_norm": 4.4265217781066895, | |
| "learning_rate": 4.385263784212614e-05, | |
| "loss": 2.4508, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.499008330027767, | |
| "grad_norm": 112.53572082519531, | |
| "learning_rate": 4.3753470844902814e-05, | |
| "loss": 2.3788, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.538675128917096, | |
| "grad_norm": 5.193419933319092, | |
| "learning_rate": 4.365430384767949e-05, | |
| "loss": 2.3999, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.578341927806426, | |
| "grad_norm": 4.786646842956543, | |
| "learning_rate": 4.355513685045617e-05, | |
| "loss": 2.3964, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.6180087266957557, | |
| "grad_norm": 4.764982223510742, | |
| "learning_rate": 4.345596985323285e-05, | |
| "loss": 2.2939, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.657675525585085, | |
| "grad_norm": 8.752727508544922, | |
| "learning_rate": 4.335680285600952e-05, | |
| "loss": 2.2859, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.697342324474415, | |
| "grad_norm": 5.419288158416748, | |
| "learning_rate": 4.32576358587862e-05, | |
| "loss": 2.3073, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.7370091233637446, | |
| "grad_norm": 3.573631763458252, | |
| "learning_rate": 4.315846886156287e-05, | |
| "loss": 2.1833, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.776675922253074, | |
| "grad_norm": 5.297525882720947, | |
| "learning_rate": 4.305930186433955e-05, | |
| "loss": 2.4245, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.8163427211424037, | |
| "grad_norm": 4.3615827560424805, | |
| "learning_rate": 4.296013486711622e-05, | |
| "loss": 2.2811, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.8560095200317335, | |
| "grad_norm": 6.935328960418701, | |
| "learning_rate": 4.28609678698929e-05, | |
| "loss": 2.2544, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.895676318921063, | |
| "grad_norm": 3.9425063133239746, | |
| "learning_rate": 4.276180087266958e-05, | |
| "loss": 2.2934, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.9353431178103926, | |
| "grad_norm": 6.062328815460205, | |
| "learning_rate": 4.266263387544626e-05, | |
| "loss": 2.3048, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.9750099166997224, | |
| "grad_norm": 4.808726787567139, | |
| "learning_rate": 4.256346687822293e-05, | |
| "loss": 2.2118, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 1.770484209060669, | |
| "eval_runtime": 33.1033, | |
| "eval_samples_per_second": 45.856, | |
| "eval_steps_per_second": 5.74, | |
| "step": 7563 | |
| }, | |
| { | |
| "epoch": 3.014676715589052, | |
| "grad_norm": 4.881776809692383, | |
| "learning_rate": 4.24642998809996e-05, | |
| "loss": 2.2472, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 3.0543435144783815, | |
| "grad_norm": 6.6921706199646, | |
| "learning_rate": 4.236513288377628e-05, | |
| "loss": 2.1728, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 3.0940103133677113, | |
| "grad_norm": 3.29506254196167, | |
| "learning_rate": 4.226596588655295e-05, | |
| "loss": 2.0689, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 3.133677112257041, | |
| "grad_norm": 4.864801406860352, | |
| "learning_rate": 4.216679888932963e-05, | |
| "loss": 2.2328, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 3.1733439111463704, | |
| "grad_norm": 3.8594539165496826, | |
| "learning_rate": 4.206763189210631e-05, | |
| "loss": 2.0911, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 3.2130107100357, | |
| "grad_norm": 5.1737380027771, | |
| "learning_rate": 4.196846489488299e-05, | |
| "loss": 2.0999, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 3.25267750892503, | |
| "grad_norm": 4.454146385192871, | |
| "learning_rate": 4.186929789765966e-05, | |
| "loss": 2.0902, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 3.2923443078143593, | |
| "grad_norm": 5.417801380157471, | |
| "learning_rate": 4.177013090043634e-05, | |
| "loss": 2.0971, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 3.332011106703689, | |
| "grad_norm": 2.7768959999084473, | |
| "learning_rate": 4.167096390321301e-05, | |
| "loss": 2.1635, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 3.371677905593019, | |
| "grad_norm": 4.387384414672852, | |
| "learning_rate": 4.157179690598969e-05, | |
| "loss": 2.0166, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 3.411344704482348, | |
| "grad_norm": 4.593613624572754, | |
| "learning_rate": 4.147262990876636e-05, | |
| "loss": 2.0944, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 3.451011503371678, | |
| "grad_norm": 5.243652820587158, | |
| "learning_rate": 4.137346291154304e-05, | |
| "loss": 2.0518, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 3.4906783022610077, | |
| "grad_norm": 5.076266765594482, | |
| "learning_rate": 4.127429591431972e-05, | |
| "loss": 2.0412, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 3.530345101150337, | |
| "grad_norm": 5.36345911026001, | |
| "learning_rate": 4.11751289170964e-05, | |
| "loss": 2.0586, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 3.570011900039667, | |
| "grad_norm": 6.591952800750732, | |
| "learning_rate": 4.107596191987307e-05, | |
| "loss": 2.059, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.609678698928996, | |
| "grad_norm": 5.091315746307373, | |
| "learning_rate": 4.097679492264974e-05, | |
| "loss": 2.0451, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 3.649345497818326, | |
| "grad_norm": 4.647657871246338, | |
| "learning_rate": 4.087762792542642e-05, | |
| "loss": 2.0488, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 3.6890122967076557, | |
| "grad_norm": 5.167809963226318, | |
| "learning_rate": 4.077846092820309e-05, | |
| "loss": 2.0688, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 3.728679095596985, | |
| "grad_norm": 67.48959350585938, | |
| "learning_rate": 4.067929393097977e-05, | |
| "loss": 2.0513, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 3.768345894486315, | |
| "grad_norm": 3.942390203475952, | |
| "learning_rate": 4.058012693375645e-05, | |
| "loss": 1.9697, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.8080126933756446, | |
| "grad_norm": 5.491151332855225, | |
| "learning_rate": 4.048095993653313e-05, | |
| "loss": 2.0849, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 3.847679492264974, | |
| "grad_norm": 4.637006759643555, | |
| "learning_rate": 4.03817929393098e-05, | |
| "loss": 1.9753, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 3.8873462911543037, | |
| "grad_norm": 4.818416595458984, | |
| "learning_rate": 4.028262594208648e-05, | |
| "loss": 2.0526, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 3.9270130900436335, | |
| "grad_norm": 4.810122013092041, | |
| "learning_rate": 4.018345894486315e-05, | |
| "loss": 2.0148, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 3.966679888932963, | |
| "grad_norm": 4.372331142425537, | |
| "learning_rate": 4.008429194763983e-05, | |
| "loss": 2.0324, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 1.5883285999298096, | |
| "eval_runtime": 33.141, | |
| "eval_samples_per_second": 45.804, | |
| "eval_steps_per_second": 5.733, | |
| "step": 10084 | |
| }, | |
| { | |
| "epoch": 4.006346687822293, | |
| "grad_norm": 4.643691539764404, | |
| "learning_rate": 3.99851249504165e-05, | |
| "loss": 2.0171, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 4.046013486711622, | |
| "grad_norm": 5.210694789886475, | |
| "learning_rate": 3.988595795319318e-05, | |
| "loss": 1.9401, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 4.085680285600952, | |
| "grad_norm": 5.724204063415527, | |
| "learning_rate": 3.978679095596986e-05, | |
| "loss": 1.9012, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 4.125347084490282, | |
| "grad_norm": 3.6750075817108154, | |
| "learning_rate": 3.9687623958746536e-05, | |
| "loss": 1.7982, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 4.165013883379611, | |
| "grad_norm": 4.948938369750977, | |
| "learning_rate": 3.958845696152321e-05, | |
| "loss": 1.9426, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 4.204680682268941, | |
| "grad_norm": 5.098011016845703, | |
| "learning_rate": 3.948928996429988e-05, | |
| "loss": 1.9476, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 4.244347481158271, | |
| "grad_norm": 3.605708599090576, | |
| "learning_rate": 3.939012296707656e-05, | |
| "loss": 1.91, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 4.2840142800476, | |
| "grad_norm": 4.1741766929626465, | |
| "learning_rate": 3.929095596985323e-05, | |
| "loss": 1.9512, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 4.3236810789369295, | |
| "grad_norm": 4.427469730377197, | |
| "learning_rate": 3.919178897262991e-05, | |
| "loss": 1.859, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 4.36334787782626, | |
| "grad_norm": 4.128306865692139, | |
| "learning_rate": 3.909262197540659e-05, | |
| "loss": 1.8465, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 4.403014676715589, | |
| "grad_norm": 3.959047317504883, | |
| "learning_rate": 3.8993454978183266e-05, | |
| "loss": 1.9168, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 4.442681475604918, | |
| "grad_norm": 5.283690452575684, | |
| "learning_rate": 3.889428798095994e-05, | |
| "loss": 1.8212, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 4.482348274494249, | |
| "grad_norm": 4.190108299255371, | |
| "learning_rate": 3.8795120983736616e-05, | |
| "loss": 1.7881, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 4.522015073383578, | |
| "grad_norm": 5.957630157470703, | |
| "learning_rate": 3.869595398651329e-05, | |
| "loss": 1.8606, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 4.561681872272907, | |
| "grad_norm": 4.41494607925415, | |
| "learning_rate": 3.859678698928997e-05, | |
| "loss": 1.8808, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 4.601348671162238, | |
| "grad_norm": 4.355372428894043, | |
| "learning_rate": 3.849761999206664e-05, | |
| "loss": 1.8034, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 4.641015470051567, | |
| "grad_norm": 4.594727993011475, | |
| "learning_rate": 3.839845299484332e-05, | |
| "loss": 1.8631, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 4.680682268940896, | |
| "grad_norm": 3.8081648349761963, | |
| "learning_rate": 3.8299285997619996e-05, | |
| "loss": 1.7664, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 4.7203490678302265, | |
| "grad_norm": 5.383887767791748, | |
| "learning_rate": 3.8200119000396675e-05, | |
| "loss": 1.8918, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 4.760015866719556, | |
| "grad_norm": 4.703048229217529, | |
| "learning_rate": 3.8100952003173347e-05, | |
| "loss": 1.7821, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 4.799682665608885, | |
| "grad_norm": 5.115866661071777, | |
| "learning_rate": 3.800178500595002e-05, | |
| "loss": 1.8276, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 4.839349464498215, | |
| "grad_norm": 4.647130012512207, | |
| "learning_rate": 3.79026180087267e-05, | |
| "loss": 1.7743, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 4.879016263387545, | |
| "grad_norm": 4.2948994636535645, | |
| "learning_rate": 3.780345101150337e-05, | |
| "loss": 1.7567, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 4.918683062276874, | |
| "grad_norm": 4.055002212524414, | |
| "learning_rate": 3.770428401428005e-05, | |
| "loss": 1.832, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 4.958349861166204, | |
| "grad_norm": 4.373877048492432, | |
| "learning_rate": 3.7605117017056726e-05, | |
| "loss": 1.7995, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 4.998016660055534, | |
| "grad_norm": 5.246423721313477, | |
| "learning_rate": 3.7505950019833405e-05, | |
| "loss": 1.7464, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 1.4670053720474243, | |
| "eval_runtime": 33.135, | |
| "eval_samples_per_second": 45.813, | |
| "eval_steps_per_second": 5.734, | |
| "step": 12605 | |
| }, | |
| { | |
| "epoch": 5.037683458944863, | |
| "grad_norm": 5.669796943664551, | |
| "learning_rate": 3.740678302261008e-05, | |
| "loss": 1.6803, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 5.077350257834193, | |
| "grad_norm": 4.203566074371338, | |
| "learning_rate": 3.7307616025386755e-05, | |
| "loss": 1.665, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 5.1170170567235225, | |
| "grad_norm": 3.6892035007476807, | |
| "learning_rate": 3.720844902816343e-05, | |
| "loss": 1.7269, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 5.156683855612852, | |
| "grad_norm": 4.452983379364014, | |
| "learning_rate": 3.7109282030940106e-05, | |
| "loss": 1.7276, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 5.196350654502182, | |
| "grad_norm": 3.7172744274139404, | |
| "learning_rate": 3.701011503371678e-05, | |
| "loss": 1.6838, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 5.236017453391511, | |
| "grad_norm": 4.209805488586426, | |
| "learning_rate": 3.6910948036493456e-05, | |
| "loss": 1.6982, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 5.275684252280841, | |
| "grad_norm": 4.2851362228393555, | |
| "learning_rate": 3.6811781039270135e-05, | |
| "loss": 1.6656, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 5.315351051170171, | |
| "grad_norm": 3.345033884048462, | |
| "learning_rate": 3.6712614042046814e-05, | |
| "loss": 1.7176, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 5.3550178500595, | |
| "grad_norm": 7.854482173919678, | |
| "learning_rate": 3.6613447044823485e-05, | |
| "loss": 1.7395, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 5.39468464894883, | |
| "grad_norm": 5.201159477233887, | |
| "learning_rate": 3.651428004760016e-05, | |
| "loss": 1.7159, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 5.43435144783816, | |
| "grad_norm": 5.032053470611572, | |
| "learning_rate": 3.6415113050376836e-05, | |
| "loss": 1.7993, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 5.474018246727489, | |
| "grad_norm": 4.350612640380859, | |
| "learning_rate": 3.631594605315351e-05, | |
| "loss": 1.722, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 5.5136850456168185, | |
| "grad_norm": 5.67685604095459, | |
| "learning_rate": 3.6216779055930186e-05, | |
| "loss": 1.5882, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 5.553351844506149, | |
| "grad_norm": 3.8744733333587646, | |
| "learning_rate": 3.6117612058706865e-05, | |
| "loss": 1.6864, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 5.593018643395478, | |
| "grad_norm": 3.0556750297546387, | |
| "learning_rate": 3.6018445061483544e-05, | |
| "loss": 1.6494, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 5.632685442284807, | |
| "grad_norm": 5.0797343254089355, | |
| "learning_rate": 3.5919278064260215e-05, | |
| "loss": 1.747, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 5.672352241174138, | |
| "grad_norm": 4.625453948974609, | |
| "learning_rate": 3.5820111067036894e-05, | |
| "loss": 1.689, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 5.712019040063467, | |
| "grad_norm": 5.2560133934021, | |
| "learning_rate": 3.5720944069813566e-05, | |
| "loss": 1.6218, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 5.751685838952796, | |
| "grad_norm": 4.45328950881958, | |
| "learning_rate": 3.5621777072590245e-05, | |
| "loss": 1.7095, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 5.7913526378421265, | |
| "grad_norm": 3.0788328647613525, | |
| "learning_rate": 3.5522610075366916e-05, | |
| "loss": 1.7223, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 5.831019436731456, | |
| "grad_norm": 6.247290134429932, | |
| "learning_rate": 3.5423443078143595e-05, | |
| "loss": 1.6158, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 5.870686235620785, | |
| "grad_norm": 4.095520973205566, | |
| "learning_rate": 3.5324276080920274e-05, | |
| "loss": 1.6313, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 5.910353034510115, | |
| "grad_norm": 4.251845836639404, | |
| "learning_rate": 3.522510908369695e-05, | |
| "loss": 1.6458, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 5.950019833399445, | |
| "grad_norm": 3.833702802658081, | |
| "learning_rate": 3.5125942086473624e-05, | |
| "loss": 1.6425, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 5.989686632288774, | |
| "grad_norm": 4.577655792236328, | |
| "learning_rate": 3.5026775089250296e-05, | |
| "loss": 1.6747, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 1.3812555074691772, | |
| "eval_runtime": 33.4674, | |
| "eval_samples_per_second": 45.358, | |
| "eval_steps_per_second": 5.677, | |
| "step": 15126 | |
| }, | |
| { | |
| "epoch": 6.029353431178104, | |
| "grad_norm": 3.609616279602051, | |
| "learning_rate": 3.4927608092026975e-05, | |
| "loss": 1.6579, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 6.069020230067434, | |
| "grad_norm": 3.5658130645751953, | |
| "learning_rate": 3.4828441094803647e-05, | |
| "loss": 1.6842, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 6.108687028956763, | |
| "grad_norm": 4.586058139801025, | |
| "learning_rate": 3.4729274097580325e-05, | |
| "loss": 1.5563, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 6.148353827846093, | |
| "grad_norm": 5.103824615478516, | |
| "learning_rate": 3.4630107100357004e-05, | |
| "loss": 1.5386, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 6.1880206267354225, | |
| "grad_norm": 5.14306116104126, | |
| "learning_rate": 3.453094010313368e-05, | |
| "loss": 1.6081, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 6.227687425624752, | |
| "grad_norm": 4.270661354064941, | |
| "learning_rate": 3.4431773105910354e-05, | |
| "loss": 1.5569, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 6.267354224514082, | |
| "grad_norm": 13.869562149047852, | |
| "learning_rate": 3.433260610868703e-05, | |
| "loss": 1.5484, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 6.307021023403411, | |
| "grad_norm": 3.9378180503845215, | |
| "learning_rate": 3.4233439111463705e-05, | |
| "loss": 1.5441, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 6.346687822292741, | |
| "grad_norm": 4.3542656898498535, | |
| "learning_rate": 3.4134272114240383e-05, | |
| "loss": 1.58, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 6.386354621182071, | |
| "grad_norm": 3.8545126914978027, | |
| "learning_rate": 3.4035105117017055e-05, | |
| "loss": 1.4445, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 6.4260214200714, | |
| "grad_norm": 3.9810452461242676, | |
| "learning_rate": 3.3935938119793734e-05, | |
| "loss": 1.6052, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 6.46568821896073, | |
| "grad_norm": 7.306039333343506, | |
| "learning_rate": 3.383677112257041e-05, | |
| "loss": 1.608, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 6.50535501785006, | |
| "grad_norm": 4.018649578094482, | |
| "learning_rate": 3.373760412534709e-05, | |
| "loss": 1.581, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 6.545021816739389, | |
| "grad_norm": 5.1577019691467285, | |
| "learning_rate": 3.363843712812376e-05, | |
| "loss": 1.548, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 6.5846886156287185, | |
| "grad_norm": 6.858482837677002, | |
| "learning_rate": 3.3539270130900435e-05, | |
| "loss": 1.5823, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 6.624355414518049, | |
| "grad_norm": 4.213831901550293, | |
| "learning_rate": 3.3440103133677114e-05, | |
| "loss": 1.5199, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 6.664022213407378, | |
| "grad_norm": 3.531313180923462, | |
| "learning_rate": 3.3340936136453785e-05, | |
| "loss": 1.5993, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 6.703689012296707, | |
| "grad_norm": 4.222484588623047, | |
| "learning_rate": 3.3241769139230464e-05, | |
| "loss": 1.5839, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 6.743355811186038, | |
| "grad_norm": 3.11354660987854, | |
| "learning_rate": 3.314260214200714e-05, | |
| "loss": 1.5302, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 6.783022610075367, | |
| "grad_norm": 3.699721574783325, | |
| "learning_rate": 3.304343514478382e-05, | |
| "loss": 1.5662, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 6.822689408964696, | |
| "grad_norm": 6.095912456512451, | |
| "learning_rate": 3.294426814756049e-05, | |
| "loss": 1.6208, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 6.862356207854027, | |
| "grad_norm": 3.0489301681518555, | |
| "learning_rate": 3.284510115033717e-05, | |
| "loss": 1.4306, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 6.902023006743356, | |
| "grad_norm": 4.094913005828857, | |
| "learning_rate": 3.2745934153113844e-05, | |
| "loss": 1.438, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 6.941689805632685, | |
| "grad_norm": 3.900447130203247, | |
| "learning_rate": 3.264676715589052e-05, | |
| "loss": 1.4798, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 6.9813566045220155, | |
| "grad_norm": 4.244141578674316, | |
| "learning_rate": 3.2547600158667194e-05, | |
| "loss": 1.5627, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 1.3121882677078247, | |
| "eval_runtime": 33.1418, | |
| "eval_samples_per_second": 45.803, | |
| "eval_steps_per_second": 5.733, | |
| "step": 17647 | |
| }, | |
| { | |
| "epoch": 7.021023403411345, | |
| "grad_norm": 5.134289264678955, | |
| "learning_rate": 3.244843316144387e-05, | |
| "loss": 1.4704, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 7.060690202300674, | |
| "grad_norm": 4.705554008483887, | |
| "learning_rate": 3.234926616422055e-05, | |
| "loss": 1.4257, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 7.100357001190004, | |
| "grad_norm": 5.20936918258667, | |
| "learning_rate": 3.225009916699723e-05, | |
| "loss": 1.4684, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 7.140023800079334, | |
| "grad_norm": 5.669763565063477, | |
| "learning_rate": 3.21509321697739e-05, | |
| "loss": 1.4626, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 7.179690598968663, | |
| "grad_norm": 4.726533889770508, | |
| "learning_rate": 3.2051765172550574e-05, | |
| "loss": 1.4362, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 7.219357397857993, | |
| "grad_norm": 3.413167715072632, | |
| "learning_rate": 3.195259817532725e-05, | |
| "loss": 1.4591, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 7.259024196747323, | |
| "grad_norm": 3.3368911743164062, | |
| "learning_rate": 3.1853431178103924e-05, | |
| "loss": 1.5077, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 7.298690995636652, | |
| "grad_norm": 3.5089704990386963, | |
| "learning_rate": 3.17542641808806e-05, | |
| "loss": 1.509, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 7.338357794525982, | |
| "grad_norm": 4.13053035736084, | |
| "learning_rate": 3.165509718365728e-05, | |
| "loss": 1.5048, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 7.3780245934153115, | |
| "grad_norm": 4.646170139312744, | |
| "learning_rate": 3.155593018643396e-05, | |
| "loss": 1.474, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 7.417691392304641, | |
| "grad_norm": 4.4724812507629395, | |
| "learning_rate": 3.145676318921063e-05, | |
| "loss": 1.5448, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 7.457358191193971, | |
| "grad_norm": 3.79464054107666, | |
| "learning_rate": 3.135759619198731e-05, | |
| "loss": 1.4379, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 7.4970249900833, | |
| "grad_norm": 3.2396161556243896, | |
| "learning_rate": 3.125842919476398e-05, | |
| "loss": 1.4857, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 7.53669178897263, | |
| "grad_norm": 3.6047024726867676, | |
| "learning_rate": 3.115926219754066e-05, | |
| "loss": 1.453, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 7.57635858786196, | |
| "grad_norm": 4.998748779296875, | |
| "learning_rate": 3.106009520031733e-05, | |
| "loss": 1.4062, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 7.616025386751289, | |
| "grad_norm": 4.068435192108154, | |
| "learning_rate": 3.096092820309401e-05, | |
| "loss": 1.3582, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 7.655692185640619, | |
| "grad_norm": 5.680367469787598, | |
| "learning_rate": 3.086176120587069e-05, | |
| "loss": 1.4897, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 7.695358984529949, | |
| "grad_norm": 3.917802333831787, | |
| "learning_rate": 3.076259420864737e-05, | |
| "loss": 1.4195, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 7.735025783419278, | |
| "grad_norm": 3.1522891521453857, | |
| "learning_rate": 3.066342721142404e-05, | |
| "loss": 1.4599, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 7.7746925823086075, | |
| "grad_norm": 4.597601890563965, | |
| "learning_rate": 3.056426021420071e-05, | |
| "loss": 1.4701, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 7.814359381197937, | |
| "grad_norm": 4.217317581176758, | |
| "learning_rate": 3.046509321697739e-05, | |
| "loss": 1.4263, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 7.854026180087267, | |
| "grad_norm": 4.17954158782959, | |
| "learning_rate": 3.0365926219754066e-05, | |
| "loss": 1.4155, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 7.893692978976596, | |
| "grad_norm": 4.049231052398682, | |
| "learning_rate": 3.0266759222530745e-05, | |
| "loss": 1.4343, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 7.933359777865926, | |
| "grad_norm": 3.9351389408111572, | |
| "learning_rate": 3.0167592225307417e-05, | |
| "loss": 1.4247, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 7.973026576755256, | |
| "grad_norm": 6.478794097900391, | |
| "learning_rate": 3.0068425228084096e-05, | |
| "loss": 1.4336, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 1.2493535280227661, | |
| "eval_runtime": 33.1532, | |
| "eval_samples_per_second": 45.787, | |
| "eval_steps_per_second": 5.731, | |
| "step": 20168 | |
| }, | |
| { | |
| "epoch": 8.012693375644586, | |
| "grad_norm": 7.988471508026123, | |
| "learning_rate": 2.996925823086077e-05, | |
| "loss": 1.4408, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 8.052360174533915, | |
| "grad_norm": 3.978797674179077, | |
| "learning_rate": 2.987009123363745e-05, | |
| "loss": 1.4227, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 8.092026973423245, | |
| "grad_norm": 2.8589699268341064, | |
| "learning_rate": 2.977092423641412e-05, | |
| "loss": 1.3348, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 8.131693772312575, | |
| "grad_norm": 4.3820061683654785, | |
| "learning_rate": 2.96717572391908e-05, | |
| "loss": 1.3374, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 8.171360571201904, | |
| "grad_norm": 4.421834468841553, | |
| "learning_rate": 2.9572590241967475e-05, | |
| "loss": 1.379, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 8.211027370091234, | |
| "grad_norm": 3.6717193126678467, | |
| "learning_rate": 2.9473423244744154e-05, | |
| "loss": 1.3878, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 8.250694168980564, | |
| "grad_norm": 5.8960466384887695, | |
| "learning_rate": 2.9374256247520826e-05, | |
| "loss": 1.418, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 8.290360967869892, | |
| "grad_norm": 4.1541428565979, | |
| "learning_rate": 2.9275089250297504e-05, | |
| "loss": 1.3427, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 8.330027766759223, | |
| "grad_norm": 4.0375566482543945, | |
| "learning_rate": 2.917592225307418e-05, | |
| "loss": 1.3659, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 8.369694565648553, | |
| "grad_norm": 2.6886465549468994, | |
| "learning_rate": 2.907675525585085e-05, | |
| "loss": 1.3568, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 8.409361364537881, | |
| "grad_norm": 4.069731712341309, | |
| "learning_rate": 2.897758825862753e-05, | |
| "loss": 1.4326, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 8.449028163427212, | |
| "grad_norm": 4.844085693359375, | |
| "learning_rate": 2.8878421261404205e-05, | |
| "loss": 1.4363, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 8.488694962316542, | |
| "grad_norm": 2.894545316696167, | |
| "learning_rate": 2.8779254264180884e-05, | |
| "loss": 1.362, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 8.52836176120587, | |
| "grad_norm": 3.8921375274658203, | |
| "learning_rate": 2.8680087266957556e-05, | |
| "loss": 1.3303, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 8.5680285600952, | |
| "grad_norm": 3.6468684673309326, | |
| "learning_rate": 2.8580920269734234e-05, | |
| "loss": 1.387, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 8.60769535898453, | |
| "grad_norm": 4.2180938720703125, | |
| "learning_rate": 2.848175327251091e-05, | |
| "loss": 1.366, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 8.647362157873859, | |
| "grad_norm": 4.113888263702393, | |
| "learning_rate": 2.8382586275287588e-05, | |
| "loss": 1.4047, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 8.68702895676319, | |
| "grad_norm": 4.009461402893066, | |
| "learning_rate": 2.828341927806426e-05, | |
| "loss": 1.3446, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 8.72669575565252, | |
| "grad_norm": 3.8195252418518066, | |
| "learning_rate": 2.818425228084094e-05, | |
| "loss": 1.3304, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 8.766362554541848, | |
| "grad_norm": 4.5541300773620605, | |
| "learning_rate": 2.8085085283617614e-05, | |
| "loss": 1.4156, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 8.806029353431178, | |
| "grad_norm": 4.221588611602783, | |
| "learning_rate": 2.7985918286394293e-05, | |
| "loss": 1.3258, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 8.845696152320508, | |
| "grad_norm": 3.7638354301452637, | |
| "learning_rate": 2.7886751289170964e-05, | |
| "loss": 1.2697, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 8.885362951209837, | |
| "grad_norm": 3.7174267768859863, | |
| "learning_rate": 2.7787584291947643e-05, | |
| "loss": 1.3468, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 8.925029750099167, | |
| "grad_norm": 4.4955153465271, | |
| "learning_rate": 2.768841729472432e-05, | |
| "loss": 1.3074, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 8.964696548988497, | |
| "grad_norm": 4.170012950897217, | |
| "learning_rate": 2.758925029750099e-05, | |
| "loss": 1.3324, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 1.204575538635254, | |
| "eval_runtime": 33.1963, | |
| "eval_samples_per_second": 45.728, | |
| "eval_steps_per_second": 5.724, | |
| "step": 22689 | |
| }, | |
| { | |
| "epoch": 9.004363347877826, | |
| "grad_norm": 3.331163167953491, | |
| "learning_rate": 2.749008330027767e-05, | |
| "loss": 1.3285, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 9.044030146767156, | |
| "grad_norm": 3.822847843170166, | |
| "learning_rate": 2.7390916303054344e-05, | |
| "loss": 1.3589, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 9.083696945656486, | |
| "grad_norm": 3.4321391582489014, | |
| "learning_rate": 2.7291749305831023e-05, | |
| "loss": 1.2863, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 9.123363744545815, | |
| "grad_norm": 4.23520040512085, | |
| "learning_rate": 2.7192582308607695e-05, | |
| "loss": 1.297, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 9.163030543435145, | |
| "grad_norm": 3.0839881896972656, | |
| "learning_rate": 2.7093415311384373e-05, | |
| "loss": 1.272, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 9.202697342324475, | |
| "grad_norm": 5.115342617034912, | |
| "learning_rate": 2.699424831416105e-05, | |
| "loss": 1.2667, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 9.242364141213804, | |
| "grad_norm": 3.8965401649475098, | |
| "learning_rate": 2.6895081316937727e-05, | |
| "loss": 1.2995, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 9.282030940103134, | |
| "grad_norm": 3.395707368850708, | |
| "learning_rate": 2.67959143197144e-05, | |
| "loss": 1.2064, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 9.321697738992464, | |
| "grad_norm": 3.7783238887786865, | |
| "learning_rate": 2.6696747322491078e-05, | |
| "loss": 1.354, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 9.361364537881792, | |
| "grad_norm": 3.6201136112213135, | |
| "learning_rate": 2.6597580325267753e-05, | |
| "loss": 1.318, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 9.401031336771123, | |
| "grad_norm": 7.127315044403076, | |
| "learning_rate": 2.649841332804443e-05, | |
| "loss": 1.2809, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 9.440698135660453, | |
| "grad_norm": 3.341298818588257, | |
| "learning_rate": 2.6399246330821103e-05, | |
| "loss": 1.3285, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 9.480364934549781, | |
| "grad_norm": 3.38814377784729, | |
| "learning_rate": 2.6300079333597782e-05, | |
| "loss": 1.3326, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 9.520031733439112, | |
| "grad_norm": 2.880125045776367, | |
| "learning_rate": 2.6200912336374457e-05, | |
| "loss": 1.3142, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 9.559698532328442, | |
| "grad_norm": 3.778383731842041, | |
| "learning_rate": 2.610174533915113e-05, | |
| "loss": 1.3217, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 9.59936533121777, | |
| "grad_norm": 5.5109734535217285, | |
| "learning_rate": 2.6002578341927808e-05, | |
| "loss": 1.2715, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 9.6390321301071, | |
| "grad_norm": 3.931368112564087, | |
| "learning_rate": 2.5903411344704483e-05, | |
| "loss": 1.318, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 9.67869892899643, | |
| "grad_norm": 3.6587719917297363, | |
| "learning_rate": 2.580424434748116e-05, | |
| "loss": 1.2384, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 9.71836572788576, | |
| "grad_norm": 3.4478108882904053, | |
| "learning_rate": 2.5705077350257833e-05, | |
| "loss": 1.2682, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 9.75803252677509, | |
| "grad_norm": 3.9226527214050293, | |
| "learning_rate": 2.5605910353034512e-05, | |
| "loss": 1.2966, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 9.79769932566442, | |
| "grad_norm": 4.621306419372559, | |
| "learning_rate": 2.5506743355811187e-05, | |
| "loss": 1.2788, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 9.837366124553748, | |
| "grad_norm": 3.4298593997955322, | |
| "learning_rate": 2.5407576358587866e-05, | |
| "loss": 1.3299, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 9.877032923443078, | |
| "grad_norm": 3.7832400798797607, | |
| "learning_rate": 2.5308409361364538e-05, | |
| "loss": 1.2634, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 9.916699722332408, | |
| "grad_norm": 5.351818561553955, | |
| "learning_rate": 2.5209242364141216e-05, | |
| "loss": 1.3117, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 9.956366521221737, | |
| "grad_norm": 4.65415096282959, | |
| "learning_rate": 2.511007536691789e-05, | |
| "loss": 1.2613, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 9.996033320111067, | |
| "grad_norm": 3.2736618518829346, | |
| "learning_rate": 2.501090836969457e-05, | |
| "loss": 1.3156, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 1.164570689201355, | |
| "eval_runtime": 33.0846, | |
| "eval_samples_per_second": 45.882, | |
| "eval_steps_per_second": 5.743, | |
| "step": 25210 | |
| }, | |
| { | |
| "epoch": 10.035700119000397, | |
| "grad_norm": 3.6819069385528564, | |
| "learning_rate": 2.4911741372471242e-05, | |
| "loss": 1.2604, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 10.075366917889726, | |
| "grad_norm": 3.9212143421173096, | |
| "learning_rate": 2.4812574375247917e-05, | |
| "loss": 1.2308, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 10.115033716779056, | |
| "grad_norm": 3.3087549209594727, | |
| "learning_rate": 2.4713407378024596e-05, | |
| "loss": 1.1652, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 10.154700515668386, | |
| "grad_norm": 3.8680827617645264, | |
| "learning_rate": 2.461424038080127e-05, | |
| "loss": 1.2311, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 10.194367314557715, | |
| "grad_norm": 5.244319438934326, | |
| "learning_rate": 2.4515073383577946e-05, | |
| "loss": 1.1819, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 10.234034113447045, | |
| "grad_norm": 3.2293717861175537, | |
| "learning_rate": 2.4415906386354622e-05, | |
| "loss": 1.249, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 10.273700912336375, | |
| "grad_norm": 4.391103744506836, | |
| "learning_rate": 2.43167393891313e-05, | |
| "loss": 1.2283, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 10.313367711225704, | |
| "grad_norm": 4.615547180175781, | |
| "learning_rate": 2.4217572391907976e-05, | |
| "loss": 1.2915, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 10.353034510115034, | |
| "grad_norm": 3.3367502689361572, | |
| "learning_rate": 2.411840539468465e-05, | |
| "loss": 1.2221, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 10.392701309004364, | |
| "grad_norm": 5.194177150726318, | |
| "learning_rate": 2.4019238397461326e-05, | |
| "loss": 1.2611, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 10.432368107893693, | |
| "grad_norm": 5.576562404632568, | |
| "learning_rate": 2.3920071400238005e-05, | |
| "loss": 1.2764, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 10.472034906783023, | |
| "grad_norm": 4.902477264404297, | |
| "learning_rate": 2.3820904403014677e-05, | |
| "loss": 1.2066, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 10.511701705672353, | |
| "grad_norm": 4.312764644622803, | |
| "learning_rate": 2.3721737405791352e-05, | |
| "loss": 1.219, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 10.551368504561681, | |
| "grad_norm": 4.345120429992676, | |
| "learning_rate": 2.362257040856803e-05, | |
| "loss": 1.2679, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 10.591035303451012, | |
| "grad_norm": 3.9365150928497314, | |
| "learning_rate": 2.3523403411344706e-05, | |
| "loss": 1.1752, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 10.630702102340342, | |
| "grad_norm": 3.843207597732544, | |
| "learning_rate": 2.342423641412138e-05, | |
| "loss": 1.2706, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 10.67036890122967, | |
| "grad_norm": 4.076716423034668, | |
| "learning_rate": 2.3325069416898056e-05, | |
| "loss": 1.1561, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 10.710035700119, | |
| "grad_norm": 4.182331562042236, | |
| "learning_rate": 2.3225902419674735e-05, | |
| "loss": 1.2027, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 10.74970249900833, | |
| "grad_norm": 5.730105876922607, | |
| "learning_rate": 2.312673542245141e-05, | |
| "loss": 1.2703, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 10.78936929789766, | |
| "grad_norm": 5.552068710327148, | |
| "learning_rate": 2.3027568425228085e-05, | |
| "loss": 1.252, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 10.82903609678699, | |
| "grad_norm": 4.406209945678711, | |
| "learning_rate": 2.292840142800476e-05, | |
| "loss": 1.183, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 10.86870289567632, | |
| "grad_norm": 3.434688091278076, | |
| "learning_rate": 2.282923443078144e-05, | |
| "loss": 1.3214, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 10.908369694565648, | |
| "grad_norm": 5.0344085693359375, | |
| "learning_rate": 2.2730067433558114e-05, | |
| "loss": 1.3043, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 10.948036493454978, | |
| "grad_norm": 3.3030033111572266, | |
| "learning_rate": 2.263090043633479e-05, | |
| "loss": 1.1764, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 10.987703292344309, | |
| "grad_norm": 5.79923152923584, | |
| "learning_rate": 2.2531733439111465e-05, | |
| "loss": 1.2218, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 1.1352205276489258, | |
| "eval_runtime": 31.6991, | |
| "eval_samples_per_second": 47.888, | |
| "eval_steps_per_second": 5.994, | |
| "step": 27731 | |
| }, | |
| { | |
| "epoch": 11.027370091233637, | |
| "grad_norm": 4.073122501373291, | |
| "learning_rate": 2.2432566441888144e-05, | |
| "loss": 1.1861, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 11.067036890122967, | |
| "grad_norm": 2.8648433685302734, | |
| "learning_rate": 2.2333399444664815e-05, | |
| "loss": 1.1659, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 11.106703689012297, | |
| "grad_norm": 3.6053709983825684, | |
| "learning_rate": 2.223423244744149e-05, | |
| "loss": 1.2087, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 11.146370487901626, | |
| "grad_norm": 3.5773251056671143, | |
| "learning_rate": 2.2135065450218166e-05, | |
| "loss": 1.1787, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 11.186037286790956, | |
| "grad_norm": 5.5593485832214355, | |
| "learning_rate": 2.2035898452994845e-05, | |
| "loss": 1.1941, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 11.225704085680286, | |
| "grad_norm": 3.9467504024505615, | |
| "learning_rate": 2.193673145577152e-05, | |
| "loss": 1.2505, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 11.265370884569615, | |
| "grad_norm": 4.707422733306885, | |
| "learning_rate": 2.1837564458548195e-05, | |
| "loss": 1.1165, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 11.305037683458945, | |
| "grad_norm": 4.517952919006348, | |
| "learning_rate": 2.1738397461324874e-05, | |
| "loss": 1.2379, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 11.344704482348275, | |
| "grad_norm": 2.318586587905884, | |
| "learning_rate": 2.163923046410155e-05, | |
| "loss": 1.2098, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 11.384371281237604, | |
| "grad_norm": 3.655980110168457, | |
| "learning_rate": 2.1540063466878224e-05, | |
| "loss": 1.2044, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 11.424038080126934, | |
| "grad_norm": 4.038224697113037, | |
| "learning_rate": 2.14408964696549e-05, | |
| "loss": 1.1651, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 11.463704879016264, | |
| "grad_norm": 3.9811367988586426, | |
| "learning_rate": 2.1341729472431578e-05, | |
| "loss": 1.199, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 11.503371677905593, | |
| "grad_norm": 6.200103759765625, | |
| "learning_rate": 2.1242562475208253e-05, | |
| "loss": 1.1094, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 11.543038476794923, | |
| "grad_norm": 3.919187545776367, | |
| "learning_rate": 2.114339547798493e-05, | |
| "loss": 1.1522, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 11.582705275684253, | |
| "grad_norm": 3.701822519302368, | |
| "learning_rate": 2.1044228480761604e-05, | |
| "loss": 1.1556, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 11.622372074573581, | |
| "grad_norm": 4.491922855377197, | |
| "learning_rate": 2.0945061483538282e-05, | |
| "loss": 1.1779, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 11.662038873462912, | |
| "grad_norm": 4.367665767669678, | |
| "learning_rate": 2.0845894486314954e-05, | |
| "loss": 1.1392, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 11.701705672352242, | |
| "grad_norm": 4.0435028076171875, | |
| "learning_rate": 2.074672748909163e-05, | |
| "loss": 1.1621, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 11.74137247124157, | |
| "grad_norm": 4.151968955993652, | |
| "learning_rate": 2.0647560491868305e-05, | |
| "loss": 1.1983, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 11.7810392701309, | |
| "grad_norm": 4.687623500823975, | |
| "learning_rate": 2.0548393494644983e-05, | |
| "loss": 1.1563, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 11.82070606902023, | |
| "grad_norm": 4.415579795837402, | |
| "learning_rate": 2.044922649742166e-05, | |
| "loss": 1.1497, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 11.86037286790956, | |
| "grad_norm": 4.241002082824707, | |
| "learning_rate": 2.0350059500198334e-05, | |
| "loss": 1.2298, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 11.90003966679889, | |
| "grad_norm": 5.38535213470459, | |
| "learning_rate": 2.025089250297501e-05, | |
| "loss": 1.1403, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 11.93970646568822, | |
| "grad_norm": 3.886983633041382, | |
| "learning_rate": 2.0151725505751688e-05, | |
| "loss": 1.237, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 11.979373264577548, | |
| "grad_norm": 4.2845048904418945, | |
| "learning_rate": 2.0052558508528363e-05, | |
| "loss": 1.2216, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 1.097899317741394, | |
| "eval_runtime": 31.7141, | |
| "eval_samples_per_second": 47.865, | |
| "eval_steps_per_second": 5.991, | |
| "step": 30252 | |
| }, | |
| { | |
| "epoch": 12.019040063466878, | |
| "grad_norm": 4.043181896209717, | |
| "learning_rate": 1.9953391511305038e-05, | |
| "loss": 1.1738, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 12.058706862356209, | |
| "grad_norm": 3.213641405105591, | |
| "learning_rate": 1.9854224514081713e-05, | |
| "loss": 1.1143, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 12.098373661245537, | |
| "grad_norm": 4.7294511795043945, | |
| "learning_rate": 1.9755057516858392e-05, | |
| "loss": 1.142, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 12.138040460134867, | |
| "grad_norm": 4.42033052444458, | |
| "learning_rate": 1.9655890519635067e-05, | |
| "loss": 1.1422, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 12.177707259024197, | |
| "grad_norm": 4.57334041595459, | |
| "learning_rate": 1.9556723522411743e-05, | |
| "loss": 1.1148, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 12.217374057913526, | |
| "grad_norm": 4.560477256774902, | |
| "learning_rate": 1.945755652518842e-05, | |
| "loss": 1.1742, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 12.257040856802856, | |
| "grad_norm": 3.4284374713897705, | |
| "learning_rate": 1.9358389527965093e-05, | |
| "loss": 1.1115, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 12.296707655692186, | |
| "grad_norm": 3.185410499572754, | |
| "learning_rate": 1.925922253074177e-05, | |
| "loss": 1.1542, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 12.336374454581515, | |
| "grad_norm": 3.674408435821533, | |
| "learning_rate": 1.9160055533518444e-05, | |
| "loss": 1.1787, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 12.376041253470845, | |
| "grad_norm": 3.7118613719940186, | |
| "learning_rate": 1.9060888536295122e-05, | |
| "loss": 1.1716, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 12.415708052360175, | |
| "grad_norm": 4.5831756591796875, | |
| "learning_rate": 1.8961721539071797e-05, | |
| "loss": 1.1372, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 12.455374851249504, | |
| "grad_norm": 7.098066806793213, | |
| "learning_rate": 1.8862554541848473e-05, | |
| "loss": 1.1361, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 12.495041650138834, | |
| "grad_norm": 3.451817512512207, | |
| "learning_rate": 1.8763387544625148e-05, | |
| "loss": 1.1458, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 12.534708449028164, | |
| "grad_norm": 2.6188955307006836, | |
| "learning_rate": 1.8664220547401827e-05, | |
| "loss": 1.0782, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 12.574375247917493, | |
| "grad_norm": 3.3588056564331055, | |
| "learning_rate": 1.8565053550178502e-05, | |
| "loss": 1.1593, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 12.614042046806823, | |
| "grad_norm": 5.186858654022217, | |
| "learning_rate": 1.8465886552955177e-05, | |
| "loss": 1.137, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 12.653708845696153, | |
| "grad_norm": 4.593524932861328, | |
| "learning_rate": 1.8366719555731852e-05, | |
| "loss": 1.1715, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 12.693375644585482, | |
| "grad_norm": 4.951717853546143, | |
| "learning_rate": 1.826755255850853e-05, | |
| "loss": 1.0765, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 12.733042443474812, | |
| "grad_norm": 6.989925384521484, | |
| "learning_rate": 1.8168385561285206e-05, | |
| "loss": 1.1062, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 12.772709242364142, | |
| "grad_norm": 3.6436753273010254, | |
| "learning_rate": 1.806921856406188e-05, | |
| "loss": 1.1574, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 12.81237604125347, | |
| "grad_norm": 4.659509181976318, | |
| "learning_rate": 1.7970051566838557e-05, | |
| "loss": 1.1257, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 12.8520428401428, | |
| "grad_norm": 2.914414882659912, | |
| "learning_rate": 1.7870884569615232e-05, | |
| "loss": 1.1131, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 12.89170963903213, | |
| "grad_norm": 3.9510741233825684, | |
| "learning_rate": 1.7771717572391907e-05, | |
| "loss": 1.1144, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 12.93137643792146, | |
| "grad_norm": 4.820216178894043, | |
| "learning_rate": 1.7672550575168582e-05, | |
| "loss": 1.1628, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 12.97104323681079, | |
| "grad_norm": 4.699492931365967, | |
| "learning_rate": 1.757338357794526e-05, | |
| "loss": 1.1587, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 1.081364631652832, | |
| "eval_runtime": 31.6894, | |
| "eval_samples_per_second": 47.902, | |
| "eval_steps_per_second": 5.996, | |
| "step": 32773 | |
| }, | |
| { | |
| "epoch": 13.01071003570012, | |
| "grad_norm": 3.7646989822387695, | |
| "learning_rate": 1.7474216580721936e-05, | |
| "loss": 1.1084, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 13.050376834589448, | |
| "grad_norm": 4.074378967285156, | |
| "learning_rate": 1.737504958349861e-05, | |
| "loss": 1.1007, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 13.090043633478778, | |
| "grad_norm": 4.0714521408081055, | |
| "learning_rate": 1.7275882586275287e-05, | |
| "loss": 1.1298, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 13.129710432368109, | |
| "grad_norm": 3.7556121349334717, | |
| "learning_rate": 1.7176715589051965e-05, | |
| "loss": 1.1407, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 13.169377231257437, | |
| "grad_norm": 3.3032736778259277, | |
| "learning_rate": 1.707754859182864e-05, | |
| "loss": 1.1437, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 13.209044030146767, | |
| "grad_norm": 4.428369522094727, | |
| "learning_rate": 1.6978381594605316e-05, | |
| "loss": 1.0659, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 13.248710829036098, | |
| "grad_norm": 3.486649990081787, | |
| "learning_rate": 1.687921459738199e-05, | |
| "loss": 1.0744, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 13.288377627925426, | |
| "grad_norm": 4.116626262664795, | |
| "learning_rate": 1.678004760015867e-05, | |
| "loss": 1.0933, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 13.328044426814756, | |
| "grad_norm": 5.455049991607666, | |
| "learning_rate": 1.6680880602935345e-05, | |
| "loss": 1.0387, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 13.367711225704086, | |
| "grad_norm": 4.454029083251953, | |
| "learning_rate": 1.658171360571202e-05, | |
| "loss": 1.0488, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 13.407378024593415, | |
| "grad_norm": 3.605964422225952, | |
| "learning_rate": 1.6482546608488695e-05, | |
| "loss": 1.1565, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 13.447044823482745, | |
| "grad_norm": 3.3428781032562256, | |
| "learning_rate": 1.638337961126537e-05, | |
| "loss": 1.1255, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 13.486711622372075, | |
| "grad_norm": 5.9332709312438965, | |
| "learning_rate": 1.6284212614042046e-05, | |
| "loss": 1.0814, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 13.526378421261404, | |
| "grad_norm": 3.3487417697906494, | |
| "learning_rate": 1.618504561681872e-05, | |
| "loss": 1.1105, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 13.566045220150734, | |
| "grad_norm": 3.4275264739990234, | |
| "learning_rate": 1.60858786195954e-05, | |
| "loss": 1.0292, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 13.605712019040064, | |
| "grad_norm": 5.602040767669678, | |
| "learning_rate": 1.5986711622372075e-05, | |
| "loss": 1.0629, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 13.645378817929393, | |
| "grad_norm": 2.6752493381500244, | |
| "learning_rate": 1.588754462514875e-05, | |
| "loss": 1.0761, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 13.685045616818723, | |
| "grad_norm": 3.2931220531463623, | |
| "learning_rate": 1.5788377627925426e-05, | |
| "loss": 0.9885, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 13.724712415708053, | |
| "grad_norm": 8.223132133483887, | |
| "learning_rate": 1.5689210630702104e-05, | |
| "loss": 1.1423, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 13.764379214597382, | |
| "grad_norm": 4.580158233642578, | |
| "learning_rate": 1.559004363347878e-05, | |
| "loss": 1.0879, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 13.804046013486712, | |
| "grad_norm": 3.891131639480591, | |
| "learning_rate": 1.5490876636255455e-05, | |
| "loss": 1.0819, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 13.843712812376042, | |
| "grad_norm": 5.4781084060668945, | |
| "learning_rate": 1.539170963903213e-05, | |
| "loss": 1.1007, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 13.88337961126537, | |
| "grad_norm": 5.0408220291137695, | |
| "learning_rate": 1.529254264180881e-05, | |
| "loss": 1.1124, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 13.9230464101547, | |
| "grad_norm": 4.6583452224731445, | |
| "learning_rate": 1.5193375644585484e-05, | |
| "loss": 1.1607, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 13.962713209044031, | |
| "grad_norm": 5.026098251342773, | |
| "learning_rate": 1.5094208647362159e-05, | |
| "loss": 1.0744, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 1.068395733833313, | |
| "eval_runtime": 31.6512, | |
| "eval_samples_per_second": 47.96, | |
| "eval_steps_per_second": 6.003, | |
| "step": 35294 | |
| }, | |
| { | |
| "epoch": 14.00238000793336, | |
| "grad_norm": 2.9335262775421143, | |
| "learning_rate": 1.4995041650138836e-05, | |
| "loss": 1.0841, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 14.04204680682269, | |
| "grad_norm": 4.208588123321533, | |
| "learning_rate": 1.489587465291551e-05, | |
| "loss": 1.0901, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 14.08171360571202, | |
| "grad_norm": 5.132387638092041, | |
| "learning_rate": 1.4796707655692185e-05, | |
| "loss": 1.1201, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 14.121380404601348, | |
| "grad_norm": 3.9229278564453125, | |
| "learning_rate": 1.4697540658468862e-05, | |
| "loss": 1.0782, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 14.161047203490678, | |
| "grad_norm": 6.1097259521484375, | |
| "learning_rate": 1.4598373661245537e-05, | |
| "loss": 1.1051, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 14.200714002380009, | |
| "grad_norm": 4.1445417404174805, | |
| "learning_rate": 1.4499206664022214e-05, | |
| "loss": 1.1283, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 14.240380801269337, | |
| "grad_norm": 3.5986008644104004, | |
| "learning_rate": 1.440003966679889e-05, | |
| "loss": 1.0453, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 14.280047600158667, | |
| "grad_norm": 3.8175106048583984, | |
| "learning_rate": 1.4300872669575566e-05, | |
| "loss": 1.0585, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 14.319714399047998, | |
| "grad_norm": 2.821758985519409, | |
| "learning_rate": 1.4201705672352241e-05, | |
| "loss": 1.06, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 14.359381197937326, | |
| "grad_norm": 3.65065860748291, | |
| "learning_rate": 1.4102538675128918e-05, | |
| "loss": 1.1064, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 14.399047996826656, | |
| "grad_norm": 5.7176713943481445, | |
| "learning_rate": 1.4003371677905594e-05, | |
| "loss": 1.008, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 14.438714795715986, | |
| "grad_norm": 5.075132846832275, | |
| "learning_rate": 1.390420468068227e-05, | |
| "loss": 1.114, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 14.478381594605315, | |
| "grad_norm": 5.210816860198975, | |
| "learning_rate": 1.3805037683458946e-05, | |
| "loss": 1.0944, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 14.518048393494645, | |
| "grad_norm": 4.964089870452881, | |
| "learning_rate": 1.3705870686235623e-05, | |
| "loss": 1.0904, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 14.557715192383975, | |
| "grad_norm": 3.131520986557007, | |
| "learning_rate": 1.3606703689012298e-05, | |
| "loss": 1.063, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 14.597381991273304, | |
| "grad_norm": 6.203433036804199, | |
| "learning_rate": 1.3507536691788975e-05, | |
| "loss": 1.0885, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 14.637048790162634, | |
| "grad_norm": 2.8487484455108643, | |
| "learning_rate": 1.3408369694565648e-05, | |
| "loss": 1.0785, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 14.676715589051964, | |
| "grad_norm": 3.4533579349517822, | |
| "learning_rate": 1.3309202697342324e-05, | |
| "loss": 1.0956, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 14.716382387941293, | |
| "grad_norm": 5.409042835235596, | |
| "learning_rate": 1.3210035700119e-05, | |
| "loss": 1.0635, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 14.756049186830623, | |
| "grad_norm": 4.514674186706543, | |
| "learning_rate": 1.3110868702895676e-05, | |
| "loss": 1.0829, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 14.795715985719953, | |
| "grad_norm": 4.7005791664123535, | |
| "learning_rate": 1.3011701705672353e-05, | |
| "loss": 1.0003, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 14.835382784609282, | |
| "grad_norm": 4.253646373748779, | |
| "learning_rate": 1.2912534708449028e-05, | |
| "loss": 1.0562, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 14.875049583498612, | |
| "grad_norm": 4.305023193359375, | |
| "learning_rate": 1.2813367711225705e-05, | |
| "loss": 1.0712, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 14.914716382387942, | |
| "grad_norm": 4.189399719238281, | |
| "learning_rate": 1.271420071400238e-05, | |
| "loss": 1.0761, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 14.95438318127727, | |
| "grad_norm": 3.2512216567993164, | |
| "learning_rate": 1.2615033716779057e-05, | |
| "loss": 1.0336, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 14.9940499801666, | |
| "grad_norm": 3.3554651737213135, | |
| "learning_rate": 1.2515866719555732e-05, | |
| "loss": 1.0636, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 1.051405906677246, | |
| "eval_runtime": 31.6428, | |
| "eval_samples_per_second": 47.973, | |
| "eval_steps_per_second": 6.005, | |
| "step": 37815 | |
| }, | |
| { | |
| "epoch": 15.033716779055931, | |
| "grad_norm": 3.6472902297973633, | |
| "learning_rate": 1.241669972233241e-05, | |
| "loss": 1.0596, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 15.07338357794526, | |
| "grad_norm": 5.338723659515381, | |
| "learning_rate": 1.2317532725109085e-05, | |
| "loss": 1.0462, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 15.11305037683459, | |
| "grad_norm": 4.401419639587402, | |
| "learning_rate": 1.221836572788576e-05, | |
| "loss": 1.0869, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 15.15271717572392, | |
| "grad_norm": 9.426093101501465, | |
| "learning_rate": 1.2119198730662435e-05, | |
| "loss": 1.0198, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 15.192383974613248, | |
| "grad_norm": 3.7169394493103027, | |
| "learning_rate": 1.2020031733439112e-05, | |
| "loss": 1.1285, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 15.232050773502579, | |
| "grad_norm": 3.466498851776123, | |
| "learning_rate": 1.1920864736215787e-05, | |
| "loss": 1.0125, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 15.271717572391909, | |
| "grad_norm": 2.7933382987976074, | |
| "learning_rate": 1.1821697738992464e-05, | |
| "loss": 1.0545, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 15.311384371281237, | |
| "grad_norm": 2.926934003829956, | |
| "learning_rate": 1.172253074176914e-05, | |
| "loss": 1.1035, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 15.351051170170567, | |
| "grad_norm": 3.2757022380828857, | |
| "learning_rate": 1.1623363744545816e-05, | |
| "loss": 1.0479, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 15.390717969059898, | |
| "grad_norm": 4.160761833190918, | |
| "learning_rate": 1.1524196747322492e-05, | |
| "loss": 1.064, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 15.430384767949226, | |
| "grad_norm": 3.412480592727661, | |
| "learning_rate": 1.1425029750099167e-05, | |
| "loss": 0.9485, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 15.470051566838556, | |
| "grad_norm": 3.1907808780670166, | |
| "learning_rate": 1.1325862752875844e-05, | |
| "loss": 1.0605, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 15.509718365727887, | |
| "grad_norm": 4.184901714324951, | |
| "learning_rate": 1.1226695755652519e-05, | |
| "loss": 1.0551, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 15.549385164617215, | |
| "grad_norm": 4.784205436706543, | |
| "learning_rate": 1.1127528758429196e-05, | |
| "loss": 0.9941, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 15.589051963506545, | |
| "grad_norm": 4.00923490524292, | |
| "learning_rate": 1.1028361761205871e-05, | |
| "loss": 1.076, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 15.628718762395875, | |
| "grad_norm": 4.559725284576416, | |
| "learning_rate": 1.0929194763982548e-05, | |
| "loss": 0.9979, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 15.668385561285204, | |
| "grad_norm": 3.8985109329223633, | |
| "learning_rate": 1.0830027766759223e-05, | |
| "loss": 1.0397, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 15.708052360174534, | |
| "grad_norm": 3.3521323204040527, | |
| "learning_rate": 1.0730860769535899e-05, | |
| "loss": 1.006, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 15.747719159063864, | |
| "grad_norm": 3.2745351791381836, | |
| "learning_rate": 1.0631693772312574e-05, | |
| "loss": 1.0642, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 15.787385957953193, | |
| "grad_norm": 3.955242156982422, | |
| "learning_rate": 1.053252677508925e-05, | |
| "loss": 1.0455, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 15.827052756842523, | |
| "grad_norm": 3.2223598957061768, | |
| "learning_rate": 1.0433359777865926e-05, | |
| "loss": 1.0675, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 15.866719555731853, | |
| "grad_norm": 4.809605121612549, | |
| "learning_rate": 1.0334192780642603e-05, | |
| "loss": 1.0992, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 15.906386354621182, | |
| "grad_norm": 2.6435019969940186, | |
| "learning_rate": 1.0235025783419278e-05, | |
| "loss": 0.9905, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 15.946053153510512, | |
| "grad_norm": 6.68290376663208, | |
| "learning_rate": 1.0135858786195955e-05, | |
| "loss": 1.0951, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 15.985719952399842, | |
| "grad_norm": 2.6426591873168945, | |
| "learning_rate": 1.003669178897263e-05, | |
| "loss": 1.073, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 1.039953351020813, | |
| "eval_runtime": 31.6995, | |
| "eval_samples_per_second": 47.887, | |
| "eval_steps_per_second": 5.994, | |
| "step": 40336 | |
| }, | |
| { | |
| "epoch": 16.025386751289172, | |
| "grad_norm": 3.456146001815796, | |
| "learning_rate": 9.937524791749306e-06, | |
| "loss": 1.0191, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 16.0650535501785, | |
| "grad_norm": 5.939918518066406, | |
| "learning_rate": 9.838357794525983e-06, | |
| "loss": 1.0433, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 16.10472034906783, | |
| "grad_norm": 3.538282871246338, | |
| "learning_rate": 9.739190797302658e-06, | |
| "loss": 1.0295, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 16.14438714795716, | |
| "grad_norm": 4.2307844161987305, | |
| "learning_rate": 9.640023800079335e-06, | |
| "loss": 1.0371, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 16.18405394684649, | |
| "grad_norm": 4.40711784362793, | |
| "learning_rate": 9.54085680285601e-06, | |
| "loss": 1.0236, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 16.223720745735818, | |
| "grad_norm": 3.8492507934570312, | |
| "learning_rate": 9.441689805632687e-06, | |
| "loss": 1.0628, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 16.26338754462515, | |
| "grad_norm": 4.397724628448486, | |
| "learning_rate": 9.342522808409362e-06, | |
| "loss": 1.0072, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 16.30305434351448, | |
| "grad_norm": 3.3145904541015625, | |
| "learning_rate": 9.243355811186037e-06, | |
| "loss": 1.045, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 16.342721142403807, | |
| "grad_norm": 5.359413146972656, | |
| "learning_rate": 9.144188813962713e-06, | |
| "loss": 1.0299, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 16.38238794129314, | |
| "grad_norm": 3.4849679470062256, | |
| "learning_rate": 9.04502181673939e-06, | |
| "loss": 1.012, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 16.422054740182467, | |
| "grad_norm": 2.9378600120544434, | |
| "learning_rate": 8.945854819516065e-06, | |
| "loss": 1.0269, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 16.461721539071796, | |
| "grad_norm": 3.024475574493408, | |
| "learning_rate": 8.846687822292742e-06, | |
| "loss": 1.0373, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 16.501388337961128, | |
| "grad_norm": 3.2381701469421387, | |
| "learning_rate": 8.747520825069417e-06, | |
| "loss": 0.9888, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 16.541055136850456, | |
| "grad_norm": 3.816202163696289, | |
| "learning_rate": 8.648353827846094e-06, | |
| "loss": 0.9384, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 16.580721935739785, | |
| "grad_norm": 4.290541648864746, | |
| "learning_rate": 8.54918683062277e-06, | |
| "loss": 1.0653, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 16.620388734629117, | |
| "grad_norm": 4.712522029876709, | |
| "learning_rate": 8.450019833399444e-06, | |
| "loss": 0.9951, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 16.660055533518445, | |
| "grad_norm": 3.3500611782073975, | |
| "learning_rate": 8.350852836176121e-06, | |
| "loss": 1.0356, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 16.699722332407774, | |
| "grad_norm": 3.6570308208465576, | |
| "learning_rate": 8.251685838952797e-06, | |
| "loss": 1.0205, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 16.739389131297106, | |
| "grad_norm": 3.4734184741973877, | |
| "learning_rate": 8.152518841729474e-06, | |
| "loss": 1.037, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 16.779055930186434, | |
| "grad_norm": 3.528817653656006, | |
| "learning_rate": 8.053351844506149e-06, | |
| "loss": 0.9402, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 16.818722729075763, | |
| "grad_norm": 4.3084025382995605, | |
| "learning_rate": 7.954184847282826e-06, | |
| "loss": 1.0702, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 16.858389527965095, | |
| "grad_norm": 3.520242214202881, | |
| "learning_rate": 7.855017850059501e-06, | |
| "loss": 1.0474, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 16.898056326854423, | |
| "grad_norm": 4.44198751449585, | |
| "learning_rate": 7.755850852836176e-06, | |
| "loss": 1.0506, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 16.93772312574375, | |
| "grad_norm": 2.8113813400268555, | |
| "learning_rate": 7.656683855612852e-06, | |
| "loss": 1.0167, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 16.977389924633083, | |
| "grad_norm": 3.3131535053253174, | |
| "learning_rate": 7.5575168583895284e-06, | |
| "loss": 1.0077, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 1.0288244485855103, | |
| "eval_runtime": 31.6731, | |
| "eval_samples_per_second": 47.927, | |
| "eval_steps_per_second": 5.999, | |
| "step": 42857 | |
| }, | |
| { | |
| "epoch": 17.017056723522412, | |
| "grad_norm": 5.444199562072754, | |
| "learning_rate": 7.4583498611662045e-06, | |
| "loss": 0.9996, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 17.05672352241174, | |
| "grad_norm": 4.1272783279418945, | |
| "learning_rate": 7.359182863942881e-06, | |
| "loss": 1.0256, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 17.096390321301072, | |
| "grad_norm": 4.819570064544678, | |
| "learning_rate": 7.260015866719557e-06, | |
| "loss": 1.0325, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 17.1360571201904, | |
| "grad_norm": 4.795453071594238, | |
| "learning_rate": 7.160848869496233e-06, | |
| "loss": 0.9845, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 17.17572391907973, | |
| "grad_norm": 5.2741827964782715, | |
| "learning_rate": 7.061681872272907e-06, | |
| "loss": 1.0406, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 17.21539071796906, | |
| "grad_norm": 5.457202911376953, | |
| "learning_rate": 6.962514875049583e-06, | |
| "loss": 1.0704, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 17.25505751685839, | |
| "grad_norm": 6.256078243255615, | |
| "learning_rate": 6.863347877826259e-06, | |
| "loss": 1.0182, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 17.294724315747718, | |
| "grad_norm": 3.9407060146331787, | |
| "learning_rate": 6.7641808806029355e-06, | |
| "loss": 0.9889, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 17.33439111463705, | |
| "grad_norm": 3.250436782836914, | |
| "learning_rate": 6.6650138833796116e-06, | |
| "loss": 1.0079, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 17.37405791352638, | |
| "grad_norm": 2.7779972553253174, | |
| "learning_rate": 6.565846886156288e-06, | |
| "loss": 1.0134, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 17.413724712415707, | |
| "grad_norm": 4.296668529510498, | |
| "learning_rate": 6.466679888932964e-06, | |
| "loss": 0.9585, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 17.45339151130504, | |
| "grad_norm": 3.737541437149048, | |
| "learning_rate": 6.36751289170964e-06, | |
| "loss": 1.0307, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 17.493058310194368, | |
| "grad_norm": 5.0776848793029785, | |
| "learning_rate": 6.268345894486314e-06, | |
| "loss": 1.0395, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 17.532725109083696, | |
| "grad_norm": 6.334095001220703, | |
| "learning_rate": 6.169178897262991e-06, | |
| "loss": 0.9772, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 17.572391907973028, | |
| "grad_norm": 5.443525314331055, | |
| "learning_rate": 6.070011900039667e-06, | |
| "loss": 0.9264, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 17.612058706862356, | |
| "grad_norm": 4.61970853805542, | |
| "learning_rate": 5.970844902816343e-06, | |
| "loss": 1.0307, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 17.651725505751685, | |
| "grad_norm": 3.089509963989258, | |
| "learning_rate": 5.8716779055930195e-06, | |
| "loss": 0.9633, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 17.691392304641017, | |
| "grad_norm": 4.635293006896973, | |
| "learning_rate": 5.7725109083696955e-06, | |
| "loss": 1.0537, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 17.731059103530345, | |
| "grad_norm": 3.052475929260254, | |
| "learning_rate": 5.673343911146371e-06, | |
| "loss": 0.9983, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 17.770725902419674, | |
| "grad_norm": 3.9765052795410156, | |
| "learning_rate": 5.574176913923047e-06, | |
| "loss": 1.0687, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 17.810392701309006, | |
| "grad_norm": 4.3488030433654785, | |
| "learning_rate": 5.475009916699723e-06, | |
| "loss": 1.012, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 17.850059500198334, | |
| "grad_norm": 3.6032917499542236, | |
| "learning_rate": 5.375842919476398e-06, | |
| "loss": 0.9933, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 17.889726299087663, | |
| "grad_norm": 3.2621772289276123, | |
| "learning_rate": 5.276675922253074e-06, | |
| "loss": 0.9657, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 17.929393097976995, | |
| "grad_norm": 3.9976959228515625, | |
| "learning_rate": 5.17750892502975e-06, | |
| "loss": 0.9799, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 17.969059896866323, | |
| "grad_norm": 4.725791931152344, | |
| "learning_rate": 5.0783419278064265e-06, | |
| "loss": 1.043, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 1.0184741020202637, | |
| "eval_runtime": 31.6571, | |
| "eval_samples_per_second": 47.951, | |
| "eval_steps_per_second": 6.002, | |
| "step": 45378 | |
| }, | |
| { | |
| "epoch": 18.00872669575565, | |
| "grad_norm": 3.418588876724243, | |
| "learning_rate": 4.979174930583102e-06, | |
| "loss": 0.9965, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 18.048393494644984, | |
| "grad_norm": 4.835160255432129, | |
| "learning_rate": 4.880007933359778e-06, | |
| "loss": 0.9789, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 18.088060293534312, | |
| "grad_norm": 4.275815486907959, | |
| "learning_rate": 4.780840936136454e-06, | |
| "loss": 1.0233, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 18.12772709242364, | |
| "grad_norm": 4.429009914398193, | |
| "learning_rate": 4.68167393891313e-06, | |
| "loss": 1.0487, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 18.167393891312972, | |
| "grad_norm": 4.390066146850586, | |
| "learning_rate": 4.582506941689805e-06, | |
| "loss": 0.968, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 18.2070606902023, | |
| "grad_norm": 3.265092372894287, | |
| "learning_rate": 4.483339944466481e-06, | |
| "loss": 1.0171, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 18.24672748909163, | |
| "grad_norm": 4.843317031860352, | |
| "learning_rate": 4.3841729472431574e-06, | |
| "loss": 1.0204, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 18.28639428798096, | |
| "grad_norm": 4.457988262176514, | |
| "learning_rate": 4.2850059500198335e-06, | |
| "loss": 1.0246, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 18.32606108687029, | |
| "grad_norm": 3.9527127742767334, | |
| "learning_rate": 4.18583895279651e-06, | |
| "loss": 0.9241, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 18.36572788575962, | |
| "grad_norm": 3.7694692611694336, | |
| "learning_rate": 4.086671955573186e-06, | |
| "loss": 1.0318, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 18.40539468464895, | |
| "grad_norm": 5.390737533569336, | |
| "learning_rate": 3.987504958349862e-06, | |
| "loss": 0.9938, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 18.44506148353828, | |
| "grad_norm": 3.8084776401519775, | |
| "learning_rate": 3.888337961126538e-06, | |
| "loss": 0.9652, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 18.484728282427607, | |
| "grad_norm": 3.5767834186553955, | |
| "learning_rate": 3.789170963903213e-06, | |
| "loss": 0.9582, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 18.52439508131694, | |
| "grad_norm": 3.4777605533599854, | |
| "learning_rate": 3.6900039666798892e-06, | |
| "loss": 0.9981, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 18.564061880206268, | |
| "grad_norm": 4.1490092277526855, | |
| "learning_rate": 3.5908369694565653e-06, | |
| "loss": 0.9607, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 18.603728679095596, | |
| "grad_norm": 4.089176654815674, | |
| "learning_rate": 3.4916699722332406e-06, | |
| "loss": 1.0168, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 18.643395477984928, | |
| "grad_norm": 3.9602725505828857, | |
| "learning_rate": 3.3925029750099167e-06, | |
| "loss": 0.9785, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 18.683062276874256, | |
| "grad_norm": 4.800217628479004, | |
| "learning_rate": 3.2933359777865927e-06, | |
| "loss": 1.0514, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 18.722729075763585, | |
| "grad_norm": 4.848387718200684, | |
| "learning_rate": 3.194168980563269e-06, | |
| "loss": 0.9798, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 18.762395874652917, | |
| "grad_norm": 3.5444610118865967, | |
| "learning_rate": 3.0950019833399445e-06, | |
| "loss": 1.0602, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 18.802062673542245, | |
| "grad_norm": 3.4162533283233643, | |
| "learning_rate": 2.9958349861166206e-06, | |
| "loss": 0.9881, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 18.841729472431574, | |
| "grad_norm": 4.719314098358154, | |
| "learning_rate": 2.8966679888932967e-06, | |
| "loss": 0.9503, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 18.881396271320906, | |
| "grad_norm": 5.332608222961426, | |
| "learning_rate": 2.7975009916699724e-06, | |
| "loss": 1.0245, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 18.921063070210234, | |
| "grad_norm": 5.230047702789307, | |
| "learning_rate": 2.6983339944466484e-06, | |
| "loss": 0.9947, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 18.960729869099563, | |
| "grad_norm": 3.1582813262939453, | |
| "learning_rate": 2.599166997223324e-06, | |
| "loss": 1.0198, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 1.017343521118164, | |
| "eval_runtime": 31.7039, | |
| "eval_samples_per_second": 47.881, | |
| "eval_steps_per_second": 5.993, | |
| "step": 47899 | |
| }, | |
| { | |
| "epoch": 19.000396667988895, | |
| "grad_norm": 5.45066499710083, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.9753, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 19.040063466878223, | |
| "grad_norm": 3.2004072666168213, | |
| "learning_rate": 2.400833002776676e-06, | |
| "loss": 0.946, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 19.07973026576755, | |
| "grad_norm": 3.971540689468384, | |
| "learning_rate": 2.301666005553352e-06, | |
| "loss": 0.9783, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 19.119397064656884, | |
| "grad_norm": 4.348784923553467, | |
| "learning_rate": 2.202499008330028e-06, | |
| "loss": 0.952, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 19.159063863546212, | |
| "grad_norm": 3.7044036388397217, | |
| "learning_rate": 2.1033320111067037e-06, | |
| "loss": 1.0662, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 19.19873066243554, | |
| "grad_norm": 2.662105083465576, | |
| "learning_rate": 2.00416501388338e-06, | |
| "loss": 0.9647, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 19.238397461324872, | |
| "grad_norm": 4.103559494018555, | |
| "learning_rate": 1.9049980166600555e-06, | |
| "loss": 0.9142, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 19.2780642602142, | |
| "grad_norm": 2.8791961669921875, | |
| "learning_rate": 1.8058310194367316e-06, | |
| "loss": 1.0167, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 19.31773105910353, | |
| "grad_norm": 2.689680576324463, | |
| "learning_rate": 1.7066640222134072e-06, | |
| "loss": 1.0012, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 19.35739785799286, | |
| "grad_norm": 3.3067831993103027, | |
| "learning_rate": 1.6074970249900833e-06, | |
| "loss": 0.967, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 19.39706465688219, | |
| "grad_norm": 3.9777708053588867, | |
| "learning_rate": 1.5083300277667594e-06, | |
| "loss": 0.9081, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 19.43673145577152, | |
| "grad_norm": 3.582973003387451, | |
| "learning_rate": 1.4091630305434353e-06, | |
| "loss": 1.0104, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 19.47639825466085, | |
| "grad_norm": 5.202731132507324, | |
| "learning_rate": 1.309996033320111e-06, | |
| "loss": 0.9648, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 19.51606505355018, | |
| "grad_norm": 3.264211893081665, | |
| "learning_rate": 1.210829036096787e-06, | |
| "loss": 0.9599, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 19.555731852439507, | |
| "grad_norm": 4.432053565979004, | |
| "learning_rate": 1.111662038873463e-06, | |
| "loss": 0.9935, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 19.59539865132884, | |
| "grad_norm": 3.386671781539917, | |
| "learning_rate": 1.0124950416501388e-06, | |
| "loss": 1.002, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 19.635065450218168, | |
| "grad_norm": 4.273075103759766, | |
| "learning_rate": 9.133280444268148e-07, | |
| "loss": 1.0225, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 19.674732249107496, | |
| "grad_norm": 3.5673136711120605, | |
| "learning_rate": 8.141610472034907e-07, | |
| "loss": 1.0149, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 19.714399047996828, | |
| "grad_norm": 3.68278431892395, | |
| "learning_rate": 7.149940499801666e-07, | |
| "loss": 0.996, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 19.754065846886157, | |
| "grad_norm": 4.8836870193481445, | |
| "learning_rate": 6.158270527568425e-07, | |
| "loss": 1.0097, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 19.793732645775485, | |
| "grad_norm": 3.579880475997925, | |
| "learning_rate": 5.166600555335184e-07, | |
| "loss": 0.9482, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 19.833399444664817, | |
| "grad_norm": 2.7329444885253906, | |
| "learning_rate": 4.174930583101944e-07, | |
| "loss": 1.0365, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 19.873066243554145, | |
| "grad_norm": 5.478430271148682, | |
| "learning_rate": 3.1832606108687035e-07, | |
| "loss": 1.0543, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 19.912733042443474, | |
| "grad_norm": 3.1377158164978027, | |
| "learning_rate": 2.191590638635462e-07, | |
| "loss": 0.9637, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 19.952399841332806, | |
| "grad_norm": 3.789954662322998, | |
| "learning_rate": 1.1999206664022213e-07, | |
| "loss": 0.9847, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 19.992066640222134, | |
| "grad_norm": 4.29661226272583, | |
| "learning_rate": 2.0825069416898058e-08, | |
| "loss": 1.0306, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 1.0158944129943848, | |
| "eval_runtime": 31.6451, | |
| "eval_samples_per_second": 47.97, | |
| "eval_steps_per_second": 6.004, | |
| "step": 50420 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 50420, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 20, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.902420484390912e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |