| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 15000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.5043091773986816, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 2.1257, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.06766939163208, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 1.0451, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.8579052686691284, | |
| "learning_rate": 1.2e-05, | |
| "loss": 0.9972, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.866980791091919, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.9872, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.8615725040435791, | |
| "learning_rate": 2e-05, | |
| "loss": 0.9757, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "eval_loss": 0.9876903295516968, | |
| "eval_runtime": 41.3404, | |
| "eval_samples_per_second": 24.189, | |
| "eval_steps_per_second": 6.047, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.9930108785629272, | |
| "learning_rate": 1.9862068965517244e-05, | |
| "loss": 0.9801, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.8045609593391418, | |
| "learning_rate": 1.9724137931034483e-05, | |
| "loss": 0.9593, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.8184276819229126, | |
| "learning_rate": 1.9586206896551725e-05, | |
| "loss": 0.9567, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.872108519077301, | |
| "learning_rate": 1.9448275862068968e-05, | |
| "loss": 0.9541, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.7699779868125916, | |
| "learning_rate": 1.931034482758621e-05, | |
| "loss": 0.9354, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_loss": 0.9477444887161255, | |
| "eval_runtime": 41.7539, | |
| "eval_samples_per_second": 23.95, | |
| "eval_steps_per_second": 5.987, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.9201217889785767, | |
| "learning_rate": 1.917241379310345e-05, | |
| "loss": 0.945, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.9041834473609924, | |
| "learning_rate": 1.903448275862069e-05, | |
| "loss": 0.9472, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.8310356140136719, | |
| "learning_rate": 1.8896551724137934e-05, | |
| "loss": 0.9457, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.8618925213813782, | |
| "learning_rate": 1.8758620689655173e-05, | |
| "loss": 0.9529, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.8114230036735535, | |
| "learning_rate": 1.8620689655172415e-05, | |
| "loss": 0.9421, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "eval_loss": 0.9172976016998291, | |
| "eval_runtime": 41.6025, | |
| "eval_samples_per_second": 24.037, | |
| "eval_steps_per_second": 6.009, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.8510406017303467, | |
| "learning_rate": 1.8482758620689657e-05, | |
| "loss": 0.9371, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.8189296126365662, | |
| "learning_rate": 1.8344827586206896e-05, | |
| "loss": 0.9507, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.8465898036956787, | |
| "learning_rate": 1.820689655172414e-05, | |
| "loss": 0.9298, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.8617157936096191, | |
| "learning_rate": 1.806896551724138e-05, | |
| "loss": 0.9203, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.8387811183929443, | |
| "learning_rate": 1.7931034482758623e-05, | |
| "loss": 0.9539, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_loss": 0.9198890328407288, | |
| "eval_runtime": 40.979, | |
| "eval_samples_per_second": 24.403, | |
| "eval_steps_per_second": 6.101, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.8710920214653015, | |
| "learning_rate": 1.7793103448275862e-05, | |
| "loss": 0.9227, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.7497248649597168, | |
| "learning_rate": 1.7655172413793105e-05, | |
| "loss": 0.93, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.7869455814361572, | |
| "learning_rate": 1.7517241379310347e-05, | |
| "loss": 0.9323, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.8080159425735474, | |
| "learning_rate": 1.7379310344827586e-05, | |
| "loss": 0.9418, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.72604900598526, | |
| "learning_rate": 1.7241379310344828e-05, | |
| "loss": 0.9336, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "eval_loss": 0.9328898787498474, | |
| "eval_runtime": 41.2374, | |
| "eval_samples_per_second": 24.25, | |
| "eval_steps_per_second": 6.062, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.766793966293335, | |
| "learning_rate": 1.710344827586207e-05, | |
| "loss": 0.9209, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.7827008962631226, | |
| "learning_rate": 1.6965517241379313e-05, | |
| "loss": 0.92, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.7644964456558228, | |
| "learning_rate": 1.6827586206896552e-05, | |
| "loss": 0.9232, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.7284256815910339, | |
| "learning_rate": 1.6689655172413794e-05, | |
| "loss": 0.9284, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.8209360837936401, | |
| "learning_rate": 1.6551724137931037e-05, | |
| "loss": 0.9206, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "eval_loss": 0.9171387553215027, | |
| "eval_runtime": 40.5489, | |
| "eval_samples_per_second": 24.662, | |
| "eval_steps_per_second": 6.165, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.8341450691223145, | |
| "learning_rate": 1.6413793103448276e-05, | |
| "loss": 0.9229, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.7724445462226868, | |
| "learning_rate": 1.6275862068965518e-05, | |
| "loss": 0.9229, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.7436179518699646, | |
| "learning_rate": 1.613793103448276e-05, | |
| "loss": 0.8998, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.8365751504898071, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.9104, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.8854556083679199, | |
| "learning_rate": 1.586206896551724e-05, | |
| "loss": 0.9182, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "eval_loss": 0.9123844504356384, | |
| "eval_runtime": 41.9203, | |
| "eval_samples_per_second": 23.855, | |
| "eval_steps_per_second": 5.964, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.897222101688385, | |
| "learning_rate": 1.5724137931034484e-05, | |
| "loss": 0.9037, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.8261429667472839, | |
| "learning_rate": 1.5586206896551726e-05, | |
| "loss": 0.9252, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.8133054375648499, | |
| "learning_rate": 1.5448275862068965e-05, | |
| "loss": 0.9246, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.7939581871032715, | |
| "learning_rate": 1.5310344827586208e-05, | |
| "loss": 0.9187, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.8768674731254578, | |
| "learning_rate": 1.5172413793103448e-05, | |
| "loss": 0.9139, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 0.914544939994812, | |
| "eval_runtime": 42.1027, | |
| "eval_samples_per_second": 23.751, | |
| "eval_steps_per_second": 5.938, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.7592473030090332, | |
| "learning_rate": 1.503448275862069e-05, | |
| "loss": 0.9024, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.7141114473342896, | |
| "learning_rate": 1.4896551724137933e-05, | |
| "loss": 0.9222, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.8296393156051636, | |
| "learning_rate": 1.4758620689655174e-05, | |
| "loss": 0.9065, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.7411290407180786, | |
| "learning_rate": 1.4620689655172416e-05, | |
| "loss": 0.9063, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.7428833842277527, | |
| "learning_rate": 1.4482758620689657e-05, | |
| "loss": 0.9153, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "eval_loss": 0.8934066891670227, | |
| "eval_runtime": 40.7429, | |
| "eval_samples_per_second": 24.544, | |
| "eval_steps_per_second": 6.136, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.8348454236984253, | |
| "learning_rate": 1.4344827586206897e-05, | |
| "loss": 0.9065, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.9168105125427246, | |
| "learning_rate": 1.4206896551724138e-05, | |
| "loss": 0.9205, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.7295928597450256, | |
| "learning_rate": 1.406896551724138e-05, | |
| "loss": 0.9069, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.74032062292099, | |
| "learning_rate": 1.3931034482758621e-05, | |
| "loss": 0.9069, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.7203904986381531, | |
| "learning_rate": 1.3793103448275863e-05, | |
| "loss": 0.9046, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.872988760471344, | |
| "eval_runtime": 40.9885, | |
| "eval_samples_per_second": 24.397, | |
| "eval_steps_per_second": 6.099, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 0.7743054032325745, | |
| "learning_rate": 1.3655172413793106e-05, | |
| "loss": 0.7995, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 0.8992008566856384, | |
| "learning_rate": 1.3517241379310346e-05, | |
| "loss": 0.8001, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 0.7732968330383301, | |
| "learning_rate": 1.3379310344827587e-05, | |
| "loss": 0.7977, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 0.8479374051094055, | |
| "learning_rate": 1.324137931034483e-05, | |
| "loss": 0.8162, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 0.8781663775444031, | |
| "learning_rate": 1.310344827586207e-05, | |
| "loss": 0.803, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "eval_loss": 0.9017807841300964, | |
| "eval_runtime": 41.4996, | |
| "eval_samples_per_second": 24.097, | |
| "eval_steps_per_second": 6.024, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 0.7585736513137817, | |
| "learning_rate": 1.296551724137931e-05, | |
| "loss": 0.8048, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 0.8792749047279358, | |
| "learning_rate": 1.2827586206896551e-05, | |
| "loss": 0.7975, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 0.9402467608451843, | |
| "learning_rate": 1.2689655172413795e-05, | |
| "loss": 0.7968, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 0.8136008381843567, | |
| "learning_rate": 1.2551724137931036e-05, | |
| "loss": 0.7888, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 0.7935479879379272, | |
| "learning_rate": 1.2413793103448277e-05, | |
| "loss": 0.8051, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "eval_loss": 0.9130223393440247, | |
| "eval_runtime": 40.9878, | |
| "eval_samples_per_second": 24.397, | |
| "eval_steps_per_second": 6.099, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 0.7666265964508057, | |
| "learning_rate": 1.2275862068965519e-05, | |
| "loss": 0.8013, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 0.808914065361023, | |
| "learning_rate": 1.213793103448276e-05, | |
| "loss": 0.8108, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 0.8120896220207214, | |
| "learning_rate": 1.2e-05, | |
| "loss": 0.7955, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 0.8073500990867615, | |
| "learning_rate": 1.1862068965517241e-05, | |
| "loss": 0.7918, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 0.865331768989563, | |
| "learning_rate": 1.1724137931034483e-05, | |
| "loss": 0.8089, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "eval_loss": 0.9077558517456055, | |
| "eval_runtime": 41.3054, | |
| "eval_samples_per_second": 24.21, | |
| "eval_steps_per_second": 6.052, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 0.8069093227386475, | |
| "learning_rate": 1.1586206896551726e-05, | |
| "loss": 0.7933, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 0.8061268925666809, | |
| "learning_rate": 1.1448275862068966e-05, | |
| "loss": 0.802, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 0.8158251047134399, | |
| "learning_rate": 1.1310344827586209e-05, | |
| "loss": 0.8028, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 0.849918782711029, | |
| "learning_rate": 1.117241379310345e-05, | |
| "loss": 0.7967, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 0.7494837045669556, | |
| "learning_rate": 1.103448275862069e-05, | |
| "loss": 0.7973, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "eval_loss": 0.8674909472465515, | |
| "eval_runtime": 41.4384, | |
| "eval_samples_per_second": 24.132, | |
| "eval_steps_per_second": 6.033, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 0.7753856182098389, | |
| "learning_rate": 1.0896551724137932e-05, | |
| "loss": 0.7922, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 0.9751661419868469, | |
| "learning_rate": 1.0758620689655173e-05, | |
| "loss": 0.816, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 0.850592851638794, | |
| "learning_rate": 1.0620689655172414e-05, | |
| "loss": 0.8084, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 0.8299598097801208, | |
| "learning_rate": 1.0482758620689658e-05, | |
| "loss": 0.8135, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.7985300421714783, | |
| "learning_rate": 1.0344827586206898e-05, | |
| "loss": 0.7965, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "eval_loss": 0.8954501748085022, | |
| "eval_runtime": 41.3848, | |
| "eval_samples_per_second": 24.163, | |
| "eval_steps_per_second": 6.041, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 0.8110877275466919, | |
| "learning_rate": 1.0206896551724139e-05, | |
| "loss": 0.786, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 0.7997573614120483, | |
| "learning_rate": 1.006896551724138e-05, | |
| "loss": 0.7959, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 0.7852098345756531, | |
| "learning_rate": 9.931034482758622e-06, | |
| "loss": 0.8106, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 0.8886978030204773, | |
| "learning_rate": 9.793103448275863e-06, | |
| "loss": 0.8081, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.7849240899085999, | |
| "learning_rate": 9.655172413793105e-06, | |
| "loss": 0.8103, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_loss": 0.9043481945991516, | |
| "eval_runtime": 41.0052, | |
| "eval_samples_per_second": 24.387, | |
| "eval_steps_per_second": 6.097, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 0.8720059394836426, | |
| "learning_rate": 9.517241379310346e-06, | |
| "loss": 0.8041, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 0.8483107686042786, | |
| "learning_rate": 9.379310344827586e-06, | |
| "loss": 0.8098, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 0.8146810531616211, | |
| "learning_rate": 9.241379310344829e-06, | |
| "loss": 0.803, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 0.8219090700149536, | |
| "learning_rate": 9.10344827586207e-06, | |
| "loss": 0.792, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 0.7642741203308105, | |
| "learning_rate": 8.965517241379312e-06, | |
| "loss": 0.7969, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "eval_loss": 0.9122523069381714, | |
| "eval_runtime": 42.1629, | |
| "eval_samples_per_second": 23.718, | |
| "eval_steps_per_second": 5.929, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 0.7875528931617737, | |
| "learning_rate": 8.827586206896552e-06, | |
| "loss": 0.8075, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 0.8270419836044312, | |
| "learning_rate": 8.689655172413793e-06, | |
| "loss": 0.7952, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 0.9126586318016052, | |
| "learning_rate": 8.551724137931035e-06, | |
| "loss": 0.7848, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 0.8643565773963928, | |
| "learning_rate": 8.413793103448276e-06, | |
| "loss": 0.7924, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.9442116618156433, | |
| "learning_rate": 8.275862068965518e-06, | |
| "loss": 0.7971, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "eval_loss": 0.9290862679481506, | |
| "eval_runtime": 41.6918, | |
| "eval_samples_per_second": 23.986, | |
| "eval_steps_per_second": 5.996, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 0.7398823499679565, | |
| "learning_rate": 8.137931034482759e-06, | |
| "loss": 0.8003, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 0.8488145470619202, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 0.8072, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 0.7276484370231628, | |
| "learning_rate": 7.862068965517242e-06, | |
| "loss": 0.7966, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 0.8641906380653381, | |
| "learning_rate": 7.724137931034483e-06, | |
| "loss": 0.8063, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 0.8261032700538635, | |
| "learning_rate": 7.586206896551724e-06, | |
| "loss": 0.7907, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "eval_loss": 0.892309308052063, | |
| "eval_runtime": 41.16, | |
| "eval_samples_per_second": 24.295, | |
| "eval_steps_per_second": 6.074, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 0.8013048768043518, | |
| "learning_rate": 7.4482758620689665e-06, | |
| "loss": 0.8031, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 0.8098243474960327, | |
| "learning_rate": 7.310344827586208e-06, | |
| "loss": 0.7931, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 0.8183510303497314, | |
| "learning_rate": 7.172413793103449e-06, | |
| "loss": 0.7967, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 0.8251721858978271, | |
| "learning_rate": 7.03448275862069e-06, | |
| "loss": 0.7877, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.8741477131843567, | |
| "learning_rate": 6.896551724137932e-06, | |
| "loss": 0.7967, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.873385488986969, | |
| "eval_runtime": 41.2883, | |
| "eval_samples_per_second": 24.22, | |
| "eval_steps_per_second": 6.055, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 0.8094070553779602, | |
| "learning_rate": 6.760000000000001e-06, | |
| "loss": 0.7059, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 0.8748703598976135, | |
| "learning_rate": 6.622068965517242e-06, | |
| "loss": 0.6925, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 1.1019681692123413, | |
| "learning_rate": 6.4841379310344835e-06, | |
| "loss": 0.6884, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 0.8715025186538696, | |
| "learning_rate": 6.346206896551724e-06, | |
| "loss": 0.6989, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 1.0019214153289795, | |
| "learning_rate": 6.2082758620689665e-06, | |
| "loss": 0.6941, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "eval_loss": 0.954196035861969, | |
| "eval_runtime": 40.949, | |
| "eval_samples_per_second": 24.421, | |
| "eval_steps_per_second": 6.105, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 0.8947280645370483, | |
| "learning_rate": 6.070344827586207e-06, | |
| "loss": 0.6932, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 0.8519116044044495, | |
| "learning_rate": 5.932413793103449e-06, | |
| "loss": 0.6966, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 0.8703382015228271, | |
| "learning_rate": 5.79448275862069e-06, | |
| "loss": 0.6955, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 0.9201086759567261, | |
| "learning_rate": 5.656551724137932e-06, | |
| "loss": 0.6911, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 1.0000112056732178, | |
| "learning_rate": 5.518620689655173e-06, | |
| "loss": 0.6927, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "eval_loss": 0.9255943894386292, | |
| "eval_runtime": 40.2744, | |
| "eval_samples_per_second": 24.83, | |
| "eval_steps_per_second": 6.207, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 0.93467777967453, | |
| "learning_rate": 5.382068965517242e-06, | |
| "loss": 0.6902, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 0.9694642424583435, | |
| "learning_rate": 5.2441379310344835e-06, | |
| "loss": 0.6936, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 0.9247058629989624, | |
| "learning_rate": 5.106206896551724e-06, | |
| "loss": 0.6939, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 0.996800422668457, | |
| "learning_rate": 4.968275862068966e-06, | |
| "loss": 0.6965, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 0.8894180059432983, | |
| "learning_rate": 4.830344827586207e-06, | |
| "loss": 0.706, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "eval_loss": 0.9258891344070435, | |
| "eval_runtime": 40.6757, | |
| "eval_samples_per_second": 24.585, | |
| "eval_steps_per_second": 6.146, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 1.0035386085510254, | |
| "learning_rate": 4.692413793103449e-06, | |
| "loss": 0.6936, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 0.9458960294723511, | |
| "learning_rate": 4.55448275862069e-06, | |
| "loss": 0.7003, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 0.9162298440933228, | |
| "learning_rate": 4.416551724137932e-06, | |
| "loss": 0.7004, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 0.8926594257354736, | |
| "learning_rate": 4.278620689655173e-06, | |
| "loss": 0.6848, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 0.9700310230255127, | |
| "learning_rate": 4.140689655172414e-06, | |
| "loss": 0.6937, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_loss": 0.9162700176239014, | |
| "eval_runtime": 40.714, | |
| "eval_samples_per_second": 24.562, | |
| "eval_steps_per_second": 6.14, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 0.9614285826683044, | |
| "learning_rate": 4.002758620689655e-06, | |
| "loss": 0.6844, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 0.897529125213623, | |
| "learning_rate": 3.864827586206897e-06, | |
| "loss": 0.6846, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 0.9933466911315918, | |
| "learning_rate": 3.7268965517241383e-06, | |
| "loss": 0.7014, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 0.8733190298080444, | |
| "learning_rate": 3.5889655172413794e-06, | |
| "loss": 0.6896, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.9706907868385315, | |
| "learning_rate": 3.4510344827586214e-06, | |
| "loss": 0.6961, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "eval_loss": 0.9194909334182739, | |
| "eval_runtime": 40.9566, | |
| "eval_samples_per_second": 24.416, | |
| "eval_steps_per_second": 6.104, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 0.8588589429855347, | |
| "learning_rate": 3.3131034482758624e-06, | |
| "loss": 0.6992, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 1.1075055599212646, | |
| "learning_rate": 3.175172413793104e-06, | |
| "loss": 0.6983, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 1.0699206590652466, | |
| "learning_rate": 3.037241379310345e-06, | |
| "loss": 0.6836, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 1.0196533203125, | |
| "learning_rate": 2.8993103448275865e-06, | |
| "loss": 0.6849, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 0.880547821521759, | |
| "learning_rate": 2.7613793103448276e-06, | |
| "loss": 0.6976, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "eval_loss": 0.9251711368560791, | |
| "eval_runtime": 40.7697, | |
| "eval_samples_per_second": 24.528, | |
| "eval_steps_per_second": 6.132, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 0.9048807621002197, | |
| "learning_rate": 2.6234482758620695e-06, | |
| "loss": 0.7059, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 0.9848681688308716, | |
| "learning_rate": 2.4855172413793106e-06, | |
| "loss": 0.6936, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 0.9536793231964111, | |
| "learning_rate": 2.3475862068965517e-06, | |
| "loss": 0.6882, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 0.9996353387832642, | |
| "learning_rate": 2.209655172413793e-06, | |
| "loss": 0.6954, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 0.9549928903579712, | |
| "learning_rate": 2.0717241379310347e-06, | |
| "loss": 0.6905, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "eval_loss": 0.9238373041152954, | |
| "eval_runtime": 40.4773, | |
| "eval_samples_per_second": 24.705, | |
| "eval_steps_per_second": 6.176, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 0.9328065514564514, | |
| "learning_rate": 1.933793103448276e-06, | |
| "loss": 0.6776, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 0.9359473586082458, | |
| "learning_rate": 1.7958620689655173e-06, | |
| "loss": 0.6772, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 0.982263445854187, | |
| "learning_rate": 1.6579310344827588e-06, | |
| "loss": 0.7012, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 0.9556898474693298, | |
| "learning_rate": 1.52e-06, | |
| "loss": 0.6814, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.975192666053772, | |
| "learning_rate": 1.3820689655172416e-06, | |
| "loss": 0.6951, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "eval_loss": 0.8940379619598389, | |
| "eval_runtime": 40.6996, | |
| "eval_samples_per_second": 24.57, | |
| "eval_steps_per_second": 6.143, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 0.9017152786254883, | |
| "learning_rate": 1.2441379310344829e-06, | |
| "loss": 0.6939, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 0.968861997127533, | |
| "learning_rate": 1.1062068965517241e-06, | |
| "loss": 0.6861, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 1.0695827007293701, | |
| "learning_rate": 9.682758620689656e-07, | |
| "loss": 0.6922, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.976294994354248, | |
| "learning_rate": 8.303448275862069e-07, | |
| "loss": 0.6937, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 1.0001001358032227, | |
| "learning_rate": 6.924137931034483e-07, | |
| "loss": 0.7094, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "eval_loss": 0.900869607925415, | |
| "eval_runtime": 40.5224, | |
| "eval_samples_per_second": 24.678, | |
| "eval_steps_per_second": 6.169, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 0.9175110459327698, | |
| "learning_rate": 5.544827586206897e-07, | |
| "loss": 0.6752, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 0.9188650846481323, | |
| "learning_rate": 4.1655172413793107e-07, | |
| "loss": 0.6973, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 1.0105127096176147, | |
| "learning_rate": 2.7862068965517247e-07, | |
| "loss": 0.6899, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 0.9797114133834839, | |
| "learning_rate": 1.406896551724138e-07, | |
| "loss": 0.7016, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.8816357254981995, | |
| "learning_rate": 2.758620689655173e-09, | |
| "loss": 0.6788, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.9271326065063477, | |
| "eval_runtime": 41.5918, | |
| "eval_samples_per_second": 24.043, | |
| "eval_steps_per_second": 6.011, | |
| "step": 15000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 15000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "total_flos": 1.88804379967488e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |