{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 15000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 1.5254850387573242, "learning_rate": 4.000000000000001e-06, "loss": 2.1559, "step": 100 }, { "epoch": 0.04, "grad_norm": 1.157167911529541, "learning_rate": 8.000000000000001e-06, "loss": 1.0597, "step": 200 }, { "epoch": 0.06, "grad_norm": 0.9404276013374329, "learning_rate": 1.2e-05, "loss": 1.0037, "step": 300 }, { "epoch": 0.08, "grad_norm": 0.8754622340202332, "learning_rate": 1.6000000000000003e-05, "loss": 0.9886, "step": 400 }, { "epoch": 0.1, "grad_norm": 0.8939309120178223, "learning_rate": 2e-05, "loss": 0.9766, "step": 500 }, { "epoch": 0.1, "eval_loss": 0.9880711436271667, "eval_runtime": 41.2386, "eval_samples_per_second": 24.249, "eval_steps_per_second": 6.062, "step": 500 }, { "epoch": 0.12, "grad_norm": 0.976137638092041, "learning_rate": 1.9862068965517244e-05, "loss": 0.9804, "step": 600 }, { "epoch": 0.14, "grad_norm": 0.8116142153739929, "learning_rate": 1.9724137931034483e-05, "loss": 0.9595, "step": 700 }, { "epoch": 0.16, "grad_norm": 0.8091762065887451, "learning_rate": 1.9586206896551725e-05, "loss": 0.9572, "step": 800 }, { "epoch": 0.18, "grad_norm": 0.8825358152389526, "learning_rate": 1.9448275862068968e-05, "loss": 0.9546, "step": 900 }, { "epoch": 0.2, "grad_norm": 0.7465589642524719, "learning_rate": 1.931034482758621e-05, "loss": 0.9357, "step": 1000 }, { "epoch": 0.2, "eval_loss": 0.9481207132339478, "eval_runtime": 41.187, "eval_samples_per_second": 24.28, "eval_steps_per_second": 6.07, "step": 1000 }, { "epoch": 0.22, "grad_norm": 0.9324079751968384, "learning_rate": 1.917241379310345e-05, "loss": 0.9454, "step": 1100 }, { "epoch": 0.24, "grad_norm": 0.8732156753540039, "learning_rate": 1.903448275862069e-05, "loss": 0.9471, "step": 1200 }, { "epoch": 0.26, "grad_norm": 0.8051126599311829, "learning_rate": 1.8896551724137934e-05, "loss": 0.9458, "step": 1300 }, { "epoch": 0.28, "grad_norm": 0.8477068543434143, "learning_rate": 1.8758620689655173e-05, "loss": 0.9531, "step": 1400 }, { "epoch": 0.3, "grad_norm": 0.7887871265411377, "learning_rate": 1.8620689655172415e-05, "loss": 0.9421, "step": 1500 }, { "epoch": 0.3, "eval_loss": 0.9176677465438843, "eval_runtime": 42.0616, "eval_samples_per_second": 23.775, "eval_steps_per_second": 5.944, "step": 1500 }, { "epoch": 0.32, "grad_norm": 2.8292715549468994, "learning_rate": 1.8482758620689657e-05, "loss": 0.937, "step": 1600 }, { "epoch": 0.34, "grad_norm": 0.8473380208015442, "learning_rate": 1.8344827586206896e-05, "loss": 0.9508, "step": 1700 }, { "epoch": 0.36, "grad_norm": 0.8711748123168945, "learning_rate": 1.820689655172414e-05, "loss": 0.9301, "step": 1800 }, { "epoch": 0.38, "grad_norm": 0.8255301713943481, "learning_rate": 1.806896551724138e-05, "loss": 0.9207, "step": 1900 }, { "epoch": 0.4, "grad_norm": 0.9544655680656433, "learning_rate": 1.7931034482758623e-05, "loss": 0.9545, "step": 2000 }, { "epoch": 0.4, "eval_loss": 0.9197984337806702, "eval_runtime": 41.8061, "eval_samples_per_second": 23.92, "eval_steps_per_second": 5.98, "step": 2000 }, { "epoch": 0.42, "grad_norm": 0.8674423694610596, "learning_rate": 1.7793103448275862e-05, "loss": 0.9232, "step": 2100 }, { "epoch": 0.44, "grad_norm": 0.7438651323318481, "learning_rate": 1.7655172413793105e-05, "loss": 0.9301, "step": 2200 }, { "epoch": 0.46, "grad_norm": 0.8020528554916382, "learning_rate": 1.7517241379310347e-05, "loss": 0.9327, "step": 2300 }, { "epoch": 0.48, "grad_norm": 0.8499415516853333, "learning_rate": 1.7379310344827586e-05, "loss": 0.9419, "step": 2400 }, { "epoch": 0.5, "grad_norm": 0.7375739216804504, "learning_rate": 1.7241379310344828e-05, "loss": 0.9337, "step": 2500 }, { "epoch": 0.5, "eval_loss": 0.9325445890426636, "eval_runtime": 40.8356, "eval_samples_per_second": 24.488, "eval_steps_per_second": 6.122, "step": 2500 }, { "epoch": 0.52, "grad_norm": 0.7540984749794006, "learning_rate": 1.710344827586207e-05, "loss": 0.9212, "step": 2600 }, { "epoch": 0.54, "grad_norm": 0.7706255316734314, "learning_rate": 1.6965517241379313e-05, "loss": 0.9205, "step": 2700 }, { "epoch": 0.56, "grad_norm": 0.7500404715538025, "learning_rate": 1.6827586206896552e-05, "loss": 0.9232, "step": 2800 }, { "epoch": 0.58, "grad_norm": 0.7382043600082397, "learning_rate": 1.6689655172413794e-05, "loss": 0.9285, "step": 2900 }, { "epoch": 0.6, "grad_norm": 0.8193643093109131, "learning_rate": 1.6551724137931037e-05, "loss": 0.9207, "step": 3000 }, { "epoch": 0.6, "eval_loss": 0.9166812896728516, "eval_runtime": 40.7975, "eval_samples_per_second": 24.511, "eval_steps_per_second": 6.128, "step": 3000 }, { "epoch": 0.62, "grad_norm": 0.8520973324775696, "learning_rate": 1.6413793103448276e-05, "loss": 0.9228, "step": 3100 }, { "epoch": 0.64, "grad_norm": 0.7618833184242249, "learning_rate": 1.6275862068965518e-05, "loss": 0.9235, "step": 3200 }, { "epoch": 0.66, "grad_norm": 0.7382549047470093, "learning_rate": 1.613793103448276e-05, "loss": 0.9, "step": 3300 }, { "epoch": 0.68, "grad_norm": 0.8558925986289978, "learning_rate": 1.6000000000000003e-05, "loss": 0.9107, "step": 3400 }, { "epoch": 0.7, "grad_norm": 0.873500645160675, "learning_rate": 1.586206896551724e-05, "loss": 0.9183, "step": 3500 }, { "epoch": 0.7, "eval_loss": 0.9125198125839233, "eval_runtime": 41.6085, "eval_samples_per_second": 24.034, "eval_steps_per_second": 6.008, "step": 3500 }, { "epoch": 0.72, "grad_norm": 0.8846485614776611, "learning_rate": 1.5724137931034484e-05, "loss": 0.9037, "step": 3600 }, { "epoch": 0.74, "grad_norm": 0.8197630643844604, "learning_rate": 1.5586206896551726e-05, "loss": 0.9248, "step": 3700 }, { "epoch": 0.76, "grad_norm": 0.8452670574188232, "learning_rate": 1.5448275862068965e-05, "loss": 0.9246, "step": 3800 }, { "epoch": 0.78, "grad_norm": 1.0297187566757202, "learning_rate": 1.5310344827586208e-05, "loss": 0.9189, "step": 3900 }, { "epoch": 0.8, "grad_norm": 0.876285195350647, "learning_rate": 1.5172413793103448e-05, "loss": 0.914, "step": 4000 }, { "epoch": 0.8, "eval_loss": 0.9148977994918823, "eval_runtime": 41.4354, "eval_samples_per_second": 24.134, "eval_steps_per_second": 6.033, "step": 4000 }, { "epoch": 0.82, "grad_norm": 0.7724210619926453, "learning_rate": 1.503448275862069e-05, "loss": 0.9026, "step": 4100 }, { "epoch": 0.84, "grad_norm": 0.7171404957771301, "learning_rate": 1.4896551724137933e-05, "loss": 0.9223, "step": 4200 }, { "epoch": 0.86, "grad_norm": 0.8213006258010864, "learning_rate": 1.4758620689655174e-05, "loss": 0.9067, "step": 4300 }, { "epoch": 0.88, "grad_norm": 0.7411559224128723, "learning_rate": 1.4620689655172416e-05, "loss": 0.9064, "step": 4400 }, { "epoch": 0.9, "grad_norm": 0.7361301183700562, "learning_rate": 1.4482758620689657e-05, "loss": 0.9154, "step": 4500 }, { "epoch": 0.9, "eval_loss": 0.8940753936767578, "eval_runtime": 41.8655, "eval_samples_per_second": 23.886, "eval_steps_per_second": 5.972, "step": 4500 }, { "epoch": 0.92, "grad_norm": 0.8315750956535339, "learning_rate": 1.4344827586206897e-05, "loss": 0.9065, "step": 4600 }, { "epoch": 0.94, "grad_norm": 0.9056677222251892, "learning_rate": 1.4206896551724138e-05, "loss": 0.9207, "step": 4700 }, { "epoch": 0.96, "grad_norm": 0.7418957948684692, "learning_rate": 1.406896551724138e-05, "loss": 0.9073, "step": 4800 }, { "epoch": 0.98, "grad_norm": 0.7177292108535767, "learning_rate": 1.3931034482758621e-05, "loss": 0.9069, "step": 4900 }, { "epoch": 1.0, "grad_norm": 0.7678000926971436, "learning_rate": 1.3793103448275863e-05, "loss": 0.9044, "step": 5000 }, { "epoch": 1.0, "eval_loss": 0.8728386163711548, "eval_runtime": 40.6738, "eval_samples_per_second": 24.586, "eval_steps_per_second": 6.146, "step": 5000 }, { "epoch": 1.02, "grad_norm": 0.772551417350769, "learning_rate": 1.3655172413793106e-05, "loss": 0.799, "step": 5100 }, { "epoch": 1.04, "grad_norm": 0.9084504246711731, "learning_rate": 1.3517241379310346e-05, "loss": 0.7992, "step": 5200 }, { "epoch": 1.06, "grad_norm": 0.780437707901001, "learning_rate": 1.3379310344827587e-05, "loss": 0.7973, "step": 5300 }, { "epoch": 1.08, "grad_norm": 0.831598699092865, "learning_rate": 1.324137931034483e-05, "loss": 0.8157, "step": 5400 }, { "epoch": 1.1, "grad_norm": 0.8440998196601868, "learning_rate": 1.310344827586207e-05, "loss": 0.8022, "step": 5500 }, { "epoch": 1.1, "eval_loss": 0.901544451713562, "eval_runtime": 41.2275, "eval_samples_per_second": 24.256, "eval_steps_per_second": 6.064, "step": 5500 }, { "epoch": 1.12, "grad_norm": 0.7484360933303833, "learning_rate": 1.296551724137931e-05, "loss": 0.8042, "step": 5600 }, { "epoch": 1.14, "grad_norm": 0.9037306904792786, "learning_rate": 1.2827586206896551e-05, "loss": 0.7968, "step": 5700 }, { "epoch": 1.16, "grad_norm": 0.9326817989349365, "learning_rate": 1.2689655172413795e-05, "loss": 0.7961, "step": 5800 }, { "epoch": 1.18, "grad_norm": 0.8012159466743469, "learning_rate": 1.2551724137931036e-05, "loss": 0.7878, "step": 5900 }, { "epoch": 1.2, "grad_norm": 0.7988967895507812, "learning_rate": 1.2413793103448277e-05, "loss": 0.8042, "step": 6000 }, { "epoch": 1.2, "eval_loss": 0.9131768941879272, "eval_runtime": 41.3623, "eval_samples_per_second": 24.177, "eval_steps_per_second": 6.044, "step": 6000 }, { "epoch": 1.22, "grad_norm": 0.7649048566818237, "learning_rate": 1.2275862068965519e-05, "loss": 0.8009, "step": 6100 }, { "epoch": 1.24, "grad_norm": 0.8105271458625793, "learning_rate": 1.213793103448276e-05, "loss": 0.8101, "step": 6200 }, { "epoch": 1.26, "grad_norm": 0.814795196056366, "learning_rate": 1.2e-05, "loss": 0.7948, "step": 6300 }, { "epoch": 1.28, "grad_norm": 0.8023428916931152, "learning_rate": 1.1862068965517241e-05, "loss": 0.7912, "step": 6400 }, { "epoch": 1.3, "grad_norm": 0.8638762831687927, "learning_rate": 1.1724137931034483e-05, "loss": 0.8079, "step": 6500 }, { "epoch": 1.3, "eval_loss": 0.9087982177734375, "eval_runtime": 42.0298, "eval_samples_per_second": 23.793, "eval_steps_per_second": 5.948, "step": 6500 }, { "epoch": 1.32, "grad_norm": 0.7907779812812805, "learning_rate": 1.1586206896551726e-05, "loss": 0.7924, "step": 6600 }, { "epoch": 1.34, "grad_norm": 0.8300544619560242, "learning_rate": 1.1448275862068966e-05, "loss": 0.8012, "step": 6700 }, { "epoch": 1.36, "grad_norm": 0.8163663744926453, "learning_rate": 1.1310344827586209e-05, "loss": 0.8018, "step": 6800 }, { "epoch": 1.38, "grad_norm": 0.8645555973052979, "learning_rate": 1.117241379310345e-05, "loss": 0.7959, "step": 6900 }, { "epoch": 1.4, "grad_norm": 0.7604303956031799, "learning_rate": 1.103448275862069e-05, "loss": 0.7965, "step": 7000 }, { "epoch": 1.4, "eval_loss": 0.8678131103515625, "eval_runtime": 41.8605, "eval_samples_per_second": 23.889, "eval_steps_per_second": 5.972, "step": 7000 }, { "epoch": 1.42, "grad_norm": 0.7852880954742432, "learning_rate": 1.0896551724137932e-05, "loss": 0.7915, "step": 7100 }, { "epoch": 1.44, "grad_norm": 0.9778708815574646, "learning_rate": 1.0758620689655173e-05, "loss": 0.8155, "step": 7200 }, { "epoch": 1.46, "grad_norm": 0.8450771570205688, "learning_rate": 1.0620689655172414e-05, "loss": 0.8076, "step": 7300 }, { "epoch": 1.48, "grad_norm": 0.8553265929222107, "learning_rate": 1.0482758620689658e-05, "loss": 0.8129, "step": 7400 }, { "epoch": 1.5, "grad_norm": 0.8340109586715698, "learning_rate": 1.0344827586206898e-05, "loss": 0.796, "step": 7500 }, { "epoch": 1.5, "eval_loss": 0.8960831165313721, "eval_runtime": 40.8546, "eval_samples_per_second": 24.477, "eval_steps_per_second": 6.119, "step": 7500 }, { "epoch": 1.52, "grad_norm": 0.8168347477912903, "learning_rate": 1.0206896551724139e-05, "loss": 0.7853, "step": 7600 }, { "epoch": 1.54, "grad_norm": 0.8185055255889893, "learning_rate": 1.006896551724138e-05, "loss": 0.7953, "step": 7700 }, { "epoch": 1.56, "grad_norm": 0.7901424765586853, "learning_rate": 9.931034482758622e-06, "loss": 0.8097, "step": 7800 }, { "epoch": 1.58, "grad_norm": 0.8761677742004395, "learning_rate": 9.793103448275863e-06, "loss": 0.8075, "step": 7900 }, { "epoch": 1.6, "grad_norm": 0.7806304097175598, "learning_rate": 9.655172413793105e-06, "loss": 0.8097, "step": 8000 }, { "epoch": 1.6, "eval_loss": 0.9041339159011841, "eval_runtime": 41.2866, "eval_samples_per_second": 24.221, "eval_steps_per_second": 6.055, "step": 8000 }, { "epoch": 1.62, "grad_norm": 0.8681219220161438, "learning_rate": 9.517241379310346e-06, "loss": 0.8036, "step": 8100 }, { "epoch": 1.64, "grad_norm": 0.8567246794700623, "learning_rate": 9.379310344827586e-06, "loss": 0.809, "step": 8200 }, { "epoch": 1.66, "grad_norm": 0.8123043775558472, "learning_rate": 9.241379310344829e-06, "loss": 0.8026, "step": 8300 }, { "epoch": 1.68, "grad_norm": 0.8345689177513123, "learning_rate": 9.10344827586207e-06, "loss": 0.7909, "step": 8400 }, { "epoch": 1.7, "grad_norm": 0.7822232246398926, "learning_rate": 8.965517241379312e-06, "loss": 0.796, "step": 8500 }, { "epoch": 1.7, "eval_loss": 0.9124335646629333, "eval_runtime": 41.9233, "eval_samples_per_second": 23.853, "eval_steps_per_second": 5.963, "step": 8500 }, { "epoch": 1.72, "grad_norm": 0.8015475273132324, "learning_rate": 8.827586206896552e-06, "loss": 0.8069, "step": 8600 }, { "epoch": 1.74, "grad_norm": 0.8299379944801331, "learning_rate": 8.689655172413793e-06, "loss": 0.7948, "step": 8700 }, { "epoch": 1.76, "grad_norm": 0.9477623701095581, "learning_rate": 8.551724137931035e-06, "loss": 0.784, "step": 8800 }, { "epoch": 1.78, "grad_norm": 0.8605545163154602, "learning_rate": 8.413793103448276e-06, "loss": 0.7917, "step": 8900 }, { "epoch": 1.8, "grad_norm": 0.9456145167350769, "learning_rate": 8.275862068965518e-06, "loss": 0.7962, "step": 9000 }, { "epoch": 1.8, "eval_loss": 0.9288654327392578, "eval_runtime": 41.4026, "eval_samples_per_second": 24.153, "eval_steps_per_second": 6.038, "step": 9000 }, { "epoch": 1.82, "grad_norm": 0.7313815355300903, "learning_rate": 8.137931034482759e-06, "loss": 0.7995, "step": 9100 }, { "epoch": 1.84, "grad_norm": 0.8317314982414246, "learning_rate": 8.000000000000001e-06, "loss": 0.8066, "step": 9200 }, { "epoch": 1.86, "grad_norm": 0.7538059949874878, "learning_rate": 7.862068965517242e-06, "loss": 0.7957, "step": 9300 }, { "epoch": 1.88, "grad_norm": 0.8746474385261536, "learning_rate": 7.724137931034483e-06, "loss": 0.8052, "step": 9400 }, { "epoch": 1.9, "grad_norm": 0.7950379252433777, "learning_rate": 7.586206896551724e-06, "loss": 0.7902, "step": 9500 }, { "epoch": 1.9, "eval_loss": 0.8927227854728699, "eval_runtime": 42.1448, "eval_samples_per_second": 23.728, "eval_steps_per_second": 5.932, "step": 9500 }, { "epoch": 1.92, "grad_norm": 0.7989736795425415, "learning_rate": 7.4482758620689665e-06, "loss": 0.8025, "step": 9600 }, { "epoch": 1.94, "grad_norm": 0.8516004681587219, "learning_rate": 7.310344827586208e-06, "loss": 0.792, "step": 9700 }, { "epoch": 1.96, "grad_norm": 0.8401670455932617, "learning_rate": 7.172413793103449e-06, "loss": 0.796, "step": 9800 }, { "epoch": 1.98, "grad_norm": 0.8132777810096741, "learning_rate": 7.03448275862069e-06, "loss": 0.7871, "step": 9900 }, { "epoch": 2.0, "grad_norm": 0.883852481842041, "learning_rate": 6.897931034482759e-06, "loss": 0.7961, "step": 10000 }, { "epoch": 2.0, "eval_loss": 0.8727924823760986, "eval_runtime": 42.1514, "eval_samples_per_second": 23.724, "eval_steps_per_second": 5.931, "step": 10000 }, { "epoch": 2.02, "grad_norm": 0.8017972707748413, "learning_rate": 6.760000000000001e-06, "loss": 0.7048, "step": 10100 }, { "epoch": 2.04, "grad_norm": 0.874146044254303, "learning_rate": 6.622068965517242e-06, "loss": 0.6911, "step": 10200 }, { "epoch": 2.06, "grad_norm": 1.0310399532318115, "learning_rate": 6.4841379310344835e-06, "loss": 0.6871, "step": 10300 }, { "epoch": 2.08, "grad_norm": 0.8766441345214844, "learning_rate": 6.346206896551724e-06, "loss": 0.6977, "step": 10400 }, { "epoch": 2.1, "grad_norm": 0.9991758465766907, "learning_rate": 6.2082758620689665e-06, "loss": 0.6925, "step": 10500 }, { "epoch": 2.1, "eval_loss": 0.9542326331138611, "eval_runtime": 41.8343, "eval_samples_per_second": 23.904, "eval_steps_per_second": 5.976, "step": 10500 }, { "epoch": 2.12, "grad_norm": 0.8941115140914917, "learning_rate": 6.070344827586207e-06, "loss": 0.6919, "step": 10600 }, { "epoch": 2.14, "grad_norm": 0.885079026222229, "learning_rate": 5.932413793103449e-06, "loss": 0.6949, "step": 10700 }, { "epoch": 2.16, "grad_norm": 0.8566780686378479, "learning_rate": 5.79448275862069e-06, "loss": 0.694, "step": 10800 }, { "epoch": 2.18, "grad_norm": 0.9314544796943665, "learning_rate": 5.656551724137932e-06, "loss": 0.6897, "step": 10900 }, { "epoch": 2.2, "grad_norm": 0.9923773407936096, "learning_rate": 5.518620689655173e-06, "loss": 0.6912, "step": 11000 }, { "epoch": 2.2, "eval_loss": 0.9255372285842896, "eval_runtime": 40.523, "eval_samples_per_second": 24.677, "eval_steps_per_second": 6.169, "step": 11000 }, { "epoch": 2.22, "grad_norm": 0.9591454863548279, "learning_rate": 5.380689655172414e-06, "loss": 0.689, "step": 11100 }, { "epoch": 2.24, "grad_norm": 0.9528708457946777, "learning_rate": 5.242758620689655e-06, "loss": 0.6928, "step": 11200 }, { "epoch": 2.26, "grad_norm": 0.9224244952201843, "learning_rate": 5.104827586206898e-06, "loss": 0.6933, "step": 11300 }, { "epoch": 2.28, "grad_norm": 1.0021519660949707, "learning_rate": 4.966896551724138e-06, "loss": 0.6951, "step": 11400 }, { "epoch": 2.3, "grad_norm": 0.8857138752937317, "learning_rate": 4.830344827586207e-06, "loss": 0.7047, "step": 11500 }, { "epoch": 2.3, "eval_loss": 0.9267728328704834, "eval_runtime": 40.8947, "eval_samples_per_second": 24.453, "eval_steps_per_second": 6.113, "step": 11500 }, { "epoch": 2.32, "grad_norm": 0.9973616600036621, "learning_rate": 4.692413793103449e-06, "loss": 0.6928, "step": 11600 }, { "epoch": 2.34, "grad_norm": 0.9475648999214172, "learning_rate": 4.55448275862069e-06, "loss": 0.6989, "step": 11700 }, { "epoch": 2.36, "grad_norm": 0.8182489275932312, "learning_rate": 4.416551724137932e-06, "loss": 0.6992, "step": 11800 }, { "epoch": 2.38, "grad_norm": 0.9170109033584595, "learning_rate": 4.278620689655173e-06, "loss": 0.6836, "step": 11900 }, { "epoch": 2.4, "grad_norm": 0.9699552655220032, "learning_rate": 4.140689655172414e-06, "loss": 0.6922, "step": 12000 }, { "epoch": 2.4, "eval_loss": 0.9180066585540771, "eval_runtime": 40.9795, "eval_samples_per_second": 24.402, "eval_steps_per_second": 6.101, "step": 12000 }, { "epoch": 2.42, "grad_norm": 0.950919508934021, "learning_rate": 4.002758620689655e-06, "loss": 0.6833, "step": 12100 }, { "epoch": 2.44, "grad_norm": 0.9411114454269409, "learning_rate": 3.864827586206897e-06, "loss": 0.6834, "step": 12200 }, { "epoch": 2.46, "grad_norm": 0.9745101928710938, "learning_rate": 3.7268965517241383e-06, "loss": 0.7002, "step": 12300 }, { "epoch": 2.48, "grad_norm": 0.8441026210784912, "learning_rate": 3.5889655172413794e-06, "loss": 0.6885, "step": 12400 }, { "epoch": 2.5, "grad_norm": 0.9617292284965515, "learning_rate": 3.4510344827586214e-06, "loss": 0.6948, "step": 12500 }, { "epoch": 2.5, "eval_loss": 0.9197391271591187, "eval_runtime": 41.1352, "eval_samples_per_second": 24.31, "eval_steps_per_second": 6.078, "step": 12500 }, { "epoch": 2.52, "grad_norm": 0.8583619594573975, "learning_rate": 3.3131034482758624e-06, "loss": 0.6979, "step": 12600 }, { "epoch": 2.54, "grad_norm": 1.2281546592712402, "learning_rate": 3.175172413793104e-06, "loss": 0.6971, "step": 12700 }, { "epoch": 2.56, "grad_norm": 0.9949316382408142, "learning_rate": 3.037241379310345e-06, "loss": 0.6823, "step": 12800 }, { "epoch": 2.58, "grad_norm": 1.0743273496627808, "learning_rate": 2.8993103448275865e-06, "loss": 0.6838, "step": 12900 }, { "epoch": 2.6, "grad_norm": 0.8986988067626953, "learning_rate": 2.7613793103448276e-06, "loss": 0.6966, "step": 13000 }, { "epoch": 2.6, "eval_loss": 0.9260442852973938, "eval_runtime": 40.9933, "eval_samples_per_second": 24.394, "eval_steps_per_second": 6.099, "step": 13000 }, { "epoch": 2.62, "grad_norm": 0.9071211814880371, "learning_rate": 2.6234482758620695e-06, "loss": 0.7046, "step": 13100 }, { "epoch": 2.64, "grad_norm": 0.9917349815368652, "learning_rate": 2.4855172413793106e-06, "loss": 0.6922, "step": 13200 }, { "epoch": 2.66, "grad_norm": 0.9854599833488464, "learning_rate": 2.3475862068965517e-06, "loss": 0.6868, "step": 13300 }, { "epoch": 2.68, "grad_norm": 1.0206737518310547, "learning_rate": 2.209655172413793e-06, "loss": 0.6944, "step": 13400 }, { "epoch": 2.7, "grad_norm": 0.9407541751861572, "learning_rate": 2.0717241379310347e-06, "loss": 0.6899, "step": 13500 }, { "epoch": 2.7, "eval_loss": 0.9235697984695435, "eval_runtime": 40.6899, "eval_samples_per_second": 24.576, "eval_steps_per_second": 6.144, "step": 13500 }, { "epoch": 2.72, "grad_norm": 0.9291812181472778, "learning_rate": 1.933793103448276e-06, "loss": 0.6761, "step": 13600 }, { "epoch": 2.74, "grad_norm": 0.959474503993988, "learning_rate": 1.7958620689655173e-06, "loss": 0.6759, "step": 13700 }, { "epoch": 2.76, "grad_norm": 0.9794566035270691, "learning_rate": 1.6579310344827588e-06, "loss": 0.6994, "step": 13800 }, { "epoch": 2.78, "grad_norm": 0.9680616855621338, "learning_rate": 1.52e-06, "loss": 0.68, "step": 13900 }, { "epoch": 2.8, "grad_norm": 0.9499403834342957, "learning_rate": 1.3820689655172416e-06, "loss": 0.6936, "step": 14000 }, { "epoch": 2.8, "eval_loss": 0.8948045969009399, "eval_runtime": 40.9041, "eval_samples_per_second": 24.447, "eval_steps_per_second": 6.112, "step": 14000 }, { "epoch": 2.82, "grad_norm": 0.8915033340454102, "learning_rate": 1.2441379310344829e-06, "loss": 0.6925, "step": 14100 }, { "epoch": 2.84, "grad_norm": 0.9769828915596008, "learning_rate": 1.1062068965517241e-06, "loss": 0.6847, "step": 14200 }, { "epoch": 2.86, "grad_norm": 1.0965638160705566, "learning_rate": 9.682758620689656e-07, "loss": 0.6912, "step": 14300 }, { "epoch": 2.88, "grad_norm": 0.9820475578308105, "learning_rate": 8.303448275862069e-07, "loss": 0.6927, "step": 14400 }, { "epoch": 2.9, "grad_norm": 1.0005227327346802, "learning_rate": 6.924137931034483e-07, "loss": 0.708, "step": 14500 }, { "epoch": 2.9, "eval_loss": 0.9022778272628784, "eval_runtime": 40.6913, "eval_samples_per_second": 24.575, "eval_steps_per_second": 6.144, "step": 14500 }, { "epoch": 2.92, "grad_norm": 0.9248984456062317, "learning_rate": 5.544827586206897e-07, "loss": 0.6742, "step": 14600 }, { "epoch": 2.94, "grad_norm": 0.9249628186225891, "learning_rate": 4.1655172413793107e-07, "loss": 0.6961, "step": 14700 }, { "epoch": 2.96, "grad_norm": 1.0156011581420898, "learning_rate": 2.7862068965517247e-07, "loss": 0.6881, "step": 14800 }, { "epoch": 2.98, "grad_norm": 0.9798776507377625, "learning_rate": 1.406896551724138e-07, "loss": 0.7001, "step": 14900 }, { "epoch": 3.0, "grad_norm": 0.875923216342926, "learning_rate": 2.758620689655173e-09, "loss": 0.6774, "step": 15000 }, { "epoch": 3.0, "eval_loss": 0.9279061555862427, "eval_runtime": 41.6915, "eval_samples_per_second": 23.986, "eval_steps_per_second": 5.996, "step": 15000 } ], "logging_steps": 100, "max_steps": 15000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.88804379967488e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }