{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 15000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 1.5043091773986816, "learning_rate": 4.000000000000001e-06, "loss": 2.1257, "step": 100 }, { "epoch": 0.04, "grad_norm": 1.06766939163208, "learning_rate": 8.000000000000001e-06, "loss": 1.0451, "step": 200 }, { "epoch": 0.06, "grad_norm": 0.8579052686691284, "learning_rate": 1.2e-05, "loss": 0.9972, "step": 300 }, { "epoch": 0.08, "grad_norm": 0.866980791091919, "learning_rate": 1.6000000000000003e-05, "loss": 0.9872, "step": 400 }, { "epoch": 0.1, "grad_norm": 0.8615725040435791, "learning_rate": 2e-05, "loss": 0.9757, "step": 500 }, { "epoch": 0.1, "eval_loss": 0.9876903295516968, "eval_runtime": 41.3404, "eval_samples_per_second": 24.189, "eval_steps_per_second": 6.047, "step": 500 }, { "epoch": 0.12, "grad_norm": 0.9930108785629272, "learning_rate": 1.9862068965517244e-05, "loss": 0.9801, "step": 600 }, { "epoch": 0.14, "grad_norm": 0.8045609593391418, "learning_rate": 1.9724137931034483e-05, "loss": 0.9593, "step": 700 }, { "epoch": 0.16, "grad_norm": 0.8184276819229126, "learning_rate": 1.9586206896551725e-05, "loss": 0.9567, "step": 800 }, { "epoch": 0.18, "grad_norm": 0.872108519077301, "learning_rate": 1.9448275862068968e-05, "loss": 0.9541, "step": 900 }, { "epoch": 0.2, "grad_norm": 0.7699779868125916, "learning_rate": 1.931034482758621e-05, "loss": 0.9354, "step": 1000 }, { "epoch": 0.2, "eval_loss": 0.9477444887161255, "eval_runtime": 41.7539, "eval_samples_per_second": 23.95, "eval_steps_per_second": 5.987, "step": 1000 }, { "epoch": 0.22, "grad_norm": 0.9201217889785767, "learning_rate": 1.917241379310345e-05, "loss": 0.945, "step": 1100 }, { "epoch": 0.24, "grad_norm": 0.9041834473609924, "learning_rate": 1.903448275862069e-05, "loss": 0.9472, "step": 1200 }, { "epoch": 0.26, "grad_norm": 0.8310356140136719, "learning_rate": 1.8896551724137934e-05, "loss": 0.9457, "step": 1300 }, { "epoch": 0.28, "grad_norm": 0.8618925213813782, "learning_rate": 1.8758620689655173e-05, "loss": 0.9529, "step": 1400 }, { "epoch": 0.3, "grad_norm": 0.8114230036735535, "learning_rate": 1.8620689655172415e-05, "loss": 0.9421, "step": 1500 }, { "epoch": 0.3, "eval_loss": 0.9172976016998291, "eval_runtime": 41.6025, "eval_samples_per_second": 24.037, "eval_steps_per_second": 6.009, "step": 1500 }, { "epoch": 0.32, "grad_norm": 0.8510406017303467, "learning_rate": 1.8482758620689657e-05, "loss": 0.9371, "step": 1600 }, { "epoch": 0.34, "grad_norm": 0.8189296126365662, "learning_rate": 1.8344827586206896e-05, "loss": 0.9507, "step": 1700 }, { "epoch": 0.36, "grad_norm": 0.8465898036956787, "learning_rate": 1.820689655172414e-05, "loss": 0.9298, "step": 1800 }, { "epoch": 0.38, "grad_norm": 0.8617157936096191, "learning_rate": 1.806896551724138e-05, "loss": 0.9203, "step": 1900 }, { "epoch": 0.4, "grad_norm": 0.8387811183929443, "learning_rate": 1.7931034482758623e-05, "loss": 0.9539, "step": 2000 }, { "epoch": 0.4, "eval_loss": 0.9198890328407288, "eval_runtime": 40.979, "eval_samples_per_second": 24.403, "eval_steps_per_second": 6.101, "step": 2000 }, { "epoch": 0.42, "grad_norm": 0.8710920214653015, "learning_rate": 1.7793103448275862e-05, "loss": 0.9227, "step": 2100 }, { "epoch": 0.44, "grad_norm": 0.7497248649597168, "learning_rate": 1.7655172413793105e-05, "loss": 0.93, "step": 2200 }, { "epoch": 0.46, "grad_norm": 0.7869455814361572, "learning_rate": 1.7517241379310347e-05, "loss": 0.9323, "step": 2300 }, { "epoch": 0.48, "grad_norm": 0.8080159425735474, "learning_rate": 1.7379310344827586e-05, "loss": 0.9418, "step": 2400 }, { "epoch": 0.5, "grad_norm": 0.72604900598526, "learning_rate": 1.7241379310344828e-05, "loss": 0.9336, "step": 2500 }, { "epoch": 0.5, "eval_loss": 0.9328898787498474, "eval_runtime": 41.2374, "eval_samples_per_second": 24.25, "eval_steps_per_second": 6.062, "step": 2500 }, { "epoch": 0.52, "grad_norm": 0.766793966293335, "learning_rate": 1.710344827586207e-05, "loss": 0.9209, "step": 2600 }, { "epoch": 0.54, "grad_norm": 0.7827008962631226, "learning_rate": 1.6965517241379313e-05, "loss": 0.92, "step": 2700 }, { "epoch": 0.56, "grad_norm": 0.7644964456558228, "learning_rate": 1.6827586206896552e-05, "loss": 0.9232, "step": 2800 }, { "epoch": 0.58, "grad_norm": 0.7284256815910339, "learning_rate": 1.6689655172413794e-05, "loss": 0.9284, "step": 2900 }, { "epoch": 0.6, "grad_norm": 0.8209360837936401, "learning_rate": 1.6551724137931037e-05, "loss": 0.9206, "step": 3000 }, { "epoch": 0.6, "eval_loss": 0.9171387553215027, "eval_runtime": 40.5489, "eval_samples_per_second": 24.662, "eval_steps_per_second": 6.165, "step": 3000 }, { "epoch": 0.62, "grad_norm": 0.8341450691223145, "learning_rate": 1.6413793103448276e-05, "loss": 0.9229, "step": 3100 }, { "epoch": 0.64, "grad_norm": 0.7724445462226868, "learning_rate": 1.6275862068965518e-05, "loss": 0.9229, "step": 3200 }, { "epoch": 0.66, "grad_norm": 0.7436179518699646, "learning_rate": 1.613793103448276e-05, "loss": 0.8998, "step": 3300 }, { "epoch": 0.68, "grad_norm": 0.8365751504898071, "learning_rate": 1.6000000000000003e-05, "loss": 0.9104, "step": 3400 }, { "epoch": 0.7, "grad_norm": 0.8854556083679199, "learning_rate": 1.586206896551724e-05, "loss": 0.9182, "step": 3500 }, { "epoch": 0.7, "eval_loss": 0.9123844504356384, "eval_runtime": 41.9203, "eval_samples_per_second": 23.855, "eval_steps_per_second": 5.964, "step": 3500 }, { "epoch": 0.72, "grad_norm": 0.897222101688385, "learning_rate": 1.5724137931034484e-05, "loss": 0.9037, "step": 3600 }, { "epoch": 0.74, "grad_norm": 0.8261429667472839, "learning_rate": 1.5586206896551726e-05, "loss": 0.9252, "step": 3700 }, { "epoch": 0.76, "grad_norm": 0.8133054375648499, "learning_rate": 1.5448275862068965e-05, "loss": 0.9246, "step": 3800 }, { "epoch": 0.78, "grad_norm": 0.7939581871032715, "learning_rate": 1.5310344827586208e-05, "loss": 0.9187, "step": 3900 }, { "epoch": 0.8, "grad_norm": 0.8768674731254578, "learning_rate": 1.5172413793103448e-05, "loss": 0.9139, "step": 4000 }, { "epoch": 0.8, "eval_loss": 0.914544939994812, "eval_runtime": 42.1027, "eval_samples_per_second": 23.751, "eval_steps_per_second": 5.938, "step": 4000 }, { "epoch": 0.82, "grad_norm": 0.7592473030090332, "learning_rate": 1.503448275862069e-05, "loss": 0.9024, "step": 4100 }, { "epoch": 0.84, "grad_norm": 0.7141114473342896, "learning_rate": 1.4896551724137933e-05, "loss": 0.9222, "step": 4200 }, { "epoch": 0.86, "grad_norm": 0.8296393156051636, "learning_rate": 1.4758620689655174e-05, "loss": 0.9065, "step": 4300 }, { "epoch": 0.88, "grad_norm": 0.7411290407180786, "learning_rate": 1.4620689655172416e-05, "loss": 0.9063, "step": 4400 }, { "epoch": 0.9, "grad_norm": 0.7428833842277527, "learning_rate": 1.4482758620689657e-05, "loss": 0.9153, "step": 4500 }, { "epoch": 0.9, "eval_loss": 0.8934066891670227, "eval_runtime": 40.7429, "eval_samples_per_second": 24.544, "eval_steps_per_second": 6.136, "step": 4500 }, { "epoch": 0.92, "grad_norm": 0.8348454236984253, "learning_rate": 1.4344827586206897e-05, "loss": 0.9065, "step": 4600 }, { "epoch": 0.94, "grad_norm": 0.9168105125427246, "learning_rate": 1.4206896551724138e-05, "loss": 0.9205, "step": 4700 }, { "epoch": 0.96, "grad_norm": 0.7295928597450256, "learning_rate": 1.406896551724138e-05, "loss": 0.9069, "step": 4800 }, { "epoch": 0.98, "grad_norm": 0.74032062292099, "learning_rate": 1.3931034482758621e-05, "loss": 0.9069, "step": 4900 }, { "epoch": 1.0, "grad_norm": 0.7203904986381531, "learning_rate": 1.3793103448275863e-05, "loss": 0.9046, "step": 5000 }, { "epoch": 1.0, "eval_loss": 0.872988760471344, "eval_runtime": 40.9885, "eval_samples_per_second": 24.397, "eval_steps_per_second": 6.099, "step": 5000 }, { "epoch": 1.02, "grad_norm": 0.7743054032325745, "learning_rate": 1.3655172413793106e-05, "loss": 0.7995, "step": 5100 }, { "epoch": 1.04, "grad_norm": 0.8992008566856384, "learning_rate": 1.3517241379310346e-05, "loss": 0.8001, "step": 5200 }, { "epoch": 1.06, "grad_norm": 0.7732968330383301, "learning_rate": 1.3379310344827587e-05, "loss": 0.7977, "step": 5300 }, { "epoch": 1.08, "grad_norm": 0.8479374051094055, "learning_rate": 1.324137931034483e-05, "loss": 0.8162, "step": 5400 }, { "epoch": 1.1, "grad_norm": 0.8781663775444031, "learning_rate": 1.310344827586207e-05, "loss": 0.803, "step": 5500 }, { "epoch": 1.1, "eval_loss": 0.9017807841300964, "eval_runtime": 41.4996, "eval_samples_per_second": 24.097, "eval_steps_per_second": 6.024, "step": 5500 }, { "epoch": 1.12, "grad_norm": 0.7585736513137817, "learning_rate": 1.296551724137931e-05, "loss": 0.8048, "step": 5600 }, { "epoch": 1.14, "grad_norm": 0.8792749047279358, "learning_rate": 1.2827586206896551e-05, "loss": 0.7975, "step": 5700 }, { "epoch": 1.16, "grad_norm": 0.9402467608451843, "learning_rate": 1.2689655172413795e-05, "loss": 0.7968, "step": 5800 }, { "epoch": 1.18, "grad_norm": 0.8136008381843567, "learning_rate": 1.2551724137931036e-05, "loss": 0.7888, "step": 5900 }, { "epoch": 1.2, "grad_norm": 0.7935479879379272, "learning_rate": 1.2413793103448277e-05, "loss": 0.8051, "step": 6000 }, { "epoch": 1.2, "eval_loss": 0.9130223393440247, "eval_runtime": 40.9878, "eval_samples_per_second": 24.397, "eval_steps_per_second": 6.099, "step": 6000 }, { "epoch": 1.22, "grad_norm": 0.7666265964508057, "learning_rate": 1.2275862068965519e-05, "loss": 0.8013, "step": 6100 }, { "epoch": 1.24, "grad_norm": 0.808914065361023, "learning_rate": 1.213793103448276e-05, "loss": 0.8108, "step": 6200 }, { "epoch": 1.26, "grad_norm": 0.8120896220207214, "learning_rate": 1.2e-05, "loss": 0.7955, "step": 6300 }, { "epoch": 1.28, "grad_norm": 0.8073500990867615, "learning_rate": 1.1862068965517241e-05, "loss": 0.7918, "step": 6400 }, { "epoch": 1.3, "grad_norm": 0.865331768989563, "learning_rate": 1.1724137931034483e-05, "loss": 0.8089, "step": 6500 }, { "epoch": 1.3, "eval_loss": 0.9077558517456055, "eval_runtime": 41.3054, "eval_samples_per_second": 24.21, "eval_steps_per_second": 6.052, "step": 6500 }, { "epoch": 1.32, "grad_norm": 0.8069093227386475, "learning_rate": 1.1586206896551726e-05, "loss": 0.7933, "step": 6600 }, { "epoch": 1.34, "grad_norm": 0.8061268925666809, "learning_rate": 1.1448275862068966e-05, "loss": 0.802, "step": 6700 }, { "epoch": 1.36, "grad_norm": 0.8158251047134399, "learning_rate": 1.1310344827586209e-05, "loss": 0.8028, "step": 6800 }, { "epoch": 1.38, "grad_norm": 0.849918782711029, "learning_rate": 1.117241379310345e-05, "loss": 0.7967, "step": 6900 }, { "epoch": 1.4, "grad_norm": 0.7494837045669556, "learning_rate": 1.103448275862069e-05, "loss": 0.7973, "step": 7000 }, { "epoch": 1.4, "eval_loss": 0.8674909472465515, "eval_runtime": 41.4384, "eval_samples_per_second": 24.132, "eval_steps_per_second": 6.033, "step": 7000 }, { "epoch": 1.42, "grad_norm": 0.7753856182098389, "learning_rate": 1.0896551724137932e-05, "loss": 0.7922, "step": 7100 }, { "epoch": 1.44, "grad_norm": 0.9751661419868469, "learning_rate": 1.0758620689655173e-05, "loss": 0.816, "step": 7200 }, { "epoch": 1.46, "grad_norm": 0.850592851638794, "learning_rate": 1.0620689655172414e-05, "loss": 0.8084, "step": 7300 }, { "epoch": 1.48, "grad_norm": 0.8299598097801208, "learning_rate": 1.0482758620689658e-05, "loss": 0.8135, "step": 7400 }, { "epoch": 1.5, "grad_norm": 0.7985300421714783, "learning_rate": 1.0344827586206898e-05, "loss": 0.7965, "step": 7500 }, { "epoch": 1.5, "eval_loss": 0.8954501748085022, "eval_runtime": 41.3848, "eval_samples_per_second": 24.163, "eval_steps_per_second": 6.041, "step": 7500 }, { "epoch": 1.52, "grad_norm": 0.8110877275466919, "learning_rate": 1.0206896551724139e-05, "loss": 0.786, "step": 7600 }, { "epoch": 1.54, "grad_norm": 0.7997573614120483, "learning_rate": 1.006896551724138e-05, "loss": 0.7959, "step": 7700 }, { "epoch": 1.56, "grad_norm": 0.7852098345756531, "learning_rate": 9.931034482758622e-06, "loss": 0.8106, "step": 7800 }, { "epoch": 1.58, "grad_norm": 0.8886978030204773, "learning_rate": 9.793103448275863e-06, "loss": 0.8081, "step": 7900 }, { "epoch": 1.6, "grad_norm": 0.7849240899085999, "learning_rate": 9.655172413793105e-06, "loss": 0.8103, "step": 8000 }, { "epoch": 1.6, "eval_loss": 0.9043481945991516, "eval_runtime": 41.0052, "eval_samples_per_second": 24.387, "eval_steps_per_second": 6.097, "step": 8000 }, { "epoch": 1.62, "grad_norm": 0.8720059394836426, "learning_rate": 9.517241379310346e-06, "loss": 0.8041, "step": 8100 }, { "epoch": 1.64, "grad_norm": 0.8483107686042786, "learning_rate": 9.379310344827586e-06, "loss": 0.8098, "step": 8200 }, { "epoch": 1.66, "grad_norm": 0.8146810531616211, "learning_rate": 9.241379310344829e-06, "loss": 0.803, "step": 8300 }, { "epoch": 1.68, "grad_norm": 0.8219090700149536, "learning_rate": 9.10344827586207e-06, "loss": 0.792, "step": 8400 }, { "epoch": 1.7, "grad_norm": 0.7642741203308105, "learning_rate": 8.965517241379312e-06, "loss": 0.7969, "step": 8500 }, { "epoch": 1.7, "eval_loss": 0.9122523069381714, "eval_runtime": 42.1629, "eval_samples_per_second": 23.718, "eval_steps_per_second": 5.929, "step": 8500 }, { "epoch": 1.72, "grad_norm": 0.7875528931617737, "learning_rate": 8.827586206896552e-06, "loss": 0.8075, "step": 8600 }, { "epoch": 1.74, "grad_norm": 0.8270419836044312, "learning_rate": 8.689655172413793e-06, "loss": 0.7952, "step": 8700 }, { "epoch": 1.76, "grad_norm": 0.9126586318016052, "learning_rate": 8.551724137931035e-06, "loss": 0.7848, "step": 8800 }, { "epoch": 1.78, "grad_norm": 0.8643565773963928, "learning_rate": 8.413793103448276e-06, "loss": 0.7924, "step": 8900 }, { "epoch": 1.8, "grad_norm": 0.9442116618156433, "learning_rate": 8.275862068965518e-06, "loss": 0.7971, "step": 9000 }, { "epoch": 1.8, "eval_loss": 0.9290862679481506, "eval_runtime": 41.6918, "eval_samples_per_second": 23.986, "eval_steps_per_second": 5.996, "step": 9000 }, { "epoch": 1.82, "grad_norm": 0.7398823499679565, "learning_rate": 8.137931034482759e-06, "loss": 0.8003, "step": 9100 }, { "epoch": 1.84, "grad_norm": 0.8488145470619202, "learning_rate": 8.000000000000001e-06, "loss": 0.8072, "step": 9200 }, { "epoch": 1.86, "grad_norm": 0.7276484370231628, "learning_rate": 7.862068965517242e-06, "loss": 0.7966, "step": 9300 }, { "epoch": 1.88, "grad_norm": 0.8641906380653381, "learning_rate": 7.724137931034483e-06, "loss": 0.8063, "step": 9400 }, { "epoch": 1.9, "grad_norm": 0.8261032700538635, "learning_rate": 7.586206896551724e-06, "loss": 0.7907, "step": 9500 }, { "epoch": 1.9, "eval_loss": 0.892309308052063, "eval_runtime": 41.16, "eval_samples_per_second": 24.295, "eval_steps_per_second": 6.074, "step": 9500 }, { "epoch": 1.92, "grad_norm": 0.8013048768043518, "learning_rate": 7.4482758620689665e-06, "loss": 0.8031, "step": 9600 }, { "epoch": 1.94, "grad_norm": 0.8098243474960327, "learning_rate": 7.310344827586208e-06, "loss": 0.7931, "step": 9700 }, { "epoch": 1.96, "grad_norm": 0.8183510303497314, "learning_rate": 7.172413793103449e-06, "loss": 0.7967, "step": 9800 }, { "epoch": 1.98, "grad_norm": 0.8251721858978271, "learning_rate": 7.03448275862069e-06, "loss": 0.7877, "step": 9900 }, { "epoch": 2.0, "grad_norm": 0.8741477131843567, "learning_rate": 6.896551724137932e-06, "loss": 0.7967, "step": 10000 }, { "epoch": 2.0, "eval_loss": 0.873385488986969, "eval_runtime": 41.2883, "eval_samples_per_second": 24.22, "eval_steps_per_second": 6.055, "step": 10000 }, { "epoch": 2.02, "grad_norm": 0.8094070553779602, "learning_rate": 6.760000000000001e-06, "loss": 0.7059, "step": 10100 }, { "epoch": 2.04, "grad_norm": 0.8748703598976135, "learning_rate": 6.622068965517242e-06, "loss": 0.6925, "step": 10200 }, { "epoch": 2.06, "grad_norm": 1.1019681692123413, "learning_rate": 6.4841379310344835e-06, "loss": 0.6884, "step": 10300 }, { "epoch": 2.08, "grad_norm": 0.8715025186538696, "learning_rate": 6.346206896551724e-06, "loss": 0.6989, "step": 10400 }, { "epoch": 2.1, "grad_norm": 1.0019214153289795, "learning_rate": 6.2082758620689665e-06, "loss": 0.6941, "step": 10500 }, { "epoch": 2.1, "eval_loss": 0.954196035861969, "eval_runtime": 40.949, "eval_samples_per_second": 24.421, "eval_steps_per_second": 6.105, "step": 10500 }, { "epoch": 2.12, "grad_norm": 0.8947280645370483, "learning_rate": 6.070344827586207e-06, "loss": 0.6932, "step": 10600 }, { "epoch": 2.14, "grad_norm": 0.8519116044044495, "learning_rate": 5.932413793103449e-06, "loss": 0.6966, "step": 10700 }, { "epoch": 2.16, "grad_norm": 0.8703382015228271, "learning_rate": 5.79448275862069e-06, "loss": 0.6955, "step": 10800 }, { "epoch": 2.18, "grad_norm": 0.9201086759567261, "learning_rate": 5.656551724137932e-06, "loss": 0.6911, "step": 10900 }, { "epoch": 2.2, "grad_norm": 1.0000112056732178, "learning_rate": 5.518620689655173e-06, "loss": 0.6927, "step": 11000 }, { "epoch": 2.2, "eval_loss": 0.9255943894386292, "eval_runtime": 40.2744, "eval_samples_per_second": 24.83, "eval_steps_per_second": 6.207, "step": 11000 }, { "epoch": 2.22, "grad_norm": 0.93467777967453, "learning_rate": 5.382068965517242e-06, "loss": 0.6902, "step": 11100 }, { "epoch": 2.24, "grad_norm": 0.9694642424583435, "learning_rate": 5.2441379310344835e-06, "loss": 0.6936, "step": 11200 }, { "epoch": 2.26, "grad_norm": 0.9247058629989624, "learning_rate": 5.106206896551724e-06, "loss": 0.6939, "step": 11300 }, { "epoch": 2.28, "grad_norm": 0.996800422668457, "learning_rate": 4.968275862068966e-06, "loss": 0.6965, "step": 11400 }, { "epoch": 2.3, "grad_norm": 0.8894180059432983, "learning_rate": 4.830344827586207e-06, "loss": 0.706, "step": 11500 }, { "epoch": 2.3, "eval_loss": 0.9258891344070435, "eval_runtime": 40.6757, "eval_samples_per_second": 24.585, "eval_steps_per_second": 6.146, "step": 11500 }, { "epoch": 2.32, "grad_norm": 1.0035386085510254, "learning_rate": 4.692413793103449e-06, "loss": 0.6936, "step": 11600 }, { "epoch": 2.34, "grad_norm": 0.9458960294723511, "learning_rate": 4.55448275862069e-06, "loss": 0.7003, "step": 11700 }, { "epoch": 2.36, "grad_norm": 0.9162298440933228, "learning_rate": 4.416551724137932e-06, "loss": 0.7004, "step": 11800 }, { "epoch": 2.38, "grad_norm": 0.8926594257354736, "learning_rate": 4.278620689655173e-06, "loss": 0.6848, "step": 11900 }, { "epoch": 2.4, "grad_norm": 0.9700310230255127, "learning_rate": 4.140689655172414e-06, "loss": 0.6937, "step": 12000 }, { "epoch": 2.4, "eval_loss": 0.9162700176239014, "eval_runtime": 40.714, "eval_samples_per_second": 24.562, "eval_steps_per_second": 6.14, "step": 12000 }, { "epoch": 2.42, "grad_norm": 0.9614285826683044, "learning_rate": 4.002758620689655e-06, "loss": 0.6844, "step": 12100 }, { "epoch": 2.44, "grad_norm": 0.897529125213623, "learning_rate": 3.864827586206897e-06, "loss": 0.6846, "step": 12200 }, { "epoch": 2.46, "grad_norm": 0.9933466911315918, "learning_rate": 3.7268965517241383e-06, "loss": 0.7014, "step": 12300 }, { "epoch": 2.48, "grad_norm": 0.8733190298080444, "learning_rate": 3.5889655172413794e-06, "loss": 0.6896, "step": 12400 }, { "epoch": 2.5, "grad_norm": 0.9706907868385315, "learning_rate": 3.4510344827586214e-06, "loss": 0.6961, "step": 12500 }, { "epoch": 2.5, "eval_loss": 0.9194909334182739, "eval_runtime": 40.9566, "eval_samples_per_second": 24.416, "eval_steps_per_second": 6.104, "step": 12500 }, { "epoch": 2.52, "grad_norm": 0.8588589429855347, "learning_rate": 3.3131034482758624e-06, "loss": 0.6992, "step": 12600 }, { "epoch": 2.54, "grad_norm": 1.1075055599212646, "learning_rate": 3.175172413793104e-06, "loss": 0.6983, "step": 12700 }, { "epoch": 2.56, "grad_norm": 1.0699206590652466, "learning_rate": 3.037241379310345e-06, "loss": 0.6836, "step": 12800 }, { "epoch": 2.58, "grad_norm": 1.0196533203125, "learning_rate": 2.8993103448275865e-06, "loss": 0.6849, "step": 12900 }, { "epoch": 2.6, "grad_norm": 0.880547821521759, "learning_rate": 2.7613793103448276e-06, "loss": 0.6976, "step": 13000 }, { "epoch": 2.6, "eval_loss": 0.9251711368560791, "eval_runtime": 40.7697, "eval_samples_per_second": 24.528, "eval_steps_per_second": 6.132, "step": 13000 }, { "epoch": 2.62, "grad_norm": 0.9048807621002197, "learning_rate": 2.6234482758620695e-06, "loss": 0.7059, "step": 13100 }, { "epoch": 2.64, "grad_norm": 0.9848681688308716, "learning_rate": 2.4855172413793106e-06, "loss": 0.6936, "step": 13200 }, { "epoch": 2.66, "grad_norm": 0.9536793231964111, "learning_rate": 2.3475862068965517e-06, "loss": 0.6882, "step": 13300 }, { "epoch": 2.68, "grad_norm": 0.9996353387832642, "learning_rate": 2.209655172413793e-06, "loss": 0.6954, "step": 13400 }, { "epoch": 2.7, "grad_norm": 0.9549928903579712, "learning_rate": 2.0717241379310347e-06, "loss": 0.6905, "step": 13500 }, { "epoch": 2.7, "eval_loss": 0.9238373041152954, "eval_runtime": 40.4773, "eval_samples_per_second": 24.705, "eval_steps_per_second": 6.176, "step": 13500 }, { "epoch": 2.72, "grad_norm": 0.9328065514564514, "learning_rate": 1.933793103448276e-06, "loss": 0.6776, "step": 13600 }, { "epoch": 2.74, "grad_norm": 0.9359473586082458, "learning_rate": 1.7958620689655173e-06, "loss": 0.6772, "step": 13700 }, { "epoch": 2.76, "grad_norm": 0.982263445854187, "learning_rate": 1.6579310344827588e-06, "loss": 0.7012, "step": 13800 }, { "epoch": 2.78, "grad_norm": 0.9556898474693298, "learning_rate": 1.52e-06, "loss": 0.6814, "step": 13900 }, { "epoch": 2.8, "grad_norm": 0.975192666053772, "learning_rate": 1.3820689655172416e-06, "loss": 0.6951, "step": 14000 }, { "epoch": 2.8, "eval_loss": 0.8940379619598389, "eval_runtime": 40.6996, "eval_samples_per_second": 24.57, "eval_steps_per_second": 6.143, "step": 14000 }, { "epoch": 2.82, "grad_norm": 0.9017152786254883, "learning_rate": 1.2441379310344829e-06, "loss": 0.6939, "step": 14100 }, { "epoch": 2.84, "grad_norm": 0.968861997127533, "learning_rate": 1.1062068965517241e-06, "loss": 0.6861, "step": 14200 }, { "epoch": 2.86, "grad_norm": 1.0695827007293701, "learning_rate": 9.682758620689656e-07, "loss": 0.6922, "step": 14300 }, { "epoch": 2.88, "grad_norm": 0.976294994354248, "learning_rate": 8.303448275862069e-07, "loss": 0.6937, "step": 14400 }, { "epoch": 2.9, "grad_norm": 1.0001001358032227, "learning_rate": 6.924137931034483e-07, "loss": 0.7094, "step": 14500 }, { "epoch": 2.9, "eval_loss": 0.900869607925415, "eval_runtime": 40.5224, "eval_samples_per_second": 24.678, "eval_steps_per_second": 6.169, "step": 14500 }, { "epoch": 2.92, "grad_norm": 0.9175110459327698, "learning_rate": 5.544827586206897e-07, "loss": 0.6752, "step": 14600 }, { "epoch": 2.94, "grad_norm": 0.9188650846481323, "learning_rate": 4.1655172413793107e-07, "loss": 0.6973, "step": 14700 }, { "epoch": 2.96, "grad_norm": 1.0105127096176147, "learning_rate": 2.7862068965517247e-07, "loss": 0.6899, "step": 14800 }, { "epoch": 2.98, "grad_norm": 0.9797114133834839, "learning_rate": 1.406896551724138e-07, "loss": 0.7016, "step": 14900 }, { "epoch": 3.0, "grad_norm": 0.8816357254981995, "learning_rate": 2.758620689655173e-09, "loss": 0.6788, "step": 15000 }, { "epoch": 3.0, "eval_loss": 0.9271326065063477, "eval_runtime": 41.5918, "eval_samples_per_second": 24.043, "eval_steps_per_second": 6.011, "step": 15000 } ], "logging_steps": 100, "max_steps": 15000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.88804379967488e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }