| { |
| "best_metric": 0.229187473654747, |
| "best_model_checkpoint": "learning_source_20260318/compounds/bert-output/compounds-small/checkpoint-61000", |
| "epoch": 3.7895791085917425, |
| "eval_steps": 100, |
| "global_step": 63000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.006015204934272608, |
| "grad_norm": 4.738959312438965, |
| "learning_rate": 3e-06, |
| "loss": 4.8794, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.006015204934272608, |
| "eval_loss": 3.196549892425537, |
| "eval_runtime": 21.7167, |
| "eval_samples_per_second": 460.474, |
| "eval_steps_per_second": 57.559, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.012030409868545216, |
| "grad_norm": 3.5901083946228027, |
| "learning_rate": 6e-06, |
| "loss": 2.8953, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.012030409868545216, |
| "eval_loss": 2.467365264892578, |
| "eval_runtime": 21.7318, |
| "eval_samples_per_second": 460.156, |
| "eval_steps_per_second": 57.52, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.018045614802817824, |
| "grad_norm": 1.398197889328003, |
| "learning_rate": 5.998999666555519e-06, |
| "loss": 2.4113, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.018045614802817824, |
| "eval_loss": 2.267258644104004, |
| "eval_runtime": 21.7416, |
| "eval_samples_per_second": 459.948, |
| "eval_steps_per_second": 57.493, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.02406081973709043, |
| "grad_norm": 1.1230988502502441, |
| "learning_rate": 5.997999333111037e-06, |
| "loss": 2.2393, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.02406081973709043, |
| "eval_loss": 2.1407406330108643, |
| "eval_runtime": 21.7301, |
| "eval_samples_per_second": 460.192, |
| "eval_steps_per_second": 57.524, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.030076024671363038, |
| "grad_norm": 1.1243526935577393, |
| "learning_rate": 5.9969989996665554e-06, |
| "loss": 2.129, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.030076024671363038, |
| "eval_loss": 2.0753207206726074, |
| "eval_runtime": 21.7547, |
| "eval_samples_per_second": 459.671, |
| "eval_steps_per_second": 57.459, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.03609122960563565, |
| "grad_norm": 2.403114080429077, |
| "learning_rate": 5.995998666222074e-06, |
| "loss": 2.0746, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.03609122960563565, |
| "eval_loss": 2.029200315475464, |
| "eval_runtime": 21.746, |
| "eval_samples_per_second": 459.855, |
| "eval_steps_per_second": 57.482, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.042106434539908255, |
| "grad_norm": 1.4393417835235596, |
| "learning_rate": 5.994998332777593e-06, |
| "loss": 2.0389, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.042106434539908255, |
| "eval_loss": 1.995573878288269, |
| "eval_runtime": 21.7364, |
| "eval_samples_per_second": 460.057, |
| "eval_steps_per_second": 57.507, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.04812163947418086, |
| "grad_norm": 1.392498254776001, |
| "learning_rate": 5.9939979993331115e-06, |
| "loss": 1.9989, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.04812163947418086, |
| "eval_loss": 1.9491331577301025, |
| "eval_runtime": 21.7276, |
| "eval_samples_per_second": 460.245, |
| "eval_steps_per_second": 57.531, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.05413684440845347, |
| "grad_norm": 1.9008598327636719, |
| "learning_rate": 5.992997665888629e-06, |
| "loss": 1.9332, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.05413684440845347, |
| "eval_loss": 1.8610961437225342, |
| "eval_runtime": 21.7345, |
| "eval_samples_per_second": 460.098, |
| "eval_steps_per_second": 57.512, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.060152049342726076, |
| "grad_norm": 2.0319344997406006, |
| "learning_rate": 5.991997332444148e-06, |
| "loss": 1.8607, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.060152049342726076, |
| "eval_loss": 1.7824585437774658, |
| "eval_runtime": 21.7313, |
| "eval_samples_per_second": 460.167, |
| "eval_steps_per_second": 57.521, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.06616725427699868, |
| "grad_norm": 1.9651939868927002, |
| "learning_rate": 5.990996998999667e-06, |
| "loss": 1.8064, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.06616725427699868, |
| "eval_loss": 1.7280884981155396, |
| "eval_runtime": 21.7428, |
| "eval_samples_per_second": 459.923, |
| "eval_steps_per_second": 57.49, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.0721824592112713, |
| "grad_norm": 1.2767350673675537, |
| "learning_rate": 5.989996665555185e-06, |
| "loss": 1.7432, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.0721824592112713, |
| "eval_loss": 1.644949197769165, |
| "eval_runtime": 21.7259, |
| "eval_samples_per_second": 460.279, |
| "eval_steps_per_second": 57.535, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.0781976641455439, |
| "grad_norm": 1.3338353633880615, |
| "learning_rate": 5.988996332110703e-06, |
| "loss": 1.6816, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.0781976641455439, |
| "eval_loss": 1.5758517980575562, |
| "eval_runtime": 21.7331, |
| "eval_samples_per_second": 460.127, |
| "eval_steps_per_second": 57.516, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.08421286907981651, |
| "grad_norm": 1.5716562271118164, |
| "learning_rate": 5.987995998666222e-06, |
| "loss": 1.6209, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.08421286907981651, |
| "eval_loss": 1.506639003753662, |
| "eval_runtime": 21.74, |
| "eval_samples_per_second": 459.982, |
| "eval_steps_per_second": 57.498, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.09022807401408911, |
| "grad_norm": 1.4891563653945923, |
| "learning_rate": 5.986995665221741e-06, |
| "loss": 1.5562, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.09022807401408911, |
| "eval_loss": 1.4430310726165771, |
| "eval_runtime": 21.7254, |
| "eval_samples_per_second": 460.29, |
| "eval_steps_per_second": 57.536, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.09624327894836172, |
| "grad_norm": 1.6210014820098877, |
| "learning_rate": 5.9859953317772595e-06, |
| "loss": 1.5081, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.09624327894836172, |
| "eval_loss": 1.394563913345337, |
| "eval_runtime": 21.7318, |
| "eval_samples_per_second": 460.155, |
| "eval_steps_per_second": 57.519, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.10225848388263432, |
| "grad_norm": 2.3340542316436768, |
| "learning_rate": 5.984994998332777e-06, |
| "loss": 1.4674, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.10225848388263432, |
| "eval_loss": 1.3446385860443115, |
| "eval_runtime": 21.735, |
| "eval_samples_per_second": 460.088, |
| "eval_steps_per_second": 57.511, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.10827368881690694, |
| "grad_norm": 1.6647675037384033, |
| "learning_rate": 5.983994664888296e-06, |
| "loss": 1.424, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.10827368881690694, |
| "eval_loss": 1.3100253343582153, |
| "eval_runtime": 21.725, |
| "eval_samples_per_second": 460.299, |
| "eval_steps_per_second": 57.537, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.11428889375117954, |
| "grad_norm": 1.46592116355896, |
| "learning_rate": 5.982994331443815e-06, |
| "loss": 1.3892, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.11428889375117954, |
| "eval_loss": 1.2686667442321777, |
| "eval_runtime": 21.7512, |
| "eval_samples_per_second": 459.744, |
| "eval_steps_per_second": 57.468, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.12030409868545215, |
| "grad_norm": 1.8340036869049072, |
| "learning_rate": 5.981993997999333e-06, |
| "loss": 1.3564, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.12030409868545215, |
| "eval_loss": 1.2294234037399292, |
| "eval_runtime": 21.7966, |
| "eval_samples_per_second": 458.788, |
| "eval_steps_per_second": 57.348, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.12631930361972477, |
| "grad_norm": 1.5960652828216553, |
| "learning_rate": 5.980993664554851e-06, |
| "loss": 1.3285, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.12631930361972477, |
| "eval_loss": 1.2100887298583984, |
| "eval_runtime": 21.7436, |
| "eval_samples_per_second": 459.905, |
| "eval_steps_per_second": 57.488, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.13233450855399737, |
| "grad_norm": 1.8335785865783691, |
| "learning_rate": 5.979993331110371e-06, |
| "loss": 1.3001, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.13233450855399737, |
| "eval_loss": 1.1752792596817017, |
| "eval_runtime": 21.7453, |
| "eval_samples_per_second": 459.87, |
| "eval_steps_per_second": 57.484, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.13834971348826997, |
| "grad_norm": 1.612433671951294, |
| "learning_rate": 5.978992997665889e-06, |
| "loss": 1.2695, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.13834971348826997, |
| "eval_loss": 1.147255778312683, |
| "eval_runtime": 21.6924, |
| "eval_samples_per_second": 460.992, |
| "eval_steps_per_second": 57.624, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.1443649184225426, |
| "grad_norm": 1.5603346824645996, |
| "learning_rate": 5.9779926642214075e-06, |
| "loss": 1.2412, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.1443649184225426, |
| "eval_loss": 1.1130963563919067, |
| "eval_runtime": 21.7087, |
| "eval_samples_per_second": 460.645, |
| "eval_steps_per_second": 57.581, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.1503801233568152, |
| "grad_norm": 1.6393444538116455, |
| "learning_rate": 5.976992330776926e-06, |
| "loss": 1.2159, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.1503801233568152, |
| "eval_loss": 1.0844037532806396, |
| "eval_runtime": 21.7041, |
| "eval_samples_per_second": 460.743, |
| "eval_steps_per_second": 57.593, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.1563953282910878, |
| "grad_norm": 1.638340950012207, |
| "learning_rate": 5.975991997332444e-06, |
| "loss": 1.1898, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.1563953282910878, |
| "eval_loss": 1.0646270513534546, |
| "eval_runtime": 21.7166, |
| "eval_samples_per_second": 460.476, |
| "eval_steps_per_second": 57.56, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.1624105332253604, |
| "grad_norm": 1.745104432106018, |
| "learning_rate": 5.974991663887963e-06, |
| "loss": 1.1708, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.1624105332253604, |
| "eval_loss": 1.0485780239105225, |
| "eval_runtime": 21.7198, |
| "eval_samples_per_second": 460.41, |
| "eval_steps_per_second": 57.551, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.16842573815963302, |
| "grad_norm": 1.759570837020874, |
| "learning_rate": 5.973991330443481e-06, |
| "loss": 1.1522, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.16842573815963302, |
| "eval_loss": 1.0218431949615479, |
| "eval_runtime": 21.7241, |
| "eval_samples_per_second": 460.318, |
| "eval_steps_per_second": 57.54, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.17444094309390562, |
| "grad_norm": 1.76418936252594, |
| "learning_rate": 5.972990996999e-06, |
| "loss": 1.1218, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.17444094309390562, |
| "eval_loss": 1.0075299739837646, |
| "eval_runtime": 21.7441, |
| "eval_samples_per_second": 459.894, |
| "eval_steps_per_second": 57.487, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.18045614802817822, |
| "grad_norm": 1.7186238765716553, |
| "learning_rate": 5.971990663554519e-06, |
| "loss": 1.1074, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.18045614802817822, |
| "eval_loss": 0.9909061789512634, |
| "eval_runtime": 21.7393, |
| "eval_samples_per_second": 459.997, |
| "eval_steps_per_second": 57.5, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.18647135296245085, |
| "grad_norm": 1.6869324445724487, |
| "learning_rate": 5.970990330110037e-06, |
| "loss": 1.0871, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.18647135296245085, |
| "eval_loss": 0.965461254119873, |
| "eval_runtime": 21.7525, |
| "eval_samples_per_second": 459.717, |
| "eval_steps_per_second": 57.465, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.19248655789672345, |
| "grad_norm": 1.590827465057373, |
| "learning_rate": 5.9699899966655554e-06, |
| "loss": 1.0678, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.19248655789672345, |
| "eval_loss": 0.9502421617507935, |
| "eval_runtime": 21.7257, |
| "eval_samples_per_second": 460.284, |
| "eval_steps_per_second": 57.536, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.19850176283099605, |
| "grad_norm": 1.3480803966522217, |
| "learning_rate": 5.968989663221074e-06, |
| "loss": 1.05, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.19850176283099605, |
| "eval_loss": 0.9217738509178162, |
| "eval_runtime": 21.7126, |
| "eval_samples_per_second": 460.562, |
| "eval_steps_per_second": 57.57, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.20451696776526865, |
| "grad_norm": 1.611717700958252, |
| "learning_rate": 5.967989329776592e-06, |
| "loss": 1.0308, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.20451696776526865, |
| "eval_loss": 0.9114508628845215, |
| "eval_runtime": 21.727, |
| "eval_samples_per_second": 460.257, |
| "eval_steps_per_second": 57.532, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.21053217269954128, |
| "grad_norm": 1.424517035484314, |
| "learning_rate": 5.966988996332111e-06, |
| "loss": 1.0161, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.21053217269954128, |
| "eval_loss": 0.8955187797546387, |
| "eval_runtime": 21.7397, |
| "eval_samples_per_second": 459.988, |
| "eval_steps_per_second": 57.498, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.21654737763381388, |
| "grad_norm": 1.8415201902389526, |
| "learning_rate": 5.965988662887629e-06, |
| "loss": 0.9983, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.21654737763381388, |
| "eval_loss": 0.8804967999458313, |
| "eval_runtime": 21.7608, |
| "eval_samples_per_second": 459.541, |
| "eval_steps_per_second": 57.443, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.22256258256808648, |
| "grad_norm": 1.5056076049804688, |
| "learning_rate": 5.964988329443148e-06, |
| "loss": 0.9849, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.22256258256808648, |
| "eval_loss": 0.8613883852958679, |
| "eval_runtime": 21.7348, |
| "eval_samples_per_second": 460.091, |
| "eval_steps_per_second": 57.511, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.22857778750235908, |
| "grad_norm": 1.6334686279296875, |
| "learning_rate": 5.963987995998667e-06, |
| "loss": 0.9689, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.22857778750235908, |
| "eval_loss": 0.8555884957313538, |
| "eval_runtime": 21.7044, |
| "eval_samples_per_second": 460.736, |
| "eval_steps_per_second": 57.592, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.2345929924366317, |
| "grad_norm": 1.7393226623535156, |
| "learning_rate": 5.962987662554185e-06, |
| "loss": 0.9564, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.2345929924366317, |
| "eval_loss": 0.8427873849868774, |
| "eval_runtime": 21.7376, |
| "eval_samples_per_second": 460.033, |
| "eval_steps_per_second": 57.504, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.2406081973709043, |
| "grad_norm": 1.5030866861343384, |
| "learning_rate": 5.961987329109703e-06, |
| "loss": 0.9417, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.2406081973709043, |
| "eval_loss": 0.8300994038581848, |
| "eval_runtime": 21.7438, |
| "eval_samples_per_second": 459.902, |
| "eval_steps_per_second": 57.488, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.2466234023051769, |
| "grad_norm": 1.8627735376358032, |
| "learning_rate": 5.960986995665222e-06, |
| "loss": 0.9277, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.2466234023051769, |
| "eval_loss": 0.8120391368865967, |
| "eval_runtime": 21.755, |
| "eval_samples_per_second": 459.664, |
| "eval_steps_per_second": 57.458, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.25263860723944953, |
| "grad_norm": 1.5174646377563477, |
| "learning_rate": 5.95998666222074e-06, |
| "loss": 0.9123, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.25263860723944953, |
| "eval_loss": 0.7974905371665955, |
| "eval_runtime": 21.7219, |
| "eval_samples_per_second": 460.366, |
| "eval_steps_per_second": 57.546, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.25865381217372213, |
| "grad_norm": 1.354490041732788, |
| "learning_rate": 5.958986328776259e-06, |
| "loss": 0.9028, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.25865381217372213, |
| "eval_loss": 0.7938092947006226, |
| "eval_runtime": 21.7182, |
| "eval_samples_per_second": 460.444, |
| "eval_steps_per_second": 57.555, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.26466901710799473, |
| "grad_norm": 1.6153218746185303, |
| "learning_rate": 5.957985995331777e-06, |
| "loss": 0.8954, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.26466901710799473, |
| "eval_loss": 0.7785645723342896, |
| "eval_runtime": 21.7451, |
| "eval_samples_per_second": 459.873, |
| "eval_steps_per_second": 57.484, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.27068422204226733, |
| "grad_norm": 1.9774231910705566, |
| "learning_rate": 5.956985661887296e-06, |
| "loss": 0.8819, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.27068422204226733, |
| "eval_loss": 0.7742797136306763, |
| "eval_runtime": 21.7358, |
| "eval_samples_per_second": 460.07, |
| "eval_steps_per_second": 57.509, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.27669942697653993, |
| "grad_norm": 1.6561676263809204, |
| "learning_rate": 5.955985328442815e-06, |
| "loss": 0.8729, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.27669942697653993, |
| "eval_loss": 0.7637073397636414, |
| "eval_runtime": 21.7296, |
| "eval_samples_per_second": 460.202, |
| "eval_steps_per_second": 57.525, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.28271463191081253, |
| "grad_norm": 1.5622860193252563, |
| "learning_rate": 5.954984994998333e-06, |
| "loss": 0.8608, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.28271463191081253, |
| "eval_loss": 0.7628427743911743, |
| "eval_runtime": 21.7339, |
| "eval_samples_per_second": 460.111, |
| "eval_steps_per_second": 57.514, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.2887298368450852, |
| "grad_norm": 1.6501961946487427, |
| "learning_rate": 5.953984661553851e-06, |
| "loss": 0.8489, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.2887298368450852, |
| "eval_loss": 0.7505598068237305, |
| "eval_runtime": 21.7165, |
| "eval_samples_per_second": 460.479, |
| "eval_steps_per_second": 57.56, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.2947450417793578, |
| "grad_norm": 1.7538303136825562, |
| "learning_rate": 5.95298432810937e-06, |
| "loss": 0.8401, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.2947450417793578, |
| "eval_loss": 0.7426216006278992, |
| "eval_runtime": 21.7216, |
| "eval_samples_per_second": 460.37, |
| "eval_steps_per_second": 57.546, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.3007602467136304, |
| "grad_norm": 1.5520670413970947, |
| "learning_rate": 5.951983994664888e-06, |
| "loss": 0.8361, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.3007602467136304, |
| "eval_loss": 0.7258592247962952, |
| "eval_runtime": 21.7177, |
| "eval_samples_per_second": 460.454, |
| "eval_steps_per_second": 57.557, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.306775451647903, |
| "grad_norm": 2.0393898487091064, |
| "learning_rate": 5.950983661220407e-06, |
| "loss": 0.8273, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.306775451647903, |
| "eval_loss": 0.7185364365577698, |
| "eval_runtime": 21.6894, |
| "eval_samples_per_second": 461.054, |
| "eval_steps_per_second": 57.632, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.3127906565821756, |
| "grad_norm": 1.9601730108261108, |
| "learning_rate": 5.949983327775925e-06, |
| "loss": 0.8135, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.3127906565821756, |
| "eval_loss": 0.7162497639656067, |
| "eval_runtime": 21.7614, |
| "eval_samples_per_second": 459.53, |
| "eval_steps_per_second": 57.441, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.3188058615164482, |
| "grad_norm": 1.4966851472854614, |
| "learning_rate": 5.948982994331444e-06, |
| "loss": 0.8037, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.3188058615164482, |
| "eval_loss": 0.7116673588752747, |
| "eval_runtime": 21.8409, |
| "eval_samples_per_second": 457.857, |
| "eval_steps_per_second": 57.232, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.3248210664507208, |
| "grad_norm": 1.4574569463729858, |
| "learning_rate": 5.947982660886963e-06, |
| "loss": 0.8027, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.3248210664507208, |
| "eval_loss": 0.6981866359710693, |
| "eval_runtime": 21.7832, |
| "eval_samples_per_second": 459.07, |
| "eval_steps_per_second": 57.384, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.33083627138499344, |
| "grad_norm": 1.5823230743408203, |
| "learning_rate": 5.9469823274424815e-06, |
| "loss": 0.7898, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.33083627138499344, |
| "eval_loss": 0.6950494050979614, |
| "eval_runtime": 21.754, |
| "eval_samples_per_second": 459.685, |
| "eval_steps_per_second": 57.461, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.33685147631926604, |
| "grad_norm": 1.5350251197814941, |
| "learning_rate": 5.945981993997999e-06, |
| "loss": 0.7829, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.33685147631926604, |
| "eval_loss": 0.6908562183380127, |
| "eval_runtime": 21.6939, |
| "eval_samples_per_second": 460.958, |
| "eval_steps_per_second": 57.62, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.34286668125353864, |
| "grad_norm": 1.5343948602676392, |
| "learning_rate": 5.944981660553518e-06, |
| "loss": 0.7778, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.34286668125353864, |
| "eval_loss": 0.6897854208946228, |
| "eval_runtime": 21.687, |
| "eval_samples_per_second": 461.107, |
| "eval_steps_per_second": 57.638, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.34888188618781124, |
| "grad_norm": 1.6000343561172485, |
| "learning_rate": 5.943981327109036e-06, |
| "loss": 0.7672, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.34888188618781124, |
| "eval_loss": 0.6832409501075745, |
| "eval_runtime": 21.7047, |
| "eval_samples_per_second": 460.73, |
| "eval_steps_per_second": 57.591, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.35489709112208384, |
| "grad_norm": 1.3873372077941895, |
| "learning_rate": 5.942980993664555e-06, |
| "loss": 0.7645, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.35489709112208384, |
| "eval_loss": 0.6712300777435303, |
| "eval_runtime": 21.707, |
| "eval_samples_per_second": 460.681, |
| "eval_steps_per_second": 57.585, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.36091229605635644, |
| "grad_norm": 1.5178308486938477, |
| "learning_rate": 5.941980660220073e-06, |
| "loss": 0.756, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.36091229605635644, |
| "eval_loss": 0.6661484241485596, |
| "eval_runtime": 21.7032, |
| "eval_samples_per_second": 460.761, |
| "eval_steps_per_second": 57.595, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.36692750099062904, |
| "grad_norm": 1.4745811223983765, |
| "learning_rate": 5.940980326775592e-06, |
| "loss": 0.753, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.36692750099062904, |
| "eval_loss": 0.664915144443512, |
| "eval_runtime": 21.7252, |
| "eval_samples_per_second": 460.294, |
| "eval_steps_per_second": 57.537, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.3729427059249017, |
| "grad_norm": 1.6472891569137573, |
| "learning_rate": 5.939979993331111e-06, |
| "loss": 0.743, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.3729427059249017, |
| "eval_loss": 0.6596666574478149, |
| "eval_runtime": 21.6717, |
| "eval_samples_per_second": 461.432, |
| "eval_steps_per_second": 57.679, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.3789579108591743, |
| "grad_norm": 1.4315409660339355, |
| "learning_rate": 5.9389796598866294e-06, |
| "loss": 0.737, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.3789579108591743, |
| "eval_loss": 0.6593905091285706, |
| "eval_runtime": 21.8558, |
| "eval_samples_per_second": 457.545, |
| "eval_steps_per_second": 57.193, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.3849731157934469, |
| "grad_norm": 1.553122639656067, |
| "learning_rate": 5.937979326442147e-06, |
| "loss": 0.7284, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.3849731157934469, |
| "eval_loss": 0.6500257253646851, |
| "eval_runtime": 21.725, |
| "eval_samples_per_second": 460.298, |
| "eval_steps_per_second": 57.537, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.3909883207277195, |
| "grad_norm": 1.4755713939666748, |
| "learning_rate": 5.936978992997666e-06, |
| "loss": 0.7253, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.3909883207277195, |
| "eval_loss": 0.6457264423370361, |
| "eval_runtime": 21.6694, |
| "eval_samples_per_second": 461.48, |
| "eval_steps_per_second": 57.685, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.3970035256619921, |
| "grad_norm": 1.3153866529464722, |
| "learning_rate": 5.935978659553185e-06, |
| "loss": 0.7227, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.3970035256619921, |
| "eval_loss": 0.6387376189231873, |
| "eval_runtime": 21.6735, |
| "eval_samples_per_second": 461.393, |
| "eval_steps_per_second": 57.674, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.4030187305962647, |
| "grad_norm": 1.3349621295928955, |
| "learning_rate": 5.9349783261087026e-06, |
| "loss": 0.7161, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.4030187305962647, |
| "eval_loss": 0.6228384971618652, |
| "eval_runtime": 21.6839, |
| "eval_samples_per_second": 461.172, |
| "eval_steps_per_second": 57.646, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.4090339355305373, |
| "grad_norm": 1.4209269285202026, |
| "learning_rate": 5.933977992664221e-06, |
| "loss": 0.7101, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.4090339355305373, |
| "eval_loss": 0.6393507719039917, |
| "eval_runtime": 21.6842, |
| "eval_samples_per_second": 461.165, |
| "eval_steps_per_second": 57.646, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.4150491404648099, |
| "grad_norm": 1.3392629623413086, |
| "learning_rate": 5.93297765921974e-06, |
| "loss": 0.7043, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.4150491404648099, |
| "eval_loss": 0.6370413303375244, |
| "eval_runtime": 21.6802, |
| "eval_samples_per_second": 461.251, |
| "eval_steps_per_second": 57.656, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.42106434539908255, |
| "grad_norm": 1.420782446861267, |
| "learning_rate": 5.931977325775259e-06, |
| "loss": 0.6976, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.42106434539908255, |
| "eval_loss": 0.6197584867477417, |
| "eval_runtime": 21.6736, |
| "eval_samples_per_second": 461.391, |
| "eval_steps_per_second": 57.674, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.42707955033335515, |
| "grad_norm": 1.3362140655517578, |
| "learning_rate": 5.930976992330777e-06, |
| "loss": 0.6938, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.42707955033335515, |
| "eval_loss": 0.6171865463256836, |
| "eval_runtime": 21.6908, |
| "eval_samples_per_second": 461.024, |
| "eval_steps_per_second": 57.628, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.43309475526762775, |
| "grad_norm": 1.2855477333068848, |
| "learning_rate": 5.929976658886295e-06, |
| "loss": 0.6897, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.43309475526762775, |
| "eval_loss": 0.6011925935745239, |
| "eval_runtime": 21.6697, |
| "eval_samples_per_second": 461.474, |
| "eval_steps_per_second": 57.684, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.43910996020190035, |
| "grad_norm": 1.6744885444641113, |
| "learning_rate": 5.928976325441814e-06, |
| "loss": 0.6815, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.43910996020190035, |
| "eval_loss": 0.606606662273407, |
| "eval_runtime": 21.7361, |
| "eval_samples_per_second": 460.064, |
| "eval_steps_per_second": 57.508, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.44512516513617295, |
| "grad_norm": 1.4268521070480347, |
| "learning_rate": 5.927975991997333e-06, |
| "loss": 0.6785, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.44512516513617295, |
| "eval_loss": 0.6065685749053955, |
| "eval_runtime": 21.7924, |
| "eval_samples_per_second": 458.876, |
| "eval_steps_per_second": 57.359, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.45114037007044555, |
| "grad_norm": 1.248145341873169, |
| "learning_rate": 5.9269756585528505e-06, |
| "loss": 0.6734, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.45114037007044555, |
| "eval_loss": 0.5927532911300659, |
| "eval_runtime": 21.7131, |
| "eval_samples_per_second": 460.551, |
| "eval_steps_per_second": 57.569, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.45715557500471815, |
| "grad_norm": 1.3543365001678467, |
| "learning_rate": 5.92597532510837e-06, |
| "loss": 0.6692, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.45715557500471815, |
| "eval_loss": 0.584913432598114, |
| "eval_runtime": 21.6765, |
| "eval_samples_per_second": 461.329, |
| "eval_steps_per_second": 57.666, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.4631707799389908, |
| "grad_norm": 1.519895315170288, |
| "learning_rate": 5.924974991663888e-06, |
| "loss": 0.6683, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.4631707799389908, |
| "eval_loss": 0.5899286270141602, |
| "eval_runtime": 21.7078, |
| "eval_samples_per_second": 460.664, |
| "eval_steps_per_second": 57.583, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.4691859848732634, |
| "grad_norm": 1.3677542209625244, |
| "learning_rate": 5.923974658219407e-06, |
| "loss": 0.6612, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.4691859848732634, |
| "eval_loss": 0.5877178907394409, |
| "eval_runtime": 21.699, |
| "eval_samples_per_second": 460.851, |
| "eval_steps_per_second": 57.606, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.475201189807536, |
| "grad_norm": 1.3020201921463013, |
| "learning_rate": 5.922974324774925e-06, |
| "loss": 0.6593, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.475201189807536, |
| "eval_loss": 0.5901273488998413, |
| "eval_runtime": 21.6975, |
| "eval_samples_per_second": 460.883, |
| "eval_steps_per_second": 57.61, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.4812163947418086, |
| "grad_norm": 1.2522666454315186, |
| "learning_rate": 5.921973991330443e-06, |
| "loss": 0.6515, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.4812163947418086, |
| "eval_loss": 0.5791921019554138, |
| "eval_runtime": 21.6482, |
| "eval_samples_per_second": 461.932, |
| "eval_steps_per_second": 57.741, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.4872315996760812, |
| "grad_norm": 1.7226676940917969, |
| "learning_rate": 5.920973657885962e-06, |
| "loss": 0.6497, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.4872315996760812, |
| "eval_loss": 0.5783876776695251, |
| "eval_runtime": 21.8009, |
| "eval_samples_per_second": 458.696, |
| "eval_steps_per_second": 57.337, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.4932468046103538, |
| "grad_norm": 1.4653980731964111, |
| "learning_rate": 5.919973324441481e-06, |
| "loss": 0.6463, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.4932468046103538, |
| "eval_loss": 0.5752367973327637, |
| "eval_runtime": 21.7179, |
| "eval_samples_per_second": 460.45, |
| "eval_steps_per_second": 57.556, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.4992620095446264, |
| "grad_norm": 1.3331021070480347, |
| "learning_rate": 5.918972990996999e-06, |
| "loss": 0.6412, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.4992620095446264, |
| "eval_loss": 0.5725879669189453, |
| "eval_runtime": 21.7719, |
| "eval_samples_per_second": 459.308, |
| "eval_steps_per_second": 57.414, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.5052772144788991, |
| "grad_norm": 1.245968222618103, |
| "learning_rate": 5.917972657552518e-06, |
| "loss": 0.64, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.5052772144788991, |
| "eval_loss": 0.5639936923980713, |
| "eval_runtime": 21.7448, |
| "eval_samples_per_second": 459.88, |
| "eval_steps_per_second": 57.485, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.5112924194131716, |
| "grad_norm": 1.269049882888794, |
| "learning_rate": 5.916972324108037e-06, |
| "loss": 0.6341, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.5112924194131716, |
| "eval_loss": 0.5605804324150085, |
| "eval_runtime": 21.7116, |
| "eval_samples_per_second": 460.582, |
| "eval_steps_per_second": 57.573, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.5173076243474443, |
| "grad_norm": 1.2048168182373047, |
| "learning_rate": 5.915971990663555e-06, |
| "loss": 0.6327, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.5173076243474443, |
| "eval_loss": 0.5681275129318237, |
| "eval_runtime": 21.7037, |
| "eval_samples_per_second": 460.751, |
| "eval_steps_per_second": 57.594, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.5233228292817168, |
| "grad_norm": 1.269063949584961, |
| "learning_rate": 5.914971657219073e-06, |
| "loss": 0.6251, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.5233228292817168, |
| "eval_loss": 0.5644165277481079, |
| "eval_runtime": 21.6949, |
| "eval_samples_per_second": 460.937, |
| "eval_steps_per_second": 57.617, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.5293380342159895, |
| "grad_norm": 1.3928773403167725, |
| "learning_rate": 5.913971323774591e-06, |
| "loss": 0.6268, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.5293380342159895, |
| "eval_loss": 0.5452607870101929, |
| "eval_runtime": 21.7013, |
| "eval_samples_per_second": 460.803, |
| "eval_steps_per_second": 57.6, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.5353532391502621, |
| "grad_norm": 1.6263777017593384, |
| "learning_rate": 5.91297099033011e-06, |
| "loss": 0.6198, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.5353532391502621, |
| "eval_loss": 0.5565773248672485, |
| "eval_runtime": 21.7101, |
| "eval_samples_per_second": 460.615, |
| "eval_steps_per_second": 57.577, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.5413684440845347, |
| "grad_norm": 1.312068223953247, |
| "learning_rate": 5.911970656885629e-06, |
| "loss": 0.6168, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.5413684440845347, |
| "eval_loss": 0.544517457485199, |
| "eval_runtime": 21.6689, |
| "eval_samples_per_second": 461.49, |
| "eval_steps_per_second": 57.686, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.5473836490188073, |
| "grad_norm": 1.4878406524658203, |
| "learning_rate": 5.910970323441147e-06, |
| "loss": 0.6168, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.5473836490188073, |
| "eval_loss": 0.5467077493667603, |
| "eval_runtime": 21.7585, |
| "eval_samples_per_second": 459.591, |
| "eval_steps_per_second": 57.449, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.5533988539530799, |
| "grad_norm": 1.4762675762176514, |
| "learning_rate": 5.909969989996666e-06, |
| "loss": 0.6062, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.5533988539530799, |
| "eval_loss": 0.5416296720504761, |
| "eval_runtime": 21.7398, |
| "eval_samples_per_second": 459.985, |
| "eval_steps_per_second": 57.498, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.5594140588873525, |
| "grad_norm": 1.3053025007247925, |
| "learning_rate": 5.908969656552185e-06, |
| "loss": 0.6106, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.5594140588873525, |
| "eval_loss": 0.5386621356010437, |
| "eval_runtime": 21.7444, |
| "eval_samples_per_second": 459.888, |
| "eval_steps_per_second": 57.486, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.5654292638216251, |
| "grad_norm": 1.5423814058303833, |
| "learning_rate": 5.907969323107703e-06, |
| "loss": 0.6019, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.5654292638216251, |
| "eval_loss": 0.5405033230781555, |
| "eval_runtime": 21.726, |
| "eval_samples_per_second": 460.277, |
| "eval_steps_per_second": 57.535, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.5714444687558977, |
| "grad_norm": 1.4696613550186157, |
| "learning_rate": 5.906968989663221e-06, |
| "loss": 0.6011, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.5714444687558977, |
| "eval_loss": 0.5457667708396912, |
| "eval_runtime": 21.7773, |
| "eval_samples_per_second": 459.193, |
| "eval_steps_per_second": 57.399, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.5774596736901704, |
| "grad_norm": 1.5349172353744507, |
| "learning_rate": 5.90596865621874e-06, |
| "loss": 0.5961, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.5774596736901704, |
| "eval_loss": 0.533613920211792, |
| "eval_runtime": 21.9838, |
| "eval_samples_per_second": 454.88, |
| "eval_steps_per_second": 56.86, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.5834748786244429, |
| "grad_norm": 1.2024816274642944, |
| "learning_rate": 5.904968322774258e-06, |
| "loss": 0.593, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.5834748786244429, |
| "eval_loss": 0.5246294140815735, |
| "eval_runtime": 22.5017, |
| "eval_samples_per_second": 444.411, |
| "eval_steps_per_second": 55.551, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.5894900835587156, |
| "grad_norm": 1.2983571290969849, |
| "learning_rate": 5.9039679893297766e-06, |
| "loss": 0.5925, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.5894900835587156, |
| "eval_loss": 0.5254473686218262, |
| "eval_runtime": 23.0942, |
| "eval_samples_per_second": 433.009, |
| "eval_steps_per_second": 54.126, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.5955052884929881, |
| "grad_norm": 1.2889515161514282, |
| "learning_rate": 5.902967655885295e-06, |
| "loss": 0.5911, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.5955052884929881, |
| "eval_loss": 0.5365324020385742, |
| "eval_runtime": 23.3271, |
| "eval_samples_per_second": 428.686, |
| "eval_steps_per_second": 53.586, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.6015204934272608, |
| "grad_norm": 1.3131366968154907, |
| "learning_rate": 5.901967322440814e-06, |
| "loss": 0.5843, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.6015204934272608, |
| "eval_loss": 0.5123865008354187, |
| "eval_runtime": 23.435, |
| "eval_samples_per_second": 426.712, |
| "eval_steps_per_second": 53.339, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.6075356983615333, |
| "grad_norm": 1.3315032720565796, |
| "learning_rate": 5.900966988996333e-06, |
| "loss": 0.5832, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.6075356983615333, |
| "eval_loss": 0.5256994962692261, |
| "eval_runtime": 23.4061, |
| "eval_samples_per_second": 427.24, |
| "eval_steps_per_second": 53.405, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.613550903295806, |
| "grad_norm": 1.3008897304534912, |
| "learning_rate": 5.8999666555518505e-06, |
| "loss": 0.582, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.613550903295806, |
| "eval_loss": 0.5148985981941223, |
| "eval_runtime": 23.4451, |
| "eval_samples_per_second": 426.528, |
| "eval_steps_per_second": 53.316, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.6195661082300786, |
| "grad_norm": 1.272538423538208, |
| "learning_rate": 5.898966322107369e-06, |
| "loss": 0.5789, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.6195661082300786, |
| "eval_loss": 0.5160868763923645, |
| "eval_runtime": 23.3699, |
| "eval_samples_per_second": 427.901, |
| "eval_steps_per_second": 53.488, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.6255813131643512, |
| "grad_norm": 1.38733971118927, |
| "learning_rate": 5.897965988662888e-06, |
| "loss": 0.5768, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.6255813131643512, |
| "eval_loss": 0.5101234912872314, |
| "eval_runtime": 23.5052, |
| "eval_samples_per_second": 425.437, |
| "eval_steps_per_second": 53.18, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.6315965180986238, |
| "grad_norm": 1.3414686918258667, |
| "learning_rate": 5.896965655218406e-06, |
| "loss": 0.5728, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.6315965180986238, |
| "eval_loss": 0.5151140689849854, |
| "eval_runtime": 23.1483, |
| "eval_samples_per_second": 431.997, |
| "eval_steps_per_second": 54.0, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.6376117230328964, |
| "grad_norm": 1.2821862697601318, |
| "learning_rate": 5.8959653217739245e-06, |
| "loss": 0.5732, |
| "step": 10600 |
| }, |
| { |
| "epoch": 0.6376117230328964, |
| "eval_loss": 0.5067505240440369, |
| "eval_runtime": 23.3046, |
| "eval_samples_per_second": 429.099, |
| "eval_steps_per_second": 53.637, |
| "step": 10600 |
| }, |
| { |
| "epoch": 0.643626927967169, |
| "grad_norm": 1.4687350988388062, |
| "learning_rate": 5.894964988329443e-06, |
| "loss": 0.568, |
| "step": 10700 |
| }, |
| { |
| "epoch": 0.643626927967169, |
| "eval_loss": 0.5038474798202515, |
| "eval_runtime": 48.8496, |
| "eval_samples_per_second": 204.71, |
| "eval_steps_per_second": 25.589, |
| "step": 10700 |
| }, |
| { |
| "epoch": 0.6496421329014416, |
| "grad_norm": 1.1854100227355957, |
| "learning_rate": 5.893964654884962e-06, |
| "loss": 0.5665, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.6496421329014416, |
| "eval_loss": 0.5092170834541321, |
| "eval_runtime": 51.2918, |
| "eval_samples_per_second": 194.963, |
| "eval_steps_per_second": 24.37, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.6556573378357142, |
| "grad_norm": 1.2117469310760498, |
| "learning_rate": 5.892964321440481e-06, |
| "loss": 0.5641, |
| "step": 10900 |
| }, |
| { |
| "epoch": 0.6556573378357142, |
| "eval_loss": 0.4948270618915558, |
| "eval_runtime": 51.7341, |
| "eval_samples_per_second": 193.296, |
| "eval_steps_per_second": 24.162, |
| "step": 10900 |
| }, |
| { |
| "epoch": 0.6616725427699869, |
| "grad_norm": 1.1809200048446655, |
| "learning_rate": 5.8919639879959985e-06, |
| "loss": 0.559, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.6616725427699869, |
| "eval_loss": 0.49759823083877563, |
| "eval_runtime": 50.8828, |
| "eval_samples_per_second": 196.53, |
| "eval_steps_per_second": 24.566, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.6676877477042594, |
| "grad_norm": 1.4321728944778442, |
| "learning_rate": 5.890963654551517e-06, |
| "loss": 0.5597, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.6676877477042594, |
| "eval_loss": 0.49609047174453735, |
| "eval_runtime": 51.278, |
| "eval_samples_per_second": 195.015, |
| "eval_steps_per_second": 24.377, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.6737029526385321, |
| "grad_norm": 1.3043360710144043, |
| "learning_rate": 5.889963321107036e-06, |
| "loss": 0.5574, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.6737029526385321, |
| "eval_loss": 0.5004040002822876, |
| "eval_runtime": 50.7636, |
| "eval_samples_per_second": 196.992, |
| "eval_steps_per_second": 24.624, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.6797181575728046, |
| "grad_norm": 1.2415975332260132, |
| "learning_rate": 5.888962987662554e-06, |
| "loss": 0.5555, |
| "step": 11300 |
| }, |
| { |
| "epoch": 0.6797181575728046, |
| "eval_loss": 0.5004035234451294, |
| "eval_runtime": 51.3686, |
| "eval_samples_per_second": 194.672, |
| "eval_steps_per_second": 24.334, |
| "step": 11300 |
| }, |
| { |
| "epoch": 0.6857333625070773, |
| "grad_norm": 1.1731830835342407, |
| "learning_rate": 5.8879626542180725e-06, |
| "loss": 0.5541, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.6857333625070773, |
| "eval_loss": 0.4998365342617035, |
| "eval_runtime": 50.9083, |
| "eval_samples_per_second": 196.432, |
| "eval_steps_per_second": 24.554, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.6917485674413498, |
| "grad_norm": 1.2296881675720215, |
| "learning_rate": 5.886962320773592e-06, |
| "loss": 0.5487, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.6917485674413498, |
| "eval_loss": 0.4932882785797119, |
| "eval_runtime": 50.9764, |
| "eval_samples_per_second": 196.169, |
| "eval_steps_per_second": 24.521, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.6977637723756225, |
| "grad_norm": 1.4027659893035889, |
| "learning_rate": 5.88596198732911e-06, |
| "loss": 0.5488, |
| "step": 11600 |
| }, |
| { |
| "epoch": 0.6977637723756225, |
| "eval_loss": 0.48723334074020386, |
| "eval_runtime": 51.3087, |
| "eval_samples_per_second": 194.899, |
| "eval_steps_per_second": 24.362, |
| "step": 11600 |
| }, |
| { |
| "epoch": 0.7037789773098951, |
| "grad_norm": 1.345869541168213, |
| "learning_rate": 5.884961653884629e-06, |
| "loss": 0.5464, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.7037789773098951, |
| "eval_loss": 0.48902279138565063, |
| "eval_runtime": 51.5761, |
| "eval_samples_per_second": 193.888, |
| "eval_steps_per_second": 24.236, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.7097941822441677, |
| "grad_norm": 1.3029801845550537, |
| "learning_rate": 5.8839613204401465e-06, |
| "loss": 0.545, |
| "step": 11800 |
| }, |
| { |
| "epoch": 0.7097941822441677, |
| "eval_loss": 0.4815163016319275, |
| "eval_runtime": 51.0467, |
| "eval_samples_per_second": 195.899, |
| "eval_steps_per_second": 24.487, |
| "step": 11800 |
| }, |
| { |
| "epoch": 0.7158093871784403, |
| "grad_norm": 1.3300397396087646, |
| "learning_rate": 5.882960986995665e-06, |
| "loss": 0.5406, |
| "step": 11900 |
| }, |
| { |
| "epoch": 0.7158093871784403, |
| "eval_loss": 0.4828699231147766, |
| "eval_runtime": 50.6859, |
| "eval_samples_per_second": 197.294, |
| "eval_steps_per_second": 24.662, |
| "step": 11900 |
| }, |
| { |
| "epoch": 0.7218245921127129, |
| "grad_norm": 1.3354322910308838, |
| "learning_rate": 5.881960653551184e-06, |
| "loss": 0.5412, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.7218245921127129, |
| "eval_loss": 0.4760846197605133, |
| "eval_runtime": 51.0095, |
| "eval_samples_per_second": 196.042, |
| "eval_steps_per_second": 24.505, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.7278397970469855, |
| "grad_norm": 1.2316620349884033, |
| "learning_rate": 5.880960320106702e-06, |
| "loss": 0.5354, |
| "step": 12100 |
| }, |
| { |
| "epoch": 0.7278397970469855, |
| "eval_loss": 0.49535489082336426, |
| "eval_runtime": 51.064, |
| "eval_samples_per_second": 195.833, |
| "eval_steps_per_second": 24.479, |
| "step": 12100 |
| }, |
| { |
| "epoch": 0.7338550019812581, |
| "grad_norm": 1.2033593654632568, |
| "learning_rate": 5.879959986662221e-06, |
| "loss": 0.5343, |
| "step": 12200 |
| }, |
| { |
| "epoch": 0.7338550019812581, |
| "eval_loss": 0.4705411195755005, |
| "eval_runtime": 50.9982, |
| "eval_samples_per_second": 196.085, |
| "eval_steps_per_second": 24.511, |
| "step": 12200 |
| }, |
| { |
| "epoch": 0.7398702069155307, |
| "grad_norm": 1.2634704113006592, |
| "learning_rate": 5.87895965321774e-06, |
| "loss": 0.5337, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.7398702069155307, |
| "eval_loss": 0.47791826725006104, |
| "eval_runtime": 51.1718, |
| "eval_samples_per_second": 195.42, |
| "eval_steps_per_second": 24.428, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.7458854118498034, |
| "grad_norm": 1.2546501159667969, |
| "learning_rate": 5.877959319773258e-06, |
| "loss": 0.5324, |
| "step": 12400 |
| }, |
| { |
| "epoch": 0.7458854118498034, |
| "eval_loss": 0.4756995737552643, |
| "eval_runtime": 51.0651, |
| "eval_samples_per_second": 195.828, |
| "eval_steps_per_second": 24.479, |
| "step": 12400 |
| }, |
| { |
| "epoch": 0.7519006167840759, |
| "grad_norm": 1.1833654642105103, |
| "learning_rate": 5.876958986328777e-06, |
| "loss": 0.5299, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.7519006167840759, |
| "eval_loss": 0.47130194306373596, |
| "eval_runtime": 51.0775, |
| "eval_samples_per_second": 195.781, |
| "eval_steps_per_second": 24.473, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.7579158217183486, |
| "grad_norm": 1.0535800457000732, |
| "learning_rate": 5.875958652884295e-06, |
| "loss": 0.5288, |
| "step": 12600 |
| }, |
| { |
| "epoch": 0.7579158217183486, |
| "eval_loss": 0.46586230397224426, |
| "eval_runtime": 51.3884, |
| "eval_samples_per_second": 194.596, |
| "eval_steps_per_second": 24.325, |
| "step": 12600 |
| }, |
| { |
| "epoch": 0.7639310266526211, |
| "grad_norm": 1.2561872005462646, |
| "learning_rate": 5.874958319439813e-06, |
| "loss": 0.5297, |
| "step": 12700 |
| }, |
| { |
| "epoch": 0.7639310266526211, |
| "eval_loss": 0.4665389657020569, |
| "eval_runtime": 51.1355, |
| "eval_samples_per_second": 195.559, |
| "eval_steps_per_second": 24.445, |
| "step": 12700 |
| }, |
| { |
| "epoch": 0.7699462315868938, |
| "grad_norm": 1.177007794380188, |
| "learning_rate": 5.873957985995332e-06, |
| "loss": 0.5326, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.7699462315868938, |
| "eval_loss": 0.4671100676059723, |
| "eval_runtime": 51.3263, |
| "eval_samples_per_second": 194.832, |
| "eval_steps_per_second": 24.354, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.7759614365211663, |
| "grad_norm": 1.181401252746582, |
| "learning_rate": 5.8729576525508506e-06, |
| "loss": 0.5222, |
| "step": 12900 |
| }, |
| { |
| "epoch": 0.7759614365211663, |
| "eval_loss": 0.4585270583629608, |
| "eval_runtime": 51.1292, |
| "eval_samples_per_second": 195.583, |
| "eval_steps_per_second": 24.448, |
| "step": 12900 |
| }, |
| { |
| "epoch": 0.781976641455439, |
| "grad_norm": 1.108788013458252, |
| "learning_rate": 5.871957319106369e-06, |
| "loss": 0.5202, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.781976641455439, |
| "eval_loss": 0.46135467290878296, |
| "eval_runtime": 51.1302, |
| "eval_samples_per_second": 195.579, |
| "eval_steps_per_second": 24.447, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.7879918463897116, |
| "grad_norm": 1.152575969696045, |
| "learning_rate": 5.870956985661888e-06, |
| "loss": 0.5157, |
| "step": 13100 |
| }, |
| { |
| "epoch": 0.7879918463897116, |
| "eval_loss": 0.46781352162361145, |
| "eval_runtime": 51.1065, |
| "eval_samples_per_second": 195.67, |
| "eval_steps_per_second": 24.459, |
| "step": 13100 |
| }, |
| { |
| "epoch": 0.7940070513239842, |
| "grad_norm": 1.1765929460525513, |
| "learning_rate": 5.869956652217406e-06, |
| "loss": 0.5177, |
| "step": 13200 |
| }, |
| { |
| "epoch": 0.7940070513239842, |
| "eval_loss": 0.4588942527770996, |
| "eval_runtime": 51.1353, |
| "eval_samples_per_second": 195.56, |
| "eval_steps_per_second": 24.445, |
| "step": 13200 |
| }, |
| { |
| "epoch": 0.8000222562582568, |
| "grad_norm": 1.1165159940719604, |
| "learning_rate": 5.8689563187729245e-06, |
| "loss": 0.5141, |
| "step": 13300 |
| }, |
| { |
| "epoch": 0.8000222562582568, |
| "eval_loss": 0.4517599046230316, |
| "eval_runtime": 51.1096, |
| "eval_samples_per_second": 195.658, |
| "eval_steps_per_second": 24.457, |
| "step": 13300 |
| }, |
| { |
| "epoch": 0.8060374611925294, |
| "grad_norm": 1.0414021015167236, |
| "learning_rate": 5.867955985328443e-06, |
| "loss": 0.5135, |
| "step": 13400 |
| }, |
| { |
| "epoch": 0.8060374611925294, |
| "eval_loss": 0.46558651328086853, |
| "eval_runtime": 51.1277, |
| "eval_samples_per_second": 195.589, |
| "eval_steps_per_second": 24.449, |
| "step": 13400 |
| }, |
| { |
| "epoch": 0.812052666126802, |
| "grad_norm": 1.3002249002456665, |
| "learning_rate": 5.866955651883961e-06, |
| "loss": 0.5124, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.812052666126802, |
| "eval_loss": 0.4563812017440796, |
| "eval_runtime": 51.132, |
| "eval_samples_per_second": 195.572, |
| "eval_steps_per_second": 24.447, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.8180678710610746, |
| "grad_norm": 1.5342046022415161, |
| "learning_rate": 5.86595531843948e-06, |
| "loss": 0.5101, |
| "step": 13600 |
| }, |
| { |
| "epoch": 0.8180678710610746, |
| "eval_loss": 0.44918256998062134, |
| "eval_runtime": 51.2205, |
| "eval_samples_per_second": 195.234, |
| "eval_steps_per_second": 24.404, |
| "step": 13600 |
| }, |
| { |
| "epoch": 0.8240830759953472, |
| "grad_norm": 1.312056064605713, |
| "learning_rate": 5.8649549849949985e-06, |
| "loss": 0.5087, |
| "step": 13700 |
| }, |
| { |
| "epoch": 0.8240830759953472, |
| "eval_loss": 0.45463162660598755, |
| "eval_runtime": 50.988, |
| "eval_samples_per_second": 196.125, |
| "eval_steps_per_second": 24.516, |
| "step": 13700 |
| }, |
| { |
| "epoch": 0.8300982809296198, |
| "grad_norm": 1.4413928985595703, |
| "learning_rate": 5.863954651550517e-06, |
| "loss": 0.5079, |
| "step": 13800 |
| }, |
| { |
| "epoch": 0.8300982809296198, |
| "eval_loss": 0.4562767446041107, |
| "eval_runtime": 51.212, |
| "eval_samples_per_second": 195.267, |
| "eval_steps_per_second": 24.408, |
| "step": 13800 |
| }, |
| { |
| "epoch": 0.8361134858638924, |
| "grad_norm": 1.3391541242599487, |
| "learning_rate": 5.862954318106036e-06, |
| "loss": 0.5077, |
| "step": 13900 |
| }, |
| { |
| "epoch": 0.8361134858638924, |
| "eval_loss": 0.44607582688331604, |
| "eval_runtime": 51.1173, |
| "eval_samples_per_second": 195.628, |
| "eval_steps_per_second": 24.454, |
| "step": 13900 |
| }, |
| { |
| "epoch": 0.8421286907981651, |
| "grad_norm": 1.2158905267715454, |
| "learning_rate": 5.861953984661554e-06, |
| "loss": 0.5032, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.8421286907981651, |
| "eval_loss": 0.4587889611721039, |
| "eval_runtime": 51.1702, |
| "eval_samples_per_second": 195.426, |
| "eval_steps_per_second": 24.428, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.8481438957324376, |
| "grad_norm": 1.1938725709915161, |
| "learning_rate": 5.8609536512170725e-06, |
| "loss": 0.4996, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.8481438957324376, |
| "eval_loss": 0.4515674412250519, |
| "eval_runtime": 51.1351, |
| "eval_samples_per_second": 195.56, |
| "eval_steps_per_second": 24.445, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.8541591006667103, |
| "grad_norm": 1.1953227519989014, |
| "learning_rate": 5.859953317772591e-06, |
| "loss": 0.5014, |
| "step": 14200 |
| }, |
| { |
| "epoch": 0.8541591006667103, |
| "eval_loss": 0.44719940423965454, |
| "eval_runtime": 51.0487, |
| "eval_samples_per_second": 195.891, |
| "eval_steps_per_second": 24.486, |
| "step": 14200 |
| }, |
| { |
| "epoch": 0.8601743056009828, |
| "grad_norm": 1.2699577808380127, |
| "learning_rate": 5.858952984328109e-06, |
| "loss": 0.499, |
| "step": 14300 |
| }, |
| { |
| "epoch": 0.8601743056009828, |
| "eval_loss": 0.4444737732410431, |
| "eval_runtime": 51.2894, |
| "eval_samples_per_second": 194.972, |
| "eval_steps_per_second": 24.372, |
| "step": 14300 |
| }, |
| { |
| "epoch": 0.8661895105352555, |
| "grad_norm": 1.0982294082641602, |
| "learning_rate": 5.857952650883628e-06, |
| "loss": 0.5024, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.8661895105352555, |
| "eval_loss": 0.4426032602787018, |
| "eval_runtime": 51.0622, |
| "eval_samples_per_second": 195.84, |
| "eval_steps_per_second": 24.48, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.872204715469528, |
| "grad_norm": 1.1881742477416992, |
| "learning_rate": 5.8569523174391465e-06, |
| "loss": 0.4971, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.872204715469528, |
| "eval_loss": 0.4500812590122223, |
| "eval_runtime": 51.0676, |
| "eval_samples_per_second": 195.819, |
| "eval_steps_per_second": 24.477, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.8782199204038007, |
| "grad_norm": 1.2892823219299316, |
| "learning_rate": 5.855951983994665e-06, |
| "loss": 0.4947, |
| "step": 14600 |
| }, |
| { |
| "epoch": 0.8782199204038007, |
| "eval_loss": 0.45143038034439087, |
| "eval_runtime": 51.2218, |
| "eval_samples_per_second": 195.229, |
| "eval_steps_per_second": 24.404, |
| "step": 14600 |
| }, |
| { |
| "epoch": 0.8842351253380734, |
| "grad_norm": 1.1228898763656616, |
| "learning_rate": 5.854951650550184e-06, |
| "loss": 0.4912, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.8842351253380734, |
| "eval_loss": 0.443864107131958, |
| "eval_runtime": 51.1005, |
| "eval_samples_per_second": 195.693, |
| "eval_steps_per_second": 24.462, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.8902503302723459, |
| "grad_norm": 1.2021640539169312, |
| "learning_rate": 5.853951317105702e-06, |
| "loss": 0.4911, |
| "step": 14800 |
| }, |
| { |
| "epoch": 0.8902503302723459, |
| "eval_loss": 0.44539061188697815, |
| "eval_runtime": 51.3647, |
| "eval_samples_per_second": 194.686, |
| "eval_steps_per_second": 24.336, |
| "step": 14800 |
| }, |
| { |
| "epoch": 0.8962655352066186, |
| "grad_norm": 1.226335883140564, |
| "learning_rate": 5.8529509836612205e-06, |
| "loss": 0.488, |
| "step": 14900 |
| }, |
| { |
| "epoch": 0.8962655352066186, |
| "eval_loss": 0.43708336353302, |
| "eval_runtime": 51.0878, |
| "eval_samples_per_second": 195.741, |
| "eval_steps_per_second": 24.468, |
| "step": 14900 |
| }, |
| { |
| "epoch": 0.9022807401408911, |
| "grad_norm": 1.1519514322280884, |
| "learning_rate": 5.851950650216739e-06, |
| "loss": 0.4879, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.9022807401408911, |
| "eval_loss": 0.43572157621383667, |
| "eval_runtime": 51.0673, |
| "eval_samples_per_second": 195.82, |
| "eval_steps_per_second": 24.477, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.9082959450751638, |
| "grad_norm": 1.0578216314315796, |
| "learning_rate": 5.850950316772257e-06, |
| "loss": 0.491, |
| "step": 15100 |
| }, |
| { |
| "epoch": 0.9082959450751638, |
| "eval_loss": 0.43306058645248413, |
| "eval_runtime": 51.2921, |
| "eval_samples_per_second": 194.962, |
| "eval_steps_per_second": 24.37, |
| "step": 15100 |
| }, |
| { |
| "epoch": 0.9143111500094363, |
| "grad_norm": 1.292629599571228, |
| "learning_rate": 5.849949983327776e-06, |
| "loss": 0.4852, |
| "step": 15200 |
| }, |
| { |
| "epoch": 0.9143111500094363, |
| "eval_loss": 0.43448084592819214, |
| "eval_runtime": 51.0849, |
| "eval_samples_per_second": 195.752, |
| "eval_steps_per_second": 24.469, |
| "step": 15200 |
| }, |
| { |
| "epoch": 0.920326354943709, |
| "grad_norm": 1.2115490436553955, |
| "learning_rate": 5.8489496498832945e-06, |
| "loss": 0.4879, |
| "step": 15300 |
| }, |
| { |
| "epoch": 0.920326354943709, |
| "eval_loss": 0.4403839409351349, |
| "eval_runtime": 51.0866, |
| "eval_samples_per_second": 195.746, |
| "eval_steps_per_second": 24.468, |
| "step": 15300 |
| }, |
| { |
| "epoch": 0.9263415598779816, |
| "grad_norm": 1.2206310033798218, |
| "learning_rate": 5.847949316438813e-06, |
| "loss": 0.4771, |
| "step": 15400 |
| }, |
| { |
| "epoch": 0.9263415598779816, |
| "eval_loss": 0.43060389161109924, |
| "eval_runtime": 51.0659, |
| "eval_samples_per_second": 195.825, |
| "eval_steps_per_second": 24.478, |
| "step": 15400 |
| }, |
| { |
| "epoch": 0.9323567648122542, |
| "grad_norm": 1.0853536128997803, |
| "learning_rate": 5.846948982994332e-06, |
| "loss": 0.4821, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.9323567648122542, |
| "eval_loss": 0.42842620611190796, |
| "eval_runtime": 51.036, |
| "eval_samples_per_second": 195.94, |
| "eval_steps_per_second": 24.493, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.9383719697465268, |
| "grad_norm": 1.0656437873840332, |
| "learning_rate": 5.8459486495498506e-06, |
| "loss": 0.4796, |
| "step": 15600 |
| }, |
| { |
| "epoch": 0.9383719697465268, |
| "eval_loss": 0.4259638786315918, |
| "eval_runtime": 51.0811, |
| "eval_samples_per_second": 195.767, |
| "eval_steps_per_second": 24.471, |
| "step": 15600 |
| }, |
| { |
| "epoch": 0.9443871746807994, |
| "grad_norm": 1.2496039867401123, |
| "learning_rate": 5.8449483161053684e-06, |
| "loss": 0.4783, |
| "step": 15700 |
| }, |
| { |
| "epoch": 0.9443871746807994, |
| "eval_loss": 0.42784813046455383, |
| "eval_runtime": 51.0862, |
| "eval_samples_per_second": 195.748, |
| "eval_steps_per_second": 24.468, |
| "step": 15700 |
| }, |
| { |
| "epoch": 0.950402379615072, |
| "grad_norm": 1.0478885173797607, |
| "learning_rate": 5.843947982660887e-06, |
| "loss": 0.4736, |
| "step": 15800 |
| }, |
| { |
| "epoch": 0.950402379615072, |
| "eval_loss": 0.42105141282081604, |
| "eval_runtime": 51.0949, |
| "eval_samples_per_second": 195.714, |
| "eval_steps_per_second": 24.464, |
| "step": 15800 |
| }, |
| { |
| "epoch": 0.9564175845493446, |
| "grad_norm": 1.1973545551300049, |
| "learning_rate": 5.842947649216405e-06, |
| "loss": 0.4765, |
| "step": 15900 |
| }, |
| { |
| "epoch": 0.9564175845493446, |
| "eval_loss": 0.41922861337661743, |
| "eval_runtime": 51.0499, |
| "eval_samples_per_second": 195.887, |
| "eval_steps_per_second": 24.486, |
| "step": 15900 |
| }, |
| { |
| "epoch": 0.9624327894836172, |
| "grad_norm": 1.0738471746444702, |
| "learning_rate": 5.841947315771924e-06, |
| "loss": 0.4713, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.9624327894836172, |
| "eval_loss": 0.4311535060405731, |
| "eval_runtime": 51.0775, |
| "eval_samples_per_second": 195.781, |
| "eval_steps_per_second": 24.473, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.9684479944178899, |
| "grad_norm": 1.14482581615448, |
| "learning_rate": 5.840946982327443e-06, |
| "loss": 0.4732, |
| "step": 16100 |
| }, |
| { |
| "epoch": 0.9684479944178899, |
| "eval_loss": 0.41709282994270325, |
| "eval_runtime": 39.7116, |
| "eval_samples_per_second": 251.815, |
| "eval_steps_per_second": 31.477, |
| "step": 16100 |
| }, |
| { |
| "epoch": 0.9744631993521624, |
| "grad_norm": 1.1577385663986206, |
| "learning_rate": 5.839946648882961e-06, |
| "loss": 0.4704, |
| "step": 16200 |
| }, |
| { |
| "epoch": 0.9744631993521624, |
| "eval_loss": 0.4273630976676941, |
| "eval_runtime": 51.0906, |
| "eval_samples_per_second": 195.731, |
| "eval_steps_per_second": 24.466, |
| "step": 16200 |
| }, |
| { |
| "epoch": 0.9804784042864351, |
| "grad_norm": 1.125328779220581, |
| "learning_rate": 5.83894631543848e-06, |
| "loss": 0.4697, |
| "step": 16300 |
| }, |
| { |
| "epoch": 0.9804784042864351, |
| "eval_loss": 0.42490535974502563, |
| "eval_runtime": 51.0751, |
| "eval_samples_per_second": 195.79, |
| "eval_steps_per_second": 24.474, |
| "step": 16300 |
| }, |
| { |
| "epoch": 0.9864936092207076, |
| "grad_norm": 1.2619575262069702, |
| "learning_rate": 5.8379459819939985e-06, |
| "loss": 0.4721, |
| "step": 16400 |
| }, |
| { |
| "epoch": 0.9864936092207076, |
| "eval_loss": 0.42143183946609497, |
| "eval_runtime": 51.2808, |
| "eval_samples_per_second": 195.005, |
| "eval_steps_per_second": 24.376, |
| "step": 16400 |
| }, |
| { |
| "epoch": 0.9925088141549803, |
| "grad_norm": 1.0622971057891846, |
| "learning_rate": 5.836945648549516e-06, |
| "loss": 0.4672, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.9925088141549803, |
| "eval_loss": 0.4140073359012604, |
| "eval_runtime": 51.137, |
| "eval_samples_per_second": 195.553, |
| "eval_steps_per_second": 24.444, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.9985240190892528, |
| "grad_norm": 1.1675751209259033, |
| "learning_rate": 5.835945315105035e-06, |
| "loss": 0.469, |
| "step": 16600 |
| }, |
| { |
| "epoch": 0.9985240190892528, |
| "eval_loss": 0.413769394159317, |
| "eval_runtime": 51.1298, |
| "eval_samples_per_second": 195.581, |
| "eval_steps_per_second": 24.448, |
| "step": 16600 |
| }, |
| { |
| "epoch": 1.0045392240235254, |
| "grad_norm": 1.1390060186386108, |
| "learning_rate": 5.834944981660553e-06, |
| "loss": 0.4668, |
| "step": 16700 |
| }, |
| { |
| "epoch": 1.0045392240235254, |
| "eval_loss": 0.41630059480667114, |
| "eval_runtime": 51.1382, |
| "eval_samples_per_second": 195.548, |
| "eval_steps_per_second": 24.444, |
| "step": 16700 |
| }, |
| { |
| "epoch": 1.0105544289577981, |
| "grad_norm": 1.2013533115386963, |
| "learning_rate": 5.8339446482160725e-06, |
| "loss": 0.4636, |
| "step": 16800 |
| }, |
| { |
| "epoch": 1.0105544289577981, |
| "eval_loss": 0.4128175675868988, |
| "eval_runtime": 51.0766, |
| "eval_samples_per_second": 195.784, |
| "eval_steps_per_second": 24.473, |
| "step": 16800 |
| }, |
| { |
| "epoch": 1.0165696338920707, |
| "grad_norm": 1.1893339157104492, |
| "learning_rate": 5.832944314771591e-06, |
| "loss": 0.4628, |
| "step": 16900 |
| }, |
| { |
| "epoch": 1.0165696338920707, |
| "eval_loss": 0.4195719361305237, |
| "eval_runtime": 51.0932, |
| "eval_samples_per_second": 195.721, |
| "eval_steps_per_second": 24.465, |
| "step": 16900 |
| }, |
| { |
| "epoch": 1.0225848388263432, |
| "grad_norm": 1.1112314462661743, |
| "learning_rate": 5.831943981327109e-06, |
| "loss": 0.4631, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.0225848388263432, |
| "eval_loss": 0.41490069031715393, |
| "eval_runtime": 51.0962, |
| "eval_samples_per_second": 195.709, |
| "eval_steps_per_second": 24.464, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.028600043760616, |
| "grad_norm": 1.0246236324310303, |
| "learning_rate": 5.830943647882628e-06, |
| "loss": 0.4634, |
| "step": 17100 |
| }, |
| { |
| "epoch": 1.028600043760616, |
| "eval_loss": 0.4150553345680237, |
| "eval_runtime": 51.0756, |
| "eval_samples_per_second": 195.788, |
| "eval_steps_per_second": 24.474, |
| "step": 17100 |
| }, |
| { |
| "epoch": 1.0346152486948885, |
| "grad_norm": 1.09652578830719, |
| "learning_rate": 5.8299433144381465e-06, |
| "loss": 0.4618, |
| "step": 17200 |
| }, |
| { |
| "epoch": 1.0346152486948885, |
| "eval_loss": 0.41938120126724243, |
| "eval_runtime": 51.0832, |
| "eval_samples_per_second": 195.759, |
| "eval_steps_per_second": 24.47, |
| "step": 17200 |
| }, |
| { |
| "epoch": 1.040630453629161, |
| "grad_norm": 1.123412013053894, |
| "learning_rate": 5.828942980993664e-06, |
| "loss": 0.4598, |
| "step": 17300 |
| }, |
| { |
| "epoch": 1.040630453629161, |
| "eval_loss": 0.4131644666194916, |
| "eval_runtime": 51.0626, |
| "eval_samples_per_second": 195.838, |
| "eval_steps_per_second": 24.48, |
| "step": 17300 |
| }, |
| { |
| "epoch": 1.0466456585634338, |
| "grad_norm": 1.195304274559021, |
| "learning_rate": 5.827942647549183e-06, |
| "loss": 0.455, |
| "step": 17400 |
| }, |
| { |
| "epoch": 1.0466456585634338, |
| "eval_loss": 0.40582725405693054, |
| "eval_runtime": 51.2954, |
| "eval_samples_per_second": 194.949, |
| "eval_steps_per_second": 24.369, |
| "step": 17400 |
| }, |
| { |
| "epoch": 1.0526608634977064, |
| "grad_norm": 1.149339199066162, |
| "learning_rate": 5.826942314104702e-06, |
| "loss": 0.4547, |
| "step": 17500 |
| }, |
| { |
| "epoch": 1.0526608634977064, |
| "eval_loss": 0.4130345582962036, |
| "eval_runtime": 51.0931, |
| "eval_samples_per_second": 195.721, |
| "eval_steps_per_second": 24.465, |
| "step": 17500 |
| }, |
| { |
| "epoch": 1.058676068431979, |
| "grad_norm": 1.1289178133010864, |
| "learning_rate": 5.8259419806602205e-06, |
| "loss": 0.4551, |
| "step": 17600 |
| }, |
| { |
| "epoch": 1.058676068431979, |
| "eval_loss": 0.4048755466938019, |
| "eval_runtime": 51.0261, |
| "eval_samples_per_second": 195.978, |
| "eval_steps_per_second": 24.497, |
| "step": 17600 |
| }, |
| { |
| "epoch": 1.0646912733662515, |
| "grad_norm": 1.1146255731582642, |
| "learning_rate": 5.824941647215739e-06, |
| "loss": 0.4509, |
| "step": 17700 |
| }, |
| { |
| "epoch": 1.0646912733662515, |
| "eval_loss": 0.401869535446167, |
| "eval_runtime": 51.168, |
| "eval_samples_per_second": 195.435, |
| "eval_steps_per_second": 24.429, |
| "step": 17700 |
| }, |
| { |
| "epoch": 1.0707064783005242, |
| "grad_norm": 1.2300053834915161, |
| "learning_rate": 5.823941313771257e-06, |
| "loss": 0.4505, |
| "step": 17800 |
| }, |
| { |
| "epoch": 1.0707064783005242, |
| "eval_loss": 0.4011248052120209, |
| "eval_runtime": 51.0381, |
| "eval_samples_per_second": 195.932, |
| "eval_steps_per_second": 24.491, |
| "step": 17800 |
| }, |
| { |
| "epoch": 1.0767216832347968, |
| "grad_norm": 1.1278949975967407, |
| "learning_rate": 5.822940980326776e-06, |
| "loss": 0.4499, |
| "step": 17900 |
| }, |
| { |
| "epoch": 1.0767216832347968, |
| "eval_loss": 0.4098372459411621, |
| "eval_runtime": 51.1549, |
| "eval_samples_per_second": 195.485, |
| "eval_steps_per_second": 24.436, |
| "step": 17900 |
| }, |
| { |
| "epoch": 1.0827368881690693, |
| "grad_norm": 1.1039050817489624, |
| "learning_rate": 5.8219406468822945e-06, |
| "loss": 0.4479, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.0827368881690693, |
| "eval_loss": 0.4014202356338501, |
| "eval_runtime": 51.282, |
| "eval_samples_per_second": 195.0, |
| "eval_steps_per_second": 24.375, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.0887520931033419, |
| "grad_norm": 1.0981614589691162, |
| "learning_rate": 5.820940313437812e-06, |
| "loss": 0.4505, |
| "step": 18100 |
| }, |
| { |
| "epoch": 1.0887520931033419, |
| "eval_loss": 0.40326839685440063, |
| "eval_runtime": 51.0953, |
| "eval_samples_per_second": 195.713, |
| "eval_steps_per_second": 24.464, |
| "step": 18100 |
| }, |
| { |
| "epoch": 1.0947672980376146, |
| "grad_norm": 1.1146022081375122, |
| "learning_rate": 5.819939979993331e-06, |
| "loss": 0.4485, |
| "step": 18200 |
| }, |
| { |
| "epoch": 1.0947672980376146, |
| "eval_loss": 0.4028699994087219, |
| "eval_runtime": 51.095, |
| "eval_samples_per_second": 195.714, |
| "eval_steps_per_second": 24.464, |
| "step": 18200 |
| }, |
| { |
| "epoch": 1.1007825029718872, |
| "grad_norm": 1.0906445980072021, |
| "learning_rate": 5.81893964654885e-06, |
| "loss": 0.4441, |
| "step": 18300 |
| }, |
| { |
| "epoch": 1.1007825029718872, |
| "eval_loss": 0.39843133091926575, |
| "eval_runtime": 51.2428, |
| "eval_samples_per_second": 195.149, |
| "eval_steps_per_second": 24.394, |
| "step": 18300 |
| }, |
| { |
| "epoch": 1.1067977079061597, |
| "grad_norm": 1.0257636308670044, |
| "learning_rate": 5.8179393131043684e-06, |
| "loss": 0.4456, |
| "step": 18400 |
| }, |
| { |
| "epoch": 1.1067977079061597, |
| "eval_loss": 0.3976500630378723, |
| "eval_runtime": 51.0817, |
| "eval_samples_per_second": 195.765, |
| "eval_steps_per_second": 24.471, |
| "step": 18400 |
| }, |
| { |
| "epoch": 1.1128129128404325, |
| "grad_norm": 1.1339443922042847, |
| "learning_rate": 5.816938979659887e-06, |
| "loss": 0.4441, |
| "step": 18500 |
| }, |
| { |
| "epoch": 1.1128129128404325, |
| "eval_loss": 0.403137743473053, |
| "eval_runtime": 51.196, |
| "eval_samples_per_second": 195.328, |
| "eval_steps_per_second": 24.416, |
| "step": 18500 |
| }, |
| { |
| "epoch": 1.118828117774705, |
| "grad_norm": 1.146203637123108, |
| "learning_rate": 5.815938646215406e-06, |
| "loss": 0.4431, |
| "step": 18600 |
| }, |
| { |
| "epoch": 1.118828117774705, |
| "eval_loss": 0.40482422709465027, |
| "eval_runtime": 51.0834, |
| "eval_samples_per_second": 195.758, |
| "eval_steps_per_second": 24.47, |
| "step": 18600 |
| }, |
| { |
| "epoch": 1.1248433227089776, |
| "grad_norm": 1.1327886581420898, |
| "learning_rate": 5.814938312770924e-06, |
| "loss": 0.4446, |
| "step": 18700 |
| }, |
| { |
| "epoch": 1.1248433227089776, |
| "eval_loss": 0.39922335743904114, |
| "eval_runtime": 51.1856, |
| "eval_samples_per_second": 195.367, |
| "eval_steps_per_second": 24.421, |
| "step": 18700 |
| }, |
| { |
| "epoch": 1.1308585276432503, |
| "grad_norm": 1.1702196598052979, |
| "learning_rate": 5.8139379793264424e-06, |
| "loss": 0.4412, |
| "step": 18800 |
| }, |
| { |
| "epoch": 1.1308585276432503, |
| "eval_loss": 0.39871400594711304, |
| "eval_runtime": 51.1987, |
| "eval_samples_per_second": 195.317, |
| "eval_steps_per_second": 24.415, |
| "step": 18800 |
| }, |
| { |
| "epoch": 1.1368737325775229, |
| "grad_norm": 1.0438004732131958, |
| "learning_rate": 5.81293764588196e-06, |
| "loss": 0.44, |
| "step": 18900 |
| }, |
| { |
| "epoch": 1.1368737325775229, |
| "eval_loss": 0.3967694044113159, |
| "eval_runtime": 51.0919, |
| "eval_samples_per_second": 195.726, |
| "eval_steps_per_second": 24.466, |
| "step": 18900 |
| }, |
| { |
| "epoch": 1.1428889375117954, |
| "grad_norm": 1.0050268173217773, |
| "learning_rate": 5.811937312437479e-06, |
| "loss": 0.4395, |
| "step": 19000 |
| }, |
| { |
| "epoch": 1.1428889375117954, |
| "eval_loss": 0.3952539563179016, |
| "eval_runtime": 51.3885, |
| "eval_samples_per_second": 194.596, |
| "eval_steps_per_second": 24.325, |
| "step": 19000 |
| }, |
| { |
| "epoch": 1.148904142446068, |
| "grad_norm": 1.0875275135040283, |
| "learning_rate": 5.810936978992998e-06, |
| "loss": 0.4346, |
| "step": 19100 |
| }, |
| { |
| "epoch": 1.148904142446068, |
| "eval_loss": 0.3918244242668152, |
| "eval_runtime": 51.0342, |
| "eval_samples_per_second": 195.947, |
| "eval_steps_per_second": 24.493, |
| "step": 19100 |
| }, |
| { |
| "epoch": 1.1549193473803407, |
| "grad_norm": 1.0449281930923462, |
| "learning_rate": 5.809936645548516e-06, |
| "loss": 0.4391, |
| "step": 19200 |
| }, |
| { |
| "epoch": 1.1549193473803407, |
| "eval_loss": 0.3855830729007721, |
| "eval_runtime": 51.1568, |
| "eval_samples_per_second": 195.478, |
| "eval_steps_per_second": 24.435, |
| "step": 19200 |
| }, |
| { |
| "epoch": 1.1609345523146133, |
| "grad_norm": 0.9773437976837158, |
| "learning_rate": 5.808936312104035e-06, |
| "loss": 0.4355, |
| "step": 19300 |
| }, |
| { |
| "epoch": 1.1609345523146133, |
| "eval_loss": 0.3886500597000122, |
| "eval_runtime": 51.1956, |
| "eval_samples_per_second": 195.329, |
| "eval_steps_per_second": 24.416, |
| "step": 19300 |
| }, |
| { |
| "epoch": 1.1669497572488858, |
| "grad_norm": 1.091601014137268, |
| "learning_rate": 5.807935978659554e-06, |
| "loss": 0.4344, |
| "step": 19400 |
| }, |
| { |
| "epoch": 1.1669497572488858, |
| "eval_loss": 0.3868565857410431, |
| "eval_runtime": 51.1098, |
| "eval_samples_per_second": 195.657, |
| "eval_steps_per_second": 24.457, |
| "step": 19400 |
| }, |
| { |
| "epoch": 1.1729649621831584, |
| "grad_norm": 1.1882948875427246, |
| "learning_rate": 5.806935645215072e-06, |
| "loss": 0.434, |
| "step": 19500 |
| }, |
| { |
| "epoch": 1.1729649621831584, |
| "eval_loss": 0.38946595788002014, |
| "eval_runtime": 51.2843, |
| "eval_samples_per_second": 194.991, |
| "eval_steps_per_second": 24.374, |
| "step": 19500 |
| }, |
| { |
| "epoch": 1.1789801671174311, |
| "grad_norm": 1.0534999370574951, |
| "learning_rate": 5.80593531177059e-06, |
| "loss": 0.4329, |
| "step": 19600 |
| }, |
| { |
| "epoch": 1.1789801671174311, |
| "eval_loss": 0.3830993175506592, |
| "eval_runtime": 50.9094, |
| "eval_samples_per_second": 196.428, |
| "eval_steps_per_second": 24.553, |
| "step": 19600 |
| }, |
| { |
| "epoch": 1.1849953720517037, |
| "grad_norm": 1.0696886777877808, |
| "learning_rate": 5.804934978326108e-06, |
| "loss": 0.4311, |
| "step": 19700 |
| }, |
| { |
| "epoch": 1.1849953720517037, |
| "eval_loss": 0.39124995470046997, |
| "eval_runtime": 51.1273, |
| "eval_samples_per_second": 195.59, |
| "eval_steps_per_second": 24.449, |
| "step": 19700 |
| }, |
| { |
| "epoch": 1.1910105769859762, |
| "grad_norm": 1.0171489715576172, |
| "learning_rate": 5.803934644881627e-06, |
| "loss": 0.4332, |
| "step": 19800 |
| }, |
| { |
| "epoch": 1.1910105769859762, |
| "eval_loss": 0.384937584400177, |
| "eval_runtime": 51.3256, |
| "eval_samples_per_second": 194.834, |
| "eval_steps_per_second": 24.354, |
| "step": 19800 |
| }, |
| { |
| "epoch": 1.197025781920249, |
| "grad_norm": 1.1686575412750244, |
| "learning_rate": 5.802934311437146e-06, |
| "loss": 0.4289, |
| "step": 19900 |
| }, |
| { |
| "epoch": 1.197025781920249, |
| "eval_loss": 0.38561180233955383, |
| "eval_runtime": 51.072, |
| "eval_samples_per_second": 195.802, |
| "eval_steps_per_second": 24.475, |
| "step": 19900 |
| }, |
| { |
| "epoch": 1.2030409868545215, |
| "grad_norm": 1.0748465061187744, |
| "learning_rate": 5.801933977992664e-06, |
| "loss": 0.4334, |
| "step": 20000 |
| }, |
| { |
| "epoch": 1.2030409868545215, |
| "eval_loss": 0.382721871137619, |
| "eval_runtime": 51.3966, |
| "eval_samples_per_second": 194.565, |
| "eval_steps_per_second": 24.321, |
| "step": 20000 |
| }, |
| { |
| "epoch": 1.209056191788794, |
| "grad_norm": 1.100787878036499, |
| "learning_rate": 5.800933644548183e-06, |
| "loss": 0.4239, |
| "step": 20100 |
| }, |
| { |
| "epoch": 1.209056191788794, |
| "eval_loss": 0.3841208517551422, |
| "eval_runtime": 51.057, |
| "eval_samples_per_second": 195.859, |
| "eval_steps_per_second": 24.482, |
| "step": 20100 |
| }, |
| { |
| "epoch": 1.2150713967230669, |
| "grad_norm": 1.04718017578125, |
| "learning_rate": 5.799933311103702e-06, |
| "loss": 0.4271, |
| "step": 20200 |
| }, |
| { |
| "epoch": 1.2150713967230669, |
| "eval_loss": 0.3771766424179077, |
| "eval_runtime": 51.2777, |
| "eval_samples_per_second": 195.017, |
| "eval_steps_per_second": 24.377, |
| "step": 20200 |
| }, |
| { |
| "epoch": 1.2210866016573394, |
| "grad_norm": 1.1533209085464478, |
| "learning_rate": 5.79893297765922e-06, |
| "loss": 0.4254, |
| "step": 20300 |
| }, |
| { |
| "epoch": 1.2210866016573394, |
| "eval_loss": 0.38013017177581787, |
| "eval_runtime": 51.0118, |
| "eval_samples_per_second": 196.033, |
| "eval_steps_per_second": 24.504, |
| "step": 20300 |
| }, |
| { |
| "epoch": 1.227101806591612, |
| "grad_norm": 1.2025070190429688, |
| "learning_rate": 5.797932644214738e-06, |
| "loss": 0.4263, |
| "step": 20400 |
| }, |
| { |
| "epoch": 1.227101806591612, |
| "eval_loss": 0.37795642018318176, |
| "eval_runtime": 51.132, |
| "eval_samples_per_second": 195.572, |
| "eval_steps_per_second": 24.447, |
| "step": 20400 |
| }, |
| { |
| "epoch": 1.2331170115258845, |
| "grad_norm": 1.1051814556121826, |
| "learning_rate": 5.796932310770257e-06, |
| "loss": 0.4256, |
| "step": 20500 |
| }, |
| { |
| "epoch": 1.2331170115258845, |
| "eval_loss": 0.37627479434013367, |
| "eval_runtime": 50.9072, |
| "eval_samples_per_second": 196.436, |
| "eval_steps_per_second": 24.554, |
| "step": 20500 |
| }, |
| { |
| "epoch": 1.2391322164601573, |
| "grad_norm": 1.0987049341201782, |
| "learning_rate": 5.795931977325775e-06, |
| "loss": 0.4239, |
| "step": 20600 |
| }, |
| { |
| "epoch": 1.2391322164601573, |
| "eval_loss": 0.3853623569011688, |
| "eval_runtime": 51.0608, |
| "eval_samples_per_second": 195.845, |
| "eval_steps_per_second": 24.481, |
| "step": 20600 |
| }, |
| { |
| "epoch": 1.2451474213944298, |
| "grad_norm": 1.0989750623703003, |
| "learning_rate": 5.794931643881294e-06, |
| "loss": 0.4197, |
| "step": 20700 |
| }, |
| { |
| "epoch": 1.2451474213944298, |
| "eval_loss": 0.3807806670665741, |
| "eval_runtime": 51.3594, |
| "eval_samples_per_second": 194.706, |
| "eval_steps_per_second": 24.338, |
| "step": 20700 |
| }, |
| { |
| "epoch": 1.2511626263287023, |
| "grad_norm": 1.0866729021072388, |
| "learning_rate": 5.793931310436812e-06, |
| "loss": 0.4234, |
| "step": 20800 |
| }, |
| { |
| "epoch": 1.2511626263287023, |
| "eval_loss": 0.3777351379394531, |
| "eval_runtime": 51.0621, |
| "eval_samples_per_second": 195.84, |
| "eval_steps_per_second": 24.48, |
| "step": 20800 |
| }, |
| { |
| "epoch": 1.2571778312629749, |
| "grad_norm": 1.1387032270431519, |
| "learning_rate": 5.792930976992331e-06, |
| "loss": 0.4197, |
| "step": 20900 |
| }, |
| { |
| "epoch": 1.2571778312629749, |
| "eval_loss": 0.3739318549633026, |
| "eval_runtime": 51.1648, |
| "eval_samples_per_second": 195.447, |
| "eval_steps_per_second": 24.431, |
| "step": 20900 |
| }, |
| { |
| "epoch": 1.2631930361972477, |
| "grad_norm": 0.9848424792289734, |
| "learning_rate": 5.79193064354785e-06, |
| "loss": 0.4225, |
| "step": 21000 |
| }, |
| { |
| "epoch": 1.2631930361972477, |
| "eval_loss": 0.3804405629634857, |
| "eval_runtime": 51.1688, |
| "eval_samples_per_second": 195.431, |
| "eval_steps_per_second": 24.429, |
| "step": 21000 |
| }, |
| { |
| "epoch": 1.2692082411315202, |
| "grad_norm": 1.0492684841156006, |
| "learning_rate": 5.790930310103368e-06, |
| "loss": 0.4179, |
| "step": 21100 |
| }, |
| { |
| "epoch": 1.2692082411315202, |
| "eval_loss": 0.37157440185546875, |
| "eval_runtime": 51.0428, |
| "eval_samples_per_second": 195.914, |
| "eval_steps_per_second": 24.489, |
| "step": 21100 |
| }, |
| { |
| "epoch": 1.2752234460657927, |
| "grad_norm": 1.2355892658233643, |
| "learning_rate": 5.789929976658886e-06, |
| "loss": 0.4177, |
| "step": 21200 |
| }, |
| { |
| "epoch": 1.2752234460657927, |
| "eval_loss": 0.3794465661048889, |
| "eval_runtime": 51.1116, |
| "eval_samples_per_second": 195.65, |
| "eval_steps_per_second": 24.456, |
| "step": 21200 |
| }, |
| { |
| "epoch": 1.2812386510000655, |
| "grad_norm": 1.1180801391601562, |
| "learning_rate": 5.788929643214405e-06, |
| "loss": 0.4192, |
| "step": 21300 |
| }, |
| { |
| "epoch": 1.2812386510000655, |
| "eval_loss": 0.3741929829120636, |
| "eval_runtime": 51.043, |
| "eval_samples_per_second": 195.913, |
| "eval_steps_per_second": 24.489, |
| "step": 21300 |
| }, |
| { |
| "epoch": 1.287253855934338, |
| "grad_norm": 1.1260274648666382, |
| "learning_rate": 5.787929309769923e-06, |
| "loss": 0.4165, |
| "step": 21400 |
| }, |
| { |
| "epoch": 1.287253855934338, |
| "eval_loss": 0.37511906027793884, |
| "eval_runtime": 51.1867, |
| "eval_samples_per_second": 195.363, |
| "eval_steps_per_second": 24.42, |
| "step": 21400 |
| }, |
| { |
| "epoch": 1.2932690608686106, |
| "grad_norm": 1.0729244947433472, |
| "learning_rate": 5.7869289763254424e-06, |
| "loss": 0.4148, |
| "step": 21500 |
| }, |
| { |
| "epoch": 1.2932690608686106, |
| "eval_loss": 0.3755778670310974, |
| "eval_runtime": 50.9919, |
| "eval_samples_per_second": 196.11, |
| "eval_steps_per_second": 24.514, |
| "step": 21500 |
| }, |
| { |
| "epoch": 1.2992842658028834, |
| "grad_norm": 1.5396491289138794, |
| "learning_rate": 5.785928642880961e-06, |
| "loss": 0.4128, |
| "step": 21600 |
| }, |
| { |
| "epoch": 1.2992842658028834, |
| "eval_loss": 0.3713712990283966, |
| "eval_runtime": 51.0389, |
| "eval_samples_per_second": 195.929, |
| "eval_steps_per_second": 24.491, |
| "step": 21600 |
| }, |
| { |
| "epoch": 1.305299470737156, |
| "grad_norm": 0.9880481362342834, |
| "learning_rate": 5.784928309436479e-06, |
| "loss": 0.4138, |
| "step": 21700 |
| }, |
| { |
| "epoch": 1.305299470737156, |
| "eval_loss": 0.3710058033466339, |
| "eval_runtime": 51.3224, |
| "eval_samples_per_second": 194.847, |
| "eval_steps_per_second": 24.356, |
| "step": 21700 |
| }, |
| { |
| "epoch": 1.3113146756714285, |
| "grad_norm": 0.9788950085639954, |
| "learning_rate": 5.783927975991998e-06, |
| "loss": 0.4108, |
| "step": 21800 |
| }, |
| { |
| "epoch": 1.3113146756714285, |
| "eval_loss": 0.3687758147716522, |
| "eval_runtime": 51.0044, |
| "eval_samples_per_second": 196.062, |
| "eval_steps_per_second": 24.508, |
| "step": 21800 |
| }, |
| { |
| "epoch": 1.317329880605701, |
| "grad_norm": 1.0298100709915161, |
| "learning_rate": 5.782927642547516e-06, |
| "loss": 0.4129, |
| "step": 21900 |
| }, |
| { |
| "epoch": 1.317329880605701, |
| "eval_loss": 0.365496426820755, |
| "eval_runtime": 51.065, |
| "eval_samples_per_second": 195.829, |
| "eval_steps_per_second": 24.479, |
| "step": 21900 |
| }, |
| { |
| "epoch": 1.3233450855399735, |
| "grad_norm": 1.0753816366195679, |
| "learning_rate": 5.781927309103034e-06, |
| "loss": 0.413, |
| "step": 22000 |
| }, |
| { |
| "epoch": 1.3233450855399735, |
| "eval_loss": 0.3655156195163727, |
| "eval_runtime": 51.117, |
| "eval_samples_per_second": 195.63, |
| "eval_steps_per_second": 24.454, |
| "step": 22000 |
| }, |
| { |
| "epoch": 1.3293602904742463, |
| "grad_norm": 1.1379014253616333, |
| "learning_rate": 5.780926975658553e-06, |
| "loss": 0.4101, |
| "step": 22100 |
| }, |
| { |
| "epoch": 1.3293602904742463, |
| "eval_loss": 0.37188926339149475, |
| "eval_runtime": 51.0999, |
| "eval_samples_per_second": 195.695, |
| "eval_steps_per_second": 24.462, |
| "step": 22100 |
| }, |
| { |
| "epoch": 1.3353754954085189, |
| "grad_norm": 0.9869519472122192, |
| "learning_rate": 5.779926642214072e-06, |
| "loss": 0.4113, |
| "step": 22200 |
| }, |
| { |
| "epoch": 1.3353754954085189, |
| "eval_loss": 0.36685308814048767, |
| "eval_runtime": 50.9524, |
| "eval_samples_per_second": 196.262, |
| "eval_steps_per_second": 24.533, |
| "step": 22200 |
| }, |
| { |
| "epoch": 1.3413907003427914, |
| "grad_norm": 1.1977757215499878, |
| "learning_rate": 5.77892630876959e-06, |
| "loss": 0.4106, |
| "step": 22300 |
| }, |
| { |
| "epoch": 1.3413907003427914, |
| "eval_loss": 0.3694215714931488, |
| "eval_runtime": 50.8823, |
| "eval_samples_per_second": 196.532, |
| "eval_steps_per_second": 24.566, |
| "step": 22300 |
| }, |
| { |
| "epoch": 1.3474059052770642, |
| "grad_norm": 1.0620633363723755, |
| "learning_rate": 5.777925975325109e-06, |
| "loss": 0.407, |
| "step": 22400 |
| }, |
| { |
| "epoch": 1.3474059052770642, |
| "eval_loss": 0.36941900849342346, |
| "eval_runtime": 51.0452, |
| "eval_samples_per_second": 195.905, |
| "eval_steps_per_second": 24.488, |
| "step": 22400 |
| }, |
| { |
| "epoch": 1.3534211102113367, |
| "grad_norm": 1.0130232572555542, |
| "learning_rate": 5.776925641880627e-06, |
| "loss": 0.4076, |
| "step": 22500 |
| }, |
| { |
| "epoch": 1.3534211102113367, |
| "eval_loss": 0.3688518702983856, |
| "eval_runtime": 51.2935, |
| "eval_samples_per_second": 194.956, |
| "eval_steps_per_second": 24.37, |
| "step": 22500 |
| }, |
| { |
| "epoch": 1.3594363151456093, |
| "grad_norm": 1.1370288133621216, |
| "learning_rate": 5.775925308436146e-06, |
| "loss": 0.4058, |
| "step": 22600 |
| }, |
| { |
| "epoch": 1.3594363151456093, |
| "eval_loss": 0.35986149311065674, |
| "eval_runtime": 50.94, |
| "eval_samples_per_second": 196.309, |
| "eval_steps_per_second": 24.539, |
| "step": 22600 |
| }, |
| { |
| "epoch": 1.365451520079882, |
| "grad_norm": 1.0753254890441895, |
| "learning_rate": 5.7749249749916635e-06, |
| "loss": 0.404, |
| "step": 22700 |
| }, |
| { |
| "epoch": 1.365451520079882, |
| "eval_loss": 0.36281687021255493, |
| "eval_runtime": 51.0705, |
| "eval_samples_per_second": 195.808, |
| "eval_steps_per_second": 24.476, |
| "step": 22700 |
| }, |
| { |
| "epoch": 1.3714667250141546, |
| "grad_norm": 1.0779234170913696, |
| "learning_rate": 5.773924641547182e-06, |
| "loss": 0.4055, |
| "step": 22800 |
| }, |
| { |
| "epoch": 1.3714667250141546, |
| "eval_loss": 0.3607022762298584, |
| "eval_runtime": 51.2843, |
| "eval_samples_per_second": 194.992, |
| "eval_steps_per_second": 24.374, |
| "step": 22800 |
| }, |
| { |
| "epoch": 1.377481929948427, |
| "grad_norm": 1.0071178674697876, |
| "learning_rate": 5.772924308102701e-06, |
| "loss": 0.4038, |
| "step": 22900 |
| }, |
| { |
| "epoch": 1.377481929948427, |
| "eval_loss": 0.36346524953842163, |
| "eval_runtime": 50.9712, |
| "eval_samples_per_second": 196.189, |
| "eval_steps_per_second": 24.524, |
| "step": 22900 |
| }, |
| { |
| "epoch": 1.3834971348826999, |
| "grad_norm": 1.0683503150939941, |
| "learning_rate": 5.77192397465822e-06, |
| "loss": 0.4047, |
| "step": 23000 |
| }, |
| { |
| "epoch": 1.3834971348826999, |
| "eval_loss": 0.36117979884147644, |
| "eval_runtime": 51.0395, |
| "eval_samples_per_second": 195.927, |
| "eval_steps_per_second": 24.491, |
| "step": 23000 |
| }, |
| { |
| "epoch": 1.3895123398169724, |
| "grad_norm": 1.1770708560943604, |
| "learning_rate": 5.770923641213738e-06, |
| "loss": 0.4043, |
| "step": 23100 |
| }, |
| { |
| "epoch": 1.3895123398169724, |
| "eval_loss": 0.36106517910957336, |
| "eval_runtime": 51.0648, |
| "eval_samples_per_second": 195.83, |
| "eval_steps_per_second": 24.479, |
| "step": 23100 |
| }, |
| { |
| "epoch": 1.395527544751245, |
| "grad_norm": 0.9239141941070557, |
| "learning_rate": 5.769923307769257e-06, |
| "loss": 0.4011, |
| "step": 23200 |
| }, |
| { |
| "epoch": 1.395527544751245, |
| "eval_loss": 0.3578794598579407, |
| "eval_runtime": 51.0531, |
| "eval_samples_per_second": 195.875, |
| "eval_steps_per_second": 24.484, |
| "step": 23200 |
| }, |
| { |
| "epoch": 1.4015427496855175, |
| "grad_norm": 1.2712723016738892, |
| "learning_rate": 5.768922974324775e-06, |
| "loss": 0.4008, |
| "step": 23300 |
| }, |
| { |
| "epoch": 1.4015427496855175, |
| "eval_loss": 0.3636392652988434, |
| "eval_runtime": 51.1514, |
| "eval_samples_per_second": 195.498, |
| "eval_steps_per_second": 24.437, |
| "step": 23300 |
| }, |
| { |
| "epoch": 1.40755795461979, |
| "grad_norm": 1.040955901145935, |
| "learning_rate": 5.767922640880294e-06, |
| "loss": 0.3974, |
| "step": 23400 |
| }, |
| { |
| "epoch": 1.40755795461979, |
| "eval_loss": 0.3629893660545349, |
| "eval_runtime": 51.021, |
| "eval_samples_per_second": 195.998, |
| "eval_steps_per_second": 24.5, |
| "step": 23400 |
| }, |
| { |
| "epoch": 1.4135731595540628, |
| "grad_norm": 0.9896743893623352, |
| "learning_rate": 5.766922307435812e-06, |
| "loss": 0.3991, |
| "step": 23500 |
| }, |
| { |
| "epoch": 1.4135731595540628, |
| "eval_loss": 0.35531342029571533, |
| "eval_runtime": 51.17, |
| "eval_samples_per_second": 195.427, |
| "eval_steps_per_second": 24.428, |
| "step": 23500 |
| }, |
| { |
| "epoch": 1.4195883644883354, |
| "grad_norm": 1.088028073310852, |
| "learning_rate": 5.76592197399133e-06, |
| "loss": 0.3972, |
| "step": 23600 |
| }, |
| { |
| "epoch": 1.4195883644883354, |
| "eval_loss": 0.35938191413879395, |
| "eval_runtime": 51.2648, |
| "eval_samples_per_second": 195.066, |
| "eval_steps_per_second": 24.383, |
| "step": 23600 |
| }, |
| { |
| "epoch": 1.425603569422608, |
| "grad_norm": 1.0598886013031006, |
| "learning_rate": 5.764921640546849e-06, |
| "loss": 0.4021, |
| "step": 23700 |
| }, |
| { |
| "epoch": 1.425603569422608, |
| "eval_loss": 0.35533782839775085, |
| "eval_runtime": 51.0234, |
| "eval_samples_per_second": 195.989, |
| "eval_steps_per_second": 24.499, |
| "step": 23700 |
| }, |
| { |
| "epoch": 1.4316187743568807, |
| "grad_norm": 1.1906119585037231, |
| "learning_rate": 5.763921307102368e-06, |
| "loss": 0.3977, |
| "step": 23800 |
| }, |
| { |
| "epoch": 1.4316187743568807, |
| "eval_loss": 0.3564583361148834, |
| "eval_runtime": 51.0223, |
| "eval_samples_per_second": 195.993, |
| "eval_steps_per_second": 24.499, |
| "step": 23800 |
| }, |
| { |
| "epoch": 1.4376339792911532, |
| "grad_norm": 1.1549937725067139, |
| "learning_rate": 5.762920973657886e-06, |
| "loss": 0.3942, |
| "step": 23900 |
| }, |
| { |
| "epoch": 1.4376339792911532, |
| "eval_loss": 0.3534764051437378, |
| "eval_runtime": 51.1427, |
| "eval_samples_per_second": 195.531, |
| "eval_steps_per_second": 24.441, |
| "step": 23900 |
| }, |
| { |
| "epoch": 1.4436491842254258, |
| "grad_norm": 1.0571911334991455, |
| "learning_rate": 5.761920640213405e-06, |
| "loss": 0.3953, |
| "step": 24000 |
| }, |
| { |
| "epoch": 1.4436491842254258, |
| "eval_loss": 0.3564269542694092, |
| "eval_runtime": 51.0367, |
| "eval_samples_per_second": 195.938, |
| "eval_steps_per_second": 24.492, |
| "step": 24000 |
| }, |
| { |
| "epoch": 1.4496643891596985, |
| "grad_norm": 1.058688998222351, |
| "learning_rate": 5.760920306768923e-06, |
| "loss": 0.3957, |
| "step": 24100 |
| }, |
| { |
| "epoch": 1.4496643891596985, |
| "eval_loss": 0.3465494215488434, |
| "eval_runtime": 51.0338, |
| "eval_samples_per_second": 195.949, |
| "eval_steps_per_second": 24.494, |
| "step": 24100 |
| }, |
| { |
| "epoch": 1.455679594093971, |
| "grad_norm": 1.0260639190673828, |
| "learning_rate": 5.759919973324442e-06, |
| "loss": 0.3954, |
| "step": 24200 |
| }, |
| { |
| "epoch": 1.455679594093971, |
| "eval_loss": 0.34943073987960815, |
| "eval_runtime": 50.8891, |
| "eval_samples_per_second": 196.506, |
| "eval_steps_per_second": 24.563, |
| "step": 24200 |
| }, |
| { |
| "epoch": 1.4616947990282436, |
| "grad_norm": 0.9939345717430115, |
| "learning_rate": 5.75891963987996e-06, |
| "loss": 0.3944, |
| "step": 24300 |
| }, |
| { |
| "epoch": 1.4616947990282436, |
| "eval_loss": 0.35242801904678345, |
| "eval_runtime": 51.0489, |
| "eval_samples_per_second": 195.891, |
| "eval_steps_per_second": 24.486, |
| "step": 24300 |
| }, |
| { |
| "epoch": 1.4677100039625164, |
| "grad_norm": 1.0830129384994507, |
| "learning_rate": 5.757919306435478e-06, |
| "loss": 0.3894, |
| "step": 24400 |
| }, |
| { |
| "epoch": 1.4677100039625164, |
| "eval_loss": 0.34800294041633606, |
| "eval_runtime": 51.3057, |
| "eval_samples_per_second": 194.91, |
| "eval_steps_per_second": 24.364, |
| "step": 24400 |
| }, |
| { |
| "epoch": 1.473725208896789, |
| "grad_norm": 1.0526846647262573, |
| "learning_rate": 5.756918972990997e-06, |
| "loss": 0.39, |
| "step": 24500 |
| }, |
| { |
| "epoch": 1.473725208896789, |
| "eval_loss": 0.3510083556175232, |
| "eval_runtime": 50.9026, |
| "eval_samples_per_second": 196.454, |
| "eval_steps_per_second": 24.557, |
| "step": 24500 |
| }, |
| { |
| "epoch": 1.4797404138310615, |
| "grad_norm": 1.1267868280410767, |
| "learning_rate": 5.755918639546516e-06, |
| "loss": 0.3902, |
| "step": 24600 |
| }, |
| { |
| "epoch": 1.4797404138310615, |
| "eval_loss": 0.3532961308956146, |
| "eval_runtime": 51.0797, |
| "eval_samples_per_second": 195.773, |
| "eval_steps_per_second": 24.472, |
| "step": 24600 |
| }, |
| { |
| "epoch": 1.485755618765334, |
| "grad_norm": 1.1018403768539429, |
| "learning_rate": 5.754918306102034e-06, |
| "loss": 0.3908, |
| "step": 24700 |
| }, |
| { |
| "epoch": 1.485755618765334, |
| "eval_loss": 0.3456381559371948, |
| "eval_runtime": 51.3247, |
| "eval_samples_per_second": 194.838, |
| "eval_steps_per_second": 24.355, |
| "step": 24700 |
| }, |
| { |
| "epoch": 1.4917708236996066, |
| "grad_norm": 1.0022377967834473, |
| "learning_rate": 5.753917972657553e-06, |
| "loss": 0.3869, |
| "step": 24800 |
| }, |
| { |
| "epoch": 1.4917708236996066, |
| "eval_loss": 0.3509150445461273, |
| "eval_runtime": 51.0426, |
| "eval_samples_per_second": 195.915, |
| "eval_steps_per_second": 24.489, |
| "step": 24800 |
| }, |
| { |
| "epoch": 1.4977860286338793, |
| "grad_norm": 1.02973210811615, |
| "learning_rate": 5.752917639213071e-06, |
| "loss": 0.3885, |
| "step": 24900 |
| }, |
| { |
| "epoch": 1.4977860286338793, |
| "eval_loss": 0.3488512635231018, |
| "eval_runtime": 50.9719, |
| "eval_samples_per_second": 196.187, |
| "eval_steps_per_second": 24.523, |
| "step": 24900 |
| }, |
| { |
| "epoch": 1.5038012335681519, |
| "grad_norm": 1.0170624256134033, |
| "learning_rate": 5.7519173057685896e-06, |
| "loss": 0.386, |
| "step": 25000 |
| }, |
| { |
| "epoch": 1.5038012335681519, |
| "eval_loss": 0.344295859336853, |
| "eval_runtime": 51.2301, |
| "eval_samples_per_second": 195.198, |
| "eval_steps_per_second": 24.4, |
| "step": 25000 |
| }, |
| { |
| "epoch": 1.5098164385024244, |
| "grad_norm": 1.0053726434707642, |
| "learning_rate": 5.750916972324108e-06, |
| "loss": 0.3885, |
| "step": 25100 |
| }, |
| { |
| "epoch": 1.5098164385024244, |
| "eval_loss": 0.34295952320098877, |
| "eval_runtime": 51.2643, |
| "eval_samples_per_second": 195.068, |
| "eval_steps_per_second": 24.383, |
| "step": 25100 |
| }, |
| { |
| "epoch": 1.5158316434366972, |
| "grad_norm": 0.9546186327934265, |
| "learning_rate": 5.749916638879626e-06, |
| "loss": 0.3902, |
| "step": 25200 |
| }, |
| { |
| "epoch": 1.5158316434366972, |
| "eval_loss": 0.3494739234447479, |
| "eval_runtime": 51.1243, |
| "eval_samples_per_second": 195.602, |
| "eval_steps_per_second": 24.45, |
| "step": 25200 |
| }, |
| { |
| "epoch": 1.5218468483709697, |
| "grad_norm": 1.0184184312820435, |
| "learning_rate": 5.748916305435145e-06, |
| "loss": 0.3853, |
| "step": 25300 |
| }, |
| { |
| "epoch": 1.5218468483709697, |
| "eval_loss": 0.34722205996513367, |
| "eval_runtime": 51.0304, |
| "eval_samples_per_second": 195.961, |
| "eval_steps_per_second": 24.495, |
| "step": 25300 |
| }, |
| { |
| "epoch": 1.5278620533052423, |
| "grad_norm": 1.0732802152633667, |
| "learning_rate": 5.747915971990664e-06, |
| "loss": 0.3868, |
| "step": 25400 |
| }, |
| { |
| "epoch": 1.5278620533052423, |
| "eval_loss": 0.34737443923950195, |
| "eval_runtime": 51.1073, |
| "eval_samples_per_second": 195.667, |
| "eval_steps_per_second": 24.458, |
| "step": 25400 |
| }, |
| { |
| "epoch": 1.533877258239515, |
| "grad_norm": 1.023866891860962, |
| "learning_rate": 5.746915638546182e-06, |
| "loss": 0.3846, |
| "step": 25500 |
| }, |
| { |
| "epoch": 1.533877258239515, |
| "eval_loss": 0.34227558970451355, |
| "eval_runtime": 51.0647, |
| "eval_samples_per_second": 195.83, |
| "eval_steps_per_second": 24.479, |
| "step": 25500 |
| }, |
| { |
| "epoch": 1.5398924631737876, |
| "grad_norm": 0.9621095657348633, |
| "learning_rate": 5.745915305101701e-06, |
| "loss": 0.3853, |
| "step": 25600 |
| }, |
| { |
| "epoch": 1.5398924631737876, |
| "eval_loss": 0.33890464901924133, |
| "eval_runtime": 37.4533, |
| "eval_samples_per_second": 266.999, |
| "eval_steps_per_second": 33.375, |
| "step": 25600 |
| }, |
| { |
| "epoch": 1.5459076681080601, |
| "grad_norm": 1.0459903478622437, |
| "learning_rate": 5.744914971657219e-06, |
| "loss": 0.3867, |
| "step": 25700 |
| }, |
| { |
| "epoch": 1.5459076681080601, |
| "eval_loss": 0.3423731327056885, |
| "eval_runtime": 51.0943, |
| "eval_samples_per_second": 195.717, |
| "eval_steps_per_second": 24.465, |
| "step": 25700 |
| }, |
| { |
| "epoch": 1.551922873042333, |
| "grad_norm": 1.0103187561035156, |
| "learning_rate": 5.7439146382127375e-06, |
| "loss": 0.3846, |
| "step": 25800 |
| }, |
| { |
| "epoch": 1.551922873042333, |
| "eval_loss": 0.3495667576789856, |
| "eval_runtime": 51.0619, |
| "eval_samples_per_second": 195.841, |
| "eval_steps_per_second": 24.48, |
| "step": 25800 |
| }, |
| { |
| "epoch": 1.5579380779766052, |
| "grad_norm": 1.1959409713745117, |
| "learning_rate": 5.742914304768256e-06, |
| "loss": 0.3836, |
| "step": 25900 |
| }, |
| { |
| "epoch": 1.5579380779766052, |
| "eval_loss": 0.34345749020576477, |
| "eval_runtime": 50.9931, |
| "eval_samples_per_second": 196.105, |
| "eval_steps_per_second": 24.513, |
| "step": 25900 |
| }, |
| { |
| "epoch": 1.563953282910878, |
| "grad_norm": 1.0257697105407715, |
| "learning_rate": 5.741913971323774e-06, |
| "loss": 0.3832, |
| "step": 26000 |
| }, |
| { |
| "epoch": 1.563953282910878, |
| "eval_loss": 0.3426493704319, |
| "eval_runtime": 51.1309, |
| "eval_samples_per_second": 195.577, |
| "eval_steps_per_second": 24.447, |
| "step": 26000 |
| }, |
| { |
| "epoch": 1.5699684878451505, |
| "grad_norm": 1.1140973567962646, |
| "learning_rate": 5.740913637879294e-06, |
| "loss": 0.3797, |
| "step": 26100 |
| }, |
| { |
| "epoch": 1.5699684878451505, |
| "eval_loss": 0.34580498933792114, |
| "eval_runtime": 51.1787, |
| "eval_samples_per_second": 195.394, |
| "eval_steps_per_second": 24.424, |
| "step": 26100 |
| }, |
| { |
| "epoch": 1.575983692779423, |
| "grad_norm": 1.0050679445266724, |
| "learning_rate": 5.739913304434812e-06, |
| "loss": 0.3749, |
| "step": 26200 |
| }, |
| { |
| "epoch": 1.575983692779423, |
| "eval_loss": 0.3454411029815674, |
| "eval_runtime": 51.1577, |
| "eval_samples_per_second": 195.474, |
| "eval_steps_per_second": 24.434, |
| "step": 26200 |
| }, |
| { |
| "epoch": 1.5819988977136958, |
| "grad_norm": 1.0191149711608887, |
| "learning_rate": 5.73891297099033e-06, |
| "loss": 0.3772, |
| "step": 26300 |
| }, |
| { |
| "epoch": 1.5819988977136958, |
| "eval_loss": 0.3403486907482147, |
| "eval_runtime": 51.0929, |
| "eval_samples_per_second": 195.722, |
| "eval_steps_per_second": 24.465, |
| "step": 26300 |
| }, |
| { |
| "epoch": 1.5880141026479684, |
| "grad_norm": 1.1277610063552856, |
| "learning_rate": 5.737912637545849e-06, |
| "loss": 0.3783, |
| "step": 26400 |
| }, |
| { |
| "epoch": 1.5880141026479684, |
| "eval_loss": 0.3426676392555237, |
| "eval_runtime": 51.3622, |
| "eval_samples_per_second": 194.696, |
| "eval_steps_per_second": 24.337, |
| "step": 26400 |
| }, |
| { |
| "epoch": 1.594029307582241, |
| "grad_norm": 1.12416672706604, |
| "learning_rate": 5.736912304101368e-06, |
| "loss": 0.3765, |
| "step": 26500 |
| }, |
| { |
| "epoch": 1.594029307582241, |
| "eval_loss": 0.3407214879989624, |
| "eval_runtime": 51.185, |
| "eval_samples_per_second": 195.37, |
| "eval_steps_per_second": 24.421, |
| "step": 26500 |
| }, |
| { |
| "epoch": 1.6000445125165137, |
| "grad_norm": 0.9676984548568726, |
| "learning_rate": 5.7359119706568855e-06, |
| "loss": 0.377, |
| "step": 26600 |
| }, |
| { |
| "epoch": 1.6000445125165137, |
| "eval_loss": 0.3347455859184265, |
| "eval_runtime": 50.9838, |
| "eval_samples_per_second": 196.141, |
| "eval_steps_per_second": 24.518, |
| "step": 26600 |
| }, |
| { |
| "epoch": 1.6060597174507862, |
| "grad_norm": 1.0561347007751465, |
| "learning_rate": 5.734911637212404e-06, |
| "loss": 0.3768, |
| "step": 26700 |
| }, |
| { |
| "epoch": 1.6060597174507862, |
| "eval_loss": 0.3399183452129364, |
| "eval_runtime": 51.075, |
| "eval_samples_per_second": 195.79, |
| "eval_steps_per_second": 24.474, |
| "step": 26700 |
| }, |
| { |
| "epoch": 1.6120749223850588, |
| "grad_norm": 1.2122465372085571, |
| "learning_rate": 5.733911303767923e-06, |
| "loss": 0.3763, |
| "step": 26800 |
| }, |
| { |
| "epoch": 1.6120749223850588, |
| "eval_loss": 0.33461084961891174, |
| "eval_runtime": 51.0463, |
| "eval_samples_per_second": 195.901, |
| "eval_steps_per_second": 24.488, |
| "step": 26800 |
| }, |
| { |
| "epoch": 1.6180901273193316, |
| "grad_norm": 1.0054854154586792, |
| "learning_rate": 5.732910970323442e-06, |
| "loss": 0.3786, |
| "step": 26900 |
| }, |
| { |
| "epoch": 1.6180901273193316, |
| "eval_loss": 0.3318628668785095, |
| "eval_runtime": 51.0826, |
| "eval_samples_per_second": 195.761, |
| "eval_steps_per_second": 24.47, |
| "step": 26900 |
| }, |
| { |
| "epoch": 1.624105332253604, |
| "grad_norm": 1.072472333908081, |
| "learning_rate": 5.73191063687896e-06, |
| "loss": 0.3762, |
| "step": 27000 |
| }, |
| { |
| "epoch": 1.624105332253604, |
| "eval_loss": 0.3293687403202057, |
| "eval_runtime": 51.072, |
| "eval_samples_per_second": 195.802, |
| "eval_steps_per_second": 24.475, |
| "step": 27000 |
| }, |
| { |
| "epoch": 1.6301205371878766, |
| "grad_norm": 1.0058602094650269, |
| "learning_rate": 5.730910303434478e-06, |
| "loss": 0.3716, |
| "step": 27100 |
| }, |
| { |
| "epoch": 1.6301205371878766, |
| "eval_loss": 0.33610230684280396, |
| "eval_runtime": 51.0651, |
| "eval_samples_per_second": 195.828, |
| "eval_steps_per_second": 24.479, |
| "step": 27100 |
| }, |
| { |
| "epoch": 1.6361357421221494, |
| "grad_norm": 1.0208802223205566, |
| "learning_rate": 5.729909969989997e-06, |
| "loss": 0.3724, |
| "step": 27200 |
| }, |
| { |
| "epoch": 1.6361357421221494, |
| "eval_loss": 0.3361985981464386, |
| "eval_runtime": 51.1569, |
| "eval_samples_per_second": 195.477, |
| "eval_steps_per_second": 24.435, |
| "step": 27200 |
| }, |
| { |
| "epoch": 1.6421509470564217, |
| "grad_norm": 1.0464400053024292, |
| "learning_rate": 5.728909636545516e-06, |
| "loss": 0.3732, |
| "step": 27300 |
| }, |
| { |
| "epoch": 1.6421509470564217, |
| "eval_loss": 0.3356834053993225, |
| "eval_runtime": 21.647, |
| "eval_samples_per_second": 461.957, |
| "eval_steps_per_second": 57.745, |
| "step": 27300 |
| }, |
| { |
| "epoch": 1.6481661519906945, |
| "grad_norm": 1.1063635349273682, |
| "learning_rate": 5.7279093031010335e-06, |
| "loss": 0.3725, |
| "step": 27400 |
| }, |
| { |
| "epoch": 1.6481661519906945, |
| "eval_loss": 0.3378269374370575, |
| "eval_runtime": 48.6948, |
| "eval_samples_per_second": 205.361, |
| "eval_steps_per_second": 25.67, |
| "step": 27400 |
| }, |
| { |
| "epoch": 1.654181356924967, |
| "grad_norm": 0.8910077214241028, |
| "learning_rate": 5.726908969656552e-06, |
| "loss": 0.3707, |
| "step": 27500 |
| }, |
| { |
| "epoch": 1.654181356924967, |
| "eval_loss": 0.3300679624080658, |
| "eval_runtime": 48.819, |
| "eval_samples_per_second": 204.838, |
| "eval_steps_per_second": 25.605, |
| "step": 27500 |
| }, |
| { |
| "epoch": 1.6601965618592396, |
| "grad_norm": 0.9904689192771912, |
| "learning_rate": 5.725908636212071e-06, |
| "loss": 0.3722, |
| "step": 27600 |
| }, |
| { |
| "epoch": 1.6601965618592396, |
| "eval_loss": 0.33077552914619446, |
| "eval_runtime": 45.4305, |
| "eval_samples_per_second": 220.116, |
| "eval_steps_per_second": 27.515, |
| "step": 27600 |
| }, |
| { |
| "epoch": 1.6662117667935123, |
| "grad_norm": 1.0377715826034546, |
| "learning_rate": 5.72490830276759e-06, |
| "loss": 0.3693, |
| "step": 27700 |
| }, |
| { |
| "epoch": 1.6662117667935123, |
| "eval_loss": 0.3365156948566437, |
| "eval_runtime": 46.8492, |
| "eval_samples_per_second": 213.451, |
| "eval_steps_per_second": 26.681, |
| "step": 27700 |
| }, |
| { |
| "epoch": 1.672226971727785, |
| "grad_norm": 0.9838355183601379, |
| "learning_rate": 5.723907969323108e-06, |
| "loss": 0.373, |
| "step": 27800 |
| }, |
| { |
| "epoch": 1.672226971727785, |
| "eval_loss": 0.33353880047798157, |
| "eval_runtime": 47.6968, |
| "eval_samples_per_second": 209.658, |
| "eval_steps_per_second": 26.207, |
| "step": 27800 |
| }, |
| { |
| "epoch": 1.6782421766620574, |
| "grad_norm": 1.0050548315048218, |
| "learning_rate": 5.722907635878626e-06, |
| "loss": 0.3707, |
| "step": 27900 |
| }, |
| { |
| "epoch": 1.6782421766620574, |
| "eval_loss": 0.3265502154827118, |
| "eval_runtime": 48.1571, |
| "eval_samples_per_second": 207.654, |
| "eval_steps_per_second": 25.957, |
| "step": 27900 |
| }, |
| { |
| "epoch": 1.6842573815963302, |
| "grad_norm": 1.0083630084991455, |
| "learning_rate": 5.721907302434145e-06, |
| "loss": 0.3687, |
| "step": 28000 |
| }, |
| { |
| "epoch": 1.6842573815963302, |
| "eval_loss": 0.33139145374298096, |
| "eval_runtime": 48.694, |
| "eval_samples_per_second": 205.364, |
| "eval_steps_per_second": 25.671, |
| "step": 28000 |
| }, |
| { |
| "epoch": 1.6902725865306027, |
| "grad_norm": 0.9649508595466614, |
| "learning_rate": 5.7209069689896636e-06, |
| "loss": 0.3661, |
| "step": 28100 |
| }, |
| { |
| "epoch": 1.6902725865306027, |
| "eval_loss": 0.3332207202911377, |
| "eval_runtime": 40.0334, |
| "eval_samples_per_second": 249.792, |
| "eval_steps_per_second": 31.224, |
| "step": 28100 |
| }, |
| { |
| "epoch": 1.6962877914648753, |
| "grad_norm": 1.042528748512268, |
| "learning_rate": 5.7199066355451814e-06, |
| "loss": 0.3702, |
| "step": 28200 |
| }, |
| { |
| "epoch": 1.6962877914648753, |
| "eval_loss": 0.32571831345558167, |
| "eval_runtime": 49.2797, |
| "eval_samples_per_second": 202.923, |
| "eval_steps_per_second": 25.365, |
| "step": 28200 |
| }, |
| { |
| "epoch": 1.702302996399148, |
| "grad_norm": 0.9756554365158081, |
| "learning_rate": 5.7189063021007e-06, |
| "loss": 0.3647, |
| "step": 28300 |
| }, |
| { |
| "epoch": 1.702302996399148, |
| "eval_loss": 0.3234156668186188, |
| "eval_runtime": 49.7079, |
| "eval_samples_per_second": 201.175, |
| "eval_steps_per_second": 25.147, |
| "step": 28300 |
| }, |
| { |
| "epoch": 1.7083182013334206, |
| "grad_norm": 1.0613596439361572, |
| "learning_rate": 5.717905968656219e-06, |
| "loss": 0.3649, |
| "step": 28400 |
| }, |
| { |
| "epoch": 1.7083182013334206, |
| "eval_loss": 0.32939964532852173, |
| "eval_runtime": 50.06, |
| "eval_samples_per_second": 199.76, |
| "eval_steps_per_second": 24.97, |
| "step": 28400 |
| }, |
| { |
| "epoch": 1.7143334062676931, |
| "grad_norm": 1.0461217164993286, |
| "learning_rate": 5.7169056352117375e-06, |
| "loss": 0.3677, |
| "step": 28500 |
| }, |
| { |
| "epoch": 1.7143334062676931, |
| "eval_loss": 0.32745957374572754, |
| "eval_runtime": 50.0541, |
| "eval_samples_per_second": 199.784, |
| "eval_steps_per_second": 24.973, |
| "step": 28500 |
| }, |
| { |
| "epoch": 1.720348611201966, |
| "grad_norm": 1.0226540565490723, |
| "learning_rate": 5.715905301767256e-06, |
| "loss": 0.3642, |
| "step": 28600 |
| }, |
| { |
| "epoch": 1.720348611201966, |
| "eval_loss": 0.3290911316871643, |
| "eval_runtime": 50.4387, |
| "eval_samples_per_second": 198.26, |
| "eval_steps_per_second": 24.783, |
| "step": 28600 |
| }, |
| { |
| "epoch": 1.7263638161362382, |
| "grad_norm": 1.0498120784759521, |
| "learning_rate": 5.714904968322774e-06, |
| "loss": 0.3626, |
| "step": 28700 |
| }, |
| { |
| "epoch": 1.7263638161362382, |
| "eval_loss": 0.33111146092414856, |
| "eval_runtime": 50.7317, |
| "eval_samples_per_second": 197.115, |
| "eval_steps_per_second": 24.639, |
| "step": 28700 |
| }, |
| { |
| "epoch": 1.732379021070511, |
| "grad_norm": 1.0179612636566162, |
| "learning_rate": 5.713904634878293e-06, |
| "loss": 0.3611, |
| "step": 28800 |
| }, |
| { |
| "epoch": 1.732379021070511, |
| "eval_loss": 0.31966713070869446, |
| "eval_runtime": 35.8874, |
| "eval_samples_per_second": 278.65, |
| "eval_steps_per_second": 34.831, |
| "step": 28800 |
| }, |
| { |
| "epoch": 1.7383942260047835, |
| "grad_norm": 0.9876866340637207, |
| "learning_rate": 5.7129043014338115e-06, |
| "loss": 0.3609, |
| "step": 28900 |
| }, |
| { |
| "epoch": 1.7383942260047835, |
| "eval_loss": 0.3232952356338501, |
| "eval_runtime": 50.8899, |
| "eval_samples_per_second": 196.503, |
| "eval_steps_per_second": 24.563, |
| "step": 28900 |
| }, |
| { |
| "epoch": 1.744409430939056, |
| "grad_norm": 1.08419668674469, |
| "learning_rate": 5.711903967989329e-06, |
| "loss": 0.3621, |
| "step": 29000 |
| }, |
| { |
| "epoch": 1.744409430939056, |
| "eval_loss": 0.32880115509033203, |
| "eval_runtime": 50.9007, |
| "eval_samples_per_second": 196.461, |
| "eval_steps_per_second": 24.558, |
| "step": 29000 |
| }, |
| { |
| "epoch": 1.7504246358733289, |
| "grad_norm": 1.0506683588027954, |
| "learning_rate": 5.710903634544848e-06, |
| "loss": 0.3612, |
| "step": 29100 |
| }, |
| { |
| "epoch": 1.7504246358733289, |
| "eval_loss": 0.32626426219940186, |
| "eval_runtime": 51.3181, |
| "eval_samples_per_second": 194.863, |
| "eval_steps_per_second": 24.358, |
| "step": 29100 |
| }, |
| { |
| "epoch": 1.7564398408076014, |
| "grad_norm": 1.0610612630844116, |
| "learning_rate": 5.709903301100367e-06, |
| "loss": 0.3604, |
| "step": 29200 |
| }, |
| { |
| "epoch": 1.7564398408076014, |
| "eval_loss": 0.32427623867988586, |
| "eval_runtime": 51.1109, |
| "eval_samples_per_second": 195.653, |
| "eval_steps_per_second": 24.457, |
| "step": 29200 |
| }, |
| { |
| "epoch": 1.762455045741874, |
| "grad_norm": 1.0237441062927246, |
| "learning_rate": 5.7089029676558855e-06, |
| "loss": 0.3576, |
| "step": 29300 |
| }, |
| { |
| "epoch": 1.762455045741874, |
| "eval_loss": 0.325724720954895, |
| "eval_runtime": 51.0538, |
| "eval_samples_per_second": 195.872, |
| "eval_steps_per_second": 24.484, |
| "step": 29300 |
| }, |
| { |
| "epoch": 1.7684702506761467, |
| "grad_norm": 1.0518171787261963, |
| "learning_rate": 5.707902634211404e-06, |
| "loss": 0.3623, |
| "step": 29400 |
| }, |
| { |
| "epoch": 1.7684702506761467, |
| "eval_loss": 0.3236755430698395, |
| "eval_runtime": 51.279, |
| "eval_samples_per_second": 195.012, |
| "eval_steps_per_second": 24.376, |
| "step": 29400 |
| }, |
| { |
| "epoch": 1.7744854556104193, |
| "grad_norm": 1.008692741394043, |
| "learning_rate": 5.706902300766923e-06, |
| "loss": 0.3594, |
| "step": 29500 |
| }, |
| { |
| "epoch": 1.7744854556104193, |
| "eval_loss": 0.322955846786499, |
| "eval_runtime": 50.9674, |
| "eval_samples_per_second": 196.204, |
| "eval_steps_per_second": 24.525, |
| "step": 29500 |
| }, |
| { |
| "epoch": 1.7805006605446918, |
| "grad_norm": 1.0272122621536255, |
| "learning_rate": 5.705901967322441e-06, |
| "loss": 0.3589, |
| "step": 29600 |
| }, |
| { |
| "epoch": 1.7805006605446918, |
| "eval_loss": 0.32889479398727417, |
| "eval_runtime": 51.0901, |
| "eval_samples_per_second": 195.733, |
| "eval_steps_per_second": 24.467, |
| "step": 29600 |
| }, |
| { |
| "epoch": 1.7865158654789646, |
| "grad_norm": 0.9986202120780945, |
| "learning_rate": 5.7049016338779595e-06, |
| "loss": 0.3583, |
| "step": 29700 |
| }, |
| { |
| "epoch": 1.7865158654789646, |
| "eval_loss": 0.32579848170280457, |
| "eval_runtime": 51.3308, |
| "eval_samples_per_second": 194.815, |
| "eval_steps_per_second": 24.352, |
| "step": 29700 |
| }, |
| { |
| "epoch": 1.7925310704132371, |
| "grad_norm": 1.1426304578781128, |
| "learning_rate": 5.703901300433477e-06, |
| "loss": 0.3578, |
| "step": 29800 |
| }, |
| { |
| "epoch": 1.7925310704132371, |
| "eval_loss": 0.3219316303730011, |
| "eval_runtime": 51.0488, |
| "eval_samples_per_second": 195.891, |
| "eval_steps_per_second": 24.486, |
| "step": 29800 |
| }, |
| { |
| "epoch": 1.7985462753475097, |
| "grad_norm": 1.0315282344818115, |
| "learning_rate": 5.702900966988996e-06, |
| "loss": 0.3554, |
| "step": 29900 |
| }, |
| { |
| "epoch": 1.7985462753475097, |
| "eval_loss": 0.3245343267917633, |
| "eval_runtime": 51.1337, |
| "eval_samples_per_second": 195.566, |
| "eval_steps_per_second": 24.446, |
| "step": 29900 |
| }, |
| { |
| "epoch": 1.8045614802817824, |
| "grad_norm": 0.9708550572395325, |
| "learning_rate": 5.701900633544515e-06, |
| "loss": 0.3576, |
| "step": 30000 |
| }, |
| { |
| "epoch": 1.8045614802817824, |
| "eval_loss": 0.3180968761444092, |
| "eval_runtime": 51.0446, |
| "eval_samples_per_second": 195.907, |
| "eval_steps_per_second": 24.488, |
| "step": 30000 |
| }, |
| { |
| "epoch": 1.8105766852160547, |
| "grad_norm": 0.9034538865089417, |
| "learning_rate": 5.7009003001000335e-06, |
| "loss": 0.3537, |
| "step": 30100 |
| }, |
| { |
| "epoch": 1.8105766852160547, |
| "eval_loss": 0.3229399621486664, |
| "eval_runtime": 51.0689, |
| "eval_samples_per_second": 195.814, |
| "eval_steps_per_second": 24.477, |
| "step": 30100 |
| }, |
| { |
| "epoch": 1.8165918901503275, |
| "grad_norm": 1.0373872518539429, |
| "learning_rate": 5.699899966655552e-06, |
| "loss": 0.356, |
| "step": 30200 |
| }, |
| { |
| "epoch": 1.8165918901503275, |
| "eval_loss": 0.3164275288581848, |
| "eval_runtime": 51.4888, |
| "eval_samples_per_second": 194.217, |
| "eval_steps_per_second": 24.277, |
| "step": 30200 |
| }, |
| { |
| "epoch": 1.8226070950846, |
| "grad_norm": 1.073961615562439, |
| "learning_rate": 5.698899633211071e-06, |
| "loss": 0.3574, |
| "step": 30300 |
| }, |
| { |
| "epoch": 1.8226070950846, |
| "eval_loss": 0.3165951669216156, |
| "eval_runtime": 51.0637, |
| "eval_samples_per_second": 195.834, |
| "eval_steps_per_second": 24.479, |
| "step": 30300 |
| }, |
| { |
| "epoch": 1.8286223000188726, |
| "grad_norm": 0.9891506433486938, |
| "learning_rate": 5.697899299766589e-06, |
| "loss": 0.3548, |
| "step": 30400 |
| }, |
| { |
| "epoch": 1.8286223000188726, |
| "eval_loss": 0.3134399354457855, |
| "eval_runtime": 51.2735, |
| "eval_samples_per_second": 195.032, |
| "eval_steps_per_second": 24.379, |
| "step": 30400 |
| }, |
| { |
| "epoch": 1.8346375049531454, |
| "grad_norm": 0.9468514919281006, |
| "learning_rate": 5.6968989663221075e-06, |
| "loss": 0.3534, |
| "step": 30500 |
| }, |
| { |
| "epoch": 1.8346375049531454, |
| "eval_loss": 0.3175615966320038, |
| "eval_runtime": 51.0054, |
| "eval_samples_per_second": 196.058, |
| "eval_steps_per_second": 24.507, |
| "step": 30500 |
| }, |
| { |
| "epoch": 1.840652709887418, |
| "grad_norm": 1.0942094326019287, |
| "learning_rate": 5.695898632877625e-06, |
| "loss": 0.3551, |
| "step": 30600 |
| }, |
| { |
| "epoch": 1.840652709887418, |
| "eval_loss": 0.31934764981269836, |
| "eval_runtime": 50.744, |
| "eval_samples_per_second": 197.068, |
| "eval_steps_per_second": 24.633, |
| "step": 30600 |
| }, |
| { |
| "epoch": 1.8466679148216905, |
| "grad_norm": 1.0087659358978271, |
| "learning_rate": 5.694898299433144e-06, |
| "loss": 0.3534, |
| "step": 30700 |
| }, |
| { |
| "epoch": 1.8466679148216905, |
| "eval_loss": 0.3216070532798767, |
| "eval_runtime": 51.2443, |
| "eval_samples_per_second": 195.144, |
| "eval_steps_per_second": 24.393, |
| "step": 30700 |
| }, |
| { |
| "epoch": 1.8526831197559632, |
| "grad_norm": 0.973987340927124, |
| "learning_rate": 5.693897965988664e-06, |
| "loss": 0.3551, |
| "step": 30800 |
| }, |
| { |
| "epoch": 1.8526831197559632, |
| "eval_loss": 0.3222227990627289, |
| "eval_runtime": 51.317, |
| "eval_samples_per_second": 194.867, |
| "eval_steps_per_second": 24.358, |
| "step": 30800 |
| }, |
| { |
| "epoch": 1.8586983246902358, |
| "grad_norm": 1.0220999717712402, |
| "learning_rate": 5.6928976325441814e-06, |
| "loss": 0.3512, |
| "step": 30900 |
| }, |
| { |
| "epoch": 1.8586983246902358, |
| "eval_loss": 0.3149110972881317, |
| "eval_runtime": 50.9851, |
| "eval_samples_per_second": 196.136, |
| "eval_steps_per_second": 24.517, |
| "step": 30900 |
| }, |
| { |
| "epoch": 1.8647135296245083, |
| "grad_norm": 0.9891929626464844, |
| "learning_rate": 5.6918972990997e-06, |
| "loss": 0.3494, |
| "step": 31000 |
| }, |
| { |
| "epoch": 1.8647135296245083, |
| "eval_loss": 0.3158430755138397, |
| "eval_runtime": 51.0404, |
| "eval_samples_per_second": 195.923, |
| "eval_steps_per_second": 24.49, |
| "step": 31000 |
| }, |
| { |
| "epoch": 1.870728734558781, |
| "grad_norm": 1.0088871717453003, |
| "learning_rate": 5.690896965655219e-06, |
| "loss": 0.3554, |
| "step": 31100 |
| }, |
| { |
| "epoch": 1.870728734558781, |
| "eval_loss": 0.3154695928096771, |
| "eval_runtime": 51.3526, |
| "eval_samples_per_second": 194.732, |
| "eval_steps_per_second": 24.342, |
| "step": 31100 |
| }, |
| { |
| "epoch": 1.8767439394930534, |
| "grad_norm": 1.050904393196106, |
| "learning_rate": 5.689896632210737e-06, |
| "loss": 0.348, |
| "step": 31200 |
| }, |
| { |
| "epoch": 1.8767439394930534, |
| "eval_loss": 0.3176015019416809, |
| "eval_runtime": 50.968, |
| "eval_samples_per_second": 196.202, |
| "eval_steps_per_second": 24.525, |
| "step": 31200 |
| }, |
| { |
| "epoch": 1.8827591444273262, |
| "grad_norm": 0.9467193484306335, |
| "learning_rate": 5.688896298766255e-06, |
| "loss": 0.3495, |
| "step": 31300 |
| }, |
| { |
| "epoch": 1.8827591444273262, |
| "eval_loss": 0.31329813599586487, |
| "eval_runtime": 51.0441, |
| "eval_samples_per_second": 195.909, |
| "eval_steps_per_second": 24.489, |
| "step": 31300 |
| }, |
| { |
| "epoch": 1.888774349361599, |
| "grad_norm": 0.9775587916374207, |
| "learning_rate": 5.687895965321774e-06, |
| "loss": 0.348, |
| "step": 31400 |
| }, |
| { |
| "epoch": 1.888774349361599, |
| "eval_loss": 0.3119243383407593, |
| "eval_runtime": 51.4209, |
| "eval_samples_per_second": 194.474, |
| "eval_steps_per_second": 24.309, |
| "step": 31400 |
| }, |
| { |
| "epoch": 1.8947895542958713, |
| "grad_norm": 0.9961014986038208, |
| "learning_rate": 5.686895631877293e-06, |
| "loss": 0.3481, |
| "step": 31500 |
| }, |
| { |
| "epoch": 1.8947895542958713, |
| "eval_loss": 0.3146650791168213, |
| "eval_runtime": 51.0401, |
| "eval_samples_per_second": 195.924, |
| "eval_steps_per_second": 24.491, |
| "step": 31500 |
| }, |
| { |
| "epoch": 1.900804759230144, |
| "grad_norm": 0.9647944569587708, |
| "learning_rate": 5.6858952984328115e-06, |
| "loss": 0.3485, |
| "step": 31600 |
| }, |
| { |
| "epoch": 1.900804759230144, |
| "eval_loss": 0.3082703948020935, |
| "eval_runtime": 51.0736, |
| "eval_samples_per_second": 195.796, |
| "eval_steps_per_second": 24.474, |
| "step": 31600 |
| }, |
| { |
| "epoch": 1.9068199641644166, |
| "grad_norm": 0.977745532989502, |
| "learning_rate": 5.684894964988329e-06, |
| "loss": 0.346, |
| "step": 31700 |
| }, |
| { |
| "epoch": 1.9068199641644166, |
| "eval_loss": 0.31021973490715027, |
| "eval_runtime": 51.3893, |
| "eval_samples_per_second": 194.593, |
| "eval_steps_per_second": 24.324, |
| "step": 31700 |
| }, |
| { |
| "epoch": 1.9128351690986891, |
| "grad_norm": 1.007712960243225, |
| "learning_rate": 5.683894631543848e-06, |
| "loss": 0.3439, |
| "step": 31800 |
| }, |
| { |
| "epoch": 1.9128351690986891, |
| "eval_loss": 0.3149736225605011, |
| "eval_runtime": 50.9919, |
| "eval_samples_per_second": 196.109, |
| "eval_steps_per_second": 24.514, |
| "step": 31800 |
| }, |
| { |
| "epoch": 1.9188503740329619, |
| "grad_norm": 0.9901500940322876, |
| "learning_rate": 5.682894298099367e-06, |
| "loss": 0.3465, |
| "step": 31900 |
| }, |
| { |
| "epoch": 1.9188503740329619, |
| "eval_loss": 0.3099238872528076, |
| "eval_runtime": 49.9711, |
| "eval_samples_per_second": 200.116, |
| "eval_steps_per_second": 25.014, |
| "step": 31900 |
| }, |
| { |
| "epoch": 1.9248655789672344, |
| "grad_norm": 1.0771408081054688, |
| "learning_rate": 5.681893964654885e-06, |
| "loss": 0.3469, |
| "step": 32000 |
| }, |
| { |
| "epoch": 1.9248655789672344, |
| "eval_loss": 0.3117373585700989, |
| "eval_runtime": 51.3413, |
| "eval_samples_per_second": 194.775, |
| "eval_steps_per_second": 24.347, |
| "step": 32000 |
| }, |
| { |
| "epoch": 1.930880783901507, |
| "grad_norm": 0.9278393983840942, |
| "learning_rate": 5.680893631210403e-06, |
| "loss": 0.3449, |
| "step": 32100 |
| }, |
| { |
| "epoch": 1.930880783901507, |
| "eval_loss": 0.3087506890296936, |
| "eval_runtime": 50.9985, |
| "eval_samples_per_second": 196.084, |
| "eval_steps_per_second": 24.511, |
| "step": 32100 |
| }, |
| { |
| "epoch": 1.9368959888357797, |
| "grad_norm": 0.9451966285705566, |
| "learning_rate": 5.679893297765922e-06, |
| "loss": 0.3481, |
| "step": 32200 |
| }, |
| { |
| "epoch": 1.9368959888357797, |
| "eval_loss": 0.30677124857902527, |
| "eval_runtime": 51.0702, |
| "eval_samples_per_second": 195.809, |
| "eval_steps_per_second": 24.476, |
| "step": 32200 |
| }, |
| { |
| "epoch": 1.9429111937700523, |
| "grad_norm": 1.0483254194259644, |
| "learning_rate": 5.678892964321441e-06, |
| "loss": 0.3445, |
| "step": 32300 |
| }, |
| { |
| "epoch": 1.9429111937700523, |
| "eval_loss": 0.30840355157852173, |
| "eval_runtime": 51.0518, |
| "eval_samples_per_second": 195.879, |
| "eval_steps_per_second": 24.485, |
| "step": 32300 |
| }, |
| { |
| "epoch": 1.9489263987043248, |
| "grad_norm": 1.0422637462615967, |
| "learning_rate": 5.6778926308769595e-06, |
| "loss": 0.3441, |
| "step": 32400 |
| }, |
| { |
| "epoch": 1.9489263987043248, |
| "eval_loss": 0.3115750849246979, |
| "eval_runtime": 51.1153, |
| "eval_samples_per_second": 195.636, |
| "eval_steps_per_second": 24.455, |
| "step": 32400 |
| }, |
| { |
| "epoch": 1.9549416036385976, |
| "grad_norm": 0.9909389019012451, |
| "learning_rate": 5.676892297432478e-06, |
| "loss": 0.344, |
| "step": 32500 |
| }, |
| { |
| "epoch": 1.9549416036385976, |
| "eval_loss": 0.30596745014190674, |
| "eval_runtime": 51.3225, |
| "eval_samples_per_second": 194.846, |
| "eval_steps_per_second": 24.356, |
| "step": 32500 |
| }, |
| { |
| "epoch": 1.96095680857287, |
| "grad_norm": 0.9379361271858215, |
| "learning_rate": 5.675891963987996e-06, |
| "loss": 0.3451, |
| "step": 32600 |
| }, |
| { |
| "epoch": 1.96095680857287, |
| "eval_loss": 0.3045947253704071, |
| "eval_runtime": 48.1799, |
| "eval_samples_per_second": 207.555, |
| "eval_steps_per_second": 25.944, |
| "step": 32600 |
| }, |
| { |
| "epoch": 1.9669720135071427, |
| "grad_norm": 0.9916946887969971, |
| "learning_rate": 5.674891630543515e-06, |
| "loss": 0.3435, |
| "step": 32700 |
| }, |
| { |
| "epoch": 1.9669720135071427, |
| "eval_loss": 0.3098689019680023, |
| "eval_runtime": 51.0219, |
| "eval_samples_per_second": 195.994, |
| "eval_steps_per_second": 24.499, |
| "step": 32700 |
| }, |
| { |
| "epoch": 1.9729872184414154, |
| "grad_norm": 1.0491201877593994, |
| "learning_rate": 5.673891297099033e-06, |
| "loss": 0.3451, |
| "step": 32800 |
| }, |
| { |
| "epoch": 1.9729872184414154, |
| "eval_loss": 0.307062566280365, |
| "eval_runtime": 51.2447, |
| "eval_samples_per_second": 195.142, |
| "eval_steps_per_second": 24.393, |
| "step": 32800 |
| }, |
| { |
| "epoch": 1.9790024233756878, |
| "grad_norm": 1.0011417865753174, |
| "learning_rate": 5.672890963654551e-06, |
| "loss": 0.3438, |
| "step": 32900 |
| }, |
| { |
| "epoch": 1.9790024233756878, |
| "eval_loss": 0.30759868025779724, |
| "eval_runtime": 51.2551, |
| "eval_samples_per_second": 195.103, |
| "eval_steps_per_second": 24.388, |
| "step": 32900 |
| }, |
| { |
| "epoch": 1.9850176283099605, |
| "grad_norm": 0.997515082359314, |
| "learning_rate": 5.67189063021007e-06, |
| "loss": 0.3401, |
| "step": 33000 |
| }, |
| { |
| "epoch": 1.9850176283099605, |
| "eval_loss": 0.30724722146987915, |
| "eval_runtime": 51.0456, |
| "eval_samples_per_second": 195.903, |
| "eval_steps_per_second": 24.488, |
| "step": 33000 |
| }, |
| { |
| "epoch": 1.991032833244233, |
| "grad_norm": 1.00389564037323, |
| "learning_rate": 5.670890296765589e-06, |
| "loss": 0.3435, |
| "step": 33100 |
| }, |
| { |
| "epoch": 1.991032833244233, |
| "eval_loss": 0.30223432183265686, |
| "eval_runtime": 51.0634, |
| "eval_samples_per_second": 195.835, |
| "eval_steps_per_second": 24.479, |
| "step": 33100 |
| }, |
| { |
| "epoch": 1.9970480381785056, |
| "grad_norm": 1.0292458534240723, |
| "learning_rate": 5.6698899633211075e-06, |
| "loss": 0.342, |
| "step": 33200 |
| }, |
| { |
| "epoch": 1.9970480381785056, |
| "eval_loss": 0.3018937110900879, |
| "eval_runtime": 51.3884, |
| "eval_samples_per_second": 194.597, |
| "eval_steps_per_second": 24.325, |
| "step": 33200 |
| }, |
| { |
| "epoch": 2.0030632431127784, |
| "grad_norm": 0.9542250037193298, |
| "learning_rate": 5.668889629876626e-06, |
| "loss": 0.3437, |
| "step": 33300 |
| }, |
| { |
| "epoch": 2.0030632431127784, |
| "eval_loss": 0.3050287961959839, |
| "eval_runtime": 48.1087, |
| "eval_samples_per_second": 207.863, |
| "eval_steps_per_second": 25.983, |
| "step": 33300 |
| }, |
| { |
| "epoch": 2.0090784480470507, |
| "grad_norm": 0.9858297109603882, |
| "learning_rate": 5.667889296432144e-06, |
| "loss": 0.3376, |
| "step": 33400 |
| }, |
| { |
| "epoch": 2.0090784480470507, |
| "eval_loss": 0.3004157543182373, |
| "eval_runtime": 50.8704, |
| "eval_samples_per_second": 196.578, |
| "eval_steps_per_second": 24.572, |
| "step": 33400 |
| }, |
| { |
| "epoch": 2.0150936529813235, |
| "grad_norm": 0.9825339317321777, |
| "learning_rate": 5.666888962987663e-06, |
| "loss": 0.3387, |
| "step": 33500 |
| }, |
| { |
| "epoch": 2.0150936529813235, |
| "eval_loss": 0.3035270869731903, |
| "eval_runtime": 51.1972, |
| "eval_samples_per_second": 195.323, |
| "eval_steps_per_second": 24.415, |
| "step": 33500 |
| }, |
| { |
| "epoch": 2.0211088579155962, |
| "grad_norm": 0.9198622703552246, |
| "learning_rate": 5.665888629543181e-06, |
| "loss": 0.336, |
| "step": 33600 |
| }, |
| { |
| "epoch": 2.0211088579155962, |
| "eval_loss": 0.30675825476646423, |
| "eval_runtime": 50.9963, |
| "eval_samples_per_second": 196.093, |
| "eval_steps_per_second": 24.512, |
| "step": 33600 |
| }, |
| { |
| "epoch": 2.0271240628498686, |
| "grad_norm": 0.9473734498023987, |
| "learning_rate": 5.664888296098699e-06, |
| "loss": 0.336, |
| "step": 33700 |
| }, |
| { |
| "epoch": 2.0271240628498686, |
| "eval_loss": 0.3050824701786041, |
| "eval_runtime": 51.1058, |
| "eval_samples_per_second": 195.673, |
| "eval_steps_per_second": 24.459, |
| "step": 33700 |
| }, |
| { |
| "epoch": 2.0331392677841413, |
| "grad_norm": 0.9824632406234741, |
| "learning_rate": 5.663887962654218e-06, |
| "loss": 0.3366, |
| "step": 33800 |
| }, |
| { |
| "epoch": 2.0331392677841413, |
| "eval_loss": 0.3059363067150116, |
| "eval_runtime": 51.3136, |
| "eval_samples_per_second": 194.88, |
| "eval_steps_per_second": 24.36, |
| "step": 33800 |
| }, |
| { |
| "epoch": 2.039154472718414, |
| "grad_norm": 0.8891803622245789, |
| "learning_rate": 5.662887629209737e-06, |
| "loss": 0.3373, |
| "step": 33900 |
| }, |
| { |
| "epoch": 2.039154472718414, |
| "eval_loss": 0.2996893525123596, |
| "eval_runtime": 51.0027, |
| "eval_samples_per_second": 196.068, |
| "eval_steps_per_second": 24.509, |
| "step": 33900 |
| }, |
| { |
| "epoch": 2.0451696776526864, |
| "grad_norm": 1.0512337684631348, |
| "learning_rate": 5.6618872957652554e-06, |
| "loss": 0.3367, |
| "step": 34000 |
| }, |
| { |
| "epoch": 2.0451696776526864, |
| "eval_loss": 0.3059813976287842, |
| "eval_runtime": 48.247, |
| "eval_samples_per_second": 207.267, |
| "eval_steps_per_second": 25.908, |
| "step": 34000 |
| }, |
| { |
| "epoch": 2.051184882586959, |
| "grad_norm": 0.9054902791976929, |
| "learning_rate": 5.660886962320774e-06, |
| "loss": 0.3371, |
| "step": 34100 |
| }, |
| { |
| "epoch": 2.051184882586959, |
| "eval_loss": 0.3016323745250702, |
| "eval_runtime": 51.1014, |
| "eval_samples_per_second": 195.69, |
| "eval_steps_per_second": 24.461, |
| "step": 34100 |
| }, |
| { |
| "epoch": 2.057200087521232, |
| "grad_norm": 0.9262953400611877, |
| "learning_rate": 5.659886628876292e-06, |
| "loss": 0.3367, |
| "step": 34200 |
| }, |
| { |
| "epoch": 2.057200087521232, |
| "eval_loss": 0.29450055956840515, |
| "eval_runtime": 51.0335, |
| "eval_samples_per_second": 195.95, |
| "eval_steps_per_second": 24.494, |
| "step": 34200 |
| }, |
| { |
| "epoch": 2.0632152924555043, |
| "grad_norm": 0.9734236001968384, |
| "learning_rate": 5.658886295431811e-06, |
| "loss": 0.3343, |
| "step": 34300 |
| }, |
| { |
| "epoch": 2.0632152924555043, |
| "eval_loss": 0.3005402684211731, |
| "eval_runtime": 51.0508, |
| "eval_samples_per_second": 195.883, |
| "eval_steps_per_second": 24.485, |
| "step": 34300 |
| }, |
| { |
| "epoch": 2.069230497389777, |
| "grad_norm": 1.0002549886703491, |
| "learning_rate": 5.657885961987329e-06, |
| "loss": 0.3322, |
| "step": 34400 |
| }, |
| { |
| "epoch": 2.069230497389777, |
| "eval_loss": 0.2977810204029083, |
| "eval_runtime": 51.3717, |
| "eval_samples_per_second": 194.66, |
| "eval_steps_per_second": 24.332, |
| "step": 34400 |
| }, |
| { |
| "epoch": 2.07524570232405, |
| "grad_norm": 1.0582560300827026, |
| "learning_rate": 5.656885628542847e-06, |
| "loss": 0.3335, |
| "step": 34500 |
| }, |
| { |
| "epoch": 2.07524570232405, |
| "eval_loss": 0.30631959438323975, |
| "eval_runtime": 51.4392, |
| "eval_samples_per_second": 194.404, |
| "eval_steps_per_second": 24.301, |
| "step": 34500 |
| }, |
| { |
| "epoch": 2.081260907258322, |
| "grad_norm": 0.9257709383964539, |
| "learning_rate": 5.655885295098366e-06, |
| "loss": 0.3348, |
| "step": 34600 |
| }, |
| { |
| "epoch": 2.081260907258322, |
| "eval_loss": 0.296891450881958, |
| "eval_runtime": 51.1063, |
| "eval_samples_per_second": 195.671, |
| "eval_steps_per_second": 24.459, |
| "step": 34600 |
| }, |
| { |
| "epoch": 2.087276112192595, |
| "grad_norm": 0.9784733653068542, |
| "learning_rate": 5.654884961653885e-06, |
| "loss": 0.3351, |
| "step": 34700 |
| }, |
| { |
| "epoch": 2.087276112192595, |
| "eval_loss": 0.30041709542274475, |
| "eval_runtime": 36.3799, |
| "eval_samples_per_second": 274.877, |
| "eval_steps_per_second": 34.36, |
| "step": 34700 |
| }, |
| { |
| "epoch": 2.0932913171268677, |
| "grad_norm": 0.9119441509246826, |
| "learning_rate": 5.653884628209403e-06, |
| "loss": 0.3331, |
| "step": 34800 |
| }, |
| { |
| "epoch": 2.0932913171268677, |
| "eval_loss": 0.2985159754753113, |
| "eval_runtime": 51.0698, |
| "eval_samples_per_second": 195.811, |
| "eval_steps_per_second": 24.476, |
| "step": 34800 |
| }, |
| { |
| "epoch": 2.09930652206114, |
| "grad_norm": 0.8888152837753296, |
| "learning_rate": 5.652884294764922e-06, |
| "loss": 0.3329, |
| "step": 34900 |
| }, |
| { |
| "epoch": 2.09930652206114, |
| "eval_loss": 0.2997465431690216, |
| "eval_runtime": 51.2789, |
| "eval_samples_per_second": 195.012, |
| "eval_steps_per_second": 24.377, |
| "step": 34900 |
| }, |
| { |
| "epoch": 2.1053217269954128, |
| "grad_norm": 0.9288111329078674, |
| "learning_rate": 5.65188396132044e-06, |
| "loss": 0.3293, |
| "step": 35000 |
| }, |
| { |
| "epoch": 2.1053217269954128, |
| "eval_loss": 0.30220091342926025, |
| "eval_runtime": 51.0672, |
| "eval_samples_per_second": 195.82, |
| "eval_steps_per_second": 24.478, |
| "step": 35000 |
| }, |
| { |
| "epoch": 2.111336931929685, |
| "grad_norm": 0.9979832172393799, |
| "learning_rate": 5.650883627875959e-06, |
| "loss": 0.3335, |
| "step": 35100 |
| }, |
| { |
| "epoch": 2.111336931929685, |
| "eval_loss": 0.2983012795448303, |
| "eval_runtime": 51.1125, |
| "eval_samples_per_second": 195.647, |
| "eval_steps_per_second": 24.456, |
| "step": 35100 |
| }, |
| { |
| "epoch": 2.117352136863958, |
| "grad_norm": 0.9908544421195984, |
| "learning_rate": 5.649883294431477e-06, |
| "loss": 0.3308, |
| "step": 35200 |
| }, |
| { |
| "epoch": 2.117352136863958, |
| "eval_loss": 0.294648677110672, |
| "eval_runtime": 51.0363, |
| "eval_samples_per_second": 195.939, |
| "eval_steps_per_second": 24.492, |
| "step": 35200 |
| }, |
| { |
| "epoch": 2.1233673417982306, |
| "grad_norm": 0.9367330074310303, |
| "learning_rate": 5.648882960986995e-06, |
| "loss": 0.3308, |
| "step": 35300 |
| }, |
| { |
| "epoch": 2.1233673417982306, |
| "eval_loss": 0.2953595817089081, |
| "eval_runtime": 51.3129, |
| "eval_samples_per_second": 194.883, |
| "eval_steps_per_second": 24.36, |
| "step": 35300 |
| }, |
| { |
| "epoch": 2.129382546732503, |
| "grad_norm": 0.923230767250061, |
| "learning_rate": 5.647882627542515e-06, |
| "loss": 0.3305, |
| "step": 35400 |
| }, |
| { |
| "epoch": 2.129382546732503, |
| "eval_loss": 0.2954292893409729, |
| "eval_runtime": 51.1146, |
| "eval_samples_per_second": 195.639, |
| "eval_steps_per_second": 24.455, |
| "step": 35400 |
| }, |
| { |
| "epoch": 2.1353977516667757, |
| "grad_norm": 0.9737799167633057, |
| "learning_rate": 5.6468822940980335e-06, |
| "loss": 0.3321, |
| "step": 35500 |
| }, |
| { |
| "epoch": 2.1353977516667757, |
| "eval_loss": 0.2911643981933594, |
| "eval_runtime": 51.5291, |
| "eval_samples_per_second": 194.065, |
| "eval_steps_per_second": 24.258, |
| "step": 35500 |
| }, |
| { |
| "epoch": 2.1414129566010485, |
| "grad_norm": 0.957861602306366, |
| "learning_rate": 5.645881960653551e-06, |
| "loss": 0.3304, |
| "step": 35600 |
| }, |
| { |
| "epoch": 2.1414129566010485, |
| "eval_loss": 0.29846978187561035, |
| "eval_runtime": 50.954, |
| "eval_samples_per_second": 196.255, |
| "eval_steps_per_second": 24.532, |
| "step": 35600 |
| }, |
| { |
| "epoch": 2.147428161535321, |
| "grad_norm": 0.9183242321014404, |
| "learning_rate": 5.64488162720907e-06, |
| "loss": 0.3271, |
| "step": 35700 |
| }, |
| { |
| "epoch": 2.147428161535321, |
| "eval_loss": 0.2944715619087219, |
| "eval_runtime": 51.2205, |
| "eval_samples_per_second": 195.234, |
| "eval_steps_per_second": 24.404, |
| "step": 35700 |
| }, |
| { |
| "epoch": 2.1534433664695936, |
| "grad_norm": 0.9701703190803528, |
| "learning_rate": 5.643881293764588e-06, |
| "loss": 0.3293, |
| "step": 35800 |
| }, |
| { |
| "epoch": 2.1534433664695936, |
| "eval_loss": 0.29417359828948975, |
| "eval_runtime": 51.0579, |
| "eval_samples_per_second": 195.856, |
| "eval_steps_per_second": 24.482, |
| "step": 35800 |
| }, |
| { |
| "epoch": 2.1594585714038663, |
| "grad_norm": 0.992079496383667, |
| "learning_rate": 5.642880960320107e-06, |
| "loss": 0.3263, |
| "step": 35900 |
| }, |
| { |
| "epoch": 2.1594585714038663, |
| "eval_loss": 0.29444122314453125, |
| "eval_runtime": 51.0557, |
| "eval_samples_per_second": 195.864, |
| "eval_steps_per_second": 24.483, |
| "step": 35900 |
| }, |
| { |
| "epoch": 2.1654737763381386, |
| "grad_norm": 0.9776268005371094, |
| "learning_rate": 5.641880626875625e-06, |
| "loss": 0.3266, |
| "step": 36000 |
| }, |
| { |
| "epoch": 2.1654737763381386, |
| "eval_loss": 0.29786214232444763, |
| "eval_runtime": 44.4576, |
| "eval_samples_per_second": 224.934, |
| "eval_steps_per_second": 28.117, |
| "step": 36000 |
| }, |
| { |
| "epoch": 2.1714889812724114, |
| "grad_norm": 1.0352015495300293, |
| "learning_rate": 5.640880293431144e-06, |
| "loss": 0.3279, |
| "step": 36100 |
| }, |
| { |
| "epoch": 2.1714889812724114, |
| "eval_loss": 0.2935112416744232, |
| "eval_runtime": 51.0332, |
| "eval_samples_per_second": 195.951, |
| "eval_steps_per_second": 24.494, |
| "step": 36100 |
| }, |
| { |
| "epoch": 2.1775041862066837, |
| "grad_norm": 0.9267537593841553, |
| "learning_rate": 5.639879959986663e-06, |
| "loss": 0.3252, |
| "step": 36200 |
| }, |
| { |
| "epoch": 2.1775041862066837, |
| "eval_loss": 0.2946629822254181, |
| "eval_runtime": 51.0517, |
| "eval_samples_per_second": 195.88, |
| "eval_steps_per_second": 24.485, |
| "step": 36200 |
| }, |
| { |
| "epoch": 2.1835193911409565, |
| "grad_norm": 0.8838132619857788, |
| "learning_rate": 5.6388796265421815e-06, |
| "loss": 0.3273, |
| "step": 36300 |
| }, |
| { |
| "epoch": 2.1835193911409565, |
| "eval_loss": 0.28932899236679077, |
| "eval_runtime": 50.4286, |
| "eval_samples_per_second": 198.3, |
| "eval_steps_per_second": 24.788, |
| "step": 36300 |
| }, |
| { |
| "epoch": 2.1895345960752293, |
| "grad_norm": 0.9279465079307556, |
| "learning_rate": 5.637879293097699e-06, |
| "loss": 0.3282, |
| "step": 36400 |
| }, |
| { |
| "epoch": 2.1895345960752293, |
| "eval_loss": 0.2960895895957947, |
| "eval_runtime": 51.1104, |
| "eval_samples_per_second": 195.655, |
| "eval_steps_per_second": 24.457, |
| "step": 36400 |
| }, |
| { |
| "epoch": 2.1955498010095016, |
| "grad_norm": 1.0713165998458862, |
| "learning_rate": 5.636878959653218e-06, |
| "loss": 0.3269, |
| "step": 36500 |
| }, |
| { |
| "epoch": 2.1955498010095016, |
| "eval_loss": 0.29087430238723755, |
| "eval_runtime": 51.0616, |
| "eval_samples_per_second": 195.842, |
| "eval_steps_per_second": 24.48, |
| "step": 36500 |
| }, |
| { |
| "epoch": 2.2015650059437744, |
| "grad_norm": 0.966033935546875, |
| "learning_rate": 5.635878626208736e-06, |
| "loss": 0.3258, |
| "step": 36600 |
| }, |
| { |
| "epoch": 2.2015650059437744, |
| "eval_loss": 0.2945682108402252, |
| "eval_runtime": 51.2162, |
| "eval_samples_per_second": 195.251, |
| "eval_steps_per_second": 24.406, |
| "step": 36600 |
| }, |
| { |
| "epoch": 2.207580210878047, |
| "grad_norm": 1.0510607957839966, |
| "learning_rate": 5.634878292764255e-06, |
| "loss": 0.3239, |
| "step": 36700 |
| }, |
| { |
| "epoch": 2.207580210878047, |
| "eval_loss": 0.29083874821662903, |
| "eval_runtime": 51.0865, |
| "eval_samples_per_second": 195.746, |
| "eval_steps_per_second": 24.468, |
| "step": 36700 |
| }, |
| { |
| "epoch": 2.2135954158123194, |
| "grad_norm": 0.9516984224319458, |
| "learning_rate": 5.633877959319773e-06, |
| "loss": 0.3242, |
| "step": 36800 |
| }, |
| { |
| "epoch": 2.2135954158123194, |
| "eval_loss": 0.287597119808197, |
| "eval_runtime": 51.2859, |
| "eval_samples_per_second": 194.985, |
| "eval_steps_per_second": 24.373, |
| "step": 36800 |
| }, |
| { |
| "epoch": 2.219610620746592, |
| "grad_norm": 0.9704160094261169, |
| "learning_rate": 5.632877625875292e-06, |
| "loss": 0.3229, |
| "step": 36900 |
| }, |
| { |
| "epoch": 2.219610620746592, |
| "eval_loss": 0.28357696533203125, |
| "eval_runtime": 51.0184, |
| "eval_samples_per_second": 196.008, |
| "eval_steps_per_second": 24.501, |
| "step": 36900 |
| }, |
| { |
| "epoch": 2.225625825680865, |
| "grad_norm": 0.9318411350250244, |
| "learning_rate": 5.631877292430811e-06, |
| "loss": 0.3244, |
| "step": 37000 |
| }, |
| { |
| "epoch": 2.225625825680865, |
| "eval_loss": 0.2926484942436218, |
| "eval_runtime": 51.0515, |
| "eval_samples_per_second": 195.88, |
| "eval_steps_per_second": 24.485, |
| "step": 37000 |
| }, |
| { |
| "epoch": 2.2316410306151373, |
| "grad_norm": 0.9745403528213501, |
| "learning_rate": 5.6308769589863294e-06, |
| "loss": 0.3238, |
| "step": 37100 |
| }, |
| { |
| "epoch": 2.2316410306151373, |
| "eval_loss": 0.29221734404563904, |
| "eval_runtime": 51.0519, |
| "eval_samples_per_second": 195.879, |
| "eval_steps_per_second": 24.485, |
| "step": 37100 |
| }, |
| { |
| "epoch": 2.23765623554941, |
| "grad_norm": 1.0162553787231445, |
| "learning_rate": 5.629876625541847e-06, |
| "loss": 0.3209, |
| "step": 37200 |
| }, |
| { |
| "epoch": 2.23765623554941, |
| "eval_loss": 0.2900753319263458, |
| "eval_runtime": 51.0188, |
| "eval_samples_per_second": 196.006, |
| "eval_steps_per_second": 24.501, |
| "step": 37200 |
| }, |
| { |
| "epoch": 2.243671440483683, |
| "grad_norm": 0.9270024299621582, |
| "learning_rate": 5.628876292097366e-06, |
| "loss": 0.3218, |
| "step": 37300 |
| }, |
| { |
| "epoch": 2.243671440483683, |
| "eval_loss": 0.29185083508491516, |
| "eval_runtime": 49.1324, |
| "eval_samples_per_second": 203.532, |
| "eval_steps_per_second": 25.441, |
| "step": 37300 |
| }, |
| { |
| "epoch": 2.249686645417955, |
| "grad_norm": 1.0156973600387573, |
| "learning_rate": 5.627875958652885e-06, |
| "loss": 0.3221, |
| "step": 37400 |
| }, |
| { |
| "epoch": 2.249686645417955, |
| "eval_loss": 0.2883216440677643, |
| "eval_runtime": 51.0198, |
| "eval_samples_per_second": 196.002, |
| "eval_steps_per_second": 24.5, |
| "step": 37400 |
| }, |
| { |
| "epoch": 2.255701850352228, |
| "grad_norm": 0.884667694568634, |
| "learning_rate": 5.6268756252084026e-06, |
| "loss": 0.3231, |
| "step": 37500 |
| }, |
| { |
| "epoch": 2.255701850352228, |
| "eval_loss": 0.2843243181705475, |
| "eval_runtime": 51.199, |
| "eval_samples_per_second": 195.316, |
| "eval_steps_per_second": 24.415, |
| "step": 37500 |
| }, |
| { |
| "epoch": 2.2617170552865007, |
| "grad_norm": 1.0025333166122437, |
| "learning_rate": 5.625875291763921e-06, |
| "loss": 0.32, |
| "step": 37600 |
| }, |
| { |
| "epoch": 2.2617170552865007, |
| "eval_loss": 0.28985723853111267, |
| "eval_runtime": 51.0474, |
| "eval_samples_per_second": 195.896, |
| "eval_steps_per_second": 24.487, |
| "step": 37600 |
| }, |
| { |
| "epoch": 2.267732260220773, |
| "grad_norm": 0.9673831462860107, |
| "learning_rate": 5.62487495831944e-06, |
| "loss": 0.322, |
| "step": 37700 |
| }, |
| { |
| "epoch": 2.267732260220773, |
| "eval_loss": 0.2844723165035248, |
| "eval_runtime": 51.066, |
| "eval_samples_per_second": 195.825, |
| "eval_steps_per_second": 24.478, |
| "step": 37700 |
| }, |
| { |
| "epoch": 2.2737474651550458, |
| "grad_norm": 0.9513309597969055, |
| "learning_rate": 5.623874624874959e-06, |
| "loss": 0.3202, |
| "step": 37800 |
| }, |
| { |
| "epoch": 2.2737474651550458, |
| "eval_loss": 0.28764039278030396, |
| "eval_runtime": 51.061, |
| "eval_samples_per_second": 195.844, |
| "eval_steps_per_second": 24.481, |
| "step": 37800 |
| }, |
| { |
| "epoch": 2.279762670089318, |
| "grad_norm": 0.9131941795349121, |
| "learning_rate": 5.622874291430477e-06, |
| "loss": 0.3226, |
| "step": 37900 |
| }, |
| { |
| "epoch": 2.279762670089318, |
| "eval_loss": 0.28673484921455383, |
| "eval_runtime": 51.0581, |
| "eval_samples_per_second": 195.855, |
| "eval_steps_per_second": 24.482, |
| "step": 37900 |
| }, |
| { |
| "epoch": 2.285777875023591, |
| "grad_norm": 0.9458931684494019, |
| "learning_rate": 5.621873957985995e-06, |
| "loss": 0.3206, |
| "step": 38000 |
| }, |
| { |
| "epoch": 2.285777875023591, |
| "eval_loss": 0.2862774133682251, |
| "eval_runtime": 36.7081, |
| "eval_samples_per_second": 272.419, |
| "eval_steps_per_second": 34.052, |
| "step": 38000 |
| }, |
| { |
| "epoch": 2.2917930799578636, |
| "grad_norm": 0.997297465801239, |
| "learning_rate": 5.620873624541514e-06, |
| "loss": 0.3191, |
| "step": 38100 |
| }, |
| { |
| "epoch": 2.2917930799578636, |
| "eval_loss": 0.2823648750782013, |
| "eval_runtime": 51.0962, |
| "eval_samples_per_second": 195.709, |
| "eval_steps_per_second": 24.464, |
| "step": 38100 |
| }, |
| { |
| "epoch": 2.297808284892136, |
| "grad_norm": 0.9200996160507202, |
| "learning_rate": 5.619873291097033e-06, |
| "loss": 0.3187, |
| "step": 38200 |
| }, |
| { |
| "epoch": 2.297808284892136, |
| "eval_loss": 0.2872503995895386, |
| "eval_runtime": 51.0809, |
| "eval_samples_per_second": 195.768, |
| "eval_steps_per_second": 24.471, |
| "step": 38200 |
| }, |
| { |
| "epoch": 2.3038234898264087, |
| "grad_norm": 0.9441711902618408, |
| "learning_rate": 5.6188729576525505e-06, |
| "loss": 0.3209, |
| "step": 38300 |
| }, |
| { |
| "epoch": 2.3038234898264087, |
| "eval_loss": 0.28855210542678833, |
| "eval_runtime": 51.0269, |
| "eval_samples_per_second": 195.975, |
| "eval_steps_per_second": 24.497, |
| "step": 38300 |
| }, |
| { |
| "epoch": 2.3098386947606815, |
| "grad_norm": 1.0377998352050781, |
| "learning_rate": 5.617872624208069e-06, |
| "loss": 0.3189, |
| "step": 38400 |
| }, |
| { |
| "epoch": 2.3098386947606815, |
| "eval_loss": 0.2817797362804413, |
| "eval_runtime": 51.0556, |
| "eval_samples_per_second": 195.865, |
| "eval_steps_per_second": 24.483, |
| "step": 38400 |
| }, |
| { |
| "epoch": 2.315853899694954, |
| "grad_norm": 0.9088771939277649, |
| "learning_rate": 5.616872290763588e-06, |
| "loss": 0.3183, |
| "step": 38500 |
| }, |
| { |
| "epoch": 2.315853899694954, |
| "eval_loss": 0.28079554438591003, |
| "eval_runtime": 51.0907, |
| "eval_samples_per_second": 195.73, |
| "eval_steps_per_second": 24.466, |
| "step": 38500 |
| }, |
| { |
| "epoch": 2.3218691046292266, |
| "grad_norm": 0.8959800004959106, |
| "learning_rate": 5.615871957319107e-06, |
| "loss": 0.3174, |
| "step": 38600 |
| }, |
| { |
| "epoch": 2.3218691046292266, |
| "eval_loss": 0.28803524374961853, |
| "eval_runtime": 50.9133, |
| "eval_samples_per_second": 196.412, |
| "eval_steps_per_second": 24.552, |
| "step": 38600 |
| }, |
| { |
| "epoch": 2.3278843095634993, |
| "grad_norm": 0.9056723713874817, |
| "learning_rate": 5.614871623874625e-06, |
| "loss": 0.3167, |
| "step": 38700 |
| }, |
| { |
| "epoch": 2.3278843095634993, |
| "eval_loss": 0.2826622426509857, |
| "eval_runtime": 50.7905, |
| "eval_samples_per_second": 196.887, |
| "eval_steps_per_second": 24.611, |
| "step": 38700 |
| }, |
| { |
| "epoch": 2.3338995144977717, |
| "grad_norm": 0.9248780608177185, |
| "learning_rate": 5.613871290430143e-06, |
| "loss": 0.3176, |
| "step": 38800 |
| }, |
| { |
| "epoch": 2.3338995144977717, |
| "eval_loss": 0.2767186462879181, |
| "eval_runtime": 50.6115, |
| "eval_samples_per_second": 197.583, |
| "eval_steps_per_second": 24.698, |
| "step": 38800 |
| }, |
| { |
| "epoch": 2.3399147194320444, |
| "grad_norm": 0.9541249871253967, |
| "learning_rate": 5.612870956985662e-06, |
| "loss": 0.3187, |
| "step": 38900 |
| }, |
| { |
| "epoch": 2.3399147194320444, |
| "eval_loss": 0.28110334277153015, |
| "eval_runtime": 49.7615, |
| "eval_samples_per_second": 200.959, |
| "eval_steps_per_second": 25.12, |
| "step": 38900 |
| }, |
| { |
| "epoch": 2.3459299243663168, |
| "grad_norm": 0.9116654396057129, |
| "learning_rate": 5.611870623541181e-06, |
| "loss": 0.3147, |
| "step": 39000 |
| }, |
| { |
| "epoch": 2.3459299243663168, |
| "eval_loss": 0.2833644449710846, |
| "eval_runtime": 50.9711, |
| "eval_samples_per_second": 196.19, |
| "eval_steps_per_second": 24.524, |
| "step": 39000 |
| }, |
| { |
| "epoch": 2.3519451293005895, |
| "grad_norm": 0.9693782329559326, |
| "learning_rate": 5.6108702900966985e-06, |
| "loss": 0.3187, |
| "step": 39100 |
| }, |
| { |
| "epoch": 2.3519451293005895, |
| "eval_loss": 0.2744785249233246, |
| "eval_runtime": 51.0233, |
| "eval_samples_per_second": 195.989, |
| "eval_steps_per_second": 24.499, |
| "step": 39100 |
| }, |
| { |
| "epoch": 2.3579603342348623, |
| "grad_norm": 0.911391019821167, |
| "learning_rate": 5.609869956652217e-06, |
| "loss": 0.3144, |
| "step": 39200 |
| }, |
| { |
| "epoch": 2.3579603342348623, |
| "eval_loss": 0.27756959199905396, |
| "eval_runtime": 50.862, |
| "eval_samples_per_second": 196.61, |
| "eval_steps_per_second": 24.576, |
| "step": 39200 |
| }, |
| { |
| "epoch": 2.3639755391691346, |
| "grad_norm": 0.9383348822593689, |
| "learning_rate": 5.608869623207736e-06, |
| "loss": 0.3167, |
| "step": 39300 |
| }, |
| { |
| "epoch": 2.3639755391691346, |
| "eval_loss": 0.2751516103744507, |
| "eval_runtime": 51.0203, |
| "eval_samples_per_second": 196.001, |
| "eval_steps_per_second": 24.5, |
| "step": 39300 |
| }, |
| { |
| "epoch": 2.3699907441034074, |
| "grad_norm": 0.8825791478157043, |
| "learning_rate": 5.607869289763255e-06, |
| "loss": 0.3133, |
| "step": 39400 |
| }, |
| { |
| "epoch": 2.3699907441034074, |
| "eval_loss": 0.27583110332489014, |
| "eval_runtime": 51.0047, |
| "eval_samples_per_second": 196.06, |
| "eval_steps_per_second": 24.508, |
| "step": 39400 |
| }, |
| { |
| "epoch": 2.37600594903768, |
| "grad_norm": 0.9765325784683228, |
| "learning_rate": 5.606868956318773e-06, |
| "loss": 0.314, |
| "step": 39500 |
| }, |
| { |
| "epoch": 2.37600594903768, |
| "eval_loss": 0.2750406563282013, |
| "eval_runtime": 51.0333, |
| "eval_samples_per_second": 195.95, |
| "eval_steps_per_second": 24.494, |
| "step": 39500 |
| }, |
| { |
| "epoch": 2.3820211539719525, |
| "grad_norm": 0.968429684638977, |
| "learning_rate": 5.605868622874291e-06, |
| "loss": 0.3162, |
| "step": 39600 |
| }, |
| { |
| "epoch": 2.3820211539719525, |
| "eval_loss": 0.28406116366386414, |
| "eval_runtime": 50.9992, |
| "eval_samples_per_second": 196.081, |
| "eval_steps_per_second": 24.51, |
| "step": 39600 |
| }, |
| { |
| "epoch": 2.3880363589062252, |
| "grad_norm": 0.9351980686187744, |
| "learning_rate": 5.60486828942981e-06, |
| "loss": 0.3087, |
| "step": 39700 |
| }, |
| { |
| "epoch": 2.3880363589062252, |
| "eval_loss": 0.2797408103942871, |
| "eval_runtime": 51.0101, |
| "eval_samples_per_second": 196.04, |
| "eval_steps_per_second": 24.505, |
| "step": 39700 |
| }, |
| { |
| "epoch": 2.394051563840498, |
| "grad_norm": 0.9547052383422852, |
| "learning_rate": 5.603867955985329e-06, |
| "loss": 0.3139, |
| "step": 39800 |
| }, |
| { |
| "epoch": 2.394051563840498, |
| "eval_loss": 0.2779112458229065, |
| "eval_runtime": 50.9598, |
| "eval_samples_per_second": 196.233, |
| "eval_steps_per_second": 24.529, |
| "step": 39800 |
| }, |
| { |
| "epoch": 2.4000667687747703, |
| "grad_norm": 0.8971194624900818, |
| "learning_rate": 5.6028676225408465e-06, |
| "loss": 0.3113, |
| "step": 39900 |
| }, |
| { |
| "epoch": 2.4000667687747703, |
| "eval_loss": 0.28396087884902954, |
| "eval_runtime": 51.1122, |
| "eval_samples_per_second": 195.648, |
| "eval_steps_per_second": 24.456, |
| "step": 39900 |
| }, |
| { |
| "epoch": 2.406081973709043, |
| "grad_norm": 0.9058307409286499, |
| "learning_rate": 5.601867289096365e-06, |
| "loss": 0.314, |
| "step": 40000 |
| }, |
| { |
| "epoch": 2.406081973709043, |
| "eval_loss": 0.2806677222251892, |
| "eval_runtime": 50.9901, |
| "eval_samples_per_second": 196.117, |
| "eval_steps_per_second": 24.515, |
| "step": 40000 |
| }, |
| { |
| "epoch": 2.4120971786433154, |
| "grad_norm": 0.9002136588096619, |
| "learning_rate": 5.600866955651885e-06, |
| "loss": 0.3107, |
| "step": 40100 |
| }, |
| { |
| "epoch": 2.4120971786433154, |
| "eval_loss": 0.2816166579723358, |
| "eval_runtime": 50.992, |
| "eval_samples_per_second": 196.109, |
| "eval_steps_per_second": 24.514, |
| "step": 40100 |
| }, |
| { |
| "epoch": 2.418112383577588, |
| "grad_norm": 0.9614746570587158, |
| "learning_rate": 5.599866622207403e-06, |
| "loss": 0.3107, |
| "step": 40200 |
| }, |
| { |
| "epoch": 2.418112383577588, |
| "eval_loss": 0.2749168276786804, |
| "eval_runtime": 51.0276, |
| "eval_samples_per_second": 195.972, |
| "eval_steps_per_second": 24.497, |
| "step": 40200 |
| }, |
| { |
| "epoch": 2.424127588511861, |
| "grad_norm": 0.8742543458938599, |
| "learning_rate": 5.598866288762921e-06, |
| "loss": 0.3149, |
| "step": 40300 |
| }, |
| { |
| "epoch": 2.424127588511861, |
| "eval_loss": 0.2682496905326843, |
| "eval_runtime": 49.6212, |
| "eval_samples_per_second": 201.527, |
| "eval_steps_per_second": 25.191, |
| "step": 40300 |
| }, |
| { |
| "epoch": 2.4301427934461337, |
| "grad_norm": 0.9011858105659485, |
| "learning_rate": 5.59786595531844e-06, |
| "loss": 0.3094, |
| "step": 40400 |
| }, |
| { |
| "epoch": 2.4301427934461337, |
| "eval_loss": 0.277034729719162, |
| "eval_runtime": 51.0504, |
| "eval_samples_per_second": 195.885, |
| "eval_steps_per_second": 24.486, |
| "step": 40400 |
| }, |
| { |
| "epoch": 2.436157998380406, |
| "grad_norm": 0.9290640950202942, |
| "learning_rate": 5.596865621873958e-06, |
| "loss": 0.3114, |
| "step": 40500 |
| }, |
| { |
| "epoch": 2.436157998380406, |
| "eval_loss": 0.27406954765319824, |
| "eval_runtime": 51.0172, |
| "eval_samples_per_second": 196.012, |
| "eval_steps_per_second": 24.502, |
| "step": 40500 |
| }, |
| { |
| "epoch": 2.442173203314679, |
| "grad_norm": 0.89925616979599, |
| "learning_rate": 5.5958652884294766e-06, |
| "loss": 0.3096, |
| "step": 40600 |
| }, |
| { |
| "epoch": 2.442173203314679, |
| "eval_loss": 0.2777319848537445, |
| "eval_runtime": 51.1656, |
| "eval_samples_per_second": 195.444, |
| "eval_steps_per_second": 24.43, |
| "step": 40600 |
| }, |
| { |
| "epoch": 2.448188408248951, |
| "grad_norm": 0.8584897518157959, |
| "learning_rate": 5.594864954984994e-06, |
| "loss": 0.3123, |
| "step": 40700 |
| }, |
| { |
| "epoch": 2.448188408248951, |
| "eval_loss": 0.27250877022743225, |
| "eval_runtime": 51.0648, |
| "eval_samples_per_second": 195.829, |
| "eval_steps_per_second": 24.479, |
| "step": 40700 |
| }, |
| { |
| "epoch": 2.454203613183224, |
| "grad_norm": 0.9398366808891296, |
| "learning_rate": 5.593864621540514e-06, |
| "loss": 0.3108, |
| "step": 40800 |
| }, |
| { |
| "epoch": 2.454203613183224, |
| "eval_loss": 0.27442407608032227, |
| "eval_runtime": 51.06, |
| "eval_samples_per_second": 195.848, |
| "eval_steps_per_second": 24.481, |
| "step": 40800 |
| }, |
| { |
| "epoch": 2.4602188181174967, |
| "grad_norm": 0.8771011233329773, |
| "learning_rate": 5.592864288096033e-06, |
| "loss": 0.3107, |
| "step": 40900 |
| }, |
| { |
| "epoch": 2.4602188181174967, |
| "eval_loss": 0.27768152952194214, |
| "eval_runtime": 51.1346, |
| "eval_samples_per_second": 195.562, |
| "eval_steps_per_second": 24.445, |
| "step": 40900 |
| }, |
| { |
| "epoch": 2.466234023051769, |
| "grad_norm": 0.922232449054718, |
| "learning_rate": 5.5918639546515505e-06, |
| "loss": 0.3082, |
| "step": 41000 |
| }, |
| { |
| "epoch": 2.466234023051769, |
| "eval_loss": 0.27813389897346497, |
| "eval_runtime": 28.068, |
| "eval_samples_per_second": 356.278, |
| "eval_steps_per_second": 44.535, |
| "step": 41000 |
| }, |
| { |
| "epoch": 2.4722492279860417, |
| "grad_norm": 0.9415081143379211, |
| "learning_rate": 5.590863621207069e-06, |
| "loss": 0.3105, |
| "step": 41100 |
| }, |
| { |
| "epoch": 2.4722492279860417, |
| "eval_loss": 0.27401283383369446, |
| "eval_runtime": 50.7464, |
| "eval_samples_per_second": 197.058, |
| "eval_steps_per_second": 24.632, |
| "step": 41100 |
| }, |
| { |
| "epoch": 2.4782644329203145, |
| "grad_norm": 0.8894750475883484, |
| "learning_rate": 5.589863287762588e-06, |
| "loss": 0.31, |
| "step": 41200 |
| }, |
| { |
| "epoch": 2.4782644329203145, |
| "eval_loss": 0.2711414694786072, |
| "eval_runtime": 50.7544, |
| "eval_samples_per_second": 197.027, |
| "eval_steps_per_second": 24.628, |
| "step": 41200 |
| }, |
| { |
| "epoch": 2.484279637854587, |
| "grad_norm": 0.8910822868347168, |
| "learning_rate": 5.588862954318106e-06, |
| "loss": 0.3064, |
| "step": 41300 |
| }, |
| { |
| "epoch": 2.484279637854587, |
| "eval_loss": 0.2753881514072418, |
| "eval_runtime": 48.2521, |
| "eval_samples_per_second": 207.245, |
| "eval_steps_per_second": 25.906, |
| "step": 41300 |
| }, |
| { |
| "epoch": 2.4902948427888596, |
| "grad_norm": 0.890864908695221, |
| "learning_rate": 5.5878626208736245e-06, |
| "loss": 0.3042, |
| "step": 41400 |
| }, |
| { |
| "epoch": 2.4902948427888596, |
| "eval_loss": 0.27833056449890137, |
| "eval_runtime": 44.9081, |
| "eval_samples_per_second": 222.677, |
| "eval_steps_per_second": 27.835, |
| "step": 41400 |
| }, |
| { |
| "epoch": 2.4963100477231324, |
| "grad_norm": 0.8507567048072815, |
| "learning_rate": 5.586862287429143e-06, |
| "loss": 0.308, |
| "step": 41500 |
| }, |
| { |
| "epoch": 2.4963100477231324, |
| "eval_loss": 0.2749514579772949, |
| "eval_runtime": 45.6991, |
| "eval_samples_per_second": 218.823, |
| "eval_steps_per_second": 27.353, |
| "step": 41500 |
| }, |
| { |
| "epoch": 2.5023252526574047, |
| "grad_norm": 1.0246086120605469, |
| "learning_rate": 5.585861953984662e-06, |
| "loss": 0.308, |
| "step": 41600 |
| }, |
| { |
| "epoch": 2.5023252526574047, |
| "eval_loss": 0.2693102955818176, |
| "eval_runtime": 48.6013, |
| "eval_samples_per_second": 205.756, |
| "eval_steps_per_second": 25.719, |
| "step": 41600 |
| }, |
| { |
| "epoch": 2.5083404575916775, |
| "grad_norm": 1.015673279762268, |
| "learning_rate": 5.584861620540181e-06, |
| "loss": 0.3062, |
| "step": 41700 |
| }, |
| { |
| "epoch": 2.5083404575916775, |
| "eval_loss": 0.2740586996078491, |
| "eval_runtime": 49.0311, |
| "eval_samples_per_second": 203.952, |
| "eval_steps_per_second": 25.494, |
| "step": 41700 |
| }, |
| { |
| "epoch": 2.5143556625259498, |
| "grad_norm": 0.9325861930847168, |
| "learning_rate": 5.5838612870956985e-06, |
| "loss": 0.3085, |
| "step": 41800 |
| }, |
| { |
| "epoch": 2.5143556625259498, |
| "eval_loss": 0.2755836844444275, |
| "eval_runtime": 49.0354, |
| "eval_samples_per_second": 203.934, |
| "eval_steps_per_second": 25.492, |
| "step": 41800 |
| }, |
| { |
| "epoch": 2.5203708674602225, |
| "grad_norm": 0.8402740359306335, |
| "learning_rate": 5.582860953651217e-06, |
| "loss": 0.3074, |
| "step": 41900 |
| }, |
| { |
| "epoch": 2.5203708674602225, |
| "eval_loss": 0.2750794291496277, |
| "eval_runtime": 49.6049, |
| "eval_samples_per_second": 201.593, |
| "eval_steps_per_second": 25.199, |
| "step": 41900 |
| }, |
| { |
| "epoch": 2.5263860723944953, |
| "grad_norm": 0.8873264193534851, |
| "learning_rate": 5.581860620206736e-06, |
| "loss": 0.3073, |
| "step": 42000 |
| }, |
| { |
| "epoch": 2.5263860723944953, |
| "eval_loss": 0.2801840901374817, |
| "eval_runtime": 49.3914, |
| "eval_samples_per_second": 202.464, |
| "eval_steps_per_second": 25.308, |
| "step": 42000 |
| }, |
| { |
| "epoch": 2.5324012773287676, |
| "grad_norm": 0.9626051187515259, |
| "learning_rate": 5.580860286762254e-06, |
| "loss": 0.3068, |
| "step": 42100 |
| }, |
| { |
| "epoch": 2.5324012773287676, |
| "eval_loss": 0.2711939811706543, |
| "eval_runtime": 49.617, |
| "eval_samples_per_second": 201.544, |
| "eval_steps_per_second": 25.193, |
| "step": 42100 |
| }, |
| { |
| "epoch": 2.5384164822630404, |
| "grad_norm": 0.9168198108673096, |
| "learning_rate": 5.5798599533177725e-06, |
| "loss": 0.3059, |
| "step": 42200 |
| }, |
| { |
| "epoch": 2.5384164822630404, |
| "eval_loss": 0.270614355802536, |
| "eval_runtime": 50.1412, |
| "eval_samples_per_second": 199.437, |
| "eval_steps_per_second": 24.93, |
| "step": 42200 |
| }, |
| { |
| "epoch": 2.544431687197313, |
| "grad_norm": 0.9542158842086792, |
| "learning_rate": 5.578859619873291e-06, |
| "loss": 0.3061, |
| "step": 42300 |
| }, |
| { |
| "epoch": 2.544431687197313, |
| "eval_loss": 0.2705308198928833, |
| "eval_runtime": 50.4655, |
| "eval_samples_per_second": 198.155, |
| "eval_steps_per_second": 24.769, |
| "step": 42300 |
| }, |
| { |
| "epoch": 2.5504468921315855, |
| "grad_norm": 0.8468143939971924, |
| "learning_rate": 5.57785928642881e-06, |
| "loss": 0.3048, |
| "step": 42400 |
| }, |
| { |
| "epoch": 2.5504468921315855, |
| "eval_loss": 0.27329984307289124, |
| "eval_runtime": 50.4318, |
| "eval_samples_per_second": 198.288, |
| "eval_steps_per_second": 24.786, |
| "step": 42400 |
| }, |
| { |
| "epoch": 2.5564620970658583, |
| "grad_norm": 0.9493191838264465, |
| "learning_rate": 5.576858952984329e-06, |
| "loss": 0.3019, |
| "step": 42500 |
| }, |
| { |
| "epoch": 2.5564620970658583, |
| "eval_loss": 0.2731817364692688, |
| "eval_runtime": 50.5666, |
| "eval_samples_per_second": 197.759, |
| "eval_steps_per_second": 24.72, |
| "step": 42500 |
| }, |
| { |
| "epoch": 2.562477302000131, |
| "grad_norm": 0.9617642760276794, |
| "learning_rate": 5.5758586195398465e-06, |
| "loss": 0.3012, |
| "step": 42600 |
| }, |
| { |
| "epoch": 2.562477302000131, |
| "eval_loss": 0.26970621943473816, |
| "eval_runtime": 51.0766, |
| "eval_samples_per_second": 195.784, |
| "eval_steps_per_second": 24.473, |
| "step": 42600 |
| }, |
| { |
| "epoch": 2.5684925069344033, |
| "grad_norm": 0.9389893412590027, |
| "learning_rate": 5.574858286095365e-06, |
| "loss": 0.3027, |
| "step": 42700 |
| }, |
| { |
| "epoch": 2.5684925069344033, |
| "eval_loss": 0.27145934104919434, |
| "eval_runtime": 51.074, |
| "eval_samples_per_second": 195.794, |
| "eval_steps_per_second": 24.474, |
| "step": 42700 |
| }, |
| { |
| "epoch": 2.574507711868676, |
| "grad_norm": 0.9073367714881897, |
| "learning_rate": 5.573857952650884e-06, |
| "loss": 0.3021, |
| "step": 42800 |
| }, |
| { |
| "epoch": 2.574507711868676, |
| "eval_loss": 0.2711017429828644, |
| "eval_runtime": 51.072, |
| "eval_samples_per_second": 195.802, |
| "eval_steps_per_second": 24.475, |
| "step": 42800 |
| }, |
| { |
| "epoch": 2.5805229168029484, |
| "grad_norm": 0.8948126435279846, |
| "learning_rate": 5.572857619206402e-06, |
| "loss": 0.302, |
| "step": 42900 |
| }, |
| { |
| "epoch": 2.5805229168029484, |
| "eval_loss": 0.2703753113746643, |
| "eval_runtime": 51.0323, |
| "eval_samples_per_second": 195.954, |
| "eval_steps_per_second": 24.494, |
| "step": 42900 |
| }, |
| { |
| "epoch": 2.586538121737221, |
| "grad_norm": 0.943368136882782, |
| "learning_rate": 5.5718572857619205e-06, |
| "loss": 0.3007, |
| "step": 43000 |
| }, |
| { |
| "epoch": 2.586538121737221, |
| "eval_loss": 0.2676005959510803, |
| "eval_runtime": 51.147, |
| "eval_samples_per_second": 195.515, |
| "eval_steps_per_second": 24.439, |
| "step": 43000 |
| }, |
| { |
| "epoch": 2.592553326671494, |
| "grad_norm": 0.9073809385299683, |
| "learning_rate": 5.570856952317439e-06, |
| "loss": 0.3004, |
| "step": 43100 |
| }, |
| { |
| "epoch": 2.592553326671494, |
| "eval_loss": 0.26843926310539246, |
| "eval_runtime": 51.0148, |
| "eval_samples_per_second": 196.021, |
| "eval_steps_per_second": 24.503, |
| "step": 43100 |
| }, |
| { |
| "epoch": 2.5985685316057667, |
| "grad_norm": 0.9534226655960083, |
| "learning_rate": 5.569856618872958e-06, |
| "loss": 0.3039, |
| "step": 43200 |
| }, |
| { |
| "epoch": 2.5985685316057667, |
| "eval_loss": 0.2675269842147827, |
| "eval_runtime": 51.1418, |
| "eval_samples_per_second": 195.535, |
| "eval_steps_per_second": 24.442, |
| "step": 43200 |
| }, |
| { |
| "epoch": 2.604583736540039, |
| "grad_norm": 0.8546542525291443, |
| "learning_rate": 5.5688562854284766e-06, |
| "loss": 0.3008, |
| "step": 43300 |
| }, |
| { |
| "epoch": 2.604583736540039, |
| "eval_loss": 0.2680804133415222, |
| "eval_runtime": 51.0519, |
| "eval_samples_per_second": 195.879, |
| "eval_steps_per_second": 24.485, |
| "step": 43300 |
| }, |
| { |
| "epoch": 2.610598941474312, |
| "grad_norm": 0.9167499542236328, |
| "learning_rate": 5.567855951983995e-06, |
| "loss": 0.3001, |
| "step": 43400 |
| }, |
| { |
| "epoch": 2.610598941474312, |
| "eval_loss": 0.26866093277931213, |
| "eval_runtime": 51.2331, |
| "eval_samples_per_second": 195.186, |
| "eval_steps_per_second": 24.398, |
| "step": 43400 |
| }, |
| { |
| "epoch": 2.616614146408584, |
| "grad_norm": 0.9243641495704651, |
| "learning_rate": 5.566855618539513e-06, |
| "loss": 0.3007, |
| "step": 43500 |
| }, |
| { |
| "epoch": 2.616614146408584, |
| "eval_loss": 0.27828356623649597, |
| "eval_runtime": 35.4476, |
| "eval_samples_per_second": 282.107, |
| "eval_steps_per_second": 35.263, |
| "step": 43500 |
| }, |
| { |
| "epoch": 2.622629351342857, |
| "grad_norm": 0.9069240689277649, |
| "learning_rate": 5.565855285095032e-06, |
| "loss": 0.3039, |
| "step": 43600 |
| }, |
| { |
| "epoch": 2.622629351342857, |
| "eval_loss": 0.27373048663139343, |
| "eval_runtime": 51.0712, |
| "eval_samples_per_second": 195.805, |
| "eval_steps_per_second": 24.476, |
| "step": 43600 |
| }, |
| { |
| "epoch": 2.6286445562771297, |
| "grad_norm": 0.8967992663383484, |
| "learning_rate": 5.56485495165055e-06, |
| "loss": 0.3026, |
| "step": 43700 |
| }, |
| { |
| "epoch": 2.6286445562771297, |
| "eval_loss": 0.2672281861305237, |
| "eval_runtime": 51.0214, |
| "eval_samples_per_second": 195.996, |
| "eval_steps_per_second": 24.5, |
| "step": 43700 |
| }, |
| { |
| "epoch": 2.634659761211402, |
| "grad_norm": 0.8463547229766846, |
| "learning_rate": 5.563854618206068e-06, |
| "loss": 0.3018, |
| "step": 43800 |
| }, |
| { |
| "epoch": 2.634659761211402, |
| "eval_loss": 0.2690221071243286, |
| "eval_runtime": 51.0223, |
| "eval_samples_per_second": 195.993, |
| "eval_steps_per_second": 24.499, |
| "step": 43800 |
| }, |
| { |
| "epoch": 2.6406749661456748, |
| "grad_norm": 0.8656585812568665, |
| "learning_rate": 5.562854284761587e-06, |
| "loss": 0.3019, |
| "step": 43900 |
| }, |
| { |
| "epoch": 2.6406749661456748, |
| "eval_loss": 0.2694147229194641, |
| "eval_runtime": 51.2059, |
| "eval_samples_per_second": 195.29, |
| "eval_steps_per_second": 24.411, |
| "step": 43900 |
| }, |
| { |
| "epoch": 2.646690171079947, |
| "grad_norm": 0.8388367891311646, |
| "learning_rate": 5.561853951317106e-06, |
| "loss": 0.299, |
| "step": 44000 |
| }, |
| { |
| "epoch": 2.646690171079947, |
| "eval_loss": 0.27004268765449524, |
| "eval_runtime": 51.0385, |
| "eval_samples_per_second": 195.93, |
| "eval_steps_per_second": 24.491, |
| "step": 44000 |
| }, |
| { |
| "epoch": 2.65270537601422, |
| "grad_norm": 0.8733914494514465, |
| "learning_rate": 5.5608536178726245e-06, |
| "loss": 0.2996, |
| "step": 44100 |
| }, |
| { |
| "epoch": 2.65270537601422, |
| "eval_loss": 0.2620984613895416, |
| "eval_runtime": 51.1206, |
| "eval_samples_per_second": 195.616, |
| "eval_steps_per_second": 24.452, |
| "step": 44100 |
| }, |
| { |
| "epoch": 2.6587205809484926, |
| "grad_norm": 0.825485348701477, |
| "learning_rate": 5.559853284428143e-06, |
| "loss": 0.2996, |
| "step": 44200 |
| }, |
| { |
| "epoch": 2.6587205809484926, |
| "eval_loss": 0.26619336009025574, |
| "eval_runtime": 50.9856, |
| "eval_samples_per_second": 196.134, |
| "eval_steps_per_second": 24.517, |
| "step": 44200 |
| }, |
| { |
| "epoch": 2.6647357858827654, |
| "grad_norm": 0.9234973192214966, |
| "learning_rate": 5.558852950983661e-06, |
| "loss": 0.2994, |
| "step": 44300 |
| }, |
| { |
| "epoch": 2.6647357858827654, |
| "eval_loss": 0.269397497177124, |
| "eval_runtime": 51.1229, |
| "eval_samples_per_second": 195.607, |
| "eval_steps_per_second": 24.451, |
| "step": 44300 |
| }, |
| { |
| "epoch": 2.6707509908170377, |
| "grad_norm": 0.9815935492515564, |
| "learning_rate": 5.55785261753918e-06, |
| "loss": 0.2964, |
| "step": 44400 |
| }, |
| { |
| "epoch": 2.6707509908170377, |
| "eval_loss": 0.26540160179138184, |
| "eval_runtime": 51.0268, |
| "eval_samples_per_second": 195.975, |
| "eval_steps_per_second": 24.497, |
| "step": 44400 |
| }, |
| { |
| "epoch": 2.6767661957513105, |
| "grad_norm": 0.8895259499549866, |
| "learning_rate": 5.5568522840946985e-06, |
| "loss": 0.2943, |
| "step": 44500 |
| }, |
| { |
| "epoch": 2.6767661957513105, |
| "eval_loss": 0.2682526707649231, |
| "eval_runtime": 51.188, |
| "eval_samples_per_second": 195.358, |
| "eval_steps_per_second": 24.42, |
| "step": 44500 |
| }, |
| { |
| "epoch": 2.682781400685583, |
| "grad_norm": 0.8415577411651611, |
| "learning_rate": 5.555851950650216e-06, |
| "loss": 0.2972, |
| "step": 44600 |
| }, |
| { |
| "epoch": 2.682781400685583, |
| "eval_loss": 0.2677549421787262, |
| "eval_runtime": 51.1092, |
| "eval_samples_per_second": 195.66, |
| "eval_steps_per_second": 24.457, |
| "step": 44600 |
| }, |
| { |
| "epoch": 2.6887966056198556, |
| "grad_norm": 0.8922407031059265, |
| "learning_rate": 5.554851617205736e-06, |
| "loss": 0.2969, |
| "step": 44700 |
| }, |
| { |
| "epoch": 2.6887966056198556, |
| "eval_loss": 0.2671573758125305, |
| "eval_runtime": 51.0789, |
| "eval_samples_per_second": 195.776, |
| "eval_steps_per_second": 24.472, |
| "step": 44700 |
| }, |
| { |
| "epoch": 2.6948118105541283, |
| "grad_norm": 1.0156275033950806, |
| "learning_rate": 5.553851283761254e-06, |
| "loss": 0.2972, |
| "step": 44800 |
| }, |
| { |
| "epoch": 2.6948118105541283, |
| "eval_loss": 0.26524412631988525, |
| "eval_runtime": 51.0819, |
| "eval_samples_per_second": 195.764, |
| "eval_steps_per_second": 24.471, |
| "step": 44800 |
| }, |
| { |
| "epoch": 2.7008270154884007, |
| "grad_norm": 0.9283206462860107, |
| "learning_rate": 5.5528509503167725e-06, |
| "loss": 0.2953, |
| "step": 44900 |
| }, |
| { |
| "epoch": 2.7008270154884007, |
| "eval_loss": 0.26051226258277893, |
| "eval_runtime": 51.0731, |
| "eval_samples_per_second": 195.798, |
| "eval_steps_per_second": 24.475, |
| "step": 44900 |
| }, |
| { |
| "epoch": 2.7068422204226734, |
| "grad_norm": 0.9081267714500427, |
| "learning_rate": 5.551850616872291e-06, |
| "loss": 0.2956, |
| "step": 45000 |
| }, |
| { |
| "epoch": 2.7068422204226734, |
| "eval_loss": 0.26829174160957336, |
| "eval_runtime": 51.0764, |
| "eval_samples_per_second": 195.785, |
| "eval_steps_per_second": 24.473, |
| "step": 45000 |
| }, |
| { |
| "epoch": 2.712857425356946, |
| "grad_norm": 0.9797186255455017, |
| "learning_rate": 5.550850283427809e-06, |
| "loss": 0.2951, |
| "step": 45100 |
| }, |
| { |
| "epoch": 2.712857425356946, |
| "eval_loss": 0.2626285254955292, |
| "eval_runtime": 51.0441, |
| "eval_samples_per_second": 195.909, |
| "eval_steps_per_second": 24.489, |
| "step": 45100 |
| }, |
| { |
| "epoch": 2.7188726302912185, |
| "grad_norm": 0.972873866558075, |
| "learning_rate": 5.549849949983328e-06, |
| "loss": 0.2938, |
| "step": 45200 |
| }, |
| { |
| "epoch": 2.7188726302912185, |
| "eval_loss": 0.2651112675666809, |
| "eval_runtime": 51.1856, |
| "eval_samples_per_second": 195.368, |
| "eval_steps_per_second": 24.421, |
| "step": 45200 |
| }, |
| { |
| "epoch": 2.7248878352254913, |
| "grad_norm": 0.8637024164199829, |
| "learning_rate": 5.5488496165388465e-06, |
| "loss": 0.2951, |
| "step": 45300 |
| }, |
| { |
| "epoch": 2.7248878352254913, |
| "eval_loss": 0.26248618960380554, |
| "eval_runtime": 51.1456, |
| "eval_samples_per_second": 195.52, |
| "eval_steps_per_second": 24.44, |
| "step": 45300 |
| }, |
| { |
| "epoch": 2.730903040159764, |
| "grad_norm": 0.9163945317268372, |
| "learning_rate": 5.547849283094365e-06, |
| "loss": 0.2948, |
| "step": 45400 |
| }, |
| { |
| "epoch": 2.730903040159764, |
| "eval_loss": 0.2693786025047302, |
| "eval_runtime": 51.0867, |
| "eval_samples_per_second": 195.746, |
| "eval_steps_per_second": 24.468, |
| "step": 45400 |
| }, |
| { |
| "epoch": 2.7369182450940364, |
| "grad_norm": 1.0530128479003906, |
| "learning_rate": 5.546848949649884e-06, |
| "loss": 0.2944, |
| "step": 45500 |
| }, |
| { |
| "epoch": 2.7369182450940364, |
| "eval_loss": 0.2621295750141144, |
| "eval_runtime": 51.1036, |
| "eval_samples_per_second": 195.681, |
| "eval_steps_per_second": 24.46, |
| "step": 45500 |
| }, |
| { |
| "epoch": 2.742933450028309, |
| "grad_norm": 0.9258381128311157, |
| "learning_rate": 5.545848616205402e-06, |
| "loss": 0.2943, |
| "step": 45600 |
| }, |
| { |
| "epoch": 2.742933450028309, |
| "eval_loss": 0.25974345207214355, |
| "eval_runtime": 51.1397, |
| "eval_samples_per_second": 195.543, |
| "eval_steps_per_second": 24.443, |
| "step": 45600 |
| }, |
| { |
| "epoch": 2.7489486549625815, |
| "grad_norm": 0.8768019676208496, |
| "learning_rate": 5.5448482827609205e-06, |
| "loss": 0.2934, |
| "step": 45700 |
| }, |
| { |
| "epoch": 2.7489486549625815, |
| "eval_loss": 0.26323673129081726, |
| "eval_runtime": 51.1134, |
| "eval_samples_per_second": 195.643, |
| "eval_steps_per_second": 24.455, |
| "step": 45700 |
| }, |
| { |
| "epoch": 2.754963859896854, |
| "grad_norm": 0.8610267639160156, |
| "learning_rate": 5.543847949316439e-06, |
| "loss": 0.2934, |
| "step": 45800 |
| }, |
| { |
| "epoch": 2.754963859896854, |
| "eval_loss": 0.2621345818042755, |
| "eval_runtime": 51.0875, |
| "eval_samples_per_second": 195.743, |
| "eval_steps_per_second": 24.468, |
| "step": 45800 |
| }, |
| { |
| "epoch": 2.760979064831127, |
| "grad_norm": 0.8272863626480103, |
| "learning_rate": 5.542847615871957e-06, |
| "loss": 0.2952, |
| "step": 45900 |
| }, |
| { |
| "epoch": 2.760979064831127, |
| "eval_loss": 0.2651170790195465, |
| "eval_runtime": 51.1189, |
| "eval_samples_per_second": 195.622, |
| "eval_steps_per_second": 24.453, |
| "step": 45900 |
| }, |
| { |
| "epoch": 2.7669942697653997, |
| "grad_norm": 0.8691322207450867, |
| "learning_rate": 5.541847282427476e-06, |
| "loss": 0.2903, |
| "step": 46000 |
| }, |
| { |
| "epoch": 2.7669942697653997, |
| "eval_loss": 0.2674708664417267, |
| "eval_runtime": 51.0977, |
| "eval_samples_per_second": 195.704, |
| "eval_steps_per_second": 24.463, |
| "step": 46000 |
| }, |
| { |
| "epoch": 2.773009474699672, |
| "grad_norm": 0.9887429475784302, |
| "learning_rate": 5.5408469489829944e-06, |
| "loss": 0.2931, |
| "step": 46100 |
| }, |
| { |
| "epoch": 2.773009474699672, |
| "eval_loss": 0.2632472515106201, |
| "eval_runtime": 51.1106, |
| "eval_samples_per_second": 195.654, |
| "eval_steps_per_second": 24.457, |
| "step": 46100 |
| }, |
| { |
| "epoch": 2.779024679633945, |
| "grad_norm": 0.9419971704483032, |
| "learning_rate": 5.539846615538513e-06, |
| "loss": 0.2933, |
| "step": 46200 |
| }, |
| { |
| "epoch": 2.779024679633945, |
| "eval_loss": 0.2613042891025543, |
| "eval_runtime": 51.0338, |
| "eval_samples_per_second": 195.949, |
| "eval_steps_per_second": 24.494, |
| "step": 46200 |
| }, |
| { |
| "epoch": 2.785039884568217, |
| "grad_norm": 0.9267482161521912, |
| "learning_rate": 5.538846282094032e-06, |
| "loss": 0.2915, |
| "step": 46300 |
| }, |
| { |
| "epoch": 2.785039884568217, |
| "eval_loss": 0.2661626935005188, |
| "eval_runtime": 51.084, |
| "eval_samples_per_second": 195.756, |
| "eval_steps_per_second": 24.469, |
| "step": 46300 |
| }, |
| { |
| "epoch": 2.79105508950249, |
| "grad_norm": 0.9020786285400391, |
| "learning_rate": 5.5378459486495506e-06, |
| "loss": 0.2933, |
| "step": 46400 |
| }, |
| { |
| "epoch": 2.79105508950249, |
| "eval_loss": 0.2588748335838318, |
| "eval_runtime": 51.1198, |
| "eval_samples_per_second": 195.619, |
| "eval_steps_per_second": 24.452, |
| "step": 46400 |
| }, |
| { |
| "epoch": 2.7970702944367627, |
| "grad_norm": 0.893649160861969, |
| "learning_rate": 5.5368456152050684e-06, |
| "loss": 0.2914, |
| "step": 46500 |
| }, |
| { |
| "epoch": 2.7970702944367627, |
| "eval_loss": 0.2560584545135498, |
| "eval_runtime": 51.1578, |
| "eval_samples_per_second": 195.474, |
| "eval_steps_per_second": 24.434, |
| "step": 46500 |
| }, |
| { |
| "epoch": 2.803085499371035, |
| "grad_norm": 0.8569892644882202, |
| "learning_rate": 5.535845281760587e-06, |
| "loss": 0.2921, |
| "step": 46600 |
| }, |
| { |
| "epoch": 2.803085499371035, |
| "eval_loss": 0.26415926218032837, |
| "eval_runtime": 48.8588, |
| "eval_samples_per_second": 204.672, |
| "eval_steps_per_second": 25.584, |
| "step": 46600 |
| }, |
| { |
| "epoch": 2.809100704305308, |
| "grad_norm": 0.967966616153717, |
| "learning_rate": 5.534844948316105e-06, |
| "loss": 0.2932, |
| "step": 46700 |
| }, |
| { |
| "epoch": 2.809100704305308, |
| "eval_loss": 0.262004554271698, |
| "eval_runtime": 51.1167, |
| "eval_samples_per_second": 195.631, |
| "eval_steps_per_second": 24.454, |
| "step": 46700 |
| }, |
| { |
| "epoch": 2.81511590923958, |
| "grad_norm": 0.8977293968200684, |
| "learning_rate": 5.533844614871624e-06, |
| "loss": 0.291, |
| "step": 46800 |
| }, |
| { |
| "epoch": 2.81511590923958, |
| "eval_loss": 0.26304325461387634, |
| "eval_runtime": 51.1071, |
| "eval_samples_per_second": 195.668, |
| "eval_steps_per_second": 24.458, |
| "step": 46800 |
| }, |
| { |
| "epoch": 2.821131114173853, |
| "grad_norm": 0.8833451271057129, |
| "learning_rate": 5.532844281427142e-06, |
| "loss": 0.2879, |
| "step": 46900 |
| }, |
| { |
| "epoch": 2.821131114173853, |
| "eval_loss": 0.2652186155319214, |
| "eval_runtime": 51.1212, |
| "eval_samples_per_second": 195.614, |
| "eval_steps_per_second": 24.452, |
| "step": 46900 |
| }, |
| { |
| "epoch": 2.8271463191081256, |
| "grad_norm": 0.916098415851593, |
| "learning_rate": 5.531843947982661e-06, |
| "loss": 0.29, |
| "step": 47000 |
| }, |
| { |
| "epoch": 2.8271463191081256, |
| "eval_loss": 0.2618425190448761, |
| "eval_runtime": 51.1419, |
| "eval_samples_per_second": 195.534, |
| "eval_steps_per_second": 24.442, |
| "step": 47000 |
| }, |
| { |
| "epoch": 2.8331615240423984, |
| "grad_norm": 0.8808870315551758, |
| "learning_rate": 5.53084361453818e-06, |
| "loss": 0.2912, |
| "step": 47100 |
| }, |
| { |
| "epoch": 2.8331615240423984, |
| "eval_loss": 0.26288196444511414, |
| "eval_runtime": 51.1216, |
| "eval_samples_per_second": 195.612, |
| "eval_steps_per_second": 24.452, |
| "step": 47100 |
| }, |
| { |
| "epoch": 2.8391767289766707, |
| "grad_norm": 0.8972067832946777, |
| "learning_rate": 5.5298432810936985e-06, |
| "loss": 0.2914, |
| "step": 47200 |
| }, |
| { |
| "epoch": 2.8391767289766707, |
| "eval_loss": 0.2557620704174042, |
| "eval_runtime": 51.1227, |
| "eval_samples_per_second": 195.608, |
| "eval_steps_per_second": 24.451, |
| "step": 47200 |
| }, |
| { |
| "epoch": 2.8451919339109435, |
| "grad_norm": 0.8946945667266846, |
| "learning_rate": 5.528842947649216e-06, |
| "loss": 0.2894, |
| "step": 47300 |
| }, |
| { |
| "epoch": 2.8451919339109435, |
| "eval_loss": 0.26096677780151367, |
| "eval_runtime": 48.2836, |
| "eval_samples_per_second": 207.109, |
| "eval_steps_per_second": 25.889, |
| "step": 47300 |
| }, |
| { |
| "epoch": 2.851207138845216, |
| "grad_norm": 0.9023754000663757, |
| "learning_rate": 5.527842614204735e-06, |
| "loss": 0.2875, |
| "step": 47400 |
| }, |
| { |
| "epoch": 2.851207138845216, |
| "eval_loss": 0.25718143582344055, |
| "eval_runtime": 51.1174, |
| "eval_samples_per_second": 195.628, |
| "eval_steps_per_second": 24.453, |
| "step": 47400 |
| }, |
| { |
| "epoch": 2.8572223437794886, |
| "grad_norm": 0.8229103088378906, |
| "learning_rate": 5.526842280760254e-06, |
| "loss": 0.2875, |
| "step": 47500 |
| }, |
| { |
| "epoch": 2.8572223437794886, |
| "eval_loss": 0.26064789295196533, |
| "eval_runtime": 51.0796, |
| "eval_samples_per_second": 195.773, |
| "eval_steps_per_second": 24.472, |
| "step": 47500 |
| }, |
| { |
| "epoch": 2.8632375487137613, |
| "grad_norm": 0.7903328537940979, |
| "learning_rate": 5.525841947315772e-06, |
| "loss": 0.2888, |
| "step": 47600 |
| }, |
| { |
| "epoch": 2.8632375487137613, |
| "eval_loss": 0.25777605175971985, |
| "eval_runtime": 51.0732, |
| "eval_samples_per_second": 195.797, |
| "eval_steps_per_second": 24.475, |
| "step": 47600 |
| }, |
| { |
| "epoch": 2.8692527536480337, |
| "grad_norm": 0.9628756046295166, |
| "learning_rate": 5.52484161387129e-06, |
| "loss": 0.2909, |
| "step": 47700 |
| }, |
| { |
| "epoch": 2.8692527536480337, |
| "eval_loss": 0.2552904188632965, |
| "eval_runtime": 51.1083, |
| "eval_samples_per_second": 195.663, |
| "eval_steps_per_second": 24.458, |
| "step": 47700 |
| }, |
| { |
| "epoch": 2.8752679585823064, |
| "grad_norm": 0.8853189945220947, |
| "learning_rate": 5.523841280426809e-06, |
| "loss": 0.2885, |
| "step": 47800 |
| }, |
| { |
| "epoch": 2.8752679585823064, |
| "eval_loss": 0.2585737407207489, |
| "eval_runtime": 51.0832, |
| "eval_samples_per_second": 195.759, |
| "eval_steps_per_second": 24.47, |
| "step": 47800 |
| }, |
| { |
| "epoch": 2.8812831635165788, |
| "grad_norm": 0.9299560785293579, |
| "learning_rate": 5.522840946982328e-06, |
| "loss": 0.2865, |
| "step": 47900 |
| }, |
| { |
| "epoch": 2.8812831635165788, |
| "eval_loss": 0.2563331425189972, |
| "eval_runtime": 51.0909, |
| "eval_samples_per_second": 195.729, |
| "eval_steps_per_second": 24.466, |
| "step": 47900 |
| }, |
| { |
| "epoch": 2.8872983684508515, |
| "grad_norm": 0.9286957383155823, |
| "learning_rate": 5.5218406135378465e-06, |
| "loss": 0.2873, |
| "step": 48000 |
| }, |
| { |
| "epoch": 2.8872983684508515, |
| "eval_loss": 0.2592049837112427, |
| "eval_runtime": 48.2359, |
| "eval_samples_per_second": 207.315, |
| "eval_steps_per_second": 25.914, |
| "step": 48000 |
| }, |
| { |
| "epoch": 2.8933135733851243, |
| "grad_norm": 0.8729236125946045, |
| "learning_rate": 5.520840280093364e-06, |
| "loss": 0.2861, |
| "step": 48100 |
| }, |
| { |
| "epoch": 2.8933135733851243, |
| "eval_loss": 0.25870123505592346, |
| "eval_runtime": 51.1066, |
| "eval_samples_per_second": 195.669, |
| "eval_steps_per_second": 24.459, |
| "step": 48100 |
| }, |
| { |
| "epoch": 2.899328778319397, |
| "grad_norm": 0.8652471899986267, |
| "learning_rate": 5.519839946648883e-06, |
| "loss": 0.2867, |
| "step": 48200 |
| }, |
| { |
| "epoch": 2.899328778319397, |
| "eval_loss": 0.2612285017967224, |
| "eval_runtime": 51.1028, |
| "eval_samples_per_second": 195.684, |
| "eval_steps_per_second": 24.46, |
| "step": 48200 |
| }, |
| { |
| "epoch": 2.9053439832536694, |
| "grad_norm": 0.8425643444061279, |
| "learning_rate": 5.518839613204402e-06, |
| "loss": 0.2852, |
| "step": 48300 |
| }, |
| { |
| "epoch": 2.9053439832536694, |
| "eval_loss": 0.2628696858882904, |
| "eval_runtime": 51.123, |
| "eval_samples_per_second": 195.607, |
| "eval_steps_per_second": 24.451, |
| "step": 48300 |
| }, |
| { |
| "epoch": 2.911359188187942, |
| "grad_norm": 0.9844802021980286, |
| "learning_rate": 5.51783927975992e-06, |
| "loss": 0.2877, |
| "step": 48400 |
| }, |
| { |
| "epoch": 2.911359188187942, |
| "eval_loss": 0.2612448036670685, |
| "eval_runtime": 51.0987, |
| "eval_samples_per_second": 195.7, |
| "eval_steps_per_second": 24.462, |
| "step": 48400 |
| }, |
| { |
| "epoch": 2.9173743931222145, |
| "grad_norm": 0.878381073474884, |
| "learning_rate": 5.516838946315438e-06, |
| "loss": 0.2869, |
| "step": 48500 |
| }, |
| { |
| "epoch": 2.9173743931222145, |
| "eval_loss": 0.25639012455940247, |
| "eval_runtime": 51.1127, |
| "eval_samples_per_second": 195.646, |
| "eval_steps_per_second": 24.456, |
| "step": 48500 |
| }, |
| { |
| "epoch": 2.9233895980564872, |
| "grad_norm": 0.8658349514007568, |
| "learning_rate": 5.515838612870957e-06, |
| "loss": 0.2862, |
| "step": 48600 |
| }, |
| { |
| "epoch": 2.9233895980564872, |
| "eval_loss": 0.24971692264080048, |
| "eval_runtime": 51.1228, |
| "eval_samples_per_second": 195.607, |
| "eval_steps_per_second": 24.451, |
| "step": 48600 |
| }, |
| { |
| "epoch": 2.92940480299076, |
| "grad_norm": 0.8590924143791199, |
| "learning_rate": 5.514838279426476e-06, |
| "loss": 0.2868, |
| "step": 48700 |
| }, |
| { |
| "epoch": 2.92940480299076, |
| "eval_loss": 0.2601747214794159, |
| "eval_runtime": 51.129, |
| "eval_samples_per_second": 195.584, |
| "eval_steps_per_second": 24.448, |
| "step": 48700 |
| }, |
| { |
| "epoch": 2.9354200079250328, |
| "grad_norm": 0.8948882222175598, |
| "learning_rate": 5.5138379459819945e-06, |
| "loss": 0.2876, |
| "step": 48800 |
| }, |
| { |
| "epoch": 2.9354200079250328, |
| "eval_loss": 0.256122350692749, |
| "eval_runtime": 51.1826, |
| "eval_samples_per_second": 195.379, |
| "eval_steps_per_second": 24.422, |
| "step": 48800 |
| }, |
| { |
| "epoch": 2.941435212859305, |
| "grad_norm": 0.8714300990104675, |
| "learning_rate": 5.512837612537512e-06, |
| "loss": 0.2854, |
| "step": 48900 |
| }, |
| { |
| "epoch": 2.941435212859305, |
| "eval_loss": 0.2527640163898468, |
| "eval_runtime": 51.143, |
| "eval_samples_per_second": 195.53, |
| "eval_steps_per_second": 24.441, |
| "step": 48900 |
| }, |
| { |
| "epoch": 2.947450417793578, |
| "grad_norm": 0.8347595930099487, |
| "learning_rate": 5.511837279093031e-06, |
| "loss": 0.2859, |
| "step": 49000 |
| }, |
| { |
| "epoch": 2.947450417793578, |
| "eval_loss": 0.2613712549209595, |
| "eval_runtime": 51.1079, |
| "eval_samples_per_second": 195.664, |
| "eval_steps_per_second": 24.458, |
| "step": 49000 |
| }, |
| { |
| "epoch": 2.95346562272785, |
| "grad_norm": 0.8538709878921509, |
| "learning_rate": 5.51083694564855e-06, |
| "loss": 0.2852, |
| "step": 49100 |
| }, |
| { |
| "epoch": 2.95346562272785, |
| "eval_loss": 0.25488194823265076, |
| "eval_runtime": 51.1132, |
| "eval_samples_per_second": 195.644, |
| "eval_steps_per_second": 24.456, |
| "step": 49100 |
| }, |
| { |
| "epoch": 2.959480827662123, |
| "grad_norm": 0.922144889831543, |
| "learning_rate": 5.509836612204068e-06, |
| "loss": 0.2847, |
| "step": 49200 |
| }, |
| { |
| "epoch": 2.959480827662123, |
| "eval_loss": 0.2526051700115204, |
| "eval_runtime": 51.1124, |
| "eval_samples_per_second": 195.647, |
| "eval_steps_per_second": 24.456, |
| "step": 49200 |
| }, |
| { |
| "epoch": 2.9654960325963957, |
| "grad_norm": 0.8684960007667542, |
| "learning_rate": 5.508836278759587e-06, |
| "loss": 0.2837, |
| "step": 49300 |
| }, |
| { |
| "epoch": 2.9654960325963957, |
| "eval_loss": 0.25194811820983887, |
| "eval_runtime": 51.0578, |
| "eval_samples_per_second": 195.857, |
| "eval_steps_per_second": 24.482, |
| "step": 49300 |
| }, |
| { |
| "epoch": 2.971511237530668, |
| "grad_norm": 0.9055145978927612, |
| "learning_rate": 5.507835945315106e-06, |
| "loss": 0.2817, |
| "step": 49400 |
| }, |
| { |
| "epoch": 2.971511237530668, |
| "eval_loss": 0.25218260288238525, |
| "eval_runtime": 51.0821, |
| "eval_samples_per_second": 195.763, |
| "eval_steps_per_second": 24.47, |
| "step": 49400 |
| }, |
| { |
| "epoch": 2.977526442464941, |
| "grad_norm": 0.8636729121208191, |
| "learning_rate": 5.506835611870624e-06, |
| "loss": 0.2855, |
| "step": 49500 |
| }, |
| { |
| "epoch": 2.977526442464941, |
| "eval_loss": 0.25728458166122437, |
| "eval_runtime": 51.06, |
| "eval_samples_per_second": 195.848, |
| "eval_steps_per_second": 24.481, |
| "step": 49500 |
| }, |
| { |
| "epoch": 2.983541647399213, |
| "grad_norm": 0.9919777512550354, |
| "learning_rate": 5.5058352784261424e-06, |
| "loss": 0.2816, |
| "step": 49600 |
| }, |
| { |
| "epoch": 2.983541647399213, |
| "eval_loss": 0.2515828311443329, |
| "eval_runtime": 51.2113, |
| "eval_samples_per_second": 195.269, |
| "eval_steps_per_second": 24.409, |
| "step": 49600 |
| }, |
| { |
| "epoch": 2.989556852333486, |
| "grad_norm": 0.9122774600982666, |
| "learning_rate": 5.50483494498166e-06, |
| "loss": 0.2832, |
| "step": 49700 |
| }, |
| { |
| "epoch": 2.989556852333486, |
| "eval_loss": 0.25426608324050903, |
| "eval_runtime": 51.098, |
| "eval_samples_per_second": 195.702, |
| "eval_steps_per_second": 24.463, |
| "step": 49700 |
| }, |
| { |
| "epoch": 2.9955720572677587, |
| "grad_norm": 0.8778186440467834, |
| "learning_rate": 5.503834611537179e-06, |
| "loss": 0.2821, |
| "step": 49800 |
| }, |
| { |
| "epoch": 2.9955720572677587, |
| "eval_loss": 0.2510456442832947, |
| "eval_runtime": 51.0495, |
| "eval_samples_per_second": 195.888, |
| "eval_steps_per_second": 24.486, |
| "step": 49800 |
| }, |
| { |
| "epoch": 3.001587262202031, |
| "grad_norm": 0.8645954132080078, |
| "learning_rate": 5.502834278092698e-06, |
| "loss": 0.283, |
| "step": 49900 |
| }, |
| { |
| "epoch": 3.001587262202031, |
| "eval_loss": 0.2549561858177185, |
| "eval_runtime": 51.1194, |
| "eval_samples_per_second": 195.62, |
| "eval_steps_per_second": 24.453, |
| "step": 49900 |
| }, |
| { |
| "epoch": 3.0076024671363037, |
| "grad_norm": 0.971116304397583, |
| "learning_rate": 5.501833944648216e-06, |
| "loss": 0.2833, |
| "step": 50000 |
| }, |
| { |
| "epoch": 3.0076024671363037, |
| "eval_loss": 0.24709643423557281, |
| "eval_runtime": 50.6183, |
| "eval_samples_per_second": 197.557, |
| "eval_steps_per_second": 24.695, |
| "step": 50000 |
| }, |
| { |
| "epoch": 3.0136176720705765, |
| "grad_norm": 0.9352070093154907, |
| "learning_rate": 5.500833611203735e-06, |
| "loss": 0.2829, |
| "step": 50100 |
| }, |
| { |
| "epoch": 3.0136176720705765, |
| "eval_loss": 0.2510698139667511, |
| "eval_runtime": 50.9108, |
| "eval_samples_per_second": 196.422, |
| "eval_steps_per_second": 24.553, |
| "step": 50100 |
| }, |
| { |
| "epoch": 3.019632877004849, |
| "grad_norm": 0.8702713847160339, |
| "learning_rate": 5.499833277759254e-06, |
| "loss": 0.2806, |
| "step": 50200 |
| }, |
| { |
| "epoch": 3.019632877004849, |
| "eval_loss": 0.25517037510871887, |
| "eval_runtime": 51.143, |
| "eval_samples_per_second": 195.53, |
| "eval_steps_per_second": 24.441, |
| "step": 50200 |
| }, |
| { |
| "epoch": 3.0256480819391216, |
| "grad_norm": 0.8589245676994324, |
| "learning_rate": 5.498832944314772e-06, |
| "loss": 0.2828, |
| "step": 50300 |
| }, |
| { |
| "epoch": 3.0256480819391216, |
| "eval_loss": 0.25433140993118286, |
| "eval_runtime": 48.9769, |
| "eval_samples_per_second": 204.178, |
| "eval_steps_per_second": 25.522, |
| "step": 50300 |
| }, |
| { |
| "epoch": 3.0316632868733944, |
| "grad_norm": 0.8240871429443359, |
| "learning_rate": 5.49783261087029e-06, |
| "loss": 0.2786, |
| "step": 50400 |
| }, |
| { |
| "epoch": 3.0316632868733944, |
| "eval_loss": 0.2537357807159424, |
| "eval_runtime": 43.57, |
| "eval_samples_per_second": 229.516, |
| "eval_steps_per_second": 28.689, |
| "step": 50400 |
| }, |
| { |
| "epoch": 3.0376784918076667, |
| "grad_norm": 0.8937031030654907, |
| "learning_rate": 5.496832277425809e-06, |
| "loss": 0.2818, |
| "step": 50500 |
| }, |
| { |
| "epoch": 3.0376784918076667, |
| "eval_loss": 0.25536617636680603, |
| "eval_runtime": 43.9342, |
| "eval_samples_per_second": 227.613, |
| "eval_steps_per_second": 28.452, |
| "step": 50500 |
| }, |
| { |
| "epoch": 3.0436936967419395, |
| "grad_norm": 0.8851022720336914, |
| "learning_rate": 5.495831943981327e-06, |
| "loss": 0.28, |
| "step": 50600 |
| }, |
| { |
| "epoch": 3.0436936967419395, |
| "eval_loss": 0.2511354684829712, |
| "eval_runtime": 43.4697, |
| "eval_samples_per_second": 230.045, |
| "eval_steps_per_second": 28.756, |
| "step": 50600 |
| }, |
| { |
| "epoch": 3.0497089016762122, |
| "grad_norm": 0.9308133125305176, |
| "learning_rate": 5.494831610536846e-06, |
| "loss": 0.2822, |
| "step": 50700 |
| }, |
| { |
| "epoch": 3.0497089016762122, |
| "eval_loss": 0.2528564929962158, |
| "eval_runtime": 38.8722, |
| "eval_samples_per_second": 257.253, |
| "eval_steps_per_second": 32.157, |
| "step": 50700 |
| }, |
| { |
| "epoch": 3.0557241066104845, |
| "grad_norm": 1.0158571004867554, |
| "learning_rate": 5.493831277092364e-06, |
| "loss": 0.2829, |
| "step": 50800 |
| }, |
| { |
| "epoch": 3.0557241066104845, |
| "eval_loss": 0.24908022582530975, |
| "eval_runtime": 37.7881, |
| "eval_samples_per_second": 264.634, |
| "eval_steps_per_second": 33.079, |
| "step": 50800 |
| }, |
| { |
| "epoch": 3.0617393115447573, |
| "grad_norm": 0.8238421082496643, |
| "learning_rate": 5.492830943647883e-06, |
| "loss": 0.2804, |
| "step": 50900 |
| }, |
| { |
| "epoch": 3.0617393115447573, |
| "eval_loss": 0.24608242511749268, |
| "eval_runtime": 40.8226, |
| "eval_samples_per_second": 244.962, |
| "eval_steps_per_second": 30.62, |
| "step": 50900 |
| }, |
| { |
| "epoch": 3.06775451647903, |
| "grad_norm": 0.8686819672584534, |
| "learning_rate": 5.491830610203402e-06, |
| "loss": 0.2793, |
| "step": 51000 |
| }, |
| { |
| "epoch": 3.06775451647903, |
| "eval_loss": 0.24653884768486023, |
| "eval_runtime": 43.055, |
| "eval_samples_per_second": 232.261, |
| "eval_steps_per_second": 29.033, |
| "step": 51000 |
| }, |
| { |
| "epoch": 3.0737697214133024, |
| "grad_norm": 0.9399664998054504, |
| "learning_rate": 5.49083027675892e-06, |
| "loss": 0.2812, |
| "step": 51100 |
| }, |
| { |
| "epoch": 3.0737697214133024, |
| "eval_loss": 0.25110530853271484, |
| "eval_runtime": 44.132, |
| "eval_samples_per_second": 226.593, |
| "eval_steps_per_second": 28.324, |
| "step": 51100 |
| }, |
| { |
| "epoch": 3.079784926347575, |
| "grad_norm": 0.9775184988975525, |
| "learning_rate": 5.489829943314438e-06, |
| "loss": 0.2791, |
| "step": 51200 |
| }, |
| { |
| "epoch": 3.079784926347575, |
| "eval_loss": 0.24785326421260834, |
| "eval_runtime": 39.588, |
| "eval_samples_per_second": 252.602, |
| "eval_steps_per_second": 31.575, |
| "step": 51200 |
| }, |
| { |
| "epoch": 3.0858001312818475, |
| "grad_norm": 0.9678452014923096, |
| "learning_rate": 5.488829609869957e-06, |
| "loss": 0.2799, |
| "step": 51300 |
| }, |
| { |
| "epoch": 3.0858001312818475, |
| "eval_loss": 0.25371748208999634, |
| "eval_runtime": 40.7507, |
| "eval_samples_per_second": 245.395, |
| "eval_steps_per_second": 30.674, |
| "step": 51300 |
| }, |
| { |
| "epoch": 3.0918153362161203, |
| "grad_norm": 0.9417468309402466, |
| "learning_rate": 5.487829276425475e-06, |
| "loss": 0.2794, |
| "step": 51400 |
| }, |
| { |
| "epoch": 3.0918153362161203, |
| "eval_loss": 0.2551732659339905, |
| "eval_runtime": 42.2338, |
| "eval_samples_per_second": 236.777, |
| "eval_steps_per_second": 29.597, |
| "step": 51400 |
| }, |
| { |
| "epoch": 3.097830541150393, |
| "grad_norm": 0.8855278491973877, |
| "learning_rate": 5.486828942980994e-06, |
| "loss": 0.2798, |
| "step": 51500 |
| }, |
| { |
| "epoch": 3.097830541150393, |
| "eval_loss": 0.24791452288627625, |
| "eval_runtime": 48.1906, |
| "eval_samples_per_second": 207.509, |
| "eval_steps_per_second": 25.939, |
| "step": 51500 |
| }, |
| { |
| "epoch": 3.1038457460846653, |
| "grad_norm": 0.8699272274971008, |
| "learning_rate": 5.485828609536512e-06, |
| "loss": 0.2777, |
| "step": 51600 |
| }, |
| { |
| "epoch": 3.1038457460846653, |
| "eval_loss": 0.24532942473888397, |
| "eval_runtime": 45.8295, |
| "eval_samples_per_second": 218.2, |
| "eval_steps_per_second": 27.275, |
| "step": 51600 |
| }, |
| { |
| "epoch": 3.109860951018938, |
| "grad_norm": 0.8299559950828552, |
| "learning_rate": 5.484828276092031e-06, |
| "loss": 0.277, |
| "step": 51700 |
| }, |
| { |
| "epoch": 3.109860951018938, |
| "eval_loss": 0.24607662856578827, |
| "eval_runtime": 46.3442, |
| "eval_samples_per_second": 215.777, |
| "eval_steps_per_second": 26.972, |
| "step": 51700 |
| }, |
| { |
| "epoch": 3.115876155953211, |
| "grad_norm": 0.8937397003173828, |
| "learning_rate": 5.48382794264755e-06, |
| "loss": 0.2823, |
| "step": 51800 |
| }, |
| { |
| "epoch": 3.115876155953211, |
| "eval_loss": 0.2510640621185303, |
| "eval_runtime": 47.5854, |
| "eval_samples_per_second": 210.148, |
| "eval_steps_per_second": 26.269, |
| "step": 51800 |
| }, |
| { |
| "epoch": 3.121891360887483, |
| "grad_norm": 0.7908412218093872, |
| "learning_rate": 5.482827609203068e-06, |
| "loss": 0.2764, |
| "step": 51900 |
| }, |
| { |
| "epoch": 3.121891360887483, |
| "eval_loss": 0.24473002552986145, |
| "eval_runtime": 48.2096, |
| "eval_samples_per_second": 207.427, |
| "eval_steps_per_second": 25.928, |
| "step": 51900 |
| }, |
| { |
| "epoch": 3.127906565821756, |
| "grad_norm": 0.8543498516082764, |
| "learning_rate": 5.481827275758586e-06, |
| "loss": 0.2782, |
| "step": 52000 |
| }, |
| { |
| "epoch": 3.127906565821756, |
| "eval_loss": 0.24760138988494873, |
| "eval_runtime": 48.6773, |
| "eval_samples_per_second": 205.435, |
| "eval_steps_per_second": 25.679, |
| "step": 52000 |
| }, |
| { |
| "epoch": 3.1339217707560287, |
| "grad_norm": 0.869742751121521, |
| "learning_rate": 5.480826942314105e-06, |
| "loss": 0.2778, |
| "step": 52100 |
| }, |
| { |
| "epoch": 3.1339217707560287, |
| "eval_loss": 0.2506987452507019, |
| "eval_runtime": 49.27, |
| "eval_samples_per_second": 202.963, |
| "eval_steps_per_second": 25.37, |
| "step": 52100 |
| }, |
| { |
| "epoch": 3.139936975690301, |
| "grad_norm": 0.97697514295578, |
| "learning_rate": 5.479826608869623e-06, |
| "loss": 0.2765, |
| "step": 52200 |
| }, |
| { |
| "epoch": 3.139936975690301, |
| "eval_loss": 0.248337984085083, |
| "eval_runtime": 50.0788, |
| "eval_samples_per_second": 199.685, |
| "eval_steps_per_second": 24.961, |
| "step": 52200 |
| }, |
| { |
| "epoch": 3.145952180624574, |
| "grad_norm": 0.9102049469947815, |
| "learning_rate": 5.478826275425142e-06, |
| "loss": 0.2776, |
| "step": 52300 |
| }, |
| { |
| "epoch": 3.145952180624574, |
| "eval_loss": 0.24709181487560272, |
| "eval_runtime": 50.384, |
| "eval_samples_per_second": 198.476, |
| "eval_steps_per_second": 24.809, |
| "step": 52300 |
| }, |
| { |
| "epoch": 3.151967385558846, |
| "grad_norm": 0.9332506656646729, |
| "learning_rate": 5.47782594198066e-06, |
| "loss": 0.2777, |
| "step": 52400 |
| }, |
| { |
| "epoch": 3.151967385558846, |
| "eval_loss": 0.2484249472618103, |
| "eval_runtime": 50.292, |
| "eval_samples_per_second": 198.839, |
| "eval_steps_per_second": 24.855, |
| "step": 52400 |
| }, |
| { |
| "epoch": 3.157982590493119, |
| "grad_norm": 0.8517917394638062, |
| "learning_rate": 5.476825608536179e-06, |
| "loss": 0.278, |
| "step": 52500 |
| }, |
| { |
| "epoch": 3.157982590493119, |
| "eval_loss": 0.24207893013954163, |
| "eval_runtime": 48.3341, |
| "eval_samples_per_second": 206.893, |
| "eval_steps_per_second": 25.862, |
| "step": 52500 |
| }, |
| { |
| "epoch": 3.1639977954273917, |
| "grad_norm": 0.8629357814788818, |
| "learning_rate": 5.475825275091698e-06, |
| "loss": 0.2775, |
| "step": 52600 |
| }, |
| { |
| "epoch": 3.1639977954273917, |
| "eval_loss": 0.24527695775032043, |
| "eval_runtime": 50.4058, |
| "eval_samples_per_second": 198.39, |
| "eval_steps_per_second": 24.799, |
| "step": 52600 |
| }, |
| { |
| "epoch": 3.170013000361664, |
| "grad_norm": 0.9194425940513611, |
| "learning_rate": 5.4748249416472156e-06, |
| "loss": 0.2775, |
| "step": 52700 |
| }, |
| { |
| "epoch": 3.170013000361664, |
| "eval_loss": 0.2455427497625351, |
| "eval_runtime": 48.0608, |
| "eval_samples_per_second": 208.07, |
| "eval_steps_per_second": 26.009, |
| "step": 52700 |
| }, |
| { |
| "epoch": 3.1760282052959368, |
| "grad_norm": 0.8746848702430725, |
| "learning_rate": 5.473824608202734e-06, |
| "loss": 0.278, |
| "step": 52800 |
| }, |
| { |
| "epoch": 3.1760282052959368, |
| "eval_loss": 0.24813415110111237, |
| "eval_runtime": 42.6735, |
| "eval_samples_per_second": 234.338, |
| "eval_steps_per_second": 29.292, |
| "step": 52800 |
| }, |
| { |
| "epoch": 3.1820434102302095, |
| "grad_norm": 0.9082689881324768, |
| "learning_rate": 5.472824274758253e-06, |
| "loss": 0.2732, |
| "step": 52900 |
| }, |
| { |
| "epoch": 3.1820434102302095, |
| "eval_loss": 0.24827983975410461, |
| "eval_runtime": 44.2364, |
| "eval_samples_per_second": 226.058, |
| "eval_steps_per_second": 28.257, |
| "step": 52900 |
| }, |
| { |
| "epoch": 3.188058615164482, |
| "grad_norm": 0.8607956171035767, |
| "learning_rate": 5.471823941313771e-06, |
| "loss": 0.2772, |
| "step": 53000 |
| }, |
| { |
| "epoch": 3.188058615164482, |
| "eval_loss": 0.24322330951690674, |
| "eval_runtime": 44.8161, |
| "eval_samples_per_second": 223.134, |
| "eval_steps_per_second": 27.892, |
| "step": 53000 |
| }, |
| { |
| "epoch": 3.1940738200987546, |
| "grad_norm": 0.9439307451248169, |
| "learning_rate": 5.4708236078692896e-06, |
| "loss": 0.2734, |
| "step": 53100 |
| }, |
| { |
| "epoch": 3.1940738200987546, |
| "eval_loss": 0.24696892499923706, |
| "eval_runtime": 47.0223, |
| "eval_samples_per_second": 212.665, |
| "eval_steps_per_second": 26.583, |
| "step": 53100 |
| }, |
| { |
| "epoch": 3.2000890250330274, |
| "grad_norm": 1.0130066871643066, |
| "learning_rate": 5.469823274424808e-06, |
| "loss": 0.2737, |
| "step": 53200 |
| }, |
| { |
| "epoch": 3.2000890250330274, |
| "eval_loss": 0.2521739602088928, |
| "eval_runtime": 46.5164, |
| "eval_samples_per_second": 214.978, |
| "eval_steps_per_second": 26.872, |
| "step": 53200 |
| }, |
| { |
| "epoch": 3.2061042299672997, |
| "grad_norm": 0.9969391822814941, |
| "learning_rate": 5.468822940980327e-06, |
| "loss": 0.2767, |
| "step": 53300 |
| }, |
| { |
| "epoch": 3.2061042299672997, |
| "eval_loss": 0.25239297747612, |
| "eval_runtime": 46.7418, |
| "eval_samples_per_second": 213.941, |
| "eval_steps_per_second": 26.743, |
| "step": 53300 |
| }, |
| { |
| "epoch": 3.2121194349015725, |
| "grad_norm": 0.9380843639373779, |
| "learning_rate": 5.467822607535846e-06, |
| "loss": 0.2743, |
| "step": 53400 |
| }, |
| { |
| "epoch": 3.2121194349015725, |
| "eval_loss": 0.2427060306072235, |
| "eval_runtime": 47.8166, |
| "eval_samples_per_second": 209.133, |
| "eval_steps_per_second": 26.142, |
| "step": 53400 |
| }, |
| { |
| "epoch": 3.2181346398358452, |
| "grad_norm": 0.8498116135597229, |
| "learning_rate": 5.466822274091364e-06, |
| "loss": 0.2752, |
| "step": 53500 |
| }, |
| { |
| "epoch": 3.2181346398358452, |
| "eval_loss": 0.23972123861312866, |
| "eval_runtime": 48.9235, |
| "eval_samples_per_second": 204.401, |
| "eval_steps_per_second": 25.55, |
| "step": 53500 |
| }, |
| { |
| "epoch": 3.2241498447701176, |
| "grad_norm": 0.8372825980186462, |
| "learning_rate": 5.465821940646882e-06, |
| "loss": 0.273, |
| "step": 53600 |
| }, |
| { |
| "epoch": 3.2241498447701176, |
| "eval_loss": 0.2440669685602188, |
| "eval_runtime": 47.5669, |
| "eval_samples_per_second": 210.23, |
| "eval_steps_per_second": 26.279, |
| "step": 53600 |
| }, |
| { |
| "epoch": 3.2301650497043903, |
| "grad_norm": 0.9698020815849304, |
| "learning_rate": 5.464821607202401e-06, |
| "loss": 0.2767, |
| "step": 53700 |
| }, |
| { |
| "epoch": 3.2301650497043903, |
| "eval_loss": 0.23816044628620148, |
| "eval_runtime": 34.9463, |
| "eval_samples_per_second": 286.153, |
| "eval_steps_per_second": 35.769, |
| "step": 53700 |
| }, |
| { |
| "epoch": 3.236180254638663, |
| "grad_norm": 0.822875440120697, |
| "learning_rate": 5.463821273757919e-06, |
| "loss": 0.2751, |
| "step": 53800 |
| }, |
| { |
| "epoch": 3.236180254638663, |
| "eval_loss": 0.24079230427742004, |
| "eval_runtime": 35.4307, |
| "eval_samples_per_second": 282.241, |
| "eval_steps_per_second": 35.28, |
| "step": 53800 |
| }, |
| { |
| "epoch": 3.2421954595729354, |
| "grad_norm": 0.8933221101760864, |
| "learning_rate": 5.4628209403134375e-06, |
| "loss": 0.2753, |
| "step": 53900 |
| }, |
| { |
| "epoch": 3.2421954595729354, |
| "eval_loss": 0.25047245621681213, |
| "eval_runtime": 36.1364, |
| "eval_samples_per_second": 276.729, |
| "eval_steps_per_second": 34.591, |
| "step": 53900 |
| }, |
| { |
| "epoch": 3.248210664507208, |
| "grad_norm": 0.915135383605957, |
| "learning_rate": 5.461820606868957e-06, |
| "loss": 0.2736, |
| "step": 54000 |
| }, |
| { |
| "epoch": 3.248210664507208, |
| "eval_loss": 0.24464978277683258, |
| "eval_runtime": 35.7495, |
| "eval_samples_per_second": 279.724, |
| "eval_steps_per_second": 34.966, |
| "step": 54000 |
| }, |
| { |
| "epoch": 3.2542258694414805, |
| "grad_norm": 0.8490029573440552, |
| "learning_rate": 5.460820273424475e-06, |
| "loss": 0.274, |
| "step": 54100 |
| }, |
| { |
| "epoch": 3.2542258694414805, |
| "eval_loss": 0.2507534325122833, |
| "eval_runtime": 38.4129, |
| "eval_samples_per_second": 260.329, |
| "eval_steps_per_second": 32.541, |
| "step": 54100 |
| }, |
| { |
| "epoch": 3.2602410743757533, |
| "grad_norm": 0.9220608472824097, |
| "learning_rate": 5.459819939979994e-06, |
| "loss": 0.2736, |
| "step": 54200 |
| }, |
| { |
| "epoch": 3.2602410743757533, |
| "eval_loss": 0.24634374678134918, |
| "eval_runtime": 41.8157, |
| "eval_samples_per_second": 239.145, |
| "eval_steps_per_second": 29.893, |
| "step": 54200 |
| }, |
| { |
| "epoch": 3.266256279310026, |
| "grad_norm": 0.8318041563034058, |
| "learning_rate": 5.458819606535512e-06, |
| "loss": 0.271, |
| "step": 54300 |
| }, |
| { |
| "epoch": 3.266256279310026, |
| "eval_loss": 0.24672181904315948, |
| "eval_runtime": 39.1233, |
| "eval_samples_per_second": 255.602, |
| "eval_steps_per_second": 31.95, |
| "step": 54300 |
| }, |
| { |
| "epoch": 3.2722714842442984, |
| "grad_norm": 0.8373593091964722, |
| "learning_rate": 5.45781927309103e-06, |
| "loss": 0.272, |
| "step": 54400 |
| }, |
| { |
| "epoch": 3.2722714842442984, |
| "eval_loss": 0.24106918275356293, |
| "eval_runtime": 36.4825, |
| "eval_samples_per_second": 274.104, |
| "eval_steps_per_second": 34.263, |
| "step": 54400 |
| }, |
| { |
| "epoch": 3.278286689178571, |
| "grad_norm": 0.8802669644355774, |
| "learning_rate": 5.456818939646549e-06, |
| "loss": 0.2683, |
| "step": 54500 |
| }, |
| { |
| "epoch": 3.278286689178571, |
| "eval_loss": 0.24452929198741913, |
| "eval_runtime": 33.0976, |
| "eval_samples_per_second": 302.137, |
| "eval_steps_per_second": 37.767, |
| "step": 54500 |
| }, |
| { |
| "epoch": 3.284301894112844, |
| "grad_norm": 0.8867002129554749, |
| "learning_rate": 5.455818606202067e-06, |
| "loss": 0.2697, |
| "step": 54600 |
| }, |
| { |
| "epoch": 3.284301894112844, |
| "eval_loss": 0.23936684429645538, |
| "eval_runtime": 40.904, |
| "eval_samples_per_second": 244.475, |
| "eval_steps_per_second": 30.559, |
| "step": 54600 |
| }, |
| { |
| "epoch": 3.2903170990471162, |
| "grad_norm": 0.91335529088974, |
| "learning_rate": 5.454818272757586e-06, |
| "loss": 0.2739, |
| "step": 54700 |
| }, |
| { |
| "epoch": 3.2903170990471162, |
| "eval_loss": 0.24262717366218567, |
| "eval_runtime": 43.6033, |
| "eval_samples_per_second": 229.34, |
| "eval_steps_per_second": 28.668, |
| "step": 54700 |
| }, |
| { |
| "epoch": 3.296332303981389, |
| "grad_norm": 0.8662433624267578, |
| "learning_rate": 5.453817939313105e-06, |
| "loss": 0.2715, |
| "step": 54800 |
| }, |
| { |
| "epoch": 3.296332303981389, |
| "eval_loss": 0.24885956943035126, |
| "eval_runtime": 45.6743, |
| "eval_samples_per_second": 218.942, |
| "eval_steps_per_second": 27.368, |
| "step": 54800 |
| }, |
| { |
| "epoch": 3.3023475089156618, |
| "grad_norm": 0.943458616733551, |
| "learning_rate": 5.452817605868623e-06, |
| "loss": 0.2709, |
| "step": 54900 |
| }, |
| { |
| "epoch": 3.3023475089156618, |
| "eval_loss": 0.24570631980895996, |
| "eval_runtime": 46.9183, |
| "eval_samples_per_second": 213.136, |
| "eval_steps_per_second": 26.642, |
| "step": 54900 |
| }, |
| { |
| "epoch": 3.308362713849934, |
| "grad_norm": 0.8767443299293518, |
| "learning_rate": 5.451817272424142e-06, |
| "loss": 0.2724, |
| "step": 55000 |
| }, |
| { |
| "epoch": 3.308362713849934, |
| "eval_loss": 0.24481208622455597, |
| "eval_runtime": 47.56, |
| "eval_samples_per_second": 210.261, |
| "eval_steps_per_second": 26.283, |
| "step": 55000 |
| }, |
| { |
| "epoch": 3.314377918784207, |
| "grad_norm": 0.9032852053642273, |
| "learning_rate": 5.45081693897966e-06, |
| "loss": 0.2733, |
| "step": 55100 |
| }, |
| { |
| "epoch": 3.314377918784207, |
| "eval_loss": 0.24037285149097443, |
| "eval_runtime": 48.5117, |
| "eval_samples_per_second": 206.136, |
| "eval_steps_per_second": 25.767, |
| "step": 55100 |
| }, |
| { |
| "epoch": 3.320393123718479, |
| "grad_norm": 0.8414300084114075, |
| "learning_rate": 5.449816605535178e-06, |
| "loss": 0.2709, |
| "step": 55200 |
| }, |
| { |
| "epoch": 3.320393123718479, |
| "eval_loss": 0.24620996415615082, |
| "eval_runtime": 48.2151, |
| "eval_samples_per_second": 207.404, |
| "eval_steps_per_second": 25.925, |
| "step": 55200 |
| }, |
| { |
| "epoch": 3.326408328652752, |
| "grad_norm": 0.9093489646911621, |
| "learning_rate": 5.448816272090697e-06, |
| "loss": 0.2683, |
| "step": 55300 |
| }, |
| { |
| "epoch": 3.326408328652752, |
| "eval_loss": 0.24467670917510986, |
| "eval_runtime": 49.7086, |
| "eval_samples_per_second": 201.172, |
| "eval_steps_per_second": 25.147, |
| "step": 55300 |
| }, |
| { |
| "epoch": 3.3324235335870247, |
| "grad_norm": 0.920391857624054, |
| "learning_rate": 5.447815938646216e-06, |
| "loss": 0.2703, |
| "step": 55400 |
| }, |
| { |
| "epoch": 3.3324235335870247, |
| "eval_loss": 0.24019140005111694, |
| "eval_runtime": 50.0394, |
| "eval_samples_per_second": 199.843, |
| "eval_steps_per_second": 24.98, |
| "step": 55400 |
| }, |
| { |
| "epoch": 3.338438738521297, |
| "grad_norm": 0.9286474585533142, |
| "learning_rate": 5.446815605201734e-06, |
| "loss": 0.2705, |
| "step": 55500 |
| }, |
| { |
| "epoch": 3.338438738521297, |
| "eval_loss": 0.24543143808841705, |
| "eval_runtime": 50.344, |
| "eval_samples_per_second": 198.633, |
| "eval_steps_per_second": 24.829, |
| "step": 55500 |
| }, |
| { |
| "epoch": 3.34445394345557, |
| "grad_norm": 0.9175123572349548, |
| "learning_rate": 5.445815271757253e-06, |
| "loss": 0.2713, |
| "step": 55600 |
| }, |
| { |
| "epoch": 3.34445394345557, |
| "eval_loss": 0.23898915946483612, |
| "eval_runtime": 50.3195, |
| "eval_samples_per_second": 198.73, |
| "eval_steps_per_second": 24.841, |
| "step": 55600 |
| }, |
| { |
| "epoch": 3.3504691483898426, |
| "grad_norm": 0.8990902900695801, |
| "learning_rate": 5.444814938312771e-06, |
| "loss": 0.2713, |
| "step": 55700 |
| }, |
| { |
| "epoch": 3.3504691483898426, |
| "eval_loss": 0.24149462580680847, |
| "eval_runtime": 50.7504, |
| "eval_samples_per_second": 197.043, |
| "eval_steps_per_second": 24.63, |
| "step": 55700 |
| }, |
| { |
| "epoch": 3.356484353324115, |
| "grad_norm": 0.8217372298240662, |
| "learning_rate": 5.4438146048682896e-06, |
| "loss": 0.2694, |
| "step": 55800 |
| }, |
| { |
| "epoch": 3.356484353324115, |
| "eval_loss": 0.24138091504573822, |
| "eval_runtime": 50.9006, |
| "eval_samples_per_second": 196.461, |
| "eval_steps_per_second": 24.558, |
| "step": 55800 |
| }, |
| { |
| "epoch": 3.3624995582583876, |
| "grad_norm": 0.8727395534515381, |
| "learning_rate": 5.442814271423808e-06, |
| "loss": 0.2694, |
| "step": 55900 |
| }, |
| { |
| "epoch": 3.3624995582583876, |
| "eval_loss": 0.24046172201633453, |
| "eval_runtime": 36.3936, |
| "eval_samples_per_second": 274.773, |
| "eval_steps_per_second": 34.347, |
| "step": 55900 |
| }, |
| { |
| "epoch": 3.3685147631926604, |
| "grad_norm": 0.8453567028045654, |
| "learning_rate": 5.441813937979326e-06, |
| "loss": 0.2683, |
| "step": 56000 |
| }, |
| { |
| "epoch": 3.3685147631926604, |
| "eval_loss": 0.24423474073410034, |
| "eval_runtime": 50.8544, |
| "eval_samples_per_second": 196.64, |
| "eval_steps_per_second": 24.58, |
| "step": 56000 |
| }, |
| { |
| "epoch": 3.3745299681269327, |
| "grad_norm": 0.86241614818573, |
| "learning_rate": 5.440813604534845e-06, |
| "loss": 0.2649, |
| "step": 56100 |
| }, |
| { |
| "epoch": 3.3745299681269327, |
| "eval_loss": 0.2407056838274002, |
| "eval_runtime": 50.778, |
| "eval_samples_per_second": 196.936, |
| "eval_steps_per_second": 24.617, |
| "step": 56100 |
| }, |
| { |
| "epoch": 3.3805451730612055, |
| "grad_norm": 0.9142568111419678, |
| "learning_rate": 5.4398132710903636e-06, |
| "loss": 0.2696, |
| "step": 56200 |
| }, |
| { |
| "epoch": 3.3805451730612055, |
| "eval_loss": 0.24098168313503265, |
| "eval_runtime": 51.0703, |
| "eval_samples_per_second": 195.809, |
| "eval_steps_per_second": 24.476, |
| "step": 56200 |
| }, |
| { |
| "epoch": 3.386560377995478, |
| "grad_norm": 0.8302989602088928, |
| "learning_rate": 5.438812937645882e-06, |
| "loss": 0.2695, |
| "step": 56300 |
| }, |
| { |
| "epoch": 3.386560377995478, |
| "eval_loss": 0.23798757791519165, |
| "eval_runtime": 50.9646, |
| "eval_samples_per_second": 196.215, |
| "eval_steps_per_second": 24.527, |
| "step": 56300 |
| }, |
| { |
| "epoch": 3.3925755829297506, |
| "grad_norm": 0.8420681357383728, |
| "learning_rate": 5.437812604201401e-06, |
| "loss": 0.2682, |
| "step": 56400 |
| }, |
| { |
| "epoch": 3.3925755829297506, |
| "eval_loss": 0.24360163509845734, |
| "eval_runtime": 51.0498, |
| "eval_samples_per_second": 195.887, |
| "eval_steps_per_second": 24.486, |
| "step": 56400 |
| }, |
| { |
| "epoch": 3.3985907878640234, |
| "grad_norm": 0.8456258773803711, |
| "learning_rate": 5.436812270756919e-06, |
| "loss": 0.2661, |
| "step": 56500 |
| }, |
| { |
| "epoch": 3.3985907878640234, |
| "eval_loss": 0.23989547789096832, |
| "eval_runtime": 49.5593, |
| "eval_samples_per_second": 201.778, |
| "eval_steps_per_second": 25.222, |
| "step": 56500 |
| }, |
| { |
| "epoch": 3.404605992798296, |
| "grad_norm": 0.9097959399223328, |
| "learning_rate": 5.4358119373124375e-06, |
| "loss": 0.2684, |
| "step": 56600 |
| }, |
| { |
| "epoch": 3.404605992798296, |
| "eval_loss": 0.2373836487531662, |
| "eval_runtime": 48.7156, |
| "eval_samples_per_second": 205.273, |
| "eval_steps_per_second": 25.659, |
| "step": 56600 |
| }, |
| { |
| "epoch": 3.4106211977325684, |
| "grad_norm": 0.8549370169639587, |
| "learning_rate": 5.434811603867956e-06, |
| "loss": 0.266, |
| "step": 56700 |
| }, |
| { |
| "epoch": 3.4106211977325684, |
| "eval_loss": 0.2353491634130478, |
| "eval_runtime": 48.0299, |
| "eval_samples_per_second": 208.204, |
| "eval_steps_per_second": 26.025, |
| "step": 56700 |
| }, |
| { |
| "epoch": 3.416636402666841, |
| "grad_norm": 0.9058821797370911, |
| "learning_rate": 5.433811270423474e-06, |
| "loss": 0.2712, |
| "step": 56800 |
| }, |
| { |
| "epoch": 3.416636402666841, |
| "eval_loss": 0.24013860523700714, |
| "eval_runtime": 48.2564, |
| "eval_samples_per_second": 207.226, |
| "eval_steps_per_second": 25.903, |
| "step": 56800 |
| }, |
| { |
| "epoch": 3.4226516076011135, |
| "grad_norm": 0.7843255400657654, |
| "learning_rate": 5.432810936978993e-06, |
| "loss": 0.2667, |
| "step": 56900 |
| }, |
| { |
| "epoch": 3.4226516076011135, |
| "eval_loss": 0.2440056949853897, |
| "eval_runtime": 49.156, |
| "eval_samples_per_second": 203.434, |
| "eval_steps_per_second": 25.429, |
| "step": 56900 |
| }, |
| { |
| "epoch": 3.4286668125353863, |
| "grad_norm": 0.8476096987724304, |
| "learning_rate": 5.4318106035345115e-06, |
| "loss": 0.2647, |
| "step": 57000 |
| }, |
| { |
| "epoch": 3.4286668125353863, |
| "eval_loss": 0.24185192584991455, |
| "eval_runtime": 48.8755, |
| "eval_samples_per_second": 204.602, |
| "eval_steps_per_second": 25.575, |
| "step": 57000 |
| }, |
| { |
| "epoch": 3.434682017469659, |
| "grad_norm": 0.8693493008613586, |
| "learning_rate": 5.43081027009003e-06, |
| "loss": 0.2667, |
| "step": 57100 |
| }, |
| { |
| "epoch": 3.434682017469659, |
| "eval_loss": 0.23922978341579437, |
| "eval_runtime": 49.1662, |
| "eval_samples_per_second": 203.392, |
| "eval_steps_per_second": 25.424, |
| "step": 57100 |
| }, |
| { |
| "epoch": 3.4406972224039314, |
| "grad_norm": 0.7601708769798279, |
| "learning_rate": 5.429809936645549e-06, |
| "loss": 0.268, |
| "step": 57200 |
| }, |
| { |
| "epoch": 3.4406972224039314, |
| "eval_loss": 0.2391706109046936, |
| "eval_runtime": 49.6653, |
| "eval_samples_per_second": 201.348, |
| "eval_steps_per_second": 25.168, |
| "step": 57200 |
| }, |
| { |
| "epoch": 3.446712427338204, |
| "grad_norm": 0.8476257920265198, |
| "learning_rate": 5.428809603201068e-06, |
| "loss": 0.2668, |
| "step": 57300 |
| }, |
| { |
| "epoch": 3.446712427338204, |
| "eval_loss": 0.23998339474201202, |
| "eval_runtime": 49.9477, |
| "eval_samples_per_second": 200.209, |
| "eval_steps_per_second": 25.026, |
| "step": 57300 |
| }, |
| { |
| "epoch": 3.452727632272477, |
| "grad_norm": 0.9185997843742371, |
| "learning_rate": 5.4278092697565855e-06, |
| "loss": 0.2649, |
| "step": 57400 |
| }, |
| { |
| "epoch": 3.452727632272477, |
| "eval_loss": 0.2374006062746048, |
| "eval_runtime": 49.539, |
| "eval_samples_per_second": 201.861, |
| "eval_steps_per_second": 25.233, |
| "step": 57400 |
| }, |
| { |
| "epoch": 3.4587428372067492, |
| "grad_norm": 0.8186565041542053, |
| "learning_rate": 5.426808936312104e-06, |
| "loss": 0.2667, |
| "step": 57500 |
| }, |
| { |
| "epoch": 3.4587428372067492, |
| "eval_loss": 0.23729223012924194, |
| "eval_runtime": 50.958, |
| "eval_samples_per_second": 196.24, |
| "eval_steps_per_second": 24.53, |
| "step": 57500 |
| }, |
| { |
| "epoch": 3.464758042141022, |
| "grad_norm": 0.876054048538208, |
| "learning_rate": 5.425808602867622e-06, |
| "loss": 0.2644, |
| "step": 57600 |
| }, |
| { |
| "epoch": 3.464758042141022, |
| "eval_loss": 0.2387179434299469, |
| "eval_runtime": 36.8167, |
| "eval_samples_per_second": 271.616, |
| "eval_steps_per_second": 33.952, |
| "step": 57600 |
| }, |
| { |
| "epoch": 3.4707732470752948, |
| "grad_norm": 0.8078221678733826, |
| "learning_rate": 5.424808269423141e-06, |
| "loss": 0.2671, |
| "step": 57700 |
| }, |
| { |
| "epoch": 3.4707732470752948, |
| "eval_loss": 0.23494240641593933, |
| "eval_runtime": 50.9663, |
| "eval_samples_per_second": 196.208, |
| "eval_steps_per_second": 24.526, |
| "step": 57700 |
| }, |
| { |
| "epoch": 3.476788452009567, |
| "grad_norm": 0.8425822257995605, |
| "learning_rate": 5.4238079359786595e-06, |
| "loss": 0.2662, |
| "step": 57800 |
| }, |
| { |
| "epoch": 3.476788452009567, |
| "eval_loss": 0.23349033296108246, |
| "eval_runtime": 50.9349, |
| "eval_samples_per_second": 196.329, |
| "eval_steps_per_second": 24.541, |
| "step": 57800 |
| }, |
| { |
| "epoch": 3.48280365694384, |
| "grad_norm": 0.8718583583831787, |
| "learning_rate": 5.422807602534178e-06, |
| "loss": 0.267, |
| "step": 57900 |
| }, |
| { |
| "epoch": 3.48280365694384, |
| "eval_loss": 0.23534800112247467, |
| "eval_runtime": 50.6689, |
| "eval_samples_per_second": 197.36, |
| "eval_steps_per_second": 24.67, |
| "step": 57900 |
| }, |
| { |
| "epoch": 3.488818861878112, |
| "grad_norm": 0.8161312341690063, |
| "learning_rate": 5.421807269089697e-06, |
| "loss": 0.2641, |
| "step": 58000 |
| }, |
| { |
| "epoch": 3.488818861878112, |
| "eval_loss": 0.23691873252391815, |
| "eval_runtime": 50.9223, |
| "eval_samples_per_second": 196.377, |
| "eval_steps_per_second": 24.547, |
| "step": 58000 |
| }, |
| { |
| "epoch": 3.494834066812385, |
| "grad_norm": 0.781482458114624, |
| "learning_rate": 5.420806935645216e-06, |
| "loss": 0.2652, |
| "step": 58100 |
| }, |
| { |
| "epoch": 3.494834066812385, |
| "eval_loss": 0.2412412315607071, |
| "eval_runtime": 51.059, |
| "eval_samples_per_second": 195.852, |
| "eval_steps_per_second": 24.481, |
| "step": 58100 |
| }, |
| { |
| "epoch": 3.5008492717466577, |
| "grad_norm": 0.869367778301239, |
| "learning_rate": 5.4198066022007335e-06, |
| "loss": 0.2639, |
| "step": 58200 |
| }, |
| { |
| "epoch": 3.5008492717466577, |
| "eval_loss": 0.23919972777366638, |
| "eval_runtime": 50.9672, |
| "eval_samples_per_second": 196.205, |
| "eval_steps_per_second": 24.526, |
| "step": 58200 |
| }, |
| { |
| "epoch": 3.5068644766809305, |
| "grad_norm": 0.8614550828933716, |
| "learning_rate": 5.418806268756252e-06, |
| "loss": 0.2637, |
| "step": 58300 |
| }, |
| { |
| "epoch": 3.5068644766809305, |
| "eval_loss": 0.23232702910900116, |
| "eval_runtime": 50.8155, |
| "eval_samples_per_second": 196.79, |
| "eval_steps_per_second": 24.599, |
| "step": 58300 |
| }, |
| { |
| "epoch": 3.512879681615203, |
| "grad_norm": 0.9519971609115601, |
| "learning_rate": 5.417805935311771e-06, |
| "loss": 0.2636, |
| "step": 58400 |
| }, |
| { |
| "epoch": 3.512879681615203, |
| "eval_loss": 0.2359647899866104, |
| "eval_runtime": 51.0167, |
| "eval_samples_per_second": 196.014, |
| "eval_steps_per_second": 24.502, |
| "step": 58400 |
| }, |
| { |
| "epoch": 3.5188948865494756, |
| "grad_norm": 0.7815201282501221, |
| "learning_rate": 5.416805601867289e-06, |
| "loss": 0.263, |
| "step": 58500 |
| }, |
| { |
| "epoch": 3.5188948865494756, |
| "eval_loss": 0.2390337437391281, |
| "eval_runtime": 50.9327, |
| "eval_samples_per_second": 196.337, |
| "eval_steps_per_second": 24.542, |
| "step": 58500 |
| }, |
| { |
| "epoch": 3.524910091483748, |
| "grad_norm": 0.9015016555786133, |
| "learning_rate": 5.415805268422808e-06, |
| "loss": 0.2635, |
| "step": 58600 |
| }, |
| { |
| "epoch": 3.524910091483748, |
| "eval_loss": 0.23515385389328003, |
| "eval_runtime": 50.6423, |
| "eval_samples_per_second": 197.463, |
| "eval_steps_per_second": 24.683, |
| "step": 58600 |
| }, |
| { |
| "epoch": 3.5309252964180207, |
| "grad_norm": 0.9041895866394043, |
| "learning_rate": 5.414804934978326e-06, |
| "loss": 0.2633, |
| "step": 58700 |
| }, |
| { |
| "epoch": 3.5309252964180207, |
| "eval_loss": 0.2379036694765091, |
| "eval_runtime": 50.2383, |
| "eval_samples_per_second": 199.051, |
| "eval_steps_per_second": 24.881, |
| "step": 58700 |
| }, |
| { |
| "epoch": 3.5369405013522934, |
| "grad_norm": 0.884931743144989, |
| "learning_rate": 5.413804601533845e-06, |
| "loss": 0.2612, |
| "step": 58800 |
| }, |
| { |
| "epoch": 3.5369405013522934, |
| "eval_loss": 0.23683039844036102, |
| "eval_runtime": 50.2696, |
| "eval_samples_per_second": 198.928, |
| "eval_steps_per_second": 24.866, |
| "step": 58800 |
| }, |
| { |
| "epoch": 3.5429557062865658, |
| "grad_norm": 0.862382709980011, |
| "learning_rate": 5.4128042680893636e-06, |
| "loss": 0.2623, |
| "step": 58900 |
| }, |
| { |
| "epoch": 3.5429557062865658, |
| "eval_loss": 0.23638789355754852, |
| "eval_runtime": 50.4759, |
| "eval_samples_per_second": 198.114, |
| "eval_steps_per_second": 24.764, |
| "step": 58900 |
| }, |
| { |
| "epoch": 3.5489709112208385, |
| "grad_norm": 0.8239731788635254, |
| "learning_rate": 5.4118039346448814e-06, |
| "loss": 0.2652, |
| "step": 59000 |
| }, |
| { |
| "epoch": 3.5489709112208385, |
| "eval_loss": 0.23644813895225525, |
| "eval_runtime": 49.8805, |
| "eval_samples_per_second": 200.479, |
| "eval_steps_per_second": 25.06, |
| "step": 59000 |
| }, |
| { |
| "epoch": 3.554986116155111, |
| "grad_norm": 0.8433008193969727, |
| "learning_rate": 5.4108036012004e-06, |
| "loss": 0.2628, |
| "step": 59100 |
| }, |
| { |
| "epoch": 3.554986116155111, |
| "eval_loss": 0.23331347107887268, |
| "eval_runtime": 50.0038, |
| "eval_samples_per_second": 199.985, |
| "eval_steps_per_second": 24.998, |
| "step": 59100 |
| }, |
| { |
| "epoch": 3.5610013210893836, |
| "grad_norm": 0.8740643858909607, |
| "learning_rate": 5.409803267755919e-06, |
| "loss": 0.2615, |
| "step": 59200 |
| }, |
| { |
| "epoch": 3.5610013210893836, |
| "eval_loss": 0.23751728236675262, |
| "eval_runtime": 49.4105, |
| "eval_samples_per_second": 202.386, |
| "eval_steps_per_second": 25.298, |
| "step": 59200 |
| }, |
| { |
| "epoch": 3.5670165260236564, |
| "grad_norm": 0.7903056144714355, |
| "learning_rate": 5.4088029343114375e-06, |
| "loss": 0.2621, |
| "step": 59300 |
| }, |
| { |
| "epoch": 3.5670165260236564, |
| "eval_loss": 0.23228037357330322, |
| "eval_runtime": 49.2273, |
| "eval_samples_per_second": 203.139, |
| "eval_steps_per_second": 25.392, |
| "step": 59300 |
| }, |
| { |
| "epoch": 3.573031730957929, |
| "grad_norm": 0.8559598326683044, |
| "learning_rate": 5.407802600866956e-06, |
| "loss": 0.2621, |
| "step": 59400 |
| }, |
| { |
| "epoch": 3.573031730957929, |
| "eval_loss": 0.23780353367328644, |
| "eval_runtime": 49.4165, |
| "eval_samples_per_second": 202.362, |
| "eval_steps_per_second": 25.295, |
| "step": 59400 |
| }, |
| { |
| "epoch": 3.5790469358922015, |
| "grad_norm": 0.9178751111030579, |
| "learning_rate": 5.406802267422474e-06, |
| "loss": 0.2635, |
| "step": 59500 |
| }, |
| { |
| "epoch": 3.5790469358922015, |
| "eval_loss": 0.23736293613910675, |
| "eval_runtime": 49.1576, |
| "eval_samples_per_second": 203.427, |
| "eval_steps_per_second": 25.428, |
| "step": 59500 |
| }, |
| { |
| "epoch": 3.5850621408264742, |
| "grad_norm": 0.8310320377349854, |
| "learning_rate": 5.405801933977993e-06, |
| "loss": 0.2626, |
| "step": 59600 |
| }, |
| { |
| "epoch": 3.5850621408264742, |
| "eval_loss": 0.2320030778646469, |
| "eval_runtime": 49.4934, |
| "eval_samples_per_second": 202.047, |
| "eval_steps_per_second": 25.256, |
| "step": 59600 |
| }, |
| { |
| "epoch": 3.5910773457607466, |
| "grad_norm": 0.7860143184661865, |
| "learning_rate": 5.4048016005335115e-06, |
| "loss": 0.2632, |
| "step": 59700 |
| }, |
| { |
| "epoch": 3.5910773457607466, |
| "eval_loss": 0.2336650937795639, |
| "eval_runtime": 49.1673, |
| "eval_samples_per_second": 203.387, |
| "eval_steps_per_second": 25.423, |
| "step": 59700 |
| }, |
| { |
| "epoch": 3.5970925506950193, |
| "grad_norm": 0.836063027381897, |
| "learning_rate": 5.403801267089029e-06, |
| "loss": 0.2621, |
| "step": 59800 |
| }, |
| { |
| "epoch": 3.5970925506950193, |
| "eval_loss": 0.23437707126140594, |
| "eval_runtime": 49.5986, |
| "eval_samples_per_second": 201.619, |
| "eval_steps_per_second": 25.202, |
| "step": 59800 |
| }, |
| { |
| "epoch": 3.603107755629292, |
| "grad_norm": 0.8768342137336731, |
| "learning_rate": 5.402800933644548e-06, |
| "loss": 0.2609, |
| "step": 59900 |
| }, |
| { |
| "epoch": 3.603107755629292, |
| "eval_loss": 0.23560036718845367, |
| "eval_runtime": 49.2225, |
| "eval_samples_per_second": 203.159, |
| "eval_steps_per_second": 25.395, |
| "step": 59900 |
| }, |
| { |
| "epoch": 3.6091229605635644, |
| "grad_norm": 0.8093357682228088, |
| "learning_rate": 5.401800600200067e-06, |
| "loss": 0.26, |
| "step": 60000 |
| }, |
| { |
| "epoch": 3.6091229605635644, |
| "eval_loss": 0.2340717762708664, |
| "eval_runtime": 49.3844, |
| "eval_samples_per_second": 202.493, |
| "eval_steps_per_second": 25.312, |
| "step": 60000 |
| }, |
| { |
| "epoch": 3.615138165497837, |
| "grad_norm": 0.8731770515441895, |
| "learning_rate": 5.4008002667555855e-06, |
| "loss": 0.2614, |
| "step": 60100 |
| }, |
| { |
| "epoch": 3.615138165497837, |
| "eval_loss": 0.2342948466539383, |
| "eval_runtime": 48.6563, |
| "eval_samples_per_second": 205.523, |
| "eval_steps_per_second": 25.69, |
| "step": 60100 |
| }, |
| { |
| "epoch": 3.6211533704321095, |
| "grad_norm": 0.8906363844871521, |
| "learning_rate": 5.399799933311104e-06, |
| "loss": 0.2601, |
| "step": 60200 |
| }, |
| { |
| "epoch": 3.6211533704321095, |
| "eval_loss": 0.2331141084432602, |
| "eval_runtime": 49.3998, |
| "eval_samples_per_second": 202.43, |
| "eval_steps_per_second": 25.304, |
| "step": 60200 |
| }, |
| { |
| "epoch": 3.6271685753663823, |
| "grad_norm": 0.8565790057182312, |
| "learning_rate": 5.398799599866623e-06, |
| "loss": 0.2603, |
| "step": 60300 |
| }, |
| { |
| "epoch": 3.6271685753663823, |
| "eval_loss": 0.23420780897140503, |
| "eval_runtime": 48.2983, |
| "eval_samples_per_second": 207.046, |
| "eval_steps_per_second": 25.881, |
| "step": 60300 |
| }, |
| { |
| "epoch": 3.633183780300655, |
| "grad_norm": 0.9718087911605835, |
| "learning_rate": 5.397799266422141e-06, |
| "loss": 0.2635, |
| "step": 60400 |
| }, |
| { |
| "epoch": 3.633183780300655, |
| "eval_loss": 0.2375570833683014, |
| "eval_runtime": 48.9976, |
| "eval_samples_per_second": 204.091, |
| "eval_steps_per_second": 25.511, |
| "step": 60400 |
| }, |
| { |
| "epoch": 3.639198985234928, |
| "grad_norm": 0.8572448492050171, |
| "learning_rate": 5.3967989329776595e-06, |
| "loss": 0.2626, |
| "step": 60500 |
| }, |
| { |
| "epoch": 3.639198985234928, |
| "eval_loss": 0.23931777477264404, |
| "eval_runtime": 49.0436, |
| "eval_samples_per_second": 203.9, |
| "eval_steps_per_second": 25.488, |
| "step": 60500 |
| }, |
| { |
| "epoch": 3.6452141901692, |
| "grad_norm": 0.8994346857070923, |
| "learning_rate": 5.395798599533177e-06, |
| "loss": 0.2595, |
| "step": 60600 |
| }, |
| { |
| "epoch": 3.6452141901692, |
| "eval_loss": 0.2317589819431305, |
| "eval_runtime": 49.5846, |
| "eval_samples_per_second": 201.675, |
| "eval_steps_per_second": 25.209, |
| "step": 60600 |
| }, |
| { |
| "epoch": 3.651229395103473, |
| "grad_norm": 0.8513436913490295, |
| "learning_rate": 5.394798266088696e-06, |
| "loss": 0.2614, |
| "step": 60700 |
| }, |
| { |
| "epoch": 3.651229395103473, |
| "eval_loss": 0.23111025989055634, |
| "eval_runtime": 49.7262, |
| "eval_samples_per_second": 201.101, |
| "eval_steps_per_second": 25.138, |
| "step": 60700 |
| }, |
| { |
| "epoch": 3.657244600037745, |
| "grad_norm": 0.9126865267753601, |
| "learning_rate": 5.393797932644215e-06, |
| "loss": 0.2583, |
| "step": 60800 |
| }, |
| { |
| "epoch": 3.657244600037745, |
| "eval_loss": 0.23351147770881653, |
| "eval_runtime": 49.8967, |
| "eval_samples_per_second": 200.414, |
| "eval_steps_per_second": 25.052, |
| "step": 60800 |
| }, |
| { |
| "epoch": 3.663259804972018, |
| "grad_norm": 0.8021876811981201, |
| "learning_rate": 5.3927975991997335e-06, |
| "loss": 0.2601, |
| "step": 60900 |
| }, |
| { |
| "epoch": 3.663259804972018, |
| "eval_loss": 0.23443163931369781, |
| "eval_runtime": 49.9056, |
| "eval_samples_per_second": 200.378, |
| "eval_steps_per_second": 25.047, |
| "step": 60900 |
| }, |
| { |
| "epoch": 3.6692750099062907, |
| "grad_norm": 0.8586119413375854, |
| "learning_rate": 5.391797265755252e-06, |
| "loss": 0.2605, |
| "step": 61000 |
| }, |
| { |
| "epoch": 3.6692750099062907, |
| "eval_loss": 0.229187473654747, |
| "eval_runtime": 40.9269, |
| "eval_samples_per_second": 244.338, |
| "eval_steps_per_second": 30.542, |
| "step": 61000 |
| }, |
| { |
| "epoch": 3.6752902148405635, |
| "grad_norm": 0.9336073398590088, |
| "learning_rate": 5.390796932310771e-06, |
| "loss": 0.2612, |
| "step": 61100 |
| }, |
| { |
| "epoch": 3.6752902148405635, |
| "eval_loss": 0.23033183813095093, |
| "eval_runtime": 49.8614, |
| "eval_samples_per_second": 200.556, |
| "eval_steps_per_second": 25.069, |
| "step": 61100 |
| }, |
| { |
| "epoch": 3.681305419774836, |
| "grad_norm": 0.7944173812866211, |
| "learning_rate": 5.389796598866289e-06, |
| "loss": 0.2595, |
| "step": 61200 |
| }, |
| { |
| "epoch": 3.681305419774836, |
| "eval_loss": 0.22884014248847961, |
| "eval_runtime": 50.2375, |
| "eval_samples_per_second": 199.055, |
| "eval_steps_per_second": 24.882, |
| "step": 61200 |
| }, |
| { |
| "epoch": 3.6873206247091086, |
| "grad_norm": 0.8038543462753296, |
| "learning_rate": 5.3887962654218075e-06, |
| "loss": 0.2588, |
| "step": 61300 |
| }, |
| { |
| "epoch": 3.6873206247091086, |
| "eval_loss": 0.23328329622745514, |
| "eval_runtime": 51.0649, |
| "eval_samples_per_second": 195.829, |
| "eval_steps_per_second": 24.479, |
| "step": 61300 |
| }, |
| { |
| "epoch": 3.693335829643381, |
| "grad_norm": 0.8919224143028259, |
| "learning_rate": 5.387795931977326e-06, |
| "loss": 0.2592, |
| "step": 61400 |
| }, |
| { |
| "epoch": 3.693335829643381, |
| "eval_loss": 0.23098503053188324, |
| "eval_runtime": 51.0915, |
| "eval_samples_per_second": 195.727, |
| "eval_steps_per_second": 24.466, |
| "step": 61400 |
| }, |
| { |
| "epoch": 3.6993510345776537, |
| "grad_norm": 0.81063312292099, |
| "learning_rate": 5.386795598532844e-06, |
| "loss": 0.2598, |
| "step": 61500 |
| }, |
| { |
| "epoch": 3.6993510345776537, |
| "eval_loss": 0.23130032420158386, |
| "eval_runtime": 51.1499, |
| "eval_samples_per_second": 195.504, |
| "eval_steps_per_second": 24.438, |
| "step": 61500 |
| }, |
| { |
| "epoch": 3.7053662395119265, |
| "grad_norm": 0.8565428853034973, |
| "learning_rate": 5.385795265088363e-06, |
| "loss": 0.2569, |
| "step": 61600 |
| }, |
| { |
| "epoch": 3.7053662395119265, |
| "eval_loss": 0.23042194545269012, |
| "eval_runtime": 51.0719, |
| "eval_samples_per_second": 195.802, |
| "eval_steps_per_second": 24.475, |
| "step": 61600 |
| }, |
| { |
| "epoch": 3.7113814444461988, |
| "grad_norm": 0.8808117508888245, |
| "learning_rate": 5.3847949316438814e-06, |
| "loss": 0.2579, |
| "step": 61700 |
| }, |
| { |
| "epoch": 3.7113814444461988, |
| "eval_loss": 0.22964029014110565, |
| "eval_runtime": 51.1788, |
| "eval_samples_per_second": 195.393, |
| "eval_steps_per_second": 24.424, |
| "step": 61700 |
| }, |
| { |
| "epoch": 3.7173966493804715, |
| "grad_norm": 0.8812440037727356, |
| "learning_rate": 5.3837945981994e-06, |
| "loss": 0.2568, |
| "step": 61800 |
| }, |
| { |
| "epoch": 3.7173966493804715, |
| "eval_loss": 0.23177900910377502, |
| "eval_runtime": 51.1658, |
| "eval_samples_per_second": 195.443, |
| "eval_steps_per_second": 24.43, |
| "step": 61800 |
| }, |
| { |
| "epoch": 3.723411854314744, |
| "grad_norm": 0.8692899346351624, |
| "learning_rate": 5.382794264754919e-06, |
| "loss": 0.2567, |
| "step": 61900 |
| }, |
| { |
| "epoch": 3.723411854314744, |
| "eval_loss": 0.23119042813777924, |
| "eval_runtime": 51.1394, |
| "eval_samples_per_second": 195.544, |
| "eval_steps_per_second": 24.443, |
| "step": 61900 |
| }, |
| { |
| "epoch": 3.7294270592490166, |
| "grad_norm": 0.8057258725166321, |
| "learning_rate": 5.381793931310437e-06, |
| "loss": 0.2574, |
| "step": 62000 |
| }, |
| { |
| "epoch": 3.7294270592490166, |
| "eval_loss": 0.2311127930879593, |
| "eval_runtime": 51.1109, |
| "eval_samples_per_second": 195.653, |
| "eval_steps_per_second": 24.457, |
| "step": 62000 |
| }, |
| { |
| "epoch": 3.7354422641832894, |
| "grad_norm": 0.7970178127288818, |
| "learning_rate": 5.380793597865955e-06, |
| "loss": 0.2589, |
| "step": 62100 |
| }, |
| { |
| "epoch": 3.7354422641832894, |
| "eval_loss": 0.2320980727672577, |
| "eval_runtime": 51.1619, |
| "eval_samples_per_second": 195.458, |
| "eval_steps_per_second": 24.432, |
| "step": 62100 |
| }, |
| { |
| "epoch": 3.741457469117562, |
| "grad_norm": 0.8987645506858826, |
| "learning_rate": 5.379793264421474e-06, |
| "loss": 0.2565, |
| "step": 62200 |
| }, |
| { |
| "epoch": 3.741457469117562, |
| "eval_loss": 0.22809037566184998, |
| "eval_runtime": 51.1437, |
| "eval_samples_per_second": 195.527, |
| "eval_steps_per_second": 24.441, |
| "step": 62200 |
| }, |
| { |
| "epoch": 3.7474726740518345, |
| "grad_norm": 0.8491466641426086, |
| "learning_rate": 5.378792930976992e-06, |
| "loss": 0.2572, |
| "step": 62300 |
| }, |
| { |
| "epoch": 3.7474726740518345, |
| "eval_loss": 0.23448967933654785, |
| "eval_runtime": 51.1016, |
| "eval_samples_per_second": 195.688, |
| "eval_steps_per_second": 24.461, |
| "step": 62300 |
| }, |
| { |
| "epoch": 3.7534878789861073, |
| "grad_norm": 0.8310768008232117, |
| "learning_rate": 5.377792597532511e-06, |
| "loss": 0.2558, |
| "step": 62400 |
| }, |
| { |
| "epoch": 3.7534878789861073, |
| "eval_loss": 0.2314356416463852, |
| "eval_runtime": 51.1436, |
| "eval_samples_per_second": 195.528, |
| "eval_steps_per_second": 24.441, |
| "step": 62400 |
| }, |
| { |
| "epoch": 3.7595030839203796, |
| "grad_norm": 0.8902222514152527, |
| "learning_rate": 5.376792264088029e-06, |
| "loss": 0.256, |
| "step": 62500 |
| }, |
| { |
| "epoch": 3.7595030839203796, |
| "eval_loss": 0.23469364643096924, |
| "eval_runtime": 51.1102, |
| "eval_samples_per_second": 195.656, |
| "eval_steps_per_second": 24.457, |
| "step": 62500 |
| }, |
| { |
| "epoch": 3.7655182888546523, |
| "grad_norm": 0.7377832531929016, |
| "learning_rate": 5.375791930643548e-06, |
| "loss": 0.2574, |
| "step": 62600 |
| }, |
| { |
| "epoch": 3.7655182888546523, |
| "eval_loss": 0.23291806876659393, |
| "eval_runtime": 51.1312, |
| "eval_samples_per_second": 195.575, |
| "eval_steps_per_second": 24.447, |
| "step": 62600 |
| }, |
| { |
| "epoch": 3.771533493788925, |
| "grad_norm": 0.7997824549674988, |
| "learning_rate": 5.374791597199067e-06, |
| "loss": 0.257, |
| "step": 62700 |
| }, |
| { |
| "epoch": 3.771533493788925, |
| "eval_loss": 0.23000933229923248, |
| "eval_runtime": 48.2655, |
| "eval_samples_per_second": 207.187, |
| "eval_steps_per_second": 25.898, |
| "step": 62700 |
| }, |
| { |
| "epoch": 3.7775486987231974, |
| "grad_norm": 0.8683999180793762, |
| "learning_rate": 5.373791263754585e-06, |
| "loss": 0.2564, |
| "step": 62800 |
| }, |
| { |
| "epoch": 3.7775486987231974, |
| "eval_loss": 0.23462143540382385, |
| "eval_runtime": 51.0748, |
| "eval_samples_per_second": 195.791, |
| "eval_steps_per_second": 24.474, |
| "step": 62800 |
| }, |
| { |
| "epoch": 3.78356390365747, |
| "grad_norm": 0.8755656480789185, |
| "learning_rate": 5.372790930310103e-06, |
| "loss": 0.2558, |
| "step": 62900 |
| }, |
| { |
| "epoch": 3.78356390365747, |
| "eval_loss": 0.23621977865695953, |
| "eval_runtime": 51.1202, |
| "eval_samples_per_second": 195.617, |
| "eval_steps_per_second": 24.452, |
| "step": 62900 |
| }, |
| { |
| "epoch": 3.7895791085917425, |
| "grad_norm": 0.9032362699508667, |
| "learning_rate": 5.371790596865622e-06, |
| "loss": 0.2551, |
| "step": 63000 |
| }, |
| { |
| "epoch": 3.7895791085917425, |
| "eval_loss": 0.2294510453939438, |
| "eval_runtime": 51.1388, |
| "eval_samples_per_second": 195.546, |
| "eval_steps_per_second": 24.443, |
| "step": 63000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 600000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 37, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 10, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 8 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.304354533994406e+18, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|