| { | |
| "best_global_step": 12000, | |
| "best_metric": 0.12119368463754654, | |
| "best_model_checkpoint": "/data/alamparan/mattext_ckpt/results/2026-02-04/03-04-44/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-12000", | |
| "epoch": 38.70967741935484, | |
| "eval_steps": 50, | |
| "global_step": 12000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.16129032258064516, | |
| "grad_norm": 1.4281898736953735, | |
| "learning_rate": 0.00019936774193548388, | |
| "loss": 6.0329, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.16129032258064516, | |
| "eval_loss": 4.073571681976318, | |
| "eval_runtime": 86.5227, | |
| "eval_samples_per_second": 219.642, | |
| "eval_steps_per_second": 4.577, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.3225806451612903, | |
| "grad_norm": 1.1555994749069214, | |
| "learning_rate": 0.0001987225806451613, | |
| "loss": 3.8465, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3225806451612903, | |
| "eval_loss": 3.6210274696350098, | |
| "eval_runtime": 87.4418, | |
| "eval_samples_per_second": 217.333, | |
| "eval_steps_per_second": 4.529, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.4838709677419355, | |
| "grad_norm": 1.2039262056350708, | |
| "learning_rate": 0.00019807741935483873, | |
| "loss": 3.5783, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4838709677419355, | |
| "eval_loss": 3.4356088638305664, | |
| "eval_runtime": 86.395, | |
| "eval_samples_per_second": 219.966, | |
| "eval_steps_per_second": 4.584, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.6451612903225806, | |
| "grad_norm": 0.8887826204299927, | |
| "learning_rate": 0.00019743225806451612, | |
| "loss": 3.411, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6451612903225806, | |
| "eval_loss": 3.294156074523926, | |
| "eval_runtime": 89.1633, | |
| "eval_samples_per_second": 213.137, | |
| "eval_steps_per_second": 4.441, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.8064516129032258, | |
| "grad_norm": 0.9355249404907227, | |
| "learning_rate": 0.00019678709677419356, | |
| "loss": 3.3012, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.8064516129032258, | |
| "eval_loss": 3.1778066158294678, | |
| "eval_runtime": 87.9303, | |
| "eval_samples_per_second": 216.126, | |
| "eval_steps_per_second": 4.504, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.967741935483871, | |
| "grad_norm": 0.9125154614448547, | |
| "learning_rate": 0.000196141935483871, | |
| "loss": 3.1839, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.967741935483871, | |
| "eval_loss": 3.0771267414093018, | |
| "eval_runtime": 88.1645, | |
| "eval_samples_per_second": 215.552, | |
| "eval_steps_per_second": 4.492, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.129032258064516, | |
| "grad_norm": 1.038801670074463, | |
| "learning_rate": 0.00019549677419354838, | |
| "loss": 3.0949, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.129032258064516, | |
| "eval_loss": 2.972693920135498, | |
| "eval_runtime": 87.3627, | |
| "eval_samples_per_second": 217.53, | |
| "eval_steps_per_second": 4.533, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.2903225806451613, | |
| "grad_norm": 0.9201057553291321, | |
| "learning_rate": 0.00019485161290322582, | |
| "loss": 3.0067, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.2903225806451613, | |
| "eval_loss": 2.8708949089050293, | |
| "eval_runtime": 86.2708, | |
| "eval_samples_per_second": 220.283, | |
| "eval_steps_per_second": 4.59, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.4516129032258065, | |
| "grad_norm": 1.1271328926086426, | |
| "learning_rate": 0.00019420645161290323, | |
| "loss": 2.8894, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.4516129032258065, | |
| "eval_loss": 2.741400957107544, | |
| "eval_runtime": 88.3865, | |
| "eval_samples_per_second": 215.01, | |
| "eval_steps_per_second": 4.48, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.6129032258064515, | |
| "grad_norm": 1.6507548093795776, | |
| "learning_rate": 0.00019356129032258065, | |
| "loss": 2.7396, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.6129032258064515, | |
| "eval_loss": 2.5605974197387695, | |
| "eval_runtime": 88.4696, | |
| "eval_samples_per_second": 214.808, | |
| "eval_steps_per_second": 4.476, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.7741935483870968, | |
| "grad_norm": 1.5166431665420532, | |
| "learning_rate": 0.00019291612903225806, | |
| "loss": 2.5539, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.7741935483870968, | |
| "eval_loss": 2.2101876735687256, | |
| "eval_runtime": 88.7721, | |
| "eval_samples_per_second": 214.076, | |
| "eval_steps_per_second": 4.461, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.935483870967742, | |
| "grad_norm": 1.8889034986495972, | |
| "learning_rate": 0.0001922709677419355, | |
| "loss": 2.043, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.935483870967742, | |
| "eval_loss": 1.4450961351394653, | |
| "eval_runtime": 87.3615, | |
| "eval_samples_per_second": 217.533, | |
| "eval_steps_per_second": 4.533, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.096774193548387, | |
| "grad_norm": 1.4880077838897705, | |
| "learning_rate": 0.0001916258064516129, | |
| "loss": 1.4473, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.096774193548387, | |
| "eval_loss": 1.047796607017517, | |
| "eval_runtime": 88.4068, | |
| "eval_samples_per_second": 214.961, | |
| "eval_steps_per_second": 4.479, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.258064516129032, | |
| "grad_norm": 1.2545322179794312, | |
| "learning_rate": 0.00019098064516129032, | |
| "loss": 1.1162, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.258064516129032, | |
| "eval_loss": 0.8680551052093506, | |
| "eval_runtime": 89.0396, | |
| "eval_samples_per_second": 213.433, | |
| "eval_steps_per_second": 4.447, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.4193548387096775, | |
| "grad_norm": 1.1532652378082275, | |
| "learning_rate": 0.00019033548387096776, | |
| "loss": 0.9731, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.4193548387096775, | |
| "eval_loss": 0.785129964351654, | |
| "eval_runtime": 90.5213, | |
| "eval_samples_per_second": 209.939, | |
| "eval_steps_per_second": 4.375, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.5806451612903225, | |
| "grad_norm": 0.8688052892684937, | |
| "learning_rate": 0.00018969032258064517, | |
| "loss": 0.8907, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.5806451612903225, | |
| "eval_loss": 0.7259724140167236, | |
| "eval_runtime": 88.9062, | |
| "eval_samples_per_second": 213.753, | |
| "eval_steps_per_second": 4.454, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.741935483870968, | |
| "grad_norm": 1.1574801206588745, | |
| "learning_rate": 0.00018904516129032259, | |
| "loss": 0.8301, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.741935483870968, | |
| "eval_loss": 0.6951683163642883, | |
| "eval_runtime": 89.0056, | |
| "eval_samples_per_second": 213.515, | |
| "eval_steps_per_second": 4.449, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.903225806451613, | |
| "grad_norm": 1.0460095405578613, | |
| "learning_rate": 0.0001884, | |
| "loss": 0.7883, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.903225806451613, | |
| "eval_loss": 0.6605275869369507, | |
| "eval_runtime": 87.0532, | |
| "eval_samples_per_second": 218.303, | |
| "eval_steps_per_second": 4.549, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 3.064516129032258, | |
| "grad_norm": 0.7760242223739624, | |
| "learning_rate": 0.00018775483870967744, | |
| "loss": 0.7428, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 3.064516129032258, | |
| "eval_loss": 0.6210550665855408, | |
| "eval_runtime": 87.5365, | |
| "eval_samples_per_second": 217.098, | |
| "eval_steps_per_second": 4.524, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 3.225806451612903, | |
| "grad_norm": 0.7606909275054932, | |
| "learning_rate": 0.00018710967741935485, | |
| "loss": 0.7159, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 3.225806451612903, | |
| "eval_loss": 0.60401451587677, | |
| "eval_runtime": 86.4168, | |
| "eval_samples_per_second": 219.911, | |
| "eval_steps_per_second": 4.582, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 3.3870967741935485, | |
| "grad_norm": 0.7674448490142822, | |
| "learning_rate": 0.00018646451612903226, | |
| "loss": 0.6965, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 3.3870967741935485, | |
| "eval_loss": 0.583739697933197, | |
| "eval_runtime": 87.6612, | |
| "eval_samples_per_second": 216.789, | |
| "eval_steps_per_second": 4.517, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 3.5483870967741935, | |
| "grad_norm": 0.8499358296394348, | |
| "learning_rate": 0.0001858193548387097, | |
| "loss": 0.6706, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 3.5483870967741935, | |
| "eval_loss": 0.567313551902771, | |
| "eval_runtime": 84.0187, | |
| "eval_samples_per_second": 226.188, | |
| "eval_steps_per_second": 4.713, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 3.709677419354839, | |
| "grad_norm": 0.8062635660171509, | |
| "learning_rate": 0.0001851741935483871, | |
| "loss": 0.6605, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 3.709677419354839, | |
| "eval_loss": 0.5546169281005859, | |
| "eval_runtime": 87.6004, | |
| "eval_samples_per_second": 216.94, | |
| "eval_steps_per_second": 4.521, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 3.870967741935484, | |
| "grad_norm": 0.9007663726806641, | |
| "learning_rate": 0.00018452903225806453, | |
| "loss": 0.6528, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 3.870967741935484, | |
| "eval_loss": 0.5342397093772888, | |
| "eval_runtime": 90.2247, | |
| "eval_samples_per_second": 210.63, | |
| "eval_steps_per_second": 4.389, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 4.032258064516129, | |
| "grad_norm": 0.6880891919136047, | |
| "learning_rate": 0.00018388387096774194, | |
| "loss": 0.6174, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 4.032258064516129, | |
| "eval_loss": 0.5208889245986938, | |
| "eval_runtime": 94.9972, | |
| "eval_samples_per_second": 200.048, | |
| "eval_steps_per_second": 4.169, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 4.193548387096774, | |
| "grad_norm": 0.708258330821991, | |
| "learning_rate": 0.00018323870967741935, | |
| "loss": 0.5884, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 4.193548387096774, | |
| "eval_loss": 0.5109750032424927, | |
| "eval_runtime": 94.4447, | |
| "eval_samples_per_second": 201.218, | |
| "eval_steps_per_second": 4.193, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 4.354838709677419, | |
| "grad_norm": 0.5771231651306152, | |
| "learning_rate": 0.0001825935483870968, | |
| "loss": 0.5515, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 4.354838709677419, | |
| "eval_loss": 0.5005462169647217, | |
| "eval_runtime": 92.4886, | |
| "eval_samples_per_second": 205.474, | |
| "eval_steps_per_second": 4.282, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 4.516129032258064, | |
| "grad_norm": 0.9058783054351807, | |
| "learning_rate": 0.0001819483870967742, | |
| "loss": 0.5876, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 4.516129032258064, | |
| "eval_loss": 0.4874018728733063, | |
| "eval_runtime": 90.1408, | |
| "eval_samples_per_second": 210.826, | |
| "eval_steps_per_second": 4.393, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 4.67741935483871, | |
| "grad_norm": 0.702014148235321, | |
| "learning_rate": 0.00018130322580645162, | |
| "loss": 0.5564, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 4.67741935483871, | |
| "eval_loss": 0.47600802779197693, | |
| "eval_runtime": 89.1009, | |
| "eval_samples_per_second": 213.286, | |
| "eval_steps_per_second": 4.444, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 4.838709677419355, | |
| "grad_norm": 0.6843072772026062, | |
| "learning_rate": 0.00018065806451612903, | |
| "loss": 0.5857, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 4.838709677419355, | |
| "eval_loss": 0.45552995800971985, | |
| "eval_runtime": 87.6377, | |
| "eval_samples_per_second": 216.847, | |
| "eval_steps_per_second": 4.519, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.9617322683334351, | |
| "learning_rate": 0.00018001290322580647, | |
| "loss": 0.5225, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.448142945766449, | |
| "eval_runtime": 88.2995, | |
| "eval_samples_per_second": 215.222, | |
| "eval_steps_per_second": 4.485, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 5.161290322580645, | |
| "grad_norm": 0.5962368249893188, | |
| "learning_rate": 0.00017936774193548388, | |
| "loss": 0.5237, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 5.161290322580645, | |
| "eval_loss": 0.4412171542644501, | |
| "eval_runtime": 86.7486, | |
| "eval_samples_per_second": 219.07, | |
| "eval_steps_per_second": 4.565, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 5.32258064516129, | |
| "grad_norm": 0.6392622590065002, | |
| "learning_rate": 0.0001787225806451613, | |
| "loss": 0.5279, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 5.32258064516129, | |
| "eval_loss": 0.43270382285118103, | |
| "eval_runtime": 86.3243, | |
| "eval_samples_per_second": 220.147, | |
| "eval_steps_per_second": 4.587, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 5.483870967741936, | |
| "grad_norm": 0.5868324637413025, | |
| "learning_rate": 0.00017807741935483873, | |
| "loss": 0.4941, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 5.483870967741936, | |
| "eval_loss": 0.42134207487106323, | |
| "eval_runtime": 87.3073, | |
| "eval_samples_per_second": 217.668, | |
| "eval_steps_per_second": 4.536, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 5.645161290322581, | |
| "grad_norm": 0.6335242986679077, | |
| "learning_rate": 0.00017743225806451615, | |
| "loss": 0.4771, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 5.645161290322581, | |
| "eval_loss": 0.4060479998588562, | |
| "eval_runtime": 86.2245, | |
| "eval_samples_per_second": 220.402, | |
| "eval_steps_per_second": 4.593, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 5.806451612903226, | |
| "grad_norm": 0.6034550666809082, | |
| "learning_rate": 0.00017678709677419356, | |
| "loss": 0.4411, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 5.806451612903226, | |
| "eval_loss": 0.39572522044181824, | |
| "eval_runtime": 86.9597, | |
| "eval_samples_per_second": 218.538, | |
| "eval_steps_per_second": 4.554, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 5.967741935483871, | |
| "grad_norm": 0.6557337045669556, | |
| "learning_rate": 0.00017614193548387097, | |
| "loss": 0.4511, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 5.967741935483871, | |
| "eval_loss": 0.3814023435115814, | |
| "eval_runtime": 88.9636, | |
| "eval_samples_per_second": 213.615, | |
| "eval_steps_per_second": 4.451, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 6.129032258064516, | |
| "grad_norm": 0.510995090007782, | |
| "learning_rate": 0.0001754967741935484, | |
| "loss": 0.4288, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 6.129032258064516, | |
| "eval_loss": 0.3621025085449219, | |
| "eval_runtime": 92.0174, | |
| "eval_samples_per_second": 206.526, | |
| "eval_steps_per_second": 4.304, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 6.290322580645161, | |
| "grad_norm": 0.644413948059082, | |
| "learning_rate": 0.00017485161290322582, | |
| "loss": 0.4234, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 6.290322580645161, | |
| "eval_loss": 0.35021910071372986, | |
| "eval_runtime": 95.7807, | |
| "eval_samples_per_second": 198.411, | |
| "eval_steps_per_second": 4.134, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 6.451612903225806, | |
| "grad_norm": 0.5785158276557922, | |
| "learning_rate": 0.00017420645161290323, | |
| "loss": 0.4009, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 6.451612903225806, | |
| "eval_loss": 0.33460476994514465, | |
| "eval_runtime": 146.86, | |
| "eval_samples_per_second": 129.402, | |
| "eval_steps_per_second": 2.696, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 6.612903225806452, | |
| "grad_norm": 0.7037348747253418, | |
| "learning_rate": 0.00017356129032258067, | |
| "loss": 0.3922, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 6.612903225806452, | |
| "eval_loss": 0.3163394331932068, | |
| "eval_runtime": 179.0958, | |
| "eval_samples_per_second": 106.111, | |
| "eval_steps_per_second": 2.211, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 6.774193548387097, | |
| "grad_norm": 0.6514049768447876, | |
| "learning_rate": 0.00017291612903225806, | |
| "loss": 0.3524, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 6.774193548387097, | |
| "eval_loss": 0.30841001868247986, | |
| "eval_runtime": 179.6968, | |
| "eval_samples_per_second": 105.756, | |
| "eval_steps_per_second": 2.204, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 6.935483870967742, | |
| "grad_norm": 0.6443042755126953, | |
| "learning_rate": 0.0001722709677419355, | |
| "loss": 0.3346, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 6.935483870967742, | |
| "eval_loss": 0.2951297163963318, | |
| "eval_runtime": 180.7736, | |
| "eval_samples_per_second": 105.126, | |
| "eval_steps_per_second": 2.191, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 7.096774193548387, | |
| "grad_norm": 0.6390765309333801, | |
| "learning_rate": 0.0001716258064516129, | |
| "loss": 0.3255, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 7.096774193548387, | |
| "eval_loss": 0.2864570915699005, | |
| "eval_runtime": 177.5655, | |
| "eval_samples_per_second": 107.025, | |
| "eval_steps_per_second": 2.23, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 7.258064516129032, | |
| "grad_norm": 0.6152281165122986, | |
| "learning_rate": 0.00017098064516129032, | |
| "loss": 0.3258, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 7.258064516129032, | |
| "eval_loss": 0.26722875237464905, | |
| "eval_runtime": 162.1355, | |
| "eval_samples_per_second": 117.211, | |
| "eval_steps_per_second": 2.442, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 7.419354838709677, | |
| "grad_norm": 0.5493067502975464, | |
| "learning_rate": 0.00017033548387096776, | |
| "loss": 0.3071, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 7.419354838709677, | |
| "eval_loss": 0.2526390552520752, | |
| "eval_runtime": 200.1286, | |
| "eval_samples_per_second": 94.959, | |
| "eval_steps_per_second": 1.979, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 7.580645161290323, | |
| "grad_norm": 0.5964747667312622, | |
| "learning_rate": 0.00016969032258064518, | |
| "loss": 0.288, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 7.580645161290323, | |
| "eval_loss": 0.24554206430912018, | |
| "eval_runtime": 275.0797, | |
| "eval_samples_per_second": 69.085, | |
| "eval_steps_per_second": 1.44, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 7.741935483870968, | |
| "grad_norm": 0.5316987037658691, | |
| "learning_rate": 0.0001690451612903226, | |
| "loss": 0.2739, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 7.741935483870968, | |
| "eval_loss": 0.23691914975643158, | |
| "eval_runtime": 175.1655, | |
| "eval_samples_per_second": 108.492, | |
| "eval_steps_per_second": 2.261, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 7.903225806451613, | |
| "grad_norm": 0.47612282633781433, | |
| "learning_rate": 0.0001684, | |
| "loss": 0.2706, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 7.903225806451613, | |
| "eval_loss": 0.23088718950748444, | |
| "eval_runtime": 174.1382, | |
| "eval_samples_per_second": 109.132, | |
| "eval_steps_per_second": 2.274, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 8.064516129032258, | |
| "grad_norm": 0.5631929039955139, | |
| "learning_rate": 0.00016775483870967744, | |
| "loss": 0.2645, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 8.064516129032258, | |
| "eval_loss": 0.229031041264534, | |
| "eval_runtime": 321.9761, | |
| "eval_samples_per_second": 59.023, | |
| "eval_steps_per_second": 1.23, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 8.225806451612904, | |
| "grad_norm": 0.5553293228149414, | |
| "learning_rate": 0.00016710967741935483, | |
| "loss": 0.2604, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 8.225806451612904, | |
| "eval_loss": 0.22551080584526062, | |
| "eval_runtime": 388.7037, | |
| "eval_samples_per_second": 48.891, | |
| "eval_steps_per_second": 1.019, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 8.387096774193548, | |
| "grad_norm": 0.5779751539230347, | |
| "learning_rate": 0.00016646451612903226, | |
| "loss": 0.2537, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 8.387096774193548, | |
| "eval_loss": 0.22034035623073578, | |
| "eval_runtime": 296.8732, | |
| "eval_samples_per_second": 64.014, | |
| "eval_steps_per_second": 1.334, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 8.548387096774194, | |
| "grad_norm": 0.4632438123226166, | |
| "learning_rate": 0.0001658193548387097, | |
| "loss": 0.2505, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 8.548387096774194, | |
| "eval_loss": 0.21870557963848114, | |
| "eval_runtime": 195.9343, | |
| "eval_samples_per_second": 96.992, | |
| "eval_steps_per_second": 2.021, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 8.709677419354838, | |
| "grad_norm": 0.5242488980293274, | |
| "learning_rate": 0.0001651741935483871, | |
| "loss": 0.2465, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 8.709677419354838, | |
| "eval_loss": 0.21318656206130981, | |
| "eval_runtime": 199.2062, | |
| "eval_samples_per_second": 95.399, | |
| "eval_steps_per_second": 1.988, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 8.870967741935484, | |
| "grad_norm": 0.5110979676246643, | |
| "learning_rate": 0.00016452903225806453, | |
| "loss": 0.2411, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 8.870967741935484, | |
| "eval_loss": 0.21142736077308655, | |
| "eval_runtime": 196.2975, | |
| "eval_samples_per_second": 96.812, | |
| "eval_steps_per_second": 2.017, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 9.03225806451613, | |
| "grad_norm": 0.4415765702724457, | |
| "learning_rate": 0.00016388387096774194, | |
| "loss": 0.2431, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 9.03225806451613, | |
| "eval_loss": 0.20968259871006012, | |
| "eval_runtime": 190.8363, | |
| "eval_samples_per_second": 99.583, | |
| "eval_steps_per_second": 2.075, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 9.193548387096774, | |
| "grad_norm": 0.4640558362007141, | |
| "learning_rate": 0.00016323870967741935, | |
| "loss": 0.2416, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 9.193548387096774, | |
| "eval_loss": 0.206209197640419, | |
| "eval_runtime": 219.8043, | |
| "eval_samples_per_second": 86.459, | |
| "eval_steps_per_second": 1.802, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 9.35483870967742, | |
| "grad_norm": 0.5190003514289856, | |
| "learning_rate": 0.00016259354838709677, | |
| "loss": 0.2321, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 9.35483870967742, | |
| "eval_loss": 0.20502887666225433, | |
| "eval_runtime": 232.185, | |
| "eval_samples_per_second": 81.849, | |
| "eval_steps_per_second": 1.706, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 9.516129032258064, | |
| "grad_norm": 0.5595571398735046, | |
| "learning_rate": 0.0001619483870967742, | |
| "loss": 0.2317, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 9.516129032258064, | |
| "eval_loss": 0.20400060713291168, | |
| "eval_runtime": 295.1096, | |
| "eval_samples_per_second": 64.396, | |
| "eval_steps_per_second": 1.342, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 9.67741935483871, | |
| "grad_norm": 0.5060888528823853, | |
| "learning_rate": 0.00016130322580645162, | |
| "loss": 0.2263, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 9.67741935483871, | |
| "eval_loss": 0.2005593329668045, | |
| "eval_runtime": 201.7676, | |
| "eval_samples_per_second": 94.188, | |
| "eval_steps_per_second": 1.963, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 9.838709677419354, | |
| "grad_norm": 0.5235562920570374, | |
| "learning_rate": 0.00016065806451612903, | |
| "loss": 0.2209, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 9.838709677419354, | |
| "eval_loss": 0.2015007883310318, | |
| "eval_runtime": 200.5101, | |
| "eval_samples_per_second": 94.778, | |
| "eval_steps_per_second": 1.975, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.7316103577613831, | |
| "learning_rate": 0.00016001290322580647, | |
| "loss": 0.2184, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 0.1977168172597885, | |
| "eval_runtime": 185.6624, | |
| "eval_samples_per_second": 102.358, | |
| "eval_steps_per_second": 2.133, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 10.161290322580646, | |
| "grad_norm": 0.4775584638118744, | |
| "learning_rate": 0.00015936774193548388, | |
| "loss": 0.2204, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 10.161290322580646, | |
| "eval_loss": 0.19870446622371674, | |
| "eval_runtime": 192.3529, | |
| "eval_samples_per_second": 98.798, | |
| "eval_steps_per_second": 2.059, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 10.32258064516129, | |
| "grad_norm": 0.4865228533744812, | |
| "learning_rate": 0.0001587225806451613, | |
| "loss": 0.2152, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 10.32258064516129, | |
| "eval_loss": 0.19557693600654602, | |
| "eval_runtime": 195.6795, | |
| "eval_samples_per_second": 97.118, | |
| "eval_steps_per_second": 2.024, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 10.483870967741936, | |
| "grad_norm": 0.5472003221511841, | |
| "learning_rate": 0.0001580774193548387, | |
| "loss": 0.2188, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 10.483870967741936, | |
| "eval_loss": 0.19073671102523804, | |
| "eval_runtime": 188.5485, | |
| "eval_samples_per_second": 100.791, | |
| "eval_steps_per_second": 2.1, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 10.64516129032258, | |
| "grad_norm": 0.46814078092575073, | |
| "learning_rate": 0.00015743225806451615, | |
| "loss": 0.2122, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 10.64516129032258, | |
| "eval_loss": 0.1892285943031311, | |
| "eval_runtime": 190.3556, | |
| "eval_samples_per_second": 99.834, | |
| "eval_steps_per_second": 2.08, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 10.806451612903226, | |
| "grad_norm": 0.39085471630096436, | |
| "learning_rate": 0.00015678709677419356, | |
| "loss": 0.2157, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 10.806451612903226, | |
| "eval_loss": 0.19001252949237823, | |
| "eval_runtime": 189.4897, | |
| "eval_samples_per_second": 100.29, | |
| "eval_steps_per_second": 2.09, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 10.967741935483872, | |
| "grad_norm": 0.4231501519680023, | |
| "learning_rate": 0.00015614193548387097, | |
| "loss": 0.2162, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 10.967741935483872, | |
| "eval_loss": 0.18620692193508148, | |
| "eval_runtime": 190.5331, | |
| "eval_samples_per_second": 99.741, | |
| "eval_steps_per_second": 2.078, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 11.129032258064516, | |
| "grad_norm": 0.5186159610748291, | |
| "learning_rate": 0.0001554967741935484, | |
| "loss": 0.2101, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 11.129032258064516, | |
| "eval_loss": 0.18647471070289612, | |
| "eval_runtime": 188.7317, | |
| "eval_samples_per_second": 100.693, | |
| "eval_steps_per_second": 2.098, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 11.290322580645162, | |
| "grad_norm": 0.4359528124332428, | |
| "learning_rate": 0.0001548516129032258, | |
| "loss": 0.2051, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 11.290322580645162, | |
| "eval_loss": 0.18508924543857574, | |
| "eval_runtime": 187.5602, | |
| "eval_samples_per_second": 101.322, | |
| "eval_steps_per_second": 2.111, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 11.451612903225806, | |
| "grad_norm": 0.35816341638565063, | |
| "learning_rate": 0.00015420645161290324, | |
| "loss": 0.2027, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 11.451612903225806, | |
| "eval_loss": 0.18324784934520721, | |
| "eval_runtime": 188.342, | |
| "eval_samples_per_second": 100.902, | |
| "eval_steps_per_second": 2.103, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 11.612903225806452, | |
| "grad_norm": 0.4818542003631592, | |
| "learning_rate": 0.00015356129032258065, | |
| "loss": 0.2059, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 11.612903225806452, | |
| "eval_loss": 0.18456551432609558, | |
| "eval_runtime": 190.2916, | |
| "eval_samples_per_second": 99.868, | |
| "eval_steps_per_second": 2.081, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 11.774193548387096, | |
| "grad_norm": 0.560644805431366, | |
| "learning_rate": 0.00015291612903225806, | |
| "loss": 0.1994, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 11.774193548387096, | |
| "eval_loss": 0.1837019920349121, | |
| "eval_runtime": 193.7316, | |
| "eval_samples_per_second": 98.094, | |
| "eval_steps_per_second": 2.044, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 11.935483870967742, | |
| "grad_norm": 0.4187397360801697, | |
| "learning_rate": 0.0001522709677419355, | |
| "loss": 0.208, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 11.935483870967742, | |
| "eval_loss": 0.18199948966503143, | |
| "eval_runtime": 190.9935, | |
| "eval_samples_per_second": 99.501, | |
| "eval_steps_per_second": 2.073, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 12.096774193548388, | |
| "grad_norm": 0.47370871901512146, | |
| "learning_rate": 0.0001516258064516129, | |
| "loss": 0.197, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 12.096774193548388, | |
| "eval_loss": 0.18183062970638275, | |
| "eval_runtime": 189.271, | |
| "eval_samples_per_second": 100.406, | |
| "eval_steps_per_second": 2.092, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 12.258064516129032, | |
| "grad_norm": 0.5040688514709473, | |
| "learning_rate": 0.00015098064516129033, | |
| "loss": 0.1984, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 12.258064516129032, | |
| "eval_loss": 0.17957282066345215, | |
| "eval_runtime": 187.7702, | |
| "eval_samples_per_second": 101.209, | |
| "eval_steps_per_second": 2.109, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 12.419354838709678, | |
| "grad_norm": 0.44129958748817444, | |
| "learning_rate": 0.00015033548387096774, | |
| "loss": 0.1905, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 12.419354838709678, | |
| "eval_loss": 0.17632746696472168, | |
| "eval_runtime": 189.2669, | |
| "eval_samples_per_second": 100.408, | |
| "eval_steps_per_second": 2.092, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 12.580645161290322, | |
| "grad_norm": 0.38648995757102966, | |
| "learning_rate": 0.00014969032258064518, | |
| "loss": 0.1915, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 12.580645161290322, | |
| "eval_loss": 0.1763821244239807, | |
| "eval_runtime": 187.8282, | |
| "eval_samples_per_second": 101.178, | |
| "eval_steps_per_second": 2.108, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 12.741935483870968, | |
| "grad_norm": 0.46786022186279297, | |
| "learning_rate": 0.0001490451612903226, | |
| "loss": 0.1964, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 12.741935483870968, | |
| "eval_loss": 0.1770430952310562, | |
| "eval_runtime": 192.9563, | |
| "eval_samples_per_second": 98.489, | |
| "eval_steps_per_second": 2.052, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 12.903225806451612, | |
| "grad_norm": 0.45657825469970703, | |
| "learning_rate": 0.0001484, | |
| "loss": 0.194, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 12.903225806451612, | |
| "eval_loss": 0.1744171380996704, | |
| "eval_runtime": 191.9963, | |
| "eval_samples_per_second": 98.981, | |
| "eval_steps_per_second": 2.063, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 13.064516129032258, | |
| "grad_norm": 0.4950830936431885, | |
| "learning_rate": 0.00014775483870967744, | |
| "loss": 0.1897, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 13.064516129032258, | |
| "eval_loss": 0.1715860813856125, | |
| "eval_runtime": 190.2705, | |
| "eval_samples_per_second": 99.879, | |
| "eval_steps_per_second": 2.081, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 13.225806451612904, | |
| "grad_norm": 0.4676801562309265, | |
| "learning_rate": 0.00014710967741935485, | |
| "loss": 0.1837, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 13.225806451612904, | |
| "eval_loss": 0.16961637139320374, | |
| "eval_runtime": 177.9195, | |
| "eval_samples_per_second": 106.812, | |
| "eval_steps_per_second": 2.226, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 13.387096774193548, | |
| "grad_norm": 0.3973780870437622, | |
| "learning_rate": 0.00014646451612903227, | |
| "loss": 0.1856, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 13.387096774193548, | |
| "eval_loss": 0.17067177593708038, | |
| "eval_runtime": 197.9884, | |
| "eval_samples_per_second": 95.985, | |
| "eval_steps_per_second": 2.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 13.548387096774194, | |
| "grad_norm": 0.46780818700790405, | |
| "learning_rate": 0.00014581935483870968, | |
| "loss": 0.186, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 13.548387096774194, | |
| "eval_loss": 0.17197734117507935, | |
| "eval_runtime": 196.3462, | |
| "eval_samples_per_second": 96.788, | |
| "eval_steps_per_second": 2.017, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 13.709677419354838, | |
| "grad_norm": 0.5169796347618103, | |
| "learning_rate": 0.00014517419354838712, | |
| "loss": 0.1886, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 13.709677419354838, | |
| "eval_loss": 0.1707415133714676, | |
| "eval_runtime": 194.2514, | |
| "eval_samples_per_second": 97.832, | |
| "eval_steps_per_second": 2.039, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 13.870967741935484, | |
| "grad_norm": 0.36574116349220276, | |
| "learning_rate": 0.00014452903225806453, | |
| "loss": 0.1857, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 13.870967741935484, | |
| "eval_loss": 0.16808755695819855, | |
| "eval_runtime": 192.206, | |
| "eval_samples_per_second": 98.873, | |
| "eval_steps_per_second": 2.06, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 14.03225806451613, | |
| "grad_norm": 0.4412609338760376, | |
| "learning_rate": 0.00014388387096774194, | |
| "loss": 0.1819, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 14.03225806451613, | |
| "eval_loss": 0.17015832662582397, | |
| "eval_runtime": 194.1959, | |
| "eval_samples_per_second": 97.86, | |
| "eval_steps_per_second": 2.039, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 14.193548387096774, | |
| "grad_norm": 0.35714399814605713, | |
| "learning_rate": 0.00014323870967741938, | |
| "loss": 0.179, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 14.193548387096774, | |
| "eval_loss": 0.16659317910671234, | |
| "eval_runtime": 190.7696, | |
| "eval_samples_per_second": 99.618, | |
| "eval_steps_per_second": 2.076, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 14.35483870967742, | |
| "grad_norm": 0.5498349070549011, | |
| "learning_rate": 0.00014259354838709677, | |
| "loss": 0.174, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 14.35483870967742, | |
| "eval_loss": 0.166724294424057, | |
| "eval_runtime": 194.2466, | |
| "eval_samples_per_second": 97.834, | |
| "eval_steps_per_second": 2.039, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 14.516129032258064, | |
| "grad_norm": 0.3779986798763275, | |
| "learning_rate": 0.0001419483870967742, | |
| "loss": 0.18, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 14.516129032258064, | |
| "eval_loss": 0.16606487333774567, | |
| "eval_runtime": 193.3423, | |
| "eval_samples_per_second": 98.292, | |
| "eval_steps_per_second": 2.048, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 14.67741935483871, | |
| "grad_norm": 0.5511322021484375, | |
| "learning_rate": 0.00014130322580645162, | |
| "loss": 0.1789, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 14.67741935483871, | |
| "eval_loss": 0.16656480729579926, | |
| "eval_runtime": 270.6846, | |
| "eval_samples_per_second": 70.207, | |
| "eval_steps_per_second": 1.463, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 14.838709677419354, | |
| "grad_norm": 0.3705432415008545, | |
| "learning_rate": 0.00014065806451612903, | |
| "loss": 0.1793, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 14.838709677419354, | |
| "eval_loss": 0.1630878746509552, | |
| "eval_runtime": 203.349, | |
| "eval_samples_per_second": 93.455, | |
| "eval_steps_per_second": 1.947, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 0.6959958076477051, | |
| "learning_rate": 0.00014001290322580647, | |
| "loss": 0.1849, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 0.16222645342350006, | |
| "eval_runtime": 196.7729, | |
| "eval_samples_per_second": 96.578, | |
| "eval_steps_per_second": 2.012, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 15.161290322580646, | |
| "grad_norm": 0.38614702224731445, | |
| "learning_rate": 0.00013936774193548388, | |
| "loss": 0.1729, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 15.161290322580646, | |
| "eval_loss": 0.16200029850006104, | |
| "eval_runtime": 191.2671, | |
| "eval_samples_per_second": 99.358, | |
| "eval_steps_per_second": 2.07, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 15.32258064516129, | |
| "grad_norm": 0.4022436738014221, | |
| "learning_rate": 0.0001387225806451613, | |
| "loss": 0.1766, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 15.32258064516129, | |
| "eval_loss": 0.1640605926513672, | |
| "eval_runtime": 198.7156, | |
| "eval_samples_per_second": 95.634, | |
| "eval_steps_per_second": 1.993, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 15.483870967741936, | |
| "grad_norm": 0.45434874296188354, | |
| "learning_rate": 0.0001380774193548387, | |
| "loss": 0.1698, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 15.483870967741936, | |
| "eval_loss": 0.16334731876850128, | |
| "eval_runtime": 188.1897, | |
| "eval_samples_per_second": 100.983, | |
| "eval_steps_per_second": 2.104, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 15.64516129032258, | |
| "grad_norm": 0.42634057998657227, | |
| "learning_rate": 0.00013743225806451615, | |
| "loss": 0.1766, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 15.64516129032258, | |
| "eval_loss": 0.16068318486213684, | |
| "eval_runtime": 190.5766, | |
| "eval_samples_per_second": 99.718, | |
| "eval_steps_per_second": 2.078, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 15.806451612903226, | |
| "grad_norm": 0.504154622554779, | |
| "learning_rate": 0.00013678709677419353, | |
| "loss": 0.1678, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 15.806451612903226, | |
| "eval_loss": 0.15998931229114532, | |
| "eval_runtime": 185.5464, | |
| "eval_samples_per_second": 102.422, | |
| "eval_steps_per_second": 2.134, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 15.967741935483872, | |
| "grad_norm": 0.5468097925186157, | |
| "learning_rate": 0.00013614193548387097, | |
| "loss": 0.1792, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 15.967741935483872, | |
| "eval_loss": 0.15972544252872467, | |
| "eval_runtime": 186.5203, | |
| "eval_samples_per_second": 101.887, | |
| "eval_steps_per_second": 2.123, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 16.129032258064516, | |
| "grad_norm": 0.4495028853416443, | |
| "learning_rate": 0.0001354967741935484, | |
| "loss": 0.1755, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 16.129032258064516, | |
| "eval_loss": 0.1596982628107071, | |
| "eval_runtime": 194.6559, | |
| "eval_samples_per_second": 97.629, | |
| "eval_steps_per_second": 2.034, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 16.29032258064516, | |
| "grad_norm": 0.36081984639167786, | |
| "learning_rate": 0.0001348516129032258, | |
| "loss": 0.1659, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 16.29032258064516, | |
| "eval_loss": 0.15741322934627533, | |
| "eval_runtime": 193.3704, | |
| "eval_samples_per_second": 98.278, | |
| "eval_steps_per_second": 2.048, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 16.451612903225808, | |
| "grad_norm": 0.3472287654876709, | |
| "learning_rate": 0.00013420645161290324, | |
| "loss": 0.1672, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 16.451612903225808, | |
| "eval_loss": 0.16025249660015106, | |
| "eval_runtime": 177.054, | |
| "eval_samples_per_second": 107.334, | |
| "eval_steps_per_second": 2.237, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 16.612903225806452, | |
| "grad_norm": 0.4033275842666626, | |
| "learning_rate": 0.00013356129032258065, | |
| "loss": 0.1687, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 16.612903225806452, | |
| "eval_loss": 0.15610146522521973, | |
| "eval_runtime": 197.4165, | |
| "eval_samples_per_second": 96.263, | |
| "eval_steps_per_second": 2.006, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 16.774193548387096, | |
| "grad_norm": 0.43590107560157776, | |
| "learning_rate": 0.00013291612903225806, | |
| "loss": 0.1685, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 16.774193548387096, | |
| "eval_loss": 0.15723493695259094, | |
| "eval_runtime": 195.3122, | |
| "eval_samples_per_second": 97.301, | |
| "eval_steps_per_second": 2.028, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 16.93548387096774, | |
| "grad_norm": 0.33427444100379944, | |
| "learning_rate": 0.00013227096774193548, | |
| "loss": 0.1711, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 16.93548387096774, | |
| "eval_loss": 0.15542824566364288, | |
| "eval_runtime": 192.6293, | |
| "eval_samples_per_second": 98.656, | |
| "eval_steps_per_second": 2.056, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 17.096774193548388, | |
| "grad_norm": 0.38596487045288086, | |
| "learning_rate": 0.00013162580645161291, | |
| "loss": 0.1662, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 17.096774193548388, | |
| "eval_loss": 0.15477755665779114, | |
| "eval_runtime": 218.9351, | |
| "eval_samples_per_second": 86.802, | |
| "eval_steps_per_second": 1.809, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 17.258064516129032, | |
| "grad_norm": 0.30149680376052856, | |
| "learning_rate": 0.00013098064516129033, | |
| "loss": 0.1632, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 17.258064516129032, | |
| "eval_loss": 0.15298867225646973, | |
| "eval_runtime": 109.5329, | |
| "eval_samples_per_second": 173.5, | |
| "eval_steps_per_second": 3.615, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 17.419354838709676, | |
| "grad_norm": 0.34499385952949524, | |
| "learning_rate": 0.00013033548387096774, | |
| "loss": 0.1627, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 17.419354838709676, | |
| "eval_loss": 0.1536460816860199, | |
| "eval_runtime": 89.4248, | |
| "eval_samples_per_second": 212.514, | |
| "eval_steps_per_second": 4.428, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 17.580645161290324, | |
| "grad_norm": 0.4599091410636902, | |
| "learning_rate": 0.00012969032258064518, | |
| "loss": 0.1611, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 17.580645161290324, | |
| "eval_loss": 0.15192009508609772, | |
| "eval_runtime": 87.9147, | |
| "eval_samples_per_second": 216.164, | |
| "eval_steps_per_second": 4.504, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 17.741935483870968, | |
| "grad_norm": 0.3737597167491913, | |
| "learning_rate": 0.0001290451612903226, | |
| "loss": 0.1644, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 17.741935483870968, | |
| "eval_loss": 0.15356366336345673, | |
| "eval_runtime": 87.8138, | |
| "eval_samples_per_second": 216.413, | |
| "eval_steps_per_second": 4.51, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 17.903225806451612, | |
| "grad_norm": 0.49820005893707275, | |
| "learning_rate": 0.0001284, | |
| "loss": 0.1641, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 17.903225806451612, | |
| "eval_loss": 0.15130773186683655, | |
| "eval_runtime": 89.806, | |
| "eval_samples_per_second": 211.612, | |
| "eval_steps_per_second": 4.41, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 18.06451612903226, | |
| "grad_norm": 0.431963711977005, | |
| "learning_rate": 0.00012775483870967742, | |
| "loss": 0.1629, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 18.06451612903226, | |
| "eval_loss": 0.15356026589870453, | |
| "eval_runtime": 90.5983, | |
| "eval_samples_per_second": 209.761, | |
| "eval_steps_per_second": 4.371, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 18.225806451612904, | |
| "grad_norm": 0.41033461689949036, | |
| "learning_rate": 0.00012710967741935486, | |
| "loss": 0.1592, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 18.225806451612904, | |
| "eval_loss": 0.15416103601455688, | |
| "eval_runtime": 88.2514, | |
| "eval_samples_per_second": 215.339, | |
| "eval_steps_per_second": 4.487, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 18.387096774193548, | |
| "grad_norm": 0.360398530960083, | |
| "learning_rate": 0.00012646451612903227, | |
| "loss": 0.1595, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 18.387096774193548, | |
| "eval_loss": 0.1532827764749527, | |
| "eval_runtime": 89.7789, | |
| "eval_samples_per_second": 211.675, | |
| "eval_steps_per_second": 4.411, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 18.548387096774192, | |
| "grad_norm": 0.3443894386291504, | |
| "learning_rate": 0.00012581935483870968, | |
| "loss": 0.1627, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 18.548387096774192, | |
| "eval_loss": 0.15046313405036926, | |
| "eval_runtime": 88.4154, | |
| "eval_samples_per_second": 214.94, | |
| "eval_steps_per_second": 4.479, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 18.70967741935484, | |
| "grad_norm": 0.384339839220047, | |
| "learning_rate": 0.00012517419354838712, | |
| "loss": 0.1609, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 18.70967741935484, | |
| "eval_loss": 0.1508285254240036, | |
| "eval_runtime": 89.1892, | |
| "eval_samples_per_second": 213.075, | |
| "eval_steps_per_second": 4.44, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 18.870967741935484, | |
| "grad_norm": 0.4039391577243805, | |
| "learning_rate": 0.0001245290322580645, | |
| "loss": 0.1593, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 18.870967741935484, | |
| "eval_loss": 0.15185365080833435, | |
| "eval_runtime": 87.6173, | |
| "eval_samples_per_second": 216.898, | |
| "eval_steps_per_second": 4.52, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 19.032258064516128, | |
| "grad_norm": 0.36354830861091614, | |
| "learning_rate": 0.00012388387096774195, | |
| "loss": 0.1609, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 19.032258064516128, | |
| "eval_loss": 0.15022191405296326, | |
| "eval_runtime": 88.0513, | |
| "eval_samples_per_second": 215.829, | |
| "eval_steps_per_second": 4.497, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 19.193548387096776, | |
| "grad_norm": 0.41100090742111206, | |
| "learning_rate": 0.00012323870967741936, | |
| "loss": 0.1573, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 19.193548387096776, | |
| "eval_loss": 0.14913968741893768, | |
| "eval_runtime": 89.4022, | |
| "eval_samples_per_second": 212.568, | |
| "eval_steps_per_second": 4.429, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 19.35483870967742, | |
| "grad_norm": 0.3832205832004547, | |
| "learning_rate": 0.00012259354838709677, | |
| "loss": 0.1542, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 19.35483870967742, | |
| "eval_loss": 0.14829514920711517, | |
| "eval_runtime": 86.5784, | |
| "eval_samples_per_second": 219.5, | |
| "eval_steps_per_second": 4.574, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 19.516129032258064, | |
| "grad_norm": 0.3583919405937195, | |
| "learning_rate": 0.00012194838709677421, | |
| "loss": 0.1562, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 19.516129032258064, | |
| "eval_loss": 0.14888739585876465, | |
| "eval_runtime": 84.8338, | |
| "eval_samples_per_second": 224.014, | |
| "eval_steps_per_second": 4.668, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 19.677419354838708, | |
| "grad_norm": 0.3783506751060486, | |
| "learning_rate": 0.00012130322580645161, | |
| "loss": 0.1551, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 19.677419354838708, | |
| "eval_loss": 0.1506240963935852, | |
| "eval_runtime": 87.3622, | |
| "eval_samples_per_second": 217.531, | |
| "eval_steps_per_second": 4.533, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 19.838709677419356, | |
| "grad_norm": 0.39638015627861023, | |
| "learning_rate": 0.00012065806451612905, | |
| "loss": 0.1544, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 19.838709677419356, | |
| "eval_loss": 0.1477993279695511, | |
| "eval_runtime": 90.2688, | |
| "eval_samples_per_second": 210.527, | |
| "eval_steps_per_second": 4.387, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.5352652072906494, | |
| "learning_rate": 0.00012001290322580645, | |
| "loss": 0.1558, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 0.1461372971534729, | |
| "eval_runtime": 88.0458, | |
| "eval_samples_per_second": 215.842, | |
| "eval_steps_per_second": 4.498, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 20.161290322580644, | |
| "grad_norm": 0.32452672719955444, | |
| "learning_rate": 0.00011936774193548387, | |
| "loss": 0.15, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 20.161290322580644, | |
| "eval_loss": 0.14649870991706848, | |
| "eval_runtime": 86.0304, | |
| "eval_samples_per_second": 220.899, | |
| "eval_steps_per_second": 4.603, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 20.322580645161292, | |
| "grad_norm": 0.3544420599937439, | |
| "learning_rate": 0.00011872258064516129, | |
| "loss": 0.1451, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 20.322580645161292, | |
| "eval_loss": 0.14727556705474854, | |
| "eval_runtime": 89.6974, | |
| "eval_samples_per_second": 211.868, | |
| "eval_steps_per_second": 4.415, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 20.483870967741936, | |
| "grad_norm": 0.3795066773891449, | |
| "learning_rate": 0.00011807741935483871, | |
| "loss": 0.1544, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 20.483870967741936, | |
| "eval_loss": 0.1442754566669464, | |
| "eval_runtime": 89.414, | |
| "eval_samples_per_second": 212.54, | |
| "eval_steps_per_second": 4.429, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 20.64516129032258, | |
| "grad_norm": 0.4115369915962219, | |
| "learning_rate": 0.00011743225806451614, | |
| "loss": 0.1507, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 20.64516129032258, | |
| "eval_loss": 0.14425314962863922, | |
| "eval_runtime": 88.6568, | |
| "eval_samples_per_second": 214.355, | |
| "eval_steps_per_second": 4.467, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 20.806451612903224, | |
| "grad_norm": 0.38675764203071594, | |
| "learning_rate": 0.00011678709677419355, | |
| "loss": 0.1497, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 20.806451612903224, | |
| "eval_loss": 0.1459958851337433, | |
| "eval_runtime": 87.6832, | |
| "eval_samples_per_second": 216.735, | |
| "eval_steps_per_second": 4.516, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 20.967741935483872, | |
| "grad_norm": 0.37480926513671875, | |
| "learning_rate": 0.00011614193548387098, | |
| "loss": 0.1509, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 20.967741935483872, | |
| "eval_loss": 0.14522159099578857, | |
| "eval_runtime": 86.8578, | |
| "eval_samples_per_second": 218.794, | |
| "eval_steps_per_second": 4.559, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 21.129032258064516, | |
| "grad_norm": 0.3740350306034088, | |
| "learning_rate": 0.00011549677419354839, | |
| "loss": 0.1504, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 21.129032258064516, | |
| "eval_loss": 0.14784836769104004, | |
| "eval_runtime": 85.4911, | |
| "eval_samples_per_second": 222.292, | |
| "eval_steps_per_second": 4.632, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 21.29032258064516, | |
| "grad_norm": 0.4533497095108032, | |
| "learning_rate": 0.00011485161290322581, | |
| "loss": 0.1517, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 21.29032258064516, | |
| "eval_loss": 0.14581461250782013, | |
| "eval_runtime": 86.0863, | |
| "eval_samples_per_second": 220.755, | |
| "eval_steps_per_second": 4.6, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 21.451612903225808, | |
| "grad_norm": 0.3758571743965149, | |
| "learning_rate": 0.00011420645161290323, | |
| "loss": 0.1452, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 21.451612903225808, | |
| "eval_loss": 0.1412593424320221, | |
| "eval_runtime": 86.2595, | |
| "eval_samples_per_second": 220.312, | |
| "eval_steps_per_second": 4.591, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 21.612903225806452, | |
| "grad_norm": 0.3700609803199768, | |
| "learning_rate": 0.00011356129032258065, | |
| "loss": 0.1461, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 21.612903225806452, | |
| "eval_loss": 0.1432737410068512, | |
| "eval_runtime": 86.1269, | |
| "eval_samples_per_second": 220.651, | |
| "eval_steps_per_second": 4.598, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 21.774193548387096, | |
| "grad_norm": 0.31164905428886414, | |
| "learning_rate": 0.00011291612903225808, | |
| "loss": 0.1463, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 21.774193548387096, | |
| "eval_loss": 0.14101718366146088, | |
| "eval_runtime": 88.7161, | |
| "eval_samples_per_second": 214.211, | |
| "eval_steps_per_second": 4.464, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 21.93548387096774, | |
| "grad_norm": 0.3831172287464142, | |
| "learning_rate": 0.00011227096774193549, | |
| "loss": 0.1509, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 21.93548387096774, | |
| "eval_loss": 0.14286787807941437, | |
| "eval_runtime": 88.4285, | |
| "eval_samples_per_second": 214.908, | |
| "eval_steps_per_second": 4.478, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 22.096774193548388, | |
| "grad_norm": 0.3675175905227661, | |
| "learning_rate": 0.00011162580645161292, | |
| "loss": 0.1478, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 22.096774193548388, | |
| "eval_loss": 0.14311262965202332, | |
| "eval_runtime": 86.1923, | |
| "eval_samples_per_second": 220.484, | |
| "eval_steps_per_second": 4.594, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 22.258064516129032, | |
| "grad_norm": 0.4077725410461426, | |
| "learning_rate": 0.00011098064516129032, | |
| "loss": 0.1455, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 22.258064516129032, | |
| "eval_loss": 0.1406867653131485, | |
| "eval_runtime": 88.1595, | |
| "eval_samples_per_second": 215.564, | |
| "eval_steps_per_second": 4.492, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 22.419354838709676, | |
| "grad_norm": 0.37918218970298767, | |
| "learning_rate": 0.00011033548387096775, | |
| "loss": 0.1438, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 22.419354838709676, | |
| "eval_loss": 0.14213036000728607, | |
| "eval_runtime": 88.6161, | |
| "eval_samples_per_second": 214.453, | |
| "eval_steps_per_second": 4.469, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 22.580645161290324, | |
| "grad_norm": 0.4112975597381592, | |
| "learning_rate": 0.00010969032258064518, | |
| "loss": 0.1471, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 22.580645161290324, | |
| "eval_loss": 0.14302890002727509, | |
| "eval_runtime": 87.2404, | |
| "eval_samples_per_second": 217.835, | |
| "eval_steps_per_second": 4.539, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 22.741935483870968, | |
| "grad_norm": 0.3555707335472107, | |
| "learning_rate": 0.00010904516129032258, | |
| "loss": 0.1435, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 22.741935483870968, | |
| "eval_loss": 0.14145122468471527, | |
| "eval_runtime": 87.7191, | |
| "eval_samples_per_second": 216.646, | |
| "eval_steps_per_second": 4.514, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 22.903225806451612, | |
| "grad_norm": 0.33775362372398376, | |
| "learning_rate": 0.00010840000000000002, | |
| "loss": 0.1459, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 22.903225806451612, | |
| "eval_loss": 0.14196287095546722, | |
| "eval_runtime": 85.2736, | |
| "eval_samples_per_second": 222.859, | |
| "eval_steps_per_second": 4.644, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 23.06451612903226, | |
| "grad_norm": 0.40644168853759766, | |
| "learning_rate": 0.00010775483870967742, | |
| "loss": 0.1403, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 23.06451612903226, | |
| "eval_loss": 0.13985274732112885, | |
| "eval_runtime": 88.2679, | |
| "eval_samples_per_second": 215.299, | |
| "eval_steps_per_second": 4.486, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 23.225806451612904, | |
| "grad_norm": 0.30164963006973267, | |
| "learning_rate": 0.00010710967741935484, | |
| "loss": 0.1438, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 23.225806451612904, | |
| "eval_loss": 0.13959668576717377, | |
| "eval_runtime": 85.8815, | |
| "eval_samples_per_second": 221.282, | |
| "eval_steps_per_second": 4.611, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 23.387096774193548, | |
| "grad_norm": 0.41760918498039246, | |
| "learning_rate": 0.00010646451612903226, | |
| "loss": 0.1455, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 23.387096774193548, | |
| "eval_loss": 0.1405312865972519, | |
| "eval_runtime": 87.4804, | |
| "eval_samples_per_second": 217.237, | |
| "eval_steps_per_second": 4.527, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 23.548387096774192, | |
| "grad_norm": 0.31449875235557556, | |
| "learning_rate": 0.00010581935483870968, | |
| "loss": 0.1416, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 23.548387096774192, | |
| "eval_loss": 0.1397952139377594, | |
| "eval_runtime": 84.3693, | |
| "eval_samples_per_second": 225.248, | |
| "eval_steps_per_second": 4.694, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 23.70967741935484, | |
| "grad_norm": 0.34104588627815247, | |
| "learning_rate": 0.00010517419354838711, | |
| "loss": 0.143, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 23.70967741935484, | |
| "eval_loss": 0.13995403051376343, | |
| "eval_runtime": 88.9533, | |
| "eval_samples_per_second": 213.64, | |
| "eval_steps_per_second": 4.452, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 23.870967741935484, | |
| "grad_norm": 0.43316343426704407, | |
| "learning_rate": 0.00010452903225806452, | |
| "loss": 0.1432, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 23.870967741935484, | |
| "eval_loss": 0.13980048894882202, | |
| "eval_runtime": 87.8388, | |
| "eval_samples_per_second": 216.351, | |
| "eval_steps_per_second": 4.508, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 24.032258064516128, | |
| "grad_norm": 0.35213446617126465, | |
| "learning_rate": 0.00010388387096774195, | |
| "loss": 0.1463, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 24.032258064516128, | |
| "eval_loss": 0.1396203339099884, | |
| "eval_runtime": 86.1081, | |
| "eval_samples_per_second": 220.699, | |
| "eval_steps_per_second": 4.599, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 24.193548387096776, | |
| "grad_norm": 0.2780129313468933, | |
| "learning_rate": 0.00010323870967741936, | |
| "loss": 0.1396, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 24.193548387096776, | |
| "eval_loss": 0.13865940272808075, | |
| "eval_runtime": 84.8365, | |
| "eval_samples_per_second": 224.007, | |
| "eval_steps_per_second": 4.668, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 24.35483870967742, | |
| "grad_norm": 0.34334343671798706, | |
| "learning_rate": 0.00010259354838709679, | |
| "loss": 0.1395, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 24.35483870967742, | |
| "eval_loss": 0.1386643797159195, | |
| "eval_runtime": 85.7437, | |
| "eval_samples_per_second": 221.637, | |
| "eval_steps_per_second": 4.618, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 24.516129032258064, | |
| "grad_norm": 0.3119650185108185, | |
| "learning_rate": 0.00010194838709677418, | |
| "loss": 0.1381, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 24.516129032258064, | |
| "eval_loss": 0.1378747671842575, | |
| "eval_runtime": 94.9067, | |
| "eval_samples_per_second": 200.239, | |
| "eval_steps_per_second": 4.173, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 24.677419354838708, | |
| "grad_norm": 0.36497557163238525, | |
| "learning_rate": 0.00010130322580645162, | |
| "loss": 0.1429, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 24.677419354838708, | |
| "eval_loss": 0.1373891532421112, | |
| "eval_runtime": 86.0356, | |
| "eval_samples_per_second": 220.885, | |
| "eval_steps_per_second": 4.603, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 24.838709677419356, | |
| "grad_norm": 0.3456083834171295, | |
| "learning_rate": 0.00010065806451612905, | |
| "loss": 0.1349, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 24.838709677419356, | |
| "eval_loss": 0.1388697326183319, | |
| "eval_runtime": 85.5136, | |
| "eval_samples_per_second": 222.234, | |
| "eval_steps_per_second": 4.631, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 0.4443909227848053, | |
| "learning_rate": 0.00010001290322580645, | |
| "loss": 0.1403, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_loss": 0.1377411037683487, | |
| "eval_runtime": 91.0633, | |
| "eval_samples_per_second": 208.69, | |
| "eval_steps_per_second": 4.349, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 25.161290322580644, | |
| "grad_norm": 0.34777510166168213, | |
| "learning_rate": 9.936774193548387e-05, | |
| "loss": 0.1383, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 25.161290322580644, | |
| "eval_loss": 0.13685546815395355, | |
| "eval_runtime": 88.0711, | |
| "eval_samples_per_second": 215.78, | |
| "eval_steps_per_second": 4.496, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 25.322580645161292, | |
| "grad_norm": 0.35419756174087524, | |
| "learning_rate": 9.87225806451613e-05, | |
| "loss": 0.1348, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 25.322580645161292, | |
| "eval_loss": 0.13754527270793915, | |
| "eval_runtime": 85.9879, | |
| "eval_samples_per_second": 221.008, | |
| "eval_steps_per_second": 4.605, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 25.483870967741936, | |
| "grad_norm": 0.35281285643577576, | |
| "learning_rate": 9.807741935483871e-05, | |
| "loss": 0.136, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 25.483870967741936, | |
| "eval_loss": 0.13788650929927826, | |
| "eval_runtime": 86.623, | |
| "eval_samples_per_second": 219.387, | |
| "eval_steps_per_second": 4.572, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 25.64516129032258, | |
| "grad_norm": 0.26881253719329834, | |
| "learning_rate": 9.743225806451614e-05, | |
| "loss": 0.1376, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 25.64516129032258, | |
| "eval_loss": 0.13465990126132965, | |
| "eval_runtime": 87.1032, | |
| "eval_samples_per_second": 218.178, | |
| "eval_steps_per_second": 4.546, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 25.806451612903224, | |
| "grad_norm": 0.38799649477005005, | |
| "learning_rate": 9.678709677419355e-05, | |
| "loss": 0.1365, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 25.806451612903224, | |
| "eval_loss": 0.13522003591060638, | |
| "eval_runtime": 95.2991, | |
| "eval_samples_per_second": 199.414, | |
| "eval_steps_per_second": 4.155, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 25.967741935483872, | |
| "grad_norm": 0.37531042098999023, | |
| "learning_rate": 9.614193548387098e-05, | |
| "loss": 0.1362, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 25.967741935483872, | |
| "eval_loss": 0.13398829102516174, | |
| "eval_runtime": 87.8409, | |
| "eval_samples_per_second": 216.346, | |
| "eval_steps_per_second": 4.508, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 26.129032258064516, | |
| "grad_norm": 0.3436211049556732, | |
| "learning_rate": 9.549677419354839e-05, | |
| "loss": 0.1342, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 26.129032258064516, | |
| "eval_loss": 0.13594096899032593, | |
| "eval_runtime": 84.6121, | |
| "eval_samples_per_second": 224.602, | |
| "eval_steps_per_second": 4.68, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 26.29032258064516, | |
| "grad_norm": 0.38407161831855774, | |
| "learning_rate": 9.48516129032258e-05, | |
| "loss": 0.1322, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 26.29032258064516, | |
| "eval_loss": 0.13434267044067383, | |
| "eval_runtime": 86.2264, | |
| "eval_samples_per_second": 220.396, | |
| "eval_steps_per_second": 4.593, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 26.451612903225808, | |
| "grad_norm": 0.3329039514064789, | |
| "learning_rate": 9.420645161290324e-05, | |
| "loss": 0.1338, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 26.451612903225808, | |
| "eval_loss": 0.1350966840982437, | |
| "eval_runtime": 87.8388, | |
| "eval_samples_per_second": 216.351, | |
| "eval_steps_per_second": 4.508, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 26.612903225806452, | |
| "grad_norm": 0.41340529918670654, | |
| "learning_rate": 9.356129032258065e-05, | |
| "loss": 0.1362, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 26.612903225806452, | |
| "eval_loss": 0.13463687896728516, | |
| "eval_runtime": 86.5564, | |
| "eval_samples_per_second": 219.556, | |
| "eval_steps_per_second": 4.575, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 26.774193548387096, | |
| "grad_norm": 0.33343157172203064, | |
| "learning_rate": 9.291612903225807e-05, | |
| "loss": 0.1331, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 26.774193548387096, | |
| "eval_loss": 0.1351860612630844, | |
| "eval_runtime": 88.4351, | |
| "eval_samples_per_second": 214.892, | |
| "eval_steps_per_second": 4.478, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 26.93548387096774, | |
| "grad_norm": 0.3529933989048004, | |
| "learning_rate": 9.227096774193549e-05, | |
| "loss": 0.1306, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 26.93548387096774, | |
| "eval_loss": 0.13507899641990662, | |
| "eval_runtime": 88.8438, | |
| "eval_samples_per_second": 213.904, | |
| "eval_steps_per_second": 4.457, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 27.096774193548388, | |
| "grad_norm": 0.3433696925640106, | |
| "learning_rate": 9.16258064516129e-05, | |
| "loss": 0.1339, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 27.096774193548388, | |
| "eval_loss": 0.13309802114963531, | |
| "eval_runtime": 89.0663, | |
| "eval_samples_per_second": 213.369, | |
| "eval_steps_per_second": 4.446, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 27.258064516129032, | |
| "grad_norm": 0.3371010720729828, | |
| "learning_rate": 9.098064516129032e-05, | |
| "loss": 0.1315, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 27.258064516129032, | |
| "eval_loss": 0.1346307247877121, | |
| "eval_runtime": 88.7761, | |
| "eval_samples_per_second": 214.067, | |
| "eval_steps_per_second": 4.461, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 27.419354838709676, | |
| "grad_norm": 0.36965006589889526, | |
| "learning_rate": 9.033548387096774e-05, | |
| "loss": 0.1349, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 27.419354838709676, | |
| "eval_loss": 0.13345304131507874, | |
| "eval_runtime": 89.2673, | |
| "eval_samples_per_second": 212.889, | |
| "eval_steps_per_second": 4.436, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 27.580645161290324, | |
| "grad_norm": 0.3361060917377472, | |
| "learning_rate": 8.969032258064517e-05, | |
| "loss": 0.1323, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 27.580645161290324, | |
| "eval_loss": 0.1327604055404663, | |
| "eval_runtime": 87.1092, | |
| "eval_samples_per_second": 218.163, | |
| "eval_steps_per_second": 4.546, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 27.741935483870968, | |
| "grad_norm": 0.2936784029006958, | |
| "learning_rate": 8.904516129032258e-05, | |
| "loss": 0.132, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 27.741935483870968, | |
| "eval_loss": 0.1343098133802414, | |
| "eval_runtime": 89.5037, | |
| "eval_samples_per_second": 212.326, | |
| "eval_steps_per_second": 4.424, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 27.903225806451612, | |
| "grad_norm": 0.332289457321167, | |
| "learning_rate": 8.840000000000001e-05, | |
| "loss": 0.1303, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 27.903225806451612, | |
| "eval_loss": 0.1309853345155716, | |
| "eval_runtime": 88.6535, | |
| "eval_samples_per_second": 214.363, | |
| "eval_steps_per_second": 4.467, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 28.06451612903226, | |
| "grad_norm": 0.3243560791015625, | |
| "learning_rate": 8.775483870967742e-05, | |
| "loss": 0.1295, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 28.06451612903226, | |
| "eval_loss": 0.13385291397571564, | |
| "eval_runtime": 89.1997, | |
| "eval_samples_per_second": 213.05, | |
| "eval_steps_per_second": 4.439, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 28.225806451612904, | |
| "grad_norm": 0.2707726061344147, | |
| "learning_rate": 8.710967741935485e-05, | |
| "loss": 0.1296, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 28.225806451612904, | |
| "eval_loss": 0.131495863199234, | |
| "eval_runtime": 87.7498, | |
| "eval_samples_per_second": 216.57, | |
| "eval_steps_per_second": 4.513, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 28.387096774193548, | |
| "grad_norm": 0.3294861912727356, | |
| "learning_rate": 8.646451612903226e-05, | |
| "loss": 0.1309, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 28.387096774193548, | |
| "eval_loss": 0.1315741389989853, | |
| "eval_runtime": 89.3762, | |
| "eval_samples_per_second": 212.629, | |
| "eval_steps_per_second": 4.431, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 28.548387096774192, | |
| "grad_norm": 0.3417121469974518, | |
| "learning_rate": 8.581935483870968e-05, | |
| "loss": 0.1259, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 28.548387096774192, | |
| "eval_loss": 0.13167841732501984, | |
| "eval_runtime": 89.5004, | |
| "eval_samples_per_second": 212.334, | |
| "eval_steps_per_second": 4.425, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 28.70967741935484, | |
| "grad_norm": 0.4122408628463745, | |
| "learning_rate": 8.517419354838711e-05, | |
| "loss": 0.1309, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 28.70967741935484, | |
| "eval_loss": 0.1316230446100235, | |
| "eval_runtime": 87.7329, | |
| "eval_samples_per_second": 216.612, | |
| "eval_steps_per_second": 4.514, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 28.870967741935484, | |
| "grad_norm": 0.28204530477523804, | |
| "learning_rate": 8.452903225806452e-05, | |
| "loss": 0.1275, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 28.870967741935484, | |
| "eval_loss": 0.13107319176197052, | |
| "eval_runtime": 89.6188, | |
| "eval_samples_per_second": 212.054, | |
| "eval_steps_per_second": 4.419, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 29.032258064516128, | |
| "grad_norm": 0.35629284381866455, | |
| "learning_rate": 8.388387096774194e-05, | |
| "loss": 0.1288, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 29.032258064516128, | |
| "eval_loss": 0.13227057456970215, | |
| "eval_runtime": 88.8381, | |
| "eval_samples_per_second": 213.917, | |
| "eval_steps_per_second": 4.458, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 29.193548387096776, | |
| "grad_norm": 0.309741348028183, | |
| "learning_rate": 8.323870967741936e-05, | |
| "loss": 0.128, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 29.193548387096776, | |
| "eval_loss": 0.13167747855186462, | |
| "eval_runtime": 88.4882, | |
| "eval_samples_per_second": 214.763, | |
| "eval_steps_per_second": 4.475, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 29.35483870967742, | |
| "grad_norm": 0.288798987865448, | |
| "learning_rate": 8.259354838709677e-05, | |
| "loss": 0.1267, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 29.35483870967742, | |
| "eval_loss": 0.1299341917037964, | |
| "eval_runtime": 86.532, | |
| "eval_samples_per_second": 219.618, | |
| "eval_steps_per_second": 4.576, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 29.516129032258064, | |
| "grad_norm": 0.3058416545391083, | |
| "learning_rate": 8.19483870967742e-05, | |
| "loss": 0.1262, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 29.516129032258064, | |
| "eval_loss": 0.1303175687789917, | |
| "eval_runtime": 87.4, | |
| "eval_samples_per_second": 217.437, | |
| "eval_steps_per_second": 4.531, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 29.677419354838708, | |
| "grad_norm": 0.357373982667923, | |
| "learning_rate": 8.130322580645163e-05, | |
| "loss": 0.1317, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 29.677419354838708, | |
| "eval_loss": 0.1297539323568344, | |
| "eval_runtime": 87.917, | |
| "eval_samples_per_second": 216.158, | |
| "eval_steps_per_second": 4.504, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 29.838709677419356, | |
| "grad_norm": 0.3070197105407715, | |
| "learning_rate": 8.065806451612904e-05, | |
| "loss": 0.1277, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 29.838709677419356, | |
| "eval_loss": 0.12903046607971191, | |
| "eval_runtime": 90.6017, | |
| "eval_samples_per_second": 209.753, | |
| "eval_steps_per_second": 4.371, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 0.36131608486175537, | |
| "learning_rate": 8.001290322580646e-05, | |
| "loss": 0.1244, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_loss": 0.1285567581653595, | |
| "eval_runtime": 88.2187, | |
| "eval_samples_per_second": 215.419, | |
| "eval_steps_per_second": 4.489, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 30.161290322580644, | |
| "grad_norm": 0.31005364656448364, | |
| "learning_rate": 7.936774193548388e-05, | |
| "loss": 0.1274, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 30.161290322580644, | |
| "eval_loss": 0.12861816585063934, | |
| "eval_runtime": 88.8974, | |
| "eval_samples_per_second": 213.775, | |
| "eval_steps_per_second": 4.455, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 30.322580645161292, | |
| "grad_norm": 0.3450087904930115, | |
| "learning_rate": 7.872258064516129e-05, | |
| "loss": 0.1315, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 30.322580645161292, | |
| "eval_loss": 0.12834839522838593, | |
| "eval_runtime": 88.9142, | |
| "eval_samples_per_second": 213.734, | |
| "eval_steps_per_second": 4.454, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 30.483870967741936, | |
| "grad_norm": 0.26987212896347046, | |
| "learning_rate": 7.807741935483871e-05, | |
| "loss": 0.1228, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 30.483870967741936, | |
| "eval_loss": 0.12817350029945374, | |
| "eval_runtime": 88.7737, | |
| "eval_samples_per_second": 214.073, | |
| "eval_steps_per_second": 4.461, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 30.64516129032258, | |
| "grad_norm": 0.3717745244503021, | |
| "learning_rate": 7.743225806451613e-05, | |
| "loss": 0.124, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 30.64516129032258, | |
| "eval_loss": 0.13065199553966522, | |
| "eval_runtime": 88.4187, | |
| "eval_samples_per_second": 214.932, | |
| "eval_steps_per_second": 4.479, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 30.806451612903224, | |
| "grad_norm": 0.29905572533607483, | |
| "learning_rate": 7.678709677419355e-05, | |
| "loss": 0.1212, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 30.806451612903224, | |
| "eval_loss": 0.1283879578113556, | |
| "eval_runtime": 87.9276, | |
| "eval_samples_per_second": 216.132, | |
| "eval_steps_per_second": 4.504, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 30.967741935483872, | |
| "grad_norm": 0.2816069722175598, | |
| "learning_rate": 7.614193548387098e-05, | |
| "loss": 0.1233, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 30.967741935483872, | |
| "eval_loss": 0.12923495471477509, | |
| "eval_runtime": 89.2797, | |
| "eval_samples_per_second": 212.859, | |
| "eval_steps_per_second": 4.436, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 31.129032258064516, | |
| "grad_norm": 0.30339810252189636, | |
| "learning_rate": 7.549677419354839e-05, | |
| "loss": 0.1226, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 31.129032258064516, | |
| "eval_loss": 0.12811945378780365, | |
| "eval_runtime": 85.4363, | |
| "eval_samples_per_second": 222.435, | |
| "eval_steps_per_second": 4.635, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 31.29032258064516, | |
| "grad_norm": 0.3012908697128296, | |
| "learning_rate": 7.48516129032258e-05, | |
| "loss": 0.1244, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 31.29032258064516, | |
| "eval_loss": 0.12850458920001984, | |
| "eval_runtime": 90.0213, | |
| "eval_samples_per_second": 211.106, | |
| "eval_steps_per_second": 4.399, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 31.451612903225808, | |
| "grad_norm": 0.36201730370521545, | |
| "learning_rate": 7.420645161290323e-05, | |
| "loss": 0.1272, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 31.451612903225808, | |
| "eval_loss": 0.12800218164920807, | |
| "eval_runtime": 95.4408, | |
| "eval_samples_per_second": 199.118, | |
| "eval_steps_per_second": 4.149, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 31.612903225806452, | |
| "grad_norm": 0.30312639474868774, | |
| "learning_rate": 7.356129032258064e-05, | |
| "loss": 0.1248, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 31.612903225806452, | |
| "eval_loss": 0.12891393899917603, | |
| "eval_runtime": 89.5417, | |
| "eval_samples_per_second": 212.236, | |
| "eval_steps_per_second": 4.423, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 31.774193548387096, | |
| "grad_norm": 0.34564414620399475, | |
| "learning_rate": 7.291612903225807e-05, | |
| "loss": 0.1245, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 31.774193548387096, | |
| "eval_loss": 0.12625598907470703, | |
| "eval_runtime": 92.1876, | |
| "eval_samples_per_second": 206.145, | |
| "eval_steps_per_second": 4.296, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 31.93548387096774, | |
| "grad_norm": 0.31116342544555664, | |
| "learning_rate": 7.22709677419355e-05, | |
| "loss": 0.1261, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 31.93548387096774, | |
| "eval_loss": 0.1266312599182129, | |
| "eval_runtime": 105.6265, | |
| "eval_samples_per_second": 179.917, | |
| "eval_steps_per_second": 3.749, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 32.096774193548384, | |
| "grad_norm": 0.2931393086910248, | |
| "learning_rate": 7.16258064516129e-05, | |
| "loss": 0.1228, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 32.096774193548384, | |
| "eval_loss": 0.12575842440128326, | |
| "eval_runtime": 89.3508, | |
| "eval_samples_per_second": 212.69, | |
| "eval_steps_per_second": 4.432, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 32.25806451612903, | |
| "grad_norm": 0.244206041097641, | |
| "learning_rate": 7.098064516129033e-05, | |
| "loss": 0.1232, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 32.25806451612903, | |
| "eval_loss": 0.12638460099697113, | |
| "eval_runtime": 101.1824, | |
| "eval_samples_per_second": 187.819, | |
| "eval_steps_per_second": 3.914, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 32.41935483870968, | |
| "grad_norm": 0.2691047489643097, | |
| "learning_rate": 7.033548387096774e-05, | |
| "loss": 0.1224, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 32.41935483870968, | |
| "eval_loss": 0.12849974632263184, | |
| "eval_runtime": 91.3103, | |
| "eval_samples_per_second": 208.126, | |
| "eval_steps_per_second": 4.337, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 32.58064516129032, | |
| "grad_norm": 0.30258217453956604, | |
| "learning_rate": 6.969032258064516e-05, | |
| "loss": 0.1239, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 32.58064516129032, | |
| "eval_loss": 0.12675543129444122, | |
| "eval_runtime": 87.3987, | |
| "eval_samples_per_second": 217.44, | |
| "eval_steps_per_second": 4.531, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 32.74193548387097, | |
| "grad_norm": 0.2675139009952545, | |
| "learning_rate": 6.904516129032258e-05, | |
| "loss": 0.1214, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 32.74193548387097, | |
| "eval_loss": 0.12617285549640656, | |
| "eval_runtime": 87.9466, | |
| "eval_samples_per_second": 216.086, | |
| "eval_steps_per_second": 4.503, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 32.903225806451616, | |
| "grad_norm": 0.3610474467277527, | |
| "learning_rate": 6.840000000000001e-05, | |
| "loss": 0.1205, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 32.903225806451616, | |
| "eval_loss": 0.12886326014995575, | |
| "eval_runtime": 88.5505, | |
| "eval_samples_per_second": 214.612, | |
| "eval_steps_per_second": 4.472, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 33.064516129032256, | |
| "grad_norm": 0.3488837480545044, | |
| "learning_rate": 6.775483870967742e-05, | |
| "loss": 0.1235, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 33.064516129032256, | |
| "eval_loss": 0.12522108852863312, | |
| "eval_runtime": 86.8524, | |
| "eval_samples_per_second": 218.808, | |
| "eval_steps_per_second": 4.559, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 33.225806451612904, | |
| "grad_norm": 0.31276750564575195, | |
| "learning_rate": 6.710967741935485e-05, | |
| "loss": 0.1205, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 33.225806451612904, | |
| "eval_loss": 0.12656815350055695, | |
| "eval_runtime": 88.7072, | |
| "eval_samples_per_second": 214.233, | |
| "eval_steps_per_second": 4.464, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 33.38709677419355, | |
| "grad_norm": 0.2674780488014221, | |
| "learning_rate": 6.646451612903226e-05, | |
| "loss": 0.1189, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 33.38709677419355, | |
| "eval_loss": 0.12716618180274963, | |
| "eval_runtime": 86.5042, | |
| "eval_samples_per_second": 219.689, | |
| "eval_steps_per_second": 4.578, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 33.54838709677419, | |
| "grad_norm": 0.35878920555114746, | |
| "learning_rate": 6.581935483870969e-05, | |
| "loss": 0.1187, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 33.54838709677419, | |
| "eval_loss": 0.12553632259368896, | |
| "eval_runtime": 89.3045, | |
| "eval_samples_per_second": 212.8, | |
| "eval_steps_per_second": 4.434, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 33.70967741935484, | |
| "grad_norm": 0.3341622054576874, | |
| "learning_rate": 6.51741935483871e-05, | |
| "loss": 0.1256, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 33.70967741935484, | |
| "eval_loss": 0.12624432146549225, | |
| "eval_runtime": 88.9284, | |
| "eval_samples_per_second": 213.7, | |
| "eval_steps_per_second": 4.453, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 33.87096774193548, | |
| "grad_norm": 0.30921441316604614, | |
| "learning_rate": 6.452903225806451e-05, | |
| "loss": 0.1226, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 33.87096774193548, | |
| "eval_loss": 0.12660150229930878, | |
| "eval_runtime": 89.8795, | |
| "eval_samples_per_second": 211.439, | |
| "eval_steps_per_second": 4.406, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 34.03225806451613, | |
| "grad_norm": 0.39118140935897827, | |
| "learning_rate": 6.388387096774194e-05, | |
| "loss": 0.1239, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 34.03225806451613, | |
| "eval_loss": 0.1243576630949974, | |
| "eval_runtime": 89.012, | |
| "eval_samples_per_second": 213.499, | |
| "eval_steps_per_second": 4.449, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 34.193548387096776, | |
| "grad_norm": 0.3129843771457672, | |
| "learning_rate": 6.323870967741936e-05, | |
| "loss": 0.1227, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 34.193548387096776, | |
| "eval_loss": 0.12571550905704498, | |
| "eval_runtime": 88.7287, | |
| "eval_samples_per_second": 214.181, | |
| "eval_steps_per_second": 4.463, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 34.354838709677416, | |
| "grad_norm": 0.3165799081325531, | |
| "learning_rate": 6.259354838709678e-05, | |
| "loss": 0.1176, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 34.354838709677416, | |
| "eval_loss": 0.12522710859775543, | |
| "eval_runtime": 87.961, | |
| "eval_samples_per_second": 216.05, | |
| "eval_steps_per_second": 4.502, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 34.516129032258064, | |
| "grad_norm": 0.30507832765579224, | |
| "learning_rate": 6.19483870967742e-05, | |
| "loss": 0.1224, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 34.516129032258064, | |
| "eval_loss": 0.1257481724023819, | |
| "eval_runtime": 88.8686, | |
| "eval_samples_per_second": 213.844, | |
| "eval_steps_per_second": 4.456, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 34.67741935483871, | |
| "grad_norm": 0.35299909114837646, | |
| "learning_rate": 6.130322580645161e-05, | |
| "loss": 0.1194, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 34.67741935483871, | |
| "eval_loss": 0.12496702373027802, | |
| "eval_runtime": 90.6297, | |
| "eval_samples_per_second": 209.689, | |
| "eval_steps_per_second": 4.369, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 34.83870967741935, | |
| "grad_norm": 0.29007554054260254, | |
| "learning_rate": 6.065806451612903e-05, | |
| "loss": 0.1187, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 34.83870967741935, | |
| "eval_loss": 0.12488405406475067, | |
| "eval_runtime": 88.9174, | |
| "eval_samples_per_second": 213.726, | |
| "eval_steps_per_second": 4.454, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "grad_norm": 0.48845478892326355, | |
| "learning_rate": 6.001290322580645e-05, | |
| "loss": 0.1196, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "eval_loss": 0.12530028820037842, | |
| "eval_runtime": 89.441, | |
| "eval_samples_per_second": 212.475, | |
| "eval_steps_per_second": 4.428, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 35.16129032258065, | |
| "grad_norm": 0.25860869884490967, | |
| "learning_rate": 5.936774193548388e-05, | |
| "loss": 0.1183, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 35.16129032258065, | |
| "eval_loss": 0.12282554060220718, | |
| "eval_runtime": 88.2534, | |
| "eval_samples_per_second": 215.334, | |
| "eval_steps_per_second": 4.487, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 35.32258064516129, | |
| "grad_norm": 0.330858051776886, | |
| "learning_rate": 5.87225806451613e-05, | |
| "loss": 0.1157, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 35.32258064516129, | |
| "eval_loss": 0.12600766122341156, | |
| "eval_runtime": 88.4405, | |
| "eval_samples_per_second": 214.879, | |
| "eval_steps_per_second": 4.478, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 35.483870967741936, | |
| "grad_norm": 0.2873861789703369, | |
| "learning_rate": 5.8077419354838716e-05, | |
| "loss": 0.1188, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 35.483870967741936, | |
| "eval_loss": 0.12375803291797638, | |
| "eval_runtime": 90.5675, | |
| "eval_samples_per_second": 209.832, | |
| "eval_steps_per_second": 4.372, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 35.645161290322584, | |
| "grad_norm": 0.3179507851600647, | |
| "learning_rate": 5.743225806451613e-05, | |
| "loss": 0.1166, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 35.645161290322584, | |
| "eval_loss": 0.12402182072401047, | |
| "eval_runtime": 89.7815, | |
| "eval_samples_per_second": 211.669, | |
| "eval_steps_per_second": 4.411, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 35.806451612903224, | |
| "grad_norm": 0.2623940110206604, | |
| "learning_rate": 5.678709677419355e-05, | |
| "loss": 0.116, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 35.806451612903224, | |
| "eval_loss": 0.12476334720849991, | |
| "eval_runtime": 89.8543, | |
| "eval_samples_per_second": 211.498, | |
| "eval_steps_per_second": 4.407, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 35.96774193548387, | |
| "grad_norm": 0.2534388601779938, | |
| "learning_rate": 5.614193548387097e-05, | |
| "loss": 0.1207, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 35.96774193548387, | |
| "eval_loss": 0.12570072710514069, | |
| "eval_runtime": 91.2008, | |
| "eval_samples_per_second": 208.375, | |
| "eval_steps_per_second": 4.342, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 36.12903225806452, | |
| "grad_norm": 0.30418768525123596, | |
| "learning_rate": 5.5496774193548386e-05, | |
| "loss": 0.1177, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 36.12903225806452, | |
| "eval_loss": 0.12448415905237198, | |
| "eval_runtime": 89.3046, | |
| "eval_samples_per_second": 212.8, | |
| "eval_steps_per_second": 4.434, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 36.29032258064516, | |
| "grad_norm": 0.290436327457428, | |
| "learning_rate": 5.485161290322581e-05, | |
| "loss": 0.1147, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 36.29032258064516, | |
| "eval_loss": 0.12383412569761276, | |
| "eval_runtime": 91.1386, | |
| "eval_samples_per_second": 208.517, | |
| "eval_steps_per_second": 4.345, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 36.45161290322581, | |
| "grad_norm": 0.3162536919116974, | |
| "learning_rate": 5.420645161290323e-05, | |
| "loss": 0.1154, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 36.45161290322581, | |
| "eval_loss": 0.12289289385080338, | |
| "eval_runtime": 90.6693, | |
| "eval_samples_per_second": 209.597, | |
| "eval_steps_per_second": 4.368, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 36.61290322580645, | |
| "grad_norm": 0.3101736903190613, | |
| "learning_rate": 5.356129032258065e-05, | |
| "loss": 0.1146, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 36.61290322580645, | |
| "eval_loss": 0.12234646826982498, | |
| "eval_runtime": 89.6047, | |
| "eval_samples_per_second": 212.087, | |
| "eval_steps_per_second": 4.419, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 36.774193548387096, | |
| "grad_norm": 0.32668620347976685, | |
| "learning_rate": 5.291612903225806e-05, | |
| "loss": 0.1196, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 36.774193548387096, | |
| "eval_loss": 0.12317313253879547, | |
| "eval_runtime": 88.2522, | |
| "eval_samples_per_second": 215.337, | |
| "eval_steps_per_second": 4.487, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 36.935483870967744, | |
| "grad_norm": 0.3477221429347992, | |
| "learning_rate": 5.227096774193548e-05, | |
| "loss": 0.1165, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 36.935483870967744, | |
| "eval_loss": 0.12321442365646362, | |
| "eval_runtime": 90.4924, | |
| "eval_samples_per_second": 210.007, | |
| "eval_steps_per_second": 4.376, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 37.096774193548384, | |
| "grad_norm": 0.3417079448699951, | |
| "learning_rate": 5.16258064516129e-05, | |
| "loss": 0.1193, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 37.096774193548384, | |
| "eval_loss": 0.12433473765850067, | |
| "eval_runtime": 87.8702, | |
| "eval_samples_per_second": 216.273, | |
| "eval_steps_per_second": 4.507, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 37.25806451612903, | |
| "grad_norm": 0.4154585301876068, | |
| "learning_rate": 5.098064516129033e-05, | |
| "loss": 0.1178, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 37.25806451612903, | |
| "eval_loss": 0.12312240153551102, | |
| "eval_runtime": 87.9067, | |
| "eval_samples_per_second": 216.184, | |
| "eval_steps_per_second": 4.505, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 37.41935483870968, | |
| "grad_norm": 0.3054625988006592, | |
| "learning_rate": 5.0335483870967747e-05, | |
| "loss": 0.1175, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 37.41935483870968, | |
| "eval_loss": 0.12180905044078827, | |
| "eval_runtime": 88.3639, | |
| "eval_samples_per_second": 215.065, | |
| "eval_steps_per_second": 4.481, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 37.58064516129032, | |
| "grad_norm": 0.28812453150749207, | |
| "learning_rate": 4.9690322580645166e-05, | |
| "loss": 0.117, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 37.58064516129032, | |
| "eval_loss": 0.12359043955802917, | |
| "eval_runtime": 88.3972, | |
| "eval_samples_per_second": 214.984, | |
| "eval_steps_per_second": 4.48, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 37.74193548387097, | |
| "grad_norm": 0.23818424344062805, | |
| "learning_rate": 4.9045161290322585e-05, | |
| "loss": 0.115, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 37.74193548387097, | |
| "eval_loss": 0.12239911407232285, | |
| "eval_runtime": 89.3047, | |
| "eval_samples_per_second": 212.8, | |
| "eval_steps_per_second": 4.434, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 37.903225806451616, | |
| "grad_norm": 0.28868499398231506, | |
| "learning_rate": 4.8400000000000004e-05, | |
| "loss": 0.1176, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 37.903225806451616, | |
| "eval_loss": 0.12191120535135269, | |
| "eval_runtime": 88.9578, | |
| "eval_samples_per_second": 213.629, | |
| "eval_steps_per_second": 4.452, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 38.064516129032256, | |
| "grad_norm": 0.33930733799934387, | |
| "learning_rate": 4.775483870967742e-05, | |
| "loss": 0.1145, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 38.064516129032256, | |
| "eval_loss": 0.12135030329227448, | |
| "eval_runtime": 90.281, | |
| "eval_samples_per_second": 210.498, | |
| "eval_steps_per_second": 4.386, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 38.225806451612904, | |
| "grad_norm": 0.3511495590209961, | |
| "learning_rate": 4.710967741935484e-05, | |
| "loss": 0.1161, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 38.225806451612904, | |
| "eval_loss": 0.12242971360683441, | |
| "eval_runtime": 89.1539, | |
| "eval_samples_per_second": 213.159, | |
| "eval_steps_per_second": 4.442, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 38.38709677419355, | |
| "grad_norm": 0.37273716926574707, | |
| "learning_rate": 4.646451612903226e-05, | |
| "loss": 0.1107, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 38.38709677419355, | |
| "eval_loss": 0.12210850417613983, | |
| "eval_runtime": 89.9424, | |
| "eval_samples_per_second": 211.291, | |
| "eval_steps_per_second": 4.403, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 38.54838709677419, | |
| "grad_norm": 0.2974016070365906, | |
| "learning_rate": 4.5819354838709674e-05, | |
| "loss": 0.1135, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 38.54838709677419, | |
| "eval_loss": 0.12212313711643219, | |
| "eval_runtime": 89.4298, | |
| "eval_samples_per_second": 212.502, | |
| "eval_steps_per_second": 4.428, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 38.70967741935484, | |
| "grad_norm": 0.32708504796028137, | |
| "learning_rate": 4.51741935483871e-05, | |
| "loss": 0.1167, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 38.70967741935484, | |
| "eval_loss": 0.12119368463754654, | |
| "eval_runtime": 88.7472, | |
| "eval_samples_per_second": 214.136, | |
| "eval_steps_per_second": 4.462, | |
| "step": 12000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 15500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2083464406274048e+17, | |
| "train_batch_size": 96, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |