| { | |
| "best_global_step": 5760, | |
| "best_metric": 0.05086889490485191, | |
| "best_model_checkpoint": "./results/checkpoint-5760", | |
| "epoch": 25.0, | |
| "eval_steps": 500, | |
| "global_step": 14400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.08680555555555555, | |
| "grad_norm": 3.413226842880249, | |
| "learning_rate": 1.9931944444444447e-05, | |
| "loss": 1.8426, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1736111111111111, | |
| "grad_norm": 19.604671478271484, | |
| "learning_rate": 1.98625e-05, | |
| "loss": 1.2984, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2604166666666667, | |
| "grad_norm": 73.20538330078125, | |
| "learning_rate": 1.979305555555556e-05, | |
| "loss": 0.9999, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3472222222222222, | |
| "grad_norm": 4.863669395446777, | |
| "learning_rate": 1.972361111111111e-05, | |
| "loss": 0.8302, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4340277777777778, | |
| "grad_norm": 17.846479415893555, | |
| "learning_rate": 1.965416666666667e-05, | |
| "loss": 0.6309, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5208333333333334, | |
| "grad_norm": 11.574165344238281, | |
| "learning_rate": 1.9584722222222224e-05, | |
| "loss": 0.4987, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6076388888888888, | |
| "grad_norm": 31.07317352294922, | |
| "learning_rate": 1.9515277777777778e-05, | |
| "loss": 0.3274, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.6944444444444444, | |
| "grad_norm": 3.164802074432373, | |
| "learning_rate": 1.9445833333333336e-05, | |
| "loss": 0.3305, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.78125, | |
| "grad_norm": 2.5798959732055664, | |
| "learning_rate": 1.937638888888889e-05, | |
| "loss": 0.2624, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.8680555555555556, | |
| "grad_norm": 12.735908508300781, | |
| "learning_rate": 1.9306944444444445e-05, | |
| "loss": 0.3035, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.9548611111111112, | |
| "grad_norm": 17.255126953125, | |
| "learning_rate": 1.9237500000000003e-05, | |
| "loss": 0.2323, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.17769542336463928, | |
| "eval_runtime": 2.138, | |
| "eval_samples_per_second": 478.963, | |
| "eval_steps_per_second": 29.935, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 1.0416666666666667, | |
| "grad_norm": 32.9837646484375, | |
| "learning_rate": 1.9168055555555558e-05, | |
| "loss": 0.1779, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.1284722222222223, | |
| "grad_norm": 3.967150926589966, | |
| "learning_rate": 1.9098611111111113e-05, | |
| "loss": 0.145, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.2152777777777777, | |
| "grad_norm": 12.614818572998047, | |
| "learning_rate": 1.9029166666666667e-05, | |
| "loss": 0.1287, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.3020833333333333, | |
| "grad_norm": 26.14882469177246, | |
| "learning_rate": 1.8959722222222222e-05, | |
| "loss": 0.1397, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.3888888888888888, | |
| "grad_norm": 39.56580352783203, | |
| "learning_rate": 1.889027777777778e-05, | |
| "loss": 0.1194, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.4756944444444444, | |
| "grad_norm": 26.76836395263672, | |
| "learning_rate": 1.8820833333333335e-05, | |
| "loss": 0.1274, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.5625, | |
| "grad_norm": 26.581575393676758, | |
| "learning_rate": 1.875138888888889e-05, | |
| "loss": 0.1633, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.6493055555555556, | |
| "grad_norm": 0.08225402981042862, | |
| "learning_rate": 1.8681944444444447e-05, | |
| "loss": 0.0979, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.7361111111111112, | |
| "grad_norm": 0.38262873888015747, | |
| "learning_rate": 1.8612500000000002e-05, | |
| "loss": 0.1227, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.8229166666666665, | |
| "grad_norm": 1.3281371593475342, | |
| "learning_rate": 1.8543055555555556e-05, | |
| "loss": 0.0741, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.9097222222222223, | |
| "grad_norm": 48.16311264038086, | |
| "learning_rate": 1.847361111111111e-05, | |
| "loss": 0.1376, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.9965277777777777, | |
| "grad_norm": 0.10062026977539062, | |
| "learning_rate": 1.840416666666667e-05, | |
| "loss": 0.1454, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.08461333066225052, | |
| "eval_runtime": 2.1421, | |
| "eval_samples_per_second": 478.037, | |
| "eval_steps_per_second": 29.877, | |
| "step": 1152 | |
| }, | |
| { | |
| "epoch": 2.0833333333333335, | |
| "grad_norm": 0.03965457156300545, | |
| "learning_rate": 1.8334722222222224e-05, | |
| "loss": 0.0425, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.170138888888889, | |
| "grad_norm": 0.06563384085893631, | |
| "learning_rate": 1.8265277777777778e-05, | |
| "loss": 0.0516, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.2569444444444446, | |
| "grad_norm": 1.1895874738693237, | |
| "learning_rate": 1.8195833333333336e-05, | |
| "loss": 0.0601, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.34375, | |
| "grad_norm": 0.5909057259559631, | |
| "learning_rate": 1.812638888888889e-05, | |
| "loss": 0.0484, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.4305555555555554, | |
| "grad_norm": 0.06617053598165512, | |
| "learning_rate": 1.8056944444444446e-05, | |
| "loss": 0.093, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.517361111111111, | |
| "grad_norm": 0.03617614507675171, | |
| "learning_rate": 1.7987500000000004e-05, | |
| "loss": 0.0812, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.6041666666666665, | |
| "grad_norm": 1.4127732515335083, | |
| "learning_rate": 1.7918055555555558e-05, | |
| "loss": 0.0834, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.6909722222222223, | |
| "grad_norm": 0.03689781203866005, | |
| "learning_rate": 1.7848611111111113e-05, | |
| "loss": 0.0581, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.7777777777777777, | |
| "grad_norm": 82.96894073486328, | |
| "learning_rate": 1.7779166666666667e-05, | |
| "loss": 0.0607, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.8645833333333335, | |
| "grad_norm": 20.012392044067383, | |
| "learning_rate": 1.7709722222222222e-05, | |
| "loss": 0.0886, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.951388888888889, | |
| "grad_norm": 0.08668874949216843, | |
| "learning_rate": 1.764027777777778e-05, | |
| "loss": 0.1223, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.08079180121421814, | |
| "eval_runtime": 2.156, | |
| "eval_samples_per_second": 474.96, | |
| "eval_steps_per_second": 29.685, | |
| "step": 1728 | |
| }, | |
| { | |
| "epoch": 3.0381944444444446, | |
| "grad_norm": 0.5279293656349182, | |
| "learning_rate": 1.7570833333333335e-05, | |
| "loss": 0.0713, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 3.125, | |
| "grad_norm": 31.233030319213867, | |
| "learning_rate": 1.750138888888889e-05, | |
| "loss": 0.0746, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.2118055555555554, | |
| "grad_norm": 0.0305875763297081, | |
| "learning_rate": 1.7431944444444447e-05, | |
| "loss": 0.0742, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 3.298611111111111, | |
| "grad_norm": 43.64688491821289, | |
| "learning_rate": 1.7362500000000002e-05, | |
| "loss": 0.0365, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.3854166666666665, | |
| "grad_norm": 0.034333955496549606, | |
| "learning_rate": 1.7293055555555557e-05, | |
| "loss": 0.058, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 3.4722222222222223, | |
| "grad_norm": 86.51885986328125, | |
| "learning_rate": 1.722361111111111e-05, | |
| "loss": 0.0559, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.5590277777777777, | |
| "grad_norm": 71.18854522705078, | |
| "learning_rate": 1.7154166666666666e-05, | |
| "loss": 0.0511, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 3.6458333333333335, | |
| "grad_norm": 37.69118881225586, | |
| "learning_rate": 1.7084722222222224e-05, | |
| "loss": 0.0393, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 3.732638888888889, | |
| "grad_norm": 0.011939575895667076, | |
| "learning_rate": 1.701527777777778e-05, | |
| "loss": 0.0394, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 3.8194444444444446, | |
| "grad_norm": 0.23084449768066406, | |
| "learning_rate": 1.6945833333333333e-05, | |
| "loss": 0.02, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 3.90625, | |
| "grad_norm": 0.22158057987689972, | |
| "learning_rate": 1.687638888888889e-05, | |
| "loss": 0.0248, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 3.9930555555555554, | |
| "grad_norm": 0.06880240887403488, | |
| "learning_rate": 1.6806944444444446e-05, | |
| "loss": 0.054, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.07962702214717865, | |
| "eval_runtime": 2.1469, | |
| "eval_samples_per_second": 476.958, | |
| "eval_steps_per_second": 29.81, | |
| "step": 2304 | |
| }, | |
| { | |
| "epoch": 4.079861111111111, | |
| "grad_norm": 0.0237971693277359, | |
| "learning_rate": 1.6737500000000004e-05, | |
| "loss": 0.0151, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 4.166666666666667, | |
| "grad_norm": 0.012101912871003151, | |
| "learning_rate": 1.6668055555555558e-05, | |
| "loss": 0.0195, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 4.253472222222222, | |
| "grad_norm": 0.010883960872888565, | |
| "learning_rate": 1.6598611111111113e-05, | |
| "loss": 0.0415, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 4.340277777777778, | |
| "grad_norm": 0.021902142092585564, | |
| "learning_rate": 1.6529166666666668e-05, | |
| "loss": 0.0426, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 4.427083333333333, | |
| "grad_norm": 0.04989313334226608, | |
| "learning_rate": 1.6459722222222222e-05, | |
| "loss": 0.0136, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 4.513888888888889, | |
| "grad_norm": 0.22097109258174896, | |
| "learning_rate": 1.639027777777778e-05, | |
| "loss": 0.0465, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 4.600694444444445, | |
| "grad_norm": 0.010039995424449444, | |
| "learning_rate": 1.6320833333333335e-05, | |
| "loss": 0.028, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 4.6875, | |
| "grad_norm": 0.013202900998294353, | |
| "learning_rate": 1.625138888888889e-05, | |
| "loss": 0.0461, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 4.774305555555555, | |
| "grad_norm": 0.015310668386518955, | |
| "learning_rate": 1.6181944444444447e-05, | |
| "loss": 0.0278, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 4.861111111111111, | |
| "grad_norm": 0.008756318129599094, | |
| "learning_rate": 1.6112500000000002e-05, | |
| "loss": 0.0218, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 4.947916666666667, | |
| "grad_norm": 0.015537718310952187, | |
| "learning_rate": 1.6043055555555557e-05, | |
| "loss": 0.0525, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.07712521404027939, | |
| "eval_runtime": 2.1488, | |
| "eval_samples_per_second": 476.534, | |
| "eval_steps_per_second": 29.783, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 5.034722222222222, | |
| "grad_norm": 0.011155390180647373, | |
| "learning_rate": 1.597361111111111e-05, | |
| "loss": 0.0136, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 5.121527777777778, | |
| "grad_norm": 0.013077272102236748, | |
| "learning_rate": 1.5904166666666666e-05, | |
| "loss": 0.038, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 5.208333333333333, | |
| "grad_norm": 0.009396527893841267, | |
| "learning_rate": 1.5834722222222224e-05, | |
| "loss": 0.0007, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 5.295138888888889, | |
| "grad_norm": 0.03254946321249008, | |
| "learning_rate": 1.576527777777778e-05, | |
| "loss": 0.0218, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 5.381944444444445, | |
| "grad_norm": 0.005586525425314903, | |
| "learning_rate": 1.5695833333333333e-05, | |
| "loss": 0.0203, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 5.46875, | |
| "grad_norm": 0.009627276100218296, | |
| "learning_rate": 1.562638888888889e-05, | |
| "loss": 0.0668, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 5.555555555555555, | |
| "grad_norm": 0.0053014811128377914, | |
| "learning_rate": 1.5556944444444446e-05, | |
| "loss": 0.0119, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 5.642361111111111, | |
| "grad_norm": 0.038525983691215515, | |
| "learning_rate": 1.54875e-05, | |
| "loss": 0.0255, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 5.729166666666667, | |
| "grad_norm": 0.04675915837287903, | |
| "learning_rate": 1.541805555555556e-05, | |
| "loss": 0.0644, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 5.815972222222222, | |
| "grad_norm": 0.04972713068127632, | |
| "learning_rate": 1.5348611111111113e-05, | |
| "loss": 0.0225, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 5.902777777777778, | |
| "grad_norm": 0.004159330390393734, | |
| "learning_rate": 1.5279166666666668e-05, | |
| "loss": 0.0151, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 5.989583333333333, | |
| "grad_norm": 0.01926554925739765, | |
| "learning_rate": 1.5209722222222222e-05, | |
| "loss": 0.0205, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.07027073949575424, | |
| "eval_runtime": 2.1711, | |
| "eval_samples_per_second": 471.645, | |
| "eval_steps_per_second": 29.478, | |
| "step": 3456 | |
| }, | |
| { | |
| "epoch": 6.076388888888889, | |
| "grad_norm": 0.09293384104967117, | |
| "learning_rate": 1.5140277777777779e-05, | |
| "loss": 0.0077, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 6.163194444444445, | |
| "grad_norm": 0.010024973191320896, | |
| "learning_rate": 1.5070833333333335e-05, | |
| "loss": 0.0423, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 6.25, | |
| "grad_norm": 0.029724491760134697, | |
| "learning_rate": 1.500138888888889e-05, | |
| "loss": 0.0199, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 6.336805555555555, | |
| "grad_norm": 0.015937596559524536, | |
| "learning_rate": 1.4931944444444446e-05, | |
| "loss": 0.017, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 6.423611111111111, | |
| "grad_norm": 0.009556828066706657, | |
| "learning_rate": 1.4862500000000002e-05, | |
| "loss": 0.0004, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 6.510416666666667, | |
| "grad_norm": 0.032106589525938034, | |
| "learning_rate": 1.4793055555555558e-05, | |
| "loss": 0.0147, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 6.597222222222222, | |
| "grad_norm": 0.005125823896378279, | |
| "learning_rate": 1.4723611111111111e-05, | |
| "loss": 0.0056, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 6.684027777777778, | |
| "grad_norm": 0.018202291801571846, | |
| "learning_rate": 1.4654166666666668e-05, | |
| "loss": 0.0264, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 6.770833333333333, | |
| "grad_norm": 31.37969970703125, | |
| "learning_rate": 1.4584722222222222e-05, | |
| "loss": 0.0119, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 6.857638888888889, | |
| "grad_norm": 0.037925682961940765, | |
| "learning_rate": 1.4515277777777779e-05, | |
| "loss": 0.0203, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 6.944444444444445, | |
| "grad_norm": 0.1194610670208931, | |
| "learning_rate": 1.4445833333333335e-05, | |
| "loss": 0.0158, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 0.0702182799577713, | |
| "eval_runtime": 2.1701, | |
| "eval_samples_per_second": 471.867, | |
| "eval_steps_per_second": 29.492, | |
| "step": 4032 | |
| }, | |
| { | |
| "epoch": 7.03125, | |
| "grad_norm": 0.0035032748710364103, | |
| "learning_rate": 1.437638888888889e-05, | |
| "loss": 0.0065, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 7.118055555555555, | |
| "grad_norm": 0.0025438766460865736, | |
| "learning_rate": 1.4306944444444446e-05, | |
| "loss": 0.0003, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 7.204861111111111, | |
| "grad_norm": 4.609809875488281, | |
| "learning_rate": 1.4237500000000002e-05, | |
| "loss": 0.02, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 7.291666666666667, | |
| "grad_norm": 0.0041930885054171085, | |
| "learning_rate": 1.4168055555555558e-05, | |
| "loss": 0.0496, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 7.378472222222222, | |
| "grad_norm": 0.3876318037509918, | |
| "learning_rate": 1.4098611111111111e-05, | |
| "loss": 0.0078, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 7.465277777777778, | |
| "grad_norm": 0.003726869821548462, | |
| "learning_rate": 1.4029166666666668e-05, | |
| "loss": 0.0122, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 7.552083333333333, | |
| "grad_norm": 0.00823493953794241, | |
| "learning_rate": 1.3959722222222222e-05, | |
| "loss": 0.0136, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 7.638888888888889, | |
| "grad_norm": 0.004769823048263788, | |
| "learning_rate": 1.3890277777777779e-05, | |
| "loss": 0.0132, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 7.725694444444445, | |
| "grad_norm": 45.848167419433594, | |
| "learning_rate": 1.3820833333333335e-05, | |
| "loss": 0.0116, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 7.8125, | |
| "grad_norm": 0.0101216621696949, | |
| "learning_rate": 1.375138888888889e-05, | |
| "loss": 0.0275, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 7.899305555555555, | |
| "grad_norm": 0.009534699842333794, | |
| "learning_rate": 1.3681944444444446e-05, | |
| "loss": 0.0073, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 7.986111111111111, | |
| "grad_norm": 0.003179518273100257, | |
| "learning_rate": 1.3612500000000002e-05, | |
| "loss": 0.0128, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.08423544466495514, | |
| "eval_runtime": 2.1727, | |
| "eval_samples_per_second": 471.311, | |
| "eval_steps_per_second": 29.457, | |
| "step": 4608 | |
| }, | |
| { | |
| "epoch": 8.072916666666666, | |
| "grad_norm": 0.0026477461215108633, | |
| "learning_rate": 1.3543055555555557e-05, | |
| "loss": 0.0261, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 8.159722222222221, | |
| "grad_norm": 0.019464371725916862, | |
| "learning_rate": 1.3473611111111111e-05, | |
| "loss": 0.0015, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 8.246527777777779, | |
| "grad_norm": 0.0027786209248006344, | |
| "learning_rate": 1.3404166666666668e-05, | |
| "loss": 0.0125, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 8.333333333333334, | |
| "grad_norm": 0.0063923560082912445, | |
| "learning_rate": 1.3334722222222222e-05, | |
| "loss": 0.0107, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 8.42013888888889, | |
| "grad_norm": 0.0071762921288609505, | |
| "learning_rate": 1.3265277777777779e-05, | |
| "loss": 0.01, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 8.506944444444445, | |
| "grad_norm": 0.003973743878304958, | |
| "learning_rate": 1.3195833333333335e-05, | |
| "loss": 0.0262, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 8.59375, | |
| "grad_norm": 0.015603139996528625, | |
| "learning_rate": 1.312638888888889e-05, | |
| "loss": 0.0213, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 8.680555555555555, | |
| "grad_norm": 0.002647754270583391, | |
| "learning_rate": 1.3056944444444446e-05, | |
| "loss": 0.01, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 8.76736111111111, | |
| "grad_norm": 14.067428588867188, | |
| "learning_rate": 1.2987500000000002e-05, | |
| "loss": 0.0324, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 8.854166666666666, | |
| "grad_norm": 0.004289311822503805, | |
| "learning_rate": 1.2918055555555557e-05, | |
| "loss": 0.0099, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 8.940972222222221, | |
| "grad_norm": 0.01604336127638817, | |
| "learning_rate": 1.2848611111111112e-05, | |
| "loss": 0.0226, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 0.08734692633152008, | |
| "eval_runtime": 2.1655, | |
| "eval_samples_per_second": 472.877, | |
| "eval_steps_per_second": 29.555, | |
| "step": 5184 | |
| }, | |
| { | |
| "epoch": 9.027777777777779, | |
| "grad_norm": 21.617219924926758, | |
| "learning_rate": 1.2779166666666666e-05, | |
| "loss": 0.0284, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 9.114583333333334, | |
| "grad_norm": 0.0036318660713732243, | |
| "learning_rate": 1.2709722222222222e-05, | |
| "loss": 0.0242, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 9.20138888888889, | |
| "grad_norm": 1.550016164779663, | |
| "learning_rate": 1.2640277777777779e-05, | |
| "loss": 0.0195, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 9.288194444444445, | |
| "grad_norm": 0.0063721900805830956, | |
| "learning_rate": 1.2570833333333335e-05, | |
| "loss": 0.0326, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 9.375, | |
| "grad_norm": 6.12042236328125, | |
| "learning_rate": 1.250138888888889e-05, | |
| "loss": 0.0153, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 9.461805555555555, | |
| "grad_norm": 0.0024620750918984413, | |
| "learning_rate": 1.2431944444444446e-05, | |
| "loss": 0.0126, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 9.54861111111111, | |
| "grad_norm": 0.008434666320681572, | |
| "learning_rate": 1.2362500000000002e-05, | |
| "loss": 0.0168, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 9.635416666666666, | |
| "grad_norm": 24.517446517944336, | |
| "learning_rate": 1.2293055555555557e-05, | |
| "loss": 0.0236, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 9.722222222222221, | |
| "grad_norm": 0.0033647818490862846, | |
| "learning_rate": 1.2223611111111112e-05, | |
| "loss": 0.0128, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 9.809027777777779, | |
| "grad_norm": 0.008964108303189278, | |
| "learning_rate": 1.2154166666666666e-05, | |
| "loss": 0.0064, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 9.895833333333334, | |
| "grad_norm": 0.00735318660736084, | |
| "learning_rate": 1.2084722222222223e-05, | |
| "loss": 0.0069, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 9.98263888888889, | |
| "grad_norm": 68.18215942382812, | |
| "learning_rate": 1.2015277777777779e-05, | |
| "loss": 0.0278, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 0.05086889490485191, | |
| "eval_runtime": 2.1757, | |
| "eval_samples_per_second": 470.664, | |
| "eval_steps_per_second": 29.416, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 10.069444444444445, | |
| "grad_norm": 0.008082223124802113, | |
| "learning_rate": 1.1945833333333333e-05, | |
| "loss": 0.0001, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 10.15625, | |
| "grad_norm": 0.0015207990072667599, | |
| "learning_rate": 1.187638888888889e-05, | |
| "loss": 0.0023, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 10.243055555555555, | |
| "grad_norm": 0.003384481882676482, | |
| "learning_rate": 1.1806944444444446e-05, | |
| "loss": 0.0039, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 10.32986111111111, | |
| "grad_norm": 0.0017064920393750072, | |
| "learning_rate": 1.1737500000000002e-05, | |
| "loss": 0.0092, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 10.416666666666666, | |
| "grad_norm": 0.0034557634498924017, | |
| "learning_rate": 1.1668055555555557e-05, | |
| "loss": 0.0291, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 10.503472222222221, | |
| "grad_norm": 0.005785680841654539, | |
| "learning_rate": 1.1598611111111112e-05, | |
| "loss": 0.0002, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 10.590277777777779, | |
| "grad_norm": 0.005150977522134781, | |
| "learning_rate": 1.1529166666666666e-05, | |
| "loss": 0.0209, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 10.677083333333334, | |
| "grad_norm": 0.0023805610835552216, | |
| "learning_rate": 1.1459722222222223e-05, | |
| "loss": 0.0133, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 10.76388888888889, | |
| "grad_norm": 0.0017482911935076118, | |
| "learning_rate": 1.1390277777777779e-05, | |
| "loss": 0.0034, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 10.850694444444445, | |
| "grad_norm": 0.16960591077804565, | |
| "learning_rate": 1.1320833333333334e-05, | |
| "loss": 0.0023, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 10.9375, | |
| "grad_norm": 0.005295192822813988, | |
| "learning_rate": 1.125138888888889e-05, | |
| "loss": 0.0018, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 0.09595341980457306, | |
| "eval_runtime": 2.1636, | |
| "eval_samples_per_second": 473.292, | |
| "eval_steps_per_second": 29.581, | |
| "step": 6336 | |
| }, | |
| { | |
| "epoch": 11.024305555555555, | |
| "grad_norm": 0.0025636740028858185, | |
| "learning_rate": 1.1181944444444446e-05, | |
| "loss": 0.0003, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 11.11111111111111, | |
| "grad_norm": 0.001272220746614039, | |
| "learning_rate": 1.1112500000000002e-05, | |
| "loss": 0.0071, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 11.197916666666666, | |
| "grad_norm": 0.00381342857144773, | |
| "learning_rate": 1.1043055555555557e-05, | |
| "loss": 0.0146, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 11.284722222222221, | |
| "grad_norm": 0.001548461732454598, | |
| "learning_rate": 1.0973611111111112e-05, | |
| "loss": 0.012, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 11.371527777777779, | |
| "grad_norm": 0.0014353245496749878, | |
| "learning_rate": 1.0904166666666666e-05, | |
| "loss": 0.0001, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 11.458333333333334, | |
| "grad_norm": 0.0014444834087044, | |
| "learning_rate": 1.0834722222222223e-05, | |
| "loss": 0.02, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 11.54513888888889, | |
| "grad_norm": 0.002948822919279337, | |
| "learning_rate": 1.0765277777777779e-05, | |
| "loss": 0.0005, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 11.631944444444445, | |
| "grad_norm": 0.001776829012669623, | |
| "learning_rate": 1.0695833333333334e-05, | |
| "loss": 0.0138, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 11.71875, | |
| "grad_norm": 0.0016048089601099491, | |
| "learning_rate": 1.062638888888889e-05, | |
| "loss": 0.0229, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 11.805555555555555, | |
| "grad_norm": 0.0019577471539378166, | |
| "learning_rate": 1.0556944444444446e-05, | |
| "loss": 0.0318, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 11.89236111111111, | |
| "grad_norm": 0.02152959071099758, | |
| "learning_rate": 1.04875e-05, | |
| "loss": 0.0147, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 11.979166666666666, | |
| "grad_norm": 0.003166797338053584, | |
| "learning_rate": 1.0418055555555557e-05, | |
| "loss": 0.0002, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 0.0941697284579277, | |
| "eval_runtime": 2.1655, | |
| "eval_samples_per_second": 472.877, | |
| "eval_steps_per_second": 29.555, | |
| "step": 6912 | |
| }, | |
| { | |
| "epoch": 12.065972222222221, | |
| "grad_norm": 0.0014085551956668496, | |
| "learning_rate": 1.0348611111111112e-05, | |
| "loss": 0.0127, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 12.152777777777779, | |
| "grad_norm": 0.0020881230011582375, | |
| "learning_rate": 1.0279166666666666e-05, | |
| "loss": 0.0056, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 12.239583333333334, | |
| "grad_norm": 0.0022252153139561415, | |
| "learning_rate": 1.0209722222222223e-05, | |
| "loss": 0.0001, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 12.32638888888889, | |
| "grad_norm": 0.014756396412849426, | |
| "learning_rate": 1.0140277777777779e-05, | |
| "loss": 0.0021, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 12.413194444444445, | |
| "grad_norm": 0.0016215373761951923, | |
| "learning_rate": 1.0070833333333334e-05, | |
| "loss": 0.0017, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 12.5, | |
| "grad_norm": 8.211623191833496, | |
| "learning_rate": 1.000138888888889e-05, | |
| "loss": 0.0081, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 12.586805555555555, | |
| "grad_norm": 0.001198956393636763, | |
| "learning_rate": 9.931944444444446e-06, | |
| "loss": 0.0066, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 12.67361111111111, | |
| "grad_norm": 0.09066519886255264, | |
| "learning_rate": 9.862500000000001e-06, | |
| "loss": 0.0109, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 12.760416666666666, | |
| "grad_norm": 0.006189523730427027, | |
| "learning_rate": 9.793055555555555e-06, | |
| "loss": 0.0133, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 12.847222222222221, | |
| "grad_norm": 0.0009115805733017623, | |
| "learning_rate": 9.723611111111112e-06, | |
| "loss": 0.0002, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 12.934027777777779, | |
| "grad_norm": 0.001321441144682467, | |
| "learning_rate": 9.654166666666668e-06, | |
| "loss": 0.002, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 0.07037966698408127, | |
| "eval_runtime": 2.158, | |
| "eval_samples_per_second": 474.51, | |
| "eval_steps_per_second": 29.657, | |
| "step": 7488 | |
| }, | |
| { | |
| "epoch": 13.020833333333334, | |
| "grad_norm": 0.001222062623128295, | |
| "learning_rate": 9.584722222222223e-06, | |
| "loss": 0.0059, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 13.10763888888889, | |
| "grad_norm": 0.0009936870774254203, | |
| "learning_rate": 9.515277777777779e-06, | |
| "loss": 0.0147, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 13.194444444444445, | |
| "grad_norm": 0.0007664511213079095, | |
| "learning_rate": 9.445833333333334e-06, | |
| "loss": 0.0009, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 13.28125, | |
| "grad_norm": 0.0019496489549055696, | |
| "learning_rate": 9.37638888888889e-06, | |
| "loss": 0.0034, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 13.368055555555555, | |
| "grad_norm": 0.0010263716103509068, | |
| "learning_rate": 9.306944444444446e-06, | |
| "loss": 0.0157, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 13.45486111111111, | |
| "grad_norm": 0.0018417923711240292, | |
| "learning_rate": 9.237500000000001e-06, | |
| "loss": 0.0191, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 13.541666666666666, | |
| "grad_norm": 0.009017466567456722, | |
| "learning_rate": 9.168055555555556e-06, | |
| "loss": 0.0142, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 13.628472222222221, | |
| "grad_norm": 0.00774754025042057, | |
| "learning_rate": 9.098611111111112e-06, | |
| "loss": 0.0213, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 13.715277777777779, | |
| "grad_norm": 0.0038997160736471415, | |
| "learning_rate": 9.029166666666668e-06, | |
| "loss": 0.0006, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 13.802083333333334, | |
| "grad_norm": 0.001874367124401033, | |
| "learning_rate": 8.959722222222223e-06, | |
| "loss": 0.0135, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 13.88888888888889, | |
| "grad_norm": 0.0013082403456792235, | |
| "learning_rate": 8.890277777777777e-06, | |
| "loss": 0.005, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 13.975694444444445, | |
| "grad_norm": 0.005922501441091299, | |
| "learning_rate": 8.820833333333334e-06, | |
| "loss": 0.008, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 0.11005319654941559, | |
| "eval_runtime": 2.1582, | |
| "eval_samples_per_second": 474.459, | |
| "eval_steps_per_second": 29.654, | |
| "step": 8064 | |
| }, | |
| { | |
| "epoch": 14.0625, | |
| "grad_norm": 0.000921198632568121, | |
| "learning_rate": 8.75138888888889e-06, | |
| "loss": 0.0178, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 14.149305555555555, | |
| "grad_norm": 0.0010431046830490232, | |
| "learning_rate": 8.681944444444446e-06, | |
| "loss": 0.0002, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 14.23611111111111, | |
| "grad_norm": 0.0012669609859585762, | |
| "learning_rate": 8.612500000000001e-06, | |
| "loss": 0.0031, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 14.322916666666666, | |
| "grad_norm": 0.0029816783498972654, | |
| "learning_rate": 8.543055555555556e-06, | |
| "loss": 0.025, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 14.409722222222221, | |
| "grad_norm": 0.005197410471737385, | |
| "learning_rate": 8.473611111111112e-06, | |
| "loss": 0.0016, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 14.496527777777779, | |
| "grad_norm": 0.0018497951095923781, | |
| "learning_rate": 8.404166666666668e-06, | |
| "loss": 0.0002, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 14.583333333333334, | |
| "grad_norm": 0.0028671796899288893, | |
| "learning_rate": 8.334722222222223e-06, | |
| "loss": 0.0012, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 14.67013888888889, | |
| "grad_norm": 0.010854833759367466, | |
| "learning_rate": 8.265277777777777e-06, | |
| "loss": 0.0027, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 14.756944444444445, | |
| "grad_norm": 0.011619815602898598, | |
| "learning_rate": 8.195833333333334e-06, | |
| "loss": 0.0051, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 14.84375, | |
| "grad_norm": 0.004773242399096489, | |
| "learning_rate": 8.12638888888889e-06, | |
| "loss": 0.0203, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 14.930555555555555, | |
| "grad_norm": 0.0017284239875152707, | |
| "learning_rate": 8.056944444444446e-06, | |
| "loss": 0.0084, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 0.09546509385108948, | |
| "eval_runtime": 2.159, | |
| "eval_samples_per_second": 474.301, | |
| "eval_steps_per_second": 29.644, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 15.01736111111111, | |
| "grad_norm": 0.003186532063409686, | |
| "learning_rate": 7.987500000000001e-06, | |
| "loss": 0.0044, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 15.104166666666666, | |
| "grad_norm": 0.0046905651688575745, | |
| "learning_rate": 7.918055555555556e-06, | |
| "loss": 0.0003, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 15.190972222222221, | |
| "grad_norm": 0.001585672376677394, | |
| "learning_rate": 7.848611111111112e-06, | |
| "loss": 0.0014, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 15.277777777777779, | |
| "grad_norm": 0.001381274894811213, | |
| "learning_rate": 7.779166666666668e-06, | |
| "loss": 0.0002, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 15.364583333333334, | |
| "grad_norm": 0.0018836313392966986, | |
| "learning_rate": 7.709722222222223e-06, | |
| "loss": 0.0002, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 15.45138888888889, | |
| "grad_norm": 0.0009360660915262997, | |
| "learning_rate": 7.640277777777778e-06, | |
| "loss": 0.0009, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 15.538194444444445, | |
| "grad_norm": 0.0008543449803255498, | |
| "learning_rate": 7.570833333333334e-06, | |
| "loss": 0.0001, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 15.625, | |
| "grad_norm": 0.38564541935920715, | |
| "learning_rate": 7.501388888888889e-06, | |
| "loss": 0.0148, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 15.711805555555555, | |
| "grad_norm": 0.0016453195130452514, | |
| "learning_rate": 7.431944444444446e-06, | |
| "loss": 0.0103, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 15.79861111111111, | |
| "grad_norm": 0.001604390563443303, | |
| "learning_rate": 7.3625e-06, | |
| "loss": 0.0039, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 15.885416666666666, | |
| "grad_norm": 0.0020437692292034626, | |
| "learning_rate": 7.293055555555556e-06, | |
| "loss": 0.0001, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 15.972222222222221, | |
| "grad_norm": 0.004224094562232494, | |
| "learning_rate": 7.223611111111112e-06, | |
| "loss": 0.0024, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 0.057210568338632584, | |
| "eval_runtime": 2.1916, | |
| "eval_samples_per_second": 467.23, | |
| "eval_steps_per_second": 29.202, | |
| "step": 9216 | |
| }, | |
| { | |
| "epoch": 16.05902777777778, | |
| "grad_norm": 0.008274748921394348, | |
| "learning_rate": 7.1541666666666675e-06, | |
| "loss": 0.0002, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 16.145833333333332, | |
| "grad_norm": 0.0013211554614827037, | |
| "learning_rate": 7.084722222222222e-06, | |
| "loss": 0.0051, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 16.23263888888889, | |
| "grad_norm": 0.0025199875235557556, | |
| "learning_rate": 7.015277777777778e-06, | |
| "loss": 0.0001, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 16.319444444444443, | |
| "grad_norm": 0.00225885515101254, | |
| "learning_rate": 6.945833333333334e-06, | |
| "loss": 0.0144, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 16.40625, | |
| "grad_norm": 8.875648498535156, | |
| "learning_rate": 6.876388888888889e-06, | |
| "loss": 0.0097, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 16.493055555555557, | |
| "grad_norm": 0.0012961571337655187, | |
| "learning_rate": 6.806944444444446e-06, | |
| "loss": 0.0008, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 16.57986111111111, | |
| "grad_norm": 0.0071833692491054535, | |
| "learning_rate": 6.7375e-06, | |
| "loss": 0.0107, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 16.666666666666668, | |
| "grad_norm": 0.0016137856291607022, | |
| "learning_rate": 6.668055555555556e-06, | |
| "loss": 0.011, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 16.75347222222222, | |
| "grad_norm": 0.002170548541471362, | |
| "learning_rate": 6.598611111111112e-06, | |
| "loss": 0.0002, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 16.84027777777778, | |
| "grad_norm": 0.15523186326026917, | |
| "learning_rate": 6.5291666666666675e-06, | |
| "loss": 0.0001, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 16.927083333333332, | |
| "grad_norm": 0.0006897877901792526, | |
| "learning_rate": 6.459722222222222e-06, | |
| "loss": 0.0, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 0.07502129673957825, | |
| "eval_runtime": 2.1485, | |
| "eval_samples_per_second": 476.611, | |
| "eval_steps_per_second": 29.788, | |
| "step": 9792 | |
| }, | |
| { | |
| "epoch": 17.01388888888889, | |
| "grad_norm": 0.007862378843128681, | |
| "learning_rate": 6.390277777777778e-06, | |
| "loss": 0.0072, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 17.100694444444443, | |
| "grad_norm": 0.0048260875046253204, | |
| "learning_rate": 6.320833333333334e-06, | |
| "loss": 0.0001, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 17.1875, | |
| "grad_norm": 0.001168318442068994, | |
| "learning_rate": 6.251388888888889e-06, | |
| "loss": 0.0001, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 17.274305555555557, | |
| "grad_norm": 0.0010404903441667557, | |
| "learning_rate": 6.181944444444446e-06, | |
| "loss": 0.0, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 17.36111111111111, | |
| "grad_norm": 0.0008959461702033877, | |
| "learning_rate": 6.1125e-06, | |
| "loss": 0.0, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 17.447916666666668, | |
| "grad_norm": 0.0006551714614033699, | |
| "learning_rate": 6.043055555555556e-06, | |
| "loss": 0.0083, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 17.53472222222222, | |
| "grad_norm": 0.004672654904425144, | |
| "learning_rate": 5.973611111111111e-06, | |
| "loss": 0.0053, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 17.62152777777778, | |
| "grad_norm": 0.012874463573098183, | |
| "learning_rate": 5.904166666666668e-06, | |
| "loss": 0.0024, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 17.708333333333332, | |
| "grad_norm": 0.0007252011564560235, | |
| "learning_rate": 5.834722222222222e-06, | |
| "loss": 0.0081, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 17.79513888888889, | |
| "grad_norm": 0.003966485150158405, | |
| "learning_rate": 5.765277777777778e-06, | |
| "loss": 0.0001, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 17.881944444444443, | |
| "grad_norm": 0.004643677733838558, | |
| "learning_rate": 5.695833333333334e-06, | |
| "loss": 0.0016, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 17.96875, | |
| "grad_norm": 0.0022119064815342426, | |
| "learning_rate": 5.6263888888888895e-06, | |
| "loss": 0.0, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 0.081531822681427, | |
| "eval_runtime": 2.1481, | |
| "eval_samples_per_second": 476.707, | |
| "eval_steps_per_second": 29.794, | |
| "step": 10368 | |
| }, | |
| { | |
| "epoch": 18.055555555555557, | |
| "grad_norm": 0.0007135890191420913, | |
| "learning_rate": 5.556944444444446e-06, | |
| "loss": 0.0008, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 18.14236111111111, | |
| "grad_norm": 0.0011914368951693177, | |
| "learning_rate": 5.4875e-06, | |
| "loss": 0.0029, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 18.229166666666668, | |
| "grad_norm": 0.000494650739710778, | |
| "learning_rate": 5.418055555555556e-06, | |
| "loss": 0.0, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 18.31597222222222, | |
| "grad_norm": 0.0006326820584945381, | |
| "learning_rate": 5.348611111111111e-06, | |
| "loss": 0.0005, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 18.40277777777778, | |
| "grad_norm": 0.0005152356461621821, | |
| "learning_rate": 5.279166666666668e-06, | |
| "loss": 0.0, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 18.489583333333332, | |
| "grad_norm": 0.001848602551035583, | |
| "learning_rate": 5.209722222222222e-06, | |
| "loss": 0.0001, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 18.57638888888889, | |
| "grad_norm": 0.00047650947817601264, | |
| "learning_rate": 5.140277777777778e-06, | |
| "loss": 0.014, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 18.663194444444443, | |
| "grad_norm": 0.0004927313420921564, | |
| "learning_rate": 5.070833333333334e-06, | |
| "loss": 0.0101, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 18.75, | |
| "grad_norm": 0.0006253819447010756, | |
| "learning_rate": 5.0013888888888895e-06, | |
| "loss": 0.0, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 18.836805555555557, | |
| "grad_norm": 0.0005877416697330773, | |
| "learning_rate": 4.931944444444445e-06, | |
| "loss": 0.0, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 18.92361111111111, | |
| "grad_norm": 0.000749260769225657, | |
| "learning_rate": 4.8625000000000005e-06, | |
| "loss": 0.0034, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 0.08709079027175903, | |
| "eval_runtime": 2.1461, | |
| "eval_samples_per_second": 477.135, | |
| "eval_steps_per_second": 29.821, | |
| "step": 10944 | |
| }, | |
| { | |
| "epoch": 19.010416666666668, | |
| "grad_norm": 0.0033058973494917154, | |
| "learning_rate": 4.793055555555556e-06, | |
| "loss": 0.0028, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 19.09722222222222, | |
| "grad_norm": 0.0005344200180843472, | |
| "learning_rate": 4.723611111111111e-06, | |
| "loss": 0.0012, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 19.18402777777778, | |
| "grad_norm": 0.0011495526414364576, | |
| "learning_rate": 4.654166666666667e-06, | |
| "loss": 0.0, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 19.270833333333332, | |
| "grad_norm": 0.0007638471433892846, | |
| "learning_rate": 4.584722222222222e-06, | |
| "loss": 0.0008, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 19.35763888888889, | |
| "grad_norm": 0.0016245943261310458, | |
| "learning_rate": 4.515277777777778e-06, | |
| "loss": 0.0, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 19.444444444444443, | |
| "grad_norm": 0.001350992708466947, | |
| "learning_rate": 4.445833333333333e-06, | |
| "loss": 0.0006, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 19.53125, | |
| "grad_norm": 0.0007238370017148554, | |
| "learning_rate": 4.3763888888888896e-06, | |
| "loss": 0.0, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 19.618055555555557, | |
| "grad_norm": 0.0009173430735245347, | |
| "learning_rate": 4.306944444444445e-06, | |
| "loss": 0.005, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 19.70486111111111, | |
| "grad_norm": 0.002093716524541378, | |
| "learning_rate": 4.2375000000000005e-06, | |
| "loss": 0.0, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 19.791666666666668, | |
| "grad_norm": 0.0008814275497570634, | |
| "learning_rate": 4.168055555555556e-06, | |
| "loss": 0.0004, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 19.87847222222222, | |
| "grad_norm": 0.000544120033737272, | |
| "learning_rate": 4.0986111111111114e-06, | |
| "loss": 0.0001, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 19.96527777777778, | |
| "grad_norm": 0.0006753376801498234, | |
| "learning_rate": 4.029166666666667e-06, | |
| "loss": 0.0078, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 0.13657595217227936, | |
| "eval_runtime": 2.1501, | |
| "eval_samples_per_second": 476.265, | |
| "eval_steps_per_second": 29.767, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 20.052083333333332, | |
| "grad_norm": 0.000810706231277436, | |
| "learning_rate": 3.959722222222222e-06, | |
| "loss": 0.0, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 20.13888888888889, | |
| "grad_norm": 0.0008722566999495029, | |
| "learning_rate": 3.890277777777778e-06, | |
| "loss": 0.002, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 20.225694444444443, | |
| "grad_norm": 0.0010332430247217417, | |
| "learning_rate": 3.820833333333333e-06, | |
| "loss": 0.0, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 20.3125, | |
| "grad_norm": 0.0003171579446643591, | |
| "learning_rate": 3.751388888888889e-06, | |
| "loss": 0.0001, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 20.399305555555557, | |
| "grad_norm": 0.0003055994166061282, | |
| "learning_rate": 3.6819444444444447e-06, | |
| "loss": 0.0, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 20.48611111111111, | |
| "grad_norm": 0.0005060540861450136, | |
| "learning_rate": 3.6125000000000006e-06, | |
| "loss": 0.0, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 20.572916666666668, | |
| "grad_norm": 0.0005677157896570861, | |
| "learning_rate": 3.5430555555555556e-06, | |
| "loss": 0.0, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 20.65972222222222, | |
| "grad_norm": 0.000569184310734272, | |
| "learning_rate": 3.4736111111111115e-06, | |
| "loss": 0.0011, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 20.74652777777778, | |
| "grad_norm": 0.0008372145821340382, | |
| "learning_rate": 3.4041666666666665e-06, | |
| "loss": 0.0, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 20.833333333333332, | |
| "grad_norm": 0.0009117226582020521, | |
| "learning_rate": 3.3347222222222224e-06, | |
| "loss": 0.0027, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 20.92013888888889, | |
| "grad_norm": 0.0006149787222966552, | |
| "learning_rate": 3.265277777777778e-06, | |
| "loss": 0.0009, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_loss": 0.07403512299060822, | |
| "eval_runtime": 2.1707, | |
| "eval_samples_per_second": 471.738, | |
| "eval_steps_per_second": 29.484, | |
| "step": 12096 | |
| }, | |
| { | |
| "epoch": 21.006944444444443, | |
| "grad_norm": 0.0009870927315205336, | |
| "learning_rate": 3.1958333333333334e-06, | |
| "loss": 0.0133, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 21.09375, | |
| "grad_norm": 0.008030719123780727, | |
| "learning_rate": 3.1263888888888893e-06, | |
| "loss": 0.0, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 21.180555555555557, | |
| "grad_norm": 0.0010351603850722313, | |
| "learning_rate": 3.0569444444444447e-06, | |
| "loss": 0.0025, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 21.26736111111111, | |
| "grad_norm": 0.004166141152381897, | |
| "learning_rate": 2.9875e-06, | |
| "loss": 0.0023, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 21.354166666666668, | |
| "grad_norm": 0.0021667364053428173, | |
| "learning_rate": 2.9180555555555557e-06, | |
| "loss": 0.005, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 21.44097222222222, | |
| "grad_norm": 0.005213022232055664, | |
| "learning_rate": 2.8486111111111116e-06, | |
| "loss": 0.0001, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 21.52777777777778, | |
| "grad_norm": 0.0016998907085508108, | |
| "learning_rate": 2.7791666666666666e-06, | |
| "loss": 0.0, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 21.614583333333332, | |
| "grad_norm": 0.0011233899276703596, | |
| "learning_rate": 2.7097222222222225e-06, | |
| "loss": 0.0013, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 21.70138888888889, | |
| "grad_norm": 0.00541352853178978, | |
| "learning_rate": 2.6402777777777775e-06, | |
| "loss": 0.0, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 21.788194444444443, | |
| "grad_norm": 0.0006274741608649492, | |
| "learning_rate": 2.5708333333333334e-06, | |
| "loss": 0.0007, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 21.875, | |
| "grad_norm": 0.000486269302200526, | |
| "learning_rate": 2.5013888888888893e-06, | |
| "loss": 0.0, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 21.961805555555557, | |
| "grad_norm": 0.000523523660376668, | |
| "learning_rate": 2.4319444444444444e-06, | |
| "loss": 0.0005, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_loss": 0.09000910818576813, | |
| "eval_runtime": 2.1668, | |
| "eval_samples_per_second": 472.584, | |
| "eval_steps_per_second": 29.537, | |
| "step": 12672 | |
| }, | |
| { | |
| "epoch": 22.04861111111111, | |
| "grad_norm": 0.0002973914088215679, | |
| "learning_rate": 2.3625000000000003e-06, | |
| "loss": 0.0, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 22.135416666666668, | |
| "grad_norm": 0.00037905474891886115, | |
| "learning_rate": 2.2930555555555557e-06, | |
| "loss": 0.0003, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 22.22222222222222, | |
| "grad_norm": 0.0003634969179984182, | |
| "learning_rate": 2.223611111111111e-06, | |
| "loss": 0.0, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 22.30902777777778, | |
| "grad_norm": 0.0003171905700583011, | |
| "learning_rate": 2.154166666666667e-06, | |
| "loss": 0.0, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 22.395833333333332, | |
| "grad_norm": 0.0005504607688635588, | |
| "learning_rate": 2.0847222222222225e-06, | |
| "loss": 0.0037, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 22.48263888888889, | |
| "grad_norm": 0.001255808281712234, | |
| "learning_rate": 2.015277777777778e-06, | |
| "loss": 0.0025, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 22.569444444444443, | |
| "grad_norm": 0.0004096803313586861, | |
| "learning_rate": 1.9458333333333335e-06, | |
| "loss": 0.0021, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 22.65625, | |
| "grad_norm": 0.001167616923339665, | |
| "learning_rate": 1.876388888888889e-06, | |
| "loss": 0.0, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 22.743055555555557, | |
| "grad_norm": 0.0003252147580496967, | |
| "learning_rate": 1.8069444444444444e-06, | |
| "loss": 0.0006, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 22.82986111111111, | |
| "grad_norm": 0.0006310039316304028, | |
| "learning_rate": 1.7375e-06, | |
| "loss": 0.0001, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 22.916666666666668, | |
| "grad_norm": 0.00035475249751470983, | |
| "learning_rate": 1.6680555555555558e-06, | |
| "loss": 0.0026, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_loss": 0.0978098213672638, | |
| "eval_runtime": 2.1724, | |
| "eval_samples_per_second": 471.367, | |
| "eval_steps_per_second": 29.46, | |
| "step": 13248 | |
| }, | |
| { | |
| "epoch": 23.00347222222222, | |
| "grad_norm": 0.0005200915038585663, | |
| "learning_rate": 1.5986111111111112e-06, | |
| "loss": 0.0, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 23.09027777777778, | |
| "grad_norm": 0.622466504573822, | |
| "learning_rate": 1.529166666666667e-06, | |
| "loss": 0.0003, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 23.177083333333332, | |
| "grad_norm": 0.0003293896734248847, | |
| "learning_rate": 1.4597222222222224e-06, | |
| "loss": 0.0024, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 23.26388888888889, | |
| "grad_norm": 0.00028986833058297634, | |
| "learning_rate": 1.3902777777777779e-06, | |
| "loss": 0.0, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 23.350694444444443, | |
| "grad_norm": 0.00039684175862930715, | |
| "learning_rate": 1.3208333333333333e-06, | |
| "loss": 0.0004, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 23.4375, | |
| "grad_norm": 0.0009431451908312738, | |
| "learning_rate": 1.251388888888889e-06, | |
| "loss": 0.0, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 23.524305555555557, | |
| "grad_norm": 0.0009446613257750869, | |
| "learning_rate": 1.1819444444444447e-06, | |
| "loss": 0.0, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 23.61111111111111, | |
| "grad_norm": 0.000527396856341511, | |
| "learning_rate": 1.1125000000000001e-06, | |
| "loss": 0.0, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 23.697916666666668, | |
| "grad_norm": 0.0008534564403817058, | |
| "learning_rate": 1.0430555555555556e-06, | |
| "loss": 0.0, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 23.78472222222222, | |
| "grad_norm": 0.000303189066471532, | |
| "learning_rate": 9.73611111111111e-07, | |
| "loss": 0.0, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 23.87152777777778, | |
| "grad_norm": 0.0006793912034481764, | |
| "learning_rate": 9.041666666666668e-07, | |
| "loss": 0.0069, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 23.958333333333332, | |
| "grad_norm": 0.00035732006654143333, | |
| "learning_rate": 8.347222222222223e-07, | |
| "loss": 0.0, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_loss": 0.09862537682056427, | |
| "eval_runtime": 2.1709, | |
| "eval_samples_per_second": 471.693, | |
| "eval_steps_per_second": 29.481, | |
| "step": 13824 | |
| }, | |
| { | |
| "epoch": 24.04513888888889, | |
| "grad_norm": 0.000883078551851213, | |
| "learning_rate": 7.652777777777778e-07, | |
| "loss": 0.0, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 24.131944444444443, | |
| "grad_norm": 0.00040300763794220984, | |
| "learning_rate": 6.958333333333334e-07, | |
| "loss": 0.0005, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 24.21875, | |
| "grad_norm": 0.001412940793670714, | |
| "learning_rate": 6.263888888888888e-07, | |
| "loss": 0.0017, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 24.305555555555557, | |
| "grad_norm": 0.000599319173488766, | |
| "learning_rate": 5.569444444444444e-07, | |
| "loss": 0.0, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 24.39236111111111, | |
| "grad_norm": 0.0007638942333869636, | |
| "learning_rate": 4.875000000000001e-07, | |
| "loss": 0.0, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 24.479166666666668, | |
| "grad_norm": 0.00034263054840266705, | |
| "learning_rate": 4.1805555555555556e-07, | |
| "loss": 0.0009, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 24.56597222222222, | |
| "grad_norm": 0.0004652358475141227, | |
| "learning_rate": 3.4861111111111114e-07, | |
| "loss": 0.0015, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 24.65277777777778, | |
| "grad_norm": 0.00027831687475554645, | |
| "learning_rate": 2.7916666666666666e-07, | |
| "loss": 0.0, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 24.739583333333332, | |
| "grad_norm": 0.0002827314310707152, | |
| "learning_rate": 2.0972222222222223e-07, | |
| "loss": 0.0015, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 24.82638888888889, | |
| "grad_norm": 0.00031116604804992676, | |
| "learning_rate": 1.4027777777777778e-07, | |
| "loss": 0.0, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 24.913194444444443, | |
| "grad_norm": 0.00027885418967343867, | |
| "learning_rate": 7.083333333333334e-08, | |
| "loss": 0.0018, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 0.0005058420938439667, | |
| "learning_rate": 1.388888888888889e-09, | |
| "loss": 0.0005, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_loss": 0.09845860302448273, | |
| "eval_runtime": 2.17, | |
| "eval_samples_per_second": 471.882, | |
| "eval_steps_per_second": 29.493, | |
| "step": 14400 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 14400, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 25, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1377061586400000.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |