Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity,
"... is not valid JSON
| { | |
| "best_global_step": 72000, | |
| "best_metric": 3.5293209552764893, | |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/resemble_to_hit_frequency_5039/checkpoint-40000", | |
| "epoch": 29.13752913752914, | |
| "eval_steps": 1000, | |
| "global_step": 100000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.014568764568764568, | |
| "grad_norm": 1.6134361028671265, | |
| "learning_rate": 0.000294, | |
| "loss": 8.4822, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.029137529137529136, | |
| "grad_norm": 0.6467522382736206, | |
| "learning_rate": 0.0005939999999999999, | |
| "loss": 6.7172, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.043706293706293704, | |
| "grad_norm": 0.4256949722766876, | |
| "learning_rate": 0.0005998285714285713, | |
| "loss": 6.3649, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.05827505827505827, | |
| "grad_norm": 0.4922082722187042, | |
| "learning_rate": 0.0005996536443148687, | |
| "loss": 6.1487, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.07284382284382285, | |
| "grad_norm": 0.4709911346435547, | |
| "learning_rate": 0.0005994787172011662, | |
| "loss": 6.0174, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.08741258741258741, | |
| "grad_norm": 0.48052042722702026, | |
| "learning_rate": 0.0005993037900874635, | |
| "loss": 5.8773, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.10198135198135198, | |
| "grad_norm": 0.5878971219062805, | |
| "learning_rate": 0.0005991288629737609, | |
| "loss": 5.7603, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.11655011655011654, | |
| "grad_norm": 0.4191853702068329, | |
| "learning_rate": 0.0005989539358600582, | |
| "loss": 5.6447, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.13111888111888112, | |
| "grad_norm": 0.49607983231544495, | |
| "learning_rate": 0.0005987790087463557, | |
| "loss": 5.5178, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.1456876456876457, | |
| "grad_norm": 0.4248947501182556, | |
| "learning_rate": 0.000598604081632653, | |
| "loss": 5.4198, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.16025641025641027, | |
| "grad_norm": 0.48206138610839844, | |
| "learning_rate": 0.0005984291545189504, | |
| "loss": 5.3384, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.17482517482517482, | |
| "grad_norm": 0.4465309679508209, | |
| "learning_rate": 0.0005982542274052477, | |
| "loss": 5.2645, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.1893939393939394, | |
| "grad_norm": 0.42823970317840576, | |
| "learning_rate": 0.0005980793002915452, | |
| "loss": 5.2088, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.20396270396270397, | |
| "grad_norm": 0.4172956943511963, | |
| "learning_rate": 0.0005979043731778425, | |
| "loss": 5.1419, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.21853146853146854, | |
| "grad_norm": 0.424402117729187, | |
| "learning_rate": 0.0005977294460641399, | |
| "loss": 5.0645, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.2331002331002331, | |
| "grad_norm": 0.4406491816043854, | |
| "learning_rate": 0.0005975545189504372, | |
| "loss": 5.0224, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.24766899766899766, | |
| "grad_norm": 0.4717820882797241, | |
| "learning_rate": 0.0005973795918367347, | |
| "loss": 5.003, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.26223776223776224, | |
| "grad_norm": 0.4521999657154083, | |
| "learning_rate": 0.000597204664723032, | |
| "loss": 4.9142, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.2768065268065268, | |
| "grad_norm": 0.4754863679409027, | |
| "learning_rate": 0.0005970297376093294, | |
| "loss": 4.8762, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.2913752913752914, | |
| "grad_norm": 0.41961297392845154, | |
| "learning_rate": 0.0005968548104956268, | |
| "loss": 4.8402, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2913752913752914, | |
| "eval_accuracy": 0.2549595710849709, | |
| "eval_loss": 4.753758430480957, | |
| "eval_runtime": 180.4427, | |
| "eval_samples_per_second": 92.229, | |
| "eval_steps_per_second": 5.769, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.30594405594405594, | |
| "grad_norm": 0.6563605070114136, | |
| "learning_rate": 0.0005966798833819242, | |
| "loss": 4.7815, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.32051282051282054, | |
| "grad_norm": 0.4702153503894806, | |
| "learning_rate": 0.0005965049562682215, | |
| "loss": 4.7461, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3350815850815851, | |
| "grad_norm": 0.4264092743396759, | |
| "learning_rate": 0.0005963300291545189, | |
| "loss": 4.6878, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.34965034965034963, | |
| "grad_norm": 0.4903077185153961, | |
| "learning_rate": 0.0005961551020408162, | |
| "loss": 4.6656, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.36421911421911424, | |
| "grad_norm": 0.4931991994380951, | |
| "learning_rate": 0.0005959801749271137, | |
| "loss": 4.6333, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.3787878787878788, | |
| "grad_norm": 0.43286022543907166, | |
| "learning_rate": 0.000595805247813411, | |
| "loss": 4.6066, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.39335664335664333, | |
| "grad_norm": 0.40360233187675476, | |
| "learning_rate": 0.0005956303206997084, | |
| "loss": 4.5706, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.40792540792540793, | |
| "grad_norm": 0.420491486787796, | |
| "learning_rate": 0.0005954553935860059, | |
| "loss": 4.5591, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.4224941724941725, | |
| "grad_norm": 0.4152667820453644, | |
| "learning_rate": 0.0005952804664723032, | |
| "loss": 4.5331, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.4370629370629371, | |
| "grad_norm": 0.4153015613555908, | |
| "learning_rate": 0.0005951055393586005, | |
| "loss": 4.5102, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.45163170163170163, | |
| "grad_norm": 0.4187549352645874, | |
| "learning_rate": 0.0005949306122448979, | |
| "loss": 4.4927, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.4662004662004662, | |
| "grad_norm": 0.4402385354042053, | |
| "learning_rate": 0.0005947556851311952, | |
| "loss": 4.4652, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.4807692307692308, | |
| "grad_norm": 0.41887184977531433, | |
| "learning_rate": 0.0005945807580174927, | |
| "loss": 4.4431, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.49533799533799533, | |
| "grad_norm": 0.4349214434623718, | |
| "learning_rate": 0.00059440583090379, | |
| "loss": 4.4302, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5099067599067599, | |
| "grad_norm": 0.416457861661911, | |
| "learning_rate": 0.0005942309037900874, | |
| "loss": 4.4188, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5244755244755245, | |
| "grad_norm": 0.3888656198978424, | |
| "learning_rate": 0.0005940559766763847, | |
| "loss": 4.3824, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.539044289044289, | |
| "grad_norm": 0.38429805636405945, | |
| "learning_rate": 0.0005938810495626822, | |
| "loss": 4.38, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.5536130536130536, | |
| "grad_norm": 0.4373445510864258, | |
| "learning_rate": 0.0005937061224489796, | |
| "loss": 4.3664, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5681818181818182, | |
| "grad_norm": 0.43909236788749695, | |
| "learning_rate": 0.0005935311953352769, | |
| "loss": 4.3394, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.5827505827505828, | |
| "grad_norm": 0.3650919795036316, | |
| "learning_rate": 0.0005933562682215743, | |
| "loss": 4.3353, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5827505827505828, | |
| "eval_accuracy": 0.30004649542771444, | |
| "eval_loss": 4.282717704772949, | |
| "eval_runtime": 180.3042, | |
| "eval_samples_per_second": 92.3, | |
| "eval_steps_per_second": 5.774, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5973193473193473, | |
| "grad_norm": 0.4164070188999176, | |
| "learning_rate": 0.0005931813411078717, | |
| "loss": 4.3193, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6118881118881119, | |
| "grad_norm": 0.370237797498703, | |
| "learning_rate": 0.000593006413994169, | |
| "loss": 4.3105, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6264568764568764, | |
| "grad_norm": 0.4082745611667633, | |
| "learning_rate": 0.0005928314868804664, | |
| "loss": 4.2941, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6410256410256411, | |
| "grad_norm": 0.3935624957084656, | |
| "learning_rate": 0.0005926565597667638, | |
| "loss": 4.2833, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.6555944055944056, | |
| "grad_norm": 0.3984358310699463, | |
| "learning_rate": 0.0005924816326530612, | |
| "loss": 4.2679, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.6701631701631702, | |
| "grad_norm": 0.3854668140411377, | |
| "learning_rate": 0.0005923067055393586, | |
| "loss": 4.2746, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6847319347319347, | |
| "grad_norm": 0.37700507044792175, | |
| "learning_rate": 0.0005921317784256559, | |
| "loss": 4.2429, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6993006993006993, | |
| "grad_norm": 0.3662063777446747, | |
| "learning_rate": 0.0005919568513119533, | |
| "loss": 4.2584, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7138694638694638, | |
| "grad_norm": 0.38568246364593506, | |
| "learning_rate": 0.0005917819241982507, | |
| "loss": 4.2344, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.7284382284382285, | |
| "grad_norm": 0.43455713987350464, | |
| "learning_rate": 0.000591606997084548, | |
| "loss": 4.2328, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.743006993006993, | |
| "grad_norm": 0.3856061100959778, | |
| "learning_rate": 0.0005914320699708454, | |
| "loss": 4.2092, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.7575757575757576, | |
| "grad_norm": 0.38549911975860596, | |
| "learning_rate": 0.0005912571428571428, | |
| "loss": 4.218, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.7721445221445221, | |
| "grad_norm": 0.39746782183647156, | |
| "learning_rate": 0.0005910822157434402, | |
| "loss": 4.1968, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.7867132867132867, | |
| "grad_norm": 0.39923539757728577, | |
| "learning_rate": 0.0005909072886297376, | |
| "loss": 4.1886, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8012820512820513, | |
| "grad_norm": 0.3931720554828644, | |
| "learning_rate": 0.0005907323615160349, | |
| "loss": 4.1767, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8158508158508159, | |
| "grad_norm": 0.3459513783454895, | |
| "learning_rate": 0.0005905574344023324, | |
| "loss": 4.1785, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8304195804195804, | |
| "grad_norm": 0.39997783303260803, | |
| "learning_rate": 0.0005903825072886297, | |
| "loss": 4.1691, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.844988344988345, | |
| "grad_norm": 0.3813166618347168, | |
| "learning_rate": 0.000590207580174927, | |
| "loss": 4.1596, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.8595571095571095, | |
| "grad_norm": 0.37703993916511536, | |
| "learning_rate": 0.0005900326530612244, | |
| "loss": 4.1536, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.8741258741258742, | |
| "grad_norm": 0.3787032961845398, | |
| "learning_rate": 0.0005898577259475218, | |
| "loss": 4.1411, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8741258741258742, | |
| "eval_accuracy": 0.31596194853706383, | |
| "eval_loss": 4.094162940979004, | |
| "eval_runtime": 180.2783, | |
| "eval_samples_per_second": 92.313, | |
| "eval_steps_per_second": 5.774, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8886946386946387, | |
| "grad_norm": 0.35312145948410034, | |
| "learning_rate": 0.0005896827988338192, | |
| "loss": 4.1438, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.9032634032634033, | |
| "grad_norm": 0.38526853919029236, | |
| "learning_rate": 0.0005895078717201166, | |
| "loss": 4.1301, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.9178321678321678, | |
| "grad_norm": 0.36466994881629944, | |
| "learning_rate": 0.000589332944606414, | |
| "loss": 4.1235, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.9324009324009324, | |
| "grad_norm": 0.3571998178958893, | |
| "learning_rate": 0.0005891580174927114, | |
| "loss": 4.1292, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.946969696969697, | |
| "grad_norm": 0.3403795063495636, | |
| "learning_rate": 0.0005889830903790087, | |
| "loss": 4.1082, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.9615384615384616, | |
| "grad_norm": 0.38671016693115234, | |
| "learning_rate": 0.000588808163265306, | |
| "loss": 4.096, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.9761072261072261, | |
| "grad_norm": 0.3343498408794403, | |
| "learning_rate": 0.0005886332361516035, | |
| "loss": 4.0914, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.9906759906759907, | |
| "grad_norm": 0.3740348815917969, | |
| "learning_rate": 0.0005884583090379008, | |
| "loss": 4.1069, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.0052447552447552, | |
| "grad_norm": 0.37788498401641846, | |
| "learning_rate": 0.0005882833819241982, | |
| "loss": 4.0461, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.0198135198135199, | |
| "grad_norm": 0.3331277072429657, | |
| "learning_rate": 0.0005881084548104955, | |
| "loss": 4.0172, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.0343822843822843, | |
| "grad_norm": 0.38395169377326965, | |
| "learning_rate": 0.000587933527696793, | |
| "loss": 4.014, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.048951048951049, | |
| "grad_norm": 0.35354799032211304, | |
| "learning_rate": 0.0005877586005830904, | |
| "loss": 4.0174, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.0635198135198136, | |
| "grad_norm": 0.35139200091362, | |
| "learning_rate": 0.0005875836734693877, | |
| "loss": 4.0323, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.078088578088578, | |
| "grad_norm": 0.36168310046195984, | |
| "learning_rate": 0.0005874087463556851, | |
| "loss": 4.0024, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.0926573426573427, | |
| "grad_norm": 0.3537745773792267, | |
| "learning_rate": 0.0005872338192419825, | |
| "loss": 4.0106, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.1072261072261071, | |
| "grad_norm": 0.3509289026260376, | |
| "learning_rate": 0.0005870588921282798, | |
| "loss": 3.9951, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.1217948717948718, | |
| "grad_norm": 0.3399880826473236, | |
| "learning_rate": 0.0005868839650145772, | |
| "loss": 3.9968, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.1363636363636362, | |
| "grad_norm": 0.31629034876823425, | |
| "learning_rate": 0.0005867090379008745, | |
| "loss": 3.9957, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.150932400932401, | |
| "grad_norm": 0.34750989079475403, | |
| "learning_rate": 0.000586534110787172, | |
| "loss": 3.9916, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.1655011655011656, | |
| "grad_norm": 0.34375834465026855, | |
| "learning_rate": 0.0005863591836734694, | |
| "loss": 3.994, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.1655011655011656, | |
| "eval_accuracy": 0.3255772359138492, | |
| "eval_loss": 3.989028215408325, | |
| "eval_runtime": 180.1395, | |
| "eval_samples_per_second": 92.384, | |
| "eval_steps_per_second": 5.779, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.18006993006993, | |
| "grad_norm": 0.37413716316223145, | |
| "learning_rate": 0.0005861842565597667, | |
| "loss": 3.9981, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.1946386946386947, | |
| "grad_norm": 0.35629984736442566, | |
| "learning_rate": 0.0005860093294460641, | |
| "loss": 3.975, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.2092074592074593, | |
| "grad_norm": 0.3381233215332031, | |
| "learning_rate": 0.0005858344023323615, | |
| "loss": 3.9851, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.2237762237762237, | |
| "grad_norm": 0.34585943818092346, | |
| "learning_rate": 0.0005856594752186588, | |
| "loss": 3.9808, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.2383449883449884, | |
| "grad_norm": 0.35993272066116333, | |
| "learning_rate": 0.0005854845481049562, | |
| "loss": 3.975, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.2529137529137528, | |
| "grad_norm": 0.32540130615234375, | |
| "learning_rate": 0.0005853096209912535, | |
| "loss": 3.9714, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.2674825174825175, | |
| "grad_norm": 0.36224445700645447, | |
| "learning_rate": 0.000585134693877551, | |
| "loss": 3.9754, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.282051282051282, | |
| "grad_norm": 0.3662620186805725, | |
| "learning_rate": 0.0005849597667638484, | |
| "loss": 3.9694, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.2966200466200466, | |
| "grad_norm": 0.35438838601112366, | |
| "learning_rate": 0.0005847848396501457, | |
| "loss": 3.9519, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.3111888111888113, | |
| "grad_norm": 0.34450942277908325, | |
| "learning_rate": 0.0005846099125364432, | |
| "loss": 3.963, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.3257575757575757, | |
| "grad_norm": 0.351962685585022, | |
| "learning_rate": 0.0005844349854227405, | |
| "loss": 3.9591, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.3403263403263403, | |
| "grad_norm": 0.3839578926563263, | |
| "learning_rate": 0.0005842600583090379, | |
| "loss": 3.9561, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.354895104895105, | |
| "grad_norm": 0.32113179564476013, | |
| "learning_rate": 0.0005840851311953352, | |
| "loss": 3.949, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.3694638694638694, | |
| "grad_norm": 0.33071938157081604, | |
| "learning_rate": 0.0005839102040816325, | |
| "loss": 3.9608, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.384032634032634, | |
| "grad_norm": 0.33803558349609375, | |
| "learning_rate": 0.00058373527696793, | |
| "loss": 3.9482, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.3986013986013985, | |
| "grad_norm": 0.31636884808540344, | |
| "learning_rate": 0.0005835603498542273, | |
| "loss": 3.9437, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.4131701631701632, | |
| "grad_norm": 0.3646225035190582, | |
| "learning_rate": 0.0005833854227405247, | |
| "loss": 3.9303, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.4277389277389276, | |
| "grad_norm": 0.3559642732143402, | |
| "learning_rate": 0.0005832104956268222, | |
| "loss": 3.9403, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.4423076923076923, | |
| "grad_norm": 0.3481752276420593, | |
| "learning_rate": 0.0005830355685131195, | |
| "loss": 3.9357, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.456876456876457, | |
| "grad_norm": 0.313125878572464, | |
| "learning_rate": 0.0005828606413994169, | |
| "loss": 3.9303, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.456876456876457, | |
| "eval_accuracy": 0.3321257535516557, | |
| "eval_loss": 3.9129536151885986, | |
| "eval_runtime": 180.4532, | |
| "eval_samples_per_second": 92.223, | |
| "eval_steps_per_second": 5.769, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.4714452214452214, | |
| "grad_norm": 0.33051010966300964, | |
| "learning_rate": 0.0005826857142857142, | |
| "loss": 3.9226, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.486013986013986, | |
| "grad_norm": 0.3060428500175476, | |
| "learning_rate": 0.0005825107871720116, | |
| "loss": 3.9254, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.5005827505827507, | |
| "grad_norm": 0.34262314438819885, | |
| "learning_rate": 0.000582335860058309, | |
| "loss": 3.9131, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.5151515151515151, | |
| "grad_norm": 0.33539673686027527, | |
| "learning_rate": 0.0005821609329446063, | |
| "loss": 3.9158, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.5297202797202796, | |
| "grad_norm": 0.3277048170566559, | |
| "learning_rate": 0.0005819860058309037, | |
| "loss": 3.9228, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.5442890442890445, | |
| "grad_norm": 0.31714221835136414, | |
| "learning_rate": 0.0005818110787172012, | |
| "loss": 3.9245, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.558857808857809, | |
| "grad_norm": 0.329098105430603, | |
| "learning_rate": 0.0005816361516034985, | |
| "loss": 3.9212, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.5734265734265733, | |
| "grad_norm": 0.33248335123062134, | |
| "learning_rate": 0.0005814612244897959, | |
| "loss": 3.9066, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.587995337995338, | |
| "grad_norm": 0.3300471305847168, | |
| "learning_rate": 0.0005812862973760932, | |
| "loss": 3.9076, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.6025641025641026, | |
| "grad_norm": 0.3110630214214325, | |
| "learning_rate": 0.0005811113702623907, | |
| "loss": 3.8996, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.617132867132867, | |
| "grad_norm": 0.34096479415893555, | |
| "learning_rate": 0.000580936443148688, | |
| "loss": 3.8914, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.6317016317016317, | |
| "grad_norm": 0.3256978690624237, | |
| "learning_rate": 0.0005807615160349853, | |
| "loss": 3.8901, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.6462703962703964, | |
| "grad_norm": 0.3170398771762848, | |
| "learning_rate": 0.0005805865889212827, | |
| "loss": 3.9086, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.6608391608391608, | |
| "grad_norm": 0.32134151458740234, | |
| "learning_rate": 0.0005804116618075802, | |
| "loss": 3.8843, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.6754079254079253, | |
| "grad_norm": 0.3455315828323364, | |
| "learning_rate": 0.0005802367346938775, | |
| "loss": 3.8936, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.68997668997669, | |
| "grad_norm": 0.33487361669540405, | |
| "learning_rate": 0.0005800618075801749, | |
| "loss": 3.903, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.7045454545454546, | |
| "grad_norm": 0.3249671459197998, | |
| "learning_rate": 0.0005798868804664722, | |
| "loss": 3.8913, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.719114219114219, | |
| "grad_norm": 0.35598769783973694, | |
| "learning_rate": 0.0005797119533527697, | |
| "loss": 3.8821, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.7336829836829837, | |
| "grad_norm": 0.34034013748168945, | |
| "learning_rate": 0.000579537026239067, | |
| "loss": 3.8849, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.7482517482517483, | |
| "grad_norm": 0.33674389123916626, | |
| "learning_rate": 0.0005793620991253643, | |
| "loss": 3.8992, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.7482517482517483, | |
| "eval_accuracy": 0.33753787307759514, | |
| "eval_loss": 3.85577654838562, | |
| "eval_runtime": 180.1447, | |
| "eval_samples_per_second": 92.381, | |
| "eval_steps_per_second": 5.779, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.7628205128205128, | |
| "grad_norm": 0.32885122299194336, | |
| "learning_rate": 0.0005791871720116617, | |
| "loss": 3.8805, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.7773892773892774, | |
| "grad_norm": 0.32068461179733276, | |
| "learning_rate": 0.0005790122448979591, | |
| "loss": 3.8668, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.791958041958042, | |
| "grad_norm": 0.3308079242706299, | |
| "learning_rate": 0.0005788373177842565, | |
| "loss": 3.8776, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.8065268065268065, | |
| "grad_norm": 0.32728639245033264, | |
| "learning_rate": 0.0005786623906705539, | |
| "loss": 3.8633, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.821095571095571, | |
| "grad_norm": 0.3404487073421478, | |
| "learning_rate": 0.0005784874635568512, | |
| "loss": 3.8712, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.8356643356643356, | |
| "grad_norm": 0.32237741351127625, | |
| "learning_rate": 0.0005783125364431487, | |
| "loss": 3.8582, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.8502331002331003, | |
| "grad_norm": 0.3479669392108917, | |
| "learning_rate": 0.000578137609329446, | |
| "loss": 3.8647, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.8648018648018647, | |
| "grad_norm": 0.3184560239315033, | |
| "learning_rate": 0.0005779626822157434, | |
| "loss": 3.847, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.8793706293706294, | |
| "grad_norm": 0.3197358548641205, | |
| "learning_rate": 0.0005777877551020408, | |
| "loss": 3.8617, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.893939393939394, | |
| "grad_norm": 0.2957116663455963, | |
| "learning_rate": 0.0005776128279883381, | |
| "loss": 3.854, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.9085081585081585, | |
| "grad_norm": 0.3220060169696808, | |
| "learning_rate": 0.0005774379008746355, | |
| "loss": 3.851, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.9230769230769231, | |
| "grad_norm": 0.3108726441860199, | |
| "learning_rate": 0.0005772629737609329, | |
| "loss": 3.8559, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.9376456876456878, | |
| "grad_norm": 0.33560416102409363, | |
| "learning_rate": 0.0005770880466472303, | |
| "loss": 3.8505, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.9522144522144522, | |
| "grad_norm": 0.33253157138824463, | |
| "learning_rate": 0.0005769131195335277, | |
| "loss": 3.8468, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.9667832167832167, | |
| "grad_norm": 0.3143483102321625, | |
| "learning_rate": 0.000576738192419825, | |
| "loss": 3.8416, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.9813519813519813, | |
| "grad_norm": 0.32564249634742737, | |
| "learning_rate": 0.0005765632653061224, | |
| "loss": 3.843, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.995920745920746, | |
| "grad_norm": 0.33519217371940613, | |
| "learning_rate": 0.0005763883381924198, | |
| "loss": 3.8431, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.0104895104895104, | |
| "grad_norm": 0.32294219732284546, | |
| "learning_rate": 0.0005762134110787171, | |
| "loss": 3.7722, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.025058275058275, | |
| "grad_norm": 0.3262682557106018, | |
| "learning_rate": 0.0005760384839650145, | |
| "loss": 3.7428, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.0396270396270397, | |
| "grad_norm": 0.3397265374660492, | |
| "learning_rate": 0.0005758635568513119, | |
| "loss": 3.7487, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.0396270396270397, | |
| "eval_accuracy": 0.3415744146738347, | |
| "eval_loss": 3.8127987384796143, | |
| "eval_runtime": 180.1348, | |
| "eval_samples_per_second": 92.386, | |
| "eval_steps_per_second": 5.779, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.054195804195804, | |
| "grad_norm": 0.3330610990524292, | |
| "learning_rate": 0.0005756886297376093, | |
| "loss": 3.7405, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.0687645687645686, | |
| "grad_norm": 0.3221195638179779, | |
| "learning_rate": 0.0005755137026239067, | |
| "loss": 3.7561, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.0833333333333335, | |
| "grad_norm": 0.32453685998916626, | |
| "learning_rate": 0.000575338775510204, | |
| "loss": 3.7532, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.097902097902098, | |
| "grad_norm": 0.3615976870059967, | |
| "learning_rate": 0.0005751638483965014, | |
| "loss": 3.7618, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.1124708624708624, | |
| "grad_norm": 0.323742538690567, | |
| "learning_rate": 0.0005749889212827988, | |
| "loss": 3.7508, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.1270396270396272, | |
| "grad_norm": 0.3381347954273224, | |
| "learning_rate": 0.0005748139941690962, | |
| "loss": 3.7588, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.1416083916083917, | |
| "grad_norm": 0.3426363468170166, | |
| "learning_rate": 0.0005746390670553935, | |
| "loss": 3.7579, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.156177156177156, | |
| "grad_norm": 0.31964731216430664, | |
| "learning_rate": 0.000574464139941691, | |
| "loss": 3.7528, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.1707459207459205, | |
| "grad_norm": 0.3354383111000061, | |
| "learning_rate": 0.0005742892128279883, | |
| "loss": 3.7556, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.1853146853146854, | |
| "grad_norm": 0.3251858353614807, | |
| "learning_rate": 0.0005741142857142857, | |
| "loss": 3.7556, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.19988344988345, | |
| "grad_norm": 0.3399089276790619, | |
| "learning_rate": 0.000573939358600583, | |
| "loss": 3.7415, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.2144522144522143, | |
| "grad_norm": 0.3444349467754364, | |
| "learning_rate": 0.0005737644314868805, | |
| "loss": 3.7515, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.229020979020979, | |
| "grad_norm": 0.31715652346611023, | |
| "learning_rate": 0.0005735895043731778, | |
| "loss": 3.7618, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.2435897435897436, | |
| "grad_norm": 0.34369540214538574, | |
| "learning_rate": 0.0005734145772594752, | |
| "loss": 3.7687, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.258158508158508, | |
| "grad_norm": 0.3494495153427124, | |
| "learning_rate": 0.0005732396501457726, | |
| "loss": 3.748, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.2727272727272725, | |
| "grad_norm": 0.31449177861213684, | |
| "learning_rate": 0.0005730647230320698, | |
| "loss": 3.7541, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.2872960372960374, | |
| "grad_norm": 0.3397660553455353, | |
| "learning_rate": 0.0005728897959183673, | |
| "loss": 3.7624, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.301864801864802, | |
| "grad_norm": 0.34240466356277466, | |
| "learning_rate": 0.0005727148688046647, | |
| "loss": 3.7432, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.3164335664335667, | |
| "grad_norm": 0.3217261731624603, | |
| "learning_rate": 0.000572539941690962, | |
| "loss": 3.7499, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.331002331002331, | |
| "grad_norm": 0.3246598243713379, | |
| "learning_rate": 0.0005723650145772595, | |
| "loss": 3.7619, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.331002331002331, | |
| "eval_accuracy": 0.34476242059382917, | |
| "eval_loss": 3.7828927040100098, | |
| "eval_runtime": 179.9962, | |
| "eval_samples_per_second": 92.458, | |
| "eval_steps_per_second": 5.783, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.3455710955710956, | |
| "grad_norm": 0.3367806673049927, | |
| "learning_rate": 0.0005721900874635568, | |
| "loss": 3.7485, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.36013986013986, | |
| "grad_norm": 0.3171541392803192, | |
| "learning_rate": 0.0005720151603498542, | |
| "loss": 3.7546, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.374708624708625, | |
| "grad_norm": 0.33225518465042114, | |
| "learning_rate": 0.0005718402332361515, | |
| "loss": 3.7429, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.3892773892773893, | |
| "grad_norm": 0.3193056881427765, | |
| "learning_rate": 0.000571665306122449, | |
| "loss": 3.7622, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.4038461538461537, | |
| "grad_norm": 0.3187880218029022, | |
| "learning_rate": 0.0005714903790087463, | |
| "loss": 3.7435, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.4184149184149186, | |
| "grad_norm": 0.33991068601608276, | |
| "learning_rate": 0.0005713154518950437, | |
| "loss": 3.7494, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.432983682983683, | |
| "grad_norm": 0.3092400133609772, | |
| "learning_rate": 0.000571140524781341, | |
| "loss": 3.7612, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.4475524475524475, | |
| "grad_norm": 0.31092721223831177, | |
| "learning_rate": 0.0005709655976676385, | |
| "loss": 3.7488, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.462121212121212, | |
| "grad_norm": 0.32930874824523926, | |
| "learning_rate": 0.0005707906705539358, | |
| "loss": 3.758, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.476689976689977, | |
| "grad_norm": 0.32361528277397156, | |
| "learning_rate": 0.0005706157434402332, | |
| "loss": 3.7454, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.4912587412587412, | |
| "grad_norm": 0.33115440607070923, | |
| "learning_rate": 0.0005704408163265305, | |
| "loss": 3.7402, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.5058275058275057, | |
| "grad_norm": 0.328485369682312, | |
| "learning_rate": 0.000570265889212828, | |
| "loss": 3.7372, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.5203962703962706, | |
| "grad_norm": 0.35709500312805176, | |
| "learning_rate": 0.0005700909620991253, | |
| "loss": 3.7461, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.534965034965035, | |
| "grad_norm": 0.32163530588150024, | |
| "learning_rate": 0.0005699160349854227, | |
| "loss": 3.7541, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.5495337995337994, | |
| "grad_norm": 0.31789329648017883, | |
| "learning_rate": 0.00056974110787172, | |
| "loss": 3.7438, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.564102564102564, | |
| "grad_norm": 0.3170648515224457, | |
| "learning_rate": 0.0005695661807580175, | |
| "loss": 3.7557, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.5786713286713288, | |
| "grad_norm": 0.3424239158630371, | |
| "learning_rate": 0.0005693912536443148, | |
| "loss": 3.7398, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 2.593240093240093, | |
| "grad_norm": 0.318135529756546, | |
| "learning_rate": 0.0005692163265306122, | |
| "loss": 3.7284, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.607808857808858, | |
| "grad_norm": 0.33802515268325806, | |
| "learning_rate": 0.0005690413994169095, | |
| "loss": 3.738, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 2.6223776223776225, | |
| "grad_norm": 0.32018738985061646, | |
| "learning_rate": 0.000568866472303207, | |
| "loss": 3.74, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.6223776223776225, | |
| "eval_accuracy": 0.3477287677347602, | |
| "eval_loss": 3.751537799835205, | |
| "eval_runtime": 180.2979, | |
| "eval_samples_per_second": 92.303, | |
| "eval_steps_per_second": 5.774, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.636946386946387, | |
| "grad_norm": 0.3212384283542633, | |
| "learning_rate": 0.0005686915451895044, | |
| "loss": 3.7381, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 2.6515151515151514, | |
| "grad_norm": 0.3253323435783386, | |
| "learning_rate": 0.0005685166180758016, | |
| "loss": 3.739, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 2.666083916083916, | |
| "grad_norm": 0.3387431502342224, | |
| "learning_rate": 0.000568341690962099, | |
| "loss": 3.7248, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 2.6806526806526807, | |
| "grad_norm": 0.32496801018714905, | |
| "learning_rate": 0.0005681667638483965, | |
| "loss": 3.7298, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 2.695221445221445, | |
| "grad_norm": 0.32816433906555176, | |
| "learning_rate": 0.0005679918367346938, | |
| "loss": 3.7296, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 2.70979020979021, | |
| "grad_norm": 0.3408059775829315, | |
| "learning_rate": 0.0005678169096209912, | |
| "loss": 3.7364, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 2.7243589743589745, | |
| "grad_norm": 0.33964434266090393, | |
| "learning_rate": 0.0005676419825072885, | |
| "loss": 3.7332, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 2.738927738927739, | |
| "grad_norm": 0.31630218029022217, | |
| "learning_rate": 0.000567467055393586, | |
| "loss": 3.7283, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 2.7534965034965033, | |
| "grad_norm": 0.34303176403045654, | |
| "learning_rate": 0.0005672921282798833, | |
| "loss": 3.7337, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 2.768065268065268, | |
| "grad_norm": 0.30772241950035095, | |
| "learning_rate": 0.0005671172011661807, | |
| "loss": 3.7223, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.7826340326340326, | |
| "grad_norm": 0.3346325755119324, | |
| "learning_rate": 0.000566942274052478, | |
| "loss": 3.7366, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 2.797202797202797, | |
| "grad_norm": 0.321429580450058, | |
| "learning_rate": 0.0005667673469387755, | |
| "loss": 3.7289, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 2.811771561771562, | |
| "grad_norm": 0.3273778259754181, | |
| "learning_rate": 0.0005665924198250728, | |
| "loss": 3.7253, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 2.8263403263403264, | |
| "grad_norm": 0.33299872279167175, | |
| "learning_rate": 0.0005664174927113702, | |
| "loss": 3.7264, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 2.840909090909091, | |
| "grad_norm": 0.31705546379089355, | |
| "learning_rate": 0.0005662425655976676, | |
| "loss": 3.7263, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 2.8554778554778553, | |
| "grad_norm": 0.34314480423927307, | |
| "learning_rate": 0.000566067638483965, | |
| "loss": 3.7151, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 2.87004662004662, | |
| "grad_norm": 0.32017573714256287, | |
| "learning_rate": 0.0005658927113702623, | |
| "loss": 3.7329, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 2.8846153846153846, | |
| "grad_norm": 0.31930816173553467, | |
| "learning_rate": 0.0005657177842565597, | |
| "loss": 3.7235, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.8991841491841495, | |
| "grad_norm": 0.31949570775032043, | |
| "learning_rate": 0.0005655428571428572, | |
| "loss": 3.7227, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 2.913752913752914, | |
| "grad_norm": 0.30999991297721863, | |
| "learning_rate": 0.0005653679300291545, | |
| "loss": 3.7152, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.913752913752914, | |
| "eval_accuracy": 0.34985880864932545, | |
| "eval_loss": 3.7270307540893555, | |
| "eval_runtime": 180.2671, | |
| "eval_samples_per_second": 92.319, | |
| "eval_steps_per_second": 5.775, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.9283216783216783, | |
| "grad_norm": 0.3184822201728821, | |
| "learning_rate": 0.0005651930029154518, | |
| "loss": 3.7289, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 2.9428904428904428, | |
| "grad_norm": 0.31392183899879456, | |
| "learning_rate": 0.0005650180758017492, | |
| "loss": 3.7275, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 2.957459207459207, | |
| "grad_norm": 0.3100379407405853, | |
| "learning_rate": 0.0005648431486880466, | |
| "loss": 3.7078, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 2.972027972027972, | |
| "grad_norm": 0.3107777237892151, | |
| "learning_rate": 0.000564668221574344, | |
| "loss": 3.7191, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 2.9865967365967365, | |
| "grad_norm": 0.31457746028900146, | |
| "learning_rate": 0.0005644932944606413, | |
| "loss": 3.7216, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 3.001165501165501, | |
| "grad_norm": 0.3300207555294037, | |
| "learning_rate": 0.0005643183673469387, | |
| "loss": 3.7241, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.015734265734266, | |
| "grad_norm": 0.33615049719810486, | |
| "learning_rate": 0.0005641434402332362, | |
| "loss": 3.6097, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 3.0303030303030303, | |
| "grad_norm": 0.32839542627334595, | |
| "learning_rate": 0.0005639685131195335, | |
| "loss": 3.6192, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.0448717948717947, | |
| "grad_norm": 0.32775548100471497, | |
| "learning_rate": 0.0005637935860058308, | |
| "loss": 3.6201, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 3.0594405594405596, | |
| "grad_norm": 0.3305208086967468, | |
| "learning_rate": 0.0005636186588921282, | |
| "loss": 3.6244, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.074009324009324, | |
| "grad_norm": 0.3248291015625, | |
| "learning_rate": 0.0005634437317784256, | |
| "loss": 3.6289, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 3.0885780885780885, | |
| "grad_norm": 0.334089070558548, | |
| "learning_rate": 0.000563268804664723, | |
| "loss": 3.6128, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.1031468531468533, | |
| "grad_norm": 0.33667150139808655, | |
| "learning_rate": 0.0005630938775510203, | |
| "loss": 3.6316, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 3.117715617715618, | |
| "grad_norm": 0.3139183223247528, | |
| "learning_rate": 0.0005629189504373177, | |
| "loss": 3.6267, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.132284382284382, | |
| "grad_norm": 0.3240184187889099, | |
| "learning_rate": 0.0005627440233236151, | |
| "loss": 3.6155, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 3.1468531468531467, | |
| "grad_norm": 0.3177716135978699, | |
| "learning_rate": 0.0005625690962099125, | |
| "loss": 3.6157, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.1614219114219115, | |
| "grad_norm": 0.32491302490234375, | |
| "learning_rate": 0.0005623941690962099, | |
| "loss": 3.6529, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 3.175990675990676, | |
| "grad_norm": 0.3269357681274414, | |
| "learning_rate": 0.0005622192419825073, | |
| "loss": 3.6252, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.1905594405594404, | |
| "grad_norm": 0.33358559012413025, | |
| "learning_rate": 0.0005620443148688046, | |
| "loss": 3.6477, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 3.2051282051282053, | |
| "grad_norm": 0.32112857699394226, | |
| "learning_rate": 0.000561869387755102, | |
| "loss": 3.6367, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.2051282051282053, | |
| "eval_accuracy": 0.3516898159961675, | |
| "eval_loss": 3.7140629291534424, | |
| "eval_runtime": 180.3296, | |
| "eval_samples_per_second": 92.287, | |
| "eval_steps_per_second": 5.773, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.2196969696969697, | |
| "grad_norm": 0.328512042760849, | |
| "learning_rate": 0.0005616944606413993, | |
| "loss": 3.6396, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 3.234265734265734, | |
| "grad_norm": 0.3449825644493103, | |
| "learning_rate": 0.0005615195335276968, | |
| "loss": 3.6327, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.248834498834499, | |
| "grad_norm": 0.32266926765441895, | |
| "learning_rate": 0.0005613446064139941, | |
| "loss": 3.6382, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 3.2634032634032635, | |
| "grad_norm": 0.3263072073459625, | |
| "learning_rate": 0.0005611696793002915, | |
| "loss": 3.6265, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.277972027972028, | |
| "grad_norm": 0.32438746094703674, | |
| "learning_rate": 0.0005609947521865889, | |
| "loss": 3.6519, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 3.2925407925407923, | |
| "grad_norm": 0.3556417226791382, | |
| "learning_rate": 0.0005608198250728863, | |
| "loss": 3.6388, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.3071095571095572, | |
| "grad_norm": 0.31459367275238037, | |
| "learning_rate": 0.0005606448979591836, | |
| "loss": 3.6413, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 3.3216783216783217, | |
| "grad_norm": 0.3164815902709961, | |
| "learning_rate": 0.000560469970845481, | |
| "loss": 3.6394, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.336247086247086, | |
| "grad_norm": 0.3238040804862976, | |
| "learning_rate": 0.0005602950437317783, | |
| "loss": 3.639, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 3.350815850815851, | |
| "grad_norm": 0.31536027789115906, | |
| "learning_rate": 0.0005601201166180758, | |
| "loss": 3.651, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.3653846153846154, | |
| "grad_norm": 0.3251273036003113, | |
| "learning_rate": 0.0005599451895043731, | |
| "loss": 3.6398, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 3.37995337995338, | |
| "grad_norm": 0.3183720111846924, | |
| "learning_rate": 0.0005597702623906705, | |
| "loss": 3.6425, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.3945221445221447, | |
| "grad_norm": 0.3452969193458557, | |
| "learning_rate": 0.0005595953352769679, | |
| "loss": 3.6396, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 3.409090909090909, | |
| "grad_norm": 0.31187903881073, | |
| "learning_rate": 0.0005594204081632653, | |
| "loss": 3.6399, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.4236596736596736, | |
| "grad_norm": 0.3159955143928528, | |
| "learning_rate": 0.0005592454810495627, | |
| "loss": 3.6371, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 3.438228438228438, | |
| "grad_norm": 0.3242449462413788, | |
| "learning_rate": 0.00055907055393586, | |
| "loss": 3.6376, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.452797202797203, | |
| "grad_norm": 0.33960285782814026, | |
| "learning_rate": 0.0005588956268221573, | |
| "loss": 3.6397, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 3.4673659673659674, | |
| "grad_norm": 0.34514838457107544, | |
| "learning_rate": 0.0005587206997084548, | |
| "loss": 3.6349, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.481934731934732, | |
| "grad_norm": 0.33326658606529236, | |
| "learning_rate": 0.0005585457725947521, | |
| "loss": 3.6432, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 3.4965034965034967, | |
| "grad_norm": 0.3219590187072754, | |
| "learning_rate": 0.0005583708454810495, | |
| "loss": 3.642, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.4965034965034967, | |
| "eval_accuracy": 0.35353681570054407, | |
| "eval_loss": 3.697685480117798, | |
| "eval_runtime": 180.3495, | |
| "eval_samples_per_second": 92.276, | |
| "eval_steps_per_second": 5.772, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.511072261072261, | |
| "grad_norm": 0.315857470035553, | |
| "learning_rate": 0.0005581959183673468, | |
| "loss": 3.6484, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 3.5256410256410255, | |
| "grad_norm": 0.33714818954467773, | |
| "learning_rate": 0.0005580209912536443, | |
| "loss": 3.6465, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 3.54020979020979, | |
| "grad_norm": 0.3196263909339905, | |
| "learning_rate": 0.0005578460641399417, | |
| "loss": 3.6444, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 3.554778554778555, | |
| "grad_norm": 0.34034839272499084, | |
| "learning_rate": 0.000557671137026239, | |
| "loss": 3.6403, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 3.5693473193473193, | |
| "grad_norm": 0.32852211594581604, | |
| "learning_rate": 0.0005574962099125363, | |
| "loss": 3.6461, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 3.583916083916084, | |
| "grad_norm": 0.3598001003265381, | |
| "learning_rate": 0.0005573212827988338, | |
| "loss": 3.6392, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 3.5984848484848486, | |
| "grad_norm": 0.3342962861061096, | |
| "learning_rate": 0.0005571463556851311, | |
| "loss": 3.6414, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 3.613053613053613, | |
| "grad_norm": 0.316803514957428, | |
| "learning_rate": 0.0005569714285714285, | |
| "loss": 3.6486, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 3.6276223776223775, | |
| "grad_norm": 0.31796908378601074, | |
| "learning_rate": 0.0005567965014577258, | |
| "loss": 3.6369, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 3.642191142191142, | |
| "grad_norm": 0.309007465839386, | |
| "learning_rate": 0.0005566215743440233, | |
| "loss": 3.6429, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.656759906759907, | |
| "grad_norm": 0.3321513831615448, | |
| "learning_rate": 0.0005564466472303207, | |
| "loss": 3.6487, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 3.6713286713286712, | |
| "grad_norm": 0.35138118267059326, | |
| "learning_rate": 0.000556271720116618, | |
| "loss": 3.6527, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 3.685897435897436, | |
| "grad_norm": 0.3067615032196045, | |
| "learning_rate": 0.0005560967930029155, | |
| "loss": 3.6444, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 3.7004662004662006, | |
| "grad_norm": 0.33694183826446533, | |
| "learning_rate": 0.0005559218658892128, | |
| "loss": 3.6325, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 3.715034965034965, | |
| "grad_norm": 0.31776705384254456, | |
| "learning_rate": 0.0005557469387755101, | |
| "loss": 3.6527, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 3.7296037296037294, | |
| "grad_norm": 0.3377169668674469, | |
| "learning_rate": 0.0005555720116618075, | |
| "loss": 3.6424, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 3.7441724941724943, | |
| "grad_norm": 0.3101692199707031, | |
| "learning_rate": 0.0005553970845481049, | |
| "loss": 3.6359, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 3.7587412587412588, | |
| "grad_norm": 0.3166581392288208, | |
| "learning_rate": 0.0005552221574344023, | |
| "loss": 3.6416, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 3.773310023310023, | |
| "grad_norm": 0.31438636779785156, | |
| "learning_rate": 0.0005550472303206997, | |
| "loss": 3.6336, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 3.787878787878788, | |
| "grad_norm": 0.3247930705547333, | |
| "learning_rate": 0.000554872303206997, | |
| "loss": 3.6416, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.787878787878788, | |
| "eval_accuracy": 0.3546786229921654, | |
| "eval_loss": 3.6786322593688965, | |
| "eval_runtime": 180.419, | |
| "eval_samples_per_second": 92.241, | |
| "eval_steps_per_second": 5.77, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.8024475524475525, | |
| "grad_norm": 0.3598824441432953, | |
| "learning_rate": 0.0005546973760932945, | |
| "loss": 3.6428, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 3.817016317016317, | |
| "grad_norm": 0.32811933755874634, | |
| "learning_rate": 0.0005545224489795918, | |
| "loss": 3.6448, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 3.8315850815850814, | |
| "grad_norm": 0.3222385346889496, | |
| "learning_rate": 0.0005543475218658891, | |
| "loss": 3.6489, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 3.8461538461538463, | |
| "grad_norm": 0.326913058757782, | |
| "learning_rate": 0.0005541725947521865, | |
| "loss": 3.6217, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 3.8607226107226107, | |
| "grad_norm": 0.31770044565200806, | |
| "learning_rate": 0.0005539976676384839, | |
| "loss": 3.6383, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 3.875291375291375, | |
| "grad_norm": 0.3197103440761566, | |
| "learning_rate": 0.0005538227405247813, | |
| "loss": 3.6432, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 3.88986013986014, | |
| "grad_norm": 0.33483409881591797, | |
| "learning_rate": 0.0005536478134110787, | |
| "loss": 3.6325, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 3.9044289044289044, | |
| "grad_norm": 0.3026617765426636, | |
| "learning_rate": 0.000553472886297376, | |
| "loss": 3.6343, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 3.918997668997669, | |
| "grad_norm": 0.2976735532283783, | |
| "learning_rate": 0.0005532979591836735, | |
| "loss": 3.6483, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 3.9335664335664333, | |
| "grad_norm": 0.3455604612827301, | |
| "learning_rate": 0.0005531230320699708, | |
| "loss": 3.6413, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.948135198135198, | |
| "grad_norm": 0.3204672932624817, | |
| "learning_rate": 0.0005529481049562682, | |
| "loss": 3.6384, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 3.9627039627039626, | |
| "grad_norm": 0.340648889541626, | |
| "learning_rate": 0.0005527731778425655, | |
| "loss": 3.6425, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 3.9772727272727275, | |
| "grad_norm": 0.3379724323749542, | |
| "learning_rate": 0.0005525982507288629, | |
| "loss": 3.6327, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 3.991841491841492, | |
| "grad_norm": 0.3036077320575714, | |
| "learning_rate": 0.0005524233236151603, | |
| "loss": 3.6375, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 4.006410256410256, | |
| "grad_norm": 0.34318360686302185, | |
| "learning_rate": 0.0005522483965014576, | |
| "loss": 3.5803, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 4.020979020979021, | |
| "grad_norm": 0.3264276087284088, | |
| "learning_rate": 0.000552073469387755, | |
| "loss": 3.5362, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 4.035547785547785, | |
| "grad_norm": 0.3238934278488159, | |
| "learning_rate": 0.0005518985422740525, | |
| "loss": 3.5332, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 4.05011655011655, | |
| "grad_norm": 0.32926997542381287, | |
| "learning_rate": 0.0005517236151603498, | |
| "loss": 3.5372, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 4.064685314685315, | |
| "grad_norm": 0.32314813137054443, | |
| "learning_rate": 0.0005515486880466472, | |
| "loss": 3.5272, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 4.0792540792540795, | |
| "grad_norm": 0.3332814872264862, | |
| "learning_rate": 0.0005513737609329446, | |
| "loss": 3.5382, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.0792540792540795, | |
| "eval_accuracy": 0.3562127134068402, | |
| "eval_loss": 3.6715903282165527, | |
| "eval_runtime": 180.15, | |
| "eval_samples_per_second": 92.379, | |
| "eval_steps_per_second": 5.779, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.093822843822844, | |
| "grad_norm": 0.33132901787757874, | |
| "learning_rate": 0.0005511988338192419, | |
| "loss": 3.549, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 4.108391608391608, | |
| "grad_norm": 0.32595717906951904, | |
| "learning_rate": 0.0005510239067055393, | |
| "loss": 3.5445, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 4.122960372960373, | |
| "grad_norm": 0.3297913670539856, | |
| "learning_rate": 0.0005508489795918366, | |
| "loss": 3.5392, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 4.137529137529137, | |
| "grad_norm": 0.35622304677963257, | |
| "learning_rate": 0.0005506740524781341, | |
| "loss": 3.5387, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 4.1520979020979025, | |
| "grad_norm": 0.32156145572662354, | |
| "learning_rate": 0.0005504991253644315, | |
| "loss": 3.5461, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 4.166666666666667, | |
| "grad_norm": null, | |
| "learning_rate": 0.0005503241982507288, | |
| "loss": 3.5555, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 4.181235431235431, | |
| "grad_norm": 0.32054704427719116, | |
| "learning_rate": 0.0005501492711370262, | |
| "loss": 3.5634, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 4.195804195804196, | |
| "grad_norm": 0.3304331302642822, | |
| "learning_rate": 0.0005499743440233236, | |
| "loss": 3.557, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.21037296037296, | |
| "grad_norm": 0.33280083537101746, | |
| "learning_rate": 0.000549799416909621, | |
| "loss": 3.5636, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 4.224941724941725, | |
| "grad_norm": 0.3097744584083557, | |
| "learning_rate": 0.0005496244897959183, | |
| "loss": 3.5591, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.239510489510489, | |
| "grad_norm": 0.3197658658027649, | |
| "learning_rate": 0.0005494495626822156, | |
| "loss": 3.5661, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 4.2540792540792545, | |
| "grad_norm": 0.3759899437427521, | |
| "learning_rate": 0.0005492746355685131, | |
| "loss": 3.5621, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 4.268648018648019, | |
| "grad_norm": 0.34865570068359375, | |
| "learning_rate": 0.0005490997084548105, | |
| "loss": 3.5642, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 4.283216783216783, | |
| "grad_norm": 0.3441263735294342, | |
| "learning_rate": 0.0005489247813411078, | |
| "loss": 3.5676, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 4.297785547785548, | |
| "grad_norm": 0.33596622943878174, | |
| "learning_rate": 0.0005487498542274052, | |
| "loss": 3.5693, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 4.312354312354312, | |
| "grad_norm": 0.3372125029563904, | |
| "learning_rate": 0.0005485749271137026, | |
| "loss": 3.5674, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 4.326923076923077, | |
| "grad_norm": 0.3590675890445709, | |
| "learning_rate": 0.0005484, | |
| "loss": 3.5677, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 4.341491841491841, | |
| "grad_norm": 0.3344537615776062, | |
| "learning_rate": 0.0005482250728862973, | |
| "loss": 3.5582, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 4.356060606060606, | |
| "grad_norm": 0.3320492208003998, | |
| "learning_rate": 0.0005480501457725946, | |
| "loss": 3.5648, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 4.370629370629371, | |
| "grad_norm": 0.336557537317276, | |
| "learning_rate": 0.0005478752186588921, | |
| "loss": 3.5647, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.370629370629371, | |
| "eval_accuracy": 0.3576591986276676, | |
| "eval_loss": 3.6587440967559814, | |
| "eval_runtime": 180.1159, | |
| "eval_samples_per_second": 92.396, | |
| "eval_steps_per_second": 5.78, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.385198135198135, | |
| "grad_norm": 0.3224494159221649, | |
| "learning_rate": 0.0005477002915451894, | |
| "loss": 3.5672, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 4.3997668997669, | |
| "grad_norm": 0.3005123734474182, | |
| "learning_rate": 0.0005475253644314868, | |
| "loss": 3.567, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 4.414335664335664, | |
| "grad_norm": 0.31343790888786316, | |
| "learning_rate": 0.0005473504373177842, | |
| "loss": 3.5575, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 4.428904428904429, | |
| "grad_norm": 0.3172782361507416, | |
| "learning_rate": 0.0005471755102040816, | |
| "loss": 3.5665, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 4.443473193473194, | |
| "grad_norm": 0.32356658577919006, | |
| "learning_rate": 0.000547000583090379, | |
| "loss": 3.5787, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 4.458041958041958, | |
| "grad_norm": 0.35370585322380066, | |
| "learning_rate": 0.0005468256559766763, | |
| "loss": 3.5677, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 4.472610722610723, | |
| "grad_norm": 0.32113948464393616, | |
| "learning_rate": 0.0005466507288629738, | |
| "loss": 3.5588, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 4.487179487179487, | |
| "grad_norm": 0.3236648738384247, | |
| "learning_rate": 0.0005464758017492711, | |
| "loss": 3.5723, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 4.501748251748252, | |
| "grad_norm": 0.32024386525154114, | |
| "learning_rate": 0.0005463008746355684, | |
| "loss": 3.582, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 4.516317016317016, | |
| "grad_norm": 0.34335383772850037, | |
| "learning_rate": 0.0005461259475218658, | |
| "loss": 3.5728, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.5308857808857805, | |
| "grad_norm": 0.3075568377971649, | |
| "learning_rate": 0.0005459510204081633, | |
| "loss": 3.5652, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 4.545454545454545, | |
| "grad_norm": 0.3292197585105896, | |
| "learning_rate": 0.0005457760932944606, | |
| "loss": 3.565, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 4.56002331002331, | |
| "grad_norm": 0.35107719898223877, | |
| "learning_rate": 0.000545601166180758, | |
| "loss": 3.5702, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 4.574592074592075, | |
| "grad_norm": 0.3471798598766327, | |
| "learning_rate": 0.0005454262390670553, | |
| "loss": 3.5681, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 4.589160839160839, | |
| "grad_norm": 0.31821051239967346, | |
| "learning_rate": 0.0005452513119533528, | |
| "loss": 3.582, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 4.603729603729604, | |
| "grad_norm": 0.3309209644794464, | |
| "learning_rate": 0.0005450763848396501, | |
| "loss": 3.5883, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 4.618298368298368, | |
| "grad_norm": 0.33727866411209106, | |
| "learning_rate": 0.0005449014577259474, | |
| "loss": 3.5817, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 4.632867132867133, | |
| "grad_norm": 0.3144679069519043, | |
| "learning_rate": 0.0005447265306122448, | |
| "loss": 3.5724, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 4.647435897435898, | |
| "grad_norm": 0.32342618703842163, | |
| "learning_rate": 0.0005445516034985423, | |
| "loss": 3.5855, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 4.662004662004662, | |
| "grad_norm": 0.3141750395298004, | |
| "learning_rate": 0.0005443766763848396, | |
| "loss": 3.5807, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.662004662004662, | |
| "eval_accuracy": 0.3584899780834147, | |
| "eval_loss": 3.6451687812805176, | |
| "eval_runtime": 180.0666, | |
| "eval_samples_per_second": 92.421, | |
| "eval_steps_per_second": 5.781, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.676573426573427, | |
| "grad_norm": 0.318861186504364, | |
| "learning_rate": 0.000544201749271137, | |
| "loss": 3.5705, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 4.691142191142191, | |
| "grad_norm": 0.31984490156173706, | |
| "learning_rate": 0.0005440268221574343, | |
| "loss": 3.5858, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 4.7057109557109555, | |
| "grad_norm": 0.3313526511192322, | |
| "learning_rate": 0.0005438518950437318, | |
| "loss": 3.5778, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 4.72027972027972, | |
| "grad_norm": 0.332089900970459, | |
| "learning_rate": 0.0005436769679300291, | |
| "loss": 3.5776, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 4.734848484848484, | |
| "grad_norm": 0.33302974700927734, | |
| "learning_rate": 0.0005435020408163265, | |
| "loss": 3.5832, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 4.74941724941725, | |
| "grad_norm": 0.3242354691028595, | |
| "learning_rate": 0.0005433271137026238, | |
| "loss": 3.5848, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 4.763986013986014, | |
| "grad_norm": 0.3078085482120514, | |
| "learning_rate": 0.0005431521865889212, | |
| "loss": 3.5824, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 4.778554778554779, | |
| "grad_norm": 0.3317912220954895, | |
| "learning_rate": 0.0005429772594752186, | |
| "loss": 3.5782, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 4.793123543123543, | |
| "grad_norm": 0.30730515718460083, | |
| "learning_rate": 0.000542802332361516, | |
| "loss": 3.5779, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 4.8076923076923075, | |
| "grad_norm": 0.35136038064956665, | |
| "learning_rate": 0.0005426274052478133, | |
| "loss": 3.583, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.822261072261072, | |
| "grad_norm": 0.3428604304790497, | |
| "learning_rate": 0.0005424524781341108, | |
| "loss": 3.578, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 4.836829836829837, | |
| "grad_norm": 0.3045051693916321, | |
| "learning_rate": 0.0005422775510204081, | |
| "loss": 3.5811, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 4.851398601398602, | |
| "grad_norm": 0.3164063096046448, | |
| "learning_rate": 0.0005421026239067055, | |
| "loss": 3.5821, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 4.865967365967366, | |
| "grad_norm": 0.33561450242996216, | |
| "learning_rate": 0.0005419276967930028, | |
| "loss": 3.5749, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 4.880536130536131, | |
| "grad_norm": 0.3375592529773712, | |
| "learning_rate": 0.0005417527696793002, | |
| "loss": 3.5713, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 4.895104895104895, | |
| "grad_norm": 0.3262588083744049, | |
| "learning_rate": 0.0005415778425655976, | |
| "loss": 3.5773, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 4.909673659673659, | |
| "grad_norm": 0.33031025528907776, | |
| "learning_rate": 0.000541402915451895, | |
| "loss": 3.5719, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 4.924242424242424, | |
| "grad_norm": 0.32215115427970886, | |
| "learning_rate": 0.0005412279883381923, | |
| "loss": 3.5679, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 4.938811188811189, | |
| "grad_norm": 0.3194146156311035, | |
| "learning_rate": 0.0005410530612244898, | |
| "loss": 3.5837, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 4.953379953379954, | |
| "grad_norm": 0.3187941312789917, | |
| "learning_rate": 0.0005408781341107871, | |
| "loss": 3.5693, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.953379953379954, | |
| "eval_accuracy": 0.35988401777879797, | |
| "eval_loss": 3.632936477661133, | |
| "eval_runtime": 180.0341, | |
| "eval_samples_per_second": 92.438, | |
| "eval_steps_per_second": 5.782, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.967948717948718, | |
| "grad_norm": 0.32214635610580444, | |
| "learning_rate": 0.0005407032069970845, | |
| "loss": 3.5817, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 4.9825174825174825, | |
| "grad_norm": 0.3381812870502472, | |
| "learning_rate": 0.0005405282798833819, | |
| "loss": 3.5721, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 4.997086247086247, | |
| "grad_norm": 0.328273206949234, | |
| "learning_rate": 0.0005403533527696793, | |
| "loss": 3.5866, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 5.011655011655011, | |
| "grad_norm": 0.32486042380332947, | |
| "learning_rate": 0.0005401784256559766, | |
| "loss": 3.4864, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 5.026223776223776, | |
| "grad_norm": 0.3191656172275543, | |
| "learning_rate": 0.000540003498542274, | |
| "loss": 3.4736, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 5.040792540792541, | |
| "grad_norm": 0.3504127264022827, | |
| "learning_rate": 0.0005398285714285714, | |
| "loss": 3.469, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 5.055361305361306, | |
| "grad_norm": 0.3454863727092743, | |
| "learning_rate": 0.0005396536443148688, | |
| "loss": 3.4665, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 5.06993006993007, | |
| "grad_norm": 0.30901169776916504, | |
| "learning_rate": 0.0005394787172011661, | |
| "loss": 3.4741, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 5.084498834498834, | |
| "grad_norm": 0.33311742544174194, | |
| "learning_rate": 0.0005393037900874635, | |
| "loss": 3.4876, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 5.099067599067599, | |
| "grad_norm": 0.33518463373184204, | |
| "learning_rate": 0.0005391288629737609, | |
| "loss": 3.4755, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.113636363636363, | |
| "grad_norm": 0.33938467502593994, | |
| "learning_rate": 0.0005389539358600583, | |
| "loss": 3.489, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 5.128205128205128, | |
| "grad_norm": 0.3346013128757477, | |
| "learning_rate": 0.0005387790087463557, | |
| "loss": 3.4899, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 5.142773892773893, | |
| "grad_norm": 0.3396677076816559, | |
| "learning_rate": 0.0005386040816326529, | |
| "loss": 3.4791, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 5.1573426573426575, | |
| "grad_norm": 0.32493624091148376, | |
| "learning_rate": 0.0005384291545189504, | |
| "loss": 3.4999, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 5.171911421911422, | |
| "grad_norm": 0.34523579478263855, | |
| "learning_rate": 0.0005382542274052478, | |
| "loss": 3.4942, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 5.186480186480186, | |
| "grad_norm": 0.34241601824760437, | |
| "learning_rate": 0.0005380793002915451, | |
| "loss": 3.4986, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 5.201048951048951, | |
| "grad_norm": 0.3449043035507202, | |
| "learning_rate": 0.0005379043731778425, | |
| "loss": 3.5096, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 5.215617715617715, | |
| "grad_norm": 0.33027029037475586, | |
| "learning_rate": 0.0005377294460641399, | |
| "loss": 3.5021, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 5.230186480186481, | |
| "grad_norm": 0.33586353063583374, | |
| "learning_rate": 0.0005375545189504373, | |
| "loss": 3.4964, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 5.244755244755245, | |
| "grad_norm": 0.3348841071128845, | |
| "learning_rate": 0.0005373795918367346, | |
| "loss": 3.5152, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.244755244755245, | |
| "eval_accuracy": 0.3607333765910926, | |
| "eval_loss": 3.6339569091796875, | |
| "eval_runtime": 180.5183, | |
| "eval_samples_per_second": 92.19, | |
| "eval_steps_per_second": 5.767, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.2593240093240095, | |
| "grad_norm": 0.33033329248428345, | |
| "learning_rate": 0.000537204664723032, | |
| "loss": 3.4922, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 5.273892773892774, | |
| "grad_norm": 0.32480764389038086, | |
| "learning_rate": 0.0005370297376093294, | |
| "loss": 3.5049, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 5.288461538461538, | |
| "grad_norm": 0.3114669919013977, | |
| "learning_rate": 0.0005368548104956268, | |
| "loss": 3.5045, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 5.303030303030303, | |
| "grad_norm": 0.32912948727607727, | |
| "learning_rate": 0.0005366798833819241, | |
| "loss": 3.5039, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 5.317599067599067, | |
| "grad_norm": 0.325888067483902, | |
| "learning_rate": 0.0005365049562682215, | |
| "loss": 3.5107, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 5.3321678321678325, | |
| "grad_norm": 0.3258603811264038, | |
| "learning_rate": 0.0005363300291545189, | |
| "loss": 3.5079, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 5.346736596736597, | |
| "grad_norm": 0.34344643354415894, | |
| "learning_rate": 0.0005361551020408163, | |
| "loss": 3.5056, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 5.361305361305361, | |
| "grad_norm": 0.34246399998664856, | |
| "learning_rate": 0.0005359801749271136, | |
| "loss": 3.5118, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 5.375874125874126, | |
| "grad_norm": 0.35261663794517517, | |
| "learning_rate": 0.000535805247813411, | |
| "loss": 3.5154, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 5.39044289044289, | |
| "grad_norm": 0.33429020643234253, | |
| "learning_rate": 0.0005356303206997085, | |
| "loss": 3.515, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.405011655011655, | |
| "grad_norm": 0.3388688266277313, | |
| "learning_rate": 0.0005354553935860058, | |
| "loss": 3.5011, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 5.41958041958042, | |
| "grad_norm": 0.31441932916641235, | |
| "learning_rate": 0.0005352804664723031, | |
| "loss": 3.524, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 5.4341491841491845, | |
| "grad_norm": 0.33346623182296753, | |
| "learning_rate": 0.0005351055393586006, | |
| "loss": 3.5096, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 5.448717948717949, | |
| "grad_norm": 0.3645952045917511, | |
| "learning_rate": 0.0005349306122448979, | |
| "loss": 3.5162, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 5.463286713286713, | |
| "grad_norm": 0.3252617120742798, | |
| "learning_rate": 0.0005347556851311953, | |
| "loss": 3.5166, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 5.477855477855478, | |
| "grad_norm": 0.32356569170951843, | |
| "learning_rate": 0.0005345807580174926, | |
| "loss": 3.5259, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 5.492424242424242, | |
| "grad_norm": 0.32452526688575745, | |
| "learning_rate": 0.0005344058309037901, | |
| "loss": 3.5419, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 5.506993006993007, | |
| "grad_norm": 0.3109516501426697, | |
| "learning_rate": 0.0005342309037900875, | |
| "loss": 3.523, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 5.521561771561771, | |
| "grad_norm": 0.32956892251968384, | |
| "learning_rate": 0.0005340559766763848, | |
| "loss": 3.5346, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 5.536130536130536, | |
| "grad_norm": 0.347649484872818, | |
| "learning_rate": 0.0005338810495626821, | |
| "loss": 3.5148, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.536130536130536, | |
| "eval_accuracy": 0.3614395097307616, | |
| "eval_loss": 3.624124526977539, | |
| "eval_runtime": 180.5076, | |
| "eval_samples_per_second": 92.196, | |
| "eval_steps_per_second": 5.767, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.550699300699301, | |
| "grad_norm": 0.34442394971847534, | |
| "learning_rate": 0.0005337061224489796, | |
| "loss": 3.508, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 5.565268065268065, | |
| "grad_norm": 0.3646959960460663, | |
| "learning_rate": 0.0005335311953352769, | |
| "loss": 3.5272, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 5.57983682983683, | |
| "grad_norm": 0.34306755661964417, | |
| "learning_rate": 0.0005333562682215743, | |
| "loss": 3.5253, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 5.594405594405594, | |
| "grad_norm": 0.34549543261528015, | |
| "learning_rate": 0.0005331813411078716, | |
| "loss": 3.5349, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 5.608974358974359, | |
| "grad_norm": 0.3486803472042084, | |
| "learning_rate": 0.0005330064139941691, | |
| "loss": 3.518, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 5.623543123543124, | |
| "grad_norm": 0.3553147315979004, | |
| "learning_rate": 0.0005328314868804665, | |
| "loss": 3.5229, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 5.638111888111888, | |
| "grad_norm": 0.3389810025691986, | |
| "learning_rate": 0.0005326565597667638, | |
| "loss": 3.5184, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 5.652680652680653, | |
| "grad_norm": 0.3389154076576233, | |
| "learning_rate": 0.0005324816326530612, | |
| "loss": 3.5242, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 5.667249417249417, | |
| "grad_norm": 0.31988218426704407, | |
| "learning_rate": 0.0005323067055393586, | |
| "loss": 3.5365, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 5.681818181818182, | |
| "grad_norm": 0.32239192724227905, | |
| "learning_rate": 0.0005321317784256559, | |
| "loss": 3.5347, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 5.696386946386946, | |
| "grad_norm": 0.3520359694957733, | |
| "learning_rate": 0.0005319568513119533, | |
| "loss": 3.5332, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 5.7109557109557105, | |
| "grad_norm": 0.3352511525154114, | |
| "learning_rate": 0.0005317819241982506, | |
| "loss": 3.534, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 5.725524475524476, | |
| "grad_norm": 0.3281591236591339, | |
| "learning_rate": 0.0005316069970845481, | |
| "loss": 3.5274, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 5.74009324009324, | |
| "grad_norm": 0.33789217472076416, | |
| "learning_rate": 0.0005314320699708454, | |
| "loss": 3.5266, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 5.754662004662005, | |
| "grad_norm": 0.34207120537757874, | |
| "learning_rate": 0.0005312571428571428, | |
| "loss": 3.5315, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 5.769230769230769, | |
| "grad_norm": 0.351068913936615, | |
| "learning_rate": 0.0005310822157434403, | |
| "loss": 3.5341, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 5.783799533799534, | |
| "grad_norm": 0.3352493643760681, | |
| "learning_rate": 0.0005309072886297376, | |
| "loss": 3.53, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 5.798368298368298, | |
| "grad_norm": 0.327741801738739, | |
| "learning_rate": 0.0005307323615160349, | |
| "loss": 3.5304, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 5.812937062937063, | |
| "grad_norm": 0.32836633920669556, | |
| "learning_rate": 0.0005305574344023323, | |
| "loss": 3.5286, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 5.827505827505828, | |
| "grad_norm": 0.3504875600337982, | |
| "learning_rate": 0.0005303825072886296, | |
| "loss": 3.5384, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.827505827505828, | |
| "eval_accuracy": 0.3625647367105273, | |
| "eval_loss": 3.612804412841797, | |
| "eval_runtime": 180.4713, | |
| "eval_samples_per_second": 92.214, | |
| "eval_steps_per_second": 5.768, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.842074592074592, | |
| "grad_norm": 0.3540632426738739, | |
| "learning_rate": 0.0005302075801749271, | |
| "loss": 3.5366, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 5.856643356643357, | |
| "grad_norm": 0.34035322070121765, | |
| "learning_rate": 0.0005300326530612244, | |
| "loss": 3.5303, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 5.871212121212121, | |
| "grad_norm": 0.31729087233543396, | |
| "learning_rate": 0.0005298577259475218, | |
| "loss": 3.5335, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 5.8857808857808855, | |
| "grad_norm": 0.3735673427581787, | |
| "learning_rate": 0.0005296827988338193, | |
| "loss": 3.5277, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 5.90034965034965, | |
| "grad_norm": 0.314452201128006, | |
| "learning_rate": 0.0005295078717201166, | |
| "loss": 3.5406, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 5.914918414918415, | |
| "grad_norm": 0.3204086422920227, | |
| "learning_rate": 0.000529332944606414, | |
| "loss": 3.5359, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 5.92948717948718, | |
| "grad_norm": 0.3485746681690216, | |
| "learning_rate": 0.0005291580174927113, | |
| "loss": 3.5245, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 5.944055944055944, | |
| "grad_norm": 0.34968072175979614, | |
| "learning_rate": 0.0005289830903790087, | |
| "loss": 3.54, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 5.958624708624709, | |
| "grad_norm": 0.3806632161140442, | |
| "learning_rate": 0.0005288081632653061, | |
| "loss": 3.525, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 5.973193473193473, | |
| "grad_norm": 0.3304056227207184, | |
| "learning_rate": 0.0005286332361516034, | |
| "loss": 3.5232, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 5.9877622377622375, | |
| "grad_norm": 0.33363205194473267, | |
| "learning_rate": 0.0005284583090379008, | |
| "loss": 3.5174, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 6.002331002331002, | |
| "grad_norm": 0.3507980704307556, | |
| "learning_rate": 0.0005282833819241983, | |
| "loss": 3.5095, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 6.016899766899767, | |
| "grad_norm": 0.3389154374599457, | |
| "learning_rate": 0.0005281084548104956, | |
| "loss": 3.4025, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 6.031468531468532, | |
| "grad_norm": 0.33325284719467163, | |
| "learning_rate": 0.000527933527696793, | |
| "loss": 3.4253, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 6.046037296037296, | |
| "grad_norm": 0.34633567929267883, | |
| "learning_rate": 0.0005277586005830903, | |
| "loss": 3.4288, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 6.0606060606060606, | |
| "grad_norm": 0.33911773562431335, | |
| "learning_rate": 0.0005275836734693877, | |
| "loss": 3.4302, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 6.075174825174825, | |
| "grad_norm": 0.3277522027492523, | |
| "learning_rate": 0.0005274087463556851, | |
| "loss": 3.4381, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 6.089743589743589, | |
| "grad_norm": 0.3419731855392456, | |
| "learning_rate": 0.0005272338192419824, | |
| "loss": 3.4431, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 6.104312354312355, | |
| "grad_norm": 0.35028308629989624, | |
| "learning_rate": 0.0005270588921282798, | |
| "loss": 3.4435, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 6.118881118881119, | |
| "grad_norm": 0.3204551339149475, | |
| "learning_rate": 0.0005268839650145772, | |
| "loss": 3.4338, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.118881118881119, | |
| "eval_accuracy": 0.3627538228202005, | |
| "eval_loss": 3.615595579147339, | |
| "eval_runtime": 180.6199, | |
| "eval_samples_per_second": 92.138, | |
| "eval_steps_per_second": 5.763, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.133449883449884, | |
| "grad_norm": 0.3347219228744507, | |
| "learning_rate": 0.0005267090379008746, | |
| "loss": 3.4486, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 6.148018648018648, | |
| "grad_norm": 0.3284785747528076, | |
| "learning_rate": 0.000526534110787172, | |
| "loss": 3.4548, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 6.1625874125874125, | |
| "grad_norm": 0.33264586329460144, | |
| "learning_rate": 0.0005263591836734693, | |
| "loss": 3.4476, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 6.177156177156177, | |
| "grad_norm": 0.3285725712776184, | |
| "learning_rate": 0.0005261842565597668, | |
| "loss": 3.4675, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 6.191724941724941, | |
| "grad_norm": 0.3390142321586609, | |
| "learning_rate": 0.0005260093294460641, | |
| "loss": 3.455, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 6.206293706293707, | |
| "grad_norm": 0.33934858441352844, | |
| "learning_rate": 0.0005258344023323614, | |
| "loss": 3.4463, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 6.220862470862471, | |
| "grad_norm": 0.3672083914279938, | |
| "learning_rate": 0.0005256594752186588, | |
| "loss": 3.4512, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 6.235431235431236, | |
| "grad_norm": 0.3115769624710083, | |
| "learning_rate": 0.0005254845481049562, | |
| "loss": 3.4634, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 6.25, | |
| "grad_norm": 0.32785558700561523, | |
| "learning_rate": 0.0005253096209912536, | |
| "loss": 3.4688, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 6.264568764568764, | |
| "grad_norm": 0.3327209949493408, | |
| "learning_rate": 0.000525134693877551, | |
| "loss": 3.4517, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 6.279137529137529, | |
| "grad_norm": 0.34631094336509705, | |
| "learning_rate": 0.0005249597667638484, | |
| "loss": 3.4574, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 6.293706293706293, | |
| "grad_norm": 0.3532359004020691, | |
| "learning_rate": 0.0005247848396501458, | |
| "loss": 3.4656, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 6.308275058275059, | |
| "grad_norm": 0.36950933933258057, | |
| "learning_rate": 0.0005246099125364431, | |
| "loss": 3.4769, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 6.322843822843823, | |
| "grad_norm": 0.336834579706192, | |
| "learning_rate": 0.0005244349854227404, | |
| "loss": 3.4637, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 6.3374125874125875, | |
| "grad_norm": 0.30184629559516907, | |
| "learning_rate": 0.0005242600583090379, | |
| "loss": 3.4716, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 6.351981351981352, | |
| "grad_norm": 0.34009432792663574, | |
| "learning_rate": 0.0005240851311953352, | |
| "loss": 3.4698, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 6.366550116550116, | |
| "grad_norm": 0.32678115367889404, | |
| "learning_rate": 0.0005239102040816326, | |
| "loss": 3.4706, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 6.381118881118881, | |
| "grad_norm": 0.34370940923690796, | |
| "learning_rate": 0.00052373527696793, | |
| "loss": 3.4649, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 6.395687645687646, | |
| "grad_norm": 0.31767651438713074, | |
| "learning_rate": 0.0005235603498542274, | |
| "loss": 3.4903, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 6.410256410256411, | |
| "grad_norm": 0.35483428835868835, | |
| "learning_rate": 0.0005233854227405248, | |
| "loss": 3.4762, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.410256410256411, | |
| "eval_accuracy": 0.3631498688509091, | |
| "eval_loss": 3.6074860095977783, | |
| "eval_runtime": 180.0487, | |
| "eval_samples_per_second": 92.431, | |
| "eval_steps_per_second": 5.782, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.424825174825175, | |
| "grad_norm": 0.31931906938552856, | |
| "learning_rate": 0.0005232104956268221, | |
| "loss": 3.4758, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 6.4393939393939394, | |
| "grad_norm": 0.3227771818637848, | |
| "learning_rate": 0.0005230355685131195, | |
| "loss": 3.4678, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 6.453962703962704, | |
| "grad_norm": 0.35156136751174927, | |
| "learning_rate": 0.0005228606413994169, | |
| "loss": 3.4803, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 6.468531468531468, | |
| "grad_norm": 0.33394086360931396, | |
| "learning_rate": 0.0005226857142857142, | |
| "loss": 3.471, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 6.483100233100233, | |
| "grad_norm": 0.3395681381225586, | |
| "learning_rate": 0.0005225107871720116, | |
| "loss": 3.4759, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 6.497668997668998, | |
| "grad_norm": 0.32322457432746887, | |
| "learning_rate": 0.0005223358600583089, | |
| "loss": 3.48, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 6.5122377622377625, | |
| "grad_norm": 0.32809075713157654, | |
| "learning_rate": 0.0005221609329446064, | |
| "loss": 3.4774, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 6.526806526806527, | |
| "grad_norm": 0.32868528366088867, | |
| "learning_rate": 0.0005219860058309038, | |
| "loss": 3.4811, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 6.541375291375291, | |
| "grad_norm": 0.33489176630973816, | |
| "learning_rate": 0.0005218110787172011, | |
| "loss": 3.4916, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 6.555944055944056, | |
| "grad_norm": 0.3436543941497803, | |
| "learning_rate": 0.0005216361516034985, | |
| "loss": 3.4859, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 6.57051282051282, | |
| "grad_norm": 0.3015133738517761, | |
| "learning_rate": 0.0005214612244897959, | |
| "loss": 3.4779, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 6.585081585081585, | |
| "grad_norm": 0.3797510862350464, | |
| "learning_rate": 0.0005212862973760932, | |
| "loss": 3.4846, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 6.59965034965035, | |
| "grad_norm": 0.327371209859848, | |
| "learning_rate": 0.0005211113702623906, | |
| "loss": 3.4941, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 6.6142191142191145, | |
| "grad_norm": 0.3728986084461212, | |
| "learning_rate": 0.0005209364431486879, | |
| "loss": 3.4986, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 6.628787878787879, | |
| "grad_norm": 0.3234831988811493, | |
| "learning_rate": 0.0005207615160349854, | |
| "loss": 3.4824, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 6.643356643356643, | |
| "grad_norm": 0.3303401470184326, | |
| "learning_rate": 0.0005205865889212828, | |
| "loss": 3.4857, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 6.657925407925408, | |
| "grad_norm": 0.3562447726726532, | |
| "learning_rate": 0.0005204116618075801, | |
| "loss": 3.4825, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 6.672494172494172, | |
| "grad_norm": 0.3363456428050995, | |
| "learning_rate": 0.0005202367346938776, | |
| "loss": 3.4786, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 6.687062937062937, | |
| "grad_norm": 0.337936669588089, | |
| "learning_rate": 0.0005200618075801749, | |
| "loss": 3.4894, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 6.701631701631702, | |
| "grad_norm": 0.34164348244667053, | |
| "learning_rate": 0.0005198868804664723, | |
| "loss": 3.4815, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.701631701631702, | |
| "eval_accuracy": 0.3637440554878363, | |
| "eval_loss": 3.6008543968200684, | |
| "eval_runtime": 180.4988, | |
| "eval_samples_per_second": 92.2, | |
| "eval_steps_per_second": 5.767, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.716200466200466, | |
| "grad_norm": 0.3702085018157959, | |
| "learning_rate": 0.0005197119533527696, | |
| "loss": 3.4993, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 6.730769230769231, | |
| "grad_norm": 0.33993563055992126, | |
| "learning_rate": 0.000519537026239067, | |
| "loss": 3.4772, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 6.745337995337995, | |
| "grad_norm": 0.33401525020599365, | |
| "learning_rate": 0.0005193620991253644, | |
| "loss": 3.4976, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 6.75990675990676, | |
| "grad_norm": 0.37840354442596436, | |
| "learning_rate": 0.0005191871720116618, | |
| "loss": 3.4828, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 6.774475524475524, | |
| "grad_norm": 0.3243924379348755, | |
| "learning_rate": 0.0005190122448979591, | |
| "loss": 3.4938, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 6.7890442890442895, | |
| "grad_norm": 0.3309505581855774, | |
| "learning_rate": 0.0005188373177842566, | |
| "loss": 3.4723, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 6.803613053613054, | |
| "grad_norm": 0.35153377056121826, | |
| "learning_rate": 0.0005186623906705539, | |
| "loss": 3.4872, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 6.818181818181818, | |
| "grad_norm": 0.3381296396255493, | |
| "learning_rate": 0.0005184874635568513, | |
| "loss": 3.4899, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 6.832750582750583, | |
| "grad_norm": 0.3551500737667084, | |
| "learning_rate": 0.0005183125364431486, | |
| "loss": 3.4895, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 6.847319347319347, | |
| "grad_norm": 0.33850058913230896, | |
| "learning_rate": 0.000518137609329446, | |
| "loss": 3.4793, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 6.861888111888112, | |
| "grad_norm": 0.3279431164264679, | |
| "learning_rate": 0.0005179626822157434, | |
| "loss": 3.4967, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 6.876456876456876, | |
| "grad_norm": 0.3145736753940582, | |
| "learning_rate": 0.0005177877551020407, | |
| "loss": 3.5046, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 6.891025641025641, | |
| "grad_norm": 0.3533722162246704, | |
| "learning_rate": 0.0005176128279883381, | |
| "loss": 3.4892, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 6.905594405594406, | |
| "grad_norm": 0.3434518575668335, | |
| "learning_rate": 0.0005174379008746356, | |
| "loss": 3.4818, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 6.92016317016317, | |
| "grad_norm": 0.30422964692115784, | |
| "learning_rate": 0.0005172629737609329, | |
| "loss": 3.4961, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 6.934731934731935, | |
| "grad_norm": 0.34872138500213623, | |
| "learning_rate": 0.0005170880466472303, | |
| "loss": 3.4941, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 6.949300699300699, | |
| "grad_norm": 0.3359842598438263, | |
| "learning_rate": 0.0005169131195335276, | |
| "loss": 3.4905, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 6.963869463869464, | |
| "grad_norm": 0.3362923264503479, | |
| "learning_rate": 0.0005167381924198251, | |
| "loss": 3.4967, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 6.978438228438229, | |
| "grad_norm": 0.33967387676239014, | |
| "learning_rate": 0.0005165632653061224, | |
| "loss": 3.4997, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 6.993006993006993, | |
| "grad_norm": 0.326475590467453, | |
| "learning_rate": 0.0005163883381924197, | |
| "loss": 3.4942, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.993006993006993, | |
| "eval_accuracy": 0.36491220313304396, | |
| "eval_loss": 3.5894508361816406, | |
| "eval_runtime": 180.3515, | |
| "eval_samples_per_second": 92.275, | |
| "eval_steps_per_second": 5.772, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 7.007575757575758, | |
| "grad_norm": 0.35610419511795044, | |
| "learning_rate": 0.0005162134110787171, | |
| "loss": 3.4349, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 7.022144522144522, | |
| "grad_norm": 0.3531475067138672, | |
| "learning_rate": 0.0005160384839650146, | |
| "loss": 3.3823, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 7.036713286713287, | |
| "grad_norm": 0.3476791977882385, | |
| "learning_rate": 0.0005158635568513119, | |
| "loss": 3.4016, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 7.051282051282051, | |
| "grad_norm": 0.35229551792144775, | |
| "learning_rate": 0.0005156886297376093, | |
| "loss": 3.3868, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 7.0658508158508155, | |
| "grad_norm": 0.3391362428665161, | |
| "learning_rate": 0.0005155137026239066, | |
| "loss": 3.3928, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 7.08041958041958, | |
| "grad_norm": 0.34460726380348206, | |
| "learning_rate": 0.0005153387755102041, | |
| "loss": 3.3913, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 7.094988344988345, | |
| "grad_norm": 0.35683906078338623, | |
| "learning_rate": 0.0005151638483965014, | |
| "loss": 3.3972, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 7.10955710955711, | |
| "grad_norm": 0.3500906825065613, | |
| "learning_rate": 0.0005149889212827987, | |
| "loss": 3.4121, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 7.124125874125874, | |
| "grad_norm": 0.32340437173843384, | |
| "learning_rate": 0.0005148139941690961, | |
| "loss": 3.4042, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 7.138694638694639, | |
| "grad_norm": 0.36307796835899353, | |
| "learning_rate": 0.0005146390670553936, | |
| "loss": 3.4152, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 7.153263403263403, | |
| "grad_norm": 0.35622280836105347, | |
| "learning_rate": 0.0005144641399416909, | |
| "loss": 3.4038, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 7.1678321678321675, | |
| "grad_norm": 0.34201404452323914, | |
| "learning_rate": 0.0005142892128279883, | |
| "loss": 3.413, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 7.182400932400933, | |
| "grad_norm": 0.3477611243724823, | |
| "learning_rate": 0.0005141142857142856, | |
| "loss": 3.4147, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 7.196969696969697, | |
| "grad_norm": 0.3193877339363098, | |
| "learning_rate": 0.0005139393586005831, | |
| "loss": 3.4349, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 7.211538461538462, | |
| "grad_norm": 0.3370342254638672, | |
| "learning_rate": 0.0005137644314868804, | |
| "loss": 3.4269, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 7.226107226107226, | |
| "grad_norm": 0.35344481468200684, | |
| "learning_rate": 0.0005135895043731778, | |
| "loss": 3.4046, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 7.2406759906759905, | |
| "grad_norm": 0.3530924320220947, | |
| "learning_rate": 0.0005134145772594752, | |
| "loss": 3.4115, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 7.255244755244755, | |
| "grad_norm": 0.3493140935897827, | |
| "learning_rate": 0.0005132396501457726, | |
| "loss": 3.4237, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 7.269813519813519, | |
| "grad_norm": 0.33685219287872314, | |
| "learning_rate": 0.0005130647230320699, | |
| "loss": 3.4313, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 7.284382284382285, | |
| "grad_norm": 0.3504573702812195, | |
| "learning_rate": 0.0005128897959183673, | |
| "loss": 3.4237, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.284382284382285, | |
| "eval_accuracy": 0.3649218455839104, | |
| "eval_loss": 3.599851369857788, | |
| "eval_runtime": 180.4257, | |
| "eval_samples_per_second": 92.237, | |
| "eval_steps_per_second": 5.77, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.298951048951049, | |
| "grad_norm": 0.34710603952407837, | |
| "learning_rate": 0.0005127148688046647, | |
| "loss": 3.4347, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 7.313519813519814, | |
| "grad_norm": 0.3456078767776489, | |
| "learning_rate": 0.0005125399416909621, | |
| "loss": 3.4325, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 7.328088578088578, | |
| "grad_norm": 0.36139947175979614, | |
| "learning_rate": 0.0005123650145772594, | |
| "loss": 3.4531, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 7.3426573426573425, | |
| "grad_norm": 0.3331305980682373, | |
| "learning_rate": 0.0005121900874635568, | |
| "loss": 3.4372, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 7.357226107226107, | |
| "grad_norm": 0.3419002294540405, | |
| "learning_rate": 0.0005120151603498543, | |
| "loss": 3.4222, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 7.371794871794872, | |
| "grad_norm": 0.37077078223228455, | |
| "learning_rate": 0.0005118402332361515, | |
| "loss": 3.438, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 7.386363636363637, | |
| "grad_norm": 0.37061864137649536, | |
| "learning_rate": 0.0005116653061224489, | |
| "loss": 3.4384, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 7.400932400932401, | |
| "grad_norm": 0.33451831340789795, | |
| "learning_rate": 0.0005114903790087463, | |
| "loss": 3.4323, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 7.415501165501166, | |
| "grad_norm": 0.36487630009651184, | |
| "learning_rate": 0.0005113154518950437, | |
| "loss": 3.4338, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 7.43006993006993, | |
| "grad_norm": 0.34303170442581177, | |
| "learning_rate": 0.0005111405247813411, | |
| "loss": 3.446, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 7.444638694638694, | |
| "grad_norm": 0.3491624593734741, | |
| "learning_rate": 0.0005109655976676384, | |
| "loss": 3.4407, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 7.459207459207459, | |
| "grad_norm": 0.3570358455181122, | |
| "learning_rate": 0.0005107906705539358, | |
| "loss": 3.4499, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 7.473776223776224, | |
| "grad_norm": 0.3398280739784241, | |
| "learning_rate": 0.0005106157434402332, | |
| "loss": 3.438, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 7.488344988344989, | |
| "grad_norm": 0.3448866307735443, | |
| "learning_rate": 0.0005104408163265306, | |
| "loss": 3.4396, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 7.502913752913753, | |
| "grad_norm": 0.35469329357147217, | |
| "learning_rate": 0.0005102658892128279, | |
| "loss": 3.4361, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 7.5174825174825175, | |
| "grad_norm": 0.35180532932281494, | |
| "learning_rate": 0.0005100909620991253, | |
| "loss": 3.4589, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 7.532051282051282, | |
| "grad_norm": 0.3383461833000183, | |
| "learning_rate": 0.0005099160349854227, | |
| "loss": 3.446, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 7.546620046620046, | |
| "grad_norm": 0.35350677371025085, | |
| "learning_rate": 0.0005097411078717201, | |
| "loss": 3.4507, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 7.561188811188811, | |
| "grad_norm": 0.3186721205711365, | |
| "learning_rate": 0.0005095661807580174, | |
| "loss": 3.4341, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 7.575757575757576, | |
| "grad_norm": 0.3171408474445343, | |
| "learning_rate": 0.0005093912536443149, | |
| "loss": 3.4501, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.575757575757576, | |
| "eval_accuracy": 0.3657292244576768, | |
| "eval_loss": 3.5900423526763916, | |
| "eval_runtime": 180.2145, | |
| "eval_samples_per_second": 92.346, | |
| "eval_steps_per_second": 5.776, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.590326340326341, | |
| "grad_norm": 0.35610276460647583, | |
| "learning_rate": 0.0005092163265306122, | |
| "loss": 3.4641, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 7.604895104895105, | |
| "grad_norm": 0.37525665760040283, | |
| "learning_rate": 0.0005090413994169096, | |
| "loss": 3.4647, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 7.619463869463869, | |
| "grad_norm": 0.33461683988571167, | |
| "learning_rate": 0.000508866472303207, | |
| "loss": 3.4597, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 7.634032634032634, | |
| "grad_norm": 0.3235708773136139, | |
| "learning_rate": 0.0005086915451895044, | |
| "loss": 3.4497, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 7.648601398601398, | |
| "grad_norm": 0.34801435470581055, | |
| "learning_rate": 0.0005085166180758017, | |
| "loss": 3.4504, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 7.663170163170163, | |
| "grad_norm": 0.32955285906791687, | |
| "learning_rate": 0.0005083416909620991, | |
| "loss": 3.449, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 7.677738927738928, | |
| "grad_norm": 0.3284403383731842, | |
| "learning_rate": 0.0005081667638483964, | |
| "loss": 3.4546, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 7.6923076923076925, | |
| "grad_norm": 0.32493704557418823, | |
| "learning_rate": 0.0005079918367346939, | |
| "loss": 3.4384, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 7.706876456876457, | |
| "grad_norm": 0.34628820419311523, | |
| "learning_rate": 0.0005078169096209912, | |
| "loss": 3.4534, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 7.721445221445221, | |
| "grad_norm": 0.33644041419029236, | |
| "learning_rate": 0.0005076419825072886, | |
| "loss": 3.4552, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 7.736013986013986, | |
| "grad_norm": 0.34094300866127014, | |
| "learning_rate": 0.000507467055393586, | |
| "loss": 3.4536, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 7.75058275058275, | |
| "grad_norm": 0.34397369623184204, | |
| "learning_rate": 0.0005072921282798834, | |
| "loss": 3.4635, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 7.765151515151516, | |
| "grad_norm": 0.3402233123779297, | |
| "learning_rate": 0.0005071172011661807, | |
| "loss": 3.4708, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 7.77972027972028, | |
| "grad_norm": 0.3712950050830841, | |
| "learning_rate": 0.0005069422740524781, | |
| "loss": 3.4713, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 7.7942890442890445, | |
| "grad_norm": 0.3284025490283966, | |
| "learning_rate": 0.0005067673469387754, | |
| "loss": 3.4556, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 7.808857808857809, | |
| "grad_norm": 0.34438565373420715, | |
| "learning_rate": 0.0005065924198250729, | |
| "loss": 3.4744, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 7.823426573426573, | |
| "grad_norm": 0.33172059059143066, | |
| "learning_rate": 0.0005064174927113702, | |
| "loss": 3.4534, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 7.837995337995338, | |
| "grad_norm": 0.3375876843929291, | |
| "learning_rate": 0.0005062425655976676, | |
| "loss": 3.4484, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 7.852564102564102, | |
| "grad_norm": 0.3456272780895233, | |
| "learning_rate": 0.0005060676384839649, | |
| "loss": 3.4474, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 7.867132867132867, | |
| "grad_norm": 0.3476708233356476, | |
| "learning_rate": 0.0005058927113702624, | |
| "loss": 3.4529, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.867132867132867, | |
| "eval_accuracy": 0.36633469981756955, | |
| "eval_loss": 3.5783565044403076, | |
| "eval_runtime": 182.5033, | |
| "eval_samples_per_second": 91.187, | |
| "eval_steps_per_second": 5.704, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.881701631701632, | |
| "grad_norm": 0.357236385345459, | |
| "learning_rate": 0.0005057177842565598, | |
| "loss": 3.4582, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 7.896270396270396, | |
| "grad_norm": 0.3404090404510498, | |
| "learning_rate": 0.0005055428571428571, | |
| "loss": 3.4633, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 7.910839160839161, | |
| "grad_norm": 0.341049462556839, | |
| "learning_rate": 0.0005053679300291544, | |
| "loss": 3.4626, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 7.925407925407925, | |
| "grad_norm": 0.321346640586853, | |
| "learning_rate": 0.0005051930029154519, | |
| "loss": 3.4529, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 7.93997668997669, | |
| "grad_norm": 0.31583067774772644, | |
| "learning_rate": 0.0005050180758017492, | |
| "loss": 3.4681, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 7.954545454545455, | |
| "grad_norm": 0.36198437213897705, | |
| "learning_rate": 0.0005048431486880466, | |
| "loss": 3.452, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 7.9691142191142195, | |
| "grad_norm": 0.34580230712890625, | |
| "learning_rate": 0.0005046682215743439, | |
| "loss": 3.4541, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 7.983682983682984, | |
| "grad_norm": 0.3525956869125366, | |
| "learning_rate": 0.0005044932944606414, | |
| "loss": 3.4677, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 7.998251748251748, | |
| "grad_norm": 0.312714546918869, | |
| "learning_rate": 0.0005043183673469388, | |
| "loss": 3.453, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 8.012820512820513, | |
| "grad_norm": 0.3349739909172058, | |
| "learning_rate": 0.0005041434402332361, | |
| "loss": 3.3605, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 8.027389277389277, | |
| "grad_norm": 0.35149267315864563, | |
| "learning_rate": 0.0005039685131195334, | |
| "loss": 3.3568, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 8.041958041958042, | |
| "grad_norm": 0.35762932896614075, | |
| "learning_rate": 0.0005037935860058309, | |
| "loss": 3.3594, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 8.056526806526806, | |
| "grad_norm": 0.3227376341819763, | |
| "learning_rate": 0.0005036186588921282, | |
| "loss": 3.3618, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 8.07109557109557, | |
| "grad_norm": 0.3321167826652527, | |
| "learning_rate": 0.0005034437317784256, | |
| "loss": 3.3757, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 8.085664335664335, | |
| "grad_norm": 0.34823182225227356, | |
| "learning_rate": 0.000503268804664723, | |
| "loss": 3.368, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 8.1002331002331, | |
| "grad_norm": 0.3144749701023102, | |
| "learning_rate": 0.0005030938775510204, | |
| "loss": 3.3762, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 8.114801864801866, | |
| "grad_norm": 0.33065780997276306, | |
| "learning_rate": 0.0005029189504373178, | |
| "loss": 3.3705, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 8.12937062937063, | |
| "grad_norm": 0.3569163382053375, | |
| "learning_rate": 0.0005027440233236151, | |
| "loss": 3.3917, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 8.143939393939394, | |
| "grad_norm": 0.336088091135025, | |
| "learning_rate": 0.0005025690962099126, | |
| "loss": 3.387, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 8.158508158508159, | |
| "grad_norm": 0.31934666633605957, | |
| "learning_rate": 0.0005023941690962099, | |
| "loss": 3.3725, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.158508158508159, | |
| "eval_accuracy": 0.3663358757262118, | |
| "eval_loss": 3.590759754180908, | |
| "eval_runtime": 182.8908, | |
| "eval_samples_per_second": 90.994, | |
| "eval_steps_per_second": 5.692, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.173076923076923, | |
| "grad_norm": 0.36414483189582825, | |
| "learning_rate": 0.0005022192419825072, | |
| "loss": 3.3922, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 8.187645687645688, | |
| "grad_norm": 0.3432634472846985, | |
| "learning_rate": 0.0005020443148688046, | |
| "loss": 3.3869, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 8.202214452214452, | |
| "grad_norm": 0.34101763367652893, | |
| "learning_rate": 0.000501869387755102, | |
| "loss": 3.3947, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 8.216783216783217, | |
| "grad_norm": 0.36927416920661926, | |
| "learning_rate": 0.0005016944606413994, | |
| "loss": 3.3831, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 8.231351981351981, | |
| "grad_norm": 0.3365326523780823, | |
| "learning_rate": 0.0005015195335276967, | |
| "loss": 3.3901, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 8.245920745920746, | |
| "grad_norm": 0.36989808082580566, | |
| "learning_rate": 0.0005013446064139941, | |
| "loss": 3.3948, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 8.26048951048951, | |
| "grad_norm": 0.3689005374908447, | |
| "learning_rate": 0.0005011696793002916, | |
| "loss": 3.3917, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 8.275058275058274, | |
| "grad_norm": 0.3656901717185974, | |
| "learning_rate": 0.0005009947521865889, | |
| "loss": 3.3954, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 8.289627039627039, | |
| "grad_norm": 0.3664736747741699, | |
| "learning_rate": 0.0005008198250728862, | |
| "loss": 3.4009, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 8.304195804195805, | |
| "grad_norm": 0.3412057161331177, | |
| "learning_rate": 0.0005006448979591836, | |
| "loss": 3.3916, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 8.31876456876457, | |
| "grad_norm": 0.37343958020210266, | |
| "learning_rate": 0.000500469970845481, | |
| "loss": 3.3981, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 8.333333333333334, | |
| "grad_norm": 0.40472412109375, | |
| "learning_rate": 0.0005002950437317784, | |
| "loss": 3.4063, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 8.347902097902098, | |
| "grad_norm": 0.33591440320014954, | |
| "learning_rate": 0.0005001201166180757, | |
| "loss": 3.3988, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 8.362470862470863, | |
| "grad_norm": 0.3387737572193146, | |
| "learning_rate": 0.0004999451895043731, | |
| "loss": 3.4103, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 8.377039627039627, | |
| "grad_norm": 0.3714272975921631, | |
| "learning_rate": 0.0004997702623906706, | |
| "loss": 3.4127, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 8.391608391608392, | |
| "grad_norm": 0.34964922070503235, | |
| "learning_rate": 0.0004995953352769679, | |
| "loss": 3.409, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 8.406177156177156, | |
| "grad_norm": 0.34254536032676697, | |
| "learning_rate": 0.0004994204081632653, | |
| "loss": 3.4076, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 8.42074592074592, | |
| "grad_norm": 0.34269341826438904, | |
| "learning_rate": 0.0004992454810495626, | |
| "loss": 3.3974, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 8.435314685314685, | |
| "grad_norm": 0.32962408661842346, | |
| "learning_rate": 0.00049907055393586, | |
| "loss": 3.3982, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 8.44988344988345, | |
| "grad_norm": 0.37458404898643494, | |
| "learning_rate": 0.0004988956268221574, | |
| "loss": 3.4032, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.44988344988345, | |
| "eval_accuracy": 0.3665615325946589, | |
| "eval_loss": 3.5825419425964355, | |
| "eval_runtime": 180.8696, | |
| "eval_samples_per_second": 92.011, | |
| "eval_steps_per_second": 5.756, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.464452214452214, | |
| "grad_norm": 0.34700310230255127, | |
| "learning_rate": 0.0004987206997084547, | |
| "loss": 3.4302, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 8.479020979020978, | |
| "grad_norm": 0.3363369405269623, | |
| "learning_rate": 0.0004985457725947521, | |
| "loss": 3.4196, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 8.493589743589745, | |
| "grad_norm": 0.34493017196655273, | |
| "learning_rate": 0.0004983708454810496, | |
| "loss": 3.4233, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 8.508158508158509, | |
| "grad_norm": 0.3357371389865875, | |
| "learning_rate": 0.0004981959183673469, | |
| "loss": 3.4124, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 8.522727272727273, | |
| "grad_norm": 0.3642560541629791, | |
| "learning_rate": 0.0004980209912536443, | |
| "loss": 3.4161, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 8.537296037296038, | |
| "grad_norm": 0.3482314944267273, | |
| "learning_rate": 0.0004978460641399417, | |
| "loss": 3.4204, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 8.551864801864802, | |
| "grad_norm": 0.3307981491088867, | |
| "learning_rate": 0.000497671137026239, | |
| "loss": 3.4275, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 8.566433566433567, | |
| "grad_norm": 0.3394106924533844, | |
| "learning_rate": 0.0004974962099125364, | |
| "loss": 3.4057, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 8.581002331002331, | |
| "grad_norm": 0.3372842073440552, | |
| "learning_rate": 0.0004973212827988337, | |
| "loss": 3.4092, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 8.595571095571096, | |
| "grad_norm": 0.32758432626724243, | |
| "learning_rate": 0.0004971463556851312, | |
| "loss": 3.4188, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 8.61013986013986, | |
| "grad_norm": 0.3386209309101105, | |
| "learning_rate": 0.0004969714285714286, | |
| "loss": 3.4205, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 8.624708624708624, | |
| "grad_norm": 0.3470524549484253, | |
| "learning_rate": 0.0004967965014577259, | |
| "loss": 3.4182, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 8.639277389277389, | |
| "grad_norm": 0.3339594900608063, | |
| "learning_rate": 0.0004966215743440233, | |
| "loss": 3.4176, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 8.653846153846153, | |
| "grad_norm": 0.3515772223472595, | |
| "learning_rate": 0.0004964466472303207, | |
| "loss": 3.4288, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 8.668414918414918, | |
| "grad_norm": 0.35019543766975403, | |
| "learning_rate": 0.000496271720116618, | |
| "loss": 3.4287, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 8.682983682983682, | |
| "grad_norm": 0.3279973566532135, | |
| "learning_rate": 0.0004960967930029154, | |
| "loss": 3.4211, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 8.697552447552448, | |
| "grad_norm": 0.33548232913017273, | |
| "learning_rate": 0.0004959218658892127, | |
| "loss": 3.4308, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 8.712121212121213, | |
| "grad_norm": 0.3599195182323456, | |
| "learning_rate": 0.0004957469387755102, | |
| "loss": 3.4228, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 8.726689976689977, | |
| "grad_norm": 0.34652629494667053, | |
| "learning_rate": 0.0004955720116618075, | |
| "loss": 3.4285, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 8.741258741258742, | |
| "grad_norm": 0.332381933927536, | |
| "learning_rate": 0.0004953970845481049, | |
| "loss": 3.4275, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.741258741258742, | |
| "eval_accuracy": 0.36736185601657184, | |
| "eval_loss": 3.5740106105804443, | |
| "eval_runtime": 182.2015, | |
| "eval_samples_per_second": 91.338, | |
| "eval_steps_per_second": 5.713, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.755827505827506, | |
| "grad_norm": 0.3524293601512909, | |
| "learning_rate": 0.0004952221574344023, | |
| "loss": 3.4245, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 8.77039627039627, | |
| "grad_norm": 0.33680975437164307, | |
| "learning_rate": 0.0004950472303206997, | |
| "loss": 3.4345, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 8.784965034965035, | |
| "grad_norm": 0.34272924065589905, | |
| "learning_rate": 0.0004948723032069971, | |
| "loss": 3.4335, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 8.7995337995338, | |
| "grad_norm": 0.3409082591533661, | |
| "learning_rate": 0.0004946973760932944, | |
| "loss": 3.4283, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 8.814102564102564, | |
| "grad_norm": 0.36862286925315857, | |
| "learning_rate": 0.0004945224489795917, | |
| "loss": 3.4235, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 8.828671328671328, | |
| "grad_norm": 0.3254280388355255, | |
| "learning_rate": 0.0004943475218658892, | |
| "loss": 3.4312, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 8.843240093240093, | |
| "grad_norm": 0.3392513394355774, | |
| "learning_rate": 0.0004941725947521865, | |
| "loss": 3.4257, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 8.857808857808857, | |
| "grad_norm": 0.3554167151451111, | |
| "learning_rate": 0.0004939976676384839, | |
| "loss": 3.4304, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 8.872377622377622, | |
| "grad_norm": 0.35996973514556885, | |
| "learning_rate": 0.0004938227405247813, | |
| "loss": 3.4399, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 8.886946386946388, | |
| "grad_norm": 0.36442074179649353, | |
| "learning_rate": 0.0004936478134110787, | |
| "loss": 3.4316, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 8.901515151515152, | |
| "grad_norm": 0.36240333318710327, | |
| "learning_rate": 0.0004934728862973761, | |
| "loss": 3.4262, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 8.916083916083917, | |
| "grad_norm": 0.33148348331451416, | |
| "learning_rate": 0.0004932979591836734, | |
| "loss": 3.4361, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 8.930652680652681, | |
| "grad_norm": 0.3203504681587219, | |
| "learning_rate": 0.0004931230320699707, | |
| "loss": 3.4466, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 8.945221445221446, | |
| "grad_norm": 0.357393741607666, | |
| "learning_rate": 0.0004929481049562682, | |
| "loss": 3.4402, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 8.95979020979021, | |
| "grad_norm": 0.36473795771598816, | |
| "learning_rate": 0.0004927731778425655, | |
| "loss": 3.4386, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 8.974358974358974, | |
| "grad_norm": 0.31827130913734436, | |
| "learning_rate": 0.0004925982507288629, | |
| "loss": 3.4416, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 8.988927738927739, | |
| "grad_norm": 0.35165274143218994, | |
| "learning_rate": 0.0004924233236151604, | |
| "loss": 3.4313, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 9.003496503496503, | |
| "grad_norm": 0.3829018771648407, | |
| "learning_rate": 0.0004922483965014577, | |
| "loss": 3.4134, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 9.018065268065268, | |
| "grad_norm": 0.3232770264148712, | |
| "learning_rate": 0.0004920734693877551, | |
| "loss": 3.3232, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 9.032634032634032, | |
| "grad_norm": 0.33512336015701294, | |
| "learning_rate": 0.0004918985422740524, | |
| "loss": 3.3293, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.032634032634032, | |
| "eval_accuracy": 0.3672867154543323, | |
| "eval_loss": 3.579836130142212, | |
| "eval_runtime": 181.2154, | |
| "eval_samples_per_second": 91.835, | |
| "eval_steps_per_second": 5.745, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.047202797202797, | |
| "grad_norm": 0.35247862339019775, | |
| "learning_rate": 0.0004917236151603499, | |
| "loss": 3.323, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 9.061771561771561, | |
| "grad_norm": 0.33538663387298584, | |
| "learning_rate": 0.0004915486880466472, | |
| "loss": 3.3462, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 9.076340326340326, | |
| "grad_norm": 0.3494170010089874, | |
| "learning_rate": 0.0004913737609329445, | |
| "loss": 3.3328, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 9.090909090909092, | |
| "grad_norm": 0.35296133160591125, | |
| "learning_rate": 0.0004911988338192419, | |
| "loss": 3.3354, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 9.105477855477856, | |
| "grad_norm": 0.3609370291233063, | |
| "learning_rate": 0.0004910239067055393, | |
| "loss": 3.3555, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 9.12004662004662, | |
| "grad_norm": 0.3352583050727844, | |
| "learning_rate": 0.0004908489795918367, | |
| "loss": 3.3444, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 9.134615384615385, | |
| "grad_norm": 0.3525612950325012, | |
| "learning_rate": 0.0004906740524781341, | |
| "loss": 3.3484, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 9.14918414918415, | |
| "grad_norm": 0.37619081139564514, | |
| "learning_rate": 0.0004904991253644314, | |
| "loss": 3.3576, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 9.163752913752914, | |
| "grad_norm": 0.3352401852607727, | |
| "learning_rate": 0.0004903241982507289, | |
| "loss": 3.3542, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 9.178321678321678, | |
| "grad_norm": 0.3672662675380707, | |
| "learning_rate": 0.0004901492711370262, | |
| "loss": 3.3445, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 9.192890442890443, | |
| "grad_norm": 0.36354750394821167, | |
| "learning_rate": 0.0004899743440233235, | |
| "loss": 3.3598, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 9.207459207459207, | |
| "grad_norm": 0.332333505153656, | |
| "learning_rate": 0.0004897994169096209, | |
| "loss": 3.3689, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 9.222027972027972, | |
| "grad_norm": 0.35484346747398376, | |
| "learning_rate": 0.0004896244897959183, | |
| "loss": 3.3683, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 9.236596736596736, | |
| "grad_norm": 0.3454098403453827, | |
| "learning_rate": 0.0004894495626822157, | |
| "loss": 3.3646, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 9.2511655011655, | |
| "grad_norm": 0.34342116117477417, | |
| "learning_rate": 0.0004892746355685131, | |
| "loss": 3.3622, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 9.265734265734265, | |
| "grad_norm": 0.3866739273071289, | |
| "learning_rate": 0.0004890997084548104, | |
| "loss": 3.3679, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 9.280303030303031, | |
| "grad_norm": 0.3526863753795624, | |
| "learning_rate": 0.0004889247813411079, | |
| "loss": 3.3813, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 9.294871794871796, | |
| "grad_norm": 0.36674419045448303, | |
| "learning_rate": 0.0004887498542274052, | |
| "loss": 3.3658, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 9.30944055944056, | |
| "grad_norm": 0.36833953857421875, | |
| "learning_rate": 0.0004885749271137026, | |
| "loss": 3.3734, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 9.324009324009324, | |
| "grad_norm": 0.3428957462310791, | |
| "learning_rate": 0.0004883999999999999, | |
| "loss": 3.3803, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.324009324009324, | |
| "eval_accuracy": 0.36745534075363046, | |
| "eval_loss": 3.579030990600586, | |
| "eval_runtime": 180.7616, | |
| "eval_samples_per_second": 92.066, | |
| "eval_steps_per_second": 5.759, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.338578088578089, | |
| "grad_norm": 0.3581482172012329, | |
| "learning_rate": 0.0004882250728862973, | |
| "loss": 3.3766, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 9.353146853146853, | |
| "grad_norm": 0.3714257478713989, | |
| "learning_rate": 0.0004880501457725947, | |
| "loss": 3.3814, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 9.367715617715618, | |
| "grad_norm": 0.361931174993515, | |
| "learning_rate": 0.00048787521865889207, | |
| "loss": 3.3851, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 9.382284382284382, | |
| "grad_norm": 0.3409428596496582, | |
| "learning_rate": 0.00048770029154518945, | |
| "loss": 3.384, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 9.396853146853147, | |
| "grad_norm": 0.40810930728912354, | |
| "learning_rate": 0.0004875253644314868, | |
| "loss": 3.3816, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 9.411421911421911, | |
| "grad_norm": 0.3254898190498352, | |
| "learning_rate": 0.0004873504373177842, | |
| "loss": 3.381, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 9.425990675990676, | |
| "grad_norm": 0.354233056306839, | |
| "learning_rate": 0.00048717551020408163, | |
| "loss": 3.3847, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 9.44055944055944, | |
| "grad_norm": 0.3318980038166046, | |
| "learning_rate": 0.000487000583090379, | |
| "loss": 3.3792, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 9.455128205128204, | |
| "grad_norm": 0.32618919014930725, | |
| "learning_rate": 0.00048682565597667633, | |
| "loss": 3.3899, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 9.469696969696969, | |
| "grad_norm": 0.39949190616607666, | |
| "learning_rate": 0.0004866507288629737, | |
| "loss": 3.3837, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 9.484265734265735, | |
| "grad_norm": 0.3685564398765564, | |
| "learning_rate": 0.0004864758017492711, | |
| "loss": 3.3851, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 9.4988344988345, | |
| "grad_norm": 0.359235018491745, | |
| "learning_rate": 0.00048630087463556845, | |
| "loss": 3.3893, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 9.513403263403264, | |
| "grad_norm": 0.33161383867263794, | |
| "learning_rate": 0.00048612594752186583, | |
| "loss": 3.4009, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 9.527972027972028, | |
| "grad_norm": 0.3646078109741211, | |
| "learning_rate": 0.0004859510204081632, | |
| "loss": 3.4062, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 9.542540792540793, | |
| "grad_norm": 0.32304298877716064, | |
| "learning_rate": 0.00048577609329446064, | |
| "loss": 3.4058, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 9.557109557109557, | |
| "grad_norm": 0.340385764837265, | |
| "learning_rate": 0.000485601166180758, | |
| "loss": 3.4003, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 9.571678321678322, | |
| "grad_norm": 0.353704571723938, | |
| "learning_rate": 0.0004854262390670554, | |
| "loss": 3.3916, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 9.586247086247086, | |
| "grad_norm": 0.3353423476219177, | |
| "learning_rate": 0.0004852513119533527, | |
| "loss": 3.4019, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 9.60081585081585, | |
| "grad_norm": 0.3232695758342743, | |
| "learning_rate": 0.0004850763848396501, | |
| "loss": 3.3974, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 9.615384615384615, | |
| "grad_norm": 0.36285659670829773, | |
| "learning_rate": 0.00048490145772594746, | |
| "loss": 3.3931, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.615384615384615, | |
| "eval_accuracy": 0.368203101059235, | |
| "eval_loss": 3.5726845264434814, | |
| "eval_runtime": 180.0842, | |
| "eval_samples_per_second": 92.412, | |
| "eval_steps_per_second": 5.781, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.62995337995338, | |
| "grad_norm": 0.3308947682380676, | |
| "learning_rate": 0.00048472653061224484, | |
| "loss": 3.4049, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 9.644522144522144, | |
| "grad_norm": 0.3408724367618561, | |
| "learning_rate": 0.0004845516034985422, | |
| "loss": 3.404, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 9.659090909090908, | |
| "grad_norm": 0.34324896335601807, | |
| "learning_rate": 0.0004843766763848396, | |
| "loss": 3.399, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 9.673659673659674, | |
| "grad_norm": 0.34077367186546326, | |
| "learning_rate": 0.000484201749271137, | |
| "loss": 3.3953, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 9.688228438228439, | |
| "grad_norm": 0.35905328392982483, | |
| "learning_rate": 0.0004840268221574344, | |
| "loss": 3.3853, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 9.702797202797203, | |
| "grad_norm": 0.3622050881385803, | |
| "learning_rate": 0.00048385189504373177, | |
| "loss": 3.4025, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 9.717365967365968, | |
| "grad_norm": 0.34367215633392334, | |
| "learning_rate": 0.0004836769679300291, | |
| "loss": 3.4029, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 9.731934731934732, | |
| "grad_norm": 0.32383468747138977, | |
| "learning_rate": 0.00048350204081632647, | |
| "loss": 3.4049, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 9.746503496503497, | |
| "grad_norm": 0.36959537863731384, | |
| "learning_rate": 0.00048332711370262384, | |
| "loss": 3.405, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 9.761072261072261, | |
| "grad_norm": 0.3404758870601654, | |
| "learning_rate": 0.0004831521865889212, | |
| "loss": 3.4005, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 9.775641025641026, | |
| "grad_norm": 0.36188212037086487, | |
| "learning_rate": 0.0004829772594752186, | |
| "loss": 3.4074, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 9.79020979020979, | |
| "grad_norm": 0.38642576336860657, | |
| "learning_rate": 0.00048280233236151597, | |
| "loss": 3.4068, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 9.804778554778554, | |
| "grad_norm": 0.32433605194091797, | |
| "learning_rate": 0.0004826274052478134, | |
| "loss": 3.4092, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 9.819347319347319, | |
| "grad_norm": 0.3639720678329468, | |
| "learning_rate": 0.0004824524781341108, | |
| "loss": 3.3985, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 9.833916083916083, | |
| "grad_norm": 0.3690209686756134, | |
| "learning_rate": 0.00048227755102040815, | |
| "loss": 3.407, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 9.848484848484848, | |
| "grad_norm": 0.32806217670440674, | |
| "learning_rate": 0.0004821026239067055, | |
| "loss": 3.4117, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 9.863053613053612, | |
| "grad_norm": 0.32632794976234436, | |
| "learning_rate": 0.00048192769679300285, | |
| "loss": 3.4169, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 9.877622377622378, | |
| "grad_norm": 0.34658604860305786, | |
| "learning_rate": 0.0004817527696793002, | |
| "loss": 3.4117, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 9.892191142191143, | |
| "grad_norm": 0.34974268078804016, | |
| "learning_rate": 0.0004815778425655976, | |
| "loss": 3.4073, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 9.906759906759907, | |
| "grad_norm": 0.3343101739883423, | |
| "learning_rate": 0.000481402915451895, | |
| "loss": 3.4063, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.906759906759907, | |
| "eval_accuracy": 0.3688429129514813, | |
| "eval_loss": 3.5587732791900635, | |
| "eval_runtime": 180.2379, | |
| "eval_samples_per_second": 92.334, | |
| "eval_steps_per_second": 5.776, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.921328671328672, | |
| "grad_norm": 0.33629804849624634, | |
| "learning_rate": 0.0004812279883381924, | |
| "loss": 3.4184, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 9.935897435897436, | |
| "grad_norm": 0.35826265811920166, | |
| "learning_rate": 0.0004810530612244898, | |
| "loss": 3.4062, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 9.9504662004662, | |
| "grad_norm": 0.3323402404785156, | |
| "learning_rate": 0.00048087813411078716, | |
| "loss": 3.4029, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 9.965034965034965, | |
| "grad_norm": 0.3231922388076782, | |
| "learning_rate": 0.00048070320699708453, | |
| "loss": 3.4137, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 9.97960372960373, | |
| "grad_norm": 0.35591524839401245, | |
| "learning_rate": 0.00048052827988338186, | |
| "loss": 3.4172, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 9.994172494172494, | |
| "grad_norm": 0.3526099920272827, | |
| "learning_rate": 0.00048035335276967923, | |
| "loss": 3.4215, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 10.008741258741258, | |
| "grad_norm": 0.367563933134079, | |
| "learning_rate": 0.0004801784256559766, | |
| "loss": 3.3311, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 10.023310023310023, | |
| "grad_norm": 0.34572193026542664, | |
| "learning_rate": 0.000480003498542274, | |
| "loss": 3.3062, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 10.037878787878787, | |
| "grad_norm": 0.362204909324646, | |
| "learning_rate": 0.00047982857142857136, | |
| "loss": 3.3028, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 10.052447552447552, | |
| "grad_norm": 0.3749389946460724, | |
| "learning_rate": 0.0004796536443148688, | |
| "loss": 3.3031, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 10.067016317016318, | |
| "grad_norm": 0.3729357421398163, | |
| "learning_rate": 0.00047947871720116616, | |
| "loss": 3.3036, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 10.081585081585082, | |
| "grad_norm": 0.3892238140106201, | |
| "learning_rate": 0.00047930379008746354, | |
| "loss": 3.3145, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 10.096153846153847, | |
| "grad_norm": 0.3650963008403778, | |
| "learning_rate": 0.0004791288629737609, | |
| "loss": 3.3232, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 10.110722610722611, | |
| "grad_norm": 0.3529200851917267, | |
| "learning_rate": 0.00047895393586005824, | |
| "loss": 3.3166, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 10.125291375291376, | |
| "grad_norm": 0.3430958390235901, | |
| "learning_rate": 0.0004787790087463556, | |
| "loss": 3.3311, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 10.13986013986014, | |
| "grad_norm": 0.35546183586120605, | |
| "learning_rate": 0.000478604081632653, | |
| "loss": 3.3229, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 10.154428904428904, | |
| "grad_norm": 0.3477681279182434, | |
| "learning_rate": 0.00047842915451895037, | |
| "loss": 3.3211, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 10.168997668997669, | |
| "grad_norm": 0.35804784297943115, | |
| "learning_rate": 0.0004782542274052478, | |
| "loss": 3.318, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 10.183566433566433, | |
| "grad_norm": 0.3714865744113922, | |
| "learning_rate": 0.00047807930029154517, | |
| "loss": 3.3529, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 10.198135198135198, | |
| "grad_norm": 0.37744787335395813, | |
| "learning_rate": 0.00047790437317784255, | |
| "loss": 3.3379, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.198135198135198, | |
| "eval_accuracy": 0.36837090322248356, | |
| "eval_loss": 3.5747363567352295, | |
| "eval_runtime": 180.0894, | |
| "eval_samples_per_second": 92.41, | |
| "eval_steps_per_second": 5.78, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.212703962703962, | |
| "grad_norm": 0.3652697801589966, | |
| "learning_rate": 0.0004777294460641399, | |
| "loss": 3.3403, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 10.227272727272727, | |
| "grad_norm": 0.3565238118171692, | |
| "learning_rate": 0.0004775545189504373, | |
| "loss": 3.3517, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 10.241841491841491, | |
| "grad_norm": 0.3647816777229309, | |
| "learning_rate": 0.0004773795918367346, | |
| "loss": 3.3465, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 10.256410256410255, | |
| "grad_norm": 0.3312961161136627, | |
| "learning_rate": 0.000477204664723032, | |
| "loss": 3.3448, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 10.270979020979022, | |
| "grad_norm": 0.3463350534439087, | |
| "learning_rate": 0.00047702973760932937, | |
| "loss": 3.329, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 10.285547785547786, | |
| "grad_norm": 0.36243367195129395, | |
| "learning_rate": 0.00047685481049562675, | |
| "loss": 3.3469, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 10.30011655011655, | |
| "grad_norm": 0.3585239350795746, | |
| "learning_rate": 0.0004766798833819242, | |
| "loss": 3.3488, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 10.314685314685315, | |
| "grad_norm": 0.33923816680908203, | |
| "learning_rate": 0.00047650495626822155, | |
| "loss": 3.357, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 10.32925407925408, | |
| "grad_norm": 0.3626267910003662, | |
| "learning_rate": 0.00047633002915451893, | |
| "loss": 3.356, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 10.343822843822844, | |
| "grad_norm": 0.36127206683158875, | |
| "learning_rate": 0.0004761551020408163, | |
| "loss": 3.3728, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 10.358391608391608, | |
| "grad_norm": 0.3516559600830078, | |
| "learning_rate": 0.0004759801749271137, | |
| "loss": 3.3548, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 10.372960372960373, | |
| "grad_norm": 0.38914352655410767, | |
| "learning_rate": 0.000475805247813411, | |
| "loss": 3.3593, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 10.387529137529137, | |
| "grad_norm": 0.3629930317401886, | |
| "learning_rate": 0.0004756303206997084, | |
| "loss": 3.3497, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 10.402097902097902, | |
| "grad_norm": 0.34036391973495483, | |
| "learning_rate": 0.00047545539358600575, | |
| "loss": 3.3635, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 10.416666666666666, | |
| "grad_norm": 0.35723787546157837, | |
| "learning_rate": 0.00047528046647230313, | |
| "loss": 3.364, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 10.43123543123543, | |
| "grad_norm": 0.3406592309474945, | |
| "learning_rate": 0.00047510553935860056, | |
| "loss": 3.3589, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 10.445804195804195, | |
| "grad_norm": 0.3650604784488678, | |
| "learning_rate": 0.00047493061224489794, | |
| "loss": 3.3673, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 10.460372960372961, | |
| "grad_norm": 0.33995601534843445, | |
| "learning_rate": 0.0004747556851311953, | |
| "loss": 3.3702, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 10.474941724941726, | |
| "grad_norm": 0.3596780002117157, | |
| "learning_rate": 0.0004745807580174927, | |
| "loss": 3.3651, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 10.48951048951049, | |
| "grad_norm": 0.358271062374115, | |
| "learning_rate": 0.00047440583090379006, | |
| "loss": 3.3768, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.48951048951049, | |
| "eval_accuracy": 0.3689066471998911, | |
| "eval_loss": 3.565972089767456, | |
| "eval_runtime": 180.2039, | |
| "eval_samples_per_second": 92.351, | |
| "eval_steps_per_second": 5.777, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.504079254079254, | |
| "grad_norm": 0.3587784767150879, | |
| "learning_rate": 0.0004742309037900874, | |
| "loss": 3.3685, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 10.518648018648019, | |
| "grad_norm": 0.36644667387008667, | |
| "learning_rate": 0.00047405597667638476, | |
| "loss": 3.3731, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 10.533216783216783, | |
| "grad_norm": 0.3659219741821289, | |
| "learning_rate": 0.00047388104956268214, | |
| "loss": 3.3799, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 10.547785547785548, | |
| "grad_norm": 0.36219388246536255, | |
| "learning_rate": 0.00047370612244897957, | |
| "loss": 3.366, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 10.562354312354312, | |
| "grad_norm": 0.3452727496623993, | |
| "learning_rate": 0.00047353119533527694, | |
| "loss": 3.3727, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 10.576923076923077, | |
| "grad_norm": 0.34664297103881836, | |
| "learning_rate": 0.0004733562682215743, | |
| "loss": 3.359, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 10.591491841491841, | |
| "grad_norm": 0.34712809324264526, | |
| "learning_rate": 0.0004731813411078717, | |
| "loss": 3.3701, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 10.606060606060606, | |
| "grad_norm": 0.34347906708717346, | |
| "learning_rate": 0.00047300641399416907, | |
| "loss": 3.3803, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 10.62062937062937, | |
| "grad_norm": 0.37337714433670044, | |
| "learning_rate": 0.00047283148688046645, | |
| "loss": 3.3882, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 10.635198135198134, | |
| "grad_norm": 0.36376672983169556, | |
| "learning_rate": 0.00047265655976676377, | |
| "loss": 3.383, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 10.649766899766899, | |
| "grad_norm": 0.34523946046829224, | |
| "learning_rate": 0.00047248163265306114, | |
| "loss": 3.3846, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 10.664335664335665, | |
| "grad_norm": 0.3508089482784271, | |
| "learning_rate": 0.0004723067055393585, | |
| "loss": 3.3739, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 10.67890442890443, | |
| "grad_norm": 0.3470657467842102, | |
| "learning_rate": 0.00047213177842565595, | |
| "loss": 3.3717, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 10.693473193473194, | |
| "grad_norm": 0.3334925174713135, | |
| "learning_rate": 0.0004719568513119533, | |
| "loss": 3.3814, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 10.708041958041958, | |
| "grad_norm": 0.3517080545425415, | |
| "learning_rate": 0.0004717819241982507, | |
| "loss": 3.3845, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 10.722610722610723, | |
| "grad_norm": 0.3703469932079315, | |
| "learning_rate": 0.0004716069970845481, | |
| "loss": 3.3785, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 10.737179487179487, | |
| "grad_norm": 0.3503482937812805, | |
| "learning_rate": 0.00047143206997084545, | |
| "loss": 3.3877, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 10.751748251748252, | |
| "grad_norm": 0.36413902044296265, | |
| "learning_rate": 0.00047125714285714283, | |
| "loss": 3.3901, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 10.766317016317016, | |
| "grad_norm": 0.35273477435112, | |
| "learning_rate": 0.00047108221574344015, | |
| "loss": 3.3989, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 10.78088578088578, | |
| "grad_norm": 0.3469065725803375, | |
| "learning_rate": 0.0004709072886297375, | |
| "loss": 3.3929, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.78088578088578, | |
| "eval_accuracy": 0.36930116454936474, | |
| "eval_loss": 3.5597054958343506, | |
| "eval_runtime": 180.1588, | |
| "eval_samples_per_second": 92.374, | |
| "eval_steps_per_second": 5.778, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.795454545454545, | |
| "grad_norm": 0.347210556268692, | |
| "learning_rate": 0.00047073236151603495, | |
| "loss": 3.3819, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 10.81002331002331, | |
| "grad_norm": 0.35915273427963257, | |
| "learning_rate": 0.00047055743440233233, | |
| "loss": 3.3801, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 10.824592074592074, | |
| "grad_norm": 0.3388284146785736, | |
| "learning_rate": 0.0004703825072886297, | |
| "loss": 3.3866, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 10.83916083916084, | |
| "grad_norm": 0.3657146990299225, | |
| "learning_rate": 0.0004702075801749271, | |
| "loss": 3.4009, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 10.853729603729604, | |
| "grad_norm": 0.35583174228668213, | |
| "learning_rate": 0.00047003265306122446, | |
| "loss": 3.387, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 10.868298368298369, | |
| "grad_norm": 0.3616805672645569, | |
| "learning_rate": 0.00046985772594752183, | |
| "loss": 3.3672, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 10.882867132867133, | |
| "grad_norm": 0.34906110167503357, | |
| "learning_rate": 0.0004696827988338192, | |
| "loss": 3.3822, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 10.897435897435898, | |
| "grad_norm": 0.37446925044059753, | |
| "learning_rate": 0.00046950787172011653, | |
| "loss": 3.3935, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 10.912004662004662, | |
| "grad_norm": 0.3785672187805176, | |
| "learning_rate": 0.0004693329446064139, | |
| "loss": 3.3824, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 10.926573426573427, | |
| "grad_norm": 0.37299731373786926, | |
| "learning_rate": 0.00046915801749271134, | |
| "loss": 3.3865, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 10.941142191142191, | |
| "grad_norm": 0.3548412621021271, | |
| "learning_rate": 0.0004689830903790087, | |
| "loss": 3.3952, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 10.955710955710956, | |
| "grad_norm": 0.36777183413505554, | |
| "learning_rate": 0.0004688081632653061, | |
| "loss": 3.3878, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 10.97027972027972, | |
| "grad_norm": 0.36412835121154785, | |
| "learning_rate": 0.00046863323615160346, | |
| "loss": 3.4091, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 10.984848484848484, | |
| "grad_norm": 0.3270232379436493, | |
| "learning_rate": 0.00046845830903790084, | |
| "loss": 3.3996, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 10.999417249417249, | |
| "grad_norm": 0.3319988250732422, | |
| "learning_rate": 0.0004682833819241982, | |
| "loss": 3.3991, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 11.013986013986013, | |
| "grad_norm": 0.35844141244888306, | |
| "learning_rate": 0.0004681084548104956, | |
| "loss": 3.2767, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 11.028554778554778, | |
| "grad_norm": 0.3383696377277374, | |
| "learning_rate": 0.0004679335276967929, | |
| "loss": 3.2718, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 11.043123543123544, | |
| "grad_norm": 0.3634346127510071, | |
| "learning_rate": 0.0004677586005830903, | |
| "loss": 3.2706, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 11.057692307692308, | |
| "grad_norm": 0.3992638885974884, | |
| "learning_rate": 0.0004675836734693877, | |
| "loss": 3.2881, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 11.072261072261073, | |
| "grad_norm": 0.35264912247657776, | |
| "learning_rate": 0.0004674087463556851, | |
| "loss": 3.2905, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.072261072261073, | |
| "eval_accuracy": 0.36926294751849176, | |
| "eval_loss": 3.5672919750213623, | |
| "eval_runtime": 180.0525, | |
| "eval_samples_per_second": 92.429, | |
| "eval_steps_per_second": 5.782, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.086829836829837, | |
| "grad_norm": 0.38650333881378174, | |
| "learning_rate": 0.00046723381924198247, | |
| "loss": 3.3106, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 11.101398601398602, | |
| "grad_norm": 0.3478892743587494, | |
| "learning_rate": 0.00046705889212827985, | |
| "loss": 3.3016, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 11.115967365967366, | |
| "grad_norm": 0.3671860992908478, | |
| "learning_rate": 0.0004668839650145772, | |
| "loss": 3.2985, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 11.13053613053613, | |
| "grad_norm": 0.3565201461315155, | |
| "learning_rate": 0.0004667090379008746, | |
| "loss": 3.3071, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 11.145104895104895, | |
| "grad_norm": 0.3274824321269989, | |
| "learning_rate": 0.000466534110787172, | |
| "loss": 3.3222, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 11.15967365967366, | |
| "grad_norm": 0.3710516691207886, | |
| "learning_rate": 0.0004663591836734693, | |
| "loss": 3.3109, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 11.174242424242424, | |
| "grad_norm": 0.37232545018196106, | |
| "learning_rate": 0.0004661842565597667, | |
| "loss": 3.3054, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 11.188811188811188, | |
| "grad_norm": 0.3739616274833679, | |
| "learning_rate": 0.0004660093294460641, | |
| "loss": 3.3147, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 11.203379953379953, | |
| "grad_norm": 0.35690245032310486, | |
| "learning_rate": 0.0004658344023323615, | |
| "loss": 3.3187, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 11.217948717948717, | |
| "grad_norm": 0.3522016704082489, | |
| "learning_rate": 0.00046565947521865885, | |
| "loss": 3.321, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 11.232517482517483, | |
| "grad_norm": 0.379158079624176, | |
| "learning_rate": 0.00046548454810495623, | |
| "loss": 3.3273, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 11.247086247086248, | |
| "grad_norm": 0.37325507402420044, | |
| "learning_rate": 0.0004653096209912536, | |
| "loss": 3.3222, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 11.261655011655012, | |
| "grad_norm": 0.3767625093460083, | |
| "learning_rate": 0.000465134693877551, | |
| "loss": 3.3269, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 11.276223776223777, | |
| "grad_norm": 0.3531850278377533, | |
| "learning_rate": 0.0004649597667638484, | |
| "loss": 3.3361, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 11.290792540792541, | |
| "grad_norm": 0.35781583189964294, | |
| "learning_rate": 0.0004647848396501457, | |
| "loss": 3.3308, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 11.305361305361306, | |
| "grad_norm": 0.35981640219688416, | |
| "learning_rate": 0.0004646099125364431, | |
| "loss": 3.3252, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 11.31993006993007, | |
| "grad_norm": 0.36371827125549316, | |
| "learning_rate": 0.0004644349854227405, | |
| "loss": 3.3374, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 11.334498834498834, | |
| "grad_norm": 0.37464508414268494, | |
| "learning_rate": 0.00046426005830903786, | |
| "loss": 3.3461, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 11.349067599067599, | |
| "grad_norm": 0.38214632868766785, | |
| "learning_rate": 0.00046408513119533523, | |
| "loss": 3.3348, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 11.363636363636363, | |
| "grad_norm": 0.40841469168663025, | |
| "learning_rate": 0.0004639102040816326, | |
| "loss": 3.3375, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.363636363636363, | |
| "eval_accuracy": 0.3695302315528744, | |
| "eval_loss": 3.563751220703125, | |
| "eval_runtime": 180.0277, | |
| "eval_samples_per_second": 92.441, | |
| "eval_steps_per_second": 5.782, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.378205128205128, | |
| "grad_norm": 0.35644689202308655, | |
| "learning_rate": 0.00046373527696793, | |
| "loss": 3.3485, | |
| "step": 39050 | |
| }, | |
| { | |
| "epoch": 11.392773892773892, | |
| "grad_norm": 0.3444243371486664, | |
| "learning_rate": 0.00046356034985422736, | |
| "loss": 3.3417, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 11.407342657342657, | |
| "grad_norm": 0.3749789893627167, | |
| "learning_rate": 0.0004633854227405248, | |
| "loss": 3.3419, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 11.421911421911421, | |
| "grad_norm": 0.3557623326778412, | |
| "learning_rate": 0.0004632104956268221, | |
| "loss": 3.3325, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 11.436480186480187, | |
| "grad_norm": 0.36125391721725464, | |
| "learning_rate": 0.0004630355685131195, | |
| "loss": 3.3398, | |
| "step": 39250 | |
| }, | |
| { | |
| "epoch": 11.451048951048952, | |
| "grad_norm": 0.3687732517719269, | |
| "learning_rate": 0.00046286064139941687, | |
| "loss": 3.3518, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 11.465617715617716, | |
| "grad_norm": 0.3502034842967987, | |
| "learning_rate": 0.00046268571428571424, | |
| "loss": 3.3484, | |
| "step": 39350 | |
| }, | |
| { | |
| "epoch": 11.48018648018648, | |
| "grad_norm": 0.3895909786224365, | |
| "learning_rate": 0.0004625107871720116, | |
| "loss": 3.3564, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 11.494755244755245, | |
| "grad_norm": 0.3652609884738922, | |
| "learning_rate": 0.000462335860058309, | |
| "loss": 3.346, | |
| "step": 39450 | |
| }, | |
| { | |
| "epoch": 11.50932400932401, | |
| "grad_norm": 0.372211754322052, | |
| "learning_rate": 0.00046216093294460637, | |
| "loss": 3.3468, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 11.523892773892774, | |
| "grad_norm": 0.3634597063064575, | |
| "learning_rate": 0.0004619860058309038, | |
| "loss": 3.343, | |
| "step": 39550 | |
| }, | |
| { | |
| "epoch": 11.538461538461538, | |
| "grad_norm": 0.3725431561470032, | |
| "learning_rate": 0.0004618110787172012, | |
| "loss": 3.3475, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 11.553030303030303, | |
| "grad_norm": 0.3666999042034149, | |
| "learning_rate": 0.0004616361516034985, | |
| "loss": 3.3463, | |
| "step": 39650 | |
| }, | |
| { | |
| "epoch": 11.567599067599067, | |
| "grad_norm": 0.33625391125679016, | |
| "learning_rate": 0.00046146122448979587, | |
| "loss": 3.3364, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 11.582167832167832, | |
| "grad_norm": 0.35108792781829834, | |
| "learning_rate": 0.00046128629737609325, | |
| "loss": 3.3491, | |
| "step": 39750 | |
| }, | |
| { | |
| "epoch": 11.596736596736596, | |
| "grad_norm": 0.36968687176704407, | |
| "learning_rate": 0.0004611113702623906, | |
| "loss": 3.3587, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 11.61130536130536, | |
| "grad_norm": 0.37255340814590454, | |
| "learning_rate": 0.000460936443148688, | |
| "loss": 3.3613, | |
| "step": 39850 | |
| }, | |
| { | |
| "epoch": 11.625874125874127, | |
| "grad_norm": 0.37071385979652405, | |
| "learning_rate": 0.0004607615160349854, | |
| "loss": 3.3637, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 11.640442890442891, | |
| "grad_norm": 0.3244622051715851, | |
| "learning_rate": 0.00046058658892128275, | |
| "loss": 3.347, | |
| "step": 39950 | |
| }, | |
| { | |
| "epoch": 11.655011655011656, | |
| "grad_norm": 0.33037108182907104, | |
| "learning_rate": 0.0004604116618075802, | |
| "loss": 3.352, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.655011655011656, | |
| "eval_accuracy": 0.3698865318714751, | |
| "eval_loss": 3.5570318698883057, | |
| "eval_runtime": 179.9937, | |
| "eval_samples_per_second": 92.459, | |
| "eval_steps_per_second": 5.784, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.66958041958042, | |
| "grad_norm": 0.3523600101470947, | |
| "learning_rate": 0.00046023673469387756, | |
| "loss": 3.3681, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 11.684149184149184, | |
| "grad_norm": 0.336599737405777, | |
| "learning_rate": 0.0004600618075801749, | |
| "loss": 3.3582, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 11.698717948717949, | |
| "grad_norm": 0.3519699275493622, | |
| "learning_rate": 0.00045988688046647225, | |
| "loss": 3.3528, | |
| "step": 40150 | |
| }, | |
| { | |
| "epoch": 11.713286713286713, | |
| "grad_norm": 0.34988924860954285, | |
| "learning_rate": 0.00045971195335276963, | |
| "loss": 3.3675, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 11.727855477855478, | |
| "grad_norm": 0.42452919483184814, | |
| "learning_rate": 0.000459537026239067, | |
| "loss": 3.3535, | |
| "step": 40250 | |
| }, | |
| { | |
| "epoch": 11.742424242424242, | |
| "grad_norm": 0.35697510838508606, | |
| "learning_rate": 0.0004593620991253644, | |
| "loss": 3.3677, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 11.756993006993007, | |
| "grad_norm": 0.35553404688835144, | |
| "learning_rate": 0.00045918717201166176, | |
| "loss": 3.3776, | |
| "step": 40350 | |
| }, | |
| { | |
| "epoch": 11.771561771561771, | |
| "grad_norm": 0.343811959028244, | |
| "learning_rate": 0.00045901224489795913, | |
| "loss": 3.3593, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 11.786130536130536, | |
| "grad_norm": 0.3576320707798004, | |
| "learning_rate": 0.00045883731778425656, | |
| "loss": 3.3734, | |
| "step": 40450 | |
| }, | |
| { | |
| "epoch": 11.8006993006993, | |
| "grad_norm": 0.38827261328697205, | |
| "learning_rate": 0.00045866239067055394, | |
| "loss": 3.3659, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 11.815268065268064, | |
| "grad_norm": 0.3964768350124359, | |
| "learning_rate": 0.00045848746355685126, | |
| "loss": 3.3767, | |
| "step": 40550 | |
| }, | |
| { | |
| "epoch": 11.82983682983683, | |
| "grad_norm": 0.38394802808761597, | |
| "learning_rate": 0.00045831253644314864, | |
| "loss": 3.3616, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 11.844405594405595, | |
| "grad_norm": 0.3503780961036682, | |
| "learning_rate": 0.000458137609329446, | |
| "loss": 3.3747, | |
| "step": 40650 | |
| }, | |
| { | |
| "epoch": 11.85897435897436, | |
| "grad_norm": 0.3336319625377655, | |
| "learning_rate": 0.0004579626822157434, | |
| "loss": 3.3788, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 11.873543123543124, | |
| "grad_norm": 0.3886152505874634, | |
| "learning_rate": 0.00045778775510204076, | |
| "loss": 3.3737, | |
| "step": 40750 | |
| }, | |
| { | |
| "epoch": 11.888111888111888, | |
| "grad_norm": 0.3735368549823761, | |
| "learning_rate": 0.00045761282798833814, | |
| "loss": 3.3729, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 11.902680652680653, | |
| "grad_norm": 0.3551517724990845, | |
| "learning_rate": 0.00045743790087463557, | |
| "loss": 3.3707, | |
| "step": 40850 | |
| }, | |
| { | |
| "epoch": 11.917249417249417, | |
| "grad_norm": 0.3696897625923157, | |
| "learning_rate": 0.00045726297376093294, | |
| "loss": 3.3682, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 11.931818181818182, | |
| "grad_norm": 0.36508408188819885, | |
| "learning_rate": 0.0004570880466472303, | |
| "loss": 3.3771, | |
| "step": 40950 | |
| }, | |
| { | |
| "epoch": 11.946386946386946, | |
| "grad_norm": 0.39585646986961365, | |
| "learning_rate": 0.00045691311953352764, | |
| "loss": 3.3692, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.946386946386946, | |
| "eval_accuracy": 0.37068826638375874, | |
| "eval_loss": 3.5467517375946045, | |
| "eval_runtime": 180.2482, | |
| "eval_samples_per_second": 92.328, | |
| "eval_steps_per_second": 5.775, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.96095571095571, | |
| "grad_norm": 0.3724261522293091, | |
| "learning_rate": 0.000456738192419825, | |
| "loss": 3.3709, | |
| "step": 41050 | |
| }, | |
| { | |
| "epoch": 11.975524475524475, | |
| "grad_norm": 0.3474469780921936, | |
| "learning_rate": 0.0004565632653061224, | |
| "loss": 3.3842, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 11.99009324009324, | |
| "grad_norm": 0.3345330059528351, | |
| "learning_rate": 0.00045638833819241977, | |
| "loss": 3.3731, | |
| "step": 41150 | |
| }, | |
| { | |
| "epoch": 12.004662004662004, | |
| "grad_norm": 0.37426432967185974, | |
| "learning_rate": 0.00045621341107871715, | |
| "loss": 3.3369, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 12.01923076923077, | |
| "grad_norm": 0.37405309081077576, | |
| "learning_rate": 0.0004560384839650145, | |
| "loss": 3.2543, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 12.033799533799534, | |
| "grad_norm": 0.36314573884010315, | |
| "learning_rate": 0.00045586355685131195, | |
| "loss": 3.2579, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 12.048368298368299, | |
| "grad_norm": 0.3675522208213806, | |
| "learning_rate": 0.0004556886297376093, | |
| "loss": 3.2689, | |
| "step": 41350 | |
| }, | |
| { | |
| "epoch": 12.062937062937063, | |
| "grad_norm": 0.3591010570526123, | |
| "learning_rate": 0.0004555137026239067, | |
| "loss": 3.2704, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 12.077505827505828, | |
| "grad_norm": 0.3727307617664337, | |
| "learning_rate": 0.000455338775510204, | |
| "loss": 3.2726, | |
| "step": 41450 | |
| }, | |
| { | |
| "epoch": 12.092074592074592, | |
| "grad_norm": 0.35560178756713867, | |
| "learning_rate": 0.0004551638483965014, | |
| "loss": 3.2717, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 12.106643356643357, | |
| "grad_norm": 0.3758648931980133, | |
| "learning_rate": 0.0004549889212827988, | |
| "loss": 3.277, | |
| "step": 41550 | |
| }, | |
| { | |
| "epoch": 12.121212121212121, | |
| "grad_norm": 0.3858795464038849, | |
| "learning_rate": 0.00045481399416909615, | |
| "loss": 3.2792, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 12.135780885780886, | |
| "grad_norm": 0.3726632297039032, | |
| "learning_rate": 0.00045463906705539353, | |
| "loss": 3.2981, | |
| "step": 41650 | |
| }, | |
| { | |
| "epoch": 12.15034965034965, | |
| "grad_norm": 0.3951834440231323, | |
| "learning_rate": 0.0004544641399416909, | |
| "loss": 3.2841, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 12.164918414918414, | |
| "grad_norm": 0.3515232503414154, | |
| "learning_rate": 0.00045428921282798833, | |
| "loss": 3.2943, | |
| "step": 41750 | |
| }, | |
| { | |
| "epoch": 12.179487179487179, | |
| "grad_norm": 0.3756238520145416, | |
| "learning_rate": 0.0004541142857142857, | |
| "loss": 3.2946, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 12.194055944055943, | |
| "grad_norm": 0.3413456678390503, | |
| "learning_rate": 0.0004539393586005831, | |
| "loss": 3.2894, | |
| "step": 41850 | |
| }, | |
| { | |
| "epoch": 12.20862470862471, | |
| "grad_norm": 0.35358941555023193, | |
| "learning_rate": 0.0004537644314868804, | |
| "loss": 3.2924, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 12.223193473193474, | |
| "grad_norm": 0.38770952820777893, | |
| "learning_rate": 0.0004535895043731778, | |
| "loss": 3.3127, | |
| "step": 41950 | |
| }, | |
| { | |
| "epoch": 12.237762237762238, | |
| "grad_norm": 0.3479763865470886, | |
| "learning_rate": 0.00045341457725947516, | |
| "loss": 3.3088, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.237762237762238, | |
| "eval_accuracy": 0.36980774599244454, | |
| "eval_loss": 3.5638058185577393, | |
| "eval_runtime": 180.1244, | |
| "eval_samples_per_second": 92.392, | |
| "eval_steps_per_second": 5.779, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.252331002331003, | |
| "grad_norm": 0.36385655403137207, | |
| "learning_rate": 0.00045323965014577253, | |
| "loss": 3.3019, | |
| "step": 42050 | |
| }, | |
| { | |
| "epoch": 12.266899766899767, | |
| "grad_norm": 0.3448290228843689, | |
| "learning_rate": 0.0004530647230320699, | |
| "loss": 3.3137, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 12.281468531468532, | |
| "grad_norm": 0.34945112466812134, | |
| "learning_rate": 0.00045288979591836734, | |
| "loss": 3.319, | |
| "step": 42150 | |
| }, | |
| { | |
| "epoch": 12.296037296037296, | |
| "grad_norm": 0.3717491328716278, | |
| "learning_rate": 0.0004527148688046647, | |
| "loss": 3.3187, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 12.31060606060606, | |
| "grad_norm": 0.37596195936203003, | |
| "learning_rate": 0.0004525399416909621, | |
| "loss": 3.3144, | |
| "step": 42250 | |
| }, | |
| { | |
| "epoch": 12.325174825174825, | |
| "grad_norm": 0.35946109890937805, | |
| "learning_rate": 0.00045236501457725947, | |
| "loss": 3.3281, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 12.33974358974359, | |
| "grad_norm": 0.3581676483154297, | |
| "learning_rate": 0.0004521900874635568, | |
| "loss": 3.3193, | |
| "step": 42350 | |
| }, | |
| { | |
| "epoch": 12.354312354312354, | |
| "grad_norm": 0.3581669330596924, | |
| "learning_rate": 0.00045201516034985416, | |
| "loss": 3.3187, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 12.368881118881118, | |
| "grad_norm": 0.3567669987678528, | |
| "learning_rate": 0.00045184023323615154, | |
| "loss": 3.3169, | |
| "step": 42450 | |
| }, | |
| { | |
| "epoch": 12.383449883449883, | |
| "grad_norm": 0.3985763490200043, | |
| "learning_rate": 0.0004516653061224489, | |
| "loss": 3.3228, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 12.398018648018647, | |
| "grad_norm": 0.3795642852783203, | |
| "learning_rate": 0.0004514903790087463, | |
| "loss": 3.3242, | |
| "step": 42550 | |
| }, | |
| { | |
| "epoch": 12.412587412587413, | |
| "grad_norm": 0.39964866638183594, | |
| "learning_rate": 0.0004513154518950437, | |
| "loss": 3.3192, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 12.427156177156178, | |
| "grad_norm": 0.368116557598114, | |
| "learning_rate": 0.0004511405247813411, | |
| "loss": 3.3117, | |
| "step": 42650 | |
| }, | |
| { | |
| "epoch": 12.441724941724942, | |
| "grad_norm": 0.4013516902923584, | |
| "learning_rate": 0.0004509655976676385, | |
| "loss": 3.3342, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 12.456293706293707, | |
| "grad_norm": 0.3931203782558441, | |
| "learning_rate": 0.00045079067055393585, | |
| "loss": 3.3392, | |
| "step": 42750 | |
| }, | |
| { | |
| "epoch": 12.470862470862471, | |
| "grad_norm": 0.3617455065250397, | |
| "learning_rate": 0.00045061574344023317, | |
| "loss": 3.3214, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 12.485431235431236, | |
| "grad_norm": 0.3787974715232849, | |
| "learning_rate": 0.00045044081632653055, | |
| "loss": 3.3154, | |
| "step": 42850 | |
| }, | |
| { | |
| "epoch": 12.5, | |
| "grad_norm": 0.3658803701400757, | |
| "learning_rate": 0.0004502658892128279, | |
| "loss": 3.324, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 12.514568764568764, | |
| "grad_norm": 0.3991664946079254, | |
| "learning_rate": 0.0004500909620991253, | |
| "loss": 3.3325, | |
| "step": 42950 | |
| }, | |
| { | |
| "epoch": 12.529137529137529, | |
| "grad_norm": 0.3415054976940155, | |
| "learning_rate": 0.00044991603498542273, | |
| "loss": 3.3395, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.529137529137529, | |
| "eval_accuracy": 0.37044708752123395, | |
| "eval_loss": 3.556431531906128, | |
| "eval_runtime": 180.1017, | |
| "eval_samples_per_second": 92.403, | |
| "eval_steps_per_second": 5.78, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.543706293706293, | |
| "grad_norm": 0.36171630024909973, | |
| "learning_rate": 0.0004497411078717201, | |
| "loss": 3.3305, | |
| "step": 43050 | |
| }, | |
| { | |
| "epoch": 12.558275058275058, | |
| "grad_norm": 0.3560401499271393, | |
| "learning_rate": 0.0004495661807580175, | |
| "loss": 3.3315, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 12.572843822843822, | |
| "grad_norm": 0.3484310209751129, | |
| "learning_rate": 0.00044939125364431486, | |
| "loss": 3.3413, | |
| "step": 43150 | |
| }, | |
| { | |
| "epoch": 12.587412587412587, | |
| "grad_norm": 0.35228946805000305, | |
| "learning_rate": 0.00044921632653061223, | |
| "loss": 3.3509, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 12.601981351981351, | |
| "grad_norm": 0.3399851620197296, | |
| "learning_rate": 0.00044904139941690955, | |
| "loss": 3.3375, | |
| "step": 43250 | |
| }, | |
| { | |
| "epoch": 12.616550116550117, | |
| "grad_norm": 0.37779510021209717, | |
| "learning_rate": 0.00044886647230320693, | |
| "loss": 3.3282, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 12.631118881118882, | |
| "grad_norm": 0.35238713026046753, | |
| "learning_rate": 0.0004486915451895043, | |
| "loss": 3.3347, | |
| "step": 43350 | |
| }, | |
| { | |
| "epoch": 12.645687645687646, | |
| "grad_norm": 0.36413443088531494, | |
| "learning_rate": 0.0004485166180758017, | |
| "loss": 3.341, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 12.66025641025641, | |
| "grad_norm": 0.38908377289772034, | |
| "learning_rate": 0.0004483416909620991, | |
| "loss": 3.3425, | |
| "step": 43450 | |
| }, | |
| { | |
| "epoch": 12.674825174825175, | |
| "grad_norm": 0.3778778612613678, | |
| "learning_rate": 0.0004481667638483965, | |
| "loss": 3.3466, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 12.68939393939394, | |
| "grad_norm": 0.3862821161746979, | |
| "learning_rate": 0.00044799183673469386, | |
| "loss": 3.3389, | |
| "step": 43550 | |
| }, | |
| { | |
| "epoch": 12.703962703962704, | |
| "grad_norm": 0.39754387736320496, | |
| "learning_rate": 0.00044781690962099124, | |
| "loss": 3.3586, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 12.718531468531468, | |
| "grad_norm": 0.36277809739112854, | |
| "learning_rate": 0.0004476419825072886, | |
| "loss": 3.344, | |
| "step": 43650 | |
| }, | |
| { | |
| "epoch": 12.733100233100233, | |
| "grad_norm": 0.34860557317733765, | |
| "learning_rate": 0.00044746705539358593, | |
| "loss": 3.3501, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 12.747668997668997, | |
| "grad_norm": 0.3457394242286682, | |
| "learning_rate": 0.0004472921282798833, | |
| "loss": 3.3452, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 12.762237762237762, | |
| "grad_norm": 0.33117881417274475, | |
| "learning_rate": 0.0004471172011661807, | |
| "loss": 3.3567, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 12.776806526806526, | |
| "grad_norm": 0.33471763134002686, | |
| "learning_rate": 0.00044694227405247806, | |
| "loss": 3.3463, | |
| "step": 43850 | |
| }, | |
| { | |
| "epoch": 12.791375291375292, | |
| "grad_norm": 0.36773985624313354, | |
| "learning_rate": 0.0004467673469387755, | |
| "loss": 3.3412, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 12.805944055944057, | |
| "grad_norm": 0.3666783571243286, | |
| "learning_rate": 0.00044659241982507287, | |
| "loss": 3.3385, | |
| "step": 43950 | |
| }, | |
| { | |
| "epoch": 12.820512820512821, | |
| "grad_norm": 0.36336493492126465, | |
| "learning_rate": 0.00044641749271137024, | |
| "loss": 3.3436, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.820512820512821, | |
| "eval_accuracy": 0.37076799298970303, | |
| "eval_loss": 3.5492851734161377, | |
| "eval_runtime": 202.0105, | |
| "eval_samples_per_second": 82.382, | |
| "eval_steps_per_second": 5.153, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.835081585081586, | |
| "grad_norm": 0.3824255168437958, | |
| "learning_rate": 0.0004462425655976676, | |
| "loss": 3.3686, | |
| "step": 44050 | |
| }, | |
| { | |
| "epoch": 12.84965034965035, | |
| "grad_norm": 0.3811502754688263, | |
| "learning_rate": 0.000446067638483965, | |
| "loss": 3.3473, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 12.864219114219114, | |
| "grad_norm": 0.35578691959381104, | |
| "learning_rate": 0.0004458927113702623, | |
| "loss": 3.3509, | |
| "step": 44150 | |
| }, | |
| { | |
| "epoch": 12.878787878787879, | |
| "grad_norm": 0.35379981994628906, | |
| "learning_rate": 0.0004457177842565597, | |
| "loss": 3.3516, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 12.893356643356643, | |
| "grad_norm": 0.3434535562992096, | |
| "learning_rate": 0.00044554285714285707, | |
| "loss": 3.3469, | |
| "step": 44250 | |
| }, | |
| { | |
| "epoch": 12.907925407925408, | |
| "grad_norm": 0.33877626061439514, | |
| "learning_rate": 0.0004453679300291545, | |
| "loss": 3.3577, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 12.922494172494172, | |
| "grad_norm": 0.3338939845561981, | |
| "learning_rate": 0.0004451930029154519, | |
| "loss": 3.3427, | |
| "step": 44350 | |
| }, | |
| { | |
| "epoch": 12.937062937062937, | |
| "grad_norm": 0.35932457447052, | |
| "learning_rate": 0.00044501807580174925, | |
| "loss": 3.363, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 12.951631701631701, | |
| "grad_norm": 0.37584739923477173, | |
| "learning_rate": 0.0004448431486880466, | |
| "loss": 3.3621, | |
| "step": 44450 | |
| }, | |
| { | |
| "epoch": 12.966200466200466, | |
| "grad_norm": 0.37329939007759094, | |
| "learning_rate": 0.000444668221574344, | |
| "loss": 3.3654, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 12.98076923076923, | |
| "grad_norm": 0.3690183460712433, | |
| "learning_rate": 0.0004444932944606414, | |
| "loss": 3.3546, | |
| "step": 44550 | |
| }, | |
| { | |
| "epoch": 12.995337995337996, | |
| "grad_norm": 0.3789139688014984, | |
| "learning_rate": 0.0004443183673469387, | |
| "loss": 3.3704, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 13.00990675990676, | |
| "grad_norm": 0.3670189678668976, | |
| "learning_rate": 0.0004441434402332361, | |
| "loss": 3.2863, | |
| "step": 44650 | |
| }, | |
| { | |
| "epoch": 13.024475524475525, | |
| "grad_norm": 0.3434585630893707, | |
| "learning_rate": 0.00044396851311953345, | |
| "loss": 3.2454, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 13.03904428904429, | |
| "grad_norm": 0.3941807746887207, | |
| "learning_rate": 0.0004437935860058309, | |
| "loss": 3.2563, | |
| "step": 44750 | |
| }, | |
| { | |
| "epoch": 13.053613053613054, | |
| "grad_norm": 0.36598798632621765, | |
| "learning_rate": 0.00044361865889212826, | |
| "loss": 3.2473, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 13.068181818181818, | |
| "grad_norm": 0.3792334794998169, | |
| "learning_rate": 0.00044344373177842563, | |
| "loss": 3.2651, | |
| "step": 44850 | |
| }, | |
| { | |
| "epoch": 13.082750582750583, | |
| "grad_norm": 0.3610612750053406, | |
| "learning_rate": 0.000443268804664723, | |
| "loss": 3.2663, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 13.097319347319347, | |
| "grad_norm": 0.3610003590583801, | |
| "learning_rate": 0.0004430938775510204, | |
| "loss": 3.2639, | |
| "step": 44950 | |
| }, | |
| { | |
| "epoch": 13.111888111888112, | |
| "grad_norm": 0.36222654581069946, | |
| "learning_rate": 0.00044291895043731776, | |
| "loss": 3.2689, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.111888111888112, | |
| "eval_accuracy": 0.3705993676904049, | |
| "eval_loss": 3.5606191158294678, | |
| "eval_runtime": 180.0392, | |
| "eval_samples_per_second": 92.435, | |
| "eval_steps_per_second": 5.782, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.126456876456876, | |
| "grad_norm": 0.36570093035697937, | |
| "learning_rate": 0.0004427440233236151, | |
| "loss": 3.2841, | |
| "step": 45050 | |
| }, | |
| { | |
| "epoch": 13.14102564102564, | |
| "grad_norm": 0.35010698437690735, | |
| "learning_rate": 0.00044256909620991246, | |
| "loss": 3.2813, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 13.155594405594405, | |
| "grad_norm": 0.3795238137245178, | |
| "learning_rate": 0.0004423941690962099, | |
| "loss": 3.2703, | |
| "step": 45150 | |
| }, | |
| { | |
| "epoch": 13.17016317016317, | |
| "grad_norm": 0.36561548709869385, | |
| "learning_rate": 0.00044221924198250726, | |
| "loss": 3.2844, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 13.184731934731936, | |
| "grad_norm": 0.36267200112342834, | |
| "learning_rate": 0.00044204431486880464, | |
| "loss": 3.285, | |
| "step": 45250 | |
| }, | |
| { | |
| "epoch": 13.1993006993007, | |
| "grad_norm": 0.361397385597229, | |
| "learning_rate": 0.000441869387755102, | |
| "loss": 3.2833, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 13.213869463869464, | |
| "grad_norm": 0.37071478366851807, | |
| "learning_rate": 0.0004416944606413994, | |
| "loss": 3.2714, | |
| "step": 45350 | |
| }, | |
| { | |
| "epoch": 13.228438228438229, | |
| "grad_norm": 0.36633357405662537, | |
| "learning_rate": 0.00044151953352769677, | |
| "loss": 3.2872, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 13.243006993006993, | |
| "grad_norm": 0.4010475277900696, | |
| "learning_rate": 0.00044134460641399414, | |
| "loss": 3.3027, | |
| "step": 45450 | |
| }, | |
| { | |
| "epoch": 13.257575757575758, | |
| "grad_norm": 0.3728073537349701, | |
| "learning_rate": 0.00044116967930029146, | |
| "loss": 3.2929, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 13.272144522144522, | |
| "grad_norm": 0.374639630317688, | |
| "learning_rate": 0.00044099475218658884, | |
| "loss": 3.2947, | |
| "step": 45550 | |
| }, | |
| { | |
| "epoch": 13.286713286713287, | |
| "grad_norm": 0.37459850311279297, | |
| "learning_rate": 0.00044081982507288627, | |
| "loss": 3.2916, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 13.301282051282051, | |
| "grad_norm": 0.3703133761882782, | |
| "learning_rate": 0.00044064489795918365, | |
| "loss": 3.3003, | |
| "step": 45650 | |
| }, | |
| { | |
| "epoch": 13.315850815850816, | |
| "grad_norm": 0.35315272212028503, | |
| "learning_rate": 0.000440469970845481, | |
| "loss": 3.2956, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 13.33041958041958, | |
| "grad_norm": 0.36796048283576965, | |
| "learning_rate": 0.0004402950437317784, | |
| "loss": 3.2999, | |
| "step": 45750 | |
| }, | |
| { | |
| "epoch": 13.344988344988344, | |
| "grad_norm": 0.3775721490383148, | |
| "learning_rate": 0.00044012011661807577, | |
| "loss": 3.296, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 13.359557109557109, | |
| "grad_norm": 0.3700525164604187, | |
| "learning_rate": 0.00043994518950437315, | |
| "loss": 3.2943, | |
| "step": 45850 | |
| }, | |
| { | |
| "epoch": 13.374125874125873, | |
| "grad_norm": 0.37419870495796204, | |
| "learning_rate": 0.0004397702623906705, | |
| "loss": 3.303, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 13.38869463869464, | |
| "grad_norm": 0.38533103466033936, | |
| "learning_rate": 0.00043959533527696785, | |
| "loss": 3.3144, | |
| "step": 45950 | |
| }, | |
| { | |
| "epoch": 13.403263403263404, | |
| "grad_norm": 0.36683189868927, | |
| "learning_rate": 0.0004394204081632652, | |
| "loss": 3.2973, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.403263403263404, | |
| "eval_accuracy": 0.3709427330139409, | |
| "eval_loss": 3.5570855140686035, | |
| "eval_runtime": 180.2232, | |
| "eval_samples_per_second": 92.341, | |
| "eval_steps_per_second": 5.776, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.417832167832168, | |
| "grad_norm": 0.40213772654533386, | |
| "learning_rate": 0.00043924548104956265, | |
| "loss": 3.3031, | |
| "step": 46050 | |
| }, | |
| { | |
| "epoch": 13.432400932400933, | |
| "grad_norm": 0.3665577471256256, | |
| "learning_rate": 0.00043907055393586003, | |
| "loss": 3.3059, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 13.446969696969697, | |
| "grad_norm": 0.36229410767555237, | |
| "learning_rate": 0.0004388956268221574, | |
| "loss": 3.3079, | |
| "step": 46150 | |
| }, | |
| { | |
| "epoch": 13.461538461538462, | |
| "grad_norm": 0.3800623416900635, | |
| "learning_rate": 0.0004387206997084548, | |
| "loss": 3.3198, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 13.476107226107226, | |
| "grad_norm": 0.33525654673576355, | |
| "learning_rate": 0.00043854577259475215, | |
| "loss": 3.3099, | |
| "step": 46250 | |
| }, | |
| { | |
| "epoch": 13.49067599067599, | |
| "grad_norm": 0.37382787466049194, | |
| "learning_rate": 0.00043837084548104953, | |
| "loss": 3.3125, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 13.505244755244755, | |
| "grad_norm": 0.3434448540210724, | |
| "learning_rate": 0.0004381959183673469, | |
| "loss": 3.3181, | |
| "step": 46350 | |
| }, | |
| { | |
| "epoch": 13.51981351981352, | |
| "grad_norm": 0.38704603910446167, | |
| "learning_rate": 0.00043802099125364423, | |
| "loss": 3.3258, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 13.534382284382284, | |
| "grad_norm": 0.36768314242362976, | |
| "learning_rate": 0.00043784606413994166, | |
| "loss": 3.322, | |
| "step": 46450 | |
| }, | |
| { | |
| "epoch": 13.548951048951048, | |
| "grad_norm": 0.3738141655921936, | |
| "learning_rate": 0.00043767113702623903, | |
| "loss": 3.3159, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 13.563519813519813, | |
| "grad_norm": 0.36555594205856323, | |
| "learning_rate": 0.0004374962099125364, | |
| "loss": 3.3224, | |
| "step": 46550 | |
| }, | |
| { | |
| "epoch": 13.578088578088579, | |
| "grad_norm": 0.39236852526664734, | |
| "learning_rate": 0.0004373212827988338, | |
| "loss": 3.3168, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 13.592657342657343, | |
| "grad_norm": 0.35549354553222656, | |
| "learning_rate": 0.00043714635568513116, | |
| "loss": 3.3138, | |
| "step": 46650 | |
| }, | |
| { | |
| "epoch": 13.607226107226108, | |
| "grad_norm": 0.38144198060035706, | |
| "learning_rate": 0.00043697142857142854, | |
| "loss": 3.315, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 13.621794871794872, | |
| "grad_norm": 0.3554701805114746, | |
| "learning_rate": 0.0004367965014577259, | |
| "loss": 3.3287, | |
| "step": 46750 | |
| }, | |
| { | |
| "epoch": 13.636363636363637, | |
| "grad_norm": 0.35527220368385315, | |
| "learning_rate": 0.00043662157434402334, | |
| "loss": 3.318, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 13.650932400932401, | |
| "grad_norm": 0.3695144057273865, | |
| "learning_rate": 0.0004364466472303206, | |
| "loss": 3.3222, | |
| "step": 46850 | |
| }, | |
| { | |
| "epoch": 13.665501165501166, | |
| "grad_norm": 0.3941607177257538, | |
| "learning_rate": 0.00043627172011661804, | |
| "loss": 3.3223, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 13.68006993006993, | |
| "grad_norm": 0.3855513036251068, | |
| "learning_rate": 0.0004360967930029154, | |
| "loss": 3.327, | |
| "step": 46950 | |
| }, | |
| { | |
| "epoch": 13.694638694638694, | |
| "grad_norm": 0.4064894914627075, | |
| "learning_rate": 0.0004359218658892128, | |
| "loss": 3.3302, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.694638694638694, | |
| "eval_accuracy": 0.3710078783527213, | |
| "eval_loss": 3.5502822399139404, | |
| "eval_runtime": 180.1072, | |
| "eval_samples_per_second": 92.401, | |
| "eval_steps_per_second": 5.78, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.709207459207459, | |
| "grad_norm": 0.36102309823036194, | |
| "learning_rate": 0.00043574693877551017, | |
| "loss": 3.3326, | |
| "step": 47050 | |
| }, | |
| { | |
| "epoch": 13.723776223776223, | |
| "grad_norm": 0.3735487461090088, | |
| "learning_rate": 0.00043557201166180754, | |
| "loss": 3.322, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 13.738344988344988, | |
| "grad_norm": 0.3891875147819519, | |
| "learning_rate": 0.0004353970845481049, | |
| "loss": 3.3352, | |
| "step": 47150 | |
| }, | |
| { | |
| "epoch": 13.752913752913752, | |
| "grad_norm": 0.35352927446365356, | |
| "learning_rate": 0.0004352221574344023, | |
| "loss": 3.3259, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 13.767482517482517, | |
| "grad_norm": 0.358306884765625, | |
| "learning_rate": 0.0004350472303206997, | |
| "loss": 3.3198, | |
| "step": 47250 | |
| }, | |
| { | |
| "epoch": 13.782051282051283, | |
| "grad_norm": 0.3883667290210724, | |
| "learning_rate": 0.00043487230320699705, | |
| "loss": 3.3426, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 13.796620046620047, | |
| "grad_norm": 0.36384516954421997, | |
| "learning_rate": 0.0004346973760932944, | |
| "loss": 3.3394, | |
| "step": 47350 | |
| }, | |
| { | |
| "epoch": 13.811188811188812, | |
| "grad_norm": 0.39026427268981934, | |
| "learning_rate": 0.0004345224489795918, | |
| "loss": 3.329, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 13.825757575757576, | |
| "grad_norm": 0.6027129888534546, | |
| "learning_rate": 0.0004343475218658892, | |
| "loss": 3.3378, | |
| "step": 47450 | |
| }, | |
| { | |
| "epoch": 13.84032634032634, | |
| "grad_norm": 0.39879918098449707, | |
| "learning_rate": 0.00043417259475218655, | |
| "loss": 3.3348, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 13.854895104895105, | |
| "grad_norm": 0.35614633560180664, | |
| "learning_rate": 0.0004339976676384839, | |
| "loss": 3.3318, | |
| "step": 47550 | |
| }, | |
| { | |
| "epoch": 13.86946386946387, | |
| "grad_norm": 0.37732526659965515, | |
| "learning_rate": 0.0004338227405247813, | |
| "loss": 3.3335, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 13.884032634032634, | |
| "grad_norm": 0.36290261149406433, | |
| "learning_rate": 0.00043364781341107873, | |
| "loss": 3.3439, | |
| "step": 47650 | |
| }, | |
| { | |
| "epoch": 13.898601398601398, | |
| "grad_norm": 0.3448139429092407, | |
| "learning_rate": 0.0004334728862973761, | |
| "loss": 3.331, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 13.913170163170163, | |
| "grad_norm": 0.373145192861557, | |
| "learning_rate": 0.00043329795918367343, | |
| "loss": 3.3491, | |
| "step": 47750 | |
| }, | |
| { | |
| "epoch": 13.927738927738927, | |
| "grad_norm": 0.3779900074005127, | |
| "learning_rate": 0.0004331230320699708, | |
| "loss": 3.3345, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 13.942307692307692, | |
| "grad_norm": 0.35789304971694946, | |
| "learning_rate": 0.0004329481049562682, | |
| "loss": 3.3379, | |
| "step": 47850 | |
| }, | |
| { | |
| "epoch": 13.956876456876456, | |
| "grad_norm": 0.3417630195617676, | |
| "learning_rate": 0.00043277317784256556, | |
| "loss": 3.3222, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 13.971445221445222, | |
| "grad_norm": 0.35347306728363037, | |
| "learning_rate": 0.00043259825072886293, | |
| "loss": 3.3365, | |
| "step": 47950 | |
| }, | |
| { | |
| "epoch": 13.986013986013987, | |
| "grad_norm": 0.4118961989879608, | |
| "learning_rate": 0.0004324233236151603, | |
| "loss": 3.3507, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 13.986013986013987, | |
| "eval_accuracy": 0.3716781462788018, | |
| "eval_loss": 3.5405595302581787, | |
| "eval_runtime": 180.2892, | |
| "eval_samples_per_second": 92.307, | |
| "eval_steps_per_second": 5.774, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 14.000582750582751, | |
| "grad_norm": 0.4268363416194916, | |
| "learning_rate": 0.0004322483965014577, | |
| "loss": 3.3316, | |
| "step": 48050 | |
| }, | |
| { | |
| "epoch": 14.015151515151516, | |
| "grad_norm": 0.3893098533153534, | |
| "learning_rate": 0.0004320734693877551, | |
| "loss": 3.2321, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 14.02972027972028, | |
| "grad_norm": 0.39411571621894836, | |
| "learning_rate": 0.0004318985422740525, | |
| "loss": 3.2315, | |
| "step": 48150 | |
| }, | |
| { | |
| "epoch": 14.044289044289044, | |
| "grad_norm": 0.36802518367767334, | |
| "learning_rate": 0.0004317236151603498, | |
| "loss": 3.2374, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 14.058857808857809, | |
| "grad_norm": 0.39768052101135254, | |
| "learning_rate": 0.0004315486880466472, | |
| "loss": 3.2414, | |
| "step": 48250 | |
| }, | |
| { | |
| "epoch": 14.073426573426573, | |
| "grad_norm": 0.38169392943382263, | |
| "learning_rate": 0.00043137376093294456, | |
| "loss": 3.2486, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 14.087995337995338, | |
| "grad_norm": 0.3780570924282074, | |
| "learning_rate": 0.00043119883381924194, | |
| "loss": 3.2548, | |
| "step": 48350 | |
| }, | |
| { | |
| "epoch": 14.102564102564102, | |
| "grad_norm": 0.35785311460494995, | |
| "learning_rate": 0.0004310239067055393, | |
| "loss": 3.2463, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 14.117132867132867, | |
| "grad_norm": 0.408586323261261, | |
| "learning_rate": 0.0004308489795918367, | |
| "loss": 3.2536, | |
| "step": 48450 | |
| }, | |
| { | |
| "epoch": 14.131701631701631, | |
| "grad_norm": 0.38487815856933594, | |
| "learning_rate": 0.00043067405247813407, | |
| "loss": 3.2604, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 14.146270396270396, | |
| "grad_norm": 0.36268073320388794, | |
| "learning_rate": 0.0004304991253644315, | |
| "loss": 3.2614, | |
| "step": 48550 | |
| }, | |
| { | |
| "epoch": 14.16083916083916, | |
| "grad_norm": 0.38010692596435547, | |
| "learning_rate": 0.00043032419825072887, | |
| "loss": 3.2638, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 14.175407925407926, | |
| "grad_norm": 0.36735981702804565, | |
| "learning_rate": 0.0004301492711370262, | |
| "loss": 3.2635, | |
| "step": 48650 | |
| }, | |
| { | |
| "epoch": 14.18997668997669, | |
| "grad_norm": 0.37236976623535156, | |
| "learning_rate": 0.00042997434402332357, | |
| "loss": 3.2752, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 14.204545454545455, | |
| "grad_norm": 0.3588123619556427, | |
| "learning_rate": 0.00042979941690962094, | |
| "loss": 3.2593, | |
| "step": 48750 | |
| }, | |
| { | |
| "epoch": 14.21911421911422, | |
| "grad_norm": 0.3798566162586212, | |
| "learning_rate": 0.0004296244897959183, | |
| "loss": 3.2667, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 14.233682983682984, | |
| "grad_norm": 0.3849664628505707, | |
| "learning_rate": 0.0004294495626822157, | |
| "loss": 3.2756, | |
| "step": 48850 | |
| }, | |
| { | |
| "epoch": 14.248251748251748, | |
| "grad_norm": 0.3689548075199127, | |
| "learning_rate": 0.00042927463556851307, | |
| "loss": 3.2641, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 14.262820512820513, | |
| "grad_norm": 0.38583266735076904, | |
| "learning_rate": 0.0004290997084548105, | |
| "loss": 3.2747, | |
| "step": 48950 | |
| }, | |
| { | |
| "epoch": 14.277389277389277, | |
| "grad_norm": 0.3908044099807739, | |
| "learning_rate": 0.0004289247813411079, | |
| "loss": 3.2781, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.277389277389277, | |
| "eval_accuracy": 0.3713245505500783, | |
| "eval_loss": 3.555830717086792, | |
| "eval_runtime": 180.235, | |
| "eval_samples_per_second": 92.335, | |
| "eval_steps_per_second": 5.776, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.291958041958042, | |
| "grad_norm": 0.4048484265804291, | |
| "learning_rate": 0.00042874985422740525, | |
| "loss": 3.2815, | |
| "step": 49050 | |
| }, | |
| { | |
| "epoch": 14.306526806526806, | |
| "grad_norm": 0.3809373080730438, | |
| "learning_rate": 0.0004285749271137026, | |
| "loss": 3.2844, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 14.32109557109557, | |
| "grad_norm": 0.3947436213493347, | |
| "learning_rate": 0.00042839999999999995, | |
| "loss": 3.2895, | |
| "step": 49150 | |
| }, | |
| { | |
| "epoch": 14.335664335664335, | |
| "grad_norm": 0.4283214211463928, | |
| "learning_rate": 0.0004282250728862973, | |
| "loss": 3.2771, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 14.3502331002331, | |
| "grad_norm": 0.404867023229599, | |
| "learning_rate": 0.0004280501457725947, | |
| "loss": 3.2798, | |
| "step": 49250 | |
| }, | |
| { | |
| "epoch": 14.364801864801866, | |
| "grad_norm": 0.4032004773616791, | |
| "learning_rate": 0.0004278752186588921, | |
| "loss": 3.2898, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 14.37937062937063, | |
| "grad_norm": 0.3930039703845978, | |
| "learning_rate": 0.00042770029154518945, | |
| "loss": 3.2876, | |
| "step": 49350 | |
| }, | |
| { | |
| "epoch": 14.393939393939394, | |
| "grad_norm": 0.3659101724624634, | |
| "learning_rate": 0.0004275253644314869, | |
| "loss": 3.296, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 14.408508158508159, | |
| "grad_norm": 0.3935369551181793, | |
| "learning_rate": 0.00042735043731778426, | |
| "loss": 3.2924, | |
| "step": 49450 | |
| }, | |
| { | |
| "epoch": 14.423076923076923, | |
| "grad_norm": 0.3779597580432892, | |
| "learning_rate": 0.00042717551020408164, | |
| "loss": 3.2885, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 14.437645687645688, | |
| "grad_norm": 0.3811582922935486, | |
| "learning_rate": 0.00042700058309037896, | |
| "loss": 3.2916, | |
| "step": 49550 | |
| }, | |
| { | |
| "epoch": 14.452214452214452, | |
| "grad_norm": 0.3844025135040283, | |
| "learning_rate": 0.00042682565597667633, | |
| "loss": 3.2914, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 14.466783216783217, | |
| "grad_norm": 0.37973514199256897, | |
| "learning_rate": 0.0004266507288629737, | |
| "loss": 3.2938, | |
| "step": 49650 | |
| }, | |
| { | |
| "epoch": 14.481351981351981, | |
| "grad_norm": 0.4009278416633606, | |
| "learning_rate": 0.0004264758017492711, | |
| "loss": 3.2984, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 14.495920745920746, | |
| "grad_norm": 0.3518688678741455, | |
| "learning_rate": 0.00042630087463556846, | |
| "loss": 3.3005, | |
| "step": 49750 | |
| }, | |
| { | |
| "epoch": 14.51048951048951, | |
| "grad_norm": 0.38609257340431213, | |
| "learning_rate": 0.00042612594752186584, | |
| "loss": 3.2928, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 14.525058275058274, | |
| "grad_norm": 0.3609243333339691, | |
| "learning_rate": 0.00042595102040816327, | |
| "loss": 3.2912, | |
| "step": 49850 | |
| }, | |
| { | |
| "epoch": 14.539627039627039, | |
| "grad_norm": 0.38364362716674805, | |
| "learning_rate": 0.00042577609329446064, | |
| "loss": 3.3073, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 14.554195804195803, | |
| "grad_norm": 0.36876383423805237, | |
| "learning_rate": 0.000425601166180758, | |
| "loss": 3.307, | |
| "step": 49950 | |
| }, | |
| { | |
| "epoch": 14.56876456876457, | |
| "grad_norm": 0.36288002133369446, | |
| "learning_rate": 0.00042542623906705534, | |
| "loss": 3.3064, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.56876456876457, | |
| "eval_accuracy": 0.37152504297358135, | |
| "eval_loss": 3.5497894287109375, | |
| "eval_runtime": 180.1492, | |
| "eval_samples_per_second": 92.379, | |
| "eval_steps_per_second": 5.779, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.583333333333334, | |
| "grad_norm": 0.3770962953567505, | |
| "learning_rate": 0.0004252513119533527, | |
| "loss": 3.3217, | |
| "step": 50050 | |
| }, | |
| { | |
| "epoch": 14.597902097902098, | |
| "grad_norm": 0.36296921968460083, | |
| "learning_rate": 0.0004250763848396501, | |
| "loss": 3.3085, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 14.612470862470863, | |
| "grad_norm": 0.3781993091106415, | |
| "learning_rate": 0.00042490145772594747, | |
| "loss": 3.315, | |
| "step": 50150 | |
| }, | |
| { | |
| "epoch": 14.627039627039627, | |
| "grad_norm": 0.38002365827560425, | |
| "learning_rate": 0.00042472653061224484, | |
| "loss": 3.3129, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 14.641608391608392, | |
| "grad_norm": 0.3946523070335388, | |
| "learning_rate": 0.00042455160349854227, | |
| "loss": 3.3144, | |
| "step": 50250 | |
| }, | |
| { | |
| "epoch": 14.656177156177156, | |
| "grad_norm": 0.36559757590293884, | |
| "learning_rate": 0.00042437667638483965, | |
| "loss": 3.3157, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 14.67074592074592, | |
| "grad_norm": 0.40196385979652405, | |
| "learning_rate": 0.000424201749271137, | |
| "loss": 3.3064, | |
| "step": 50350 | |
| }, | |
| { | |
| "epoch": 14.685314685314685, | |
| "grad_norm": 0.39326444268226624, | |
| "learning_rate": 0.0004240268221574344, | |
| "loss": 3.2974, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 14.69988344988345, | |
| "grad_norm": 0.38454002141952515, | |
| "learning_rate": 0.0004238518950437317, | |
| "loss": 3.3185, | |
| "step": 50450 | |
| }, | |
| { | |
| "epoch": 14.714452214452214, | |
| "grad_norm": 0.3821794092655182, | |
| "learning_rate": 0.0004236769679300291, | |
| "loss": 3.3211, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 14.729020979020978, | |
| "grad_norm": 0.36120903491973877, | |
| "learning_rate": 0.00042350204081632647, | |
| "loss": 3.3132, | |
| "step": 50550 | |
| }, | |
| { | |
| "epoch": 14.743589743589745, | |
| "grad_norm": 0.37556177377700806, | |
| "learning_rate": 0.00042332711370262385, | |
| "loss": 3.3167, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 14.758158508158509, | |
| "grad_norm": 0.40567317605018616, | |
| "learning_rate": 0.0004231521865889212, | |
| "loss": 3.3081, | |
| "step": 50650 | |
| }, | |
| { | |
| "epoch": 14.772727272727273, | |
| "grad_norm": 0.37133491039276123, | |
| "learning_rate": 0.00042297725947521865, | |
| "loss": 3.3122, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 14.787296037296038, | |
| "grad_norm": 0.36307451128959656, | |
| "learning_rate": 0.00042280233236151603, | |
| "loss": 3.3193, | |
| "step": 50750 | |
| }, | |
| { | |
| "epoch": 14.801864801864802, | |
| "grad_norm": 0.35331910848617554, | |
| "learning_rate": 0.0004226274052478134, | |
| "loss": 3.3101, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 14.816433566433567, | |
| "grad_norm": 0.3815527558326721, | |
| "learning_rate": 0.0004224524781341108, | |
| "loss": 3.3149, | |
| "step": 50850 | |
| }, | |
| { | |
| "epoch": 14.831002331002331, | |
| "grad_norm": 0.35823893547058105, | |
| "learning_rate": 0.0004222775510204081, | |
| "loss": 3.3195, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 14.845571095571096, | |
| "grad_norm": 0.3718653619289398, | |
| "learning_rate": 0.0004221026239067055, | |
| "loss": 3.3149, | |
| "step": 50950 | |
| }, | |
| { | |
| "epoch": 14.86013986013986, | |
| "grad_norm": 0.4067089557647705, | |
| "learning_rate": 0.00042192769679300285, | |
| "loss": 3.318, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.86013986013986, | |
| "eval_accuracy": 0.3718885163348997, | |
| "eval_loss": 3.541916608810425, | |
| "eval_runtime": 180.0954, | |
| "eval_samples_per_second": 92.407, | |
| "eval_steps_per_second": 5.78, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.874708624708624, | |
| "grad_norm": 0.3652386963367462, | |
| "learning_rate": 0.00042175276967930023, | |
| "loss": 3.3265, | |
| "step": 51050 | |
| }, | |
| { | |
| "epoch": 14.889277389277389, | |
| "grad_norm": 0.3712463080883026, | |
| "learning_rate": 0.00042157784256559766, | |
| "loss": 3.3248, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 14.903846153846153, | |
| "grad_norm": 0.37910208106040955, | |
| "learning_rate": 0.00042140291545189504, | |
| "loss": 3.3283, | |
| "step": 51150 | |
| }, | |
| { | |
| "epoch": 14.918414918414918, | |
| "grad_norm": 0.36953410506248474, | |
| "learning_rate": 0.0004212279883381924, | |
| "loss": 3.3234, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 14.932983682983682, | |
| "grad_norm": 0.3733784556388855, | |
| "learning_rate": 0.0004210530612244898, | |
| "loss": 3.3219, | |
| "step": 51250 | |
| }, | |
| { | |
| "epoch": 14.947552447552448, | |
| "grad_norm": 0.3792276382446289, | |
| "learning_rate": 0.0004208781341107871, | |
| "loss": 3.3232, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 14.962121212121213, | |
| "grad_norm": 0.3714187741279602, | |
| "learning_rate": 0.0004207032069970845, | |
| "loss": 3.3271, | |
| "step": 51350 | |
| }, | |
| { | |
| "epoch": 14.976689976689977, | |
| "grad_norm": 0.3657480478286743, | |
| "learning_rate": 0.00042052827988338186, | |
| "loss": 3.3185, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 14.991258741258742, | |
| "grad_norm": 0.3697648048400879, | |
| "learning_rate": 0.00042035335276967924, | |
| "loss": 3.3353, | |
| "step": 51450 | |
| }, | |
| { | |
| "epoch": 15.005827505827506, | |
| "grad_norm": 0.41941073536872864, | |
| "learning_rate": 0.0004201784256559766, | |
| "loss": 3.2813, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 15.02039627039627, | |
| "grad_norm": 0.3773099482059479, | |
| "learning_rate": 0.00042000349854227404, | |
| "loss": 3.2151, | |
| "step": 51550 | |
| }, | |
| { | |
| "epoch": 15.034965034965035, | |
| "grad_norm": 0.37644755840301514, | |
| "learning_rate": 0.0004198285714285714, | |
| "loss": 3.2213, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 15.0495337995338, | |
| "grad_norm": 0.3629944622516632, | |
| "learning_rate": 0.0004196536443148688, | |
| "loss": 3.2295, | |
| "step": 51650 | |
| }, | |
| { | |
| "epoch": 15.064102564102564, | |
| "grad_norm": 0.3724285960197449, | |
| "learning_rate": 0.00041947871720116617, | |
| "loss": 3.218, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 15.078671328671328, | |
| "grad_norm": 0.3703879117965698, | |
| "learning_rate": 0.0004193037900874635, | |
| "loss": 3.2367, | |
| "step": 51750 | |
| }, | |
| { | |
| "epoch": 15.093240093240093, | |
| "grad_norm": 0.3728949725627899, | |
| "learning_rate": 0.00041912886297376087, | |
| "loss": 3.2414, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 15.107808857808857, | |
| "grad_norm": 0.3949025273323059, | |
| "learning_rate": 0.00041895393586005824, | |
| "loss": 3.2269, | |
| "step": 51850 | |
| }, | |
| { | |
| "epoch": 15.122377622377622, | |
| "grad_norm": 0.3763778507709503, | |
| "learning_rate": 0.0004187790087463556, | |
| "loss": 3.2327, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 15.136946386946388, | |
| "grad_norm": 0.41053783893585205, | |
| "learning_rate": 0.000418604081632653, | |
| "loss": 3.2503, | |
| "step": 51950 | |
| }, | |
| { | |
| "epoch": 15.151515151515152, | |
| "grad_norm": 0.41025668382644653, | |
| "learning_rate": 0.0004184291545189504, | |
| "loss": 3.2508, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.151515151515152, | |
| "eval_accuracy": 0.37150011371036573, | |
| "eval_loss": 3.556598663330078, | |
| "eval_runtime": 180.2269, | |
| "eval_samples_per_second": 92.339, | |
| "eval_steps_per_second": 5.776, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.166083916083917, | |
| "grad_norm": 0.4072258770465851, | |
| "learning_rate": 0.0004182542274052478, | |
| "loss": 3.2452, | |
| "step": 52050 | |
| }, | |
| { | |
| "epoch": 15.180652680652681, | |
| "grad_norm": 0.40523266792297363, | |
| "learning_rate": 0.0004180793002915452, | |
| "loss": 3.2578, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 15.195221445221446, | |
| "grad_norm": 0.38300174474716187, | |
| "learning_rate": 0.00041790437317784255, | |
| "loss": 3.2492, | |
| "step": 52150 | |
| }, | |
| { | |
| "epoch": 15.20979020979021, | |
| "grad_norm": 0.36747708916664124, | |
| "learning_rate": 0.0004177294460641399, | |
| "loss": 3.2537, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 15.224358974358974, | |
| "grad_norm": 0.3879857659339905, | |
| "learning_rate": 0.00041755451895043725, | |
| "loss": 3.2632, | |
| "step": 52250 | |
| }, | |
| { | |
| "epoch": 15.238927738927739, | |
| "grad_norm": 0.368695467710495, | |
| "learning_rate": 0.0004173795918367346, | |
| "loss": 3.2581, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 15.253496503496503, | |
| "grad_norm": 0.3739892840385437, | |
| "learning_rate": 0.000417204664723032, | |
| "loss": 3.265, | |
| "step": 52350 | |
| }, | |
| { | |
| "epoch": 15.268065268065268, | |
| "grad_norm": 0.36324694752693176, | |
| "learning_rate": 0.00041702973760932943, | |
| "loss": 3.2505, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 15.282634032634032, | |
| "grad_norm": 0.39305976033210754, | |
| "learning_rate": 0.0004168548104956268, | |
| "loss": 3.2693, | |
| "step": 52450 | |
| }, | |
| { | |
| "epoch": 15.297202797202797, | |
| "grad_norm": 0.377043217420578, | |
| "learning_rate": 0.0004166798833819242, | |
| "loss": 3.26, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 15.311771561771561, | |
| "grad_norm": 0.3906829059123993, | |
| "learning_rate": 0.00041650495626822156, | |
| "loss": 3.2712, | |
| "step": 52550 | |
| }, | |
| { | |
| "epoch": 15.326340326340326, | |
| "grad_norm": 0.4162210524082184, | |
| "learning_rate": 0.00041633002915451893, | |
| "loss": 3.2637, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 15.340909090909092, | |
| "grad_norm": 0.4120563566684723, | |
| "learning_rate": 0.00041615510204081626, | |
| "loss": 3.2763, | |
| "step": 52650 | |
| }, | |
| { | |
| "epoch": 15.355477855477856, | |
| "grad_norm": 0.3810875713825226, | |
| "learning_rate": 0.00041598017492711363, | |
| "loss": 3.2636, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 15.37004662004662, | |
| "grad_norm": 0.3553137481212616, | |
| "learning_rate": 0.000415805247813411, | |
| "loss": 3.2651, | |
| "step": 52750 | |
| }, | |
| { | |
| "epoch": 15.384615384615385, | |
| "grad_norm": 0.4119865894317627, | |
| "learning_rate": 0.0004156303206997084, | |
| "loss": 3.2804, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 15.39918414918415, | |
| "grad_norm": 0.39863142371177673, | |
| "learning_rate": 0.0004154553935860058, | |
| "loss": 3.2859, | |
| "step": 52850 | |
| }, | |
| { | |
| "epoch": 15.413752913752914, | |
| "grad_norm": 0.38426119089126587, | |
| "learning_rate": 0.0004152804664723032, | |
| "loss": 3.3041, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 15.428321678321678, | |
| "grad_norm": 0.39725998044013977, | |
| "learning_rate": 0.00041510553935860056, | |
| "loss": 3.2784, | |
| "step": 52950 | |
| }, | |
| { | |
| "epoch": 15.442890442890443, | |
| "grad_norm": 0.3563925325870514, | |
| "learning_rate": 0.00041493061224489794, | |
| "loss": 3.2845, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.442890442890443, | |
| "eval_accuracy": 0.37165650955978446, | |
| "eval_loss": 3.5524237155914307, | |
| "eval_runtime": 180.1313, | |
| "eval_samples_per_second": 92.388, | |
| "eval_steps_per_second": 5.779, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.457459207459207, | |
| "grad_norm": 0.4049256145954132, | |
| "learning_rate": 0.0004147556851311953, | |
| "loss": 3.291, | |
| "step": 53050 | |
| }, | |
| { | |
| "epoch": 15.472027972027972, | |
| "grad_norm": 0.3597942292690277, | |
| "learning_rate": 0.00041458075801749264, | |
| "loss": 3.2731, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 15.486596736596736, | |
| "grad_norm": 0.38720211386680603, | |
| "learning_rate": 0.00041440583090379, | |
| "loss": 3.2912, | |
| "step": 53150 | |
| }, | |
| { | |
| "epoch": 15.5011655011655, | |
| "grad_norm": 0.3816832900047302, | |
| "learning_rate": 0.0004142309037900874, | |
| "loss": 3.2809, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 15.515734265734265, | |
| "grad_norm": 0.3881129026412964, | |
| "learning_rate": 0.0004140559766763848, | |
| "loss": 3.292, | |
| "step": 53250 | |
| }, | |
| { | |
| "epoch": 15.530303030303031, | |
| "grad_norm": 0.3714061677455902, | |
| "learning_rate": 0.0004138810495626822, | |
| "loss": 3.2879, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 15.544871794871796, | |
| "grad_norm": 0.37834274768829346, | |
| "learning_rate": 0.00041370612244897957, | |
| "loss": 3.292, | |
| "step": 53350 | |
| }, | |
| { | |
| "epoch": 15.55944055944056, | |
| "grad_norm": 0.392358660697937, | |
| "learning_rate": 0.00041353119533527695, | |
| "loss": 3.2852, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 15.574009324009324, | |
| "grad_norm": 0.36605942249298096, | |
| "learning_rate": 0.0004133562682215743, | |
| "loss": 3.2933, | |
| "step": 53450 | |
| }, | |
| { | |
| "epoch": 15.588578088578089, | |
| "grad_norm": 0.3728536069393158, | |
| "learning_rate": 0.0004131813411078717, | |
| "loss": 3.2953, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 15.603146853146853, | |
| "grad_norm": 0.37290915846824646, | |
| "learning_rate": 0.000413006413994169, | |
| "loss": 3.3002, | |
| "step": 53550 | |
| }, | |
| { | |
| "epoch": 15.617715617715618, | |
| "grad_norm": 0.3864482343196869, | |
| "learning_rate": 0.0004128314868804664, | |
| "loss": 3.2898, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 15.632284382284382, | |
| "grad_norm": 0.3864837884902954, | |
| "learning_rate": 0.00041265655976676377, | |
| "loss": 3.3014, | |
| "step": 53650 | |
| }, | |
| { | |
| "epoch": 15.646853146853147, | |
| "grad_norm": 0.372283011674881, | |
| "learning_rate": 0.0004124816326530612, | |
| "loss": 3.2943, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 15.661421911421911, | |
| "grad_norm": 0.38414448499679565, | |
| "learning_rate": 0.0004123067055393586, | |
| "loss": 3.3033, | |
| "step": 53750 | |
| }, | |
| { | |
| "epoch": 15.675990675990676, | |
| "grad_norm": 0.3930130898952484, | |
| "learning_rate": 0.00041213177842565595, | |
| "loss": 3.29, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 15.69055944055944, | |
| "grad_norm": 0.3804117441177368, | |
| "learning_rate": 0.00041195685131195333, | |
| "loss": 3.2942, | |
| "step": 53850 | |
| }, | |
| { | |
| "epoch": 15.705128205128204, | |
| "grad_norm": 0.3625752627849579, | |
| "learning_rate": 0.0004117819241982507, | |
| "loss": 3.3051, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 15.719696969696969, | |
| "grad_norm": 0.3937501013278961, | |
| "learning_rate": 0.0004116069970845481, | |
| "loss": 3.2888, | |
| "step": 53950 | |
| }, | |
| { | |
| "epoch": 15.734265734265735, | |
| "grad_norm": 0.3985256850719452, | |
| "learning_rate": 0.0004114320699708454, | |
| "loss": 3.2991, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.734265734265735, | |
| "eval_accuracy": 0.37245154139280734, | |
| "eval_loss": 3.542123317718506, | |
| "eval_runtime": 180.2054, | |
| "eval_samples_per_second": 92.35, | |
| "eval_steps_per_second": 5.777, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.7488344988345, | |
| "grad_norm": 0.38483700156211853, | |
| "learning_rate": 0.0004112571428571428, | |
| "loss": 3.2974, | |
| "step": 54050 | |
| }, | |
| { | |
| "epoch": 15.763403263403264, | |
| "grad_norm": 0.3814025819301605, | |
| "learning_rate": 0.00041108221574344015, | |
| "loss": 3.2975, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 15.777972027972028, | |
| "grad_norm": 0.3753068745136261, | |
| "learning_rate": 0.0004109072886297376, | |
| "loss": 3.2989, | |
| "step": 54150 | |
| }, | |
| { | |
| "epoch": 15.792540792540793, | |
| "grad_norm": 0.3675210475921631, | |
| "learning_rate": 0.00041073236151603496, | |
| "loss": 3.3017, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 15.807109557109557, | |
| "grad_norm": 0.38489532470703125, | |
| "learning_rate": 0.00041055743440233234, | |
| "loss": 3.3112, | |
| "step": 54250 | |
| }, | |
| { | |
| "epoch": 15.821678321678322, | |
| "grad_norm": 0.38765832781791687, | |
| "learning_rate": 0.0004103825072886297, | |
| "loss": 3.3079, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 15.836247086247086, | |
| "grad_norm": 0.37342846393585205, | |
| "learning_rate": 0.0004102075801749271, | |
| "loss": 3.3045, | |
| "step": 54350 | |
| }, | |
| { | |
| "epoch": 15.85081585081585, | |
| "grad_norm": 0.37730568647384644, | |
| "learning_rate": 0.00041003265306122446, | |
| "loss": 3.3083, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 15.865384615384615, | |
| "grad_norm": 0.4032607674598694, | |
| "learning_rate": 0.0004098577259475218, | |
| "loss": 3.3173, | |
| "step": 54450 | |
| }, | |
| { | |
| "epoch": 15.87995337995338, | |
| "grad_norm": 0.37657999992370605, | |
| "learning_rate": 0.00040968279883381916, | |
| "loss": 3.3093, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 15.894522144522144, | |
| "grad_norm": 0.35823720693588257, | |
| "learning_rate": 0.0004095078717201166, | |
| "loss": 3.3004, | |
| "step": 54550 | |
| }, | |
| { | |
| "epoch": 15.909090909090908, | |
| "grad_norm": 0.36570194363594055, | |
| "learning_rate": 0.00040933294460641397, | |
| "loss": 3.3058, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 15.923659673659674, | |
| "grad_norm": 0.39722785353660583, | |
| "learning_rate": 0.00040915801749271134, | |
| "loss": 3.2995, | |
| "step": 54650 | |
| }, | |
| { | |
| "epoch": 15.938228438228439, | |
| "grad_norm": 0.40323764085769653, | |
| "learning_rate": 0.0004089830903790087, | |
| "loss": 3.3116, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 15.952797202797203, | |
| "grad_norm": 0.3630395829677582, | |
| "learning_rate": 0.0004088081632653061, | |
| "loss": 3.3032, | |
| "step": 54750 | |
| }, | |
| { | |
| "epoch": 15.967365967365968, | |
| "grad_norm": 0.39046773314476013, | |
| "learning_rate": 0.00040863323615160347, | |
| "loss": 3.3096, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 15.981934731934732, | |
| "grad_norm": 0.4046758711338043, | |
| "learning_rate": 0.00040845830903790085, | |
| "loss": 3.3049, | |
| "step": 54850 | |
| }, | |
| { | |
| "epoch": 15.996503496503497, | |
| "grad_norm": 0.38751789927482605, | |
| "learning_rate": 0.00040828338192419817, | |
| "loss": 3.3225, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 16.01107226107226, | |
| "grad_norm": 0.38193270564079285, | |
| "learning_rate": 0.00040810845481049554, | |
| "loss": 3.2365, | |
| "step": 54950 | |
| }, | |
| { | |
| "epoch": 16.025641025641026, | |
| "grad_norm": 0.36620691418647766, | |
| "learning_rate": 0.00040793352769679297, | |
| "loss": 3.21, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.025641025641026, | |
| "eval_accuracy": 0.37201010528850803, | |
| "eval_loss": 3.5520431995391846, | |
| "eval_runtime": 180.173, | |
| "eval_samples_per_second": 92.367, | |
| "eval_steps_per_second": 5.778, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.04020979020979, | |
| "grad_norm": 0.40240392088890076, | |
| "learning_rate": 0.00040775860058309035, | |
| "loss": 3.2147, | |
| "step": 55050 | |
| }, | |
| { | |
| "epoch": 16.054778554778554, | |
| "grad_norm": 0.38166630268096924, | |
| "learning_rate": 0.0004075836734693877, | |
| "loss": 3.2147, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 16.06934731934732, | |
| "grad_norm": 0.3995780348777771, | |
| "learning_rate": 0.0004074087463556851, | |
| "loss": 3.212, | |
| "step": 55150 | |
| }, | |
| { | |
| "epoch": 16.083916083916083, | |
| "grad_norm": 0.39974555373191833, | |
| "learning_rate": 0.0004072338192419825, | |
| "loss": 3.2148, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 16.098484848484848, | |
| "grad_norm": 0.4011850655078888, | |
| "learning_rate": 0.00040705889212827985, | |
| "loss": 3.2189, | |
| "step": 55250 | |
| }, | |
| { | |
| "epoch": 16.113053613053612, | |
| "grad_norm": 0.38059523701667786, | |
| "learning_rate": 0.00040688396501457723, | |
| "loss": 3.2259, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 16.127622377622377, | |
| "grad_norm": 0.4111097455024719, | |
| "learning_rate": 0.00040670903790087455, | |
| "loss": 3.2338, | |
| "step": 55350 | |
| }, | |
| { | |
| "epoch": 16.14219114219114, | |
| "grad_norm": 0.4073127210140228, | |
| "learning_rate": 0.0004065341107871719, | |
| "loss": 3.2335, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 16.156759906759905, | |
| "grad_norm": 0.35866570472717285, | |
| "learning_rate": 0.00040635918367346935, | |
| "loss": 3.2314, | |
| "step": 55450 | |
| }, | |
| { | |
| "epoch": 16.17132867132867, | |
| "grad_norm": 0.3810841739177704, | |
| "learning_rate": 0.00040618425655976673, | |
| "loss": 3.2441, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 16.185897435897434, | |
| "grad_norm": 0.3563622832298279, | |
| "learning_rate": 0.0004060093294460641, | |
| "loss": 3.238, | |
| "step": 55550 | |
| }, | |
| { | |
| "epoch": 16.2004662004662, | |
| "grad_norm": 0.3715536594390869, | |
| "learning_rate": 0.0004058344023323615, | |
| "loss": 3.242, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 16.215034965034967, | |
| "grad_norm": 0.386802077293396, | |
| "learning_rate": 0.00040565947521865886, | |
| "loss": 3.2535, | |
| "step": 55650 | |
| }, | |
| { | |
| "epoch": 16.22960372960373, | |
| "grad_norm": 0.3603656589984894, | |
| "learning_rate": 0.00040548454810495623, | |
| "loss": 3.2459, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 16.244172494172496, | |
| "grad_norm": 0.36066412925720215, | |
| "learning_rate": 0.0004053096209912536, | |
| "loss": 3.2382, | |
| "step": 55750 | |
| }, | |
| { | |
| "epoch": 16.25874125874126, | |
| "grad_norm": 0.40658852458000183, | |
| "learning_rate": 0.00040513469387755093, | |
| "loss": 3.2406, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 16.273310023310025, | |
| "grad_norm": 0.3678629398345947, | |
| "learning_rate": 0.00040495976676384836, | |
| "loss": 3.2542, | |
| "step": 55850 | |
| }, | |
| { | |
| "epoch": 16.28787878787879, | |
| "grad_norm": 0.3898155093193054, | |
| "learning_rate": 0.00040478483965014574, | |
| "loss": 3.256, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 16.302447552447553, | |
| "grad_norm": 0.3732418417930603, | |
| "learning_rate": 0.0004046099125364431, | |
| "loss": 3.2631, | |
| "step": 55950 | |
| }, | |
| { | |
| "epoch": 16.317016317016318, | |
| "grad_norm": 0.35981082916259766, | |
| "learning_rate": 0.0004044349854227405, | |
| "loss": 3.2507, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.317016317016318, | |
| "eval_accuracy": 0.3716827323225066, | |
| "eval_loss": 3.5528886318206787, | |
| "eval_runtime": 180.4051, | |
| "eval_samples_per_second": 92.248, | |
| "eval_steps_per_second": 5.77, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.331585081585082, | |
| "grad_norm": 0.3968351483345032, | |
| "learning_rate": 0.00040426005830903786, | |
| "loss": 3.2583, | |
| "step": 56050 | |
| }, | |
| { | |
| "epoch": 16.346153846153847, | |
| "grad_norm": 0.3820105791091919, | |
| "learning_rate": 0.00040408513119533524, | |
| "loss": 3.2505, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 16.36072261072261, | |
| "grad_norm": 0.36826398968696594, | |
| "learning_rate": 0.0004039102040816326, | |
| "loss": 3.2555, | |
| "step": 56150 | |
| }, | |
| { | |
| "epoch": 16.375291375291376, | |
| "grad_norm": 0.42605534195899963, | |
| "learning_rate": 0.00040373527696793005, | |
| "loss": 3.2613, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 16.38986013986014, | |
| "grad_norm": 0.38200804591178894, | |
| "learning_rate": 0.0004035603498542273, | |
| "loss": 3.2553, | |
| "step": 56250 | |
| }, | |
| { | |
| "epoch": 16.404428904428904, | |
| "grad_norm": 0.3612186908721924, | |
| "learning_rate": 0.00040338542274052474, | |
| "loss": 3.2572, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 16.41899766899767, | |
| "grad_norm": 0.3683592677116394, | |
| "learning_rate": 0.0004032104956268221, | |
| "loss": 3.2684, | |
| "step": 56350 | |
| }, | |
| { | |
| "epoch": 16.433566433566433, | |
| "grad_norm": 0.3807704448699951, | |
| "learning_rate": 0.0004030355685131195, | |
| "loss": 3.2704, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 16.448135198135198, | |
| "grad_norm": 0.3632655739784241, | |
| "learning_rate": 0.00040286064139941687, | |
| "loss": 3.2699, | |
| "step": 56450 | |
| }, | |
| { | |
| "epoch": 16.462703962703962, | |
| "grad_norm": 0.37278565764427185, | |
| "learning_rate": 0.00040268571428571425, | |
| "loss": 3.2681, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 16.477272727272727, | |
| "grad_norm": 0.3842940628528595, | |
| "learning_rate": 0.0004025107871720116, | |
| "loss": 3.2758, | |
| "step": 56550 | |
| }, | |
| { | |
| "epoch": 16.49184149184149, | |
| "grad_norm": 0.3898995518684387, | |
| "learning_rate": 0.000402335860058309, | |
| "loss": 3.275, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 16.506410256410255, | |
| "grad_norm": 0.378342866897583, | |
| "learning_rate": 0.00040216093294460643, | |
| "loss": 3.2697, | |
| "step": 56650 | |
| }, | |
| { | |
| "epoch": 16.52097902097902, | |
| "grad_norm": 0.40605059266090393, | |
| "learning_rate": 0.00040198600583090375, | |
| "loss": 3.2796, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 16.535547785547784, | |
| "grad_norm": 0.40337276458740234, | |
| "learning_rate": 0.0004018110787172011, | |
| "loss": 3.2796, | |
| "step": 56750 | |
| }, | |
| { | |
| "epoch": 16.55011655011655, | |
| "grad_norm": 0.3659421503543854, | |
| "learning_rate": 0.0004016361516034985, | |
| "loss": 3.2631, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 16.564685314685313, | |
| "grad_norm": 0.3794923424720764, | |
| "learning_rate": 0.0004014612244897959, | |
| "loss": 3.2902, | |
| "step": 56850 | |
| }, | |
| { | |
| "epoch": 16.579254079254078, | |
| "grad_norm": 0.38088348507881165, | |
| "learning_rate": 0.00040128629737609325, | |
| "loss": 3.2792, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 16.593822843822842, | |
| "grad_norm": 0.3972194194793701, | |
| "learning_rate": 0.00040111137026239063, | |
| "loss": 3.2859, | |
| "step": 56950 | |
| }, | |
| { | |
| "epoch": 16.60839160839161, | |
| "grad_norm": 0.39679381251335144, | |
| "learning_rate": 0.000400936443148688, | |
| "loss": 3.2887, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.60839160839161, | |
| "eval_accuracy": 0.3726747288531057, | |
| "eval_loss": 3.5446503162384033, | |
| "eval_runtime": 180.0719, | |
| "eval_samples_per_second": 92.419, | |
| "eval_steps_per_second": 5.781, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.622960372960375, | |
| "grad_norm": 0.38704535365104675, | |
| "learning_rate": 0.00040076151603498543, | |
| "loss": 3.2904, | |
| "step": 57050 | |
| }, | |
| { | |
| "epoch": 16.63752913752914, | |
| "grad_norm": 0.446439653635025, | |
| "learning_rate": 0.0004005865889212828, | |
| "loss": 3.2839, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 16.652097902097903, | |
| "grad_norm": 0.3731631636619568, | |
| "learning_rate": 0.00040041166180758013, | |
| "loss": 3.2772, | |
| "step": 57150 | |
| }, | |
| { | |
| "epoch": 16.666666666666668, | |
| "grad_norm": 0.39002761244773865, | |
| "learning_rate": 0.0004002367346938775, | |
| "loss": 3.2918, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 16.681235431235432, | |
| "grad_norm": 0.41322073340415955, | |
| "learning_rate": 0.0004000618075801749, | |
| "loss": 3.275, | |
| "step": 57250 | |
| }, | |
| { | |
| "epoch": 16.695804195804197, | |
| "grad_norm": 0.3777833878993988, | |
| "learning_rate": 0.00039988688046647226, | |
| "loss": 3.2789, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 16.71037296037296, | |
| "grad_norm": 0.402190238237381, | |
| "learning_rate": 0.00039971195335276963, | |
| "loss": 3.2938, | |
| "step": 57350 | |
| }, | |
| { | |
| "epoch": 16.724941724941726, | |
| "grad_norm": 0.37010467052459717, | |
| "learning_rate": 0.000399537026239067, | |
| "loss": 3.2931, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 16.73951048951049, | |
| "grad_norm": 0.39115190505981445, | |
| "learning_rate": 0.0003993620991253644, | |
| "loss": 3.2819, | |
| "step": 57450 | |
| }, | |
| { | |
| "epoch": 16.754079254079254, | |
| "grad_norm": 0.38583433628082275, | |
| "learning_rate": 0.0003991871720116618, | |
| "loss": 3.2987, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 16.76864801864802, | |
| "grad_norm": 0.38979053497314453, | |
| "learning_rate": 0.0003990122448979592, | |
| "loss": 3.2919, | |
| "step": 57550 | |
| }, | |
| { | |
| "epoch": 16.783216783216783, | |
| "grad_norm": 0.37497058510780334, | |
| "learning_rate": 0.0003988373177842565, | |
| "loss": 3.2887, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 16.797785547785548, | |
| "grad_norm": 0.381260484457016, | |
| "learning_rate": 0.0003986623906705539, | |
| "loss": 3.2818, | |
| "step": 57650 | |
| }, | |
| { | |
| "epoch": 16.812354312354312, | |
| "grad_norm": 0.38522014021873474, | |
| "learning_rate": 0.00039848746355685127, | |
| "loss": 3.2875, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 16.826923076923077, | |
| "grad_norm": 0.3929021656513214, | |
| "learning_rate": 0.00039831253644314864, | |
| "loss": 3.2837, | |
| "step": 57750 | |
| }, | |
| { | |
| "epoch": 16.84149184149184, | |
| "grad_norm": 0.38312867283821106, | |
| "learning_rate": 0.000398137609329446, | |
| "loss": 3.2974, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 16.856060606060606, | |
| "grad_norm": 0.36243414878845215, | |
| "learning_rate": 0.0003979626822157434, | |
| "loss": 3.2953, | |
| "step": 57850 | |
| }, | |
| { | |
| "epoch": 16.87062937062937, | |
| "grad_norm": 0.4058583378791809, | |
| "learning_rate": 0.00039778775510204077, | |
| "loss": 3.288, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 16.885198135198134, | |
| "grad_norm": 0.3815045952796936, | |
| "learning_rate": 0.0003976128279883382, | |
| "loss": 3.292, | |
| "step": 57950 | |
| }, | |
| { | |
| "epoch": 16.8997668997669, | |
| "grad_norm": 0.41494324803352356, | |
| "learning_rate": 0.0003974379008746356, | |
| "loss": 3.2842, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.8997668997669, | |
| "eval_accuracy": 0.3731525005344505, | |
| "eval_loss": 3.5340723991394043, | |
| "eval_runtime": 180.0789, | |
| "eval_samples_per_second": 92.415, | |
| "eval_steps_per_second": 5.781, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.914335664335663, | |
| "grad_norm": 0.3929421901702881, | |
| "learning_rate": 0.0003972629737609329, | |
| "loss": 3.2962, | |
| "step": 58050 | |
| }, | |
| { | |
| "epoch": 16.928904428904428, | |
| "grad_norm": 0.3763855993747711, | |
| "learning_rate": 0.00039708804664723027, | |
| "loss": 3.2997, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 16.943473193473192, | |
| "grad_norm": 0.39408525824546814, | |
| "learning_rate": 0.00039691311953352765, | |
| "loss": 3.29, | |
| "step": 58150 | |
| }, | |
| { | |
| "epoch": 16.958041958041957, | |
| "grad_norm": 0.3723476827144623, | |
| "learning_rate": 0.000396738192419825, | |
| "loss": 3.2971, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 16.97261072261072, | |
| "grad_norm": 0.4028165340423584, | |
| "learning_rate": 0.0003965632653061224, | |
| "loss": 3.3058, | |
| "step": 58250 | |
| }, | |
| { | |
| "epoch": 16.98717948717949, | |
| "grad_norm": 0.3577312231063843, | |
| "learning_rate": 0.0003963883381924198, | |
| "loss": 3.297, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 17.001748251748253, | |
| "grad_norm": 0.417402446269989, | |
| "learning_rate": 0.0003962134110787172, | |
| "loss": 3.284, | |
| "step": 58350 | |
| }, | |
| { | |
| "epoch": 17.016317016317018, | |
| "grad_norm": 0.3560294806957245, | |
| "learning_rate": 0.0003960384839650146, | |
| "loss": 3.195, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 17.030885780885782, | |
| "grad_norm": 0.37743493914604187, | |
| "learning_rate": 0.00039586355685131196, | |
| "loss": 3.1971, | |
| "step": 58450 | |
| }, | |
| { | |
| "epoch": 17.045454545454547, | |
| "grad_norm": 0.4051961302757263, | |
| "learning_rate": 0.0003956886297376093, | |
| "loss": 3.207, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 17.06002331002331, | |
| "grad_norm": 0.36903810501098633, | |
| "learning_rate": 0.00039551370262390665, | |
| "loss": 3.2031, | |
| "step": 58550 | |
| }, | |
| { | |
| "epoch": 17.074592074592076, | |
| "grad_norm": 0.3800497055053711, | |
| "learning_rate": 0.00039533877551020403, | |
| "loss": 3.2032, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 17.08916083916084, | |
| "grad_norm": 0.381396621465683, | |
| "learning_rate": 0.0003951638483965014, | |
| "loss": 3.2163, | |
| "step": 58650 | |
| }, | |
| { | |
| "epoch": 17.103729603729604, | |
| "grad_norm": 0.3686426877975464, | |
| "learning_rate": 0.0003949889212827988, | |
| "loss": 3.209, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 17.11829836829837, | |
| "grad_norm": 0.39614012837409973, | |
| "learning_rate": 0.00039481399416909616, | |
| "loss": 3.2147, | |
| "step": 58750 | |
| }, | |
| { | |
| "epoch": 17.132867132867133, | |
| "grad_norm": 0.4042656719684601, | |
| "learning_rate": 0.0003946390670553936, | |
| "loss": 3.2183, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 17.147435897435898, | |
| "grad_norm": 0.387961208820343, | |
| "learning_rate": 0.00039446413994169096, | |
| "loss": 3.225, | |
| "step": 58850 | |
| }, | |
| { | |
| "epoch": 17.162004662004662, | |
| "grad_norm": 0.3967694342136383, | |
| "learning_rate": 0.00039428921282798834, | |
| "loss": 3.2239, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 17.176573426573427, | |
| "grad_norm": 0.3902028799057007, | |
| "learning_rate": 0.00039411428571428566, | |
| "loss": 3.2237, | |
| "step": 58950 | |
| }, | |
| { | |
| "epoch": 17.19114219114219, | |
| "grad_norm": 0.4269110858440399, | |
| "learning_rate": 0.00039393935860058304, | |
| "loss": 3.2203, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.19114219114219, | |
| "eval_accuracy": 0.3721861388122523, | |
| "eval_loss": 3.5538175106048584, | |
| "eval_runtime": 180.0807, | |
| "eval_samples_per_second": 92.414, | |
| "eval_steps_per_second": 5.781, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.205710955710956, | |
| "grad_norm": 0.4304862916469574, | |
| "learning_rate": 0.0003937644314868804, | |
| "loss": 3.2185, | |
| "step": 59050 | |
| }, | |
| { | |
| "epoch": 17.22027972027972, | |
| "grad_norm": 0.4079671800136566, | |
| "learning_rate": 0.0003935895043731778, | |
| "loss": 3.2219, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 17.234848484848484, | |
| "grad_norm": 0.3957606554031372, | |
| "learning_rate": 0.00039341457725947516, | |
| "loss": 3.2282, | |
| "step": 59150 | |
| }, | |
| { | |
| "epoch": 17.24941724941725, | |
| "grad_norm": 0.40197688341140747, | |
| "learning_rate": 0.0003932396501457726, | |
| "loss": 3.2433, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 17.263986013986013, | |
| "grad_norm": 0.36207619309425354, | |
| "learning_rate": 0.00039306472303206997, | |
| "loss": 3.2276, | |
| "step": 59250 | |
| }, | |
| { | |
| "epoch": 17.278554778554778, | |
| "grad_norm": 0.39968717098236084, | |
| "learning_rate": 0.00039288979591836734, | |
| "loss": 3.2461, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 17.293123543123542, | |
| "grad_norm": 0.435290664434433, | |
| "learning_rate": 0.0003927148688046647, | |
| "loss": 3.2376, | |
| "step": 59350 | |
| }, | |
| { | |
| "epoch": 17.307692307692307, | |
| "grad_norm": 0.3796147108078003, | |
| "learning_rate": 0.00039253994169096204, | |
| "loss": 3.2374, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 17.32226107226107, | |
| "grad_norm": 0.41960448026657104, | |
| "learning_rate": 0.0003923650145772594, | |
| "loss": 3.2476, | |
| "step": 59450 | |
| }, | |
| { | |
| "epoch": 17.336829836829835, | |
| "grad_norm": 0.39346998929977417, | |
| "learning_rate": 0.0003921900874635568, | |
| "loss": 3.2321, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 17.3513986013986, | |
| "grad_norm": 0.4165276288986206, | |
| "learning_rate": 0.00039201516034985417, | |
| "loss": 3.2445, | |
| "step": 59550 | |
| }, | |
| { | |
| "epoch": 17.365967365967364, | |
| "grad_norm": 0.4580809772014618, | |
| "learning_rate": 0.00039184023323615155, | |
| "loss": 3.2556, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 17.38053613053613, | |
| "grad_norm": 0.3814791738986969, | |
| "learning_rate": 0.000391665306122449, | |
| "loss": 3.2478, | |
| "step": 59650 | |
| }, | |
| { | |
| "epoch": 17.395104895104897, | |
| "grad_norm": 0.3738664388656616, | |
| "learning_rate": 0.00039149037900874635, | |
| "loss": 3.2556, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 17.40967365967366, | |
| "grad_norm": 0.3924182951450348, | |
| "learning_rate": 0.0003913154518950437, | |
| "loss": 3.2628, | |
| "step": 59750 | |
| }, | |
| { | |
| "epoch": 17.424242424242426, | |
| "grad_norm": 0.40077292919158936, | |
| "learning_rate": 0.0003911405247813411, | |
| "loss": 3.2548, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 17.43881118881119, | |
| "grad_norm": 0.3633650243282318, | |
| "learning_rate": 0.0003909655976676384, | |
| "loss": 3.2568, | |
| "step": 59850 | |
| }, | |
| { | |
| "epoch": 17.453379953379955, | |
| "grad_norm": 0.3840297758579254, | |
| "learning_rate": 0.0003907906705539358, | |
| "loss": 3.2474, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 17.46794871794872, | |
| "grad_norm": 0.3745962083339691, | |
| "learning_rate": 0.0003906157434402332, | |
| "loss": 3.2496, | |
| "step": 59950 | |
| }, | |
| { | |
| "epoch": 17.482517482517483, | |
| "grad_norm": 0.3834511935710907, | |
| "learning_rate": 0.00039044081632653055, | |
| "loss": 3.2473, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.482517482517483, | |
| "eval_accuracy": 0.3724052105923028, | |
| "eval_loss": 3.548959732055664, | |
| "eval_runtime": 180.0523, | |
| "eval_samples_per_second": 92.429, | |
| "eval_steps_per_second": 5.782, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.497086247086248, | |
| "grad_norm": 0.3921569287776947, | |
| "learning_rate": 0.00039026588921282793, | |
| "loss": 3.2826, | |
| "step": 60050 | |
| }, | |
| { | |
| "epoch": 17.511655011655012, | |
| "grad_norm": 0.39372700452804565, | |
| "learning_rate": 0.00039009096209912536, | |
| "loss": 3.259, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 17.526223776223777, | |
| "grad_norm": 0.4190043807029724, | |
| "learning_rate": 0.00038991603498542273, | |
| "loss": 3.2739, | |
| "step": 60150 | |
| }, | |
| { | |
| "epoch": 17.54079254079254, | |
| "grad_norm": 0.39327865839004517, | |
| "learning_rate": 0.0003897411078717201, | |
| "loss": 3.2594, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 17.555361305361306, | |
| "grad_norm": 0.36699846386909485, | |
| "learning_rate": 0.0003895661807580175, | |
| "loss": 3.2714, | |
| "step": 60250 | |
| }, | |
| { | |
| "epoch": 17.56993006993007, | |
| "grad_norm": 0.4013952314853668, | |
| "learning_rate": 0.0003893912536443148, | |
| "loss": 3.2694, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 17.584498834498834, | |
| "grad_norm": 0.38588792085647583, | |
| "learning_rate": 0.0003892163265306122, | |
| "loss": 3.2683, | |
| "step": 60350 | |
| }, | |
| { | |
| "epoch": 17.5990675990676, | |
| "grad_norm": 0.36934489011764526, | |
| "learning_rate": 0.00038904139941690956, | |
| "loss": 3.271, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 17.613636363636363, | |
| "grad_norm": 0.3960769772529602, | |
| "learning_rate": 0.00038886647230320693, | |
| "loss": 3.2859, | |
| "step": 60450 | |
| }, | |
| { | |
| "epoch": 17.628205128205128, | |
| "grad_norm": 0.37103790044784546, | |
| "learning_rate": 0.00038869154518950436, | |
| "loss": 3.2722, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 17.642773892773892, | |
| "grad_norm": 0.4212398827075958, | |
| "learning_rate": 0.00038851661807580174, | |
| "loss": 3.2787, | |
| "step": 60550 | |
| }, | |
| { | |
| "epoch": 17.657342657342657, | |
| "grad_norm": 0.3627692461013794, | |
| "learning_rate": 0.0003883416909620991, | |
| "loss": 3.2775, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 17.67191142191142, | |
| "grad_norm": 0.3897765576839447, | |
| "learning_rate": 0.0003881667638483965, | |
| "loss": 3.2723, | |
| "step": 60650 | |
| }, | |
| { | |
| "epoch": 17.686480186480185, | |
| "grad_norm": 0.38437172770500183, | |
| "learning_rate": 0.00038799183673469387, | |
| "loss": 3.2617, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 17.70104895104895, | |
| "grad_norm": 0.3970353305339813, | |
| "learning_rate": 0.0003878169096209912, | |
| "loss": 3.2641, | |
| "step": 60750 | |
| }, | |
| { | |
| "epoch": 17.715617715617714, | |
| "grad_norm": 0.38862720131874084, | |
| "learning_rate": 0.00038764198250728856, | |
| "loss": 3.2757, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 17.73018648018648, | |
| "grad_norm": 0.3624350130558014, | |
| "learning_rate": 0.00038746705539358594, | |
| "loss": 3.2727, | |
| "step": 60850 | |
| }, | |
| { | |
| "epoch": 17.744755244755243, | |
| "grad_norm": 0.38513702154159546, | |
| "learning_rate": 0.0003872921282798833, | |
| "loss": 3.2819, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 17.759324009324008, | |
| "grad_norm": 0.3679780066013336, | |
| "learning_rate": 0.00038711720116618075, | |
| "loss": 3.2728, | |
| "step": 60950 | |
| }, | |
| { | |
| "epoch": 17.773892773892776, | |
| "grad_norm": 0.38644713163375854, | |
| "learning_rate": 0.0003869422740524781, | |
| "loss": 3.2853, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.773892773892776, | |
| "eval_accuracy": 0.37325527494978283, | |
| "eval_loss": 3.5380146503448486, | |
| "eval_runtime": 180.1126, | |
| "eval_samples_per_second": 92.398, | |
| "eval_steps_per_second": 5.78, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.78846153846154, | |
| "grad_norm": 0.4223131537437439, | |
| "learning_rate": 0.0003867673469387755, | |
| "loss": 3.266, | |
| "step": 61050 | |
| }, | |
| { | |
| "epoch": 17.803030303030305, | |
| "grad_norm": 0.3889198899269104, | |
| "learning_rate": 0.0003865924198250729, | |
| "loss": 3.2749, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 17.81759906759907, | |
| "grad_norm": 0.37233766913414, | |
| "learning_rate": 0.00038641749271137025, | |
| "loss": 3.2852, | |
| "step": 61150 | |
| }, | |
| { | |
| "epoch": 17.832167832167833, | |
| "grad_norm": 0.37709736824035645, | |
| "learning_rate": 0.00038624256559766757, | |
| "loss": 3.2859, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 17.846736596736598, | |
| "grad_norm": 0.390927255153656, | |
| "learning_rate": 0.00038606763848396495, | |
| "loss": 3.2781, | |
| "step": 61250 | |
| }, | |
| { | |
| "epoch": 17.861305361305362, | |
| "grad_norm": 0.36890149116516113, | |
| "learning_rate": 0.0003858927113702623, | |
| "loss": 3.2759, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 17.875874125874127, | |
| "grad_norm": 0.36303025484085083, | |
| "learning_rate": 0.00038571778425655975, | |
| "loss": 3.2851, | |
| "step": 61350 | |
| }, | |
| { | |
| "epoch": 17.89044289044289, | |
| "grad_norm": 0.43360111117362976, | |
| "learning_rate": 0.00038554285714285713, | |
| "loss": 3.2648, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 17.905011655011656, | |
| "grad_norm": 0.40794098377227783, | |
| "learning_rate": 0.0003853679300291545, | |
| "loss": 3.2768, | |
| "step": 61450 | |
| }, | |
| { | |
| "epoch": 17.91958041958042, | |
| "grad_norm": 0.4121857285499573, | |
| "learning_rate": 0.0003851930029154519, | |
| "loss": 3.2776, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 17.934149184149184, | |
| "grad_norm": 0.40042850375175476, | |
| "learning_rate": 0.00038501807580174926, | |
| "loss": 3.2776, | |
| "step": 61550 | |
| }, | |
| { | |
| "epoch": 17.94871794871795, | |
| "grad_norm": 0.38233354687690735, | |
| "learning_rate": 0.00038484314868804663, | |
| "loss": 3.2909, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 17.963286713286713, | |
| "grad_norm": 0.3981986939907074, | |
| "learning_rate": 0.00038466822157434395, | |
| "loss": 3.2791, | |
| "step": 61650 | |
| }, | |
| { | |
| "epoch": 17.977855477855478, | |
| "grad_norm": 0.38394173979759216, | |
| "learning_rate": 0.00038449329446064133, | |
| "loss": 3.2932, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 17.992424242424242, | |
| "grad_norm": 0.3739701211452484, | |
| "learning_rate": 0.0003843183673469387, | |
| "loss": 3.3018, | |
| "step": 61750 | |
| }, | |
| { | |
| "epoch": 18.006993006993007, | |
| "grad_norm": 0.4334484338760376, | |
| "learning_rate": 0.00038414344023323613, | |
| "loss": 3.2268, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 18.02156177156177, | |
| "grad_norm": 0.42861032485961914, | |
| "learning_rate": 0.0003839685131195335, | |
| "loss": 3.1798, | |
| "step": 61850 | |
| }, | |
| { | |
| "epoch": 18.036130536130536, | |
| "grad_norm": 0.41923579573631287, | |
| "learning_rate": 0.0003837935860058309, | |
| "loss": 3.1827, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 18.0506993006993, | |
| "grad_norm": 0.40670716762542725, | |
| "learning_rate": 0.00038361865889212826, | |
| "loss": 3.193, | |
| "step": 61950 | |
| }, | |
| { | |
| "epoch": 18.065268065268064, | |
| "grad_norm": 0.3825482428073883, | |
| "learning_rate": 0.00038344373177842564, | |
| "loss": 3.1926, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.065268065268064, | |
| "eval_accuracy": 0.37260476228889206, | |
| "eval_loss": 3.55277419090271, | |
| "eval_runtime": 180.0064, | |
| "eval_samples_per_second": 92.452, | |
| "eval_steps_per_second": 5.783, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.07983682983683, | |
| "grad_norm": 0.4042184352874756, | |
| "learning_rate": 0.000383268804664723, | |
| "loss": 3.2012, | |
| "step": 62050 | |
| }, | |
| { | |
| "epoch": 18.094405594405593, | |
| "grad_norm": 0.3851967453956604, | |
| "learning_rate": 0.00038309387755102034, | |
| "loss": 3.1931, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 18.108974358974358, | |
| "grad_norm": 0.4039771556854248, | |
| "learning_rate": 0.0003829189504373177, | |
| "loss": 3.2051, | |
| "step": 62150 | |
| }, | |
| { | |
| "epoch": 18.123543123543122, | |
| "grad_norm": 0.39950627088546753, | |
| "learning_rate": 0.0003827440233236151, | |
| "loss": 3.2027, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 18.138111888111887, | |
| "grad_norm": 0.44474369287490845, | |
| "learning_rate": 0.0003825690962099125, | |
| "loss": 3.2036, | |
| "step": 62250 | |
| }, | |
| { | |
| "epoch": 18.15268065268065, | |
| "grad_norm": 0.38847821950912476, | |
| "learning_rate": 0.0003823941690962099, | |
| "loss": 3.2128, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 18.16724941724942, | |
| "grad_norm": 0.3725135922431946, | |
| "learning_rate": 0.00038221924198250727, | |
| "loss": 3.2201, | |
| "step": 62350 | |
| }, | |
| { | |
| "epoch": 18.181818181818183, | |
| "grad_norm": 0.4199862480163574, | |
| "learning_rate": 0.00038204431486880464, | |
| "loss": 3.2117, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 18.196386946386948, | |
| "grad_norm": 0.41751933097839355, | |
| "learning_rate": 0.000381869387755102, | |
| "loss": 3.2126, | |
| "step": 62450 | |
| }, | |
| { | |
| "epoch": 18.210955710955712, | |
| "grad_norm": 0.404287725687027, | |
| "learning_rate": 0.0003816944606413994, | |
| "loss": 3.225, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 18.225524475524477, | |
| "grad_norm": 0.4140659272670746, | |
| "learning_rate": 0.0003815195335276967, | |
| "loss": 3.2262, | |
| "step": 62550 | |
| }, | |
| { | |
| "epoch": 18.24009324009324, | |
| "grad_norm": 0.4209405779838562, | |
| "learning_rate": 0.0003813446064139941, | |
| "loss": 3.2267, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 18.254662004662006, | |
| "grad_norm": 0.39384543895721436, | |
| "learning_rate": 0.0003811696793002915, | |
| "loss": 3.2191, | |
| "step": 62650 | |
| }, | |
| { | |
| "epoch": 18.26923076923077, | |
| "grad_norm": 0.3749587833881378, | |
| "learning_rate": 0.0003809947521865889, | |
| "loss": 3.2332, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 18.283799533799534, | |
| "grad_norm": 0.4175374507904053, | |
| "learning_rate": 0.0003808198250728863, | |
| "loss": 3.2345, | |
| "step": 62750 | |
| }, | |
| { | |
| "epoch": 18.2983682983683, | |
| "grad_norm": 0.4039442837238312, | |
| "learning_rate": 0.00038064489795918365, | |
| "loss": 3.2273, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 18.312937062937063, | |
| "grad_norm": 0.4071449935436249, | |
| "learning_rate": 0.000380469970845481, | |
| "loss": 3.2226, | |
| "step": 62850 | |
| }, | |
| { | |
| "epoch": 18.327505827505828, | |
| "grad_norm": 0.43547382950782776, | |
| "learning_rate": 0.0003802950437317784, | |
| "loss": 3.2412, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 18.342074592074592, | |
| "grad_norm": 0.4017612040042877, | |
| "learning_rate": 0.0003801201166180758, | |
| "loss": 3.2346, | |
| "step": 62950 | |
| }, | |
| { | |
| "epoch": 18.356643356643357, | |
| "grad_norm": 0.38543474674224854, | |
| "learning_rate": 0.0003799451895043731, | |
| "loss": 3.2341, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.356643356643357, | |
| "eval_accuracy": 0.37304784466529056, | |
| "eval_loss": 3.545969009399414, | |
| "eval_runtime": 180.0148, | |
| "eval_samples_per_second": 92.448, | |
| "eval_steps_per_second": 5.783, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.37121212121212, | |
| "grad_norm": 0.3899354934692383, | |
| "learning_rate": 0.0003797702623906705, | |
| "loss": 3.2418, | |
| "step": 63050 | |
| }, | |
| { | |
| "epoch": 18.385780885780886, | |
| "grad_norm": 0.38112348318099976, | |
| "learning_rate": 0.0003795953352769679, | |
| "loss": 3.2467, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 18.40034965034965, | |
| "grad_norm": 0.4406619966030121, | |
| "learning_rate": 0.0003794204081632653, | |
| "loss": 3.243, | |
| "step": 63150 | |
| }, | |
| { | |
| "epoch": 18.414918414918414, | |
| "grad_norm": 0.3870476484298706, | |
| "learning_rate": 0.00037924548104956266, | |
| "loss": 3.2393, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 18.42948717948718, | |
| "grad_norm": 0.3903353214263916, | |
| "learning_rate": 0.00037907055393586003, | |
| "loss": 3.2434, | |
| "step": 63250 | |
| }, | |
| { | |
| "epoch": 18.444055944055943, | |
| "grad_norm": 0.3915194272994995, | |
| "learning_rate": 0.0003788956268221574, | |
| "loss": 3.248, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 18.458624708624708, | |
| "grad_norm": 0.442837655544281, | |
| "learning_rate": 0.0003787206997084548, | |
| "loss": 3.2541, | |
| "step": 63350 | |
| }, | |
| { | |
| "epoch": 18.473193473193472, | |
| "grad_norm": 0.38805052638053894, | |
| "learning_rate": 0.00037854577259475216, | |
| "loss": 3.2386, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 18.487762237762237, | |
| "grad_norm": 0.4246932566165924, | |
| "learning_rate": 0.0003783708454810495, | |
| "loss": 3.2384, | |
| "step": 63450 | |
| }, | |
| { | |
| "epoch": 18.502331002331, | |
| "grad_norm": 0.3854350745677948, | |
| "learning_rate": 0.00037819591836734686, | |
| "loss": 3.2573, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 18.516899766899765, | |
| "grad_norm": 0.4202209413051605, | |
| "learning_rate": 0.0003780209912536443, | |
| "loss": 3.2449, | |
| "step": 63550 | |
| }, | |
| { | |
| "epoch": 18.53146853146853, | |
| "grad_norm": 0.43968549370765686, | |
| "learning_rate": 0.00037784606413994166, | |
| "loss": 3.2394, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 18.546037296037294, | |
| "grad_norm": 0.4122573137283325, | |
| "learning_rate": 0.00037767113702623904, | |
| "loss": 3.2401, | |
| "step": 63650 | |
| }, | |
| { | |
| "epoch": 18.560606060606062, | |
| "grad_norm": 0.39152082800865173, | |
| "learning_rate": 0.0003774962099125364, | |
| "loss": 3.2445, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 18.575174825174827, | |
| "grad_norm": 0.38848137855529785, | |
| "learning_rate": 0.0003773212827988338, | |
| "loss": 3.262, | |
| "step": 63750 | |
| }, | |
| { | |
| "epoch": 18.58974358974359, | |
| "grad_norm": 0.4093928635120392, | |
| "learning_rate": 0.00037714635568513117, | |
| "loss": 3.2593, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 18.604312354312356, | |
| "grad_norm": 0.4007672667503357, | |
| "learning_rate": 0.00037697142857142854, | |
| "loss": 3.2601, | |
| "step": 63850 | |
| }, | |
| { | |
| "epoch": 18.61888111888112, | |
| "grad_norm": 0.3751702606678009, | |
| "learning_rate": 0.00037679650145772586, | |
| "loss": 3.2469, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 18.633449883449885, | |
| "grad_norm": 0.40671372413635254, | |
| "learning_rate": 0.0003766215743440233, | |
| "loss": 3.2575, | |
| "step": 63950 | |
| }, | |
| { | |
| "epoch": 18.64801864801865, | |
| "grad_norm": 0.40447014570236206, | |
| "learning_rate": 0.00037644664723032067, | |
| "loss": 3.2576, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.64801864801865, | |
| "eval_accuracy": 0.3732128246477977, | |
| "eval_loss": 3.5389418601989746, | |
| "eval_runtime": 180.229, | |
| "eval_samples_per_second": 92.338, | |
| "eval_steps_per_second": 5.776, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.662587412587413, | |
| "grad_norm": 0.406858891248703, | |
| "learning_rate": 0.00037627172011661805, | |
| "loss": 3.2618, | |
| "step": 64050 | |
| }, | |
| { | |
| "epoch": 18.677156177156178, | |
| "grad_norm": 0.39155733585357666, | |
| "learning_rate": 0.0003760967930029154, | |
| "loss": 3.2583, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 18.691724941724942, | |
| "grad_norm": 0.4028909504413605, | |
| "learning_rate": 0.0003759218658892128, | |
| "loss": 3.2607, | |
| "step": 64150 | |
| }, | |
| { | |
| "epoch": 18.706293706293707, | |
| "grad_norm": 0.3763161599636078, | |
| "learning_rate": 0.00037574693877551017, | |
| "loss": 3.2557, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 18.72086247086247, | |
| "grad_norm": 0.3901912569999695, | |
| "learning_rate": 0.00037557201166180755, | |
| "loss": 3.2591, | |
| "step": 64250 | |
| }, | |
| { | |
| "epoch": 18.735431235431236, | |
| "grad_norm": 0.3856413662433624, | |
| "learning_rate": 0.000375397084548105, | |
| "loss": 3.2764, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 18.75, | |
| "grad_norm": 0.38662955164909363, | |
| "learning_rate": 0.00037522215743440225, | |
| "loss": 3.2628, | |
| "step": 64350 | |
| }, | |
| { | |
| "epoch": 18.764568764568764, | |
| "grad_norm": 0.3822769820690155, | |
| "learning_rate": 0.0003750472303206997, | |
| "loss": 3.2645, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 18.77913752913753, | |
| "grad_norm": 0.40630531311035156, | |
| "learning_rate": 0.00037487230320699705, | |
| "loss": 3.2667, | |
| "step": 64450 | |
| }, | |
| { | |
| "epoch": 18.793706293706293, | |
| "grad_norm": 0.40045738220214844, | |
| "learning_rate": 0.00037469737609329443, | |
| "loss": 3.2628, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 18.808275058275058, | |
| "grad_norm": 0.38867759704589844, | |
| "learning_rate": 0.0003745224489795918, | |
| "loss": 3.2687, | |
| "step": 64550 | |
| }, | |
| { | |
| "epoch": 18.822843822843822, | |
| "grad_norm": 0.4422551393508911, | |
| "learning_rate": 0.0003743475218658892, | |
| "loss": 3.275, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 18.837412587412587, | |
| "grad_norm": 0.39280012249946594, | |
| "learning_rate": 0.00037417259475218655, | |
| "loss": 3.2652, | |
| "step": 64650 | |
| }, | |
| { | |
| "epoch": 18.85198135198135, | |
| "grad_norm": 0.39500296115875244, | |
| "learning_rate": 0.00037399766763848393, | |
| "loss": 3.2573, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 18.866550116550115, | |
| "grad_norm": 0.420569509267807, | |
| "learning_rate": 0.00037382274052478136, | |
| "loss": 3.284, | |
| "step": 64750 | |
| }, | |
| { | |
| "epoch": 18.88111888111888, | |
| "grad_norm": 0.395648330450058, | |
| "learning_rate": 0.0003736478134110787, | |
| "loss": 3.2792, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 18.895687645687644, | |
| "grad_norm": 0.4250222146511078, | |
| "learning_rate": 0.00037347288629737606, | |
| "loss": 3.2648, | |
| "step": 64850 | |
| }, | |
| { | |
| "epoch": 18.91025641025641, | |
| "grad_norm": 0.3909446895122528, | |
| "learning_rate": 0.00037329795918367343, | |
| "loss": 3.2897, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 18.924825174825173, | |
| "grad_norm": 0.38186371326446533, | |
| "learning_rate": 0.0003731230320699708, | |
| "loss": 3.2589, | |
| "step": 64950 | |
| }, | |
| { | |
| "epoch": 18.939393939393938, | |
| "grad_norm": 0.3965621888637543, | |
| "learning_rate": 0.0003729481049562682, | |
| "loss": 3.2692, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.939393939393938, | |
| "eval_accuracy": 0.3734865761797127, | |
| "eval_loss": 3.534069776535034, | |
| "eval_runtime": 180.2328, | |
| "eval_samples_per_second": 92.336, | |
| "eval_steps_per_second": 5.776, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.953962703962706, | |
| "grad_norm": 0.404230535030365, | |
| "learning_rate": 0.00037277317784256556, | |
| "loss": 3.27, | |
| "step": 65050 | |
| }, | |
| { | |
| "epoch": 18.96853146853147, | |
| "grad_norm": 0.3822128474712372, | |
| "learning_rate": 0.00037259825072886294, | |
| "loss": 3.2876, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 18.983100233100235, | |
| "grad_norm": 0.4366745054721832, | |
| "learning_rate": 0.00037242332361516037, | |
| "loss": 3.2788, | |
| "step": 65150 | |
| }, | |
| { | |
| "epoch": 18.997668997669, | |
| "grad_norm": 0.38679859042167664, | |
| "learning_rate": 0.00037224839650145774, | |
| "loss": 3.2719, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 19.012237762237763, | |
| "grad_norm": 0.3948976695537567, | |
| "learning_rate": 0.00037207346938775506, | |
| "loss": 3.1947, | |
| "step": 65250 | |
| }, | |
| { | |
| "epoch": 19.026806526806528, | |
| "grad_norm": 0.4068011939525604, | |
| "learning_rate": 0.00037189854227405244, | |
| "loss": 3.175, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 19.041375291375292, | |
| "grad_norm": 0.39423683285713196, | |
| "learning_rate": 0.0003717236151603498, | |
| "loss": 3.1707, | |
| "step": 65350 | |
| }, | |
| { | |
| "epoch": 19.055944055944057, | |
| "grad_norm": 0.3666231036186218, | |
| "learning_rate": 0.0003715486880466472, | |
| "loss": 3.1813, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 19.07051282051282, | |
| "grad_norm": 0.3903319537639618, | |
| "learning_rate": 0.00037137376093294457, | |
| "loss": 3.1921, | |
| "step": 65450 | |
| }, | |
| { | |
| "epoch": 19.085081585081586, | |
| "grad_norm": 0.4077537953853607, | |
| "learning_rate": 0.00037119883381924194, | |
| "loss": 3.1883, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 19.09965034965035, | |
| "grad_norm": 0.40144649147987366, | |
| "learning_rate": 0.0003710239067055393, | |
| "loss": 3.1822, | |
| "step": 65550 | |
| }, | |
| { | |
| "epoch": 19.114219114219114, | |
| "grad_norm": 0.42461147904396057, | |
| "learning_rate": 0.00037084897959183675, | |
| "loss": 3.1975, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 19.12878787878788, | |
| "grad_norm": 0.3930090367794037, | |
| "learning_rate": 0.0003706740524781341, | |
| "loss": 3.1864, | |
| "step": 65650 | |
| }, | |
| { | |
| "epoch": 19.143356643356643, | |
| "grad_norm": 0.3844420909881592, | |
| "learning_rate": 0.00037049912536443145, | |
| "loss": 3.1978, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 19.157925407925408, | |
| "grad_norm": 0.4048396944999695, | |
| "learning_rate": 0.0003703241982507288, | |
| "loss": 3.2046, | |
| "step": 65750 | |
| }, | |
| { | |
| "epoch": 19.172494172494172, | |
| "grad_norm": 0.3697856068611145, | |
| "learning_rate": 0.0003701492711370262, | |
| "loss": 3.1964, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 19.187062937062937, | |
| "grad_norm": 0.42250409722328186, | |
| "learning_rate": 0.0003699743440233236, | |
| "loss": 3.2116, | |
| "step": 65850 | |
| }, | |
| { | |
| "epoch": 19.2016317016317, | |
| "grad_norm": 0.3881857693195343, | |
| "learning_rate": 0.00036979941690962095, | |
| "loss": 3.2068, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 19.216200466200466, | |
| "grad_norm": 0.4232952296733856, | |
| "learning_rate": 0.0003696244897959183, | |
| "loss": 3.1967, | |
| "step": 65950 | |
| }, | |
| { | |
| "epoch": 19.23076923076923, | |
| "grad_norm": 0.39698526263237, | |
| "learning_rate": 0.0003694495626822157, | |
| "loss": 3.2072, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.23076923076923, | |
| "eval_accuracy": 0.3729909306870058, | |
| "eval_loss": 3.551021099090576, | |
| "eval_runtime": 180.0611, | |
| "eval_samples_per_second": 92.424, | |
| "eval_steps_per_second": 5.781, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.245337995337994, | |
| "grad_norm": 0.4016408920288086, | |
| "learning_rate": 0.00036927463556851313, | |
| "loss": 3.2204, | |
| "step": 66050 | |
| }, | |
| { | |
| "epoch": 19.25990675990676, | |
| "grad_norm": 0.42734089493751526, | |
| "learning_rate": 0.0003690997084548105, | |
| "loss": 3.2053, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 19.274475524475523, | |
| "grad_norm": 0.44750434160232544, | |
| "learning_rate": 0.00036892478134110783, | |
| "loss": 3.2228, | |
| "step": 66150 | |
| }, | |
| { | |
| "epoch": 19.289044289044288, | |
| "grad_norm": 0.3957410156726837, | |
| "learning_rate": 0.0003687498542274052, | |
| "loss": 3.2057, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 19.303613053613052, | |
| "grad_norm": 0.41760486364364624, | |
| "learning_rate": 0.0003685749271137026, | |
| "loss": 3.2128, | |
| "step": 66250 | |
| }, | |
| { | |
| "epoch": 19.318181818181817, | |
| "grad_norm": 0.37982597947120667, | |
| "learning_rate": 0.00036839999999999996, | |
| "loss": 3.2198, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 19.33275058275058, | |
| "grad_norm": 0.39311569929122925, | |
| "learning_rate": 0.00036822507288629733, | |
| "loss": 3.2262, | |
| "step": 66350 | |
| }, | |
| { | |
| "epoch": 19.34731934731935, | |
| "grad_norm": 0.40117841958999634, | |
| "learning_rate": 0.0003680501457725947, | |
| "loss": 3.2224, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 19.361888111888113, | |
| "grad_norm": 0.43732988834381104, | |
| "learning_rate": 0.00036787521865889214, | |
| "loss": 3.2191, | |
| "step": 66450 | |
| }, | |
| { | |
| "epoch": 19.376456876456878, | |
| "grad_norm": 0.41658449172973633, | |
| "learning_rate": 0.0003677002915451895, | |
| "loss": 3.224, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 19.391025641025642, | |
| "grad_norm": 0.39912378787994385, | |
| "learning_rate": 0.0003675253644314869, | |
| "loss": 3.2165, | |
| "step": 66550 | |
| }, | |
| { | |
| "epoch": 19.405594405594407, | |
| "grad_norm": 0.41178327798843384, | |
| "learning_rate": 0.0003673504373177842, | |
| "loss": 3.2439, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 19.42016317016317, | |
| "grad_norm": 0.3769591450691223, | |
| "learning_rate": 0.0003671755102040816, | |
| "loss": 3.2277, | |
| "step": 66650 | |
| }, | |
| { | |
| "epoch": 19.434731934731936, | |
| "grad_norm": 0.3860316872596741, | |
| "learning_rate": 0.00036700058309037896, | |
| "loss": 3.2286, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 19.4493006993007, | |
| "grad_norm": 0.38730889558792114, | |
| "learning_rate": 0.00036682565597667634, | |
| "loss": 3.2281, | |
| "step": 66750 | |
| }, | |
| { | |
| "epoch": 19.463869463869464, | |
| "grad_norm": 0.39924201369285583, | |
| "learning_rate": 0.0003666507288629737, | |
| "loss": 3.2363, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 19.47843822843823, | |
| "grad_norm": 0.4194006323814392, | |
| "learning_rate": 0.0003664758017492711, | |
| "loss": 3.227, | |
| "step": 66850 | |
| }, | |
| { | |
| "epoch": 19.493006993006993, | |
| "grad_norm": 0.39736315608024597, | |
| "learning_rate": 0.0003663008746355685, | |
| "loss": 3.2454, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 19.507575757575758, | |
| "grad_norm": 0.39378735423088074, | |
| "learning_rate": 0.0003661259475218659, | |
| "loss": 3.2404, | |
| "step": 66950 | |
| }, | |
| { | |
| "epoch": 19.522144522144522, | |
| "grad_norm": 0.4026825428009033, | |
| "learning_rate": 0.00036595102040816327, | |
| "loss": 3.2441, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.522144522144522, | |
| "eval_accuracy": 0.37330019465991665, | |
| "eval_loss": 3.542999267578125, | |
| "eval_runtime": 179.9313, | |
| "eval_samples_per_second": 92.491, | |
| "eval_steps_per_second": 5.786, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.536713286713287, | |
| "grad_norm": 0.38818758726119995, | |
| "learning_rate": 0.0003657760932944606, | |
| "loss": 3.2529, | |
| "step": 67050 | |
| }, | |
| { | |
| "epoch": 19.55128205128205, | |
| "grad_norm": 0.42251184582710266, | |
| "learning_rate": 0.00036560116618075797, | |
| "loss": 3.2517, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 19.565850815850816, | |
| "grad_norm": 0.45560574531555176, | |
| "learning_rate": 0.00036542623906705534, | |
| "loss": 3.2418, | |
| "step": 67150 | |
| }, | |
| { | |
| "epoch": 19.58041958041958, | |
| "grad_norm": 0.4155190885066986, | |
| "learning_rate": 0.0003652513119533527, | |
| "loss": 3.2352, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 19.594988344988344, | |
| "grad_norm": 0.46676522493362427, | |
| "learning_rate": 0.0003650763848396501, | |
| "loss": 3.2424, | |
| "step": 67250 | |
| }, | |
| { | |
| "epoch": 19.60955710955711, | |
| "grad_norm": 0.4105874001979828, | |
| "learning_rate": 0.0003649014577259475, | |
| "loss": 3.2472, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 19.624125874125873, | |
| "grad_norm": 0.40313708782196045, | |
| "learning_rate": 0.0003647265306122449, | |
| "loss": 3.2436, | |
| "step": 67350 | |
| }, | |
| { | |
| "epoch": 19.638694638694638, | |
| "grad_norm": 0.400311142206192, | |
| "learning_rate": 0.0003645516034985423, | |
| "loss": 3.2383, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 19.653263403263402, | |
| "grad_norm": 0.387896865606308, | |
| "learning_rate": 0.00036437667638483965, | |
| "loss": 3.2514, | |
| "step": 67450 | |
| }, | |
| { | |
| "epoch": 19.667832167832167, | |
| "grad_norm": 0.39638492465019226, | |
| "learning_rate": 0.000364201749271137, | |
| "loss": 3.2524, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 19.68240093240093, | |
| "grad_norm": 0.40816885232925415, | |
| "learning_rate": 0.00036402682215743435, | |
| "loss": 3.2627, | |
| "step": 67550 | |
| }, | |
| { | |
| "epoch": 19.696969696969695, | |
| "grad_norm": 0.39976105093955994, | |
| "learning_rate": 0.0003638518950437317, | |
| "loss": 3.2588, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 19.71153846153846, | |
| "grad_norm": 0.38313519954681396, | |
| "learning_rate": 0.0003636769679300291, | |
| "loss": 3.2609, | |
| "step": 67650 | |
| }, | |
| { | |
| "epoch": 19.726107226107224, | |
| "grad_norm": 0.4652852416038513, | |
| "learning_rate": 0.0003635020408163265, | |
| "loss": 3.264, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 19.740675990675992, | |
| "grad_norm": 0.4328801929950714, | |
| "learning_rate": 0.0003633271137026239, | |
| "loss": 3.2553, | |
| "step": 67750 | |
| }, | |
| { | |
| "epoch": 19.755244755244757, | |
| "grad_norm": 0.38108712434768677, | |
| "learning_rate": 0.0003631521865889213, | |
| "loss": 3.2439, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 19.76981351981352, | |
| "grad_norm": 0.3857068121433258, | |
| "learning_rate": 0.00036297725947521866, | |
| "loss": 3.2565, | |
| "step": 67850 | |
| }, | |
| { | |
| "epoch": 19.784382284382286, | |
| "grad_norm": 0.4026941657066345, | |
| "learning_rate": 0.00036280233236151604, | |
| "loss": 3.2531, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 19.79895104895105, | |
| "grad_norm": 0.3889337480068207, | |
| "learning_rate": 0.00036262740524781336, | |
| "loss": 3.2349, | |
| "step": 67950 | |
| }, | |
| { | |
| "epoch": 19.813519813519815, | |
| "grad_norm": 0.4145674705505371, | |
| "learning_rate": 0.00036245247813411073, | |
| "loss": 3.2584, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.813519813519815, | |
| "eval_accuracy": 0.3736507330261703, | |
| "eval_loss": 3.5368125438690186, | |
| "eval_runtime": 180.8631, | |
| "eval_samples_per_second": 92.014, | |
| "eval_steps_per_second": 5.756, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.82808857808858, | |
| "grad_norm": 0.42266836762428284, | |
| "learning_rate": 0.0003622775510204081, | |
| "loss": 3.255, | |
| "step": 68050 | |
| }, | |
| { | |
| "epoch": 19.842657342657343, | |
| "grad_norm": 0.4349689781665802, | |
| "learning_rate": 0.0003621026239067055, | |
| "loss": 3.2449, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 19.857226107226108, | |
| "grad_norm": 0.4042121171951294, | |
| "learning_rate": 0.00036192769679300286, | |
| "loss": 3.2478, | |
| "step": 68150 | |
| }, | |
| { | |
| "epoch": 19.871794871794872, | |
| "grad_norm": 0.40951982140541077, | |
| "learning_rate": 0.0003617527696793003, | |
| "loss": 3.2761, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 19.886363636363637, | |
| "grad_norm": 0.4130703806877136, | |
| "learning_rate": 0.00036157784256559767, | |
| "loss": 3.2723, | |
| "step": 68250 | |
| }, | |
| { | |
| "epoch": 19.9009324009324, | |
| "grad_norm": 0.39606547355651855, | |
| "learning_rate": 0.00036140291545189504, | |
| "loss": 3.2622, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 19.915501165501166, | |
| "grad_norm": 0.3855551779270172, | |
| "learning_rate": 0.0003612279883381924, | |
| "loss": 3.2545, | |
| "step": 68350 | |
| }, | |
| { | |
| "epoch": 19.93006993006993, | |
| "grad_norm": 0.39541083574295044, | |
| "learning_rate": 0.00036105306122448974, | |
| "loss": 3.2581, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 19.944638694638694, | |
| "grad_norm": 0.43517494201660156, | |
| "learning_rate": 0.0003608781341107871, | |
| "loss": 3.2528, | |
| "step": 68450 | |
| }, | |
| { | |
| "epoch": 19.95920745920746, | |
| "grad_norm": 0.39319750666618347, | |
| "learning_rate": 0.0003607032069970845, | |
| "loss": 3.2749, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 19.973776223776223, | |
| "grad_norm": 0.42578259110450745, | |
| "learning_rate": 0.00036052827988338187, | |
| "loss": 3.2813, | |
| "step": 68550 | |
| }, | |
| { | |
| "epoch": 19.988344988344988, | |
| "grad_norm": 0.41607028245925903, | |
| "learning_rate": 0.0003603533527696793, | |
| "loss": 3.263, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 20.002913752913752, | |
| "grad_norm": 0.40466973185539246, | |
| "learning_rate": 0.00036017842565597667, | |
| "loss": 3.2452, | |
| "step": 68650 | |
| }, | |
| { | |
| "epoch": 20.017482517482517, | |
| "grad_norm": 0.4109882712364197, | |
| "learning_rate": 0.00036000349854227405, | |
| "loss": 3.1553, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 20.03205128205128, | |
| "grad_norm": 0.44405311346054077, | |
| "learning_rate": 0.0003598285714285714, | |
| "loss": 3.1552, | |
| "step": 68750 | |
| }, | |
| { | |
| "epoch": 20.046620046620045, | |
| "grad_norm": 0.405329167842865, | |
| "learning_rate": 0.0003596536443148688, | |
| "loss": 3.1718, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 20.06118881118881, | |
| "grad_norm": 0.4313248097896576, | |
| "learning_rate": 0.0003594787172011661, | |
| "loss": 3.1793, | |
| "step": 68850 | |
| }, | |
| { | |
| "epoch": 20.075757575757574, | |
| "grad_norm": 0.46406006813049316, | |
| "learning_rate": 0.0003593037900874635, | |
| "loss": 3.1654, | |
| "step": 68900 | |
| }, | |
| { | |
| "epoch": 20.09032634032634, | |
| "grad_norm": 0.42525312304496765, | |
| "learning_rate": 0.00035912886297376087, | |
| "loss": 3.1819, | |
| "step": 68950 | |
| }, | |
| { | |
| "epoch": 20.104895104895103, | |
| "grad_norm": 0.38830262422561646, | |
| "learning_rate": 0.00035895393586005825, | |
| "loss": 3.1789, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 20.104895104895103, | |
| "eval_accuracy": 0.3730249144467667, | |
| "eval_loss": 3.551114559173584, | |
| "eval_runtime": 180.1938, | |
| "eval_samples_per_second": 92.356, | |
| "eval_steps_per_second": 5.777, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 20.11946386946387, | |
| "grad_norm": 0.40085506439208984, | |
| "learning_rate": 0.0003587790087463557, | |
| "loss": 3.1873, | |
| "step": 69050 | |
| }, | |
| { | |
| "epoch": 20.134032634032636, | |
| "grad_norm": 0.38859525322914124, | |
| "learning_rate": 0.00035860408163265305, | |
| "loss": 3.1785, | |
| "step": 69100 | |
| }, | |
| { | |
| "epoch": 20.1486013986014, | |
| "grad_norm": 0.39980462193489075, | |
| "learning_rate": 0.00035842915451895043, | |
| "loss": 3.1818, | |
| "step": 69150 | |
| }, | |
| { | |
| "epoch": 20.163170163170165, | |
| "grad_norm": 0.45090606808662415, | |
| "learning_rate": 0.0003582542274052478, | |
| "loss": 3.1847, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 20.17773892773893, | |
| "grad_norm": 0.41566649079322815, | |
| "learning_rate": 0.0003580793002915452, | |
| "loss": 3.1746, | |
| "step": 69250 | |
| }, | |
| { | |
| "epoch": 20.192307692307693, | |
| "grad_norm": 0.3906833231449127, | |
| "learning_rate": 0.0003579043731778425, | |
| "loss": 3.1916, | |
| "step": 69300 | |
| }, | |
| { | |
| "epoch": 20.206876456876458, | |
| "grad_norm": 0.3998190760612488, | |
| "learning_rate": 0.0003577294460641399, | |
| "loss": 3.1898, | |
| "step": 69350 | |
| }, | |
| { | |
| "epoch": 20.221445221445222, | |
| "grad_norm": 0.39457687735557556, | |
| "learning_rate": 0.00035755451895043725, | |
| "loss": 3.2008, | |
| "step": 69400 | |
| }, | |
| { | |
| "epoch": 20.236013986013987, | |
| "grad_norm": 0.41978347301483154, | |
| "learning_rate": 0.00035737959183673463, | |
| "loss": 3.1958, | |
| "step": 69450 | |
| }, | |
| { | |
| "epoch": 20.25058275058275, | |
| "grad_norm": 0.4217435121536255, | |
| "learning_rate": 0.00035720466472303206, | |
| "loss": 3.1889, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 20.265151515151516, | |
| "grad_norm": 0.4164603352546692, | |
| "learning_rate": 0.00035702973760932944, | |
| "loss": 3.2118, | |
| "step": 69550 | |
| }, | |
| { | |
| "epoch": 20.27972027972028, | |
| "grad_norm": 0.39068740606307983, | |
| "learning_rate": 0.0003568548104956268, | |
| "loss": 3.2086, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 20.294289044289044, | |
| "grad_norm": 0.36841413378715515, | |
| "learning_rate": 0.0003566798833819242, | |
| "loss": 3.2165, | |
| "step": 69650 | |
| }, | |
| { | |
| "epoch": 20.30885780885781, | |
| "grad_norm": 0.37738609313964844, | |
| "learning_rate": 0.00035650495626822156, | |
| "loss": 3.2106, | |
| "step": 69700 | |
| }, | |
| { | |
| "epoch": 20.323426573426573, | |
| "grad_norm": 0.3986680209636688, | |
| "learning_rate": 0.0003563300291545189, | |
| "loss": 3.2111, | |
| "step": 69750 | |
| }, | |
| { | |
| "epoch": 20.337995337995338, | |
| "grad_norm": 0.40461841225624084, | |
| "learning_rate": 0.00035615510204081626, | |
| "loss": 3.2073, | |
| "step": 69800 | |
| }, | |
| { | |
| "epoch": 20.352564102564102, | |
| "grad_norm": 0.4184028208255768, | |
| "learning_rate": 0.00035598017492711364, | |
| "loss": 3.2081, | |
| "step": 69850 | |
| }, | |
| { | |
| "epoch": 20.367132867132867, | |
| "grad_norm": 0.39725759625434875, | |
| "learning_rate": 0.00035580524781341107, | |
| "loss": 3.214, | |
| "step": 69900 | |
| }, | |
| { | |
| "epoch": 20.38170163170163, | |
| "grad_norm": 0.3868226706981659, | |
| "learning_rate": 0.00035563032069970844, | |
| "loss": 3.2195, | |
| "step": 69950 | |
| }, | |
| { | |
| "epoch": 20.396270396270396, | |
| "grad_norm": 0.43626147508621216, | |
| "learning_rate": 0.0003554553935860058, | |
| "loss": 3.2145, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 20.396270396270396, | |
| "eval_accuracy": 0.3731882481571748, | |
| "eval_loss": 3.5491859912872314, | |
| "eval_runtime": 180.0447, | |
| "eval_samples_per_second": 92.433, | |
| "eval_steps_per_second": 5.782, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 20.41083916083916, | |
| "grad_norm": 0.42190268635749817, | |
| "learning_rate": 0.0003552804664723032, | |
| "loss": 3.2188, | |
| "step": 70050 | |
| }, | |
| { | |
| "epoch": 20.425407925407924, | |
| "grad_norm": 0.38069528341293335, | |
| "learning_rate": 0.00035510553935860057, | |
| "loss": 3.2178, | |
| "step": 70100 | |
| }, | |
| { | |
| "epoch": 20.43997668997669, | |
| "grad_norm": 0.39980804920196533, | |
| "learning_rate": 0.00035493061224489795, | |
| "loss": 3.2249, | |
| "step": 70150 | |
| }, | |
| { | |
| "epoch": 20.454545454545453, | |
| "grad_norm": 0.4361363649368286, | |
| "learning_rate": 0.00035475568513119527, | |
| "loss": 3.2327, | |
| "step": 70200 | |
| }, | |
| { | |
| "epoch": 20.469114219114218, | |
| "grad_norm": 0.4003528952598572, | |
| "learning_rate": 0.00035458075801749264, | |
| "loss": 3.2181, | |
| "step": 70250 | |
| }, | |
| { | |
| "epoch": 20.483682983682982, | |
| "grad_norm": 0.4071844816207886, | |
| "learning_rate": 0.00035440583090379, | |
| "loss": 3.2183, | |
| "step": 70300 | |
| }, | |
| { | |
| "epoch": 20.498251748251747, | |
| "grad_norm": 0.4098977744579315, | |
| "learning_rate": 0.00035423090379008745, | |
| "loss": 3.229, | |
| "step": 70350 | |
| }, | |
| { | |
| "epoch": 20.51282051282051, | |
| "grad_norm": 0.38837113976478577, | |
| "learning_rate": 0.0003540559766763848, | |
| "loss": 3.2301, | |
| "step": 70400 | |
| }, | |
| { | |
| "epoch": 20.52738927738928, | |
| "grad_norm": 0.4276978671550751, | |
| "learning_rate": 0.0003538810495626822, | |
| "loss": 3.2347, | |
| "step": 70450 | |
| }, | |
| { | |
| "epoch": 20.541958041958043, | |
| "grad_norm": 0.39745762944221497, | |
| "learning_rate": 0.0003537061224489796, | |
| "loss": 3.2343, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 20.556526806526808, | |
| "grad_norm": 0.39127087593078613, | |
| "learning_rate": 0.00035353119533527695, | |
| "loss": 3.2333, | |
| "step": 70550 | |
| }, | |
| { | |
| "epoch": 20.571095571095572, | |
| "grad_norm": 0.38400810956954956, | |
| "learning_rate": 0.00035335626822157433, | |
| "loss": 3.2294, | |
| "step": 70600 | |
| }, | |
| { | |
| "epoch": 20.585664335664337, | |
| "grad_norm": 0.48885342478752136, | |
| "learning_rate": 0.00035318134110787165, | |
| "loss": 3.2447, | |
| "step": 70650 | |
| }, | |
| { | |
| "epoch": 20.6002331002331, | |
| "grad_norm": 0.4058760404586792, | |
| "learning_rate": 0.000353006413994169, | |
| "loss": 3.2314, | |
| "step": 70700 | |
| }, | |
| { | |
| "epoch": 20.614801864801866, | |
| "grad_norm": 0.44265225529670715, | |
| "learning_rate": 0.00035283148688046646, | |
| "loss": 3.2408, | |
| "step": 70750 | |
| }, | |
| { | |
| "epoch": 20.62937062937063, | |
| "grad_norm": 0.39163583517074585, | |
| "learning_rate": 0.00035265655976676383, | |
| "loss": 3.2402, | |
| "step": 70800 | |
| }, | |
| { | |
| "epoch": 20.643939393939394, | |
| "grad_norm": 0.42056941986083984, | |
| "learning_rate": 0.0003524816326530612, | |
| "loss": 3.2357, | |
| "step": 70850 | |
| }, | |
| { | |
| "epoch": 20.65850815850816, | |
| "grad_norm": 0.41706550121307373, | |
| "learning_rate": 0.0003523067055393586, | |
| "loss": 3.2457, | |
| "step": 70900 | |
| }, | |
| { | |
| "epoch": 20.673076923076923, | |
| "grad_norm": 0.41059020161628723, | |
| "learning_rate": 0.00035213177842565596, | |
| "loss": 3.237, | |
| "step": 70950 | |
| }, | |
| { | |
| "epoch": 20.687645687645688, | |
| "grad_norm": 0.38980886340141296, | |
| "learning_rate": 0.00035195685131195333, | |
| "loss": 3.2446, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 20.687645687645688, | |
| "eval_accuracy": 0.3736142798582607, | |
| "eval_loss": 3.539933919906616, | |
| "eval_runtime": 180.0196, | |
| "eval_samples_per_second": 92.445, | |
| "eval_steps_per_second": 5.783, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 20.702214452214452, | |
| "grad_norm": 0.42689770460128784, | |
| "learning_rate": 0.0003517819241982507, | |
| "loss": 3.2479, | |
| "step": 71050 | |
| }, | |
| { | |
| "epoch": 20.716783216783217, | |
| "grad_norm": 0.37170732021331787, | |
| "learning_rate": 0.00035160699708454803, | |
| "loss": 3.2448, | |
| "step": 71100 | |
| }, | |
| { | |
| "epoch": 20.73135198135198, | |
| "grad_norm": 0.40953195095062256, | |
| "learning_rate": 0.0003514320699708454, | |
| "loss": 3.2477, | |
| "step": 71150 | |
| }, | |
| { | |
| "epoch": 20.745920745920746, | |
| "grad_norm": 0.44523903727531433, | |
| "learning_rate": 0.00035125714285714284, | |
| "loss": 3.2392, | |
| "step": 71200 | |
| }, | |
| { | |
| "epoch": 20.76048951048951, | |
| "grad_norm": 0.41191455721855164, | |
| "learning_rate": 0.0003510822157434402, | |
| "loss": 3.2403, | |
| "step": 71250 | |
| }, | |
| { | |
| "epoch": 20.775058275058274, | |
| "grad_norm": 0.38925042748451233, | |
| "learning_rate": 0.0003509072886297376, | |
| "loss": 3.2517, | |
| "step": 71300 | |
| }, | |
| { | |
| "epoch": 20.78962703962704, | |
| "grad_norm": 0.44074341654777527, | |
| "learning_rate": 0.00035073236151603497, | |
| "loss": 3.2596, | |
| "step": 71350 | |
| }, | |
| { | |
| "epoch": 20.804195804195803, | |
| "grad_norm": 0.4275226891040802, | |
| "learning_rate": 0.00035055743440233234, | |
| "loss": 3.2408, | |
| "step": 71400 | |
| }, | |
| { | |
| "epoch": 20.818764568764568, | |
| "grad_norm": 0.42843613028526306, | |
| "learning_rate": 0.0003503825072886297, | |
| "loss": 3.252, | |
| "step": 71450 | |
| }, | |
| { | |
| "epoch": 20.833333333333332, | |
| "grad_norm": 0.42051464319229126, | |
| "learning_rate": 0.0003502075801749271, | |
| "loss": 3.2386, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 20.847902097902097, | |
| "grad_norm": 0.37405192852020264, | |
| "learning_rate": 0.0003500326530612244, | |
| "loss": 3.2573, | |
| "step": 71550 | |
| }, | |
| { | |
| "epoch": 20.86247086247086, | |
| "grad_norm": 0.42603862285614014, | |
| "learning_rate": 0.0003498577259475218, | |
| "loss": 3.2459, | |
| "step": 71600 | |
| }, | |
| { | |
| "epoch": 20.877039627039625, | |
| "grad_norm": 0.45300430059432983, | |
| "learning_rate": 0.0003496827988338192, | |
| "loss": 3.25, | |
| "step": 71650 | |
| }, | |
| { | |
| "epoch": 20.89160839160839, | |
| "grad_norm": 0.40416771173477173, | |
| "learning_rate": 0.0003495078717201166, | |
| "loss": 3.2399, | |
| "step": 71700 | |
| }, | |
| { | |
| "epoch": 20.906177156177158, | |
| "grad_norm": 0.39509859681129456, | |
| "learning_rate": 0.00034933294460641397, | |
| "loss": 3.2518, | |
| "step": 71750 | |
| }, | |
| { | |
| "epoch": 20.920745920745922, | |
| "grad_norm": 0.4095703661441803, | |
| "learning_rate": 0.00034915801749271135, | |
| "loss": 3.2553, | |
| "step": 71800 | |
| }, | |
| { | |
| "epoch": 20.935314685314687, | |
| "grad_norm": 0.4047284722328186, | |
| "learning_rate": 0.0003489830903790087, | |
| "loss": 3.2604, | |
| "step": 71850 | |
| }, | |
| { | |
| "epoch": 20.94988344988345, | |
| "grad_norm": 0.4120003581047058, | |
| "learning_rate": 0.0003488081632653061, | |
| "loss": 3.2629, | |
| "step": 71900 | |
| }, | |
| { | |
| "epoch": 20.964452214452216, | |
| "grad_norm": 0.3764059543609619, | |
| "learning_rate": 0.0003486332361516035, | |
| "loss": 3.2592, | |
| "step": 71950 | |
| }, | |
| { | |
| "epoch": 20.97902097902098, | |
| "grad_norm": 0.4122651219367981, | |
| "learning_rate": 0.0003484583090379008, | |
| "loss": 3.2494, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 20.97902097902098, | |
| "eval_accuracy": 0.37445611285524494, | |
| "eval_loss": 3.5293209552764893, | |
| "eval_runtime": 182.1054, | |
| "eval_samples_per_second": 91.387, | |
| "eval_steps_per_second": 5.716, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 20.993589743589745, | |
| "grad_norm": 0.424654096364975, | |
| "learning_rate": 0.0003482833819241982, | |
| "loss": 3.2465, | |
| "step": 72050 | |
| }, | |
| { | |
| "epoch": 21.00815850815851, | |
| "grad_norm": 0.4177544414997101, | |
| "learning_rate": 0.0003481084548104956, | |
| "loss": 3.1941, | |
| "step": 72100 | |
| }, | |
| { | |
| "epoch": 21.022727272727273, | |
| "grad_norm": 0.397794634103775, | |
| "learning_rate": 0.000347933527696793, | |
| "loss": 3.1429, | |
| "step": 72150 | |
| }, | |
| { | |
| "epoch": 21.037296037296038, | |
| "grad_norm": 0.41001781821250916, | |
| "learning_rate": 0.00034775860058309035, | |
| "loss": 3.1562, | |
| "step": 72200 | |
| }, | |
| { | |
| "epoch": 21.051864801864802, | |
| "grad_norm": 0.39250168204307556, | |
| "learning_rate": 0.00034758367346938773, | |
| "loss": 3.1583, | |
| "step": 72250 | |
| }, | |
| { | |
| "epoch": 21.066433566433567, | |
| "grad_norm": 0.4046579599380493, | |
| "learning_rate": 0.0003474087463556851, | |
| "loss": 3.1576, | |
| "step": 72300 | |
| }, | |
| { | |
| "epoch": 21.08100233100233, | |
| "grad_norm": 0.3915032744407654, | |
| "learning_rate": 0.0003472338192419825, | |
| "loss": 3.1568, | |
| "step": 72350 | |
| }, | |
| { | |
| "epoch": 21.095571095571096, | |
| "grad_norm": 0.40777382254600525, | |
| "learning_rate": 0.0003470588921282799, | |
| "loss": 3.1622, | |
| "step": 72400 | |
| }, | |
| { | |
| "epoch": 21.11013986013986, | |
| "grad_norm": 0.4149940013885498, | |
| "learning_rate": 0.0003468839650145772, | |
| "loss": 3.1852, | |
| "step": 72450 | |
| }, | |
| { | |
| "epoch": 21.124708624708624, | |
| "grad_norm": 0.43859758973121643, | |
| "learning_rate": 0.0003467090379008746, | |
| "loss": 3.1629, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 21.13927738927739, | |
| "grad_norm": 0.43607211112976074, | |
| "learning_rate": 0.000346534110787172, | |
| "loss": 3.1859, | |
| "step": 72550 | |
| }, | |
| { | |
| "epoch": 21.153846153846153, | |
| "grad_norm": 0.40747860074043274, | |
| "learning_rate": 0.00034635918367346936, | |
| "loss": 3.1825, | |
| "step": 72600 | |
| }, | |
| { | |
| "epoch": 21.168414918414918, | |
| "grad_norm": 0.41333723068237305, | |
| "learning_rate": 0.00034618425655976674, | |
| "loss": 3.1762, | |
| "step": 72650 | |
| }, | |
| { | |
| "epoch": 21.182983682983682, | |
| "grad_norm": 0.4163745641708374, | |
| "learning_rate": 0.0003460093294460641, | |
| "loss": 3.1871, | |
| "step": 72700 | |
| }, | |
| { | |
| "epoch": 21.197552447552447, | |
| "grad_norm": 0.403562068939209, | |
| "learning_rate": 0.0003458344023323615, | |
| "loss": 3.1731, | |
| "step": 72750 | |
| }, | |
| { | |
| "epoch": 21.21212121212121, | |
| "grad_norm": 0.4139896631240845, | |
| "learning_rate": 0.00034565947521865886, | |
| "loss": 3.1912, | |
| "step": 72800 | |
| }, | |
| { | |
| "epoch": 21.226689976689975, | |
| "grad_norm": 0.41359269618988037, | |
| "learning_rate": 0.0003454845481049562, | |
| "loss": 3.1819, | |
| "step": 72850 | |
| }, | |
| { | |
| "epoch": 21.24125874125874, | |
| "grad_norm": 0.41218870878219604, | |
| "learning_rate": 0.0003453096209912536, | |
| "loss": 3.1962, | |
| "step": 72900 | |
| }, | |
| { | |
| "epoch": 21.255827505827504, | |
| "grad_norm": 0.414986789226532, | |
| "learning_rate": 0.000345134693877551, | |
| "loss": 3.1907, | |
| "step": 72950 | |
| }, | |
| { | |
| "epoch": 21.27039627039627, | |
| "grad_norm": 0.47167742252349854, | |
| "learning_rate": 0.00034495976676384837, | |
| "loss": 3.1886, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 21.27039627039627, | |
| "eval_accuracy": 0.3735304375720685, | |
| "eval_loss": 3.546489715576172, | |
| "eval_runtime": 229.5916, | |
| "eval_samples_per_second": 72.485, | |
| "eval_steps_per_second": 4.534, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 21.284965034965033, | |
| "grad_norm": 0.42384305596351624, | |
| "learning_rate": 0.00034478483965014574, | |
| "loss": 3.2018, | |
| "step": 73050 | |
| }, | |
| { | |
| "epoch": 21.2995337995338, | |
| "grad_norm": 0.4221298098564148, | |
| "learning_rate": 0.0003446099125364431, | |
| "loss": 3.2019, | |
| "step": 73100 | |
| }, | |
| { | |
| "epoch": 21.314102564102566, | |
| "grad_norm": 0.3836268484592438, | |
| "learning_rate": 0.0003444349854227405, | |
| "loss": 3.2082, | |
| "step": 73150 | |
| }, | |
| { | |
| "epoch": 21.32867132867133, | |
| "grad_norm": 0.41144731640815735, | |
| "learning_rate": 0.00034426005830903787, | |
| "loss": 3.1878, | |
| "step": 73200 | |
| }, | |
| { | |
| "epoch": 21.343240093240095, | |
| "grad_norm": 0.4177376627922058, | |
| "learning_rate": 0.0003440851311953353, | |
| "loss": 3.2069, | |
| "step": 73250 | |
| }, | |
| { | |
| "epoch": 21.35780885780886, | |
| "grad_norm": 0.4552435576915741, | |
| "learning_rate": 0.00034391020408163257, | |
| "loss": 3.1999, | |
| "step": 73300 | |
| }, | |
| { | |
| "epoch": 21.372377622377623, | |
| "grad_norm": 0.41348081827163696, | |
| "learning_rate": 0.00034373527696793, | |
| "loss": 3.2043, | |
| "step": 73350 | |
| }, | |
| { | |
| "epoch": 21.386946386946388, | |
| "grad_norm": 0.40806278586387634, | |
| "learning_rate": 0.00034356034985422737, | |
| "loss": 3.1899, | |
| "step": 73400 | |
| }, | |
| { | |
| "epoch": 21.401515151515152, | |
| "grad_norm": 0.4172794222831726, | |
| "learning_rate": 0.00034338542274052475, | |
| "loss": 3.2038, | |
| "step": 73450 | |
| }, | |
| { | |
| "epoch": 21.416083916083917, | |
| "grad_norm": 0.4296078383922577, | |
| "learning_rate": 0.0003432104956268221, | |
| "loss": 3.2136, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 21.43065268065268, | |
| "grad_norm": 0.379256010055542, | |
| "learning_rate": 0.0003430355685131195, | |
| "loss": 3.1992, | |
| "step": 73550 | |
| }, | |
| { | |
| "epoch": 21.445221445221446, | |
| "grad_norm": 0.4346024692058563, | |
| "learning_rate": 0.0003428606413994169, | |
| "loss": 3.2252, | |
| "step": 73600 | |
| }, | |
| { | |
| "epoch": 21.45979020979021, | |
| "grad_norm": 0.40021541714668274, | |
| "learning_rate": 0.00034268571428571425, | |
| "loss": 3.2068, | |
| "step": 73650 | |
| }, | |
| { | |
| "epoch": 21.474358974358974, | |
| "grad_norm": 0.4183707535266876, | |
| "learning_rate": 0.0003425107871720117, | |
| "loss": 3.2267, | |
| "step": 73700 | |
| }, | |
| { | |
| "epoch": 21.48892773892774, | |
| "grad_norm": 0.4039231836795807, | |
| "learning_rate": 0.00034233586005830895, | |
| "loss": 3.2168, | |
| "step": 73750 | |
| }, | |
| { | |
| "epoch": 21.503496503496503, | |
| "grad_norm": 0.39682918787002563, | |
| "learning_rate": 0.0003421609329446064, | |
| "loss": 3.2184, | |
| "step": 73800 | |
| }, | |
| { | |
| "epoch": 21.518065268065268, | |
| "grad_norm": 0.42791664600372314, | |
| "learning_rate": 0.00034198600583090375, | |
| "loss": 3.2254, | |
| "step": 73850 | |
| }, | |
| { | |
| "epoch": 21.532634032634032, | |
| "grad_norm": 0.447919100522995, | |
| "learning_rate": 0.00034181107871720113, | |
| "loss": 3.2198, | |
| "step": 73900 | |
| }, | |
| { | |
| "epoch": 21.547202797202797, | |
| "grad_norm": 0.4114047586917877, | |
| "learning_rate": 0.0003416361516034985, | |
| "loss": 3.2176, | |
| "step": 73950 | |
| }, | |
| { | |
| "epoch": 21.56177156177156, | |
| "grad_norm": 0.442635715007782, | |
| "learning_rate": 0.0003414612244897959, | |
| "loss": 3.2211, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 21.56177156177156, | |
| "eval_accuracy": 0.37385510594819277, | |
| "eval_loss": 3.5401127338409424, | |
| "eval_runtime": 180.1598, | |
| "eval_samples_per_second": 92.374, | |
| "eval_steps_per_second": 5.778, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 21.576340326340326, | |
| "grad_norm": 0.4588135778903961, | |
| "learning_rate": 0.00034128629737609326, | |
| "loss": 3.2151, | |
| "step": 74050 | |
| }, | |
| { | |
| "epoch": 21.59090909090909, | |
| "grad_norm": 0.4199243187904358, | |
| "learning_rate": 0.00034111137026239063, | |
| "loss": 3.231, | |
| "step": 74100 | |
| }, | |
| { | |
| "epoch": 21.605477855477854, | |
| "grad_norm": 0.41805458068847656, | |
| "learning_rate": 0.00034093644314868806, | |
| "loss": 3.2262, | |
| "step": 74150 | |
| }, | |
| { | |
| "epoch": 21.62004662004662, | |
| "grad_norm": 0.4202272891998291, | |
| "learning_rate": 0.0003407615160349854, | |
| "loss": 3.2326, | |
| "step": 74200 | |
| }, | |
| { | |
| "epoch": 21.634615384615383, | |
| "grad_norm": 0.47583162784576416, | |
| "learning_rate": 0.00034058658892128276, | |
| "loss": 3.2182, | |
| "step": 74250 | |
| }, | |
| { | |
| "epoch": 21.649184149184148, | |
| "grad_norm": 0.40350261330604553, | |
| "learning_rate": 0.00034041166180758014, | |
| "loss": 3.2307, | |
| "step": 74300 | |
| }, | |
| { | |
| "epoch": 21.663752913752912, | |
| "grad_norm": 0.43340378999710083, | |
| "learning_rate": 0.0003402367346938775, | |
| "loss": 3.2299, | |
| "step": 74350 | |
| }, | |
| { | |
| "epoch": 21.67832167832168, | |
| "grad_norm": 0.4040905237197876, | |
| "learning_rate": 0.0003400618075801749, | |
| "loss": 3.238, | |
| "step": 74400 | |
| }, | |
| { | |
| "epoch": 21.692890442890445, | |
| "grad_norm": 0.42097172141075134, | |
| "learning_rate": 0.00033988688046647226, | |
| "loss": 3.2294, | |
| "step": 74450 | |
| }, | |
| { | |
| "epoch": 21.70745920745921, | |
| "grad_norm": 0.39376163482666016, | |
| "learning_rate": 0.00033971195335276964, | |
| "loss": 3.2284, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 21.722027972027973, | |
| "grad_norm": 0.4100789725780487, | |
| "learning_rate": 0.00033953702623906707, | |
| "loss": 3.2379, | |
| "step": 74550 | |
| }, | |
| { | |
| "epoch": 21.736596736596738, | |
| "grad_norm": 0.3906741440296173, | |
| "learning_rate": 0.00033936209912536445, | |
| "loss": 3.2339, | |
| "step": 74600 | |
| }, | |
| { | |
| "epoch": 21.751165501165502, | |
| "grad_norm": 0.4219715893268585, | |
| "learning_rate": 0.00033918717201166177, | |
| "loss": 3.2304, | |
| "step": 74650 | |
| }, | |
| { | |
| "epoch": 21.765734265734267, | |
| "grad_norm": 0.41775527596473694, | |
| "learning_rate": 0.00033901224489795914, | |
| "loss": 3.2425, | |
| "step": 74700 | |
| }, | |
| { | |
| "epoch": 21.78030303030303, | |
| "grad_norm": 0.4094178378582001, | |
| "learning_rate": 0.0003388373177842565, | |
| "loss": 3.2364, | |
| "step": 74750 | |
| }, | |
| { | |
| "epoch": 21.794871794871796, | |
| "grad_norm": 0.4222930073738098, | |
| "learning_rate": 0.0003386623906705539, | |
| "loss": 3.2422, | |
| "step": 74800 | |
| }, | |
| { | |
| "epoch": 21.80944055944056, | |
| "grad_norm": 0.4146324396133423, | |
| "learning_rate": 0.00033848746355685127, | |
| "loss": 3.2283, | |
| "step": 74850 | |
| }, | |
| { | |
| "epoch": 21.824009324009324, | |
| "grad_norm": 0.42270055413246155, | |
| "learning_rate": 0.00033831253644314865, | |
| "loss": 3.25, | |
| "step": 74900 | |
| }, | |
| { | |
| "epoch": 21.83857808857809, | |
| "grad_norm": 0.41215404868125916, | |
| "learning_rate": 0.000338137609329446, | |
| "loss": 3.2358, | |
| "step": 74950 | |
| }, | |
| { | |
| "epoch": 21.853146853146853, | |
| "grad_norm": 0.40176627039909363, | |
| "learning_rate": 0.00033796268221574345, | |
| "loss": 3.2288, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 21.853146853146853, | |
| "eval_accuracy": 0.37451243887920854, | |
| "eval_loss": 3.533621072769165, | |
| "eval_runtime": 180.1417, | |
| "eval_samples_per_second": 92.383, | |
| "eval_steps_per_second": 5.779, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 21.867715617715618, | |
| "grad_norm": 0.41212475299835205, | |
| "learning_rate": 0.00033778775510204083, | |
| "loss": 3.2464, | |
| "step": 75050 | |
| }, | |
| { | |
| "epoch": 21.882284382284382, | |
| "grad_norm": 0.4054611027240753, | |
| "learning_rate": 0.00033761282798833815, | |
| "loss": 3.239, | |
| "step": 75100 | |
| }, | |
| { | |
| "epoch": 21.896853146853147, | |
| "grad_norm": 0.42206400632858276, | |
| "learning_rate": 0.0003374379008746355, | |
| "loss": 3.2397, | |
| "step": 75150 | |
| }, | |
| { | |
| "epoch": 21.91142191142191, | |
| "grad_norm": 0.39644235372543335, | |
| "learning_rate": 0.0003372629737609329, | |
| "loss": 3.2399, | |
| "step": 75200 | |
| }, | |
| { | |
| "epoch": 21.925990675990676, | |
| "grad_norm": 0.46023765206336975, | |
| "learning_rate": 0.0003370880466472303, | |
| "loss": 3.2508, | |
| "step": 75250 | |
| }, | |
| { | |
| "epoch": 21.94055944055944, | |
| "grad_norm": 0.39913883805274963, | |
| "learning_rate": 0.00033691311953352765, | |
| "loss": 3.2372, | |
| "step": 75300 | |
| }, | |
| { | |
| "epoch": 21.955128205128204, | |
| "grad_norm": 0.39186689257621765, | |
| "learning_rate": 0.00033673819241982503, | |
| "loss": 3.2402, | |
| "step": 75350 | |
| }, | |
| { | |
| "epoch": 21.96969696969697, | |
| "grad_norm": 0.42302364110946655, | |
| "learning_rate": 0.00033656326530612246, | |
| "loss": 3.2357, | |
| "step": 75400 | |
| }, | |
| { | |
| "epoch": 21.984265734265733, | |
| "grad_norm": 0.43021634221076965, | |
| "learning_rate": 0.00033638833819241983, | |
| "loss": 3.2478, | |
| "step": 75450 | |
| }, | |
| { | |
| "epoch": 21.998834498834498, | |
| "grad_norm": 0.4020933210849762, | |
| "learning_rate": 0.0003362134110787172, | |
| "loss": 3.2506, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 22.013403263403262, | |
| "grad_norm": 0.40365955233573914, | |
| "learning_rate": 0.00033603848396501453, | |
| "loss": 3.1508, | |
| "step": 75550 | |
| }, | |
| { | |
| "epoch": 22.027972027972027, | |
| "grad_norm": 0.4142661988735199, | |
| "learning_rate": 0.0003358635568513119, | |
| "loss": 3.1432, | |
| "step": 75600 | |
| }, | |
| { | |
| "epoch": 22.04254079254079, | |
| "grad_norm": 0.4072301685810089, | |
| "learning_rate": 0.0003356886297376093, | |
| "loss": 3.1519, | |
| "step": 75650 | |
| }, | |
| { | |
| "epoch": 22.057109557109555, | |
| "grad_norm": 0.43228116631507874, | |
| "learning_rate": 0.00033551370262390666, | |
| "loss": 3.1468, | |
| "step": 75700 | |
| }, | |
| { | |
| "epoch": 22.071678321678323, | |
| "grad_norm": 0.4091419577598572, | |
| "learning_rate": 0.00033533877551020403, | |
| "loss": 3.1599, | |
| "step": 75750 | |
| }, | |
| { | |
| "epoch": 22.086247086247088, | |
| "grad_norm": 0.42613473534584045, | |
| "learning_rate": 0.0003351638483965014, | |
| "loss": 3.1608, | |
| "step": 75800 | |
| }, | |
| { | |
| "epoch": 22.100815850815852, | |
| "grad_norm": 0.41074615716934204, | |
| "learning_rate": 0.00033498892128279884, | |
| "loss": 3.1487, | |
| "step": 75850 | |
| }, | |
| { | |
| "epoch": 22.115384615384617, | |
| "grad_norm": 0.43605613708496094, | |
| "learning_rate": 0.0003348139941690962, | |
| "loss": 3.1563, | |
| "step": 75900 | |
| }, | |
| { | |
| "epoch": 22.12995337995338, | |
| "grad_norm": 0.3988359272480011, | |
| "learning_rate": 0.0003346390670553936, | |
| "loss": 3.1575, | |
| "step": 75950 | |
| }, | |
| { | |
| "epoch": 22.144522144522146, | |
| "grad_norm": 0.3911682367324829, | |
| "learning_rate": 0.0003344641399416909, | |
| "loss": 3.1628, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 22.144522144522146, | |
| "eval_accuracy": 0.3734473008310617, | |
| "eval_loss": 3.5492424964904785, | |
| "eval_runtime": 250.473, | |
| "eval_samples_per_second": 66.442, | |
| "eval_steps_per_second": 4.156, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 22.15909090909091, | |
| "grad_norm": 0.4275292158126831, | |
| "learning_rate": 0.0003342892128279883, | |
| "loss": 3.1832, | |
| "step": 76050 | |
| }, | |
| { | |
| "epoch": 22.173659673659674, | |
| "grad_norm": 0.4048565626144409, | |
| "learning_rate": 0.00033411428571428567, | |
| "loss": 3.1722, | |
| "step": 76100 | |
| }, | |
| { | |
| "epoch": 22.18822843822844, | |
| "grad_norm": 0.45824673771858215, | |
| "learning_rate": 0.00033393935860058304, | |
| "loss": 3.176, | |
| "step": 76150 | |
| }, | |
| { | |
| "epoch": 22.202797202797203, | |
| "grad_norm": 0.4052969813346863, | |
| "learning_rate": 0.0003337644314868804, | |
| "loss": 3.1814, | |
| "step": 76200 | |
| }, | |
| { | |
| "epoch": 22.217365967365968, | |
| "grad_norm": 0.4052939713001251, | |
| "learning_rate": 0.0003335895043731778, | |
| "loss": 3.176, | |
| "step": 76250 | |
| }, | |
| { | |
| "epoch": 22.231934731934732, | |
| "grad_norm": 0.4383087456226349, | |
| "learning_rate": 0.0003334145772594752, | |
| "loss": 3.1829, | |
| "step": 76300 | |
| }, | |
| { | |
| "epoch": 22.246503496503497, | |
| "grad_norm": 0.41173261404037476, | |
| "learning_rate": 0.0003332396501457726, | |
| "loss": 3.1896, | |
| "step": 76350 | |
| }, | |
| { | |
| "epoch": 22.26107226107226, | |
| "grad_norm": 0.42085108160972595, | |
| "learning_rate": 0.00033306472303207, | |
| "loss": 3.1796, | |
| "step": 76400 | |
| }, | |
| { | |
| "epoch": 22.275641025641026, | |
| "grad_norm": 0.4438980221748352, | |
| "learning_rate": 0.0003328897959183673, | |
| "loss": 3.1888, | |
| "step": 76450 | |
| }, | |
| { | |
| "epoch": 22.29020979020979, | |
| "grad_norm": 0.4162982702255249, | |
| "learning_rate": 0.00033271486880466467, | |
| "loss": 3.188, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 22.304778554778554, | |
| "grad_norm": 0.4152987003326416, | |
| "learning_rate": 0.00033253994169096205, | |
| "loss": 3.1885, | |
| "step": 76550 | |
| }, | |
| { | |
| "epoch": 22.31934731934732, | |
| "grad_norm": 0.41857367753982544, | |
| "learning_rate": 0.0003323650145772594, | |
| "loss": 3.1949, | |
| "step": 76600 | |
| }, | |
| { | |
| "epoch": 22.333916083916083, | |
| "grad_norm": 0.43874943256378174, | |
| "learning_rate": 0.0003321900874635568, | |
| "loss": 3.2027, | |
| "step": 76650 | |
| }, | |
| { | |
| "epoch": 22.348484848484848, | |
| "grad_norm": 0.43016231060028076, | |
| "learning_rate": 0.00033201516034985423, | |
| "loss": 3.1932, | |
| "step": 76700 | |
| }, | |
| { | |
| "epoch": 22.363053613053612, | |
| "grad_norm": 0.41237112879753113, | |
| "learning_rate": 0.0003318402332361516, | |
| "loss": 3.1969, | |
| "step": 76750 | |
| }, | |
| { | |
| "epoch": 22.377622377622377, | |
| "grad_norm": 0.39711132645606995, | |
| "learning_rate": 0.000331665306122449, | |
| "loss": 3.2021, | |
| "step": 76800 | |
| }, | |
| { | |
| "epoch": 22.39219114219114, | |
| "grad_norm": 0.41648826003074646, | |
| "learning_rate": 0.00033149037900874636, | |
| "loss": 3.202, | |
| "step": 76850 | |
| }, | |
| { | |
| "epoch": 22.406759906759905, | |
| "grad_norm": 0.43639883399009705, | |
| "learning_rate": 0.0003313154518950437, | |
| "loss": 3.1975, | |
| "step": 76900 | |
| }, | |
| { | |
| "epoch": 22.42132867132867, | |
| "grad_norm": 0.43766576051712036, | |
| "learning_rate": 0.00033114052478134105, | |
| "loss": 3.211, | |
| "step": 76950 | |
| }, | |
| { | |
| "epoch": 22.435897435897434, | |
| "grad_norm": 0.40724071860313416, | |
| "learning_rate": 0.00033096559766763843, | |
| "loss": 3.205, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 22.435897435897434, | |
| "eval_accuracy": 0.37394717959488066, | |
| "eval_loss": 3.5423734188079834, | |
| "eval_runtime": 351.8544, | |
| "eval_samples_per_second": 47.298, | |
| "eval_steps_per_second": 2.959, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 22.4504662004662, | |
| "grad_norm": 0.4043303430080414, | |
| "learning_rate": 0.0003307906705539358, | |
| "loss": 3.2015, | |
| "step": 77050 | |
| }, | |
| { | |
| "epoch": 22.465034965034967, | |
| "grad_norm": 0.4104682207107544, | |
| "learning_rate": 0.0003306157434402332, | |
| "loss": 3.2104, | |
| "step": 77100 | |
| }, | |
| { | |
| "epoch": 22.47960372960373, | |
| "grad_norm": 0.4327828288078308, | |
| "learning_rate": 0.0003304408163265306, | |
| "loss": 3.1976, | |
| "step": 77150 | |
| }, | |
| { | |
| "epoch": 22.494172494172496, | |
| "grad_norm": 0.40035805106163025, | |
| "learning_rate": 0.000330265889212828, | |
| "loss": 3.1973, | |
| "step": 77200 | |
| }, | |
| { | |
| "epoch": 22.50874125874126, | |
| "grad_norm": 0.4154951274394989, | |
| "learning_rate": 0.00033009096209912536, | |
| "loss": 3.2096, | |
| "step": 77250 | |
| }, | |
| { | |
| "epoch": 22.523310023310025, | |
| "grad_norm": 0.4317122995853424, | |
| "learning_rate": 0.00032991603498542274, | |
| "loss": 3.2028, | |
| "step": 77300 | |
| }, | |
| { | |
| "epoch": 22.53787878787879, | |
| "grad_norm": 0.4079900085926056, | |
| "learning_rate": 0.00032974110787172006, | |
| "loss": 3.2131, | |
| "step": 77350 | |
| }, | |
| { | |
| "epoch": 22.552447552447553, | |
| "grad_norm": 0.4091930389404297, | |
| "learning_rate": 0.00032956618075801744, | |
| "loss": 3.202, | |
| "step": 77400 | |
| }, | |
| { | |
| "epoch": 22.567016317016318, | |
| "grad_norm": 0.42974573373794556, | |
| "learning_rate": 0.0003293912536443148, | |
| "loss": 3.221, | |
| "step": 77450 | |
| }, | |
| { | |
| "epoch": 22.581585081585082, | |
| "grad_norm": 0.40895313024520874, | |
| "learning_rate": 0.0003292163265306122, | |
| "loss": 3.2192, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 22.596153846153847, | |
| "grad_norm": 0.4025968611240387, | |
| "learning_rate": 0.00032904139941690956, | |
| "loss": 3.212, | |
| "step": 77550 | |
| }, | |
| { | |
| "epoch": 22.61072261072261, | |
| "grad_norm": 0.40843942761421204, | |
| "learning_rate": 0.000328866472303207, | |
| "loss": 3.2103, | |
| "step": 77600 | |
| }, | |
| { | |
| "epoch": 22.625291375291376, | |
| "grad_norm": 0.3990994095802307, | |
| "learning_rate": 0.00032869154518950437, | |
| "loss": 3.2009, | |
| "step": 77650 | |
| }, | |
| { | |
| "epoch": 22.63986013986014, | |
| "grad_norm": 0.4064718186855316, | |
| "learning_rate": 0.00032851661807580174, | |
| "loss": 3.2152, | |
| "step": 77700 | |
| }, | |
| { | |
| "epoch": 22.654428904428904, | |
| "grad_norm": 0.40382882952690125, | |
| "learning_rate": 0.0003283416909620991, | |
| "loss": 3.2168, | |
| "step": 77750 | |
| }, | |
| { | |
| "epoch": 22.66899766899767, | |
| "grad_norm": 0.4538167417049408, | |
| "learning_rate": 0.00032816676384839644, | |
| "loss": 3.2197, | |
| "step": 77800 | |
| }, | |
| { | |
| "epoch": 22.683566433566433, | |
| "grad_norm": 0.4410167932510376, | |
| "learning_rate": 0.0003279918367346938, | |
| "loss": 3.2207, | |
| "step": 77850 | |
| }, | |
| { | |
| "epoch": 22.698135198135198, | |
| "grad_norm": 0.40950748324394226, | |
| "learning_rate": 0.0003278169096209912, | |
| "loss": 3.225, | |
| "step": 77900 | |
| }, | |
| { | |
| "epoch": 22.712703962703962, | |
| "grad_norm": 0.44317975640296936, | |
| "learning_rate": 0.00032764198250728857, | |
| "loss": 3.2251, | |
| "step": 77950 | |
| }, | |
| { | |
| "epoch": 22.727272727272727, | |
| "grad_norm": 0.4140080213546753, | |
| "learning_rate": 0.000327467055393586, | |
| "loss": 3.2246, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 22.727272727272727, | |
| "eval_accuracy": 0.37460909856960123, | |
| "eval_loss": 3.534909963607788, | |
| "eval_runtime": 180.2239, | |
| "eval_samples_per_second": 92.341, | |
| "eval_steps_per_second": 5.776, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 22.74184149184149, | |
| "grad_norm": 0.41078007221221924, | |
| "learning_rate": 0.0003272921282798834, | |
| "loss": 3.2131, | |
| "step": 78050 | |
| }, | |
| { | |
| "epoch": 22.756410256410255, | |
| "grad_norm": 0.4103488028049469, | |
| "learning_rate": 0.00032711720116618075, | |
| "loss": 3.227, | |
| "step": 78100 | |
| }, | |
| { | |
| "epoch": 22.77097902097902, | |
| "grad_norm": 0.4068191945552826, | |
| "learning_rate": 0.0003269422740524781, | |
| "loss": 3.2225, | |
| "step": 78150 | |
| }, | |
| { | |
| "epoch": 22.785547785547784, | |
| "grad_norm": 0.4318259656429291, | |
| "learning_rate": 0.0003267673469387755, | |
| "loss": 3.2264, | |
| "step": 78200 | |
| }, | |
| { | |
| "epoch": 22.80011655011655, | |
| "grad_norm": 0.40115559101104736, | |
| "learning_rate": 0.0003265924198250728, | |
| "loss": 3.2255, | |
| "step": 78250 | |
| }, | |
| { | |
| "epoch": 22.814685314685313, | |
| "grad_norm": 0.4201257526874542, | |
| "learning_rate": 0.0003264174927113702, | |
| "loss": 3.2325, | |
| "step": 78300 | |
| }, | |
| { | |
| "epoch": 22.829254079254078, | |
| "grad_norm": 0.4182145893573761, | |
| "learning_rate": 0.0003262425655976676, | |
| "loss": 3.2292, | |
| "step": 78350 | |
| }, | |
| { | |
| "epoch": 22.843822843822842, | |
| "grad_norm": 0.4127039611339569, | |
| "learning_rate": 0.00032606763848396495, | |
| "loss": 3.2223, | |
| "step": 78400 | |
| }, | |
| { | |
| "epoch": 22.85839160839161, | |
| "grad_norm": 0.38099154829978943, | |
| "learning_rate": 0.0003258927113702624, | |
| "loss": 3.2266, | |
| "step": 78450 | |
| }, | |
| { | |
| "epoch": 22.872960372960375, | |
| "grad_norm": 0.4170164167881012, | |
| "learning_rate": 0.00032571778425655976, | |
| "loss": 3.2318, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 22.88752913752914, | |
| "grad_norm": 0.41228339076042175, | |
| "learning_rate": 0.00032554285714285713, | |
| "loss": 3.2338, | |
| "step": 78550 | |
| }, | |
| { | |
| "epoch": 22.902097902097903, | |
| "grad_norm": 0.3996942639350891, | |
| "learning_rate": 0.0003253679300291545, | |
| "loss": 3.2232, | |
| "step": 78600 | |
| }, | |
| { | |
| "epoch": 22.916666666666668, | |
| "grad_norm": 0.4170009195804596, | |
| "learning_rate": 0.0003251930029154519, | |
| "loss": 3.2358, | |
| "step": 78650 | |
| }, | |
| { | |
| "epoch": 22.931235431235432, | |
| "grad_norm": 0.40195149183273315, | |
| "learning_rate": 0.0003250180758017492, | |
| "loss": 3.2308, | |
| "step": 78700 | |
| }, | |
| { | |
| "epoch": 22.945804195804197, | |
| "grad_norm": 0.39445433020591736, | |
| "learning_rate": 0.0003248431486880466, | |
| "loss": 3.2227, | |
| "step": 78750 | |
| }, | |
| { | |
| "epoch": 22.96037296037296, | |
| "grad_norm": 0.40387430787086487, | |
| "learning_rate": 0.00032466822157434396, | |
| "loss": 3.2338, | |
| "step": 78800 | |
| }, | |
| { | |
| "epoch": 22.974941724941726, | |
| "grad_norm": 0.4055889844894409, | |
| "learning_rate": 0.0003244932944606414, | |
| "loss": 3.2283, | |
| "step": 78850 | |
| }, | |
| { | |
| "epoch": 22.98951048951049, | |
| "grad_norm": 0.37491124868392944, | |
| "learning_rate": 0.00032431836734693876, | |
| "loss": 3.221, | |
| "step": 78900 | |
| }, | |
| { | |
| "epoch": 23.004079254079254, | |
| "grad_norm": 0.43204745650291443, | |
| "learning_rate": 0.00032414344023323614, | |
| "loss": 3.2207, | |
| "step": 78950 | |
| }, | |
| { | |
| "epoch": 23.01864801864802, | |
| "grad_norm": 0.4061754643917084, | |
| "learning_rate": 0.0003239685131195335, | |
| "loss": 3.1374, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 23.01864801864802, | |
| "eval_accuracy": 0.3740811155892325, | |
| "eval_loss": 3.54728102684021, | |
| "eval_runtime": 180.1553, | |
| "eval_samples_per_second": 92.376, | |
| "eval_steps_per_second": 5.778, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 23.033216783216783, | |
| "grad_norm": 0.4229036867618561, | |
| "learning_rate": 0.0003237935860058309, | |
| "loss": 3.1273, | |
| "step": 79050 | |
| }, | |
| { | |
| "epoch": 23.047785547785548, | |
| "grad_norm": 0.40628716349601746, | |
| "learning_rate": 0.00032361865889212827, | |
| "loss": 3.1343, | |
| "step": 79100 | |
| }, | |
| { | |
| "epoch": 23.062354312354312, | |
| "grad_norm": 0.4286355674266815, | |
| "learning_rate": 0.0003234437317784256, | |
| "loss": 3.1428, | |
| "step": 79150 | |
| }, | |
| { | |
| "epoch": 23.076923076923077, | |
| "grad_norm": 0.4400956332683563, | |
| "learning_rate": 0.00032326880466472296, | |
| "loss": 3.1596, | |
| "step": 79200 | |
| }, | |
| { | |
| "epoch": 23.09149184149184, | |
| "grad_norm": 0.43168753385543823, | |
| "learning_rate": 0.00032309387755102034, | |
| "loss": 3.1411, | |
| "step": 79250 | |
| }, | |
| { | |
| "epoch": 23.106060606060606, | |
| "grad_norm": 0.42677047848701477, | |
| "learning_rate": 0.00032291895043731777, | |
| "loss": 3.1558, | |
| "step": 79300 | |
| }, | |
| { | |
| "epoch": 23.12062937062937, | |
| "grad_norm": 0.425641804933548, | |
| "learning_rate": 0.00032274402332361515, | |
| "loss": 3.1493, | |
| "step": 79350 | |
| }, | |
| { | |
| "epoch": 23.135198135198134, | |
| "grad_norm": 0.41930001974105835, | |
| "learning_rate": 0.0003225690962099125, | |
| "loss": 3.1602, | |
| "step": 79400 | |
| }, | |
| { | |
| "epoch": 23.1497668997669, | |
| "grad_norm": 0.43074584007263184, | |
| "learning_rate": 0.0003223941690962099, | |
| "loss": 3.1561, | |
| "step": 79450 | |
| }, | |
| { | |
| "epoch": 23.164335664335663, | |
| "grad_norm": 0.43243861198425293, | |
| "learning_rate": 0.0003222192419825073, | |
| "loss": 3.1647, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 23.178904428904428, | |
| "grad_norm": 0.44113197922706604, | |
| "learning_rate": 0.00032204431486880465, | |
| "loss": 3.1676, | |
| "step": 79550 | |
| }, | |
| { | |
| "epoch": 23.193473193473192, | |
| "grad_norm": 0.4136090874671936, | |
| "learning_rate": 0.00032186938775510197, | |
| "loss": 3.1604, | |
| "step": 79600 | |
| }, | |
| { | |
| "epoch": 23.208041958041957, | |
| "grad_norm": 0.4486762285232544, | |
| "learning_rate": 0.00032169446064139935, | |
| "loss": 3.1636, | |
| "step": 79650 | |
| }, | |
| { | |
| "epoch": 23.22261072261072, | |
| "grad_norm": 0.47251448035240173, | |
| "learning_rate": 0.0003215195335276967, | |
| "loss": 3.1705, | |
| "step": 79700 | |
| }, | |
| { | |
| "epoch": 23.237179487179485, | |
| "grad_norm": 0.4380472004413605, | |
| "learning_rate": 0.00032134460641399415, | |
| "loss": 3.1662, | |
| "step": 79750 | |
| }, | |
| { | |
| "epoch": 23.251748251748253, | |
| "grad_norm": 0.42042556405067444, | |
| "learning_rate": 0.00032116967930029153, | |
| "loss": 3.1799, | |
| "step": 79800 | |
| }, | |
| { | |
| "epoch": 23.266317016317018, | |
| "grad_norm": 0.4275950491428375, | |
| "learning_rate": 0.0003209947521865889, | |
| "loss": 3.1781, | |
| "step": 79850 | |
| }, | |
| { | |
| "epoch": 23.280885780885782, | |
| "grad_norm": 0.44534599781036377, | |
| "learning_rate": 0.0003208198250728863, | |
| "loss": 3.1668, | |
| "step": 79900 | |
| }, | |
| { | |
| "epoch": 23.295454545454547, | |
| "grad_norm": 0.4342590272426605, | |
| "learning_rate": 0.00032064489795918366, | |
| "loss": 3.1745, | |
| "step": 79950 | |
| }, | |
| { | |
| "epoch": 23.31002331002331, | |
| "grad_norm": 0.4515496492385864, | |
| "learning_rate": 0.00032046997084548103, | |
| "loss": 3.1667, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 23.31002331002331, | |
| "eval_accuracy": 0.3738672178072079, | |
| "eval_loss": 3.548288345336914, | |
| "eval_runtime": 180.3608, | |
| "eval_samples_per_second": 92.271, | |
| "eval_steps_per_second": 5.772, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 23.324592074592076, | |
| "grad_norm": 0.39993399381637573, | |
| "learning_rate": 0.00032029504373177835, | |
| "loss": 3.123, | |
| "step": 80050 | |
| }, | |
| { | |
| "epoch": 23.33916083916084, | |
| "grad_norm": 0.4498705565929413, | |
| "learning_rate": 0.00032012011661807573, | |
| "loss": 3.1427, | |
| "step": 80100 | |
| }, | |
| { | |
| "epoch": 23.353729603729604, | |
| "grad_norm": 0.405280739068985, | |
| "learning_rate": 0.00031994518950437316, | |
| "loss": 3.1473, | |
| "step": 80150 | |
| }, | |
| { | |
| "epoch": 23.36829836829837, | |
| "grad_norm": 0.4300670921802521, | |
| "learning_rate": 0.00031977026239067053, | |
| "loss": 3.1478, | |
| "step": 80200 | |
| }, | |
| { | |
| "epoch": 23.382867132867133, | |
| "grad_norm": 0.4063658118247986, | |
| "learning_rate": 0.0003195953352769679, | |
| "loss": 3.148, | |
| "step": 80250 | |
| }, | |
| { | |
| "epoch": 23.397435897435898, | |
| "grad_norm": 0.4089643359184265, | |
| "learning_rate": 0.0003194204081632653, | |
| "loss": 3.1584, | |
| "step": 80300 | |
| }, | |
| { | |
| "epoch": 23.412004662004662, | |
| "grad_norm": 0.4139532148838043, | |
| "learning_rate": 0.00031924548104956266, | |
| "loss": 3.1676, | |
| "step": 80350 | |
| }, | |
| { | |
| "epoch": 23.426573426573427, | |
| "grad_norm": 0.47233232855796814, | |
| "learning_rate": 0.00031907055393586004, | |
| "loss": 3.1632, | |
| "step": 80400 | |
| }, | |
| { | |
| "epoch": 23.44114219114219, | |
| "grad_norm": 0.4001595973968506, | |
| "learning_rate": 0.0003188956268221574, | |
| "loss": 3.1748, | |
| "step": 80450 | |
| }, | |
| { | |
| "epoch": 23.455710955710956, | |
| "grad_norm": 0.41345474123954773, | |
| "learning_rate": 0.00031872069970845474, | |
| "loss": 3.1672, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 23.47027972027972, | |
| "grad_norm": 0.40790998935699463, | |
| "learning_rate": 0.0003185457725947521, | |
| "loss": 3.1537, | |
| "step": 80550 | |
| }, | |
| { | |
| "epoch": 23.484848484848484, | |
| "grad_norm": 0.405627578496933, | |
| "learning_rate": 0.00031837084548104954, | |
| "loss": 3.1764, | |
| "step": 80600 | |
| }, | |
| { | |
| "epoch": 23.49941724941725, | |
| "grad_norm": 0.42736586928367615, | |
| "learning_rate": 0.0003181959183673469, | |
| "loss": 3.1721, | |
| "step": 80650 | |
| }, | |
| { | |
| "epoch": 23.513986013986013, | |
| "grad_norm": 0.4293386936187744, | |
| "learning_rate": 0.0003180209912536443, | |
| "loss": 3.1747, | |
| "step": 80700 | |
| }, | |
| { | |
| "epoch": 23.528554778554778, | |
| "grad_norm": 0.4108065366744995, | |
| "learning_rate": 0.00031784606413994167, | |
| "loss": 3.1666, | |
| "step": 80750 | |
| }, | |
| { | |
| "epoch": 23.543123543123542, | |
| "grad_norm": 0.4118288457393646, | |
| "learning_rate": 0.00031767113702623904, | |
| "loss": 3.1701, | |
| "step": 80800 | |
| }, | |
| { | |
| "epoch": 23.557692307692307, | |
| "grad_norm": 0.41944974660873413, | |
| "learning_rate": 0.0003174962099125364, | |
| "loss": 3.1658, | |
| "step": 80850 | |
| }, | |
| { | |
| "epoch": 23.57226107226107, | |
| "grad_norm": 0.4191264808177948, | |
| "learning_rate": 0.0003173212827988338, | |
| "loss": 3.1584, | |
| "step": 80900 | |
| }, | |
| { | |
| "epoch": 23.586829836829835, | |
| "grad_norm": 0.41242533922195435, | |
| "learning_rate": 0.0003171463556851311, | |
| "loss": 3.1647, | |
| "step": 80950 | |
| }, | |
| { | |
| "epoch": 23.6013986013986, | |
| "grad_norm": 0.4619014263153076, | |
| "learning_rate": 0.00031697142857142855, | |
| "loss": 3.1871, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 23.6013986013986, | |
| "eval_accuracy": 0.3739357732810509, | |
| "eval_loss": 3.549891233444214, | |
| "eval_runtime": 179.5335, | |
| "eval_samples_per_second": 92.696, | |
| "eval_steps_per_second": 5.798, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 23.615967365967364, | |
| "grad_norm": 0.38858968019485474, | |
| "learning_rate": 0.0003167965014577259, | |
| "loss": 3.1771, | |
| "step": 81050 | |
| }, | |
| { | |
| "epoch": 23.63053613053613, | |
| "grad_norm": 0.43501484394073486, | |
| "learning_rate": 0.0003166215743440233, | |
| "loss": 3.1881, | |
| "step": 81100 | |
| }, | |
| { | |
| "epoch": 23.645104895104897, | |
| "grad_norm": 0.4217482805252075, | |
| "learning_rate": 0.0003164466472303207, | |
| "loss": 3.1812, | |
| "step": 81150 | |
| }, | |
| { | |
| "epoch": 23.65967365967366, | |
| "grad_norm": 0.4172874391078949, | |
| "learning_rate": 0.00031627172011661805, | |
| "loss": 3.1826, | |
| "step": 81200 | |
| }, | |
| { | |
| "epoch": 23.674242424242426, | |
| "grad_norm": 0.3954772651195526, | |
| "learning_rate": 0.0003160967930029154, | |
| "loss": 3.1845, | |
| "step": 81250 | |
| }, | |
| { | |
| "epoch": 23.68881118881119, | |
| "grad_norm": 0.4089130461215973, | |
| "learning_rate": 0.0003159218658892128, | |
| "loss": 3.1718, | |
| "step": 81300 | |
| }, | |
| { | |
| "epoch": 23.703379953379955, | |
| "grad_norm": 0.43155068159103394, | |
| "learning_rate": 0.00031574693877551023, | |
| "loss": 3.2006, | |
| "step": 81350 | |
| }, | |
| { | |
| "epoch": 23.71794871794872, | |
| "grad_norm": 0.4237709045410156, | |
| "learning_rate": 0.0003155720116618075, | |
| "loss": 3.1878, | |
| "step": 81400 | |
| }, | |
| { | |
| "epoch": 23.732517482517483, | |
| "grad_norm": 0.4215518832206726, | |
| "learning_rate": 0.00031539708454810493, | |
| "loss": 3.1905, | |
| "step": 81450 | |
| }, | |
| { | |
| "epoch": 23.747086247086248, | |
| "grad_norm": 0.44571006298065186, | |
| "learning_rate": 0.0003152221574344023, | |
| "loss": 3.1912, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 23.761655011655012, | |
| "grad_norm": 0.443622887134552, | |
| "learning_rate": 0.0003150472303206997, | |
| "loss": 3.199, | |
| "step": 81550 | |
| }, | |
| { | |
| "epoch": 23.776223776223777, | |
| "grad_norm": 0.4024413228034973, | |
| "learning_rate": 0.00031487230320699706, | |
| "loss": 3.1939, | |
| "step": 81600 | |
| }, | |
| { | |
| "epoch": 23.79079254079254, | |
| "grad_norm": 0.45250263810157776, | |
| "learning_rate": 0.00031469737609329443, | |
| "loss": 3.1852, | |
| "step": 81650 | |
| }, | |
| { | |
| "epoch": 23.805361305361306, | |
| "grad_norm": 0.5000615119934082, | |
| "learning_rate": 0.0003145224489795918, | |
| "loss": 3.1972, | |
| "step": 81700 | |
| }, | |
| { | |
| "epoch": 23.81993006993007, | |
| "grad_norm": 0.40920284390449524, | |
| "learning_rate": 0.0003143475218658892, | |
| "loss": 3.1951, | |
| "step": 81750 | |
| }, | |
| { | |
| "epoch": 23.834498834498834, | |
| "grad_norm": 0.400705486536026, | |
| "learning_rate": 0.0003141725947521866, | |
| "loss": 3.2042, | |
| "step": 81800 | |
| }, | |
| { | |
| "epoch": 23.8490675990676, | |
| "grad_norm": 0.4304558038711548, | |
| "learning_rate": 0.0003139976676384839, | |
| "loss": 3.1997, | |
| "step": 81850 | |
| }, | |
| { | |
| "epoch": 23.863636363636363, | |
| "grad_norm": 0.4087577164173126, | |
| "learning_rate": 0.0003138227405247813, | |
| "loss": 3.1963, | |
| "step": 81900 | |
| }, | |
| { | |
| "epoch": 23.878205128205128, | |
| "grad_norm": 0.4356166422367096, | |
| "learning_rate": 0.0003136478134110787, | |
| "loss": 3.2014, | |
| "step": 81950 | |
| }, | |
| { | |
| "epoch": 23.892773892773892, | |
| "grad_norm": 0.4396255314350128, | |
| "learning_rate": 0.00031347288629737606, | |
| "loss": 3.2005, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 23.892773892773892, | |
| "eval_accuracy": 0.37419106304728256, | |
| "eval_loss": 3.5441832542419434, | |
| "eval_runtime": 189.8124, | |
| "eval_samples_per_second": 87.676, | |
| "eval_steps_per_second": 5.484, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 23.907342657342657, | |
| "grad_norm": 0.49265164136886597, | |
| "learning_rate": 0.00031329795918367344, | |
| "loss": 3.1957, | |
| "step": 82050 | |
| }, | |
| { | |
| "epoch": 23.92191142191142, | |
| "grad_norm": 0.42900264263153076, | |
| "learning_rate": 0.0003131230320699708, | |
| "loss": 3.2143, | |
| "step": 82100 | |
| }, | |
| { | |
| "epoch": 23.936480186480185, | |
| "grad_norm": 0.42176422476768494, | |
| "learning_rate": 0.0003129481049562682, | |
| "loss": 3.2066, | |
| "step": 82150 | |
| }, | |
| { | |
| "epoch": 23.95104895104895, | |
| "grad_norm": 0.4115244746208191, | |
| "learning_rate": 0.00031277317784256557, | |
| "loss": 3.1994, | |
| "step": 82200 | |
| }, | |
| { | |
| "epoch": 23.965617715617714, | |
| "grad_norm": 0.4627463221549988, | |
| "learning_rate": 0.000312598250728863, | |
| "loss": 3.205, | |
| "step": 82250 | |
| }, | |
| { | |
| "epoch": 23.98018648018648, | |
| "grad_norm": 0.4280329942703247, | |
| "learning_rate": 0.0003124233236151603, | |
| "loss": 3.2046, | |
| "step": 82300 | |
| }, | |
| { | |
| "epoch": 23.994755244755243, | |
| "grad_norm": 0.4357800781726837, | |
| "learning_rate": 0.0003122483965014577, | |
| "loss": 3.2168, | |
| "step": 82350 | |
| }, | |
| { | |
| "epoch": 24.009324009324008, | |
| "grad_norm": 0.44521504640579224, | |
| "learning_rate": 0.00031207346938775507, | |
| "loss": 3.1641, | |
| "step": 82400 | |
| }, | |
| { | |
| "epoch": 24.023892773892776, | |
| "grad_norm": 0.4130154252052307, | |
| "learning_rate": 0.00031189854227405245, | |
| "loss": 3.1328, | |
| "step": 82450 | |
| }, | |
| { | |
| "epoch": 24.03846153846154, | |
| "grad_norm": 0.4017220437526703, | |
| "learning_rate": 0.0003117236151603498, | |
| "loss": 3.1364, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 24.053030303030305, | |
| "grad_norm": 0.42192500829696655, | |
| "learning_rate": 0.0003115486880466472, | |
| "loss": 3.1237, | |
| "step": 82550 | |
| }, | |
| { | |
| "epoch": 24.06759906759907, | |
| "grad_norm": 0.44063979387283325, | |
| "learning_rate": 0.00031137376093294457, | |
| "loss": 3.1436, | |
| "step": 82600 | |
| }, | |
| { | |
| "epoch": 24.082167832167833, | |
| "grad_norm": 0.4516114592552185, | |
| "learning_rate": 0.000311198833819242, | |
| "loss": 3.1318, | |
| "step": 82650 | |
| }, | |
| { | |
| "epoch": 24.096736596736598, | |
| "grad_norm": 0.4406895339488983, | |
| "learning_rate": 0.0003110239067055394, | |
| "loss": 3.1506, | |
| "step": 82700 | |
| }, | |
| { | |
| "epoch": 24.111305361305362, | |
| "grad_norm": 0.46922528743743896, | |
| "learning_rate": 0.0003108489795918367, | |
| "loss": 3.1586, | |
| "step": 82750 | |
| }, | |
| { | |
| "epoch": 24.125874125874127, | |
| "grad_norm": 0.42122822999954224, | |
| "learning_rate": 0.0003106740524781341, | |
| "loss": 3.1544, | |
| "step": 82800 | |
| }, | |
| { | |
| "epoch": 24.14044289044289, | |
| "grad_norm": 0.46628227829933167, | |
| "learning_rate": 0.00031049912536443145, | |
| "loss": 3.1556, | |
| "step": 82850 | |
| }, | |
| { | |
| "epoch": 24.155011655011656, | |
| "grad_norm": 0.43666237592697144, | |
| "learning_rate": 0.00031032419825072883, | |
| "loss": 3.1521, | |
| "step": 82900 | |
| }, | |
| { | |
| "epoch": 24.16958041958042, | |
| "grad_norm": 0.4410806894302368, | |
| "learning_rate": 0.0003101492711370262, | |
| "loss": 3.1573, | |
| "step": 82950 | |
| }, | |
| { | |
| "epoch": 24.184149184149184, | |
| "grad_norm": 0.4260009825229645, | |
| "learning_rate": 0.0003099743440233236, | |
| "loss": 3.1587, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 24.184149184149184, | |
| "eval_accuracy": 0.3736513209804914, | |
| "eval_loss": 3.5505282878875732, | |
| "eval_runtime": 189.3113, | |
| "eval_samples_per_second": 87.908, | |
| "eval_steps_per_second": 5.499, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 24.19871794871795, | |
| "grad_norm": 0.46315836906433105, | |
| "learning_rate": 0.00030979941690962095, | |
| "loss": 3.1663, | |
| "step": 83050 | |
| }, | |
| { | |
| "epoch": 24.213286713286713, | |
| "grad_norm": 0.40869399905204773, | |
| "learning_rate": 0.0003096244897959184, | |
| "loss": 3.1599, | |
| "step": 83100 | |
| }, | |
| { | |
| "epoch": 24.227855477855478, | |
| "grad_norm": 0.4249207675457001, | |
| "learning_rate": 0.00030944956268221576, | |
| "loss": 3.1681, | |
| "step": 83150 | |
| }, | |
| { | |
| "epoch": 24.242424242424242, | |
| "grad_norm": 0.442433625459671, | |
| "learning_rate": 0.0003092746355685131, | |
| "loss": 3.1746, | |
| "step": 83200 | |
| }, | |
| { | |
| "epoch": 24.256993006993007, | |
| "grad_norm": 0.4539443552494049, | |
| "learning_rate": 0.00030909970845481046, | |
| "loss": 3.1707, | |
| "step": 83250 | |
| }, | |
| { | |
| "epoch": 24.27156177156177, | |
| "grad_norm": 0.4181159734725952, | |
| "learning_rate": 0.00030892478134110783, | |
| "loss": 3.167, | |
| "step": 83300 | |
| }, | |
| { | |
| "epoch": 24.286130536130536, | |
| "grad_norm": 0.45877861976623535, | |
| "learning_rate": 0.0003087498542274052, | |
| "loss": 3.1731, | |
| "step": 83350 | |
| }, | |
| { | |
| "epoch": 24.3006993006993, | |
| "grad_norm": 0.42578256130218506, | |
| "learning_rate": 0.0003085749271137026, | |
| "loss": 3.1787, | |
| "step": 83400 | |
| }, | |
| { | |
| "epoch": 24.315268065268064, | |
| "grad_norm": 0.46334999799728394, | |
| "learning_rate": 0.00030839999999999996, | |
| "loss": 3.1842, | |
| "step": 83450 | |
| }, | |
| { | |
| "epoch": 24.32983682983683, | |
| "grad_norm": 0.4494231045246124, | |
| "learning_rate": 0.0003082250728862974, | |
| "loss": 3.1693, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 24.344405594405593, | |
| "grad_norm": 0.5533521771430969, | |
| "learning_rate": 0.00030805014577259477, | |
| "loss": 3.1596, | |
| "step": 83550 | |
| }, | |
| { | |
| "epoch": 24.358974358974358, | |
| "grad_norm": 0.44728219509124756, | |
| "learning_rate": 0.00030787521865889214, | |
| "loss": 3.1803, | |
| "step": 83600 | |
| }, | |
| { | |
| "epoch": 24.373543123543122, | |
| "grad_norm": 0.43010392785072327, | |
| "learning_rate": 0.00030770029154518946, | |
| "loss": 3.1895, | |
| "step": 83650 | |
| }, | |
| { | |
| "epoch": 24.388111888111887, | |
| "grad_norm": 0.42528676986694336, | |
| "learning_rate": 0.00030752536443148684, | |
| "loss": 3.1935, | |
| "step": 83700 | |
| }, | |
| { | |
| "epoch": 24.40268065268065, | |
| "grad_norm": 0.41684848070144653, | |
| "learning_rate": 0.0003073504373177842, | |
| "loss": 3.1724, | |
| "step": 83750 | |
| }, | |
| { | |
| "epoch": 24.41724941724942, | |
| "grad_norm": 0.42178863286972046, | |
| "learning_rate": 0.0003071755102040816, | |
| "loss": 3.1743, | |
| "step": 83800 | |
| }, | |
| { | |
| "epoch": 24.431818181818183, | |
| "grad_norm": 0.3935507535934448, | |
| "learning_rate": 0.00030700058309037897, | |
| "loss": 3.1922, | |
| "step": 83850 | |
| }, | |
| { | |
| "epoch": 24.446386946386948, | |
| "grad_norm": 0.43585166335105896, | |
| "learning_rate": 0.00030682565597667634, | |
| "loss": 3.1775, | |
| "step": 83900 | |
| }, | |
| { | |
| "epoch": 24.460955710955712, | |
| "grad_norm": 0.4166204035282135, | |
| "learning_rate": 0.0003066507288629738, | |
| "loss": 3.1917, | |
| "step": 83950 | |
| }, | |
| { | |
| "epoch": 24.475524475524477, | |
| "grad_norm": 0.4170168340206146, | |
| "learning_rate": 0.00030647580174927115, | |
| "loss": 3.1966, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 24.475524475524477, | |
| "eval_accuracy": 0.37420364526975464, | |
| "eval_loss": 3.542668104171753, | |
| "eval_runtime": 189.5519, | |
| "eval_samples_per_second": 87.797, | |
| "eval_steps_per_second": 5.492, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 24.49009324009324, | |
| "grad_norm": 0.4078339636325836, | |
| "learning_rate": 0.0003063008746355685, | |
| "loss": 3.1876, | |
| "step": 84050 | |
| }, | |
| { | |
| "epoch": 24.504662004662006, | |
| "grad_norm": 0.39984050393104553, | |
| "learning_rate": 0.00030612594752186585, | |
| "loss": 3.1865, | |
| "step": 84100 | |
| }, | |
| { | |
| "epoch": 24.51923076923077, | |
| "grad_norm": 0.4368041455745697, | |
| "learning_rate": 0.0003059510204081632, | |
| "loss": 3.1874, | |
| "step": 84150 | |
| }, | |
| { | |
| "epoch": 24.533799533799534, | |
| "grad_norm": 0.4150752127170563, | |
| "learning_rate": 0.0003057760932944606, | |
| "loss": 3.1968, | |
| "step": 84200 | |
| }, | |
| { | |
| "epoch": 24.5483682983683, | |
| "grad_norm": 0.4473128914833069, | |
| "learning_rate": 0.000305601166180758, | |
| "loss": 3.1893, | |
| "step": 84250 | |
| }, | |
| { | |
| "epoch": 24.562937062937063, | |
| "grad_norm": 0.40939411520957947, | |
| "learning_rate": 0.00030542623906705535, | |
| "loss": 3.2025, | |
| "step": 84300 | |
| }, | |
| { | |
| "epoch": 24.577505827505828, | |
| "grad_norm": 0.46430233120918274, | |
| "learning_rate": 0.0003052513119533527, | |
| "loss": 3.193, | |
| "step": 84350 | |
| }, | |
| { | |
| "epoch": 24.592074592074592, | |
| "grad_norm": 0.4168838858604431, | |
| "learning_rate": 0.00030507638483965016, | |
| "loss": 3.2013, | |
| "step": 84400 | |
| }, | |
| { | |
| "epoch": 24.606643356643357, | |
| "grad_norm": 0.4212021827697754, | |
| "learning_rate": 0.00030490145772594753, | |
| "loss": 3.1924, | |
| "step": 84450 | |
| }, | |
| { | |
| "epoch": 24.62121212121212, | |
| "grad_norm": 0.4454532563686371, | |
| "learning_rate": 0.0003047265306122449, | |
| "loss": 3.2061, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 24.635780885780886, | |
| "grad_norm": 0.4298340082168579, | |
| "learning_rate": 0.00030455160349854223, | |
| "loss": 3.2033, | |
| "step": 84550 | |
| }, | |
| { | |
| "epoch": 24.65034965034965, | |
| "grad_norm": 0.4507395625114441, | |
| "learning_rate": 0.0003043766763848396, | |
| "loss": 3.2133, | |
| "step": 84600 | |
| }, | |
| { | |
| "epoch": 24.664918414918414, | |
| "grad_norm": 0.40621644258499146, | |
| "learning_rate": 0.000304201749271137, | |
| "loss": 3.1939, | |
| "step": 84650 | |
| }, | |
| { | |
| "epoch": 24.67948717948718, | |
| "grad_norm": 0.43747299909591675, | |
| "learning_rate": 0.00030402682215743436, | |
| "loss": 3.2021, | |
| "step": 84700 | |
| }, | |
| { | |
| "epoch": 24.694055944055943, | |
| "grad_norm": 0.47679662704467773, | |
| "learning_rate": 0.00030385189504373173, | |
| "loss": 3.202, | |
| "step": 84750 | |
| }, | |
| { | |
| "epoch": 24.708624708624708, | |
| "grad_norm": 0.4054070711135864, | |
| "learning_rate": 0.00030367696793002916, | |
| "loss": 3.206, | |
| "step": 84800 | |
| }, | |
| { | |
| "epoch": 24.723193473193472, | |
| "grad_norm": 0.4033360481262207, | |
| "learning_rate": 0.00030350204081632654, | |
| "loss": 3.2069, | |
| "step": 84850 | |
| }, | |
| { | |
| "epoch": 24.737762237762237, | |
| "grad_norm": 0.4348824620246887, | |
| "learning_rate": 0.0003033271137026239, | |
| "loss": 3.2131, | |
| "step": 84900 | |
| }, | |
| { | |
| "epoch": 24.752331002331, | |
| "grad_norm": 0.41652145981788635, | |
| "learning_rate": 0.0003031521865889213, | |
| "loss": 3.2136, | |
| "step": 84950 | |
| }, | |
| { | |
| "epoch": 24.766899766899765, | |
| "grad_norm": 0.4814378321170807, | |
| "learning_rate": 0.0003029772594752186, | |
| "loss": 3.211, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 24.766899766899765, | |
| "eval_accuracy": 0.3745153786508142, | |
| "eval_loss": 3.5338070392608643, | |
| "eval_runtime": 189.6066, | |
| "eval_samples_per_second": 87.771, | |
| "eval_steps_per_second": 5.49, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 24.78146853146853, | |
| "grad_norm": 0.378551185131073, | |
| "learning_rate": 0.000302802332361516, | |
| "loss": 3.214, | |
| "step": 85050 | |
| }, | |
| { | |
| "epoch": 24.796037296037294, | |
| "grad_norm": 0.43937408924102783, | |
| "learning_rate": 0.00030262740524781336, | |
| "loss": 3.2026, | |
| "step": 85100 | |
| }, | |
| { | |
| "epoch": 24.810606060606062, | |
| "grad_norm": 0.399384468793869, | |
| "learning_rate": 0.00030245247813411074, | |
| "loss": 3.2208, | |
| "step": 85150 | |
| }, | |
| { | |
| "epoch": 24.825174825174827, | |
| "grad_norm": 0.4425372779369354, | |
| "learning_rate": 0.0003022775510204081, | |
| "loss": 3.212, | |
| "step": 85200 | |
| }, | |
| { | |
| "epoch": 24.83974358974359, | |
| "grad_norm": 0.4405309855937958, | |
| "learning_rate": 0.00030210262390670554, | |
| "loss": 3.1984, | |
| "step": 85250 | |
| }, | |
| { | |
| "epoch": 24.854312354312356, | |
| "grad_norm": 0.40886348485946655, | |
| "learning_rate": 0.0003019276967930029, | |
| "loss": 3.2036, | |
| "step": 85300 | |
| }, | |
| { | |
| "epoch": 24.86888111888112, | |
| "grad_norm": 0.43906837701797485, | |
| "learning_rate": 0.0003017527696793003, | |
| "loss": 3.2168, | |
| "step": 85350 | |
| }, | |
| { | |
| "epoch": 24.883449883449885, | |
| "grad_norm": 0.44705092906951904, | |
| "learning_rate": 0.00030157784256559767, | |
| "loss": 3.2108, | |
| "step": 85400 | |
| }, | |
| { | |
| "epoch": 24.89801864801865, | |
| "grad_norm": 0.42035573720932007, | |
| "learning_rate": 0.000301402915451895, | |
| "loss": 3.2158, | |
| "step": 85450 | |
| }, | |
| { | |
| "epoch": 24.912587412587413, | |
| "grad_norm": 0.4399532377719879, | |
| "learning_rate": 0.00030122798833819237, | |
| "loss": 3.2247, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 24.927156177156178, | |
| "grad_norm": 0.42403891682624817, | |
| "learning_rate": 0.00030105306122448974, | |
| "loss": 3.2092, | |
| "step": 85550 | |
| }, | |
| { | |
| "epoch": 24.941724941724942, | |
| "grad_norm": 0.40814968943595886, | |
| "learning_rate": 0.0003008781341107871, | |
| "loss": 3.2136, | |
| "step": 85600 | |
| }, | |
| { | |
| "epoch": 24.956293706293707, | |
| "grad_norm": 0.4622436761856079, | |
| "learning_rate": 0.0003007032069970845, | |
| "loss": 3.2217, | |
| "step": 85650 | |
| }, | |
| { | |
| "epoch": 24.97086247086247, | |
| "grad_norm": 0.44937148690223694, | |
| "learning_rate": 0.0003005282798833819, | |
| "loss": 3.2059, | |
| "step": 85700 | |
| }, | |
| { | |
| "epoch": 24.985431235431236, | |
| "grad_norm": 0.43933066725730896, | |
| "learning_rate": 0.0003003533527696793, | |
| "loss": 3.2154, | |
| "step": 85750 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 0.39813855290412903, | |
| "learning_rate": 0.0003001784256559767, | |
| "loss": 3.2131, | |
| "step": 85800 | |
| }, | |
| { | |
| "epoch": 25.014568764568764, | |
| "grad_norm": 0.4538300633430481, | |
| "learning_rate": 0.00030000349854227405, | |
| "loss": 3.116, | |
| "step": 85850 | |
| }, | |
| { | |
| "epoch": 25.02913752913753, | |
| "grad_norm": 0.41572535037994385, | |
| "learning_rate": 0.00029982857142857143, | |
| "loss": 3.1081, | |
| "step": 85900 | |
| }, | |
| { | |
| "epoch": 25.043706293706293, | |
| "grad_norm": 0.4343355596065521, | |
| "learning_rate": 0.0002996536443148688, | |
| "loss": 3.1264, | |
| "step": 85950 | |
| }, | |
| { | |
| "epoch": 25.058275058275058, | |
| "grad_norm": 0.47448500990867615, | |
| "learning_rate": 0.0002994787172011661, | |
| "loss": 3.1336, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 25.058275058275058, | |
| "eval_accuracy": 0.37419223895592485, | |
| "eval_loss": 3.5482711791992188, | |
| "eval_runtime": 188.7946, | |
| "eval_samples_per_second": 88.149, | |
| "eval_steps_per_second": 5.514, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 25.072843822843822, | |
| "grad_norm": 0.4407516419887543, | |
| "learning_rate": 0.0002993037900874635, | |
| "loss": 3.1332, | |
| "step": 86050 | |
| }, | |
| { | |
| "epoch": 25.087412587412587, | |
| "grad_norm": 0.42840850353240967, | |
| "learning_rate": 0.00029912886297376093, | |
| "loss": 3.1299, | |
| "step": 86100 | |
| }, | |
| { | |
| "epoch": 25.10198135198135, | |
| "grad_norm": 0.4274645745754242, | |
| "learning_rate": 0.0002989539358600583, | |
| "loss": 3.1388, | |
| "step": 86150 | |
| }, | |
| { | |
| "epoch": 25.116550116550115, | |
| "grad_norm": 0.4566427767276764, | |
| "learning_rate": 0.0002987790087463557, | |
| "loss": 3.1259, | |
| "step": 86200 | |
| }, | |
| { | |
| "epoch": 25.13111888111888, | |
| "grad_norm": 0.40977421402931213, | |
| "learning_rate": 0.000298604081632653, | |
| "loss": 3.1378, | |
| "step": 86250 | |
| }, | |
| { | |
| "epoch": 25.145687645687644, | |
| "grad_norm": 0.43461692333221436, | |
| "learning_rate": 0.00029842915451895044, | |
| "loss": 3.1386, | |
| "step": 86300 | |
| }, | |
| { | |
| "epoch": 25.16025641025641, | |
| "grad_norm": 0.42161956429481506, | |
| "learning_rate": 0.0002982542274052478, | |
| "loss": 3.1499, | |
| "step": 86350 | |
| }, | |
| { | |
| "epoch": 25.174825174825173, | |
| "grad_norm": 0.42874354124069214, | |
| "learning_rate": 0.0002980793002915452, | |
| "loss": 3.1479, | |
| "step": 86400 | |
| }, | |
| { | |
| "epoch": 25.189393939393938, | |
| "grad_norm": 0.4363137185573578, | |
| "learning_rate": 0.0002979043731778425, | |
| "loss": 3.1367, | |
| "step": 86450 | |
| }, | |
| { | |
| "epoch": 25.203962703962706, | |
| "grad_norm": 0.43857312202453613, | |
| "learning_rate": 0.0002977294460641399, | |
| "loss": 3.1444, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 25.21853146853147, | |
| "grad_norm": 0.4245397448539734, | |
| "learning_rate": 0.0002975545189504373, | |
| "loss": 3.1602, | |
| "step": 86550 | |
| }, | |
| { | |
| "epoch": 25.233100233100235, | |
| "grad_norm": 0.42843955755233765, | |
| "learning_rate": 0.0002973795918367347, | |
| "loss": 3.1556, | |
| "step": 86600 | |
| }, | |
| { | |
| "epoch": 25.247668997669, | |
| "grad_norm": 0.41731715202331543, | |
| "learning_rate": 0.00029720466472303207, | |
| "loss": 3.1485, | |
| "step": 86650 | |
| }, | |
| { | |
| "epoch": 25.262237762237763, | |
| "grad_norm": 0.4179287254810333, | |
| "learning_rate": 0.0002970297376093294, | |
| "loss": 3.1711, | |
| "step": 86700 | |
| }, | |
| { | |
| "epoch": 25.276806526806528, | |
| "grad_norm": 0.4194510579109192, | |
| "learning_rate": 0.0002968548104956268, | |
| "loss": 3.1617, | |
| "step": 86750 | |
| }, | |
| { | |
| "epoch": 25.291375291375292, | |
| "grad_norm": 0.420640766620636, | |
| "learning_rate": 0.0002966798833819242, | |
| "loss": 3.1645, | |
| "step": 86800 | |
| }, | |
| { | |
| "epoch": 25.305944055944057, | |
| "grad_norm": 0.44683873653411865, | |
| "learning_rate": 0.00029650495626822157, | |
| "loss": 3.1687, | |
| "step": 86850 | |
| }, | |
| { | |
| "epoch": 25.32051282051282, | |
| "grad_norm": 0.44092994928359985, | |
| "learning_rate": 0.0002963300291545189, | |
| "loss": 3.1555, | |
| "step": 86900 | |
| }, | |
| { | |
| "epoch": 25.335081585081586, | |
| "grad_norm": 0.46043768525123596, | |
| "learning_rate": 0.0002961551020408163, | |
| "loss": 3.1795, | |
| "step": 86950 | |
| }, | |
| { | |
| "epoch": 25.34965034965035, | |
| "grad_norm": 0.43037664890289307, | |
| "learning_rate": 0.0002959801749271137, | |
| "loss": 3.1568, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 25.34965034965035, | |
| "eval_accuracy": 0.37438896847177267, | |
| "eval_loss": 3.5446794033050537, | |
| "eval_runtime": 180.0732, | |
| "eval_samples_per_second": 92.418, | |
| "eval_steps_per_second": 5.781, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 25.364219114219114, | |
| "grad_norm": 0.46643906831741333, | |
| "learning_rate": 0.00029580524781341107, | |
| "loss": 3.1654, | |
| "step": 87050 | |
| }, | |
| { | |
| "epoch": 25.37878787878788, | |
| "grad_norm": 0.46620315313339233, | |
| "learning_rate": 0.00029563032069970845, | |
| "loss": 3.1712, | |
| "step": 87100 | |
| }, | |
| { | |
| "epoch": 25.393356643356643, | |
| "grad_norm": 0.4388134777545929, | |
| "learning_rate": 0.00029545539358600577, | |
| "loss": 3.1756, | |
| "step": 87150 | |
| }, | |
| { | |
| "epoch": 25.407925407925408, | |
| "grad_norm": 0.4645821154117584, | |
| "learning_rate": 0.0002952804664723032, | |
| "loss": 3.1865, | |
| "step": 87200 | |
| }, | |
| { | |
| "epoch": 25.422494172494172, | |
| "grad_norm": 0.4313008487224579, | |
| "learning_rate": 0.0002951055393586006, | |
| "loss": 3.1656, | |
| "step": 87250 | |
| }, | |
| { | |
| "epoch": 25.437062937062937, | |
| "grad_norm": 0.4366571307182312, | |
| "learning_rate": 0.00029493061224489795, | |
| "loss": 3.1726, | |
| "step": 87300 | |
| }, | |
| { | |
| "epoch": 25.4516317016317, | |
| "grad_norm": 0.4275836944580078, | |
| "learning_rate": 0.00029475568513119527, | |
| "loss": 3.1824, | |
| "step": 87350 | |
| }, | |
| { | |
| "epoch": 25.466200466200466, | |
| "grad_norm": 0.41847360134124756, | |
| "learning_rate": 0.0002945807580174927, | |
| "loss": 3.165, | |
| "step": 87400 | |
| }, | |
| { | |
| "epoch": 25.48076923076923, | |
| "grad_norm": 0.4382728636264801, | |
| "learning_rate": 0.0002944058309037901, | |
| "loss": 3.1943, | |
| "step": 87450 | |
| }, | |
| { | |
| "epoch": 25.495337995337994, | |
| "grad_norm": 0.45048829913139343, | |
| "learning_rate": 0.00029423090379008745, | |
| "loss": 3.177, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 25.50990675990676, | |
| "grad_norm": 0.44650429487228394, | |
| "learning_rate": 0.00029405597667638483, | |
| "loss": 3.1752, | |
| "step": 87550 | |
| }, | |
| { | |
| "epoch": 25.524475524475523, | |
| "grad_norm": 0.4350736439228058, | |
| "learning_rate": 0.0002938810495626822, | |
| "loss": 3.176, | |
| "step": 87600 | |
| }, | |
| { | |
| "epoch": 25.539044289044288, | |
| "grad_norm": 0.43065324425697327, | |
| "learning_rate": 0.0002937061224489796, | |
| "loss": 3.1841, | |
| "step": 87650 | |
| }, | |
| { | |
| "epoch": 25.553613053613052, | |
| "grad_norm": 0.494675874710083, | |
| "learning_rate": 0.00029353119533527696, | |
| "loss": 3.1645, | |
| "step": 87700 | |
| }, | |
| { | |
| "epoch": 25.568181818181817, | |
| "grad_norm": 0.4301419258117676, | |
| "learning_rate": 0.00029335626822157433, | |
| "loss": 3.1787, | |
| "step": 87750 | |
| }, | |
| { | |
| "epoch": 25.582750582750585, | |
| "grad_norm": 0.4418124854564667, | |
| "learning_rate": 0.00029318134110787166, | |
| "loss": 3.1955, | |
| "step": 87800 | |
| }, | |
| { | |
| "epoch": 25.59731934731935, | |
| "grad_norm": 0.4510349929332733, | |
| "learning_rate": 0.0002930064139941691, | |
| "loss": 3.2034, | |
| "step": 87850 | |
| }, | |
| { | |
| "epoch": 25.611888111888113, | |
| "grad_norm": 0.44014590978622437, | |
| "learning_rate": 0.00029283148688046646, | |
| "loss": 3.1952, | |
| "step": 87900 | |
| }, | |
| { | |
| "epoch": 25.626456876456878, | |
| "grad_norm": 0.4308202862739563, | |
| "learning_rate": 0.00029265655976676384, | |
| "loss": 3.196, | |
| "step": 87950 | |
| }, | |
| { | |
| "epoch": 25.641025641025642, | |
| "grad_norm": 0.4513147175312042, | |
| "learning_rate": 0.0002924816326530612, | |
| "loss": 3.1885, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 25.641025641025642, | |
| "eval_accuracy": 0.37464590451010354, | |
| "eval_loss": 3.537010908126831, | |
| "eval_runtime": 180.0096, | |
| "eval_samples_per_second": 92.451, | |
| "eval_steps_per_second": 5.783, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 25.655594405594407, | |
| "grad_norm": 0.41755375266075134, | |
| "learning_rate": 0.0002923067055393586, | |
| "loss": 3.1938, | |
| "step": 88050 | |
| }, | |
| { | |
| "epoch": 25.67016317016317, | |
| "grad_norm": 0.43336576223373413, | |
| "learning_rate": 0.00029213177842565596, | |
| "loss": 3.1946, | |
| "step": 88100 | |
| }, | |
| { | |
| "epoch": 25.684731934731936, | |
| "grad_norm": 0.4581342041492462, | |
| "learning_rate": 0.00029195685131195334, | |
| "loss": 3.1944, | |
| "step": 88150 | |
| }, | |
| { | |
| "epoch": 25.6993006993007, | |
| "grad_norm": 0.46270158886909485, | |
| "learning_rate": 0.0002917819241982507, | |
| "loss": 3.186, | |
| "step": 88200 | |
| }, | |
| { | |
| "epoch": 25.713869463869464, | |
| "grad_norm": 0.4232158958911896, | |
| "learning_rate": 0.0002916069970845481, | |
| "loss": 3.1969, | |
| "step": 88250 | |
| }, | |
| { | |
| "epoch": 25.72843822843823, | |
| "grad_norm": 0.4254876673221588, | |
| "learning_rate": 0.00029143206997084547, | |
| "loss": 3.2034, | |
| "step": 88300 | |
| }, | |
| { | |
| "epoch": 25.743006993006993, | |
| "grad_norm": 0.4381236433982849, | |
| "learning_rate": 0.00029125714285714284, | |
| "loss": 3.1831, | |
| "step": 88350 | |
| }, | |
| { | |
| "epoch": 25.757575757575758, | |
| "grad_norm": 0.4526331424713135, | |
| "learning_rate": 0.0002910822157434402, | |
| "loss": 3.1882, | |
| "step": 88400 | |
| }, | |
| { | |
| "epoch": 25.772144522144522, | |
| "grad_norm": 0.43877992033958435, | |
| "learning_rate": 0.0002909072886297376, | |
| "loss": 3.1952, | |
| "step": 88450 | |
| }, | |
| { | |
| "epoch": 25.786713286713287, | |
| "grad_norm": 0.42094627022743225, | |
| "learning_rate": 0.00029073236151603497, | |
| "loss": 3.1945, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 25.80128205128205, | |
| "grad_norm": 0.4453929662704468, | |
| "learning_rate": 0.00029055743440233235, | |
| "loss": 3.1953, | |
| "step": 88550 | |
| }, | |
| { | |
| "epoch": 25.815850815850816, | |
| "grad_norm": 0.4521614909172058, | |
| "learning_rate": 0.0002903825072886297, | |
| "loss": 3.2105, | |
| "step": 88600 | |
| }, | |
| { | |
| "epoch": 25.83041958041958, | |
| "grad_norm": 0.4133826494216919, | |
| "learning_rate": 0.0002902075801749271, | |
| "loss": 3.194, | |
| "step": 88650 | |
| }, | |
| { | |
| "epoch": 25.844988344988344, | |
| "grad_norm": 0.41386139392852783, | |
| "learning_rate": 0.0002900326530612245, | |
| "loss": 3.2099, | |
| "step": 88700 | |
| }, | |
| { | |
| "epoch": 25.85955710955711, | |
| "grad_norm": 0.4450267255306244, | |
| "learning_rate": 0.00028985772594752185, | |
| "loss": 3.1998, | |
| "step": 88750 | |
| }, | |
| { | |
| "epoch": 25.874125874125873, | |
| "grad_norm": 0.4442254602909088, | |
| "learning_rate": 0.0002896827988338192, | |
| "loss": 3.2076, | |
| "step": 88800 | |
| }, | |
| { | |
| "epoch": 25.888694638694638, | |
| "grad_norm": 0.45003944635391235, | |
| "learning_rate": 0.0002895078717201166, | |
| "loss": 3.2119, | |
| "step": 88850 | |
| }, | |
| { | |
| "epoch": 25.903263403263402, | |
| "grad_norm": 0.4601682722568512, | |
| "learning_rate": 0.000289332944606414, | |
| "loss": 3.2007, | |
| "step": 88900 | |
| }, | |
| { | |
| "epoch": 25.917832167832167, | |
| "grad_norm": 0.41649630665779114, | |
| "learning_rate": 0.00028915801749271135, | |
| "loss": 3.2134, | |
| "step": 88950 | |
| }, | |
| { | |
| "epoch": 25.93240093240093, | |
| "grad_norm": 0.4264170527458191, | |
| "learning_rate": 0.00028898309037900873, | |
| "loss": 3.2063, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 25.93240093240093, | |
| "eval_accuracy": 0.37524891046184755, | |
| "eval_loss": 3.5297417640686035, | |
| "eval_runtime": 180.0669, | |
| "eval_samples_per_second": 92.421, | |
| "eval_steps_per_second": 5.781, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 25.946969696969695, | |
| "grad_norm": 0.4318842589855194, | |
| "learning_rate": 0.0002888081632653061, | |
| "loss": 3.2068, | |
| "step": 89050 | |
| }, | |
| { | |
| "epoch": 25.96153846153846, | |
| "grad_norm": 0.40909937024116516, | |
| "learning_rate": 0.0002886332361516035, | |
| "loss": 3.1974, | |
| "step": 89100 | |
| }, | |
| { | |
| "epoch": 25.976107226107224, | |
| "grad_norm": 0.45897090435028076, | |
| "learning_rate": 0.00028845830903790086, | |
| "loss": 3.2034, | |
| "step": 89150 | |
| }, | |
| { | |
| "epoch": 25.990675990675992, | |
| "grad_norm": 0.4648137390613556, | |
| "learning_rate": 0.00028828338192419823, | |
| "loss": 3.2068, | |
| "step": 89200 | |
| }, | |
| { | |
| "epoch": 26.005244755244757, | |
| "grad_norm": 0.4189426600933075, | |
| "learning_rate": 0.0002881084548104956, | |
| "loss": 3.1742, | |
| "step": 89250 | |
| }, | |
| { | |
| "epoch": 26.01981351981352, | |
| "grad_norm": 0.42856845259666443, | |
| "learning_rate": 0.000287933527696793, | |
| "loss": 3.1066, | |
| "step": 89300 | |
| }, | |
| { | |
| "epoch": 26.034382284382286, | |
| "grad_norm": 0.4568479061126709, | |
| "learning_rate": 0.00028775860058309036, | |
| "loss": 3.1162, | |
| "step": 89350 | |
| }, | |
| { | |
| "epoch": 26.04895104895105, | |
| "grad_norm": 0.4555512070655823, | |
| "learning_rate": 0.00028758367346938773, | |
| "loss": 3.1287, | |
| "step": 89400 | |
| }, | |
| { | |
| "epoch": 26.063519813519815, | |
| "grad_norm": 0.45386287569999695, | |
| "learning_rate": 0.0002874087463556851, | |
| "loss": 3.1103, | |
| "step": 89450 | |
| }, | |
| { | |
| "epoch": 26.07808857808858, | |
| "grad_norm": 0.45175591111183167, | |
| "learning_rate": 0.0002872338192419825, | |
| "loss": 3.1168, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 26.092657342657343, | |
| "grad_norm": 0.4288419783115387, | |
| "learning_rate": 0.00028705889212827986, | |
| "loss": 3.1154, | |
| "step": 89550 | |
| }, | |
| { | |
| "epoch": 26.107226107226108, | |
| "grad_norm": 0.4286247193813324, | |
| "learning_rate": 0.00028688396501457724, | |
| "loss": 3.118, | |
| "step": 89600 | |
| }, | |
| { | |
| "epoch": 26.121794871794872, | |
| "grad_norm": 0.44961321353912354, | |
| "learning_rate": 0.0002867090379008746, | |
| "loss": 3.1236, | |
| "step": 89650 | |
| }, | |
| { | |
| "epoch": 26.136363636363637, | |
| "grad_norm": 0.42312365770339966, | |
| "learning_rate": 0.000286534110787172, | |
| "loss": 3.1243, | |
| "step": 89700 | |
| }, | |
| { | |
| "epoch": 26.1509324009324, | |
| "grad_norm": 0.43461480736732483, | |
| "learning_rate": 0.00028635918367346937, | |
| "loss": 3.1339, | |
| "step": 89750 | |
| }, | |
| { | |
| "epoch": 26.165501165501166, | |
| "grad_norm": 0.4479959309101105, | |
| "learning_rate": 0.00028618425655976674, | |
| "loss": 3.1364, | |
| "step": 89800 | |
| }, | |
| { | |
| "epoch": 26.18006993006993, | |
| "grad_norm": 0.44000333547592163, | |
| "learning_rate": 0.0002860093294460641, | |
| "loss": 3.1317, | |
| "step": 89850 | |
| }, | |
| { | |
| "epoch": 26.194638694638694, | |
| "grad_norm": 0.42374539375305176, | |
| "learning_rate": 0.0002858344023323615, | |
| "loss": 3.1386, | |
| "step": 89900 | |
| }, | |
| { | |
| "epoch": 26.20920745920746, | |
| "grad_norm": 0.49846577644348145, | |
| "learning_rate": 0.00028565947521865887, | |
| "loss": 3.1444, | |
| "step": 89950 | |
| }, | |
| { | |
| "epoch": 26.223776223776223, | |
| "grad_norm": 0.42031097412109375, | |
| "learning_rate": 0.00028548454810495624, | |
| "loss": 3.1457, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 26.223776223776223, | |
| "eval_accuracy": 0.3741142762129439, | |
| "eval_loss": 3.549098491668701, | |
| "eval_runtime": 180.1606, | |
| "eval_samples_per_second": 92.373, | |
| "eval_steps_per_second": 5.778, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 26.238344988344988, | |
| "grad_norm": 0.45871150493621826, | |
| "learning_rate": 0.0002853096209912536, | |
| "loss": 3.1347, | |
| "step": 90050 | |
| }, | |
| { | |
| "epoch": 26.252913752913752, | |
| "grad_norm": 0.4206872284412384, | |
| "learning_rate": 0.000285134693877551, | |
| "loss": 3.1534, | |
| "step": 90100 | |
| }, | |
| { | |
| "epoch": 26.267482517482517, | |
| "grad_norm": 0.4632352888584137, | |
| "learning_rate": 0.00028495976676384837, | |
| "loss": 3.1387, | |
| "step": 90150 | |
| }, | |
| { | |
| "epoch": 26.28205128205128, | |
| "grad_norm": 0.45894187688827515, | |
| "learning_rate": 0.00028478483965014575, | |
| "loss": 3.1571, | |
| "step": 90200 | |
| }, | |
| { | |
| "epoch": 26.296620046620045, | |
| "grad_norm": 0.4624476730823517, | |
| "learning_rate": 0.0002846099125364431, | |
| "loss": 3.1518, | |
| "step": 90250 | |
| }, | |
| { | |
| "epoch": 26.31118881118881, | |
| "grad_norm": 0.42303577065467834, | |
| "learning_rate": 0.0002844349854227405, | |
| "loss": 3.1664, | |
| "step": 90300 | |
| }, | |
| { | |
| "epoch": 26.325757575757574, | |
| "grad_norm": 0.45280689001083374, | |
| "learning_rate": 0.0002842600583090379, | |
| "loss": 3.1568, | |
| "step": 90350 | |
| }, | |
| { | |
| "epoch": 26.34032634032634, | |
| "grad_norm": 0.46011117100715637, | |
| "learning_rate": 0.00028408513119533525, | |
| "loss": 3.1678, | |
| "step": 90400 | |
| }, | |
| { | |
| "epoch": 26.354895104895103, | |
| "grad_norm": 0.41634801030158997, | |
| "learning_rate": 0.0002839102040816326, | |
| "loss": 3.1702, | |
| "step": 90450 | |
| }, | |
| { | |
| "epoch": 26.36946386946387, | |
| "grad_norm": 0.42648565769195557, | |
| "learning_rate": 0.00028373527696793, | |
| "loss": 3.1612, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 26.384032634032636, | |
| "grad_norm": 0.45400530099868774, | |
| "learning_rate": 0.0002835603498542274, | |
| "loss": 3.1615, | |
| "step": 90550 | |
| }, | |
| { | |
| "epoch": 26.3986013986014, | |
| "grad_norm": 0.4465737044811249, | |
| "learning_rate": 0.00028338542274052475, | |
| "loss": 3.1665, | |
| "step": 90600 | |
| }, | |
| { | |
| "epoch": 26.413170163170165, | |
| "grad_norm": 0.43689030408859253, | |
| "learning_rate": 0.00028321049562682213, | |
| "loss": 3.1611, | |
| "step": 90650 | |
| }, | |
| { | |
| "epoch": 26.42773892773893, | |
| "grad_norm": 0.45843949913978577, | |
| "learning_rate": 0.0002830355685131195, | |
| "loss": 3.1735, | |
| "step": 90700 | |
| }, | |
| { | |
| "epoch": 26.442307692307693, | |
| "grad_norm": 0.4294235408306122, | |
| "learning_rate": 0.0002828606413994169, | |
| "loss": 3.1689, | |
| "step": 90750 | |
| }, | |
| { | |
| "epoch": 26.456876456876458, | |
| "grad_norm": 0.43327438831329346, | |
| "learning_rate": 0.00028268571428571426, | |
| "loss": 3.1698, | |
| "step": 90800 | |
| }, | |
| { | |
| "epoch": 26.471445221445222, | |
| "grad_norm": 0.42804962396621704, | |
| "learning_rate": 0.00028251078717201163, | |
| "loss": 3.1556, | |
| "step": 90850 | |
| }, | |
| { | |
| "epoch": 26.486013986013987, | |
| "grad_norm": 0.43207576870918274, | |
| "learning_rate": 0.000282335860058309, | |
| "loss": 3.1692, | |
| "step": 90900 | |
| }, | |
| { | |
| "epoch": 26.50058275058275, | |
| "grad_norm": 0.43317896127700806, | |
| "learning_rate": 0.0002821609329446064, | |
| "loss": 3.1605, | |
| "step": 90950 | |
| }, | |
| { | |
| "epoch": 26.515151515151516, | |
| "grad_norm": 0.47736456990242004, | |
| "learning_rate": 0.00028198600583090376, | |
| "loss": 3.1848, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 26.515151515151516, | |
| "eval_accuracy": 0.37442060041424907, | |
| "eval_loss": 3.5440564155578613, | |
| "eval_runtime": 179.9678, | |
| "eval_samples_per_second": 92.472, | |
| "eval_steps_per_second": 5.784, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 26.52972027972028, | |
| "grad_norm": 0.42366132140159607, | |
| "learning_rate": 0.00028181107871720114, | |
| "loss": 3.1739, | |
| "step": 91050 | |
| }, | |
| { | |
| "epoch": 26.544289044289044, | |
| "grad_norm": 0.4395405054092407, | |
| "learning_rate": 0.0002816361516034985, | |
| "loss": 3.1667, | |
| "step": 91100 | |
| }, | |
| { | |
| "epoch": 26.55885780885781, | |
| "grad_norm": 0.41824859380722046, | |
| "learning_rate": 0.0002814612244897959, | |
| "loss": 3.1733, | |
| "step": 91150 | |
| }, | |
| { | |
| "epoch": 26.573426573426573, | |
| "grad_norm": 0.4543924331665039, | |
| "learning_rate": 0.00028128629737609326, | |
| "loss": 3.179, | |
| "step": 91200 | |
| }, | |
| { | |
| "epoch": 26.587995337995338, | |
| "grad_norm": 0.44957053661346436, | |
| "learning_rate": 0.00028111137026239064, | |
| "loss": 3.1781, | |
| "step": 91250 | |
| }, | |
| { | |
| "epoch": 26.602564102564102, | |
| "grad_norm": 0.4380970895290375, | |
| "learning_rate": 0.000280936443148688, | |
| "loss": 3.1863, | |
| "step": 91300 | |
| }, | |
| { | |
| "epoch": 26.617132867132867, | |
| "grad_norm": 0.4364950656890869, | |
| "learning_rate": 0.0002807615160349854, | |
| "loss": 3.1805, | |
| "step": 91350 | |
| }, | |
| { | |
| "epoch": 26.63170163170163, | |
| "grad_norm": 0.46392178535461426, | |
| "learning_rate": 0.00028058658892128277, | |
| "loss": 3.1899, | |
| "step": 91400 | |
| }, | |
| { | |
| "epoch": 26.646270396270396, | |
| "grad_norm": 0.44661247730255127, | |
| "learning_rate": 0.00028041166180758014, | |
| "loss": 3.1845, | |
| "step": 91450 | |
| }, | |
| { | |
| "epoch": 26.66083916083916, | |
| "grad_norm": 0.4507148563861847, | |
| "learning_rate": 0.0002802367346938775, | |
| "loss": 3.1823, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 26.675407925407924, | |
| "grad_norm": 0.4523918926715851, | |
| "learning_rate": 0.0002800618075801749, | |
| "loss": 3.1867, | |
| "step": 91550 | |
| }, | |
| { | |
| "epoch": 26.68997668997669, | |
| "grad_norm": 0.4648110270500183, | |
| "learning_rate": 0.00027988688046647227, | |
| "loss": 3.1882, | |
| "step": 91600 | |
| }, | |
| { | |
| "epoch": 26.704545454545453, | |
| "grad_norm": 0.4386864900588989, | |
| "learning_rate": 0.00027971195335276965, | |
| "loss": 3.1715, | |
| "step": 91650 | |
| }, | |
| { | |
| "epoch": 26.719114219114218, | |
| "grad_norm": 0.43069586157798767, | |
| "learning_rate": 0.000279537026239067, | |
| "loss": 3.1736, | |
| "step": 91700 | |
| }, | |
| { | |
| "epoch": 26.733682983682982, | |
| "grad_norm": 0.4628106355667114, | |
| "learning_rate": 0.0002793620991253644, | |
| "loss": 3.1872, | |
| "step": 91750 | |
| }, | |
| { | |
| "epoch": 26.748251748251747, | |
| "grad_norm": 0.43363499641418457, | |
| "learning_rate": 0.00027918717201166177, | |
| "loss": 3.1906, | |
| "step": 91800 | |
| }, | |
| { | |
| "epoch": 26.76282051282051, | |
| "grad_norm": 0.4345760643482208, | |
| "learning_rate": 0.00027901224489795915, | |
| "loss": 3.1949, | |
| "step": 91850 | |
| }, | |
| { | |
| "epoch": 26.77738927738928, | |
| "grad_norm": 0.4561176896095276, | |
| "learning_rate": 0.0002788373177842565, | |
| "loss": 3.1857, | |
| "step": 91900 | |
| }, | |
| { | |
| "epoch": 26.791958041958043, | |
| "grad_norm": 0.44225165247917175, | |
| "learning_rate": 0.0002786623906705539, | |
| "loss": 3.1934, | |
| "step": 91950 | |
| }, | |
| { | |
| "epoch": 26.806526806526808, | |
| "grad_norm": 0.43117964267730713, | |
| "learning_rate": 0.0002784874635568513, | |
| "loss": 3.1945, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 26.806526806526808, | |
| "eval_accuracy": 0.37495940175412645, | |
| "eval_loss": 3.5328190326690674, | |
| "eval_runtime": 180.0693, | |
| "eval_samples_per_second": 92.42, | |
| "eval_steps_per_second": 5.781, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 26.821095571095572, | |
| "grad_norm": 0.4259069859981537, | |
| "learning_rate": 0.00027831253644314865, | |
| "loss": 3.192, | |
| "step": 92050 | |
| }, | |
| { | |
| "epoch": 26.835664335664337, | |
| "grad_norm": 0.41003715991973877, | |
| "learning_rate": 0.00027813760932944603, | |
| "loss": 3.1884, | |
| "step": 92100 | |
| }, | |
| { | |
| "epoch": 26.8502331002331, | |
| "grad_norm": 0.42846834659576416, | |
| "learning_rate": 0.0002779626822157434, | |
| "loss": 3.1897, | |
| "step": 92150 | |
| }, | |
| { | |
| "epoch": 26.864801864801866, | |
| "grad_norm": 0.4506886899471283, | |
| "learning_rate": 0.0002777877551020408, | |
| "loss": 3.1907, | |
| "step": 92200 | |
| }, | |
| { | |
| "epoch": 26.87937062937063, | |
| "grad_norm": 0.45437026023864746, | |
| "learning_rate": 0.0002776128279883382, | |
| "loss": 3.1979, | |
| "step": 92250 | |
| }, | |
| { | |
| "epoch": 26.893939393939394, | |
| "grad_norm": 0.43997761607170105, | |
| "learning_rate": 0.00027743790087463553, | |
| "loss": 3.1893, | |
| "step": 92300 | |
| }, | |
| { | |
| "epoch": 26.90850815850816, | |
| "grad_norm": 0.4390334188938141, | |
| "learning_rate": 0.0002772629737609329, | |
| "loss": 3.1929, | |
| "step": 92350 | |
| }, | |
| { | |
| "epoch": 26.923076923076923, | |
| "grad_norm": 0.41957342624664307, | |
| "learning_rate": 0.0002770880466472303, | |
| "loss": 3.1948, | |
| "step": 92400 | |
| }, | |
| { | |
| "epoch": 26.937645687645688, | |
| "grad_norm": 0.43660280108451843, | |
| "learning_rate": 0.00027691311953352766, | |
| "loss": 3.1896, | |
| "step": 92450 | |
| }, | |
| { | |
| "epoch": 26.952214452214452, | |
| "grad_norm": 0.42752009630203247, | |
| "learning_rate": 0.00027673819241982503, | |
| "loss": 3.2083, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 26.966783216783217, | |
| "grad_norm": 0.4077240526676178, | |
| "learning_rate": 0.0002765632653061224, | |
| "loss": 3.1988, | |
| "step": 92550 | |
| }, | |
| { | |
| "epoch": 26.98135198135198, | |
| "grad_norm": 0.44178691506385803, | |
| "learning_rate": 0.0002763883381924198, | |
| "loss": 3.1943, | |
| "step": 92600 | |
| }, | |
| { | |
| "epoch": 26.995920745920746, | |
| "grad_norm": 0.4497985541820526, | |
| "learning_rate": 0.00027621341107871716, | |
| "loss": 3.1956, | |
| "step": 92650 | |
| }, | |
| { | |
| "epoch": 27.01048951048951, | |
| "grad_norm": 0.4393564760684967, | |
| "learning_rate": 0.0002760384839650146, | |
| "loss": 3.1167, | |
| "step": 92700 | |
| }, | |
| { | |
| "epoch": 27.025058275058274, | |
| "grad_norm": 0.42496976256370544, | |
| "learning_rate": 0.0002758635568513119, | |
| "loss": 3.1006, | |
| "step": 92750 | |
| }, | |
| { | |
| "epoch": 27.03962703962704, | |
| "grad_norm": 0.4464763104915619, | |
| "learning_rate": 0.0002756886297376093, | |
| "loss": 3.1044, | |
| "step": 92800 | |
| }, | |
| { | |
| "epoch": 27.054195804195803, | |
| "grad_norm": 0.4169905185699463, | |
| "learning_rate": 0.00027551370262390666, | |
| "loss": 3.1068, | |
| "step": 92850 | |
| }, | |
| { | |
| "epoch": 27.068764568764568, | |
| "grad_norm": 0.441429078578949, | |
| "learning_rate": 0.0002753387755102041, | |
| "loss": 3.1149, | |
| "step": 92900 | |
| }, | |
| { | |
| "epoch": 27.083333333333332, | |
| "grad_norm": 0.41629430651664734, | |
| "learning_rate": 0.0002751638483965014, | |
| "loss": 3.1169, | |
| "step": 92950 | |
| }, | |
| { | |
| "epoch": 27.097902097902097, | |
| "grad_norm": 0.4784495234489441, | |
| "learning_rate": 0.0002749889212827988, | |
| "loss": 3.1137, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 27.097902097902097, | |
| "eval_accuracy": 0.37444059086116727, | |
| "eval_loss": 3.551065683364868, | |
| "eval_runtime": 180.0346, | |
| "eval_samples_per_second": 92.438, | |
| "eval_steps_per_second": 5.782, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 27.11247086247086, | |
| "grad_norm": 0.43689513206481934, | |
| "learning_rate": 0.00027481399416909617, | |
| "loss": 3.1218, | |
| "step": 93050 | |
| }, | |
| { | |
| "epoch": 27.127039627039625, | |
| "grad_norm": 0.4687103033065796, | |
| "learning_rate": 0.00027463906705539354, | |
| "loss": 3.1118, | |
| "step": 93100 | |
| }, | |
| { | |
| "epoch": 27.14160839160839, | |
| "grad_norm": 0.42937007546424866, | |
| "learning_rate": 0.000274464139941691, | |
| "loss": 3.1084, | |
| "step": 93150 | |
| }, | |
| { | |
| "epoch": 27.156177156177158, | |
| "grad_norm": 0.4537714719772339, | |
| "learning_rate": 0.0002742892128279883, | |
| "loss": 3.1285, | |
| "step": 93200 | |
| }, | |
| { | |
| "epoch": 27.170745920745922, | |
| "grad_norm": 0.4483446180820465, | |
| "learning_rate": 0.00027411428571428567, | |
| "loss": 3.1191, | |
| "step": 93250 | |
| }, | |
| { | |
| "epoch": 27.185314685314687, | |
| "grad_norm": 0.4289822280406952, | |
| "learning_rate": 0.00027393935860058305, | |
| "loss": 3.1271, | |
| "step": 93300 | |
| }, | |
| { | |
| "epoch": 27.19988344988345, | |
| "grad_norm": 0.4680139720439911, | |
| "learning_rate": 0.0002737644314868805, | |
| "loss": 3.1336, | |
| "step": 93350 | |
| }, | |
| { | |
| "epoch": 27.214452214452216, | |
| "grad_norm": 0.4301610589027405, | |
| "learning_rate": 0.0002735895043731778, | |
| "loss": 3.1374, | |
| "step": 93400 | |
| }, | |
| { | |
| "epoch": 27.22902097902098, | |
| "grad_norm": 0.4446755349636078, | |
| "learning_rate": 0.0002734145772594752, | |
| "loss": 3.1381, | |
| "step": 93450 | |
| }, | |
| { | |
| "epoch": 27.243589743589745, | |
| "grad_norm": 0.4434562623500824, | |
| "learning_rate": 0.00027323965014577255, | |
| "loss": 3.1335, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 27.25815850815851, | |
| "grad_norm": 0.49129512906074524, | |
| "learning_rate": 0.00027306472303207, | |
| "loss": 3.1463, | |
| "step": 93550 | |
| }, | |
| { | |
| "epoch": 27.272727272727273, | |
| "grad_norm": 0.43248531222343445, | |
| "learning_rate": 0.00027288979591836736, | |
| "loss": 3.1415, | |
| "step": 93600 | |
| }, | |
| { | |
| "epoch": 27.287296037296038, | |
| "grad_norm": 0.46865126490592957, | |
| "learning_rate": 0.0002727148688046647, | |
| "loss": 3.1394, | |
| "step": 93650 | |
| }, | |
| { | |
| "epoch": 27.301864801864802, | |
| "grad_norm": 0.46066224575042725, | |
| "learning_rate": 0.00027253994169096205, | |
| "loss": 3.1466, | |
| "step": 93700 | |
| }, | |
| { | |
| "epoch": 27.316433566433567, | |
| "grad_norm": 0.44381311535835266, | |
| "learning_rate": 0.00027236501457725943, | |
| "loss": 3.1549, | |
| "step": 93750 | |
| }, | |
| { | |
| "epoch": 27.33100233100233, | |
| "grad_norm": 0.4424683153629303, | |
| "learning_rate": 0.00027219008746355686, | |
| "loss": 3.1552, | |
| "step": 93800 | |
| }, | |
| { | |
| "epoch": 27.345571095571096, | |
| "grad_norm": 0.4684559404850006, | |
| "learning_rate": 0.0002720151603498542, | |
| "loss": 3.1601, | |
| "step": 93850 | |
| }, | |
| { | |
| "epoch": 27.36013986013986, | |
| "grad_norm": 0.4258364737033844, | |
| "learning_rate": 0.00027184023323615156, | |
| "loss": 3.1472, | |
| "step": 93900 | |
| }, | |
| { | |
| "epoch": 27.374708624708624, | |
| "grad_norm": 0.43364837765693665, | |
| "learning_rate": 0.00027166530612244893, | |
| "loss": 3.1483, | |
| "step": 93950 | |
| }, | |
| { | |
| "epoch": 27.38927738927739, | |
| "grad_norm": 0.5024469494819641, | |
| "learning_rate": 0.00027149037900874636, | |
| "loss": 3.1564, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 27.38927738927739, | |
| "eval_accuracy": 0.3745912247582391, | |
| "eval_loss": 3.5458009243011475, | |
| "eval_runtime": 179.9539, | |
| "eval_samples_per_second": 92.479, | |
| "eval_steps_per_second": 5.785, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 27.403846153846153, | |
| "grad_norm": 0.4278470575809479, | |
| "learning_rate": 0.00027131545189504374, | |
| "loss": 3.1459, | |
| "step": 94050 | |
| }, | |
| { | |
| "epoch": 27.418414918414918, | |
| "grad_norm": 0.43033117055892944, | |
| "learning_rate": 0.00027114052478134106, | |
| "loss": 3.1585, | |
| "step": 94100 | |
| }, | |
| { | |
| "epoch": 27.432983682983682, | |
| "grad_norm": 0.4715587794780731, | |
| "learning_rate": 0.00027096559766763843, | |
| "loss": 3.1575, | |
| "step": 94150 | |
| }, | |
| { | |
| "epoch": 27.447552447552447, | |
| "grad_norm": 0.4445301592350006, | |
| "learning_rate": 0.00027079067055393586, | |
| "loss": 3.1454, | |
| "step": 94200 | |
| }, | |
| { | |
| "epoch": 27.46212121212121, | |
| "grad_norm": 0.45014679431915283, | |
| "learning_rate": 0.00027061574344023324, | |
| "loss": 3.15, | |
| "step": 94250 | |
| }, | |
| { | |
| "epoch": 27.476689976689975, | |
| "grad_norm": 0.47272783517837524, | |
| "learning_rate": 0.00027044081632653056, | |
| "loss": 3.1527, | |
| "step": 94300 | |
| }, | |
| { | |
| "epoch": 27.49125874125874, | |
| "grad_norm": 0.47329598665237427, | |
| "learning_rate": 0.00027026588921282794, | |
| "loss": 3.1613, | |
| "step": 94350 | |
| }, | |
| { | |
| "epoch": 27.505827505827504, | |
| "grad_norm": 0.4669019877910614, | |
| "learning_rate": 0.0002700909620991253, | |
| "loss": 3.1612, | |
| "step": 94400 | |
| }, | |
| { | |
| "epoch": 27.52039627039627, | |
| "grad_norm": 0.43642956018447876, | |
| "learning_rate": 0.00026991603498542274, | |
| "loss": 3.1634, | |
| "step": 94450 | |
| }, | |
| { | |
| "epoch": 27.534965034965033, | |
| "grad_norm": 0.45903342962265015, | |
| "learning_rate": 0.0002697411078717201, | |
| "loss": 3.1715, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 27.5495337995338, | |
| "grad_norm": 0.43123385310173035, | |
| "learning_rate": 0.00026956618075801744, | |
| "loss": 3.1693, | |
| "step": 94550 | |
| }, | |
| { | |
| "epoch": 27.564102564102566, | |
| "grad_norm": 0.43567144870758057, | |
| "learning_rate": 0.0002693912536443148, | |
| "loss": 3.1722, | |
| "step": 94600 | |
| }, | |
| { | |
| "epoch": 27.57867132867133, | |
| "grad_norm": 0.4707465171813965, | |
| "learning_rate": 0.00026921632653061225, | |
| "loss": 3.1703, | |
| "step": 94650 | |
| }, | |
| { | |
| "epoch": 27.593240093240095, | |
| "grad_norm": 0.4384394586086273, | |
| "learning_rate": 0.0002690413994169096, | |
| "loss": 3.1706, | |
| "step": 94700 | |
| }, | |
| { | |
| "epoch": 27.60780885780886, | |
| "grad_norm": 0.4365791976451874, | |
| "learning_rate": 0.00026886647230320694, | |
| "loss": 3.1742, | |
| "step": 94750 | |
| }, | |
| { | |
| "epoch": 27.622377622377623, | |
| "grad_norm": 0.41515985131263733, | |
| "learning_rate": 0.0002686915451895043, | |
| "loss": 3.1535, | |
| "step": 94800 | |
| }, | |
| { | |
| "epoch": 27.636946386946388, | |
| "grad_norm": 0.4355006217956543, | |
| "learning_rate": 0.00026851661807580175, | |
| "loss": 3.1686, | |
| "step": 94850 | |
| }, | |
| { | |
| "epoch": 27.651515151515152, | |
| "grad_norm": 0.47344303131103516, | |
| "learning_rate": 0.0002683416909620991, | |
| "loss": 3.172, | |
| "step": 94900 | |
| }, | |
| { | |
| "epoch": 27.666083916083917, | |
| "grad_norm": 0.434416800737381, | |
| "learning_rate": 0.0002681667638483965, | |
| "loss": 3.1708, | |
| "step": 94950 | |
| }, | |
| { | |
| "epoch": 27.68065268065268, | |
| "grad_norm": 0.4230683445930481, | |
| "learning_rate": 0.0002679918367346938, | |
| "loss": 3.1807, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 27.68065268065268, | |
| "eval_accuracy": 0.37487167896941487, | |
| "eval_loss": 3.5402252674102783, | |
| "eval_runtime": 179.9377, | |
| "eval_samples_per_second": 92.488, | |
| "eval_steps_per_second": 5.785, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 27.695221445221446, | |
| "grad_norm": 0.44491046667099, | |
| "learning_rate": 0.00026781690962099125, | |
| "loss": 3.1754, | |
| "step": 95050 | |
| }, | |
| { | |
| "epoch": 27.70979020979021, | |
| "grad_norm": 0.43584510684013367, | |
| "learning_rate": 0.00026764198250728863, | |
| "loss": 3.1754, | |
| "step": 95100 | |
| }, | |
| { | |
| "epoch": 27.724358974358974, | |
| "grad_norm": 0.45981577038764954, | |
| "learning_rate": 0.000267467055393586, | |
| "loss": 3.1746, | |
| "step": 95150 | |
| }, | |
| { | |
| "epoch": 27.73892773892774, | |
| "grad_norm": 0.46015459299087524, | |
| "learning_rate": 0.0002672921282798833, | |
| "loss": 3.1925, | |
| "step": 95200 | |
| }, | |
| { | |
| "epoch": 27.753496503496503, | |
| "grad_norm": 0.4347175359725952, | |
| "learning_rate": 0.0002671172011661807, | |
| "loss": 3.1811, | |
| "step": 95250 | |
| }, | |
| { | |
| "epoch": 27.768065268065268, | |
| "grad_norm": 0.4558524191379547, | |
| "learning_rate": 0.00026694227405247813, | |
| "loss": 3.1854, | |
| "step": 95300 | |
| }, | |
| { | |
| "epoch": 27.782634032634032, | |
| "grad_norm": 0.46495407819747925, | |
| "learning_rate": 0.0002667673469387755, | |
| "loss": 3.1795, | |
| "step": 95350 | |
| }, | |
| { | |
| "epoch": 27.797202797202797, | |
| "grad_norm": 0.4345493018627167, | |
| "learning_rate": 0.0002665924198250729, | |
| "loss": 3.1858, | |
| "step": 95400 | |
| }, | |
| { | |
| "epoch": 27.81177156177156, | |
| "grad_norm": 0.4338386058807373, | |
| "learning_rate": 0.0002664174927113702, | |
| "loss": 3.1728, | |
| "step": 95450 | |
| }, | |
| { | |
| "epoch": 27.826340326340326, | |
| "grad_norm": 0.46094560623168945, | |
| "learning_rate": 0.00026624256559766764, | |
| "loss": 3.1693, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 27.84090909090909, | |
| "grad_norm": 0.44629111886024475, | |
| "learning_rate": 0.000266067638483965, | |
| "loss": 3.186, | |
| "step": 95550 | |
| }, | |
| { | |
| "epoch": 27.855477855477854, | |
| "grad_norm": 0.4409028887748718, | |
| "learning_rate": 0.0002658927113702624, | |
| "loss": 3.1921, | |
| "step": 95600 | |
| }, | |
| { | |
| "epoch": 27.87004662004662, | |
| "grad_norm": 0.4406682848930359, | |
| "learning_rate": 0.0002657177842565597, | |
| "loss": 3.1829, | |
| "step": 95650 | |
| }, | |
| { | |
| "epoch": 27.884615384615383, | |
| "grad_norm": 0.41936755180358887, | |
| "learning_rate": 0.00026554285714285714, | |
| "loss": 3.1896, | |
| "step": 95700 | |
| }, | |
| { | |
| "epoch": 27.899184149184148, | |
| "grad_norm": 0.4212104082107544, | |
| "learning_rate": 0.0002653679300291545, | |
| "loss": 3.1837, | |
| "step": 95750 | |
| }, | |
| { | |
| "epoch": 27.913752913752912, | |
| "grad_norm": 0.4641505181789398, | |
| "learning_rate": 0.0002651930029154519, | |
| "loss": 3.1885, | |
| "step": 95800 | |
| }, | |
| { | |
| "epoch": 27.92832167832168, | |
| "grad_norm": 0.4994050860404968, | |
| "learning_rate": 0.00026501807580174927, | |
| "loss": 3.1885, | |
| "step": 95850 | |
| }, | |
| { | |
| "epoch": 27.942890442890445, | |
| "grad_norm": 0.45975276827812195, | |
| "learning_rate": 0.0002648431486880466, | |
| "loss": 3.1928, | |
| "step": 95900 | |
| }, | |
| { | |
| "epoch": 27.95745920745921, | |
| "grad_norm": 0.4463437497615814, | |
| "learning_rate": 0.000264668221574344, | |
| "loss": 3.1852, | |
| "step": 95950 | |
| }, | |
| { | |
| "epoch": 27.972027972027973, | |
| "grad_norm": 0.44903019070625305, | |
| "learning_rate": 0.0002644932944606414, | |
| "loss": 3.1905, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 27.972027972027973, | |
| "eval_accuracy": 0.3752865395383994, | |
| "eval_loss": 3.5301098823547363, | |
| "eval_runtime": 180.0722, | |
| "eval_samples_per_second": 92.418, | |
| "eval_steps_per_second": 5.781, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 27.986596736596738, | |
| "grad_norm": 0.4516717195510864, | |
| "learning_rate": 0.00026431836734693877, | |
| "loss": 3.1865, | |
| "step": 96050 | |
| }, | |
| { | |
| "epoch": 28.001165501165502, | |
| "grad_norm": 0.45636415481567383, | |
| "learning_rate": 0.0002641434402332361, | |
| "loss": 3.1982, | |
| "step": 96100 | |
| }, | |
| { | |
| "epoch": 28.015734265734267, | |
| "grad_norm": 0.4726836681365967, | |
| "learning_rate": 0.0002639685131195335, | |
| "loss": 3.0816, | |
| "step": 96150 | |
| }, | |
| { | |
| "epoch": 28.03030303030303, | |
| "grad_norm": 0.4496645927429199, | |
| "learning_rate": 0.0002637935860058309, | |
| "loss": 3.0853, | |
| "step": 96200 | |
| }, | |
| { | |
| "epoch": 28.044871794871796, | |
| "grad_norm": 0.47952306270599365, | |
| "learning_rate": 0.00026361865889212827, | |
| "loss": 3.1046, | |
| "step": 96250 | |
| }, | |
| { | |
| "epoch": 28.05944055944056, | |
| "grad_norm": 0.45253846049308777, | |
| "learning_rate": 0.00026344373177842565, | |
| "loss": 3.0992, | |
| "step": 96300 | |
| }, | |
| { | |
| "epoch": 28.074009324009324, | |
| "grad_norm": 0.4647113084793091, | |
| "learning_rate": 0.000263268804664723, | |
| "loss": 3.0939, | |
| "step": 96350 | |
| }, | |
| { | |
| "epoch": 28.08857808857809, | |
| "grad_norm": 0.46444422006607056, | |
| "learning_rate": 0.0002630938775510204, | |
| "loss": 3.105, | |
| "step": 96400 | |
| }, | |
| { | |
| "epoch": 28.103146853146853, | |
| "grad_norm": 0.45401421189308167, | |
| "learning_rate": 0.0002629189504373178, | |
| "loss": 3.1085, | |
| "step": 96450 | |
| }, | |
| { | |
| "epoch": 28.117715617715618, | |
| "grad_norm": 0.4496326744556427, | |
| "learning_rate": 0.00026274402332361515, | |
| "loss": 3.109, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 28.132284382284382, | |
| "grad_norm": 0.4805924892425537, | |
| "learning_rate": 0.0002625690962099125, | |
| "loss": 3.1118, | |
| "step": 96550 | |
| }, | |
| { | |
| "epoch": 28.146853146853147, | |
| "grad_norm": 0.4368390142917633, | |
| "learning_rate": 0.0002623941690962099, | |
| "loss": 3.1099, | |
| "step": 96600 | |
| }, | |
| { | |
| "epoch": 28.16142191142191, | |
| "grad_norm": 0.43335017561912537, | |
| "learning_rate": 0.0002622192419825073, | |
| "loss": 3.1148, | |
| "step": 96650 | |
| }, | |
| { | |
| "epoch": 28.175990675990676, | |
| "grad_norm": 0.5231890082359314, | |
| "learning_rate": 0.00026204431486880465, | |
| "loss": 3.128, | |
| "step": 96700 | |
| }, | |
| { | |
| "epoch": 28.19055944055944, | |
| "grad_norm": 0.44500666856765747, | |
| "learning_rate": 0.00026186938775510203, | |
| "loss": 3.1168, | |
| "step": 96750 | |
| }, | |
| { | |
| "epoch": 28.205128205128204, | |
| "grad_norm": 0.4381171464920044, | |
| "learning_rate": 0.0002616944606413994, | |
| "loss": 3.1199, | |
| "step": 96800 | |
| }, | |
| { | |
| "epoch": 28.21969696969697, | |
| "grad_norm": 0.4727230370044708, | |
| "learning_rate": 0.0002615195335276968, | |
| "loss": 3.1237, | |
| "step": 96850 | |
| }, | |
| { | |
| "epoch": 28.234265734265733, | |
| "grad_norm": 0.45557940006256104, | |
| "learning_rate": 0.00026134460641399416, | |
| "loss": 3.136, | |
| "step": 96900 | |
| }, | |
| { | |
| "epoch": 28.248834498834498, | |
| "grad_norm": 0.42124268412590027, | |
| "learning_rate": 0.00026116967930029153, | |
| "loss": 3.1277, | |
| "step": 96950 | |
| }, | |
| { | |
| "epoch": 28.263403263403262, | |
| "grad_norm": 0.4520432651042938, | |
| "learning_rate": 0.0002609947521865889, | |
| "loss": 3.1347, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 28.263403263403262, | |
| "eval_accuracy": 0.37456488440465274, | |
| "eval_loss": 3.5478932857513428, | |
| "eval_runtime": 180.1693, | |
| "eval_samples_per_second": 92.369, | |
| "eval_steps_per_second": 5.778, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 28.277972027972027, | |
| "grad_norm": 0.45506972074508667, | |
| "learning_rate": 0.0002608198250728863, | |
| "loss": 3.1352, | |
| "step": 97050 | |
| }, | |
| { | |
| "epoch": 28.29254079254079, | |
| "grad_norm": 0.45318546891212463, | |
| "learning_rate": 0.00026064489795918366, | |
| "loss": 3.1371, | |
| "step": 97100 | |
| }, | |
| { | |
| "epoch": 28.307109557109555, | |
| "grad_norm": 0.46015825867652893, | |
| "learning_rate": 0.00026046997084548104, | |
| "loss": 3.1348, | |
| "step": 97150 | |
| }, | |
| { | |
| "epoch": 28.32167832167832, | |
| "grad_norm": 0.4895924925804138, | |
| "learning_rate": 0.0002602950437317784, | |
| "loss": 3.141, | |
| "step": 97200 | |
| }, | |
| { | |
| "epoch": 28.336247086247088, | |
| "grad_norm": 0.44168609380722046, | |
| "learning_rate": 0.0002601201166180758, | |
| "loss": 3.1361, | |
| "step": 97250 | |
| }, | |
| { | |
| "epoch": 28.350815850815852, | |
| "grad_norm": 0.44923096895217896, | |
| "learning_rate": 0.00025994518950437316, | |
| "loss": 3.1442, | |
| "step": 97300 | |
| }, | |
| { | |
| "epoch": 28.365384615384617, | |
| "grad_norm": 0.45875313878059387, | |
| "learning_rate": 0.00025977026239067054, | |
| "loss": 3.1504, | |
| "step": 97350 | |
| }, | |
| { | |
| "epoch": 28.37995337995338, | |
| "grad_norm": 0.4395272135734558, | |
| "learning_rate": 0.0002595953352769679, | |
| "loss": 3.1461, | |
| "step": 97400 | |
| }, | |
| { | |
| "epoch": 28.394522144522146, | |
| "grad_norm": 0.4497947096824646, | |
| "learning_rate": 0.0002594204081632653, | |
| "loss": 3.15, | |
| "step": 97450 | |
| }, | |
| { | |
| "epoch": 28.40909090909091, | |
| "grad_norm": 0.4851381182670593, | |
| "learning_rate": 0.00025924548104956267, | |
| "loss": 3.1458, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 28.423659673659674, | |
| "grad_norm": 0.4465165436267853, | |
| "learning_rate": 0.00025907055393586004, | |
| "loss": 3.1481, | |
| "step": 97550 | |
| }, | |
| { | |
| "epoch": 28.43822843822844, | |
| "grad_norm": 0.4833471179008484, | |
| "learning_rate": 0.0002588956268221574, | |
| "loss": 3.1435, | |
| "step": 97600 | |
| }, | |
| { | |
| "epoch": 28.452797202797203, | |
| "grad_norm": 0.4438728392124176, | |
| "learning_rate": 0.0002587206997084548, | |
| "loss": 3.1695, | |
| "step": 97650 | |
| }, | |
| { | |
| "epoch": 28.467365967365968, | |
| "grad_norm": 0.4530220031738281, | |
| "learning_rate": 0.00025854577259475217, | |
| "loss": 3.153, | |
| "step": 97700 | |
| }, | |
| { | |
| "epoch": 28.481934731934732, | |
| "grad_norm": 0.4695287346839905, | |
| "learning_rate": 0.00025837084548104955, | |
| "loss": 3.1391, | |
| "step": 97750 | |
| }, | |
| { | |
| "epoch": 28.496503496503497, | |
| "grad_norm": 0.4446166455745697, | |
| "learning_rate": 0.0002581959183673469, | |
| "loss": 3.1654, | |
| "step": 97800 | |
| }, | |
| { | |
| "epoch": 28.51107226107226, | |
| "grad_norm": 0.4539489448070526, | |
| "learning_rate": 0.0002580209912536443, | |
| "loss": 3.1569, | |
| "step": 97850 | |
| }, | |
| { | |
| "epoch": 28.525641025641026, | |
| "grad_norm": 0.44202882051467896, | |
| "learning_rate": 0.0002578460641399417, | |
| "loss": 3.1621, | |
| "step": 97900 | |
| }, | |
| { | |
| "epoch": 28.54020979020979, | |
| "grad_norm": 0.46865367889404297, | |
| "learning_rate": 0.00025767113702623905, | |
| "loss": 3.1566, | |
| "step": 97950 | |
| }, | |
| { | |
| "epoch": 28.554778554778554, | |
| "grad_norm": 0.4353398084640503, | |
| "learning_rate": 0.0002574962099125364, | |
| "loss": 3.1577, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 28.554778554778554, | |
| "eval_accuracy": 0.37477490168815797, | |
| "eval_loss": 3.543243169784546, | |
| "eval_runtime": 180.0326, | |
| "eval_samples_per_second": 92.439, | |
| "eval_steps_per_second": 5.782, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 28.56934731934732, | |
| "grad_norm": 0.4895898103713989, | |
| "learning_rate": 0.0002573212827988338, | |
| "loss": 3.1526, | |
| "step": 98050 | |
| }, | |
| { | |
| "epoch": 28.583916083916083, | |
| "grad_norm": 0.4979413151741028, | |
| "learning_rate": 0.0002571463556851312, | |
| "loss": 3.1605, | |
| "step": 98100 | |
| }, | |
| { | |
| "epoch": 28.598484848484848, | |
| "grad_norm": 0.4730426073074341, | |
| "learning_rate": 0.00025697142857142855, | |
| "loss": 3.1606, | |
| "step": 98150 | |
| }, | |
| { | |
| "epoch": 28.613053613053612, | |
| "grad_norm": 0.4984246492385864, | |
| "learning_rate": 0.00025679650145772593, | |
| "loss": 3.1651, | |
| "step": 98200 | |
| }, | |
| { | |
| "epoch": 28.627622377622377, | |
| "grad_norm": 0.4455728232860565, | |
| "learning_rate": 0.0002566215743440233, | |
| "loss": 3.151, | |
| "step": 98250 | |
| }, | |
| { | |
| "epoch": 28.64219114219114, | |
| "grad_norm": 0.4372442364692688, | |
| "learning_rate": 0.0002564466472303207, | |
| "loss": 3.1691, | |
| "step": 98300 | |
| }, | |
| { | |
| "epoch": 28.656759906759905, | |
| "grad_norm": 0.45729032158851624, | |
| "learning_rate": 0.00025627172011661806, | |
| "loss": 3.1627, | |
| "step": 98350 | |
| }, | |
| { | |
| "epoch": 28.67132867132867, | |
| "grad_norm": 0.4392733871936798, | |
| "learning_rate": 0.00025609679300291543, | |
| "loss": 3.1576, | |
| "step": 98400 | |
| }, | |
| { | |
| "epoch": 28.685897435897434, | |
| "grad_norm": 0.46147841215133667, | |
| "learning_rate": 0.0002559218658892128, | |
| "loss": 3.1723, | |
| "step": 98450 | |
| }, | |
| { | |
| "epoch": 28.7004662004662, | |
| "grad_norm": 0.4356614351272583, | |
| "learning_rate": 0.0002557469387755102, | |
| "loss": 3.1619, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 28.715034965034967, | |
| "grad_norm": 0.4666474461555481, | |
| "learning_rate": 0.00025557201166180756, | |
| "loss": 3.1639, | |
| "step": 98550 | |
| }, | |
| { | |
| "epoch": 28.72960372960373, | |
| "grad_norm": 0.44134947657585144, | |
| "learning_rate": 0.00025539708454810493, | |
| "loss": 3.1696, | |
| "step": 98600 | |
| }, | |
| { | |
| "epoch": 28.744172494172496, | |
| "grad_norm": 0.43788793683052063, | |
| "learning_rate": 0.0002552221574344023, | |
| "loss": 3.169, | |
| "step": 98650 | |
| }, | |
| { | |
| "epoch": 28.75874125874126, | |
| "grad_norm": 0.44114676117897034, | |
| "learning_rate": 0.0002550472303206997, | |
| "loss": 3.175, | |
| "step": 98700 | |
| }, | |
| { | |
| "epoch": 28.773310023310025, | |
| "grad_norm": 0.4305495619773865, | |
| "learning_rate": 0.00025487230320699706, | |
| "loss": 3.164, | |
| "step": 98750 | |
| }, | |
| { | |
| "epoch": 28.78787878787879, | |
| "grad_norm": 0.45937755703926086, | |
| "learning_rate": 0.00025469737609329444, | |
| "loss": 3.1809, | |
| "step": 98800 | |
| }, | |
| { | |
| "epoch": 28.802447552447553, | |
| "grad_norm": 0.4551764726638794, | |
| "learning_rate": 0.0002545224489795918, | |
| "loss": 3.1747, | |
| "step": 98850 | |
| }, | |
| { | |
| "epoch": 28.817016317016318, | |
| "grad_norm": 0.4215174615383148, | |
| "learning_rate": 0.0002543475218658892, | |
| "loss": 3.1714, | |
| "step": 98900 | |
| }, | |
| { | |
| "epoch": 28.831585081585082, | |
| "grad_norm": 0.4614230990409851, | |
| "learning_rate": 0.00025417259475218657, | |
| "loss": 3.175, | |
| "step": 98950 | |
| }, | |
| { | |
| "epoch": 28.846153846153847, | |
| "grad_norm": 0.43585631251335144, | |
| "learning_rate": 0.00025399766763848394, | |
| "loss": 3.16, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 28.846153846153847, | |
| "eval_accuracy": 0.37547186274041744, | |
| "eval_loss": 3.535378932952881, | |
| "eval_runtime": 179.9708, | |
| "eval_samples_per_second": 92.471, | |
| "eval_steps_per_second": 5.784, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 28.86072261072261, | |
| "grad_norm": 0.4275636672973633, | |
| "learning_rate": 0.0002538227405247813, | |
| "loss": 3.1658, | |
| "step": 99050 | |
| }, | |
| { | |
| "epoch": 28.875291375291376, | |
| "grad_norm": 0.4248562753200531, | |
| "learning_rate": 0.0002536478134110787, | |
| "loss": 3.1766, | |
| "step": 99100 | |
| }, | |
| { | |
| "epoch": 28.88986013986014, | |
| "grad_norm": 0.4694676995277405, | |
| "learning_rate": 0.00025347288629737607, | |
| "loss": 3.169, | |
| "step": 99150 | |
| }, | |
| { | |
| "epoch": 28.904428904428904, | |
| "grad_norm": 0.446225106716156, | |
| "learning_rate": 0.00025329795918367344, | |
| "loss": 3.1817, | |
| "step": 99200 | |
| }, | |
| { | |
| "epoch": 28.91899766899767, | |
| "grad_norm": 0.44096997380256653, | |
| "learning_rate": 0.0002531230320699708, | |
| "loss": 3.1872, | |
| "step": 99250 | |
| }, | |
| { | |
| "epoch": 28.933566433566433, | |
| "grad_norm": 0.4305557906627655, | |
| "learning_rate": 0.0002529481049562682, | |
| "loss": 3.1849, | |
| "step": 99300 | |
| }, | |
| { | |
| "epoch": 28.948135198135198, | |
| "grad_norm": 0.45161905884742737, | |
| "learning_rate": 0.00025277317784256557, | |
| "loss": 3.1889, | |
| "step": 99350 | |
| }, | |
| { | |
| "epoch": 28.962703962703962, | |
| "grad_norm": 0.4424970746040344, | |
| "learning_rate": 0.00025259825072886295, | |
| "loss": 3.1705, | |
| "step": 99400 | |
| }, | |
| { | |
| "epoch": 28.977272727272727, | |
| "grad_norm": 0.451418936252594, | |
| "learning_rate": 0.0002524233236151603, | |
| "loss": 3.1821, | |
| "step": 99450 | |
| }, | |
| { | |
| "epoch": 28.99184149184149, | |
| "grad_norm": 0.43637681007385254, | |
| "learning_rate": 0.0002522483965014577, | |
| "loss": 3.1801, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 29.006410256410255, | |
| "grad_norm": 0.5177982449531555, | |
| "learning_rate": 0.0002520734693877551, | |
| "loss": 3.138, | |
| "step": 99550 | |
| }, | |
| { | |
| "epoch": 29.02097902097902, | |
| "grad_norm": 0.4267365336418152, | |
| "learning_rate": 0.00025189854227405245, | |
| "loss": 3.0772, | |
| "step": 99600 | |
| }, | |
| { | |
| "epoch": 29.035547785547784, | |
| "grad_norm": 0.45578399300575256, | |
| "learning_rate": 0.0002517236151603498, | |
| "loss": 3.0917, | |
| "step": 99650 | |
| }, | |
| { | |
| "epoch": 29.05011655011655, | |
| "grad_norm": 0.47677499055862427, | |
| "learning_rate": 0.0002515486880466472, | |
| "loss": 3.0774, | |
| "step": 99700 | |
| }, | |
| { | |
| "epoch": 29.064685314685313, | |
| "grad_norm": 0.44193127751350403, | |
| "learning_rate": 0.0002513737609329446, | |
| "loss": 3.0921, | |
| "step": 99750 | |
| }, | |
| { | |
| "epoch": 29.079254079254078, | |
| "grad_norm": 0.4814469516277313, | |
| "learning_rate": 0.00025119883381924195, | |
| "loss": 3.108, | |
| "step": 99800 | |
| }, | |
| { | |
| "epoch": 29.093822843822842, | |
| "grad_norm": 0.45265457034111023, | |
| "learning_rate": 0.00025102390670553933, | |
| "loss": 3.0893, | |
| "step": 99850 | |
| }, | |
| { | |
| "epoch": 29.10839160839161, | |
| "grad_norm": 0.46795418858528137, | |
| "learning_rate": 0.0002508489795918367, | |
| "loss": 3.1063, | |
| "step": 99900 | |
| }, | |
| { | |
| "epoch": 29.122960372960375, | |
| "grad_norm": 0.46038514375686646, | |
| "learning_rate": 0.0002506740524781341, | |
| "loss": 3.1051, | |
| "step": 99950 | |
| }, | |
| { | |
| "epoch": 29.13752913752914, | |
| "grad_norm": 0.47460585832595825, | |
| "learning_rate": 0.00025049912536443146, | |
| "loss": 3.1113, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 29.13752913752914, | |
| "eval_accuracy": 0.3746667180930713, | |
| "eval_loss": 3.5507586002349854, | |
| "eval_runtime": 180.1614, | |
| "eval_samples_per_second": 92.373, | |
| "eval_steps_per_second": 5.778, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 29.13752913752914, | |
| "step": 100000, | |
| "total_flos": 2.090305946124288e+18, | |
| "train_loss": 0.6333386157226563, | |
| "train_runtime": 40009.1315, | |
| "train_samples_per_second": 343.117, | |
| "train_steps_per_second": 4.289 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 171600, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 20, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 20 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.090305946124288e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |