{
  "best_metric": 0.4589254856109619,
  "best_model_checkpoint": "Action_model/checkpoint-1500",
  "epoch": 10.0,
  "eval_steps": 100,
  "global_step": 2680,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04,
      "grad_norm": 1.7369908094406128,
      "learning_rate": 9.96268656716418e-05,
      "loss": 2.2759,
      "step": 10
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.753720998764038,
      "learning_rate": 9.925373134328359e-05,
      "loss": 2.1743,
      "step": 20
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.8532754182815552,
      "learning_rate": 9.888059701492539e-05,
      "loss": 2.0233,
      "step": 30
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.195688486099243,
      "learning_rate": 9.850746268656717e-05,
      "loss": 1.8293,
      "step": 40
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.392077684402466,
      "learning_rate": 9.813432835820896e-05,
      "loss": 1.7307,
      "step": 50
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.851775646209717,
      "learning_rate": 9.776119402985075e-05,
      "loss": 1.5716,
      "step": 60
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.2557411193847656,
      "learning_rate": 9.738805970149254e-05,
      "loss": 1.4694,
      "step": 70
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.4612302780151367,
      "learning_rate": 9.701492537313434e-05,
      "loss": 1.3609,
      "step": 80
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.7514560222625732,
      "learning_rate": 9.664179104477612e-05,
      "loss": 1.2871,
      "step": 90
    },
    {
      "epoch": 0.37,
      "grad_norm": 3.6256659030914307,
      "learning_rate": 9.626865671641792e-05,
      "loss": 1.2754,
      "step": 100
    },
    {
      "epoch": 0.37,
      "eval_accuracy": 0.7328646748681898,
      "eval_loss": 1.1163370609283447,
      "eval_runtime": 12.5514,
      "eval_samples_per_second": 45.333,
      "eval_steps_per_second": 5.736,
      "step": 100
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.642601728439331,
      "learning_rate": 9.58955223880597e-05,
      "loss": 1.2354,
      "step": 110
    },
    {
      "epoch": 0.45,
      "grad_norm": 2.4862725734710693,
      "learning_rate": 9.552238805970149e-05,
      "loss": 1.169,
      "step": 120
    },
    {
      "epoch": 0.49,
      "grad_norm": 3.962764263153076,
      "learning_rate": 9.514925373134329e-05,
      "loss": 1.2546,
      "step": 130
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.9388816356658936,
      "learning_rate": 9.477611940298507e-05,
      "loss": 1.1702,
      "step": 140
    },
    {
      "epoch": 0.56,
      "grad_norm": 4.958592414855957,
      "learning_rate": 9.440298507462687e-05,
      "loss": 1.0865,
      "step": 150
    },
    {
      "epoch": 0.6,
      "grad_norm": 3.4470815658569336,
      "learning_rate": 9.402985074626867e-05,
      "loss": 1.0097,
      "step": 160
    },
    {
      "epoch": 0.63,
      "grad_norm": 4.423004627227783,
      "learning_rate": 9.365671641791045e-05,
      "loss": 1.0749,
      "step": 170
    },
    {
      "epoch": 0.67,
      "grad_norm": 2.808164358139038,
      "learning_rate": 9.328358208955224e-05,
      "loss": 0.9732,
      "step": 180
    },
    {
      "epoch": 0.71,
      "grad_norm": 6.00456428527832,
      "learning_rate": 9.291044776119402e-05,
      "loss": 1.0009,
      "step": 190
    },
    {
      "epoch": 0.75,
      "grad_norm": 5.091552734375,
      "learning_rate": 9.253731343283582e-05,
      "loss": 0.9345,
      "step": 200
    },
    {
      "epoch": 0.75,
      "eval_accuracy": 0.7996485061511424,
      "eval_loss": 0.8296495079994202,
      "eval_runtime": 7.8912,
      "eval_samples_per_second": 72.105,
      "eval_steps_per_second": 9.124,
      "step": 200
    },
    {
      "epoch": 0.78,
      "grad_norm": 3.2533326148986816,
      "learning_rate": 9.216417910447762e-05,
      "loss": 0.793,
      "step": 210
    },
    {
      "epoch": 0.82,
      "grad_norm": 6.073918342590332,
      "learning_rate": 9.17910447761194e-05,
      "loss": 0.9835,
      "step": 220
    },
    {
      "epoch": 0.86,
      "grad_norm": 3.6311192512512207,
      "learning_rate": 9.14179104477612e-05,
      "loss": 0.8801,
      "step": 230
    },
    {
      "epoch": 0.9,
      "grad_norm": 4.446895599365234,
      "learning_rate": 9.104477611940299e-05,
      "loss": 1.0534,
      "step": 240
    },
    {
      "epoch": 0.93,
      "grad_norm": 4.668705463409424,
      "learning_rate": 9.067164179104479e-05,
      "loss": 0.9396,
      "step": 250
    },
    {
      "epoch": 0.97,
      "grad_norm": 6.191302299499512,
      "learning_rate": 9.029850746268657e-05,
      "loss": 0.9275,
      "step": 260
    },
    {
      "epoch": 1.01,
      "grad_norm": 3.170959711074829,
      "learning_rate": 8.992537313432836e-05,
      "loss": 0.8595,
      "step": 270
    },
    {
      "epoch": 1.04,
      "grad_norm": 3.690964460372925,
      "learning_rate": 8.955223880597016e-05,
      "loss": 0.733,
      "step": 280
    },
    {
      "epoch": 1.08,
      "grad_norm": 4.871851444244385,
      "learning_rate": 8.917910447761194e-05,
      "loss": 0.7623,
      "step": 290
    },
    {
      "epoch": 1.12,
      "grad_norm": 3.3851799964904785,
      "learning_rate": 8.880597014925374e-05,
      "loss": 0.8816,
      "step": 300
    },
    {
      "epoch": 1.12,
      "eval_accuracy": 0.8101933216168717,
      "eval_loss": 0.7156229615211487,
      "eval_runtime": 7.8519,
      "eval_samples_per_second": 72.467,
      "eval_steps_per_second": 9.17,
      "step": 300
    },
    {
      "epoch": 1.16,
      "grad_norm": 3.334380865097046,
      "learning_rate": 8.843283582089554e-05,
      "loss": 0.8567,
      "step": 310
    },
    {
      "epoch": 1.19,
      "grad_norm": 4.673859596252441,
      "learning_rate": 8.805970149253732e-05,
      "loss": 0.7926,
      "step": 320
    },
    {
      "epoch": 1.23,
      "grad_norm": 3.3042550086975098,
      "learning_rate": 8.76865671641791e-05,
      "loss": 0.6847,
      "step": 330
    },
    {
      "epoch": 1.27,
      "grad_norm": 5.4356513023376465,
      "learning_rate": 8.731343283582089e-05,
      "loss": 0.7656,
      "step": 340
    },
    {
      "epoch": 1.31,
      "grad_norm": 7.050413131713867,
      "learning_rate": 8.694029850746269e-05,
      "loss": 0.6658,
      "step": 350
    },
    {
      "epoch": 1.34,
      "grad_norm": 5.980592727661133,
      "learning_rate": 8.656716417910447e-05,
      "loss": 0.7948,
      "step": 360
    },
    {
      "epoch": 1.38,
      "grad_norm": 3.894716739654541,
      "learning_rate": 8.619402985074627e-05,
      "loss": 0.8381,
      "step": 370
    },
    {
      "epoch": 1.42,
      "grad_norm": 7.189664363861084,
      "learning_rate": 8.582089552238807e-05,
      "loss": 0.6532,
      "step": 380
    },
    {
      "epoch": 1.46,
      "grad_norm": 4.317276477813721,
      "learning_rate": 8.544776119402986e-05,
      "loss": 0.7763,
      "step": 390
    },
    {
      "epoch": 1.49,
      "grad_norm": 4.480589866638184,
      "learning_rate": 8.511194029850747e-05,
      "loss": 0.7425,
      "step": 400
    },
    {
      "epoch": 1.49,
      "eval_accuracy": 0.8066783831282952,
      "eval_loss": 0.6529447436332703,
      "eval_runtime": 7.793,
      "eval_samples_per_second": 73.014,
      "eval_steps_per_second": 9.239,
      "step": 400
    },
    {
      "epoch": 1.53,
      "grad_norm": 4.1799163818359375,
      "learning_rate": 8.473880597014926e-05,
      "loss": 0.6928,
      "step": 410
    },
    {
      "epoch": 1.57,
      "grad_norm": 4.81996488571167,
      "learning_rate": 8.436567164179105e-05,
      "loss": 0.7769,
      "step": 420
    },
    {
      "epoch": 1.6,
      "grad_norm": 7.18645715713501,
      "learning_rate": 8.399253731343283e-05,
      "loss": 0.6848,
      "step": 430
    },
    {
      "epoch": 1.64,
      "grad_norm": 3.888197660446167,
      "learning_rate": 8.361940298507463e-05,
      "loss": 0.5977,
      "step": 440
    },
    {
      "epoch": 1.68,
      "grad_norm": 7.374312877655029,
      "learning_rate": 8.324626865671642e-05,
      "loss": 0.6001,
      "step": 450
    },
    {
      "epoch": 1.72,
      "grad_norm": 6.553064823150635,
      "learning_rate": 8.287313432835821e-05,
      "loss": 0.6683,
      "step": 460
    },
    {
      "epoch": 1.75,
      "grad_norm": 3.466761589050293,
      "learning_rate": 8.25e-05,
      "loss": 0.6484,
      "step": 470
    },
    {
      "epoch": 1.79,
      "grad_norm": 3.534076690673828,
      "learning_rate": 8.21268656716418e-05,
      "loss": 0.6589,
      "step": 480
    },
    {
      "epoch": 1.83,
      "grad_norm": 3.581280469894409,
      "learning_rate": 8.17537313432836e-05,
      "loss": 0.6173,
      "step": 490
    },
    {
      "epoch": 1.87,
      "grad_norm": 6.162041664123535,
      "learning_rate": 8.138059701492538e-05,
      "loss": 0.6883,
      "step": 500
    },
    {
      "epoch": 1.87,
      "eval_accuracy": 0.8242530755711776,
      "eval_loss": 0.6078779697418213,
      "eval_runtime": 7.6716,
      "eval_samples_per_second": 74.169,
      "eval_steps_per_second": 9.385,
      "step": 500
    },
    {
      "epoch": 1.9,
      "grad_norm": 5.477086067199707,
      "learning_rate": 8.100746268656717e-05,
      "loss": 0.5952,
      "step": 510
    },
    {
      "epoch": 1.94,
      "grad_norm": 2.389667510986328,
      "learning_rate": 8.063432835820895e-05,
      "loss": 0.5193,
      "step": 520
    },
    {
      "epoch": 1.98,
      "grad_norm": 5.730781555175781,
      "learning_rate": 8.026119402985075e-05,
      "loss": 0.6818,
      "step": 530
    },
    {
      "epoch": 2.01,
      "grad_norm": 6.305990219116211,
      "learning_rate": 7.992537313432836e-05,
      "loss": 0.5738,
      "step": 540
    },
    {
      "epoch": 2.05,
      "grad_norm": 3.507434368133545,
      "learning_rate": 7.955223880597016e-05,
      "loss": 0.5685,
      "step": 550
    },
    {
      "epoch": 2.09,
      "grad_norm": 12.683993339538574,
      "learning_rate": 7.917910447761194e-05,
      "loss": 0.6684,
      "step": 560
    },
    {
      "epoch": 2.13,
      "grad_norm": 5.5166916847229,
      "learning_rate": 7.880597014925374e-05,
      "loss": 0.4787,
      "step": 570
    },
    {
      "epoch": 2.16,
      "grad_norm": 6.427499294281006,
      "learning_rate": 7.843283582089552e-05,
      "loss": 0.5818,
      "step": 580
    },
    {
      "epoch": 2.2,
      "grad_norm": 5.062973976135254,
      "learning_rate": 7.805970149253732e-05,
      "loss": 0.4766,
      "step": 590
    },
    {
      "epoch": 2.24,
      "grad_norm": 5.720675945281982,
      "learning_rate": 7.768656716417911e-05,
      "loss": 0.5454,
      "step": 600
    },
    {
      "epoch": 2.24,
      "eval_accuracy": 0.8347978910369068,
      "eval_loss": 0.5604887008666992,
      "eval_runtime": 7.7133,
      "eval_samples_per_second": 73.769,
      "eval_steps_per_second": 9.335,
      "step": 600
    },
    {
      "epoch": 2.28,
      "grad_norm": 7.875051021575928,
      "learning_rate": 7.731343283582089e-05,
      "loss": 0.5935,
      "step": 610
    },
    {
      "epoch": 2.31,
      "grad_norm": 4.378401756286621,
      "learning_rate": 7.694029850746269e-05,
      "loss": 0.4639,
      "step": 620
    },
    {
      "epoch": 2.35,
      "grad_norm": 7.522930145263672,
      "learning_rate": 7.656716417910448e-05,
      "loss": 0.4867,
      "step": 630
    },
    {
      "epoch": 2.39,
      "grad_norm": 6.3615288734436035,
      "learning_rate": 7.619402985074627e-05,
      "loss": 0.5302,
      "step": 640
    },
    {
      "epoch": 2.43,
      "grad_norm": 3.8204784393310547,
      "learning_rate": 7.582089552238806e-05,
      "loss": 0.3864,
      "step": 650
    },
    {
      "epoch": 2.46,
      "grad_norm": 2.3520662784576416,
      "learning_rate": 7.544776119402986e-05,
      "loss": 0.6458,
      "step": 660
    },
    {
      "epoch": 2.5,
      "grad_norm": 3.9832942485809326,
      "learning_rate": 7.507462686567166e-05,
      "loss": 0.494,
      "step": 670
    },
    {
      "epoch": 2.54,
      "grad_norm": 3.6783320903778076,
      "learning_rate": 7.470149253731343e-05,
      "loss": 0.6213,
      "step": 680
    },
    {
      "epoch": 2.57,
      "grad_norm": 4.528789520263672,
      "learning_rate": 7.432835820895523e-05,
      "loss": 0.615,
      "step": 690
    },
    {
      "epoch": 2.61,
      "grad_norm": 5.556227207183838,
      "learning_rate": 7.395522388059701e-05,
      "loss": 0.5383,
      "step": 700
    },
    {
      "epoch": 2.61,
      "eval_accuracy": 0.8295254833040422,
      "eval_loss": 0.5571200251579285,
      "eval_runtime": 7.8934,
      "eval_samples_per_second": 72.085,
      "eval_steps_per_second": 9.122,
      "step": 700
    },
    {
      "epoch": 2.65,
      "grad_norm": 4.617480754852295,
      "learning_rate": 7.358208955223881e-05,
      "loss": 0.4987,
      "step": 710
    },
    {
      "epoch": 2.69,
      "grad_norm": 4.6940412521362305,
      "learning_rate": 7.32089552238806e-05,
      "loss": 0.5466,
      "step": 720
    },
    {
      "epoch": 2.72,
      "grad_norm": 3.8839175701141357,
      "learning_rate": 7.283582089552239e-05,
      "loss": 0.5409,
      "step": 730
    },
    {
      "epoch": 2.76,
      "grad_norm": 6.855696201324463,
      "learning_rate": 7.246268656716419e-05,
      "loss": 0.3972,
      "step": 740
    },
    {
      "epoch": 2.8,
      "grad_norm": 3.9779269695281982,
      "learning_rate": 7.208955223880597e-05,
      "loss": 0.4719,
      "step": 750
    },
    {
      "epoch": 2.84,
      "grad_norm": 10.327420234680176,
      "learning_rate": 7.171641791044776e-05,
      "loss": 0.668,
      "step": 760
    },
    {
      "epoch": 2.87,
      "grad_norm": 5.06951904296875,
      "learning_rate": 7.134328358208956e-05,
      "loss": 0.5899,
      "step": 770
    },
    {
      "epoch": 2.91,
      "grad_norm": 5.539373397827148,
      "learning_rate": 7.097014925373134e-05,
      "loss": 0.5813,
      "step": 780
    },
    {
      "epoch": 2.95,
      "grad_norm": 4.622121334075928,
      "learning_rate": 7.059701492537314e-05,
      "loss": 0.5294,
      "step": 790
    },
    {
      "epoch": 2.99,
      "grad_norm": 2.6457552909851074,
      "learning_rate": 7.022388059701493e-05,
      "loss": 0.5442,
      "step": 800
    },
    {
      "epoch": 2.99,
      "eval_accuracy": 0.8189806678383128,
      "eval_loss": 0.5864126682281494,
      "eval_runtime": 7.8507,
      "eval_samples_per_second": 72.478,
      "eval_steps_per_second": 9.171,
      "step": 800
    },
    {
      "epoch": 3.02,
      "grad_norm": 3.373798370361328,
      "learning_rate": 6.985074626865672e-05,
      "loss": 0.4183,
      "step": 810
    },
    {
      "epoch": 3.06,
      "grad_norm": 4.0179667472839355,
      "learning_rate": 6.947761194029851e-05,
      "loss": 0.3611,
      "step": 820
    },
    {
      "epoch": 3.1,
      "grad_norm": 7.72437858581543,
      "learning_rate": 6.91044776119403e-05,
      "loss": 0.4543,
      "step": 830
    },
    {
      "epoch": 3.13,
      "grad_norm": 3.1097893714904785,
      "learning_rate": 6.873134328358209e-05,
      "loss": 0.5194,
      "step": 840
    },
    {
      "epoch": 3.17,
      "grad_norm": 6.581250190734863,
      "learning_rate": 6.835820895522388e-05,
      "loss": 0.3839,
      "step": 850
    },
    {
      "epoch": 3.21,
      "grad_norm": 5.605171203613281,
      "learning_rate": 6.798507462686568e-05,
      "loss": 0.4499,
      "step": 860
    },
    {
      "epoch": 3.25,
      "grad_norm": 2.834651231765747,
      "learning_rate": 6.761194029850747e-05,
      "loss": 0.5067,
      "step": 870
    },
    {
      "epoch": 3.28,
      "grad_norm": 4.615099906921387,
      "learning_rate": 6.723880597014926e-05,
      "loss": 0.4869,
      "step": 880
    },
    {
      "epoch": 3.32,
      "grad_norm": 6.115981101989746,
      "learning_rate": 6.686567164179106e-05,
      "loss": 0.4793,
      "step": 890
    },
    {
      "epoch": 3.36,
      "grad_norm": 1.1021697521209717,
      "learning_rate": 6.649253731343283e-05,
      "loss": 0.3986,
      "step": 900
    },
    {
      "epoch": 3.36,
      "eval_accuracy": 0.8312829525483304,
      "eval_loss": 0.5632173418998718,
      "eval_runtime": 7.731,
      "eval_samples_per_second": 73.6,
      "eval_steps_per_second": 9.313,
      "step": 900
    },
    {
      "epoch": 3.4,
      "grad_norm": 7.019008159637451,
      "learning_rate": 6.611940298507463e-05,
      "loss": 0.383,
      "step": 910
    },
    {
      "epoch": 3.43,
      "grad_norm": 2.586031913757324,
      "learning_rate": 6.574626865671642e-05,
      "loss": 0.2752,
      "step": 920
    },
    {
      "epoch": 3.47,
      "grad_norm": 2.5189669132232666,
      "learning_rate": 6.537313432835821e-05,
      "loss": 0.2944,
      "step": 930
    },
    {
      "epoch": 3.51,
      "grad_norm": 10.028382301330566,
      "learning_rate": 6.500000000000001e-05,
      "loss": 0.4378,
      "step": 940
    },
    {
      "epoch": 3.54,
      "grad_norm": 1.8697803020477295,
      "learning_rate": 6.462686567164179e-05,
      "loss": 0.3956,
      "step": 950
    },
    {
      "epoch": 3.58,
      "grad_norm": 5.872415065765381,
      "learning_rate": 6.425373134328359e-05,
      "loss": 0.338,
      "step": 960
    },
    {
      "epoch": 3.62,
      "grad_norm": 8.272451400756836,
      "learning_rate": 6.388059701492538e-05,
      "loss": 0.4264,
      "step": 970
    },
    {
      "epoch": 3.66,
      "grad_norm": 9.422249794006348,
      "learning_rate": 6.350746268656716e-05,
      "loss": 0.4258,
      "step": 980
    },
    {
      "epoch": 3.69,
      "grad_norm": 8.768738746643066,
      "learning_rate": 6.313432835820896e-05,
      "loss": 0.3308,
      "step": 990
    },
    {
      "epoch": 3.73,
      "grad_norm": 6.355968475341797,
      "learning_rate": 6.276119402985074e-05,
      "loss": 0.3438,
      "step": 1000
    },
    {
      "epoch": 3.73,
      "eval_accuracy": 0.836555360281195,
      "eval_loss": 0.5606371760368347,
      "eval_runtime": 7.818,
      "eval_samples_per_second": 72.781,
      "eval_steps_per_second": 9.21,
      "step": 1000
    },
    {
      "epoch": 3.77,
      "grad_norm": 3.973480463027954,
      "learning_rate": 6.238805970149254e-05,
      "loss": 0.5042,
      "step": 1010
    },
    {
      "epoch": 3.81,
      "grad_norm": 5.739313125610352,
      "learning_rate": 6.201492537313434e-05,
      "loss": 0.4515,
      "step": 1020
    },
    {
      "epoch": 3.84,
      "grad_norm": 4.196649074554443,
      "learning_rate": 6.164179104477613e-05,
      "loss": 0.4404,
      "step": 1030
    },
    {
      "epoch": 3.88,
      "grad_norm": 4.671971321105957,
      "learning_rate": 6.126865671641791e-05,
      "loss": 0.4746,
      "step": 1040
    },
    {
      "epoch": 3.92,
      "grad_norm": 6.87581205368042,
      "learning_rate": 6.08955223880597e-05,
      "loss": 0.4637,
      "step": 1050
    },
    {
      "epoch": 3.96,
      "grad_norm": 7.224815368652344,
      "learning_rate": 6.052238805970149e-05,
      "loss": 0.4754,
      "step": 1060
    },
    {
      "epoch": 3.99,
      "grad_norm": 4.4340314865112305,
      "learning_rate": 6.014925373134329e-05,
      "loss": 0.4165,
      "step": 1070
    },
    {
      "epoch": 4.03,
      "grad_norm": 1.151932716369629,
      "learning_rate": 5.9776119402985076e-05,
      "loss": 0.3498,
      "step": 1080
    },
    {
      "epoch": 4.07,
      "grad_norm": 6.31879997253418,
      "learning_rate": 5.940298507462687e-05,
      "loss": 0.3505,
      "step": 1090
    },
    {
      "epoch": 4.1,
      "grad_norm": 4.674696445465088,
      "learning_rate": 5.902985074626865e-05,
      "loss": 0.4345,
      "step": 1100
    },
    {
      "epoch": 4.1,
      "eval_accuracy": 0.836555360281195,
      "eval_loss": 0.5353797674179077,
      "eval_runtime": 7.9559,
      "eval_samples_per_second": 71.519,
      "eval_steps_per_second": 9.05,
      "step": 1100
    },
    {
      "epoch": 4.14,
      "grad_norm": 6.790203094482422,
      "learning_rate": 5.865671641791045e-05,
      "loss": 0.3189,
      "step": 1110
    },
    {
      "epoch": 4.18,
      "grad_norm": 5.554905414581299,
      "learning_rate": 5.828358208955225e-05,
      "loss": 0.3255,
      "step": 1120
    },
    {
      "epoch": 4.22,
      "grad_norm": 1.87189781665802,
      "learning_rate": 5.7910447761194034e-05,
      "loss": 0.2613,
      "step": 1130
    },
    {
      "epoch": 4.25,
      "grad_norm": 3.4729249477386475,
      "learning_rate": 5.7537313432835826e-05,
      "loss": 0.4037,
      "step": 1140
    },
    {
      "epoch": 4.29,
      "grad_norm": 3.2373063564300537,
      "learning_rate": 5.716417910447761e-05,
      "loss": 0.384,
      "step": 1150
    },
    {
      "epoch": 4.33,
      "grad_norm": 1.8042526245117188,
      "learning_rate": 5.679104477611941e-05,
      "loss": 0.4024,
      "step": 1160
    },
    {
      "epoch": 4.37,
      "grad_norm": 0.9592193365097046,
      "learning_rate": 5.64179104477612e-05,
      "loss": 0.3646,
      "step": 1170
    },
    {
      "epoch": 4.4,
      "grad_norm": 4.0469584465026855,
      "learning_rate": 5.6044776119402986e-05,
      "loss": 0.3622,
      "step": 1180
    },
    {
      "epoch": 4.44,
      "grad_norm": 4.470405578613281,
      "learning_rate": 5.5671641791044784e-05,
      "loss": 0.2996,
      "step": 1190
    },
    {
      "epoch": 4.48,
      "grad_norm": 6.086768627166748,
      "learning_rate": 5.529850746268657e-05,
      "loss": 0.4523,
      "step": 1200
    },
    {
      "epoch": 4.48,
      "eval_accuracy": 0.8576449912126538,
      "eval_loss": 0.49876561760902405,
      "eval_runtime": 7.8527,
      "eval_samples_per_second": 72.459,
      "eval_steps_per_second": 9.169,
      "step": 1200
    },
    {
      "epoch": 4.51,
      "grad_norm": 3.478428363800049,
      "learning_rate": 5.492537313432836e-05,
      "loss": 0.4198,
      "step": 1210
    },
    {
      "epoch": 4.55,
      "grad_norm": 4.539990425109863,
      "learning_rate": 5.455223880597016e-05,
      "loss": 0.3125,
      "step": 1220
    },
    {
      "epoch": 4.59,
      "grad_norm": 3.971435070037842,
      "learning_rate": 5.4179104477611943e-05,
      "loss": 0.2773,
      "step": 1230
    },
    {
      "epoch": 4.63,
      "grad_norm": 7.168191909790039,
      "learning_rate": 5.3805970149253735e-05,
      "loss": 0.4852,
      "step": 1240
    },
    {
      "epoch": 4.66,
      "grad_norm": 2.896576166152954,
      "learning_rate": 5.343283582089552e-05,
      "loss": 0.3425,
      "step": 1250
    },
    {
      "epoch": 4.7,
      "grad_norm": 1.4190607070922852,
      "learning_rate": 5.305970149253732e-05,
      "loss": 0.2219,
      "step": 1260
    },
    {
      "epoch": 4.74,
      "grad_norm": 5.066045761108398,
      "learning_rate": 5.268656716417911e-05,
      "loss": 0.3447,
      "step": 1270
    },
    {
      "epoch": 4.78,
      "grad_norm": 4.2649126052856445,
      "learning_rate": 5.2313432835820895e-05,
      "loss": 0.3931,
      "step": 1280
    },
    {
      "epoch": 4.81,
      "grad_norm": 5.704684734344482,
      "learning_rate": 5.197761194029851e-05,
      "loss": 0.4274,
      "step": 1290
    },
    {
      "epoch": 4.85,
      "grad_norm": 6.395939350128174,
      "learning_rate": 5.16044776119403e-05,
      "loss": 0.3162,
      "step": 1300
    },
    {
      "epoch": 4.85,
      "eval_accuracy": 0.8541300527240774,
      "eval_loss": 0.5099390745162964,
      "eval_runtime": 7.9919,
      "eval_samples_per_second": 71.197,
      "eval_steps_per_second": 9.009,
      "step": 1300
    },
    {
      "epoch": 4.89,
      "grad_norm": 2.4717729091644287,
      "learning_rate": 5.123134328358209e-05,
      "loss": 0.3442,
      "step": 1310
    },
    {
      "epoch": 4.93,
      "grad_norm": 0.6504545211791992,
      "learning_rate": 5.0858208955223885e-05,
      "loss": 0.3313,
      "step": 1320
    },
    {
      "epoch": 4.96,
      "grad_norm": 4.316141128540039,
      "learning_rate": 5.048507462686567e-05,
      "loss": 0.3787,
      "step": 1330
    },
    {
      "epoch": 5.0,
      "grad_norm": 4.9243998527526855,
      "learning_rate": 5.011194029850746e-05,
      "loss": 0.38,
      "step": 1340
    },
    {
      "epoch": 5.04,
      "grad_norm": 5.312038421630859,
      "learning_rate": 4.973880597014925e-05,
      "loss": 0.3268,
      "step": 1350
    },
    {
      "epoch": 5.07,
      "grad_norm": 3.5483176708221436,
      "learning_rate": 4.9365671641791045e-05,
      "loss": 0.3423,
      "step": 1360
    },
    {
      "epoch": 5.11,
      "grad_norm": 4.414547920227051,
      "learning_rate": 4.899253731343284e-05,
      "loss": 0.2421,
      "step": 1370
    },
    {
      "epoch": 5.15,
      "grad_norm": 5.7323689460754395,
      "learning_rate": 4.861940298507463e-05,
      "loss": 0.2795,
      "step": 1380
    },
    {
      "epoch": 5.19,
      "grad_norm": 4.2763471603393555,
      "learning_rate": 4.824626865671642e-05,
      "loss": 0.2402,
      "step": 1390
    },
    {
      "epoch": 5.22,
      "grad_norm": 9.259199142456055,
      "learning_rate": 4.787313432835821e-05,
      "loss": 0.3793,
      "step": 1400
    },
    {
      "epoch": 5.22,
      "eval_accuracy": 0.843585237258348,
      "eval_loss": 0.5190387964248657,
      "eval_runtime": 7.7562,
      "eval_samples_per_second": 73.361,
      "eval_steps_per_second": 9.283,
      "step": 1400
    },
    {
      "epoch": 5.26,
      "grad_norm": 4.773892402648926,
      "learning_rate": 4.75e-05,
      "loss": 0.3476,
      "step": 1410
    },
    {
      "epoch": 5.3,
      "grad_norm": 1.1271159648895264,
      "learning_rate": 4.7126865671641794e-05,
      "loss": 0.1949,
      "step": 1420
    },
    {
      "epoch": 5.34,
      "grad_norm": 2.823958158493042,
      "learning_rate": 4.6753731343283586e-05,
      "loss": 0.3009,
      "step": 1430
    },
    {
      "epoch": 5.37,
      "grad_norm": 0.35977163910865784,
      "learning_rate": 4.638059701492538e-05,
      "loss": 0.1821,
      "step": 1440
    },
    {
      "epoch": 5.41,
      "grad_norm": 3.380308151245117,
      "learning_rate": 4.600746268656716e-05,
      "loss": 0.323,
      "step": 1450
    },
    {
      "epoch": 5.45,
      "grad_norm": 5.946179389953613,
      "learning_rate": 4.5634328358208954e-05,
      "loss": 0.5344,
      "step": 1460
    },
    {
      "epoch": 5.49,
      "grad_norm": 8.254781723022461,
      "learning_rate": 4.526119402985075e-05,
      "loss": 0.2799,
      "step": 1470
    },
    {
      "epoch": 5.52,
      "grad_norm": 6.808130741119385,
      "learning_rate": 4.4888059701492544e-05,
      "loss": 0.3173,
      "step": 1480
    },
    {
      "epoch": 5.56,
      "grad_norm": 17.452037811279297,
      "learning_rate": 4.451492537313433e-05,
      "loss": 0.3251,
      "step": 1490
    },
    {
      "epoch": 5.6,
      "grad_norm": 2.3097095489501953,
      "learning_rate": 4.414179104477612e-05,
      "loss": 0.3228,
      "step": 1500
    },
    {
      "epoch": 5.6,
      "eval_accuracy": 0.8576449912126538,
      "eval_loss": 0.4589254856109619,
      "eval_runtime": 8.0547,
      "eval_samples_per_second": 70.642,
      "eval_steps_per_second": 8.939,
      "step": 1500
    },
    {
      "epoch": 5.63,
      "grad_norm": 3.337970018386841,
      "learning_rate": 4.376865671641791e-05,
      "loss": 0.2528,
      "step": 1510
    },
    {
      "epoch": 5.67,
      "grad_norm": 0.5921415090560913,
      "learning_rate": 4.33955223880597e-05,
      "loss": 0.2459,
      "step": 1520
    },
    {
      "epoch": 5.71,
      "grad_norm": 4.148998260498047,
      "learning_rate": 4.3022388059701495e-05,
      "loss": 0.2927,
      "step": 1530
    },
    {
      "epoch": 5.75,
      "grad_norm": 5.740537166595459,
      "learning_rate": 4.2649253731343286e-05,
      "loss": 0.423,
      "step": 1540
    },
    {
      "epoch": 5.78,
      "grad_norm": 5.316250324249268,
      "learning_rate": 4.227611940298508e-05,
      "loss": 0.3735,
      "step": 1550
    },
    {
      "epoch": 5.82,
      "grad_norm": 5.52378511428833,
      "learning_rate": 4.190298507462686e-05,
      "loss": 0.3613,
      "step": 1560
    },
    {
      "epoch": 5.86,
      "grad_norm": 2.1002511978149414,
      "learning_rate": 4.152985074626866e-05,
      "loss": 0.259,
      "step": 1570
    },
    {
      "epoch": 5.9,
      "grad_norm": 5.339119911193848,
      "learning_rate": 4.115671641791045e-05,
      "loss": 0.3355,
      "step": 1580
    },
    {
      "epoch": 5.93,
      "grad_norm": 3.0551536083221436,
      "learning_rate": 4.0783582089552244e-05,
      "loss": 0.4342,
      "step": 1590
    },
    {
      "epoch": 5.97,
      "grad_norm": 6.549235820770264,
      "learning_rate": 4.041044776119403e-05,
      "loss": 0.1795,
      "step": 1600
    },
    {
      "epoch": 5.97,
      "eval_accuracy": 0.8488576449912126,
      "eval_loss": 0.5095508694648743,
      "eval_runtime": 7.7872,
      "eval_samples_per_second": 73.068,
      "eval_steps_per_second": 9.246,
      "step": 1600
    },
    {
      "epoch": 6.01,
      "grad_norm": 11.5170316696167,
      "learning_rate": 4.003731343283582e-05,
      "loss": 0.3778,
      "step": 1610
    },
    {
      "epoch": 6.04,
      "grad_norm": 6.004143238067627,
      "learning_rate": 3.966417910447761e-05,
      "loss": 0.3624,
      "step": 1620
    },
    {
      "epoch": 6.08,
      "grad_norm": 4.328847885131836,
      "learning_rate": 3.9291044776119404e-05,
      "loss": 0.3478,
      "step": 1630
    },
    {
      "epoch": 6.12,
      "grad_norm": 3.5757558345794678,
      "learning_rate": 3.8917910447761195e-05,
      "loss": 0.2208,
      "step": 1640
    },
    {
      "epoch": 6.16,
      "grad_norm": 8.37783432006836,
      "learning_rate": 3.854477611940299e-05,
      "loss": 0.3614,
      "step": 1650
    },
    {
      "epoch": 6.19,
      "grad_norm": 2.4890713691711426,
      "learning_rate": 3.817164179104478e-05,
      "loss": 0.2514,
      "step": 1660
    },
    {
      "epoch": 6.23,
      "grad_norm": 8.873276710510254,
      "learning_rate": 3.7798507462686563e-05,
      "loss": 0.2233,
      "step": 1670
    },
    {
      "epoch": 6.27,
      "grad_norm": 0.29393309354782104,
      "learning_rate": 3.742537313432836e-05,
      "loss": 0.2474,
      "step": 1680
    },
    {
      "epoch": 6.31,
      "grad_norm": 3.810150384902954,
      "learning_rate": 3.7052238805970153e-05,
      "loss": 0.2481,
      "step": 1690
    },
    {
      "epoch": 6.34,
      "grad_norm": 1.989057183265686,
      "learning_rate": 3.6679104477611945e-05,
      "loss": 0.2626,
      "step": 1700
    },
    {
      "epoch": 6.34,
      "eval_accuracy": 0.8488576449912126,
      "eval_loss": 0.5402765274047852,
      "eval_runtime": 7.9293,
      "eval_samples_per_second": 71.759,
      "eval_steps_per_second": 9.08,
      "step": 1700
    },
    {
      "epoch": 6.38,
      "grad_norm": 8.488819122314453,
      "learning_rate": 3.630597014925373e-05,
      "loss": 0.2826,
      "step": 1710
    },
    {
      "epoch": 6.42,
      "grad_norm": 5.542993068695068,
      "learning_rate": 3.593283582089552e-05,
      "loss": 0.3552,
      "step": 1720
    },
    {
      "epoch": 6.46,
      "grad_norm": 6.646905422210693,
      "learning_rate": 3.555970149253732e-05,
      "loss": 0.4405,
      "step": 1730
    },
    {
      "epoch": 6.49,
      "grad_norm": 4.022976398468018,
      "learning_rate": 3.5186567164179105e-05,
      "loss": 0.2738,
      "step": 1740
    },
    {
      "epoch": 6.53,
      "grad_norm": 3.5472657680511475,
      "learning_rate": 3.4813432835820896e-05,
      "loss": 0.2807,
      "step": 1750
    },
    {
      "epoch": 6.57,
      "grad_norm": 12.070052146911621,
      "learning_rate": 3.444029850746269e-05,
      "loss": 0.3634,
      "step": 1760
    },
    {
      "epoch": 6.6,
      "grad_norm": 5.368374347686768,
      "learning_rate": 3.406716417910448e-05,
      "loss": 0.3252,
      "step": 1770
    },
    {
      "epoch": 6.64,
      "grad_norm": 5.566130638122559,
      "learning_rate": 3.369402985074627e-05,
      "loss": 0.3034,
      "step": 1780
    },
    {
      "epoch": 6.68,
      "grad_norm": 5.875336170196533,
      "learning_rate": 3.332089552238806e-05,
      "loss": 0.3406,
      "step": 1790
    },
    {
      "epoch": 6.72,
      "grad_norm": 2.4168920516967773,
      "learning_rate": 3.2947761194029854e-05,
      "loss": 0.3041,
      "step": 1800
    },
    {
      "epoch": 6.72,
      "eval_accuracy": 0.8488576449912126,
      "eval_loss": 0.4907586872577667,
      "eval_runtime": 7.8209,
      "eval_samples_per_second": 72.754,
      "eval_steps_per_second": 9.206,
      "step": 1800
    },
    {
      "epoch": 6.75,
| "grad_norm": 3.1040282249450684, | |
| "learning_rate": 3.2574626865671646e-05, | |
| "loss": 0.3167, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 6.79, | |
| "grad_norm": 1.8458846807479858, | |
| "learning_rate": 3.220149253731343e-05, | |
| "loss": 0.2061, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 6.83, | |
| "grad_norm": 0.4053177833557129, | |
| "learning_rate": 3.182835820895523e-05, | |
| "loss": 0.3113, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 6.87, | |
| "grad_norm": 0.23064230382442474, | |
| "learning_rate": 3.145522388059702e-05, | |
| "loss": 0.2368, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 6.9, | |
| "grad_norm": 1.006479263305664, | |
| "learning_rate": 3.1082089552238805e-05, | |
| "loss": 0.2265, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 6.94, | |
| "grad_norm": 4.072957992553711, | |
| "learning_rate": 3.07089552238806e-05, | |
| "loss": 0.2976, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 6.98, | |
| "grad_norm": 16.575963973999023, | |
| "learning_rate": 3.033582089552239e-05, | |
| "loss": 0.1504, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 7.01, | |
| "grad_norm": 2.9144656658172607, | |
| "learning_rate": 2.9962686567164183e-05, | |
| "loss": 0.2156, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 7.05, | |
| "grad_norm": 4.547207832336426, | |
| "learning_rate": 2.958955223880597e-05, | |
| "loss": 0.2693, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 7.09, | |
| "grad_norm": 0.5566532611846924, | |
| "learning_rate": 2.9216417910447763e-05, | |
| "loss": 0.1831, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 7.09, | |
| "eval_accuracy": 0.8383128295254832, | |
| "eval_loss": 0.5721341967582703, | |
| "eval_runtime": 7.7377, | |
| "eval_samples_per_second": 73.536, | |
| "eval_steps_per_second": 9.305, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 7.13, | |
| "grad_norm": 7.9241838455200195, | |
| "learning_rate": 2.8843283582089555e-05, | |
| "loss": 0.3037, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 7.16, | |
| "grad_norm": 4.847833156585693, | |
| "learning_rate": 2.8470149253731343e-05, | |
| "loss": 0.2744, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 7.2, | |
| "grad_norm": 4.368974208831787, | |
| "learning_rate": 2.8097014925373134e-05, | |
| "loss": 0.1603, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 7.24, | |
| "grad_norm": 5.848027229309082, | |
| "learning_rate": 2.772388059701493e-05, | |
| "loss": 0.3318, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 7.28, | |
| "grad_norm": 5.53363037109375, | |
| "learning_rate": 2.7350746268656718e-05, | |
| "loss": 0.2568, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 7.31, | |
| "grad_norm": 1.3791863918304443, | |
| "learning_rate": 2.697761194029851e-05, | |
| "loss": 0.2186, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 7.35, | |
| "grad_norm": 13.533841133117676, | |
| "learning_rate": 2.6604477611940297e-05, | |
| "loss": 0.2772, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 7.39, | |
| "grad_norm": 1.113595962524414, | |
| "learning_rate": 2.623134328358209e-05, | |
| "loss": 0.3396, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 7.43, | |
| "grad_norm": 3.193376064300537, | |
| "learning_rate": 2.5858208955223884e-05, | |
| "loss": 0.2171, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 7.46, | |
| "grad_norm": 2.8687243461608887, | |
| "learning_rate": 2.5485074626865672e-05, | |
| "loss": 0.2275, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 7.46, | |
| "eval_accuracy": 0.8312829525483304, | |
| "eval_loss": 0.5349107980728149, | |
| "eval_runtime": 8.0113, | |
| "eval_samples_per_second": 71.025, | |
| "eval_steps_per_second": 8.987, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "grad_norm": 6.330258846282959, | |
| "learning_rate": 2.5111940298507464e-05, | |
| "loss": 0.2165, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 7.54, | |
| "grad_norm": 2.457519769668579, | |
| "learning_rate": 2.4738805970149252e-05, | |
| "loss": 0.3275, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 7.57, | |
| "grad_norm": 1.468772053718567, | |
| "learning_rate": 2.4365671641791047e-05, | |
| "loss": 0.186, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 7.61, | |
| "grad_norm": 4.308888912200928, | |
| "learning_rate": 2.3992537313432835e-05, | |
| "loss": 0.3182, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 7.65, | |
| "grad_norm": 1.8849867582321167, | |
| "learning_rate": 2.361940298507463e-05, | |
| "loss": 0.2631, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 7.69, | |
| "grad_norm": 2.6795170307159424, | |
| "learning_rate": 2.3246268656716418e-05, | |
| "loss": 0.1724, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 7.72, | |
| "grad_norm": 0.22702960669994354, | |
| "learning_rate": 2.287313432835821e-05, | |
| "loss": 0.2542, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 7.76, | |
| "grad_norm": 4.6633429527282715, | |
| "learning_rate": 2.25e-05, | |
| "loss": 0.259, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 7.8, | |
| "grad_norm": 6.543178558349609, | |
| "learning_rate": 2.2126865671641793e-05, | |
| "loss": 0.3752, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 7.84, | |
| "grad_norm": 7.109080791473389, | |
| "learning_rate": 2.1753731343283585e-05, | |
| "loss": 0.1762, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 7.84, | |
| "eval_accuracy": 0.8541300527240774, | |
| "eval_loss": 0.5203543901443481, | |
| "eval_runtime": 7.8922, | |
| "eval_samples_per_second": 72.096, | |
| "eval_steps_per_second": 9.123, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 7.87, | |
| "grad_norm": 3.3965115547180176, | |
| "learning_rate": 2.1380597014925373e-05, | |
| "loss": 0.1965, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 7.91, | |
| "grad_norm": 0.1386798918247223, | |
| "learning_rate": 2.1007462686567164e-05, | |
| "loss": 0.1448, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 7.95, | |
| "grad_norm": 8.268773078918457, | |
| "learning_rate": 2.0634328358208956e-05, | |
| "loss": 0.2203, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 7.99, | |
| "grad_norm": 2.712890625, | |
| "learning_rate": 2.0261194029850748e-05, | |
| "loss": 0.2104, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 8.02, | |
| "grad_norm": 2.0390050411224365, | |
| "learning_rate": 1.988805970149254e-05, | |
| "loss": 0.2063, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 8.06, | |
| "grad_norm": 4.355598449707031, | |
| "learning_rate": 1.951492537313433e-05, | |
| "loss": 0.1356, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 8.1, | |
| "grad_norm": 9.854630470275879, | |
| "learning_rate": 1.914179104477612e-05, | |
| "loss": 0.1686, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 8.13, | |
| "grad_norm": 4.178330421447754, | |
| "learning_rate": 1.8768656716417914e-05, | |
| "loss": 0.2578, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 8.17, | |
| "grad_norm": 5.019784450531006, | |
| "learning_rate": 1.8395522388059702e-05, | |
| "loss": 0.1923, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 8.21, | |
| "grad_norm": 3.8136210441589355, | |
| "learning_rate": 1.8022388059701494e-05, | |
| "loss": 0.2112, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 8.21, | |
| "eval_accuracy": 0.8629173989455184, | |
| "eval_loss": 0.5188840627670288, | |
| "eval_runtime": 8.1412, | |
| "eval_samples_per_second": 69.891, | |
| "eval_steps_per_second": 8.844, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 8.25, | |
| "grad_norm": 2.7035305500030518, | |
| "learning_rate": 1.7649253731343285e-05, | |
| "loss": 0.2501, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 8.28, | |
| "grad_norm": 6.736306190490723, | |
| "learning_rate": 1.7276119402985073e-05, | |
| "loss": 0.2213, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 8.32, | |
| "grad_norm": 3.0436556339263916, | |
| "learning_rate": 1.690298507462687e-05, | |
| "loss": 0.1285, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 8.36, | |
| "grad_norm": 4.729572772979736, | |
| "learning_rate": 1.6529850746268657e-05, | |
| "loss": 0.2984, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 8.4, | |
| "grad_norm": 3.6665098667144775, | |
| "learning_rate": 1.6156716417910448e-05, | |
| "loss": 0.1796, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 8.43, | |
| "grad_norm": 8.485068321228027, | |
| "learning_rate": 1.578358208955224e-05, | |
| "loss": 0.2137, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 8.47, | |
| "grad_norm": 4.643974304199219, | |
| "learning_rate": 1.541044776119403e-05, | |
| "loss": 0.3009, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 8.51, | |
| "grad_norm": 2.91859769821167, | |
| "learning_rate": 1.5037313432835823e-05, | |
| "loss": 0.1855, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 8.54, | |
| "grad_norm": 9.799684524536133, | |
| "learning_rate": 1.4664179104477613e-05, | |
| "loss": 0.2186, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 8.58, | |
| "grad_norm": 4.92659330368042, | |
| "learning_rate": 1.4291044776119403e-05, | |
| "loss": 0.1242, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 8.58, | |
| "eval_accuracy": 0.8471001757469244, | |
| "eval_loss": 0.5376706123352051, | |
| "eval_runtime": 7.8653, | |
| "eval_samples_per_second": 72.343, | |
| "eval_steps_per_second": 9.154, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 8.62, | |
| "grad_norm": 0.7728621363639832, | |
| "learning_rate": 1.3917910447761196e-05, | |
| "loss": 0.2769, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 8.66, | |
| "grad_norm": 3.757192373275757, | |
| "learning_rate": 1.3544776119402986e-05, | |
| "loss": 0.31, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 8.69, | |
| "grad_norm": 5.901330471038818, | |
| "learning_rate": 1.3171641791044777e-05, | |
| "loss": 0.2488, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 8.73, | |
| "grad_norm": 0.1360226422548294, | |
| "learning_rate": 1.2798507462686567e-05, | |
| "loss": 0.2359, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 8.77, | |
| "grad_norm": 5.801501750946045, | |
| "learning_rate": 1.2425373134328359e-05, | |
| "loss": 0.23, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 8.81, | |
| "grad_norm": 3.3060359954833984, | |
| "learning_rate": 1.2052238805970149e-05, | |
| "loss": 0.1114, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 8.84, | |
| "grad_norm": 2.0813100337982178, | |
| "learning_rate": 1.167910447761194e-05, | |
| "loss": 0.1569, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 8.88, | |
| "grad_norm": 0.42951256036758423, | |
| "learning_rate": 1.1305970149253732e-05, | |
| "loss": 0.2636, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 8.92, | |
| "grad_norm": 3.2714788913726807, | |
| "learning_rate": 1.0932835820895524e-05, | |
| "loss": 0.2197, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 8.96, | |
| "grad_norm": 4.24855375289917, | |
| "learning_rate": 1.0559701492537313e-05, | |
| "loss": 0.1207, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 8.96, | |
| "eval_accuracy": 0.8558875219683656, | |
| "eval_loss": 0.5324714779853821, | |
| "eval_runtime": 7.9022, | |
| "eval_samples_per_second": 72.006, | |
| "eval_steps_per_second": 9.111, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 8.99, | |
| "grad_norm": 3.989713430404663, | |
| "learning_rate": 1.0186567164179105e-05, | |
| "loss": 0.2336, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 9.03, | |
| "grad_norm": 5.590869903564453, | |
| "learning_rate": 9.813432835820897e-06, | |
| "loss": 0.2292, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 9.07, | |
| "grad_norm": 3.405966281890869, | |
| "learning_rate": 9.440298507462688e-06, | |
| "loss": 0.1654, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 9.1, | |
| "grad_norm": 3.733381986618042, | |
| "learning_rate": 9.067164179104478e-06, | |
| "loss": 0.2104, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 9.14, | |
| "grad_norm": 0.1994183361530304, | |
| "learning_rate": 8.694029850746268e-06, | |
| "loss": 0.0789, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 9.18, | |
| "grad_norm": 7.948019504547119, | |
| "learning_rate": 8.32089552238806e-06, | |
| "loss": 0.3335, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 9.22, | |
| "grad_norm": 3.020522117614746, | |
| "learning_rate": 7.947761194029851e-06, | |
| "loss": 0.1838, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 9.25, | |
| "grad_norm": 2.4797592163085938, | |
| "learning_rate": 7.574626865671643e-06, | |
| "loss": 0.1573, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 9.29, | |
| "grad_norm": 0.7854322195053101, | |
| "learning_rate": 7.201492537313433e-06, | |
| "loss": 0.1868, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 9.33, | |
| "grad_norm": 8.424530982971191, | |
| "learning_rate": 6.828358208955224e-06, | |
| "loss": 0.1806, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 9.33, | |
| "eval_accuracy": 0.8646748681898067, | |
| "eval_loss": 0.5149648785591125, | |
| "eval_runtime": 7.8422, | |
| "eval_samples_per_second": 72.556, | |
| "eval_steps_per_second": 9.181, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 9.37, | |
| "grad_norm": 2.9176523685455322, | |
| "learning_rate": 6.455223880597015e-06, | |
| "loss": 0.1977, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 9.4, | |
| "grad_norm": 4.15384578704834, | |
| "learning_rate": 6.082089552238806e-06, | |
| "loss": 0.2007, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 9.44, | |
| "grad_norm": 2.4758641719818115, | |
| "learning_rate": 5.708955223880597e-06, | |
| "loss": 0.2, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 9.48, | |
| "grad_norm": 4.053123950958252, | |
| "learning_rate": 5.335820895522389e-06, | |
| "loss": 0.2514, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 9.51, | |
| "grad_norm": 2.3916337490081787, | |
| "learning_rate": 4.9626865671641796e-06, | |
| "loss": 0.2104, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 9.55, | |
| "grad_norm": 4.113661766052246, | |
| "learning_rate": 4.58955223880597e-06, | |
| "loss": 0.1998, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 9.59, | |
| "grad_norm": 3.558722972869873, | |
| "learning_rate": 4.216417910447761e-06, | |
| "loss": 0.144, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 9.63, | |
| "grad_norm": 2.689765691757202, | |
| "learning_rate": 3.843283582089553e-06, | |
| "loss": 0.1691, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 9.66, | |
| "grad_norm": 4.95484733581543, | |
| "learning_rate": 3.4701492537313434e-06, | |
| "loss": 0.1875, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 9.7, | |
| "grad_norm": 6.025635242462158, | |
| "learning_rate": 3.0970149253731345e-06, | |
| "loss": 0.1793, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 9.7, | |
| "eval_accuracy": 0.8664323374340949, | |
| "eval_loss": 0.5153330564498901, | |
| "eval_runtime": 7.9144, | |
| "eval_samples_per_second": 71.894, | |
| "eval_steps_per_second": 9.097, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 9.74, | |
| "grad_norm": 0.3092793822288513, | |
| "learning_rate": 2.7238805970149257e-06, | |
| "loss": 0.1385, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 9.78, | |
| "grad_norm": 1.1317028999328613, | |
| "learning_rate": 2.3507462686567164e-06, | |
| "loss": 0.1628, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 9.81, | |
| "grad_norm": 7.642726898193359, | |
| "learning_rate": 1.9776119402985076e-06, | |
| "loss": 0.2142, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 9.85, | |
| "grad_norm": 4.3891191482543945, | |
| "learning_rate": 1.6044776119402985e-06, | |
| "loss": 0.2115, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 9.89, | |
| "grad_norm": 5.876834869384766, | |
| "learning_rate": 1.2313432835820897e-06, | |
| "loss": 0.2859, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 9.93, | |
| "grad_norm": 1.6104581356048584, | |
| "learning_rate": 8.582089552238806e-07, | |
| "loss": 0.2752, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 9.96, | |
| "grad_norm": 5.835386276245117, | |
| "learning_rate": 4.850746268656717e-07, | |
| "loss": 0.2057, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 7.006475925445557, | |
| "learning_rate": 1.119402985074627e-07, | |
| "loss": 0.2098, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "step": 2680, | |
| "total_flos": 3.3230947683690086e+18, | |
| "train_loss": 0.45543073504718384, | |
| "train_runtime": 1353.2313, | |
| "train_samples_per_second": 31.687, | |
| "train_steps_per_second": 1.98 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2680, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 100, | |
| "total_flos": 3.3230947683690086e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |