{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 30,
  "global_step": 855,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03508771929824561,
      "grad_norm": 4.878787040710449,
      "learning_rate": 2.0930232558139536e-05,
      "loss": 2.4934,
      "step": 10
    },
    {
      "epoch": 0.07017543859649122,
      "grad_norm": 0.776142418384552,
      "learning_rate": 4.418604651162791e-05,
      "loss": 0.9685,
      "step": 20
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 0.7527797818183899,
      "learning_rate": 6.744186046511628e-05,
      "loss": 0.5823,
      "step": 30
    },
    {
      "epoch": 0.10526315789473684,
      "eval_loss": 0.45968249440193176,
      "eval_runtime": 25.548,
      "eval_samples_per_second": 4.697,
      "eval_steps_per_second": 4.697,
      "step": 30
    },
    {
      "epoch": 0.14035087719298245,
      "grad_norm": 0.5810465812683105,
      "learning_rate": 9.069767441860465e-05,
      "loss": 0.4143,
      "step": 40
    },
    {
      "epoch": 0.17543859649122806,
      "grad_norm": 0.6364761590957642,
      "learning_rate": 0.00011395348837209304,
      "loss": 0.3,
      "step": 50
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 0.7183641791343689,
      "learning_rate": 0.0001372093023255814,
      "loss": 0.2551,
      "step": 60
    },
    {
      "epoch": 0.21052631578947367,
      "eval_loss": 0.21707303822040558,
      "eval_runtime": 24.8756,
      "eval_samples_per_second": 4.824,
      "eval_steps_per_second": 4.824,
      "step": 60
    },
    {
      "epoch": 0.24561403508771928,
      "grad_norm": 0.4606807231903076,
      "learning_rate": 0.00016046511627906978,
      "loss": 0.223,
      "step": 70
    },
    {
      "epoch": 0.2807017543859649,
      "grad_norm": 0.5203229784965515,
      "learning_rate": 0.00018372093023255815,
      "loss": 0.1933,
      "step": 80
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 0.44623491168022156,
      "learning_rate": 0.0001999924897543333,
      "loss": 0.1902,
      "step": 90
    },
    {
      "epoch": 0.3157894736842105,
      "eval_loss": 0.1671793907880783,
      "eval_runtime": 25.9688,
      "eval_samples_per_second": 4.621,
      "eval_steps_per_second": 4.621,
      "step": 90
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 0.5045297145843506,
      "learning_rate": 0.000199859005655354,
      "loss": 0.1724,
      "step": 100
    },
    {
      "epoch": 0.38596491228070173,
      "grad_norm": 0.2889767289161682,
      "learning_rate": 0.00019955888361169273,
      "loss": 0.1666,
      "step": 110
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 0.4197916090488434,
      "learning_rate": 0.00019909262444668715,
      "loss": 0.1611,
      "step": 120
    },
    {
      "epoch": 0.42105263157894735,
      "eval_loss": 0.13614678382873535,
      "eval_runtime": 25.0008,
      "eval_samples_per_second": 4.8,
      "eval_steps_per_second": 4.8,
      "step": 120
    },
    {
      "epoch": 0.45614035087719296,
      "grad_norm": 0.3050219714641571,
      "learning_rate": 0.00019846100622204974,
      "loss": 0.1593,
      "step": 130
    },
    {
      "epoch": 0.49122807017543857,
      "grad_norm": 0.24451592564582825,
      "learning_rate": 0.00019766508293949108,
      "loss": 0.1594,
      "step": 140
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 0.29427045583724976,
      "learning_rate": 0.00019670618278187318,
      "loss": 0.1424,
      "step": 150
    },
    {
      "epoch": 0.5263157894736842,
      "eval_loss": 0.13314871490001678,
      "eval_runtime": 25.9586,
      "eval_samples_per_second": 4.623,
      "eval_steps_per_second": 4.623,
      "step": 150
    },
    {
      "epoch": 0.5614035087719298,
      "grad_norm": 0.36817243695259094,
      "learning_rate": 0.00019558590589682795,
      "loss": 0.1383,
      "step": 160
    },
    {
      "epoch": 0.5964912280701754,
      "grad_norm": 0.16424335539340973,
      "learning_rate": 0.00019430612172653908,
      "loss": 0.1447,
      "step": 170
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 0.17733046412467957,
      "learning_rate": 0.00019286896588814373,
      "loss": 0.1335,
      "step": 180
    },
    {
      "epoch": 0.631578947368421,
      "eval_loss": 0.13173671066761017,
      "eval_runtime": 25.9972,
      "eval_samples_per_second": 4.616,
      "eval_steps_per_second": 4.616,
      "step": 180
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.22004777193069458,
      "learning_rate": 0.00019127683660995914,
      "loss": 0.1457,
      "step": 190
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 0.2817850112915039,
      "learning_rate": 0.00018953239072948182,
      "loss": 0.1304,
      "step": 200
    },
    {
      "epoch": 0.7368421052631579,
      "grad_norm": 0.22587062418460846,
      "learning_rate": 0.00018763853925983693,
      "loss": 0.1426,
      "step": 210
    },
    {
      "epoch": 0.7368421052631579,
      "eval_loss": 0.12666551768779755,
      "eval_runtime": 25.8854,
      "eval_samples_per_second": 4.636,
      "eval_steps_per_second": 4.636,
      "step": 210
    },
    {
      "epoch": 0.7719298245614035,
      "grad_norm": 0.1741916984319687,
      "learning_rate": 0.0001855984425320769,
      "loss": 0.1404,
      "step": 220
    },
    {
      "epoch": 0.8070175438596491,
      "grad_norm": 0.14528629183769226,
      "learning_rate": 0.00018341550492143496,
      "loss": 0.1486,
      "step": 230
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 0.26741355657577515,
      "learning_rate": 0.00018109336916633426,
      "loss": 0.1281,
      "step": 240
    },
    {
      "epoch": 0.8421052631578947,
      "eval_loss": 0.12144554406404495,
      "eval_runtime": 24.7052,
      "eval_samples_per_second": 4.857,
      "eval_steps_per_second": 4.857,
      "step": 240
    },
    {
      "epoch": 0.8771929824561403,
      "grad_norm": 0.12268830090761185,
      "learning_rate": 0.000178635910289633,
      "loss": 0.1281,
      "step": 250
    },
    {
      "epoch": 0.9122807017543859,
      "grad_norm": 0.2916463613510132,
      "learning_rate": 0.00017604722913224842,
      "loss": 0.1348,
      "step": 260
    },
    {
      "epoch": 0.9473684210526315,
      "grad_norm": 0.13800819218158722,
      "learning_rate": 0.00017333164550995152,
      "loss": 0.1258,
      "step": 270
    },
    {
      "epoch": 0.9473684210526315,
      "eval_loss": 0.11921060085296631,
      "eval_runtime": 24.7901,
      "eval_samples_per_second": 4.841,
      "eval_steps_per_second": 4.841,
      "step": 270
    },
    {
      "epoch": 0.9824561403508771,
      "grad_norm": 0.11254072189331055,
      "learning_rate": 0.000170493691004751,
      "loss": 0.1304,
      "step": 280
    },
    {
      "epoch": 1.0175438596491229,
      "grad_norm": 0.14597080647945404,
      "learning_rate": 0.00016753810140289607,
      "loss": 0.1288,
      "step": 290
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 0.12981514632701874,
      "learning_rate": 0.0001644698087921173,
      "loss": 0.1307,
      "step": 300
    },
    {
      "epoch": 1.0526315789473684,
      "eval_loss": 0.1170278862118721,
      "eval_runtime": 24.9635,
      "eval_samples_per_second": 4.807,
      "eval_steps_per_second": 4.807,
      "step": 300
    },
    {
      "epoch": 1.087719298245614,
      "grad_norm": 0.10934378206729889,
      "learning_rate": 0.0001612939333312926,
      "loss": 0.1111,
      "step": 310
    },
    {
      "epoch": 1.1228070175438596,
      "grad_norm": 0.1015116274356842,
      "learning_rate": 0.00015801577470627286,
      "loss": 0.1288,
      "step": 320
    },
    {
      "epoch": 1.1578947368421053,
      "grad_norm": 0.1372503787279129,
      "learning_rate": 0.0001546408032861252,
      "loss": 0.116,
      "step": 330
    },
    {
      "epoch": 1.1578947368421053,
      "eval_loss": 0.11487225443124771,
      "eval_runtime": 24.9694,
      "eval_samples_per_second": 4.806,
      "eval_steps_per_second": 4.806,
      "step": 330
    },
    {
      "epoch": 1.1929824561403508,
      "grad_norm": 0.2865680456161499,
      "learning_rate": 0.00015117465099455174,
      "loss": 0.1237,
      "step": 340
    },
    {
      "epoch": 1.2280701754385965,
      "grad_norm": 0.12083390355110168,
      "learning_rate": 0.00014762310191171655,
      "loss": 0.1305,
      "step": 350
    },
    {
      "epoch": 1.263157894736842,
      "grad_norm": 0.09869180619716644,
      "learning_rate": 0.00014399208262216474,
      "loss": 0.1167,
      "step": 360
    },
    {
      "epoch": 1.263157894736842,
      "eval_loss": 0.11612139642238617,
      "eval_runtime": 24.9809,
      "eval_samples_per_second": 4.804,
      "eval_steps_per_second": 4.804,
      "step": 360
    },
    {
      "epoch": 1.2982456140350878,
      "grad_norm": 0.10779603570699692,
      "learning_rate": 0.0001402876523249394,
      "loss": 0.1325,
      "step": 370
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.20813943445682526,
      "learning_rate": 0.00013651599272240076,
      "loss": 0.1203,
      "step": 380
    },
    {
      "epoch": 1.368421052631579,
      "grad_norm": 0.10286935418844223,
      "learning_rate": 0.0001326833977046199,
      "loss": 0.1198,
      "step": 390
    },
    {
      "epoch": 1.368421052631579,
      "eval_loss": 0.1140877828001976,
      "eval_runtime": 25.1216,
      "eval_samples_per_second": 4.777,
      "eval_steps_per_second": 4.777,
      "step": 390
    },
    {
      "epoch": 1.4035087719298245,
      "grad_norm": 0.10826771706342697,
      "learning_rate": 0.00012879626284656142,
      "loss": 0.1215,
      "step": 400
    },
    {
      "epoch": 1.4385964912280702,
      "grad_norm": 0.09107205271720886,
      "learning_rate": 0.00012486107473558118,
      "loss": 0.1258,
      "step": 410
    },
    {
      "epoch": 1.4736842105263157,
      "grad_norm": 0.23670166730880737,
      "learning_rate": 0.00012088440014704858,
      "loss": 0.1194,
      "step": 420
    },
    {
      "epoch": 1.4736842105263157,
      "eval_loss": 0.11560888588428497,
      "eval_runtime": 25.1077,
      "eval_samples_per_second": 4.779,
      "eval_steps_per_second": 4.779,
      "step": 420
    },
    {
      "epoch": 1.5087719298245614,
      "grad_norm": 0.09810595214366913,
      "learning_rate": 0.0001168728750861567,
      "loss": 0.1282,
      "step": 430
    },
    {
      "epoch": 1.543859649122807,
      "grad_norm": 0.11235892027616501,
      "learning_rate": 0.0001128331937142062,
      "loss": 0.1234,
      "step": 440
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 0.09267847239971161,
      "learning_rate": 0.0001087720971778426,
      "loss": 0.1292,
      "step": 450
    },
    {
      "epoch": 1.5789473684210527,
      "eval_loss": 0.11425888538360596,
      "eval_runtime": 25.0231,
      "eval_samples_per_second": 4.796,
      "eval_steps_per_second": 4.796,
      "step": 450
    },
    {
      "epoch": 1.6140350877192984,
      "grad_norm": 0.12776316702365875,
      "learning_rate": 0.0001046963623598871,
      "loss": 0.12,
      "step": 460
    },
    {
      "epoch": 1.6491228070175439,
      "grad_norm": 0.12537536025047302,
      "learning_rate": 0.00010061279057053386,
      "loss": 0.1231,
      "step": 470
    },
    {
      "epoch": 1.6842105263157894,
      "grad_norm": 0.11115119606256485,
      "learning_rate": 9.652819619778386e-05,
      "loss": 0.1262,
      "step": 480
    },
    {
      "epoch": 1.6842105263157894,
      "eval_loss": 0.11288858950138092,
      "eval_runtime": 24.9273,
      "eval_samples_per_second": 4.814,
      "eval_steps_per_second": 4.814,
      "step": 480
    },
    {
      "epoch": 1.719298245614035,
      "grad_norm": 0.09119213372468948,
      "learning_rate": 9.244939533605619e-05,
      "loss": 0.1233,
      "step": 490
    },
    {
      "epoch": 1.7543859649122808,
      "grad_norm": 0.12485823780298233,
      "learning_rate": 8.838319441195105e-05,
      "loss": 0.1212,
      "step": 500
    },
    {
      "epoch": 1.7894736842105263,
      "grad_norm": 0.10320759564638138,
      "learning_rate": 8.433637882614625e-05,
      "loss": 0.1298,
      "step": 510
    },
    {
      "epoch": 1.7894736842105263,
      "eval_loss": 0.11233663558959961,
      "eval_runtime": 24.9513,
      "eval_samples_per_second": 4.809,
      "eval_steps_per_second": 4.809,
      "step": 510
    },
    {
      "epoch": 1.8245614035087718,
      "grad_norm": 0.115440234541893,
      "learning_rate": 8.031570163038005e-05,
      "loss": 0.1298,
      "step": 520
    },
    {
      "epoch": 1.8596491228070176,
      "grad_norm": 0.09397278726100922,
      "learning_rate": 7.632787225841593e-05,
      "loss": 0.1258,
      "step": 530
    },
    {
      "epoch": 1.8947368421052633,
      "grad_norm": 0.08502210676670074,
      "learning_rate": 7.237954532979401e-05,
      "loss": 0.119,
      "step": 540
    },
    {
      "epoch": 1.8947368421052633,
      "eval_loss": 0.11294586211442947,
      "eval_runtime": 25.0566,
      "eval_samples_per_second": 4.789,
      "eval_steps_per_second": 4.789,
      "step": 540
    },
    {
      "epoch": 1.9298245614035088,
      "grad_norm": 0.09859590232372284,
      "learning_rate": 6.847730954505261e-05,
      "loss": 0.1222,
      "step": 550
    },
    {
      "epoch": 1.9649122807017543,
      "grad_norm": 0.07687846571207047,
      "learning_rate": 6.462767669095109e-05,
      "loss": 0.1143,
      "step": 560
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.09160079807043076,
      "learning_rate": 6.0837070774041284e-05,
      "loss": 0.1227,
      "step": 570
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.11345366388559341,
      "eval_runtime": 25.1502,
      "eval_samples_per_second": 4.771,
      "eval_steps_per_second": 4.771,
      "step": 570
    },
    {
      "epoch": 2.0350877192982457,
      "grad_norm": 0.08900679647922516,
      "learning_rate": 5.711181730072044e-05,
      "loss": 0.1068,
      "step": 580
    },
    {
      "epoch": 2.0701754385964914,
      "grad_norm": 0.11684390157461166,
      "learning_rate": 5.3458132721654564e-05,
      "loss": 0.122,
      "step": 590
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 0.11595682054758072,
      "learning_rate": 4.9882114058186616e-05,
      "loss": 0.1493,
      "step": 600
    },
    {
      "epoch": 2.1052631578947367,
      "eval_loss": 0.11257700622081757,
      "eval_runtime": 24.9265,
      "eval_samples_per_second": 4.814,
      "eval_steps_per_second": 4.814,
      "step": 600
    },
    {
      "epoch": 2.1403508771929824,
      "grad_norm": 0.10183674842119217,
      "learning_rate": 4.638972872804038e-05,
      "loss": 0.1168,
      "step": 610
    },
    {
      "epoch": 2.175438596491228,
      "grad_norm": 0.10958362370729446,
      "learning_rate": 4.298680458729792e-05,
      "loss": 0.1206,
      "step": 620
    },
    {
      "epoch": 2.2105263157894735,
      "grad_norm": 0.09745143353939056,
      "learning_rate": 3.967902020526797e-05,
      "loss": 0.1272,
      "step": 630
    },
    {
      "epoch": 2.2105263157894735,
      "eval_loss": 0.11299394816160202,
      "eval_runtime": 25.0837,
      "eval_samples_per_second": 4.784,
      "eval_steps_per_second": 4.784,
      "step": 630
    },
    {
      "epoch": 2.245614035087719,
      "grad_norm": 0.08707338571548462,
      "learning_rate": 3.647189538847432e-05,
      "loss": 0.1212,
      "step": 640
    },
    {
      "epoch": 2.280701754385965,
      "grad_norm": 0.09840196371078491,
      "learning_rate": 3.337078196957647e-05,
      "loss": 0.1137,
      "step": 650
    },
    {
      "epoch": 2.3157894736842106,
      "grad_norm": 0.10066810250282288,
      "learning_rate": 3.0380854876593723e-05,
      "loss": 0.1147,
      "step": 660
    },
    {
      "epoch": 2.3157894736842106,
      "eval_loss": 0.11169523745775223,
      "eval_runtime": 24.9457,
      "eval_samples_per_second": 4.81,
      "eval_steps_per_second": 4.81,
      "step": 660
    },
    {
      "epoch": 2.3508771929824563,
      "grad_norm": 0.11805769056081772,
      "learning_rate": 2.7507103497336016e-05,
      "loss": 0.124,
      "step": 670
    },
    {
      "epoch": 2.3859649122807016,
      "grad_norm": 0.09834598004817963,
      "learning_rate": 2.475432335345128e-05,
      "loss": 0.1269,
      "step": 680
    },
    {
      "epoch": 2.4210526315789473,
      "grad_norm": 0.11241532862186432,
      "learning_rate": 2.212710809798393e-05,
      "loss": 0.1137,
      "step": 690
    },
    {
      "epoch": 2.4210526315789473,
      "eval_loss": 0.11132458597421646,
      "eval_runtime": 24.998,
      "eval_samples_per_second": 4.8,
      "eval_steps_per_second": 4.8,
      "step": 690
    },
    {
      "epoch": 2.456140350877193,
      "grad_norm": 0.10056956857442856,
      "learning_rate": 1.9629841849797736e-05,
      "loss": 0.1151,
      "step": 700
    },
    {
      "epoch": 2.4912280701754383,
      "grad_norm": 0.09482597559690475,
      "learning_rate": 1.7266691877655127e-05,
      "loss": 0.1091,
      "step": 710
    },
    {
      "epoch": 2.526315789473684,
      "grad_norm": 0.11021561920642853,
      "learning_rate": 1.5041601646161585e-05,
      "loss": 0.1225,
      "step": 720
    },
    {
      "epoch": 2.526315789473684,
      "eval_loss": 0.11140093952417374,
      "eval_runtime": 25.0288,
      "eval_samples_per_second": 4.794,
      "eval_steps_per_second": 4.794,
      "step": 720
    },
    {
      "epoch": 2.56140350877193,
      "grad_norm": 0.09573051333427429,
      "learning_rate": 1.295828423517874e-05,
      "loss": 0.1124,
      "step": 730
    },
    {
      "epoch": 2.5964912280701755,
      "grad_norm": 0.1159501001238823,
      "learning_rate": 1.1020216143688444e-05,
      "loss": 0.1181,
      "step": 740
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 0.09099686145782471,
      "learning_rate": 9.230631488446639e-06,
      "loss": 0.1185,
      "step": 750
    },
    {
      "epoch": 2.6315789473684212,
      "eval_loss": 0.11138518154621124,
      "eval_runtime": 24.9674,
      "eval_samples_per_second": 4.806,
      "eval_steps_per_second": 4.806,
      "step": 750
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.10761657357215881,
      "learning_rate": 7.592516607108324e-06,
      "loss": 0.1131,
      "step": 760
    },
    {
      "epoch": 2.7017543859649122,
      "grad_norm": 0.11243995279073715,
      "learning_rate": 6.108605074829709e-06,
      "loss": 0.1178,
      "step": 770
    },
    {
      "epoch": 2.736842105263158,
      "grad_norm": 0.07883230596780777,
      "learning_rate": 4.781373142663004e-06,
      "loss": 0.1095,
      "step": 780
    },
    {
      "epoch": 2.736842105263158,
      "eval_loss": 0.11120390146970749,
      "eval_runtime": 24.9973,
      "eval_samples_per_second": 4.801,
      "eval_steps_per_second": 4.801,
      "step": 780
    },
    {
      "epoch": 2.7719298245614032,
      "grad_norm": 0.11856161803007126,
      "learning_rate": 3.613035605356463e-06,
      "loss": 0.1148,
      "step": 790
    },
    {
      "epoch": 2.807017543859649,
      "grad_norm": 0.08602704107761383,
      "learning_rate": 2.6055421054549613e-06,
      "loss": 0.1095,
      "step": 800
    },
    {
      "epoch": 2.8421052631578947,
      "grad_norm": 0.0988084003329277,
      "learning_rate": 1.7605738798684768e-06,
      "loss": 0.1132,
      "step": 810
    },
    {
      "epoch": 2.8421052631578947,
      "eval_loss": 0.11113225668668747,
      "eval_runtime": 24.9096,
      "eval_samples_per_second": 4.817,
      "eval_steps_per_second": 4.817,
      "step": 810
    },
    {
      "epoch": 2.8771929824561404,
      "grad_norm": 0.10732480138540268,
      "learning_rate": 1.0795409543379097e-06,
      "loss": 0.1219,
      "step": 820
    },
    {
      "epoch": 2.912280701754386,
      "grad_norm": 0.11461273580789566,
      "learning_rate": 5.635797904795847e-07,
      "loss": 0.1167,
      "step": 830
    },
    {
      "epoch": 2.9473684210526314,
      "grad_norm": 0.1043296605348587,
      "learning_rate": 2.13551389335076e-07,
      "loss": 0.115,
      "step": 840
    },
    {
      "epoch": 2.9473684210526314,
      "eval_loss": 0.11119555681943893,
      "eval_runtime": 25.0035,
      "eval_samples_per_second": 4.799,
      "eval_steps_per_second": 4.799,
      "step": 840
    },
    {
      "epoch": 2.982456140350877,
      "grad_norm": 0.11734732985496521,
      "learning_rate": 3.003985459101299e-08,
      "loss": 0.1228,
      "step": 850
    }
  ],
  "logging_steps": 10,
  "max_steps": 855,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1985551891035546e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}