{
  "best_metric": 2.3842289447784424,
  "best_model_checkpoint": "./results/checkpoint-720",
  "epoch": 3.977900552486188,
  "eval_steps": 10,
  "global_step": 720,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.055248618784530384,
      "grad_norm": 3.8338866233825684,
      "learning_rate": 4.972375690607735e-05,
      "loss": 5.1215,
      "step": 10
    },
    {
      "epoch": 0.055248618784530384,
      "eval_loss": 4.150951862335205,
      "eval_runtime": 107.472,
      "eval_samples_per_second": 11.947,
      "eval_steps_per_second": 0.195,
      "step": 10
    },
    {
      "epoch": 0.11049723756906077,
      "grad_norm": 1.0968620777130127,
      "learning_rate": 4.94475138121547e-05,
      "loss": 4.172,
      "step": 20
    },
    {
      "epoch": 0.11049723756906077,
      "eval_loss": 4.160157203674316,
      "eval_runtime": 108.2249,
      "eval_samples_per_second": 11.864,
      "eval_steps_per_second": 0.194,
      "step": 20
    },
    {
      "epoch": 0.16574585635359115,
      "grad_norm": 9.813711166381836,
      "learning_rate": 4.9171270718232046e-05,
      "loss": 4.1638,
      "step": 30
    },
    {
      "epoch": 0.16574585635359115,
      "eval_loss": 4.136767387390137,
      "eval_runtime": 106.48,
      "eval_samples_per_second": 12.059,
      "eval_steps_per_second": 0.197,
      "step": 30
    },
    {
      "epoch": 0.22099447513812154,
      "grad_norm": 1.0513194799423218,
      "learning_rate": 4.8895027624309394e-05,
      "loss": 4.1615,
      "step": 40
    },
    {
      "epoch": 0.22099447513812154,
      "eval_loss": 4.142675399780273,
      "eval_runtime": 108.3224,
      "eval_samples_per_second": 11.853,
      "eval_steps_per_second": 0.194,
      "step": 40
    },
    {
      "epoch": 0.27624309392265195,
      "grad_norm": 2.283129930496216,
      "learning_rate": 4.861878453038674e-05,
      "loss": 4.1346,
      "step": 50
    },
    {
      "epoch": 0.27624309392265195,
      "eval_loss": 4.11836576461792,
      "eval_runtime": 107.0752,
      "eval_samples_per_second": 11.992,
      "eval_steps_per_second": 0.196,
      "step": 50
    },
    {
      "epoch": 0.3314917127071823,
      "grad_norm": 2.6535284519195557,
      "learning_rate": 4.834254143646409e-05,
      "loss": 4.1412,
      "step": 60
    },
    {
      "epoch": 0.3314917127071823,
      "eval_loss": 4.119892597198486,
      "eval_runtime": 108.68,
      "eval_samples_per_second": 11.814,
      "eval_steps_per_second": 0.193,
      "step": 60
    },
    {
      "epoch": 0.3867403314917127,
      "grad_norm": 2.246750831604004,
      "learning_rate": 4.806629834254144e-05,
      "loss": 4.2136,
      "step": 70
    },
    {
      "epoch": 0.3867403314917127,
      "eval_loss": 4.110352516174316,
      "eval_runtime": 109.321,
      "eval_samples_per_second": 11.745,
      "eval_steps_per_second": 0.192,
      "step": 70
    },
    {
      "epoch": 0.4419889502762431,
      "grad_norm": 5.01633358001709,
      "learning_rate": 4.7790055248618785e-05,
      "loss": 4.1121,
      "step": 80
    },
    {
      "epoch": 0.4419889502762431,
      "eval_loss": 4.114729404449463,
      "eval_runtime": 108.7219,
      "eval_samples_per_second": 11.81,
      "eval_steps_per_second": 0.193,
      "step": 80
    },
    {
      "epoch": 0.4972375690607735,
      "grad_norm": 6.723274230957031,
      "learning_rate": 4.751381215469613e-05,
      "loss": 4.0936,
      "step": 90
    },
    {
      "epoch": 0.4972375690607735,
      "eval_loss": 4.197766304016113,
      "eval_runtime": 109.473,
      "eval_samples_per_second": 11.729,
      "eval_steps_per_second": 0.192,
      "step": 90
    },
    {
      "epoch": 0.5524861878453039,
      "grad_norm": 2.4457666873931885,
      "learning_rate": 4.723756906077349e-05,
      "loss": 4.1228,
      "step": 100
    },
    {
      "epoch": 0.5524861878453039,
      "eval_loss": 4.094923496246338,
      "eval_runtime": 109.8167,
      "eval_samples_per_second": 11.692,
      "eval_steps_per_second": 0.191,
      "step": 100
    },
    {
      "epoch": 0.6077348066298343,
      "grad_norm": 2.2743124961853027,
      "learning_rate": 4.6961325966850835e-05,
      "loss": 4.0925,
      "step": 110
    },
    {
      "epoch": 0.6077348066298343,
      "eval_loss": 4.078197479248047,
      "eval_runtime": 110.7223,
      "eval_samples_per_second": 11.597,
      "eval_steps_per_second": 0.19,
      "step": 110
    },
    {
      "epoch": 0.6629834254143646,
      "grad_norm": 3.141187906265259,
      "learning_rate": 4.6685082872928176e-05,
      "loss": 4.0646,
      "step": 120
    },
    {
      "epoch": 0.6629834254143646,
      "eval_loss": 4.101787090301514,
      "eval_runtime": 108.3177,
      "eval_samples_per_second": 11.854,
      "eval_steps_per_second": 0.194,
      "step": 120
    },
    {
      "epoch": 0.7182320441988951,
      "grad_norm": 7.580991744995117,
      "learning_rate": 4.6408839779005524e-05,
      "loss": 4.0936,
      "step": 130
    },
    {
      "epoch": 0.7182320441988951,
      "eval_loss": 4.074957847595215,
      "eval_runtime": 109.0627,
      "eval_samples_per_second": 11.773,
      "eval_steps_per_second": 0.193,
      "step": 130
    },
    {
      "epoch": 0.7734806629834254,
      "grad_norm": 5.120913028717041,
      "learning_rate": 4.613259668508287e-05,
      "loss": 4.0563,
      "step": 140
    },
    {
      "epoch": 0.7734806629834254,
      "eval_loss": 4.028809070587158,
      "eval_runtime": 108.4672,
      "eval_samples_per_second": 11.838,
      "eval_steps_per_second": 0.194,
      "step": 140
    },
    {
      "epoch": 0.8287292817679558,
      "grad_norm": 6.340137004852295,
      "learning_rate": 4.585635359116022e-05,
      "loss": 4.0016,
      "step": 150
    },
    {
      "epoch": 0.8287292817679558,
      "eval_loss": 4.02580451965332,
      "eval_runtime": 108.9318,
      "eval_samples_per_second": 11.787,
      "eval_steps_per_second": 0.193,
      "step": 150
    },
    {
      "epoch": 0.8839779005524862,
      "grad_norm": 3.361588954925537,
      "learning_rate": 4.5580110497237574e-05,
      "loss": 4.0029,
      "step": 160
    },
    {
      "epoch": 0.8839779005524862,
      "eval_loss": 4.001326560974121,
      "eval_runtime": 110.4105,
      "eval_samples_per_second": 11.629,
      "eval_steps_per_second": 0.19,
      "step": 160
    },
    {
      "epoch": 0.9392265193370166,
      "grad_norm": 10.81383991241455,
      "learning_rate": 4.530386740331492e-05,
      "loss": 4.0072,
      "step": 170
    },
    {
      "epoch": 0.9392265193370166,
      "eval_loss": 4.006492614746094,
      "eval_runtime": 110.8042,
      "eval_samples_per_second": 11.588,
      "eval_steps_per_second": 0.19,
      "step": 170
    },
    {
      "epoch": 0.994475138121547,
      "grad_norm": 4.824268817901611,
      "learning_rate": 4.502762430939227e-05,
      "loss": 4.0382,
      "step": 180
    },
    {
      "epoch": 0.994475138121547,
      "eval_loss": 4.0064873695373535,
      "eval_runtime": 109.8857,
      "eval_samples_per_second": 11.685,
      "eval_steps_per_second": 0.191,
      "step": 180
    },
    {
      "epoch": 1.0497237569060773,
      "grad_norm": 6.9584760665893555,
      "learning_rate": 4.475138121546962e-05,
      "loss": 3.8569,
      "step": 190
    },
    {
      "epoch": 1.0497237569060773,
      "eval_loss": 3.9973037242889404,
      "eval_runtime": 111.7743,
      "eval_samples_per_second": 11.487,
      "eval_steps_per_second": 0.188,
      "step": 190
    },
    {
      "epoch": 1.1049723756906078,
      "grad_norm": 10.90897274017334,
      "learning_rate": 4.447513812154696e-05,
      "loss": 3.9025,
      "step": 200
    },
    {
      "epoch": 1.1049723756906078,
      "eval_loss": 3.920397996902466,
      "eval_runtime": 110.0718,
      "eval_samples_per_second": 11.665,
      "eval_steps_per_second": 0.191,
      "step": 200
    },
    {
      "epoch": 1.160220994475138,
      "grad_norm": 6.581048011779785,
      "learning_rate": 4.419889502762431e-05,
      "loss": 3.9265,
      "step": 210
    },
    {
      "epoch": 1.160220994475138,
      "eval_loss": 3.9369547367095947,
      "eval_runtime": 108.7006,
      "eval_samples_per_second": 11.812,
      "eval_steps_per_second": 0.193,
      "step": 210
    },
    {
      "epoch": 1.2154696132596685,
      "grad_norm": 7.869502544403076,
      "learning_rate": 4.392265193370166e-05,
      "loss": 3.8242,
      "step": 220
    },
    {
      "epoch": 1.2154696132596685,
      "eval_loss": 3.932981491088867,
      "eval_runtime": 110.8711,
      "eval_samples_per_second": 11.581,
      "eval_steps_per_second": 0.189,
      "step": 220
    },
    {
      "epoch": 1.270718232044199,
      "grad_norm": 6.994544982910156,
      "learning_rate": 4.364640883977901e-05,
      "loss": 3.8785,
      "step": 230
    },
    {
      "epoch": 1.270718232044199,
      "eval_loss": 3.914726495742798,
      "eval_runtime": 108.3935,
      "eval_samples_per_second": 11.846,
      "eval_steps_per_second": 0.194,
      "step": 230
    },
    {
      "epoch": 1.3259668508287292,
      "grad_norm": 9.24843978881836,
      "learning_rate": 4.337016574585636e-05,
      "loss": 3.8126,
      "step": 240
    },
    {
      "epoch": 1.3259668508287292,
      "eval_loss": 3.878098487854004,
      "eval_runtime": 108.7329,
      "eval_samples_per_second": 11.809,
      "eval_steps_per_second": 0.193,
      "step": 240
    },
    {
      "epoch": 1.3812154696132597,
      "grad_norm": 7.285367965698242,
      "learning_rate": 4.3093922651933705e-05,
      "loss": 3.8305,
      "step": 250
    },
    {
      "epoch": 1.3812154696132597,
      "eval_loss": 3.9157791137695312,
      "eval_runtime": 109.2735,
      "eval_samples_per_second": 11.75,
      "eval_steps_per_second": 0.192,
      "step": 250
    },
    {
      "epoch": 1.43646408839779,
      "grad_norm": 6.021206378936768,
      "learning_rate": 4.281767955801105e-05,
      "loss": 3.7631,
      "step": 260
    },
    {
      "epoch": 1.43646408839779,
      "eval_loss": 3.818014144897461,
      "eval_runtime": 109.4955,
      "eval_samples_per_second": 11.727,
      "eval_steps_per_second": 0.192,
      "step": 260
    },
    {
      "epoch": 1.4917127071823204,
      "grad_norm": 5.70164680480957,
      "learning_rate": 4.25414364640884e-05,
      "loss": 3.7908,
      "step": 270
    },
    {
      "epoch": 1.4917127071823204,
      "eval_loss": 3.7703230381011963,
      "eval_runtime": 108.0073,
      "eval_samples_per_second": 11.888,
      "eval_steps_per_second": 0.194,
      "step": 270
    },
    {
      "epoch": 1.5469613259668509,
      "grad_norm": 8.903672218322754,
      "learning_rate": 4.226519337016575e-05,
      "loss": 3.7319,
      "step": 280
    },
    {
      "epoch": 1.5469613259668509,
      "eval_loss": 3.813979387283325,
      "eval_runtime": 108.0907,
      "eval_samples_per_second": 11.879,
      "eval_steps_per_second": 0.194,
      "step": 280
    },
    {
      "epoch": 1.6022099447513813,
      "grad_norm": 7.5272111892700195,
      "learning_rate": 4.1988950276243096e-05,
      "loss": 3.765,
      "step": 290
    },
    {
      "epoch": 1.6022099447513813,
      "eval_loss": 3.822376012802124,
      "eval_runtime": 107.6694,
      "eval_samples_per_second": 11.925,
      "eval_steps_per_second": 0.195,
      "step": 290
    },
    {
      "epoch": 1.6574585635359116,
      "grad_norm": 8.640732765197754,
      "learning_rate": 4.1712707182320444e-05,
      "loss": 3.7465,
      "step": 300
    },
    {
      "epoch": 1.6574585635359116,
      "eval_loss": 3.7796106338500977,
      "eval_runtime": 107.2692,
      "eval_samples_per_second": 11.97,
      "eval_steps_per_second": 0.196,
      "step": 300
    },
    {
      "epoch": 1.7127071823204418,
      "grad_norm": 8.788119316101074,
      "learning_rate": 4.143646408839779e-05,
      "loss": 3.6646,
      "step": 310
    },
    {
      "epoch": 1.7127071823204418,
      "eval_loss": 3.711517810821533,
      "eval_runtime": 108.3013,
      "eval_samples_per_second": 11.856,
      "eval_steps_per_second": 0.194,
      "step": 310
    },
    {
      "epoch": 1.7679558011049723,
      "grad_norm": 12.237804412841797,
      "learning_rate": 4.116022099447514e-05,
      "loss": 3.67,
      "step": 320
    },
    {
      "epoch": 1.7679558011049723,
      "eval_loss": 3.686821460723877,
      "eval_runtime": 109.2103,
      "eval_samples_per_second": 11.757,
      "eval_steps_per_second": 0.192,
      "step": 320
    },
    {
      "epoch": 1.8232044198895028,
      "grad_norm": 10.34768295288086,
      "learning_rate": 4.088397790055249e-05,
      "loss": 3.6374,
      "step": 330
    },
    {
      "epoch": 1.8232044198895028,
      "eval_loss": 3.646404504776001,
      "eval_runtime": 108.3039,
      "eval_samples_per_second": 11.856,
      "eval_steps_per_second": 0.194,
      "step": 330
    },
    {
      "epoch": 1.8784530386740332,
      "grad_norm": 10.875980377197266,
      "learning_rate": 4.0607734806629835e-05,
      "loss": 3.5907,
      "step": 340
    },
    {
      "epoch": 1.8784530386740332,
      "eval_loss": 3.631521463394165,
      "eval_runtime": 108.2559,
      "eval_samples_per_second": 11.861,
      "eval_steps_per_second": 0.194,
      "step": 340
    },
    {
      "epoch": 1.9337016574585635,
      "grad_norm": 9.671201705932617,
      "learning_rate": 4.033149171270719e-05,
      "loss": 3.6003,
      "step": 350
    },
    {
      "epoch": 1.9337016574585635,
      "eval_loss": 3.551748037338257,
      "eval_runtime": 108.4367,
      "eval_samples_per_second": 11.841,
      "eval_steps_per_second": 0.194,
      "step": 350
    },
    {
      "epoch": 1.988950276243094,
      "grad_norm": 11.600411415100098,
      "learning_rate": 4.005524861878453e-05,
      "loss": 3.5967,
      "step": 360
    },
    {
      "epoch": 1.988950276243094,
      "eval_loss": 3.5431878566741943,
      "eval_runtime": 107.9821,
      "eval_samples_per_second": 11.891,
      "eval_steps_per_second": 0.194,
      "step": 360
    },
    {
      "epoch": 2.044198895027624,
      "grad_norm": 17.58928680419922,
      "learning_rate": 3.977900552486188e-05,
      "loss": 3.3887,
      "step": 370
    },
    {
      "epoch": 2.044198895027624,
      "eval_loss": 3.5802059173583984,
      "eval_runtime": 107.9756,
      "eval_samples_per_second": 11.892,
      "eval_steps_per_second": 0.194,
      "step": 370
    },
    {
      "epoch": 2.0994475138121547,
      "grad_norm": 12.583136558532715,
      "learning_rate": 3.950276243093923e-05,
      "loss": 3.2413,
      "step": 380
    },
    {
      "epoch": 2.0994475138121547,
      "eval_loss": 3.5067298412323,
      "eval_runtime": 108.2401,
      "eval_samples_per_second": 11.863,
      "eval_steps_per_second": 0.194,
      "step": 380
    },
    {
      "epoch": 2.154696132596685,
      "grad_norm": 13.868547439575195,
      "learning_rate": 3.9226519337016574e-05,
      "loss": 3.0502,
      "step": 390
    },
    {
      "epoch": 2.154696132596685,
      "eval_loss": 3.548964738845825,
      "eval_runtime": 108.3243,
      "eval_samples_per_second": 11.853,
      "eval_steps_per_second": 0.194,
      "step": 390
    },
    {
      "epoch": 2.2099447513812156,
      "grad_norm": 18.013263702392578,
      "learning_rate": 3.895027624309392e-05,
      "loss": 3.2647,
      "step": 400
    },
    {
      "epoch": 2.2099447513812156,
      "eval_loss": 3.41357159614563,
      "eval_runtime": 108.3964,
      "eval_samples_per_second": 11.845,
      "eval_steps_per_second": 0.194,
      "step": 400
    },
    {
      "epoch": 2.265193370165746,
      "grad_norm": 12.967714309692383,
      "learning_rate": 3.867403314917128e-05,
      "loss": 3.1265,
      "step": 410
    },
    {
      "epoch": 2.265193370165746,
      "eval_loss": 3.4157204627990723,
      "eval_runtime": 108.3207,
      "eval_samples_per_second": 11.854,
      "eval_steps_per_second": 0.194,
      "step": 410
    },
    {
      "epoch": 2.320441988950276,
      "grad_norm": 17.192251205444336,
      "learning_rate": 3.8397790055248625e-05,
      "loss": 3.0176,
      "step": 420
    },
    {
      "epoch": 2.320441988950276,
      "eval_loss": 3.4587888717651367,
      "eval_runtime": 108.7782,
      "eval_samples_per_second": 11.804,
      "eval_steps_per_second": 0.193,
      "step": 420
    },
    {
      "epoch": 2.3756906077348066,
      "grad_norm": 17.41048240661621,
      "learning_rate": 3.812154696132597e-05,
      "loss": 3.0366,
      "step": 430
    },
    {
      "epoch": 2.3756906077348066,
      "eval_loss": 3.359968900680542,
      "eval_runtime": 108.3212,
      "eval_samples_per_second": 11.854,
      "eval_steps_per_second": 0.194,
      "step": 430
    },
    {
      "epoch": 2.430939226519337,
      "grad_norm": 14.966797828674316,
      "learning_rate": 3.7845303867403314e-05,
      "loss": 3.0515,
      "step": 440
    },
    {
      "epoch": 2.430939226519337,
      "eval_loss": 3.405341148376465,
      "eval_runtime": 108.5034,
      "eval_samples_per_second": 11.834,
      "eval_steps_per_second": 0.194,
      "step": 440
    },
    {
      "epoch": 2.4861878453038675,
      "grad_norm": 14.554710388183594,
      "learning_rate": 3.756906077348066e-05,
      "loss": 3.1383,
      "step": 450
    },
    {
      "epoch": 2.4861878453038675,
      "eval_loss": 3.261054754257202,
      "eval_runtime": 107.8836,
      "eval_samples_per_second": 11.902,
      "eval_steps_per_second": 0.195,
      "step": 450
    },
    {
      "epoch": 2.541436464088398,
      "grad_norm": 22.434762954711914,
      "learning_rate": 3.729281767955801e-05,
      "loss": 2.9971,
      "step": 460
    },
    {
      "epoch": 2.541436464088398,
      "eval_loss": 3.229337692260742,
      "eval_runtime": 108.7988,
      "eval_samples_per_second": 11.802,
      "eval_steps_per_second": 0.193,
      "step": 460
    },
    {
      "epoch": 2.596685082872928,
      "grad_norm": 15.667607307434082,
      "learning_rate": 3.7016574585635364e-05,
      "loss": 3.047,
      "step": 470
    },
    {
      "epoch": 2.596685082872928,
      "eval_loss": 3.224137783050537,
      "eval_runtime": 110.5326,
      "eval_samples_per_second": 11.616,
      "eval_steps_per_second": 0.19,
      "step": 470
    },
    {
      "epoch": 2.6519337016574585,
      "grad_norm": 16.986766815185547,
      "learning_rate": 3.674033149171271e-05,
      "loss": 2.8851,
      "step": 480
    },
    {
      "epoch": 2.6519337016574585,
      "eval_loss": 3.2184762954711914,
      "eval_runtime": 108.027,
      "eval_samples_per_second": 11.886,
      "eval_steps_per_second": 0.194,
      "step": 480
    },
    {
      "epoch": 2.707182320441989,
      "grad_norm": 13.545926094055176,
      "learning_rate": 3.646408839779006e-05,
      "loss": 2.8976,
      "step": 490
    },
    {
      "epoch": 2.707182320441989,
      "eval_loss": 3.082709550857544,
      "eval_runtime": 108.5833,
      "eval_samples_per_second": 11.825,
      "eval_steps_per_second": 0.193,
      "step": 490
    },
    {
      "epoch": 2.7624309392265194,
      "grad_norm": 16.030040740966797,
      "learning_rate": 3.618784530386741e-05,
      "loss": 2.8307,
      "step": 500
    },
    {
      "epoch": 2.7624309392265194,
      "eval_loss": 3.0571742057800293,
      "eval_runtime": 107.7725,
      "eval_samples_per_second": 11.914,
      "eval_steps_per_second": 0.195,
      "step": 500
    },
    {
      "epoch": 2.81767955801105,
      "grad_norm": 16.842382431030273,
      "learning_rate": 3.5911602209944755e-05,
      "loss": 2.8896,
      "step": 510
    },
    {
      "epoch": 2.81767955801105,
      "eval_loss": 2.9949567317962646,
      "eval_runtime": 108.4232,
      "eval_samples_per_second": 11.842,
      "eval_steps_per_second": 0.194,
      "step": 510
    },
    {
      "epoch": 2.87292817679558,
      "grad_norm": 18.767789840698242,
      "learning_rate": 3.5635359116022096e-05,
      "loss": 2.7774,
      "step": 520
    },
    {
      "epoch": 2.87292817679558,
      "eval_loss": 2.9752790927886963,
      "eval_runtime": 108.043,
      "eval_samples_per_second": 11.884,
      "eval_steps_per_second": 0.194,
      "step": 520
    },
    {
      "epoch": 2.9281767955801103,
      "grad_norm": 15.322210311889648,
      "learning_rate": 3.535911602209945e-05,
      "loss": 2.7361,
      "step": 530
    },
    {
      "epoch": 2.9281767955801103,
      "eval_loss": 2.9297850131988525,
      "eval_runtime": 109.8044,
      "eval_samples_per_second": 11.694,
      "eval_steps_per_second": 0.191,
      "step": 530
    },
    {
      "epoch": 2.983425414364641,
      "grad_norm": 19.184162139892578,
      "learning_rate": 3.50828729281768e-05,
      "loss": 2.6885,
      "step": 540
    },
    {
      "epoch": 2.983425414364641,
      "eval_loss": 2.9156270027160645,
      "eval_runtime": 107.5015,
      "eval_samples_per_second": 11.944,
      "eval_steps_per_second": 0.195,
      "step": 540
    },
    {
      "epoch": 3.0386740331491713,
      "grad_norm": 19.8149356842041,
      "learning_rate": 3.4806629834254147e-05,
      "loss": 2.2378,
      "step": 550
    },
    {
      "epoch": 3.0386740331491713,
      "eval_loss": 3.0476300716400146,
      "eval_runtime": 110.8258,
      "eval_samples_per_second": 11.586,
      "eval_steps_per_second": 0.189,
      "step": 550
    },
    {
      "epoch": 3.0939226519337018,
      "grad_norm": 19.72810935974121,
      "learning_rate": 3.4530386740331494e-05,
      "loss": 2.0295,
      "step": 560
    },
    {
      "epoch": 3.0939226519337018,
      "eval_loss": 2.9673562049865723,
      "eval_runtime": 107.877,
      "eval_samples_per_second": 11.902,
      "eval_steps_per_second": 0.195,
      "step": 560
    },
    {
      "epoch": 3.149171270718232,
      "grad_norm": 34.11488723754883,
      "learning_rate": 3.425414364640884e-05,
      "loss": 1.9957,
      "step": 570
    },
    {
      "epoch": 3.149171270718232,
      "eval_loss": 3.2292628288269043,
      "eval_runtime": 109.6888,
      "eval_samples_per_second": 11.706,
      "eval_steps_per_second": 0.191,
      "step": 570
    },
    {
      "epoch": 3.2044198895027622,
      "grad_norm": 24.54149055480957,
      "learning_rate": 3.397790055248619e-05,
      "loss": 1.9727,
      "step": 580
    },
    {
      "epoch": 3.2044198895027622,
      "eval_loss": 2.8876142501831055,
      "eval_runtime": 107.7637,
      "eval_samples_per_second": 11.915,
      "eval_steps_per_second": 0.195,
      "step": 580
    },
    {
      "epoch": 3.2596685082872927,
      "grad_norm": 24.705358505249023,
      "learning_rate": 3.370165745856354e-05,
      "loss": 1.9746,
      "step": 590
    },
    {
      "epoch": 3.2596685082872927,
      "eval_loss": 2.84237003326416,
      "eval_runtime": 107.9704,
      "eval_samples_per_second": 11.892,
      "eval_steps_per_second": 0.194,
      "step": 590
    },
    {
      "epoch": 3.314917127071823,
      "grad_norm": 17.960529327392578,
      "learning_rate": 3.3425414364640886e-05,
      "loss": 1.9393,
      "step": 600
    },
    {
      "epoch": 3.314917127071823,
      "eval_loss": 2.7950246334075928,
      "eval_runtime": 107.4211,
      "eval_samples_per_second": 11.953,
      "eval_steps_per_second": 0.195,
      "step": 600
    },
    {
      "epoch": 3.3701657458563536,
      "grad_norm": 21.799556732177734,
      "learning_rate": 3.3149171270718233e-05,
      "loss": 1.8191,
      "step": 610
    },
    {
      "epoch": 3.3701657458563536,
      "eval_loss": 2.8050460815429688,
      "eval_runtime": 109.142,
      "eval_samples_per_second": 11.764,
      "eval_steps_per_second": 0.192,
      "step": 610
    },
    {
      "epoch": 3.425414364640884,
      "grad_norm": 22.035696029663086,
      "learning_rate": 3.287292817679558e-05,
      "loss": 1.7885,
      "step": 620
    },
    {
      "epoch": 3.425414364640884,
      "eval_loss": 2.837017774581909,
      "eval_runtime": 108.619,
      "eval_samples_per_second": 11.821,
      "eval_steps_per_second": 0.193,
      "step": 620
    },
    {
      "epoch": 3.4806629834254146,
      "grad_norm": 20.61678695678711,
      "learning_rate": 3.259668508287293e-05,
      "loss": 1.8065,
      "step": 630
    },
    {
      "epoch": 3.4806629834254146,
      "eval_loss": 2.6381585597991943,
      "eval_runtime": 107.6552,
      "eval_samples_per_second": 11.927,
      "eval_steps_per_second": 0.195,
      "step": 630
    },
    {
      "epoch": 3.5359116022099446,
      "grad_norm": 20.068431854248047,
      "learning_rate": 3.232044198895028e-05,
      "loss": 1.9027,
      "step": 640
    },
    {
      "epoch": 3.5359116022099446,
      "eval_loss": 2.6253230571746826,
      "eval_runtime": 108.6112,
      "eval_samples_per_second": 11.822,
      "eval_steps_per_second": 0.193,
      "step": 640
    },
    {
      "epoch": 3.591160220994475,
      "grad_norm": 20.27581024169922,
      "learning_rate": 3.2044198895027625e-05,
      "loss": 1.7976,
      "step": 650
    },
    {
      "epoch": 3.591160220994475,
      "eval_loss": 2.8042409420013428,
      "eval_runtime": 109.5439,
      "eval_samples_per_second": 11.721,
      "eval_steps_per_second": 0.192,
      "step": 650
    },
    {
      "epoch": 3.6464088397790055,
      "grad_norm": 22.138561248779297,
      "learning_rate": 3.176795580110498e-05,
      "loss": 1.8324,
      "step": 660
    },
    {
      "epoch": 3.6464088397790055,
      "eval_loss": 2.6126017570495605,
      "eval_runtime": 108.5998,
      "eval_samples_per_second": 11.823,
      "eval_steps_per_second": 0.193,
      "step": 660
    },
    {
      "epoch": 3.701657458563536,
      "grad_norm": 18.944120407104492,
      "learning_rate": 3.149171270718232e-05,
      "loss": 1.7634,
      "step": 670
    },
    {
      "epoch": 3.701657458563536,
      "eval_loss": 2.5312118530273438,
      "eval_runtime": 107.8698,
      "eval_samples_per_second": 11.903,
      "eval_steps_per_second": 0.195,
      "step": 670
    },
    {
      "epoch": 3.7569060773480665,
      "grad_norm": 21.863069534301758,
      "learning_rate": 3.121546961325967e-05,
      "loss": 1.8946,
      "step": 680
    },
    {
      "epoch": 3.7569060773480665,
      "eval_loss": 2.480397939682007,
      "eval_runtime": 108.6826,
      "eval_samples_per_second": 11.814,
      "eval_steps_per_second": 0.193,
      "step": 680
    },
    {
      "epoch": 3.8121546961325965,
      "grad_norm": 23.169885635375977,
      "learning_rate": 3.0939226519337016e-05,
      "loss": 1.5957,
      "step": 690
    },
    {
      "epoch": 3.8121546961325965,
      "eval_loss": 2.6412222385406494,
      "eval_runtime": 107.9762,
      "eval_samples_per_second": 11.892,
      "eval_steps_per_second": 0.194,
      "step": 690
    },
    {
      "epoch": 3.867403314917127,
      "grad_norm": 20.805410385131836,
      "learning_rate": 3.0662983425414364e-05,
      "loss": 1.6951,
      "step": 700
    },
    {
      "epoch": 3.867403314917127,
      "eval_loss": 2.462557315826416,
      "eval_runtime": 110.0483,
      "eval_samples_per_second": 11.668,
      "eval_steps_per_second": 0.191,
      "step": 700
    },
    {
      "epoch": 3.9226519337016574,
      "grad_norm": 24.351552963256836,
      "learning_rate": 3.0386740331491715e-05,
      "loss": 1.789,
      "step": 710
    },
    {
      "epoch": 3.9226519337016574,
      "eval_loss": 2.510899305343628,
      "eval_runtime": 108.097,
      "eval_samples_per_second": 11.878,
      "eval_steps_per_second": 0.194,
      "step": 710
    },
    {
      "epoch": 3.977900552486188,
      "grad_norm": 20.56439971923828,
      "learning_rate": 3.0110497237569063e-05,
      "loss": 1.7312,
      "step": 720
    },
    {
      "epoch": 3.977900552486188,
      "eval_loss": 2.3842289447784424,
      "eval_runtime": 110.1597,
      "eval_samples_per_second": 11.656,
      "eval_steps_per_second": 0.191,
      "step": 720
    }
  ],
  "logging_steps": 10,
  "max_steps": 1810,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3437308831020480.0,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}