{ "best_metric": 2.3842289447784424, "best_model_checkpoint": "./results/checkpoint-720", "epoch": 3.977900552486188, "eval_steps": 10, "global_step": 720, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.055248618784530384, "grad_norm": 3.8338866233825684, "learning_rate": 4.972375690607735e-05, "loss": 5.1215, "step": 10 }, { "epoch": 0.055248618784530384, "eval_loss": 4.150951862335205, "eval_runtime": 107.472, "eval_samples_per_second": 11.947, "eval_steps_per_second": 0.195, "step": 10 }, { "epoch": 0.11049723756906077, "grad_norm": 1.0968620777130127, "learning_rate": 4.94475138121547e-05, "loss": 4.172, "step": 20 }, { "epoch": 0.11049723756906077, "eval_loss": 4.160157203674316, "eval_runtime": 108.2249, "eval_samples_per_second": 11.864, "eval_steps_per_second": 0.194, "step": 20 }, { "epoch": 0.16574585635359115, "grad_norm": 9.813711166381836, "learning_rate": 4.9171270718232046e-05, "loss": 4.1638, "step": 30 }, { "epoch": 0.16574585635359115, "eval_loss": 4.136767387390137, "eval_runtime": 106.48, "eval_samples_per_second": 12.059, "eval_steps_per_second": 0.197, "step": 30 }, { "epoch": 0.22099447513812154, "grad_norm": 1.0513194799423218, "learning_rate": 4.8895027624309394e-05, "loss": 4.1615, "step": 40 }, { "epoch": 0.22099447513812154, "eval_loss": 4.142675399780273, "eval_runtime": 108.3224, "eval_samples_per_second": 11.853, "eval_steps_per_second": 0.194, "step": 40 }, { "epoch": 0.27624309392265195, "grad_norm": 2.283129930496216, "learning_rate": 4.861878453038674e-05, "loss": 4.1346, "step": 50 }, { "epoch": 0.27624309392265195, "eval_loss": 4.11836576461792, "eval_runtime": 107.0752, "eval_samples_per_second": 11.992, "eval_steps_per_second": 0.196, "step": 50 }, { "epoch": 0.3314917127071823, "grad_norm": 2.6535284519195557, "learning_rate": 4.834254143646409e-05, "loss": 4.1412, "step": 60 }, { "epoch": 0.3314917127071823, "eval_loss": 4.119892597198486, "eval_runtime": 108.68, "eval_samples_per_second": 11.814, "eval_steps_per_second": 0.193, "step": 60 }, { "epoch": 0.3867403314917127, "grad_norm": 2.246750831604004, "learning_rate": 4.806629834254144e-05, "loss": 4.2136, "step": 70 }, { "epoch": 0.3867403314917127, "eval_loss": 4.110352516174316, "eval_runtime": 109.321, "eval_samples_per_second": 11.745, "eval_steps_per_second": 0.192, "step": 70 }, { "epoch": 0.4419889502762431, "grad_norm": 5.01633358001709, "learning_rate": 4.7790055248618785e-05, "loss": 4.1121, "step": 80 }, { "epoch": 0.4419889502762431, "eval_loss": 4.114729404449463, "eval_runtime": 108.7219, "eval_samples_per_second": 11.81, "eval_steps_per_second": 0.193, "step": 80 }, { "epoch": 0.4972375690607735, "grad_norm": 6.723274230957031, "learning_rate": 4.751381215469613e-05, "loss": 4.0936, "step": 90 }, { "epoch": 0.4972375690607735, "eval_loss": 4.197766304016113, "eval_runtime": 109.473, "eval_samples_per_second": 11.729, "eval_steps_per_second": 0.192, "step": 90 }, { "epoch": 0.5524861878453039, "grad_norm": 2.4457666873931885, "learning_rate": 4.723756906077349e-05, "loss": 4.1228, "step": 100 }, { "epoch": 0.5524861878453039, "eval_loss": 4.094923496246338, "eval_runtime": 109.8167, "eval_samples_per_second": 11.692, "eval_steps_per_second": 0.191, "step": 100 }, { "epoch": 0.6077348066298343, "grad_norm": 2.2743124961853027, "learning_rate": 4.6961325966850835e-05, "loss": 4.0925, "step": 110 }, { "epoch": 0.6077348066298343, "eval_loss": 4.078197479248047, "eval_runtime": 110.7223, "eval_samples_per_second": 11.597, "eval_steps_per_second": 0.19, "step": 110 }, { "epoch": 0.6629834254143646, "grad_norm": 3.141187906265259, "learning_rate": 4.6685082872928176e-05, "loss": 4.0646, "step": 120 }, { "epoch": 0.6629834254143646, "eval_loss": 4.101787090301514, "eval_runtime": 108.3177, "eval_samples_per_second": 11.854, "eval_steps_per_second": 0.194, "step": 120 }, { "epoch": 0.7182320441988951, "grad_norm": 7.580991744995117, "learning_rate": 4.6408839779005524e-05, "loss": 4.0936, "step": 130 }, { "epoch": 0.7182320441988951, "eval_loss": 4.074957847595215, "eval_runtime": 109.0627, "eval_samples_per_second": 11.773, "eval_steps_per_second": 0.193, "step": 130 }, { "epoch": 0.7734806629834254, "grad_norm": 5.120913028717041, "learning_rate": 4.613259668508287e-05, "loss": 4.0563, "step": 140 }, { "epoch": 0.7734806629834254, "eval_loss": 4.028809070587158, "eval_runtime": 108.4672, "eval_samples_per_second": 11.838, "eval_steps_per_second": 0.194, "step": 140 }, { "epoch": 0.8287292817679558, "grad_norm": 6.340137004852295, "learning_rate": 4.585635359116022e-05, "loss": 4.0016, "step": 150 }, { "epoch": 0.8287292817679558, "eval_loss": 4.02580451965332, "eval_runtime": 108.9318, "eval_samples_per_second": 11.787, "eval_steps_per_second": 0.193, "step": 150 }, { "epoch": 0.8839779005524862, "grad_norm": 3.361588954925537, "learning_rate": 4.5580110497237574e-05, "loss": 4.0029, "step": 160 }, { "epoch": 0.8839779005524862, "eval_loss": 4.001326560974121, "eval_runtime": 110.4105, "eval_samples_per_second": 11.629, "eval_steps_per_second": 0.19, "step": 160 }, { "epoch": 0.9392265193370166, "grad_norm": 10.81383991241455, "learning_rate": 4.530386740331492e-05, "loss": 4.0072, "step": 170 }, { "epoch": 0.9392265193370166, "eval_loss": 4.006492614746094, "eval_runtime": 110.8042, "eval_samples_per_second": 11.588, "eval_steps_per_second": 0.19, "step": 170 }, { "epoch": 0.994475138121547, "grad_norm": 4.824268817901611, "learning_rate": 4.502762430939227e-05, "loss": 4.0382, "step": 180 }, { "epoch": 0.994475138121547, "eval_loss": 4.0064873695373535, "eval_runtime": 109.8857, "eval_samples_per_second": 11.685, "eval_steps_per_second": 0.191, "step": 180 }, { "epoch": 1.0497237569060773, "grad_norm": 6.9584760665893555, "learning_rate": 4.475138121546962e-05, "loss": 3.8569, "step": 190 }, { "epoch": 1.0497237569060773, "eval_loss": 3.9973037242889404, "eval_runtime": 111.7743, "eval_samples_per_second": 11.487, "eval_steps_per_second": 0.188, "step": 190 }, { "epoch": 1.1049723756906078, "grad_norm": 10.90897274017334, "learning_rate": 4.447513812154696e-05, "loss": 3.9025, "step": 200 }, { "epoch": 1.1049723756906078, "eval_loss": 3.920397996902466, "eval_runtime": 110.0718, "eval_samples_per_second": 11.665, "eval_steps_per_second": 0.191, "step": 200 }, { "epoch": 1.160220994475138, "grad_norm": 6.581048011779785, "learning_rate": 4.419889502762431e-05, "loss": 3.9265, "step": 210 }, { "epoch": 1.160220994475138, "eval_loss": 3.9369547367095947, "eval_runtime": 108.7006, "eval_samples_per_second": 11.812, "eval_steps_per_second": 0.193, "step": 210 }, { "epoch": 1.2154696132596685, "grad_norm": 7.869502544403076, "learning_rate": 4.392265193370166e-05, "loss": 3.8242, "step": 220 }, { "epoch": 1.2154696132596685, "eval_loss": 3.932981491088867, "eval_runtime": 110.8711, "eval_samples_per_second": 11.581, "eval_steps_per_second": 0.189, "step": 220 }, { "epoch": 1.270718232044199, "grad_norm": 6.994544982910156, "learning_rate": 4.364640883977901e-05, "loss": 3.8785, "step": 230 }, { "epoch": 1.270718232044199, "eval_loss": 3.914726495742798, "eval_runtime": 108.3935, "eval_samples_per_second": 11.846, "eval_steps_per_second": 0.194, "step": 230 }, { "epoch": 1.3259668508287292, "grad_norm": 9.24843978881836, "learning_rate": 4.337016574585636e-05, "loss": 3.8126, "step": 240 }, { "epoch": 1.3259668508287292, "eval_loss": 3.878098487854004, "eval_runtime": 108.7329, "eval_samples_per_second": 11.809, "eval_steps_per_second": 0.193, "step": 240 }, { "epoch": 1.3812154696132597, "grad_norm": 7.285367965698242, "learning_rate": 4.3093922651933705e-05, "loss": 3.8305, "step": 250 }, { "epoch": 1.3812154696132597, "eval_loss": 3.9157791137695312, "eval_runtime": 109.2735, "eval_samples_per_second": 11.75, "eval_steps_per_second": 0.192, "step": 250 }, { "epoch": 1.43646408839779, "grad_norm": 6.021206378936768, "learning_rate": 4.281767955801105e-05, "loss": 3.7631, "step": 260 }, { "epoch": 1.43646408839779, "eval_loss": 3.818014144897461, "eval_runtime": 109.4955, "eval_samples_per_second": 11.727, "eval_steps_per_second": 0.192, "step": 260 }, { "epoch": 1.4917127071823204, "grad_norm": 5.70164680480957, "learning_rate": 4.25414364640884e-05, "loss": 3.7908, "step": 270 }, { "epoch": 1.4917127071823204, "eval_loss": 3.7703230381011963, "eval_runtime": 108.0073, "eval_samples_per_second": 11.888, "eval_steps_per_second": 0.194, "step": 270 }, { "epoch": 1.5469613259668509, "grad_norm": 8.903672218322754, "learning_rate": 4.226519337016575e-05, "loss": 3.7319, "step": 280 }, { "epoch": 1.5469613259668509, "eval_loss": 3.813979387283325, "eval_runtime": 108.0907, "eval_samples_per_second": 11.879, "eval_steps_per_second": 0.194, "step": 280 }, { "epoch": 1.6022099447513813, "grad_norm": 7.5272111892700195, "learning_rate": 4.1988950276243096e-05, "loss": 3.765, "step": 290 }, { "epoch": 1.6022099447513813, "eval_loss": 3.822376012802124, "eval_runtime": 107.6694, "eval_samples_per_second": 11.925, "eval_steps_per_second": 0.195, "step": 290 }, { "epoch": 1.6574585635359116, "grad_norm": 8.640732765197754, "learning_rate": 4.1712707182320444e-05, "loss": 3.7465, "step": 300 }, { "epoch": 1.6574585635359116, "eval_loss": 3.7796106338500977, "eval_runtime": 107.2692, "eval_samples_per_second": 11.97, "eval_steps_per_second": 0.196, "step": 300 }, { "epoch": 1.7127071823204418, "grad_norm": 8.788119316101074, "learning_rate": 4.143646408839779e-05, "loss": 3.6646, "step": 310 }, { "epoch": 1.7127071823204418, "eval_loss": 3.711517810821533, "eval_runtime": 108.3013, "eval_samples_per_second": 11.856, "eval_steps_per_second": 0.194, "step": 310 }, { "epoch": 1.7679558011049723, "grad_norm": 12.237804412841797, "learning_rate": 4.116022099447514e-05, "loss": 3.67, "step": 320 }, { "epoch": 1.7679558011049723, "eval_loss": 3.686821460723877, "eval_runtime": 109.2103, "eval_samples_per_second": 11.757, "eval_steps_per_second": 0.192, "step": 320 }, { "epoch": 1.8232044198895028, "grad_norm": 10.34768295288086, "learning_rate": 4.088397790055249e-05, "loss": 3.6374, "step": 330 }, { "epoch": 1.8232044198895028, "eval_loss": 3.646404504776001, "eval_runtime": 108.3039, "eval_samples_per_second": 11.856, "eval_steps_per_second": 0.194, "step": 330 }, { "epoch": 1.8784530386740332, "grad_norm": 10.875980377197266, "learning_rate": 4.0607734806629835e-05, "loss": 3.5907, "step": 340 }, { "epoch": 1.8784530386740332, "eval_loss": 3.631521463394165, "eval_runtime": 108.2559, "eval_samples_per_second": 11.861, "eval_steps_per_second": 0.194, "step": 340 }, { "epoch": 1.9337016574585635, "grad_norm": 9.671201705932617, "learning_rate": 4.033149171270719e-05, "loss": 3.6003, "step": 350 }, { "epoch": 1.9337016574585635, "eval_loss": 3.551748037338257, "eval_runtime": 108.4367, "eval_samples_per_second": 11.841, "eval_steps_per_second": 0.194, "step": 350 }, { "epoch": 1.988950276243094, "grad_norm": 11.600411415100098, "learning_rate": 4.005524861878453e-05, "loss": 3.5967, "step": 360 }, { "epoch": 1.988950276243094, "eval_loss": 3.5431878566741943, "eval_runtime": 107.9821, "eval_samples_per_second": 11.891, "eval_steps_per_second": 0.194, "step": 360 }, { "epoch": 2.044198895027624, "grad_norm": 17.58928680419922, "learning_rate": 3.977900552486188e-05, "loss": 3.3887, "step": 370 }, { "epoch": 2.044198895027624, "eval_loss": 3.5802059173583984, "eval_runtime": 107.9756, "eval_samples_per_second": 11.892, "eval_steps_per_second": 0.194, "step": 370 }, { "epoch": 2.0994475138121547, "grad_norm": 12.583136558532715, "learning_rate": 3.950276243093923e-05, "loss": 3.2413, "step": 380 }, { "epoch": 2.0994475138121547, "eval_loss": 3.5067298412323, "eval_runtime": 108.2401, "eval_samples_per_second": 11.863, "eval_steps_per_second": 0.194, "step": 380 }, { "epoch": 2.154696132596685, "grad_norm": 13.868547439575195, "learning_rate": 3.9226519337016574e-05, "loss": 3.0502, "step": 390 }, { "epoch": 2.154696132596685, "eval_loss": 3.548964738845825, "eval_runtime": 108.3243, "eval_samples_per_second": 11.853, "eval_steps_per_second": 0.194, "step": 390 }, { "epoch": 2.2099447513812156, "grad_norm": 18.013263702392578, "learning_rate": 3.895027624309392e-05, "loss": 3.2647, "step": 400 }, { "epoch": 2.2099447513812156, "eval_loss": 3.41357159614563, "eval_runtime": 108.3964, "eval_samples_per_second": 11.845, "eval_steps_per_second": 0.194, "step": 400 }, { "epoch": 2.265193370165746, "grad_norm": 12.967714309692383, "learning_rate": 3.867403314917128e-05, "loss": 3.1265, "step": 410 }, { "epoch": 2.265193370165746, "eval_loss": 3.4157204627990723, "eval_runtime": 108.3207, "eval_samples_per_second": 11.854, "eval_steps_per_second": 0.194, "step": 410 }, { "epoch": 2.320441988950276, "grad_norm": 17.192251205444336, "learning_rate": 3.8397790055248625e-05, "loss": 3.0176, "step": 420 }, { "epoch": 2.320441988950276, "eval_loss": 3.4587888717651367, "eval_runtime": 108.7782, "eval_samples_per_second": 11.804, "eval_steps_per_second": 0.193, "step": 420 }, { "epoch": 2.3756906077348066, "grad_norm": 17.41048240661621, "learning_rate": 3.812154696132597e-05, "loss": 3.0366, "step": 430 }, { "epoch": 2.3756906077348066, "eval_loss": 3.359968900680542, "eval_runtime": 108.3212, "eval_samples_per_second": 11.854, "eval_steps_per_second": 0.194, "step": 430 }, { "epoch": 2.430939226519337, "grad_norm": 14.966797828674316, "learning_rate": 3.7845303867403314e-05, "loss": 3.0515, "step": 440 }, { "epoch": 2.430939226519337, "eval_loss": 3.405341148376465, "eval_runtime": 108.5034, "eval_samples_per_second": 11.834, "eval_steps_per_second": 0.194, "step": 440 }, { "epoch": 2.4861878453038675, "grad_norm": 14.554710388183594, "learning_rate": 3.756906077348066e-05, "loss": 3.1383, "step": 450 }, { "epoch": 2.4861878453038675, "eval_loss": 3.261054754257202, "eval_runtime": 107.8836, "eval_samples_per_second": 11.902, "eval_steps_per_second": 0.195, "step": 450 }, { "epoch": 2.541436464088398, "grad_norm": 22.434762954711914, "learning_rate": 3.729281767955801e-05, "loss": 2.9971, "step": 460 }, { "epoch": 2.541436464088398, "eval_loss": 3.229337692260742, "eval_runtime": 108.7988, "eval_samples_per_second": 11.802, "eval_steps_per_second": 0.193, "step": 460 }, { "epoch": 2.596685082872928, "grad_norm": 15.667607307434082, "learning_rate": 3.7016574585635364e-05, "loss": 3.047, "step": 470 }, { "epoch": 2.596685082872928, "eval_loss": 3.224137783050537, "eval_runtime": 110.5326, "eval_samples_per_second": 11.616, "eval_steps_per_second": 0.19, "step": 470 }, { "epoch": 2.6519337016574585, "grad_norm": 16.986766815185547, "learning_rate": 3.674033149171271e-05, "loss": 2.8851, "step": 480 }, { "epoch": 2.6519337016574585, "eval_loss": 3.2184762954711914, "eval_runtime": 108.027, "eval_samples_per_second": 11.886, "eval_steps_per_second": 0.194, "step": 480 }, { "epoch": 2.707182320441989, "grad_norm": 13.545926094055176, "learning_rate": 3.646408839779006e-05, "loss": 2.8976, "step": 490 }, { "epoch": 2.707182320441989, "eval_loss": 3.082709550857544, "eval_runtime": 108.5833, "eval_samples_per_second": 11.825, "eval_steps_per_second": 0.193, "step": 490 }, { "epoch": 2.7624309392265194, "grad_norm": 16.030040740966797, "learning_rate": 3.618784530386741e-05, "loss": 2.8307, "step": 500 }, { "epoch": 2.7624309392265194, "eval_loss": 3.0571742057800293, "eval_runtime": 107.7725, "eval_samples_per_second": 11.914, "eval_steps_per_second": 0.195, "step": 500 }, { "epoch": 2.81767955801105, "grad_norm": 16.842382431030273, "learning_rate": 3.5911602209944755e-05, "loss": 2.8896, "step": 510 }, { "epoch": 2.81767955801105, "eval_loss": 2.9949567317962646, "eval_runtime": 108.4232, "eval_samples_per_second": 11.842, "eval_steps_per_second": 0.194, "step": 510 }, { "epoch": 2.87292817679558, "grad_norm": 18.767789840698242, "learning_rate": 3.5635359116022096e-05, "loss": 2.7774, "step": 520 }, { "epoch": 2.87292817679558, "eval_loss": 2.9752790927886963, "eval_runtime": 108.043, "eval_samples_per_second": 11.884, "eval_steps_per_second": 0.194, "step": 520 }, { "epoch": 2.9281767955801103, "grad_norm": 15.322210311889648, "learning_rate": 3.535911602209945e-05, "loss": 2.7361, "step": 530 }, { "epoch": 2.9281767955801103, "eval_loss": 2.9297850131988525, "eval_runtime": 109.8044, "eval_samples_per_second": 11.694, "eval_steps_per_second": 0.191, "step": 530 }, { "epoch": 2.983425414364641, "grad_norm": 19.184162139892578, "learning_rate": 3.50828729281768e-05, "loss": 2.6885, "step": 540 }, { "epoch": 2.983425414364641, "eval_loss": 2.9156270027160645, "eval_runtime": 107.5015, "eval_samples_per_second": 11.944, "eval_steps_per_second": 0.195, "step": 540 }, { "epoch": 3.0386740331491713, "grad_norm": 19.8149356842041, "learning_rate": 3.4806629834254147e-05, "loss": 2.2378, "step": 550 }, { "epoch": 3.0386740331491713, "eval_loss": 3.0476300716400146, "eval_runtime": 110.8258, "eval_samples_per_second": 11.586, "eval_steps_per_second": 0.189, "step": 550 }, { "epoch": 3.0939226519337018, "grad_norm": 19.72810935974121, "learning_rate": 3.4530386740331494e-05, "loss": 2.0295, "step": 560 }, { "epoch": 3.0939226519337018, "eval_loss": 2.9673562049865723, "eval_runtime": 107.877, "eval_samples_per_second": 11.902, "eval_steps_per_second": 0.195, "step": 560 }, { "epoch": 3.149171270718232, "grad_norm": 34.11488723754883, "learning_rate": 3.425414364640884e-05, "loss": 1.9957, "step": 570 }, { "epoch": 3.149171270718232, "eval_loss": 3.2292628288269043, "eval_runtime": 109.6888, "eval_samples_per_second": 11.706, "eval_steps_per_second": 0.191, "step": 570 }, { "epoch": 3.2044198895027622, "grad_norm": 24.54149055480957, "learning_rate": 3.397790055248619e-05, "loss": 1.9727, "step": 580 }, { "epoch": 3.2044198895027622, "eval_loss": 2.8876142501831055, "eval_runtime": 107.7637, "eval_samples_per_second": 11.915, "eval_steps_per_second": 0.195, "step": 580 }, { "epoch": 3.2596685082872927, "grad_norm": 24.705358505249023, "learning_rate": 3.370165745856354e-05, "loss": 1.9746, "step": 590 }, { "epoch": 3.2596685082872927, "eval_loss": 2.84237003326416, "eval_runtime": 107.9704, "eval_samples_per_second": 11.892, "eval_steps_per_second": 0.194, "step": 590 }, { "epoch": 3.314917127071823, "grad_norm": 17.960529327392578, "learning_rate": 3.3425414364640886e-05, "loss": 1.9393, "step": 600 }, { "epoch": 3.314917127071823, "eval_loss": 2.7950246334075928, "eval_runtime": 107.4211, "eval_samples_per_second": 11.953, "eval_steps_per_second": 0.195, "step": 600 }, { "epoch": 3.3701657458563536, "grad_norm": 21.799556732177734, "learning_rate": 3.3149171270718233e-05, "loss": 1.8191, "step": 610 }, { "epoch": 3.3701657458563536, "eval_loss": 2.8050460815429688, "eval_runtime": 109.142, "eval_samples_per_second": 11.764, "eval_steps_per_second": 0.192, "step": 610 }, { "epoch": 3.425414364640884, "grad_norm": 22.035696029663086, "learning_rate": 3.287292817679558e-05, "loss": 1.7885, "step": 620 }, { "epoch": 3.425414364640884, "eval_loss": 2.837017774581909, "eval_runtime": 108.619, "eval_samples_per_second": 11.821, "eval_steps_per_second": 0.193, "step": 620 }, { "epoch": 3.4806629834254146, "grad_norm": 20.61678695678711, "learning_rate": 3.259668508287293e-05, "loss": 1.8065, "step": 630 }, { "epoch": 3.4806629834254146, "eval_loss": 2.6381585597991943, "eval_runtime": 107.6552, "eval_samples_per_second": 11.927, "eval_steps_per_second": 0.195, "step": 630 }, { "epoch": 3.5359116022099446, "grad_norm": 20.068431854248047, "learning_rate": 3.232044198895028e-05, "loss": 1.9027, "step": 640 }, { "epoch": 3.5359116022099446, "eval_loss": 2.6253230571746826, "eval_runtime": 108.6112, "eval_samples_per_second": 11.822, "eval_steps_per_second": 0.193, "step": 640 }, { "epoch": 3.591160220994475, "grad_norm": 20.27581024169922, "learning_rate": 3.2044198895027625e-05, "loss": 1.7976, "step": 650 }, { "epoch": 3.591160220994475, "eval_loss": 2.8042409420013428, "eval_runtime": 109.5439, "eval_samples_per_second": 11.721, "eval_steps_per_second": 0.192, "step": 650 }, { "epoch": 3.6464088397790055, "grad_norm": 22.138561248779297, "learning_rate": 3.176795580110498e-05, "loss": 1.8324, "step": 660 }, { "epoch": 3.6464088397790055, "eval_loss": 2.6126017570495605, "eval_runtime": 108.5998, "eval_samples_per_second": 11.823, "eval_steps_per_second": 0.193, "step": 660 }, { "epoch": 3.701657458563536, "grad_norm": 18.944120407104492, "learning_rate": 3.149171270718232e-05, "loss": 1.7634, "step": 670 }, { "epoch": 3.701657458563536, "eval_loss": 2.5312118530273438, "eval_runtime": 107.8698, "eval_samples_per_second": 11.903, "eval_steps_per_second": 0.195, "step": 670 }, { "epoch": 3.7569060773480665, "grad_norm": 21.863069534301758, "learning_rate": 3.121546961325967e-05, "loss": 1.8946, "step": 680 }, { "epoch": 3.7569060773480665, "eval_loss": 2.480397939682007, "eval_runtime": 108.6826, "eval_samples_per_second": 11.814, "eval_steps_per_second": 0.193, "step": 680 }, { "epoch": 3.8121546961325965, "grad_norm": 23.169885635375977, "learning_rate": 3.0939226519337016e-05, "loss": 1.5957, "step": 690 }, { "epoch": 3.8121546961325965, "eval_loss": 2.6412222385406494, "eval_runtime": 107.9762, "eval_samples_per_second": 11.892, "eval_steps_per_second": 0.194, "step": 690 }, { "epoch": 3.867403314917127, "grad_norm": 20.805410385131836, "learning_rate": 3.0662983425414364e-05, "loss": 1.6951, "step": 700 }, { "epoch": 3.867403314917127, "eval_loss": 2.462557315826416, "eval_runtime": 110.0483, "eval_samples_per_second": 11.668, "eval_steps_per_second": 0.191, "step": 700 }, { "epoch": 3.9226519337016574, "grad_norm": 24.351552963256836, "learning_rate": 3.0386740331491715e-05, "loss": 1.789, "step": 710 }, { "epoch": 3.9226519337016574, "eval_loss": 2.510899305343628, "eval_runtime": 108.097, "eval_samples_per_second": 11.878, "eval_steps_per_second": 0.194, "step": 710 }, { "epoch": 3.977900552486188, "grad_norm": 20.56439971923828, "learning_rate": 3.0110497237569063e-05, "loss": 1.7312, "step": 720 }, { "epoch": 3.977900552486188, "eval_loss": 2.3842289447784424, "eval_runtime": 110.1597, "eval_samples_per_second": 11.656, "eval_steps_per_second": 0.191, "step": 720 } ], "logging_steps": 10, "max_steps": 1810, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3437308831020480.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }